My initializer stuff

This commit is contained in:
2024-09-17 15:09:41 -03:00
commit 17aae61838
3644 changed files with 556522 additions and 0 deletions

View File

@@ -0,0 +1,116 @@
/*===---- algorithm - CUDA wrapper for <algorithm> -------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_WRAPPERS_ALGORITHM
#define __CLANG_CUDA_WRAPPERS_ALGORITHM
// This header defines __device__ overloads of std::min/max.
//
// Ideally we'd declare these functions only if we're <= C++11. In C++14,
// these functions are constexpr, and so are implicitly __host__ __device__.
//
// However, the compiler being in C++14 mode does not imply that the standard
// library supports C++14. There is no macro we can test to check that the
// stdlib has constexpr std::min/max. Thus we have to unconditionally define
// our device overloads.
//
// A host+device function cannot be overloaded, and a constexpr function
// implicitly become host device if there's no explicitly host or device
// overload preceding it. So the simple thing to do would be to declare our
// device min/max overloads, and then #include_next <algorithm>. This way our
// device overloads would come first, and so if we have a C++14 stdlib, its
// min/max won't become host+device and conflict with our device overloads.
//
// But that also doesn't work. libstdc++ is evil and declares std::min/max in
// an internal header that is included *before* <algorithm>. Thus by the time
// we're inside of this file, std::min/max may already have been declared, and
// thus we can't prevent them from becoming host+device if they're constexpr.
//
// Therefore we perpetrate the following hack: We mark our __device__ overloads
// with __attribute__((enable_if(true, ""))). This causes the signature of the
// function to change without changing anything else about it. (Except that
// overload resolution will prefer it over the __host__ __device__ version
// rather than considering them equally good).
#include_next <algorithm>
// We need to define these overloads in exactly the namespace our standard
// library uses (including the right inline namespace), otherwise they won't be
// picked up by other functions in the standard library (e.g. functions in
// <complex>). Thus the ugliness below.
#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
_LIBCPP_BEGIN_NAMESPACE_STD
#else
namespace std {
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif
#endif
#pragma push_macro("_CPP14_CONSTEXPR")
#if __cplusplus >= 201402L
#define _CPP14_CONSTEXPR constexpr
#else
#define _CPP14_CONSTEXPR
#endif
template <class __T, class __Cmp>
__attribute__((enable_if(true, "")))
inline _CPP14_CONSTEXPR __host__ __device__ const __T &
max(const __T &__a, const __T &__b, __Cmp __cmp) {
return __cmp(__a, __b) ? __b : __a;
}
template <class __T>
__attribute__((enable_if(true, "")))
inline _CPP14_CONSTEXPR __host__ __device__ const __T &
max(const __T &__a, const __T &__b) {
return __a < __b ? __b : __a;
}
template <class __T, class __Cmp>
__attribute__((enable_if(true, "")))
inline _CPP14_CONSTEXPR __host__ __device__ const __T &
min(const __T &__a, const __T &__b, __Cmp __cmp) {
return __cmp(__b, __a) ? __b : __a;
}
template <class __T>
__attribute__((enable_if(true, "")))
inline _CPP14_CONSTEXPR __host__ __device__ const __T &
min(const __T &__a, const __T &__b) {
return __a < __b ? __a : __b;
}
#pragma pop_macro("_CPP14_CONSTEXPR")
#ifdef _LIBCPP_END_NAMESPACE_STD
_LIBCPP_END_NAMESPACE_STD
#else
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_END_NAMESPACE_VERSION
#endif
} // namespace std
#endif
#endif // __CLANG_CUDA_WRAPPERS_ALGORITHM

View File

@@ -0,0 +1,9 @@
// CUDA headers define __noinline__ which interferes with libstdc++'s use of
// `__attribute((__noinline__))`. In order to avoid compilation error,
// temporarily unset __noinline__ when we include affected libstdc++ header.
#pragma push_macro("__noinline__")
#undef __noinline__
#include_next "bits/basic_string.h"
#pragma pop_macro("__noinline__")

View File

@@ -0,0 +1,9 @@
// CUDA headers define __noinline__ which interferes with libstdc++'s use of
// `__attribute((__noinline__))`. In order to avoid compilation error,
// temporarily unset __noinline__ when we include affected libstdc++ header.
#pragma push_macro("__noinline__")
#undef __noinline__
#include_next "bits/basic_string.tcc"
#pragma pop_macro("__noinline__")

View File

@@ -0,0 +1,9 @@
// CUDA headers define __noinline__ which interferes with libstdc++'s use of
// `__attribute((__noinline__))`. In order to avoid compilation error,
// temporarily unset __noinline__ when we include affected libstdc++ header.
#pragma push_macro("__noinline__")
#undef __noinline__
#include_next "bits/shared_ptr_base.h"
#pragma pop_macro("__noinline__")

View File

@@ -0,0 +1,90 @@
/*===---- cmath - CUDA wrapper for <cmath> ---------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_WRAPPERS_CMATH
#define __CLANG_CUDA_WRAPPERS_CMATH
#include_next <cmath>
#if defined(_LIBCPP_STD_VER)
// libc++ will need long double variants of these functions, but CUDA does not
// provide them. We'll provide their declarations, which should allow the
// headers to parse, but would not allow accidental use of them on a GPU.
__attribute__((device)) long double logb(long double);
__attribute__((device)) long double scalbn(long double, int);
namespace std {
// For __constexpr_fmin/fmax we only need device-side overloads before c++14
// where they are not constexpr.
#if _LIBCPP_STD_VER < 14
__attribute__((device))
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 float __constexpr_fmax(float __x, float __y) _NOEXCEPT {
return __builtin_fmaxf(__x, __y);
}
__attribute__((device))
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 double __constexpr_fmax(double __x, double __y) _NOEXCEPT {
return __builtin_fmax(__x, __y);
}
__attribute__((device))
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 long double
__constexpr_fmax(long double __x, long double __y) _NOEXCEPT {
return __builtin_fmaxl(__x, __y);
}
template <class _Tp, class _Up, __enable_if_t<is_arithmetic<_Tp>::value && is_arithmetic<_Up>::value, int> = 0>
__attribute__((device))
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 typename __promote<_Tp, _Up>::type
__constexpr_fmax(_Tp __x, _Up __y) _NOEXCEPT {
using __result_type = typename __promote<_Tp, _Up>::type;
return std::__constexpr_fmax(static_cast<__result_type>(__x), static_cast<__result_type>(__y));
}
#endif // _LIBCPP_STD_VER < 14
// For logb/scalbn templates we must always provide device overloads because
// libc++ implementation uses __builtin_XXX which gets translated into a libcall
// which we can't handle on GPU. We need to forward those to CUDA-provided
// implementations.
template <class _Tp>
__attribute__((device))
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp __constexpr_logb(_Tp __x) {
return ::logb(__x);
}
template <class _Tp>
__attribute__((device))
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp __constexpr_scalbn(_Tp __x, int __exp) {
return ::scalbn(__x, __exp);
}
} // namespace std//
#endif // _LIBCPP_STD_VER
#endif // include guard

View File

@@ -0,0 +1,90 @@
/*===---- complex - CUDA wrapper for <complex> ------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_WRAPPERS_COMPLEX
#define __CLANG_CUDA_WRAPPERS_COMPLEX
// Wrapper around <complex> that forces its functions to be __host__
// __device__.
// First, include host-only headers we think are likely to be included by
// <complex>, so that the pragma below only applies to <complex> itself.
#if __cplusplus >= 201103L
#include <type_traits>
#endif
#include <stdexcept>
#include <cmath>
#include <sstream>
// Next, include our <algorithm> wrapper, to ensure that device overloads of
// std::min/max are available.
#include <algorithm>
#pragma clang force_cuda_host_device begin
// When compiling for device, ask libstdc++ to use its own implements of
// complex functions, rather than calling builtins (which resolve to library
// functions that don't exist when compiling CUDA device code).
//
// This is a little dicey, because it causes libstdc++ to define a different
// set of overloads on host and device.
//
// // Present only when compiling for host.
// __host__ __device__ void complex<float> sin(const complex<float>& x) {
// return __builtin_csinf(x);
// }
//
// // Present when compiling for host and for device.
// template <typename T>
// void __host__ __device__ complex<T> sin(const complex<T>& x) {
// return complex<T>(sin(x.real()) * cosh(x.imag()),
// cos(x.real()), sinh(x.imag()));
// }
//
// This is safe because when compiling for device, all function calls in
// __host__ code to sin() will still resolve to *something*, even if they don't
// resolve to the same function as they resolve to when compiling for host. We
// don't care that they don't resolve to the right function because we won't
// codegen this host code when compiling for device.
#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX")
#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
#define _GLIBCXX_USE_C99_COMPLEX 0
#define _GLIBCXX_USE_C99_COMPLEX_TR1 0
// Work around a compatibility issue with libstdc++ 11.1.0
// https://bugs.llvm.org/show_bug.cgi?id=50383
#pragma push_macro("__failed_assertion")
#if _GLIBCXX_RELEASE == 11
#define __failed_assertion __cuda_failed_assertion
#endif
#include_next <complex>
#pragma pop_macro("__failed_assertion")
#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX")
#pragma clang force_cuda_host_device end
#endif // include guard

View File

@@ -0,0 +1,106 @@
/*===---- new - CUDA wrapper for <new> -------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_WRAPPERS_NEW
#define __CLANG_CUDA_WRAPPERS_NEW
#include_next <new>
#if !defined(__device__)
// The header has been included too early from the standard C++ library
// and CUDA-specific macros are not available yet.
// Undo the include guard and try again later.
#undef __CLANG_CUDA_WRAPPERS_NEW
#else
#pragma push_macro("CUDA_NOEXCEPT")
#if __cplusplus >= 201103L
#define CUDA_NOEXCEPT noexcept
#else
#define CUDA_NOEXCEPT
#endif
// Device overrides for non-placement new and delete.
__device__ inline void *operator new(__SIZE_TYPE__ size) {
if (size == 0) {
size = 1;
}
return ::malloc(size);
}
__device__ inline void *operator new(__SIZE_TYPE__ size,
const std::nothrow_t &) CUDA_NOEXCEPT {
return ::operator new(size);
}
__device__ inline void *operator new[](__SIZE_TYPE__ size) {
return ::operator new(size);
}
__device__ inline void *operator new[](__SIZE_TYPE__ size,
const std::nothrow_t &) {
return ::operator new(size);
}
__device__ inline void operator delete(void* ptr) CUDA_NOEXCEPT {
if (ptr) {
::free(ptr);
}
}
__device__ inline void operator delete(void *ptr,
const std::nothrow_t &) CUDA_NOEXCEPT {
::operator delete(ptr);
}
__device__ inline void operator delete[](void* ptr) CUDA_NOEXCEPT {
::operator delete(ptr);
}
__device__ inline void operator delete[](void *ptr,
const std::nothrow_t &) CUDA_NOEXCEPT {
::operator delete(ptr);
}
// Sized delete, C++14 only.
#if __cplusplus >= 201402L
__device__ inline void operator delete(void *ptr,
__SIZE_TYPE__ size) CUDA_NOEXCEPT {
::operator delete(ptr);
}
__device__ inline void operator delete[](void *ptr,
__SIZE_TYPE__ size) CUDA_NOEXCEPT {
::operator delete(ptr);
}
#endif
// Device overrides for placement new and delete.
__device__ inline void *operator new(__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
return __ptr;
}
__device__ inline void *operator new[](__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
return __ptr;
}
__device__ inline void operator delete(void *, void *) CUDA_NOEXCEPT {}
__device__ inline void operator delete[](void *, void *) CUDA_NOEXCEPT {}
#pragma pop_macro("CUDA_NOEXCEPT")
#endif // __device__
#endif // include guard