Spaces:
Sleeping
Sleeping
| /** Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved. | |
| * NVIDIA CORPORATION and its licensors retain all intellectual property | |
| * and proprietary rights in and to this software, related documentation | |
| * and any modifications thereto. Any use, reproduction, disclosure or | |
| * distribution of this software and related documentation without an express | |
| * license agreement from NVIDIA CORPORATION is strictly prohibited. | |
| */ | |
| // All built-in types and functions. To be compatible with runtime NVRTC compilation | |
| // this header must be independently compilable (i.e.: without external SDK headers) | |
| // to achieve this we redefine a subset of CRT functions (printf, pow, sin, cos, etc) | |
| __device__ void __debugbreak() {} | |
| namespace wp | |
| { | |
| // numeric types (used from generated kernels) | |
| typedef float float32; | |
| typedef double float64; | |
| typedef int8_t int8; | |
| typedef uint8_t uint8; | |
| typedef int16_t int16; | |
| typedef uint16_t uint16; | |
| typedef int32_t int32; | |
| typedef uint32_t uint32; | |
| typedef int64_t int64; | |
| typedef uint64_t uint64; | |
| // matches Python string type for constant strings | |
| typedef const char* str; | |
| struct half; | |
| CUDA_CALLABLE half float_to_half(float x); | |
| CUDA_CALLABLE float half_to_float(half x); | |
| struct half | |
| { | |
| CUDA_CALLABLE inline half() : u(0) {} | |
| CUDA_CALLABLE inline half(float f) | |
| { | |
| *this = float_to_half(f); | |
| } | |
| unsigned short u; | |
| CUDA_CALLABLE inline bool operator==(const half& h) const { return u == h.u; } | |
| CUDA_CALLABLE inline bool operator!=(const half& h) const { return u != h.u; } | |
| CUDA_CALLABLE inline bool operator>(const half& h) const { return half_to_float(*this) > half_to_float(h); } | |
| CUDA_CALLABLE inline bool operator>=(const half& h) const { return half_to_float(*this) >= half_to_float(h); } | |
| CUDA_CALLABLE inline bool operator<(const half& h) const { return half_to_float(*this) < half_to_float(h); } | |
| CUDA_CALLABLE inline bool operator<=(const half& h) const { return half_to_float(*this) <= half_to_float(h); } | |
| CUDA_CALLABLE inline bool operator!() const | |
| { | |
| return float32(*this) == 0; | |
| } | |
| CUDA_CALLABLE inline half operator*=(const half& h) | |
| { | |
| half prod = half(float32(*this) * float32(h)); | |
| this->u = prod.u; | |
| return *this; | |
| } | |
| CUDA_CALLABLE inline half operator/=(const half& h) | |
| { | |
| half quot = half(float32(*this) / float32(h)); | |
| this->u = quot.u; | |
| return *this; | |
| } | |
| CUDA_CALLABLE inline half operator+=(const half& h) | |
| { | |
| half sum = half(float32(*this) + float32(h)); | |
| this->u = sum.u; | |
| return *this; | |
| } | |
| CUDA_CALLABLE inline half operator-=(const half& h) | |
| { | |
| half diff = half(float32(*this) - float32(h)); | |
| this->u = diff.u; | |
| return *this; | |
| } | |
| CUDA_CALLABLE inline operator float32() const { return float32(half_to_float(*this)); } | |
| CUDA_CALLABLE inline operator float64() const { return float64(half_to_float(*this)); } | |
| CUDA_CALLABLE inline operator int8() const { return int8(half_to_float(*this)); } | |
| CUDA_CALLABLE inline operator uint8() const { return uint8(half_to_float(*this)); } | |
| CUDA_CALLABLE inline operator int16() const { return int16(half_to_float(*this)); } | |
| CUDA_CALLABLE inline operator uint16() const { return uint16(half_to_float(*this)); } | |
| CUDA_CALLABLE inline operator int32() const { return int32(half_to_float(*this)); } | |
| CUDA_CALLABLE inline operator uint32() const { return uint32(half_to_float(*this)); } | |
| CUDA_CALLABLE inline operator int64() const { return int64(half_to_float(*this)); } | |
| CUDA_CALLABLE inline operator uint64() const { return uint64(half_to_float(*this)); } | |
| }; | |
| static_assert(sizeof(half) == 2, "Size of half / float16 type must be 2-bytes"); | |
| typedef half float16; | |
| CUDA_CALLABLE inline half float_to_half(float x) | |
| { | |
| half h; | |
| asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(h.u) : "f"(x)); | |
| return h; | |
| } | |
| CUDA_CALLABLE inline float half_to_float(half x) | |
| { | |
| float val; | |
| asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(x.u)); | |
| return val; | |
| } | |
| // _Float16 is Clang's native half-precision floating-point type | |
| inline half float_to_half(float x) | |
| { | |
| _Float16 f16 = static_cast<_Float16>(x); | |
| return *reinterpret_cast<half*>(&f16); | |
| } | |
| inline float half_to_float(half h) | |
| { | |
| _Float16 f16 = *reinterpret_cast<_Float16*>(&h); | |
| return static_cast<float>(f16); | |
| } | |
| extern "C" WP_API uint16_t float_to_half_bits(float x); | |
| extern "C" WP_API float half_bits_to_float(uint16_t u); | |
| inline half float_to_half(float x) | |
| { | |
| half h; | |
| h.u = float_to_half_bits(x); | |
| return h; | |
| } | |
| inline float half_to_float(half h) | |
| { | |
| return half_bits_to_float(h.u); | |
| } | |
| // BAD operator implementations for fp16 arithmetic... | |
| // negation: | |
| inline CUDA_CALLABLE half operator - (half a) | |
| { | |
| return float_to_half( -half_to_float(a) ); | |
| } | |
| inline CUDA_CALLABLE half operator + (half a,half b) | |
| { | |
| return float_to_half( half_to_float(a) + half_to_float(b) ); | |
| } | |
| inline CUDA_CALLABLE half operator - (half a,half b) | |
| { | |
| return float_to_half( half_to_float(a) - half_to_float(b) ); | |
| } | |
| inline CUDA_CALLABLE half operator * (half a,half b) | |
| { | |
| return float_to_half( half_to_float(a) * half_to_float(b) ); | |
| } | |
| inline CUDA_CALLABLE half operator * (half a,double b) | |
| { | |
| return float_to_half( half_to_float(a) * b ); | |
| } | |
| inline CUDA_CALLABLE half operator * (double a,half b) | |
| { | |
| return float_to_half( a * half_to_float(b) ); | |
| } | |
| inline CUDA_CALLABLE half operator / (half a,half b) | |
| { | |
| return float_to_half( half_to_float(a) / half_to_float(b) ); | |
| } | |
| template <typename T> | |
| CUDA_CALLABLE float cast_float(T x) { return (float)(x); } | |
| template <typename T> | |
| CUDA_CALLABLE int cast_int(T x) { return (int)(x); } | |
| template <typename T> | |
| CUDA_CALLABLE void adj_cast_float(T x, T& adj_x, float adj_ret) { adj_x += T(adj_ret); } | |
| template <typename T> | |
| CUDA_CALLABLE void adj_cast_int(T x, T& adj_x, int adj_ret) { adj_x += adj_ret; } | |
| template <typename T> | |
| CUDA_CALLABLE inline void adj_int8(T, T&, int8) {} | |
| template <typename T> | |
| CUDA_CALLABLE inline void adj_uint8(T, T&, uint8) {} | |
| template <typename T> | |
| CUDA_CALLABLE inline void adj_int16(T, T&, int16) {} | |
| template <typename T> | |
| CUDA_CALLABLE inline void adj_uint16(T, T&, uint16) {} | |
| template <typename T> | |
| CUDA_CALLABLE inline void adj_int32(T, T&, int32) {} | |
| template <typename T> | |
| CUDA_CALLABLE inline void adj_uint32(T, T&, uint32) {} | |
| template <typename T> | |
| CUDA_CALLABLE inline void adj_int64(T, T&, int64) {} | |
| template <typename T> | |
| CUDA_CALLABLE inline void adj_uint64(T, T&, uint64) {} | |
| template <typename T> | |
| CUDA_CALLABLE inline void adj_float16(T x, T& adj_x, float16 adj_ret) { adj_x += T(adj_ret); } | |
| template <typename T> | |
| CUDA_CALLABLE inline void adj_float32(T x, T& adj_x, float32 adj_ret) { adj_x += T(adj_ret); } | |
| template <typename T> | |
| CUDA_CALLABLE inline void adj_float64(T x, T& adj_x, float64 adj_ret) { adj_x += T(adj_ret); } | |
| // basic ops for integer types | |
| inline CUDA_CALLABLE int8 abs(int8 x) { return ::abs(x); } | |
| inline CUDA_CALLABLE int16 abs(int16 x) { return ::abs(x); } | |
| inline CUDA_CALLABLE int32 abs(int32 x) { return ::abs(x); } | |
| inline CUDA_CALLABLE int64 abs(int64 x) { return ::llabs(x); } | |
| inline CUDA_CALLABLE uint8 abs(uint8 x) { return x; } | |
| inline CUDA_CALLABLE uint16 abs(uint16 x) { return x; } | |
| inline CUDA_CALLABLE uint32 abs(uint32 x) { return x; } | |
| inline CUDA_CALLABLE uint64 abs(uint64 x) { return x; } | |
| DECLARE_INT_OPS(int8) | |
| DECLARE_INT_OPS(int16) | |
| DECLARE_INT_OPS(int32) | |
| DECLARE_INT_OPS(int64) | |
| DECLARE_INT_OPS(uint8) | |
| DECLARE_INT_OPS(uint16) | |
| DECLARE_INT_OPS(uint32) | |
| DECLARE_INT_OPS(uint64) | |
| inline CUDA_CALLABLE int8 step(int8 x) { return x < 0 ? 1 : 0; } | |
| inline CUDA_CALLABLE int16 step(int16 x) { return x < 0 ? 1 : 0; } | |
| inline CUDA_CALLABLE int32 step(int32 x) { return x < 0 ? 1 : 0; } | |
| inline CUDA_CALLABLE int64 step(int64 x) { return x < 0 ? 1 : 0; } | |
| inline CUDA_CALLABLE uint8 step(uint8 x) { return 0; } | |
| inline CUDA_CALLABLE uint16 step(uint16 x) { return 0; } | |
| inline CUDA_CALLABLE uint32 step(uint32 x) { return 0; } | |
| inline CUDA_CALLABLE uint64 step(uint64 x) { return 0; } | |
| inline CUDA_CALLABLE int8 sign(int8 x) { return x < 0 ? -1 : 1; } | |
| inline CUDA_CALLABLE int8 sign(int16 x) { return x < 0 ? -1 : 1; } | |
| inline CUDA_CALLABLE int8 sign(int32 x) { return x < 0 ? -1 : 1; } | |
| inline CUDA_CALLABLE int8 sign(int64 x) { return x < 0 ? -1 : 1; } | |
| inline CUDA_CALLABLE uint8 sign(uint8 x) { return 1; } | |
| inline CUDA_CALLABLE uint16 sign(uint16 x) { return 1; } | |
| inline CUDA_CALLABLE uint32 sign(uint32 x) { return 1; } | |
| inline CUDA_CALLABLE uint64 sign(uint64 x) { return 1; } | |
| inline bool CUDA_CALLABLE isfinite(half x) | |
| { | |
| return ::isfinite(float(x)); | |
| } | |
| inline bool CUDA_CALLABLE isfinite(float x) | |
| { | |
| return ::isfinite(x); | |
| } | |
| inline bool CUDA_CALLABLE isfinite(double x) | |
| { | |
| return ::isfinite(x); | |
| } | |
| inline CUDA_CALLABLE void print(float16 f) | |
| { | |
| printf("%g\n", half_to_float(f)); | |
| } | |
| inline CUDA_CALLABLE void print(float f) | |
| { | |
| printf("%g\n", f); | |
| } | |
| inline CUDA_CALLABLE void print(double f) | |
| { | |
| printf("%g\n", f); | |
| } | |
| // basic ops for float types | |
| DECLARE_FLOAT_OPS(float16) | |
| DECLARE_FLOAT_OPS(float32) | |
| DECLARE_FLOAT_OPS(float64) | |
| // basic ops for float types | |
| inline CUDA_CALLABLE float16 mod(float16 a, float16 b) | |
| { | |
| if (!isfinite(a) || !isfinite(b) || float(b) == 0.0f) | |
| { | |
| printf("%s:%d mod(%f, %f)\n", __FILE__, __LINE__, float(a), float(b)); | |
| assert(0); | |
| } | |
| return fmodf(float(a), float(b)); | |
| } | |
| inline CUDA_CALLABLE float32 mod(float32 a, float32 b) | |
| { | |
| if (!isfinite(a) || !isfinite(b) || b == 0.0f) | |
| { | |
| printf("%s:%d mod(%f, %f)\n", __FILE__, __LINE__, a, b); | |
| assert(0); | |
| } | |
| return fmodf(a, b); | |
| } | |
| inline CUDA_CALLABLE double mod(double a, double b) | |
| { | |
| if (!isfinite(a) || !isfinite(b) || b == 0.0f) | |
| { | |
| printf("%s:%d mod(%f, %f)\n", __FILE__, __LINE__, a, b); | |
| assert(0); | |
| } | |
| return fmod(a, b); | |
| } | |
| inline CUDA_CALLABLE half log(half a) | |
| { | |
| if (!isfinite(a) || float(a) < 0.0f) | |
| { | |
| printf("%s:%d log(%f)\n", __FILE__, __LINE__, float(a)); | |
| assert(0); | |
| } | |
| return ::logf(a); | |
| } | |
| inline CUDA_CALLABLE float log(float a) | |
| { | |
| if (!isfinite(a) || a < 0.0f) | |
| { | |
| printf("%s:%d log(%f)\n", __FILE__, __LINE__, a); | |
| assert(0); | |
| } | |
| return ::logf(a); | |
| } | |
| inline CUDA_CALLABLE double log(double a) | |
| { | |
| if (!isfinite(a) || a < 0.0) | |
| { | |
| printf("%s:%d log(%f)\n", __FILE__, __LINE__, a); | |
| assert(0); | |
| } | |
| return ::log(a); | |
| } | |
| inline CUDA_CALLABLE half log2(half a) | |
| { | |
| if (!isfinite(a) || float(a) < 0.0f) | |
| { | |
| printf("%s:%d log2(%f)\n", __FILE__, __LINE__, float(a)); | |
| assert(0); | |
| } | |
| return ::log2f(float(a)); | |
| } | |
| inline CUDA_CALLABLE float log2(float a) | |
| { | |
| if (!isfinite(a) || a < 0.0f) | |
| { | |
| printf("%s:%d log2(%f)\n", __FILE__, __LINE__, a); | |
| assert(0); | |
| } | |
| return ::log2f(a); | |
| } | |
| inline CUDA_CALLABLE double log2(double a) | |
| { | |
| if (!isfinite(a) || a < 0.0) | |
| { | |
| printf("%s:%d log2(%f)\n", __FILE__, __LINE__, a); | |
| assert(0); | |
| } | |
| return ::log2(a); | |
| } | |
| inline CUDA_CALLABLE half log10(half a) | |
| { | |
| if (!isfinite(a) || float(a) < 0.0f) | |
| { | |
| printf("%s:%d log10(%f)\n", __FILE__, __LINE__, float(a)); | |
| assert(0); | |
| } | |
| return ::log10f(float(a)); | |
| } | |
| inline CUDA_CALLABLE float log10(float a) | |
| { | |
| if (!isfinite(a) || a < 0.0f) | |
| { | |
| printf("%s:%d log10(%f)\n", __FILE__, __LINE__, a); | |
| assert(0); | |
| } | |
| return ::log10f(a); | |
| } | |
| inline CUDA_CALLABLE double log10(double a) | |
| { | |
| if (!isfinite(a) || a < 0.0) | |
| { | |
| printf("%s:%d log10(%f)\n", __FILE__, __LINE__, a); | |
| assert(0); | |
| } | |
| return ::log10(a); | |
| } | |
| inline CUDA_CALLABLE half exp(half a) | |
| { | |
| half result = ::expf(float(a)); | |
| if (!isfinite(a) || !isfinite(result)) | |
| { | |
| printf("%s:%d exp(%f) = %f\n", __FILE__, __LINE__, float(a), float(result)); | |
| assert(0); | |
| } | |
| return result; | |
| } | |
| inline CUDA_CALLABLE float exp(float a) | |
| { | |
| float result = ::expf(a); | |
| if (!isfinite(a) || !isfinite(result)) | |
| { | |
| printf("%s:%d exp(%f) = %f\n", __FILE__, __LINE__, a, result); | |
| assert(0); | |
| } | |
| return result; | |
| } | |
| inline CUDA_CALLABLE double exp(double a) | |
| { | |
| double result = ::exp(a); | |
| if (!isfinite(a) || !isfinite(result)) | |
| { | |
| printf("%s:%d exp(%f) = %f\n", __FILE__, __LINE__, a, result); | |
| assert(0); | |
| } | |
| return result; | |
| } | |
| inline CUDA_CALLABLE half pow(half a, half b) | |
| { | |
| float result = ::powf(float(a), float(b)); | |
| if (!isfinite(float(a)) || !isfinite(float(b)) || !isfinite(result)) | |
| { | |
| printf("%s:%d pow(%f, %f) = %f\n", __FILE__, __LINE__, float(a), float(b), result); | |
| assert(0); | |
| } | |
| return result; | |
| } | |
| inline CUDA_CALLABLE float pow(float a, float b) | |
| { | |
| float result = ::powf(a, b); | |
| if (!isfinite(a) || !isfinite(b) || !isfinite(result)) | |
| { | |
| printf("%s:%d pow(%f, %f) = %f\n", __FILE__, __LINE__, a, b, result); | |
| assert(0); | |
| } | |
| return result; | |
| } | |
| inline CUDA_CALLABLE double pow(double a, double b) | |
| { | |
| double result = ::pow(a, b); | |
| if (!isfinite(a) || !isfinite(b) || !isfinite(result)) | |
| { | |
| printf("%s:%d pow(%f, %f) = %f\n", __FILE__, __LINE__, a, b, result); | |
| assert(0); | |
| } | |
| return result; | |
| } | |
| inline CUDA_CALLABLE half floordiv(half a, half b) | |
| { | |
| if (!isfinite(a) || !isfinite(b) || float(b) == 0.0f) | |
| { | |
| printf("%s:%d mod(%f, %f)\n", __FILE__, __LINE__, float(a), float(b)); | |
| assert(0); | |
| } | |
| return floorf(float(a/b)); | |
| } | |
| inline CUDA_CALLABLE float floordiv(float a, float b) | |
| { | |
| if (!isfinite(a) || !isfinite(b) || b == 0.0f) | |
| { | |
| printf("%s:%d mod(%f, %f)\n", __FILE__, __LINE__, a, b); | |
| assert(0); | |
| } | |
| return floorf(a/b); | |
| } | |
| inline CUDA_CALLABLE double floordiv(double a, double b) | |
| { | |
| if (!isfinite(a) || !isfinite(b) || b == 0.0) | |
| { | |
| printf("%s:%d mod(%f, %f)\n", __FILE__, __LINE__, a, b); | |
| assert(0); | |
| } | |
| return ::floor(a/b); | |
| } | |
| inline CUDA_CALLABLE float leaky_min(float a, float b, float r) { return min(a, b); } | |
| inline CUDA_CALLABLE float leaky_max(float a, float b, float r) { return max(a, b); } | |
| inline CUDA_CALLABLE half abs(half x) { return ::fabsf(float(x)); } | |
| inline CUDA_CALLABLE float abs(float x) { return ::fabsf(x); } | |
| inline CUDA_CALLABLE double abs(double x) { return ::fabs(x); } | |
| inline CUDA_CALLABLE float acos(float x){ return ::acosf(min(max(x, -1.0f), 1.0f)); } | |
| inline CUDA_CALLABLE float asin(float x){ return ::asinf(min(max(x, -1.0f), 1.0f)); } | |
| inline CUDA_CALLABLE float atan(float x) { return ::atanf(x); } | |
| inline CUDA_CALLABLE float atan2(float y, float x) { return ::atan2f(y, x); } | |
| inline CUDA_CALLABLE float sin(float x) { return ::sinf(x); } | |
| inline CUDA_CALLABLE float cos(float x) { return ::cosf(x); } | |
| inline CUDA_CALLABLE double acos(double x){ return ::acos(min(max(x, -1.0), 1.0)); } | |
| inline CUDA_CALLABLE double asin(double x){ return ::asin(min(max(x, -1.0), 1.0)); } | |
| inline CUDA_CALLABLE double atan(double x) { return ::atan(x); } | |
| inline CUDA_CALLABLE double atan2(double y, double x) { return ::atan2(y, x); } | |
| inline CUDA_CALLABLE double sin(double x) { return ::sin(x); } | |
| inline CUDA_CALLABLE double cos(double x) { return ::cos(x); } | |
| inline CUDA_CALLABLE half acos(half x){ return ::acosf(min(max(float(x), -1.0f), 1.0f)); } | |
| inline CUDA_CALLABLE half asin(half x){ return ::asinf(min(max(float(x), -1.0f), 1.0f)); } | |
| inline CUDA_CALLABLE half atan(half x) { return ::atanf(float(x)); } | |
| inline CUDA_CALLABLE half atan2(half y, half x) { return ::atan2f(float(y), float(x)); } | |
| inline CUDA_CALLABLE half sin(half x) { return ::sinf(float(x)); } | |
| inline CUDA_CALLABLE half cos(half x) { return ::cosf(float(x)); } | |
| inline CUDA_CALLABLE float sqrt(float x) | |
| { | |
| if (x < 0.0f) | |
| { | |
| printf("%s:%d sqrt(%f)\n", __FILE__, __LINE__, x); | |
| assert(0); | |
| } | |
| return ::sqrtf(x); | |
| } | |
| inline CUDA_CALLABLE double sqrt(double x) | |
| { | |
| if (x < 0.0) | |
| { | |
| printf("%s:%d sqrt(%f)\n", __FILE__, __LINE__, x); | |
| assert(0); | |
| } | |
| return ::sqrt(x); | |
| } | |
| inline CUDA_CALLABLE half sqrt(half x) | |
| { | |
| if (float(x) < 0.0f) | |
| { | |
| printf("%s:%d sqrt(%f)\n", __FILE__, __LINE__, float(x)); | |
| assert(0); | |
| } | |
| return ::sqrtf(float(x)); | |
| } | |
| inline CUDA_CALLABLE float cbrt(float x) { return ::cbrtf(x); } | |
| inline CUDA_CALLABLE double cbrt(double x) { return ::cbrt(x); } | |
| inline CUDA_CALLABLE half cbrt(half x) { return ::cbrtf(float(x)); } | |
| inline CUDA_CALLABLE float tan(float x) { return ::tanf(x); } | |
| inline CUDA_CALLABLE float sinh(float x) { return ::sinhf(x);} | |
| inline CUDA_CALLABLE float cosh(float x) { return ::coshf(x);} | |
| inline CUDA_CALLABLE float tanh(float x) { return ::tanhf(x);} | |
| inline CUDA_CALLABLE float degrees(float x) { return x * RAD_TO_DEG;} | |
| inline CUDA_CALLABLE float radians(float x) { return x * DEG_TO_RAD;} | |
| inline CUDA_CALLABLE double tan(double x) { return ::tan(x); } | |
| inline CUDA_CALLABLE double sinh(double x) { return ::sinh(x);} | |
| inline CUDA_CALLABLE double cosh(double x) { return ::cosh(x);} | |
| inline CUDA_CALLABLE double tanh(double x) { return ::tanh(x);} | |
| inline CUDA_CALLABLE double degrees(double x) { return x * RAD_TO_DEG;} | |
| inline CUDA_CALLABLE double radians(double x) { return x * DEG_TO_RAD;} | |
| inline CUDA_CALLABLE half tan(half x) { return ::tanf(float(x)); } | |
| inline CUDA_CALLABLE half sinh(half x) { return ::sinhf(float(x));} | |
| inline CUDA_CALLABLE half cosh(half x) { return ::coshf(float(x));} | |
| inline CUDA_CALLABLE half tanh(half x) { return ::tanhf(float(x));} | |
| inline CUDA_CALLABLE half degrees(half x) { return x * RAD_TO_DEG;} | |
| inline CUDA_CALLABLE half radians(half x) { return x * DEG_TO_RAD;} | |
| inline CUDA_CALLABLE float round(float x) { return ::roundf(x); } | |
| inline CUDA_CALLABLE float rint(float x) { return ::rintf(x); } | |
| inline CUDA_CALLABLE float trunc(float x) { return ::truncf(x); } | |
| inline CUDA_CALLABLE float floor(float x) { return ::floorf(x); } | |
| inline CUDA_CALLABLE float ceil(float x) { return ::ceilf(x); } | |
| inline CUDA_CALLABLE float frac(float x) { return x - trunc(x); } | |
| inline CUDA_CALLABLE double round(double x) { return ::round(x); } | |
| inline CUDA_CALLABLE double rint(double x) { return ::rint(x); } | |
| inline CUDA_CALLABLE double trunc(double x) { return ::trunc(x); } | |
| inline CUDA_CALLABLE double floor(double x) { return ::floor(x); } | |
| inline CUDA_CALLABLE double ceil(double x) { return ::ceil(x); } | |
| inline CUDA_CALLABLE double frac(double x) { return x - trunc(x); } | |
| inline CUDA_CALLABLE half round(half x) { return ::roundf(float(x)); } | |
| inline CUDA_CALLABLE half rint(half x) { return ::rintf(float(x)); } | |
| inline CUDA_CALLABLE half trunc(half x) { return ::truncf(float(x)); } | |
| inline CUDA_CALLABLE half floor(half x) { return ::floorf(float(x)); } | |
| inline CUDA_CALLABLE half ceil(half x) { return ::ceilf(float(x)); } | |
| inline CUDA_CALLABLE half frac(half x) { return float(x) - trunc(float(x)); } | |
| DECLARE_ADJOINTS(float16) | |
| DECLARE_ADJOINTS(float32) | |
| DECLARE_ADJOINTS(float64) | |
| template <typename C, typename T> | |
| CUDA_CALLABLE inline T select(const C& cond, const T& a, const T& b) | |
| { | |
| // The double NOT operator !! casts to bool without compiler warnings. | |
| return (!!cond) ? b : a; | |
| } | |
| template <typename C, typename T> | |
| CUDA_CALLABLE inline void adj_select(const C& cond, const T& a, const T& b, C& adj_cond, T& adj_a, T& adj_b, const T& adj_ret) | |
| { | |
| // The double NOT operator !! casts to bool without compiler warnings. | |
| if (!!cond) | |
| adj_b += adj_ret; | |
| else | |
| adj_a += adj_ret; | |
| } | |
| template <typename T> | |
| CUDA_CALLABLE inline T copy(const T& src) | |
| { | |
| return src; | |
| } | |
| template <typename T> | |
| CUDA_CALLABLE inline void adj_copy(const T& src, T& adj_src, T& adj_dest) | |
| { | |
| adj_src = adj_dest; | |
| adj_dest = T{}; | |
| } | |
| template <typename T> | |
| CUDA_CALLABLE inline void assign(T& dest, const T& src) | |
| { | |
| dest = src; | |
| } | |
| template <typename T> | |
| CUDA_CALLABLE inline void adj_assign(T& dest, const T& src, T& adj_dest, T& adj_src) | |
| { | |
| // this is generally a non-differentiable operation since it violates SSA, | |
| // except in read-modify-write statements which are reversible through backpropagation | |
| adj_src = adj_dest; | |
| adj_dest = T{}; | |
| } | |
| // some helpful operator overloads (just for C++ use, these are not adjointed) | |
| template <typename T> | |
| CUDA_CALLABLE inline T& operator += (T& a, const T& b) { a = add(a, b); return a; } | |
| template <typename T> | |
| CUDA_CALLABLE inline T& operator -= (T& a, const T& b) { a = sub(a, b); return a; } | |
| template <typename T> | |
| CUDA_CALLABLE inline T operator+(const T& a, const T& b) { return add(a, b); } | |
| template <typename T> | |
| CUDA_CALLABLE inline T operator-(const T& a, const T& b) { return sub(a, b); } | |
| template <typename T> | |
| CUDA_CALLABLE inline T pos(const T& x) { return x; } | |
| template <typename T> | |
| CUDA_CALLABLE inline void adj_pos(const T& x, T& adj_x, const T& adj_ret) { adj_x += T(adj_ret); } | |
| // unary negation implemented as negative multiply, not sure the fp implications of this | |
| // may be better as 0.0 - x? | |
| template <typename T> | |
| CUDA_CALLABLE inline T neg(const T& x) { return T(0.0) - x; } | |
| template <typename T> | |
| CUDA_CALLABLE inline void adj_neg(const T& x, T& adj_x, const T& adj_ret) { adj_x += T(-adj_ret); } | |
| // unary boolean negation | |
| template <typename T> | |
| CUDA_CALLABLE inline bool unot(const T& b) { return !b; } | |
| template <typename T> | |
| CUDA_CALLABLE inline void adj_unot(const T& b, T& adj_b, const bool& adj_ret) { } | |
| const int LAUNCH_MAX_DIMS = 4; // should match types.py | |
| struct launch_bounds_t | |
| { | |
| int shape[LAUNCH_MAX_DIMS]; // size of each dimension | |
| int ndim; // number of valid dimension | |
| size_t size; // total number of threads | |
| }; | |
| static size_t s_threadIdx; | |
| inline CUDA_CALLABLE size_t grid_index() | |
| { | |
| // Need to cast at least one of the variables being multiplied so that type promotion happens before the multiplication | |
| size_t grid_index = static_cast<size_t>(blockDim.x) * static_cast<size_t>(blockIdx.x) + static_cast<size_t>(threadIdx.x); | |
| return grid_index; | |
| return s_threadIdx; | |
| } | |
| inline CUDA_CALLABLE int tid(size_t index) | |
| { | |
| // For the 1-D tid() we need to warn the user if we're about to provide a truncated index | |
| // Only do this in _DEBUG when called from device to avoid excessive register allocation | |
| if (index > 2147483647) { | |
| printf("Warp warning: tid() is returning an overflowed int\n"); | |
| } | |
| return static_cast<int>(index); | |
| } | |
| inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, size_t index, const launch_bounds_t& launch_bounds) | |
| { | |
| const size_t n = launch_bounds.shape[1]; | |
| // convert to work item | |
| i = index/n; | |
| j = index%n; | |
| } | |
| inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, int& k, size_t index, const launch_bounds_t& launch_bounds) | |
| { | |
| const size_t n = launch_bounds.shape[1]; | |
| const size_t o = launch_bounds.shape[2]; | |
| // convert to work item | |
| i = index/(n*o); | |
| j = index%(n*o)/o; | |
| k = index%o; | |
| } | |
| inline CUDA_CALLABLE_DEVICE void tid(int& i, int& j, int& k, int& l, size_t index, const launch_bounds_t& launch_bounds) | |
| { | |
| const size_t n = launch_bounds.shape[1]; | |
| const size_t o = launch_bounds.shape[2]; | |
| const size_t p = launch_bounds.shape[3]; | |
| // convert to work item | |
| i = index/(n*o*p); | |
| j = index%(n*o*p)/(o*p); | |
| k = index%(o*p)/p; | |
| l = index%p; | |
| } | |
| template<typename T> | |
| inline CUDA_CALLABLE T atomic_add(T* buf, T value) | |
| { | |
| T old = buf[0]; | |
| buf[0] += value; | |
| return old; | |
| return atomicAdd(buf, value); | |
| } | |
| template<> | |
| inline CUDA_CALLABLE float16 atomic_add(float16* buf, float16 value) | |
| { | |
| float16 old = buf[0]; | |
| buf[0] += value; | |
| return old; | |
| __half r = atomicAdd(reinterpret_cast<__half*>(buf), *reinterpret_cast<__half*>(&value)); | |
| return *reinterpret_cast<float16*>(&r); | |
| //return atomicAdd(buf, value); | |
| /* Define __PTR for atomicAdd prototypes below, undef after done */ | |
| half r = 0.0; | |
| asm volatile ("{ atom.add.noftz.f16 %0,[%1],%2; }\n" | |
| : "=h"(r.u) | |
| : __PTR(buf), "h"(value.u) | |
| : "memory"); | |
| return r; | |
| } | |
| // emulate atomic float max | |
| inline CUDA_CALLABLE float atomic_max(float* address, float val) | |
| { | |
| int *address_as_int = (int*)address; | |
| int old = *address_as_int, assumed; | |
| while (val > __int_as_float(old)) | |
| { | |
| assumed = old; | |
| old = atomicCAS(address_as_int, assumed, | |
| __float_as_int(val)); | |
| } | |
| return __int_as_float(old); | |
| float old = *address; | |
| *address = max(old, val); | |
| return old; | |
| } | |
| // emulate atomic float min/max with atomicCAS() | |
| inline CUDA_CALLABLE float atomic_min(float* address, float val) | |
| { | |
| int *address_as_int = (int*)address; | |
| int old = *address_as_int, assumed; | |
| while (val < __int_as_float(old)) | |
| { | |
| assumed = old; | |
| old = atomicCAS(address_as_int, assumed, | |
| __float_as_int(val)); | |
| } | |
| return __int_as_float(old); | |
| float old = *address; | |
| *address = min(old, val); | |
| return old; | |
| } | |
| inline CUDA_CALLABLE int atomic_max(int* address, int val) | |
| { | |
| return atomicMax(address, val); | |
| int old = *address; | |
| *address = max(old, val); | |
| return old; | |
| } | |
| // atomic int min | |
| inline CUDA_CALLABLE int atomic_min(int* address, int val) | |
| { | |
| return atomicMin(address, val); | |
| int old = *address; | |
| *address = min(old, val); | |
| return old; | |
| } | |
| // default behavior for adjoint of atomic min/max operation that accumulates gradients for all elements matching the min/max value | |
| template <typename T> | |
| CUDA_CALLABLE inline void adj_atomic_minmax(T *addr, T *adj_addr, const T &value, T &adj_value) | |
| { | |
| if (value == *addr) | |
| adj_value += *adj_addr; | |
| } | |
| // for integral types we do not accumulate gradients | |
| CUDA_CALLABLE inline void adj_atomic_minmax(int8* buf, int8* adj_buf, const int8 &value, int8 &adj_value) { } | |
| CUDA_CALLABLE inline void adj_atomic_minmax(uint8* buf, uint8* adj_buf, const uint8 &value, uint8 &adj_value) { } | |
| CUDA_CALLABLE inline void adj_atomic_minmax(int16* buf, int16* adj_buf, const int16 &value, int16 &adj_value) { } | |
| CUDA_CALLABLE inline void adj_atomic_minmax(uint16* buf, uint16* adj_buf, const uint16 &value, uint16 &adj_value) { } | |
| CUDA_CALLABLE inline void adj_atomic_minmax(int32* buf, int32* adj_buf, const int32 &value, int32 &adj_value) { } | |
| CUDA_CALLABLE inline void adj_atomic_minmax(uint32* buf, uint32* adj_buf, const uint32 &value, uint32 &adj_value) { } | |
| CUDA_CALLABLE inline void adj_atomic_minmax(int64* buf, int64* adj_buf, const int64 &value, int64 &adj_value) { } | |
| CUDA_CALLABLE inline void adj_atomic_minmax(uint64* buf, uint64* adj_buf, const uint64 &value, uint64 &adj_value) { } | |
| CUDA_CALLABLE inline void adj_atomic_minmax(bool* buf, bool* adj_buf, const bool &value, bool &adj_value) { } | |
| } // namespace wp | |
| // bool and printf are defined outside of the wp namespace in crt.h, hence | |
| // their adjoint counterparts are also defined in the global namespace. | |
| template <typename T> | |
| CUDA_CALLABLE inline void adj_bool(T, T&, bool) {} | |
| inline CUDA_CALLABLE void adj_printf(const char* fmt, ...) {} | |
| //-------------- | |
| namespace wp | |
| { | |
| // dot for scalar types just to make some templates compile for scalar/vector | |
| inline CUDA_CALLABLE float dot(float a, float b) { return mul(a, b); } | |
| inline CUDA_CALLABLE void adj_dot(float a, float b, float& adj_a, float& adj_b, float adj_ret) { adj_mul(a, b, adj_a, adj_b, adj_ret); } | |
| inline CUDA_CALLABLE float tensordot(float a, float b) { return mul(a, b); } | |
| DECLARE_INTERP_FUNCS(float16) | |
| DECLARE_INTERP_FUNCS(float32) | |
| DECLARE_INTERP_FUNCS(float64) | |
| inline CUDA_CALLABLE void print(const str s) | |
| { | |
| printf("%s\n", s); | |
| } | |
| inline CUDA_CALLABLE void print(int i) | |
| { | |
| printf("%d\n", i); | |
| } | |
| inline CUDA_CALLABLE void print(short i) | |
| { | |
| printf("%hd\n", i); | |
| } | |
| inline CUDA_CALLABLE void print(long i) | |
| { | |
| printf("%ld\n", i); | |
| } | |
| inline CUDA_CALLABLE void print(long long i) | |
| { | |
| printf("%lld\n", i); | |
| } | |
| inline CUDA_CALLABLE void print(unsigned i) | |
| { | |
| printf("%u\n", i); | |
| } | |
| inline CUDA_CALLABLE void print(unsigned short i) | |
| { | |
| printf("%hu\n", i); | |
| } | |
| inline CUDA_CALLABLE void print(unsigned long i) | |
| { | |
| printf("%lu\n", i); | |
| } | |
| inline CUDA_CALLABLE void print(unsigned long long i) | |
| { | |
| printf("%llu\n", i); | |
| } | |
| template<unsigned Length, typename Type> | |
| inline CUDA_CALLABLE void print(vec_t<Length, Type> v) | |
| { | |
| for( unsigned i=0; i < Length; ++i ) | |
| { | |
| printf("%g ", float(v[i])); | |
| } | |
| printf("\n"); | |
| } | |
| template<typename Type> | |
| inline CUDA_CALLABLE void print(quat_t<Type> i) | |
| { | |
| printf("%g %g %g %g\n", float(i.x), float(i.y), float(i.z), float(i.w)); | |
| } | |
| template<unsigned Rows,unsigned Cols,typename Type> | |
| inline CUDA_CALLABLE void print(const mat_t<Rows,Cols,Type> &m) | |
| { | |
| for( unsigned i=0; i< Rows; ++i ) | |
| { | |
| for( unsigned j=0; j< Cols; ++j ) | |
| { | |
| printf("%g ",float(m.data[i][j])); | |
| } | |
| printf("\n"); | |
| } | |
| } | |
| template<typename Type> | |
| inline CUDA_CALLABLE void print(transform_t<Type> t) | |
| { | |
| printf("(%g %g %g) (%g %g %g %g)\n", float(t.p[0]), float(t.p[1]), float(t.p[2]), float(t.q.x), float(t.q.y), float(t.q.z), float(t.q.w)); | |
| } | |
| inline CUDA_CALLABLE void adj_print(int i, int adj_i) { printf("%d adj: %d\n", i, adj_i); } | |
| inline CUDA_CALLABLE void adj_print(float f, float adj_f) { printf("%g adj: %g\n", f, adj_f); } | |
| inline CUDA_CALLABLE void adj_print(short f, short adj_f) { printf("%hd adj: %hd\n", f, adj_f); } | |
| inline CUDA_CALLABLE void adj_print(long f, long adj_f) { printf("%ld adj: %ld\n", f, adj_f); } | |
| inline CUDA_CALLABLE void adj_print(long long f, long long adj_f) { printf("%lld adj: %lld\n", f, adj_f); } | |
| inline CUDA_CALLABLE void adj_print(unsigned f, unsigned adj_f) { printf("%u adj: %u\n", f, adj_f); } | |
| inline CUDA_CALLABLE void adj_print(unsigned short f, unsigned short adj_f) { printf("%hu adj: %hu\n", f, adj_f); } | |
| inline CUDA_CALLABLE void adj_print(unsigned long f, unsigned long adj_f) { printf("%lu adj: %lu\n", f, adj_f); } | |
| inline CUDA_CALLABLE void adj_print(unsigned long long f, unsigned long long adj_f) { printf("%llu adj: %llu\n", f, adj_f); } | |
| inline CUDA_CALLABLE void adj_print(half h, half adj_h) { printf("%g adj: %g\n", half_to_float(h), half_to_float(adj_h)); } | |
| inline CUDA_CALLABLE void adj_print(double f, double adj_f) { printf("%g adj: %g\n", f, adj_f); } | |
| template<unsigned Length, typename Type> | |
| inline CUDA_CALLABLE void adj_print(vec_t<Length, Type> v, vec_t<Length, Type>& adj_v) { printf("%g %g adj: %g %g \n", v[0], v[1], adj_v[0], adj_v[1]); } | |
| template<unsigned Rows, unsigned Cols, typename Type> | |
| inline CUDA_CALLABLE void adj_print(mat_t<Rows, Cols, Type> m, mat_t<Rows, Cols, Type>& adj_m) { } | |
| template<typename Type> | |
| inline CUDA_CALLABLE void adj_print(quat_t<Type> q, quat_t<Type>& adj_q) { printf("%g %g %g %g adj: %g %g %g %g\n", q.x, q.y, q.z, q.w, adj_q.x, adj_q.y, adj_q.z, adj_q.w); } | |
| template<typename Type> | |
| inline CUDA_CALLABLE void adj_print(transform_t<Type> t, transform_t<Type>& adj_t) {} | |
| inline CUDA_CALLABLE void adj_print(str t, str& adj_t) {} | |
| template <typename T> | |
| inline CUDA_CALLABLE void expect_eq(const T& actual, const T& expected) | |
| { | |
| if (!(actual == expected)) | |
| { | |
| printf("Error, expect_eq() failed:\n"); | |
| printf("\t Expected: "); print(expected); | |
| printf("\t Actual: "); print(actual); | |
| } | |
| } | |
| template <typename T> | |
| inline CUDA_CALLABLE void adj_expect_eq(const T& a, const T& b, T& adj_a, T& adj_b) | |
| { | |
| // nop | |
| } | |
| template <typename T> | |
| inline CUDA_CALLABLE void expect_neq(const T& actual, const T& expected) | |
| { | |
| if (actual == expected) | |
| { | |
| printf("Error, expect_neq() failed:\n"); | |
| printf("\t Expected: "); print(expected); | |
| printf("\t Actual: "); print(actual); | |
| } | |
| } | |
| template <typename T> | |
| inline CUDA_CALLABLE void adj_expect_neq(const T& a, const T& b, T& adj_a, T& adj_b) | |
| { | |
| // nop | |
| } | |
| template <typename T> | |
| inline CUDA_CALLABLE void expect_near(const T& actual, const T& expected, const T& tolerance) | |
| { | |
| if (abs(actual - expected) > tolerance) | |
| { | |
| printf("Error, expect_near() failed with tolerance "); print(tolerance); | |
| printf("\t Expected: "); print(expected); | |
| printf("\t Actual: "); print(actual); | |
| } | |
| } | |
| inline CUDA_CALLABLE void expect_near(const vec3& actual, const vec3& expected, const float& tolerance) | |
| { | |
| const float diff = max(max(abs(actual[0] - expected[0]), abs(actual[1] - expected[1])), abs(actual[2] - expected[2])); | |
| if (diff > tolerance) | |
| { | |
| printf("Error, expect_near() failed with tolerance "); print(tolerance); | |
| printf("\t Expected: "); print(expected); | |
| printf("\t Actual: "); print(actual); | |
| } | |
| } | |
| template <typename T> | |
| inline CUDA_CALLABLE void adj_expect_near(const T& actual, const T& expected, const T& tolerance, T& adj_actual, T& adj_expected, T& adj_tolerance) | |
| { | |
| // nop | |
| } | |
| inline CUDA_CALLABLE void adj_expect_near(const vec3& actual, const vec3& expected, float tolerance, vec3& adj_actual, vec3& adj_expected, float adj_tolerance) | |
| { | |
| // nop | |
| } | |
| } // namespace wp | |
| // include array.h so we have the print, isfinite functions for the inner array types defined | |