| #define SLANG_PRELUDE_EXPORT |
|
|
| #ifdef __CUDACC_RTC__ |
| #define SLANG_CUDA_RTC 1 |
| #else |
| #define SLANG_CUDA_RTC 0 |
| #endif |
|
|
| #if SLANG_CUDA_RTC |
|
|
| #else |
|
|
| #include <cstdint> |
| #include <stdio.h> |
|
|
| #endif |
|
|
| |
| |
| |
| |
| |
|
|
#ifdef SLANG_CUDA_ENABLE_HALF
// Pull in CUDA's half-precision support, but suppress its built-in __half2
// operators: the prelude generates its own element-wise operators below.
// NOTE(review): later guards test this macro with `#if SLANG_CUDA_ENABLE_HALF`
// (e.g. around the half vector ops), which requires it to be defined to a
// non-zero value such as 1 rather than defined empty — confirm how the
// compiler driver defines it.
# define __CUDA_NO_HALF2_OPERATORS__
# include <cuda_fp16.h>
#endif
|
|
| #ifdef SLANG_CUDA_ENABLE_OPTIX |
| #include <optix.h> |
| #endif |
|
|
| |
// Byte offset of `member` within `type`, via the classic null-pointer idiom.
// NOTE(review): formally undefined behavior in standard C++; the standard
// `offsetof` macro would be the portable spelling — confirm NVRTC exposes it.
#ifndef SLANG_OFFSET_OF
# define SLANG_OFFSET_OF(type, member) (size_t)((char*)&(((type *)0)->member) - (char*)0)
#endif
|
|
| #ifndef SLANG_ALIGN_OF |
| # define SLANG_ALIGN_OF(type) __alignof__(type) |
| #endif |
|
|
| |
// Positive infinity as a float: 1e+300 * 1e+300 overflows double to +inf,
// which then converts to float +inf. Avoids depending on <math.h>/INFINITY.
#ifndef SLANG_INFINITY
# define SLANG_INFINITY ((float)(1e+300 * 1e+300))
#endif
|
|
| |
| #define SLANG_PRELUDE_ASSERT(x) |
|
|
| #ifndef SLANG_CUDA_WARP_SIZE |
| # define SLANG_CUDA_WARP_SIZE 32 |
| #endif |
|
|
| #define SLANG_CUDA_WARP_MASK (SLANG_CUDA_WARP_SIZE - 1) |
| #define SLANG_CUDA_WARP_BITMASK (~int(0)) |
|
|
| |
// Function qualifiers used throughout the prelude.
// Fix: SLANG_FORCE_INLINE was previously defined twice with identical bodies
// (benign but redundant); the duplicate second definition is removed.
#define SLANG_FORCE_INLINE inline

// All prelude functions are device-only.
#define SLANG_CUDA_CALL __device__

#define SLANG_INLINE inline
|
|
|
|
| |
| |
| |
|
|
| |
| |
| #define SLANG_BOUND_ASSERT(index, count) SLANG_PRELUDE_ASSERT(index < count); |
| #define SLANG_BOUND_ASSERT_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_PRELUDE_ASSERT(index <= (sizeInBytes - elemSize) && (index & 3) == 0); |
|
|
| |
| #define SLANG_BOUND_ZERO_INDEX(index, count) index = (index < count) ? index : 0; |
| #define SLANG_BOUND_ZERO_INDEX_BYTE_ADDRESS(index, elemSize, sizeInBytes) index = (index <= (sizeInBytes - elemSize)) ? index : 0; |
|
|
| |
| |
// Select the bounds "fix" behavior. When SLANG_ENABLE_BOUND_ZERO_INDEX is
// defined, an out-of-range index is clamped to 0 before use; otherwise the
// fix macros expand to nothing and out-of-range access is unchecked.
#ifdef SLANG_ENABLE_BOUND_ZERO_INDEX
# define SLANG_BOUND_FIX(index, count) SLANG_BOUND_ZERO_INDEX(index, count)
# define SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_ZERO_INDEX_BYTE_ADDRESS(index, elemSize, sizeInBytes)
// Fix: this previously expanded SLANG_BOUND_ZERO_INDEX twice (copy/paste
// duplication); a single expansion suffices since the clamp is idempotent.
# define SLANG_BOUND_FIX_FIXED_ARRAY(index, count) SLANG_BOUND_ZERO_INDEX(index, count)
#else
# define SLANG_BOUND_FIX(index, count)
# define SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes)
# define SLANG_BOUND_FIX_FIXED_ARRAY(index, count)
#endif
|
|
| #ifndef SLANG_BOUND_CHECK |
| # define SLANG_BOUND_CHECK(index, count) SLANG_BOUND_ASSERT(index, count) SLANG_BOUND_FIX(index, count) |
| #endif |
|
|
| #ifndef SLANG_BOUND_CHECK_BYTE_ADDRESS |
| # define SLANG_BOUND_CHECK_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_ASSERT_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) |
| #endif |
|
|
| #ifndef SLANG_BOUND_CHECK_FIXED_ARRAY |
| # define SLANG_BOUND_CHECK_FIXED_ARRAY(index, count) SLANG_BOUND_ASSERT(index, count) SLANG_BOUND_FIX_FIXED_ARRAY(index, count) |
| #endif |
|
|
| |
| |
| |
| |
| |
| |
| #ifndef SLANG_CUDA_BOUNDARY_MODE |
| # define SLANG_CUDA_BOUNDARY_MODE cudaBoundaryModeZero |
|
|
| |
| |
| |
| |
| |
| |
|
|
| # define SLANG_PTX_BOUNDARY_MODE "zero" |
| #endif |
|
|
// Minimal per-type metadata record; currently carries only the type's size
// in bytes.
struct TypeInfo
{
    size_t typeSize;
};
|
|
// Fixed-size array wrapper whose operator[] applies the configurable
// bounds-checking policy (SLANG_BOUND_CHECK_FIXED_ARRAY) before indexing.
template <typename T, size_t SIZE>
struct FixedArray
{
    SLANG_CUDA_CALL const T& operator[](size_t index) const { SLANG_BOUND_CHECK_FIXED_ARRAY(index, SIZE); return m_data[index]; }
    SLANG_CUDA_CALL T& operator[](size_t index) { SLANG_BOUND_CHECK_FIXED_ARRAY(index, SIZE); return m_data[index]; }

    T m_data[SIZE];
};
|
|
| |
| |
// Non-owning pointer+count view of an array; operator[] applies the
// configurable bounds-checking policy (SLANG_BOUND_CHECK) before indexing.
template <typename T>
struct Array
{
    SLANG_CUDA_CALL const T& operator[](size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; }
    SLANG_CUDA_CALL T& operator[](size_t index) { SLANG_BOUND_CHECK(index, count); return data[index]; }

    T* data;
    size_t count;
};
|
|
| |
| typedef unsigned long long CUtexObject; |
| typedef unsigned long long CUsurfObject; |
|
|
| |
| |
| |
| |
| struct SamplerStateUnused; |
| typedef SamplerStateUnused* SamplerState; |
|
|
|
|
| |
| |
| typedef size_t NonUniformResourceIndex; |
|
|
| |
| template <typename T, int ROWS, int COLS> |
| struct Matrix; |
|
|
| typedef int1 bool1; |
| typedef int2 bool2; |
| typedef int3 bool3; |
| typedef int4 bool4; |
|
|
| #if SLANG_CUDA_RTC |
|
|
| typedef signed char int8_t; |
| typedef short int16_t; |
| typedef int int32_t; |
| typedef long long int64_t; |
|
|
| typedef unsigned char uint8_t; |
| typedef unsigned short uint16_t; |
| typedef unsigned int uint32_t; |
| typedef unsigned long long uint64_t; |
|
|
| #endif |
|
|
| typedef long long longlong; |
| typedef unsigned long long ulonglong; |
|
|
| typedef unsigned char uchar; |
| typedef unsigned short ushort; |
| typedef unsigned int uint; |
|
|
// Type-punning helper: one 32-bit value viewed as unsigned, signed, or float.
union Union32
{
    uint32_t u;
    int32_t i;
    float f;
};
|
|
// Type-punning helper: one 64-bit value viewed as unsigned, signed, or double.
union Union64
{
    uint64_t u;
    int64_t i;
    double d;
};
|
|
// Converts an arbitrary value to float, mirroring the CUDA `make_<type>`
// helper naming used by generated code.
template<typename T>
SLANG_FORCE_INLINE SLANG_CUDA_CALL float make_float(T val)
{
    return static_cast<float>(val);
}
|
|
// Floating-point remainder helpers backing the generated '%' operators for
// float/double vectors and matrices.
SLANG_FORCE_INLINE SLANG_CUDA_CALL float _slang_fmod(float x, float y)
{
    return ::fmodf(x, y);
}
SLANG_FORCE_INLINE SLANG_CUDA_CALL double _slang_fmod(double x, double y)
{
    return ::fmod(x, y);
}
|
|
| #if SLANG_CUDA_ENABLE_HALF |
|
|
| |
| struct __half1 { __half x; }; |
| struct __align__(4) __half3 { __half x, y, z; }; |
| struct __align__(4) __half4 { __half x, y, z, w; }; |
| #endif |
|
|
| #define SLANG_VECTOR_GET_ELEMENT(T) \ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##1 x, int index) { return ((T*)(&x))[index]; }\ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##2 x, int index) { return ((T*)(&x))[index]; }\ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##3 x, int index) { return ((T*)(&x))[index]; }\ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_vector_get_element(T##4 x, int index) { return ((T*)(&x))[index]; } |
| SLANG_VECTOR_GET_ELEMENT(int) |
| SLANG_VECTOR_GET_ELEMENT(uint) |
| SLANG_VECTOR_GET_ELEMENT(short) |
| SLANG_VECTOR_GET_ELEMENT(ushort) |
| SLANG_VECTOR_GET_ELEMENT(char) |
| SLANG_VECTOR_GET_ELEMENT(uchar) |
| SLANG_VECTOR_GET_ELEMENT(longlong) |
| SLANG_VECTOR_GET_ELEMENT(ulonglong) |
| SLANG_VECTOR_GET_ELEMENT(float) |
| SLANG_VECTOR_GET_ELEMENT(double) |
|
|
| #define SLANG_VECTOR_GET_ELEMENT_PTR(T) \ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##1* x, int index) { return ((T*)(x)) + index; }\ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##2* x, int index) { return ((T*)(x)) + index; }\ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##3* x, int index) { return ((T*)(x)) + index; }\ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL T* _slang_vector_get_element_ptr(T##4* x, int index) { return ((T*)(x)) + index; } |
| SLANG_VECTOR_GET_ELEMENT_PTR(int) |
| SLANG_VECTOR_GET_ELEMENT_PTR(uint) |
| SLANG_VECTOR_GET_ELEMENT_PTR(short) |
| SLANG_VECTOR_GET_ELEMENT_PTR(ushort) |
| SLANG_VECTOR_GET_ELEMENT_PTR(char) |
| SLANG_VECTOR_GET_ELEMENT_PTR(uchar) |
| SLANG_VECTOR_GET_ELEMENT_PTR(longlong) |
| SLANG_VECTOR_GET_ELEMENT_PTR(ulonglong) |
| SLANG_VECTOR_GET_ELEMENT_PTR(float) |
| SLANG_VECTOR_GET_ELEMENT_PTR(double) |
|
|
| #if SLANG_CUDA_ENABLE_HALF |
| SLANG_VECTOR_GET_ELEMENT(__half) |
| SLANG_VECTOR_GET_ELEMENT_PTR(__half) |
| #endif |
|
|
| #define SLANG_CUDA_VECTOR_BINARY_OP(T, n, op) \ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n operator op(T##n thisVal, T##n other) \ |
| { \ |
| T##n result;\ |
| for (int i = 0; i < n; i++) \ |
| *_slang_vector_get_element_ptr(&result, i) = _slang_vector_get_element(thisVal,i) op _slang_vector_get_element(other,i); \ |
| return result;\ |
| } |
| #define SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, op) \ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL bool##n operator op(T##n thisVal, T##n other) \ |
| { \ |
| bool##n result;\ |
| for (int i = 0; i < n; i++) \ |
| *_slang_vector_get_element_ptr(&result, i) = (int)(_slang_vector_get_element(thisVal,i) op _slang_vector_get_element(other,i)); \ |
| return result;\ |
| } |
| #define SLANG_CUDA_VECTOR_UNARY_OP(T, n, op) \ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n operator op(T##n thisVal) \ |
| { \ |
| T##n result;\ |
| for (int i = 0; i < n; i++) \ |
| *_slang_vector_get_element_ptr(&result, i) = op _slang_vector_get_element(thisVal,i); \ |
| return result;\ |
| } |
|
|
| #define SLANG_CUDA_VECTOR_INT_OP(T, n) \ |
| SLANG_CUDA_VECTOR_BINARY_OP(T, n, +)\ |
| SLANG_CUDA_VECTOR_BINARY_OP(T, n, -)\ |
| SLANG_CUDA_VECTOR_BINARY_OP(T, n, *)\ |
| SLANG_CUDA_VECTOR_BINARY_OP(T, n, /)\ |
| SLANG_CUDA_VECTOR_BINARY_OP(T, n, %)\ |
| SLANG_CUDA_VECTOR_BINARY_OP(T, n, ^)\ |
| SLANG_CUDA_VECTOR_BINARY_OP(T, n, &)\ |
| SLANG_CUDA_VECTOR_BINARY_OP(T, n, |)\ |
| SLANG_CUDA_VECTOR_BINARY_OP(T, n, &&)\ |
| SLANG_CUDA_VECTOR_BINARY_OP(T, n, ||)\ |
| SLANG_CUDA_VECTOR_BINARY_OP(T, n, >>)\ |
| SLANG_CUDA_VECTOR_BINARY_OP(T, n, <<)\ |
| SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >)\ |
| SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <)\ |
| SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >=)\ |
| SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <=)\ |
| SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, ==)\ |
| SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, !=)\ |
| SLANG_CUDA_VECTOR_UNARY_OP(T, n, !)\ |
| SLANG_CUDA_VECTOR_UNARY_OP(T, n, -)\ |
| SLANG_CUDA_VECTOR_UNARY_OP(T, n, ~) |
|
|
| #define SLANG_CUDA_VECTOR_INT_OPS(T) \ |
| SLANG_CUDA_VECTOR_INT_OP(T, 2) \ |
| SLANG_CUDA_VECTOR_INT_OP(T, 3) \ |
| SLANG_CUDA_VECTOR_INT_OP(T, 4) |
|
|
| SLANG_CUDA_VECTOR_INT_OPS(int) |
| SLANG_CUDA_VECTOR_INT_OPS(uint) |
| SLANG_CUDA_VECTOR_INT_OPS(ushort) |
| SLANG_CUDA_VECTOR_INT_OPS(short) |
| SLANG_CUDA_VECTOR_INT_OPS(char) |
| SLANG_CUDA_VECTOR_INT_OPS(uchar) |
| SLANG_CUDA_VECTOR_INT_OPS(longlong) |
| SLANG_CUDA_VECTOR_INT_OPS(ulonglong) |
|
|
| #define SLANG_CUDA_VECTOR_FLOAT_OP(T, n) \ |
| SLANG_CUDA_VECTOR_BINARY_OP(T, n, +)\ |
| SLANG_CUDA_VECTOR_BINARY_OP(T, n, -)\ |
| SLANG_CUDA_VECTOR_BINARY_OP(T, n, *)\ |
| SLANG_CUDA_VECTOR_BINARY_OP(T, n, /)\ |
| SLANG_CUDA_VECTOR_BINARY_OP(T, n, &&)\ |
| SLANG_CUDA_VECTOR_BINARY_OP(T, n, ||)\ |
| SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >)\ |
| SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <)\ |
| SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, >=)\ |
| SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, <=)\ |
| SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, ==)\ |
| SLANG_CUDA_VECTOR_BINARY_COMPARE_OP(T, n, !=)\ |
| SLANG_CUDA_VECTOR_UNARY_OP(T, n, -) |
| #define SLANG_CUDA_VECTOR_FLOAT_OPS(T) \ |
| SLANG_CUDA_VECTOR_FLOAT_OP(T, 2) \ |
| SLANG_CUDA_VECTOR_FLOAT_OP(T, 3) \ |
| SLANG_CUDA_VECTOR_FLOAT_OP(T, 4) |
|
|
| SLANG_CUDA_VECTOR_FLOAT_OPS(float) |
| SLANG_CUDA_VECTOR_FLOAT_OPS(double) |
| #if SLANG_CUDA_ENABLE_HALF |
| SLANG_CUDA_VECTOR_FLOAT_OPS(__half) |
| #endif |
| #define SLANG_CUDA_FLOAT_VECTOR_MOD_IMPL(T, n)\ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n operator%(const T##n& left, const T##n& right) \ |
| {\ |
| T##n result;\ |
| for (int i = 0; i < n; i++) \ |
| *_slang_vector_get_element_ptr(&result, i) = _slang_fmod(_slang_vector_get_element(left,i), _slang_vector_get_element(right,i)); \ |
| return result;\ |
| } |
| #define SLANG_CUDA_FLOAT_VECTOR_MOD(T) \ |
| SLANG_CUDA_FLOAT_VECTOR_MOD_IMPL(T, 2)\ |
| SLANG_CUDA_FLOAT_VECTOR_MOD_IMPL(T, 3)\ |
| SLANG_CUDA_FLOAT_VECTOR_MOD_IMPL(T, 4) |
|
|
| SLANG_CUDA_FLOAT_VECTOR_MOD(float) |
| SLANG_CUDA_FLOAT_VECTOR_MOD(double) |
|
|
| #if SLANG_CUDA_RTC || SLANG_CUDA_ENABLE_HALF |
| #define SLANG_MAKE_VECTOR(T) \ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL T##2 make_##T##2(T x, T y) { return T##2{x, y}; }\ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL T##3 make_##T##3(T x, T y, T z) { return T##3{ x, y, z }; }\ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL T##4 make_##T##4(T x, T y, T z, T w) { return T##4{ x, y, z, w }; } |
| #endif |
|
|
| #if SLANG_CUDA_RTC |
| SLANG_MAKE_VECTOR(int) |
| SLANG_MAKE_VECTOR(uint) |
| SLANG_MAKE_VECTOR(short) |
| SLANG_MAKE_VECTOR(ushort) |
| SLANG_MAKE_VECTOR(char) |
| SLANG_MAKE_VECTOR(uchar) |
| SLANG_MAKE_VECTOR(float) |
| SLANG_MAKE_VECTOR(double) |
| SLANG_MAKE_VECTOR(longlong) |
| SLANG_MAKE_VECTOR(ulonglong) |
| #endif |
|
|
| #if SLANG_CUDA_ENABLE_HALF |
| SLANG_MAKE_VECTOR(__half) |
| #endif |
|
|
// bool vectors are stored as int vectors (see the bool1..bool4 typedefs);
// each component holds the integral value of the bool (1 or 0).
SLANG_FORCE_INLINE SLANG_CUDA_CALL bool1 make_bool1(bool x) { bool1 v; v.x = x; return v; }
SLANG_FORCE_INLINE SLANG_CUDA_CALL bool2 make_bool2(bool x, bool y) { bool2 v; v.x = x; v.y = y; return v; }
SLANG_FORCE_INLINE SLANG_CUDA_CALL bool3 make_bool3(bool x, bool y, bool z) { bool3 v; v.x = x; v.y = y; v.z = z; return v; }
SLANG_FORCE_INLINE SLANG_CUDA_CALL bool4 make_bool4(bool x, bool y, bool z, bool w) { bool4 v; v.x = x; v.y = y; v.z = z; v.w = w; return v; }
// Splat overloads: replicate one bool across all components.
SLANG_FORCE_INLINE SLANG_CUDA_CALL bool2 make_bool2(bool x) { return make_bool2(x, x); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL bool3 make_bool3(bool x) { return make_bool3(x, x, x); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL bool4 make_bool4(bool x) { return make_bool4(x, x, x, x); }
|
|
| #if SLANG_CUDA_RTC |
| #define SLANG_MAKE_VECTOR_FROM_SCALAR(T) \ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL T##1 make_##T##1(T x) { return T##1{x}; }\ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL T##2 make_##T##2(T x) { return make_##T##2(x, x); }\ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL T##3 make_##T##3(T x) { return make_##T##3(x, x, x); }\ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL T##4 make_##T##4(T x) { return make_##T##4(x, x, x, x); } |
| #else |
| #define SLANG_MAKE_VECTOR_FROM_SCALAR(T) \ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL T##2 make_##T##2(T x) { return make_##T##2(x, x); }\ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL T##3 make_##T##3(T x) { return make_##T##3(x, x, x); }\ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL T##4 make_##T##4(T x) { return make_##T##4(x, x, x, x); } |
| #endif |
| SLANG_MAKE_VECTOR_FROM_SCALAR(int) |
| SLANG_MAKE_VECTOR_FROM_SCALAR(uint) |
| SLANG_MAKE_VECTOR_FROM_SCALAR(short) |
| SLANG_MAKE_VECTOR_FROM_SCALAR(ushort) |
| SLANG_MAKE_VECTOR_FROM_SCALAR(char) |
| SLANG_MAKE_VECTOR_FROM_SCALAR(uchar) |
| SLANG_MAKE_VECTOR_FROM_SCALAR(longlong) |
| SLANG_MAKE_VECTOR_FROM_SCALAR(ulonglong) |
| SLANG_MAKE_VECTOR_FROM_SCALAR(float) |
| SLANG_MAKE_VECTOR_FROM_SCALAR(double) |
| #if SLANG_CUDA_ENABLE_HALF |
| SLANG_MAKE_VECTOR_FROM_SCALAR(__half) |
| #if !SLANG_CUDA_RTC |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL __half1 make___half1(__half x) { return __half1{x}; } |
| #endif |
| #endif |
|
|
| #define SLANG_CUDA_VECTOR_ATOMIC_BINARY_IMPL(Fn,T,N) \ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL T##N Fn(T##N* address, T##N val) \ |
| {\ |
| T##N result; \ |
| for (int i = 0; i < N; i++) \ |
| *_slang_vector_get_element_ptr(&result, i) = Fn(_slang_vector_get_element_ptr(address, i), _slang_vector_get_element(val, i)); \ |
| return result; \ |
| }\ |
| |
| #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 900 |
| SLANG_CUDA_VECTOR_ATOMIC_BINARY_IMPL(atomicAdd, float, 2) |
| SLANG_CUDA_VECTOR_ATOMIC_BINARY_IMPL(atomicAdd, float, 4) |
| #endif |
| SLANG_CUDA_VECTOR_ATOMIC_BINARY_IMPL(atomicAdd, float, 3) |
| SLANG_CUDA_VECTOR_ATOMIC_BINARY_IMPL(atomicAdd, int, 2) |
| SLANG_CUDA_VECTOR_ATOMIC_BINARY_IMPL(atomicAdd, int, 3) |
| SLANG_CUDA_VECTOR_ATOMIC_BINARY_IMPL(atomicAdd, int, 4) |
| SLANG_CUDA_VECTOR_ATOMIC_BINARY_IMPL(atomicAdd, uint, 2) |
| SLANG_CUDA_VECTOR_ATOMIC_BINARY_IMPL(atomicAdd, uint, 3) |
| SLANG_CUDA_VECTOR_ATOMIC_BINARY_IMPL(atomicAdd, uint, 4) |
| SLANG_CUDA_VECTOR_ATOMIC_BINARY_IMPL(atomicAdd, ulonglong, 2) |
| SLANG_CUDA_VECTOR_ATOMIC_BINARY_IMPL(atomicAdd, ulonglong, 3) |
| SLANG_CUDA_VECTOR_ATOMIC_BINARY_IMPL(atomicAdd, ulonglong, 4) |
|
|
| template<typename T, int n> |
| struct GetVectorTypeImpl {}; |
|
|
| #define GET_VECTOR_TYPE_IMPL(T, n)\ |
| template<>\ |
| struct GetVectorTypeImpl<T,n>\ |
| {\ |
| typedef T##n type;\ |
| static SLANG_FORCE_INLINE SLANG_CUDA_CALL T##n fromScalar(T v) { return make_##T##n(v); } \ |
| }; |
| #define GET_VECTOR_TYPE_IMPL_N(T)\ |
| GET_VECTOR_TYPE_IMPL(T, 1)\ |
| GET_VECTOR_TYPE_IMPL(T, 2)\ |
| GET_VECTOR_TYPE_IMPL(T, 3)\ |
| GET_VECTOR_TYPE_IMPL(T, 4) |
|
|
| GET_VECTOR_TYPE_IMPL_N(int) |
| GET_VECTOR_TYPE_IMPL_N(uint) |
| GET_VECTOR_TYPE_IMPL_N(short) |
| GET_VECTOR_TYPE_IMPL_N(ushort) |
| GET_VECTOR_TYPE_IMPL_N(char) |
| GET_VECTOR_TYPE_IMPL_N(uchar) |
| GET_VECTOR_TYPE_IMPL_N(longlong) |
| GET_VECTOR_TYPE_IMPL_N(ulonglong) |
| GET_VECTOR_TYPE_IMPL_N(float) |
| GET_VECTOR_TYPE_IMPL_N(double) |
| #if SLANG_CUDA_ENABLE_HALF |
| GET_VECTOR_TYPE_IMPL_N(__half) |
| #endif |
| template<typename T, int n> |
| using Vector = typename GetVectorTypeImpl<T, n>::type; |
|
|
// Converts a vector of OtherT with m components into a vector of T with n
// components: element i is cast from other[i] when i < m, otherwise zero.
template<typename T, int n, typename OtherT, int m>
SLANG_FORCE_INLINE SLANG_CUDA_CALL Vector<T, n> _slang_vector_reshape(const Vector<OtherT, m> other)
{
    Vector<T, n> result;
    for (int i = 0; i < n; i++)
    {
        // NOTE(review): a T(0) is converted to OtherT here; OtherT(0) would be
        // the clearer spelling — confirm no conversion-dependent intent.
        OtherT otherElement = T(0);
        if (i < m)
            otherElement = _slang_vector_get_element(other, i);
        *_slang_vector_get_element_ptr(&result, i) = (T)otherElement;
    }
    return result;
}
|
|
// Row-major matrix: ROWS vectors of COLS elements each. operator[] returns a
// row reference and performs no bounds checking.
template <typename T, int ROWS, int COLS>
struct Matrix
{
    Vector<T, COLS> rows[ROWS];
    SLANG_FORCE_INLINE SLANG_CUDA_CALL Vector<T, COLS>& operator[](size_t index) { return rows[index]; }
};
|
|
|
|
// Builds a ROWS x COLS matrix with every element set to `scalar`.
template<typename T, int ROWS, int COLS>
SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T scalar)
{
    Matrix<T, ROWS, COLS> result;
    // Build the splatted row once, then copy it into each matrix row.
    const Vector<T, COLS> filledRow = GetVectorTypeImpl<T, COLS>::fromScalar(scalar);
    for (int r = 0; r < ROWS; ++r)
        result.rows[r] = filledRow;
    return result;
}
|
|
// Builds a matrix from a single explicit row; rows beyond index 0 (if any)
// are left uninitialized.
template<typename T, int ROWS, int COLS>
SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Vector<T, COLS>& row0)
{
    Matrix<T, ROWS, COLS> m;
    m.rows[0] = row0;
    return m;
}
|
|
// Builds a matrix from two explicit rows; rows beyond index 1 (if any) are
// left uninitialized.
template<typename T, int ROWS, int COLS>
SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Vector<T, COLS>& row0, const Vector<T, COLS>& row1)
{
    Matrix<T, ROWS, COLS> m;
    const Vector<T, COLS>* src[] = {&row0, &row1};
    for (int r = 0; r < 2; ++r)
        m.rows[r] = *src[r];
    return m;
}
|
|
// Builds a matrix from three explicit rows; rows beyond index 2 (if any) are
// left uninitialized.
template<typename T, int ROWS, int COLS>
SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Vector<T, COLS>& row0, const Vector<T, COLS>& row1, const Vector<T, COLS>& row2)
{
    Matrix<T, ROWS, COLS> m;
    const Vector<T, COLS>* src[] = {&row0, &row1, &row2};
    for (int r = 0; r < 3; ++r)
        m.rows[r] = *src[r];
    return m;
}
|
|
// Builds a matrix from four explicit rows.
template<typename T, int ROWS, int COLS>
SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Vector<T, COLS>& row0, const Vector<T, COLS>& row1, const Vector<T, COLS>& row2, const Vector<T, COLS>& row3)
{
    Matrix<T, ROWS, COLS> m;
    const Vector<T, COLS>* src[] = {&row0, &row1, &row2, &row3};
    for (int r = 0; r < 4; ++r)
        m.rows[r] = *src[r];
    return m;
}
|
|
// Converts/resizes a matrix of U into a matrix of T, copying the overlapping
// top-left region element by element with a cast. Elements outside that
// region are left uninitialized.
template<typename T, int ROWS, int COLS, typename U, int otherRow, int otherCol>
SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(const Matrix<U, otherRow, otherCol>& other)
{
    Matrix<T, ROWS, COLS> result;
    const int copyRows = (ROWS < otherRow) ? ROWS : otherRow;
    const int copyCols = (COLS < otherCol) ? COLS : otherCol;
    for (int r = 0; r < copyRows; ++r)
        for (int c = 0; c < copyCols; ++c)
            *_slang_vector_get_element_ptr(result.rows + r, c) = (T)_slang_vector_get_element(other.rows[r], c);
    return result;
}
|
|
// Builds a 2x2 matrix from four scalars in row-major order.
// Only meaningful for 2x2 instantiations (writes rows 0-1, columns 0-1).
template<typename T, int ROWS, int COLS>
SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3)
{
    Matrix<T, ROWS, COLS> rs;
    const T vals[4] = {v0, v1, v2, v3};
    for (int r = 0; r < 2; ++r)
        for (int c = 0; c < 2; ++c)
            *_slang_vector_get_element_ptr(rs.rows + r, c) = vals[r * 2 + c];
    return rs;
}
|
|
// Builds a 6-element matrix from scalars in row-major order:
// 2x3 when COLS == 3, otherwise 3x2.
template<typename T, int ROWS, int COLS>
SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3, T v4, T v5)
{
    Matrix<T, ROWS, COLS> rs;
    const T vals[6] = {v0, v1, v2, v3, v4, v5};
    const int cols = (COLS == 3) ? 3 : 2;
    const int rows = 6 / cols;
    for (int r = 0; r < rows; ++r)
        for (int c = 0; c < cols; ++c)
            *_slang_vector_get_element_ptr(rs.rows + r, c) = vals[r * cols + c];
    return rs;
}
|
|
// Builds an 8-element matrix from scalars in row-major order:
// 2x4 when COLS == 4, otherwise 4x2.
template<typename T, int ROWS, int COLS>
SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7)
{
    Matrix<T, ROWS, COLS> rs;
    const T vals[8] = {v0, v1, v2, v3, v4, v5, v6, v7};
    const int cols = (COLS == 4) ? 4 : 2;
    const int rows = 8 / cols;
    for (int r = 0; r < rows; ++r)
        for (int c = 0; c < cols; ++c)
            *_slang_vector_get_element_ptr(rs.rows + r, c) = vals[r * cols + c];
    return rs;
}
|
|
// Builds a 3x3 matrix from nine scalars in row-major order.
template<typename T, int ROWS, int COLS>
SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8)
{
    Matrix<T, ROWS, COLS> rs;
    const T vals[9] = {v0, v1, v2, v3, v4, v5, v6, v7, v8};
    for (int r = 0; r < 3; ++r)
        for (int c = 0; c < 3; ++c)
            *_slang_vector_get_element_ptr(rs.rows + r, c) = vals[r * 3 + c];
    return rs;
}
|
|
// Builds a 12-element matrix from scalars in row-major order:
// 3x4 when COLS == 4, otherwise 4x3.
template<typename T, int ROWS, int COLS>
SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11)
{
    Matrix<T, ROWS, COLS> rs;
    const T vals[12] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11};
    const int cols = (COLS == 4) ? 4 : 3;
    const int rows = 12 / cols;
    for (int r = 0; r < rows; ++r)
        for (int c = 0; c < cols; ++c)
            *_slang_vector_get_element_ptr(rs.rows + r, c) = vals[r * cols + c];
    return rs;
}
|
|
// Builds a 4x4 matrix from sixteen scalars in row-major order.
template<typename T, int ROWS, int COLS>
SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, ROWS, COLS> makeMatrix(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15)
{
    Matrix<T, ROWS, COLS> rs;
    const T vals[16] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
    for (int r = 0; r < 4; ++r)
        for (int c = 0; c < 4; ++c)
            *_slang_vector_get_element_ptr(rs.rows + r, c) = vals[r * 4 + c];
    return rs;
}
|
|
| #define SLANG_MATRIX_BINARY_OP(T, op) \ |
| template<int R, int C> \ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, R, C> operator op(const Matrix<T, R, C>& thisVal, const Matrix<T, R, C>& other) \ |
| { \ |
| Matrix<T, R, C> result;\ |
| for (int i = 0; i < R; i++) \ |
| for (int j = 0; j < C; j++) \ |
| *_slang_vector_get_element_ptr(result.rows+i,j) = _slang_vector_get_element(thisVal.rows[i], j) op _slang_vector_get_element(other.rows[i], j); \ |
| return result;\ |
| } |
|
|
| #define SLANG_MATRIX_UNARY_OP(T, op) \ |
| template<int R, int C> \ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, R, C> operator op(const Matrix<T, R, C>& thisVal) \ |
| { \ |
| Matrix<T, R, C> result;\ |
| for (int i = 0; i < R; i++) \ |
| for (int j = 0; j < C; j++) \ |
| *_slang_vector_get_element_ptr(result.rows+i,j) = op _slang_vector_get_element(thisVal.rows[i], j); \ |
| return result;\ |
| } |
| #define SLANG_INT_MATRIX_OPS(T) \ |
| SLANG_MATRIX_BINARY_OP(T, +)\ |
| SLANG_MATRIX_BINARY_OP(T, -)\ |
| SLANG_MATRIX_BINARY_OP(T, *)\ |
| SLANG_MATRIX_BINARY_OP(T, / )\ |
| SLANG_MATRIX_BINARY_OP(T, &)\ |
| SLANG_MATRIX_BINARY_OP(T, |)\ |
| SLANG_MATRIX_BINARY_OP(T, &&)\ |
| SLANG_MATRIX_BINARY_OP(T, ||)\ |
| SLANG_MATRIX_BINARY_OP(T, ^)\ |
| SLANG_MATRIX_BINARY_OP(T, %)\ |
| SLANG_MATRIX_UNARY_OP(T, !)\ |
| SLANG_MATRIX_UNARY_OP(T, ~) |
| #define SLANG_FLOAT_MATRIX_OPS(T) \ |
| SLANG_MATRIX_BINARY_OP(T, +)\ |
| SLANG_MATRIX_BINARY_OP(T, -)\ |
| SLANG_MATRIX_BINARY_OP(T, *)\ |
| SLANG_MATRIX_BINARY_OP(T, /)\ |
| SLANG_MATRIX_UNARY_OP(T, -) |
| SLANG_INT_MATRIX_OPS(int) |
| SLANG_INT_MATRIX_OPS(uint) |
| SLANG_INT_MATRIX_OPS(short) |
| SLANG_INT_MATRIX_OPS(ushort) |
| SLANG_INT_MATRIX_OPS(char) |
| SLANG_INT_MATRIX_OPS(uchar) |
| SLANG_INT_MATRIX_OPS(longlong) |
| SLANG_INT_MATRIX_OPS(ulonglong) |
| SLANG_FLOAT_MATRIX_OPS(float) |
| SLANG_FLOAT_MATRIX_OPS(double) |
| #if SLANG_CUDA_ENABLE_HALF |
| SLANG_FLOAT_MATRIX_OPS(__half) |
| #endif |
| #define SLANG_MATRIX_INT_NEG_OP(T) \ |
| template<int R, int C>\ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, R, C> operator-(Matrix<T, R, C> thisVal) \ |
| { \ |
| Matrix<T, R, C> result;\ |
| for (int i = 0; i < R; i++) \ |
| for (int j = 0; j < C; j++) \ |
| *_slang_vector_get_element_ptr(result.rows+i,j) = 0 - _slang_vector_get_element(thisVal.rows[i], j); \ |
| return result;\ |
| } |
| SLANG_MATRIX_INT_NEG_OP(int) |
| SLANG_MATRIX_INT_NEG_OP(uint) |
| SLANG_MATRIX_INT_NEG_OP(short) |
| SLANG_MATRIX_INT_NEG_OP(ushort) |
| SLANG_MATRIX_INT_NEG_OP(char) |
| SLANG_MATRIX_INT_NEG_OP(uchar) |
| SLANG_MATRIX_INT_NEG_OP(longlong) |
| SLANG_MATRIX_INT_NEG_OP(ulonglong) |
|
|
| #define SLANG_FLOAT_MATRIX_MOD(T)\ |
| template<int R, int C> \ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<T, R, C> operator%(Matrix<T, R, C> left, Matrix<T, R, C> right) \ |
| {\ |
| Matrix<T, R, C> result;\ |
| for (int i = 0; i < R; i++) \ |
| for (int j = 0; j < C; j++) \ |
| *_slang_vector_get_element_ptr(result.rows+i,j) = _slang_fmod(_slang_vector_get_element(left.rows[i], j), _slang_vector_get_element(right.rows[i], j)); \ |
| return result;\ |
| } |
|
|
| SLANG_FLOAT_MATRIX_MOD(float) |
| SLANG_FLOAT_MATRIX_MOD(double) |
| #if SLANG_CUDA_ENABLE_HALF |
// Matrix '%' for __half: there is no half-precision fmod, so each element is
// widened to float, passed through _slang_fmod, and narrowed back to __half.
template<int R, int C>
SLANG_FORCE_INLINE SLANG_CUDA_CALL Matrix<__half, R, C> operator%(Matrix<__half, R, C> left, Matrix<__half, R, C> right)
{
    Matrix<__half, R, C> result;
    for (int i = 0; i < R; i++)
        for (int j = 0; j < C; j++)
            * _slang_vector_get_element_ptr(result.rows + i, j) = __float2half(_slang_fmod(__half2float(_slang_vector_get_element(left.rows[i], j)), __half2float(_slang_vector_get_element(right.rows[i], j))));
    return result;
}
| #endif |
| #undef SLANG_FLOAT_MATRIX_MOD |
| #undef SLANG_MATRIX_BINARY_OP |
| #undef SLANG_MATRIX_UNARY_OP |
| #undef SLANG_INT_MATRIX_OPS |
| #undef SLANG_FLOAT_MATRIX_OPS |
| #undef SLANG_MATRIX_INT_NEG_OP |
| #undef SLANG_FLOAT_MATRIX_MOD |
|
|
| #define SLANG_SELECT_IMPL(T, N)\ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL Vector<T, N> _slang_select(bool##N condition, Vector<T, N> v0, Vector<T, N> v1) \ |
| { \ |
| Vector<T, N> result; \ |
| for (int i = 0; i < N; i++) \ |
| { \ |
| *_slang_vector_get_element_ptr(&result, i) = _slang_vector_get_element(condition, i) ? _slang_vector_get_element(v0, i) : _slang_vector_get_element(v1, i); \ |
| } \ |
| return result; \ |
| } |
| #define SLANG_SELECT_T(T)\ |
| SLANG_SELECT_IMPL(T, 2)\ |
| SLANG_SELECT_IMPL(T, 3)\ |
| SLANG_SELECT_IMPL(T, 4) |
|
|
| SLANG_SELECT_T(int) |
| SLANG_SELECT_T(uint) |
| SLANG_SELECT_T(short) |
| SLANG_SELECT_T(ushort) |
| SLANG_SELECT_T(char) |
| SLANG_SELECT_T(uchar) |
| SLANG_SELECT_T(float) |
| SLANG_SELECT_T(double) |
|
|
// Scalar select: yields v0 when the condition holds, v1 otherwise.
template<typename T>
SLANG_FORCE_INLINE SLANG_CUDA_CALL T _slang_select(bool condition, T v0, T v1)
{
    if (condition)
        return v0;
    return v1;
}
|
|
| |
| |
| |
|
|
| #if SLANG_CUDA_ENABLE_HALF |
| SLANG_SELECT_T(__half) |
|
|
| |
|
|
| SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 __ushort_as_half(const ushort2& i) { return __halves2half2(__ushort_as_half(i.x), __ushort_as_half(i.y)); } |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL __half3 __ushort_as_half(const ushort3& i) { return __half3{__ushort_as_half(i.x), __ushort_as_half(i.y), __ushort_as_half(i.z)}; } |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 __ushort_as_half(const ushort4& i) { return __half4{ __ushort_as_half(i.x), __ushort_as_half(i.y), __ushort_as_half(i.z), __ushort_as_half(i.w) }; } |
|
|
| |
|
|
| SLANG_FORCE_INLINE SLANG_CUDA_CALL ushort2 __half_as_ushort(const __half2& i) { return make_ushort2(__half_as_ushort(i.x), __half_as_ushort(i.y)); } |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL ushort3 __half_as_ushort(const __half3& i) { return make_ushort3(__half_as_ushort(i.x), __half_as_ushort(i.y), __half_as_ushort(i.z)); } |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL ushort4 __half_as_ushort(const __half4& i) { return make_ushort4(__half_as_ushort(i.x), __half_as_ushort(i.y), __half_as_ushort(i.z), __half_as_ushort(i.w)); } |
|
|
| |
| |
| |
| |
| |
|
|
| template<> struct __nv_isurf_trait<__half> { typedef void type; }; |
| template<> struct __nv_isurf_trait<__half2> { typedef void type; }; |
| template<> struct __nv_isurf_trait<__half4> { typedef void type; }; |
|
|
| #define SLANG_DROP_PARENS(...) __VA_ARGS__ |
|
|
| #define SLANG_SURFACE_READ(FUNC_NAME, TYPE_ARGS, ARGS) \ |
| template <> \ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL __half FUNC_NAME<__half>(cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \ |
| { \ |
| return __ushort_as_half(FUNC_NAME<ushort>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \ |
| } \ |
| \ |
| template <> \ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL __half2 FUNC_NAME<__half2>(cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \ |
| { \ |
| return __ushort_as_half(FUNC_NAME<ushort2>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \ |
| } \ |
| \ |
| template <> \ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL __half4 FUNC_NAME<__half4>(cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \ |
| { \ |
| return __ushort_as_half(FUNC_NAME<ushort4>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \ |
| } |
|
|
| SLANG_SURFACE_READ(surf1Dread, (int x), (x)) |
| SLANG_SURFACE_READ(surf2Dread, (int x, int y), (x, y)) |
| SLANG_SURFACE_READ(surf3Dread, (int x, int y, int z), (x, y, z)) |
| SLANG_SURFACE_READ(surf1DLayeredread, (int x, int layer), (x, layer)) |
| SLANG_SURFACE_READ(surf2DLayeredread, (int x, int y, int layer), (x, y, layer)) |
| SLANG_SURFACE_READ(surfCubemapread, (int x, int y, int face), (x, y, face)) |
| SLANG_SURFACE_READ(surfCubemapLayeredread, (int x, int y, int layerFace), (x, y, layerFace)) |
|
|
| #define SLANG_SURFACE_WRITE(FUNC_NAME, TYPE_ARGS, ARGS) \ |
| template <> \ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL void FUNC_NAME<__half>(__half data, cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \ |
| { \ |
| FUNC_NAME<ushort>(__half_as_ushort(data), surfObj, SLANG_DROP_PARENS ARGS, boundaryMode); \ |
| } \ |
| \ |
| template <> \ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL void FUNC_NAME<__half2>(__half2 data, cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \ |
| { \ |
| FUNC_NAME<ushort2>(__half_as_ushort(data), surfObj, SLANG_DROP_PARENS ARGS, boundaryMode); \ |
| } \ |
| \ |
| template <> \ |
| SLANG_FORCE_INLINE SLANG_CUDA_CALL void FUNC_NAME<__half4>(__half4 data, cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \ |
| { \ |
| FUNC_NAME<ushort4>(__half_as_ushort(data), surfObj, SLANG_DROP_PARENS ARGS, boundaryMode); \ |
| } |
|
|
| SLANG_SURFACE_WRITE(surf1Dwrite, (int x), (x)) |
| SLANG_SURFACE_WRITE(surf2Dwrite, (int x, int y), (x, y)) |
| SLANG_SURFACE_WRITE(surf3Dwrite, (int x, int y, int z), (x, y, z)) |
| SLANG_SURFACE_WRITE(surf1DLayeredwrite, (int x, int layer), (x, layer)) |
| SLANG_SURFACE_WRITE(surf2DLayeredwrite, (int x, int y, int layer), (x, y, layer)) |
| SLANG_SURFACE_WRITE(surfCubemapwrite, (int x, int y, int face), (x, y, face)) |
| SLANG_SURFACE_WRITE(surfCubemapLayeredwrite, (int x, int y, int layerFace), (x, y, layerFace)) |
|
|
| |
| |
| |
| |
| |
|
|
// SLANG_SURFACE_READ_HALF_CONVERT: defines FUNC_NAME_convert<float/float2/float4>
// which read half-format texels (as ushort bit patterns) and convert them to
// float on return. The scalar case returns a __half implicitly converted to
// float; the vector cases widen each component explicitly.
// NOTE(review): relies on ushort2/ushort4 overloads of __ushort_as_half
// (returning __half2/__half4) defined earlier in this prelude - confirm.
#define SLANG_SURFACE_READ_HALF_CONVERT(FUNC_NAME, TYPE_ARGS, ARGS) \
\
template <typename T> \
SLANG_FORCE_INLINE SLANG_CUDA_CALL T FUNC_NAME##_convert(cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode); \
\
template <> \
SLANG_FORCE_INLINE SLANG_CUDA_CALL float FUNC_NAME##_convert<float>(cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \
{ \
return __ushort_as_half(FUNC_NAME<uint16_t>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \
} \
\
template <> \
SLANG_FORCE_INLINE SLANG_CUDA_CALL float2 FUNC_NAME##_convert<float2>(cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \
{ \
const __half2 v = __ushort_as_half(FUNC_NAME<ushort2>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \
return float2{v.x, v.y}; \
} \
\
template <> \
SLANG_FORCE_INLINE SLANG_CUDA_CALL float4 FUNC_NAME##_convert<float4>(cudaSurfaceObject_t surfObj, SLANG_DROP_PARENS TYPE_ARGS, cudaSurfaceBoundaryMode boundaryMode) \
{ \
const __half4 v = __ushort_as_half(FUNC_NAME<ushort4>(surfObj, SLANG_DROP_PARENS ARGS, boundaryMode)); \
return float4{v.x, v.y, v.z, v.w}; \
}

// Instantiate the half-to-float converting reads for 1D/2D/3D surfaces.
SLANG_SURFACE_READ_HALF_CONVERT(surf1Dread, (int x), (x))
SLANG_SURFACE_READ_HALF_CONVERT(surf2Dread, (int x, int y), (x, y))
SLANG_SURFACE_READ_HALF_CONVERT(surf3Dread, (int x, int y, int z), (x, y, z))
|
|
| #endif |
|
|
| |
|
|
| |
| |
|
|
// Forward declarations of the "convert" surface writes. Unlike the bit-cast
// writes above, these store float register data to surfaces using PTX
// `sust.p`, which converts to the surface's texel format on store. Only the
// float/float2/float4 explicit specializations that follow are defined.
template <typename T>
SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert(T, cudaSurfaceObject_t surfObj, int x, cudaSurfaceBoundaryMode boundaryMode);
template <typename T>
SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert(T, cudaSurfaceObject_t surfObj, int x, int y, cudaSurfaceBoundaryMode boundaryMode);
template <typename T>
SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert(T, cudaSurfaceObject_t surfObj, int x, int y, int z, cudaSurfaceBoundaryMode boundaryMode);
|
|
| |
| |
|
|
| |
|
|
// Single-float formatted surface stores via PTX `sust.p` (store with format
// conversion). SLANG_PTX_BOUNDARY_MODE is assumed to expand to the boundary
// suffix (trap/clamp/zero) baked into the instruction string - the runtime
// boundaryMode parameter is NOT read here; TODO confirm that is intentional.
template <>
SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<float>(float v, cudaSurfaceObject_t surfObj, int x, cudaSurfaceBoundaryMode boundaryMode)
{
asm volatile ( "{sust.p.1d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2};}\n\t" :: "l"(surfObj),"r"(x),"f"(v));
}

template <>
SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<float>(float v, cudaSurfaceObject_t surfObj, int x, int y, cudaSurfaceBoundaryMode boundaryMode)
{
asm volatile ( "{sust.p.2d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3};}\n\t" :: "l"(surfObj),"r"(x),"r"(y),"f"(v));
}
|
|
// Single-float formatted store to a 3D surface.
// Fix: the PTX geometry must be `.3d` (was `.2d` with three coordinates, a
// geometry/coordinate mismatch). Per the PTX ISA, 3d `sust` takes a
// four-component coordinate vector whose w component is ignored - we reuse z
// for the ignored slot. boundaryMode is baked into the instruction string via
// SLANG_PTX_BOUNDARY_MODE; the parameter itself is unused.
template <>
SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float>(float v, cudaSurfaceObject_t surfObj, int x, int y, int z, cudaSurfaceBoundaryMode boundaryMode)
{
    asm volatile ( "{sust.p.3d.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2,%3,%3}], {%4};}\n\t" :: "l"(surfObj),"r"(x),"r"(y),"r"(z),"f"(v));
}
|
|
| |
|
|
// float2 formatted surface stores.
// Fix: a two-component `sust.p` store requires the `.v2` vector qualifier on
// the data type; plain `.b32` with a {x,y} data operand is malformed PTX and
// fails at ptxas time. boundaryMode is baked into the instruction string.
template <>
SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<float2>(float2 v, cudaSurfaceObject_t surfObj, int x, cudaSurfaceBoundaryMode boundaryMode)
{
    const float vx = v.x, vy = v.y;
    asm volatile ( "{sust.p.1d.v2.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2,%3};}\n\t" :: "l"(surfObj),"r"(x),"f"(vx),"f"(vy));
}

template <>
SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<float2>(float2 v, cudaSurfaceObject_t surfObj, int x, int y, cudaSurfaceBoundaryMode boundaryMode)
{
    const float vx = v.x, vy = v.y;
    asm volatile ( "{sust.p.2d.v2.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3,%4};}\n\t" :: "l"(surfObj),"r"(x),"r"(y),"f"(vx),"f"(vy));
}
|
|
// float2 formatted store to a 3D surface.
// Fixes: (1) geometry must be `.3d`, not `.2d` with three coordinates;
// (2) two-component data needs the `.v2` qualifier; (3) 3d `sust` takes a
// four-component coordinate vector (w ignored - z is reused for it).
template <>
SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float2>(float2 v, cudaSurfaceObject_t surfObj, int x, int y, int z, cudaSurfaceBoundaryMode boundaryMode)
{
    const float vx = v.x, vy = v.y;
    asm volatile ( "{sust.p.3d.v2.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2,%3,%3}], {%4,%5};}\n\t" :: "l"(surfObj),"r"(x),"r"(y),"r"(z),"f"(vx),"f"(vy));
}
|
|
| |
// float4 formatted surface stores.
// Fix: a four-component `sust.p` store requires the `.v4` vector qualifier;
// plain `.b32` with a {x,y,z,w} data operand is malformed PTX.
template <>
SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf1Dwrite_convert<float4>(float4 v, cudaSurfaceObject_t surfObj, int x, cudaSurfaceBoundaryMode boundaryMode)
{
    const float vx = v.x, vy = v.y, vz = v.z, vw = v.w;
    asm volatile ( "{sust.p.1d.v4.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1}], {%2,%3,%4,%5};}\n\t" :: "l"(surfObj),"r"(x),"f"(vx),"f"(vy),"f"(vz),"f"(vw));
}

template <>
SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf2Dwrite_convert<float4>(float4 v, cudaSurfaceObject_t surfObj, int x, int y, cudaSurfaceBoundaryMode boundaryMode)
{
    const float vx = v.x, vy = v.y, vz = v.z, vw = v.w;
    asm volatile ( "{sust.p.2d.v4.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2}], {%3,%4,%5,%6};}\n\t" :: "l"(surfObj),"r"(x),"r"(y),"f"(vx),"f"(vy),"f"(vz),"f"(vw));
}
|
|
// float4 formatted store to a 3D surface.
// Fixes: (1) geometry must be `.3d`, not `.2d` with three coordinates;
// (2) four-component data needs the `.v4` qualifier; (3) 3d `sust` takes a
// four-component coordinate vector (w ignored - z is reused for it).
template <>
SLANG_FORCE_INLINE SLANG_CUDA_CALL void surf3Dwrite_convert<float4>(float4 v, cudaSurfaceObject_t surfObj, int x, int y, int z, cudaSurfaceBoundaryMode boundaryMode)
{
    const float vx = v.x, vy = v.y, vz = v.z, vw = v.w;
    asm volatile ( "{sust.p.3d.v4.b32." SLANG_PTX_BOUNDARY_MODE " [%0, {%1,%2,%3,%3}], {%4,%5,%6,%7};}\n\t" :: "l"(surfObj),"r"(x),"r"(y),"r"(z),"f"(vx),"f"(vy),"f"(vz),"f"(vw));
}
|
|
| |
|
|
| |
// Scalar float math wrappers called by Slang-generated code. Each forwards to
// the CUDA single-precision device function of the same meaning.
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_ceil(float f) { return ::ceilf(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_floor(float f) { return ::floorf(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_round(float f) { return ::roundf(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_sin(float f) { return ::sinf(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_cos(float f) { return ::cosf(f); }
// Computes sin and cos together; results written through the out pointers.
SLANG_FORCE_INLINE SLANG_CUDA_CALL void F32_sincos(float f, float* s, float* c) { ::sincosf(f, s, c); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_tan(float f) { return ::tanf(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_asin(float f) { return ::asinf(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_acos(float f) { return ::acosf(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_atan(float f) { return ::atanf(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_sinh(float f) { return ::sinhf(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_cosh(float f) { return ::coshf(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_tanh(float f) { return ::tanhf(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_log2(float f) { return ::log2f(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_log(float f) { return ::logf(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_log10(float f) { return ::log10f(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_exp2(float f) { return ::exp2f(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_exp(float f) { return ::expf(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_abs(float f) { return ::fabsf(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_trunc(float f) { return ::truncf(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_sqrt(float f) { return ::sqrtf(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_rsqrt(float f) { return ::rsqrtf(f); }
// sign: returns f itself when f == 0.0f (preserving -0.0f), -1/+1 otherwise.
// NaN input falls through both comparisons and yields 1.0f.
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_sign(float f) { return ( f == 0.0f) ? f : (( f < 0.0f) ? -1.0f : 1.0f); }
// frac: fractional part measured from floor, so the result lies in [0, 1).
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_frac(float f) { return f - F32_floor(f); }
|
|
// Floating-point classification; forwards to the CUDA device overloads.
SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F32_isnan(float f) { return isnan(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F32_isfinite(float f) { return isfinite(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F32_isinf(float f) { return isinf(f); }
|
|
| |
// Binary float ops; fminf/fmaxf follow IEEE semantics (prefer the non-NaN operand).
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_min(float a, float b) { return ::fminf(a, b); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_max(float a, float b) { return ::fmaxf(a, b); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_pow(float a, float b) { return ::powf(a, b); }
// fmod truncates toward zero; remainder rounds to nearest (IEEE remainder).
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_fmod(float a, float b) { return ::fmodf(a, b); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_remainder(float a, float b) { return ::remainderf(a, b); }
// atan2 on floats: call the single-precision ::atan2f directly instead of the
// double ::atan2, which forced a float->double->float round trip on the GPU.
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_atan2(float a, float b) { return ::atan2f(a, b); }
|
|
// frexp: returns the mantissa in [0.5, 1) and writes the exponent to *e.
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_frexp(float x, int* e) { return frexpf(x, e); }

// modf: returns the fractional part and writes the integral part to *ip.
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_modf(float x, float* ip)
{
return ::modff(x, ip);
}

// Bit-pattern reinterpretation via the Union32 punning helper (defined
// earlier in this prelude): no value conversion takes place.
SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t F32_asuint(float f) { Union32 u; u.f = f; return u.u; }
SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t F32_asint(float f) { Union32 u; u.f = f; return u.i; }

// Fused multiply-add: a*b + c with a single rounding.
SLANG_FORCE_INLINE SLANG_CUDA_CALL float F32_fma(float a, float b, float c) { return ::fmaf(a, b, c); }
|
|
|
|
| |
|
|
| |
// Scalar double math wrappers; each forwards to the CUDA double-precision
// device function of the same meaning.
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_ceil(double f) { return ::ceil(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_floor(double f) { return ::floor(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_round(double f) { return ::round(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_sin(double f) { return ::sin(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_cos(double f) { return ::cos(f); }
// Computes sin and cos together; results written through the out pointers.
SLANG_FORCE_INLINE SLANG_CUDA_CALL void F64_sincos(double f, double* s, double* c) { ::sincos(f, s, c); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_tan(double f) { return ::tan(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_asin(double f) { return ::asin(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_acos(double f) { return ::acos(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_atan(double f) { return ::atan(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_sinh(double f) { return ::sinh(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_cosh(double f) { return ::cosh(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_tanh(double f) { return ::tanh(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_log2(double f) { return ::log2(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_log(double f) { return ::log(f); }
// log10 for doubles. Fix: the parameter was declared `float`, silently
// truncating the double argument to single precision before the call.
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_log10(double f) { return ::log10(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_exp2(double f) { return ::exp2(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_exp(double f) { return ::exp(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_abs(double f) { return ::fabs(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_trunc(double f) { return ::trunc(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_sqrt(double f) { return ::sqrt(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_rsqrt(double f) { return ::rsqrt(f); }
// sign: returns f itself when f == 0.0 (preserving -0.0), -1/+1 otherwise;
// NaN falls through both comparisons and yields 1.0.
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_sign(double f) { return (f == 0.0) ? f : ((f < 0.0) ? -1.0 : 1.0); }
// frac: fractional part measured from floor, so the result lies in [0, 1).
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_frac(double f) { return f - F64_floor(f); }

// Floating-point classification; forwards to the CUDA device overloads.
SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F64_isnan(double f) { return isnan(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F64_isfinite(double f) { return isfinite(f); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL bool F64_isinf(double f) { return isinf(f); }
|
|
| |
// Binary double ops; fmin/fmax follow IEEE semantics (prefer the non-NaN operand).
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_min(double a, double b) { return ::fmin(a, b); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_max(double a, double b) { return ::fmax(a, b); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_pow(double a, double b) { return ::pow(a, b); }
// fmod truncates toward zero; remainder rounds to nearest (IEEE remainder).
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_fmod(double a, double b) { return ::fmod(a, b); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_remainder(double a, double b) { return ::remainder(a, b); }
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_atan2(double a, double b) { return ::atan2(a, b); }

// frexp: returns the mantissa in [0.5, 1) and writes the exponent to *e.
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_frexp(double x, int* e) { return ::frexp(x, e); }
|
|
// modf: returns the fractional part and writes the integral part to *ip.
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_modf(double x, double* ip)
{
return ::modf(x, ip);
}

// Splits the 64-bit pattern of d into two uint32 halves via the Union64
// punning helper (defined earlier): *low gets bits 0-31, *hi bits 32-63.
SLANG_FORCE_INLINE SLANG_CUDA_CALL void F64_asuint(double d, uint32_t* low, uint32_t* hi)
{
Union64 u;
u.d = d;
*low = uint32_t(u.u);
*hi = uint32_t(u.u >> 32);
}

// Same split as F64_asuint, but the halves are reinterpreted as int32.
SLANG_FORCE_INLINE SLANG_CUDA_CALL void F64_asint(double d, int32_t* low, int32_t* hi)
{
Union64 u;
u.d = d;
*low = int32_t(u.u);
*hi = int32_t(u.u >> 32);
}

// Fused multiply-add: a*b + c with a single rounding.
SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_fma(double a, double b, double c) { return ::fma(a, b, c); }
|
|
| |
|
|
| |
// Signed 32-bit helpers.
// NOTE(review): I32_abs(INT32_MIN) negates the minimum value, which is signed
// overflow (UB in C++); in practice it returns INT32_MIN unchanged.
SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_abs(int32_t f) { return (f < 0) ? -f : f; }

SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_min(int32_t a, int32_t b) { return a < b ? a : b; }
SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_max(int32_t a, int32_t b) { return a > b ? a : b; }

// Bit-pattern reinterpretations (no value conversion).
SLANG_FORCE_INLINE SLANG_CUDA_CALL float I32_asfloat(int32_t x) { Union32 u; u.i = x; return u.f; }
SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I32_asuint(int32_t x) { return uint32_t(x); }
// Reassembles a double from two int32 halves: low -> bits 0-31, hi -> bits 32-63.
SLANG_FORCE_INLINE SLANG_CUDA_CALL double I32_asdouble(int32_t low, int32_t hi )
{
Union64 u;
u.u = (uint64_t(hi) << 32) | uint32_t(low);
return u.d;
}
|
|
| |
|
|
| |
// Unsigned 32-bit helpers. abs of an unsigned value is the identity.
SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_abs(uint32_t f) { return f; }

SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_min(uint32_t a, uint32_t b) { return a < b ? a : b; }
SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_max(uint32_t a, uint32_t b) { return a > b ? a : b; }

// Bit-pattern reinterpretation to float via the Union32 punning helper.
SLANG_FORCE_INLINE SLANG_CUDA_CALL float U32_asfloat(uint32_t x) { Union32 u; u.u = x; return u.f; }
// Bit-pattern reinterpretation uint32 -> int32. Fix: the signature was
// backwards (took int32_t and returned uint32_t); "U32_asint" must accept a
// uint32_t and return its bits as int32_t, matching the other as* helpers.
SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t U32_asint(uint32_t x) { return int32_t(x); }
|
|
// Reassembles a double from two uint32 halves: low -> bits 0-31, hi -> bits 32-63.
SLANG_FORCE_INLINE SLANG_CUDA_CALL double U32_asdouble(uint32_t low, uint32_t hi)
{
Union64 u;
u.u = (uint64_t(hi) << 32) | low;
return u.d;
}

// Population count via the hardware __popc intrinsic.
SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_countbits(uint32_t v)
{

return __popc(v);
}
|
|
|
|
| |
|
|
// Signed 64-bit helpers.
// NOTE(review): as with I32_abs, negating INT64_MIN is signed overflow (UB).
SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_abs(int64_t f) { return (f < 0) ? -f : f; }

SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_min(int64_t a, int64_t b) { return a < b ? a : b; }
SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_max(int64_t a, int64_t b) { return a > b ? a : b; }
|
|
| |
|
|
// Unsigned 64-bit helpers. Fix: these returned int64_t for uint64_t inputs,
// which reinterprets results above INT64_MAX as negative at the call site;
// return uint64_t, consistent with the U32_* helpers above.
SLANG_FORCE_INLINE SLANG_CUDA_CALL uint64_t U64_abs(uint64_t f) { return f; }

SLANG_FORCE_INLINE SLANG_CUDA_CALL uint64_t U64_min(uint64_t a, uint64_t b) { return a < b ? a : b; }
SLANG_FORCE_INLINE SLANG_CUDA_CALL uint64_t U64_max(uint64_t a, uint64_t b) { return a > b ? a : b; }
|
|
// 64-bit population count via the hardware __popcll intrinsic.
SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U64_countbits(uint64_t v)
{

return __popcll(v);
}
|
|
|
|
| |
|
|
|
|
| |
| |
|
|
// Read-only structured buffer: a raw device pointer plus (unless
// SLANG_CUDA_STRUCTURED_BUFFER_NO_COUNT is defined) an element count used for
// bounds handling. SLANG_BOUND_CHECK is assumed to be one of the bound
// policies (assert / zero-index / none) selected earlier in this prelude.
template <typename T>
struct StructuredBuffer
{
// Element access; bounds behavior depends on the SLANG_BOUND_CHECK policy.
SLANG_CUDA_CALL const T& operator[](size_t index) const
{
#ifndef SLANG_CUDA_STRUCTURED_BUFFER_NO_COUNT
SLANG_BOUND_CHECK(index, count);
#endif
return data[index];
}

// HLSL-style Load; identical to operator[].
SLANG_CUDA_CALL const T& Load(size_t index) const
{
#ifndef SLANG_CUDA_STRUCTURED_BUFFER_NO_COUNT
SLANG_BOUND_CHECK(index, count);
#endif
return data[index];
}

#ifndef SLANG_CUDA_STRUCTURED_BUFFER_NO_COUNT
// Reports the element count and per-element stride in bytes.
SLANG_CUDA_CALL void GetDimensions(uint32_t* outNumStructs, uint32_t* outStride) { *outNumStructs = uint32_t(count); *outStride = uint32_t(sizeof(T)); }
#endif

// Device pointer to the first element.
T* data;
#ifndef SLANG_CUDA_STRUCTURED_BUFFER_NO_COUNT
// Number of elements addressable through data.
size_t count;
#endif
};
|
|
// Writable structured buffer: same layout as StructuredBuffer, but indexing
// yields a mutable reference (the method is const because it does not modify
// the buffer descriptor itself, only the pointed-to storage).
template <typename T>
struct RWStructuredBuffer : StructuredBuffer<T>
{
SLANG_CUDA_CALL T& operator[](size_t index) const
{
#ifndef SLANG_CUDA_STRUCTURED_BUFFER_NO_COUNT
SLANG_BOUND_CHECK(index, this->count);
#endif
return this->data[index];
}
};
|
|
| |
// Read-only byte-address buffer (HLSL ByteAddressBuffer). Indices are byte
// offsets; the LoadN entry points require 4-byte-aligned offsets (they index
// the uint32_t backing store with index >> 2). The templated Load uses memcpy
// and so tolerates any alignment that SLANG_BOUND_CHECK_BYTE_ADDRESS permits.
struct ByteAddressBuffer
{
// Reports the buffer size in bytes.
SLANG_CUDA_CALL void GetDimensions(uint32_t* outDim) const { *outDim = uint32_t(sizeInBytes); }
// Loads one 32-bit word at byte offset `index` (must be 4-byte aligned).
SLANG_CUDA_CALL uint32_t Load(size_t index) const
{
SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes);
return data[index >> 2];
}
// Loads two consecutive 32-bit words starting at byte offset `index`.
SLANG_CUDA_CALL uint2 Load2(size_t index) const
{
SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes);
const size_t dataIdx = index >> 2;
return uint2{data[dataIdx], data[dataIdx + 1]};
}
// Loads three consecutive 32-bit words starting at byte offset `index`.
SLANG_CUDA_CALL uint3 Load3(size_t index) const
{
SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes);
const size_t dataIdx = index >> 2;
return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]};
}
// Loads four consecutive 32-bit words starting at byte offset `index`.
SLANG_CUDA_CALL uint4 Load4(size_t index) const
{
SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes);
const size_t dataIdx = index >> 2;
return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]};
}
// Loads an arbitrary trivially-copyable T at byte offset `index` via memcpy.
template<typename T>
SLANG_CUDA_CALL T Load(size_t index) const
{
SLANG_BOUND_CHECK_BYTE_ADDRESS(index, sizeof(T), sizeInBytes);
T data;
memcpy(&data, ((const char*)this->data) + index, sizeof(T));
return data;
}

// Device pointer to the word-aligned backing store, and its size in bytes.
const uint32_t* data;
size_t sizeInBytes;
};
|
|
| |
| |
| |
// Writable byte-address buffer (HLSL RWByteAddressBuffer). Same addressing
// rules as ByteAddressBuffer: byte offsets, word-granular LoadN/StoreN require
// 4-byte alignment; the templated Load/Store use memcpy. Methods are const
// because they mutate the pointed-to storage, not the descriptor.
struct RWByteAddressBuffer
{
// Reports the buffer size in bytes.
SLANG_CUDA_CALL void GetDimensions(uint32_t* outDim) const { *outDim = uint32_t(sizeInBytes); }

// Loads one 32-bit word at byte offset `index` (must be 4-byte aligned).
SLANG_CUDA_CALL uint32_t Load(size_t index) const
{
SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes);
return data[index >> 2];
}
// Loads two consecutive 32-bit words starting at byte offset `index`.
SLANG_CUDA_CALL uint2 Load2(size_t index) const
{
SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes);
const size_t dataIdx = index >> 2;
return uint2{data[dataIdx], data[dataIdx + 1]};
}
// Loads three consecutive 32-bit words starting at byte offset `index`.
SLANG_CUDA_CALL uint3 Load3(size_t index) const
{
SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes);
const size_t dataIdx = index >> 2;
return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]};
}
// Loads four consecutive 32-bit words starting at byte offset `index`.
SLANG_CUDA_CALL uint4 Load4(size_t index) const
{
SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes);
const size_t dataIdx = index >> 2;
return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]};
}
// Loads an arbitrary trivially-copyable T at byte offset `index` via memcpy.
template<typename T>
SLANG_CUDA_CALL T Load(size_t index) const
{
SLANG_BOUND_CHECK_BYTE_ADDRESS(index, sizeof(T), sizeInBytes);
T data;
memcpy(&data, ((const char*)this->data) + index, sizeof(T));
return data;
}

// Stores one 32-bit word at byte offset `index` (must be 4-byte aligned).
SLANG_CUDA_CALL void Store(size_t index, uint32_t v) const
{
SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes);
data[index >> 2] = v;
}
// Stores two consecutive 32-bit words starting at byte offset `index`.
SLANG_CUDA_CALL void Store2(size_t index, uint2 v) const
{
SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes);
const size_t dataIdx = index >> 2;
data[dataIdx + 0] = v.x;
data[dataIdx + 1] = v.y;
}
// Stores three consecutive 32-bit words starting at byte offset `index`.
SLANG_CUDA_CALL void Store3(size_t index, uint3 v) const
{
SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes);
const size_t dataIdx = index >> 2;
data[dataIdx + 0] = v.x;
data[dataIdx + 1] = v.y;
data[dataIdx + 2] = v.z;
}
// Stores four consecutive 32-bit words starting at byte offset `index`.
SLANG_CUDA_CALL void Store4(size_t index, uint4 v) const
{
SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes);
const size_t dataIdx = index >> 2;
data[dataIdx + 0] = v.x;
data[dataIdx + 1] = v.y;
data[dataIdx + 2] = v.z;
data[dataIdx + 3] = v.w;
}
// Stores an arbitrary trivially-copyable T at byte offset `index` via memcpy.
template<typename T>
SLANG_CUDA_CALL void Store(size_t index, T const& value) const
{
SLANG_BOUND_CHECK_BYTE_ADDRESS(index, sizeof(T), sizeInBytes);
memcpy((char*)data + index, &value, sizeof(T));
}

// Returns a raw typed pointer into the buffer at byte offset `index`
// (bounds-policy checked; the caller owns alignment correctness for T).
template <typename T>
SLANG_CUDA_CALL T* _getPtrAt(size_t index)
{
SLANG_BOUND_CHECK_BYTE_ADDRESS(index, sizeof(T), sizeInBytes);
return (T*)(((char*)data) + index);
}

// Device pointer to the word-aligned backing store, and its size in bytes.
uint32_t* data;
size_t sizeInBytes;
};
|
|
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
// Returns this thread's lane index within its warp [0, SLANG_CUDA_WARP_SIZE).
#ifndef SLANG_USE_ASM_LANE_ID
__forceinline__ __device__ uint32_t _getLaneId()
{
// Cheap variant derived from the thread index.
// NOTE(review): only correct for 1D thread blocks whose x-dimension is a
// multiple of the warp size - confirm generated launches guarantee this.
return threadIdx.x & SLANG_CUDA_WARP_MASK;
}
#else
__forceinline__ __device__ uint32_t _getLaneId()
{
// Exact variant: read the hardware %laneid special register. Always
// correct regardless of block shape, at the cost of an extra instruction.
unsigned ret;
asm volatile ("mov.u32 %0, %laneid;" : "=r"(ret));
return ret;
}
#endif
|
|
// Lane-mask type used by all wave helpers below. It is a signed 32-bit int
// (matching SLANG_CUDA_WARP_SIZE == 32); the CUDA *_sync intrinsics take
// unsigned masks and the conversion is implicit.
typedef int WarpMask;
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
// Returns the mask of lanes whose id is strictly less than this lane's id.
// Fix: computed in unsigned - the previous `int(1) << laneId` shifts into the
// sign bit for lane 31, which is undefined behavior in C++ before C++20.
// (1u << 31) - 1 == 0x7fffffff is well defined and gives the same mask.
__forceinline__ __device__ WarpMask _getLaneLtMask()
{
    return WarpMask((uint32_t(1) << _getLaneId()) - 1);
}
|
|
| |
| |
| |
// Returns the mask of lanes currently active (converged) at this point, as a
// WarpMask. Uses __ballot_sync over __activemask(); note that under
// independent thread scheduling this reflects whichever lanes happen to be
// converged here, not necessarily the set that entered the kernel together.
__forceinline__ __device__ WarpMask _getActiveMask()
{
return __ballot_sync(__activemask(), true);
}
|
|
| |
// Identity pass-through: the caller supplies the participating-lane mask for
// multi-prefix operations and it is used as-is.
__forceinline__ __device__ WarpMask _getMultiPrefixMask(int mask)
{
return mask;
}
|
|
| |
| |
// True when at most one bit of mask is set (the classic x & (x-1) test).
// Note: also returns true for mask == 0.
__inline__ __device__ bool _waveIsSingleLane(WarpMask mask)
{
return (mask & (mask - 1)) == 0;
}
|
|
| |
| |
| |
| |
| |
| |
| |
// If mask covers exactly the low N lanes with N a power of two, returns N
// (the butterfly-shuffle width usable for this mask); otherwise returns 0,
// signalling that callers must fall back to the per-lane gather path.
__inline__ __device__ int _waveCalcPow2Offset(WarpMask mask)
{
// Fast path: all 32 lanes active.
if (mask == SLANG_CUDA_WARP_BITMASK)
{
return SLANG_CUDA_WARP_SIZE;
}
// mask + 1 being a power of two means the mask is a contiguous run of low bits.
if ((mask & (mask + 1)) == 0)
{
// 32 - clz == index of highest set bit + 1 == number of contiguous low lanes.
const int offset = 32 - __clz(mask);
// Only usable if that count is itself a power of two.
if ((offset & (offset - 1)) == 0)
{
return offset;
}
}
return 0;
}
|
|
// Returns true only on the lowest-id active lane of the calling warp.
// Fix: the previous form `(mask & 1) || (...)` short-circuited to true on
// EVERY active lane whenever lane 0 was active (the common fully-active-warp
// case), so all 32 lanes reported being "first". Keep the cheap lane-0
// special case, but actually compare against this thread's lane id.
__inline__ __device__ bool _waveIsFirstLane()
{
    const WarpMask mask = __activemask();
    // If lane 0 is active it is the first lane; otherwise the first lane is
    // the lowest set bit of the mask (__ffs is 1-based, hence the -1).
    return (mask & 1) ? (_getLaneId() == 0) : ((__ffs(mask) - 1) == _getLaneId());
}
|
|
// Reduction-operator policies used by _waveReduce*/_wavePrefix*. Each exposes
// getInitial (identity for the op, given the lane's own value), doOp (the
// combine step) and - for invertible ops - doInverse (undo one combine, used
// by the prefix algorithms to turn an inclusive scan into an exclusive one).
template <typename T>
struct WaveOpOr
{
__inline__ __device__ static T getInitial(T a) { return 0; }
__inline__ __device__ static T doOp(T a, T b) { return a | b; }
};

template <typename T>
struct WaveOpAnd
{
__inline__ __device__ static T getInitial(T a) { return ~T(0); }
__inline__ __device__ static T doOp(T a, T b) { return a & b; }
};

template <typename T>
struct WaveOpXor
{
__inline__ __device__ static T getInitial(T a) { return 0; }
__inline__ __device__ static T doOp(T a, T b) { return a ^ b; }
// xor is its own inverse.
__inline__ __device__ static T doInverse(T a, T b) { return a ^ b; }
};

template <typename T>
struct WaveOpAdd
{
__inline__ __device__ static T getInitial(T a) { return 0; }
__inline__ __device__ static T doOp(T a, T b) { return a + b; }
__inline__ __device__ static T doInverse(T a, T b) { return a - b; }
};

template <typename T>
struct WaveOpMul
{
__inline__ __device__ static T getInitial(T a) { return T(1); }
__inline__ __device__ static T doOp(T a, T b) { return a * b; }
// NOTE(review): division as the inverse of multiplication is exact for
// integers only when divisible, and inexact for floats; prefix-product
// results obtained through doInverse may be approximate.
__inline__ __device__ static T doInverse(T a, T b) { return a / b; }
};
|
|
// Reduction policy: maximum. The identity for max is the lane's own value,
// so getInitial simply echoes its argument.
template <typename T>
struct WaveOpMax
{
    __inline__ __device__ static T getInitial(T a) { return a; }
    __inline__ __device__ static T doOp(T a, T b)
    {
        // Same predicate as the original ternary (a > b ? a : b), so NaN
        // comparison behavior for floating T is unchanged.
        if (a > b)
        {
            return a;
        }
        return b;
    }
};
|
|
// Reduction policy: minimum. The identity for min is the lane's own value,
// so getInitial simply echoes its argument.
template <typename T>
struct WaveOpMin
{
    __inline__ __device__ static T getInitial(T a) { return a; }
    __inline__ __device__ static T doOp(T a, T b)
    {
        // Same predicate as the original ternary (a < b ? a : b), so NaN
        // comparison behavior for floating T is unchanged.
        if (a < b)
        {
            return a;
        }
        return b;
    }
};
|
|
// Maps a (possibly vector or matrix) type to its scalar element type, so the
// "Multiple" wave helpers can iterate a value component-by-component.
template <typename T>
struct ElementTypeTrait;

// Scalar types map to themselves.
template <> struct ElementTypeTrait<int> { typedef int Type; };
template <> struct ElementTypeTrait<uint> { typedef uint Type; };
template <> struct ElementTypeTrait<float> { typedef float Type; };
template <> struct ElementTypeTrait<double> { typedef double Type; };
template <> struct ElementTypeTrait<uint64_t> { typedef uint64_t Type; };
template <> struct ElementTypeTrait<int64_t> { typedef int64_t Type; };

// CUDA vector types map to their component type.
template <> struct ElementTypeTrait<int1> { typedef int Type; };
template <> struct ElementTypeTrait<int2> { typedef int Type; };
template <> struct ElementTypeTrait<int3> { typedef int Type; };
template <> struct ElementTypeTrait<int4> { typedef int Type; };

template <> struct ElementTypeTrait<uint1> { typedef uint Type; };
template <> struct ElementTypeTrait<uint2> { typedef uint Type; };
template <> struct ElementTypeTrait<uint3> { typedef uint Type; };
template <> struct ElementTypeTrait<uint4> { typedef uint Type; };

template <> struct ElementTypeTrait<float1> { typedef float Type; };
template <> struct ElementTypeTrait<float2> { typedef float Type; };
template <> struct ElementTypeTrait<float3> { typedef float Type; };
template <> struct ElementTypeTrait<float4> { typedef float Type; };

template <> struct ElementTypeTrait<double1> { typedef double Type; };
template <> struct ElementTypeTrait<double2> { typedef double Type; };
template <> struct ElementTypeTrait<double3> { typedef double Type; };
template <> struct ElementTypeTrait<double4> { typedef double Type; };

// Slang matrix types map to their scalar type.
template <typename T, int ROWS, int COLS>
struct ElementTypeTrait<Matrix<T, ROWS, COLS> >
{
typedef T Type;
};
|
|
| |
// Reduces `val` across the lanes in `mask` using the INTF policy (WaveOp*).
// Every lane receives the reduced result.
// Fast path: when mask is the low-N lanes with N a power of two, do a
// log2(N)-step xor-butterfly with __shfl_xor_sync. Slow path: each lane
// gathers every active lane's value with __shfl_sync and folds it in.
template <typename INTF, typename T>
__device__ T _waveReduceScalar(WarpMask mask, T val)
{
const int offsetSize = _waveCalcPow2Offset(mask);
if (offsetSize > 0)
{
// Butterfly reduction across the participating lanes.
for (int offset = offsetSize >> 1; offset > 0; offset >>= 1)
{
val = INTF::doOp(val, __shfl_xor_sync(mask, val, offset));
}
}
else if (!_waveIsSingleLane(mask))
{
T result = INTF::getInitial(val);
int remaining = mask;
while (remaining)
{
// Isolate the lowest set bit => next active lane to read from.
const int laneBit = remaining & -remaining;
const int srcLane = __ffs(laneBit) - 1;
// All lanes in mask execute this shuffle together.
result = INTF::doOp(result, __shfl_sync(mask, val, srcLane));
remaining &= ~laneBit;
}
return result;
}
// Single-lane (or trivially reduced) case: the lane's own value is the answer.
return val;
}
|
|
|
|
| |
// Component-wise variant of _waveReduceScalar: reduces COUNT elements at
// val[0..COUNT) in place across the lanes in mask. Same fast/slow path split.
template <typename INTF, typename T, size_t COUNT>
__device__ void _waveReduceMultiple(WarpMask mask, T* val)
{
const int offsetSize = _waveCalcPow2Offset(mask);
if (offsetSize > 0)
{
// Butterfly reduction, applied to each component per step.
for (int offset = offsetSize >> 1; offset > 0; offset >>= 1)
{
for (size_t i = 0; i < COUNT; ++i)
{
val[i] = INTF::doOp(val[i], __shfl_xor_sync(mask, val[i], offset));
}
}
}
else if (!_waveIsSingleLane(mask))
{
// Keep a copy of the lane's own values: the shuffles below must always
// broadcast the ORIGINAL per-lane value, not the partially folded one.
T originalVal[COUNT];
for (size_t i = 0; i < COUNT; ++i)
{
const T v = val[i];
originalVal[i] = v;
val[i] = INTF::getInitial(v);
}

int remaining = mask;
while (remaining)
{
// Isolate the lowest set bit => next active lane to read from.
const int laneBit = remaining & -remaining;
const int srcLane = __ffs(laneBit) - 1;
for (size_t i = 0; i < COUNT; ++i)
{
val[i] = INTF::doOp(val[i], __shfl_sync(mask, originalVal[i], srcLane));
}
remaining &= ~laneBit;
}
}
}
|
|
// Convenience overload: derives the element type and count from T via
// ElementTypeTrait and forwards to the COUNT-explicit implementation,
// reinterpreting the aggregate as an array of its elements.
template <typename INTF, typename T>
__device__ void _waveReduceMultiple(WarpMask mask, T* val)
{
typedef typename ElementTypeTrait<T>::Type ElemType;
_waveReduceMultiple<INTF, ElemType, sizeof(T) / sizeof(ElemType)>(mask, (ElemType*)val);
}
|
|
// Public scalar wave reductions: thin wrappers binding each WaveOp policy to
// _waveReduceScalar. All lanes in mask receive the reduced value.
template <typename T>
__inline__ __device__ T _waveOr(WarpMask mask, T val) { return _waveReduceScalar<WaveOpOr<T>, T>(mask, val); }

template <typename T>
__inline__ __device__ T _waveAnd(WarpMask mask, T val) { return _waveReduceScalar<WaveOpAnd<T>, T>(mask, val); }

template <typename T>
__inline__ __device__ T _waveXor(WarpMask mask, T val) { return _waveReduceScalar<WaveOpXor<T>, T>(mask, val); }

template <typename T>
__inline__ __device__ T _waveProduct(WarpMask mask, T val) { return _waveReduceScalar<WaveOpMul<T>, T>(mask, val); }

template <typename T>
__inline__ __device__ T _waveSum(WarpMask mask, T val) { return _waveReduceScalar<WaveOpAdd<T>, T>(mask, val); }

template <typename T>
__inline__ __device__ T _waveMin(WarpMask mask, T val) { return _waveReduceScalar<WaveOpMin<T>, T>(mask, val); }

template <typename T>
__inline__ __device__ T _waveMax(WarpMask mask, T val) { return _waveReduceScalar<WaveOpMax<T>, T>(mask, val); }
|
|
| |
// On SM80+ the integer reductions have single-instruction hardware support
// (__reduce_*_sync); specialize the int/unsigned wrappers to use it. The
// signed WarpMask converts implicitly to the unsigned mask parameter.
#if __CUDA_ARCH__ >= 800
template<>
__inline__ __device__ unsigned _waveOr<unsigned>(WarpMask mask, unsigned val) { return __reduce_or_sync(mask, val); }

template<>
__inline__ __device__ unsigned _waveAnd<unsigned>(WarpMask mask, unsigned val) { return __reduce_and_sync(mask, val); }

template<>
__inline__ __device__ unsigned _waveXor<unsigned>(WarpMask mask, unsigned val) { return __reduce_xor_sync(mask, val); }

template<>
__inline__ __device__ unsigned _waveSum<unsigned>(WarpMask mask, unsigned val) { return __reduce_add_sync(mask, val); }

template<>
__inline__ __device__ int _waveSum<int>(WarpMask mask, int val) { return __reduce_add_sync(mask, val); }

template<>
__inline__ __device__ unsigned _waveMin<unsigned>(WarpMask mask, unsigned val) { return __reduce_min_sync(mask, val); }

template<>
__inline__ __device__ int _waveMin<int>(WarpMask mask, int val) { return __reduce_min_sync(mask, val); }

template<>
__inline__ __device__ unsigned _waveMax<unsigned>(WarpMask mask, unsigned val) { return __reduce_max_sync(mask, val); }

template<>
__inline__ __device__ int _waveMax<int>(WarpMask mask, int val) { return __reduce_max_sync(mask, val); }
#endif
|
|
|
|
| |
|
|
// Vector/matrix wave reductions: reduce each component of val independently
// across the lanes in mask via _waveReduceMultiple, returning the result.
template <typename T>
__inline__ __device__ T _waveOrMultiple(WarpMask mask, T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpOr<ElemType> >(mask, &val); return val; }

template <typename T>
__inline__ __device__ T _waveAndMultiple(WarpMask mask, T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpAnd<ElemType> >(mask, &val); return val; }

template <typename T>
__inline__ __device__ T _waveXorMultiple(WarpMask mask, T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpXor<ElemType> >(mask, &val); return val; }

template <typename T>
__inline__ __device__ T _waveProductMultiple(WarpMask mask, T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpMul<ElemType> >(mask, &val); return val; }

template <typename T>
__inline__ __device__ T _waveSumMultiple(WarpMask mask, T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpAdd<ElemType> >(mask, &val); return val; }

template <typename T>
__inline__ __device__ T _waveMinMultiple(WarpMask mask, T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpMin<ElemType> >(mask, &val); return val; }

template <typename T>
__inline__ __device__ T _waveMaxMultiple(WarpMask mask, T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpMax<ElemType> >(mask, &val); return val; }
|
|
|
|
// True when all lanes in mask hold the same value: __match_all_sync sets
// pred nonzero exactly when every participating lane's val matches.
template <typename T>
__inline__ __device__ bool _waveAllEqual(WarpMask mask, T val)
{
int pred;
__match_all_sync(mask, val, &pred);
return pred != 0;
}
|
|
// Component-wise _waveAllEqual: true only if every component of inVal matches
// across all lanes in mask; bails out on the first differing component.
template <typename T>
__inline__ __device__ bool _waveAllEqualMultiple(WarpMask mask, T inVal)
{
typedef typename ElementTypeTrait<T>::Type ElemType;
const size_t count = sizeof(T) / sizeof(ElemType);
int pred;
const ElemType* src = (const ElemType*)&inVal;
for (size_t i = 0; i < count; ++i)
{
__match_all_sync(mask, src[i], &pred);
if (pred == 0)
{
return false;
}
}
return true;
}
|
|
// Broadcasts the value held by the lowest-id lane in mask to all lanes
// (__ffs is 1-based, hence the -1).
template <typename T>
__inline__ __device__ T _waveReadFirst(WarpMask mask, T val)
{
const int lowestLaneId = __ffs(mask) - 1;
return __shfl_sync(mask, val, lowestLaneId);
}
|
|
// Component-wise _waveReadFirst: broadcasts each element of the lowest-id
// lane's value, assembling the result element-by-element.
template <typename T>
__inline__ __device__ T _waveReadFirstMultiple(WarpMask mask, T inVal)
{
typedef typename ElementTypeTrait<T>::Type ElemType;
const size_t count = sizeof(T) / sizeof(ElemType);
T outVal;
const ElemType* src = (const ElemType*)&inVal;
ElemType* dst = (ElemType*)&outVal;
const int lowestLaneId = __ffs(mask) - 1;
for (size_t i = 0; i < count; ++i)
{
dst[i] = __shfl_sync(mask, src[i], lowestLaneId);
}
return outVal;
}
|
|
// Reads the composite value held by `lane`, element by element, via warp
// shuffles. All lanes in `mask` must call this with the same `lane`.
template <typename T>
__inline__ __device__ T _waveShuffleMultiple(WarpMask mask, T inVal, int lane)
{
    typedef typename ElementTypeTrait<T>::Type ElemType;
    const size_t elemCount = sizeof(T) / sizeof(ElemType);
    T outVal;
    const ElemType* srcElems = (const ElemType*)&inVal;
    ElemType* dstElems = (ElemType*)&outVal;
    for (size_t i = 0; i < elemCount; ++i)
    {
        dstElems[i] = __shfl_sync(mask, srcElems[i], lane);
    }
    return outVal;
}
|
|
| |
|
|
| |
| |
// Exclusive prefix scan over the lanes of `mask` using an *invertible*
// operator. INTF must provide doOp (associative combine), doInverse (undo a
// single operand), and getInitial (identity value seeded from `val`).
// Returns the combination of the values held by all lanes with a lower lane
// index than the calling lane.
template <typename INTF, typename T>
__device__ T _wavePrefixInvertableScalar(WarpMask mask, T val)
{
    // > 0 when `mask` is a contiguous, power-of-2 sized, aligned run of lanes
    // that __shfl_up_sync can scan directly; 0 for any other (sparse) mask.
    const int offsetSize = _waveCalcPow2Offset(mask);

    const int laneId = _getLaneId();
    T result;
    if (offsetSize > 0)
    {
        // Doubling-stride (Hillis-Steele style) inclusive scan via shuffles.
        result = val;
        for (int i = 1; i < offsetSize; i += i)
        {
            const T readVal = __shfl_up_sync(mask, result, i, offsetSize);
            if (laneId >= i)
            {
                result = INTF::doOp(result, readVal);
            }
        }
        // Convert the inclusive scan to an exclusive one by removing this
        // lane's own contribution - this is why INTF must be invertible.
        result = INTF::doInverse(result, val);
    }
    else
    {
        result = INTF::getInitial(val);
        if (!_waveIsSingleLane(mask))
        {
            // Sparse-mask fallback: visit each participating lane in order.
            int remaining = mask;
            while (remaining)
            {
                const int laneBit = remaining & -remaining;
                // Lane index of the lowest set bit.
                const int srcLane = __ffs(laneBit) - 1;
                // Every lane in `mask` must execute the shuffle together.
                const T readValue = __shfl_sync(mask, val, srcLane);
                // Exclusive scan: only lanes strictly before this one count.
                if (srcLane < laneId)
                {
                    result = INTF::doOp(result, readValue);
                }
                remaining &= ~laneBit;
            }
        }
    }
    return result;
}
| |
|
|
| |
| |
// Exclusive prefix scan over the lanes of `mask` for operators that have no
// inverse (e.g. min/max/and/or). Instead of undoing the lane's own value at
// the end, it tracks the exclusive result (`result`) separately from the
// running inclusive value (`val`).
template <typename INTF, typename T>
__device__ T _wavePrefixScalar(WarpMask mask, T val)
{
    // > 0 when `mask` is a contiguous, power-of-2 sized, aligned run of lanes
    // that __shfl_up_sync can scan directly; 0 for any other (sparse) mask.
    const int offsetSize = _waveCalcPow2Offset(mask);

    const int laneId = _getLaneId();
    T result = INTF::getInitial(val);  // identity element seeded from val
    if (offsetSize > 0)
    {
        // Doubling-stride shuffle scan: `val` accumulates the inclusive scan
        // (shuffled to later lanes), while `result` accumulates only the
        // contributions of earlier lanes, i.e. the exclusive scan.
        for (int i = 1; i < offsetSize; i += i)
        {
            const T readVal = __shfl_up_sync(mask, val, i, offsetSize);
            if (laneId >= i)
            {
                result = INTF::doOp(result, readVal);
                val = INTF::doOp(val, readVal);
            }
        }
    }
    else
    {
        if (!_waveIsSingleLane(mask))
        {
            // Sparse-mask fallback: visit each participating lane in order.
            int remaining = mask;
            while (remaining)
            {
                const int laneBit = remaining & -remaining;
                // Lane index of the lowest set bit.
                const int srcLane = __ffs(laneBit) - 1;
                // Every lane in `mask` must execute the shuffle together.
                const T readValue = __shfl_sync(mask, val, srcLane);
                // Exclusive scan: only lanes strictly before this one count.
                if (srcLane < laneId)
                {
                    result = INTF::doOp(result, readValue);
                }
                remaining &= ~laneBit;
            }
        }
    }
    return result;
}
|
|
|
|
// Copies COUNT elements from `src` to `dst`.
// Fix: the return type was declared `T` but the function never returned a
// value - undefined behavior in C++. All call sites use it purely for its
// side effect, so the correct (and backward-compatible) type is void.
template <typename INTF, typename T, size_t COUNT>
__device__ void _waveOpCopy(T* dst, const T* src)
{
    for (size_t j = 0; j < COUNT; ++j)
    {
        dst[j] = src[j];
    }
}
|
|
|
|
// Applies INTF::doInverse element-wise: inOut[j] loses the contribution of
// val[j] (used to turn an inclusive scan into an exclusive one).
// Fix: the return type was declared `T` but the function never returned a
// value - undefined behavior. The result is produced in-place, so it is void.
template <typename INTF, typename T, size_t COUNT>
__device__ void _waveOpDoInverse(T* inOut, const T* val)
{
    for (size_t j = 0; j < COUNT; ++j)
    {
        inOut[j] = INTF::doInverse(inOut[j], val[j]);
    }
}
|
|
// Fills `out` with INTF's identity value, element-wise, seeded from `val`
// (out and val may alias - each element is read before it is written).
// Fix: the return type was declared `T` but the function never returned a
// value - undefined behavior. The result is produced in-place, so it is void.
template <typename INTF, typename T, size_t COUNT>
__device__ void _waveOpSetInitial(T* out, const T* val)
{
    for (size_t j = 0; j < COUNT; ++j)
    {
        out[j] = INTF::getInitial(val[j]);
    }
}
|
|
// In-place exclusive prefix scan over the lanes of `mask`, applied
// element-wise to the COUNT elements at `val`, for an invertible operator
// INTF (doOp / doInverse / getInitial).
// Fixes:
// - The return type was declared `T` with no return statement (undefined
//   behavior); the result is written through `val`, so the function is void.
// - `remaining &= ~laneBit` sat inside the per-element loop and so executed
//   COUNT times per lane (idempotent but misplaced); hoisted to match
//   _wavePrefixMultiple.
template <typename INTF, typename T, size_t COUNT>
__device__ void _wavePrefixInvertableMultiple(WarpMask mask, T* val)
{
    // > 0 when `mask` is a contiguous, power-of-2 sized, aligned run of lanes
    // that __shfl_up_sync can scan directly; 0 for any other (sparse) mask.
    const int offsetSize = _waveCalcPow2Offset(mask);

    const int laneId = _getLaneId();
    // Keep the lane's original values: needed both for the final
    // inclusive->exclusive inversion and for the sparse-mask gather below.
    T originalVal[COUNT];
    _waveOpCopy<INTF, T, COUNT>(originalVal, val);

    if (offsetSize > 0)
    {
        // Doubling-stride shuffle scan (inclusive), element by element.
        for (int i = 1; i < offsetSize; i += i)
        {
            for (size_t j = 0; j < COUNT; ++j)
            {
                const T readVal = __shfl_up_sync(mask, val[j], i, offsetSize);
                if (laneId >= i)
                {
                    val[j] = INTF::doOp(val[j], readVal);
                }
            }
        }
        // Remove this lane's own contribution to make the scan exclusive.
        _waveOpDoInverse<INTF, T, COUNT>(val, originalVal);
    }
    else
    {
        _waveOpSetInitial<INTF, T, COUNT>(val, val);
        if (!_waveIsSingleLane(mask))
        {
            // Sparse-mask fallback: visit each participating lane in order.
            int remaining = mask;
            while (remaining)
            {
                const int laneBit = remaining & -remaining;
                // Lane index of the lowest set bit.
                const int srcLane = __ffs(laneBit) - 1;
                for (size_t j = 0; j < COUNT; ++j)
                {
                    // Every lane in `mask` must execute the shuffle together.
                    const T readValue = __shfl_sync(mask, originalVal[j], srcLane);
                    // Exclusive scan: only lanes strictly before this one count.
                    if (srcLane < laneId)
                    {
                        val[j] = INTF::doOp(val[j], readValue);
                    }
                }
                remaining &= ~laneBit;
            }
        }
    }
}
| |
// In-place exclusive prefix scan over the lanes of `mask`, applied
// element-wise to the COUNT elements at `val`, for operators with no inverse
// (e.g. min/max/and/or).
// Fix: the return type was declared `T` with no return statement (undefined
// behavior); the result is written through `val`, so the function is void.
template <typename INTF, typename T, size_t COUNT>
__device__ void _wavePrefixMultiple(WarpMask mask, T* val)
{
    // > 0 when `mask` is a contiguous, power-of-2 sized, aligned run of lanes
    // that __shfl_up_sync can scan directly; 0 for any other (sparse) mask.
    const int offsetSize = _waveCalcPow2Offset(mask);

    const int laneId = _getLaneId();

    // `work` carries the inclusive scan; `val` is reset to the identity and
    // accumulates only earlier lanes' contributions (the exclusive scan).
    T work[COUNT];
    _waveOpCopy<INTF, T, COUNT>(work, val);
    _waveOpSetInitial<INTF, T, COUNT>(val, val);

    if (offsetSize > 0)
    {
        // Doubling-stride shuffle scan, element by element.
        for (int i = 1; i < offsetSize; i += i)
        {
            for (size_t j = 0; j < COUNT; ++j)
            {
                const T readVal = __shfl_up_sync(mask, work[j], i, offsetSize);
                if (laneId >= i)
                {
                    work[j] = INTF::doOp(work[j], readVal);
                    val[j] = INTF::doOp(val[j], readVal);
                }
            }
        }
    }
    else
    {
        if (!_waveIsSingleLane(mask))
        {
            // Sparse-mask fallback: visit each participating lane in order.
            int remaining = mask;
            while (remaining)
            {
                const int laneBit = remaining & -remaining;
                // Lane index of the lowest set bit.
                const int srcLane = __ffs(laneBit) - 1;
                for (size_t j = 0; j < COUNT; ++j)
                {
                    // Every lane in `mask` must execute the shuffle together.
                    // `work` still holds the lane's original value here.
                    const T readValue = __shfl_sync(mask, work[j], srcLane);
                    // Exclusive scan: only lanes strictly before this one count.
                    if (srcLane < laneId)
                    {
                        val[j] = INTF::doOp(val[j], readValue);
                    }
                }
                remaining &= ~laneBit;
            }
        }
    }
}
|
|
// Exclusive prefix product across the lanes of `mask` (product is invertible
// in the sense WaveOpMul defines, so the invertible scan is usable via doOp).
template <typename T>
__inline__ __device__ T _wavePrefixProduct(WarpMask mask, T val)
{
    return _wavePrefixScalar<WaveOpMul<T>, T>(mask, val);
}
|
|
// Exclusive prefix sum across the lanes of `mask`; uses the invertible scan
// since addition can be undone by subtraction (WaveOpAdd::doInverse).
template <typename T>
__inline__ __device__ T _wavePrefixSum(WarpMask mask, T val)
{
    return _wavePrefixInvertableScalar<WaveOpAdd<T>, T>(mask, val);
}
|
|
// Exclusive prefix XOR across the lanes of `mask`; XOR is its own inverse,
// so the invertible scan applies.
template <typename T>
__inline__ __device__ T _wavePrefixXor(WarpMask mask, T val)
{
    return _wavePrefixInvertableScalar<WaveOpXor<T>, T>(mask, val);
}
| |
// Exclusive prefix OR across the lanes of `mask`; OR has no inverse, so the
// non-invertible scan is used.
template <typename T>
__inline__ __device__ T _wavePrefixOr(WarpMask mask, T val)
{
    return _wavePrefixScalar<WaveOpOr<T>, T>(mask, val);
}
| |
// Exclusive prefix AND across the lanes of `mask`; AND has no inverse, so the
// non-invertible scan is used.
template <typename T>
__inline__ __device__ T _wavePrefixAnd(WarpMask mask, T val)
{
    return _wavePrefixScalar<WaveOpAnd<T>, T>(mask, val);
}
| |
| |
// Element-wise exclusive prefix product across the lanes of `mask` for a
// composite value; the scan runs in place over the element array.
template <typename T>
__inline__ __device__ T _wavePrefixProductMultiple(WarpMask mask, T val)
{
    typedef typename ElementTypeTrait<T>::Type Elem;
    const size_t kElemCount = sizeof(T) / sizeof(Elem);
    _wavePrefixInvertableMultiple<WaveOpMul<Elem>, Elem, kElemCount>(mask, (Elem*)&val);
    return val;
}
|
|
// Element-wise exclusive prefix sum across the lanes of `mask` for a
// composite value; the scan runs in place over the element array.
template <typename T>
__inline__ __device__ T _wavePrefixSumMultiple(WarpMask mask, T val)
{
    typedef typename ElementTypeTrait<T>::Type Elem;
    const size_t kElemCount = sizeof(T) / sizeof(Elem);
    _wavePrefixInvertableMultiple<WaveOpAdd<Elem>, Elem, kElemCount>(mask, (Elem*)&val);
    return val;
}
|
|
// Element-wise exclusive prefix XOR across the lanes of `mask` for a
// composite value; the scan runs in place over the element array.
template <typename T>
__inline__ __device__ T _wavePrefixXorMultiple(WarpMask mask, T val)
{
    typedef typename ElementTypeTrait<T>::Type Elem;
    const size_t kElemCount = sizeof(T) / sizeof(Elem);
    _wavePrefixInvertableMultiple<WaveOpXor<Elem>, Elem, kElemCount>(mask, (Elem*)&val);
    return val;
}
|
|
// Element-wise exclusive prefix OR across the lanes of `mask`; OR has no
// inverse, so the non-invertible multi-element scan is used.
template <typename T>
__inline__ __device__ T _wavePrefixOrMultiple(WarpMask mask, T val)
{
    typedef typename ElementTypeTrait<T>::Type Elem;
    const size_t kElemCount = sizeof(T) / sizeof(Elem);
    _wavePrefixMultiple<WaveOpOr<Elem>, Elem, kElemCount>(mask, (Elem*)&val);
    return val;
}
|
|
// Element-wise exclusive prefix AND across the lanes of `mask`; AND has no
// inverse, so the non-invertible multi-element scan is used.
template <typename T>
__inline__ __device__ T _wavePrefixAndMultiple(WarpMask mask, T val)
{
    typedef typename ElementTypeTrait<T>::Type Elem;
    const size_t kElemCount = sizeof(T) / sizeof(Elem);
    _wavePrefixMultiple<WaveOpAnd<Elem>, Elem, kElemCount>(mask, (Elem*)&val);
    return val;
}
|
|
// Returns, in the .x component of a uint4, the mask of lanes in `mask` whose
// `val` equals this lane's `val` (as reported by __match_all_sync).
template <typename T>
__inline__ __device__ uint4 _waveMatchScalar(WarpMask mask, T val)
{
    int allMatched;
    const unsigned matchedLanes = __match_all_sync(mask, val, &allMatched);
    return make_uint4(matchedLanes, 0, 0, 0);
}
|
|
// Composite-value version of _waveMatchScalar: intersects the per-element
// match masks, stopping early once no lane can match all elements. The loop
// condition is uniform across the lanes in `mask` (match results agree), so
// the early exit stays convergent.
template <typename T>
__inline__ __device__ uint4 _waveMatchMultiple(WarpMask mask, const T& inVal)
{
    typedef typename ElementTypeTrait<T>::Type ElemType;
    const size_t elemCount = sizeof(T) / sizeof(ElemType);
    const ElemType* elems = (const ElemType*)&inVal;
    int allMatched;
    uint matchedLanes = 0xffffffff;
    for (size_t i = 0; i < elemCount && matchedLanes; ++i)
    {
        matchedLanes &= __match_all_sync(mask, elems[i], &allMatched);
    }
    return make_uint4(matchedLanes, 0, 0, 0);
}
|
|
// Indexes a dim3 as if it were an array of three uints (0 -> x, 1 -> y,
// 2 -> z). The assert is a no-op in this prelude, so an out-of-range axis is
// the caller's responsibility.
__device__ uint getAt(dim3 v, int axis)
{
    SLANG_PRELUDE_ASSERT(axis >= 0 && axis < 3);
    // x, y, z are laid out consecutively, so index from &x.
    const uint* components = &v.x;
    return components[axis];
}
// Component-wise multiplication of a uint3 by a dim3.
__device__ uint3 operator*(uint3 a, dim3 b)
{
    uint3 product;
    product.x = a.x * b.x;
    product.y = a.y * b.y;
    product.z = a.z * b.z;
    return product;
}
|
|
// Reinterprets the bit pattern of `val` as a TResult.
// Reads sizeof(TResult) bytes starting at `val`; callers are expected to
// instantiate this with same-sized types - TODO confirm at the call sites.
// NOTE(review): the pointer cast is technically a strict-aliasing violation;
// left as-is because this prelude must also compile under NVRTC (see the
// SLANG_CUDA_RTC paths in the header), where <cstring>/memcpy is not
// guaranteed to be available.
template<typename TResult, typename TInput>
__inline__ __device__ TResult slang_bit_cast(TInput val)
{
    return *(TResult*)(&val);
}
|
|
| |
|
|
|
|
| |
| |
| |
// Forward declarations only: the prelude just needs to be able to form
// pointers to these types. Their definitions are presumably emitted with the
// Slang-generated kernel code - not visible in this file.
struct UniformEntryPointParams;
struct UniformState;
|
|
| |
| #ifdef SLANG_CUDA_ENABLE_OPTIX |
// Ray description matching the field names/order of HLSL's DXR RayDesc, so
// generated code can populate it and hand it to traceOptiXRay below.
struct RayDesc
{
    float3 Origin;    // ray start position
    float TMin;       // minimum hit distance along the ray
    float3 Direction; // ray direction
    float TMax;       // maximum hit distance along the ray
};
|
|
// Reassembles a 64-bit payload pointer that was split across two 32-bit
// OptiX payload registers (i0 = high 32 bits, i1 = low 32 bits).
static __forceinline__ __device__
void *unpackOptiXRayPayloadPointer(uint32_t i0, uint32_t i1)
{
    uint64_t bits = static_cast<uint64_t>(i0);
    bits = (bits << 32) | static_cast<uint64_t>(i1);
    return reinterpret_cast<void*>(bits);
}
|
|
// Splits a 64-bit payload pointer into two 32-bit OptiX payload registers
// (i0 receives the high 32 bits, i1 the low 32 bits).
static __forceinline__ __device__
void packOptiXRayPayloadPointer(void* ptr, uint32_t& i0, uint32_t& i1)
{
    const uint64_t bits = reinterpret_cast<uint64_t>(ptr);
    i0 = static_cast<uint32_t>(bits >> 32);
    i1 = static_cast<uint32_t>(bits & 0x00000000ffffffff);
}
|
|
// Reads OptiX payload registers 0 and 1 and reconstructs the payload pointer
// that packOptiXRayPayloadPointer stored there.
static __forceinline__ __device__ void *getOptiXRayPayloadPtr()
{
    return unpackOptiXRayPayloadPointer(optixGetPayload_0(), optixGetPayload_1());
}
|
|
// Traces a ray through `AccelerationStructure` via optixTrace, passing the
// address of `Payload` through payload registers 0/1 (the hit/miss programs
// recover it with getOptiXRayPayloadPtr). Parameter names mirror the DXR
// TraceRay call this wraps.
// Fix: the return type was declared `void*` but the function never returned
// a value - undefined behavior. Nothing is (or can be) returned, so the
// function is void.
template<typename T>
__forceinline__ __device__ void traceOptiXRay(
    OptixTraversableHandle AccelerationStructure,
    uint32_t RayFlags,
    uint32_t InstanceInclusionMask,
    uint32_t RayContributionToHitGroupIndex,
    uint32_t MultiplierForGeometryContributionToHitGroupIndex,
    uint32_t MissShaderIndex,
    RayDesc Ray,
    T *Payload
) {
    uint32_t r0, r1;
    packOptiXRayPayloadPointer((void*)Payload, r0, r1);
    optixTrace(
        AccelerationStructure,
        Ray.Origin,
        Ray.Direction,
        Ray.TMin,
        Ray.TMax,
        0.f,  // rayTime: motion blur not used
        InstanceInclusionMask,
        RayFlags,
        RayContributionToHitGroupIndex,
        MultiplierForGeometryContributionToHitGroupIndex,
        MissShaderIndex,
        r0, r1
    );
}
|
|
| #endif |
|
|
// Maximum number of tensor dimensions TensorView (below) can describe.
static const int kSlangTorchTensorMaxDim = 5;
|
|
| |
// Non-owning view over a torch tensor's storage: a raw byte pointer plus
// per-dimension byte strides and element counts (up to
// kSlangTorchTensorMaxDim dimensions). `load` returns a reference, so
// elements can be both read and written through it; `store` writes directly.
//
// Fix: every byte-offset expression previously multiplied and summed
// uint32_t values (e.g. `strides[0] * index`), so the arithmetic wrapped at
// 2^32 for tensors with a byte extent over 4GB even where the result was
// assigned to a uint64_t. All offsets are now widened to 64 bits *before*
// the multiply.
struct TensorView
{
    uint8_t* data;                              // base address of element 0
    uint32_t strides[kSlangTorchTensorMaxDim];  // per-dimension stride in bytes
    uint32_t sizes[kSlangTorchTensorMaxDim];    // per-dimension element count
    uint32_t dimensionCount;                    // number of dimensions in use

    // Byte-offset helpers: each stride is widened to 64 bits before the
    // multiply so large tensors don't overflow 32-bit arithmetic.
    __device__ uint64_t byteOffset1(uint32_t x)
    {
        return uint64_t(strides[0]) * x;
    }
    __device__ uint64_t byteOffset2(uint32_t x, uint32_t y)
    {
        return byteOffset1(x) + uint64_t(strides[1]) * y;
    }
    __device__ uint64_t byteOffset3(uint32_t x, uint32_t y, uint32_t z)
    {
        return byteOffset2(x, y) + uint64_t(strides[2]) * z;
    }
    __device__ uint64_t byteOffset4(uint32_t x, uint32_t y, uint32_t z, uint32_t w)
    {
        return byteOffset3(x, y, z) + uint64_t(strides[3]) * w;
    }
    __device__ uint64_t byteOffset5(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4)
    {
        return byteOffset4(i0, i1, i2, i3) + uint64_t(strides[4]) * i4;
    }

    // Pointer to element 0, reinterpreted as T.
    template<typename T>
    __device__ T* data_ptr()
    {
        return reinterpret_cast<T*>(data);
    }

    // Pointer to the element at a 1D index.
    template<typename T>
    __device__ T* data_ptr_at(uint32_t index)
    {
        return reinterpret_cast<T*>(data + byteOffset1(index));
    }

    // Pointer to the element at a 2D index.
    template<typename T>
    __device__ T* data_ptr_at(uint2 index)
    {
        return reinterpret_cast<T*>(data + byteOffset2(index.x, index.y));
    }

    // Pointer to the element at a 3D index.
    template<typename T>
    __device__ T* data_ptr_at(uint3 index)
    {
        return reinterpret_cast<T*>(data + byteOffset3(index.x, index.y, index.z));
    }

    // Pointer to the element at a 4D index.
    template<typename T>
    __device__ T* data_ptr_at(uint4 index)
    {
        return reinterpret_cast<T*>(data + byteOffset4(index.x, index.y, index.z, index.w));
    }

    // Pointer to the element at an N-dimensional index array.
    template<typename T, unsigned int N>
    __device__ T* data_ptr_at(uint index[N])
    {
        uint64_t offset = 0;
        for (unsigned int i = 0; i < N; ++i)
        {
            offset += uint64_t(strides[i]) * index[i];
        }
        return reinterpret_cast<T*>(data + offset);
    }

    // Element access by coordinates; returns a mutable reference.
    template<typename T>
    __device__ T& load(uint32_t x)
    {
        return *reinterpret_cast<T*>(data + byteOffset1(x));
    }
    template<typename T>
    __device__ T& load(uint32_t x, uint32_t y)
    {
        return *reinterpret_cast<T*>(data + byteOffset2(x, y));
    }
    template<typename T>
    __device__ T& load(uint2 index)
    {
        return *reinterpret_cast<T*>(data + byteOffset2(index.x, index.y));
    }
    template<typename T>
    __device__ T& load(uint32_t x, uint32_t y, uint32_t z)
    {
        return *reinterpret_cast<T*>(data + byteOffset3(x, y, z));
    }
    template<typename T>
    __device__ T& load(uint3 index)
    {
        return *reinterpret_cast<T*>(data + byteOffset3(index.x, index.y, index.z));
    }
    template<typename T>
    __device__ T& load(uint32_t x, uint32_t y, uint32_t z, uint32_t w)
    {
        return *reinterpret_cast<T*>(data + byteOffset4(x, y, z, w));
    }
    template<typename T>
    __device__ T& load(uint4 index)
    {
        return *reinterpret_cast<T*>(data + byteOffset4(index.x, index.y, index.z, index.w));
    }
    template<typename T>
    __device__ T& load(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4)
    {
        return *reinterpret_cast<T*>(data + byteOffset5(i0, i1, i2, i3, i4));
    }

    // Element access by an N-dimensional index array.
    template<typename T, unsigned int N>
    __device__ T& load(uint index[N])
    {
        uint64_t offset = 0;
        for (unsigned int i = 0; i < N; ++i)
        {
            offset += uint64_t(strides[i]) * index[i];
        }
        return *reinterpret_cast<T*>(data + offset);
    }

    // Element stores by coordinates.
    template<typename T>
    __device__ void store(uint32_t x, T val)
    {
        *reinterpret_cast<T*>(data + byteOffset1(x)) = val;
    }
    template<typename T>
    __device__ void store(uint32_t x, uint32_t y, T val)
    {
        *reinterpret_cast<T*>(data + byteOffset2(x, y)) = val;
    }
    template<typename T>
    __device__ void store(uint2 index, T val)
    {
        *reinterpret_cast<T*>(data + byteOffset2(index.x, index.y)) = val;
    }
    template<typename T>
    __device__ void store(uint32_t x, uint32_t y, uint32_t z, T val)
    {
        *reinterpret_cast<T*>(data + byteOffset3(x, y, z)) = val;
    }
    template<typename T>
    __device__ void store(uint3 index, T val)
    {
        *reinterpret_cast<T*>(data + byteOffset3(index.x, index.y, index.z)) = val;
    }
    template<typename T>
    __device__ void store(uint32_t x, uint32_t y, uint32_t z, uint32_t w, T val)
    {
        *reinterpret_cast<T*>(data + byteOffset4(x, y, z, w)) = val;
    }
    template<typename T>
    __device__ void store(uint4 index, T val)
    {
        *reinterpret_cast<T*>(data + byteOffset4(index.x, index.y, index.z, index.w)) = val;
    }
    template<typename T>
    __device__ void store(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4, T val)
    {
        *reinterpret_cast<T*>(data + byteOffset5(i0, i1, i2, i3, i4)) = val;
    }

    // Element store by an N-dimensional index array.
    template<typename T, unsigned int N>
    __device__ void store(uint index[N], T val)
    {
        uint64_t offset = 0;
        for (unsigned int i = 0; i < N; ++i)
        {
            offset += uint64_t(strides[i]) * index[i];
        }
        *reinterpret_cast<T*>(data + offset) = val;
    }
};
|
|