| #pragma once
|
|
|
| |
| * General settings and functions
|
| */
|
| const int WARP_SIZE = 32;
|
| const int MAX_BLOCK_SIZE = 1024;
|
|
|
| static int getNumThreads(int nElem) {
|
| int threadSizes[6] = {32, 64, 128, 256, 512, MAX_BLOCK_SIZE};
|
| for (int i = 0; i < 6; ++i) {
|
| if (nElem <= threadSizes[i]) {
|
| return threadSizes[i];
|
| }
|
| }
|
| return MAX_BLOCK_SIZE;
|
| }
|
|
|
| |
| * Reduction utilities
|
| */
|
| template <typename T>
|
| __device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize,
|
| unsigned int mask = 0xffffffff) {
|
| #if CUDART_VERSION >= 9000
|
| return __shfl_xor_sync(mask, value, laneMask, width);
|
| #else
|
| return __shfl_xor(value, laneMask, width);
|
| #endif
|
| }
|
|
|
| __device__ __forceinline__ int getMSB(int val) { return 31 - __clz(val); }
|
|
|
| template<typename T>
|
| struct Pair {
|
| T v1, v2;
|
| __device__ Pair() {}
|
| __device__ Pair(T _v1, T _v2) : v1(_v1), v2(_v2) {}
|
| __device__ Pair(T v) : v1(v), v2(v) {}
|
| __device__ Pair(int v) : v1(v), v2(v) {}
|
| __device__ Pair &operator+=(const Pair<T> &a) {
|
| v1 += a.v1;
|
| v2 += a.v2;
|
| return *this;
|
| }
|
| };
|
|
|
| template<typename T>
|
| static __device__ __forceinline__ T warpSum(T val) {
|
| #if __CUDA_ARCH__ >= 300
|
| for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
|
| val += WARP_SHFL_XOR(val, 1 << i, WARP_SIZE);
|
| }
|
| #else
|
| __shared__ T values[MAX_BLOCK_SIZE];
|
| values[threadIdx.x] = val;
|
| __threadfence_block();
|
| const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
|
| for (int i = 1; i < WARP_SIZE; i++) {
|
| val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
|
| }
|
| #endif
|
| return val;
|
| }
|
|
|
| template<typename T>
|
| static __device__ __forceinline__ Pair<T> warpSum(Pair<T> value) {
|
| value.v1 = warpSum(value.v1);
|
| value.v2 = warpSum(value.v2);
|
| return value;
|
| } |