| | #pragma once |
| |
|
| | #include <thrust/tuple.h> |
| |
|
| | #include <ATen/native/SharedReduceOps.h> |
| | #include <ATen/cuda/DeviceUtils.cuh> |
| |
|
| | namespace at { |
| | namespace native { |
| | namespace cuda_utils { |
| |
|
| | constexpr int kCUDABlockReduceNumThreads = 512; |
| | |
| | |
| | |
| | |
| | constexpr int kCUDABlockReduceMaxThreads = C10_WARP_SIZE * C10_WARP_SIZE; |
| |
|
| | |
| | |
| | |
| | |
| | template <typename T> |
| | __inline__ __device__ T WarpReduceSum(T val) { |
| | #pragma unroll |
| | for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { |
| | val += WARP_SHFL_DOWN(val, offset); |
| | } |
| | return val; |
| | } |
| |
|
| | struct Block1D { |
| | static __forceinline__ __device__ int Tid() { return threadIdx.x; } |
| |
|
| | static __forceinline__ __device__ int Warps() { |
| | return blockDim.x / C10_WARP_SIZE; |
| | } |
| | }; |
| |
|
| | struct Block2D { |
| | static __forceinline__ __device__ int Tid() { |
| | return threadIdx.x + threadIdx.y * blockDim.x; |
| | } |
| |
|
| | static __forceinline__ __device__ int Warps() { |
| | return blockDim.x * blockDim.y / C10_WARP_SIZE; |
| | } |
| | }; |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | template <typename T, typename B = Block1D> |
| | __inline__ __device__ T BlockReduceSum(T val, T* shared) { |
| | const int tid = B::Tid(); |
| | const int lid = tid % C10_WARP_SIZE; |
| | const int wid = tid / C10_WARP_SIZE; |
| | val = WarpReduceSum(val); |
| | __syncthreads(); |
| | if (lid == 0) { |
| | shared[wid] = val; |
| | } |
| | __syncthreads(); |
| | val = (tid < B::Warps()) ? shared[lid] : T(0); |
| | if (wid == 0) { |
| | val = WarpReduceSum(val); |
| | } |
| | return val; |
| | } |
| |
|
| | template <typename T, class ReduceOp> |
| | __inline__ __device__ T WarpReduce(T val, const ReduceOp& op) { |
| | #pragma unroll |
| | for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { |
| | val = op.combine(val, op.warp_shfl_down(val, offset)); |
| | } |
| | return val; |
| | } |
| |
|
| | template <typename T, class ReduceOp, typename B = Block1D> |
| | __inline__ __device__ T |
| | BlockReduce(T val, const ReduceOp& op, const T& identity_element, T* shared) { |
| | const int tid = B::Tid(); |
| | const int lid = tid % C10_WARP_SIZE; |
| | const int wid = tid / C10_WARP_SIZE; |
| | val = WarpReduce(val, op); |
| | __syncthreads(); |
| | if (lid == 0) { |
| | shared[wid] = val; |
| | } |
| | __syncthreads(); |
| | val = (tid < B::Warps()) ? shared[lid] : identity_element; |
| | if (wid == 0) { |
| | val = WarpReduce(val, op); |
| | } |
| | return val; |
| | } |
| |
|
| | } |
| | } |
| | } |
| |
|