| | |
| |
|
| | #include <ATen/cuda/ApplyGridUtils.cuh> |
| | #include <ATen/cuda/detail/IndexUtils.cuh> |
| | #include <ATen/core/TensorBase.h> |
| | #include <ATen/ceil_div.h> |
| | #include <ATen/cuda/Atomic.cuh> |
| | #include <ATen/cuda/CUDAContext.h> |
| | #include <c10/macros/Macros.h> |
| | #include <ATen/native/Copy.h> |
| |
|
| | #include <math.h> |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | namespace at { |
| | namespace cuda { |
| |
|
| | |
// Declares how a tensor argument is accessed by the pointwise op:
// ReadWrite arguments may be written (CUDA_tensor_apply2 defensively makes
// them contiguous when their indices may overlap, then copies results back);
// ReadOnly arguments are only read.
enum class TensorArgType { ReadWrite, ReadOnly };
| |
|
| | namespace { |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
// Rearranges the dimensions of up to four TensorInfos, in place and in
// lockstep, so that strides end up in decreasing order where possible,
// giving the pointwise kernels a more contiguous access pattern.
//
// Two dims i < j are swapped only when EVERY participating tensor has
// strides[i] < strides[j] (strictly increasing, none decreasing), so the
// relative layout of all tensors is preserved. Size-1 dims are skipped.
//
// Bails out (leaving all infos untouched from that point on) if the infos
// disagree on dims or on any size — the transform is only meaningful when
// the tensors are elementwise-compatible.
template <typename T1, typename IndexType,
          typename T2 = void, typename T3 = void, typename T4 = void>
inline void rearrangeDims(detail::TensorInfo<T1, IndexType>* aInfo,
                          detail::TensorInfo<T2, IndexType>* bInfo = nullptr,
                          detail::TensorInfo<T3, IndexType>* cInfo = nullptr,
                          detail::TensorInfo<T4, IndexType>* dInfo = nullptr) {
  int numInfos = 1;
  int dims = aInfo->dims;
  // Local pointer tables aliasing each info's size/stride arrays; all swaps
  // below mutate the infos through these pointers.
  IndexType *sizes[4] = { aInfo->sizes, };
  IndexType *strides[4] = { aInfo->strides, };

  if (bInfo != nullptr) {
    ++numInfos;
    if (bInfo->dims != dims) return;
    sizes[1] = bInfo->sizes;
    strides[1] = bInfo->strides;
  }

  if (cInfo != nullptr) {
    ++numInfos;
    if (cInfo->dims != dims) return;
    sizes[2] = cInfo->sizes;
    strides[2] = cInfo->strides;
  }

  if (dInfo != nullptr) {
    ++numInfos;
    if (dInfo->dims != dims) return;
    sizes[3] = dInfo->sizes;
    strides[3] = dInfo->strides;
  }

  // All sizes must match across tensors for the swap to be valid.
  for (int i = 1; i < numInfos; ++i) {
    for (int j = 0; j < dims; ++j) {
      if (sizes[i][j] != sizes[0][j]) return;
    }
  }

  for (int i = 0; i < dims - 1; ++i) {
    // Size-1 dims contribute nothing to addressing; skip them.
    if (sizes[0][i] == 1) continue;

    for (int j = i + 1; j < dims; ++j) {
      if (sizes[0][j] == 1) continue;

      // Classify the (i, j) stride relationship across all tensors.
      bool hasIncreasingStrides = false;
      bool hasDecreasingStrides = false;

      for (int k = 0; k < numInfos; k++) {
        IndexType stride_i = strides[k][i];
        IndexType stride_j = strides[k][j];
        if (stride_i < stride_j) {
          hasIncreasingStrides = true;
        } else if (stride_i > stride_j) {
          hasDecreasingStrides = true;
        }
      }

      // Swap only when unanimously increasing, so every tensor's layout is
      // improved (or unchanged) and none is made worse.
      if (hasIncreasingStrides && !hasDecreasingStrides) {
        for (int k = 0; k < numInfos; k++) {
          IndexType size = sizes[k][i];
          sizes[k][i] = sizes[k][j];
          sizes[k][j] = size;

          IndexType stride = strides[k][i];
          strides[k][i] = strides[k][j];
          strides[k][j] = stride;
        }
      }
    }
  }
}
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
// Compile-time unroller for single-tensor pointwise application: each
// recursion level converts one linear index into an element offset of `a`
// and appends it to the `aOffsets...` pack, until `remaining_steps` hits 0
// and a specialization below invokes `op`.
//
// `n` is the number of valid indices in this group (the tensor tail may be
// shorter than `step`); offsets past `n` are clamped to 0, and `op` receives
// `n` (in the multi-offset base case) so it can ignore the extras.
template <typename Op,
          typename scalar,
          typename IndexType,
          int ADims,
          int remaining_steps,
          typename... Offsets>
struct ApplyOp1 {
__device__ __forceinline__
static void apply(detail::TensorInfo<scalar, IndexType> &a, const Op &op, int n,
                  IndexType linearIndex, Offsets... aOffsets) {
  // Convert `linearIndex` into an offset of `a`.
  // Cast the pack size to int to match `n` and avoid a signed/unsigned
  // comparison (consistent with ApplyOp2 below, which casts to int64_t).
  const IndexType aOffset = static_cast<int>(sizeof...(Offsets)) < n ?
    detail::IndexToOffset<scalar, IndexType, ADims>::get(linearIndex, a) : 0;

  ApplyOp1<Op, scalar, IndexType, ADims, remaining_steps - 1, const IndexType, Offsets...>::apply(
    a, op, n, linearIndex + 1, aOffsets..., aOffset
  );
}
};
| |
|
| | |
| | |
// Base case (remaining_steps == 0) with exactly one computed offset:
// apply `op` directly to the single element. The count and linear index are
// no longer needed here, so those parameters are left unnamed (commented) to
// avoid unused-parameter warnings — matching the corresponding ApplyOp2
// single-offset specialization, which already omits the names.
template <typename Op,
          typename scalar,
          typename IndexType,
          int ADims,
          typename Offset>
struct ApplyOp1<Op, scalar, IndexType, ADims, 0, Offset> {
__device__ __forceinline__
static void apply(detail::TensorInfo<scalar, IndexType> &a, const Op &op,
                  int /*n*/, IndexType /*linearIndex*/, Offset offset) {
  op(a.data[offset]);
}
};
| |
|
// Base case (remaining_steps == 0) with multiple computed offsets: pass the
// valid count `n` followed by references to all `step` elements to `op`.
// Offsets past `n` were clamped to 0 by the recursion above; `op` is
// expected to use `n` to ignore those trailing references.
template <typename Op,
          typename scalar,
          typename IndexType,
          int ADims,
          typename... Offsets>
struct ApplyOp1<Op, scalar, IndexType, ADims, 0, Offsets...> {
__device__ __forceinline__
static void apply(detail::TensorInfo<scalar, IndexType> &a, const Op &op, int n,
                  IndexType linearIndex, Offsets... offsets) {
  op(n, a.data[offsets]...);
}
};
| |
|
// Single-tensor pointwise kernel: a grid-stride loop in which each thread
// handles `step` consecutive linear indices per iteration, delegating the
// per-group unrolling to ApplyOp1.
template <typename Op,
          typename scalar,
          typename IndexType,
          int ADims,
          int step>
C10_LAUNCH_BOUNDS_2(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM)
__global__ void kernelPointwiseApply1(detail::TensorInfo<scalar, IndexType> a,
                                      IndexType totalElements, const Op op) {
  for (IndexType linearIndex = (blockIdx.x * blockDim.x + threadIdx.x) * step;
       linearIndex < totalElements;
       linearIndex += gridDim.x * blockDim.x * step) {
    // The final group may hold fewer than `step` elements; pass the actual
    // count so ApplyOp1 clamps the out-of-range offsets.
    ApplyOp1<Op, scalar, IndexType, ADims, step>::apply(
      a, op, ::min(step, static_cast<int>(totalElements - linearIndex)), linearIndex);
  }
}
| |
|
| |
|
// Compile-time unroller for two-tensor pointwise application: each recursion
// level converts one linear index into an offset of `a` and an offset of
// `b`, accumulating them in the `aOffsets...` / `bOffsets...` packs, until
// `remaining_steps` hits 0 and a specialization below invokes `op`.
template <typename Op,
          typename scalar1,
          typename scalar2,
          typename IndexType,
          int ADims,
          int BDims,
          int remaining_steps,
          typename... Offsets>
struct ApplyOp2 {
__device__ __forceinline__
static void apply(detail::TensorInfo<scalar1, IndexType> &a,
                  detail::TensorInfo<scalar2, IndexType> &b,
                  const Op &op, int64_t n, IndexType linearIndex,
                  Offsets... aOffsets, Offsets... bOffsets) {
  // Convert `linearIndex` into an offset of `a`; only the first `n` indices
  // are in range, so later steps get offset 0 (op receives `n` and is
  // expected to ignore the extras).
  const IndexType aOffset = static_cast<int64_t>(sizeof...(Offsets)) < n ?
    detail::IndexToOffset<scalar1, IndexType, ADims>::get(linearIndex, a) : 0;

  // Convert `linearIndex` into an offset of `b`, with the same clamping.
  const IndexType bOffset = static_cast<int64_t>(sizeof...(Offsets)) < n ?
    detail::IndexToOffset<scalar2, IndexType, BDims>::get(linearIndex, b) : 0;

  ApplyOp2<Op, scalar1, scalar2, IndexType, ADims, BDims, remaining_steps - 1, const IndexType, Offsets...>::apply(
    a, b, op, n, linearIndex + 1, aOffsets..., aOffset, bOffsets..., bOffset
  );
}
};
| |
|
| | |
| | |
// Base case (remaining_steps == 0) with exactly one offset per tensor:
// invoke `op` on the two elements directly. The count and linear-index
// parameters are no longer needed and are left unnamed.
template <typename Op,
          typename scalar1,
          typename scalar2,
          typename IndexType,
          int ADims,
          int BDims,
          typename Offset>
struct ApplyOp2<Op, scalar1, scalar2, IndexType, ADims, BDims, 0, Offset> {
__device__ __forceinline__
static void apply(detail::TensorInfo<scalar1, IndexType> &a,
                  detail::TensorInfo<scalar2, IndexType> &b,
                  const Op &op, int /*n*/, IndexType /*linearIndex*/,
                  Offset aOffset, Offset bOffset) {
  op(a.data[aOffset], b.data[bOffset]);
}
};
| |
|
// Base case (remaining_steps == 0) with multiple offsets per tensor: pass
// the valid count `n`, then all of `a`'s element references, then all of
// `b`'s. Offsets past `n` were clamped to 0 by the recursion; `op` is
// expected to use `n` to ignore those trailing references.
template <typename Op,
          typename scalar1,
          typename scalar2,
          typename IndexType,
          int ADims,
          int BDims,
          typename... Offsets>
struct ApplyOp2<Op, scalar1, scalar2, IndexType, ADims, BDims, 0, Offsets...> {
__device__ __forceinline__
static void apply(detail::TensorInfo<scalar1, IndexType> &a,
                  detail::TensorInfo<scalar2, IndexType> &b,
                  const Op &op, int n, IndexType linearIndex,
                  Offsets... aOffsets, Offsets... bOffsets) {
  op(n, a.data[aOffsets]..., b.data[bOffsets]...);
}
};
| |
|
// Two-tensor pointwise kernel: a grid-stride loop in which each thread
// handles `step` consecutive linear indices per iteration, delegating the
// per-group unrolling to ApplyOp2. Launch bounds are tunable via the
// max_threads_per_block / min_blocks_per_sm template parameters.
template <typename Op,
          typename scalar1,
          typename scalar2,
          typename IndexType,
          int ADims, int BDims,
          int step,
          int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK,
          int min_blocks_per_sm=AT_APPLY_BLOCKS_PER_SM>
C10_LAUNCH_BOUNDS_2(max_threads_per_block, min_blocks_per_sm)
__global__ void
kernelPointwiseApply2(detail::TensorInfo<scalar1, IndexType> a,
                      detail::TensorInfo<scalar2, IndexType> b,
                      IndexType totalElements,
                      const Op op) {
  for (IndexType linearIndex = (blockIdx.x * blockDim.x + threadIdx.x) * step;
       linearIndex < totalElements;
       linearIndex += gridDim.x * blockDim.x * step) {
    // The final group may hold fewer than `step` elements; pass the actual
    // count so ApplyOp2 clamps the out-of-range offsets.
    ApplyOp2<Op, scalar1, scalar2, IndexType, ADims, BDims, step>::apply(
      a, b, op, ::min(step, static_cast<int>(totalElements - linearIndex)),
      linearIndex);
  }
}
| |
|
| | } |
| |
|
/*
  Applies `op` elementwise to tensors `a` and `b`, dispatching to
  kernelPointwiseApply2 with statically-collapsed dimensionality. `op` is
  invoked with references to the corresponding elements of `a` and `b`
  (see ApplyOp2 above).

  Returns false (without launching) when:
    - the tensors' element counts differ,
    - either tensor exceeds MAX_TENSORINFO_DIMS dimensions, or
    - there is no valid current device / no valid launch grid.
  Returns true on success; empty tensors are a successful no-op.

  aType/bType declare access intent: a ReadWrite tensor whose indices may
  overlap is first replaced by a contiguous copy, operated on, and copied
  back afterwards, so each element is processed exactly once.
*/
template <typename scalar1, typename scalar2, int step, typename Op,
          int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK,
          int min_blocks_per_sm=AT_APPLY_BLOCKS_PER_SM>
inline bool CUDA_tensor_apply2(at::TensorBase a,
                               at::TensorBase b,
                               const Op op,
                               TensorArgType aType = TensorArgType::ReadWrite,
                               TensorArgType bType = TensorArgType::ReadOnly) {
  TORCH_CHECK(a.device().is_cuda() && b.device().is_cuda(),
              "CUDA_tensor_apply2: Expected tensors to have CUDA DeviceType, but got "
              "tensors with type ", a.device().type(), " and ", b.device().type());
  int64_t totalElements = a.numel();

  if (totalElements != b.numel()) {
    return false;
  }

  if (a.dim() > MAX_TENSORINFO_DIMS ||
      b.dim() > MAX_TENSORINFO_DIMS) {
    return false;
  }

  if (a.numel() == 0) {
    // Empty tensor; do nothing
    return true;
  }
  const dim3 block = getApplyBlock(max_threads_per_block);

  dim3 grid;
  int64_t curDevice = current_device();
  if (curDevice == -1) return false;
  if (!getApplyGrid<step>(totalElements, grid, curDevice, max_threads_per_block)) {
    return false;
  }

  /*
  Expands readable/writable tensors whose indices may be "overlapped."
  This ensures that each element of the tensor is operated on once and only
  once.
  */
  TensorBase oldA;
  TensorBase oldB;

  if (aType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(a)) {
    // Must perform in contiguous space
    oldA = std::exchange(a, a.contiguous());
  }
  if (bType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(b)) {
    // Must perform in contiguous space
    oldB = std::exchange(b, b.contiguous());
  }

  // The tensor dimensions may be collapsible; exploiting that statically
  // (via the dims template parameters of kernelPointwiseApply2) avoids
  // div/mod work in the index computation. Likewise, when both tensors are
  // addressable with 32-bit math we use `unsigned int` offsets instead of
  // `uint64_t`.

  // NOTE(fix): the HANDLE_* macro definitions were missing from this file —
  // the macros were invoked below (HANDLE_A_CASE / HANDLE_CASE) but never
  // defined, and the switch bodies with their closing braces were left
  // dangling. Restored the #define lines (and the matching #undefs) around
  // the existing bodies.

#define HANDLE_CASE(TYPE, A, B)                                        \
  kernelPointwiseApply2<Op,                                            \
                        scalar1,                                       \
                        scalar2,                                       \
                        TYPE, A, B, step,                              \
                        max_threads_per_block,                         \
                        min_blocks_per_sm>                             \
   <<<grid, block, 0, at::cuda::getCurrentCUDAStream(curDevice)>>>(    \
       aInfo, bInfo, static_cast<TYPE>(totalElements), op);            \
  C10_CUDA_KERNEL_LAUNCH_CHECK();

#define HANDLE_B_CASE(TYPE, A, B) {       \
  switch (B) {                            \
    case 1:                               \
      HANDLE_CASE(TYPE, A, 1);            \
      break;                              \
    case 2:                               \
      HANDLE_CASE(TYPE, A, 2);            \
      break;                              \
    default:                              \
      HANDLE_CASE(TYPE, A, -1);           \
      break;                              \
  }                                       \
}

#define HANDLE_A_CASE(TYPE, A, B) {       \
  switch (A) {                            \
    case 1:                               \
      HANDLE_B_CASE(TYPE, 1, B);          \
      break;                              \
    case 2:                               \
      HANDLE_B_CASE(TYPE, 2, B);          \
      break;                              \
    default:                              \
      HANDLE_B_CASE(TYPE, -1, B);         \
      break;                              \
  }                                       \
}

  if (detail::canUse32BitIndexMath(a) &&
      detail::canUse32BitIndexMath(b)) {
    detail::TensorInfo<scalar1, unsigned int> aInfo =
      detail::getTensorInfo<scalar1, unsigned int>(a);

    detail::TensorInfo<scalar2, unsigned int> bInfo =
      detail::getTensorInfo<scalar2, unsigned int>(b);
    rearrangeDims(&aInfo, &bInfo);
    aInfo.collapseDims();
    bInfo.collapseDims();

    HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims);
  } else {
    detail::TensorInfo<scalar1, uint64_t> aInfo =
      detail::getTensorInfo<scalar1, uint64_t>(a);

    detail::TensorInfo<scalar2, uint64_t> bInfo =
      detail::getTensorInfo<scalar2, uint64_t>(b);
    rearrangeDims(&aInfo, &bInfo);
    aInfo.collapseDims();
    bInfo.collapseDims();

    // For 64-bit indexing, only the fully-collapsed (dims == 1) and the
    // fully-generic (dims == -1) instantiations are compiled.
    if (aInfo.dims == 1 && bInfo.dims == 1) {
      HANDLE_CASE(uint64_t, 1, 1);
    } else {
      HANDLE_CASE(uint64_t, -1, -1);
    }
  }
#undef HANDLE_CASE
#undef HANDLE_B_CASE
#undef HANDLE_A_CASE

  if (oldA.defined()) {
    // Copy the results back into the original (possibly overlapping) tensor.
    at::native::copy_ignoring_overlaps(oldA, a);
  }

  if (oldB.defined()) {
    at::native::copy_ignoring_overlaps(oldB, b);
  }

  return true;
}
| |
|
| | |
// Convenience overload: CUDA_tensor_apply2 with step fixed to 1 (one element
// per thread per grid-stride iteration). All other arguments forward
// unchanged.
template <typename scalar1, typename scalar2, typename Op,
          int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK,
          int min_blocks_per_sm=AT_APPLY_BLOCKS_PER_SM>
inline bool CUDA_tensor_apply2(const at::TensorBase &a,
                               const at::TensorBase &b,
                               const Op op,
                               TensorArgType aType = TensorArgType::ReadWrite,
                               TensorArgType bType = TensorArgType::ReadOnly) {
  return CUDA_tensor_apply2<scalar1, scalar2, 1, Op,
                            max_threads_per_block, min_blocks_per_sm>(a, b, op, aType, bType);
}
| |
|
| | } |
| | } |
| |
|