#pragma once

#include <ATen/cuda/ApplyGridUtils.cuh>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/core/TensorBase.h>
#include <ATen/ceil_div.h>
#include <ATen/cuda/Atomic.cuh>
#include <ATen/cuda/CUDAContext.h>
#include <c10/macros/Macros.h>
#include <ATen/native/Copy.h>

#include <math.h>
//
// This file contains pointwise operation functions and kernels that
// work on both contiguous and non-contiguous tensor arguments of
// arbitrary (up to MAX_TENSORINFO_DIMS) dimensions without copying or
// temporary storage.
//
// The operator passed to CUDA_tensor_apply2 below is invoked with
// (non-const) references to one element of each tensor, so it may read
// and/or write the elements in place.
//

namespace at::cuda {

// Whether an argument tensor may be written to (ReadWrite) or is only
// ever read from (ReadOnly) by the pointwise operator.
enum class TensorArgType { ReadWrite, ReadOnly };

namespace {
// Rearrange dimensions for pointwise operations so that strides are in
// decreasing order as much as possible, so that kernels have better memory
// access patterns.
//
// For example, consider a binary operation on two "transposed" 2-dim tensors:
//    sizes:          256 512
//    aInfo->strides:   1 256
//    bInfo->strides:   1 256
//
// Given this, each concurrent memory access inside kernelPointwiseApply2() is
// exactly 256 elements apart, resulting in poor performance.
//
// This function exchanges dimensions so that memory access is contiguous:
//    sizes:          512 256
//    aInfo->strides: 256   1
//    bInfo->strides: 256   1
//
// (Actually, it becomes even better because now collapseDims() can turn each
// input into one contiguous array.)
//
// In general, given M (<=4) TensorInfo's with N dimensions, we can view each
// strides[i] (0 <= i < N) as an M-tuple.  Given each pair i < j, we exchange
// strides[i] and strides[j] if
//    (1) strides[i][k] < strides[j][k] for some k (0 <= k < M)
//        (exchanging them will benefit input #k), and
//    (2) strides[i][k] <= strides[j][k] for all k
//        (exchanging them will not make any input worse).
template <typename T1, typename IndexType,
          typename T2 = void, typename T3 = void, typename T4 = void>
inline void rearrangeDims(detail::TensorInfo<T1, IndexType>* aInfo,
                          detail::TensorInfo<T2, IndexType>* bInfo = nullptr,
                          detail::TensorInfo<T3, IndexType>* cInfo = nullptr,
                          detail::TensorInfo<T4, IndexType>* dInfo = nullptr) {
  int numInfos = 1;
  int dims = aInfo->dims;
  IndexType *sizes[4] = { aInfo->sizes, };
  IndexType *strides[4] = { aInfo->strides, };

  if (bInfo != nullptr) {
    ++numInfos;
    if (bInfo->dims != dims) return;
    sizes[1] = bInfo->sizes;
    strides[1] = bInfo->strides;
  }

  if (cInfo != nullptr) {
    ++numInfos;
    if (cInfo->dims != dims) return;
    sizes[2] = cInfo->sizes;
    strides[2] = cInfo->strides;
  }

  if (dInfo != nullptr) {
    ++numInfos;
    if (dInfo->dims != dims) return;
    sizes[3] = dInfo->sizes;
    strides[3] = dInfo->strides;
  }

  // Bail out if sizes do not match: we are using "deprecated pointwise
  // behavior" among tensors of different shapes but same number of elements.
  for (int i = 1; i < numInfos; ++i) {
    for (int j = 0; j < dims; ++j) {
      if (sizes[i][j] != sizes[0][j]) return;
    }
  }

  for (int i = 0; i < dims - 1; ++i) {
    // No need to consider dimensions of size 1.
    if (sizes[0][i] == 1) continue;

    for (int j = i + 1; j < dims; ++j) {
      if (sizes[0][j] == 1) continue;

      // Compare the relative sizes of strides between dim #i and dim #j.
      bool hasIncreasingStrides = false;
      bool hasDecreasingStrides = false;

      for (int k = 0; k < numInfos; k++) {
        IndexType stride_i = strides[k][i];
        IndexType stride_j = strides[k][j];
        if (stride_i < stride_j) {
          hasIncreasingStrides = true;
        } else if (stride_i > stride_j) {
          hasDecreasingStrides = true;
        }
      }

      if (hasIncreasingStrides && !hasDecreasingStrides) {
        for (int k = 0; k < numInfos; k++) {
          IndexType size = sizes[k][i];
          sizes[k][i] = sizes[k][j];
          sizes[k][j] = size;

          IndexType stride = strides[k][i];
          strides[k][i] = strides[k][j];
          strides[k][j] = stride;
        }
      }
    }
  }
}
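
// Worked example (illustrative, not from the original sources): suppose a
// single TensorInfo describes a 3-dim view with
//    sizes:   4 1 8
//    strides: 1 8 4
// Dim 1 has size 1 and is skipped. For the pair (0, 2), strides[0] <
// strides[2], so the two dims are exchanged, yielding
//    sizes:   8 1 4
//    strides: 4 8 1
// Ignoring the size-1 dim, strides now decrease left to right, and a
// subsequent collapseDims() can merge dims 0 and 2 (stride 4 == 4 * stride 1)
// into one contiguous run of 32 elements.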
// Recursive template machinery that unrolls `step` elements per thread:
// each level of ApplyOp1 computes one offset into `a`, and the
// zero-remaining-steps specializations below invoke the op.
template <typename Op,
          typename scalar,
          typename IndexType,
          int ADims,
          int remaining_steps,
          typename... Offsets>
struct ApplyOp1 {
__device__ __forceinline__
static void apply(detail::TensorInfo<scalar, IndexType> &a, const Op &op, int n,
                  IndexType linearIndex, Offsets... aOffsets) {
  // Convert `linearIndex` into an offset of `a` (only for the first `n`
  // unrolled elements; the remainder get a dummy offset of 0).
  const IndexType aOffset = static_cast<int>(sizeof...(Offsets)) < n ?
    detail::IndexToOffset<scalar, IndexType, ADims>::get(linearIndex, a) : 0;

  ApplyOp1<Op, scalar, IndexType, ADims, remaining_steps - 1, const IndexType, Offsets...>::apply(
    a, op, n, linearIndex + 1, aOffsets..., aOffset
  );
}
};

// Specialize the `step=1` case (i.e., `remaining_steps=0` and
// `len(Offsets)=1`). We don't need to pass in how many elements need to be
// processed in this case.
template <typename Op,
          typename scalar,
          typename IndexType,
          int ADims,
          typename Offset>
struct ApplyOp1<Op, scalar, IndexType, ADims, 0, Offset> {
__device__ __forceinline__
static void apply(detail::TensorInfo<scalar, IndexType> &a, const Op &op,
                  int /*n*/, IndexType /*linearIndex*/, Offset offset) {
  op(a.data[offset]);
}
};

template <typename Op,
          typename scalar,
          typename IndexType,
          int ADims,
          typename... Offsets>
struct ApplyOp1<Op, scalar, IndexType, ADims, 0, Offsets...> {
__device__ __forceinline__
static void apply(detail::TensorInfo<scalar, IndexType> &a, const Op &op, int n,
                  IndexType linearIndex, Offsets... offsets) {
  op(n, a.data[offsets]...);
}
};
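
// Expansion sketch (illustrative): for step = 2, a call to
// ApplyOp1<Op, scalar, IndexType, ADims, 2>::apply(a, op, n, i) computes
// aOffset0 for linear index i, recurses into the remaining_steps = 1 level to
// compute aOffset1 for i + 1, and bottoms out in the variadic
// remaining_steps = 0 specialization above, which invokes
// op(n, a.data[aOffset0], a.data[aOffset1]).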
template <typename Op,
          typename scalar,
          typename IndexType,
          int ADims,
          int step>
C10_LAUNCH_BOUNDS_2(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM)
__global__ void kernelPointwiseApply1(detail::TensorInfo<scalar, IndexType> a,
                                      IndexType totalElements, const Op op) {
  // Grid-stride loop; each thread processes `step` elements per iteration.
  for (IndexType linearIndex = (blockIdx.x * blockDim.x + threadIdx.x) * step;
       linearIndex < totalElements;
       linearIndex += gridDim.x * blockDim.x * step) {
    ApplyOp1<Op, scalar, IndexType, ADims, step>::apply(
      a, op, ::min(step, static_cast<int>(totalElements - linearIndex)), linearIndex);
  }
}
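
// Launch sketch (illustrative, mirroring what a host-side wrapper would do):
// for a float tensor `a` that is 32-bit indexable and collapses to one dim,
// this kernel would be instantiated and launched roughly as
//   auto aInfo = detail::getTensorInfo<float, unsigned int>(a);
//   aInfo.collapseDims();
//   kernelPointwiseApply1<Op, float, unsigned int, 1, /*step=*/1>
//     <<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
//       aInfo, static_cast<unsigned int>(a.numel()), op);
// with `grid` and `block` obtained from getApplyGrid/getApplyBlock.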
// Two-tensor analogue of ApplyOp1: each level computes one offset into `a`
// and one into `b`, and the base cases invoke the op on both elements.
template <typename Op,
          typename scalar1,
          typename scalar2,
          typename IndexType,
          int ADims,
          int BDims,
          int remaining_steps,
          typename... Offsets>
struct ApplyOp2 {
__device__ __forceinline__
static void apply(detail::TensorInfo<scalar1, IndexType> &a,
                  detail::TensorInfo<scalar2, IndexType> &b,
                  const Op &op, int64_t n, IndexType linearIndex,
                  Offsets... aOffsets, Offsets... bOffsets) {
  // Convert `linearIndex` into an offset of `a`
  const IndexType aOffset = static_cast<int64_t>(sizeof...(Offsets)) < n ?
    detail::IndexToOffset<scalar1, IndexType, ADims>::get(linearIndex, a) : 0;

  // Convert `linearIndex` into an offset of `b`
  const IndexType bOffset = static_cast<int64_t>(sizeof...(Offsets)) < n ?
    detail::IndexToOffset<scalar2, IndexType, BDims>::get(linearIndex, b) : 0;

  ApplyOp2<Op, scalar1, scalar2, IndexType, ADims, BDims, remaining_steps - 1, const IndexType, Offsets...>::apply(
    a, b, op, n, linearIndex + 1, aOffsets..., aOffset, bOffsets..., bOffset
  );
}
};

// Specialize the `step=1` case (i.e., `remaining_steps=0` and
// `len(Offsets)=1`). We don't need to pass in how many elements need to be
// processed in this case.
template <typename Op,
          typename scalar1,
          typename scalar2,
          typename IndexType,
          int ADims,
          int BDims,
          typename Offset>
struct ApplyOp2<Op, scalar1, scalar2, IndexType, ADims, BDims, 0, Offset> {
__device__ __forceinline__
static void apply(detail::TensorInfo<scalar1, IndexType> &a,
                  detail::TensorInfo<scalar2, IndexType> &b,
                  const Op &op, int /*n*/, IndexType /*linearIndex*/,
                  Offset aOffset, Offset bOffset) {
  op(a.data[aOffset], b.data[bOffset]);
}
};

template <typename Op,
          typename scalar1,
          typename scalar2,
          typename IndexType,
          int ADims,
          int BDims,
          typename... Offsets>
struct ApplyOp2<Op, scalar1, scalar2, IndexType, ADims, BDims, 0, Offsets...> {
__device__ __forceinline__
static void apply(detail::TensorInfo<scalar1, IndexType> &a,
                  detail::TensorInfo<scalar2, IndexType> &b,
                  const Op &op, int n, IndexType linearIndex,
                  Offsets... aOffsets, Offsets... bOffsets) {
  op(n, a.data[aOffsets]..., b.data[bOffsets]...);
}
};
template <typename Op,
          typename scalar1,
          typename scalar2,
          typename IndexType,
          int ADims, int BDims,
          int step,
          int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK,
          int min_blocks_per_sm=AT_APPLY_BLOCKS_PER_SM>
C10_LAUNCH_BOUNDS_2(max_threads_per_block, min_blocks_per_sm)
__global__ void
kernelPointwiseApply2(detail::TensorInfo<scalar1, IndexType> a,
                      detail::TensorInfo<scalar2, IndexType> b,
                      IndexType totalElements,
                      const Op op) {
  for (IndexType linearIndex = (blockIdx.x * blockDim.x + threadIdx.x) * step;
       linearIndex < totalElements;
       linearIndex += gridDim.x * blockDim.x * step) {
    ApplyOp2<Op, scalar1, scalar2, IndexType, ADims, BDims, step>::apply(
      a, b, op, ::min(step, static_cast<int>(totalElements - linearIndex)),
      linearIndex);
  }
}

} // anonymous namespace
template <typename scalar1, typename scalar2, int step, typename Op,
          int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK,
          int min_blocks_per_sm=AT_APPLY_BLOCKS_PER_SM>
inline bool CUDA_tensor_apply2(at::TensorBase a,
                               at::TensorBase b,
                               const Op op,
                               TensorArgType aType = TensorArgType::ReadWrite,
                               TensorArgType bType = TensorArgType::ReadOnly) {
  TORCH_CHECK(a.device().is_cuda() && b.device().is_cuda(),
              "CUDA_tensor_apply2: Expected tensors to have CUDA DeviceType, but got "
              "tensors with type ", a.device().type(), " and ", b.device().type());
  int64_t totalElements = a.numel();

  if (totalElements != b.numel()) {
    return false;
  }

  if (a.dim() > MAX_TENSORINFO_DIMS ||
      b.dim() > MAX_TENSORINFO_DIMS) {
    return false;
  }

  if (a.numel() == 0) {
    // Empty tensor; do nothing
    return true;
  }
  const dim3 block = getApplyBlock(max_threads_per_block);

  dim3 grid;
  auto curDevice = current_device();
  if (curDevice == -1) return false;
  if (!getApplyGrid<step>(totalElements, grid, curDevice, max_threads_per_block)) {
    return false;
  }
  /*
  Expands readable/writable tensors whose indices may be "overlapped."
  This ensures that each element of the tensor is operated on once and only
  once.
  */
  TensorBase oldA;
  TensorBase oldB;

  if (aType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(a)) {
    // Must perform in contiguous space
    oldA = std::exchange(a, a.contiguous());
  }
  if (bType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(b)) {
    // Must perform in contiguous space
    oldB = std::exchange(b, b.contiguous());
  }
  // It is possible that the tensor dimensions are able to be collapsed,
  // and thus we can reduce the actual code complexity of the copy by
  // exploiting this knowledge statically, since the div/mod is the most
  // expensive part of the operation, more so than memory accesses.
  // The switches below dispatch to specializations for 1 and 2 (collapsed)
  // dimensions, falling back to the fully general -1 case otherwise.

#define HANDLE_CASE(TYPE, A, B)                                        \
  kernelPointwiseApply2<Op,                                            \
                        scalar1,                                       \
                        scalar2,                                       \
                        TYPE, A, B, step,                              \
                        max_threads_per_block,                         \
                        min_blocks_per_sm>                             \
   <<<grid, block, 0, at::cuda::getCurrentCUDAStream(curDevice)>>>(    \
       aInfo, bInfo, static_cast<TYPE>(totalElements), op);            \
  C10_CUDA_KERNEL_LAUNCH_CHECK();

#define HANDLE_B_CASE(TYPE, A, B) {         \
  switch (B) {                              \
    case 1:                                 \
      HANDLE_CASE(TYPE, A, 1);              \
      break;                                \
    case 2:                                 \
      HANDLE_CASE(TYPE, A, 2);              \
      break;                                \
    default:                                \
      HANDLE_CASE(TYPE, A, -1);             \
      break;                                \
  }                                         \
}

#define HANDLE_A_CASE(TYPE, A, B) {         \
  switch (A) {                              \
    case 1:                                 \
      HANDLE_B_CASE(TYPE, 1, B);            \
      break;                                \
    case 2:                                 \
      HANDLE_B_CASE(TYPE, 2, B);            \
      break;                                \
    default:                                \
      HANDLE_B_CASE(TYPE, -1, B);           \
      break;                                \
  }                                         \
}
  // Use the faster 32-bit index math when both tensors are 32-bit indexable,
  // and specialize on the (collapsed) dimensionality of both operands.
  if (detail::canUse32BitIndexMath(a) &&
      detail::canUse32BitIndexMath(b)) {
    detail::TensorInfo<scalar1, unsigned int> aInfo =
      detail::getTensorInfo<scalar1, unsigned int>(a);

    detail::TensorInfo<scalar2, unsigned int> bInfo =
      detail::getTensorInfo<scalar2, unsigned int>(b);
    rearrangeDims(&aInfo, &bInfo);
    aInfo.collapseDims();
    bInfo.collapseDims();

    HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims);
  } else {
    detail::TensorInfo<scalar1, uint64_t> aInfo =
      detail::getTensorInfo<scalar1, uint64_t>(a);

    detail::TensorInfo<scalar2, uint64_t> bInfo =
      detail::getTensorInfo<scalar2, uint64_t>(b);
    rearrangeDims(&aInfo, &bInfo);
    aInfo.collapseDims();
    bInfo.collapseDims();

    /*
    Only instantiates the all 1D special case and the fallback all nD case for
    large (64-bit indexed) tensors to reduce compilation time.
    */
    if (aInfo.dims == 1 && bInfo.dims == 1) {
      HANDLE_CASE(uint64_t, 1, 1);
    } else {
      HANDLE_CASE(uint64_t, -1, -1);
    }
  }
#undef HANDLE_CASE
#undef HANDLE_B_CASE
#undef HANDLE_A_CASE

  if (oldA.defined()) {
    // Ignore overlaps when copying back; if we used a plain copy instead, it
    // would recursively try to invoke ourselves to make oldA contiguous.
    at::native::copy_ignoring_overlaps(oldA, a);
  }

  if (oldB.defined()) {
    // Ignore overlaps when copying back; if we used a plain copy instead, it
    // would recursively try to invoke ourselves to make oldB contiguous.
    at::native::copy_ignoring_overlaps(oldB, b);
  }

  return true;
}
/* Provides default step = 1 to CUDA_tensor_apply2. */
template <typename scalar1, typename scalar2, typename Op,
          int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK,
          int min_blocks_per_sm=AT_APPLY_BLOCKS_PER_SM>
inline bool CUDA_tensor_apply2(const at::TensorBase &a,
                               const at::TensorBase &b,
                               const Op op,
                               TensorArgType aType = TensorArgType::ReadWrite,
                               TensorArgType bType = TensorArgType::ReadOnly) {
  return CUDA_tensor_apply2<scalar1, scalar2, 1, Op,
                            max_threads_per_block, min_blocks_per_sm>(a, b, op, aType, bType);
}
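
// Usage sketch (illustrative): accumulate `src` into `dst` elementwise, where
// `dst` and `src` are hypothetical same-shaped float CUDA tensors supplied by
// the caller. The functor receives mutable references to one element of each
// tensor, so it may update `dst` in place.
//
//   struct AddOp {
//     __device__ __forceinline__ void operator()(float& x, float& y) const {
//       x += y;
//     }
//   };
//   at::cuda::CUDA_tensor_apply2<float, float>(dst, src, AddOp{});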

} // namespace at::cuda