#ifndef GPU_BROADCAST
#define GPU_BROADCAST

#include "allocators.cuh"
#include "broadcast_kernel.cuh"
#include "math_functions.cuh"

namespace minkowski {

namespace detail {

template <class T> struct IsIntType { static const bool value = false; };

template <> struct IsIntType<int> { static const bool value = true; };
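
// Accumulate num_elements values from src into dst with atomicAdd so that
// threads writing to overlapping destination rows do not race.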
template <typename Dtype>
__device__ void atomic_addition_n(Dtype *__restrict__ dst,
                                  const Dtype *__restrict__ src,
                                  const int num_elements) {
  for (int i = 0; i < num_elements; ++i)
    atomicAdd(dst + i, src[i]);
}
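
// In-place element-wise product of num_elements values. Unlike
// atomic_addition_n this is not atomic; callers must ensure destination rows
// are not shared across threads.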
template <typename Dtype>
__device__ void multiplication_n(Dtype *__restrict__ dst,
                                 const Dtype *__restrict__ src,
                                 const int num_elements) {
  for (int i = 0; i < num_elements; ++i)
    dst[i] *= src[i];
}
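
// One thread per kernel-map entry: adds the nchannel-wide global-feature row
// d_out_map[index] into the output row d_in_map[index].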
template <typename Dtype, typename Itype>
__global__ void channelwise_addition(const int n, const int nchannel,
                                     const Dtype *__restrict__ d_glob_feat,
                                     const Itype *__restrict__ d_in_map,
                                     const Itype *__restrict__ d_out_map,
                                     Dtype *__restrict__ d_out_feat) {
  CUDA_KERNEL_LOOP(index, n) {
    atomic_addition_n(&d_out_feat[d_in_map[index] * nchannel],
                      &d_glob_feat[d_out_map[index] * nchannel], nchannel);
  }
}
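
// One thread per kernel-map entry: multiplies the output row d_in_map[index]
// element-wise by the global-feature row d_out_map[index].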
template <typename Dtype, typename Itype>
__global__ void channelwise_multiplication(
    const int n, const int nchannel, const Dtype *__restrict__ d_glob_feat,
    const Itype *__restrict__ d_in_map, const Itype *__restrict__ d_out_map,
    Dtype *__restrict__ d_out_feat) {
  CUDA_KERNEL_LOOP(index, n) {
    multiplication_n(&d_out_feat[d_in_map[index] * nchannel],
                     &d_glob_feat[d_out_map[index] * nchannel], nchannel);
  }
}
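
// Set the first n entries of in_feat to val.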
template <typename Dtype>
__global__ void fill(const int n, Dtype *__restrict__ in_feat,
                     const Dtype val) {
  CUDA_KERNEL_LOOP(index, n) { in_feat[index] = val; }
}

} // namespace detail
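
// Forward broadcast: copy d_in_feat into d_out_feat, then fold the global
// feature row paired with each input row by the kernel map into that row,
// using addition or multiplication depending on `op`.
//
// A minimal call sketch (hypothetical buffer names; the kernel map is built
// elsewhere in the library):
//
//   BroadcastForwardKernelGPU<float, uint32_t,
//                             detail::default_allocator<char>>(
//       d_feat, num_rows, d_glob_feat, num_global_rows, d_out, num_channels,
//       BroadcastMode::ELEMENTWISE_ADDITON, kernel_map, cusparse_handle,
//       stream);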
template <typename Dtype, typename Itype, typename ByteAllocator>
void BroadcastForwardKernelGPU(
    const Dtype *d_in_feat, const int in_nrows, const Dtype *d_in_feat_global,
    const int in_nrows_global, Dtype *d_out_feat, const int nchannel,
    BroadcastMode::Type const op,
    gpu_kernel_map<Itype, ByteAllocator> const &kernel_map,
    cusparseHandle_t cushandle, cudaStream_t stream) {
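  // The kernel map must pair every input row with a global-feature row.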
  size_t const num_map = kernel_map.in_maps.end() - kernel_map.in_maps.begin();
  if (num_map != static_cast<size_t>(in_nrows))
    throw std::invalid_argument(
        "BroadcastForwardKernelGPU: kernel_map size != in_nrows");
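
  // Pass-through copy: start from out = in, then apply the op in place.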
  CUDA_CHECK(cudaMemcpy(d_out_feat, d_in_feat,
                        sizeof(Dtype) * nchannel * in_nrows,
                        cudaMemcpyDeviceToDevice));
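
  // Combine the matching global-feature row into each output row.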
  switch (op) {
  case BroadcastMode::ELEMENTWISE_ADDITON:
    detail::channelwise_addition<Dtype, Itype>
        <<<GET_BLOCKS(in_nrows, CUDA_NUM_THREADS), CUDA_NUM_THREADS, 0,
           stream>>>(in_nrows, nchannel, d_in_feat_global,
                     kernel_map.in_maps.begin(), kernel_map.out_maps.begin(),
                     d_out_feat);
    break;
  case BroadcastMode::ELEMENTWISE_MULTIPLICATION:
    detail::channelwise_multiplication<Dtype, Itype>
        <<<GET_BLOCKS(in_nrows, CUDA_NUM_THREADS), CUDA_NUM_THREADS, 0,
           stream>>>(in_nrows, nchannel, d_in_feat_global,
                     kernel_map.in_maps.begin(), kernel_map.out_maps.begin(),
                     d_out_feat);
    break;
  default:
    throw std::invalid_argument(Formatter() << "Operation not supported: "
                                            << std::to_string(op));
  }

  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaDeviceSynchronize());
}
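
// Explicit instantiations of the forward kernel for the supported
// Dtype / allocator combinations.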
template void
BroadcastForwardKernelGPU<float, uint32_t, detail::default_allocator<char>>(
    const float *d_in_feat, int in_nrows, const float *d_in_feat_global,
    int in_nrows_global, float *d_out_feat, int nchannel,
    BroadcastMode::Type const op,
    gpu_kernel_map<uint32_t, detail::default_allocator<char>> const &kernel_map,
    cusparseHandle_t cuhandle, cudaStream_t stream);

template void
BroadcastForwardKernelGPU<double, uint32_t, detail::default_allocator<char>>(
    const double *d_in_feat, int in_nrows, const double *d_in_feat_global,
    int in_nrows_global, double *d_out_feat, int nchannel,
    BroadcastMode::Type const op,
    gpu_kernel_map<uint32_t, detail::default_allocator<char>> const &kernel_map,
    cusparseHandle_t cuhandle, cudaStream_t stream);

template void
BroadcastForwardKernelGPU<float, uint32_t, detail::c10_allocator<char>>(
    const float *d_in_feat, int in_nrows, const float *d_in_feat_global,
    int in_nrows_global, float *d_out_feat, int nchannel,
    BroadcastMode::Type const op,
    gpu_kernel_map<uint32_t, detail::c10_allocator<char>> const &kernel_map,
    cusparseHandle_t cuhandle, cudaStream_t stream);

template void
BroadcastForwardKernelGPU<double, uint32_t, detail::c10_allocator<char>>(
    const double *d_in_feat, int in_nrows, const double *d_in_feat_global,
    int in_nrows_global, double *d_out_feat, int nchannel,
    BroadcastMode::Type const op,
    gpu_kernel_map<uint32_t, detail::c10_allocator<char>> const &kernel_map,
    cusparseHandle_t cuhandle, cudaStream_t stream);
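
// Backward broadcast. Let S be the (in_nrows_global x in_nrows) binary COO
// matrix built from the kernel map (row = global row, column = input row).
// Then
//
//   addition:        d_grad_in_feat        = d_grad_out_feat
//                    d_grad_in_feat_global = S * d_grad_out_feat
//   multiplication:  d_grad_in_feat        = d_grad_out_feat . broadcast(glob)
//                    d_grad_in_feat_global = S * (d_grad_out_feat . d_in_feat)
//
// where "." is the element-wise product. The scatter-sums are computed with
// cusparseSpMM.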
template <typename Dtype, typename Itype, typename ByteAllocator>
void BroadcastBackwardKernelGPU(
    const Dtype *d_in_feat, Dtype *d_grad_in_feat, int in_nrows,
    const Dtype *d_in_feat_global, Dtype *d_grad_in_feat_global,
    int in_nrows_global, const Dtype *d_grad_out_feat, int nchannel,
    BroadcastMode::Type const op,
    gpu_kernel_map<Itype, ByteAllocator> const &kernel_map,
    cusparseHandle_t cushandle, cudaStream_t stream) {
  Itype *d_scr, *d_in_map, *d_out_map;
  Dtype *d_dtype, *d_coo_val, *d_tmp_grad_in_feat_global, *d_tmp_grad_in_feat;

  const Dtype alpha = 1;
  const Dtype beta = 0;
  int nnz = in_nrows;

  size_t const num_map = kernel_map.in_maps.end() - kernel_map.in_maps.begin();
  if (num_map != static_cast<size_t>(in_nrows))
    throw std::invalid_argument(
        "BroadcastBackwardKernelGPU: kernel_map size != in_nrows");
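
  // Scratch indices: 2 * nnz entries for mutable copies of the in/out maps
  // (the sort below reorders them in place), plus (in_nrows_global + 1)
  // extra entries.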
  CUDA_CHECK(cudaMalloc((void **)&d_scr,
                        2 * nnz * sizeof(Itype) +
                            (in_nrows_global + 1) * sizeof(Itype)));
  d_in_map = d_scr;
  d_out_map = d_scr + nnz;
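
  // Work on copies so the caller's kernel map is left untouched.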
  CUDA_CHECK(cudaMemcpy(d_in_map, (int *)kernel_map.in_maps.begin(),
                        nnz * sizeof(int), cudaMemcpyDeviceToDevice));
  CUDA_CHECK(cudaMemcpy(d_out_map, (int *)kernel_map.out_maps.begin(),
                        nnz * sizeof(int), cudaMemcpyDeviceToDevice));
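
  // One allocation for all Dtype temporaries, laid out as
  // [grad wrt global | grad wrt input | COO values].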
  CUDA_CHECK(cudaMalloc((void **)&d_dtype,
                        nnz * sizeof(Dtype) +
                            in_nrows * nchannel * sizeof(Dtype) +
                            in_nrows_global * nchannel * sizeof(Dtype)));
  d_tmp_grad_in_feat_global = d_dtype;
  d_tmp_grad_in_feat = d_tmp_grad_in_feat_global + in_nrows_global * nchannel;
  d_coo_val = d_tmp_grad_in_feat + in_nrows * nchannel;
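
  // S is a binary selection matrix, so every stored value is 1.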
  detail::fill<Dtype>
      <<<GET_BLOCKS(nnz, CUDA_NUM_THREADS), CUDA_NUM_THREADS, 0, stream>>>(
          nnz, d_coo_val, (Dtype)1.);
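
  // Sort the COO entries by row index (out map) and permute the column
  // indices (in map) with them; several cuSPARSE COO algorithms require
  // row-sorted input.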
  THRUST_CHECK(thrust::sort_by_key(thrust::device, d_out_map, d_out_map + nnz,
                                   d_in_map));
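
  // The default SpMM algorithm enum was renamed in CUDA 11.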
  cusparseSpMMAlg_t mm_alg;
#if defined(CUDART_VERSION) && (CUDART_VERSION < 10010)
  TORCH_CHECK(false, "spmm sparse-dense requires CUDA 10.1 or greater");
#elif defined(CUDART_VERSION) && (CUDART_VERSION >= 10010) && \
    (CUDART_VERSION < 11000)
  mm_alg = CUSPARSE_MM_ALG_DEFAULT;
#elif defined(CUDART_VERSION) && (CUDART_VERSION >= 11000)
  mm_alg = CUSPARSE_SPMM_ALG_DEFAULT;
#endif
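
  // The feature matrices are row-major (rows x nchannel). A row-major
  // (j x k) buffer is the same memory as a column-major (k x j) matrix, so
  // the gradient is described that way and cusparseSpMM is asked for
  // op(B) = B^T, giving C = S * B with C column-major; col2row_major
  // converts C back afterwards.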
  size_t dim_i = in_nrows_global, dim_j = in_nrows, dim_k = nchannel;
  constexpr bool is_float32 = std::is_same<Dtype, float>::value;
  cudaDataType cuda_data_type = is_float32 ? CUDA_R_32F : CUDA_R_64F;
  cusparseSpMatDescr_t sparse_descr;
  cusparseDnMatDescr_t dense_descr;
  cusparseDnMatDescr_t result_descr;
  CUSPARSE_CHECK(cusparseCreateCoo(
      &sparse_descr, dim_i, dim_j, nnz,
      d_out_map, // row indices: global-feature rows
      d_in_map,  // column indices: input rows
      d_coo_val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
      cuda_data_type));

  switch (op) {
  case BroadcastMode::ELEMENTWISE_ADDITON:
    // d(out)/d(in) is the identity, so the input gradient is a straight copy.
    CUDA_CHECK(cudaMemcpy(d_grad_in_feat, d_grad_out_feat,
                          sizeof(Dtype) * nchannel * in_nrows,
                          cudaMemcpyDeviceToDevice));

    CUSPARSE_CHECK(cusparseCreateDnMat(&dense_descr, dim_k, dim_j, dim_k,
                                       (void *)d_grad_out_feat, cuda_data_type,
                                       CUSPARSE_ORDER_COL));
    CUSPARSE_CHECK(cusparseCreateDnMat(&result_descr, dim_i, dim_k, dim_i,
                                       (void *)d_tmp_grad_in_feat_global,
                                       cuda_data_type, CUSPARSE_ORDER_COL));

    // d_tmp_grad_in_feat_global = S * d_grad_out_feat
    CUSPARSE_CHECK(cusparseSpMM(cushandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                CUSPARSE_OPERATION_TRANSPOSE, (void *)&alpha,
                                sparse_descr, dense_descr, (void *)&beta,
                                result_descr, cuda_data_type, mm_alg, 0));

    // The SpMM result is column-major; convert back to row-major.
    col2row_major<Dtype>(in_nrows_global, nchannel, d_tmp_grad_in_feat_global,
                         d_grad_in_feat_global, stream);
    break;

  case BroadcastMode::ELEMENTWISE_MULTIPLICATION:
    // d(out)/d(glob) = in_feat: scale the gradient by the input features
    // before the scatter-sum.
    CUDA_CHECK(cudaMemcpy(d_tmp_grad_in_feat, d_grad_out_feat,
                          sizeof(Dtype) * nchannel * in_nrows,
                          cudaMemcpyDeviceToDevice));
    gpu_multiplication<Dtype>(nchannel * in_nrows, d_in_feat,
                              d_tmp_grad_in_feat, d_tmp_grad_in_feat, stream);

    CUSPARSE_CHECK(cusparseCreateDnMat(&dense_descr, dim_k, dim_j, dim_k,
                                       (void *)d_tmp_grad_in_feat,
                                       cuda_data_type, CUSPARSE_ORDER_COL));
    CUSPARSE_CHECK(cusparseCreateDnMat(&result_descr, dim_i, dim_k, dim_i,
                                       (void *)d_tmp_grad_in_feat_global,
                                       cuda_data_type, CUSPARSE_ORDER_COL));

    // d_tmp_grad_in_feat_global = S * (d_grad_out_feat . d_in_feat)
    CUSPARSE_CHECK(cusparseSpMM(cushandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                CUSPARSE_OPERATION_TRANSPOSE, (void *)&alpha,
                                sparse_descr, dense_descr, (void *)&beta,
                                result_descr, cuda_data_type, mm_alg, 0));

    col2row_major<Dtype>(in_nrows_global, nchannel, d_tmp_grad_in_feat_global,
                         d_grad_in_feat_global, stream);
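
    // d(out)/d(in) = broadcast(glob): copy the gradient and multiply each
    // row by its paired global-feature row.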
    CUDA_CHECK(cudaMemcpy(d_grad_in_feat, d_grad_out_feat,
                          sizeof(Dtype) * nchannel * in_nrows,
                          cudaMemcpyDeviceToDevice));
    detail::channelwise_multiplication<Dtype, Itype>
        <<<GET_BLOCKS(in_nrows, CUDA_NUM_THREADS), CUDA_NUM_THREADS, 0,
           stream>>>(in_nrows, nchannel, d_in_feat_global,
                     kernel_map.in_maps.begin(), kernel_map.out_maps.begin(),
                     d_grad_in_feat);
    break;

  default:
    throw std::invalid_argument(Formatter() << "Operation not supported: "
                                            << std::to_string(op));
  }

  cudaFree(d_scr);
  cudaFree(d_dtype);

  CUSPARSE_CHECK(cusparseDestroySpMat(sparse_descr));
  CUSPARSE_CHECK(cusparseDestroyDnMat(dense_descr));
  CUSPARSE_CHECK(cusparseDestroyDnMat(result_descr));

  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaDeviceSynchronize());
}
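
// Explicit instantiations of the backward kernel.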
template void
BroadcastBackwardKernelGPU<float, uint32_t, detail::default_allocator<char>>(
    const float *d_in_feat, float *d_grad_in_feat, int in_nrows,
    const float *d_in_feat_global, float *d_grad_in_feat_global,
    int in_nrows_global, const float *d_grad_out_feat, int nchannel,
    BroadcastMode::Type const op,
    gpu_kernel_map<uint32_t, detail::default_allocator<char>> const &kernel_map,
    cusparseHandle_t cushandle, cudaStream_t stream);

template void
BroadcastBackwardKernelGPU<double, uint32_t, detail::default_allocator<char>>(
    const double *d_in_feat, double *d_grad_in_feat, int in_nrows,
    const double *d_in_feat_global, double *d_grad_in_feat_global,
    int in_nrows_global, const double *d_grad_out_feat, int nchannel,
    BroadcastMode::Type const op,
    gpu_kernel_map<uint32_t, detail::default_allocator<char>> const &kernel_map,
    cusparseHandle_t cushandle, cudaStream_t stream);

template void
BroadcastBackwardKernelGPU<float, uint32_t, detail::c10_allocator<char>>(
    const float *d_in_feat, float *d_grad_in_feat, int in_nrows,
    const float *d_in_feat_global, float *d_grad_in_feat_global,
    int in_nrows_global, const float *d_grad_out_feat, int nchannel,
    BroadcastMode::Type const op,
    gpu_kernel_map<uint32_t, detail::c10_allocator<char>> const &kernel_map,
    cusparseHandle_t cushandle, cudaStream_t stream);

template void
BroadcastBackwardKernelGPU<double, uint32_t, detail::c10_allocator<char>>(
    const double *d_in_feat, double *d_grad_in_feat, int in_nrows,
    const double *d_in_feat_global, double *d_grad_in_feat_global,
    int in_nrows_global, const double *d_grad_out_feat, int nchannel,
    BroadcastMode::Type const op,
    gpu_kernel_map<uint32_t, detail::c10_allocator<char>> const &kernel_map,
    cusparseHandle_t cushandle, cudaStream_t stream);

} // namespace minkowski

#endif // GPU_BROADCAST