/* Copyright (c) 2020 NVIDIA Corporation.
 * Copyright (c) 2018-2020 Chris Choy (chrischoy@ai.stanford.edu).
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Please cite "4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural
 * Networks", CVPR'19 (https://arxiv.org/abs/1904.08755) if you use any part
 * of the code.
 */
#ifndef GPU_BROADCAST
#define GPU_BROADCAST

#include "allocators.cuh"
#include "broadcast_kernel.cuh"
#include "math_functions.cuh"

namespace minkowski {

namespace detail {

template <typename Dtype> struct IsIntType {
  static const bool value = false;
};

template <> struct IsIntType<int> {
  static const bool value = true;
};

template <typename Dtype>
__device__ void atomic_addition_n(Dtype *__restrict__ dst,
                                  const Dtype *__restrict__ src,
                                  const int num_elements) {
  for (int i = 0; i < num_elements; ++i)
    atomicAdd(dst + i, src[i]);
}

/* Must be applied to collision-free destinations */
template <typename Dtype>
__device__ void multiplication_n(Dtype *__restrict__ dst,
                                 const Dtype *__restrict__ src,
                                 const int num_elements) {
  for (int i = 0; i < num_elements; ++i)
    dst[i] *= src[i];
}

template <typename Dtype, typename Itype>
__global__ void channelwise_addition(const int n, const int nchannel,
                                     const Dtype *__restrict__ d_glob_feat,
                                     const Itype *__restrict__ d_in_map,
                                     const Itype *__restrict__ d_out_map,
                                     Dtype *__restrict__ d_out_feat) {
  CUDA_KERNEL_LOOP(index, n) {
    atomic_addition_n(&d_out_feat[d_in_map[index] * nchannel],
                      &d_glob_feat[d_out_map[index] * nchannel], nchannel);
  }
}

template <typename Dtype, typename Itype>
__global__ void channelwise_multiplication(
    const int n, const int nchannel, const Dtype *__restrict__ d_glob_feat,
    const Itype *__restrict__ d_in_map, const Itype *__restrict__ d_out_map,
    Dtype *__restrict__ d_out_feat) {
  CUDA_KERNEL_LOOP(index, n) {
    multiplication_n(&d_out_feat[d_in_map[index] * nchannel],
                     &d_glob_feat[d_out_map[index] * nchannel], nchannel);
  }
}

template <typename Dtype>
__global__ void fill(const int n, Dtype *__restrict__ in_feat,
                     const Dtype val) {
  CUDA_KERNEL_LOOP(index, n) { in_feat[index] = val; }
}

} // namespace detail

template <typename Dtype, typename Itype, typename ByteAllocator>
void BroadcastForwardKernelGPU(
    const Dtype *d_in_feat, const int in_nrows, const Dtype *d_in_feat_global,
    const int in_nrows_global, Dtype *d_out_feat, const int nchannel,
    BroadcastMode::Type const op,
    gpu_kernel_map<Itype, ByteAllocator> const &kernel_map,
    cusparseHandle_t cushandle, cudaStream_t stream) {

  // Sum all sizes
  size_t const num_map = kernel_map.in_maps.end() - kernel_map.in_maps.begin();
  if (num_map != static_cast<size_t>(in_nrows))
    throw std::invalid_argument(
        "BroadcastForwardKernelGPU: kernel_map size != in_nrows");
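  // Illustrative worked example (not in the original source; follows from the
  // kernel semantics above): for every pair (in_maps[k], out_maps[k]), the
  // forward pass combines global feature row out_maps[k] into dense feature
  // row in_maps[k]. E.g., with nchannel = 2, in_maps = {0, 1} and
  // out_maps = {0, 0}, ELEMENTWISE_ADDITON computes
  //   d_out_feat[0..1] = d_in_feat[0..1] + d_in_feat_global[0..1]
  //   d_out_feat[2..3] = d_in_feat[2..3] + d_in_feat_global[0..1]
  // i.e., the single global row of a batch is broadcast to all of its rows.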
  // Copy all in_feat to out_feat
  CUDA_CHECK(cudaMemcpy(d_out_feat, d_in_feat,
                        sizeof(Dtype) * nchannel * in_nrows,
                        cudaMemcpyDeviceToDevice));

  // To speed up, keep the switch outside the for loops
  switch (op) {
  case BroadcastMode::ELEMENTWISE_ADDITON: // +
    detail::channelwise_addition<Dtype, Itype>
        <<<GET_BLOCKS(in_nrows, CUDA_NUM_THREADS), CUDA_NUM_THREADS, 0,
           stream>>>(in_nrows, nchannel, d_in_feat_global,
                     kernel_map.in_maps.begin(), kernel_map.out_maps.begin(),
                     d_out_feat);
    break;
  case BroadcastMode::ELEMENTWISE_MULTIPLICATION: // *
    detail::channelwise_multiplication<Dtype, Itype>
        <<<GET_BLOCKS(in_nrows, CUDA_NUM_THREADS), CUDA_NUM_THREADS, 0,
           stream>>>(in_nrows, nchannel, d_in_feat_global,
                     kernel_map.in_maps.begin(), kernel_map.out_maps.begin(),
                     d_out_feat);
    break;
  default:
    throw std::invalid_argument(Formatter() << "Operation not supported: "
                                            << std::to_string(op));
  }

  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaDeviceSynchronize());
}

// Explicit template instantiations
template void
BroadcastForwardKernelGPU<float, uint32_t, detail::default_allocator<char>>(
    const float *d_in_feat, int in_nrows, const float *d_in_feat_global,
    int in_nrows_global, float *d_out_feat, int nchannel,
    BroadcastMode::Type const op,
    gpu_kernel_map<uint32_t, detail::default_allocator<char>> const
        &kernel_map,
    cusparseHandle_t cuhandle, cudaStream_t stream);

template void
BroadcastForwardKernelGPU<double, uint32_t, detail::default_allocator<char>>(
    const double *d_in_feat, int in_nrows, const double *d_in_feat_global,
    int in_nrows_global, double *d_out_feat, int nchannel,
    BroadcastMode::Type const op,
    gpu_kernel_map<uint32_t, detail::default_allocator<char>> const
        &kernel_map,
    cusparseHandle_t cuhandle, cudaStream_t stream);

template void
BroadcastForwardKernelGPU<float, uint32_t, detail::c10_allocator<char>>(
    const float *d_in_feat, int in_nrows, const float *d_in_feat_global,
    int in_nrows_global, float *d_out_feat, int nchannel,
    BroadcastMode::Type const op,
    gpu_kernel_map<uint32_t, detail::c10_allocator<char>> const &kernel_map,
    cusparseHandle_t cuhandle, cudaStream_t stream);

template void
BroadcastForwardKernelGPU<double, uint32_t, detail::c10_allocator<char>>(
    const double *d_in_feat, int in_nrows, const double *d_in_feat_global,
    int in_nrows_global, double *d_out_feat, int nchannel,
    BroadcastMode::Type const op,
    gpu_kernel_map<uint32_t, detail::c10_allocator<char>> const &kernel_map,
    cusparseHandle_t cuhandle, cudaStream_t stream);

template <typename Dtype, typename Itype, typename ByteAllocator>
void BroadcastBackwardKernelGPU(
    const Dtype *d_in_feat, Dtype *d_grad_in_feat, int in_nrows,
    const Dtype *d_in_feat_global, Dtype *d_grad_in_feat_global,
    int in_nrows_global, const Dtype *d_grad_out_feat, int nchannel,
    BroadcastMode::Type const op,
    gpu_kernel_map<Itype, ByteAllocator> const &kernel_map,
    cusparseHandle_t cushandle, cudaStream_t stream) {
  Itype *d_scr, *d_in_map, *d_out_map; //, *d_csr_row;
  Dtype *d_dtype, *d_coo_val, *d_tmp_grad_in_feat_global, *d_tmp_grad_in_feat;
  // cusparseMatDescr_t descr = 0;
  const Dtype alpha = 1;
  const Dtype beta = 0;
  int nnz = in_nrows;

  // if (in_maps.size() != 1) {
  //   // All in_maps[k] are contiguous. TODO: assert contiguous.
  // }
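  // Illustrative note (not in the original source; inferred from the cuSPARSE
  // calls below): the backward pass phrases the per-batch reduction as a
  // sparse-dense matmul. Let A be the (in_nrows_global x in_nrows) COO matrix
  // with a 1 at (out_maps[k], in_maps[k]) for every map pair. Then, for
  // ELEMENTWISE_ADDITON,
  //   grad_in_feat        = grad_out_feat       (identity path)
  //   grad_in_feat_global = A * grad_out_feat   (sum over each batch's rows)
  // so each global row accumulates the gradients of all rows mapped to it.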
  // Sum all sizes
  size_t const num_map = kernel_map.in_maps.end() - kernel_map.in_maps.begin();
  if (num_map != static_cast<size_t>(in_nrows))
    throw std::invalid_argument(
        "BroadcastBackwardKernelGPU: kernel_map size != in_nrows");

  /* In Out Map prep */
  // Malloc d_in_map, d_out_map, d_csr_row
  // CSR returns n_row + 1
  CUDA_CHECK(cudaMalloc((void **)&d_scr,
                        2 * nnz * sizeof(Itype) +                 // in out maps
                            (in_nrows_global + 1) * sizeof(Itype) // d_csr_row
                        ));

  // COO cols
  d_in_map = d_scr; // nnz
  // COO rows
  d_out_map = d_scr + nnz; // nnz
  // CSR row indices
  // d_csr_row = d_scr + 2 * nnz; // in_nrows_global + 1

  CUDA_CHECK(cudaMemcpy(
      d_in_map,
      (int *)kernel_map.in_maps.begin(), // in_maps are contiguous of size nnz
      nnz * sizeof(int), cudaMemcpyDeviceToDevice));

  CUDA_CHECK(cudaMemcpy(
      d_out_map,
      (int *)kernel_map.out_maps.begin(), // out_maps are contiguous of size
                                          // nnz
      nnz * sizeof(int), cudaMemcpyDeviceToDevice));

  /* tmp in out feat */
  // sparse gemm output
  CUDA_CHECK(cudaMalloc(
      (void **)&d_dtype,
      nnz * sizeof(Dtype) +                          // d_coo_val
          in_nrows * nchannel * sizeof(Dtype) +      // tmp_grad_in_feat
          in_nrows_global * nchannel * sizeof(Dtype) // tmp_grad_in_feat_global
      ));

  // Divide the memory space into multiple chunks
  d_tmp_grad_in_feat_global = d_dtype; // in_nrows_global * nchannel
  d_tmp_grad_in_feat = d_tmp_grad_in_feat_global +
                       in_nrows_global * nchannel; // in_nrows * nchannel
  d_coo_val = d_tmp_grad_in_feat + in_nrows * nchannel; // nnz

  // thrust::fill(d_csr_val.begin(), d_csr_val.end(), 1);
  detail::fill<Dtype>
      <<<GET_BLOCKS(nnz, CUDA_NUM_THREADS), CUDA_NUM_THREADS, 0, stream>>>(
          nnz, d_coo_val, (Dtype)1.);

  // CUSPARSE_CHECK(cusparseCreateMatDescr(&descr));
  // cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
  // cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

  // Sort COO first
  THRUST_CHECK(thrust::sort_by_key(thrust::device,  //
                                   d_out_map,       // key begin
                                   d_out_map + nnz, // key end
                                   d_in_map         // value begin
                                   ));

  cusparseSpMMAlg_t mm_alg;
#if defined(CUDART_VERSION) && (CUDART_VERSION < 10010)
  TORCH_CHECK(false, "spmm sparse-dense requires CUDA 10.1 or greater");
#elif defined(CUDART_VERSION) && (CUDART_VERSION >= 10010) &&                  \
    (CUDART_VERSION < 11000)
  mm_alg = CUSPARSE_MM_ALG_DEFAULT;
#elif defined(CUDART_VERSION) && (CUDART_VERSION >= 11000)
  mm_alg = CUSPARSE_SPMM_ALG_DEFAULT;
#endif

  //  +---------+  +---+
  //  |   spm   |  | i |
  //  +---------+  | n |
  //   in_nrows    |   |
  //               | F |
  //               |   |
  //               +---+
  //              nchannel
  size_t dim_i = in_nrows_global, dim_j = in_nrows, dim_k = nchannel;
  constexpr bool is_float32 = std::is_same<Dtype, float>::value;
  cudaDataType cuda_data_type = is_float32 ? CUDA_R_32F : CUDA_R_64F;
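  // Illustrative note (not in the original source; inferred from the
  // descriptor arguments below): the SpMM computes C = A * op(B) where
  //   A : (dim_i x dim_j) = (in_nrows_global x in_nrows), COO, nnz ones;
  //   B : the dense gradient buffer, row-major (dim_j x dim_k), described to
  //       cuSPARSE as a column-major (dim_k x dim_j) matrix and then
  //       transposed, which reinterprets the same memory without a copy;
  //   C : (dim_i x dim_k), column-major, converted back to row-major with
  //       col2row_major after the multiply.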
  cusparseSpMatDescr_t sparse_descr;
  cusparseDnMatDescr_t dense_descr;
  cusparseDnMatDescr_t result_descr;
  CUSPARSE_CHECK(cusparseCreateCoo(&sparse_descr,     //
                                   dim_i, dim_j, nnz, //
                                   d_out_map,         // rows
                                   d_in_map,          // cols
                                   d_coo_val,         // coo vals
                                   CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
                                   cuda_data_type));

  // buffer size 0 for CUSPARSE_SPMM_COO_ALG1, CUSPARSE_SPMM_COO_ALG3,
  // CUSPARSE_SPMM_COO_ALG4, and CUSPARSE_SPMM_CSR_ALG1

  // To speed up, keep the switch outside the for loops
  switch (op) {
  case BroadcastMode::ELEMENTWISE_ADDITON: // +
    // For grad_in_feat, copy all grad_out_feat to grad_in_feat
    CUDA_CHECK(cudaMemcpy(d_grad_in_feat, d_grad_out_feat,
                          sizeof(Dtype) * nchannel * in_nrows,
                          cudaMemcpyDeviceToDevice));
    CUSPARSE_CHECK(cusparseCreateDnMat(&dense_descr,            //
                                       dim_k, dim_j, dim_k,     //
                                       (void *)d_grad_out_feat, //
                                       cuda_data_type, CUSPARSE_ORDER_COL));
    CUSPARSE_CHECK(cusparseCreateDnMat(&result_descr,                     //
                                       dim_i, dim_k, dim_i,               //
                                       (void *)d_tmp_grad_in_feat_global, //
                                       cuda_data_type, CUSPARSE_ORDER_COL));
    // Transpose the output
    // WARNING: coo sorting must have been handled in the kernel map
    // decomposition.
    CUSPARSE_CHECK(cusparseSpMM(cushandle,                        //
                                CUSPARSE_OPERATION_NON_TRANSPOSE, //
                                CUSPARSE_OPERATION_TRANSPOSE,     //
                                (void *)&alpha,                   //
                                sparse_descr, dense_descr,        //
                                (void *)&beta, result_descr,      //
                                cuda_data_type, mm_alg, 0));
    // For grad_in_feat_glob, add all grad_out_feat
    /*
    CUSPARSE_CHECK(
        cusparse_csrmm(cushandle,
                       CUSPARSE_OPERATION_NON_TRANSPOSE, // op(A)
                       CUSPARSE_OPERATION_TRANSPOSE,     // op(B)
                       in_nrows_global,                  // M
                       nchannel,                         // N
                       in_nrows,                         // K
                       nnz, &alpha, descr,
                       d_csr_val,                 // val
                       d_csr_row,                 // row
                       d_in_map,                  // col
                       d_grad_out_feat,           // B
                       nchannel,                  // ldb
                       &beta,
                       d_tmp_grad_in_feat_global, // C
                       in_nrows_global            // ldc
                       ));
    */
    col2row_major(in_nrows_global, nchannel, d_tmp_grad_in_feat_global,
                  d_grad_in_feat_global, stream);
    break;
  case BroadcastMode::ELEMENTWISE_MULTIPLICATION: // *
    // Second, for grad_in_feat_global, copy in_feat to tmp,
    // Forward     : (A^T(sparse) x global(dense)) (*) B(feat) = C(result)
    // grad global : A(sparse) x (grad C (*) B)
    CUDA_CHECK(cudaMemcpy(d_tmp_grad_in_feat, d_grad_out_feat,
                          sizeof(Dtype) * nchannel * in_nrows,
                          cudaMemcpyDeviceToDevice));
    gpu_multiplication<Dtype>(nchannel * in_nrows, d_in_feat,
                              d_tmp_grad_in_feat, d_tmp_grad_in_feat, stream);
    CUSPARSE_CHECK(cusparseCreateDnMat(&dense_descr,               //
                                       dim_k, dim_j, dim_k,        //
                                       (void *)d_tmp_grad_in_feat, //
                                       cuda_data_type, CUSPARSE_ORDER_COL));
    CUSPARSE_CHECK(cusparseCreateDnMat(&result_descr,                     //
                                       dim_i, dim_k, dim_i,               //
                                       (void *)d_tmp_grad_in_feat_global, //
                                       cuda_data_type, CUSPARSE_ORDER_COL));
    // Transpose the output
    CUSPARSE_CHECK(cusparseSpMM(cushandle,                        //
                                CUSPARSE_OPERATION_NON_TRANSPOSE, //
                                CUSPARSE_OPERATION_TRANSPOSE,     //
                                (void *)&alpha,                   //
                                sparse_descr, dense_descr,        //
                                (void *)&beta, result_descr,      //
                                cuda_data_type, mm_alg, 0));
    /*
    CUSPARSE_CHECK(
        cusparse_csrmm(cushandle,
                       CUSPARSE_OPERATION_NON_TRANSPOSE, // op(A)
                       CUSPARSE_OPERATION_TRANSPOSE,     // op(B)
                       in_nrows_global,                  // M
                       nchannel,                         // N
                       in_nrows,                         // K
                       nnz, &alpha, descr,
                       d_csr_val,                 // val
                       d_csr_row,                 // row
                       d_in_map,                  // col
                       d_tmp_grad_in_feat,        // B
                       nchannel,                  // ldb
                       &beta,
                       d_tmp_grad_in_feat_global, // C
                       in_nrows_global            // ldc
                       ));
    */
    col2row_major(in_nrows_global, nchannel, d_tmp_grad_in_feat_global,
                  d_grad_in_feat_global, stream);
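    // Illustrative note (not in the original source; follows by the product
    // rule from the forward definition out[i] = in[i] * glob[out_map[i]]):
    //   grad_in[i]   = grad_out[i] * glob[out_map[i]]
    //   grad_glob[o] = sum over { i : out_map[i] == o } of grad_out[i] * in[i]
    // The SpMM above realizes the second line; the copy plus
    // channelwise_multiplication below realize the first.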
    // First, for grad_in_feat
    // Copy in_feat_global to tmp, then multiply the tmp with grad_out_feat
    // Forward   : (A^T(sparse) x global(dense)) (*) B(feat) = C(result)
    // grad feat : A^T(sparse) x global(dense) (*) grad C

    // Sort COO first
    // sort_coo_gpu(cushandle, in_nrows_global, in_nrows, nnz, d_out_map,
    //              d_in_map);
    // cusparseSpMatDescr_t sparse_descr2;
    CUDA_CHECK(cudaMemcpy(d_grad_in_feat, d_grad_out_feat,
                          sizeof(Dtype) * nchannel * in_nrows,
                          cudaMemcpyDeviceToDevice));
    detail::channelwise_multiplication<Dtype, Itype>
        <<<GET_BLOCKS(in_nrows, CUDA_NUM_THREADS), CUDA_NUM_THREADS, 0,
           stream>>>(in_nrows, nchannel, d_in_feat_global,
                     kernel_map.in_maps.begin(), kernel_map.out_maps.begin(),
                     d_grad_in_feat);
    /*
    row2col_major(in_nrows_global, nchannel, d_in_feat_global,
                  d_tmp_grad_in_feat_global, stream);
    CUSPARSE_CHECK(
        cusparse_csrmm(cushandle,
                       CUSPARSE_OPERATION_TRANSPOSE,     // op(A)
                       CUSPARSE_OPERATION_NON_TRANSPOSE, // op(B)
                       in_nrows_global,                  // M
                       nchannel,                         // N
                       in_nrows,                         // K
                       nnz, &alpha, descr,
                       d_csr_val,                 // val
                       d_csr_row,                 // row
                       d_in_map,                  // col
                       d_tmp_grad_in_feat_global, // B
                       in_nrows_global,           // ldb
                       &beta,
                       d_tmp_grad_in_feat,        // C
                       in_nrows                   // ldc
                       ));
    col2row_major(in_nrows, nchannel, d_tmp_grad_in_feat, d_grad_in_feat,
                  stream);
    gpu_multiplication<Dtype>(nchannel * in_nrows, d_grad_out_feat,
                              d_grad_in_feat, d_grad_in_feat, stream);
    */
    break;
  default:
    throw std::invalid_argument(Formatter() << "Operation not supported: "
                                            << std::to_string(op));
  }

  cudaFree(d_scr);
  cudaFree(d_dtype);

  CUSPARSE_CHECK(cusparseDestroySpMat(sparse_descr));
  CUSPARSE_CHECK(cusparseDestroyDnMat(dense_descr));
  CUSPARSE_CHECK(cusparseDestroyDnMat(result_descr));

  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaDeviceSynchronize());
}

// Explicit template instantiations
template void BroadcastBackwardKernelGPU<float, uint32_t,
                                         detail::default_allocator<char>>(
    const float *d_in_feat, float *d_grad_in_feat, int in_nrows,
    const float *d_in_feat_global, float *d_grad_in_feat_global,
    int in_nrows_global, const float *d_grad_out_feat, int nchannel,
    BroadcastMode::Type const op,
    gpu_kernel_map<uint32_t, detail::default_allocator<char>> const
        &kernel_map,
    cusparseHandle_t cushandle, cudaStream_t stream);

template void BroadcastBackwardKernelGPU<double, uint32_t,
                                         detail::default_allocator<char>>(
    const double *d_in_feat, double *d_grad_in_feat, int in_nrows,
    const double *d_in_feat_global, double *d_grad_in_feat_global,
    int in_nrows_global, const double *d_grad_out_feat, int nchannel,
    BroadcastMode::Type const op,
    gpu_kernel_map<uint32_t, detail::default_allocator<char>> const
        &kernel_map,
    cusparseHandle_t cushandle, cudaStream_t stream);

template void
BroadcastBackwardKernelGPU<float, uint32_t, detail::c10_allocator<char>>(
    const float *d_in_feat, float *d_grad_in_feat, int in_nrows,
    const float *d_in_feat_global, float *d_grad_in_feat_global,
    int in_nrows_global, const float *d_grad_out_feat, int nchannel,
    BroadcastMode::Type const op,
    gpu_kernel_map<uint32_t, detail::c10_allocator<char>> const &kernel_map,
    cusparseHandle_t cushandle, cudaStream_t stream);

template void
BroadcastBackwardKernelGPU<double, uint32_t, detail::c10_allocator<char>>(
    const double *d_in_feat, double *d_grad_in_feat, int in_nrows,
    const double *d_in_feat_global, double *d_grad_in_feat_global,
    int in_nrows_global, const double *d_grad_out_feat, int nchannel,
    BroadcastMode::Type const op,
    gpu_kernel_map<uint32_t, detail::c10_allocator<char>> const &kernel_map,
    cusparseHandle_t cushandle, cudaStream_t stream);

} // namespace minkowski

#endif // GPU_BROADCAST