/*
* Copyright (c) 2020 NVIDIA Corporation.
* Copyright (c) 2018-2020 Chris Choy (chrischoy@ai.stanford.edu).
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Please cite "4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural
* Networks", CVPR'19 (https://arxiv.org/abs/1904.08755) if you use any part
* of the code.
*/
#ifndef GPU_BROADCAST
#define GPU_BROADCAST
#include "allocators.cuh"
#include "broadcast_kernel.cuh"
#include "math_functions.cuh"
namespace minkowski {
namespace detail {
template <class T> struct IsIntType { static const bool value = false; };
template <> struct IsIntType<int> { static const bool value = true; };
template <typename Dtype>
__device__ void atomic_addition_n(Dtype *__restrict__ dst,
const Dtype *__restrict__ src,
const int num_elements) {
for (int i = 0; i < num_elements; ++i)
atomicAdd(dst + i, src[i]);
}
/* Must be applied to collision free destinations */
template <typename Dtype>
__device__ void multiplication_n(Dtype *__restrict__ dst,
const Dtype *__restrict__ src,
const int num_elements) {
for (int i = 0; i < num_elements; ++i)
dst[i] *= src[i];
}
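/*
 * One thread per map entry: atomically adds the global feature row
 * d_glob_feat[d_out_map[index]] onto the output row
 * d_out_feat[d_in_map[index]]. Atomics keep repeated destination rows correct.
 */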
template <typename Dtype, typename Itype>
__global__ void channelwise_addition(const int n, const int nchannel,
const Dtype *__restrict__ d_glob_feat,
const Itype *__restrict__ d_in_map,
const Itype *__restrict__ d_out_map,
Dtype *__restrict__ d_out_feat) {
CUDA_KERNEL_LOOP(index, n) {
atomic_addition_n(&d_out_feat[d_in_map[index] * nchannel],
&d_glob_feat[d_out_map[index] * nchannel], nchannel);
}
}
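/*
 * Same indexing as channelwise_addition, but multiplies in place. Destination
 * rows must be collision free (each written by at most one map entry).
 */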
template <typename Dtype, typename Itype>
__global__ void channelwise_multiplication(
const int n, const int nchannel, const Dtype *__restrict__ d_glob_feat,
const Itype *__restrict__ d_in_map, const Itype *__restrict__ d_out_map,
Dtype *__restrict__ d_out_feat) {
CUDA_KERNEL_LOOP(index, n) {
multiplication_n(&d_out_feat[d_in_map[index] * nchannel],
&d_glob_feat[d_out_map[index] * nchannel], nchannel);
}
}
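/* Sets every element of in_feat to val; used below to build the all-ones COO
 * values of the sparse broadcast map. */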
template <typename Dtype>
__global__ void fill(const int n, Dtype *__restrict__ in_feat,
const Dtype val) {
CUDA_KERNEL_LOOP(index, n) { in_feat[index] = val; }
}
} // namespace detail
template <typename Dtype, typename Itype, typename ByteAllocator>
void BroadcastForwardKernelGPU(
const Dtype *d_in_feat, const int in_nrows, const Dtype *d_in_feat_global,
const int in_nrows_global, Dtype *d_out_feat, const int nchannel,
BroadcastMode::Type const op,
gpu_kernel_map<Itype, ByteAllocator> const &kernel_map,
cusparseHandle_t cushandle, cudaStream_t stream) {
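  // For every map entry i, combine one input row with the global feature row
  // it maps to:
  //   d_out_feat[in_map[i]] = d_in_feat[in_map[i]] (op) d_in_feat_global[out_map[i]]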
  // Total number of map entries; all per-kernel in_maps are laid out contiguously
size_t const num_map = kernel_map.in_maps.end() - kernel_map.in_maps.begin();
if (num_map != in_nrows)
throw std::invalid_argument(
"BroadcastForwardKernelGPU: kernel_map size != in_nrows");
// Copy all in_feat to out_feat
CUDA_CHECK(cudaMemcpy(d_out_feat, d_in_feat,
sizeof(Dtype) * nchannel * in_nrows,
cudaMemcpyDeviceToDevice));
  // Keep the op switch outside the element loops: each case launches its own
  // specialized kernel
switch (op) {
case BroadcastMode::ELEMENTWISE_ADDITON: // +
detail::channelwise_addition<Dtype, Itype>
<<<GET_BLOCKS(in_nrows, CUDA_NUM_THREADS), CUDA_NUM_THREADS, 0,
stream>>>(in_nrows, nchannel, d_in_feat_global,
kernel_map.in_maps.begin(), kernel_map.out_maps.begin(),
d_out_feat);
break;
case BroadcastMode::ELEMENTWISE_MULTIPLICATION: // *
detail::channelwise_multiplication<Dtype, Itype>
<<<GET_BLOCKS(in_nrows, CUDA_NUM_THREADS), CUDA_NUM_THREADS, 0,
stream>>>(in_nrows, nchannel, d_in_feat_global,
kernel_map.in_maps.begin(), kernel_map.out_maps.begin(),
d_out_feat);
break;
default:
throw std::invalid_argument(Formatter() << "Operation not supported: "
<< std::to_string(op));
}
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
}
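// Explicit instantiations: float/double x default_allocator/c10_allocator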
template void
BroadcastForwardKernelGPU<float, uint32_t, detail::default_allocator<char>>(
const float *d_in_feat, int in_nrows, const float *d_in_feat_global,
int in_nrows_global, float *d_out_feat, int nchannel,
BroadcastMode::Type const op,
gpu_kernel_map<uint32_t, detail::default_allocator<char>> const &kernel_map,
cusparseHandle_t cuhandle, cudaStream_t stream);
template void
BroadcastForwardKernelGPU<double, uint32_t, detail::default_allocator<char>>(
const double *d_in_feat, int in_nrows, const double *d_in_feat_global,
int in_nrows_global, double *d_out_feat, int nchannel,
BroadcastMode::Type const op,
gpu_kernel_map<uint32_t, detail::default_allocator<char>> const &kernel_map,
cusparseHandle_t cuhandle, cudaStream_t stream);
template void
BroadcastForwardKernelGPU<float, uint32_t, detail::c10_allocator<char>>(
const float *d_in_feat, int in_nrows, const float *d_in_feat_global,
int in_nrows_global, float *d_out_feat, int nchannel,
BroadcastMode::Type const op,
gpu_kernel_map<uint32_t, detail::c10_allocator<char>> const &kernel_map,
cusparseHandle_t cuhandle, cudaStream_t stream);
template void
BroadcastForwardKernelGPU<double, uint32_t, detail::c10_allocator<char>>(
const double *d_in_feat, int in_nrows, const double *d_in_feat_global,
int in_nrows_global, double *d_out_feat, int nchannel,
BroadcastMode::Type const op,
gpu_kernel_map<uint32_t, detail::c10_allocator<char>> const &kernel_map,
cusparseHandle_t cuhandle, cudaStream_t stream);
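// For reference, a rough CPU sketch of the forward broadcast computed above
// (illustrative only; broadcast_forward_cpu and its argument names are
// hypothetical, not part of this library):
//
//   template <typename Dtype, typename Itype>
//   void broadcast_forward_cpu(const Dtype *in, const Dtype *glob, Dtype *out,
//                              const Itype *in_map, const Itype *out_map,
//                              int num_map, int nchannel, bool add) {
//     std::copy(in, in + num_map * nchannel, out); // num_map == in_nrows
//     for (int i = 0; i < num_map; ++i)
//       for (int c = 0; c < nchannel; ++c) {
//         Dtype g = glob[out_map[i] * nchannel + c];
//         Dtype &o = out[in_map[i] * nchannel + c];
//         if (add)
//           o += g; // BroadcastMode::ELEMENTWISE_ADDITON
//         else
//           o *= g; // BroadcastMode::ELEMENTWISE_MULTIPLICATION
//       }
//   }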
template <typename Dtype, typename Itype, typename ByteAllocator>
void BroadcastBackwardKernelGPU(
const Dtype *d_in_feat, Dtype *d_grad_in_feat, int in_nrows,
const Dtype *d_in_feat_global, Dtype *d_grad_in_feat_global,
int in_nrows_global, const Dtype *d_grad_out_feat, int nchannel,
BroadcastMode::Type const op,
gpu_kernel_map<Itype, ByteAllocator> const &kernel_map,
cusparseHandle_t cushandle, cudaStream_t stream) {
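  // Gradients computed below:
  //   ELEMENTWISE_ADDITON:
  //     grad_in_feat        = grad_out_feat
  //     grad_in_feat_global = row-wise scatter-sum of grad_out_feat
  //                           (sparse map x grad_out_feat via cusparseSpMM)
  //   ELEMENTWISE_MULTIPLICATION:
  //     grad_in_feat        = grad_out_feat (*) broadcast(in_feat_global)
  //     grad_in_feat_global = row-wise scatter-sum of (in_feat (*) grad_out_feat)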
Itype *d_scr, *d_in_map, *d_out_map; //, *d_csr_row;
Dtype *d_dtype, *d_coo_val, *d_tmp_grad_in_feat_global, *d_tmp_grad_in_feat;
// cusparseMatDescr_t descr = 0;
const Dtype alpha = 1;
const Dtype beta = 0;
int nnz = in_nrows;
// if (in_maps.size() != 1) {
// All in_maps[k] are contiguous.
// TODO. Assert contiguous.
// }
  // Total number of map entries; all per-kernel in_maps are laid out contiguously
size_t const num_map = kernel_map.in_maps.end() - kernel_map.in_maps.begin();
if (num_map != in_nrows)
throw std::invalid_argument(
"BroadcastBackwardKernelGPU: kernel_map size != in_nrows");
/* In Out Map prep */
  // Malloc d_in_map, d_out_map, and the (now unused) d_csr_row slot;
  // a CSR row pointer needs n_row + 1 entries
CUDA_CHECK(cudaMalloc((void **)&d_scr,
2 * nnz * sizeof(Itype) + // in out maps
(in_nrows_global + 1) * sizeof(Itype) // d_csr_row
));
// COO cols
d_in_map = d_scr; // nnz
// COO rows
d_out_map = d_scr + nnz; // nnz
// CSR row indices
// d_csr_row = d_scr + 2 * nnz; // in_nrows_global + 1
CUDA_CHECK(cudaMemcpy(
d_in_map,
(int *)kernel_map.in_maps.begin(), // in_maps are contiguous of size nnz
nnz * sizeof(int), cudaMemcpyDeviceToDevice));
CUDA_CHECK(cudaMemcpy(
d_out_map,
(int *)kernel_map.out_maps.begin(), // out_maps are contiguous of size nnz
nnz * sizeof(int), cudaMemcpyDeviceToDevice));
  /* tmp in/out feat: scratch for the sparse gemm output */
  // One allocation, split into three chunks below
  CUDA_CHECK(cudaMalloc(
      (void **)&d_dtype,
      in_nrows_global * nchannel * sizeof(Dtype) + // d_tmp_grad_in_feat_global
          in_nrows * nchannel * sizeof(Dtype) +    // d_tmp_grad_in_feat
          nnz * sizeof(Dtype)                      // d_coo_val
      ));
// Divide the memory space into multiple chunks
d_tmp_grad_in_feat_global = d_dtype; // in_nrows_global * nchannel
d_tmp_grad_in_feat = d_tmp_grad_in_feat_global +
in_nrows_global * nchannel; // in_nrows * nchannel
d_coo_val = d_tmp_grad_in_feat + in_nrows * nchannel;
// thrust::fill(d_csr_val.begin(), d_csr_val.end(), 1);
detail::fill<Dtype>
<<<GET_BLOCKS(nnz, CUDA_NUM_THREADS), CUDA_NUM_THREADS, 0, stream>>>(
nnz, d_coo_val, (Dtype)1.);
// CUSPARSE_CHECK(cusparseCreateMatDescr(&descr));
// cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
// cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
  // cuSPARSE COO SpMM expects row-sorted indices: sort the rows (d_out_map) and
  // permute the matching columns (d_in_map)
THRUST_CHECK(thrust::sort_by_key(thrust::device, //
d_out_map, // key begin
d_out_map + nnz, // key end
d_in_map // value begin
));
cusparseSpMMAlg_t mm_alg;
#if defined(CUDART_VERSION) && (CUDART_VERSION < 10010)
TORCH_CHECK(false, "spmm sparse-dense requires CUDA 10.1 or greater");
#elif defined(CUDART_VERSION) && (CUDART_VERSION >= 10010) && \
(CUDART_VERSION < 11000)
mm_alg = CUSPARSE_MM_ALG_DEFAULT;
#elif defined(CUDART_VERSION) && (CUDART_VERSION >= 11000)
mm_alg = CUSPARSE_SPMM_ALG_DEFAULT;
#endif
  // Shapes for the SpMM below:
  //
  //   +---------+   +---+
  //   |   spm   |   | i |
  //   +---------+   | n |
  //    in_nrows     |   |
  //                 | F |
  //                 |   |
  //                 +---+
  //                nchannel
  //
  //   spm : in_nrows_global x in_nrows sparse broadcast map (all-ones COO)
  //   in F: dense in_nrows x nchannel matrix (features / gradients)
size_t dim_i = in_nrows_global, dim_j = in_nrows, dim_k = nchannel;
constexpr bool is_float32 = std::is_same<Dtype, float>::value;
cudaDataType cuda_data_type = is_float32 ? CUDA_R_32F : CUDA_R_64F;
cusparseSpMatDescr_t sparse_descr;
cusparseDnMatDescr_t dense_descr;
cusparseDnMatDescr_t result_descr;
CUSPARSE_CHECK(cusparseCreateCoo(&sparse_descr, //
dim_i, dim_j, nnz, //
d_out_map, // rows
d_in_map, // cols
d_coo_val, // coo vals
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
cuda_data_type));
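  // The COO matrix has a single 1 at (out_map[i], in_map[i]) for every map
  // entry, so multiplying it with a dense matrix sums the rows that broadcast
  // from the same global row.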
// buffer size 0 for CUSPARSE_SPMM_COO_ALG1, CUSPARSE_SPMM_COO_ALG3,
// CUSPARSE_SPMM_COO_ALG4, and CUSPARSE_SPMM_CSR_ALG1
  // Keep the op switch outside the element loops: each case runs its own
  // specialized path
switch (op) {
case BroadcastMode::ELEMENTWISE_ADDITON: // +
// For grad_in_feat, copy all grad_out_feat to grad_in_feat
CUDA_CHECK(cudaMemcpy(d_grad_in_feat, d_grad_out_feat,
sizeof(Dtype) * nchannel * in_nrows,
cudaMemcpyDeviceToDevice));
CUSPARSE_CHECK(cusparseCreateDnMat(&dense_descr, //
dim_k, dim_j, dim_k, //
(void *)d_grad_out_feat, //
cuda_data_type, CUSPARSE_ORDER_COL));
CUSPARSE_CHECK(cusparseCreateDnMat(&result_descr, //
dim_i, dim_k, dim_i, //
(void *)d_tmp_grad_in_feat_global, //
cuda_data_type, CUSPARSE_ORDER_COL));
// Transpose the output
// WARNING: coo sorting must have been handled in the kernel map
// decomposition.
CUSPARSE_CHECK(cusparseSpMM(cushandle, //
CUSPARSE_OPERATION_NON_TRANSPOSE, //
CUSPARSE_OPERATION_TRANSPOSE, //
(void *)&alpha, //
sparse_descr, dense_descr, //
(void *)&beta, result_descr, //
cuda_data_type, mm_alg, 0));
// For grad_in_feat_glob, add all grad_out_feat
/*
CUSPARSE_CHECK(
cusparse_csrmm<Dtype>(cushandle,
CUSPARSE_OPERATION_NON_TRANSPOSE, // op(A)
CUSPARSE_OPERATION_TRANSPOSE, // op(B)
in_nrows_global, // M
nchannel, // N
in_nrows, // K
nnz, &alpha, descr,
d_csr_val, // val
d_csr_row, // row
d_in_map, // col
d_grad_out_feat, // B
nchannel, // ldb
&beta,
d_tmp_grad_in_feat_global, // C
in_nrows_global // ldc
));
*/
col2row_major<Dtype>(in_nrows_global, nchannel, d_tmp_grad_in_feat_global,
d_grad_in_feat_global, stream);
break;
case BroadcastMode::ELEMENTWISE_MULTIPLICATION: // *
    // For grad_in_feat_global: tmp = in_feat (*) grad_out_feat, then scatter-sum
    // the rows with the sparse map.
    // Forward     : (A^T(sparse) x global(dense)) (*) B(feat) = C(result)
    // grad global : A(sparse) x (grad C (*) B)
CUDA_CHECK(cudaMemcpy(d_tmp_grad_in_feat, d_grad_out_feat,
sizeof(Dtype) * nchannel * in_nrows,
cudaMemcpyDeviceToDevice));
gpu_multiplication<Dtype>(nchannel * in_nrows, d_in_feat,
d_tmp_grad_in_feat, d_tmp_grad_in_feat, stream);
CUSPARSE_CHECK(cusparseCreateDnMat(&dense_descr, //
dim_k, dim_j, dim_k, //
(void *)d_tmp_grad_in_feat, //
cuda_data_type, CUSPARSE_ORDER_COL));
CUSPARSE_CHECK(cusparseCreateDnMat(&result_descr, //
dim_i, dim_k, dim_i, //
(void *)d_tmp_grad_in_feat_global, //
cuda_data_type, CUSPARSE_ORDER_COL));
// Transpose the output
CUSPARSE_CHECK(cusparseSpMM(cushandle, //
CUSPARSE_OPERATION_NON_TRANSPOSE, //
CUSPARSE_OPERATION_TRANSPOSE, //
(void *)&alpha, //
sparse_descr, dense_descr, //
(void *)&beta, result_descr, //
cuda_data_type, mm_alg, 0));
/*CUSPARSE_CHECK(
cusparse_csrmm<Dtype>(cushandle,
CUSPARSE_OPERATION_NON_TRANSPOSE, // op(A)
CUSPARSE_OPERATION_TRANSPOSE, // op(B)
in_nrows_global, // M
nchannel, // N
in_nrows, // K
nnz, &alpha, descr,
d_csr_val, // val
d_csr_row, // row
d_in_map, // col
d_tmp_grad_in_feat, // B
nchannel, // ldb
&beta,
d_tmp_grad_in_feat_global, // C
in_nrows_global // ldc
));
*/
col2row_major<Dtype>(in_nrows_global, nchannel, d_tmp_grad_in_feat_global,
d_grad_in_feat_global, stream);
    // For grad_in_feat: copy grad_out_feat into grad_in_feat, then multiply
    // each row in place by its broadcast global feature row (kernel launch below).
    // Forward   : (A^T(sparse) x global(dense)) (*) B(feat) = C(result)
    // grad feat : (A^T(sparse) x global(dense)) (*) grad C
// Sort COO first
// sort_coo_gpu(cushandle, in_nrows_global, in_nrows, nnz, d_out_map,
// d_in_map);
// cusparseSpMatDescr_t sparse_descr2;
CUDA_CHECK(cudaMemcpy(d_grad_in_feat, d_grad_out_feat,
sizeof(Dtype) * nchannel * in_nrows,
cudaMemcpyDeviceToDevice));
detail::channelwise_multiplication<Dtype, Itype>
<<<GET_BLOCKS(in_nrows, CUDA_NUM_THREADS), CUDA_NUM_THREADS, 0,
stream>>>(in_nrows, nchannel, d_in_feat_global,
kernel_map.in_maps.begin(), kernel_map.out_maps.begin(),
d_grad_in_feat);
/*
row2col_major<Dtype>(in_nrows_global, nchannel, d_in_feat_global,
d_tmp_grad_in_feat_global, stream);
CUSPARSE_CHECK(
cusparse_csrmm<Dtype>(cushandle,
CUSPARSE_OPERATION_TRANSPOSE, // op(A)
CUSPARSE_OPERATION_NON_TRANSPOSE, // op(B)
in_nrows_global, // M
nchannel, // N
in_nrows, // K
nnz, &alpha, descr,
d_csr_val, // val
d_csr_row, // row
d_in_map, // col
d_tmp_grad_in_feat_global, // B
in_nrows_global, // ldb
&beta,
d_tmp_grad_in_feat, // C
in_nrows // ldc
));
col2row_major<Dtype>(in_nrows, nchannel, d_tmp_grad_in_feat, d_grad_in_feat,
stream);
gpu_multiplication<Dtype>(nchannel * in_nrows, d_grad_out_feat,
d_grad_in_feat, d_grad_in_feat, stream);
*/
break;
default:
throw std::invalid_argument(Formatter() << "Operation not supported: "
<< std::to_string(op));
}
cudaFree(d_scr);
cudaFree(d_dtype);
CUSPARSE_CHECK(cusparseDestroySpMat(sparse_descr));
CUSPARSE_CHECK(cusparseDestroyDnMat(dense_descr));
CUSPARSE_CHECK(cusparseDestroyDnMat(result_descr));
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
}
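// Explicit instantiations of the backward kernel: float/double x
// default_allocator/c10_allocator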
template void
BroadcastBackwardKernelGPU<float, uint32_t, detail::default_allocator<char>>(
const float *d_in_feat, float *d_grad_in_feat, int in_nrows,
const float *d_in_feat_global, float *d_grad_in_feat_global,
int in_nrows_global, const float *d_grad_out_feat, int nchannel,
BroadcastMode::Type const op,
gpu_kernel_map<uint32_t, detail::default_allocator<char>> const &kernel_map,
cusparseHandle_t cushandle, cudaStream_t stream);
template void
BroadcastBackwardKernelGPU<double, uint32_t, detail::default_allocator<char>>(
const double *d_in_feat, double *d_grad_in_feat, int in_nrows,
const double *d_in_feat_global, double *d_grad_in_feat_global,
int in_nrows_global, const double *d_grad_out_feat, int nchannel,
BroadcastMode::Type const op,
gpu_kernel_map<uint32_t, detail::default_allocator<char>> const &kernel_map,
cusparseHandle_t cushandle, cudaStream_t stream);
template void
BroadcastBackwardKernelGPU<float, uint32_t, detail::c10_allocator<char>>(
const float *d_in_feat, float *d_grad_in_feat, int in_nrows,
const float *d_in_feat_global, float *d_grad_in_feat_global,
int in_nrows_global, const float *d_grad_out_feat, int nchannel,
BroadcastMode::Type const op,
gpu_kernel_map<uint32_t, detail::c10_allocator<char>> const &kernel_map,
cusparseHandle_t cushandle, cudaStream_t stream);
template void
BroadcastBackwardKernelGPU<double, uint32_t, detail::c10_allocator<char>>(
const double *d_in_feat, double *d_grad_in_feat, int in_nrows,
const double *d_in_feat_global, double *d_grad_in_feat_global,
int in_nrows_global, const double *d_grad_out_feat, int nchannel,
BroadcastMode::Type const op,
gpu_kernel_map<uint32_t, detail::c10_allocator<char>> const &kernel_map,
cusparseHandle_t cushandle, cudaStream_t stream);
} // namespace minkowski
#endif // GPU_BROADCAST