// depthsplat/MinkowskiEngine/src/convolution_kernel.cu
/*
* Copyright (c) 2020 NVIDIA Corporation.
* Copyright (c) Chris Choy (chrischoy@ai.stanford.edu).
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Please cite "4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural
* Networks", CVPR'19 (https://arxiv.org/abs/1904.08755) if you use any part
* of the code.
*/
#ifndef GPU_CONVOLUTION
#define GPU_CONVOLUTION
#include <iostream>
// Use torch for GPU memory management. Thrust resize gives a segfault when
// debugging with -g.
#include "allocators.cuh"
#include "convolution_kernel.cuh"
#include "math_functions.cuh"
#include <ATen/cuda/CUDAUtils.h>
#include <torch/extension.h>
namespace minkowski {
namespace detail {
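// Heuristics for choosing between the two convolution strategies:
//   DIRECT_GEMM - launch the custom matmul/matmul2 kernels below, which
//                 gather/scatter through the kernel maps on the fly;
//   COPY_GEMM   - gather rows into dense buffers and call cuBLAS.
// sA and sB are the input and output channel counts and N is the number of
// input rows (see the call sites in ConvolutionForward/BackwardKernelGPU).
// The branch thresholds below appear to be empirically tuned cutoffs,
// presumably from profiling both paths across problem sizes.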
bool check_direct_gemm_forward(MinkowskiAlgorithm::Mode const algo_index, //
ConvolutionMode::Type const &convolution_mode,
long const sA, long const sB, long const N) {
if ((convolution_mode == ConvolutionMode::DIRECT_GEMM) ||
(algo_index == MinkowskiAlgorithm::MEMORY_EFFICIENT))
return true;
if (convolution_mode == ConvolutionMode::COPY_GEMM)
return false;
  if (sA == 32 && sB == 64 && N <= 490537)
    return true;
if (sB <= 40) {
if (sB <= 20) {
return true;
} else {
if (N <= 295625) {
return true;
} else {
return (sA <= 12);
}
}
} else {
if (sA <= 20)
return true;
else {
if (N <= 74556) {
return (sB <= 112);
} else {
return false;
}
}
}
}
bool check_direct_gemm_backward(MinkowskiAlgorithm::Mode const algo_index, //
ConvolutionMode::Type const &convolution_mode,
long const sA, long const sB, long const N) {
if ((convolution_mode == ConvolutionMode::DIRECT_GEMM) ||
(algo_index == MinkowskiAlgorithm::MEMORY_EFFICIENT))
return true;
if (convolution_mode == ConvolutionMode::COPY_GEMM)
return false;
  if (sA == 32 && sB == 64 && N <= 490537)
    return true;
if (sB <= 40) {
if (sA <= 20)
return true;
else {
if (N <= 490540) {
return true;
} else {
return (sA <= 12);
}
}
} else {
if (sA <= 20) {
return true;
} else {
if (N <= 30612) {
return (sB <= 160);
} else {
return false;
}
}
}
}
/**
 * Matrix multiplication (CUDA Kernel) on the device: C = A * B.
 * wA is A's width and wB is B's width. Rows of A are gathered through in_map
 * and the product rows are scattered into C through out_map (with atomicAdd,
 * since several kernel offsets may write to the same output row).
 */
template <typename Dtype, typename Itype, int BLOCK_SIZE>
__global__ void
matmul(const Dtype *__restrict__ A, const int wA, const int hA, //
const Dtype *__restrict__ B, const int wB, const int hB, //
Dtype *__restrict__ C, //
const Itype *__restrict__ in_map, const Itype *__restrict__ out_map) {
// Use in_feat as A and kernel as B
// Block index
const int bx = blockIdx.x;
const int by = blockIdx.y;
// Thread index
const int tx = threadIdx.x;
const int ty = threadIdx.y;
  // Coordinate. y is for rows, x is for columns.
const int x = BLOCK_SIZE * bx + tx;
const int y = BLOCK_SIZE * by + ty;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
Dtype Csub = 0;
const Itype in_row = y < hA ? in_map[y] : 0;
const Itype out_row = y < hA ? out_map[y] : 0;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (int s = 0; s < wA; s += BLOCK_SIZE) {
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__ Dtype As[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__ Dtype Bs[BLOCK_SIZE][BLOCK_SIZE];
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
As[ty][tx] = ((s + tx) < wA && y < hA) ? A[wA * in_row + s + tx] : 0;
Bs[ty][tx] = ((s + ty) < hB && x < wB) ? B[wB * (s + ty) + x] : 0;
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k) {
Csub += As[ty][k] * Bs[k][tx];
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write the block sub-matrix to device memory;
// each thread writes one element
if (y < hA && x < wB)
atomicAdd(&C[wB * out_row + x], Csub);
// C[wB * out_row + x] += Csub;
}
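// A minimal launch sketch for matmul (it mirrors the dispatch in
// ConvolutionForwardKernelGPU below; n_active, d_kernel_offset, d_in_map, and
// d_out_map are illustrative names, and the float/int32_t types are only one
// possible instantiation):
//
//   dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
//   dim3 grid((out_nchannel + BLOCK_SIZE - 1) / BLOCK_SIZE,
//             (n_active + BLOCK_SIZE - 1) / BLOCK_SIZE);
//   matmul<float, int32_t, BLOCK_SIZE><<<grid, threads, 0, stream>>>(
//       d_in_feat, in_nchannel, n_active,            // A, wA, hA
//       d_kernel_offset, out_nchannel, in_nchannel,  // B, wB, hB
//       d_out_feat, d_in_map, d_out_map);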
/**
 * Matrix multiplication (CUDA Kernel) on the device: C = A * B^T, E = D^T * A
 * wA is A's width and wB is B's width
 *
 *              +---+
 *              |B^T|
 *   +----------+---+
 *   |          |   |
 *   |    A     | C |
 *   |          |   |
 *   |          |   |
 *   +----------+---+
 *   |   D^T    | E |
 *   +----------+---+
 *
 */
template <typename Dtype, typename Itype, int BLOCK_SIZE>
__global__ void
matmul2(const Dtype *__restrict__ A, const int wA, const int hA, //
const Dtype *__restrict__ B, const int wB, const int hB, //
const Dtype *__restrict__ D, const int wD, const int hD, //
Dtype *__restrict__ C, Dtype *__restrict__ E,
const Itype *__restrict__ in_map, const Itype *__restrict__ out_map) {
// Use grad_out_feat as A, transposed kernel weight as B, and in_feat as D
// Block index
const int bx = blockIdx.x;
const int by = blockIdx.y;
// Thread index
const int tx = threadIdx.x;
const int ty = threadIdx.y;
// Coordinate. y is for rows, x is for columns.
const int x = BLOCK_SIZE * bx + tx;
const int y = BLOCK_SIZE * by + ty;
const Itype in_row = y < hA ? in_map[y] : 0;
const Itype out_row = y < hA ? out_map[y] : 0;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
Dtype Csub = 0;
Dtype Esub = 0;
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__ Dtype As[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__ Dtype BTs[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Ds used to
// store the sub-matrix of D
__shared__ Dtype DTs[BLOCK_SIZE][BLOCK_SIZE];
// For Ds = D^T[...:..., ...:...], use the transposed grid dimension for A
DTs[ty][tx] = (x < wD && y < hD) ? D[wD * in_row + x] : 0;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (int s = 0; s < wA; s += BLOCK_SIZE) {
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
As[ty][tx] = ((s + tx) < wA && y < hA) ? A[wA * out_row + s + tx] : 0;
// Transposed kernel
BTs[ty][tx] = ((s + ty) < wB && x < hB) ? B[wB * x + s + ty] : 0;
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k) {
Csub += As[ty][k] * BTs[k][tx];
}
// For Esub, reset to 0
Esub = 0;
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k) {
Esub += DTs[k][ty] * As[k][tx];
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
    // The E matrix requires accumulation across multiple blocks, so use
    // atomic addition. This could be replaced with a more sophisticated
    // reduction algorithm.
if ((bx * BLOCK_SIZE + ty) < wD && (s + tx) < wA)
atomicAdd(&E[wA * (bx * BLOCK_SIZE + ty) + (s + tx)], Esub);
}
// Write the block sub-matrix to device memory;
// each thread writes one element
if (y < hA && x < hB)
atomicAdd(&C[hB * in_row + x], Csub);
}
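// In the backward pass, matmul2 fuses both products for one kernel offset:
//   C = A * B^T  ->  d_in_feat  = d_out_feat * W^T (scattered via in_map),
//   E = D^T * A  ->  d_kernel  += in_feat^T * d_out_feat (atomicAdd).
// A is gathered through out_map and D through in_map, so a single launch
// produces both the input gradient and the kernel gradient.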
template <typename Dtype, typename Itype>
__global__ void
add_mapped_output_tr(const size_t n, const Dtype *__restrict__ in_feat,
const size_t in_nchannel, Dtype *__restrict__ out_feat,
const size_t out_nchannel, const Itype *__restrict__ map) {
extern __shared__ Itype map_index[];
// Block index
const int bx = blockIdx.x;
const int by = blockIdx.y;
// Thread index
const int tx = threadIdx.x;
const int ty = threadIdx.y;
// Coordinate. y is for rows, x is for columns.
const int x = blockDim.x * bx + tx;
const int y = blockDim.y * by + ty;
if (x < n && ty == 0)
map_index[tx] = map[x];
__syncthreads();
if (x < n && y < out_nchannel) {
atomicAdd(&out_feat[map_index[tx] * out_nchannel + y],
in_feat[y * in_nchannel + x]);
}
}
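// Note: add_mapped_output_tr reads an n x out_nchannel result out of a
// *transposed* in_feat buffer (in_feat[y * in_nchannel + x] with x over rows)
// and scatter-accumulates it through `map`. It uses dynamic shared memory for
// the map indices, so launches must pass threads.x * sizeof(Itype) as the
// shared-memory size (see the launch in ConvolutionBackwardKernelGPU below).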
} // namespace detail
template <typename Dtype, typename Itype, typename ByteAllocator>
void ConvolutionForwardKernelGPU(
Dtype const *d_in_feat, //
default_types::size_type const in_nchannel, //
Dtype *d_out_feat, //
default_types::size_type const out_nchannel, //
Dtype *d_kernel, gpu_kernel_map<Itype, ByteAllocator> const &kernel_map,
default_types::size_type const in_nrows, //
default_types::size_type const out_nrows, //
ByteAllocator &allocator, //
MinkowskiAlgorithm::Mode const algo_index, //
ConvolutionMode::Type const convolution_mode, //
cublasHandle_t cuhandle, cudaStream_t stream) {
size_t n_active_in_volume, shared_mem_size = -1;
if (detail::check_direct_gemm_forward(algo_index, convolution_mode,
in_nchannel, out_nchannel, in_nrows)) {
    // Pick the tile edge; shared_mem_size is used both as the BLOCK_SIZE
    // template argument and as the square thread-block edge below.
if ((in_nchannel > 16 && out_nchannel > 16 &&
in_nchannel * out_nchannel >= 512) ||
(in_nchannel > 24 && out_nchannel > 24))
shared_mem_size = 32;
else if (in_nchannel % 24 == 0 && out_nchannel % 24 == 0)
shared_mem_size = 24;
else if ((in_nchannel > 8 && out_nchannel > 8) ||
(in_nchannel % 16 == 0 && out_nchannel % 16 == 0))
shared_mem_size = 16;
else
shared_mem_size = 8;
dim3 threads(shared_mem_size, shared_mem_size);
// Iterate through each spatial kernel and get indices for in_map and
// out_map
for (auto it = kernel_map.key_cbegin(); it != kernel_map.key_cend(); ++it) {
auto const k = it->first;
n_active_in_volume = kernel_map.size(k);
if (n_active_in_volume == 0)
continue;
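      // Cap the grid: when n_active_in_volume needs more than MAX_GRID blocks
      // along y, split the map into num_div chunks of at most `step` entries
      // and launch the kernel once per chunk.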
size_t const num_grid =
(n_active_in_volume + shared_mem_size - 1) / shared_mem_size;
size_t const num_div = (num_grid + MAX_GRID - 1) / MAX_GRID;
size_t const step = (n_active_in_volume + num_div - 1) / num_div;
for (size_t s = 0; s < num_div; s++) {
size_t const offset = step * s;
size_t const remainder = n_active_in_volume - offset;
size_t const curr_num_active = remainder < step ? remainder : step;
dim3 const grid((out_nchannel + threads.x - 1) / threads.x,
(curr_num_active + threads.y - 1) / threads.y);
// copy in out map
#ifdef DEBUG
/*
size_t map_size = curr_num_active;
Itype *p_kernel_map = (Itype *)std::malloc(map_size * 3 *
sizeof(Itype)); CUDA_CHECK(cudaMemcpy(p_kernel_map,
kernel_map.kernels.begin(k), map_size * sizeof(Itype),
cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(p_kernel_map + 1 * map_size,
kernel_map.in_maps.begin(k),
map_size * sizeof(Itype),
cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(p_kernel_map + 2 *
map_size, kernel_map.out_maps.begin(k), map_size * sizeof(Itype),
cudaMemcpyDeviceToHost));
for (size_t i = curr_num_active - 20; i < curr_num_active; ++i) {
std::cout << p_kernel_map[i + 0 * map_size] << ":"
<< p_kernel_map[i + 1 * map_size] << "->"
<< p_kernel_map[i + 2 * map_size] << "\n";
}
CUDA_CHECK(cudaDeviceSynchronize());
std::free(p_kernel_map);
*/
#endif
switch (shared_mem_size) {
case 32:
detail::matmul<Dtype, Itype, 32><<<grid, threads, 0, stream>>>(
d_in_feat, in_nchannel, curr_num_active,
&d_kernel[k * in_nchannel * out_nchannel], out_nchannel,
in_nchannel, d_out_feat, kernel_map.in_maps.begin(k) + offset,
kernel_map.out_maps.begin(k) + offset);
break;
case 24:
detail::matmul<Dtype, Itype, 24><<<grid, threads, 0, stream>>>(
d_in_feat, in_nchannel, curr_num_active,
&d_kernel[k * in_nchannel * out_nchannel], out_nchannel,
in_nchannel, d_out_feat, kernel_map.in_maps.begin(k) + offset,
kernel_map.out_maps.begin(k) + offset);
break;
case 16:
detail::matmul<Dtype, Itype, 16><<<grid, threads, 0, stream>>>(
d_in_feat, in_nchannel, curr_num_active,
&d_kernel[k * in_nchannel * out_nchannel], out_nchannel,
in_nchannel, d_out_feat, kernel_map.in_maps.begin(k) + offset,
kernel_map.out_maps.begin(k) + offset);
break;
case 8:
detail::matmul<Dtype, Itype, 8><<<grid, threads, 0, stream>>>(
d_in_feat, in_nchannel, curr_num_active,
&d_kernel[k * in_nchannel * out_nchannel], out_nchannel,
in_nchannel, d_out_feat, kernel_map.in_maps.begin(k) + offset,
kernel_map.out_maps.begin(k) + offset);
break;
}
}
#ifdef DEBUG
LOG_DEBUG("k:", k, "num:", n_active_in_volume);
CUDA_CHECK(cudaDeviceSynchronize());
#endif
CUDA_CHECK(cudaGetLastError());
}
} else { // copy gemm
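    // COPY_GEMM path: gather the active input rows into a dense buffer, run
    // one cuBLAS GEMM per kernel offset, then scatter-accumulate the dense
    // product into the output features.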
Itype const max_numel = kernel_map.max_size();
LOG_DEBUG("max_numel:", max_numel);
Dtype *mapped_in_feat = reinterpret_cast<Dtype *>(
allocator.allocate(max_numel * in_nchannel * sizeof(Dtype)));
Dtype *mapped_out_feat = reinterpret_cast<Dtype *>(
allocator.allocate(max_numel * out_nchannel * sizeof(Dtype)));
for (auto it = kernel_map.key_cbegin(); it != kernel_map.key_cend(); ++it) {
auto const k = it->first;
n_active_in_volume = kernel_map.size(k);
if (n_active_in_volume == 0)
continue;
LOG_DEBUG(n_active_in_volume * in_nchannel, in_nchannel);
detail::shared_copy_kernel_map<Dtype, Itype>(
mapped_in_feat, d_in_feat, kernel_map.in_maps.begin(k),
n_active_in_volume * in_nchannel, in_nchannel);
#ifdef DEBUG
/*
size_t map_size = std::min((size_t)n_active_in_volume, (size_t)100);
Dtype *p_tmp =
(Dtype *)std::malloc(map_size * in_nchannel * sizeof(Dtype));
CUDA_CHECK(cudaMemcpy(p_tmp, mapped_in_feat,
map_size * in_nchannel * sizeof(Dtype),
cudaMemcpyDeviceToHost));
for (size_t i = 0; i < map_size; ++i) {
std::cout << PtrToString(&p_tmp[i * in_nchannel], in_nchannel) << "\n";
}
CUDA_CHECK(cudaDeviceSynchronize());
std::free(p_tmp);
*/
#endif
gpu_gemm<Dtype>(cuhandle, CblasNoTrans, CblasNoTrans,
n_active_in_volume, // M
out_nchannel, // N
in_nchannel, // K
1, // alpha
mapped_in_feat, // A
&d_kernel[k * in_nchannel * out_nchannel], // B
0, // beta
mapped_out_feat // C
);
detail::shared_accumulate_kernel_map<Dtype, Itype>(
d_out_feat, mapped_out_feat, kernel_map.out_maps.begin(k),
n_active_in_volume * out_nchannel, out_nchannel);
}
allocator.deallocate((char *)mapped_in_feat,
max_numel * in_nchannel * sizeof(Dtype));
allocator.deallocate((char *)mapped_out_feat,
max_numel * out_nchannel * sizeof(Dtype));
}
CUDA_CHECK(cudaStreamSynchronize(stream));
}
// default_allocator
template void
ConvolutionForwardKernelGPU<float, uint32_t, detail::default_allocator<char>>(
float const *d_in_feat, default_types::size_type const in_nchannel,
float *d_out_feat, default_types::size_type const out_nchannel,
float *d_kernel,
gpu_kernel_map<uint32_t, detail::default_allocator<char>> const &kernel_map,
default_types::size_type const in_nrows, //
default_types::size_type const out_nrows,
detail::default_allocator<char> &allocator, //
MinkowskiAlgorithm::Mode const algo_index, //
ConvolutionMode::Type const convolution_mode, cublasHandle_t cuhandle,
cudaStream_t stream);
template void
ConvolutionForwardKernelGPU<double, uint32_t, detail::default_allocator<char>>(
double const *d_in_feat, default_types::size_type const in_nchannel,
double *d_out_feat, default_types::size_type const out_nchannel,
double *d_kernel,
gpu_kernel_map<uint32_t, detail::default_allocator<char>> const &kernel_map,
default_types::size_type const in_nrows, //
default_types::size_type const out_nrows,
detail::default_allocator<char> &allocator, //
MinkowskiAlgorithm::Mode const algo_index, //
ConvolutionMode::Type const convolution_mode, cublasHandle_t cuhandle,
cudaStream_t stream);
// c10_allocator
template void
ConvolutionForwardKernelGPU<float, uint32_t, detail::c10_allocator<char>>(
float const *d_in_feat, default_types::size_type const in_nchannel,
float *d_out_feat, default_types::size_type const out_nchannel,
float *d_kernel,
gpu_kernel_map<uint32_t, detail::c10_allocator<char>> const &kernel_map,
default_types::size_type const in_nrows, //
default_types::size_type const out_nrows,
detail::c10_allocator<char> &allocator, //
MinkowskiAlgorithm::Mode const algo_index, //
ConvolutionMode::Type const convolution_mode, cublasHandle_t cuhandle,
cudaStream_t stream);
template void
ConvolutionForwardKernelGPU<double, uint32_t, detail::c10_allocator<char>>(
double const *d_in_feat, default_types::size_type const in_nchannel,
double *d_out_feat, default_types::size_type const out_nchannel,
double *d_kernel,
gpu_kernel_map<uint32_t, detail::c10_allocator<char>> const &kernel_map,
default_types::size_type const in_nrows, //
default_types::size_type const out_nrows,
detail::c10_allocator<char> &allocator, //
MinkowskiAlgorithm::Mode const algo_index, //
ConvolutionMode::Type const convolution_mode, cublasHandle_t cuhandle,
cudaStream_t stream);
// Backward
template <typename Dtype, typename Itype, typename ByteAllocator>
void ConvolutionBackwardKernelGPU(
Dtype const *d_in_feat, //
Dtype *d_grad_in_feat, default_types::size_type const in_nchannel, //
Dtype const *d_grad_out_feat,
default_types::size_type const out_nchannel, //
Dtype const *d_kernel, Dtype *d_grad_kernel, //
gpu_kernel_map<Itype, ByteAllocator> const &kernel_map, //
default_types::size_type const in_nrows, //
default_types::size_type const out_nrows, //
ByteAllocator &allocator, //
MinkowskiAlgorithm::Mode const algo_index, //
ConvolutionMode::Type const convolution_mode, cublasHandle_t cuhandle,
cudaStream_t stream) {
#ifdef DEBUG
CUDA_CHECK_ARGS(cudaDeviceSynchronize(),
"Error triggered from a previous kernel call.");
#endif
size_t n_active_in_volume, shared_mem_size = -1;
  // Pick the tile edge used for the thread-block dimensions below.
if ((in_nchannel > 16 && out_nchannel > 16 &&
in_nchannel * out_nchannel >= 512) ||
(in_nchannel % 32 == 0 && out_nchannel % 32 == 0))
shared_mem_size = 32;
else if (in_nchannel % 24 == 0 && out_nchannel % 24 == 0)
shared_mem_size = 24;
else if ((in_nchannel > 8 && out_nchannel > 8) ||
(in_nchannel % 16 == 0 && out_nchannel % 16 == 0))
shared_mem_size = 16;
else
shared_mem_size = 8;
if (!detail::check_direct_gemm_backward(
algo_index, convolution_mode, in_nchannel, out_nchannel, in_nrows)) {
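    // COPY_GEMM backward. Per kernel offset k:
    //   1. gather d_grad_out_feat rows into d_output_buffer (dY);
    //   2. compute dX^T = W * dY^T via cuBLAS, then scatter-accumulate it
    //      into d_grad_in_feat with add_mapped_output_tr;
    //   3. gather in_feat rows (X) and accumulate dW += X^T * dY.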
// find max size
size_t max_active = kernel_map.max_size();
size_t in_buffer_size = max_active * in_nchannel * sizeof(Dtype);
size_t out_buffer_size = max_active * out_nchannel * sizeof(Dtype);
Dtype *d_input_buffer = (Dtype *)allocator.allocate(in_buffer_size);
Dtype *d_output_buffer = (Dtype *)allocator.allocate(out_buffer_size);
dim3 threads(32, shared_mem_size);
#ifdef DEBUG
timer t;
#endif
for (auto it = kernel_map.key_cbegin(); it != kernel_map.key_cend(); ++it) {
auto const k = it->first;
n_active_in_volume = kernel_map.size(k);
if (n_active_in_volume == 0)
continue;
// Copy (*p_in_maps)[k] to GPU
Itype const *d_in_map = kernel_map.in_maps.begin(k);
Itype const *d_out_map = kernel_map.out_maps.begin(k);
// Copy gradients to the buffer
#ifdef DEBUG
t.tic();
#endif
detail::shared_copy_kernel_map<Dtype, Itype>(
d_output_buffer, d_grad_out_feat, d_out_map,
n_active_in_volume * out_nchannel, out_nchannel);
#ifdef DEBUG
CUDA_CHECK(cudaStreamSynchronize(stream));
LOG_DEBUG("copy input", t.toc());
t.tic();
#endif
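      // dX^T = W * dY^T: with row-major shapes W (in_nchannel x out_nchannel)
      // and dY (n_active x out_nchannel), the product is
      // in_nchannel x n_active, i.e. d_input_buffer holds the *transposed*
      // input gradient; hence the transposed scatter below.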
gpu_gemm<Dtype>(cuhandle, CblasNoTrans, CblasTrans,
in_nchannel, // M
n_active_in_volume, // N
out_nchannel, // K
1, // alpha
&d_kernel[k * in_nchannel * out_nchannel], // A
d_output_buffer, // B
0, // beta
d_input_buffer // C
);
#ifdef DEBUG
CUDA_CHECK(cudaStreamSynchronize(0));
LOG_DEBUG("input grad gemm", t.toc());
t.tic();
#endif
// Accumulate gradients back to the input grad feat
// Put it back to the correct index
dim3 const grid_tr(GET_BLOCKS(n_active_in_volume, threads.x),
GET_BLOCKS(in_nchannel, threads.y));
      detail::add_mapped_output_tr<Dtype, Itype>
          <<<grid_tr, threads, threads.x * sizeof(Itype), stream>>>(
              n_active_in_volume,          // n (active map entries)
              d_input_buffer,              // in_feat (transposed buffer)
              n_active_in_volume,          // in_nchannel (row stride of the
                                           //   transposed buffer)
              d_grad_in_feat, in_nchannel, // out_feat, out_nchannel
              d_in_map);                   // map
#ifdef DEBUG
CUDA_CHECK(cudaStreamSynchronize(stream));
LOG_DEBUG("accumulate in grad", t.toc());
t.tic();
#endif
// Compute gradient for kernel
// Copy features to the buffer
      detail::shared_copy_kernel_map<Dtype, Itype>(
          d_input_buffer, d_in_feat, d_in_map, n_active_in_volume * in_nchannel,
          in_nchannel);
#ifdef DEBUG
LOG_DEBUG("copy in feat to buffer", t.toc());
t.tic();
#endif
// sync before the blas call
CUDA_CHECK(cudaStreamSynchronize(stream));
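      // dW += X^T * dY: beta = 1 accumulates into the k-th slice of
      // d_grad_kernel (assumed zero-initialized by the caller) across the
      // chunks of this loop.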
gpu_gemm<Dtype>(cuhandle, CblasTrans, CblasNoTrans,
in_nchannel, // M
out_nchannel, // N
n_active_in_volume, // K
1, // alpha
d_input_buffer, // A
d_output_buffer, // B
1, // beta
&d_grad_kernel[k * in_nchannel * out_nchannel] // C
);
CUDA_CHECK(cudaStreamSynchronize(0));
#ifdef DEBUG
LOG_DEBUG("grad kernel gemm", t.toc());
t.tic();
#endif
}
allocator.deallocate((char *)d_input_buffer, in_buffer_size);
allocator.deallocate((char *)d_output_buffer, out_buffer_size);
} else {
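    // DIRECT_GEMM backward: a single fused matmul2 launch per chunk computes
    // both the input gradient (C) and the kernel gradient (E).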
dim3 threads(shared_mem_size, shared_mem_size);
for (auto it = kernel_map.key_cbegin(); it != kernel_map.key_cend(); ++it) {
auto const k = it->first;
n_active_in_volume = kernel_map.size(k);
if (n_active_in_volume == 0)
continue;
size_t const num_grid =
(n_active_in_volume + shared_mem_size - 1) / shared_mem_size;
size_t const num_div = (num_grid + MAX_GRID - 1) / MAX_GRID;
size_t const step = (n_active_in_volume + num_div - 1) / num_div;
      for (size_t s = 0; s < num_div; s++) {
size_t const offset = step * s;
size_t const remainder = n_active_in_volume - offset;
size_t const curr_num_active = remainder < step ? remainder : step;
dim3 const grid((in_nchannel + threads.x - 1) / threads.x,
(curr_num_active + threads.y - 1) / threads.y);
switch (shared_mem_size) {
case 32:
detail::matmul2<Dtype, Itype, 32><<<grid, threads, 0, stream>>>(
d_grad_out_feat, out_nchannel, curr_num_active, // A
&d_kernel[k * in_nchannel * out_nchannel], out_nchannel,
in_nchannel, // B
d_in_feat, in_nchannel, curr_num_active, // D
d_grad_in_feat, // C
&d_grad_kernel[k * in_nchannel * out_nchannel], // E
kernel_map.in_maps.begin(k) + offset,
kernel_map.out_maps.begin(k) + offset);
break;
case 24:
detail::matmul2<Dtype, Itype, 24><<<grid, threads, 0, stream>>>(
d_grad_out_feat, out_nchannel, curr_num_active, // A
&d_kernel[k * in_nchannel * out_nchannel], out_nchannel,
in_nchannel, // B
d_in_feat, in_nchannel, curr_num_active, // D
d_grad_in_feat, // C
&d_grad_kernel[k * in_nchannel * out_nchannel], // E
kernel_map.in_maps.begin(k) + offset,
kernel_map.out_maps.begin(k) + offset);
break;
case 16:
detail::matmul2<Dtype, Itype, 16><<<grid, threads, 0, stream>>>(
d_grad_out_feat, out_nchannel, curr_num_active, // A
&d_kernel[k * in_nchannel * out_nchannel], out_nchannel,
in_nchannel, // B
d_in_feat, in_nchannel, curr_num_active, // D
d_grad_in_feat, // C
&d_grad_kernel[k * in_nchannel * out_nchannel], // E
kernel_map.in_maps.begin(k) + offset,
kernel_map.out_maps.begin(k) + offset);
break;
case 8:
detail::matmul2<Dtype, Itype, 8><<<grid, threads, 0, stream>>>(
d_grad_out_feat, out_nchannel, curr_num_active, // A
&d_kernel[k * in_nchannel * out_nchannel], out_nchannel,
in_nchannel, // B
d_in_feat, in_nchannel, curr_num_active, // D
d_grad_in_feat, // C
&d_grad_kernel[k * in_nchannel * out_nchannel], // E
kernel_map.in_maps.begin(k) + offset,
kernel_map.out_maps.begin(k) + offset);
break;
}
}
CUDA_CHECK(cudaGetLastError());
}
CUDA_CHECK(cudaStreamSynchronize(stream));
}
}
// default_allocator
template void
ConvolutionBackwardKernelGPU<float, uint32_t, detail::default_allocator<char>>(
float const *d_in_feat, float *d_grad_in_feat,
default_types::size_type const in_nchannel, //
float const *d_grad_out_feat,
default_types::size_type const out_nchannel, //
float const *d_kernel, float *p_grad_kernel, //
gpu_kernel_map<uint32_t, detail::default_allocator<char>> const
&kernel_map, //
default_types::size_type const in_nrows, //
default_types::size_type const out_nrows, //
detail::default_allocator<char> &allocator, //
MinkowskiAlgorithm::Mode const algo_index, //
ConvolutionMode::Type const convolution_mode, cublasHandle_t cuhandle,
cudaStream_t stream);
template void
ConvolutionBackwardKernelGPU<double, uint32_t, detail::default_allocator<char>>(
double const *d_in_feat, double *d_grad_in_feat,
default_types::size_type const in_nchannel, //
double const *d_grad_out_feat,
default_types::size_type const out_nchannel, //
double const *d_kernel, double *p_grad_kernel, //
gpu_kernel_map<uint32_t, detail::default_allocator<char>> const
&kernel_map, //
default_types::size_type const in_nrows, //
default_types::size_type const out_nrows, //
detail::default_allocator<char> &allocator, //
MinkowskiAlgorithm::Mode const algo_index, //
ConvolutionMode::Type const convolution_mode, cublasHandle_t cuhandle,
cudaStream_t stream);
// c10_allocator
template void
ConvolutionBackwardKernelGPU<float, uint32_t, detail::c10_allocator<char>>(
float const *d_in_feat, float *d_grad_in_feat,
default_types::size_type const in_nchannel, //
float const *d_grad_out_feat,
default_types::size_type const out_nchannel, //
float const *d_kernel, float *p_grad_kernel, //
gpu_kernel_map<uint32_t, detail::c10_allocator<char>> const &kernel_map, //
default_types::size_type const in_nrows, //
default_types::size_type const out_nrows, //
detail::c10_allocator<char> &allocator, //
MinkowskiAlgorithm::Mode const algo_index, //
ConvolutionMode::Type const convolution_mode, cublasHandle_t cuhandle,
cudaStream_t stream);
template void
ConvolutionBackwardKernelGPU<double, uint32_t, detail::c10_allocator<char>>(
double const *d_in_feat, double *d_grad_in_feat,
default_types::size_type const in_nchannel, //
double const *d_grad_out_feat,
default_types::size_type const out_nchannel, //
double const *d_kernel, double *p_grad_kernel, //
gpu_kernel_map<uint32_t, detail::c10_allocator<char>> const &kernel_map, //
default_types::size_type const in_nrows, //
default_types::size_type const out_nrows, //
detail::c10_allocator<char> &allocator, //
MinkowskiAlgorithm::Mode const algo_index, //
ConvolutionMode::Type const convolution_mode, cublasHandle_t cuhandle,
cudaStream_t stream);
} // namespace minkowski
#endif // end GPU_CONVOLUTION