// depthsplat/MinkowskiEngine/src/convolution_kernel.cu
/*
* Copyright (c) 2020 NVIDIA Corporation.
* Copyright (c) Chris Choy (chrischoy@ai.stanford.edu).
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Please cite "4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural
* Networks", CVPR'19 (https://arxiv.org/abs/1904.08755) if you use any part
* of the code.
*/
#ifndef GPU_CONVOLUTION
#define GPU_CONVOLUTION
#include <iostream>
// Use torch for GPU memory management. Thrust resize gives a segfault when
// debugging with -g.
#include "allocators.cuh"
#include "convolution_kernel.cuh"
#include "math_functions.cuh"
#include <ATen/cuda/CUDAUtils.h>
#include <torch/extension.h>
namespace minkowski {
namespace detail {
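// Heuristics for choosing between the two convolution strategies:
//   DIRECT_GEMM - launch the custom matmul/matmul2 kernels below, which
//                 gather/scatter through the kernel maps on the fly;
//   COPY_GEMM   - gather rows into dense buffers and call cuBLAS.
// sA and sB are the input and output channel counts and N is the number of
// input rows (see the call sites in ConvolutionForward/BackwardKernelGPU).
// The branch thresholds below appear to be empirically tuned cutoffs,
// presumably from profiling both paths across problem sizes.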
bool check_direct_gemm_forward(MinkowskiAlgorithm::Mode const algo_index, //
ConvolutionMode::Type const &convolution_mode,
long const sA, long const sB, long const N) {
if ((convolution_mode == ConvolutionMode::DIRECT_GEMM) ||
(algo_index == MinkowskiAlgorithm::MEMORY_EFFICIENT))
return true;
if (convolution_mode == ConvolutionMode::COPY_GEMM)
return false;
  if (sA == 32 && sB == 64 && N <= 490537)
    return true;
if (sB <= 40) {
if (sB <= 20) {
return true;
} else {
if (N <= 295625) {
return true;
} else {
return (sA <= 12);
}
}
} else {
if (sA <= 20)
return true;
else {
if (N <= 74556) {
return (sB <= 112);
} else {
return false;
}
}
}
}
bool check_direct_gemm_backward(MinkowskiAlgorithm::Mode const algo_index, //
ConvolutionMode::Type const &convolution_mode,
long const sA, long const sB, long const N) {
if ((convolution_mode == ConvolutionMode::DIRECT_GEMM) ||
(algo_index == MinkowskiAlgorithm::MEMORY_EFFICIENT))
return true;
if (convolution_mode == ConvolutionMode::COPY_GEMM)
return false;
  if (sA == 32 && sB == 64 && N <= 490537)
    return true;
if (sB <= 40) {
if (sA <= 20)
return true;
else {
if (N <= 490540) {
return true;
} else {
return (sA <= 12);
}
}
} else {
if (sA <= 20) {
return true;
} else {
if (N <= 30612) {
return (sB <= 160);
} else {
return false;
}
}
}
}
/**
 * Matrix multiplication (CUDA Kernel) on the device: C = A * B.
 * wA is A's width and wB is B's width. Rows of A are gathered through in_map
 * and the product rows are scattered into C through out_map (with atomicAdd,
 * since several kernel offsets may write to the same output row).
 */
template <typename Dtype, typename Itype, int BLOCK_SIZE>
__global__ void
matmul(const Dtype *__restrict__ A, const int wA, const int hA, //
const Dtype *__restrict__ B, const int wB, const int hB, //
Dtype *__restrict__ C, //
const Itype *__restrict__ in_map, const Itype *__restrict__ out_map) {
// Use in_feat as A and kernel as B
// Block index
const int bx = blockIdx.x;
const int by = blockIdx.y;
// Thread index
const int tx = threadIdx.x;
const int ty = threadIdx.y;
  // Coordinate. y is for rows, x is for columns.
const int x = BLOCK_SIZE * bx + tx;
const int y = BLOCK_SIZE * by + ty;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
Dtype Csub = 0;
const Itype in_row = y < hA ? in_map[y] : 0;
const Itype out_row = y < hA ? out_map[y] : 0;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (int s = 0; s < wA; s += BLOCK_SIZE) {
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__ Dtype As[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__ Dtype Bs[BLOCK_SIZE][BLOCK_SIZE];
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
As[ty][tx] = ((s + tx) < wA && y < hA) ? A[wA * in_row + s + tx] : 0;
Bs[ty][tx] = ((s + ty) < hB && x < wB) ? B[wB * (s + ty) + x] : 0;
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k) {
Csub += As[ty][k] * Bs[k][tx];
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write the block sub-matrix to device memory;
// each thread writes one element
if (y < hA && x < wB)
atomicAdd(&C[wB * out_row + x], Csub);
// C[wB * out_row + x] += Csub;
}
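// A minimal launch sketch for matmul (it mirrors the dispatch in
// ConvolutionForwardKernelGPU below; n_active, d_kernel_offset, d_in_map, and
// d_out_map are illustrative names, and the float/int32_t types are only one
// possible instantiation):
//
//   dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
//   dim3 grid((out_nchannel + BLOCK_SIZE - 1) / BLOCK_SIZE,
//             (n_active + BLOCK_SIZE - 1) / BLOCK_SIZE);
//   matmul<float, int32_t, BLOCK_SIZE><<<grid, threads, 0, stream>>>(
//       d_in_feat, in_nchannel, n_active,            // A, wA, hA
//       d_kernel_offset, out_nchannel, in_nchannel,  // B, wB, hB
//       d_out_feat, d_in_map, d_out_map);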
/**
 * Matrix multiplication (CUDA Kernel) on the device: C = A * B^T, E = D^T * A
 * wA is A's width and wB is B's width
 *
 *              +---+
 *              |B^T|
 *   +----------+---+
 *   |          |   |
 *   |    A     | C |
 *   |          |   |
 *   |          |   |
 *   +----------+---+
 *   |   D^T    | E |
 *   +----------+---+
 *
 */
template <typename Dtype, typename Itype, int BLOCK_SIZE>
__global__ void
matmul2(const Dtype *__restrict__ A, const int wA, const int hA, //
const Dtype *__restrict__ B, const int wB, const int hB, //
const Dtype *__restrict__ D, const int wD, const int hD, //
Dtype *__restrict__ C, Dtype *__restrict__ E,
const Itype *__restrict__ in_map, const Itype *__restrict__ out_map) {
// Use grad_out_feat as A, transposed kernel weight as B, and in_feat as D
// Block index
const int bx = blockIdx.x;
const int by = blockIdx.y;
// Thread index
const int tx = threadIdx.x;
const int ty = threadIdx.y;
// Coordinate. y is for rows, x is for columns.
const int x = BLOCK_SIZE * bx + tx;
const int y = BLOCK_SIZE * by + ty;
const Itype in_row = y < hA ? in_map[y] : 0;
const Itype out_row = y < hA ? out_map[y] : 0;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
Dtype Csub = 0;
Dtype Esub = 0;
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__ Dtype As[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__ Dtype BTs[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Ds used to
// store the sub-matrix of D
__shared__ Dtype DTs[BLOCK_SIZE][BLOCK_SIZE];
// For Ds = D^T[...:..., ...:...], use the transposed grid dimension for A
DTs[ty][tx] = (x < wD && y < hD) ? D[wD * in_row + x] : 0;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (int s = 0; s < wA; s += BLOCK_SIZE) {
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
As[ty][tx] = ((s + tx) < wA && y < hA) ? A[wA * out_row + s + tx] : 0;
// Transposed kernel
BTs[ty][tx] = ((s + ty) < wB && x < hB) ? B[wB * x + s + ty] : 0;
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k) {
Csub += As[ty][k] * BTs[k][tx];
}
// For Esub, reset to 0
Esub = 0;
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k) {
Esub += DTs[k][ty] * As[k][tx];
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
    // The E matrix requires accumulation across multiple blocks, so use
    // atomic addition. This could be replaced with a more sophisticated
    // reduction algorithm.
if ((bx * BLOCK_SIZE + ty) < wD && (s + tx) < wA)
atomicAdd(&E[wA * (bx * BLOCK_SIZE + ty) + (s + tx)], Esub);
}
// Write the block sub-matrix to device memory;
// each thread writes one element
if (y < hA && x < hB)
atomicAdd(&C[hB * in_row + x], Csub);
}
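// In the backward pass, matmul2 fuses both products for one kernel offset:
//   C = A * B^T  ->  d_in_feat  = d_out_feat * W^T (scattered via in_map),
//   E = D^T * A  ->  d_kernel  += in_feat^T * d_out_feat (atomicAdd).
// A is gathered through out_map and D through in_map, so a single launch
// produces both the input gradient and the kernel gradient.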
template <typename Dtype, typename Itype>
__global__ void
add_mapped_output_tr(const size_t n, const Dtype *__restrict__ in_feat,
const size_t in_nchannel, Dtype *__restrict__ out_feat,
const size_t out_nchannel, const Itype *__restrict__ map) {
extern __shared__ Itype map_index[];
// Block index
const int bx = blockIdx.x;
const int by = blockIdx.y;
// Thread index
const int tx = threadIdx.x;
const int ty = threadIdx.y;
// Coordinate. y is for rows, x is for columns.
const int x = blockDim.x * bx + tx;
const int y = blockDim.y * by + ty;
if (x < n && ty == 0)
map_index[tx] = map[x];
__syncthreads();
if (x < n && y < out_nchannel) {
atomicAdd(&out_feat[map_index[tx] * out_nchannel + y],
in_feat[y * in_nchannel + x]);
}
}
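// Note: add_mapped_output_tr reads an n x out_nchannel result out of a
// *transposed* in_feat buffer (in_feat[y * in_nchannel + x] with x over rows)
// and scatter-accumulates it through `map`. It uses dynamic shared memory for
// the map indices, so launches must pass threads.x * sizeof(Itype) as the
// shared-memory size (see the launch in ConvolutionBackwardKernelGPU below).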
} // namespace detail
template <typename Dtype, typename Itype, typename ByteAllocator>
void ConvolutionForwardKernelGPU(
Dtype const *d_in_feat, //
default_types::size_type const in_nchannel, //
Dtype *d_out_feat, //
default_types::size_type const out_nchannel, //
Dtype *d_kernel, gpu_kernel_map<Itype, ByteAllocator> const &kernel_map,
default_types::size_type const in_nrows, //
default_types::size_type const out_nrows, //
ByteAllocator &allocator, //
MinkowskiAlgorithm::Mode const algo_index, //
ConvolutionMode::Type const convolution_mode, //
cublasHandle_t cuhandle, cudaStream_t stream) {
size_t n_active_in_volume, shared_mem_size = -1;
if (detail::check_direct_gemm_forward(algo_index, convolution_mode,
in_nchannel, out_nchannel, in_nrows)) {
    // Pick the tile edge; shared_mem_size is used both as the BLOCK_SIZE
    // template argument and as the square thread-block edge below.
if ((in_nchannel > 16 && out_nchannel > 16 &&
in_nchannel * out_nchannel >= 512) ||
(in_nchannel > 24 && out_nchannel > 24))
shared_mem_size = 32;
else if (in_nchannel % 24 == 0 && out_nchannel % 24 == 0)
shared_mem_size = 24;
else if ((in_nchannel > 8 && out_nchannel > 8) ||
(in_nchannel % 16 == 0 && out_nchannel % 16 == 0))
shared_mem_size = 16;
else
shared_mem_size = 8;
dim3 threads(shared_mem_size, shared_mem_size);
// Iterate through each spatial kernel and get indices for in_map and
// out_map
for (auto it = kernel_map.key_cbegin(); it != kernel_map.key_cend(); ++it) {
auto const k = it->first;
n_active_in_volume = kernel_map.size(k);
if (n_active_in_volume == 0)
continue;
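      // Cap the grid: when n_active_in_volume needs more than MAX_GRID blocks
      // along y, split the map into num_div chunks of at most `step` entries
      // and launch the kernel once per chunk.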
size_t const num_grid =
(n_active_in_volume + shared_mem_size - 1) / shared_mem_size;
size_t const num_div = (num_grid + MAX_GRID - 1) / MAX_GRID;
size_t const step = (n_active_in_volume + num_div - 1) / num_div;
for (size_t s = 0; s < num_div; s++) {
size_t const offset = step * s;
size_t const remainder = n_active_in_volume - offset;
size_t const curr_num_active = remainder < step ? remainder : step;
dim3 const grid((out_nchannel + threads.x - 1) / threads.x,
(curr_num_active + threads.y - 1) / threads.y);
// copy in out map
#ifdef DEBUG
/*
size_t map_size = curr_num_active;
Itype *p_kernel_map = (Itype *)std::malloc(map_size * 3 *
sizeof(Itype)); CUDA_CHECK(cudaMemcpy(p_kernel_map,
kernel_map.kernels.begin(k), map_size * sizeof(Itype),
cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(p_kernel_map + 1 * map_size,
kernel_map.in_maps.begin(k),
map_size * sizeof(Itype),
cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(p_kernel_map + 2 *
map_size, kernel_map.out_maps.begin(k), map_size * sizeof(Itype),
cudaMemcpyDeviceToHost));
for (size_t i = curr_num_active - 20; i < curr_num_active; ++i) {
std::cout << p_kernel_map[i + 0 * map_size] << ":"
<< p_kernel_map[i + 1 * map_size] << "->"
<< p_kernel_map[i + 2 * map_size] << "\n";
}
CUDA_CHECK(cudaDeviceSynchronize());
std::free(p_kernel_map);
*/
#endif
switch (shared_mem_size) {
case 32:
detail::matmul<Dtype, Itype, 32><<<grid, threads, 0, stream>>>(
d_in_feat, in_nchannel, curr_num_active,
&d_kernel[k * in_nchannel * out_nchannel], out_nchannel,
in_nchannel, d_out_feat, kernel_map.in_maps.begin(k) + offset,
kernel_map.out_maps.begin(k) + offset);
break;
case 24:
detail::matmul<Dtype, Itype, 24><<<grid, threads, 0, stream>>>(
d_in_feat, in_nchannel, curr_num_active,
&d_kernel[k * in_nchannel * out_nchannel], out_nchannel,
in_nchannel, d_out_feat, kernel_map.in_maps.begin(k) + offset,
kernel_map.out_maps.begin(k) + offset);
break;
case 16:
detail::matmul<Dtype, Itype, 16><<<grid, threads, 0, stream>>>(
d_in_feat, in_nchannel, curr_num_active,
&d_kernel[k * in_nchannel * out_nchannel], out_nchannel,
in_nchannel, d_out_feat, kernel_map.in_maps.begin(k) + offset,
kernel_map.out_maps.begin(k) + offset);
break;
case 8:
detail::matmul<Dtype, Itype, 8><<<grid, threads, 0, stream>>>(
d_in_feat, in_nchannel, curr_num_active,
&d_kernel[k * in_nchannel * out_nchannel], out_nchannel,
in_nchannel, d_out_feat, kernel_map.in_maps.begin(k) + offset,
kernel_map.out_maps.begin(k) + offset);
break;
}
}
#ifdef DEBUG
LOG_DEBUG("k:", k, "num:", n_active_in_volume);
CUDA_CHECK(cudaDeviceSynchronize());
#endif
CUDA_CHECK(cudaGetLastError());
}
} else { // copy gemm
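    // COPY_GEMM path: gather the active input rows into a dense buffer, run
    // one cuBLAS GEMM per kernel offset, then scatter-accumulate the dense
    // product into the output features.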
Itype const max_numel = kernel_map.max_size();
LOG_DEBUG("max_numel:", max_numel);
Dtype *mapped_in_feat = reinterpret_cast<Dtype *>(
allocator.allocate(max_numel * in_nchannel * sizeof(Dtype)));
Dtype *mapped_out_feat = reinterpret_cast<Dtype *>(
allocator.allocate(max_numel * out_nchannel * sizeof(Dtype)));
for (auto it = kernel_map.key_cbegin(); it != kernel_map.key_cend(); ++it) {
auto const k = it->first;
n_active_in_volume = kernel_map.size(k);
if (n_active_in_volume == 0)
continue;
LOG_DEBUG(n_active_in_volume * in_nchannel, in_nchannel);
detail::shared_copy_kernel_map<Dtype, Itype>(
mapped_in_feat, d_in_feat, kernel_map.in_maps.begin(k),
n_active_in_volume * in_nchannel, in_nchannel);
#ifdef DEBUG
/*
size_t map_size = std::min((size_t)n_active_in_volume, (size_t)100);
Dtype *p_tmp =
(Dtype *)std::malloc(map_size * in_nchannel * sizeof(Dtype));
CUDA_CHECK(cudaMemcpy(p_tmp, mapped_in_feat,
map_size * in_nchannel * sizeof(Dtype),
cudaMemcpyDeviceToHost));
for (size_t i = 0; i < map_size; ++i) {
std::cout << PtrToString(&p_tmp[i * in_nchannel], in_nchannel) << "\n";
}
CUDA_CHECK(cudaDeviceSynchronize());
std::free(p_tmp);
*/
#endif
gpu_gemm<Dtype>(cuhandle, CblasNoTrans, CblasNoTrans,
n_active_in_volume, // M
out_nchannel, // N
in_nchannel, // K
1, // alpha
mapped_in_feat, // A
&d_kernel[k * in_nchannel * out_nchannel], // B
0, // beta
mapped_out_feat // C
);
detail::shared_accumulate_kernel_map<Dtype, Itype>(
d_out_feat, mapped_out_feat, kernel_map.out_maps.begin(k),
n_active_in_volume * out_nchannel, out_nchannel);
}
allocator.deallocate((char *)mapped_in_feat,
max_numel * in_nchannel * sizeof(Dtype));
allocator.deallocate((char *)mapped_out_feat,
max_numel * out_nchannel * sizeof(Dtype));
}
CUDA_CHECK(cudaStreamSynchronize(stream));
}
// default_allocator
template void
ConvolutionForwardKernelGPU<float, uint32_t, detail::default_allocator<char>>(
float const *d_in_feat, default_types::size_type const in_nchannel,
float *d_out_feat, default_types::size_type const out_nchannel,
float *d_kernel,
gpu_kernel_map<uint32_t, detail::default_allocator<char>> const &kernel_map,
default_types::size_type const in_nrows, //
default_types::size_type const out_nrows,
detail::default_allocator<char> &allocator, //
MinkowskiAlgorithm::Mode const algo_index, //
ConvolutionMode::Type const convolution_mode, cublasHandle_t cuhandle,
cudaStream_t stream);
template void
ConvolutionForwardKernelGPU<double, uint32_t, detail::default_allocator<char>>(
double const *d_in_feat, default_types::size_type const in_nchannel,
double *d_out_feat, default_types::size_type const out_nchannel,
double *d_kernel,
gpu_kernel_map<uint32_t, detail::default_allocator<char>> const &kernel_map,
default_types::size_type const in_nrows, //
default_types::size_type const out_nrows,
detail::default_allocator<char> &allocator, //
MinkowskiAlgorithm::Mode const algo_index, //
ConvolutionMode::Type const convolution_mode, cublasHandle_t cuhandle,
cudaStream_t stream);
// c10_allocator
template void
ConvolutionForwardKernelGPU<float, uint32_t, detail::c10_allocator<char>>(
float const *d_in_feat, default_types::size_type const in_nchannel,
float *d_out_feat, default_types::size_type const out_nchannel,
float *d_kernel,
gpu_kernel_map<uint32_t, detail::c10_allocator<char>> const &kernel_map,
default_types::size_type const in_nrows, //
default_types::size_type const out_nrows,
detail::c10_allocator<char> &allocator, //
MinkowskiAlgorithm::Mode const algo_index, //
ConvolutionMode::Type const convolution_mode, cublasHandle_t cuhandle,
cudaStream_t stream);
template void
ConvolutionForwardKernelGPU<double, uint32_t, detail::c10_allocator<char>>(
double const *d_in_feat, default_types::size_type const in_nchannel,
double *d_out_feat, default_types::size_type const out_nchannel,
double *d_kernel,
gpu_kernel_map<uint32_t, detail::c10_allocator<char>> const &kernel_map,
default_types::size_type const in_nrows, //
default_types::size_type const out_nrows,
detail::c10_allocator<char> &allocator, //
MinkowskiAlgorithm::Mode const algo_index, //
ConvolutionMode::Type const convolution_mode, cublasHandle_t cuhandle,
cudaStream_t stream);
// Backward
template <typename Dtype, typename Itype, typename ByteAllocator>
void ConvolutionBackwardKernelGPU(
Dtype const *d_in_feat, //
Dtype *d_grad_in_feat, default_types::size_type const in_nchannel, //
Dtype const *d_grad_out_feat,
default_types::size_type const out_nchannel, //
Dtype const *d_kernel, Dtype *d_grad_kernel, //
gpu_kernel_map<Itype, ByteAllocator> const &kernel_map, //
default_types::size_type const in_nrows, //
default_types::size_type const out_nrows, //
ByteAllocator &allocator, //
MinkowskiAlgorithm::Mode const algo_index, //
ConvolutionMode::Type const convolution_mode, cublasHandle_t cuhandle,
cudaStream_t stream) {
#ifdef DEBUG
CUDA_CHECK_ARGS(cudaDeviceSynchronize(),
"Error triggered from a previous kernel call.");
#endif
size_t n_active_in_volume, shared_mem_size = -1;
  // Pick the tile edge used for the thread-block dimensions below.
if ((in_nchannel > 16 && out_nchannel > 16 &&
in_nchannel * out_nchannel >= 512) ||
(in_nchannel % 32 == 0 && out_nchannel % 32 == 0))
shared_mem_size = 32;
else if (in_nchannel % 24 == 0 && out_nchannel % 24 == 0)
shared_mem_size = 24;
else if ((in_nchannel > 8 && out_nchannel > 8) ||
(in_nchannel % 16 == 0 && out_nchannel % 16 == 0))
shared_mem_size = 16;
else
shared_mem_size = 8;
if (!detail::check_direct_gemm_backward(
algo_index, convolution_mode, in_nchannel, out_nchannel, in_nrows)) {
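    // COPY_GEMM backward. Per kernel offset k:
    //   1. gather d_grad_out_feat rows into d_output_buffer (dY);
    //   2. compute dX^T = W * dY^T via cuBLAS, then scatter-accumulate it
    //      into d_grad_in_feat with add_mapped_output_tr;
    //   3. gather in_feat rows (X) and accumulate dW += X^T * dY.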
// find max size
size_t max_active = kernel_map.max_size();
size_t in_buffer_size = max_active * in_nchannel * sizeof(Dtype);
size_t out_buffer_size = max_active * out_nchannel * sizeof(Dtype);
Dtype *d_input_buffer = (Dtype *)allocator.allocate(in_buffer_size);
Dtype *d_output_buffer = (Dtype *)allocator.allocate(out_buffer_size);
dim3 threads(32, shared_mem_size);
#ifdef DEBUG
timer t;
#endif
for (auto it = kernel_map.key_cbegin(); it != kernel_map.key_cend(); ++it) {
auto const k = it->first;
n_active_in_volume = kernel_map.size(k);
if (n_active_in_volume == 0)
continue;
// Copy (*p_in_maps)[k] to GPU
Itype const *d_in_map = kernel_map.in_maps.begin(k);
Itype const *d_out_map = kernel_map.out_maps.begin(k);
// Copy gradients to the buffer
#ifdef DEBUG
t.tic();
#endif
detail::shared_copy_kernel_map<Dtype, Itype>(
d_output_buffer, d_grad_out_feat, d_out_map,
n_active_in_volume * out_nchannel, out_nchannel);
#ifdef DEBUG
CUDA_CHECK(cudaStreamSynchronize(stream));
LOG_DEBUG("copy input", t.toc());
t.tic();
#endif
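      // dX^T = W * dY^T: with row-major shapes W (in_nchannel x out_nchannel)
      // and dY (n_active x out_nchannel), the product is
      // in_nchannel x n_active, i.e. d_input_buffer holds the *transposed*
      // input gradient; hence the transposed scatter below.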
gpu_gemm<Dtype>(cuhandle, CblasNoTrans, CblasTrans,
in_nchannel, // M
n_active_in_volume, // N
out_nchannel, // K
1, // alpha
&d_kernel[k * in_nchannel * out_nchannel], // A
d_output_buffer, // B
0, // beta
d_input_buffer // C
);
#ifdef DEBUG
CUDA_CHECK(cudaStreamSynchronize(0));
LOG_DEBUG("input grad gemm", t.toc());
t.tic();
#endif
// Accumulate gradients back to the input grad feat
// Put it back to the correct index
dim3 const grid_tr(GET_BLOCKS(n_active_in_volume, threads.x),
GET_BLOCKS(in_nchannel, threads.y));
      detail::add_mapped_output_tr<Dtype, Itype>
          <<<grid_tr, threads, threads.x * sizeof(Itype), stream>>>(
              n_active_in_volume,          // n (active map entries)
              d_input_buffer,              // in_feat (transposed buffer)
              n_active_in_volume,          // in_nchannel (row stride of the
                                           //   transposed buffer)
              d_grad_in_feat, in_nchannel, // out_feat, out_nchannel
              d_in_map);                   // map
#ifdef DEBUG
CUDA_CHECK(cudaStreamSynchronize(stream));
LOG_DEBUG("accumulate in grad", t.toc());
t.tic();
#endif
// Compute gradient for kernel
// Copy features to the buffer
      detail::shared_copy_kernel_map<Dtype, Itype>(
          d_input_buffer, d_in_feat, d_in_map, n_active_in_volume * in_nchannel,
          in_nchannel);
#ifdef DEBUG
LOG_DEBUG("copy in feat to buffer", t.toc());
t.tic();
#endif
// sync before the blas call
CUDA_CHECK(cudaStreamSynchronize(stream));
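      // dW += X^T * dY: beta = 1 accumulates into the k-th slice of
      // d_grad_kernel (assumed zero-initialized by the caller) across the
      // chunks of this loop.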
gpu_gemm<Dtype>(cuhandle, CblasTrans, CblasNoTrans,
in_nchannel, // M
out_nchannel, // N
n_active_in_volume, // K
1, // alpha
d_input_buffer, // A
d_output_buffer, // B
1, // beta
&d_grad_kernel[k * in_nchannel * out_nchannel] // C
);
CUDA_CHECK(cudaStreamSynchronize(0));
#ifdef DEBUG
LOG_DEBUG("grad kernel gemm", t.toc());
t.tic();
#endif
}
allocator.deallocate((char *)d_input_buffer, in_buffer_size);
allocator.deallocate((char *)d_output_buffer, out_buffer_size);
} else {
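    // DIRECT_GEMM backward: a single fused matmul2 launch per chunk computes
    // both the input gradient (C) and the kernel gradient (E).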
dim3 threads(shared_mem_size, shared_mem_size);
for (auto it = kernel_map.key_cbegin(); it != kernel_map.key_cend(); ++it) {
auto const k = it->first;
n_active_in_volume = kernel_map.size(k);
if (n_active_in_volume == 0)
continue;
size_t const num_grid =
(n_active_in_volume + shared_mem_size - 1) / shared_mem_size;
size_t const num_div = (num_grid + MAX_GRID - 1) / MAX_GRID;
size_t const step = (n_active_in_volume + num_div - 1) / num_div;
      for (size_t s = 0; s < num_div; s++) {
size_t const offset = step * s;
size_t const remainder = n_active_in_volume - offset;
size_t const curr_num_active = remainder < step ? remainder : step;
dim3 const grid((in_nchannel + threads.x - 1) / threads.x,
(curr_num_active + threads.y - 1) / threads.y);
switch (shared_mem_size) {
case 32:
detail::matmul2<Dtype, Itype, 32><<<grid, threads, 0, stream>>>(
d_grad_out_feat, out_nchannel, curr_num_active, // A
&d_kernel[k * in_nchannel * out_nchannel], out_nchannel,
in_nchannel, // B
d_in_feat, in_nchannel, curr_num_active, // D
d_grad_in_feat, // C
&d_grad_kernel[k * in_nchannel * out_nchannel], // E
kernel_map.in_maps.begin(k) + offset,
kernel_map.out_maps.begin(k) + offset);
break;
case 24:
detail::matmul2<Dtype, Itype, 24><<<grid, threads, 0, stream>>>(
d_grad_out_feat, out_nchannel, curr_num_active, // A
&d_kernel[k * in_nchannel * out_nchannel], out_nchannel,
in_nchannel, // B
d_in_feat, in_nchannel, curr_num_active, // D
d_grad_in_feat, // C
&d_grad_kernel[k * in_nchannel * out_nchannel], // E
kernel_map.in_maps.begin(k) + offset,
kernel_map.out_maps.begin(k) + offset);
break;
case 16:
detail::matmul2<Dtype, Itype, 16><<<grid, threads, 0, stream>>>(
d_grad_out_feat, out_nchannel, curr_num_active, // A
&d_kernel[k * in_nchannel * out_nchannel], out_nchannel,
in_nchannel, // B
d_in_feat, in_nchannel, curr_num_active, // D
d_grad_in_feat, // C
&d_grad_kernel[k * in_nchannel * out_nchannel], // E
kernel_map.in_maps.begin(k) + offset,
kernel_map.out_maps.begin(k) + offset);
break;
case 8:
detail::matmul2<Dtype, Itype, 8><<<grid, threads, 0, stream>>>(
d_grad_out_feat, out_nchannel, curr_num_active, // A
&d_kernel[k * in_nchannel * out_nchannel], out_nchannel,
in_nchannel, // B
d_in_feat, in_nchannel, curr_num_active, // D
d_grad_in_feat, // C
&d_grad_kernel[k * in_nchannel * out_nchannel], // E
kernel_map.in_maps.begin(k) + offset,
kernel_map.out_maps.begin(k) + offset);
break;
}
}
CUDA_CHECK(cudaGetLastError());
}
CUDA_CHECK(cudaStreamSynchronize(stream));
}
}
// default_allocator
template void
ConvolutionBackwardKernelGPU<float, uint32_t, detail::default_allocator<char>>(
float const *d_in_feat, float *d_grad_in_feat,
default_types::size_type const in_nchannel, //
float const *d_grad_out_feat,
default_types::size_type const out_nchannel, //
float const *d_kernel, float *p_grad_kernel, //
gpu_kernel_map<uint32_t, detail::default_allocator<char>> const
&kernel_map, //
default_types::size_type const in_nrows, //
default_types::size_type const out_nrows, //
detail::default_allocator<char> &allocator, //
MinkowskiAlgorithm::Mode const algo_index, //
ConvolutionMode::Type const convolution_mode, cublasHandle_t cuhandle,
cudaStream_t stream);
template void
ConvolutionBackwardKernelGPU<double, uint32_t, detail::default_allocator<char>>(
double const *d_in_feat, double *d_grad_in_feat,
default_types::size_type const in_nchannel, //
double const *d_grad_out_feat,
default_types::size_type const out_nchannel, //
double const *d_kernel, double *p_grad_kernel, //
gpu_kernel_map<uint32_t, detail::default_allocator<char>> const
&kernel_map, //
default_types::size_type const in_nrows, //
default_types::size_type const out_nrows, //
detail::default_allocator<char> &allocator, //
MinkowskiAlgorithm::Mode const algo_index, //
ConvolutionMode::Type const convolution_mode, cublasHandle_t cuhandle,
cudaStream_t stream);
// c10_allocator
template void
ConvolutionBackwardKernelGPU<float, uint32_t, detail::c10_allocator<char>>(
float const *d_in_feat, float *d_grad_in_feat,
default_types::size_type const in_nchannel, //
float const *d_grad_out_feat,
default_types::size_type const out_nchannel, //
float const *d_kernel, float *p_grad_kernel, //
gpu_kernel_map<uint32_t, detail::c10_allocator<char>> const &kernel_map, //
default_types::size_type const in_nrows, //
default_types::size_type const out_nrows, //
detail::c10_allocator<char> &allocator, //
MinkowskiAlgorithm::Mode const algo_index, //
ConvolutionMode::Type const convolution_mode, cublasHandle_t cuhandle,
cudaStream_t stream);
template void
ConvolutionBackwardKernelGPU<double, uint32_t, detail::c10_allocator<char>>(
double const *d_in_feat, double *d_grad_in_feat,
default_types::size_type const in_nchannel, //
double const *d_grad_out_feat,
default_types::size_type const out_nchannel, //
double const *d_kernel, double *p_grad_kernel, //
gpu_kernel_map<uint32_t, detail::c10_allocator<char>> const &kernel_map, //
default_types::size_type const in_nrows, //
default_types::size_type const out_nrows, //
detail::c10_allocator<char> &allocator, //
MinkowskiAlgorithm::Mode const algo_index, //
ConvolutionMode::Type const convolution_mode, cublasHandle_t cuhandle,
cudaStream_t stream);
} // namespace minkowski
#endif // end GPU_CONVOLUTION