|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#ifndef GPU_POOLING_AVG |
|
|
#define GPU_POOLING_AVG |
|
|
|
|
|
#include <cusparse.h> |
|
|
#include <limits> |
|
|
|
|
|
#include <thrust/execution_policy.h> |
|
|
#include <thrust/fill.h> |
|
|
#include <thrust/host_vector.h> |
|
|
|
|
|
#include <thrust/functional.h> |
|
|
#include <thrust/iterator/discard_iterator.h> |
|
|
#include <thrust/reduce.h> |
|
|
#include <thrust/sort.h> |
|
|
|
|
|
#include "allocators.cuh" |
|
|
#include "pooling_avg_kernel.cuh" |
|
|
#include "utils.hpp" |
|
|
|
|
|
namespace minkowski { |
|
|
|
|
|
template <typename Dtype>
__global__ void fill(const int n, Dtype *in_feat, Dtype val) {
  // Write `val` into every element of in_feat[0..n) with a grid-stride loop
  // so any launch configuration covers the whole range.
  const int stride = blockDim.x * gridDim.x;
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n;
       idx += stride) {
    in_feat[idx] = val;
  }
}
|
|
|
|
|
template <typename Dtype>
__global__ void col2row_major(const int n, const int nrows, const int ncols,
                              const Dtype *colA, Dtype *rowA) {
  // Convert a column-major (nrows x ncols) matrix to row-major layout.
  // n is the total element count; each thread moves one element per step.
  const int stride = blockDim.x * gridDim.x;
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n;
       idx += stride) {
    const int row = idx % nrows; // row index in the logical matrix
    const int col = idx / nrows; // column index in the logical matrix
    rowA[row * ncols + col] = colA[idx];
  }
}
|
|
|
|
|
template <typename Dtype>
__global__ void col2row_major_with_div(const int n, const int nrows,
                                       const int ncols,
                                       const Dtype *num_nonzero,
                                       const Dtype *colA, Dtype *rowA) {
  // Same layout conversion as col2row_major, but each element is also
  // divided by its row's count num_nonzero[row] (per-row averaging).
  // Rows with a count below 1 are copied unscaled to avoid dividing by zero.
  const int stride = blockDim.x * gridDim.x;
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n;
       idx += stride) {
    const int row = idx % nrows;
    const int col = idx / nrows;
    const Dtype count = num_nonzero[row];
    rowA[row * ncols + col] = (count >= 1) ? (colA[idx] / count) : colA[idx];
  }
}
|
|
|
|
|
template <typename Itype, typename Dtype>
__global__ void
unique_row2num_nonzero(const int n, Dtype *__restrict__ d_num_nonzero,
                       const Itype *__restrict__ unique_row_ptr,
                       const Dtype *__restrict__ reduced_val_ptr) {
  // Scatter per-row reduced values back to their row slots:
  //   d_num_nonzero[unique_row_ptr[k]] = reduced_val_ptr[k]  for k in [0, n).
  const int stride = blockDim.x * gridDim.x;
  for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < n; k += stride) {
    d_num_nonzero[unique_row_ptr[k]] = reduced_val_ptr[k];
  }
}
|
|
|
|
|
template <typename Dtype, typename Itype>
__global__ void set_gradient(const int n, const Dtype *d_grad_out,
                             Dtype *d_grad_in, const Itype *out_index,
                             int nchannel) {
  // Scatter-add each output gradient into the input-gradient slot given by
  // out_index. atomicAdd is required since multiple k may map to one slot.
  // `nchannel` is not used here; it is kept for signature parity with the
  // other gradient kernels.
  const int stride = blockDim.x * gridDim.x;
  for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < n; k += stride) {
    atomicAdd(&d_grad_in[out_index[k]], d_grad_out[k]);
  }
}
|
|
|
|
|
template <typename Dtype, typename Itype>
__global__ void
set_gradient_nonzero(const int n, const Dtype *d_grad_out, Dtype *d_grad_in,
                     int nchannel, const Itype *in_map, const Itype *out_map) {
  // For each (map entry, channel) pair, accumulate the gradient of output
  // row out_map[row] into input row in_map[row]. n = #entries * nchannel.
  const int stride = blockDim.x * gridDim.x;
  for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < n; k += stride) {
    const int row = k / nchannel; // kernel-map entry index
    const int ch = k % nchannel;  // channel within the row
    atomicAdd(&d_grad_in[in_map[row] * nchannel + ch],
              d_grad_out[out_map[row] * nchannel + ch]);
  }
}
|
|
|
|
|
template <typename Dtype, typename Itype>
__global__ void
set_gradient_nonzero_avg(const int n, const Dtype *d_grad_out, Dtype *d_grad_in,
                         int nchannel, const Dtype *d_num_nonzero,
                         const Itype *in_map, const Itype *out_map) {
  // Average-pooling backward: accumulate each output gradient divided by the
  // number of inputs that contributed to that output row. Entries whose
  // count is below 1 contribute nothing (guards the division).
  const int stride = blockDim.x * gridDim.x;
  for (int k = blockIdx.x * blockDim.x + threadIdx.x; k < n; k += stride) {
    const int row = k / nchannel;
    const int ch = k % nchannel;
    // Count stored as Dtype; truncation to int mirrors the integer compare.
    const int count = static_cast<int>(d_num_nonzero[out_map[row]]);
    if (count >= 1) {
      atomicAdd(&d_grad_in[in_map[row] * nchannel + ch],
                d_grad_out[out_map[row] * nchannel + ch] / count);
    }
  }
}
|
|
|
|
|
template <typename Dtype, typename Itype, typename ByteAllocator>
void NonzeroAvgPoolingForwardKernelGPU(
    Dtype const *d_in_feat, default_types::size_type const in_nrows,
    Dtype *d_out_feat, default_types::size_type const out_nrows,
    Dtype *d_num_nonzero, default_types::size_type const nchannel,
    gpu_kernel_map<Itype, ByteAllocator> const &kernel_map,
    bool const use_avg, ByteAllocator &allocator, cusparseHandle_t cushandle,
    cudaStream_t stream) {
  // Computes d_out_feat = A * d_in_feat via cusparse SpMM, where A is the
  // (out_nrows x in_nrows) COO matrix built from the kernel map with all
  // values set to 1. When use_avg is true, each output row is additionally
  // divided by its number of contributing inputs, and those counts are
  // written to d_num_nonzero.
  //
  // NOTE(review): rows of d_num_nonzero that receive no kernel-map entry are
  // not written here — presumably the caller zero-initializes the buffer;
  // confirm before relying on its full contents.
  const Dtype alpha = 1;
  const Dtype beta = 0;
  static_assert(sizeof(Itype) == sizeof(int),
                "cusparse requires int type index");
  Dtype *d_ones, *d_coo_val, *d_tmp_out_feat;

  constexpr bool is_int32 = sizeof(Itype) == sizeof(int32_t);
  constexpr bool is_int64 = sizeof(Itype) == sizeof(int64_t);
  constexpr bool is_float32 = std::is_same<Dtype, float>::value;
  cudaDataType cuda_data_type = is_float32 ? CUDA_R_32F : CUDA_R_64F;

  // Select the SpMM algorithm supported by the toolkit this file is built
  // against; COO SpMM requires CUDA 10.1+.
  cusparseSpMMAlg_t mm_alg;
#if defined(CUDART_VERSION) && (CUDART_VERSION < 10010)
  ASSERT(false, "spmm sparse-dense requires CUDA 10.1 or greater");
#elif defined(CUDART_VERSION) && (CUDART_VERSION >= 10010) &&                  \
    (CUDART_VERSION < 11000)
  mm_alg = CUSPARSE_COOMM_ALG1;
  static_assert(is_int32, "int64 cusparseSpMM requires CUDA 11.1 or greater");
#elif defined(CUDART_VERSION) && (CUDART_VERSION >= 11000)
  mm_alg = CUSPARSE_SPMM_COO_ALG1;
  static_assert(is_int32 || is_int64, "Invalid index type");
#endif

  // One nonzero per kernel-map entry.
  size_t const sparse_nnzs =
      kernel_map.in_maps.end() - kernel_map.in_maps.begin();
  static_assert(is_int32, "sort_coo supports int32");
  // Sort the kernel map in place by output row so it forms valid sorted COO.
  sort_coo_gpu<ByteAllocator>(cushandle, out_nrows, in_nrows, sparse_nnzs,
                              (int *)kernel_map.out_maps.begin(),
                              (int *)kernel_map.in_maps.begin(), allocator);

  // Temporary column-major SpMM output and the all-ones COO values.
  d_tmp_out_feat =
      (Dtype *)allocator.allocate(nchannel * out_nrows * sizeof(Dtype));
  d_coo_val = (Dtype *)allocator.allocate(sparse_nnzs * sizeof(Dtype));
  fill<Dtype><<<GET_BLOCKS(sparse_nnzs, CUDA_NUM_THREADS), CUDA_NUM_THREADS, 0,
                stream>>>(sparse_nnzs, d_coo_val, (Dtype)1.);
  if (use_avg) {
    // Per-entry ones, consumed by reduce_by_key below to count entries/row.
    d_ones = (Dtype *)allocator.allocate(sparse_nnzs * sizeof(Dtype));
    fill<Dtype><<<GET_BLOCKS(sparse_nnzs, CUDA_NUM_THREADS), CUDA_NUM_THREADS,
                  0, stream>>>(sparse_nnzs, d_ones, (Dtype)1.);
  }

#ifdef DEBUG
  std::cout << "sparse_nnzs: " << sparse_nnzs << "\n";
  Itype *p_scr = (Itype *)std::malloc((sparse_nnzs)*2 * sizeof(Itype));
  CUDA_CHECK(cudaMemcpy(p_scr, kernel_map.out_maps.begin(),
                        sparse_nnzs * sizeof(Itype), cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaMemcpy(p_scr + sparse_nnzs, kernel_map.in_maps.begin(),
                        sparse_nnzs * sizeof(Itype), cudaMemcpyDeviceToHost));

  Itype step = std::max<Itype>(sparse_nnzs / 100, 1);
  Itype i = 0;
  for (; i < sparse_nnzs;) {
    std::cout << i;
    std::cout << " out_map: " << p_scr[i]
              << ", in_map: " << p_scr[i + sparse_nnzs] << "\n";
    i += step;
  }
  i -= step;
  for (; i < sparse_nnzs; ++i) {
    std::cout << i;
    std::cout << " out_map: " << p_scr[i]
              << ", in_map: " << p_scr[i + sparse_nnzs] << "\n";
  }
  std::free(p_scr);
  std::cout << "done printing\n";
#endif

  // Private (row, col) copy sorted by row: cusparse COO requires sorted rows
  // and the averaging path reduces over the sorted keys.
  Itype *sorted_row_ptr =
      (Itype *)allocator.allocate(2 * (sparse_nnzs + 1) * sizeof(Itype));
  Itype *sorted_col_ptr = sorted_row_ptr + sparse_nnzs + 1;

  CUDA_CHECK(cudaMemcpy(sorted_row_ptr, kernel_map.out_maps.begin(),
                        sparse_nnzs * sizeof(Itype),
                        cudaMemcpyDeviceToDevice));
  CUDA_CHECK(cudaMemcpy(sorted_col_ptr, kernel_map.in_maps.begin(),
                        sparse_nnzs * sizeof(Itype),
                        cudaMemcpyDeviceToDevice));

  THRUST_CHECK(thrust::sort_by_key(thrust::device, sorted_row_ptr,
                                   sorted_row_ptr + sparse_nnzs,
                                   sorted_col_ptr));

  size_t dim_i = out_nrows, dim_j = in_nrows, dim_k = nchannel;
  cusparseSpMatDescr_t sparse_descr;
  cusparseDnMatDescr_t dense_descr;
  cusparseDnMatDescr_t result_descr;
  CUSPARSE_CHECK(cusparseCreateCoo(
      &sparse_descr, dim_i, dim_j, sparse_nnzs, sorted_row_ptr, sorted_col_ptr,
      d_coo_val, is_int32 ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I,
      CUSPARSE_INDEX_BASE_ZERO, cuda_data_type));

  // d_in_feat is row-major (in_nrows x nchannel); expose it as column-major
  // (nchannel x in_nrows) and let SpMM transpose it.
  CUSPARSE_CHECK(cusparseCreateDnMat(&dense_descr, dim_k, dim_j, dim_k,
                                     (void *)d_in_feat, cuda_data_type,
                                     CUSPARSE_ORDER_COL));

  CUSPARSE_CHECK(cusparseCreateDnMat(&result_descr, dim_i, dim_k, dim_i,
                                     (void *)d_tmp_out_feat, cuda_data_type,
                                     CUSPARSE_ORDER_COL));

  size_t buffer_size = 0;
  CUSPARSE_CHECK(cusparseSpMM_bufferSize(
      cushandle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
      (void *)&alpha, sparse_descr, dense_descr, (void *)&beta, result_descr,
      cuda_data_type, mm_alg, &buffer_size));

  // BUGFIX: cusparseSpMM's last argument is a DEVICE workspace of
  // buffer_size bytes. Previously the HOST address of buffer_size itself was
  // passed, which is invalid whenever the algorithm needs a non-empty
  // buffer. Allocate a proper workspace (skipped when none is required).
  void *d_spmm_buffer = nullptr;
  if (buffer_size > 0)
    d_spmm_buffer = (void *)allocator.allocate(buffer_size);

  CUSPARSE_CHECK(cusparseSpMM(cushandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                              CUSPARSE_OPERATION_TRANSPOSE, (void *)&alpha,
                              sparse_descr, dense_descr, (void *)&beta,
                              result_descr, cuda_data_type, mm_alg,
                              d_spmm_buffer));
#ifdef DEBUG
  CUDA_CHECK(cudaStreamSynchronize(0));
#endif
  LOG_DEBUG("SPMM");

  if (use_avg) {
    Itype *unique_row_ptr =
        (Itype *)allocator.allocate(sparse_nnzs * sizeof(Itype));
    Dtype *reduced_val_ptr =
        (Dtype *)allocator.allocate(sparse_nnzs * sizeof(Dtype));

    // Count entries per output row by reducing the all-ones values over the
    // sorted row keys.
    int num_unique_keys;
    try {
      auto end = thrust::reduce_by_key(thrust::device, sorted_row_ptr,
                                       sorted_row_ptr + sparse_nnzs, d_ones,
                                       unique_row_ptr, reduced_val_ptr);
      num_unique_keys = end.first - unique_row_ptr;
      LOG_DEBUG("Num unique keys:", num_unique_keys);
    } THRUST_CATCH;

#ifdef DEBUG
    Itype *p_unique_row = (Itype *)std::malloc(num_unique_keys * sizeof(Itype));
    CUDA_CHECK(cudaMemcpy(p_unique_row, unique_row_ptr,
                          num_unique_keys * sizeof(Itype),
                          cudaMemcpyDeviceToHost));
    std::cout << "[" << PtrToString(p_unique_row, num_unique_keys) << "]\n";
    std::free(p_unique_row);

    Dtype *p_reduced_val =
        (Dtype *)std::malloc(num_unique_keys * sizeof(Dtype));
    CUDA_CHECK(cudaMemcpy(p_reduced_val, reduced_val_ptr,
                          num_unique_keys * sizeof(Dtype),
                          cudaMemcpyDeviceToHost));
    std::cout << "[" << PtrToString(p_reduced_val, num_unique_keys) << "]\n";
    std::free(p_reduced_val);
#endif

    // Scatter the counts into d_num_nonzero, then convert the column-major
    // SpMM result to row-major, dividing each row by its count on the way.
    unique_row2num_nonzero<Itype, Dtype>
        <<<GET_BLOCKS(num_unique_keys, CUDA_NUM_THREADS), CUDA_NUM_THREADS, 0,
           stream>>>(num_unique_keys, d_num_nonzero, unique_row_ptr,
                     reduced_val_ptr);

    col2row_major_with_div<Dtype>
        <<<GET_BLOCKS(out_nrows * nchannel, CUDA_NUM_THREADS), CUDA_NUM_THREADS,
           0, stream>>>(out_nrows * nchannel, out_nrows, nchannel,
                        d_num_nonzero, d_tmp_out_feat, d_out_feat);
#ifdef DEBUG
    CUDA_CHECK(cudaStreamSynchronize(0));
#endif
    LOG_DEBUG("col2row");

    allocator.deallocate((char *)unique_row_ptr, sparse_nnzs * sizeof(Itype));
    allocator.deallocate((char *)reduced_val_ptr, sparse_nnzs * sizeof(Dtype));
  } else {
    col2row_major<Dtype><<<GET_BLOCKS(out_nrows * nchannel, CUDA_NUM_THREADS),
                           CUDA_NUM_THREADS, 0, stream>>>(
        out_nrows * nchannel, out_nrows, nchannel, d_tmp_out_feat, d_out_feat);
  }

  CUSPARSE_CHECK(cusparseDestroySpMat(sparse_descr));
  CUSPARSE_CHECK(cusparseDestroyDnMat(dense_descr));
  CUSPARSE_CHECK(cusparseDestroyDnMat(result_descr));

  if (buffer_size > 0)
    allocator.deallocate((char *)d_spmm_buffer, buffer_size);
  allocator.deallocate((char *)d_coo_val, sparse_nnzs * sizeof(Dtype));
  allocator.deallocate((char *)d_tmp_out_feat,
                       nchannel * out_nrows * sizeof(Dtype));
  if (use_avg)
    // BUGFIX: d_ones holds sparse_nnzs elements (allocated above); it was
    // previously deallocated with in_nrows * sizeof(Dtype), which corrupts
    // any size-tracking allocator whenever in_nrows != sparse_nnzs.
    allocator.deallocate((char *)d_ones, sparse_nnzs * sizeof(Dtype));

  allocator.deallocate((char *)sorted_row_ptr,
                       2 * (sparse_nnzs + 1) * sizeof(Itype));
  CUDA_CHECK(cudaStreamSynchronize(0));
}
|
|
|
|
|
|
|
|
// Explicit instantiations of the forward pooling wrapper for every exported
// Dtype / allocator combination (index type is uint32_t throughout).
template void
NonzeroAvgPoolingForwardKernelGPU<float, uint32_t,
                                  detail::default_allocator<char>>(
    float const *d_in_feat, default_types::size_type const in_nrows,
    float *d_out_feat, default_types::size_type const out_nrows,
    float *d_num_nonzero, default_types::size_type const nchannel,
    gpu_kernel_map<uint32_t, detail::default_allocator<char>> const
        &kernel_map,
    bool const use_avg, detail::default_allocator<char> &allocator,
    cusparseHandle_t cushandle, cudaStream_t stream);

template void
NonzeroAvgPoolingForwardKernelGPU<double, uint32_t,
                                  detail::default_allocator<char>>(
    double const *d_in_feat, default_types::size_type const in_nrows,
    double *d_out_feat, default_types::size_type const out_nrows,
    double *d_num_nonzero, default_types::size_type const nchannel,
    gpu_kernel_map<uint32_t, detail::default_allocator<char>> const
        &kernel_map,
    bool const use_avg, detail::default_allocator<char> &allocator,
    cusparseHandle_t cushandle, cudaStream_t stream);

template void
NonzeroAvgPoolingForwardKernelGPU<float, uint32_t, detail::c10_allocator<char>>(
    float const *d_in_feat, default_types::size_type const in_nrows,
    float *d_out_feat, default_types::size_type const out_nrows,
    float *d_num_nonzero, default_types::size_type const nchannel,
    gpu_kernel_map<uint32_t, detail::c10_allocator<char>> const &kernel_map,
    bool const use_avg, detail::c10_allocator<char> &allocator,
    cusparseHandle_t cushandle, cudaStream_t stream);

template void NonzeroAvgPoolingForwardKernelGPU<double, uint32_t,
                                                detail::c10_allocator<char>>(
    double const *d_in_feat, default_types::size_type const in_nrows,
    double *d_out_feat, default_types::size_type const out_nrows,
    double *d_num_nonzero, default_types::size_type const nchannel,
    gpu_kernel_map<uint32_t, detail::c10_allocator<char>> const &kernel_map,
    bool const use_avg, detail::c10_allocator<char> &allocator,
    cusparseHandle_t cushandle, cudaStream_t stream);
|
|
|
|
|
|
|
|
template <typename Dtype, typename Itype, typename ByteAllocator>
void NonzeroAvgPoolingBackwardKernelGPU(
    Dtype *d_grad_in_feat, default_types::size_type const in_nrows,
    Dtype const *d_grad_out_feat, default_types::size_type const out_nrows,
    Dtype const *d_num_nonzero, default_types::size_type const nchannel,
    gpu_kernel_map<Itype, ByteAllocator> const &kernel_map, bool const use_avg,
    cudaStream_t stream) {
  // Back-propagates pooled gradients through the kernel map by atomically
  // accumulating rows of d_grad_out_feat into rows of d_grad_in_feat; when
  // use_avg is set, each contribution is divided by the output row's count
  // in d_num_nonzero.
  //
  // NOTE(review): the kernels only accumulate — d_grad_in_feat is presumably
  // zero-initialized by the caller; confirm before reuse.
  size_t sparse_nnzs = kernel_map.in_maps.end() - kernel_map.in_maps.begin();

  if (use_avg) {
    set_gradient_nonzero_avg<Dtype>
        <<<GET_BLOCKS(sparse_nnzs * nchannel, CUDA_NUM_THREADS),
           CUDA_NUM_THREADS, 0, stream>>>(
            sparse_nnzs * nchannel, d_grad_out_feat, d_grad_in_feat, nchannel,
            d_num_nonzero, kernel_map.in_maps.cdata(),
            kernel_map.out_maps.cdata());
  } else {
    set_gradient_nonzero<Dtype>
        <<<GET_BLOCKS(sparse_nnzs * nchannel, CUDA_NUM_THREADS),
           CUDA_NUM_THREADS, 0, stream>>>(
            sparse_nnzs * nchannel, d_grad_out_feat, d_grad_in_feat, nchannel,
            kernel_map.in_maps.cdata(), kernel_map.out_maps.cdata());
  }

  // Surface launch-configuration errors immediately, then wait only on the
  // launch stream. The previous cudaDeviceSynchronize() stalled every stream
  // on the device, defeating overlap with unrelated work; syncing `stream`
  // is sufficient to guarantee the launches above have completed.
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaStreamSynchronize(stream));
}
|
|
|
|
|
|
|
|
// Explicit instantiations of the backward pooling wrapper for every exported
// Dtype / allocator combination (index type is uint32_t throughout).
template void
NonzeroAvgPoolingBackwardKernelGPU<float, uint32_t,
                                   detail::default_allocator<char>>(
    float *d_grad_in_feat, default_types::size_type const in_nrows,
    float const *d_grad_out_feat, default_types::size_type const out_nrows,
    float const *d_num_nonzero, default_types::size_type const nchannel,
    gpu_kernel_map<uint32_t, detail::default_allocator<char>> const &kernel_map,
    bool const use_avg, cudaStream_t stream);

template void
NonzeroAvgPoolingBackwardKernelGPU<double, uint32_t,
                                   detail::default_allocator<char>>(
    double *d_grad_in_feat, default_types::size_type const in_nrows,
    double const *d_grad_out_feat, default_types::size_type const out_nrows,
    double const *d_num_nonzero, default_types::size_type const nchannel,
    gpu_kernel_map<uint32_t, detail::default_allocator<char>> const &kernel_map,
    bool const use_avg, cudaStream_t stream);

template void NonzeroAvgPoolingBackwardKernelGPU<float, uint32_t,
                                                 detail::c10_allocator<char>>(
    float *d_grad_in_feat, default_types::size_type const in_nrows,
    float const *d_grad_out_feat, default_types::size_type const out_nrows,
    float const *d_num_nonzero, default_types::size_type const nchannel,
    gpu_kernel_map<uint32_t, detail::c10_allocator<char>> const &kernel_map,
    bool const use_avg, cudaStream_t stream);

template void NonzeroAvgPoolingBackwardKernelGPU<double, uint32_t,
                                                 detail::c10_allocator<char>>(
    double *d_grad_in_feat, default_types::size_type const in_nrows,
    double const *d_grad_out_feat, default_types::size_type const out_nrows,
    double const *d_num_nonzero, default_types::size_type const nchannel,
    gpu_kernel_map<uint32_t, detail::c10_allocator<char>> const &kernel_map,
    bool const use_avg, cudaStream_t stream);
|
|
|
|
|
} |
|
|
|
|
|
#endif |
|
|
|