// depthsplat/MinkowskiEngine/tests/cpp/kernel_region_gpu_test.cu
/*
* Copyright (c) 2020 NVIDIA CORPORATION.
* Copyright (c) 2018-2020 Chris Choy (chrischoy@ai.stanford.edu)
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Please cite "4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural
* Networks", CVPR'19 (https://arxiv.org/abs/1904.08755) if you use any part
* of the code.
*/
#include "coordinate_map_functors.cuh"
#include "coordinate_map_gpu.cuh"
#include "gpu.cuh"
#include "kernel_region.hpp"
#include "types.hpp"
#include "utils.hpp"
#include <thrust/device_vector.h>
#include <thrust/for_each.h>
#include <thrust/host_vector.h>
#include <torch/extension.h>
namespace minkowski {
using coordinate_type = int32_t;
using index_type = default_types::index_type;
using size_type = default_types::size_type;
/*
 * Copies, for each input coordinate, every coordinate in its kernel region
 * into p_return_coordinates. The number of threads per block must equal
 * CUDA_NUM_THREADS (the shared-memory offsets below assume it) and must be
 * greater than coordinate_size so the kernel parameters can be staged.
 */
__global__ void kernel_region_iterator_test(
coordinate_type const *__restrict__ p_coordinate,
size_type number_of_coordinates, //
gpu_kernel_region<coordinate_type> kernel,
coordinate_type *__restrict__ p_return_coordinates) {
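  // Dynamic shared memory layout: 3 * blockDim.x * coordinate_size
  // coordinate_type entries (per-thread scratch, lower-bound, and
  // upper-bound buffers) followed by 3 * coordinate_size index_type
  // entries (tensor_stride, kernel_size, dilation).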
extern __shared__ coordinate_type sh_coordinate[];
auto const tx = threadIdx.x;
auto const bx = blockIdx.x;
auto const x = blockDim.x * bx + tx;
size_type coordinate_size = kernel.coordinate_size();
size_type volume = kernel.volume();
coordinate_type *sh_tmp = sh_coordinate + tx * coordinate_size;
coordinate_type *sh_lb =
sh_coordinate + (CUDA_NUM_THREADS + tx) * coordinate_size;
coordinate_type *sh_ub =
sh_coordinate + (2 * CUDA_NUM_THREADS + tx) * coordinate_size;
index_type *sh_index = reinterpret_cast<index_type *>(
sh_coordinate + 3 * CUDA_NUM_THREADS * coordinate_size);
index_type *sh_tensor_stride = sh_index;
index_type *sh_kernel_size = sh_index + coordinate_size;
index_type *sh_dilation = sh_index + 2 * coordinate_size;
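  // The first coordinate_size - 1 threads stage the spatial kernel
  // parameters (the batch dimension has no stride / kernel size / dilation)
  // into shared memory before the block-wide barrier.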
if (tx < coordinate_size - 1) {
sh_tensor_stride[tx] = kernel.tensor_stride()[tx];
sh_kernel_size[tx] = kernel.kernel_size()[tx];
sh_dilation[tx] = kernel.dilation()[tx];
}
__syncthreads();
if (x >= number_of_coordinates)
return;
  // Compute this coordinate's kernel bounds, then enumerate every offset in
  // the region and copy the resulting coordinates to the output buffer.
  index_type out_index = x * volume;
kernel.set_bounds(&p_coordinate[x * coordinate_size], sh_lb, sh_ub, sh_tmp);
for (auto const &coordinate : kernel) {
for (index_type i = 0; i < coordinate_size; ++i) {
p_return_coordinates[out_index * coordinate_size + i] = coordinate[i];
}
++out_index;
}
}
at::Tensor region_iterator_test(const torch::Tensor &coordinates,
const torch::Tensor &th_kernel_size) {
  // Create TensorArgs. These record the name and position of each tensor
  // argument for use in error messages.
torch::TensorArg arg_coordinates(coordinates, "coordinates", 0);
torch::TensorArg arg_kernel_size(th_kernel_size, "kernel_size", 1);
torch::CheckedFrom c = "region_iterator_test";
torch::checkContiguous(c, arg_coordinates);
torch::checkContiguous(c, arg_kernel_size);
// must match coordinate_type
torch::checkScalarType(c, arg_coordinates, torch::kInt);
torch::checkScalarType(c, arg_kernel_size, torch::kInt);
torch::checkBackend(c, arg_coordinates.tensor, torch::Backend::CUDA);
torch::checkBackend(c, arg_kernel_size.tensor, torch::Backend::CPU);
torch::checkDim(c, arg_coordinates, 2);
torch::checkDim(c, arg_kernel_size, 1);
auto const N = (index_type)coordinates.size(0);
auto const coordinate_size = (index_type)coordinates.size(1);
coordinate_type *p_coordinate = coordinates.data_ptr<coordinate_type>();
coordinate_type *p_kernel_size = th_kernel_size.data_ptr<coordinate_type>();
default_types::stride_type tensor_stride(coordinate_size - 1);
default_types::stride_type kernel_size(coordinate_size - 1);
default_types::stride_type dilation(coordinate_size - 1);
for (index_type i = 0; i < coordinate_size - 1; ++i) {
tensor_stride[i] = 1;
kernel_size[i] = p_kernel_size[i];
dilation[i] = 1;
}
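  // Build a hypercube kernel region over the coordinate_size - 1 spatial
  // dimensions with unit tensor stride and dilation; to_gpu() copies the
  // region's parameter arrays to device memory.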
auto cpu_kernel = cpu_kernel_region<coordinate_type>(
RegionType::HYPER_CUBE, coordinate_size, tensor_stride.data(),
kernel_size.data(), dilation.data());
auto kernel = gpu_kernel_region<coordinate_type>(cpu_kernel.to_gpu());
LOG_DEBUG("initialize vectors");
torch::Tensor out_coordinates = torch::empty(
{N * kernel.volume(), coordinate_size}, coordinates.options());
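  // The dynamic shared-memory size must match the layout the kernel expects:
  // three coordinate buffers per thread plus the three parameter arrays.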
uint32_t shared_memory_size_in_bytes =
3 * CUDA_NUM_THREADS * coordinate_size * sizeof(coordinate_type) +
3 * coordinate_size * sizeof(index_type);
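  // Launch one thread per input coordinate, rounded up to whole blocks.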
kernel_region_iterator_test<<<GET_BLOCKS(N, CUDA_NUM_THREADS),
CUDA_NUM_THREADS,
shared_memory_size_in_bytes>>>(
p_coordinate, //
N, //
kernel, //
out_coordinates.data_ptr<coordinate_type>());
LOG_DEBUG("End call");
CUDA_CHECK(cudaStreamSynchronize(0));
return out_coordinates;
}
std::tuple<std::pair<cpu_in_maps, cpu_out_maps>, size_type, double>
kernel_map_test(const torch::Tensor &in_coordinates,
const torch::Tensor &out_coordinates,
const torch::Tensor &kernel_size,
uint32_t occupancy, //
uint32_t thread_dim) {
  // Create TensorArgs. These record the name and position of each tensor
  // argument for use in error messages.
torch::TensorArg arg_in_coordinates(in_coordinates, "coordinates", 0);
torch::TensorArg arg_out_coordinates(out_coordinates, "coordinates", 1);
torch::TensorArg arg_kernel_size(kernel_size, "kernel_size", 2);
  torch::CheckedFrom c = "kernel_map_test";
torch::checkContiguous(c, arg_in_coordinates);
torch::checkContiguous(c, arg_out_coordinates);
torch::checkContiguous(c, arg_kernel_size);
// must match coordinate_type
torch::checkScalarType(c, arg_in_coordinates, torch::kInt);
torch::checkScalarType(c, arg_out_coordinates, torch::kInt);
torch::checkScalarType(c, arg_kernel_size, torch::kInt);
torch::checkBackend(c, arg_in_coordinates.tensor, torch::Backend::CUDA);
torch::checkBackend(c, arg_out_coordinates.tensor, torch::Backend::CUDA);
torch::checkBackend(c, arg_kernel_size.tensor, torch::Backend::CPU);
torch::checkDim(c, arg_in_coordinates, 2);
torch::checkDim(c, arg_out_coordinates, 2);
torch::checkDim(c, arg_kernel_size, 1);
auto const N_in = (index_type)in_coordinates.size(0);
auto const D = (index_type)in_coordinates.size(1);
auto const N_out = (index_type)out_coordinates.size(0);
auto const D_out = (index_type)out_coordinates.size(1);
ASSERT(D == D_out, "dimension mismatch");
coordinate_type const *ptr_in = in_coordinates.data_ptr<coordinate_type>();
coordinate_type const *ptr_out = out_coordinates.data_ptr<coordinate_type>();
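  // GPU coordinate hash maps for the input and output coordinates;
  // `occupancy` controls how much the hash tables are over-allocated
  // relative to the number of coordinates.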
CoordinateMapGPU<coordinate_type> in_map{N_in, D, occupancy};
CoordinateMapGPU<coordinate_type> out_map{N_out, D, occupancy};
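  // Wrap the raw device pointers as iterable coordinate ranges and insert
  // every row into the corresponding map.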
auto in_coordinate_range = coordinate_range<coordinate_type>(N_in, D, ptr_in);
in_map.insert<false>(in_coordinate_range.begin(), // key begin
in_coordinate_range.end()); // key end
auto out_coordinate_range =
coordinate_range<coordinate_type>(N_out, D, ptr_out);
out_map.insert<false>(out_coordinate_range.begin(), // key begin
out_coordinate_range.end()); // key end
LOG_DEBUG("coordinate initialization");
// Kernel region
coordinate_type *p_kernel_size = kernel_size.data_ptr<coordinate_type>();
default_types::stride_type tensor_stride;
default_types::stride_type s_kernel_size;
default_types::stride_type dilation;
for (index_type i = 0; i < D - 1; ++i) {
tensor_stride.push_back(1);
s_kernel_size.push_back(p_kernel_size[i]);
dilation.push_back(1);
}
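  // Hypercube kernel region over the D - 1 spatial dimensions with unit
  // tensor stride and dilation.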
auto region = cpu_kernel_region<coordinate_type>(
RegionType::HYPER_CUBE, D, tensor_stride.data(), s_kernel_size.data(),
dilation.data());
LOG_DEBUG("cpu_kernel_region initialized with volume", region.volume());
region.to_gpu();
auto gpu_region = gpu_kernel_region<coordinate_type>(region);
LOG_DEBUG("gpu_kernel_region initialization");
timer t;
t.tic();
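  // Compute the kernel map: for each of the region.volume() kernel offsets,
  // the pairs of input / output coordinate indices that the offset connects.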
auto kernel_map = in_map.kernel_map(
out_map, gpu_region, CUDAKernelMapMode::SPEED_OPTIMIZED, thread_dim);
double k_time = t.toc();
const auto volume = region.volume();
LOG_DEBUG("kernel_map done");
cpu_in_maps in_maps(volume);
  cpu_out_maps out_maps(volume);
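  // Copy each per-offset map from device to host so the caller can inspect
  // the result.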
for (index_type i = 0; i < volume; ++i) {
size_type size = kernel_map.kernels.size(i);
LOG_DEBUG("kernel index", i, "/", volume, "with size", size);
in_maps[i].resize(size);
out_maps[i].resize(size);
if (size > 0) {
      CUDA_CHECK(cudaMemcpy(in_maps[i].data(), //
                            kernel_map.in_maps.begin(i),
                            sizeof(index_type) * size, cudaMemcpyDeviceToHost));
      CUDA_CHECK(cudaMemcpy(out_maps[i].data(), //
                            kernel_map.out_maps.begin(i),
                            sizeof(index_type) * size, cudaMemcpyDeviceToHost));
}
}
CUDA_CHECK(cudaStreamSynchronize(0));
return std::make_tuple(std::make_pair(in_maps, out_maps), out_map.size(),
k_time);
}
} // namespace minkowski
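// Minimal usage sketch, assuming libtorch with CUDA available; the shapes
// below assume a 3x3x3 hypercube region (volume 27) and are illustrative:
//
//   auto coords = torch::randint(
//       0, 10, {128, 4}, torch::dtype(torch::kInt32).device(torch::kCUDA));
//   auto kernel = torch::tensor({3, 3, 3}, torch::dtype(torch::kInt32));
//   auto out = minkowski::region_iterator_test(coords, kernel);
//   // out has shape {128 * 27, 4}: 27 region coordinates per input point.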
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("region_iterator_test", &minkowski::region_iterator_test,
"Minkowski Engine region iterator test");
m.def("kernel_map_test", &minkowski::kernel_map_test,
"Minkowski Engine kernel map test");
}