#include "coordinate_map_functors.cuh" |
|
|
#include "coordinate_map_gpu.cuh" |
|
|
#include "gpu.cuh" |
|
|
#include "kernel_region.hpp" |
|
|
#include "types.hpp" |
|
|
#include "utils.hpp" |
|
|
|
|
|
#include <thrust/device_vector.h> |
|
|
#include <thrust/for_each.h> |
|
|
#include <thrust/host_vector.h> |
|
|
|
|
|
#include <torch/extension.h> |
|
|
|
|
|
namespace minkowski { |
|
|
|
|
|
using coordinate_type = int32_t; |
|
|
using index_type = default_types::index_type; |
|
|
using size_type = default_types::size_type; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
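
// For each input coordinate, iterate over the kernel region centered at that
// coordinate and write every visited coordinate to p_return_coordinates.
// Dynamic shared memory holds, per thread, a scratch coordinate and the
// region's lower/upper bounds, followed by the tensor stride, kernel size,
// and dilation arrays shared by the block.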
__global__ void kernel_region_iterator_test(
    coordinate_type const *__restrict__ p_coordinate,
    size_type number_of_coordinates,
    gpu_kernel_region<coordinate_type> kernel,
    coordinate_type *__restrict__ p_return_coordinates) {
  extern __shared__ coordinate_type sh_coordinate[];

  auto const tx = threadIdx.x;
  auto const bx = blockIdx.x;
  auto const x = blockDim.x * bx + tx;

  size_type coordinate_size = kernel.coordinate_size();
  size_type volume = kernel.volume();

  coordinate_type *sh_tmp = sh_coordinate + tx * coordinate_size;
  coordinate_type *sh_lb =
      sh_coordinate + (CUDA_NUM_THREADS + tx) * coordinate_size;
  coordinate_type *sh_ub =
      sh_coordinate + (2 * CUDA_NUM_THREADS + tx) * coordinate_size;

  index_type *sh_index = reinterpret_cast<index_type *>(
      sh_coordinate + 3 * CUDA_NUM_THREADS * coordinate_size);
  index_type *sh_tensor_stride = sh_index;
  index_type *sh_kernel_size = sh_index + coordinate_size;
  index_type *sh_dilation = sh_index + 2 * coordinate_size;

  if (tx < coordinate_size - 1) {
    sh_tensor_stride[tx] = kernel.tensor_stride()[tx];
    sh_kernel_size[tx] = kernel.kernel_size()[tx];
    sh_dilation[tx] = kernel.dilation()[tx];
  }
  __syncthreads();
  if (x >= number_of_coordinates)
    return;

  index_type out_index = x * volume;
  kernel.set_bounds(&p_coordinate[x * coordinate_size], sh_lb, sh_ub, sh_tmp);
  for (auto const &coordinate : kernel) {
    for (index_type i = 0; i < coordinate_size; ++i) {
      p_return_coordinates[out_index * coordinate_size + i] = coordinate[i];
    }
    ++out_index;
  }
}
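
// Host-side wrapper: validates the input tensors, builds a HYPER_CUBE kernel
// region from the requested kernel size (tensor stride and dilation fixed to
// 1), launches kernel_region_iterator_test, and returns a tensor of shape
// (N * kernel volume, coordinate_size) holding every visited coordinate.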
at::Tensor region_iterator_test(const torch::Tensor &coordinates,
                                const torch::Tensor &th_kernel_size) {
  torch::TensorArg arg_coordinates(coordinates, "coordinates", 0);
  torch::TensorArg arg_kernel_size(th_kernel_size, "kernel_size", 1);

  torch::CheckedFrom c = "region_iterator_test";
  torch::checkContiguous(c, arg_coordinates);
  torch::checkContiguous(c, arg_kernel_size);

  torch::checkScalarType(c, arg_coordinates, torch::kInt);
  torch::checkScalarType(c, arg_kernel_size, torch::kInt);
  torch::checkBackend(c, arg_coordinates.tensor, torch::Backend::CUDA);
  torch::checkBackend(c, arg_kernel_size.tensor, torch::Backend::CPU);
  torch::checkDim(c, arg_coordinates, 2);
  torch::checkDim(c, arg_kernel_size, 1);

  auto const N = (index_type)coordinates.size(0);
  auto const coordinate_size = (index_type)coordinates.size(1);
  coordinate_type *p_coordinate = coordinates.data_ptr<coordinate_type>();
  coordinate_type *p_kernel_size = th_kernel_size.data_ptr<coordinate_type>();

  default_types::stride_type tensor_stride(coordinate_size - 1);
  default_types::stride_type kernel_size(coordinate_size - 1);
  default_types::stride_type dilation(coordinate_size - 1);

  for (index_type i = 0; i < coordinate_size - 1; ++i) {
    tensor_stride[i] = 1;
    kernel_size[i] = p_kernel_size[i];
    dilation[i] = 1;
  }

  auto cpu_kernel = cpu_kernel_region<coordinate_type>(
      RegionType::HYPER_CUBE, coordinate_size, tensor_stride.data(),
      kernel_size.data(), dilation.data());
  auto kernel = gpu_kernel_region<coordinate_type>(cpu_kernel.to_gpu());

  LOG_DEBUG("initialize vectors");

  torch::Tensor out_coordinates = torch::empty(
      {N * kernel.volume(), coordinate_size}, coordinates.options());
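
  // Dynamic shared memory: three per-thread coordinate buffers (scratch,
  // lower bound, upper bound) plus the tensor_stride / kernel_size /
  // dilation index arrays used by the block.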
  uint32_t shared_memory_size_in_bytes =
      3 * CUDA_NUM_THREADS * coordinate_size * sizeof(coordinate_type) +
      3 * coordinate_size * sizeof(index_type);

  kernel_region_iterator_test<<<GET_BLOCKS(N, CUDA_NUM_THREADS),
                                CUDA_NUM_THREADS,
                                shared_memory_size_in_bytes>>>(
      p_coordinate, N, kernel, out_coordinates.data_ptr<coordinate_type>());

  LOG_DEBUG("End call");
  CUDA_CHECK(cudaStreamSynchronize(0));

  return out_coordinates;
}
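
// Builds GPU coordinate maps for the input and output coordinates, constructs
// a HYPER_CUBE kernel region from kernel_size, times the GPU kernel-map
// generation, and copies the per-kernel-offset in/out index maps back to the
// host. Returns the host maps, the output map size, and the elapsed time.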
std::tuple<std::pair<cpu_in_maps, cpu_out_maps>, size_type, double>
kernel_map_test(const torch::Tensor &in_coordinates,
                const torch::Tensor &out_coordinates,
                const torch::Tensor &kernel_size, uint32_t occupancy,
                uint32_t thread_dim) {
  torch::TensorArg arg_in_coordinates(in_coordinates, "in_coordinates", 0);
  torch::TensorArg arg_out_coordinates(out_coordinates, "out_coordinates", 1);
  torch::TensorArg arg_kernel_size(kernel_size, "kernel_size", 2);

  torch::CheckedFrom c = "kernel_map_test";
  torch::checkContiguous(c, arg_in_coordinates);
  torch::checkContiguous(c, arg_out_coordinates);
  torch::checkContiguous(c, arg_kernel_size);

  torch::checkScalarType(c, arg_in_coordinates, torch::kInt);
  torch::checkScalarType(c, arg_out_coordinates, torch::kInt);
  torch::checkScalarType(c, arg_kernel_size, torch::kInt);
  torch::checkBackend(c, arg_in_coordinates.tensor, torch::Backend::CUDA);
  torch::checkBackend(c, arg_out_coordinates.tensor, torch::Backend::CUDA);
  torch::checkBackend(c, arg_kernel_size.tensor, torch::Backend::CPU);
  torch::checkDim(c, arg_in_coordinates, 2);
  torch::checkDim(c, arg_out_coordinates, 2);
  torch::checkDim(c, arg_kernel_size, 1);

  auto const N_in = (index_type)in_coordinates.size(0);
  auto const D = (index_type)in_coordinates.size(1);
  auto const N_out = (index_type)out_coordinates.size(0);
  auto const D_out = (index_type)out_coordinates.size(1);

  ASSERT(D == D_out, "dimension mismatch");

  coordinate_type const *ptr_in = in_coordinates.data_ptr<coordinate_type>();
  coordinate_type const *ptr_out = out_coordinates.data_ptr<coordinate_type>();

  CoordinateMapGPU<coordinate_type> in_map{N_in, D, occupancy};
  CoordinateMapGPU<coordinate_type> out_map{N_out, D, occupancy};

  auto in_coordinate_range = coordinate_range<coordinate_type>(N_in, D, ptr_in);
  in_map.insert<false>(in_coordinate_range.begin(), in_coordinate_range.end());

  auto out_coordinate_range =
      coordinate_range<coordinate_type>(N_out, D, ptr_out);
  out_map.insert<false>(out_coordinate_range.begin(),
                        out_coordinate_range.end());

  LOG_DEBUG("coordinate initialization");
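
  // Build a hypercubic kernel region over the D - 1 spatial dimensions (the
  // first coordinate is the batch index) with unit tensor stride and unit
  // dilation; the kernel size comes from the CPU kernel_size tensor.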
  coordinate_type *p_kernel_size = kernel_size.data_ptr<coordinate_type>();
  default_types::stride_type tensor_stride;
  default_types::stride_type s_kernel_size;
  default_types::stride_type dilation;
  for (index_type i = 0; i < D - 1; ++i) {
    tensor_stride.push_back(1);
    s_kernel_size.push_back(p_kernel_size[i]);
    dilation.push_back(1);
  }

  auto region = cpu_kernel_region<coordinate_type>(
      RegionType::HYPER_CUBE, D, tensor_stride.data(), s_kernel_size.data(),
      dilation.data());
  LOG_DEBUG("cpu_kernel_region initialized with volume", region.volume());
  region.to_gpu();
  auto gpu_region = gpu_kernel_region<coordinate_type>(region);
  LOG_DEBUG("gpu_kernel_region initialization");
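
  // Time only the GPU kernel-map construction (speed-optimized mode).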
  timer t;
  t.tic();
  auto kernel_map = in_map.kernel_map(
      out_map, gpu_region, CUDAKernelMapMode::SPEED_OPTIMIZED, thread_dim);
  double k_time = t.toc();

  const auto volume = region.volume();
  LOG_DEBUG("kernel_map done");
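
  // Copy each kernel offset's in/out index maps from device to host so they
  // can be returned as CPU vectors.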
  cpu_in_maps in_maps(volume);
  cpu_out_maps out_maps(volume);
  for (index_type i = 0; i < volume; ++i) {
    size_type size = kernel_map.kernels.size(i);
    LOG_DEBUG("kernel index", i, "/", volume, "with size", size);
    in_maps[i].resize(size);
    out_maps[i].resize(size);

    if (size > 0) {
      CUDA_CHECK(cudaMemcpy(in_maps[i].data(), kernel_map.in_maps.begin(i),
                            sizeof(index_type) * size,
                            cudaMemcpyDeviceToHost));
      CUDA_CHECK(cudaMemcpy(out_maps[i].data(), kernel_map.out_maps.begin(i),
                            sizeof(index_type) * size,
                            cudaMemcpyDeviceToHost));
    }
  }
  CUDA_CHECK(cudaStreamSynchronize(0));

  return std::make_tuple(std::make_pair(in_maps, out_maps), out_map.size(),
                         k_time);
}

} // namespace minkowski
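
// Python bindings for the two test entry points.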
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("region_iterator_test", &minkowski::region_iterator_test,
        "Minkowski Engine region iterator test");

  m.def("kernel_map_test", &minkowski::kernel_map_test,
        "Minkowski Engine kernel map test");
}