/* * Copyright (c) 2020 NVIDIA CORPORATION. * Copyright (c) 2018-2020 Chris Choy (chrischoy@ai.stanford.edu) * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. * * Please cite "4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural * Networks", CVPR'19 (https://arxiv.org/abs/1904.08755) if you use any part * of the code. 
*/
#include "coordinate_map_functors.cuh"
#include "coordinate_map_gpu.cuh"
#include "gpu.cuh"
#include "kernel_map.cuh"
#include "kernel_map.hpp"
#include "sharedmem.cuh"

// NOTE(review): the system-header targets below were destroyed by a text
// extraction that stripped everything between angle brackets.  They are
// reconstructed from the thrust/std symbols used in this translation unit —
// verify against the upstream file before committing.
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/remove.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/transform.h>
#include <thrust/unique.h>

namespace minkowski {

namespace detail {

// One thread per input row: look up the row's coordinate in `map` and store
// the mapped value (the remapped row index) into `inverse_map[x]`.
//
// Precondition: every coordinate in `coordinates` is already present in `map`
// (the find() result is dereferenced unchecked).
//
// NOTE(review): all template parameter lists in this file were stripped by
// the extraction and have been reconstructed; verify against upstream.
template <typename coordinate_type, typename size_type, typename index_type,
          typename map_type>
__global__ void
remap_inverse_map(map_type __restrict__ map,                       //
                  coordinate_type const *__restrict__ coordinates, //
                  index_type *__restrict__ inverse_map,            //
                  size_type const num_threads,                     //
                  size_type const coordinate_size                  //
) {
  auto const tx = threadIdx.x;
  auto const bx = blockIdx.x;
  auto const x = blockDim.x * bx + tx;
  if (x < num_threads) {
    auto result = map.find(
        coordinate<coordinate_type>{&coordinates[x * coordinate_size]});
    inverse_map[x] = result->second;
  }
}

// One thread per input row: try to insert (coordinate, row-index) into `map`.
// On success, record the row index and the hash-map slot offset; on failure
// (duplicate coordinate) mark the slot with `unused_key` so a later
// remove_if pass can compact the valid entries.
template <typename coordinate_type, typename size_type, typename index_type,
          typename map_type>
__global__ void
insert_and_map_kernel(map_type __restrict__ map,                       //
                      coordinate_type const *__restrict__ coordinates, //
                      index_type *__restrict__ valid_map_index,        //
                      index_type *__restrict__ valid_row_index,        //
                      size_type const num_threads,                     //
                      size_type const coordinate_size,                 //
                      index_type const unused_key) {
  auto const tx = threadIdx.x;
  auto const bx = blockIdx.x;
  auto const x = blockDim.x * bx + tx;
  if (x < num_threads) {
    // Returns (iterator, success) pair
    auto const result = map.insert(thrust::make_pair(
        coordinate<coordinate_type>{&coordinates[x * coordinate_size]}, x));

    if (result.second) {
      // success map index. remove failed insertion with success.
      valid_row_index[x] = x;
      valid_map_index[x] = result.first.offset();
    } else {
      valid_map_index[x] = unused_key;
    }
  }
}

} // namespace detail

/*
 * Field Map
 */
namespace detail {

// Quantize continuous (field) coordinates into integer coordinates snapped to
// the tensor stride.  The batch index (column 0) is rounded; every spatial
// column is floored to its stride.  When `stride_one` is true all strides are
// one and the division is skipped entirely.
//
// Launch: one thread per scalar (num_threads = N * coordinate_size).
// Dynamic shared memory (only when !stride_one):
//   coordinate_size * sizeof(index_type) bytes for the cached tensor stride.
template <typename coordinate_field_type, typename coordinate_int_type,
          typename index_type, bool stride_one>
__global__ void quantize_coordinates_kernel(
    coordinate_field_type const *__restrict__ p_tfield, //
    coordinate_int_type *__restrict__ p_stensor,        //
    index_type const *__restrict__ p_tensor_stride,     //
    index_type const num_threads, index_type const coordinate_size) {
  extern __shared__ index_type sh_tensor_stride[];

  auto const tx = threadIdx.x;
  auto const bx = blockIdx.x;
  auto const x = blockDim.x * bx + tx;

  if (stride_one) {
    if (x < num_threads) {
      // batch index is rounded; spatial columns are floored
      if (x % coordinate_size == 0)
        p_stensor[x] = lrint(p_tfield[x]);
      else
        p_stensor[x] = floor(p_tfield[x]);
    }
  } else {
    // cache the (coordinate_size - 1) spatial strides in shared memory
    for (index_type i = tx; i < coordinate_size - 1; i += blockDim.x) {
      sh_tensor_stride[i] = p_tensor_stride[i];
    }

    __syncthreads();

    if (x < num_threads) {
      // batch index
      if (x % coordinate_size == 0)
        p_stensor[x] = lrint(p_tfield[x]);
      else {
        index_type curr_tensor_stride =
            sh_tensor_stride[((x - 1) % coordinate_size)];
        // floor-divide then re-multiply to snap to the stride grid
        p_stensor[x] =
            floor(p_tfield[x] / curr_tensor_stride) * curr_tensor_stride;
      }
    }
  }
}

} // namespace detail

// Quantize this field map's floating-point coordinates into integer
// coordinates written to d_dst_coordinates, snapping each spatial column to
// `tensor_stride`.
template <typename coordinate_field_type, typename coordinate_int_type,
          template <typename T> class TemplatedAllocator>
void CoordinateFieldMapGPU<coordinate_field_type, coordinate_int_type,
                           TemplatedAllocator>::
    quantize_coordinates(coordinate_int_type *d_dst_coordinates,
                         stride_type const &tensor_stride) const {
  // stride_prod == 1 means every stride is one (fast path, no division)
  int64_t const stride_prod = std::accumulate(
      tensor_stride.begin(), tensor_stride.end(), 1, std::multiplies<>());

  // Copy tensor_stride to device
  index_type *d_tensor_stride = reinterpret_cast<index_type *>(
      m_byte_allocator.allocate(m_coordinate_size * sizeof(index_type)));
  CUDA_CHECK(cudaMemcpy(
      d_tensor_stride,      // dst
      tensor_stride.data(), // first element of the dereferenced iter.
sizeof(index_type) * m_coordinate_size, // bytes
      cudaMemcpyHostToDevice));

  size_type const num_threads = size() * m_coordinate_size;
  auto const num_blocks = GET_BLOCKS(num_threads, CUDA_NUM_THREADS);

  // NOTE(review): the launch configurations below were stripped by the
  // extraction and reconstructed as <<<blocks, threads[, smem]>>>; verify
  // against upstream.
  if (stride_prod == 1) {
    // all strides are one: no shared-memory stride table required
    detail::quantize_coordinates_kernel<coordinate_field_type,
                                        coordinate_int_type, index_type, true>
        <<<num_blocks, CUDA_NUM_THREADS>>>(const_coordinate_data(),
                                           d_dst_coordinates, d_tensor_stride,
                                           num_threads, m_coordinate_size);
  } else {
    detail::quantize_coordinates_kernel<coordinate_field_type,
                                        coordinate_int_type, index_type, false>
        <<<num_blocks, CUDA_NUM_THREADS,
           m_coordinate_size * sizeof(index_type)>>>(
            const_coordinate_data(), d_dst_coordinates, d_tensor_stride,
            num_threads, m_coordinate_size);
  }
  // NOTE(review): d_tensor_stride is never deallocated here — possible
  // allocator leak; an immediate deallocate would race the async launch, so
  // this needs a stream-ordered free or a post-launch sync. TODO confirm.
}

/*
 * @brief Given a key iterator begin-end pair and a value iterator begin-end
 * pair, insert all elements.
 *
 * @note The key and value iterators can be 1) pointers, 2) coordinate or
 * vector iterators.
 *
 * @return none
 */
template <typename coordinate_type,
          template <typename T> class TemplatedAllocator>
template <bool remap>
void CoordinateMapGPU<coordinate_type, TemplatedAllocator>::insert(
    coordinate_iterator<coordinate_type> key_first,
    coordinate_iterator<coordinate_type> key_last) {
  size_type const N = key_last - key_first;
  LOG_DEBUG("key iterator length", N);

  // empty input: reset the size and return without touching the map
  if (N == 0) {
    m_size = 0;
    return;
  }

  m_valid_row_index.allocate(N);
  m_valid_map_index.allocate(N);

  // Copy the coordinates to m_coordinate
  base_type::reserve(N);
  CUDA_CHECK(
      cudaMemcpy(coordinate_data(), // dst
                 key_first->data(), // first element of the dereferenced iter.
                 sizeof(coordinate_type) * N * m_coordinate_size, // bytes
                 cudaMemcpyDeviceToDevice));
  CUDA_CHECK(cudaDeviceSynchronize());
  LOG_DEBUG("Reserved and copied", N, "x", m_coordinate_size, "coordinates");

  // compute cuda kernel call params
  size_type const num_threads = N;
  LOG_DEBUG("num_threads", num_threads);
  size_type const num_blocks = GET_BLOCKS(num_threads, CUDA_NUM_THREADS);
  LOG_DEBUG("num_blocks", num_blocks);
  index_type const unused_key = std::numeric_limits<index_type>::max();
  LOG_DEBUG("unused_key", unused_key);

  detail::insert_and_map_kernel<coordinate_type, size_type, index_type,
                                map_type>
      <<<num_blocks, CUDA_NUM_THREADS>>>(*m_map,                   //
                                         const_coordinate_data(),  //
                                         m_valid_map_index.data(), //
                                         m_valid_row_index.data(), //
                                         num_threads, m_coordinate_size,
                                         unused_key);
  CUDA_CHECK(cudaStreamSynchronize(0));
  LOG_DEBUG("Map size:", m_map->size());

  // Compact out failed (duplicate) insertions: remove every
  // (map_index, row_index) pair whose map_index is unused_key.
  auto valid_begin = thrust::make_zip_iterator(
      thrust::make_tuple(m_valid_map_index.begin(), m_valid_row_index.begin()));
  size_type const number_of_valid =
      thrust::remove_if(thrust::device, valid_begin,
                        thrust::make_zip_iterator(thrust::make_tuple(
                            m_valid_map_index.end(), m_valid_row_index.end())),
                        detail::is_first<index_type>(unused_key)) -
      valid_begin;
  m_valid_row_index.resize(number_of_valid);
  m_valid_map_index.resize(number_of_valid);
  m_size = number_of_valid;
  LOG_DEBUG("Number of successful insertion", m_size);

  if (remap                   // When remapping
      && number_of_valid != N // when the # of inserted items differ from the #
                              // of successful insertions
  ) {
    m_inverse_row_index.allocate(N);

    // rewrite each surviving map entry's value to its compacted position
    thrust::counting_iterator<index_type> count_begin{0};
    thrust::for_each(count_begin, count_begin + number_of_valid,
                     detail::update_value_with_offset<index_type, map_type>{
                         *m_map, m_valid_map_index.data()});

    size_type const num_threads = N;
    auto const num_blocks = GET_BLOCKS(num_threads, CUDA_NUM_THREADS);

    // for every original row, look up its coordinate to build the inverse map
    detail::remap_inverse_map<coordinate_type, size_type, index_type, map_type>
        <<<num_blocks, CUDA_NUM_THREADS>>>(*m_map,                     //
                                           const_coordinate_data(),    //
                                           m_inverse_row_index.data(), //
                                           num_threads, m_coordinate_size);

    LOG_DEBUG("Remapping finished");
  }
}

// NOTE(review): the exact storage/allocator arguments of this return type
// were stripped; reconstructed from the mangled `std::pair>, gpu_storage>>`
// remnant — verify against coordinate_map_gpu.cuh.
template <typename coordinate_type,
          template <typename T> class TemplatedAllocator>
template <bool remap>
std::pair<gpu_storage<index_type, byte_allocator_type>,
          gpu_storage<index_type, byte_allocator_type>>
CoordinateMapGPU<coordinate_type, TemplatedAllocator>::insert_and_map(
    coordinate_iterator<coordinate_type> key_first,
    coordinate_iterator<coordinate_type> key_last) {
  LOG_DEBUG("insert_and_map");
  // <remap> controls whether the inverse row index is generated on collision
  insert<remap>(key_first, key_last);
  return std::make_pair(m_valid_row_index, m_inverse_row_index);
}

// Mark the first N_unique rows/offsets as valid and (re)insert the first
// N_unique coordinates into the hash map.  Used after the coordinate buffer
// has been populated externally (e.g. by the field-map origin()).
template <typename coordinate_type,
          template <typename T> class TemplatedAllocator>
void CoordinateMapGPU<coordinate_type, TemplatedAllocator>::
    initialize_valid_indices(size_t const N_unique) {
  m_valid_row_index.resize(N_unique);
  m_valid_map_index.resize(N_unique);
  m_size = N_unique;

  // Insert coordinates
  // NOTE(review): template arguments of insert_coordinate reconstructed —
  // verify against coordinate_map_functors.cuh.
  auto insert =
      detail::insert_coordinate<coordinate_type, map_type, index_type *>{
          *m_map,                   // map
          const_coordinate_data(),  // coordinates,
          m_valid_row_index.data(), // valid row
          m_valid_map_index.data(), // iter offset
          m_coordinate_size};
  thrust::counting_iterator<uint32_t> count_begin{0};
  thrust::for_each(thrust::device, count_begin, count_begin + N_unique, insert);
}

/*
 * @brief given a key iterator begin-end pair find all valid keys and its
 * index.
 *
 * @return a pair of (valid index, query value) vectors.
 */
template <typename coordinate_type,
          template <typename T> class TemplatedAllocator>
std::pair<gpu_storage<index_type, byte_allocator_type>,
          gpu_storage<index_type, byte_allocator_type>>
CoordinateMapGPU<coordinate_type, TemplatedAllocator>::find(
    coordinate_iterator<coordinate_type> key_first,
    coordinate_iterator<coordinate_type> key_last) const {
  size_type N = key_last - key_first;
  LOG_DEBUG(N, "queries for find.");
  auto const find_functor = detail::find_coordinate<coordinate_type, map_type>(
      *m_map, key_first->data(), m_unused_element, m_coordinate_size);
  LOG_DEBUG("Find functor initialized.");
  auto const invalid_functor =
      detail::is_unused_pair<coordinate_type, index_type>(m_unused_element);
  LOG_DEBUG("Valid functor initialized.");

  thrust::counting_iterator<index_type> index{0};
  gpu_storage<index_type, byte_allocator_type> input_index(N);
  gpu_storage<index_type, byte_allocator_type> results(N);
  LOG_DEBUG("Initialized functors.");
  thrust::sequence(thrust::device, input_index.begin(), input_index.end());
  // query the map for every input row; misses produce the unused element
  thrust::transform(thrust::device, index, index + N, results.begin(),
                    find_functor);

  // compact: keep only the (query row, found value) pairs that hit
  size_type const number_of_valid =
      thrust::remove_if(thrust::device,
                        thrust::make_zip_iterator(thrust::make_tuple(
                            input_index.begin(), results.begin())),
                        thrust::make_zip_iterator(thrust::make_tuple(
                            input_index.end(), results.end())),
                        invalid_functor) -
      thrust::make_zip_iterator(
          thrust::make_tuple(input_index.begin(), results.begin()));
  LOG_DEBUG("Number of valid", number_of_valid);
  input_index.resize(number_of_valid);
  results.resize(number_of_valid);

  return std::make_pair(input_index, results);
}

namespace detail {

// Gather the rows selected by src_valid_row_index into dst_coordinates,
// copying the batch index (column 0) verbatim and snapping every spatial
// column down to the corresponding stride.
//
// Dynamic shared memory: (coordinate_size - 1) * sizeof(size_type) bytes for
// the cached stride table.
template <typename coordinate_type, typename index_type, typename size_type>
__global__ void
stride_copy(coordinate_type const *__restrict__ src_coordinates, //
            index_type const *__restrict__ src_valid_row_index,  //
            index_type const *__restrict__ stride,               //
            coordinate_type *__restrict__ dst_coordinates,       //
            size_type const num_threads, size_type const coordinate_size) {
  extern __shared__ size_type sh_stride[];

  auto const tx = threadIdx.x;
  auto const bx = blockIdx.x;
  auto const x = blockDim.x * bx + tx;

  // cache the spatial strides (columns 1..D-1)
  for (index_type i = tx; i < coordinate_size - 1; i += blockDim.x)
    sh_stride[i] = stride[i];

  __syncthreads();

  if (x < num_threads) {
    const index_type src_start = src_valid_row_index[x] * coordinate_size;
    const index_type dst_start = x * coordinate_size;
    // batch index is copied unchanged
    dst_coordinates[dst_start] = src_coordinates[src_start];
    for (index_type j = 1; j < coordinate_size; ++j) {
      // floor-divide then re-multiply to snap to the stride grid.
      // NOTE(review): round-down float intrinsics on integer coordinates —
      // exact only while |coordinate| fits in a float's 24-bit mantissa.
      dst_coordinates[dst_start + j] =
          (__float2int_rd(
              __fdiv_rd(src_coordinates[src_start + j], sh_stride[j - 1]))) *
          sh_stride[j - 1];
      // (__double2int_rd(
      //     __ddiv_rn(src_coordinates[src_start + j], sh_stride[j - 1]))) *
      //     sh_stride[j - 1];
    }
  }
}

} // namespace detail

/*
 * @brief given a key iterator begin-end pair find all valid keys and its
 * index.
 *
 * @return a pair of (valid index, query value) vectors.
*/
template <typename coordinate_type,
          template <typename T> class TemplatedAllocator>
CoordinateMapGPU<coordinate_type, TemplatedAllocator>
CoordinateMapGPU<coordinate_type, TemplatedAllocator>::stride(
    stride_type const &stride) const {
  // Over estimate the reserve size to be size();
  size_type const N = size();
  LOG_DEBUG("Strided map with kernel stride:", stride);

  self_type stride_map(
      N, m_coordinate_size, m_hashtable_occupancy,
      detail::stride_tensor_stride(base_type::m_tensor_stride, stride),
      m_map_allocator, base_type::m_byte_allocator);

  index_storage_type out_device_tensor_stride(stride_map.get_tensor_stride());

  // stride coordinates
  size_type const num_threads = N;
  auto const num_blocks = GET_BLOCKS(num_threads, CUDA_NUM_THREADS);

  detail::stride_copy<coordinate_type, index_type, size_type>
      <<<num_blocks, CUDA_NUM_THREADS,
         m_coordinate_size * sizeof(size_type)>>>(
          const_coordinate_data(),           //
          m_valid_row_index.cbegin(),        //
          out_device_tensor_stride.cbegin(), //
          stride_map.coordinate_data(),      //
          num_threads, m_coordinate_size);
  LOG_DEBUG("Stride copy done.");

  auto &stride_valid_row_index = stride_map.m_valid_row_index;
  auto &stride_valid_map_index = stride_map.m_valid_map_index;
  stride_valid_row_index.resize(N); // row indices
  stride_valid_map_index.resize(N); // map offset

  // Insert coordinates
  index_type const unused_key = std::numeric_limits<index_type>::max();
  LOG_DEBUG("unused_key", unused_key);
  detail::insert_and_map_kernel<coordinate_type, size_type, index_type,
                                map_type>
      <<<num_blocks, CUDA_NUM_THREADS>>>(*stride_map.m_map,                  //
                                         stride_map.const_coordinate_data(), //
                                         stride_valid_map_index.data(),      //
                                         stride_valid_row_index.data(),      //
                                         num_threads, m_coordinate_size,
                                         unused_key);
  CUDA_CHECK(cudaStreamSynchronize(0));
  LOG_DEBUG("Stride map insertion complete");

  // Compact out failed (duplicate) insertions.
  auto valid_begin = thrust::make_zip_iterator(
      thrust::make_tuple(stride_valid_map_index.begin(), //
                         stride_valid_row_index.begin()));
  size_type const number_of_valid =
      thrust::remove_if(thrust::device, //
                        valid_begin,    //
                        thrust::make_zip_iterator(
                            thrust::make_tuple(stride_valid_map_index.end(), //
                                               stride_valid_row_index.end())),
                        detail::is_first<index_type>(unused_key)) -
      valid_begin;
  stride_valid_row_index.resize(number_of_valid);
  stride_valid_map_index.resize(number_of_valid);
  stride_map.m_size = number_of_valid;
  LOG_DEBUG("Reduced to", number_of_valid);

  // remap values to the compacted positions
  thrust::counting_iterator<index_type> count_begin{0};
  thrust::for_each(count_begin, count_begin + number_of_valid,
                   detail::update_value_with_offset<index_type, map_type>{
                       *stride_map.m_map, stride_map.m_valid_map_index.data()});

  LOG_DEBUG("Stride remap done");
  return stride_map;
}

namespace detail {

// True iff every spatial component of `point` is a multiple of the
// corresponding output tensor stride (column 0 is the batch index and is
// ignored).
template <typename coordinate_type, typename index_type>
__device__ bool is_coordinate_aligned(coordinate_type *point,
                                      index_type *out_tensor_stride,
                                      uint32_t const size) {
  for (uint32_t i = 0; i < size - 1; ++i) {
    if (point[i + 1] % out_tensor_stride[i] != 0)
      return false;
  }
  return true;
}

// For every input row, enumerate the kernel region around it and insert each
// generated coordinate into out_map.  In transpose (generative) mode every
// region coordinate is emitted; otherwise only coordinates aligned to the
// output tensor stride are inserted.  Failed/skipped slots are tagged with
// unused_key for later compaction.
//
// Dynamic shared memory layout (bytes):
//   4 * coordinate_size * sizeof(size_type)                 region params
//   + blockDim.x * coordinate_size * sizeof(coordinate_type) per-thread tmp
template <typename coordinate_type, typename size_type, typename index_type,
          typename map_type>
__global__ void kernel_region_insert(
    size_type const num_threads,                                //
    map_type __restrict__ out_map,                              //
    coordinate_type const *const __restrict__ p_in_coordinates, //
    index_type const *const __restrict__ in_valid_row_index,    //
    coordinate_type *__restrict__ p_out_coordinates,            //
    index_type *__restrict__ out_valid_row_index,               //
    index_type *__restrict__ out_valid_map_index,               //
    gpu_kernel_region<coordinate_type> kernel,                  //
    size_type const *const __restrict__ out_tensor_stride,      //
    index_type const unused_key) {                              //
  extern __shared__ coordinate_type sh_all[];

  auto const tx = threadIdx.x;
  auto const bx = blockIdx.x;
  auto const x = blockDim.x * bx + tx;

  size_type const coordinate_size = kernel.coordinate_size();
  size_type const volume = kernel.volume();

  // clang-format off
  size_type *sh_size = reinterpret_cast<size_type *>(sh_all);

  size_type *sh_tensor_stride     = sh_size;
  size_type *sh_kernel_size       = sh_tensor_stride + coordinate_size;
  size_type *sh_dilation          = sh_kernel_size + coordinate_size;
  size_type *sh_out_tensor_stride = sh_dilation + coordinate_size;

  coordinate_type *sh_coordinate  =
      reinterpret_cast<coordinate_type *>(sh_out_tensor_stride + coordinate_size);
  coordinate_type *sh_tmp = sh_coordinate + tx * coordinate_size;
  // clang-format on

  for (index_type i = tx; i < coordinate_size - 1; i += blockDim.x) {
    sh_tensor_stride[i] = kernel.tensor_stride()[i];
    sh_kernel_size[i] = kernel.kernel_size()[i];
    sh_dilation[i] = kernel.dilation()[i];
    sh_out_tensor_stride[i] = out_tensor_stride[i];
  }

  __syncthreads();

  // region object backed by the shared-memory parameter tables
  auto sh_kernel = gpu_kernel_region<coordinate_type>(
      kernel, sh_tensor_stride, sh_kernel_size, sh_dilation);

  coordinate<coordinate_type> curr_coordinate(sh_tmp);

  if (x < num_threads) {
    // iterate over values
    index_type out_index = x * volume;
    // set bounds for the valid keys
    for (uint32_t kernel_ind = 0; kernel_ind < volume; ++kernel_ind) {
      sh_kernel.coordinate_at(
          kernel_ind,
          &p_in_coordinates[in_valid_row_index[x] * coordinate_size], sh_tmp);

      // Creating generative conv transpose
      if (kernel.is_transpose()) {
        // initialize out coordinate
        for (uint32_t i = 0; i < coordinate_size; ++i)
          p_out_coordinates[out_index * coordinate_size + i] =
              curr_coordinate[i];

        auto const result = out_map.insert(thrust::make_pair(
            coordinate<coordinate_type>{
                &p_out_coordinates[out_index * coordinate_size]},
            out_index));

        if (result.second) {
          // row index in the out_coordinates
          out_valid_row_index[out_index] = out_index;
          // offset in the coordinate map
          out_valid_map_index[out_index] = result.first.offset();
        } else {
          out_valid_row_index[out_index] = unused_key;
        }
        ++out_index;
      } else {
        // skip if the coordinate is not aligned
        if (!is_coordinate_aligned(sh_tmp, sh_out_tensor_stride,
                                   coordinate_size)) {
          out_valid_row_index[out_index] = unused_key;
          ++out_index;
        } else {
          // initialize out coordinate
          for (uint32_t i = 0; i < coordinate_size; ++i)
            p_out_coordinates[out_index * coordinate_size + i] =
                curr_coordinate[i];

          auto const result = out_map.insert(thrust::make_pair(
              coordinate<coordinate_type>{
                  &p_out_coordinates[out_index * coordinate_size]},
              out_index));

          if (result.second) {
            // row index in the out_coordinates
            out_valid_row_index[out_index] = out_index;
            // offset in the coordinate map
            out_valid_map_index[out_index] = result.first.offset();
          } else {
            out_valid_row_index[out_index] = unused_key;
          }
          ++out_index;
        }
      }
    }
  }
}

} // namespace detail

/*
 * @brief generate a region strided coordinate map
 *
 * @return a gpu_coordinate_map
 */
template <typename coordinate_type,
          template <typename T> class TemplatedAllocator>
CoordinateMapGPU<coordinate_type, TemplatedAllocator>
CoordinateMapGPU<coordinate_type, TemplatedAllocator>::stride_region(
    cpu_kernel_region<coordinate_type> &kernel,
    stride_type const &out_tensor_stride) const {
  ASSERT(m_coordinate_size == kernel.coordinate_size(),
         "Invalid kernel coordinate_size");
  gpu_kernel_region<coordinate_type> gpu_kernel(kernel.to_gpu());

  // Over estimate the reserve size to be size();
  size_type const N_in = size();
  size_type const N_out = N_in * kernel.volume();
  LOG_DEBUG("Stride region out tensor stride:", out_tensor_stride,
            "with capacity:", N_out);

  self_type stride_map(N_out, m_coordinate_size, m_hashtable_occupancy,
                       out_tensor_stride, m_map_allocator,
                       base_type::m_byte_allocator);

  index_storage_type d_out_tensor_stride(out_tensor_stride);

  auto &out_valid_row_index = stride_map.m_valid_row_index;
  auto &out_valid_map_index = stride_map.m_valid_map_index;
  out_valid_row_index.resize(N_out);
  out_valid_map_index.resize(N_out);

  index_type const unused_key = std::numeric_limits<index_type>::max();
  // (THREAD * D + 4 * D) * sizeof(size_type/coordinate_type):
  // region parameter tables + one temporary coordinate per thread
  uint32_t const shared_memory_size_in_bytes =
      4 * m_coordinate_size * sizeof(index_type) + // stride, kernel, dilation
      CUDA_NUM_THREADS * m_coordinate_size * sizeof(coordinate_type); // tmp

  detail::kernel_region_insert<coordinate_type, size_type, index_type,
                               map_type>
      <<<GET_BLOCKS(N_in, CUDA_NUM_THREADS), CUDA_NUM_THREADS,
         shared_memory_size_in_bytes>>>(N_in,                         //
                                        *stride_map.m_map,            //
                                        const_coordinate_data(),      //
                                        m_valid_row_index.cbegin(),   //
                                        stride_map.coordinate_data(), //
                                        out_valid_row_index.data(),   //
                                        out_valid_map_index.data(),   //
                                        gpu_kernel,                   //
                                        d_out_tensor_stride.cbegin(), //
                                        unused_key);                  //
  CUDA_CHECK(cudaStreamSynchronize(0));
  LOG_DEBUG("kernel_region_insert done");
  // LOG_DEBUG("valid row index", out_valid_row_index);
  // LOG_DEBUG("valid map offset", out_valid_map_index);

  // remove unused_keys
  auto valid_begin = thrust::make_zip_iterator(
      thrust::make_tuple(out_valid_row_index.begin(), //
                         out_valid_map_index.begin()));
  size_type const number_of_valid =
      thrust::remove_if(thrust::device, //
                        valid_begin,    //
                        thrust::make_zip_iterator(
                            thrust::make_tuple(out_valid_row_index.end(), //
                                               out_valid_map_index.end())),
                        detail::is_first<index_type>(unused_key)) -
      valid_begin;
  out_valid_row_index.resize(number_of_valid);
  out_valid_map_index.resize(number_of_valid);
  stride_map.m_size = number_of_valid;
  LOG_DEBUG("Reduced to", number_of_valid);

  // remap values
  thrust::counting_iterator<index_type> count_begin{0};
  thrust::for_each(count_begin, count_begin + number_of_valid,
                   detail::update_value_with_offset<index_type, map_type>{
                       *stride_map.m_map, out_valid_map_index.data()});

  LOG_DEBUG("Stride remap done");
  return stride_map;
}

namespace detail {

// Copy a single column between a strided coordinate matrix and a contiguous
// vector, gathering rows through src_valid_row_index.
//   stride_src == true : dst[x] = src[valid[x] * D]  (gather column 0)
//   stride_src == false: dst[x * D] = src[valid[x]]  (scatter into column 0)
template <typename dst_coordinate_type, typename src_coordinate_type,
          typename size_type, typename index_type, bool stride_src>
__global__ void copy_column_with_valid(
    dst_coordinate_type *__restrict__ dst_coordinates,       //
    size_type const num_threads,                             //
    src_coordinate_type const *__restrict__ src_coordinates, //
    index_type const *__restrict__ src_valid_row_index,      //
    size_type const coordinate_size) {
  auto const tx = threadIdx.x;
  auto const bx = blockIdx.x;
  auto const x = blockDim.x * bx + tx;

  if (x < num_threads) {
    if (stride_src)
      dst_coordinates[x] =
          src_coordinates[src_valid_row_index[x] * coordinate_size];
    else
      dst_coordinates[x * coordinate_size] =
          src_coordinates[src_valid_row_index[x]];
  }
}

// Same as copy_column_with_valid, but with an identity row mapping.
template <typename dst_coordinate_type, typename src_coordinate_type,
          typename size_type, bool stride_src>
__global__ void
copy_column(dst_coordinate_type *__restrict__ dst_coordinates,       //
            size_type const num_threads,                             //
            src_coordinate_type const *__restrict__ src_coordinates, //
            size_type const coordinate_size) {
  auto const tx = threadIdx.x;
  auto const bx = blockIdx.x;
  auto const x = blockDim.x * bx + tx;

  if (x < num_threads) {
    if (stride_src)
      dst_coordinates[x] = src_coordinates[x * coordinate_size];
    else
      dst_coordinates[x * coordinate_size] = src_coordinates[x];
  }
}

} // namespace detail

// Build the batch-wise "origin" map: one entry per unique batch index with
// all spatial components zeroed.
template <typename coordinate_type,
          template <typename T> class TemplatedAllocator>
CoordinateMapGPU<coordinate_type, TemplatedAllocator>
CoordinateMapGPU<coordinate_type, TemplatedAllocator>::origin() const {
  size_type const N = size();
  LOG_DEBUG("Origin map from in map size:", N);

  // tensor stride is set to {0,..., 0} for the origin map.
stride_type origin_tensor_stride(m_coordinate_size - 1);
  std::for_each(origin_tensor_stride.begin(), origin_tensor_stride.end(),
                [](auto &i) { i = 0; });

  // thrust unique for unique batch index
  coordinate_type *d_batch_indices = reinterpret_cast<coordinate_type *>(
      m_byte_allocator.allocate(N * sizeof(coordinate_type)));
  // gather column 0 (batch index) of every valid row into a dense vector
  detail::copy_column_with_valid<coordinate_type, coordinate_type, size_type,
                                 index_type, true>
      <<<GET_BLOCKS(N, CUDA_NUM_THREADS), CUDA_NUM_THREADS>>>(
          d_batch_indices, N, const_coordinate_data(),
          m_valid_row_index.cbegin(), m_coordinate_size);
#ifdef DEBUG
  CUDA_CHECK(cudaStreamSynchronize(0));
  LOG_DEBUG("copied batch indices");
#endif

  // Sort and unique
  thrust::sort(thrust::device, d_batch_indices, d_batch_indices + N);
#ifdef DEBUG
  CUDA_CHECK(cudaStreamSynchronize(0));
  LOG_DEBUG("sorted batch indices");
#endif
  auto d_batch_indices_end =
      thrust::unique(thrust::device, d_batch_indices, d_batch_indices + N);
  size_type const N_unique = d_batch_indices_end - d_batch_indices;
#ifdef DEBUG
  size_t Nsize = std::min<size_type>(N_unique, 100);
  std::vector<coordinate_type> tmp(Nsize);
  CUDA_CHECK(cudaMemcpy(tmp.data(), d_batch_indices,
                        Nsize * sizeof(coordinate_type),
                        cudaMemcpyDeviceToHost));
  LOG_DEBUG("sort and unique batch", tmp);
  CUDA_CHECK(cudaStreamSynchronize(0));
  LOG_DEBUG("unique done");
#endif

  // Create origin map
  LOG_DEBUG("Origin map with size:", N_unique,
            " tensor stride:", origin_tensor_stride);
  self_type origin_map(N_unique, m_coordinate_size, m_hashtable_occupancy,
                       origin_tensor_stride, m_map_allocator,
                       base_type::m_byte_allocator);
  // zero all coordinates, then scatter the unique batch indices into column 0
  CUDA_CHECK(
      cudaMemset(origin_map.coordinate_data(), 0,
                 N_unique * m_coordinate_size * sizeof(coordinate_type)));
  detail::copy_column<coordinate_type, coordinate_type, size_type, false>
      <<<GET_BLOCKS(N_unique, CUDA_NUM_THREADS), CUDA_NUM_THREADS>>>(
          origin_map.coordinate_data(), N_unique, d_batch_indices,
          m_coordinate_size);
#ifdef DEBUG
  CUDA_CHECK(cudaStreamSynchronize(0));
  LOG_DEBUG("copied batch indices to the origin_map");
#endif

  auto &origin_valid_row_index = origin_map.m_valid_row_index;
  auto &origin_valid_map_index = origin_map.m_valid_map_index;
  origin_valid_row_index.resize(N_unique);
  origin_valid_map_index.resize(N_unique);
  origin_map.m_size = N_unique;

  // Insert coordinates
  auto insert =
      detail::insert_coordinate<coordinate_type, map_type, index_type *>{
          *origin_map.m_map,                  // map
          origin_map.const_coordinate_data(), // coordinates,
          origin_valid_row_index.data(),      // valid row
          origin_valid_map_index.data(),      // iter offset
          m_coordinate_size};
  thrust::counting_iterator<uint32_t> count_begin{0};
  thrust::for_each(thrust::device, count_begin, count_begin + N_unique, insert);
#ifdef DEBUG
  CUDA_CHECK(cudaStreamSynchronize(0));
  LOG_DEBUG("origin map insertion");
#endif

  m_byte_allocator.deallocate((char *)d_batch_indices,
                              N * sizeof(coordinate_type));

  return origin_map;
}

// Field-map variant of origin(): extract the unique batch indices (column 0
// of the float field coordinates) and build an integer origin map with
// tensor stride {0, ..., 0}.
template <typename coordinate_field_type, typename coordinate_int_type,
          template <typename T> class TemplatedAllocator>
CoordinateMapGPU<coordinate_int_type, TemplatedAllocator>
CoordinateFieldMapGPU<coordinate_field_type, coordinate_int_type,
                      TemplatedAllocator>::origin() const {
  size_type const N = size();
  LOG_DEBUG("Origin map from in map size:", N);

  // tensor stride is set to {0,..., 0} for the origin map.
  stride_type origin_tensor_stride(m_coordinate_size - 1);
  std::for_each(origin_tensor_stride.begin(), origin_tensor_stride.end(),
                [](auto &i) { i = 0; });

  // thrust unique for unique batch index
  coordinate_int_type *d_batch_indices =
      reinterpret_cast<coordinate_int_type *>(
          m_byte_allocator.allocate(N * sizeof(coordinate_int_type)));
  // gather column 0 (implicit float -> int conversion on assignment)
  detail::copy_column<coordinate_int_type, coordinate_field_type, size_type,
                      true>
      <<<GET_BLOCKS(N, CUDA_NUM_THREADS), CUDA_NUM_THREADS>>>(
          d_batch_indices, N, const_coordinate_data(), m_coordinate_size);

  // Sort and unique
  thrust::sort(thrust::device, d_batch_indices, d_batch_indices + N);
  auto d_batch_indices_end =
      thrust::unique(thrust::device, d_batch_indices, d_batch_indices + N);
  size_type const N_unique = d_batch_indices_end - d_batch_indices;

  // Create origin map (50 = default hashtable occupancy)
  LOG_DEBUG("Origin map with size:", N_unique,
            " tensor stride:", origin_tensor_stride);
  CoordinateMapGPU<coordinate_int_type, TemplatedAllocator> origin_map(
      N_unique, m_coordinate_size, 50, origin_tensor_stride);
  CUDA_CHECK(
      cudaMemset(origin_map.coordinate_data(), 0,
                 N_unique * m_coordinate_size * sizeof(coordinate_int_type)));
  detail::copy_column<coordinate_int_type, coordinate_int_type, size_type,
                      false>
      <<<GET_BLOCKS(N_unique, CUDA_NUM_THREADS), CUDA_NUM_THREADS>>>(
          origin_map.coordinate_data(), N_unique, d_batch_indices,
          m_coordinate_size);

  // FIX(review): deallocate size previously used sizeof(coordinate_type)
  // although the buffer was allocated with sizeof(coordinate_int_type);
  // the sizes must match for the byte allocator.
  m_byte_allocator.deallocate((char *)d_batch_indices,
                              N * sizeof(coordinate_int_type));

  origin_map.initialize_valid_indices(N_unique);

  return origin_map;
}

namespace detail {

// For each field coordinate, look up its batch row in origin_map and emit
// (in, out, kernel) triples for kernel_map_type::decompose().
//
// Dynamic shared memory:
//   blockDim.x * coordinate_size * sizeof(coordinate_int_type) bytes
// (one zeroed temporary query coordinate per thread; only column 0 is set).
template <typename coordinate_field_type, typename coordinate_int_type,
          typename size_type, typename index_type, typename map_type>
__global__ void origin_field_map_kernel(
    size_type const num_threads,                              //
    coordinate_field_type const *__restrict__ d_field_coords, //
    map_type const __restrict__ origin_map,                   //
    index_type *__restrict__ p_in_maps,                       //
    index_type *__restrict__ p_out_maps,                      //
    index_type *__restrict__ p_kernels,                       //
    size_type const coordinate_size) {
  extern __shared__ coordinate_int_type sh_all[];

  auto const tx = threadIdx.x;
  auto const bx = blockIdx.x;
  auto const x = blockDim.x * bx + tx;

  // clang-format off
  coordinate_int_type *sh_tmp = sh_all + tx * coordinate_size;
  // clang-format on

  if (x < num_threads)
    for (index_type i = 0; i < coordinate_size; ++i)
      sh_tmp[i] = 0;

  __syncthreads();

  if (x < num_threads) {
    // only the batch index matters; the spatial part of the query stays zero
    sh_tmp[0] =
        coordinate_int_type(lroundf(d_field_coords[x * coordinate_size]));
    auto origin_iter = origin_map.find(coordinate<coordinate_int_type>(sh_tmp));
    auto out_index = origin_iter->second;
    p_in_maps[x] = x;
    p_out_maps[x] = out_index; // origin_map row index
    // For kernel_map decompose()
    p_kernels[x] = out_index;
  }
}

} // namespace detail

// Build the kernel map that sends every field coordinate to its batch row in
// `origin_map` (which must have an all-zero tensor stride).
template <typename coordinate_field_type, typename coordinate_int_type,
          template <typename T> class TemplatedAllocator>
typename CoordinateFieldMapGPU<coordinate_field_type, coordinate_int_type,
                               TemplatedAllocator>::kernel_map_type
CoordinateFieldMapGPU<coordinate_field_type, coordinate_int_type,
                      TemplatedAllocator>::
    origin_map(CoordinateMapGPU<coordinate_int_type, TemplatedAllocator> const
                   &origin_map,
               uint32_t thread_dim) const {
  ASSERT(std::all_of(origin_map.get_tensor_stride().begin(),
                     origin_map.get_tensor_stride().end(),
                     [](auto const &i) { return i == 0; }),
         "Invalid origin tensor stride", origin_map.get_tensor_stride());

  // reserve size();
  size_type const in_size = size();
  LOG_DEBUG("in_map size:", in_size, "origin_map size:", origin_map.size());
  // (THREAD * D) * sizeof(coordinate_int_type) — per-thread tmp coordinate
  uint32_t const shared_memory_size_in_bytes =
      thread_dim * m_coordinate_size * sizeof(coordinate_int_type); // tmp
  size_type const num_threads = in_size;
  auto const num_blocks = GET_BLOCKS(num_threads, thread_dim);

  LOG_DEBUG("origin_map num block", num_blocks);
  LOG_DEBUG("origin_map shared_memory size", shared_memory_size_in_bytes);
  LOG_DEBUG("origin_map threads dim", thread_dim);
  LOG_DEBUG("origin_map num threads", num_threads);

  kernel_map_type kernel_map(in_size, base_type::m_byte_allocator);
  CUDA_CHECK(cudaStreamSynchronize(0));
  LOG_DEBUG("Allocated kernel_map.");

  // map_type is deduced from the const_hash_map() argument
  detail::origin_field_map_kernel<coordinate_field_type, coordinate_int_type,
                                  size_type, index_type>
      <<<num_blocks, thread_dim, shared_memory_size_in_bytes>>>(
          num_threads,                 //
          const_coordinate_data(),     //
          origin_map.const_hash_map(), //
          kernel_map.in_maps.begin(),  //
          kernel_map.out_maps.begin(), //
          kernel_map.kernels.begin(),  //
          m_coordinate_size);

  CUDA_CHECK(cudaStreamSynchronize(0));
  THRUST_CHECK(kernel_map.decompose());
  LOG_DEBUG("origin map decomposed");

  return kernel_map;
}

namespace detail {

// Compact-copy kernel for prune(): for every kept row, copy its coordinate
// into the output row given by the exclusive scan of the keep flags and
// insert it into out_map (value fixed up by the remap kernel afterwards).
// Dropped or duplicate rows get unused_map_offset for later compaction.
template <typename coordinate_type, typename size_type, typename index_type,
          typename map_type>
__global__ void prune_copy_and_insert(
    size_type const num_threads,                              //
    size_type const coordinate_size,                          //
    index_type const unused_map_offset,                       //
    index_type const *const __restrict__ in_valid_row_index,  //
    coordinate_type const *const __restrict__ in_coordinates, //
    bool const *const __restrict__ keep_begin,                //
    index_type const *const __restrict__ inclusive_scan_keep, //
    map_type __restrict__ out_map,                            //
    coordinate_type *__restrict__ out_coordinates,            //
    index_type *__restrict__ out_valid_row_index,             //
    index_type *__restrict__ out_valid_map_offset             //
) {
  auto const tx = threadIdx.x;
  auto const bx = blockIdx.x;
  auto const x = blockDim.x * bx + tx;
  if (x < num_threads) {
    if (!keep_begin[x]) {
      out_valid_map_offset[x] = unused_map_offset;
    } else {
      // If keep, the output row is the exclusive scan value
      // (inclusive scan shifted down by one).
      auto out_row_index = (x < 1) ?
0 : inclusive_scan_keep[x - 1];
      coordinate_type const *curr_in_coord =
          &in_coordinates[in_valid_row_index[x] * coordinate_size];
      coordinate_type *curr_out_coord =
          &out_coordinates[out_row_index * coordinate_size];
      for (index_type i = 0; i < coordinate_size; ++i)
        curr_out_coord[i] = curr_in_coord[i];

      // insert to the out_map
      auto coord = coordinate<coordinate_type>{curr_out_coord};
      // remap the value in the next kernel call
      auto result = out_map.insert(thrust::make_pair(coord, 0));
      out_valid_row_index[x] = out_row_index;
      if (result.second)
        out_valid_map_offset[x] = result.first.offset();
      else
        out_valid_map_offset[x] = unused_map_offset;
    }
  }
}

// Rewrite each surviving map entry's stored value to its compacted index x.
template <typename size_type, typename index_type, typename map_type>
__global__ void remap(size_type const num_threads,                  //
                      map_type const __restrict__ out_map,          //
                      index_type *__restrict__ out_valid_map_offset //
) {
  auto const tx = threadIdx.x;
  auto const bx = blockIdx.x;
  auto const x = blockDim.x * bx + tx;
  if (x < num_threads) {
    auto &pair = out_map.data()[out_valid_map_offset[x]];
    pair.second = x;
  }
}

// Element-wise converting copy (e.g. bool -> index_type before a scan).
template <typename Dtype, typename Stype>
__global__ void typed_copy(uint32_t const num_threads,   //
                           Dtype *__restrict__ dst,      //
                           Stype const *__restrict__ src //
) {
  auto const tx = threadIdx.x;
  auto const bx = blockIdx.x;
  auto const x = blockDim.x * bx + tx;
  if (x < num_threads) {
    dst[x] = src[x];
  }
}

} // namespace detail

// Return a new map containing only the rows whose keep flag is true.
template <typename coordinate_type,
          template <typename T> class TemplatedAllocator>
CoordinateMapGPU<coordinate_type, TemplatedAllocator>
CoordinateMapGPU<coordinate_type, TemplatedAllocator>::prune(
    bool const *keep_begin, bool const *keep_end) const {
  size_type const N = size();
  ASSERT(N == keep_end - keep_begin, "Invalid keep size");
  LOG_DEBUG("Prune size:", N);

  // exclusive sum for coordinate copy.
  auto const inclusive_scan_size = N * sizeof(index_type);
  index_type *d_inclusive_scan =
      (index_type *)m_byte_allocator.allocate(inclusive_scan_size);
  // bool -> index_type
  detail::typed_copy<<<GET_BLOCKS(N, CUDA_NUM_THREADS), CUDA_NUM_THREADS>>>(
      N, d_inclusive_scan, keep_begin);
  CUDA_CHECK(cudaStreamSynchronize(0));
  thrust::inclusive_scan(thrust::device, d_inclusive_scan, d_inclusive_scan + N,
                         d_inclusive_scan);
  // last scan element == number of kept rows
  index_type N_pruned;
  CUDA_CHECK(cudaMemcpy(&N_pruned, d_inclusive_scan + N - 1,
                        sizeof(index_type), cudaMemcpyDeviceToHost));
  LOG_DEBUG("Pruned N:", N_pruned);

  // create a coordinate_map
  self_type pruned_map(N, m_coordinate_size, m_hashtable_occupancy,
                       base_type::m_tensor_stride, m_map_allocator,
                       base_type::m_byte_allocator);

  // Copy and insert kernel that first checks keep[i] is true and insert at
  // inclusive_scan[i - 1].
  auto &out_valid_map_offset = pruned_map.m_valid_map_index;
  auto &out_valid_row_index = pruned_map.m_valid_row_index;
  out_valid_map_offset.resize(N);
  out_valid_row_index.resize(N);

  index_type const unused_map_offset = std::numeric_limits<index_type>::max();
  detail::prune_copy_and_insert<coordinate_type, size_type, index_type,
                                map_type>
      <<<GET_BLOCKS(N, CUDA_NUM_THREADS), CUDA_NUM_THREADS>>>(
          N, m_coordinate_size, unused_map_offset, m_valid_row_index.cbegin(),
          const_coordinate_data(), keep_begin, d_inclusive_scan,
          *(pruned_map.m_map), pruned_map.coordinate_data(),
          out_valid_row_index.data(), out_valid_map_offset.data());
  CUDA_CHECK(cudaStreamSynchronize(0));
  LOG_DEBUG("Pruned hash map size:", pruned_map.size());

  // Remove not inserted rows
  auto valid_begin = thrust::make_zip_iterator(thrust::make_tuple(
      out_valid_map_offset.begin(), out_valid_row_index.begin()));
  size_type const number_of_valid =
      thrust::remove_if(thrust::device, valid_begin,
                        thrust::make_zip_iterator(thrust::make_tuple(
                            out_valid_map_offset.end(),
                            out_valid_row_index.end())),
                        detail::is_first<index_type>(unused_map_offset)) -
      valid_begin;
  LOG_DEBUG("number of valid rows:", number_of_valid);
  out_valid_map_offset.resize(number_of_valid);
  out_valid_row_index.resize(number_of_valid);
  pruned_map.m_size = number_of_valid;

  // remap the final map values
  detail::remap<size_type, index_type, map_type>
      <<<GET_BLOCKS(number_of_valid, CUDA_NUM_THREADS), CUDA_NUM_THREADS>>>(
          number_of_valid, *(pruned_map.m_map), out_valid_map_offset.data());
  CUDA_CHECK(cudaStreamSynchronize(0));

  m_byte_allocator.deallocate((char *)d_inclusive_scan, inclusive_scan_size);

  return pruned_map;
}

// Merge
namespace detail {

// Scatter coordinates stored in the hash map back into a row-major matrix,
// using each entry's stored value as the destination row.
template <typename coordinate_type, typename size_type, typename index_type,
          typename map_type>
__global__ void
copy_coordinates_by_offset(map_type __restrict__ map,                  //
                           coordinate_type *__restrict__ coordinates,  //
                           index_type const *__restrict__ map_offsets, //
                           size_type const num_threads,                //
                           size_type const coordinate_size             //
) {
  auto const tx = threadIdx.x;
  auto const bx = blockIdx.x;
  auto const x = blockDim.x * bx + tx;
  if (x < num_threads) {
    typename map_type::value_type const *p_value = map.data() + map_offsets[x];
    // Compute Capabilities 3.5 or newer
    coordinate_type *dst_coordinate =
        coordinates + p_value->second * coordinate_size;
    for (index_type i = 0; i < coordinate_size; ++i)
      dst_coordinate[i] = p_value->first[i];
  }
}

// Gather valid rows into a dense matrix; one thread per scalar
// (num_threads = number_of_rows * coordinate_size).
template <typename coordinate_type, typename size_type, typename index_type>
__global__ void copy_coordinates_by_valid_row(
    // map_type __restrict__ map,                           //
    coordinate_type const *__restrict__ in_coordinates, //
    coordinate_type *__restrict__ out_coordinates,      //
    index_type const *__restrict__ valid_row,           //
    size_type const num_threads,                        //
    size_type const coordinate_size                     //
) {
  auto const tx = threadIdx.x;
  auto const bx = blockIdx.x;
  auto const x = blockDim.x * bx + tx;
  if (x < num_threads) {
    // Compute Capabilities 3.5 or newer
    index_type const row_index = x / coordinate_size;
    index_type const col_index = x % coordinate_size;
    out_coordinates[row_index * coordinate_size + col_index] =
        in_coordinates[valid_row[row_index] * coordinate_size + col_index];
  }
}

// Like insert_and_map_kernel, but offsets the stored row value by
// coordinate_row_offset (used when merging several maps into one).
template <typename coordinate_type, typename size_type, typename index_type,
          typename map_type>
__global__ void insert_and_map_kernel_with_offset(
    map_type __restrict__ map,                       //
    coordinate_type const *__restrict__ coordinates, //
    index_type const coordinate_row_offset,          //
    index_type *__restrict__ valid_map_index,        //
    index_type *__restrict__ valid_row_index,        //
    size_type const num_threads,                     //
    size_type const coordinate_size,                 //
    index_type const unused_key) {
  auto const tx =
// NOTE(review): tokens reproduced byte-for-byte; template parameter lists and
// launch configs were stripped upstream -- see the note earlier in this file.
// This span holds: the body of insert_and_map_kernel_with_offset, the host-side
// merge() method, and the kernel-map kernels count_kernel /
// preallocated_kernel_map_iteration / direct_in_out_map / direct_kernel_map.
threadIdx.x; auto const bx = blockIdx.x; auto const x = blockDim.x * bx + tx; if (x < num_threads) { // m_map.insert(pair); // Returns pair auto const result = map.insert(thrust::make_pair( coordinate{&coordinates[x * coordinate_size]}, x)); if (result.second) { valid_row_index[x] = x + coordinate_row_offset; // success map index. remove failed insertion with success. valid_map_index[x] = result.first.offset(); } else { valid_map_index[x] = unused_key; } } } } // namespace detail template class TemplatedAllocator> CoordinateMapGPU CoordinateMapGPU::merge( std::vector> const &maps) const { // reserve size size_t all_size = std::accumulate( maps.begin(), maps.end(), 0, [](size_t sum, const self_type &map) { return sum + map.size(); }); LOG_DEBUG("Out merge map capacity:", all_size); self_type merged_map(all_size, m_coordinate_size, m_hashtable_occupancy, base_type::m_tensor_stride, m_map_allocator, base_type::m_byte_allocator); merged_map.m_valid_row_index.resize(all_size); merged_map.m_valid_map_index.resize(all_size); // Copy valid coordinates to the merged map coordinate_type *curr_coordinates = merged_map.coordinate_data(); index_type *curr_valid_map_offset = merged_map.m_valid_map_index.data(); index_type *curr_valid_row_index = merged_map.m_valid_row_index.data(); index_type const unused_key = std::numeric_limits::max(); index_type row_offset{0}; for (self_type const &map : maps) { size_type const num_threads = map.size(); if (num_threads == 0) continue; size_type const num_blocks = GET_BLOCKS(num_threads * m_coordinate_size, CUDA_NUM_THREADS); LOG_DEBUG("Current merge map size:", num_threads); detail::copy_coordinates_by_valid_row <<>>(map.const_coordinate_data(), // curr_coordinates, // map.m_valid_row_index.cdata(), // num_threads * m_coordinate_size, // m_coordinate_size); detail::insert_and_map_kernel_with_offset <<>>(*(merged_map.m_map), curr_coordinates, // row_offset, // curr_valid_map_offset, // curr_valid_row_index, // num_threads, m_coordinate_size, 
// (tail of merge(): per-input-map pointer/offset advance, then compaction of the
// (map_index, row_index) pairs keyed on unused_key, a remap launch to rewrite map
// values to compacted indices, and return. Next: count_kernel, which stages
// tensor_stride/kernel_size/dilation into dynamic shared memory -- note the
// "coordinate_size - 1" staging loop, presumably because element 0 is the batch
// dimension; TODO confirm -- and counts, per output coordinate, how many kernel
// offsets hit an input coordinate.)
unused_key); CUDA_CHECK(cudaStreamSynchronize(0)); curr_coordinates += num_threads * m_coordinate_size; curr_valid_map_offset += num_threads; curr_valid_row_index += num_threads; row_offset += num_threads; } // Remove invalid maps auto valid_begin = thrust::make_zip_iterator( thrust::make_tuple(merged_map.m_valid_map_index.begin(), merged_map.m_valid_row_index.begin())); size_type const number_of_valid = thrust::remove_if(thrust::device, valid_begin, thrust::make_zip_iterator(thrust::make_tuple( merged_map.m_valid_map_index.end(), merged_map.m_valid_row_index.end())), detail::is_first(unused_key)) - valid_begin; // remap the final map row index and the map offset detail::remap <<>>( number_of_valid, *(merged_map.m_map), merged_map.m_valid_map_index.data()); merged_map.m_valid_row_index.resize(number_of_valid); merged_map.m_valid_map_index.resize(number_of_valid); merged_map.m_size = number_of_valid; return merged_map; } namespace detail { template __global__ void count_kernel(map_type const __restrict__ in_map, // map_type const __restrict__ out_map, // index_type const *const __restrict__ out_valid_map_index, // size_type const num_threads, // gpu_kernel_region kernel, // index_type *__restrict__ p_count_per_thread) { extern __shared__ coordinate_type sh_all[]; auto const tx = threadIdx.x; auto const bx = blockIdx.x; auto const x = blockDim.x * bx + tx; size_type const coordinate_size = kernel.coordinate_size(); size_type const volume = kernel.volume(); // clang-format off size_type *sh_size = reinterpret_cast(sh_all); size_type *sh_tensor_stride = sh_size; size_type *sh_kernel_size = sh_tensor_stride + coordinate_size; size_type *sh_dilation = sh_kernel_size + coordinate_size; coordinate_type *sh_coordinate = reinterpret_cast(sh_dilation + coordinate_size); coordinate_type *sh_tmp = sh_coordinate + tx * coordinate_size; // clang-format on auto const equal = out_map.get_key_equal(); // kernel_maps for (index_type i = tx; i < coordinate_size - 1; i += blockDim.x) { 
// (body of count_kernel: __syncthreads() after the cooperative shared-memory
// staging loop, then per-thread probing of in_map for each kernel offset of the
// output coordinate, writing the hit count. Then
// preallocated_kernel_map_iteration begins: same shared-memory staging, but it
// writes (kernel_index, in_row, out_row) triples at offsets given by the
// inclusive cumsum of the earlier per-thread counts.)
sh_tensor_stride[i] = kernel.tensor_stride()[i]; sh_kernel_size[i] = kernel.kernel_size()[i]; sh_dilation[i] = kernel.dilation()[i]; } __syncthreads(); auto sh_kernel = gpu_kernel_region( kernel, sh_tensor_stride, sh_kernel_size, sh_dilation); coordinate point(sh_tmp); auto const unused_key = out_map.get_unused_key(); if (x < num_threads) { size_type count = 0; typename map_type::value_type const &out_value = out_map.data()[out_valid_map_index[x]]; // valid_index guarantees that it contains a valid value if (!equal(out_value.first, unused_key)) { for (auto kernel_ind = 0; kernel_ind < volume; ++kernel_ind) { sh_kernel.coordinate_at(kernel_ind, out_value.first.data(), sh_tmp); if (in_map.find(point) != in_map.end()) { ++count; } } } p_count_per_thread[x] = count; } } template __global__ void preallocated_kernel_map_iteration( map_type const __restrict__ in_map, // map_type const __restrict__ out_map, // index_type const *const __restrict__ out_valid_map_index, // size_type const num_threads, // gpu_kernel_region kernel, // index_type const *const __restrict__ inclusive_count_cumsum_per_thread, // index_type *__restrict__ p_kernels, // index_type *__restrict__ p_in_maps, // index_type *__restrict__ p_out_maps) { extern __shared__ coordinate_type sh_all[]; auto const tx = threadIdx.x; auto const bx = blockIdx.x; auto const x = blockDim.x * bx + tx; size_type const coordinate_size = kernel.coordinate_size(); size_type const volume = kernel.volume(); // clang-format off size_type *sh_size = reinterpret_cast(sh_all); size_type *sh_tensor_stride = sh_size; size_type *sh_kernel_size = sh_tensor_stride + coordinate_size; size_type *sh_dilation = sh_kernel_size + coordinate_size; coordinate_type *sh_coordinate = reinterpret_cast(sh_dilation + coordinate_size); coordinate_type *sh_tmp = sh_coordinate + tx * coordinate_size; // clang-format on auto const equal = out_map.get_key_equal(); for (index_type i = tx; i < coordinate_size - 1; i += blockDim.x) { sh_tensor_stride[i] = 
// (body of preallocated_kernel_map_iteration continues; then direct_in_out_map,
// used for kernel_volume == 1: one thread per output entry, looks the output
// coordinate up in in_map and records the in/out rows, or unused_key on miss.
// Then the head of direct_kernel_map: one thread per (output entry, kernel
// offset) pair, same shared-memory staging pattern.)
kernel.tensor_stride()[i]; sh_kernel_size[i] = kernel.kernel_size()[i]; sh_dilation[i] = kernel.dilation()[i]; } __syncthreads(); auto sh_kernel = gpu_kernel_region( kernel, sh_tensor_stride, sh_kernel_size, sh_dilation); coordinate curr_coordinate(sh_tmp); auto const unused_key = out_map.get_unused_key(); if (x < num_threads) { // iterate over values auto kernel_map_index = (x < 1) ? 0 : inclusive_count_cumsum_per_thread[x - 1]; typename map_type::value_type const &out_value = out_map.data()[out_valid_map_index[x]]; if (!equal(out_value.first, unused_key)) { // set bounds for the valid keys for (uint32_t kernel_index = 0; kernel_index < volume; ++kernel_index) { sh_kernel.coordinate_at(kernel_index, out_value.first.data(), sh_tmp); auto const &in_result = in_map.find(curr_coordinate); if (in_result != in_map.end()) { // insert to p_kernels[kernel_map_index] = kernel_index; p_in_maps[kernel_map_index] = (*in_result).second; p_out_maps[kernel_map_index] = out_value.second; ++kernel_map_index; } } } } } template __global__ void direct_in_out_map(size_type const num_threads, // map_type const __restrict__ in_map, // map_type const __restrict__ out_map, // index_type const *const __restrict__ out_valid_map_offset, // index_type *__restrict__ p_in_maps, // index_type *__restrict__ p_out_maps, index_type const unused_key) { auto const tx = threadIdx.x; auto const bx = blockIdx.x; auto const x = blockDim.x * bx + tx; if (x < num_threads) { typename map_type::value_type const &out_value = out_map.data()[out_valid_map_offset[x]]; auto const &result = in_map.find(out_value.first); if (result != in_map.end()) { p_in_maps[x] = (*result).second; p_out_maps[x] = out_value.second; } else { p_in_maps[x] = unused_key; } } } template __global__ void direct_kernel_map(map_type const __restrict__ in_map, // map_type const __restrict__ out_map, // index_type const *const __restrict__ out_valid_map_index, // size_type const num_threads, // gpu_kernel_region kernel, // index_type 
// NOTE(review): tokens reproduced byte-for-byte; template parameter lists and
// launch configs were stripped upstream -- see the note earlier in this file.
// This span holds: the body of direct_kernel_map and most of the host-side
// kernel_map() dispatcher (kernel_volume == 1 fast path, MEMORY_EFFICIENT
// count+scan+fill path, and SPEED_OPTIMIZED one-thread-per-offset path).
*__restrict__ p_kernels, // index_type *__restrict__ p_in_maps, // index_type *__restrict__ p_out_maps, index_type const unused_map_value) { extern __shared__ coordinate_type sh_all[]; auto const tx = threadIdx.x; auto const bx = blockIdx.x; auto const x = blockDim.x * bx + tx; size_type const coordinate_size = kernel.coordinate_size(); size_type const volume = kernel.volume(); // clang-format off size_type *sh_size = reinterpret_cast(sh_all); size_type *sh_tensor_stride = sh_size; size_type *sh_kernel_size = sh_tensor_stride + coordinate_size; size_type *sh_dilation = sh_kernel_size + coordinate_size; coordinate_type *sh_coordinate = reinterpret_cast(sh_dilation + coordinate_size); coordinate_type *sh_tmp = sh_coordinate + tx * coordinate_size; // clang-format on auto const equal = out_map.get_key_equal(); for (index_type i = tx; i < coordinate_size - 1; i += blockDim.x) { sh_tensor_stride[i] = kernel.tensor_stride()[i]; sh_kernel_size[i] = kernel.kernel_size()[i]; sh_dilation[i] = kernel.dilation()[i]; } __syncthreads(); auto sh_kernel = gpu_kernel_region( kernel, sh_tensor_stride, sh_kernel_size, sh_dilation); auto const unused_key = out_map.get_unused_key(); if (x < num_threads) { // iterate over values index_type kernel_index = x % volume; typename map_type::value_type const &out_value = out_map.data()[out_valid_map_index[x / volume]]; if (!equal(out_value.first, unused_key)) { // set bounds for the valid keys // TODO: copy the curr_coordinate to sh_curr_coordinate sh_kernel.coordinate_at(kernel_index, out_value.first.data(), sh_tmp); auto const &in_result = in_map.find(coordinate(sh_tmp)); if (in_result != in_map.end()) { // insert to p_kernels[x] = kernel_index; p_in_maps[x] = (*in_result).second; p_out_maps[x] = out_value.second; } else { p_kernels[x] = unused_map_value; } } } } } // namespace detail template class TemplatedAllocator> CoordinateMapGPU::kernel_map_type CoordinateMapGPU::kernel_map( self_type const &out_map, gpu_kernel_region const &kernel, 
// (kernel_map() host method: for kernel_volume == 1 it allocates a temporary
// in/out index buffer -- note the "+ 1" gap between the two halves, commented as
// __restrict__ collision prevention -- launches direct_in_out_map, compacts with
// thrust::remove_if on unused_key, copies the survivors into a kernel_map_type,
// and frees the scratch.)
CUDAKernelMapMode::Mode kernel_map_mode, uint32_t thread_dim) const { // Over estimate the reserve size to be size(); size_type const out_size = out_map.size(); size_type const kernel_volume = kernel.volume(); ASSERT(kernel_volume > 0, "Invalid kernel"); if (kernel_volume == 1) { // directly iterate over all output first by finding all in out map. auto const N = out_size; LOG_DEBUG("out_map size:", N); index_type *in_out_map = (index_type *)base_type::m_byte_allocator.allocate( 2 * (N + 1) * sizeof(index_type)); index_type *ins = in_out_map; index_type *outs = in_out_map + N + 1; // for __restrict__ collision prevention index_type unused_key = std::numeric_limits::max(); detail::direct_in_out_map <<>>( N, *m_map, // *(out_map.m_map), // out_map.m_valid_map_index.cdata(), // ins, // in map outs, // out map unused_key); LOG_DEBUG("Direct in out map copy done"); auto begin = thrust::make_zip_iterator(thrust::make_tuple(ins, outs)); auto const valid_size = thrust::remove_if( thrust::device, begin, thrust::make_zip_iterator(thrust::make_tuple(ins + N, outs + N)), detail::is_first(unused_key)) - begin; LOG_DEBUG("Valid size:", valid_size); kernel_map_type kernel_map(valid_size, base_type::m_byte_allocator, false); CUDA_CHECK(cudaMemcpy(kernel_map.in_maps.data(), ins, valid_size * sizeof(index_type), cudaMemcpyDeviceToDevice)); CUDA_CHECK(cudaMemcpy(kernel_map.out_maps.data(), outs, valid_size * sizeof(index_type), cudaMemcpyDeviceToDevice)); base_type::m_byte_allocator.deallocate((char *)in_out_map, 2 * (N + 1) * sizeof(index_type)); LOG_DEBUG("Cleaning up"); return kernel_map; } else if (kernel_map_mode == CUDAKernelMapMode::MEMORY_EFFICIENT && kernel.region_type() != RegionType::CUSTOM) { // (THREAD * D + 3 * D) * 4 uint32_t const shared_memory_size_in_bytes = 3 * m_coordinate_size * sizeof(index_type) + // stride, kernel, dilation thread_dim * m_coordinate_size * sizeof(coordinate_type); // tmp // clang-format on size_type const num_threads = out_size; auto const 
// (MEMORY_EFFICIENT branch: count_kernel -> inclusive_scan -> read total count
// back to host -> preallocated_kernel_map_iteration fills the exact-size map,
// then decompose() and scratch deallocation.)
num_blocks = GET_BLOCKS(num_threads, thread_dim); LOG_DEBUG("num block", num_blocks); LOG_DEBUG("out_map size", out_map.size()); LOG_DEBUG("shared_memory size", shared_memory_size_in_bytes); LOG_DEBUG("threads dim", thread_dim); LOG_DEBUG("num threads", num_threads); index_type *d_p_count_per_thread = reinterpret_cast( base_type::m_byte_allocator.allocate(num_threads * sizeof(index_type))); // Initialize count per thread detail::count_kernel <<>>( *m_map, // *out_map.m_map, // out_map.m_valid_map_index.cbegin(), // num_threads, // kernel, // d_p_count_per_thread); CUDA_CHECK(cudaStreamSynchronize(0)); LOG_DEBUG("count_kernel finished"); thrust::inclusive_scan(thrust::device, d_p_count_per_thread, d_p_count_per_thread + num_threads, d_p_count_per_thread); index_type num_kernel_map; // type following the kernel map allocator CUDA_CHECK(cudaMemcpy(&num_kernel_map, d_p_count_per_thread + num_threads - 1, sizeof(index_type), cudaMemcpyDeviceToHost)); // set kernel map LOG_DEBUG("Found", num_kernel_map, "kernel map elements."); kernel_map_type kernel_map(num_kernel_map, base_type::m_byte_allocator); CUDA_CHECK(cudaStreamSynchronize(0)); LOG_DEBUG("Allocated kernel_map."); detail::preallocated_kernel_map_iteration <<>>( *m_map, // *out_map.m_map, // out_map.m_valid_map_index.cbegin(), // num_threads, // kernel, // d_p_count_per_thread, // kernel_map.kernels.begin(), // kernel_map.in_maps.begin(), // kernel_map.out_maps.begin()); CUDA_CHECK(cudaStreamSynchronize(0)); LOG_DEBUG("Preallocated kernel map done"); THRUST_CHECK(kernel_map.decompose()); base_type::m_byte_allocator.deallocate( reinterpret_cast(d_p_count_per_thread), num_threads * sizeof(index_type)); LOG_DEBUG("cudaFree"); return kernel_map; } else if (kernel_map_mode == CUDAKernelMapMode::SPEED_OPTIMIZED && kernel.region_type() != RegionType::CUSTOM) { // (THREAD * 3 * D + 3 * D) * 4 uint32_t const shared_memory_size_in_bytes = 3 * m_coordinate_size * sizeof(index_type) + // stride, kernel, dilation (thread_dim + 
// (SPEED_OPTIMIZED branch: out_size * kernel_volume threads; three "+ 1"-padded
// scratch arrays for kernel/in/out indices; direct_kernel_map launch, zip-iterator
// compaction on unused_map_value, then three device-to-device copies into the
// final kernel_map.)
(thread_dim + kernel_volume - 1) / kernel_volume) * m_coordinate_size * sizeof(coordinate_type); // tmp coordinate + current coordinate size_type const num_threads = out_size * kernel_volume; auto const num_blocks = GET_BLOCKS(num_threads, thread_dim); LOG_DEBUG("num block", num_blocks); LOG_DEBUG("out_map size", out_map.size()); LOG_DEBUG("kernel_volume", kernel_volume); LOG_DEBUG("shared_memory size", shared_memory_size_in_bytes); LOG_DEBUG("threads dim", thread_dim); LOG_DEBUG("num threads", num_threads); index_type unused_map_value = std::numeric_limits::max(); index_type *d_p_valid_in_index = reinterpret_cast(base_type::m_byte_allocator.allocate( 3 * (num_threads + 1) * sizeof(index_type))); index_type *d_p_valid_out_index = d_p_valid_in_index + num_threads + 1; index_type *d_p_valid_kernel_index = d_p_valid_out_index + num_threads + 1; // Initialize count per thread detail::direct_kernel_map <<>>( *m_map, // *out_map.m_map, // out_map.m_valid_map_index.cbegin(), // num_threads, // kernel, // d_p_valid_kernel_index, // d_p_valid_in_index, // d_p_valid_out_index, // unused_map_value); CUDA_CHECK(cudaStreamSynchronize(0)); LOG_DEBUG("direct_kernel_map finished"); auto begin = thrust::make_zip_iterator(thrust::make_tuple( d_p_valid_kernel_index, d_p_valid_in_index, d_p_valid_out_index)); auto const valid_size = thrust::remove_if(thrust::device, begin, thrust::make_zip_iterator(thrust::make_tuple( d_p_valid_kernel_index + num_threads, d_p_valid_in_index + num_threads, d_p_valid_out_index + num_threads)), detail::is_first(unused_map_value)) - begin; LOG_DEBUG("Valid size:", valid_size); kernel_map_type kernel_map(valid_size, base_type::m_byte_allocator); CUDA_CHECK(cudaMemcpy(kernel_map.kernels.data(), d_p_valid_kernel_index, valid_size * sizeof(index_type), cudaMemcpyDeviceToDevice)); CUDA_CHECK(cudaMemcpy(kernel_map.in_maps.data(), d_p_valid_in_index, valid_size * sizeof(index_type), cudaMemcpyDeviceToDevice)); CUDA_CHECK(cudaMemcpy(kernel_map.out_maps.data(), 
// NOTE(review): tokens reproduced byte-for-byte; template parameter lists and
// launch configs were stripped upstream -- see the note earlier in this file.
// This span holds: the tail of kernel_map() (SPEED_OPTIMIZED cleanup and the
// "Not implemented" fallback), stride_map_kernel + stride_map(), and
// origin_map_kernel + origin_map().
d_p_valid_out_index, valid_size * sizeof(index_type), cudaMemcpyDeviceToDevice)); THRUST_CHECK(kernel_map.decompose()); base_type::m_byte_allocator.deallocate( reinterpret_cast(d_p_valid_in_index), 3 * (num_threads + 1) * sizeof(index_type)); LOG_DEBUG("cudaFree"); return kernel_map; } else { // kernel volume == 1 ASSERT(false, "Not implemented"); } } namespace detail { template __global__ void stride_map_kernel(map_type const __restrict__ in_map, // map_type const __restrict__ out_map, // index_type const *const __restrict__ in_valid_map_index, // size_type const num_threads, // index_type const *const __restrict__ stride, // index_type *__restrict__ p_in_maps, // index_type *__restrict__ p_out_maps, size_type const coordinate_size, index_type const unused_key) { extern __shared__ coordinate_type sh_all[]; auto const tx = threadIdx.x; auto const bx = blockIdx.x; auto const x = blockDim.x * bx + tx; // clang-format off size_type *sh_size = reinterpret_cast(sh_all); size_type *sh_stride = sh_size; coordinate_type *sh_coordinate = reinterpret_cast(sh_size + coordinate_size); coordinate_type *sh_tmp = sh_coordinate + tx * coordinate_size; // clang-format on for (index_type i = tx; i < coordinate_size - 1; i += blockDim.x) { sh_stride[i] = stride[i]; } __syncthreads(); if (x >= num_threads) return; typename map_type::value_type const &in_value = in_map.data()[in_valid_map_index[x]]; sh_tmp[0] = in_value.first[0]; for (index_type j = 1; j < coordinate_size; ++j) { sh_tmp[j] = (__float2int_rd(__fdiv_rd(in_value.first[j], sh_stride[j - 1]))) * sh_stride[j - 1]; } auto out_iter = out_map.find(coordinate(sh_tmp)); if (out_iter == out_map.end()) { p_in_maps[x] = unused_key; } else { p_in_maps[x] = in_value.second; p_out_maps[x] = out_iter->second; } } } // namespace detail template class TemplatedAllocator> CoordinateMapGPU::kernel_map_type CoordinateMapGPU::stride_map( self_type const &out_map, stride_type const &out_tensor_stride, uint32_t thread_dim) const { 
// (stride_map(): one thread per input row; stride_map_kernel floors each spatial
// coordinate down to the output tensor stride -- via __fdiv_rd/__float2int_rd
// round-toward-negative-infinity intrinsics -- looks the strided coordinate up in
// out_map, and emits (in_row, out_row) pairs which are then compacted on
// unused_key and copied into the returned kernel map.)
LOG_DEBUG("generating stride_map from stride", base_type::m_tensor_stride, "to", out_map.get_tensor_stride()); // Over estimate the reserve size to be size(); size_type const in_size = size(); index_storage_type d_out_tensor_stride(out_tensor_stride); index_type unused_key = std::numeric_limits::max(); // (THREAD * D + D) * 4 uint32_t const shared_memory_size_in_bytes = m_coordinate_size * sizeof(index_type) + // stride thread_dim * m_coordinate_size * sizeof(coordinate_type); // tmp size_type const num_threads = in_size; auto const num_blocks = GET_BLOCKS(num_threads, thread_dim); LOG_DEBUG("num block", num_blocks); LOG_DEBUG("shared_memory size", shared_memory_size_in_bytes); LOG_DEBUG("threads dim", thread_dim); LOG_DEBUG("num threads", num_threads); index_type *in_out_map = (index_type *)base_type::m_byte_allocator.allocate( 2 * (in_size + 1) * sizeof(index_type)); index_type *ins = in_out_map; index_type *outs = in_out_map + in_size + 1; // for __restrict__ collision prevention LOG_DEBUG("Allocated temporary memory"); LOG_DEBUG("out_map size", out_map.size(), "out tensor stride:", out_map.get_tensor_stride(), "coordinate_size", m_coordinate_size); detail::stride_map_kernel <<>>( *m_map, // *out_map.m_map, // m_valid_map_index.cbegin(), // num_threads, // d_out_tensor_stride.cbegin(), // ins, // outs, // m_coordinate_size, // unused_key); auto begin = thrust::make_zip_iterator(thrust::make_tuple(ins, outs)); auto const valid_size = thrust::remove_if(thrust::device, begin, thrust::make_zip_iterator( thrust::make_tuple(ins + in_size, outs + in_size)), detail::is_first(unused_key)) - begin; LOG_DEBUG("Valid size:", valid_size); kernel_map_type kernel_map(valid_size, base_type::m_byte_allocator, false); CUDA_CHECK(cudaMemcpy(kernel_map.in_maps.data(), ins, valid_size * sizeof(index_type), cudaMemcpyDeviceToDevice)); CUDA_CHECK(cudaMemcpy(kernel_map.out_maps.data(), outs, valid_size * sizeof(index_type), cudaMemcpyDeviceToDevice)); 
// (origin_map_kernel: zeroes a per-thread shared scratch coordinate, keeps only
// the batch index in_value.first[0], and looks that up in origin_map;
// origin_iter is dereferenced without an end() check -- presumably every batch
// index is guaranteed present in the origin map; TODO confirm.)
base_type::m_byte_allocator.deallocate( (char *)in_out_map, 2 * (in_size + 1) * sizeof(index_type)); return kernel_map; } namespace detail { template __global__ void origin_map_kernel(map_type const __restrict__ in_map, // map_type const __restrict__ origin_map, // index_type const *const __restrict__ in_valid_map_index, // size_type const num_threads, // index_type *__restrict__ p_in_maps, // index_type *__restrict__ p_out_maps, index_type *__restrict__ p_kernels, size_type const coordinate_size) { extern __shared__ coordinate_type sh_all[]; auto const tx = threadIdx.x; auto const bx = blockIdx.x; auto const x = blockDim.x * bx + tx; // clang-format off coordinate_type *sh_tmp = sh_all + tx * coordinate_size; // clang-format on if (x < num_threads) for (index_type i = 0; i < coordinate_size; ++i) sh_tmp[i] = 0; __syncthreads(); if (x < num_threads) { typename map_type::value_type const &in_value = in_map.data()[in_valid_map_index[x]]; sh_tmp[0] = in_value.first[0]; auto origin_iter = origin_map.find(coordinate(sh_tmp)); p_in_maps[x] = in_value.second; p_out_maps[x] = origin_iter->second; // origin_map row index // For kernel_map decompose() p_kernels[x] = origin_iter->second; } } } // namespace detail template class TemplatedAllocator> CoordinateMapGPU::kernel_map_type CoordinateMapGPU::origin_map( self_type const &origin_map, uint32_t thread_dim) const { ASSERT(std::all_of(origin_map.get_tensor_stride().begin(), origin_map.get_tensor_stride().end(), [](auto const &i) { return i == 0; }), "Invalid origin tensor stride", origin_map.get_tensor_stride()); // reserve size(); size_type const in_size = size(); LOG_DEBUG("in_map size:", in_size, "origin_map size:", origin_map.size()); // (THREAD * D) * 4 uint32_t const shared_memory_size_in_bytes = thread_dim * m_coordinate_size * sizeof(coordinate_type); // tmp size_type const num_threads = in_size; auto const num_blocks = GET_BLOCKS(num_threads, thread_dim); LOG_DEBUG("origin_map num block", num_blocks); 
// NOTE(review): tokens reproduced byte-for-byte; template parameter lists and
// launch configs were stripped upstream -- see the note earlier in this file.
// This span holds: the body of interpolation_kernel (2^(D-1)-neighbor trilinear-
// style lookup with weights), field_map_kernel, their host drivers
// interpolation_map_weight_tfield_type / field_map_type, the dispatching
// interpolation_map_weight() / field_map() methods, and union_map_kernel.
// NOTE(review): the host drivers below copy DEVICE scratch buffers into CUDA
// tensors using cudaMemcpyHostToDevice; for device-resident sources the kind
// looks wrong (cudaMemcpyDeviceToDevice expected) -- verify against the
// original source and the cudaMemcpy documentation before relying on it.
i < coordinate_size; ++i) { sh_tfield[i] = p_tfield[offset + i]; } } __syncthreads(); if (x < num_threads) { // iterate over values uint32_t neighbor_ind = x % neighbor_volume; // batch index sh_tmp[0] = lrint(sh_tfield[0]); uint32_t mask = 1; for (uint32_t j = coordinate_size - 1; j > 0; --j) { index_type curr_tensor_stride = sh_tensor_stride[j - 1]; if ((neighbor_ind & mask) == 0) sh_tmp[j] = floor(sh_tfield[j] / curr_tensor_stride) * curr_tensor_stride; else sh_tmp[j] = floor(sh_tfield[j] / curr_tensor_stride) * curr_tensor_stride + curr_tensor_stride; mask = mask << 1; } auto const &in_result = in_map.find(coordinate(sh_tmp)); if (in_result != in_map.end()) { p_in_maps[x] = (*in_result).second; p_out_maps[x] = x / neighbor_volume; // Compute weight float_type weight = 1; for (uint32_t j = 1; j < coordinate_size; ++j) { weight *= 1 - abs(sh_tfield[j] - sh_tmp[j]) / sh_tensor_stride[j - 1]; } p_weights[x] = weight; } else { p_in_maps[x] = unused_map_value; } } } template __global__ void field_map_kernel(map_type __restrict__ in_map, // index_type const num_threads, // float_type const *__restrict__ p_tfield, // index_type *__restrict__ p_in_maps, // index_type *__restrict__ p_out_maps, // stride_type const *__restrict__ p_tensor_stride, // index_type const unused_map_value, index_type const coordinate_size) { // coordinate_size * sizeof(index_type) + coordinate_size * sizeof(float_type) // + THREADS * coordinate_size * sizeof(coordinate_type) SharedMemory shared; float_type *sh_all = shared.getPointer(); auto const tx = threadIdx.x; auto const bx = blockIdx.x; auto const x = blockDim.x * bx + tx; coordinate_type *sh_coordinate = reinterpret_cast(sh_all); coordinate_type *sh_tmp = sh_coordinate + tx * coordinate_size; index_type *sh_tensor_stride = reinterpret_cast( sh_coordinate + CUDA_NUM_THREADS * coordinate_size); auto const equal = in_map.get_key_equal(); for (index_type i = tx; i < coordinate_size - 1; i += blockDim.x) { sh_tensor_stride[i] = 
// (field_map_kernel body: floors each continuous field coordinate down to the
// tensor stride, probes in_map, and writes (in_row, x) or unused_map_value.
// Then interpolation_map_weight_tfield_type: allocates in/out/weight scratch,
// launches interpolation_kernel, compacts the triples on unused_key, and copies
// the survivors into freshly allocated torch tensors before freeing scratch.)
p_tensor_stride[i]; } __syncthreads(); index_type const offset = coordinate_size * x; if (x < num_threads) { // iterate over values float_type const *curr_tfield = p_tfield + offset; // batch index sh_tmp[0] = lrint(curr_tfield[0]); for (uint32_t j = coordinate_size - 1; j > 0; --j) { index_type curr_tensor_stride = sh_tensor_stride[j - 1]; sh_tmp[j] = floor(curr_tfield[j] / curr_tensor_stride) * curr_tensor_stride; } auto const &in_result = in_map.find(coordinate(sh_tmp)); if (in_result != in_map.end()) { p_in_maps[x] = (*in_result).second; p_out_maps[x] = x; } else { p_in_maps[x] = unused_map_value; } } } // interpolation map inst template std::vector interpolation_map_weight_tfield_type( uint32_t const num_tfield, // uint32_t const coordinate_size, // index_type const unused_key, // field_type const *const p_tfield, // map_type &map, // stride_type const *const p_tensor_stride, // ByteAllocatorType const &byte_allocator, c10::TensorOptions tfield_options) { uint32_t const neighbor_volume = std::pow(2, (coordinate_size - 1)); size_type num_threads = neighbor_volume * num_tfield; LOG_DEBUG("neighbor_volume:", neighbor_volume, "num_tfield:", num_tfield, "num_threads:", num_threads); index_type *d_in_map = reinterpret_cast( byte_allocator.allocate(num_threads * sizeof(index_type))); index_type *d_out_map = reinterpret_cast( byte_allocator.allocate(num_threads * sizeof(index_type))); field_type *d_weight = reinterpret_cast( byte_allocator.allocate(num_threads * sizeof(field_type))); size_type shared_memory_size_in_bytes = coordinate_size * CUDA_NUM_THREADS * sizeof(field_type) + coordinate_size * CUDA_NUM_THREADS * sizeof(coordinate_type) + coordinate_size * sizeof(index_type); LOG_DEBUG("Shared memory size:", shared_memory_size_in_bytes); interpolation_kernel <<>>(map, // num_threads, // p_tfield, // d_in_map, // d_out_map, // d_weight, // p_tensor_stride, // unused_key, // coordinate_size, // neighbor_volume); // remove unused_keys auto valid_begin = 
// (compaction + tensor materialization for the interpolation path; the
// cudaMemcpyHostToDevice kinds here are the ones flagged in the header note.)
thrust::make_zip_iterator(thrust::make_tuple(d_in_map, // d_out_map, d_weight)); size_type const number_of_valid = thrust::remove_if(thrust::device, // valid_begin, // thrust::make_zip_iterator(thrust::make_tuple( d_in_map + num_threads, // d_out_map + num_threads, d_weight + num_threads)), detail::is_first(unused_key)) - valid_begin; LOG_DEBUG("number_of_valid:", number_of_valid); auto final_in_map = torch::empty({number_of_valid}, tfield_options.dtype(torch::kInt32).requires_grad(false)); auto final_out_map = torch::empty({number_of_valid}, tfield_options.dtype(torch::kInt32).requires_grad(false)); auto final_weights = torch::empty({number_of_valid}, tfield_options.requires_grad(false)); if (number_of_valid > 0) { CUDA_CHECK(cudaMemcpy(final_in_map.template data_ptr(), d_in_map, number_of_valid * sizeof(int32_t), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(final_out_map.template data_ptr(), d_out_map, number_of_valid * sizeof(int32_t), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(final_weights.template data_ptr(), d_weight, number_of_valid * sizeof(field_type), cudaMemcpyHostToDevice)); } byte_allocator.deallocate((char *)d_in_map, num_threads * sizeof(index_type)); byte_allocator.deallocate((char *)d_out_map, num_threads * sizeof(index_type)); byte_allocator.deallocate((char *)d_weight, num_threads * sizeof(field_type)); return {final_in_map, final_out_map, final_weights}; } // interpolation map inst template std::pair field_map_type(uint32_t const num_tfield, // uint32_t const coordinate_size, // index_type const unused_key, // field_type const *const p_tfield, // map_type &map, // stride_type const *const p_tensor_stride, // ByteAllocatorType const &byte_allocator) { size_type num_threads = num_tfield; LOG_DEBUG("num_threads:", num_threads); index_type *d_in_map = reinterpret_cast( byte_allocator.allocate(num_threads * sizeof(index_type))); index_type *d_out_map = reinterpret_cast( byte_allocator.allocate(num_threads * sizeof(index_type))); size_type 
// (field_map_type driver: launches field_map_kernel, compacts on unused_key, and
// returns two int32 CUDA tensors; same suspicious memcpy kinds as above.)
shared_memory_size_in_bytes = coordinate_size * CUDA_NUM_THREADS * sizeof(coordinate_type) + coordinate_size * sizeof(index_type); LOG_DEBUG("Shared memory size:", shared_memory_size_in_bytes); field_map_kernel <<>>(map, // num_threads, // p_tfield, // d_in_map, // d_out_map, // p_tensor_stride, // unused_key, // coordinate_size); // remove unused_keys auto valid_begin = thrust::make_zip_iterator(thrust::make_tuple(d_in_map, d_out_map)); size_type const number_of_valid = thrust::remove_if(thrust::device, // valid_begin, // thrust::make_zip_iterator( thrust::make_tuple(d_in_map + num_threads, // d_out_map + num_threads)), detail::is_first(unused_key)) - valid_begin; LOG_DEBUG("number_of_valid:", number_of_valid); auto curr_device = at::cuda::current_device(); auto tfield_options = torch::TensorOptions({at::kCUDA, curr_device}) .dtype(torch::kInt32) .requires_grad(false); auto final_in_map = torch::empty({number_of_valid}, tfield_options); auto final_out_map = torch::empty({number_of_valid}, tfield_options); if (number_of_valid > 0) { CUDA_CHECK(cudaMemcpy(final_in_map.template data_ptr(), d_in_map, number_of_valid * sizeof(int32_t), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(final_out_map.template data_ptr(), d_out_map, number_of_valid * sizeof(int32_t), cudaMemcpyHostToDevice)); } byte_allocator.deallocate((char *)d_in_map, num_threads * sizeof(index_type)); byte_allocator.deallocate((char *)d_out_map, num_threads * sizeof(index_type)); return {final_in_map, final_out_map}; } } // namespace detail template class TemplatedAllocator> std::vector CoordinateMapGPU::interpolation_map_weight( at::Tensor const &tfield) const { // Over estimate the reserve size to be size(); ASSERT(tfield.dim() == 2, "Invalid tfield dimension"); ASSERT(tfield.size(1) == m_coordinate_size, "Invalid tfield size"); size_type const num_tfield = tfield.size(0); uint32_t const neighbor_volume = std::pow(2, (m_coordinate_size - 1)); index_type const unused_key = std::numeric_limits::max(); 
// (interpolation_map_weight(): dispatches on tfield scalar type (Double/Float)
// to interpolation_map_weight_tfield_type; field_map(): forwards to
// field_map_type; union_map_kernel: per-row lookup of in_map entries in
// union_map -- union_iter is dereferenced without an end() check, presumably
// because union_map contains every input coordinate by construction; TODO
// confirm. The trailing "std::vector" begins a definition continuing past this
// chunk.)
LOG_DEBUG("map size", m_size); switch (tfield.scalar_type()) { case at::ScalarType::Double: return detail::interpolation_map_weight_tfield_type< coordinate_type, index_type, size_type, index_type, double, map_type, TemplatedAllocator>(num_tfield, // m_coordinate_size, // unused_key, // tfield.template data_ptr(), // *m_map, // m_device_tensor_stride.cbegin(), // m_byte_allocator, // tfield.options()); case at::ScalarType::Float: return detail::interpolation_map_weight_tfield_type< coordinate_type, index_type, size_type, index_type, float, map_type, TemplatedAllocator>(num_tfield, // m_coordinate_size, // unused_key, // tfield.template data_ptr(), // *m_map, // m_device_tensor_stride.cbegin(), // m_byte_allocator, // tfield.options()); default: ASSERT(false, "Unsupported float type"); } } template class TemplatedAllocator> template std::pair CoordinateMapGPU::field_map( coordinate_field_type const *p_tfield, size_type const num_tfield) const { index_type const unused_key = std::numeric_limits::max(); LOG_DEBUG("map size", m_size); return detail::field_map_type>( num_tfield, // m_coordinate_size, // unused_key, // p_tfield, // *m_map, // m_device_tensor_stride.cbegin(), // m_byte_allocator); } /** * Union map */ namespace detail { template __global__ void union_map_kernel(size_type const num_threads, // map_type const __restrict__ in_map, // map_type const __restrict__ union_map, // index_type const *const __restrict__ in_valid_map_index, // tensor_type *__restrict__ p_in_maps, // tensor_type *__restrict__ p_union_maps, size_type const coordinate_size) { auto const tx = threadIdx.x; auto const bx = blockIdx.x; auto const x = blockDim.x * bx + tx; if (x < num_threads) { typename map_type::value_type const &in_value = in_map.data()[in_valid_map_index[x]]; auto union_iter = union_map.find(in_value.first); p_in_maps[x] = in_value.second; p_union_maps[x] = union_iter->second; } } } // namespace detail template class TemplatedAllocator> std::vector 
CoordinateMapGPU::union_map( std::vector> const &in_maps, uint32_t thread_dim) const { auto options = torch::TensorOptions({at::kCUDA, at::cuda::current_device()}) .dtype(torch::kInt64) .requires_grad(false); std::vector union_maps; for (self_type const &in_map : in_maps) { size_type const num_threads = in_map.m_valid_map_index.size(); auto const num_blocks = GET_BLOCKS(num_threads, thread_dim); at::Tensor curr_map = torch::empty({2, num_threads}, options); LOG_DEBUG("in_map size", num_threads, ", num block", num_blocks, ", threads dim", thread_dim); int64_t *d_in_map = curr_map.template data_ptr(); detail::union_map_kernel <<>>(num_threads, // *in_map.m_map, // *m_map, // in_map.m_valid_map_index.cbegin(), // d_in_map, // d_in_map + num_threads, // m_coordinate_size); CUDA_CHECK(cudaStreamSynchronize(0)); union_maps.push_back(std::move(curr_map)); } return union_maps; } // Helper functions template class TemplatedAllocator> void CoordinateMapGPU::copy_coordinates( coordinate_type *dst_coordinate) const { size_type const num_threads = size(); if (num_threads <= 0) return; // Copy by offset // size_type const num_blocks = GET_BLOCKS(num_threads, CUDA_NUM_THREADS); // detail::copy_coordinates_by_offset // <<>>( // *m_map, // // dst_coordinate, // // m_valid_map_index.data(), // // num_threads, // // m_coordinate_size); size_type const num_blocks = GET_BLOCKS(num_threads * m_coordinate_size, CUDA_NUM_THREADS); detail::copy_coordinates_by_valid_row <<>>( // *m_map, // const_coordinate_data(), // dst_coordinate, // m_valid_row_index.cbegin(), // num_threads * m_coordinate_size, // m_coordinate_size); } // Template instantiation template class CoordinateFieldMapGPU; template class CoordinateFieldMapGPU; template class CoordinateMapGPU; template class CoordinateMapGPU; template std::pair< gpu_storage>, gpu_storage>> CoordinateMapGPU:: insert_and_map( coordinate_iterator key_first, coordinate_iterator key_last); template std::pair< gpu_storage>, gpu_storage>> 
CoordinateMapGPU:: insert_and_map( coordinate_iterator key_first, coordinate_iterator key_last); template std::pair< gpu_storage>, gpu_storage>> CoordinateMapGPU:: insert_and_map( coordinate_iterator key_first, coordinate_iterator key_last); template std::pair< gpu_storage>, gpu_storage>> CoordinateMapGPU:: insert_and_map( coordinate_iterator key_first, coordinate_iterator key_last); template std::pair CoordinateMapGPU:: field_map(float const *p_tfield, default_types::size_type const num_tfield) const; template std::pair CoordinateMapGPU:: field_map(float const *p_tfield, default_types::size_type const num_tfield) const; } // namespace minkowski