/* Copyright (c) 2020 NVIDIA CORPORATION. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. * * Please cite "4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural * Networks", CVPR'19 (https://arxiv.org/abs/1904.08755) if you use any part * of the code. */ #ifndef COORDINATE_MAP_CPU_HPP #define COORDINATE_MAP_CPU_HPP #include "coordinate_map.hpp" #include "kernel_map.hpp" #include "kernel_region.hpp" #include #include #include namespace minkowski { namespace detail { template bool is_coordinate_aligned(coordinate const &point, stride_type const &stride) { for (uint32_t i = 0; i < stride.size(); ++i) { if (point[i + 1] % stride[i] != 0) return false; } return true; } template std::pair field_map_kernel(uint32_t const num_tfield, // uint32_t const coordinate_size, // Dtype const *const p_tfield, // MapType const &in_map, // default_types::stride_type const &tensor_stride) { constexpr bool is_float32 = std::is_same::value; at::ScalarType const float_type = is_float32 ? at::ScalarType::Float : at::ScalarType::Double; cpu_in_maps in_maps = initialize_maps(1, num_tfield); cpu_out_maps out_maps = initialize_maps(1, num_tfield); uint32_t num_used{0}; // OMP // size_t stride = max((size_t)100, numElements / (2 * // omp_get_max_threads())); size_t N = (numElements + stride - 1) / // stride; // compute the chunk size per thread. // There's a trade-off between the thread initialization overhead and // the job sizes. If some jobs finish earlier than others due to // imbalance in hash distribution, these threads will be idle. const size_t N = 2 * omp_get_max_threads(); const size_t stride = (num_tfield + N - 1) / N; LOG_DEBUG("kernel map with", N, "chunks and", stride, "stride."); #pragma omp parallel for for (uint32_t n = 0; n < N; n++) { // temporary variables for each thread std::vector curr_vec(coordinate_size); coordinate curr_coordinate(curr_vec.data()); uint32_t curr_index_begin; for (auto i = stride * n; i < std::min((n + 1) * stride, uint64_t(num_tfield)); ++i) { // batch index curr_vec[0] = std::lroundf(p_tfield[i * coordinate_size]); for (uint32_t j = 1; j < coordinate_size; ++j) { auto const curr_tensor_stride = tensor_stride[j - 1]; curr_vec[j] = curr_tensor_stride * std::floor(p_tfield[coordinate_size * i + j] / curr_tensor_stride); } const auto iter_in = in_map.find(curr_coordinate); // LOG_DEBUG(kernel_ind, ":", // PtrToString(iter_out->first.data(), // coordinate_size), // "->", PtrToString(point.data(), // coordinate_size)); if (iter_in != in_map.end()) { #pragma omp atomic capture { curr_index_begin = num_used; num_used += 1; } // Ensure that in_maps and out_maps are resized accordingly in_maps[0][curr_index_begin] = iter_in->second; out_maps[0][curr_index_begin] = i; // LOG_DEBUG(kernel_ind, ":", // PtrToString(iter_in->first.data(), // coordinate_size), // "->", // PtrToString(iter_out->first.data(), // coordinate_size)); } } } auto final_in_map = torch::empty( {num_used}, torch::TensorOptions().dtype(torch::kInt32).requires_grad(false)); auto final_out_map = torch::empty( {num_used}, torch::TensorOptions().dtype(torch::kInt32).requires_grad(false)); std::copy_n(in_maps[0].data(), num_used, final_in_map.template data_ptr()); std::copy_n(out_maps[0].data(), num_used, final_out_map.template data_ptr()); return std::make_pair(final_in_map, final_out_map); } template std::vector interpolation_map_weight_kernel( uint32_t const num_tfield, // uint32_t const coordinate_size, // Dtype const *const p_tfield, // MapType const &in_map, // default_types::stride_type const &tensor_stride) { constexpr bool is_float32 = std::is_same::value; at::ScalarType const float_type = is_float32 ? at::ScalarType::Float : at::ScalarType::Double; uint32_t const neighbor_volume = std::pow(2, (coordinate_size - 1)); LOG_DEBUG("neighbor_volume :", neighbor_volume, "num_tfield:", num_tfield); cpu_in_maps in_maps = initialize_maps(neighbor_volume, num_tfield); cpu_out_maps out_maps = initialize_maps(neighbor_volume, num_tfield); std::vector> weights = initialize_maps>(neighbor_volume, num_tfield); std::vector num_used(neighbor_volume); std::for_each(num_used.begin(), num_used.end(), [](auto &i) { i = 0; }); // OMP // size_t stride = max((size_t)100, numElements / (2 * // omp_get_max_threads())); size_t N = (numElements + stride - 1) / // stride; // compute the chunk size per thread. // There's a trade-off between the thread initialization overhead and // the job sizes. If some jobs finish earlier than others due to // imbalance in hash distribution, these threads will be idle. const size_t N = 2 * omp_get_max_threads(); const size_t stride = (num_tfield + N - 1) / N; LOG_DEBUG("kernel map with", N, "chunks and", stride, "stride."); #pragma omp parallel for for (uint32_t n = 0; n < N; n++) { // temporary variables for each thread std::vector curr_vec(coordinate_size), lb(coordinate_size), ub(coordinate_size); coordinate curr_coordinate(curr_vec.data()); uint32_t curr_index_begin; for (auto i = stride * n; i < std::min((n + 1) * stride, uint64_t(num_tfield)); ++i) { // batch index curr_vec[0] = std::lroundf(p_tfield[i * coordinate_size]); for (uint32_t j = 1; j < coordinate_size; ++j) { auto const curr_tensor_stride = tensor_stride[j - 1]; lb[j] = curr_tensor_stride * std::floor(p_tfield[coordinate_size * i + j] / curr_tensor_stride); ub[j] = lb[j] + curr_tensor_stride; curr_vec[j] = lb[j]; } // For elements in the current region for (uint32_t neighbor_ind = 0; neighbor_ind < neighbor_volume; ++neighbor_ind) { uint32_t mask = 1; for (uint32_t j = coordinate_size - 1; j > 0; --j) { if ((neighbor_ind & mask) == 0) curr_vec[j] = lb[j]; else curr_vec[j] = ub[j]; mask = mask << 1; } const auto iter_in = in_map.find(curr_coordinate); // LOG_DEBUG(kernel_ind, ":", // PtrToString(iter_out->first.data(), // coordinate_size), // "->", PtrToString(point.data(), // coordinate_size)); if (iter_in != in_map.end()) { #pragma omp atomic capture { curr_index_begin = num_used[neighbor_ind]; num_used[neighbor_ind] += 1; } // Compute weights Dtype weight = 1.0; for (uint32_t j = 1; j < coordinate_size; ++j) { weight *= 1 - std::abs(p_tfield[coordinate_size * i + j] - curr_vec[j]) / tensor_stride[j - 1]; } // Ensure that in_maps and out_maps are resized accordingly in_maps[neighbor_ind][curr_index_begin] = iter_in->second; out_maps[neighbor_ind][curr_index_begin] = i; weights[neighbor_ind][curr_index_begin] = weight; // LOG_DEBUG(kernel_ind, ":", // PtrToString(iter_in->first.data(), // coordinate_size), // "->", // PtrToString(iter_out->first.data(), // coordinate_size)); } } } } auto const total_num_used = std::accumulate(num_used.begin(), num_used.end(), 0); auto final_in_map = torch::empty( {total_num_used}, torch::TensorOptions().dtype(torch::kInt32).requires_grad(false)); auto final_out_map = torch::empty( {total_num_used}, torch::TensorOptions().dtype(torch::kInt32).requires_grad(false)); auto final_weights = torch::empty( {total_num_used}, torch::TensorOptions().dtype(float_type).requires_grad(false)); uint32_t final_begin = 0; for (uint32_t i = 0; i < neighbor_volume; ++i) { uint32_t const max_num = num_used[i]; LOG_DEBUG("kernel index", i, "size:", max_num); std::copy_n(in_maps[i].data(), max_num, &final_in_map.template data_ptr()[final_begin]); std::copy_n(out_maps[i].data(), max_num, &final_out_map.template data_ptr()[final_begin]); std::copy_n(weights[i].data(), max_num, final_weights.template data_ptr() + final_begin); final_begin += max_num; } return {final_in_map, final_out_map, final_weights}; } } // namespace detail /* * Inherit from the CoordinateMap for a specific map type. */ // clang-format off template class TemplatedAllocator = std::allocator> class CoordinateMapCPU : public CoordinateMap { public: using base_type = CoordinateMap; using self_type = CoordinateMapCPU; using size_type = typename base_type::size_type; using index_type = typename base_type::index_type; using stride_type = typename base_type::stride_type; using key_type = coordinate; using mapped_type = default_types::index_type; using hasher = detail::coordinate_murmur3; using key_equal = detail::coordinate_equal_to; using map_type = robin_hood::unordered_flat_map; using value_type = typename map_type::value_type; using iterator = typename map_type::iterator; using const_iterator = typename map_type::const_iterator; using index_vector_type = typename base_type::index_vector_type; using byte_allocator_type = TemplatedAllocator; // clang-format on public: CoordinateMapCPU() = delete; CoordinateMapCPU(size_type const number_of_coordinates, size_type const coordinate_size, stride_type const &stride = {1}, byte_allocator_type alloc = byte_allocator_type()) : base_type(number_of_coordinates, coordinate_size, stride, alloc), m_map( map_type{0, hasher{coordinate_size}, key_equal{coordinate_size}}) { m_map.reserve(number_of_coordinates); } /* * @brief given a key iterator begin-end pair and a value iterator begin-end * pair, insert all elements. * * @return none */ void insert(coordinate_type const *coordinate_begin, coordinate_type const *coordinate_end) { size_type N = (coordinate_end - coordinate_begin) / m_coordinate_size; base_type::allocate(N); index_type value = 0; for (coordinate_type const *key = coordinate_begin; key != coordinate_end; key += m_coordinate_size, ++value) { // value_type ctor needed because this might be called with std::pair's insert(key_type(key), value); } } /* * @brief given a key iterator begin-end pair and a value iterator begin-end * pair, insert all elements. * * @return pair, vector> if return_unique_inverse_map. * mapping is a vector of unique indices and inverse_mapping is a vector of * indices that reconstructs the original coordinate from the list of unique * coordinates. * * >>> unique_coordinates = input_coordinates[mapping] * >>> reconstructed_coordinates = unique_coordinates[inverse_mapping] * >>> torch.all(reconstructed_coordinates == input_coordinates) */ template std::pair, std::vector> // return maps insert_and_map(coordinate_type const *coordinate_begin, coordinate_type const *coordinate_end) { size_type N = (coordinate_end - coordinate_begin) / m_coordinate_size; std::vector mapping, inverse_mapping; base_type::allocate(N); mapping.reserve(N); inverse_mapping.reserve(N); index_type value{0}, row_index{0}; for (coordinate_type const *key = coordinate_begin; key != coordinate_end; key += m_coordinate_size, row_index += 1) { // value_type ctor needed because this might be called with std::pair's auto const result = insert(key_type(key), value); if (result.second) { mapping.push_back(row_index); inverse_mapping.push_back(value); } else { // result.first is an iterator of pair inverse_mapping.push_back(result.first->second); } value += remap ? result.second : 1; } return std::make_pair(std::move(mapping), std::move(inverse_mapping)); } /* * @brief given a key iterator begin-end pair find all valid keys and its * index. * * @return a pair of (valid index, query value) vectors. */ template std::pair find(key_iterator key_first, key_iterator key_last) { size_type N = key_last - key_first; ASSERT(N <= base_type::m_capacity, "Invalid search range. Current capacity:", base_type::m_capacity, ", search range:", N); // reserve the result slots index_vector_type valid_query_index, query_result; valid_query_index.reserve(N); query_result.reserve(N); key_iterator key_curr{key_first}; for (; key_curr != key_last; ++key_curr) { auto const query_iter = m_map.find(*key_curr); // If valid query if (query_iter != m_map.end()) { valid_query_index.push_back(key_curr - key_first); query_result.push_back(query_iter->second); } } return std::make_pair(valid_query_index, query_result); } // Network specific functions. /* * @brief strided coordinate map. */ self_type stride(stride_type const &stride) const { ASSERT(stride.size() == m_coordinate_size - 1, "Invalid stride", stride); // Over estimate the reserve size to be size(); self_type stride_map( size(), m_coordinate_size, detail::stride_tensor_stride(base_type::m_tensor_stride, stride), base_type::m_byte_allocator); index_type c = 0; std::vector dst(m_coordinate_size); coordinate strided_coordinate(&dst[0]); for (auto const &kv : m_map) { detail::stride_coordinate(kv.first, dst, stride_map.m_tensor_stride); auto result = stride_map.insert(strided_coordinate, c); c += result.second; } return stride_map; } /***************************************************************************** * Map generation ****************************************************************************/ /* * @brief strided coordinate map for region. */ self_type stride_region(cpu_kernel_region const &kernel, stride_type const &out_tensor_stride) const { ASSERT(kernel.coordinate_size() == m_coordinate_size, "Invalid kernel"); // Over estimate the reserve size to be size(); self_type stride_map(size() * kernel.volume(), m_coordinate_size, out_tensor_stride, base_type::m_byte_allocator); auto ckernel = cpu_kernel_region(kernel); std::vector tmp(m_coordinate_size); coordinate point(tmp.data()); index_type num_used{0}; if (kernel.is_transpose()) { for (auto iter_in = m_map.begin(); iter_in != m_map.end(); ++iter_in) { // For elements in the current region for (uint32_t kernel_ind = 0; kernel_ind < ckernel.volume(); ++kernel_ind) { ckernel.coordinate_at(kernel_ind, iter_in->first.data(), tmp.data()); auto const result = stride_map.insert(point, num_used); num_used += result.second; } } } else { LOG_DEBUG("stride_region with no transpose"); // Expand coordinates with regular conv for (auto iter_in = m_map.begin(); iter_in != m_map.end(); ++iter_in) { // For elements in the current region for (uint32_t kernel_ind = 0; kernel_ind < ckernel.volume(); ++kernel_ind) { // TODO replace with more efficient code ckernel.coordinate_at(kernel_ind, iter_in->first.data(), tmp.data()); if (detail::is_coordinate_aligned( point, out_tensor_stride)) { auto const result = stride_map.insert(point, num_used); num_used += result.second; } } } } return stride_map; } /* * @brief strided coordinate map. */ self_type origin() const { // tensor stride is set to {0,..., 0} for the origin map. stride_type origin_tensor_stride(m_coordinate_size - 1); std::for_each(origin_tensor_stride.begin(), origin_tensor_stride.end(), [](auto &i) { i = 0; }); // Over estimate the reserve size to be size(); self_type origin_map(size(), m_coordinate_size, origin_tensor_stride, base_type::m_byte_allocator); index_type c = 0; std::vector dst(m_coordinate_size); std::for_each(dst.begin(), dst.end(), [](auto &i) { i = 0; }); coordinate tmp_coordinate(&dst[0]); for (auto const &kv : m_map) { dst[0] = kv.first[0]; auto result = origin_map.insert(tmp_coordinate, c); c += result.second; } return origin_map; } /* * @brief generate a new coordinate map that only keeps coordinates with true * keep mask */ self_type prune(bool const *keep_begin, bool const *keep_end) const { ASSERT(keep_end - keep_begin == size(), "Invalid range for pruning"); // Over estimate the reserve size to be size(); self_type pruned_map(size(), m_coordinate_size, base_type::m_tensor_stride, base_type::m_byte_allocator); index_type c = 0; for (auto const &kv : m_map) { // Use the row index defined if (keep_begin[kv.second]) { auto result = pruned_map.insert(kv.first, c); c += result.second; } } LOG_DEBUG("size:", pruned_map.size(), "capacity:", pruned_map.capacity()); return pruned_map; } self_type merge(const self_type &other) const { std::vector> maps{*this, other}; // maps.push_back(*this); // maps.push_back(other); return merge(maps); } self_type merge(const std::vector> &maps) const { // merge all input maps size_t all_size = std::accumulate( maps.begin(), maps.end(), 0, [](size_t sum, const self_type &map) { return sum + map.size(); }); self_type merged_map(all_size, m_coordinate_size, base_type::m_tensor_stride, base_type::m_byte_allocator); // Push all coordinates index_type c = 0; for (self_type const &map : maps) { for (auto const &kv : map.m_map) { auto result = merged_map.insert(kv.first, c); c += result.second; } } return merged_map; } /***************************************************************************** * Kernel map ****************************************************************************/ cpu_kernel_map kernel_map(self_type const &out_coordinate_map, cpu_kernel_region const &kernel) const { // Over estimate the reserve size to be size(); size_type out_size = out_coordinate_map.size(); size_type kernel_volume = kernel.volume(); LOG_DEBUG("kernel volume:", kernel_volume, "out_size:", out_size); cpu_in_maps in_maps = initialize_maps(kernel_volume, out_size); cpu_out_maps out_maps = initialize_maps(kernel_volume, out_size); std::vector num_used(kernel_volume); std::for_each(num_used.begin(), num_used.end(), [](auto &i) { i = 0; }); // OMP const auto &out_mmap = out_coordinate_map.m_map; const size_t out_map_num_elements = out_mmap.capacity(); // size_t stride = max((size_t)100, numElements / (2 * // omp_get_max_threads())); size_t N = (numElements + stride - 1) / // stride; // compute the chunk size per thread. // There's a trade-off between the thread initialization overhead and the // job sizes. If some jobs finish earlier than others due to imbalance in // hash distribution, these threads will be idle. size_t N = 2 * omp_get_max_threads(); const size_t stride = (out_map_num_elements + N - 1) / N; N = (out_map_num_elements + stride - 1) / stride; LOG_DEBUG("kernel map with", N, "chunks and", stride, "stride."); LOG_DEBUG((kernel.region_type() != RegionType::CUSTOM && kernel_volume == 1) ? "single kernel" : "otherwise"); // When no need to iterate through the region // Put if outside the loop for speed if (kernel.region_type() != RegionType::CUSTOM && kernel_volume == 1) { index_type curr_index_begin = 0; for (auto iter_out = out_mmap.begin(); iter_out != out_mmap.end(); ++iter_out) { const auto iter_in = m_map.find(iter_out->first); if (iter_in != m_map.end()) { in_maps[0][curr_index_begin] = iter_in->second; out_maps[0][curr_index_begin] = iter_out->second; ++curr_index_begin; } } num_used[0] = curr_index_begin; } else { #pragma omp parallel for for (index_type n = 0; n < N; n++) { auto ckernel = cpu_kernel_region(kernel); // temporary variables for each thread std::vector tmp(m_coordinate_size); coordinate curr_kernel_coordinate(tmp.data()); index_type curr_index_begin; for (auto iter_out = out_mmap.begin(stride * n); iter_out.num_steps() < std::min(stride, out_map_num_elements - n * stride); ++iter_out) { // For elements in the current region for (uint32_t kernel_ind = 0; kernel_ind < ckernel.volume(); ++kernel_ind) { // If the input coord exists ckernel.coordinate_at(kernel_ind, iter_out->first.data(), tmp.data()); const auto iter_in = m_map.find(curr_kernel_coordinate); // LOG_DEBUG(kernel_ind, ":", // PtrToString(iter_out->first.data(), m_coordinate_size), // "->", PtrToString(point.data(), m_coordinate_size)); if (iter_in != m_map.end()) { #pragma omp atomic capture { curr_index_begin = num_used[kernel_ind]; num_used[kernel_ind] += 1; } // Ensure that in_maps and out_maps are resized accordingly in_maps[kernel_ind][curr_index_begin] = iter_in->second; out_maps[kernel_ind][curr_index_begin] = iter_out->second; // LOG_DEBUG(kernel_ind, ":", // PtrToString(iter_in->first.data(), // m_coordinate_size), // "->", // PtrToString(iter_out->first.data(), // m_coordinate_size)); } } } } } for (index_type i = 0; i < kernel_volume; ++i) { index_type max_num = num_used[i]; LOG_DEBUG("kernel index", i, "size:", max_num); in_maps[i].resize(max_num); out_maps[i].resize(max_num); } return std::make_pair(in_maps, out_maps); } cpu_kernel_map stride_map(self_type const &out_coordinate_map, stride_type const &out_tensor_stride) const { // generate an in-out (kernel) map that maps all input points in the same // voxel to strided output voxel. size_type in_size = size(); LOG_DEBUG("Generate stride_map with in NNZ:", in_size, "out NNZ:", out_coordinate_map.size(), "out_tensor_stride:", out_tensor_stride); cpu_in_maps in_maps = initialize_maps(1, in_size); cpu_out_maps out_maps = initialize_maps(1, in_size); LOG_DEBUG("stride map in_maps.size():", in_size); LOG_DEBUG("stride map out_maps.size():", out_coordinate_map.size()); // compute the chunk size per thread. // There's a trade-off between the thread initialization overhead and the // job sizes. If some jobs finish earlier than others due to imbalance in // hash distribution, these threads will be idle. const size_t in_map_num_elements = m_map.capacity(); size_t N = 2 * omp_get_max_threads(); const size_t stride = (in_map_num_elements + N - 1) / N; N = (in_map_num_elements + stride - 1) / stride; LOG_DEBUG("kernel map with", N, "chunks."); index_type num_used = 0; #pragma omp parallel for for (index_type n = 0; n < N; ++n) { index_type curr_index_begin; std::vector dst(m_coordinate_size); for (auto iter_in = m_map.begin(stride * n); iter_in.num_steps() < std::min(stride, in_map_num_elements - n * stride); ++iter_in) { detail::stride_coordinate(iter_in->first, dst, out_tensor_stride); const auto iter_out = out_coordinate_map.find(coordinate(dst.data())); ASSERT(iter_out != out_coordinate_map.m_map.cend(), "Invalid out_coordinate_map"); #pragma omp atomic capture { curr_index_begin = num_used; num_used += 1; } in_maps[0][curr_index_begin] = iter_in->second; out_maps[0][curr_index_begin] = iter_out->second; } } return std::make_pair(move(in_maps), move(out_maps)); } cpu_kernel_map origin_map(self_type const &origin_coordinate_map) const { // generate an in-out (kernel) map that maps all input points in the same // voxel to strided output voxel. ASSERT(std::all_of(origin_coordinate_map.get_tensor_stride().begin(), origin_coordinate_map.get_tensor_stride().end(), [](auto const &i) { return i == 0; }), "Invalid origin tensor stride", origin_coordinate_map.get_tensor_stride()); size_type const in_size = size(); size_type const out_size = origin_coordinate_map.size(); LOG_DEBUG("Generate origin_map with in NNZ:", in_size, "out NNZ:", out_size); ASSERT(in_size > out_size, "Invalid out_coordinate_map"); std::vector> in_out(in_size); // compute the chunk size per thread. // There's a trade-off between the thread initialization overhead and the // job sizes. If some jobs finish earlier than others due to imbalance in // hash distribution, these threads will be idle. size_t const in_map_num_elements = m_map.capacity(); size_t N = 2 * omp_get_max_threads(); size_t const stride = (in_map_num_elements + N - 1) / N; N = (in_map_num_elements + stride - 1) / stride; LOG_DEBUG("kernel map with", N, "chunks."); size_type num_used = 0; #pragma omp parallel for for (index_type n = 0; n < N; ++n) { index_type curr_index_begin; std::vector dst(m_coordinate_size); std::for_each(dst.begin(), dst.end(), [](auto &i) { i = 0; }); for (auto iter_in = m_map.begin(stride * n); iter_in.num_steps() < std::min(stride, in_map_num_elements - n * stride); ++iter_in) { dst[0] = iter_in->first[0]; const auto iter_origin = origin_coordinate_map.find(coordinate(dst.data())); ASSERT(iter_origin != origin_coordinate_map.m_map.cend(), "Invalid origin_coordinate_map"); index_type origin_row_index = iter_origin->second; #pragma omp atomic capture { curr_index_begin = num_used; num_used += 1; } in_out[curr_index_begin] = std::make_pair(iter_in->second, origin_row_index); } } // Decomposed kernel map auto batch_indices = origin_coordinate_map.batch_indices(); return cpu_kernel_map(in_out, batch_indices); } /***************************************************************************** * Interpolation ****************************************************************************/ /* * Given a continuous tensor field, return the weights and associated kernel * map */ std::vector interpolation_map_weight(at::Tensor const &tfield) const { // Over estimate the reserve size to be size(); ASSERT(tfield.dim() == 2, "Invalid tfield dimension"); ASSERT(tfield.size(1) == m_coordinate_size, "Invalid tfield size"); // AT_DISPATCH_FLOATING_TYPES( // tfield.scalar_type(), "interpolation_map_weight_kernel", [&] { switch (tfield.scalar_type()) { case at::ScalarType::Double: return detail::interpolation_map_weight_kernel( tfield.size(0), // m_coordinate_size, // tfield.template data_ptr(), // m_map, // base_type::m_tensor_stride); case at::ScalarType::Float: return detail::interpolation_map_weight_kernel( tfield.size(0), // m_coordinate_size, // tfield.template data_ptr(), // m_map, // base_type::m_tensor_stride); default: ASSERT(false, "Unsupported float type"); } } template std::pair field_map(coordinate_field_type const *p_tfield, size_type const num_tfield) const { return detail::field_map_kernel(num_tfield, // m_coordinate_size, // p_tfield, // m_map, // base_type::m_tensor_stride); } /***************************************************************************** * Union Map ****************************************************************************/ /* * Find mapping from all inputs to self. The mapping is a list of 2xN tensors */ std::vector union_map( std::vector> const &in_maps) const { std::vector in_out_maps; for (self_type const &in_map : in_maps) { size_type const N = in_map.size(); size_type const capacity = in_map.m_map.capacity(); auto curr_map = torch::empty( {2, N}, torch::TensorOptions().dtype(torch::kInt64).requires_grad(false)); int64_t *p_in_rows = curr_map.template data_ptr(); std::fill_n(p_in_rows, 2 * N, -1); int64_t *p_union_rows = p_in_rows + N; index_type offset{0}; for (auto iter_in = in_map.m_map.begin(); iter_in.num_steps() < capacity; ++iter_in) { p_in_rows[offset] = iter_in->second; // WARNING: This assumes that the union map has a corresponding // coordinate for speed. This will results in an unexpected behavior // if the union map does not contain the coordinate. p_union_rows[offset] = find(iter_in->first)->second; LOG_DEBUG("offset:", offset, " in:", p_in_rows[offset], " out:", p_union_rows[offset]); offset++; } in_out_maps.push_back(std::move(curr_map)); } return in_out_maps; } inline size_type size() const noexcept { return m_map.size(); } std::string to_string() const { Formatter o; o << "CoordinateMapCPU:" << size() << "x" << m_coordinate_size; return o.str(); } using base_type::capacity; using base_type::coordinate_size; using base_type::get_tensor_stride; inline void reserve(size_type c) { base_type::reserve(c); m_map.reserve(c); } void copy_coordinates(coordinate_type *dst_coordinate) const { if (m_map.size() == 0) return; size_t const capacity = m_map.capacity(); size_t N = omp_get_max_threads(); const size_t stride = (capacity + N - 1) / N; N = (capacity + stride - 1) / stride; LOG_DEBUG("kernel map with", N, "chunks, stride", stride, "capacity", capacity); // When no need to iterate through the region // Put if outside the loop for speed #pragma omp parallel for for (index_type n = 0; n < N; ++n) { for (auto it = m_map.begin(stride * n); // it.num_steps() < std::min(stride, capacity - n * stride); // ++it) { std::copy_n(it->first.data(), m_coordinate_size, dst_coordinate + m_coordinate_size * it->second); } } } std::vector batch_indices() const { std::vector indices(size()); for (auto it = m_map.begin(); it != m_map.end(); ++it) { indices[it->second] = it->first.data()[0]; } return indices; } std::pair insert(key_type const &key, mapped_type const &val) { ASSERT(val < base_type::m_capacity, "Invalid mapped value: ", val, ", current capacity: ", base_type::m_capacity); coordinate_type *ptr = &base_type::m_coordinates[val * m_coordinate_size]; std::copy_n(key.data(), m_coordinate_size, ptr); return m_map.insert(value_type(coordinate{ptr}, val)); } inline iterator find(key_type const &key) { return m_map.find(key); } inline const_iterator find(key_type const &key) const { return m_map.find(key); } inline const_iterator cend() const { return m_map.cend(); } private: using base_type::m_coordinate_size; map_type m_map; }; // Field map template class TemplatedAllocator = std::allocator> class CoordinateFieldMapCPU : public CoordinateMap { // Coordinate wrapper public: using base_type = CoordinateMap; using coordinate_map_type = CoordinateMapCPU; using self_type = CoordinateFieldMapCPU; using size_type = typename base_type::size_type; using index_type = typename base_type::index_type; using stride_type = typename base_type::stride_type; using byte_allocator_type = TemplatedAllocator; public: CoordinateFieldMapCPU() = delete; CoordinateFieldMapCPU(size_type const number_of_coordinates, size_type const coordinate_size, stride_type const &stride = {1}, byte_allocator_type alloc = byte_allocator_type()) : base_type(number_of_coordinates, coordinate_size, stride, alloc), m_size(number_of_coordinates) { base_type::reserve(number_of_coordinates); } /* * @brief given a key iterator begin-end pair and a value iterator begin-end * pair, insert all elements. * * @return none */ void insert(coordinate_field_type const *coordinate_begin, coordinate_field_type const *coordinate_end) { size_type N = (coordinate_end - coordinate_begin) / m_coordinate_size; base_type::allocate(N); // copy data directly to the ptr std::copy_n(coordinate_begin, N * m_coordinate_size, base_type::coordinate_data()); } using base_type::const_coordinate_data; using base_type::coordinate_data; void copy_coordinates(coordinate_field_type *dst_coordinate) const { std::copy_n(base_type::const_coordinate_data(), size() * m_coordinate_size, dst_coordinate); } void quantize_coordinates(coordinate_int_type *p_dst_coordinates, stride_type const &tensor_stride) const { coordinate_field_type const *const p_tfield = const_coordinate_data(); int64_t const stride_prod = std::accumulate( tensor_stride.begin(), tensor_stride.end(), 1, std::multiplies<>()); ASSERT(stride_prod > 0, "Invalid stride"); const size_t N = omp_get_max_threads(); const size_t stride = (size() + N - 1) / N; LOG_DEBUG("kernel map with", N, "chunks and", stride, "stride."); if (stride_prod == 1) { #pragma omp parallel for for (uint32_t n = 0; n < N; n++) { for (auto i = stride * n; i < std::min((n + 1) * stride, uint64_t(size())); ++i) { // batch index coordinate_int_type *p_curr_dst = &p_dst_coordinates[i * m_coordinate_size]; p_curr_dst[0] = std::lroundf(p_tfield[i * m_coordinate_size]); for (uint32_t j = 1; j < m_coordinate_size; ++j) { p_curr_dst[j] = std::floor(p_tfield[m_coordinate_size * i + j]); } } } } else { #pragma omp parallel for for (uint32_t n = 0; n < N; n++) { for (auto i = stride * n; i < std::min((n + 1) * stride, uint64_t(size())); ++i) { // batch index coordinate_int_type *p_curr_dst = &p_dst_coordinates[i * m_coordinate_size]; p_curr_dst[0] = std::lroundf(p_tfield[i * m_coordinate_size]); for (uint32_t j = 1; j < m_coordinate_size; ++j) { auto const curr_tensor_stride = tensor_stride[j - 1]; p_curr_dst[j] = curr_tensor_stride * std::floor(p_tfield[m_coordinate_size * i + j] / curr_tensor_stride); } } } } } coordinate_map_type origin() const { // tensor stride is set to {0,..., 0} for the origin map. stride_type origin_tensor_stride(m_coordinate_size - 1); std::for_each(origin_tensor_stride.begin(), origin_tensor_stride.end(), [](auto &i) { i = 0; }); // Over estimate the reserve size to be size(); CoordinateMapCPU origin_map( size(), m_coordinate_size, origin_tensor_stride, base_type::m_byte_allocator); coordinate_field_type const *const p_tfield = const_coordinate_data(); std::vector dst(m_coordinate_size); std::for_each(dst.begin(), dst.end(), [](auto &i) { i = 0; }); coordinate tmp_coordinate(&dst[0]); index_type c = 0; for (size_t i = 0; i < size(); ++i) { dst[0] = coordinate_int_type(p_tfield[i * m_coordinate_size]); auto result = origin_map.insert(tmp_coordinate, c); c += result.second; } return origin_map; } cpu_kernel_map origin_map(coordinate_map_type const &origin_coordinate_map) const { // generate an in-out (kernel) map that maps all input points in the same // voxel to strided output voxel. ASSERT(std::all_of(origin_coordinate_map.get_tensor_stride().begin(), origin_coordinate_map.get_tensor_stride().end(), [](auto const &i) { return i == 0; }), "Invalid origin tensor stride", origin_coordinate_map.get_tensor_stride()); size_type const in_size = size(); size_type const out_size = origin_coordinate_map.size(); LOG_DEBUG("Generate origin_map with in NNZ:", in_size, "out NNZ:", out_size); ASSERT(in_size > out_size, "Invalid out_coordinate_map"); std::vector> in_out(in_size); // compute the chunk size per thread. // There's a trade-off between the thread initialization overhead and the // job sizes. If some jobs finish earlier than others due to imbalance in // hash distribution, these threads will be idle. size_t const in_map_num_elements = size(); size_t N = 2 * omp_get_max_threads(); size_t const stride = (in_map_num_elements + N - 1) / N; N = (in_map_num_elements + stride - 1) / stride; LOG_DEBUG("kernel map with", N, "chunks", stride, "stride"); coordinate_field_type const *const p_tfield = const_coordinate_data(); size_type num_used = 0; #pragma omp parallel for for (index_type n = 0; n < N; ++n) { index_type curr_index_begin; std::vector dst(m_coordinate_size); std::for_each(dst.begin(), dst.end(), [](auto &i) { i = 0; }); // LOG_DEBUG(n, "th chunk contains", // std::min((n + 1) * stride, in_map_num_elements) - n * stride, // "elements"); for (size_t i = stride * n; i < std::min((n + 1) * stride, in_map_num_elements); ++i) { dst[0] = p_tfield[i * m_coordinate_size]; // LOG_DEBUG(i, "th row batch index", dst[0]); const auto iter_origin = origin_coordinate_map.find( coordinate(dst.data())); ASSERT(iter_origin != origin_coordinate_map.cend(), "Invalid origin_coordinate_map"); index_type origin_row_index = iter_origin->second; #pragma omp atomic capture { curr_index_begin = num_used; num_used += 1; } in_out[curr_index_begin] = std::make_pair(i, origin_row_index); } } // #ifdef DEBUG // LOG_DEBUG("Kernel map"); // for (auto iter : in_out) { // std::cout << iter.first << ":" << iter.second << "\n"; // } // #endif // Decomposed kernel map auto batch_indices = origin_coordinate_map.batch_indices(); return cpu_kernel_map(in_out, batch_indices); } inline size_type size() const noexcept { return m_size; } std::string to_string() const { Formatter o; o << "CoordinateFieldMapCPU:" << size() << "x" << m_coordinate_size; return o.str(); } private: using base_type::m_coordinate_size; size_type m_size; }; } // namespace minkowski #endif // COORDINATE_MAP_CPU