/* Copyright (c) 2020 NVIDIA CORPORATION.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Please cite "4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural
* Networks", CVPR'19 (https://arxiv.org/abs/1904.08755) if you use any part
* of the code.
*/
#ifndef COORDINATE_MAP_CPU_HPP
#define COORDINATE_MAP_CPU_HPP
#include "coordinate_map.hpp"
#include "kernel_map.hpp"
#include "kernel_region.hpp"
#include <numeric>
#include <omp.h>
#include <torch/extension.h>
namespace minkowski {
namespace detail {
template <typename coordinate_type,
typename stride_type = default_types::stride_type>
bool is_coordinate_aligned(coordinate<coordinate_type> const &point,
stride_type const &stride) {
for (uint32_t i = 0; i < stride.size(); ++i) {
if (point[i + 1] % stride[i] != 0)
return false;
}
return true;
}
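// Example for is_coordinate_aligned (illustrative values): with tensor
// stride {2, 4}, the point (batch=1, x=6, y=8) is aligned (6 % 2 == 0 and
// 8 % 4 == 0) while (batch=1, x=6, y=9) is not; point[0] is the batch
// index and is skipped.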
template <typename coordinate_type, typename Dtype, typename MapType>
std::pair<at::Tensor, at::Tensor>
field_map_kernel(uint32_t const num_tfield, //
uint32_t const coordinate_size, //
Dtype const *const p_tfield, //
MapType const &in_map, //
default_types::stride_type const &tensor_stride) {
cpu_in_maps in_maps = initialize_maps<cpu_in_map>(1, num_tfield);
cpu_out_maps out_maps = initialize_maps<cpu_out_map>(1, num_tfield);
uint32_t num_used{0};
// OMP
// size_t stride = max((size_t)100, numElements / (2 *
// omp_get_max_threads())); size_t N = (numElements + stride - 1) /
// stride;
// compute the chunk size per thread.
// There's a trade-off between the thread initialization overhead and
// the job sizes. If some jobs finish earlier than others due to
// imbalance in hash distribution, these threads will be idle.
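// e.g. (illustrative numbers only): with 8 OpenMP threads and
// num_tfield = 1000, N = 16 chunks and stride = ceil(1000 / 16) = 63 rows
// per chunk; the last chunk processes the remaining 55 rows.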
const size_t N = 2 * omp_get_max_threads();
const size_t stride = (num_tfield + N - 1) / N;
LOG_DEBUG("kernel map with", N, "chunks and", stride, "stride.");
#pragma omp parallel for
for (uint32_t n = 0; n < N; n++) {
// temporary variables for each thread
std::vector<coordinate_type> curr_vec(coordinate_size);
coordinate<coordinate_type> curr_coordinate(curr_vec.data());
uint32_t curr_index_begin;
for (auto i = stride * n;
i < std::min<uint64_t>((n + 1) * stride, uint64_t(num_tfield)); ++i) {
// batch index
curr_vec[0] = std::lround(p_tfield[i * coordinate_size]);
for (uint32_t j = 1; j < coordinate_size; ++j) {
auto const curr_tensor_stride = tensor_stride[j - 1];
curr_vec[j] =
curr_tensor_stride *
std::floor(p_tfield[coordinate_size * i + j] / curr_tensor_stride);
}
const auto iter_in = in_map.find(curr_coordinate);
// LOG_DEBUG(kernel_ind, ":",
// PtrToString(iter_out->first.data(),
// coordinate_size),
// "->", PtrToString(point.data(),
// coordinate_size));
if (iter_in != in_map.end()) {
#pragma omp atomic capture
{
curr_index_begin = num_used;
num_used += 1;
}
// in_maps and out_maps were preallocated with num_tfield entries, so
// this indexed write is in range.
in_maps[0][curr_index_begin] = iter_in->second;
out_maps[0][curr_index_begin] = i;
// LOG_DEBUG(kernel_ind, ":",
// PtrToString(iter_in->first.data(),
// coordinate_size),
// "->",
// PtrToString(iter_out->first.data(),
// coordinate_size));
}
}
}
auto final_in_map = torch::empty(
{num_used},
torch::TensorOptions().dtype(torch::kInt32).requires_grad(false));
auto final_out_map = torch::empty(
{num_used},
torch::TensorOptions().dtype(torch::kInt32).requires_grad(false));
std::copy_n(in_maps[0].data(), num_used,
final_in_map.template data_ptr<int>());
std::copy_n(out_maps[0].data(), num_used,
final_out_map.template data_ptr<int>());
return std::make_pair(final_in_map, final_out_map);
}
template <typename coordinate_type, typename Dtype, typename MapType>
std::vector<at::Tensor> interpolation_map_weight_kernel(
uint32_t const num_tfield, //
uint32_t const coordinate_size, //
Dtype const *const p_tfield, //
MapType const &in_map, //
default_types::stride_type const &tensor_stride) {
constexpr bool is_float32 = std::is_same<Dtype, float>::value;
at::ScalarType const float_type =
is_float32 ? at::ScalarType::Float : at::ScalarType::Double;
uint32_t const neighbor_volume = 1u << (coordinate_size - 1);
LOG_DEBUG("neighbor_volume :", neighbor_volume, "num_tfield:", num_tfield);
cpu_in_maps in_maps =
initialize_maps<cpu_in_map>(neighbor_volume, num_tfield);
cpu_out_maps out_maps =
initialize_maps<cpu_out_map>(neighbor_volume, num_tfield);
std::vector<std::vector<Dtype>> weights =
initialize_maps<std::vector<Dtype>>(neighbor_volume, num_tfield);
std::vector<uint32_t> num_used(neighbor_volume, 0);
// OMP
// size_t stride = max((size_t)100, numElements / (2 *
// omp_get_max_threads())); size_t N = (numElements + stride - 1) /
// stride;
// compute the chunk size per thread.
// There's a trade-off between the thread initialization overhead and
// the job sizes. If some jobs finish earlier than others due to
// imbalance in hash distribution, these threads will be idle.
const size_t N = 2 * omp_get_max_threads();
const size_t stride = (num_tfield + N - 1) / N;
LOG_DEBUG("kernel map with", N, "chunks and", stride, "stride.");
#pragma omp parallel for
for (uint32_t n = 0; n < N; n++) {
// temporary variables for each thread
std::vector<coordinate_type> curr_vec(coordinate_size), lb(coordinate_size),
ub(coordinate_size);
coordinate<coordinate_type> curr_coordinate(curr_vec.data());
uint32_t curr_index_begin;
for (auto i = stride * n;
i < std::min<uint64_t>((n + 1) * stride, uint64_t(num_tfield)); ++i) {
// batch index
curr_vec[0] = std::lround(p_tfield[i * coordinate_size]);
for (uint32_t j = 1; j < coordinate_size; ++j) {
auto const curr_tensor_stride = tensor_stride[j - 1];
lb[j] =
curr_tensor_stride *
std::floor(p_tfield[coordinate_size * i + j] / curr_tensor_stride);
ub[j] = lb[j] + curr_tensor_stride;
curr_vec[j] = lb[j];
}
// For elements in the current region
for (uint32_t neighbor_ind = 0; neighbor_ind < neighbor_volume;
++neighbor_ind) {
uint32_t mask = 1;
for (uint32_t j = coordinate_size - 1; j > 0; --j) {
if ((neighbor_ind & mask) == 0)
curr_vec[j] = lb[j];
else
curr_vec[j] = ub[j];
mask = mask << 1;
}
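// e.g., for D = 3 spatial dims, neighbor_ind = 5 = 0b101 picks
// (ub, lb, ub) for (x, y, z): bit 0 toggles the last spatial dim
// (j = 3), bit 1 the next one (j = 2), and so on.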
const auto iter_in = in_map.find(curr_coordinate);
// LOG_DEBUG(kernel_ind, ":",
// PtrToString(iter_out->first.data(),
// coordinate_size),
// "->", PtrToString(point.data(),
// coordinate_size));
if (iter_in != in_map.end()) {
#pragma omp atomic capture
{
curr_index_begin = num_used[neighbor_ind];
num_used[neighbor_ind] += 1;
}
// Compute weights
Dtype weight = 1.0;
for (uint32_t j = 1; j < coordinate_size; ++j) {
weight *=
1 - std::abs(p_tfield[coordinate_size * i + j] - curr_vec[j]) /
tensor_stride[j - 1];
}
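// This is the standard multilinear interpolation weight
//   w = prod_j (1 - |t_j - c_j| / s_j)
// over spatial dims j, with field point t, corner c, and tensor stride
// s; the weights over all 2^D corners sum to 1 when every corner exists
// in the input map.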
// in_maps, out_maps, and weights were preallocated with num_tfield
// entries per corner, so this indexed write is in range.
in_maps[neighbor_ind][curr_index_begin] = iter_in->second;
out_maps[neighbor_ind][curr_index_begin] = i;
weights[neighbor_ind][curr_index_begin] = weight;
// LOG_DEBUG(kernel_ind, ":",
// PtrToString(iter_in->first.data(),
// coordinate_size),
// "->",
// PtrToString(iter_out->first.data(),
// coordinate_size));
}
}
}
}
auto const total_num_used =
std::accumulate(num_used.begin(), num_used.end(), 0);
auto final_in_map = torch::empty(
{total_num_used},
torch::TensorOptions().dtype(torch::kInt32).requires_grad(false));
auto final_out_map = torch::empty(
{total_num_used},
torch::TensorOptions().dtype(torch::kInt32).requires_grad(false));
auto final_weights = torch::empty(
{total_num_used},
torch::TensorOptions().dtype(float_type).requires_grad(false));
uint32_t final_begin = 0;
for (uint32_t i = 0; i < neighbor_volume; ++i) {
uint32_t const max_num = num_used[i];
LOG_DEBUG("kernel index", i, "size:", max_num);
std::copy_n(in_maps[i].data(), max_num,
&final_in_map.template data_ptr<int>()[final_begin]);
std::copy_n(out_maps[i].data(), max_num,
&final_out_map.template data_ptr<int>()[final_begin]);
std::copy_n(weights[i].data(), max_num,
final_weights.template data_ptr<Dtype>() + final_begin);
final_begin += max_num;
}
return {final_in_map, final_out_map, final_weights};
}
} // namespace detail
/*
 * A CoordinateMap specialization for the CPU, backed by a robin_hood
 * unordered flat map.
 */
// clang-format off
template <typename coordinate_type,
template <typename T> class TemplatedAllocator = std::allocator>
class CoordinateMapCPU : public CoordinateMap<coordinate_type, TemplatedAllocator> {
public:
using base_type = CoordinateMap<coordinate_type, TemplatedAllocator>;
using self_type = CoordinateMapCPU<coordinate_type, TemplatedAllocator>;
using size_type = typename base_type::size_type;
using index_type = typename base_type::index_type;
using stride_type = typename base_type::stride_type;
using key_type = coordinate<coordinate_type>;
using mapped_type = default_types::index_type;
using hasher = detail::coordinate_murmur3<coordinate_type>;
using key_equal = detail::coordinate_equal_to<coordinate_type>;
using map_type =
robin_hood::unordered_flat_map<key_type, // key
mapped_type, // mapped_type
hasher, // hasher
key_equal // equality
>;
using value_type = typename map_type::value_type;
using iterator = typename map_type::iterator;
using const_iterator = typename map_type::const_iterator;
using index_vector_type = typename base_type::index_vector_type;
using byte_allocator_type = TemplatedAllocator<char>;
// clang-format on
public:
CoordinateMapCPU() = delete;
CoordinateMapCPU(size_type const number_of_coordinates,
size_type const coordinate_size,
stride_type const &stride = {1},
byte_allocator_type alloc = byte_allocator_type())
: base_type(number_of_coordinates, coordinate_size, stride, alloc),
m_map(
map_type{0, hasher{coordinate_size}, key_equal{coordinate_size}}) {
m_map.reserve(number_of_coordinates);
}
/*
 * @brief given a coordinate pointer begin-end pair, insert all coordinates
 * with sequentially increasing row indices as mapped values.
 *
 * @return none
 */
void insert(coordinate_type const *coordinate_begin,
coordinate_type const *coordinate_end) {
size_type N = (coordinate_end - coordinate_begin) / m_coordinate_size;
base_type::allocate(N);
index_type value = 0;
for (coordinate_type const *key = coordinate_begin; key != coordinate_end;
key += m_coordinate_size, ++value) {
// value_type ctor needed because this might be called with std::pair's
insert(key_type(key), value);
}
}
/*
 * @brief given a coordinate pointer begin-end pair, insert all coordinates
 * and return the mapping between input rows and unique coordinates.
 *
 * @return a pair<vector<long>, vector<long>> of (mapping, inverse_mapping).
 * mapping holds the input row index of the first occurrence of each unique
 * coordinate; inverse_mapping holds, for each input row, the index of its
 * unique coordinate (compacted to 0..n_unique-1 when remap is true), so the
 * original coordinates can be reconstructed from the unique ones:
 *
 * >>> unique_coordinates = input_coordinates[mapping]
 * >>> reconstructed_coordinates = unique_coordinates[inverse_mapping]
 * >>> torch.all(reconstructed_coordinates == input_coordinates)
 */
template <bool remap>
std::pair<std::vector<int64_t>, std::vector<int64_t>> // return maps
insert_and_map(coordinate_type const *coordinate_begin,
coordinate_type const *coordinate_end) {
size_type N = (coordinate_end - coordinate_begin) / m_coordinate_size;
std::vector<int64_t> mapping, inverse_mapping;
base_type::allocate(N);
mapping.reserve(N);
inverse_mapping.reserve(N);
index_type value{0}, row_index{0};
for (coordinate_type const *key = coordinate_begin; key != coordinate_end;
key += m_coordinate_size, row_index += 1) {
// value_type ctor needed because this might be called with std::pair's
auto const result = insert(key_type(key), value);
if (result.second) {
mapping.push_back(row_index);
inverse_mapping.push_back(value);
} else {
// result.first is an iterator of pair<key, mapped_type>
inverse_mapping.push_back(result.first->second);
}
value += remap ? result.second : 1;
}
return std::make_pair(std::move(mapping), std::move(inverse_mapping));
}
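// A minimal usage sketch for insert_and_map (hypothetical values; assumes
// 4-column batched coordinates, i.e. coordinate_size == 4):
//
//   std::vector<int32_t> coords{0, 0, 0, 0,   // row 0
//                               0, 0, 0, 0,   // row 1, duplicate of row 0
//                               0, 1, 0, 0};  // row 2
//   CoordinateMapCPU<int32_t> map(3, 4);
//   auto const maps = map.insert_and_map<true>(
//       coords.data(), coords.data() + coords.size());
//   // maps.first  (mapping)         == {0, 2}
//   // maps.second (inverse_mapping) == {0, 0, 1}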
/*
 * @brief given a key iterator begin-end pair, find all valid keys and
 * their indices.
 *
 * @return a pair of (valid query index, query result) vectors.
 */
template <typename key_iterator>
std::pair<index_vector_type, index_vector_type> find(key_iterator key_first,
key_iterator key_last) {
size_type N = key_last - key_first;
ASSERT(N <= base_type::m_capacity,
"Invalid search range. Current capacity:", base_type::m_capacity,
", search range:", N);
// reserve the result slots
index_vector_type valid_query_index, query_result;
valid_query_index.reserve(N);
query_result.reserve(N);
key_iterator key_curr{key_first};
for (; key_curr != key_last; ++key_curr) {
auto const query_iter = m_map.find(*key_curr);
// If valid query
if (query_iter != m_map.end()) {
valid_query_index.push_back(key_curr - key_first);
query_result.push_back(query_iter->second);
}
}
return std::make_pair(valid_query_index, query_result);
}
// Network specific functions.
/*
* @brief strided coordinate map.
*/
self_type stride(stride_type const &stride) const {
ASSERT(stride.size() == m_coordinate_size - 1, "Invalid stride", stride);
// Reserve size() entries as an upper bound on the strided map size.
self_type stride_map(
size(), m_coordinate_size,
detail::stride_tensor_stride(base_type::m_tensor_stride, stride),
base_type::m_byte_allocator);
index_type c = 0;
std::vector<coordinate_type> dst(m_coordinate_size);
coordinate<coordinate_type> strided_coordinate(&dst[0]);
for (auto const &kv : m_map) {
detail::stride_coordinate<coordinate_type>(kv.first, dst,
stride_map.m_tensor_stride);
auto result = stride_map.insert(strided_coordinate, c);
c += result.second;
}
return stride_map;
}
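// Example for stride(): with m_tensor_stride {1, 1} and stride {2, 2},
// the output tensor stride is {2, 2} and input rows (b, 3, 5) and
// (b, 2, 4) both collapse to the single output row (b, 2, 4).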
/*****************************************************************************
* Map generation
****************************************************************************/
/*
* @brief strided coordinate map for region.
*/
self_type stride_region(cpu_kernel_region<coordinate_type> const &kernel,
stride_type const &out_tensor_stride) const {
ASSERT(kernel.coordinate_size() == m_coordinate_size, "Invalid kernel");
// Reserve size() * kernel.volume() entries as an upper bound.
self_type stride_map(size() * kernel.volume(), m_coordinate_size,
out_tensor_stride, base_type::m_byte_allocator);
auto ckernel = cpu_kernel_region<coordinate_type>(kernel);
std::vector<coordinate_type> tmp(m_coordinate_size);
coordinate<coordinate_type> point(tmp.data());
index_type num_used{0};
if (kernel.is_transpose()) {
for (auto iter_in = m_map.begin(); iter_in != m_map.end(); ++iter_in) {
// For elements in the current region
for (uint32_t kernel_ind = 0; kernel_ind < ckernel.volume();
++kernel_ind) {
ckernel.coordinate_at(kernel_ind, iter_in->first.data(), tmp.data());
auto const result = stride_map.insert(point, num_used);
num_used += result.second;
}
}
} else {
LOG_DEBUG("stride_region with no transpose");
// Expand coordinates with regular conv
for (auto iter_in = m_map.begin(); iter_in != m_map.end(); ++iter_in) {
// For elements in the current region
for (uint32_t kernel_ind = 0; kernel_ind < ckernel.volume();
++kernel_ind) {
// TODO replace with more efficient code
ckernel.coordinate_at(kernel_ind, iter_in->first.data(), tmp.data());
if (detail::is_coordinate_aligned<coordinate_type, stride_type>(
point, out_tensor_stride)) {
auto const result = stride_map.insert(point, num_used);
num_used += result.second;
}
}
}
}
return stride_map;
}
/*
 * @brief origin coordinate map: all spatial coordinates collapse to 0 and
 * only the batch index is kept, one row per batch.
 */
self_type origin() const {
// tensor stride is set to {0,..., 0} for the origin map.
stride_type origin_tensor_stride(m_coordinate_size - 1);
std::for_each(origin_tensor_stride.begin(), origin_tensor_stride.end(),
[](auto &i) { i = 0; });
// Reserve size() entries as an upper bound on the number of batches.
self_type origin_map(size(), m_coordinate_size, origin_tensor_stride,
base_type::m_byte_allocator);
index_type c = 0;
std::vector<coordinate_type> dst(m_coordinate_size);
std::for_each(dst.begin(), dst.end(), [](auto &i) { i = 0; });
coordinate<coordinate_type> tmp_coordinate(&dst[0]);
for (auto const &kv : m_map) {
dst[0] = kv.first[0];
auto result = origin_map.insert(tmp_coordinate, c);
c += result.second;
}
return origin_map;
}
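// Example for origin(): input rows (0, 3, 5), (0, 2, 4), (1, 0, 0)
// produce the two origin rows (0, 0, 0) and (1, 0, 0), i.e. one row per
// batch index.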
/*
 * @brief generate a new coordinate map that keeps only the coordinates
 * whose keep mask is true
 */
self_type prune(bool const *keep_begin, bool const *keep_end) const {
ASSERT(keep_end - keep_begin == size(), "Invalid range for pruning");
// Reserve size() entries as an upper bound on the pruned size.
self_type pruned_map(size(), m_coordinate_size, base_type::m_tensor_stride,
base_type::m_byte_allocator);
index_type c = 0;
for (auto const &kv : m_map) {
// Use the row index defined
if (keep_begin[kv.second]) {
auto result = pruned_map.insert(kv.first, c);
c += result.second;
}
}
LOG_DEBUG("size:", pruned_map.size(), "capacity:", pruned_map.capacity());
return pruned_map;
}
self_type merge(const self_type &other) const {
// const member function: the wrappers must hold const references
std::vector<std::reference_wrapper<const self_type>> maps{
std::cref(*this), std::cref(other)};
return merge(maps);
}
self_type
merge(const std::vector<std::reference_wrapper<const self_type>> &maps) const {
// merge all input maps
size_t const all_size = std::accumulate(
maps.begin(), maps.end(), size_t{0},
[](size_t sum, const self_type &map) { return sum + map.size(); });
self_type merged_map(all_size, m_coordinate_size,
base_type::m_tensor_stride,
base_type::m_byte_allocator);
// Push all coordinates
index_type c = 0;
for (self_type const &map : maps) {
for (auto const &kv : map.m_map) {
auto result = merged_map.insert(kv.first, c);
c += result.second;
}
}
return merged_map;
}
/*****************************************************************************
* Kernel map
****************************************************************************/
cpu_kernel_map
kernel_map(self_type const &out_coordinate_map,
cpu_kernel_region<coordinate_type> const &kernel) const {
// Overestimate each per-offset map size as the output map size.
size_type out_size = out_coordinate_map.size();
size_type kernel_volume = kernel.volume();
LOG_DEBUG("kernel volume:", kernel_volume, "out_size:", out_size);
cpu_in_maps in_maps = initialize_maps<cpu_in_map>(kernel_volume, out_size);
cpu_out_maps out_maps =
initialize_maps<cpu_out_map>(kernel_volume, out_size);
std::vector<size_type> num_used(kernel_volume, 0);
// OMP
const auto &out_mmap = out_coordinate_map.m_map;
const size_t out_map_num_elements = out_mmap.capacity();
// size_t stride = max((size_t)100, numElements / (2 *
// omp_get_max_threads())); size_t N = (numElements + stride - 1) /
// stride;
// compute the chunk size per thread.
// There's a trade-off between the thread initialization overhead and the
// job sizes. If some jobs finish earlier than others due to imbalance in
// hash distribution, these threads will be idle.
size_t N = 2 * omp_get_max_threads();
const size_t stride = (out_map_num_elements + N - 1) / N;
N = (out_map_num_elements + stride - 1) / stride;
LOG_DEBUG("kernel map with", N, "chunks and", stride, "stride.");
LOG_DEBUG((kernel.region_type() != RegionType::CUSTOM && kernel_volume == 1)
? "single kernel"
: "otherwise");
// When no need to iterate through the region
// Put if outside the loop for speed
if (kernel.region_type() != RegionType::CUSTOM && kernel_volume == 1) {
index_type curr_index_begin = 0;
for (auto iter_out = out_mmap.begin(); iter_out != out_mmap.end();
++iter_out) {
const auto iter_in = m_map.find(iter_out->first);
if (iter_in != m_map.end()) {
in_maps[0][curr_index_begin] = iter_in->second;
out_maps[0][curr_index_begin] = iter_out->second;
++curr_index_begin;
}
}
num_used[0] = curr_index_begin;
} else {
#pragma omp parallel for
for (index_type n = 0; n < N; n++) {
auto ckernel = cpu_kernel_region<coordinate_type>(kernel);
// temporary variables for each thread
std::vector<coordinate_type> tmp(m_coordinate_size);
coordinate<coordinate_type> curr_kernel_coordinate(tmp.data());
index_type curr_index_begin;
for (auto iter_out = out_mmap.begin(stride * n);
iter_out.num_steps() <
std::min(stride, out_map_num_elements - n * stride);
++iter_out) {
// For elements in the current region
for (uint32_t kernel_ind = 0; kernel_ind < ckernel.volume();
++kernel_ind) {
// If the input coord exists
ckernel.coordinate_at(kernel_ind, iter_out->first.data(),
tmp.data());
const auto iter_in = m_map.find(curr_kernel_coordinate);
// LOG_DEBUG(kernel_ind, ":",
// PtrToString(iter_out->first.data(), m_coordinate_size),
// "->", PtrToString(point.data(), m_coordinate_size));
if (iter_in != m_map.end()) {
#pragma omp atomic capture
{
curr_index_begin = num_used[kernel_ind];
num_used[kernel_ind] += 1;
}
// in_maps and out_maps were preallocated with out_size entries per
// kernel offset, so this indexed write is in range.
in_maps[kernel_ind][curr_index_begin] = iter_in->second;
out_maps[kernel_ind][curr_index_begin] = iter_out->second;
// LOG_DEBUG(kernel_ind, ":",
// PtrToString(iter_in->first.data(),
// m_coordinate_size),
// "->",
// PtrToString(iter_out->first.data(),
// m_coordinate_size));
}
}
}
}
}
for (index_type i = 0; i < kernel_volume; ++i) {
index_type max_num = num_used[i];
LOG_DEBUG("kernel index", i, "size:", max_num);
in_maps[i].resize(max_num);
out_maps[i].resize(max_num);
}
return std::make_pair(in_maps, out_maps);
}
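// The cpu_kernel_map returned by kernel_map above is an (in_maps,
// out_maps) pair indexed by kernel offset: for offset k, input row
// in_maps[k][t] contributes to output row out_maps[k][t]. e.g.
// (illustrative), a 1D kernel of size 3 yields three in/out vectors, one
// per spatial offset in {-1, 0, 1}.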
cpu_kernel_map stride_map(self_type const &out_coordinate_map,
stride_type const &out_tensor_stride) const {
// generate an in-out (kernel) map that maps every input point to the
// strided output voxel that contains it.
size_type in_size = size();
LOG_DEBUG("Generate stride_map with in NNZ:", in_size,
"out NNZ:", out_coordinate_map.size(),
"out_tensor_stride:", out_tensor_stride);
cpu_in_maps in_maps = initialize_maps<cpu_in_map>(1, in_size);
cpu_out_maps out_maps = initialize_maps<cpu_out_map>(1, in_size);
LOG_DEBUG("stride map in_maps.size():", in_size);
LOG_DEBUG("stride map out_maps.size():", out_coordinate_map.size());
// compute the chunk size per thread.
// There's a trade-off between the thread initialization overhead and the
// job sizes. If some jobs finish earlier than others due to imbalance in
// hash distribution, these threads will be idle.
const size_t in_map_num_elements = m_map.capacity();
size_t N = 2 * omp_get_max_threads();
const size_t stride = (in_map_num_elements + N - 1) / N;
N = (in_map_num_elements + stride - 1) / stride;
LOG_DEBUG("kernel map with", N, "chunks.");
index_type num_used = 0;
#pragma omp parallel for
for (index_type n = 0; n < N; ++n) {
index_type curr_index_begin;
std::vector<coordinate_type> dst(m_coordinate_size);
for (auto iter_in = m_map.begin(stride * n);
iter_in.num_steps() <
std::min(stride, in_map_num_elements - n * stride);
++iter_in) {
detail::stride_coordinate<coordinate_type>(iter_in->first, dst,
out_tensor_stride);
const auto iter_out =
out_coordinate_map.find(coordinate<coordinate_type>(dst.data()));
ASSERT(iter_out != out_coordinate_map.m_map.cend(),
"Invalid out_coordinate_map");
#pragma omp atomic capture
{
curr_index_begin = num_used;
num_used += 1;
}
in_maps[0][curr_index_begin] = iter_in->second;
out_maps[0][curr_index_begin] = iter_out->second;
}
}
return std::make_pair(std::move(in_maps), std::move(out_maps));
}
cpu_kernel_map origin_map(self_type const &origin_coordinate_map) const {
// generate an in-out (kernel) map that maps every input row to the
// origin-map row with the same batch index.
ASSERT(std::all_of(origin_coordinate_map.get_tensor_stride().begin(),
origin_coordinate_map.get_tensor_stride().end(),
[](auto const &i) { return i == 0; }),
"Invalid origin tensor stride",
origin_coordinate_map.get_tensor_stride());
size_type const in_size = size();
size_type const out_size = origin_coordinate_map.size();
LOG_DEBUG("Generate origin_map with in NNZ:", in_size,
"out NNZ:", out_size);
ASSERT(in_size >= out_size, "Invalid out_coordinate_map");
std::vector<std::pair<index_type, index_type>> in_out(in_size);
// compute the chunk size per thread.
// There's a trade-off between the thread initialization overhead and the
// job sizes. If some jobs finish earlier than others due to imbalance in
// hash distribution, these threads will be idle.
size_t const in_map_num_elements = m_map.capacity();
size_t N = 2 * omp_get_max_threads();
size_t const stride = (in_map_num_elements + N - 1) / N;
N = (in_map_num_elements + stride - 1) / stride;
LOG_DEBUG("kernel map with", N, "chunks.");
size_type num_used = 0;
#pragma omp parallel for
for (index_type n = 0; n < N; ++n) {
index_type curr_index_begin;
std::vector<coordinate_type> dst(m_coordinate_size);
std::for_each(dst.begin(), dst.end(), [](auto &i) { i = 0; });
for (auto iter_in = m_map.begin(stride * n);
iter_in.num_steps() <
std::min(stride, in_map_num_elements - n * stride);
++iter_in) {
dst[0] = iter_in->first[0];
const auto iter_origin =
origin_coordinate_map.find(coordinate<coordinate_type>(dst.data()));
ASSERT(iter_origin != origin_coordinate_map.m_map.cend(),
"Invalid origin_coordinate_map");
index_type origin_row_index = iter_origin->second;
#pragma omp atomic capture
{
curr_index_begin = num_used;
num_used += 1;
}
in_out[curr_index_begin] =
std::make_pair(iter_in->second, origin_row_index);
}
}
// Decomposed kernel map
auto batch_indices = origin_coordinate_map.batch_indices();
return cpu_kernel_map(in_out, batch_indices);
}
/*****************************************************************************
* Interpolation
****************************************************************************/
/*
* Given a continuous tensor field, return the weights and associated kernel
* map
*/
std::vector<at::Tensor>
interpolation_map_weight(at::Tensor const &tfield) const {
ASSERT(tfield.dim() == 2, "Invalid tfield dimension");
ASSERT(tfield.size(1) == m_coordinate_size, "Invalid tfield size");
// AT_DISPATCH_FLOATING_TYPES(
// tfield.scalar_type(), "interpolation_map_weight_kernel", [&] {
switch (tfield.scalar_type()) {
case at::ScalarType::Double:
return detail::interpolation_map_weight_kernel<coordinate_type, double,
map_type>(
tfield.size(0), //
m_coordinate_size, //
tfield.template data_ptr<double>(), //
m_map, //
base_type::m_tensor_stride);
case at::ScalarType::Float:
return detail::interpolation_map_weight_kernel<coordinate_type, float,
map_type>(
tfield.size(0), //
m_coordinate_size, //
tfield.template data_ptr<float>(), //
m_map, //
base_type::m_tensor_stride);
default:
ASSERT(false, "Unsupported float type");
}
}
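// A minimal usage sketch for interpolation_map_weight (hypothetical
// tensor; assumes the map holds 2D coordinates, coordinate_size == 3):
//
//   at::Tensor tfield = torch::tensor({{0.f, 0.4f, 1.7f}});  // 1 x 3
//   auto const out = map.interpolation_map_weight(tfield);
//   // out[0]: input rows, out[1]: tfield rows, out[2]: weights; at most
//   // 4 entries here, one per corner of the enclosing cell found in map.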
template <typename coordinate_field_type>
std::pair<at::Tensor, at::Tensor>
field_map(coordinate_field_type const *p_tfield,
size_type const num_tfield) const {
return detail::field_map_kernel<coordinate_type, coordinate_field_type,
map_type>(num_tfield, //
m_coordinate_size, //
p_tfield, //
m_map, //
base_type::m_tensor_stride);
}
/*****************************************************************************
* Union Map
****************************************************************************/
/*
* Find mapping from all inputs to self. The mapping is a list of 2xN tensors
*/
std::vector<at::Tensor> union_map(
std::vector<std::reference_wrapper<self_type>> const &in_maps) const {
std::vector<at::Tensor> in_out_maps;
for (self_type const &in_map : in_maps) {
size_type const N = in_map.size();
size_type const capacity = in_map.m_map.capacity();
auto curr_map = torch::empty(
{2, N},
torch::TensorOptions().dtype(torch::kInt64).requires_grad(false));
int64_t *p_in_rows = curr_map.template data_ptr<int64_t>();
std::fill_n(p_in_rows, 2 * N, -1);
int64_t *p_union_rows = p_in_rows + N;
index_type offset{0};
for (auto iter_in = in_map.m_map.begin(); iter_in.num_steps() < capacity;
++iter_in) {
p_in_rows[offset] = iter_in->second;
// WARNING: for speed, this assumes the union map contains a
// corresponding coordinate for every input coordinate; if it does
// not, the behavior is undefined.
p_union_rows[offset] = find(iter_in->first)->second;
LOG_DEBUG("offset:", offset, " in:", p_in_rows[offset],
" out:", p_union_rows[offset]);
offset++;
}
in_out_maps.push_back(std::move(curr_map));
}
return in_out_maps;
}
inline size_type size() const noexcept { return m_map.size(); }
std::string to_string() const {
Formatter o;
o << "CoordinateMapCPU:" << size() << "x" << m_coordinate_size;
return o.str();
}
using base_type::capacity;
using base_type::coordinate_size;
using base_type::get_tensor_stride;
inline void reserve(size_type c) {
base_type::reserve(c);
m_map.reserve(c);
}
void copy_coordinates(coordinate_type *dst_coordinate) const {
if (m_map.size() == 0)
return;
size_t const capacity = m_map.capacity();
size_t N = omp_get_max_threads();
const size_t stride = (capacity + N - 1) / N;
N = (capacity + stride - 1) / stride;
LOG_DEBUG("kernel map with", N, "chunks, stride", stride, "capacity",
capacity);
// When no need to iterate through the region
// Put if outside the loop for speed
#pragma omp parallel for
for (index_type n = 0; n < N; ++n) {
for (auto it = m_map.begin(stride * n); //
it.num_steps() < std::min(stride, capacity - n * stride); //
++it) {
std::copy_n(it->first.data(), m_coordinate_size,
dst_coordinate + m_coordinate_size * it->second);
}
}
}
std::vector<coordinate_type> batch_indices() const {
std::vector<coordinate_type> indices(size());
for (auto it = m_map.begin(); it != m_map.end(); ++it) {
indices[it->second] = it->first.data()[0];
}
return indices;
}
std::pair<iterator, bool> insert(key_type const &key,
mapped_type const &val) {
ASSERT(val < base_type::m_capacity, "Invalid mapped value: ", val,
", current capacity: ", base_type::m_capacity);
coordinate_type *ptr = &base_type::m_coordinates[val * m_coordinate_size];
std::copy_n(key.data(), m_coordinate_size, ptr);
return m_map.insert(value_type(coordinate<coordinate_type>{ptr}, val));
}
inline iterator find(key_type const &key) { return m_map.find(key); }
inline const_iterator find(key_type const &key) const {
return m_map.find(key);
}
inline const_iterator cend() const { return m_map.cend(); }
private:
using base_type::m_coordinate_size;
map_type m_map;
};
// Field map
template <typename coordinate_field_type, typename coordinate_int_type,
template <typename T> class TemplatedAllocator = std::allocator>
class CoordinateFieldMapCPU
: public CoordinateMap<coordinate_field_type, TemplatedAllocator> {
// Coordinate wrapper
public:
using base_type = CoordinateMap<coordinate_field_type, TemplatedAllocator>;
using coordinate_map_type =
CoordinateMapCPU<coordinate_int_type, TemplatedAllocator>;
using self_type =
CoordinateFieldMapCPU<coordinate_field_type, coordinate_int_type,
TemplatedAllocator>;
using size_type = typename base_type::size_type;
using index_type = typename base_type::index_type;
using stride_type = typename base_type::stride_type;
using byte_allocator_type = TemplatedAllocator<char>;
public:
CoordinateFieldMapCPU() = delete;
CoordinateFieldMapCPU(size_type const number_of_coordinates,
size_type const coordinate_size,
stride_type const &stride = {1},
byte_allocator_type alloc = byte_allocator_type())
: base_type(number_of_coordinates, coordinate_size, stride, alloc),
m_size(number_of_coordinates) {
base_type::reserve(number_of_coordinates);
}
/*
 * @brief given a coordinate pointer begin-end pair, insert all
 * coordinates by copying them directly into the preallocated buffer.
 *
 * @return none
 */
void insert(coordinate_field_type const *coordinate_begin,
coordinate_field_type const *coordinate_end) {
size_type N = (coordinate_end - coordinate_begin) / m_coordinate_size;
base_type::allocate(N);
// copy data directly to the ptr
std::copy_n(coordinate_begin, N * m_coordinate_size,
base_type::coordinate_data());
}
using base_type::const_coordinate_data;
using base_type::coordinate_data;
void copy_coordinates(coordinate_field_type *dst_coordinate) const {
std::copy_n(base_type::const_coordinate_data(), size() * m_coordinate_size,
dst_coordinate);
}
void quantize_coordinates(coordinate_int_type *p_dst_coordinates,
stride_type const &tensor_stride) const {
coordinate_field_type const *const p_tfield = const_coordinate_data();
int64_t const stride_prod = std::accumulate(
tensor_stride.begin(), tensor_stride.end(), 1, std::multiplies<>());
ASSERT(stride_prod > 0, "Invalid stride");
const size_t N = omp_get_max_threads();
const size_t stride = (size() + N - 1) / N;
LOG_DEBUG("kernel map with", N, "chunks and", stride, "stride.");
if (stride_prod == 1) {
#pragma omp parallel for
for (uint32_t n = 0; n < N; n++) {
for (auto i = stride * n;
i < std::min<uint64_t>((n + 1) * stride, uint64_t(size())); ++i) {
// batch index
coordinate_int_type *p_curr_dst =
&p_dst_coordinates[i * m_coordinate_size];
p_curr_dst[0] = std::lround(p_tfield[i * m_coordinate_size]);
for (uint32_t j = 1; j < m_coordinate_size; ++j) {
p_curr_dst[j] = std::floor(p_tfield[m_coordinate_size * i + j]);
}
}
}
} else {
#pragma omp parallel for
for (uint32_t n = 0; n < N; n++) {
for (auto i = stride * n;
i < std::min<uint64_t>((n + 1) * stride, uint64_t(size())); ++i) {
// batch index
coordinate_int_type *p_curr_dst =
&p_dst_coordinates[i * m_coordinate_size];
p_curr_dst[0] = std::lround(p_tfield[i * m_coordinate_size]);
for (uint32_t j = 1; j < m_coordinate_size; ++j) {
auto const curr_tensor_stride = tensor_stride[j - 1];
p_curr_dst[j] = curr_tensor_stride *
std::floor(p_tfield[m_coordinate_size * i + j] /
curr_tensor_stride);
}
}
}
}
}
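// Example for quantize_coordinates: with tensor_stride {2, 2}, the field
// row (0, 3.7f, -1.2f) quantizes to (0, 2, -2), since
// floor(3.7 / 2) * 2 == 2 and floor(-1.2 / 2) * 2 == -2.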
coordinate_map_type origin() const {
// tensor stride is set to {0,..., 0} for the origin map.
stride_type origin_tensor_stride(m_coordinate_size - 1);
std::for_each(origin_tensor_stride.begin(), origin_tensor_stride.end(),
[](auto &i) { i = 0; });
// Reserve size() entries as an upper bound on the number of batches.
CoordinateMapCPU<coordinate_int_type, TemplatedAllocator> origin_map(
size(), m_coordinate_size, origin_tensor_stride,
base_type::m_byte_allocator);
coordinate_field_type const *const p_tfield = const_coordinate_data();
std::vector<coordinate_int_type> dst(m_coordinate_size);
std::for_each(dst.begin(), dst.end(), [](auto &i) { i = 0; });
coordinate<coordinate_int_type> tmp_coordinate(&dst[0]);
index_type c = 0;
for (size_t i = 0; i < size(); ++i) {
dst[0] = coordinate_int_type(p_tfield[i * m_coordinate_size]);
auto result = origin_map.insert(tmp_coordinate, c);
c += result.second;
}
return origin_map;
}
cpu_kernel_map
origin_map(coordinate_map_type const &origin_coordinate_map) const {
// generate an in-out (kernel) map that maps every input row to the
// origin-map row with the same batch index.
ASSERT(std::all_of(origin_coordinate_map.get_tensor_stride().begin(),
origin_coordinate_map.get_tensor_stride().end(),
[](auto const &i) { return i == 0; }),
"Invalid origin tensor stride",
origin_coordinate_map.get_tensor_stride());
size_type const in_size = size();
size_type const out_size = origin_coordinate_map.size();
LOG_DEBUG("Generate origin_map with in NNZ:", in_size,
"out NNZ:", out_size);
ASSERT(in_size >= out_size, "Invalid out_coordinate_map");
std::vector<std::pair<index_type, index_type>> in_out(in_size);
// compute the chunk size per thread.
// There's a trade-off between the thread initialization overhead and the
// job sizes. If some jobs finish earlier than others due to imbalance in
// hash distribution, these threads will be idle.
size_t const in_map_num_elements = size();
size_t N = 2 * omp_get_max_threads();
size_t const stride = (in_map_num_elements + N - 1) / N;
N = (in_map_num_elements + stride - 1) / stride;
LOG_DEBUG("kernel map with", N, "chunks", stride, "stride");
coordinate_field_type const *const p_tfield = const_coordinate_data();
size_type num_used = 0;
#pragma omp parallel for
for (index_type n = 0; n < N; ++n) {
index_type curr_index_begin;
std::vector<coordinate_int_type> dst(m_coordinate_size);
std::for_each(dst.begin(), dst.end(), [](auto &i) { i = 0; });
// LOG_DEBUG(n, "th chunk contains",
// std::min((n + 1) * stride, in_map_num_elements) - n * stride,
// "elements");
for (size_t i = stride * n;
i < std::min((n + 1) * stride, in_map_num_elements); ++i) {
dst[0] = p_tfield[i * m_coordinate_size];
// LOG_DEBUG(i, "th row batch index", dst[0]);
const auto iter_origin = origin_coordinate_map.find(
coordinate<coordinate_int_type>(dst.data()));
ASSERT(iter_origin != origin_coordinate_map.cend(),
"Invalid origin_coordinate_map");
index_type origin_row_index = iter_origin->second;
#pragma omp atomic capture
{
curr_index_begin = num_used;
num_used += 1;
}
in_out[curr_index_begin] = std::make_pair(i, origin_row_index);
}
}
// #ifdef DEBUG
// LOG_DEBUG("Kernel map");
// for (auto iter : in_out) {
// std::cout << iter.first << ":" << iter.second << "\n";
// }
// #endif
// Decomposed kernel map
auto batch_indices = origin_coordinate_map.batch_indices();
return cpu_kernel_map(in_out, batch_indices);
}
inline size_type size() const noexcept { return m_size; }
std::string to_string() const {
Formatter o;
o << "CoordinateFieldMapCPU:" << size() << "x" << m_coordinate_size;
return o.str();
}
private:
using base_type::m_coordinate_size;
size_type m_size;
};
} // namespace minkowski
#endif // COORDINATE_MAP_CPU_HPP