instant-ngp build

7873319 over 3 years ago

10.2 kB

	/*
	* Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
	*
	* NVIDIA CORPORATION and its licensors retain all intellectual property
	* and proprietary rights in and to this software, related documentation
	* and any modifications thereto. Any use, reproduction, disclosure or
	* distribution of this software and related documentation without an express
	* license agreement from NVIDIA CORPORATION is strictly prohibited.
	*/

	/** @file triangle_octree.cuh
	* @author Thomas Müller, NVIDIA
	*/

	#pragma once

	#include <neural-graphics-primitives/triangle_bvh.cuh>
	#include <neural-graphics-primitives/thread_pool.h>

	#include <Eigen/Dense>

	#include <tiny-cuda-nn/gpu_memory.h>

	#include <stack>

	namespace std {
	template<>
	struct less<ngp::Vector4i16> {
	bool operator()(const ngp::Vector4i16& a, const ngp::Vector4i16& b) const {
	for(size_t i = 0; i < 4; ++i) {
	if (a[i] < b[i]) return true;
	if (a[i] > b[i]) return false;
	}
	return false;
	}
	};

	template <>
	struct hash<ngp::Vector4i16> {
	size_t operator()(const ngp::Vector4i16& x) const {
	return (size_t)x.x() * 73856093 + (size_t)x.y() * 19349663 + (size_t)x.z() * 83492791 + (size_t)x.w() * 25165843;
	}
	};
	}

	NGP_NAMESPACE_BEGIN

	struct TriangleOctreeNode {
	int children[8];
	Vector3i16 pos;
	uint8_t depth;
	};

	struct TriangleOctreeDualNode {
	uint32_t vertices[8];
	};

	inline void write_brick_voxel_positions(Eigen::Vector3f* dst, uint32_t B, float size, const Eigen::Vector3f& base_pos) {
	float rstep = size / (B - 1);
	for (uint32_t z = 0; z < B; ++z) {
	for (uint32_t y = 0; y < B; ++y) {
	for (uint32_t x = 0; x < B; ++x) {
	dst++ = base_pos + Eigen::Vector3f{x rstep, y * rstep, z * rstep};
	}
	}
	}
	}

	class TriangleOctree {
	public:
	std::vector<Eigen::Vector3f> build_brick_voxel_position_list(uint32_t B) const { // brick size B^3
	std::vector<Eigen::Vector3f> brick_pos;
	brick_pos.resize(m_dual_nodes.size() * B * B * B);
	float rstep = 1.f / (B - 1);
	write_brick_voxel_positions(brick_pos.data(), B, 1.f, Eigen::Vector3f::Zero());
	uint32_t last_level = 0;
	uint32_t prev_ni = 0, ni = 0;
	for (ni = 0; ni < (uint32_t)m_nodes.size(); ++ni) {
	auto const &n = m_nodes[ni];
	if (n.depth > last_level) {
	tlog::info() << (ni-prev_ni) << " bricks at level " << last_level << " with " << (ni-prev_ni)BB*B << " values.";
	last_level = n.depth;
	prev_ni = ni;
	}
	float child_size = std::scalbnf(1.f, -(int)n.depth - 1);
	Vector3i16 child_pos_base = n.pos * (uint16_t)2;
	for (int i = 0; i < 8; ++i) {
	int child_idx = n.children[i];
	if (child_idx < 0)
	continue;
	Vector3i16 child_pos = child_pos_base;
	if (i&1) ++child_pos.x();
	if (i&2) ++child_pos.y();
	if (i&4) ++child_pos.z();
	Eigen::Vector3f base_pos = child_pos.cast<float>() * child_size;
	write_brick_voxel_positions(brick_pos.data() + (child_idx * B * B * B), B, child_size, base_pos);
	}
	}
	tlog::info() << (ni-prev_ni) << " bricks at level " << last_level << " with " << (ni-prev_ni)BB*B << " values.";
	tlog::info() << (m_dual_nodes.size()-ni) << " bricks at level " << last_level+1 << " with " << (m_dual_nodes.size()-ni)BB*B << " values.";
	return brick_pos;
	}

	void build(const TriangleBvh& bvh, const std::vector<Triangle>& triangles, uint32_t max_depth) {
	m_nodes.clear();
	m_dual_nodes.clear();

	// Root
	m_nodes.emplace_back();
	m_dual_nodes.emplace_back();

	// Start counting at the 1 root node that we already have
	std::atomic<int> node_counter{1};
	int n_nodes = 0;

	ThreadPool pool;

	// Only generate nodes up to max_depth-1! The dual nodes will truly reach to the max depth
	for (uint8_t depth = 0; depth < max_depth-1; ++depth) {
	int n_pending_nodes_to_build = node_counter - n_nodes;

	// Reserve enough memory as if the next layer was dense. This allows parallelizing the following code
	if (depth < max_depth-2) {
	m_nodes.resize(node_counter + n_pending_nodes_to_build*8);
	} else {
	// Tighten the memory back down when we're done.
	m_nodes.resize(node_counter);
	}

	int last_n_nodes = n_nodes;
	n_nodes = node_counter;
	pool.parallelFor<int>(last_n_nodes, node_counter, [&](size_t parent_idx) {
	Vector3i16 child_pos_base = m_nodes[parent_idx].pos * (uint16_t)2;
	float size = std::scalbnf(1.0f, -depth-1);

	for (uint32_t i = 0; i < 8; ++i) {
	Vector3i16 child_pos = child_pos_base;
	if (i&1) ++child_pos.x();
	if (i&2) ++child_pos.y();
	if (i&4) ++child_pos.z();

	BoundingBox bb = {size * child_pos.cast<float>(), size * (child_pos + Vector3i16::Constant(1)).cast<float>()};

	if (!bvh.touches_triangle(bb, triangles.data())) {
	m_nodes[parent_idx].children[i] = -1;
	continue;
	}

	int node_idx = node_counter++;
	m_nodes[parent_idx].children[i] = node_idx;

	// Create regular nodes one layer less deep as the dual nodes
	if (depth < max_depth-2) {
	m_nodes[node_idx].pos = {(uint16_t)child_pos.x(), (uint16_t)child_pos.y(), (uint16_t)child_pos.z()};
	m_nodes[node_idx].depth = (uint8_t)(depth+1);
	}
	}
	});
	}

	m_dual_nodes.resize(node_counter);

	tlog::success() << "Built TriangleOctree: depth=" << max_depth << " nodes=" << m_nodes.size() << " dual_nodes=" << m_dual_nodes.size() << ". Populating dual nodes...";

	// TODO: find a fast lockfree hashmap implementation and parallelize the bottom for loop
	std::unordered_map<Vector4i16, uint32_t> coords;
	coords.reserve(m_dual_nodes.size()*8);
	m_n_vertices = 0;
	auto generate_dual_coords = [&](TriangleOctreeDualNode& dual_node, int depth, const Vector3i16 pos) {
	for (uint32_t i = 0; i < 8; ++i) {
	Vector4i16 coord = {(uint16_t)pos.x(), (uint16_t)pos.y(), (uint16_t)pos.z(), (uint16_t)depth};
	if (i&1) ++coord.x();
	if (i&2) ++coord.y();
	if (i&4) ++coord.z();

	auto p = coords.insert({coord, m_n_vertices});
	if (p.second) {
	++m_n_vertices;
	}

	dual_node.vertices[i] = p.first->second;
	}
	};

	generate_dual_coords(m_dual_nodes[0], 0, {0, 0, 0});
	for (auto& node : m_nodes) {
	for (uint32_t i = 0; i < 8; ++i) {
	auto child_idx = node.children[i];
	if (child_idx == -1) {
	continue;
	}

	Vector3i16 child_pos = node.pos * (uint16_t)2;
	if (i&1) ++child_pos.x();
	if (i&2) ++child_pos.y();
	if (i&4) ++child_pos.z();

	generate_dual_coords(m_dual_nodes[child_idx], node.depth+1, child_pos);
	}
	}

	m_depth = max_depth;

	m_nodes_gpu.resize_and_copy_from_host(m_nodes);
	m_dual_nodes_gpu.resize_and_copy_from_host(m_dual_nodes);
	}

	uint32_t depth() const {
	return m_depth;
	}

	uint32_t n_vertices() const {
	return m_n_vertices;
	}

	uint32_t n_nodes() const {
	return (uint32_t)m_nodes.size();
	}

	uint32_t n_dual_nodes() const {
	return (uint32_t)m_dual_nodes.size();
	}

	template <typename F>
	__device__ static uint8_t traverse(const TriangleOctreeNode* nodes, const TriangleOctreeDualNode* dual_nodes, int max_depth, Eigen::Vector3f pos, F fun) {
	int node_idx = 0;

	for (uint8_t depth = 0; true; ++depth) {
	fun(dual_nodes[node_idx], depth, pos);

	// Dual nodes are one layer deeper than regular nodes
	if (depth >= max_depth-1) {
	return depth+1;
	}

	// Traverse

	uint8_t child_in_node = 0;

	NGP_PRAGMA_UNROLL
	for (uint8_t i = 0; i < 3; ++i) {
	if (pos[i] >= 0.5f) {
	child_in_node \|= (1 << i);
	pos[i] = (pos[i] - 0.5f) * 2;
	} else {
	pos[i] *= 2;
	}
	}

	node_idx = nodes[node_idx].children[child_in_node];

	if (node_idx < 0) {
	return depth+1;
	}
	}
	return max_depth;
	}

	__device__ static bool contains(const TriangleOctreeNode* nodes, int max_depth, Eigen::Vector3f pos) {
	const TriangleOctreeNode* node = &nodes[0];

	for (uint8_t depth = 0; depth < max_depth-1; ++depth) {
	uint8_t child_in_node = 0;

	NGP_PRAGMA_UNROLL
	for (uint8_t i = 0; i < 3; ++i) {
	if (pos[i] >= 0.5f) {
	child_in_node \|= (1 << i);
	pos[i] = (pos[i] - 0.5f) * 2;
	} else {
	pos[i] *= 2;
	}
	}

	int child_idx = node->children[child_in_node];

	if (child_idx < 0) {
	return false;
	}

	node = &nodes[child_idx];
	}

	return true;
	}

	__device__ static float ray_intersect(const TriangleOctreeNode* nodes, int max_depth, const Eigen::Vector3f& ro, const Eigen::Vector3f& rd) {
	FixedStack<int, 64> query_stack;
	query_stack.push(0);

	static constexpr float MAX_DIST = 1000.f;

	float mint = MAX_DIST;

	// Ensure that closer children are checked last such that they rise to the top of the stack
	uint8_t reorder_mask = 0;
	if (rd.x() > 0) reorder_mask \|= 1;
	if (rd.y() > 0) reorder_mask \|= 2;
	if (rd.z() > 0) reorder_mask \|= 4;

	while (!query_stack.empty()) {
	int idx = query_stack.pop();

	const TriangleOctreeNode& node = nodes[idx];

	for (uint8_t aux = 0; aux < 8; ++aux) {
	uint8_t i = aux ^ reorder_mask;
	int child_idx = node.children[i];

	uint8_t depth = node.depth+1;
	// Intermediate leaf
	if (child_idx == -1) {
	continue;
	}

	Vector3i16 pos = node.pos * (uint16_t)2;
	if (i&1) ++pos.x();
	if (i&2) ++pos.y();
	if (i&4) ++pos.z();

	float size = scalbnf(1.0f, -depth);

	BoundingBox bb = {size * pos.cast<float>(), size * (pos + Vector3i16::Constant(1)).cast<float>()};
	Eigen::Vector2f t = bb.ray_intersect(ro, rd);
	if (t.y() >= 0 && t.y() < MAX_DIST && t.x() < mint) {
	// All children's children are gonna be leaves,
	// so we can cut to the chase and terminate here already.
	if (depth == max_depth-1) {
	if (t.x() >= 0) {
	mint = t.x();
	}
	} else {
	query_stack.push(child_idx);
	}
	}
	}
	}

	return mint;
	}

	const TriangleOctreeNode* nodes_gpu() const {
	return m_nodes_gpu.data();
	}

	const TriangleOctreeDualNode* dual_nodes_gpu() const {
	return m_dual_nodes_gpu.data();
	}

	private:
	std::vector<TriangleOctreeNode> m_nodes;
	std::vector<TriangleOctreeDualNode> m_dual_nodes;

	tcnn::GPUMemory<TriangleOctreeNode> m_nodes_gpu;
	tcnn::GPUMemory<TriangleOctreeDualNode> m_dual_nodes_gpu;

	uint32_t m_n_vertices = 0;
	uint32_t m_depth = 0;
	};

	NGP_NAMESPACE_END