paulpanwang's picture
Upload folder using huggingface_hub
476e0f0 verified
/*
* Copyright (C) 2023, Inria
* GRAPHDECO research group, https://team.inria.fr/graphdeco
* All rights reserved.
*
* This software is free for non-commercial, research and evaluation use
* under the terms of the LICENSE.md file.
*
* For inquiries contact george.drettakis@inria.fr
*/
#include "forward.h"
#include "auxiliary.h"
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
#include <iostream>
#include <stdio.h>
#include <cmath>
namespace cg = cooperative_groups;
// Forward method for converting the input spherical harmonics
// coefficients of each Gaussian to a simple RGB color.
__device__ glm::vec3 computeColorFromSH(int idx, int deg, int max_coeffs, const glm::vec3* means, glm::vec3 campos, const float* shs, bool* clamped)
{
// The implementation is loosely based on code for
// "Differentiable Point-Based Radiance Fields for
// Efficient View Synthesis" by Zhang et al. (2022)
glm::vec3 pos = means[idx];
glm::vec3 dir = pos - campos;
dir = dir / glm::length(dir);
glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs;
glm::vec3 result = SH_C0 * sh[0];
if (deg > 0)
{
float x = dir.x;
float y = dir.y;
float z = dir.z;
result = result - SH_C1 * y * sh[1] + SH_C1 * z * sh[2] - SH_C1 * x * sh[3];
if (deg > 1)
{
float xx = x * x, yy = y * y, zz = z * z;
float xy = x * y, yz = y * z, xz = x * z;
result = result +
SH_C2[0] * xy * sh[4] +
SH_C2[1] * yz * sh[5] +
SH_C2[2] * (2.0f * zz - xx - yy) * sh[6] +
SH_C2[3] * xz * sh[7] +
SH_C2[4] * (xx - yy) * sh[8];
if (deg > 2)
{
result = result +
SH_C3[0] * y * (3.0f * xx - yy) * sh[9] +
SH_C3[1] * xy * z * sh[10] +
SH_C3[2] * y * (4.0f * zz - xx - yy) * sh[11] +
SH_C3[3] * z * (2.0f * zz - 3.0f * xx - 3.0f * yy) * sh[12] +
SH_C3[4] * x * (4.0f * zz - xx - yy) * sh[13] +
SH_C3[5] * z * (xx - yy) * sh[14] +
SH_C3[6] * x * (xx - 3.0f * yy) * sh[15];
}
}
}
result += 0.5f;
// RGB colors are clamped to positive values. If values are
// clamped, we need to keep track of this for the backward pass.
clamped[3 * idx + 0] = (result.x < 0);
clamped[3 * idx + 1] = (result.y < 0);
clamped[3 * idx + 2] = (result.z < 0);
return glm::max(result, 0.0f);
}
// Forward version of 2D covariance matrix computation
template<bool INTE = false>
__device__ bool computeCov2D(const float3& mean, float focal_x, float focal_y, float tan_fovx, float tan_fovy, float kernel_size, const float* cov3D, const float* viewmatrix,
float* cov2D, float* camera_plane, float3* output_normal, float2* ray_plane, float& coef, float* invraycov3Ds = nullptr)
{
// The following models the steps outlined by equations 29
// and 31 in "EWA Splatting" (Zwicker et al., 2002).
// Additionally considers aspect / scaling of viewport.
// Transposes used to account for row-/column-major conventions.
float3 t = transformPoint4x3(mean, viewmatrix);
const float limx = 1.3f * tan_fovx;
const float limy = 1.3f * tan_fovy;
float txtz = t.x / t.z;
float tytz = t.y / t.z;
t.x = min(limx, max(-limx, txtz)) * t.z;
t.y = min(limy, max(-limy, tytz)) * t.z;
txtz = t.x / t.z;
tytz = t.y / t.z;
glm::mat3 J = glm::mat3(
focal_x / t.z, 0.0f, -(focal_x * t.x) / (t.z * t.z),
0.0f, focal_y / t.z, -(focal_y * t.y) / (t.z * t.z),
0, 0, 0);
glm::mat3 W = glm::mat3(
viewmatrix[0], viewmatrix[4], viewmatrix[8],
viewmatrix[1], viewmatrix[5], viewmatrix[9],
viewmatrix[2], viewmatrix[6], viewmatrix[10]);
glm::mat3 T = W * J;
glm::mat3 Vrk = glm::mat3(
cov3D[0], cov3D[1], cov3D[2],
cov3D[1], cov3D[3], cov3D[4],
cov3D[2], cov3D[4], cov3D[5]);
glm::mat3 cov = glm::transpose(T) * glm::transpose(Vrk) * T;
// output[0] = { float(cov[0][0]), float(cov[0][1]), float(cov[1][1]) };
cov2D[0] = float(cov[0][0] + kernel_size);
cov2D[1] = float(cov[0][1]);
cov2D[2] = float(cov[1][1] + kernel_size);
const float det_0 = max(1e-6, cov[0][0] * cov[1][1] - cov[0][1] * cov[0][1]);
const float det_1 = max(1e-6, (cov[0][0] + kernel_size) * (cov[1][1] + kernel_size) - cov[0][1] * cov[0][1]);
coef = sqrt(det_0 / (det_1+1e-6) + 1e-6);
if (det_0 <= 1e-6 || det_1 <= 1e-6){
coef = 0.0f;
}
// glm::mat3 testm = glm::mat3{
// 1,2,3,
// 4,5,6,
// 7,8,9,
// };
// glm::vec3 testv = {1,1,1};
// glm::vec3 resultm = testm * testv;
// printf("%f %f %f\n", resultm[0], resultm[1],resultm[2]); 12.000000 15.000000 18.000000
glm::mat3 Vrk_eigen_vector;
glm::vec3 Vrk_eigen_value;
int D = glm_modification::findEigenvaluesSymReal(Vrk,Vrk_eigen_value,Vrk_eigen_vector);
unsigned int min_id = Vrk_eigen_value[0]>Vrk_eigen_value[1]? (Vrk_eigen_value[1]>Vrk_eigen_value[2]?2:1):(Vrk_eigen_value[0]>Vrk_eigen_value[2]?2:0);
glm::mat3 Vrk_inv;
bool well_conditioned = Vrk_eigen_value[min_id]>0.00000001;
glm::vec3 eigenvector_min;
if(well_conditioned)
{
glm::mat3 diag = glm::mat3( 1/Vrk_eigen_value[0], 0, 0,
0, 1/Vrk_eigen_value[1], 0,
0, 0, 1/Vrk_eigen_value[2] );
Vrk_inv = Vrk_eigen_vector * diag * glm::transpose(Vrk_eigen_vector);
}
else
{
eigenvector_min = Vrk_eigen_vector[min_id];
Vrk_inv = glm::outerProduct(eigenvector_min,eigenvector_min);
}
glm::mat3 cov_cam_inv = glm::transpose(W) * Vrk_inv * W;
glm::vec3 uvh = {txtz, tytz, 1};
glm::vec3 uvh_m = cov_cam_inv * uvh;
glm::vec3 uvh_mn = glm::normalize(uvh_m);
if(isnan(uvh_mn.x)|| D==0)
{
for(int ch = 0; ch < 6; ch++)
camera_plane[ch] = 0;
*output_normal = {0,0,0};
*ray_plane = {0,0};
}
else
{
float u2 = txtz * txtz;
float v2 = tytz * tytz;
float uv = txtz * tytz;
float l = sqrt(t.x*t.x+t.y*t.y+t.z*t.z);
glm::mat3 nJ = glm::mat3(
1 / t.z, 0.0f, -(t.x) / (t.z * t.z),
0.0f, 1 / t.z, -(t.y) / (t.z * t.z),
t.x/l, t.y/l, t.z/l);
glm::mat3 nJ_inv = glm::mat3(
v2 + 1, -uv, 0,
-uv, u2 + 1, 0,
-txtz, -tytz, 0
);
if constexpr (INTE)
{
glm::mat3 inv_cov_ray;
if(well_conditioned)
{
float ltz = u2+v2+1;
glm::mat3 nJ_inv_full = t.z/(u2+v2+1) * \
glm::mat3(
v2 + 1, -uv, txtz/l*ltz,
-uv, u2 + 1, tytz/l*ltz,
-txtz, -tytz, 1/l*ltz);
glm::mat3 T2 = W * glm::transpose(nJ_inv_full);
inv_cov_ray = glm::transpose(T2) * Vrk_inv * T2;
}
else
{
glm::mat3 T2 = W * nJ;
glm::mat3 cov_ray = glm::transpose(T2) * Vrk_inv * T2;
glm::mat3 cov_eigen_vector;
glm::vec3 cov_eigen_value;
glm_modification::findEigenvaluesSymReal(cov_ray,cov_eigen_value,cov_eigen_vector);
unsigned int min_id = cov_eigen_value[0]>cov_eigen_value[1]? (cov_eigen_value[1]>cov_eigen_value[2]?2:1):(cov_eigen_value[0]>cov_eigen_value[2]?2:0);
float lambda1 = cov_eigen_value[(min_id+1)%3];
float lambda2 = cov_eigen_value[(min_id+2)%3];
float lambda3 = cov_eigen_value[min_id];
glm::mat3 new_cov_eigen_vector = glm::mat3();
new_cov_eigen_vector[0] = cov_eigen_vector[(min_id+1)%3];
new_cov_eigen_vector[1] = cov_eigen_vector[(min_id+2)%3];
new_cov_eigen_vector[2] = cov_eigen_vector[min_id];
glm::vec3 r3 = glm::vec3(new_cov_eigen_vector[0][2],new_cov_eigen_vector[1][2],new_cov_eigen_vector[2][2]);
glm::mat3 cov2d = glm::mat3(
1/lambda1,0,-r3[0]/r3[2]/lambda1,
0,1/lambda2,-r3[1]/r3[2]/lambda2,
-r3[0]/r3[2]/lambda1,-r3[1]/r3[2]/lambda2,0
);
glm::mat3 inv_cov_ray = new_cov_eigen_vector * cov2d * glm::transpose(new_cov_eigen_vector);
}
glm::mat3 scale = glm::mat3(1/focal_x,0,0,
0, 1/focal_y,0,
0,0,1);
inv_cov_ray = scale * inv_cov_ray * scale;
invraycov3Ds[0] = inv_cov_ray[0][0];
invraycov3Ds[1] = inv_cov_ray[0][1];
invraycov3Ds[2] = inv_cov_ray[0][2];
invraycov3Ds[3] = inv_cov_ray[1][1];
invraycov3Ds[4] = inv_cov_ray[1][2];
invraycov3Ds[5] = inv_cov_ray[2][2];
}
float vbn = glm::dot(uvh_mn, uvh);
float factor_normal = l / (u2+v2+1);
glm::vec3 plane = nJ_inv * (uvh_mn/max(vbn,0.0000001f));
float nl = u2+v2+1;
glm::vec2 camera_plane_x = {(-(v2 + 1)*t.z+plane[0]*t.x)/nl/focal_x, (uv*t.z+plane[1]*t.x)/nl/focal_y};
glm::vec2 camera_plane_y = {(uv*t.z+plane[0]*t.y)/nl/focal_x, (-(u2 + 1)*t.z+plane[1]*t.y)/nl/focal_y};
glm::vec2 camera_plane_z = {(t.x+plane[0]*t.z)/nl/focal_x, (t.y+plane[1]*t.z)/nl/focal_y};
*ray_plane = {plane[0]*l/nl/focal_x, plane[1]*l/nl/focal_y};
camera_plane[0] = camera_plane_x.x;
camera_plane[1] = camera_plane_x.y;
camera_plane[2] = camera_plane_y.x;
camera_plane[3] = camera_plane_y.y;
camera_plane[4] = camera_plane_z.x;
camera_plane[5] = camera_plane_z.y;
glm::vec3 ray_normal_vector = {-plane[0]*factor_normal, -plane[1]*factor_normal, -1};
glm::vec3 cam_normal_vector = nJ * ray_normal_vector;
glm::vec3 normal_vector = glm::normalize(cam_normal_vector);
*output_normal = {normal_vector.x, normal_vector.y, normal_vector.z};
}
return well_conditioned;
}
// Forward method for converting scale and rotation properties of each
// Gaussian to a 3D covariance matrix in world space. Also takes care
// of quaternion normalization.
__device__ void computeCov3D(const glm::vec3 scale, float mod, const glm::vec4 rot, float* cov3D)
{
// Create scaling matrix
glm::mat3 S = glm::mat3(1.0f);
S[0][0] = mod * scale.x;
S[1][1] = mod * scale.y;
S[2][2] = mod * scale.z;
// Normalize quaternion to get valid rotation
glm::vec4 q = rot;// / glm::length(rot);
float r = q.x;
float x = q.y;
float y = q.z;
float z = q.w;
// Compute rotation matrix from quaternion
glm::mat3 R = glm::mat3(
1.f - 2.f * (y * y + z * z), 2.f * (x * y - r * z), 2.f * (x * z + r * y),
2.f * (x * y + r * z), 1.f - 2.f * (x * x + z * z), 2.f * (y * z - r * x),
2.f * (x * z - r * y), 2.f * (y * z + r * x), 1.f - 2.f * (x * x + y * y)
);
glm::mat3 M = S * R;
// Compute 3D world covariance matrix Sigma
glm::mat3 Sigma = glm::transpose(M) * M;
// Covariance is symmetric, only store upper right
cov3D[0] = Sigma[0][0];
cov3D[1] = Sigma[0][1];
cov3D[2] = Sigma[0][2];
cov3D[3] = Sigma[1][1];
cov3D[4] = Sigma[1][2];
cov3D[5] = Sigma[2][2];
}
// Perform initial steps for each Gaussian prior to rasterization.
template<int C, bool INTE = false>
__global__ void preprocessCUDA(int P, int D, int M,
const float* orig_points,
const glm::vec3* scales,
const float scale_modifier,
const glm::vec4* rotations,
const float* opacities,
const float* shs,
bool* clamped,
const float* cov3D_precomp,
const float* colors_precomp,
const float* viewmatrix,
const float* projmatrix,
const glm::vec3* cam_pos,
const int W, int H,
const float tan_fovx, float tan_fovy,
const float focal_x, float focal_y,
const float kernel_size,
int* radii,
float2* points_xy_image,
float3* view_points,
float* depths,
float* camera_planes,
float2* ray_planes,
float3* normals,
float* cov3Ds,
float* rgb,
float4* conic_opacity,
const dim3 grid,
uint32_t* tiles_touched,
bool prefiltered,
float* invraycov3Ds,
float* ts,
bool* conditions)
{
auto idx = cg::this_grid().thread_rank();
if (idx >= P)
return;
// Initialize radius and touched tiles to 0. If this isn't changed,
// this Gaussian will not be processed further.
radii[idx] = 0;
tiles_touched[idx] = 0;
// Perform near culling, quit if outside.
float3 p_view;
if (!in_frustum(idx, orig_points, viewmatrix, projmatrix, prefiltered, p_view))
return;
// Transform point by projecting
float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] };
float4 p_hom = transformPoint4x4(p_orig, projmatrix);
float p_w = 1.0f / (p_hom.w + 0.0000001f);
float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w };
// If 3D covariance matrix is precomputed, use it, otherwise compute
// from scaling and rotation parameters.
const float* cov3D;
if (cov3D_precomp != nullptr)
{
cov3D = cov3D_precomp + idx * 6;
}
else
{
computeCov3D(scales[idx], scale_modifier, rotations[idx], cov3Ds + idx * 6);
cov3D = cov3Ds + idx * 6;
}
// Compute 2D screen-space covariance matrix
float cov2D[3];
float ceof;
bool condition = computeCov2D<INTE>(p_orig, focal_x, focal_y, tan_fovx, tan_fovy, kernel_size, cov3D, viewmatrix, cov2D, camera_planes + idx * 6, normals + idx, ray_planes + idx, ceof, invraycov3Ds + idx * 6);
if constexpr (INTE)
{
conditions[idx] = condition;
}
ts[idx] = sqrt(p_view.x*p_view.x+p_view.y*p_view.y+p_view.z*p_view.z);
const float3 cov = {cov2D[0], cov2D[1], cov2D[2]};
// Invert covariance (EWA algorithm)
float det = (cov.x * cov.z - cov.y * cov.y);
if (det == 0.0f)
return;
float det_inv = 1.f / det;
float3 conic = { cov.z * det_inv, -cov.y * det_inv, cov.x * det_inv };
// Compute extent in screen space (by finding eigenvalues of
// 2D covariance matrix). Use extent to compute a bounding rectangle
// of screen-space tiles that this Gaussian overlaps with. Quit if
// rectangle covers 0 tiles.
float mid = 0.5f * (cov.x + cov.z);
float lambda1 = mid + sqrt(max(0.1f, mid * mid - det));
float lambda2 = mid - sqrt(max(0.1f, mid * mid - det));
float my_radius = ceil(3.f * sqrt(max(lambda1, lambda2)));
float2 point_image = { ndc2Pix(p_proj.x, W), ndc2Pix(p_proj.y, H) };
uint2 rect_min, rect_max;
getRect(point_image, my_radius, rect_min, rect_max, grid);
if ((rect_max.x - rect_min.x) * (rect_max.y - rect_min.y) == 0)
return;
// If colors have been precomputed, use them, otherwise convert
// spherical harmonics coefficients to RGB color.
if (colors_precomp == nullptr)
{
glm::vec3 result = computeColorFromSH(idx, D, M, (glm::vec3*)orig_points, *cam_pos, shs, clamped);
rgb[idx * C + 0] = result.x;
rgb[idx * C + 1] = result.y;
rgb[idx * C + 2] = result.z;
}
// Store some useful helper data for the next steps.
depths[idx] = p_view.z;
view_points[idx] = p_view;
radii[idx] = my_radius;
points_xy_image[idx] = point_image;
// Inverse 2D covariance and opacity neatly pack into one float4
conic_opacity[idx] = { conic.x, conic.y, conic.z, opacities[idx] * ceof};
tiles_touched[idx] = (rect_max.y - rect_min.y) * (rect_max.x - rect_min.x);
}
// Main rasterization method. Collaboratively works on one tile per
// block, each thread treats one pixel. Alternates between fetching
// and rasterizing data.
template <uint32_t CHANNELS, bool COORD, bool DEPTH, bool NORMAL>
__global__ void __launch_bounds__(BLOCK_X * BLOCK_Y)
renderCUDA(
const uint2* __restrict__ ranges,
const uint32_t* __restrict__ point_list,
int W, int H,
const float* __restrict__ view_points,
const float2* __restrict__ points_xy_image,
const float* __restrict__ features,
const float* __restrict__ ts,
const float* __restrict__ camera_planes,
const float2* __restrict__ ray_planes,
const float3* __restrict__ normals,
const float4* __restrict__ conic_opacity,
const float focal_x,
const float focal_y,
float* __restrict__ out_alpha,
uint32_t* __restrict__ n_contrib,
const float* __restrict__ bg_color,
float* __restrict__ out_color,
float* __restrict__ out_coord,
float* __restrict__ out_mcoord,
float* __restrict__ out_normal,
float* __restrict__ out_depth,
float* __restrict__ out_mdepth,
float* __restrict__ accum_coord,
float* __restrict__ accum_depth,
float* __restrict__ normal_length
)
{
// Identify current tile and associated min/max pixel range.
auto block = cg::this_thread_block();
uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X;
uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y };
uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) };
uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y };
uint32_t pix_id = W * pix.y + pix.x;
float2 pixf = { (float)pix.x, (float)pix.y };
float2 pixnf = {(pixf.x-W/2.f)/focal_x,(pixf.y-H/2.f)/focal_y};
float ln = sqrt(pixnf.x*pixnf.x+pixnf.y*pixnf.y+1);
constexpr bool GEO = DEPTH || COORD || NORMAL;
// Check if this thread is associated with a valid pixel or outside.
bool inside = pix.x < W&& pix.y < H;
// Done threads can help with fetching, but don't rasterize
bool done = !inside;
// Load start/end range of IDs to process in bit sorted list.
uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x];
const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE);
int toDo = range.y - range.x;
// Allocate storage for batches of collectively fetched data.
__shared__ int collected_id[BLOCK_SIZE];
__shared__ float2 collected_xy[BLOCK_SIZE];
__shared__ float collected_feature[BLOCK_SIZE * CHANNELS];
__shared__ float collected_camera_plane[BLOCK_SIZE * 6];
__shared__ float collected_mean3d[BLOCK_SIZE * 3];
__shared__ float4 collected_conic_opacity[BLOCK_SIZE];
__shared__ float collected_ts[BLOCK_SIZE];
__shared__ float2 collected_ray_planes[BLOCK_SIZE];
__shared__ float3 collected_normals[BLOCK_SIZE];
// Initialize helper variables
float T = 1.0f;
uint32_t contributor = 0;
uint32_t last_contributor = 0;
uint32_t max_contributor = -1;
float C[CHANNELS] = { 0 };
float weight = 0;
float Coord[3] = { 0 };
float mCoord[3] = { 0 };
float Depth = 0;
float mDepth = 0;
float Normal[3] = {0};
float last_depth = 0;
float last_weight = 0;
// Iterate over batches until all done or range is complete
for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE)
{
// End if entire block votes that it is done rasterizing
int num_done = __syncthreads_count(done);
if (num_done == BLOCK_SIZE)
break;
// Collectively fetch per-Gaussian data from global to shared
int progress = i * BLOCK_SIZE + block.thread_rank();
if (range.x + progress < range.y)
{
int coll_id = point_list[range.x + progress];
collected_id[block.thread_rank()] = coll_id;
collected_xy[block.thread_rank()] = points_xy_image[coll_id];
collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id];
for(int ch = 0; ch < CHANNELS; ch++)
collected_feature[ch * BLOCK_SIZE + block.thread_rank()] = features[coll_id * CHANNELS + ch];
if constexpr (COORD)
{
for(int ch = 0; ch < 6; ch++)
collected_camera_plane[ch * BLOCK_SIZE + block.thread_rank()] = camera_planes[coll_id * 6 + ch];
for(int ch = 0; ch < 3; ch++)
collected_mean3d[ch * BLOCK_SIZE + block.thread_rank()] = view_points[coll_id * 3 + ch];
}
if constexpr (DEPTH)
{
collected_ts[block.thread_rank()] = ts[coll_id];
collected_ray_planes[block.thread_rank()] = ray_planes[coll_id];
}
if constexpr (NORMAL)
{
collected_normals[block.thread_rank()] = normals[coll_id];
}
}
block.sync();
// Iterate over current batch
for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++)
{
// Keep track of current position in range
contributor++;
// Resample using conic matrix (cf. "Surface
// Splatting" by Zwicker et al., 2001)
float2 xy = collected_xy[j];
float2 d = { xy.x - pixf.x, xy.y - pixf.y };
float4 con_o = collected_conic_opacity[j];
float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y;
if (power > 0.0f){
continue;
}
// Eq. (2) from 3D Gaussian splatting paper.
// Obtain alpha by multiplying with Gaussian opacity
// and its exponential falloff from mean.
// Avoid numerical instabilities (see paper appendix).
float alpha = min(0.99f, con_o.w * exp(power));
if (alpha < 1.0f / 255.0f)
continue;
float test_T = T * (1 - alpha);
if (test_T < 0.0001f)
{
done = true;
continue;
}
const float aT = alpha * T;
// Eq. (3) from 3D Gaussian splatting paper.
for (int ch = 0; ch < CHANNELS; ch++)
C[ch] += collected_feature[j + BLOCK_SIZE * ch] * aT;
bool before_median = T > 0.5;
if constexpr (COORD)
{
float2 camera_plane0 = {collected_camera_plane[j], collected_camera_plane[j + BLOCK_SIZE]};
float2 camera_plane1 = {collected_camera_plane[j + BLOCK_SIZE * 2], collected_camera_plane[j + BLOCK_SIZE * 3]};
float2 camera_plane2 = {collected_camera_plane[j + BLOCK_SIZE * 4], collected_camera_plane[j + BLOCK_SIZE * 5]};
float coord[3] = {collected_mean3d[j] + camera_plane0.x * d.x + camera_plane0.y * d.y,
collected_mean3d[j + BLOCK_SIZE] + camera_plane1.x * d.x + camera_plane1.y * d.y,
collected_mean3d[j + BLOCK_SIZE * 2] + camera_plane2.x * d.x + camera_plane2.y * d.y};
for(int ch = 0; ch < 3; ch++)
Coord[ch] += coord[ch] * aT;
if (before_median){
for(int ch = 0; ch < 3; ch++)
mCoord[ch] = coord[ch];
}
}
if constexpr (DEPTH)
{
float t_center = collected_ts[j];
float2 ray_plane = collected_ray_planes[j];
float t = t_center + (ray_plane.x * d.x + ray_plane.y * d.y);
// float depth = t/ln;
Depth += t * aT;
if (before_median) mDepth = t;
}
if constexpr (NORMAL)
{
Normal[0] += collected_normals[j].x * aT;
Normal[1] += collected_normals[j].y * aT;
Normal[2] += collected_normals[j].z * aT;
}
if constexpr (GEO)
{
if (before_median)
max_contributor = contributor;
}
weight += aT;
T = test_T;
// Keep track of last range entry to update this
// pixel.
last_contributor = contributor;
}
}
// All threads that treat valid pixel write out their final
// rendering data to the frame and auxiliary buffers.
if (inside)
{
n_contrib[pix_id] = last_contributor;
n_contrib[pix_id + H * W] = max_contributor;
for (int ch = 0; ch < CHANNELS; ch++)
out_color[ch * H * W + pix_id] = C[ch] + T * bg_color[ch];
out_alpha[pix_id] = weight; //1 - T;
if constexpr (COORD)
{
if(last_contributor)
{
for (int ch = 0; ch < 3; ch++)
{
out_coord[ch * H * W + pix_id] = Coord[ch] / weight;
}
}
else
{
for (int ch = 0; ch < 3; ch++)
out_coord[ch * H * W + pix_id] = 0;
}
for (int ch = 0; ch < 3; ch++)
{
accum_coord[ch * H * W + pix_id] = Coord[ch];
out_mcoord[ch * H * W + pix_id] = mCoord[ch];
}
}
if constexpr (DEPTH)
{
float depth_ln = Depth/ln;
accum_depth[pix_id] = depth_ln;
if(last_contributor)
{
out_depth[pix_id] = depth_ln/weight;
}
else
{
out_depth[pix_id] = 0;
}
out_mdepth[pix_id] = mDepth/ln;
}
if constexpr (NORMAL)
{
if(last_contributor)
{
float len_normal = sqrt(Normal[0]*Normal[0]+Normal[1]*Normal[1]+Normal[2]*Normal[2]);
normal_length[pix_id] = len_normal;
len_normal = max(len_normal, NORMALIZE_EPS);
for (int ch = 0; ch < 3; ch++)
out_normal[ch * H * W + pix_id] = Normal[ch]/len_normal;
}
else
{
normal_length[pix_id] = 1;
for (int ch = 0; ch < 3; ch++)
out_normal[ch * H * W + pix_id] = 0;
}
}
}
}
// the Bool inputs can be replaced by an enumeration variable for different functions.
void FORWARD::render(
const dim3 grid, dim3 block,
const uint2* ranges,
const uint32_t* point_list,
int W, int H,
const float* view_points,
const float2* means2D,
const float* colors,
const float* ts,
const float* camera_planes,
const float2* ray_planes,
const float3* normals,
const float4* conic_opacity,
const float focal_x, float focal_y,
float* out_alpha,
uint32_t* n_contrib,
const float* bg_color,
float* out_color,
float* out_coord,
float* out_mcoord,
float* out_normal,
float* out_depth,
float* out_mdepth,
float* accum_coord,
float* accum_depth,
float* normal_length,
bool require_coord,
bool require_depth)
{
#define RENDER_CUDA_CALL(template_coord, template_depth, template_normal) \
renderCUDA<NUM_CHANNELS, template_coord, template_depth, template_normal> <<<grid, block>>> ( \
ranges, point_list, W, H, view_points, means2D, colors, ts, camera_planes, ray_planes, \
normals, conic_opacity, focal_x, focal_y, out_alpha, n_contrib, bg_color, out_color, \
out_coord, out_mcoord, out_normal, out_depth, out_mdepth, \
accum_coord, accum_depth, normal_length)
if (require_coord && require_depth)
RENDER_CUDA_CALL(true, true, true);
else if (require_coord && !require_depth)
RENDER_CUDA_CALL(true, false, true);
else if(!require_coord && require_depth)
RENDER_CUDA_CALL(false, true, true);
else
RENDER_CUDA_CALL(false, false, false);
#undef RENDER_CUDA_CALL
}
void FORWARD::preprocess(int P, int D, int M,
const float* means3D,
const glm::vec3* scales,
const float scale_modifier,
const glm::vec4* rotations,
const float* opacities,
const float* shs,
bool* clamped,
const float* cov3D_precomp,
const float* colors_precomp,
const float* viewmatrix,
const float* projmatrix,
const glm::vec3* cam_pos,
const int W, int H,
const float focal_x, float focal_y,
const float tan_fovx, float tan_fovy,
const float kernel_size,
int* radii,
float2* means2D,
float3* view_points,
float* depths,
float* camera_planes,
float2* ray_planes,
float* ts,
float3* normals,
float* cov3Ds,
float* rgb,
float4* conic_opacity,
const dim3 grid,
uint32_t* tiles_touched,
bool prefiltered,
bool integrate,
float* invraycov3Ds,
bool* condition)
{
if(integrate)
preprocessCUDA<NUM_CHANNELS, true> << <(P + 255) / 256, 256 >> > (
P, D, M,
means3D,
scales,
scale_modifier,
rotations,
opacities,
shs,
clamped,
cov3D_precomp,
colors_precomp,
viewmatrix,
projmatrix,
cam_pos,
W, H,
tan_fovx, tan_fovy,
focal_x, focal_y,
kernel_size,
radii,
means2D,
view_points,
depths,
camera_planes,
ray_planes,
normals,
cov3Ds,
rgb,
conic_opacity,
grid,
tiles_touched,
prefiltered,
invraycov3Ds,
ts,
condition
);
else
preprocessCUDA<NUM_CHANNELS, false> << <(P + 255) / 256, 256 >> > (
P, D, M,
means3D,
scales,
scale_modifier,
rotations,
opacities,
shs,
clamped,
cov3D_precomp,
colors_precomp,
viewmatrix,
projmatrix,
cam_pos,
W, H,
tan_fovx, tan_fovy,
focal_x, focal_y,
kernel_size,
radii,
means2D,
view_points,
depths,
camera_planes,
ray_planes,
normals,
cov3Ds,
rgb,
conic_opacity,
grid,
tiles_touched,
prefiltered,
invraycov3Ds,
ts,
condition
);
}
//follow code is adopted from GOF for marching tetrahedra https://github.com/autonomousvision/gaussian-opacity-fields
// Perform initial steps for each Gaussian prior to rasterization.
template<int C>
__global__ void preprocessPointsCUDA(int P, int D, int M,
const float* points3D,
const float* viewmatrix,
const float* projmatrix,
const glm::vec3* cam_pos,
const int W, int H,
const float tan_fovx, float tan_fovy,
const float focal_x, float focal_y,
float2* points2D,
float* depths,
const dim3 grid,
uint32_t* tiles_touched,
bool prefiltered)
{
auto idx = cg::this_grid().thread_rank();
if (idx >= P)
return;
// Initialize radius and touched tiles to 0. If this isn't changed,
// this Gaussian will not be processed further.
tiles_touched[idx] = 0;
// Perform near culling, quit if outside.
float3 p_view;
if (!in_frustum(idx, points3D, viewmatrix, projmatrix, prefiltered, p_view))
return;
// Transform point by projecting
float3 p_orig = { points3D[3 * idx], points3D[3 * idx + 1], points3D[3 * idx + 2] };
float4 p_hom = transformPoint4x4(p_orig, projmatrix);
float p_w = 1.0f / (p_hom.w + 0.0000001f);
float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w };
float2 point_image = {focal_x * p_view.x / (p_view.z + 0.0000001f) + W/2., focal_y * p_view.y / (p_view.z + 0.0000001f) + H/2.};
// If the point is outside the image, quit.
if (point_image.x < 0 || point_image.x >= W || point_image.y < 0 || point_image.y >= H)
return;
// Store some useful helper data for the next steps.
depths[idx] = sqrt(p_view.x*p_view.x+p_view.y*p_view.y+p_view.z*p_view.z);
points2D[idx] = point_image;
tiles_touched[idx] = 1;
}
void FORWARD::preprocess_points(int PN, int D, int M,
const float* points3D,
const float* viewmatrix,
const float* projmatrix,
const glm::vec3* cam_pos,
const int W, int H,
const float focal_x, float focal_y,
const float tan_fovx, float tan_fovy,
float2* points2D,
float* depths,
const dim3 grid,
uint32_t* tiles_touched,
bool prefiltered)
{
preprocessPointsCUDA<NUM_CHANNELS> << <(PN + 255) / 256, 256 >> > (
PN, D, M,
points3D,
viewmatrix,
projmatrix,
cam_pos,
W, H,
tan_fovx, tan_fovy,
focal_x, focal_y,
points2D,
depths,
grid,
tiles_touched,
prefiltered
);
}
// Main rasterization method. Collaboratively works on one tile per
// block, each thread treats one pixel. Alternates between fetching
// and rasterizing data.
template <uint32_t CHANNELS>
__global__ void __launch_bounds__(BLOCK_X * BLOCK_Y)
integrateCUDA(
const uint2* __restrict__ gaussian_ranges,
const uint2* __restrict__ point_ranges,
const uint32_t* __restrict__ gaussian_list,
const uint32_t* __restrict__ point_list,
int W, int H,
float focal_x, float focal_y,
const float2* __restrict__ subpixel_offset,
const float2* __restrict__ points2D,
const float2* __restrict__ gaussians2D,
const float* __restrict__ features,
const float* __restrict__ depths_plane,
const float2* __restrict__ ray_planes,
const float* __restrict__ cov3Ds,
const float* viewmatrix,
const float3* __restrict__ points3D,
const float3* __restrict__ gaussians3D,
const float3* __restrict__ scales,
const float* __restrict__ invraycov3Ds,
const float* __restrict__ point_depths,
const float* __restrict__ gaussian_depths,
const float4* __restrict__ conic_opacity,
const bool* __restrict__ condition,
float* __restrict__ final_T,
uint32_t* __restrict__ n_contrib,
// float* __restrict__ center_depth,
// float4* __restrict__ point_alphas,
const float* __restrict__ bg_color,
float* __restrict__ out_color,
float* __restrict__ out_alpha_integrated,
float* __restrict__ out_color_integrated,
float* __restrict__ out_coordinate2d,
float* __restrict__ out_sdf)
{
// Identify current tile and associated min/max pixel range.
auto block = cg::this_thread_block();
uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X;
uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y };
uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) };
uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y };
uint32_t pix_id = W * pix.y + pix.x;
float2 pixf = { (float)pix.x + 0.5f, (float)pix.y + 0.5f}; // TODO plus 0.5
// Check if this thread is associated with a valid pixel or outside.
bool inside = pix.x < W&& pix.y < H;
// Done threads can help with fetching, but don't rasterize
bool done = !inside;
const float depth_input = inside ? subpixel_offset[pix_id].x : 0.0f;
// Load start/end range of IDs to process in bit sorted list.
uint2 range = gaussian_ranges[block.group_index().y * horizontal_blocks + block.group_index().x];
const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE);
int toDo = range.y - range.x;
uint2 p_range = point_ranges[block.group_index().y * horizontal_blocks + block.group_index().x];
const int p_rounds = ((p_range.y - p_range.x + BLOCK_SIZE - 1) / BLOCK_SIZE);
int p_toDo = p_range.y - p_range.x;
// Allocate storage for batches of collectively fetched data.
__shared__ int collected_id[BLOCK_SIZE];
__shared__ float2 collected_xy[BLOCK_SIZE];
__shared__ float4 collected_conic_opacity[BLOCK_SIZE]; // only need opacity
__shared__ float collected_depth[BLOCK_SIZE];
__shared__ float2 collected_depth_plane[BLOCK_SIZE];
__shared__ float3 collected_scale[BLOCK_SIZE];
__shared__ float collected_invraycov[BLOCK_SIZE*6];
__shared__ bool collected_condition[BLOCK_SIZE];
// Initialize helper variables
float T = 1.0f;
uint32_t contributor = 0;
uint32_t last_contributor = 0;
float C[CHANNELS*2+2] = { 0 };
float mid_depth_center = 0;
float2 mid_plane = {0,0};
float2 mid_mean2d = {0,0};
uint32_t n_contrib_local = 0;
uint16_t contributed_ids[MAX_NUM_CONTRIBUTORS*4] = { 0 };
// use 4 additional corner points so that we have more accurate estimation of contributed_ids
float corner_Ts[5] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };
float offset_xs[5] = { 0.0f, -0.5f, 0.5f, -0.5f, 0.5f };
float offset_ys[5] = { 0.0f, -0.5f, -0.5f, 0.5f, 0.5f };
// Iterate over batches until all done or range is complete
for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE)
{
// End if entire block votes that it is done rasterizing
int num_done = __syncthreads_count(done);
if (num_done == BLOCK_SIZE)
break;
// Collectively fetch per-Gaussian data from global to shared
int progress = i * BLOCK_SIZE + block.thread_rank();
if (range.x + progress < range.y)
{
int coll_id = gaussian_list[range.x + progress];
collected_id[block.thread_rank()] = coll_id;
collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id];
collected_xy[block.thread_rank()] = gaussians2D[coll_id];
collected_scale[block.thread_rank()] = scales[coll_id];
collected_depth[block.thread_rank()] = gaussian_depths[coll_id];
collected_depth_plane[block.thread_rank()] = ray_planes[coll_id];
}
block.sync();
// Iterate over current batch
for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++)
{
// Keep track of current position in range
contributor++;
// Resample using conic matrix (cf. "Surface
// Splatting" by Zwicker et al., 2001)
// float2 xy = collected_xy[j];
// float2 d = { xy.x - pixf.x, xy.y - pixf.y };
float4 con_o = collected_conic_opacity[j];
float3 scale_j = collected_scale[j];
float depth_center = collected_depth[j];
float2 depth_plane = collected_depth_plane[j];
float2 xy = collected_xy[j];
bool used = false;
for (int k = 0; k < 5; ++k){
float2 d = { xy.x - pixf.x - offset_xs[k], xy.y - pixf.y - offset_ys[k]};
float depth = depth_center + (depth_plane.x * d.x + depth_plane.y * d.y);
// depth must be positive otherwise it is not valid and we skip it
// if (depth <= NEAR_PLANE)
// continue;
float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y;
if (power > 0.0f){
continue;
}
// Eq. (2) from 3D Gaussian splatting paper.
// Obtain alpha by multiplying with Gaussian opacity
// and its exponential falloff from mean.
// Avoid numerical instabilities (see paper appendix).
float alpha = min(0.99f, con_o.w * exp(power));
if (alpha < 1.0f / 255.0f)
continue;
float test_T = corner_Ts[k] * (1 - alpha);
if (test_T < 0.0001f)
{
// done = true;
continue;
}
if (k == 0){
// Eq. (3) from 3D Gaussian splatting paper.
for (int ch = 0; ch < CHANNELS; ch++)
C[ch] += features[collected_id[j] * CHANNELS + ch] * alpha * T;
}
// store maximal depth
if (depth > C[CHANNELS * 2]){
C[CHANNELS * 2] = depth;
}
if (k == 0){
C[CHANNELS * 2 + 1] += alpha * T;
C[CHANNELS] += depth * alpha * T;
if (T > 0.5){
C[CHANNELS+1] = depth;
mid_depth_center = depth_center;
mid_plane = depth_plane;
mid_mean2d = xy;
}
T = test_T;
}
corner_Ts[k] = test_T;
used = true;
}
if (used){
// Keep track of last range entry to update this
// pixel.
last_contributor = contributor;
contributed_ids[n_contrib_local] = (uint16_t)contributor;
n_contrib_local += 1;
if (n_contrib_local >= MAX_NUM_CONTRIBUTORS * 4){
done = true;
printf("ERROR: Maximal contributors are met. This should be fixed! %d\n", n_contrib_local);
break;
}
}
}
}
// All threads that treat valid pixel write out their final
// rendering data to the frame and auxiliary buffers.
if (inside)
{
final_T[pix_id] = T;
n_contrib[pix_id] = last_contributor;
for (int ch = 0; ch < CHANNELS; ch++)
out_color[ch * H * W + pix_id] = C[ch] + T * bg_color[ch];
out_color[3 * H * W + pix_id] = C[CHANNELS];
out_color[4 * H * W + pix_id] = C[CHANNELS+1];
// depth and alpha
out_color[DEPTH_OFFSET * H * W + pix_id] = C[CHANNELS * 2];
out_color[ALPHA_OFFSET * H * W + pix_id] = C[CHANNELS * 2 + 1];
}
// use maximal depth for the current pixel
const float max_depth = C[CHANNELS * 2];
// Allocate storage for batches of collectively fetched data.
int projected_ids[MAX_NUM_PROJECTED] = { 0 };
float2 projected_xy[MAX_NUM_PROJECTED] = { 0.f };
float projected_depth[MAX_NUM_PROJECTED] = { 0.f };
//TODO add a for loop here in case we got more points than MAX_NUM_PROJECTED
uint32_t point_counter_last = 0;
bool point_done = !inside;
int total_projected = 0;
//TODO this for loop is not necessary if we take the minimal value from multiple views
while (true)
{
// End if entire block votes that it is done integrating for all points
int num_done = __syncthreads_count(point_done);
if (num_done == BLOCK_SIZE)
break;
int num_projected = 0;
bool excced_max_projected = false;
done = false;
uint32_t point_counter = 0;
p_toDo = p_range.y - p_range.x;
// check how many points projected to this pixel
// Iterate over batches until all done or range is complete
for (int i = 0; i < p_rounds; i++, p_toDo -= BLOCK_SIZE)
{
//TODO here is not necessary
// End if entire block votes that it is done rasterizing
int num_done = __syncthreads_count(done);
if (num_done == BLOCK_SIZE)
break;
block.sync();
// Collectively fetch per-Gaussian data from global to shared
int progress = i * BLOCK_SIZE + block.thread_rank();
if (p_range.x + progress < p_range.y)
{
int coll_id = point_list[p_range.x + progress];
collected_id[block.thread_rank()] = coll_id;
collected_xy[block.thread_rank()] = points2D[coll_id];
collected_depth[block.thread_rank()] = point_depths[coll_id];
}
block.sync();
// Iterate over current batch
for (int j = 0; !done && j < min(BLOCK_SIZE, p_toDo); j++)
{
point_counter++;
if (point_counter <= point_counter_last){
continue;
}
float2 point_xy = collected_xy[j];
float depth = collected_depth[j];
// if (abs(point_xy.x - pixf.x) < 0.5 && abs(point_xy.y - pixf.y) < 0.5){
if ((point_xy.x >= (pixf.x - 0.5)) && (point_xy.x < (pixf.x + 0.5)) &&
(point_xy.y >= (pixf.y - 0.5)) && (point_xy.y < (pixf.y + 0.5))){
//TODO check the condition here
if (true || max_depth <= 0 || depth < max_depth){
if (num_projected >= MAX_NUM_PROJECTED){
done = true;
excced_max_projected = true;
break;
}
projected_ids[num_projected] = collected_id[j];
projected_xy[num_projected] = point_xy;
projected_depth[num_projected] = depth;
num_projected += 1;
}
}
}
}
point_counter_last = point_counter - 1;
point_done = !excced_max_projected;
total_projected += num_projected;
// reiterate all primitives
toDo = range.y - range.x;
done = false;
//TODO we could allocate the memory with dynamic size
float point_alphas[MAX_NUM_PROJECTED] = { 0.f};
float point_Ts[MAX_NUM_PROJECTED] = {0.f};
// float point_sdf[MAX_NUM_PROJECTED] = {0.f};
for (int i = 0; i < num_projected; i++){
point_Ts[i] = 1.f;
}
uint32_t num_iterated = 0;
bool second_done = !inside;
uint16_t num_contributed_second = 0;
//TODO Note that the range is not correct for the near by points, but we use it as approximation for speed up
// Iterate over batches until all done or range is complete
for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE)
{
// End if entire block votes that it is done rasterizing
int num_done = __syncthreads_count(second_done);
if (num_done == BLOCK_SIZE)
break;
block.sync();
// Collectively fetch per-Gaussian data from global to shared
int progress = i * BLOCK_SIZE + block.thread_rank();
if (range.x + progress < range.y)
{
int coll_id = gaussian_list[range.x + progress];
collected_id[block.thread_rank()] = coll_id;
collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id];
collected_xy[block.thread_rank()] = gaussians2D[coll_id];
collected_scale[block.thread_rank()] = scales[coll_id];
collected_depth[block.thread_rank()] = gaussian_depths[coll_id];
collected_depth_plane[block.thread_rank()] = ray_planes[coll_id];
collected_condition[block.thread_rank()] = condition[coll_id];
for (int ii = 0; ii < 6; ii++)
collected_invraycov[6 * block.thread_rank() + ii] = invraycov3Ds[coll_id * 6 + ii];
}
block.sync();
// Iterate over current batch
for (int j = 0; !second_done && j < min(BLOCK_SIZE, toDo); j++)
{
num_iterated++;
if (num_iterated > last_contributor){
second_done = true;
continue;
}
if (num_iterated != (uint32_t)contributed_ids[num_contributed_second]){
continue;
} else{
num_contributed_second += 1;
}
float4 con_o = collected_conic_opacity[j];
float3 scale_j = collected_scale[j];
float depth_center = collected_depth[j];
float2 depth_plane = collected_depth_plane[j];
float2 xy = collected_xy[j];
glm::mat3 invraycov = glm::mat3(collected_invraycov[6*j+0], collected_invraycov[6*j+1], collected_invraycov[6*j+2],
collected_invraycov[6*j+1], collected_invraycov[6*j+3], collected_invraycov[6*j+4],
collected_invraycov[6*j+2], collected_invraycov[6*j+4], collected_invraycov[6*j+5]);
// iterate over all projected points
for (int k = 0; k < num_projected; k++){
// create the ray
float2 d = { xy.x - projected_xy[k].x, xy.y - projected_xy[k].y};
float depth = depth_center + (depth_plane.x * d.x + depth_plane.y * d.y);
float alpha;
if(collected_condition[j])
{
glm::vec3 delta_u = glm::vec3(d.x, d.y, depth_center-min(projected_depth[k],depth));
float power = -0.5f * (glm::dot(delta_u, invraycov*delta_u));
alpha = min(0.99f, con_o.w * exp(power));
}
else
{
if(projected_depth[k]<depth)
alpha = 0;
else
{
glm::vec3 delta_u = glm::vec3(d.x, d.y, depth_center);
float power = -0.5f * (glm::dot(delta_u, invraycov*delta_u));
alpha = min(0.99f, con_o.w * exp(power));
}
}
// TODO check here
if (alpha < 1.0f / 255.0f){
continue;
}
float test_T = point_Ts[k] * (1 - alpha);
point_alphas[k] += alpha * point_Ts[k];
point_Ts[k] = test_T;
}
}
}
float3 ray_point = { (pixf.x - W/2.) / focal_x, (pixf.y - H/2.) / focal_y, 1.0 };
float depth2distance = sqrt(ray_point.x*ray_point.x+ray_point.y*ray_point.y+ray_point.z*ray_point.z);
// All threads that treat valid pixel write out their final
// rendering data to the frame and auxiliary buffers.
if (inside)
{
// write alphas
for (int k = 0; k < num_projected; k++){
out_alpha_integrated[projected_ids[k]] = point_alphas[k];
// write colors
for (int ch = 0; ch < CHANNELS; ch++)
out_color_integrated[CHANNELS * projected_ids[k] + ch] = C[ch] + T * bg_color[ch];;
out_coordinate2d[2 * projected_ids[k]] = projected_xy[k].x;
out_coordinate2d[2 * projected_ids[k]+1] = projected_xy[k].y;
if(projected_depth[k]>0)
{
float2 d = { mid_mean2d.x - projected_xy[k].x, mid_mean2d.y - projected_xy[k].y};
float depth = mid_depth_center + (mid_plane.x * d.x + mid_plane.y * d.y);
out_sdf[projected_ids[k]] = depth - projected_depth[k];
}
}
}
}
if (inside){
// use the distortion channel to store the number of projected points
out_color[DISTORTION_OFFSET * H * W + pix_id] = (float)total_projected;
}
}
void FORWARD::integrate(
const dim3 grid, dim3 block,
const uint2* gaussian_ranges,
const uint2* point_ranges,
const uint32_t* gaussian_list,
const uint32_t* point_list,
int W, int H,
float focal_x, float focal_y,
const float2* subpixel_offset,
const float2* points2D,
const float2* gaussians2D,
const float* colors,
const float* depths_plane,
const float2* ray_planes,
const float* cov3Ds,
const float* viewmatrix,
const float3* points3D,
const float3* gaussians3D,
const float3* scales,
const float* invraycov3Ds,
const float* point_depths,
const float* gaussian_depths,
const float4* conic_opacity,
const bool* condition,
float* final_T,
uint32_t* n_contrib,
// float* center_depth,
// float4* center_alphas,
const float* bg_color,
float* out_color,
float* out_alpha_integrated,
float* out_color_integrated,
float* out_coordinate2d,
float* out_sdf)
{
integrateCUDA<NUM_CHANNELS> << <grid, block >> > (
gaussian_ranges,
point_ranges,
gaussian_list,
point_list,
W, H,
focal_x, focal_y,
subpixel_offset,
points2D,
gaussians2D,
colors,
depths_plane,
ray_planes,
cov3Ds,
viewmatrix,
points3D,
gaussians3D,
scales,
invraycov3Ds,
point_depths,
gaussian_depths,
conic_opacity,
condition,
final_T,
n_contrib,
// center_depth,
// center_alphas,
bg_color,
out_color,
out_alpha_integrated,
out_color_integrated,
out_coordinate2d,
out_sdf);
}