| | """Coordinate transform utils.""" |
| |
|
| | import torch |
| | import MinkowskiEngine as Me |
| | from typing import List |
| |
|
| | from ..reconstruction.frustum import \ |
| | generate_frustum, compute_camera2frustum_transform |
| |
|
| |
|

def transform_feat3d_coordinates(
    feat3d, intrinsic,
    image_size=(120, 160),
    depth_min=0.4, depth_max=6.0,
    voxel_size=0.03
):
    """
    Transform feat3d coordinates to match the Uni3D coordinate system.

    Args:
        feat3d: Me.SparseTensor from occupancy-aware lifting
        intrinsic: camera intrinsic matrix (4x4), optionally batched as (B, 4, 4)
        image_size: tuple of (height, width)
        depth_min, depth_max: depth range in meters
        voxel_size: voxel size in meters

    Returns:
        Me.SparseTensor with transformed coordinates
    """
    device = feat3d.device
    coords = feat3d.C.clone()

    # Flip the first two spatial axes within the 256^3 frustum grid.
    coords[:, 1:3] = 256 - coords[:, 1:3]
    batch_indices = coords[:, 0].unique()

    # When every sample shares the same intrinsics, the camera-to-frustum
    # transform only needs to be computed once for the whole batch.
    compute_once = True
    if intrinsic.dim() == 3:
        if len(batch_indices) > 1:
            compute_once = torch.allclose(intrinsic[0:1].expand_as(intrinsic), intrinsic, atol=1e-6)
        intrinsic_ref = intrinsic[0] if compute_once else None
    else:
        intrinsic_ref = intrinsic

    if compute_once:
        intrinsic_batch = intrinsic_ref
        intrinsic_inverse = torch.inverse(intrinsic_batch)
        frustum = generate_frustum(image_size, intrinsic_inverse, depth_min, depth_max)
        camera2frustum, padding_offsets = compute_camera2frustum_transform(
            frustum.to(device), voxel_size,
            frustum_dimensions=torch.tensor([256, 256, 256], device=device)
        )
        camera2frustum = camera2frustum.to(device)
        padding_offsets = padding_offsets.to(device)
        camera2frustum_inv = torch.inverse(camera2frustum).float()
        ones_offset = torch.tensor([1., 1., 1.], device=device)

    transformed_coords_list = []

    for batch_idx in batch_indices:
        batch_mask = coords[:, 0] == batch_idx
        batch_coords = coords[batch_mask, 1:].float()

        # Per-sample intrinsics: recompute the transform for this sample.
        if not compute_once:
            intrinsic_batch = intrinsic[int(batch_idx)]
            intrinsic_inverse = torch.inverse(intrinsic_batch)
            frustum = generate_frustum(image_size, intrinsic_inverse, depth_min, depth_max)
            camera2frustum, padding_offsets = compute_camera2frustum_transform(
                frustum.to(device), voxel_size,
                frustum_dimensions=torch.tensor([256, 256, 256], device=device)
            )
            camera2frustum = camera2frustum.float().to(device)
            padding_offsets = padding_offsets.to(device)
            camera2frustum_inv = torch.inverse(camera2frustum).float()
            ones_offset = torch.tensor([1., 1., 1.], device=device)

        # Undo the frustum padding and the one-voxel border offset.
        batch_coords_adjusted = batch_coords - padding_offsets - ones_offset

        # Homogeneous coordinates for the 4x4 transforms.
        homogenous_coords = torch.cat([
            batch_coords_adjusted,
            torch.ones(batch_coords_adjusted.shape[0], 1, device=device)
        ], dim=1)

        # Apply the inverse camera-to-frustum transform, then the forward transform.
        world_coords = torch.mm(camera2frustum_inv, homogenous_coords.t())
        final_coords_homog = torch.mm(camera2frustum.float(), world_coords.float())
        final_coords = final_coords_homog.t()[:, :3]

        # Re-apply the padding offsets.
        final_coords = final_coords + padding_offsets

        # Prepend the batch index column expected by MinkowskiEngine.
        batch_column = torch.full(
            (final_coords.shape[0], 1),
            batch_idx,
            device=device,
            dtype=torch.float32
        )
        final_batch_coords = torch.cat([batch_column, final_coords], dim=1)
        transformed_coords_list.append(final_batch_coords)

    transformed_coords = torch.cat(transformed_coords_list, dim=0)

    transformed_feat3d = Me.SparseTensor(
        features=feat3d.F,
        coordinates=transformed_coords.int(),
        tensor_stride=feat3d.tensor_stride,
        quantization_mode=feat3d.quantization_mode
    )

    return transformed_feat3d
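

# Minimal usage sketch for transform_feat3d_coordinates. The voxel count,
# feature width, and intrinsic values below are illustrative assumptions, not
# values required by this module; in practice feat3d comes from the
# occupancy-aware lifting stage.
def _example_transform_feat3d_usage():
    coords = torch.cat([
        torch.zeros(1024, 1, dtype=torch.int32),              # batch index column
        torch.randint(0, 256, (1024, 3), dtype=torch.int32)   # voxel coordinates
    ], dim=1)
    feat3d = Me.SparseTensor(features=torch.randn(1024, 64), coordinates=coords)
    intrinsic = torch.tensor([                                 # made-up 4x4 intrinsics
        [138.6, 0.0, 79.5, 0.0],
        [0.0, 138.6, 59.5, 0.0],
        [0.0, 0.0, 1.0, 0.0],
        [0.0, 0.0, 0.0, 1.0],
    ])
    return transform_feat3d_coordinates(feat3d, intrinsic, image_size=(120, 160))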


def fuse_sparse_tensors(tensor1: Me.SparseTensor, tensor2: Me.SparseTensor) -> Me.SparseTensor:
    """
    Efficiently fuse two sparse tensors by concatenating their features
    on the union of their coordinates.

    Args:
        tensor1 (Me.SparseTensor): first sparse tensor
        tensor2 (Me.SparseTensor): second sparse tensor

    Returns:
        Me.SparseTensor: fused sparse tensor with concatenated features
    """
    device = tensor1.device
    dtype = tensor1.F.dtype

    coords1, feats1 = tensor1.C, tensor1.F
    coords2, feats2 = tensor2.C, tensor2.F

    feat_dim1, feat_dim2 = feats1.shape[1], feats2.shape[1]
    fused_feat_dim = feat_dim1 + feat_dim2

    # Stack both coordinate sets so shared voxels can be deduplicated.
    all_coords = torch.cat([coords1, coords2], dim=0)
    n_coords1 = coords1.shape[0]

    coord_view = all_coords.view(all_coords.shape[0], -1)

    # Unique coordinates plus, for every original row, its index in the unique set.
    unique_coord_view, inverse_indices = torch.unique(coord_view, dim=0, return_inverse=True)
    unique_coords = unique_coord_view.view(-1, coords1.shape[1])
    n_unique = unique_coords.shape[0]

    inv_indices_1 = inverse_indices[:n_coords1]
    inv_indices_2 = inverse_indices[n_coords1:]

    # Scatter each tensor's features into its own channel range; voxels that
    # appear in only one input keep zeros in the other range.
    fused_features = torch.zeros(n_unique, fused_feat_dim, device=device, dtype=dtype)
    fused_features[inv_indices_1, :feat_dim1] = feats1
    fused_features[inv_indices_2, feat_dim1:] = feats2

    fused_tensor = Me.SparseTensor(
        features=fused_features,
        coordinates=unique_coords.int(),
        tensor_stride=tensor1.tensor_stride,
        quantization_mode=tensor1.quantization_mode
    )
    return fused_tensor
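

# Minimal usage sketch for fuse_sparse_tensors. The coordinates and feature
# widths are illustrative assumptions; both inputs are expected to share the
# same tensor stride and coordinate convention.
def _example_fuse_usage():
    coords_a = torch.tensor([[0, 0, 0, 0], [0, 2, 2, 2]], dtype=torch.int32)
    coords_b = torch.tensor([[0, 2, 2, 2], [0, 4, 4, 4]], dtype=torch.int32)
    tensor_a = Me.SparseTensor(features=torch.randn(2, 32), coordinates=coords_a)
    tensor_b = Me.SparseTensor(features=torch.randn(2, 16), coordinates=coords_b)
    # Expected result: 3 unique voxels with 48 feature channels; the shared
    # voxel at (2, 2, 2) carries features from both inputs.
    return fuse_sparse_tensors(tensor_a, tensor_b)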


def generate_multiscale_feat3d(transformed_feat3d: Me.SparseTensor) -> List[Me.SparseTensor]:
    """
    Generate multi-scale sparse 3D features from transformed_feat3d to match
    the sparse_multi_scale_features structure.

    Args:
        transformed_feat3d (Me.SparseTensor):
            input sparse tensor from occupancy-aware lifting (256 grid)

    Returns:
        List[Me.SparseTensor]: multi-scale sparse tensors at scales
        [1/2, 1/4, 1/8], corresponding to [128, 64, 32] grid sizes
    """
    device = transformed_feat3d.device

    # A stride-2 max pooling halves the resolution at each scale.
    pooling_op = Me.MinkowskiMaxPooling(
        kernel_size=3,
        stride=2,
        dimension=3
    ).to(device)

    multi_scale_feat3d = []
    current_tensor = transformed_feat3d
    target_strides = [2, 4, 8]

    for target_stride in target_strides:
        pooled_tensor = pooling_op(current_tensor)

        # Force the expected tensor stride if MinkowskiEngine's bookkeeping
        # disagrees with the target.
        if pooled_tensor.tensor_stride != target_stride:
            pooled_tensor = Me.SparseTensor(
                features=pooled_tensor.F,
                coordinates=pooled_tensor.C,
                tensor_stride=target_stride,
                quantization_mode=pooled_tensor.quantization_mode
            )

        multi_scale_feat3d.append(pooled_tensor)

        # The pooled tensor becomes the input for the next, coarser scale.
        current_tensor = pooled_tensor

    return multi_scale_feat3d
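

# Minimal usage sketch for generate_multiscale_feat3d. The voxel count and
# feature width are illustrative assumptions; the real input is the output of
# transform_feat3d_coordinates on the 256^3 frustum grid.
def _example_multiscale_usage():
    coords = torch.cat([
        torch.zeros(512, 1, dtype=torch.int32),
        torch.randint(0, 256, (512, 3), dtype=torch.int32)
    ], dim=1)
    feat3d = Me.SparseTensor(features=torch.randn(512, 64), coordinates=coords)
    # Expected result: three sparse tensors at tensor strides 2, 4, and 8,
    # i.e. grid sizes 128, 64, and 32.
    return generate_multiscale_feat3d(feat3d)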