# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Coordinate transform utils."""

from typing import List

import torch
import MinkowskiEngine as Me

from ..reconstruction.frustum import \
    generate_frustum, compute_camera2frustum_transform

# Edge length of the cubic voxel grid: used both for the coordinate flip and
# as the frustum dimensions handed to compute_camera2frustum_transform.
_GRID_SIZE = 256


def _build_frustum_transform(intrinsic, image_size, depth_min, depth_max,
                             voxel_size, device):
    """Build the camera<->frustum transform for a single intrinsic matrix.

    Returns:
        Tuple ``(camera2frustum, padding_offsets, camera2frustum_inv)``, all
        on ``device``; both matrices are cast to float32.
    """
    intrinsic_inverse = torch.inverse(intrinsic)
    frustum = generate_frustum(image_size, intrinsic_inverse, depth_min, depth_max)
    camera2frustum, padding_offsets = compute_camera2frustum_transform(
        frustum.to(device),
        voxel_size,
        frustum_dimensions=torch.tensor([_GRID_SIZE] * 3, device=device),
    )
    camera2frustum = camera2frustum.to(device)
    # Invert in the matrix's native dtype, then cast, so no precision is lost
    # before the inversion.
    camera2frustum_inv = torch.inverse(camera2frustum).float()
    return camera2frustum.float(), padding_offsets.to(device), camera2frustum_inv


def transform_feat3d_coordinates(
    feat3d,
    intrinsic,
    image_size=(120, 160),
    depth_min=0.4,
    depth_max=6.0,
    voxel_size=0.03
):
    """
    Transform feat3d coordinates to match Uni3D coordinate system

    The first two spatial columns of the coordinates are flipped around the
    256 grid, then every voxel is mapped frustum -> camera -> frustum with the
    camera2frustum matrix derived from the intrinsics.

    Args:
        feat3d: Me.SparseTensor from occupancy-aware lifting
        intrinsic: Camera intrinsic matrix (4x4), or a batch of them with a
            leading batch dimension (detected via ``intrinsic.dim() == 3``)
        image_size: tuple of (height, width)
        depth_min, depth_max: depth range
        voxel_size: voxel size in meters

    Returns:
        Me.SparseTensor with transformed coordinates. Row ``i`` of the
        returned coordinates corresponds to row ``i`` of ``feat3d.F``.
    """
    device = feat3d.device
    coords = feat3d.C.clone()

    # step 1: Apply coordinate flip (as done in BackProjection)
    coords[:, 1:3] = _GRID_SIZE - coords[:, 1:3]

    batch_indices = coords[:, 0].unique()

    # One transform can be shared by every batch element unless several
    # intrinsics are supplied and they actually differ. This is decided from
    # the intrinsics alone, so a single populated batch index other than 0
    # still gets its own matrix.
    shared = True
    if intrinsic.dim() == 3 and intrinsic.shape[0] > 1:
        shared = torch.allclose(
            intrinsic[0:1].expand_as(intrinsic), intrinsic, atol=1e-6
        )

    shared_transform = None
    if shared:
        intrinsic_ref = intrinsic[0] if intrinsic.dim() == 3 else intrinsic
        shared_transform = _build_frustum_transform(
            intrinsic_ref, image_size, depth_min, depth_max, voxel_size, device
        )

    # Results are written back through the batch mask rather than
    # concatenated per batch: concatenation would regroup the rows by batch
    # index while feat3d.F keeps its original row order, silently pairing
    # features with the wrong coordinates whenever the input rows are not
    # already sorted by batch.
    transformed_coords = torch.empty(coords.shape, dtype=torch.int32, device=device)
    transformed_coords[:, 0] = coords[:, 0]

    for batch_idx in batch_indices:
        batch_mask = coords[:, 0] == batch_idx
        batch_coords = coords[batch_mask, 1:].float()

        if shared:
            transform = shared_transform
        else:
            transform = _build_frustum_transform(
                intrinsic[int(batch_idx)],
                image_size, depth_min, depth_max, voxel_size, device
            )
        camera2frustum, padding_offsets, camera2frustum_inv = transform

        # convert voxel coordinates to world coordinates (reverse of BackProjection)
        batch_coords_adjusted = batch_coords - padding_offsets - 1.0

        # convert to homogeneous coordinates
        homogeneous_coords = torch.cat([
            batch_coords_adjusted,
            torch.ones(batch_coords_adjusted.shape[0], 1, device=device)
        ], dim=1)  # [N_batch, 4]

        # apply transformations: frustum space -> camera space -> frustum space
        world_coords = torch.mm(camera2frustum_inv, homogeneous_coords.t())
        final_coords = torch.mm(camera2frustum, world_coords.float()).t()[:, :3]

        # add padding offsets (as done in SparseProjection.projection())
        final_coords = final_coords + padding_offsets

        # Round before the integer cast: the matrix round trip leaves float
        # noise, and plain truncation would turn e.g. 4.99999 into 4.
        transformed_coords[batch_mask, 1:] = \
            torch.round(final_coords).to(transformed_coords.dtype)

    transformed_feat3d = Me.SparseTensor(
        features=feat3d.F,
        coordinates=transformed_coords,
        tensor_stride=feat3d.tensor_stride,
        quantization_mode=feat3d.quantization_mode
    )

    return transformed_feat3d


def fuse_sparse_tensors(tensor1: Me.SparseTensor, tensor2: Me.SparseTensor) -> Me.SparseTensor:
    """
    Efficiently fuse two sparse tensors

    The output holds one row per distinct coordinate found in either input.
    Its feature vector is ``[tensor1 features | tensor2 features]``; the half
    belonging to a tensor that has no voxel at that coordinate stays zero.

    Args:
        tensor1 (Me.SparseTensor): First sparse tensor
        tensor2 (Me.SparseTensor): Second sparse tensor

    Returns:
        Me.SparseTensor: Fused sparse tensor with concatenated features, using
        the device, feature dtype, tensor stride and quantization mode of
        ``tensor1``.
    """
    device = tensor1.device
    dtype = tensor1.F.dtype

    # get coordinates and features
    coords1, feats1 = tensor1.C, tensor1.F
    coords2, feats2 = tensor2.C, tensor2.F

    feat_dim1, feat_dim2 = feats1.shape[1], feats2.shape[1]
    n_coords1 = coords1.shape[0]

    # Deduplicate the stacked coordinate rows; inverse_indices maps every
    # input row to its row in the deduplicated set.
    all_coords = torch.cat([coords1, coords2], dim=0)
    unique_coords, inverse_indices = torch.unique(
        all_coords, dim=0, return_inverse=True
    )

    # split inverse indices for each tensor
    inv_indices_1 = inverse_indices[:n_coords1]
    inv_indices_2 = inverse_indices[n_coords1:]

    # pre-allocate with zeros for automatic padding
    fused_features = torch.zeros(
        unique_coords.shape[0], feat_dim1 + feat_dim2, device=device, dtype=dtype
    )

    # tensor1 features go to positions [0:feat_dim1]
    fused_features[inv_indices_1, :feat_dim1] = feats1
    # tensor2 features go to positions [feat_dim1:feat_dim1+feat_dim2]; the
    # explicit cast guards the indexed assignment against a feature dtype
    # that differs from tensor1's (a no-op when the dtypes already agree).
    fused_features[inv_indices_2, feat_dim1:] = feats2.to(dtype=dtype)

    fused_tensor = Me.SparseTensor(
        features=fused_features,
        coordinates=unique_coords.int(),
        tensor_stride=tensor1.tensor_stride,
        quantization_mode=tensor1.quantization_mode
    )

    return fused_tensor


def generate_multiscale_feat3d(transformed_feat3d: Me.SparseTensor) -> List[Me.SparseTensor]:
    """
    Generate multi-scale sparse 3D features from transformed_feat3d to match
    sparse_multi_scale_features structure.

    Args:
        transformed_feat3d (Me.SparseTensor): Input sparse tensor from
            occupancy-aware lifting (256 grid)

    Returns:
        List[Me.SparseTensor]: Multi-scale sparse tensors at scales
            [1/2, 1/4, 1/8] corresponding to [128, 64, 32] grid sizes
    """
    device = transformed_feat3d.device

    # use consistent stride 2 for progressive downsampling
    # this ensures proper 1/2, 1/4, 1/8 scaling from original 256 grid
    pooling_op = Me.MinkowskiMaxPooling(
        kernel_size=3,
        stride=2,
        dimension=3
    ).to(device)

    multi_scale_feat3d = []
    current_tensor = transformed_feat3d

    # Expected final stride of each successive scale.
    for target_stride in (2, 4, 8):
        # apply stride-2 pooling to get next scale
        pooled_tensor = pooling_op(current_tensor)

        # Re-wrap the pooled result so that it reports the expected stride.
        # NOTE(review): tensor_stride looks like a per-axis sequence in
        # MinkowskiEngine, in which case this comparison against an int is
        # always unequal and the tensor is rebuilt on every scale - confirm
        # whether that is intended before tightening the check.
        if pooled_tensor.tensor_stride != target_stride:
            pooled_tensor = Me.SparseTensor(
                features=pooled_tensor.F,
                coordinates=pooled_tensor.C,
                tensor_stride=target_stride,
                quantization_mode=pooled_tensor.quantization_mode
            )

        multi_scale_feat3d.append(pooled_tensor)

        # use pooled tensor as input for next scale (progressive downsampling)
        # this gives us: 256 -> 128 -> 64 -> 32 grid sizes
        current_tensor = pooled_tensor

    return multi_scale_feat3d