# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Coordinate transform utils."""

from typing import List

import torch
import MinkowskiEngine as Me

from ..reconstruction.frustum import (
    generate_frustum,
    compute_camera2frustum_transform,
)


def transform_feat3d_coordinates(
    feat3d, intrinsic,
    image_size=(120, 160),
    depth_min=0.4, depth_max=6.0,
    voxel_size=0.03
):
    """
    Transform feat3d coordinates to match the Uni3D coordinate system.

    Args:
        feat3d: Me.SparseTensor from occupancy-aware lifting
        intrinsic: Camera intrinsic matrix (4x4)
        image_size: tuple of (height, width)
        depth_min, depth_max: near and far bounds of the frustum depth range
        voxel_size: voxel size in meters
    Returns:
        Me.SparseTensor with transformed coordinates
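
    Example (a hedged usage sketch; lifted_feat3d and cam_intrinsic are
    illustrative caller-supplied placeholders, not values produced here)::

        aligned = transform_feat3d_coordinates(
            lifted_feat3d, cam_intrinsic,
            image_size=(120, 160), depth_min=0.4, depth_max=6.0,
            voxel_size=0.03,
        )
        # aligned.C is expressed in the Uni3D-aligned frustum voxel grid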
    """
    device = feat3d.device
    coords = feat3d.C.clone()

    # step 1: Apply coordinate flip (as done in BackProjection line 33)
    coords[:, 1:3] = 256 - coords[:, 1:3]  # flip x, y coordinates
    batch_indices = coords[:, 0].unique()

    compute_once = True
    if intrinsic.dim() == 3:  # batched intrinsics
        # check if all intrinsics are identical
        if len(batch_indices) > 1:
            compute_once = torch.allclose(intrinsic[0:1].expand_as(intrinsic), intrinsic, atol=1e-6)
        intrinsic_ref = intrinsic[0] if compute_once else None
    else:
        intrinsic_ref = intrinsic

    if compute_once:
        intrinsic_batch = intrinsic_ref
        intrinsic_inverse = torch.inverse(intrinsic_batch)
        frustum = generate_frustum(image_size, intrinsic_inverse, depth_min, depth_max)
        camera2frustum, padding_offsets = compute_camera2frustum_transform(
            frustum.to(device), voxel_size,
            frustum_dimensions=torch.tensor([256, 256, 256], device=device)
        )
        # pre-move to device and pre-compute inverse
        camera2frustum = camera2frustum.to(device)
        padding_offsets = padding_offsets.to(device)
        camera2frustum_inv = torch.inverse(camera2frustum).float()
        ones_offset = torch.tensor([1., 1., 1.], device=device)

    transformed_coords_list = []

    for batch_idx in batch_indices:
        batch_mask = coords[:, 0] == batch_idx
        batch_coords = coords[batch_mask, 1:].float()  # convert to float once per batch

        # use pre-computed values or compute per-batch
        if not compute_once:
            intrinsic_batch = intrinsic[int(batch_idx)]
            intrinsic_inverse = torch.inverse(intrinsic_batch)
            frustum = generate_frustum(image_size, intrinsic_inverse, depth_min, depth_max)
            camera2frustum, padding_offsets = compute_camera2frustum_transform(
                frustum.to(device), voxel_size,
                frustum_dimensions=torch.tensor([256, 256, 256], device=device)
            )
            camera2frustum = camera2frustum.float().to(device)
            padding_offsets = padding_offsets.to(device)
            camera2frustum_inv = torch.inverse(camera2frustum).float()
            ones_offset = torch.tensor([1., 1., 1.], device=device)

        # convert voxel coordinates to world coordinates (reverse of BackProjection)
        batch_coords_adjusted = batch_coords - padding_offsets - ones_offset

        # convert to homogeneous coordinates
        homogeneous_coords = torch.cat([
            batch_coords_adjusted,
            torch.ones(batch_coords_adjusted.shape[0], 1, device=device)
        ], dim=1)  # [N_batch, 4]

        # map frustum voxel coords to camera space and back into frustum space
        world_coords = torch.mm(camera2frustum_inv, homogeneous_coords.t())
        final_coords_homog = torch.mm(camera2frustum.float(), world_coords.float())
        final_coords = final_coords_homog.t()[:, :3]

        # add padding offsets (as done in SparseProjection.projection())
        final_coords = final_coords + padding_offsets

        # add batch index back
        batch_column = torch.full(
            (final_coords.shape[0], 1),
            batch_idx,
            device=device,
            dtype=torch.float32
        )
        final_batch_coords = torch.cat([batch_column, final_coords], dim=1)
        transformed_coords_list.append(final_batch_coords)

    transformed_coords = torch.cat(transformed_coords_list, dim=0)

    transformed_feat3d = Me.SparseTensor(
        features=feat3d.F,
        coordinates=transformed_coords.int(),
        tensor_stride=feat3d.tensor_stride,
        quantization_mode=feat3d.quantization_mode
    )

    return transformed_feat3d


def fuse_sparse_tensors(tensor1: Me.SparseTensor, tensor2: Me.SparseTensor) -> Me.SparseTensor:
    """
    Efficiently fuse two sparse tensors by taking the union of their
    coordinates and concatenating their features channel-wise.

    Args:
        tensor1 (Me.SparseTensor): First sparse tensor
        tensor2 (Me.SparseTensor): Second sparse tensor

    Returns:
        Me.SparseTensor: Fused sparse tensor with concatenated features
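
    Example (a hedged sketch; geo_feat and sem_feat are hypothetical inputs
    sharing the same tensor_stride and quantization_mode)::

        fused = fuse_sparse_tensors(geo_feat, sem_feat)
        # fused.F has geo_feat.F.shape[1] + sem_feat.F.shape[1] channels;
        # a coordinate present in only one input keeps zeros in the other
        # input's channel slice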
    """
    device = tensor1.device
    dtype = tensor1.F.dtype

    # get coordinates and features
    coords1, feats1 = tensor1.C, tensor1.F
    coords2, feats2 = tensor2.C, tensor2.F

    feat_dim1, feat_dim2 = feats1.shape[1], feats2.shape[1]
    fused_feat_dim = feat_dim1 + feat_dim2

    # concatenate coordinates and create source tracking
    all_coords = torch.cat([coords1, coords2], dim=0)
    n_coords1 = coords1.shape[0]

    # deduplicate coordinate rows; return_inverse maps every input row to the
    # index of its unique coordinate
    unique_coords, inverse_indices = torch.unique(all_coords, dim=0, return_inverse=True)
    n_unique = unique_coords.shape[0]

    # split inverse indices for each tensor
    inv_indices_1 = inverse_indices[:n_coords1]
    inv_indices_2 = inverse_indices[n_coords1:]

    # pre-allocate with zeros for automatic padding
    fused_features = torch.zeros(n_unique, fused_feat_dim, device=device, dtype=dtype)

    # tensor1 features go to positions [0:feat_dim1]
    fused_features[inv_indices_1, :feat_dim1] = feats1

    # tensor2 features go to positions [feat_dim1:feat_dim1+feat_dim2]
    fused_features[inv_indices_2, feat_dim1:] = feats2
    fused_tensor = Me.SparseTensor(
        features=fused_features,
        coordinates=unique_coords.int(),
        tensor_stride=tensor1.tensor_stride,
        quantization_mode=tensor1.quantization_mode
    )
    return fused_tensor


def generate_multiscale_feat3d(transformed_feat3d: Me.SparseTensor) -> List[Me.SparseTensor]:
    """
    Generate multi-scale sparse 3D features from transformed_feat3d to match
    the sparse_multi_scale_features structure.

    Args:
        transformed_feat3d (Me.SparseTensor):
            Input sparse tensor from occupancy-aware lifting (256 grid)

    Returns:
        List[Me.SparseTensor]: Multi-scale sparse tensors at scales
        [1/2, 1/4, 1/8], corresponding to [128, 64, 32] grid sizes
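
    Example (a hedged sketch; grid sizes assume the 256^3 input noted above)::

        multi_scale = generate_multiscale_feat3d(transformed_feat3d)
        # multi_scale[0] has tensor stride 2 (~128 grid),
        # multi_scale[1] stride 4 (~64), multi_scale[2] stride 8 (~32)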
    """
    device = transformed_feat3d.device

    # use consistent stride 2 for progressive downsampling
    # this ensures proper 1/2, 1/4, 1/8 scaling from original 256 grid
    pooling_op = Me.MinkowskiMaxPooling(
        kernel_size=3,
        stride=2,
        dimension=3
    ).to(device)

    multi_scale_feat3d = []
    current_tensor = transformed_feat3d
    target_strides = [2, 4, 8]  # Expected final strides for each scale

    # generate features at each scale by progressive pooling with stride 2
    for target_stride in target_strides:
        # apply stride-2 pooling to get next scale
        pooled_tensor = pooling_op(current_tensor)

        # ensure the tensor stride matches the expected value (2, 4, 8 relative
        # to the original grid); MinkowskiEngine reports the stride as a
        # per-dimension list, so compare against the expanded form
        if list(pooled_tensor.tensor_stride) != [target_stride] * 3:
            pooled_tensor = Me.SparseTensor(
                features=pooled_tensor.F,
                coordinates=pooled_tensor.C,
                tensor_stride=target_stride,
                quantization_mode=pooled_tensor.quantization_mode
            )

        multi_scale_feat3d.append(pooled_tensor)

        # use pooled tensor as input for next scale (progressive downsampling)
        # this gives us: 256 → 128 → 64 → 32 grid sizes
        current_tensor = pooled_tensor

    return multi_scale_feat3d
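

if __name__ == "__main__":
    # Minimal smoke test (a hedged sketch, not part of the pipeline): builds
    # tiny random sparse tensors and exercises the helpers above. The
    # intrinsic below is an illustrative placeholder, not a calibrated camera
    # matrix. Run via `python -m <package>.<module>` so the relative import
    # above resolves.
    n_points = 64
    coords = torch.cat([
        torch.zeros(n_points, 1, dtype=torch.int32),               # batch index 0
        torch.randint(0, 256, (n_points, 3), dtype=torch.int32),   # x, y, z voxels
    ], dim=1)
    feat_a = Me.SparseTensor(features=torch.rand(n_points, 8), coordinates=coords)
    feat_b = Me.SparseTensor(features=torch.rand(n_points, 16), coordinates=coords)

    intrinsic = torch.tensor([
        [138.6, 0.0, 80.0, 0.0],
        [0.0, 138.6, 60.0, 0.0],
        [0.0, 0.0, 1.0, 0.0],
        [0.0, 0.0, 0.0, 1.0],
    ])

    aligned = transform_feat3d_coordinates(feat_a, intrinsic)
    fused = fuse_sparse_tensors(feat_a, feat_b)
    pyramid = generate_multiscale_feat3d(fused)
    print(aligned.C.shape, fused.F.shape, [t.tensor_stride for t in pyramid])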