vpraveen-nv and tinlam-nv committed
Commit f4a0919 · verified · 1 Parent(s): 753da32

Update model inference code and environment setup instructions (#4)


- Update model inference code and environment setup instructions (911b37924fa99d3bb5958065d613047c8f21d115)
- Add 3D checkpoint (57eed615af595e01cd946aa2cf033d13ac027b21)
- Add 2D TorchScript model (6f5c9c267e07aa33e76aa77ec4db3b152b2e16b1)


Co-authored-by: Lam Thanh Tin <tinlam-nv@users.noreply.huggingface.co>

README.md CHANGED
@@ -20,6 +20,63 @@ Global <br>
20
 
21
  This model is intended for researchers and developers building 3D scene understanding applications for indoor environments, including robotics navigation, augmented reality, virtual reality, and architectural visualization. <br>
22
 
23
+ ## How to use
24
+ ### Setup environment
25
+
26
+ ```bash
27
+ # Setup NVPanoptix-3D env (CUDA 11.8):
28
+ conda create -n nvpanoptix python=3.10 -y
29
+ conda activate nvpanoptix  # or "source activate nvpanoptix" on older conda installs
30
+ apt-get update && apt-get install -y git git-lfs ninja-build cmake libopenblas-dev
31
+ git lfs install
32
+
33
+ git clone https://huggingface.co/nvidia/nvpanoptix-3d
34
+ cd nvpanoptix-3d
35
+ pip install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu118
36
+ pip install -r requirements.txt
37
+
38
+ # Temporarily set CUDA architecture list for MinkowskiEngine
39
+ export TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 9.0+PTX"
40
+ pip install ninja && FORCE_CUDA=1 pip install git+https://github.com/NVIDIA/MinkowskiEngine.git --no-build-isolation
41
+ ```
42
+
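Before moving on, it can help to confirm the build succeeded. The snippet below is a minimal, optional sanity check (assuming the `nvpanoptix` environment created above is active); it only verifies that the CUDA build of PyTorch and the compiled MinkowskiEngine import cleanly.

```python
# Optional post-install sanity check for the environment created above.
import torch
import MinkowskiEngine as Me

print(f"PyTorch {torch.__version__}, CUDA available: {torch.cuda.is_available()}")
print(f"MinkowskiEngine {Me.__version__}")
assert torch.cuda.is_available(), "A CUDA-capable GPU is required for inference"
```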
43
+ ### Quick Start
44
+ ```python
45
+ from model import PanopticRecon3DModel
46
+ from preprocessing import load_image
47
+ from visualization import save_outputs
48
+ from PIL import Image
49
+ import numpy as np
50
+
51
+ # Load model from local directory
52
+ model = PanopticRecon3DModel.from_pretrained("nvpanoptix-3d")
53
+
54
+ # Or load from HF repo
55
+ # model = PanopticRecon3DModel.from_pretrained("nvidia/nvpanoptix-3d")
56
+
57
+ # Load and preprocess image
58
+ image_path = "path/to/your/image.png"
59
+
60
+ # Keep the original image for visualization
61
+ orig_image = Image.open(image_path).convert("RGB")
62
+ orig_image = np.array(orig_image)
63
+
64
+ # Load the preprocessed image, resized to the model input size, for inference
65
+ image = load_image(image_path, target_size=(320, 240))
66
+
67
+ # Run inference
68
+ outputs = model.predict(image)
69
+
70
+ # Save results (2D segmentation, depth map, 3D mesh)
71
+ save_outputs(outputs, "output_dir/", original_image=orig_image)
72
+
73
+ # Access individual outputs
74
+ print(f"2D Panoptic: {outputs.panoptic_seg_2d.shape}") # (120, 160)
75
+ print(f"2D Depth: {outputs.depth_2d.shape}") # (120, 160)
76
+ print(f"3D Geometry: {outputs.geometry_3d.shape}") # (256, 256, 256)
77
+ print(f"3D Semantic: {outputs.semantic_seg_3d.shape}") # (256, 256, 256)
78
+ ```
79
+
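The quick start above relies on the default camera intrinsics from `preprocessing.DEFAULT_INTRINSIC`. `predict()` also accepts an explicit 4x4 intrinsic matrix and a precomputed frustum mask; the sketch below shows that path via `create_frustum_mask` (the focal lengths and principal point here are illustrative placeholders, not calibrated values).

```python
import numpy as np
from preprocessing import create_frustum_mask

# Hypothetical pinhole intrinsics for a 320x240 input; substitute your
# camera's calibration. The values below are placeholders.
intrinsic = np.array([
    [277.0,   0.0, 160.0, 0.0],
    [  0.0, 277.0, 120.0, 0.0],
    [  0.0,   0.0,   1.0, 0.0],
    [  0.0,   0.0,   0.0, 1.0],
], dtype=np.float32)

# Precompute the 256^3 frustum mask instead of letting predict() build it.
frustum_mask = create_frustum_mask(
    intrinsics=intrinsic,
    volume_shape=(256, 256, 256),
    depth_range=(0.4, 6.0),
    voxel_size=0.03,
    image_shape=(240, 320),  # (height, width)
)

outputs = model.predict(image, frustum_mask=frustum_mask, intrinsic=intrinsic)
```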
80
  ## Release Date
81
  Hugging Face: 11/25/2025 via https://huggingface.co/nvidia/3d_panoptic_reconstruction <br>
82
 
config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_classes": 13,
3
+ "num_thing_classes": 9,
4
+ "object_mask_threshold": 0.8,
5
+ "overlap_threshold": 0.5,
6
+ "frustum_dims": 256,
7
+ "truncation": 3.0,
8
+ "iso_recon_value": 2.0,
9
+ "voxel_size": 0.03,
10
+ "depth_min": 0.4,
11
+ "depth_max": 6.0,
12
+ "target_size": [320, 240],
13
+ "reduced_target_size": [160, 120],
14
+ "size_divisibility": 32,
15
+ "downsample_factor": 1,
16
+ "is_matterport": false,
17
+ "use_fp16_2d": false
18
+ }
19
+
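The keys in config.json mirror the keyword arguments of `PanopticRecon3DModel` and the fields of the `PanopticRecon3DConfig` dataclass defined in model.py below. A minimal sketch of reading it by hand (normally `from_pretrained` handles this):

```python
import json
from model import PanopticRecon3DConfig

# Parse config.json and rebuild the dataclass; from_dict converts the JSON
# lists (e.g. target_size) back into tuples. Missing keys fall back to defaults.
with open("config.json") as f:
    cfg = PanopticRecon3DConfig.from_dict(json.load(f))

print(cfg.frustum_dims, cfg.voxel_size, cfg.target_size)
```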
model.py ADDED
@@ -0,0 +1,987 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """
17
+ NVPanoptix-3D Model.
18
+ """
19
+
20
+ import json
21
+ from pathlib import Path
22
+ from omegaconf import OmegaConf
23
+ from dataclasses import dataclass
24
+ from typing import Optional, Tuple, List, Dict, Any, Union
25
+
26
+ import numpy as np
27
+ import torch
28
+ import torch.nn as nn
29
+ from torch.nn import functional as F
30
+ from huggingface_hub import PyTorchModelHubMixin, hf_hub_download
31
+ from preprocessing import create_frustum_mask, DEFAULT_INTRINSIC
32
+
33
+ from nvpanoptix_3d.model_3d import Panoptic3DModel
34
+ from nvpanoptix_3d.utils.helper import get_kept_mapping, retry_if_cuda_oom
35
+ from nvpanoptix_3d.utils.coords_transform import (
36
+ transform_feat3d_coordinates, fuse_sparse_tensors, generate_multiscale_feat3d
37
+ )
38
+
39
+
40
+ # Weight file names (stored in weights/ subdirectory)
41
+ WEIGHTS_DIR = "weights"
42
+ TORCHSCRIPT_2D_FILENAME = "model_2d_fp32.pt"
43
+ CHECKPOINT_3D_FILENAME = "tao_vggt_front3d.pth"
44
+
45
+
46
+ @dataclass
47
+ class PanopticRecon3DConfig:
48
+ """Configuration for Panoptic Recon 3D model.
49
+
50
+ This config is JSON-serializable and will be saved to config.json
51
+ when using save_pretrained or push_to_hub.
52
+ """
53
+ # Model architecture
54
+ num_classes: int = 13
55
+ num_thing_classes: int = 9
56
+ object_mask_threshold: float = 0.8
57
+ overlap_threshold: float = 0.5
58
+ test_topk_per_image: int = 100
59
+
60
+ # Backbone
61
+ backbone_type: str = "vggt"
62
+
63
+ # Mask Former
64
+ hidden_dim: int = 256
65
+ num_queries: int = 100
66
+ mask_dim: int = 256
67
+ depth_dim: int = 256
68
+ dec_layers: int = 10
69
+
70
+ # 3D Frustum
71
+ frustum_dims: int = 256
72
+ truncation: float = 3.0
73
+ iso_recon_value: float = 2.0
74
+ voxel_size: float = 0.03
75
+
76
+ # Projection
77
+ depth_feature_dim: int = 256
78
+ sign_channel: bool = True
79
+
80
+ # Dataset/preprocessing
81
+ target_size: Tuple[int, int] = (320, 240)
82
+ reduced_target_size: Tuple[int, int] = (160, 120)
83
+ depth_size: Tuple[int, int] = (120, 160)
84
+ depth_min: float = 0.4
85
+ depth_max: float = 6.0
86
+ depth_scale: float = 25.0
87
+ pixel_mean: Tuple[float, float, float] = (0.485, 0.456, 0.406)
88
+ pixel_std: Tuple[float, float, float] = (0.229, 0.224, 0.225)
89
+ ignore_label: int = 255
90
+ size_divisibility: int = 32
91
+ downsample_factor: int = 1
92
+
93
+ # Model paths
94
+ torchscript_2d_path: Optional[str] = None
95
+ use_fp16_2d: bool = False
96
+
97
+ # Dataset mode
98
+ is_matterport: bool = False
99
+
100
+ def to_dict(self) -> Dict[str, Any]:
101
+ """Convert config to dictionary."""
102
+ return {
103
+ "num_classes": self.num_classes,
104
+ "num_thing_classes": self.num_thing_classes,
105
+ "object_mask_threshold": self.object_mask_threshold,
106
+ "overlap_threshold": self.overlap_threshold,
107
+ "test_topk_per_image": self.test_topk_per_image,
108
+ "backbone_type": self.backbone_type,
109
+ "hidden_dim": self.hidden_dim,
110
+ "num_queries": self.num_queries,
111
+ "mask_dim": self.mask_dim,
112
+ "depth_dim": self.depth_dim,
113
+ "dec_layers": self.dec_layers,
114
+ "frustum_dims": self.frustum_dims,
115
+ "truncation": self.truncation,
116
+ "iso_recon_value": self.iso_recon_value,
117
+ "voxel_size": self.voxel_size,
118
+ "depth_feature_dim": self.depth_feature_dim,
119
+ "sign_channel": self.sign_channel,
120
+ "target_size": list(self.target_size),
121
+ "reduced_target_size": list(self.reduced_target_size),
122
+ "depth_size": list(self.depth_size),
123
+ "depth_min": self.depth_min,
124
+ "depth_max": self.depth_max,
125
+ "depth_scale": self.depth_scale,
126
+ "pixel_mean": list(self.pixel_mean),
127
+ "pixel_std": list(self.pixel_std),
128
+ "ignore_label": self.ignore_label,
129
+ "size_divisibility": self.size_divisibility,
130
+ "downsample_factor": self.downsample_factor,
131
+ "torchscript_2d_path": self.torchscript_2d_path,
132
+ "use_fp16_2d": self.use_fp16_2d,
133
+ "is_matterport": self.is_matterport,
134
+ }
135
+
136
+ @classmethod
137
+ def from_dict(cls, config_dict: Dict[str, Any]) -> "PanopticRecon3DConfig":
138
+ """Create config from dictionary."""
139
+ # Convert lists back to tuples
140
+ if "target_size" in config_dict:
141
+ config_dict["target_size"] = tuple(config_dict["target_size"])
142
+ if "reduced_target_size" in config_dict:
143
+ config_dict["reduced_target_size"] = tuple(config_dict["reduced_target_size"])
144
+ if "depth_size" in config_dict:
145
+ config_dict["depth_size"] = tuple(config_dict["depth_size"])
146
+ if "pixel_mean" in config_dict:
147
+ config_dict["pixel_mean"] = tuple(config_dict["pixel_mean"])
148
+ if "pixel_std" in config_dict:
149
+ config_dict["pixel_std"] = tuple(config_dict["pixel_std"])
150
+ return cls(**config_dict)
151
+
152
+
153
+ @dataclass
154
+ class PanopticRecon3DOutput:
155
+ """Output from Panoptic Recon 3D model."""
156
+ # 3D outputs
157
+ panoptic_seg_3d: torch.Tensor # (D, H, W) int32 - panoptic segmentation
158
+ geometry_3d: torch.Tensor # (D, H, W) float32 - TSDF/geometry
159
+ semantic_seg_3d: torch.Tensor # (D, H, W) int32 - semantic segmentation
160
+
161
+ # 2D outputs
162
+ panoptic_seg_2d: torch.Tensor # (H, W) int32 - 2D panoptic segmentation
163
+ depth_2d: torch.Tensor # (H, W) float32 - depth map
164
+
165
+ # Optional metadata
166
+ panoptic_semantic_mapping: Optional[Dict[int, int]] = None
167
+ segments_info: Optional[List[Dict]] = None
168
+
169
+ def to_numpy(self) -> Dict[str, np.ndarray]:
170
+ """Convert outputs to numpy arrays."""
171
+ result = {
172
+ "panoptic_seg_3d": self.panoptic_seg_3d.cpu().numpy(),
173
+ "geometry_3d": self.geometry_3d.cpu().numpy(),
174
+ "semantic_seg_3d": self.semantic_seg_3d.cpu().numpy(),
175
+ "panoptic_seg_2d": self.panoptic_seg_2d.cpu().numpy(),
176
+ "depth_2d": self.depth_2d.cpu().numpy(),
177
+ }
178
+ return result
179
+
180
+
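For reference, a short hedged sketch of consuming this dataclass after `model.predict()`: `to_numpy()` moves the five dense tensors to host memory so they can be archived or inspected outside the CUDA context.

```python
# Sketch only: assumes `outputs` is a PanopticRecon3DOutput from model.predict().
import numpy as np

arrays = outputs.to_numpy()                      # dict of host-side ndarrays
np.savez_compressed("outputs.npz", **arrays)

# The metadata fields stay as plain Python objects.
if outputs.segments_info is not None:
    print(f"{len(outputs.segments_info)} 2D segments detected")
```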
181
+ class PanopticRecon3DModel(
182
+ nn.Module,
183
+ PyTorchModelHubMixin,
184
+ # HuggingFace Hub metadata
185
+ repo_url="nvidia/nvpanoptix-3d",
186
+ pipeline_tag="image-segmentation",
187
+ license="apache-2.0",
188
+ tags=["panoptic-segmentation", "3d-reconstruction", "depth-estimation", "nvidia"],
189
+ ):
190
+ """
191
+ This model performs panoptic 3D scene reconstruction from a single RGB image.
192
+ It combines:
193
+ - 2D panoptic segmentation
194
+ - Depth estimation
195
+ - 3D volumetric reconstruction
196
+
197
+ The model architecture uses:
198
+ - VGGT backbone for feature extraction
199
+ - MaskFormer head for panoptic segmentation
200
+ - Occupancy-aware lifting for 2D-to-3D projection
201
+ - Sparse 3D convolutions for volumetric completion
202
+ """
203
+
204
+ def __init__(
205
+ self,
206
+ num_classes: int = 13,
207
+ num_thing_classes: int = 9,
208
+ object_mask_threshold: float = 0.8,
209
+ overlap_threshold: float = 0.5,
210
+ frustum_dims: int = 256,
211
+ truncation: float = 3.0,
212
+ iso_recon_value: float = 2.0,
213
+ voxel_size: float = 0.03,
214
+ depth_min: float = 0.4,
215
+ depth_max: float = 6.0,
216
+ target_size: Tuple[int, int] = (320, 240),
217
+ reduced_target_size: Tuple[int, int] = (160, 120),
218
+ size_divisibility: int = 32,
219
+ downsample_factor: int = 1,
220
+ is_matterport: bool = False,
221
+ torchscript_2d_path: Optional[str] = None,
222
+ use_fp16_2d: bool = False,
223
+ **kwargs,
224
+ ):
225
+ """Initialize Panoptic Recon 3D model.
226
+
227
+ Args:
228
+ num_classes: Number of semantic classes.
229
+ num_thing_classes: Number of "thing" (instance) classes.
230
+ object_mask_threshold: Threshold for object mask confidence.
231
+ overlap_threshold: Threshold for mask overlap.
232
+ frustum_dims: Dimensions of 3D frustum volume.
233
+ truncation: TSDF truncation distance.
234
+ iso_recon_value: Iso-surface value for mesh extraction.
235
+ voxel_size: Voxel size in meters.
236
+ depth_min: Minimum depth value.
237
+ depth_max: Maximum depth value.
238
+ target_size: Target image size (width, height).
239
+ reduced_target_size: Reduced target size for 3D projection.
240
+ size_divisibility: Size divisibility for padding.
241
+ downsample_factor: Downsample factor for 3D reconstruction.
242
+ is_matterport: Whether using Matterport dataset mode.
243
+ torchscript_2d_path: Path to TorchScript 2D model (optional).
244
+ use_fp16_2d: Whether to use FP16 for 2D model.
245
+ """
246
+ super().__init__()
247
+
248
+ # Store config as attributes (for PyTorchModelHubMixin serialization)
249
+ self.num_classes = num_classes
250
+ self.num_thing_classes = num_thing_classes
251
+ self.object_mask_threshold = object_mask_threshold
252
+ self.overlap_threshold = overlap_threshold
253
+ self.frustum_dims_val = frustum_dims
254
+ self.truncation = truncation
255
+ self.iso_recon_value = iso_recon_value
256
+ self.voxel_size = voxel_size
257
+ self.depth_min = depth_min
258
+ self.depth_max = depth_max
259
+ self.target_size = target_size
260
+ self.reduced_target_size = reduced_target_size
261
+ self.size_divisibility = size_divisibility
262
+ self.downsample_factor = downsample_factor
263
+ self.is_matterport = is_matterport
264
+ self.torchscript_2d_path = torchscript_2d_path
265
+ self.use_fp16_2d = use_fp16_2d
266
+
267
+ # Derived values
268
+ self.frustum_dims = [frustum_dims] * 3
269
+
270
+ # Models will be loaded on first use or via load_weights
271
+ self.model_2d: Optional[torch.jit.ScriptModule] = None
272
+ self.model_3d_components: Optional[Dict[str, nn.Module]] = None
273
+ self._initialized = False
274
+
275
+ # Placeholder for post processor
276
+ self.post_processor = None
277
+
278
+ @classmethod
279
+ def _from_pretrained(
280
+ cls,
281
+ *,
282
+ model_id: str,
283
+ revision: Optional[str] = None,
284
+ cache_dir: Optional[str] = None,
285
+ force_download: bool = False,
286
+ proxies: Optional[Dict] = None,
287
+ resume_download: bool = False,
288
+ local_files_only: bool = False,
289
+ token: Optional[Union[str, bool]] = None,
290
+ map_location: str = "cpu",
291
+ strict: bool = False,
292
+ **model_kwargs,
293
+ ) -> "PanopticRecon3DModel":
294
+ """Load model from HuggingFace Hub or local directory.
295
+
296
+ This method handles loading both the TorchScript 2D model and the 3D checkpoint.
297
+
298
+ Args:
299
+ model_id: HuggingFace Hub repo ID or local directory path.
300
+ revision: Git revision (branch, tag, or commit hash).
301
+ cache_dir: Cache directory for downloaded files.
302
+ force_download: Force re-download even if cached.
303
+ proxies: Proxy configuration.
304
+ resume_download: Resume interrupted downloads.
305
+ local_files_only: Only use local files, don't download.
306
+ token: HuggingFace API token.
307
+ map_location: Device to load model onto.
308
+ strict: Strict loading (not used for this model).
309
+ **model_kwargs: Additional model arguments.
310
+
311
+ Returns:
312
+ Initialized PanopticRecon3DModel with weights loaded.
313
+ """
314
+ # Determine device
315
+ device = model_kwargs.pop("device", None)
316
+ if device is None:
317
+ device = map_location if map_location != "cpu" else ("cuda:0" if torch.cuda.is_available() else "cpu")
318
+
319
+ # Check if local directory
320
+ model_path = Path(model_id)
321
+ if model_path.exists() and model_path.is_dir():
322
+ # Local directory
323
+ config_path = model_path / "config.json"
324
+ weights_dir = model_path / WEIGHTS_DIR
325
+ torchscript_2d_path = weights_dir / TORCHSCRIPT_2D_FILENAME
326
+ checkpoint_3d_path = weights_dir / CHECKPOINT_3D_FILENAME
327
+
328
+ # Load config if exists
329
+ if config_path.exists():
330
+ with open(config_path, "r") as f:
331
+ config = json.load(f)
332
+ # Merge with model_kwargs (model_kwargs take precedence)
333
+ for key, value in config.items():
334
+ if key not in model_kwargs:
335
+ model_kwargs[key] = value
336
+ else:
337
+ # HuggingFace Hub - download files
338
+ # Download config.json
339
+ try:
340
+ config_file = hf_hub_download(
341
+ repo_id=model_id,
342
+ filename="config.json",
343
+ revision=revision,
344
+ cache_dir=cache_dir,
345
+ force_download=force_download,
346
+ proxies=proxies,
347
+ resume_download=resume_download,
348
+ local_files_only=local_files_only,
349
+ token=token,
350
+ )
351
+ with open(config_file, "r") as f:
352
+ config = json.load(f)
353
+ for key, value in config.items():
354
+ if key not in model_kwargs:
355
+ model_kwargs[key] = value
356
+ except Exception:
357
+ pass # Config is optional
358
+
359
+ # Download weight files
360
+ torchscript_2d_path = hf_hub_download(
361
+ repo_id=model_id,
362
+ filename=f"{WEIGHTS_DIR}/{TORCHSCRIPT_2D_FILENAME}",
363
+ revision=revision,
364
+ cache_dir=cache_dir,
365
+ force_download=force_download,
366
+ proxies=proxies,
367
+ resume_download=resume_download,
368
+ local_files_only=local_files_only,
369
+ token=token,
370
+ )
371
+ checkpoint_3d_path = hf_hub_download(
372
+ repo_id=model_id,
373
+ filename=f"{WEIGHTS_DIR}/{CHECKPOINT_3D_FILENAME}",
374
+ revision=revision,
375
+ cache_dir=cache_dir,
376
+ force_download=force_download,
377
+ proxies=proxies,
378
+ resume_download=resume_download,
379
+ local_files_only=local_files_only,
380
+ token=token,
381
+ )
382
+
383
+ # Create model instance
384
+ model = cls(**model_kwargs)
385
+
386
+ # Load weights
387
+ model.load_weights(
388
+ torchscript_2d_path=str(torchscript_2d_path),
389
+ checkpoint_3d_path=str(checkpoint_3d_path),
390
+ device=device,
391
+ )
392
+
393
+ return model
394
+
395
+ def _save_pretrained(self, save_directory: Path) -> None:
396
+ """Save model to directory.
397
+
398
+ This saves the config.json and copies weight files to the directory.
399
+
400
+ Args:
401
+ save_directory: Directory to save model to.
402
+ """
403
+ save_directory = Path(save_directory)
404
+ save_directory.mkdir(parents=True, exist_ok=True)
405
+
406
+ # Save config
407
+ config = {
408
+ "num_classes": self.num_classes,
409
+ "num_thing_classes": self.num_thing_classes,
410
+ "object_mask_threshold": self.object_mask_threshold,
411
+ "overlap_threshold": self.overlap_threshold,
412
+ "frustum_dims": self.frustum_dims_val,
413
+ "truncation": self.truncation,
414
+ "iso_recon_value": self.iso_recon_value,
415
+ "voxel_size": self.voxel_size,
416
+ "depth_min": self.depth_min,
417
+ "depth_max": self.depth_max,
418
+ "target_size": list(self.target_size),
419
+ "reduced_target_size": list(self.reduced_target_size),
420
+ "size_divisibility": self.size_divisibility,
421
+ "downsample_factor": self.downsample_factor,
422
+ "is_matterport": self.is_matterport,
423
+ "use_fp16_2d": self.use_fp16_2d,
424
+ }
425
+
426
+ config_path = save_directory / "config.json"
427
+ with open(config_path, "w") as f:
428
+ json.dump(config, f, indent=2)
429
+
430
+ # Create weights directory and copy/save weights
431
+ weights_dir = save_directory / WEIGHTS_DIR
432
+ weights_dir.mkdir(exist_ok=True)
433
+
434
+ # Note: Weight files should be copied manually or the model
435
+ # should be saved from a loaded state
436
+ if self._initialized and hasattr(self, '_torchscript_2d_path'):
437
+ import shutil
438
+ # Copy TorchScript 2D model
439
+ src_2d = Path(self._torchscript_2d_path)
440
+ if src_2d.exists():
441
+ shutil.copy2(src_2d, weights_dir / TORCHSCRIPT_2D_FILENAME)
442
+
443
+ # Copy 3D checkpoint
444
+ src_3d = Path(self._checkpoint_3d_path)
445
+ if src_3d.exists():
446
+ shutil.copy2(src_3d, weights_dir / CHECKPOINT_3D_FILENAME)
447
+
448
+ def _build_omegaconf(self) -> Any:
449
+ """Build OmegaConf config for internal model components."""
450
+
451
+ return OmegaConf.create({
452
+ "model": {
453
+ "export": True,
454
+ "mode": "panoptic",
455
+ "object_mask_threshold": self.object_mask_threshold,
456
+ "overlap_threshold": self.overlap_threshold,
457
+ "test_topk_per_image": 100,
458
+ "backbone": {"type": "vggt", "pretrained_weights": None},
459
+ "sem_seg_head": {
460
+ "common_stride": 4,
461
+ "transformer_enc_layers": 6,
462
+ "convs_dim": 256,
463
+ "mask_dim": 256,
464
+ "depth_dim": 256,
465
+ "ignore_value": 255,
466
+ "deformable_transformer_encoder_in_features": ["res3", "res4", "res5"],
467
+ "num_classes": self.num_classes,
468
+ "norm": "GN",
469
+ "in_features": ["res2", "res3", "res4", "res5"]
470
+ },
471
+ "mask_former": {
472
+ "dropout": 0.0,
473
+ "nheads": 8,
474
+ "num_object_queries": 100,
475
+ "hidden_dim": 256,
476
+ "transformer_dim_feedforward": 1024,
477
+ "dim_feedforward": 2048,
478
+ "dec_layers": 10,
479
+ "pre_norm": False,
480
+ "class_weight": 2.0,
481
+ "dice_weight": 5.0,
482
+ "mask_weight": 5.0,
483
+ "depth_weight": 5.0,
484
+ "mp_occ_weight": 5.0,
485
+ "train_num_points": 12544,
486
+ "oversample_ratio": 3.0,
487
+ "importance_sample_ratio": 0.75,
488
+ "deep_supervision": True,
489
+ "no_object_weight": 0.1,
490
+ "size_divisibility": self.size_divisibility
491
+ },
492
+ "frustum3d": {
493
+ "truncation": self.truncation,
494
+ "iso_recon_value": self.iso_recon_value,
495
+ "panoptic_weight": 25.0,
496
+ "completion_weights": [50.0, 25.0, 10.0],
497
+ "surface_weight": 5.0,
498
+ "unet_output_channels": 16,
499
+ "unet_features": 16,
500
+ "use_multi_scale": False,
501
+ "grid_dimensions": self.frustum_dims_val,
502
+ "frustum_dims": self.frustum_dims_val,
503
+ "signed_channel": 3
504
+ },
505
+ "projection": {
506
+ "voxel_size": self.voxel_size,
507
+ "sign_channel": True,
508
+ "depth_feature_dim": 256
509
+ }
510
+ },
511
+ "dataset": {
512
+ "contiguous_id": False,
513
+ "label_map": "",
514
+ "name": "", # Empty string to match Triton behavior (triggers adjust_intrinsic)
515
+ "downsample_factor": self.downsample_factor,
516
+ "iso_value": 1.0,
517
+ "pixel_mean": [0.485, 0.456, 0.406],
518
+ "pixel_std": [0.229, 0.224, 0.225],
519
+ "ignore_label": 255,
520
+ "min_instance_pixels": 200,
521
+ "img_format": "RGB",
522
+ "target_size": list(self.target_size),
523
+ "reduced_target_size": list(self.reduced_target_size),
524
+ "depth_size": [120, 160],
525
+ "depth_bound": False,
526
+ "depth_min": self.depth_min,
527
+ "depth_max": self.depth_max,
528
+ "frustum_mask_path": "",
529
+ "occ_truncation_lvl": [8.0, 6.0],
530
+ "truncation_range": [0.0, 12.0],
531
+ "enable_3d": False,
532
+ "enable_mp_occ": True,
533
+ "depth_scale": 25.0,
534
+ "num_thing_classes": self.num_thing_classes,
535
+ "augmentation": {"size_divisibility": self.size_divisibility}
536
+ }
537
+ })
538
+
539
+ def load_weights(
540
+ self,
541
+ torchscript_2d_path: Optional[str] = None,
542
+ checkpoint_3d_path: Optional[str] = None,
543
+ device: str = "cuda:0",
544
+ ):
545
+ """Load model weights.
546
+
547
+ Args:
548
+ torchscript_2d_path: Path to TorchScript 2D model file.
549
+ checkpoint_3d_path: Path to 3D model checkpoint (.pth/.pt).
550
+ device: Device to load models onto.
551
+ """
552
+ # Use stored path if not provided
553
+ torchscript_2d_path = torchscript_2d_path or self.torchscript_2d_path
554
+
555
+ if torchscript_2d_path is None:
556
+ raise ValueError("torchscript_2d_path is required")
557
+ if checkpoint_3d_path is None:
558
+ raise ValueError("checkpoint_3d_path is required")
559
+
560
+ # Store paths for save_pretrained
561
+ self._torchscript_2d_path = torchscript_2d_path
562
+ self._checkpoint_3d_path = checkpoint_3d_path
563
+
564
+ # Build config
565
+ cfg = self._build_omegaconf()
566
+
567
+ # Load 2D TorchScript model
568
+ self.model_2d = torch.jit.load(torchscript_2d_path, map_location=device)
569
+ self.model_2d.eval()
570
+
571
+ # Load 3D model from checkpoint
572
+ full_model = Panoptic3DModel(cfg)
573
+
574
+ checkpoint = torch.load(checkpoint_3d_path, map_location="cpu")
575
+ state_dict = checkpoint.get("state_dict", checkpoint)
576
+
577
+ # Remove 'model.' prefix if present
578
+ filtered_state_dict = {}
579
+ for key, value in state_dict.items():
580
+ new_key = key[6:] if key.startswith("model.") else key
581
+ filtered_state_dict[new_key] = value
582
+
583
+ full_model.load_state_dict(filtered_state_dict, strict=False)
584
+ full_model.to(device)
585
+ full_model.eval()
586
+
587
+ # Extract 3D components
588
+ self.model_3d_components = {
589
+ "ol": full_model.ol,
590
+ "reprojection": full_model.reprojection,
591
+ "completion": full_model.completion,
592
+ "projector": full_model.projector,
593
+ "back_projection": full_model.back_projection,
594
+ }
595
+
596
+ # Store post processor and helper functions
597
+ self.post_processor = full_model.post_processor
598
+ self.back_projection = full_model.back_projection # Required for get_kept_mapping
599
+ self._get_kept_mapping = get_kept_mapping
600
+ self._transform_feat3d_coordinates = transform_feat3d_coordinates
601
+ self._fuse_sparse_tensors = fuse_sparse_tensors
602
+ self._generate_multiscale_feat3d = generate_multiscale_feat3d
603
+ self._retry_if_cuda_oom = retry_if_cuda_oom
604
+ self._panoptic_3d_inference = full_model.panoptic_3d_inference
605
+ self._postprocess = full_model.postprocess
606
+ self._cfg = cfg
607
+
608
+ # Disable gradients for all components
609
+ for module in self.model_3d_components.values():
610
+ for param in module.parameters():
611
+ param.requires_grad = False
612
+ module.eval()
613
+
614
+ self._initialized = True
615
+ self._device = device
616
+
617
+ def _ensure_initialized(self):
618
+ """Ensure model is initialized."""
619
+ if not self._initialized:
620
+ raise RuntimeError(
621
+ "Model weights not loaded. Call load_weights() first, or use "
622
+ "from_pretrained() to load a pre-trained model."
623
+ )
624
+
625
+ def _infer_2d(
626
+ self,
627
+ images: torch.Tensor,
628
+ intrinsic: torch.Tensor,
629
+ ) -> Tuple[Dict[str, torch.Tensor], List[Dict], torch.Tensor]:
630
+ """Run 2D inference using TorchScript model.
631
+
632
+ Args:
633
+ images: Input images (B, C, H, W) as uint8 or float.
634
+ intrinsic: Camera intrinsics (B, 4, 4).
635
+
636
+ Returns:
637
+ outputs_2d: Dictionary of 2D model outputs.
638
+ processed_results: List of processed results per image.
639
+ occupancy_pred: Occupancy predictions.
640
+ """
641
+ # Run 2D model
642
+ with torch.no_grad():
643
+ if self.use_fp16_2d:
644
+ with torch.cuda.amp.autocast():
645
+ outputs_dict = self.model_2d(images)
646
+ else:
647
+ outputs_dict = self.model_2d(images)
648
+
649
+ # Normalize to FP32
650
+ def to_fp32(x):
651
+ return x.float() if isinstance(x, torch.Tensor) and x.dtype != torch.float32 else x
652
+
653
+ # Extract outputs
654
+ mask_cls_results = to_fp32(outputs_dict["pred_logits"])
655
+ mask_pred_results = to_fp32(outputs_dict["pred_masks"])
656
+ depth_pred_results = to_fp32(outputs_dict["pred_depths"])
657
+
658
+ enc_features = [
659
+ to_fp32(outputs_dict["enc_features_0"]),
660
+ to_fp32(outputs_dict["enc_features_1"]),
661
+ to_fp32(outputs_dict["enc_features_2"]),
662
+ to_fp32(outputs_dict["enc_features_3"]),
663
+ ]
664
+ mask_features = to_fp32(outputs_dict["mask_features"])
665
+ depth_features = to_fp32(outputs_dict["depth_features"])
666
+ segm_decoder_out = to_fp32(outputs_dict["segm_decoder_out"])
667
+ pose_enc = to_fp32(outputs_dict["pose_enc"])
668
+ occupancy_pred = to_fp32(outputs_dict["occupancy_pred"])
669
+
670
+ orig_pad_h = int(outputs_dict["orig_pad_h"].item())
671
+ orig_pad_w = int(outputs_dict["orig_pad_w"].item())
672
+ orig_h = int(outputs_dict["orig_h"].item())
673
+ orig_w = int(outputs_dict["orig_w"].item())
674
+
675
+ # Interpolate masks and depths
676
+ padded_out_h, padded_out_w = orig_pad_h // 2, orig_pad_w // 2
677
+ mask_pred_results = F.interpolate(
678
+ mask_pred_results,
679
+ size=(padded_out_h, padded_out_w),
680
+ mode="bilinear",
681
+ align_corners=False,
682
+ )
683
+ depth_pred_results = F.interpolate(
684
+ depth_pred_results,
685
+ size=(padded_out_h, padded_out_w),
686
+ mode="bilinear",
687
+ align_corners=False,
688
+ )
689
+
690
+ # Postprocess each image
691
+ # NOTE: We need to track the CROPPED mask_pred_result for outputs_2d
692
+ # (matching the Triton model behavior)
693
+ processed_results = []
694
+ final_mask_cls_result = None
695
+ final_mask_pred_result = None
696
+
697
+ for idx, (mask_cls_result, mask_pred_result, depth_pred_result, per_image_intrinsic) in enumerate(zip(
698
+ mask_cls_results, mask_pred_results, depth_pred_results, intrinsic
699
+ )):
700
+ out_h, out_w = orig_h // 2, orig_w // 2
701
+ processed_results.append({})
702
+
703
+ # Remove padding - OVERWRITE the variable like Triton does
704
+ mask_pred_result = mask_pred_result[:, :out_h, :out_w]
705
+ depth_pred_result = depth_pred_result[:, :out_h, :out_w]
706
+
707
+ # Panoptic inference
708
+ panoptic_seg, depth_r, segments_info, sem_prob_masks = self._retry_if_cuda_oom(
709
+ self.post_processor.panoptic_inference
710
+ )(
711
+ mask_cls_result,
712
+ mask_pred_result,
713
+ depth_pred_result
714
+ )
715
+ depth_r = depth_r[None]
716
+
717
+ processed_results[-1]["panoptic_seg"] = (panoptic_seg, segments_info)
718
+ processed_results[-1]["depth"] = depth_r[0]
719
+ processed_results[-1]["image_size"] = (orig_w, orig_h)
720
+ processed_results[-1]["padded_size"] = (orig_pad_w, orig_pad_h)
721
+ processed_results[-1]["intrinsic"] = per_image_intrinsic
722
+ processed_results[-1]["sem_seg"] = sem_prob_masks
723
+
724
+ # Store last iteration's results for outputs_2d (matching Triton behavior)
725
+ final_mask_cls_result = mask_cls_result
726
+ final_mask_pred_result = mask_pred_result
727
+
728
+ # Reconstruct outputs_2d - use CROPPED mask_pred_result from last iteration
729
+ # This matches the Triton model's behavior exactly
730
+ outputs_2d = {
731
+ "pred_logits": final_mask_cls_result.unsqueeze(0),
732
+ "pred_masks": final_mask_pred_result.unsqueeze(0),
733
+ "enc_features": enc_features,
734
+ "mask_features": mask_features,
735
+ "depth_features": depth_features,
736
+ "segm_decoder_out": segm_decoder_out,
737
+ "pose_enc": pose_enc,
738
+ }
739
+
740
+ return outputs_2d, processed_results, occupancy_pred
741
+
742
+ def _forward_3d(
743
+ self,
744
+ batched_inputs: Dict[str, torch.Tensor],
745
+ outputs_2d: Dict[str, torch.Tensor],
746
+ processed_results: List[Dict],
747
+ kept: torch.Tensor,
748
+ mapping: torch.Tensor,
749
+ occupancy_pred: torch.Tensor,
750
+ ) -> Dict[str, Any]:
751
+ """Run 3D reconstruction pipeline.
752
+
753
+ Args:
754
+ batched_inputs: Dictionary containing frustum_mask, intrinsic, etc.
755
+ outputs_2d: 2D model outputs.
756
+ processed_results: Processed 2D results.
757
+ kept: Kept voxel indices.
758
+ mapping: Voxel to pixel mapping.
759
+ occupancy_pred: Occupancy predictions.
760
+
761
+ Returns:
762
+ Postprocessed 3D results.
763
+ """
764
+ room_mask = batched_inputs.get("room_mask_buol") if self.is_matterport else None
765
+
766
+ # Occupancy-aware lifting
767
+ feat3d, mask3d = self.model_3d_components["ol"](
768
+ processed_results, kept, mapping, occupancy_pred, room_mask
769
+ )
770
+ del occupancy_pred, mask3d
771
+ torch.cuda.empty_cache()
772
+
773
+ # Project features
774
+ multi_scale_features = list(reversed(outputs_2d["enc_features"]))
775
+ depth_features = self.model_3d_components["projector"](
776
+ outputs_2d["depth_features"],
777
+ outputs_2d["mask_features"].shape[-2:]
778
+ )
779
+ encoder_features = torch.cat([outputs_2d["mask_features"], depth_features], dim=1)
780
+
781
+ sparse_multi_scale_features, sparse_encoder_features = self.model_3d_components["reprojection"](
782
+ multi_scale_features, encoder_features, processed_results
783
+ )
784
+
785
+ del multi_scale_features, encoder_features
786
+ torch.cuda.empty_cache()
787
+
788
+ # Prepare 3D inputs
789
+ segm_queries = outputs_2d["segm_decoder_out"]
790
+ frustum_mask = batched_inputs["frustum_mask"]
791
+ intrinsic = batched_inputs["intrinsic"]
792
+
793
+ frustum_mask_64 = F.max_pool3d(
794
+ frustum_mask[:, None].float(),
795
+ kernel_size=2,
796
+ stride=4
797
+ ).bool()
798
+
799
+ # Transform 3D coordinates
800
+ transformed_feat3d = self._transform_feat3d_coordinates(feat3d, intrinsic)
801
+ del feat3d
802
+
803
+ # Fuse features
804
+ if not self.is_matterport:
805
+ multi_scale_feat3d = self._generate_multiscale_feat3d(transformed_feat3d)
806
+ fused_multi_scale_features = [
807
+ self._fuse_sparse_tensors(sparse_multi_scale_features[i], multi_scale_feat3d[i])
808
+ for i in range(len(multi_scale_feat3d))
809
+ ]
810
+ del sparse_multi_scale_features, multi_scale_feat3d
811
+ else:
812
+ fused_multi_scale_features = sparse_multi_scale_features
813
+
814
+ try:
815
+ fused_encoder_features = self._fuse_sparse_tensors(
816
+ sparse_encoder_features, transformed_feat3d
817
+ )
818
+ except Exception:
819
+ fused_encoder_features = sparse_encoder_features
820
+
821
+ del sparse_encoder_features, transformed_feat3d
822
+ torch.cuda.empty_cache()
823
+
824
+ # Run 3D completion
825
+ outputs_3d = self.model_3d_components["completion"](
826
+ fused_multi_scale_features,
827
+ fused_encoder_features,
828
+ segm_queries,
829
+ frustum_mask_64
830
+ )
831
+
832
+ outputs_3d["pred_logits"] = outputs_2d["pred_logits"]
833
+ outputs_3d["pred_masks"] = outputs_2d["pred_masks"]
834
+
835
+ return self._postprocess(outputs_3d, outputs_2d, processed_results, frustum_mask)
836
+
837
+ def forward(
838
+ self,
839
+ images: torch.Tensor,
840
+ frustum_mask: torch.Tensor,
841
+ intrinsic: torch.Tensor,
842
+ height: Optional[torch.Tensor] = None,
843
+ width: Optional[torch.Tensor] = None,
844
+ ) -> PanopticRecon3DOutput:
845
+ """Run full panoptic 3D reconstruction pipeline.
846
+
847
+ Args:
848
+ images: Input images (B, C, H, W) as uint8 [0-255] or float [0-1].
849
+ frustum_mask: Boolean frustum mask (B, D, H, W).
850
+ intrinsic: Camera intrinsic matrices (B, 4, 4).
851
+ height: Optional image heights (B,).
852
+ width: Optional image widths (B,).
853
+
854
+ Returns:
855
+ PanopticRecon3DOutput with 2D and 3D predictions.
856
+ """
857
+ self._ensure_initialized()
858
+
859
+ # Prepare inputs
860
+ if height is None:
861
+ height = torch.tensor([images.shape[2]], device=images.device)
862
+ if width is None:
863
+ width = torch.tensor([images.shape[3]], device=images.device)
864
+
865
+ batched_inputs = {
866
+ "image": images,
867
+ "frustum_mask": frustum_mask.bool(),
868
+ "intrinsic": intrinsic,
869
+ "height": height,
870
+ "width": width,
871
+ }
872
+
873
+ # Run 2D inference
874
+ outputs_2d, processed_results, occupancy_pred = self._infer_2d(images, intrinsic)
875
+
876
+ # Compute kept and mapping (self has back_projection attribute)
877
+ kept, mapping = self._get_kept_mapping(
878
+ self,
879
+ self._cfg,
880
+ batched_inputs,
881
+ device=images.device
882
+ )
883
+
884
+ # Run 3D inference
885
+ outputs_3d = self._forward_3d(
886
+ batched_inputs, outputs_2d, processed_results, kept, mapping, occupancy_pred
887
+ )
888
+
889
+ # Create output object
890
+ return PanopticRecon3DOutput(
891
+ panoptic_seg_3d=outputs_3d["panoptic_seg"][0],
892
+ geometry_3d=outputs_3d["geometry"][0],
893
+ semantic_seg_3d=outputs_3d["semantic_seg"][0],
894
+ panoptic_seg_2d=outputs_3d["panoptic_seg_2d"][0][0],
895
+ depth_2d=outputs_3d["depth"][0],
896
+ panoptic_semantic_mapping=outputs_3d["panoptic_semantic_mapping"][0],
897
+ segments_info=outputs_3d["panoptic_seg_2d"][0][1] if len(outputs_3d["panoptic_seg_2d"][0]) > 1 else None,
898
+ )
899
+
900
+ @torch.no_grad()
901
+ def predict(
902
+ self,
903
+ image: Union[np.ndarray, torch.Tensor],
904
+ frustum_mask: Optional[Union[np.ndarray, torch.Tensor]] = None,
905
+ intrinsic: Optional[Union[np.ndarray, torch.Tensor]] = None,
906
+ ) -> PanopticRecon3DOutput:
907
+ """User-friendly prediction interface.
908
+
909
+ Args:
910
+ image: Input RGB image as numpy array. Accepted formats:
911
+ - (H, W, C) HWC format uint8 [0-255]
912
+ - (C, H, W) CHW format uint8 [0-255]
913
+ - (1, C, H, W) batched CHW format (from load_image)
914
+ frustum_mask: Optional frustum mask. If None, auto-generated using default intrinsic.
915
+ intrinsic: Optional camera intrinsic (4x4). If None, uses DEFAULT_INTRINSIC.
916
+
917
+ Returns:
918
+ PanopticRecon3DOutput with predictions.
919
+ """
920
+ self._ensure_initialized()
921
+
922
+ # Use default intrinsic if not provided
923
+ if intrinsic is None:
924
+ intrinsic = DEFAULT_INTRINSIC.copy()
925
+
926
+ # Process image - match test_triton_server.py preprocessing exactly
927
+ if isinstance(image, np.ndarray):
928
+ # Handle different input formats
929
+ if image.ndim == 4:
930
+ # Already batched (1, C, H, W) - from load_image
931
+ pass
932
+ elif image.ndim == 3:
933
+ if image.shape[2] == 3:
934
+ # HWC format -> CHW format
935
+ image = np.ascontiguousarray(image.transpose(2, 0, 1))
936
+ # Now it's CHW, add batch dimension
937
+ image = image[np.newaxis, ...]
938
+
939
+ # Ensure uint8
940
+ if image.dtype != np.uint8:
941
+ if image.max() <= 1.0:
942
+ image = (image * 255).clip(0, 255).astype(np.uint8)
943
+ else:
944
+ image = image.clip(0, 255).astype(np.uint8)
945
+
946
+ image = torch.from_numpy(image)
947
+ else:
948
+ # Tensor input
949
+ if image.dim() == 3:
950
+ image = image.unsqueeze(0)
951
+ # Ensure uint8
952
+ if image.dtype != torch.uint8:
953
+ if image.max() <= 1.0:
954
+ image = (image * 255).clamp(0, 255).to(torch.uint8)
955
+ else:
956
+ image = image.clamp(0, 255).to(torch.uint8)
957
+
958
+ image = image.to(self._device)
959
+
960
+ # Generate frustum mask if not provided
961
+ if frustum_mask is None:
962
+ intrinsic_np = intrinsic if isinstance(intrinsic, np.ndarray) else intrinsic.cpu().numpy()
963
+ frustum_mask = create_frustum_mask(
964
+ intrinsics=intrinsic_np,
965
+ volume_shape=(self.frustum_dims_val,) * 3,
966
+ depth_range=(self.depth_min, self.depth_max),
967
+ voxel_size=self.voxel_size,
968
+ image_shape=(self.target_size[1], self.target_size[0]),
969
+ )
970
+ frustum_mask = torch.from_numpy(frustum_mask).unsqueeze(0)
971
+ elif isinstance(frustum_mask, np.ndarray):
972
+ frustum_mask = torch.from_numpy(frustum_mask)
973
+ if frustum_mask.dim() == 3:
974
+ frustum_mask = frustum_mask.unsqueeze(0)
975
+
976
+ frustum_mask = frustum_mask.to(self._device)
977
+
978
+ # Convert intrinsic to tensor
979
+ if isinstance(intrinsic, np.ndarray):
980
+ intrinsic = torch.from_numpy(intrinsic)
981
+ if intrinsic.dim() == 2:
982
+ intrinsic = intrinsic.unsqueeze(0)
983
+
984
+ intrinsic = intrinsic.float().to(self._device)
985
+
986
+ return self.forward(image, frustum_mask, intrinsic)
987
+
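Beyond `from_pretrained`, the class above also exposes `load_weights` for attaching local weight files directly. A minimal sketch, assuming the two files from the repo's `weights/` directory sit next to the code (the file names match the constants declared at the top of model.py):

```python
from model import PanopticRecon3DModel

# Build the module with default hyperparameters, then load the TorchScript
# 2D model and the 3D checkpoint from local paths (illustrative, relative
# to the repo root). The pipeline is intended to run on a CUDA device.
model = PanopticRecon3DModel()
model.load_weights(
    torchscript_2d_path="weights/model_2d_fp32.pt",
    checkpoint_3d_path="weights/tao_vggt_front3d.pth",
    device="cuda:0",
)
```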
nvpanoptix_3d/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Panoptic Recon 3D model module."""
nvpanoptix_3d/blocks.py ADDED
@@ -0,0 +1,417 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Blocks for Panoptic Recon 3D."""
16
+
17
+ import torch.nn as nn
18
+ from torch import Tensor
19
+ from typing import Optional
20
+ import torch.nn.functional as F
21
+ import MinkowskiEngine as Me
22
+
23
+
24
+ class ProjectionBlock(nn.Module):
25
+ """Projection block for depth projection."""
26
+
27
+ def __init__(self, in_feature, out_feature):
28
+ """Init"""
29
+ super().__init__()
30
+ self.conv_block1 = nn.Sequential(
31
+ nn.Conv2d(in_feature, out_feature, kernel_size=3, stride=2, padding=1),
32
+ nn.BatchNorm2d(out_feature),
33
+ nn.ReLU(True)
34
+ )
35
+ self.conv_block2 = nn.Conv2d(
36
+ out_feature, out_feature,
37
+ kernel_size=1, stride=1,
38
+ padding=0
39
+ )
40
+
41
+ def forward(self, x, target_size):
42
+ """Forward"""
43
+ x = self.conv_block1(x)
44
+ x = F.interpolate(x, size=target_size, mode="bilinear", align_corners=False)
45
+ x = self.conv_block2(x)
46
+ return x
47
+
48
+
49
+ class ConvBlock(nn.Module):
50
+ """Conv block for depth projection."""
51
+
52
+ def __init__(self, in_feature, out_feature):
53
+ """Init"""
54
+ super().__init__()
55
+ self.conv_block = nn.Sequential(
56
+ nn.Conv2d(in_feature, out_feature, kernel_size=3, stride=2, padding=1),
57
+ nn.BatchNorm2d(out_feature),
58
+ nn.ReLU(True)
59
+ )
60
+
61
+ def forward(self, x):
62
+ """Forward"""
63
+ return self.conv_block(x)
64
+
65
+
66
+ class DepthProjector(nn.Module):
67
+ """Depth projector module."""
68
+
69
+ def __init__(
70
+ self,
71
+ in_channels: int = 256,
72
+ out_channels: int = 256,
73
+ num_proj_convs: int = 4,
74
+ **kwargs
75
+ ):
76
+ """Init"""
77
+ super(DepthProjector, self).__init__()
78
+ self.proj_convs1 = nn.ModuleList([
79
+ ConvBlock(in_channels, in_channels) for _ in range(num_proj_convs)
80
+ ])
81
+ self.proj_convs2 = nn.ModuleList([
82
+ nn.Conv2d(
83
+ in_channels, out_channels,
84
+ kernel_size=1, stride=1,
85
+ padding=0
86
+ ) for _ in range(num_proj_convs)
87
+ ])
88
+
89
+ def forward(self, depth_features, depth_feature_shape, size_list):
90
+ """Forward"""
91
+ output_list = []
92
+ size_list.append(depth_feature_shape)
93
+ for i, (_, feat_shape) in enumerate(zip(
94
+ self.proj_convs1,
95
+ size_list[::-1]
96
+ )):
97
+ feat = depth_features[i]
98
+ output = self.proj_convs1[i](feat)
99
+ output = F.interpolate(output, feat_shape, mode="bilinear", align_corners=False)
100
+ output = self.proj_convs2[i](output)
101
+ output_list.append(output)
102
+
103
+ return depth_features[-1], output_list[1:][::-1]
104
+
105
+
106
+ class SelfAttentionLayer(nn.Module):
107
+ """Self Attention Layer."""
108
+
109
+ def __init__(
110
+ self, d_model, nhead, dropout=0.0,
111
+ activation="relu", normalize_before=False, export=False
112
+ ):
113
+ """Init."""
114
+ super().__init__()
115
+ self.export = export
116
+ if export:
117
+ self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
118
+ else:
119
+ self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
120
+
121
+ self.norm = nn.LayerNorm(d_model)
122
+ self.dropout = nn.Dropout(dropout)
123
+
124
+ self.activation = _get_activation_fn(activation)
125
+ self.normalize_before = normalize_before
126
+
127
+ self._reset_parameters()
128
+
129
+ def _reset_parameters(self):
130
+ """Reset parameters."""
131
+ for p in self.parameters():
132
+ if p.dim() > 1:
133
+ nn.init.xavier_uniform_(p)
134
+
135
+ def with_pos_embed(self, tensor, pos: Optional[Tensor]):
136
+ """Add positional embedding."""
137
+ return tensor if pos is None else tensor + pos
138
+
139
+ def forward_post(
140
+ self, tgt,
141
+ tgt_mask: Optional[Tensor] = None,
142
+ tgt_key_padding_mask: Optional[Tensor] = None,
143
+ query_pos: Optional[Tensor] = None
144
+ ):
145
+ """Forward post norm."""
146
+ q = k = self.with_pos_embed(tgt, query_pos)
147
+ tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask,
148
+ key_padding_mask=tgt_key_padding_mask)[0]
149
+ tgt = tgt + self.dropout(tgt2)
150
+ tgt = self.norm(tgt)
151
+
152
+ return tgt
153
+
154
+ def forward_pre(
155
+ self, tgt,
156
+ tgt_mask: Optional[Tensor] = None,
157
+ tgt_key_padding_mask: Optional[Tensor] = None,
158
+ query_pos: Optional[Tensor] = None
159
+ ):
160
+ """Forward pre norm."""
161
+ tgt2 = self.norm(tgt)
162
+ q = k = self.with_pos_embed(tgt2, query_pos)
163
+ tgt2 = self.self_attn(
164
+ q, k, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask
165
+ )[0]
166
+ tgt = tgt + self.dropout(tgt2)
167
+
168
+ return tgt
169
+
170
+ def forward(
171
+ self, tgt,
172
+ tgt_mask: Optional[Tensor] = None,
173
+ tgt_key_padding_mask: Optional[Tensor] = None,
174
+ query_pos: Optional[Tensor] = None
175
+ ):
176
+ """Forward."""
177
+ if self.normalize_before:
178
+ return self.forward_pre(
179
+ tgt, tgt_mask, tgt_key_padding_mask, query_pos
180
+ )
181
+ return self.forward_post(
182
+ tgt, tgt_mask, tgt_key_padding_mask, query_pos
183
+ )
184
+
185
+
186
+ class CrossAttentionLayer(nn.Module):
187
+ """Cross attention layer."""
188
+
189
+ def __init__(self, d_model, nhead, dropout=0.0,
190
+ activation="relu", normalize_before=False, export=False):
191
+ """Init."""
192
+ super().__init__()
193
+ self.export = export
194
+ if export:
195
+ self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
196
+ else:
197
+ self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
198
+
199
+ self.norm = nn.LayerNorm(d_model)
200
+ self.dropout = nn.Dropout(dropout)
201
+
202
+ self.activation = _get_activation_fn(activation)
203
+ self.normalize_before = normalize_before
204
+
205
+ self._reset_parameters()
206
+
207
+ def _reset_parameters(self):
208
+ """Reset parameters."""
209
+ for p in self.parameters():
210
+ if p.dim() > 1:
211
+ nn.init.xavier_uniform_(p)
212
+
213
+ def with_pos_embed(self, tensor, pos: Optional[Tensor]):
214
+ """Add positional embedding."""
215
+ return tensor if pos is None else tensor + pos
216
+
217
+ def forward_post(
218
+ self, tgt, memory,
219
+ memory_mask: Optional[Tensor] = None,
220
+ memory_key_padding_mask: Optional[Tensor] = None,
221
+ pos: Optional[Tensor] = None,
222
+ query_pos: Optional[Tensor] = None
223
+ ):
224
+ """Forward post norm."""
225
+ tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos),
226
+ key=self.with_pos_embed(memory, pos),
227
+ value=memory, attn_mask=memory_mask,
228
+ key_padding_mask=memory_key_padding_mask)[0]
229
+ tgt = tgt + self.dropout(tgt2)
230
+ tgt = self.norm(tgt)
231
+
232
+ return tgt
233
+
234
+ def forward_pre(
235
+ self, tgt, memory,
236
+ memory_mask: Optional[Tensor] = None,
237
+ memory_key_padding_mask: Optional[Tensor] = None,
238
+ pos: Optional[Tensor] = None,
239
+ query_pos: Optional[Tensor] = None
240
+ ):
241
+ """Forward pre norm."""
242
+ tgt2 = self.norm(tgt)
243
+ tgt2 = self.multihead_attn(
244
+ query=self.with_pos_embed(tgt2, query_pos),
245
+ key=self.with_pos_embed(memory, pos),
246
+ value=memory, attn_mask=memory_mask,
247
+ key_padding_mask=memory_key_padding_mask
248
+ )[0]
249
+ tgt = tgt + self.dropout(tgt2)
250
+
251
+ return tgt
252
+
253
+ def forward(
254
+ self, tgt, memory,
255
+ memory_mask: Optional[Tensor] = None,
256
+ memory_key_padding_mask: Optional[Tensor] = None,
257
+ pos: Optional[Tensor] = None,
258
+ query_pos: Optional[Tensor] = None
259
+ ):
260
+ """Forward pass."""
261
+ if self.normalize_before:
262
+ return self.forward_pre(tgt, memory, memory_mask,
263
+ memory_key_padding_mask, pos, query_pos)
264
+ return self.forward_post(tgt, memory, memory_mask,
265
+ memory_key_padding_mask, pos, query_pos)
266
+
267
+
268
+ class FFNLayer(nn.Module):
269
+ """Feedforward layer."""
270
+
271
+ def __init__(
272
+ self, d_model, dim_feedforward=2048, dropout=0.0, activation="relu", normalize_before=False
273
+ ):
274
+ """Init."""
275
+ super().__init__()
276
+ # Implementation of Feedforward model
277
+ self.linear1 = nn.Linear(d_model, dim_feedforward)
278
+ self.dropout = nn.Dropout(dropout)
279
+ self.linear2 = nn.Linear(dim_feedforward, d_model)
280
+
281
+ self.norm = nn.LayerNorm(d_model)
282
+
283
+ self.activation = _get_activation_fn(activation)
284
+ self.normalize_before = normalize_before
285
+
286
+ self._reset_parameters()
287
+
288
+ def _reset_parameters(self):
289
+ """Reset parameters."""
290
+ for p in self.parameters():
291
+ if p.dim() > 1:
292
+ nn.init.xavier_uniform_(p)
293
+
294
+ def with_pos_embed(self, tensor, pos: Optional[Tensor]):
295
+ """Add positional embedding."""
296
+ return tensor if pos is None else tensor + pos
297
+
298
+ def forward_post(self, tgt):
299
+ """Forward post norm."""
300
+ tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
301
+ tgt = tgt + self.dropout(tgt2)
302
+ tgt = self.norm(tgt)
303
+ return tgt
304
+
305
+ def forward_pre(self, tgt):
306
+ """Forward pre norm."""
307
+ tgt2 = self.norm(tgt)
308
+ tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
309
+ tgt = tgt + self.dropout(tgt2)
310
+ return tgt
311
+
312
+ def forward(self, tgt):
313
+ """Forward."""
314
+ if self.normalize_before:
315
+ return self.forward_pre(tgt)
316
+ return self.forward_post(tgt)
317
+
318
+
319
+ def _get_activation_fn(activation):
320
+ """Return an activation function given a string"""
321
+ if activation == "relu":
322
+ return F.relu
323
+ if activation == "gelu":
324
+ return F.gelu
325
+ if activation == "glu":
326
+ return F.glu
327
+ raise NotImplementedError(f"activation should be relu/gelu, not {activation}.")
328
+
329
+
330
+ class MLP(nn.Module):
331
+ """ Very simple multi-layer perceptron (also called FFN)"""
332
+
333
+ def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
334
+ """Init."""
335
+ super().__init__()
336
+ self.num_layers = num_layers
337
+ h = [hidden_dim] * (num_layers - 1)
338
+ self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
339
+
340
+ def forward(self, x):
341
+ """Forward pass."""
342
+ for i, layer in enumerate(self.layers):
343
+ x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
344
+ return x
345
+
346
+
347
+ # 3D blocks
348
+ def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1, sparse=False):
349
+ """3x3 convolution with padding"""
350
+ if sparse:
351
+ return Me.MinkowskiConvolution(
352
+ in_planes, out_planes, kernel_size=3,
353
+ stride=stride, dilation=dilation,
354
+ bias=False, dimension=3
355
+ )
356
+ else:
357
+ return nn.Conv3d(
358
+ in_planes, out_planes, kernel_size=3,
359
+ stride=stride, padding=dilation,
360
+ groups=groups, bias=False,
361
+ dilation=dilation
362
+ )
363
+
364
+
365
+ class BasicBlock3D(nn.Module):
366
+ """Basic block for 3D."""
367
+
368
+ def __init__(
369
+ self, inplanes, planes, stride=1, downsample=None, groups=1,
370
+ base_width=64, dilation=1, norm_layer=None, sparse=False
371
+ ):
372
+ """Init."""
373
+ super().__init__()
374
+ if norm_layer is None:
375
+ norm_layer = nn.InstanceNorm3d if not sparse else Me.MinkowskiInstanceNorm
376
+ if groups != 1 or base_width != 64:
377
+ raise ValueError("BasicBlock only supports groups=1 and base_width=64")
378
+ if dilation > 1:
379
+ raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
380
+ self.conv1 = conv3x3(inplanes, planes, stride, sparse=sparse)
381
+ self.bn1 = norm_layer(planes)
382
+ self.relu = nn.ReLU(inplace=True) if not sparse else Me.MinkowskiReLU(inplace=True)
383
+ self.conv2 = conv3x3(planes, planes, sparse=sparse)
384
+ self.bn2 = norm_layer(planes)
385
+ self.downsample = downsample
386
+ self.stride = stride
387
+
388
+ def forward(self, x):
389
+ """Forward."""
390
+ identity = x
391
+
392
+ out = self.conv1(x)
393
+ out = self.bn1(out)
394
+ out = self.relu(out)
395
+
396
+ out = self.conv2(out)
397
+ out = self.bn2(out)
398
+
399
+ if self.downsample is not None:
400
+ identity = self.downsample(x)
401
+
402
+ out += identity
403
+ out = self.relu(out)
404
+
405
+ return out
406
+
407
+
408
+ class SparseBasicBlock3D(BasicBlock3D):
409
+ """Sparse basic block for 3D."""
410
+
411
+ def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
412
+ base_width=64, dilation=1, norm_layer=None):
413
+ """Init."""
414
+ super().__init__(inplanes, planes,
415
+ stride=stride, downsample=downsample, groups=groups,
416
+ base_width=base_width, dilation=dilation,
417
+ norm_layer=norm_layer, sparse=True)
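The attention and FFN layers above are consumed by the 3D decoder in model_3d.py, but the small `MLP` helper is easy to exercise on its own. A hedged sketch (importing `nvpanoptix_3d.blocks` requires MinkowskiEngine, since the module imports it at the top):

```python
import torch
from nvpanoptix_3d.blocks import MLP, _get_activation_fn

# Three-layer MLP: 256 -> 128 -> 128 -> 64, with ReLU after the hidden layers only.
mlp = MLP(input_dim=256, hidden_dim=128, output_dim=64, num_layers=3)
x = torch.randn(10, 256)
print(mlp(x).shape)                # torch.Size([10, 64])
print(_get_activation_fn("gelu"))  # returns torch.nn.functional.gelu
```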
nvpanoptix_3d/model_3d.py ADDED
@@ -0,0 +1,355 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """3D stage model for Panoptic Recon 3D."""
16
+
17
+ import torch
18
+ import MinkowskiEngine as Me
19
+ from torch.nn import functional as F
20
+ import torch.nn as nn
21
+
22
+ from .blocks import ProjectionBlock
23
+ from .utils.helper import retry_if_cuda_oom
24
+ # from .model_2d import MaskFormerModel
25
+ from .reconstruction import SparseProjection, FrustumDecoder
26
+ from .mp_occ.occupancy_aware_lifting import OccupancyAwareLifting
27
+ from .mp_occ.back_projection import BackProjection
28
+
29
+ from .utils.sparse_tensor import \
30
+ to_dense, prepare_instance_masks_thicken
31
+ from .utils.coords_transform import \
32
+ transform_feat3d_coordinates, fuse_sparse_tensors, generate_multiscale_feat3d
33
+
34
+ class Postprocessor(nn.Module):
35
+ """2D model postprocessor."""
36
+
37
+ def __init__(self, cfg):
38
+ """Initialize the postprocessor."""
39
+ super().__init__()
40
+ self.cfg = cfg
41
+ self.test_topk_per_image = cfg.model.test_topk_per_image
42
+ self.num_classes = cfg.model.sem_seg_head.num_classes
43
+ self.num_queries = cfg.model.mask_former.num_object_queries
44
+ self.object_mask_threshold = cfg.model.object_mask_threshold
45
+ self.overlap_threshold = cfg.model.overlap_threshold
46
+ self.depth_scale = cfg.dataset.depth_scale
47
+ self.num_thing_classes = cfg.dataset.num_thing_classes
48
+
49
+ def panoptic_inference(self, mask_cls, mask_pred, depth_pred):
50
+ """Post process for panoptic segmentation."""
51
+ scores, labels = F.softmax(mask_cls, dim=-1).max(-1)
52
+ mask_pred = mask_pred.sigmoid()
53
+
54
+ keep = labels.ne(self.num_classes) & (scores > self.object_mask_threshold)
55
+ cur_scores = scores[keep]
56
+ cur_classes = labels[keep]
57
+ cur_masks = mask_pred[keep]
58
+ cur_mask_cls = mask_cls[keep]
59
+ cur_mask_cls = cur_mask_cls[:, :-1]
60
+
61
+ cur_prob_masks = cur_scores.view(-1, 1, 1) * cur_masks
62
+
63
+ h, w = cur_masks.shape[-2:]
64
+ panoptic_seg = torch.zeros((h, w), dtype=torch.int32, device=cur_masks.device)
65
+ segments_info = []
66
+
67
+ sem_prob_masks = torch.zeros((
68
+ self.num_classes, h, w
69
+ ), dtype=torch.float32, device=cur_masks.device)
70
+
71
+ current_segment_id = 0
72
+
73
+ if cur_masks.shape[0] == 0:
74
+ return panoptic_seg, depth_pred[0, :, :], segments_info, sem_prob_masks
75
+ else:
76
+ cur_mask_ids = cur_prob_masks.argmax(0)
77
+ stuff_memory_list = {}
78
+ stuff_mask_ids = []
79
+ for k in range(cur_classes.shape[0]):
80
+ pred_class = cur_classes[k].item()
81
+ isthing = pred_class in list(range(1, self.num_thing_classes + 1))
82
+ mask_area = (cur_mask_ids == k).sum().item()
83
+ original_area = (cur_masks[k] >= 0.5).sum().item()
84
+ mask = (cur_mask_ids == k) & (cur_masks[k] >= 0.5)
85
+
86
+ if mask_area > 0 and original_area > 0 and mask.sum().item() > 0:
87
+ if mask_area / original_area < self.overlap_threshold:
88
+ continue
89
+
90
+ # merge stuff regions
91
+ if not isthing:
92
+ stuff_mask_ids.append(k)
93
+ if int(pred_class) in stuff_memory_list.keys():
94
+ panoptic_seg[mask] = stuff_memory_list[int(pred_class)]
95
+ sem_prob_masks[int(pred_class)][mask] = cur_prob_masks[k][mask]
96
+ continue
97
+ else:
98
+ stuff_memory_list[int(pred_class)] = current_segment_id + 1
99
+
100
+ current_segment_id += 1
101
+ panoptic_seg[mask] = current_segment_id
102
+ sem_prob_masks[int(pred_class)][mask] = cur_prob_masks[k][mask]
103
+
104
+ segments_info.append(
105
+ {
106
+ "id": current_segment_id,
107
+ "isthing": bool(isthing),
108
+ "category_id": int(pred_class),
109
+ }
110
+ )
111
+
112
+ if stuff_mask_ids:
113
+ # recover void pixels
114
+ stuff_mask_ids = torch.tensor(stuff_mask_ids, dtype=torch.long, device=cur_prob_masks.device)
115
+ cur_stuff_ids = stuff_mask_ids[cur_prob_masks[stuff_mask_ids].argmax(0)]
116
+ empty_pixel_mask = panoptic_seg == 0
117
+ for k in stuff_mask_ids:
118
+ k = k.item()
119
+ pred_class = cur_classes[k].item()
120
+ mask = empty_pixel_mask & (cur_stuff_ids == k)
121
+ panoptic_seg[mask] = stuff_memory_list[int(pred_class)]
122
+ sem_prob_masks[int(pred_class)][mask] = cur_prob_masks[k][mask]
123
+
124
+ # clamp depth_pred
125
+ depth_pred = depth_pred[0, ...].clamp(min=0, max=self.depth_scale)
126
+ return panoptic_seg, depth_pred, segments_info, sem_prob_masks
127
+
128
+ @staticmethod
129
+ def sem_seg_postprocess(result, img_size):
130
+ """Return semantic segmentation predictions in the original resolution."""
131
+ # Crop each image in the batch to the original img_size
132
+ result = result[:, :img_size[0], :img_size[1]].expand(1, -1, -1, -1)
133
+ # Interpolate to the desired output size
134
+ result = F.interpolate(
135
+ result,
136
+ size=(img_size[0], img_size[1]),
137
+ mode="bilinear",
138
+ align_corners=False
139
+ )
140
+ return result[0]
141
+
142
+ def forward(self, outputs, orig_shape, orig_pad_shape):
143
+ """Forward pass."""
144
+ mask_cls_results = outputs["pred_logits"]
145
+ mask_pred_results = outputs["pred_masks"]
146
+ depth_pred_results = outputs["pred_depths"]
147
+
148
+ del outputs
149
+
150
+ mask_pred_results = F.interpolate(
151
+ mask_pred_results,
152
+ size=(orig_pad_shape[-2], orig_pad_shape[-1]),
153
+ mode="bilinear",
154
+ align_corners=False,
155
+ )
156
+
157
+ depth_pred_results = F.interpolate(
158
+ depth_pred_results,
159
+ size=(orig_pad_shape[-2], orig_pad_shape[-1]),
160
+ mode="bilinear",
161
+ align_corners=False,
162
+ )
163
+
164
+ if self.cfg.model.mode == "panoptic":
165
+ processed_results = []
166
+ for _, (mask_cls_result, mask_pred_result, depth_pred_result) in enumerate(zip(
167
+ mask_cls_results, mask_pred_results, depth_pred_results
168
+ )):
169
+ processed_results.append({})
170
+
171
+ mask_pred_result = retry_if_cuda_oom(self.sem_seg_postprocess)(
172
+ mask_pred_result, orig_shape
173
+ )
174
+ mask_cls_result = mask_cls_result.to(mask_pred_result)
175
+
176
+ depth_pred_result = retry_if_cuda_oom(self.sem_seg_postprocess)(
177
+ depth_pred_result, orig_shape
178
+ )
179
+
180
+ panoptic_seg, depth_r, segments_info, sem_prob_mask = retry_if_cuda_oom(
181
+ self.panoptic_inference
182
+ )(mask_cls_result, mask_pred_result, depth_pred_result)
183
+
184
+ processed_results[-1]["panoptic_seg"] = (panoptic_seg, segments_info)
185
+ processed_results[-1]["depth"] = depth_r
186
+ processed_results[-1]["sem_seg"] = sem_prob_mask
187
+
188
+ return processed_results
189
+
190
+ else:
191
+ raise ValueError("Only panoptic mode is supported for 2D model.")
192
+
193
+
194
+ class Panoptic3DModel(nn.Module):
195
+ """3D model."""
196
+
197
+ def __init__(self, cfg):
198
+ """Initialize 3D model."""
199
+ super().__init__()
200
+ self.cfg = cfg
201
+
202
+ # disable gradients for the 2D model
203
+ # for _, param in self.named_parameters():
204
+ # param.requires_grad_(False)
205
+
206
+ # 3D modules
207
+ self.reprojection = SparseProjection(self.cfg)
208
+ self.completion = FrustumDecoder(self.cfg)
209
+ self.projector = ProjectionBlock(
210
+ self.cfg.model.projection.depth_feature_dim,
211
+ self.cfg.model.projection.depth_feature_dim
212
+ )
213
+ self.ol = OccupancyAwareLifting(self.cfg)
214
+ self.post_processor = Postprocessor(self.cfg)
215
+ self.back_projection = BackProjection(self.cfg)
216
+
217
+ # 3D model parameters
218
+ self.downsample_factor = cfg.dataset.downsample_factor
219
+ self.frustum_dims = [cfg.model.frustum3d.frustum_dims] * 3
220
+ self.iso_recon_value = cfg.model.frustum3d.iso_recon_value
221
+ self.truncation = cfg.model.frustum3d.truncation
222
+ self.num_classes = cfg.model.sem_seg_head.num_classes
223
+ self.object_mask_threshold = cfg.model.object_mask_threshold
224
+ self.overlap_threshold = cfg.model.overlap_threshold
225
+
226
+ def forward(self, x):
227
+ """Forward pass."""
228
+ pass
229
+
230
+ def panoptic_3d_inference(
231
+ self, geometry, mask_cls, sparse_mask_tuple, min_coordinates, dense_dimensions
232
+ ):
233
+ """Panoptic 3D inference."""
234
+ panoptic_seg = torch.zeros(geometry.shape, dtype=torch.int32, device=mask_cls.device)
235
+ semantic_seg = torch.zeros_like(panoptic_seg)
236
+ panoptic_semantic_mapping = {}
237
+
238
+ scores, labels = F.softmax(mask_cls, dim=-1).max(-1)
239
+ keep = labels.ne(self.num_classes) & \
240
+ labels.ne(0) & \
241
+ (scores > self.object_mask_threshold)
242
+
243
+ coords, sparse_masks, stride = sparse_mask_tuple
244
+ cur_scores = scores[keep]
245
+ cur_classes = labels[keep]
246
+ cur_masks = Me.MinkowskiSigmoid()(
247
+ Me.SparseTensor(
248
+ features=sparse_masks[:, keep],
249
+ coordinates=coords,
250
+ tensor_stride=stride
251
+ )
252
+ ).dense(dense_dimensions, min_coordinates)[0].squeeze(0)
253
+ cur_mask_cls = mask_cls[keep]
254
+ cur_mask_cls = cur_mask_cls[:, :-1]
255
+
256
+ cur_prob_masks = cur_scores.view(-1, 1, 1, 1) * cur_masks
257
+
258
+ current_segment_id = 0
259
+ if cur_masks.shape[0] > 0:
260
+ cur_mask_ids = cur_prob_masks.argmax(0)
261
+ stuff_memory_list = {}
262
+ query_to_segment_id = {}
263
+ for k in range(cur_classes.shape[0]):
264
+ pred_class = cur_classes[k].item()
265
+ isthing = pred_class in list(range(1, self.post_processor.num_thing_classes + 1))
266
+ mask = (cur_mask_ids == k) & (cur_masks[k] >= 0.5)
267
+
268
+ if mask.sum().item() > 0:
269
+ if not isthing:
270
+ if int(pred_class) in stuff_memory_list.keys():
271
+ panoptic_seg[mask] = stuff_memory_list[int(pred_class)]
272
+ query_to_segment_id[k] = stuff_memory_list[int(pred_class)]
273
+ continue
274
+ else:
275
+ stuff_memory_list[int(pred_class)] = current_segment_id + 1
276
+
277
+ current_segment_id += 1
278
+ panoptic_seg[mask] = current_segment_id
279
+ query_to_segment_id[k] = current_segment_id
280
+ panoptic_semantic_mapping[current_segment_id] = int(pred_class)
281
+
282
+ surface_mask = geometry.abs() <= 1.5
283
+
284
+ # fill unassigned surface voxels
285
+ unassigned_mask = surface_mask & (panoptic_seg == 0)
286
+ for k in range(cur_classes.shape[0]):
287
+ mask = (cur_mask_ids == k) & unassigned_mask
288
+ if mask.sum().item() > 0 and k in query_to_segment_id.keys():
289
+ panoptic_seg[mask] = query_to_segment_id[k]
290
+
291
+ for segm_id, semantic_label in panoptic_semantic_mapping.items():
292
+ instance_mask = panoptic_seg == segm_id
293
+ semantic_seg[instance_mask] = semantic_label
294
+
295
+ return panoptic_seg, panoptic_semantic_mapping, semantic_seg
296
+
297
+ def postprocess(self, outputs_3d, outputs_2d, processed_results, frustum_mask):
298
+ """Postprocess 3D results."""
299
+ dense_dimensions = torch.Size([1, 1] + self.frustum_dims)
300
+ min_coordinates = torch.IntTensor([0, 0, 0])
301
+
302
+ geometry_results = to_dense(
303
+ outputs_3d["pred_geometry"],
304
+ dense_dimensions,
305
+ min_coordinates,
306
+ default_value=self.truncation
307
+ )[0]
308
+ mask_3d_results = outputs_3d["pred_segms"][-1]
309
+ mask_cls_results = outputs_2d["pred_logits"]
310
+
311
+ processed_results_3d = {
312
+ "intrinsic": [],
313
+ "image_size": [],
314
+ "depth": [],
315
+ "panoptic_seg_2d": [],
316
+ "geometry": [],
317
+ "panoptic_seg": [],
318
+ "semantic_seg": [],
319
+ "panoptic_semantic_mapping": [],
320
+ "instance_info_pred": []
321
+ }
322
+
323
+ for idx, (geometry_result, mask_cls_result) in enumerate(zip(
324
+ geometry_results,
325
+ mask_cls_results
326
+ )):
327
+ coords, mask_3d = mask_3d_results.coordinates_at(idx), mask_3d_results.features_at(idx)
328
+ coords, mask_3d = Me.utils.sparse_collate([coords], [mask_3d])
329
+ geometry_result = geometry_result.squeeze(0)
330
+ panoptic_seg, panoptic_semantic_mapping, semantic_seg = self.panoptic_3d_inference(
331
+ geometry_result,
332
+ mask_cls_result,
333
+ (coords, mask_3d, mask_3d_results.tensor_stride),
334
+ min_coordinates,
335
+ dense_dimensions,
336
+ )
337
+
338
+ processed_results_3d["intrinsic"].append(processed_results[idx]["intrinsic"])
339
+ processed_results_3d["image_size"].append(processed_results[idx]["image_size"])
340
+ processed_results_3d["depth"].append(processed_results[idx]["depth"])
341
+ processed_results_3d["panoptic_seg_2d"].append(processed_results[idx]["panoptic_seg"])
342
+ processed_results_3d["geometry"].append(geometry_result)
343
+ processed_results_3d["panoptic_seg"].append(panoptic_seg)
344
+ processed_results_3d["semantic_seg"].append(semantic_seg)
345
+ processed_results_3d["panoptic_semantic_mapping"].append(panoptic_semantic_mapping)
346
+ processed_results_3d["instance_info_pred"].append(prepare_instance_masks_thicken(
347
+ panoptic_seg,
348
+ panoptic_semantic_mapping,
349
+ geometry_result,
350
+ frustum_mask[idx],
351
+ iso_value=self.iso_recon_value,
352
+ downsample_factor=self.downsample_factor
353
+ ))
354
+
355
+ return processed_results_3d
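For readers tracing `panoptic_inference` above, here is a standalone PyTorch toy (random logits, hypothetical thresholds) of its core assignment step: softmax the per-query class logits, drop "no object" and low-confidence queries, then give each pixel to the query with the highest score-weighted mask probability. The stuff merging, overlap filtering, and void-pixel recovery of the real method are omitted.

```python
import torch
import torch.nn.functional as F

num_queries, num_classes, h, w = 5, 10, 4, 6
mask_cls = torch.randn(num_queries, num_classes + 1)   # last column = "no object"
mask_pred = torch.randn(num_queries, h, w)

scores, labels = F.softmax(mask_cls, dim=-1).max(-1)
masks = mask_pred.sigmoid()

# Keep confident queries that predict a real class.
keep = labels.ne(num_classes) & (scores > 0.5)
scores, labels, masks = scores[keep], labels[keep], masks[keep]

panoptic = torch.zeros(h, w, dtype=torch.int32)         # 0 = void
if masks.shape[0] > 0:
    # Each pixel goes to the query with the highest score-weighted probability.
    prob_masks = scores.view(-1, 1, 1) * masks
    assignment = prob_masks.argmax(0)
    for k in range(masks.shape[0]):
        region = (assignment == k) & (masks[k] >= 0.5)
        panoptic[region] = k + 1                        # segment ids start at 1
print(panoptic)
```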
nvpanoptix_3d/mp_occ/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Multi-plane Occupancy module."""
nvpanoptix_3d/mp_occ/back_projection.py ADDED
@@ -0,0 +1,114 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Back projection module."""
16
+
17
+ import torch
18
+ from torch import nn
19
+ from ..utils.frustum import (
20
+ generate_frustum,
21
+ generate_frustum_volume,
22
+ compute_camera2frustum_transform
23
+ )
24
+
25
+
26
+ class BackProjection(nn.Module):
27
+ """Back projection module."""
28
+
29
+ def __init__(self, cfg):
30
+ """Initialize the back projection module."""
31
+ super().__init__()
32
+ self.image_size = cfg.dataset.depth_size
33
+ self.depth_min = cfg.dataset.depth_min
34
+ self.depth_max = cfg.dataset.depth_max
35
+ self.voxel_size = cfg.model.projection.voxel_size
36
+ self.frustum_dimensions = torch.tensor([cfg.model.frustum3d.frustum_dims] * 3)
37
+
38
+ def forward(
39
+ self, shp, intrinsics, frustum_masks=None, room_masks=None
40
+ ):
41
+ """Forward pass."""
42
+ device = intrinsics.device
43
+ if frustum_masks is None:
44
+ frustum_masks = torch.ones(
45
+ [len(intrinsics), *self.frustum_dimensions],
46
+ dtype=torch.bool, device=device
47
+ )
48
+ len_shp = len(frustum_masks.shape)
49
+ if len_shp == 3:
50
+ frustum_masks = frustum_masks[None]
51
+ intrinsics = intrinsics[None]
52
+
53
+ kepts, mappings = [], []
54
+ for bi, (intrinsic, frustum_mask) in enumerate(zip(intrinsics, frustum_masks)):
55
+ camera2frustum = compute_camera2frustum_transform(
56
+ intrinsic.cpu(), self.image_size, self.depth_min,
57
+ self.depth_max, self.voxel_size
58
+ ).to(device)
59
+ intrinsic_inverse = torch.inverse(intrinsic)
60
+ coordinates = torch.nonzero(frustum_mask)
61
+ grid_coordinates = coordinates.clone()
62
+ grid_coordinates[:, :2] = 256 - grid_coordinates[:, :2]
63
+
64
+ padding_offsets = self.compute_frustum_padding(intrinsic_inverse)
65
+ grid_coordinates = grid_coordinates - padding_offsets - torch.tensor([1., 1., 1.], device=device)
66
+ grid_coordinates = torch.cat([
67
+ grid_coordinates, torch.ones(len(grid_coordinates), 1, device=device)], 1
68
+ )
69
+ pointcloud = torch.mm(torch.inverse(camera2frustum), grid_coordinates.t())
70
+ depth_pixels = torch.mm(intrinsic, pointcloud)
71
+
72
+ depth = depth_pixels[2]
73
+ coord = depth_pixels[0:2] / depth
74
+ coord = torch.cat([coord, coordinates[:, 2][None]], 0).permute(1, 0)
75
+
76
+ kept = (depth <= self.depth_max) * \
77
+ (depth >= self.depth_min) * \
78
+ (coord[:, 0] < shp[1]) * (coord[:, 1] < shp[0])
79
+ coordinates = coordinates[kept]
80
+ depth = depth[kept, None]
81
+
82
+ mapping = torch.zeros(256, 256, 256, 5, device=depth.device) - 1.
83
+ mapping[coordinates[:, 0], coordinates[:, 1], coordinates[:, 2]] = \
84
+ torch.cat([torch.ones_like(depth) * bi, coord[kept], depth], -1)
85
+
86
+ kept = (mapping >= 0).all(-1)
87
+
88
+ if room_masks is not None:
89
+ mapping_kept = mapping[kept].long()
90
+ kept[kept.clone()] = room_masks[bi, 0, mapping_kept[:, 2], mapping_kept[:, 1]]
91
+
92
+ kepts.append(kept)
93
+ mappings.append(mapping)
94
+
95
+ if len_shp == 3:
96
+ kepts = kepts[0]
97
+ mappings = mappings[0][..., 1:]
98
+ else:
99
+ kepts = torch.stack(kepts, 0)
100
+ mappings = torch.stack(mappings, 0)
101
+
102
+ return kepts, mappings
103
+
104
+ def compute_frustum_padding(self, intrinsic_inverse: torch.Tensor) -> torch.Tensor:
105
+ """Compute frustum padding."""
106
+ frustum = generate_frustum(
107
+ self.image_size, intrinsic_inverse.cpu(), self.depth_min, self.depth_max
108
+ )
109
+ dimensions, _ = generate_frustum_volume(frustum, self.voxel_size)
110
+ difference = (
111
+ self.frustum_dimensions - torch.tensor(dimensions)
112
+ ).float().to(intrinsic_inverse.device)
113
+ padding_offsets = difference / 2
114
+ return padding_offsets
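The voxel-to-pixel mapping built by `BackProjection.forward` reduces to a pinhole projection followed by the depth-range and image-bound checks. A back-of-the-envelope sketch of that projection in plain PyTorch, with illustrative 4x4 intrinsics and clip values:

```python
import torch

intrinsic = torch.tensor([
    [277.1,   0.0, 160.0, 0.0],
    [  0.0, 311.8, 120.0, 0.0],
    [  0.0,   0.0,   1.0, 0.0],
    [  0.0,   0.0,   0.0, 1.0],
])

# A few camera-space points as homogeneous columns (x, y, z, 1).
points = torch.tensor([
    [ 0.5, 0.2, 2.0, 1.0],
    [-0.3, 0.1, 4.0, 1.0],
]).t()

pixels = intrinsic @ points          # homogeneous image coordinates
depth = pixels[2]
uv = pixels[:2] / depth              # perspective divide -> pixel coordinates
valid = (depth >= 0.4) & (depth <= 6.0) & (uv[0] < 320) & (uv[1] < 240)
print(uv.t(), depth, valid)
```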
nvpanoptix_3d/mp_occ/multiplane_occupancy.py ADDED
@@ -0,0 +1,175 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Multi-plane occupancy head."""
16
+
17
+ import torch
18
+ from torch import nn
19
+ from torch.nn import functional as F
20
+
21
+
22
+ class _UpProjection(nn.Module):
23
+ """Up-projection block."""
24
+
25
+ def __init__(self, num_input_features, num_output_features):
26
+ """Initialize the up-projection block."""
27
+ super().__init__()
28
+ self.conv1 = nn.Conv2d(
29
+ num_input_features, num_output_features, kernel_size=5, stride=1, padding=2, bias=False
30
+ )
31
+ self.bn1 = nn.BatchNorm2d(num_output_features)
32
+ self.relu = nn.ReLU(inplace=True)
33
+ self.conv1_2 = nn.Conv2d(
34
+ num_output_features, num_output_features, kernel_size=3, stride=1, padding=1, bias=False
35
+ )
36
+ self.bn1_2 = nn.BatchNorm2d(num_output_features)
37
+ self.conv2 = nn.Conv2d(
38
+ num_input_features, num_output_features, kernel_size=5, stride=1, padding=2, bias=False
39
+ )
40
+ self.bn2 = nn.BatchNorm2d(num_output_features)
41
+
42
+ def forward(self, x, size):
43
+ """Forward pass."""
44
+ x = F.interpolate(x, size=size, mode="bilinear", align_corners=True)
45
+ x_conv1 = self.relu(self.bn1(self.conv1(x)))
46
+ bran1 = self.bn1_2(self.conv1_2(x_conv1))
47
+ bran2 = self.bn2(self.conv2(x))
48
+ out = self.relu(bran1 + bran2)
49
+ return out
50
+
51
+
52
+ class D(nn.Module):
53
+ """Decoder module."""
54
+
55
+ def __init__(self, block_channel):
56
+ """Initialize the decoder module."""
57
+ super().__init__()
58
+ self.conv = nn.Conv2d(
59
+ block_channel[0], block_channel[1], kernel_size=1, stride=1, bias=False
60
+ )
61
+ self.bn = nn.BatchNorm2d(block_channel[1])
62
+
63
+ self.up1 = _UpProjection(num_input_features=block_channel[1],
64
+ num_output_features=block_channel[2])
65
+
66
+ self.up2 = _UpProjection(num_input_features=block_channel[2],
67
+ num_output_features=block_channel[3])
68
+
69
+ add_feat_channel = block_channel[3]
70
+ self.up3 = _UpProjection(num_input_features=add_feat_channel,
71
+ num_output_features=add_feat_channel // 2)
72
+
73
+ add_feat_channel = add_feat_channel // 2
74
+ self.up4 = _UpProjection(num_input_features=add_feat_channel,
75
+ num_output_features=add_feat_channel // 2)
76
+
77
+ def forward(self, x_block1, x_block2, x_block3, x_block4):
78
+ """Forward pass."""
79
+ x_d0 = F.relu(self.bn(self.conv(x_block4)))
80
+ x_d1 = self.up1(x_d0, [x_block3.size(2), x_block3.size(3)])
81
+ x_d2 = self.up2(x_d1, [x_block2.size(2), x_block2.size(3)])
82
+ x_d3 = self.up3(x_d2, [x_block1.size(2), x_block1.size(3)])
83
+ x_d4 = self.up4(x_d3, [x_block1.size(2) * 2, x_block1.size(3) * 2])
84
+ return x_d4
85
+
86
+
87
+ class MFF(nn.Module):
88
+ """Multi-feature fusion module."""
89
+
90
+ def __init__(self, block_channel, num_features=64):
91
+ """Initialize the multi-feature fusion module."""
92
+ super().__init__()
93
+ self.up1 = _UpProjection(num_input_features=block_channel[3], num_output_features=16)
94
+ self.up2 = _UpProjection(num_input_features=block_channel[2], num_output_features=16)
95
+ self.up3 = _UpProjection(num_input_features=block_channel[1], num_output_features=16)
96
+ self.up4 = _UpProjection(num_input_features=block_channel[0], num_output_features=16)
97
+
98
+ self.conv = nn.Conv2d(
99
+ num_features, num_features, kernel_size=5, stride=1, padding=2, bias=False
100
+ )
101
+ self.bn = nn.BatchNorm2d(num_features)
102
+
103
+ def forward(self, x_block1, x_block2, x_block3, x_block4, size):
104
+ """Forward pass."""
105
+ x_m1 = self.up1(x_block1, size)
106
+ x_m2 = self.up2(x_block2, size)
107
+ x_m3 = self.up3(x_block3, size)
108
+ x_m4 = self.up4(x_block4, size)
109
+
110
+ x = self.bn(self.conv(torch.cat((x_m1, x_m2, x_m3, x_m4), 1)))
111
+ x = F.relu(x)
112
+ return x
113
+
114
+
115
+ class R(nn.Module):
116
+ """Occupancy head module."""
117
+
118
+ def __init__(self, channel, num_class=1):
119
+ """Initialize the occupancy head module."""
120
+ super().__init__()
121
+
122
+ self.target_size = (120, 160)
123
+ self.resize = _UpProjection(num_input_features=channel, num_output_features=channel)
124
+
125
+ self.conv0 = nn.Conv2d(channel, channel, kernel_size=5, stride=1, padding=2, bias=False)
126
+ self.bn0 = nn.BatchNorm2d(channel)
127
+
128
+ self.conv1 = nn.Conv2d(channel, channel, kernel_size=5, stride=1, padding=2, bias=False)
129
+ self.bn1 = nn.BatchNorm2d(channel)
130
+
131
+ self.conv2 = nn.Conv2d(channel, num_class, kernel_size=5, stride=1, padding=2, bias=True)
132
+
133
+ def forward(self, x):
134
+ """Forward pass."""
135
+ x0 = self.resize(x, self.target_size) # resize to 120*160
136
+ x0 = self.conv0(x0)
137
+ x0 = self.bn0(x0)
138
+ x0 = F.relu(x0)
139
+
140
+ x1 = self.conv1(x0)
141
+ x1 = self.bn1(x1)
142
+ x1 = F.relu(x1)
143
+
144
+ x2 = self.conv2(x1)
145
+ return x2
146
+
147
+
148
+ class MultiPlaneOccupancyHead(nn.Module):
149
+ """Multi-plane occupancy head."""
150
+
151
+ def __init__(self):
152
+ """Initialize the multi-plane occupancy head."""
153
+ super().__init__()
154
+ block_channel = [2048, 1024, 512, 256]
155
+ self.feature_key = ['res2', 'res3', 'res4', 'res5']
156
+ feature_channels = 64
157
+
158
+ self.D = D(block_channel)
159
+ self.MFF = MFF(block_channel, feature_channels)
160
+ head_channels = block_channel[-1] // 4 + feature_channels
161
+ self.num_classes = 100
162
+ self.prediction = R(head_channels, self.num_classes)
163
+
164
+ def forward(self, x):
165
+ """Forward pass."""
166
+ x_block1, x_block2, x_block3, x_block4 = x[self.feature_key[0]], x[self.feature_key[1]], \
167
+ x[self.feature_key[2]], x[self.feature_key[3]]
168
+ x_decoder = self.D(x_block1, x_block2, x_block3, x_block4)
169
+ x_mff = self.MFF(
170
+ x_block1, x_block2, x_block3, x_block4, [x_decoder.size(2), x_decoder.size(3)]
171
+ )
172
+
173
+ x_feat = torch.cat((x_decoder, x_mff), 1)
174
+ occ_pred = self.prediction(x_feat)
175
+ return occ_pred
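A possible smoke test for `MultiPlaneOccupancyHead`, feeding random backbone features with the channel counts implied by `block_channel` (res2 through res5 at 256, 512, 1024 and 2048 channels); the spatial sizes assume a 240x320 input and are illustrative only.

```python
import torch
from nvpanoptix_3d.mp_occ.multiplane_occupancy import MultiPlaneOccupancyHead

head = MultiPlaneOccupancyHead().eval()

# ResNet-style feature pyramid at strides 4, 8, 16 and 32.
features = {
    "res2": torch.randn(1, 256, 60, 80),
    "res3": torch.randn(1, 512, 30, 40),
    "res4": torch.randn(1, 1024, 15, 20),
    "res5": torch.randn(1, 2048, 8, 10),
}

with torch.no_grad():
    occupancy_logits = head(features)

# One logit map per depth bin at the 2D working resolution.
print(occupancy_logits.shape)   # expected: torch.Size([1, 100, 120, 160])
```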
nvpanoptix_3d/mp_occ/occupancy_aware_lifting.py ADDED
@@ -0,0 +1,202 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Occupancy aware lifting module for Panoptic Recon 3D."""
16
+
17
+ import torch
18
+ from torch import nn
19
+ import torch.nn.functional as F
20
+ import MinkowskiEngine as Me
21
+
22
+ from .back_projection import BackProjection
23
+ from ..utils.sparse_tensor import mask_invalid_sparse_voxels
24
+
25
+
26
+ class OccupancyAwareLifting(nn.Module):
27
+ """Occupancy aware lifting module for Panoptic Recon 3D."""
28
+
29
+ def __init__(self, cfg):
30
+ """Initialize the occupancy aware lifting module."""
31
+ super(OccupancyAwareLifting, self).__init__()
32
+ self.bp = BackProjection(cfg)
33
+
34
+ def forward(self, pred, kept, mapping, occupancy2d, room_mask=None):
35
+ """Forward pass."""
36
+ # get the depth, semantic, and occupancy
37
+ depth = torch.stack([p['depth'][None] for p in pred])
38
+ features = torch.stack([p['sem_seg'] for p in pred])
39
+ depth_weight = occupancy2d.to(depth.device)
40
+ kept = kept.to(depth.device)
41
+ mapping = mapping.to(depth.device)
42
+
43
+ semantic = features.argmax(1)
44
+ depth_max_value = self.bp.depth_max
45
+ batch = semantic.shape[0]
46
+
47
+ # clip depth in range [0, depth_max_value]
48
+ depth[depth > depth_max_value] = depth_max_value
49
+
50
+ # map depth to a bin index in [0, 100)
51
+ depth_feat = (depth / depth_max_value * 100.)
52
+
53
+ depth_index = depth_feat.long()
54
+ depth_weight_kept = torch.ones_like(
55
+ depth_weight, dtype=torch.long
56
+ ) * torch.arange(0, 100, device=depth.device, dtype=torch.long)[None, :, None, None]
57
+
58
+ # stuff: wall, floor, or ceiling, erode the stuff class
59
+ stuff = (-F.max_pool2d(-(semantic >= 10).float(), 5, 1, 2)).bool()
60
+ # get the depth of the stuff class
61
+ stuff_depth = depth[:, 0] * stuff
62
+
63
+ # get the max depth of the stuff class in x direction: (batch_size, h)
64
+ stuff_x_max = stuff_depth.max(1)[0]
65
+ # get the max depth of the stuff class in y direction: (batch_size, w)
66
+ stuff_y_max = stuff_depth.max(2)[0]
67
+
68
+ stuff_depth_l = stuff_depth[:, 0].clone()
69
+ stuff_depth_r = stuff_depth[:, -1].clone()
70
+ stuff_depth_t = stuff_depth[:, :, 0].clone()
71
+ stuff_depth_d = stuff_depth[:, :, -1].clone()
72
+
73
+ for bi in range(batch):
74
+ stuff_depth[bi, 0] = stuff_padding(stuff_depth_l[bi], stuff_y_max[bi])
75
+ stuff_depth[bi, -1] = stuff_padding(stuff_depth_r[bi], stuff_y_max[bi].flip(0))
76
+ stuff_depth[bi, :, 0] = stuff_padding(stuff_depth_t[bi], stuff_x_max[bi])
77
+ stuff_depth[bi, :, -1] = stuff_padding(stuff_depth_d[bi], stuff_x_max[bi].flip(0))
78
+
79
+ stuff_x = stuff_depth.max(1)[0]
80
+ stuff_y = stuff_depth.max(2)[0]
81
+
82
+ for bi in range(batch):
83
+ stuff_x[bi] = find_none(stuff_x[bi])
84
+ stuff_y[bi] = find_none(stuff_y[bi])
85
+
86
+ # create depth limit and determine:
87
+ # "What's the farthest depth where we can
88
+ # reasonably place a 3D object before hitting a wall or boundary?"
89
+ depth_pixels_xy = torch.ones_like(depth).nonzero()
90
+ depth_max = torch.cat(
91
+ [
92
+ stuff_x[depth_pixels_xy[:, 0], depth_pixels_xy[:, 3]][..., None],
93
+ stuff_y[depth_pixels_xy[:, 0], depth_pixels_xy[:, 2]][..., None]
94
+ ],
95
+ dim=-1
96
+ ).min(-1)[0].reshape(*depth.shape)
97
+
98
+ depth_max = (depth_max / depth_max_value * 100.).long() # get the min bin index of stuff class
99
+ depth_feat = (depth_weight_kept - depth_index) / 100. * depth_max_value
100
+
101
+ # get the sign and the distance of voxel to the surface
102
+ depth_feat = torch.cat([depth_feat.sign()[:, None], depth_feat[:, None].abs()], 1)
103
+
104
+ # keep voxel 3 bins before surface to 5 bins after stuff class max depth
105
+ depth_weight_kept = (depth_weight_kept > (depth_index - 3)) * (
106
+ depth_weight_kept < (depth_max + 5))
107
+
108
+ depth_weight = depth_weight.sigmoid() * depth_weight_kept
109
+
110
+ feat_kept = kept.clone()
111
+
112
+ if room_mask is not None:
113
+ room_mask = room_mask.unsqueeze(1)
114
+ depth_weight_kept = depth_weight_kept * room_mask
115
+
116
+ mapping_kept = mapping[kept]
117
+ mapping_kept[:, -1] = mapping_kept[:, -1] * 100 / 6
118
+ mapping_kept = mapping_kept.long().to(depth.device)
119
+
120
+ # only keep voxel before 3 bins before surface
121
+ # and after 5 bins after stuff class max depth and in the frustum:
122
+ feat_kept[kept] = depth_weight_kept[
123
+ mapping_kept[:, 0], mapping_kept[:, -1], mapping_kept[:, 2], mapping_kept[:, 1]]
124
+
125
+ features = torch.cat(
126
+ [
127
+ features[:, :, None].repeat(1, 1, 100, 1, 1),
128
+ depth_weight[:, None], depth_feat
129
+ ],
130
+ dim=1
131
+ )
132
+
133
+ coord_sparse = feat_kept.nonzero()
134
+ mapping_feat_kept = mapping[feat_kept]
135
+
136
+ # convert to bin index:
137
+ mapping_feat_kept[:, -1] = mapping_feat_kept[:, -1] * 100 / depth_max_value
138
+ mapping_feat_kept = mapping_feat_kept.long()
139
+ feat_sparse = features[
140
+ mapping_feat_kept[:, 0], :, mapping_feat_kept[:, -1],
141
+ mapping_feat_kept[:, 2], mapping_feat_kept[:, 1]
142
+ ]
143
+
144
+ padding_kept = F.max_pool3d(feat_kept.float(), 5, 1, 2).bool()
145
+ padding_kept[~kept] = False
146
+
147
+ batch_point = padding_kept.flatten(1, -1).sum(-1)
148
+ batch_zero = (batch_point == 0).nonzero().view(-1)
149
+
150
+ # fix no points
151
+ if len(batch_zero) > 0:
152
+ padding_kept[batch_zero, 127, 127, 127] = True
153
+ padding_kept[feat_kept] = False
154
+ coord_padding = padding_kept.nonzero().contiguous().float()
155
+
156
+ coord_padding[:, 1:] = coord_padding[:, 1:] // 2 * 2
157
+ feat_padding = torch.zeros(
158
+ (
159
+ len(coord_padding), features.shape[1]
160
+ ),
161
+ device=features.device, dtype=torch.float)
162
+
163
+ feat_sparse = torch.cat([feat_sparse, feat_padding])
164
+ coord_sparse = torch.cat([coord_sparse, coord_padding])
165
+
166
+ proj_feat = Me.SparseTensor(
167
+ features=feat_sparse,
168
+ coordinates=coord_sparse.contiguous().int(),
169
+ tensor_stride=1,
170
+ quantization_mode=Me.SparseTensorQuantizationMode.RANDOM_SUBSAMPLE)
171
+
172
+ proj_feat = mask_invalid_sparse_voxels(proj_feat)
173
+ return proj_feat, None
174
+
175
+
176
+ def stuff_padding(padding, max_value):
177
+ """Stuff padding."""
178
+ padding = padding.clone()
179
+ padding_mask = padding == 0
180
+ v = None
181
+ if padding_mask.sum() > 0:
182
+ for val in max_value:
183
+ if val != 0:
184
+ v = val
185
+ break
186
+ if v is not None:
187
+ padding[padding_mask] = v
188
+ return padding
189
+
190
+
191
+ def find_none(stuff_a, min_value=0):
192
+ """Find none."""
193
+ none_v = torch.nonzero(stuff_a == 0)
194
+ for v in none_v:
195
+ l_stuff = stuff_a[:v]
196
+ l_stuff = l_stuff[l_stuff != 0]
197
+ l_stuff = min(l_stuff) if len(l_stuff) else min_value
198
+ r_stuff = stuff_a[v + 1:]
199
+ r_stuff = r_stuff[r_stuff != 0]
200
+ r_stuff = min(r_stuff) if len(r_stuff) else min_value
201
+ stuff_a[v] = max(l_stuff, r_stuff)
202
+ return stuff_a
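Per pixel, the lifting above keeps only the depth bins from a few bins in front of the predicted surface up to a few bins past the stuff-depth limit. A standalone sketch of that banding with toy values, using the same 100 bins over [0, depth_max] and the same -3/+5 margins:

```python
import torch

depth_max_value = 6.0
depth = torch.tensor([[1.5, 3.0], [4.5, 5.9]])          # predicted metric depth (h, w)
stuff_limit = torch.tensor([[4.0, 4.0], [6.0, 6.0]])    # farthest wall/floor/ceiling depth

surface_bin = (depth / depth_max_value * 100.0).long()
limit_bin = (stuff_limit / depth_max_value * 100.0).long()

bins = torch.arange(100).view(100, 1, 1)                 # broadcasts to (bins, h, w)
keep = (bins > surface_bin - 3) & (bins < limit_bin + 5)
print(keep.shape, keep.sum(0))                           # depth bins kept per pixel
```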
nvpanoptix_3d/reconstruction/__init__.py ADDED
@@ -0,0 +1,20 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Panoptic Recon 3D reconstruction module."""
16
+
17
+ from .reprojection import SparseProjection
18
+ from .decoder import FrustumDecoder
19
+
20
+ __all__ = ["SparseProjection", "FrustumDecoder"]
nvpanoptix_3d/reconstruction/decoder.py ADDED
@@ -0,0 +1,385 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Decoder for Panoptic Recon 3D."""
16
+
17
+ from typing import Optional, List
18
+ import torch
19
+ from torch import nn
20
+ import MinkowskiEngine as Me
21
+ from ..utils.sparse_tensor import sparse_cat_union
22
+ from ..blocks import BasicBlock3D, SparseBasicBlock3D
23
+
24
+
25
+ class SparseToDense(nn.Module):
26
+ """Sparse to dense module."""
27
+
28
+ def __init__(self, input_size):
29
+ """Initialize the sparse to dense module."""
30
+ super().__init__()
31
+ assert len(input_size) == 3
32
+ self.input_size = input_size
33
+
34
+ def forward(self, feature: Me.SparseTensor) -> torch.Tensor:
35
+ """Forward pass."""
36
+ batch_size = len(feature.decomposed_coordinates_and_features[0])
37
+ feat_dim = feature.F.shape[-1]  # channel count of the dense output comes from the feature matrix
38
+
39
+ out_size = (
40
+ torch.div(
41
+ torch.tensor(self.input_size),
42
+ torch.tensor(feature.tensor_stride),
43
+ rounding_mode="floor"
44
+ )
45
+ ).tolist()
46
+ shape = torch.Size([batch_size, feat_dim, *out_size])
47
+ min_coordinate = torch.IntTensor([0, 0, 0])
48
+
49
+ mask = (feature.C[:, 1] < self.input_size[0]) & \
50
+ (feature.C[:, 2] < self.input_size[1]) & \
51
+ (feature.C[:, 3] < self.input_size[2])
52
+ mask = mask & (feature.C[:, 1] >= 0) & (feature.C[:, 2] >= 0) & (feature.C[:, 3] >= 0)
53
+
54
+ feature = Me.MinkowskiPruning()(feature, mask)
55
+ dense = feature.dense(shape, min_coordinate=min_coordinate)[0]
56
+
57
+ return dense
58
+
59
+
60
+ class FrustumDecoder(nn.Module):
61
+ """Frustum decoder module."""
62
+
63
+ def __init__(self, cfg) -> None:
64
+ """Initialize the frustum decoder module."""
65
+ super().__init__()
66
+
67
+ num_output_features = cfg.model.frustum3d.unet_output_channels
68
+ num_features = cfg.model.frustum3d.unet_features
69
+ sign_channel = cfg.model.projection.sign_channel
70
+ mask_dim = cfg.model.sem_seg_head.mask_dim
71
+ depth_dim = cfg.model.sem_seg_head.depth_dim
72
+ num_classes = cfg.model.sem_seg_head.num_classes
73
+ frustum_dims = cfg.model.frustum3d.grid_dimensions
74
+ frustum_dims = [frustum_dims] * 3
75
+
76
+ self.use_ms_features = cfg.model.frustum3d.use_multi_scale
77
+ self.truncation = cfg.model.frustum3d.truncation
78
+
79
+ if cfg.dataset.name == 'matterport':
80
+ ms_feature_channels = cfg.model.sem_seg_head.convs_dim
81
+ else:
82
+ ms_feature_channels = cfg.model.sem_seg_head.convs_dim + \
83
+ cfg.model.sem_seg_head.num_classes + cfg.model.frustum3d.signed_channel
84
+
85
+ # input encoding
86
+ self.input_dims = [2 if sign_channel else 1, mask_dim + depth_dim, num_classes]
87
+ self.input_encoders = nn.ModuleList()
88
+ for input_dim in self.input_dims:
89
+ downsample = nn.Sequential(
90
+ Me.MinkowskiConvolution(
91
+ input_dim, num_features,
92
+ kernel_size=1, stride=1,
93
+ bias=True, dimension=3
94
+ ),
95
+ Me.MinkowskiInstanceNorm(num_features),
96
+ )
97
+ self.input_encoders.append(
98
+ SparseBasicBlock3D(
99
+ input_dim, num_features,
100
+ downsample=downsample
101
+ )
102
+ )
103
+
104
+ self.level_encoders = nn.ModuleList([
105
+ self.make_encoder(len(self.input_encoders) * num_features, num_features),
106
+ self.make_encoder(num_features, num_features * 2),
107
+ self.make_encoder(num_features * 2, num_features * 4, is_sparse=False),
108
+ self.make_encoder(num_features * 4, num_features * 8, is_sparse=False),
109
+ self.make_encoder(num_features * 8, num_features * 8, is_sparse=False),
110
+ ])
111
+
112
+ sparse_to_dense = SparseToDense(frustum_dims)
113
+
114
+ if self.use_ms_features:
115
+ self.feature_adapters = nn.ModuleList([
116
+ self.make_adapter(ms_feature_channels, num_features),
117
+ self.make_adapter(ms_feature_channels, num_features * 2),
118
+ self.make_adapter(ms_feature_channels, num_features * 4, [sparse_to_dense]),
119
+ ])
120
+ else:
121
+ self.feature_adapters = None
122
+
123
+ self.enc_level_conversion = nn.ModuleList([
124
+ nn.Identity(),
125
+ sparse_to_dense,
126
+ nn.Identity(),
127
+ nn.Identity(),
128
+ ])
129
+
130
+ self.level_decoders = nn.ModuleList([
131
+ self.make_decoder(num_features * 3, num_output_features),
132
+ self.make_decoder(
133
+ num_features * 6, num_features * 2,
134
+ extra_layers=[SparseBasicBlock3D(num_features * 2, num_features * 2)]
135
+ ),
136
+ self.make_decoder(num_features * 8, num_features * 2, is_sparse=False),
137
+ self.make_decoder(num_features * 16, num_features * 4, is_sparse=False),
138
+ self.make_decoder(num_features * 8, num_features * 8, is_sparse=False),
139
+ ])
140
+
141
+ # occupancy heads
142
+ self.level_occupancy_heads = nn.ModuleList([
143
+ nn.Sequential(
144
+ Me.MinkowskiInstanceNorm(num_output_features),
145
+ Me.MinkowskiReLU(inplace=True),
146
+ SparseBasicBlock3D(num_output_features, num_output_features),
147
+ Me.MinkowskiConvolution(num_output_features, 1, kernel_size=3, bias=True, dimension=3),
148
+ ),
149
+ Me.MinkowskiLinear(num_features * 2, 1),
150
+ nn.Linear(num_features * 4, 1),
151
+ ])
152
+
153
+ # panoptic heads
154
+ self.level_segm_embeddings = nn.ModuleList([
155
+ nn.Sequential(
156
+ Me.MinkowskiInstanceNorm(num_output_features),
157
+ Me.MinkowskiReLU(inplace=True),
158
+ SparseBasicBlock3D(num_output_features, num_output_features),
159
+ ),
160
+ SparseBasicBlock3D(num_features * 3, num_features * 3),
161
+ nn.Sequential(
162
+ BasicBlock3D(num_features * 4, num_features * 4),
163
+ BasicBlock3D(num_features * 4, num_features * 4),
164
+ )
165
+ ])
166
+ self.level_segm_query_projection = nn.ModuleList([
167
+ nn.Linear(mask_dim, num_output_features),
168
+ nn.Linear(mask_dim, num_features * 3),
169
+ nn.Linear(mask_dim, num_features * 4),
170
+ ])
171
+
172
+ # geometry head
173
+ self.geometry_head = nn.Sequential(
174
+ Me.MinkowskiInstanceNorm(num_output_features),
175
+ Me.MinkowskiReLU(inplace=True),
176
+ SparseBasicBlock3D(num_output_features, num_output_features),
177
+ Me.MinkowskiConvolution(num_output_features, 1, kernel_size=3, bias=True, dimension=3),
178
+ )
179
+
180
+ self.register_buffer("frustum_dimensions", torch.tensor(frustum_dims), persistent=False)
181
+
182
+ @staticmethod
183
+ def forward_sparse_segm(segm_features, queries):
184
+ """Forward pass for sparse segmentation."""
185
+ features = segm_features.decomposed_features
186
+ segms = torch.cat(
187
+ [torch.mm(features[idx], queries[idx].T) for idx in range(len(features))], dim=0
188
+ )
189
+ return Me.SparseTensor(
190
+ segms,
191
+ coordinate_manager=segm_features.coordinate_manager,
192
+ coordinate_map_key=segm_features.coordinate_map_key,
193
+ )
194
+
195
+ @staticmethod
196
+ def make_encoder(input_dim, output_dim, is_sparse=True):
197
+ """Make encoder module."""
198
+ if is_sparse:
199
+ downsample = nn.Sequential(
200
+ Me.MinkowskiConvolution(
201
+ input_dim, output_dim, kernel_size=4, stride=2, bias=True, dimension=3
202
+ ),
203
+ Me.MinkowskiInstanceNorm(output_dim),
204
+ )
205
+ module = nn.Sequential(
206
+ SparseBasicBlock3D(input_dim, output_dim, stride=2, downsample=downsample),
207
+ SparseBasicBlock3D(output_dim, output_dim),
208
+ )
209
+ else:
210
+ downsample = nn.Conv3d(
211
+ input_dim, output_dim,
212
+ kernel_size=4, stride=2,
213
+ padding=1, bias=False
214
+ )
215
+ module = nn.Sequential(
216
+ BasicBlock3D(input_dim, output_dim, stride=2, downsample=downsample),
217
+ BasicBlock3D(output_dim, output_dim),
218
+ )
219
+ return module
220
+
221
+ @staticmethod
222
+ def make_decoder(input_dim, output_dim, is_sparse=True, extra_layers: Optional[List] = None):
223
+ """Make decoder module."""
224
+ if extra_layers is None:
225
+ extra_layers = []
226
+ if is_sparse:
227
+ return nn.Sequential(
228
+ Me.MinkowskiConvolutionTranspose(
229
+ input_dim, output_dim, kernel_size=4,
230
+ stride=2, bias=False, dimension=3, expand_coordinates=True
231
+ ),
232
+ Me.MinkowskiInstanceNorm(output_dim),
233
+ Me.MinkowskiReLU(inplace=True),
234
+ *extra_layers,
235
+ )
236
+ else:
237
+ return nn.Sequential(
238
+ nn.ConvTranspose3d(input_dim, output_dim, kernel_size=4, stride=2, padding=1, bias=False),
239
+ nn.InstanceNorm3d(output_dim),
240
+ nn.ReLU(inplace=True),
241
+ *extra_layers,
242
+ )
243
+
244
+ @staticmethod
245
+ def make_adapter(input_dim, output_dim, extra_layers: Optional[List] = None):
246
+ """Make adapter module."""
247
+ if extra_layers is None:
248
+ extra_layers = []
249
+ downsample = nn.Sequential(
250
+ Me.MinkowskiConvolution(input_dim, output_dim, kernel_size=1, stride=1, bias=True, dimension=3),
251
+ Me.MinkowskiInstanceNorm(output_dim),
252
+ )
253
+ return nn.Sequential(
254
+ SparseBasicBlock3D(input_dim, output_dim, downsample=downsample),
255
+ *extra_layers,
256
+ )
257
+
258
+ def forward(
259
+ self, ms_features: List[Me.SparseTensor],
260
+ features: Me.SparseTensor, segm_queries, frustum_mask
261
+ ):
262
+ """Forward pass."""
263
+ start_dim = 0
264
+ encoded_inputs = []
265
+ cm = features.coordinate_manager
266
+ key = features.coordinate_map_key
267
+ for dim, encoder in zip(self.input_dims, self.input_encoders):
268
+ encoded_inputs.append(
269
+ encoder(Me.SparseTensor(
270
+ features.F[:, start_dim:start_dim + dim], coordinate_manager=cm, coordinate_map_key=key
271
+ ))
272
+ )
273
+ start_dim += dim
274
+ encoded_inputs = Me.cat(*encoded_inputs)
275
+
276
+ lvls = len(self.level_encoders)
277
+
278
+ # high to low resolution
279
+ encoder_outputs = []
280
+ encoder_inputs = [encoded_inputs]
281
+
282
+ for idx in range(len(self.level_encoders)):
283
+ encoded = self.level_encoders[idx](encoder_inputs[idx])
284
+ if self.use_ms_features and idx < len(self.feature_adapters):
285
+ feat = self.feature_adapters[idx](ms_features[idx])
286
+
287
+ if isinstance(encoded, torch.Tensor):
288
+ encoded = encoded + feat
289
+ else:
290
+ feat = Me.SparseTensor(
291
+ feat.F, coordinates=feat.C,
292
+ tensor_stride=feat.tensor_stride,
293
+ coordinate_manager=encoded.coordinate_manager
294
+ )
295
+ encoded = encoded + feat
296
+
297
+ encoder_outputs.append(encoded)
298
+
299
+ if idx < lvls - 1:
300
+ encoder_inputs.append(self.enc_level_conversion[idx](encoded))
301
+
302
+ # low to high resolution
303
+ decoder_outputs = []
304
+ decoder_inputs = [encoder_outputs[-1]]
305
+ pred_occupancies = []
306
+ pred_segms = []
307
+ pred_geometry = None
308
+
309
+ # U-Net
310
+ for idx in reversed(range(lvls)):
311
+ decoded = self.level_decoders[idx](decoder_inputs[lvls - 1 - idx])
312
+ decoder_outputs.append(decoded)
313
+
314
+ if idx <= 1:
315
+ # level 128, 256
316
+ occupancy = self.level_occupancy_heads[idx](decoded)
317
+ # mask invalid voxels outside of frustum
318
+ valid_mask = (
319
+ (occupancy.C[:, 1:] >= 0) & (occupancy.C[:, 1:] < self.frustum_dimensions)
320
+ ).all(-1)
321
+ pred_occupancies.append(Me.MinkowskiPruning()(occupancy, valid_mask))
322
+ pruning_mask = (Me.MinkowskiSigmoid()(occupancy).F.squeeze(-1) > 0.5) & valid_mask
323
+ sparse_out = Me.MinkowskiPruning()(decoded, pruning_mask)
324
+
325
+ if idx > 0:
326
+ # level 128
327
+ sparse_out = sparse_cat_union(encoder_outputs[idx - 1], sparse_out)
328
+ valid_mask = (
329
+ (sparse_out.C[:, 1:] >= 0) & (sparse_out.C[:, 1:] < self.frustum_dimensions)
330
+ ).all(-1)
331
+ decoder_inputs.append(Me.MinkowskiPruning()(sparse_out, valid_mask))
332
+ else:
333
+ # level 256
334
+ pred_geometry = self.geometry_head(sparse_out)
335
+ predicted_values = pred_geometry.F
336
+ predicted_values = torch.clamp(predicted_values, 0.0, self.truncation)
337
+ pred_geometry = Me.SparseTensor(
338
+ predicted_values,
339
+ coordinate_manager=pred_geometry.coordinate_manager,
340
+ coordinate_map_key=pred_geometry.coordinate_map_key,
341
+ )
342
+ valid_mask = (
343
+ (pred_geometry.C[:, 1:] >= 0) & (pred_geometry.C[:, 1:] < self.frustum_dimensions)
344
+ ).all(-1)
345
+ pred_geometry = Me.MinkowskiPruning()(pred_geometry, valid_mask)
346
+
347
+ queries = self.level_segm_query_projection[idx](segm_queries)
348
+ segm_features = self.level_segm_embeddings[idx](sparse_out)
349
+ pred_segm = self.forward_sparse_segm(segm_features, queries)
350
+ valid_mask = (
351
+ (pred_segm.C[:, 1:] >= 0) & (pred_segm.C[:, 1:] < self.frustum_dimensions)
352
+ ).all(-1)
353
+ pred_segms.append(Me.MinkowskiPruning()(pred_segm, valid_mask))
354
+
355
+ elif idx == 2:
356
+ # level 64
357
+ decoded = torch.cat([encoder_inputs[idx], decoded], dim=1)
358
+ occupancy = self.level_occupancy_heads[idx](decoded.permute(0, 2, 3, 4, 1)).squeeze(-1)
359
+ pred_occupancies.append(occupancy.masked_fill(~frustum_mask.squeeze(1), -torch.inf))
360
+
361
+ queries = self.level_segm_query_projection[idx](segm_queries)
362
+ segm_features = self.level_segm_embeddings[idx](decoded)
363
+ pred_segm = torch.einsum("bqc,bchwd->bqhwd", queries, segm_features)
364
+ pred_segms.append(pred_segm.masked_fill(~frustum_mask, -torch.inf))
365
+
366
+ pruning_mask = (occupancy.sigmoid() > 0.5) & frustum_mask.squeeze(1)
367
+ coords = pruning_mask.nonzero()
368
+ sparse_out = decoded[coords[:, 0], :, coords[:, 1], coords[:, 2], coords[:, 3]]
369
+ encoded = encoder_outputs[idx - 1]
370
+ stride = encoded.tensor_stride
371
+ coords = coords.clone()
372
+ coords[:, 1:] *= torch.tensor(stride, device=coords.device)
373
+ sparse_out = Me.SparseTensor(
374
+ sparse_out, coordinates=coords.int().contiguous(),
375
+ tensor_stride=stride, coordinate_manager=cm
376
+ )
377
+ decoder_inputs.append(sparse_cat_union(encoded, sparse_out))
378
+ else:
379
+ decoder_inputs.append(torch.cat([encoder_inputs[idx], decoded], dim=1))
380
+
381
+ return {
382
+ "pred_geometry": pred_geometry,
383
+ "pred_occupancies": pred_occupancies,
384
+ "pred_segms": pred_segms,
385
+ }
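At the dense-to-sparse hand-off (the `idx == 2` level in the decoder above), occupancy logits are thresholded and decoder features are gathered only at occupied voxels. A standalone PyTorch sketch of that step with toy sizes; the real code additionally masks by the frustum and wraps the result in a MinkowskiEngine sparse tensor.

```python
import torch

b, c, d = 1, 8, 16
decoded = torch.randn(b, c, d, d, d)      # dense decoder features
occupancy = torch.randn(b, d, d, d)       # occupancy logits at the same resolution

keep = occupancy.sigmoid() > 0.5          # prune voxels predicted as empty
coords = keep.nonzero()                   # (n, 4): batch index and x, y, z
feats = decoded[coords[:, 0], :, coords[:, 1], coords[:, 2], coords[:, 3]]
print(coords.shape, feats.shape)          # only occupied voxels carry features forward
```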
nvpanoptix_3d/reconstruction/frustum.py ADDED
@@ -0,0 +1,112 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Frustum generation for Panoptic Recon 3D."""
16
+
17
+ from typing import Optional
18
+ import torch
19
+
20
+
21
+ def generate_frustum(
22
+ image_size: torch.Tensor, intrinsic_inv: torch.Tensor,
23
+ depth_min: float, depth_max: float, transform: Optional[torch.Tensor] = None
24
+ ):
25
+ """Generate frustum.
26
+ Args:
27
+ image_size: Image size.
28
+ intrinsic_inv: Inverse intrinsic matrix.
29
+ depth_min: Minimum depth.
30
+ depth_max: Maximum depth.
31
+ transform: Transform matrix.
32
+ Returns:
33
+ Frustum.
34
+ """
35
+ x = image_size[0]
36
+ y = image_size[1]
37
+
38
+ eight_points = torch.tensor(
39
+ [
40
+ [0.0, 0.0, depth_min, 1.0],
41
+ [0.0, y * depth_min, depth_min, 1.0],
42
+ [x * depth_min, y * depth_min, depth_min, 1.0],
43
+ [x * depth_min, 0.0, depth_min, 1.0],
44
+ [0.0, 0.0, depth_max, 1.0],
45
+ [0.0, y * depth_max, depth_max, 1.0],
46
+ [x * depth_max, y * depth_max, depth_max, 1.0],
47
+ [x * depth_max, 0.0, depth_max, 1.0]
48
+ ],
49
+ device=intrinsic_inv.device, dtype=intrinsic_inv.dtype
50
+ )
51
+
52
+ frustum = intrinsic_inv @ eight_points.T
53
+
54
+ if transform is not None:
55
+ frustum = transform @ frustum
56
+
57
+ frustum = frustum.T
58
+
59
+ return frustum[:, :3]
60
+
61
+
62
+ def generate_frustum_volume(frustum: torch.Tensor, voxel_size: float):
63
+ """Generate frustum volume.
64
+ Args:
65
+ frustum: Frustum.
66
+ voxel_size: Voxel size.
67
+ Returns:
68
+ Frustum volume.
69
+ """
70
+ max_x = torch.max(frustum[:, 0]) / voxel_size
71
+ max_y = torch.max(frustum[:, 1]) / voxel_size
72
+ max_z = torch.max(frustum[:, 2]) / voxel_size
73
+ min_x = torch.min(frustum[:, 0]) / voxel_size
74
+ min_y = torch.min(frustum[:, 1]) / voxel_size
75
+ min_z = torch.min(frustum[:, 2]) / voxel_size
76
+
77
+ dim_x = torch.ceil(max_x - min_x)
78
+ dim_y = torch.ceil(max_y - min_y)
79
+ dim_z = torch.ceil(max_z - min_z)
80
+
81
+ camera2frustum = torch.as_tensor(
82
+ [
83
+ [1.0 / voxel_size, 0, 0, -min_x],
84
+ [0, 1.0 / voxel_size, 0, -min_y],
85
+ [0, 0, 1.0 / voxel_size, -min_z],
86
+ [0, 0, 0, 1.0]
87
+ ],
88
+ dtype=frustum.dtype, device=frustum.device
89
+ )
90
+
91
+ return torch.stack((dim_x, dim_y, dim_z)), camera2frustum
92
+
93
+
94
+ def compute_camera2frustum_transform(
95
+ frustum: torch.Tensor, voxel_size: float,
96
+ frustum_dimensions: Optional[torch.Tensor] = None
97
+ ):
98
+ """Compute camera to frustum transform.
99
+ Args:
100
+ frustum: Frustum.
101
+ voxel_size: Voxel size.
102
+ frustum_dimensions: Frustum dimensions.
103
+ Returns:
104
+ Camera to frustum transform.
105
+ """
106
+ dimensions, camera2frustum = generate_frustum_volume(frustum, voxel_size)
107
+ if frustum_dimensions is not None:
108
+ difference = (frustum_dimensions - dimensions).float()
109
+ padding_offsets = torch.div(difference, 2, rounding_mode="floor")
110
+ return camera2frustum, padding_offsets
111
+ else:
112
+ return camera2frustum
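A possible usage of the helpers above, assuming this file is importable as `nvpanoptix_3d.reconstruction.frustum`; the intrinsics, depth range and voxel size are illustrative.

```python
import torch
from nvpanoptix_3d.reconstruction.frustum import (
    generate_frustum, generate_frustum_volume, compute_camera2frustum_transform
)

intrinsic = torch.tensor([
    [277.1,   0.0, 160.0, 0.0],
    [  0.0, 311.8, 120.0, 0.0],
    [  0.0,   0.0,   1.0, 0.0],
    [  0.0,   0.0,   0.0, 1.0],
])
image_size = torch.tensor([320.0, 240.0])   # (width, height)

# Eight frustum corner points in camera space.
frustum = generate_frustum(image_size, torch.inverse(intrinsic), depth_min=0.4, depth_max=6.0)
dims, cam2frustum = generate_frustum_volume(frustum, voxel_size=0.03)
print(frustum.shape, dims)                  # corner points and the voxel-grid extent

# With target dimensions, the transform also returns centering offsets.
cam2frustum, offsets = compute_camera2frustum_transform(
    frustum, 0.03, frustum_dimensions=torch.tensor([256.0, 256.0, 256.0])
)
print(offsets)
```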
nvpanoptix_3d/reconstruction/reprojection.py ADDED
@@ -0,0 +1,235 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Sparse projection for Panoptic Recon 3D."""
16
+
17
+ import torch
18
+ from torch import nn
19
+ from torch.nn import functional as F
20
+ import MinkowskiEngine as Me
21
+
22
+ from ..utils.point_features import point_sample
23
+ from .frustum import generate_frustum, compute_camera2frustum_transform
24
+
25
+
26
+ class SparseProjection(nn.Module):
27
+ """Sparse projection module."""
28
+
29
+ def __init__(
30
+ self, cfg,
31
+ truncation=3.0,
32
+ sign_channel=True,
33
+ depth_min=0.4,
34
+ depth_max=6.0,
35
+ voxel_size=0.03,
36
+ frustum_dims=256
37
+ ):
38
+ """Initialize the sparse projection module."""
39
+ super().__init__()
40
+
41
+ self.truncation = cfg.model.frustum3d.truncation
42
+ self.sign_channel = cfg.model.projection.sign_channel
43
+ self.depth_min = cfg.dataset.depth_min
44
+ self.depth_max = cfg.dataset.depth_max
45
+ self.voxel_size = cfg.model.projection.voxel_size
46
+ self.register_buffer(
47
+ "frustum_dimensions",
48
+ torch.tensor([frustum_dims, frustum_dims, frustum_dims]),
49
+ persistent=False
50
+ )
51
+
52
+ @property
53
+ def device(self):
54
+ """Get the device of the sparse projection module."""
55
+ return self.frustum_dimensions.device
56
+
57
+ @staticmethod
58
+ def to_sparse_tensor(features, coordinates, stride=1):
59
+ """Convert features and coordinates to a sparse tensor."""
60
+ ms_sparse_features = torch.cat(features, dim=0)
61
+ batched_coordinates = Me.utils.batched_coordinates(coordinates, device=ms_sparse_features.device)
62
+ batched_coordinates[:, 1:] *= stride
63
+ tensor = Me.SparseTensor(
64
+ features=ms_sparse_features,
65
+ coordinates=batched_coordinates,
66
+ tensor_stride=stride,
67
+ quantization_mode=Me.SparseTensorQuantizationMode.RANDOM_SUBSAMPLE
68
+ )
69
+ return tensor
70
+
71
+ @staticmethod
72
+ def projection(
73
+ frustum, voxel_size, frustum_dimensions,
74
+ truncation, intrinsic_inverse, depth,
75
+ image_size, feat_size, near_clip, far_clip
76
+ ):
77
+ """
78
+ Projection.
79
+ Args:
80
+ frustum: Frustum.
81
+ voxel_size: Voxel size.
82
+ frustum_dimensions: Frustum dimensions.
83
+ truncation: Truncation.
84
+ intrinsic_inverse: Inverse intrinsic matrix.
85
+ depth: Depth.
86
+ image_size: Image size.
87
+ feat_size: Feature size.
88
+ near_clip: Near clip.
89
+ far_clip: Far clip.
90
+ Returns:
91
+ num_repetition: number of samples replicated along each camera ray (2 * truncation + 1).
92
+ segm_sampling_grid: normalized pixel coordinates used to sample the segmentation outputs.
93
+ feat_sampling_grid: normalized pixel coordinates used to sample the 2D feature maps.
94
+ flatten_coordinates: frustum-grid coordinates of all ray samples, flattened and padded.
95
+ coordinates_z: frustum-grid z coordinate of each point before the ray offsets are added.
96
+ voxel_offsets: integer offsets in [-truncation, truncation] applied along each ray.
97
+ """
98
+ camera2frustum, padding_offsets = compute_camera2frustum_transform(
99
+ frustum, voxel_size,
100
+ frustum_dimensions=frustum_dimensions
101
+ )
102
+
103
+ depth = depth.clone()
104
+ depth[depth < near_clip] = 0
105
+ depth[depth > far_clip] = 0
106
+ depth_pixels_xy = depth.nonzero(as_tuple=False)
107
+ device = depth_pixels_xy.device
108
+
109
+ if depth_pixels_xy.shape[0] == 0:
110
+ depth_pixels_xy = torch.tensor(
111
+ [[depth.shape[0] // 2, depth.shape[1] // 2]], device=device
112
+ )
113
+ depth_pixels_z = depth[depth_pixels_xy[:, 0], depth_pixels_xy[:, 1]].reshape(-1).float()
114
+
115
+ depth_pixels_xy = depth_pixels_xy.flip(-1).float()
116
+ normalized_depth_pixels_xy = depth_pixels_xy / torch.tensor(
117
+ [depth.shape[-1], depth.shape[-2]], device=device
118
+ )
119
+ xv, yv = (normalized_depth_pixels_xy * torch.tensor(
120
+ image_size, device=device) * depth_pixels_z[:, None]
121
+ ).unbind(-1)
122
+ # Use separate size for feature maps due to size divisibility padding
123
+ feat_sampling_grid = depth_pixels_xy / torch.tensor(feat_size, device=device)
124
+
125
+ depth_pixels = torch.stack([xv, yv, depth_pixels_z, torch.ones_like(depth_pixels_z)])
126
+ pointcloud = torch.mm(intrinsic_inverse.float(), depth_pixels.float())
127
+ grid_coordinates = torch.mm(camera2frustum.float(), pointcloud).t()[:, :3].contiguous()
128
+
129
+ # projective sdf encoding
130
+ # repeat truncation, add / subtract z-offset
131
+ num_repetition = int(truncation * 2) + 1
132
+ grid_coordinates = grid_coordinates.unsqueeze(1).repeat(1, num_repetition, 1)
133
+ voxel_offsets = torch.arange(-truncation, truncation + 1, 1.0, device=device).view(1, -1, 1)
134
+ coordinates_z = grid_coordinates[:, :, 2].clone()
135
+ grid_coordinates[:, :, 2] += voxel_offsets[:, :, 0]
136
+
137
+ num_points = grid_coordinates.size(0)
138
+
139
+ flatten_coordinates = grid_coordinates.view(num_points * num_repetition, 3)
140
+ # pad to 256,256,256
141
+ flatten_coordinates = flatten_coordinates + padding_offsets
142
+ return num_repetition, normalized_depth_pixels_xy, \
143
+ feat_sampling_grid, flatten_coordinates, coordinates_z, voxel_offsets
144
+
145
+ def forward(self, multi_scale_features, encoder_features, batched_inputs):
146
+ """Forward pass. Returns (multi-scale sparse features, sparse encoder features)."""
147
+ sparse_ms_coordinates = [[] for _ in range(len(multi_scale_features))]
148
+ sparse_ms_features = [[] for _ in range(len(multi_scale_features))]
149
+ sparse_enc_features = []
150
+ sparse_enc_coordinates = []
151
+
152
+ # Process each sample in the batch individually
153
+ for idx, inputs in enumerate(batched_inputs):
154
+ # Get GT intrinsic matrix
155
+ intrinsic = inputs["intrinsic"].to(self.device)
156
+ image_size = inputs["image_size"]
157
+ padded_size = inputs["padded_size"]
158
+ intrinsic_inverse = torch.inverse(intrinsic)
159
+
160
+ frustum = generate_frustum(
161
+ image_size, intrinsic_inverse,
162
+ self.depth_min, self.depth_max
163
+ )
164
+
165
+ num_repetition, segm_sampling_grid, feat_sampling_grid, \
166
+ flatten_coordinates, coordinates_z, voxel_offsets = \
167
+ self.projection(
168
+ frustum, self.voxel_size,
169
+ self.frustum_dimensions, self.truncation,
170
+ intrinsic_inverse, inputs["depth"],
171
+ image_size, (padded_size[0] // 2, padded_size[1] // 2),
172
+ self.depth_min, self.depth_max
173
+ )
174
+
175
+ df_values = coordinates_z - coordinates_z.int()
176
+ df_values = df_values + voxel_offsets.squeeze(-1)
177
+ df_values.unsqueeze_(-1)
178
+
179
+ # encode sign and values in 2 different channels
180
+ if self.sign_channel:
181
+ sign = torch.sign(df_values)
182
+ value = torch.abs(df_values)
183
+ df_values = torch.cat([sign, value], dim=-1)
184
+
185
+ # segm features
186
+ sem_seg = inputs["sem_seg"]
187
+ sampled_segm_features = point_sample(sem_seg[None], segm_sampling_grid[None], align_corners=False)[0]
188
+
189
+ # encoder features
190
+ sampled_enc_features = point_sample(
191
+ encoder_features[[idx]],
192
+ feat_sampling_grid[None],
193
+ align_corners=False,
194
+ )[0]
195
+ sampled_enc_features = torch.cat([sampled_enc_features, sampled_segm_features], dim=0)
196
+ sampled_enc_features = sampled_enc_features.permute(1, 0).unsqueeze(1).repeat(1, num_repetition, 1)
197
+ sampled_enc_features = torch.cat([df_values, sampled_enc_features], dim=-1)
198
+
199
+ flat_features = sampled_enc_features.flatten(0, -2)
200
+ sparse_enc_coordinates.append(flatten_coordinates)
201
+ sparse_enc_features.append(flat_features)
202
+
203
+ # multi-scale features
204
+ for lvl, feat in enumerate(multi_scale_features):
205
+ ratio = feat.shape[-1] / encoder_features.shape[-1]
206
+ level_depth = F.interpolate(
207
+ inputs["depth"][None, None], scale_factor=ratio, mode="nearest"
208
+ ).squeeze()
209
+ num_repetition, segm_sampling_grid, feat_sampling_grid, flatten_coordinates, *__ = \
210
+ self.projection(
211
+ frustum, self.voxel_size / ratio, self.frustum_dimensions * ratio,
212
+ round(ratio * self.truncation), intrinsic_inverse, level_depth,
213
+ image_size, (feat.shape[-1] * 2, feat.shape[-2] * 2),
214
+ self.depth_min, self.depth_max,
215
+ )
216
+ sampled_features = point_sample(
217
+ feat[[idx]],
218
+ feat_sampling_grid[None],
219
+ align_corners=False,
220
+ )[0]
221
+ sampled_features = \
222
+ sampled_features.permute(1, 0).unsqueeze(1).repeat(1, num_repetition, 1).flatten(0, -2)
223
+ sparse_ms_features[lvl].append(sampled_features)
224
+ # Resize feature volume
225
+ sparse_ms_coordinates[lvl].append(flatten_coordinates.clone())
226
+
227
+ # Batch
228
+ sparse_enc_features = self.to_sparse_tensor(sparse_enc_features, sparse_enc_coordinates)
229
+ strides = [2, 4, 8]
230
+ sparse_ms_features = [
231
+ self.to_sparse_tensor(feats, coords, stride=stride)
232
+ for feats, coords, stride in zip(sparse_ms_features, sparse_ms_coordinates, strides)
233
+ ]
234
+
235
+ return sparse_ms_features, sparse_enc_features
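
The projective SDF encoding in `SparseProjection.projection` repeats every back-projected depth pixel along the viewing ray and attaches a signed distance value to each copy. Below is a minimal plain-tensor sketch of that step, assuming the default truncation of 3 voxels; the coordinate values are illustrative, not taken from the model.

```python
import torch

truncation = 3.0
num_repetition = int(truncation * 2) + 1  # 7 samples per depth pixel

# three back-projected points in frustum voxel coordinates (illustrative values)
grid_coordinates = torch.tensor([[10.2, 20.7, 55.3],
                                 [12.0, 18.4, 60.1],
                                 [11.5, 25.9, 58.8]])

# repeat each point and shift its z coordinate by -3 ... +3 voxels
grid_coordinates = grid_coordinates.unsqueeze(1).repeat(1, num_repetition, 1)
voxel_offsets = torch.arange(-truncation, truncation + 1, 1.0).view(1, -1, 1)
coordinates_z = grid_coordinates[:, :, 2].clone()
grid_coordinates[:, :, 2] += voxel_offsets[:, :, 0]

# signed distance carried by each repeated voxel: sub-voxel fraction + offset
df_values = (coordinates_z - coordinates_z.int()) + voxel_offsets.squeeze(-1)
print(grid_coordinates.shape, df_values.shape)  # torch.Size([3, 7, 3]) torch.Size([3, 7])
```
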
nvpanoptix_3d/utils/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Utils for Panoptic Recon 3D."""
nvpanoptix_3d/utils/coords_transform.py ADDED
@@ -0,0 +1,232 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Coordinate transform utils."""
16
+
17
+ import torch
18
+ import MinkowskiEngine as Me
19
+ from typing import List
20
+
21
+ from ..reconstruction.frustum import \
22
+ generate_frustum, compute_camera2frustum_transform
23
+
24
+
25
+ def transform_feat3d_coordinates(
26
+ feat3d, intrinsic,
27
+ image_size=(120, 160),
28
+ depth_min=0.4, depth_max=6.0,
29
+ voxel_size=0.03
30
+ ):
31
+ """
32
+ Transform feat3d coordinates to match Uni3D coordinate system
33
+
34
+ Args:
35
+ feat3d: Me.SparseTensor from occupancy-aware lifting
36
+ intrinsic: Camera intrinsic matrix (4x4)
37
+ image_size: tuple of (height, width)
38
+ depth_min, depth_max: depth range
39
+ voxel_size: voxel size in meters
40
+ Returns:
41
+ Me.SparseTensor with transformed coordinates
42
+ """
43
+ device = feat3d.device
44
+ coords = feat3d.C.clone()
45
+
46
+ # step 1: Apply coordinate flip (as done in BackProjection line 33)
47
+ coords[:, 1:3] = 256 - coords[:, 1:3] # flip x, y coordinates
48
+ batch_indices = coords[:, 0].unique()
49
+
50
+ compute_once = True
51
+ if intrinsic.dim() == 3: # batched intrinsics
52
+ # check if all intrinsics are identical
53
+ if len(batch_indices) > 1:
54
+ compute_once = torch.allclose(intrinsic[0:1].expand_as(intrinsic), intrinsic, atol=1e-6)
55
+ intrinsic_ref = intrinsic[0] if compute_once else None
56
+ else:
57
+ intrinsic_ref = intrinsic
58
+
59
+ if compute_once:
60
+ intrinsic_batch = intrinsic_ref
61
+ intrinsic_inverse = torch.inverse(intrinsic_batch)
62
+ frustum = generate_frustum(image_size, intrinsic_inverse, depth_min, depth_max)
63
+ camera2frustum, padding_offsets = compute_camera2frustum_transform(
64
+ frustum.to(device), voxel_size,
65
+ frustum_dimensions=torch.tensor([256, 256, 256], device=device)
66
+ )
67
+ # pre-move to device and pre-compute inverse
68
+ camera2frustum = camera2frustum.to(device)
69
+ padding_offsets = padding_offsets.to(device)
70
+ camera2frustum_inv = torch.inverse(camera2frustum).float()
71
+ ones_offset = torch.tensor([1., 1., 1.], device=device)
72
+
73
+ transformed_coords_list = []
74
+
75
+ for batch_idx in batch_indices:
76
+ batch_mask = coords[:, 0] == batch_idx
77
+ batch_coords = coords[batch_mask, 1:].float() # convert to float once per batch
78
+
79
+ # use pre-computed values or compute per-batch
80
+ if not compute_once:
81
+ intrinsic_batch = intrinsic[int(batch_idx)]
82
+ intrinsic_inverse = torch.inverse(intrinsic_batch)
83
+ frustum = generate_frustum(image_size, intrinsic_inverse, depth_min, depth_max)
84
+ camera2frustum, padding_offsets = compute_camera2frustum_transform(
85
+ frustum.to(device), voxel_size,
86
+ frustum_dimensions=torch.tensor([256, 256, 256], device=device)
87
+ )
88
+ camera2frustum = camera2frustum.float().to(device)
89
+ padding_offsets = padding_offsets.to(device)
90
+ camera2frustum_inv = torch.inverse(camera2frustum).float()
91
+ ones_offset = torch.tensor([1., 1., 1.], device=device)
92
+
93
+ # convert voxel coordinates to world coordinates (reverse of BackProjection)
94
+ batch_coords_adjusted = batch_coords - padding_offsets - ones_offset
95
+
96
+ # convert to homogeneous coordinates
97
+ homogenous_coords = torch.cat([
98
+ batch_coords_adjusted,
99
+ torch.ones(batch_coords_adjusted.shape[0], 1, device=device)
100
+ ], dim=1) # [N_batch, 4]
101
+
102
+ # apply transformations: world space -> frustum space
103
+ world_coords = torch.mm(camera2frustum_inv, homogenous_coords.t())
104
+ final_coords_homog = torch.mm(camera2frustum.float(), world_coords.float())
105
+ final_coords = final_coords_homog.t()[:, :3]
106
+
107
+ # add padding offsets (as done in SparseProjection.projection())
108
+ final_coords = final_coords + padding_offsets
109
+
110
+ # add batch index back
111
+ batch_column = torch.full(
112
+ (final_coords.shape[0], 1),
113
+ batch_idx,
114
+ device=device,
115
+ dtype=torch.float32
116
+ )
117
+ final_batch_coords = torch.cat([batch_column, final_coords], dim=1)
118
+ transformed_coords_list.append(final_batch_coords)
119
+
120
+ transformed_coords = torch.cat(transformed_coords_list, dim=0)
121
+
122
+ transformed_feat3d = Me.SparseTensor(
123
+ features=feat3d.F,
124
+ coordinates=transformed_coords.int(),
125
+ tensor_stride=feat3d.tensor_stride,
126
+ quantization_mode=feat3d.quantization_mode
127
+ )
128
+
129
+ return transformed_feat3d
130
+
131
+
132
+ def fuse_sparse_tensors(tensor1: Me.SparseTensor, tensor2: Me.SparseTensor) -> Me.SparseTensor:
133
+ """
134
+ Efficiently fuse two sparse tensors
135
+ Args:
136
+ tensor1 (Me.SparseTensor): First sparse tensor
137
+ tensor2 (Me.SparseTensor): Second sparse tensor
138
+
139
+ Returns:
140
+ Me.SparseTensor: Fused sparse tensor with concatenated features
141
+ """
142
+ device = tensor1.device
143
+ dtype = tensor1.F.dtype
144
+
145
+ # get coordinates and features
146
+ coords1, feats1 = tensor1.C, tensor1.F
147
+ coords2, feats2 = tensor2.C, tensor2.F
148
+
149
+ feat_dim1, feat_dim2 = feats1.shape[1], feats2.shape[1]
150
+ fused_feat_dim = feat_dim1 + feat_dim2
151
+
152
+ # concatenate coordinates and create source tracking
153
+ all_coords = torch.cat([coords1, coords2], dim=0)
154
+ n_coords1 = coords1.shape[0]
155
+
156
+ # convert each coordinate row to a view that can be uniqued
157
+ coord_view = all_coords.view(all_coords.shape[0], -1)
158
+
159
+ # use torch.unique with return_inverse to get mapping
160
+ unique_coord_view, inverse_indices = torch.unique(coord_view, dim=0, return_inverse=True)
161
+ unique_coords = unique_coord_view.view(-1, coords1.shape[1])
162
+ n_unique = unique_coords.shape[0]
163
+
164
+ # split inverse indices for each tensor
165
+ inv_indices_1 = inverse_indices[:n_coords1]
166
+ inv_indices_2 = inverse_indices[n_coords1:]
167
+
168
+ # pre-allocate with zeros for automatic padding
169
+ fused_features = torch.zeros(n_unique, fused_feat_dim, device=device, dtype=dtype)
170
+
171
+ # tensor1 features go to positions [0:feat_dim1]
172
+ fused_features[inv_indices_1, :feat_dim1] = feats1
173
+
174
+ # tensor2 features go to positions [feat_dim1:feat_dim1+feat_dim2]
175
+ fused_features[inv_indices_2, feat_dim1:] = feats2
176
+ fused_tensor = Me.SparseTensor(
177
+ features=fused_features,
178
+ coordinates=unique_coords.int(),
179
+ tensor_stride=tensor1.tensor_stride,
180
+ quantization_mode=tensor1.quantization_mode
181
+ )
182
+ return fused_tensor
183
+
184
+
185
+ def generate_multiscale_feat3d(transformed_feat3d: Me.SparseTensor) -> List[Me.SparseTensor]:
186
+ """
187
+ Generate multi-scale sparse 3D features
188
+ from transformed_feat3d to match sparse_multi_scale_features structure.
189
+ Args:
190
+ transformed_feat3d (Me.SparseTensor):
191
+ Input sparse tensor from occupancy-aware lifting (256 grid)
192
+
193
+ Returns:
194
+ List[Me.SparseTensor]: Multi-scale sparse tensors
195
+ at scales [1/2, 1/4, 1/8] corresponding to [128, 64, 32] grid sizes
196
+ """
197
+ device = transformed_feat3d.device
198
+
199
+ # use consistent stride 2 for progressive downsampling
200
+ # this ensures proper 1/2, 1/4, 1/8 scaling from original 256 grid
201
+ pooling_op = Me.MinkowskiMaxPooling(
202
+ kernel_size=3,
203
+ stride=2,
204
+ dimension=3
205
+ ).to(device)
206
+
207
+ multi_scale_feat3d = []
208
+ current_tensor = transformed_feat3d
209
+ target_strides = [2, 4, 8] # Expected final strides for each scale
210
+
211
+ # generate features at each scale by progressive pooling with stride 2
212
+ for _, target_stride in enumerate(target_strides):
213
+ # apply stride-2 pooling to get next scale
214
+ pooled_tensor = pooling_op(current_tensor)
215
+
216
+ # ensure the tensor stride matches expected value
217
+ # the stride should be: 2^(i+1) relative to original
218
+ if pooled_tensor.tensor_stride != target_stride:
219
+ pooled_tensor = Me.SparseTensor(
220
+ features=pooled_tensor.F,
221
+ coordinates=pooled_tensor.C,
222
+ tensor_stride=target_stride,
223
+ quantization_mode=pooled_tensor.quantization_mode
224
+ )
225
+
226
+ multi_scale_feat3d.append(pooled_tensor)
227
+
228
+ # use pooled tensor as input for next scale (progressive downsampling)
229
+ # this gives us: 256 → 128 → 64 → 32 grid sizes
230
+ current_tensor = pooled_tensor
231
+
232
+ return multi_scale_feat3d
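
`fuse_sparse_tensors` merges two voxel sets by taking the union of their coordinates and zero-padding the feature block of whichever input is absent at a voxel. A toy sketch of the `torch.unique` / inverse-index trick with plain tensors (coordinates are (batch, x, y, z); this stands in for the `Me.SparseTensor` inputs):

```python
import torch

coords1 = torch.tensor([[0, 1, 1, 1], [0, 2, 2, 2]])
feats1 = torch.tensor([[1.0, 1.0], [2.0, 2.0]])
coords2 = torch.tensor([[0, 2, 2, 2], [0, 3, 3, 3]])
feats2 = torch.tensor([[9.0], [8.0]])

all_coords = torch.cat([coords1, coords2], dim=0)
unique_coords, inverse = torch.unique(all_coords, dim=0, return_inverse=True)

# the overlapping voxel (0, 2, 2, 2) receives both feature blocks, the others keep zeros
fused = torch.zeros(unique_coords.shape[0], feats1.shape[1] + feats2.shape[1])
fused[inverse[:coords1.shape[0]], :feats1.shape[1]] = feats1
fused[inverse[coords1.shape[0]:], feats1.shape[1]:] = feats2
print(unique_coords.shape, fused.shape)  # torch.Size([3, 4]) torch.Size([3, 3])
```
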
nvpanoptix_3d/utils/frustum.py ADDED
@@ -0,0 +1,192 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """ Frustum utilities, mostly using numpy. """
16
+
17
+ import math
18
+ import torch
19
+ import numpy as np
20
+ from typing import Tuple
21
+
22
+
23
+ def frustum2planes(frustum: np.ndarray) -> dict:
24
+ """Convert frustum to planes.
25
+ Args:
26
+ frustum: Frustum.
27
+ Returns:
28
+ Planes.
29
+ """
30
+ planes = {}
31
+ # normal towards inside
32
+ # near
33
+ a = frustum[3] - frustum[0]
34
+ b = frustum[1] - frustum[0]
35
+ normal = np.cross(a, b)
36
+ d = -np.dot(normal, frustum[0])
37
+ planes["near"] = np.array([normal[0], normal[1], normal[2], d])
38
+
39
+ # far
40
+ a = frustum[5] - frustum[4]
41
+ b = frustum[7] - frustum[4]
42
+ normal = np.cross(a, b)
43
+ d = -np.dot(normal, frustum[4])
44
+ planes["far"] = np.array([normal[0], normal[1], normal[2], d])
45
+
46
+ # left
47
+ a = frustum[5] - frustum[1]
48
+ b = frustum[0] - frustum[1]
49
+ normal = np.cross(a, b)
50
+ d = -np.dot(normal, frustum[1])
51
+ planes["left"] = np.array([normal[0], normal[1], normal[2], d])
52
+
53
+ # right
54
+ a = frustum[3] - frustum[2]
55
+ b = frustum[6] - frustum[2]
56
+ normal = np.cross(a, b)
57
+ d = -np.dot(normal, frustum[2])
58
+ planes["right"] = np.array([normal[0], normal[1], normal[2], d])
59
+
60
+ # top
61
+ a = frustum[4] - frustum[0]
62
+ b = frustum[3] - frustum[0]
63
+ normal = np.cross(a, b)
64
+ d = -np.dot(normal, frustum[0])
65
+ planes["top"] = np.array([normal[0], normal[1], normal[2], d])
66
+
67
+ # bottom
68
+ a = frustum[2] - frustum[1]
69
+ b = frustum[5] - frustum[1]
70
+ normal = np.cross(a, b)
71
+ d = -np.dot(normal, frustum[1])
72
+ planes["bottom"] = np.array([normal[0], normal[1], normal[2], d])
73
+
74
+ return planes
75
+
76
+
77
+ def frustum_culling(points: np.ndarray, frustum: np.ndarray) -> np.ndarray:
78
+ """Cull points outside frustum.
79
+ Args:
80
+ points: Points.
81
+ frustum: Frustum.
82
+ Returns:
83
+ Points inside frustum.
84
+ """
85
+ frustum_planes = frustum2planes(frustum)
86
+ points = np.concatenate([points, np.ones((len(points), 1))], 1)
87
+ flags = np.ones(len(points))
88
+ for _, plane in frustum_planes.items():
89
+ flag = np.dot(points, plane) >= 0
90
+ flags = np.logical_and(flags, flag)
91
+
92
+ return points[flags][:, :3]
93
+
94
+
95
+ def frustum_transform(frustum: np.ndarray, transform: np.ndarray) -> np.ndarray:
96
+ """Transform frustum.
97
+ Args:
98
+ frustum: Frustum.
99
+ transform: Transform matrix.
100
+ Returns:
101
+ Transformed frustum.
102
+ """
103
+ eight_points = np.concatenate([frustum, np.ones((8, 1))], 1).transpose()
104
+ frustum = np.dot(transform, eight_points).transpose()
105
+ return frustum[:, :3]
106
+
107
+
108
+ def generate_frustum(
109
+ image_size: Tuple, intrinsic_inv: np.ndarray,
110
+ depth_min: float, depth_max: float,
111
+ transform: np.ndarray = None
112
+ ) -> np.ndarray:
113
+ """Generate frustum.
114
+ Args:
115
+ image_size: Image size.
116
+ intrinsic_inv: Inverse intrinsic matrix.
117
+ depth_min: Minimum depth.
118
+ depth_max: Maximum depth.
119
+ transform: Transform matrix.
120
+ Returns:
121
+ Frustum.
122
+ """
123
+ x = image_size[1]
124
+ y = image_size[0]
125
+
126
+ eight_points = np.array([[0.0, 0.0, depth_min, 1.0],
127
+ [0.0, y * depth_min, depth_min, 1.0],
128
+ [x * depth_min, y * depth_min, depth_min, 1.0],
129
+ [x * depth_min, 0.0, depth_min, 1.0],
130
+ [0.0, 0.0, depth_max, 1.0],
131
+ [0.0, y * depth_max, depth_max, 1.0],
132
+ [x * depth_max, y * depth_max, depth_max, 1.0],
133
+ [x * depth_max, 0.0, depth_max, 1.0]]).transpose()
134
+
135
+ frustum = np.dot(intrinsic_inv, eight_points)
136
+
137
+ if transform is not None:
138
+ frustum = np.dot(transform, frustum)
139
+
140
+ frustum = frustum.transpose()
141
+
142
+ return frustum[:, :3]
143
+
144
+
145
+ def generate_frustum_volume(frustum: np.ndarray, voxel_size: float) -> Tuple:
146
+ """Generate frustum volume.
147
+ Args:
148
+ frustum: Frustum.
149
+ voxel_size: Voxel size.
150
+ Returns:
151
+ Frustum volume.
152
+ Camera-to-frustum transform.
153
+ """
154
+ max_x = np.max(frustum[:, 0]) / voxel_size
155
+ max_y = np.max(frustum[:, 1]) / voxel_size
156
+ max_z = np.max(frustum[:, 2]) / voxel_size
157
+ min_x = np.min(frustum[:, 0]) / voxel_size
158
+ min_y = np.min(frustum[:, 1]) / voxel_size
159
+ min_z = np.min(frustum[:, 2]) / voxel_size
160
+
161
+ dim_x = math.ceil(max_x - min_x)
162
+ dim_y = math.ceil(max_y - min_y)
163
+ dim_z = math.ceil(max_z - min_z)
164
+
165
+ camera2frustum = np.array([[1.0 / voxel_size, 0, 0, -min_x],
166
+ [0, 1.0 / voxel_size, 0, -min_y],
167
+ [0, 0, 1.0 / voxel_size, -min_z],
168
+ [0, 0, 0, 1.0]])
169
+
170
+ return (dim_x, dim_y, dim_z), camera2frustum
171
+
172
+
173
+ def compute_camera2frustum_transform(
174
+ intrinsic: torch.Tensor, image_size: Tuple,
175
+ depth_min: float, depth_max: float,
176
+ voxel_size: float
177
+ ) -> torch.Tensor:
178
+ """Compute camera-to-frustum transform.
179
+ Args:
180
+ intrinsic: Intrinsic matrix.
181
+ image_size: Image size.
182
+ depth_min: Minimum depth.
183
+ depth_max: Maximum depth.
184
+ voxel_size: Voxel size.
185
+ Returns:
186
+ Camera-to-frustum transform.
187
+ """
188
+ frustum = generate_frustum(image_size, torch.inverse(intrinsic).numpy(), depth_min, depth_max)
189
+ _, camera2frustum = generate_frustum_volume(frustum, voxel_size)
190
+ camera2frustum = torch.from_numpy(camera2frustum).float()
191
+
192
+ return camera2frustum
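
A quick numeric check of the frustum helpers, assuming the default Front3D intrinsics (the same values as `DEFAULT_INTRINSIC` in `preprocessing.py`), a 320x240 image, the 0.4-6.0 m depth range and 3 cm voxels; the printed dimensions are approximate and only meant to show the scale of the grid.

```python
import numpy as np
from nvpanoptix_3d.utils.frustum import generate_frustum, generate_frustum_volume

intrinsic = np.array([[277.1281435, 0.0, 159.5, 0.0],
                      [0.0, 277.1281435, 119.5, 0.0],
                      [0.0, 0.0, 1.0, 0.0],
                      [0.0, 0.0, 0.0, 1.0]])

# eight frustum corner points in camera space; image_size is (height, width)
frustum = generate_frustum((240, 320), np.linalg.inv(intrinsic), 0.4, 6.0)

# voxelize at 3 cm: dims is the frustum extent in voxels, camera2frustum maps
# camera-space metres into that voxel grid
dims, camera2frustum = generate_frustum_volume(frustum, 0.03)
print(frustum.shape, dims)  # (8, 3) and roughly (231, 174, 187)
```
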
nvpanoptix_3d/utils/helper.py ADDED
@@ -0,0 +1,326 @@
1
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Helper utils."""
15
+
16
+ import logging
17
+ import warnings
18
+ import numpy as np
19
+ from functools import wraps
20
+ from contextlib import contextmanager
21
+ from typing import Tuple, Union, Optional
22
+ from fvcore.transforms.transform import Transform
23
+
24
+ import torch
25
+ import torch.nn as nn
26
+ from torch.nn import functional as F
27
+ from torch.compiler import is_compiling  # used by Conv2d.forward (torch >= 2.1)
28
+
29
+
30
+ def adjust_intrinsic(
31
+ intrinsic: Union[np.ndarray, torch.Tensor],
32
+ intrinsic_image_dim: Tuple,
33
+ image_dim: Tuple
34
+ ) -> Union[np.ndarray, torch.Tensor]:
35
+ """
36
+ Adjust intrinsic camera parameters for image dimension changes.
37
+
38
+ Args:
39
+ intrinsic: Camera intrinsic matrix (numpy array or torch tensor)
40
+ intrinsic_image_dim: Original image dimensions (width, height)
41
+ image_dim: Target image dimensions (width, height)
42
+
43
+ Returns:
44
+ Adjusted intrinsic matrix (same type as input)
45
+ """
46
+ if intrinsic_image_dim == image_dim:
47
+ return intrinsic
48
+
49
+ # Calculate scaling factors
50
+ height_after = image_dim[1]
51
+ height_before = intrinsic_image_dim[1]
52
+ width_after = image_dim[0]
53
+ width_before = intrinsic_image_dim[0]
54
+
55
+ width_scale = float(width_after) / float(width_before)
56
+ height_scale = float(height_after) / float(height_before)
57
+ width_offset_scale = float(width_after - 1) / float(width_before - 1)
58
+ height_offset_scale = float(height_after - 1) / float(height_before - 1)
59
+
60
+ # handle numpy array case
61
+ if isinstance(intrinsic, np.ndarray):
62
+ intrinsic_return = np.copy(intrinsic)
63
+
64
+ intrinsic_return[0, 0] *= width_scale
65
+ intrinsic_return[1, 1] *= height_scale
66
+ # account for cropping/padding here
67
+ intrinsic_return[0, 2] *= width_offset_scale
68
+ intrinsic_return[1, 2] *= height_offset_scale
69
+
70
+ return intrinsic_return
71
+
72
+ # handle torch tensor case
73
+ elif isinstance(intrinsic, torch.Tensor):
74
+ intrinsic_return = intrinsic.clone()
75
+
76
+ intrinsic_return[:, 0, 0] *= width_scale
77
+ intrinsic_return[:, 1, 1] *= height_scale
78
+
79
+ intrinsic_return[:, 0, 2] *= width_offset_scale
80
+ intrinsic_return[:, 1, 2] *= height_offset_scale
81
+
82
+ return intrinsic_return
83
+
84
+ else:
85
+ raise TypeError(f"Unsupported input type: {type(intrinsic)}.")
86
+
87
+
88
+ class ModelInputResize(Transform):
89
+ """Resize and pad the model input."""
90
+
91
+ def __init__(self, size_divisibility: int = 0, pad_value: float = 0):
92
+ """Initialize model input resize transform."""
93
+ super().__init__()
94
+ self.size_divisibility = size_divisibility
95
+ self.pad_value = pad_value
96
+
97
+ def apply_coords(self, coords):
98
+ """ Apply transforms to the coordinates. """
99
+ return coords
100
+
101
+ def apply_image(self, array: torch.Tensor) -> torch.Tensor:
102
+ """ Apply transforms to the image. """
103
+ assert len(array) > 0
104
+ device = array.device
105
+ image_size = [array.shape[-2], array.shape[-1]]
106
+
107
+ max_size = torch.tensor(image_size, device=device)
108
+ if self.size_divisibility > 1:
109
+ stride = self.size_divisibility
110
+ max_size = (max_size + (stride - 1)).div(stride, rounding_mode="floor") * stride
111
+
112
+ u0 = max_size[-1] - image_size[1]
113
+ u1 = max_size[-2] - image_size[0]
114
+ padding_size = [0, u0, 0, u1]
115
+
116
+ array = F.pad(array, padding_size, value=self.pad_value)
117
+ return array
118
+
119
+ def apply_segmentation(self, array: torch.Tensor) -> torch.Tensor:
120
+ """ Apply transforms to the segmentation. """
121
+ return array
122
+
123
+
124
+ @contextmanager
125
+ def _ignore_torch_cuda_oom():
126
+ """
127
+ A context which ignores CUDA OOM exception from pytorch.
128
+ """
129
+ try:
130
+ yield
131
+ except RuntimeError as e:
132
+ # NOTE: the string may change?
133
+ if "CUDA out of memory. " in str(e):
134
+ pass
135
+ else:
136
+ raise
137
+
138
+ def retry_if_cuda_oom(func):
139
+ """
140
+ Makes a function retry itself after encountering
141
+ pytorch's CUDA OOM error.
142
+ It will first retry after calling `torch.cuda.empty_cache()`.
143
+
144
+ If that still fails, it will then retry by trying to convert inputs to CPUs.
145
+ In this case, it expects the function to dispatch to CPU implementation.
146
+ The return values may become CPU tensors as well and it's user's
147
+ responsibility to convert it back to CUDA tensor if needed.
148
+
149
+ Args:
150
+ func: a stateless callable that takes tensor-like objects as arguments
151
+
152
+ Returns:
153
+ a callable which retries `func` if OOM is encountered.
154
+
155
+ Examples:
156
+ ::
157
+ output = retry_if_cuda_oom(some_torch_function)(input1, input2)
158
+ # output may be on CPU even if inputs are on GPU
159
+
160
+ Note:
161
+ 1. When converting inputs to CPU, it will only look at each argument and check
162
+ if it has `.device` and `.to` for conversion. Nested structures of tensors
163
+ are not supported.
164
+
165
+ 2. Since the function might be called more than once, it has to be
166
+ stateless.
167
+ """
168
+
169
+ def maybe_to_cpu(x):
170
+ """Convert to CPU."""
171
+ try:
172
+ like_gpu_tensor = x.device.type == "cuda" and hasattr(x, "to")
173
+ except AttributeError:
174
+ like_gpu_tensor = False
175
+ if like_gpu_tensor:
176
+ return x.to(device="cpu")
177
+ return x
178
+
179
+ @wraps(func)
180
+ def wrapped(*args, **kwargs):
181
+ """Wrapped function."""
182
+ with _ignore_torch_cuda_oom():
183
+ return func(*args, **kwargs)
184
+
185
+ # Clear cache and retry
186
+ torch.cuda.empty_cache()
187
+ with _ignore_torch_cuda_oom():
188
+ return func(*args, **kwargs)
189
+
190
+ # Try on CPU. This slows down the code significantly, therefore print a notice.
191
+ logging.info(f"Attempting to copy inputs of {str(func)} to CPU due to CUDA OOM")
192
+ new_args = (maybe_to_cpu(x) for x in args)
193
+ new_kwargs = {k: maybe_to_cpu(v) for k, v in kwargs.items()}
194
+ return func(*new_args, **new_kwargs)
195
+
196
+ return wrapped
197
+
198
+
199
+ def prepare_kept_mapping(model, cfg, dataset, frustum_mask=None, intrinsic=None):
200
+ """
201
+ Prepare kept and mapping tensors using back projection.
202
+
203
+ Args:
204
+ model: The model instance with back_projection method
205
+ cfg: Configuration object
206
+ dataset: Dataset name ('front3d' or others)
207
+ frustum_mask: Optional frustum mask tensor
208
+ intrinsic: Intrinsic matrix tensor
209
+
210
+ Returns:
211
+ tuple: (kept, mapping) tensors from back projection
212
+ """
213
+ if dataset != "front3d":
214
+ intrinsic = adjust_intrinsic(
215
+ intrinsic,
216
+ tuple(cfg.dataset.target_size),
217
+ tuple(cfg.dataset.reduced_target_size)
218
+ )
219
+ kept, mapping = model.back_projection(
220
+ tuple(cfg.dataset.reduced_target_size[::-1]) + (256,),
221
+ intrinsic,
222
+ frustum_mask
223
+ )
224
+ return kept, mapping
225
+
226
+
227
+ def get_kept_mapping(model, cfg, batch, device):
228
+ """
229
+ Get kept and mapping for a batch of data (used for non-front3d datasets).
230
+
231
+ Args:
232
+ model: The model instance with back_projection method
233
+ cfg: Configuration object
234
+ batch: Batch data containing frustum_mask and intrinsic
235
+ device: Device to place tensors on
236
+
237
+ Returns:
238
+ tuple: (kept, mapping) tensors
239
+ """
240
+ frustum_mask = batch["frustum_mask"].to(device)
241
+ intrinsic = batch["intrinsic"].float().to(device)
242
+ dataset = cfg.dataset.name
243
+
244
+ kept, mapping = prepare_kept_mapping(
245
+ model,
246
+ cfg,
247
+ dataset,
248
+ frustum_mask=frustum_mask,
249
+ intrinsic=intrinsic
250
+ )
251
+
252
+ return kept, mapping
253
+
254
+
255
+ def get_norm(norm, out_channels):
256
+ """
257
+ Args:
258
+ norm (str or callable): one of "SyncBN", "GN", "LN";
259
+ or a callable that takes a channel number and returns
260
+ the normalization layer as a nn.Module.
261
+
262
+ Returns:
263
+ nn.Module or None: the normalization layer
264
+ """
265
+ if norm is None:
266
+ return None
267
+ if isinstance(norm, str):
268
+ if len(norm) == 0:
269
+ return None
270
+ norm = {
271
+ "SyncBN": nn.SyncBatchNorm,
272
+ "GN": lambda channels: nn.GroupNorm(32, channels),
273
+ "LN": lambda channels: LayerNorm(channels),
274
+ }[norm]
275
+ return norm(out_channels)
276
+
277
+
278
+ class Conv2d(nn.Conv2d):
279
+ """
280
+ A wrapper around :class:`torch.nn.Conv2d` to support empty inputs and more features.
281
+ """
282
+
283
+ def __init__(self, *args, **kwargs):
284
+ """
285
+ Extra keyword arguments supported in addition to those in `torch.nn.Conv2d`:
286
+
287
+ Args:
288
+ norm (nn.Module, optional): a normalization layer
289
+ activation (callable(Tensor) -> Tensor): a callable activation function
290
+
291
+ It assumes that norm layer is used before activation.
292
+ """
293
+ norm = kwargs.pop("norm", None)
294
+ activation = kwargs.pop("activation", None)
295
+ super().__init__(*args, **kwargs)
296
+
297
+ self.norm = norm
298
+ self.activation = activation
299
+
300
+ def forward(self, x):
301
+ """Forward pass."""
302
+ # torchscript does not support SyncBatchNorm yet
303
+ # https://github.com/pytorch/pytorch/issues/40507
304
+ # and we skip these codes in torchscript since:
305
+ # 1. currently we only support torchscript in evaluation mode
306
+ # 2. features needed by exporting module to torchscript are added in PyTorch 1.6 or
307
+ # later version, `Conv2d` in these PyTorch versions has already supported empty inputs.
308
+ if not torch.jit.is_scripting():
309
+ # Dynamo doesn't support context managers yet
310
+ is_dynamo_compiling = is_compiling()
311
+ if not is_dynamo_compiling:
312
+ with warnings.catch_warnings(record=True):
313
+ if x.numel() == 0 and self.training:
314
+ # https://github.com/pytorch/pytorch/issues/12013
315
+ assert not isinstance(
316
+ self.norm, torch.nn.SyncBatchNorm
317
+ ), "SyncBatchNorm does not support empty inputs!"
318
+
319
+ x = F.conv2d(
320
+ x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups
321
+ )
322
+ if self.norm is not None:
323
+ x = self.norm(x)
324
+ if self.activation is not None:
325
+ x = self.activation(x)
326
+ return x
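
A small sanity check of `adjust_intrinsic` (numpy path), assuming the Front3D intrinsics at 320x240 are rescaled to a 160x120 feature resolution: focal lengths scale by new/old, the principal point by (new - 1)/(old - 1).

```python
import numpy as np
from nvpanoptix_3d.utils.helper import adjust_intrinsic

K = np.array([[277.1281435, 0.0, 159.5, 0.0],
              [0.0, 277.1281435, 119.5, 0.0],
              [0.0, 0.0, 1.0, 0.0],
              [0.0, 0.0, 0.0, 1.0]])

K_small = adjust_intrinsic(K, (320, 240), (160, 120))
print(round(K_small[0, 0], 2), K_small[0, 2])  # 138.56 79.5
```
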
nvpanoptix_3d/utils/point_features.py ADDED
@@ -0,0 +1,127 @@
1
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Point feature utils for Mask2former."""
16
+
17
+ import torch
18
+ from torch.nn import functional as F
19
+
20
+
21
+ def point_sample(inputs, point_coords, **kwargs):
22
+ """
23
+ A wrapper around :function:`torch.nn.functional.grid_sample` to support 3D point_coords tensors.
24
+ Unlike :function:`torch.nn.functional.grid_sample` it assumes `point_coords` to lie inside
25
+ [0, 1] x [0, 1] square.
26
+
27
+ Args:
28
+ inputs (Tensor): A tensor of shape (N, C, H, W) that contains features map on a H x W grid.
29
+ point_coords (Tensor): A tensor of shape (N, P, 2) or (N, Hgrid, Wgrid, 2) that contains
30
+ [0, 1] x [0, 1] normalized point coordinates.
31
+
32
+ Returns:
33
+ output (Tensor): A tensor of shape (N, C, P) or (N, C, Hgrid, Wgrid) that contains
34
+ features for points in `point_coords`. The features are obtained via bilinear
35
+ interpolation from `inputs` the same way as :function:`torch.nn.functional.grid_sample`.
36
+ """
37
+ add_dim = False
38
+ if point_coords.dim() == 3:
39
+ add_dim = True
40
+ point_coords = point_coords.unsqueeze(2) # [c, self.num_points, 1, 2]
41
+ output = F.grid_sample(inputs, 2.0 * point_coords - 1.0, **kwargs) # [c, 1, self.num_points, 1]
42
+ if add_dim:
43
+ output = output.squeeze(3)
44
+ return output # [c, 1, self.num_points]
45
+
46
+
47
+ def get_uncertain_point_coords_with_randomness(
48
+ coarse_logits, uncertainty_func, num_points, oversample_ratio, importance_sample_ratio
49
+ ):
50
+ """
51
+ Sample points in [0, 1] x [0, 1] coordinate space based on their uncertainty. The uncertainties
52
+ are calculated for each point using 'uncertainty_func' function that takes point's logit
53
+ prediction as input.
54
+ See PointRend paper for details.
55
+
56
+ Args:
57
+ coarse_logits (Tensor): A tensor of shape (N, C, Hmask, Wmask) or (N, 1, Hmask, Wmask) for
58
+ class-specific or class-agnostic prediction.
59
+ uncertainty_func: A function that takes a Tensor of shape (N, C, P) or (N, 1, P) that
60
+ contains logit predictions for P points and returns their uncertainties as a Tensor of
61
+ shape (N, 1, P).
62
+ num_points (int): The number of points P to sample.
63
+ oversample_ratio (int): Oversampling parameter.
64
+ importance_sample_ratio (float): Ratio of points that are sampled via importance sampling.
65
+
66
+ Returns:
67
+ point_coords (Tensor): A tensor of shape (N, P, 2) that contains the coordinates of P
68
+ sampled points.
69
+ """
70
+ assert oversample_ratio >= 1
71
+ assert 0 <= importance_sample_ratio <= 1
72
+ num_boxes = coarse_logits.shape[0]
73
+ num_sampled = int(num_points * oversample_ratio)
74
+ point_coords = torch.rand(num_boxes, num_sampled, 2, device=coarse_logits.device)
75
+ point_logits = point_sample(coarse_logits, point_coords, align_corners=False)
76
+ # It is crucial to calculate uncertainty based on the sampled prediction value for the points.
77
+ # Calculating uncertainties of the coarse predictions first and sampling them for points leads
78
+ # to incorrect results.
79
+ # To illustrate this: assume uncertainty_func(logits)=-abs(logits), a sampled point between
80
+ # two coarse predictions with -1 and 1 logits has 0 logits, and therefore 0 uncertainty value.
81
+ # However, if we calculate uncertainties for the coarse predictions first,
82
+ # both will have -1 uncertainty, and the sampled point will get -1 uncertainty.
83
+ point_uncertainties = uncertainty_func(point_logits)
84
+ num_uncertain_points = int(importance_sample_ratio * num_points)
85
+ num_random_points = num_points - num_uncertain_points
86
+ idx = torch.topk(point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1]
87
+ shift = num_sampled * torch.arange(num_boxes, dtype=torch.long, device=coarse_logits.device)
88
+ idx += shift[:, None]
89
+ point_coords = point_coords.view(-1, 2)[idx.view(-1), :].view(
90
+ num_boxes, num_uncertain_points, 2
91
+ )
92
+ if num_random_points > 0:
93
+ point_coords = torch.cat(
94
+ [
95
+ point_coords,
96
+ torch.rand(num_boxes, num_random_points, 2, device=coarse_logits.device),
97
+ ],
98
+ dim=1,
99
+ )
100
+ return point_coords
101
+
102
+
103
+ def get_uncertain_point_coords_on_grid(uncertainty_map, num_points):
104
+ """
105
+ Find `num_points` most uncertain points from `uncertainty_map` grid.
106
+
107
+ Args:
108
+ uncertainty_map (Tensor): A tensor of shape (N, 1, H, W) that contains uncertainty
109
+ values for a set of points on a regular H x W grid.
110
+ num_points (int): The number of points P to select.
111
+
112
+ Returns:
113
+ point_indices (Tensor): A tensor of shape (N, P) that contains indices from
114
+ [0, H x W) of the most uncertain points.
115
+ point_coords (Tensor): A tensor of shape (N, P, 2) that contains [0, 1] x [0, 1] normalized
116
+ coordinates of the most uncertain points from the H x W grid.
117
+ """
118
+ R, _, H, W = uncertainty_map.shape
119
+ h_step = 1.0 / float(H)
120
+ w_step = 1.0 / float(W)
121
+
122
+ num_points = min(H * W, num_points)
123
+ point_indices = torch.topk(uncertainty_map.view(R, H * W), k=num_points, dim=1)[1]
124
+ point_coords = torch.zeros(R, num_points, 2, dtype=torch.float, device=uncertainty_map.device)
125
+ point_coords[:, :, 0] = w_step / 2.0 + (point_indices % W).to(torch.float) * w_step
126
+ point_coords[:, :, 1] = h_step / 2.0 + (point_indices // W).to(torch.float) * h_step
127
+ return point_indices, point_coords
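
`point_sample` expects query points in [0, 1] x [0, 1] with (x, y) ordering and rescales them into `grid_sample`'s [-1, 1] range. A tiny worked example on a 2x2 feature map: with `align_corners=False`, the two points below hit the centres of the top-left and bottom-right texels.

```python
import torch
from nvpanoptix_3d.utils.point_features import point_sample

features = torch.tensor([[[[0.0, 1.0],
                           [2.0, 3.0]]]])   # (N=1, C=1, H=2, W=2)
points = torch.tensor([[[0.25, 0.25],       # centre of the top-left texel
                        [0.75, 0.75]]])     # centre of the bottom-right texel

sampled = point_sample(features, points, align_corners=False)
print(sampled)  # tensor([[[0., 3.]]])
```
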
nvpanoptix_3d/utils/sparse_tensor.py ADDED
@@ -0,0 +1,257 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Sparse tensor utils."""
16
+
17
+ import torch
18
+ import MinkowskiEngine as Me
19
+ import torch.nn.functional as F
20
+ from typing import Optional, Tuple, Dict
21
+
22
+
23
+ def sparse_cat_union(a: Me.SparseTensor, b: Me.SparseTensor):
24
+ """Sparse cat union two sparse tensors."""
25
+ cm = a.coordinate_manager
26
+ stride = a.tensor_stride
27
+ assert cm == b.coordinate_manager, "different coords_man"
28
+ assert a.tensor_stride == b.tensor_stride, "different tensor_stride"
29
+
30
+ # handle empty tensors - if one is empty, return the other
31
+ if a.F.size(0) == 0 or a.F.numel() == 0:
32
+ return b
33
+ if b.F.size(0) == 0 or b.F.numel() == 0:
34
+ return a
35
+ # handle the error
36
+ try:
37
+ feats_a = F.pad(a.F, (0, b.F.shape[1]))
38
+ except Exception as e:
39
+ print("Warning: Got error in feats_a:", e)
40
+ return a
41
+ try:
42
+ feats_b = F.pad(b.F, (a.F.shape[1], 0))
43
+ except Exception as e:
44
+ print("Warning: Got error in feats_b:", e)
45
+ return b
46
+
47
+ new_a = Me.SparseTensor(
48
+ features=feats_a,
49
+ coordinate_map_key=a.coordinate_key,
50
+ coordinate_manager=cm,
51
+ tensor_stride=stride,
52
+ )
53
+
54
+ new_b = Me.SparseTensor(
55
+ features=feats_b,
56
+ coordinate_map_key=b.coordinate_key,
57
+ coordinate_manager=cm,
58
+ tensor_stride=stride,
59
+ )
60
+
61
+ return new_a + new_b
62
+
63
+
64
+ def to_dense(
65
+ tensor: Me.SparseTensor,
66
+ shape: Optional[torch.Size] = None,
67
+ min_coordinate: Optional[torch.IntTensor] = None,
68
+ contract_stride: bool = True,
69
+ default_value: float = 0.0
70
+ ) -> Tuple[torch.Tensor, torch.IntTensor, torch.IntTensor]:
71
+ """Convert the :attr:`MinkowskiEngine.SparseTensor` to a torch dense
72
+ tensor.
73
+ Args:
74
+ :attr:`shape` (torch.Size, optional): The size of the output tensor.
75
+ :attr:`min_coordinate` (torch.IntTensor, optional): The min
76
+ coordinates of the output sparse tensor. Must be divisible by the
77
+ current :attr:`tensor_stride`. If 0 is given, it will use the origin for the min coordinate.
78
+ :attr:`contract_stride` (bool, optional): The output coordinates
79
+ will be divided by the tensor stride to make features spatially
80
+ contiguous. True by default.
81
+ Returns:
82
+ :attr:`tensor` (torch.Tensor): the torch tensor with size `[Batch
83
+ Dim, Feature Dim, Spatial Dim..., Spatial Dim]`. The coordinate of
84
+ each feature can be accessed via `min_coordinate + tensor_stride *
85
+ [the coordinate of the dense tensor]`.
86
+ :attr:`min_coordinate` (torch.IntTensor): the D-dimensional vector
87
+ defining the minimum coordinate of the output tensor.
88
+ :attr:`tensor_stride` (torch.IntTensor): the D-dimensional vector
89
+ defining the stride between tensor elements.
90
+ """
91
+ if min_coordinate is not None:
92
+ assert isinstance(min_coordinate, torch.IntTensor)
93
+ assert min_coordinate.numel() == tensor._D
94
+ if shape is not None:
95
+ assert isinstance(shape, torch.Size)
96
+ assert len(shape) == tensor._D + 2 # batch and channel
97
+ if shape[1] != tensor._F.size(1):
98
+ shape = torch.Size([shape[0], tensor._F.size(1), *[s for s in shape[2:]]])
99
+
100
+ # exception handling for empty tensor
101
+ if tensor.__len__() == 0:
102
+ assert shape is not None, "shape is required to densify an empty tensor"
103
+ return (
104
+ torch.zeros(shape, dtype=tensor.dtype, device=tensor.device),
105
+ torch.zeros(tensor._D, dtype=torch.int32, device=tensor.device),
106
+ tensor.tensor_stride,
107
+ )
108
+
109
+ # use int tensor for all operations
110
+ tensor_stride = torch.IntTensor(tensor.tensor_stride).to(tensor.device)
111
+
112
+ # new coordinates
113
+ batch_indices = tensor.C[:, 0]
114
+
115
+ if min_coordinate is None:
116
+ min_coordinate, _ = tensor.C.min(0, keepdim=True)
117
+ min_coordinate = min_coordinate[:, 1:]
118
+ if not torch.all(min_coordinate >= 0):
119
+ raise ValueError(
120
+ f"Coordinate has a negative value: {min_coordinate}. Please provide min_coordinate argument"
121
+ )
122
+ coords = tensor.C[:, 1:]
123
+ elif isinstance(min_coordinate, int) and min_coordinate == 0:
124
+ coords = tensor.C[:, 1:]
125
+ else:
126
+ min_coordinate = min_coordinate.to(tensor.device)
127
+ if min_coordinate.ndim == 1:
128
+ min_coordinate = min_coordinate.unsqueeze(0)
129
+ coords = tensor.C[:, 1:] - min_coordinate
130
+
131
+ assert (
132
+ min_coordinate % tensor_stride
133
+ ).sum() == 0, "The minimum coordinates must be divisible by the tensor stride."
134
+
135
+ if coords.ndim == 1:
136
+ coords = coords.unsqueeze(1)
137
+
138
+ # return the contracted tensor
139
+ if contract_stride:
140
+ coords = torch.div(coords, tensor_stride, rounding_mode="floor")
141
+
142
+ nchannels = tensor.F.size(1)
143
+ if shape is None:
144
+ size = coords.max(0)[0] + 1
145
+ shape = torch.Size(
146
+ [batch_indices.max() + 1, nchannels, *size.cpu().numpy()]
147
+ )
148
+
149
+ dense_F = torch.full(
150
+ shape, dtype=tensor.F.dtype,
151
+ device=tensor.F.device, fill_value=default_value
152
+ )
153
+
154
+ tcoords = coords.t().long()
155
+ batch_indices = batch_indices.long()
156
+
157
+ indices = (batch_indices, slice(None), *tcoords)
158
+ dense_F[indices] = tensor.F
159
+
160
+ tensor_stride = torch.IntTensor(tensor.tensor_stride)
161
+ return dense_F, min_coordinate, tensor_stride
162
+
163
+
164
+ def _thicken_grid(grid, grid_dims, frustum_mask):
165
+ """Thicken grid."""
166
+ device = frustum_mask.device
167
+ offsets = torch.nonzero(torch.ones(3, 3, 3, device=device)).long()
168
+ locs_grid = grid.nonzero(as_tuple=False)
169
+ locs = locs_grid.unsqueeze(1).repeat(1, 27, 1)
170
+ locs += offsets
171
+ locs = locs.view(-1, 3)
172
+ masks = ((locs >= 0) & (locs < torch.as_tensor(grid_dims, device=device))).all(-1)
173
+ locs = locs[masks]
174
+
175
+ thicken = torch.zeros(grid_dims, dtype=torch.bool, device=device)
176
+ thicken[locs[:, 0], locs[:, 1], locs[:, 2]] = True
177
+ # frustum culling
178
+ thicken = thicken & frustum_mask
179
+
180
+ return thicken
181
+
182
+
183
+ def prepare_instance_masks_thicken(
184
+ instances: torch.Tensor,
185
+ semantic_mapping: Dict[int, int],
186
+ distance_field: torch.Tensor,
187
+ frustum_mask: torch.Tensor,
188
+ iso_value: float = 1.0,
189
+ truncation: float = 3.0,
190
+ downsample_factor: int = 1
191
+ ) -> Dict[int, Tuple[torch.Tensor, int]]:
192
+ """Prepare instance masks thicken."""
193
+ # check if downsample factor is valid
194
+ assert isinstance(downsample_factor, int) and 256 % downsample_factor == 0
195
+ grid_dims = [256, 256, 256]
196
+ need_rescale = downsample_factor != 1
197
+ if need_rescale:
198
+ grid_dims = (torch.as_tensor(grid_dims) // downsample_factor).tolist()
199
+ frustum_mask = F.interpolate(frustum_mask[None, None].float(),
200
+ size=grid_dims, mode="nearest").squeeze(0, 1).bool()
201
+
202
+ instance_information = {}
203
+
204
+ for instance_id, semantic_class in semantic_mapping.items():
205
+ instance_mask: torch.Tensor = (instances == instance_id)
206
+ instance_distance_field = torch.full_like(
207
+ instance_mask,
208
+ dtype=torch.float,
209
+ fill_value=truncation
210
+ )
211
+ instance_distance_field[instance_mask] = distance_field.squeeze()[instance_mask]
212
+ instance_distance_field_masked = instance_distance_field.abs() < iso_value
213
+
214
+ if need_rescale:
215
+ instance_distance_field_masked = F.max_pool3d(
216
+ instance_distance_field_masked[None, None].float(),
217
+ kernel_size=downsample_factor + 1,
218
+ stride=downsample_factor,
219
+ padding=1
220
+ ).squeeze(0, 1).bool()
221
+
222
+ # instance_grid = instance_grid & frustum_mask
223
+ instance_grid = _thicken_grid(
224
+ instance_distance_field_masked,
225
+ grid_dims,
226
+ frustum_mask
227
+ )
228
+ instance_grid: torch.Tensor = instance_grid.to(torch.device("cpu"), non_blocking=True)
229
+ instance_information[instance_id] = instance_grid, semantic_class
230
+
231
+ return instance_information
232
+
233
+
234
+ def mask_invalid_sparse_voxels(
235
+ grid: Me.SparseTensor,
236
+ mask=None, frustum_dim=[256, 256, 256]
237
+ ) -> Me.SparseTensor:
238
+ """Mask invalid sparse voxels."""
239
+ # Mask out voxels which are outside of the grid
240
+ valid_mask = (grid.C[:, 1] < frustum_dim[0] - 1) & (grid.C[:, 1] >= 0) & \
241
+ (grid.C[:, 2] < frustum_dim[1] - 1) & (grid.C[:, 2] >= 0) & \
242
+ (grid.C[:, 3] < frustum_dim[2] - 1) & (grid.C[:, 3] >= 0)
243
+ if mask is not None:
244
+ valid_mask = valid_mask * mask
245
+ num_valid_coordinates = valid_mask.sum()
246
+
247
+ if num_valid_coordinates == 0:
248
+ return {}, {}
249
+
250
+ num_masked_voxels = grid.C.size(0) - num_valid_coordinates
251
+ grids_needs_to_be_pruned = num_masked_voxels > 0
252
+
253
+ # Fix: Only prune if there are invalid voxels
254
+ if grids_needs_to_be_pruned:
255
+ grid = Me.MinkowskiPruning()(grid, valid_mask)
256
+
257
+ return grid
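
`to_dense` mirrors MinkowskiEngine's own densification but adds a `shape` override, a fill value and an empty-tensor fallback. A minimal usage sketch (requires a built MinkowskiEngine, as set up in the environment instructions):

```python
import torch
import MinkowskiEngine as Me
from nvpanoptix_3d.utils.sparse_tensor import to_dense

# two occupied voxels in batch 0 of a 4x4x4 grid
sparse = Me.SparseTensor(
    features=torch.tensor([[1.0], [2.0]]),
    coordinates=torch.IntTensor([[0, 1, 1, 1], [0, 2, 2, 2]]),
)

dense, min_coord, stride = to_dense(sparse, shape=torch.Size([1, 1, 4, 4, 4]))
print(dense.shape)                                                # torch.Size([1, 1, 4, 4, 4])
print(dense[0, 0, 1, 1, 1].item(), dense[0, 0, 2, 2, 2].item())   # 1.0 2.0
```
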
preprocessing.py ADDED
@@ -0,0 +1,328 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """
17
+ Preprocessing utilities for Panoptic Recon 3D model.
18
+
19
+ This module provides functions for:
20
+ - Image preprocessing and resizing
21
+ - Frustum mask generation
22
+ - Camera intrinsic handling
23
+ """
24
+ import sys
25
+ from fvcore.transforms.transform import Transform
26
+ from typing import Optional, Tuple, Union
27
+ import numpy as np
28
+ import torch
29
+ import cv2
30
+ from PIL import Image
31
+
32
+
33
+ # Default Front3D camera intrinsic matrix
34
+ DEFAULT_INTRINSIC = np.array([
35
+ [277.1281435, 0., 159.5, 0.],
36
+ [0., 277.1281435, 119.5, 0.],
37
+ [0., 0., 1., 0.],
38
+ [0., 0., 0., 1.]
39
+ ], dtype=np.float32)
40
+
41
+ # Default model parameters
42
+ DEFAULT_GRID_DIMS = (256, 256, 256)
43
+ DEFAULT_DEPTH_RANGE = (0.4, 6.0)
44
+ DEFAULT_VOXEL_SIZE = 0.03
45
+ DEFAULT_IMG_SIZE = (240, 320) # (height, width)
46
+
47
+
48
+ def create_frustum_mask(
49
+ intrinsics: Union[np.ndarray, torch.Tensor],
50
+ volume_shape: Tuple[int, int, int] = DEFAULT_GRID_DIMS,
51
+ depth_range: Tuple[float, float] = DEFAULT_DEPTH_RANGE,
52
+ image_shape: Optional[Tuple[int, int]] = DEFAULT_IMG_SIZE,
53
+ voxel_size: float = DEFAULT_VOXEL_SIZE,
54
+ padding_pixels: float = 0.0,
55
+ volume_origin: Optional[np.ndarray] = None,
56
+ z_axis_reversed: bool = False,
57
+ ) -> np.ndarray:
58
+ """
59
+ Create a frustum mask for a voxel volume based on camera intrinsics.
60
+
61
+ This function determines which voxels in a 3D volume are visible from a camera
62
+ by checking if they project within the image bounds and depth range.
63
+
64
+ Args:
65
+ intrinsics: Camera intrinsic matrix (3x3 or 4x4).
66
+ volume_shape: Shape of the voxel volume (nx, ny, nz).
67
+ depth_range: Min and max depth in meters (z_min, z_max).
68
+ image_shape: Image dimensions (height, width). If None, inferred from principal point.
69
+ voxel_size: Size of each voxel in meters.
70
+ padding_pixels: Expand frustum bounds by this many pixels.
71
+ volume_origin: Origin of the volume in camera space. If None, auto-computed.
72
+ z_axis_reversed: If True, z-index 0 is farthest.
73
+
74
+ Returns:
75
+ frustum_mask: Boolean mask of shape volume_shape indicating voxels inside frustum.
76
+ """
77
+ # Convert to numpy if tensor
78
+ if isinstance(intrinsics, torch.Tensor):
79
+ intrinsics = intrinsics.cpu().numpy()
80
+
81
+ # Ensure numpy array
82
+ intrinsics = np.asarray(intrinsics, dtype=np.float64)
83
+
84
+ assert intrinsics.shape in [(3, 3), (4, 4)], \
85
+ f"Intrinsics must be 3x3 or 4x4, got shape {intrinsics.shape}"
86
+ assert voxel_size > 0, f"voxel_size must be positive, got {voxel_size}"
87
+ assert depth_range[0] < depth_range[1], \
88
+ f"depth_range must be (min, max) with min < max, got {depth_range}"
89
+ assert depth_range[0] > 0, f"depth_range min must be positive, got {depth_range[0]}"
90
+
91
+ # Extract camera parameters
92
+ K = intrinsics[:3, :3] if intrinsics.shape == (4, 4) else intrinsics
93
+ fx, fy = K[0, 0], K[1, 1]
94
+ cx, cy = K[0, 2], K[1, 2]
95
+
96
+ # Determine image shape
97
+ if image_shape is None:
98
+ image_height = int(2 * cy)
99
+ image_width = int(2 * cx)
100
+ else:
101
+ image_height, image_width = image_shape
102
+
103
+ # Image bounds with padding
104
+ u_min = -padding_pixels
105
+ u_max = image_width + padding_pixels
106
+ v_min = -padding_pixels
107
+ v_max = image_height + padding_pixels
108
+
109
+ # Set volume origin
110
+ if volume_origin is None:
111
+ volume_origin = np.array([
112
+ -(volume_shape[0] * voxel_size) / 2,
113
+ -(volume_shape[1] * voxel_size) / 2,
114
+ (depth_range[0] + depth_range[1]) / 2 - (volume_shape[2] * voxel_size) / 2
115
+ ])
116
+
117
+ # Create voxel grid coordinates
118
+ x_coords = np.arange(volume_shape[0]) * voxel_size + volume_origin[0]
119
+ y_coords = np.arange(volume_shape[1]) * voxel_size + volume_origin[1]
120
+ z_coords = np.arange(volume_shape[2]) * voxel_size + volume_origin[2]
121
+
122
+ if z_axis_reversed:
123
+ z_coords = z_coords[::-1]
124
+
125
+ # Create meshgrid
126
+ xx, yy, zz = np.meshgrid(x_coords, y_coords, z_coords, indexing='ij')
127
+ voxel_centers = np.stack([xx.ravel(), yy.ravel(), zz.ravel()], axis=-1)
128
+
129
+ # Depth constraint
130
+ depth_mask = (voxel_centers[:, 2] >= depth_range[0]) & (voxel_centers[:, 2] <= depth_range[1])
131
+
132
+ # Project to image plane
133
+ valid_depth = voxel_centers[:, 2] > 1e-6
134
+ u = np.full(len(voxel_centers), -1.0)
135
+ v = np.full(len(voxel_centers), -1.0)
136
+
137
+ u[valid_depth] = (fx * voxel_centers[valid_depth, 0] / voxel_centers[valid_depth, 2]) + cx
138
+ v[valid_depth] = (fy * voxel_centers[valid_depth, 1] / voxel_centers[valid_depth, 2]) + cy
139
+
140
+ # Image bounds check
141
+ image_mask = (u >= u_min) & (u < u_max) & (v >= v_min) & (v < v_max)
142
+
143
+ # Combine masks
144
+ frustum_mask_1d = depth_mask & image_mask & valid_depth
145
+ frustum_mask = frustum_mask_1d.reshape(volume_shape)
146
+
147
+ return frustum_mask
148
+
149
+
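For context, a minimal sketch of how this frustum helper might be called with the Front3D defaults used elsewhere in this repo (240x320 images, 0.03 m voxels, 0.4-6.0 m depth range). The function name `create_frustum_mask` and the intrinsic values below are assumptions for illustration, not the shipped calibration:

```python
import numpy as np

# Placeholder pinhole intrinsics for a 320x240 image (not the dataset calibration).
K = np.array([
    [277.0,   0.0, 160.0],
    [  0.0, 277.0, 120.0],
    [  0.0,   0.0,   1.0],
], dtype=np.float64)

# Assumes the helper defined above is exposed as create_frustum_mask.
# Note: the full 256^3 grid allocates a few hundred MB of temporary arrays.
mask = create_frustum_mask(
    intrinsics=K,
    volume_shape=(256, 256, 256),
    depth_range=(0.4, 6.0),
    image_shape=(240, 320),   # (height, width)
    voxel_size=0.03,
)
print(mask.shape, mask.sum())  # boolean volume, count of voxels inside the frustum
```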
150
+ def get_output_shape(
151
+ oldh: int,
152
+ oldw: int,
153
+ short_edge_length: int,
154
+ max_size: int
155
+ ) -> Tuple[int, int]:
156
+ """Compute output size given input size and target short edge length."""
157
+ h, w = oldh, oldw
158
+ size = short_edge_length * 1.0
159
+ scale = size / min(h, w)
160
+ if h < w:
161
+ newh, neww = size, scale * w
162
+ else:
163
+ newh, neww = scale * h, size
164
+ if max(newh, neww) > max_size:
165
+ scale = max_size * 1.0 / max(newh, neww)
166
+ newh = newh * scale
167
+ neww = neww * scale
168
+ neww = int(neww + 0.5)
169
+ newh = int(newh + 0.5)
170
+ return (newh, neww)
171
+
172
+
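As a quick sanity check of the arithmetic above (a sketch, not part of the shipped file): a 480x640 image with short edge 240 and max size 320 simply scales by 0.5, while a 480x960 image first scales to 240x480 and is then capped by max_size:

```python
# scale = 240 / min(480, 640) = 0.5 -> (240, 320); the longer edge stays within max_size
assert get_output_shape(480, 640, 240, 320) == (240, 320)

# scale = 240 / 480 = 0.5 -> (240, 480); 480 > 320, so both edges are rescaled by 320 / 480
assert get_output_shape(480, 960, 240, 320) == (160, 320)
```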
173
+ class ResizeShortestEdge(Transform):
174
+ def __init__(
175
+ self,
176
+ orig_size: Tuple[int, int],
177
+ short_edge_length,
178
+ max_size=sys.maxsize,
179
+ interp=cv2.INTER_LINEAR,
180
+ prob=1.0
181
+ ):
182
+ """ Resize shortest edge transform. """
183
+ super().__init__()
184
+ self.orig_size = orig_size
185
+ if isinstance(short_edge_length, int):
186
+ short_edge_length = (short_edge_length, short_edge_length)
187
+ self.short_edge_length = short_edge_length
188
+ self.max_size = max_size
189
+ self.interp = interp
190
+ self.prob = prob
191
+ self._get_output_shape()
192
+
193
+ def _get_output_shape(self):
194
+ """ Get random output shape based on short edge length. """
195
+ h, w = self.orig_size
196
+ self.new_size = None
197
+ size = np.random.choice(self.short_edge_length)
198
+ if size != 0:
199
+ hh, ww = get_output_shape(h, w, size, self.max_size)
200
+ self.new_size = (ww, hh)
201
+
202
+ def apply_coords(self, coords):
203
+ """ Apply transforms to the coordinates. """
204
+ return coords
205
+
206
+ def apply_image(self, img, interp=None):
207
+ """ Apply transforms to the image. """
208
+ new_h, new_w = self.new_size
209
+ return cv2.resize(img, (new_w, new_h), interpolation=self.interp if interp is None else interp)
210
+
211
+ def apply_segmentation(self, segmentation):
212
+ """ Apply transforms to the segmentation. """
213
+ new_h, new_w = self.new_size
214
+ return cv2.resize(segmentation, (new_w, new_h), interpolation=cv2.INTER_NEAREST)
215
+
216
+
217
+ def adjust_intrinsic(
218
+ intrinsic: Union[np.ndarray, torch.Tensor],
219
+ original_size: Tuple[int, int],
220
+ target_size: Tuple[int, int],
221
+ ) -> Union[np.ndarray, torch.Tensor]:
222
+ """Adjust intrinsic matrix for image resize.
223
+
224
+ Args:
225
+ intrinsic: Camera intrinsic matrix (4x4 or 3x3).
226
+ original_size: Original image size (width, height).
227
+ target_size: Target image size (width, height).
228
+
229
+ Returns:
230
+ Adjusted intrinsic matrix.
231
+ """
232
+ is_tensor = isinstance(intrinsic, torch.Tensor)
233
+ if is_tensor:
234
+ device = intrinsic.device
235
+ dtype = intrinsic.dtype
236
+ intrinsic = intrinsic.cpu().numpy()
237
+
238
+ intrinsic = intrinsic.copy()
239
+
240
+ scale_x = target_size[0] / original_size[0]
241
+ scale_y = target_size[1] / original_size[1]
242
+
243
+ # Adjust focal length and principal point
244
+ intrinsic[0, 0] *= scale_x # fx
245
+ intrinsic[1, 1] *= scale_y # fy
246
+ intrinsic[0, 2] *= scale_x # cx
247
+ intrinsic[1, 2] *= scale_y # cy
248
+
249
+ if is_tensor:
250
+ intrinsic = torch.from_numpy(intrinsic).to(device=device, dtype=dtype)
251
+
252
+ return intrinsic
253
+
254
+
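For instance, resizing a 640x480 frame down to the model's 320x240 input halves fx, fy, cx, and cy. A small sketch with made-up intrinsic values (sizes are passed as (width, height), matching the docstring):

```python
import numpy as np

K = np.array([
    [500.0,   0.0, 320.0],
    [  0.0, 500.0, 240.0],
    [  0.0,   0.0,   1.0],
])
K_small = adjust_intrinsic(K, original_size=(640, 480), target_size=(320, 240))
# fx, fy, cx, cy are each scaled by 0.5 -> 250.0, 250.0, 160.0, 120.0
print(K_small[0, 0], K_small[1, 1], K_small[0, 2], K_small[1, 2])
```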
255
+ def load_image(
256
+ image_path: str,
257
+ target_size: Tuple[int, int] = (320, 240),
258
+ apply_resize_transform: bool = True,
259
+ ) -> np.ndarray:
260
+ """Load and preprocess image for Panoptic Recon 3D inference.
261
+
262
+ This function matches the preprocessing in test_triton_server.py exactly:
263
+ 1. Load image as RGB
264
+ 2. Resize to target_size (default 320x240)
265
+ 3. Apply ResizeShortestEdge transform (short_edge=240, max_size=320)
266
+ 4. Convert to CHW format with batch dimension
267
+
268
+ Args:
269
+ image_path: Path to image file.
270
+ target_size: Target size (width, height). Default (320, 240).
271
+ apply_resize_transform: Whether to apply ResizeShortestEdge transform.
272
+
273
+ Returns:
274
+ Image as numpy array (1, C, H, W) in RGB format, uint8 dtype.
275
+ """
276
+ # Load image
277
+ img = Image.open(image_path).convert('RGB')
278
+ # Note: Image.open raises FileNotFoundError on a missing path and never
279
+ # returns None, so no explicit existence check is needed here.
280
+
281
+ # Resize to target size
282
+ img = img.resize(target_size)
283
+ img = np.array(img)
284
+
285
+ # Apply ResizeShortestEdge transform (matches test_triton_server.py)
286
+ if apply_resize_transform:
287
+ resize_instance = ResizeShortestEdge(
288
+ orig_size=(target_size[0], target_size[1]), # (width, height)
289
+ short_edge_length=240,
290
+ max_size=320,
291
+ )
292
+ img = resize_instance.apply_image(img)
293
+
294
+ # Convert to CHW format with contiguous memory (critical for torch.from_numpy)
295
+ image = np.ascontiguousarray(img.transpose(2, 0, 1))
296
+
297
+ # Add batch dimension: (C, H, W) -> (1, C, H, W)
298
+ image = image[np.newaxis, ...]
299
+
300
+ return image
301
+
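The returned array can be passed to the model directly; a small sketch of the expected output format (the path is a placeholder):

```python
batch = load_image("path/to/your/image.png", target_size=(320, 240))
print(batch.shape, batch.dtype)  # (1, 3, 240, 320) uint8

# If a torch tensor is needed instead of a numpy array:
# import torch
# tensor = torch.from_numpy(batch)
```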
302
+ class DatasetConstants:
303
+ """Constants for Front3D dataset."""
304
+ DEFAULT_GRID_DIMS = [256, 256, 256]
305
+ DEFAULT_DEPTH_RANGE = (0.4, 6.0)
306
+ DEFAULT_VOXEL_SIZE = 0.03
307
+ DEFAULT_IMG_SIZE = (240, 320) # (height, width)
308
+ IGNORE_LABEL = 255
309
+
310
+ INTRINSIC = DEFAULT_INTRINSIC
311
+
312
+ CATEGORIES = [
313
+ {"color": (220, 20, 60), "isthing": 1, "id": 1, "trainId": 1, "name": "cabinet"},
314
+ {"color": (255, 0, 0), "isthing": 1, "id": 2, "trainId": 2, "name": "bed"},
315
+ {"color": (0, 0, 142), "isthing": 1, "id": 3, "trainId": 3, "name": "chair"},
316
+ {"color": (0, 0, 70), "isthing": 1, "id": 4, "trainId": 4, "name": "sofa"},
317
+ {"color": (0, 60, 100), "isthing": 1, "id": 5, "trainId": 5, "name": "table"},
318
+ {"color": (0, 80, 100), "isthing": 1, "id": 6, "trainId": 6, "name": "desk"},
319
+ {"color": (0, 0, 230), "isthing": 1, "id": 7, "trainId": 7, "name": "dresser"},
320
+ {"color": (119, 11, 32), "isthing": 1, "id": 8, "trainId": 8, "name": "lamp"},
321
+ {"color": (190, 50, 60), "isthing": 1, "id": 9, "trainId": 9, "name": "other"},
322
+ {"color": (102, 102, 156), "isthing": 0, "id": 10, "trainId": 10, "name": "wall"},
323
+ {"color": (128, 64, 128), "isthing": 0, "id": 11, "trainId": 11, "name": "floor"},
324
+ {"color": (70, 70, 70), "isthing": 0, "id": 12, "trainId": 12, "name": "ceiling"},
325
+ ]
326
+
327
+ STUFF_CLASSES = [10, 11]
328
+
requirements.txt ADDED
@@ -0,0 +1,30 @@
1
+ numpy==1.26.4
2
+ omegaconf
3
+ addict
4
+ pytest
5
+ pytorch-lightning
6
+ kubernetes
7
+ nvidia-eff
8
+ timm
9
+ open_clip_torch
10
+ colorama
11
+ pycocotools
12
+ fvcore
13
+ opencv-python-headless
14
+ huggingface-hub
15
+ trimesh
16
+ torchdata
17
+ pycryptodome
18
+ plyfile
19
+ pyexr
20
+ OpenEXR
21
+ einops
22
+ scikit-fmm
23
+ pysdf
24
+ scipy
25
+ plotly
26
+ orjson
27
+ pymongo
28
+ matplotlib
29
+ hydra-core
30
+ scikit-image
visualization.py ADDED
@@ -0,0 +1,470 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """
17
+ Visualization utilities for Panoptic Recon 3D model outputs.
18
+
19
+ This module provides functions for:
20
+ - 2D segmentation visualization
21
+ - Depth map visualization
22
+ - 3D mesh extraction and PLY export
23
+ """
24
+
25
+ from pathlib import Path
26
+ from typing import Optional, Tuple, Union
27
+
28
+ import numpy as np
29
+
30
+ # Optional imports for visualization
31
+ try:
32
+ import matplotlib.pyplot as plt
33
+ import matplotlib.patches as mpatches
34
+ HAS_MATPLOTLIB = True
35
+ except ImportError:
36
+ HAS_MATPLOTLIB = False
37
+
38
+ try:
39
+ from PIL import Image
40
+ HAS_PIL = True
41
+ except ImportError:
42
+ HAS_PIL = False
43
+
44
+ try:
45
+ from skimage import measure
46
+ HAS_SKIMAGE = True
47
+ except ImportError:
48
+ HAS_SKIMAGE = False
49
+
50
+ try:
51
+ from scipy.spatial import KDTree
52
+ HAS_SCIPY = True
53
+ except ImportError:
54
+ HAS_SCIPY = False
55
+
56
+
57
+ def create_color_palette() -> np.ndarray:
58
+ """Create Front3D color palette for semantic classes.
59
+
60
+ Returns:
61
+ Color palette as numpy array (N, 3) with uint8 RGB values.
62
+ """
63
+ return np.array([
64
+ (0, 0, 0), # 0: background
65
+ (174, 199, 232), # 1: wall
66
+ (152, 223, 138), # 2: floor
67
+ (31, 119, 180), # 3: cabinet
68
+ (255, 187, 120), # 4: bed
69
+ (188, 189, 34), # 5: chair
70
+ (140, 86, 75), # 6: sofa
71
+ (255, 152, 150), # 7: table
72
+ (214, 39, 40), # 8: door
73
+ (197, 176, 213), # 9: window
74
+ (148, 103, 189), # 10: bookshelf
75
+ (196, 156, 148), # 11: picture
76
+ (23, 190, 207), # 12: counter
77
+ (178, 76, 76), # 13
78
+ (247, 182, 210), # 14: desk
79
+ (66, 188, 102), # 15
80
+ (219, 219, 141), # 16: curtain
81
+ (140, 57, 197), # 17
82
+ (202, 185, 52), # 18
83
+ (51, 176, 203), # 19
84
+ (200, 54, 131), # 20
85
+ (92, 193, 61), # 21
86
+ (78, 71, 183), # 22
87
+ (172, 114, 82), # 23
88
+ (255, 127, 14), # 24: refrigerator
89
+ (91, 163, 138), # 25
90
+ (153, 98, 156), # 26
91
+ (140, 153, 101), # 27
92
+ (158, 218, 229), # 28: shower curtain
93
+ (100, 125, 154), # 29
94
+ (178, 127, 135), # 30
95
+ (120, 185, 128), # 31
96
+ (146, 111, 194), # 32
97
+ (44, 160, 44), # 33: toilet
98
+ (112, 128, 144), # 34: sink
99
+ (96, 207, 209), # 35
100
+ (227, 119, 194), # 36: bathtub
101
+ (213, 92, 176), # 37
102
+ (94, 106, 211), # 38
103
+ (82, 84, 163), # 39: otherfurn
104
+ (100, 85, 144), # 40
105
+ (172, 172, 172), # 41
106
+ ], dtype=np.uint8)
107
+
108
+
109
+ def colorize_segmentation(
110
+ segmentation: np.ndarray,
111
+ palette: Optional[np.ndarray] = None,
112
+ ) -> np.ndarray:
113
+ """Colorize segmentation map.
114
+
115
+ Args:
116
+ segmentation: Segmentation map (H, W) with class indices.
117
+ palette: Color palette (N, 3). Uses default if None.
118
+
119
+ Returns:
120
+ Colorized image (H, W, 3) as uint8.
121
+ """
122
+ if palette is None:
123
+ palette = create_color_palette()
124
+
125
+ # Clip indices to valid range
126
+ seg_clipped = np.clip(segmentation, 0, len(palette) - 1)
127
+ return palette[seg_clipped]
128
+
129
+
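A toy example of the palette lookup (a sketch; assumes this file is importable as the `visualization` module, per the filename above):

```python
import numpy as np
from visualization import colorize_segmentation

# 2x3 label map; ids index into the palette above (1 = wall, 2 = floor).
seg = np.array([[0, 1, 2],
                [2, 1, 0]])
colored = colorize_segmentation(seg)
print(colored.shape)   # (2, 3, 3), uint8 RGB
print(colored[0, 1])   # [174 199 232] -> wall color
```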
130
+ def visualize_2d_segmentation(
131
+ image: np.ndarray,
132
+ panoptic_2d: np.ndarray,
133
+ output_path: Optional[Union[str, Path]] = None,
134
+ alpha: float = 0.6,
135
+ figsize: Tuple[int, int] = (18, 6),
136
+ dpi: int = 150,
137
+ ) -> Optional[np.ndarray]:
138
+ """Visualize 2D panoptic segmentation overlaid on image.
139
+
140
+ Args:
141
+ image: Original RGB image (H, W, C).
142
+ panoptic_2d: Panoptic segmentation map (H, W).
143
+ output_path: Path to save visualization. If None, returns array.
144
+ alpha: Blend alpha for overlay.
145
+ figsize: Figure size.
146
+ dpi: DPI for saved figure.
147
+
148
+ Returns:
149
+ Overlay image as numpy array if output_path is None.
150
+ """
151
+ if not HAS_MATPLOTLIB:
152
+ raise ImportError("matplotlib required for visualization")
153
+ if not HAS_PIL:
154
+ raise ImportError("PIL required for visualization")
155
+
156
+ # Get color palette
157
+ palette = create_color_palette()
158
+ colored_seg = colorize_segmentation(panoptic_2d, palette)
159
+
160
+ # Resize image to match segmentation if needed
161
+ if image.shape[:2] != panoptic_2d.shape:
162
+ image_pil = Image.fromarray(image)
163
+ image_pil = image_pil.resize((panoptic_2d.shape[1], panoptic_2d.shape[0]), Image.LANCZOS)
164
+ image = np.array(image_pil)
165
+
166
+ # Create overlay
167
+ overlay = (image.astype(np.float32) * (1 - alpha) + colored_seg.astype(np.float32) * alpha)
168
+ overlay = overlay.clip(0, 255).astype(np.uint8)
169
+
170
+ if output_path is None:
171
+ return overlay
172
+
173
+ # Create side-by-side visualization
174
+ fig, axes = plt.subplots(1, 3, figsize=figsize)
175
+
176
+ axes[0].imshow(image)
177
+ axes[0].set_title('Original Image', fontsize=14, fontweight='bold')
178
+ axes[0].axis('off')
179
+
180
+ axes[1].imshow(colored_seg)
181
+ axes[1].set_title('Panoptic Segmentation', fontsize=14, fontweight='bold')
182
+ axes[1].axis('off')
183
+
184
+ axes[2].imshow(overlay)
185
+ axes[2].set_title('Overlay', fontsize=14, fontweight='bold')
186
+ axes[2].axis('off')
187
+
188
+ plt.tight_layout()
189
+ plt.savefig(output_path, dpi=dpi, bbox_inches='tight')
190
+ plt.close()
191
+
192
+ print(f"✓ Saved 2D segmentation visualization to: {output_path}")
193
+ return None
194
+
195
+
196
+ def visualize_depth_map(
197
+ depth_2d: np.ndarray,
198
+ output_path: Optional[Union[str, Path]] = None,
199
+ vmin: float = 0.0,
200
+ vmax: float = 6.0,
201
+ cmap: str = 'viridis',
202
+ figsize: Tuple[int, int] = (10, 8),
203
+ dpi: int = 150,
204
+ ) -> Optional[np.ndarray]:
205
+ """Visualize depth map.
206
+
207
+ Args:
208
+ depth_2d: Depth map (H, W).
209
+ output_path: Path to save visualization. If None, returns array.
210
+ vmin: Minimum depth for colormap.
211
+ vmax: Maximum depth for colormap.
212
+ cmap: Matplotlib colormap name.
213
+ figsize: Figure size.
214
+ dpi: DPI for saved figure.
215
+
216
+ Returns:
217
+ Colorized depth as numpy array if output_path is None.
218
+ """
219
+ if not HAS_MATPLOTLIB:
220
+ raise ImportError("matplotlib required for visualization")
221
+
222
+ # Normalize depth
223
+ depth_norm = (depth_2d - vmin) / (vmax - vmin)
224
+ depth_norm = np.clip(depth_norm, 0, 1)
225
+
226
+ # Get colormap
227
+ cm = plt.get_cmap(cmap)
228
+ depth_colored = (cm(depth_norm)[:, :, :3] * 255).astype(np.uint8)
229
+
230
+ if output_path is None:
231
+ return depth_colored
232
+
233
+ fig, ax = plt.subplots(1, 1, figsize=figsize)
234
+
235
+ im = ax.imshow(depth_2d, cmap=cmap, vmin=vmin, vmax=vmax)
236
+ ax.set_title('Depth Map', fontsize=14, fontweight='bold')
237
+ ax.axis('off')
238
+
239
+ cbar = plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
240
+ cbar.set_label('Depth (m)', rotation=270, labelpad=20, fontsize=12)
241
+
242
+ plt.tight_layout()
243
+ plt.savefig(output_path, dpi=dpi, bbox_inches='tight')
244
+ plt.close()
245
+
246
+ print(f"✓ Saved depth map visualization to: {output_path}")
247
+ return None
248
+
249
+
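A quick sketch of rendering a synthetic depth ramp with this helper (the output filename is arbitrary; matplotlib must be installed):

```python
import numpy as np
from visualization import visualize_depth_map

# 120x160 ramp spanning the 0.4-6.0 m depth range used by the dataset constants.
depth = np.tile(np.linspace(0.4, 6.0, 160), (120, 1))
visualize_depth_map(depth, "depth_ramp.png", vmin=0.4, vmax=6.0)
```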
250
+ def get_mesh(
251
+ distance_field: np.ndarray,
252
+ iso_value: float = 1.0,
253
+ spacing: Tuple[float, float, float] = (1.0, 1.0, 1.0),
254
+ ) -> Tuple[np.ndarray, np.ndarray]:
255
+ """Extract mesh from distance field using marching cubes.
256
+
257
+ Args:
258
+ distance_field: 3D distance field (D, H, W).
259
+ iso_value: Iso-surface value.
260
+ spacing: Voxel spacing.
261
+
262
+ Returns:
263
+ vertices: Mesh vertices (N, 3).
264
+ faces: Mesh faces (M, 3).
265
+ """
266
+ if not HAS_SKIMAGE:
267
+ raise ImportError("scikit-image required for mesh extraction")
268
+
269
+ vertices, faces, _, _ = measure.marching_cubes(
270
+ distance_field,
271
+ level=iso_value,
272
+ spacing=spacing
273
+ )
274
+ return vertices, faces
275
+
276
+
277
+ def write_ply(
278
+ vertices: np.ndarray,
279
+ output_file: Union[str, Path],
280
+ colors: Optional[np.ndarray] = None,
281
+ faces: Optional[np.ndarray] = None,
282
+ ) -> None:
283
+ """Write PLY file.
284
+
285
+ Args:
286
+ vertices: Vertex positions (N, 3).
287
+ output_file: Output PLY file path.
288
+ colors: Optional vertex colors (N, 3) as uint8.
289
+ faces: Optional face indices (M, 3).
290
+ """
291
+ with open(output_file, "w") as f:
292
+ f.write("ply\n")
293
+ f.write("format ascii 1.0\n")
294
+ f.write(f"element vertex {len(vertices)}\n")
295
+ f.write("property float x\n")
296
+ f.write("property float y\n")
297
+ f.write("property float z\n")
298
+
299
+ if colors is not None:
300
+ f.write("property uchar red\n")
301
+ f.write("property uchar green\n")
302
+ f.write("property uchar blue\n")
303
+
304
+ if faces is not None and len(faces) > 0:
305
+ f.write(f"element face {len(faces)}\n")
306
+ f.write("property list uchar uint vertex_indices\n")
307
+
308
+ f.write("end_header\n")
309
+
310
+ # Write vertices
311
+ if colors is not None:
312
+ for v, c in zip(vertices, colors):
313
+ f.write(f"{v[0]} {v[1]} {v[2]} {int(c[0])} {int(c[1])} {int(c[2])}\n")
314
+ else:
315
+ for v in vertices:
316
+ f.write(f"{v[0]} {v[1]} {v[2]}\n")
317
+
318
+ # Write faces
319
+ if faces is not None:
320
+ for face in faces:
321
+ f.write(f"3 {face[0]} {face[1]} {face[2]}\n")
322
+
323
+
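To illustrate the two helpers above together, here is a sketch that meshes a synthetic sphere SDF and writes it to disk (values and filename are arbitrary; scikit-image must be installed). The model's own distance fields are meshed with the default iso_value of 1.0, whereas a plain SDF uses the zero level set:

```python
import numpy as np
from visualization import get_mesh, write_ply

# Synthetic signed distance field of a sphere with radius 0.5 on a 64^3 grid.
grid = np.linspace(-1.0, 1.0, 64)
x, y, z = np.meshgrid(grid, grid, grid, indexing="ij")
sdf = np.sqrt(x**2 + y**2 + z**2) - 0.5

verts, faces = get_mesh(sdf, iso_value=0.0, spacing=(0.03, 0.03, 0.03))
write_ply(verts, "sphere.ply", faces=faces)
print(len(verts), len(faces))
```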
324
+ def save_3d_mesh(
325
+ geometry_3d: np.ndarray,
326
+ semantic_3d: np.ndarray,
327
+ output_path: Union[str, Path],
328
+ iso_value: float = 1.0,
329
+ voxel_size: float = 0.03,
330
+ ) -> bool:
331
+ """Extract and save 3D mesh with semantic colors.
332
+
333
+ Args:
334
+ geometry_3d: 3D geometry/TSDF (D, H, W).
335
+ semantic_3d: 3D semantic segmentation (D, H, W).
336
+ output_path: Output PLY file path.
337
+ iso_value: Iso-surface value for mesh extraction.
338
+ voxel_size: Voxel size in meters.
339
+
340
+ Returns:
341
+ True if successful, False otherwise.
342
+ """
343
+ if not HAS_SKIMAGE:
344
+ print("Warning: scikit-image not installed. Cannot save PLY mesh.")
345
+ return False
346
+ if not HAS_SCIPY:
347
+ print("Warning: scipy not installed. Cannot color mesh by semantics.")
348
+
349
+ try:
350
+ # Extract mesh
351
+ vertices, faces = get_mesh(
352
+ geometry_3d,
353
+ iso_value=iso_value,
354
+ spacing=(voxel_size, voxel_size, voxel_size)
355
+ )
356
+
357
+ colors = None
358
+ if HAS_SCIPY and np.any(semantic_3d):
359
+ # Get non-zero labeled voxels
360
+ nonzero_coords = np.stack(semantic_3d.nonzero(), axis=-1)
361
+
362
+ if len(nonzero_coords) > 0:
363
+ # Build KD tree for nearest neighbor lookup
364
+ labels_kd = KDTree(nonzero_coords)
365
+ palette = create_color_palette()
366
+
367
+ # Create color volume
368
+ semantic_clipped = np.clip(semantic_3d, 0, len(palette) - 1).astype(np.uint32)
369
+ color_volume = palette[semantic_clipped]
370
+
371
+ # Find nearest label for each vertex
372
+ # Scale vertices to voxel indices
373
+ vertex_indices = (vertices / voxel_size).astype(int)
374
+ neighbor_inds = labels_kd.query(vertex_indices)[1]
375
+ neighbors = labels_kd.data[neighbor_inds].astype(int)
376
+
377
+ # Clip to valid indices
378
+ neighbors = np.clip(neighbors, 0, np.array(color_volume.shape[:3]) - 1)
379
+ colors = color_volume[neighbors[:, 0], neighbors[:, 1], neighbors[:, 2]]
380
+
381
+ # Write PLY
382
+ write_ply(vertices, output_path, colors, faces)
383
+ print(f"✓ Saved 3D mesh to: {output_path}")
384
+ print(f" Vertices: {len(vertices)}, Faces: {len(faces)}")
385
+ return True
386
+
387
+ except Exception as e:
388
+ print(f"Warning: Failed to save 3D mesh: {e}")
389
+ return False
390
+
391
+
392
+ def save_outputs(
393
+ outputs,
394
+ output_dir: Union[str, Path],
395
+ original_image: Optional[np.ndarray] = None,
396
+ save_mesh: bool = True,
397
+ save_depth: bool = True,
398
+ save_segmentation: bool = True,
399
+ save_numpy: bool = True,
400
+ ) -> dict:
401
+ """Save all model outputs to directory.
402
+
403
+ Args:
404
+ outputs: PanopticRecon3DOutput from model.
405
+ output_dir: Output directory.
406
+ original_image: Optional original input image for visualization.
407
+ save_mesh: Whether to save 3D mesh PLY files.
408
+ save_depth: Whether to save depth visualization.
409
+ save_segmentation: Whether to save segmentation visualization.
410
+ save_numpy: Whether to save raw numpy arrays.
411
+
412
+ Returns:
413
+ Dictionary of saved file paths.
414
+ """
415
+ output_dir = Path(output_dir)
416
+ output_dir.mkdir(parents=True, exist_ok=True)
417
+
418
+ saved_files = {}
419
+
420
+ # Convert outputs to numpy
421
+ outputs_np = outputs.to_numpy()
422
+
423
+ # Save numpy arrays
424
+ if save_numpy:
425
+ for name, arr in outputs_np.items():
426
+ npy_path = output_dir / f"{name}.npy"
427
+ np.save(npy_path, arr)
428
+ saved_files[f"{name}_npy"] = str(npy_path)
429
+
430
+ # Save 2D segmentation visualization
431
+ if save_segmentation and original_image is not None:
432
+ seg_path = output_dir / "panoptic_2d_visualization.png"
433
+ visualize_2d_segmentation(
434
+ original_image,
435
+ outputs_np["panoptic_seg_2d"],
436
+ seg_path
437
+ )
438
+ saved_files["segmentation_vis"] = str(seg_path)
439
+
440
+ # Save depth visualization
441
+ if save_depth:
442
+ depth_path = output_dir / "depth_visualization.png"
443
+ visualize_depth_map(
444
+ outputs_np["depth_2d"],
445
+ depth_path
446
+ )
447
+ saved_files["depth_vis"] = str(depth_path)
448
+
449
+ # Save 3D meshes
450
+ if save_mesh:
451
+ # Semantic mesh
452
+ semantic_mesh_path = output_dir / "mesh_semantic.ply"
453
+ if save_3d_mesh(
454
+ outputs_np["geometry_3d"],
455
+ outputs_np["semantic_seg_3d"],
456
+ semantic_mesh_path
457
+ ):
458
+ saved_files["semantic_mesh"] = str(semantic_mesh_path)
459
+
460
+ # Panoptic mesh
461
+ panoptic_mesh_path = output_dir / "mesh_panoptic.ply"
462
+ if save_3d_mesh(
463
+ outputs_np["geometry_3d"],
464
+ outputs_np["panoptic_seg_3d"],
465
+ panoptic_mesh_path
466
+ ):
467
+ saved_files["panoptic_mesh"] = str(panoptic_mesh_path)
468
+
469
+ return saved_files
470
+
weights/model_2d_fp32.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:641322e833c0a498a13695b52188dce5cf4e1fcf18fee58fc3b1c3a0a758af4a
3
+ size 5514739585
weights/tao_vggt_front3d.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19186a4c1308bb255eb088997b70445b240a40370d9eba4cab5cf7919861f20b
3
+ size 130585626