Update model inference code and environment setup instructions (#4)
- Update model inference code and environment setup instructions (911b37924fa99d3bb5958065d613047c8f21d115)
- Add 3D checkpoint (57eed615af595e01cd946aa2cf033d13ac027b21)
- Add 2D TorchScript model (6f5c9c267e07aa33e76aa77ec4db3b152b2e16b1)
Co-authored-by: Lam Thanh Tin <tinlam-nv@users.noreply.huggingface.co>
- README.md +57 -0
- config.json +19 -0
- model.py +987 -0
- nvpanoptix_3d/__init__.py +15 -0
- nvpanoptix_3d/blocks.py +417 -0
- nvpanoptix_3d/model_3d.py +355 -0
- nvpanoptix_3d/mp_occ/__init__.py +15 -0
- nvpanoptix_3d/mp_occ/back_projection.py +114 -0
- nvpanoptix_3d/mp_occ/multiplane_occupancy.py +175 -0
- nvpanoptix_3d/mp_occ/occupancy_aware_lifting.py +202 -0
- nvpanoptix_3d/reconstruction/__init__.py +20 -0
- nvpanoptix_3d/reconstruction/decoder.py +385 -0
- nvpanoptix_3d/reconstruction/frustum.py +112 -0
- nvpanoptix_3d/reconstruction/reprojection.py +235 -0
- nvpanoptix_3d/utils/__init__.py +15 -0
- nvpanoptix_3d/utils/coords_transform.py +232 -0
- nvpanoptix_3d/utils/frustum.py +192 -0
- nvpanoptix_3d/utils/helper.py +326 -0
- nvpanoptix_3d/utils/point_features.py +127 -0
- nvpanoptix_3d/utils/sparse_tensor.py +257 -0
- preprocessing.py +328 -0
- requirements.txt +30 -0
- visualization.py +470 -0
- weights/model_2d_fp32.pt +3 -0
- weights/tao_vggt_front3d.pth +3 -0
README.md
CHANGED
@@ -20,6 +20,63 @@ Global <br>

This model is intended for researchers and developers building 3D scene understanding applications for indoor environments, including robotics navigation, augmented reality, virtual reality, and architectural visualization. <br>

## How to use

### Setup environment

```bash
# Set up the NVPanoptix-3D environment (CUDA 11.8):
conda create -n nvpanoptix python=3.10 -y
conda activate nvpanoptix  # or: source activate nvpanoptix
apt-get update && apt-get install -y git git-lfs ninja-build cmake libopenblas-dev
git lfs install

git clone https://huggingface.co/nvidia/nvpanoptix-3d
cd nvpanoptix-3d
pip install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu118
pip install -r requirements.txt

# Temporarily set the CUDA architecture list for MinkowskiEngine
export TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 9.0+PTX"
pip install ninja && FORCE_CUDA=1 pip install git+https://github.com/NVIDIA/MinkowskiEngine.git --no-build-isolation
```

### Quick Start

```python
from model import PanopticRecon3DModel
from preprocessing import load_image
from visualization import save_outputs
from PIL import Image
import numpy as np

# Load the model from a local directory
model = PanopticRecon3DModel.from_pretrained("nvpanoptix-3d")

# Or load from the HF repo
# model = PanopticRecon3DModel.from_pretrained("nvidia/nvpanoptix-3d")

# Load and preprocess the image
image_path = "path/to/your/image.png"

# Keep the original image for visualization
orig_image = Image.open(image_path).convert("RGB")
orig_image = np.array(orig_image)

# Load the processed image for inference
image = load_image(image_path, target_size=(320, 240))

# Run inference
outputs = model.predict(image)

# Save results (2D segmentation, depth map, 3D mesh)
save_outputs(outputs, "output_dir/", original_image=orig_image)

# Access individual outputs
print(f"2D Panoptic: {outputs.panoptic_seg_2d.shape}")   # (120, 160)
print(f"2D Depth: {outputs.depth_2d.shape}")             # (120, 160)
print(f"3D Geometry: {outputs.geometry_3d.shape}")       # (256, 256, 256)
print(f"3D Semantic: {outputs.semantic_seg_3d.shape}")   # (256, 256, 256)
```

## Release Date

Hugging Face: 11/25/2025 via https://huggingface.co/nvidia/3d_panoptic_reconstruction <br>
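One note on the Quick Start above: `predict()` builds a frustum mask automatically when none is passed. A minimal sketch of precomputing that mask yourself, using the same `create_frustum_mask` call pattern that `model.predict` uses internally (the helper and `DEFAULT_INTRINSIC` come from `preprocessing.py`; the literal values mirror the defaults in `config.json`):

```python
import numpy as np
from preprocessing import create_frustum_mask, DEFAULT_INTRINSIC

# Same call pattern predict() uses internally when frustum_mask is None:
# volume_shape = (frustum_dims,) * 3, depth_range = (depth_min, depth_max),
# image_shape = (height, width) for target_size (320, 240).
mask = create_frustum_mask(
    intrinsics=DEFAULT_INTRINSIC.copy(),
    volume_shape=(256, 256, 256),
    depth_range=(0.4, 6.0),
    voxel_size=0.03,
    image_shape=(240, 320),
)
print(mask.shape)  # expected: (256, 256, 256)

# Pass it explicitly to skip the auto-generation step:
# outputs = model.predict(image, frustum_mask=mask)
```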
config.json
ADDED
@@ -0,0 +1,19 @@

```json
{
    "num_classes": 13,
    "num_thing_classes": 9,
    "object_mask_threshold": 0.8,
    "overlap_threshold": 0.5,
    "frustum_dims": 256,
    "truncation": 3.0,
    "iso_recon_value": 2.0,
    "voxel_size": 0.03,
    "depth_min": 0.4,
    "depth_max": 6.0,
    "target_size": [320, 240],
    "reduced_target_size": [160, 120],
    "size_divisibility": 32,
    "downsample_factor": 1,
    "is_matterport": false,
    "use_fp16_2d": false
}
```
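The values above mirror the `PanopticRecon3DConfig` defaults defined in `model.py`. A small sketch of round-tripping the shipped file through that dataclass, assuming you run it from the repo root where `model.py` is importable:

```python
import json
from model import PanopticRecon3DConfig

# from_dict converts the JSON lists (target_size, reduced_target_size, ...)
# back into the tuples the dataclass expects.
with open("config.json") as f:
    cfg = PanopticRecon3DConfig.from_dict(json.load(f))

print(cfg.frustum_dims, cfg.voxel_size)        # 256 0.03
print(cfg.target_size, type(cfg.target_size))  # (320, 240) <class 'tuple'>
```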
model.py
ADDED
@@ -0,0 +1,987 @@

```python
#!/usr/bin/env python3
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
NVPanoptix-3D Model.
"""

import json
from pathlib import Path
from omegaconf import OmegaConf
from dataclasses import dataclass
from typing import Optional, Tuple, List, Dict, Any, Union

import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
from huggingface_hub import PyTorchModelHubMixin, hf_hub_download
from preprocessing import create_frustum_mask, DEFAULT_INTRINSIC

from nvpanoptix_3d.model_3d import Panoptic3DModel
from nvpanoptix_3d.utils.helper import get_kept_mapping, retry_if_cuda_oom
from nvpanoptix_3d.utils.coords_transform import (
    transform_feat3d_coordinates, fuse_sparse_tensors, generate_multiscale_feat3d
)


# Weight file names (stored in weights/ subdirectory)
WEIGHTS_DIR = "weights"
TORCHSCRIPT_2D_FILENAME = "model_2d_fp32.pt"
CHECKPOINT_3D_FILENAME = "tao_vggt_front3d.pth"


@dataclass
class PanopticRecon3DConfig:
    """Configuration for Panoptic Recon 3D model.

    This config is JSON-serializable and will be saved to config.json
    when using save_pretrained or push_to_hub.
    """
    # Model architecture
    num_classes: int = 13
    num_thing_classes: int = 9
    object_mask_threshold: float = 0.8
    overlap_threshold: float = 0.5
    test_topk_per_image: int = 100

    # Backbone
    backbone_type: str = "vggt"

    # Mask Former
    hidden_dim: int = 256
    num_queries: int = 100
    mask_dim: int = 256
    depth_dim: int = 256
    dec_layers: int = 10

    # 3D Frustum
    frustum_dims: int = 256
    truncation: float = 3.0
    iso_recon_value: float = 2.0
    voxel_size: float = 0.03

    # Projection
    depth_feature_dim: int = 256
    sign_channel: bool = True

    # Dataset/preprocessing
    target_size: Tuple[int, int] = (320, 240)
    reduced_target_size: Tuple[int, int] = (160, 120)
    depth_size: Tuple[int, int] = (120, 160)
    depth_min: float = 0.4
    depth_max: float = 6.0
    depth_scale: float = 25.0
    pixel_mean: Tuple[float, float, float] = (0.485, 0.456, 0.406)
    pixel_std: Tuple[float, float, float] = (0.229, 0.224, 0.225)
    ignore_label: int = 255
    size_divisibility: int = 32
    downsample_factor: int = 1

    # Model paths
    torchscript_2d_path: Optional[str] = None
    use_fp16_2d: bool = False

    # Dataset mode
    is_matterport: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """Convert config to dictionary."""
        return {
            "num_classes": self.num_classes,
            "num_thing_classes": self.num_thing_classes,
            "object_mask_threshold": self.object_mask_threshold,
            "overlap_threshold": self.overlap_threshold,
            "test_topk_per_image": self.test_topk_per_image,
            "backbone_type": self.backbone_type,
            "hidden_dim": self.hidden_dim,
            "num_queries": self.num_queries,
            "mask_dim": self.mask_dim,
            "depth_dim": self.depth_dim,
            "dec_layers": self.dec_layers,
            "frustum_dims": self.frustum_dims,
            "truncation": self.truncation,
            "iso_recon_value": self.iso_recon_value,
            "voxel_size": self.voxel_size,
            "depth_feature_dim": self.depth_feature_dim,
            "sign_channel": self.sign_channel,
            "target_size": list(self.target_size),
            "reduced_target_size": list(self.reduced_target_size),
            "depth_size": list(self.depth_size),
            "depth_min": self.depth_min,
            "depth_max": self.depth_max,
            "depth_scale": self.depth_scale,
            "pixel_mean": list(self.pixel_mean),
            "pixel_std": list(self.pixel_std),
            "ignore_label": self.ignore_label,
            "size_divisibility": self.size_divisibility,
            "downsample_factor": self.downsample_factor,
            "torchscript_2d_path": self.torchscript_2d_path,
            "use_fp16_2d": self.use_fp16_2d,
            "is_matterport": self.is_matterport,
        }

    @classmethod
    def from_dict(cls, config_dict: Dict[str, Any]) -> "PanopticRecon3DConfig":
        """Create config from dictionary."""
        # Convert lists back to tuples
        if "target_size" in config_dict:
            config_dict["target_size"] = tuple(config_dict["target_size"])
        if "reduced_target_size" in config_dict:
            config_dict["reduced_target_size"] = tuple(config_dict["reduced_target_size"])
        if "depth_size" in config_dict:
            config_dict["depth_size"] = tuple(config_dict["depth_size"])
        if "pixel_mean" in config_dict:
            config_dict["pixel_mean"] = tuple(config_dict["pixel_mean"])
        if "pixel_std" in config_dict:
            config_dict["pixel_std"] = tuple(config_dict["pixel_std"])
        return cls(**config_dict)


@dataclass
class PanopticRecon3DOutput:
    """Output from Panoptic Recon 3D model."""
    # 3D outputs
    panoptic_seg_3d: torch.Tensor  # (D, H, W) int32 - panoptic segmentation
    geometry_3d: torch.Tensor      # (D, H, W) float32 - TSDF/geometry
    semantic_seg_3d: torch.Tensor  # (D, H, W) int32 - semantic segmentation

    # 2D outputs
    panoptic_seg_2d: torch.Tensor  # (H, W) int32 - 2D panoptic segmentation
    depth_2d: torch.Tensor         # (H, W) float32 - depth map

    # Optional metadata
    panoptic_semantic_mapping: Optional[Dict[int, int]] = None
    segments_info: Optional[List[Dict]] = None

    def to_numpy(self) -> Dict[str, np.ndarray]:
        """Convert outputs to numpy arrays."""
        result = {
            "panoptic_seg_3d": self.panoptic_seg_3d.cpu().numpy(),
            "geometry_3d": self.geometry_3d.cpu().numpy(),
            "semantic_seg_3d": self.semantic_seg_3d.cpu().numpy(),
            "panoptic_seg_2d": self.panoptic_seg_2d.cpu().numpy(),
            "depth_2d": self.depth_2d.cpu().numpy(),
        }
        return result


class PanopticRecon3DModel(
    nn.Module,
    PyTorchModelHubMixin,
    # HuggingFace Hub metadata
    repo_url="nvidia/nvpanoptix-3d",
    pipeline_tag="image-segmentation",
    license="apache-2.0",
    tags=["panoptic-segmentation", "3d-reconstruction", "depth-estimation", "nvidia"],
):
    """
    This model performs panoptic 3D scene reconstruction from a single RGB image.
    It combines:
    - 2D panoptic segmentation
    - Depth estimation
    - 3D volumetric reconstruction

    The model architecture uses:
    - VGGT backbone for feature extraction
    - MaskFormer head for panoptic segmentation
    - Occupancy-aware lifting for 2D-to-3D projection
    - Sparse 3D convolutions for volumetric completion
    """

    def __init__(
        self,
        num_classes: int = 13,
        num_thing_classes: int = 9,
        object_mask_threshold: float = 0.8,
        overlap_threshold: float = 0.5,
        frustum_dims: int = 256,
        truncation: float = 3.0,
        iso_recon_value: float = 2.0,
        voxel_size: float = 0.03,
        depth_min: float = 0.4,
        depth_max: float = 6.0,
        target_size: Tuple[int, int] = (320, 240),
        reduced_target_size: Tuple[int, int] = (160, 120),
        size_divisibility: int = 32,
        downsample_factor: int = 1,
        is_matterport: bool = False,
        torchscript_2d_path: Optional[str] = None,
        use_fp16_2d: bool = False,
        **kwargs,
    ):
        """Initialize Panoptic Recon 3D model.

        Args:
            num_classes: Number of semantic classes.
            num_thing_classes: Number of "thing" (instance) classes.
            object_mask_threshold: Threshold for object mask confidence.
            overlap_threshold: Threshold for mask overlap.
            frustum_dims: Dimensions of 3D frustum volume.
            truncation: TSDF truncation distance.
            iso_recon_value: Iso-surface value for mesh extraction.
            voxel_size: Voxel size in meters.
            depth_min: Minimum depth value.
            depth_max: Maximum depth value.
            target_size: Target image size (width, height).
            reduced_target_size: Reduced target size for 3D projection.
            size_divisibility: Size divisibility for padding.
            downsample_factor: Downsample factor for 3D reconstruction.
            is_matterport: Whether using Matterport dataset mode.
            torchscript_2d_path: Path to TorchScript 2D model (optional).
            use_fp16_2d: Whether to use FP16 for 2D model.
        """
        super().__init__()

        # Store config as attributes (for PyTorchModelHubMixin serialization)
        self.num_classes = num_classes
        self.num_thing_classes = num_thing_classes
        self.object_mask_threshold = object_mask_threshold
        self.overlap_threshold = overlap_threshold
        self.frustum_dims_val = frustum_dims
        self.truncation = truncation
        self.iso_recon_value = iso_recon_value
        self.voxel_size = voxel_size
        self.depth_min = depth_min
        self.depth_max = depth_max
        self.target_size = target_size
        self.reduced_target_size = reduced_target_size
        self.size_divisibility = size_divisibility
        self.downsample_factor = downsample_factor
        self.is_matterport = is_matterport
        self.torchscript_2d_path = torchscript_2d_path
        self.use_fp16_2d = use_fp16_2d

        # Derived values
        self.frustum_dims = [frustum_dims] * 3

        # Models will be loaded on first use or via load_weights
        self.model_2d: Optional[torch.jit.ScriptModule] = None
        self.model_3d_components: Optional[Dict[str, nn.Module]] = None
        self._initialized = False

        # Placeholder for post processor
        self.post_processor = None

    @classmethod
    def _from_pretrained(
        cls,
        *,
        model_id: str,
        revision: Optional[str] = None,
        cache_dir: Optional[str] = None,
        force_download: bool = False,
        proxies: Optional[Dict] = None,
        resume_download: bool = False,
        local_files_only: bool = False,
        token: Optional[Union[str, bool]] = None,
        map_location: str = "cpu",
        strict: bool = False,
        **model_kwargs,
    ) -> "PanopticRecon3DModel":
        """Load model from HuggingFace Hub or local directory.

        This method handles loading both the TorchScript 2D model and the 3D checkpoint.

        Args:
            model_id: HuggingFace Hub repo ID or local directory path.
            revision: Git revision (branch, tag, or commit hash).
            cache_dir: Cache directory for downloaded files.
            force_download: Force re-download even if cached.
            proxies: Proxy configuration.
            resume_download: Resume interrupted downloads.
            local_files_only: Only use local files, don't download.
            token: HuggingFace API token.
            map_location: Device to load model onto.
            strict: Strict loading (not used for this model).
            **model_kwargs: Additional model arguments.

        Returns:
            Initialized PanopticRecon3DModel with weights loaded.
        """
        # Determine device
        device = model_kwargs.pop("device", None)
        if device is None:
            device = map_location if map_location != "cpu" else "cuda:0" if torch.cuda.is_available() else "cpu"

        # Check if local directory
        model_path = Path(model_id)
        if model_path.exists() and model_path.is_dir():
            # Local directory
            config_path = model_path / "config.json"
            weights_dir = model_path / WEIGHTS_DIR
            torchscript_2d_path = weights_dir / TORCHSCRIPT_2D_FILENAME
            checkpoint_3d_path = weights_dir / CHECKPOINT_3D_FILENAME

            # Load config if exists
            if config_path.exists():
                with open(config_path, "r") as f:
                    config = json.load(f)
                # Merge with model_kwargs (model_kwargs take precedence)
                for key, value in config.items():
                    if key not in model_kwargs:
                        model_kwargs[key] = value
        else:
            # HuggingFace Hub - download files
            # Download config.json
            try:
                config_file = hf_hub_download(
                    repo_id=model_id,
                    filename="config.json",
                    revision=revision,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    local_files_only=local_files_only,
                    token=token,
                )
                with open(config_file, "r") as f:
                    config = json.load(f)
                for key, value in config.items():
                    if key not in model_kwargs:
                        model_kwargs[key] = value
            except Exception:
                pass  # Config is optional

            # Download weight files
            torchscript_2d_path = hf_hub_download(
                repo_id=model_id,
                filename=f"{WEIGHTS_DIR}/{TORCHSCRIPT_2D_FILENAME}",
                revision=revision,
                cache_dir=cache_dir,
                force_download=force_download,
                proxies=proxies,
                resume_download=resume_download,
                local_files_only=local_files_only,
                token=token,
            )
            checkpoint_3d_path = hf_hub_download(
                repo_id=model_id,
                filename=f"{WEIGHTS_DIR}/{CHECKPOINT_3D_FILENAME}",
                revision=revision,
                cache_dir=cache_dir,
                force_download=force_download,
                proxies=proxies,
                resume_download=resume_download,
                local_files_only=local_files_only,
                token=token,
            )

        # Create model instance
        model = cls(**model_kwargs)

        # Load weights
        model.load_weights(
            torchscript_2d_path=str(torchscript_2d_path),
            checkpoint_3d_path=str(checkpoint_3d_path),
            device=device,
        )

        return model

    def _save_pretrained(self, save_directory: Path) -> None:
        """Save model to directory.

        This saves the config.json and copies weight files to the directory.

        Args:
            save_directory: Directory to save model to.
        """
        save_directory = Path(save_directory)
        save_directory.mkdir(parents=True, exist_ok=True)

        # Save config
        config = {
            "num_classes": self.num_classes,
            "num_thing_classes": self.num_thing_classes,
            "object_mask_threshold": self.object_mask_threshold,
            "overlap_threshold": self.overlap_threshold,
            "frustum_dims": self.frustum_dims_val,
            "truncation": self.truncation,
            "iso_recon_value": self.iso_recon_value,
            "voxel_size": self.voxel_size,
            "depth_min": self.depth_min,
            "depth_max": self.depth_max,
            "target_size": list(self.target_size),
            "reduced_target_size": list(self.reduced_target_size),
            "size_divisibility": self.size_divisibility,
            "downsample_factor": self.downsample_factor,
            "is_matterport": self.is_matterport,
            "use_fp16_2d": self.use_fp16_2d,
        }

        config_path = save_directory / "config.json"
        with open(config_path, "w") as f:
            json.dump(config, f, indent=2)

        # Create weights directory and copy/save weights
        weights_dir = save_directory / WEIGHTS_DIR
        weights_dir.mkdir(exist_ok=True)

        # Note: Weight files should be copied manually or the model
        # should be saved from a loaded state
        if self._initialized and hasattr(self, '_torchscript_2d_path'):
            import shutil
            # Copy TorchScript 2D model
            src_2d = Path(self._torchscript_2d_path)
            if src_2d.exists():
                shutil.copy2(src_2d, weights_dir / TORCHSCRIPT_2D_FILENAME)

            # Copy 3D checkpoint
            src_3d = Path(self._checkpoint_3d_path)
            if src_3d.exists():
                shutil.copy2(src_3d, weights_dir / CHECKPOINT_3D_FILENAME)

    def _build_omegaconf(self) -> Any:
        """Build OmegaConf config for internal model components."""

        return OmegaConf.create({
            "model": {
                "export": True,
                "mode": "panoptic",
                "object_mask_threshold": self.object_mask_threshold,
                "overlap_threshold": self.overlap_threshold,
                "test_topk_per_image": 100,
                "backbone": {"type": "vggt", "pretrained_weights": None},
                "sem_seg_head": {
                    "common_stride": 4,
                    "transformer_enc_layers": 6,
                    "convs_dim": 256,
                    "mask_dim": 256,
                    "depth_dim": 256,
                    "ignore_value": 255,
                    "deformable_transformer_encoder_in_features": ["res3", "res4", "res5"],
                    "num_classes": self.num_classes,
                    "norm": "GN",
                    "in_features": ["res2", "res3", "res4", "res5"]
                },
                "mask_former": {
                    "dropout": 0.0,
                    "nheads": 8,
                    "num_object_queries": 100,
                    "hidden_dim": 256,
                    "transformer_dim_feedforward": 1024,
                    "dim_feedforward": 2048,
                    "dec_layers": 10,
                    "pre_norm": False,
                    "class_weight": 2.0,
                    "dice_weight": 5.0,
                    "mask_weight": 5.0,
                    "depth_weight": 5.0,
                    "mp_occ_weight": 5.0,
                    "train_num_points": 12544,
                    "oversample_ratio": 3.0,
                    "importance_sample_ratio": 0.75,
                    "deep_supervision": True,
                    "no_object_weight": 0.1,
                    "size_divisibility": self.size_divisibility
                },
                "frustum3d": {
                    "truncation": self.truncation,
                    "iso_recon_value": self.iso_recon_value,
                    "panoptic_weight": 25.0,
                    "completion_weights": [50.0, 25.0, 10.0],
                    "surface_weight": 5.0,
                    "unet_output_channels": 16,
                    "unet_features": 16,
                    "use_multi_scale": False,
                    "grid_dimensions": self.frustum_dims_val,
                    "frustum_dims": self.frustum_dims_val,
                    "signed_channel": 3
                },
                "projection": {
                    "voxel_size": self.voxel_size,
                    "sign_channel": True,
                    "depth_feature_dim": 256
                }
            },
            "dataset": {
                "contiguous_id": False,
                "label_map": "",
                "name": "",  # Empty string to match Triton behavior (triggers adjust_intrinsic)
                "downsample_factor": self.downsample_factor,
                "iso_value": 1.0,
                "pixel_mean": [0.485, 0.456, 0.406],
                "pixel_std": [0.229, 0.224, 0.225],
                "ignore_label": 255,
                "min_instance_pixels": 200,
                "img_format": "RGB",
                "target_size": list(self.target_size),
                "reduced_target_size": list(self.reduced_target_size),
                "depth_size": [120, 160],
                "depth_bound": False,
                "depth_min": self.depth_min,
                "depth_max": self.depth_max,
                "frustum_mask_path": "",
                "occ_truncation_lvl": [8.0, 6.0],
                "truncation_range": [0.0, 12.0],
                "enable_3d": False,
                "enable_mp_occ": True,
                "depth_scale": 25.0,
                "num_thing_classes": self.num_thing_classes,
                "augmentation": {"size_divisibility": self.size_divisibility}
            }
        })

    def load_weights(
        self,
        torchscript_2d_path: Optional[str] = None,
        checkpoint_3d_path: Optional[str] = None,
        device: str = "cuda:0",
    ):
        """Load model weights.

        Args:
            torchscript_2d_path: Path to TorchScript 2D model file.
            checkpoint_3d_path: Path to 3D model checkpoint (.pth/.pt).
            device: Device to load models onto.
        """
        # Use stored path if not provided
        torchscript_2d_path = torchscript_2d_path or self.torchscript_2d_path

        if torchscript_2d_path is None:
            raise ValueError("torchscript_2d_path is required")
        if checkpoint_3d_path is None:
            raise ValueError("checkpoint_3d_path is required")

        # Store paths for save_pretrained
        self._torchscript_2d_path = torchscript_2d_path
        self._checkpoint_3d_path = checkpoint_3d_path

        # Build config
        cfg = self._build_omegaconf()

        # Load 2D TorchScript model
        self.model_2d = torch.jit.load(torchscript_2d_path, map_location=device)
        self.model_2d.eval()

        # Load 3D model from checkpoint
        full_model = Panoptic3DModel(cfg)

        checkpoint = torch.load(checkpoint_3d_path, map_location="cpu")
        state_dict = checkpoint.get("state_dict", checkpoint)

        # Remove 'model.' prefix if present
        filtered_state_dict = {}
        for key, value in state_dict.items():
            new_key = key[6:] if key.startswith("model.") else key
            filtered_state_dict[new_key] = value

        full_model.load_state_dict(filtered_state_dict, strict=False)
        full_model.to(device)
        full_model.eval()

        # Extract 3D components
        self.model_3d_components = {
            "ol": full_model.ol,
            "reprojection": full_model.reprojection,
            "completion": full_model.completion,
            "projector": full_model.projector,
            "back_projection": full_model.back_projection,
        }

        # Store post processor and helper functions
        self.post_processor = full_model.post_processor
        self.back_projection = full_model.back_projection  # Required for get_kept_mapping
        self._get_kept_mapping = get_kept_mapping
        self._transform_feat3d_coordinates = transform_feat3d_coordinates
        self._fuse_sparse_tensors = fuse_sparse_tensors
        self._generate_multiscale_feat3d = generate_multiscale_feat3d
        self._retry_if_cuda_oom = retry_if_cuda_oom
        self._panoptic_3d_inference = full_model.panoptic_3d_inference
        self._postprocess = full_model.postprocess
        self._cfg = cfg

        # Disable gradients for all components
        for module in self.model_3d_components.values():
            for param in module.parameters():
                param.requires_grad = False
            module.eval()

        self._initialized = True
        self._device = device

    def _ensure_initialized(self):
        """Ensure model is initialized."""
        if not self._initialized:
            raise RuntimeError(
                "Model weights not loaded. Call load_weights() first, or use "
                "from_pretrained() to load a pre-trained model."
            )

    def _infer_2d(
        self,
        images: torch.Tensor,
        intrinsic: torch.Tensor,
    ) -> Tuple[Dict[str, torch.Tensor], List[Dict], torch.Tensor]:
        """Run 2D inference using TorchScript model.

        Args:
            images: Input images (B, C, H, W) as uint8 or float.
            intrinsic: Camera intrinsics (B, 4, 4).

        Returns:
            outputs_2d: Dictionary of 2D model outputs.
            processed_results: List of processed results per image.
            occupancy_pred: Occupancy predictions.
        """
        # Run 2D model
        with torch.no_grad():
            if self.use_fp16_2d:
                with torch.cuda.amp.autocast():
                    outputs_dict = self.model_2d(images)
            else:
                outputs_dict = self.model_2d(images)

        # Normalize to FP32
        def to_fp32(x):
            return x.float() if isinstance(x, torch.Tensor) and x.dtype != torch.float32 else x

        # Extract outputs
        mask_cls_results = to_fp32(outputs_dict["pred_logits"])
        mask_pred_results = to_fp32(outputs_dict["pred_masks"])
        depth_pred_results = to_fp32(outputs_dict["pred_depths"])

        enc_features = [
            to_fp32(outputs_dict["enc_features_0"]),
            to_fp32(outputs_dict["enc_features_1"]),
            to_fp32(outputs_dict["enc_features_2"]),
            to_fp32(outputs_dict["enc_features_3"]),
        ]
        mask_features = to_fp32(outputs_dict["mask_features"])
        depth_features = to_fp32(outputs_dict["depth_features"])
        segm_decoder_out = to_fp32(outputs_dict["segm_decoder_out"])
        pose_enc = to_fp32(outputs_dict["pose_enc"])
        occupancy_pred = to_fp32(outputs_dict["occupancy_pred"])

        orig_pad_h = int(outputs_dict["orig_pad_h"].item())
        orig_pad_w = int(outputs_dict["orig_pad_w"].item())
        orig_h = int(outputs_dict["orig_h"].item())
        orig_w = int(outputs_dict["orig_w"].item())

        # Interpolate masks and depths
        padded_out_h, padded_out_w = orig_pad_h // 2, orig_pad_w // 2
        mask_pred_results = F.interpolate(
            mask_pred_results,
            size=(padded_out_h, padded_out_w),
            mode="bilinear",
            align_corners=False,
        )
        depth_pred_results = F.interpolate(
            depth_pred_results,
            size=(padded_out_h, padded_out_w),
            mode="bilinear",
            align_corners=False,
        )

        # Postprocess each image
        # NOTE: We need to track the CROPPED mask_pred_result for outputs_2d
        # (matching the Triton model behavior)
        processed_results = []
        final_mask_cls_result = None
        final_mask_pred_result = None

        for idx, (mask_cls_result, mask_pred_result, depth_pred_result, per_image_intrinsic) in enumerate(zip(
            mask_cls_results, mask_pred_results, depth_pred_results, intrinsic
        )):
            out_h, out_w = orig_h // 2, orig_w // 2
            processed_results.append({})

            # Remove padding - OVERWRITE the variable like Triton does
            mask_pred_result = mask_pred_result[:, :out_h, :out_w]
            depth_pred_result = depth_pred_result[:, :out_h, :out_w]

            # Panoptic inference
            panoptic_seg, depth_r, segments_info, sem_prob_masks = self._retry_if_cuda_oom(
                self.post_processor.panoptic_inference
            )(
                mask_cls_result,
                mask_pred_result,
                depth_pred_result
            )
            depth_r = depth_r[None]

            processed_results[-1]["panoptic_seg"] = (panoptic_seg, segments_info)
            processed_results[-1]["depth"] = depth_r[0]
            processed_results[-1]["image_size"] = (orig_w, orig_h)
            processed_results[-1]["padded_size"] = (orig_pad_w, orig_pad_h)
            processed_results[-1]["intrinsic"] = per_image_intrinsic
            processed_results[-1]["sem_seg"] = sem_prob_masks

            # Store last iteration's results for outputs_2d (matching Triton behavior)
            final_mask_cls_result = mask_cls_result
            final_mask_pred_result = mask_pred_result

        # Reconstruct outputs_2d - use CROPPED mask_pred_result from last iteration
        # This matches the Triton model's behavior exactly
        outputs_2d = {
            "pred_logits": final_mask_cls_result.unsqueeze(0),
            "pred_masks": final_mask_pred_result.unsqueeze(0),
            "enc_features": enc_features,
            "mask_features": mask_features,
            "depth_features": depth_features,
            "segm_decoder_out": segm_decoder_out,
            "pose_enc": pose_enc,
        }

        return outputs_2d, processed_results, occupancy_pred

    def _forward_3d(
        self,
        batched_inputs: Dict[str, torch.Tensor],
        outputs_2d: Dict[str, torch.Tensor],
        processed_results: List[Dict],
        kept: torch.Tensor,
        mapping: torch.Tensor,
        occupancy_pred: torch.Tensor,
    ) -> Dict[str, Any]:
        """Run 3D reconstruction pipeline.

        Args:
            batched_inputs: Dictionary containing frustum_mask, intrinsic, etc.
            outputs_2d: 2D model outputs.
            processed_results: Processed 2D results.
            kept: Kept voxel indices.
            mapping: Voxel to pixel mapping.
            occupancy_pred: Occupancy predictions.

        Returns:
            Postprocessed 3D results.
        """
        room_mask = batched_inputs.get("room_mask_buol") if self.is_matterport else None

        # Occupancy-aware lifting
        feat3d, mask3d = self.model_3d_components["ol"](
            processed_results, kept, mapping, occupancy_pred, room_mask
        )
        del occupancy_pred, mask3d
        torch.cuda.empty_cache()

        # Project features
        multi_scale_features = list(reversed(outputs_2d["enc_features"]))
        depth_features = self.model_3d_components["projector"](
            outputs_2d["depth_features"],
            outputs_2d["mask_features"].shape[-2:]
        )
        encoder_features = torch.cat([outputs_2d["mask_features"], depth_features], dim=1)

        sparse_multi_scale_features, sparse_encoder_features = self.model_3d_components["reprojection"](
            multi_scale_features, encoder_features, processed_results
        )

        del multi_scale_features, encoder_features
        torch.cuda.empty_cache()

        # Prepare 3D inputs
        segm_queries = outputs_2d["segm_decoder_out"]
        frustum_mask = batched_inputs["frustum_mask"]
        intrinsic = batched_inputs["intrinsic"]

        frustum_mask_64 = F.max_pool3d(
            frustum_mask[:, None].float(),
            kernel_size=2,
            stride=4
        ).bool()

        # Transform 3D coordinates
        transformed_feat3d = self._transform_feat3d_coordinates(feat3d, intrinsic)
        del feat3d

        # Fuse features
        if not self.is_matterport:
            multi_scale_feat3d = self._generate_multiscale_feat3d(transformed_feat3d)
            fused_multi_scale_features = [
                self._fuse_sparse_tensors(sparse_multi_scale_features[i], multi_scale_feat3d[i])
                for i in range(len(multi_scale_feat3d))
            ]
            del sparse_multi_scale_features, multi_scale_feat3d
        else:
            fused_multi_scale_features = sparse_multi_scale_features

        try:
            fused_encoder_features = self._fuse_sparse_tensors(
                sparse_encoder_features, transformed_feat3d
            )
        except Exception:
            fused_encoder_features = sparse_encoder_features

        del sparse_encoder_features, transformed_feat3d
        torch.cuda.empty_cache()

        # Run 3D completion
        outputs_3d = self.model_3d_components["completion"](
            fused_multi_scale_features,
            fused_encoder_features,
            segm_queries,
            frustum_mask_64
        )

        outputs_3d["pred_logits"] = outputs_2d["pred_logits"]
        outputs_3d["pred_masks"] = outputs_2d["pred_masks"]

        return self._postprocess(outputs_3d, outputs_2d, processed_results, frustum_mask)

    def forward(
        self,
        images: torch.Tensor,
        frustum_mask: torch.Tensor,
        intrinsic: torch.Tensor,
        height: Optional[torch.Tensor] = None,
        width: Optional[torch.Tensor] = None,
    ) -> PanopticRecon3DOutput:
        """Run full panoptic 3D reconstruction pipeline.

        Args:
            images: Input images (B, C, H, W) as uint8 [0-255] or float [0-1].
            frustum_mask: Boolean frustum mask (B, D, H, W).
            intrinsic: Camera intrinsic matrices (B, 4, 4).
            height: Optional image heights (B,).
            width: Optional image widths (B,).

        Returns:
            PanopticRecon3DOutput with 2D and 3D predictions.
        """
        self._ensure_initialized()

        # Prepare inputs
        if height is None:
            height = torch.tensor([images.shape[2]], device=images.device)
        if width is None:
            width = torch.tensor([images.shape[3]], device=images.device)

        batched_inputs = {
            "image": images,
            "frustum_mask": frustum_mask.bool(),
            "intrinsic": intrinsic,
            "height": height,
            "width": width,
        }

        # Run 2D inference
        outputs_2d, processed_results, occupancy_pred = self._infer_2d(images, intrinsic)

        # Compute kept and mapping (self has back_projection attribute)
        kept, mapping = self._get_kept_mapping(
            self,
            self._cfg,
            batched_inputs,
            device=images.device
        )

        # Run 3D inference
        outputs_3d = self._forward_3d(
            batched_inputs, outputs_2d, processed_results, kept, mapping, occupancy_pred
        )

        # Create output object
        return PanopticRecon3DOutput(
            panoptic_seg_3d=outputs_3d["panoptic_seg"][0],
            geometry_3d=outputs_3d["geometry"][0],
            semantic_seg_3d=outputs_3d["semantic_seg"][0],
            panoptic_seg_2d=outputs_3d["panoptic_seg_2d"][0][0],
            depth_2d=outputs_3d["depth"][0],
            panoptic_semantic_mapping=outputs_3d["panoptic_semantic_mapping"][0],
            segments_info=outputs_3d["panoptic_seg_2d"][0][1] if len(outputs_3d["panoptic_seg_2d"][0]) > 1 else None,
        )

    @torch.no_grad()
    def predict(
        self,
        image: Union[np.ndarray, torch.Tensor],
        frustum_mask: Optional[Union[np.ndarray, torch.Tensor]] = None,
        intrinsic: Optional[Union[np.ndarray, torch.Tensor]] = None,
    ) -> PanopticRecon3DOutput:
        """User-friendly prediction interface.

        Args:
            image: Input RGB image as numpy array. Accepted formats:
                - (H, W, C) HWC format uint8 [0-255]
                - (C, H, W) CHW format uint8 [0-255]
                - (1, C, H, W) batched CHW format (from load_image)
            frustum_mask: Optional frustum mask. If None, auto-generated using default intrinsic.
            intrinsic: Optional camera intrinsic (4x4). If None, uses DEFAULT_INTRINSIC.

        Returns:
            PanopticRecon3DOutput with predictions.
        """
        self._ensure_initialized()

        # Use default intrinsic if not provided
        if intrinsic is None:
            intrinsic = DEFAULT_INTRINSIC.copy()

        # Process image - match test_triton_server.py preprocessing exactly
        if isinstance(image, np.ndarray):
            # Handle different input formats
            if image.ndim == 4:
                # Already batched (1, C, H, W) - from load_image
                pass
            elif image.ndim == 3:
                if image.shape[2] == 3:
                    # HWC format -> CHW format
                    image = np.ascontiguousarray(image.transpose(2, 0, 1))
                # Now it's CHW, add batch dimension
                image = image[np.newaxis, ...]

            # Ensure uint8
            if image.dtype != np.uint8:
                if image.max() <= 1.0:
                    image = (image * 255).clip(0, 255).astype(np.uint8)
                else:
                    image = image.clip(0, 255).astype(np.uint8)

            image = torch.from_numpy(image)
        else:
            # Tensor input
            if image.dim() == 3:
                image = image.unsqueeze(0)
            # Ensure uint8
            if image.dtype != torch.uint8:
                if image.max() <= 1.0:
                    image = (image * 255).clamp(0, 255).to(torch.uint8)
                else:
                    image = image.clamp(0, 255).to(torch.uint8)

        image = image.to(self._device)

        # Generate frustum mask if not provided
        if frustum_mask is None:
            intrinsic_np = intrinsic if isinstance(intrinsic, np.ndarray) else intrinsic.cpu().numpy()
            frustum_mask = create_frustum_mask(
                intrinsics=intrinsic_np,
                volume_shape=(self.frustum_dims_val,) * 3,
                depth_range=(self.depth_min, self.depth_max),
                voxel_size=self.voxel_size,
                image_shape=(self.target_size[1], self.target_size[0]),
            )
            frustum_mask = torch.from_numpy(frustum_mask).unsqueeze(0)
        elif isinstance(frustum_mask, np.ndarray):
            frustum_mask = torch.from_numpy(frustum_mask)
            if frustum_mask.dim() == 3:
                frustum_mask = frustum_mask.unsqueeze(0)

        frustum_mask = frustum_mask.to(self._device)

        # Convert intrinsic to tensor
        if isinstance(intrinsic, np.ndarray):
            intrinsic = torch.from_numpy(intrinsic)
        if intrinsic.dim() == 2:
            intrinsic = intrinsic.unsqueeze(0)

        intrinsic = intrinsic.float().to(self._device)

        return self.forward(image, frustum_mask, intrinsic)
```
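To make the `predict()` entry point above concrete, a hedged end-to-end sketch under the Quick Start assumptions (the zero-filled frame is a placeholder for illustration only; a real RGB image at the model's `target_size` of (320, 240) is what you would actually pass):

```python
import numpy as np
from model import PanopticRecon3DModel
from preprocessing import DEFAULT_INTRINSIC

model = PanopticRecon3DModel.from_pretrained("nvidia/nvpanoptix-3d")

# predict() accepts an (H, W, C) uint8 array and handles the transpose,
# batching, frustum-mask generation, and device placement itself.
image = np.zeros((240, 320, 3), dtype=np.uint8)  # placeholder frame
outputs = model.predict(image, intrinsic=DEFAULT_INTRINSIC.copy())

# to_numpy() returns plain numpy arrays keyed by output name.
arrays = outputs.to_numpy()
print(sorted(arrays))  # 3D panoptic/semantic/geometry volumes plus 2D outputs
```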
nvpanoptix_3d/__init__.py
ADDED
@@ -0,0 +1,15 @@

```python
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Panoptic Recon 3D model module."""
```
nvpanoptix_3d/blocks.py
ADDED
|
@@ -0,0 +1,417 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Blocks for Panoptic Recon 3D."""

import torch.nn as nn
from torch import Tensor
from typing import Optional
import torch.nn.functional as F
import MinkowskiEngine as Me


class ProjectionBlock(nn.Module):
    """Projection block for depth projection."""

    def __init__(self, in_feature, out_feature):
        """Init"""
        super().__init__()
        self.conv_block1 = nn.Sequential(
            nn.Conv2d(in_feature, out_feature, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(out_feature),
            nn.ReLU(True)
        )
        self.conv_block2 = nn.Conv2d(
            out_feature, out_feature,
            kernel_size=1, stride=1,
            padding=0
        )

    def forward(self, x, target_size):
        """Forward"""
        x = self.conv_block1(x)
        x = F.interpolate(x, size=target_size, mode="bilinear", align_corners=False)
        x = self.conv_block2(x)
        return x


class ConvBlock(nn.Module):
    """Conv block for depth projection."""

    def __init__(self, in_feature, out_feature):
        """Init"""
        super().__init__()
        self.conv_block = nn.Sequential(
            nn.Conv2d(in_feature, out_feature, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(out_feature),
            nn.ReLU(True)
        )

    def forward(self, x):
        """Forward"""
        return self.conv_block(x)


class DepthProjector(nn.Module):
    """Depth projector module."""

    def __init__(
        self,
        in_channels: int = 256,
        out_channels: int = 256,
        num_proj_convs: int = 4,
        **kwargs
    ):
        """Init"""
        super(DepthProjector, self).__init__()
        self.proj_convs1 = nn.ModuleList([
            ConvBlock(in_channels, in_channels) for _ in range(num_proj_convs)
        ])
        self.proj_convs2 = nn.ModuleList([
            nn.Conv2d(
                in_channels, out_channels,
                kernel_size=1, stride=1,
                padding=0
            ) for _ in range(num_proj_convs)
        ])

    def forward(self, depth_features, depth_feature_shape, size_list):
        """Forward"""
        output_list = []
        size_list.append(depth_feature_shape)
        for i, (_, feat_shape) in enumerate(zip(
            self.proj_convs1,
            size_list[::-1]
        )):
            feat = depth_features[i]
            output = self.proj_convs1[i](feat)
            output = F.interpolate(output, feat_shape, mode="bilinear", align_corners=False)
            output = self.proj_convs2[i](output)
            output_list.append(output)

        return depth_features[-1], output_list[1:][::-1]


class SelfAttentionLayer(nn.Module):
    """Self Attention Layer."""

    def __init__(
        self, d_model, nhead, dropout=0.0,
        activation="relu", normalize_before=False, export=False
    ):
        """Init."""
        super().__init__()
        self.export = export
        if export:
            self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        else:
            self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)

        self.norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

        self._reset_parameters()

    def _reset_parameters(self):
        """Reset parameters."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """Add positional embedding."""
        return tensor if pos is None else tensor + pos

    def forward_post(
        self, tgt,
        tgt_mask: Optional[Tensor] = None,
        tgt_key_padding_mask: Optional[Tensor] = None,
        query_pos: Optional[Tensor] = None
    ):
        """Forward post norm."""
        q = k = self.with_pos_embed(tgt, query_pos)
        tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask,
                              key_padding_mask=tgt_key_padding_mask)[0]
        tgt = tgt + self.dropout(tgt2)
        tgt = self.norm(tgt)

        return tgt

    def forward_pre(
        self, tgt,
        tgt_mask: Optional[Tensor] = None,
        tgt_key_padding_mask: Optional[Tensor] = None,
        query_pos: Optional[Tensor] = None
    ):
        """Forward pre norm."""
        tgt2 = self.norm(tgt)
        q = k = self.with_pos_embed(tgt2, query_pos)
        tgt2 = self.self_attn(
            q, k, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask
        )[0]
        tgt = tgt + self.dropout(tgt2)

        return tgt

    def forward(
        self, tgt,
        tgt_mask: Optional[Tensor] = None,
        tgt_key_padding_mask: Optional[Tensor] = None,
        query_pos: Optional[Tensor] = None
    ):
        """Forward."""
        if self.normalize_before:
            return self.forward_pre(
                tgt, tgt_mask, tgt_key_padding_mask, query_pos
            )
        return self.forward_post(
            tgt, tgt_mask, tgt_key_padding_mask, query_pos
        )


class CrossAttentionLayer(nn.Module):
    """Cross attention layer."""

    def __init__(self, d_model, nhead, dropout=0.0,
                 activation="relu", normalize_before=False, export=False):
        """Init."""
        super().__init__()
        self.export = export
        if export:
            self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        else:
            self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)

        self.norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

        self._reset_parameters()

    def _reset_parameters(self):
        """Reset parameters."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """Add positional embedding."""
        return tensor if pos is None else tensor + pos

    def forward_post(
        self, tgt, memory,
        memory_mask: Optional[Tensor] = None,
        memory_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
        query_pos: Optional[Tensor] = None
    ):
        """Forward post norm."""
        tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos),
                                   key=self.with_pos_embed(memory, pos),
                                   value=memory, attn_mask=memory_mask,
                                   key_padding_mask=memory_key_padding_mask)[0]
        tgt = tgt + self.dropout(tgt2)
        tgt = self.norm(tgt)

        return tgt

    def forward_pre(
        self, tgt, memory,
        memory_mask: Optional[Tensor] = None,
        memory_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
        query_pos: Optional[Tensor] = None
    ):
        """Forward pre norm."""
        tgt2 = self.norm(tgt)
        tgt2 = self.multihead_attn(
            query=self.with_pos_embed(tgt2, query_pos),
            key=self.with_pos_embed(memory, pos),
            value=memory, attn_mask=memory_mask,
            key_padding_mask=memory_key_padding_mask
        )[0]
        tgt = tgt + self.dropout(tgt2)

        return tgt

    def forward(
        self, tgt, memory,
        memory_mask: Optional[Tensor] = None,
        memory_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
        query_pos: Optional[Tensor] = None
    ):
        """Forward pass."""
        if self.normalize_before:
            return self.forward_pre(tgt, memory, memory_mask,
                                    memory_key_padding_mask, pos, query_pos)
        return self.forward_post(tgt, memory, memory_mask,
                                 memory_key_padding_mask, pos, query_pos)


class FFNLayer(nn.Module):
    """Feedforward layer."""

    def __init__(
        self, d_model, dim_feedforward=2048, dropout=0.0, activation="relu", normalize_before=False
    ):
        """Init."""
        super().__init__()
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm = nn.LayerNorm(d_model)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

        self._reset_parameters()

    def _reset_parameters(self):
        """Reset parameters."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """Add positional embedding."""
        return tensor if pos is None else tensor + pos

    def forward_post(self, tgt):
        """Forward post norm."""
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout(tgt2)
        tgt = self.norm(tgt)
        return tgt

    def forward_pre(self, tgt):
        """Forward pre norm."""
        tgt2 = self.norm(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
        tgt = tgt + self.dropout(tgt2)
        return tgt

    def forward(self, tgt):
        """Forward."""
        if self.normalize_before:
            return self.forward_pre(tgt)
        return self.forward_post(tgt)

def _get_activation_fn(activation):
    """Return an activation function given a string"""
    if activation == "relu":
        return F.relu
    if activation == "gelu":
        return F.gelu
    if activation == "glu":
        return F.glu
    raise NotImplementedError(f"activation should be relu/gelu/glu, not {activation}.")


class MLP(nn.Module):
    """ Very simple multi-layer perceptron (also called FFN)"""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        """Init."""
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))

    def forward(self, x):
        """Forward pass."""
        for i, layer in enumerate(self.layers):
            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        return x


# 3D blocks
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1, sparse=False):
    """3x3 convolution with padding"""
    if sparse:
        return Me.MinkowskiConvolution(
            in_planes, out_planes, kernel_size=3,
            stride=stride, dilation=dilation,
            bias=False, dimension=3
        )
    else:
        return nn.Conv3d(
            in_planes, out_planes, kernel_size=3,
            stride=stride, padding=dilation,
            groups=groups, bias=False,
            dilation=dilation
        )


class BasicBlock3D(nn.Module):
    """Basic block for 3D."""

    def __init__(
        self, inplanes, planes, stride=1, downsample=None, groups=1,
        base_width=64, dilation=1, norm_layer=None, sparse=False
    ):
        """Init."""
        super().__init__()
        if norm_layer is None:
            norm_layer = nn.InstanceNorm3d if not sparse else Me.MinkowskiInstanceNorm
        if groups != 1 or base_width != 64:
            raise ValueError("BasicBlock only supports groups=1 and base_width=64")
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
        self.conv1 = conv3x3(inplanes, planes, stride, sparse=sparse)
        self.bn1 = norm_layer(planes)
        self.relu = nn.ReLU(inplace=True) if not sparse else Me.MinkowskiReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes, sparse=sparse)
        self.bn2 = norm_layer(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Forward."""
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out


class SparseBasicBlock3D(BasicBlock3D):
    """Sparse basic block for 3D."""

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        """Init."""
        super().__init__(inplanes, planes,
                         stride=stride, downsample=downsample, groups=groups,
                         base_width=base_width, dilation=dilation,
                         norm_layer=norm_layer, sparse=True)
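A quick way to sanity-check these building blocks in isolation is to push dummy tensors through them. Below is a minimal sketch, assuming the package is importable as `nvpanoptix_3d` after cloning; all dimensions are illustrative, not the released config values:

```python
# Minimal smoke test for the attention/FFN/MLP blocks defined above.
# Shapes are illustrative assumptions, not values from the model config.
import torch
from nvpanoptix_3d.blocks import SelfAttentionLayer, FFNLayer, MLP

queries = torch.randn(100, 2, 256)  # (num_queries, batch, d_model), as nn.MultiheadAttention expects

attn = SelfAttentionLayer(d_model=256, nhead=8)
ffn = FFNLayer(d_model=256, dim_feedforward=1024)
mlp = MLP(input_dim=256, hidden_dim=256, output_dim=21, num_layers=3)

x = attn(queries)   # post-norm self-attention; shape preserved
x = ffn(x)          # residual feed-forward block; shape preserved
logits = mlp(x)     # per-query logits: (100, 2, 21)
print(logits.shape)
```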
nvpanoptix_3d/model_3d.py
ADDED
@@ -0,0 +1,355 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""3D stage model for Panoptic Recon 3D."""

import torch
import MinkowskiEngine as Me
from torch.nn import functional as F
import torch.nn as nn

from .blocks import ProjectionBlock
from .utils.helper import retry_if_cuda_oom
# from .model_2d import MaskFormerModel
from .reconstruction import SparseProjection, FrustumDecoder
from .mp_occ.occupancy_aware_lifting import OccupancyAwareLifting
from .mp_occ.back_projection import BackProjection

from .utils.sparse_tensor import \
    to_dense, prepare_instance_masks_thicken
from .utils.coords_transform import \
    transform_feat3d_coordinates, fuse_sparse_tensors, generate_multiscale_feat3d


class Postprocessor(nn.Module):
    """2D model postprocessor."""

    def __init__(self, cfg):
        """Initialize the postprocessor."""
        super().__init__()
        self.cfg = cfg
        self.test_topk_per_image = cfg.model.test_topk_per_image
        self.num_classes = cfg.model.sem_seg_head.num_classes
        self.num_queries = cfg.model.mask_former.num_object_queries
        self.object_mask_threshold = cfg.model.object_mask_threshold
        self.overlap_threshold = cfg.model.overlap_threshold
        self.depth_scale = cfg.dataset.depth_scale
        self.num_thing_classes = cfg.dataset.num_thing_classes

    def panoptic_inference(self, mask_cls, mask_pred, depth_pred):
        """Post process for panoptic segmentation."""
        scores, labels = F.softmax(mask_cls, dim=-1).max(-1)
        mask_pred = mask_pred.sigmoid()

        keep = labels.ne(self.num_classes) & (scores > self.object_mask_threshold)
        cur_scores = scores[keep]
        cur_classes = labels[keep]
        cur_masks = mask_pred[keep]
        cur_mask_cls = mask_cls[keep]
        cur_mask_cls = cur_mask_cls[:, :-1]

        cur_prob_masks = cur_scores.view(-1, 1, 1) * cur_masks

        h, w = cur_masks.shape[-2:]
        panoptic_seg = torch.zeros((h, w), dtype=torch.int32, device=cur_masks.device)
        segments_info = []

        sem_prob_masks = torch.zeros((
            self.num_classes, h, w
        ), dtype=torch.float32, device=cur_masks.device)

        current_segment_id = 0

        if cur_masks.shape[0] == 0:
            return panoptic_seg, depth_pred[0, :, :], segments_info, sem_prob_masks
        else:
            cur_mask_ids = cur_prob_masks.argmax(0)
            stuff_memory_list = {}
            stuff_mask_ids = []
            for k in range(cur_classes.shape[0]):
                pred_class = cur_classes[k].item()
                isthing = pred_class in list(range(1, self.num_thing_classes + 1))
                mask_area = (cur_mask_ids == k).sum().item()
                original_area = (cur_masks[k] >= 0.5).sum().item()
                mask = (cur_mask_ids == k) & (cur_masks[k] >= 0.5)

                if mask_area > 0 and original_area > 0 and mask.sum().item() > 0:
                    if mask_area / original_area < self.overlap_threshold:
                        continue

                    # merge stuff regions
                    if not isthing:
                        stuff_mask_ids.append(k)
                        if int(pred_class) in stuff_memory_list.keys():
                            panoptic_seg[mask] = stuff_memory_list[int(pred_class)]
                            sem_prob_masks[int(pred_class)][mask] = cur_prob_masks[k][mask]
                            continue
                        else:
                            stuff_memory_list[int(pred_class)] = current_segment_id + 1

                    current_segment_id += 1
                    panoptic_seg[mask] = current_segment_id
                    sem_prob_masks[int(pred_class)][mask] = cur_prob_masks[k][mask]

                    segments_info.append(
                        {
                            "id": current_segment_id,
                            "isthing": bool(isthing),
                            "category_id": int(pred_class),
                        }
                    )

            if stuff_mask_ids:
                # recover void pixels
                stuff_mask_ids = torch.tensor(stuff_mask_ids, dtype=torch.long, device=cur_prob_masks.device)
                cur_stuff_ids = stuff_mask_ids[cur_prob_masks[stuff_mask_ids].argmax(0)]
                empty_pixel_mask = panoptic_seg == 0
                for k in stuff_mask_ids:
                    k = k.item()
                    pred_class = cur_classes[k].item()
                    mask = empty_pixel_mask & (cur_stuff_ids == k)
                    panoptic_seg[mask] = stuff_memory_list[int(pred_class)]
                    sem_prob_masks[int(pred_class)][mask] = cur_prob_masks[k][mask]

            # clamp depth_pred
            depth_pred = depth_pred[0, ...].clamp(min=0, max=self.depth_scale)
            return panoptic_seg, depth_pred, segments_info, sem_prob_masks

    @staticmethod
    def sem_seg_postprocess(result, img_size):
        """Return semantic segmentation predictions in the original resolution."""
        # Crop each image in the batch to the original img_size
        result = result[:, :img_size[0], :img_size[1]].expand(1, -1, -1, -1)
        # Interpolate to the desired output size
        result = F.interpolate(
            result,
            size=(img_size[0], img_size[1]),
            mode="bilinear",
            align_corners=False
        )
        return result[0]

    def forward(self, outputs, orig_shape, orig_pad_shape):
        """Forward pass."""
        mask_cls_results = outputs["pred_logits"]
        mask_pred_results = outputs["pred_masks"]
        depth_pred_results = outputs["pred_depths"]

        del outputs

        mask_pred_results = F.interpolate(
            mask_pred_results,
            size=(orig_pad_shape[-2], orig_pad_shape[-1]),
            mode="bilinear",
            align_corners=False,
        )

        depth_pred_results = F.interpolate(
            depth_pred_results,
            size=(orig_pad_shape[-2], orig_pad_shape[-1]),
            mode="bilinear",
            align_corners=False,
        )

        if self.cfg.model.mode == "panoptic":
            processed_results = []
            for _, (mask_cls_result, mask_pred_result, depth_pred_result) in enumerate(zip(
                mask_cls_results, mask_pred_results, depth_pred_results
            )):
                processed_results.append({})

                mask_pred_result = retry_if_cuda_oom(self.sem_seg_postprocess)(
                    mask_pred_result, orig_shape
                )
                mask_cls_result = mask_cls_result.to(mask_pred_result)

                depth_pred_result = retry_if_cuda_oom(self.sem_seg_postprocess)(
                    depth_pred_result, orig_shape
                )

                panoptic_seg, depth_r, segments_info, sem_prob_mask = retry_if_cuda_oom(
                    self.panoptic_inference
                )(mask_cls_result, mask_pred_result, depth_pred_result)

                processed_results[-1]["panoptic_seg"] = (panoptic_seg, segments_info)
                processed_results[-1]["depth"] = depth_r
                processed_results[-1]["sem_seg"] = sem_prob_mask

            return processed_results

        else:
            raise ValueError("Only panoptic mode is supported for 2D model.")


class Panoptic3DModel(nn.Module):
    """3D model."""

    def __init__(self, cfg):
        """Initialize 3D model."""
        super().__init__()
        self.cfg = cfg

        # disable gradients for the 2D model
        # for _, param in self.named_parameters():
        #     param.requires_grad_(False)

        # 3D modules
        self.reprojection = SparseProjection(self.cfg)
        self.completion = FrustumDecoder(self.cfg)
        self.projector = ProjectionBlock(
            self.cfg.model.projection.depth_feature_dim,
            self.cfg.model.projection.depth_feature_dim
        )
        self.ol = OccupancyAwareLifting(self.cfg)
        self.post_processor = Postprocessor(self.cfg)
        self.back_projection = BackProjection(self.cfg)

        # 3D model parameters
        self.downsample_factor = cfg.dataset.downsample_factor
        self.frustum_dims = [cfg.model.frustum3d.frustum_dims] * 3
        self.iso_recon_value = cfg.model.frustum3d.iso_recon_value
        self.truncation = cfg.model.frustum3d.truncation
        self.num_classes = cfg.model.sem_seg_head.num_classes
        self.object_mask_threshold = cfg.model.object_mask_threshold
        self.overlap_threshold = cfg.model.overlap_threshold

    def forward(self, x):
        """Forward pass."""
        pass

    def panoptic_3d_inference(
        self, geometry, mask_cls, sparse_mask_tuple, min_coordinates, dense_dimensions
    ):
        """Panoptic 3D inference."""
        panoptic_seg = torch.zeros(geometry.shape, dtype=torch.int32, device=mask_cls.device)
        semantic_seg = torch.zeros_like(panoptic_seg)
        panoptic_semantic_mapping = {}

        scores, labels = F.softmax(mask_cls, dim=-1).max(-1)
        keep = labels.ne(self.num_classes) & \
            labels.ne(0) & \
            (scores > self.object_mask_threshold)

        coords, sparse_masks, stride = sparse_mask_tuple
        cur_scores = scores[keep]
        cur_classes = labels[keep]
        cur_masks = Me.MinkowskiSigmoid()(
            Me.SparseTensor(
                features=sparse_masks[:, keep],
                coordinates=coords,
                tensor_stride=stride
            )
        ).dense(dense_dimensions, min_coordinates)[0].squeeze(0)
        cur_mask_cls = mask_cls[keep]
        cur_mask_cls = cur_mask_cls[:, :-1]

        cur_prob_masks = cur_scores.view(-1, 1, 1, 1) * cur_masks

        current_segment_id = 0
        if cur_masks.shape[0] > 0:
            cur_mask_ids = cur_prob_masks.argmax(0)
            stuff_memory_list = {}
            query_to_segment_id = {}
            for k in range(cur_classes.shape[0]):
                pred_class = cur_classes[k].item()
                isthing = pred_class in list(range(1, self.post_processor.num_thing_classes + 1))
                mask = (cur_mask_ids == k) & (cur_masks[k] >= 0.5)

                if mask.sum().item() > 0:
                    if not isthing:
                        if int(pred_class) in stuff_memory_list.keys():
                            panoptic_seg[mask] = stuff_memory_list[int(pred_class)]
                            query_to_segment_id[k] = stuff_memory_list[int(pred_class)]
                            continue
                        else:
                            stuff_memory_list[int(pred_class)] = current_segment_id + 1

                    current_segment_id += 1
                    panoptic_seg[mask] = current_segment_id
                    query_to_segment_id[k] = current_segment_id
                    panoptic_semantic_mapping[current_segment_id] = int(pred_class)

            surface_mask = geometry.abs() <= 1.5

            # fill unassigned surface voxels
            unassigned_mask = surface_mask & (panoptic_seg == 0)
            for k in range(cur_classes.shape[0]):
                mask = (cur_mask_ids == k) & unassigned_mask
                if mask.sum().item() > 0 and k in query_to_segment_id.keys():
                    panoptic_seg[mask] = query_to_segment_id[k]

            for segm_id, semantic_label in panoptic_semantic_mapping.items():
                instance_mask = panoptic_seg == segm_id
                semantic_seg[instance_mask] = semantic_label

        return panoptic_seg, panoptic_semantic_mapping, semantic_seg

    def postprocess(self, outputs_3d, outputs_2d, processed_results, frustum_mask):
        """Postprocess 3D results."""
        dense_dimensions = torch.Size([1, 1] + self.frustum_dims)
        min_coordinates = torch.IntTensor([0, 0, 0])

        geometry_results = to_dense(
            outputs_3d["pred_geometry"],
            dense_dimensions,
            min_coordinates,
            default_value=self.truncation
        )[0]
        mask_3d_results = outputs_3d["pred_segms"][-1]
        mask_cls_results = outputs_2d["pred_logits"]

        processed_results_3d = {
            "intrinsic": [],
            "image_size": [],
            "depth": [],
            "panoptic_seg_2d": [],
            "geometry": [],
            "panoptic_seg": [],
            "semantic_seg": [],
            "panoptic_semantic_mapping": [],
            "instance_info_pred": []
        }

        for idx, (geometry_result, mask_cls_result) in enumerate(zip(
            geometry_results,
            mask_cls_results
        )):
            coords, mask_3d = mask_3d_results.coordinates_at(idx), mask_3d_results.features_at(idx)
            coords, mask_3d = Me.utils.sparse_collate([coords], [mask_3d])
            geometry_result = geometry_result.squeeze(0)
            panoptic_seg, panoptic_semantic_mapping, semantic_seg = self.panoptic_3d_inference(
                geometry_result,
                mask_cls_result,
                (coords, mask_3d, mask_3d_results.tensor_stride),
                min_coordinates,
                dense_dimensions,
            )

            processed_results_3d["intrinsic"].append(processed_results[idx]["intrinsic"])
            processed_results_3d["image_size"].append(processed_results[idx]["image_size"])
            processed_results_3d["depth"].append(processed_results[idx]["depth"])
            processed_results_3d["panoptic_seg_2d"].append(processed_results[idx]["panoptic_seg"])
            processed_results_3d["geometry"].append(geometry_result)
            processed_results_3d["panoptic_seg"].append(panoptic_seg)
            processed_results_3d["semantic_seg"].append(semantic_seg)
            processed_results_3d["panoptic_semantic_mapping"].append(panoptic_semantic_mapping)
            processed_results_3d["instance_info_pred"].append(prepare_instance_masks_thicken(
                panoptic_seg,
                panoptic_semantic_mapping,
                geometry_result,
                frustum_mask[idx],
                iso_value=self.iso_recon_value,
                downsample_factor=self.downsample_factor
            ))

        return processed_results_3d
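The core of `panoptic_inference` above is a confidence-weighted argmax: each pixel goes to the query whose score-scaled mask probability wins, and it is only kept where the raw mask is itself above 0.5. A self-contained sketch of just that step (random tensors; the 0.1 score threshold is an illustrative stand-in for `object_mask_threshold`):

```python
# Standalone illustration of the mask-assignment step in Postprocessor.panoptic_inference.
# All tensors and thresholds here are dummy values for demonstration.
import torch
import torch.nn.functional as F

num_queries, num_classes, h, w = 10, 21, 64, 64
mask_cls = torch.randn(num_queries, num_classes + 1)  # last column = "no object"
mask_pred = torch.randn(num_queries, h, w).sigmoid()

scores, labels = F.softmax(mask_cls, dim=-1).max(-1)
keep = labels.ne(num_classes) & (scores > 0.1)        # drop "no object" and low-score queries

cur_scores, cur_masks = scores[keep], mask_pred[keep]
if cur_masks.shape[0] > 0:
    cur_prob_masks = cur_scores.view(-1, 1, 1) * cur_masks  # confidence-weighted masks
    cur_mask_ids = cur_prob_masks.argmax(0)                 # winning query per pixel
    # a pixel is assigned to query k only where k wins AND its raw mask is confident
    mask = (cur_mask_ids == 0) & (cur_masks[0] >= 0.5)
    print("pixels assigned to query 0:", mask.sum().item())
```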
nvpanoptix_3d/mp_occ/__init__.py
ADDED
@@ -0,0 +1,15 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Multi-plane Occupancy module."""
nvpanoptix_3d/mp_occ/back_projection.py
ADDED
@@ -0,0 +1,114 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Back projection module."""

import torch
from torch import nn
from ..utils.frustum import (
    generate_frustum,
    generate_frustum_volume,
    compute_camera2frustum_transform
)


class BackProjection(nn.Module):
    """Back projection module."""

    def __init__(self, cfg):
        """Initialize the back projection module."""
        super().__init__()
        self.image_size = cfg.dataset.depth_size
        self.depth_min = cfg.dataset.depth_min
        self.depth_max = cfg.dataset.depth_max
        self.voxel_size = cfg.model.projection.voxel_size
        self.frustum_dimensions = torch.tensor([cfg.model.frustum3d.frustum_dims] * 3)

    def forward(
        self, shp, intrinsics, frustum_masks=None, room_masks=None
    ):
        """Forward pass."""
        device = intrinsics.device
        if frustum_masks is None:
            frustum_masks = torch.ones(
                [len(intrinsics), *self.frustum_dimensions],
                dtype=torch.bool, device=device
            )
        len_shp = len(frustum_masks.shape)
        if len_shp == 3:
            frustum_masks = frustum_masks[None]
            intrinsics = intrinsics[None]

        kepts, mappings = [], []
        for bi, (intrinsic, frustum_mask) in enumerate(zip(intrinsics, frustum_masks)):
            camera2frustum = compute_camera2frustum_transform(
                intrinsic.cpu(), self.image_size, self.depth_min,
                self.depth_max, self.voxel_size
            ).to(device)
            intrinsic_inverse = torch.inverse(intrinsic)
            coordinates = torch.nonzero(frustum_mask)
            grid_coordinates = coordinates.clone()
            grid_coordinates[:, :2] = 256 - grid_coordinates[:, :2]

            padding_offsets = self.compute_frustum_padding(intrinsic_inverse)
            grid_coordinates = grid_coordinates - padding_offsets - torch.tensor([1., 1., 1.], device=device)
            grid_coordinates = torch.cat([
                grid_coordinates, torch.ones(len(grid_coordinates), 1, device=device)], 1
            )
            pointcloud = torch.mm(torch.inverse(camera2frustum), grid_coordinates.t())
            depth_pixels = torch.mm(intrinsic, pointcloud)

            depth = depth_pixels[2]
            coord = depth_pixels[0:2] / depth
            coord = torch.cat([coord, coordinates[:, 2][None]], 0).permute(1, 0)

            kept = (depth <= self.depth_max) * \
                (depth >= self.depth_min) * \
                (coord[:, 0] < shp[1]) * (coord[:, 1] < shp[0])
            coordinates = coordinates[kept]
            depth = depth[kept, None]

            mapping = torch.zeros(256, 256, 256, 5, device=depth.device) - 1.
            mapping[coordinates[:, 0], coordinates[:, 1], coordinates[:, 2]] = \
                torch.cat([torch.ones_like(depth) * bi, coord[kept], depth], -1)

            kept = (mapping >= 0).all(-1)

            if room_masks is not None:
                mapping_kept = mapping[kept].long()
                kept[kept.clone()] = room_masks[bi, 0, mapping_kept[:, 2], mapping_kept[:, 1]]

            kepts.append(kept)
            mappings.append(mapping)

        if len_shp == 3:
            kepts = kepts[0]
            mappings = mappings[0][..., 1:]
        else:
            kepts = torch.stack(kepts, 0)
            mappings = torch.stack(mappings, 0)

        return kepts, mappings

    def compute_frustum_padding(self, intrinsic_inverse: torch.Tensor) -> torch.Tensor:
        """Compute frustum padding."""
        frustum = generate_frustum(
            self.image_size, intrinsic_inverse.cpu(), self.depth_min, self.depth_max
        )
        dimensions, _ = generate_frustum_volume(frustum, self.voxel_size)
        difference = (
            self.frustum_dimensions - torch.tensor(dimensions)
        ).float().to(intrinsic_inverse.device)
        padding_offsets = difference / 2
        return padding_offsets
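`BackProjection.forward` above maps frustum voxels back into the image with the standard pinhole relation (project, read off depth, perspective-divide). The round trip for a single camera-space point, with a made-up square-pixel intrinsic matrix, looks like this:

```python
# Pinhole projection round trip, as used inside BackProjection.forward.
# The intrinsic matrix is an illustrative assumption, not the dataset calibration.
import torch

intrinsic = torch.tensor([[577.0,   0.0, 320.0],
                          [  0.0, 577.0, 240.0],
                          [  0.0,   0.0,   1.0]])

point_cam = torch.tensor([0.4, -0.2, 2.5])  # (x, y, z) in camera space, metres

pixel_h = intrinsic @ point_cam             # homogeneous pixel coordinates
depth = pixel_h[2]                          # z passes through as the depth value
uv = pixel_h[:2] / depth                    # perspective divide -> (u, v)

# unproject with the inverse intrinsic to recover the original point
point_back = torch.inverse(intrinsic) @ torch.cat([uv, torch.ones(1)]) * depth
print(uv, depth, point_back)                # point_back matches point_cam
```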
nvpanoptix_3d/mp_occ/multiplane_occupancy.py
ADDED
@@ -0,0 +1,175 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Multi-plane occupancy head."""

import torch
from torch import nn
from torch.nn import functional as F


class _UpProjection(nn.Module):
    """Up-projection block."""

    def __init__(self, num_input_features, num_output_features):
        """Initialize the up-projection block."""
        super().__init__()
        self.conv1 = nn.Conv2d(
            num_input_features, num_output_features, kernel_size=5, stride=1, padding=2, bias=False
        )
        self.bn1 = nn.BatchNorm2d(num_output_features)
        self.relu = nn.ReLU(inplace=True)
        self.conv1_2 = nn.Conv2d(
            num_output_features, num_output_features, kernel_size=3, stride=1, padding=1, bias=False
        )
        self.bn1_2 = nn.BatchNorm2d(num_output_features)
        self.conv2 = nn.Conv2d(
            num_input_features, num_output_features, kernel_size=5, stride=1, padding=2, bias=False
        )
        self.bn2 = nn.BatchNorm2d(num_output_features)

    def forward(self, x, size):
        """Forward pass."""
        x = F.interpolate(x, size=size, mode="bilinear", align_corners=True)
        x_conv1 = self.relu(self.bn1(self.conv1(x)))
        bran1 = self.bn1_2(self.conv1_2(x_conv1))
        bran2 = self.bn2(self.conv2(x))
        out = self.relu(bran1 + bran2)
        return out


class D(nn.Module):
    """Decoder module."""

    def __init__(self, block_channel):
        """Initialize the decoder module."""
        super().__init__()
        self.conv = nn.Conv2d(
            block_channel[0], block_channel[1], kernel_size=1, stride=1, bias=False
        )
        self.bn = nn.BatchNorm2d(block_channel[1])

        self.up1 = _UpProjection(num_input_features=block_channel[1],
                                 num_output_features=block_channel[2])

        self.up2 = _UpProjection(num_input_features=block_channel[2],
                                 num_output_features=block_channel[3])

        add_feat_channel = block_channel[3]
        self.up3 = _UpProjection(num_input_features=add_feat_channel,
                                 num_output_features=add_feat_channel // 2)

        add_feat_channel = add_feat_channel // 2
        self.up4 = _UpProjection(num_input_features=add_feat_channel,
                                 num_output_features=add_feat_channel // 2)

    def forward(self, x_block1, x_block2, x_block3, x_block4):
        """Forward pass."""
        x_d0 = F.relu(self.bn(self.conv(x_block4)))
        x_d1 = self.up1(x_d0, [x_block3.size(2), x_block3.size(3)])
        x_d2 = self.up2(x_d1, [x_block2.size(2), x_block2.size(3)])
        x_d3 = self.up3(x_d2, [x_block1.size(2), x_block1.size(3)])
        x_d4 = self.up4(x_d3, [x_block1.size(2) * 2, x_block1.size(3) * 2])
        return x_d4


class MFF(nn.Module):
    """Multi-feature fusion module."""

    def __init__(self, block_channel, num_features=64):
        """Initialize the multi-feature fusion module."""
        super().__init__()
        self.up1 = _UpProjection(num_input_features=block_channel[3], num_output_features=16)
        self.up2 = _UpProjection(num_input_features=block_channel[2], num_output_features=16)
        self.up3 = _UpProjection(num_input_features=block_channel[1], num_output_features=16)
        self.up4 = _UpProjection(num_input_features=block_channel[0], num_output_features=16)

        self.conv = nn.Conv2d(
            num_features, num_features, kernel_size=5, stride=1, padding=2, bias=False
        )
        self.bn = nn.BatchNorm2d(num_features)

    def forward(self, x_block1, x_block2, x_block3, x_block4, size):
        """Forward pass."""
        x_m1 = self.up1(x_block1, size)
        x_m2 = self.up2(x_block2, size)
        x_m3 = self.up3(x_block3, size)
        x_m4 = self.up4(x_block4, size)

        x = self.bn(self.conv(torch.cat((x_m1, x_m2, x_m3, x_m4), 1)))
        x = F.relu(x)
        return x


class R(nn.Module):
    """Occupancy head module."""

    def __init__(self, channel, num_class=1):
        """Initialize the occupancy head module."""
        super().__init__()

        self.target_size = (120, 160)
        self.resize = _UpProjection(num_input_features=channel, num_output_features=channel)

        self.conv0 = nn.Conv2d(channel, channel, kernel_size=5, stride=1, padding=2, bias=False)
        self.bn0 = nn.BatchNorm2d(channel)

        self.conv1 = nn.Conv2d(channel, channel, kernel_size=5, stride=1, padding=2, bias=False)
        self.bn1 = nn.BatchNorm2d(channel)

        self.conv2 = nn.Conv2d(channel, num_class, kernel_size=5, stride=1, padding=2, bias=True)

    def forward(self, x):
        """Forward pass."""
        x0 = self.resize(x, self.target_size)  # resize to 120*160
        x0 = self.conv0(x0)
        x0 = self.bn0(x0)
        x0 = F.relu(x0)

        x1 = self.conv1(x0)
        x1 = self.bn1(x1)
        x1 = F.relu(x1)

        x2 = self.conv2(x1)
        return x2


class MultiPlaneOccupancyHead(nn.Module):
    """Multi-plane occupancy head."""

    def __init__(self):
        """Initialize the multi-plane occupancy head."""
        super().__init__()
        block_channel = [2048, 1024, 512, 256]
        self.feature_key = ['res2', 'res3', 'res4', 'res5']
        feature_channels = 64

        self.D = D(block_channel)
        self.MFF = MFF(block_channel, feature_channels)
        head_channels = block_channel[-1] // 4 + feature_channels
        self.num_classes = 100
        self.prediction = R(head_channels, self.num_classes)

    def forward(self, x):
        """Forward pass."""
        x_block1, x_block2, x_block3, x_block4 = x[self.feature_key[0]], x[self.feature_key[1]], \
            x[self.feature_key[2]], x[self.feature_key[3]]
        x_decoder = self.D(x_block1, x_block2, x_block3, x_block4)
        x_mff = self.MFF(
            x_block1, x_block2, x_block3, x_block4, [x_decoder.size(2), x_decoder.size(3)]
        )

        x_feat = torch.cat((x_decoder, x_mff), 1)
        occ_pred = self.prediction(x_feat)
        return occ_pred
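`MultiPlaneOccupancyHead` hardcodes `block_channel = [2048, 1024, 512, 256]` and a `(120, 160)` target size, which fits ResNet-50-style features from a 480x640 input. A minimal smoke test under that assumption:

```python
# Smoke test for MultiPlaneOccupancyHead with ResNet-50-style feature maps.
# The 480x640 input resolution (hence the strides below) is an assumption
# chosen to match the hardcoded (120, 160) target size in R.
import torch
from nvpanoptix_3d.mp_occ.multiplane_occupancy import MultiPlaneOccupancyHead

features = {
    "res2": torch.randn(1, 256, 120, 160),  # stride 4
    "res3": torch.randn(1, 512, 60, 80),    # stride 8
    "res4": torch.randn(1, 1024, 30, 40),   # stride 16
    "res5": torch.randn(1, 2048, 15, 20),   # stride 32
}

head = MultiPlaneOccupancyHead().eval()
with torch.no_grad():
    occ = head(features)
print(occ.shape)  # torch.Size([1, 100, 120, 160]) -- one logit plane per depth bin
```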
nvpanoptix_3d/mp_occ/occupancy_aware_lifting.py
ADDED
@@ -0,0 +1,202 @@
| 1 |
+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Occupancy aware lifting module for Panoptic Recon 3D."""
|
| 16 |
+
|
| 17 |
+
import torch
|
| 18 |
+
from torch import nn
|
| 19 |
+
import torch.nn.functional as F
|
| 20 |
+
import MinkowskiEngine as Me
|
| 21 |
+
|
| 22 |
+
from .back_projection import BackProjection
|
| 23 |
+
from ..utils.sparse_tensor import mask_invalid_sparse_voxels
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class OccupancyAwareLifting(nn.Module):
|
| 27 |
+
"""Occupancy aware lifting module for Panoptic Recon 3D."""
|
| 28 |
+
|
| 29 |
+
def __init__(self, cfg):
|
| 30 |
+
"""Initialize the occupancy aware lifting module."""
|
| 31 |
+
super(OccupancyAwareLifting, self).__init__()
|
| 32 |
+
self.bp = BackProjection(cfg)
|
| 33 |
+
|
| 34 |
+
def forward(self, pred, kept, mapping, occupancy2d, room_mask=None):
|
| 35 |
+
"""Forward pass."""
|
| 36 |
+
# get the depth, semantic, and occupancy
|
| 37 |
+
depth = torch.stack([p['depth'][None] for p in pred])
|
| 38 |
+
features = torch.stack([p['sem_seg'] for p in pred])
|
| 39 |
+
depth_weight = occupancy2d.to(depth.device)
|
| 40 |
+
kept = kept.to(depth.device)
|
| 41 |
+
mapping = mapping.to(depth.device)
|
| 42 |
+
|
| 43 |
+
semantic = features.argmax(1)
|
| 44 |
+
depth_max_value = self.bp.depth_max
|
| 45 |
+
batch = semantic.shape[0]
|
| 46 |
+
|
| 47 |
+
# clip depth in range [0, depth_max_value]
|
| 48 |
+
depth[depth > depth_max_value] = depth_max_value
|
| 49 |
+
|
| 50 |
+
# get the bin index of depth 0- 100
|
| 51 |
+
depth_feat = (depth / depth_max_value * 100.)
|
| 52 |
+
|
| 53 |
+
depth_index = depth_feat.long()
|
| 54 |
+
depth_weight_kept = torch.ones_like(
|
| 55 |
+
depth_weight, dtype=torch.long
|
| 56 |
+
) * torch.arange(0, 100, device=depth.device, dtype=torch.long)[None, :, None, None]
|
| 57 |
+
|
| 58 |
+
# stuff: wall, floor, or ceiling, erode the stuff class
|
| 59 |
+
stuff = (-F.max_pool2d(-(semantic >= 10).float(), 5, 1, 2)).bool()
|
| 60 |
+
# get the depth of the stuff class
|
| 61 |
+
stuff_depth = depth[:, 0] * stuff
|
| 62 |
+
|
| 63 |
+
# get the max depth of the stuff class in x direction: (batch_size, h)
|
| 64 |
+
stuff_x_max = stuff_depth.max(1)[0]
|
| 65 |
+
# get the max depth of the stuff class in y direction: (batch_size, w)
|
| 66 |
+
stuff_y_max = stuff_depth.max(2)[0]
|
| 67 |
+
|
| 68 |
+
stuff_depth_l = stuff_depth[:, 0].clone()
|
| 69 |
+
stuff_depth_r = stuff_depth[:, -1].clone()
|
| 70 |
+
stuff_depth_t = stuff_depth[:, :, 0].clone()
|
| 71 |
+
stuff_depth_d = stuff_depth[:, :, -1].clone()
|
| 72 |
+
|
| 73 |
+
for bi in range(batch):
|
| 74 |
+
stuff_depth[bi, 0] = stuff_padding(stuff_depth_l[bi], stuff_y_max[bi])
|
| 75 |
+
stuff_depth[bi, -1] = stuff_padding(stuff_depth_r[bi], stuff_y_max[bi].flip(0))
|
| 76 |
+
stuff_depth[bi, :, 0] = stuff_padding(stuff_depth_t[bi], stuff_x_max[bi])
|
| 77 |
+
stuff_depth[bi, :, -1] = stuff_padding(stuff_depth_d[bi], stuff_x_max[bi].flip(0))
|
| 78 |
+
|
| 79 |
+
stuff_x = stuff_depth.max(1)[0]
|
| 80 |
+
stuff_y = stuff_depth.max(2)[0]
|
| 81 |
+
|
| 82 |
+
for bi in range(batch):
|
| 83 |
+
stuff_x[bi] = find_none(stuff_x[bi])
|
| 84 |
+
stuff_y[bi] = find_none(stuff_y[bi])
|
| 85 |
+
|
| 86 |
+
# create depth limit and determine:
|
| 87 |
+
# "What's the farthest depth where we can
|
| 88 |
+
# reasonably place a 3D object before hitting a wall or boundary?"
|
| 89 |
+
depth_pixels_xy = torch.ones_like(depth).nonzero()
|
| 90 |
+
    depth_max = torch.cat(
        [
            stuff_x[depth_pixels_xy[:, 0], depth_pixels_xy[:, 3]][..., None],
            stuff_y[depth_pixels_xy[:, 0], depth_pixels_xy[:, 2]][..., None]
        ],
        dim=-1
    ).min(-1)[0].reshape(*depth.shape)

    depth_max = (depth_max / depth_max_value * 100.).long()  # get the min bin index of the stuff class
    depth_feat = (depth_weight_kept - depth_index) / 100. * depth_max_value

    # get the sign and the distance of each voxel to the surface
    depth_feat = torch.cat([depth_feat.sign()[:, None], depth_feat[:, None].abs()], 1)

    # keep voxels from 3 bins before the surface to 5 bins after the stuff-class max depth
    depth_weight_kept = (depth_weight_kept > (depth_index - 3)) * (
        depth_weight_kept < (depth_max + 5))

    depth_weight = depth_weight.sigmoid() * depth_weight_kept

    feat_kept = kept.clone()

    if room_mask is not None:
        room_mask = room_mask.unsqueeze(1)
        depth_weight_kept = depth_weight_kept * room_mask

    mapping_kept = mapping[kept]
    mapping_kept[:, -1] = mapping_kept[:, -1] * 100 / 6
    mapping_kept = mapping_kept.long().to(depth.device)

    # only keep voxels between 3 bins before the surface
    # and 5 bins after the stuff-class max depth, and inside the frustum:
    feat_kept[kept] = depth_weight_kept[
        mapping_kept[:, 0], mapping_kept[:, -1], mapping_kept[:, 2], mapping_kept[:, 1]]

    features = torch.cat(
        [
            features[:, :, None].repeat(1, 1, 100, 1, 1),
            depth_weight[:, None], depth_feat
        ],
        dim=1
    )

    coord_sparse = feat_kept.nonzero()
    mapping_feat_kept = mapping[feat_kept]

    # convert to bin index:
    mapping_feat_kept[:, -1] = mapping_feat_kept[:, -1] * 100 / depth_max_value
    mapping_feat_kept = mapping_feat_kept.long()
    feat_sparse = features[
        mapping_feat_kept[:, 0], :, mapping_feat_kept[:, -1],
        mapping_feat_kept[:, 2], mapping_feat_kept[:, 1]
    ]

    padding_kept = F.max_pool3d(feat_kept.float(), 5, 1, 2).bool()
    padding_kept[~kept] = False

    batch_point = padding_kept.flatten(1, -1).sum(-1)
    batch_zero = (batch_point == 0).nonzero().view(-1)

    # fall back to a single center voxel for batch entries with no points
    if len(batch_zero) > 0:
        padding_kept[batch_zero, 127, 127, 127] = True
    padding_kept[feat_kept] = False
    coord_padding = padding_kept.nonzero().contiguous().float()

    coord_padding[:, 1:] = coord_padding[:, 1:] // 2 * 2
    feat_padding = torch.zeros(
        (
            len(coord_padding), features.shape[1]
        ),
        device=features.device, dtype=torch.float)

    feat_sparse = torch.cat([feat_sparse, feat_padding])
    coord_sparse = torch.cat([coord_sparse, coord_padding])

    proj_feat = Me.SparseTensor(
        features=feat_sparse,
        coordinates=coord_sparse.contiguous().int(),
        tensor_stride=1,
        quantization_mode=Me.SparseTensorQuantizationMode.RANDOM_SUBSAMPLE)

    proj_feat = mask_invalid_sparse_voxels(proj_feat)
    return proj_feat, None


def stuff_padding(padding, max_value):
    """Replace zero entries of ``padding`` with the first nonzero value in ``max_value``."""
    padding = padding.clone()
    padding_mask = padding == 0
    v = None
    if padding_mask.sum() > 0:
        for val in max_value:
            if val != 0:
                v = val
                break
        if v is not None:
            padding[padding_mask] = v
    return padding


def find_none(stuff_a, min_value=0):
    """Fill each zero entry with the larger of the nearest nonzero minima on either side."""
    none_v = torch.nonzero(stuff_a == 0)
    for v in none_v:
        l_stuff = stuff_a[:v]
        l_stuff = l_stuff[l_stuff != 0]
        l_stuff = min(l_stuff) if len(l_stuff) else min_value
        r_stuff = stuff_a[v + 1:]
        r_stuff = r_stuff[r_stuff != 0]
        r_stuff = min(r_stuff) if len(r_stuff) else min_value
        stuff_a[v] = max(l_stuff, r_stuff)
    return stuff_a

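For orientation, `stuff_padding` fills every zero entry with the first nonzero value found in `max_value`, while `find_none` walks the zero entries and replaces each with the larger of the nearest nonzero minima to its left and right. A minimal sketch of their behavior on made-up 1-D tensors (values chosen purely for illustration):

```python
import torch

stuff = torch.tensor([0, 7, 0, 4, 0])  # toy depth-bin vector with gaps

# every zero becomes 5, the first nonzero entry of max_value
print(stuff_padding(stuff, max_value=torch.tensor([0, 5, 9])))
# tensor([5, 7, 5, 4, 5])

# zeros are filled in place from the neighboring nonzero minima
print(find_none(stuff.clone()))
# tensor([4, 7, 4, 4, 4])
```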
nvpanoptix_3d/reconstruction/__init__.py ADDED
@@ -0,0 +1,20 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Panoptic Recon 3D reconstruction module."""

from .reprojection import SparseProjection
from .decoder import FrustumDecoder

__all__ = ["SparseProjection", "FrustumDecoder"]

nvpanoptix_3d/reconstruction/decoder.py ADDED
@@ -0,0 +1,385 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Decoder for Panoptic Recon 3D."""

from typing import Optional, List
import torch
from torch import nn
import MinkowskiEngine as Me
from ..utils.sparse_tensor import sparse_cat_union
from ..blocks import BasicBlock3D, SparseBasicBlock3D


class SparseToDense(nn.Module):
    """Sparse to dense module."""

    def __init__(self, input_size):
        """Initialize the sparse to dense module."""
        super().__init__()
        assert len(input_size) == 3
        self.input_size = input_size

    def forward(self, feature: Me.SparseTensor) -> torch.Tensor:
        """Forward pass."""
        batch_size = len(feature.decomposed_coordinates_and_features[0])
        feat_dim = feature.F.shape[-1]  # channel count comes from the feature matrix

        out_size = (
            torch.div(
                torch.tensor(self.input_size),
                torch.tensor(feature.tensor_stride),
                rounding_mode="floor"
            )
        ).tolist()
        shape = torch.Size([batch_size, feat_dim, *out_size])
        min_coordinate = torch.IntTensor([0, 0, 0])

        mask = (feature.C[:, 1] < self.input_size[0]) & \
            (feature.C[:, 2] < self.input_size[1]) & \
            (feature.C[:, 3] < self.input_size[2])
        mask = mask & (feature.C[:, 1] >= 0) & (feature.C[:, 2] >= 0) & (feature.C[:, 3] >= 0)

        feature = Me.MinkowskiPruning()(feature, mask)
        dense = feature.dense(shape, min_coordinate=min_coordinate)[0]

        return dense


class FrustumDecoder(nn.Module):
    """Frustum decoder module."""

    def __init__(self, cfg) -> None:
        """Initialize the frustum decoder module."""
        super().__init__()

        num_output_features = cfg.model.frustum3d.unet_output_channels
        num_features = cfg.model.frustum3d.unet_features
        sign_channel = cfg.model.projection.sign_channel
        mask_dim = cfg.model.sem_seg_head.mask_dim
        depth_dim = cfg.model.sem_seg_head.depth_dim
        num_classes = cfg.model.sem_seg_head.num_classes
        frustum_dims = cfg.model.frustum3d.grid_dimensions
        frustum_dims = [frustum_dims] * 3

        self.use_ms_features = cfg.model.frustum3d.use_multi_scale
        self.truncation = cfg.model.frustum3d.truncation

        if cfg.dataset.name == 'matterport':
            ms_feature_channels = cfg.model.sem_seg_head.convs_dim
        else:
            ms_feature_channels = cfg.model.sem_seg_head.convs_dim + \
                cfg.model.sem_seg_head.num_classes + cfg.model.frustum3d.signed_channel

        # input encoding
        self.input_dims = [2 if sign_channel else 1, mask_dim + depth_dim, num_classes]
        self.input_encoders = nn.ModuleList()
        for input_dim in self.input_dims:
            downsample = nn.Sequential(
                Me.MinkowskiConvolution(
                    input_dim, num_features,
                    kernel_size=1, stride=1,
                    bias=True, dimension=3
                ),
                Me.MinkowskiInstanceNorm(num_features),
            )
            self.input_encoders.append(
                SparseBasicBlock3D(
                    input_dim, num_features,
                    downsample=downsample
                )
            )

        self.level_encoders = nn.ModuleList([
            self.make_encoder(len(self.input_encoders) * num_features, num_features),
            self.make_encoder(num_features, num_features * 2),
            self.make_encoder(num_features * 2, num_features * 4, is_sparse=False),
            self.make_encoder(num_features * 4, num_features * 8, is_sparse=False),
            self.make_encoder(num_features * 8, num_features * 8, is_sparse=False),
        ])

        sparse_to_dense = SparseToDense(frustum_dims)

        if self.use_ms_features:
            self.feature_adapters = nn.ModuleList([
                self.make_adapter(ms_feature_channels, num_features),
                self.make_adapter(ms_feature_channels, num_features * 2),
                self.make_adapter(ms_feature_channels, num_features * 4, [sparse_to_dense]),
            ])
        else:
            self.feature_adapters = None

        self.enc_level_conversion = nn.ModuleList([
            nn.Identity(),
            sparse_to_dense,
            nn.Identity(),
            nn.Identity(),
        ])

        self.level_decoders = nn.ModuleList([
            self.make_decoder(num_features * 3, num_output_features),
            self.make_decoder(
                num_features * 6, num_features * 2,
                extra_layers=[SparseBasicBlock3D(num_features * 2, num_features * 2)]
            ),
            self.make_decoder(num_features * 8, num_features * 2, is_sparse=False),
            self.make_decoder(num_features * 16, num_features * 4, is_sparse=False),
            self.make_decoder(num_features * 8, num_features * 8, is_sparse=False),
        ])

        # occupancy heads
        self.level_occupancy_heads = nn.ModuleList([
            nn.Sequential(
                Me.MinkowskiInstanceNorm(num_output_features),
                Me.MinkowskiReLU(inplace=True),
                SparseBasicBlock3D(num_output_features, num_output_features),
                Me.MinkowskiConvolution(num_output_features, 1, kernel_size=3, bias=True, dimension=3),
            ),
            Me.MinkowskiLinear(num_features * 2, 1),
            nn.Linear(num_features * 4, 1),
        ])

        # panoptic heads
        self.level_segm_embeddings = nn.ModuleList([
            nn.Sequential(
                Me.MinkowskiInstanceNorm(num_output_features),
                Me.MinkowskiReLU(inplace=True),
                SparseBasicBlock3D(num_output_features, num_output_features),
            ),
            SparseBasicBlock3D(num_features * 3, num_features * 3),
            nn.Sequential(
                BasicBlock3D(num_features * 4, num_features * 4),
                BasicBlock3D(num_features * 4, num_features * 4),
            )
        ])
        self.level_segm_query_projection = nn.ModuleList([
            nn.Linear(mask_dim, num_output_features),
            nn.Linear(mask_dim, num_features * 3),
            nn.Linear(mask_dim, num_features * 4),
        ])

        # geometry head
        self.geometry_head = nn.Sequential(
            Me.MinkowskiInstanceNorm(num_output_features),
            Me.MinkowskiReLU(inplace=True),
            SparseBasicBlock3D(num_output_features, num_output_features),
            Me.MinkowskiConvolution(num_output_features, 1, kernel_size=3, bias=True, dimension=3),
        )

        self.register_buffer("frustum_dimensions", torch.tensor(frustum_dims), persistent=False)

    @staticmethod
    def forward_sparse_segm(segm_features, queries):
        """Forward pass for sparse segmentation."""
        features = segm_features.decomposed_features
        segms = torch.cat(
            [torch.mm(features[idx], queries[idx].T) for idx in range(len(features))], dim=0
        )
        return Me.SparseTensor(
            segms,
            coordinate_manager=segm_features.coordinate_manager,
            coordinate_map_key=segm_features.coordinate_map_key,
        )

    @staticmethod
    def make_encoder(input_dim, output_dim, is_sparse=True):
        """Make encoder module."""
        if is_sparse:
            downsample = nn.Sequential(
                Me.MinkowskiConvolution(
                    input_dim, output_dim, kernel_size=4, stride=2, bias=True, dimension=3
                ),
                Me.MinkowskiInstanceNorm(output_dim),
            )
            module = nn.Sequential(
                SparseBasicBlock3D(input_dim, output_dim, stride=2, downsample=downsample),
                SparseBasicBlock3D(output_dim, output_dim),
            )
        else:
            downsample = nn.Conv3d(
                input_dim, output_dim,
                kernel_size=4, stride=2,
                padding=1, bias=False
            )
            module = nn.Sequential(
                BasicBlock3D(input_dim, output_dim, stride=2, downsample=downsample),
                BasicBlock3D(output_dim, output_dim),
            )
        return module

    @staticmethod
    def make_decoder(input_dim, output_dim, is_sparse=True, extra_layers: Optional[List] = None):
        """Make decoder module."""
        if extra_layers is None:
            extra_layers = []
        if is_sparse:
            return nn.Sequential(
                Me.MinkowskiConvolutionTranspose(
                    input_dim, output_dim, kernel_size=4,
                    stride=2, bias=False, dimension=3, expand_coordinates=True
                ),
                Me.MinkowskiInstanceNorm(output_dim),
                Me.MinkowskiReLU(inplace=True),
                *extra_layers,
            )
        else:
            return nn.Sequential(
                nn.ConvTranspose3d(input_dim, output_dim, kernel_size=4, stride=2, padding=1, bias=False),
                nn.InstanceNorm3d(output_dim),
                nn.ReLU(inplace=True),
                *extra_layers,
            )

    @staticmethod
    def make_adapter(input_dim, output_dim, extra_layers: Optional[List] = None):
        """Make adapter module."""
        if extra_layers is None:
            extra_layers = []
        downsample = nn.Sequential(
            Me.MinkowskiConvolution(input_dim, output_dim, kernel_size=1, stride=1, bias=True, dimension=3),
            Me.MinkowskiInstanceNorm(output_dim),
        )
        return nn.Sequential(
            SparseBasicBlock3D(input_dim, output_dim, downsample=downsample),
            *extra_layers,
        )

    def forward(
        self, ms_features: List[Me.SparseTensor],
        features: Me.SparseTensor, segm_queries, frustum_mask
    ):
        """Forward pass."""
        start_dim = 0
        encoded_inputs = []
        cm = features.coordinate_manager
        key = features.coordinate_map_key
        for dim, encoder in zip(self.input_dims, self.input_encoders):
            encoded_inputs.append(
                encoder(Me.SparseTensor(
                    features.F[:, start_dim:start_dim + dim], coordinate_manager=cm, coordinate_map_key=key
                ))
            )
            start_dim += dim
        encoded_inputs = Me.cat(*encoded_inputs)

        lvls = len(self.level_encoders)

        # high to low resolution
        encoder_outputs = []
        encoder_inputs = [encoded_inputs]

        for idx in range(len(self.level_encoders)):
            encoded = self.level_encoders[idx](encoder_inputs[idx])
            if self.use_ms_features and idx < len(self.feature_adapters):
                feat = self.feature_adapters[idx](ms_features[idx])

                if isinstance(encoded, torch.Tensor):
                    encoded = encoded + feat
                else:
                    feat = Me.SparseTensor(
                        feat.F, coordinates=feat.C,
                        tensor_stride=feat.tensor_stride,
                        coordinate_manager=encoded.coordinate_manager
                    )
                    encoded = encoded + feat

            encoder_outputs.append(encoded)

            if idx < lvls - 1:
                encoder_inputs.append(self.enc_level_conversion[idx](encoded))

        # low to high resolution
        decoder_outputs = []
        decoder_inputs = [encoder_outputs[-1]]
        pred_occupancies = []
        pred_segms = []
        pred_geometry = None

        # U-Net
        for idx in reversed(range(lvls)):
            decoded = self.level_decoders[idx](decoder_inputs[lvls - 1 - idx])
            decoder_outputs.append(decoded)

            if idx <= 1:
                # levels 128 and 256
                occupancy = self.level_occupancy_heads[idx](decoded)
                # mask invalid voxels outside of the frustum
                valid_mask = (
                    (occupancy.C[:, 1:] >= 0) & (occupancy.C[:, 1:] < self.frustum_dimensions)
                ).all(-1)
                pred_occupancies.append(Me.MinkowskiPruning()(occupancy, valid_mask))
                pruning_mask = (Me.MinkowskiSigmoid()(occupancy).F.squeeze(-1) > 0.5) & valid_mask
                sparse_out = Me.MinkowskiPruning()(decoded, pruning_mask)

                if idx > 0:
                    # level 128
                    sparse_out = sparse_cat_union(encoder_outputs[idx - 1], sparse_out)
                    valid_mask = (
                        (sparse_out.C[:, 1:] >= 0) & (sparse_out.C[:, 1:] < self.frustum_dimensions)
                    ).all(-1)
                    decoder_inputs.append(Me.MinkowskiPruning()(sparse_out, valid_mask))
                else:
                    # level 256
                    pred_geometry = self.geometry_head(sparse_out)
                    predicted_values = pred_geometry.F
                    predicted_values = torch.clamp(predicted_values, 0.0, self.truncation)
                    pred_geometry = Me.SparseTensor(
                        predicted_values,
                        coordinate_manager=pred_geometry.coordinate_manager,
                        coordinate_map_key=pred_geometry.coordinate_map_key,
                    )
                    valid_mask = (
                        (pred_geometry.C[:, 1:] >= 0) & (pred_geometry.C[:, 1:] < self.frustum_dimensions)
                    ).all(-1)
                    pred_geometry = Me.MinkowskiPruning()(pred_geometry, valid_mask)

                queries = self.level_segm_query_projection[idx](segm_queries)
                segm_features = self.level_segm_embeddings[idx](sparse_out)
                pred_segm = self.forward_sparse_segm(segm_features, queries)
                valid_mask = (
                    (pred_segm.C[:, 1:] >= 0) & (pred_segm.C[:, 1:] < self.frustum_dimensions)
                ).all(-1)
                pred_segms.append(Me.MinkowskiPruning()(pred_segm, valid_mask))

            elif idx == 2:
                # level 64
                decoded = torch.cat([encoder_inputs[idx], decoded], dim=1)
                occupancy = self.level_occupancy_heads[idx](decoded.permute(0, 2, 3, 4, 1)).squeeze(-1)
                pred_occupancies.append(occupancy.masked_fill(~frustum_mask.squeeze(1), -torch.inf))

                queries = self.level_segm_query_projection[idx](segm_queries)
                segm_features = self.level_segm_embeddings[idx](decoded)
                pred_segm = torch.einsum("bqc,bchwd->bqhwd", queries, segm_features)
                pred_segms.append(pred_segm.masked_fill(~frustum_mask, -torch.inf))

                pruning_mask = (occupancy.sigmoid() > 0.5) & frustum_mask.squeeze(1)
                coords = pruning_mask.nonzero()
                sparse_out = decoded[coords[:, 0], :, coords[:, 1], coords[:, 2], coords[:, 3]]
                encoded = encoder_outputs[idx - 1]
                stride = encoded.tensor_stride
                coords = coords.clone()
                coords[:, 1:] *= torch.tensor(stride, device=coords.device)
                sparse_out = Me.SparseTensor(
                    sparse_out, coordinates=coords.int().contiguous(),
                    tensor_stride=stride, coordinate_manager=cm
                )
                decoder_inputs.append(sparse_cat_union(encoded, sparse_out))
            else:
                decoder_inputs.append(torch.cat([encoder_inputs[idx], decoded], dim=1))

        return {
            "pred_geometry": pred_geometry,
            "pred_occupancies": pred_occupancies,
            "pred_segms": pred_segms,
        }

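As a usage note, `SparseToDense` prunes voxels that fall outside the configured grid and then densifies what remains. A minimal sketch, assuming MinkowskiEngine is installed; the coordinates and feature values below are made up:

```python
import torch
import MinkowskiEngine as Me

# three voxels with 8-channel features inside a 64^3 grid (toy data)
coords = torch.tensor([[0, 1, 2, 3], [0, 10, 10, 10], [0, 63, 0, 5]], dtype=torch.int32)
feats = torch.randn(3, 8)
sparse = Me.SparseTensor(features=feats, coordinates=coords)

dense = SparseToDense([64, 64, 64])(sparse)
print(dense.shape)  # torch.Size([1, 8, 64, 64, 64])
```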
nvpanoptix_3d/reconstruction/frustum.py ADDED
@@ -0,0 +1,112 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Frustum generation for Panoptic Recon 3D."""

from typing import Optional
import torch


def generate_frustum(
    image_size: torch.Tensor, intrinsic_inv: torch.Tensor,
    depth_min: float, depth_max: float, transform: Optional[torch.Tensor] = None
):
    """Generate frustum.
    Args:
        image_size: Image size.
        intrinsic_inv: Inverse intrinsic matrix.
        depth_min: Minimum depth.
        depth_max: Maximum depth.
        transform: Transform matrix.
    Returns:
        Frustum.
    """
    x = image_size[0]
    y = image_size[1]

    eight_points = torch.tensor(
        [
            [0.0, 0.0, depth_min, 1.0],
            [0.0, y * depth_min, depth_min, 1.0],
            [x * depth_min, y * depth_min, depth_min, 1.0],
            [x * depth_min, 0.0, depth_min, 1.0],
            [0.0, 0.0, depth_max, 1.0],
            [0.0, y * depth_max, depth_max, 1.0],
            [x * depth_max, y * depth_max, depth_max, 1.0],
            [x * depth_max, 0.0, depth_max, 1.0]
        ],
        device=intrinsic_inv.device, dtype=intrinsic_inv.dtype
    )

    frustum = intrinsic_inv @ eight_points.T

    if transform is not None:
        frustum = transform @ frustum

    frustum = frustum.T

    return frustum[:, :3]


def generate_frustum_volume(frustum: torch.Tensor, voxel_size: float):
    """Generate frustum volume.
    Args:
        frustum: Frustum.
        voxel_size: Voxel size.
    Returns:
        Frustum volume.
    """
    max_x = torch.max(frustum[:, 0]) / voxel_size
    max_y = torch.max(frustum[:, 1]) / voxel_size
    max_z = torch.max(frustum[:, 2]) / voxel_size
    min_x = torch.min(frustum[:, 0]) / voxel_size
    min_y = torch.min(frustum[:, 1]) / voxel_size
    min_z = torch.min(frustum[:, 2]) / voxel_size

    dim_x = torch.ceil(max_x - min_x)
    dim_y = torch.ceil(max_y - min_y)
    dim_z = torch.ceil(max_z - min_z)

    camera2frustum = torch.as_tensor(
        [
            [1.0 / voxel_size, 0, 0, -min_x],
            [0, 1.0 / voxel_size, 0, -min_y],
            [0, 0, 1.0 / voxel_size, -min_z],
            [0, 0, 0, 1.0]
        ],
        dtype=frustum.dtype, device=frustum.device
    )

    return torch.stack((dim_x, dim_y, dim_z)), camera2frustum


def compute_camera2frustum_transform(
    frustum: torch.Tensor, voxel_size: float,
    frustum_dimensions: Optional[torch.Tensor] = None
):
    """Compute camera to frustum transform.
    Args:
        frustum: Frustum.
        voxel_size: Voxel size.
        frustum_dimensions: Frustum dimensions.
    Returns:
        Camera to frustum transform.
    """
    dimensions, camera2frustum = generate_frustum_volume(frustum, voxel_size)
    if frustum_dimensions is not None:
        difference = (frustum_dimensions - dimensions).float()
        padding_offsets = torch.div(difference, 2, rounding_mode="floor")
        return camera2frustum, padding_offsets
    else:
        return camera2frustum

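These three helpers compose naturally: generate the frustum corners from the camera intrinsics, then derive the camera-to-frustum transform that fits the frustum into the voxel grid. A short sketch with made-up pinhole intrinsics for a 640x480 image:

```python
import torch

intrinsic = torch.tensor([  # hypothetical 4x4 pinhole intrinsics
    [525.0, 0.0, 320.0, 0.0],
    [0.0, 525.0, 240.0, 0.0],
    [0.0, 0.0, 1.0, 0.0],
    [0.0, 0.0, 0.0, 1.0],
])
image_size = torch.tensor([640.0, 480.0])

frustum = generate_frustum(image_size, torch.inverse(intrinsic), depth_min=0.4, depth_max=6.0)
print(frustum.shape)  # torch.Size([8, 3]): the eight frustum corners in camera space

# fit the frustum into a 256^3 voxel grid at 3 cm resolution
camera2frustum, padding_offsets = compute_camera2frustum_transform(
    frustum, voxel_size=0.03, frustum_dimensions=torch.tensor([256.0, 256.0, 256.0])
)
print(camera2frustum.shape)  # torch.Size([4, 4]); padding_offsets centers the volume
```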
nvpanoptix_3d/reconstruction/reprojection.py ADDED
@@ -0,0 +1,235 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Sparse projection for Panoptic Recon 3D."""

import torch
from torch import nn
from torch.nn import functional as F
import MinkowskiEngine as Me

from ..utils.point_features import point_sample
from .frustum import generate_frustum, compute_camera2frustum_transform


class SparseProjection(nn.Module):
    """Sparse projection module."""

    def __init__(
        self, cfg,
        truncation=3.0,
        sign_channel=True,
        depth_min=0.4,
        depth_max=6.0,
        voxel_size=0.03,
        frustum_dims=256
    ):
        """Initialize the sparse projection module."""
        super().__init__()

        self.truncation = cfg.model.frustum3d.truncation
        self.sign_channel = cfg.model.projection.sign_channel
        self.depth_min = cfg.dataset.depth_min
        self.depth_max = cfg.dataset.depth_max
        self.voxel_size = cfg.model.projection.voxel_size
        self.register_buffer(
            "frustum_dimensions",
            torch.tensor([frustum_dims, frustum_dims, frustum_dims]),
            persistent=False
        )

    @property
    def device(self):
        """Get the device of the sparse projection module."""
        return self.frustum_dimensions.device

    @staticmethod
    def to_sparse_tensor(features, coordinates, stride=1):
        """Convert features and coordinates to a sparse tensor."""
        ms_sparse_features = torch.cat(features, dim=0)
        batched_coordinates = Me.utils.batched_coordinates(coordinates, device=ms_sparse_features.device)
        batched_coordinates[:, 1:] *= stride
        tensor = Me.SparseTensor(
            features=ms_sparse_features,
            coordinates=batched_coordinates,
            tensor_stride=stride,
            quantization_mode=Me.SparseTensorQuantizationMode.RANDOM_SUBSAMPLE
        )
        return tensor

    @staticmethod
    def projection(
        frustum, voxel_size, frustum_dimensions,
        truncation, intrinsic_inverse, depth,
        image_size, feat_size, near_clip, far_clip
    ):
        """
        Project depth pixels into the frustum voxel grid.

        Args:
            frustum: Frustum.
            voxel_size: Voxel size.
            frustum_dimensions: Frustum dimensions.
            truncation: Truncation.
            intrinsic_inverse: Inverse intrinsic matrix.
            depth: Depth.
            image_size: Image size.
            feat_size: Feature size.
            near_clip: Near clip.
            far_clip: Far clip.
        Returns:
            num_repetition: Number of repetitions per depth pixel.
            segm_sampling_grid: Segmentation sampling grid.
            feat_sampling_grid: Feature sampling grid.
            flatten_coordinates: Flattened voxel coordinates.
            coordinates_z: Z coordinates.
            voxel_offsets: Voxel offsets.
        """
        camera2frustum, padding_offsets = compute_camera2frustum_transform(
            frustum, voxel_size,
            frustum_dimensions=frustum_dimensions
        )

        depth = depth.clone()
        depth[depth < near_clip] = 0
        depth[depth > far_clip] = 0
        depth_pixels_xy = depth.nonzero(as_tuple=False)
        device = depth_pixels_xy.device

        if depth_pixels_xy.shape[0] == 0:
            depth_pixels_xy = torch.tensor(
                [[depth.shape[0] // 2, depth.shape[1] // 2]], device=device
            )
        depth_pixels_z = depth[depth_pixels_xy[:, 0], depth_pixels_xy[:, 1]].reshape(-1).float()

        depth_pixels_xy = depth_pixels_xy.flip(-1).float()
        normalized_depth_pixels_xy = depth_pixels_xy / torch.tensor(
            [depth.shape[-1], depth.shape[-2]], device=device
        )
        xv, yv = (normalized_depth_pixels_xy * torch.tensor(
            image_size, device=device) * depth_pixels_z[:, None]
        ).unbind(-1)
        # Use a separate size for feature maps due to size-divisibility padding
        feat_sampling_grid = depth_pixels_xy / torch.tensor(feat_size, device=device)

        depth_pixels = torch.stack([xv, yv, depth_pixels_z, torch.ones_like(depth_pixels_z)])
        pointcloud = torch.mm(intrinsic_inverse.float(), depth_pixels.float())
        grid_coordinates = torch.mm(camera2frustum.float(), pointcloud).t()[:, :3].contiguous()

        # projective sdf encoding:
        # repeat truncation, add / subtract z-offset
        num_repetition = int(truncation * 2) + 1
        grid_coordinates = grid_coordinates.unsqueeze(1).repeat(1, num_repetition, 1)
        voxel_offsets = torch.arange(-truncation, truncation + 1, 1.0, device=device).view(1, -1, 1)
        coordinates_z = grid_coordinates[:, :, 2].clone()
        grid_coordinates[:, :, 2] += voxel_offsets[:, :, 0]

        num_points = grid_coordinates.size(0)

        flatten_coordinates = grid_coordinates.view(num_points * num_repetition, 3)
        # pad to 256,256,256
        flatten_coordinates = flatten_coordinates + padding_offsets
        return num_repetition, normalized_depth_pixels_xy, \
            feat_sampling_grid, flatten_coordinates, coordinates_z, voxel_offsets

    def forward(self, multi_scale_features, encoder_features, batched_inputs):
        """Forward pass."""
        sparse_ms_coordinates = [[] for _ in range(len(multi_scale_features))]
        sparse_ms_features = [[] for _ in range(len(multi_scale_features))]
        sparse_enc_features = []
        sparse_enc_coordinates = []

        # Process each sample in the batch individually
        for idx, inputs in enumerate(batched_inputs):
            # Get GT intrinsic matrix
            intrinsic = inputs["intrinsic"].to(self.device)
            image_size = inputs["image_size"]
            padded_size = inputs["padded_size"]
            intrinsic_inverse = torch.inverse(intrinsic)

            frustum = generate_frustum(
                image_size, intrinsic_inverse,
                self.depth_min, self.depth_max
            )

            num_repetition, segm_sampling_grid, feat_sampling_grid, \
                flatten_coordinates, coordinates_z, voxel_offsets = \
                self.projection(
                    frustum, self.voxel_size,
                    self.frustum_dimensions, self.truncation,
                    intrinsic_inverse, inputs["depth"],
                    image_size, (padded_size[0] // 2, padded_size[1] // 2),
                    self.depth_min, self.depth_max
                )

            df_values = coordinates_z - coordinates_z.int()
            df_values = df_values + voxel_offsets.squeeze(-1)
            df_values.unsqueeze_(-1)

            # encode sign and value in 2 different channels
            if self.sign_channel:
                sign = torch.sign(df_values)
                value = torch.abs(df_values)
                df_values = torch.cat([sign, value], dim=-1)

            # segm features
            sem_seg = inputs["sem_seg"]
            sampled_segm_features = point_sample(sem_seg[None], segm_sampling_grid[None], align_corners=False)[0]

            # encoder features
            sampled_enc_features = point_sample(
                encoder_features[[idx]],
                feat_sampling_grid[None],
                align_corners=False,
            )[0]
            sampled_enc_features = torch.cat([sampled_enc_features, sampled_segm_features], dim=0)
            sampled_enc_features = sampled_enc_features.permute(1, 0).unsqueeze(1).repeat(1, num_repetition, 1)
            sampled_enc_features = torch.cat([df_values, sampled_enc_features], dim=-1)

            flat_features = sampled_enc_features.flatten(0, -2)
            sparse_enc_coordinates.append(flatten_coordinates)
            sparse_enc_features.append(flat_features)

            # multi-scale features
            for lvl, feat in enumerate(multi_scale_features):
                ratio = feat.shape[-1] / encoder_features.shape[-1]
                level_depth = F.interpolate(
                    inputs["depth"][None, None], scale_factor=ratio, mode="nearest"
                ).squeeze()
                num_repetition, segm_sampling_grid, feat_sampling_grid, flatten_coordinates, *__ = \
                    self.projection(
                        frustum, self.voxel_size / ratio, self.frustum_dimensions * ratio,
                        round(ratio * self.truncation), intrinsic_inverse, level_depth,
                        image_size, (feat.shape[-1] * 2, feat.shape[-2] * 2),
                        self.depth_min, self.depth_max,
                    )
                sampled_features = point_sample(
                    feat[[idx]],
                    feat_sampling_grid[None],
                    align_corners=False,
                )[0]
                sampled_features = \
                    sampled_features.permute(1, 0).unsqueeze(1).repeat(1, num_repetition, 1).flatten(0, -2)
                sparse_ms_features[lvl].append(sampled_features)
                # Resize feature volume
                sparse_ms_coordinates[lvl].append(flatten_coordinates.clone())

        # Batch
        sparse_enc_features = self.to_sparse_tensor(sparse_enc_features, sparse_enc_coordinates)
        strides = [2, 4, 8]
        sparse_ms_features = [
            self.to_sparse_tensor(feats, coords, stride=stride)
            for feats, coords, stride in zip(sparse_ms_features, sparse_ms_coordinates, strides)
        ]

        return sparse_ms_features, sparse_enc_features

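The key idea in `SparseProjection.projection` is the projective SDF encoding: each valid depth pixel is back-projected and then replicated `2 * truncation + 1` times along the viewing ray's z-axis, so every voxel near the surface stores a signed offset to it. A toy sketch of just that replication step (values are illustrative):

```python
import torch

truncation = 3.0
num_repetition = int(truncation * 2) + 1  # 7 samples per depth pixel

grid_z = torch.tensor([10.4]).unsqueeze(1).repeat(1, num_repetition)  # one toy surface voxel
voxel_offsets = torch.arange(-truncation, truncation + 1, 1.0)
print(grid_z + voxel_offsets)
# -> 7.4, 8.4, 9.4, 10.4, 11.4, 12.4, 13.4: a 7-voxel band around the surface
```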
nvpanoptix_3d/utils/__init__.py ADDED
@@ -0,0 +1,15 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utils for Panoptic Recon 3D."""

nvpanoptix_3d/utils/coords_transform.py ADDED
@@ -0,0 +1,232 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Coordinate transform utils."""

import torch
import MinkowskiEngine as Me
from typing import List

from ..reconstruction.frustum import \
    generate_frustum, compute_camera2frustum_transform


def transform_feat3d_coordinates(
    feat3d, intrinsic,
    image_size=(120, 160),
    depth_min=0.4, depth_max=6.0,
    voxel_size=0.03
):
    """
    Transform feat3d coordinates to match the Uni3D coordinate system.

    Args:
        feat3d: Me.SparseTensor from occupancy-aware lifting
        intrinsic: Camera intrinsic matrix (4x4)
        image_size: tuple of (height, width)
        depth_min, depth_max: depth range
        voxel_size: voxel size in meters
    Returns:
        Me.SparseTensor with transformed coordinates
    """
    device = feat3d.device
    coords = feat3d.C.clone()

    # step 1: apply coordinate flip (as done in BackProjection line 33)
    coords[:, 1:3] = 256 - coords[:, 1:3]  # flip x, y coordinates
    batch_indices = coords[:, 0].unique()

    compute_once = True
    if intrinsic.dim() == 3:  # batched intrinsics
        # check if all intrinsics are identical
        if len(batch_indices) > 1:
            compute_once = torch.allclose(intrinsic[0:1].expand_as(intrinsic), intrinsic, atol=1e-6)
        intrinsic_ref = intrinsic[0] if compute_once else None
    else:
        intrinsic_ref = intrinsic

    if compute_once:
        intrinsic_batch = intrinsic_ref
        intrinsic_inverse = torch.inverse(intrinsic_batch)
        frustum = generate_frustum(image_size, intrinsic_inverse, depth_min, depth_max)
        camera2frustum, padding_offsets = compute_camera2frustum_transform(
            frustum.to(device), voxel_size,
            frustum_dimensions=torch.tensor([256, 256, 256], device=device)
        )
        # pre-move to device and pre-compute the inverse
        camera2frustum = camera2frustum.to(device)
        padding_offsets = padding_offsets.to(device)
        camera2frustum_inv = torch.inverse(camera2frustum).float()
        ones_offset = torch.tensor([1., 1., 1.], device=device)

    transformed_coords_list = []

    for batch_idx in batch_indices:
        batch_mask = coords[:, 0] == batch_idx
        batch_coords = coords[batch_mask, 1:].float()  # convert to float once per batch

        # use pre-computed values or compute per batch
        if not compute_once:
            intrinsic_batch = intrinsic[int(batch_idx)]
            intrinsic_inverse = torch.inverse(intrinsic_batch)
            frustum = generate_frustum(image_size, intrinsic_inverse, depth_min, depth_max)
            camera2frustum, padding_offsets = compute_camera2frustum_transform(
                frustum.to(device), voxel_size,
                frustum_dimensions=torch.tensor([256, 256, 256], device=device)
            )
            camera2frustum = camera2frustum.float().to(device)
            padding_offsets = padding_offsets.to(device)
            camera2frustum_inv = torch.inverse(camera2frustum).float()
            ones_offset = torch.tensor([1., 1., 1.], device=device)

        # convert voxel coordinates to world coordinates (reverse of BackProjection)
        batch_coords_adjusted = batch_coords - padding_offsets - ones_offset

        # convert to homogeneous coordinates
        homogenous_coords = torch.cat([
            batch_coords_adjusted,
            torch.ones(batch_coords_adjusted.shape[0], 1, device=device)
        ], dim=1)  # [N_batch, 4]

        # apply transformations: world space -> frustum space
        world_coords = torch.mm(camera2frustum_inv, homogenous_coords.t())
        final_coords_homog = torch.mm(camera2frustum.float(), world_coords.float())
        final_coords = final_coords_homog.t()[:, :3]

        # add padding offsets (as done in SparseProjection.projection())
        final_coords = final_coords + padding_offsets

        # add the batch index back
        batch_column = torch.full(
            (final_coords.shape[0], 1),
            batch_idx,
            device=device,
            dtype=torch.float32
        )
        final_batch_coords = torch.cat([batch_column, final_coords], dim=1)
        transformed_coords_list.append(final_batch_coords)

    transformed_coords = torch.cat(transformed_coords_list, dim=0)

    transformed_feat3d = Me.SparseTensor(
        features=feat3d.F,
        coordinates=transformed_coords.int(),
        tensor_stride=feat3d.tensor_stride,
        quantization_mode=feat3d.quantization_mode
    )

    return transformed_feat3d


def fuse_sparse_tensors(tensor1: Me.SparseTensor, tensor2: Me.SparseTensor) -> Me.SparseTensor:
    """
    Efficiently fuse two sparse tensors.

    Args:
        tensor1 (Me.SparseTensor): First sparse tensor
        tensor2 (Me.SparseTensor): Second sparse tensor

    Returns:
        Me.SparseTensor: Fused sparse tensor with concatenated features
    """
    device = tensor1.device
    dtype = tensor1.F.dtype

    # get coordinates and features
    coords1, feats1 = tensor1.C, tensor1.F
    coords2, feats2 = tensor2.C, tensor2.F

    feat_dim1, feat_dim2 = feats1.shape[1], feats2.shape[1]
    fused_feat_dim = feat_dim1 + feat_dim2

    # concatenate coordinates and create source tracking
    all_coords = torch.cat([coords1, coords2], dim=0)
    n_coords1 = coords1.shape[0]

    # convert each coordinate row to a view that can be uniqued
    coord_view = all_coords.view(all_coords.shape[0], -1)

    # use torch.unique with return_inverse to get the mapping
    unique_coord_view, inverse_indices = torch.unique(coord_view, dim=0, return_inverse=True)
    unique_coords = unique_coord_view.view(-1, coords1.shape[1])
    n_unique = unique_coords.shape[0]

    # split inverse indices for each tensor
    inv_indices_1 = inverse_indices[:n_coords1]
    inv_indices_2 = inverse_indices[n_coords1:]

    # pre-allocate with zeros for automatic padding
    fused_features = torch.zeros(n_unique, fused_feat_dim, device=device, dtype=dtype)

    # tensor1 features go to positions [0:feat_dim1]
    fused_features[inv_indices_1, :feat_dim1] = feats1

    # tensor2 features go to positions [feat_dim1:feat_dim1+feat_dim2]
    fused_features[inv_indices_2, feat_dim1:] = feats2
    fused_tensor = Me.SparseTensor(
        features=fused_features,
        coordinates=unique_coords.int(),
        tensor_stride=tensor1.tensor_stride,
        quantization_mode=tensor1.quantization_mode
    )
    return fused_tensor


def generate_multiscale_feat3d(transformed_feat3d: Me.SparseTensor) -> List[Me.SparseTensor]:
    """
    Generate multi-scale sparse 3D features
    from transformed_feat3d to match the sparse_multi_scale_features structure.

    Args:
        transformed_feat3d (Me.SparseTensor):
            Input sparse tensor from occupancy-aware lifting (256 grid)

    Returns:
        List[Me.SparseTensor]: Multi-scale sparse tensors
        at scales [1/2, 1/4, 1/8] corresponding to [128, 64, 32] grid sizes
    """
    device = transformed_feat3d.device

    # use a consistent stride of 2 for progressive downsampling;
    # this ensures proper 1/2, 1/4, 1/8 scaling from the original 256 grid
    pooling_op = Me.MinkowskiMaxPooling(
        kernel_size=3,
        stride=2,
        dimension=3
    ).to(device)

    multi_scale_feat3d = []
    current_tensor = transformed_feat3d
    target_strides = [2, 4, 8]  # expected final strides for each scale

    # generate features at each scale by progressive pooling with stride 2
    for target_stride in target_strides:
        # apply stride-2 pooling to get the next scale
        pooled_tensor = pooling_op(current_tensor)

        # ensure the tensor stride matches the expected value;
        # the stride should be 2^(i+1) relative to the original
        if pooled_tensor.tensor_stride != target_stride:
            pooled_tensor = Me.SparseTensor(
                features=pooled_tensor.F,
                coordinates=pooled_tensor.C,
                tensor_stride=target_stride,
                quantization_mode=pooled_tensor.quantization_mode
            )

        multi_scale_feat3d.append(pooled_tensor)

        # use the pooled tensor as input for the next scale (progressive downsampling);
        # this gives 256 -> 128 -> 64 -> 32 grid sizes
        current_tensor = pooled_tensor

    return multi_scale_feat3d

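`fuse_sparse_tensors` concatenates channels voxel by voxel: coordinates present in only one input get zero-padding in the other's channel range. A minimal sketch with toy tensors, assuming MinkowskiEngine is installed:

```python
import torch
import MinkowskiEngine as Me

# two toy sparse tensors that share the voxel [0, 1, 2, 3]
a = Me.SparseTensor(
    torch.ones(2, 4),
    coordinates=torch.tensor([[0, 1, 2, 3], [0, 4, 4, 4]], dtype=torch.int32),
)
b = Me.SparseTensor(
    torch.full((1, 2), 2.0),
    coordinates=torch.tensor([[0, 1, 2, 3]], dtype=torch.int32),
)

fused = fuse_sparse_tensors(a, b)
print(fused.F.shape)  # torch.Size([2, 6]): 4 channels from a plus 2 from b, zero-padded
```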
nvpanoptix_3d/utils/frustum.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
""" Frustum utilities, mostly using numpy. """
|
| 16 |
+
|
| 17 |
+
import math
|
| 18 |
+
import torch
|
| 19 |
+
import numpy as np
|
| 20 |
+
from typing import Tuple
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def frustum2planes(frustum: np.ndarray) -> dict:
|
| 24 |
+
"""Convert frustum to planes.
|
| 25 |
+
Args:
|
| 26 |
+
frustum: Frustum.
|
| 27 |
+
Returns:
|
| 28 |
+
Planes.
|
| 29 |
+
"""
|
| 30 |
+
planes = {}
|
| 31 |
+
# normal towards inside
|
| 32 |
+
# near
|
| 33 |
+
a = frustum[3] - frustum[0]
|
| 34 |
+
b = frustum[1] - frustum[0]
|
| 35 |
+
normal = np.cross(a, b)
|
| 36 |
+
d = -np.dot(normal, frustum[0])
|
| 37 |
+
planes["near"] = np.array([normal[0], normal[1], normal[2], d])
|
| 38 |
+
|
| 39 |
+
# far
|
| 40 |
+
a = frustum[5] - frustum[4]
|
| 41 |
+
b = frustum[7] - frustum[4]
|
| 42 |
+
normal = np.cross(a, b)
|
| 43 |
+
d = -np.dot(normal, frustum[4])
|
| 44 |
+
planes["far"] = np.array([normal[0], normal[1], normal[2], d])
|
| 45 |
+
|
| 46 |
+
# left
|
| 47 |
+
a = frustum[5] - frustum[1]
|
| 48 |
+
b = frustum[0] - frustum[1]
|
| 49 |
+
normal = np.cross(a, b)
|
| 50 |
+
d = -np.dot(normal, frustum[1])
|
| 51 |
+
planes["left"] = np.array([normal[0], normal[1], normal[2], d])
|
| 52 |
+
|
| 53 |
+
# right
|
| 54 |
+
a = frustum[3] - frustum[2]
|
| 55 |
+
b = frustum[6] - frustum[2]
|
| 56 |
+
normal = np.cross(a, b)
|
| 57 |
+
d = -np.dot(normal, frustum[2])
|
| 58 |
+
planes["right"] = np.array([normal[0], normal[1], normal[2], d])
|
| 59 |
+
|
| 60 |
+
# top
|
| 61 |
+
a = frustum[4] - frustum[0]
|
| 62 |
+
b = frustum[3] - frustum[0]
|
| 63 |
+
normal = np.cross(a, b)
|
| 64 |
+
d = -np.dot(normal, frustum[0])
|
| 65 |
+
planes["top"] = np.array([normal[0], normal[1], normal[2], d])
|
| 66 |
+
|
| 67 |
+
# bottom
|
| 68 |
+
a = frustum[2] - frustum[1]
|
| 69 |
+
b = frustum[5] - frustum[1]
|
| 70 |
+
normal = np.cross(a, b)
|
| 71 |
+
d = -np.dot(normal, frustum[1])
|
| 72 |
+
planes["bottom"] = np.array([normal[0], normal[1], normal[2], d])
|
| 73 |
+
|
| 74 |
+
return planes
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def frustum_culling(points: np.ndarray, frustum: np.ndarray) -> np.ndarray:
|
| 78 |
+
"""Cull points outside frustum.
|
| 79 |
+
Args:
|
| 80 |
+
points: Points.
|
| 81 |
+
frustum: Frustum.
|
| 82 |
+
Returns:
|
| 83 |
+
Points inside frustum.
|
| 84 |
+
"""
|
| 85 |
+
frustum_planes = frustum2planes(frustum)
|
| 86 |
+
points = np.concatenate([points, np.ones((len(points), 1))], 1)
|
| 87 |
+
flags = np.ones(len(points))
|
| 88 |
+
for _, plane in frustum_planes.items():
|
| 89 |
+
flag = np.dot(points, plane) >= 0
|
| 90 |
+
flags = np.logical_and(flags, flag)
|
| 91 |
+
|
| 92 |
+
return points[flags][:, :3]
|
| 93 |
+
|
| 94 |
+
|
def frustum_transform(frustum: np.ndarray, transform: np.ndarray) -> np.ndarray:
    """Transform frustum.

    Args:
        frustum: Frustum corner points, shape (8, 3).
        transform: 4x4 transform matrix.

    Returns:
        Transformed frustum corner points, shape (8, 3).
    """
    eight_points = np.concatenate([frustum, np.ones((8, 1))], 1).transpose()
    frustum = np.dot(transform, eight_points).transpose()
    return frustum[:, :3]


def generate_frustum(
    image_size: Tuple, intrinsic_inv: np.ndarray,
    depth_min: float, depth_max: float,
    transform: np.ndarray = None
) -> np.ndarray:
    """Generate frustum.

    Args:
        image_size: Image size (height, width).
        intrinsic_inv: 4x4 inverse intrinsic matrix.
        depth_min: Minimum depth.
        depth_max: Maximum depth.
        transform: Optional 4x4 transform matrix.

    Returns:
        Frustum corner points, shape (8, 3).
    """
    x = image_size[1]
    y = image_size[0]

    eight_points = np.array([[0.0, 0.0, depth_min, 1.0],
                             [0.0, y * depth_min, depth_min, 1.0],
                             [x * depth_min, y * depth_min, depth_min, 1.0],
                             [x * depth_min, 0.0, depth_min, 1.0],
                             [0.0, 0.0, depth_max, 1.0],
                             [0.0, y * depth_max, depth_max, 1.0],
                             [x * depth_max, y * depth_max, depth_max, 1.0],
                             [x * depth_max, 0.0, depth_max, 1.0]]).transpose()

    frustum = np.dot(intrinsic_inv, eight_points)

    if transform is not None:
        frustum = np.dot(transform, frustum)

    frustum = frustum.transpose()

    return frustum[:, :3]


def generate_frustum_volume(frustum: np.ndarray, voxel_size: float) -> Tuple:
    """Generate frustum volume.

    Args:
        frustum: Frustum corner points, shape (8, 3).
        voxel_size: Voxel size.

    Returns:
        Volume dimensions (dim_x, dim_y, dim_z).
        Camera-to-frustum transform.
    """
    max_x = np.max(frustum[:, 0]) / voxel_size
    max_y = np.max(frustum[:, 1]) / voxel_size
    max_z = np.max(frustum[:, 2]) / voxel_size
    min_x = np.min(frustum[:, 0]) / voxel_size
    min_y = np.min(frustum[:, 1]) / voxel_size
    min_z = np.min(frustum[:, 2]) / voxel_size

    dim_x = math.ceil(max_x - min_x)
    dim_y = math.ceil(max_y - min_y)
    dim_z = math.ceil(max_z - min_z)

    camera2frustum = np.array([[1.0 / voxel_size, 0, 0, -min_x],
                               [0, 1.0 / voxel_size, 0, -min_y],
                               [0, 0, 1.0 / voxel_size, -min_z],
                               [0, 0, 0, 1.0]])

    return (dim_x, dim_y, dim_z), camera2frustum


def compute_camera2frustum_transform(
    intrinsic: torch.Tensor, image_size: Tuple,
    depth_min: float, depth_max: float,
    voxel_size: float
) -> torch.Tensor:
    """Compute camera-to-frustum transform.

    Args:
        intrinsic: 4x4 intrinsic matrix.
        image_size: Image size (height, width).
        depth_min: Minimum depth.
        depth_max: Maximum depth.
        voxel_size: Voxel size.

    Returns:
        Camera-to-frustum transform as a 4x4 float tensor.
    """
    frustum = generate_frustum(image_size, torch.inverse(intrinsic).numpy(), depth_min, depth_max)
    _, camera2frustum = generate_frustum_volume(frustum, voxel_size)
    camera2frustum = torch.from_numpy(camera2frustum).float()

    return camera2frustum
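To see the whole chain in isolation, the transform can be computed directly. A minimal sketch with illustrative Front3D-style intrinsics:

```python
import torch

intrinsic = torch.tensor([[277.128, 0., 159.5, 0.],
                          [0., 277.128, 119.5, 0.],
                          [0., 0., 1., 0.],
                          [0., 0., 0., 1.]])
camera2frustum = compute_camera2frustum_transform(
    intrinsic, image_size=(240, 320), depth_min=0.4, depth_max=6.0, voxel_size=0.03
)
# Maps homogeneous camera-space points into voxel indices of the frustum volume.
print(camera2frustum.shape)  # torch.Size([4, 4])
```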
nvpanoptix_3d/utils/helper.py
ADDED
@@ -0,0 +1,326 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Helper utils."""

import json
import copy
import itertools
import logging
import warnings
import numpy as np
from functools import wraps
from contextlib import contextmanager
from typing import Tuple, Union, Optional
from fvcore.transforms.transform import Transform

import os
import torch
import torch.nn as nn
import torch.nn.functional as F
# detects torch.compile/Dynamo tracing in Conv2d.forward below
from torch._dynamo import is_compiling


def adjust_intrinsic(
    intrinsic: Union[np.ndarray, torch.Tensor],
    intrinsic_image_dim: Tuple,
    image_dim: Tuple
) -> Union[np.ndarray, torch.Tensor]:
    """
    Adjust intrinsic camera parameters for image dimension changes.

    Args:
        intrinsic: Camera intrinsic matrix (numpy array, or batched torch tensor)
        intrinsic_image_dim: Original image dimensions (width, height)
        image_dim: Target image dimensions (width, height)

    Returns:
        Adjusted intrinsic matrix (same type as input)
    """
    if intrinsic_image_dim == image_dim:
        return intrinsic

    # Calculate scaling factors
    height_after = image_dim[1]
    height_before = intrinsic_image_dim[1]
    width_after = image_dim[0]
    width_before = intrinsic_image_dim[0]

    width_scale = float(width_after) / float(width_before)
    height_scale = float(height_after) / float(height_before)
    width_offset_scale = float(width_after - 1) / float(width_before - 1)
    height_offset_scale = float(height_after - 1) / float(height_before - 1)

    # handle numpy array case (single 3x3 or 4x4 matrix)
    if isinstance(intrinsic, np.ndarray):
        intrinsic_return = np.copy(intrinsic)

        intrinsic_return[0, 0] *= width_scale
        intrinsic_return[1, 1] *= height_scale
        # the principal point is scaled with (dim - 1) to account for pixel-center alignment
        intrinsic_return[0, 2] *= width_offset_scale
        intrinsic_return[1, 2] *= height_offset_scale

        return intrinsic_return

    # handle torch tensor case (batched, shape [B, 3, 3] or [B, 4, 4])
    elif isinstance(intrinsic, torch.Tensor):
        intrinsic_return = intrinsic.clone()

        intrinsic_return[:, 0, 0] *= width_scale
        intrinsic_return[:, 1, 1] *= height_scale

        intrinsic_return[:, 0, 2] *= width_offset_scale
        intrinsic_return[:, 1, 2] *= height_offset_scale

        return intrinsic_return

    else:
        raise TypeError(f"Unsupported input type: {type(intrinsic)}.")

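As a quick numeric check with illustrative 640x480 intrinsics: halving the image roughly halves the focal lengths, while the principal point follows the (dim - 1) offset rule:

```python
import numpy as np

K = np.array([[554.26, 0., 319.5],
              [0., 554.26, 239.5],
              [0., 0., 1.]])
K_small = adjust_intrinsic(K, intrinsic_image_dim=(640, 480), image_dim=(320, 240))
print(K_small[0, 0], K_small[0, 2])  # fx ~ 277.13, cx = 159.5
```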
class ModelInputResize(Transform):
    """Resize and pad the model input."""

    def __init__(self, size_divisibility: int = 0, pad_value: float = 0):
        """Initialize model input resize transform."""
        super().__init__()
        self.size_divisibility = size_divisibility
        self.pad_value = pad_value

    def apply_coords(self, coords):
        """Apply transforms to the coordinates."""
        return coords

    def apply_image(self, array: torch.Tensor) -> torch.Tensor:
        """Pad the image so both spatial dims are multiples of `size_divisibility`."""
        assert len(array) > 0
        device = array.device
        image_size = [array.shape[-2], array.shape[-1]]

        max_size = torch.tensor(image_size, device=device)
        if self.size_divisibility > 1:
            stride = self.size_divisibility
            max_size = (max_size + (stride - 1)).div(stride, rounding_mode="floor") * stride

        u0 = max_size[-1] - image_size[1]
        u1 = max_size[-2] - image_size[0]
        # pad on the right and bottom only
        padding_size = [0, u0, 0, u1]

        array = F.pad(array, padding_size, value=self.pad_value)
        return array

    def apply_segmentation(self, array: torch.Tensor) -> torch.Tensor:
        """Apply transforms to the segmentation."""
        return array

@contextmanager
def _ignore_torch_cuda_oom():
    """
    A context manager that ignores CUDA OOM exceptions from pytorch.
    """
    try:
        yield
    except RuntimeError as e:
        # NOTE: the error string may change across pytorch versions
        if "CUDA out of memory. " in str(e):
            pass
        else:
            raise


def retry_if_cuda_oom(func):
    """
    Makes a function retry itself after encountering
    pytorch's CUDA OOM error.
    It will first retry after calling `torch.cuda.empty_cache()`.

    If that still fails, it will then retry by trying to convert inputs to CPUs.
    In this case, it expects the function to dispatch to CPU implementation.
    The return values may become CPU tensors as well and it's the user's
    responsibility to convert them back to CUDA tensors if needed.

    Args:
        func: a stateless callable that takes tensor-like objects as arguments

    Returns:
        a callable which retries `func` if OOM is encountered.

    Examples:
    ::
        output = retry_if_cuda_oom(some_torch_function)(input1, input2)
        # output may be on CPU even if inputs are on GPU

    Note:
        1. When converting inputs to CPU, it will only look at each argument and check
           if it has `.device` and `.to` for conversion. Nested structures of tensors
           are not supported.

        2. Since the function might be called more than once, it has to be
           stateless.
    """

    def maybe_to_cpu(x):
        """Move a tensor-like object to CPU if it lives on a CUDA device."""
        try:
            like_gpu_tensor = x.device.type == "cuda" and hasattr(x, "to")
        except AttributeError:
            like_gpu_tensor = False
        if like_gpu_tensor:
            return x.to(device="cpu")
        return x

    @wraps(func)
    def wrapped(*args, **kwargs):
        """Wrapped function."""
        with _ignore_torch_cuda_oom():
            return func(*args, **kwargs)

        # Clear cache and retry
        torch.cuda.empty_cache()
        with _ignore_torch_cuda_oom():
            return func(*args, **kwargs)

        # Try on CPU. This slows down the code significantly, therefore print a notice.
        logging.info(f"Attempting to copy inputs of {str(func)} to CPU due to CUDA OOM")
        new_args = (maybe_to_cpu(x) for x in args)
        new_kwargs = {k: maybe_to_cpu(v) for k, v in kwargs.items()}
        return func(*new_args, **new_kwargs)

    return wrapped

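A concrete use of the wrapper, with a hypothetical `big_matmul` standing in for any tensor function:

```python
import torch

def big_matmul(a, b):
    return a @ b

a = torch.randn(512, 512)
b = torch.randn(512, 512)
safe_matmul = retry_if_cuda_oom(big_matmul)
out = safe_matmul(a, b)  # retried with an emptied CUDA cache, then on CPU, if it OOMs
```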
def prepare_kept_mapping(model, cfg, dataset, frustum_mask=None, intrinsic=None):
    """
    Prepare kept and mapping tensors using back projection.

    Args:
        model: The model instance with back_projection method
        cfg: Configuration object
        dataset: Dataset name ('front3d' or others)
        frustum_mask: Optional frustum mask tensor
        intrinsic: Intrinsic matrix tensor

    Returns:
        tuple: (kept, mapping) tensors from back projection
    """
    if dataset != "front3d":
        intrinsic = adjust_intrinsic(
            intrinsic,
            tuple(cfg.dataset.target_size),
            tuple(cfg.dataset.reduced_target_size)
        )
    kept, mapping = model.back_projection(
        tuple(cfg.dataset.reduced_target_size[::-1]) + (256,),
        intrinsic,
        frustum_mask
    )
    return kept, mapping


def get_kept_mapping(model, cfg, batch, device):
    """
    Get kept and mapping for a batch of data (used for non-front3d datasets).

    Args:
        model: The model instance with back_projection method
        cfg: Configuration object
        batch: Batch data containing frustum_mask and intrinsic
        device: Device to place tensors on

    Returns:
        tuple: (kept, mapping) tensors
    """
    frustum_mask = batch["frustum_mask"].to(device)
    intrinsic = batch["intrinsic"].float().to(device)
    dataset = cfg.dataset.name

    kept, mapping = prepare_kept_mapping(
        model,
        cfg,
        dataset,
        frustum_mask=frustum_mask,
        intrinsic=intrinsic
    )

    return kept, mapping

def get_norm(norm, out_channels):
    """
    Args:
        norm (str or callable): one of "SyncBN", "GN", "LN";
            or a callable that takes a channel number and returns
            the normalization layer as a nn.Module.
        out_channels (int): number of channels the layer normalizes over.

    Returns:
        nn.Module or None: the normalization layer
    """
    if norm is None:
        return None
    if isinstance(norm, str):
        if len(norm) == 0:
            return None
        norm = {
            "SyncBN": nn.SyncBatchNorm,
            "GN": lambda channels: nn.GroupNorm(32, channels),
            # "LN" assumes a channels-first LayerNorm variant
            # (as in detectron2/ConvNeXt), defined elsewhere in the codebase.
            "LN": lambda channels: LayerNorm(channels),
        }[norm]
    return norm(out_channels)

class Conv2d(nn.Conv2d):
    """
    A wrapper around :class:`torch.nn.Conv2d` to support empty inputs and more features.
    """

    def __init__(self, *args, **kwargs):
        """
        Extra keyword arguments supported in addition to those in `torch.nn.Conv2d`:

        Args:
            norm (nn.Module, optional): a normalization layer
            activation (callable(Tensor) -> Tensor): a callable activation function

        It assumes that the norm layer is used before activation.
        """
        norm = kwargs.pop("norm", None)
        activation = kwargs.pop("activation", None)
        super().__init__(*args, **kwargs)

        self.norm = norm
        self.activation = activation

    def forward(self, x):
        """Forward pass."""
        # torchscript does not support SyncBatchNorm yet
        # https://github.com/pytorch/pytorch/issues/40507
        # and we skip these codes in torchscript since:
        # 1. currently we only support torchscript in evaluation mode
        # 2. features needed by exporting module to torchscript are added in PyTorch 1.6 or
        #    later version, `Conv2d` in these PyTorch versions has already supported empty inputs.
        if not torch.jit.is_scripting():
            # Dynamo doesn't support context managers yet
            is_dynamo_compiling = is_compiling()
            if not is_dynamo_compiling:
                with warnings.catch_warnings(record=True):
                    if x.numel() == 0 and self.training:
                        # https://github.com/pytorch/pytorch/issues/12013
                        assert not isinstance(
                            self.norm, torch.nn.SyncBatchNorm
                        ), "SyncBatchNorm does not support empty inputs!"

        x = F.conv2d(
            x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups
        )
        if self.norm is not None:
            x = self.norm(x)
        if self.activation is not None:
            x = self.activation(x)
        return x
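The `norm`/`activation` hooks of `Conv2d` compose with `get_norm` like this (a minimal sketch; the layer sizes are arbitrary):

```python
import torch
import torch.nn.functional as F

conv = Conv2d(16, 32, kernel_size=3, padding=1,
              norm=get_norm("GN", 32), activation=F.relu)
y = conv(torch.randn(2, 16, 64, 64))
print(y.shape)  # torch.Size([2, 32, 64, 64])
```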
nvpanoptix_3d/utils/point_features.py
ADDED
@@ -0,0 +1,127 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Point feature utils for Mask2former."""

import torch
from torch.nn import functional as F


def point_sample(inputs, point_coords, **kwargs):
    """
    A wrapper around :func:`torch.nn.functional.grid_sample` to support 3D point_coords tensors.
    Unlike :func:`torch.nn.functional.grid_sample` it assumes `point_coords` to lie inside
    the [0, 1] x [0, 1] square.

    Args:
        inputs (Tensor): A tensor of shape (N, C, H, W) that contains features map on a H x W grid.
        point_coords (Tensor): A tensor of shape (N, P, 2) or (N, Hgrid, Wgrid, 2) that contains
            [0, 1] x [0, 1] normalized point coordinates.

    Returns:
        output (Tensor): A tensor of shape (N, C, P) or (N, C, Hgrid, Wgrid) that contains
            features for points in `point_coords`. The features are obtained via bilinear
            interpolation from `inputs` the same way as :func:`torch.nn.functional.grid_sample`.
    """
    add_dim = False
    if point_coords.dim() == 3:
        add_dim = True
        point_coords = point_coords.unsqueeze(2)  # (N, P, 1, 2)
    # grid_sample expects coordinates in [-1, 1], so rescale from [0, 1]
    output = F.grid_sample(inputs, 2.0 * point_coords - 1.0, **kwargs)  # (N, C, P, 1)
    if add_dim:
        output = output.squeeze(3)
    return output  # (N, C, P)

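For instance, sampling a random feature map at two normalized locations:

```python
import torch

feats = torch.randn(1, 8, 32, 32)                  # (N, C, H, W)
coords = torch.tensor([[[0.5, 0.5], [0.0, 0.0]]])  # (N, P, 2) in [0, 1]
sampled = point_sample(feats, coords, align_corners=False)
print(sampled.shape)  # torch.Size([1, 8, 2])
```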
def get_uncertain_point_coords_with_randomness(
    coarse_logits, uncertainty_func, num_points, oversample_ratio, importance_sample_ratio
):
    """
    Sample points in the [0, 1] x [0, 1] coordinate space based on their uncertainty. The
    uncertainties are calculated for each point using the 'uncertainty_func' function that takes
    a point's logit prediction as input.
    See the PointRend paper for details.

    Args:
        coarse_logits (Tensor): A tensor of shape (N, C, Hmask, Wmask) or (N, 1, Hmask, Wmask) for
            class-specific or class-agnostic prediction.
        uncertainty_func: A function that takes a Tensor of shape (N, C, P) or (N, 1, P) that
            contains logit predictions for P points and returns their uncertainties as a Tensor of
            shape (N, 1, P).
        num_points (int): The number of points P to sample.
        oversample_ratio (int): Oversampling parameter.
        importance_sample_ratio (float): Ratio of points that are sampled via importance sampling.

    Returns:
        point_coords (Tensor): A tensor of shape (N, P, 2) that contains the coordinates of P
            sampled points.
    """
    assert oversample_ratio >= 1
    assert 0 <= importance_sample_ratio <= 1
    num_boxes = coarse_logits.shape[0]
    num_sampled = int(num_points * oversample_ratio)
    point_coords = torch.rand(num_boxes, num_sampled, 2, device=coarse_logits.device)
    point_logits = point_sample(coarse_logits, point_coords, align_corners=False)
    # It is crucial to calculate uncertainty based on the sampled prediction value for the points.
    # Calculating uncertainties of the coarse predictions first and sampling them for points leads
    # to incorrect results.
    # To illustrate this: assume uncertainty_func(logits)=-abs(logits), a sampled point between
    # two coarse predictions with -1 and 1 logits has 0 logits, and therefore 0 uncertainty value.
    # However, if we calculate uncertainties for the coarse predictions first,
    # both will have -1 uncertainty, and the sampled point will get -1 uncertainty.
    point_uncertainties = uncertainty_func(point_logits)
    num_uncertain_points = int(importance_sample_ratio * num_points)
    num_random_points = num_points - num_uncertain_points
    idx = torch.topk(point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1]
    shift = num_sampled * torch.arange(num_boxes, dtype=torch.long, device=coarse_logits.device)
    idx += shift[:, None]
    point_coords = point_coords.view(-1, 2)[idx.view(-1), :].view(
        num_boxes, num_uncertain_points, 2
    )
    if num_random_points > 0:
        point_coords = torch.cat(
            [
                point_coords,
                torch.rand(num_boxes, num_random_points, 2, device=coarse_logits.device),
            ],
            dim=1,
        )
    return point_coords

def get_uncertain_point_coords_on_grid(uncertainty_map, num_points):
    """
    Find `num_points` most uncertain points from `uncertainty_map` grid.

    Args:
        uncertainty_map (Tensor): A tensor of shape (N, 1, H, W) that contains uncertainty
            values for a set of points on a regular H x W grid.
        num_points (int): The number of points P to select.

    Returns:
        point_indices (Tensor): A tensor of shape (N, P) that contains indices from
            [0, H x W) of the most uncertain points.
        point_coords (Tensor): A tensor of shape (N, P, 2) that contains [0, 1] x [0, 1] normalized
            coordinates of the most uncertain points from the H x W grid.
    """
    R, _, H, W = uncertainty_map.shape
    h_step = 1.0 / float(H)
    w_step = 1.0 / float(W)

    num_points = min(H * W, num_points)
    point_indices = torch.topk(uncertainty_map.view(R, H * W), k=num_points, dim=1)[1]
    point_coords = torch.zeros(R, num_points, 2, dtype=torch.float, device=uncertainty_map.device)
    point_coords[:, :, 0] = w_step / 2.0 + (point_indices % W).to(torch.float) * w_step
    point_coords[:, :, 1] = h_step / 2.0 + (point_indices // W).to(torch.float) * h_step
    return point_indices, point_coords
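Combined with `point_sample`, the grid variant is used like this (a minimal sketch that takes `-|logit|` as the uncertainty, as in PointRend):

```python
import torch

logits = torch.randn(1, 1, 16, 16)
uncertainty = -logits.abs()  # most uncertain where |logit| is small
idx, coords = get_uncertain_point_coords_on_grid(uncertainty, num_points=32)
refined = point_sample(logits, coords, align_corners=False)  # (1, 1, 32)
print(idx.shape, coords.shape, refined.shape)
```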
nvpanoptix_3d/utils/sparse_tensor.py
ADDED
@@ -0,0 +1,257 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Sparse tensor utils."""

import torch
import MinkowskiEngine as Me
import torch.nn.functional as F
from typing import Optional, Tuple, Dict


def sparse_cat_union(a: Me.SparseTensor, b: Me.SparseTensor):
    """Concatenate the features of two sparse tensors over the union of their coordinates."""
    cm = a.coordinate_manager
    stride = a.tensor_stride
    assert cm == b.coordinate_manager, "different coords_man"
    assert a.tensor_stride == b.tensor_stride, "different tensor_stride"

    # handle empty tensors - if one is empty, return the other
    if a.F.size(0) == 0 or a.F.numel() == 0:
        return b
    if b.F.size(0) == 0 or b.F.numel() == 0:
        return a
    # zero-pad each feature matrix to the combined channel width
    try:
        feats_a = F.pad(a.F, (0, b.F.shape[1]))
    except Exception as e:
        print("Warning: Got error in feats_a:", e)
        return a
    try:
        feats_b = F.pad(b.F, (a.F.shape[1], 0))
    except Exception as e:
        print("Warning: Got error in feats_b:", e)
        return b

    new_a = Me.SparseTensor(
        features=feats_a,
        coordinate_map_key=a.coordinate_key,
        coordinate_manager=cm,
        tensor_stride=stride,
    )

    new_b = Me.SparseTensor(
        features=feats_b,
        coordinate_map_key=b.coordinate_key,
        coordinate_manager=cm,
        tensor_stride=stride,
    )

    return new_a + new_b

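A sketch of the union concat, assuming two MinkowskiEngine tensors that share a coordinate manager (which the asserts above require); exact ME construction details may vary by version:

```python
import torch
import MinkowskiEngine as Me

coords = torch.IntTensor([[0, 0, 0, 0], [0, 1, 1, 1]])  # (batch, x, y, z)
a = Me.SparseTensor(features=torch.randn(2, 4), coordinates=coords)
b = Me.SparseTensor(features=torch.randn(2, 3), coordinates=coords,
                    coordinate_manager=a.coordinate_manager)
ab = sparse_cat_union(a, b)
print(ab.F.shape)  # torch.Size([2, 7]): 4 channels from `a`, 3 from `b`
```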
def to_dense(
    tensor: Me.SparseTensor,
    shape: Optional[torch.Size] = None,
    min_coordinate: Optional[torch.IntTensor] = None,
    contract_stride: bool = True,
    default_value: float = 0.0
) -> Tuple[torch.Tensor, torch.IntTensor, torch.IntTensor]:
    """Convert the :attr:`MinkowskiEngine.SparseTensor` to a torch dense tensor.

    Args:
        :attr:`shape` (torch.Size, optional): The size of the output tensor.
        :attr:`min_coordinate` (torch.IntTensor, optional): The min
            coordinates of the output sparse tensor. Must be divisible by the
            current :attr:`tensor_stride`. If 0 is given, it will use the origin for the min
            coordinate.
        :attr:`contract_stride` (bool, optional): The output coordinates
            will be divided by the tensor stride to make features spatially
            contiguous. True by default.
        :attr:`default_value` (float, optional): Fill value for cells without a sparse entry.

    Returns:
        :attr:`tensor` (torch.Tensor): the torch tensor with size `[Batch
            Dim, Feature Dim, Spatial Dim..., Spatial Dim]`. The coordinate of
            each feature can be accessed via `min_coordinate + tensor_stride *
            [the coordinate of the dense tensor]`.
        :attr:`min_coordinate` (torch.IntTensor): the D-dimensional vector
            defining the minimum coordinate of the output tensor.
        :attr:`tensor_stride` (torch.IntTensor): the D-dimensional vector
            defining the stride between tensor elements.
    """
    if min_coordinate is not None:
        assert isinstance(min_coordinate, torch.IntTensor)
        assert min_coordinate.numel() == tensor._D
    if shape is not None:
        assert isinstance(shape, torch.Size)
        assert len(shape) == tensor._D + 2  # batch and channel
        if shape[1] != tensor._F.size(1):
            shape = torch.Size([shape[0], tensor._F.size(1), *[s for s in shape[2:]]])

    # exception handling for empty tensor
    if tensor.__len__() == 0:
        assert shape is not None, "shape is required to densify an empty tensor"
        return (
            torch.zeros(shape, dtype=tensor.dtype, device=tensor.device),
            torch.zeros(tensor._D, dtype=torch.int32, device=tensor.device),
            tensor.tensor_stride,
        )

    # use int tensor for all operations
    tensor_stride = torch.IntTensor(tensor.tensor_stride).to(tensor.device)

    # new coordinates
    batch_indices = tensor.C[:, 0]

    if min_coordinate is None:
        min_coordinate, _ = tensor.C.min(0, keepdim=True)
        min_coordinate = min_coordinate[:, 1:]
        if not torch.all(min_coordinate >= 0):
            raise ValueError(
                f"Coordinate has a negative value: {min_coordinate}. Please provide min_coordinate argument"
            )
        coords = tensor.C[:, 1:]
    elif isinstance(min_coordinate, int) and min_coordinate == 0:
        coords = tensor.C[:, 1:]
    else:
        min_coordinate = min_coordinate.to(tensor.device)
        if min_coordinate.ndim == 1:
            min_coordinate = min_coordinate.unsqueeze(0)
        coords = tensor.C[:, 1:] - min_coordinate

        assert (
            min_coordinate % tensor_stride
        ).sum() == 0, "The minimum coordinates must be divisible by the tensor stride."

    if coords.ndim == 1:
        coords = coords.unsqueeze(1)

    # return the contracted tensor
    if contract_stride:
        coords = torch.div(coords, tensor_stride, rounding_mode="floor")

    nchannels = tensor.F.size(1)
    if shape is None:
        size = coords.max(0)[0] + 1
        shape = torch.Size(
            [batch_indices.max() + 1, nchannels, *size.cpu().numpy()]
        )

    dense_F = torch.full(
        shape, dtype=tensor.F.dtype,
        device=tensor.F.device, fill_value=default_value
    )

    tcoords = coords.t().long()
    batch_indices = batch_indices.long()

    indices = (batch_indices, slice(None), *tcoords)
    dense_F[indices] = tensor.F

    tensor_stride = torch.IntTensor(tensor.tensor_stride)
    return dense_F, min_coordinate, tensor_stride

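A sketch of densification on a tiny single-batch tensor; with `shape=None` the dense extent is inferred from the coordinate maxima:

```python
import torch
import MinkowskiEngine as Me

coords = torch.IntTensor([[0, 0, 0, 0], [0, 2, 1, 0]])  # (batch, x, y, z)
feats = torch.tensor([[1.0], [2.0]])
st = Me.SparseTensor(features=feats, coordinates=coords)
dense, min_coord, stride = to_dense(st, default_value=0.0)
print(dense.shape)  # torch.Size([1, 1, 3, 2, 1])
```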
def _thicken_grid(grid, grid_dims, frustum_mask):
    """Thicken (dilate) an occupancy grid by one voxel in each direction."""
    device = frustum_mask.device
    # the 27 neighborhood offsets in {0, 1, 2}^3
    offsets = torch.nonzero(torch.ones(3, 3, 3, device=device)).long()
    locs_grid = grid.nonzero(as_tuple=False)
    locs = locs_grid.unsqueeze(1).repeat(1, 27, 1)
    locs += offsets
    locs = locs.view(-1, 3)
    masks = ((locs >= 0) & (locs < torch.as_tensor(grid_dims, device=device))).all(-1)
    locs = locs[masks]

    thicken = torch.zeros(grid_dims, dtype=torch.bool, device=device)
    thicken[locs[:, 0], locs[:, 1], locs[:, 2]] = True
    # frustum culling
    thicken = thicken & frustum_mask

    return thicken


def prepare_instance_masks_thicken(
    instances: torch.Tensor,
    semantic_mapping: Dict[int, int],
    distance_field: torch.Tensor,
    frustum_mask: torch.Tensor,
    iso_value: float = 1.0,
    truncation: float = 3.0,
    downsample_factor: int = 1
) -> Dict[int, Tuple[torch.Tensor, int]]:
    """Prepare per-instance occupancy masks, thickened and clipped to the frustum."""
    # check if downsample factor is valid
    assert isinstance(downsample_factor, int) and 256 % downsample_factor == 0
    grid_dims = [256, 256, 256]
    need_rescale = downsample_factor != 1
    if need_rescale:
        grid_dims = (torch.as_tensor(grid_dims) // downsample_factor).tolist()
        frustum_mask = F.interpolate(frustum_mask[None, None].float(),
                                     size=grid_dims, mode="nearest").squeeze(0, 1).bool()

    instance_information = {}

    for instance_id, semantic_class in semantic_mapping.items():
        instance_mask: torch.Tensor = (instances == instance_id)
        instance_distance_field = torch.full_like(
            instance_mask,
            dtype=torch.float,
            fill_value=truncation
        )
        instance_distance_field[instance_mask] = distance_field.squeeze()[instance_mask]
        instance_distance_field_masked = instance_distance_field.abs() < iso_value

        if need_rescale:
            instance_distance_field_masked = F.max_pool3d(
                instance_distance_field_masked[None, None].float(),
                kernel_size=downsample_factor + 1,
                stride=downsample_factor,
                padding=1
            ).squeeze(0, 1).bool()

        # instance_grid = instance_grid & frustum_mask
        instance_grid = _thicken_grid(
            instance_distance_field_masked,
            grid_dims,
            frustum_mask
        )
        instance_grid: torch.Tensor = instance_grid.to(torch.device("cpu"), non_blocking=True)
        instance_information[instance_id] = instance_grid, semantic_class

    return instance_information

def mask_invalid_sparse_voxels(
    grid: Me.SparseTensor,
    mask=None, frustum_dim=[256, 256, 256]
) -> Me.SparseTensor:
    """Mask out sparse voxels that fall outside the frustum grid."""
    # Mask out voxels which are outside of the grid
    valid_mask = (grid.C[:, 1] < frustum_dim[0] - 1) & (grid.C[:, 1] >= 0) & \
                 (grid.C[:, 2] < frustum_dim[1] - 1) & (grid.C[:, 2] >= 0) & \
                 (grid.C[:, 3] < frustum_dim[2] - 1) & (grid.C[:, 3] >= 0)
    if mask is not None:
        valid_mask = valid_mask * mask
    num_valid_coordinates = valid_mask.sum()

    if num_valid_coordinates == 0:
        return {}, {}

    num_masked_voxels = grid.C.size(0) - num_valid_coordinates
    grids_needs_to_be_pruned = num_masked_voxels > 0

    # Only prune if there are invalid voxels
    if grids_needs_to_be_pruned:
        grid = Me.MinkowskiPruning()(grid, valid_mask)

    return grid
preprocessing.py
ADDED
@@ -0,0 +1,328 @@
#!/usr/bin/env python3
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Preprocessing utilities for Panoptic Recon 3D model.

This module provides functions for:
- Image preprocessing and resizing
- Frustum mask generation
- Camera intrinsic handling
"""
import sys
from fvcore.transforms.transform import Transform
from typing import Optional, Tuple, Union
import numpy as np
import torch
import cv2
from PIL import Image


# Default Front3D camera intrinsic matrix
DEFAULT_INTRINSIC = np.array([
    [277.1281435, 0., 159.5, 0.],
    [0., 277.1281435, 119.5, 0.],
    [0., 0., 1., 0.],
    [0., 0., 0., 1.]
], dtype=np.float32)

# Default model parameters
DEFAULT_GRID_DIMS = (256, 256, 256)
DEFAULT_DEPTH_RANGE = (0.4, 6.0)
DEFAULT_VOXEL_SIZE = 0.03
DEFAULT_IMG_SIZE = (240, 320)  # (height, width)


def create_frustum_mask(
    intrinsics: Union[np.ndarray, torch.Tensor],
    volume_shape: Tuple[int, int, int] = DEFAULT_GRID_DIMS,
    depth_range: Tuple[float, float] = DEFAULT_DEPTH_RANGE,
    image_shape: Optional[Tuple[int, int]] = DEFAULT_IMG_SIZE,
    voxel_size: float = DEFAULT_VOXEL_SIZE,
    padding_pixels: float = 0.0,
    volume_origin: Optional[np.ndarray] = None,
    z_axis_reversed: bool = False,
) -> np.ndarray:
    """
    Create a frustum mask for a voxel volume based on camera intrinsics.

    This function determines which voxels in a 3D volume are visible from a camera
    by checking if they project within the image bounds and depth range.

    Args:
        intrinsics: Camera intrinsic matrix (3x3 or 4x4).
        volume_shape: Shape of the voxel volume (nx, ny, nz).
        depth_range: Min and max depth in meters (z_min, z_max).
        image_shape: Image dimensions (height, width). If None, inferred from principal point.
        voxel_size: Size of each voxel in meters.
        padding_pixels: Expand frustum bounds by this many pixels.
        volume_origin: Origin of the volume in camera space. If None, auto-computed.
        z_axis_reversed: If True, z-index 0 is farthest.

    Returns:
        frustum_mask: Boolean mask of shape volume_shape indicating voxels inside frustum.
    """
    # Convert to numpy if tensor
    if isinstance(intrinsics, torch.Tensor):
        intrinsics = intrinsics.cpu().numpy()

    # Ensure numpy array
    intrinsics = np.asarray(intrinsics, dtype=np.float64)

    assert intrinsics.shape in [(3, 3), (4, 4)], \
        f"Intrinsics must be 3x3 or 4x4, got shape {intrinsics.shape}"
    assert voxel_size > 0, f"voxel_size must be positive, got {voxel_size}"
    assert depth_range[0] < depth_range[1], \
        f"depth_range must be (min, max) with min < max, got {depth_range}"
    assert depth_range[0] > 0, f"depth_range min must be positive, got {depth_range[0]}"

    # Extract camera parameters
    K = intrinsics[:3, :3] if intrinsics.shape == (4, 4) else intrinsics
    fx, fy = K[0, 0], K[1, 1]
    cx, cy = K[0, 2], K[1, 2]

    # Determine image shape
    if image_shape is None:
        image_height = int(2 * cy)
        image_width = int(2 * cx)
    else:
        image_height, image_width = image_shape

    # Image bounds with padding
    u_min = -padding_pixels
    u_max = image_width + padding_pixels
    v_min = -padding_pixels
    v_max = image_height + padding_pixels

    # Set volume origin
    if volume_origin is None:
        volume_origin = np.array([
            -(volume_shape[0] * voxel_size) / 2,
            -(volume_shape[1] * voxel_size) / 2,
            (depth_range[0] + depth_range[1]) / 2 - (volume_shape[2] * voxel_size) / 2
        ])

    # Create voxel grid coordinates
    x_coords = np.arange(volume_shape[0]) * voxel_size + volume_origin[0]
    y_coords = np.arange(volume_shape[1]) * voxel_size + volume_origin[1]
    z_coords = np.arange(volume_shape[2]) * voxel_size + volume_origin[2]

    if z_axis_reversed:
        z_coords = z_coords[::-1]

    # Create meshgrid
    xx, yy, zz = np.meshgrid(x_coords, y_coords, z_coords, indexing='ij')
    voxel_centers = np.stack([xx.ravel(), yy.ravel(), zz.ravel()], axis=-1)

    # Depth constraint
    depth_mask = (voxel_centers[:, 2] >= depth_range[0]) & (voxel_centers[:, 2] <= depth_range[1])

    # Project to image plane
    valid_depth = voxel_centers[:, 2] > 1e-6
    u = np.full(len(voxel_centers), -1.0)
    v = np.full(len(voxel_centers), -1.0)

    u[valid_depth] = (fx * voxel_centers[valid_depth, 0] / voxel_centers[valid_depth, 2]) + cx
    v[valid_depth] = (fy * voxel_centers[valid_depth, 1] / voxel_centers[valid_depth, 2]) + cy

    # Image bounds check
    image_mask = (u >= u_min) & (u < u_max) & (v >= v_min) & (v < v_max)

    # Combine masks
    frustum_mask_1d = depth_mask & image_mask & valid_depth
    frustum_mask = frustum_mask_1d.reshape(volume_shape)

    return frustum_mask

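With the module defaults, a mask for the standard Front3D setup is one call:

```python
mask = create_frustum_mask(DEFAULT_INTRINSIC)
print(mask.shape, mask.dtype, mask.mean())  # (256, 256, 256) bool, fraction of visible voxels
```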
def get_output_shape(
    oldh: int,
    oldw: int,
    short_edge_length: int,
    max_size: int
) -> Tuple[int, int]:
    """Compute output size given input size and target short edge length."""
    h, w = oldh, oldw
    size = short_edge_length * 1.0
    scale = size / min(h, w)
    if h < w:
        newh, neww = size, scale * w
    else:
        newh, neww = scale * h, size
    if max(newh, neww) > max_size:
        scale = max_size * 1.0 / max(newh, neww)
        newh = newh * scale
        neww = neww * scale
    neww = int(neww + 0.5)
    newh = int(newh + 0.5)
    return (newh, neww)

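A quick numeric check: a 480x640 image with short edge 240, capped at a max size of 320, maps to (240, 320):

```python
print(get_output_shape(480, 640, short_edge_length=240, max_size=320))  # (240, 320)
```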
class ResizeShortestEdge(Transform):
    """Resize an image so its shortest edge matches a target length."""

    def __init__(
        self,
        orig_size: Tuple[int, int],
        short_edge_length,
        max_size=sys.maxsize,
        interp=cv2.INTER_LINEAR,
        prob=1.0
    ):
        """Resize shortest edge transform."""
        super().__init__()
        self.orig_size = orig_size
        if isinstance(short_edge_length, int):
            short_edge_length = (short_edge_length, short_edge_length)
        self.short_edge_length = short_edge_length
        self.max_size = max_size
        self.interp = interp
        self.prob = prob
        self._get_output_shape()

    def _get_output_shape(self):
        """Pick a target short edge length at random and derive the output shape."""
        h, w = self.orig_size
        self.new_size = None
        size = np.random.choice(self.short_edge_length)
        if size != 0:
            hh, ww = get_output_shape(h, w, size, self.max_size)
            self.new_size = (ww, hh)

    def apply_coords(self, coords):
        """Apply transforms to the coordinates."""
        return coords

    def apply_image(self, img, interp=None):
        """Apply transforms to the image."""
        new_h, new_w = self.new_size
        return cv2.resize(img, (new_w, new_h), interpolation=self.interp)

    def apply_segmentation(self, segmentation):
        """Apply transforms to the segmentation."""
        new_h, new_w = self.new_size
        return cv2.resize(segmentation, (new_w, new_h), interpolation=cv2.INTER_NEAREST)

def adjust_intrinsic(
    intrinsic: Union[np.ndarray, torch.Tensor],
    original_size: Tuple[int, int],
    target_size: Tuple[int, int],
) -> Union[np.ndarray, torch.Tensor]:
    """Adjust intrinsic matrix for image resize.

    Args:
        intrinsic: Camera intrinsic matrix (4x4 or 3x3).
        original_size: Original image size (width, height).
        target_size: Target image size (width, height).

    Returns:
        Adjusted intrinsic matrix.
    """
    is_tensor = isinstance(intrinsic, torch.Tensor)
    if is_tensor:
        device = intrinsic.device
        dtype = intrinsic.dtype
        intrinsic = intrinsic.cpu().numpy()

    intrinsic = intrinsic.copy()

    scale_x = target_size[0] / original_size[0]
    scale_y = target_size[1] / original_size[1]

    # Adjust focal length and principal point
    intrinsic[0, 0] *= scale_x  # fx
    intrinsic[1, 1] *= scale_y  # fy
    intrinsic[0, 2] *= scale_x  # cx
    intrinsic[1, 2] *= scale_y  # cy

    if is_tensor:
        intrinsic = torch.from_numpy(intrinsic).to(device=device, dtype=dtype)

    return intrinsic

def load_image(
    image_path: str,
    target_size: Tuple[int, int] = (320, 240),
    apply_resize_transform: bool = True,
) -> np.ndarray:
    """Load and preprocess image for Panoptic Recon 3D inference.

    This function matches the preprocessing in test_triton_server.py exactly:
    1. Load image as RGB
    2. Resize to target_size (default 320x240)
    3. Apply ResizeShortestEdge transform (short_edge=240, max_size=320)
    4. Convert to CHW format with batch dimension

    Args:
        image_path: Path to image file.
        target_size: Target size (width, height). Default (320, 240).
        apply_resize_transform: Whether to apply ResizeShortestEdge transform.

    Returns:
        Image as numpy array (1, C, H, W) in RGB format, uint8 dtype.
    """
    # Load image
    img = Image.open(image_path).convert('RGB')
    if img is None:
        raise FileNotFoundError(f"Could not load image: {image_path}")

    # Resize to target size
    img = img.resize(target_size)
    img = np.array(img)

    # Apply ResizeShortestEdge transform (matches test_triton_server.py)
    if apply_resize_transform:
        resize_instance = ResizeShortestEdge(
            orig_size=(target_size[0], target_size[1]),  # (width, height)
            short_edge_length=240,
            max_size=320,
        )
        img = resize_instance.apply_image(img)

    # Convert to CHW format with contiguous memory (critical for torch.from_numpy)
    image = np.ascontiguousarray(img.transpose(2, 0, 1))

    # Add batch dimension: (C, H, W) -> (1, C, H, W)
    image = image[np.newaxis, ...]

    return image


class DatasetConstants:
    """Constants for Front3D dataset."""
    DEFAULT_GRID_DIMS = [256, 256, 256]
    DEFAULT_DEPTH_RANGE = (0.4, 6.0)
    DEFAULT_VOXEL_SIZE = 0.03
    DEFAULT_IMG_SIZE = (240, 320)  # (height, width)
    IGNORE_LABEL = 255

    INTRINSIC = DEFAULT_INTRINSIC

    CATEGORIES = [
        {"color": (220, 20, 60), "isthing": 1, "id": 1, "trainId": 1, "name": "cabinet"},
        {"color": (255, 0, 0), "isthing": 1, "id": 2, "trainId": 2, "name": "bed"},
        {"color": (0, 0, 142), "isthing": 1, "id": 3, "trainId": 3, "name": "chair"},
        {"color": (0, 0, 70), "isthing": 1, "id": 4, "trainId": 4, "name": "sofa"},
        {"color": (0, 60, 100), "isthing": 1, "id": 5, "trainId": 5, "name": "table"},
        {"color": (0, 80, 100), "isthing": 1, "id": 6, "trainId": 6, "name": "desk"},
        {"color": (0, 0, 230), "isthing": 1, "id": 7, "trainId": 7, "name": "dresser"},
        {"color": (119, 11, 32), "isthing": 1, "id": 8, "trainId": 8, "name": "lamp"},
        {"color": (190, 50, 60), "isthing": 1, "id": 9, "trainId": 9, "name": "other"},
        {"color": (102, 102, 156), "isthing": 0, "id": 10, "trainId": 10, "name": "wall"},
        {"color": (128, 64, 128), "isthing": 0, "id": 11, "trainId": 11, "name": "floor"},
        {"color": (70, 70, 70), "isthing": 0, "id": 12, "trainId": 12, "name": "ceiling"},
    ]

    STUFF_CLASSES = [10, 11]
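Putting the preprocessing pieces together for a single frame (a minimal sketch; `room.jpg` is a placeholder path):

```python
image = load_image("room.jpg")  # (1, 3, 240, 320) uint8
frustum_mask = create_frustum_mask(DatasetConstants.INTRINSIC)
print(image.shape, frustum_mask.shape)
```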
requirements.txt
ADDED
@@ -0,0 +1,30 @@
numpy==1.26.4
omegaconf
addict
pytest
pytorch-lightning
kubernetes
nvidia-eff
timm
open_clip_torch
colorama
pycocotools
fvcore
opencv-python-headless
huggingface-hub
trimesh
torchdata
pycryptodome
plyfile
pyexr
OpenEXR
einops
scikit-fmm
pysdf
scipy
plotly
orjson
pymongo
matplotlib
hydra-core
scikit-image
visualization.py
ADDED
@@ -0,0 +1,470 @@
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
|
| 16 |
+
"""
|
| 17 |
+
Visualization utilities for Panoptic Recon 3D model outputs.
|
| 18 |
+
|
| 19 |
+
This module provides functions for:
|
| 20 |
+
- 2D segmentation visualization
|
| 21 |
+
- Depth map visualization
|
| 22 |
+
- 3D mesh extraction and PLY export
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
from pathlib import Path
|
| 26 |
+
from typing import Optional, Tuple, Union
|
| 27 |
+
|
| 28 |
+
import numpy as np
|
| 29 |
+
|
| 30 |
+
# Optional imports for visualization
|
| 31 |
+
try:
|
| 32 |
+
import matplotlib.pyplot as plt
|
| 33 |
+
import matplotlib.patches as mpatches
|
| 34 |
+
HAS_MATPLOTLIB = True
|
| 35 |
+
except ImportError:
|
| 36 |
+
HAS_MATPLOTLIB = False
|
| 37 |
+
|
| 38 |
+
try:
|
| 39 |
+
from PIL import Image
|
| 40 |
+
HAS_PIL = True
|
| 41 |
+
except ImportError:
|
| 42 |
+
HAS_PIL = False
|
| 43 |
+
|
| 44 |
+
try:
|
| 45 |
+
from skimage import measure
|
| 46 |
+
HAS_SKIMAGE = True
|
| 47 |
+
except ImportError:
|
| 48 |
+
HAS_SKIMAGE = False
|
| 49 |
+
|
| 50 |
+
try:
|
| 51 |
+
from scipy.spatial import KDTree
|
| 52 |
+
HAS_SCIPY = True
|
| 53 |
+
except ImportError:
|
| 54 |
+
HAS_SCIPY = False
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def create_color_palette() -> np.ndarray:
    """Create the Front3D color palette for semantic classes.

    Returns:
        Color palette as a numpy array (N, 3) with uint8 RGB values.
    """
    return np.array([
        (0, 0, 0),        # 0: background
        (174, 199, 232),  # 1: wall
        (152, 223, 138),  # 2: floor
        (31, 119, 180),   # 3: cabinet
        (255, 187, 120),  # 4: bed
        (188, 189, 34),   # 5: chair
        (140, 86, 75),    # 6: sofa
        (255, 152, 150),  # 7: table
        (214, 39, 40),    # 8: door
        (197, 176, 213),  # 9: window
        (148, 103, 189),  # 10: bookshelf
        (196, 156, 148),  # 11: picture
        (23, 190, 207),   # 12: counter
        (178, 76, 76),    # 13
        (247, 182, 210),  # 14: desk
        (66, 188, 102),   # 15
        (219, 219, 141),  # 16: curtain
        (140, 57, 197),   # 17
        (202, 185, 52),   # 18
        (51, 176, 203),   # 19
        (200, 54, 131),   # 20
        (92, 193, 61),    # 21
        (78, 71, 183),    # 22
        (172, 114, 82),   # 23
        (255, 127, 14),   # 24: refrigerator
        (91, 163, 138),   # 25
        (153, 98, 156),   # 26
        (140, 153, 101),  # 27
        (158, 218, 229),  # 28: shower curtain
        (100, 125, 154),  # 29
        (178, 127, 135),  # 30
        (120, 185, 128),  # 31
        (146, 111, 194),  # 32
        (44, 160, 44),    # 33: toilet
        (112, 128, 144),  # 34: sink
        (96, 207, 209),   # 35
        (227, 119, 194),  # 36: bathtub
        (213, 92, 176),   # 37
        (94, 106, 211),   # 38
        (82, 84, 163),    # 39: otherfurn
        (100, 85, 144),   # 40
        (172, 172, 172),  # 41
    ], dtype=np.uint8)
def colorize_segmentation(
    segmentation: np.ndarray,
    palette: Optional[np.ndarray] = None,
) -> np.ndarray:
    """Colorize a segmentation map.

    Args:
        segmentation: Segmentation map (H, W) with class indices.
        palette: Color palette (N, 3). Uses the default if None.

    Returns:
        Colorized image (H, W, 3) as uint8.
    """
    if palette is None:
        palette = create_color_palette()

    # Clip indices to the valid palette range
    seg_clipped = np.clip(segmentation, 0, len(palette) - 1)
    return palette[seg_clipped]
def visualize_2d_segmentation(
    image: np.ndarray,
    panoptic_2d: np.ndarray,
    output_path: Optional[Union[str, Path]] = None,
    alpha: float = 0.6,
    figsize: Tuple[int, int] = (18, 6),
    dpi: int = 150,
) -> Optional[np.ndarray]:
    """Visualize 2D panoptic segmentation overlaid on the image.

    Args:
        image: Original RGB image (H, W, C).
        panoptic_2d: Panoptic segmentation map (H, W).
        output_path: Path to save the visualization. If None, returns the array.
        alpha: Blend alpha for the overlay.
        figsize: Figure size.
        dpi: DPI for the saved figure.

    Returns:
        Overlay image as a numpy array if output_path is None.
    """
    if not HAS_MATPLOTLIB:
        raise ImportError("matplotlib required for visualization")
    if not HAS_PIL:
        raise ImportError("PIL required for visualization")

    # Get color palette
    palette = create_color_palette()
    colored_seg = colorize_segmentation(panoptic_2d, palette)

    # Resize image to match segmentation if needed
    if image.shape[:2] != panoptic_2d.shape:
        image_pil = Image.fromarray(image)
        image_pil = image_pil.resize((panoptic_2d.shape[1], panoptic_2d.shape[0]), Image.LANCZOS)
        image = np.array(image_pil)

    # Create overlay
    overlay = (image.astype(np.float32) * (1 - alpha) + colored_seg.astype(np.float32) * alpha)
    overlay = overlay.clip(0, 255).astype(np.uint8)

    if output_path is None:
        return overlay

    # Create side-by-side visualization
    fig, axes = plt.subplots(1, 3, figsize=figsize)

    axes[0].imshow(image)
    axes[0].set_title('Original Image', fontsize=14, fontweight='bold')
    axes[0].axis('off')

    axes[1].imshow(colored_seg)
    axes[1].set_title('Panoptic Segmentation', fontsize=14, fontweight='bold')
    axes[1].axis('off')

    axes[2].imshow(overlay)
    axes[2].set_title('Overlay', fontsize=14, fontweight='bold')
    axes[2].axis('off')

    plt.tight_layout()
    plt.savefig(output_path, dpi=dpi, bbox_inches='tight')
    plt.close()

    print(f"✓ Saved 2D segmentation visualization to: {output_path}")
    return None
def visualize_depth_map(
    depth_2d: np.ndarray,
    output_path: Optional[Union[str, Path]] = None,
    vmin: float = 0.0,
    vmax: float = 6.0,
    cmap: str = 'viridis',
    figsize: Tuple[int, int] = (10, 8),
    dpi: int = 150,
) -> Optional[np.ndarray]:
    """Visualize a depth map.

    Args:
        depth_2d: Depth map (H, W).
        output_path: Path to save the visualization. If None, returns the array.
        vmin: Minimum depth for the colormap.
        vmax: Maximum depth for the colormap.
        cmap: Matplotlib colormap name.
        figsize: Figure size.
        dpi: DPI for the saved figure.

    Returns:
        Colorized depth as a numpy array if output_path is None.
    """
    if not HAS_MATPLOTLIB:
        raise ImportError("matplotlib required for visualization")

    # Normalize depth to [0, 1]
    depth_norm = (depth_2d - vmin) / (vmax - vmin)
    depth_norm = np.clip(depth_norm, 0, 1)

    # Get colormap
    cm = plt.get_cmap(cmap)
    depth_colored = (cm(depth_norm)[:, :, :3] * 255).astype(np.uint8)

    if output_path is None:
        return depth_colored

    fig, ax = plt.subplots(1, 1, figsize=figsize)

    im = ax.imshow(depth_2d, cmap=cmap, vmin=vmin, vmax=vmax)
    ax.set_title('Depth Map', fontsize=14, fontweight='bold')
    ax.axis('off')

    cbar = plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
    cbar.set_label('Depth (m)', rotation=270, labelpad=20, fontsize=12)

    plt.tight_layout()
    plt.savefig(output_path, dpi=dpi, bbox_inches='tight')
    plt.close()

    print(f"✓ Saved depth map visualization to: {output_path}")
    return None
def get_mesh(
    distance_field: np.ndarray,
    iso_value: float = 1.0,
    spacing: Tuple[float, float, float] = (1.0, 1.0, 1.0),
) -> Tuple[np.ndarray, np.ndarray]:
    """Extract a mesh from a distance field using marching cubes.

    Args:
        distance_field: 3D distance field (D, H, W).
        iso_value: Iso-surface value.
        spacing: Voxel spacing.

    Returns:
        vertices: Mesh vertices (N, 3).
        faces: Mesh faces (M, 3).
    """
    if not HAS_SKIMAGE:
        raise ImportError("scikit-image required for mesh extraction")

    vertices, faces, _, _ = measure.marching_cubes(
        distance_field,
        level=iso_value,
        spacing=spacing
    )
    return vertices, faces
def write_ply(
    vertices: np.ndarray,
    output_file: Union[str, Path],
    colors: Optional[np.ndarray] = None,
    faces: Optional[np.ndarray] = None,
) -> None:
    """Write an ASCII PLY file.

    Args:
        vertices: Vertex positions (N, 3).
        output_file: Output PLY file path.
        colors: Optional vertex colors (N, 3) as uint8.
        faces: Optional face indices (M, 3).
    """
    with open(output_file, "w") as f:
        f.write("ply\n")
        f.write("format ascii 1.0\n")
        f.write(f"element vertex {len(vertices)}\n")
        f.write("property float x\n")
        f.write("property float y\n")
        f.write("property float z\n")

        if colors is not None:
            f.write("property uchar red\n")
            f.write("property uchar green\n")
            f.write("property uchar blue\n")

        if faces is not None and len(faces) > 0:
            f.write(f"element face {len(faces)}\n")
            f.write("property list uchar uint vertex_indices\n")

        f.write("end_header\n")

        # Write vertices
        if colors is not None:
            for v, c in zip(vertices, colors):
                f.write(f"{v[0]} {v[1]} {v[2]} {int(c[0])} {int(c[1])} {int(c[2])}\n")
        else:
            for v in vertices:
                f.write(f"{v[0]} {v[1]} {v[2]}\n")

        # Write faces
        if faces is not None:
            for face in faces:
                f.write(f"3 {face[0]} {face[1]} {face[2]}\n")
def save_3d_mesh(
    geometry_3d: np.ndarray,
    semantic_3d: np.ndarray,
    output_path: Union[str, Path],
    iso_value: float = 1.0,
    voxel_size: float = 0.03,
) -> bool:
    """Extract and save a 3D mesh with semantic colors.

    Args:
        geometry_3d: 3D geometry/TSDF (D, H, W).
        semantic_3d: 3D semantic segmentation (D, H, W).
        output_path: Output PLY file path.
        iso_value: Iso-surface value for mesh extraction.
        voxel_size: Voxel size in meters.

    Returns:
        True if successful, False otherwise.
    """
    if not HAS_SKIMAGE:
        print("Warning: scikit-image not installed. Cannot save PLY mesh.")
        return False
    if not HAS_SCIPY:
        print("Warning: scipy not installed. Cannot color mesh by semantics.")

    try:
        # Extract mesh
        vertices, faces = get_mesh(
            geometry_3d,
            iso_value=iso_value,
            spacing=(voxel_size, voxel_size, voxel_size)
        )

        colors = None
        if HAS_SCIPY and np.any(semantic_3d):
            # Get non-zero labeled voxels
            nonzero_coords = np.stack(semantic_3d.nonzero(), axis=-1)

            if len(nonzero_coords) > 0:
                # Build a KD-tree for nearest-neighbor lookup
                labels_kd = KDTree(nonzero_coords)
                palette = create_color_palette()

                # Create color volume
                semantic_clipped = np.clip(semantic_3d, 0, len(palette) - 1).astype(np.uint32)
                color_volume = palette[semantic_clipped]

                # Find the nearest labeled voxel for each vertex,
                # scaling vertices back to voxel indices first
                vertex_indices = (vertices / voxel_size).astype(int)
                neighbor_inds = labels_kd.query(vertex_indices)[1]
                neighbors = labels_kd.data[neighbor_inds].astype(int)

                # Clip to valid indices
                neighbors = np.clip(neighbors, 0, np.array(color_volume.shape[:3]) - 1)
                colors = color_volume[neighbors[:, 0], neighbors[:, 1], neighbors[:, 2]]

        # Write PLY
        write_ply(vertices, output_path, colors, faces)
        print(f"✓ Saved 3D mesh to: {output_path}")
        print(f"  Vertices: {len(vertices)}, Faces: {len(faces)}")
        return True

    except Exception as e:
        print(f"Warning: Failed to save 3D mesh: {e}")
        return False
def save_outputs(
    outputs,
    output_dir: Union[str, Path],
    original_image: Optional[np.ndarray] = None,
    save_mesh: bool = True,
    save_depth: bool = True,
    save_segmentation: bool = True,
    save_numpy: bool = True,
) -> dict:
    """Save all model outputs to a directory.

    Args:
        outputs: PanopticRecon3DOutput from the model.
        output_dir: Output directory.
        original_image: Optional original input image for visualization.
        save_mesh: Whether to save 3D mesh PLY files.
        save_depth: Whether to save the depth visualization.
        save_segmentation: Whether to save the segmentation visualization.
        save_numpy: Whether to save raw numpy arrays.

    Returns:
        Dictionary of saved file paths.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    saved_files = {}

    # Convert outputs to numpy
    outputs_np = outputs.to_numpy()

    # Save numpy arrays
    if save_numpy:
        for name, arr in outputs_np.items():
            npy_path = output_dir / f"{name}.npy"
            np.save(npy_path, arr)
            saved_files[f"{name}_npy"] = str(npy_path)

    # Save 2D segmentation visualization
    if save_segmentation and original_image is not None:
        seg_path = output_dir / "panoptic_2d_visualization.png"
        visualize_2d_segmentation(
            original_image,
            outputs_np["panoptic_seg_2d"],
            seg_path
        )
        saved_files["segmentation_vis"] = str(seg_path)

    # Save depth visualization
    if save_depth:
        depth_path = output_dir / "depth_visualization.png"
        visualize_depth_map(
            outputs_np["depth_2d"],
            depth_path
        )
        saved_files["depth_vis"] = str(depth_path)

    # Save 3D meshes
    if save_mesh:
        # Semantic mesh
        semantic_mesh_path = output_dir / "mesh_semantic.ply"
        if save_3d_mesh(
            outputs_np["geometry_3d"],
            outputs_np["semantic_seg_3d"],
            semantic_mesh_path
        ):
            saved_files["semantic_mesh"] = str(semantic_mesh_path)

        # Panoptic mesh
        panoptic_mesh_path = output_dir / "mesh_panoptic.ply"
        if save_3d_mesh(
            outputs_np["geometry_3d"],
            outputs_np["panoptic_seg_3d"],
            panoptic_mesh_path
        ):
            saved_files["panoptic_mesh"] = str(panoptic_mesh_path)

    return saved_files
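Since every visualization entry point degrades gracefully when optional dependencies are missing, a quick smoke test with synthetic inputs is a reasonable way to verify the install before running the full model. A minimal sketch (not part of the repository) that exercises the 2D overlays plus the marching-cubes path on a synthetic sphere signed distance field, where `iso_value=0.0` picks out the sphere surface:

```python
import numpy as np
from visualization import (
    visualize_2d_segmentation, visualize_depth_map, get_mesh, write_ply,
)

rng = np.random.default_rng(0)

# Synthetic image, panoptic map (classes 0-12), and depth map in meters
image = rng.integers(0, 255, size=(120, 160, 3), dtype=np.uint8)
panoptic_2d = rng.integers(0, 13, size=(120, 160))
depth_2d = rng.uniform(0.0, 6.0, size=(120, 160))

visualize_2d_segmentation(image, panoptic_2d, "seg_check.png")
visualize_depth_map(depth_2d, "depth_check.png")

# Signed distance field of a sphere of radius 20 voxels centered in a 64^3 grid;
# iso_value=0.0 extracts the zero level set (the sphere surface)
coords = np.stack(np.meshgrid(*[np.arange(64)] * 3, indexing="ij"), axis=-1)
sdf = np.linalg.norm(coords - 32.0, axis=-1) - 20.0
vertices, faces = get_mesh(sdf, iso_value=0.0, spacing=(0.03, 0.03, 0.03))
write_ply(vertices, "sphere_check.ply", faces=faces)
```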
weights/model_2d_fp32.pt
ADDED

@@ -0,0 +1,3 @@

version https://git-lfs.github.com/spec/v1
oid sha256:641322e833c0a498a13695b52188dce5cf4e1fcf18fee58fc3b1c3a0a758af4a
size 5514739585
weights/tao_vggt_front3d.pth
ADDED

@@ -0,0 +1,3 @@

version https://git-lfs.github.com/spec/v1
oid sha256:19186a4c1308bb255eb088997b70445b240a40370d9eba4cab5cf7919861f20b
size 130585626
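Both checkpoints are Git LFS pointers, so the actual weights only appear after a clone with `git lfs install` active (as in the setup section) or an explicit `git lfs pull`. A hedged sketch of loading them; this commit describes `model_2d_fp32.pt` as a TorchScript export, while the layout of `tao_vggt_front3d.pth` is an assumption here:

```python
import torch

# TorchScript export per this commit, so torch.jit.load should work
# without the original Python class definitions.
model_2d = torch.jit.load("weights/model_2d_fp32.pt", map_location="cpu")
model_2d.eval()

# Assumption: the 3D checkpoint may be a raw state_dict or a dict wrapping
# one under "state_dict"; inspect the keys before wiring it into the model.
ckpt = torch.load("weights/tao_vggt_front3d.pth", map_location="cpu")
state_dict = ckpt.get("state_dict", ckpt) if isinstance(ckpt, dict) else ckpt
print(type(ckpt), list(state_dict.keys())[:5])
```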