#!/usr/bin/env python3
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocessing utilities for Panoptic Recon 3D model.

This module provides functions for:
- Image preprocessing and resizing
- Frustum mask generation
- Camera intrinsic handling
"""

import sys
from typing import Optional, Tuple, Union

import cv2
import numpy as np
import torch
from fvcore.transforms.transform import Transform
from PIL import Image

# Default Front3D camera intrinsic matrix (fx, fy on the diagonal; cx, cy in
# the last column of the upper-left 3x3; padded to 4x4 homogeneous form).
DEFAULT_INTRINSIC = np.array([
    [277.1281435, 0., 159.5, 0.],
    [0., 277.1281435, 119.5, 0.],
    [0., 0., 1., 0.],
    [0., 0., 0., 1.]
], dtype=np.float32)

# Default model parameters
DEFAULT_GRID_DIMS = (256, 256, 256)
DEFAULT_DEPTH_RANGE = (0.4, 6.0)   # meters, (near, far)
DEFAULT_VOXEL_SIZE = 0.03          # meters per voxel edge
DEFAULT_IMG_SIZE = (240, 320)      # (height, width)


def create_frustum_mask(
    intrinsics: Union[np.ndarray, torch.Tensor],
    volume_shape: Tuple[int, int, int] = DEFAULT_GRID_DIMS,
    depth_range: Tuple[float, float] = DEFAULT_DEPTH_RANGE,
    image_shape: Optional[Tuple[int, int]] = DEFAULT_IMG_SIZE,
    voxel_size: float = DEFAULT_VOXEL_SIZE,
    padding_pixels: float = 0.0,
    volume_origin: Optional[np.ndarray] = None,
    z_axis_reversed: bool = False,
) -> np.ndarray:
    """
    Create a frustum mask for a voxel volume based on camera intrinsics.

    This function determines which voxels in a 3D volume are visible from a
    camera by checking if they project within the image bounds and depth range.

    Args:
        intrinsics: Camera intrinsic matrix (3x3 or 4x4).
        volume_shape: Shape of the voxel volume (nx, ny, nz).
        depth_range: Min and max depth in meters (z_min, z_max).
        image_shape: Image dimensions (height, width). If None, inferred from
            principal point (assumes the principal point is the image center).
        voxel_size: Size of each voxel in meters.
        padding_pixels: Expand frustum bounds by this many pixels.
        volume_origin: Origin of the volume in camera space. If None,
            auto-computed so the volume is centered laterally on the optical
            axis and centered in depth on the middle of depth_range.
        z_axis_reversed: If True, z-index 0 is farthest.

    Returns:
        frustum_mask: Boolean mask of shape volume_shape indicating voxels
            inside the frustum.
    """
    # Convert to numpy if tensor
    if isinstance(intrinsics, torch.Tensor):
        intrinsics = intrinsics.cpu().numpy()

    # Ensure numpy array in double precision for the projection math
    intrinsics = np.asarray(intrinsics, dtype=np.float64)

    # NOTE: asserts are stripped under `python -O`; kept as asserts to
    # preserve the original validation contract (AssertionError).
    assert intrinsics.shape in [(3, 3), (4, 4)], \
        f"Intrinsics must be 3x3 or 4x4, got shape {intrinsics.shape}"
    assert voxel_size > 0, f"voxel_size must be positive, got {voxel_size}"
    assert depth_range[0] < depth_range[1], \
        f"depth_range must be (min, max) with min < max, got {depth_range}"
    assert depth_range[0] > 0, f"depth_range min must be positive, got {depth_range[0]}"

    # Extract camera parameters from the upper-left 3x3 block
    K = intrinsics[:3, :3] if intrinsics.shape == (4, 4) else intrinsics
    fx, fy = K[0, 0], K[1, 1]
    cx, cy = K[0, 2], K[1, 2]

    # Determine image shape (fall back to twice the principal point)
    if image_shape is None:
        image_height = int(2 * cy)
        image_width = int(2 * cx)
    else:
        image_height, image_width = image_shape

    # Image bounds with padding (half-open interval [min, max) per axis)
    u_min = -padding_pixels
    u_max = image_width + padding_pixels
    v_min = -padding_pixels
    v_max = image_height + padding_pixels

    # Set volume origin: centered on the optical axis in x/y, centered on the
    # midpoint of the depth range in z.
    if volume_origin is None:
        volume_origin = np.array([
            -(volume_shape[0] * voxel_size) / 2,
            -(volume_shape[1] * voxel_size) / 2,
            (depth_range[0] + depth_range[1]) / 2 - (volume_shape[2] * voxel_size) / 2
        ])

    # Create voxel grid coordinates (voxel corners offset by the origin)
    x_coords = np.arange(volume_shape[0]) * voxel_size + volume_origin[0]
    y_coords = np.arange(volume_shape[1]) * voxel_size + volume_origin[1]
    z_coords = np.arange(volume_shape[2]) * voxel_size + volume_origin[2]
    if z_axis_reversed:
        z_coords = z_coords[::-1]

    # Create meshgrid; 'ij' indexing keeps axis order (x, y, z) == volume_shape
    xx, yy, zz = np.meshgrid(x_coords, y_coords, z_coords, indexing='ij')
    voxel_centers = np.stack([xx.ravel(), yy.ravel(), zz.ravel()], axis=-1)

    # Depth constraint
    depth_mask = (voxel_centers[:, 2] >= depth_range[0]) & (voxel_centers[:, 2] <= depth_range[1])

    # Project to image plane; guard against division by (near-)zero depth
    valid_depth = voxel_centers[:, 2] > 1e-6
    u = np.full(len(voxel_centers), -1.0)
    v = np.full(len(voxel_centers), -1.0)
    u[valid_depth] = (fx * voxel_centers[valid_depth, 0] / voxel_centers[valid_depth, 2]) + cx
    v[valid_depth] = (fy * voxel_centers[valid_depth, 1] / voxel_centers[valid_depth, 2]) + cy

    # Image bounds check
    image_mask = (u >= u_min) & (u < u_max) & (v >= v_min) & (v < v_max)

    # Combine masks
    frustum_mask_1d = depth_mask & image_mask & valid_depth
    frustum_mask = frustum_mask_1d.reshape(volume_shape)

    return frustum_mask


def get_output_shape(
    oldh: int, oldw: int, short_edge_length: int, max_size: int
) -> Tuple[int, int]:
    """Compute output size given input size and target short edge length.

    The short edge is scaled to ``short_edge_length``; if the resulting long
    edge exceeds ``max_size``, both edges are scaled down to fit.

    Returns:
        (new_height, new_width), rounded to the nearest integer.
    """
    h, w = oldh, oldw
    size = short_edge_length * 1.0
    scale = size / min(h, w)
    if h < w:
        newh, neww = size, scale * w
    else:
        newh, neww = scale * h, size
    if max(newh, neww) > max_size:
        scale = max_size * 1.0 / max(newh, neww)
        newh = newh * scale
        neww = neww * scale
    neww = int(neww + 0.5)
    newh = int(newh + 0.5)
    return (newh, neww)


class ResizeShortestEdge(Transform):
    def __init__(
        self,
        orig_size: Tuple[int, int],
        short_edge_length,
        max_size=sys.maxsize,
        interp=cv2.INTER_LINEAR,
        prob=1.0
    ):
        """
        Resize shortest edge transform.

        Args:
            orig_size: Original image size. NOTE(review): `load_image` passes
                (width, height) here, and `_get_output_shape` unpacks it as
                (h, w); the two swaps cancel for square-ish conventions used by
                the caller — confirm before changing either side.
            short_edge_length: Target length(s) of the shorter edge; an int or
                a sequence to sample from.
            max_size: Maximum allowed length of the longer edge.
            interp: Default cv2 interpolation flag for `apply_image`.
            prob: Stored but not used within this class.
        """
        super().__init__()
        self.orig_size = orig_size
        if isinstance(short_edge_length, int):
            short_edge_length = (short_edge_length, short_edge_length)
        self.short_edge_length = short_edge_length
        self.max_size = max_size
        self.interp = interp
        self.prob = prob
        # Output size is chosen once at construction time.
        self._get_output_shape()

    def _get_output_shape(self):
        """
        Get random output shape based on short edge length.

        Samples one entry from `short_edge_length`; a sampled value of 0
        leaves `new_size` as None (callers must not then apply the transform).
        """
        h, w = self.orig_size
        self.new_size = None
        size = np.random.choice(self.short_edge_length)
        if size != 0:
            hh, ww = get_output_shape(h, w, size, self.max_size)
            # Stored swapped relative to get_output_shape's (h, w) return;
            # apply_image unpacks it back — see NOTE in __init__.
            self.new_size = (ww, hh)

    def apply_coords(self, coords):
        """
        Apply transforms to the coordinates (identity — coords are unchanged).
        """
        return coords

    def apply_image(self, img, interp=None):
        """
        Apply transforms to the image.

        Args:
            img: Image array (H, W[, C]).
            interp: Optional cv2 interpolation flag; overrides the one given
                at construction time. (Bug fix: this argument was previously
                ignored.)
        """
        new_h, new_w = self.new_size
        interpolation = self.interp if interp is None else interp
        return cv2.resize(img, (new_w, new_h), interpolation=interpolation)

    def apply_segmentation(self, segmentation):
        """
        Apply transforms to the segmentation (nearest-neighbor to keep labels
        discrete).
        """
        new_h, new_w = self.new_size
        return cv2.resize(segmentation, (new_w, new_h), interpolation=cv2.INTER_NEAREST)


def adjust_intrinsic(
    intrinsic: Union[np.ndarray, torch.Tensor],
    original_size: Tuple[int, int],
    target_size: Tuple[int, int],
) -> Union[np.ndarray, torch.Tensor]:
    """Adjust intrinsic matrix for image resize.

    Scales fx/cx by the width ratio and fy/cy by the height ratio. The input
    is not modified; a tensor input is returned as a tensor on its original
    device/dtype.

    Args:
        intrinsic: Camera intrinsic matrix (4x4 or 3x3).
        original_size: Original image size (width, height).
        target_size: Target image size (width, height).

    Returns:
        Adjusted intrinsic matrix.
    """
    is_tensor = isinstance(intrinsic, torch.Tensor)
    if is_tensor:
        device = intrinsic.device
        dtype = intrinsic.dtype
        intrinsic = intrinsic.cpu().numpy()

    # Copy so the caller's matrix is never mutated in place.
    intrinsic = intrinsic.copy()
    scale_x = target_size[0] / original_size[0]
    scale_y = target_size[1] / original_size[1]

    # Adjust focal length and principal point
    intrinsic[0, 0] *= scale_x  # fx
    intrinsic[1, 1] *= scale_y  # fy
    intrinsic[0, 2] *= scale_x  # cx
    intrinsic[1, 2] *= scale_y  # cy

    if is_tensor:
        intrinsic = torch.from_numpy(intrinsic).to(device=device, dtype=dtype)

    return intrinsic


def load_image(
    image_path: str,
    target_size: Tuple[int, int] = (320, 240),
    apply_resize_transform: bool = True,
) -> np.ndarray:
    """Load and preprocess image for Panoptic Recon 3D inference.

    This function matches the preprocessing in test_triton_server.py exactly:
    1. Load image as RGB
    2. Resize to target_size (default 320x240)
    3. Apply ResizeShortestEdge transform (short_edge=240, max_size=320)
    4. Convert to CHW format with batch dimension

    Args:
        image_path: Path to image file.
        target_size: Target size (width, height). Default (320, 240).
        apply_resize_transform: Whether to apply ResizeShortestEdge transform.

    Returns:
        Image as numpy array (1, C, H, W) in RGB format, uint8 dtype.

    Raises:
        FileNotFoundError: If the image file does not exist (raised by
            PIL.Image.open; Image.open never returns None, so no extra
            check is needed).
    """
    # Load image as RGB; Image.open raises on a missing/unreadable file.
    img = Image.open(image_path).convert('RGB')

    # Resize to target size (PIL expects (width, height))
    img = img.resize(target_size)
    img = np.array(img)

    # Apply ResizeShortestEdge transform (matches test_triton_server.py)
    if apply_resize_transform:
        resize_instance = ResizeShortestEdge(
            orig_size=(target_size[0], target_size[1]),  # (width, height)
            short_edge_length=240,
            max_size=320,
        )
        img = resize_instance.apply_image(img)

    # Convert to CHW format with contiguous memory (critical for torch.from_numpy)
    image = np.ascontiguousarray(img.transpose(2, 0, 1))

    # Add batch dimension: (C, H, W) -> (1, C, H, W)
    image = image[np.newaxis, ...]

    return image


class DatasetConstants:
    """Constants for Front3D dataset."""

    DEFAULT_GRID_DIMS = [256, 256, 256]
    DEFAULT_DEPTH_RANGE = (0.4, 6.0)
    DEFAULT_VOXEL_SIZE = 0.03
    DEFAULT_IMG_SIZE = (240, 320)  # (height, width)
    IGNORE_LABEL = 255
    INTRINSIC = DEFAULT_INTRINSIC

    CATEGORIES = [
        {"color": (220, 20, 60), "isthing": 1, "id": 1, "trainId": 1, "name": "cabinet"},
        {"color": (255, 0, 0), "isthing": 1, "id": 2, "trainId": 2, "name": "bed"},
        {"color": (0, 0, 142), "isthing": 1, "id": 3, "trainId": 3, "name": "chair"},
        {"color": (0, 0, 70), "isthing": 1, "id": 4, "trainId": 4, "name": "sofa"},
        {"color": (0, 60, 100), "isthing": 1, "id": 5, "trainId": 5, "name": "table"},
        {"color": (0, 80, 100), "isthing": 1, "id": 6, "trainId": 6, "name": "desk"},
        {"color": (0, 0, 230), "isthing": 1, "id": 7, "trainId": 7, "name": "dresser"},
        {"color": (119, 11, 32), "isthing": 1, "id": 8, "trainId": 8, "name": "lamp"},
        {"color": (190, 50, 60), "isthing": 1, "id": 9, "trainId": 9, "name": "other"},
        {"color": (102, 102, 156), "isthing": 0, "id": 10, "trainId": 10, "name": "wall"},
        {"color": (128, 64, 128), "isthing": 0, "id": 11, "trainId": 11, "name": "floor"},
        {"color": (70, 70, 70), "isthing": 0, "id": 12, "trainId": 12, "name": "ceiling"},
    ]

    # NOTE(review): CATEGORIES marks ids 10, 11, AND 12 as stuff
    # (isthing == 0), but 12 (ceiling) is absent here — confirm whether
    # the omission is intentional before relying on this list.
    STUFF_CLASSES = [10, 11]