File size: 11,653 Bytes

911b379

#!/usr/bin/env python3
# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Preprocessing utilities for Panoptic Recon 3D model.

This module provides functions for:
- Image preprocessing and resizing
- Frustum mask generation
- Camera intrinsic handling
"""
import sys
from fvcore.transforms.transform import Transform
from typing import Optional, Tuple, Union
import numpy as np
import torch
import cv2
from PIL import Image


# Default Front3D camera intrinsic matrix, stored as a homogeneous 4x4:
# focal lengths sit on the diagonal of the upper-left 3x3 block and the
# principal point occupies the third column.
DEFAULT_INTRINSIC = np.array(
    [
        [277.1281435, 0.0, 159.5, 0.0],
        [0.0, 277.1281435, 119.5, 0.0],
        [0.0, 0.0, 1.0, 0.0],
        [0.0, 0.0, 0.0, 1.0],
    ],
    dtype=np.float32,
)

# Default model parameters (used as the default arguments of
# create_frustum_mask below).
DEFAULT_GRID_DIMS = (256, 256, 256)  # voxel counts along (x, y, z)
DEFAULT_DEPTH_RANGE = (0.4, 6.0)  # (min, max) depth in meters
DEFAULT_VOXEL_SIZE = 0.03  # voxel edge length in meters
DEFAULT_IMG_SIZE = (240, 320)  # (height, width)


def create_frustum_mask(
    intrinsics: Union[np.ndarray, torch.Tensor],
    volume_shape: Tuple[int, int, int] = DEFAULT_GRID_DIMS,
    depth_range: Tuple[float, float] = DEFAULT_DEPTH_RANGE,
    image_shape: Optional[Tuple[int, int]] = DEFAULT_IMG_SIZE,
    voxel_size: float = DEFAULT_VOXEL_SIZE,
    padding_pixels: float = 0.0,
    volume_origin: Optional[np.ndarray] = None,
    z_axis_reversed: bool = False,
) -> np.ndarray:
    """
    Create a frustum mask for a voxel volume based on camera intrinsics.

    A voxel is inside the frustum when its center lies within ``depth_range``
    along z and its pinhole projection ``(fx * x / z + cx, fy * y / z + cy)``
    falls inside the (optionally padded) image rectangle.

    Args:
        intrinsics: Camera intrinsic matrix (3x3 or 4x4). Torch tensors are
            detached and moved to CPU before use, so tensors that require
            grad or live on another device are accepted.
        volume_shape: Shape of the voxel volume (nx, ny, nz).
        depth_range: Min and max depth in meters (z_min, z_max); both bounds
            are inclusive.
        image_shape: Image dimensions (height, width). If None, inferred from
            the principal point as ``(int(2 * cy), int(2 * cx))``.
        voxel_size: Size of each voxel in meters.
        padding_pixels: Expand frustum bounds by this many pixels on each side.
        volume_origin: Camera-space coordinate of voxel index (0, 0, 0). If
            None, the volume is centered on the optical axis in x/y and on the
            middle of ``depth_range`` in z.
        z_axis_reversed: If True, z-index 0 is farthest.

    Returns:
        frustum_mask: Boolean mask of shape (nx, ny, nz) indicating voxels
        inside the frustum.

    Raises:
        AssertionError: If the intrinsics shape, ``voxel_size`` or
            ``depth_range`` is invalid. These checks are ``assert`` statements
            and are therefore skipped when Python runs with ``-O``.
    """
    if isinstance(intrinsics, torch.Tensor):
        # detach() first: Tensor.numpy() refuses tensors that require grad.
        intrinsics = intrinsics.detach().cpu().numpy()

    # Work in float64 regardless of the caller's dtype.
    intrinsics = np.asarray(intrinsics, dtype=np.float64)

    assert intrinsics.shape in [(3, 3), (4, 4)], \
        f"Intrinsics must be 3x3 or 4x4, got shape {intrinsics.shape}"
    assert voxel_size > 0, f"voxel_size must be positive, got {voxel_size}"
    assert depth_range[0] < depth_range[1], \
        f"depth_range must be (min, max) with min < max, got {depth_range}"
    assert depth_range[0] > 0, f"depth_range min must be positive, got {depth_range[0]}"

    # The upper-left 3x3 block holds the pinhole parameters for both layouts.
    K = intrinsics[:3, :3]
    fx, fy = K[0, 0], K[1, 1]
    cx, cy = K[0, 2], K[1, 2]

    # Determine image shape.
    # NOTE(review): the fallback assumes cx ~= width / 2; for a principal
    # point at (width - 1) / 2 it comes out one pixel short -- confirm.
    if image_shape is None:
        image_height = int(2 * cy)
        image_width = int(2 * cx)
    else:
        image_height, image_width = image_shape

    # Image bounds with padding: lower bound inclusive, upper bound exclusive.
    u_min = -padding_pixels
    u_max = image_width + padding_pixels
    v_min = -padding_pixels
    v_max = image_height + padding_pixels

    nx, ny, nz = volume_shape

    # Set volume origin
    if volume_origin is None:
        volume_origin = np.array([
            -(nx * voxel_size) / 2,
            -(ny * voxel_size) / 2,
            (depth_range[0] + depth_range[1]) / 2 - (nz * voxel_size) / 2
        ])

    # Per-axis voxel coordinates in camera space.
    x_coords = np.arange(nx) * voxel_size + volume_origin[0]
    y_coords = np.arange(ny) * voxel_size + volume_origin[1]
    z_coords = np.arange(nz) * voxel_size + volume_origin[2]

    if z_axis_reversed:
        z_coords = z_coords[::-1]

    # Depth constraint; the 1e-6 floor keeps the projection well defined.
    z_ok = (
        (z_coords >= depth_range[0])
        & (z_coords <= depth_range[1])
        & (z_coords > 1e-6)
    )
    # Rejected depths are replaced by 1.0 so the division below never sees a
    # zero or negative z; those slices are masked out by z_ok anyway.
    z_div = np.where(z_ok, z_coords, 1.0)

    # u depends only on (x, z) and v only on (y, z), so project on two 2-D
    # grids instead of materializing every (x, y, z) voxel center. This
    # yields the same mask while avoiding several nx*ny*nz float64 buffers.
    u = fx * x_coords[:, None] / z_div[None, :] + cx  # (nx, nz)
    v = fy * y_coords[:, None] / z_div[None, :] + cy  # (ny, nz)

    u_ok = (u >= u_min) & (u < u_max)
    v_ok = (v >= v_min) & (v < v_max)

    # Broadcast the three partial masks to (nx, ny, nz).
    frustum_mask = u_ok[:, None, :] & v_ok[None, :, :] & z_ok[None, None, :]

    return frustum_mask


def get_output_shape(
    oldh: int,
    oldw: int,
    short_edge_length: int,
    max_size: int
) -> Tuple[int, int]:
    """Return the (height, width) obtained by scaling the shorter side to a target.

    The shorter of ``oldh``/``oldw`` is set to ``short_edge_length`` and the
    other side is scaled by the same ratio. If the longer side would then
    exceed ``max_size``, both sides are scaled down again so that it equals
    ``max_size``. Each side is rounded half-up to an integer.
    """
    target = short_edge_length * 1.0
    ratio = target / min(oldh, oldw)

    # The shorter side takes the target exactly; the other follows the ratio.
    if oldh < oldw:
        out_h, out_w = target, ratio * oldw
    else:
        out_h, out_w = ratio * oldh, target

    # Cap the longer side at max_size while keeping the aspect ratio.
    if max(out_h, out_w) > max_size:
        shrink = max_size * 1.0 / max(out_h, out_w)
        out_h = out_h * shrink
        out_w = out_w * shrink

    return (int(out_h + 0.5), int(out_w + 0.5))


class ResizeShortestEdge(Transform):
    """Resize transform whose target shape is sampled once at construction.

    NOTE(review): ``_get_output_shape`` unpacks ``orig_size`` as ``(h, w)``
    but stores the result swapped, and the ``apply_*`` methods unpack that
    stored pair as ``(new_h, new_w)``. The net effect is that the output is
    only laid out correctly when ``orig_size`` is passed as (width, height),
    which is what ``load_image`` in this module does. The ordering is kept
    as-is so existing callers see the same output; confirm before changing.
    """

    def __init__(
        self,
        orig_size: Tuple[int, int],
        short_edge_length,
        max_size=sys.maxsize,
        interp=cv2.INTER_LINEAR,
        prob=1.0
    ):
        """Store the configuration and sample the output shape.

        Args:
            orig_size: Size of the input image (see the class note on order).
            short_edge_length: Candidate target lengths for the shorter edge.
                A single int is treated as the pair ``(value, value)``; one
                candidate is drawn with ``np.random.choice``.
            max_size: Upper bound for the longer edge after resizing.
            interp: Default OpenCV interpolation flag for ``apply_image``.
            prob: Stored on the instance; not used by the methods of this
                class.
        """
        super().__init__()
        self.orig_size = orig_size
        if isinstance(short_edge_length, int):
            short_edge_length = (short_edge_length, short_edge_length)
        self.short_edge_length = short_edge_length
        self.max_size = max_size
        self.interp = interp
        self.prob = prob
        self._get_output_shape()

    def _get_output_shape(self):
        """Sample a short-edge length and set ``self.new_size`` from it.

        ``self.new_size`` is left as ``None`` when the sampled length is 0;
        the ``apply_*`` methods treat that as "do not resize".
        """
        h, w = self.orig_size
        self.new_size = None
        size = np.random.choice(self.short_edge_length)
        if size != 0:
            hh, ww = get_output_shape(h, w, size, self.max_size)
            # Stored swapped on purpose; see the class-level note.
            self.new_size = (ww, hh)

    def apply_coords(self, coords):
        """Return ``coords`` unchanged.

        NOTE(review): coordinates are not rescaled to the resized image.
        """
        return coords

    def _resize(self, array, interpolation):
        """Resize ``array`` to the sampled shape, or return it as-is if none was sampled."""
        if self.new_size is None:
            # A sampled length of 0 means no resize; avoid unpacking None.
            return array
        new_h, new_w = self.new_size
        # cv2.resize takes dsize as (width, height).
        return cv2.resize(array, (new_w, new_h), interpolation=interpolation)

    def apply_image(self, img, interp=None):
        """Resize ``img``.

        Args:
            img: Image array accepted by ``cv2.resize``.
            interp: Optional OpenCV interpolation flag overriding the one
                given to the constructor.
        """
        return self._resize(img, self.interp if interp is None else interp)

    def apply_segmentation(self, segmentation):
        """Resize ``segmentation`` with nearest-neighbor interpolation so label values are not blended."""
        return self._resize(segmentation, cv2.INTER_NEAREST)


def adjust_intrinsic(
    intrinsic: Union[np.ndarray, torch.Tensor],
    original_size: Tuple[int, int],
    target_size: Tuple[int, int],
) -> Union[np.ndarray, torch.Tensor]:
    """Adjust intrinsic matrix for image resize.

    The focal lengths and the principal point are multiplied by the per-axis
    ratio ``target_size / original_size``. The input is never modified; a new
    array or tensor is returned.

    Args:
        intrinsic: Camera intrinsic matrix (4x4 or 3x3). A torch tensor may
            live on any device and may require grad; the result is computed
            on a detached copy and is not attached to the autograd graph.
        original_size: Original image size (width, height).
        target_size: Target image size (width, height).

    Returns:
        Adjusted intrinsic matrix, of the same kind as the input (numpy array,
        or torch tensor with the input's device and dtype).

    Raises:
        ZeroDivisionError: If a component of ``original_size`` is 0.
    """
    is_tensor = isinstance(intrinsic, torch.Tensor)
    if is_tensor:
        device = intrinsic.device
        dtype = intrinsic.dtype
        # detach() first: Tensor.numpy() refuses tensors that require grad.
        intrinsic = intrinsic.detach().cpu().numpy()

    # Copy so neither the caller's array nor the tensor memory shared with
    # the numpy view above is written to.
    intrinsic = intrinsic.copy()

    scale_x = target_size[0] / original_size[0]
    scale_y = target_size[1] / original_size[1]

    # Adjust focal length and principal point
    intrinsic[0, 0] *= scale_x  # fx
    intrinsic[1, 1] *= scale_y  # fy
    intrinsic[0, 2] *= scale_x  # cx
    intrinsic[1, 2] *= scale_y  # cy

    if is_tensor:
        intrinsic = torch.from_numpy(intrinsic).to(device=device, dtype=dtype)

    return intrinsic


def load_image(
    image_path: str,
    target_size: Tuple[int, int] = (320, 240),
    apply_resize_transform: bool = True,
) -> np.ndarray:
    """Load and preprocess image for Panoptic Recon 3D inference.

    Steps (intended to mirror the preprocessing in test_triton_server.py):
    1. Load image as RGB
    2. Resize to target_size (default 320x240)
    3. Apply ResizeShortestEdge transform (short_edge=240, max_size=320)
    4. Convert to CHW format with batch dimension

    Args:
        image_path: Path to image file.
        target_size: Target size (width, height). Default (320, 240).
        apply_resize_transform: Whether to apply ResizeShortestEdge transform.

    Returns:
        Image as numpy array (1, C, H, W) in RGB format, uint8 dtype.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist (raised by
            ``PIL.Image.open``). Other errors raised by PIL while opening or
            decoding the file propagate unchanged.
    """
    # Image.open keeps the file handle open until the image is closed, so use
    # it as a context manager. convert() returns a new in-memory image, which
    # stays valid after the file is closed.
    with Image.open(image_path) as source:
        resized = source.convert('RGB').resize(target_size)
    img = np.array(resized)

    # Apply ResizeShortestEdge transform (matches test_triton_server.py)
    if apply_resize_transform:
        resize_instance = ResizeShortestEdge(
            # Passed as (width, height): ResizeShortestEdge swaps the pair
            # internally, so this order is what yields an H x W output.
            orig_size=(target_size[0], target_size[1]),
            short_edge_length=240,
            max_size=320,
        )
        img = resize_instance.apply_image(img)

    # Convert to CHW format with contiguous memory (critical for torch.from_numpy)
    image = np.ascontiguousarray(img.transpose(2, 0, 1))

    # Add batch dimension: (C, H, W) -> (1, C, H, W)
    image = image[np.newaxis, ...]

    return image

class DatasetConstants:
    """Constants for Front3D dataset.

    Groups the default volume/image parameters, the camera intrinsics and the
    category table as class attributes; nothing here needs an instance.
    """
    # Same values as the module-level DEFAULT_* constants; note that the grid
    # dimensions are a list here and a tuple at module level.
    DEFAULT_GRID_DIMS = [256, 256, 256]
    DEFAULT_DEPTH_RANGE = (0.4, 6.0)
    DEFAULT_VOXEL_SIZE = 0.03
    DEFAULT_IMG_SIZE = (240, 320)  # (height, width)
    # Presumably the label value excluded from losses/metrics -- TODO confirm
    # against the code that consumes it.
    IGNORE_LABEL = 255

    # Alias of the module-level 4x4 matrix (the same array object, not a copy).
    INTRINSIC = DEFAULT_INTRINSIC

    # One entry per category. "id" and "trainId" are identical for every row;
    # "isthing" is 1 for ids 1-9 and 0 for ids 10-12; "color" is a 3-tuple of
    # ints (presumably RGB for visualization -- verify against the renderer).
    CATEGORIES = [
        {"color": (220, 20, 60), "isthing": 1, "id": 1, "trainId": 1, "name": "cabinet"},
        {"color": (255, 0, 0), "isthing": 1, "id": 2, "trainId": 2, "name": "bed"},
        {"color": (0, 0, 142), "isthing": 1, "id": 3, "trainId": 3, "name": "chair"},
        {"color": (0, 0, 70), "isthing": 1, "id": 4, "trainId": 4, "name": "sofa"},
        {"color": (0, 60, 100), "isthing": 1, "id": 5, "trainId": 5, "name": "table"},
        {"color": (0, 80, 100), "isthing": 1, "id": 6, "trainId": 6, "name": "desk"},
        {"color": (0, 0, 230), "isthing": 1, "id": 7, "trainId": 7, "name": "dresser"},
        {"color": (119, 11, 32), "isthing": 1, "id": 8, "trainId": 8, "name": "lamp"},
        {"color": (190, 50, 60), "isthing": 1, "id": 9, "trainId": 9, "name": "other"},
        {"color": (102, 102, 156), "isthing": 0, "id": 10, "trainId": 10, "name": "wall"},
        {"color": (128, 64, 128), "isthing": 0, "id": 11, "trainId": 11, "name": "floor"},
        {"color": (70, 70, 70), "isthing": 0, "id": 12, "trainId": 12, "name": "ceiling"},
    ]

    # Category ids 10 (wall) and 11 (floor).
    # NOTE(review): "ceiling" (id 12) also has isthing == 0 but is not listed
    # here -- confirm that omission is intentional.
    STUFF_CLASSES = [10, 11]