nvpanoptix-3d / preprocessing.py

Update model inference code and environment setup instructions (#4)

f4a0919 verified 2 days ago

11.7 kB

	#!/usr/bin/env python3
	# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""
	Preprocessing utilities for Panoptic Recon 3D model.

	This module provides functions for:
	- Image preprocessing and resizing
	- Frustum mask generation
	- Camera intrinsic handling
	"""
	import sys
	from fvcore.transforms.transform import Transform
	from typing import Optional, Tuple, Union
	import numpy as np
	import torch
	import cv2
	from PIL import Image


	# Default Front3D camera intrinsic matrix (4x4, float32).
	# Layout as read by the functions in this module:
	#   [0, 0] = fx, [1, 1] = fy  (focal lengths, in pixels)
	#   [0, 2] = cx, [1, 2] = cy  (principal point, in pixels)
	# The last row/column only pad the 3x3 camera matrix to 4x4.
	DEFAULT_INTRINSIC = np.array([
	[277.1281435, 0., 159.5, 0.],
	[0., 277.1281435, 119.5, 0.],
	[0., 0., 1., 0.],
	[0., 0., 0., 1.]
	], dtype=np.float32)

	# Default model parameters
	# Number of voxels along each axis of the volume (nx, ny, nz).
	DEFAULT_GRID_DIMS = (256, 256, 256)
	# Depth interval (z_min, z_max), in meters, kept by the frustum mask.
	DEFAULT_DEPTH_RANGE = (0.4, 6.0)
	# Edge length of one voxel, in meters.
	DEFAULT_VOXEL_SIZE = 0.03
	DEFAULT_IMG_SIZE = (240, 320) # (height, width)


	def create_frustum_mask(
	    intrinsics: Union[np.ndarray, torch.Tensor],
	    volume_shape: Tuple[int, int, int] = DEFAULT_GRID_DIMS,
	    depth_range: Tuple[float, float] = DEFAULT_DEPTH_RANGE,
	    image_shape: Optional[Tuple[int, int]] = DEFAULT_IMG_SIZE,
	    voxel_size: float = DEFAULT_VOXEL_SIZE,
	    padding_pixels: float = 0.0,
	    volume_origin: Optional[np.ndarray] = None,
	    z_axis_reversed: bool = False,
	) -> np.ndarray:
	    """
	    Create a frustum mask for a voxel volume based on camera intrinsics.

	    A voxel is marked visible when its center lies inside ``depth_range`` and
	    its pinhole projection ``(fx * x / z + cx, fy * y / z + cy)`` falls inside
	    the (optionally padded) image rectangle.

	    The projection is separable: ``u`` depends only on (x, z) and ``v`` only on
	    (y, z). The mask is therefore assembled by broadcasting small 2D tables
	    instead of materialising one 3D point per voxel; the result is identical
	    but the peak memory is roughly the size of the returned mask.

	    Args:
	        intrinsics: Camera intrinsic matrix (3x3 or 4x4). Tensors are detached
	            and moved to the CPU before use.
	        volume_shape: Shape of the voxel volume (nx, ny, nz).
	        depth_range: Min and max depth in meters (z_min, z_max).
	        image_shape: Image dimensions (height, width). If None, inferred from
	            the principal point as ``(int(2 * cy), int(2 * cx))``.
	        voxel_size: Size of each voxel in meters.
	        padding_pixels: Expand frustum bounds by this many pixels.
	        volume_origin: Coordinates of voxel index (0, 0, 0) in camera space.
	            If None, the volume is centered on the optical axis in x/y and on
	            the middle of ``depth_range`` in z.
	        z_axis_reversed: If True, z-index 0 is farthest.

	    Returns:
	        frustum_mask: Boolean mask of shape volume_shape indicating voxels
	        inside frustum.

	    Raises:
	        AssertionError: If the intrinsics shape, ``voxel_size`` or
	            ``depth_range`` is invalid (checks are skipped under ``python -O``).
	    """
	    # Convert to numpy if tensor; detach() so tensors that require grad work too.
	    if isinstance(intrinsics, torch.Tensor):
	        intrinsics = intrinsics.detach().cpu().numpy()

	    # Ensure numpy array
	    intrinsics = np.asarray(intrinsics, dtype=np.float64)

	    assert intrinsics.shape in [(3, 3), (4, 4)], \
	        f"Intrinsics must be 3x3 or 4x4, got shape {intrinsics.shape}"
	    assert voxel_size > 0, f"voxel_size must be positive, got {voxel_size}"
	    assert depth_range[0] < depth_range[1], \
	        f"depth_range must be (min, max) with min < max, got {depth_range}"
	    assert depth_range[0] > 0, f"depth_range min must be positive, got {depth_range[0]}"

	    # Extract camera parameters (the upper-left 3x3 block for either layout)
	    K = intrinsics[:3, :3]
	    fx, fy = K[0, 0], K[1, 1]
	    cx, cy = K[0, 2], K[1, 2]

	    # Determine image shape
	    if image_shape is None:
	        image_height = int(2 * cy)
	        image_width = int(2 * cx)
	    else:
	        image_height, image_width = image_shape

	    # Image bounds with padding
	    u_min = -padding_pixels
	    u_max = image_width + padding_pixels
	    v_min = -padding_pixels
	    v_max = image_height + padding_pixels

	    # Set volume origin
	    if volume_origin is None:
	        volume_origin = np.array([
	            -(volume_shape[0] * voxel_size) / 2,
	            -(volume_shape[1] * voxel_size) / 2,
	            (depth_range[0] + depth_range[1]) / 2 - (volume_shape[2] * voxel_size) / 2
	        ])

	    # Per-axis voxel coordinates in camera space
	    x_coords = np.arange(volume_shape[0]) * voxel_size + volume_origin[0]
	    y_coords = np.arange(volume_shape[1]) * voxel_size + volume_origin[1]
	    z_coords = np.arange(volume_shape[2]) * voxel_size + volume_origin[2]

	    if z_axis_reversed:
	        z_coords = z_coords[::-1]

	    # Depth constraint, evaluated once per z slice
	    valid_depth = z_coords > 1e-6
	    depth_ok = valid_depth & (z_coords >= depth_range[0]) & (z_coords <= depth_range[1])

	    # Slices with non-positive depth are rejected by depth_ok; substitute a
	    # harmless divisor there so the projection never divides by ~0.
	    safe_z = np.where(valid_depth, z_coords, 1.0)

	    # Project to image plane: u is (nx, nz), v is (ny, nz)
	    u = (fx * x_coords[:, None] / safe_z[None, :]) + cx
	    v = (fy * y_coords[:, None] / safe_z[None, :]) + cy

	    # Image bounds check
	    u_ok = (u >= u_min) & (u < u_max)
	    v_ok = (v >= v_min) & (v < v_max)

	    # Combine masks: broadcast to (nx, ny, nz) in 'ij' index order
	    frustum_mask = u_ok[:, None, :] & v_ok[None, :, :] & depth_ok[None, None, :]

	    return frustum_mask.reshape(volume_shape)


	def get_output_shape(
	    oldh: int,
	    oldw: int,
	    short_edge_length: int,
	    max_size: int
	) -> Tuple[int, int]:
	    """Return the (height, width) obtained by resizing to a target short edge.

	    The shorter side is scaled to ``short_edge_length`` and the other side by
	    the same factor. If the longer side would then exceed ``max_size``, both
	    sides are shrunk so that the longer one equals ``max_size``. The final
	    values are rounded half-up to integers.
	    """
	    target = short_edge_length * 1.0
	    ratio = target / min(oldh, oldw)

	    # The shorter side takes the target length exactly; the other is scaled.
	    out_h, out_w = (target, ratio * oldw) if oldh < oldw else (ratio * oldh, target)

	    longest = max(out_h, out_w)
	    if longest > max_size:
	        shrink = max_size * 1.0 / longest
	        out_h = out_h * shrink
	        out_w = out_w * shrink

	    return (int(out_h + 0.5), int(out_w + 0.5))


	class ResizeShortestEdge(Transform):
	    """Resize so that the shorter image edge matches a sampled target length.

	    The output size is sampled once, at construction time, and stored in
	    ``self.new_size``; ``apply_image`` and ``apply_segmentation`` then resize
	    to that fixed size. ``new_size`` is None when the sampled length is 0, in
	    which case both methods return their input unchanged.
	    """

	    def __init__(
	        self,
	        orig_size: Tuple[int, int],
	        short_edge_length,
	        max_size=sys.maxsize,
	        interp=cv2.INTER_LINEAR,
	        prob=1.0
	    ):
	        """Resize shortest edge transform.

	        Args:
	            orig_size: Size of the input image as a pair. ``load_image`` in this
	                module passes (width, height); see ``_get_output_shape`` for
	                how the pair is mapped to the stored output size.
	            short_edge_length: Target short-edge length, or a sequence of
	                candidate lengths from which one is drawn at random.
	            max_size: Upper bound for the longer edge after resizing.
	            interp: Default OpenCV interpolation flag used by ``apply_image``.
	            prob: Stored on the instance; not used by this class itself.
	        """
	        super().__init__()
	        self.orig_size = orig_size
	        # Wrap a scalar (Python or NumPy integer) so np.random.choice picks
	        # among candidates instead of sampling from range(n).
	        if isinstance(short_edge_length, (int, np.integer)):
	            short_edge_length = (short_edge_length, short_edge_length)
	        self.short_edge_length = short_edge_length
	        self.max_size = max_size
	        self.interp = interp
	        self.prob = prob
	        self._get_output_shape()

	    def _get_output_shape(self):
	        """Sample a short-edge length and store the resulting output size.

	        The two entries of ``orig_size`` are passed to ``get_output_shape`` in
	        order and the result is stored swapped. With ``orig_size`` given as
	        (width, height), ``new_size`` therefore holds (height, width), which is
	        how ``apply_image`` / ``apply_segmentation`` unpack it.
	        """
	        first, second = self.orig_size
	        self.new_size = None
	        size = np.random.choice(self.short_edge_length)
	        if size != 0:
	            out_first, out_second = get_output_shape(first, second, size, self.max_size)
	            self.new_size = (out_second, out_first)

	    def apply_coords(self, coords):
	        """Return the coordinates unchanged.

	        NOTE(review): coordinates are not rescaled to the resized image here;
	        confirm that no caller relies on coordinate/box transforms.
	        """
	        return coords

	    def apply_image(self, img, interp=None):
	        """Resize the image to ``new_size``.

	        Args:
	            img: Image array accepted by ``cv2.resize``.
	            interp: Optional OpenCV interpolation flag overriding the one
	                given at construction time.

	        Returns:
	            The resized image, or ``img`` itself when no resize was sampled.
	        """
	        if self.new_size is None:
	            return img
	        new_h, new_w = self.new_size
	        method = self.interp if interp is None else interp
	        # cv2.resize expects the destination size as (width, height).
	        return cv2.resize(img, (new_w, new_h), interpolation=method)

	    def apply_segmentation(self, segmentation):
	        """Resize a label map to ``new_size`` with nearest-neighbour sampling.

	        Nearest-neighbour interpolation keeps label values intact. Returns the
	        input unchanged when no resize was sampled.
	        """
	        if self.new_size is None:
	            return segmentation
	        new_h, new_w = self.new_size
	        return cv2.resize(segmentation, (new_w, new_h), interpolation=cv2.INTER_NEAREST)


	def adjust_intrinsic(
	    intrinsic: Union[np.ndarray, torch.Tensor],
	    original_size: Tuple[int, int],
	    target_size: Tuple[int, int],
	) -> Union[np.ndarray, torch.Tensor]:
	    """Adjust intrinsic matrix for image resize.

	    The focal lengths and the principal point are multiplied by the per-axis
	    resize factors. The input is never modified; a new matrix is returned.

	    Args:
	        intrinsic: Camera intrinsic matrix (4x4 or 3x3). May be a NumPy array,
	            a torch tensor, or any nested sequence convertible by ``np.array``.
	        original_size: Original image size (width, height).
	        target_size: Target image size (width, height).

	    Returns:
	        Adjusted intrinsic matrix. A tensor input yields a tensor on the same
	        device with the same dtype; the result is detached from any autograd
	        graph. Any other input yields a NumPy array.
	    """
	    is_tensor = isinstance(intrinsic, torch.Tensor)
	    if is_tensor:
	        device = intrinsic.device
	        dtype = intrinsic.dtype
	        # detach() first: Tensor.numpy() refuses tensors that require grad.
	        intrinsic = intrinsic.detach().cpu().numpy()

	    # np.array copies, so neither the caller's array nor the tensor's storage
	    # (which numpy() shares on CPU) is modified in place.
	    intrinsic = np.array(intrinsic)

	    scale_x = target_size[0] / original_size[0]
	    scale_y = target_size[1] / original_size[1]

	    # Adjust focal length and principal point
	    intrinsic[0, 0] *= scale_x  # fx
	    intrinsic[1, 1] *= scale_y  # fy
	    intrinsic[0, 2] *= scale_x  # cx
	    intrinsic[1, 2] *= scale_y  # cy

	    if is_tensor:
	        intrinsic = torch.from_numpy(intrinsic).to(device=device, dtype=dtype)

	    return intrinsic


	def load_image(
	    image_path: str,
	    target_size: Tuple[int, int] = (320, 240),
	    apply_resize_transform: bool = True,
	) -> np.ndarray:
	    """Load and preprocess image for Panoptic Recon 3D inference.

	    This function is meant to match the preprocessing in test_triton_server.py:
	    1. Load image as RGB
	    2. Resize to target_size (default 320x240)
	    3. Apply ResizeShortestEdge transform (short_edge=240, max_size=320)
	    4. Convert to CHW format with batch dimension

	    Args:
	        image_path: Path to image file.
	        target_size: Target size (width, height). Default (320, 240).
	        apply_resize_transform: Whether to apply ResizeShortestEdge transform.

	    Returns:
	        Image as numpy array (1, C, H, W) in RGB format, uint8 dtype.

	    Raises:
	        FileNotFoundError: If ``image_path`` does not exist (raised by
	            ``Image.open``).
	        PIL.UnidentifiedImageError: If the file cannot be decoded as an image.
	    """
	    # Image.open is lazy and keeps the file open; convert() forces the pixel
	    # data to be read and returns an independent image, so the handle can be
	    # released as soon as the conversion is done.
	    with Image.open(image_path) as source:
	        img = source.convert('RGB')

	    # Resize to target size
	    img = np.array(img.resize(target_size))

	    # Apply ResizeShortestEdge transform (matches test_triton_server.py)
	    if apply_resize_transform:
	        resize_instance = ResizeShortestEdge(
	            orig_size=(target_size[0], target_size[1]),  # (width, height)
	            short_edge_length=240,
	            max_size=320,
	        )
	        img = resize_instance.apply_image(img)

	    # Convert to CHW format with contiguous memory (critical for torch.from_numpy)
	    image = np.ascontiguousarray(img.transpose(2, 0, 1))

	    # Add batch dimension: (C, H, W) -> (1, C, H, W)
	    return image[np.newaxis, ...]

	class DatasetConstants:
	    """Constants for Front3D dataset."""

	    # Voxel grid, depth range (meters), voxel edge (meters) and image size.
	    DEFAULT_GRID_DIMS = [256, 256, 256]
	    DEFAULT_DEPTH_RANGE = (0.4, 6.0)
	    DEFAULT_VOXEL_SIZE = 0.03
	    DEFAULT_IMG_SIZE = (240, 320)  # (height, width)
	    IGNORE_LABEL = 255

	    # Same array object as the module-level default intrinsic matrix.
	    INTRINSIC = DEFAULT_INTRINSIC

	    # One entry per semantic category. "id" and "trainId" are both the 1-based
	    # position of the row below; "isthing" is 1 for the first nine categories
	    # and 0 for wall / floor / ceiling.
	    CATEGORIES = [
	        {"color": rgb, "isthing": is_thing, "id": label, "trainId": label, "name": title}
	        for label, (rgb, is_thing, title) in enumerate(
	            (
	                ((220, 20, 60), 1, "cabinet"),
	                ((255, 0, 0), 1, "bed"),
	                ((0, 0, 142), 1, "chair"),
	                ((0, 0, 70), 1, "sofa"),
	                ((0, 60, 100), 1, "table"),
	                ((0, 80, 100), 1, "desk"),
	                ((0, 0, 230), 1, "dresser"),
	                ((119, 11, 32), 1, "lamp"),
	                ((190, 50, 60), 1, "other"),
	                ((102, 102, 156), 0, "wall"),
	                ((128, 64, 128), 0, "floor"),
	                ((70, 70, 70), 0, "ceiling"),
	            ),
	            start=1,
	        )
	    ]

	    # NOTE(review): id 12 ("ceiling") has isthing == 0 but is not listed here;
	    # confirm whether leaving it out is intentional.
	    STUFF_CLASSES = [10, 11]