|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
|
Preprocessing utilities for Panoptic Recon 3D model. |
|
|
|
|
|
This module provides functions for: |
|
|
- Image preprocessing and resizing |
|
|
- Frustum mask generation |
|
|
- Camera intrinsic handling |
|
|
""" |
|
|
import sys |
|
|
from fvcore.transforms.transform import Transform |
|
|
from typing import Optional, Tuple, Union |
|
|
import numpy as np |
|
|
import torch |
|
|
import cv2 |
|
|
from PIL import Image |
|
|
|
|
|
|
|
|
|
|
|
# Default pinhole intrinsics as a homogeneous 4x4 matrix:
# fx = fy ≈ 277.13, principal point (159.5, 119.5) — consistent with the
# (240, 320) default image size below.
DEFAULT_INTRINSIC = np.array([
    [277.1281435, 0., 159.5, 0.],
    [0., 277.1281435, 119.5, 0.],
    [0., 0., 1., 0.],
    [0., 0., 0., 1.]
], dtype=np.float32)


# Shared voxel-volume and image defaults.
DEFAULT_GRID_DIMS = (256, 256, 256)   # (nx, ny, nz) voxels
DEFAULT_DEPTH_RANGE = (0.4, 6.0)      # metres, (near, far)
DEFAULT_VOXEL_SIZE = 0.03             # metres per voxel
DEFAULT_IMG_SIZE = (240, 320)         # (height, width) pixels


def create_frustum_mask(
    intrinsics: Union[np.ndarray, torch.Tensor],
    volume_shape: Tuple[int, int, int] = DEFAULT_GRID_DIMS,
    depth_range: Tuple[float, float] = DEFAULT_DEPTH_RANGE,
    image_shape: Optional[Tuple[int, int]] = DEFAULT_IMG_SIZE,
    voxel_size: float = DEFAULT_VOXEL_SIZE,
    padding_pixels: float = 0.0,
    volume_origin: Optional[np.ndarray] = None,
    z_axis_reversed: bool = False,
) -> np.ndarray:
    """
    Create a frustum mask for a voxel volume based on camera intrinsics.

    This function determines which voxels in a 3D volume are visible from a camera
    by checking if they project within the image bounds and depth range.

    Args:
        intrinsics: Camera intrinsic matrix (3x3 or 4x4).
        volume_shape: Shape of the voxel volume (nx, ny, nz).
        depth_range: Min and max depth in meters (z_min, z_max).
        image_shape: Image dimensions (height, width). If None, inferred from principal point.
        voxel_size: Size of each voxel in meters.
        padding_pixels: Expand frustum bounds by this many pixels.
        volume_origin: Origin of the volume in camera space. If None, auto-computed.
        z_axis_reversed: If True, z-index 0 is farthest.

    Returns:
        frustum_mask: Boolean mask of shape volume_shape indicating voxels inside frustum.
    """
    if isinstance(intrinsics, torch.Tensor):
        intrinsics = intrinsics.cpu().numpy()

    intrinsics = np.asarray(intrinsics, dtype=np.float64)

    # Input validation (kept as asserts to preserve the exception type
    # callers may already rely on).
    assert intrinsics.shape in [(3, 3), (4, 4)], \
        f"Intrinsics must be 3x3 or 4x4, got shape {intrinsics.shape}"
    assert voxel_size > 0, f"voxel_size must be positive, got {voxel_size}"
    assert depth_range[0] < depth_range[1], \
        f"depth_range must be (min, max) with min < max, got {depth_range}"
    assert depth_range[0] > 0, f"depth_range min must be positive, got {depth_range[0]}"

    # Upper-left 3x3 holds focal lengths and principal point.
    K = intrinsics[:3, :3] if intrinsics.shape == (4, 4) else intrinsics
    fx, fy = K[0, 0], K[1, 1]
    cx, cy = K[0, 2], K[1, 2]

    if image_shape is None:
        # Assume the principal point sits at the image center.
        image_height = int(2 * cy)
        image_width = int(2 * cx)
    else:
        image_height, image_width = image_shape

    # Pixel bounds, optionally padded outward; u is half-open [u_min, u_max).
    u_min = -padding_pixels
    u_max = image_width + padding_pixels
    v_min = -padding_pixels
    v_max = image_height + padding_pixels

    if volume_origin is None:
        # Center the volume laterally on the optical axis and center its
        # depth extent on the middle of the depth range.
        volume_origin = np.array([
            -(volume_shape[0] * voxel_size) / 2,
            -(volume_shape[1] * voxel_size) / 2,
            (depth_range[0] + depth_range[1]) / 2 - (volume_shape[2] * voxel_size) / 2
        ])

    # Per-axis voxel-center coordinates in camera space.
    x_coords = np.arange(volume_shape[0]) * voxel_size + volume_origin[0]
    y_coords = np.arange(volume_shape[1]) * voxel_size + volume_origin[1]
    z_coords = np.arange(volume_shape[2]) * voxel_size + volume_origin[2]

    if z_axis_reversed:
        z_coords = z_coords[::-1]

    # Broadcasting instead of a dense meshgrid: at the default 256^3 grid the
    # old meshgrid + ravel + stack approach materialized three float64 volumes
    # plus an (N, 3) array; this keeps only per-axis vectors, two projected
    # planes, and boolean volumes.
    x = x_coords[:, None, None]   # (nx, 1, 1)
    y = y_coords[None, :, None]   # (1, ny, 1)
    z = z_coords[None, None, :]   # (1, 1, nz)

    # Depth window and behind-camera guard.
    depth_mask = (z >= depth_range[0]) & (z <= depth_range[1])
    valid_depth = z > 1e-6

    # Substitute 1.0 where z is invalid to avoid divide-by-zero; those voxels
    # are excluded by valid_depth regardless of the projected u/v values.
    z_safe = np.where(valid_depth, z, 1.0)

    # Pinhole projection of voxel centers into pixel coordinates.
    u = fx * x / z_safe + cx      # (nx, 1, nz)
    v = fy * y / z_safe + cy      # (1, ny, nz)

    image_mask = (u >= u_min) & (u < u_max) & (v >= v_min) & (v < v_max)

    # Broadcasts to the full (nx, ny, nz) boolean volume.
    frustum_mask = image_mask & depth_mask & valid_depth

    return frustum_mask
|
|
|
|
|
|
|
|
def get_output_shape(
    oldh: int,
    oldw: int,
    short_edge_length: int,
    max_size: int
) -> Tuple[int, int]:
    """Compute the resized (height, width) so that the shorter edge equals
    ``short_edge_length``, shrinking further if the longer edge would
    exceed ``max_size``. Aspect ratio is preserved up to pixel rounding."""
    target = short_edge_length * 1.0
    ratio = target / min(oldh, oldw)
    # Pin the shorter edge to the target length; scale the other edge.
    if oldh < oldw:
        new_h, new_w = target, oldw * ratio
    else:
        new_h, new_w = oldh * ratio, target
    # Cap the longer edge at max_size, rescaling both edges together.
    longest = max(new_h, new_w)
    if longest > max_size:
        shrink = max_size * 1.0 / longest
        new_h *= shrink
        new_w *= shrink
    # Round each edge to the nearest whole pixel.
    return (int(new_h + 0.5), int(new_w + 0.5))
|
|
|
|
|
|
|
|
class ResizeShortestEdge(Transform):
    """Shortest-edge resize transform (fvcore ``Transform`` subclass).

    The output size is computed once in ``__init__`` and reused by every
    ``apply_*`` call, so a single instance applies one consistent resize.
    """

    def __init__(
        self,
        orig_size: Tuple[int, int],
        short_edge_length,
        max_size=sys.maxsize,
        interp=cv2.INTER_LINEAR,
        prob=1.0
    ):
        """ Resize shortest edge transform.

        Args:
            orig_size: Input size, consumed as (height, width) by
                ``_get_output_shape``. NOTE(review): ``load_image`` in this
                module passes (width, height); that swap appears to cancel
                against the swapped ``new_size`` storage below — confirm
                before changing either side.
            short_edge_length: Target short-edge length(s). A single int is
                promoted to a pair of equal values, making the random choice
                in ``_get_output_shape`` deterministic.
            max_size: Upper bound on the longer edge after resizing.
            interp: OpenCV interpolation flag used for images.
            prob: Stored but not used in this class; presumably kept for
                interface compatibility with augmentation wrappers.
        """
        super().__init__()
        self.orig_size = orig_size
        if isinstance(short_edge_length, int):
            short_edge_length = (short_edge_length, short_edge_length)
        self.short_edge_length = short_edge_length
        self.max_size = max_size
        self.interp = interp
        self.prob = prob
        self._get_output_shape()

    def _get_output_shape(self):
        """ Get random output shape based on short edge length.

        Samples one candidate length from ``self.short_edge_length`` and
        stores the resulting size in ``self.new_size``.
        """
        h, w = self.orig_size
        self.new_size = None
        size = np.random.choice(self.short_edge_length)
        # A sampled length of 0 acts as a "no resize" sentinel.
        # NOTE(review): in that case new_size stays None and apply_image
        # would fail on unpacking — confirm callers never pass 0.
        if size != 0:
            hh, ww = get_output_shape(h, w, size, self.max_size)
            # get_output_shape returns (height, width); this stores (width,
            # height). NOTE(review): apply_image unpacks it as (new_h, new_w),
            # swapping the axes back only if orig_size was itself passed as
            # (width, height) — verify against callers.
            self.new_size = (ww, hh)

    def apply_coords(self, coords):
        """ Apply transforms to the coordinates.

        Identity: coordinates are returned unscaled. NOTE(review): a true
        resize transform would scale coords by the resize factors — confirm
        this no-op is intentional.
        """
        return coords

    def apply_image(self, img, interp=None):
        """ Apply transforms to the image.

        Note: the ``interp`` parameter is accepted but ignored;
        ``self.interp`` is always used.
        """
        new_h, new_w = self.new_size
        # cv2.resize takes its dsize argument as (width, height).
        return cv2.resize(img, (new_w, new_h), interpolation=self.interp)

    def apply_segmentation(self, segmentation):
        """ Apply transforms to the segmentation.

        Nearest-neighbor interpolation is forced so label values are never
        blended.
        """
        new_h, new_w = self.new_size
        return cv2.resize(segmentation, (new_w, new_h), interpolation=cv2.INTER_NEAREST)
|
|
|
|
|
|
|
|
def adjust_intrinsic(
    intrinsic: Union[np.ndarray, torch.Tensor],
    original_size: Tuple[int, int],
    target_size: Tuple[int, int],
) -> Union[np.ndarray, torch.Tensor]:
    """Rescale a camera intrinsic matrix to match a resized image.

    Args:
        intrinsic: Camera intrinsic matrix (4x4 or 3x3).
        original_size: Original image size (width, height).
        target_size: Target image size (width, height).

    Returns:
        A new, adjusted intrinsic matrix of the same type (array or tensor)
        as the input; the input itself is left unmodified.
    """
    came_as_tensor = isinstance(intrinsic, torch.Tensor)
    if came_as_tensor:
        # Remember where to restore the result before dropping to numpy.
        src_device = intrinsic.device
        src_dtype = intrinsic.dtype
        intrinsic = intrinsic.cpu().numpy()

    # Work on a copy so the caller's matrix stays untouched.
    adjusted = intrinsic.copy()

    # Per-axis scale factors; sizes are given as (width, height).
    sx = target_size[0] / original_size[0]
    sy = target_size[1] / original_size[1]

    # Focal lengths and principal point scale linearly with the image.
    adjusted[0, 0] = adjusted[0, 0] * sx
    adjusted[1, 1] = adjusted[1, 1] * sy
    adjusted[0, 2] = adjusted[0, 2] * sx
    adjusted[1, 2] = adjusted[1, 2] * sy

    if came_as_tensor:
        adjusted = torch.from_numpy(adjusted).to(device=src_device, dtype=src_dtype)

    return adjusted
|
|
|
|
|
|
|
|
def load_image(
    image_path: str,
    target_size: Tuple[int, int] = (320, 240),
    apply_resize_transform: bool = True,
) -> np.ndarray:
    """Load and preprocess image for Panoptic Recon 3D inference.

    This function matches the preprocessing in test_triton_server.py exactly:
    1. Load image as RGB
    2. Resize to target_size (default 320x240)
    3. Apply ResizeShortestEdge transform (short_edge=240, max_size=320)
    4. Convert to CHW format with batch dimension

    Args:
        image_path: Path to image file.
        target_size: Target size (width, height). Default (320, 240).
        apply_resize_transform: Whether to apply ResizeShortestEdge transform.

    Returns:
        Image as numpy array (1, C, H, W) in RGB format, uint8 dtype.

    Raises:
        FileNotFoundError: If the file at image_path does not exist.
    """
    # PIL raises on failure (FileNotFoundError for a missing path,
    # UnidentifiedImageError for unreadable data) and never returns None,
    # so no None check is needed here — that idiom belongs to cv2.imread.
    img = Image.open(image_path).convert('RGB')

    # PIL's resize takes (width, height), matching target_size's layout.
    img = img.resize(target_size)
    img = np.array(img)

    if apply_resize_transform:
        # NOTE(review): ResizeShortestEdge reads orig_size as (height, width)
        # while (width, height) is passed here; its internally swapped
        # new_size makes the two swaps cancel for these defaults — confirm
        # before changing either side.
        resize_instance = ResizeShortestEdge(
            orig_size=(target_size[0], target_size[1]),
            short_edge_length=240,
            max_size=320,
        )
        img = resize_instance.apply_image(img)

    # HWC -> CHW, contiguous for downstream consumers.
    image = np.ascontiguousarray(img.transpose(2, 0, 1))

    # Add the batch dimension -> (1, C, H, W).
    image = image[np.newaxis, ...]

    return image
|
|
|
|
|
class DatasetConstants:
    """Constants for Front3D dataset."""
    # Voxel grid dimensions (nx, ny, nz). NOTE(review): stored as a list
    # here but as a tuple in the module-level DEFAULT_GRID_DIMS — confirm
    # whether any caller depends on the difference.
    DEFAULT_GRID_DIMS = [256, 256, 256]
    # Camera depth range in metres (near, far).
    DEFAULT_DEPTH_RANGE = (0.4, 6.0)
    # Edge length of one voxel in metres.
    DEFAULT_VOXEL_SIZE = 0.03
    # Image size as (height, width) pixels.
    DEFAULT_IMG_SIZE = (240, 320)
    # Label value excluded from training/evaluation.
    IGNORE_LABEL = 255

    # Default 4x4 camera intrinsics (module-level constant).
    INTRINSIC = DEFAULT_INTRINSIC

    # Panoptic categories: "isthing" == 1 marks instance ("thing") classes,
    # 0 marks amorphous "stuff" classes; "color" is the RGB visualization
    # color and "trainId" the contiguous training label.
    CATEGORIES = [
        {"color": (220, 20, 60), "isthing": 1, "id": 1, "trainId": 1, "name": "cabinet"},
        {"color": (255, 0, 0), "isthing": 1, "id": 2, "trainId": 2, "name": "bed"},
        {"color": (0, 0, 142), "isthing": 1, "id": 3, "trainId": 3, "name": "chair"},
        {"color": (0, 0, 70), "isthing": 1, "id": 4, "trainId": 4, "name": "sofa"},
        {"color": (0, 60, 100), "isthing": 1, "id": 5, "trainId": 5, "name": "table"},
        {"color": (0, 80, 100), "isthing": 1, "id": 6, "trainId": 6, "name": "desk"},
        {"color": (0, 0, 230), "isthing": 1, "id": 7, "trainId": 7, "name": "dresser"},
        {"color": (119, 11, 32), "isthing": 1, "id": 8, "trainId": 8, "name": "lamp"},
        {"color": (190, 50, 60), "isthing": 1, "id": 9, "trainId": 9, "name": "other"},
        {"color": (102, 102, 156), "isthing": 0, "id": 10, "trainId": 10, "name": "wall"},
        {"color": (128, 64, 128), "isthing": 0, "id": 11, "trainId": 11, "name": "floor"},
        {"color": (70, 70, 70), "isthing": 0, "id": 12, "trainId": 12, "name": "ceiling"},
    ]

    # Train IDs treated as stuff. NOTE(review): CATEGORIES also marks
    # ceiling (id 12) with isthing == 0, yet it is absent here — confirm
    # the omission is intentional.
    STUFF_CLASSES = [10, 11]
|
|
|
|
|
|