File size: 11,653 Bytes
911b379
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
#!/usr/bin/env python3
# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Preprocessing utilities for Panoptic Recon 3D model.

This module provides functions for:
- Image preprocessing and resizing
- Frustum mask generation
- Camera intrinsic handling
"""
import sys
from fvcore.transforms.transform import Transform
from typing import Optional, Tuple, Union
import numpy as np
import torch
import cv2
from PIL import Image


# Default Front3D camera intrinsic matrix (4x4, homogeneous form).
# Focal lengths sit at [0, 0] / [1, 1] and the principal point at [0, 2] / [1, 2].
DEFAULT_INTRINSIC = np.array([
    [277.1281435, 0., 159.5, 0.],
    [0., 277.1281435, 119.5, 0.],
    [0., 0., 1., 0.],
    [0., 0., 0., 1.]
], dtype=np.float32)

# Default model parameters
DEFAULT_GRID_DIMS = (256, 256, 256)  # voxel counts along (x, y, z)
DEFAULT_DEPTH_RANGE = (0.4, 6.0)  # (min, max) depth in meters
DEFAULT_VOXEL_SIZE = 0.03  # voxel edge length in meters
DEFAULT_IMG_SIZE = (240, 320)  # (height, width)


def create_frustum_mask(
    intrinsics: Union[np.ndarray, torch.Tensor],
    volume_shape: Tuple[int, int, int] = DEFAULT_GRID_DIMS,
    depth_range: Tuple[float, float] = DEFAULT_DEPTH_RANGE,
    image_shape: Optional[Tuple[int, int]] = DEFAULT_IMG_SIZE,
    voxel_size: float = DEFAULT_VOXEL_SIZE,
    padding_pixels: float = 0.0,
    volume_origin: Optional[np.ndarray] = None,
    z_axis_reversed: bool = False,
) -> np.ndarray:
    """
    Create a frustum mask for a voxel volume based on camera intrinsics.

    A voxel is marked visible when its coordinate (grid index * voxel_size +
    volume_origin) lies inside ``depth_range`` and its pinhole projection
    ``(fx * x / z + cx, fy * y / z + cy)`` falls inside the (optionally padded)
    image rectangle.

    The projection along the image u-axis depends only on (x, z) and the one
    along the v-axis only on (y, z), so the mask is assembled from two 2-D
    projections that are broadcast against each other. This yields the same
    result as projecting every voxel individually while avoiding full-volume
    floating-point temporaries.

    Args:
        intrinsics: Camera intrinsic matrix (3x3 or 4x4). Tensors are detached
            and moved to the CPU before use.
        volume_shape: Shape of the voxel volume (nx, ny, nz).
        depth_range: Min and max depth in meters (z_min, z_max).
        image_shape: Image dimensions (height, width). If None, it is inferred
            from the principal point as (int(2 * cy), int(2 * cx)).
        voxel_size: Size of each voxel in meters.
        padding_pixels: Expand frustum bounds by this many pixels.
        volume_origin: Origin of the volume in camera space. If None, the
            volume is centered on the optical axis in x/y and on the middle of
            ``depth_range`` in z.
        z_axis_reversed: If True, z-index 0 is farthest.

    Returns:
        frustum_mask: Boolean mask of shape volume_shape indicating voxels inside frustum.

    Raises:
        AssertionError: If the intrinsics shape, voxel size or depth range is invalid.
    """
    # Convert to numpy if tensor. detach() is required because numpy() refuses
    # tensors that are part of an autograd graph.
    if isinstance(intrinsics, torch.Tensor):
        intrinsics = intrinsics.detach().cpu().numpy()

    # Ensure numpy array
    intrinsics = np.asarray(intrinsics, dtype=np.float64)

    assert intrinsics.shape in [(3, 3), (4, 4)], \
        f"Intrinsics must be 3x3 or 4x4, got shape {intrinsics.shape}"
    assert voxel_size > 0, f"voxel_size must be positive, got {voxel_size}"
    assert depth_range[0] < depth_range[1], \
        f"depth_range must be (min, max) with min < max, got {depth_range}"
    assert depth_range[0] > 0, f"depth_range min must be positive, got {depth_range[0]}"

    # Extract camera parameters
    K = intrinsics[:3, :3] if intrinsics.shape == (4, 4) else intrinsics
    fx, fy = K[0, 0], K[1, 1]
    cx, cy = K[0, 2], K[1, 2]

    # Determine image shape
    if image_shape is None:
        image_height = int(2 * cy)
        image_width = int(2 * cx)
    else:
        image_height, image_width = image_shape

    # Image bounds with padding
    u_min = -padding_pixels
    u_max = image_width + padding_pixels
    v_min = -padding_pixels
    v_max = image_height + padding_pixels

    # Set volume origin
    if volume_origin is None:
        volume_origin = np.array([
            -(volume_shape[0] * voxel_size) / 2,
            -(volume_shape[1] * voxel_size) / 2,
            (depth_range[0] + depth_range[1]) / 2 - (volume_shape[2] * voxel_size) / 2
        ])

    # Per-axis voxel coordinates
    x_coords = np.arange(volume_shape[0]) * voxel_size + volume_origin[0]
    y_coords = np.arange(volume_shape[1]) * voxel_size + volume_origin[1]
    z_coords = np.arange(volume_shape[2]) * voxel_size + volume_origin[2]

    if z_axis_reversed:
        z_coords = z_coords[::-1]

    # Depth constraint, evaluated once per z-slice. Slices at (or behind) the
    # camera plane are never visible.
    valid_depth = z_coords > 1e-6
    depth_ok = valid_depth & (z_coords >= depth_range[0]) & (z_coords <= depth_range[1])

    # Substitute a harmless divisor for invalid slices; they are already
    # excluded through depth_ok, this only avoids dividing by zero.
    safe_z = np.where(valid_depth, z_coords, 1.0)

    # Project to image plane: u has shape (nx, nz), v has shape (ny, nz).
    u = fx * x_coords[:, np.newaxis] / safe_z + cx
    v = fy * y_coords[:, np.newaxis] / safe_z + cy

    # Image bounds check
    u_ok = (u >= u_min) & (u < u_max)
    v_ok = (v >= v_min) & (v < v_max)

    # Combine masks by broadcasting to (nx, ny, nz)
    frustum_mask = u_ok[:, np.newaxis, :] & v_ok[np.newaxis, :, :] & depth_ok

    return frustum_mask


def get_output_shape(
    oldh: int,
    oldw: int,
    short_edge_length: int,
    max_size: int
) -> Tuple[int, int]:
    """Return the (height, width) obtained by rescaling to a target short edge.

    The shorter input side is scaled to ``short_edge_length`` and the other
    side follows with the same ratio. If the longer resulting side exceeds
    ``max_size``, both sides are shrunk again so that it equals ``max_size``.
    The final values are rounded half-up to integers.
    """
    target_short = short_edge_length * 1.0
    ratio = target_short / min(oldh, oldw)

    # The shorter side lands exactly on the target; the other one is scaled.
    if oldh < oldw:
        out_h, out_w = target_short, ratio * oldw
    else:
        out_h, out_w = ratio * oldh, target_short

    longest = max(out_h, out_w)
    if longest > max_size:
        shrink = max_size * 1.0 / longest
        out_h = out_h * shrink
        out_w = out_w * shrink

    return (int(out_h + 0.5), int(out_w + 0.5))


class ResizeShortestEdge(Transform):
    """Resize arrays so that the shorter edge matches a sampled target length.

    The output size is sampled once, at construction time, and reused for every
    subsequent ``apply_*`` call.

    NOTE(review): ``orig_size`` is unpacked as the ``(oldh, oldw)`` arguments of
    ``get_output_shape`` and the result is stored in reversed order, while
    ``apply_image`` reads ``new_size`` as (height, width). The two swaps cancel
    only when ``orig_size`` is given as (width, height), which is what the
    in-file caller ``load_image`` passes. Confirm before passing
    (height, width).
    """

    def __init__(
        self,
        orig_size: Tuple[int, int],
        short_edge_length,
        max_size=sys.maxsize,
        interp=cv2.INTER_LINEAR,
        prob=1.0
    ):
        """Store the configuration and sample the output size.

        Args:
            orig_size: Size of the input, see the class-level note on ordering.
            short_edge_length: Either a single int or a sequence of candidate
                lengths from which one is drawn at random. A drawn value of 0
                disables resizing.
            max_size: Upper bound for the longer output edge.
            interp: Default OpenCV interpolation flag used by ``apply_image``.
            prob: Stored on the instance; not used by this class itself.
        """
        super().__init__()
        self.orig_size = orig_size
        if isinstance(short_edge_length, int):
            short_edge_length = (short_edge_length, short_edge_length)
        self.short_edge_length = short_edge_length
        self.max_size = max_size
        self.interp = interp
        self.prob = prob
        self._get_output_shape()

    def _get_output_shape(self):
        """Sample a short-edge length and cache the output size in ``new_size``.

        ``new_size`` stays None when the sampled length is 0, which marks the
        transform as a no-op.
        """
        h, w = self.orig_size
        self.new_size = None
        size = np.random.choice(self.short_edge_length)
        if size != 0:
            hh, ww = get_output_shape(h, w, size, self.max_size)
            self.new_size = (ww, hh)

    def _resize(self, array, interpolation):
        """Resize ``array`` to ``new_size``; return it unchanged for a no-op."""
        if self.new_size is None:
            # A sampled short edge of 0 means "do not resize".
            return array
        new_h, new_w = self.new_size
        # cv2.resize expects the destination size as (width, height).
        return cv2.resize(array, (new_w, new_h), interpolation=interpolation)

    def apply_coords(self, coords):
        """Return the coordinates unchanged (they are not rescaled)."""
        return coords

    def apply_image(self, img, interp=None):
        """Resize an image.

        Args:
            img: Image array accepted by ``cv2.resize``.
            interp: Optional OpenCV interpolation flag overriding the one
                given at construction time.
        """
        # Compare against None explicitly: cv2.INTER_NEAREST is 0 and falsy.
        interpolation = self.interp if interp is None else interp
        return self._resize(img, interpolation)

    def apply_segmentation(self, segmentation):
        """Resize a label map with nearest-neighbour interpolation."""
        return self._resize(segmentation, cv2.INTER_NEAREST)


def adjust_intrinsic(
    intrinsic: Union[np.ndarray, torch.Tensor],
    original_size: Tuple[int, int],
    target_size: Tuple[int, int],
) -> Union[np.ndarray, torch.Tensor]:
    """Adjust intrinsic matrix for image resize.

    The focal lengths and the principal point are multiplied by the per-axis
    resize factors; all other entries are left as they are. The input is never
    modified.

    Args:
        intrinsic: Camera intrinsic matrix (4x4 or 3x3).
        original_size: Original image size (width, height).
        target_size: Target image size (width, height).

    Returns:
        Adjusted intrinsic matrix. A tensor input yields a new tensor on the
        same device and with the same dtype (detached from any autograd
        graph); an array input yields a new array.

    Raises:
        ZeroDivisionError: If an entry of ``original_size`` is 0.
    """
    is_tensor = isinstance(intrinsic, torch.Tensor)
    if is_tensor:
        device = intrinsic.device
        dtype = intrinsic.dtype
        # detach() is required because numpy() refuses tensors that require grad.
        intrinsic = intrinsic.detach().cpu().numpy()

    # Work on a copy: for CPU tensors the numpy view shares memory with the input.
    intrinsic = intrinsic.copy()

    scale_x = target_size[0] / original_size[0]
    scale_y = target_size[1] / original_size[1]

    # Adjust focal length and principal point
    intrinsic[0, 0] *= scale_x  # fx
    intrinsic[1, 1] *= scale_y  # fy
    intrinsic[0, 2] *= scale_x  # cx
    intrinsic[1, 2] *= scale_y  # cy

    if is_tensor:
        intrinsic = torch.from_numpy(intrinsic).to(device=device, dtype=dtype)

    return intrinsic


def load_image(
    image_path: str,
    target_size: Tuple[int, int] = (320, 240),
    apply_resize_transform: bool = True,
) -> np.ndarray:
    """Load and preprocess image for Panoptic Recon 3D inference.

    The steps are intended to mirror the preprocessing in
    test_triton_server.py (not part of this module):
    1. Load image as RGB
    2. Resize to target_size (default 320x240)
    3. Apply ResizeShortestEdge transform (short_edge=240, max_size=320)
    4. Convert to CHW format with batch dimension

    Args:
        image_path: Path to image file.
        target_size: Target size (width, height). Default (320, 240).
        apply_resize_transform: Whether to apply ResizeShortestEdge transform.

    Returns:
        Image as numpy array (1, C, H, W) in RGB format, uint8 dtype.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist (raised by PIL).
        OSError: If the file cannot be opened or decoded as an image (raised by PIL).
    """
    # Image.open signals failure by raising, never by returning None, so no
    # explicit None check is needed. The context manager releases the file
    # handle as soon as the pixel data has been converted.
    with Image.open(image_path) as source:
        img = source.convert('RGB')

    # Resize to target size
    img = img.resize(target_size)
    img = np.array(img)

    # Apply ResizeShortestEdge transform (matches test_triton_server.py)
    if apply_resize_transform:
        resize_instance = ResizeShortestEdge(
            orig_size=(target_size[0], target_size[1]),  # (width, height)
            short_edge_length=240,
            max_size=320,
        )
        img = resize_instance.apply_image(img)

    # Convert to CHW format with contiguous memory (critical for torch.from_numpy)
    image = np.ascontiguousarray(img.transpose(2, 0, 1))

    # Add batch dimension: (C, H, W) -> (1, C, H, W)
    image = image[np.newaxis, ...]

    return image

class DatasetConstants:
    """Constants for Front3D dataset.

    Groups the default volume/image parameters, the default camera intrinsics
    and the semantic category table in a single namespace.
    """
    # Voxel grid dimensions; note this is a list here.
    DEFAULT_GRID_DIMS = [256, 256, 256]
    # (min, max) pair describing the depth range.
    DEFAULT_DEPTH_RANGE = (0.4, 6.0)
    DEFAULT_VOXEL_SIZE = 0.03
    DEFAULT_IMG_SIZE = (240, 320)  # (height, width)
    # Label value presumably used to mark pixels/voxels to ignore — TODO confirm usage.
    IGNORE_LABEL = 255
    
    # Alias of the module-level default intrinsic matrix (same object, not a copy).
    INTRINSIC = DEFAULT_INTRINSIC
    
    # One dict per category with the keys "color", "isthing", "id", "trainId"
    # and "name". "isthing" is 1 for ids 1-9 and 0 for ids 10-12.
    # NOTE(review): "color" is presumably an RGB triple — confirm channel order.
    CATEGORIES = [
        {"color": (220, 20, 60), "isthing": 1, "id": 1, "trainId": 1, "name": "cabinet"},
        {"color": (255, 0, 0), "isthing": 1, "id": 2, "trainId": 2, "name": "bed"},
        {"color": (0, 0, 142), "isthing": 1, "id": 3, "trainId": 3, "name": "chair"},
        {"color": (0, 0, 70), "isthing": 1, "id": 4, "trainId": 4, "name": "sofa"},
        {"color": (0, 60, 100), "isthing": 1, "id": 5, "trainId": 5, "name": "table"},
        {"color": (0, 80, 100), "isthing": 1, "id": 6, "trainId": 6, "name": "desk"},
        {"color": (0, 0, 230), "isthing": 1, "id": 7, "trainId": 7, "name": "dresser"},
        {"color": (119, 11, 32), "isthing": 1, "id": 8, "trainId": 8, "name": "lamp"},
        {"color": (190, 50, 60), "isthing": 1, "id": 9, "trainId": 9, "name": "other"},
        {"color": (102, 102, 156), "isthing": 0, "id": 10, "trainId": 10, "name": "wall"},
        {"color": (128, 64, 128), "isthing": 0, "id": 11, "trainId": 11, "name": "floor"},
        {"color": (70, 70, 70), "isthing": 0, "id": 12, "trainId": 12, "name": "ceiling"},
    ]
    
    # Category ids treated as "stuff" (wall and floor).
    # NOTE(review): "ceiling" (id 12) is also marked isthing=0 above but is not
    # listed here — confirm whether the omission is intentional.
    STUFF_CLASSES = [10, 11]