| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| from __future__ import annotations |
|
|
| import os |
| import numpy as np |
| import trimesh |
|
|
| from ...specs import Prediction |
| from ...utils.logger import logger |
|
|
| from .depth_vis import export_to_depth_vis |
|
|
|
|
def set_sky_depth(prediction: Prediction, sky_mask: np.ndarray, sky_depth_def: float = 98.0):
    """Fill sky pixels of ``prediction.depth`` with a far depth taken from the scene.

    The fill value is the ``sky_depth_def``-th percentile of the depth values at
    all non-sky pixels. ``prediction.depth`` is modified in place. When no
    non-sky pixel exists there is no reference depth and nothing is changed.

    Args:
        prediction: Object whose ``depth`` array is updated in place.
        sky_mask: Mask used to index ``prediction.depth``; truthy entries mark
            sky. It is inverted with ``~``, so a boolean array is expected.
        sky_depth_def: Percentile (0-100) of the non-sky depths used as fill value.
    """
    scene_depth = prediction.depth[~sky_mask]
    if scene_depth.size == 0:
        # Everything is sky: no reference depth to derive a fill value from.
        return
    prediction.depth[sky_mask] = np.percentile(scene_depth, sky_depth_def)
|
|
|
|
def get_conf_thresh(
    prediction: Prediction,
    sky_mask: np.ndarray,
    conf_thresh: float,
    conf_thresh_percentile: float = 10.0,
    ensure_thresh_percentile: float = 90.0,
):
    """Clamp a confidence threshold between two percentiles of the confidence map.

    The percentiles are computed over the non-sky pixels of ``prediction.conf``
    when a sky mask is given and leaves more than 10 non-sky pixels; otherwise
    they are computed over the whole confidence array.

    Args:
        prediction: Object exposing the ``conf`` array.
        sky_mask: Optional mask (``None`` allowed) whose truthy entries mark sky.
        conf_thresh: Requested threshold before clamping.
        conf_thresh_percentile: Percentile giving the lower bound of the result.
        ensure_thresh_percentile: Percentile giving the upper bound of the result.

    Returns:
        ``conf_thresh`` clamped to ``[lower percentile, upper percentile]``.
    """
    enough_non_sky = sky_mask is not None and (~sky_mask).sum() > 10
    samples = prediction.conf[~sky_mask] if enough_non_sky else prediction.conf

    floor = np.percentile(samples, conf_thresh_percentile)
    ceiling = np.percentile(samples, ensure_thresh_percentile)
    # Raise to the floor first, then cap at the ceiling (the cap wins on conflict).
    return min(max(conf_thresh, floor), ceiling)
|
|
|
|
def export_to_glb(
    prediction: Prediction,
    export_dir: str,
    num_max_points: int = 1_000_000,
    conf_thresh: float = 1.05,
    filter_black_bg: bool = False,
    filter_white_bg: bool = False,
    conf_thresh_percentile: float = 40.0,
    ensure_thresh_percentile: float = 90.0,
    sky_depth_def: float = 98.0,
    show_cameras: bool = True,
    camera_size: float = 0.03,
    export_depth_vis: bool = True,
) -> str:
    """Generate a 3D point cloud and camera wireframes and export them as a ``.glb`` file.

    The function builds a point cloud from the predicted depth maps, aligns it to the
    first camera in glTF coordinates (X-right, Y-up, Z-backward), optionally draws
    camera wireframes, and writes the result to ``scene.glb``. Auxiliary assets such as
    depth visualizations can also be generated alongside the main export.

    Note:
        ``prediction`` is modified in place: sky pixels of ``prediction.depth`` are
        overwritten when a ``sky_mask`` attribute is present, and
        ``prediction.conf`` is overwritten at background pixels when
        ``filter_black_bg`` / ``filter_white_bg`` are enabled.

    Args:
        prediction: Model prediction containing depth, confidence, intrinsics, extrinsics,
            and pre-processed images.
        export_dir: Output directory where the glTF assets will be written.
        num_max_points: Maximum number of points retained after downsampling.
        conf_thresh: Base confidence threshold used before percentile adjustments.
        filter_black_bg: Mark near-black background pixels for removal during confidence filtering.
        filter_white_bg: Mark near-white background pixels for removal during confidence filtering.
        conf_thresh_percentile: Lower percentile used when adapting the confidence threshold.
        ensure_thresh_percentile: Upper percentile clamp for the adaptive threshold.
        sky_depth_def: Percentile used to fill sky pixels with plausible depth values.
        show_cameras: Whether to render camera wireframes in the exported scene.
        camera_size: Relative camera wireframe scale as a fraction of the scene diagonal.
        export_depth_vis: Whether to export raster depth visualisations alongside the glTF.

    Returns:
        Path to the exported ``scene.glb`` file.

    Raises:
        ValueError: If any of ``processed_images``, ``depth``, ``intrinsics``,
            ``extrinsics`` or ``conf`` is missing from ``prediction``.
    """
    import shutil  # function-scope import: only needed for the preview copy below

    # Validate with real exceptions; ``assert`` statements vanish under ``python -O``
    # and would let a missing field surface later as an obscure error.
    for field in ("processed_images", "depth", "intrinsics", "extrinsics", "conf"):
        if getattr(prediction, field, None) is None:
            raise ValueError(f"Export to GLB: prediction.{field} is required but not available")

    logger.info(f"conf_thresh_percentile: {conf_thresh_percentile}")
    logger.info(f"num max points: {num_max_points}")
    logger.info(f"Exporting to GLB with num_max_points: {num_max_points}")

    # 1) Colours come straight from the already pre-processed images.
    images_u8 = prediction.processed_images

    # 2) Sky handling: push sky pixels to a far, scene-derived depth (in place).
    sky_mask = getattr(prediction, "sky_mask", None)
    if sky_mask is not None:
        set_sky_depth(prediction, sky_mask, sky_depth_def)

    # 3) Confidence threshold. Background pixels get confidence 1.0, which lies
    #    below the default ``conf_thresh`` of 1.05, so the ``conf >= threshold``
    #    test drops them unless the adaptive threshold ends up at or below 1.0.
    if filter_black_bg:
        prediction.conf[(prediction.processed_images < 16).all(axis=-1)] = 1.0
    if filter_white_bg:
        prediction.conf[(prediction.processed_images >= 240).all(axis=-1)] = 1.0
    conf_thr = get_conf_thresh(
        prediction,
        sky_mask,
        conf_thresh,
        conf_thresh_percentile,
        ensure_thresh_percentile,
    )

    # 4) Back-project every valid pixel to a coloured world-space point.
    points, colors = _depths_to_world_points_with_colors(
        prediction.depth,
        prediction.intrinsics,
        prediction.extrinsics,
        images_u8,
        prediction.conf,
        conf_thr,
    )

    # 5) Alignment: first-camera frame, glTF axes, centred on the point median.
    A = _compute_alignment_transform_first_cam_glTF_center_by_points(
        prediction.extrinsics[0], points
    )
    if points.shape[0] > 0:
        points = trimesh.transform_points(points, A)

    # 6) Drop non-finite points and randomly subsample to the point budget.
    points, colors = _filter_and_downsample(points, colors, num_max_points)

    # 7) Assemble the scene; the alignment is stored in the metadata so that
    #    ``_add_cameras_to_scene`` can apply the same transform to the wireframes.
    scene = trimesh.Scene()
    if scene.metadata is None:
        scene.metadata = {}
    scene.metadata["hf_alignment"] = A

    if points.shape[0] > 0:
        pc = trimesh.points.PointCloud(vertices=points, colors=colors)
        scene.add_geometry(pc)

    # 8) Camera wireframes, sized relative to the extent of the point cloud.
    if show_cameras:
        scene_scale = _estimate_scene_scale(points, fallback=1.0)
        H, W = prediction.depth.shape[1:]
        _add_cameras_to_scene(
            scene=scene,
            K=prediction.intrinsics,
            ext_w2c=prediction.extrinsics,
            image_sizes=[(H, W)] * prediction.depth.shape[0],
            scale=scene_scale * camera_size,
        )

    # 9) Write the GLB.
    os.makedirs(export_dir, exist_ok=True)
    out_path = os.path.join(export_dir, "scene.glb")
    scene.export(out_path)

    # 10) Optional depth visualisations; the first one doubles as ``scene.jpg``.
    if export_depth_vis:
        export_to_depth_vis(prediction, export_dir)
        # Copy with shutil instead of a shell ``cp``: safe for paths containing
        # spaces or shell metacharacters and works on platforms without ``cp``.
        preview_src = os.path.join(export_dir, "depth_vis", "0000.jpg")
        if os.path.isfile(preview_src):
            shutil.copyfile(preview_src, os.path.join(export_dir, "scene.jpg"))
        else:
            # Stay best-effort like the old shell copy, but do not fail silently.
            logger.info(f"Skipping scene.jpg preview: {preview_src} not found")
    return out_path
|
|
|
|
| |
| |
| |
|
|
|
|
def _as_homogeneous44(ext: np.ndarray) -> np.ndarray:
    """Promote an extrinsic matrix to 4x4 homogeneous form.

    A (4,4) input is returned unchanged (the same object, not a copy). A (3,4)
    input is written into the top three rows of a new identity matrix of the
    same dtype, so the appended bottom row is ``[0, 0, 0, 1]``.

    Raises:
        ValueError: If ``ext`` is neither (4,4) nor (3,4).
    """
    shape = ext.shape
    if shape not in ((4, 4), (3, 4)):
        raise ValueError(f"extrinsic must be (4,4) or (3,4), got {ext.shape}")
    if shape == (4, 4):
        return ext
    full = np.eye(4, dtype=ext.dtype)
    full[:3] = ext
    return full
|
|
|
|
def _depths_to_world_points_with_colors(
    depth: np.ndarray,
    K: np.ndarray,
    ext_w2c: np.ndarray,
    images_u8: np.ndarray,
    conf: np.ndarray | None,
    conf_thr: float,
) -> tuple[np.ndarray, np.ndarray]:
    """Back-project depth maps into coloured world-space points.

    For every frame, each pixel ``(u, v, 1)`` is mapped through ``K^{-1}`` to a
    ray, scaled by its depth to obtain a camera-frame point, and moved to the
    world frame with the inverse of the world-to-camera extrinsic. The matching
    pixel colour is collected alongside.

    A pixel is kept when its depth is finite and positive and, if ``conf`` is
    given, its confidence is at least ``conf_thr``.

    Args:
        depth: Depth maps, unpacked as ``(N, H, W)``.
        K: Per-frame intrinsics, indexed by frame and inverted as a matrix.
        ext_w2c: Per-frame world-to-camera extrinsics, each (4,4) or (3,4).
        images_u8: Per-frame images, reshaped to ``(-1, 3)`` per frame.
        conf: Optional per-frame confidence maps indexed like ``depth``.
        conf_thr: Minimum confidence for a pixel to be kept.

    Returns:
        ``(points, colors)``: float32 ``(M, 3)`` world points and uint8 ``(M, 3)``
        colours; both have zero rows when no pixel survives.
    """
    num_frames, height, width = depth.shape
    grid_u, grid_v = np.meshgrid(np.arange(width), np.arange(height))
    # Homogeneous pixel coordinates (u, v, 1), one row per pixel in row-major order.
    pixels_h = np.stack([grid_u, grid_v, np.ones_like(grid_u)], axis=-1).reshape(-1, 3)

    world_chunks = []
    color_chunks = []
    for frame in range(num_frames):
        frame_depth = depth[frame]
        keep = np.isfinite(frame_depth) & (frame_depth > 0)
        if conf is not None:
            keep &= conf[frame] >= conf_thr
        if not keep.any():
            continue

        # Flat indices of surviving pixels; shared by depth, rays and colours.
        keep_idx = np.flatnonzero(keep.reshape(-1))
        kept_depth = frame_depth.reshape(-1)[keep_idx]

        cam_from_pix = np.linalg.inv(K[frame])
        world_from_cam = np.linalg.inv(_as_homogeneous44(ext_w2c[frame]))

        # Ray per pixel (3, M), scaled by depth -> camera-frame points.
        cam_pts = (cam_from_pix @ pixels_h[keep_idx].T) * kept_depth[None, :]
        cam_pts_h = np.vstack([cam_pts, np.ones((1, cam_pts.shape[1]))])
        world_chunks.append((world_from_cam @ cam_pts_h)[:3].T.astype(np.float32))
        color_chunks.append(images_u8[frame].reshape(-1, 3)[keep_idx].astype(np.uint8))

    if not world_chunks:
        return np.zeros((0, 3), dtype=np.float32), np.zeros((0, 3), dtype=np.uint8)
    return np.concatenate(world_chunks, 0), np.concatenate(color_chunks, 0)
|
|
|
|
def _filter_and_downsample(points: np.ndarray, colors: np.ndarray, num_max: int):
    """Remove non-finite points and randomly subsample down to ``num_max``.

    Rows of ``points`` containing NaN or infinity are dropped together with
    their colours. If more than ``num_max`` rows remain, ``num_max`` of them are
    drawn without replacement using NumPy's global random state.

    Returns:
        The filtered ``(points, colors)`` pair. Empty input is returned as is.
    """
    if points.shape[0] == 0:
        return points, colors

    finite_rows = np.isfinite(points).all(axis=1)
    points = points[finite_rows]
    colors = colors[finite_rows]

    remaining = points.shape[0]
    if remaining <= num_max:
        return points, colors

    picked = np.random.choice(remaining, num_max, replace=False)
    return points[picked], colors[picked]
|
|
|
|
def _estimate_scene_scale(points: np.ndarray, fallback: float = 1.0) -> float:
    """Estimate the scene size as the diagonal of an outlier-trimmed bounding box.

    The box spans the 5th to the 95th percentile of the points along each axis,
    and its Euclidean diagonal is returned. ``fallback`` is returned when fewer
    than two points are given or when the diagonal is zero or not finite.
    """
    if points.shape[0] < 2:
        return fallback

    box_min, box_max = (np.percentile(points, q, axis=0) for q in (5, 95))
    extent = np.linalg.norm(box_max - box_min)
    if np.isfinite(extent) and extent > 0:
        return float(extent)
    return float(fallback)
|
|
|
|
def _compute_alignment_transform_first_cam_glTF_center_by_points(
    ext_w2c0: np.ndarray,
    points_world: np.ndarray,
) -> np.ndarray:
    """Build the 4x4 transform that aligns the scene with glTF conventions.

    The returned matrix ``A`` (applied as ``X' = A @ [X; 1]``) composes three steps:

    1. **First-camera alignment**: world points are expressed in the first
       camera's frame (x-right, y-down, z-forward) via its world-to-camera
       extrinsic.
    2. **Axis conversion**: the Y and Z axes are flipped to obtain the glTF
       frame (x-right, y-up, z-backward).
    3. **Centering**: the result is translated so that the per-axis median of
       the transformed point cloud lands on the origin. With no points the
       translation is zero.

    Args:
        ext_w2c0: World-to-camera extrinsic of the first camera, (4,4) or (3,4).
        points_world: World-space points with one point per row.

    Returns:
        The 4x4 float64 alignment matrix as an ``np.ndarray``.
    """
    first_cam_from_world = _as_homogeneous44(ext_w2c0).astype(np.float64)

    # CV camera axes -> glTF axes: negate Y and Z, keep X.
    cv_to_gltf = np.diag(np.array([1.0, -1.0, -1.0, 1.0], dtype=np.float64))
    uncentered = cv_to_gltf @ first_cam_from_world

    # The centre is measured after steps 1-2, so it lives in the target frame.
    if points_world.shape[0] > 0:
        origin = np.median(trimesh.transform_points(points_world, uncentered), axis=0)
    else:
        origin = np.zeros(3, dtype=np.float64)

    recenter = np.eye(4, dtype=np.float64)
    recenter[:3, 3] = -origin
    return recenter @ uncentered
|
|
|
|
def _add_cameras_to_scene(
    scene: trimesh.Scene,
    K: np.ndarray,
    ext_w2c: np.ndarray,
    image_sizes: list[tuple[int, int]],
    scale: float,
) -> None:
    """Add one wireframe frustum per camera to ``scene``.

    Each camera is drawn as a pyramid from its centre to the corners of its
    image plane. The wireframes are transformed with the matrix stored under
    ``scene.metadata["hf_alignment"]`` so they line up with the point cloud;
    when that entry is missing or cannot be read, the identity is used.

    Args:
        scene: Scene that receives the wireframe geometry (modified in place).
        K: Per-camera intrinsics; its first dimension is the camera count.
        ext_w2c: Per-camera world-to-camera extrinsics.
        image_sizes: Per-camera ``(H, W)`` image size.
        scale: Frustum size passed through to ``_camera_frustum_lines``.
    """
    num_cams = K.shape[0]
    if num_cams == 0:
        return

    # Best-effort lookup: any failure while reading the metadata falls back to
    # the identity instead of aborting the export.
    try:
        meta = scene.metadata
        align = meta.get("hf_alignment", None) if meta else None
    except Exception:
        align = None
    if align is None:
        align = np.eye(4, dtype=np.float64)

    for cam in range(num_cams):
        height, width = image_sizes[cam]
        world_segs = _camera_frustum_lines(K[cam], ext_w2c[cam], width, height, scale)

        # Flatten to a point list for the transform, then restore (segment, end, xyz).
        aligned = trimesh.transform_points(world_segs.reshape(-1, 3), align)
        wire = trimesh.load_path(aligned.reshape(-1, 2, 3))
        if hasattr(wire, "colors"):
            # One colour row per path entity, all in this camera's colour.
            wire.colors = np.tile(_index_color_rgb(cam, num_cams), (len(wire.entities), 1))
        scene.add_geometry(wire)
|
|
|
|
def _camera_frustum_lines(
    K: np.ndarray, ext_w2c: np.ndarray, W: int, H: int, scale: float
) -> np.ndarray:
    """Return the world-space line segments of one camera's wireframe frustum.

    The four image corners are back-projected through ``K^{-1}``, normalised to
    unit z and multiplied by ``scale``, which places the frustum base at depth
    ``scale`` in the camera frame. Apex and base corners are then moved to the
    world frame with the inverse extrinsic.

    Args:
        K: 3x3 intrinsic matrix of the camera.
        ext_w2c: World-to-camera extrinsic, (4,4) or (3,4).
        W: Image width in pixels.
        H: Image height in pixels.
        scale: Camera-frame depth of the frustum base.

    Returns:
        Array of shape ``(8, 2, 3)``: four apex-to-corner segments followed by
        the four edges of the base rectangle.
    """
    right, bottom = W - 1, H - 1
    corner_px = np.array(
        [
            [0, 0, 1.0],
            [right, 0, 1.0],
            [right, bottom, 1.0],
            [0, bottom, 1.0],
        ],
        dtype=float,
    )

    pix_to_cam = np.linalg.inv(K)
    cam_to_world = np.linalg.inv(_as_homogeneous44(ext_w2c))

    # Camera centre in world coordinates (the frustum apex).
    apex = (cam_to_world @ np.array([0, 0, 0, 1.0]))[:3]

    dirs = (pix_to_cam @ corner_px.T).T
    # Avoid dividing by zero: a zero z component is replaced by 1 in the ray itself.
    dirs[dirs[:, 2] == 0, 2] = 1.0
    base_cam = (dirs / dirs[:, 2:3]) * scale

    base_world = np.stack(
        [(cam_to_world @ np.append(corner, 1.0))[:3] for corner in base_cam], 0
    )

    # Apex -> each corner, then the closed loop around the base rectangle.
    edges = [np.stack([apex, corner], 0) for corner in base_world]
    edges += [np.stack([base_world[k], base_world[(k + 1) % 4]], 0) for k in range(4)]
    return np.stack(edges, 0)
|
|
|
|
def _index_color_rgb(i: int, n: int) -> np.ndarray:
    """Return a distinct uint8 RGB colour for item ``i`` out of ``n``.

    Hues are spread evenly over the colour wheel, using the centre of the
    ``i``-th of ``n`` equal hue slots, at fixed saturation 0.85 and value 0.95.
    ``n`` values below 1 are treated as 1.
    """
    hue = (i + 0.5) / max(n, 1)
    rgb_unit = np.array(_hsv_to_rgb(hue, 0.85, 0.95))
    # Scale 0..1 floats to 0..255; the uint8 cast truncates the fraction.
    return (rgb_unit * 255).astype(np.uint8)
|
|
|
|
def _hsv_to_rgb(h: float, s: float, v: float) -> tuple[float, float, float]:
    """Convert an HSV colour to RGB.

    Delegates to the standard library's ``colorsys.hsv_to_rgb`` instead of
    carrying a hand-written copy of the same sextant-based conversion.

    Args:
        h: Hue as a fraction of a full turn; values outside ``[0, 1)`` wrap
            around the six hue sectors.
        s: Saturation in ``[0, 1]``.
        v: Value (brightness) in ``[0, 1]``.

    Returns:
        The ``(r, g, b)`` components, each in ``[0, 1]`` for in-range inputs.
    """
    import colorsys  # function-scope import keeps the module's import block untouched

    return colorsys.hsv_to_rgb(h, s, v)
|
|