Spaces:

dwellbot
/

dwellbot_stream3r

Configuration error

App Files Files Community

brian4dwell committed on Sep 19, 2025

Commit

08c2845

1 Parent(s): 79b8fec

localization

Browse files

Files changed (5) hide show

app.py +31 -52
stream3r/models/__pycache__/stream3r.cpython-311.pyc +0 -0
stream3r/models/stream3r.py +2 -2
stream3r/utils/__pycache__/visual_utils.cpython-311.pyc +0 -0
stream3r/utils/visual_utils.py +30 -0

app.py CHANGED Viewed

@@ -413,10 +413,12 @@ def run_model(target_dir: str, model: STream3R, mode: str="causal", streaming: b
     predictions["intrinsic"] = intrinsic
     # Convert tensors to numpy
-    for key in predictions.keys():
-        if isinstance(predictions[key], torch.Tensor):
-            predictions[key] = predictions[key].cpu().numpy().squeeze(0)  # remove batch dimension
-    predictions['pose_enc_list'] = None # remove pose_enc_list
     # Generate world points from depth map
     print("Computing world points from depth map...")
@@ -671,11 +673,6 @@ def localize_new_image(
     kv_cache_path = os.path.join(target_dir, "kv_cache.pt")
     predictions_path = os.path.join(target_dir, "predictions.npz")
-    if not os.path.exists(kv_cache_path):
-        return ("Streaming cache not found. Run reconstruction in streaming mode first.", gr.update())
-    if not os.path.exists(predictions_path):
-        return ("Predictions not found. Run reconstruction before localizing.", gr.update())
     device = "cuda" if torch.cuda.is_available() else "cpu"
     try:
@@ -683,6 +680,8 @@ def localize_new_image(
     except Exception as exc:
         return (f"Failed to preprocess image: {exc}", gr.update())
     session = getattr(localize_new_image, "_session", None)
     if session is None or session.mode != mode:
         session = StreamSession(model, mode=mode)
@@ -690,6 +689,16 @@ def localize_new_image(
     session.clear()
     try:
         session.load_cache(kv_cache_path, device=image_tensor.device)
@@ -783,62 +792,32 @@ def localize_new_image(
             "Intrinsics matrix:\n" + "\n".join("  " + _format_vector(row) for row in intrinsic)
         )
-    new_frame_data = {}
-    for key, tensor in localized_predictions.items():
-        if isinstance(tensor, torch.Tensor) and tensor.dim() >= 2:
-            new_frame_data[key] = _extract_frame(tensor, new_index)
     try:
-        stored = np.load(predictions_path)
-    except Exception as exc:
-        return (f"Failed to load stored predictions: {exc}", gr.update())
-    temp_predictions = {key: stored[key] for key in stored.files}
-    def _append_prediction(key: str, value: np.ndarray):
-        if key not in temp_predictions or value is None:
-            return
-        base = temp_predictions[key]
-        value_np = value.squeeze(0)
         try:
-            temp_predictions[key] = np.concatenate([base, value_np], axis=0)
         except ValueError:
-            pass
-    for key in [
-        "pose_enc",
-        "world_points",
-        "world_points_conf",
-        "world_points_from_depth",
-        "depth",
-        "depth_conf",
-        "images",
-        "extrinsic",
-        "intrinsic",
-    ]:
-        if key in new_frame_data:
-            _append_prediction(key, new_frame_data[key])
     localization_glb_path = os.path.join(target_dir, "localization_preview.glb")
-    images_array = temp_predictions.get("images")
-    world_array = temp_predictions.get("world_points")
-    if images_array is not None:
-        frame_identifier = images_array.shape[0] - 1
-    elif world_array is not None:
-        frame_identifier = world_array.shape[0] - 1
-    else:
-        frame_identifier = 0
     try:
         glbscene = predictions_to_glb(
-            temp_predictions,
             conf_thres=conf_thres,
-            filter_by_frames=f"{frame_identifier}: localized",
             mask_black_bg=mask_black_bg,
             mask_white_bg=mask_white_bg,
-            show_cam=show_cam,
             mask_sky=mask_sky,
             target_dir=target_dir,
             prediction_mode=prediction_mode,
         )
         glbscene.export(file_obj=localization_glb_path)
     except Exception as exc:

     predictions["intrinsic"] = intrinsic
     # Convert tensors to numpy
+    for key in list(predictions.keys()):
+        value = predictions[key]
+        if isinstance(value, torch.Tensor):
+            predictions[key] = value.cpu().numpy().squeeze(0)  # remove batch dimension
+    predictions.pop("pose_enc_list", None)
     # Generate world points from depth map
     print("Computing world points from depth map...")
     kv_cache_path = os.path.join(target_dir, "kv_cache.pt")
     predictions_path = os.path.join(target_dir, "predictions.npz")
     device = "cuda" if torch.cuda.is_available() else "cpu"
     try:
     except Exception as exc:
         return (f"Failed to preprocess image: {exc}", gr.update())
+    model.eval()
+    model.to(device)
     session = getattr(localize_new_image, "_session", None)
     if session is None or session.mode != mode:
         session = StreamSession(model, mode=mode)
     session.clear()
+    if not os.path.exists(kv_cache_path):
+        if device == "cuda":
+            torch.cuda.empty_cache()
+        return ("Streaming cache not found. Run reconstruction in streaming mode first.", gr.update())
+    if not os.path.exists(predictions_path):
+        if device == "cuda":
+            torch.cuda.empty_cache()
+        return ("Predictions not found. Run reconstruction before localizing.", gr.update())
     try:
         session.load_cache(kv_cache_path, device=image_tensor.device)
             "Intrinsics matrix:\n" + "\n".join("  " + _format_vector(row) for row in intrinsic)
         )
     try:
         try:
+            stored = np.load(predictions_path)
         except ValueError:
+            stored = np.load(predictions_path, allow_pickle=True)
+        base_predictions = {key: stored[key] for key in stored.files}
+        stored.close()
+    except Exception as exc:
+        return (f"Failed to load stored predictions: {exc}", gr.update())
     localization_glb_path = os.path.join(target_dir, "localization_preview.glb")
+    world_to_camera = np.eye(4)
+    world_to_camera[:3, :4] = extrinsic
     try:
         glbscene = predictions_to_glb(
+            base_predictions,
             conf_thres=conf_thres,
+            filter_by_frames="All",
             mask_black_bg=mask_black_bg,
             mask_white_bg=mask_white_bg,
+            show_cam=False,
             mask_sky=mask_sky,
             target_dir=target_dir,
             prediction_mode=prediction_mode,
+            extra_cameras=[world_to_camera],
+            extra_camera_color=(255, 0, 0),
         )
         glbscene.export(file_obj=localization_glb_path)
     except Exception as exc:

stream3r/models/__pycache__/stream3r.cpython-311.pyc CHANGED Viewed

Binary files a/stream3r/models/__pycache__/stream3r.cpython-311.pyc and b/stream3r/models/__pycache__/stream3r.cpython-311.pyc differ

stream3r/models/stream3r.py CHANGED Viewed

@@ -64,7 +64,7 @@ class STream3R(nn.Module, PyTorchModelHubMixin):
                 - images (torch.Tensor): Original input images, preserved for visualization
         """
         if self.training:
-            images = torch.stack([view["img"] for view in images], dim=1)
             images = (images + 1.) / 2.
         # If without batch dimension, add it
@@ -111,4 +111,4 @@ class STream3R(nn.Module, PyTorchModelHubMixin):
         if not self.training:
             predictions["images"] = images
-        return predictions

                 - images (torch.Tensor): Original input images, preserved for visualization
         """
         if self.training:
+            images = torch.stack(tuple(view["img"] for view in images), dim=1)
             images = (images + 1.) / 2.
         # If without batch dimension, add it
         if not self.training:
             predictions["images"] = images
+        return predictions

stream3r/utils/__pycache__/visual_utils.cpython-311.pyc CHANGED Viewed

Binary files a/stream3r/utils/__pycache__/visual_utils.cpython-311.pyc and b/stream3r/utils/__pycache__/visual_utils.cpython-311.pyc differ

stream3r/utils/visual_utils.py CHANGED Viewed

@@ -24,6 +24,8 @@ def predictions_to_glb(
     mask_sky=False,
     target_dir=None,
     prediction_mode="Predicted Pointmap",
 ) -> trimesh.Scene:
     """
     Converts predictions to a 3D scene represented as a GLB file.
@@ -42,6 +44,9 @@ def predictions_to_glb(
         mask_sky (bool): Apply sky segmentation mask (default: False)
         target_dir (str): Output directory for intermediate files (default: None)
         prediction_mode (str): Prediction mode selector (default: "Predicted Pointmap")
     Returns:
         trimesh.Scene: Processed 3D scene containing point cloud and cameras
@@ -197,6 +202,14 @@ def predictions_to_glb(
     extrinsics_matrices[:, :3, :4] = camera_matrices
     extrinsics_matrices[:, 3, 3] = 1
     if show_cam:
         # Add camera models to the scene
         for i in range(num_cameras):
@@ -207,6 +220,23 @@ def predictions_to_glb(
             integrate_camera_into_scene(scene_3d, camera_to_world, current_color, scene_scale)
     # Align scene to the observation of the first camera
     scene_3d = apply_scene_alignment(scene_3d, extrinsics_matrices)

     mask_sky=False,
     target_dir=None,
     prediction_mode="Predicted Pointmap",
+    extra_cameras=None,
+    extra_camera_color=(255, 0, 0),
 ) -> trimesh.Scene:
     """
     Converts predictions to a 3D scene represented as a GLB file.
         mask_sky (bool): Apply sky segmentation mask (default: False)
         target_dir (str): Output directory for intermediate files (default: None)
         prediction_mode (str): Prediction mode selector (default: "Predicted Pointmap")
+        extra_cameras (Optional[List[np.ndarray]]): Additional camera extrinsics (3x4 or 4x4)
+            to visualize even when show_cam=False. Useful for highlighting localized poses.
+        extra_camera_color (tuple or list[tuple]): RGB color(s) for extra cameras.
     Returns:
         trimesh.Scene: Processed 3D scene containing point cloud and cameras
     extrinsics_matrices[:, :3, :4] = camera_matrices
     extrinsics_matrices[:, 3, 3] = 1
+    extra_cameras = [] if extra_cameras is None else list(extra_cameras)
+    if isinstance(extra_camera_color, tuple) and len(extra_cameras) > 1:
+        extra_colors = [extra_camera_color for _ in extra_cameras]
+    elif isinstance(extra_camera_color, (list, tuple)) and len(extra_cameras) == len(extra_camera_color):
+        extra_colors = list(extra_camera_color)
+    else:
+        extra_colors = [(255, 0, 0) for _ in extra_cameras]
     if show_cam:
         # Add camera models to the scene
         for i in range(num_cameras):
             integrate_camera_into_scene(scene_3d, camera_to_world, current_color, scene_scale)
+    for idx, extra in enumerate(extra_cameras):
+        extra = np.asarray(extra)
+        if extra.shape == (3, 4):
+            world_to_camera = np.eye(4)
+            world_to_camera[:3, :4] = extra
+        elif extra.shape == (4, 4):
+            world_to_camera = extra
+        else:
+            raise ValueError("Extra camera extrinsic must have shape (3,4) or (4,4)")
+        camera_to_world = np.linalg.inv(world_to_camera)
+        integrate_camera_into_scene(
+            scene_3d,
+            camera_to_world,
+            extra_colors[idx] if idx < len(extra_colors) else (255, 0, 0),
+            scene_scale,
+        )
     # Align scene to the observation of the first camera
     scene_3d = apply_scene_alignment(scene_3d, extrinsics_matrices)