brian4dwell committed on
Commit
08c2845
·
1 Parent(s): 79b8fec

localization

Browse files
app.py CHANGED
@@ -413,10 +413,12 @@ def run_model(target_dir: str, model: STream3R, mode: str="causal", streaming: b
413
  predictions["intrinsic"] = intrinsic
414
 
415
  # Convert tensors to numpy
416
- for key in predictions.keys():
417
- if isinstance(predictions[key], torch.Tensor):
418
- predictions[key] = predictions[key].cpu().numpy().squeeze(0) # remove batch dimension
419
- predictions['pose_enc_list'] = None # remove pose_enc_list
 
 
420
 
421
  # Generate world points from depth map
422
  print("Computing world points from depth map...")
@@ -671,11 +673,6 @@ def localize_new_image(
671
  kv_cache_path = os.path.join(target_dir, "kv_cache.pt")
672
  predictions_path = os.path.join(target_dir, "predictions.npz")
673
 
674
- if not os.path.exists(kv_cache_path):
675
- return ("Streaming cache not found. Run reconstruction in streaming mode first.", gr.update())
676
- if not os.path.exists(predictions_path):
677
- return ("Predictions not found. Run reconstruction before localizing.", gr.update())
678
-
679
  device = "cuda" if torch.cuda.is_available() else "cpu"
680
 
681
  try:
@@ -683,6 +680,8 @@ def localize_new_image(
683
  except Exception as exc:
684
  return (f"Failed to preprocess image: {exc}", gr.update())
685
 
 
 
686
  session = getattr(localize_new_image, "_session", None)
687
  if session is None or session.mode != mode:
688
  session = StreamSession(model, mode=mode)
@@ -690,6 +689,16 @@ def localize_new_image(
690
 
691
  session.clear()
692
 
 
 
 
 
 
 
 
 
 
 
693
  try:
694
  session.load_cache(kv_cache_path, device=image_tensor.device)
695
 
@@ -783,62 +792,32 @@ def localize_new_image(
783
  "Intrinsics matrix:\n" + "\n".join(" " + _format_vector(row) for row in intrinsic)
784
  )
785
 
786
- new_frame_data = {}
787
- for key, tensor in localized_predictions.items():
788
- if isinstance(tensor, torch.Tensor) and tensor.dim() >= 2:
789
- new_frame_data[key] = _extract_frame(tensor, new_index)
790
-
791
  try:
792
- stored = np.load(predictions_path)
793
- except Exception as exc:
794
- return (f"Failed to load stored predictions: {exc}", gr.update())
795
-
796
- temp_predictions = {key: stored[key] for key in stored.files}
797
-
798
- def _append_prediction(key: str, value: np.ndarray):
799
- if key not in temp_predictions or value is None:
800
- return
801
- base = temp_predictions[key]
802
- value_np = value.squeeze(0)
803
  try:
804
- temp_predictions[key] = np.concatenate([base, value_np], axis=0)
805
  except ValueError:
806
- pass
807
-
808
- for key in [
809
- "pose_enc",
810
- "world_points",
811
- "world_points_conf",
812
- "world_points_from_depth",
813
- "depth",
814
- "depth_conf",
815
- "images",
816
- "extrinsic",
817
- "intrinsic",
818
- ]:
819
- if key in new_frame_data:
820
- _append_prediction(key, new_frame_data[key])
821
 
822
  localization_glb_path = os.path.join(target_dir, "localization_preview.glb")
823
- images_array = temp_predictions.get("images")
824
- world_array = temp_predictions.get("world_points")
825
- if images_array is not None:
826
- frame_identifier = images_array.shape[0] - 1
827
- elif world_array is not None:
828
- frame_identifier = world_array.shape[0] - 1
829
- else:
830
- frame_identifier = 0
831
  try:
832
  glbscene = predictions_to_glb(
833
- temp_predictions,
834
  conf_thres=conf_thres,
835
- filter_by_frames=f"{frame_identifier}: localized",
836
  mask_black_bg=mask_black_bg,
837
  mask_white_bg=mask_white_bg,
838
- show_cam=show_cam,
839
  mask_sky=mask_sky,
840
  target_dir=target_dir,
841
  prediction_mode=prediction_mode,
 
 
842
  )
843
  glbscene.export(file_obj=localization_glb_path)
844
  except Exception as exc:
 
413
  predictions["intrinsic"] = intrinsic
414
 
415
  # Convert tensors to numpy
416
+ for key in list(predictions.keys()):
417
+ value = predictions[key]
418
+ if isinstance(value, torch.Tensor):
419
+ predictions[key] = value.cpu().numpy().squeeze(0) # remove batch dimension
420
+
421
+ predictions.pop("pose_enc_list", None)
422
 
423
  # Generate world points from depth map
424
  print("Computing world points from depth map...")
 
673
  kv_cache_path = os.path.join(target_dir, "kv_cache.pt")
674
  predictions_path = os.path.join(target_dir, "predictions.npz")
675
 
 
 
 
 
 
676
  device = "cuda" if torch.cuda.is_available() else "cpu"
677
 
678
  try:
 
680
  except Exception as exc:
681
  return (f"Failed to preprocess image: {exc}", gr.update())
682
 
683
+ model.eval()
684
+ model.to(device)
685
  session = getattr(localize_new_image, "_session", None)
686
  if session is None or session.mode != mode:
687
  session = StreamSession(model, mode=mode)
 
689
 
690
  session.clear()
691
 
692
+ if not os.path.exists(kv_cache_path):
693
+ if device == "cuda":
694
+ torch.cuda.empty_cache()
695
+ return ("Streaming cache not found. Run reconstruction in streaming mode first.", gr.update())
696
+
697
+ if not os.path.exists(predictions_path):
698
+ if device == "cuda":
699
+ torch.cuda.empty_cache()
700
+ return ("Predictions not found. Run reconstruction before localizing.", gr.update())
701
+
702
  try:
703
  session.load_cache(kv_cache_path, device=image_tensor.device)
704
 
 
792
  "Intrinsics matrix:\n" + "\n".join(" " + _format_vector(row) for row in intrinsic)
793
  )
794
 
 
 
 
 
 
795
  try:
 
 
 
 
 
 
 
 
 
 
 
796
  try:
797
+ stored = np.load(predictions_path)
798
  except ValueError:
799
+ stored = np.load(predictions_path, allow_pickle=True)
800
+ base_predictions = {key: stored[key] for key in stored.files}
801
+ stored.close()
802
+ except Exception as exc:
803
+ return (f"Failed to load stored predictions: {exc}", gr.update())
 
 
 
 
 
 
 
 
 
 
804
 
805
  localization_glb_path = os.path.join(target_dir, "localization_preview.glb")
806
+ world_to_camera = np.eye(4)
807
+ world_to_camera[:3, :4] = extrinsic
 
 
 
 
 
 
808
  try:
809
  glbscene = predictions_to_glb(
810
+ base_predictions,
811
  conf_thres=conf_thres,
812
+ filter_by_frames="All",
813
  mask_black_bg=mask_black_bg,
814
  mask_white_bg=mask_white_bg,
815
+ show_cam=False,
816
  mask_sky=mask_sky,
817
  target_dir=target_dir,
818
  prediction_mode=prediction_mode,
819
+ extra_cameras=[world_to_camera],
820
+ extra_camera_color=(255, 0, 0),
821
  )
822
  glbscene.export(file_obj=localization_glb_path)
823
  except Exception as exc:
stream3r/models/__pycache__/stream3r.cpython-311.pyc CHANGED
Binary files a/stream3r/models/__pycache__/stream3r.cpython-311.pyc and b/stream3r/models/__pycache__/stream3r.cpython-311.pyc differ
 
stream3r/models/stream3r.py CHANGED
@@ -64,7 +64,7 @@ class STream3R(nn.Module, PyTorchModelHubMixin):
64
  - images (torch.Tensor): Original input images, preserved for visualization
65
  """
66
  if self.training:
67
- images = torch.stack([view["img"] for view in images], dim=1)
68
  images = (images + 1.) / 2.
69
 
70
  # If without batch dimension, add it
@@ -111,4 +111,4 @@ class STream3R(nn.Module, PyTorchModelHubMixin):
111
  if not self.training:
112
  predictions["images"] = images
113
 
114
- return predictions
 
64
  - images (torch.Tensor): Original input images, preserved for visualization
65
  """
66
  if self.training:
67
+ images = torch.stack(tuple(view["img"] for view in images), dim=1)
68
  images = (images + 1.) / 2.
69
 
70
  # If without batch dimension, add it
 
111
  if not self.training:
112
  predictions["images"] = images
113
 
114
+ return predictions
stream3r/utils/__pycache__/visual_utils.cpython-311.pyc CHANGED
Binary files a/stream3r/utils/__pycache__/visual_utils.cpython-311.pyc and b/stream3r/utils/__pycache__/visual_utils.cpython-311.pyc differ
 
stream3r/utils/visual_utils.py CHANGED
@@ -24,6 +24,8 @@ def predictions_to_glb(
24
  mask_sky=False,
25
  target_dir=None,
26
  prediction_mode="Predicted Pointmap",
 
 
27
  ) -> trimesh.Scene:
28
  """
29
  Converts predictions to a 3D scene represented as a GLB file.
@@ -42,6 +44,9 @@ def predictions_to_glb(
42
  mask_sky (bool): Apply sky segmentation mask (default: False)
43
  target_dir (str): Output directory for intermediate files (default: None)
44
  prediction_mode (str): Prediction mode selector (default: "Predicted Pointmap")
 
 
 
45
 
46
  Returns:
47
  trimesh.Scene: Processed 3D scene containing point cloud and cameras
@@ -197,6 +202,14 @@ def predictions_to_glb(
197
  extrinsics_matrices[:, :3, :4] = camera_matrices
198
  extrinsics_matrices[:, 3, 3] = 1
199
 
 
 
 
 
 
 
 
 
200
  if show_cam:
201
  # Add camera models to the scene
202
  for i in range(num_cameras):
@@ -207,6 +220,23 @@ def predictions_to_glb(
207
 
208
  integrate_camera_into_scene(scene_3d, camera_to_world, current_color, scene_scale)
209
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  # Align scene to the observation of the first camera
211
  scene_3d = apply_scene_alignment(scene_3d, extrinsics_matrices)
212
 
 
24
  mask_sky=False,
25
  target_dir=None,
26
  prediction_mode="Predicted Pointmap",
27
+ extra_cameras=None,
28
+ extra_camera_color=(255, 0, 0),
29
  ) -> trimesh.Scene:
30
  """
31
  Converts predictions to a 3D scene represented as a GLB file.
 
44
  mask_sky (bool): Apply sky segmentation mask (default: False)
45
  target_dir (str): Output directory for intermediate files (default: None)
46
  prediction_mode (str): Prediction mode selector (default: "Predicted Pointmap")
47
+ extra_cameras (Optional[List[np.ndarray]]): Additional camera extrinsics (3x4 or 4x4)
48
+ to visualize even when show_cam=False. Useful for highlighting localized poses.
49
+ extra_camera_color (tuple or list[tuple]): RGB color(s) for extra cameras.
50
 
51
  Returns:
52
  trimesh.Scene: Processed 3D scene containing point cloud and cameras
 
202
  extrinsics_matrices[:, :3, :4] = camera_matrices
203
  extrinsics_matrices[:, 3, 3] = 1
204
 
205
+ extra_cameras = [] if extra_cameras is None else list(extra_cameras)
206
+ if isinstance(extra_camera_color, tuple) and len(extra_cameras) > 1:
207
+ extra_colors = [extra_camera_color for _ in extra_cameras]
208
+ elif isinstance(extra_camera_color, (list, tuple)) and len(extra_cameras) == len(extra_camera_color):
209
+ extra_colors = list(extra_camera_color)
210
+ else:
211
+ extra_colors = [(255, 0, 0) for _ in extra_cameras]
212
+
213
  if show_cam:
214
  # Add camera models to the scene
215
  for i in range(num_cameras):
 
220
 
221
  integrate_camera_into_scene(scene_3d, camera_to_world, current_color, scene_scale)
222
 
223
+ for idx, extra in enumerate(extra_cameras):
224
+ extra = np.asarray(extra)
225
+ if extra.shape == (3, 4):
226
+ world_to_camera = np.eye(4)
227
+ world_to_camera[:3, :4] = extra
228
+ elif extra.shape == (4, 4):
229
+ world_to_camera = extra
230
+ else:
231
+ raise ValueError("Extra camera extrinsic must have shape (3,4) or (4,4)")
232
+ camera_to_world = np.linalg.inv(world_to_camera)
233
+ integrate_camera_into_scene(
234
+ scene_3d,
235
+ camera_to_world,
236
+ extra_colors[idx] if idx < len(extra_colors) else (255, 0, 0),
237
+ scene_scale,
238
+ )
239
+
240
  # Align scene to the observation of the first camera
241
  scene_3d = apply_scene_alignment(scene_3d, extrinsics_matrices)
242