Spaces:
Configuration error
Configuration error
Commit
·
08c2845
1
Parent(s):
79b8fec
localization
Browse files
app.py
CHANGED
|
@@ -413,10 +413,12 @@ def run_model(target_dir: str, model: STream3R, mode: str="causal", streaming: b
|
|
| 413 |
predictions["intrinsic"] = intrinsic
|
| 414 |
|
| 415 |
# Convert tensors to numpy
|
| 416 |
-
for key in predictions.keys():
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
|
|
|
|
|
|
| 420 |
|
| 421 |
# Generate world points from depth map
|
| 422 |
print("Computing world points from depth map...")
|
|
@@ -671,11 +673,6 @@ def localize_new_image(
|
|
| 671 |
kv_cache_path = os.path.join(target_dir, "kv_cache.pt")
|
| 672 |
predictions_path = os.path.join(target_dir, "predictions.npz")
|
| 673 |
|
| 674 |
-
if not os.path.exists(kv_cache_path):
|
| 675 |
-
return ("Streaming cache not found. Run reconstruction in streaming mode first.", gr.update())
|
| 676 |
-
if not os.path.exists(predictions_path):
|
| 677 |
-
return ("Predictions not found. Run reconstruction before localizing.", gr.update())
|
| 678 |
-
|
| 679 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 680 |
|
| 681 |
try:
|
|
@@ -683,6 +680,8 @@ def localize_new_image(
|
|
| 683 |
except Exception as exc:
|
| 684 |
return (f"Failed to preprocess image: {exc}", gr.update())
|
| 685 |
|
|
|
|
|
|
|
| 686 |
session = getattr(localize_new_image, "_session", None)
|
| 687 |
if session is None or session.mode != mode:
|
| 688 |
session = StreamSession(model, mode=mode)
|
|
@@ -690,6 +689,16 @@ def localize_new_image(
|
|
| 690 |
|
| 691 |
session.clear()
|
| 692 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 693 |
try:
|
| 694 |
session.load_cache(kv_cache_path, device=image_tensor.device)
|
| 695 |
|
|
@@ -783,62 +792,32 @@ def localize_new_image(
|
|
| 783 |
"Intrinsics matrix:\n" + "\n".join(" " + _format_vector(row) for row in intrinsic)
|
| 784 |
)
|
| 785 |
|
| 786 |
-
new_frame_data = {}
|
| 787 |
-
for key, tensor in localized_predictions.items():
|
| 788 |
-
if isinstance(tensor, torch.Tensor) and tensor.dim() >= 2:
|
| 789 |
-
new_frame_data[key] = _extract_frame(tensor, new_index)
|
| 790 |
-
|
| 791 |
try:
|
| 792 |
-
stored = np.load(predictions_path)
|
| 793 |
-
except Exception as exc:
|
| 794 |
-
return (f"Failed to load stored predictions: {exc}", gr.update())
|
| 795 |
-
|
| 796 |
-
temp_predictions = {key: stored[key] for key in stored.files}
|
| 797 |
-
|
| 798 |
-
def _append_prediction(key: str, value: np.ndarray):
|
| 799 |
-
if key not in temp_predictions or value is None:
|
| 800 |
-
return
|
| 801 |
-
base = temp_predictions[key]
|
| 802 |
-
value_np = value.squeeze(0)
|
| 803 |
try:
|
| 804 |
-
|
| 805 |
except ValueError:
|
| 806 |
-
|
| 807 |
-
|
| 808 |
-
|
| 809 |
-
|
| 810 |
-
"
|
| 811 |
-
"world_points_conf",
|
| 812 |
-
"world_points_from_depth",
|
| 813 |
-
"depth",
|
| 814 |
-
"depth_conf",
|
| 815 |
-
"images",
|
| 816 |
-
"extrinsic",
|
| 817 |
-
"intrinsic",
|
| 818 |
-
]:
|
| 819 |
-
if key in new_frame_data:
|
| 820 |
-
_append_prediction(key, new_frame_data[key])
|
| 821 |
|
| 822 |
localization_glb_path = os.path.join(target_dir, "localization_preview.glb")
|
| 823 |
-
|
| 824 |
-
|
| 825 |
-
if images_array is not None:
|
| 826 |
-
frame_identifier = images_array.shape[0] - 1
|
| 827 |
-
elif world_array is not None:
|
| 828 |
-
frame_identifier = world_array.shape[0] - 1
|
| 829 |
-
else:
|
| 830 |
-
frame_identifier = 0
|
| 831 |
try:
|
| 832 |
glbscene = predictions_to_glb(
|
| 833 |
-
|
| 834 |
conf_thres=conf_thres,
|
| 835 |
-
filter_by_frames=
|
| 836 |
mask_black_bg=mask_black_bg,
|
| 837 |
mask_white_bg=mask_white_bg,
|
| 838 |
-
show_cam=
|
| 839 |
mask_sky=mask_sky,
|
| 840 |
target_dir=target_dir,
|
| 841 |
prediction_mode=prediction_mode,
|
|
|
|
|
|
|
| 842 |
)
|
| 843 |
glbscene.export(file_obj=localization_glb_path)
|
| 844 |
except Exception as exc:
|
|
|
|
| 413 |
predictions["intrinsic"] = intrinsic
|
| 414 |
|
| 415 |
# Convert tensors to numpy
|
| 416 |
+
for key in list(predictions.keys()):
|
| 417 |
+
value = predictions[key]
|
| 418 |
+
if isinstance(value, torch.Tensor):
|
| 419 |
+
predictions[key] = value.cpu().numpy().squeeze(0) # remove batch dimension
|
| 420 |
+
|
| 421 |
+
predictions.pop("pose_enc_list", None)
|
| 422 |
|
| 423 |
# Generate world points from depth map
|
| 424 |
print("Computing world points from depth map...")
|
|
|
|
| 673 |
kv_cache_path = os.path.join(target_dir, "kv_cache.pt")
|
| 674 |
predictions_path = os.path.join(target_dir, "predictions.npz")
|
| 675 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 676 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 677 |
|
| 678 |
try:
|
|
|
|
| 680 |
except Exception as exc:
|
| 681 |
return (f"Failed to preprocess image: {exc}", gr.update())
|
| 682 |
|
| 683 |
+
model.eval()
|
| 684 |
+
model.to(device)
|
| 685 |
session = getattr(localize_new_image, "_session", None)
|
| 686 |
if session is None or session.mode != mode:
|
| 687 |
session = StreamSession(model, mode=mode)
|
|
|
|
| 689 |
|
| 690 |
session.clear()
|
| 691 |
|
| 692 |
+
if not os.path.exists(kv_cache_path):
|
| 693 |
+
if device == "cuda":
|
| 694 |
+
torch.cuda.empty_cache()
|
| 695 |
+
return ("Streaming cache not found. Run reconstruction in streaming mode first.", gr.update())
|
| 696 |
+
|
| 697 |
+
if not os.path.exists(predictions_path):
|
| 698 |
+
if device == "cuda":
|
| 699 |
+
torch.cuda.empty_cache()
|
| 700 |
+
return ("Predictions not found. Run reconstruction before localizing.", gr.update())
|
| 701 |
+
|
| 702 |
try:
|
| 703 |
session.load_cache(kv_cache_path, device=image_tensor.device)
|
| 704 |
|
|
|
|
| 792 |
"Intrinsics matrix:\n" + "\n".join(" " + _format_vector(row) for row in intrinsic)
|
| 793 |
)
|
| 794 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 795 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 796 |
try:
|
| 797 |
+
stored = np.load(predictions_path)
|
| 798 |
except ValueError:
|
| 799 |
+
stored = np.load(predictions_path, allow_pickle=True)
|
| 800 |
+
base_predictions = {key: stored[key] for key in stored.files}
|
| 801 |
+
stored.close()
|
| 802 |
+
except Exception as exc:
|
| 803 |
+
return (f"Failed to load stored predictions: {exc}", gr.update())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 804 |
|
| 805 |
localization_glb_path = os.path.join(target_dir, "localization_preview.glb")
|
| 806 |
+
world_to_camera = np.eye(4)
|
| 807 |
+
world_to_camera[:3, :4] = extrinsic
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 808 |
try:
|
| 809 |
glbscene = predictions_to_glb(
|
| 810 |
+
base_predictions,
|
| 811 |
conf_thres=conf_thres,
|
| 812 |
+
filter_by_frames="All",
|
| 813 |
mask_black_bg=mask_black_bg,
|
| 814 |
mask_white_bg=mask_white_bg,
|
| 815 |
+
show_cam=False,
|
| 816 |
mask_sky=mask_sky,
|
| 817 |
target_dir=target_dir,
|
| 818 |
prediction_mode=prediction_mode,
|
| 819 |
+
extra_cameras=[world_to_camera],
|
| 820 |
+
extra_camera_color=(255, 0, 0),
|
| 821 |
)
|
| 822 |
glbscene.export(file_obj=localization_glb_path)
|
| 823 |
except Exception as exc:
|
stream3r/models/__pycache__/stream3r.cpython-311.pyc
CHANGED
|
Binary files a/stream3r/models/__pycache__/stream3r.cpython-311.pyc and b/stream3r/models/__pycache__/stream3r.cpython-311.pyc differ
|
|
|
stream3r/models/stream3r.py
CHANGED
|
@@ -64,7 +64,7 @@ class STream3R(nn.Module, PyTorchModelHubMixin):
|
|
| 64 |
- images (torch.Tensor): Original input images, preserved for visualization
|
| 65 |
"""
|
| 66 |
if self.training:
|
| 67 |
-
images = torch.stack(
|
| 68 |
images = (images + 1.) / 2.
|
| 69 |
|
| 70 |
# If without batch dimension, add it
|
|
@@ -111,4 +111,4 @@ class STream3R(nn.Module, PyTorchModelHubMixin):
|
|
| 111 |
if not self.training:
|
| 112 |
predictions["images"] = images
|
| 113 |
|
| 114 |
-
return predictions
|
|
|
|
| 64 |
- images (torch.Tensor): Original input images, preserved for visualization
|
| 65 |
"""
|
| 66 |
if self.training:
|
| 67 |
+
images = torch.stack(tuple(view["img"] for view in images), dim=1)
|
| 68 |
images = (images + 1.) / 2.
|
| 69 |
|
| 70 |
# If without batch dimension, add it
|
|
|
|
| 111 |
if not self.training:
|
| 112 |
predictions["images"] = images
|
| 113 |
|
| 114 |
+
return predictions
|
stream3r/utils/__pycache__/visual_utils.cpython-311.pyc
CHANGED
|
Binary files a/stream3r/utils/__pycache__/visual_utils.cpython-311.pyc and b/stream3r/utils/__pycache__/visual_utils.cpython-311.pyc differ
|
|
|
stream3r/utils/visual_utils.py
CHANGED
|
@@ -24,6 +24,8 @@ def predictions_to_glb(
|
|
| 24 |
mask_sky=False,
|
| 25 |
target_dir=None,
|
| 26 |
prediction_mode="Predicted Pointmap",
|
|
|
|
|
|
|
| 27 |
) -> trimesh.Scene:
|
| 28 |
"""
|
| 29 |
Converts predictions to a 3D scene represented as a GLB file.
|
|
@@ -42,6 +44,9 @@ def predictions_to_glb(
|
|
| 42 |
mask_sky (bool): Apply sky segmentation mask (default: False)
|
| 43 |
target_dir (str): Output directory for intermediate files (default: None)
|
| 44 |
prediction_mode (str): Prediction mode selector (default: "Predicted Pointmap")
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
Returns:
|
| 47 |
trimesh.Scene: Processed 3D scene containing point cloud and cameras
|
|
@@ -197,6 +202,14 @@ def predictions_to_glb(
|
|
| 197 |
extrinsics_matrices[:, :3, :4] = camera_matrices
|
| 198 |
extrinsics_matrices[:, 3, 3] = 1
|
| 199 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
if show_cam:
|
| 201 |
# Add camera models to the scene
|
| 202 |
for i in range(num_cameras):
|
|
@@ -207,6 +220,23 @@ def predictions_to_glb(
|
|
| 207 |
|
| 208 |
integrate_camera_into_scene(scene_3d, camera_to_world, current_color, scene_scale)
|
| 209 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
# Align scene to the observation of the first camera
|
| 211 |
scene_3d = apply_scene_alignment(scene_3d, extrinsics_matrices)
|
| 212 |
|
|
|
|
| 24 |
mask_sky=False,
|
| 25 |
target_dir=None,
|
| 26 |
prediction_mode="Predicted Pointmap",
|
| 27 |
+
extra_cameras=None,
|
| 28 |
+
extra_camera_color=(255, 0, 0),
|
| 29 |
) -> trimesh.Scene:
|
| 30 |
"""
|
| 31 |
Converts predictions to a 3D scene represented as a GLB file.
|
|
|
|
| 44 |
mask_sky (bool): Apply sky segmentation mask (default: False)
|
| 45 |
target_dir (str): Output directory for intermediate files (default: None)
|
| 46 |
prediction_mode (str): Prediction mode selector (default: "Predicted Pointmap")
|
| 47 |
+
extra_cameras (Optional[List[np.ndarray]]): Additional camera extrinsics (3x4 or 4x4)
|
| 48 |
+
to visualize even when show_cam=False. Useful for highlighting localized poses.
|
| 49 |
+
extra_camera_color (tuple or list[tuple]): RGB color(s) for extra cameras.
|
| 50 |
|
| 51 |
Returns:
|
| 52 |
trimesh.Scene: Processed 3D scene containing point cloud and cameras
|
|
|
|
| 202 |
extrinsics_matrices[:, :3, :4] = camera_matrices
|
| 203 |
extrinsics_matrices[:, 3, 3] = 1
|
| 204 |
|
| 205 |
+
extra_cameras = [] if extra_cameras is None else list(extra_cameras)
|
| 206 |
+
if isinstance(extra_camera_color, tuple) and len(extra_cameras) > 1:
|
| 207 |
+
extra_colors = [extra_camera_color for _ in extra_cameras]
|
| 208 |
+
elif isinstance(extra_camera_color, (list, tuple)) and len(extra_cameras) == len(extra_camera_color):
|
| 209 |
+
extra_colors = list(extra_camera_color)
|
| 210 |
+
else:
|
| 211 |
+
extra_colors = [(255, 0, 0) for _ in extra_cameras]
|
| 212 |
+
|
| 213 |
if show_cam:
|
| 214 |
# Add camera models to the scene
|
| 215 |
for i in range(num_cameras):
|
|
|
|
| 220 |
|
| 221 |
integrate_camera_into_scene(scene_3d, camera_to_world, current_color, scene_scale)
|
| 222 |
|
| 223 |
+
for idx, extra in enumerate(extra_cameras):
|
| 224 |
+
extra = np.asarray(extra)
|
| 225 |
+
if extra.shape == (3, 4):
|
| 226 |
+
world_to_camera = np.eye(4)
|
| 227 |
+
world_to_camera[:3, :4] = extra
|
| 228 |
+
elif extra.shape == (4, 4):
|
| 229 |
+
world_to_camera = extra
|
| 230 |
+
else:
|
| 231 |
+
raise ValueError("Extra camera extrinsic must have shape (3,4) or (4,4)")
|
| 232 |
+
camera_to_world = np.linalg.inv(world_to_camera)
|
| 233 |
+
integrate_camera_into_scene(
|
| 234 |
+
scene_3d,
|
| 235 |
+
camera_to_world,
|
| 236 |
+
extra_colors[idx] if idx < len(extra_colors) else (255, 0, 0),
|
| 237 |
+
scene_scale,
|
| 238 |
+
)
|
| 239 |
+
|
| 240 |
# Align scene to the observation of the first camera
|
| 241 |
scene_3d = apply_scene_alignment(scene_3d, extrinsics_matrices)
|
| 242 |
|