Spaces:

bulatko
/

zoo3d

Paused

App Files Community

drozdgk committed 2 days ago

Commit

c00671c

1 Parent(s): 52f5401

feat: CPU-compatible dummy pipeline for debugging

Browse files

Files changed (1) hide show

mvp.py +103 -17

mvp.py CHANGED Viewed

@@ -22,6 +22,7 @@ import trimesh
 import matplotlib.pyplot as plt
 import subprocess
 import tempfile
 from huggingface_hub import hf_hub_download
 try:
@@ -62,6 +63,13 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
 _VGGT_MODEL = None
 _METRIC3D_MODEL = None
@@ -117,7 +125,18 @@ def _init_models():
     global _VGGT_MODEL, _METRIC3D_MODEL, _CLIP_MODEL
     if not torch.cuda.is_available():
-        raise RuntimeError("CUDA недоступна. Для этого Space нужен GPU (CUDA).")
     if _VGGT_MODEL is None:
         print("Initializing and loading VGGT model...")
@@ -154,6 +173,9 @@ def _init_models():
 cropformer_name = "Mask2Former_hornet_3x_576d0b.pth"
 def check_weights():
     if not os.path.exists(os.path.join(MK_PATH, cropformer_name)):
         print(f"Downloading {cropformer_name}...")
         os.makedirs(MK_PATH, exist_ok=True)
@@ -195,14 +217,18 @@ def run_model(target_dir, model, metric3d_model=None) -> dict:
     """
     print(f"Processing images from {target_dir}")
-    # Device check
     device = "cuda" if torch.cuda.is_available() else "cpu"
     if device != "cuda":
-        raise RuntimeError("CUDA недоступна. Для этого Space нужен GPU (CUDA).")
-    # Move model to device
-    model = model.to(device)
-    model.eval()
     # Load and preprocess images
     image_names = glob.glob(os.path.join(target_dir, "images", "*"))
@@ -211,15 +237,71 @@ def run_model(target_dir, model, metric3d_model=None) -> dict:
     if len(image_names) == 0:
         raise ValueError("No images found. Check your upload.")
-    images = load_and_preprocess_images(image_names).to(device)
-    print(f"Preprocessed images shape: {images.shape}")
-    # Run inference
     print("Running inference...")
     dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16
     with torch.no_grad():
-        with torch.cuda.amp.autocast(dtype=dtype):
             predictions = model(images)
     scale_factor = torch.tensor(1.0, device=device)
@@ -329,7 +411,8 @@ def run_model(target_dir, model, metric3d_model=None) -> dict:
     predictions["world_points_from_depth"] = world_points
     # Clean up
-    torch.cuda.empty_cache()
     return predictions
@@ -343,7 +426,8 @@ def handle_uploads(input_video, input_images):
     """
     start_time = time.time()
     gc.collect()
-    torch.cuda.empty_cache()
     # Create a unique folder name
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
@@ -439,7 +523,8 @@ def reconstruct(
     start_time = time.time()
     gc.collect()
-    torch.cuda.empty_cache()
     # Prepare frame_filter dropdown
     target_dir_images = os.path.join(target_dir, "images")
@@ -814,10 +899,11 @@ def detect_objects(text_labels, target_dir, conf_thres, *viz_args):
         return None, "Please enter at least one text label (separated by ';')."
     # Ensure CropFormer weights exist (if detection pipeline uses them)
-    try:
-        check_weights()
-    except Exception as e:
-        print(f"Warning: could not ensure Mask2Former weights: {e}")
     # 1. Run reconstruction first if needed (checking if predictions exist)
     predictions_path = os.path.join(target_dir, "predictions.npz")

 import matplotlib.pyplot as plt
 import subprocess
 import tempfile
+import contextlib
 from huggingface_hub import hf_hub_download
 try:
 print(f"Using device: {device}")
+# CPU debug / compatibility knobs:
+# - On CPU, VGGT-1B inference is usually impractical. For debugging, we fall back to a lightweight
+#   dummy pipeline that produces a minimal predictions dict compatible with `predictions_to_glb`.
+ZOO3D_ALLOW_CPU = os.environ.get("ZOO3D_ALLOW_CPU", "1") == "1"
+ZOO3D_CPU_DUMMY = os.environ.get("ZOO3D_CPU_DUMMY", "1") == "1"
+ZOO3D_SKIP_DOWNLOADS = os.environ.get("ZOO3D_SKIP_DOWNLOADS", "0") == "1"
 _VGGT_MODEL = None
 _METRIC3D_MODEL = None
     global _VGGT_MODEL, _METRIC3D_MODEL, _CLIP_MODEL
     if not torch.cuda.is_available():
+        # CPU-friendly mode for debugging: skip heavy models.
+        if not ZOO3D_ALLOW_CPU:
+            raise RuntimeError("CUDA недоступна. Для этого Space нужен GPU (CUDA).")
+        # We still can load CLIP on CPU if needed, but skip VGGT/Metric3D.
+        if _CLIP_MODEL is None:
+            print("[INFO] loading CLIP model (CPU)...")
+            cm, _, _ = open_clip.create_model_and_transforms("ViT-H-14", pretrained="laion2b_s32b_b79k")
+            cm.to("cpu")
+            cm.eval()
+            print("[INFO] finish loading CLIP model (CPU)...")
+            globals()["_CLIP_MODEL"] = cm
+        return None, None, _CLIP_MODEL
     if _VGGT_MODEL is None:
         print("Initializing and loading VGGT model...")
 cropformer_name = "Mask2Former_hornet_3x_576d0b.pth"
 def check_weights():
+    if ZOO3D_SKIP_DOWNLOADS:
+        print("[INFO] ZOO3D_SKIP_DOWNLOADS=1: skipping Mask2Former weights download.")
+        return
     if not os.path.exists(os.path.join(MK_PATH, cropformer_name)):
         print(f"Downloading {cropformer_name}...")
         os.makedirs(MK_PATH, exist_ok=True)
     """
     print(f"Processing images from {target_dir}")
+    # Device selection
     device = "cuda" if torch.cuda.is_available() else "cpu"
     if device != "cuda":
+        if not ZOO3D_ALLOW_CPU:
+            raise RuntimeError("CUDA недоступна. Для этого Space нужен GPU (CUDA).")
+        if not ZOO3D_CPU_DUMMY:
+            raise RuntimeError(
+                "CPU режим включен, но ZOO3D_CPU_DUMMY=0. "
+                "Для отладки поставь ZOO3D_CPU_DUMMY=1 или включи GPU."
+            )
+    # Load and preprocess images (we need them for both GPU and CPU-dummy)
     # Load and preprocess images
     image_names = glob.glob(os.path.join(target_dir, "images", "*"))
     if len(image_names) == 0:
         raise ValueError("No images found. Check your upload.")
+    # For CPU dummy mode we want the original HxW for `predictions_to_glb` coloring.
+    cpu_images_u8 = None
+    if device == "cpu":
+        imgs = []
+        for p in image_names:
+            im = cv2.imread(p, cv2.IMREAD_COLOR)
+            if im is None:
+                continue
+            im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
+            imgs.append(im)
+        if len(imgs) == 0:
+            raise ValueError("No readable images found. Check your upload.")
+        # Make all images same size for stacking
+        H, W = imgs[0].shape[:2]
+        imgs2 = []
+        for im in imgs:
+            if im.shape[:2] != (H, W):
+                im = cv2.resize(im, (W, H))
+            imgs2.append(im)
+        cpu_images_u8 = np.stack(imgs2, axis=0)  # (S,H,W,3) uint8
+        print(f"CPU dummy: loaded images shape: {cpu_images_u8.shape}")
+    images = load_and_preprocess_images(image_names)
+    print(f"Preprocessed images shape: {tuple(images.shape)}")
+    if device == "cuda":
+        images = images.to(device)
+    if device == "cpu":
+        # Dummy predictions for CPU debugging: minimal keys needed by `predictions_to_glb`
+        S, H, W = cpu_images_u8.shape[0], cpu_images_u8.shape[1], cpu_images_u8.shape[2]
+        # Simple planar point cloud in camera space
+        uu, vv = np.meshgrid(np.arange(W), np.arange(H))
+        x = (uu - (W / 2.0)) / float(max(W, 1))
+        y = -(vv - (H / 2.0)) / float(max(W, 1))
+        z = np.ones_like(x, dtype=np.float32) * 1.0
+        pts = np.stack([x, y, z], axis=-1).astype(np.float32)  # (H,W,3)
+        world_points_from_depth = np.repeat(pts[None, ...], S, axis=0)  # (S,H,W,3)
+        depth = np.ones((S, H, W, 1), dtype=np.float32)
+        depth_conf = np.ones((S, H, W), dtype=np.float32)
+        extrinsic = np.tile(np.array([[1, 0, 0, 0],
+                                      [0, 1, 0, 0],
+                                      [0, 0, 1, 0]], dtype=np.float32)[None, ...], (S, 1, 1))
+        intrinsic = np.tile(np.eye(3, dtype=np.float32)[None, ...], (S, 1, 1))
+        pose = np.tile(np.eye(4, dtype=np.float32)[None, ...], (S, 1, 1))
+        return {
+            "images": cpu_images_u8,
+            "extrinsic": extrinsic,
+            "intrinsic": intrinsic,
+            "pose": pose,
+            "depth": depth,
+            "depth_conf": depth_conf,
+            "world_points_from_depth": world_points_from_depth,
+        }
+    # GPU inference
+    # Move model to device
+    model = model.to(device)
+    model.eval()
     print("Running inference...")
     dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16
+    amp_ctx = torch.cuda.amp.autocast(dtype=dtype) if device == "cuda" else contextlib.nullcontext()
     with torch.no_grad():
+        with amp_ctx:
             predictions = model(images)
     scale_factor = torch.tensor(1.0, device=device)
     predictions["world_points_from_depth"] = world_points
     # Clean up
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
     return predictions
     """
     start_time = time.time()
     gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
     # Create a unique folder name
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
     start_time = time.time()
     gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
     # Prepare frame_filter dropdown
     target_dir_images = os.path.join(target_dir, "images")
         return None, "Please enter at least one text label (separated by ';')."
     # Ensure CropFormer weights exist (if detection pipeline uses them)
+    if torch.cuda.is_available() or not ZOO3D_SKIP_DOWNLOADS:
+        try:
+            check_weights()
+        except Exception as e:
+            print(f"Warning: could not ensure Mask2Former weights: {e}")
     # 1. Run reconstruction first if needed (checking if predictions exist)
     predictions_path = os.path.join(target_dir, "predictions.npz")