bulatko committed
Commit 6c099d4 · 1 Parent(s): e4c5797

fix detectron & weights

Files changed (4)
  1. README.md +7 -0
  2. app.py +23 -0
  3. mvp.py +202 -46
  4. requirements.txt +13 -0
README.md CHANGED
@@ -1,3 +1,10 @@
+ ---
+ title: Zoo3D (VGGT + open-vocabulary 3D detection)
+ sdk: gradio
+ app_file: app.py
+ pinned: false
+ ---
+
  <div align="center">
  <h1>VGGT: Visual Geometry Grounded Transformer</h1>
 
app.py ADDED
@@ -0,0 +1,23 @@
+ import os
+
+ import gradio as gr
+
+
+ def _launch():
+     # HF Spaces expects the app to listen on 0.0.0.0:7860 (PORT may be provided).
+     import mvp
+
+     port = int(os.getenv("PORT", "7860"))
+     # `mvp` defines `demo` (gr.Blocks). We launch it here instead of inside `mvp.py`.
+     mvp.demo.queue(max_size=20).launch(
+         server_name="0.0.0.0",
+         server_port=port,
+         show_error=True,
+         share=False,
+     )
+
+
+ if __name__ == "__main__":
+     _launch()
+
+
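This launcher relies on `mvp` defining the `demo` Blocks object at import time without launching it itself (the `main()` guard added at the bottom of `mvp.py` below preserves that). A minimal sketch of the pattern, with a toy module standing in for `mvp` (names and UI contents are illustrative, not part of this commit):

```python
# toy_app.py - illustrative stand-in for the mvp / app.py split, not part of this repo
import os

import gradio as gr

with gr.Blocks() as demo:          # built at import time, never launched here
    gr.Markdown("UI goes here")


def main():
    # A wrapper (like app.py) decides where and how to launch.
    demo.queue(max_size=20).launch(
        server_name="0.0.0.0",
        server_port=int(os.getenv("PORT", "7860")),
    )


if __name__ == "__main__":
    main()
```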
mvp.py CHANGED
@@ -20,9 +20,22 @@ import open_clip
  from open_clip import tokenizer
  import trimesh
  import matplotlib.pyplot as plt
+ import subprocess
+ import tempfile
+ from huggingface_hub import hf_hub_download
 
- sys.path.append("vggt/")
- MK_PATH = "MaskClustering"
+ try:
+     import gdown
+ except Exception:
+     gdown = None
+
+ REPO_ROOT = os.path.dirname(os.path.abspath(__file__))
+ sys.path.append(os.path.join(REPO_ROOT, "vggt"))
+ MK_PATH = os.path.join(REPO_ROOT, "MaskClustering")
+
+ # Writable workdir (HF Spaces: prefer /tmp)
+ WORK_DIR = os.environ.get("ZOO3D_WORKDIR", os.path.join(tempfile.gettempdir(), "zoo3d"))
+ os.makedirs(WORK_DIR, exist_ok=True)
  from visual_util import predictions_to_glb
  from vggt.models.vggt import VGGT
  from vggt.utils.load_fn import load_and_preprocess_images
@@ -34,46 +47,93 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
  print(f"Using device: {device}")
 
 
- print("Initializing and loading VGGT model...")
- # model = VGGT.from_pretrained("facebook/VGGT-1B") # another way to load the model
+ _VGGT_MODEL = None
+ _METRIC3D_MODEL = None
+ _CLIP_MODEL = None
+
 
- model = VGGT()
- _URL = "https://huggingface.co/facebook/VGGT-1B/resolve/main/model.pt"
- model.load_state_dict(torch.hub.load_state_dict_from_url(_URL))
+ def _download_vggt_weights(dst_path: str) -> str:
+     """
+     Download VGGT weights from Google Drive to dst_path.
+     The user provided:
+     https://drive.google.com/file/d/10G7s6bVMwN__bcrR2fBal3goo69Y5Do4/view?usp=sharing
+     """
+     if os.path.exists(dst_path) and os.path.getsize(dst_path) > 0:
+         return dst_path
 
+     if gdown is None:
+         raise RuntimeError("The gdown package is missing. Add gdown to requirements.txt to download the weights from Google Drive.")
 
- model.eval()
- model = model.to(device)
+     os.makedirs(os.path.dirname(dst_path), exist_ok=True)
+     url = "https://drive.google.com/uc?id=10G7s6bVMwN__bcrR2fBal3goo69Y5Do4"
+     out = gdown.download(url, dst_path, quiet=False)
+     if out is None or not os.path.exists(dst_path) or os.path.getsize(dst_path) == 0:
+         raise RuntimeError("Could not download the VGGT weights from Google Drive (check access/quota/public sharing).")
+     return dst_path
 
- print("Initializing and loading Metric3D model...")
- try:
-     metric3d_model = torch.hub.load('yvanyin/metric3d', 'metric3d_vit_small', pretrain=True, trust_repo=True)
- except TypeError:
-     metric3d_model = torch.hub.load('yvanyin/metric3d', 'metric3d_vit_small', pretrain=True)
- metric3d_model.to(device)
- metric3d_model.eval()
+
+ def _init_models():
+     """
+     Lazy-load heavy models so the UI can start quickly on HF Spaces.
+     """
+     global _VGGT_MODEL, _METRIC3D_MODEL, _CLIP_MODEL
+
+     if not torch.cuda.is_available():
+         raise RuntimeError("CUDA is not available. This Space requires a GPU (CUDA).")
+
+     if _VGGT_MODEL is None:
+         print("Initializing and loading VGGT model...")
+         m = VGGT()
+         weights_path = os.environ.get("VGGT_WEIGHTS_PATH")
+         if not weights_path:
+             weights_path = os.path.join(WORK_DIR, "weights", "vggt_model.pt")
+         _download_vggt_weights(weights_path)
+         state = torch.load(weights_path, map_location="cpu")
+         m.load_state_dict(state)
+         m.eval()
+         _VGGT_MODEL = m.to(device)
+
+     if _METRIC3D_MODEL is None:
+         print("Initializing and loading Metric3D model...")
+         try:
+             mm = torch.hub.load("yvanyin/metric3d", "metric3d_vit_small", pretrain=True, trust_repo=True)
+         except TypeError:
+             mm = torch.hub.load("yvanyin/metric3d", "metric3d_vit_small", pretrain=True)
+         mm.to(device)
+         mm.eval()
+         _METRIC3D_MODEL = mm
+
+     if _CLIP_MODEL is None:
+         print("[INFO] loading CLIP model...")
+         cm, _, _ = open_clip.create_model_and_transforms("ViT-H-14", pretrained="laion2b_s32b_b79k")
+         cm.to(device)
+         cm.eval()
+         print("[INFO] finish loading CLIP model...")
+         _CLIP_MODEL = cm
+
+     return _VGGT_MODEL, _METRIC3D_MODEL, _CLIP_MODEL
 
  cropformer_name = "Mask2Former_hornet_3x_576d0b.pth"
 
  def check_weights():
      if not os.path.exists(os.path.join(MK_PATH, cropformer_name)):
          print(f"Downloading {cropformer_name}...")
-         os.system(f"wget https://huggingface.co/datasets/qqlu1992/Adobe_EntitySeg/resolve/main/CropFormer_model/Entity_Segmentation/Mask2Former_hornet_3x/Mask2Former_hornet_3x_576d0b.pth?download=true -O {os.path.join(MK_PATH, cropformer_name)}")
+         # Prefer HF cache over `wget` for Spaces compatibility.
+         cached = hf_hub_download(
+             repo_id="qqlu1992/Adobe_EntitySeg",
+             repo_type="dataset",
+             filename="CropFormer_model/Entity_Segmentation/Mask2Former_hornet_3x/Mask2Former_hornet_3x_576d0b.pth",
+         )
+         os.makedirs(MK_PATH, exist_ok=True)
+         dst = os.path.join(MK_PATH, cropformer_name)
+         shutil.copyfile(cached, dst)
          print(f"Downloaded {cropformer_name}...")
      else:
          print(f"{cropformer_name} already exists...")
  check_weights()
 
- def load_clip():
-     print(f'[INFO] loading CLIP model...')
-     model, _, _ = open_clip.create_model_and_transforms("ViT-H-14", pretrained="laion2b_s32b_b79k")
-     model.cuda()
-     model.eval()
-     print(f'[INFO]', ' finish loading CLIP model...')
-     return model
-
  def extract_text_feature(descriptions, clip_model, target_path):
-     text_tokens = tokenizer.tokenize(descriptions).cuda()
+     text_tokens = tokenizer.tokenize(descriptions).to(device)
      with torch.no_grad():
          text_features = clip_model.encode_text(text_tokens).float()
          text_features /= text_features.norm(dim=-1, keepdim=True)
@@ -87,7 +147,7 @@ def extract_text_feature(descriptions, clip_model, target_path):
      return text_features_dict
 
 
- clip_model = load_clip()
+ clip_model = None
 
 
  # -------------------------------------------------------------------------
@@ -101,8 +161,8 @@ def run_model(target_dir, model, metric3d_model=None) -> dict:
 
      # Device check
      device = "cuda" if torch.cuda.is_available() else "cpu"
-     if not torch.cuda.is_available():
-         raise ValueError("CUDA is not available. Check your environment.")
+     if device != "cuda":
+         raise RuntimeError("CUDA is not available. This Space requires a GPU (CUDA).")
 
      # Move model to device
      model = model.to(device)
@@ -126,6 +186,8 @@ def run_model(target_dir, model, metric3d_model=None) -> dict:
      with torch.cuda.amp.autocast(dtype=dtype):
          predictions = model(images)
 
+     scale_factor = torch.tensor(1.0, device=device)
+
      # Metric3D inference
      if metric3d_model is not None:
          print("Running Metric3D inference...")
@@ -176,15 +238,13 @@ def run_model(target_dir, model, metric3d_model=None) -> dict:
          metric_depth = metric_depth.unsqueeze(-1)  # -> (B, H, W, 1)
 
          # Move to same device/dtype
-         vggt_depth = vggt_depth.to(metric_depth.device).float()[0]
+         vggt_depth = vggt_depth.to(metric_depth.device).float()
          metric_depth = metric_depth.float()
 
          # Resize metric depth to match VGGT depth if they differ in spatial resolution
          # vggt_depth: (B, H, W, 1) or (B, H, W)
          # metric_depth: (B, H, W, 1) after permutation
 
-         target_h, target_w = vggt_depth.shape[1], vggt_depth.shape[2]
-
          # Mask for valid values to compute median
          print(f"Metric3D depth shape: {metric_depth.shape}")
          print(f"VGGT depth shape: {vggt_depth.shape}")
@@ -194,6 +254,8 @@ def run_model(target_dir, model, metric3d_model=None) -> dict:
          ratio = metric_depth[valid_mask] / vggt_depth[valid_mask]
          scale_factor = torch.median(ratio)
          print(f"Computed scale factor (VGGT / Metric3D): {scale_factor.item():.4f}")
+     else:
+         print("Warning: could not compute scale factor; falling back to 1.0")
      print("Converting pose encoding to extrinsic and intrinsic matrices...")
      extrinsic, intrinsic = pose_encoding_to_extri_intri(predictions["pose_enc"], images.shape[-2:])
      extrinsic = extrinsic[0]
@@ -222,7 +284,7 @@ def run_model(target_dir, model, metric3d_model=None) -> dict:
 
      # Generate world points from depth map
      print("Computing world points from depth map...")
-     predictions["depth"] = predictions["depth"] * scale_factor.item()
+     predictions["depth"] = predictions["depth"] * float(scale_factor.item())
      depth_map = predictions["depth"]
      world_points = unproject_depth_map_to_point_map(depth_map, predictions["extrinsic"], predictions["intrinsic"])
      predictions["world_points_from_depth"] = world_points
@@ -246,7 +308,7 @@ def handle_uploads(input_video, input_images):
 
      # Create a unique folder name
      timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
-     target_dir = f"temp/input/{timestamp}"
+     target_dir = os.path.join(WORK_DIR, "input", timestamp)
      target_dir_images = os.path.join(target_dir, "images")
 
      # Clean up if somehow that folder already exists
@@ -349,7 +411,8 @@ def reconstruct(
 
      print("Running run_model...")
      with torch.no_grad():
-         predictions = run_model(target_dir, model, metric3d_model=metric3d_model)
+         vggt_model, metric3d_model, _ = _init_models()
+         predictions = run_model(target_dir, vggt_model, metric3d_model=metric3d_model)
 
 
      # Save predictions
@@ -421,14 +484,82 @@ def reconstruct(
      end_time = time.time()
      print(f"Total time: {end_time - start_time:.2f} seconds (including IO)")
      log_msg = f"Reconstruction Success ({len(all_files)} frames). Waiting for visualization."
-     os.system(f"python {MK_PATH}/third_party/detectron2/projects/CropFormer/demo_cropformer/mask_predict.py \
-         --config-file {MK_PATH}/third_party/detectron2/projects/CropFormer/configs/entityv2/entity_segmentation/mask2former_hornet_3x.yaml \
-         --root temp/input/ --image_path_pattern images/*.jpg --dataset arkit_gt \
-         --seq_name_list {os.path.basename(target_dir)} --opts MODEL.WEIGHTS \
-         {MK_PATH}/Mask2Former_hornet_3x_576d0b.pth")
-     os.system(f"python {MK_PATH}/main.py --config wild --root /home/jovyan/users/bulat/workspace/3drec/vggt/temp/input --seq_name_list {os.path.basename(target_dir)}")
-     os.system(f"PYTHONPATH={MK_PATH} python {MK_PATH}/semantics/get_open-voc_features.py --config wild\
-         --root /home/jovyan/users/bulat/workspace/3drec/vggt/temp/input --seq_name_list {os.path.basename(target_dir)}")
+     # External pipelines are fragile in Spaces (often require compiled ops).
+     # We try to run them, but do not fail the whole app if they error.
+     root_input_dir = os.path.dirname(target_dir)
+     seq_name = os.path.basename(target_dir)
+     try:
+         subprocess.run(
+             [
+                 sys.executable,
+                 os.path.join(
+                     MK_PATH,
+                     "third_party",
+                     "detectron2",
+                     "projects",
+                     "CropFormer",
+                     "demo_cropformer",
+                     "mask_predict.py",
+                 ),
+                 "--config-file",
+                 os.path.join(
+                     MK_PATH,
+                     "third_party",
+                     "detectron2",
+                     "projects",
+                     "CropFormer",
+                     "configs",
+                     "entityv2",
+                     "entity_segmentation",
+                     "mask2former_hornet_3x.yaml",
+                 ),
+                 "--root",
+                 root_input_dir,
+                 "--image_path_pattern",
+                 "images/*.jpg",
+                 "--dataset",
+                 "arkit_gt",
+                 "--seq_name_list",
+                 seq_name,
+                 "--opts",
+                 "MODEL.WEIGHTS",
+                 os.path.join(MK_PATH, cropformer_name),
+             ],
+             check=True,
+         )
+
+         subprocess.run(
+             [
+                 sys.executable,
+                 os.path.join(MK_PATH, "main.py"),
+                 "--config",
+                 "wild",
+                 "--root",
+                 root_input_dir,
+                 "--seq_name_list",
+                 seq_name,
+             ],
+             check=True,
+         )
+
+         env = dict(os.environ)
+         env["PYTHONPATH"] = MK_PATH + (os.pathsep + env["PYTHONPATH"] if env.get("PYTHONPATH") else "")
+         subprocess.run(
+             [
+                 sys.executable,
+                 os.path.join(MK_PATH, "semantics", "get_open-voc_features.py"),
+                 "--config",
+                 "wild",
+                 "--root",
+                 root_input_dir,
+                 "--seq_name_list",
+                 seq_name,
+             ],
+             env=env,
+             check=True,
+         )
+     except Exception as e:
+         print(f"Warning: external MaskClustering pipeline failed: {e}")
 
      return glbfile, log_msg, gr.Dropdown(choices=frame_filter_choices, value=frame_filter, interactive=True)
 
@@ -669,10 +800,30 @@ def detect_objects(text_labels, target_dir, conf_thres, *viz_args):
      labels = [l.strip() for l in text_labels.split(";") if l.strip()]
      if labels:
          print(f"Extracting features for labels: {labels}")
+         _, _, clip_model = _init_models()
          text_features = extract_text_feature(labels, clip_model, target_dir)
          print(f"Text features: {text_features}")
-         os.system(f"PYTHONPATH={MK_PATH} python {MK_PATH}/semantics/wopen-voc_query.py --config wild\
-             --root /home/jovyan/users/bulat/workspace/3drec/vggt/temp/input --seq_name {os.path.basename(target_dir)}")
+         try:
+             env = dict(os.environ)
+             env["PYTHONPATH"] = MK_PATH + (os.pathsep + env["PYTHONPATH"] if env.get("PYTHONPATH") else "")
+             root_input_dir = os.path.dirname(target_dir)
+             seq_name = os.path.basename(target_dir)
+             subprocess.run(
+                 [
+                     sys.executable,
+                     os.path.join(MK_PATH, "semantics", "wopen-voc_query.py"),
+                     "--config",
+                     "wild",
+                     "--root",
+                     root_input_dir,
+                     "--seq_name",
+                     seq_name,
+                 ],
+                 env=env,
+                 check=True,
+             )
+         except Exception as e:
+             print(f"Warning: open-voc query failed: {e}")
 
      return visualize_detections(target_dir, conf_thres, *viz_args)
 
@@ -1101,4 +1252,9 @@ with gr.Blocks(
      outputs=[reconstruction_output, target_dir_output, image_gallery, log_output],
  )
 
- demo.queue(max_size=20).launch(show_error=True, share=True)
+ def main():
+     demo.queue(max_size=20).launch(show_error=True, share=False)
+
+
+ if __name__ == "__main__":
+     main()
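The depth handling spread across the hunks above reduces to one idea: VGGT predicts relative depth, Metric3D predicts metric depth, and the rescaling factor is the median of their per-pixel ratio over valid pixels, falling back to 1.0 when Metric3D is unavailable. A minimal self-contained sketch of that alignment, assuming a simple positive/finite validity mask (the exact `valid_mask` used in `run_model` is not shown in this diff):

```python
import torch


def median_scale_align(vggt_depth: torch.Tensor, metric_depth: torch.Tensor) -> torch.Tensor:
    """Rescale relative depth to metric units via the median per-pixel ratio."""
    valid = (vggt_depth > 0) & (metric_depth > 0) & torch.isfinite(metric_depth)
    scale = torch.median(metric_depth[valid] / vggt_depth[valid]) if valid.any() else torch.tensor(1.0)
    return vggt_depth * scale


# Toy check: relative depth that is off by a constant factor is recovered exactly.
rel = torch.rand(2, 64, 64, 1) + 0.1   # (B, H, W, 1), like the depth maps above
metric = rel * 2.5
assert torch.allclose(median_scale_align(rel, metric), metric)
```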
requirements.txt CHANGED
@@ -5,3 +5,16 @@ Pillow
  huggingface_hub
  einops
  safetensors
+ gradio==5.17.1
+ opencv-python
+ requests
+ trimesh
+ matplotlib
+ open-clip-torch
+ open3d
+ tqdm
+ hydra-core
+ omegaconf
+ scipy
+ onnxruntime
+ gdown