bulatko committed
Commit 6c099d4 · 1 Parent(s): e4c5797

fix detectron & weights

Files changed (4)
  1. README.md +7 -0
  2. app.py +23 -0
  3. mvp.py +202 -46
  4. requirements.txt +13 -0
README.md CHANGED
@@ -1,3 +1,10 @@
+ ---
+ title: Zoo3D (VGGT + open-vocabulary 3D detection)
+ sdk: gradio
+ app_file: app.py
+ pinned: false
+ ---
+
  <div align="center">
  <h1>VGGT: Visual Geometry Grounded Transformer</h1>
 
app.py ADDED
@@ -0,0 +1,23 @@
+ import os
+
+ import gradio as gr
+
+
+ def _launch():
+     # HF Spaces expects the app to listen on 0.0.0.0:7860 (PORT may be provided).
+     import mvp
+
+     port = int(os.getenv("PORT", "7860"))
+     # `mvp` defines `demo` (gr.Blocks). We launch it here instead of inside `mvp.py`.
+     mvp.demo.queue(max_size=20).launch(
+         server_name="0.0.0.0",
+         server_port=port,
+         show_error=True,
+         share=False,
+     )
+
+
+ if __name__ == "__main__":
+     _launch()
+
+
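This launcher relies on `mvp` defining the `demo` Blocks object at import time without launching it itself (the `main()` guard added at the bottom of `mvp.py` below preserves that). A minimal sketch of the pattern, with a toy module standing in for `mvp` (names and UI contents are illustrative, not part of this commit):

```python
# toy_app.py - illustrative stand-in for the mvp / app.py split, not part of this repo
import os

import gradio as gr

with gr.Blocks() as demo:          # built at import time, never launched here
    gr.Markdown("UI goes here")


def main():
    # A wrapper (like app.py) decides where and how to launch.
    demo.queue(max_size=20).launch(
        server_name="0.0.0.0",
        server_port=int(os.getenv("PORT", "7860")),
    )


if __name__ == "__main__":
    main()
```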
mvp.py CHANGED
@@ -20,9 +20,22 @@ import open_clip
  from open_clip import tokenizer
  import trimesh
  import matplotlib.pyplot as plt
+ import subprocess
+ import tempfile
+ from huggingface_hub import hf_hub_download
 
- sys.path.append("vggt/")
- MK_PATH = "MaskClustering"
+ try:
+     import gdown
+ except Exception:
+     gdown = None
+
+ REPO_ROOT = os.path.dirname(os.path.abspath(__file__))
+ sys.path.append(os.path.join(REPO_ROOT, "vggt"))
+ MK_PATH = os.path.join(REPO_ROOT, "MaskClustering")
+
+ # Writable workdir (HF Spaces: prefer /tmp)
+ WORK_DIR = os.environ.get("ZOO3D_WORKDIR", os.path.join(tempfile.gettempdir(), "zoo3d"))
+ os.makedirs(WORK_DIR, exist_ok=True)
  from visual_util import predictions_to_glb
  from vggt.models.vggt import VGGT
  from vggt.utils.load_fn import load_and_preprocess_images
@@ -34,46 +47,93 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
  print(f"Using device: {device}")
 
 
- print("Initializing and loading VGGT model...")
- # model = VGGT.from_pretrained("facebook/VGGT-1B") # another way to load the model
+ _VGGT_MODEL = None
+ _METRIC3D_MODEL = None
+ _CLIP_MODEL = None
+
 
- model = VGGT()
- _URL = "https://huggingface.co/facebook/VGGT-1B/resolve/main/model.pt"
- model.load_state_dict(torch.hub.load_state_dict_from_url(_URL))
+ def _download_vggt_weights(dst_path: str) -> str:
+     """
+     Download VGGT weights from Google Drive to dst_path.
+     The user provided:
+     https://drive.google.com/file/d/10G7s6bVMwN__bcrR2fBal3goo69Y5Do4/view?usp=sharing
+     """
+     if os.path.exists(dst_path) and os.path.getsize(dst_path) > 0:
+         return dst_path
 
+     if gdown is None:
+         raise RuntimeError("The gdown package is missing. Add gdown to requirements.txt to download the weights from Google Drive.")
 
- model.eval()
- model = model.to(device)
+     os.makedirs(os.path.dirname(dst_path), exist_ok=True)
+     url = "https://drive.google.com/uc?id=10G7s6bVMwN__bcrR2fBal3goo69Y5Do4"
+     out = gdown.download(url, dst_path, quiet=False)
+     if out is None or not os.path.exists(dst_path) or os.path.getsize(dst_path) == 0:
+         raise RuntimeError("Could not download the VGGT weights from Google Drive (check access/quota/public sharing).")
+     return dst_path
 
- print("Initializing and loading Metric3D model...")
- try:
-     metric3d_model = torch.hub.load('yvanyin/metric3d', 'metric3d_vit_small', pretrain=True, trust_repo=True)
- except TypeError:
-     metric3d_model = torch.hub.load('yvanyin/metric3d', 'metric3d_vit_small', pretrain=True)
- metric3d_model.to(device)
- metric3d_model.eval()
+
+ def _init_models():
+     """
+     Lazy-load heavy models so the UI can start quickly on HF Spaces.
+     """
+     global _VGGT_MODEL, _METRIC3D_MODEL, _CLIP_MODEL
+
+     if not torch.cuda.is_available():
+         raise RuntimeError("CUDA is not available. This Space requires a GPU (CUDA).")
+
+     if _VGGT_MODEL is None:
+         print("Initializing and loading VGGT model...")
+         m = VGGT()
+         weights_path = os.environ.get("VGGT_WEIGHTS_PATH")
+         if not weights_path:
+             weights_path = os.path.join(WORK_DIR, "weights", "vggt_model.pt")
+         _download_vggt_weights(weights_path)
+         state = torch.load(weights_path, map_location="cpu")
+         m.load_state_dict(state)
+         m.eval()
+         _VGGT_MODEL = m.to(device)
+
+     if _METRIC3D_MODEL is None:
+         print("Initializing and loading Metric3D model...")
+         try:
+             mm = torch.hub.load("yvanyin/metric3d", "metric3d_vit_small", pretrain=True, trust_repo=True)
+         except TypeError:
+             mm = torch.hub.load("yvanyin/metric3d", "metric3d_vit_small", pretrain=True)
+         mm.to(device)
+         mm.eval()
+         _METRIC3D_MODEL = mm
+
+     if _CLIP_MODEL is None:
+         print("[INFO] loading CLIP model...")
+         cm, _, _ = open_clip.create_model_and_transforms("ViT-H-14", pretrained="laion2b_s32b_b79k")
+         cm.to(device)
+         cm.eval()
+         print("[INFO] finish loading CLIP model...")
+         _CLIP_MODEL = cm
+
+     return _VGGT_MODEL, _METRIC3D_MODEL, _CLIP_MODEL
 
  cropformer_name = "Mask2Former_hornet_3x_576d0b.pth"
 
  def check_weights():
      if not os.path.exists(os.path.join(MK_PATH, cropformer_name)):
          print(f"Downloading {cropformer_name}...")
-         os.system(f"wget https://huggingface.co/datasets/qqlu1992/Adobe_EntitySeg/resolve/main/CropFormer_model/Entity_Segmentation/Mask2Former_hornet_3x/Mask2Former_hornet_3x_576d0b.pth?download=true -O {os.path.join(MK_PATH, cropformer_name)}")
+         # Prefer HF cache over `wget` for Spaces compatibility.
+         cached = hf_hub_download(
+             repo_id="qqlu1992/Adobe_EntitySeg",
+             repo_type="dataset",
+             filename="CropFormer_model/Entity_Segmentation/Mask2Former_hornet_3x/Mask2Former_hornet_3x_576d0b.pth",
+         )
+         os.makedirs(MK_PATH, exist_ok=True)
+         dst = os.path.join(MK_PATH, cropformer_name)
+         shutil.copyfile(cached, dst)
          print(f"Downloaded {cropformer_name}...")
      else:
          print(f"{cropformer_name} already exists...")
  check_weights()
 
- def load_clip():
-     print(f'[INFO] loading CLIP model...')
-     model, _, _ = open_clip.create_model_and_transforms("ViT-H-14", pretrained="laion2b_s32b_b79k")
-     model.cuda()
-     model.eval()
-     print(f'[INFO]', ' finish loading CLIP model...')
-     return model
-
  def extract_text_feature(descriptions, clip_model, target_path):
-     text_tokens = tokenizer.tokenize(descriptions).cuda()
+     text_tokens = tokenizer.tokenize(descriptions).to(device)
      with torch.no_grad():
          text_features = clip_model.encode_text(text_tokens).float()
          text_features /= text_features.norm(dim=-1, keepdim=True)
@@ -87,7 +147,7 @@ def extract_text_feature(descriptions, clip_model, target_path):
      return text_features_dict
 
 
- clip_model = load_clip()
+ clip_model = None
 
 
  # -------------------------------------------------------------------------
@@ -101,8 +161,8 @@ def run_model(target_dir, model, metric3d_model=None) -> dict:
 
      # Device check
      device = "cuda" if torch.cuda.is_available() else "cpu"
-     if not torch.cuda.is_available():
-         raise ValueError("CUDA is not available. Check your environment.")
+     if device != "cuda":
+         raise RuntimeError("CUDA is not available. This Space requires a GPU (CUDA).")
 
      # Move model to device
      model = model.to(device)
@@ -126,6 +186,8 @@ def run_model(target_dir, model, metric3d_model=None) -> dict:
      with torch.cuda.amp.autocast(dtype=dtype):
          predictions = model(images)
 
+     scale_factor = torch.tensor(1.0, device=device)
+
      # Metric3D inference
      if metric3d_model is not None:
          print("Running Metric3D inference...")
@@ -176,15 +238,13 @@ def run_model(target_dir, model, metric3d_model=None) -> dict:
          metric_depth = metric_depth.unsqueeze(-1)  # -> (B, H, W, 1)
 
          # Move to same device/dtype
-         vggt_depth = vggt_depth.to(metric_depth.device).float()[0]
+         vggt_depth = vggt_depth.to(metric_depth.device).float()
          metric_depth = metric_depth.float()
 
          # Resize metric depth to match VGGT depth if they differ in spatial resolution
          # vggt_depth: (B, H, W, 1) or (B, H, W)
          # metric_depth: (B, H, W, 1) after permutation
 
-         target_h, target_w = vggt_depth.shape[1], vggt_depth.shape[2]
-
          # Mask for valid values to compute median
          print(f"Metric3D depth shape: {metric_depth.shape}")
          print(f"VGGT depth shape: {vggt_depth.shape}")
@@ -194,6 +254,8 @@ def run_model(target_dir, model, metric3d_model=None) -> dict:
          ratio = metric_depth[valid_mask] / vggt_depth[valid_mask]
          scale_factor = torch.median(ratio)
          print(f"Computed scale factor (VGGT / Metric3D): {scale_factor.item():.4f}")
+     else:
+         print("Warning: could not compute scale factor; falling back to 1.0")
      print("Converting pose encoding to extrinsic and intrinsic matrices...")
      extrinsic, intrinsic = pose_encoding_to_extri_intri(predictions["pose_enc"], images.shape[-2:])
      extrinsic = extrinsic[0]
@@ -222,7 +284,7 @@ def run_model(target_dir, model, metric3d_model=None) -> dict:
 
      # Generate world points from depth map
      print("Computing world points from depth map...")
-     predictions["depth"] = predictions["depth"] * scale_factor.item()
+     predictions["depth"] = predictions["depth"] * float(scale_factor.item())
      depth_map = predictions["depth"]
      world_points = unproject_depth_map_to_point_map(depth_map, predictions["extrinsic"], predictions["intrinsic"])
      predictions["world_points_from_depth"] = world_points
@@ -246,7 +308,7 @@ def handle_uploads(input_video, input_images):
 
      # Create a unique folder name
      timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
-     target_dir = f"temp/input/{timestamp}"
+     target_dir = os.path.join(WORK_DIR, "input", timestamp)
      target_dir_images = os.path.join(target_dir, "images")
 
      # Clean up if somehow that folder already exists
@@ -349,7 +411,8 @@ def reconstruct(
 
      print("Running run_model...")
      with torch.no_grad():
-         predictions = run_model(target_dir, model, metric3d_model=metric3d_model)
+         vggt_model, metric3d_model, _ = _init_models()
+         predictions = run_model(target_dir, vggt_model, metric3d_model=metric3d_model)
 
 
      # Save predictions
@@ -421,14 +484,82 @@ def reconstruct(
      end_time = time.time()
      print(f"Total time: {end_time - start_time:.2f} seconds (including IO)")
      log_msg = f"Reconstruction Success ({len(all_files)} frames). Waiting for visualization."
-     os.system(f"python {MK_PATH}/third_party/detectron2/projects/CropFormer/demo_cropformer/mask_predict.py \
-         --config-file {MK_PATH}/third_party/detectron2/projects/CropFormer/configs/entityv2/entity_segmentation/mask2former_hornet_3x.yaml \
-         --root temp/input/ --image_path_pattern images/*.jpg --dataset arkit_gt \
-         --seq_name_list {os.path.basename(target_dir)} --opts MODEL.WEIGHTS \
-         {MK_PATH}/Mask2Former_hornet_3x_576d0b.pth")
-     os.system(f"python {MK_PATH}/main.py --config wild --root /home/jovyan/users/bulat/workspace/3drec/vggt/temp/input --seq_name_list {os.path.basename(target_dir)}")
-     os.system(f"PYTHONPATH={MK_PATH} python {MK_PATH}/semantics/get_open-voc_features.py --config wild\
-         --root /home/jovyan/users/bulat/workspace/3drec/vggt/temp/input --seq_name_list {os.path.basename(target_dir)}")
+     # External pipelines are fragile in Spaces (often require compiled ops).
+     # We try to run them, but do not fail the whole app if they error.
+     root_input_dir = os.path.dirname(target_dir)
+     seq_name = os.path.basename(target_dir)
+     try:
+         subprocess.run(
+             [
+                 sys.executable,
+                 os.path.join(
+                     MK_PATH,
+                     "third_party",
+                     "detectron2",
+                     "projects",
+                     "CropFormer",
+                     "demo_cropformer",
+                     "mask_predict.py",
+                 ),
+                 "--config-file",
+                 os.path.join(
+                     MK_PATH,
+                     "third_party",
+                     "detectron2",
+                     "projects",
+                     "CropFormer",
+                     "configs",
+                     "entityv2",
+                     "entity_segmentation",
+                     "mask2former_hornet_3x.yaml",
+                 ),
+                 "--root",
+                 root_input_dir,
+                 "--image_path_pattern",
+                 "images/*.jpg",
+                 "--dataset",
+                 "arkit_gt",
+                 "--seq_name_list",
+                 seq_name,
+                 "--opts",
+                 "MODEL.WEIGHTS",
+                 os.path.join(MK_PATH, cropformer_name),
+             ],
+             check=True,
+         )
+
+         subprocess.run(
+             [
+                 sys.executable,
+                 os.path.join(MK_PATH, "main.py"),
+                 "--config",
+                 "wild",
+                 "--root",
+                 root_input_dir,
+                 "--seq_name_list",
+                 seq_name,
+             ],
+             check=True,
+         )
+
+         env = dict(os.environ)
+         env["PYTHONPATH"] = MK_PATH + (os.pathsep + env["PYTHONPATH"] if env.get("PYTHONPATH") else "")
+         subprocess.run(
+             [
+                 sys.executable,
+                 os.path.join(MK_PATH, "semantics", "get_open-voc_features.py"),
+                 "--config",
+                 "wild",
+                 "--root",
+                 root_input_dir,
+                 "--seq_name_list",
+                 seq_name,
+             ],
+             env=env,
+             check=True,
+         )
+     except Exception as e:
+         print(f"Warning: external MaskClustering pipeline failed: {e}")
 
      return glbfile, log_msg, gr.Dropdown(choices=frame_filter_choices, value=frame_filter, interactive=True)
 
@@ -669,10 +800,30 @@ def detect_objects(text_labels, target_dir, conf_thres, *viz_args):
      labels = [l.strip() for l in text_labels.split(";") if l.strip()]
      if labels:
          print(f"Extracting features for labels: {labels}")
+         _, _, clip_model = _init_models()
          text_features = extract_text_feature(labels, clip_model, target_dir)
          print(f"Text features: {text_features}")
-         os.system(f"PYTHONPATH={MK_PATH} python {MK_PATH}/semantics/wopen-voc_query.py --config wild\
-             --root /home/jovyan/users/bulat/workspace/3drec/vggt/temp/input --seq_name {os.path.basename(target_dir)}")
+         try:
+             env = dict(os.environ)
+             env["PYTHONPATH"] = MK_PATH + (os.pathsep + env["PYTHONPATH"] if env.get("PYTHONPATH") else "")
+             root_input_dir = os.path.dirname(target_dir)
+             seq_name = os.path.basename(target_dir)
+             subprocess.run(
+                 [
+                     sys.executable,
+                     os.path.join(MK_PATH, "semantics", "wopen-voc_query.py"),
+                     "--config",
+                     "wild",
+                     "--root",
+                     root_input_dir,
+                     "--seq_name",
+                     seq_name,
+                 ],
+                 env=env,
+                 check=True,
+             )
+         except Exception as e:
+             print(f"Warning: open-voc query failed: {e}")
 
      return visualize_detections(target_dir, conf_thres, *viz_args)
 
@@ -1101,4 +1252,9 @@ with gr.Blocks(
      outputs=[reconstruction_output, target_dir_output, image_gallery, log_output],
  )
 
- demo.queue(max_size=20).launch(show_error=True, share=True)
+ def main():
+     demo.queue(max_size=20).launch(show_error=True, share=False)
+
+
+ if __name__ == "__main__":
+     main()
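The depth handling spread across the hunks above reduces to one idea: VGGT predicts relative depth, Metric3D predicts metric depth, and the rescaling factor is the median of their per-pixel ratio over valid pixels, falling back to 1.0 when Metric3D is unavailable. A minimal self-contained sketch of that alignment, assuming a simple positive/finite validity mask (the exact `valid_mask` used in `run_model` is not shown in this diff):

```python
import torch


def median_scale_align(vggt_depth: torch.Tensor, metric_depth: torch.Tensor) -> torch.Tensor:
    """Rescale relative depth to metric units via the median per-pixel ratio."""
    valid = (vggt_depth > 0) & (metric_depth > 0) & torch.isfinite(metric_depth)
    scale = torch.median(metric_depth[valid] / vggt_depth[valid]) if valid.any() else torch.tensor(1.0)
    return vggt_depth * scale


# Toy check: relative depth that is off by a constant factor is recovered exactly.
rel = torch.rand(2, 64, 64, 1) + 0.1   # (B, H, W, 1), like the depth maps above
metric = rel * 2.5
assert torch.allclose(median_scale_align(rel, metric), metric)
```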
requirements.txt CHANGED
@@ -5,3 +5,16 @@ Pillow
  huggingface_hub
  einops
  safetensors
+ gradio==5.17.1
+ opencv-python
+ requests
+ trimesh
+ matplotlib
+ open-clip-torch
+ open3d
+ tqdm
+ hydra-core
+ omegaconf
+ scipy
+ onnxruntime
+ gdown