feat: CPU-compatible dummy pipeline for debugging
Browse files
mvp.py
CHANGED
|
@@ -22,6 +22,7 @@ import trimesh
|
|
| 22 |
import matplotlib.pyplot as plt
|
| 23 |
import subprocess
|
| 24 |
import tempfile
|
|
|
|
| 25 |
from huggingface_hub import hf_hub_download
|
| 26 |
|
| 27 |
try:
|
|
@@ -62,6 +63,13 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
| 62 |
|
| 63 |
print(f"Using device: {device}")
|
| 64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
_VGGT_MODEL = None
|
| 67 |
_METRIC3D_MODEL = None
|
|
@@ -117,7 +125,18 @@ def _init_models():
|
|
| 117 |
global _VGGT_MODEL, _METRIC3D_MODEL, _CLIP_MODEL
|
| 118 |
|
| 119 |
if not torch.cuda.is_available():
|
| 120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
|
| 122 |
if _VGGT_MODEL is None:
|
| 123 |
print("Initializing and loading VGGT model...")
|
|
@@ -154,6 +173,9 @@ def _init_models():
|
|
| 154 |
cropformer_name = "Mask2Former_hornet_3x_576d0b.pth"
|
| 155 |
|
| 156 |
def check_weights():
|
|
|
|
|
|
|
|
|
|
| 157 |
if not os.path.exists(os.path.join(MK_PATH, cropformer_name)):
|
| 158 |
print(f"Downloading {cropformer_name}...")
|
| 159 |
os.makedirs(MK_PATH, exist_ok=True)
|
|
@@ -195,14 +217,18 @@ def run_model(target_dir, model, metric3d_model=None) -> dict:
|
|
| 195 |
"""
|
| 196 |
print(f"Processing images from {target_dir}")
|
| 197 |
|
| 198 |
-
# Device
|
| 199 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 200 |
if device != "cuda":
|
| 201 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
|
| 203 |
-
#
|
| 204 |
-
model = model.to(device)
|
| 205 |
-
model.eval()
|
| 206 |
|
| 207 |
# Load and preprocess images
|
| 208 |
image_names = glob.glob(os.path.join(target_dir, "images", "*"))
|
|
@@ -211,15 +237,71 @@ def run_model(target_dir, model, metric3d_model=None) -> dict:
|
|
| 211 |
if len(image_names) == 0:
|
| 212 |
raise ValueError("No images found. Check your upload.")
|
| 213 |
|
| 214 |
-
|
| 215 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
|
| 217 |
-
# Run inference
|
| 218 |
print("Running inference...")
|
| 219 |
dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16
|
|
|
|
| 220 |
|
| 221 |
with torch.no_grad():
|
| 222 |
-
with
|
| 223 |
predictions = model(images)
|
| 224 |
|
| 225 |
scale_factor = torch.tensor(1.0, device=device)
|
|
@@ -329,7 +411,8 @@ def run_model(target_dir, model, metric3d_model=None) -> dict:
|
|
| 329 |
predictions["world_points_from_depth"] = world_points
|
| 330 |
|
| 331 |
# Clean up
|
| 332 |
-
torch.cuda.
|
|
|
|
| 333 |
return predictions
|
| 334 |
|
| 335 |
|
|
@@ -343,7 +426,8 @@ def handle_uploads(input_video, input_images):
|
|
| 343 |
"""
|
| 344 |
start_time = time.time()
|
| 345 |
gc.collect()
|
| 346 |
-
torch.cuda.
|
|
|
|
| 347 |
|
| 348 |
# Create a unique folder name
|
| 349 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
|
|
@@ -439,7 +523,8 @@ def reconstruct(
|
|
| 439 |
|
| 440 |
start_time = time.time()
|
| 441 |
gc.collect()
|
| 442 |
-
torch.cuda.
|
|
|
|
| 443 |
|
| 444 |
# Prepare frame_filter dropdown
|
| 445 |
target_dir_images = os.path.join(target_dir, "images")
|
|
@@ -814,10 +899,11 @@ def detect_objects(text_labels, target_dir, conf_thres, *viz_args):
|
|
| 814 |
return None, "Please enter at least one text label (separated by ';')."
|
| 815 |
|
| 816 |
# Ensure CropFormer weights exist (if detection pipeline uses them)
|
| 817 |
-
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
|
|
|
|
| 821 |
|
| 822 |
# 1. Run reconstruction first if needed (checking if predictions exist)
|
| 823 |
predictions_path = os.path.join(target_dir, "predictions.npz")
|
|
|
|
| 22 |
import matplotlib.pyplot as plt
|
| 23 |
import subprocess
|
| 24 |
import tempfile
|
| 25 |
+
import contextlib
|
| 26 |
from huggingface_hub import hf_hub_download
|
| 27 |
|
| 28 |
try:
|
|
|
|
| 63 |
|
| 64 |
print(f"Using device: {device}")
|
| 65 |
|
| 66 |
+
# CPU debug / compatibility knobs (environment variables; a flag is enabled when its value is "1"):
|
| 67 |
+
# - On CPU, VGGT-1B inference is usually impractical. For debugging, we fall back to a lightweight
|
| 68 |
+
# dummy pipeline that produces a minimal predictions dict compatible with `predictions_to_glb`.
|
| 69 |
+
ZOO3D_ALLOW_CPU = os.environ.get("ZOO3D_ALLOW_CPU", "1") == "1"
|
| 70 |
+
ZOO3D_CPU_DUMMY = os.environ.get("ZOO3D_CPU_DUMMY", "1") == "1"
|
| 71 |
+
ZOO3D_SKIP_DOWNLOADS = os.environ.get("ZOO3D_SKIP_DOWNLOADS", "0") == "1"
|
| 72 |
+
|
| 73 |
|
| 74 |
_VGGT_MODEL = None
|
| 75 |
_METRIC3D_MODEL = None
|
|
|
|
| 125 |
global _VGGT_MODEL, _METRIC3D_MODEL, _CLIP_MODEL
|
| 126 |
|
| 127 |
if not torch.cuda.is_available():
|
| 128 |
+
# CPU-friendly mode for debugging: skip heavy models.
|
| 129 |
+
if not ZOO3D_ALLOW_CPU:
|
| 130 |
+
raise RuntimeError("CUDA недоступна. Для этого Space нужен GPU (CUDA).")
|
| 131 |
+
# We can still load CLIP on CPU if needed, but skip VGGT/Metric3D.
|
| 132 |
+
if _CLIP_MODEL is None:
|
| 133 |
+
print("[INFO] loading CLIP model (CPU)...")
|
| 134 |
+
cm, _, _ = open_clip.create_model_and_transforms("ViT-H-14", pretrained="laion2b_s32b_b79k")
|
| 135 |
+
cm.to("cpu")
|
| 136 |
+
cm.eval()
|
| 137 |
+
print("[INFO] finish loading CLIP model (CPU)...")
|
| 138 |
+
globals()["_CLIP_MODEL"] = cm
|
| 139 |
+
return None, None, _CLIP_MODEL
|
| 140 |
|
| 141 |
if _VGGT_MODEL is None:
|
| 142 |
print("Initializing and loading VGGT model...")
|
|
|
|
| 173 |
cropformer_name = "Mask2Former_hornet_3x_576d0b.pth"
|
| 174 |
|
| 175 |
def check_weights():
|
| 176 |
+
if ZOO3D_SKIP_DOWNLOADS:
|
| 177 |
+
print("[INFO] ZOO3D_SKIP_DOWNLOADS=1: skipping Mask2Former weights download.")
|
| 178 |
+
return
|
| 179 |
if not os.path.exists(os.path.join(MK_PATH, cropformer_name)):
|
| 180 |
print(f"Downloading {cropformer_name}...")
|
| 181 |
os.makedirs(MK_PATH, exist_ok=True)
|
|
|
|
| 217 |
"""
|
| 218 |
print(f"Processing images from {target_dir}")
|
| 219 |
|
| 220 |
+
# Device selection
|
| 221 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 222 |
if device != "cuda":
|
| 223 |
+
if not ZOO3D_ALLOW_CPU:
|
| 224 |
+
raise RuntimeError("CUDA недоступна. Для этого Space нужен GPU (CUDA).")
|
| 225 |
+
if not ZOO3D_CPU_DUMMY:
|
| 226 |
+
raise RuntimeError(
|
| 227 |
+
"CPU режим включен, но ZOO3D_CPU_DUMMY=0. "
|
| 228 |
+
"Для отладки поставь ZOO3D_CPU_DUMMY=1 или включи GPU."
|
| 229 |
+
)
|
| 230 |
|
| 231 |
+
# Load and preprocess images (we need them for both GPU and CPU-dummy)
|
|
|
|
|
|
|
| 232 |
|
| 233 |
# Load and preprocess images
|
| 234 |
image_names = glob.glob(os.path.join(target_dir, "images", "*"))
|
|
|
|
| 237 |
if len(image_names) == 0:
|
| 238 |
raise ValueError("No images found. Check your upload.")
|
| 239 |
|
| 240 |
+
# For CPU dummy mode we want the original HxW for `predictions_to_glb` coloring.
|
| 241 |
+
cpu_images_u8 = None
|
| 242 |
+
if device == "cpu":
|
| 243 |
+
imgs = []
|
| 244 |
+
for p in image_names:
|
| 245 |
+
im = cv2.imread(p, cv2.IMREAD_COLOR)
|
| 246 |
+
if im is None:
|
| 247 |
+
continue
|
| 248 |
+
im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
|
| 249 |
+
imgs.append(im)
|
| 250 |
+
if len(imgs) == 0:
|
| 251 |
+
raise ValueError("No readable images found. Check your upload.")
|
| 252 |
+
# Resize every image to the first image's (H, W) so they can be stacked into one array
|
| 253 |
+
H, W = imgs[0].shape[:2]
|
| 254 |
+
imgs2 = []
|
| 255 |
+
for im in imgs:
|
| 256 |
+
if im.shape[:2] != (H, W):
|
| 257 |
+
im = cv2.resize(im, (W, H))
|
| 258 |
+
imgs2.append(im)
|
| 259 |
+
cpu_images_u8 = np.stack(imgs2, axis=0) # (S,H,W,3) uint8
|
| 260 |
+
print(f"CPU dummy: loaded images shape: {cpu_images_u8.shape}")
|
| 261 |
+
|
| 262 |
+
images = load_and_preprocess_images(image_names)
|
| 263 |
+
print(f"Preprocessed images shape: {tuple(images.shape)}")
|
| 264 |
+
if device == "cuda":
|
| 265 |
+
images = images.to(device)
|
| 266 |
+
|
| 267 |
+
if device == "cpu":
|
| 268 |
+
# Dummy predictions for CPU debugging: minimal keys needed by `predictions_to_glb`
|
| 269 |
+
S, H, W = cpu_images_u8.shape[0], cpu_images_u8.shape[1], cpu_images_u8.shape[2]
|
| 270 |
+
# Simple planar point cloud in camera space: a z = 1 plane, with x and y both divided by the image width W
|
| 271 |
+
uu, vv = np.meshgrid(np.arange(W), np.arange(H))
|
| 272 |
+
x = (uu - (W / 2.0)) / float(max(W, 1))
|
| 273 |
+
y = -(vv - (H / 2.0)) / float(max(W, 1))
|
| 274 |
+
z = np.ones_like(x, dtype=np.float32) * 1.0
|
| 275 |
+
pts = np.stack([x, y, z], axis=-1).astype(np.float32) # (H,W,3)
|
| 276 |
+
world_points_from_depth = np.repeat(pts[None, ...], S, axis=0) # (S,H,W,3)
|
| 277 |
+
depth = np.ones((S, H, W, 1), dtype=np.float32)
|
| 278 |
+
depth_conf = np.ones((S, H, W), dtype=np.float32)
|
| 279 |
+
extrinsic = np.tile(np.array([[1, 0, 0, 0],
|
| 280 |
+
[0, 1, 0, 0],
|
| 281 |
+
[0, 0, 1, 0]], dtype=np.float32)[None, ...], (S, 1, 1))
|
| 282 |
+
intrinsic = np.tile(np.eye(3, dtype=np.float32)[None, ...], (S, 1, 1))
|
| 283 |
+
pose = np.tile(np.eye(4, dtype=np.float32)[None, ...], (S, 1, 1))
|
| 284 |
+
return {
|
| 285 |
+
"images": cpu_images_u8,
|
| 286 |
+
"extrinsic": extrinsic,
|
| 287 |
+
"intrinsic": intrinsic,
|
| 288 |
+
"pose": pose,
|
| 289 |
+
"depth": depth,
|
| 290 |
+
"depth_conf": depth_conf,
|
| 291 |
+
"world_points_from_depth": world_points_from_depth,
|
| 292 |
+
}
|
| 293 |
+
|
| 294 |
+
# GPU inference
|
| 295 |
+
# Move model to device
|
| 296 |
+
model = model.to(device)
|
| 297 |
+
model.eval()
|
| 298 |
|
|
|
|
| 299 |
print("Running inference...")
|
| 300 |
dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16
|
| 301 |
+
amp_ctx = torch.cuda.amp.autocast(dtype=dtype) if device == "cuda" else contextlib.nullcontext()
|
| 302 |
|
| 303 |
with torch.no_grad():
|
| 304 |
+
with amp_ctx:
|
| 305 |
predictions = model(images)
|
| 306 |
|
| 307 |
scale_factor = torch.tensor(1.0, device=device)
|
|
|
|
| 411 |
predictions["world_points_from_depth"] = world_points
|
| 412 |
|
| 413 |
# Clean up
|
| 414 |
+
if torch.cuda.is_available():
|
| 415 |
+
torch.cuda.empty_cache()
|
| 416 |
return predictions
|
| 417 |
|
| 418 |
|
|
|
|
| 426 |
"""
|
| 427 |
start_time = time.time()
|
| 428 |
gc.collect()
|
| 429 |
+
if torch.cuda.is_available():
|
| 430 |
+
torch.cuda.empty_cache()
|
| 431 |
|
| 432 |
# Create a unique folder name
|
| 433 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
|
|
|
|
| 523 |
|
| 524 |
start_time = time.time()
|
| 525 |
gc.collect()
|
| 526 |
+
if torch.cuda.is_available():
|
| 527 |
+
torch.cuda.empty_cache()
|
| 528 |
|
| 529 |
# Prepare frame_filter dropdown
|
| 530 |
target_dir_images = os.path.join(target_dir, "images")
|
|
|
|
| 899 |
return None, "Please enter at least one text label (separated by ';')."
|
| 900 |
|
| 901 |
# Ensure CropFormer weights exist (if detection pipeline uses them)
|
| 902 |
+
if torch.cuda.is_available() or not ZOO3D_SKIP_DOWNLOADS:
|
| 903 |
+
try:
|
| 904 |
+
check_weights()
|
| 905 |
+
except Exception as e:
|
| 906 |
+
print(f"Warning: could not ensure Mask2Former weights: {e}")
|
| 907 |
|
| 908 |
# 1. Run reconstruction first if needed (checking if predictions exist)
|
| 909 |
predictions_path = os.path.join(target_dir, "predictions.npz")
|