ritianyu committed on
Commit
6afe9df
·
1 Parent(s): bc84e54
Files changed (2) hide show
  1. InfiniDepth/utils/hf_demo_utils.py +296 -108
  2. app.py +101 -30
InfiniDepth/utils/hf_demo_utils.py CHANGED
@@ -184,6 +184,24 @@ class GPUInferenceResult:
184
  cy_out: float
185
 
186
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  class ModelCache:
188
  def __init__(self):
189
  self._cache: dict[tuple[str, str], Any] = {}
@@ -265,11 +283,29 @@ def _prepare_image_tensor(
265
  return _image_tensor_from_numpy(resized, device), org_h, org_w
266
 
267
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  def _resolve_depth_inputs(
269
  depth_path: Optional[str],
270
  input_size: tuple[int, int],
271
  image: torch.Tensor,
272
  device: torch.device,
 
273
  ) -> tuple[
274
  torch.Tensor,
275
  torch.Tensor,
@@ -280,7 +316,8 @@ def _resolve_depth_inputs(
280
  Optional[tuple[float, float, float, float]],
281
  ]:
282
  input_depth_path = depth_path if depth_path else None
283
- moge2_pretrained = resolve_moge2_pretrained()
 
284
  gt_depth, prompt_depth, gt_depth_mask, used_input_depth, moge2_intrinsics = prepare_metric_depth_inputs(
285
  input_depth_path=input_depth_path,
286
  input_size=input_size,
@@ -373,6 +410,234 @@ def resolve_checkpoint_path(model_type: str) -> str:
373
  )
374
 
375
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376
  def run_single_image_demo(
377
  image_np: np.ndarray,
378
  depth_path: Optional[str],
@@ -535,116 +800,39 @@ def run_gpu_inference(
535
  model_cache: Optional[ModelCache] = None,
536
  stage_callback: Optional[Callable[[str], None]] = None,
537
  ) -> GPUInferenceResult:
538
- """Run only GPU-intensive inference. All outputs are moved to CPU numpy before return.
539
-
540
- This is designed for HuggingFace ZeroGPU where GPU time is limited: only the actual
541
- CUDA work (MoGe-2, model inference, intrinsics estimation) runs here. CPU-heavy
542
- post-processing (colorization, point cloud, file I/O) should happen outside the
543
- ``@spaces.GPU`` decorated caller.
544
- """
545
- image_shape = tuple(int(d) for d in image_np.shape) if image_np is not None else None
546
- _report_stage(stage_callback, "gpu:start")
547
- Log.info(
548
- f"run_gpu_inference: model_type={model_type}, input_size={input_size_text}, "
549
- f"output_resolution_mode={output_resolution_mode}, upsample_ratio={upsample_ratio}, "
550
- f"has_depth={bool(depth_path)}, image_shape={image_shape}, "
551
- f"cuda_available={torch.cuda.is_available()}"
552
- )
553
- if not torch.cuda.is_available():
554
- raise RuntimeError(
555
- "No CUDA GPU is available. If using Hugging Face ZeroGPU, "
556
- "decorate the Gradio inference function with @spaces.GPU and enable queue()."
557
- )
558
-
559
- input_size = _parse_image_size(input_size_text)
560
- if upsample_ratio < 1 or upsample_ratio > 8:
561
- raise ValueError("upsample_ratio must be in [1, 8]")
562
- output_size = input_size
563
- device = torch.device("cuda")
564
-
565
- _debug = os.getenv("INFINIDEPTH_DEBUG_GPU", "0") == "1"
566
-
567
- image, org_h, org_w = _prepare_image_tensor(image_np, input_size, device)
568
- _report_stage(stage_callback, "gpu:image_prepared")
569
- if _debug:
570
- torch.cuda.synchronize()
571
- Log.info(f"[GPU-DEBUG] image_prepared: GPU mem allocated={torch.cuda.memory_allocated(device) / 1e6:.1f}MB")
572
-
573
- h_in, w_in = input_size
574
- h_out, w_out = resolve_output_size_from_mode(
575
  output_resolution_mode=output_resolution_mode,
576
- org_h=org_h, org_w=org_w, h=h_in, w=w_in,
577
- output_size=output_size, upsample_ratio=upsample_ratio,
578
- )
579
-
580
- if model_type == "InfiniDepth_DC":
581
- assert depth_path is not None and os.path.exists(depth_path), \
582
- "InfiniDepth_DC requires a valid input depth map for depth completion."
583
-
584
- _report_stage(stage_callback, "gpu:resolving_depth")
585
- gt_depth, prompt_depth, gt_depth_mask, prompt_mask, depth_source_label, moge2_pretrained, moge2_intrinsics = \
586
- _resolve_depth_inputs(depth_path=depth_path, input_size=input_size, image=image, device=device)
587
- if _debug:
588
- torch.cuda.synchronize()
589
- Log.info(f"[GPU-DEBUG] depth_resolved: GPU mem allocated={torch.cuda.memory_allocated(device) / 1e6:.1f}MB")
590
- _report_stage(stage_callback, f"gpu:depth_resolved source={depth_source_label}")
591
- Log.info(f"Depth source resolved: {depth_source_label}")
592
-
593
- gt = depth_to_disparity(gt_depth)
594
- prompt = depth_to_disparity(prompt_depth)
595
- prompt_mask = prompt > 0
596
-
597
- ckpt_path = resolve_checkpoint_path(model_type)
598
- _report_stage(stage_callback, "gpu:loading_model")
599
- model_cache = model_cache or ModelCache()
600
- model = model_cache.get(model_type=model_type, model_path=ckpt_path, device=device)
601
- if _debug:
602
- torch.cuda.synchronize()
603
- Log.info(f"[GPU-DEBUG] model_loaded: GPU mem allocated={torch.cuda.memory_allocated(device) / 1e6:.1f}MB")
604
- _report_stage(stage_callback, "gpu:model_loaded")
605
-
606
- query_2d_uniform_coord = make_2d_uniform_coord((h_out, w_out)).unsqueeze(0).to(device)
607
- _report_stage(stage_callback, "gpu:inference_started")
608
- pred_depth, _ = model.inference(
609
- image=image, query_coord=query_2d_uniform_coord,
610
- gt_depth=gt, gt_depth_mask=gt_depth_mask,
611
- prompt_depth=prompt, prompt_mask=prompt_mask,
612
- )
613
- if _debug:
614
- torch.cuda.synchronize()
615
- Log.info(f"[GPU-DEBUG] inference_finished: GPU mem allocated={torch.cuda.memory_allocated(device) / 1e6:.1f}MB")
616
- _report_stage(stage_callback, "gpu:inference_finished")
617
- Log.info(f"Model inference completed: output_size={h_out}x{w_out}")
618
-
619
- pred_depthmap = pred_depth.permute(0, 2, 1).reshape(1, 1, h_out, w_out)
620
-
621
- fx, fy, cx, cy, intrinsics_source_label = resolve_camera_intrinsics_for_inference(
622
- fx_org=fx_org, fy_org=fy_org, cx_org=cx_org, cy_org=cy_org,
623
- org_h=org_h, org_w=org_w, image=image,
624
- moge2_pretrained=moge2_pretrained, moge2_intrinsics=moge2_intrinsics,
625
- )
626
- Log.info(f"Camera intrinsics source: {intrinsics_source_label}")
627
- fx_out, fy_out, cx_out, cy_out, _ = build_scaled_intrinsics_matrix(
628
- fx_org=fx, fy_org=fy, cx_org=cx, cy_org=cy,
629
- org_h=org_h, org_w=org_w, h=h_in, w=w_in, device=device,
630
  )
631
-
632
- # Transfer all GPU tensors to CPU numpy before returning
633
- _report_stage(stage_callback, "gpu:transferring_to_cpu")
634
- result = GPUInferenceResult(
635
- pred_depthmap_np=pred_depthmap[0, 0].detach().cpu().numpy().astype(np.float32),
636
- query_coord_np=query_2d_uniform_coord.detach().cpu().numpy().astype(np.float32),
637
- pred_depth_np=pred_depth.detach().cpu().numpy().astype(np.float32),
638
- image_tensor_np=image.detach().cpu().numpy().astype(np.float32),
639
- depth_source_label=depth_source_label,
640
- intrinsics_source_label=intrinsics_source_label,
641
- h_out=h_out, w_out=w_out,
642
- org_h=org_h, org_w=org_w,
643
- fx_out=float(fx_out), fy_out=float(fy_out),
644
- cx_out=float(cx_out), cy_out=float(cy_out),
 
 
 
 
 
 
 
645
  )
646
- _report_stage(stage_callback, "gpu:complete")
647
- return result
648
 
649
 
650
  def postprocess_gpu_result(
 
184
  cy_out: float
185
 
186
 
187
+ @dataclass
188
+ class PreparedGPURequest:
189
+ """CPU-only request payload prepared before entering the ZeroGPU section."""
190
+ image_tensor_np: np.ndarray
191
+ query_coord_np: np.ndarray
192
+ gt_depth_np: Optional[np.ndarray]
193
+ prompt_depth_np: Optional[np.ndarray]
194
+ gt_depth_mask_np: Optional[np.ndarray]
195
+ prompt_mask_np: Optional[np.ndarray]
196
+ depth_source_label: Optional[str]
197
+ model_path: str
198
+ moge2_pretrained: str
199
+ h_out: int
200
+ w_out: int
201
+ org_h: int
202
+ org_w: int
203
+
204
+
205
  class ModelCache:
206
  def __init__(self):
207
  self._cache: dict[tuple[str, str], Any] = {}
 
283
  return _image_tensor_from_numpy(resized, device), org_h, org_w
284
 
285
 
286
+ def _prepare_image_tensor_numpy(
287
+ image_np: np.ndarray,
288
+ input_size: tuple[int, int],
289
+ ) -> tuple[np.ndarray, int, int]:
290
+ if image_np is None:
291
+ raise ValueError("Input image is required")
292
+ if image_np.ndim != 3 or image_np.shape[2] != 3:
293
+ raise ValueError("Input image must be an RGB image with shape [H, W, 3]")
294
+
295
+ org_h, org_w = int(image_np.shape[0]), int(image_np.shape[1])
296
+ resized = _resize_rgb_image(image_np, input_size)
297
+ image_tensor_np = np.ascontiguousarray(
298
+ resized.astype(np.float32).transpose(2, 0, 1)[None] / 255.0
299
+ )
300
+ return image_tensor_np, org_h, org_w
301
+
302
+
303
  def _resolve_depth_inputs(
304
  depth_path: Optional[str],
305
  input_size: tuple[int, int],
306
  image: torch.Tensor,
307
  device: torch.device,
308
+ moge2_pretrained: Optional[str] = None,
309
  ) -> tuple[
310
  torch.Tensor,
311
  torch.Tensor,
 
316
  Optional[tuple[float, float, float, float]],
317
  ]:
318
  input_depth_path = depth_path if depth_path else None
319
+ if moge2_pretrained is None:
320
+ moge2_pretrained = resolve_moge2_pretrained()
321
  gt_depth, prompt_depth, gt_depth_mask, used_input_depth, moge2_intrinsics = prepare_metric_depth_inputs(
322
  input_depth_path=input_depth_path,
323
  input_size=input_size,
 
410
  )
411
 
412
 
413
+ def prepare_gpu_request_inputs(
414
+ image_np: np.ndarray,
415
+ depth_path: Optional[str],
416
+ model_type: str,
417
+ input_size_text: str,
418
+ output_resolution_mode: str,
419
+ upsample_ratio: int,
420
+ model_cache: Optional[ModelCache] = None,
421
+ stage_callback: Optional[Callable[[str], None]] = None,
422
+ ) -> PreparedGPURequest:
423
+ """Prepare all CPU-only inputs before entering the ZeroGPU-decorated section."""
424
+ _report_stage(stage_callback, "cpu:prepare_started")
425
+
426
+ input_size = _parse_image_size(input_size_text)
427
+ if upsample_ratio < 1 or upsample_ratio > 8:
428
+ raise ValueError("upsample_ratio must be in [1, 8]")
429
+
430
+ image_tensor_np, org_h, org_w = _prepare_image_tensor_numpy(image_np, input_size)
431
+ _report_stage(stage_callback, "cpu:image_prepared")
432
+
433
+ h_in, w_in = input_size
434
+ h_out, w_out = resolve_output_size_from_mode(
435
+ output_resolution_mode=output_resolution_mode,
436
+ org_h=org_h,
437
+ org_w=org_w,
438
+ h=h_in,
439
+ w=w_in,
440
+ output_size=input_size,
441
+ upsample_ratio=upsample_ratio,
442
+ )
443
+ query_coord_np = np.ascontiguousarray(
444
+ make_2d_uniform_coord((h_out, w_out)).unsqueeze(0).cpu().numpy().astype(np.float32)
445
+ )
446
+ _report_stage(stage_callback, "cpu:query_coord_prepared")
447
+
448
+ if model_type == "InfiniDepth_DC":
449
+ assert depth_path is not None and os.path.exists(depth_path), \
450
+ "InfiniDepth_DC requires a valid input depth map for depth completion."
451
+
452
+ moge2_pretrained = resolve_moge2_pretrained()
453
+ _report_stage(stage_callback, "cpu:moge2_path_resolved")
454
+ gt_depth_np = None
455
+ prompt_depth_np = None
456
+ gt_depth_mask_np = None
457
+ prompt_mask_np = None
458
+ depth_source_label = None
459
+ if depth_path is not None and os.path.exists(depth_path):
460
+ image_cpu = torch.from_numpy(image_tensor_np).to(dtype=torch.float32)
461
+ gt_depth, prompt_depth, gt_depth_mask, used_input_depth, _ = prepare_metric_depth_inputs(
462
+ input_depth_path=depth_path,
463
+ input_size=input_size,
464
+ image=image_cpu,
465
+ device=torch.device("cpu"),
466
+ moge2_pretrained=moge2_pretrained,
467
+ depth_load_kwargs={"enable_noise_filter": False},
468
+ )
469
+ gt = depth_to_disparity(gt_depth)
470
+ prompt = depth_to_disparity(prompt_depth)
471
+ prompt_mask = prompt > 0
472
+ depth_source_label = "uploaded depth" if used_input_depth else "MoGe-2 prior"
473
+ gt_depth_np = np.ascontiguousarray(gt.cpu().numpy().astype(np.float32))
474
+ prompt_depth_np = np.ascontiguousarray(prompt.cpu().numpy().astype(np.float32))
475
+ gt_depth_mask_np = np.ascontiguousarray(gt_depth_mask.cpu().numpy().astype(np.float32))
476
+ prompt_mask_np = np.ascontiguousarray(prompt_mask.cpu().numpy())
477
+ _report_stage(stage_callback, f"cpu:uploaded_depth_prepared source={depth_source_label}")
478
+
479
+ model_path = resolve_checkpoint_path(model_type)
480
+ _report_stage(stage_callback, "cpu:model_path_resolved")
481
+ (model_cache or ModelCache()).preload(model_type=model_type, model_path=model_path)
482
+ _report_stage(stage_callback, "cpu:model_cached")
483
+ _report_stage(stage_callback, "cpu:prepare_completed")
484
+
485
+ return PreparedGPURequest(
486
+ image_tensor_np=image_tensor_np,
487
+ query_coord_np=query_coord_np,
488
+ gt_depth_np=gt_depth_np,
489
+ prompt_depth_np=prompt_depth_np,
490
+ gt_depth_mask_np=gt_depth_mask_np,
491
+ prompt_mask_np=prompt_mask_np,
492
+ depth_source_label=depth_source_label,
493
+ model_path=model_path,
494
+ moge2_pretrained=moge2_pretrained,
495
+ h_out=h_out,
496
+ w_out=w_out,
497
+ org_h=org_h,
498
+ org_w=org_w,
499
+ )
500
+
501
+
502
+ def run_prepared_gpu_inference(
503
+ image_tensor_np: np.ndarray,
504
+ query_coord_np: np.ndarray,
505
+ model_type: str,
506
+ model_path: str,
507
+ moge2_pretrained: str,
508
+ h_out: int,
509
+ w_out: int,
510
+ org_h: int,
511
+ org_w: int,
512
+ depth_source_label: Optional[str] = None,
513
+ gt_depth_np: Optional[np.ndarray] = None,
514
+ prompt_depth_np: Optional[np.ndarray] = None,
515
+ gt_depth_mask_np: Optional[np.ndarray] = None,
516
+ prompt_mask_np: Optional[np.ndarray] = None,
517
+ fx_org: Optional[float] = None,
518
+ fy_org: Optional[float] = None,
519
+ cx_org: Optional[float] = None,
520
+ cy_org: Optional[float] = None,
521
+ model_cache: Optional[ModelCache] = None,
522
+ stage_callback: Optional[Callable[[str], None]] = None,
523
+ ) -> GPUInferenceResult:
524
+ """Run CUDA-bound MoGe/model inference and return CPU outputs."""
525
+ image_shape = tuple(int(d) for d in image_tensor_np.shape)
526
+ _report_stage(stage_callback, "gpu:start")
527
+ Log.info(
528
+ f"run_prepared_gpu_inference: model_type={model_type}, image_tensor_shape={image_shape}, "
529
+ f"output_size={h_out}x{w_out}, has_prepared_depth={gt_depth_np is not None}, "
530
+ f"cuda_available={torch.cuda.is_available()}"
531
+ )
532
+ if not torch.cuda.is_available():
533
+ raise RuntimeError(
534
+ "No CUDA GPU is available. If using Hugging Face ZeroGPU, "
535
+ "decorate the Gradio inference function with @spaces.GPU and enable queue()."
536
+ )
537
+
538
+ device = torch.device("cuda")
539
+ _debug = os.getenv("INFINIDEPTH_DEBUG_GPU", "0") == "1"
540
+
541
+ image = torch.from_numpy(image_tensor_np).to(device=device, dtype=torch.float32)
542
+ _report_stage(stage_callback, "gpu:image_to_device")
543
+ if _debug:
544
+ torch.cuda.synchronize()
545
+ Log.info(f"[GPU-DEBUG] image_to_device: GPU mem allocated={torch.cuda.memory_allocated(device) / 1e6:.1f}MB")
546
+
547
+ h_in, w_in = int(image.shape[-2]), int(image.shape[-1])
548
+
549
+ if gt_depth_np is not None:
550
+ _report_stage(stage_callback, "gpu:using_prepared_depth_inputs")
551
+ gt_depth = torch.from_numpy(gt_depth_np).to(device=device, dtype=torch.float32)
552
+ prompt_depth = torch.from_numpy(prompt_depth_np).to(device=device, dtype=torch.float32)
553
+ gt_depth_mask = torch.from_numpy(gt_depth_mask_np).to(device=device, dtype=torch.float32)
554
+ prompt_mask = torch.from_numpy(prompt_mask_np).to(device=device)
555
+ moge2_intrinsics = None
556
+ resolved_depth_source_label = depth_source_label or "uploaded depth"
557
+ else:
558
+ _report_stage(stage_callback, "gpu:resolving_depth")
559
+ gt_depth, prompt_depth, gt_depth_mask, prompt_mask, resolved_depth_source_label, _, moge2_intrinsics = \
560
+ _resolve_depth_inputs(
561
+ depth_path=None,
562
+ input_size=(h_in, w_in),
563
+ image=image,
564
+ device=device,
565
+ moge2_pretrained=moge2_pretrained,
566
+ )
567
+ if _debug:
568
+ torch.cuda.synchronize()
569
+ Log.info(f"[GPU-DEBUG] depth_resolved: GPU mem allocated={torch.cuda.memory_allocated(device) / 1e6:.1f}MB")
570
+ _report_stage(stage_callback, f"gpu:depth_resolved source={resolved_depth_source_label}")
571
+
572
+ Log.info(f"Depth source resolved: {resolved_depth_source_label}")
573
+
574
+ _report_stage(stage_callback, "gpu:loading_model")
575
+ model_cache = model_cache or ModelCache()
576
+ model = model_cache.get(model_type=model_type, model_path=model_path, device=device)
577
+ if _debug:
578
+ torch.cuda.synchronize()
579
+ Log.info(f"[GPU-DEBUG] model_loaded: GPU mem allocated={torch.cuda.memory_allocated(device) / 1e6:.1f}MB")
580
+ _report_stage(stage_callback, "gpu:model_loaded")
581
+
582
+ query_2d_uniform_coord = torch.from_numpy(query_coord_np).to(device=device, dtype=torch.float32)
583
+ _report_stage(stage_callback, "gpu:query_coord_to_device")
584
+ _report_stage(stage_callback, "gpu:inference_started")
585
+ pred_depth, _ = model.inference(
586
+ image=image, query_coord=query_2d_uniform_coord,
587
+ gt_depth=gt_depth, gt_depth_mask=gt_depth_mask,
588
+ prompt_depth=prompt_depth, prompt_mask=prompt_mask,
589
+ )
590
+ if _debug:
591
+ torch.cuda.synchronize()
592
+ Log.info(f"[GPU-DEBUG] inference_finished: GPU mem allocated={torch.cuda.memory_allocated(device) / 1e6:.1f}MB")
593
+ _report_stage(stage_callback, "gpu:inference_finished")
594
+
595
+ pred_depthmap = pred_depth.permute(0, 2, 1).reshape(1, 1, h_out, w_out)
596
+
597
+ fx, fy, cx, cy, intrinsics_source_label = resolve_camera_intrinsics_for_inference(
598
+ fx_org=fx_org,
599
+ fy_org=fy_org,
600
+ cx_org=cx_org,
601
+ cy_org=cy_org,
602
+ org_h=org_h,
603
+ org_w=org_w,
604
+ image=image,
605
+ moge2_pretrained=moge2_pretrained,
606
+ moge2_intrinsics=moge2_intrinsics,
607
+ )
608
+ Log.info(f"Camera intrinsics source: {intrinsics_source_label}")
609
+ fx_out, fy_out, cx_out, cy_out, _ = build_scaled_intrinsics_matrix(
610
+ fx_org=fx,
611
+ fy_org=fy,
612
+ cx_org=cx,
613
+ cy_org=cy,
614
+ org_h=org_h,
615
+ org_w=org_w,
616
+ h=h_in,
617
+ w=w_in,
618
+ device=device,
619
+ )
620
+
621
+ _report_stage(stage_callback, "gpu:transferring_to_cpu")
622
+ _report_stage(stage_callback, "gpu:complete")
623
+ return GPUInferenceResult(
624
+ pred_depthmap_np=pred_depthmap[0, 0].detach().cpu().numpy().astype(np.float32),
625
+ query_coord_np=query_2d_uniform_coord.detach().cpu().numpy().astype(np.float32),
626
+ pred_depth_np=pred_depth.detach().cpu().numpy().astype(np.float32),
627
+ image_tensor_np=image.detach().cpu().numpy().astype(np.float32),
628
+ depth_source_label=resolved_depth_source_label,
629
+ intrinsics_source_label=intrinsics_source_label,
630
+ h_out=h_out,
631
+ w_out=w_out,
632
+ org_h=org_h,
633
+ org_w=org_w,
634
+ fx_out=float(fx_out),
635
+ fy_out=float(fy_out),
636
+ cx_out=float(cx_out),
637
+ cy_out=float(cy_out),
638
+ )
639
+
640
+
641
  def run_single_image_demo(
642
  image_np: np.ndarray,
643
  depth_path: Optional[str],
 
800
  model_cache: Optional[ModelCache] = None,
801
  stage_callback: Optional[Callable[[str], None]] = None,
802
  ) -> GPUInferenceResult:
803
+ """Run GPU inference with CPU preprocessing performed ahead of time."""
804
+ prepared = prepare_gpu_request_inputs(
805
+ image_np=image_np,
806
+ depth_path=depth_path,
807
+ model_type=model_type,
808
+ input_size_text=input_size_text,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
809
  output_resolution_mode=output_resolution_mode,
810
+ upsample_ratio=upsample_ratio,
811
+ model_cache=model_cache,
812
+ stage_callback=stage_callback,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
813
  )
814
+ return run_prepared_gpu_inference(
815
+ image_tensor_np=prepared.image_tensor_np,
816
+ query_coord_np=prepared.query_coord_np,
817
+ model_type=model_type,
818
+ model_path=prepared.model_path,
819
+ moge2_pretrained=prepared.moge2_pretrained,
820
+ h_out=prepared.h_out,
821
+ w_out=prepared.w_out,
822
+ org_h=prepared.org_h,
823
+ org_w=prepared.org_w,
824
+ depth_source_label=prepared.depth_source_label,
825
+ gt_depth_np=prepared.gt_depth_np,
826
+ prompt_depth_np=prepared.prompt_depth_np,
827
+ gt_depth_mask_np=prepared.gt_depth_mask_np,
828
+ prompt_mask_np=prepared.prompt_mask_np,
829
+ fx_org=fx_org,
830
+ fy_org=fy_org,
831
+ cx_org=cx_org,
832
+ cy_org=cy_org,
833
+ model_cache=model_cache,
834
+ stage_callback=stage_callback,
835
  )
 
 
836
 
837
 
838
  def postprocess_gpu_result(
app.py CHANGED
@@ -27,9 +27,10 @@ from PIL import Image
27
  from InfiniDepth.utils.hf_demo_utils import (
28
  ModelCache,
29
  postprocess_gpu_result,
 
30
  prepare_runtime_assets,
31
  preload_space_runtime_models,
32
- run_gpu_inference,
33
  )
34
  from InfiniDepth.utils.logger import Log
35
 
@@ -47,6 +48,33 @@ TRACE_ROOT = OUTPUT_ROOT / "trace"
47
  EXAMPLE_DATA_ROOT = Path(__file__).resolve().parent / "example_data"
48
  EXAMPLE_IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp"}
49
  EXAMPLE_DEPTH_EXTENSIONS = {".png", ".npy", ".npz", ".h5", ".hdf5", ".exr"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
  CUSTOM_CSS = """
52
  #main-layout {
@@ -245,38 +273,45 @@ def _export_glb_from_points(xyz: np.ndarray, rgb: np.ndarray, output_path: Path)
245
  cloud.export(output_path.as_posix())
246
 
247
 
248
- @spaces.GPU(duration=120)
249
  def run_demo_gpu(
250
- image: np.ndarray,
251
- depth_file,
252
  model_type: str,
253
- input_size: str,
254
- output_resolution_mode: str,
255
- upsample_ratio: int,
 
 
 
 
 
 
 
 
256
  fx_org: Optional[float],
257
  fy_org: Optional[float],
258
  cx_org: Optional[float],
259
  cy_org: Optional[float],
260
  trace_path: str,
261
  ):
262
- """GPU-only inference. Returns a GPUInferenceResult with all data on CPU."""
263
- import torch
264
  _append_trace(trace_path, "worker:entered run_demo_gpu")
265
-
266
- if image is None:
267
- raise ValueError("Input RGB image is required")
268
-
269
- depth_path = None
270
- if depth_file is not None:
271
- depth_path = depth_file if isinstance(depth_file, str) else depth_file.name
272
-
273
- return run_gpu_inference(
274
- image_np=image,
275
- depth_path=depth_path,
276
  model_type=model_type,
277
- input_size_text=input_size,
278
- output_resolution_mode=output_resolution_mode,
279
- upsample_ratio=int(upsample_ratio),
 
 
 
 
 
 
 
 
280
  fx_org=_none_if_invalid(fx_org),
281
  fy_org=_none_if_invalid(fy_org),
282
  cx_org=_none_if_invalid(cx_org),
@@ -310,20 +345,42 @@ def run_demo(
310
  f"[{request_id}] run_demo start: model_type={model_type}, "
311
  f"input_size={input_size}, output_resolution_mode={output_resolution_mode}, "
312
  f"upsample_ratio={upsample_ratio}, max_points_preview={max_points_preview}, "
313
- f"depth_path={depth_path}, image_shape={image_shape}"
 
314
  )
315
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
316
  # --- GPU-only inference (consumes ZeroGPU quota) ---
317
  # ZeroGPU proxy tokens are bound to the current Gradio request.
318
  # Retrying a @spaces.GPU call inside the same request can turn a transient
319
  # "GPU task aborted" into a deterministic "Expired ZeroGPU proxy token".
320
  gpu_result = run_demo_gpu(
321
- image=image,
322
- depth_file=depth_file,
323
  model_type=model_type,
324
- input_size=input_size,
325
- output_resolution_mode=output_resolution_mode,
326
- upsample_ratio=upsample_ratio,
 
 
 
 
 
 
 
 
327
  fx_org=fx_org,
328
  fy_org=fy_org,
329
  cx_org=cx_org,
@@ -380,6 +437,10 @@ def run_demo(
380
  exc_type = type(exc).__name__
381
  exc_module = type(exc).__module__ or ""
382
  is_zerogpu_error = "spaces" in exc_module or "ZeroGPU" in str(exc) or "GPU task aborted" in str(exc)
 
 
 
 
383
  if is_zerogpu_error:
384
  error_message = (
385
  f"[{request_id}] ZeroGPU error: {exc}\n\n"
@@ -389,6 +450,15 @@ def run_demo(
389
  " - GPU task was preempted/aborted (click the button again)\n"
390
  " - duration too high for remaining quota"
391
  )
 
 
 
 
 
 
 
 
 
392
  else:
393
  error_message = f"Error [{request_id}] ({exc_type}): {exc}"
394
 
@@ -465,7 +535,8 @@ with gr.Blocks(title="InfiniDepth Demo", theme=gr.themes.Soft(), css=CUSTOM_CSS,
465
  "Tips: when a depth map is uploaded it will be used automatically, otherwise the demo falls back to MoGe-2. "
466
  "If camera intrinsics are missing, the demo first tries MoGe-2 estimates before image-size defaults. "
467
  "Use lower preview points for faster 3D interaction. "
468
- "On ZeroGPU, `512x672` is the safest default for cold starts."
 
469
  )
470
 
471
  with gr.Column(elem_id="right-panel"):
 
27
  from InfiniDepth.utils.hf_demo_utils import (
28
  ModelCache,
29
  postprocess_gpu_result,
30
+ prepare_gpu_request_inputs,
31
  prepare_runtime_assets,
32
  preload_space_runtime_models,
33
+ run_prepared_gpu_inference,
34
  )
35
  from InfiniDepth.utils.logger import Log
36
 
 
48
  EXAMPLE_DATA_ROOT = Path(__file__).resolve().parent / "example_data"
49
  EXAMPLE_IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp"}
50
  EXAMPLE_DEPTH_EXTENSIONS = {".png", ".npy", ".npz", ".h5", ".hdf5", ".exr"}
51
+ MAX_ZEROGPU_DURATION_SECONDS = 300
52
+
53
+
54
+ def _resolve_zerogpu_duration_seconds() -> int:
55
+ raw_value = os.getenv("INFINIDEPTH_ZEROGPU_DURATION", str(MAX_ZEROGPU_DURATION_SECONDS))
56
+ try:
57
+ duration = int(raw_value)
58
+ except ValueError:
59
+ Log.warning(
60
+ f"Invalid INFINIDEPTH_ZEROGPU_DURATION={raw_value!r}; falling back to {MAX_ZEROGPU_DURATION_SECONDS} seconds."
61
+ )
62
+ return MAX_ZEROGPU_DURATION_SECONDS
63
+ if duration < 1:
64
+ Log.warning(
65
+ f"Non-positive INFINIDEPTH_ZEROGPU_DURATION={duration}; falling back to {MAX_ZEROGPU_DURATION_SECONDS} seconds."
66
+ )
67
+ return MAX_ZEROGPU_DURATION_SECONDS
68
+ if duration > MAX_ZEROGPU_DURATION_SECONDS:
69
+ Log.warning(
70
+ f"INFINIDEPTH_ZEROGPU_DURATION={duration} exceeds the supported ZeroGPU ceiling; "
71
+ f"clamping to {MAX_ZEROGPU_DURATION_SECONDS} seconds."
72
+ )
73
+ return MAX_ZEROGPU_DURATION_SECONDS
74
+ return duration
75
+
76
+
77
+ ZEROGPU_DURATION_SECONDS = _resolve_zerogpu_duration_seconds()
78
 
79
  CUSTOM_CSS = """
80
  #main-layout {
 
273
  cloud.export(output_path.as_posix())
274
 
275
 
276
+ @spaces.GPU(duration=ZEROGPU_DURATION_SECONDS)
277
  def run_demo_gpu(
278
+ image_tensor_np: np.ndarray,
279
+ query_coord_np: np.ndarray,
280
  model_type: str,
281
+ model_path: str,
282
+ moge2_pretrained: str,
283
+ h_out: int,
284
+ w_out: int,
285
+ org_h: int,
286
+ org_w: int,
287
+ prepared_depth_source_label: Optional[str],
288
+ gt_depth_np: Optional[np.ndarray],
289
+ prompt_depth_np: Optional[np.ndarray],
290
+ gt_depth_mask_np: Optional[np.ndarray],
291
+ prompt_mask_np: Optional[np.ndarray],
292
  fx_org: Optional[float],
293
  fy_org: Optional[float],
294
  cx_org: Optional[float],
295
  cy_org: Optional[float],
296
  trace_path: str,
297
  ):
298
+ """ZeroGPU section: run MoGe/model inference on GPU and return CPU outputs."""
 
299
  _append_trace(trace_path, "worker:entered run_demo_gpu")
300
+ return run_prepared_gpu_inference(
301
+ image_tensor_np=image_tensor_np,
302
+ query_coord_np=query_coord_np,
 
 
 
 
 
 
 
 
303
  model_type=model_type,
304
+ model_path=model_path,
305
+ moge2_pretrained=moge2_pretrained,
306
+ h_out=h_out,
307
+ w_out=w_out,
308
+ org_h=org_h,
309
+ org_w=org_w,
310
+ depth_source_label=prepared_depth_source_label,
311
+ gt_depth_np=gt_depth_np,
312
+ prompt_depth_np=prompt_depth_np,
313
+ gt_depth_mask_np=gt_depth_mask_np,
314
+ prompt_mask_np=prompt_mask_np,
315
  fx_org=_none_if_invalid(fx_org),
316
  fy_org=_none_if_invalid(fy_org),
317
  cx_org=_none_if_invalid(cx_org),
 
345
  f"[{request_id}] run_demo start: model_type={model_type}, "
346
  f"input_size={input_size}, output_resolution_mode={output_resolution_mode}, "
347
  f"upsample_ratio={upsample_ratio}, max_points_preview={max_points_preview}, "
348
+ f"depth_path={depth_path}, image_shape={image_shape}, "
349
+ f"zerogpu_duration={ZEROGPU_DURATION_SECONDS}s"
350
  )
351
  try:
352
+ _append_trace(trace_path, "ui:preparing_cpu_inputs")
353
+ prepared_gpu_request = prepare_gpu_request_inputs(
354
+ image_np=image,
355
+ depth_path=depth_path,
356
+ model_type=model_type,
357
+ input_size_text=input_size,
358
+ output_resolution_mode=output_resolution_mode,
359
+ upsample_ratio=int(upsample_ratio),
360
+ model_cache=MODEL_CACHE,
361
+ stage_callback=lambda stage: _append_trace(trace_path, stage),
362
+ )
363
+ _append_trace(trace_path, "ui:cpu_inputs_ready, entering gpu")
364
+
365
  # --- GPU-only inference (consumes ZeroGPU quota) ---
366
  # ZeroGPU proxy tokens are bound to the current Gradio request.
367
  # Retrying a @spaces.GPU call inside the same request can turn a transient
368
  # "GPU task aborted" into a deterministic "Expired ZeroGPU proxy token".
369
  gpu_result = run_demo_gpu(
370
+ image_tensor_np=prepared_gpu_request.image_tensor_np,
371
+ query_coord_np=prepared_gpu_request.query_coord_np,
372
  model_type=model_type,
373
+ model_path=prepared_gpu_request.model_path,
374
+ moge2_pretrained=prepared_gpu_request.moge2_pretrained,
375
+ h_out=prepared_gpu_request.h_out,
376
+ w_out=prepared_gpu_request.w_out,
377
+ org_h=prepared_gpu_request.org_h,
378
+ org_w=prepared_gpu_request.org_w,
379
+ prepared_depth_source_label=prepared_gpu_request.depth_source_label,
380
+ gt_depth_np=prepared_gpu_request.gt_depth_np,
381
+ prompt_depth_np=prepared_gpu_request.prompt_depth_np,
382
+ gt_depth_mask_np=prepared_gpu_request.gt_depth_mask_np,
383
+ prompt_mask_np=prepared_gpu_request.prompt_mask_np,
384
  fx_org=fx_org,
385
  fy_org=fy_org,
386
  cx_org=cx_org,
 
437
  exc_type = type(exc).__name__
438
  exc_module = type(exc).__module__ or ""
439
  is_zerogpu_error = "spaces" in exc_module or "ZeroGPU" in str(exc) or "GPU task aborted" in str(exc)
440
+ likely_gpu_timeout = (
441
+ "GPU task aborted" in str(exc)
442
+ and "gpu:complete" not in trace_content
443
+ )
444
  if is_zerogpu_error:
445
  error_message = (
446
  f"[{request_id}] ZeroGPU error: {exc}\n\n"
 
450
  " - GPU task was preempted/aborted (click the button again)\n"
451
  " - duration too high for remaining quota"
452
  )
453
+ if likely_gpu_timeout:
454
+ error_message = (
455
+ f"{error_message}\n"
456
+ f" - configured GPU runtime budget too short (current `@spaces.GPU(duration={ZEROGPU_DURATION_SECONDS})`)\n\n"
457
+ f"Current ZeroGPU duration: {ZEROGPU_DURATION_SECONDS}s.\n"
458
+ "This request likely exceeded the configured GPU runtime budget.\n"
459
+ "Try `512x672`, keep `upsample_ratio=1`, avoid `original` output for large images, "
460
+ f"or move to a dedicated GPU Space if `{MAX_ZEROGPU_DURATION_SECONDS}s` is still not enough."
461
+ )
462
  else:
463
  error_message = f"Error [{request_id}] ({exc_type}): {exc}"
464
 
 
535
  "Tips: when a depth map is uploaded it will be used automatically, otherwise the demo falls back to MoGe-2. "
536
  "If camera intrinsics are missing, the demo first tries MoGe-2 estimates before image-size defaults. "
537
  "Use lower preview points for faster 3D interaction. "
538
+ f"On ZeroGPU, `512x672` with `upsample_ratio=1` is the safest default for cold starts. "
539
+ f"The current GPU runtime budget is `{ZEROGPU_DURATION_SECONDS}s`."
540
  )
541
 
542
  with gr.Column(elem_id="right-panel"):