ritianyu committed on
Commit
6afe9df
·
1 Parent(s): bc84e54
Files changed (2) hide show
  1. InfiniDepth/utils/hf_demo_utils.py +296 -108
  2. app.py +101 -30
InfiniDepth/utils/hf_demo_utils.py CHANGED
@@ -184,6 +184,24 @@ class GPUInferenceResult:
184
  cy_out: float
185
 
186
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  class ModelCache:
188
  def __init__(self):
189
  self._cache: dict[tuple[str, str], Any] = {}
@@ -265,11 +283,29 @@ def _prepare_image_tensor(
265
  return _image_tensor_from_numpy(resized, device), org_h, org_w
266
 
267
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  def _resolve_depth_inputs(
269
  depth_path: Optional[str],
270
  input_size: tuple[int, int],
271
  image: torch.Tensor,
272
  device: torch.device,
 
273
  ) -> tuple[
274
  torch.Tensor,
275
  torch.Tensor,
@@ -280,7 +316,8 @@ def _resolve_depth_inputs(
280
  Optional[tuple[float, float, float, float]],
281
  ]:
282
  input_depth_path = depth_path if depth_path else None
283
- moge2_pretrained = resolve_moge2_pretrained()
 
284
  gt_depth, prompt_depth, gt_depth_mask, used_input_depth, moge2_intrinsics = prepare_metric_depth_inputs(
285
  input_depth_path=input_depth_path,
286
  input_size=input_size,
@@ -373,6 +410,234 @@ def resolve_checkpoint_path(model_type: str) -> str:
373
  )
374
 
375
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376
  def run_single_image_demo(
377
  image_np: np.ndarray,
378
  depth_path: Optional[str],
@@ -535,116 +800,39 @@ def run_gpu_inference(
535
  model_cache: Optional[ModelCache] = None,
536
  stage_callback: Optional[Callable[[str], None]] = None,
537
  ) -> GPUInferenceResult:
538
- """Run only GPU-intensive inference. All outputs are moved to CPU numpy before return.
539
-
540
- This is designed for HuggingFace ZeroGPU where GPU time is limited: only the actual
541
- CUDA work (MoGe-2, model inference, intrinsics estimation) runs here. CPU-heavy
542
- post-processing (colorization, point cloud, file I/O) should happen outside the
543
- ``@spaces.GPU`` decorated caller.
544
- """
545
- image_shape = tuple(int(d) for d in image_np.shape) if image_np is not None else None
546
- _report_stage(stage_callback, "gpu:start")
547
- Log.info(
548
- f"run_gpu_inference: model_type={model_type}, input_size={input_size_text}, "
549
- f"output_resolution_mode={output_resolution_mode}, upsample_ratio={upsample_ratio}, "
550
- f"has_depth={bool(depth_path)}, image_shape={image_shape}, "
551
- f"cuda_available={torch.cuda.is_available()}"
552
- )
553
- if not torch.cuda.is_available():
554
- raise RuntimeError(
555
- "No CUDA GPU is available. If using Hugging Face ZeroGPU, "
556
- "decorate the Gradio inference function with @spaces.GPU and enable queue()."
557
- )
558
-
559
- input_size = _parse_image_size(input_size_text)
560
- if upsample_ratio < 1 or upsample_ratio > 8:
561
- raise ValueError("upsample_ratio must be in [1, 8]")
562
- output_size = input_size
563
- device = torch.device("cuda")
564
-
565
- _debug = os.getenv("INFINIDEPTH_DEBUG_GPU", "0") == "1"
566
-
567
- image, org_h, org_w = _prepare_image_tensor(image_np, input_size, device)
568
- _report_stage(stage_callback, "gpu:image_prepared")
569
- if _debug:
570
- torch.cuda.synchronize()
571
- Log.info(f"[GPU-DEBUG] image_prepared: GPU mem allocated={torch.cuda.memory_allocated(device) / 1e6:.1f}MB")
572
-
573
- h_in, w_in = input_size
574
- h_out, w_out = resolve_output_size_from_mode(
575
  output_resolution_mode=output_resolution_mode,
576
- org_h=org_h, org_w=org_w, h=h_in, w=w_in,
577
- output_size=output_size, upsample_ratio=upsample_ratio,
578
- )
579
-
580
- if model_type == "InfiniDepth_DC":
581
- assert depth_path is not None and os.path.exists(depth_path), \
582
- "InfiniDepth_DC requires a valid input depth map for depth completion."
583
-
584
- _report_stage(stage_callback, "gpu:resolving_depth")
585
- gt_depth, prompt_depth, gt_depth_mask, prompt_mask, depth_source_label, moge2_pretrained, moge2_intrinsics = \
586
- _resolve_depth_inputs(depth_path=depth_path, input_size=input_size, image=image, device=device)
587
- if _debug:
588
- torch.cuda.synchronize()
589
- Log.info(f"[GPU-DEBUG] depth_resolved: GPU mem allocated={torch.cuda.memory_allocated(device) / 1e6:.1f}MB")
590
- _report_stage(stage_callback, f"gpu:depth_resolved source={depth_source_label}")
591
- Log.info(f"Depth source resolved: {depth_source_label}")
592
-
593
- gt = depth_to_disparity(gt_depth)
594
- prompt = depth_to_disparity(prompt_depth)
595
- prompt_mask = prompt > 0
596
-
597
- ckpt_path = resolve_checkpoint_path(model_type)
598
- _report_stage(stage_callback, "gpu:loading_model")
599
- model_cache = model_cache or ModelCache()
600
- model = model_cache.get(model_type=model_type, model_path=ckpt_path, device=device)
601
- if _debug:
602
- torch.cuda.synchronize()
603
- Log.info(f"[GPU-DEBUG] model_loaded: GPU mem allocated={torch.cuda.memory_allocated(device) / 1e6:.1f}MB")
604
- _report_stage(stage_callback, "gpu:model_loaded")
605
-
606
- query_2d_uniform_coord = make_2d_uniform_coord((h_out, w_out)).unsqueeze(0).to(device)
607
- _report_stage(stage_callback, "gpu:inference_started")
608
- pred_depth, _ = model.inference(
609
- image=image, query_coord=query_2d_uniform_coord,
610
- gt_depth=gt, gt_depth_mask=gt_depth_mask,
611
- prompt_depth=prompt, prompt_mask=prompt_mask,
612
- )
613
- if _debug:
614
- torch.cuda.synchronize()
615
- Log.info(f"[GPU-DEBUG] inference_finished: GPU mem allocated={torch.cuda.memory_allocated(device) / 1e6:.1f}MB")
616
- _report_stage(stage_callback, "gpu:inference_finished")
617
- Log.info(f"Model inference completed: output_size={h_out}x{w_out}")
618
-
619
- pred_depthmap = pred_depth.permute(0, 2, 1).reshape(1, 1, h_out, w_out)
620
-
621
- fx, fy, cx, cy, intrinsics_source_label = resolve_camera_intrinsics_for_inference(
622
- fx_org=fx_org, fy_org=fy_org, cx_org=cx_org, cy_org=cy_org,
623
- org_h=org_h, org_w=org_w, image=image,
624
- moge2_pretrained=moge2_pretrained, moge2_intrinsics=moge2_intrinsics,
625
- )
626
- Log.info(f"Camera intrinsics source: {intrinsics_source_label}")
627
- fx_out, fy_out, cx_out, cy_out, _ = build_scaled_intrinsics_matrix(
628
- fx_org=fx, fy_org=fy, cx_org=cx, cy_org=cy,
629
- org_h=org_h, org_w=org_w, h=h_in, w=w_in, device=device,
630
  )
631
-
632
- # Transfer all GPU tensors to CPU numpy before returning
633
- _report_stage(stage_callback, "gpu:transferring_to_cpu")
634
- result = GPUInferenceResult(
635
- pred_depthmap_np=pred_depthmap[0, 0].detach().cpu().numpy().astype(np.float32),
636
- query_coord_np=query_2d_uniform_coord.detach().cpu().numpy().astype(np.float32),
637
- pred_depth_np=pred_depth.detach().cpu().numpy().astype(np.float32),
638
- image_tensor_np=image.detach().cpu().numpy().astype(np.float32),
639
- depth_source_label=depth_source_label,
640
- intrinsics_source_label=intrinsics_source_label,
641
- h_out=h_out, w_out=w_out,
642
- org_h=org_h, org_w=org_w,
643
- fx_out=float(fx_out), fy_out=float(fy_out),
644
- cx_out=float(cx_out), cy_out=float(cy_out),
 
 
 
 
 
 
 
645
  )
646
- _report_stage(stage_callback, "gpu:complete")
647
- return result
648
 
649
 
650
  def postprocess_gpu_result(
 
184
  cy_out: float
185
 
186
 
187
+ @dataclass
188
+ class PreparedGPURequest:
189
+ """CPU-only request payload prepared before entering the ZeroGPU section."""
190
+ image_tensor_np: np.ndarray
191
+ query_coord_np: np.ndarray
192
+ gt_depth_np: Optional[np.ndarray]
193
+ prompt_depth_np: Optional[np.ndarray]
194
+ gt_depth_mask_np: Optional[np.ndarray]
195
+ prompt_mask_np: Optional[np.ndarray]
196
+ depth_source_label: Optional[str]
197
+ model_path: str
198
+ moge2_pretrained: str
199
+ h_out: int
200
+ w_out: int
201
+ org_h: int
202
+ org_w: int
203
+
204
+
205
  class ModelCache:
206
  def __init__(self):
207
  self._cache: dict[tuple[str, str], Any] = {}
 
283
  return _image_tensor_from_numpy(resized, device), org_h, org_w
284
 
285
 
286
+ def _prepare_image_tensor_numpy(
287
+ image_np: np.ndarray,
288
+ input_size: tuple[int, int],
289
+ ) -> tuple[np.ndarray, int, int]:
290
+ if image_np is None:
291
+ raise ValueError("Input image is required")
292
+ if image_np.ndim != 3 or image_np.shape[2] != 3:
293
+ raise ValueError("Input image must be an RGB image with shape [H, W, 3]")
294
+
295
+ org_h, org_w = int(image_np.shape[0]), int(image_np.shape[1])
296
+ resized = _resize_rgb_image(image_np, input_size)
297
+ image_tensor_np = np.ascontiguousarray(
298
+ resized.astype(np.float32).transpose(2, 0, 1)[None] / 255.0
299
+ )
300
+ return image_tensor_np, org_h, org_w
301
+
302
+
303
  def _resolve_depth_inputs(
304
  depth_path: Optional[str],
305
  input_size: tuple[int, int],
306
  image: torch.Tensor,
307
  device: torch.device,
308
+ moge2_pretrained: Optional[str] = None,
309
  ) -> tuple[
310
  torch.Tensor,
311
  torch.Tensor,
 
316
  Optional[tuple[float, float, float, float]],
317
  ]:
318
  input_depth_path = depth_path if depth_path else None
319
+ if moge2_pretrained is None:
320
+ moge2_pretrained = resolve_moge2_pretrained()
321
  gt_depth, prompt_depth, gt_depth_mask, used_input_depth, moge2_intrinsics = prepare_metric_depth_inputs(
322
  input_depth_path=input_depth_path,
323
  input_size=input_size,
 
410
  )
411
 
412
 
413
+ def prepare_gpu_request_inputs(
414
+ image_np: np.ndarray,
415
+ depth_path: Optional[str],
416
+ model_type: str,
417
+ input_size_text: str,
418
+ output_resolution_mode: str,
419
+ upsample_ratio: int,
420
+ model_cache: Optional[ModelCache] = None,
421
+ stage_callback: Optional[Callable[[str], None]] = None,
422
+ ) -> PreparedGPURequest:
423
+ """Prepare all CPU-only inputs before entering the ZeroGPU-decorated section."""
424
+ _report_stage(stage_callback, "cpu:prepare_started")
425
+
426
+ input_size = _parse_image_size(input_size_text)
427
+ if upsample_ratio < 1 or upsample_ratio > 8:
428
+ raise ValueError("upsample_ratio must be in [1, 8]")
429
+
430
+ image_tensor_np, org_h, org_w = _prepare_image_tensor_numpy(image_np, input_size)
431
+ _report_stage(stage_callback, "cpu:image_prepared")
432
+
433
+ h_in, w_in = input_size
434
+ h_out, w_out = resolve_output_size_from_mode(
435
+ output_resolution_mode=output_resolution_mode,
436
+ org_h=org_h,
437
+ org_w=org_w,
438
+ h=h_in,
439
+ w=w_in,
440
+ output_size=input_size,
441
+ upsample_ratio=upsample_ratio,
442
+ )
443
+ query_coord_np = np.ascontiguousarray(
444
+ make_2d_uniform_coord((h_out, w_out)).unsqueeze(0).cpu().numpy().astype(np.float32)
445
+ )
446
+ _report_stage(stage_callback, "cpu:query_coord_prepared")
447
+
448
+ if model_type == "InfiniDepth_DC":
449
+ assert depth_path is not None and os.path.exists(depth_path), \
450
+ "InfiniDepth_DC requires a valid input depth map for depth completion."
451
+
452
+ moge2_pretrained = resolve_moge2_pretrained()
453
+ _report_stage(stage_callback, "cpu:moge2_path_resolved")
454
+ gt_depth_np = None
455
+ prompt_depth_np = None
456
+ gt_depth_mask_np = None
457
+ prompt_mask_np = None
458
+ depth_source_label = None
459
+ if depth_path is not None and os.path.exists(depth_path):
460
+ image_cpu = torch.from_numpy(image_tensor_np).to(dtype=torch.float32)
461
+ gt_depth, prompt_depth, gt_depth_mask, used_input_depth, _ = prepare_metric_depth_inputs(
462
+ input_depth_path=depth_path,
463
+ input_size=input_size,
464
+ image=image_cpu,
465
+ device=torch.device("cpu"),
466
+ moge2_pretrained=moge2_pretrained,
467
+ depth_load_kwargs={"enable_noise_filter": False},
468
+ )
469
+ gt = depth_to_disparity(gt_depth)
470
+ prompt = depth_to_disparity(prompt_depth)
471
+ prompt_mask = prompt > 0
472
+ depth_source_label = "uploaded depth" if used_input_depth else "MoGe-2 prior"
473
+ gt_depth_np = np.ascontiguousarray(gt.cpu().numpy().astype(np.float32))
474
+ prompt_depth_np = np.ascontiguousarray(prompt.cpu().numpy().astype(np.float32))
475
+ gt_depth_mask_np = np.ascontiguousarray(gt_depth_mask.cpu().numpy().astype(np.float32))
476
+ prompt_mask_np = np.ascontiguousarray(prompt_mask.cpu().numpy())
477
+ _report_stage(stage_callback, f"cpu:uploaded_depth_prepared source={depth_source_label}")
478
+
479
+ model_path = resolve_checkpoint_path(model_type)
480
+ _report_stage(stage_callback, "cpu:model_path_resolved")
481
+ (model_cache or ModelCache()).preload(model_type=model_type, model_path=model_path)
482
+ _report_stage(stage_callback, "cpu:model_cached")
483
+ _report_stage(stage_callback, "cpu:prepare_completed")
484
+
485
+ return PreparedGPURequest(
486
+ image_tensor_np=image_tensor_np,
487
+ query_coord_np=query_coord_np,
488
+ gt_depth_np=gt_depth_np,
489
+ prompt_depth_np=prompt_depth_np,
490
+ gt_depth_mask_np=gt_depth_mask_np,
491
+ prompt_mask_np=prompt_mask_np,
492
+ depth_source_label=depth_source_label,
493
+ model_path=model_path,
494
+ moge2_pretrained=moge2_pretrained,
495
+ h_out=h_out,
496
+ w_out=w_out,
497
+ org_h=org_h,
498
+ org_w=org_w,
499
+ )
500
+
501
+
502
+ def run_prepared_gpu_inference(
503
+ image_tensor_np: np.ndarray,
504
+ query_coord_np: np.ndarray,
505
+ model_type: str,
506
+ model_path: str,
507
+ moge2_pretrained: str,
508
+ h_out: int,
509
+ w_out: int,
510
+ org_h: int,
511
+ org_w: int,
512
+ depth_source_label: Optional[str] = None,
513
+ gt_depth_np: Optional[np.ndarray] = None,
514
+ prompt_depth_np: Optional[np.ndarray] = None,
515
+ gt_depth_mask_np: Optional[np.ndarray] = None,
516
+ prompt_mask_np: Optional[np.ndarray] = None,
517
+ fx_org: Optional[float] = None,
518
+ fy_org: Optional[float] = None,
519
+ cx_org: Optional[float] = None,
520
+ cy_org: Optional[float] = None,
521
+ model_cache: Optional[ModelCache] = None,
522
+ stage_callback: Optional[Callable[[str], None]] = None,
523
+ ) -> GPUInferenceResult:
524
+ """Run CUDA-bound MoGe/model inference and return CPU outputs."""
525
+ image_shape = tuple(int(d) for d in image_tensor_np.shape)
526
+ _report_stage(stage_callback, "gpu:start")
527
+ Log.info(
528
+ f"run_prepared_gpu_inference: model_type={model_type}, image_tensor_shape={image_shape}, "
529
+ f"output_size={h_out}x{w_out}, has_prepared_depth={gt_depth_np is not None}, "
530
+ f"cuda_available={torch.cuda.is_available()}"
531
+ )
532
+ if not torch.cuda.is_available():
533
+ raise RuntimeError(
534
+ "No CUDA GPU is available. If using Hugging Face ZeroGPU, "
535
+ "decorate the Gradio inference function with @spaces.GPU and enable queue()."
536
+ )
537
+
538
+ device = torch.device("cuda")
539
+ _debug = os.getenv("INFINIDEPTH_DEBUG_GPU", "0") == "1"
540
+
541
+ image = torch.from_numpy(image_tensor_np).to(device=device, dtype=torch.float32)
542
+ _report_stage(stage_callback, "gpu:image_to_device")
543
+ if _debug:
544
+ torch.cuda.synchronize()
545
+ Log.info(f"[GPU-DEBUG] image_to_device: GPU mem allocated={torch.cuda.memory_allocated(device) / 1e6:.1f}MB")
546
+
547
+ h_in, w_in = int(image.shape[-2]), int(image.shape[-1])
548
+
549
+ if gt_depth_np is not None:
550
+ _report_stage(stage_callback, "gpu:using_prepared_depth_inputs")
551
+ gt_depth = torch.from_numpy(gt_depth_np).to(device=device, dtype=torch.float32)
552
+ prompt_depth = torch.from_numpy(prompt_depth_np).to(device=device, dtype=torch.float32)
553
+ gt_depth_mask = torch.from_numpy(gt_depth_mask_np).to(device=device, dtype=torch.float32)
554
+ prompt_mask = torch.from_numpy(prompt_mask_np).to(device=device)
555
+ moge2_intrinsics = None
556
+ resolved_depth_source_label = depth_source_label or "uploaded depth"
557
+ else:
558
+ _report_stage(stage_callback, "gpu:resolving_depth")
559
+ gt_depth, prompt_depth, gt_depth_mask, prompt_mask, resolved_depth_source_label, _, moge2_intrinsics = \
560
+ _resolve_depth_inputs(
561
+ depth_path=None,
562
+ input_size=(h_in, w_in),
563
+ image=image,
564
+ device=device,
565
+ moge2_pretrained=moge2_pretrained,
566
+ )
567
+ if _debug:
568
+ torch.cuda.synchronize()
569
+ Log.info(f"[GPU-DEBUG] depth_resolved: GPU mem allocated={torch.cuda.memory_allocated(device) / 1e6:.1f}MB")
570
+ _report_stage(stage_callback, f"gpu:depth_resolved source={resolved_depth_source_label}")
571
+
572
+ Log.info(f"Depth source resolved: {resolved_depth_source_label}")
573
+
574
+ _report_stage(stage_callback, "gpu:loading_model")
575
+ model_cache = model_cache or ModelCache()
576
+ model = model_cache.get(model_type=model_type, model_path=model_path, device=device)
577
+ if _debug:
578
+ torch.cuda.synchronize()
579
+ Log.info(f"[GPU-DEBUG] model_loaded: GPU mem allocated={torch.cuda.memory_allocated(device) / 1e6:.1f}MB")
580
+ _report_stage(stage_callback, "gpu:model_loaded")
581
+
582
+ query_2d_uniform_coord = torch.from_numpy(query_coord_np).to(device=device, dtype=torch.float32)
583
+ _report_stage(stage_callback, "gpu:query_coord_to_device")
584
+ _report_stage(stage_callback, "gpu:inference_started")
585
+ pred_depth, _ = model.inference(
586
+ image=image, query_coord=query_2d_uniform_coord,
587
+ gt_depth=gt_depth, gt_depth_mask=gt_depth_mask,
588
+ prompt_depth=prompt_depth, prompt_mask=prompt_mask,
589
+ )
590
+ if _debug:
591
+ torch.cuda.synchronize()
592
+ Log.info(f"[GPU-DEBUG] inference_finished: GPU mem allocated={torch.cuda.memory_allocated(device) / 1e6:.1f}MB")
593
+ _report_stage(stage_callback, "gpu:inference_finished")
594
+
595
+ pred_depthmap = pred_depth.permute(0, 2, 1).reshape(1, 1, h_out, w_out)
596
+
597
+ fx, fy, cx, cy, intrinsics_source_label = resolve_camera_intrinsics_for_inference(
598
+ fx_org=fx_org,
599
+ fy_org=fy_org,
600
+ cx_org=cx_org,
601
+ cy_org=cy_org,
602
+ org_h=org_h,
603
+ org_w=org_w,
604
+ image=image,
605
+ moge2_pretrained=moge2_pretrained,
606
+ moge2_intrinsics=moge2_intrinsics,
607
+ )
608
+ Log.info(f"Camera intrinsics source: {intrinsics_source_label}")
609
+ fx_out, fy_out, cx_out, cy_out, _ = build_scaled_intrinsics_matrix(
610
+ fx_org=fx,
611
+ fy_org=fy,
612
+ cx_org=cx,
613
+ cy_org=cy,
614
+ org_h=org_h,
615
+ org_w=org_w,
616
+ h=h_in,
617
+ w=w_in,
618
+ device=device,
619
+ )
620
+
621
+ _report_stage(stage_callback, "gpu:transferring_to_cpu")
622
+ _report_stage(stage_callback, "gpu:complete")
623
+ return GPUInferenceResult(
624
+ pred_depthmap_np=pred_depthmap[0, 0].detach().cpu().numpy().astype(np.float32),
625
+ query_coord_np=query_2d_uniform_coord.detach().cpu().numpy().astype(np.float32),
626
+ pred_depth_np=pred_depth.detach().cpu().numpy().astype(np.float32),
627
+ image_tensor_np=image.detach().cpu().numpy().astype(np.float32),
628
+ depth_source_label=resolved_depth_source_label,
629
+ intrinsics_source_label=intrinsics_source_label,
630
+ h_out=h_out,
631
+ w_out=w_out,
632
+ org_h=org_h,
633
+ org_w=org_w,
634
+ fx_out=float(fx_out),
635
+ fy_out=float(fy_out),
636
+ cx_out=float(cx_out),
637
+ cy_out=float(cy_out),
638
+ )
639
+
640
+
641
  def run_single_image_demo(
642
  image_np: np.ndarray,
643
  depth_path: Optional[str],
 
800
  model_cache: Optional[ModelCache] = None,
801
  stage_callback: Optional[Callable[[str], None]] = None,
802
  ) -> GPUInferenceResult:
803
+ """Run GPU inference with CPU preprocessing performed ahead of time."""
804
+ prepared = prepare_gpu_request_inputs(
805
+ image_np=image_np,
806
+ depth_path=depth_path,
807
+ model_type=model_type,
808
+ input_size_text=input_size_text,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
809
  output_resolution_mode=output_resolution_mode,
810
+ upsample_ratio=upsample_ratio,
811
+ model_cache=model_cache,
812
+ stage_callback=stage_callback,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
813
  )
814
+ return run_prepared_gpu_inference(
815
+ image_tensor_np=prepared.image_tensor_np,
816
+ query_coord_np=prepared.query_coord_np,
817
+ model_type=model_type,
818
+ model_path=prepared.model_path,
819
+ moge2_pretrained=prepared.moge2_pretrained,
820
+ h_out=prepared.h_out,
821
+ w_out=prepared.w_out,
822
+ org_h=prepared.org_h,
823
+ org_w=prepared.org_w,
824
+ depth_source_label=prepared.depth_source_label,
825
+ gt_depth_np=prepared.gt_depth_np,
826
+ prompt_depth_np=prepared.prompt_depth_np,
827
+ gt_depth_mask_np=prepared.gt_depth_mask_np,
828
+ prompt_mask_np=prepared.prompt_mask_np,
829
+ fx_org=fx_org,
830
+ fy_org=fy_org,
831
+ cx_org=cx_org,
832
+ cy_org=cy_org,
833
+ model_cache=model_cache,
834
+ stage_callback=stage_callback,
835
  )
 
 
836
 
837
 
838
  def postprocess_gpu_result(
app.py CHANGED
@@ -27,9 +27,10 @@ from PIL import Image
27
  from InfiniDepth.utils.hf_demo_utils import (
28
  ModelCache,
29
  postprocess_gpu_result,
 
30
  prepare_runtime_assets,
31
  preload_space_runtime_models,
32
- run_gpu_inference,
33
  )
34
  from InfiniDepth.utils.logger import Log
35
 
@@ -47,6 +48,33 @@ TRACE_ROOT = OUTPUT_ROOT / "trace"
47
  EXAMPLE_DATA_ROOT = Path(__file__).resolve().parent / "example_data"
48
  EXAMPLE_IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp"}
49
  EXAMPLE_DEPTH_EXTENSIONS = {".png", ".npy", ".npz", ".h5", ".hdf5", ".exr"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
  CUSTOM_CSS = """
52
  #main-layout {
@@ -245,38 +273,45 @@ def _export_glb_from_points(xyz: np.ndarray, rgb: np.ndarray, output_path: Path)
245
  cloud.export(output_path.as_posix())
246
 
247
 
248
- @spaces.GPU(duration=120)
249
  def run_demo_gpu(
250
- image: np.ndarray,
251
- depth_file,
252
  model_type: str,
253
- input_size: str,
254
- output_resolution_mode: str,
255
- upsample_ratio: int,
 
 
 
 
 
 
 
 
256
  fx_org: Optional[float],
257
  fy_org: Optional[float],
258
  cx_org: Optional[float],
259
  cy_org: Optional[float],
260
  trace_path: str,
261
  ):
262
- """GPU-only inference. Returns a GPUInferenceResult with all data on CPU."""
263
- import torch
264
  _append_trace(trace_path, "worker:entered run_demo_gpu")
265
-
266
- if image is None:
267
- raise ValueError("Input RGB image is required")
268
-
269
- depth_path = None
270
- if depth_file is not None:
271
- depth_path = depth_file if isinstance(depth_file, str) else depth_file.name
272
-
273
- return run_gpu_inference(
274
- image_np=image,
275
- depth_path=depth_path,
276
  model_type=model_type,
277
- input_size_text=input_size,
278
- output_resolution_mode=output_resolution_mode,
279
- upsample_ratio=int(upsample_ratio),
 
 
 
 
 
 
 
 
280
  fx_org=_none_if_invalid(fx_org),
281
  fy_org=_none_if_invalid(fy_org),
282
  cx_org=_none_if_invalid(cx_org),
@@ -310,20 +345,42 @@ def run_demo(
310
  f"[{request_id}] run_demo start: model_type={model_type}, "
311
  f"input_size={input_size}, output_resolution_mode={output_resolution_mode}, "
312
  f"upsample_ratio={upsample_ratio}, max_points_preview={max_points_preview}, "
313
- f"depth_path={depth_path}, image_shape={image_shape}"
 
314
  )
315
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
316
  # --- GPU-only inference (consumes ZeroGPU quota) ---
317
  # ZeroGPU proxy tokens are bound to the current Gradio request.
318
  # Retrying a @spaces.GPU call inside the same request can turn a transient
319
  # "GPU task aborted" into a deterministic "Expired ZeroGPU proxy token".
320
  gpu_result = run_demo_gpu(
321
- image=image,
322
- depth_file=depth_file,
323
  model_type=model_type,
324
- input_size=input_size,
325
- output_resolution_mode=output_resolution_mode,
326
- upsample_ratio=upsample_ratio,
 
 
 
 
 
 
 
 
327
  fx_org=fx_org,
328
  fy_org=fy_org,
329
  cx_org=cx_org,
@@ -380,6 +437,10 @@ def run_demo(
380
  exc_type = type(exc).__name__
381
  exc_module = type(exc).__module__ or ""
382
  is_zerogpu_error = "spaces" in exc_module or "ZeroGPU" in str(exc) or "GPU task aborted" in str(exc)
 
 
 
 
383
  if is_zerogpu_error:
384
  error_message = (
385
  f"[{request_id}] ZeroGPU error: {exc}\n\n"
@@ -389,6 +450,15 @@ def run_demo(
389
  " - GPU task was preempted/aborted (click the button again)\n"
390
  " - duration too high for remaining quota"
391
  )
 
 
 
 
 
 
 
 
 
392
  else:
393
  error_message = f"Error [{request_id}] ({exc_type}): {exc}"
394
 
@@ -465,7 +535,8 @@ with gr.Blocks(title="InfiniDepth Demo", theme=gr.themes.Soft(), css=CUSTOM_CSS,
465
  "Tips: when a depth map is uploaded it will be used automatically, otherwise the demo falls back to MoGe-2. "
466
  "If camera intrinsics are missing, the demo first tries MoGe-2 estimates before image-size defaults. "
467
  "Use lower preview points for faster 3D interaction. "
468
- "On ZeroGPU, `512x672` is the safest default for cold starts."
 
469
  )
470
 
471
  with gr.Column(elem_id="right-panel"):
 
27
  from InfiniDepth.utils.hf_demo_utils import (
28
  ModelCache,
29
  postprocess_gpu_result,
30
+ prepare_gpu_request_inputs,
31
  prepare_runtime_assets,
32
  preload_space_runtime_models,
33
+ run_prepared_gpu_inference,
34
  )
35
  from InfiniDepth.utils.logger import Log
36
 
 
48
  EXAMPLE_DATA_ROOT = Path(__file__).resolve().parent / "example_data"
49
  EXAMPLE_IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp"}
50
  EXAMPLE_DEPTH_EXTENSIONS = {".png", ".npy", ".npz", ".h5", ".hdf5", ".exr"}
51
+ MAX_ZEROGPU_DURATION_SECONDS = 300
52
+
53
+
54
+ def _resolve_zerogpu_duration_seconds() -> int:
55
+ raw_value = os.getenv("INFINIDEPTH_ZEROGPU_DURATION", str(MAX_ZEROGPU_DURATION_SECONDS))
56
+ try:
57
+ duration = int(raw_value)
58
+ except ValueError:
59
+ Log.warning(
60
+ f"Invalid INFINIDEPTH_ZEROGPU_DURATION={raw_value!r}; falling back to {MAX_ZEROGPU_DURATION_SECONDS} seconds."
61
+ )
62
+ return MAX_ZEROGPU_DURATION_SECONDS
63
+ if duration < 1:
64
+ Log.warning(
65
+ f"Non-positive INFINIDEPTH_ZEROGPU_DURATION={duration}; falling back to {MAX_ZEROGPU_DURATION_SECONDS} seconds."
66
+ )
67
+ return MAX_ZEROGPU_DURATION_SECONDS
68
+ if duration > MAX_ZEROGPU_DURATION_SECONDS:
69
+ Log.warning(
70
+ f"INFINIDEPTH_ZEROGPU_DURATION={duration} exceeds the supported ZeroGPU ceiling; "
71
+ f"clamping to {MAX_ZEROGPU_DURATION_SECONDS} seconds."
72
+ )
73
+ return MAX_ZEROGPU_DURATION_SECONDS
74
+ return duration
75
+
76
+
77
+ ZEROGPU_DURATION_SECONDS = _resolve_zerogpu_duration_seconds()
78
 
79
  CUSTOM_CSS = """
80
  #main-layout {
 
273
  cloud.export(output_path.as_posix())
274
 
275
 
276
+ @spaces.GPU(duration=ZEROGPU_DURATION_SECONDS)
277
  def run_demo_gpu(
278
+ image_tensor_np: np.ndarray,
279
+ query_coord_np: np.ndarray,
280
  model_type: str,
281
+ model_path: str,
282
+ moge2_pretrained: str,
283
+ h_out: int,
284
+ w_out: int,
285
+ org_h: int,
286
+ org_w: int,
287
+ prepared_depth_source_label: Optional[str],
288
+ gt_depth_np: Optional[np.ndarray],
289
+ prompt_depth_np: Optional[np.ndarray],
290
+ gt_depth_mask_np: Optional[np.ndarray],
291
+ prompt_mask_np: Optional[np.ndarray],
292
  fx_org: Optional[float],
293
  fy_org: Optional[float],
294
  cx_org: Optional[float],
295
  cy_org: Optional[float],
296
  trace_path: str,
297
  ):
298
+ """ZeroGPU section: run MoGe/model inference on GPU and return CPU outputs."""
 
299
  _append_trace(trace_path, "worker:entered run_demo_gpu")
300
+ return run_prepared_gpu_inference(
301
+ image_tensor_np=image_tensor_np,
302
+ query_coord_np=query_coord_np,
 
 
 
 
 
 
 
 
303
  model_type=model_type,
304
+ model_path=model_path,
305
+ moge2_pretrained=moge2_pretrained,
306
+ h_out=h_out,
307
+ w_out=w_out,
308
+ org_h=org_h,
309
+ org_w=org_w,
310
+ depth_source_label=prepared_depth_source_label,
311
+ gt_depth_np=gt_depth_np,
312
+ prompt_depth_np=prompt_depth_np,
313
+ gt_depth_mask_np=gt_depth_mask_np,
314
+ prompt_mask_np=prompt_mask_np,
315
  fx_org=_none_if_invalid(fx_org),
316
  fy_org=_none_if_invalid(fy_org),
317
  cx_org=_none_if_invalid(cx_org),
 
345
  f"[{request_id}] run_demo start: model_type={model_type}, "
346
  f"input_size={input_size}, output_resolution_mode={output_resolution_mode}, "
347
  f"upsample_ratio={upsample_ratio}, max_points_preview={max_points_preview}, "
348
+ f"depth_path={depth_path}, image_shape={image_shape}, "
349
+ f"zerogpu_duration={ZEROGPU_DURATION_SECONDS}s"
350
  )
351
  try:
352
+ _append_trace(trace_path, "ui:preparing_cpu_inputs")
353
+ prepared_gpu_request = prepare_gpu_request_inputs(
354
+ image_np=image,
355
+ depth_path=depth_path,
356
+ model_type=model_type,
357
+ input_size_text=input_size,
358
+ output_resolution_mode=output_resolution_mode,
359
+ upsample_ratio=int(upsample_ratio),
360
+ model_cache=MODEL_CACHE,
361
+ stage_callback=lambda stage: _append_trace(trace_path, stage),
362
+ )
363
+ _append_trace(trace_path, "ui:cpu_inputs_ready, entering gpu")
364
+
365
  # --- GPU-only inference (consumes ZeroGPU quota) ---
366
  # ZeroGPU proxy tokens are bound to the current Gradio request.
367
  # Retrying a @spaces.GPU call inside the same request can turn a transient
368
  # "GPU task aborted" into a deterministic "Expired ZeroGPU proxy token".
369
  gpu_result = run_demo_gpu(
370
+ image_tensor_np=prepared_gpu_request.image_tensor_np,
371
+ query_coord_np=prepared_gpu_request.query_coord_np,
372
  model_type=model_type,
373
+ model_path=prepared_gpu_request.model_path,
374
+ moge2_pretrained=prepared_gpu_request.moge2_pretrained,
375
+ h_out=prepared_gpu_request.h_out,
376
+ w_out=prepared_gpu_request.w_out,
377
+ org_h=prepared_gpu_request.org_h,
378
+ org_w=prepared_gpu_request.org_w,
379
+ prepared_depth_source_label=prepared_gpu_request.depth_source_label,
380
+ gt_depth_np=prepared_gpu_request.gt_depth_np,
381
+ prompt_depth_np=prepared_gpu_request.prompt_depth_np,
382
+ gt_depth_mask_np=prepared_gpu_request.gt_depth_mask_np,
383
+ prompt_mask_np=prepared_gpu_request.prompt_mask_np,
384
  fx_org=fx_org,
385
  fy_org=fy_org,
386
  cx_org=cx_org,
 
437
  exc_type = type(exc).__name__
438
  exc_module = type(exc).__module__ or ""
439
  is_zerogpu_error = "spaces" in exc_module or "ZeroGPU" in str(exc) or "GPU task aborted" in str(exc)
440
+ likely_gpu_timeout = (
441
+ "GPU task aborted" in str(exc)
442
+ and "gpu:complete" not in trace_content
443
+ )
444
  if is_zerogpu_error:
445
  error_message = (
446
  f"[{request_id}] ZeroGPU error: {exc}\n\n"
 
450
  " - GPU task was preempted/aborted (click the button again)\n"
451
  " - duration too high for remaining quota"
452
  )
453
+ if likely_gpu_timeout:
454
+ error_message = (
455
+ f"{error_message}\n"
456
+ f" - configured GPU runtime budget too short (current `@spaces.GPU(duration={ZEROGPU_DURATION_SECONDS})`)\n\n"
457
+ f"Current ZeroGPU duration: {ZEROGPU_DURATION_SECONDS}s.\n"
458
+ "This request likely exceeded the configured GPU runtime budget.\n"
459
+ "Try `512x672`, keep `upsample_ratio=1`, avoid `original` output for large images, "
460
+ f"or move to a dedicated GPU Space if `{MAX_ZEROGPU_DURATION_SECONDS}s` is still not enough."
461
+ )
462
  else:
463
  error_message = f"Error [{request_id}] ({exc_type}): {exc}"
464
 
 
535
  "Tips: when a depth map is uploaded it will be used automatically, otherwise the demo falls back to MoGe-2. "
536
  "If camera intrinsics are missing, the demo first tries MoGe-2 estimates before image-size defaults. "
537
  "Use lower preview points for faster 3D interaction. "
538
+ f"On ZeroGPU, `512x672` with `upsample_ratio=1` is the safest default for cold starts. "
539
+ f"The current GPU runtime budget is `{ZEROGPU_DURATION_SECONDS}s`."
540
  )
541
 
542
  with gr.Column(elem_id="right-panel"):