# DepthLens / app.py
# Rishabh Jain
# Pre-load models at import; wrap inference with longer-duration @spaces.GPU
# 9b33910
"""
Hugging Face Spaces entry point (ZeroGPU compatible).
On ZeroGPU, CUDA is available at import time AND inside @spaces.GPU
decorated functions. Models are loaded once at import so the first
request does not spend minutes downloading weights inside a short GPU
slice (which caused every call to fall back to CPU).
"""
import os
import sys
# Put this file's directory at the front of sys.path so the `src.*` imports
# further down can resolve against it regardless of the launch directory.
# abspath() guards against a relative __file__: its dirname would be "",
# which Python treats as "the current working directory at import time"
# rather than this file's actual location.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
# ---------------------------------------------------------------------------
# Defensive patch: gradio_client.utils.get_type crashes on bool schemas.
# ---------------------------------------------------------------------------
try:
    from gradio_client import utils as _gc_utils

    # --- get_type: tolerate schemas that are not dicts ---------------------
    _orig_get_type = _gc_utils.get_type

    def _safe_get_type(schema):
        """Return "Any" for non-dict schemas, else defer to gradio_client."""
        # A bool is never a dict, so this single test also covers the bare
        # True/False schemas that the stock get_type crashes on.
        if isinstance(schema, dict):
            return _orig_get_type(schema)
        return "Any"

    _gc_utils.get_type = _safe_get_type

    # --- _json_schema_to_python_type: same guard, plus error fallback ------
    # Looked up only after get_type has been patched, so a gradio_client
    # build without this private helper still keeps the first patch.
    _orig_json_to_py = _gc_utils._json_schema_to_python_type

    def _safe_json_to_py(schema, defs=None):
        """Convert a JSON schema to a type string, falling back to "Any"."""
        if not isinstance(schema, dict):
            return "Any"
        try:
            py_type = _orig_json_to_py(schema, defs)
        except (TypeError, KeyError):
            # Malformed or unexpected schema shape: degrade instead of crash.
            py_type = "Any"
        return py_type

    _gc_utils._json_schema_to_python_type = _safe_json_to_py
except Exception:
    # Best-effort patch: if gradio_client is missing or laid out differently,
    # carry on with whatever (if anything) was patched before the failure.
    pass
# ---------------------------------------------------------------------------
# ZeroGPU import + pipeline warm-up
# ---------------------------------------------------------------------------
# `spaces` is optional: record whether it could be imported so the GPU
# wrapping further down can be skipped when it is absent.
try:
    import spaces
except ImportError:
    _HAS_SPACES = False
else:
    _HAS_SPACES = True
from src.ui import gradio_app as _ga
from src.pipeline import Pipeline
# Pre-load every model at import — on ZeroGPU torch.cuda is available here
# and the loaded tensors stay on GPU across @spaces.GPU calls (the runtime
# handles slice re-attachment transparently).
def _preload_pipeline():
    """Create the Pipeline and invoke each of its model getters once.

    The getters are called in a fixed order (VLM, depth, detector).
    NOTE(review): presumably each getter loads and caches its model on
    first use -- confirm in src.pipeline.

    Returns:
        The Pipeline instance after all three getters have been called.
    """
    pipeline = Pipeline()
    pipeline._get_vlm()
    pipeline._get_depth()
    pipeline._get_detector()
    return pipeline


print("Pre-loading pipeline (VLM + Depth + YOLO)...")
_pipeline = _preload_pipeline()
# Hand the warmed-up instance to the UI module via its _PIPELINE attribute.
_ga._PIPELINE = _pipeline
print("Pipeline ready.")
# ---------------------------------------------------------------------------
# Wrap inference entry points so ZeroGPU keeps a GPU slice for the call.
# 300 s covers first-call weight migration + heaviest Stage 3 inference.
# ---------------------------------------------------------------------------
if _HAS_SPACES:
    # Each handler is handled as one unit: keep a reference to the original,
    # wrap it in a @spaces.GPU(duration=300) pass-through, then rebind the
    # attribute on the UI module to the wrapper.
    #
    # NOTE(review): the wrappers are plain functions that *return* whatever
    # the original returns. If either handler is a generator function, that
    # is hidden from spaces.GPU and Gradio here -- confirm against src.ui.

    # --- run_inference ----------------------------------------------------
    _orig_run = _ga.run_inference

    @spaces.GPU(duration=300)
    def _gpu_run_inference(*args, **kwargs):
        return _orig_run(*args, **kwargs)

    _ga.run_inference = _gpu_run_inference

    # --- live_run_inference -----------------------------------------------
    _orig_live = _ga.live_run_inference

    @spaces.GPU(duration=300)
    def _gpu_live_inference(*args, **kwargs):
        return _orig_live(*args, **kwargs)

    _ga.live_run_inference = _gpu_live_inference
# Build the UI last, after the pipeline has been attached to the UI module
# and (when `spaces` is available) its inference handlers have been rebound.
# `demo` stays a module-level name so it is available to importers of this
# module as well as to the script entry point below.
demo = _ga.build_ui()

# Launch the app only when this file is executed directly.
if __name__ == "__main__":
    demo.launch()