Spaces:
Sleeping
Sleeping
File size: 2,884 Bytes
5412d82 842d4e9 5412d82 9b33910 5412d82 842d4e9 5412d82 9b33910 d265d8e 9b33910 d265d8e 9b33910 842d4e9 9b33910 842d4e9 9b33910 842d4e9 9b33910 842d4e9 9b33910 5412d82 842d4e9 5412d82 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 | """
Hugging Face Spaces entry point (ZeroGPU compatible).
On ZeroGPU, CUDA is available at import time AND inside @spaces.GPU
decorated functions. Models are loaded once at import so the first
request does not spend minutes downloading weights inside a short GPU
slice (which caused every call to fall back to CPU).
"""
import os
import sys
sys.path.insert(0, os.path.dirname(__file__))
# ---------------------------------------------------------------------------
# Defensive patch: gradio_client.utils.get_type crashes on bool schemas.
# ---------------------------------------------------------------------------
try:
    from gradio_client import utils as _gc_utils
except Exception as _gc_import_error:
    # Best-effort: a missing or broken gradio_client must never stop the app
    # from starting, but report it instead of failing silently.
    _gc_utils = None
    print(
        f"gradio_client patch skipped: {_gc_import_error!r}",
        file=sys.stderr,
    )

if _gc_utils is not None:
    # Each helper is looked up and patched on its own, so one missing
    # attribute does not stop the other from being guarded.
    _orig_get_type = getattr(_gc_utils, "get_type", None)
    _orig_json_to_py = getattr(_gc_utils, "_json_schema_to_python_type", None)

    if callable(_orig_get_type):

        def _safe_get_type(schema):
            """Return "Any" for a non-dict schema, else call the original."""
            # bool is not a dict, so this single check also covers the bool
            # schemas that the original get_type crashes on.
            if not isinstance(schema, dict):
                return "Any"
            return _orig_get_type(schema)

        _gc_utils.get_type = _safe_get_type
    else:
        print(
            "gradio_client.utils.get_type not found; patch skipped",
            file=sys.stderr,
        )

    if callable(_orig_json_to_py):

        def _safe_json_to_py(schema, defs=None):
            """Return "Any" for a non-dict schema or when the original raises.

            Only TypeError and KeyError from the original converter are
            turned into "Any"; any other exception propagates unchanged.
            """
            if not isinstance(schema, dict):
                return "Any"
            try:
                return _orig_json_to_py(schema, defs)
            except (TypeError, KeyError):
                return "Any"

        _gc_utils._json_schema_to_python_type = _safe_json_to_py
    else:
        print(
            "gradio_client.utils._json_schema_to_python_type not found; "
            "patch skipped",
            file=sys.stderr,
        )
# ---------------------------------------------------------------------------
# ZeroGPU import + pipeline warm-up
# ---------------------------------------------------------------------------
# `spaces` is treated as optional: remember whether it imported so the GPU
# wrapping below can be skipped when it is not installed.
try:
    import spaces
except ImportError:
    _HAS_SPACES = False
else:
    _HAS_SPACES = True
from src.ui import gradio_app as _ga
from src.pipeline import Pipeline
# Pre-load every model at import — on ZeroGPU torch.cuda is available here
# and the loaded tensors stay on GPU across @spaces.GPU calls (the runtime
# handles slice re-attachment transparently).
def _preload_pipeline():
    """Create a Pipeline and call its three model getters once each.

    The getters are invoked in the fixed order ``_get_vlm``, ``_get_depth``,
    ``_get_detector``; the Pipeline instance is returned afterwards.
    """
    pipeline = Pipeline()
    for getter_name in ("_get_vlm", "_get_depth", "_get_detector"):
        # Resolve and call one getter at a time so the order is preserved.
        getattr(pipeline, getter_name)()
    return pipeline


print("Pre-loading pipeline (VLM + Depth + YOLO)...")
_pipeline = _preload_pipeline()
# Hand the pre-loaded instance to the UI module.
_ga._PIPELINE = _pipeline
print("Pipeline ready.")
# ---------------------------------------------------------------------------
# Wrap inference entry points so ZeroGPU keeps a GPU slice for the call.
# 300 s covers first-call weight migration + heaviest Stage 3 inference.
# ---------------------------------------------------------------------------
# GPU duration (seconds) requested for each wrapped entry point.
_GPU_SLICE_SECONDS = 300

if _HAS_SPACES:
    # Keep the undecorated callables before their names are rebound below.
    _orig_run, _orig_live = _ga.run_inference, _ga.live_run_inference

    @spaces.GPU(duration=_GPU_SLICE_SECONDS)
    def _gpu_run_inference(*args, **kwargs):
        # Forward every argument unchanged to the original run_inference.
        return _orig_run(*args, **kwargs)

    @spaces.GPU(duration=_GPU_SLICE_SECONDS)
    def _gpu_live_inference(*args, **kwargs):
        # Forward every argument unchanged to the original live_run_inference.
        return _orig_live(*args, **kwargs)

    # Rebind both names on the UI module to the GPU-decorated wrappers.
    _ga.run_inference, _ga.live_run_inference = (
        _gpu_run_inference,
        _gpu_live_inference,
    )

# The UI is built only after the rebinding above.
# NOTE(review): presumably build_ui() resolves run_inference /
# live_run_inference from the module at call time - confirm in src/ui.
demo = _ga.build_ui()

if __name__ == "__main__":
    demo.launch()
|