""" Hugging Face Spaces entry point (ZeroGPU compatible). On ZeroGPU, CUDA is available at import time AND inside @spaces.GPU decorated functions. Models are loaded once at import so the first request does not spend minutes downloading weights inside a short GPU slice (which caused every call to fall back to CPU). """ import os import sys sys.path.insert(0, os.path.dirname(__file__)) # --------------------------------------------------------------------------- # Defensive patch: gradio_client.utils.get_type crashes on bool schemas. # --------------------------------------------------------------------------- try: from gradio_client import utils as _gc_utils _orig_get_type = _gc_utils.get_type def _safe_get_type(schema): if isinstance(schema, bool) or not isinstance(schema, dict): return "Any" return _orig_get_type(schema) _gc_utils.get_type = _safe_get_type _orig_json_to_py = _gc_utils._json_schema_to_python_type def _safe_json_to_py(schema, defs=None): if isinstance(schema, bool) or not isinstance(schema, dict): return "Any" try: return _orig_json_to_py(schema, defs) except (TypeError, KeyError): return "Any" _gc_utils._json_schema_to_python_type = _safe_json_to_py except Exception: pass # --------------------------------------------------------------------------- # ZeroGPU import + pipeline warm-up # --------------------------------------------------------------------------- try: import spaces _HAS_SPACES = True except ImportError: _HAS_SPACES = False from src.ui import gradio_app as _ga from src.pipeline import Pipeline # Pre-load every model at import — on ZeroGPU torch.cuda is available here # and the loaded tensors stay on GPU across @spaces.GPU calls (the runtime # handles slice re-attachment transparently). print("Pre-loading pipeline (VLM + Depth + YOLO)...") _pipeline = Pipeline() _pipeline._get_vlm() _pipeline._get_depth() _pipeline._get_detector() _ga._PIPELINE = _pipeline print("Pipeline ready.") # --------------------------------------------------------------------------- # Wrap inference entry points so ZeroGPU keeps a GPU slice for the call. # 300 s covers first-call weight migration + heaviest Stage 3 inference. # --------------------------------------------------------------------------- if _HAS_SPACES: _orig_run = _ga.run_inference _orig_live = _ga.live_run_inference @spaces.GPU(duration=300) def _gpu_run_inference(*args, **kwargs): return _orig_run(*args, **kwargs) @spaces.GPU(duration=300) def _gpu_live_inference(*args, **kwargs): return _orig_live(*args, **kwargs) _ga.run_inference = _gpu_run_inference _ga.live_run_inference = _gpu_live_inference demo = _ga.build_ui() if __name__ == "__main__": demo.launch()