Spaces:
Sleeping
Sleeping
| """ | |
| Hugging Face Spaces entry point (ZeroGPU compatible). | |
| On ZeroGPU, CUDA is available at import time AND inside @spaces.GPU | |
| decorated functions. Models are loaded once at import so the first | |
| request does not spend minutes downloading weights inside a short GPU | |
| slice (which caused every call to fall back to CPU). | |
| """ | |
| import os | |
| import sys | |
| sys.path.insert(0, os.path.dirname(__file__)) | |
# ---------------------------------------------------------------------------
# Defensive patch: gradio_client.utils.get_type crashes on bool schemas.
# ---------------------------------------------------------------------------
try:
    from gradio_client import utils as _gc_utils
except ImportError:
    # gradio_client is not installed, so there is nothing to patch.
    _gc_utils = None
except Exception as _import_error:
    # Stay best-effort on an unexpected import failure, but report it
    # instead of swallowing it silently.
    print(f"gradio_client patch skipped: {_import_error!r}")
    _gc_utils = None

# Each original is looked up on its own so that one missing attribute does
# not prevent the other patch from being installed.
_orig_get_type = getattr(_gc_utils, "get_type", None)
_orig_json_to_py = getattr(_gc_utils, "_json_schema_to_python_type", None)


def _safe_get_type(schema):
    """Return "Any" for a non-dict schema, else defer to the original.

    Bare booleans are not dicts, so they take the "Any" path instead of
    reaching the original ``get_type``.
    """
    if not isinstance(schema, dict):
        return "Any"
    return _orig_get_type(schema)


def _safe_json_to_py(schema, defs=None):
    """Return "Any" for a non-dict schema or when the original fails.

    A ``TypeError`` or ``KeyError`` raised by the original converter is
    turned into the "Any" fallback; other exceptions propagate.
    """
    if not isinstance(schema, dict):
        return "Any"
    try:
        return _orig_json_to_py(schema, defs)
    except (TypeError, KeyError):
        return "Any"


if _orig_get_type is not None:
    _gc_utils.get_type = _safe_get_type
elif _gc_utils is not None:
    print("gradio_client patch: get_type not found; left unpatched")

if _orig_json_to_py is not None:
    _gc_utils._json_schema_to_python_type = _safe_json_to_py
elif _gc_utils is not None:
    print(
        "gradio_client patch: _json_schema_to_python_type not found; "
        "left unpatched"
    )
# ---------------------------------------------------------------------------
# ZeroGPU import + pipeline warm-up
# ---------------------------------------------------------------------------
# `spaces` is treated as optional: record whether the import succeeded so
# later code can check the flag before touching the module.
try:
    import spaces
except ImportError:
    _HAS_SPACES = False
else:
    _HAS_SPACES = True
| from src.ui import gradio_app as _ga | |
| from src.pipeline import Pipeline | |
# Warm every model at import time. On ZeroGPU torch.cuda is usable here, and
# the loaded tensors stay on the GPU across @spaces.GPU calls (the runtime
# re-attaches the slice transparently).
print("Pre-loading pipeline (VLM + Depth + YOLO)...")
_pipeline = Pipeline()
# Call the loaders in a fixed order: VLM, then depth, then detector.
for _loader_name in ("_get_vlm", "_get_depth", "_get_detector"):
    getattr(_pipeline, _loader_name)()
# Hand the warmed-up instance to the UI module.
_ga._PIPELINE = _pipeline
print("Pipeline ready.")
# ---------------------------------------------------------------------------
# Wrap inference entry points so ZeroGPU keeps a GPU slice for the call.
# 300 s covers first-call weight migration + heaviest Stage 3 inference.
# ---------------------------------------------------------------------------
if _HAS_SPACES:
    _orig_run = _ga.run_inference
    _orig_live = _ga.live_run_inference

    # The @spaces.GPU decorator is what requests the GPU slice; without it
    # these wrappers are plain pass-throughs.
    # NOTE(review): if either wrapped function is a generator function, the
    # wrapper may need to be one too so the decorator can detect it - confirm.
    @spaces.GPU(duration=300)
    def _gpu_run_inference(*args, **kwargs):
        """Forward all arguments to the original ``run_inference``."""
        return _orig_run(*args, **kwargs)

    @spaces.GPU(duration=300)
    def _gpu_live_inference(*args, **kwargs):
        """Forward all arguments to the original ``live_run_inference``."""
        return _orig_live(*args, **kwargs)

    # Rebind on the UI module before build_ui() is called below.
    _ga.run_inference = _gpu_run_inference
    _ga.live_run_inference = _gpu_live_inference

# Module-level UI object, built after the handlers have been rebound.
demo = _ga.build_ui()

if __name__ == "__main__":
    demo.launch()