"""
Hugging Face Spaces entry point (ZeroGPU compatible).

On ZeroGPU, CUDA is available at import time AND inside @spaces.GPU
decorated functions.  Models are loaded once at import so the first
request does not spend minutes downloading weights inside a short GPU
slice (which caused every call to fall back to CPU).
"""

import os
import sys

sys.path.insert(0, os.path.dirname(__file__))

# ---------------------------------------------------------------------------
# Defensive patch: gradio_client.utils.get_type crashes on bool schemas;
# gradio_client.utils._json_schema_to_python_type is guarded the same way.
# ---------------------------------------------------------------------------
try:
    from gradio_client import utils as _gc_utils

    # Keep a handle on the unpatched helper so the wrapper can delegate to it.
    _orig_get_type = _gc_utils.get_type

    def _safe_get_type(schema):
        """Return ``"Any"`` for non-dict schemas, else defer to the original.

        A ``bool`` is never a ``dict``, so the single ``isinstance`` check
        also covers boolean schemas.
        """
        if not isinstance(schema, dict):
            return "Any"
        return _orig_get_type(schema)

    # Patch get_type before touching the second helper: if the lookup of
    # _json_schema_to_python_type below fails, this patch stays in effect.
    _gc_utils.get_type = _safe_get_type

    _orig_json_to_py = _gc_utils._json_schema_to_python_type

    def _safe_json_to_py(schema, defs=None):
        """Return ``"Any"`` when the schema cannot be converted.

        That is the case when ``schema`` is not a dict, or when the original
        converter raises ``TypeError`` or ``KeyError`` for it.  Otherwise the
        original converter's result is returned unchanged.
        """
        if not isinstance(schema, dict):
            return "Any"
        try:
            return _orig_json_to_py(schema, defs)
        except (TypeError, KeyError):
            return "Any"

    _gc_utils._json_schema_to_python_type = _safe_json_to_py
except Exception as _patch_err:
    # Best-effort: start-up must continue even if gradio_client is missing or
    # its helpers were renamed, but report it instead of hiding the failure.
    print(
        f"Warning: gradio_client schema patch not applied: {_patch_err!r}",
        file=sys.stderr,
    )

# ---------------------------------------------------------------------------
# ZeroGPU import + pipeline warm-up
# ---------------------------------------------------------------------------
# `spaces` is treated as optional: when the import fails the app keeps going
# and the `_HAS_SPACES` check further down skips the GPU wrapping.
try:
    import spaces
except ImportError:
    _HAS_SPACES = False
else:
    _HAS_SPACES = True

from src.ui import gradio_app as _ga
from src.pipeline import Pipeline

# Eager warm-up at import time.  Rationale carried over from the original
# note: on ZeroGPU torch.cuda is usable here, and tensors loaded now remain
# on the GPU across @spaces.GPU calls because the runtime re-attaches the
# slice transparently.
def _preload_pipeline():
    # Build the pipeline and trigger each model getter once, in the same
    # order as before: VLM, then depth, then detector.
    warmed = Pipeline()
    warmed._get_vlm()
    warmed._get_depth()
    warmed._get_detector()
    return warmed


print("Pre-loading pipeline (VLM + Depth + YOLO)...")
_pipeline = _preload_pipeline()
# Publish the warmed instance on the UI module's _PIPELINE attribute.
_ga._PIPELINE = _pipeline
print("Pipeline ready.")

# ---------------------------------------------------------------------------
# Wrap inference entry points so ZeroGPU keeps a GPU slice for the call.
# 300 s covers first-call weight migration + heaviest Stage 3 inference.
# ---------------------------------------------------------------------------
if _HAS_SPACES:
    # One shared budget (seconds) for both wrapped entry points.
    _GPU_SLICE_SECONDS = 300

    # --- run_inference -----------------------------------------------------
    # Capture the undecorated callable before the module attribute is
    # rebound, so the wrapper never ends up calling itself.
    _orig_run = _ga.run_inference

    @spaces.GPU(duration=_GPU_SLICE_SECONDS)
    def _gpu_run_inference(*args, **kwargs):
        return _orig_run(*args, **kwargs)

    _ga.run_inference = _gpu_run_inference

    # --- live_run_inference ------------------------------------------------
    _orig_live = _ga.live_run_inference

    @spaces.GPU(duration=_GPU_SLICE_SECONDS)
    def _gpu_live_inference(*args, **kwargs):
        return _orig_live(*args, **kwargs)

    _ga.live_run_inference = _gpu_live_inference

# Build the UI only after the inference entry points on `_ga` have been
# (conditionally) replaced above.
# NOTE(review): this presumably relies on build_ui() resolving run_inference /
# live_run_inference through the module's globals at call time — confirm in
# src.ui.gradio_app.
demo = _ga.build_ui()

# Launch the app only when this file is executed as a script; a plain import
# just exposes `demo`.
if __name__ == "__main__":
    demo.launch()