Flight-Transit-Agent

Running on Zero

File size: 5,972 Bytes

"""openbmb/MiniCPM5-1B (safetensors) wrapper via transformers, ZeroGPU-ready.

On HuggingFace ZeroGPU Spaces:
  * `spaces` is imported before torch, and the actual generation runs inside a
    `@spaces.GPU` function (the GPU is attached only for that call).
  * the model is placed on `cuda` at module level (ZeroGPU's CUDA emulation makes
    this work outside the decorated function), as the docs recommend.

Off ZeroGPU (local CPU/GPU, or `spaces` not installed) everything still works:
  * `@spaces.GPU` becomes a no-op, and the device falls back to a real CUDA GPU
    if present, otherwise CPU.
If anything is unavailable the app keeps running with a deterministic fallback.
"""
from __future__ import annotations

import os
import threading
import time

# IMPORTANT: import `spaces` BEFORE torch so it can patch CUDA for ZeroGPU.
try:
    import spaces  # noqa: F401
    _HAS_SPACES = True
except Exception:
    _HAS_SPACES = False

# ZeroGPU flag from the environment. A bare bool() on the raw string would treat
# "0" / "false" as enabled, so explicit falsy spellings are rejected here; any
# other non-empty value still counts as "on ZeroGPU".
_ON_ZEROGPU = os.environ.get("SPACES_ZERO_GPU", "").strip().lower() not in {
    "", "0", "f", "false", "no", "off",
}

# Lazily populated by _load(); read by _generate(), status() and complete().
_MODEL = None
_TOKENIZER = None
_DEVICE = "cpu"
# Serialises the one-time load performed by _load().
_LOAD_LOCK = threading.Lock()
# Exception from a failed load, kept so status() can report it.
_LOAD_ERROR = None

# System message that briefing() places first in every conversation it builds.
SYSTEM_PROMPT = (
    "You are FLIGHTDECK, a terse air-traffic analyst. Answer only from the live "
    "flight data you are given. Be concise and use callsigns. Never invent flights."
)


def llm_disabled() -> bool:
    """Return True when the DISABLE_LLM environment variable switches the LLM off.

    The value is compared case-insensitively after stripping whitespace, so
    ``True``, ``YES`` or ``On`` work as well as ``1``. Unset or any other value
    means the LLM stays enabled.
    """
    flag = os.environ.get("DISABLE_LLM", "0").strip().casefold()
    return flag in {"1", "true", "yes", "on"}


def _model_id() -> str:
    """Return the model repo id: LLM_REPO if set, else the default safetensors repo.

    An empty or whitespace-only LLM_REPO (e.g. a variable left blank in the
    deployment settings) is treated as unset instead of being passed on as "".
    """
    return os.environ.get("LLM_REPO", "").strip() or "openbmb/MiniCPM5-1B"


def _gpu(fn):
    """Decorate ``fn`` with ``spaces.GPU`` when the ``spaces`` package imported.

    Without ``spaces`` the function is returned unchanged. The GPU lease length
    comes from ZEROGPU_DURATION (seconds, default 60). This runs at import time
    as a decorator, so a malformed or non-positive value falls back to the
    default instead of raising and taking the whole module down.
    """
    if not _HAS_SPACES:
        return fn
    default_duration = 60
    try:
        duration = int(os.environ.get("ZEROGPU_DURATION", str(default_duration)))
    except ValueError:
        duration = default_duration
    if duration <= 0:
        duration = default_duration
    return spaces.GPU(duration=duration)(fn)


def _apply_chat_template(messages, tokenizer) -> str:
    """Render ``messages`` into a single prompt string.

    Uses the tokenizer's own chat template when it has one. Otherwise falls back
    to a plain ``[ROLE]`` / content layout that ends with an open ``[ASSISTANT]``
    turn for the model to continue.
    """
    if not getattr(tokenizer, "chat_template", None):
        rendered = []
        for message in messages:
            role = message.get("role", "user").upper()
            body = message.get("content", "")
            rendered.append(f"[{role}]\n{body}")
        rendered.append("[ASSISTANT]\n")
        return "\n".join(rendered)
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True)


def _load():
    """Load model + tokenizer once and cache the outcome (success or failure).

    Returns the loaded model, or None when loading failed. A failure is stored
    in _LOAD_ERROR and is not retried: later calls return None immediately.
    """
    global _MODEL, _TOKENIZER, _DEVICE, _LOAD_ERROR
    # Fast path without the lock once a result (model or error) exists.
    if _MODEL is not None or _LOAD_ERROR is not None:
        return _MODEL
    with _LOAD_LOCK:
        # Re-check: another thread may have finished loading while we waited.
        if _MODEL is not None or _LOAD_ERROR is not None:
            return _MODEL
        try:
            import torch
            from transformers import AutoModelForCausalLM, AutoTokenizer

            mid = _model_id()
            # Only trust the ZeroGPU flag when `spaces` actually imported; without
            # it there is nothing to make "cuda" usable, so fall back to a real
            # GPU if present, else CPU (same condition status() reports on).
            zero_gpu = _ON_ZEROGPU and _HAS_SPACES
            want_cuda = zero_gpu or torch.cuda.is_available()
            device = "cuda" if want_cuda else "cpu"
            dtype = torch.float16 if want_cuda else torch.float32

            tokenizer = AutoTokenizer.from_pretrained(mid, trust_remote_code=True)
            model = AutoModelForCausalLM.from_pretrained(
                mid, dtype=dtype, trust_remote_code=True)
            # Placement happens here, outside the @spaces.GPU function.
            model.to(device)
            model.eval()
        except Exception as e:  # noqa: BLE001
            # Deliberate catch-all: any failure switches callers to the fallback.
            _LOAD_ERROR = e
            _MODEL = None
        else:
            # Publish only a fully loaded state. _MODEL goes last because the
            # lock-free fast path above treats a non-None _MODEL as "ready".
            _TOKENIZER = tokenizer
            _DEVICE = device
            _MODEL = model
    return _MODEL


@_gpu
def _generate(prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> str:
    """Generate a continuation of ``prompt`` and return only the new text.

    Expects _load() to have succeeded, since it reads _TOKENIZER, _MODEL and
    _DEVICE. A temperature of 0 or below selects greedy decoding; ``top_p`` and
    ``temperature`` are forwarded only when sampling, because they are
    sampling-only flags and transformers warns when they accompany
    ``do_sample=False``.
    """
    import torch
    sampling = temperature > 0
    inputs = _TOKENIZER(prompt, return_tensors="pt").to(_DEVICE)
    gen_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=sampling,
        pad_token_id=_TOKENIZER.eos_token_id,
    )
    if sampling:
        gen_kwargs["temperature"] = temperature
        gen_kwargs["top_p"] = top_p
    with torch.no_grad():
        out = _MODEL.generate(**inputs, **gen_kwargs)
    # The output holds prompt + continuation; drop the prompt tokens.
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = out[0][prompt_len:]
    return _TOKENIZER.decode(new_tokens, skip_special_tokens=True)


def status() -> str:
    """Describe the LLM backend state in one human-readable line.

    Reports, in order of precedence: disabled by DISABLE_LLM, failed to load,
    not loaded yet, or online together with the device mode.
    """
    if llm_disabled():
        return "LLM disabled (DISABLE_LLM=1)."
    # Last path segment of the repo id, e.g. "org/name" -> "name".
    name = _model_id().rsplit("/", 1)[-1]
    if _LOAD_ERROR is not None:
        kind = type(_LOAD_ERROR).__name__
        return f"{name} unavailable: {kind}: {_LOAD_ERROR}"
    if _MODEL is None:
        return f"{name} not loaded yet (loads on first query)."
    if _HAS_SPACES and _ON_ZEROGPU:
        mode = "ZeroGPU"
    else:
        mode = _DEVICE.upper()
    return f"{name} online ({mode})."


def available() -> bool:
    """Return True when the LLM is enabled and its model loaded successfully.

    May trigger the one-time model load; the load is skipped entirely when the
    LLM is disabled.
    """
    return (not llm_disabled()) and _load() is not None


def complete(messages, *, max_tokens=512, temperature=0.2, top_p=0.9):
    """Chat completion used by the agents. Returns ``(text, latency_ms)``.

    Args:
        messages: chat messages as dicts with "role" and "content" keys.
        max_tokens: upper bound on newly generated tokens.
        temperature: sampling temperature; 0 or below means greedy decoding.
        top_p: nucleus-sampling cutoff, forwarded to the generator.

    Raises:
        RuntimeError: when the LLM is disabled via DISABLE_LLM or the model
            could not be loaded; the message is the current status() line.
    """
    # Honour DISABLE_LLM here too, matching available() and briefing(), so a
    # disabled deployment never loads the model through this entry point.
    if llm_disabled() or _load() is None:
        raise RuntimeError(status())
    prompt = _apply_chat_template(messages, _TOKENIZER)
    # perf_counter is monotonic; the wall clock can jump and skew the latency.
    started = time.perf_counter()
    text = _generate(prompt, int(max_tokens), float(temperature), float(top_p))
    elapsed_ms = int((time.perf_counter() - started) * 1000)
    return str(text).strip(), elapsed_ms


def _fallback(question: str, context: str) -> str:
    """Build the offline answer: the question and raw context, plus an enable hint."""
    header = "[AI offline — raw readout]"
    hint = "(Enable the model — transformers + torch — for natural-language briefings.)"
    return f"{header}\nQ: {question}\n\n{context}\n\n{hint}"


def briefing(question: str, context: str, max_tokens: int = 512) -> str:
    """Answer ``question`` from ``context`` with the LLM, or fall back to a raw readout.

    The fallback text is returned when the LLM is disabled, when the model is
    not available, or when the completion raises; in the last case the error
    message is appended to the context shown.
    """
    llm_ready = not llm_disabled() and _load() is not None
    if not llm_ready:
        return _fallback(question, context)
    user_turn = f"LIVE FLIGHT DATA:\n{context}\n\nQUESTION: {question}"
    chat = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_turn},
    ]
    try:
        reply, _latency_ms = complete(chat, max_tokens=max_tokens, temperature=0.4)
    except Exception as err:  # noqa: BLE001
        # Best effort by design: any model failure degrades to the raw readout.
        return _fallback(question, f"{context}\n\n(LLM error: {err})")
    return reply


# Eager-load at import time on ZeroGPU, where placing the model at startup is the
# recommended pattern. Everywhere else the model stays lazy so that importing
# this module (and running tests) remains fast.
if not llm_disabled() and _ON_ZEROGPU:
    _load()