"""openbmb/MiniCPM5-1B (safetensors) wrapper via transformers, ZeroGPU-ready. On HuggingFace ZeroGPU Spaces: * `spaces` is imported before torch, and the actual generation runs inside a `@spaces.GPU` function (the GPU is attached only for that call). * the model is placed on `cuda` at module level (ZeroGPU's CUDA emulation makes this work outside the decorated function), as the docs recommend. Off ZeroGPU (local CPU/GPU, or `spaces` not installed) everything still works: * `@spaces.GPU` becomes a no-op, and the device falls back to a real CUDA GPU if present, otherwise CPU. If anything is unavailable the app keeps running with a deterministic fallback. """ from __future__ import annotations import os import threading import time # IMPORTANT: import `spaces` BEFORE torch so it can patch CUDA for ZeroGPU. try: import spaces # noqa: F401 _HAS_SPACES = True except Exception: _HAS_SPACES = False _ON_ZEROGPU = bool(os.environ.get("SPACES_ZERO_GPU")) _MODEL = None _TOKENIZER = None _DEVICE = "cpu" _LOAD_LOCK = threading.Lock() _LOAD_ERROR = None SYSTEM_PROMPT = ( "You are FLIGHTDECK, a terse air-traffic analyst. Answer only from the live " "flight data you are given. Be concise and use callsigns. Never invent flights." ) def llm_disabled() -> bool: return os.environ.get("DISABLE_LLM", "0").strip() in {"1", "true", "yes"} def _model_id() -> str: # Safetensors repo (transformers), overridable via LLM_REPO. return os.environ.get("LLM_REPO", "openbmb/MiniCPM5-1B") def _gpu(fn): """Wrap a function with @spaces.GPU on ZeroGPU; no-op everywhere else.""" if _HAS_SPACES: duration = int(os.environ.get("ZEROGPU_DURATION", "60")) return spaces.GPU(duration=duration)(fn) return fn def _apply_chat_template(messages, tokenizer) -> str: if getattr(tokenizer, "chat_template", None): return tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True) parts = [f"[{m.get('role', 'user').upper()}]\n{m.get('content', '')}" for m in messages] parts.append("[ASSISTANT]\n") return "\n".join(parts) def _load(): """Load model + tokenizer once (in the main process; ZeroGPU-safe).""" global _MODEL, _TOKENIZER, _DEVICE, _LOAD_ERROR if _MODEL is not None or _LOAD_ERROR is not None: return _MODEL with _LOAD_LOCK: if _MODEL is not None or _LOAD_ERROR is not None: return _MODEL try: import torch from transformers import AutoModelForCausalLM, AutoTokenizer mid = _model_id() want_cuda = _ON_ZEROGPU or torch.cuda.is_available() _DEVICE = "cuda" if want_cuda else "cpu" dtype = torch.float16 if want_cuda else torch.float32 _TOKENIZER = AutoTokenizer.from_pretrained(mid, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained( mid, dtype=dtype, trust_remote_code=True) # Module-level cuda placement (works via ZeroGPU CUDA emulation). model.to(_DEVICE) model.eval() _MODEL = model except Exception as e: # noqa: BLE001 _LOAD_ERROR = e _MODEL = None return _MODEL @_gpu def _generate(prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> str: """The only GPU-touching function — runs on the ZeroGPU device when attached.""" import torch inputs = _TOKENIZER(prompt, return_tensors="pt").to(_DEVICE) gen_kwargs = dict( max_new_tokens=max_new_tokens, do_sample=temperature > 0, top_p=top_p, pad_token_id=_TOKENIZER.eos_token_id, ) if temperature > 0: gen_kwargs["temperature"] = temperature with torch.no_grad(): out = _MODEL.generate(**inputs, **gen_kwargs) new_tokens = out[0][inputs["input_ids"].shape[1]:] return _TOKENIZER.decode(new_tokens, skip_special_tokens=True) def status() -> str: label = _model_id().split("/")[-1] if llm_disabled(): return "LLM disabled (DISABLE_LLM=1)." if _LOAD_ERROR is not None: return f"{label} unavailable: {type(_LOAD_ERROR).__name__}: {_LOAD_ERROR}" if _MODEL is None: return f"{label} not loaded yet (loads on first query)." mode = "ZeroGPU" if (_HAS_SPACES and _ON_ZEROGPU) else _DEVICE.upper() return f"{label} online ({mode})." def available() -> bool: if llm_disabled(): return False return _load() is not None def complete(messages, *, max_tokens=512, temperature=0.2, top_p=0.9): """Chat completion used by the agents. Returns (text, latency_ms).""" if _load() is None: raise RuntimeError(status()) prompt = _apply_chat_template(messages, _TOKENIZER) t0 = time.time() text = _generate(prompt, int(max_tokens), float(temperature), float(top_p)) return str(text).strip(), int((time.time() - t0) * 1000) def _fallback(question: str, context: str) -> str: return ( "[AI offline — raw readout]\n" f"Q: {question}\n\n{context}\n\n" "(Enable the model — transformers + torch — for natural-language briefings.)" ) def briefing(question: str, context: str, max_tokens: int = 512) -> str: if llm_disabled() or _load() is None: return _fallback(question, context) messages = [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": f"LIVE FLIGHT DATA:\n{context}\n\nQUESTION: {question}"}, ] try: text, _ = complete(messages, max_tokens=max_tokens, temperature=0.4) return text except Exception as e: # noqa: BLE001 return _fallback(question, f"{context}\n\n(LLM error: {e})") # ZeroGPU recommends placing the model at startup (not lazily). On ZeroGPU we # eager-load; locally we stay lazy so imports/tests remain fast. if _ON_ZEROGPU and not llm_disabled(): _load()