Spaces:
Running on Zero
Running on Zero
| """openbmb/MiniCPM5-1B (safetensors) wrapper via transformers, ZeroGPU-ready. | |
| On HuggingFace ZeroGPU Spaces: | |
| * `spaces` is imported before torch, and the actual generation runs inside a | |
| `@spaces.GPU` function (the GPU is attached only for that call). | |
| * the model is placed on `cuda` at module level (ZeroGPU's CUDA emulation makes | |
| this work outside the decorated function), as the docs recommend. | |
| Off ZeroGPU (local CPU/GPU, or `spaces` not installed) everything still works: | |
| * `@spaces.GPU` becomes a no-op, and the device falls back to a real CUDA GPU | |
| if present, otherwise CPU. | |
| If anything is unavailable the app keeps running with a deterministic fallback. | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import threading | |
| import time | |
# IMPORTANT: import `spaces` BEFORE torch so it can patch CUDA for ZeroGPU.
# The import is best-effort: any failure only marks the package as absent.
try:
    import spaces  # noqa: F401
    _HAS_SPACES = True
except Exception:
    _HAS_SPACES = False

# True when the SPACES_ZERO_GPU environment variable holds any non-empty value.
_ON_ZEROGPU = bool(os.environ.get("SPACES_ZERO_GPU"))

# Shared model and tokenizer; both are assigned by _load().
_MODEL = None
_TOKENIZER = None
# Device string for the model and the input tensors; _load() may set it to "cuda".
_DEVICE = "cpu"
# Held by _load() while it loads, so concurrent first calls load only once.
_LOAD_LOCK = threading.Lock()
# Exception captured from a failed load; once set, _load() does not retry.
_LOAD_ERROR = None

# System message that briefing() places at the start of every conversation.
SYSTEM_PROMPT = (
    "You are FLIGHTDECK, a terse air-traffic analyst. Answer only from the live "
    "flight data you are given. Be concise and use callsigns. Never invent flights."
)
def llm_disabled() -> bool:
    """Return True when the DISABLE_LLM environment variable opts out of the LLM.

    The value is trimmed and compared case-insensitively, so ``1``, ``true``,
    ``True`` and ``YES`` all disable the model. An unset variable or any other
    value leaves the LLM enabled.
    """
    flag = os.environ.get("DISABLE_LLM", "0").strip().lower()
    return flag in {"1", "true", "yes"}
def _model_id() -> str:
    """Return the model repo id to load; the LLM_REPO variable overrides the default."""
    # Default is the safetensors repo loaded through transformers.
    default_repo = "openbmb/MiniCPM5-1B"
    return os.getenv("LLM_REPO", default_repo)
def _gpu(fn):
    """Decorate ``fn`` with ``spaces.GPU`` when the `spaces` package was imported.

    The ``duration`` argument is read from ZEROGPU_DURATION (default ``60``).
    When `spaces` is not available, ``fn`` is returned unchanged.
    """
    if not _HAS_SPACES:
        return fn
    seconds = int(os.environ.get("ZEROGPU_DURATION", "60"))
    decorate = spaces.GPU(duration=seconds)
    return decorate(fn)
def _apply_chat_template(messages, tokenizer) -> str:
    """Render chat ``messages`` into a single prompt string.

    Uses the tokenizer's own chat template when it defines one. Otherwise
    falls back to a plain ``[ROLE]`` header followed by the content for each
    message, ending with an open ``[ASSISTANT]`` turn.
    """
    if not getattr(tokenizer, "chat_template", None):
        # No template available: build one "[ROLE]\ncontent" section per message.
        sections = []
        for msg in messages:
            role = msg.get("role", "user").upper()
            body = msg.get("content", "")
            sections.append(f"[{role}]\n{body}")
        sections.append("[ASSISTANT]\n")
        return "\n".join(sections)
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
def _load():
    """Return the shared model, loading it and its tokenizer on first use.

    A failed attempt is stored in ``_LOAD_ERROR`` and is not retried; in that
    case this call and every later one return ``None``.
    """
    global _MODEL, _TOKENIZER, _DEVICE, _LOAD_ERROR

    # Fast path without the lock: an earlier call already succeeded or failed.
    if not (_MODEL is None and _LOAD_ERROR is None):
        return _MODEL

    with _LOAD_LOCK:
        # Re-check under the lock: another thread may have finished meanwhile.
        if not (_MODEL is None and _LOAD_ERROR is None):
            return _MODEL
        try:
            # Imported here so a missing torch/transformers is recorded as a
            # load error instead of breaking the module import.
            import torch
            from transformers import AutoModelForCausalLM, AutoTokenizer

            repo = _model_id()
            use_cuda = _ON_ZEROGPU or torch.cuda.is_available()
            if use_cuda:
                _DEVICE = "cuda"
                weights_dtype = torch.float16
            else:
                _DEVICE = "cpu"
                weights_dtype = torch.float32

            _TOKENIZER = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
            net = AutoModelForCausalLM.from_pretrained(
                repo, dtype=weights_dtype, trust_remote_code=True
            )
            # Module-level cuda placement (works via ZeroGPU CUDA emulation).
            net.to(_DEVICE)
            net.eval()
        except Exception as exc:  # noqa: BLE001
            _LOAD_ERROR = exc
            _MODEL = None
        else:
            _MODEL = net
        return _MODEL
@_gpu
def _generate(prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> str:
    """Generate a continuation of ``prompt`` and return only the new text.

    This is the only function that runs the model, so it is the one wrapped
    with ``_gpu``: on ZeroGPU the GPU is attached for the duration of this call.

    Args:
        prompt: Fully rendered prompt text.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature; values <= 0 select greedy decoding.
        top_p: Nucleus-sampling threshold, used only when sampling.

    Returns:
        The decoded completion (prompt tokens excluded, special tokens skipped).
    """
    import torch

    inputs = _TOKENIZER(prompt, return_tensors="pt").to(_DEVICE)
    sampling = temperature > 0
    gen_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=sampling,
        pad_token_id=_TOKENIZER.eos_token_id,
    )
    if sampling:
        # Sampling-only knobs: passing them with do_sample=False makes
        # transformers complain about unused generation flags.
        gen_kwargs["temperature"] = temperature
        gen_kwargs["top_p"] = top_p
    with torch.no_grad():
        out = _MODEL.generate(**inputs, **gen_kwargs)
    # The output starts with the prompt tokens; keep only what was generated.
    prompt_len = inputs["input_ids"].shape[1]
    return _TOKENIZER.decode(out[0][prompt_len:], skip_special_tokens=True)
def status() -> str:
    """Return a one-line, human-readable description of the LLM state."""
    # Short model name: the part of the repo id after the last "/".
    label = _model_id().rsplit("/", 1)[-1]

    if llm_disabled():
        return "LLM disabled (DISABLE_LLM=1)."

    failure = _LOAD_ERROR
    if failure is not None:
        return f"{label} unavailable: {type(failure).__name__}: {failure}"

    if _MODEL is None:
        return f"{label} not loaded yet (loads on first query)."

    if _HAS_SPACES and _ON_ZEROGPU:
        mode = "ZeroGPU"
    else:
        mode = _DEVICE.upper()
    return f"{label} online ({mode})."
def available() -> bool:
    """Report whether the model is usable, loading it first if needed.

    Returns False without attempting a load when the LLM is disabled.
    """
    return (not llm_disabled()) and _load() is not None
def complete(messages, *, max_tokens=512, temperature=0.2, top_p=0.9):
    """Chat completion used by the agents.

    Args:
        messages: Chat messages (dicts with ``role`` / ``content``).
        max_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature forwarded to generation.
        top_p: Nucleus-sampling threshold forwarded to generation.

    Returns:
        ``(text, latency_ms)``: the stripped completion and the generation
        time in whole milliseconds.

    Raises:
        RuntimeError: If the model could not be loaded; the message is the
            current ``status()`` string.
    """
    if _load() is None:
        raise RuntimeError(status())
    prompt = _apply_chat_template(messages, _TOKENIZER)
    # perf_counter is monotonic, so the latency cannot jump or go negative
    # when the system clock is adjusted mid-generation.
    started = time.perf_counter()
    text = _generate(prompt, int(max_tokens), float(temperature), float(top_p))
    latency_ms = int((time.perf_counter() - started) * 1000)
    return str(text).strip(), latency_ms
def _fallback(question: str, context: str) -> str:
    """Build the deterministic offline reply: the question plus the raw context."""
    banner = "[AI offline — raw readout]"
    hint = "(Enable the model — transformers + torch — for natural-language briefings.)"
    return f"{banner}\nQ: {question}\n\n{context}\n\n{hint}"
def briefing(question: str, context: str, max_tokens: int = 512) -> str:
    """Answer ``question`` from ``context`` using the LLM.

    Returns the raw-readout fallback when the LLM is disabled, cannot be
    loaded, or raises during completion (the error text is appended to the
    context in that last case).
    """
    model_ready = not llm_disabled() and _load() is not None
    if not model_ready:
        return _fallback(question, context)

    user_turn = f"LIVE FLIGHT DATA:\n{context}\n\nQUESTION: {question}"
    chat = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_turn},
    ]
    try:
        answer, _latency_ms = complete(chat, max_tokens=max_tokens, temperature=0.4)
    except Exception as exc:  # noqa: BLE001
        return _fallback(question, f"{context}\n\n(LLM error: {exc})")
    return answer
# ZeroGPU recommends placing the model at startup (not lazily). On ZeroGPU we
# eager-load; locally we stay lazy so imports/tests remain fast.
# A failed eager load does not raise here: _load() records the error instead.
if _ON_ZEROGPU and not llm_disabled():
    _load()