# Source: Hugging Face Space file view (uploader: quaz93), 5.97 kB.
# Commit 88d4864: "Switch default LLM to openbmb/MiniCPM5-1B".
"""openbmb/MiniCPM5-1B (safetensors) wrapper via transformers, ZeroGPU-ready.

On HuggingFace ZeroGPU Spaces:
  * `spaces` is imported before torch, and the actual generation runs inside a
    `@spaces.GPU` function (the GPU is attached only for that call).
  * the model is placed on `cuda` at module level (ZeroGPU's CUDA emulation makes
    this work outside the decorated function), as the docs recommend.

Off ZeroGPU (local CPU/GPU, or `spaces` not installed) everything still works:
  * `@spaces.GPU` becomes a no-op, and the device falls back to a real CUDA GPU
    if present, otherwise CPU.

If the model or its dependencies cannot be loaded, the app keeps running with a
deterministic fallback.
"""
from __future__ import annotations
import os
import threading
import time
# IMPORTANT: `spaces` has to be imported BEFORE torch so it can patch CUDA for
# ZeroGPU. Any import failure simply marks the package as unavailable.
try:
    import spaces  # noqa: F401
except Exception:
    _HAS_SPACES = False
else:
    _HAS_SPACES = True
# True whenever the SPACES_ZERO_GPU environment variable is set to a non-empty value.
_ON_ZEROGPU = bool(os.getenv("SPACES_ZERO_GPU"))

# Lazily populated by _load(); they stay None until a load has succeeded.
_MODEL = None
_TOKENIZER = None
# Device string used for the model and its inputs; _load() overwrites it.
_DEVICE = "cpu"
# Serialises the one-time load performed by _load().
_LOAD_LOCK = threading.Lock()
# Exception raised by a failed load; once set, _load() does not retry.
_LOAD_ERROR = None

# System message that briefing() puts in front of every question.
SYSTEM_PROMPT = (
    "You are FLIGHTDECK, a terse air-traffic analyst. "
    "Answer only from the live flight data you are given. "
    "Be concise and use callsigns. Never invent flights."
)
def llm_disabled() -> bool:
    """Return True when the DISABLE_LLM environment variable switches the LLM off.

    The value is trimmed and compared case-insensitively, so ``1``, ``true``,
    ``True``, ``YES`` and similar spellings all disable the model. An unset
    variable, or any other value, leaves the LLM enabled.
    """
    flag = os.environ.get("DISABLE_LLM", "0").strip().lower()
    return flag in {"1", "true", "yes"}
def _model_id() -> str:
    """Return the model repo id to load.

    Reads the ``LLM_REPO`` environment variable (a safetensors repo usable with
    transformers). Surrounding whitespace is ignored, and an unset, empty or
    blank value falls back to ``openbmb/MiniCPM5-1B`` rather than producing an
    empty repo id.
    """
    override = os.environ.get("LLM_REPO", "").strip()
    return override or "openbmb/MiniCPM5-1B"
def _gpu(fn):
    """Decorate *fn* with ``spaces.GPU`` when the ``spaces`` package imported.

    The requested GPU duration comes from the ``ZEROGPU_DURATION`` environment
    variable (default ``60``) and is parsed as an int. When ``spaces`` could
    not be imported, *fn* is returned untouched.
    """
    if not _HAS_SPACES:
        return fn
    seconds = int(os.environ.get("ZEROGPU_DURATION", "60"))
    decorate = spaces.GPU(duration=seconds)
    return decorate(fn)
def _apply_chat_template(messages, tokenizer) -> str:
    """Turn a list of chat messages into one prompt string.

    If the tokenizer has a truthy ``chat_template``, its own
    ``apply_chat_template`` is used (untokenized, with the generation prompt
    appended). Otherwise a plain ``[ROLE]\\ncontent`` layout is produced,
    ending with an open ``[ASSISTANT]`` section.
    """
    if not getattr(tokenizer, "chat_template", None):
        sections = []
        for msg in messages:
            role = msg.get("role", "user").upper()
            body = msg.get("content", "")
            sections.append(f"[{role}]\n{body}")
        sections.append("[ASSISTANT]\n")
        return "\n".join(sections)
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
def _load():
    """Load the model and tokenizer once and return the model (or None).

    The outcome is cached in module globals: a successful load fills
    ``_MODEL``/``_TOKENIZER``/``_DEVICE``; a failed one stores the exception
    in ``_LOAD_ERROR`` and is not retried, so later calls return None at once.
    The "already attempted" check runs once without ``_LOAD_LOCK`` and again
    after acquiring it.
    """
    global _MODEL, _TOKENIZER, _DEVICE, _LOAD_ERROR

    # Fast path: a previous call already succeeded or failed.
    if not (_MODEL is None and _LOAD_ERROR is None):
        return _MODEL

    with _LOAD_LOCK:
        # Re-check: another thread may have finished while we waited.
        if not (_MODEL is None and _LOAD_ERROR is None):
            return _MODEL
        try:
            # Imported here so the module can be imported without torch/transformers.
            import torch
            from transformers import AutoModelForCausalLM, AutoTokenizer

            repo = _model_id()
            use_cuda = _ON_ZEROGPU or torch.cuda.is_available()
            if use_cuda:
                _DEVICE = "cuda"
                weights_dtype = torch.float16
            else:
                _DEVICE = "cpu"
                weights_dtype = torch.float32

            # trust_remote_code=True lets the repo's own Python code run.
            _TOKENIZER = AutoTokenizer.from_pretrained(
                repo, trust_remote_code=True)
            loaded = AutoModelForCausalLM.from_pretrained(
                repo, dtype=weights_dtype, trust_remote_code=True)
            # Module-level cuda placement (works via ZeroGPU CUDA emulation).
            loaded.to(_DEVICE)
            loaded.eval()
            _MODEL = loaded
        except Exception as exc:  # noqa: BLE001
            # Remember the failure; callers fall back instead of retrying.
            _LOAD_ERROR = exc
            _MODEL = None
    return _MODEL
@_gpu
def _generate(prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> str:
    """Generate a completion for *prompt* and return only the newly produced text.

    The only GPU-touching function — runs on the ZeroGPU device when attached.
    Relies on ``_TOKENIZER``, ``_MODEL`` and ``_DEVICE`` having been set by
    ``_load()``.

    Args:
        prompt: Fully rendered prompt string.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature; values ``<= 0`` select greedy decoding.
        top_p: Nucleus-sampling threshold, used only when sampling.

    Returns:
        The decoded continuation, with special tokens removed and the prompt
        tokens stripped off.
    """
    import torch

    inputs = _TOKENIZER(prompt, return_tensors="pt").to(_DEVICE)
    sampling = temperature > 0
    gen_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=sampling,
        pad_token_id=_TOKENIZER.eos_token_id,
    )
    if sampling:
        # Sampling-only knobs are passed only when sampling; handing them to a
        # greedy run makes transformers warn about ignored generation flags.
        gen_kwargs["temperature"] = temperature
        gen_kwargs["top_p"] = top_p
    with torch.no_grad():
        out = _MODEL.generate(**inputs, **gen_kwargs)
    # generate() returns prompt + continuation; keep only the continuation.
    prompt_len = inputs["input_ids"].shape[1]
    return _TOKENIZER.decode(out[0][prompt_len:], skip_special_tokens=True)
def status() -> str:
    """Return a one-line, human-readable description of the LLM's state.

    Reports, in priority order: disabled via DISABLE_LLM, load failure (with
    the exception type and message), not yet loaded, or online together with
    the execution mode ("ZeroGPU" or the upper-cased device name).
    """
    # Short model name: the part of the repo id after the last "/".
    label = _model_id().rsplit("/", 1)[-1]
    if llm_disabled():
        message = "LLM disabled (DISABLE_LLM=1)."
    elif _LOAD_ERROR is not None:
        message = f"{label} unavailable: {type(_LOAD_ERROR).__name__}: {_LOAD_ERROR}"
    elif _MODEL is None:
        message = f"{label} not loaded yet (loads on first query)."
    else:
        on_zerogpu = _HAS_SPACES and _ON_ZEROGPU
        mode = "ZeroGPU" if on_zerogpu else _DEVICE.upper()
        message = f"{label} online ({mode})."
    return message
def available() -> bool:
    """Tell whether the LLM can be used right now.

    Returns False without attempting a load when the LLM is disabled;
    otherwise triggers ``_load()`` and reports whether it produced a model.
    """
    return not llm_disabled() and _load() is not None
def complete(messages, *, max_tokens=512, temperature=0.2, top_p=0.9):
    """Chat completion used by the agents.

    Args:
        messages: Chat messages (dicts with ``role``/``content``) to render
            into a prompt.
        max_tokens: Maximum number of new tokens; coerced to int.
        temperature: Sampling temperature; coerced to float.
        top_p: Nucleus-sampling threshold; coerced to float.

    Returns:
        ``(text, latency_ms)`` — the stripped completion and the generation
        time in whole milliseconds.

    Raises:
        RuntimeError: If the model could not be loaded; the message is the
            current ``status()`` string.
    """
    # NOTE(review): unlike available()/briefing(), this does not consult
    # llm_disabled() — confirm callers are expected to check first.
    if _load() is None:
        raise RuntimeError(status())
    prompt = _apply_chat_template(messages, _TOKENIZER)
    # perf_counter is monotonic, so the latency cannot go negative or jump
    # when the system clock is adjusted mid-generation.
    started = time.perf_counter()
    text = _generate(prompt, int(max_tokens), float(temperature), float(top_p))
    latency_ms = int((time.perf_counter() - started) * 1000)
    return str(text).strip(), latency_ms
def _fallback(question: str, context: str) -> str:
    """Build the deterministic "AI offline" readout.

    Echoes the question and the raw context between a fixed header and a fixed
    hint about enabling the model.
    """
    header = "[AI offline — raw readout]"
    hint = "(Enable the model — transformers + torch — for natural-language briefings.)"
    return f"{header}\nQ: {question}\n\n{context}\n\n{hint}"
def briefing(question: str, context: str, max_tokens: int = 512) -> str:
    """Answer *question* from *context*, never raising on LLM problems.

    When the LLM is disabled or cannot be loaded, the raw fallback readout is
    returned. If the completion itself fails, the fallback is returned with
    the error message appended to the context.
    """
    model_ready = not llm_disabled() and _load() is not None
    if not model_ready:
        return _fallback(question, context)

    user_turn = f"LIVE FLIGHT DATA:\n{context}\n\nQUESTION: {question}"
    chat = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_turn},
    ]
    try:
        answer, _latency_ms = complete(chat, max_tokens=max_tokens, temperature=0.4)
    except Exception as err:  # noqa: BLE001
        # Degrade to the raw readout, surfacing the error text to the user.
        return _fallback(question, f"{context}\n\n(LLM error: {err})")
    return answer
# ZeroGPU wants the model placed at startup rather than on first use, so load
# it eagerly there. Everywhere else loading stays lazy, which keeps imports and
# tests fast.
if not llm_disabled() and _ON_ZEROGPU:
    _load()