FrogQuest / llm.py
VirusDumb's picture
Big Leagues Calling
c6815eb
Raw
History Blame Contribute Delete
7.15 kB
"""Nemotron Nano 4B (text-only) -> raw quest JSON. Pluggable GPU backend.
FROGQUEST_BACKEND selects WHERE the GPU work runs (the public functions are identical either way):
- "zerogpu" (default): construct the Llama via llama.cpp INSIDE a @spaces.GPU function on the
HF Space's ZeroGPU. (First call ~60-90s, then disk-cached & fast.)
- "modal": forward to a deployed Modal class (see modal_app.py); the Space itself runs on
CPU-basic and imports NOTHING heavy here.
The LLM's job is ONLY to write JSON to the contract in schema.py. Output is constrained with a
JSON-schema response_format and then validated/clamped by the caller. Shared prompts / the JSON
extractor / model config live in gpu_shared.py so both backends stay in lockstep.
"""
from __future__ import annotations
import os
os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1") # MUST precede huggingface_hub import
BACKEND = os.environ.get("FROGQUEST_BACKEND", "zerogpu").lower()
if BACKEND != "modal": # the local/ZeroGPU path (default + any unrecognized value) needs the decorator
import spaces # noqa: E402
from schema import CAMPAIGN_RESPONSE_SCHEMA, INTENT_SCHEMA, RESPONSE_SCHEMA # noqa: E402
from gpu_shared import ( # noqa: E402
CAMPAIGN_SYSTEM_PROMPT,
GGUF_FILE,
GGUF_REPO,
INTENT_SYSTEM_PROMPT,
LOW_VRAM_GB,
N_CTX,
N_CTX_SMALL,
SYSTEM_PROMPT,
extract_json,
preload_cuda_libs,
)
# Best-effort: warm the HF cache at startup so the FIRST @spaces.GPU call doesn't spend its
# (metered, on ZeroGPU) duration downloading ~4GB. Local-path only — on a CPU-basic Space (modal
# backend) we must NOT download the GGUF. Never fatal: if the warm-up fails (offline, fresh local
# checkout, ...) the failure is logged and start-up continues.
if BACKEND != "modal":
    import fnmatch
    import logging

    try:
        from huggingface_hub import hf_hub_download, list_repo_files

        # Select the file with the same GGUF_FILE glob that _get_llm() hands to
        # Llama.from_pretrained, so the file warmed here is the one that is loaded later
        # (instead of a separately hard-coded quantization tag that could drift).
        _gguf = next(
            (f for f in list_repo_files(GGUF_REPO) if fnmatch.fnmatch(f, GGUF_FILE)),
            None,
        )
        if _gguf:
            hf_hub_download(GGUF_REPO, _gguf)
    except Exception as exc:  # deliberate best-effort boundary: log, never crash the import
        logging.getLogger(__name__).warning("GGUF cache warm-up skipped: %s", exc)
_llm = None  # module-wide Llama instance, created on first use by _get_llm()


def _get_llm():
    """Return the shared Llama model, constructing it on the GPU the first time.

    Must run inside a @spaces.GPU function. The first call downloads the GGUF and
    disk-caches it, so subsequent calls are fast; once built, the same instance is
    returned on every call.
    """
    global _llm
    if _llm is not None:
        return _llm

    # The prebuilt CUDA llama-cpp-python wheel links libcudart.so.12 / libcublas etc., which
    # ship inside the nvidia-*-cu12 packages torch pulls in but are NOT on the loader path.
    # Without help you get "libcudart.so.12: cannot open shared object file".
    #   1) importing torch loads many of them RTLD_GLOBAL;
    #   2) belt-and-suspenders: explicitly preload the nvidia-* CUDA libs too.
    import torch  # noqa: F401

    preload_cuda_libs()
    from llama_cpp import Llama

    # Pick the context window from the total memory of device 0 (0 when CUDA is unavailable).
    if torch.cuda.is_available():
        vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    else:
        vram_gb = 0
    if vram_gb >= LOW_VRAM_GB:
        context_len = N_CTX
    else:
        context_len = N_CTX_SMALL

    _llm = Llama.from_pretrained(
        repo_id=GGUF_REPO,
        filename=GGUF_FILE,  # glob -> resolves the exact Q8_0 file (warmed at import)
        n_gpu_layers=-1,  # offload all layers (Q8 4B ~4.3GB fits even on a T4)
        n_ctx=context_len,
        verbose=False,
    )
    return _llm
# ----------------------------- local (in-Space, ZeroGPU) implementations -----------------------------
def _generate_quests_local(todos: str, theme: str) -> dict:
    """Turn a to-do list into the model's raw quest JSON object.

    The result is UNVALIDATED - the caller must run validate_and_clamp on it.

    Args:
        todos: Free-text to-do list / goals; surrounding whitespace is stripped.
        theme: Theme name, substituted for "{theme}" in the system prompt and
            repeated in the user message.

    Returns:
        Whatever extract_json yields for the first completion choice's content.
    """
    model = _get_llm()
    system_prompt = SYSTEM_PROMPT.replace("{theme}", theme)
    user_prompt = f"Theme: {theme}\nMy to-do list / goals:\n{todos.strip()}"
    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = model.create_chat_completion(
        messages=chat,
        # Constrain decoding to the quest schema.
        response_format={"type": "json_object", "schema": RESPONSE_SCHEMA},
        temperature=0.0,
        max_tokens=4096,
    )
    first_choice = completion["choices"][0]
    return extract_json(first_choice["message"]["content"])
def _generate_campaign_local(goal: str, theme: str, snippets: str = "") -> dict:
    """Turn one long-term goal (+ optional research snippets) into raw campaign JSON.

    The result is UNVALIDATED - the caller must run validate_campaign on it.

    Args:
        goal: The long-term goal; surrounding whitespace is stripped.
        theme: Theme name, substituted for "{theme}" in the system prompt and
            repeated in the user message.
        snippets: Optional research notes. Empty, whitespace-only or None values
            are left out of the prompt entirely.

    Returns:
        Whatever extract_json yields for the first completion choice's content.
    """
    model = _get_llm()
    system_prompt = CAMPAIGN_SYSTEM_PROMPT.replace("{theme}", theme)

    sections = [f"Theme: {theme}\nLong-term goal:\n{goal.strip()}"]
    notes = (snippets or "").strip()
    if notes:
        sections.append(f"Research notes:\n{notes}")
    user_prompt = "\n\n".join(sections)

    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = model.create_chat_completion(
        messages=chat,
        # Constrain decoding to the campaign schema.
        response_format={"type": "json_object", "schema": CAMPAIGN_RESPONSE_SCHEMA},
        temperature=0.0,
        max_tokens=4096,
    )
    first_choice = completion["choices"][0]
    return extract_json(first_choice["message"]["content"])
def _route_intent_local(message: str, context: str) -> dict:
    """Classify one Frog Master chat message into {intent, target_task?, reason?}.

    Args:
        message: The user's chat message; surrounding whitespace is stripped.
        context: A SHORT text summary of the current log (does a log exist + quest
            titles/ids/status) - never images (CLAUDE.md rule).

    Returns:
        The parsed object when it is a dict whose "intent" is one of the known
        intents; otherwise the fallback {"intent": "unknown"}.
    """
    model = _get_llm()
    user_prompt = f"Context:\n{context.strip()}\n\nUser message:\n{message.strip()}"
    chat = [
        {"role": "system", "content": INTENT_SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ]
    completion = model.create_chat_completion(
        messages=chat,
        # Constrain decoding to the intent schema; the answer is tiny, so cap tokens low.
        response_format={"type": "json_object", "schema": INTENT_SCHEMA},
        temperature=0.0,
        max_tokens=256,
    )
    result = extract_json(completion["choices"][0]["message"]["content"])

    known_intents = ("forge", "add_tasks", "mark_done", "mark_couldnt", "unknown")
    if isinstance(result, dict) and result.get("intent") in known_intents:
        return result
    # Anything that is not a dict with a recognised intent collapses to "unknown".
    return {"intent": "unknown"}
# ----------------------------- modal (off-Space) wrappers -----------------------------
def _generate_quests_modal(todos: str, theme: str) -> dict:
    """Forward quest generation to the "LLM" class of the deployed "frogquest" Modal app."""
    import modal  # imported lazily so the module import itself never needs modal

    remote_cls = modal.Cls.from_name("frogquest", "LLM")
    worker = remote_cls()
    return worker.generate_quests.remote(todos, theme)
def _generate_campaign_modal(goal: str, theme: str, snippets: str = "") -> dict:
    """Forward campaign generation to the "LLM" class of the deployed "frogquest" Modal app."""
    import modal  # imported lazily so the module import itself never needs modal

    remote_cls = modal.Cls.from_name("frogquest", "LLM")
    worker = remote_cls()
    return worker.generate_campaign.remote(goal, theme, snippets)
def _route_intent_modal(message: str, context: str) -> dict:
    """Forward intent routing to the "LLM" class of the deployed "frogquest" Modal app."""
    import modal  # imported lazily so the module import itself never needs modal

    remote_cls = modal.Cls.from_name("frogquest", "LLM")
    worker = remote_cls()
    return worker.route_intent.remote(message, context)
# ----------------------------- bind public names from the backend -----------------------------
# app.py imports these by name; signatures are identical across backends.
if BACKEND != "modal":
    # Local / ZeroGPU path: wrap each local implementation in its own spaces.GPU decorator,
    # with a per-function `duration` (shorter for the small intent-routing call).
    generate_quests_raw = spaces.GPU(duration=70)(_generate_quests_local)
    generate_campaign_raw = spaces.GPU(duration=70)(_generate_campaign_local)
    route_intent = spaces.GPU(duration=45)(_route_intent_local)
else:
    # Modal path: the public names are the thin remote-forwarding wrappers, undecorated.
    generate_quests_raw, generate_campaign_raw, route_intent = (
        _generate_quests_modal,
        _generate_campaign_modal,
        _route_intent_modal,
    )