"""Nemotron Nano 4B (text-only) -> raw quest JSON. Pluggable GPU backend. FROGQUEST_BACKEND selects WHERE the GPU work runs (the public functions are identical either way): - "zerogpu" (default): construct the Llama via llama.cpp INSIDE a @spaces.GPU function on the HF Space's ZeroGPU. (First call ~60-90s, then disk-cached & fast.) - "modal": forward to a deployed Modal class (see modal_app.py); the Space itself runs on CPU-basic and imports NOTHING heavy here. The LLM's job is ONLY to write JSON to the contract in schema.py. Output is constrained with a JSON-schema response_format and then validated/clamped by the caller. Shared prompts / the JSON extractor / model config live in gpu_shared.py so both backends stay in lockstep. """ from __future__ import annotations import os os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1") # MUST precede huggingface_hub import BACKEND = os.environ.get("FROGQUEST_BACKEND", "zerogpu").lower() if BACKEND != "modal": # the local/ZeroGPU path (default + any unrecognized value) needs the decorator import spaces # noqa: E402 from schema import CAMPAIGN_RESPONSE_SCHEMA, INTENT_SCHEMA, RESPONSE_SCHEMA # noqa: E402 from gpu_shared import ( # noqa: E402 CAMPAIGN_SYSTEM_PROMPT, GGUF_FILE, GGUF_REPO, INTENT_SYSTEM_PROMPT, LOW_VRAM_GB, N_CTX, N_CTX_SMALL, SYSTEM_PROMPT, extract_json, preload_cuda_libs, ) # Best-effort: warm the HF cache at startup so the FIRST @spaces.GPU call doesn't spend its # (metered, on ZeroGPU) duration downloading ~4GB. Local-path only — on a CPU-basic Space (modal # backend) we must NOT download the GGUF. No-op if offline or on a fresh local checkout. 
if BACKEND != "modal":
    try:
        from huggingface_hub import hf_hub_download, list_repo_files

        # NOTE(review): the file is picked here by the "Q8_0" substring, while _get_llm() resolves
        # it through the GGUF_FILE glob - keep the two in agreement, otherwise this warm-up
        # caches a file the loader never uses.
        _gguf = next(
            (f for f in list_repo_files(GGUF_REPO) if "Q8_0" in f and f.endswith(".gguf")),
            None,
        )
        if _gguf:
            hf_hub_download(GGUF_REPO, _gguf)
    except Exception as _warm_exc:  # deliberately broad: the warm-up must never break import
        import logging

        # Still best-effort, but leave a trace so a cold first GPU call can be explained.
        logging.getLogger(__name__).info("GGUF cache warm-up skipped: %r", _warm_exc)


# Process-wide Llama instance, created on first use by _get_llm().
_llm = None

# Generation budgets passed as max_tokens to create_chat_completion.
_MAX_TOKENS_GENERATE = 4096
_MAX_TOKENS_INTENT = 256

# Intents the router may return. Kept as a tuple (not a set) so that an unhashable value coming
# back from the model is simply "not in" instead of raising TypeError.
_VALID_INTENTS = ("forge", "add_tasks", "mark_done", "mark_couldnt", "unknown")

# Name of the deployed Modal app and of the class inside it (see modal_app.py).
_MODAL_APP = "frogquest"
_MODAL_CLS = "LLM"


def _get_llm():
    """Lazily download + construct the Llama model on the GPU (must run inside @spaces.GPU).

    First call downloads the GGUF then disk-caches it, so later calls are fast. The instance is
    memoized in the module-level ``_llm`` and returned on every subsequent call.
    """
    global _llm
    if _llm is None:
        # The prebuilt CUDA llama-cpp-python wheel links libcudart.so.12 / libcublas etc., which
        # ship inside the nvidia-*-cu12 packages torch pulls in but are NOT on the loader path.
        # Without help you get "libcudart.so.12: cannot open shared object file".
        #   1) importing torch loads many of them RTLD_GLOBAL;
        #   2) belt-and-suspenders: explicitly preload the nvidia-* CUDA libs too.
        import torch

        preload_cuda_libs()
        from llama_cpp import Llama

        # Pick the context size from the VRAM of device 0 (0 GB when CUDA is unavailable,
        # which selects the small context).
        vram_gb = (
            torch.cuda.get_device_properties(0).total_memory / 1e9
            if torch.cuda.is_available()
            else 0
        )
        n_ctx = N_CTX if vram_gb >= LOW_VRAM_GB else N_CTX_SMALL
        _llm = Llama.from_pretrained(
            repo_id=GGUF_REPO,
            filename=GGUF_FILE,  # glob -> resolves the exact Q8_0 file (warmed at import)
            n_gpu_layers=-1,  # offload all layers (Q8 4B ~4.3GB fits even on a T4)
            n_ctx=n_ctx,
            verbose=False,
        )
    return _llm


def _chat_content(system: str, user: str, schema: dict, max_tokens: int):
    """Run one schema-constrained, temperature-0 chat completion and return the reply text.

    Args:
        system: system prompt.
        user: user message.
        schema: JSON schema handed to ``response_format`` to constrain the output.
        max_tokens: generation budget for this call.

    Returns:
        The ``content`` of the first choice's message, unparsed.
    """
    out = _get_llm().create_chat_completion(
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        response_format={"type": "json_object", "schema": schema},
        temperature=0.0,
        max_tokens=max_tokens,
    )
    return out["choices"][0]["message"]["content"]


# ----------------------------- local (in-Space, ZeroGPU) implementations -----------------------------
def _generate_quests_local(todos: str, theme: str) -> dict:
    """Return the model's raw JSON object (UNVALIDATED - caller must validate_and_clamp).

    Args:
        todos: free-text to-do list / goals; surrounding whitespace is stripped.
        theme: theme name, substituted for ``{theme}`` in the system prompt and repeated in the
            user message.
    """
    system = SYSTEM_PROMPT.replace("{theme}", theme)
    user = f"Theme: {theme}\nMy to-do list / goals:\n{todos.strip()}"
    return extract_json(_chat_content(system, user, RESPONSE_SCHEMA, _MAX_TOKENS_GENERATE))


def _generate_campaign_local(goal: str, theme: str, snippets: str = "") -> dict:
    """One long-term goal (+ optional research snippets) -> raw campaign JSON.

    The result is UNVALIDATED - caller must validate_campaign.

    Args:
        goal: the long-term goal; surrounding whitespace is stripped.
        theme: theme name, substituted for ``{theme}`` in the system prompt and repeated in the
            user message.
        snippets: optional research notes; appended to the user message only when non-blank
            (``None`` is tolerated and treated as empty).
    """
    system = CAMPAIGN_SYSTEM_PROMPT.replace("{theme}", theme)
    user = f"Theme: {theme}\nLong-term goal:\n{goal.strip()}"
    if (snippets or "").strip():
        user += f"\n\nResearch notes:\n{snippets.strip()}"
    return extract_json(
        _chat_content(system, user, CAMPAIGN_RESPONSE_SCHEMA, _MAX_TOKENS_GENERATE)
    )


def _route_intent_local(message: str, context: str) -> dict:
    """Classify one Frog Master chat message into {intent, target_task?, reason?}.

    `context` is a SHORT text summary of the current log (does a log exist + quest titles/ids/
    status) - never images (CLAUDE.md rule). Falls back to {"intent": "unknown"} on bad output.

    Args:
        message: the user's chat message; surrounding whitespace is stripped.
        context: short text summary of the current log; surrounding whitespace is stripped.

    Returns:
        The parsed dict when its ``intent`` is one of the known intents, otherwise
        ``{"intent": "unknown"}``.
    """
    user = f"Context:\n{context.strip()}\n\nUser message:\n{message.strip()}"
    raw = _chat_content(INTENT_SYSTEM_PROMPT, user, INTENT_SCHEMA, _MAX_TOKENS_INTENT)
    try:
        parsed = extract_json(raw)
    except (ValueError, TypeError):
        # NOTE(review): extract_json lives in gpu_shared; presumably it signals unparseable text
        # with ValueError (json.JSONDecodeError is one) or TypeError on a missing reply - confirm.
        # Only the parse step is guarded so genuine model/runtime errors still surface.
        return {"intent": "unknown"}
    if not isinstance(parsed, dict) or parsed.get("intent") not in _VALID_INTENTS:
        return {"intent": "unknown"}
    return parsed


# ----------------------------- modal (off-Space) wrappers -----------------------------
def _modal_llm():
    """Return a handle to the deployed Modal LLM class (``modal`` is imported lazily)."""
    import modal

    return modal.Cls.from_name(_MODAL_APP, _MODAL_CLS)()


def _generate_quests_modal(todos: str, theme: str) -> dict:
    """Forward quest generation to the Modal deployment; same signature as the local variant."""
    return _modal_llm().generate_quests.remote(todos, theme)


def _generate_campaign_modal(goal: str, theme: str, snippets: str = "") -> dict:
    """Forward campaign generation to the Modal deployment; same signature as the local variant."""
    return _modal_llm().generate_campaign.remote(goal, theme, snippets)


def _route_intent_modal(message: str, context: str) -> dict:
    """Forward intent routing to the Modal deployment; same signature as the local variant."""
    return _modal_llm().route_intent.remote(message, context)


# ----------------------------- bind public names from the backend -----------------------------
# app.py imports these by name; signatures are identical across backends.
if BACKEND == "modal":
    generate_quests_raw = _generate_quests_modal
    generate_campaign_raw = _generate_campaign_modal
    route_intent = _route_intent_modal
else:
    # Local path: each entry point is wrapped with spaces.GPU and its own duration budget.
    generate_quests_raw = spaces.GPU(duration=70)(_generate_quests_local)
    generate_campaign_raw = spaces.GPU(duration=70)(_generate_campaign_local)
    route_intent = spaces.GPU(duration=45)(_route_intent_local)