Spaces:
Running on Zero
Running on Zero
| """Nemotron Nano 4B (text-only) -> raw quest JSON. Pluggable GPU backend. | |
| FROGQUEST_BACKEND selects WHERE the GPU work runs (the public functions are identical either way): | |
| - "zerogpu" (default): construct the Llama via llama.cpp INSIDE a @spaces.GPU function on the | |
| HF Space's ZeroGPU. (First call ~60-90s, then disk-cached & fast.) | |
| - "modal": forward to a deployed Modal class (see modal_app.py); the Space itself runs on | |
| CPU-basic and imports NOTHING heavy here. | |
| The LLM's job is ONLY to write JSON to the contract in schema.py. Output is constrained with a | |
| JSON-schema response_format and then validated/clamped by the caller. Shared prompts / the JSON | |
| extractor / model config live in gpu_shared.py so both backends stay in lockstep. | |
| """ | |
| from __future__ import annotations | |
| import os | |
| os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1") # MUST precede huggingface_hub import | |
| BACKEND = os.environ.get("FROGQUEST_BACKEND", "zerogpu").lower() | |
| if BACKEND != "modal": # the local/ZeroGPU path (default + any unrecognized value) needs the decorator | |
| import spaces # noqa: E402 | |
| from schema import CAMPAIGN_RESPONSE_SCHEMA, INTENT_SCHEMA, RESPONSE_SCHEMA # noqa: E402 | |
| from gpu_shared import ( # noqa: E402 | |
| CAMPAIGN_SYSTEM_PROMPT, | |
| GGUF_FILE, | |
| GGUF_REPO, | |
| INTENT_SYSTEM_PROMPT, | |
| LOW_VRAM_GB, | |
| N_CTX, | |
| N_CTX_SMALL, | |
| SYSTEM_PROMPT, | |
| extract_json, | |
| preload_cuda_libs, | |
| ) | |
# Best-effort: warm the HF cache at startup so the FIRST @spaces.GPU call doesn't spend its
# (metered, on ZeroGPU) duration downloading ~4GB. Local-path only — on a CPU-basic Space (modal
# backend) we must NOT download the GGUF. No-op if offline or on a fresh local checkout.
if BACKEND != "modal":
    try:
        from huggingface_hub import hf_hub_download, list_repo_files

        # Pick the first Q8_0 .gguf file listed in the repo (None when nothing matches).
        _gguf = next(
            (f for f in list_repo_files(GGUF_REPO) if "Q8_0" in f and f.endswith(".gguf")),
            None,
        )
        if _gguf:
            hf_hub_download(GGUF_REPO, _gguf)
    except Exception as _warm_err:
        # Still best-effort (never fail the import), but leave a trace: a skipped warm-up
        # means the download happens later, inside the first GPU call.
        import logging

        logging.getLogger(__name__).warning(
            "GGUF cache warm-up skipped (%s: %s)", type(_warm_err).__name__, _warm_err
        )
# Process-wide model handle; populated on the first _get_llm() call.
_llm = None


def _get_llm():
    """Return the shared Llama instance, constructing it on first use.

    Must run inside @spaces.GPU. The first call goes through ``Llama.from_pretrained``
    (download + construct); every later call returns the cached module-level instance.
    """
    global _llm
    if _llm is not None:
        return _llm

    # The prebuilt CUDA llama-cpp-python wheel links libcudart.so.12 / libcublas etc., which
    # ship inside the nvidia-*-cu12 packages torch pulls in but are NOT on the loader path.
    # Without help you get "libcudart.so.12: cannot open shared object file".
    # Order matters: torch first (loads many of them RTLD_GLOBAL), then the explicit preload
    # of the nvidia-* CUDA libs, and only then llama_cpp.
    import torch
    preload_cuda_libs()
    from llama_cpp import Llama

    # Total memory of device 0 in (decimal) GB; 0 when no CUDA device is available.
    if torch.cuda.is_available():
        vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    else:
        vram_gb = 0

    # Below the low-VRAM threshold, fall back to the smaller context window.
    context_len = N_CTX_SMALL if vram_gb < LOW_VRAM_GB else N_CTX

    _llm = Llama.from_pretrained(
        repo_id=GGUF_REPO,
        filename=GGUF_FILE,  # glob -> resolves the exact Q8_0 file (warmed at import)
        n_gpu_layers=-1,  # offload all layers (Q8 4B ~4.3GB fits even on a T4)
        n_ctx=context_len,
        verbose=False,
    )
    return _llm
# ----------------------------- local (in-Space, ZeroGPU) implementations -----------------------------
def _generate_quests_local(todos: str, theme: str) -> dict:
    """Turn a to-do list into the model's raw quest JSON.

    The result is UNVALIDATED - the caller must run validate_and_clamp on it.

    Args:
        todos: Free-text to-do list / goals; surrounding whitespace is stripped.
        theme: Theme name, substituted for ``{theme}`` in the system prompt and
            repeated in the user message.

    Returns:
        Whatever ``extract_json`` yields for the first choice's message content.
    """
    model = _get_llm()
    system_text = SYSTEM_PROMPT.replace("{theme}", theme)
    user_text = f"Theme: {theme}\nMy to-do list / goals:\n{todos.strip()}"

    chat = [
        {"role": "system", "content": system_text},
        {"role": "user", "content": user_text},
    ]
    completion = model.create_chat_completion(
        messages=chat,
        # Constrain decoding to the quest JSON schema.
        response_format={"type": "json_object", "schema": RESPONSE_SCHEMA},
        temperature=0.0,
        max_tokens=4096,
    )
    reply = completion["choices"][0]["message"]["content"]
    return extract_json(reply)
def _generate_campaign_local(goal: str, theme: str, snippets: str = "") -> dict:
    """Turn one long-term goal (+ optional research snippets) into raw campaign JSON.

    The result is UNVALIDATED - the caller must run validate_campaign on it.

    Args:
        goal: The long-term goal; surrounding whitespace is stripped.
        theme: Theme name, substituted for ``{theme}`` in the system prompt and
            repeated in the user message.
        snippets: Optional research notes; appended to the user message only when
            non-blank (empty/None/whitespace-only adds nothing).

    Returns:
        Whatever ``extract_json`` yields for the first choice's message content.
    """
    model = _get_llm()
    system_text = CAMPAIGN_SYSTEM_PROMPT.replace("{theme}", theme)
    user_text = f"Theme: {theme}\nLong-term goal:\n{goal.strip()}"

    has_notes = bool((snippets or "").strip())
    if has_notes:
        user_text += f"\n\nResearch notes:\n{snippets.strip()}"

    chat = [
        {"role": "system", "content": system_text},
        {"role": "user", "content": user_text},
    ]
    completion = model.create_chat_completion(
        messages=chat,
        # Constrain decoding to the campaign JSON schema.
        response_format={"type": "json_object", "schema": CAMPAIGN_RESPONSE_SCHEMA},
        temperature=0.0,
        max_tokens=4096,
    )
    reply = completion["choices"][0]["message"]["content"]
    return extract_json(reply)
def _route_intent_local(message: str, context: str) -> dict:
    """Classify one Frog Master chat message into {intent, target_task?, reason?}.

    Args:
        message: The user's chat message; surrounding whitespace is stripped.
        context: A SHORT text summary of the current log (does a log exist + quest
            titles/ids/status) - never images (CLAUDE.md rule).

    Returns:
        The parsed JSON object when it is a dict whose "intent" is one of
        "forge", "add_tasks", "mark_done", "mark_couldnt" or "unknown";
        otherwise ``{"intent": "unknown"}`` (also when the reply cannot be parsed).
    """
    llm = _get_llm()
    user = f"Context:\n{context.strip()}\n\nUser message:\n{message.strip()}"
    out = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": INTENT_SYSTEM_PROMPT},
            {"role": "user", "content": user},
        ],
        response_format={"type": "json_object", "schema": INTENT_SCHEMA},
        temperature=0.0,
        max_tokens=256,
    )
    try:
        parsed = extract_json(out["choices"][0]["message"]["content"])
    except (ValueError, TypeError):
        # Honor the documented fallback when the reply is not parseable (e.g. JSON cut
        # off by the small max_tokens budget, or missing content).
        # NOTE(review): extract_json is defined in gpu_shared.py; this assumes it signals
        # bad text via ValueError (incl. json.JSONDecodeError) or TypeError - confirm.
        return {"intent": "unknown"}

    known_intents = ("forge", "add_tasks", "mark_done", "mark_couldnt", "unknown")
    if not isinstance(parsed, dict) or parsed.get("intent") not in known_intents:
        return {"intent": "unknown"}
    return parsed
# ----------------------------- modal (off-Space) wrappers -----------------------------
def _generate_quests_modal(todos: str, theme: str) -> dict:
    """Forward quest generation to the deployed Modal class ("frogquest" / "LLM").

    Arguments are passed through unchanged to the remote ``generate_quests`` method
    and its result is returned as-is.
    """
    import modal  # imported at call time, not at module import

    remote_cls = modal.Cls.from_name("frogquest", "LLM")
    worker = remote_cls()
    return worker.generate_quests.remote(todos, theme)
def _generate_campaign_modal(goal: str, theme: str, snippets: str = "") -> dict:
    """Forward campaign generation to the deployed Modal class ("frogquest" / "LLM").

    Arguments are passed through unchanged to the remote ``generate_campaign`` method
    and its result is returned as-is.
    """
    import modal  # imported at call time, not at module import

    remote_cls = modal.Cls.from_name("frogquest", "LLM")
    worker = remote_cls()
    return worker.generate_campaign.remote(goal, theme, snippets)
def _route_intent_modal(message: str, context: str) -> dict:
    """Forward intent routing to the deployed Modal class ("frogquest" / "LLM").

    Arguments are passed through unchanged to the remote ``route_intent`` method
    and its result is returned as-is.
    """
    import modal  # imported at call time, not at module import

    remote_cls = modal.Cls.from_name("frogquest", "LLM")
    worker = remote_cls()
    return worker.route_intent.remote(message, context)
# ----------------------------- bind public names from the backend -----------------------------
# app.py imports these by name; signatures are identical across backends.
if BACKEND != "modal":
    # Local / ZeroGPU path (default + any unrecognized value): wrap each implementation
    # with spaces.GPU and its per-call duration.
    generate_quests_raw = spaces.GPU(duration=70)(_generate_quests_local)
    generate_campaign_raw = spaces.GPU(duration=70)(_generate_campaign_local)
    route_intent = spaces.GPU(duration=45)(_route_intent_local)
else:
    # Modal path: plain forwarding wrappers, no GPU decorator.
    generate_quests_raw = _generate_quests_modal
    generate_campaign_raw = _generate_campaign_modal
    route_intent = _route_intent_modal