Spaces:

build-small-hackathon
/

FrogQuest

Running on Zero

App Files Files Community

FrogQuest / llm.py

VirusDumb

Big Leagues Calling

c6815eb 8 days ago

Raw

History Blame Contribute Delete

7.15 kB

	"""Nemotron Nano 4B (text-only) -> raw quest JSON. Pluggable GPU backend.

	FROGQUEST_BACKEND selects WHERE the GPU work runs (the public functions are identical either way):
	- "zerogpu" (default): construct the Llama via llama.cpp INSIDE a @spaces.GPU function on the
	HF Space's ZeroGPU. (First call ~60-90s, then disk-cached & fast.)
	- "modal": forward to a deployed Modal class (see modal_app.py); the Space itself runs on
	CPU-basic and imports NOTHING heavy here.

	The LLM's job is ONLY to write JSON to the contract in schema.py. Output is constrained with a
	JSON-schema response_format and then validated/clamped by the caller. Shared prompts / the JSON
	extractor / model config live in gpu_shared.py so both backends stay in lockstep.
	"""
	from __future__ import annotations

	import os

	os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1") # MUST precede huggingface_hub import

	BACKEND = os.environ.get("FROGQUEST_BACKEND", "zerogpu").lower()
	if BACKEND != "modal": # the local/ZeroGPU path (default + any unrecognized value) needs the decorator
	import spaces # noqa: E402

	from schema import CAMPAIGN_RESPONSE_SCHEMA, INTENT_SCHEMA, RESPONSE_SCHEMA # noqa: E402
	from gpu_shared import ( # noqa: E402
	CAMPAIGN_SYSTEM_PROMPT,
	GGUF_FILE,
	GGUF_REPO,
	INTENT_SYSTEM_PROMPT,
	LOW_VRAM_GB,
	N_CTX,
	N_CTX_SMALL,
	SYSTEM_PROMPT,
	extract_json,
	preload_cuda_libs,
	)

	# Best-effort: warm the HF cache at startup so the FIRST @spaces.GPU call doesn't spend its
	# (metered, on ZeroGPU) duration downloading ~4GB. Local-path only — on a CPU-basic Space (modal
	# backend) we must NOT download the GGUF. A failed warm-up (offline, fresh local checkout, hub
	# error) is never fatal, but it is logged so a broken warm-up is visible rather than
	# silently indistinguishable from a successful one.
	if BACKEND != "modal":
	    try:
	        from huggingface_hub import hf_hub_download, list_repo_files

	        # First file in the repo listing that looks like the Q8_0 GGUF, or None if there is none.
	        _gguf = next(
	            (f for f in list_repo_files(GGUF_REPO) if "Q8_0" in f and f.endswith(".gguf")),
	            None,
	        )
	        if _gguf:
	            hf_hub_download(GGUF_REPO, _gguf)
	    except Exception as _warm_err:  # deliberately broad: warm-up must never break import
	        import logging

	        logging.getLogger(__name__).warning(
	            "GGUF cache warm-up skipped (%s: %s)", type(_warm_err).__name__, _warm_err
	        )

	_llm = None  # process-wide Llama instance, created on first use by _get_llm()


	def _get_llm():
	    """Return the shared Llama model, building it on the first call.

	    Must run inside a @spaces.GPU function. The first call downloads the GGUF (which is
	    then disk-cached) and constructs the model; every later call returns the same instance.
	    """
	    global _llm
	    if _llm is not None:
	        return _llm

	    # The prebuilt CUDA llama-cpp-python wheel links libcudart.so.12 / libcublas etc. Those
	    # ship inside the nvidia-*-cu12 packages that torch pulls in, but they are NOT on the
	    # loader path, so without help the import fails with
	    # "libcudart.so.12: cannot open shared object file". Two measures, in this order:
	    #   1) importing torch loads many of them RTLD_GLOBAL;
	    #   2) belt-and-suspenders: explicitly preload the nvidia-* CUDA libs as well.
	    import torch
	    preload_cuda_libs()
	    from llama_cpp import Llama

	    # Total memory of device 0 in GB (decimal); treated as 0 when CUDA is unavailable.
	    if torch.cuda.is_available():
	        total_vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
	    else:
	        total_vram_gb = 0

	    # Use the smaller context window on cards below the low-VRAM threshold.
	    if total_vram_gb >= LOW_VRAM_GB:
	        context_len = N_CTX
	    else:
	        context_len = N_CTX_SMALL

	    _llm = Llama.from_pretrained(
	        repo_id=GGUF_REPO,
	        filename=GGUF_FILE,  # glob -> resolves the exact Q8_0 file (warmed at import)
	        n_gpu_layers=-1,  # offload all layers (Q8 4B ~4.3GB fits even on a T4)
	        n_ctx=context_len,
	        verbose=False,
	    )
	    return _llm


	# ----------------------------- local (in-Space, ZeroGPU) implementations -----------------------------

	def _generate_quests_local(todos: str, theme: str) -> dict:
	    """Turn a to-do list into the model's raw quest JSON for the given theme.

	    The result is UNVALIDATED model output; the caller must run validate_and_clamp on it.
	    """
	    model = _get_llm()
	    chat = [
	        # The system prompt carries a literal "{theme}" placeholder that is filled in here.
	        {"role": "system", "content": SYSTEM_PROMPT.replace("{theme}", theme)},
	        {"role": "user", "content": f"Theme: {theme}\nMy to-do list / goals:\n{todos.strip()}"},
	    ]
	    completion = model.create_chat_completion(
	        messages=chat,
	        # Constrain decoding to the quest JSON schema.
	        response_format={"type": "json_object", "schema": RESPONSE_SCHEMA},
	        temperature=0.0,
	        max_tokens=4096,
	    )
	    reply_text = completion["choices"][0]["message"]["content"]
	    return extract_json(reply_text)


	def _generate_campaign_local(goal: str, theme: str, snippets: str = "") -> dict:
	    """Turn one long-term goal into the model's raw campaign JSON for the given theme.

	    `snippets` are optional research notes; they are appended to the user message only
	    when they contain non-whitespace text. The result is UNVALIDATED model output; the
	    caller must run validate_campaign on it.
	    """
	    model = _get_llm()
	    system_text = CAMPAIGN_SYSTEM_PROMPT.replace("{theme}", theme)

	    # Build the user message from blank-line-separated sections.
	    sections = [f"Theme: {theme}\nLong-term goal:\n{goal.strip()}"]
	    notes = (snippets or "").strip()
	    if notes:
	        sections.append(f"Research notes:\n{notes}")
	    user_text = "\n\n".join(sections)

	    completion = model.create_chat_completion(
	        messages=[
	            {"role": "system", "content": system_text},
	            {"role": "user", "content": user_text},
	        ],
	        # Constrain decoding to the campaign JSON schema.
	        response_format={"type": "json_object", "schema": CAMPAIGN_RESPONSE_SCHEMA},
	        temperature=0.0,
	        max_tokens=4096,
	    )
	    reply_text = completion["choices"][0]["message"]["content"]
	    return extract_json(reply_text)


	def _route_intent_local(message: str, context: str) -> dict:
	    """Classify one Frog Master chat message into {intent, target_task?, reason?}.

	    `context` is a SHORT text summary of the current log (whether a log exists, plus quest
	    titles/ids/status) - text only, never images (CLAUDE.md rule).

	    Returns the parsed object when it is a dict whose "intent" is one of the known
	    intents; otherwise returns {"intent": "unknown"}.
	    """
	    model = _get_llm()
	    prompt = f"Context:\n{context.strip()}\n\nUser message:\n{message.strip()}"
	    completion = model.create_chat_completion(
	        messages=[
	            {"role": "system", "content": INTENT_SYSTEM_PROMPT},
	            {"role": "user", "content": prompt},
	        ],
	        # Constrain decoding to the intent JSON schema.
	        response_format={"type": "json_object", "schema": INTENT_SCHEMA},
	        temperature=0.0,
	        max_tokens=256,
	    )
	    result = extract_json(completion["choices"][0]["message"]["content"])

	    # Kept as a tuple (not a set) so an unhashable "intent" value still compares cleanly.
	    known_intents = ("forge", "add_tasks", "mark_done", "mark_couldnt", "unknown")
	    if isinstance(result, dict) and result.get("intent") in known_intents:
	        return result
	    return {"intent": "unknown"}


	# ----------------------------- modal (off-Space) wrappers -----------------------------

	def _generate_quests_modal(todos: str, theme: str) -> dict:
	    """Forward quest generation to the `LLM` class of the deployed "frogquest" Modal app."""
	    import modal  # imported lazily so it is only needed when this wrapper is actually called

	    remote_cls = modal.Cls.from_name("frogquest", "LLM")
	    return remote_cls().generate_quests.remote(todos, theme)


	def _generate_campaign_modal(goal: str, theme: str, snippets: str = "") -> dict:
	    """Forward campaign generation to the `LLM` class of the deployed "frogquest" Modal app."""
	    import modal  # imported lazily so it is only needed when this wrapper is actually called

	    remote_cls = modal.Cls.from_name("frogquest", "LLM")
	    return remote_cls().generate_campaign.remote(goal, theme, snippets)


	def _route_intent_modal(message: str, context: str) -> dict:
	    """Forward intent routing to the `LLM` class of the deployed "frogquest" Modal app."""
	    import modal  # imported lazily so it is only needed when this wrapper is actually called

	    remote_cls = modal.Cls.from_name("frogquest", "LLM")
	    return remote_cls().route_intent.remote(message, context)


	# ----------------------------- bind public names from the backend -----------------------------
	# app.py imports these three names; both backends expose identical signatures.
	if BACKEND != "modal":
	    # Default / any unrecognized backend: run in-Space, each entry point wrapped by
	    # spaces.GPU with its own duration setting.
	    generate_quests_raw = spaces.GPU(duration=70)(_generate_quests_local)
	    generate_campaign_raw = spaces.GPU(duration=70)(_generate_campaign_local)
	    route_intent = spaces.GPU(duration=45)(_route_intent_local)
	else:
	    # "modal": plain forwarding wrappers, no GPU decorator involved.
	    generate_quests_raw = _generate_quests_modal
	    generate_campaign_raw = _generate_campaign_modal
	    route_intent = _route_intent_modal