from __future__ import annotations

import os
from dataclasses import dataclass
from typing import Any, Protocol

try:  # ZeroGPU: present on the Space, absent in local/CPU dev
    import spaces
except ImportError:  # pragma: no cover - local fallback
    spaces = None

from .json_utils import parse_json_object


@dataclass(frozen=True)
class RuntimeStatus:
    """Outcome label returned alongside every runtime result."""

    # Human-readable description of how the result was produced.
    label: str
    # True when a deterministic fallback produced the result instead of the model.
    used_fallback: bool = False


def _same_look(a: dict, b: dict) -> bool:
    """Report whether two genome dicts would render the same creature.

    Only ``archetype``, ``palette`` and ``parts`` drive what is drawn, so an
    edit that leaves all three equal produces no on-screen change. A key that
    is missing from a dict compares as ``None``.
    """
    for field in ("archetype", "palette", "parts"):
        matches = a.get(field) == b.get(field)
        if not matches:
            return False
    return True


# The app runs a single model: Gemma 4 12B (the only family that draws hard voxel
# forms in the model-selection spike — see wiki/model-selection-spike.md). Every
# model-backed call in this module goes through it. Gemma 4 needs
# llama-cpp-python > 0.3.19 (the Space pin); the local sibling venv has 0.3.28.
# Override via env for a local gguf. See
# docs/superpowers/specs/2026-06-15-freeform-voxel-generation-design.md.
FREEFORM_MODEL_REPO = os.getenv("BUDDY_FREEFORM_MODEL_REPO", "unsloth/gemma-4-12b-it-GGUF")
FREEFORM_MODEL_FILE = os.getenv("BUDDY_FREEFORM_MODEL_FILE", "gemma-4-12b-it-Q4_K_M.gguf")

# The edit path no longer uses a model: an English keyword parser
# (edit_parser.parse_edit) maps the instruction to a catalog part + parameters
# and the assembler snaps it on deterministically — instant, no GPU. In this
# module the generative model above is reached only from the fusion,
# genome-command and tweak-suggestion methods of LlamaCppBuddyRuntime;
# generate_creature and edit_creature are model-free, and the earlier
# freeform-edit safety valve has been retired.
def _blocking_completion(
    *,
    model_path: str,
    messages: list[dict[str, str]],
    schema: dict[str, Any] | None,
    max_tokens: int,
    temperature: float,
    top_p: float,
    n_ctx: int,
    n_threads: int | None,
    n_gpu_layers: int,
    flash_attn: bool,
    label: str = "zerogpu-bench",
) -> str:
    """Build a Llama and run one chat completion.

    On ZeroGPU the GPU is attached only inside the @spaces.GPU wrappers below and
    detaches when they return, so the model must be (re)constructed here every
    call with ``n_gpu_layers=-1`` — there is no persisting it across calls (a
    llama.cpp object built in global scope would stay on CPU, since ``spaces``
    only hooks PyTorch's ``.to("cuda")``).

    Args:
        model_path: Filesystem path of the GGUF file to load.
        messages: Chat messages passed straight to ``create_chat_completion``.
        schema: JSON schema to enforce on the output, or ``None`` to sample
            unconstrained.
        max_tokens, temperature, top_p: Sampling parameters, forwarded as-is.
        n_ctx, n_threads, n_gpu_layers, flash_attn: ``Llama`` constructor
            options, forwarded as-is.
        label: Tag used as the prefix of the timing line printed to the log.

    Returns:
        The ``content`` of the first choice's message.
    """
    import time

    from llama_cpp import Llama

    t0 = time.perf_counter()
    llm = Llama(
        model_path=model_path,
        n_ctx=n_ctx,
        n_threads=n_threads,
        n_gpu_layers=n_gpu_layers,
        flash_attn=flash_attn,
        verbose=False,
    )
    t1 = time.perf_counter()
    kwargs: dict[str, Any] = dict(
        messages=messages, max_tokens=max_tokens, temperature=temperature, top_p=top_p
    )
    if schema is not None:
        # llama-cpp-python only honors {"type": "json_object", "schema": ...}; the
        # OpenAI-style {"type": "json_schema"} is silently ignored, which left the
        # model unconstrained (it returned `type` instead of `kind`, bare strings,
        # or a {"genome": {...}} wrapper) and forced every edit onto the keyword
        # fallback. json_object + schema turns the schema into an enforced GBNF
        # grammar so fuse/tweak output actually matches GENOME_SCHEMA.
        kwargs["response_format"] = {"type": "json_object", "schema": schema}
    response = llm.create_chat_completion(**kwargs)
    t2 = time.perf_counter()
    usage = response.get("usage") or {}
    tokens = int(usage.get("completion_tokens") or 0)
    gen_s = t2 - t1
    # Guard the division: a zero-length timing interval reports 0 tok/s.
    tok_s = tokens / gen_s if gen_s > 0 else 0.0
    # Printed to the Space logs so each call's cost is visible (reload + gen).
    print(
        f"[{label}] reload={t1 - t0:.2f}s gen={gen_s:.2f}s "
        f"total={t2 - t0:.2f}s tokens={tokens} ({tok_s:.1f} tok/s) "
        f"n_gpu_layers={n_gpu_layers}",
        flush=True,
    )
    return response["choices"][0]["message"]["content"]


def _log_model_failure(where: str, exc: BaseException) -> None:
    """Print why a model-backed call fell back to its deterministic path.

    The runtime methods deliberately never raise, which used to make a failed
    download, a malformed env override or a CUDA abort invisible. This keeps
    the best-effort behavior but leaves a line in the log, in the same
    ``[tag] ...`` style the warmup and benchmark paths already print.
    """
    print(f"[{where}] model call failed, using fallback: {exc!r}", flush=True)


# The three entry points below are kept as separate functions so each can be
# wrapped with its own GPU duration budget further down.
def _run_completion(**kwargs: Any) -> str:
    """Run one completion on the short-call budget."""
    return _blocking_completion(**kwargs)


def _run_freeform_completion(**kwargs: Any) -> str:
    """Run one completion on the larger freeform budget."""
    return _blocking_completion(**kwargs)


def _warmup_completion(**kwargs: Any) -> str:
    """Run one completion tagged ``zerogpu-warmup`` in the timing log."""
    return _blocking_completion(label="zerogpu-warmup", **kwargs)


# Freeform Gemma-4-12B box-geometry calls generate 300-600+ tokens at ~20 tok/s
# and routinely hit 25-31s total. On the old shared duration=30 they raced the
# ZeroGPU detach and crashed mid-decode (ggml_cuda_mul_mat_q CUDA abort -> the
# call dies and the user gets the fallback after a full wait). Give them their
# own larger budget. Override with BUDDY_FREEFORM_GPU_DURATION if 12B gen grows.
_FREEFORM_GPU_DURATION = int(os.getenv("BUDDY_FREEFORM_GPU_DURATION", "60"))

if spaces is not None:  # pragma: no cover - only active on the ZeroGPU Space
    # Short calls (e.g. tweak suggestions) are quick once warm, so keep their GPU
    # budget small for better queue behavior. The warmup eats the one-time ~75s
    # CUDA kernel JIT compile, so it needs a larger budget.
    _run_completion = spaces.GPU(duration=30)(_run_completion)
    _run_freeform_completion = spaces.GPU(duration=_FREEFORM_GPU_DURATION)(_run_freeform_completion)
    _warmup_completion = spaces.GPU(duration=90)(_warmup_completion)


class BuddyRuntime(Protocol):
    """Structural interface implemented by the fake and llama.cpp runtimes.

    Every method returns its payload together with a ``RuntimeStatus`` (except
    ``suggest_tweaks``, which returns only the suggestion strings).
    """

    def fuse_genomes(self, parent_a: dict, parent_b: dict) -> tuple[dict, RuntimeStatus]:
        raise NotImplementedError

    def interpret_genome_command(self, genome: dict, transcript: str) -> tuple[dict, RuntimeStatus]:
        raise NotImplementedError

    def generate_creature(self, prompt: str) -> tuple[dict, list[dict], RuntimeStatus]:
        raise NotImplementedError

    def fuse_creatures(self, parent_a: dict, parent_b: dict) -> tuple[dict, list[dict], RuntimeStatus]:
        raise NotImplementedError

    def edit_creature(
        self, genome: dict, boxes: list[dict], instruction: str
    ) -> tuple[dict, list[dict], list[str], RuntimeStatus]:
        raise NotImplementedError

    def suggest_tweaks(self, boxes: list[dict]) -> list[str]:
        raise NotImplementedError


class FakeBuddyRuntime:
    """Model-free runtime: every method uses the deterministic fallback helpers.

    Selected by ``default_runtime`` when ``BUDDY_FORCE_FAKE_RUNTIME=1``. All
    results carry the same ``used_fallback=True`` status.
    """

    _STATUS = RuntimeStatus("Generated by the local fallback", used_fallback=True)

    def fuse_genomes(self, parent_a: dict, parent_b: dict) -> tuple[dict, RuntimeStatus]:
        """Fuse two genome dicts with ``fuse_fallback`` (fixed ``seed=1``)."""
        from .genome import genome_from_dict, genome_to_dict
        from .fallback import fuse_fallback

        child = fuse_fallback(genome_from_dict(parent_a), genome_from_dict(parent_b), seed=1)
        return genome_to_dict(child), self._STATUS

    def interpret_genome_command(self, genome: dict, transcript: str) -> tuple[dict, RuntimeStatus]:
        """Apply ``transcript`` to ``genome`` with the keyword editor."""
        from .genome import genome_from_dict, genome_to_dict
        from .fallback import voice_edit_genome

        edited = voice_edit_genome(genome_from_dict(genome), transcript)
        return genome_to_dict(edited), self._STATUS

    def generate_creature(self, prompt: str) -> tuple[dict, list[dict], RuntimeStatus]:
        """Return the exemplar boxes picked for ``prompt`` plus a stub genome.

        The genome name is the stripped prompt cut to 40 characters, or
        "Freeform Buddy" when that is empty.
        """
        from .exemplars import pick_exemplar

        ex = pick_exemplar(prompt)
        genome = {"name": prompt.strip()[:40] or "Freeform Buddy", "archetype": "chick", "parts": []}
        return genome, list(ex["boxes"]), self._STATUS

    def fuse_creatures(self, parent_a: dict, parent_b: dict) -> tuple[dict, list[dict], RuntimeStatus]:
        """Fuse the parents' boxes with ``fuse_boxes_fallback`` (fixed ``seed=1``).

        The child name joins the first 8 characters of each parent name with
        "×"; a missing or empty name becomes "A" / "B".
        """
        from .fallback import fuse_boxes_fallback

        boxes = fuse_boxes_fallback(parent_a.get("boxes"), parent_b.get("boxes"), seed=1)
        name = f"{(parent_a.get('name') or 'A')[:8]}×{(parent_b.get('name') or 'B')[:8]}"
        return {"name": name, "archetype": "chick", "parts": []}, boxes, self._STATUS

    def edit_creature(
        self, genome: dict, boxes: list[dict], instruction: str
    ) -> tuple[dict, list[dict], list[str], RuntimeStatus]:
        """Attach the catalog part named by ``instruction`` and return new boxes.

        Returns the unchanged ``genome``, the assembled boxes, fallback tweak
        suggestions for those boxes, and the fallback status.
        """
        # Same model-free edit path as the real runtime: the keyword parser maps
        # the instruction to a catalog part + parameters, the assembler snaps it
        # on. An unmatched instruction defaults to horns so local UI testing
        # always produces a visible change.
        from .assembler import assemble_part
        from .parts import PARTS
        from .edit_parser import parse_edit
        from .fallback import suggest_tweaks_fallback

        spec = parse_edit(instruction)
        if spec is not None:
            nb = assemble_part(
                boxes, spec.part, spec.anchor, spec.scale, spec.color, spec.count, spec.rotation
            )
        else:
            nb = assemble_part(boxes, "horns", PARTS["horns"]["default_anchor"])
        return genome, nb, suggest_tweaks_fallback(nb), self._STATUS

    def suggest_tweaks(self, boxes: list[dict]) -> list[str]:
        """Return the fallback tweak suggestions for ``boxes``."""
        from .fallback import suggest_tweaks_fallback

        return suggest_tweaks_fallback(boxes)


@dataclass(frozen=True)
class LlamaRuntimeConfig:
    """Settings for ``LlamaCppBuddyRuntime``; ``from_env`` reads ``BUDDY_*`` vars."""

    # Explicit local GGUF path; when set it wins over model_repo/model_file.
    model_path: str | None = None
    model_repo: str = FREEFORM_MODEL_REPO
    model_file: str = FREEFORM_MODEL_FILE
    # _chat_json raises this to at least 4096; warmup() passes it through as-is.
    n_ctx: int = 2048
    n_threads: int | None = None
    n_gpu_layers: int = -1
    flash_attn: bool = True
    # NOTE(review): not read by any code visible in this module — confirm callers.
    timeout_seconds: float = 35.0
    # Default completion budget used when a call does not pass its own max_tokens.
    max_tokens: int = 420

    @classmethod
    def from_env(cls) -> "LlamaRuntimeConfig":
        """Build a config from environment variables, using the field defaults.

        ``BUDDY_FLASH_ATTN`` disables flash attention only when it is exactly
        "0". Numeric variables are parsed with ``int``/``float`` and therefore
        raise ``ValueError`` on malformed values.
        """
        return cls(
            model_path=os.getenv("BUDDY_MODEL_PATH"),
            model_repo=os.getenv("BUDDY_MODEL_REPO", FREEFORM_MODEL_REPO),
            model_file=os.getenv("BUDDY_MODEL_FILE", FREEFORM_MODEL_FILE),
            n_ctx=int(os.getenv("BUDDY_N_CTX", "2048")),
            n_threads=_optional_int(os.getenv("BUDDY_N_THREADS")),
            n_gpu_layers=int(os.getenv("BUDDY_N_GPU_LAYERS", "-1")),
            flash_attn=os.getenv("BUDDY_FLASH_ATTN", "1") != "0",
            timeout_seconds=float(os.getenv("BUDDY_TIMEOUT_SECONDS", "35")),
            max_tokens=int(os.getenv("BUDDY_MAX_TOKENS", "420")),
        )


class LlamaCppBuddyRuntime:
    """Runtime backed by a llama.cpp model, with a deterministic fallback per call.

    No public method lets a model failure escape: each one catches the error,
    logs it, and returns its fallback result with ``used_fallback=True``.
    """

    def __init__(self, config: LlamaRuntimeConfig | None = None) -> None:
        self.config = config or LlamaRuntimeConfig.from_env()

    def fuse_genomes(self, parent_a: dict, parent_b: dict) -> tuple[dict, RuntimeStatus]:
        """Fuse two genome dicts with the model, falling back to the splice.

        The fallback child is computed up front (outside the ``try``) because it
        is also handed to ``repair_genome`` alongside the raw model output.
        """
        from .genome import genome_from_dict, genome_to_dict, repair_genome
        from .fallback import fuse_fallback
        from .prompts import build_fuse_messages, GENOME_SCHEMA

        fb = fuse_fallback(genome_from_dict(parent_a), genome_from_dict(parent_b), seed=1)
        try:
            raw = self._chat_json(
                build_fuse_messages(parent_a, parent_b),
                GENOME_SCHEMA,
                max_tokens=int(os.environ.get("BUDDY_FUSE_MAX_TOKENS", "400")),
                temperature=0.9,
            )
            child = repair_genome(raw, fb)
            return genome_to_dict(child), RuntimeStatus("Generated by the local model")
        except Exception as exc:  # noqa: BLE001 - runtime failures must not break the demo
            _log_model_failure("fuse-genomes", exc)
            return genome_to_dict(fb), RuntimeStatus("Used the fallback splice", used_fallback=True)

    def interpret_genome_command(self, genome: dict, transcript: str) -> tuple[dict, RuntimeStatus]:
        """Apply a spoken/typed command to ``genome`` via the model.

        The keyword editor's result is computed first and returned whenever the
        model call fails, or when the model's edit changes nothing visible but
        the keyword editor's does.
        """
        from .genome import genome_from_dict, genome_to_dict, repair_genome
        from .fallback import voice_edit_genome
        from .prompts import build_genome_command_messages, GENOME_EDIT_SCHEMA

        base = genome_from_dict(genome)
        base_out = genome_to_dict(base)
        fb = genome_to_dict(voice_edit_genome(base, transcript))
        try:
            edit = self._chat_json(
                build_genome_command_messages(genome, transcript),
                GENOME_EDIT_SCHEMA,
                max_tokens=int(os.environ.get("BUDDY_EDIT_MAX_TOKENS", "200")),
                temperature=0.5,
            ) or {}
            # Overlay only the three look-defining keys; a missing or empty
            # value in the edit keeps the original genome's value.
            merged = {
                **genome,
                "parts": (edit.get("parts") or genome.get("parts")),
                "palette": (edit.get("palette") or genome.get("palette")),
                "archetype": (edit.get("archetype") or genome.get("archetype")),
            }
            model_out = genome_to_dict(repair_genome(merged, base))
            # A small model often returns a valid-but-empty edit (no parts /
            # palette / archetype), which silently leaves the creature untouched
            # so the user sees no reaction. When the model changed nothing visible
            # but the deterministic keyword editor did, surface that instead so a
            # tweak always lands. (The success path used to keep the original and
            # never reach the keyword fallback, which only fired on an exception.)
            if _same_look(model_out, base_out) and not _same_look(fb, base_out):
                return fb, RuntimeStatus("Used the keyword editor", used_fallback=True)
            return model_out, RuntimeStatus("Generated by the local model")
        except Exception as exc:  # noqa: BLE001 - runtime failures must not break the demo
            _log_model_failure("genome-command", exc)
            return fb, RuntimeStatus("Used the keyword editor", used_fallback=True)

    def generate_creature(self, prompt: str) -> tuple[dict, list[dict], RuntimeStatus]:
        """Return the exemplar boxes picked for ``prompt`` plus a stub genome."""
        # Model-free by design: the bench's from-scratch box must never call the
        # model — only Splice (fuse_creatures) does. Pick a reference body by
        # keyword, the same deterministic path the fake runtime uses, so real and
        # fake agree. (The model-backed draw used to live here; it was retired when
        # the bench input was made fully model-free.)
        from .exemplars import pick_exemplar

        ex = pick_exemplar(prompt)
        genome = {"name": prompt.strip()[:40] or "Freeform Buddy", "archetype": "chick", "parts": []}
        return genome, list(ex["boxes"]), RuntimeStatus("Built from a reference template")

    def fuse_creatures(self, parent_a: dict, parent_b: dict) -> tuple[dict, list[dict], RuntimeStatus]:
        """Fuse two box creatures with the model, falling back to the box splice.

        The model result is used only when ``repair_boxes`` yields a non-empty
        list; otherwise (or on any error) the precomputed fallback boxes and the
        "A×B"-style name are returned with ``used_fallback=True``.
        """
        import json

        from .exemplars import pick_exemplar
        from .boxes import repair_boxes
        from .fallback import fuse_boxes_fallback
        from .prompts import build_freeform_fuse_messages, BOX_SCHEMA

        ex = pick_exemplar((parent_a.get("name") or "") + " " + (parent_b.get("name") or ""))
        name = f"{(parent_a.get('name') or 'A')[:8]}×{(parent_b.get('name') or 'B')[:8]}"
        fb_boxes = fuse_boxes_fallback(parent_a.get("boxes"), parent_b.get("boxes"), seed=1)
        try:
            raw = self._chat_json(
                build_freeform_fuse_messages(parent_a, parent_b, json.dumps(ex)),
                BOX_SCHEMA,
                max_tokens=int(os.environ.get("BUDDY_FREEFORM_MAX_TOKENS", "1000")),
                temperature=0.9,
                heavy=True,
            ) or {}
            boxes = repair_boxes(raw.get("boxes") if isinstance(raw, dict) else None)
            child_name = (raw.get("name") if isinstance(raw, dict) else None) or name
            if boxes:
                return (
                    {"name": child_name, "archetype": "chick", "parts": []},
                    boxes,
                    RuntimeStatus("Generated by the local model"),
                )
        except Exception as exc:  # noqa: BLE001 - runtime failures must not break the demo
            _log_model_failure("fuse-creatures", exc)
        return (
            {"name": name, "archetype": "chick", "parts": []},
            fb_boxes,
            RuntimeStatus("Used a reference template", used_fallback=True),
        )

    def edit_creature(
        self, genome: dict, boxes: list[dict], instruction: str
    ) -> tuple[dict, list[dict], list[str], RuntimeStatus]:
        """Attach the catalog part named by ``instruction`` and return new boxes.

        Returns the unchanged ``genome``, the assembled boxes, fallback tweak
        suggestions for those boxes, and a "Built from the part catalog" status.
        """
        # Model-free by design: the bench tweak box must never call the model — only
        # Splice (fuse_creatures) does. An English keyword parser maps the instruction
        # to a catalog part + parameters and the assembler snaps it on deterministically.
        # An instruction that matches no catalog part defaults to horns, so a tweak
        # always produces a visible change (mirrors the fake runtime). (The model-backed
        # freeform edit used to be a fallback here; it was retired with the box.)
        from .assembler import assemble_part
        from .parts import PARTS
        from .edit_parser import parse_edit
        from .fallback import suggest_tweaks_fallback

        spec = parse_edit(instruction)
        if spec is not None:
            print(f"[edit] part={spec.part} anchor={spec.anchor} count={spec.count} "
                  f"scale={spec.scale} rot={spec.rotation}", flush=True)
            new_boxes = assemble_part(
                boxes, spec.part, spec.anchor, spec.scale, spec.color, spec.count, spec.rotation
            )
        else:
            new_boxes = assemble_part(boxes, "horns", PARTS["horns"]["default_anchor"])
        return (
            genome,
            new_boxes,
            suggest_tweaks_fallback(new_boxes),
            RuntimeStatus("Built from the part catalog"),
        )

    def suggest_tweaks(self, boxes: list[dict]) -> list[str]:
        """Ask the model for tweak suggestions, padded/trimmed to four.

        Blank suggestions are dropped. A short list is topped up from the
        fallback suggestions (unused ones first, then cycling). If the model
        yields nothing usable, or the call fails, the fallback list is returned
        unchanged. When the fallback list itself is empty there is nothing to
        pad with, so the model's shorter list is returned as-is.
        """
        from .fallback import suggest_tweaks_fallback
        from .prompts import build_freeform_suggest_messages, TWEAK_SCHEMA

        try:
            raw = self._chat_json(
                build_freeform_suggest_messages(boxes),
                TWEAK_SCHEMA,
                max_tokens=int(os.environ.get("BUDDY_SUGGEST_MAX_TOKENS", "200")),
                temperature=0.9,
            ) or {}
            tweaks = raw.get("tweaks") if isinstance(raw, dict) else None
            clean = (
                [str(t).strip() for t in tweaks if str(t).strip()]
                if isinstance(tweaks, list)
                else []
            )
            if clean:
                if len(clean) < 4:
                    # Pad to exactly 4, topping up from the fallback if short.
                    fill = suggest_tweaks_fallback(boxes)
                    # `fill` must be non-empty: the cycling default below is
                    # evaluated eagerly and would divide by zero otherwise,
                    # which used to throw away the model's valid suggestions.
                    while fill and len(clean) < 4:
                        clean.append(
                            next((f for f in fill if f not in clean), fill[len(clean) % len(fill)])
                        )
                return clean[:4]
        except Exception as exc:  # noqa: BLE001 - suggestions are advisory; never break the bench
            _log_model_failure("suggest-tweaks", exc)
        return suggest_tweaks_fallback(boxes)

    def _chat_json(
        self,
        messages: list[dict[str, str]],
        schema: dict[str, Any],
        *,
        max_tokens: int | None = None,
        temperature: float = 0.85,
        heavy: bool = False,
    ) -> dict[str, Any] | None:
        """Run one chat completion and parse its content with ``parse_json_object``.

        Args:
            messages: Chat messages for the completion.
            schema: JSON schema enforced on the output (see ``heavy``).
            max_tokens: Completion budget; ``None`` or 0 uses the config default.
            temperature: Sampling temperature.
            heavy: Route the call through the larger-duration freeform wrapper.

        Returns:
            Whatever ``parse_json_object`` returns for the completion content.
            Errors from model resolution or inference propagate to the caller.
        """
        # Resolve the model file outside the GPU call (hf_hub_download caches
        # to disk), then run inference inside the @spaces.GPU-wrapped helper so
        # ZeroGPU attaches a GPU for the duration of the completion. heavy=True
        # routes the long-running 12B freeform calls to the larger-duration GPU
        # wrapper so they do not race the ZeroGPU detach and crash mid-decode.
        model_path = self._resolve_model_path()
        run = _run_freeform_completion if heavy else _run_completion
        # On the freeform box path, BUDDY_FREEFORM_NO_GRAMMAR=1 drops the GBNF
        # grammar: it ~2x's 12B decode (the grammar is intrinsic per-token CPU
        # overhead, not schema bloat) while parse_json_object + repair_boxes catch
        # the now-unconstrained output. Short (non-heavy) calls always keep the
        # grammar. See wiki/deployment-strategy.md.
        effective_schema = schema
        if heavy and os.getenv("BUDDY_FREEFORM_NO_GRAMMAR") == "1":
            effective_schema = None
        content = run(
            model_path=model_path,
            messages=messages,
            schema=effective_schema,
            max_tokens=max_tokens or self.config.max_tokens,
            temperature=temperature,
            top_p=0.9,
            n_ctx=max(self.config.n_ctx, 4096),
            n_threads=self.config.n_threads,
            n_gpu_layers=self.config.n_gpu_layers,
            flash_attn=self.config.flash_attn,
        )
        return parse_json_object(content)

    def _resolve_model_path(self) -> str:
        """Return the local path of the GGUF file to load."""
        # An explicit local path wins (local dev / tests); otherwise download the
        # configured GGUF (hf_hub_download caches to disk). On a download failure
        # each model-backed runtime method already catches the error and returns
        # its deterministic fallback, so there is no second model to fall back to.
        if self.config.model_path:
            return self.config.model_path
        from huggingface_hub import hf_hub_download

        return hf_hub_download(repo_id=self.config.model_repo, filename=self.config.model_file)


def default_runtime() -> BuddyRuntime:
    """Return the fake runtime when ``BUDDY_FORCE_FAKE_RUNTIME=1``, else llama.cpp."""
    if os.getenv("BUDDY_FORCE_FAKE_RUNTIME") == "1":
        return FakeBuddyRuntime()
    return LlamaCppBuddyRuntime()


# Set once warmup() has been claimed; reset if the warmup attempt fails.
_warmed = False


def warmup() -> None:
    """Trigger the one-time CUDA kernel JIT compile ahead of real traffic.

    Wired to Gradio's ``demo.load`` so it fires when the page opens: the ~48s
    first-call compile happens while the player reads the intro, so their first
    real action is already warm. Runs once per container; safe to call
    repeatedly and on CPU / when ``spaces`` is absent. No-ops in the fake
    runtime.
    """
    global _warmed
    if _warmed or os.getenv("BUDDY_FORCE_FAKE_RUNTIME") == "1":
        return
    _warmed = True  # claim first so concurrent page loads do not double-fire
    try:
        cfg = LlamaRuntimeConfig.from_env()
        # Warm the same model file the model-backed runtime methods load.
        model_path = LlamaCppBuddyRuntime(cfg)._resolve_model_path()
        _warmup_completion(
            model_path=model_path,
            messages=[{"role": "user", "content": "warmup"}],
            schema=None,
            max_tokens=8,
            temperature=0.0,
            top_p=1.0,
            n_ctx=cfg.n_ctx,
            n_threads=cfg.n_threads,
            n_gpu_layers=cfg.n_gpu_layers,
            flash_attn=cfg.flash_attn,
        )
        if os.getenv("BUDDY_GRAMMAR_BENCH") == "1":
            _grammar_bench(cfg, model_path)
    except Exception as exc:  # noqa: BLE001 - warmup is best-effort
        _warmed = False  # allow a retry on the next page load
        print(f"[zerogpu-warmup] failed: {exc}", flush=True)


def _grammar_bench(cfg: "LlamaRuntimeConfig", model_path: str) -> None:
    """One-off diagnostic: run the freeform prompt UNCONSTRAINED (no GBNF
    grammar) to isolate how much grammar-constrained sampling costs vs raw 12B
    decode.

    Only the grammar-OFF run lives here: the grammar-ON throughput is already
    the normal [zerogpu-bench] number every real freeform call prints (~17-22
    tok/s), so there is no need to re-measure it. Running BOTH inside this one
    demo.load handler stacked a 3rd GPU acquisition after the warmup gen and
    ZeroGPU aborted it ("GPU task aborted"); keeping the handler to 2
    acquisitions (warmup + this) is in the range the log shows succeeds. Uses
    the same n_ctx=4096 / freeform GPU wrapper as the real calls so the number
    transfers directly.

    Gated by BUDDY_GRAMMAR_BENCH=1; read [grammar-bench-off] off the log,
    compare against the [zerogpu-bench] (grammar-on) lines, then unset the env.
    """
    import json

    from .exemplars import pick_exemplar
    from .prompts import build_freeform_messages

    prompt = "a small round dragon with two curved horns and a long tail"
    ex = pick_exemplar(prompt)
    try:
        _run_freeform_completion(
            model_path=model_path,
            messages=build_freeform_messages(prompt, json.dumps(ex)),
            schema=None,
            max_tokens=int(os.getenv("BUDDY_GRAMMAR_BENCH_TOKENS", "400")),
            temperature=0.8,
            top_p=0.9,
            n_ctx=max(cfg.n_ctx, 4096),
            n_threads=cfg.n_threads,
            n_gpu_layers=cfg.n_gpu_layers,
            flash_attn=cfg.flash_attn,
            label="grammar-bench-off",
        )
    except Exception as exc:  # noqa: BLE001 - diagnostic only, never break warmup
        print(f"[grammar-bench] failed: {exc}", flush=True)


def _optional_int(value: str | None) -> int | None:
    """Parse ``value`` as an int, treating ``None`` and "" as "not set".

    Raises:
        ValueError: If ``value`` is a non-empty string that is not an integer.
    """
    if value is None or value == "":
        return None
    return int(value)