hatchimera / src /buddy_fusion /runtime.py
arkai2025's picture
feat: model-free bench, Quick Start + Build-from-scratch landing, fix blank-Lab
3076240
Raw
History Blame Contribute Delete
23.7 kB
from __future__ import annotations
import os
from dataclasses import dataclass
from typing import Any, Protocol
try: # ZeroGPU: present on the Space, absent in local/CPU dev
import spaces
except ImportError: # pragma: no cover - local fallback
spaces = None
from .json_utils import parse_json_object
@dataclass(frozen=True)
class RuntimeStatus:
    """Immutable note describing how a runtime call produced its result."""

    # Short human-readable description of the path that produced the result,
    # e.g. "Generated by the local model".
    label: str
    # True when a deterministic fallback answered instead of the model.
    used_fallback: bool = False
def _same_look(a: dict, b: dict) -> bool:
    """Report whether two genome dicts would draw the same creature.

    Only ``archetype``, ``palette`` and ``parts`` drive the visible creature, so
    a tweak that leaves all three untouched produces no on-screen change. A key
    that is missing from a dict compares as ``None``.
    """
    for visible_key in ("archetype", "palette", "parts"):
        if a.get(visible_key) == b.get(visible_key):
            continue
        return False
    return True
# A single model backs the app: Gemma 4 12B, the only family that draws hard
# voxel forms in the model-selection spike (see wiki/model-selection-spike.md).
# Every model-backed call in this module goes through it. Gemma 4 needs
# llama-cpp-python > 0.3.19 (the Space pin); the local sibling venv has 0.3.28.
# Set the env vars below to point at a different GGUF for local work. See
# docs/superpowers/specs/2026-06-15-freeform-voxel-generation-design.md.
FREEFORM_MODEL_REPO = os.environ.get("BUDDY_FREEFORM_MODEL_REPO", "unsloth/gemma-4-12b-it-GGUF")
FREEFORM_MODEL_FILE = os.environ.get("BUDDY_FREEFORM_MODEL_FILE", "gemma-4-12b-it-Q4_K_M.gguf")
# The bench's draw and tweak paths no longer use a model. generate_creature picks
# a reference exemplar by keyword, and edit_creature runs an English keyword
# parser (edit_parser.parse_edit) that maps the instruction to a catalog part +
# parameters, which the assembler snaps on deterministically — instant, no GPU
# (an instruction that matches no catalog part defaults to horns). The generative
# model above is reserved for fusion, genome commands, and tweak suggestions.
def _blocking_completion(
    *,
    model_path: str,
    messages: list[dict[str, str]],
    schema: dict[str, Any] | None,
    max_tokens: int,
    temperature: float,
    top_p: float,
    n_ctx: int,
    n_threads: int | None,
    n_gpu_layers: int,
    flash_attn: bool,
    label: str = "zerogpu-bench",
) -> str:
    """Build a Llama, run one chat completion, and release the model.

    On ZeroGPU the GPU is attached only inside the @spaces.GPU wrappers below
    and detaches when they return, so the model must be (re)constructed here
    every call with ``n_gpu_layers=-1`` — there is no persisting it across
    calls (a llama.cpp object built in global scope would stay on CPU, since
    ``spaces`` only hooks PyTorch's ``.to("cuda")``).

    Args:
        model_path: Path of the GGUF file to load.
        messages: Chat messages passed to ``create_chat_completion``.
        schema: JSON schema to enforce on the output, or ``None`` to leave the
            completion unconstrained.
        max_tokens, temperature, top_p: Sampling settings for the completion.
        n_ctx, n_threads, n_gpu_layers, flash_attn: Passed to the ``Llama``
            constructor.
        label: Tag used as the prefix of the timing line printed to the logs.

    Returns:
        The message content of the first choice in the response.

    Raises:
        Whatever ``llama_cpp`` raises while loading the model or generating;
        callers are expected to catch it and use their fallback.
    """
    import time

    from llama_cpp import Llama

    t0 = time.perf_counter()
    llm = Llama(
        model_path=model_path,
        n_ctx=n_ctx,
        n_threads=n_threads,
        n_gpu_layers=n_gpu_layers,
        flash_attn=flash_attn,
        verbose=False,
    )
    try:
        t1 = time.perf_counter()
        kwargs: dict[str, Any] = dict(
            messages=messages, max_tokens=max_tokens, temperature=temperature, top_p=top_p
        )
        if schema is not None:
            # llama-cpp-python only honors {"type": "json_object", "schema": ...}; the
            # OpenAI-style {"type": "json_schema"} is silently ignored, which left the
            # model unconstrained (it returned `type` instead of `kind`, bare strings,
            # or a {"genome": {...}} wrapper) and forced every edit onto the keyword
            # fallback. json_object + schema turns the schema into an enforced GBNF
            # grammar so fuse/tweak output actually matches GENOME_SCHEMA.
            kwargs["response_format"] = {"type": "json_object", "schema": schema}
        response = llm.create_chat_completion(**kwargs)
        t2 = time.perf_counter()
    finally:
        # A fresh model is built on every call, so free it deterministically
        # (also when generation raises) instead of leaving the release to
        # garbage collection. getattr keeps this safe on builds without close().
        close = getattr(llm, "close", None)
        if callable(close):
            close()

    usage = response.get("usage") or {}
    tokens = int(usage.get("completion_tokens") or 0)
    gen_s = t2 - t1
    tok_s = tokens / gen_s if gen_s > 0 else 0.0
    # Printed to the Space logs so each call's cost is visible (reload + gen).
    print(
        f"[{label}] reload={t1 - t0:.2f}s gen={gen_s:.2f}s "
        f"total={t2 - t0:.2f}s tokens={tokens} ({tok_s:.1f} tok/s) "
        f"n_gpu_layers={n_gpu_layers}",
        flush=True,
    )
    return response["choices"][0]["message"]["content"]
def _run_completion(**kwargs: Any) -> str:
    """Run one short chat completion, forwarding every keyword unchanged.

    Kept as its own function so the ZeroGPU branch further down can wrap it
    with its own ``spaces.GPU`` duration.
    """
    content = _blocking_completion(**kwargs)
    return content
def _run_freeform_completion(**kwargs: Any) -> str:
    """Run one long freeform chat completion, forwarding every keyword unchanged.

    Kept separate from ``_run_completion`` so the ZeroGPU branch further down
    can give it a larger ``spaces.GPU`` duration.
    """
    content = _blocking_completion(**kwargs)
    return content
def _warmup_completion(**kwargs: Any) -> str:
    """Run the warmup chat completion, logged under the ``zerogpu-warmup`` label.

    All other keywords are forwarded unchanged; callers must not pass ``label``.
    """
    content = _blocking_completion(label="zerogpu-warmup", **kwargs)
    return content
# Freeform Gemma-4-12B box-geometry calls generate 300-600+ tokens at ~20 tok/s
# and routinely take 25-31s in total. Under the old shared duration=30 they raced
# the ZeroGPU detach and crashed mid-decode (ggml_cuda_mul_mat_q CUDA abort -> the
# call dies and the user gets the fallback after a full wait), so they get their
# own larger budget. Override with BUDDY_FREEFORM_GPU_DURATION if 12B gen grows.
_FREEFORM_GPU_DURATION = int(os.environ.get("BUDDY_FREEFORM_GPU_DURATION", "60"))

if spaces is not None:  # pragma: no cover - only active on the ZeroGPU Space
    # Rebind each helper to its GPU-attached version, one budget per helper:
    # short calls (e.g. tweak suggestions) are quick once warm, so a small
    # budget gives better queue behavior; freeform calls get the larger budget
    # configured above; the warmup eats the one-time ~75s CUDA kernel JIT
    # compile, so it needs the largest one.
    _run_completion = spaces.GPU(duration=30)(_run_completion)
    _run_freeform_completion = spaces.GPU(duration=_FREEFORM_GPU_DURATION)(_run_freeform_completion)
    _warmup_completion = spaces.GPU(duration=90)(_warmup_completion)
class BuddyRuntime(Protocol):
    """Structural interface shared by the fake and the llama.cpp runtimes."""

    def fuse_genomes(
        self, parent_a: dict, parent_b: dict
    ) -> tuple[dict, RuntimeStatus]:
        """Return a child genome dict built from two parents, plus a status."""
        raise NotImplementedError

    def interpret_genome_command(
        self, genome: dict, transcript: str
    ) -> tuple[dict, RuntimeStatus]:
        """Return the genome dict after applying ``transcript``, plus a status."""
        raise NotImplementedError

    def generate_creature(
        self, prompt: str
    ) -> tuple[dict, list[dict], RuntimeStatus]:
        """Return a genome dict, its boxes, and a status for ``prompt``."""
        raise NotImplementedError

    def fuse_creatures(
        self, parent_a: dict, parent_b: dict
    ) -> tuple[dict, list[dict], RuntimeStatus]:
        """Return a child genome dict, its boxes, and a status for two parents."""
        raise NotImplementedError

    def edit_creature(
        self, genome: dict, boxes: list[dict], instruction: str
    ) -> tuple[dict, list[dict], list[str], RuntimeStatus]:
        """Return genome, edited boxes, follow-up tweak suggestions, and a status."""
        raise NotImplementedError

    def suggest_tweaks(self, boxes: list[dict]) -> list[str]:
        """Return tweak suggestions for the given boxes."""
        raise NotImplementedError
class FakeBuddyRuntime:
    """Model-free runtime: every method answers from deterministic helpers."""

    # Shared status attached to everything this runtime returns.
    _STATUS = RuntimeStatus("Generated by the local fallback", used_fallback=True)

    def fuse_genomes(self, parent_a: dict, parent_b: dict) -> tuple[dict, RuntimeStatus]:
        """Splice two parent genome dicts with the fallback fuser (fixed seed)."""
        from .genome import genome_from_dict, genome_to_dict
        from .fallback import fuse_fallback

        first = genome_from_dict(parent_a)
        second = genome_from_dict(parent_b)
        spliced = fuse_fallback(first, second, seed=1)
        return genome_to_dict(spliced), self._STATUS

    def interpret_genome_command(self, genome: dict, transcript: str) -> tuple[dict, RuntimeStatus]:
        """Apply ``transcript`` to the genome with the fallback voice editor."""
        from .genome import genome_from_dict, genome_to_dict
        from .fallback import voice_edit_genome

        current = genome_from_dict(genome)
        return genome_to_dict(voice_edit_genome(current, transcript)), self._STATUS

    def generate_creature(self, prompt: str) -> tuple[dict, list[dict], RuntimeStatus]:
        """Return the exemplar picked for ``prompt`` under a prompt-derived name."""
        from .exemplars import pick_exemplar

        exemplar = pick_exemplar(prompt)
        # The name is the trimmed prompt capped at 40 characters.
        title = prompt.strip()[:40] or "Freeform Buddy"
        genome = {"name": title, "archetype": "chick", "parts": []}
        return genome, list(exemplar["boxes"]), self._STATUS

    def fuse_creatures(self, parent_a: dict, parent_b: dict) -> tuple[dict, list[dict], RuntimeStatus]:
        """Fuse the parents' boxes with the fallback fuser (fixed seed)."""
        from .fallback import fuse_boxes_fallback

        fused = fuse_boxes_fallback(parent_a.get("boxes"), parent_b.get("boxes"), seed=1)
        # Child name: up to 8 characters of each parent's name joined by "×".
        left = (parent_a.get('name') or 'A')[:8]
        right = (parent_b.get('name') or 'B')[:8]
        child = {"name": f"{left}×{right}", "archetype": "chick", "parts": []}
        return child, fused, self._STATUS

    def edit_creature(self, genome: dict, boxes: list[dict], instruction: str) -> tuple[dict, list[dict], list[str], RuntimeStatus]:
        """Snap a catalog part onto ``boxes`` according to ``instruction``.

        Same model-free edit path as the real runtime: the keyword parser maps
        the instruction to a catalog part + parameters and the assembler snaps
        it on. An unmatched instruction defaults to horns so local UI testing
        always produces a visible change.
        """
        from .assembler import assemble_part
        from .parts import PARTS
        from .edit_parser import parse_edit
        from .fallback import suggest_tweaks_fallback

        spec = parse_edit(instruction)
        if spec is None:
            edited = assemble_part(boxes, "horns", PARTS["horns"]["default_anchor"])
        else:
            edited = assemble_part(
                boxes,
                spec.part,
                spec.anchor,
                spec.scale,
                spec.color,
                spec.count,
                spec.rotation,
            )
        return genome, edited, suggest_tweaks_fallback(edited), self._STATUS

    def suggest_tweaks(self, boxes: list[dict]) -> list[str]:
        """Return the fallback tweak suggestions for ``boxes``."""
        from .fallback import suggest_tweaks_fallback

        return suggest_tweaks_fallback(boxes)
@dataclass(frozen=True)
class LlamaRuntimeConfig:
    """Immutable settings for the llama.cpp-backed runtime."""

    # Explicit local GGUF path; when set it takes precedence over repo/file.
    model_path: str | None = None
    # Hub repo and filename of the GGUF to download when no local path is set.
    model_repo: str = FREEFORM_MODEL_REPO
    model_file: str = FREEFORM_MODEL_FILE
    # Settings handed to the Llama constructor.
    n_ctx: int = 2048
    n_threads: int | None = None
    n_gpu_layers: int = -1
    flash_attn: bool = True
    # NOTE(review): not read anywhere in this module's visible code — confirm
    # it is still needed.
    timeout_seconds: float = 35.0
    # Token budget used when a call does not pass its own.
    max_tokens: int = 420

    @classmethod
    def from_env(cls) -> "LlamaRuntimeConfig":
        """Build a config from the ``BUDDY_*`` environment variables.

        Unset variables fall back to the same values as the field defaults.
        ``BUDDY_FLASH_ATTN`` disables flash attention only when it is exactly
        ``"0"``; an unset or empty ``BUDDY_N_THREADS`` becomes ``None``.
        """
        env = os.getenv
        return cls(
            model_path=env("BUDDY_MODEL_PATH"),
            model_repo=env("BUDDY_MODEL_REPO", FREEFORM_MODEL_REPO),
            model_file=env("BUDDY_MODEL_FILE", FREEFORM_MODEL_FILE),
            n_ctx=int(env("BUDDY_N_CTX", "2048")),
            n_threads=_optional_int(env("BUDDY_N_THREADS")),
            n_gpu_layers=int(env("BUDDY_N_GPU_LAYERS", "-1")),
            flash_attn=env("BUDDY_FLASH_ATTN", "1") != "0",
            timeout_seconds=float(env("BUDDY_TIMEOUT_SECONDS", "35")),
            max_tokens=int(env("BUDDY_MAX_TOKENS", "420")),
        )
class LlamaCppBuddyRuntime:
    """Runtime backed by a local GGUF model through llama-cpp-python.

    ``fuse_genomes``, ``interpret_genome_command``, ``fuse_creatures`` and
    ``suggest_tweaks`` call the model and fall back to deterministic helpers
    when the call fails; each such failure is printed to the logs.
    ``generate_creature`` and ``edit_creature`` never call the model.
    """

    def __init__(self, config: LlamaRuntimeConfig | None = None) -> None:
        """Use ``config`` when given, otherwise read it from the environment."""
        self.config = config or LlamaRuntimeConfig.from_env()

    @staticmethod
    def _log_fallback(where: str, exc: Exception) -> None:
        """Print a swallowed model failure so it is visible in the logs."""
        print(f"[{where}] model call failed, using fallback: {exc!r}", flush=True)

    @staticmethod
    def _pad_tweaks(tweaks: list[str], fill: list[str], size: int = 4) -> list[str]:
        """Pad ``tweaks`` from ``fill`` up to ``size`` entries, then trim to ``size``.

        Unused ``fill`` entries are preferred; once all are used, ``fill`` is
        cycled by position. An empty ``fill`` leaves ``tweaks`` short rather
        than failing, so valid suggestions are never thrown away.
        """
        padded = list(tweaks)
        while len(padded) < size and fill:
            unused = next((f for f in fill if f not in padded), None)
            if unused is None:
                unused = fill[len(padded) % len(fill)]
            padded.append(unused)
        return padded[:size]

    def fuse_genomes(self, parent_a: dict, parent_b: dict) -> tuple[dict, RuntimeStatus]:
        """Fuse two parent genomes with the model, or splice them on failure."""
        from .genome import genome_from_dict, genome_to_dict, repair_genome
        from .fallback import fuse_fallback
        from .prompts import build_fuse_messages, GENOME_SCHEMA

        # Computed up front: it seeds repair_genome and is the failure result.
        fb = fuse_fallback(genome_from_dict(parent_a), genome_from_dict(parent_b), seed=1)
        try:
            raw = self._chat_json(
                build_fuse_messages(parent_a, parent_b),
                GENOME_SCHEMA,
                max_tokens=int(os.environ.get("BUDDY_FUSE_MAX_TOKENS", "400")),
                temperature=0.9,
            )
            child = repair_genome(raw, fb)
            return genome_to_dict(child), RuntimeStatus("Generated by the local model")
        except Exception as exc:  # noqa: BLE001 - runtime failures must not break the demo
            self._log_fallback("fuse_genomes", exc)
            return genome_to_dict(fb), RuntimeStatus("Used the fallback splice", used_fallback=True)

    def interpret_genome_command(self, genome: dict, transcript: str) -> tuple[dict, RuntimeStatus]:
        """Apply ``transcript`` to the genome with the model or the keyword editor."""
        from .genome import genome_from_dict, genome_to_dict, repair_genome
        from .fallback import voice_edit_genome
        from .prompts import build_genome_command_messages, GENOME_EDIT_SCHEMA

        base = genome_from_dict(genome)
        base_out = genome_to_dict(base)
        fb = genome_to_dict(voice_edit_genome(base, transcript))
        try:
            edit = self._chat_json(
                build_genome_command_messages(genome, transcript),
                GENOME_EDIT_SCHEMA,
                max_tokens=int(os.environ.get("BUDDY_EDIT_MAX_TOKENS", "200")),
                temperature=0.5,
            ) or {}
            # Overlay only the fields the model filled in; keep the rest.
            merged = {
                **genome,
                "parts": (edit.get("parts") or genome.get("parts")),
                "palette": (edit.get("palette") or genome.get("palette")),
                "archetype": (edit.get("archetype") or genome.get("archetype")),
            }
            model_out = genome_to_dict(repair_genome(merged, base))
            # A small model often returns a valid-but-empty edit (no parts /
            # palette / archetype), which silently leaves the creature untouched
            # so the user sees no reaction. When the model changed nothing visible
            # but the deterministic keyword editor did, surface that instead so a
            # tweak always lands. (The success path used to keep the original and
            # never reach the keyword fallback, which only fired on an exception.)
            if _same_look(model_out, base_out) and not _same_look(fb, base_out):
                return fb, RuntimeStatus("Used the keyword editor", used_fallback=True)
            return model_out, RuntimeStatus("Generated by the local model")
        except Exception as exc:  # noqa: BLE001 - runtime failures must not break the demo
            self._log_fallback("interpret_genome_command", exc)
            return fb, RuntimeStatus("Used the keyword editor", used_fallback=True)

    def generate_creature(self, prompt: str) -> tuple[dict, list[dict], RuntimeStatus]:
        """Return the exemplar picked for ``prompt`` under a prompt-derived name."""
        # Model-free by design: the bench's from-scratch box must never call the
        # model — only Splice (fuse_creatures) does. Pick a reference body by
        # keyword, the same deterministic path the fake runtime uses, so real and
        # fake agree. (The model-backed draw used to live here; it was retired when
        # the bench input was made fully model-free.)
        from .exemplars import pick_exemplar

        ex = pick_exemplar(prompt)
        genome = {"name": prompt.strip()[:40] or "Freeform Buddy", "archetype": "chick", "parts": []}
        return genome, list(ex["boxes"]), RuntimeStatus("Built from a reference template")

    def fuse_creatures(self, parent_a: dict, parent_b: dict) -> tuple[dict, list[dict], RuntimeStatus]:
        """Fuse two creatures' boxes with the model, or with the fallback fuser.

        The fallback result is returned when the model call fails or yields no
        usable boxes after repair.
        """
        import json
        from .exemplars import pick_exemplar
        from .boxes import repair_boxes
        from .fallback import fuse_boxes_fallback
        from .prompts import build_freeform_fuse_messages, BOX_SCHEMA

        ex = pick_exemplar((parent_a.get("name") or "") + " " + (parent_b.get("name") or ""))
        name = f"{(parent_a.get('name') or 'A')[:8]}×{(parent_b.get('name') or 'B')[:8]}"
        fb_boxes = fuse_boxes_fallback(parent_a.get("boxes"), parent_b.get("boxes"), seed=1)
        try:
            raw = self._chat_json(
                build_freeform_fuse_messages(parent_a, parent_b, json.dumps(ex)),
                BOX_SCHEMA,
                max_tokens=int(os.environ.get("BUDDY_FREEFORM_MAX_TOKENS", "1000")),
                temperature=0.9,
                heavy=True,
            ) or {}
            boxes = repair_boxes(raw.get("boxes") if isinstance(raw, dict) else None)
            child_name = (raw.get("name") if isinstance(raw, dict) else None) or name
            if boxes:
                return (
                    {"name": child_name, "archetype": "chick", "parts": []},
                    boxes,
                    RuntimeStatus("Generated by the local model"),
                )
        except Exception as exc:  # noqa: BLE001 - runtime failures must not break the demo
            self._log_fallback("fuse_creatures", exc)
        return (
            {"name": name, "archetype": "chick", "parts": []},
            fb_boxes,
            RuntimeStatus("Used a reference template", used_fallback=True),
        )

    def edit_creature(self, genome: dict, boxes: list[dict], instruction: str) -> tuple[dict, list[dict], list[str], RuntimeStatus]:
        """Snap a catalog part onto ``boxes`` according to ``instruction``."""
        # Model-free by design: the bench tweak box must never call the model — only
        # Splice (fuse_creatures) does. An English keyword parser maps the instruction
        # to a catalog part + parameters and the assembler snaps it on deterministically.
        # An instruction that matches no catalog part defaults to horns, so a tweak
        # always produces a visible change (mirrors the fake runtime). (The model-backed
        # freeform edit used to be a fallback here; it was retired with the box.)
        from .assembler import assemble_part
        from .parts import PARTS
        from .edit_parser import parse_edit
        from .fallback import suggest_tweaks_fallback

        spec = parse_edit(instruction)
        if spec is not None:
            print(f"[edit] part={spec.part} anchor={spec.anchor} count={spec.count} "
                  f"scale={spec.scale} rot={spec.rotation}", flush=True)
            new_boxes = assemble_part(boxes, spec.part, spec.anchor, spec.scale,
                                      spec.color, spec.count, spec.rotation)
        else:
            new_boxes = assemble_part(boxes, "horns", PARTS["horns"]["default_anchor"])
        return genome, new_boxes, suggest_tweaks_fallback(new_boxes), RuntimeStatus("Built from the part catalog")

    def suggest_tweaks(self, boxes: list[dict]) -> list[str]:
        """Return up to four tweak suggestions, preferring the model's.

        Model suggestions are cleaned, topped up from the fallback list when
        short, and trimmed to four. The fallback list alone is returned when
        the model call fails or yields nothing usable.
        """
        from .fallback import suggest_tweaks_fallback
        from .prompts import build_freeform_suggest_messages, TWEAK_SCHEMA

        try:
            raw = self._chat_json(
                build_freeform_suggest_messages(boxes),
                TWEAK_SCHEMA,
                max_tokens=int(os.environ.get("BUDDY_SUGGEST_MAX_TOKENS", "200")),
                temperature=0.9,
            ) or {}
            tweaks = raw.get("tweaks") if isinstance(raw, dict) else None
            clean = [str(t).strip() for t in tweaks if str(t).strip()] if isinstance(tweaks, list) else []
            if clean:
                # Pad/trim to 4, topping up from the fallback if short.
                return self._pad_tweaks(clean, suggest_tweaks_fallback(boxes))
        except Exception as exc:  # noqa: BLE001 - suggestions are advisory; never break the bench
            self._log_fallback("suggest_tweaks", exc)
        return suggest_tweaks_fallback(boxes)

    def _chat_json(
        self,
        messages: list[dict[str, str]],
        schema: dict[str, Any],
        *,
        max_tokens: int | None = None,
        temperature: float = 0.85,
        heavy: bool = False,
    ) -> dict[str, Any] | None:
        """Run one chat completion and parse its content as a JSON object.

        Args:
            messages: Chat messages for the completion.
            schema: JSON schema enforced on the output (see the grammar note below).
            max_tokens: Token budget; the config's ``max_tokens`` when omitted.
            temperature: Sampling temperature.
            heavy: Route the call through the larger-duration GPU wrapper.

        Returns:
            Whatever ``parse_json_object`` returns for the completion text.
        """
        # Resolve the model file outside the GPU call (hf_hub_download caches
        # to disk), then run inference inside the @spaces.GPU-wrapped helper so
        # ZeroGPU attaches a GPU for the duration of the completion. heavy=True
        # routes the long-running 12B freeform calls to the larger-duration GPU
        # wrapper so they do not race the ZeroGPU detach and crash mid-decode.
        model_path = self._resolve_model_path()
        run = _run_freeform_completion if heavy else _run_completion
        # On the freeform box path, BUDDY_FREEFORM_NO_GRAMMAR=1 drops the GBNF
        # grammar: it ~2x's 12B decode (the grammar is intrinsic per-token CPU
        # overhead, not schema bloat) while parse_json_object + repair_boxes catch
        # the now-unconstrained output. Short (non-heavy) calls always keep the
        # grammar. See wiki/deployment-strategy.md.
        effective_schema = schema
        if heavy and os.getenv("BUDDY_FREEFORM_NO_GRAMMAR") == "1":
            effective_schema = None
        content = run(
            model_path=model_path,
            messages=messages,
            schema=effective_schema,
            max_tokens=max_tokens or self.config.max_tokens,
            temperature=temperature,
            top_p=0.9,
            # Completions always get a context window of at least 4096 tokens.
            n_ctx=max(self.config.n_ctx, 4096),
            n_threads=self.config.n_threads,
            n_gpu_layers=self.config.n_gpu_layers,
            flash_attn=self.config.flash_attn,
        )
        return parse_json_object(content)

    def _resolve_model_path(self) -> str:
        """Return the local path of the GGUF file, downloading it when needed."""
        # An explicit local path wins (local dev / tests); otherwise download the
        # configured GGUF (hf_hub_download caches to disk). On a download failure
        # each runtime method already falls back to a hand-authored exemplar, so
        # there is no second model to fall back to.
        if self.config.model_path:
            return self.config.model_path
        from huggingface_hub import hf_hub_download

        return hf_hub_download(repo_id=self.config.model_repo, filename=self.config.model_file)
def default_runtime() -> BuddyRuntime:
    """Return the fake runtime when ``BUDDY_FORCE_FAKE_RUNTIME`` is ``"1"``,
    otherwise the llama.cpp-backed runtime configured from the environment."""
    force_fake = os.getenv("BUDDY_FORCE_FAKE_RUNTIME") == "1"
    return FakeBuddyRuntime() if force_fake else LlamaCppBuddyRuntime()
# Set once a warmup has been claimed; reset when that warmup fails.
_warmed = False


def warmup() -> None:
    """Trigger the one-time CUDA kernel JIT compile ahead of real traffic.

    Wired to Gradio's ``demo.load`` so it fires when the page opens: the ~48s
    first-call compile happens while the player reads the intro, so their first
    real action is already warm. Runs once per container; safe to call
    repeatedly and on CPU / when ``spaces`` is absent. Does nothing when the
    fake runtime is forced. Failures are printed, never raised, and clear the
    flag so a later call can retry.
    """
    global _warmed
    if _warmed:
        return
    if os.getenv("BUDDY_FORCE_FAKE_RUNTIME") == "1":
        return
    _warmed = True  # claim first so concurrent page loads do not double-fire
    try:
        cfg = LlamaRuntimeConfig.from_env()
        # Resolve the same model file the model-backed runtime calls load.
        model_path = LlamaCppBuddyRuntime(cfg)._resolve_model_path()
        # A tiny unconstrained completion is enough to trigger the compile.
        warm_args: dict[str, Any] = {
            "model_path": model_path,
            "messages": [{"role": "user", "content": "warmup"}],
            "schema": None,
            "max_tokens": 8,
            "temperature": 0.0,
            "top_p": 1.0,
            "n_ctx": cfg.n_ctx,
            "n_threads": cfg.n_threads,
            "n_gpu_layers": cfg.n_gpu_layers,
            "flash_attn": cfg.flash_attn,
        }
        _warmup_completion(**warm_args)
        if os.getenv("BUDDY_GRAMMAR_BENCH") == "1":
            _grammar_bench(cfg, model_path)
    except Exception as exc:  # noqa: BLE001 - warmup is best-effort
        _warmed = False  # allow a retry on the next page load
        print(f"[zerogpu-warmup] failed: {exc}", flush=True)
def _grammar_bench(cfg: "LlamaRuntimeConfig", model_path: str) -> None:
    """One-off diagnostic: run the freeform prompt UNCONSTRAINED (no GBNF grammar)
    to isolate how much grammar-constrained sampling costs vs raw 12B decode.

    Only the grammar-OFF run lives here: the grammar-ON throughput is already the
    normal [zerogpu-bench] number every real freeform call prints (~17-22 tok/s),
    so there is no need to re-measure it. Running BOTH inside this one demo.load
    handler stacked a 3rd GPU acquisition after the warmup gen and ZeroGPU aborted
    it ("GPU task aborted"); keeping the handler to 2 acquisitions (warmup + this)
    is in the range the log shows succeeds. Uses the same n_ctx=4096 / freeform
    GPU wrapper as the real calls so the number transfers directly. Gated by
    BUDDY_GRAMMAR_BENCH=1; read [grammar-bench-off] off the log, compare against
    the [zerogpu-bench] (grammar-on) lines, then unset the env.

    Args:
        cfg: Runtime config supplying the context size, threads, GPU layers
            and flash-attention settings.
        model_path: Path of the GGUF file to load.

    Every failure — including a failed import or prompt build — is printed and
    swallowed, so this diagnostic can never mark the surrounding warmup as
    failed.
    """
    try:
        # Imports and prompt building live inside the try on purpose: they are
        # part of the diagnostic and must not escape to the caller either.
        import json

        from .exemplars import pick_exemplar
        from .prompts import build_freeform_messages

        prompt = "a small round dragon with two curved horns and a long tail"
        ex = pick_exemplar(prompt)
        _run_freeform_completion(
            model_path=model_path,
            messages=build_freeform_messages(prompt, json.dumps(ex)),
            schema=None,
            max_tokens=int(os.getenv("BUDDY_GRAMMAR_BENCH_TOKENS", "400")),
            temperature=0.8,
            top_p=0.9,
            n_ctx=max(cfg.n_ctx, 4096),
            n_threads=cfg.n_threads,
            n_gpu_layers=cfg.n_gpu_layers,
            flash_attn=cfg.flash_attn,
            label="grammar-bench-off",
        )
    except Exception as exc:  # noqa: BLE001 - diagnostic only, never break warmup
        print(f"[grammar-bench] failed: {exc}", flush=True)
def _optional_int(value: str | None) -> int | None:
    """Parse ``value`` as an int, mapping ``None`` and ``""`` to ``None``.

    Any other text that ``int`` rejects raises ``ValueError``.
    """
    return None if value in (None, "") else int(value)