Spaces:

build-small-hackathon
/

hatchimera

Running on Zero

App Files Files Community

hatchimera / src /buddy_fusion /runtime.py

arkai2025

feat: model-free bench, Quick Start + Build-from-scratch landing, fix blank-Lab

3076240 20 days ago

Raw

History Blame Contribute Delete

23.7 kB

	from __future__ import annotations

	import os
	from dataclasses import dataclass
	from typing import Any, Protocol

	try: # ZeroGPU: present on the Space, absent in local/CPU dev
	import spaces
	except ImportError: # pragma: no cover - local fallback
	spaces = None

	from .json_utils import parse_json_object


	@dataclass(frozen=True)
	class RuntimeStatus:
	    """Immutable note describing how a runtime result was produced."""

	    # Short human-readable description shown alongside the result.
	    label: str
	    # Set to True by callers when a fallback path produced the result.
	    used_fallback: bool = False


	def _same_look(a: dict, b: dict) -> bool:
	"""True when two genome dicts render identically: the visible creature is
	driven only by archetype + palette + parts, so a tweak that leaves all three
	untouched produces no on-screen change."""
	keys = ("archetype", "palette", "parts")
	return all(a.get(k) == b.get(k) for k in keys)


	# The app runs a single model: Gemma 4 12B (the only family that draws hard voxel
	# forms in the model-selection spike — see wiki/model-selection-spike.md). Every
	# AI moment — freeform generation and hatch fusion — goes through it. Gemma 4
	# needs llama-cpp-python > 0.3.19 (the Space pin); the local sibling venv has
	# 0.3.28. Override via env for a local gguf. See
	# docs/superpowers/specs/2026-06-15-freeform-voxel-generation-design.md.
	FREEFORM_MODEL_REPO = os.environ.get("BUDDY_FREEFORM_MODEL_REPO", "unsloth/gemma-4-12b-it-GGUF")
	FREEFORM_MODEL_FILE = os.environ.get("BUDDY_FREEFORM_MODEL_FILE", "gemma-4-12b-it-Q4_K_M.gguf")

	# The edit path no longer uses a model: an English keyword parser
	# (edit_parser.parse_edit) maps the instruction to a catalog part + parameters
	# and the assembler snaps it on deterministically — instant, no GPU. An
	# instruction that matches no catalog part defaults to horns (see
	# edit_creature); the old model-backed freeform-edit safety valve is retired.
	# The generative model above is reserved for hatch fusion (fuse_genomes /
	# fuse_creatures), genome voice commands and tweak suggestions.


	def _blocking_completion(
	    *,
	    model_path: str,
	    messages: list[dict[str, str]],
	    schema: dict[str, Any] | None,
	    max_tokens: int,
	    temperature: float,
	    top_p: float,
	    n_ctx: int,
	    n_threads: int | None,
	    n_gpu_layers: int,
	    flash_attn: bool,
	    label: str = "zerogpu-bench",
	) -> str:
	    """Build a Llama and run one chat completion.

	    On ZeroGPU the GPU is attached only inside the @spaces.GPU wrappers below
	    and detaches when they return, so the model must be (re)constructed here
	    every call with ``n_gpu_layers=-1`` — there is no persisting it across
	    calls (a llama.cpp object built in global scope would stay on CPU, since
	    ``spaces`` only hooks PyTorch's ``.to("cuda")``).

	    Args:
	        model_path: Path of the GGUF file handed to ``Llama``.
	        messages: Chat messages passed to ``create_chat_completion``.
	        schema: JSON schema to enforce on the output, or ``None`` to leave
	            the model unconstrained.
	        max_tokens, temperature, top_p: Sampling settings forwarded as-is.
	        n_ctx, n_threads, n_gpu_layers, flash_attn: ``Llama`` constructor
	            settings forwarded as-is.
	        label: Tag used as the prefix of the timing line printed to stdout.

	    Returns:
	        The message content of the first choice in the response.
	    """
	    import time

	    from llama_cpp import Llama

	    t0 = time.perf_counter()
	    llm = Llama(
	        model_path=model_path,
	        n_ctx=n_ctx,
	        n_threads=n_threads,
	        n_gpu_layers=n_gpu_layers,
	        flash_attn=flash_attn,
	        verbose=False,
	    )
	    t1 = time.perf_counter()
	    kwargs: dict[str, Any] = dict(
	        messages=messages, max_tokens=max_tokens, temperature=temperature, top_p=top_p
	    )
	    if schema is not None:
	        # llama-cpp-python only honors {"type": "json_object", "schema": ...}; the
	        # OpenAI-style {"type": "json_schema"} is silently ignored, which left the
	        # model unconstrained (it returned `type` instead of `kind`, bare strings,
	        # or a {"genome": {...}} wrapper) and forced every edit onto the keyword
	        # fallback. json_object + schema turns the schema into an enforced GBNF
	        # grammar so fuse/tweak output actually matches GENOME_SCHEMA.
	        kwargs["response_format"] = {"type": "json_object", "schema": schema}
	    response = llm.create_chat_completion(**kwargs)
	    t2 = time.perf_counter()
	    # A missing or empty usage record is reported as zero generated tokens.
	    usage = response.get("usage") or {}
	    tokens = int(usage.get("completion_tokens") or 0)
	    gen_s = t2 - t1
	    tok_s = tokens / gen_s if gen_s > 0 else 0.0
	    # Printed to the Space logs so each call's cost is visible (reload + gen).
	    print(
	        f"[{label}] reload={t1 - t0:.2f}s gen={gen_s:.2f}s "
	        f"total={t2 - t0:.2f}s tokens={tokens} ({tok_s:.1f} tok/s) "
	        f"n_gpu_layers={n_gpu_layers}",
	        flush=True,
	    )
	    return response["choices"][0]["message"]["content"]


	def _run_completion(**kwargs: Any) -> str:
	    """Run one short completion; all keyword arguments are forwarded unchanged.

	    Kept as its own function so it can be rebound to a ``spaces.GPU`` wrapper
	    when the ``spaces`` package is available.
	    """
	    completion = _blocking_completion(**kwargs)
	    return completion


	def _run_freeform_completion(**kwargs: Any) -> str:
	    """Run one long freeform completion; keyword arguments are forwarded unchanged.

	    Separate from ``_run_completion`` so it can be rebound to its own
	    ``spaces.GPU`` wrapper with a different duration.
	    """
	    completion = _blocking_completion(**kwargs)
	    return completion


	def _warmup_completion(**kwargs: Any) -> str:
	    """Run the warmup completion, tagging its log line ``zerogpu-warmup``.

	    Callers must not pass ``label`` themselves; it is fixed here.
	    """
	    return _blocking_completion(**kwargs, label="zerogpu-warmup")


	# Freeform Gemma-4-12B box-geometry calls generate 300-600+ tokens at ~20 tok/s
	# and routinely hit 25-31s total. On the old shared duration=30 they raced the
	# ZeroGPU detach and crashed mid-decode (ggml_cuda_mul_mat_q CUDA abort -> the
	# call dies and the user gets the fallback after a full wait). Give them their
	# own larger budget. Override with BUDDY_FREEFORM_GPU_DURATION if 12B gen grows.
	_FREEFORM_GPU_DURATION = int(os.environ.get("BUDDY_FREEFORM_GPU_DURATION", "60"))

	if spaces is not None:  # pragma: no cover - only active on the ZeroGPU Space
	    # Rebind each entry point to a spaces.GPU wrapper with its own budget:
	    # short calls (e.g. tweak suggestions) are quick once warm, so a small
	    # budget gives better queue behavior; freeform calls get the larger,
	    # env-tunable budget; the warmup absorbs the one-time ~75s CUDA kernel
	    # JIT compile and therefore gets the largest one.
	    _run_completion = spaces.GPU(duration=30)(_run_completion)
	    _run_freeform_completion = spaces.GPU(duration=_FREEFORM_GPU_DURATION)(_run_freeform_completion)
	    _warmup_completion = spaces.GPU(duration=90)(_warmup_completion)


	class BuddyRuntime(Protocol):
	    """Structural interface shared by the fake and the llama.cpp runtimes.

	    Every stub raises ``NotImplementedError``; concrete runtimes supply the
	    behavior.
	    """

	    def fuse_genomes(self, parent_a: dict, parent_b: dict) -> tuple[dict, RuntimeStatus]:
	        """Combine two parent genome dicts; return ``(child_genome, status)``."""
	        raise NotImplementedError

	    def interpret_genome_command(self, genome: dict, transcript: str) -> tuple[dict, RuntimeStatus]:
	        """Apply a transcript to a genome; return ``(edited_genome, status)``."""
	        raise NotImplementedError

	    def generate_creature(self, prompt: str) -> tuple[dict, list[dict], RuntimeStatus]:
	        """Create a creature for a prompt; return ``(genome, boxes, status)``."""
	        raise NotImplementedError

	    def fuse_creatures(self, parent_a: dict, parent_b: dict) -> tuple[dict, list[dict], RuntimeStatus]:
	        """Combine two parent creatures; return ``(genome, boxes, status)``."""
	        raise NotImplementedError

	    def edit_creature(self, genome: dict, boxes: list[dict], instruction: str) -> tuple[dict, list[dict], list[str], RuntimeStatus]:
	        """Apply an instruction; return ``(genome, boxes, tweak_suggestions, status)``."""
	        raise NotImplementedError

	    def suggest_tweaks(self, boxes: list[dict]) -> list[str]:
	        """Return tweak suggestions for the given boxes."""
	        raise NotImplementedError


	class FakeBuddyRuntime:
	    """Model-free runtime: every method is answered by a deterministic helper.

	    All results carry the same shared status, flagged as a fallback.
	    """

	    _STATUS = RuntimeStatus("Generated by the local fallback", used_fallback=True)

	    def fuse_genomes(self, parent_a: dict, parent_b: dict) -> tuple[dict, RuntimeStatus]:
	        """Splice two parent genomes with the fallback fuser (fixed seed)."""
	        from .genome import genome_from_dict, genome_to_dict
	        from .fallback import fuse_fallback
	        first = genome_from_dict(parent_a)
	        second = genome_from_dict(parent_b)
	        spliced = fuse_fallback(first, second, seed=1)
	        return genome_to_dict(spliced), self._STATUS

	    def interpret_genome_command(self, genome: dict, transcript: str) -> tuple[dict, RuntimeStatus]:
	        """Apply the transcript with the fallback voice editor."""
	        from .genome import genome_from_dict, genome_to_dict
	        from .fallback import voice_edit_genome
	        result = voice_edit_genome(genome_from_dict(genome), transcript)
	        return genome_to_dict(result), self._STATUS

	    def generate_creature(self, prompt: str) -> tuple[dict, list[dict], RuntimeStatus]:
	        """Return the exemplar picked for the prompt as the new creature."""
	        from .exemplars import pick_exemplar
	        exemplar = pick_exemplar(prompt)
	        # The creature is named after the (trimmed, 40-char capped) prompt.
	        display_name = prompt.strip()[:40] or "Freeform Buddy"
	        genome = {"name": display_name, "archetype": "chick", "parts": []}
	        return genome, list(exemplar["boxes"]), self._STATUS

	    def fuse_creatures(self, parent_a: dict, parent_b: dict) -> tuple[dict, list[dict], RuntimeStatus]:
	        """Fuse the parents' boxes with the fallback fuser (fixed seed)."""
	        from .fallback import fuse_boxes_fallback
	        fused = fuse_boxes_fallback(parent_a.get("boxes"), parent_b.get("boxes"), seed=1)
	        # Child name: up to 8 characters of each parent's name joined by "×".
	        left = (parent_a.get("name") or "A")[:8]
	        right = (parent_b.get("name") or "B")[:8]
	        child = {"name": f"{left}×{right}", "archetype": "chick", "parts": []}
	        return child, fused, self._STATUS

	    def edit_creature(self, genome: dict, boxes: list[dict], instruction: str) -> tuple[dict, list[dict], list[str], RuntimeStatus]:
	        """Attach the catalog part named by the instruction to the boxes."""
	        # Same model-free edit path as the real runtime: the keyword parser maps
	        # the instruction to a catalog part + parameters and the assembler snaps
	        # it on. An unmatched instruction defaults to horns so local UI testing
	        # always produces a visible change.
	        from .assembler import assemble_part
	        from .parts import PARTS
	        from .edit_parser import parse_edit
	        from .fallback import suggest_tweaks_fallback
	        parsed = parse_edit(instruction)
	        if parsed is None:
	            updated = assemble_part(boxes, "horns", PARTS["horns"]["default_anchor"])
	        else:
	            updated = assemble_part(
	                boxes, parsed.part, parsed.anchor, parsed.scale,
	                parsed.color, parsed.count, parsed.rotation,
	            )
	        return genome, updated, suggest_tweaks_fallback(updated), self._STATUS

	    def suggest_tweaks(self, boxes: list[dict]) -> list[str]:
	        """Return the fallback tweak suggestions for the boxes."""
	        from .fallback import suggest_tweaks_fallback
	        suggestions = suggest_tweaks_fallback(boxes)
	        return suggestions


	@dataclass(frozen=True)
	class LlamaRuntimeConfig:
	    """Immutable settings for the llama.cpp-backed runtime."""

	    # Explicit local GGUF path; when set it is used instead of downloading.
	    model_path: str | None = None
	    # Hub repo and file name downloaded when no explicit path is given.
	    model_repo: str = FREEFORM_MODEL_REPO
	    model_file: str = FREEFORM_MODEL_FILE
	    # Context size; the runtime's chat helper raises it to at least 4096.
	    n_ctx: int = 2048
	    n_threads: int | None = None
	    n_gpu_layers: int = -1
	    flash_attn: bool = True
	    # NOTE(review): not read by any code visible in this file — confirm it is
	    # still used elsewhere.
	    timeout_seconds: float = 35.0
	    # Token budget used when a call does not specify its own.
	    max_tokens: int = 420

	    @classmethod
	    def from_env(cls) -> "LlamaRuntimeConfig":
	        """Build a config from the ``BUDDY_*`` environment variables.

	        Unset variables fall back to the same values as the field defaults.
	        Flash attention stays enabled unless ``BUDDY_FLASH_ATTN`` is ``"0"``.

	        Raises:
	            ValueError: If a numeric variable holds non-numeric text.
	        """
	        return cls(
	            model_path=os.getenv("BUDDY_MODEL_PATH"),
	            model_repo=os.getenv("BUDDY_MODEL_REPO", FREEFORM_MODEL_REPO),
	            model_file=os.getenv("BUDDY_MODEL_FILE", FREEFORM_MODEL_FILE),
	            n_ctx=int(os.getenv("BUDDY_N_CTX", "2048")),
	            n_threads=_optional_int(os.getenv("BUDDY_N_THREADS")),
	            n_gpu_layers=int(os.getenv("BUDDY_N_GPU_LAYERS", "-1")),
	            flash_attn=os.getenv("BUDDY_FLASH_ATTN", "1") != "0",
	            timeout_seconds=float(os.getenv("BUDDY_TIMEOUT_SECONDS", "35")),
	            max_tokens=int(os.getenv("BUDDY_MAX_TOKENS", "420")),
	        )


	class LlamaCppBuddyRuntime:
	    """Runtime backed by a local GGUF model served through llama-cpp-python.

	    ``fuse_genomes``, ``interpret_genome_command``, ``fuse_creatures`` and
	    ``suggest_tweaks`` call the model and return a deterministic fallback when
	    that call raises; the swallowed exception is printed to the log so the
	    failure stays visible. ``generate_creature`` and ``edit_creature`` are
	    model-free.
	    """

	    def __init__(self, config: LlamaRuntimeConfig | None = None) -> None:
	        # Use the environment-derived configuration when none is supplied.
	        self.config = config or LlamaRuntimeConfig.from_env()

	    def fuse_genomes(self, parent_a: dict, parent_b: dict) -> tuple[dict, RuntimeStatus]:
	        """Fuse two parent genomes with the model; return ``(genome, status)``.

	        The fallback splice is returned when the model call or repair raises.
	        """
	        from .genome import genome_from_dict, genome_to_dict, repair_genome
	        from .fallback import fuse_fallback
	        from .prompts import build_fuse_messages, GENOME_SCHEMA
	        # Computed up front: passed to repair_genome and returned on failure.
	        fb = fuse_fallback(genome_from_dict(parent_a), genome_from_dict(parent_b), seed=1)
	        try:
	            raw = self._chat_json(
	                build_fuse_messages(parent_a, parent_b), GENOME_SCHEMA,
	                max_tokens=int(os.environ.get("BUDDY_FUSE_MAX_TOKENS", "400")), temperature=0.9,
	            )
	            child = repair_genome(raw, fb)
	            return genome_to_dict(child), RuntimeStatus("Generated by the local model")
	        except Exception as exc:  # noqa: BLE001 - runtime failures must not break the demo
	            print(f"[fuse-genomes] model path failed, using fallback: {exc!r}", flush=True)
	            return genome_to_dict(fb), RuntimeStatus("Used the fallback splice", used_fallback=True)

	    def interpret_genome_command(self, genome: dict, transcript: str) -> tuple[dict, RuntimeStatus]:
	        """Apply a transcript to a genome; return ``(genome, status)``.

	        The keyword editor's result is returned when the model call raises, or
	        when the model changed nothing visible but the keyword editor did.
	        """
	        from .genome import genome_from_dict, genome_to_dict, repair_genome
	        from .fallback import voice_edit_genome
	        from .prompts import build_genome_command_messages, GENOME_EDIT_SCHEMA
	        base = genome_from_dict(genome)
	        base_out = genome_to_dict(base)
	        fb = genome_to_dict(voice_edit_genome(base, transcript))
	        try:
	            edit = self._chat_json(
	                build_genome_command_messages(genome, transcript), GENOME_EDIT_SCHEMA,
	                max_tokens=int(os.environ.get("BUDDY_EDIT_MAX_TOKENS", "200")), temperature=0.5,
	            ) or {}
	            # Fields the model left empty keep the incoming genome's value.
	            merged = {**genome,
	                      "parts": (edit.get("parts") or genome.get("parts")),
	                      "palette": (edit.get("palette") or genome.get("palette")),
	                      "archetype": (edit.get("archetype") or genome.get("archetype"))}
	            model_out = genome_to_dict(repair_genome(merged, base))
	            # A small model often returns a valid-but-empty edit (no parts /
	            # palette / archetype), which silently leaves the creature untouched
	            # so the user sees no reaction. When the model changed nothing visible
	            # but the deterministic keyword editor did, surface that instead so a
	            # tweak always lands. (The success path used to keep the original and
	            # never reach the keyword fallback, which only fired on an exception.)
	            if _same_look(model_out, base_out) and not _same_look(fb, base_out):
	                return fb, RuntimeStatus("Used the keyword editor", used_fallback=True)
	            return model_out, RuntimeStatus("Generated by the local model")
	        except Exception as exc:  # noqa: BLE001 - runtime failures must not break the demo
	            print(f"[genome-command] model path failed, using keyword editor: {exc!r}", flush=True)
	            return fb, RuntimeStatus("Used the keyword editor", used_fallback=True)

	    def generate_creature(self, prompt: str) -> tuple[dict, list[dict], RuntimeStatus]:
	        """Return the exemplar picked for the prompt as ``(genome, boxes, status)``."""
	        # Model-free by design: the bench's from-scratch box must never call the
	        # model — only Splice (fuse_creatures) does. Pick a reference body by
	        # keyword, the same deterministic path the fake runtime uses, so real and
	        # fake agree. (The model-backed draw used to live here; it was retired when
	        # the bench input was made fully model-free.)
	        from .exemplars import pick_exemplar
	        ex = pick_exemplar(prompt)
	        genome = {"name": prompt.strip()[:40] or "Freeform Buddy", "archetype": "chick", "parts": []}
	        return genome, list(ex["boxes"]), RuntimeStatus("Built from a reference template")

	    def fuse_creatures(self, parent_a: dict, parent_b: dict) -> tuple[dict, list[dict], RuntimeStatus]:
	        """Fuse two creatures with the model; return ``(genome, boxes, status)``.

	        The fallback box fusion is returned when the model call raises or
	        yields no usable boxes.
	        """
	        import json
	        from .exemplars import pick_exemplar
	        from .boxes import repair_boxes
	        from .fallback import fuse_boxes_fallback
	        from .prompts import build_freeform_fuse_messages, BOX_SCHEMA
	        ex = pick_exemplar((parent_a.get("name") or "") + " " + (parent_b.get("name") or ""))
	        name = f"{(parent_a.get('name') or 'A')[:8]}×{(parent_b.get('name') or 'B')[:8]}"
	        fb_boxes = fuse_boxes_fallback(parent_a.get("boxes"), parent_b.get("boxes"), seed=1)
	        try:
	            raw = self._chat_json(
	                build_freeform_fuse_messages(parent_a, parent_b, json.dumps(ex)), BOX_SCHEMA,
	                max_tokens=int(os.environ.get("BUDDY_FREEFORM_MAX_TOKENS", "1000")),
	                temperature=0.9,
	                heavy=True,
	            ) or {}
	            boxes = repair_boxes(raw.get("boxes") if isinstance(raw, dict) else None)
	            child_name = (raw.get("name") if isinstance(raw, dict) else None) or name
	            if boxes:
	                return {"name": child_name, "archetype": "chick", "parts": []}, boxes, RuntimeStatus("Generated by the local model")
	        except Exception as exc:  # noqa: BLE001 - runtime failures must not break the demo
	            print(f"[fuse-creatures] model path failed, using fallback: {exc!r}", flush=True)
	        return {"name": name, "archetype": "chick", "parts": []}, fb_boxes, RuntimeStatus("Used a reference template", used_fallback=True)

	    def edit_creature(self, genome: dict, boxes: list[dict], instruction: str) -> tuple[dict, list[dict], list[str], RuntimeStatus]:
	        """Attach a catalog part; return ``(genome, boxes, tweak_suggestions, status)``."""
	        # Model-free by design: the bench tweak box must never call the model — only
	        # Splice (fuse_creatures) does. An English keyword parser maps the instruction
	        # to a catalog part + parameters and the assembler snaps it on deterministically.
	        # An instruction that matches no catalog part defaults to horns, so a tweak
	        # always produces a visible change (mirrors the fake runtime). (The model-backed
	        # freeform edit used to be a fallback here; it was retired with the box.)
	        from .assembler import assemble_part
	        from .parts import PARTS
	        from .edit_parser import parse_edit
	        from .fallback import suggest_tweaks_fallback
	        spec = parse_edit(instruction)
	        if spec is not None:
	            print(f"[edit] part={spec.part} anchor={spec.anchor} count={spec.count} "
	                  f"scale={spec.scale} rot={spec.rotation}", flush=True)
	            new_boxes = assemble_part(boxes, spec.part, spec.anchor, spec.scale,
	                                      spec.color, spec.count, spec.rotation)
	        else:
	            new_boxes = assemble_part(boxes, "horns", PARTS["horns"]["default_anchor"])
	        return genome, new_boxes, suggest_tweaks_fallback(new_boxes), RuntimeStatus("Built from the part catalog")

	    def suggest_tweaks(self, boxes: list[dict]) -> list[str]:
	        """Return tweak suggestions from the model, or the fallback list.

	        Model suggestions are trimmed to 4 and topped up from the fallback
	        list when short. The fallback list is returned when the model call
	        raises or yields no non-empty suggestion.
	        """
	        from .fallback import suggest_tweaks_fallback
	        from .prompts import build_freeform_suggest_messages, TWEAK_SCHEMA
	        try:
	            raw = self._chat_json(
	                build_freeform_suggest_messages(boxes), TWEAK_SCHEMA,
	                max_tokens=int(os.environ.get("BUDDY_SUGGEST_MAX_TOKENS", "200")),
	                temperature=0.9,
	            ) or {}
	            tweaks = raw.get("tweaks") if isinstance(raw, dict) else None
	            clean = [str(t).strip() for t in tweaks if str(t).strip()] if isinstance(tweaks, list) else []
	            if clean:
	                # Pad/trim to 4, topping up from the fallback if short. An
	                # empty fallback list stops the padding instead of dividing by
	                # zero, which used to discard the model's own suggestions.
	                fill = suggest_tweaks_fallback(boxes)
	                while len(clean) < 4 and fill:
	                    unused = next((f for f in fill if f not in clean), None)
	                    clean.append(unused if unused is not None else fill[len(clean) % len(fill)])
	                return clean[:4]
	        except Exception as exc:  # noqa: BLE001 - suggestions are advisory; never break the bench
	            print(f"[suggest-tweaks] model path failed, using fallback: {exc!r}", flush=True)
	        return suggest_tweaks_fallback(boxes)

	    def _chat_json(
	        self,
	        messages: list[dict[str, str]],
	        schema: dict[str, Any],
	        *,
	        max_tokens: int | None = None,
	        temperature: float = 0.85,
	        heavy: bool = False,
	    ) -> dict[str, Any] | None:
	        """Run one chat completion and parse its content with ``parse_json_object``.

	        ``max_tokens`` falls back to the configured default when omitted or 0.
	        """
	        # Resolve the model file outside the GPU call (hf_hub_download caches
	        # to disk), then run inference inside the @spaces.GPU-wrapped helper so
	        # ZeroGPU attaches a GPU for the duration of the completion. heavy=True
	        # routes the long-running 12B freeform calls to the larger-duration GPU
	        # wrapper so they do not race the ZeroGPU detach and crash mid-decode.
	        model_path = self._resolve_model_path()
	        run = _run_freeform_completion if heavy else _run_completion
	        # On the freeform box path, BUDDY_FREEFORM_NO_GRAMMAR=1 drops the GBNF
	        # grammar: it ~2x's 12B decode (the grammar is intrinsic per-token CPU
	        # overhead, not schema bloat) while parse_json_object + repair_boxes catch
	        # the now-unconstrained output. Short (non-heavy) calls always keep the
	        # grammar. See wiki/deployment-strategy.md.
	        effective_schema = schema
	        if heavy and os.getenv("BUDDY_FREEFORM_NO_GRAMMAR") == "1":
	            effective_schema = None
	        content = run(
	            model_path=model_path,
	            messages=messages,
	            schema=effective_schema,
	            max_tokens=max_tokens or self.config.max_tokens,
	            temperature=temperature,
	            top_p=0.9,
	            n_ctx=max(self.config.n_ctx, 4096),
	            n_threads=self.config.n_threads,
	            n_gpu_layers=self.config.n_gpu_layers,
	            flash_attn=self.config.flash_attn,
	        )
	        return parse_json_object(content)

	    def _resolve_model_path(self) -> str:
	        """Return the configured local model path, or download the GGUF."""
	        # An explicit local path wins (local dev / tests); otherwise download the
	        # configured GGUF (hf_hub_download caches to disk). On a download failure
	        # each runtime method already falls back to a hand-authored exemplar, so
	        # there is no second model to fall back to.
	        if self.config.model_path:
	            return self.config.model_path
	        from huggingface_hub import hf_hub_download

	        return hf_hub_download(repo_id=self.config.model_repo, filename=self.config.model_file)


	def default_runtime() -> BuddyRuntime:
	    """Return the runtime the app should use.

	    The fake runtime is chosen when ``BUDDY_FORCE_FAKE_RUNTIME`` is ``"1"``;
	    otherwise the llama.cpp runtime is built from its default configuration.
	    """
	    force_fake = os.getenv("BUDDY_FORCE_FAKE_RUNTIME") == "1"
	    return FakeBuddyRuntime() if force_fake else LlamaCppBuddyRuntime()


	_warmed = False


	def warmup() -> None:
	    """Pay the one-time CUDA kernel JIT compile before real traffic arrives.

	    Wired to Gradio's ``demo.load`` so it fires when the page opens: the slow
	    first-call compile happens while the player reads the intro, so their
	    first real action is already warm. Runs once per container; safe to call
	    repeatedly and on CPU / when ``spaces`` is absent. Does nothing when the
	    fake runtime is forced. Any failure is printed and clears the flag so the
	    next page load retries.
	    """
	    global _warmed
	    if os.getenv("BUDDY_FORCE_FAKE_RUNTIME") == "1" or _warmed:
	        return
	    # Claim the flag before doing any work so concurrent page loads do not
	    # double-fire.
	    _warmed = True
	    try:
	        cfg = LlamaRuntimeConfig.from_env()
	        # Warm the same model file the llama.cpp runtime resolves for itself.
	        model_path = LlamaCppBuddyRuntime(cfg)._resolve_model_path()
	        warm_args = dict(
	            model_path=model_path,
	            messages=[{"role": "user", "content": "warmup"}],
	            schema=None,
	            max_tokens=8,
	            temperature=0.0,
	            top_p=1.0,
	            n_ctx=cfg.n_ctx,
	            n_threads=cfg.n_threads,
	            n_gpu_layers=cfg.n_gpu_layers,
	            flash_attn=cfg.flash_attn,
	        )
	        _warmup_completion(**warm_args)
	        if os.getenv("BUDDY_GRAMMAR_BENCH") == "1":
	            _grammar_bench(cfg, model_path)
	    except Exception as exc:  # noqa: BLE001 - warmup is best-effort
	        _warmed = False  # allow a retry on the next page load
	        print(f"[zerogpu-warmup] failed: {exc}", flush=True)


	def _grammar_bench(cfg: "LlamaRuntimeConfig", model_path: str) -> None:
	    """One-off diagnostic: time a freeform completion with NO GBNF grammar.

	    The goal is to isolate what grammar-constrained sampling costs relative
	    to raw 12B decode. Only the grammar-OFF run lives here: the grammar-ON
	    throughput is already the normal [zerogpu-bench] number every real
	    freeform call prints (~17-22 tok/s), so it is not re-measured. Running
	    BOTH inside the single demo.load handler stacked a 3rd GPU acquisition
	    after the warmup gen and ZeroGPU aborted it ("GPU task aborted"); keeping
	    the handler to 2 acquisitions (warmup + this) is in the range the log
	    shows succeeds. The same n_ctx=4096 floor and freeform GPU wrapper as the
	    real calls are used so the number transfers directly. Gated by
	    BUDDY_GRAMMAR_BENCH=1; read [grammar-bench-off] off the log, compare it
	    with the [zerogpu-bench] (grammar-on) lines, then unset the env.

	    Failures are printed and swallowed so the diagnostic never breaks warmup.
	    """
	    import json
	    from .exemplars import pick_exemplar
	    from .prompts import build_freeform_messages

	    prompt = "a small round dragon with two curved horns and a long tail"
	    exemplar = pick_exemplar(prompt)
	    try:
	        bench_messages = build_freeform_messages(prompt, json.dumps(exemplar))
	        token_budget = int(os.getenv("BUDDY_GRAMMAR_BENCH_TOKENS", "400"))
	        context_size = max(cfg.n_ctx, 4096)
	        _run_freeform_completion(
	            model_path=model_path,
	            messages=bench_messages,
	            schema=None,
	            max_tokens=token_budget,
	            temperature=0.8,
	            top_p=0.9,
	            n_ctx=context_size,
	            n_threads=cfg.n_threads,
	            n_gpu_layers=cfg.n_gpu_layers,
	            flash_attn=cfg.flash_attn,
	            label="grammar-bench-off",
	        )
	    except Exception as exc:  # noqa: BLE001 - diagnostic only, never break warmup
	        print(f"[grammar-bench] failed: {exc}", flush=True)


	def _optional_int(value: str \| None) -> int \| None:
	if value is None or value == "":
	return None
	return int(value)