Spaces:
Running on Zero
Running on Zero
| from __future__ import annotations | |
| import os | |
| from dataclasses import dataclass | |
| from typing import Any, Protocol | |
| try: # ZeroGPU: present on the Space, absent in local/CPU dev | |
| import spaces | |
| except ImportError: # pragma: no cover - local fallback | |
| spaces = None | |
| from .json_utils import parse_json_object | |
@dataclass
class RuntimeStatus:
    """Describe how a runtime result was produced.

    Built positionally across this module, e.g.
    ``RuntimeStatus("Used the fallback splice", used_fallback=True)``, which
    requires the generated ``__init__`` that ``@dataclass`` provides.
    """

    # Short human-readable description of where the result came from.
    label: str
    # True when a deterministic fallback produced the result instead of the model.
    used_fallback: bool = False
def _same_look(a: dict, b: dict) -> bool:
    """Report whether two genome dicts would draw the same creature.

    Only ``archetype``, ``palette`` and ``parts`` drive what is rendered, so an
    edit that leaves all three unchanged has no on-screen effect. Missing keys
    compare as ``None`` on both sides.
    """
    for field in ("archetype", "palette", "parts"):
        if not (a.get(field) == b.get(field)):
            return False
    return True
# The app runs a single model: Gemma 4 12B (the only family that draws hard voxel
# forms in the model-selection spike — see wiki/model-selection-spike.md). Every
# model-backed call — genome fusion, genome commands, creature splicing
# (fuse_creatures) and tweak suggestions — goes through it; from-scratch
# generation and edits are model-free. Gemma 4 needs llama-cpp-python > 0.3.19
# (the Space pin); the local sibling venv has 0.3.28. Override the repo/file via
# env to use a local gguf. See
# docs/superpowers/specs/2026-06-15-freeform-voxel-generation-design.md.
# Hugging Face repo and GGUF filename of the model; each can be overridden
# through its own environment variable.
FREEFORM_MODEL_REPO = os.environ.get("BUDDY_FREEFORM_MODEL_REPO", "unsloth/gemma-4-12b-it-GGUF")
FREEFORM_MODEL_FILE = os.environ.get("BUDDY_FREEFORM_MODEL_FILE", "gemma-4-12b-it-Q4_K_M.gguf")
# The edit path no longer uses a model: an English keyword parser
# (edit_parser.parse_edit) maps the instruction to a catalog part + parameters
# and the assembler snaps it on deterministically — instant, no GPU. An
# instruction that matches no catalog part defaults to horns rather than calling
# the model. The generative model above is reserved for fusion (fuse_genomes /
# fuse_creatures), genome commands and tweak suggestions.
def _blocking_completion(
    *,
    model_path: str,
    messages: list[dict[str, str]],
    schema: dict[str, Any] | None,
    max_tokens: int,
    temperature: float,
    top_p: float,
    n_ctx: int,
    n_threads: int | None,
    n_gpu_layers: int,
    flash_attn: bool,
    label: str = "zerogpu-bench",
) -> str:
    """Load the model, run a single chat completion, and return its text.

    The ``Llama`` object is constructed on every call. On ZeroGPU the GPU is
    attached only inside the ``@spaces.GPU`` wrappers and detaches when they
    return, so the model has to be rebuilt here with ``n_gpu_layers=-1`` each
    time; a llama.cpp object created at module scope would stay on CPU because
    ``spaces`` only hooks PyTorch's ``.to("cuda")``.

    Args:
        model_path: Path to the GGUF file on disk.
        messages: Chat messages passed to ``create_chat_completion``.
        schema: JSON schema to constrain the output, or ``None`` for
            unconstrained decoding.
        max_tokens, temperature, top_p: Sampling settings for the completion.
        n_ctx, n_threads, n_gpu_layers, flash_attn: ``Llama`` constructor
            settings.
        label: Tag used as the prefix of the timing line written to stdout.

    Returns:
        The ``content`` of the first choice's message.
    """
    from time import perf_counter

    from llama_cpp import Llama

    started = perf_counter()
    model = Llama(
        model_path=model_path,
        n_ctx=n_ctx,
        n_threads=n_threads,
        n_gpu_layers=n_gpu_layers,
        flash_attn=flash_attn,
        verbose=False,
    )
    loaded = perf_counter()

    request: dict[str, Any] = {
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
    }
    if schema is not None:
        # llama-cpp-python only honors {"type": "json_object", "schema": ...};
        # the OpenAI-style {"type": "json_schema"} is silently ignored, which
        # left the model unconstrained (it returned `type` instead of `kind`,
        # bare strings, or a {"genome": {...}} wrapper) and pushed every edit
        # onto the keyword fallback. json_object + schema turns the schema into
        # an enforced GBNF grammar so the output actually matches it.
        request["response_format"] = {"type": "json_object", "schema": schema}

    result = model.create_chat_completion(**request)
    finished = perf_counter()

    completion_tokens = int((result.get("usage") or {}).get("completion_tokens") or 0)
    decode_seconds = finished - loaded
    # Guard the division: a zero-length interval reports 0 tok/s.
    rate = completion_tokens / decode_seconds if decode_seconds > 0 else 0.0

    # Written to the Space logs so each call's cost (reload + generation) shows.
    print(
        f"[{label}] reload={loaded - started:.2f}s gen={decode_seconds:.2f}s "
        f"total={finished - started:.2f}s tokens={completion_tokens} ({rate:.1f} tok/s) "
        f"n_gpu_layers={n_gpu_layers}",
        flush=True,
    )
    return result["choices"][0]["message"]["content"]
def _run_completion(**kwargs: Any) -> str:
    """Run one completion for the short (non-freeform) calls.

    A plain pass-through to ``_blocking_completion``. It is a separate function
    so that, on the Space, this name can be rebound to a
    ``spaces.GPU(duration=30)``-wrapped version further down the module.
    """
    completion = _blocking_completion(**kwargs)
    return completion
def _run_freeform_completion(**kwargs: Any) -> str:
    """Run one completion for the long-running freeform calls.

    A plain pass-through to ``_blocking_completion``. It is a separate function
    so that, on the Space, this name can be rebound to a ``spaces.GPU`` wrapper
    with the larger ``_FREEFORM_GPU_DURATION`` budget further down the module.
    """
    completion = _blocking_completion(**kwargs)
    return completion
def _warmup_completion(**kwargs: Any) -> str:
    """Run the warmup completion, tagging its timing line ``zerogpu-warmup``.

    Forwards every keyword to ``_blocking_completion`` and fixes ``label``, so
    callers must not pass ``label`` themselves. On the Space this name is
    rebound to a ``spaces.GPU(duration=90)``-wrapped version further down.
    """
    completion = _blocking_completion(label="zerogpu-warmup", **kwargs)
    return completion
# Freeform Gemma-4-12B box-geometry calls generate 300-600+ tokens at ~20 tok/s
# and routinely take 25-31s in total. Under the old shared duration=30 they raced
# the ZeroGPU detach and crashed mid-decode (ggml_cuda_mul_mat_q CUDA abort: the
# call dies and the user gets the fallback after a full wait), so they get their
# own, larger budget. Raise it with BUDDY_FREEFORM_GPU_DURATION if 12B generation
# grows.
_FREEFORM_GPU_DURATION = int(os.environ.get("BUDDY_FREEFORM_GPU_DURATION", "60"))

if spaces is not None:  # pragma: no cover - only active on the ZeroGPU Space
    # Rebind each helper to its GPU-attached version. Short calls (e.g. tweak
    # suggestions) are quick once warm, so their budget stays small for better
    # queue behavior; freeform calls use the larger budget above; the warmup
    # absorbs the one-time CUDA kernel JIT compile and so needs the largest.
    _run_completion = spaces.GPU(duration=30)(_run_completion)
    _run_freeform_completion = spaces.GPU(duration=_FREEFORM_GPU_DURATION)(_run_freeform_completion)
    _warmup_completion = spaces.GPU(duration=90)(_warmup_completion)
class BuddyRuntime(Protocol):
    """Structural interface shared by the fake and llama.cpp runtimes.

    Each method's default body raises ``NotImplementedError``; concrete
    runtimes supply the behavior.
    """

    def fuse_genomes(
        self, parent_a: dict, parent_b: dict
    ) -> tuple[dict, RuntimeStatus]:
        """Combine two parent genome dicts into a child genome plus a status."""
        raise NotImplementedError

    def interpret_genome_command(
        self, genome: dict, transcript: str
    ) -> tuple[dict, RuntimeStatus]:
        """Apply the instruction in ``transcript`` to ``genome``."""
        raise NotImplementedError

    def generate_creature(
        self, prompt: str
    ) -> tuple[dict, list[dict], RuntimeStatus]:
        """Build a creature for ``prompt`` as (genome, boxes, status)."""
        raise NotImplementedError

    def fuse_creatures(
        self, parent_a: dict, parent_b: dict
    ) -> tuple[dict, list[dict], RuntimeStatus]:
        """Combine two parent creatures into (genome, boxes, status)."""
        raise NotImplementedError

    def edit_creature(
        self, genome: dict, boxes: list[dict], instruction: str
    ) -> tuple[dict, list[dict], list[str], RuntimeStatus]:
        """Edit a creature, returning (genome, boxes, suggested tweaks, status)."""
        raise NotImplementedError

    def suggest_tweaks(self, boxes: list[dict]) -> list[str]:
        """Propose follow-up tweak instructions for the given boxes."""
        raise NotImplementedError
class FakeBuddyRuntime:
    """Deterministic runtime that never calls the model.

    ``default_runtime`` returns it when ``BUDDY_FORCE_FAKE_RUNTIME=1``. Every
    result carries the same shared fallback status.
    """

    _STATUS = RuntimeStatus("Generated by the local fallback", used_fallback=True)

    def fuse_genomes(self, parent_a: dict, parent_b: dict) -> tuple[dict, RuntimeStatus]:
        """Splice two parent genomes with the fixed-seed fallback."""
        from .genome import genome_from_dict, genome_to_dict
        from .fallback import fuse_fallback

        first = genome_from_dict(parent_a)
        second = genome_from_dict(parent_b)
        spliced = fuse_fallback(first, second, seed=1)
        return genome_to_dict(spliced), self._STATUS

    def interpret_genome_command(self, genome: dict, transcript: str) -> tuple[dict, RuntimeStatus]:
        """Apply ``transcript`` to ``genome`` with the keyword editor."""
        from .genome import genome_from_dict, genome_to_dict
        from .fallback import voice_edit_genome

        current = genome_from_dict(genome)
        updated = voice_edit_genome(current, transcript)
        return genome_to_dict(updated), self._STATUS

    def generate_creature(self, prompt: str) -> tuple[dict, list[dict], RuntimeStatus]:
        """Return the exemplar picked for ``prompt`` as the creature's boxes."""
        from .exemplars import pick_exemplar

        exemplar = pick_exemplar(prompt)
        # The name is the trimmed prompt capped at 40 characters.
        display_name = prompt.strip()[:40] or "Freeform Buddy"
        genome = {"name": display_name, "archetype": "chick", "parts": []}
        return genome, list(exemplar["boxes"]), self._STATUS

    def fuse_creatures(self, parent_a: dict, parent_b: dict) -> tuple[dict, list[dict], RuntimeStatus]:
        """Merge both parents' boxes with the fixed-seed fallback."""
        from .fallback import fuse_boxes_fallback

        merged = fuse_boxes_fallback(parent_a.get("boxes"), parent_b.get("boxes"), seed=1)
        # Child name: first 8 characters of each parent's name joined by "×".
        left = (parent_a.get("name") or "A")[:8]
        right = (parent_b.get("name") or "B")[:8]
        child = {"name": f"{left}×{right}", "archetype": "chick", "parts": []}
        return child, merged, self._STATUS

    def edit_creature(self, genome: dict, boxes: list[dict], instruction: str) -> tuple[dict, list[dict], list[str], RuntimeStatus]:
        """Snap a catalog part onto ``boxes`` according to ``instruction``.

        Same model-free path as the real runtime: the keyword parser maps the
        instruction to a catalog part + parameters and the assembler attaches
        it. An unmatched instruction defaults to horns so local UI testing
        always produces a visible change.
        """
        from .assembler import assemble_part
        from .parts import PARTS
        from .edit_parser import parse_edit
        from .fallback import suggest_tweaks_fallback

        parsed = parse_edit(instruction)
        if parsed is None:
            updated = assemble_part(boxes, "horns", PARTS["horns"]["default_anchor"])
        else:
            updated = assemble_part(
                boxes, parsed.part, parsed.anchor, parsed.scale,
                parsed.color, parsed.count, parsed.rotation,
            )
        return genome, updated, suggest_tweaks_fallback(updated), self._STATUS

    def suggest_tweaks(self, boxes: list[dict]) -> list[str]:
        """Return the deterministic fallback suggestions for ``boxes``."""
        from .fallback import suggest_tweaks_fallback

        return suggest_tweaks_fallback(boxes)
@dataclass
class LlamaRuntimeConfig:
    """Settings for the llama.cpp runtime.

    ``@dataclass`` supplies the keyword constructor that ``from_env`` relies
    on, and ``from_env`` is a classmethod because it is invoked on the class
    itself (``LlamaRuntimeConfig.from_env()``).
    """

    # Explicit local GGUF path; when set it takes precedence over repo/file.
    model_path: str | None = None
    # Hugging Face repo and filename to download when no local path is given.
    model_repo: str = FREEFORM_MODEL_REPO
    model_file: str = FREEFORM_MODEL_FILE
    # Llama constructor settings.
    n_ctx: int = 2048
    n_threads: int | None = None
    n_gpu_layers: int = -1
    flash_attn: bool = True
    timeout_seconds: float = 35.0
    # Default completion length when a call does not specify its own.
    max_tokens: int = 420

    @classmethod
    def from_env(cls) -> "LlamaRuntimeConfig":
        """Build a config from ``BUDDY_*`` environment variables.

        Unset variables fall back to the same values as the field defaults.
        ``BUDDY_FLASH_ATTN`` disables flash attention only when it is exactly
        ``"0"``. Non-numeric values for the numeric variables raise
        ``ValueError``.
        """
        return cls(
            model_path=os.getenv("BUDDY_MODEL_PATH"),
            model_repo=os.getenv("BUDDY_MODEL_REPO", FREEFORM_MODEL_REPO),
            model_file=os.getenv("BUDDY_MODEL_FILE", FREEFORM_MODEL_FILE),
            n_ctx=int(os.getenv("BUDDY_N_CTX", "2048")),
            n_threads=_optional_int(os.getenv("BUDDY_N_THREADS")),
            n_gpu_layers=int(os.getenv("BUDDY_N_GPU_LAYERS", "-1")),
            flash_attn=os.getenv("BUDDY_FLASH_ATTN", "1") != "0",
            timeout_seconds=float(os.getenv("BUDDY_TIMEOUT_SECONDS", "35")),
            max_tokens=int(os.getenv("BUDDY_MAX_TOKENS", "420")),
        )
class LlamaCppBuddyRuntime:
    """Runtime backed by a llama.cpp model, with deterministic fallbacks.

    The model-backed methods (``fuse_genomes``, ``interpret_genome_command``,
    ``fuse_creatures``, ``suggest_tweaks``) return a deterministic fallback when
    the model call raises; the reason is printed so it shows up in the logs.
    ``generate_creature`` and ``edit_creature`` never call the model.
    """

    def __init__(self, config: LlamaRuntimeConfig | None = None) -> None:
        # Use environment-derived settings when no config is supplied.
        self.config = config or LlamaRuntimeConfig.from_env()

    @staticmethod
    def _log_fallback(tag: str, exc: Exception) -> None:
        """Print why a model call was abandoned in favor of the fallback."""
        print(f"[{tag}] model call failed, using fallback: {exc!r}", flush=True)

    def fuse_genomes(self, parent_a: dict, parent_b: dict) -> tuple[dict, RuntimeStatus]:
        """Fuse two parent genomes with the model, else the fixed-seed splice."""
        from .genome import genome_from_dict, genome_to_dict, repair_genome
        from .fallback import fuse_fallback
        from .prompts import build_fuse_messages, GENOME_SCHEMA

        # Computed up front: it is both the repair baseline and the fallback.
        fb = fuse_fallback(genome_from_dict(parent_a), genome_from_dict(parent_b), seed=1)
        try:
            raw = self._chat_json(
                build_fuse_messages(parent_a, parent_b), GENOME_SCHEMA,
                max_tokens=int(os.environ.get("BUDDY_FUSE_MAX_TOKENS", "400")), temperature=0.9,
            )
            child = repair_genome(raw, fb)
            return genome_to_dict(child), RuntimeStatus("Generated by the local model")
        except Exception as exc:  # noqa: BLE001 - runtime failures must not break the demo
            self._log_fallback("fuse-genomes", exc)
            return genome_to_dict(fb), RuntimeStatus("Used the fallback splice", used_fallback=True)

    def interpret_genome_command(self, genome: dict, transcript: str) -> tuple[dict, RuntimeStatus]:
        """Apply ``transcript`` to ``genome`` with the model, else the keyword editor."""
        from .genome import genome_from_dict, genome_to_dict, repair_genome
        from .fallback import voice_edit_genome
        from .prompts import build_genome_command_messages, GENOME_EDIT_SCHEMA

        base = genome_from_dict(genome)
        base_out = genome_to_dict(base)
        fb = genome_to_dict(voice_edit_genome(base, transcript))
        try:
            edit = self._chat_json(
                build_genome_command_messages(genome, transcript), GENOME_EDIT_SCHEMA,
                max_tokens=int(os.environ.get("BUDDY_EDIT_MAX_TOKENS", "200")), temperature=0.5,
            ) or {}
            # Fields the model left empty keep the original genome's value.
            merged = {**genome,
                      "parts": (edit.get("parts") or genome.get("parts")),
                      "palette": (edit.get("palette") or genome.get("palette")),
                      "archetype": (edit.get("archetype") or genome.get("archetype"))}
            model_out = genome_to_dict(repair_genome(merged, base))
            # A small model often returns a valid-but-empty edit (no parts /
            # palette / archetype), which silently leaves the creature untouched
            # so the user sees no reaction. When the model changed nothing
            # visible but the deterministic keyword editor did, surface that
            # instead so a tweak always lands.
            if _same_look(model_out, base_out) and not _same_look(fb, base_out):
                return fb, RuntimeStatus("Used the keyword editor", used_fallback=True)
            return model_out, RuntimeStatus("Generated by the local model")
        except Exception as exc:  # noqa: BLE001 - runtime failures must not break the demo
            self._log_fallback("genome-command", exc)
            return fb, RuntimeStatus("Used the keyword editor", used_fallback=True)

    def generate_creature(self, prompt: str) -> tuple[dict, list[dict], RuntimeStatus]:
        """Return the exemplar picked for ``prompt``; never calls the model.

        Model-free by design: the bench's from-scratch box must never call the
        model — only Splice (``fuse_creatures``) does. This is the same
        deterministic path the fake runtime uses, so real and fake agree.
        """
        from .exemplars import pick_exemplar

        ex = pick_exemplar(prompt)
        genome = {"name": prompt.strip()[:40] or "Freeform Buddy", "archetype": "chick", "parts": []}
        return genome, list(ex["boxes"]), RuntimeStatus("Built from a reference template")

    def fuse_creatures(self, parent_a: dict, parent_b: dict) -> tuple[dict, list[dict], RuntimeStatus]:
        """Splice two creatures with the model, else the fixed-seed box fallback.

        The fallback is also used when the model answers but no boxes survive
        ``repair_boxes``.
        """
        import json
        from .exemplars import pick_exemplar
        from .boxes import repair_boxes
        from .fallback import fuse_boxes_fallback
        from .prompts import build_freeform_fuse_messages, BOX_SCHEMA

        ex = pick_exemplar((parent_a.get("name") or "") + " " + (parent_b.get("name") or ""))
        name = f"{(parent_a.get('name') or 'A')[:8]}×{(parent_b.get('name') or 'B')[:8]}"
        fb_boxes = fuse_boxes_fallback(parent_a.get("boxes"), parent_b.get("boxes"), seed=1)
        try:
            raw = self._chat_json(
                build_freeform_fuse_messages(parent_a, parent_b, json.dumps(ex)), BOX_SCHEMA,
                max_tokens=int(os.environ.get("BUDDY_FREEFORM_MAX_TOKENS", "1000")),
                temperature=0.9,
                heavy=True,
            ) or {}
            boxes = repair_boxes(raw.get("boxes") if isinstance(raw, dict) else None)
            child_name = (raw.get("name") if isinstance(raw, dict) else None) or name
            if boxes:
                return {"name": child_name, "archetype": "chick", "parts": []}, boxes, RuntimeStatus("Generated by the local model")
        except Exception as exc:  # noqa: BLE001 - runtime failures must not break the demo
            self._log_fallback("fuse-creatures", exc)
        return {"name": name, "archetype": "chick", "parts": []}, fb_boxes, RuntimeStatus("Used a reference template", used_fallback=True)

    def edit_creature(self, genome: dict, boxes: list[dict], instruction: str) -> tuple[dict, list[dict], list[str], RuntimeStatus]:
        """Snap a catalog part onto ``boxes``; never calls the model.

        Model-free by design: an English keyword parser maps the instruction to
        a catalog part + parameters and the assembler attaches it
        deterministically. An instruction that matches no catalog part defaults
        to horns, so a tweak always produces a visible change (mirrors the fake
        runtime).
        """
        from .assembler import assemble_part
        from .parts import PARTS
        from .edit_parser import parse_edit
        from .fallback import suggest_tweaks_fallback

        spec = parse_edit(instruction)
        if spec is not None:
            print(f"[edit] part={spec.part} anchor={spec.anchor} count={spec.count} "
                  f"scale={spec.scale} rot={spec.rotation}", flush=True)
            new_boxes = assemble_part(boxes, spec.part, spec.anchor, spec.scale,
                                      spec.color, spec.count, spec.rotation)
        else:
            new_boxes = assemble_part(boxes, "horns", PARTS["horns"]["default_anchor"])
        return genome, new_boxes, suggest_tweaks_fallback(new_boxes), RuntimeStatus("Built from the part catalog")

    def suggest_tweaks(self, boxes: list[dict]) -> list[str]:
        """Return up to four tweak suggestions for ``boxes``.

        Model suggestions are stripped of blanks and padded or trimmed to four
        using the fallback list. If the model call fails or yields nothing
        usable, the fallback list is returned unchanged. Should the fallback
        list itself be empty, the model's suggestions are still returned
        (possibly fewer than four) rather than being discarded.
        """
        from .fallback import suggest_tweaks_fallback
        from .prompts import build_freeform_suggest_messages, TWEAK_SCHEMA

        clean: list[str] = []
        try:
            raw = self._chat_json(
                build_freeform_suggest_messages(boxes), TWEAK_SCHEMA,
                max_tokens=int(os.environ.get("BUDDY_SUGGEST_MAX_TOKENS", "200")),
                temperature=0.9,
            ) or {}
            tweaks = raw.get("tweaks") if isinstance(raw, dict) else None
            if isinstance(tweaks, list):
                clean = [str(t).strip() for t in tweaks if str(t).strip()]
        except Exception as exc:  # noqa: BLE001 - suggestions are advisory; never break the bench
            self._log_fallback("suggest-tweaks", exc)
            clean = []

        fill = suggest_tweaks_fallback(boxes)
        if not clean:
            return fill

        # Pad to four: prefer fallback ideas not already present, and only
        # repeat one when every fallback idea is already in the list.
        while len(clean) < 4 and fill:
            for candidate in fill:
                if candidate not in clean:
                    clean.append(candidate)
                    break
            else:
                clean.append(fill[len(clean) % len(fill)])
        return clean[:4]

    def _chat_json(
        self,
        messages: list[dict[str, str]],
        schema: dict[str, Any],
        *,
        max_tokens: int | None = None,
        temperature: float = 0.85,
        heavy: bool = False,
    ) -> dict[str, Any] | None:
        """Run one completion and parse its text with ``parse_json_object``.

        Args:
            messages: Chat messages for the model.
            schema: JSON schema used to constrain the output.
            max_tokens: Completion length; falls back to ``config.max_tokens``
                when ``None`` or 0.
            temperature: Sampling temperature.
            heavy: Route the call through the longer-duration GPU wrapper.
        """
        # Resolve the model file outside the GPU call (hf_hub_download caches
        # to disk), then run inference inside the @spaces.GPU-wrapped helper so
        # ZeroGPU attaches a GPU for the duration of the completion. heavy=True
        # routes the long-running 12B freeform calls to the larger-duration GPU
        # wrapper so they do not race the ZeroGPU detach and crash mid-decode.
        model_path = self._resolve_model_path()
        run = _run_freeform_completion if heavy else _run_completion
        # On the freeform box path, BUDDY_FREEFORM_NO_GRAMMAR=1 drops the GBNF
        # grammar: it ~2x's 12B decode (the grammar is intrinsic per-token CPU
        # overhead, not schema bloat) while parse_json_object + repair_boxes
        # catch the now-unconstrained output. Short (non-heavy) calls always
        # keep the grammar. See wiki/deployment-strategy.md.
        effective_schema = schema
        if heavy and os.getenv("BUDDY_FREEFORM_NO_GRAMMAR") == "1":
            effective_schema = None
        content = run(
            model_path=model_path,
            messages=messages,
            schema=effective_schema,
            max_tokens=max_tokens or self.config.max_tokens,
            temperature=temperature,
            top_p=0.9,
            # Real calls always get at least a 4096-token context.
            n_ctx=max(self.config.n_ctx, 4096),
            n_threads=self.config.n_threads,
            n_gpu_layers=self.config.n_gpu_layers,
            flash_attn=self.config.flash_attn,
        )
        return parse_json_object(content)

    def _resolve_model_path(self) -> str:
        """Return the GGUF path: the explicit local one, else a hub download."""
        # An explicit local path wins (local dev / tests); otherwise download
        # the configured GGUF (hf_hub_download caches to disk). A download
        # failure raises, and each model-backed method then falls back to its
        # deterministic result, so there is no second model to fall back to.
        if self.config.model_path:
            return self.config.model_path
        from huggingface_hub import hf_hub_download

        return hf_hub_download(repo_id=self.config.model_repo, filename=self.config.model_file)
def default_runtime() -> BuddyRuntime:
    """Pick the runtime for this process.

    Returns the fake runtime when ``BUDDY_FORCE_FAKE_RUNTIME`` is exactly
    ``"1"``; otherwise the llama.cpp runtime configured from the environment.
    """
    force_fake = os.getenv("BUDDY_FORCE_FAKE_RUNTIME") == "1"
    return FakeBuddyRuntime() if force_fake else LlamaCppBuddyRuntime()
# Set once a warmup has been claimed; reset if that warmup fails.
_warmed = False


def warmup() -> None:
    """Run one tiny completion so the one-time CUDA kernel JIT compile is paid early.

    Wired to Gradio's ``demo.load`` so it fires when the page opens: the ~48s
    first-call compile happens while the player reads the intro, so their first
    real action is already warm. Runs once per container and is safe to call
    repeatedly. Does nothing when ``BUDDY_FORCE_FAKE_RUNTIME=1``. Any failure
    is printed and swallowed, and the next call will try again.
    """
    global _warmed
    if _warmed:
        return
    if os.getenv("BUDDY_FORCE_FAKE_RUNTIME") == "1":
        return

    # Claim the flag before doing the work so concurrent page loads do not
    # each start their own warmup.
    _warmed = True
    try:
        cfg = LlamaRuntimeConfig.from_env()
        # Warm the same model file that the real model-backed actions load.
        model_path = LlamaCppBuddyRuntime(cfg)._resolve_model_path()
        warm_request = {
            "model_path": model_path,
            "messages": [{"role": "user", "content": "warmup"}],
            "schema": None,
            "max_tokens": 8,
            "temperature": 0.0,
            "top_p": 1.0,
            "n_ctx": cfg.n_ctx,
            "n_threads": cfg.n_threads,
            "n_gpu_layers": cfg.n_gpu_layers,
            "flash_attn": cfg.flash_attn,
        }
        _warmup_completion(**warm_request)
        # Optional one-off diagnostic, enabled only through the environment.
        if os.getenv("BUDDY_GRAMMAR_BENCH") == "1":
            _grammar_bench(cfg, model_path)
    except Exception as exc:  # noqa: BLE001 - warmup is best-effort
        _warmed = False  # allow a retry on the next page load
        print(f"[zerogpu-warmup] failed: {exc}", flush=True)
def _grammar_bench(cfg: "LlamaRuntimeConfig", model_path: str) -> None:
    """One-off diagnostic: time the freeform prompt with NO GBNF grammar.

    The point is to isolate what grammar-constrained sampling costs compared
    with raw 12B decode. Only the grammar-OFF run lives here: grammar-ON
    throughput is already the normal [zerogpu-bench] number every real freeform
    call prints (~17-22 tok/s), so it is not re-measured. Running BOTH inside
    the one demo.load handler stacked a 3rd GPU acquisition after the warmup
    and ZeroGPU aborted it ("GPU task aborted"); keeping the handler to 2
    acquisitions (warmup + this) is in the range the log shows succeeds.

    Uses the same n_ctx=4096 / freeform GPU wrapper as the real calls so the
    number transfers directly. Gated by BUDDY_GRAMMAR_BENCH=1: read
    [grammar-bench-off] off the log, compare it against the [zerogpu-bench]
    (grammar-on) lines, then unset the env. Failures are printed, never raised.
    """
    import json
    from .exemplars import pick_exemplar
    from .prompts import build_freeform_messages

    bench_prompt = "a small round dragon with two curved horns and a long tail"
    exemplar = pick_exemplar(bench_prompt)
    try:
        # Built inside the try so a bad env value or prompt-building error is
        # reported the same way as a failed completion.
        bench_request = {
            "model_path": model_path,
            "messages": build_freeform_messages(bench_prompt, json.dumps(exemplar)),
            "schema": None,
            "max_tokens": int(os.getenv("BUDDY_GRAMMAR_BENCH_TOKENS", "400")),
            "temperature": 0.8,
            "top_p": 0.9,
            "n_ctx": max(cfg.n_ctx, 4096),
            "n_threads": cfg.n_threads,
            "n_gpu_layers": cfg.n_gpu_layers,
            "flash_attn": cfg.flash_attn,
            "label": "grammar-bench-off",
        }
        _run_freeform_completion(**bench_request)
    except Exception as exc:  # noqa: BLE001 - diagnostic only, never break warmup
        print(f"[grammar-bench] failed: {exc}", flush=True)
def _optional_int(value: str | None) -> int | None:
    """Parse an optional integer setting.

    ``None`` and the empty string both mean "not set" and yield ``None``; any
    other value goes through ``int()``, which raises ``ValueError`` on
    non-numeric text.
    """
    return None if value in (None, "") else int(value)