"""Eval harness: the bar a molt must clear before an adapter swap. Programmatic, judge-free metrics on the held-out set (data/eval.jsonl): - fact_retention — numbers, filenames, proper nouns from the event survive - no_banned — zero corporate-assistant phrases - length_ok — speakable: ≤ 280 chars - clean_shape — one utterance: no preamble, no JSON, no paragraph breaks Also reports mean output length per mischief level — the register calibration curve (plain should be shorter than mythic) at a glance. Usage: PUCK_BRAIN_URL=http://127.0.0.1:11434/v1 \ PUCK_BRAIN_MODEL=hf.co/mradermacher/Holo-3.1-4B-GGUF:Q8_0 \ uv run eval.py [--n 999] [--tag baseline] Writes data/eval_report_.json with per-sample outputs for human reading — the metrics gate the swap, but read the samples; charm isn't programmatic. """ import argparse import json import re import statistics import sys from pathlib import Path HERE = Path(__file__).resolve().parent ROOT = HERE.parent sys.path.insert(0, str(ROOT / "server")) from brain import BRAIN_MODEL, BRAIN_URL, _chat_completion # noqa: E402 BANNED = [ "i'd be happy", "i would be happy", "as an ai", "great question", "certainly!", "i cannot assist", "how can i help", "feel free to", "let me know if", ] MAX_CHARS = 280 # the mythic register's vocabulary — fine at high mischief, a calibration # failure at low. Derived from the deck's own tier lines. MYTHIC_WORDS = ["goblin", "bog", "spell", "scroll", "mist", "ritual", "omen", "spirit", "forge", "realm", "bloom", "nest", "whisper"] def facts(title: str) -> list[str]: """The concrete tokens whimsy must not eat: numbers, filenames, proper nouns.""" out = set(re.findall(r"\d+(?:m \d+s)?", title)) # counts, durations out |= set(re.findall(r"[\w-]+\.[a-z]{1,4}(?:\.[a-z]{1,4})?", title)) # file.ts, auth.test.ts out |= {w for w in re.findall(r"\b[A-Z][a-z]{2,}\b", title) if w not in {"The", "New", "Your"}} return sorted(out) def score_one(user: str, output: str, mischief: int | None) -> dict: title = next((ln.removeprefix("What happened: ") for ln in user.splitlines() if ln.startswith("What happened: ")), "") fs = facts(title) kept = [f for f in fs if f.lower() in output.lower()] low = output.lower() # Puck speaks AS himself; naming himself = narrator voice break. # Strip title-derived tokens first — retaining "#puck-build" is a fact, not a voice break. voice_text = low for tok in title.lower().split(): if "puck" in tok: voice_text = voice_text.replace(tok, "") first_person = "puck" not in voice_text # low mischief must be sober: no exclamations, no mythic lexicon register_ok = True if mischief is not None and mischief <= 20: register_ok = "!" not in output and not any(w in low for w in MYTHIC_WORDS) return { "fact_retention": len(kept) / len(fs) if fs else 1.0, "facts_missing": [f for f in fs if f.lower() not in low], "no_banned": not any(b in low for b in BANNED), "length_ok": len(output) <= MAX_CHARS, "clean_shape": "\n\n" not in output and not low.startswith(("here", "sure", "okay,", "{")) and "{" not in output, "first_person": first_person, "register_ok": register_ok, "chars": len(output), } def main() -> None: ap = argparse.ArgumentParser() ap.add_argument("--n", type=int, default=999, help="max samples") ap.add_argument("--k", type=int, default=3, help="generations per sample — n=6 at temp 0.7 is noise; k repeats make the gate stable") ap.add_argument("--tag", default="baseline", help="report filename tag") ap.add_argument("--set", default="eval", choices=["eval", "sft"], help="which split to run") args = ap.parse_args() rows = [json.loads(line) for line in (HERE / "data" / f"{args.set}.jsonl").read_text().splitlines()] rows = rows[: args.n] * args.k print(f"evaluating {len(rows)} generations ({args.k} per sample) against {BRAIN_URL} ({BRAIN_MODEL})\n") samples = [] for i, ex in enumerate(rows): system, user, gold = (m["content"] for m in ex["messages"]) output = _chat_completion(system, user, temperature=0.7) s = score_one(user, output, ex["meta"].get("mischief")) samples.append({"meta": ex["meta"], "user": user, "gold": gold, "output": output, **s}) flag = "" if all((s["fact_retention"] == 1, s["no_banned"], s["length_ok"], s["clean_shape"], s["first_person"], s["register_ok"])) else " ⚠" print(f"[{i + 1}/{len(rows)}] {ex['meta'].get('event', ex['meta']['kind'])} m={ex['meta'].get('mischief')}{flag}") print(f" {output[:160]}") agg = { "n": len(samples), "fact_retention": round(statistics.mean(s["fact_retention"] for s in samples), 3), "no_banned": sum(s["no_banned"] for s in samples) / len(samples), "length_ok": sum(s["length_ok"] for s in samples) / len(samples), "clean_shape": sum(s["clean_shape"] for s in samples) / len(samples), "first_person": sum(s["first_person"] for s in samples) / len(samples), "register_ok": sum(s["register_ok"] for s in samples) / len(samples), "mean_chars_by_mischief": { str(m): round(statistics.mean(s["chars"] for s in samples if s["meta"].get("mischief") == m)) for m in sorted({s["meta"].get("mischief") for s in samples if s["meta"].get("mischief") is not None}) }, } report = {"brain": {"url": BRAIN_URL, "model": BRAIN_MODEL}, "aggregate": agg, "samples": samples} out = HERE / "data" / f"eval_report_{args.tag}.json" out.write_text(json.dumps(report, indent=2, ensure_ascii=False)) print(f"\n=== aggregate ({args.tag}) ===") print(json.dumps(agg, indent=2)) print(f"\nreport: {out}") if __name__ == "__main__": main()