Spaces:
Running
Running
"""Eval harness: the bar a molt must clear before an adapter swap.

Programmatic, judge-free metrics on the held-out set (data/eval.jsonl;
pass --set sft to run data/sft.jsonl instead):
  - fact_retention — numbers, filenames, proper nouns from the event survive
  - no_banned — zero corporate-assistant phrases
  - length_ok — speakable: ≤ 280 chars
  - clean_shape — one utterance: no preamble, no JSON, no paragraph breaks
  - first_person — Puck never names himself (tokens copied from the event
    title, such as "#puck-build", do not count)
  - register_ok — at mischief ≤ 20: no exclamation marks, no mythic vocabulary

Also reports mean output length per mischief level — the register
calibration curve (plain should be shorter than mythic) at a glance.

Usage:
    PUCK_BRAIN_URL=http://127.0.0.1:11434/v1 \
    PUCK_BRAIN_MODEL=hf.co/mradermacher/Holo-3.1-4B-GGUF:Q8_0 \
    uv run eval.py [--n 999] [--k 3] [--tag baseline] [--set eval|sft]

Writes data/eval_report_<tag>.json with per-sample outputs for human reading —
the metrics gate the swap, but read the samples; charm isn't programmatic.
"""
| import argparse | |
| import json | |
| import re | |
| import statistics | |
| import sys | |
| from pathlib import Path | |
| HERE = Path(__file__).resolve().parent | |
| ROOT = HERE.parent | |
| sys.path.insert(0, str(ROOT / "server")) | |
| from brain import BRAIN_MODEL, BRAIN_URL, _chat_completion # noqa: E402 | |
# Corporate-assistant phrases, written in lowercase; any one of them in an
# output fails the no_banned metric.
BANNED = [
    "i'd be happy",
    "i would be happy",
    "as an ai",
    "great question",
    "certainly!",
    "i cannot assist",
    "how can i help",
    "feel free to",
    "let me know if",
]

# Longest output, in characters, that still counts as speakable.
MAX_CHARS = 280

# The mythic register's vocabulary — fine at high mischief, a calibration
# failure at low. Derived from the deck's own tier lines.
MYTHIC_WORDS = (
    "goblin bog spell scroll mist ritual omen "
    "spirit forge realm bloom nest whisper"
).split()
def facts(title: str) -> list[str]:
    """Pull out the concrete tokens that whimsy must not eat.

    Three kinds of token are collected from *title*:
      - digit runs, optionally extended to a duration such as "3m 12s";
      - filename-like tokens with one or two short lowercase extensions
        ("file.ts", "auth.test.ts");
      - capitalised words of three or more letters, except the filler
        words "The", "New" and "Your".

    Returns the distinct tokens as a sorted list (empty when none are found).
    """
    filler = {"The", "New", "Your"}
    token_patterns = (
        r"\d+(?:m \d+s)?",  # counts, durations
        r"[\w-]+\.[a-z]{1,4}(?:\.[a-z]{1,4})?",  # file.ts, auth.test.ts
    )

    found: set[str] = set()
    for pattern in token_patterns:
        found.update(re.findall(pattern, title))
    for word in re.findall(r"\b[A-Z][a-z]{2,}\b", title):
        if word not in filler:
            found.add(word)
    return sorted(found)
def _fact_survives(fact: str, low: str) -> bool:
    """Return True when *fact* occurs in the lowercased output *low*.

    A fact that starts or ends with a digit must not be glued to further
    digits in the output: "2" is not retained by an output that only says
    "12", and "12" is not retained by "120".
    """
    pattern = re.escape(fact.lower())
    if fact[0].isdecimal():
        pattern = r"(?<!\d)" + pattern
    if fact[-1].isdecimal():
        pattern += r"(?!\d)"
    return re.search(pattern, low) is not None


def score_one(user: str, output: str, mischief: int | None) -> dict:
    """Score one generated utterance against the programmatic gate.

    Args:
        user: the user prompt; the event title is taken from its
            "What happened: " line (empty title when there is no such line).
        output: the model's utterance.
        mischief: the sample's mischief level, or None when it has none;
            the register check only applies at 20 or below.

    Returns:
        A dict with fact_retention (0.0-1.0; 1.0 when the title holds no
        facts), facts_missing, the booleans no_banned, length_ok,
        clean_shape, first_person and register_ok, and chars (output length).
    """
    prefix = "What happened: "
    title = next((ln.removeprefix(prefix) for ln in user.splitlines() if ln.startswith(prefix)), "")
    low = output.lower()

    fs = facts(title)
    missing = [f for f in fs if not _fact_survives(f, low)]

    # Puck speaks AS himself; naming himself = narrator voice break.
    # Strip title-derived tokens first — retaining "#puck-build" is a fact, not a voice break.
    voice_text = low
    for tok in title.lower().split():
        # Drop sentence punctuation stuck to the token, so a title ending in
        # "#puck-build." still strips "#puck-build" from the output.
        tok = tok.strip(".,;:!?\"'()")
        if "puck" in tok:
            voice_text = voice_text.replace(tok, "")
    first_person = "puck" not in voice_text

    # low mischief must be sober: no exclamations, no mythic lexicon
    register_ok = True
    if mischief is not None and mischief <= 20:
        # Compare whole words (plus plain inflections) rather than substrings:
        # "honest", "mistake" and "forget" are not mythic vocabulary.
        words = set(re.findall(r"[^\W\d_]+", low))
        mythic = any(w + suffix in words for w in MYTHIC_WORDS for suffix in ("", "s", "d", "ed", "ing"))
        register_ok = "!" not in output and not mythic

    return {
        "fact_retention": (len(fs) - len(missing)) / len(fs) if fs else 1.0,
        "facts_missing": missing,
        "no_banned": not any(b in low for b in BANNED),
        "length_ok": len(output) <= MAX_CHARS,
        # "{" anywhere (including at the start) means JSON leaked into the utterance.
        "clean_shape": "\n\n" not in output and not low.startswith(("here", "sure", "okay,")) and "{" not in output,
        "first_person": first_person,
        "register_ok": register_ok,
        "chars": len(output),
    }
def _aggregate(samples: list[dict]) -> dict:
    """Fold per-generation scores into the aggregate metrics (samples must be non-empty)."""
    total = len(samples)

    def rate(key: str) -> float:
        # Share of generations for which the boolean metric *key* is True.
        return sum(s[key] for s in samples) / total

    # Output lengths grouped by mischief level; samples without a level are left out.
    chars_by_mischief: dict = {}
    for s in samples:
        level = s["meta"].get("mischief")
        if level is not None:
            chars_by_mischief.setdefault(level, []).append(s["chars"])

    return {
        "n": total,
        "fact_retention": round(statistics.mean(s["fact_retention"] for s in samples), 3),
        "no_banned": rate("no_banned"),
        "length_ok": rate("length_ok"),
        "clean_shape": rate("clean_shape"),
        "first_person": rate("first_person"),
        "register_ok": rate("register_ok"),
        "mean_chars_by_mischief": {
            str(level): round(statistics.mean(chars_by_mischief[level]))
            for level in sorted(chars_by_mischief)
        },
    }


def main() -> None:
    """Generate and score every sample, print progress, and write the JSON report.

    Reads data/<set>.jsonl next to this file, runs --k generations for each of
    the first --n samples at temperature 0.7, scores each with score_one, and
    writes data/eval_report_<tag>.json holding the aggregate and all samples.
    Exits with a message when there is nothing to evaluate.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--n", type=int, default=999, help="max samples")
    ap.add_argument("--k", type=int, default=3, help="generations per sample — n=6 at temp 0.7 is noise; k repeats make the gate stable")
    ap.add_argument("--tag", default="baseline", help="report filename tag")
    ap.add_argument("--set", default="eval", choices=["eval", "sft"], help="which split to run")
    args = ap.parse_args()

    data_path = HERE / "data" / f"{args.set}.jsonl"
    # Explicit UTF-8: the data and the report carry non-ASCII text, and the
    # platform default encoding is not UTF-8 everywhere. Blank lines are
    # skipped instead of crashing json.loads.
    rows = [json.loads(line) for line in data_path.read_text(encoding="utf-8").splitlines() if line.strip()]
    rows = rows[: args.n] * args.k
    if not rows:
        # Without this, the aggregate step fails on an empty mean / division by zero.
        sys.exit(f"no samples to evaluate from {data_path} (empty file, or --n/--k below 1)")
    print(f"evaluating {len(rows)} generations ({args.k} per sample) against {BRAIN_URL} ({BRAIN_MODEL})\n")

    samples = []
    for i, ex in enumerate(rows):
        meta = ex["meta"]
        system, user, gold = (m["content"] for m in ex["messages"])
        output = _chat_completion(system, user, temperature=0.7)
        s = score_one(user, output, meta.get("mischief"))
        samples.append({"meta": meta, "user": user, "gold": gold, "output": output, **s})

        passed = all((s["fact_retention"] == 1, s["no_banned"], s["length_ok"], s["clean_shape"], s["first_person"], s["register_ok"]))
        flag = "" if passed else " ⚠"
        # Look up "kind" only when "event" is absent: a dict.get default is
        # evaluated eagerly and would raise KeyError on rows that have no "kind".
        label = meta["event"] if "event" in meta else meta.get("kind")
        print(f"[{i + 1}/{len(rows)}] {label} m={meta.get('mischief')}{flag}")
        print(f" {output[:160]}")

    agg = _aggregate(samples)
    report = {"brain": {"url": BRAIN_URL, "model": BRAIN_MODEL}, "aggregate": agg, "samples": samples}
    out = HERE / "data" / f"eval_report_{args.tag}.json"
    # ensure_ascii=False keeps the samples readable, so the file must be written as UTF-8.
    out.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")

    print(f"\n=== aggregate ({args.tag}) ===")
    print(json.dumps(agg, indent=2))
    print(f"\nreport: {out}")


if __name__ == "__main__":
    main()