# puck/molt/eval.py
# Puck — desktop fairy familiar (HF Build Small)
# Source: Hugging Face file view (vu1n), commit 3c124f3, 5.92 kB
"""Eval harness: the bar a molt must clear before an adapter swap.
Programmatic, judge-free metrics on the held-out set (data/eval.jsonl):
- fact_retention — numbers, filenames, proper nouns from the event survive
- no_banned — zero corporate-assistant phrases
- length_ok — speakable: ≤ 280 chars
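- clean_shape — one utterance: no preamble, no JSON, no paragraph breaks
- first_person — Puck never names himself (tokens copied from the event title are exempt)
- register_ok — at mischief ≤ 20: no exclamation marks, no mythic vocabulary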
Also reports mean output length per mischief level — the register
calibration curve (plain should be shorter than mythic) at a glance.
Usage:
PUCK_BRAIN_URL=http://127.0.0.1:11434/v1 \
PUCK_BRAIN_MODEL=hf.co/mradermacher/Holo-3.1-4B-GGUF:Q8_0 \
uv run eval.py [--n 999] [--k 3] [--set eval|sft] [--tag baseline]
Writes data/eval_report_<tag>.json with per-sample outputs for human reading —
the metrics gate the swap, but read the samples; charm isn't programmatic.
"""
import argparse
import json
import re
import statistics
import sys
from pathlib import Path
# Directory containing this file; the data/ folder is resolved relative to it.
HERE = Path(__file__).resolve().parent
# Parent of HERE; used here only to locate the server/ directory.
ROOT = HERE.parent
# Put ROOT/server at the front of sys.path so the `brain` import below can
# resolve (presumably brain.py lives there — confirm). This must run before
# the import, which is why the import carries an E402 suppression.
sys.path.insert(0, str(ROOT / "server"))
from brain import BRAIN_MODEL, BRAIN_URL, _chat_completion # noqa: E402
# Corporate-assistant phrases. Stored lowercase because they are matched
# against the lowercased output; any single hit fails the `no_banned` check.
BANNED = [
    "i'd be happy", "i would be happy",
    "as an ai",
    "great question",
    "certainly!",
    "i cannot assist",
    "how can i help",
    "feel free to",
    "let me know if",
]

# Upper bound on output length (in characters) for the `length_ok` check.
MAX_CHARS = 280

# Vocabulary of the mythic register: acceptable at high mischief, counted as
# a calibration failure at low mischief. Derived from the deck's own tier lines.
MYTHIC_WORDS = [
    "goblin",
    "bog",
    "spell",
    "scroll",
    "mist",
    "ritual",
    "omen",
    "spirit",
    "forge",
    "realm",
    "bloom",
    "nest",
    "whisper",
]
def facts(title: str) -> list[str]:
    """Extract the concrete tokens of an event title that an output must keep.

    Three kinds of token are collected:

    * digit runs, optionally extended to a ``<N>m <N>s`` duration;
    * filename-like tokens (``file.ts``, ``auth.test.ts``);
    * capitalised words of three or more letters, except "The", "New"
      and "Your".

    Returns the distinct tokens in sorted order; an empty list when the
    title contains none.
    """
    ignored_capitals = ("The", "New", "Your")

    numeric = re.findall(r"\d+(?:m \d+s)?", title)
    filenames = re.findall(r"[\w-]+\.[a-z]{1,4}(?:\.[a-z]{1,4})?", title)
    capitalised = [
        word
        for word in re.findall(r"\b[A-Z][a-z]{2,}\b", title)
        if word not in ignored_capitals
    ]

    # A set literal de-duplicates across all three groups before sorting.
    return sorted({*numeric, *filenames, *capitalised})
def _fact_retained(fact: str, text: str) -> bool:
    """Return True when *fact* occurs in *text* as its own token.

    Both arguments are expected to be lowercased already. A fact that
    starts or ends with a digit must not be glued to further digits, so
    "3" is not considered retained by "13" or "30".
    """
    pattern = re.escape(fact)
    if fact[0].isdigit():
        pattern = r"(?<!\d)" + pattern
    if fact[-1].isdigit():
        pattern += r"(?!\d)"
    return re.search(pattern, text) is not None


def _uses_mythic_lexicon(text: str) -> bool:
    """Return True when lowercased *text* contains a MYTHIC_WORDS entry as a word.

    Matching is anchored on word boundaries so unrelated words that merely
    contain an entry ("moment" / omen, "honest" / nest, "mistake" / mist,
    "bogus" / bog, "forget" / forge) no longer trip the check. Simple
    inflections (-s, -d, -ed, -ing) are still accepted so "goblins" or
    "whispered" keep counting as mythic.
    """
    words = "|".join(re.escape(w) for w in MYTHIC_WORDS)
    return re.search(rf"\b(?:{words})(?:s|d|ed|ing)?\b", text) is not None


def score_one(user: str, output: str, mischief: int | None) -> dict:
    """Score one generation against the programmatic checks.

    Args:
        user: the user prompt; the event title is taken from its first line
            starting with "What happened: " (empty title if there is none).
        output: the generated utterance to score.
        mischief: mischief level of the sample, or None when not recorded.

    Returns:
        A dict with:
        - "fact_retention": fraction of title facts found in the output
          (1.0 when the title yields no facts);
        - "facts_missing": the title facts not found in the output;
        - "no_banned": no BANNED phrase occurs in the output;
        - "length_ok": output is at most MAX_CHARS characters;
        - "clean_shape": no blank-line paragraph break, no "{", and the
          output does not open with "here", "sure" or "okay,";
        - "first_person": "puck" does not occur outside title-derived tokens;
        - "register_ok": at mischief <= 20, no "!" and no mythic word
          (always True otherwise);
        - "chars": length of the output.
    """
    title = next(
        (
            ln.removeprefix("What happened: ")
            for ln in user.splitlines()
            if ln.startswith("What happened: ")
        ),
        "",
    )
    fs = facts(title)
    low = output.lower()

    # Computed once; both the ratio and the missing list derive from it.
    missing = [f for f in fs if not _fact_retained(f.lower(), low)]

    # Puck speaks AS himself; naming himself = narrator voice break.
    # Strip title-derived tokens first — retaining "#puck-build" is a fact,
    # not a voice break.
    voice_text = low
    for tok in title.lower().split():
        if "puck" in tok:
            voice_text = voice_text.replace(tok, "")
    first_person = "puck" not in voice_text

    # Low mischief must be sober: no exclamations, no mythic lexicon.
    register_ok = True
    if mischief is not None and mischief <= 20:
        register_ok = "!" not in output and not _uses_mythic_lexicon(low)

    # Leading whitespace must not hide a preamble such as " Sure, ...".
    # A leading "{" needs no separate test: any "{" already fails the check.
    opening = low.lstrip()
    clean_shape = (
        "\n\n" not in output
        and "{" not in output
        and not opening.startswith(("here", "sure", "okay,"))
    )

    return {
        "fact_retention": (len(fs) - len(missing)) / len(fs) if fs else 1.0,
        "facts_missing": missing,
        "no_banned": not any(b in low for b in BANNED),
        "length_ok": len(output) <= MAX_CHARS,
        "clean_shape": clean_shape,
        "first_person": first_person,
        "register_ok": register_ok,
        "chars": len(output),
    }
def main() -> None:
    """Generate, score and report on the chosen data split.

    Reads ``data/<set>.jsonl`` next to this file, keeps the first ``--n``
    rows and repeats them ``--k`` times, asks the brain for one completion
    per row (temperature 0.7), scores each with ``score_one``, prints a
    per-generation line plus the aggregate, and writes
    ``data/eval_report_<tag>.json`` with the aggregate and every sample.

    Each row is expected to carry a three-entry "messages" list
    (system, user, gold — in that order) and a "meta" dict.

    Raises:
        SystemExit: when there is nothing to evaluate (empty file,
            ``--n 0`` or ``--k 0``).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--n", type=int, default=999, help="max samples")
    ap.add_argument("--k", type=int, default=3, help="generations per sample — n=6 at temp 0.7 is noise; k repeats make the gate stable")
    ap.add_argument("--tag", default="baseline", help="report filename tag")
    ap.add_argument("--set", default="eval", choices=["eval", "sft"], help="which split to run")
    args = ap.parse_args()

    data_dir = HERE / "data"
    # Explicit UTF-8 so the run does not depend on the platform's locale encoding.
    raw = (data_dir / f"{args.set}.jsonl").read_text(encoding="utf-8")
    # Blank lines are not JSON documents; skip them instead of crashing.
    rows = [json.loads(line) for line in raw.splitlines() if line.strip()]
    rows = rows[: args.n] * args.k
    if not rows:
        # The aggregate below would otherwise fail on an empty sample list.
        raise SystemExit(f"no samples to evaluate in {args.set}.jsonl (n={args.n}, k={args.k})")
    print(f"evaluating {len(rows)} generations ({args.k} per sample) against {BRAIN_URL} ({BRAIN_MODEL})\n")

    samples = []
    for i, ex in enumerate(rows):
        meta = ex["meta"]
        system, user, gold = (m["content"] for m in ex["messages"])
        # presumably returns the generated text as a str — confirm in brain.py
        output = _chat_completion(system, user, temperature=0.7)
        s = score_one(user, output, meta.get("mischief"))
        samples.append({"meta": meta, "user": user, "gold": gold, "output": output, **s})
        passed = all((s["fact_retention"] == 1, s["no_banned"], s["length_ok"], s["clean_shape"], s["first_person"], s["register_ok"]))
        flag = "" if passed else " ⚠"
        # Only fall back to "kind" when "event" is absent, so a row with
        # "event" but no "kind" does not raise KeyError.
        label = meta["event"] if "event" in meta else meta.get("kind")
        print(f"[{i + 1}/{len(rows)}] {label} m={meta.get('mischief')}{flag}")
        print(f" {output[:160]}")

    total = len(samples)

    def rate(key: str) -> float:
        """Fraction of samples whose boolean check *key* passed."""
        return sum(s[key] for s in samples) / total

    # Group output lengths by mischief level in one pass; samples without a
    # mischief value are left out of the calibration curve.
    chars_by_mischief: dict = {}
    for s in samples:
        level = s["meta"].get("mischief")
        if level is not None:
            chars_by_mischief.setdefault(level, []).append(s["chars"])

    agg = {
        "n": total,
        "fact_retention": round(statistics.mean(s["fact_retention"] for s in samples), 3),
        "no_banned": rate("no_banned"),
        "length_ok": rate("length_ok"),
        "clean_shape": rate("clean_shape"),
        "first_person": rate("first_person"),
        "register_ok": rate("register_ok"),
        "mean_chars_by_mischief": {
            str(level): round(statistics.mean(lengths))
            for level, lengths in sorted(chars_by_mischief.items())
        },
    }

    report = {"brain": {"url": BRAIN_URL, "model": BRAIN_MODEL}, "aggregate": agg, "samples": samples}
    out = data_dir / f"eval_report_{args.tag}.json"
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # written as UTF-8 rather than in the locale's default encoding.
    out.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f"\n=== aggregate ({args.tag}) ===")
    print(json.dumps(agg, indent=2))
    print(f"\nreport: {out}")


if __name__ == "__main__":
    main()