Spaces:

build-small-hackathon
/

puck

Running

App Files Files Community

puck / molt /eval.py

vu1n

Puck — desktop fairy familiar (HF Build Small)

3c124f3 19 days ago

Raw

History Blame Contribute Delete

5.92 kB

	"""Eval harness: the bar a molt must clear before an adapter swap.

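	Programmatic, judge-free metrics on the held-out set (data/eval.jsonl):
	- fact_retention — numbers, filenames, proper nouns from the event survive
	- no_banned — zero corporate-assistant phrases
	- length_ok — speakable: ≤ 280 chars
	- clean_shape — one utterance: no preamble, no JSON, no paragraph breaks
	- first_person — Puck never names himself (title-derived tokens such as "#puck-build" excepted)
	- register_ok — at mischief ≤ 20: no exclamation marks, no mythic vocabulary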

	Also reports mean output length per mischief level — the register
	calibration curve (plain should be shorter than mythic) at a glance.

	Usage:
	PUCK_BRAIN_URL=http://127.0.0.1:11434/v1 \
	PUCK_BRAIN_MODEL=hf.co/mradermacher/Holo-3.1-4B-GGUF:Q8_0 \
	uv run eval.py [--n 999] [--k 3] [--tag baseline] [--set eval|sft]

	Writes data/eval_report_<tag>.json with per-sample outputs for human reading —
	the metrics gate the swap, but read the samples; charm isn't programmatic.
	"""

	import argparse
	import json
	import re
	import statistics
	import sys
	from pathlib import Path

	# Directory containing this file; the data/ inputs and reports are resolved relative to it.
	HERE = Path(__file__).resolve().parent
	# Parent of that directory — presumably the project root; confirm against the repo layout.
	ROOT = HERE.parent
	# Put ROOT/server first on sys.path so the `brain` import below can resolve
	# (assumes brain.py lives in ROOT/server — TODO confirm).
	sys.path.insert(0, str(ROOT / "server"))
	# E402 is silenced because this import has to come after the sys.path edit above.
	from brain import BRAIN_MODEL, BRAIN_URL, _chat_completion # noqa: E402

	# Corporate-assistant stock phrases, written in lowercase so they can be
	# compared against lowercased model output. One hit fails `no_banned`.
	BANNED = [
	    "i'd be happy",
	    "i would be happy",
	    "as an ai",
	    "great question",
	    "certainly!",
	    "i cannot assist",
	    "how can i help",
	    "feel free to",
	    "let me know if",
	]

	# Longest output, in characters, that still counts as speakable (`length_ok`).
	MAX_CHARS = 280

	# the mythic register's vocabulary — fine at high mischief, a calibration
	# failure at low. Derived from the deck's own tier lines.
	MYTHIC_WORDS = [
	    "goblin",
	    "bog",
	    "spell",
	    "scroll",
	    "mist",
	    "ritual",
	    "omen",
	    "spirit",
	    "forge",
	    "realm",
	    "bloom",
	    "nest",
	    "whisper",
	]


	def facts(title: str) -> list[str]:
	    """The concrete tokens whimsy must not eat: numbers, filenames, proper nouns.

	    Args:
	        title: The event description to mine for facts.

	    Returns:
	        The distinct tokens found, sorted. Empty when the title has none.
	    """
	    # counts, durations ("3", "3m 12s")
	    out = set(re.findall(r"\d+(?:m \d+s)?", title))
	    # file.ts, auth.test.ts
	    out |= set(re.findall(r"[\w-]+\.[a-z]{1,4}(?:\.[a-z]{1,4})?", title))
	    # Capitalised words of three or more letters stand in for proper nouns;
	    # a few common sentence openers are excluded.
	    out |= {w for w in re.findall(r"\b[A-Z][a-z]{2,}\b", title) if w not in {"The", "New", "Your"}}
	    return sorted(out)


	def _fact_present(fact: str, text: str) -> bool:
	    """Report whether `fact` occurs in `text` without being part of a longer number."""
	    # A bare substring test would find "3" inside "13"; fence digit edges with
	    # digit lookarounds. Non-digit edges stay plain so "2" in "auth2.ts" still counts.
	    lead = r"(?<!\d)" if fact[:1].isdecimal() else ""
	    trail = r"(?!\d)" if fact[-1:].isdecimal() else ""
	    return re.search(lead + re.escape(fact) + trail, text) is not None


	def _has_mythic_word(text: str) -> bool:
	    """Report whether `text` contains a MYTHIC_WORDS entry as a whole word (plural allowed)."""
	    if not MYTHIC_WORDS:
	        return False
	    # Whole-word match: as substrings, "omen" hits "moment", "nest" hits
	    # "honest", "mist" hits "mistake" and "forge" hits "forget".
	    pattern = r"\b(?:" + "|".join(re.escape(w) for w in MYTHIC_WORDS) + r")s?\b"
	    return re.search(pattern, text) is not None


	def score_one(user: str, output: str, mischief: int | None) -> dict:
	    """Score one generated utterance against the programmatic gate metrics.

	    Args:
	        user: The user prompt. The event title is read from its first line that
	            starts with "What happened: "; without such a line the title is
	            empty and no facts are required.
	        output: The model's reply.
	        mischief: The sample's mischief level, or None to skip the register check.

	    Returns:
	        A dict with:
	        - fact_retention: share of the title's facts found in the output
	          (1.0 when the title has no facts)
	        - facts_missing: the facts that were not found
	        - no_banned: no BANNED phrase appears
	        - length_ok: output is at most MAX_CHARS characters
	        - clean_shape: no blank-line break, no "{", no "here"/"sure"/"okay," opener
	        - first_person: "puck" does not appear outside title-derived tokens
	        - register_ok: at mischief <= 20, no "!" and no mythic word
	        - chars: length of the output
	    """
	    marker = "What happened: "
	    title = next((ln.removeprefix(marker) for ln in user.splitlines() if ln.startswith(marker)), "")
	    low = output.lower()

	    fs = facts(title)
	    missing = [f for f in fs if not _fact_present(f.lower(), low)]

	    # Puck speaks AS himself; naming himself = narrator voice break.
	    # Strip title-derived tokens first — retaining "#puck-build" is a fact, not a voice break.
	    voice_text = low
	    for tok in title.lower().split():
	        if "puck" in tok:
	            voice_text = voice_text.replace(tok, "")
	    first_person = "puck" not in voice_text

	    # low mischief must be sober: no exclamations, no mythic lexicon
	    register_ok = True
	    if mischief is not None and mischief <= 20:
	        register_ok = "!" not in output and not _has_mythic_word(low)

	    # Models often emit a typographic apostrophe; fold it to ASCII so
	    # "I’d be happy" is caught the same as "I'd be happy".
	    plain = low.replace("\u2019", "'")

	    return {
	        "fact_retention": (len(fs) - len(missing)) / len(fs) if fs else 1.0,
	        "facts_missing": missing,
	        "no_banned": not any(b in plain for b in BANNED),
	        "length_ok": len(output) <= MAX_CHARS,
	        "clean_shape": "\n\n" not in output and "{" not in output and not low.startswith(("here", "sure", "okay,")),
	        "first_person": first_person,
	        "register_ok": register_ok,
	        "chars": len(output),
	    }


	def main() -> None:
	    """Generate and score every sample, print progress, and write the report.

	    Reads data/<set>.jsonl next to this file. Each row is expected to carry
	    "messages" (three entries, unpacked as system, user, gold) and "meta".
	    Each of the first --n rows is generated --k times through
	    `_chat_completion` at temperature 0.7 and scored with `score_one`.
	    The aggregate and the per-sample results go to data/eval_report_<tag>.json.

	    Raises:
	        SystemExit: when there is nothing to evaluate.
	    """
	    ap = argparse.ArgumentParser()
	    ap.add_argument("--n", type=int, default=999, help="max samples")
	    ap.add_argument("--k", type=int, default=3, help="generations per sample — n=6 at temp 0.7 is noise; k repeats make the gate stable")
	    ap.add_argument("--tag", default="baseline", help="report filename tag")
	    ap.add_argument("--set", default="eval", choices=["eval", "sft"], help="which split to run")
	    args = ap.parse_args()

	    data_path = HERE / "data" / f"{args.set}.jsonl"
	    # Decode as UTF-8 rather than the locale default, and skip blank lines
	    # (e.g. a trailing newline pair), which json.loads would reject.
	    lines = data_path.read_text(encoding="utf-8").splitlines()
	    rows = [json.loads(line) for line in lines if line.strip()]
	    rows = rows[: args.n] * args.k
	    if not rows:
	        # The aggregate below divides by the sample count; fail with a clear message.
	        raise SystemExit(f"no samples to evaluate (check {data_path}, --n and --k)")
	    print(f"evaluating {len(rows)} generations ({args.k} per sample) against {BRAIN_URL} ({BRAIN_MODEL})\n")

	    gate_keys = ("no_banned", "length_ok", "clean_shape", "first_person", "register_ok")
	    samples = []
	    for i, ex in enumerate(rows):
	        meta = ex["meta"]
	        system, user, gold = (m["content"] for m in ex["messages"])
	        output = _chat_completion(system, user, temperature=0.7)
	        s = score_one(user, output, meta.get("mischief"))
	        samples.append({"meta": meta, "user": user, "gold": gold, "output": output, **s})
	        passed = s["fact_retention"] == 1 and all(s[key] for key in gate_keys)
	        flag = "" if passed else " ⚠"
	        # dict.get evaluates its default eagerly, so only touch "kind" when "event" is absent.
	        label = meta["event"] if "event" in meta else meta.get("kind")
	        print(f"[{i + 1}/{len(rows)}] {label} m={meta.get('mischief')}{flag}")
	        print(f" {output[:160]}")

	    def rate(key: str) -> float:
	        """Share of samples for which the boolean metric `key` is true."""
	        return sum(s[key] for s in samples) / len(samples)

	    # Group output lengths by mischief level; samples without a level are left out.
	    chars_by_mischief: dict = {}
	    for s in samples:
	        level = s["meta"].get("mischief")
	        if level is not None:
	            chars_by_mischief.setdefault(level, []).append(s["chars"])

	    agg = {
	        "n": len(samples),
	        "fact_retention": round(statistics.mean(s["fact_retention"] for s in samples), 3),
	        "no_banned": rate("no_banned"),
	        "length_ok": rate("length_ok"),
	        "clean_shape": rate("clean_shape"),
	        "first_person": rate("first_person"),
	        "register_ok": rate("register_ok"),
	        "mean_chars_by_mischief": {
	            str(level): round(statistics.mean(chars))
	            for level, chars in sorted(chars_by_mischief.items())
	        },
	    }
	    report = {"brain": {"url": BRAIN_URL, "model": BRAIN_MODEL}, "aggregate": agg, "samples": samples}
	    out = HERE / "data" / f"eval_report_{args.tag}.json"
	    # ensure_ascii=False leaves non-ASCII characters in the JSON, so write UTF-8 explicitly.
	    out.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
	    print(f"\n=== aggregate ({args.tag}) ===")
	    print(json.dumps(agg, indent=2))
	    print(f"\nreport: {out}")


	if __name__ == "__main__":
	    main()