"""Eval harness: the bar a molt must clear before an adapter swap.

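Programmatic, judge-free metrics on the held-out set (data/eval.jsonl):
- fact_retention  — numbers, filenames, proper nouns from the event survive
- no_banned       — zero corporate-assistant phrases
- length_ok       — speakable: ≤ 280 chars
- clean_shape     — one utterance: no preamble, no JSON, no paragraph breaks
- first_person    — Puck never names himself (tokens from the event title excepted)
- register_ok     — at mischief ≤ 20: no exclamation marks, no mythic vocabulary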

Also reports mean output length per mischief level — the register
calibration curve (plain should be shorter than mythic) at a glance.

Usage:
  PUCK_BRAIN_URL=http://127.0.0.1:11434/v1 \
  PUCK_BRAIN_MODEL=hf.co/mradermacher/Holo-3.1-4B-GGUF:Q8_0 \
  uv run eval.py [--n 999] [--k 3] [--tag baseline] [--set eval|sft]

Writes data/eval_report_<tag>.json with per-sample outputs for human reading —
the metrics gate the swap, but read the samples; charm isn't programmatic.
"""

import argparse
import json
import re
import statistics
import sys
from pathlib import Path

HERE = Path(__file__).resolve().parent
ROOT = HERE.parent
sys.path.insert(0, str(ROOT / "server"))
from brain import BRAIN_MODEL, BRAIN_URL, _chat_completion  # noqa: E402

# Corporate-assistant boilerplate: any of these appearing in a generation
# fails the `no_banned` metric. Entries are lower-case because they are
# compared against the lower-cased output.
BANNED = [
    "i'd be happy", "i would be happy",
    "as an ai",
    "great question", "certainly!",
    "i cannot assist", "how can i help",
    "feel free to", "let me know if",
]

# Upper bound on output length (in characters) for the `length_ok` metric.
MAX_CHARS = 280

# Vocabulary of the mythic register, derived from the deck's own tier lines.
# Acceptable at high mischief; at low mischief its presence counts as a
# calibration failure.
MYTHIC_WORDS = [
    "goblin", "bog", "spell", "scroll", "mist",
    "ritual", "omen", "spirit", "forge", "realm",
    "bloom", "nest", "whisper",
]


def facts(title: str) -> list[str]:
    """Extract the concrete tokens from an event title that a reply must keep.

    Three kinds of token are collected:
    - digit runs, with an optional "<N>m <N>s" duration captured as one token;
    - filename-like tokens: word/hyphen characters, a dot, and a 1-4 letter
      lower-case extension, optionally followed by a second such extension;
    - capitalised words of three or more letters, excluding "The", "New"
      and "Your".

    Returns the distinct tokens in sorted order.
    """
    skip_words = {"The", "New", "Your"}
    numeric = re.findall(r"\d+(?:m \d+s)?", title)
    filenames = re.findall(r"[\w-]+\.[a-z]{1,4}(?:\.[a-z]{1,4})?", title)
    capitalised = [word for word in re.findall(r"\b[A-Z][a-z]{2,}\b", title) if word not in skip_words]
    return sorted({*numeric, *filenames, *capitalised})


def score_one(user: str, output: str, mischief: int | None) -> dict:
    """Score one generation against the judge-free gate metrics.

    Args:
        user: The user prompt. The event title is taken from its first line
            starting with "What happened: "; if there is none the title is
            empty and no facts are required.
        output: The model's reply to score.
        mischief: The sample's mischief level, or None when the sample has
            none. The register check only applies when it is <= 20.

    Returns:
        A dict with:
        - "fact_retention": fraction of title facts found (case-insensitively)
          in the output; 1.0 when the title yields no facts.
        - "facts_missing": the title facts not found in the output.
        - "no_banned": True when no BANNED phrase occurs in the output.
        - "length_ok": True when the output is at most MAX_CHARS characters.
        - "clean_shape": True when there is no blank-line paragraph break,
          no "{" anywhere, and no "here" / "sure" / "okay," opener.
        - "first_person": True when "puck" does not occur in the output once
          title-derived tokens containing it have been removed.
        - "register_ok": at mischief <= 20, True only without "!" and without
          MYTHIC_WORDS; always True otherwise.
        - "chars": length of the output.
    """
    prefix = "What happened: "
    title = next((ln.removeprefix(prefix) for ln in user.splitlines() if ln.startswith(prefix)), "")
    low = output.lower()

    fs = facts(title)
    missing = [f for f in fs if f.lower() not in low]

    # Puck speaks AS himself; naming himself = narrator voice break.
    # Strip title-derived tokens first — retaining "#puck-build" is a fact, not a voice break.
    # Surrounding sentence punctuation is trimmed from each title token so that
    # "#puck-build." in the title still matches "#puck-build" in the output.
    voice_text = low
    for tok in title.lower().split():
        core = tok.strip(".,;:!?\"'()[]")
        if "puck" in core:
            voice_text = voice_text.replace(core, "")
    first_person = "puck" not in voice_text

    # Low mischief must be sober: no exclamations, no mythic lexicon.
    # Match whole words (optionally plural) rather than raw substrings, so that
    # everyday words such as "moment" (omen), "honest" (nest), "forget" (forge)
    # or "mistake" (mist) do not fail the register check. Other inflections
    # ("whispering") are deliberately not matched to keep "nested" / "spelling"
    # from tripping the gate.
    register_ok = True
    if mischief is not None and mischief <= 20:
        has_mythic = any(re.search(rf"\b{re.escape(w)}s?\b", low) for w in MYTHIC_WORDS)
        register_ok = "!" not in output and not has_mythic

    # Leading whitespace must not hide a preamble. A leading "{" needs no
    # separate prefix test because any "{" already fails the shape check.
    clean_shape = (
        "\n\n" not in output
        and "{" not in output
        and not low.lstrip().startswith(("here", "sure", "okay,"))
    )

    return {
        "fact_retention": (len(fs) - len(missing)) / len(fs) if fs else 1.0,
        "facts_missing": missing,
        "no_banned": not any(b in low for b in BANNED),
        "length_ok": len(output) <= MAX_CHARS,
        "clean_shape": clean_shape,
        "first_person": first_person,
        "register_ok": register_ok,
        "chars": len(output),
    }


def main() -> None:
    """Generate, score and report on the chosen split.

    Reads data/<set>.jsonl next to this file, keeps the first --n samples and
    repeats that list --k times, sends each (system, user) pair to the brain at
    temperature 0.7, scores the reply with score_one, prints a per-generation
    line (flagged with ⚠ when any gate metric fails), then prints the aggregate
    and writes data/eval_report_<tag>.json containing the brain identity, the
    aggregate and every sample.

    Exits with an error message when there is nothing to evaluate.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--n", type=int, default=999, help="max samples")
    ap.add_argument("--k", type=int, default=3, help="generations per sample — n=6 at temp 0.7 is noise; k repeats make the gate stable")
    ap.add_argument("--tag", default="baseline", help="report filename tag")
    ap.add_argument("--set", default="eval", choices=["eval", "sft"], help="which split to run")
    args = ap.parse_args()

    # Read as UTF-8 explicitly (not the locale default) and tolerate blank
    # lines, which json.loads would otherwise reject.
    data_path = HERE / "data" / f"{args.set}.jsonl"
    rows = [json.loads(line) for line in data_path.read_text(encoding="utf-8").splitlines() if line.strip()]
    rows = rows[: args.n] * args.k
    if not rows:
        # The aggregates below are means and ratios; they are undefined on an empty run.
        sys.exit(f"no samples to evaluate in {data_path} (n={args.n}, k={args.k})")
    print(f"evaluating {len(rows)} generations ({args.k} per sample) against {BRAIN_URL} ({BRAIN_MODEL})\n")

    gate_flags = ("no_banned", "length_ok", "clean_shape", "first_person", "register_ok")
    samples = []
    for i, ex in enumerate(rows):
        meta = ex["meta"]
        # Each example carries exactly three messages: system, user, gold reply.
        system, user, gold = (m["content"] for m in ex["messages"])
        output = _chat_completion(system, user, temperature=0.7)
        s = score_one(user, output, meta.get("mischief"))
        samples.append({"meta": meta, "user": user, "gold": gold, "output": output, **s})
        passed = s["fact_retention"] == 1 and all(s[key] for key in gate_flags)
        flag = "" if passed else "  ⚠"
        # Only fall back to "kind" when "event" is absent; a dict.get default
        # would be evaluated eagerly and raise if "kind" were missing.
        label = meta["event"] if "event" in meta else meta.get("kind")
        print(f"[{i + 1}/{len(rows)}] {label} m={meta.get('mischief')}{flag}")
        print(f"   {output[:160]}")

    total = len(samples)

    def pass_rate(key: str) -> float:
        """Fraction of samples whose boolean metric `key` is True."""
        return sum(s[key] for s in samples) / total

    # Output lengths grouped by mischief level — the register calibration curve.
    chars_by_level: dict = {}
    for s in samples:
        level = s["meta"].get("mischief")
        if level is not None:
            chars_by_level.setdefault(level, []).append(s["chars"])

    agg = {
        "n": total,
        "fact_retention": round(statistics.mean(s["fact_retention"] for s in samples), 3),
        "no_banned": pass_rate("no_banned"),
        "length_ok": pass_rate("length_ok"),
        "clean_shape": pass_rate("clean_shape"),
        "first_person": pass_rate("first_person"),
        "register_ok": pass_rate("register_ok"),
        "mean_chars_by_mischief": {
            str(level): round(statistics.mean(chars_by_level[level])) for level in sorted(chars_by_level)
        },
    }
    report = {"brain": {"url": BRAIN_URL, "model": BRAIN_MODEL}, "aggregate": agg, "samples": samples}
    out = HERE / "data" / f"eval_report_{args.tag}.json"
    # ensure_ascii=False keeps non-ASCII output readable, so the file must be
    # written as UTF-8 regardless of the platform's default encoding.
    out.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f"\n=== aggregate ({args.tag}) ===")
    print(json.dumps(agg, indent=2))
    print(f"\nreport: {out}")


if __name__ == "__main__":
    main()