"""Run the eval harness: baseline (heuristic) vs oracle (gold) on a held-out gold set. uv run eval/run_eval.py --n 300 --seed 4242 Plug a fine-tuned model in later by passing a planner callable to `evaluate`. The held-out seed differs from the training seed so gold instances are unseen. """ from __future__ import annotations import argparse from scrubdata.planner import mock_plan from . import metrics from .gold import load_gold def _micro_f1(items, extract) -> dict: tp = fp = fn = 0 for pred_plan, gold_plan in items: pred, gold = extract(pred_plan), extract(gold_plan) tp += len(pred & gold) fp += len(pred - gold) fn += len(gold - pred) return metrics._prf(tp, fp, fn) def evaluate(planner, gold) -> dict: """planner: (dirty_df, gold_plan) -> plan dict. gold: list of make_example dicts.""" preds = [(planner(ex["dirty_df"], ex["plan"]), ex) for ex in gold] valid = sum(metrics.is_valid(p) for p, _ in preds) / len(preds) op = _micro_f1([(p, ex["plan"]) for p, ex in preds], metrics.op_pairs) canon = _micro_f1([(p, ex["plan"]) for p, ex in preds], metrics.canon_pairs) rec = sum(metrics.recovery(ex["clean_df"], ex["dirty_df"], p) for p, ex in preds) / len(preds) return {"json_valid": valid, "op_f1": op["f1"], "op_r": op["r"], "canon_f1": canon["f1"], "canon_r": canon["r"], "recovery": rec} def main() -> None: ap = argparse.ArgumentParser() ap.add_argument("--n", type=int, default=300) ap.add_argument("--seed", type=int, default=4242) args = ap.parse_args() gold = load_gold()[:args.n] # frozen, committed test set (eval/gold.jsonl) systems = { "ORACLE (gold plan)": lambda df, gold_plan: gold_plan, "HEURISTIC (baseline)": lambda df, gold_plan: mock_plan(df), } rows = {name: evaluate(fn, gold) for name, fn in systems.items()} cols = ["json_valid", "op_f1", "op_r", "canon_f1", "canon_r", "recovery"] print(f"\nEval on {len(gold)} frozen held-out gold examples (eval/gold.jsonl)\n") print(f"{'system':<22}" + "".join(f"{c:>11}" for c in cols)) print("-" * (22 + 11 * len(cols))) for name, m in rows.items(): print(f"{name:<22}" + "".join(f"{m[c]:>11.3f}" for c in cols)) print("\nGoalpost: the fine-tuned model should approach ORACLE and clearly beat " "HEURISTIC — especially on canon_f1/canon_r (the fuzzy skill).") if __name__ == "__main__": main()