File size: 2,485 Bytes
16dc556
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
"""Run the eval harness: baseline (heuristic) vs oracle (gold) on a held-out gold set.

    uv run eval/run_eval.py --n 300 --seed 4242

Plug a fine-tuned model in later by passing a planner callable to `evaluate`.
The held-out seed differs from the training seed so gold instances are unseen.
"""

from __future__ import annotations

import argparse

from scrubdata.planner import mock_plan

from . import metrics
from .gold import load_gold


def _micro_f1(items, extract) -> dict:
    """Micro-averaged precision/recall/F1 over (predicted, gold) plan pairs.

    `extract` is applied to both plans of every pair and must return
    set-like objects (they are combined with `&` and `-`). True positives,
    false positives and false negatives are summed over all pairs and then
    handed to `metrics._prf`, whose result is returned as-is.
    """
    extracted = [
        (extract(predicted_plan), extract(reference_plan))
        for predicted_plan, reference_plan in items
    ]
    true_pos = sum(len(predicted & reference) for predicted, reference in extracted)
    false_pos = sum(len(predicted - reference) for predicted, reference in extracted)
    false_neg = sum(len(reference - predicted) for predicted, reference in extracted)
    return metrics._prf(true_pos, false_pos, false_neg)


def evaluate(planner, gold) -> dict:
    """Score one planner against the gold examples.

    Args:
        planner: callable ``(dirty_df, gold_plan) -> plan``. It is invoked once
            per example with ``ex["dirty_df"]`` and ``ex["plan"]``.
        gold: iterable of example dicts (per the original note, the dicts
            produced by ``make_example``). The keys read here are
            ``"dirty_df"``, ``"plan"`` and ``"clean_df"``.

    Returns:
        A dict with the keys ``json_valid``, ``op_f1``, ``op_r``, ``canon_f1``,
        ``canon_r`` and ``recovery``. ``json_valid`` and ``recovery`` are
        per-example means; the F1/recall values come from micro-averaging.

    Raises:
        ValueError: if ``gold`` contains no examples (the means below would
            otherwise fail with an opaque ZeroDivisionError).
    """
    preds = [(planner(ex["dirty_df"], ex["plan"]), ex) for ex in gold]
    if not preds:
        raise ValueError("evaluate() needs at least one gold example; got none")

    total = len(preds)
    # Built once and shared by both micro-F1 computations.
    plan_pairs = [(p, ex["plan"]) for p, ex in preds]

    valid = sum(metrics.is_valid(p) for p, _ in preds) / total
    op = _micro_f1(plan_pairs, metrics.op_pairs)
    canon = _micro_f1(plan_pairs, metrics.canon_pairs)
    rec = sum(metrics.recovery(ex["clean_df"], ex["dirty_df"], p)
              for p, ex in preds) / total
    return {"json_valid": valid, "op_f1": op["f1"], "op_r": op["r"],
            "canon_f1": canon["f1"], "canon_r": canon["r"], "recovery": rec}


def main() -> None:
    """CLI entry point: evaluate the built-in planners and print a results table.

    Loads the gold set via ``load_gold()``, keeps at most ``--n`` examples,
    runs ``evaluate`` for the oracle planner (returns the gold plan) and the
    heuristic planner (``mock_plan``), and prints one row of metrics per
    system.

    Exits with an argparse usage error if ``--n`` is below 1, and with a
    message if no gold examples are available.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--n", type=int, default=300,
                    help="maximum number of gold examples to evaluate (>= 1)")
    # --seed is never read below; it is still accepted so existing
    # invocations that pass it keep working.
    ap.add_argument("--seed", type=int, default=4242,
                    help="accepted for backward compatibility; ignored by this script")
    args = ap.parse_args()
    # A zero or negative --n would either leave nothing to average over or
    # silently slice examples off the END of the gold set.
    if args.n < 1:
        ap.error("--n must be a positive integer")

    gold = load_gold()[:args.n]  # frozen, committed test set (eval/gold.jsonl)
    if not gold:
        raise SystemExit("no gold examples loaded; nothing to evaluate")

    systems = {
        "ORACLE (gold plan)": lambda df, gold_plan: gold_plan,
        "HEURISTIC (baseline)": lambda df, gold_plan: mock_plan(df),
    }
    rows = {name: evaluate(fn, gold) for name, fn in systems.items()}

    cols = ["json_valid", "op_f1", "op_r", "canon_f1", "canon_r", "recovery"]
    print(f"\nEval on {len(gold)} frozen held-out gold examples (eval/gold.jsonl)\n")
    print(f"{'system':<22}" + "".join(f"{c:>11}" for c in cols))
    print("-" * (22 + 11 * len(cols)))
    for name, m in rows.items():
        print(f"{name:<22}" + "".join(f"{m[c]:>11.3f}" for c in cols))
    print("\nGoalpost: the fine-tuned model should approach ORACLE and clearly beat "
          "HEURISTIC — especially on canon_f1/canon_r (the fuzzy skill).")


# Run the CLI only when this file is executed as the main module, not on import.
if __name__ == "__main__":
    main()