Spaces:
Running
Running
| """Supervisor REQUIRED 5 — per-dataset Raha table (real-error slice). | |
| Per-dataset repair precision/recall/F1 + damage on the 5 Raha real-error benchmarks | |
| (hospital, beers, flights, rayyan, movies_1) for: | |
| * grounded (ours) — shipped deterministic path (mock_plan), money-table protocol | |
| * OpenRefine fingerprint — clustering baseline, same protocol | |
| * OpenRefine kNN — clustering baseline, same protocol | |
| * verified union (v6) — captured raw model plan -> verify(tau=0.5) -> union with | |
| heuristic -> repairs-only (the WS1 gate protocol), where a | |
| captured plan exists (hospital + the gen_plans_seed21 set) | |
| * Baran — reference row from eval/results/baran_raha.json (oracle | |
| detection + 20 gold labels; mean over its 3 label seeds) | |
| * Jellyfish-13B ED+DI — reference row copied from eval/results/jellyfish_raha.json | |
| Deterministic (no injection): the real slice is seed-free, so the grounded macro F1 | |
| must recompute exactly to money_table.json's real_f1 — the acceptance check. (The | |
| supervisor's 0.174 is the value at the money-table commit 536cbfb; planner commits | |
| since then moved the HEAD macro — money_table.json is re-run in lockstep.) | |
| uv run python -m eval.raha_table | |
| Writes eval/results/raha_per_dataset.json and prints LaTeX rows. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| from pathlib import Path | |
| from scrubdata.baselines import openrefine_fingerprint_plan, openrefine_knn_plan | |
| from scrubdata.executor import apply_plan | |
| from scrubdata.planner import mock_plan | |
| from scrubdata.verifier import union_plans, verify_plan | |
| from .precision_curve import _repairs_only | |
| from .run_real_multi import RAHA, _cell_only, _raha_pair, score | |
RESULTS = Path(__file__).resolve().parent / "results"
TAU = 0.5  # the pre-registered WS1 operating point


def _discover_union_plans(results_dir):
    """Map dataset name -> path of its captured raw v6 plan under *results_dir*.

    The hospital entry is registered unconditionally (its file is not probed).
    For every other dataset the first existing candidate wins, tried in the
    order ``raw_plan`` then ``raw_plan_localq8``; a dataset with no candidate
    on disk gets no entry at all.

    Kept in a function so the loop temporaries do not leak into the module
    namespace and the lookup order can be exercised against any directory.
    """
    plans = {"hospital": results_dir / "v6_hospital_raw_plan.json"}
    for dataset in ("beers", "movies_1", "flights", "rayyan"):
        for suffix in ("raw_plan", "raw_plan_localq8"):
            candidate = results_dir / f"v6_{dataset}_{suffix}.json"
            if candidate.exists():
                plans[dataset] = candidate
                break
    return plans


# captured raw model plans (shipped v6 = mixA seed 21). Modal bf16 captures first;
# local Q8_0 captures (eval/capture_plan_local.py) as fallback, suffix-disclosed.
UNION_PLANS = _discover_union_plans(RESULTS)
_GEN = RESULTS / "gen_plans_seed21.json"  # v6 champion captures (flights, rayyan, ...)
def _gen_plan(name):
    """Return the plan stored under *name* in the shared capture file ``_GEN``.

    The file is re-read on every call. The lookup uses ``dict.get``, so the
    result is ``None`` when the file has no entry for *name*. A missing or
    malformed file still raises (``FileNotFoundError`` / ``json.JSONDecodeError``).
    """
    # Context manager so the handle is closed deterministically; JSON is UTF-8.
    with open(_GEN, encoding="utf-8") as fh:
        plans = json.load(fh)
    return plans.get(name)
def _row(name, m):
    """Project the score dict *m* onto one per-dataset table row.

    The row is tagged with *name* under ``"dataset"``; f1/precision/recall/damage
    are copied under the same keys, and the underscore-prefixed counters of *m*
    are exposed without the underscore. Any other key of *m* is dropped.
    """
    row = {"dataset": name}
    # (output key, key in the score dict), in output order.
    columns = (("f1", "f1"), ("precision", "precision"),
               ("recall", "recall"), ("damage", "damage"),
               ("errors", "_errors"), ("changed", "_changed"),
               ("fixed", "_fixed"))
    for out_key, src_key in columns:
        row[out_key] = m[src_key]
    return row
def _load_json(path):
    """Parse the JSON file at *path*, closing the handle before returning."""
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


def main() -> None:
    """Build the per-dataset Raha table, write it as JSON and print LaTeX rows.

    Steps, in order:
      1. Run the three deterministic planners on every RAHA dataset, apply the
         plan filtered by ``_cell_only`` and score it; store rows + macro F1.
      2. For each dataset with a captured raw plan (``UNION_PLANS`` first, else
         ``_gen_plan``), score verify(tau=TAU) -> union with ``mock_plan`` ->
         ``_repairs_only``; datasets without a plan get a ``"missing"`` row.
      3. Copy the Jellyfish per-dataset metrics and average the Baran rows that
         share a dataset name.
      4. Compare the grounded macro F1 with the money table's ``real_f1``
         (tolerance 1e-9); the outcome is recorded and printed as PASS/FAIL but
         does not raise.
      5. Write ``raha_per_dataset.json`` under ``RESULTS`` and print ``latex(table)``.
    """
    table = {"protocol": {
        "deterministic_rows": "full plan minus row-dropping ops (_cell_only), "
                              "churn-neutral score — identical to run_real_multi",
        "union_rows": f"verify(tau={TAU}) -> union with heuristic -> repairs-only "
                      "(canonicalize decisions; the WS1 gate protocol)",
        "baran_row": "eval/results/baran_raha.json — oracle error positions + 20 gold "
                     "labels (upper bound), mean over 3 label-sampling seeds",
        "movies_1": "scored on first 2000 rows (_raha_pair), as in the money table"},
        "systems": {}}

    systems = [("grounded", mock_plan),
               ("openrefine_fingerprint", openrefine_fingerprint_plan),
               ("openrefine_knn", openrefine_knn_plan)]
    for label, planner in systems:
        rows = []
        for name, _dom in RAHA:
            dirty, clean = _raha_pair(name)
            cleaned, _ = apply_plan(dirty, _cell_only(planner(dirty)))
            m = score(dirty, clean, cleaned)
            rows.append(_row(name, m))
            print(f" {label:<24}{name:<12} F1={m['f1']:.3f} P={m['precision']:.3f} "
                  f"R={m['recall']:.3f} dmg={m['damage']:.4f}", flush=True)
        macro = sum(r["f1"] for r in rows) / len(rows)
        table["systems"][label] = {"per_dataset": rows, "macro_f1": macro}

    # verified-union operating point per dataset (where a raw v6 plan was captured)
    rows = []
    for name, _dom in RAHA:
        if name in UNION_PLANS:
            base, src = _load_json(UNION_PLANS[name]), UNION_PLANS[name].name
        else:
            base, src = _gen_plan(name), _GEN.name
        if base is None:
            rows.append({"dataset": name, "missing": "no captured v6 raw plan"})
            print(f" union@tau={TAU:<18}{name:<12} (no captured plan)", flush=True)
            continue
        dirty, clean = _raha_pair(name)
        plan = _repairs_only(union_plans(verify_plan(dirty, base, tau=TAU),
                                         mock_plan(dirty)))
        cleaned, _ = apply_plan(dirty, plan)
        m = score(dirty, clean, cleaned)
        rows.append({**_row(name, m), "plan_source": src})
        print(f" union@tau={TAU:<18}{name:<12} P={m['precision']:.3f} "
              f"cov={m['recall']:.3f} F1={m['f1']:.3f} dmg={m['damage']:.4f} "
              f"changed={m['_changed']} fixed={m['_fixed']}", flush=True)
    # No macro_f1 here: rows may include "missing" placeholders without metrics.
    table["systems"]["verified_union_v6_tau0.5"] = {"per_dataset": rows}

    # Jellyfish reference row: metrics copied verbatim from the stored results.
    jelly = _load_json(RESULTS / "jellyfish_raha.json")
    rows = [{"dataset": name,
             **{k: jelly["per_dataset"][name][k]
                for k in ("f1", "precision", "recall", "damage")}}
            for name, _dom in RAHA]
    table["systems"]["jellyfish_ed_di"] = {
        "per_dataset": rows,
        "macro_f1": sum(r["f1"] for r in rows) / len(rows)}

    # Baran reference row: several stored rows per dataset, averaged per metric.
    baran = _load_json(RESULTS / "baran_raha.json")
    by_ds: dict[str, list] = {}
    for r in baran["per_dataset"]:
        by_ds.setdefault(r["name"], []).append(r)
    rows = []
    for name, _dom in RAHA:
        rs = by_ds[name]
        rows.append({"dataset": name,
                     **{k: sum(r[k] for r in rs) / len(rs)
                        for k in ("f1", "precision", "recall", "damage")},
                     "n_seeds": len(rs)})
    table["systems"]["baran_oracle20"] = {
        "per_dataset": rows,
        "macro_f1": sum(r["f1"] for r in rows) / len(rows)}

    # acceptance check: grounded macro must match the money table's REAL-F1
    money = _load_json(RESULTS / "money_table.json")
    expect = next(r["real_f1"] for r in money if r["system"] == "grounded (ours)")
    got = table["systems"]["grounded"]["macro_f1"]
    ok = abs(got - expect) < 1e-9
    table["macro_check"] = {"grounded_macro_f1": got, "money_table_real_f1": expect,
                            "match": ok}
    print(f"\nmacro check: grounded {got:.6f} vs money-table {expect:.6f} "
          f"-> {'PASS' if ok else 'FAIL'}")

    out = RESULTS / "raha_per_dataset.json"
    # Context manager guarantees the table is flushed and closed before we
    # report it as written.
    with open(out, "w", encoding="utf-8") as fh:
        json.dump(table, fh, indent=1)
    print(f"written to {out}")
    print(latex(table))
# (key in table["systems"], LaTeX label) in the order the blocks are printed.
LABELS = [("grounded", "Grounded (ours, deterministic)"),
          ("verified_union_v6_tau0.5", r"Verified union (v6, $\tau{=}0.5$)"),
          ("openrefine_fingerprint", "OpenRefine fingerprint"),
          ("openrefine_knn", "OpenRefine kNN"),
          ("jellyfish_ed_di", "Jellyfish-13B ED+DI"),
          ("baran_oracle20", r"Baran (oracle det.\ + 20 labels)")]


def latex(table: dict) -> str:
    """Booktabs rows: one block per system, one line per dataset.

    Args:
        table: dict whose ``"systems"`` entry holds, for every key in
            ``LABELS``, a dict with a ``"per_dataset"`` list of rows and an
            optional ``"macro_f1"``.

    Returns:
        The full ``tabular`` environment as one newline-joined string. Rows
        carrying a ``"missing"`` key render as a spanning placeholder; a macro
        line is added only for systems that have ``"macro_f1"``.

    Raises:
        KeyError: if a system listed in ``LABELS`` is absent from *table*.
    """
    lines = [r"\begin{tabular}{llrrrr}", r"\toprule",
             r"System & Dataset & Prec. & Rec. & F1 & Damage \\", r"\midrule"]
    for key, label in LABELS:
        system = table["systems"][key]
        for i, row in enumerate(system["per_dataset"]):
            # The system label is printed on the first row of its block only.
            head = label if i == 0 else ""
            # Escape underscores outside the f-string: a backslash inside a
            # replacement field is a SyntaxError before Python 3.12.
            dataset = row["dataset"].replace("_", r"\_")
            if "missing" in row:
                lines.append(f"{head} & {dataset} & "
                             r"\multicolumn{4}{c}{--- (no captured plan)} \\")
                continue
            lines.append(f"{head} & {dataset} & "
                         f"{row['precision']:.3f} & {row['recall']:.3f} & "
                         f"{row['f1']:.3f} & {row['damage']:.3f} \\\\")
        macro = system.get("macro_f1")
        if macro is not None:
            lines.append(f" & \\emph{{macro}} & & & \\emph{{{macro:.3f}}} & \\\\")
        lines.append(r"\midrule")
    # The rule after the last block closes the table instead of separating.
    lines[-1] = r"\bottomrule"
    lines.append(r"\end{tabular}")
    return "\n".join(lines)
if __name__ == "__main__":
    # Script entry point (``uv run python -m eval.raha_table``).
    main()