Spaces:
Running
Running
| """B1 (W4.2) dual-metric cross-scoring on the 5 Raha real-error datasets. | |
| Scores every system under BOTH metric families, side by side: | |
| * original — the Raha/Baran cell-level repair protocol (Mahdavi & Abedjan, | |
| PVLDB 13(12), p1948, Sec 6.1 + raha/dataset.py get_data_cleaning_evaluation): | |
| values minimally normalized (html-unescape, whitespace collapse — their | |
| value_normalizer), then RAW string equality; precision = exact-gold repairs / | |
| cells changed; recall = exact-gold repairs / (dirty->clean diff); no | |
| churn-neutrality, no case folding, no semantic tolerance, no damage metric. | |
| * churn_neutral — our eval.run_real_multi.score (the scoring contract): | |
| convention-normalized, churn ignored, damage reported. | |
| Systems: grounded (HEAD mock_plan), verified union (v6, tau=0.5 — identical plan | |
| files to eval.raha_table), OpenRefine fingerprint/kNN, and Baran at labeling | |
| budgets 0/5/20 (oracle detection; repaired CSVs from eval/run_baran.py, 3 seeds, | |
| seed-mean). Baran-from-CSV caveat: corrections equal to the dirty value vanish | |
| from the repaired-vs-dirty diff, so reconstructed |changed| is a lower bound on | |
| Baran's own output_size (precision an upper bound; recall exact). | |
| Also computes Kendall tau-b between the SYSTEM RANKINGS induced by the two F1s | |
| (per dataset + macro), and a calibration block: our Baran oracle+20 repro vs the | |
| published Table 3 "Baran" row (verified from the PVLDB PDF; see PUBLISHED below). | |
| Acceptance: the churn-neutral rows must reproduce eval/results/raha_per_dataset.json | |
| exactly (checked, hard-fails otherwise). | |
| uv run python -m eval.cross_scoring | |
| Writes eval/results/cross_scoring.json and prints LaTeX rows. | |
| """ | |
| from __future__ import annotations | |
| import html | |
| import json | |
| import re | |
| from pathlib import Path | |
| import pandas as pd | |
| from scrubdata.baselines import openrefine_fingerprint_plan, openrefine_knn_plan | |
| from scrubdata.executor import apply_plan | |
| from scrubdata.planner import mock_plan | |
| from scrubdata.verifier import union_plans, verify_plan | |
| from .precision_curve import _repairs_only | |
| from .raha_table import TAU, UNION_PLANS, _gen_plan | |
| from .run_real_multi import RAHA, _cell_only, _raha_pair, score | |
| RESULTS = Path(__file__).resolve().parent / "results" | |
| BARAN_DIRS = {0: RESULTS / "baran_n0", 5: RESULTS / "baran_n5", 20: RESULTS / "baran"} | |
| # Baran PVLDB'20 Table 3, row "Baran" (no TL): complete set of data errors given as | |
| # input (= oracle detection), labeling budget 20, mean of 10 runs. Verified by reading | |
| # vldb.org/pvldb/vol13/p1948-mahdavi.pdf p1957 (2026-06-12). movies_1 is not evaluated | |
| # in the paper (its real-error sets are hospital/flights/address/beers/rayyan/it/tax). | |
| PUBLISHED = {"hospital": {"precision": 0.88, "recall": 0.86, "f1": 0.87}, | |
| "flights": {"precision": 1.00, "recall": 1.00, "f1": 1.00}, | |
| "beers": {"precision": 0.91, "recall": 0.89, "f1": 0.90}, | |
| "rayyan": {"precision": 0.76, "recall": 0.40, "f1": 0.52}} | |
| def _norm(v: str) -> str: | |
| """raha.dataset.Dataset.value_normalizer, verbatim semantics.""" | |
| v = html.unescape(str(v)) | |
| v = re.sub("[\t\n ]+", " ", v, re.UNICODE) | |
| return v.strip("\t\n ") | |
| def baran_score(dirty: pd.DataFrame, clean: pd.DataFrame, out: pd.DataFrame) -> dict: | |
| """The original Raha/Baran repair metric over a repaired DataFrame: minimal | |
| normalization then raw equality; changed = repaired-vs-dirty diff.""" | |
| n = min(len(dirty), len(out), len(clean)) | |
| errors = changed = tp = 0 | |
| for j, col in enumerate(dirty.columns): | |
| present = col in out.columns | |
| for i in range(n): | |
| dv, cv = _norm(dirty.iat[i, j]), _norm(clean.iat[i, j]) | |
| ov = _norm(out.iloc[i][col]) if present else dv | |
| err, chg = dv != cv, ov != dv | |
| errors += err | |
| changed += chg | |
| tp += chg and err and ov == cv | |
| p = tp / changed if changed else 0.0 | |
| r = tp / errors if errors else 0.0 | |
| f1 = 2 * p * r / (p + r) if (p + r) else 0.0 | |
| return {"f1": f1, "precision": p, "recall": r, | |
| "_errors": errors, "_changed": changed, "_tp": tp} | |
| def _both(dirty, clean, out) -> dict: | |
| m = score(dirty, clean, out) | |
| return {"original": baran_score(dirty, clean, out), | |
| "churn_neutral": {k: m[k] for k in | |
| ("f1", "precision", "recall", "damage", | |
| "_errors", "_changed", "_fixed")}} | |
| def kendall_tau(xs, ys) -> float: | |
| """Kendall tau-b (tie-corrected), stdlib.""" | |
| n = len(xs) | |
| n0, n1, n2, nc, nd = n * (n - 1) // 2, 0, 0, 0, 0 | |
| for i in range(n): | |
| for j in range(i + 1, n): | |
| a, b = xs[i] - xs[j], ys[i] - ys[j] | |
| n1 += a == 0 | |
| n2 += b == 0 | |
| if a != 0 and b != 0: | |
| nc += (a > 0) == (b > 0) | |
| nd += (a > 0) != (b > 0) | |
| denom = ((n0 - n1) * (n0 - n2)) ** 0.5 | |
| return (nc - nd) / denom if denom else 0.0 | |
| def _mean_rows(rows: list[dict]) -> dict: | |
| return {k: sum(r[k] for r in rows) / len(rows) for k in rows[0]} | |
| def main() -> None: | |
| out = {"protocol": { | |
| "original": "Raha/Baran convention: value_normalizer (html-unescape + " | |
| "whitespace collapse) then raw string equality; P = exact-gold " | |
| "repairs / changed cells, R = exact-gold repairs / (dirty->clean " | |
| "diff); no churn-neutrality, no damage", | |
| "churn_neutral": "eval.run_real_multi.score — the scoring contract", | |
| "baran_rows": "oracle error positions + n gold labels, 3 seeds, seed-mean; " | |
| "reconstructed from repaired CSVs (no-op corrections vanish: " | |
| "|changed| lower-bounds Baran's output_size)", | |
| "movies_1": "first 2000 rows (_raha_pair), as everywhere in the suite"}, | |
| "systems": {}} | |
| deterministic = [("grounded", mock_plan), | |
| ("openrefine_fingerprint", openrefine_fingerprint_plan), | |
| ("openrefine_knn", openrefine_knn_plan)] | |
| for label, planner in deterministic: | |
| rows = [] | |
| for name, _dom in RAHA: | |
| dirty, clean = _raha_pair(name) | |
| cleaned, _ = apply_plan(dirty, _cell_only(planner(dirty))) | |
| m = _both(dirty, clean, cleaned) | |
| rows.append({"dataset": name, **m}) | |
| print(f" {label:<24}{name:<10} orig={m['original']['f1']:.3f} " | |
| f"cn={m['churn_neutral']['f1']:.3f}", flush=True) | |
| out["systems"][label] = {"per_dataset": rows} | |
| rows = [] | |
| for name, _dom in RAHA: | |
| base = (json.load(open(UNION_PLANS[name])) if name in UNION_PLANS | |
| else _gen_plan(name)) | |
| dirty, clean = _raha_pair(name) | |
| plan = _repairs_only(union_plans(verify_plan(dirty, base, tau=TAU), | |
| mock_plan(dirty))) | |
| cleaned, _ = apply_plan(dirty, plan) | |
| m = _both(dirty, clean, cleaned) | |
| rows.append({"dataset": name, **m}) | |
| print(f" {'verified_union':<24}{name:<10} orig={m['original']['f1']:.3f} " | |
| f"cn={m['churn_neutral']['f1']:.3f}", flush=True) | |
| out["systems"]["verified_union_v6_tau0.5"] = {"per_dataset": rows} | |
| for n_labels, d in BARAN_DIRS.items(): | |
| rows = [] | |
| for name, _dom in RAHA: | |
| dirty, clean = _raha_pair(name) | |
| per_seed = [] | |
| for p in sorted(d.glob(f"{name}_seed*_repaired.csv")): | |
| repaired = pd.read_csv(p, dtype=str, keep_default_na=False) | |
| per_seed.append(_both(dirty, clean, repaired)) | |
| m = {"original": _mean_rows([s["original"] for s in per_seed]), | |
| "churn_neutral": _mean_rows([s["churn_neutral"] for s in per_seed])} | |
| rows.append({"dataset": name, "n_seeds": len(per_seed), **m}) | |
| print(f" {'baran_oracle%d' % n_labels:<24}{name:<10} " | |
| f"orig={m['original']['f1']:.3f} " | |
| f"cn={m['churn_neutral']['f1']:.3f}", flush=True) | |
| out["systems"][f"baran_oracle{n_labels}"] = {"per_dataset": rows} | |
| for sys in out["systems"].values(): | |
| for fam in ("original", "churn_neutral"): | |
| sys[f"macro_f1_{fam}"] = _mean_rows( | |
| [r[fam] for r in sys["per_dataset"]])["f1"] | |
| # acceptance: churn-neutral rows == raha_per_dataset.json (exact) | |
| ref = json.load(open(RESULTS / "raha_per_dataset.json")) | |
| checks = [] | |
| for key, ref_key in [("grounded", "grounded"), | |
| ("openrefine_fingerprint", "openrefine_fingerprint"), | |
| ("openrefine_knn", "openrefine_knn"), | |
| ("verified_union_v6_tau0.5", "verified_union_v6_tau0.5"), | |
| ("baran_oracle20", "baran_oracle20")]: | |
| for got, want in zip(out["systems"][key]["per_dataset"], | |
| ref["systems"][ref_key]["per_dataset"]): | |
| for k in ("f1", "precision", "recall", "damage"): | |
| ok = abs(got["churn_neutral"][k] - want[k]) < 1e-9 | |
| checks.append(ok) | |
| if not ok: | |
| print(f"MISMATCH {key}/{got['dataset']}/{k}: " | |
| f"{got['churn_neutral'][k]} vs {want[k]}") | |
| out["acceptance"] = {"vs": "raha_per_dataset.json", "n_cells": len(checks), | |
| "pass": all(checks)} | |
| print(f"\nacceptance: {sum(checks)}/{len(checks)} cells match " | |
| f"-> {'PASS' if all(checks) else 'FAIL'}") | |
| if not all(checks): | |
| raise SystemExit("acceptance FAILED") | |
| # Kendall tau-b between system rankings under the two F1s | |
| primary = ["grounded", "verified_union_v6_tau0.5", "openrefine_fingerprint", | |
| "openrefine_knn", "baran_oracle20"] | |
| extended = primary + ["baran_oracle0", "baran_oracle5"] | |
| taus = {} | |
| for label, sysset in [("primary", primary), ("extended", extended)]: | |
| per_ds = {} | |
| for i, (name, _dom) in enumerate(RAHA): | |
| xs = [out["systems"][s]["per_dataset"][i]["original"]["f1"] for s in sysset] | |
| ys = [out["systems"][s]["per_dataset"][i]["churn_neutral"]["f1"] for s in sysset] | |
| per_ds[name] = kendall_tau(xs, ys) | |
| xs = [out["systems"][s]["macro_f1_original"] for s in sysset] | |
| ys = [out["systems"][s]["macro_f1_churn_neutral"] for s in sysset] | |
| taus[label] = {"systems": sysset, "per_dataset": per_ds, | |
| "macro": kendall_tau(xs, ys)} | |
| print(f"tau-b ({label}): macro={taus[label]['macro']:.3f} " + | |
| " ".join(f"{n}={t:.3f}" for n, t in per_ds.items())) | |
| out["kendall_tau_b"] = taus | |
| # calibration: our Baran oracle+20 repro (ORIGINAL metric) vs published Table 3 | |
| cal = [] | |
| b20 = {r["dataset"]: r for r in out["systems"]["baran_oracle20"]["per_dataset"]} | |
| for name, pub in PUBLISHED.items(): | |
| ours = b20[name]["original"] | |
| cal.append({"dataset": name, "published_f1": pub["f1"], | |
| "published_precision": pub["precision"], | |
| "published_recall": pub["recall"], | |
| "repro_f1": ours["f1"], "repro_precision": ours["precision"], | |
| "repro_recall": ours["recall"], | |
| "delta_f1": ours["f1"] - pub["f1"]}) | |
| print(f"calibration {name:<10} published F1={pub['f1']:.2f} " | |
| f"repro F1={ours['f1']:.3f} (d={ours['f1'] - pub['f1']:+.3f})") | |
| out["calibration"] = { | |
| "source": "Mahdavi & Abedjan, PVLDB 13(12) p1948, Table 3 row 'Baran' " | |
| "(no TL): complete error set given (oracle detection), budget 20, " | |
| "mean of 10 runs; PDF read 2026-06-12", | |
| "notes": "their runs: full datasets, 10 label seeds, Wikipedia value models " | |
| "available in package but Table-3 row is without TL; ours: 3 label " | |
| "seeds, no pretraining, movies_1 not in their paper; our " | |
| "churn-neutral macro for this row is the paper's 0.811", | |
| "rows": cal} | |
| dest = RESULTS / "cross_scoring.json" | |
| json.dump(out, open(dest, "w"), indent=1) | |
| print(f"written to {dest}") | |
| print(latex(out)) | |
| LABELS = [("grounded", "Grounded (ours, deterministic)"), | |
| ("verified_union_v6_tau0.5", r"Verified union (v6, $\tau{=}0.5$)"), | |
| ("openrefine_fingerprint", "OpenRefine fingerprint"), | |
| ("openrefine_knn", "OpenRefine kNN"), | |
| ("baran_oracle20", r"Baran (oracle det.\ + 20 labels)")] | |
| def latex(out: dict) -> str: | |
| """Booktabs rows: per system x dataset, original P/R/F1 next to churn-neutral | |
| P/R/F1 + damage.""" | |
| L = [r"\begin{tabular}{llrrrrrrr}", r"\toprule", | |
| r" & & \multicolumn{3}{c}{Original (Baran) metric} & " | |
| r"\multicolumn{4}{c}{Churn-neutral (ours)} \\", | |
| r"\cmidrule(lr){3-5}\cmidrule(lr){6-9}", | |
| r"System & Dataset & Prec. & Rec. & F1 & Prec. & Rec. & F1 & Damage \\", | |
| r"\midrule"] | |
| for key, label in LABELS: | |
| for i, r in enumerate(out["systems"][key]["per_dataset"]): | |
| o, c = r["original"], r["churn_neutral"] | |
| L.append(f"{label if i == 0 else ''} & " | |
| f"{r['dataset'].replace('_', r'\_')} & " | |
| f"{o['precision']:.3f} & {o['recall']:.3f} & {o['f1']:.3f} & " | |
| f"{c['precision']:.3f} & {c['recall']:.3f} & {c['f1']:.3f} & " | |
| f"{c['damage']:.3f} \\\\") | |
| L.append(f" & \\emph{{macro}} & & & " | |
| f"\\emph{{{out['systems'][key]['macro_f1_original']:.3f}}} & & & " | |
| f"\\emph{{{out['systems'][key]['macro_f1_churn_neutral']:.3f}}} & \\\\") | |
| L.append(r"\midrule") | |
| t = out["kendall_tau_b"]["primary"] | |
| L.append(r"\multicolumn{9}{l}{Kendall $\tau_b$ between system rankings: " | |
| f"macro {t['macro']:.2f}; per dataset " + | |
| ", ".join(f"{n.replace('_', r'\_')} {v:.2f}" | |
| for n, v in t["per_dataset"].items()) + r"} \\") | |
| cal = ", ".join(f"{r['dataset'].replace('_', r'\_')} {r['repro_f1']:.3f} vs " | |
| f"{r['published_f1']:.2f}" for r in out["calibration"]["rows"]) | |
| L.append(r"\multicolumn{9}{l}{Calibration, original metric (our Baran oracle+20 " | |
| r"repro vs PVLDB'20 Table~3): " + cal + r"} \\") | |
| L.append(r"\bottomrule") | |
| L.append(r"\end{tabular}") | |
| return "\n".join(L) | |
| if __name__ == "__main__": | |
| main() | |