Spaces:

build-small-hackathon
/

scrubdata

Running

scrubdata / eval /cross_scoring.py

OpenAI Codex

deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build

16dc556 13 days ago

14.4 kB

	"""B1 (W4.2) dual-metric cross-scoring on the 5 Raha real-error datasets.

	Scores every system under BOTH metric families, side by side:
	* original — the Raha/Baran cell-level repair protocol (Mahdavi & Abedjan,
	PVLDB 13(12), p1948, Sec 6.1 + raha/dataset.py get_data_cleaning_evaluation):
	values minimally normalized (html-unescape, whitespace collapse — their
	value_normalizer), then RAW string equality; precision = exact-gold repairs /
	cells changed; recall = exact-gold repairs / (dirty->clean diff); no
	churn-neutrality, no case folding, no semantic tolerance, no damage metric.
	* churn_neutral — our eval.run_real_multi.score (the scoring contract):
	convention-normalized, churn ignored, damage reported.

	Systems: grounded (HEAD mock_plan), verified union (v6, tau=0.5 — identical plan
	files to eval.raha_table), OpenRefine fingerprint/kNN, and Baran at labeling
	budgets 0/5/20 (oracle detection; repaired CSVs from eval/run_baran.py, 3 seeds,
	seed-mean). Baran-from-CSV caveat: corrections equal to the dirty value vanish
	from the repaired-vs-dirty diff, so reconstructed |changed| is a lower bound on
	Baran's own output_size (precision an upper bound; recall exact).

	Also computes Kendall tau-b between the SYSTEM RANKINGS induced by the two F1s
	(per dataset + macro), and a calibration block: our Baran oracle+20 repro vs the
	published Table 3 "Baran" row (verified from the PVLDB PDF; see PUBLISHED below).

	Acceptance: the churn-neutral rows must reproduce eval/results/raha_per_dataset.json
	exactly (checked, hard-fails otherwise).

	uv run python -m eval.cross_scoring
	Writes eval/results/cross_scoring.json and prints LaTeX rows.
	"""

	from __future__ import annotations

	import html
	import json
	import re
	from pathlib import Path

	import pandas as pd

	from scrubdata.baselines import openrefine_fingerprint_plan, openrefine_knn_plan
	from scrubdata.executor import apply_plan
	from scrubdata.planner import mock_plan
	from scrubdata.verifier import union_plans, verify_plan

	from .precision_curve import _repairs_only
	from .raha_table import TAU, UNION_PLANS, _gen_plan
	from .run_real_multi import RAHA, _cell_only, _raha_pair, score

	# Directory next to this file that holds every result artifact this module
	# reads (reference JSON, Baran repaired CSVs) or writes (cross_scoring.json).
	RESULTS = Path(__file__).resolve().parent / "results"
	# Baran labeling budget -> directory searched for "<dataset>_seed*_repaired.csv".
	# Note the budget-20 run lives in the unsuffixed "baran" directory.
	BARAN_DIRS = {0: RESULTS / "baran_n0", 5: RESULTS / "baran_n5", 20: RESULTS / "baran"}

	# Baran PVLDB'20 Table 3, row "Baran" (no TL): complete set of data errors given as
	# input (= oracle detection), labeling budget 20, mean of 10 runs. Verified by reading
	# vldb.org/pvldb/vol13/p1948-mahdavi.pdf p1957 (2026-06-12). movies_1 is not evaluated
	# in the paper (its real-error sets are hospital/flights/address/beers/rayyan/it/tax).
	# Keyed by dataset name; main() compares these against the baran_oracle20 rows
	# scored under the "original" metric (the calibration block).
	PUBLISHED = {"hospital": {"precision": 0.88, "recall": 0.86, "f1": 0.87},
	"flights": {"precision": 1.00, "recall": 1.00, "f1": 1.00},
	"beers": {"precision": 0.91, "recall": 0.89, "f1": 0.90},
	"rayyan": {"precision": 0.76, "recall": 0.40, "f1": 0.52}}


	def _norm(v: str) -> str:
	    """Minimally normalize one cell value (Raha value_normalizer semantics).

	    The value is stringified, HTML entities are unescaped, runs of tab /
	    newline / space are collapsed to one space, and the same three
	    characters are stripped from both ends.

	    Only the first 32 whitespace runs are collapsed: the original call passed
	    ``re.UNICODE`` as the fourth positional argument of ``re.sub``, which is
	    ``count`` (``re.UNICODE == 32``), not ``flags``. The cap is kept on
	    purpose so scores stay comparable with the reference protocol; it is now
	    spelled as an explicit keyword, which also avoids the Python 3.13
	    deprecation of positional ``count``.
	    NOTE(review): presumably this mirrors the upstream Raha code — confirm
	    before replacing it with ``flags=re.UNICODE``.
	    """
	    v = html.unescape(str(v))
	    v = re.sub("[\t\n ]+", " ", v, count=int(re.UNICODE))
	    return v.strip("\t\n ")


	def baran_score(dirty: pd.DataFrame, clean: pd.DataFrame, out: pd.DataFrame) -> dict:
	    """Score a repaired frame with the original Raha/Baran repair metric.

	    Every cell is normalized with ``_norm`` and then compared by raw string
	    equality. A cell is an *error* when dirty != clean, *changed* when
	    out != dirty, and a true positive when it is both and out == clean.

	    Args:
	        dirty: the input table; its columns and their order drive the scan.
	        clean: the gold table, read positionally (same column order as
	            ``dirty`` is assumed).
	        out: the repaired table, matched to ``dirty`` by column *name*;
	            a column missing from ``out`` counts as left unchanged.

	    Only the first ``min(len(dirty), len(out), len(clean))`` rows are scored.

	    Returns:
	        dict with ``f1``, ``precision`` (tp / changed), ``recall``
	        (tp / errors) — each 0.0 when its denominator is 0 — plus the raw
	        counts ``_errors``, ``_changed`` and ``_tp``.
	    """
	    n = min(len(dirty), len(out), len(clean))
	    errors = changed = tp = 0
	    for j, col in enumerate(dirty.columns):
	        # Read whole columns once. The previous per-cell out.iloc[i][col]
	        # built a full row Series for every cell and, on mixed-dtype frames,
	        # upcast the row (int 1 -> "1.0"), so untouched cells looked changed.
	        dirty_vals = [_norm(v) for v in dirty.iloc[:n, j].tolist()]
	        clean_vals = [_norm(v) for v in clean.iloc[:n, j].tolist()]
	        if col in out.columns:
	            out_vals = [_norm(v) for v in out[col].iloc[:n].tolist()]
	        else:
	            out_vals = dirty_vals  # column dropped by the system: no change
	        for dv, cv, ov in zip(dirty_vals, clean_vals, out_vals):
	            is_error = dv != cv
	            is_changed = ov != dv
	            errors += is_error
	            changed += is_changed
	            tp += is_changed and is_error and ov == cv
	    p = tp / changed if changed else 0.0
	    r = tp / errors if errors else 0.0
	    f1 = 2 * p * r / (p + r) if (p + r) else 0.0
	    return {"f1": f1, "precision": p, "recall": r,
	            "_errors": errors, "_changed": changed, "_tp": tp}


	def _both(dirty, clean, out) -> dict:
	    """Score one repaired frame under both metric families.

	    Returns a dict with two entries: ``"original"`` is the full result of
	    ``baran_score``; ``"churn_neutral"`` is the subset of ``score``'s result
	    restricted to f1 / precision / recall / damage and the raw counts
	    ``_errors`` / ``_changed`` / ``_fixed``.
	    """
	    contract = score(dirty, clean, out)
	    original = baran_score(dirty, clean, out)
	    kept_fields = ("f1", "precision", "recall", "damage",
	                   "_errors", "_changed", "_fixed")
	    churn_neutral = {}
	    for field in kept_fields:
	        churn_neutral[field] = contract[field]
	    return {"original": original, "churn_neutral": churn_neutral}


	def kendall_tau(xs, ys) -> float:
	    """Return Kendall's tau-b (tie-corrected) for two equal-length sequences.

	    Every unordered pair of positions is classified as tied in ``xs``, tied
	    in ``ys``, concordant or discordant. The result is
	    (concordant - discordant) / sqrt((pairs - x_ties) * (pairs - y_ties)),
	    or 0.0 when that denominator is zero (fewer than two items, or one
	    sequence entirely tied). Standard library only.
	    """
	    size = len(xs)
	    total_pairs = size * (size - 1) // 2
	    ties_x = 0
	    ties_y = 0
	    concordant = 0
	    discordant = 0
	    for first in range(size):
	        for second in range(first + 1, size):
	            dx = xs[first] - xs[second]
	            dy = ys[first] - ys[second]
	            x_tied = dx == 0
	            y_tied = dy == 0
	            ties_x += x_tied
	            ties_y += y_tied
	            if x_tied or y_tied:
	                continue  # tied pairs are neither concordant nor discordant
	            if (dx > 0) == (dy > 0):
	                concordant += 1
	            else:
	                discordant += 1
	    denom = ((total_pairs - ties_x) * (total_pairs - ties_y)) ** 0.5
	    if not denom:
	        return 0.0
	    return (concordant - discordant) / denom


	def _mean_rows(rows: list[dict]) -> dict:
	    """Average a list of metric dicts key by key.

	    The keys of the first row decide which fields are averaged; every row
	    must contain them. An empty list raises ``IndexError``.
	    """
	    count = len(rows)
	    averaged = {}
	    for key in rows[0]:
	        total = sum(row[key] for row in rows)
	        averaged[key] = total / count
	    return averaged


	def _print_f1s(system: str, dataset: str, m: dict) -> None:
	    """Print one progress line with both F1s for a (system, dataset) pair."""
	    print(f" {system:<24}{dataset:<10} orig={m['original']['f1']:.3f} "
	          f"cn={m['churn_neutral']['f1']:.3f}", flush=True)


	def main() -> None:
	    """Score every system under both metric families and write the results.

	    Steps, in order:
	      1. deterministic planners (grounded, OpenRefine fingerprint / kNN);
	      2. the verified union (stored or generated plan, verified at ``TAU``,
	         unioned with ``mock_plan``);
	      3. Baran at each labeling budget in ``BARAN_DIRS``, seed-averaged over
	         the repaired CSVs found on disk;
	      4. macro F1 per system and metric family;
	      5. acceptance check of the churn-neutral rows against
	         ``raha_per_dataset.json``;
	      6. Kendall tau-b between the two F1-induced system rankings;
	      7. calibration of the Baran budget-20 rows against ``PUBLISHED``.

	    Writes ``cross_scoring.json`` under ``RESULTS`` and prints the LaTeX
	    table.

	    Raises:
	        FileNotFoundError: no repaired CSV exists for a dataset/budget.
	        SystemExit: the acceptance check fails (nothing is written).
	    """
	    out = {"protocol": {
	        "original": "Raha/Baran convention: value_normalizer (html-unescape + "
	                    "whitespace collapse) then raw string equality; P = exact-gold "
	                    "repairs / changed cells, R = exact-gold repairs / (dirty->clean "
	                    "diff); no churn-neutrality, no damage",
	        "churn_neutral": "eval.run_real_multi.score — the scoring contract",
	        # "|changed|" used to be written with backslash-escaped pipes, which
	        # is an invalid escape sequence in a normal string literal.
	        "baran_rows": "oracle error positions + n gold labels, 3 seeds, seed-mean; "
	                      "reconstructed from repaired CSVs (no-op corrections vanish: "
	                      "|changed| lower-bounds Baran's output_size)",
	        "movies_1": "first 2000 rows (_raha_pair), as everywhere in the suite"},
	        "systems": {}}

	    # 1) deterministic planners: plan -> cell-level ops only -> apply -> score
	    deterministic = [("grounded", mock_plan),
	                     ("openrefine_fingerprint", openrefine_fingerprint_plan),
	                     ("openrefine_knn", openrefine_knn_plan)]
	    for label, planner in deterministic:
	        rows = []
	        for name, _dom in RAHA:
	            dirty, clean = _raha_pair(name)
	            cleaned, _ = apply_plan(dirty, _cell_only(planner(dirty)))
	            m = _both(dirty, clean, cleaned)
	            rows.append({"dataset": name, **m})
	            _print_f1s(label, name, m)
	        out["systems"][label] = {"per_dataset": rows}

	    # 2) verified union: stored plan if one exists, otherwise generate it
	    rows = []
	    for name, _dom in RAHA:
	        if name in UNION_PLANS:
	            with open(UNION_PLANS[name], encoding="utf-8") as fh:
	                base = json.load(fh)
	        else:
	            base = _gen_plan(name)
	        dirty, clean = _raha_pair(name)
	        plan = _repairs_only(union_plans(verify_plan(dirty, base, tau=TAU),
	                                         mock_plan(dirty)))
	        cleaned, _ = apply_plan(dirty, plan)
	        m = _both(dirty, clean, cleaned)
	        rows.append({"dataset": name, **m})
	        _print_f1s("verified_union", name, m)
	    out["systems"]["verified_union_v6_tau0.5"] = {"per_dataset": rows}

	    # 3) Baran: score each seed's repaired CSV, then average over seeds
	    for n_labels, csv_dir in BARAN_DIRS.items():
	        rows = []
	        for name, _dom in RAHA:
	            dirty, clean = _raha_pair(name)
	            seed_csvs = sorted(csv_dir.glob(f"{name}_seed*_repaired.csv"))
	            if not seed_csvs:
	                # Without this the failure is a bare IndexError inside
	                # _mean_rows with no hint about which files are missing.
	                raise FileNotFoundError(
	                    f"no {name}_seed*_repaired.csv under {csv_dir}; "
	                    "run eval/run_baran.py first")
	            per_seed = []
	            for path in seed_csvs:
	                repaired = pd.read_csv(path, dtype=str, keep_default_na=False)
	                per_seed.append(_both(dirty, clean, repaired))
	            m = {"original": _mean_rows([s["original"] for s in per_seed]),
	                 "churn_neutral": _mean_rows([s["churn_neutral"] for s in per_seed])}
	            rows.append({"dataset": name, "n_seeds": len(per_seed), **m})
	            _print_f1s("baran_oracle%d" % n_labels, name, m)
	        out["systems"][f"baran_oracle{n_labels}"] = {"per_dataset": rows}

	    # 4) macro F1 per system, for each metric family
	    for system in out["systems"].values():
	        for fam in ("original", "churn_neutral"):
	            system[f"macro_f1_{fam}"] = _mean_rows(
	                [r[fam] for r in system["per_dataset"]])["f1"]

	    # 5) acceptance: churn-neutral rows == raha_per_dataset.json (exact)
	    with open(RESULTS / "raha_per_dataset.json", encoding="utf-8") as fh:
	        ref = json.load(fh)
	    checks = []
	    for key, ref_key in [("grounded", "grounded"),
	                         ("openrefine_fingerprint", "openrefine_fingerprint"),
	                         ("openrefine_knn", "openrefine_knn"),
	                         ("verified_union_v6_tau0.5", "verified_union_v6_tau0.5"),
	                         ("baran_oracle20", "baran_oracle20")]:
	        got_rows = out["systems"][key]["per_dataset"]
	        want_rows = ref["systems"][ref_key]["per_dataset"]
	        if len(got_rows) != len(want_rows):
	            # zip() below would silently drop the surplus rows and could let
	            # a short (or empty) reference pass vacuously.
	            checks.append(False)
	            print(f"MISMATCH {key}: {len(got_rows)} datasets scored vs "
	                  f"{len(want_rows)} in reference")
	        for got, want in zip(got_rows, want_rows):
	            for k in ("f1", "precision", "recall", "damage"):
	                ok = abs(got["churn_neutral"][k] - want[k]) < 1e-9
	                checks.append(ok)
	                if not ok:
	                    print(f"MISMATCH {key}/{got['dataset']}/{k}: "
	                          f"{got['churn_neutral'][k]} vs {want[k]}")
	    out["acceptance"] = {"vs": "raha_per_dataset.json", "n_cells": len(checks),
	                         "pass": all(checks)}
	    print(f"\nacceptance: {sum(checks)}/{len(checks)} cells match "
	          f"-> {'PASS' if all(checks) else 'FAIL'}")
	    if not all(checks):
	        raise SystemExit("acceptance FAILED")

	    # 6) Kendall tau-b between system rankings under the two F1s
	    primary = ["grounded", "verified_union_v6_tau0.5", "openrefine_fingerprint",
	               "openrefine_knn", "baran_oracle20"]
	    extended = primary + ["baran_oracle0", "baran_oracle5"]
	    taus = {}
	    for label, sysset in [("primary", primary), ("extended", extended)]:
	        per_ds = {}
	        # per_dataset rows were appended in RAHA order, so index i lines up
	        for i, (name, _dom) in enumerate(RAHA):
	            xs = [out["systems"][s]["per_dataset"][i]["original"]["f1"] for s in sysset]
	            ys = [out["systems"][s]["per_dataset"][i]["churn_neutral"]["f1"] for s in sysset]
	            per_ds[name] = kendall_tau(xs, ys)
	        xs = [out["systems"][s]["macro_f1_original"] for s in sysset]
	        ys = [out["systems"][s]["macro_f1_churn_neutral"] for s in sysset]
	        taus[label] = {"systems": sysset, "per_dataset": per_ds,
	                       "macro": kendall_tau(xs, ys)}
	        print(f"tau-b ({label}): macro={taus[label]['macro']:.3f} " +
	              " ".join(f"{n}={t:.3f}" for n, t in per_ds.items()))
	    out["kendall_tau_b"] = taus

	    # 7) calibration: our Baran oracle+20 repro (ORIGINAL metric) vs published Table 3
	    cal = []
	    b20 = {r["dataset"]: r for r in out["systems"]["baran_oracle20"]["per_dataset"]}
	    for name, pub in PUBLISHED.items():
	        ours = b20[name]["original"]
	        cal.append({"dataset": name, "published_f1": pub["f1"],
	                    "published_precision": pub["precision"],
	                    "published_recall": pub["recall"],
	                    "repro_f1": ours["f1"], "repro_precision": ours["precision"],
	                    "repro_recall": ours["recall"],
	                    "delta_f1": ours["f1"] - pub["f1"]})
	        print(f"calibration {name:<10} published F1={pub['f1']:.2f} "
	              f"repro F1={ours['f1']:.3f} (d={ours['f1'] - pub['f1']:+.3f})")
	    out["calibration"] = {
	        "source": "Mahdavi & Abedjan, PVLDB 13(12) p1948, Table 3 row 'Baran' "
	                  "(no TL): complete error set given (oracle detection), budget 20, "
	                  "mean of 10 runs; PDF read 2026-06-12",
	        "notes": "their runs: full datasets, 10 label seeds, Wikipedia value models "
	                 "available in package but Table-3 row is without TL; ours: 3 label "
	                 "seeds, no pretraining, movies_1 not in their paper; our "
	                 "churn-neutral macro for this row is the paper's 0.811",
	        "rows": cal}

	    dest = RESULTS / "cross_scoring.json"
	    # Context manager guarantees the JSON is flushed and the handle closed.
	    with open(dest, "w", encoding="utf-8") as fh:
	        json.dump(out, fh, indent=1)
	    print(f"written to {dest}")
	    print(latex(out))


	# (system key in out["systems"], row label printed in the LaTeX table), in
	# the order the systems appear in the table.
	LABELS = [("grounded", "Grounded (ours, deterministic)"),
	          ("verified_union_v6_tau0.5", r"Verified union (v6, $\tau{=}0.5$)"),
	          ("openrefine_fingerprint", "OpenRefine fingerprint"),
	          ("openrefine_knn", "OpenRefine kNN"),
	          ("baran_oracle20", r"Baran (oracle det.\ + 20 labels)")]


	def _tex_escape(name: str) -> str:
	    """Escape underscores so a dataset name is safe in LaTeX text mode."""
	    return name.replace("_", r"\_")


	def latex(out: dict) -> str:
	    """Render the cross-scoring results as a booktabs ``tabular``.

	    For each system in ``LABELS`` there is one row per dataset (original
	    P/R/F1 next to churn-neutral P/R/F1 + damage) followed by a macro-F1 row;
	    the table closes with the primary Kendall tau-b line and the calibration
	    line.

	    Args:
	        out: the results dict built by ``main``; reads ``systems``,
	            ``kendall_tau_b["primary"]`` and ``calibration["rows"]``.

	    Returns:
	        The table as a single newline-joined string.

	    The underscore escaping lives in ``_tex_escape`` rather than inside the
	    f-string replacement fields: a backslash inside an f-string expression is
	    a SyntaxError before Python 3.12.
	    """
	    lines = [r"\begin{tabular}{llrrrrrrr}", r"\toprule",
	             r" & & \multicolumn{3}{c}{Original (Baran) metric} & "
	             r"\multicolumn{4}{c}{Churn-neutral (ours)} \\",
	             r"\cmidrule(lr){3-5}\cmidrule(lr){6-9}",
	             r"System & Dataset & Prec. & Rec. & F1 & Prec. & Rec. & F1 & Damage \\",
	             r"\midrule"]
	    for key, label in LABELS:
	        system = out["systems"][key]
	        for i, r in enumerate(system["per_dataset"]):
	            o, c = r["original"], r["churn_neutral"]
	            # the system label is printed on its first dataset row only
	            lines.append(f"{label if i == 0 else ''} & "
	                         f"{_tex_escape(r['dataset'])} & "
	                         f"{o['precision']:.3f} & {o['recall']:.3f} & {o['f1']:.3f} & "
	                         f"{c['precision']:.3f} & {c['recall']:.3f} & {c['f1']:.3f} & "
	                         f"{c['damage']:.3f} \\\\")
	        lines.append(f" & \\emph{{macro}} & & & "
	                     f"\\emph{{{system['macro_f1_original']:.3f}}} & & & "
	                     f"\\emph{{{system['macro_f1_churn_neutral']:.3f}}} & \\\\")
	        # NOTE(review): one \midrule after each system's block — confirm
	        # against the rendered table.
	        lines.append(r"\midrule")
	    t = out["kendall_tau_b"]["primary"]
	    per_ds = ", ".join(f"{_tex_escape(n)} {v:.2f}"
	                       for n, v in t["per_dataset"].items())
	    lines.append(r"\multicolumn{9}{l}{Kendall $\tau_b$ between system rankings: "
	                 f"macro {t['macro']:.2f}; per dataset " + per_ds + r"} \\")
	    cal = ", ".join(f"{_tex_escape(r['dataset'])} {r['repro_f1']:.3f} vs "
	                    f"{r['published_f1']:.2f}" for r in out["calibration"]["rows"])
	    lines.append(r"\multicolumn{9}{l}{Calibration, original metric (our Baran oracle+20 "
	                 r"repro vs PVLDB'20 Table~3): " + cal + r"} \\")
	    lines.append(r"\bottomrule")
	    lines.append(r"\end{tabular}")
	    return "\n".join(lines)


	if __name__ == "__main__":
	main()