Spaces:

build-small-hackathon
/

scrubdata

Running

scrubdata / eval /run_real_multi.py

OpenAI Codex

deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build

16dc556 13 days ago

13.1 kB

	"""Improved real-data north-star — fixes the biases of single-dataset `repair_recall`.

	repair_recall on hospital alone is biased: (1) ONE dataset (overfit to its quirks),
	(2) RECALL-ONLY so it rewards over-correction (v5: recall .42 / precision .16),
	(3) CONVENTION-sensitive (penalizes case/format differences that are semantically right),
	(4) ABSTAIN-blind (punishes correctly declining on uncertain values).

	This harness fixes all four:
	* MULTI-DATASET (Raha suite) -> macro-average, not one table.
	* F1 (recall AND precision) + DAMAGE rate (clean cells we corrupted) -> over-correction
	is penalized, not rewarded.
	* CONVENTION-NORMALIZED correctness (case/whitespace-insensitive) -> measures the right
	VALUE, not surface convention.
	* an ADVERSARIAL ABSTAIN slice -> the system must fix real typos but LEAVE trap values
	(ambiguous / not-a-typo) untouched. Abstaining correctly scores; wrong-merging doesn't.

	uv run python -m eval.run_real_multi
	"""

	from __future__ import annotations

	import random

	import pandas as pd

	from scrubdata.executor import apply_plan
	from scrubdata.planner import mock_plan

	from .metrics import _cell_equal

	# Raha benchmark dataset names. NOTE(review): not referenced by the code visible in this
	# file (the suite is driven by RAHA) — confirm whether another module imports it.
	DATASETS = ["hospital", "beers", "flights", "rayyan"]
	# Base URL of the Raha datasets directory; _fetch appends "/<name>/<file>" to it.
	RAW = "https://raw.githubusercontent.com/BigDaMa/raha/master/datasets"


	def _cell_only(plan: dict) -> dict:
	"""Strip row-count-changing table ops (dedup / drop-empty-rows) so the cleaned output
	stays row-aligned with the reference — we are measuring CELL-level cleaning here."""
	p = dict(plan)
	p["table_operations"] = [o for o in plan.get("table_operations", [])
	if o.get("op") not in ("drop_exact_duplicates", "drop_empty_rows")]
	return p


	def _sem_equal(a, b) -> bool:
	    """Semantic equality: ``_cell_equal`` match OR equal after case/whitespace folding.

	    Separates the right VALUE from surface convention (Birmingham == birmingham).
	    """
	    def _folded(value) -> str:
	        # Surface-convention normal form: stringified, trimmed, lower-cased.
	        return str(value).strip().lower()

	    if not _cell_equal(a, b):
	        return _folded(a) == _folded(b)
	    return True


	def _fetch(name: str):
	    """Return ``[dirty, clean]`` DataFrames for the Raha dataset ``name``.

	    The CSVs are cached in ``data/real/<name>/`` two directories above this file; a
	    file missing from the cache is downloaded from ``RAW`` first. Every column is
	    read as ``str`` with default NA parsing disabled, so cells stay raw strings.

	    The download is written to a ``.part`` file and renamed only on success, so an
	    interrupted transfer cannot leave a truncated CSV that later runs would treat as
	    a valid cache hit. Download errors from ``urlretrieve`` propagate to the caller.
	    """
	    import urllib.request
	    from pathlib import Path

	    base = Path(__file__).resolve().parent.parent / "data" / "real" / name
	    base.mkdir(parents=True, exist_ok=True)
	    frames = []
	    for fn in ("dirty.csv", "clean.csv"):
	        target = base / fn
	        if not target.exists():
	            partial = target.with_name(fn + ".part")
	            try:
	                urllib.request.urlretrieve(f"{RAW}/{name}/{fn}", partial)
	                partial.replace(target)
	            finally:
	                # No-op after a successful rename; removes the stub after a failure.
	                partial.unlink(missing_ok=True)
	        frames.append(pd.read_csv(target, dtype=str, keep_default_na=False))
	    return frames


	def score(dirty: pd.DataFrame, clean: pd.DataFrame, out: pd.DataFrame) -> dict:
	    """Precision/recall/F1 (convention-normalized) + damage rate.

	    Churn-neutral: a rewrite that is sem-equal to the INPUT but does not restore the
	    gold (pure case/whitespace churn) counts as NOTHING — not a change, not a fix, not
	    damage. Otherwise bulk convention-rewrites of clean columns inflate precision (the
	    '-case match' ablation artifact). Fixing an error also requires actually ACTING:
	    a case-injected error left untouched is sem-equal to gold but is NOT a fix.

	    Args:
	        dirty: the input table; its columns drive the iteration.
	        clean: the gold table, read POSITIONALLY (same row/column order as ``dirty``).
	        out: the cleaned table, matched to ``dirty`` by column NAME and row position.
	            A column missing from ``out`` is scored as left untouched.

	    Returns:
	        dict with ``f1``, ``recall``, ``precision``, ``damage`` (share of clean cells
	        changed to a wrong value) and the raw counts ``_errors``, ``_changed``,
	        ``_fixed``. Only the first ``min(len(dirty), len(out), len(clean))`` rows are
	        scored. With no errors recall is 0.0; with no changes precision is 1.0.
	    """
	    n = min(len(dirty), len(out), len(clean))
	    # errors_abstained is tallied but is not part of the returned dict.
	    errors = fixed = changed = good_changes = clean_cells = damage = errors_abstained = 0
	    for j, col in enumerate(dirty.columns):
	        present = col in out.columns
	        # Fetch the output column once: indexing ``out.iloc[i][col]`` per cell would
	        # materialize a whole row for every cell and upcast mixed dtypes row-wise.
	        out_col = out[col] if present else None
	        for i in range(n):
	            dv, cv = dirty.iat[i, j], clean.iat[i, j]
	            ov = out_col.iat[i] if present else dv
	            err = not _cell_equal(dv, cv)              # benchmark error (raw)
	            chg = present and not _cell_equal(ov, dv)  # we changed it
	            raw_ok = present and _cell_equal(ov, cv)   # exactly restored gold
	            sem_ok = _sem_equal(ov, cv)                # right value (convention-tolerant)
	            if chg and _sem_equal(ov, dv) and not raw_ok:
	                chg = False                            # churn: ignore entirely
	            if err:
	                errors += 1
	                if raw_ok or (sem_ok and chg):
	                    fixed += 1                         # real restoration or semantic fix
	                elif not chg:
	                    errors_abstained += 1              # left an error untouched
	            else:
	                clean_cells += 1
	                if chg and not sem_ok:
	                    damage += 1                        # corrupted a clean cell
	            # Precision is over ALL non-churn changes, on error and clean cells alike.
	            if chg:
	                changed += 1
	                if sem_ok:
	                    good_changes += 1
	    recall = fixed / errors if errors else 0.0
	    precision = good_changes / changed if changed else 1.0
	    f1 = 2 * recall * precision / (recall + precision) if (recall + precision) else 0.0
	    return {"f1": f1, "recall": recall, "precision": precision,
	            "damage": damage / clean_cells if clean_cells else 0.0,
	            "_errors": errors, "_changed": changed, "_fixed": fixed}


	def abstain_slice(planner, seed: int = 3) -> dict:
	    """Adversarial: a city column with real typos (must fix) + TRAP values (must leave).

	    Tests that the system abstains rather than wrong-merges. The column holds 8 clean
	    copies of each canonical city, one row per typo and one row per trap, shuffled
	    with ``random.Random(seed)``. The planner's plan is applied with row-count-changing
	    table ops stripped, so outputs stay aligned with the gold labels.

	    Args:
	        planner: callable taking the one-column DataFrame and returning a plan dict.
	        seed: shuffle seed for the row order.

	    Returns:
	        dict with ``typo_recall`` (typo rows whose output is sem-equal to the canonical
	        city), ``abstain_accuracy`` (trap rows whose output is still sem-equal to the
	        original value) and the slice sizes ``_typos`` / ``_traps``.
	    """
	    rng = random.Random(seed)
	    canon = ["Chicago", "Boston", "Houston", "Phoenix", "Dallas"]
	    # Every typo key must DIFFER from its gold value; an entry that already equals
	    # gold would count as "fixed" even for a planner that does nothing.
	    typos = {"Chcago": "Chicago", "Bostton": "Boston", "Housten": "Houston"}
	    # traps must NOT be single-edit variants of any reference entity (else mapping them
	    # is arguably CORRECT and the trap mis-scores grounding): garbage strings + one real
	    # rare city (must stay; catches freq-cluster over-merge into the dominant values).
	    traps = ["Xqzzyville", "Qwortelby", "Zzanthor Flats", "Carmel"]

	    # (value, gold, kind) triples: clean anchors, then typos, then traps.
	    cells = [(c, c, "clean") for c in canon for _ in range(8)]
	    cells += [(t, g, "typo") for t, g in typos.items()]   # real typos -> must FIX
	    cells += [(t, t, "trap") for t in traps]              # traps -> must LEAVE

	    idx = list(range(len(cells)))
	    rng.shuffle(idx)
	    col = [cells[i][0] for i in idx]
	    gold = [cells[i][1] for i in idx]
	    kind = [cells[i][2] for i in idx]

	    df = pd.DataFrame({"city": col})
	    cleaned, _ = apply_plan(df, _cell_only(planner(df)))
	    # If the plan removed the column, score the untouched input values.
	    out = cleaned["city"].tolist() if "city" in cleaned.columns else col
	    typo_fixed = sum(1 for o, g, k in zip(out, gold, kind) if k == "typo" and _sem_equal(o, g))
	    trap_left = sum(1 for o, g, k in zip(out, gold, kind) if k == "trap" and _sem_equal(o, g))
	    n_typo = kind.count("typo")
	    n_trap = kind.count("trap")
	    return {"typo_recall": typo_fixed / n_typo, "abstain_accuracy": trap_left / n_trap,
	            "_typos": n_typo, "_traps": n_trap}


	def _mean(xs):
	xs = list(xs)
	return sum(xs) / len(xs) if xs else 0.0


	# ---- the validation SUITE: Raha real-error pairs + injected harvested domains ----
	# (dataset name, domain label) pairs; build_suite turns each one into a "real"-source
	# spec whose loader calls _raha_pair(name).
	RAHA = [("hospital", "health"), ("beers", "beverages"), ("flights", "travel"),
	("rayyan", "citations"), ("movies_1", "entertainment")]
	# diverse subset of the 20+ harvested clean domains (cached locally);
	# build_suite skips any harvested source whose domain is not listed here.
	SUITE_DOMAINS = ["restaurants", "business", "jobs", "complaints", "film", "transport",
	"education", "music", "contractors", "alcohol-bars", "vehicles",
	"sf-business", "la-business", "real-estate", "food-inspections"]
	# injected error type -> error-type group. build_suite stores the group as the spec's
	# "error_type", which is one of the keys evaluate_suite macro-averages F1 over.
	INJECT = {"typo": "canonicalization", "ocr": "canonicalization",
	"case": "format", "whitespace": "format"}


	def _raha_pair(name):
	    """Load the ``(dirty, clean)`` frames of a Raha dataset via ``_fetch``.

	    When the dirty table has more than 2200 rows, both tables are cut to their first
	    2000 rows (with a fresh 0-based index) to keep the run fast.
	    """
	    dirty, clean = _fetch(name)
	    if len(dirty) <= 2200:
	        return dirty, clean
	    # movies_1: subsample for speed
	    keep = 2000
	    dirty_head = dirty.head(keep).reset_index(drop=True)
	    clean_head = clean.head(keep).reset_index(drop=True)
	    return dirty_head, clean_head


	def _injected_pair(path, error_type, seed):
	    """Read a clean CSV and inject seeded synthetic errors into it.

	    At most the first 600 rows of ``path`` are read, all as strings, with default NA
	    parsing off and malformed lines skipped. Returns ``(dirty, clean)``, or ``None``
	    when ``inject`` returns ``None``.
	    """
	    import pandas as _pd

	    from .inject import inject

	    clean = _pd.read_csv(
	        path,
	        dtype=str,
	        keep_default_na=False,
	        nrows=600,
	        on_bad_lines="skip",
	    )
	    dirty = inject(clean, error_type, seed)
	    if dirty is None:
	        return None
	    return (dirty, clean)


	def build_suite(seed: int = 7):
	    """List of specs: {name, domain, error_type, source, load() -> (dirty, clean) | None}.

	    source 'real' = curated real-error benchmarks; 'injected' = seeded synthetic errors on
	    harvested clean domains. Report both slices — injected typos are in-distribution for
	    frequency clustering (canonical always present+dominant by construction), so the
	    grounding claim lives in the REAL slice.

	    Args:
	        seed: injection seed passed to every injected spec's loader.

	    Returns:
	        One spec per ``RAHA`` entry, followed by one spec per (cached harvested source,
	        ``INJECT`` error type) for sources whose domain is in ``SUITE_DOMAINS`` and whose
	        CSV exists under ``data/real/cache``. Loaders are lazy: nothing is read or
	        downloaded until ``spec["load"]()`` is called.

	    Note:
	        ``training/unpaired_sources.json`` and ``data/real/cache`` are resolved against
	        the current working directory.
	    """
	    import json
	    from pathlib import Path

	    # Default arguments bind the loop values at definition time (no late binding).
	    specs = [{"name": n, "domain": d, "error_type": "mixed", "source": "real",
	              "load": (lambda n=n: _raha_pair(n))} for n, d in RAHA]
	    # Context manager so the handle is closed deterministically.
	    with open("training/unpaired_sources.json", encoding="utf-8") as fh:
	        sources = json.load(fh)
	    src = {s["name"]: s["domain"] for s in sources}
	    cache = Path("data/real/cache")
	    for fname, domain in src.items():
	        if domain not in SUITE_DOMAINS:
	            continue
	        p = cache / f"{fname}.csv"
	        if not p.exists():
	            continue                     # source not harvested locally: skip it
	        for et, group in INJECT.items():
	            specs.append({"name": f"{domain}:{et}", "domain": domain, "error_type": group,
	                          "source": "injected",
	                          "load": (lambda p=p, et=et: _injected_pair(p, et, seed))})
	    return specs


	def evaluate_suite(planner, seed: int = 7) -> dict:
	    """Run a planner over the whole suite at one injection seed; return the double-macro.

	    Each spec from ``build_suite(seed)`` is loaded, cleaned with the planner's plan
	    (row-count-changing table ops stripped) and scored with ``score``. A spec whose
	    loader raises is skipped with a ``RuntimeWarning`` instead of silently, so a
	    shrinking suite is visible; a loader that returns ``None`` is skipped quietly.

	    Args:
	        planner: callable mapping a dirty DataFrame to a plan dict.
	        seed: injection seed for the synthetic-error specs.

	    Returns:
	        dict with ``north`` (harmonic mean of ``et_macro`` and ``dom_macro``),
	        ``et_macro`` / ``dom_macro`` (mean of per-error-type / per-domain mean F1),
	        ``real`` / ``injected`` (mean F1 per source), ``damage`` (mean damage rate),
	        ``abstain`` (abstain accuracy from ``abstain_slice``), ``n`` (datasets scored)
	        and ``by_et`` (mean F1 per error type).
	    """
	    import collections
	    import warnings

	    rows = []
	    for spec in build_suite(seed=seed):
	        try:
	            loaded = spec["load"]()
	        except Exception as exc:  # noqa: BLE001 - best-effort: one bad source must not abort the run
	            warnings.warn(f"skipping {spec['name']}: load failed ({exc!r})",
	                          RuntimeWarning, stacklevel=2)
	            continue
	        if loaded is None:
	            continue
	        dirty, clean = loaded
	        cleaned, _ = apply_plan(dirty, _cell_only(planner(dirty)))
	        m = score(dirty, clean, cleaned)
	        rows.append({**spec, "f1": m["f1"], "damage": m["damage"]})

	    # Group F1 three ways: by error type, by domain, by source (real vs injected).
	    by_et = collections.defaultdict(list)
	    by_dom = collections.defaultdict(list)
	    by_src = collections.defaultdict(list)
	    for r in rows:
	        by_et[r["error_type"]].append(r["f1"])
	        by_dom[r["domain"]].append(r["f1"])
	        by_src[r.get("source", "injected")].append(r["f1"])
	    et_macro = _mean(_mean(v) for v in by_et.values())
	    dom_macro = _mean(_mean(v) for v in by_dom.values())
	    north = (2 * et_macro * dom_macro / (et_macro + dom_macro)) if (et_macro + dom_macro) else 0.0
	    ab = abstain_slice(planner)
	    return {"north": north, "et_macro": et_macro, "dom_macro": dom_macro,
	            "real": _mean(by_src.get("real", [])), "injected": _mean(by_src.get("injected", [])),
	            "damage": _mean(r["damage"] for r in rows), "abstain": ab["abstain_accuracy"],
	            "n": len(rows), "by_et": {k: _mean(v) for k, v in by_et.items()}}


	def main(seeds=(7, 17, 27), out: str | None = None) -> None:
	    """Evaluate every system over the suite at each seed and print a comparison table.

	    Args:
	        seeds: injection seeds; each system is evaluated once per seed. Must be
	            non-empty.
	        out: optional path; when given, the per-system result rows are written there
	            as JSON.

	    Raises:
	        ValueError: if ``seeds`` is empty (there would be nothing to average).
	    """
	    from scrubdata.baselines import openrefine_fingerprint_plan, openrefine_knn_plan

	    seeds = tuple(seeds)
	    if not seeds:
	        raise ValueError("seeds must be non-empty")

	    systems = [("grounded (ours)", mock_plan),
	               ("OpenRefine fingerprint", openrefine_fingerprint_plan),
	               ("OpenRefine kNN", openrefine_knn_plan),
	               ("no-op", lambda df: {"table_operations": [], "columns": [], "flags": []})]
	    print("\n=== Cleaning north-star — WIDE validation suite (Raha + injected harvested), "
	          f"{len(seeds)} seeds ===\n")
	    print(f"{'system':<24}{'NORTH*':>9}{'±95%CI':>9}{'REAL-F1':>9}{'INJ-F1':>8}"
	          f"{'damage':>8}{'abstain':>9}")
	    print("-" * 76)
	    table = []
	    for name, planner in systems:
	        norths, results = [], []
	        for s in seeds:
	            r = evaluate_suite(planner, seed=s)
	            norths.append(r["north"])
	            results.append(r)
	        mean_n = _mean(norths)
	        # 95% CI ~ 1.96 * std / sqrt(n)   (population std over the seeds)
	        var = _mean([(x - mean_n) ** 2 for x in norths])
	        ci = 1.96 * (var ** 0.5) / (len(norths) ** 0.5)
	        last = results[-1]
	        row = {"system": name, "north": mean_n, "north_ci": ci,
	               "real_f1": _mean(r["real"] for r in results),
	               "real_f1_per_seed": [r["real"] for r in results],
	               "inj_f1": _mean(r["injected"] for r in results),
	               "damage": _mean(r["damage"] for r in results),
	               "abstain": _mean(r["abstain"] for r in results),
	               "n_datasets": last["n"], "seeds": list(seeds)}
	        table.append(row)
	        # flush per row: each system takes a while, so show progress as it lands
	        print(f"{name:<24}{mean_n:>9.3f}{ci:>9.3f}{row['real_f1']:>9.3f}"
	              f"{row['inj_f1']:>8.3f}{row['damage']:>8.3f}{row['abstain']:>9.3f}", flush=True)
	    if out:
	        import json

	        # Context manager so the JSON is flushed and the handle closed.
	        with open(out, "w", encoding="utf-8") as fh:
	            json.dump(table, fh, indent=1)
	        print(f"rows written to {out}")
	    # ``last`` is the final system's final-seed result; only its dataset count is used.
	    print(f"\nNORTH* = harmonic mean(error-type macro, domain macro) over {last['n']} datasets. "
	          "Double-macro + damage + abstain = un-gameable; hospital is 1 dataset of many. "
	          "The money result: grounded vs the tool people actually use (OpenRefine).")


	if __name__ == "__main__":
	    # CLI entry point: the only option is --out, a path for the JSON result rows.
	    import argparse

	    parser = argparse.ArgumentParser()
	    parser.add_argument("--out", type=str, default=None, help="write table rows to JSON")
	    cli_args = parser.parse_args()
	    main(out=cli_args.out)