Spaces:

build-small-hackathon
/

scrubdata

Running

OpenAI Codex

deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build

16dc556 16 days ago

4.44 kB

	"""Layer-2 eval: our pipeline on a REAL dirty/clean dataset (out-of-distribution).

	Uses Raha's `hospital` (1000×20, ~2.5% cells are char-substitution typos), row-aligned
	dirty/clean. Reports the Raha-style repair protocol — the right metric when the data is
	already mostly correct — plus overall recovery.

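	Run from the repo root as a module (the relative `.metrics` import needs a package
	context, so invoking the file path directly, `uv run eval/run_real.py`, raises ImportError):

	uv run python -m eval.run_real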

	Metrics (per system, vs the clean reference):
	  recovery       fraction of cells matching clean (tolerant of pure type-coercion)
	  repair_recall  corrected errors / total errors (did we FIX the errors?)
	  repair_prec    corrected errors / cells we changed (did we avoid BREAKING good cells?)
	  broken         count of good cells we changed to a wrong value (lower is better)
	"""

	from __future__ import annotations

	from pathlib import Path

	import pandas as pd

	from scrubdata.executor import apply_plan
	from scrubdata.planner import mock_plan

	from .metrics import _cell_equal

	# Local cache directory for the benchmark: <two levels above this file>/data/real/hospital.
	BASE = Path(__file__).resolve().parents[1].joinpath("data", "real", "hospital")

	# File name inside BASE -> URL it is downloaded from when missing locally.
	URLS = {
	    "dirty.csv": "https://raw.githubusercontent.com/BigDaMa/raha/master/datasets/hospital/dirty.csv",
	    "clean.csv": "https://raw.githubusercontent.com/BigDaMa/raha/master/datasets/hospital/clean.csv",
	}


	def _ensure_data() -> None:
	    """Download every file listed in URLS into BASE unless it is already there.

	    Each file is fetched to a temporary ``<name>.part`` sibling and renamed onto
	    its final path only after the download finished, so a failed or interrupted
	    transfer never leaves a truncated CSV that a later run would treat as cached.

	    Raises:
	        Whatever ``urllib.request.urlretrieve`` raises on a failed download.
	    """
	    import urllib.request

	    BASE.mkdir(parents=True, exist_ok=True)
	    for fn, url in URLS.items():
	        target = BASE / fn
	        if target.exists():
	            continue
	        partial = target.with_name(target.name + ".part")
	        try:
	            urllib.request.urlretrieve(url, partial)
	            # Same-directory rename: the final name appears only once complete.
	            partial.replace(target)
	        finally:
	            # No-op after a successful rename; removes the leftover on failure.
	            partial.unlink(missing_ok=True)


	def _load():
	    """Read the cached benchmark and return ``(dirty, clean)`` DataFrames.

	    Both CSVs are read with every column as ``str`` and with pandas' default
	    NA markers disabled (``keep_default_na=False``).
	    """
	    read_opts = {"dtype": str, "keep_default_na": False}
	    dirty_df = pd.read_csv(BASE / "dirty.csv", **read_opts)
	    clean_df = pd.read_csv(BASE / "clean.csv", **read_opts)
	    return dirty_df, clean_df


	def _score(dirty: pd.DataFrame, clean: pd.DataFrame, out: pd.DataFrame) -> dict:
	    """Compare a system output `out` to `clean`, by position, vs the `dirty` input.

	    Rows are matched by position over the first ``min(len(dirty), len(out),
	    len(clean))`` rows. Columns follow ``dirty``'s order: ``clean`` is read by
	    column position, ``out`` by column name. A column missing from ``out``
	    still counts toward the totals but can never be recovered, fixed or changed.

	    Args:
	        dirty: the input handed to the system under test.
	        clean: the ground-truth reference, column-aligned with ``dirty``.
	        out: the system's output.

	    Returns:
	        A dict with:
	          recovery       cells of ``out`` equal to ``clean`` / all cells
	          repair_recall  fixed error cells / error cells
	          repair_prec    fixed error cells / cells the system changed
	          broken         changed cells that were correct in ``dirty`` and are
	                         no longer equal to ``clean``
	          _errors, _changed, _fixed  the raw counts behind the ratios
	        Every ratio is 0.0 when its denominator is zero.
	    """
	    nrows = min(len(dirty), len(out), len(clean))
	    total = errors = fixed = changed = broken = recovered = 0
	    for j, col in enumerate(dirty.columns):
	        present = col in out.columns
	        # Resolve the output column once per column, so each cell is a direct
	        # positional read instead of building a whole row Series per cell.
	        out_pos = out.columns.get_loc(col) if present else None
	        for i in range(nrows):
	            total += 1
	            dv, cv = dirty.iat[i, j], clean.iat[i, j]
	            ov = out.iat[i, out_pos] if present else None
	            is_err = not _cell_equal(dv, cv)
	            is_changed = present and not _cell_equal(ov, dv)
	            ok = present and _cell_equal(ov, cv)
	            if ok:
	                recovered += 1
	            if is_err:
	                errors += 1
	                if ok:
	                    fixed += 1
	            if is_changed:
	                changed += 1
	                if not is_err and not ok:  # we changed a good cell into a wrong one
	                    broken += 1
	    return {
	        # Guarded like the other ratios: an empty comparison must not raise.
	        "recovery": recovered / total if total else 0.0,
	        "repair_recall": fixed / errors if errors else 0.0,
	        "repair_prec": fixed / changed if changed else 0.0,
	        "broken": broken,
	        "_errors": errors, "_changed": changed, "_fixed": fixed,
	    }


	def main() -> None:
	    """Fetch the benchmark if needed, score both systems and print the report.

	    Two systems are scored against the clean reference: a no-op that returns
	    the dirty frame unchanged, and the heuristic pipeline (``mock_plan``
	    executed by ``apply_plan``, of whose result only the first element is used).
	    """
	    _ensure_data()
	    dirty, clean = _load()

	    # Baseline first: the dirty frame scored against itself as "output".
	    noop = _score(dirty, clean, dirty)
	    h_plan = mock_plan(dirty)
	    cleaned, _ = apply_plan(dirty, h_plan)
	    heur = _score(dirty, clean, cleaned)

	    n_rows, n_cols = dirty.shape
	    print(f"\nLayer-2 real-data eval — Raha hospital ({n_rows}×{n_cols}, "
	          f"{noop['_errors']} error cells)\n")

	    # Fixed-width table: a 22-char system label, then one 14-char cell per metric.
	    shown = ["recovery", "repair_recall", "repair_prec", "broken"]
	    header = f"{'system':<22}" + "".join(f"{key:>14}" for key in shown)
	    print(header)
	    print("-" * (22 + 14 * len(shown)))
	    for label, metrics in (("NO-OP (dirty as-is)", noop), ("HEURISTIC (baseline)", heur)):
	        cells = []
	        for key in shown:
	            value = metrics[key]
	            # Ratios are floats (3 decimals); `broken` is an integer count.
	            if isinstance(value, float):
	                cells.append(f"{value:>14.3f}")
	            else:
	                cells.append(f"{value:>14}")
	        print(f"{label:<22}" + "".join(cells))

	    print(f"\nHeuristic changed {heur['_changed']} cells, fixed {heur['_fixed']} errors, "
	          f"diverged-from-convention on {heur['broken']}.")
	    print("HEADLINE METRIC = repair_recall (did we FIX the typo errors?). The errors here "
	          "are char-substitution typos, fixable by cluster-canonicalization (the model's "
	          "job; the rule heuristic scores 0).")
	    print("NOTE: 'broken' here is mostly CONVENTION DIVERGENCE, not error — our tool parses "
	          "'100%'->1.0 and reformats phones, which this benchmark stores as raw text. That "
	          "is product value, not a mistake; it's why raw recovery understates a standardizer.")

	# Script entry point: run the evaluation only when executed, not when imported.
	# NOTE(review): this module uses a relative import (`from .metrics import ...`),
	# which presumably requires running it inside its package (e.g. `python -m`)
	# rather than as a bare script; confirm the intended invocation.
	if __name__ == "__main__":
	    main()