Spaces:

build-small-hackathon
/

scrubdata

Running

scrubdata / eval /inject_validity.py

OpenAI Codex

deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build

16dc556 11 days ago

14.3 kB

	"""W4.5 inject-validity (TableEG-style) — does the injected slice LOOK like and RANK
	like the real slice?

	(1) Classifies every real dirty->gold cell error (hospital's 509 + all 42 paired
	sources eval/paired_bench.py walks) with a deterministic taxonomy (typo/edit-dist<=2,
	case-only, whitespace, encoding/mojibake, numeric, date-format, token-swap, missing,
	other); (2) classifies the suite's INJECTED errors at the money-table seeds (7/17/27);
	(3) reports Jensen-Shannon divergence (base 2) between injected and real type
	distributions, pooled and per real source; (4) reports Kendall tau-b between system
	rankings on the injected vs real F1 slices of money_table_head.json, with degenerate
	policies (abstain-all / random-edit / oracle) run through the same suite as anchors.
	Honesty rule: if the injector is far from real (high JSD), that IS the result — the
	paper's mitigation (both slices reported separately) already stands.

	uv run python -m eval.inject_validity # full run (~15 min CPU)
	uv run python -m eval.inject_validity --tex-only # rebuild the snippet from JSON
	Writes eval/results/inject_validity.json + eval/results/inject_validity_appendix.tex.
	"""

	from __future__ import annotations

	import collections
	import json
	import math
	import time
	from datetime import datetime
	from pathlib import Path

	from .degenerate import _abstain_all, _oracle, _random_edit
	from .metrics import _cell_equal
	from .paired_bench import _load, pairs
	from .run_real_multi import build_suite, score

	# Two directory levels above this file; main() and the CLI entry point read and
	# write under ROOT / "eval" / "results".
	ROOT = Path(__file__).resolve().parent.parent
	SEEDS = (7, 17, 27) # money-table seeds (run_real_multi.main)
	# Taxonomy labels that classify() can return. The order is also the row order of
	# the LaTeX table and the key order of every distribution built by _dist().
	CATS = ["typo", "case", "whitespace", "encoding", "numeric", "date-format",
	"token-swap", "missing", "other"]
	# Injector name -> taxonomy label counted as "agreement" in main(). The injector
	# name is the second ":"-separated field of a suite spec's "name".
	EXPECT = {"typo": "typo", "ocr": "typo", "case": "case", "whitespace": "whitespace"}
	# Marker substrings for the "encoding" class: classify() returns "encoding" when
	# exactly one of the two cells contains any of them. Keep these literals as-is.
	_MOJI = ("�", "Ã", "Â", "â€", "ï¿")
	# strptime formats tried in this order by _date(); the first one that parses wins,
	# so for ambiguous text such as 01/02/2020 month-first takes precedence.
	_DATE_FMTS = ("%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y", "%m/%d/%y", "%Y/%m/%d",
	"%d-%m-%Y", "%b %d, %Y", "%B %d, %Y", "%d %b %Y", "%Y%m%d")


	def _num(s: str):
	t = s.strip().replace(",", "").lstrip("$").rstrip("%")
	try:
	return float(t)
	except ValueError:
	return None


	def _date(s: str):
	    """Return the ``date`` parsed from *s*, or ``None`` if no format matches.

	    The stripped text is tried against each entry of ``_DATE_FMTS`` in order;
	    the first format that parses decides the result.
	    """
	    text = s.strip()
	    for fmt in _DATE_FMTS:
	        try:
	            parsed = datetime.strptime(text, fmt)
	        except ValueError:
	            continue
	        return parsed.date()
	    return None


	def _lev_gt2(a: str, b: str) -> bool:
	"""True iff Levenshtein(a, b) > 2 (banded DP, O(len*5))."""
	k = 2
	la, lb = len(a), len(b)
	if abs(la - lb) > k:
	return True
	INF = k + 1
	prev = [min(j, INF) for j in range(lb + 1)]
	for i in range(1, la + 1):
	lo, hi = max(1, i - k), min(lb, i + k)
	cur = [INF] * (lb + 1)
	if i <= k:
	cur[0] = i
	for j in range(lo, hi + 1):
	cur[j] = min(prev[j] + 1, cur[j - 1] + 1,
	prev[j - 1] + (a[i - 1] != b[j - 1]), INF)
	prev = cur
	if min(prev[max(0, lo - 1):hi + 1]) >= INF:
	return True
	return prev[lb] > k


	def classify(d, g) -> str:
	    """Deterministic error type for one (dirty, gold) cell pair.

	    The checks run in a fixed order and the first match wins: surface classes
	    first (missing, whitespace, case, encoding), then value classes (numeric,
	    date-format, token-swap), edit distance last.

	    ``None`` and float NaN are treated as empty cells. Their ``str()`` forms
	    ("None", "nan") are ordinary text and would otherwise skip the "missing"
	    class and be labelled typo/numeric/other.

	    Returns one of the labels listed in ``CATS``.
	    """
	    def _text(v) -> str:
	        # NaN is the only float that is not equal to itself.
	        if v is None or (isinstance(v, float) and v != v):
	            return ""
	        return str(v)

	    ds, gs = _text(d), _text(g)
	    if not ds.strip() or not gs.strip():
	        return "missing"
	    # Compare with all whitespace removed: equal -> only spacing differs.
	    d_squeezed, g_squeezed = "".join(ds.split()), "".join(gs.split())
	    if d_squeezed == g_squeezed:
	        return "whitespace"
	    if d_squeezed.casefold() == g_squeezed.casefold():
	        return "case"
	    # Mojibake markers on exactly one side.
	    if any(m in ds for m in _MOJI) != any(m in gs for m in _MOJI):
	        return "encoding"
	    if _num(ds) is not None and _num(gs) is not None:
	        return "numeric"
	    dd, gd = _date(ds), _date(gs)
	    # Same calendar date, different spelling.
	    if dd is not None and dd == gd:
	        return "date-format"
	    dt, gt = sorted(ds.casefold().split()), sorted(gs.casefold().split())
	    # Same multiset of tokens; a single token cannot be "swapped".
	    if dt == gt and len(dt) > 1:
	        return "token-swap"
	    if not _lev_gt2(ds.strip(), gs.strip()):
	        return "typo"
	    return "other"


	def _classify_pair(dirty, clean) -> collections.Counter:
	    """Count error types over every cell where *dirty* and *clean* disagree.

	    Cells are visited column by column over the columns of *dirty* and the
	    rows both tables have; a cell is an error when ``_cell_equal`` says the
	    two values differ. Both arguments are accessed through ``.shape`` and
	    ``.iat`` (presumably pandas DataFrames -- confirm against the loaders).
	    """
	    shared_rows = range(min(len(dirty), len(clean)))
	    cell_pairs = ((dirty.iat[row, col], clean.iat[row, col])
	                  for col in range(dirty.shape[1])
	                  for row in shared_rows)
	    return collections.Counter(
	        classify(bad, good)
	        for bad, good in cell_pairs
	        if not _cell_equal(bad, good))


	def _jsd(p: dict, q: dict) -> float:
	"""Jensen-Shannon divergence, base 2 (0 = identical, 1 = disjoint)."""
	sp, sq = sum(p.values()), sum(q.values())
	out = 0.0
	for k in set(p) \| set(q):
	a, b = p.get(k, 0) / sp, q.get(k, 0) / sq
	m = (a + b) / 2
	if a:
	out += 0.5 * a * math.log2(a / m)
	if b:
	out += 0.5 * b * math.log2(b / m)
	return out


	def _tau_b(xs, ys) -> float:
	"""Kendall tau-b (tie-corrected); n is small, O(n^2) is fine."""
	n0 = nc = nd = tx = ty = 0
	for i in range(len(xs)):
	for j in range(i + 1, len(xs)):
	n0 += 1
	a, b = xs[i] - xs[j], ys[i] - ys[j]
	tx += a == 0
	ty += b == 0
	nc += a * b > 0
	nd += a * b < 0
	den = ((n0 - tx) * (n0 - ty)) ** 0.5
	return (nc - nd) / den if den else 0.0


	def _dist(counter) -> dict:
	    """Turn raw counts into shares per taxonomy label, rounded to 4 places.

	    The result has one entry for every label in ``CATS`` (0.0 for labels that
	    never occurred), or is empty when *counter* holds no counts at all.
	    """
	    total = sum(counter.values())
	    if not total:
	        return {}
	    return {label: round(counter.get(label, 0) / total, 4) for label in CATS}


	def _suite_slices(cleaner) -> tuple[float, float]:
	    """Score a degenerate ``cleaner(dirty, clean) -> out`` on both suite slices.

	    Returns ``(real_f1, injected_f1)``: the mean F1 over the suite entries
	    whose source is "real" (suite built with the first seed), and the mean
	    over ``SEEDS`` of the per-seed mean F1 over the "injected" entries.
	    Injected entries whose loader returns ``None`` are skipped.
	    """
	    def f1_of(loaded) -> float:
	        dirty, clean = loaded
	        return score(dirty, clean, cleaner(dirty, clean))["f1"]

	    real_scores = [f1_of(spec["load"]())
	                   for spec in build_suite(seed=SEEDS[0])
	                   if spec["source"] == "real"]
	    seed_means = []
	    for seed in SEEDS:
	        tables = (spec["load"]()
	                  for spec in build_suite(seed=seed)
	                  if spec["source"] == "injected")
	        seed_scores = [f1_of(loaded) for loaded in tables if loaded is not None]
	        seed_means.append(sum(seed_scores) / len(seed_scores))
	    return sum(real_scores) / len(real_scores), sum(seed_means) / len(seed_means)


	def _write_tex(out: dict, res: Path) -> None:
	    """Render the appendix LaTeX snippet from a results dict.

	    *out* is the dict ``main`` builds, or the same structure reloaded from
	    ``inject_validity.json``. The snippet is written as UTF-8 to
	    ``res / "inject_validity_appendix.tex"``.

	    The number of paired sources is taken from ``out["real"]["n_sources"]``
	    (42 when the key is absent), so the sentence stays true when a source
	    failed to load. Error classes missing from a pooled distribution print
	    as 0.000, matching the table rows.
	    """
	    real, inj = out["real"], out["injected"]
	    rd, jd = real["pooled_dist"], inj["pooled_dist"]
	    j, rk = out["jsd"], out["ranking"]
	    n_sources = real.get("n_sources", 42)

	    def thousands(n) -> str:
	        # 12,345 -> 12{,}345: the braces stop math mode from spacing the comma.
	        return f"{n:,}".replace(",", r"{,}")

	    def share(dist: dict, cat: str) -> str:
	        # A pooled distribution may be empty or lack a class; print that as 0.
	        return f"{dist.get(cat, 0):.3f}"

	    # NOTE(review): the closing sentences (four cross-system rows, OpenRefine
	    # ordering) are fixed text and are not derived from `out`.
	    lines = [r"% Auto-generated by eval/inject_validity.py — do not edit by hand.",
	             r"\subsection{Validity of the Injected Slice}\label{app:inject-validity}",
	             r"Following the TableEG-style audit, we classify every error cell (dirty vs.\ gold)",
	             r"with a deterministic taxonomy and compare the suite's injected errors (money-table",
	             r"seeds " + "/".join(map(str, out["seeds"])) + r", $n=" +
	             thousands(inj["n"]) + r"$) against the $" +
	             thousands(real["n"]) +
	             r"$ real errors across the " + f"{n_sources}" + r" paired sources (hospital's " +
	             f"{real['hospital_n']}" + r" included).",
	             r"\begin{table}[t]\centering\small",
	             r"\caption{Error-type distributions, real vs.\ injected (pooled).}",
	             r"\label{tab:inject-validity}",
	             r"\begin{tabular}{lrr}\toprule",
	             r"error type & real & injected \\ \midrule"]
	    for c in CATS:
	        lines.append(f"{c} & {share(rd, c)} & {share(jd, c)} " + r"\\")
	    lines += [r"\bottomrule\end{tabular}\end{table}",
	              r"The injector covers only the recoverable surface classes it targets by design",
	              r"(typo/case/whitespace; injector--taxonomy agreement " +
	              f"{inj['injector_taxonomy_agreement']:.3f}" + r"), whereas real errors",
	              r"are dominated by substitutions beyond edit distance~2 (other, " +
	              share(rd, "other") + r") and short typos (" + share(rd, "typo") +
	              r"), with numeric (" + share(rd, "numeric") + r"), missing-value (" +
	              share(rd, "missing") + r"), and encoding classes the injector never produces.",
	              r"Pooled Jensen--Shannon divergence is " + f"{j['pooled']:.3f}" +
	              r"~bits (per-source median " + f"{j['median']:.3f}" + r", range " +
	              f"{j['min']:.3f}" + r"--" + f"{j['max']:.3f}" + r"; hospital " +
	              f"{j['hospital_vs_injected']:.3f}" + r"): the two slices are \emph{not}",
	              r"interchangeable, which is why the paper reports them separately and localizes",
	              r"the grounding claim in the real slice. Ranking preservation is partial: Kendall",
	              r"$\tau_b$ between system rankings on the injected vs.\ real F1 slices is $" +
	              f"{rk['kendall_tau_b_money_table']:.2f}" + r"$ over the four cross-system rows and $" +
	              f"{rk['kendall_tau_b_with_anchors']:.2f}" + r"$ with the degenerate anchors",
	              r"(abstain-all, random-edit, oracle) included. The injected slice preserves the",
	              r"floor/ceiling ordering but ranks OpenRefine fingerprint above both our system",
	              r"and OpenRefine kNN, the reverse of the real slice --- frequency clustering looks",
	              r"strong exactly where the canonical form is present and dominant by construction.",
	              r"Injected-only evaluation would therefore overstate frequency-clustering",
	              r"baselines."]
	    # The header line holds a non-ASCII dash; pin the encoding so the file does
	    # not depend on the platform default.
	    (res / "inject_validity_appendix.tex").write_text("\n".join(lines) + "\n",
	                                                      encoding="utf-8")


	def main() -> None:
	    """Run the full audit and write the JSON results plus the LaTeX snippet.

	    Steps: (1) classify the error cells of every paired source that loads,
	    (2) classify the injected errors of the suite at each seed in ``SEEDS``,
	    (3) compute Jensen-Shannon divergence pooled and per real source,
	    (4) compute Kendall tau-b between the real and injected F1 columns of
	    ``money_table_head.json``, with and without the degenerate anchors.
	    Writes ``inject_validity.json`` and, via ``_write_tex``, the appendix
	    snippet under ``ROOT / "eval" / "results"``.

	    The per-source JSD "median" is the true median (mean of the two middle
	    values for an even number of sources), not the upper-middle element.
	    """
	    import statistics  # needed only for the per-source median below

	    t0 = time.perf_counter()
	    # (1) real errors: all 42 paired sources (hospital included -> its 509)
	    real_per: dict[str, collections.Counter] = {}
	    for p in pairs():
	        try:
	            dirty, clean = _load(p)
	        except Exception as e:  # noqa: BLE001
	            # Best effort: one unreadable source must not abort the whole run.
	            print(f" {p.name}: LOAD FAILED {type(e).__name__}")
	            continue
	        real_per[p.name] = _classify_pair(dirty, clean)
	        print(f" real {p.name:<46} n={sum(real_per[p.name].values())}", flush=True)
	    real_pool = sum(real_per.values(), collections.Counter())
	    t_real = time.perf_counter() - t0

	    # (2) injected errors at the money-table seeds, via the SAME suite generator
	    inj_pool = collections.Counter()
	    inj_per_injector: dict[str, collections.Counter] = collections.defaultdict(collections.Counter)
	    inj_per_seed = {}
	    for s in SEEDS:
	        cs = collections.Counter()
	        for spec in build_suite(seed=s):
	            if spec["source"] != "injected":
	                continue
	            loaded = spec["load"]()
	            if loaded is None:
	                continue
	            dirty, clean = loaded
	            c = _classify_pair(dirty, clean)
	            cs += c
	            # The injector name is the second ":"-separated field of the spec name.
	            inj_per_injector[spec["name"].split(":")[1]] += c
	        inj_per_seed[s] = sum(cs.values())
	        inj_pool += cs
	        print(f" injected seed={s} n={inj_per_seed[s]}", flush=True)
	    # Cells whose taxonomy label matches what their injector is expected to produce.
	    agree = sum(inj_per_injector[et][want] for et, want in EXPECT.items())
	    t_inj = time.perf_counter() - t0 - t_real

	    # (3) distribution similarity (sources without any error cell are skipped)
	    jsd_per_source = {k: round(_jsd(real_per[k], inj_pool), 4)
	                      for k in sorted(real_per) if real_per[k]}
	    jsd_vals = sorted(jsd_per_source.values())
	    # (4) ranking preservation: money-table systems + degenerate anchors
	    with open(ROOT / "eval" / "results" / "money_table_head.json",
	              encoding="utf-8") as fh:
	        money = json.load(fh)
	    systems = [{"system": r["system"], "real_f1": r["real_f1"], "inj_f1": r["inj_f1"],
	                "anchor": False} for r in money]
	    for name, fn in [("abstain-all", _abstain_all), ("random-edit", _random_edit),
	                     ("oracle", _oracle)]:
	        rf, jf = _suite_slices(fn)
	        systems.append({"system": name, "real_f1": rf, "inj_f1": jf, "anchor": True})
	        print(f" anchor {name:<12} real={rf:.3f} inj={jf:.3f}", flush=True)
	    tau_money = _tau_b([s["real_f1"] for s in systems if not s["anchor"]],
	                       [s["inj_f1"] for s in systems if not s["anchor"]])
	    tau_all = _tau_b([s["real_f1"] for s in systems], [s["inj_f1"] for s in systems])

	    out = {
	        "taxonomy": CATS, "seeds": list(SEEDS),
	        "real": {"n": sum(real_pool.values()), "n_sources": len(real_per),
	                 "hospital_n": sum(real_per.get("hospital", {}).values()),
	                 "pooled_counts": dict(real_pool), "pooled_dist": _dist(real_pool),
	                 "per_source": {k: {"n": sum(v.values()), "dist": _dist(v)}
	                                for k, v in sorted(real_per.items())}},
	        "injected": {"n": sum(inj_pool.values()), "per_seed_n": inj_per_seed,
	                     "pooled_counts": dict(inj_pool), "pooled_dist": _dist(inj_pool),
	                     "per_injector_dist": {k: _dist(v)
	                                           for k, v in sorted(inj_per_injector.items())},
	                     "injector_taxonomy_agreement": round(agree / sum(inj_pool.values()), 4)},
	        "jsd": {"pooled": round(_jsd(real_pool, inj_pool), 4),
	                "hospital_vs_injected": round(_jsd(real_per["hospital"], inj_pool), 4),
	                "per_real_source_vs_injected": jsd_per_source,
	                "min": jsd_vals[0],
	                # vals[len // 2] is only the median for an odd count.
	                "median": round(statistics.median(jsd_vals), 4),
	                "max": jsd_vals[-1]},
	        "ranking": {"systems": systems,
	                    "kendall_tau_b_money_table": round(tau_money, 4),
	                    "kendall_tau_b_with_anchors": round(tau_all, 4)},
	        "sec": {"real_classify": round(t_real, 1), "injected_classify": round(t_inj, 1),
	                "total": round(time.perf_counter() - t0, 1)},
	    }
	    res = ROOT / "eval" / "results"
	    # Close (and thereby flush) the JSON before anything else reads it.
	    with open(res / "inject_validity.json", "w", encoding="utf-8") as fh:
	        json.dump(out, fh, indent=1)
	    _write_tex(out, res)
	    print(f"JSD pooled={out['jsd']['pooled']} tau(money)={tau_money:.3f} "
	          f"tau(+anchors)={tau_all:.3f} -> {res / 'inject_validity.json'} "
	          f"+ inject_validity_appendix.tex ({out['sec']['total']}s)")


	# CLI entry point: a full run by default, or only the LaTeX snippet rebuilt
	# from the JSON of an earlier run when --tex-only is given.
	if __name__ == "__main__":
	    import argparse
	    ap = argparse.ArgumentParser()
	    ap.add_argument("--tex-only", action="store_true",
	                    help="rebuild the LaTeX snippet from the existing JSON")
	    if ap.parse_args().tex_only:
	        res = ROOT / "eval" / "results"
	        # Read through a context manager so the handle is closed before writing.
	        with open(res / "inject_validity.json", encoding="utf-8") as fh:
	            saved = json.load(fh)
	        _write_tex(saved, res)
	        print(f"-> {res / 'inject_validity_appendix.tex'}")
	    else:
	        main()