"""W4.5 inject-validity (TableEG-style) — does the injected slice LOOK like and RANK like the real slice? (1) Classifies every real dirty->gold cell error (hospital's 509 + all 42 paired sources eval/paired_bench.py walks) with a deterministic taxonomy (typo/edit-dist<=2, case-only, whitespace, encoding/mojibake, numeric, date-format, token-swap, missing, other); (2) classifies the suite's INJECTED errors at the money-table seeds (7/17/27); (3) reports Jensen-Shannon divergence (base 2) between injected and real type distributions, pooled and per real source; (4) reports Kendall tau-b between system rankings on the injected vs real F1 slices of money_table_head.json, with degenerate policies (abstain-all / random-edit / oracle) run through the same suite as anchors. Honesty rule: if the injector is far from real (high JSD), that IS the result — the paper's mitigation (both slices reported separately) already stands. uv run python -m eval.inject_validity # full run (~15 min CPU) uv run python -m eval.inject_validity --tex-only # rebuild the snippet from JSON Writes eval/results/inject_validity.json + eval/results/inject_validity_appendix.tex. """ from __future__ import annotations import collections import json import math import time from datetime import datetime from pathlib import Path from .degenerate import _abstain_all, _oracle, _random_edit from .metrics import _cell_equal from .paired_bench import _load, pairs from .run_real_multi import build_suite, score ROOT = Path(__file__).resolve().parent.parent SEEDS = (7, 17, 27) # money-table seeds (run_real_multi.main) CATS = ["typo", "case", "whitespace", "encoding", "numeric", "date-format", "token-swap", "missing", "other"] EXPECT = {"typo": "typo", "ocr": "typo", "case": "case", "whitespace": "whitespace"} _MOJI = ("�", "Ã", "Â", "â€", "ï¿") _DATE_FMTS = ("%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y", "%m/%d/%y", "%Y/%m/%d", "%d-%m-%Y", "%b %d, %Y", "%B %d, %Y", "%d %b %Y", "%Y%m%d") def _num(s: str): t = s.strip().replace(",", "").lstrip("$").rstrip("%") try: return float(t) except ValueError: return None def _date(s: str): for f in _DATE_FMTS: try: return datetime.strptime(s.strip(), f).date() except ValueError: pass return None def _lev_gt2(a: str, b: str) -> bool: """True iff Levenshtein(a, b) > 2 (banded DP, O(len*5)).""" k = 2 la, lb = len(a), len(b) if abs(la - lb) > k: return True INF = k + 1 prev = [min(j, INF) for j in range(lb + 1)] for i in range(1, la + 1): lo, hi = max(1, i - k), min(lb, i + k) cur = [INF] * (lb + 1) if i <= k: cur[0] = i for j in range(lo, hi + 1): cur[j] = min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (a[i - 1] != b[j - 1]), INF) prev = cur if min(prev[max(0, lo - 1):hi + 1]) >= INF: return True return prev[lb] > k def classify(d, g) -> str: """Deterministic error type from (dirty, gold) cell pair. Order matters: surface classes first, then value classes, edit-distance last.""" ds, gs = str(d), str(g) if not ds.strip() or not gs.strip(): return "missing" if "".join(ds.split()) == "".join(gs.split()): return "whitespace" if "".join(ds.split()).casefold() == "".join(gs.split()).casefold(): return "case" if any(m in ds for m in _MOJI) != any(m in gs for m in _MOJI): return "encoding" if _num(ds) is not None and _num(gs) is not None: return "numeric" dd, gd = _date(ds), _date(gs) if dd is not None and dd == gd: return "date-format" dt, gt = sorted(ds.casefold().split()), sorted(gs.casefold().split()) if dt == gt and len(dt) > 1: return "token-swap" if not _lev_gt2(ds.strip(), gs.strip()): return "typo" return "other" def _classify_pair(dirty, clean) -> collections.Counter: n = min(len(dirty), len(clean)) c = collections.Counter() for j in range(dirty.shape[1]): for i in range(n): dv, cv = dirty.iat[i, j], clean.iat[i, j] if not _cell_equal(dv, cv): c[classify(dv, cv)] += 1 return c def _jsd(p: dict, q: dict) -> float: """Jensen-Shannon divergence, base 2 (0 = identical, 1 = disjoint).""" sp, sq = sum(p.values()), sum(q.values()) out = 0.0 for k in set(p) | set(q): a, b = p.get(k, 0) / sp, q.get(k, 0) / sq m = (a + b) / 2 if a: out += 0.5 * a * math.log2(a / m) if b: out += 0.5 * b * math.log2(b / m) return out def _tau_b(xs, ys) -> float: """Kendall tau-b (tie-corrected); n is small, O(n^2) is fine.""" n0 = nc = nd = tx = ty = 0 for i in range(len(xs)): for j in range(i + 1, len(xs)): n0 += 1 a, b = xs[i] - xs[j], ys[i] - ys[j] tx += a == 0 ty += b == 0 nc += a * b > 0 nd += a * b < 0 den = ((n0 - tx) * (n0 - ty)) ** 0.5 return (nc - nd) / den if den else 0.0 def _dist(counter) -> dict: tot = sum(counter.values()) return {k: round(counter.get(k, 0) / tot, 4) for k in CATS} if tot else {} def _suite_slices(cleaner) -> tuple[float, float]: """(real-slice mean F1, injected-slice mean F1 over SEEDS) for a degenerate cleaner(dirty, clean) -> out, mirroring run_real_multi's by-source means.""" real = [] for spec in build_suite(seed=SEEDS[0]): if spec["source"] != "real": continue dirty, clean = spec["load"]() real.append(score(dirty, clean, cleaner(dirty, clean))["f1"]) inj = [] for s in SEEDS: fs = [] for spec in build_suite(seed=s): if spec["source"] != "injected": continue loaded = spec["load"]() if loaded is None: continue dirty, clean = loaded fs.append(score(dirty, clean, cleaner(dirty, clean))["f1"]) inj.append(sum(fs) / len(fs)) return sum(real) / len(real), sum(inj) / len(inj) def _write_tex(out: dict, res: Path) -> None: rd, jd = out["real"]["pooled_dist"], out["injected"]["pooled_dist"] j, rk = out["jsd"], out["ranking"] L = [r"% Auto-generated by eval/inject_validity.py — do not edit by hand.", r"\subsection{Validity of the Injected Slice}\label{app:inject-validity}", r"Following the TableEG-style audit, we classify every error cell (dirty vs.\ gold)", r"with a deterministic taxonomy and compare the suite's injected errors (money-table", r"seeds " + "/".join(map(str, out["seeds"])) + r", $n=" + f"{out['injected']['n']:,}".replace(",", r"{,}") + r"$) against the $" + f"{out['real']['n']:,}".replace(",", r"{,}") + r"$ real errors across the 42 paired sources (hospital's " + f"{out['real']['hospital_n']}" + r" included).", r"\begin{table}[t]\centering\small", r"\caption{Error-type distributions, real vs.\ injected (pooled).}", r"\label{tab:inject-validity}", r"\begin{tabular}{lrr}\toprule", r"error type & real & injected \\ \midrule"] for c in CATS: L.append(f"{c} & {rd.get(c, 0):.3f} & {jd.get(c, 0):.3f} " + r"\\") L += [r"\bottomrule\end{tabular}\end{table}", r"The injector covers only the recoverable surface classes it targets by design", r"(typo/case/whitespace; injector--taxonomy agreement " + f"{out['injected']['injector_taxonomy_agreement']:.3f}" + r"), whereas real errors", r"are dominated by substitutions beyond edit distance~2 (other, " + f"{rd['other']:.3f}" + r") and short typos (" + f"{rd['typo']:.3f}" + r"), with numeric (" + f"{rd['numeric']:.3f}" + r"), missing-value (" + f"{rd['missing']:.3f}" + r"), and encoding classes the injector never produces.", r"Pooled Jensen--Shannon divergence is " + f"{j['pooled']:.3f}" + r"~bits (per-source median " + f"{j['median']:.3f}" + r", range " + f"{j['min']:.3f}" + r"--" + f"{j['max']:.3f}" + r"; hospital " + f"{j['hospital_vs_injected']:.3f}" + r"): the two slices are \emph{not}", r"interchangeable, which is why the paper reports them separately and localizes", r"the grounding claim in the real slice. Ranking preservation is partial: Kendall", r"$\tau_b$ between system rankings on the injected vs.\ real F1 slices is $" + f"{rk['kendall_tau_b_money_table']:.2f}" + r"$ over the four cross-system rows and $" + f"{rk['kendall_tau_b_with_anchors']:.2f}" + r"$ with the degenerate anchors", r"(abstain-all, random-edit, oracle) included. The injected slice preserves the", r"floor/ceiling ordering but ranks OpenRefine fingerprint above both our system", r"and OpenRefine kNN, the reverse of the real slice --- frequency clustering looks", r"strong exactly where the canonical form is present and dominant by construction.", r"Injected-only evaluation would therefore overstate frequency-clustering", r"baselines."] (res / "inject_validity_appendix.tex").write_text("\n".join(L) + "\n") def main() -> None: t0 = time.perf_counter() # (1) real errors: all 42 paired sources (hospital included -> its 509) real_per: dict[str, collections.Counter] = {} for p in pairs(): try: dirty, clean = _load(p) except Exception as e: # noqa: BLE001 print(f" {p.name}: LOAD FAILED {type(e).__name__}") continue real_per[p.name] = _classify_pair(dirty, clean) print(f" real {p.name:<46} n={sum(real_per[p.name].values())}", flush=True) real_pool = sum(real_per.values(), collections.Counter()) t_real = time.perf_counter() - t0 # (2) injected errors at the money-table seeds, via the SAME suite generator inj_pool = collections.Counter() inj_per_injector: dict[str, collections.Counter] = collections.defaultdict(collections.Counter) inj_per_seed = {} for s in SEEDS: cs = collections.Counter() for spec in build_suite(seed=s): if spec["source"] != "injected": continue loaded = spec["load"]() if loaded is None: continue dirty, clean = loaded c = _classify_pair(dirty, clean) cs += c inj_per_injector[spec["name"].split(":")[1]] += c inj_per_seed[s] = sum(cs.values()) inj_pool += cs print(f" injected seed={s} n={inj_per_seed[s]}", flush=True) agree = sum(inj_per_injector[et][want] for et, want in EXPECT.items()) t_inj = time.perf_counter() - t0 - t_real # (3) distribution similarity jsd_per_source = {k: round(_jsd(real_per[k], inj_pool), 4) for k in sorted(real_per) if real_per[k]} jsd_vals = sorted(jsd_per_source.values()) # (4) ranking preservation: money-table systems + degenerate anchors money = json.load(open(ROOT / "eval" / "results" / "money_table_head.json")) systems = [{"system": r["system"], "real_f1": r["real_f1"], "inj_f1": r["inj_f1"], "anchor": False} for r in money] for name, fn in [("abstain-all", _abstain_all), ("random-edit", _random_edit), ("oracle", _oracle)]: rf, jf = _suite_slices(fn) systems.append({"system": name, "real_f1": rf, "inj_f1": jf, "anchor": True}) print(f" anchor {name:<12} real={rf:.3f} inj={jf:.3f}", flush=True) tau_money = _tau_b([s["real_f1"] for s in systems if not s["anchor"]], [s["inj_f1"] for s in systems if not s["anchor"]]) tau_all = _tau_b([s["real_f1"] for s in systems], [s["inj_f1"] for s in systems]) out = { "taxonomy": CATS, "seeds": list(SEEDS), "real": {"n": sum(real_pool.values()), "n_sources": len(real_per), "hospital_n": sum(real_per.get("hospital", {}).values()), "pooled_counts": dict(real_pool), "pooled_dist": _dist(real_pool), "per_source": {k: {"n": sum(v.values()), "dist": _dist(v)} for k, v in sorted(real_per.items())}}, "injected": {"n": sum(inj_pool.values()), "per_seed_n": inj_per_seed, "pooled_counts": dict(inj_pool), "pooled_dist": _dist(inj_pool), "per_injector_dist": {k: _dist(v) for k, v in sorted(inj_per_injector.items())}, "injector_taxonomy_agreement": round(agree / sum(inj_pool.values()), 4)}, "jsd": {"pooled": round(_jsd(real_pool, inj_pool), 4), "hospital_vs_injected": round(_jsd(real_per["hospital"], inj_pool), 4), "per_real_source_vs_injected": jsd_per_source, "min": jsd_vals[0], "median": jsd_vals[len(jsd_vals) // 2], "max": jsd_vals[-1]}, "ranking": {"systems": systems, "kendall_tau_b_money_table": round(tau_money, 4), "kendall_tau_b_with_anchors": round(tau_all, 4)}, "sec": {"real_classify": round(t_real, 1), "injected_classify": round(t_inj, 1), "total": round(time.perf_counter() - t0, 1)}, } res = ROOT / "eval" / "results" json.dump(out, open(res / "inject_validity.json", "w"), indent=1) _write_tex(out, res) print(f"JSD pooled={out['jsd']['pooled']} tau(money)={tau_money:.3f} " f"tau(+anchors)={tau_all:.3f} -> {res / 'inject_validity.json'} " f"+ inject_validity_appendix.tex ({out['sec']['total']}s)") if __name__ == "__main__": import argparse ap = argparse.ArgumentParser() ap.add_argument("--tex-only", action="store_true", help="rebuild the LaTeX snippet from the existing JSON") if ap.parse_args().tex_only: res = ROOT / "eval" / "results" _write_tex(json.load(open(res / "inject_validity.json")), res) print(f"-> {res / 'inject_validity_appendix.tex'}") else: main()