scrubdata / eval /inject_validity.py
OpenAI Codex
deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build
16dc556
Raw
History Blame Contribute Delete
14.3 kB
"""W4.5 inject-validity (TableEG-style) — does the injected slice LOOK like and RANK
like the real slice?
(1) Classifies every real dirty->gold cell error (hospital's 509 + all 42 paired
sources eval/paired_bench.py walks) with a deterministic taxonomy (typo/edit-dist<=2,
case-only, whitespace, encoding/mojibake, numeric, date-format, token-swap, missing,
other); (2) classifies the suite's INJECTED errors at the money-table seeds (7/17/27);
(3) reports Jensen-Shannon divergence (base 2) between injected and real type
distributions, pooled and per real source; (4) reports Kendall tau-b between system
rankings on the injected vs real F1 slices of money_table_head.json, with degenerate
policies (abstain-all / random-edit / oracle) run through the same suite as anchors.
Honesty rule: if the injector is far from real (high JSD), that IS the result — the
paper's mitigation (both slices reported separately) already stands.
Usage:
    uv run python -m eval.inject_validity             # full run (~15 min CPU)
    uv run python -m eval.inject_validity --tex-only  # rebuild the snippet from JSON

Writes eval/results/inject_validity.json + eval/results/inject_validity_appendix.tex.
"""
from __future__ import annotations
import collections
import json
import math
import time
from datetime import datetime
from pathlib import Path
from .degenerate import _abstain_all, _oracle, _random_edit
from .metrics import _cell_equal
from .paired_bench import _load, pairs
from .run_real_multi import build_suite, score
# Directory that contains this file's own directory; inputs and outputs are
# addressed below as ROOT / "eval" / "results" / ...
ROOT = Path(__file__).resolve().parent.parent
SEEDS = (7, 17, 27) # money-table seeds (run_real_multi.main)
# Taxonomy labels. Every string returned by classify() is one of these; the list
# order is the key order of _dist() and the row order of the LaTeX table.
CATS = ["typo", "case", "whitespace", "encoding", "numeric", "date-format",
"token-swap", "missing", "other"]
# Injector name (the segment after ":" in a suite spec name, see main) -> the
# taxonomy label it is expected to receive; used for the agreement figure.
EXPECT = {"typo": "typo", "ocr": "typo", "case": "case", "whitespace": "whitespace"}
# Substrings whose presence on exactly one side of a pair makes classify()
# return "encoding".
_MOJI = ("�", "Ã", "Â", "â€", "ï¿")
# strptime formats tried in this order by _date(); the first one that parses wins.
_DATE_FMTS = ("%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y", "%m/%d/%y", "%Y/%m/%d",
"%d-%m-%Y", "%b %d, %Y", "%B %d, %Y", "%d %b %Y", "%Y%m%d")
def _num(s: str):
    """Parse *s* as a finite number, tolerating light money/percent decoration.

    Surrounding whitespace, thousands commas, leading ``$`` and trailing ``%``
    are removed before conversion.

    Returns the parsed ``float``, or ``None`` when the text is not a number.
    Non-finite results are rejected as well: ``float()`` happily accepts
    "nan", "inf" and "infinity", but those tokens are missing-value markers or
    plain words in tabular data, not numeric cell values.
    """
    t = s.strip().replace(",", "").lstrip("$").rstrip("%")
    try:
        value = float(t)
    except ValueError:
        return None
    return value if math.isfinite(value) else None
def _date(s: str):
    """Parse *s* with the first matching entry of ``_DATE_FMTS``.

    The text is stripped of surrounding whitespace first. Returns a
    ``datetime.date`` on success and ``None`` when no format parses.
    """
    text = s.strip()
    for fmt in _DATE_FMTS:
        try:
            parsed = datetime.strptime(text, fmt)
        except ValueError:
            # This format does not fit; fall through to the next candidate.
            continue
        return parsed.date()
    return None
def _lev_gt2(a: str, b: str) -> bool:
    """Report whether the Levenshtein distance between *a* and *b* exceeds 2.

    Only cells within 2 of the DP diagonal are evaluated, and every stored
    cost saturates at 3, so the answer is a yes/no on the threshold rather
    than the exact distance.
    """
    limit = 2
    cap = limit + 1  # saturation value: "already more than the limit"
    n_a, n_b = len(a), len(b)
    # A length gap larger than the limit cannot be bridged by <= limit edits.
    if abs(n_a - n_b) > limit:
        return True
    above = [min(col, cap) for col in range(n_b + 1)]
    for row, ch in enumerate(a, start=1):
        first = max(1, row - limit)
        last = min(n_b, row + limit)
        here = [cap] * (n_b + 1)
        # Column 0 (empty prefix of b) is still inside the band for early rows.
        if row <= limit:
            here[0] = row
        for col in range(first, last + 1):
            from_above = above[col] + 1
            from_left = here[col - 1] + 1
            from_diag = above[col - 1] + (ch != b[col - 1])
            here[col] = min(from_above, from_left, from_diag, cap)
        above = here
        # Once the whole band is saturated no later row can come back under.
        if all(cost >= cap for cost in above[first - 1:last + 1]):
            return True
    return above[n_b] > limit
def _is_null(v) -> bool:
    """True for ``None`` and float NaN (NaN is the only float unequal to itself)."""
    return v is None or (isinstance(v, float) and v != v)


def classify(d, g) -> str:
    """Assign one taxonomy label to a (dirty, gold) cell pair.

    Checks run in a fixed order and the first match wins: missing, whitespace,
    case, encoding, numeric, date-format, token-swap, typo (edit distance
    <= 2), and finally "other". Surface classes therefore take precedence over
    value classes, and edit distance is only consulted last.

    Args:
        d: the dirty cell value (any type; compared via ``str``).
        g: the gold cell value (any type; compared via ``str``).

    Returns:
        One of the label strings listed in ``CATS``.
    """
    # None / NaN stringify to "None" / "nan", which are not blank and would
    # otherwise be labelled by whatever later rule happens to fire. Treat a
    # null on either side as a missing value up front.
    if _is_null(d) or _is_null(g):
        return "missing"
    ds, gs = str(d), str(g)
    if not ds.strip() or not gs.strip():
        return "missing"
    # Compare with all whitespace removed: equal text here means the pair
    # differs only in spacing.
    d_packed, g_packed = "".join(ds.split()), "".join(gs.split())
    if d_packed == g_packed:
        return "whitespace"
    if d_packed.casefold() == g_packed.casefold():
        return "case"
    # Mojibake marker present on exactly one side.
    if any(m in ds for m in _MOJI) != any(m in gs for m in _MOJI):
        return "encoding"
    if _num(ds) is not None and _num(gs) is not None:
        return "numeric"
    # Same calendar date written in two different formats.
    dd, gd = _date(ds), _date(gs)
    if dd is not None and dd == gd:
        return "date-format"
    # Same multiset of case-folded tokens in a different order.
    dt, gt = sorted(ds.casefold().split()), sorted(gs.casefold().split())
    if dt == gt and len(dt) > 1:
        return "token-swap"
    if not _lev_gt2(ds.strip(), gs.strip()):
        return "typo"
    return "other"
def _classify_pair(dirty, clean) -> collections.Counter:
    """Count error types over the cells where *dirty* and *clean* disagree.

    Cells are compared position-by-position with ``.iat``; a cell counts as an
    error when ``_cell_equal`` says the two values differ, and its type comes
    from ``classify``. Only the region both frames share is inspected: rows
    and columns beyond the smaller frame are ignored.

    Args:
        dirty: frame-like object exposing ``len()``, ``.shape`` and ``.iat``.
        clean: the gold counterpart with the same interface.

    Returns:
        A ``Counter`` mapping taxonomy label -> number of error cells.
    """
    n_rows = min(len(dirty), len(clean))
    # Clamp columns the same way rows are clamped; iterating dirty's full
    # width would index past the edge of a narrower clean frame.
    n_cols = min(dirty.shape[1], clean.shape[1])
    counts = collections.Counter()
    for j in range(n_cols):
        for i in range(n_rows):
            dv, cv = dirty.iat[i, j], clean.iat[i, j]
            if not _cell_equal(dv, cv):
                counts[classify(dv, cv)] += 1
    return counts
def _jsd(p: dict, q: dict) -> float:
    """Jensen-Shannon divergence, base 2 (0 = identical, 1 = disjoint).

    Args:
        p: mapping of category -> non-negative count (need not be normalised).
        q: second mapping of the same kind.

    Returns:
        The divergence between the two normalised distributions.

    Raises:
        ValueError: if either mapping has zero total mass, in which case the
            divergence is undefined.
    """
    sp, sq = sum(p.values()), sum(q.values())
    if not sp or not sq:
        raise ValueError("Jensen-Shannon divergence is undefined for an empty distribution")
    terms = []
    for k in set(p) | set(q):
        a, b = p.get(k, 0) / sp, q.get(k, 0) / sq
        m = (a + b) / 2
        # Zero-probability terms contribute nothing (0 * log 0 := 0).
        if a:
            terms.append(0.5 * a * math.log2(a / m))
        if b:
            terms.append(0.5 * b * math.log2(b / m))
    # fsum is exactly rounded, so the result does not depend on the (hash-
    # seed-dependent) iteration order of the key set.
    return math.fsum(terms)
def _tau_b(xs, ys) -> float:
    """Tie-corrected Kendall rank correlation (tau-b) of two score lists.

    Every unordered index pair is visited once (quadratic, intended for short
    lists). Returns 0.0 when the tie-corrected denominator is zero, e.g. for
    fewer than two items or when one list is constant.
    """
    pairs_seen = concordant = discordant = tied_x = tied_y = 0
    for j in range(1, len(xs)):
        for i in range(j):
            pairs_seen += 1
            dx = xs[i] - xs[j]
            dy = ys[i] - ys[j]
            if dx == 0:
                tied_x += 1
            if dy == 0:
                tied_y += 1
            # Same sign -> the pair is ordered alike in both lists.
            agreement = dx * dy
            if agreement > 0:
                concordant += 1
            elif agreement < 0:
                discordant += 1
    denom = ((pairs_seen - tied_x) * (pairs_seen - tied_y)) ** 0.5
    if not denom:
        return 0.0
    return (concordant - discordant) / denom
def _dist(counter) -> dict:
    """Turn label counts into proportions over ``CATS``, rounded to 4 places.

    Labels absent from *counter* get 0.0. An all-zero or empty counter yields
    an empty dict instead of dividing by zero.
    """
    total = sum(counter.values())
    if not total:
        return {}
    shares = {}
    for label in CATS:
        shares[label] = round(counter.get(label, 0) / total, 4)
    return shares
def _suite_slices(cleaner) -> tuple[float, float]:
    """Score a degenerate ``cleaner(dirty, clean) -> out`` on both suite slices.

    Returns ``(real_mean_f1, injected_mean_f1)``. The real mean is taken over
    the "real" specs of the suite built with the first seed; the injected
    figure is the mean over ``SEEDS`` of each seed's mean F1 across its
    "injected" specs, skipping specs whose loader returns ``None``.
    """
    def f1_for(frames):
        dirty, clean = frames
        return score(dirty, clean, cleaner(dirty, clean))["f1"]

    real_scores = [
        f1_for(spec["load"]())
        for spec in build_suite(seed=SEEDS[0])
        if spec["source"] == "real"
    ]

    seed_means = []
    for seed in SEEDS:
        # Lazy, so each table is loaded and scored before the next is loaded.
        loaded = (
            spec["load"]()
            for spec in build_suite(seed=seed)
            if spec["source"] == "injected"
        )
        scores = [f1_for(frames) for frames in loaded if frames is not None]
        seed_means.append(sum(scores) / len(scores))

    return sum(real_scores) / len(real_scores), sum(seed_means) / len(seed_means)
def _write_tex(out: dict, res: Path) -> None:
    """Render the appendix LaTeX snippet from a results dict and write it.

    Args:
        out: the results structure assembled by ``main`` (or reloaded from
            ``inject_validity.json``); the "real", "injected", "jsd",
            "ranking" and "seeds" entries are read.
        res: directory that receives ``inject_validity_appendix.tex``.

    The file is written as UTF-8 explicitly: the header comment contains an
    em dash, so the platform default encoding must not decide the bytes.
    """
    rd, jd = out["real"]["pooled_dist"], out["injected"]["pooled_dist"]
    j, rk = out["jsd"], out["ranking"]
    # Report how many sources actually contributed (some may have failed to
    # load). Results files lacking the key fall back to the previously
    # hard-coded 42.
    n_sources = out["real"].get("n_sources", 42)

    def thousands(n) -> str:
        # 12345 -> "12{,}345": braces stop math mode from spacing the comma.
        return f"{n:,}".replace(",", r"{,}")

    # NOTE(review): "four cross-system rows" and the OpenRefine narrative
    # below are fixed prose; they are not derived from `out`.
    lines = [r"% Auto-generated by eval/inject_validity.py — do not edit by hand.",
             r"\subsection{Validity of the Injected Slice}\label{app:inject-validity}",
             r"Following the TableEG-style audit, we classify every error cell (dirty vs.\ gold)",
             r"with a deterministic taxonomy and compare the suite's injected errors (money-table",
             r"seeds " + "/".join(map(str, out["seeds"])) + r", $n=" +
             thousands(out['injected']['n']) + r"$) against the $" +
             thousands(out['real']['n']) +
             r"$ real errors across the " + f"{n_sources}" +
             r" paired sources (hospital's " +
             f"{out['real']['hospital_n']}" + r" included).",
             r"\begin{table}[t]\centering\small",
             r"\caption{Error-type distributions, real vs.\ injected (pooled).}",
             r"\label{tab:inject-validity}",
             r"\begin{tabular}{lrr}\toprule",
             r"error type & real & injected \\ \midrule"]
    # One table row per taxonomy label, in CATS order.
    for c in CATS:
        lines.append(f"{c} & {rd.get(c, 0):.3f} & {jd.get(c, 0):.3f} " + r"\\")
    lines += [r"\bottomrule\end{tabular}\end{table}",
              r"The injector covers only the recoverable surface classes it targets by design",
              r"(typo/case/whitespace; injector--taxonomy agreement " +
              f"{out['injected']['injector_taxonomy_agreement']:.3f}" + r"), whereas real errors",
              r"are dominated by substitutions beyond edit distance~2 (other, " +
              f"{rd['other']:.3f}" + r") and short typos (" + f"{rd['typo']:.3f}" +
              r"), with numeric (" + f"{rd['numeric']:.3f}" + r"), missing-value (" +
              f"{rd['missing']:.3f}" + r"), and encoding classes the injector never produces.",
              r"Pooled Jensen--Shannon divergence is " + f"{j['pooled']:.3f}" +
              r"~bits (per-source median " + f"{j['median']:.3f}" + r", range " +
              f"{j['min']:.3f}" + r"--" + f"{j['max']:.3f}" + r"; hospital " +
              f"{j['hospital_vs_injected']:.3f}" + r"): the two slices are \emph{not}",
              r"interchangeable, which is why the paper reports them separately and localizes",
              r"the grounding claim in the real slice. Ranking preservation is partial: Kendall",
              r"$\tau_b$ between system rankings on the injected vs.\ real F1 slices is $" +
              f"{rk['kendall_tau_b_money_table']:.2f}" + r"$ over the four cross-system rows and $" +
              f"{rk['kendall_tau_b_with_anchors']:.2f}" + r"$ with the degenerate anchors",
              r"(abstain-all, random-edit, oracle) included. The injected slice preserves the",
              r"floor/ceiling ordering but ranks OpenRefine fingerprint above both our system",
              r"and OpenRefine kNN, the reverse of the real slice --- frequency clustering looks",
              r"strong exactly where the canonical form is present and dominant by construction.",
              r"Injected-only evaluation would therefore overstate frequency-clustering",
              r"baselines."]
    (res / "inject_validity_appendix.tex").write_text("\n".join(lines) + "\n",
                                                      encoding="utf-8")
def main() -> None:
    """Run the full inject-validity audit and write the JSON + LaTeX outputs.

    Steps: (1) classify the real errors of every paired source that loads;
    (2) classify the injected errors of the suite at each seed in ``SEEDS``;
    (3) compute Jensen-Shannon divergence between injected and real type
    distributions, pooled and per real source; (4) compute Kendall tau-b
    between real-slice and injected-slice F1 for the systems listed in
    ``money_table_head.json``, with and without the degenerate anchors.

    Writes ``inject_validity.json`` and ``inject_validity_appendix.tex`` under
    ``ROOT/eval/results``.

    Raises:
        RuntimeError: if the hospital source is unavailable or has no errors,
            or if the injected slice has no error cells. Every later figure
            depends on these, so the run stops before the expensive steps
            instead of failing at the very end.
    """
    import statistics  # function-scope: only the JSD summary below needs it

    t0 = time.perf_counter()
    # (1) real errors: all 42 paired sources (hospital included -> its 509)
    real_per: dict[str, collections.Counter] = {}
    for p in pairs():
        try:
            dirty, clean = _load(p)
        except Exception as e:  # noqa: BLE001
            # Best effort: one unreadable source must not abort the audit.
            print(f" {p.name}: LOAD FAILED {type(e).__name__}")
            continue
        real_per[p.name] = _classify_pair(dirty, clean)
        print(f" real {p.name:<46} n={sum(real_per[p.name].values())}", flush=True)
    if not real_per.get("hospital"):
        raise RuntimeError("hospital source failed to load or has no error cells; "
                           "hospital-vs-injected JSD cannot be computed")
    real_pool = sum(real_per.values(), collections.Counter())
    t_real = time.perf_counter() - t0
    # (2) injected errors at the money-table seeds, via the SAME suite generator
    inj_pool = collections.Counter()
    inj_per_injector: dict[str, collections.Counter] = collections.defaultdict(collections.Counter)
    inj_per_seed = {}
    for s in SEEDS:
        cs = collections.Counter()
        for spec in build_suite(seed=s):
            if spec["source"] != "injected":
                continue
            loaded = spec["load"]()
            if loaded is None:
                continue
            dirty, clean = loaded
            c = _classify_pair(dirty, clean)
            cs += c
            # The injector name is the second ":"-separated field of the spec name.
            inj_per_injector[spec["name"].split(":")[1]] += c
        inj_per_seed[s] = sum(cs.values())
        inj_pool += cs
        print(f" injected seed={s} n={inj_per_seed[s]}", flush=True)
    n_injected = sum(inj_pool.values())
    if not n_injected:
        raise RuntimeError("the injected slice produced no error cells; "
                           "nothing to compare against the real slice")
    # Cells whose taxonomy label matches what their injector is expected to produce.
    agree = sum(inj_per_injector[et][want] for et, want in EXPECT.items())
    t_inj = time.perf_counter() - t0 - t_real
    # (3) distribution similarity (sources without any error have no distribution)
    jsd_per_source = {k: round(_jsd(real_per[k], inj_pool), 4)
                      for k in sorted(real_per) if real_per[k]}
    jsd_vals = sorted(jsd_per_source.values())
    # (4) ranking preservation: money-table systems + degenerate anchors
    with open(ROOT / "eval" / "results" / "money_table_head.json", encoding="utf-8") as fh:
        money = json.load(fh)
    systems = [{"system": r["system"], "real_f1": r["real_f1"], "inj_f1": r["inj_f1"],
                "anchor": False} for r in money]
    for name, fn in [("abstain-all", _abstain_all), ("random-edit", _random_edit),
                     ("oracle", _oracle)]:
        rf, jf = _suite_slices(fn)
        systems.append({"system": name, "real_f1": rf, "inj_f1": jf, "anchor": True})
        print(f" anchor {name:<12} real={rf:.3f} inj={jf:.3f}", flush=True)
    tau_money = _tau_b([s["real_f1"] for s in systems if not s["anchor"]],
                       [s["inj_f1"] for s in systems if not s["anchor"]])
    tau_all = _tau_b([s["real_f1"] for s in systems], [s["inj_f1"] for s in systems])
    out = {
        "taxonomy": CATS, "seeds": list(SEEDS),
        "real": {"n": sum(real_pool.values()), "n_sources": len(real_per),
                 "hospital_n": sum(real_per["hospital"].values()),
                 "pooled_counts": dict(real_pool), "pooled_dist": _dist(real_pool),
                 "per_source": {k: {"n": sum(v.values()), "dist": _dist(v)}
                                for k, v in sorted(real_per.items())}},
        "injected": {"n": n_injected, "per_seed_n": inj_per_seed,
                     "pooled_counts": dict(inj_pool), "pooled_dist": _dist(inj_pool),
                     "per_injector_dist": {k: _dist(v)
                                           for k, v in sorted(inj_per_injector.items())},
                     "injector_taxonomy_agreement": round(agree / n_injected, 4)},
        "jsd": {"pooled": round(_jsd(real_pool, inj_pool), 4),
                "hospital_vs_injected": round(_jsd(real_per["hospital"], inj_pool), 4),
                "per_real_source_vs_injected": jsd_per_source,
                # True median: averages the two middle values when the number
                # of sources is even, instead of taking the upper one.
                "min": jsd_vals[0], "median": round(statistics.median(jsd_vals), 4),
                "max": jsd_vals[-1]},
        "ranking": {"systems": systems,
                    "kendall_tau_b_money_table": round(tau_money, 4),
                    "kendall_tau_b_with_anchors": round(tau_all, 4)},
        "sec": {"real_classify": round(t_real, 1), "injected_classify": round(t_inj, 1),
                "total": round(time.perf_counter() - t0, 1)},
    }
    res = ROOT / "eval" / "results"
    # Close the handle before moving on so the JSON is fully flushed to disk.
    with open(res / "inject_validity.json", "w", encoding="utf-8") as fh:
        json.dump(out, fh, indent=1)
    _write_tex(out, res)
    print(f"JSD pooled={out['jsd']['pooled']} tau(money)={tau_money:.3f} "
          f"tau(+anchors)={tau_all:.3f} -> {res / 'inject_validity.json'} "
          f"+ inject_validity_appendix.tex ({out['sec']['total']}s)")
# Script entry point: full audit by default, or re-render the LaTeX snippet
# from a previously written results file with --tex-only.
if __name__ == "__main__":
    import argparse

    ap = argparse.ArgumentParser()
    ap.add_argument("--tex-only", action="store_true",
                    help="rebuild the LaTeX snippet from the existing JSON")
    if ap.parse_args().tex_only:
        res = ROOT / "eval" / "results"
        # Context manager so the results file is closed once it has been read.
        with open(res / "inject_validity.json", encoding="utf-8") as fh:
            saved = json.load(fh)
        _write_tex(saved, res)
        print(f"-> {res / 'inject_validity_appendix.tex'}")
    else:
        main()