Spaces:

build-small-hackathon
/

scrubdata

Running

File size: 14,301 Bytes

16dc556

"""W4.5 inject-validity (TableEG-style) — does the injected slice LOOK like and RANK
like the real slice?

(1) Classifies every real dirty->gold cell error (all 42 paired sources that
eval/paired_bench.py walks, hospital's 509 errors included) with a deterministic
taxonomy (typo/edit-dist<=2, case-only, whitespace, encoding/mojibake, numeric,
date-format, token-swap, missing,
other); (2) classifies the suite's INJECTED errors at the money-table seeds (7/17/27);
(3) reports Jensen-Shannon divergence (base 2) between injected and real type
distributions, pooled and per real source; (4) reports Kendall tau-b between system
rankings on the injected vs real F1 slices of money_table_head.json, with degenerate
policies (abstain-all / random-edit / oracle) run through the same suite as anchors.
Honesty rule: if the injector is far from real (high JSD), that IS the result — the
paper's mitigation (both slices reported separately) already stands.

    uv run python -m eval.inject_validity              # full run (~15 min CPU)
    uv run python -m eval.inject_validity --tex-only   # rebuild the snippet from JSON
Writes eval/results/inject_validity.json + eval/results/inject_validity_appendix.tex.
"""

from __future__ import annotations

import collections
import json
import math
import time
from datetime import datetime
from pathlib import Path

from .degenerate import _abstain_all, _oracle, _random_edit
from .metrics import _cell_equal
from .paired_bench import _load, pairs
from .run_real_multi import build_suite, score

# Parent of the directory that contains this file (presumably the repository
# root); results are read from / written to ROOT / "eval" / "results".
ROOT = Path(__file__).resolve().parent.parent
SEEDS = (7, 17, 27)            # money-table seeds (run_real_multi.main)
# Taxonomy labels. _dist() and the LaTeX table iterate this list, so its order
# fixes the key order of every reported distribution and the table's row order.
CATS = ["typo", "case", "whitespace", "encoding", "numeric", "date-format",
        "token-swap", "missing", "other"]
# Injector name (second ":"-separated field of a suite spec's name) -> taxonomy
# label its errors are expected to receive; main() uses it for the
# injector/taxonomy agreement figure.
EXPECT = {"typo": "typo", "ocr": "typo", "case": "case", "whitespace": "whitespace"}
# Mojibake marker substrings: classify() returns "encoding" when exactly one of
# the two cells contains any of them.
_MOJI = ("�", "Ã", "Â", "â€", "ï¿")
# strptime formats tried in this order by _date(); the first one that parses wins,
# so ambiguous day/month strings resolve to the earlier format (month-first).
_DATE_FMTS = ("%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y", "%m/%d/%y", "%Y/%m/%d",
              "%d-%m-%Y", "%b %d, %Y", "%B %d, %Y", "%d %b %Y", "%Y%m%d")


def _num(s: str):
    t = s.strip().replace(",", "").lstrip("$").rstrip("%")
    try:
        return float(t)
    except ValueError:
        return None


def _date(s: str):
    """Parse *s* with the first matching entry of ``_DATE_FMTS``.

    Returns the parsed ``datetime.date``, or ``None`` when no format matches.
    Surrounding whitespace is ignored.
    """
    text = s.strip()
    for fmt in _DATE_FMTS:
        try:
            parsed = datetime.strptime(text, fmt)
        except ValueError:
            continue  # this format does not fit; try the next one
        return parsed.date()
    return None


def _lev_gt2(a: str, b: str) -> bool:
    """True iff Levenshtein(a, b) > 2 (banded DP, O(len*5))."""
    k = 2
    la, lb = len(a), len(b)
    if abs(la - lb) > k:
        return True
    INF = k + 1
    prev = [min(j, INF) for j in range(lb + 1)]
    for i in range(1, la + 1):
        lo, hi = max(1, i - k), min(lb, i + k)
        cur = [INF] * (lb + 1)
        if i <= k:
            cur[0] = i
        for j in range(lo, hi + 1):
            cur[j] = min(prev[j] + 1, cur[j - 1] + 1,
                         prev[j - 1] + (a[i - 1] != b[j - 1]), INF)
        prev = cur
        if min(prev[max(0, lo - 1):hi + 1]) >= INF:
            return True
    return prev[lb] > k


def classify(d, g) -> str:
    """Deterministic error type for one (dirty, gold) cell pair.

    Args:
        d: dirty cell value (any type; compared via its ``str()`` form).
        g: gold cell value.

    Returns:
        One taxonomy label. Order matters — the first matching rule wins:
        missing, whitespace, case, encoding, numeric, date-format, token-swap,
        typo (edit distance <= 2), else other.

    ``None`` and float NaN count as "missing". Without that check they would
    stringify to "None" / "nan", pass the blank test and be labelled by edit
    distance instead.
    """
    for cell in (d, g):
        if cell is None or (isinstance(cell, float) and math.isnan(cell)):
            return "missing"
    ds, gs = str(d), str(g)
    if not ds.strip() or not gs.strip():
        return "missing"
    # Forms with every whitespace character removed, shared by the two surface checks.
    d_flat, g_flat = "".join(ds.split()), "".join(gs.split())
    if d_flat == g_flat:
        return "whitespace"
    if d_flat.casefold() == g_flat.casefold():
        return "case"
    # Mojibake marker present on exactly one side.
    if any(m in ds for m in _MOJI) != any(m in gs for m in _MOJI):
        return "encoding"
    if _num(ds) is not None and _num(gs) is not None:
        return "numeric"
    dd, gd = _date(ds), _date(gs)
    if dd is not None and dd == gd:
        return "date-format"  # same calendar date, different rendering
    dt, gt = sorted(ds.casefold().split()), sorted(gs.casefold().split())
    if dt == gt and len(dt) > 1:
        return "token-swap"  # same tokens, different order
    if not _lev_gt2(ds.strip(), gs.strip()):
        return "typo"
    return "other"


def _classify_pair(dirty, clean) -> collections.Counter:
    """Count error types over the cells where *dirty* and *clean* disagree.

    Both arguments must support ``len()``, ``.shape`` and ``.iat[row, col]``
    (pandas-DataFrame-like — NOTE(review): confirm against ``_load``). Cells are
    paired by position; only the first ``min(len(dirty), len(clean))`` rows are
    compared, and the column count is taken from *dirty*.
    """
    n_rows = min(len(dirty), len(clean))
    n_cols = dirty.shape[1]

    def _mismatches():
        # Column-major walk, so labels enter the Counter in a stable order.
        for col in range(n_cols):
            for row in range(n_rows):
                observed, expected = dirty.iat[row, col], clean.iat[row, col]
                if not _cell_equal(observed, expected):
                    yield observed, expected

    return collections.Counter(classify(dv, cv) for dv, cv in _mismatches())


def _jsd(p: dict, q: dict) -> float:
    """Jensen-Shannon divergence, base 2 (0 = identical, 1 = disjoint)."""
    sp, sq = sum(p.values()), sum(q.values())
    out = 0.0
    for k in set(p) | set(q):
        a, b = p.get(k, 0) / sp, q.get(k, 0) / sq
        m = (a + b) / 2
        if a:
            out += 0.5 * a * math.log2(a / m)
        if b:
            out += 0.5 * b * math.log2(b / m)
    return out


def _tau_b(xs, ys) -> float:
    """Kendall tau-b (tie-corrected); n is small, O(n^2) is fine."""
    n0 = nc = nd = tx = ty = 0
    for i in range(len(xs)):
        for j in range(i + 1, len(xs)):
            n0 += 1
            a, b = xs[i] - xs[j], ys[i] - ys[j]
            tx += a == 0
            ty += b == 0
            nc += a * b > 0
            nd += a * b < 0
    den = ((n0 - tx) * (n0 - ty)) ** 0.5
    return (nc - nd) / den if den else 0.0


def _dist(counter) -> dict:
    tot = sum(counter.values())
    return {k: round(counter.get(k, 0) / tot, 4) for k in CATS} if tot else {}


def _suite_slices(cleaner) -> tuple[float, float]:
    """Score a degenerate cleaner on both slices of the suite.

    *cleaner* is called as ``cleaner(dirty, clean)`` and its output is passed to
    ``score``. Returns ``(real_mean_f1, injected_mean_f1)``: the real slice is the
    mean F1 over the "real" specs of the first seed's suite; the injected slice is
    the mean over ``SEEDS`` of each seed's mean F1 over its "injected" specs.
    Injected specs whose loader returns ``None`` are skipped.
    """
    def _f1(dirty, clean) -> float:
        return score(dirty, clean, cleaner(dirty, clean))["f1"]

    real_scores = []
    for spec in build_suite(seed=SEEDS[0]):
        if spec["source"] == "real":
            dirty, clean = spec["load"]()
            real_scores.append(_f1(dirty, clean))

    seed_means = []
    for seed in SEEDS:
        # Lazy generator: each table is loaded right before it is scored.
        tables = (spec["load"]() for spec in build_suite(seed=seed)
                  if spec["source"] == "injected")
        seed_scores = []
        for table in tables:
            if table is not None:
                dirty, clean = table
                seed_scores.append(_f1(dirty, clean))
        seed_means.append(sum(seed_scores) / len(seed_scores))

    return sum(real_scores) / len(real_scores), sum(seed_means) / len(seed_means)


def _write_tex(out: dict, res: Path) -> None:
    """Render the appendix LaTeX snippet from a results dict.

    Args:
        out: results dict in the layout main() builds; the keys read here are
            "seeds", "real", "injected", "jsd" and "ranking".
        res: directory that receives ``inject_validity_appendix.tex``.

    The number of paired sources comes from ``out["real"]["n_sources"]`` so the
    text stays true when a source failed to load; 42 is the fallback for result
    files that lack the key. Category shares are read with ``.get`` so an empty
    distribution renders as 0.000 instead of raising KeyError. The narrative
    sentences about specific systems are fixed prose and are not derived from
    *out*. The file is written as UTF-8 explicitly because the header line
    contains a non-ASCII dash and the platform default encoding may not be UTF-8.
    """
    rd, jd = out["real"]["pooled_dist"], out["injected"]["pooled_dist"]
    j, rk = out["jsd"], out["ranking"]

    def _thousands(n) -> str:
        # 12345 -> "12{,}345": braces stop LaTeX math mode spacing the comma.
        return f"{n:,}".replace(",", r"{,}")

    n_sources = out["real"].get("n_sources", 42)
    L = [r"% Auto-generated by eval/inject_validity.py — do not edit by hand.",
         r"\subsection{Validity of the Injected Slice}\label{app:inject-validity}",
         r"Following the TableEG-style audit, we classify every error cell (dirty vs.\ gold)",
         r"with a deterministic taxonomy and compare the suite's injected errors (money-table",
         r"seeds " + "/".join(map(str, out["seeds"])) + r", $n=" +
         _thousands(out['injected']['n']) + r"$) against the $" +
         _thousands(out['real']['n']) +
         r"$ real errors across the " + f"{n_sources}" +
         r" paired sources (hospital's " +
         f"{out['real']['hospital_n']}" + r" included).",
         r"\begin{table}[t]\centering\small",
         r"\caption{Error-type distributions, real vs.\ injected (pooled).}",
         r"\label{tab:inject-validity}",
         r"\begin{tabular}{lrr}\toprule",
         r"error type & real & injected \\ \midrule"]
    for c in CATS:
        L.append(f"{c} & {rd.get(c, 0):.3f} & {jd.get(c, 0):.3f} " + r"\\")
    L += [r"\bottomrule\end{tabular}\end{table}",
          r"The injector covers only the recoverable surface classes it targets by design",
          r"(typo/case/whitespace; injector--taxonomy agreement " +
          f"{out['injected']['injector_taxonomy_agreement']:.3f}" + r"), whereas real errors",
          r"are dominated by substitutions beyond edit distance~2 (other, " +
          f"{rd.get('other', 0):.3f}" + r") and short typos (" +
          f"{rd.get('typo', 0):.3f}" +
          r"), with numeric (" + f"{rd.get('numeric', 0):.3f}" + r"), missing-value (" +
          f"{rd.get('missing', 0):.3f}" +
          r"), and encoding classes the injector never produces.",
          r"Pooled Jensen--Shannon divergence is " + f"{j['pooled']:.3f}" +
          r"~bits (per-source median " + f"{j['median']:.3f}" + r", range " +
          f"{j['min']:.3f}" + r"--" + f"{j['max']:.3f}" + r"; hospital " +
          f"{j['hospital_vs_injected']:.3f}" + r"): the two slices are \emph{not}",
          r"interchangeable, which is why the paper reports them separately and localizes",
          r"the grounding claim in the real slice. Ranking preservation is partial: Kendall",
          r"$\tau_b$ between system rankings on the injected vs.\ real F1 slices is $" +
          f"{rk['kendall_tau_b_money_table']:.2f}" + r"$ over the four cross-system rows and $" +
          f"{rk['kendall_tau_b_with_anchors']:.2f}" + r"$ with the degenerate anchors",
          r"(abstain-all, random-edit, oracle) included. The injected slice preserves the",
          r"floor/ceiling ordering but ranks OpenRefine fingerprint above both our system",
          r"and OpenRefine kNN, the reverse of the real slice --- frequency clustering looks",
          r"strong exactly where the canonical form is present and dominant by construction.",
          r"Injected-only evaluation would therefore overstate frequency-clustering",
          r"baselines."]
    (res / "inject_validity_appendix.tex").write_text("\n".join(L) + "\n", encoding="utf-8")


def main() -> None:
    """Run the full inject-validity audit and write the JSON + LaTeX results.

    Stages: (1) classify real errors per paired source; (2) classify injected
    errors for every seed in ``SEEDS``; (3) Jensen-Shannon divergence of each real
    source, and of the pooled real errors, against the pooled injected errors;
    (4) Kendall tau-b between real-slice and injected-slice F1 over the systems in
    ``money_table_head.json``, with and without the degenerate anchors.

    Writes ``eval/results/inject_validity.json`` and, through ``_write_tex``,
    ``eval/results/inject_validity_appendix.tex``.

    Raises:
        RuntimeError: when the "hospital" source yields no classified errors or
            no injected errors are found. Both conditions make the divergences
            undefined; they are checked before the slow anchor runs instead of
            surfacing later as KeyError / ZeroDivisionError.
    """
    import statistics

    t0 = time.perf_counter()
    res = ROOT / "eval" / "results"
    # (1) real errors: all 42 paired sources (hospital included -> its 509)
    real_per: dict[str, collections.Counter] = {}
    for p in pairs():
        try:
            dirty, clean = _load(p)
        except Exception as e:  # noqa: BLE001
            # Best effort: a source that cannot be loaded is reported and skipped.
            print(f"  {p.name}: LOAD FAILED {type(e).__name__}")
            continue
        real_per[p.name] = _classify_pair(dirty, clean)
        print(f"  real {p.name:<46} n={sum(real_per[p.name].values())}", flush=True)
    real_pool = sum(real_per.values(), collections.Counter())
    t_real = time.perf_counter() - t0
    if not real_per.get("hospital"):
        raise RuntimeError("hospital source missing or without error cells; "
                           "hospital-vs-injected JSD is undefined")

    # (2) injected errors at the money-table seeds, via the SAME suite generator
    inj_pool = collections.Counter()
    inj_per_injector: dict[str, collections.Counter] = collections.defaultdict(collections.Counter)
    inj_per_seed = {}
    for s in SEEDS:
        cs = collections.Counter()
        for spec in build_suite(seed=s):
            if spec["source"] != "injected":
                continue
            loaded = spec["load"]()
            if loaded is None:
                continue
            dirty, clean = loaded
            c = _classify_pair(dirty, clean)
            cs += c
            inj_per_injector[spec["name"].split(":")[1]] += c
        inj_per_seed[s] = sum(cs.values())
        inj_pool += cs
        print(f"  injected seed={s} n={inj_per_seed[s]}", flush=True)
    if not inj_pool:
        raise RuntimeError("no injected error cells found; injected distribution is empty")
    # Injected cells whose taxonomy label matches what their injector is meant to produce.
    agree = sum(inj_per_injector[et][want] for et, want in EXPECT.items())
    t_inj = time.perf_counter() - t0 - t_real

    # (3) distribution similarity (sources without any error cell are left out)
    jsd_per_source = {k: round(_jsd(real_per[k], inj_pool), 4)
                      for k in sorted(real_per) if real_per[k]}
    jsd_vals = sorted(jsd_per_source.values())
    # (4) ranking preservation: money-table systems + degenerate anchors
    with open(res / "money_table_head.json", encoding="utf-8") as fh:
        money = json.load(fh)
    systems = [{"system": r["system"], "real_f1": r["real_f1"], "inj_f1": r["inj_f1"],
                "anchor": False} for r in money]
    for name, fn in [("abstain-all", _abstain_all), ("random-edit", _random_edit),
                     ("oracle", _oracle)]:
        rf, jf = _suite_slices(fn)
        systems.append({"system": name, "real_f1": rf, "inj_f1": jf, "anchor": True})
        print(f"  anchor {name:<12} real={rf:.3f} inj={jf:.3f}", flush=True)
    tau_money = _tau_b([s["real_f1"] for s in systems if not s["anchor"]],
                       [s["inj_f1"] for s in systems if not s["anchor"]])
    tau_all = _tau_b([s["real_f1"] for s in systems], [s["inj_f1"] for s in systems])

    out = {
        "taxonomy": CATS, "seeds": list(SEEDS),
        "real": {"n": sum(real_pool.values()), "n_sources": len(real_per),
                 "hospital_n": sum(real_per["hospital"].values()),
                 "pooled_counts": dict(real_pool), "pooled_dist": _dist(real_pool),
                 "per_source": {k: {"n": sum(v.values()), "dist": _dist(v)}
                                for k, v in sorted(real_per.items())}},
        "injected": {"n": sum(inj_pool.values()), "per_seed_n": inj_per_seed,
                     "pooled_counts": dict(inj_pool), "pooled_dist": _dist(inj_pool),
                     "per_injector_dist": {k: _dist(v)
                                           for k, v in sorted(inj_per_injector.items())},
                     "injector_taxonomy_agreement": round(agree / sum(inj_pool.values()), 4)},
        "jsd": {"pooled": round(_jsd(real_pool, inj_pool), 4),
                "hospital_vs_injected": round(_jsd(real_per["hospital"], inj_pool), 4),
                "per_real_source_vs_injected": jsd_per_source,
                "min": jsd_vals[0],
                # True median: with an even number of sources the middle element
                # alone would be the upper median, not the median.
                "median": round(statistics.median(jsd_vals), 4),
                "max": jsd_vals[-1]},
        "ranking": {"systems": systems,
                    "kendall_tau_b_money_table": round(tau_money, 4),
                    "kendall_tau_b_with_anchors": round(tau_all, 4)},
        "sec": {"real_classify": round(t_real, 1), "injected_classify": round(t_inj, 1),
                "total": round(time.perf_counter() - t0, 1)},
    }
    # Close the JSON file before the LaTeX step so the results are on disk even
    # if rendering fails.
    with open(res / "inject_validity.json", "w", encoding="utf-8") as fh:
        json.dump(out, fh, indent=1)
    _write_tex(out, res)
    print(f"JSD pooled={out['jsd']['pooled']} tau(money)={tau_money:.3f} "
          f"tau(+anchors)={tau_all:.3f} -> {res / 'inject_validity.json'} "
          f"+ inject_validity_appendix.tex ({out['sec']['total']}s)")


if __name__ == "__main__":
    # CLI entry: a full run by default; --tex-only re-renders the LaTeX snippet
    # from the JSON written by an earlier full run.
    import argparse
    ap = argparse.ArgumentParser()
    ap.add_argument("--tex-only", action="store_true",
                    help="rebuild the LaTeX snippet from the existing JSON")
    if ap.parse_args().tex_only:
        res = ROOT / "eval" / "results"
        # Context manager closes the handle; explicit encoding avoids depending
        # on the platform default.
        with open(res / "inject_validity.json", encoding="utf-8") as fh:
            saved = json.load(fh)
        _write_tex(saved, res)
        print(f"-> {res / 'inject_validity_appendix.tex'}")
    else:
        main()