Spaces:

build-small-hackathon
/

scrubdata

Running

File size: 14,375 Bytes

16dc556

"""B1 (W4.2) dual-metric cross-scoring on the 5 Raha real-error datasets.

Scores every system under BOTH metric families, side by side:
  * original  — the Raha/Baran cell-level repair protocol (Mahdavi & Abedjan,
    PVLDB 13(12), p1948, Sec 6.1 + raha/dataset.py get_data_cleaning_evaluation):
    values minimally normalized (html-unescape, whitespace collapse — their
    value_normalizer), then RAW string equality; precision = exact-gold repairs /
    cells changed; recall = exact-gold repairs / (dirty->clean diff); no
    churn-neutrality, no case folding, no semantic tolerance, no damage metric.
  * churn_neutral — our eval.run_real_multi.score (the scoring contract):
    convention-normalized, churn ignored, damage reported.

Systems: grounded (HEAD mock_plan), verified union (v6, tau=0.5 — identical plan
files to eval.raha_table), OpenRefine fingerprint/kNN, and Baran at labeling
budgets 0/5/20 (oracle detection; repaired CSVs from eval/run_baran.py, 3 seeds,
seed-mean). Baran-from-CSV caveat: corrections equal to the dirty value vanish
from the repaired-vs-dirty diff, so reconstructed |changed| is a lower bound on
Baran's own output_size (precision an upper bound; recall exact).

Also computes Kendall tau-b between the SYSTEM RANKINGS induced by the two F1s
(per dataset + macro), and a calibration block: our Baran oracle+20 repro vs the
published Table 3 "Baran" row (verified from the PVLDB PDF; see PUBLISHED below).

Acceptance: the churn-neutral rows must reproduce eval/results/raha_per_dataset.json
exactly (checked, hard-fails otherwise).

    uv run python -m eval.cross_scoring
Writes eval/results/cross_scoring.json and prints LaTeX rows.
"""

from __future__ import annotations

import html
import json
import re
from pathlib import Path

import pandas as pd

from scrubdata.baselines import openrefine_fingerprint_plan, openrefine_knn_plan
from scrubdata.executor import apply_plan
from scrubdata.planner import mock_plan
from scrubdata.verifier import union_plans, verify_plan

from .precision_curve import _repairs_only
from .raha_table import TAU, UNION_PLANS, _gen_plan
from .run_real_multi import RAHA, _cell_only, _raha_pair, score

# Directory (next to this module) holding the result artefacts read and written here.
RESULTS = Path(__file__).resolve().parent / "results"
# Baran labeling budget -> directory containing that budget's repaired CSVs.
BARAN_DIRS = {
    budget: RESULTS / folder
    for budget, folder in ((0, "baran_n0"), (5, "baran_n5"), (20, "baran"))
}

# Published reference numbers: Baran PVLDB'20 Table 3, row "Baran" (no TL).
# Setting: the complete set of data errors is given as input (= oracle detection),
# labeling budget 20, mean of 10 runs. Verified by reading
# vldb.org/pvldb/vol13/p1948-mahdavi.pdf p1957 (2026-06-12).
# movies_1 is absent because the paper does not evaluate it (its real-error sets
# are hospital/flights/address/beers/rayyan/it/tax).
PUBLISHED = {
    dataset: {"precision": precision, "recall": recall, "f1": f1}
    for dataset, (precision, recall, f1) in (
        ("hospital", (0.88, 0.86, 0.87)),
        ("flights", (1.00, 1.00, 1.00)),
        ("beers", (0.91, 0.89, 0.90)),
        ("rayyan", (0.76, 0.40, 0.52)),
    )
}


def _norm(v: str) -> str:
    """raha.dataset.Dataset.value_normalizer, verbatim semantics."""
    v = html.unescape(str(v))
    v = re.sub("[\t\n ]+", " ", v, re.UNICODE)
    return v.strip("\t\n ")


def baran_score(dirty: pd.DataFrame, clean: pd.DataFrame, out: pd.DataFrame) -> dict:
    """Score a repaired table under the original Raha/Baran cell-level metric.

    Every cell is passed through ``_norm`` and then compared by raw string
    equality. A cell is an *error* when dirty != clean, *changed* when
    out != dirty, and a *true positive* when it is both and out == clean.

    Alignment: rows are matched by position and only the first
    ``min(len(dirty), len(out), len(clean))`` rows are scored. ``dirty`` and
    ``clean`` columns are matched by position; ``out`` columns are matched by
    name, and a dirty column missing from ``out`` counts as unchanged.

    Args:
        dirty: The input table before repair.
        clean: The ground-truth table.
        out: The repaired table produced by the system under test.

    Returns:
        Dict with ``f1``, ``precision`` (tp / changed), ``recall``
        (tp / errors) and the raw counts ``_errors``, ``_changed``, ``_tp``.
        Each ratio is 0.0 when its denominator is zero.
    """
    n = min(len(dirty), len(out), len(clean))
    errors = changed = tp = 0
    for j, col in enumerate(dirty.columns):
        # Read each column once. Fetching cells via ``out.iloc[i][col]`` would
        # build a full row Series per cell (slow) and can upcast values across
        # mixed-dtype columns, unlike the column-wise reads used for dirty/clean.
        dirty_vals = [_norm(v) for v in dirty.iloc[:n, j]]
        clean_vals = [_norm(v) for v in clean.iloc[:n, j]]
        if col in out.columns:
            out_vals = [_norm(v) for v in out[col].iloc[:n]]
        else:
            # Column dropped by the system: treat every cell as left untouched.
            out_vals = dirty_vals
        for dv, cv, ov in zip(dirty_vals, clean_vals, out_vals):
            err, chg = dv != cv, ov != dv
            errors += err
            changed += chg
            tp += chg and err and ov == cv
    p = tp / changed if changed else 0.0
    r = tp / errors if errors else 0.0
    f1 = 2 * p * r / (p + r) if (p + r) else 0.0
    return {"f1": f1, "precision": p, "recall": r,
            "_errors": errors, "_changed": changed, "_tp": tp}


def _both(dirty, clean, out) -> dict:
    """Score one repaired table under both metric families.

    Returns a dict with two entries: ``"original"`` (the full ``baran_score``
    result) and ``"churn_neutral"`` (a fixed subset of the keys returned by
    ``score``).
    """
    kept_keys = ("f1", "precision", "recall", "damage",
                 "_errors", "_changed", "_fixed")
    contract = score(dirty, clean, out)
    original = baran_score(dirty, clean, out)
    churn_neutral = {}
    for key in kept_keys:
        churn_neutral[key] = contract[key]
    return {"original": original, "churn_neutral": churn_neutral}


def kendall_tau(xs, ys) -> float:
    """Compute the tie-corrected Kendall tau-b of two equal-length sequences.

    Uses only the standard library. Pairs tied in either sequence count as
    neither concordant nor discordant. Returns 0.0 when the tie-corrected
    denominator is zero (fewer than two items, or one sequence is constant).
    """
    size = len(xs)
    total_pairs = size * (size - 1) // 2
    tied_x = tied_y = concordant = discordant = 0
    for hi in range(1, size):
        for lo in range(hi):
            dx = xs[lo] - xs[hi]
            dy = ys[lo] - ys[hi]
            if dx == 0:
                tied_x += 1
            if dy == 0:
                tied_y += 1
            if dx == 0 or dy == 0:
                continue
            if (dx > 0) == (dy > 0):
                concordant += 1
            else:
                discordant += 1
    scale = ((total_pairs - tied_x) * (total_pairs - tied_y)) ** 0.5
    if not scale:
        return 0.0
    return (concordant - discordant) / scale


def _mean_rows(rows: list[dict]) -> dict:
    return {k: sum(r[k] for r in rows) / len(rows) for k in rows[0]}


def _load_json(path):
    """Parse the JSON document at ``path``, closing the file handle afterwards."""
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


def main() -> None:
    """Cross-score every system under both metrics and write the results.

    Steps, in order:
      1. score the deterministic planners, the verified union and the Baran
         repaired CSVs (seed-mean) on every dataset in ``RAHA``;
      2. add the macro F1 per system and metric family;
      3. check the churn-neutral numbers against ``raha_per_dataset.json``;
      4. compute Kendall tau-b between the rankings induced by the two F1s;
      5. build the calibration block against ``PUBLISHED``;
      6. write ``cross_scoring.json`` under ``RESULTS`` and print the LaTeX table.

    Raises:
        SystemExit: if a dataset has no Baran repaired CSV for some budget, or
            if the acceptance check against the reference file fails.
    """
    out = {
        "protocol": {
            "original": "Raha/Baran convention: value_normalizer (html-unescape + "
                        "whitespace collapse) then raw string equality; P = exact-gold "
                        "repairs / changed cells, R = exact-gold repairs / (dirty->clean "
                        "diff); no churn-neutrality, no damage",
            "churn_neutral": "eval.run_real_multi.score — the scoring contract",
            "baran_rows": "oracle error positions + n gold labels, 3 seeds, seed-mean; "
                          "reconstructed from repaired CSVs (no-op corrections vanish: "
                          "|changed| lower-bounds Baran's output_size)",
            "movies_1": "first 2000 rows (_raha_pair), as everywhere in the suite"},
        "systems": {}}

    # --- deterministic planners: plan -> apply -> score -----------------------
    deterministic = [("grounded", mock_plan),
                     ("openrefine_fingerprint", openrefine_fingerprint_plan),
                     ("openrefine_knn", openrefine_knn_plan)]
    for label, planner in deterministic:
        rows = []
        for name, _dom in RAHA:
            dirty, clean = _raha_pair(name)
            cleaned, _ = apply_plan(dirty, _cell_only(planner(dirty)))
            m = _both(dirty, clean, cleaned)
            rows.append({"dataset": name, **m})
            print(f"  {label:<24}{name:<10} orig={m['original']['f1']:.3f} "
                  f"cn={m['churn_neutral']['f1']:.3f}", flush=True)
        out["systems"][label] = {"per_dataset": rows}

    # --- verified union: stored plan if available, otherwise generated ---------
    rows = []
    for name, _dom in RAHA:
        base = (_load_json(UNION_PLANS[name]) if name in UNION_PLANS
                else _gen_plan(name))
        dirty, clean = _raha_pair(name)
        plan = _repairs_only(union_plans(verify_plan(dirty, base, tau=TAU),
                                         mock_plan(dirty)))
        cleaned, _ = apply_plan(dirty, plan)
        m = _both(dirty, clean, cleaned)
        rows.append({"dataset": name, **m})
        print(f"  {'verified_union':<24}{name:<10} orig={m['original']['f1']:.3f} "
              f"cn={m['churn_neutral']['f1']:.3f}", flush=True)
    out["systems"]["verified_union_v6_tau0.5"] = {"per_dataset": rows}

    # --- Baran: score each seed's repaired CSV, then average over seeds --------
    for n_labels, d in BARAN_DIRS.items():
        rows = []
        for name, _dom in RAHA:
            dirty, clean = _raha_pair(name)
            per_seed = []
            for p in sorted(d.glob(f"{name}_seed*_repaired.csv")):
                repaired = pd.read_csv(p, dtype=str, keep_default_na=False)
                per_seed.append(_both(dirty, clean, repaired))
            if not per_seed:
                # Without this guard the seed-mean fails with a bare IndexError
                # that names neither the dataset nor the directory.
                raise SystemExit(
                    f"no Baran repaired CSVs matching {name}_seed*_repaired.csv in {d}")
            m = {"original": _mean_rows([s["original"] for s in per_seed]),
                 "churn_neutral": _mean_rows([s["churn_neutral"] for s in per_seed])}
            rows.append({"dataset": name, "n_seeds": len(per_seed), **m})
            print(f"  {'baran_oracle%d' % n_labels:<24}{name:<10} "
                  f"orig={m['original']['f1']:.3f} "
                  f"cn={m['churn_neutral']['f1']:.3f}", flush=True)
        out["systems"][f"baran_oracle{n_labels}"] = {"per_dataset": rows}

    # --- macro F1 per system and metric family ---------------------------------
    for system in out["systems"].values():
        for fam in ("original", "churn_neutral"):
            system[f"macro_f1_{fam}"] = _mean_rows(
                [r[fam] for r in system["per_dataset"]])["f1"]

    # acceptance: churn-neutral rows == raha_per_dataset.json (exact)
    ref = _load_json(RESULTS / "raha_per_dataset.json")
    checks = []
    for key, ref_key in [("grounded", "grounded"),
                         ("openrefine_fingerprint", "openrefine_fingerprint"),
                         ("openrefine_knn", "openrefine_knn"),
                         ("verified_union_v6_tau0.5", "verified_union_v6_tau0.5"),
                         ("baran_oracle20", "baran_oracle20")]:
        got_rows = out["systems"][key]["per_dataset"]
        want_rows = ref["systems"][ref_key]["per_dataset"]
        if len(got_rows) != len(want_rows):
            # zip() below stops at the shorter list, so a missing dataset would
            # otherwise slip through the check unnoticed.
            checks.append(False)
            print(f"MISMATCH {key}: {len(got_rows)} datasets scored vs "
                  f"{len(want_rows)} in reference")
        for got, want in zip(got_rows, want_rows):
            for k in ("f1", "precision", "recall", "damage"):
                ok = abs(got["churn_neutral"][k] - want[k]) < 1e-9
                checks.append(ok)
                if not ok:
                    print(f"MISMATCH {key}/{got['dataset']}/{k}: "
                          f"{got['churn_neutral'][k]} vs {want[k]}")
    out["acceptance"] = {"vs": "raha_per_dataset.json", "n_cells": len(checks),
                         "pass": all(checks)}
    print(f"\nacceptance: {sum(checks)}/{len(checks)} cells match "
          f"-> {'PASS' if all(checks) else 'FAIL'}")
    if not all(checks):
        raise SystemExit("acceptance FAILED")

    # Kendall tau-b between system rankings under the two F1s
    primary = ["grounded", "verified_union_v6_tau0.5", "openrefine_fingerprint",
               "openrefine_knn", "baran_oracle20"]
    extended = primary + ["baran_oracle0", "baran_oracle5"]
    taus = {}
    for label, sysset in [("primary", primary), ("extended", extended)]:
        per_ds = {}
        # per_dataset rows were appended in RAHA order, so index i is the dataset.
        for i, (name, _dom) in enumerate(RAHA):
            xs = [out["systems"][s]["per_dataset"][i]["original"]["f1"] for s in sysset]
            ys = [out["systems"][s]["per_dataset"][i]["churn_neutral"]["f1"] for s in sysset]
            per_ds[name] = kendall_tau(xs, ys)
        xs = [out["systems"][s]["macro_f1_original"] for s in sysset]
        ys = [out["systems"][s]["macro_f1_churn_neutral"] for s in sysset]
        taus[label] = {"systems": sysset, "per_dataset": per_ds,
                       "macro": kendall_tau(xs, ys)}
        print(f"tau-b ({label}): macro={taus[label]['macro']:.3f}  " +
              "  ".join(f"{n}={t:.3f}" for n, t in per_ds.items()))
    out["kendall_tau_b"] = taus

    # calibration: our Baran oracle+20 repro (ORIGINAL metric) vs published Table 3
    cal = []
    b20 = {r["dataset"]: r for r in out["systems"]["baran_oracle20"]["per_dataset"]}
    for name, pub in PUBLISHED.items():
        ours = b20[name]["original"]
        cal.append({"dataset": name, "published_f1": pub["f1"],
                    "published_precision": pub["precision"],
                    "published_recall": pub["recall"],
                    "repro_f1": ours["f1"], "repro_precision": ours["precision"],
                    "repro_recall": ours["recall"],
                    "delta_f1": ours["f1"] - pub["f1"]})
        print(f"calibration {name:<10} published F1={pub['f1']:.2f} "
              f"repro F1={ours['f1']:.3f} (d={ours['f1'] - pub['f1']:+.3f})")
    out["calibration"] = {
        "source": "Mahdavi & Abedjan, PVLDB 13(12) p1948, Table 3 row 'Baran' "
                  "(no TL): complete error set given (oracle detection), budget 20, "
                  "mean of 10 runs; PDF read 2026-06-12",
        "notes": "their runs: full datasets, 10 label seeds, Wikipedia value models "
                 "available in package but Table-3 row is without TL; ours: 3 label "
                 "seeds, no pretraining, movies_1 not in their paper; our "
                 "churn-neutral macro for this row is the paper's 0.811",
        "rows": cal}

    dest = RESULTS / "cross_scoring.json"
    # Context manager guarantees the JSON is flushed and the handle closed
    # before the path is reported as written.
    with open(dest, "w", encoding="utf-8") as fh:
        json.dump(out, fh, indent=1)
    print(f"written to {dest}")
    print(latex(out))


# (system key in the results dict, row label printed in the LaTeX table), in
# the order the systems appear in the table.
LABELS = [("grounded", "Grounded (ours, deterministic)"),
          ("verified_union_v6_tau0.5", r"Verified union (v6, $\tau{=}0.5$)"),
          ("openrefine_fingerprint", "OpenRefine fingerprint"),
          ("openrefine_knn", "OpenRefine kNN"),
          ("baran_oracle20", r"Baran (oracle det.\ + 20 labels)")]


def _tex_escape(text: str) -> str:
    """Escape underscores in ``text`` for use in LaTeX text mode."""
    return text.replace("_", r"\_")


def latex(out: dict) -> str:
    """Render the cross-scoring results as a booktabs ``tabular``.

    For every system in ``LABELS`` the table has one row per dataset (original
    P/R/F1 next to churn-neutral P/R/F1 + damage) followed by a macro-F1 row.
    Two closing full-width rows report the primary Kendall tau-b values and the
    calibration of the Baran repro against the published F1.

    Args:
        out: The results dict; must provide ``systems`` (with ``per_dataset``
            and both ``macro_f1_*`` values for every key in ``LABELS``),
            ``kendall_tau_b["primary"]`` and ``calibration["rows"]``.

    Returns:
        The table as a single newline-joined string.
    """
    lines = [r"\begin{tabular}{llrrrrrrr}", r"\toprule",
             r" & & \multicolumn{3}{c}{Original (Baran) metric} & "
             r"\multicolumn{4}{c}{Churn-neutral (ours)} \\",
             r"\cmidrule(lr){3-5}\cmidrule(lr){6-9}",
             r"System & Dataset & Prec. & Rec. & F1 & Prec. & Rec. & F1 & Damage \\",
             r"\midrule"]
    for key, label in LABELS:
        system = out["systems"][key]
        for i, r in enumerate(system["per_dataset"]):
            o, c = r["original"], r["churn_neutral"]
            # The escape is done through a helper rather than inline: a
            # backslash inside an f-string replacement field is a SyntaxError
            # before Python 3.12.
            dataset = _tex_escape(r["dataset"])
            # The system label is printed only on the first row of its group.
            lines.append(f"{label if i == 0 else ''} & "
                         f"{dataset} & "
                         f"{o['precision']:.3f} & {o['recall']:.3f} & {o['f1']:.3f} & "
                         f"{c['precision']:.3f} & {c['recall']:.3f} & {c['f1']:.3f} & "
                         f"{c['damage']:.3f} \\\\")
        lines.append(f" & \\emph{{macro}} &  &  & "
                     f"\\emph{{{system['macro_f1_original']:.3f}}} &  &  & "
                     f"\\emph{{{system['macro_f1_churn_neutral']:.3f}}} &  \\\\")
        lines.append(r"\midrule")
    t = out["kendall_tau_b"]["primary"]
    per_dataset = ", ".join(f"{_tex_escape(n)} {v:.2f}"
                            for n, v in t["per_dataset"].items())
    lines.append(r"\multicolumn{9}{l}{Kendall $\tau_b$ between system rankings: "
                 f"macro {t['macro']:.2f}; per dataset " + per_dataset + r"} \\")
    cal = ", ".join(f"{_tex_escape(r['dataset'])} {r['repro_f1']:.3f} vs "
                    f"{r['published_f1']:.2f}" for r in out["calibration"]["rows"])
    lines.append(r"\multicolumn{9}{l}{Calibration, original metric (our Baran oracle+20 "
                 r"repro vs PVLDB'20 Table~3): " + cal + r"} \\")
    lines.append(r"\bottomrule")
    lines.append(r"\end{tabular}")
    return "\n".join(lines)


# Script entry point: run the full cross-scoring pipeline when executed directly
# (e.g. ``python -m eval.cross_scoring``); importing the module does not run it.
if __name__ == "__main__":
    main()