scrubdata / eval /radar_bench.py
OpenAI Codex
deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build
16dc556
Raw
History Blame Contribute Delete
3.43 kB
"""RADAR mini-board: shipped pipeline vs RADAR's perturbed tables, by artifact type.
RADAR (kenqgu/RADAR, CC-BY-4.0; table-QA under data artifacts) ships, per example,
the perturbed table AND the gold recovery transform (overwrite_cells + drop_rows) —
so dirty/clean pairs are derivable exactly. We score the shipped deterministic
pipeline per table with the churn-neutral metric and aggregate by artifact_type:
which artifact classes the system repairs, where it abstains, what it damages.
Usage: uv run python -m eval.radar_bench --n 150
"""
from __future__ import annotations
import argparse
import collections
import json
from pathlib import Path
import pandas as pd
from scrubdata.executor import apply_plan
from scrubdata.planner import mock_plan
from .run_real_multi import _cell_only, score
# Directory two levels above this file; main() writes its results under
# ROOT / "eval" / "results".
ROOT = Path(__file__).resolve().parent.parent
def derive_pair(ex) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Derive an aligned (dirty, clean) table pair from one RADAR example.

    The dirty table is ``ex["table"]`` (``headers`` + ``rows``) with every cell
    cast to ``str``. The clean table is a copy of it with the gold recovery
    transform from ``ex["recovered_tables_transform_spec"]`` applied:

    * ``overwrite_cells`` -- only the FIRST group is applied; each entry is a
      mapping with ``row`` (positional index), ``col`` (column label) and
      ``new_value``. Entries naming an unknown column or an out-of-range row
      are skipped.
    * ``drop_rows`` -- the union of the row indices of ALL groups is removed
      from both tables, so the two frames keep the same shape and index.

    Args:
        ex: Mapping-like example (anything supporting ``[]`` and ``.get``,
            e.g. a ``pandas.Series`` row or a plain ``dict``).

    Returns:
        ``(dirty, clean)`` DataFrames of identical shape with a fresh
        ``0..n-1`` index.
    """
    table = ex["table"]
    # Rows may arrive as numpy object arrays; normalise them to plain lists.
    rows = [list(row) for row in table["rows"]]
    dirty = pd.DataFrame(rows, columns=list(table["headers"])).astype(str)
    clean = dirty.copy()
    n_rows = len(clean)

    spec = ex.get("recovered_tables_transform_spec") or {}

    overwrite_groups = spec.get("overwrite_cells")
    # `is None` rather than truthiness: the value may be a numpy array, whose
    # truth value is ambiguous. list() makes the emptiness test below safe.
    overwrite_groups = [] if overwrite_groups is None else list(overwrite_groups)
    first_group = overwrite_groups[0] if overwrite_groups else []
    for cell in first_group:
        row_idx, col, value = int(cell["row"]), cell["col"], cell["new_value"]
        if col in clean.columns and 0 <= row_idx < n_rows:
            clean.iat[row_idx, clean.columns.get_loc(col)] = str(value)

    drop_groups = spec.get("drop_rows")
    if drop_groups is None:
        drop_groups = []
    # NOTE(review): overwrites use only the first group while drops use the
    # union of every group -- confirm this asymmetry is intended.
    # A set gives O(1) membership tests when filtering the rows below.
    drops = {int(r) for group in drop_groups for r in group}
    if drops:
        keep = [i for i in range(n_rows) if i not in drops]
        # Drop the same positions from both frames so they stay aligned.
        dirty = dirty.iloc[keep].reset_index(drop=True)
        clean = clean.iloc[keep].reset_index(drop=True)
    return dirty, clean
def main() -> None:
    """Score the pipeline on RADAR test tables and report results per artifact type.

    Reads the RADAR test parquet, scans at most ``--n`` examples (capped per
    ``artifact_type``), derives a dirty/clean pair for each via
    ``derive_pair``, runs ``apply_plan`` with the cell-only ``mock_plan`` on
    the dirty table, and scores the result with ``score``. Per-type mean F1
    and mean damage are printed and written to
    ``ROOT / "eval" / "results" / "radar_bench.json"``.

    Examples that raise anywhere in the pipeline are skipped (best-effort),
    but are counted and reported instead of being dropped silently.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--n", type=int, default=150)
    args = ap.parse_args()

    ds = pd.read_parquet("hf://datasets/kenqgu/RADAR/radar/test-00000-of-00001.parquet")

    # The per-type cap is loop-invariant, so compute it once. max(1, ...)
    # guards the division when the dataset has no artifact types at all.
    n_types = ds["artifact_type"].nunique()
    per_type_cap = max(8, args.n // max(1, n_types))

    by_type = collections.defaultdict(list)
    seen = collections.Counter()
    scanned = 0
    failed = 0
    for _, ex in ds.iterrows():
        # Check the budget at the top so that every path through the loop body
        # (including the `continue`s below) respects --n.
        if scanned >= args.n:
            break
        at = ex["artifact_type"]
        if seen[at] >= per_type_cap:
            continue
        seen[at] += 1
        scanned += 1
        try:
            dirty, clean = derive_pair(ex)
            if dirty.empty or dirty.shape != clean.shape:
                continue
            cleaned, _ = apply_plan(dirty, _cell_only(mock_plan(dirty)))
            m = score(dirty, clean, cleaned)
            # Only tables for which score() reports a positive "_errors" count
            # are aggregated (presumably tables with nothing to repair carry
            # no signal -- confirm against score()).
            if m["_errors"] > 0:
                by_type[at].append((m["f1"], m["damage"]))
        except Exception:  # noqa: BLE001
            # Deliberate best-effort: one bad example must not abort the run.
            failed += 1
            continue

    rows = []
    for at, ms in sorted(by_type.items()):
        f1 = sum(f for f, _ in ms) / len(ms)
        dmg = sum(d for _, d in ms) / len(ms)
        rows.append({"artifact_type": at, "tables": len(ms),
                     "macro_f1": round(f1, 3), "macro_damage": round(dmg, 4)})
        print(f" {at:<28} n={len(ms):<4} F1={f1:.3f} dmg={dmg:.4f}")

    out_path = ROOT / "eval" / "results" / "radar_bench.json"
    # Create the results directory on first run and close the file deterministically.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as fh:
        json.dump(rows, fh, indent=1)

    if failed:
        print(f" {failed} examples skipped after raising in the pipeline")
    print(f"-> eval/results/radar_bench.json ({scanned} examples scanned)")
# Run the benchmark only when this module is executed as a script
# (for example via `python -m eval.radar_bench`), not when it is imported.
if __name__ == "__main__":
    main()