Spaces:
Running
Running
| """RADAR mini-board: shipped pipeline vs RADAR's perturbed tables, by artifact type. | |
| RADAR (kenqgu/RADAR, CC-BY-4.0; table-QA under data artifacts) ships, per example, | |
| the perturbed table AND the gold recovery transform (overwrite_cells + drop_rows) — | |
| so dirty/clean pairs are derivable exactly. We score the shipped deterministic | |
| pipeline per table with the churn-neutral metric and aggregate by artifact_type: | |
| which artifact classes the system repairs, where it abstains, what it damages. | |
| uv run python -m eval.radar_bench --n 150 | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import collections | |
| import json | |
| from pathlib import Path | |
| import pandas as pd | |
| from scrubdata.executor import apply_plan | |
| from scrubdata.planner import mock_plan | |
| from .run_real_multi import _cell_only, score | |
| ROOT = Path(__file__).resolve().parent.parent | |
| def derive_pair(ex): | |
| t = ex["table"] | |
| rows = [list(r) for r in t["rows"]] # numpy object-array -> lists | |
| dirty = pd.DataFrame(rows, columns=list(t["headers"])).astype(str) | |
| spec = ex.get("recovered_tables_transform_spec") or {} | |
| clean = dirty.copy() | |
| oc = spec.get("overwrite_cells") | |
| groups = list(oc) if oc is not None else [] | |
| cells = list(groups[0]) if len(groups) else [] # first consistent recovery group | |
| for cell in cells: | |
| r, c, v = int(cell["row"]), cell["col"], cell["new_value"] | |
| if c in clean.columns and 0 <= r < len(clean): | |
| clean.iat[r, clean.columns.get_loc(c)] = str(v) | |
| dr = spec.get("drop_rows") | |
| drops = sorted({int(r) for grp in (list(dr) if dr is not None else []) | |
| for r in list(grp)}) | |
| if drops: | |
| keep = [i for i in range(len(clean)) if i not in drops] | |
| dirty = dirty.iloc[keep].reset_index(drop=True) # row-drop class: align both | |
| clean = clean.iloc[keep].reset_index(drop=True) | |
| return dirty, clean | |
| def main() -> None: | |
| ap = argparse.ArgumentParser() | |
| ap.add_argument("--n", type=int, default=150) | |
| args = ap.parse_args() | |
| ds = pd.read_parquet("hf://datasets/kenqgu/RADAR/radar/test-00000-of-00001.parquet") | |
| by_type = collections.defaultdict(list) | |
| seen = collections.Counter() | |
| for _, ex in ds.iterrows(): | |
| at = ex["artifact_type"] | |
| if seen[at] >= max(8, args.n // ds["artifact_type"].nunique()): | |
| continue | |
| seen[at] += 1 | |
| try: | |
| dirty, clean = derive_pair(ex) | |
| if dirty.empty or dirty.shape != clean.shape: | |
| continue | |
| cleaned, _ = apply_plan(dirty, _cell_only(mock_plan(dirty))) | |
| m = score(dirty, clean, cleaned) | |
| if m["_errors"] > 0: | |
| by_type[at].append((m["f1"], m["damage"])) | |
| except Exception: # noqa: BLE001 | |
| continue | |
| if sum(seen.values()) >= args.n: | |
| break | |
| rows = [] | |
| for at, ms in sorted(by_type.items()): | |
| f1 = sum(f for f, _ in ms) / len(ms) | |
| dmg = sum(d for _, d in ms) / len(ms) | |
| rows.append({"artifact_type": at, "tables": len(ms), | |
| "macro_f1": round(f1, 3), "macro_damage": round(dmg, 4)}) | |
| print(f" {at:<28} n={len(ms):<4} F1={f1:.3f} dmg={dmg:.4f}") | |
| json.dump(rows, open(ROOT / "eval" / "results" / "radar_bench.json", "w"), indent=1) | |
| print(f"-> eval/results/radar_bench.json ({sum(seen.values())} examples scanned)") | |
| if __name__ == "__main__": | |
| main() | |