Spaces:

build-small-hackathon
/

scrubdata

Running

scrubdata / eval /radar_bench.py

OpenAI Codex

deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build

16dc556 16 days ago

3.43 kB

	"""RADAR mini-board: shipped pipeline vs RADAR's perturbed tables, by artifact type.

	RADAR (kenqgu/RADAR, CC-BY-4.0; table-QA under data artifacts) ships, per example,
	the perturbed table AND the gold recovery transform (overwrite_cells + drop_rows) —
	so dirty/clean pairs are derivable exactly. We score the shipped deterministic
	pipeline per table with the churn-neutral metric and aggregate by artifact_type:
	which artifact classes the system repairs, where it abstains, what it damages.

	uv run python -m eval.radar_bench --n 150
	"""

	from __future__ import annotations

	import argparse
	import collections
	import json
	from pathlib import Path

	import pandas as pd

	from scrubdata.executor import apply_plan
	from scrubdata.planner import mock_plan

	from .run_real_multi import _cell_only, score

	# Directory two levels above this file; main() writes its JSON report under
	# ROOT / "eval" / "results".
	ROOT = Path(__file__).resolve().parent.parent


	def derive_pair(ex) -> tuple[pd.DataFrame, pd.DataFrame]:
	    """Rebuild the (dirty, clean) table pair for one RADAR example.

	    Args:
	        ex: Mapping-like example (a dict or a pandas row) providing
	            ``ex["table"]`` with ``"headers"`` and ``"rows"``, and optionally
	            ``"recovered_tables_transform_spec"`` with ``"overwrite_cells"``
	            and ``"drop_rows"`` (each a sequence of groups, or ``None``).

	    Returns:
	        ``(dirty, clean)``: two all-string DataFrames of identical shape.
	        ``clean`` is ``dirty`` with the first ``overwrite_cells`` group
	        applied. Rows listed in any ``drop_rows`` group are removed from
	        *both* frames so they stay row-aligned.
	    """
	    table = ex["table"]
	    # Rows may arrive as a numpy object-array of arrays; normalise to lists.
	    records = [list(r) for r in table["rows"]]
	    dirty = pd.DataFrame(records, columns=list(table["headers"])).astype(str)
	    clean = dirty.copy()
	    spec = ex.get("recovered_tables_transform_spec") or {}
	    n_rows = len(clean)

	    overwrite_groups = spec.get("overwrite_cells")
	    groups = list(overwrite_groups) if overwrite_groups is not None else []
	    # Only the first group is applied (the first consistent recovery group).
	    for cell in (groups[0] if groups else ()):
	        row, col, value = int(cell["row"]), cell["col"], cell["new_value"]
	        # Silently ignore cells that point outside the table.
	        if col in clean.columns and 0 <= row < n_rows:
	            clean.iat[row, clean.columns.get_loc(col)] = str(value)

	    drop_groups = spec.get("drop_rows")
	    # Union of every drop group, kept as a set for O(1) membership below.
	    drops = {
	        int(r)
	        for grp in (drop_groups if drop_groups is not None else ())
	        for r in grp
	    }
	    if drops:
	        keep = [i for i in range(n_rows) if i not in drops]
	        # Row-drop class: remove the same rows from both frames.
	        dirty = dirty.iloc[keep].reset_index(drop=True)
	        clean = clean.iloc[keep].reset_index(drop=True)
	    return dirty, clean


	def main() -> None:
	    """Score the shipped pipeline on RADAR tables and report by artifact type.

	    Reads the RADAR test parquet, scans at most ``--n`` examples with a
	    per-``artifact_type`` cap of ``max(8, n // number_of_types)``, runs
	    ``apply_plan`` with the cell-only mock plan on each derived dirty table,
	    and scores the result against the derived clean table. Per-type mean F1
	    and damage are printed and written to ``eval/results/radar_bench.json``.

	    Examples that raise during derivation or scoring are skipped
	    (best-effort); their count is reported on stderr.
	    """
	    import sys  # local import: only needed for the skipped-example notice

	    ap = argparse.ArgumentParser()
	    ap.add_argument("--n", type=int, default=150)
	    args = ap.parse_args()
	    ds = pd.read_parquet("hf://datasets/kenqgu/RADAR/radar/test-00000-of-00001.parquet")

	    # Loop-invariant: compute the per-type cap once (guarding an empty split).
	    n_types = ds["artifact_type"].nunique()
	    per_type_cap = max(8, args.n // n_types) if n_types else 8

	    by_type = collections.defaultdict(list)
	    seen = collections.Counter()
	    scanned = 0
	    failed = 0
	    for _, ex in ds.iterrows():
	        # Check the budget first so skipped/failed examples cannot overshoot it.
	        if scanned >= args.n:
	            break
	        at = ex["artifact_type"]
	        if seen[at] >= per_type_cap:
	            continue
	        seen[at] += 1
	        scanned += 1
	        try:
	            dirty, clean = derive_pair(ex)
	            if dirty.empty or dirty.shape != clean.shape:
	                continue
	            cleaned, _ = apply_plan(dirty, _cell_only(mock_plan(dirty)))
	            m = score(dirty, clean, cleaned)
	            # NOTE(review): "_errors" is presumably the count of erroneous
	            # cells in the dirty table; confirm against run_real_multi.score.
	            # Tables where it is zero are not aggregated.
	            if m["_errors"] > 0:
	                by_type[at].append((m["f1"], m["damage"]))
	        except Exception:  # noqa: BLE001
	            # Deliberately best-effort: one bad example must not abort the run.
	            failed += 1
	            continue

	    rows = []
	    for at, ms in sorted(by_type.items()):
	        f1 = sum(f for f, _ in ms) / len(ms)
	        dmg = sum(d for _, d in ms) / len(ms)
	        rows.append({"artifact_type": at, "tables": len(ms),
	                     "macro_f1": round(f1, 3), "macro_damage": round(dmg, 4)})
	        print(f" {at:<28} n={len(ms):<4} F1={f1:.3f} dmg={dmg:.4f}")

	    out_path = ROOT / "eval" / "results" / "radar_bench.json"
	    # Create the results directory on first run and close the file promptly.
	    out_path.parent.mkdir(parents=True, exist_ok=True)
	    with out_path.open("w", encoding="utf-8") as fh:
	        json.dump(rows, fh, indent=1)
	    print(f"-> eval/results/radar_bench.json ({sum(seen.values())} examples scanned)")
	    if failed:
	        print(f"   {failed} example(s) raised and were skipped", file=sys.stderr)


	# Script entry point: run the benchmark only when executed directly
	# (e.g. `python -m eval.radar_bench`), not when imported.
	if __name__ == "__main__":
	    main()