Spaces:

build-small-hackathon
/

scrubdata

Running

OpenAI Codex

deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build

16dc556 19 days ago

4.44 kB

	"""WS4 baseline: Baran (Mahdavi & Abedjan, VLDB 2020) on the Raha real-error slice.

	Runs Baran in its OWN reference configuration (the package's __main__ example):
	oracle error positions from the dirty/gold diff + LABELING_BUDGET gold-labeled tuples
	(auto-sampled), no Wikipedia-pretrained value models. This is an UPPER BOUND under a
	strictly more informed protocol than ours (we are zero-label, no oracle detection) —
	disclosed in the paper. With oracle detection Baran only edits true-error cells, so its
	damage rate is NEAR-zero structurally — but not exactly 0: raha normalizes values at
	CSV load (html-unescape + whitespace collapse), so its repaired output differs from the
	raw-loaded dirty table at cells it never corrected (measured churn-neutral damage:
	hospital 0.004, rayyan 0.010 — see eval/cross_scoring.py).

	STANDALONE on purpose: stdlib + pandas + raha only — it runs inside a pinned ephemeral
	env (raha 1.26 is 2023 code), never importing scrubdata:

	uv run --python 3.10 --with "raha==1.26" --with "numpy<2" --with "pandas<2.1" \
	--with "scikit-learn<1.4" python eval/run_baran.py

	Outputs eval/results/baran/<name>_seed<k>_repaired.csv; scored in the main env by
	eval/baselines_learned.py under the identical churn-neutral protocol.
	"""

	from __future__ import annotations

	import argparse
	import os
	import random
	import tempfile
	import urllib.request
	from pathlib import Path

	import pandas as pd

	# Dataset names processed by default (the default value of --datasets in main()).
	DATASETS = ["hospital", "beers", "flights", "rayyan", "movies_1"]
	# Base URL that _fetch() downloads <name>/dirty.csv and <name>/clean.csv from.
	RAW = "https://raw.githubusercontent.com/BigDaMa/raha/master/datasets"
	# Parent of the directory containing this file; _fetch() caches under BASE/data/real/<name>.
	BASE = Path(__file__).resolve().parent.parent


	def _fetch(name: str) -> tuple[pd.DataFrame, pd.DataFrame]:
	    """Return the (dirty, clean) table pair for dataset *name*, downloading on first use.

	    Same fetch + movies_1 truncation as eval.run_real_multi._raha_pair (duplicated
	    so this file never imports scrubdata inside the pinned env).

	    The two CSVs are cached under ``BASE/data/real/<name>/`` and fetched from
	    ``RAW/<name>/`` only when the cached file is missing. Both are read with every
	    column as ``str`` and without NA conversion, so cell text is kept as written.
	    When the dirty table has more than 2200 rows, both tables are cut to their
	    first 2000 rows.

	    Args:
	        name: Dataset directory name, used both in the URL and in the cache path.

	    Returns:
	        ``(dirty, clean)`` DataFrames.

	    Raises:
	        Whatever ``urllib.request.urlretrieve`` raises when a download fails; no
	        partial file is left in the cache in that case.
	    """
	    cache_dir = BASE / "data" / "real" / name
	    cache_dir.mkdir(parents=True, exist_ok=True)
	    frames = []
	    for fn in ("dirty.csv", "clean.csv"):
	        p = cache_dir / fn
	        if not p.exists():
	            # Download next to the target and rename into place: an interrupted
	            # download must never leave a truncated file that later runs would
	            # mistake for a complete cache entry.
	            part = p.with_name(fn + ".part")
	            try:
	                urllib.request.urlretrieve(f"{RAW}/{name}/{fn}", part)
	            except BaseException:
	                part.unlink(missing_ok=True)
	                raise
	            os.replace(part, p)
	        frames.append(pd.read_csv(p, dtype=str, keep_default_na=False))
	    dirty, clean = frames
	    if len(dirty) > 2200:
	        # Keep the two tables row-aligned: truncate both with the same cut.
	        dirty = dirty.head(2000).reset_index(drop=True)
	        clean = clean.head(2000).reset_index(drop=True)
	    return dirty, clean


	def baran_repair(dirty_csv: str, clean_csv: str, name: str,
	                 n_labels: int = 20, seed: int = 0) -> pd.DataFrame:
	    """Run Baran in its official reference config and return the repaired table.

	    Args:
	        dirty_csv: Path of the dirty CSV handed to ``raha.dataset.Dataset``.
	        clean_csv: Path of the gold CSV handed to ``raha.dataset.Dataset``.
	        name: Dataset name passed through to raha.
	        n_labels: Value assigned to the corrector's ``LABELING_BUDGET``.
	        seed: Seed applied to both ``random`` and ``numpy.random`` before the run.

	    Returns:
	        A copy of the dataset's dataframe with every returned correction written in.
	    """
	    import numpy as np
	    import raha

	    # Seed both RNG sources up front so the run is repeatable per seed.
	    np.random.seed(seed)
	    random.seed(seed)

	    dataset = raha.dataset.Dataset(
	        {"name": name, "path": dirty_csv, "clean_path": clean_csv}
	    )
	    # oracle detection: the detected cells are exactly the actual errors
	    dataset.detected_cells = dict(dataset.get_actual_errors_dictionary())

	    corrector = raha.correction.Correction()
	    corrector.VERBOSE = False
	    corrector.SAVE_RESULTS = False
	    corrector.LABELING_BUDGET = n_labels

	    fixes = corrector.run(dataset)  # {(row, col_idx): value}
	    repaired = dataset.dataframe.copy()
	    for cell, value in fixes.items():
	        row, col = cell
	        repaired.iat[row, col] = value
	    return repaired


	def main() -> None:
	    """CLI entry point: run Baran for each requested dataset and seed, saving repairs.

	    Flags:
	        --out: Output directory (created if missing).
	        --seeds: Comma-separated integer seeds.
	        --datasets: Comma-separated dataset names.
	        --n-labels: Labeling budget forwarded to ``baran_repair``.

	    Each result goes to ``<out>/<name>_seed<k>_repaired.csv``. Outputs that already
	    exist are skipped, so an interrupted run can simply be restarted.
	    """
	    ap = argparse.ArgumentParser()
	    ap.add_argument("--out", default="eval/results/baran")
	    ap.add_argument("--seeds", default="0,1,2")
	    ap.add_argument("--datasets", default=",".join(DATASETS))
	    ap.add_argument("--n-labels", type=int, default=20)
	    args = ap.parse_args()
	    out_dir = Path(args.out)
	    out_dir.mkdir(parents=True, exist_ok=True)
	    # Tolerate "a, b" and trailing commas: strip entries and drop empty ones so a
	    # stray space never becomes part of a dataset name or breaks int().
	    seeds = [int(s) for s in args.seeds.split(",") if s.strip()]
	    names = [n.strip() for n in args.datasets.split(",") if n.strip()]
	    for name in names:
	        targets = [(seed, out_dir / f"{name}_seed{seed}_repaired.csv") for seed in seeds]
	        if all(dest.exists() for _, dest in targets):
	            # Nothing left to do for this dataset: skip the fetch and temp copies.
	            for _, dest in targets:
	                print(f"skip {dest} (exists)", flush=True)
	            continue
	        dirty, clean = _fetch(name)
	        with tempfile.TemporaryDirectory() as td:  # Dataset wants file paths
	            dp, cp = os.path.join(td, "dirty.csv"), os.path.join(td, "clean.csv")
	            dirty.to_csv(dp, index=False)
	            clean.to_csv(cp, index=False)
	            for seed, dest in targets:
	                if dest.exists():
	                    print(f"skip {dest} (exists)", flush=True)
	                    continue
	                print(f"baran: {name} seed={seed} ...", flush=True)
	                repaired = baran_repair(dp, cp, name, n_labels=args.n_labels, seed=seed)
	                # Write to a temp sibling and rename: a partially written result
	                # must never satisfy the exists() check on a later run.
	                tmp = dest.with_name(dest.name + ".tmp")
	                repaired.to_csv(tmp, index=False)
	                os.replace(tmp, dest)
	                print(f" -> {dest}", flush=True)
	    print("baran runs complete")


	# Run the CLI only when this file is executed as a script, not when it is imported.
	if __name__ == "__main__":
	main()