scrubdata / eval /run_baran.py
OpenAI Codex
deploy: add sponsor:openai tag (Best Use of Codex) + Codex-hardened build
16dc556
Raw
History Blame Contribute Delete
4.44 kB
"""WS4 baseline: Baran (Mahdavi & Abedjan, VLDB 2020) on the Raha real-error slice.
Runs Baran in its OWN reference configuration (the package's __main__ example):
oracle error positions from the dirty/gold diff + LABELING_BUDGET gold-labeled tuples
(auto-sampled), no Wikipedia-pretrained value models. This is an UPPER BOUND under a
strictly more informed protocol than ours (we are zero-label, no oracle detection) —
disclosed in the paper. With oracle detection Baran only edits true-error cells, so its
damage rate is NEAR-zero structurally — but not exactly 0: raha normalizes values at
CSV load (html-unescape + whitespace collapse), so its repaired output differs from the
raw-loaded dirty table at cells it never corrected (measured churn-neutral damage:
hospital 0.004, rayyan 0.010 β€” see eval/cross_scoring.py).
STANDALONE on purpose: stdlib + pandas + raha only β€” it runs inside a pinned ephemeral
env (raha 1.26 is 2023 code), never importing scrubdata:
uv run --python 3.10 --with "raha==1.26" --with "numpy<2" --with "pandas<2.1" \
--with "scikit-learn<1.4" python eval/run_baran.py
Outputs eval/results/baran/<name>_seed<k>_repaired.csv; scored in the main env by
eval/baselines_learned.py under the identical churn-neutral protocol.
"""
from __future__ import annotations
import argparse
import os
import random
import tempfile
import urllib.request
from pathlib import Path
import pandas as pd
# Dataset names processed when --datasets is not given; each is also the name of the
# sub-directory that _fetch downloads from under RAW.
DATASETS = ["hospital", "beers", "flights", "rayyan", "movies_1"]
# Base URL of the dataset folders; _fetch requests {RAW}/<name>/dirty.csv and clean.csv.
RAW = "https://raw.githubusercontent.com/BigDaMa/raha/master/datasets"
# Directory two levels above this file; _fetch caches downloads in BASE/data/real/<name>/.
BASE = Path(__file__).resolve().parent.parent
def _fetch(name: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Return the (dirty, clean) tables of Raha dataset *name* as all-string frames.

    Same fetch + truncation as eval.run_real_multi._raha_pair (duplicated so this
    file never imports scrubdata inside the pinned env).

    Each CSV is cached under ``BASE/data/real/<name>/`` and downloaded from ``RAW``
    only when the cached copy is missing. The download is written to a sibling
    ``.part`` file and renamed into place only after it completes, so an interrupted
    transfer is not mistaken for a valid cached CSV by a later run.

    When the dirty table has more than 2200 rows, both tables are cut to their
    first 2000 rows (with a fresh 0-based index).
    """
    cache_dir = BASE / "data" / "real" / name
    cache_dir.mkdir(parents=True, exist_ok=True)
    frames = []
    for fn in ("dirty.csv", "clean.csv"):
        p = cache_dir / fn
        if not p.exists():
            # Download next to the target, then rename: the existence check above
            # must only ever see fully downloaded files.
            partial = p.with_name(fn + ".part")
            urllib.request.urlretrieve(f"{RAW}/{name}/{fn}", partial)
            partial.replace(p)
        # Read every cell as text and do not turn strings such as "NA" or "" into NaN.
        frames.append(pd.read_csv(p, dtype=str, keep_default_na=False))
    dirty, clean = frames
    if len(dirty) > 2200:
        dirty = dirty.head(2000).reset_index(drop=True)
        clean = clean.head(2000).reset_index(drop=True)
    return dirty, clean
def baran_repair(dirty_csv: str, clean_csv: str, name: str,
                 n_labels: int = 20, seed: int = 0) -> pd.DataFrame:
    """Run Baran in its reference configuration and return the repaired table.

    Args:
        dirty_csv: Path of the dirty CSV handed to ``raha.dataset.Dataset``.
        clean_csv: Path of the matching clean (gold) CSV.
        name: Dataset name passed through to raha.
        n_labels: Value assigned to the corrector's ``LABELING_BUDGET``.
        seed: Seed applied to both ``random`` and ``numpy.random`` before the run.

    Returns:
        A copy of the dataset's dataframe as loaded by raha, with every correction
        reported by the run written into its (row, column-index) cell.
    """
    # Imported lazily: numpy and raha are only needed when a repair actually runs.
    import numpy as np
    import raha

    random.seed(seed)
    np.random.seed(seed)

    dataset = raha.dataset.Dataset(
        {"name": name, "path": dirty_csv, "clean_path": clean_csv}
    )
    # Oracle detection: the detected cells are exactly the true dirty/gold differences.
    dataset.detected_cells = dict(dataset.get_actual_errors_dictionary())

    corrector = raha.correction.Correction()
    corrector.LABELING_BUDGET = n_labels
    corrector.SAVE_RESULTS = False
    corrector.VERBOSE = False

    # Mapping of (row, col_idx) -> corrected value.
    fixes = corrector.run(dataset)

    repaired = dataset.dataframe.copy()
    for cell, value in fixes.items():
        row, col = cell
        repaired.iat[row, col] = value
    return repaired
def main() -> None:
    """Run Baran for every requested dataset/seed pair and save the repaired CSVs.

    Command-line options:
        --out       output directory (default ``eval/results/baran``)
        --seeds     comma-separated integer seeds (default ``0,1,2``)
        --datasets  comma-separated dataset names (default: all of ``DATASETS``)
        --n-labels  labeling budget forwarded to ``baran_repair`` (default 20)

    Each result goes to ``<out>/<name>_seed<k>_repaired.csv``. A result file that
    already exists is skipped, so an interrupted sweep can simply be re-run. To keep
    that resume logic trustworthy, results are written to a ``.part`` file first and
    renamed into place only once the write has finished.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", default="eval/results/baran")
    ap.add_argument("--seeds", default="0,1,2")
    ap.add_argument("--datasets", default=",".join(DATASETS))
    ap.add_argument("--n-labels", type=int, default=20)
    args = ap.parse_args()

    out_dir = Path(args.out)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Tolerate "0, 1, 2" and trailing commas: strip tokens and drop empty ones.
    seeds = [int(s) for s in args.seeds.split(",") if s.strip()]
    names = [n.strip() for n in args.datasets.split(",") if n.strip()]

    for name in names:
        targets = [(seed, out_dir / f"{name}_seed{seed}_repaired.csv") for seed in seeds]

        # Nothing left to compute for this dataset: skip the download and the
        # temporary CSV round-trip entirely.
        if all(dest.exists() for _, dest in targets):
            for _, dest in targets:
                print(f"skip {dest} (exists)", flush=True)
            continue

        dirty, clean = _fetch(name)
        with tempfile.TemporaryDirectory() as td:  # Dataset wants file paths
            dp, cp = os.path.join(td, "dirty.csv"), os.path.join(td, "clean.csv")
            dirty.to_csv(dp, index=False)
            clean.to_csv(cp, index=False)
            for seed, dest in targets:
                if dest.exists():
                    print(f"skip {dest} (exists)", flush=True)
                    continue
                print(f"baran: {name} seed={seed} ...", flush=True)
                repaired = baran_repair(dp, cp, name, n_labels=args.n_labels, seed=seed)
                # Write-then-rename so a crash mid-write never leaves a truncated
                # file that the exists() check above would treat as finished.
                partial = dest.with_name(dest.name + ".part")
                repaired.to_csv(partial, index=False)
                partial.replace(dest)
                print(f" -> {dest}", flush=True)
    print("baran runs complete")


# Run the sweep only when executed as a script, not when imported.
if __name__ == "__main__":
    main()