"""WS4 baseline: Baran (Mahdavi & Abedjan, VLDB 2020) on the Raha real-error slice.

Runs Baran in its OWN reference configuration (the package's __main__ example):
oracle error positions from the dirty/gold diff + LABELING_BUDGET gold-labeled tuples
(auto-sampled), no Wikipedia-pretrained value models. This is an UPPER BOUND under a
strictly more informed protocol than ours (we are zero-label, no oracle detection) —
disclosed in the paper. With oracle detection Baran only edits true-error cells, so its
damage rate is NEAR-zero structurally — but not exactly 0: raha normalizes values at
CSV load (html-unescape + whitespace collapse), so its repaired output differs from the
raw-loaded dirty table at cells it never corrected (measured churn-neutral damage:
hospital 0.004, rayyan 0.010 — see eval/cross_scoring.py).

STANDALONE on purpose: stdlib + pandas + raha only — it runs inside a pinned ephemeral
env (raha 1.26 is 2023 code), never importing scrubdata:

    uv run --python 3.10 --with "raha==1.26" --with "numpy<2" --with "pandas<2.1" \
      --with "scikit-learn<1.4" python eval/run_baran.py

Outputs eval/results/baran/<name>_seed<k>_repaired.csv; scored in the main env by
eval/baselines_learned.py under the identical churn-neutral protocol.
"""

from __future__ import annotations

import argparse
import os
import random
import tempfile
import urllib.request
from pathlib import Path

import pandas as pd

DATASETS = ["hospital", "beers", "flights", "rayyan", "movies_1"]
RAW = "https://raw.githubusercontent.com/BigDaMa/raha/master/datasets"
BASE = Path(__file__).resolve().parent.parent


def _fetch(name: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Same fetch + movies_1 truncation as eval.run_real_multi._raha_pair (duplicated
    so this file never imports scrubdata inside the pinned env)."""
    d = BASE / "data" / "real" / name
    d.mkdir(parents=True, exist_ok=True)
    out = []
    for fn in ("dirty.csv", "clean.csv"):
        p = d / fn
        if not p.exists():
            urllib.request.urlretrieve(f"{RAW}/{name}/{fn}", p)
        out.append(pd.read_csv(p, dtype=str, keep_default_na=False))
    dirty, clean = out
    if len(dirty) > 2200:
        dirty, clean = dirty.head(2000).reset_index(drop=True), clean.head(2000).reset_index(drop=True)
    return dirty, clean


def baran_repair(dirty_csv: str, clean_csv: str, name: str,
                 n_labels: int = 20, seed: int = 0) -> pd.DataFrame:
    """Official Baran reference config; returns the repaired DataFrame."""
    import numpy as np
    import raha
    random.seed(seed)
    np.random.seed(seed)
    data = raha.dataset.Dataset({"name": name, "path": dirty_csv, "clean_path": clean_csv})
    data.detected_cells = dict(data.get_actual_errors_dictionary())   # oracle detection
    app = raha.correction.Correction()
    app.LABELING_BUDGET = n_labels
    app.SAVE_RESULTS = False
    app.VERBOSE = False
    corrections = app.run(data)                       # {(row, col_idx): value}
    out = data.dataframe.copy()
    for (i, j), v in corrections.items():
        out.iat[i, j] = v
    return out


def main() -> None:
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", default="eval/results/baran")
    ap.add_argument("--seeds", default="0,1,2")
    ap.add_argument("--datasets", default=",".join(DATASETS))
    ap.add_argument("--n-labels", type=int, default=20)
    args = ap.parse_args()
    out_dir = Path(args.out)
    out_dir.mkdir(parents=True, exist_ok=True)
    seeds = [int(s) for s in args.seeds.split(",")]
    for name in args.datasets.split(","):
        dirty, clean = _fetch(name)
        with tempfile.TemporaryDirectory() as td:     # Dataset wants file paths
            dp, cp = os.path.join(td, "dirty.csv"), os.path.join(td, "clean.csv")
            dirty.to_csv(dp, index=False)
            clean.to_csv(cp, index=False)
            for seed in seeds:
                dest = out_dir / f"{name}_seed{seed}_repaired.csv"
                if dest.exists():
                    print(f"skip {dest} (exists)", flush=True)
                    continue
                print(f"baran: {name} seed={seed} ...", flush=True)
                repaired = baran_repair(dp, cp, name, n_labels=args.n_labels, seed=seed)
                repaired.to_csv(dest, index=False)
                print(f"  -> {dest}", flush=True)
    print("baran runs complete")


if __name__ == "__main__":
    main()