"""RADAR mini-board: shipped pipeline vs RADAR's perturbed tables, by artifact type.

RADAR (kenqgu/RADAR, CC-BY-4.0; table-QA under data artifacts) ships, per example,
the perturbed table AND the gold recovery transform (overwrite_cells + drop_rows) —
so dirty/clean pairs are derivable exactly. We score the shipped deterministic
pipeline per table with the churn-neutral metric and aggregate by artifact_type:
which artifact classes the system repairs, where it abstains, what it damages.

    uv run python -m eval.radar_bench --n 150
"""

from __future__ import annotations

import argparse
import collections
import json
from pathlib import Path

import pandas as pd

from scrubdata.executor import apply_plan
from scrubdata.planner import mock_plan

from .run_real_multi import _cell_only, score

# Parent of the directory that holds this file; main() writes its results
# under ROOT / "eval" / "results".
ROOT = Path(__file__).resolve().parent.parent


def derive_pair(ex) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Derive the aligned (dirty, clean) table pair for one RADAR example.

    Args:
        ex: Mapping-like example (a dict or a pandas row) that provides
            ``ex["table"]`` with ``"headers"`` and ``"rows"``, and optionally
            ``"recovered_tables_transform_spec"`` with ``"overwrite_cells"``
            and/or ``"drop_rows"`` (each a sequence of groups).

    Returns:
        ``(dirty, clean)``: two string-typed DataFrames of identical shape.
        ``clean`` is ``dirty`` with the first ``overwrite_cells`` group
        applied; rows listed in ``drop_rows`` are removed from *both* frames
        and the index is renumbered from 0.
    """
    table = ex["table"]
    # Rows may arrive as array-likes (e.g. a numpy object-array); normalise
    # them to plain lists before building the frame.
    rows = [list(r) for r in table["rows"]]
    dirty = pd.DataFrame(rows, columns=list(table["headers"])).astype(str)
    spec = ex.get("recovered_tables_transform_spec") or {}
    clean = dirty.copy()

    # Only the first overwrite group is applied.
    # NOTE(review): drop_rows below unions *all* groups while overwrites use
    # only the first one -- confirm this asymmetry is intended.
    overwrite_groups = spec.get("overwrite_cells")
    groups = list(overwrite_groups) if overwrite_groups is not None else []
    cells = list(groups[0]) if groups else []
    for cell in cells:
        r, c, v = int(cell["row"]), cell["col"], cell["new_value"]
        # Silently ignore edits that point outside the table.
        if c in clean.columns and 0 <= r < len(clean):
            # NOTE(review): get_loc yields a single integer position only when
            # the header is unique -- assumes RADAR headers are unique.
            clean.iat[r, clean.columns.get_loc(c)] = str(v)

    drop_groups = spec.get("drop_rows")
    # A set gives O(1) membership tests in the filter below (the row order of
    # the result comes from range(), not from this collection).
    drops = {
        int(r)
        for grp in (list(drop_groups) if drop_groups is not None else [])
        for r in grp
    }
    if drops:
        keep = [i for i in range(len(clean)) if i not in drops]
        # Row-drop class: remove the rows from both frames so they stay aligned.
        dirty = dirty.iloc[keep].reset_index(drop=True)
        clean = clean.iloc[keep].reset_index(drop=True)
    return dirty, clean


def main() -> None:
    """Score the shipped pipeline on a sample of RADAR and write a per-type summary.

    Reads the RADAR test parquet through its ``hf://`` URL, walks the examples
    in file order with a per-``artifact_type`` cap and an overall ``--n``
    budget, derives a dirty/clean pair for each, runs the cell-only mock plan
    on the dirty table and scores the result. Tables whose score reports no
    ``_errors`` are left out of the averages. The macro F1 / damage per
    artifact type is printed and written to ``eval/results/radar_bench.json``
    under ``ROOT``.

    Examples that yield an empty or misaligned pair, or that raise anywhere in
    the derive/plan/score chain, are skipped (best effort) and tallied in a
    final ``skipped`` line instead of being dropped silently.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--n", type=int, default=150)
    args = ap.parse_args()
    ds = pd.read_parquet("hf://datasets/kenqgu/RADAR/radar/test-00000-of-00001.parquet")

    # The per-type cap does not change during the scan, so compute it once.
    # max(1, ...) keeps the division defined if the column has no non-null values.
    n_types = max(1, ds["artifact_type"].nunique())
    per_type_cap = max(8, args.n // n_types)

    by_type = collections.defaultdict(list)
    seen = collections.Counter()       # examples taken per artifact type
    skipped = collections.Counter()    # reason -> number of examples skipped
    scanned = 0                        # always equals sum(seen.values())
    for _, ex in ds.iterrows():
        # Check the budget up front so that skipped examples cannot bypass it.
        if scanned >= args.n:
            break
        at = ex["artifact_type"]
        if seen[at] >= per_type_cap:
            continue
        seen[at] += 1
        scanned += 1
        try:
            dirty, clean = derive_pair(ex)
            if dirty.empty or dirty.shape != clean.shape:
                skipped["empty or misaligned table"] += 1
                continue
            cleaned, _ = apply_plan(dirty, _cell_only(mock_plan(dirty)))
            m = score(dirty, clean, cleaned)
            # Only tables with a positive `_errors` count enter the averages.
            if m["_errors"] > 0:
                by_type[at].append((m["f1"], m["damage"]))
        except Exception as exc:  # noqa: BLE001 - best effort: one bad table must not abort the run
            skipped[type(exc).__name__] += 1
            continue

    rows = []
    for at, ms in sorted(by_type.items()):
        # ms is never empty: a key is only created when a score is appended.
        f1 = sum(f for f, _ in ms) / len(ms)
        dmg = sum(d for _, d in ms) / len(ms)
        rows.append({"artifact_type": at, "tables": len(ms),
                     "macro_f1": round(f1, 3), "macro_damage": round(dmg, 4)})
        print(f"  {at:<28} n={len(ms):<4} F1={f1:.3f} dmg={dmg:.4f}")

    out_path = ROOT / "eval" / "results" / "radar_bench.json"
    # Create the results directory on a fresh checkout rather than failing
    # after all the scoring work is done.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as fh:
        json.dump(rows, fh, indent=1)
    print(f"-> eval/results/radar_bench.json ({scanned} examples scanned)")
    if skipped:
        print(f"   skipped: {dict(skipped)}")


# Script entry point (e.g. `python -m eval.radar_bench`); merely importing
# this module does not start the benchmark.
if __name__ == "__main__":
    main()