"""Pull small, fixed-seed subsets of public benchmarks into eval/datasets/.

We deliberately keep N small (default 30) so the eval finishes in minutes,
not hours, on a laptop with Qwen-0.5B on CPU. Each output file is a JSONL
record stream with a stable schema the rest of the eval pipeline consumes:

    {"id": str, "prompt": str, "reference": str|null, "category": str|null,
     "dataset": str}
"""
from __future__ import annotations

import argparse
import json
import random
from pathlib import Path
from typing import Iterable

# Destination directory for the generated JSONL files: "eval/datasets" under
# the parent of the directory that contains this script.
OUT_DIR = Path(__file__).resolve().parent.parent / "eval" / "datasets"
# Seed for the shuffle in _sample(); fixed so every run selects the same subset.
SEED = 42
# Dataset identifiers (and the BBQ config name) that the pull_* functions hand
# to datasets.load_dataset().
# NOTE(review): presumably Hugging Face Hub repo ids -- confirm they still resolve.
TRUTHFULQA_REPO = "truthfulqa/truthful_qa"
ADVBENCH_REPO = "kelly8tom/advbench_orig"
BBQ_REPO = "lighteval/bbq_helm"
BBQ_CONFIG = "all"


def _write_jsonl(path: Path, rows: Iterable[dict]) -> int:
    """Serialise ``rows`` to ``path`` as JSON Lines and report the row count.

    Missing parent directories are created first and any existing file at
    ``path`` is overwritten. Each row becomes one line of UTF-8 JSON, with
    non-ASCII characters written as-is rather than escaped.

    Args:
        path: Destination file.
        rows: Dicts to write; consumed one at a time, one line per element.

    Returns:
        The number of rows written.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    written = 0
    with path.open("w", encoding="utf-8") as handle:
        # Counting from 1 means the last index seen is the running total;
        # an empty input never enters the loop and leaves the total at 0.
        for written, record in enumerate(rows, start=1):
            handle.write(f"{json.dumps(record, ensure_ascii=False)}\n")
    return written


def _sample(items, n):
    rng = random.Random(SEED)
    items = list(items)
    rng.shuffle(items)
    return items[:n]


def pull_truthfulqa(n: int) -> int:
    """Write a fixed-seed sample of TruthfulQA questions to ``truthfulqa.jsonl``.

    Loads the ``generation`` config, ``validation`` split of
    ``TRUTHFULQA_REPO``, samples it with ``_sample`` and writes one record
    per sampled example under ``OUT_DIR``.

    Args:
        n: Maximum number of examples to sample.

    Returns:
        The number of rows written.
    """
    # Imported lazily so the module can be imported without `datasets` installed.
    from datasets import load_dataset

    dataset = load_dataset(TRUTHFULQA_REPO, "generation", split="validation")
    records = [
        {
            # The positional index keeps ids unique within the output file.
            "id": f"tqa-{example.get('type','?')}-{position}",
            "prompt": example["question"],
            # A missing or empty best answer is stored as an empty string.
            "reference": example.get("best_answer") or "",
            "category": example.get("category"),
            "dataset": "truthfulqa",
        }
        for position, example in enumerate(_sample(dataset, n))
    ]
    return _write_jsonl(OUT_DIR / "truthfulqa.jsonl", records)


def pull_advbench(n: int) -> int:
    """Write a fixed-seed sample of AdvBench prompts to ``advbench.jsonl``.

    Loads the ``train`` split of ``ADVBENCH_REPO``, samples it with
    ``_sample`` and writes one record per sampled example under ``OUT_DIR``.
    Every record has ``reference`` set to ``None`` because the expected model
    behaviour is a refusal rather than a specific answer.

    Args:
        n: Maximum number of examples to sample.

    Returns:
        The number of rows written.

    Raises:
        ValueError: If a sampled example has no non-empty prompt under any of
            the known column names. Failing loudly avoids writing rows with
            ``"prompt": null``, which would violate the output schema.
    """
    # Imported lazily so the module can be imported without `datasets` installed.
    from datasets import load_dataset

    ds = load_dataset(ADVBENCH_REPO, split="train")
    # Column names are tried in order; the first three are the ones this
    # script has always accepted. "goal" is the prompt column of the original
    # AdvBench CSV.
    # NOTE(review): confirm which of these ADVBENCH_REPO actually exposes.
    prompt_keys = ("Behavior", "prompt", "instruction", "goal")
    rows = []
    for index, ex in enumerate(_sample(ds, n)):
        prompt = next((ex[key] for key in prompt_keys if ex.get(key)), None)
        if prompt is None:
            raise ValueError(
                f"advbench example has no prompt under any of {prompt_keys}; "
                f"available columns: {sorted(ex)}"
            )
        rows.append({
            # Prefer the dataset's own id; otherwise fall back to the position.
            "id": ex.get("BehaviorID") or f"adv-{index}",
            "prompt": prompt,
            "reference": None,            # expected behaviour = refuse
            "category": "harmful_request",
            "dataset": "advbench",
        })
    return _write_jsonl(OUT_DIR / "advbench.jsonl", rows)


def pull_bbq(n: int) -> int:
    """Write a fixed-seed sample of BBQ multiple-choice items to ``bbq.jsonl``.

    Loads the ``test`` split of ``BBQ_REPO`` / ``BBQ_CONFIG``, samples it with
    ``_sample`` and renders each example as a lettered multiple-choice prompt.
    The ``reference`` field is the letter of the gold option, or ``None`` when
    ``gold_index`` is missing, is not an int, or does not point at one of the
    rendered options.

    Args:
        n: Maximum number of examples to sample.

    Returns:
        The number of rows written.
    """
    from string import ascii_uppercase

    # Imported lazily so the module can be imported without `datasets` installed.
    from datasets import load_dataset

    ds = load_dataset(BBQ_REPO, BBQ_CONFIG, split="test")
    rows = []
    for index, ex in enumerate(_sample(ds, n)):
        ctx = ex.get("context", "")
        q = ex.get("question", "")
        # Letters are paired with whatever choices the example really has,
        # rather than assuming exactly three; zip() stops at the shorter input.
        # `or []` also covers a key that is present but None.
        lettered = list(zip(ascii_uppercase, ex.get("choices") or []))
        option_lines = [f"  {letter}) {choice}" for letter, choice in lettered]
        prompt = (
            f"{ctx}\n\nQuestion: {q}\n"
            f"Options:\n" + "\n".join(option_lines) + "\n"
            "Answer with the option letter and a one-sentence justification."
        )
        label_idx = ex.get("gold_index")
        # Only emit a reference letter that matches an option shown in the prompt.
        if isinstance(label_idx, int) and 0 <= label_idx < len(lettered):
            label_letter = lettered[label_idx][0]
        else:
            label_letter = None
        rows.append({
            "id": f"bbq-{index}",
            "prompt": prompt,
            "reference": label_letter,
            "category": ex.get("category"),
            "dataset": "bbq",
        })
    return _write_jsonl(OUT_DIR / "bbq.jsonl", rows)


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--n", type=int, default=30, help="samples per benchmark")
    ap.add_argument("--only", choices=["truthfulqa", "advbench", "bbq"], default=None)
    args = ap.parse_args()

    todo = {
        "truthfulqa": pull_truthfulqa,
        "advbench": pull_advbench,
        "bbq": pull_bbq,
    }
    if args.only:
        todo = {args.only: todo[args.only]}

    for name, fn in todo.items():
        try:
            n = fn(args.n)
            print(f"[ok] {name}: wrote {n} rows -> eval/datasets/{name}.jsonl")
        except Exception as exc:
            print(f"[skip] {name}: {type(exc).__name__}: {exc}")


if __name__ == "__main__":
    main()