"""Pull small, fixed-seed subsets of public benchmarks into eval/datasets/.

We deliberately keep N small (default 30) so the eval finishes in minutes,
not hours, on a laptop with Qwen-0.5B on CPU.

Each output file is a JSONL record stream with a stable schema the rest of
the eval pipeline consumes:

    {"id": str, "prompt": str, "reference": str|null, "category": str|null,
     "dataset": str}
"""
from __future__ import annotations

import argparse
import json
import random
from pathlib import Path
from typing import Iterable

OUT_DIR = Path(__file__).resolve().parent.parent / "eval" / "datasets"
SEED = 42

TRUTHFULQA_REPO = "truthfulqa/truthful_qa"
ADVBENCH_REPO = "kelly8tom/advbench_orig"
BBQ_REPO = "lighteval/bbq_helm"
BBQ_CONFIG = "all"

# Columns probed, in priority order, for the AdvBench prompt text.
# NOTE(review): "goal" is the column name used by the original AdvBench CSV;
# confirm which of these ADVBENCH_REPO actually exposes.
_ADVBENCH_PROMPT_KEYS = ("Behavior", "prompt", "instruction", "goal")

# Option letters rendered for BBQ; choices beyond these are not shown.
_BBQ_LETTERS = "ABC"


def _write_jsonl(path: Path, rows: Iterable[dict]) -> int:
    """Write *rows* to *path* as one JSON object per line.

    Parent directories are created if missing and an existing file is
    overwritten. Non-ASCII text is written verbatim as UTF-8.

    Returns:
        The number of rows written.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    n = 0
    with path.open("w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
            n += 1
    return n


def _sample(items: Iterable, n: int) -> list:
    """Return up to *n* items from *items* in a reproducible shuffled order.

    A fresh ``random.Random(SEED)`` is used on every call, so the same input
    sequence always yields the same sample. When *n* exceeds the number of
    items, all of them are returned (shuffled).

    Raises:
        ValueError: if *n* is negative. A negative value would otherwise be
            treated as a slice end and return almost the whole dataset.
    """
    if n < 0:
        raise ValueError(f"sample size must be non-negative, got {n}")
    rng = random.Random(SEED)
    items = list(items)
    rng.shuffle(items)
    return items[:n]


def _first_text(ex: dict, keys: Iterable[str]):
    """Return the first truthy ``ex[key]`` for *keys*, or None if there is none."""
    for key in keys:
        value = ex.get(key)
        if value:
            return value
    return None


def _bbq_row(ex: dict, index: int) -> dict:
    """Turn one BBQ example into an output row with a multiple-choice prompt.

    The gold label is only emitted when it points at an option that was
    actually rendered in the prompt; otherwise ``reference`` is None.
    """
    ctx = ex.get("context", "")
    q = ex.get("question", "")
    choices = list(ex.get("choices", []))
    option_lines = [
        f" {letter}) {choice}" for letter, choice in zip(_BBQ_LETTERS, choices)
    ]
    prompt = (
        f"{ctx}\n\nQuestion: {q}\n"
        f"Options:\n" + "\n".join(option_lines) + "\n"
        "Answer with the option letter and a one-sentence justification."
    )
    label_idx = ex.get("gold_index")
    # Bound by the options shown, not by the constant 3, so the reference can
    # never name a letter that is absent from the prompt.
    if isinstance(label_idx, int) and 0 <= label_idx < len(option_lines):
        label_letter = _BBQ_LETTERS[label_idx]
    else:
        label_letter = None
    return {
        "id": f"bbq-{index}",
        "prompt": prompt,
        "reference": label_letter,
        "category": ex.get("category"),
        "dataset": "bbq",
    }


def pull_truthfulqa(n: int) -> int:
    """Sample *n* TruthfulQA generation questions into ``truthfulqa.jsonl``.

    Returns:
        The number of rows written.
    """
    from datasets import load_dataset

    ds = load_dataset(TRUTHFULQA_REPO, "generation", split="validation")
    rows = []
    for i, ex in enumerate(_sample(ds, n)):
        rows.append({
            "id": f"tqa-{ex.get('type','?')}-{i}",
            "prompt": ex["question"],
            "reference": ex.get("best_answer") or "",
            "category": ex.get("category"),
            "dataset": "truthfulqa",
        })
    return _write_jsonl(OUT_DIR / "truthfulqa.jsonl", rows)


def pull_advbench(n: int) -> int:
    """Sample *n* AdvBench harmful requests into ``advbench.jsonl``.

    Rows without any recognised prompt column are dropped before sampling so
    the output never contains ``"prompt": null``. When every row has a prompt
    the sample is identical to sampling the raw dataset.

    Returns:
        The number of rows written.

    Raises:
        ValueError: if no row exposes a prompt under any probed column.
    """
    from datasets import load_dataset

    ds = load_dataset(ADVBENCH_REPO, split="train")
    usable = [ex for ex in ds if _first_text(ex, _ADVBENCH_PROMPT_KEYS)]
    if not usable:
        raise ValueError(
            f"no prompt column found in {ADVBENCH_REPO}; "
            f"tried {', '.join(_ADVBENCH_PROMPT_KEYS)}"
        )
    rows = []
    for i, ex in enumerate(_sample(usable, n)):
        rows.append({
            "id": ex.get("BehaviorID") or f"adv-{i}",
            "prompt": _first_text(ex, _ADVBENCH_PROMPT_KEYS),
            "reference": None,  # expected behaviour = refuse
            "category": "harmful_request",
            "dataset": "advbench",
        })
    return _write_jsonl(OUT_DIR / "advbench.jsonl", rows)


def pull_bbq(n: int) -> int:
    """Sample *n* BBQ examples into ``bbq.jsonl`` as lettered multiple choice.

    Returns:
        The number of rows written.
    """
    from datasets import load_dataset

    ds = load_dataset(BBQ_REPO, BBQ_CONFIG, split="test")
    rows = [_bbq_row(ex, i) for i, ex in enumerate(_sample(ds, n))]
    return _write_jsonl(OUT_DIR / "bbq.jsonl", rows)


def main():
    """CLI entry point: pull every benchmark, or just the one given by --only.

    A failure in one benchmark is reported as ``[skip]`` and does not stop
    the remaining ones (deliberate best-effort).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--n", type=int, default=30, help="samples per benchmark")
    ap.add_argument("--only", choices=["truthfulqa", "advbench", "bbq"], default=None)
    args = ap.parse_args()
    if args.n < 0:
        # Reject up front, before any dataset download is attempted.
        ap.error("--n must be non-negative")

    todo = {
        "truthfulqa": pull_truthfulqa,
        "advbench": pull_advbench,
        "bbq": pull_bbq,
    }
    if args.only:
        todo = {args.only: todo[args.only]}

    for name, fn in todo.items():
        try:
            n = fn(args.n)
            print(f"[ok] {name}: wrote {n} rows -> eval/datasets/{name}.jsonl")
        except Exception as exc:
            print(f"[skip] {name}: {type(exc).__name__}: {exc}")


if __name__ == "__main__":
    main()