Spaces:
Sleeping
Sleeping
| """Pull small, fixed-seed subsets of public benchmarks into eval/datasets/. | |
| We deliberately keep N small (default 30) so the eval finishes in minutes, | |
| not hours, on a laptop with Qwen-0.5B on CPU. Each output file is a JSONL | |
| record stream with a stable schema the rest of the eval pipeline consumes: | |
| {"id": str, "prompt": str, "reference": str|null, "category": str|null, "dataset": str} | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import random | |
| from pathlib import Path | |
| from typing import Iterable | |
# Directory the JSONL files are written to: two levels above this file,
# then eval/datasets.
OUT_DIR = Path(__file__).resolve().parent.parent.joinpath("eval", "datasets")

# Seed for the sampling RNG, so repeated runs pick the same subset.
SEED = 42

# Dataset identifiers (and BBQ's config name) handed to `datasets.load_dataset`.
TRUTHFULQA_REPO = "truthfulqa/truthful_qa"
ADVBENCH_REPO = "kelly8tom/advbench_orig"
BBQ_REPO = "lighteval/bbq_helm"
BBQ_CONFIG = "all"
def _write_jsonl(path: Path, rows: Iterable[dict]) -> int:
    """Serialize *rows* to *path* as JSON Lines and return the row count.

    Missing parent directories are created. The file is opened in "w" mode,
    so any existing content is replaced. Non-ASCII characters are written
    as-is (UTF-8) rather than as ``\\uXXXX`` escapes.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    written = 0
    with path.open("w", encoding="utf-8") as sink:
        # enumerate(start=1) leaves `written` equal to the number of rows
        # emitted; it stays 0 when `rows` is empty.
        for written, record in enumerate(rows, start=1):
            sink.write(json.dumps(record, ensure_ascii=False) + "\n")
    return written
def _sample(items, n):
    """Return the first *n* elements of a SEED-shuffled copy of *items*.

    A fresh ``random.Random(SEED)`` is built on every call, so the same input
    always yields the same selection. *items* is copied into a new list
    before shuffling and is never mutated. If *n* exceeds the number of
    items, the whole shuffled list is returned.
    """
    shuffler = random.Random(SEED)
    pool = [*items]
    shuffler.shuffle(pool)
    return pool[:n]
def pull_truthfulqa(n: int) -> int:
    """Write a seeded sample of up to *n* TruthfulQA records to truthfulqa.jsonl.

    Loads the "generation" config's validation split, maps each sampled
    example onto the eval record layout (question -> prompt, best_answer ->
    reference, with "" when it is missing or empty) and writes the result
    under OUT_DIR.

    Returns:
        The number of rows written.
    """
    # Imported lazily so the module can be imported without `datasets`.
    from datasets import load_dataset

    dataset = load_dataset(TRUTHFULQA_REPO, "generation", split="validation")
    records = [
        {
            # The id carries the example's "type" field plus its position
            # in the sample.
            "id": f"tqa-{example.get('type','?')}-{index}",
            "prompt": example["question"],
            "reference": example.get("best_answer") or "",
            "category": example.get("category"),
            "dataset": "truthfulqa",
        }
        for index, example in enumerate(_sample(dataset, n))
    ]
    return _write_jsonl(OUT_DIR / "truthfulqa.jsonl", records)
def pull_advbench(n: int) -> int:
    """Write a seeded sample of up to *n* AdvBench prompts to advbench.jsonl.

    The prompt text is read from the first non-empty column among
    "Behavior", "prompt", "instruction" and "goal". Every record has
    ``reference`` set to None because the expected behaviour is a refusal.

    Returns:
        The number of rows written.

    Raises:
        ValueError: if a sampled example has none of the known prompt
            columns. Nothing is written in that case, so a schema mismatch
            cannot produce a file of null prompts.
    """
    # Imported lazily so the module can be imported without `datasets`.
    from datasets import load_dataset

    # Column names are probed in order. "goal" is the column used by the
    # original AdvBench harmful_behaviors CSV.
    prompt_fields = ("Behavior", "prompt", "instruction", "goal")

    ds = load_dataset(ADVBENCH_REPO, split="train")
    rows = []
    for idx, ex in enumerate(_sample(ds, n)):
        prompt = next((ex[key] for key in prompt_fields if ex.get(key)), None)
        if not prompt:
            raise ValueError(
                f"advbench example {idx} has no prompt text; "
                f"looked for {list(prompt_fields)}, found columns {sorted(ex)}"
            )
        rows.append({
            # Prefer the dataset's own id; otherwise use the sample position.
            "id": ex.get("BehaviorID") or f"adv-{idx}",
            "prompt": prompt,
            "reference": None,  # expected behaviour = refuse
            "category": "harmful_request",
            "dataset": "advbench",
        })
    return _write_jsonl(OUT_DIR / "advbench.jsonl", rows)
def pull_bbq(n: int) -> int:
    """Write a seeded sample of up to *n* BBQ items to bbq.jsonl.

    Each prompt is the example's context, its question, and its choices
    lettered A-C (choices beyond the third are dropped by the zip),
    followed by an instruction to answer with a letter and a one-sentence
    justification. ``reference`` is the letter for ``gold_index`` when that
    is an int in 0..2, otherwise None.

    Returns:
        The number of rows written.
    """
    # Imported lazily so the module can be imported without `datasets`.
    from datasets import load_dataset

    letters = "ABC"
    dataset = load_dataset(BBQ_REPO, BBQ_CONFIG, split="test")
    records = []
    for index, example in enumerate(_sample(dataset, n)):
        context = example.get("context", "")
        question = example.get("question", "")
        options = "\n".join(
            f" {letter}) {text}"
            for letter, text in zip(letters, list(example.get("choices", [])))
        )
        prompt = (
            f"{context}\n\nQuestion: {question}\n"
            "Options:\n"
            f"{options}\n"
            "Answer with the option letter and a one-sentence justification."
        )

        gold = example.get("gold_index")
        if isinstance(gold, int) and 0 <= gold < 3:
            answer = letters[gold]
        else:
            answer = None

        records.append({
            "id": f"bbq-{index}",
            "prompt": prompt,
            "reference": answer,
            "category": example.get("category"),
            "dataset": "bbq",
        })
    return _write_jsonl(OUT_DIR / "bbq.jsonl", records)
def main():
    """CLI entry point: pull every benchmark, or only the one named by --only.

    Options:
        --n     samples per benchmark (default 30)
        --only  restrict the run to one of truthfulqa / advbench / bbq

    A failure in one benchmark is reported as "[skip] ..." and does not stop
    the remaining benchmarks from being pulled.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--n", type=int, default=30, help="samples per benchmark")
    parser.add_argument("--only", choices=["truthfulqa", "advbench", "bbq"], default=None)
    options = parser.parse_args()

    pullers = {
        "truthfulqa": pull_truthfulqa,
        "advbench": pull_advbench,
        "bbq": pull_bbq,
    }
    selected = [options.only] if options.only else list(pullers)

    for name in selected:
        puller = pullers[name]
        try:
            written = puller(options.n)
            print(f"[ok] {name}: wrote {written} rows -> eval/datasets/{name}.jsonl")
        except Exception as exc:
            # Best effort: report the failure and carry on with the next one.
            print(f"[skip] {name}: {type(exc).__name__}: {exc}")


if __name__ == "__main__":
    main()