ai-assistants-eval / scripts /download_datasets.py
sid-007
Deploy AI Assistants Eval — OSS vs Frontier
a9141f4
"""Pull small, fixed-seed subsets of public benchmarks into eval/datasets/.
We deliberately keep N small (default 30) so the eval finishes in minutes,
not hours, on a laptop with Qwen-0.5B on CPU. Each output file is a JSONL
record stream with a stable schema the rest of the eval pipeline consumes:
{"id": str, "prompt": str, "reference": str|null, "category": str|null, "dataset": str}
"""
from __future__ import annotations
import argparse
import json
import random
from pathlib import Path
from typing import Iterable
# Destination directory for the generated JSONL files: "eval/datasets" under
# the directory two levels above this script.
OUT_DIR = Path(__file__).resolve().parent.parent / "eval" / "datasets"
# Seed for the RNG in _sample(), so repeated runs select the same subset.
SEED = 42
# Dataset identifiers (and, for BBQ, the config name) handed to
# datasets.load_dataset() by the pull_* functions below.
TRUTHFULQA_REPO = "truthfulqa/truthful_qa"
ADVBENCH_REPO = "kelly8tom/advbench_orig"
BBQ_REPO = "lighteval/bbq_helm"
BBQ_CONFIG = "all"
def _write_jsonl(path: Path, rows: Iterable[dict]) -> int:
    """Serialize ``rows`` to ``path`` as JSON Lines and return the row count.

    Missing parent directories are created first and the file is opened in
    write mode, so existing content is replaced. Each row becomes one UTF-8
    line; non-ASCII characters are written as-is rather than escaped.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    written = 0
    with path.open("w", encoding="utf-8") as sink:
        # enumerate(start=1) leaves `written` equal to the number of rows
        # emitted; it stays 0 when `rows` is empty.
        for written, record in enumerate(rows, start=1):
            sink.write(f"{json.dumps(record, ensure_ascii=False)}\n")
    return written
def _sample(items, n):
    """Return the first ``n`` elements of a seeded shuffle of ``items``.

    ``items`` is copied into a new list before shuffling. A fresh
    ``random.Random(SEED)`` is created on every call, so the same input
    yields the same subset each time. When fewer than ``n`` elements are
    available, all of them are returned in shuffled order.
    """
    pool = list(items)
    random.Random(SEED).shuffle(pool)
    return pool[:n]
def pull_truthfulqa(n: int) -> int:
    """Write a seeded sample of TruthfulQA to ``OUT_DIR / "truthfulqa.jsonl"``.

    Loads the ``generation`` config, ``validation`` split, of
    ``TRUTHFULQA_REPO``, keeps at most ``n`` examples chosen by ``_sample``
    and returns the number of rows written.
    """
    # Function-scope import: `datasets` is only needed when this puller runs.
    from datasets import load_dataset

    dataset = load_dataset(TRUTHFULQA_REPO, "generation", split="validation")
    records = [
        {
            # `position` is the row's index within the sampled subset.
            "id": f"tqa-{example.get('type', '?')}-{position}",
            "prompt": example["question"],
            # A missing or empty best answer is stored as "".
            "reference": example.get("best_answer") or "",
            "category": example.get("category"),
            "dataset": "truthfulqa",
        }
        for position, example in enumerate(_sample(dataset, n))
    ]
    return _write_jsonl(OUT_DIR / "truthfulqa.jsonl", records)
def pull_advbench(n: int) -> int:
    """Write a seeded sample of AdvBench to ``OUT_DIR / "advbench.jsonl"``.

    Loads the ``train`` split of ``ADVBENCH_REPO``, keeps at most ``n``
    examples chosen by ``_sample`` and returns the number of rows written.

    Raises:
        ValueError: if a sampled row has no non-empty value under any of the
            recognised prompt columns. Writing such a row would break the
            ``"prompt": str`` contract of the output schema.
    """
    # Function-scope import: `datasets` is only needed when this puller runs.
    from datasets import load_dataset

    ds = load_dataset(ADVBENCH_REPO, split="train")
    # Prompt column names to probe, in priority order. The first three are the
    # spellings this script has always accepted; "goal" is the column name in
    # the original AdvBench harmful_behaviors CSV and is tried last so that
    # rows which resolved before still resolve to the same value.
    prompt_keys = ("Behavior", "prompt", "instruction", "goal")
    rows = []
    for ex in _sample(ds, n):
        prompt = next((ex[key] for key in prompt_keys if ex.get(key)), None)
        if not prompt:
            raise ValueError(
                f"{ADVBENCH_REPO}: no prompt found under any of {prompt_keys}; "
                f"row has columns {sorted(ex)}"
            )
        behavior_id = ex.get("BehaviorID")
        rows.append({
            # Fall back to a positional id when the row carries no BehaviorID.
            "id": behavior_id or f"adv-{len(rows)}",
            "prompt": prompt,
            "reference": None,  # expected behaviour = refuse
            "category": "harmful_request",
            "dataset": "advbench",
        })
    return _write_jsonl(OUT_DIR / "advbench.jsonl", rows)
def pull_bbq(n: int) -> int:
    """Write a seeded sample of BBQ to ``OUT_DIR / "bbq.jsonl"``.

    Loads the ``test`` split of ``BBQ_REPO`` / ``BBQ_CONFIG``, keeps at most
    ``n`` examples chosen by ``_sample``, renders each one as a lettered
    multiple-choice prompt and returns the number of rows written.
    """
    # Function-scope import: `datasets` is only needed when this puller runs.
    from datasets import load_dataset

    dataset = load_dataset(BBQ_REPO, BBQ_CONFIG, split="test")
    letters = ("A", "B", "C")
    records = []
    for position, example in enumerate(_sample(dataset, n)):
        context = example.get("context", "")
        question = example.get("question", "")
        answers = list(example.get("choices", []))
        # zip() stops at the shorter input, so at most three options are shown.
        options = "\n".join(
            f" {letter}) {answer}" for letter, answer in zip(letters, answers)
        )
        prompt = "".join([
            f"{context}\n\nQuestion: {question}\n",
            "Options:\n",
            options,
            "\n",
            "Answer with the option letter and a one-sentence justification.",
        ])
        gold = example.get("gold_index")
        # Only an integer index in 0..2 maps to a letter; anything else
        # (missing, out of range, non-int) leaves the reference unset.
        if isinstance(gold, int) and 0 <= gold < 3:
            reference = letters[gold]
        else:
            reference = None
        records.append({
            "id": f"bbq-{position}",
            "prompt": prompt,
            "reference": reference,
            "category": example.get("category"),
            "dataset": "bbq",
        })
    return _write_jsonl(OUT_DIR / "bbq.jsonl", records)
def _non_negative_int(text: str) -> int:
    """argparse ``type=`` callable: parse ``text`` as an integer >= 0."""
    try:
        value = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {text!r}") from None
    if value < 0:
        # A negative count would reach the `items[:n]` slice in _sample() and
        # select "all but the last |n|" rows instead of a small subset.
        raise argparse.ArgumentTypeError(f"must be >= 0, got {value}")
    return value


def main():
    """Command-line entry point: pull one or all benchmark subsets.

    Options:
        --n     samples per benchmark (non-negative integer, default 30)
        --only  restrict the run to a single benchmark

    Each benchmark is attempted independently. A failure in one is reported
    on stdout as ``[skip] ...`` and the remaining benchmarks still run.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--n", type=_non_negative_int, default=30, help="samples per benchmark")
    ap.add_argument("--only", choices=["truthfulqa", "advbench", "bbq"], default=None)
    args = ap.parse_args()
    todo = {
        "truthfulqa": pull_truthfulqa,
        "advbench": pull_advbench,
        "bbq": pull_bbq,
    }
    if args.only:
        todo = {args.only: todo[args.only]}
    for name, fn in todo.items():
        # Deliberately broad: this is the top-level boundary, and one failing
        # benchmark (missing package, network error, schema change) must not
        # prevent the others from being pulled.
        try:
            n = fn(args.n)
            print(f"[ok] {name}: wrote {n} rows -> eval/datasets/{name}.jsonl")
        except Exception as exc:
            print(f"[skip] {name}: {type(exc).__name__}: {exc}")


if __name__ == "__main__":
    main()