Spaces:

sid-007
/

ai-assistants-eval

Sleeping

ai-assistants-eval / scripts /download_datasets.py

sid-007

Deploy AI Assistants Eval — OSS vs Frontier

a9141f4 10 days ago

4 kB

	"""Pull small, fixed-seed subsets of public benchmarks into eval/datasets/.

	We deliberately keep N small (default 30) so the eval finishes in minutes,
	not hours, on a laptop with Qwen-0.5B on CPU. Each output file is a JSONL
	record stream with a stable schema the rest of the eval pipeline consumes:

	{"id": str, "prompt": str, "reference": str|null, "category": str|null, "dataset": str}
	"""
	from __future__ import annotations

	import argparse
	import json
	import random
	from pathlib import Path
	from typing import Iterable

	# Output directory: <parent of this script's directory>/eval/datasets.
	OUT_DIR = Path(__file__).resolve().parent.parent / "eval" / "datasets"
	# Seed for the random.Random instance used by _sample, so subsets are reproducible.
	SEED = 42
	# Dataset identifiers passed to datasets.load_dataset by the pull_* functions.
	TRUTHFULQA_REPO = "truthfulqa/truthful_qa"
	ADVBENCH_REPO = "kelly8tom/advbench_orig"
	BBQ_REPO = "lighteval/bbq_helm"
	# Configuration name passed alongside BBQ_REPO.
	BBQ_CONFIG = "all"


	def _write_jsonl(path: Path, rows: Iterable[dict]) -> int:
	    """Serialize *rows* to *path* as JSON Lines and return the row count.

	    Missing parent directories are created and an existing file is
	    overwritten. Each row is written as one UTF-8 encoded line; non-ASCII
	    characters are emitted verbatim rather than as ``\\u`` escapes.
	    """
	    path.parent.mkdir(parents=True, exist_ok=True)
	    written = 0
	    with path.open("w", encoding="utf-8") as sink:
	        # Counting from 1 lets the loop index double as the running total;
	        # `written` keeps its initial 0 when `rows` is empty.
	        for written, record in enumerate(rows, start=1):
	            sink.write(json.dumps(record, ensure_ascii=False) + "\n")
	    return written


	def _sample(items, n):
	    """Return up to *n* elements of *items* in a reproducible shuffled order.

	    A fresh ``random.Random(SEED)`` is created on every call, so the same
	    input sequence always produces the same selection. The input is copied
	    into a list first and is never mutated.

	    Args:
	        items: any iterable; it is fully materialized before shuffling.
	        n: maximum number of elements to return. If it exceeds the number
	            of items, every item is returned (shuffled).

	    Returns:
	        A new list holding the first *n* elements of the shuffled copy.

	    Raises:
	        ValueError: if *n* is negative. A negative bound would otherwise be
	            interpreted by slicing as "all but the last |n|" and silently
	            return almost the whole dataset.
	    """
	    if n is not None and n < 0:
	        raise ValueError(f"sample size must be non-negative, got {n}")
	    rng = random.Random(SEED)
	    pool = list(items)
	    rng.shuffle(pool)
	    return pool[:n]


	def pull_truthfulqa(n: int) -> int:
	    """Write a seeded sample of TruthfulQA questions to ``truthfulqa.jsonl``.

	    Loads the ``validation`` split of the ``generation`` configuration of
	    ``TRUTHFULQA_REPO``, keeps at most *n* examples chosen by ``_sample``,
	    and writes them under ``OUT_DIR``. ``datasets`` is imported at call
	    time, so it is only needed when this function actually runs.

	    Returns:
	        The number of rows written.
	    """
	    from datasets import load_dataset

	    validation = load_dataset(TRUTHFULQA_REPO, "generation", split="validation")
	    records = [
	        {
	            # The id combines the example's `type` (or '?') with its position
	            # in the sampled order.
	            "id": f"tqa-{example.get('type','?')}-{position}",
	            "prompt": example["question"],
	            # A missing or empty best answer is stored as an empty string.
	            "reference": example.get("best_answer") or "",
	            "category": example.get("category"),
	            "dataset": "truthfulqa",
	        }
	        for position, example in enumerate(_sample(validation, n))
	    ]
	    return _write_jsonl(OUT_DIR / "truthfulqa.jsonl", records)


	def pull_advbench(n: int) -> int:
	    """Write a seeded sample of AdvBench prompts to ``advbench.jsonl``.

	    Loads the ``train`` split of ``ADVBENCH_REPO``, keeps at most *n*
	    examples chosen by ``_sample``, and writes them under ``OUT_DIR``.
	    ``datasets`` is imported at call time, so it is only needed when this
	    function actually runs.

	    The prompt text is taken from the first non-empty of several candidate
	    columns, because mirrors of this benchmark name the column differently.

	    Returns:
	        The number of rows written.

	    Raises:
	        ValueError: if a sampled record has no non-empty prompt under any of
	            the candidate columns. Nothing is written in that case, instead
	            of emitting rows with ``"prompt": null`` that break the
	            documented ``"prompt": str`` schema.
	    """
	    from datasets import load_dataset

	    ds = load_dataset(ADVBENCH_REPO, split="train")
	    rows = []
	    for ex in _sample(ds, n):
	        # NOTE(review): `goal` is the column name in the original AdvBench
	        # CSV; it is probed last so existing behaviour is unchanged whenever
	        # one of the earlier columns is present. Confirm against the repo.
	        prompt = (
	            ex.get("Behavior")
	            or ex.get("prompt")
	            or ex.get("instruction")
	            or ex.get("goal")
	        )
	        if not prompt:
	            raise ValueError(
	                "advbench record has no usable prompt field; "
	                f"available columns: {sorted(ex)}"
	            )
	        behavior_id = ex.get("BehaviorID")
	        rows.append({
	            # Fall back to a positional id when the record carries no BehaviorID.
	            "id": behavior_id or f"adv-{len(rows)}",
	            "prompt": prompt,
	            "reference": None,  # expected behaviour = refuse
	            "category": "harmful_request",
	            "dataset": "advbench",
	        })
	    return _write_jsonl(OUT_DIR / "advbench.jsonl", rows)


	def pull_bbq(n: int) -> int:
	    """Write a seeded sample of BBQ multiple-choice items to ``bbq.jsonl``.

	    Loads the ``test`` split of ``BBQ_REPO`` / ``BBQ_CONFIG``, keeps at most
	    *n* examples chosen by ``_sample``, renders each one as a prompt with
	    lettered options, and writes the result under ``OUT_DIR``. ``datasets``
	    is imported at call time, so it is only needed when this function
	    actually runs.

	    Returns:
	        The number of rows written.
	    """
	    from datasets import load_dataset

	    test_split = load_dataset(BBQ_REPO, BBQ_CONFIG, split="test")
	    letters = ["A", "B", "C"]
	    records = []
	    for position, example in enumerate(_sample(test_split, n)):
	        context = example.get("context", "")
	        question = example.get("question", "")
	        # zip stops at the shorter input, so at most three options are listed.
	        options = "\n".join(
	            f" {letter}) {choice}"
	            for letter, choice in zip("ABC", list(example.get("choices", [])))
	        )
	        prompt = (
	            f"{context}\n\nQuestion: {question}\n"
	            "Options:\n"
	            f"{options}\n"
	            "Answer with the option letter and a one-sentence justification."
	        )

	        # Map the gold index to its letter; anything that is not an int in
	        # the range 0..2 leaves the reference unset.
	        gold = example.get("gold_index")
	        if isinstance(gold, int) and 0 <= gold < 3:
	            reference = letters[gold]
	        else:
	            reference = None

	        records.append({
	            "id": f"bbq-{position}",
	            "prompt": prompt,
	            "reference": reference,
	            "category": example.get("category"),
	            "dataset": "bbq",
	        })
	    return _write_jsonl(OUT_DIR / "bbq.jsonl", records)


	def main():
	    """Parse command-line options and run the selected dataset pullers.

	    ``--n`` (int, default 30) is forwarded to every puller as the sample
	    size; ``--only`` restricts the run to a single benchmark. Pullers run
	    in the order truthfulqa, advbench, bbq. Any exception raised while
	    handling one benchmark is reported as a ``[skip]`` line and does not
	    stop the remaining ones.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--n", type=int, default=30, help="samples per benchmark")
	    parser.add_argument("--only", choices=["truthfulqa", "advbench", "bbq"], default=None)
	    opts = parser.parse_args()

	    pullers = {
	        "truthfulqa": pull_truthfulqa,
	        "advbench": pull_advbench,
	        "bbq": pull_bbq,
	    }
	    selected = [opts.only] if opts.only else list(pullers)

	    for name in selected:
	        puller = pullers[name]
	        try:
	            count = puller(opts.n)
	            print(f"[ok] {name}: wrote {count} rows -> eval/datasets/{name}.jsonl")
	        except Exception as exc:
	            # Best effort: report the failure and carry on with the next benchmark.
	            print(f"[skip] {name}: {type(exc).__name__}: {exc}")


	# Run the CLI only when this file is executed as a script, not when imported.
	if __name__ == "__main__":
	main()