feather-a10-runtime / overlay /scripts /build_benchmark_subset.py
Jackoatmon's picture
Update benchmark runtime image
6a47c48 verified
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
import random
import sys
from pathlib import Path
from typing import Any
# Directory two levels above this file: parents[0] is the folder containing
# the script, parents[1] is that folder's parent.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Put REPO_ROOT at the front of the import search path (once only).
# NOTE(review): presumably this is what lets the `scripts.*` imports that
# follow resolve when the file is executed directly -- confirm; it must stay
# ahead of those imports either way.
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))
from scripts.benchmark_datasets import resolve_benchmark_dataset
from scripts.benchmark_suite import validate_sample
def load_rows(path: Path) -> list[dict[str, Any]]:
    """Load every non-blank line of a JSON Lines file as a parsed JSON value.

    Args:
        path: Location of the UTF-8 encoded JSONL file.

    Returns:
        The parsed rows in file order; whitespace-only lines are skipped.

    Raises:
        OSError: If the file cannot be opened or read.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Iterate the file handle instead of calling ``str.splitlines()``:
    # ``splitlines`` also breaks on U+2028, U+2029 and U+0085, which may
    # appear unescaped inside a JSON string and would cut a record in half.
    # Text-mode iteration only splits on "\n", "\r" and "\r\n".
    with path.open(encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]
def build_subset_rows(*, source: Path, benchmark: str, n: int, seed: int) -> list[dict[str, Any]]:
    """Pick a seeded random sample of ``n`` rows from the JSONL file ``source``.

    The sample is drawn over row positions with ``random.Random(seed)`` and
    the chosen positions are then sorted, so the result keeps file order and
    is reproducible for a given (file, n, seed).

    Args:
        source: JSONL file to sample from.
        benchmark: Benchmark name forwarded to ``validate_sample``.
        n: Number of rows to select.
        seed: Seed for the private random generator.

    Returns:
        Shallow copies of the chosen rows, each with a ``"source_row_id"``
        key holding its zero-based position in ``source``.

    Raises:
        ValueError: If ``n`` is larger than the number of rows available.
        Anything ``validate_sample`` raises for a selected row
        (NOTE(review): its contract is defined elsewhere -- confirm).
    """
    all_rows = load_rows(source)
    total = len(all_rows)
    if n > total:
        raise ValueError(f"requested subset size {n} exceeds dataset size {total}")

    rng = random.Random(seed)
    picked = rng.sample(range(total), n)
    picked.sort()

    def _prepare(position: int) -> dict[str, Any]:
        # Copy first so the loaded row is never mutated; validation runs on
        # the copy *before* the bookkeeping key is added.
        record = dict(all_rows[position])
        validate_sample(benchmark, record)
        record["source_row_id"] = position
        return record

    return [_prepare(position) for position in picked]
def write_subset(*, source: Path, benchmark: str, n: int, seed: int, out: Path) -> Path:
    """Write a sampled subset as JSONL plus a JSON manifest next to it.

    The subset comes from ``build_subset_rows``. Missing parent directories
    of ``out`` are created. The manifest is written to ``out`` with
    ``.manifest.json`` appended to its existing suffix (for example
    ``subset.jsonl`` -> ``subset.jsonl.manifest.json``).

    Args:
        source: JSONL file to sample from.
        benchmark: Benchmark name, recorded in the manifest.
        n: Number of rows to select.
        seed: Sampling seed, recorded in the manifest.
        out: Destination path of the subset JSONL file.

    Returns:
        Path of the manifest file that was written.
    """
    chosen = build_subset_rows(source=source, benchmark=benchmark, n=n, seed=seed)
    row_ids = [record["source_row_id"] for record in chosen]

    out.parent.mkdir(parents=True, exist_ok=True)
    jsonl_lines = [json.dumps(record) + "\n" for record in chosen]
    out.write_text("".join(jsonl_lines), encoding="utf-8")

    manifest = {
        "benchmark": benchmark,
        "n": n,
        "seed": seed,
        "source_path": str(source),
        "out_path": str(out),
        "source_row_ids": row_ids,
    }
    manifest_text = json.dumps(manifest, indent=2, sort_keys=True)

    manifest_path = out.with_suffix(out.suffix + ".manifest.json")
    manifest_path.write_text(manifest_text, encoding="utf-8")
    return manifest_path
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Build a deterministic benchmark subset JSONL and manifest")
parser.add_argument("--benchmark", required=True, choices=["MBPP", "GSM8K", "HumanEval", "ARC-Challenge"])
parser.add_argument("--samples", type=Path)
parser.add_argument("--n", type=int, required=True)
parser.add_argument("--seed", type=int, required=True)
parser.add_argument("--out", type=Path, required=True)
return parser.parse_args(argv)
def main(argv: list[str] | None = None) -> int:
    """Run the subset builder from command-line style arguments.

    Parses ``argv``, passes the benchmark name and the optional ``--samples``
    value to ``resolve_benchmark_dataset``, and hands the result to
    ``write_subset`` as the source file.

    Args:
        argv: Argument list; ``None`` defers to argparse's default.

    Returns:
        ``0`` once the subset and manifest have been written.
    """
    options = parse_args(argv)
    dataset_path = resolve_benchmark_dataset(options.benchmark, options.samples)
    write_subset(
        source=dataset_path,
        benchmark=options.benchmark,
        n=options.n,
        seed=options.seed,
        out=options.out,
    )
    return 0
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())