agentbench / scripts /_dev /sample_calibration_v1.py
Nomearod's picture
feat(calibration): 30-item stratified calibration_v1 sample
8ef480a
"""One-shot stratified sampler for calibration_v1.json. Run once; output
is committed to agent_bench/evaluation/datasets/calibration_v1.json.
The stratification target is in docs/plans/2026-05-04-judge-layer-v1-design.md
under Calibration Methodology > Stratified sampling.
"""
from __future__ import annotations
import json
import random
import subprocess
from pathlib import Path
# Repository root: two levels above the directory that holds this script.
REPO: Path = Path(__file__).resolve().parents[2]

# Input golden sets and the calibration file this script writes.
FASTAPI_PATH: Path = REPO.joinpath(
    "agent_bench/evaluation/datasets/tech_docs_golden.json"
)
K8S_PATH: Path = REPO.joinpath("agent_bench/evaluation/datasets/k8s_golden.json")
OUTPUT: Path = REPO.joinpath("agent_bench/evaluation/datasets/calibration_v1.json")

# Date-derived seed (2026-05-04); a fixed value keeps the sample deterministic
# across runs.
SEED: int = 20260504

# Items to draw per FastAPI "category" stratum (8 in total).
FASTAPI_TARGETS: dict[str, int] = {
    "retrieval": 5,
    "calculation": 1,
    "out_of_scope": 2,
}

# Items to draw per K8s "question_type" stratum (18 in total).
K8S_TARGETS: dict[str, int] = {
    "simple": 4,
    "simple_w_condition": 3,
    "comparison": 3,
    "multi_hop": 4,
    "false_premise": 3,
    "set": 1,
}

# Extra K8s items drawn on top of the per-stratum targets; together with the
# two tables above this brings the sample to 30 items.
SPARE_TOTAL: int = 4
def _draw_strata(
    rng: random.Random,
    pools: dict[str, list[dict]],
    targets: dict[str, int],
    corpus: str,
    label: str,
) -> list[dict]:
    """Draw the requested number of items from each stratum pool.

    Args:
        rng: Seeded generator; one ``sample`` call is made per stratum, in
            the iteration order of ``targets``.
        pools: Questions grouped by stratum name.
        targets: Number of items to draw for each stratum.
        corpus: Value stored under ``"corpus"`` in every emitted record.
        label: Human-readable corpus name used in the error message.

    Returns:
        One ``{"id", "corpus", "stratum"}`` record per drawn question.

    Raises:
        SystemExit: If a stratum holds fewer items than its target.
    """
    picked: list[dict] = []
    for stratum, want in targets.items():
        pool = pools.get(stratum, [])
        if len(pool) < want:
            raise SystemExit(f"{label} {stratum}: have {len(pool)}, need {want}")
        picked.extend(
            {"id": item["id"], "corpus": corpus, "stratum": stratum}
            for item in rng.sample(pool, want)
        )
    return picked


def main() -> None:
    """Build the 30-item stratified calibration sample and write it to OUTPUT.

    Reads the FastAPI and K8s golden sets, draws the per-stratum targets with
    a generator seeded from SEED, tops the sample up with SPARE_TOTAL extra
    K8s items, and writes the result (with the current git HEAD SHA) as JSON.

    Raises:
        SystemExit: If a stratum or the spare pool is too small, or if the
            final sample does not contain exactly 30 items.
        subprocess.CalledProcessError: If ``git rev-parse HEAD`` fails.
    """
    # Strata eligible for spare slots. Original target was simple_w_condition
    # + multi_hop; expanded to include comparison and false_premise because
    # the K8s golden set has only 4 simple_w_condition and 6 multi_hop items,
    # of which the targets already consume 7, leaving only 3 in the original
    # pool. Adding comparison/false_premise gives enough headroom for 4 spares.
    # The same tuple feeds the "notes" field so the two cannot drift apart.
    spare_strata = ("simple_w_condition", "multi_hop", "comparison", "false_premise")

    rng = random.Random(SEED)
    # JSON is read as UTF-8 explicitly rather than via the locale default.
    fastapi = json.loads(FASTAPI_PATH.read_text(encoding="utf-8"))
    k8s = json.loads(K8S_PATH.read_text(encoding="utf-8"))["questions"]

    # NOTE: the order of rng.sample calls (FastAPI strata, K8s strata, spares)
    # determines the output for a given seed; do not reorder.
    fastapi_pools: dict[str, list[dict]] = {}
    for q in fastapi:
        fastapi_pools.setdefault(q["category"], []).append(q)
    selected = _draw_strata(rng, fastapi_pools, FASTAPI_TARGETS, "fastapi", "FastAPI")

    k8s_pools: dict[str, list[dict]] = {}
    for q in k8s:
        k8s_pools.setdefault(q.get("question_type", "?"), []).append(q)
    selected.extend(_draw_strata(rng, k8s_pools, K8S_TARGETS, "k8s", "K8s"))

    # Spare slots: any not-yet-selected K8s item from the eligible strata.
    selected_ids = {s["id"] for s in selected}
    spare_pool = [
        q
        for q in k8s
        if q.get("question_type") in spare_strata and q["id"] not in selected_ids
    ]
    if len(spare_pool) < SPARE_TOTAL:
        raise SystemExit(
            f"Spare pool exhausted: have {len(spare_pool)}, need {SPARE_TOTAL}"
        )
    selected.extend(
        {
            "id": q["id"],
            "corpus": "k8s",
            "stratum": f"spare_{q['question_type']}",
        }
        for q in rng.sample(spare_pool, SPARE_TOTAL)
    )

    if len(selected) != 30:
        raise SystemExit(f"Expected 30 items; got {len(selected)}")

    # Record the commit the sample was generated from.
    sha = subprocess.check_output(
        ["git", "rev-parse", "HEAD"], cwd=REPO, text=True
    ).strip()

    spare_desc = ", ".join(spare_strata)
    out = {
        "version": "v1",
        "system_config_git_sha": sha,
        "sample_seed": SEED,
        "notes": (
            "30-item stratified calibration set per the design doc. "
            f"Spare slots filled from K8s {spare_desc} "
            "(typically highest-variance R@5 strata)."
        ),
        # Sorted so the committed file is stable regardless of draw order.
        "items": sorted(selected, key=lambda s: (s["corpus"], s["stratum"], s["id"])),
    }
    OUTPUT.write_text(json.dumps(out, indent=2) + "\n", encoding="utf-8")
    print(f"Wrote {OUTPUT} with {len(selected)} items; git_sha={sha[:12]}")


if __name__ == "__main__":
    main()