Spaces:
Running
Running
| """One-shot stratified sampler for calibration_v1.json. Run once; output | |
| is committed to agent_bench/evaluation/datasets/calibration_v1.json. | |
| The stratification target is in docs/plans/2026-05-04-judge-layer-v1-design.md | |
| under Calibration Methodology > Stratified sampling. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import random | |
| import subprocess | |
| from pathlib import Path | |
| REPO = Path(__file__).resolve().parents[2] | |
| FASTAPI_PATH = REPO / "agent_bench/evaluation/datasets/tech_docs_golden.json" | |
| K8S_PATH = REPO / "agent_bench/evaluation/datasets/k8s_golden.json" | |
| OUTPUT = REPO / "agent_bench/evaluation/datasets/calibration_v1.json" | |
| SEED = 20260504 # date-derived; deterministic across runs | |
| FASTAPI_TARGETS = {"retrieval": 5, "calculation": 1, "out_of_scope": 2} | |
| K8S_TARGETS = { | |
| "simple": 4, | |
| "simple_w_condition": 3, | |
| "comparison": 3, | |
| "multi_hop": 4, | |
| "false_premise": 3, | |
| "set": 1, | |
| } | |
| SPARE_TOTAL = 4 | |
| def main() -> None: | |
| rng = random.Random(SEED) | |
| fastapi = json.loads(FASTAPI_PATH.read_text()) | |
| k8s = json.loads(K8S_PATH.read_text())["questions"] | |
| selected: list[dict] = [] | |
| by_cat: dict[str, list[dict]] = {} | |
| for q in fastapi: | |
| by_cat.setdefault(q["category"], []).append(q) | |
| for cat, n in FASTAPI_TARGETS.items(): | |
| pool = by_cat.get(cat, []) | |
| if len(pool) < n: | |
| raise SystemExit(f"FastAPI {cat}: have {len(pool)}, need {n}") | |
| sample = rng.sample(pool, n) | |
| for q in sample: | |
| selected.append({"id": q["id"], "corpus": "fastapi", "stratum": cat}) | |
| by_qt: dict[str, list[dict]] = {} | |
| for q in k8s: | |
| by_qt.setdefault(q.get("question_type", "?"), []).append(q) | |
| for qt, n in K8S_TARGETS.items(): | |
| pool = by_qt.get(qt, []) | |
| if len(pool) < n: | |
| raise SystemExit(f"K8s {qt}: have {len(pool)}, need {n}") | |
| sample = rng.sample(pool, n) | |
| for q in sample: | |
| selected.append({"id": q["id"], "corpus": "k8s", "stratum": qt}) | |
| # Spare slots — fill from highest-variance K8s strata. Original target | |
| # was simple_w_condition + multi_hop; expanded to include comparison and | |
| # false_premise because the K8s golden set has only 4 simple_w_condition | |
| # and 6 multi_hop items, of which Targets already consumed 7, leaving | |
| # only 3 in the original pool. Adding comparison/false_premise gives | |
| # enough headroom for 4 spares. | |
| selected_ids = {s["id"] for s in selected} | |
| spare_pool: list[dict] = [ | |
| q | |
| for q in k8s | |
| if q.get("question_type") | |
| in ("simple_w_condition", "multi_hop", "comparison", "false_premise") | |
| and q["id"] not in selected_ids | |
| ] | |
| if len(spare_pool) < SPARE_TOTAL: | |
| raise SystemExit( | |
| f"Spare pool exhausted: have {len(spare_pool)}, need {SPARE_TOTAL}" | |
| ) | |
| spare = rng.sample(spare_pool, SPARE_TOTAL) | |
| for q in spare: | |
| selected.append( | |
| { | |
| "id": q["id"], | |
| "corpus": "k8s", | |
| "stratum": f"spare_{q['question_type']}", | |
| } | |
| ) | |
| if len(selected) != 30: | |
| raise SystemExit(f"Expected 30 items; got {len(selected)}") | |
| sha = subprocess.check_output( | |
| ["git", "rev-parse", "HEAD"], cwd=REPO, text=True | |
| ).strip() | |
| out = { | |
| "version": "v1", | |
| "system_config_git_sha": sha, | |
| "sample_seed": SEED, | |
| "notes": ( | |
| "30-item stratified calibration set per the design doc. " | |
| "Spare slots filled from K8s simple_w_condition and multi_hop " | |
| "(typically highest-variance R@5 strata)." | |
| ), | |
| "items": sorted(selected, key=lambda s: (s["corpus"], s["stratum"], s["id"])), | |
| } | |
| OUTPUT.write_text(json.dumps(out, indent=2) + "\n") | |
| print(f"Wrote {OUTPUT} with {len(selected)} items; git_sha={sha[:12]}") | |
| if __name__ == "__main__": | |
| main() | |