Spaces:

Nomearod
/

agentbench

Running

App Files Files Community

agentbench / scripts /_dev /sample_calibration_v1.py

Nomearod

feat(calibration): 30-item stratified calibration_v1 sample

8ef480a 4 days ago

raw

history blame contribute delete

3.89 kB

	"""One-shot stratified sampler for calibration_v1.json. Run once; output
	is committed to agent_bench/evaluation/datasets/calibration_v1.json.

	The stratification target is in docs/plans/2026-05-04-judge-layer-v1-design.md
	under Calibration Methodology > Stratified sampling.
	"""

	from __future__ import annotations

	import json
	import random
	import subprocess
	from pathlib import Path

	# Repository root: the third ancestor of this script's resolved path.
	REPO = Path(__file__).resolve().parents[2]

	# Input golden sets and the output calibration file, all relative to REPO.
	FASTAPI_PATH = REPO.joinpath(
	    "agent_bench", "evaluation", "datasets", "tech_docs_golden.json"
	)
	K8S_PATH = REPO.joinpath(
	    "agent_bench", "evaluation", "datasets", "k8s_golden.json"
	)
	OUTPUT = REPO.joinpath(
	    "agent_bench", "evaluation", "datasets", "calibration_v1.json"
	)

	# Fixed RNG seed (date-derived: 2026-05-04) so reruns pick the same sample.
	SEED = 2026_05_04

	# Items to draw per stratum. Insertion order matters: main() samples the
	# strata in this order from a single seeded RNG.
	FASTAPI_TARGETS = {
	    "retrieval": 5,
	    "calculation": 1,
	    "out_of_scope": 2,
	}
	K8S_TARGETS = {
	    "simple": 4,
	    "simple_w_condition": 3,
	    "comparison": 3,
	    "multi_hop": 4,
	    "false_premise": 3,
	    "set": 1,
	}

	# Extra K8s items drawn on top of the per-stratum targets
	# (8 FastAPI + 18 K8s + 4 spare = 30 items in total).
	SPARE_TOTAL = 4


	def _draw_strata(
	    rng: random.Random,
	    pools: dict[str, list[dict]],
	    targets: dict[str, int],
	    *,
	    corpus: str,
	    label: str,
	) -> list[dict]:
	    """Sample the target number of questions from each stratum pool.

	    Strata are visited in ``targets`` order with one ``rng.sample`` call
	    each, so the result is reproducible for a given RNG state.

	    Args:
	        rng: Seeded random generator shared by all sampling steps.
	        pools: Questions grouped by stratum name.
	        targets: Number of questions to draw per stratum.
	        corpus: Value stored under ``"corpus"`` in each returned record.
	        label: Corpus name used in the shortage error message.

	    Returns:
	        One ``{"id", "corpus", "stratum"}`` record per sampled question.

	    Raises:
	        SystemExit: If a stratum holds fewer questions than its target.
	    """
	    records: list[dict] = []
	    for stratum, want in targets.items():
	        pool = pools.get(stratum, [])
	        if len(pool) < want:
	            raise SystemExit(f"{label} {stratum}: have {len(pool)}, need {want}")
	        records.extend(
	            {"id": q["id"], "corpus": corpus, "stratum": stratum}
	            for q in rng.sample(pool, want)
	        )
	    return records


	def main() -> None:
	    """Build the stratified calibration sample and write it to ``OUTPUT``.

	    Reads the FastAPI golden set (a JSON list grouped by ``"category"``)
	    and the K8s golden set (the ``"questions"`` list, grouped by
	    ``"question_type"``), draws ``FASTAPI_TARGETS`` / ``K8S_TARGETS`` items
	    per stratum plus ``SPARE_TOTAL`` extra K8s items, and writes the
	    selection as JSON together with the seed and the current git HEAD SHA.

	    Raises:
	        SystemExit: If any stratum or the spare pool is too small, or the
	            final selection does not contain exactly 30 items.
	        subprocess.CalledProcessError: If ``git rev-parse HEAD`` fails.
	    """
	    rng = random.Random(SEED)

	    # Explicit UTF-8 so parsing does not depend on the platform locale.
	    fastapi = json.loads(FASTAPI_PATH.read_text(encoding="utf-8"))
	    k8s = json.loads(K8S_PATH.read_text(encoding="utf-8"))["questions"]

	    # FastAPI strata are sampled first; the order of rng calls is part of
	    # what makes the output deterministic.
	    by_cat: dict[str, list[dict]] = {}
	    for q in fastapi:
	        by_cat.setdefault(q["category"], []).append(q)
	    selected: list[dict] = _draw_strata(
	        rng, by_cat, FASTAPI_TARGETS, corpus="fastapi", label="FastAPI"
	    )

	    # K8s questions without a question_type land in a "?" bucket that no
	    # target refers to, so they are never sampled.
	    by_qt: dict[str, list[dict]] = {}
	    for q in k8s:
	        by_qt.setdefault(q.get("question_type", "?"), []).append(q)
	    selected += _draw_strata(rng, by_qt, K8S_TARGETS, corpus="k8s", label="K8s")

	    # Spare slots — fill from highest-variance K8s strata. Original target
	    # was simple_w_condition + multi_hop; expanded to include comparison and
	    # false_premise because the K8s golden set has only 4 simple_w_condition
	    # and 6 multi_hop items, of which the targets already consumed 7,
	    # leaving only 3 in the original pool. Adding comparison/false_premise
	    # gives enough headroom for 4 spares.
	    spare_strata = ("simple_w_condition", "multi_hop", "comparison", "false_premise")
	    selected_ids = {s["id"] for s in selected}
	    spare_pool: list[dict] = [
	        q
	        for q in k8s
	        if q.get("question_type") in spare_strata and q["id"] not in selected_ids
	    ]
	    if len(spare_pool) < SPARE_TOTAL:
	        raise SystemExit(
	            f"Spare pool exhausted: have {len(spare_pool)}, need {SPARE_TOTAL}"
	        )
	    selected.extend(
	        {
	            "id": q["id"],
	            "corpus": "k8s",
	            "stratum": f"spare_{q['question_type']}",
	        }
	        for q in rng.sample(spare_pool, SPARE_TOTAL)
	    )

	    # Guard against the target tables drifting away from the 30-item design.
	    if len(selected) != 30:
	        raise SystemExit(f"Expected 30 items; got {len(selected)}")

	    # Record the commit the sample was generated against.
	    sha = subprocess.check_output(
	        ["git", "rev-parse", "HEAD"], cwd=REPO, text=True
	    ).strip()

	    out = {
	        "version": "v1",
	        "system_config_git_sha": sha,
	        "sample_seed": SEED,
	        # The notes name all four spare strata so they match the
	        # "spare_*" stratum labels that can appear in "items".
	        "notes": (
	            "30-item stratified calibration set per the design doc. "
	            "Spare slots filled from K8s simple_w_condition, multi_hop, "
	            "comparison and false_premise: the first two are typically the "
	            "highest-variance R@5 strata; the last two were added because "
	            "the first two alone leave too few unselected items."
	        ),
	        "items": sorted(selected, key=lambda s: (s["corpus"], s["stratum"], s["id"])),
	    }
	    OUTPUT.write_text(json.dumps(out, indent=2) + "\n", encoding="utf-8")
	    print(f"Wrote {OUTPUT} with {len(selected)} items; git_sha={sha[:12]}")


	# Run the sampler only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()