agentbench/scripts/_dev/rerun_completeness_v1_1_1.py
"""Plan 3A follow-through: re-run gpt-4o-mini on completeness for all 26
calibration items with the v1.1.1 recency-positioned paraphrase clause now
permanent in CompletenessJudge.
Methodological note: only gpt-4o-mini is re-run. Haiku stays as control β€”
its v1.1 completeness predictions remain valid. This makes the v1.1.1
delta cleanly attributable to the intervention's effect on the affected
judge, not a confound from re-prompting both judges.
Outputs:
- results/calibration_v1_judge_jury_kappa_weighted_v1_1_1_members.jsonl
(Haiku rows copied from v1.1 sidecar; gpt-4o-mini rows fresh)
- results/calibration_v1_judge_jury_kappa_weighted_v1_1_1.json
(re-aggregated jury verdicts using fresh gpt-4o-mini + existing Haiku)
"""
from __future__ import annotations
import asyncio
import json
import sys
from pathlib import Path
REPO = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(REPO))
from agent_bench.agents.orchestrator import AgentResponse, SourceReference # noqa: E402
from agent_bench.core.provider import OpenAIProvider # noqa: E402
from agent_bench.core.types import TokenUsage # noqa: E402
from agent_bench.evaluation.harness import GoldenQuestion # noqa: E402
from agent_bench.evaluation.judges.base import Rubric # noqa: E402
from agent_bench.evaluation.judges.completeness import CompletenessJudge # noqa: E402
LABELS = REPO / "measurements/2026-05-04-judge-calibration-labels.jsonl"
SIDECAR_V1_1 = REPO / "results/calibration_v1_judge_jury_kappa_weighted_members.jsonl"
SYSTEM_OUTPUTS = REPO / "results/calibration_v1_system_outputs.json"
NEW_SIDECAR = REPO / "results/calibration_v1_judge_jury_kappa_weighted_v1_1_1_members.jsonl"
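
# For reference: each sidecar line is one judge-member row. The field names
# below come from the accesses in this script (judge_id, system_output_hash,
# item_id, score, cost_usd); the values are illustrative only.
#   {"judge_id": "claude-3-haiku_completeness", "system_output_hash": "ab12...",
#    "item_id": "q-001", "score": 4, "cost_usd": 0.0009, ...}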

def _build_item_and_output(rec: dict) -> tuple[GoldenQuestion, AgentResponse]:
    """Rebuild the (question, agent output) pair a judge scores from a stored record."""
    item = GoldenQuestion(
        id=rec["item_id"],
        question=rec.get("question", ""),
        expected_answer_keywords=[],
        expected_sources=[],
        category=rec.get("category", "retrieval"),
        difficulty="easy",
        requires_calculator=False,
        reference_answer=rec.get("reference_answer", ""),
        source_snippets=rec.get("source_snippets", []),
    )
    output = AgentResponse(
        answer=rec["answer"],
        sources=[SourceReference(source=s) for s in rec.get("sources", [])],
        iterations=1,
        # Usage/latency are placeholders: the stored outputs predate this
        # re-run, and these fields are not what the completeness judge scores.
        usage=TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0.0),
        latency_ms=0,
    )
    return item, output
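
# Illustrative shape of one stored system-output record, inferred from the
# key accesses above; the values are made up:
#   {"item_id": "q-001", "question": "...", "category": "retrieval",
#    "reference_answer": "...", "source_snippets": ["..."],
#    "answer": "...", "sources": ["docs/refunds.md"]}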

async def main() -> None:
    rubric = Rubric.from_markdown_file(
        REPO / "agent_bench/evaluation/rubrics/completeness.md"
    )
    outputs = json.loads(SYSTEM_OUTPUTS.read_text())
    # Load the Haiku completeness rows from the v1.1 sidecar (the control arm).
    # Keep only the latest row per (judge_id, system_output_hash) in case the
    # sidecar holds duplicates: later lines overwrite earlier ones in the dict.
    haiku_completeness_rows: list[dict] = []
    by_hash_latest: dict[tuple[str, str], dict] = {}
    for line in SIDECAR_V1_1.read_text().splitlines():
        if not line.strip():
            continue
        rec = json.loads(line)
        by_hash_latest[(rec["judge_id"], rec["system_output_hash"])] = rec
    for rec in by_hash_latest.values():
        if "haiku" in rec["judge_id"].lower() and rec["judge_id"].endswith("_completeness"):
            haiku_completeness_rows.append(rec)
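    # Sanity check: the point of this re-run is to pair fresh gpt-4o-mini rows
    # with the existing Haiku control rows, so an empty control arm means the
    # v1.1 sidecar path or the judge_id naming convention is wrong.
    if not haiku_completeness_rows:
        raise SystemExit(f"no Haiku completeness rows found in {SIDECAR_V1_1}")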
    # Run the gpt-4o-mini CompletenessJudge with the v1.1.1 prompt on every
    # item that has a stored system output (= 30 items).
    provider = OpenAIProvider(model="gpt-4o-mini-2024-07-18")
    judge = CompletenessJudge(
        judge_provider=provider, rubric=rubric, model_id="gpt-4o-mini-2024-07-18"
    )
    print(f"Running gpt-4o-mini CompletenessJudge (v1.1.1 prompt) on {len(outputs)} items")
    fresh_gpt_rows: list[dict] = []
    for rec in outputs:
        item, output = _build_item_and_output(rec)
        result = await judge.score(item, output)
        row = result.model_dump()
        row["item_id"] = item.id
        fresh_gpt_rows.append(row)
        print(f"  {item.id:<10} score={result.score} cost=${result.cost_usd:.4f}")
    total_cost = sum(r["cost_usd"] for r in fresh_gpt_rows)
    print(f"\nTotal cost: ${total_cost:.4f}")
    # Write the v1.1.1 sidecar: Haiku completeness rows (unchanged from v1.1)
    # plus the fresh gpt-4o-mini completeness rows.
    with NEW_SIDECAR.open("w") as f:
        for r in haiku_completeness_rows:
            f.write(json.dumps(r) + "\n")
        for r in fresh_gpt_rows:
            f.write(json.dumps(r) + "\n")
    print(f"Wrote {len(haiku_completeness_rows)} Haiku + {len(fresh_gpt_rows)} GPT rows to {NEW_SIDECAR}")

if __name__ == "__main__":
    asyncio.run(main())