Spaces:
Sleeping
Sleeping
"""Plan 4A probe: GPT-4o (full, not mini) on 5 of the 14 v1.1.1 unchanged
items, using the v1.1.1 production prompt (paraphrase recency clause
included).

Items (gold=2/pred=1 unchanged after v1.1.1 intervention):
  - k8s_006, k8s_018 — the 2/5 that didn't shift in the 3A 5-item probe.
                       We already have GPT-4o-mini's reasoning on these
                       WITH the intervention; GPT-4o on the same prompt
                       is a clean A/B at fixed prompt, varying model.
  - q011, q012       — fastapi residuals.
  - k8s_001          — k8s residual where Haiku also disagreed (Haiku
                       scored 1, gold 2).

Diagnostic question: does a stronger model handle the residual at the
same v1.1.1 prompt?
  - GPT-4o scores 2 on most → residual is small-model-specific;
    v1.2 fix #3 (per-dim exclusion / stronger model on completeness)
    gets clean empirical support.
  - GPT-4o also scores 1 → rubric is under-specified for whatever
    failure mode these items hit; v1.2 needs additional rubric anchoring,
    not just judge-membership tuning.

Run:
    OPENAI_API_KEY=... python scripts/_dev/probe_4a_gpt4o_full.py
"""
| from __future__ import annotations | |
| import asyncio | |
| import json | |
| import sys | |
| from pathlib import Path | |
| REPO = Path(__file__).resolve().parents[2] | |
| sys.path.insert(0, str(REPO)) | |
| from agent_bench.agents.orchestrator import AgentResponse, SourceReference # noqa: E402 | |
| from agent_bench.core.provider import OpenAIProvider # noqa: E402 | |
| from agent_bench.core.types import TokenUsage # noqa: E402 | |
| from agent_bench.evaluation.harness import GoldenQuestion # noqa: E402 | |
| from agent_bench.evaluation.judges.base import Rubric # noqa: E402 | |
| from agent_bench.evaluation.judges.completeness import CompletenessJudge # noqa: E402 | |
# Ids of the calibration items re-scored by this probe.
ITEMS = ["k8s_006", "k8s_018", "q011", "q012", "k8s_001"]

# Model identifier handed to both the provider and the judge.
GPT4O_FULL = "gpt-4o-2024-08-06"

# Prior scores (gpt-4o-mini under v1.1.1 prompt, full-26 re-run output):
# every probed item was scored 1.
PRIOR_GPT4O_MINI_V1_1_1 = dict.fromkeys(ITEMS, 1)

# Gold label for every probed item is 2.
GOLD = dict.fromkeys(ITEMS, 2)
def _build_item_and_output(rec: dict) -> tuple[GoldenQuestion, AgentResponse]:
    """Turn one stored system-output record into the judge's two inputs.

    Args:
        rec: One record from the system-outputs JSON. ``item_id`` and
            ``answer`` are required; ``question``, ``category``,
            ``reference_answer``, ``source_snippets`` and ``sources`` are
            optional and fall back to the defaults used below.

    Returns:
        A ``(GoldenQuestion, AgentResponse)`` pair built from ``rec``.

    Raises:
        KeyError: If ``rec`` has no ``item_id`` or no ``answer``.
    """
    # Fields the record does not carry are filled with fixed placeholders
    # (no keywords/sources expected, "easy", no calculator).
    golden = GoldenQuestion(
        id=rec["item_id"],
        question=rec.get("question", ""),
        expected_answer_keywords=[],
        expected_sources=[],
        category=rec.get("category", "retrieval"),
        difficulty="easy",
        requires_calculator=False,
        reference_answer=rec.get("reference_answer", ""),
        source_snippets=rec.get("source_snippets", []),
    )

    answer_text = rec["answer"]
    cited = [SourceReference(source=name) for name in rec.get("sources", [])]
    # The answer is replayed from disk, so usage and latency are zeroed.
    zero_usage = TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0.0)
    replayed = AgentResponse(
        answer=answer_text,
        sources=cited,
        iterations=1,
        usage=zero_usage,
        latency_ms=0,
    )
    return golden, replayed
def _agreement_marker(score: object, prior: int) -> str:
    """Describe how the GPT-4o score compares with the prior gpt-4o-mini score."""
    # NOTE(review): the leading "β" in these strings looks like a mis-decoded
    # symbol; the text is kept byte-for-byte — confirm the intended glyphs.
    if isinstance(score, int) and score > prior:
        return f"β GPT-4o disagrees with mini (mini={prior}, 4o={score})"
    if score == prior:
        return f"= GPT-4o agrees with mini ({score})"
    if isinstance(score, int):
        return f"β GPT-4o below mini ({score})"
    # A non-integer score (e.g. None) is not "below mini"; report it as such.
    return f"? GPT-4o returned a non-integer score ({score!r})"


def _verdict_lines(n_correct: int, n_items: int) -> list[str]:
    """Return the interpretation text for ``n_correct`` gold matches out of ``n_items``."""
    # Thresholds are 4/5 and 2/5 of the probed items, compared in integer
    # arithmetic so they are exactly ">= 4" and ">= 2" for five items.
    if 5 * n_correct >= 4 * n_items:
        return [
            "β Residual is small-model-specific. v1.2 fix #3 (per-dim exclusion or",
            " stronger model on completeness) has clean empirical support.",
        ]
    if 5 * n_correct >= 2 * n_items:
        return [
            "β Mixed: GPT-4o handles some residuals but not all. Some failure modes",
            " are model-class limited; others may be rubric-limited.",
        ]
    return [
        "β Rubric is the limiting factor. Even GPT-4o struggles on these items",
        " with the v1.1.1 prompt. v1.2 needs rubric anchoring/simplification,",
        " not just judge-membership tuning.",
    ]


def _write_jsonl(path: Path, rows: list[dict]) -> None:
    """Write ``rows`` to ``path`` as one JSON object per line, creating parent dirs."""
    # Create the directory first so results are not lost when it is absent.
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        f.writelines(json.dumps(row) + "\n" for row in rows)


async def main() -> None:
    """Re-score the probe items with GPT-4o and report agreement with gold and mini.

    Loads the completeness rubric and the stored system outputs, scores every
    id in ``ITEMS`` with a ``CompletenessJudge`` backed by
    ``OpenAIProvider(model=GPT4O_FULL)``, prints a per-item comparison and a
    summary verdict, and writes one JSON line per item to the probe artifact
    under ``measurements/``.

    Raises:
        KeyError: If any id in ``ITEMS`` is absent from the stored outputs.
            This is checked before any judge call is made.
    """
    rubric = Rubric.from_markdown_file(
        REPO / "agent_bench/evaluation/rubrics/completeness.md"
    )
    outputs = json.loads(
        (REPO / "results/calibration_v1_system_outputs.json").read_text(
            encoding="utf-8"
        )
    )
    by_id = {r["item_id"]: r for r in outputs}

    # Fail fast: a missing id would otherwise surface mid-loop, after earlier
    # items had already been sent to the judge.
    missing = [iid for iid in ITEMS if iid not in by_id]
    if missing:
        raise KeyError(f"item ids missing from calibration outputs: {missing}")
    n_items = len(ITEMS)

    provider = OpenAIProvider(model=GPT4O_FULL)
    judge = CompletenessJudge(
        judge_provider=provider, rubric=rubric, model_id=GPT4O_FULL
    )

    print("=" * 80)
    print(f"Plan 4A β GPT-4o full ({GPT4O_FULL}) on {n_items} v1.1.1-unchanged items")
    print("=" * 80)
    print("Same v1.1.1 production prompt (paraphrase recency clause active).")
    print(f"Prior gpt-4o-mini scores under v1.1.1: {PRIOR_GPT4O_MINI_V1_1_1}")
    print(f"Gold: {GOLD}\n")

    results: list[dict] = []
    total_cost = 0.0
    for iid in ITEMS:
        item, output = _build_item_and_output(by_id[iid])
        score_result = await judge.score(item, output)
        prior = PRIOR_GPT4O_MINI_V1_1_1[iid]
        gold = GOLD[iid]
        score = score_result.score

        marker = _agreement_marker(score, prior)
        correctness = "β matches gold" if score == gold else f"β vs gold={gold}"
        # Tolerate a missing reasoning string rather than crash on the slice.
        reasoning = score_result.reasoning or ""
        preview = reasoning[:300] + ("..." if len(reasoning) > 300 else "")

        print(f" {iid}: 4o={score} mini-prior={prior} gold={gold} {marker} {correctness}")
        print(f" reasoning: {preview}")
        print(f" evidence_quotes: {score_result.evidence_quotes}")
        print()

        row = score_result.model_dump()
        row["item_id"] = iid
        row["mini_prior_score"] = prior
        row["gold_score"] = gold
        results.append(row)
        total_cost += score_result.cost_usd

    n_correct = sum(1 for r in results if r["score"] == r["gold_score"])
    n_disagree_with_mini = sum(
        1 for r in results
        if isinstance(r["score"], int) and r["score"] != r["mini_prior_score"]
    )

    print("=" * 80)
    print(f"GPT-4o correct (matches gold): {n_correct}/{n_items}")
    print(f"GPT-4o disagrees with gpt-4o-mini-v1.1.1: {n_disagree_with_mini}/{n_items}")
    print(f"Total cost: ${total_cost:.4f}")
    print()
    for line in _verdict_lines(n_correct, n_items):
        print(line)

    out = REPO / "measurements/2026-05-06-4a-gpt4o-full-probe.jsonl"
    _write_jsonl(out, results)
    print(f"\nProbe artifact: {out}")


if __name__ == "__main__":
    # Script entry point: run the async probe to completion.
    asyncio.run(main())