Spaces:

Nomearod
/

agentbench

Running

File size: 6,108 Bytes

504a35c

"""Plan 4A probe: GPT-4o (full, not mini) on 5 of the 14 v1.1.1 unchanged
items, using the v1.1.1 production prompt (paraphrase recency clause
included).

Items (gold=2/pred=1 unchanged after v1.1.1 intervention):
  - k8s_006, k8s_018  — the 2/5 that didn't shift in the 3A 5-item probe.
                         We already have GPT-4o-mini's reasoning on these
                         WITH the intervention; GPT-4o on the same prompt
                         is a clean A/B at fixed prompt, varying model.
  - q011, q012        — fastapi residuals.
  - k8s_001           — k8s residual where Haiku also disagreed (Haiku
                         scored 1, gold 2).

Diagnostic question: does a stronger model handle the residual at the
same v1.1.1 prompt?

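  - GPT-4o matches gold (scores 2) on at least 4 of the 5 → residual is
    small-model-specific; v1.2 fix #3 (per-dim exclusion / stronger model
    on completeness) gets clean empirical support.
  - GPT-4o matches gold on 2 or 3 of the 5 → mixed: some failure modes
    are model-class limited, others may be rubric-limited.
  - GPT-4o matches gold on at most 1 of the 5 → rubric is under-specified
    for whatever failure mode these items hit; v1.2 needs additional
    rubric anchoring, not just judge-membership tuning.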

Run:
    OPENAI_API_KEY=... python scripts/_dev/probe_4a_gpt4o_full.py
"""

from __future__ import annotations

import asyncio
import json
import sys
from pathlib import Path

REPO = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(REPO))

from agent_bench.agents.orchestrator import AgentResponse, SourceReference  # noqa: E402
from agent_bench.core.provider import OpenAIProvider  # noqa: E402
from agent_bench.core.types import TokenUsage  # noqa: E402
from agent_bench.evaluation.harness import GoldenQuestion  # noqa: E402
from agent_bench.evaluation.judges.base import Rubric  # noqa: E402
from agent_bench.evaluation.judges.completeness import CompletenessJudge  # noqa: E402

# Judge model under test (the full GPT-4o snapshot, not the mini variant).
GPT4O_FULL = "gpt-4o-2024-08-06"

# Calibration item ids re-scored by this probe, in reporting order.
ITEMS = [
    "k8s_006",
    "k8s_018",
    "q011",
    "q012",
    "k8s_001",
]

# Every probe item scored 1 with gpt-4o-mini under the v1.1.1 prompt
# (full-26 re-run output), while the gold label for each is 2.
PRIOR_GPT4O_MINI_V1_1_1 = dict.fromkeys(ITEMS, 1)
GOLD = dict.fromkeys(ITEMS, 2)


def _build_item_and_output(rec: dict) -> tuple[GoldenQuestion, AgentResponse]:
    """Turn one stored system-output record into the judge's two inputs.

    Args:
        rec: A record from the calibration outputs file. ``item_id`` and
            ``answer`` are required keys; ``question``, ``category``,
            ``reference_answer``, ``source_snippets`` and ``sources`` are
            optional and fall back to empty/default values.

    Returns:
        A ``(GoldenQuestion, AgentResponse)`` pair. Fields the record does
        not carry (keywords, expected sources, difficulty, token usage,
        latency, iteration count) are filled with fixed placeholder values.

    Raises:
        KeyError: If ``rec`` lacks ``item_id`` or ``answer``.
    """
    question_fields = {
        "id": rec["item_id"],
        "question": rec.get("question", ""),
        "expected_answer_keywords": [],
        "expected_sources": [],
        "category": rec.get("category", "retrieval"),
        "difficulty": "easy",
        "requires_calculator": False,
        "reference_answer": rec.get("reference_answer", ""),
        "source_snippets": rec.get("source_snippets", []),
    }
    golden = GoldenQuestion(**question_fields)

    # The answer is looked up before the sources are wrapped, and only after
    # the question has been built, matching the record-access order above.
    answer_text = rec["answer"]
    source_refs = []
    for src in rec.get("sources", []):
        source_refs.append(SourceReference(source=src))

    # No usage or latency was recorded with the stored answer, so zeros
    # stand in for both.
    zero_usage = TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0.0)
    response = AgentResponse(
        answer=answer_text,
        sources=source_refs,
        iterations=1,
        usage=zero_usage,
        latency_ms=0,
    )
    return golden, response


def _agreement_marker(score: object, prior: int) -> str:
    """Describe how the GPT-4o score relates to the prior gpt-4o-mini score."""
    if isinstance(score, int) and score > prior:
        return f"→ GPT-4o disagrees with mini (mini={prior}, 4o={score})"
    if score == prior:
        return f"= GPT-4o agrees with mini ({score})"
    if isinstance(score, int):
        return f"→ GPT-4o below mini ({score})"
    # A non-integer score (e.g. None) cannot be ordered against the prior;
    # say so instead of reporting it as "below mini".
    return f"? GPT-4o returned no comparable score ({score!r})"


async def main() -> None:
    """Score the probe items with the GPT-4o completeness judge and report.

    Loads the completeness rubric and the stored calibration system outputs,
    scores every id in ``ITEMS`` with a ``CompletenessJudge`` backed by
    ``GPT4O_FULL``, prints a per-item comparison against the prior
    gpt-4o-mini score and the gold score, prints a summary verdict, and
    writes one JSON object per item to a JSONL artifact under
    ``measurements/``.

    Raises:
        KeyError: If any id in ``ITEMS`` is absent from the calibration
            outputs file. Raised before any judge call is made.
    """
    rubric = Rubric.from_markdown_file(
        REPO / "agent_bench/evaluation/rubrics/completeness.md"
    )
    # Decode explicitly as UTF-8 so the read does not depend on the platform
    # default encoding.
    outputs = json.loads(
        (REPO / "results/calibration_v1_system_outputs.json").read_text(
            encoding="utf-8"
        )
    )
    by_id = {r["item_id"]: r for r in outputs}

    # Fail fast: a missing item should abort before any judge call is paid
    # for, not part-way through the loop.
    missing = [iid for iid in ITEMS if iid not in by_id]
    if missing:
        raise KeyError(f"probe items missing from calibration outputs: {missing}")

    provider = OpenAIProvider(model=GPT4O_FULL)
    judge = CompletenessJudge(
        judge_provider=provider, rubric=rubric, model_id=GPT4O_FULL
    )

    n_items = len(ITEMS)
    print("=" * 80)
    print(f"Plan 4A — GPT-4o full ({GPT4O_FULL}) on {n_items} v1.1.1-unchanged items")
    print("=" * 80)
    print("Same v1.1.1 production prompt (paraphrase recency clause active).")
    print(f"Prior gpt-4o-mini scores under v1.1.1: {PRIOR_GPT4O_MINI_V1_1_1}")
    print(f"Gold:                                    {GOLD}\n")

    results: list[dict] = []
    total_cost = 0.0
    for iid in ITEMS:
        item, output = _build_item_and_output(by_id[iid])
        score_result = await judge.score(item, output)
        prior = PRIOR_GPT4O_MINI_V1_1_1[iid]
        gold = GOLD[iid]
        score = score_result.score
        marker = _agreement_marker(score, prior)
        correctness = "✓ matches gold" if score == gold else f"✗ vs gold={gold}"
        print(f"  {iid}: 4o={score}  mini-prior={prior}  gold={gold}  {marker}  {correctness}")
        # Truncate long reasoning to 300 characters for console readability;
        # the full text is kept in the artifact row below.
        reasoning = score_result.reasoning
        ellipsis = "..." if len(reasoning) > 300 else ""
        print(f"    reasoning: {reasoning[:300]}{ellipsis}")
        print(f"    evidence_quotes: {score_result.evidence_quotes}")
        print()
        row = score_result.model_dump()
        row["item_id"] = iid
        row["mini_prior_score"] = prior
        row["gold_score"] = gold
        results.append(row)
        total_cost += score_result.cost_usd

    n_correct = sum(1 for r in results if r["score"] == r["gold_score"])
    # Only integer scores count as a disagreement with mini.
    n_disagree_with_mini = sum(
        1 for r in results
        if isinstance(r["score"], int) and r["score"] != r["mini_prior_score"]
    )
    print("=" * 80)
    print(f"GPT-4o correct (matches gold): {n_correct}/{n_items}")
    print(f"GPT-4o disagrees with gpt-4o-mini-v1.1.1: {n_disagree_with_mini}/{n_items}")
    print(f"Total cost: ${total_cost:.4f}")
    print()
    # Verdict thresholds are absolute counts chosen for the 5-item probe set.
    if n_correct >= 4:
        print("→ Residual is small-model-specific. v1.2 fix #3 (per-dim exclusion or")
        print("  stronger model on completeness) has clean empirical support.")
    elif n_correct >= 2:
        print("→ Mixed: GPT-4o handles some residuals but not all. Some failure modes")
        print("  are model-class limited; others may be rubric-limited.")
    else:
        print("→ Rubric is the limiting factor. Even GPT-4o struggles on these items")
        print("  with the v1.1.1 prompt. v1.2 needs rubric anchoring/simplification,")
        print("  not just judge-membership tuning.")

    out = REPO / "measurements/2026-05-06-4a-gpt4o-full-probe.jsonl"
    # Create the output directory if needed so already-paid-for results are
    # not lost to a missing folder.
    out.parent.mkdir(parents=True, exist_ok=True)
    with out.open("w", encoding="utf-8") as f:
        f.writelines(json.dumps(r) + "\n" for r in results)
    print(f"\nProbe artifact: {out}")


# Script entry point: only when executed directly (not on import), drive the
# async probe to completion via asyncio.run.
if __name__ == "__main__":
    asyncio.run(main())