File size: 4,623 Bytes
ab0e054
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
"""Plan 3A follow-through: re-run gpt-4o-mini on completeness for all 26
calibration items with the v1.1.1 recency-positioned paraphrase clause now
permanent in CompletenessJudge.

Methodological note: only gpt-4o-mini is re-run. Haiku stays as control —
its v1.1 completeness predictions remain valid. This makes the v1.1.1
delta cleanly attributable to the intervention's effect on the affected
judge, not a confound from re-prompting both judges.

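Outputs:
  - results/calibration_v1_judge_jury_kappa_weighted_v1_1_1_members.jsonl
    (Haiku rows copied from the v1.1 sidecar; gpt-4o-mini rows fresh).
    Written directly by this script.
  - results/calibration_v1_judge_jury_kappa_weighted_v1_1_1.json
    (re-aggregated jury verdicts using fresh gpt-4o-mini + existing Haiku).
    NOTE(review): no code in this script writes this file; presumably it is
    produced by a separate aggregation step run afterwards — confirm.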
"""

from __future__ import annotations

import asyncio
import json
import sys
from pathlib import Path

# Third ancestor of this file's resolved path: parents[0] is the directory
# holding the script, so the script sits two directories below REPO.
REPO = Path(__file__).resolve().parents[2]
# Put REPO at the front of the import path so the `agent_bench` imports below
# resolve when this file is run directly as a script (hence the E402 noqa's).
sys.path.insert(0, str(REPO))

from agent_bench.agents.orchestrator import AgentResponse, SourceReference  # noqa: E402
from agent_bench.core.provider import OpenAIProvider  # noqa: E402
from agent_bench.core.types import TokenUsage  # noqa: E402
from agent_bench.evaluation.harness import GoldenQuestion  # noqa: E402
from agent_bench.evaluation.judges.base import Rubric  # noqa: E402
from agent_bench.evaluation.judges.completeness import CompletenessJudge  # noqa: E402

# Calibration labels file. NOTE(review): not referenced anywhere else in the
# visible part of this script — confirm whether it is still needed.
LABELS = REPO / "measurements/2026-05-04-judge-calibration-labels.jsonl"
# Existing v1.1 per-judge JSONL sidecar; main() reads the Haiku rows from it.
SIDECAR_V1_1 = REPO / "results/calibration_v1_judge_jury_kappa_weighted_members.jsonl"
# JSON collection of system-output records; main() iterates it and scores
# each record with the gpt-4o-mini judge.
SYSTEM_OUTPUTS = REPO / "results/calibration_v1_system_outputs.json"
# Destination JSONL written by main(): copied Haiku rows + fresh gpt-4o-mini rows.
NEW_SIDECAR = REPO / "results/calibration_v1_judge_jury_kappa_weighted_v1_1_1_members.jsonl"


def _build_item_and_output(rec: dict) -> tuple[GoldenQuestion, AgentResponse]:
    """Turn one system-output record into a ``(GoldenQuestion, AgentResponse)`` pair.

    Args:
        rec: A single record from the system-outputs file. ``"item_id"`` and
            ``"answer"`` are required (a missing key raises ``KeyError``);
            ``"question"``, ``"category"``, ``"reference_answer"``,
            ``"source_snippets"`` and ``"sources"`` are optional.

    Returns:
        The question and the response, in that order.

    Optional question fields fall back to ``""`` / ``"retrieval"`` / ``[]``.
    The remaining fields are set to fixed values rather than read from
    ``rec`` (empty keyword and source expectations, ``"easy"`` difficulty,
    no calculator, one iteration, zero usage and latency).
    NOTE(review): presumably the completeness judge ignores those — confirm.
    """
    question_fields = {
        "id": rec["item_id"],
        "question": rec.get("question", ""),
        "expected_answer_keywords": [],
        "expected_sources": [],
        "category": rec.get("category", "retrieval"),
        "difficulty": "easy",
        "requires_calculator": False,
        "reference_answer": rec.get("reference_answer", ""),
        "source_snippets": rec.get("source_snippets", []),
    }
    golden = GoldenQuestion(**question_fields)

    # Evaluate in the same order as the keyword arguments: answer, sources, usage.
    answer_text = rec["answer"]
    source_refs = [SourceReference(source=name) for name in rec.get("sources", [])]
    zero_usage = TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0.0)
    response = AgentResponse(
        answer=answer_text,
        sources=source_refs,
        iterations=1,
        usage=zero_usage,
        latency_ms=0,
    )
    return golden, response


def _latest_haiku_completeness_rows(sidecar: Path) -> list[dict]:
    """Return the Haiku ``*_completeness`` rows of a JSONL sidecar, deduplicated.

    Rows are keyed by ``(judge_id, system_output_hash)``; when a key occurs on
    several lines the last line wins. Blank lines are skipped. A row is kept
    when its ``judge_id`` contains ``"haiku"`` (case-insensitive) and ends with
    ``"_completeness"``.
    """
    latest: dict[tuple[str, str], dict] = {}
    for line in sidecar.read_text(encoding="utf-8").splitlines():
        if not line.strip():
            continue
        rec = json.loads(line)
        latest[(rec["judge_id"], rec["system_output_hash"])] = rec

    return [
        rec
        for rec in latest.values()
        if "haiku" in rec["judge_id"].lower()
        and rec["judge_id"].endswith("_completeness")
    ]


async def main() -> None:
    """Re-score every system output with gpt-4o-mini and write the v1.1.1 sidecar.

    Steps:
      1. Load the completeness rubric and the system-output records.
      2. Copy the existing Haiku completeness rows from the v1.1 sidecar
         (they act as the unchanged control).
      3. Score each system output, one at a time, with a gpt-4o-mini
         ``CompletenessJudge`` and print per-item score and cost.
      4. Write Haiku rows followed by the fresh gpt-4o-mini rows to
         ``NEW_SIDECAR`` as JSONL.

    Only ``NEW_SIDECAR`` is written here. All rows are written at the end, so
    an exception while scoring leaves no output file from this run.
    """
    rubric = Rubric.from_markdown_file(
        REPO / "agent_bench/evaluation/rubrics/completeness.md"
    )
    # JSON is UTF-8 by specification; do not depend on the locale's default.
    outputs = json.loads(SYSTEM_OUTPUTS.read_text(encoding="utf-8"))

    # Load existing Haiku completeness rows from v1.1 sidecar (control).
    haiku_completeness_rows = _latest_haiku_completeness_rows(SIDECAR_V1_1)

    # Run gpt-4o-mini CompletenessJudge with the v1.1.1 prompt on every record
    # in SYSTEM_OUTPUTS; the actual item count is printed below rather than
    # hard-coded here.
    model_id = "gpt-4o-mini-2024-07-18"
    provider = OpenAIProvider(model=model_id)
    judge = CompletenessJudge(
        judge_provider=provider, rubric=rubric, model_id=model_id
    )

    print(f"Running gpt-4o-mini CompletenessJudge (v1.1.1 prompt) on {len(outputs)} items")
    fresh_gpt_rows: list[dict] = []
    for rec in outputs:
        item, output = _build_item_and_output(rec)
        result = await judge.score(item, output)
        row = result.model_dump()
        # Tag the row with the item id so it can be joined back to the item.
        row["item_id"] = item.id
        fresh_gpt_rows.append(row)
        print(f"  {item.id:<10} score={result.score} cost=${result.cost_usd:.4f}")

    total_cost = sum(r["cost_usd"] for r in fresh_gpt_rows)
    print(f"\nTotal cost: ${total_cost:.4f}")

    # Write the v1.1.1 sidecar: Haiku completeness rows (unchanged from v1.1)
    # + fresh gpt-4o-mini completeness rows.
    with NEW_SIDECAR.open("w", encoding="utf-8") as f:
        for r in [*haiku_completeness_rows, *fresh_gpt_rows]:
            f.write(json.dumps(r) + "\n")
    print(f"Wrote {len(haiku_completeness_rows)} Haiku + {len(fresh_gpt_rows)} GPT rows to {NEW_SIDECAR}")


# Script entry point: run the async main() only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    asyncio.run(main())