File size: 5,577 Bytes
ab0e054
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
"""Plan 3A probe: recency-positioned paraphrase instruction on 5 disputed
completeness items.

Sends the same rubric body, gold reference, and system answer as the
production CompletenessJudge prompt, with one extra sentence inserted
between the system answer and the score instruction:

  "Note: a paraphrase that captures the same meaning as a gold-answer
   point counts as covered. Score on content equivalence, not surface
   form."

Prior scores (from the v1 jury sidecar): all 5 disputed items scored 1
by gpt-4o-mini-2024-07-18; gold=2 on all 5; Haiku scored 2 on all 5.

Pre-committed criteria (DECISIONS "Plan 3A" entry):
  - Fixed:    β‰₯3/5 shift from 1 β†’ 2
  - Partial:  1–2/5 shift
  - Not fix:  0/5 shift

Run:
    OPENAI_API_KEY=... python scripts/_dev/probe_3a_paraphrase_recency.py
"""

from __future__ import annotations

import asyncio
import json
import re
import sys
from pathlib import Path

REPO = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(REPO))

from agent_bench.core.provider import OpenAIProvider  # noqa: E402
from agent_bench.core.types import Message, Role  # noqa: E402
from agent_bench.evaluation.judges.base import (  # noqa: E402
    Rubric,
    _strip_markdown_fence,
)

# The 5 disputed completeness items under probe (ids from the v1 jury sidecar).
DISPUTED_IDS = ["q006", "q011", "k8s_002", "k8s_006", "k8s_018"]
# Expected baseline/target per item; see module docstring for provenance.
PRIOR_SCORES = {iid: 1 for iid in DISPUTED_IDS}  # all five scored 1 in v1 sidecar
GOLD_SCORES = {iid: 2 for iid in DISPUTED_IDS}  # all five gold=2

# The single-sentence intervention. It is inserted between the system answer
# and the score instruction (recency position) by _build_prompt below.
PARAPHRASE_RECENCY_CLAUSE = (
    "Note: a paraphrase that captures the same meaning as a gold-answer "
    "point counts as covered. Score on content equivalence, not surface form."
)


def _load_outputs() -> dict[str, dict]:
    """Load the v1 calibration system-output records, keyed by item_id."""
    outputs_path = REPO / "results/calibration_v1_system_outputs.json"
    records = json.loads(outputs_path.read_text())
    by_id: dict[str, dict] = {}
    for record in records:
        by_id[record["item_id"]] = record
    return by_id


def _build_prompt(rubric: Rubric, item_record: dict) -> str:
    """Mirror CompletenessJudge.score's prompt construction, with the
    recency clause inserted between the system answer and the score
    instruction."""
    schema_clause = (
        'JSON object: {"reasoning": "...", "evidence_quotes": [...], '
        '"score": 0 or 1 or 2 or "Unknown"}.'
    )
    # Assemble the prompt as blank-line-separated sections; the clause sits
    # directly before the score instruction (recency position).
    sections = [
        rubric.render_prompt(level_permutation_seed=0),
        "---",
        f"## Reference answer (gold)\n{item_record['reference_answer']}",
        f"## Answer to score\n{item_record['answer']}",
        PARAPHRASE_RECENCY_CLAUSE,
        "Score this answer against the rubric above. Respond with ONLY a "
        + schema_clause,
    ]
    return "\n\n".join(sections)


def _parse_score(content: str) -> tuple[int | str, str, list[str]]:
    """Mirror _call_judge_with_retry's parse path: fence-strip then
    json.loads, return (score, reasoning, evidence_quotes)."""
    payload = json.loads(_strip_markdown_fence(content))
    # "score" is required (KeyError propagates); the rest default to empty.
    score = payload["score"]
    reasoning = str(payload.get("reasoning", ""))
    quotes = list(payload.get("evidence_quotes", []))
    return score, reasoning, quotes


async def main() -> None:
    """Run the Plan 3A probe end to end.

    Re-scores the 5 disputed completeness items with the recency-positioned
    paraphrase clause, prints per-item shift markers against prior/gold
    scores, evaluates the pre-committed fixed/partial/not-fixed criteria,
    and writes a JSONL artifact to measurements/.

    Fixes over the previous version:
      - API cost is accumulated before the parse attempt, so a billed call
        whose response fails to parse still counts toward the cost total.
      - The output directory is created if missing, so the artifact write
        cannot fail on a fresh checkout.
    """
    rubric = Rubric.from_markdown_file(
        REPO / "agent_bench/evaluation/rubrics/completeness.md"
    )
    outputs = _load_outputs()
    provider = OpenAIProvider(model="gpt-4o-mini-2024-07-18")

    print("=" * 80)
    print("Plan 3A — recency-positioned paraphrase instruction probe")
    print("=" * 80)
    print(f"prior scores: {PRIOR_SCORES}")
    print(f"gold scores:  {GOLD_SCORES}")
    print(f"intervention: \n  {PARAPHRASE_RECENCY_CLAUSE!r}\n")

    results: dict[str, dict] = {}
    total_cost = 0.0
    for iid in DISPUTED_IDS:
        item = outputs[iid]
        prompt = _build_prompt(rubric, item)
        # temperature=0.0 for determinism; 1024 tokens is ample for the
        # reasoning + quotes + score JSON object.
        response = await provider.complete(
            [Message(role=Role.USER, content=prompt)],
            temperature=0.0,
            max_tokens=1024,
        )
        # The call was billed whether or not the output parses — count it
        # before the parse attempt (the old code skipped cost on failure).
        total_cost += response.usage.estimated_cost_usd
        try:
            score, reasoning, ev = _parse_score(response.content)
        except (json.JSONDecodeError, KeyError) as e:
            print(f"  {iid}: PARSE FAILED — {e}; raw={response.content[:200]!r}")
            continue
        prior = PRIOR_SCORES[iid]
        gold = GOLD_SCORES[iid]
        # "Shifted up" = any integer score strictly above the prior 1; a
        # non-int score ("Unknown") never counts as a shift.
        shifted = isinstance(score, int) and score > prior
        marker = "→ SHIFTED 1→2" if shifted else ("→ unchanged" if score == prior else f"→ shifted to {score}")
        print(f"  {iid}: prior={prior} new={score} gold={gold} {marker}")
        print(f"    reasoning: {reasoning[:300]}{'...' if len(reasoning) > 300 else ''}")
        print(f"    evidence_quotes: {ev}")
        print()
        results[iid] = {
            "prior": prior,
            "new": score,
            "gold": gold,
            "reasoning": reasoning,
            "evidence_quotes": ev,
            "shifted_up": shifted,
        }

    n_shifted = sum(1 for r in results.values() if r["shifted_up"])
    print("=" * 80)
    print(f"Result: {n_shifted}/5 items shifted 1 → 2")
    print(f"Total cost: ${total_cost:.4f}")
    print()
    # Thresholds are the pre-committed criteria from the DECISIONS "Plan 3A"
    # entry (module docstring): >=3 fixed, 1-2 partial, 0 not fixed.
    if n_shifted >= 3:
        print("→ FIXED (per pre-committed criteria). Re-run on full 26 disputed items.")
    elif n_shifted >= 1:
        print("→ PARTIALLY FIXED. Re-run on full 26 disputed items for clean number.")
    else:
        print("→ NOT FIXED. Escalate to 4A (GPT-4o full).")

    out_path = REPO / "measurements/2026-05-06-3a-paraphrase-recency-probe.jsonl"
    # Ensure the artifact directory exists (fresh checkouts may lack it).
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w") as f:
        for iid, r in results.items():
            f.write(json.dumps({"item_id": iid, **r}) + "\n")
    print(f"\nProbe artifact: {out_path}")


# Script entry point: drive the async probe on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())