| """Plan 3A probe: recency-positioned paraphrase instruction on 5 disputed | |
| completeness items. | |
| Sends the same rubric body, gold reference, and system answer as the | |
| production CompletenessJudge prompt, with one extra sentence inserted | |
| between the system answer and the score instruction: | |
| "Note: a paraphrase that captures the same meaning as a gold-answer | |
| point counts as covered. Score on content equivalence, not surface | |
| form." | |
| Prior scores (from the v1 jury sidecar): all 5 disputed items scored 1 | |
| by gpt-4o-mini-2024-07-18; gold=2 on all 5; Haiku scored 2 on all 5. | |
| Pre-committed criteria (DECISIONS "Plan 3A" entry): | |
| - Fixed: β₯3/5 shift from 1 β 2 | |
| - Partial: 1β2/5 shift | |
| - Not fix: 0/5 shift | |
| Run: | |
| OPENAI_API_KEY=... python scripts/_dev/probe_3a_paraphrase_recency.py | |
| """ | |
from __future__ import annotations

import asyncio
import json
import sys
from pathlib import Path
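
# Resolve the repo root and put it on sys.path so the agent_bench imports
# below work without an installed package.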
REPO = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(REPO))

from agent_bench.core.provider import OpenAIProvider  # noqa: E402
from agent_bench.core.types import Message, Role  # noqa: E402
from agent_bench.evaluation.judges.base import (  # noqa: E402
    Rubric,
    _strip_markdown_fence,
)

DISPUTED_IDS = ["q006", "q011", "k8s_002", "k8s_006", "k8s_018"]
PRIOR_SCORES = {iid: 1 for iid in DISPUTED_IDS}  # all five scored 1 in the v1 sidecar
GOLD_SCORES = {iid: 2 for iid in DISPUTED_IDS}  # gold = 2 on all five
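
# The exact sentence inserted between the system answer and the score
# instruction -- the only difference from the production CompletenessJudge
# prompt.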
PARAPHRASE_RECENCY_CLAUSE = (
    "Note: a paraphrase that captures the same meaning as a gold-answer "
    "point counts as covered. Score on content equivalence, not surface form."
)
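

# Each record in the v1 calibration outputs carries at least item_id,
# reference_answer, and answer; key by item_id for direct lookup.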
def _load_outputs() -> dict[str, dict]:
    raw = (REPO / "results/calibration_v1_system_outputs.json").read_text()
    return {r["item_id"]: r for r in json.loads(raw)}


def _build_prompt(rubric: Rubric, item_record: dict) -> str:
    """Mirror CompletenessJudge.score's prompt construction, with the
    recency clause inserted between the system answer and the score
    instruction."""
    schema_clause = (
        'JSON object: {"reasoning": "...", "evidence_quotes": [...], '
        '"score": 0 or 1 or 2 or "Unknown"}.'
    )
    return (
        f"{rubric.render_prompt(level_permutation_seed=0)}\n\n"
        f"---\n\n"
        f"## Reference answer (gold)\n{item_record['reference_answer']}\n\n"
        f"## Answer to score\n{item_record['answer']}\n\n"
        f"{PARAPHRASE_RECENCY_CLAUSE}\n\n"
        f"Score this answer against the rubric above. Respond with ONLY a "
        f"{schema_clause}"
    )


def _parse_score(content: str) -> tuple[int | str, str, list[str]]:
    """Mirror _call_judge_with_retry's parse path: fence-strip, then
    json.loads; return (score, reasoning, evidence_quotes)."""
    stripped = _strip_markdown_fence(content)
    data = json.loads(stripped)
    return (
        data["score"],
        str(data.get("reasoning", "")),
        list(data.get("evidence_quotes", [])),
    )
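

# Probe driver: one deterministic (temperature=0) judge call per disputed
# item, compared against the v1 prior and the gold score.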
async def main() -> None:
    rubric = Rubric.from_markdown_file(
        REPO / "agent_bench/evaluation/rubrics/completeness.md"
    )
    outputs = _load_outputs()
    provider = OpenAIProvider(model="gpt-4o-mini-2024-07-18")

    print("=" * 80)
    print("Plan 3A - recency-positioned paraphrase instruction probe")
    print("=" * 80)
    print(f"prior scores: {PRIOR_SCORES}")
    print(f"gold scores:  {GOLD_SCORES}")
    print(f"intervention:\n  {PARAPHRASE_RECENCY_CLAUSE!r}\n")

    results: dict[str, dict] = {}
    total_cost = 0.0

    for iid in DISPUTED_IDS:
        item = outputs[iid]
        prompt = _build_prompt(rubric, item)
        response = await provider.complete(
            [Message(role=Role.USER, content=prompt)],
            temperature=0.0,
            max_tokens=1024,
        )
        # Count the call's cost even if the response fails to parse below.
        total_cost += response.usage.estimated_cost_usd
        try:
            score, reasoning, ev = _parse_score(response.content)
        except (json.JSONDecodeError, KeyError) as e:
            print(f"  {iid}: PARSE FAILED → {e}; raw={response.content[:200]!r}")
            continue
        prior = PRIOR_SCORES[iid]
        gold = GOLD_SCORES[iid]
        shifted = isinstance(score, int) and score > prior
        if shifted:
            marker = "✅ SHIFTED 1→2"
        elif score == prior:
            marker = "✗ unchanged"
        else:
            marker = f"⚠ shifted to {score}"
        print(f"  {iid}: prior={prior} new={score} gold={gold} {marker}")
        print(f"    reasoning: {reasoning[:300]}{'...' if len(reasoning) > 300 else ''}")
        print(f"    evidence_quotes: {ev}")
        print()
        results[iid] = {
            "prior": prior,
            "new": score,
            "gold": gold,
            "reasoning": reasoning,
            "evidence_quotes": ev,
            "shifted_up": shifted,
        }

    n_shifted = sum(1 for r in results.values() if r["shifted_up"])
    print("=" * 80)
    print(f"Result: {n_shifted}/5 items shifted 1 → 2")
    print(f"Total cost: ${total_cost:.4f}")
    print()
    if n_shifted >= 3:
        print("✅ FIXED (per pre-committed criteria). Re-run on full 26 disputed items.")
    elif n_shifted >= 1:
        print("⚠ PARTIALLY FIXED. Re-run on full 26 disputed items for a clean number.")
    else:
        print("❌ NOT FIXED. Escalate to 4A (GPT-4o full).")
    out_path = REPO / "measurements/2026-05-06-3a-paraphrase-recency-probe.jsonl"
    with out_path.open("w") as f:
        for iid, r in results.items():
            f.write(json.dumps({"item_id": iid, **r}) + "\n")
    print(f"\nProbe artifact: {out_path}")


if __name__ == "__main__":
    asyncio.run(main())