File size: 5,577 Bytes
ab0e054
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
"""Plan 3A probe: recency-positioned paraphrase instruction on 5 disputed
completeness items.

Sends the same rubric body, gold reference, and system answer as the
production CompletenessJudge prompt, with one extra sentence inserted
between the system answer and the score instruction:

  "Note: a paraphrase that captures the same meaning as a gold-answer
   point counts as covered. Score on content equivalence, not surface
   form."

Prior scores (from the v1 jury sidecar): all 5 disputed items scored 1
by gpt-4o-mini-2024-07-18; gold=2 on all 5; Haiku scored 2 on all 5.

Pre-committed criteria (DECISIONS "Plan 3A" entry):
  - Fixed:    β‰₯3/5 shift from 1 β†’ 2
  - Partial:  1–2/5 shift
  - Not fix:  0/5 shift

Run:
    OPENAI_API_KEY=... python scripts/_dev/probe_3a_paraphrase_recency.py
"""

from __future__ import annotations

import asyncio
import json
import re
import sys
from pathlib import Path

REPO = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(REPO))

from agent_bench.core.provider import OpenAIProvider  # noqa: E402
from agent_bench.core.types import Message, Role  # noqa: E402
from agent_bench.evaluation.judges.base import (  # noqa: E402
    Rubric,
    _strip_markdown_fence,
)

# The 5 disputed completeness items under probe (ids from the v1 jury sidecar).
DISPUTED_IDS = ["q006", "q011", "k8s_002", "k8s_006", "k8s_018"]
# Expected baseline/target per item; see module docstring for provenance.
PRIOR_SCORES = {iid: 1 for iid in DISPUTED_IDS}  # all five scored 1 in v1 sidecar
GOLD_SCORES = {iid: 2 for iid in DISPUTED_IDS}  # all five gold=2

# The single-sentence intervention. It is inserted between the system answer
# and the score instruction (recency position) by _build_prompt below.
PARAPHRASE_RECENCY_CLAUSE = (
    "Note: a paraphrase that captures the same meaning as a gold-answer "
    "point counts as covered. Score on content equivalence, not surface form."
)


def _load_outputs() -> dict[str, dict]:
    """Load the v1 calibration system-output records, keyed by item_id."""
    outputs_path = REPO / "results/calibration_v1_system_outputs.json"
    records = json.loads(outputs_path.read_text())
    by_id: dict[str, dict] = {}
    for record in records:
        by_id[record["item_id"]] = record
    return by_id


def _build_prompt(rubric: Rubric, item_record: dict) -> str:
    """Mirror CompletenessJudge.score's prompt construction, with the
    recency clause inserted between the system answer and the score
    instruction."""
    schema_clause = (
        'JSON object: {"reasoning": "...", "evidence_quotes": [...], '
        '"score": 0 or 1 or 2 or "Unknown"}.'
    )
    # Assemble the prompt as blank-line-separated sections; the clause sits
    # directly before the score instruction (recency position).
    sections = [
        rubric.render_prompt(level_permutation_seed=0),
        "---",
        f"## Reference answer (gold)\n{item_record['reference_answer']}",
        f"## Answer to score\n{item_record['answer']}",
        PARAPHRASE_RECENCY_CLAUSE,
        "Score this answer against the rubric above. Respond with ONLY a "
        + schema_clause,
    ]
    return "\n\n".join(sections)


def _parse_score(content: str) -> tuple[int | str, str, list[str]]:
    """Mirror _call_judge_with_retry's parse path: fence-strip then
    json.loads, return (score, reasoning, evidence_quotes)."""
    payload = json.loads(_strip_markdown_fence(content))
    # "score" is required (KeyError propagates); the rest default to empty.
    score = payload["score"]
    reasoning = str(payload.get("reasoning", ""))
    quotes = list(payload.get("evidence_quotes", []))
    return score, reasoning, quotes


async def main() -> None:
    """Run the Plan 3A probe end to end.

    Re-scores the 5 disputed completeness items with the recency-positioned
    paraphrase clause, prints per-item shift markers against prior/gold
    scores, evaluates the pre-committed fixed/partial/not-fixed criteria,
    and writes a JSONL artifact to measurements/.

    Fixes over the previous version:
      - API cost is accumulated before the parse attempt, so a billed call
        whose response fails to parse still counts toward the cost total.
      - The output directory is created if missing, so the artifact write
        cannot fail on a fresh checkout.
    """
    rubric = Rubric.from_markdown_file(
        REPO / "agent_bench/evaluation/rubrics/completeness.md"
    )
    outputs = _load_outputs()
    provider = OpenAIProvider(model="gpt-4o-mini-2024-07-18")

    print("=" * 80)
    print("Plan 3A — recency-positioned paraphrase instruction probe")
    print("=" * 80)
    print(f"prior scores: {PRIOR_SCORES}")
    print(f"gold scores:  {GOLD_SCORES}")
    print(f"intervention: \n  {PARAPHRASE_RECENCY_CLAUSE!r}\n")

    results: dict[str, dict] = {}
    total_cost = 0.0
    for iid in DISPUTED_IDS:
        item = outputs[iid]
        prompt = _build_prompt(rubric, item)
        # temperature=0.0 for determinism; 1024 tokens is ample for the
        # reasoning + quotes + score JSON object.
        response = await provider.complete(
            [Message(role=Role.USER, content=prompt)],
            temperature=0.0,
            max_tokens=1024,
        )
        # The call was billed whether or not the output parses — count it
        # before the parse attempt (the old code skipped cost on failure).
        total_cost += response.usage.estimated_cost_usd
        try:
            score, reasoning, ev = _parse_score(response.content)
        except (json.JSONDecodeError, KeyError) as e:
            print(f"  {iid}: PARSE FAILED — {e}; raw={response.content[:200]!r}")
            continue
        prior = PRIOR_SCORES[iid]
        gold = GOLD_SCORES[iid]
        # "Shifted up" = any integer score strictly above the prior 1; a
        # non-int score ("Unknown") never counts as a shift.
        shifted = isinstance(score, int) and score > prior
        marker = "→ SHIFTED 1→2" if shifted else ("→ unchanged" if score == prior else f"→ shifted to {score}")
        print(f"  {iid}: prior={prior} new={score} gold={gold} {marker}")
        print(f"    reasoning: {reasoning[:300]}{'...' if len(reasoning) > 300 else ''}")
        print(f"    evidence_quotes: {ev}")
        print()
        results[iid] = {
            "prior": prior,
            "new": score,
            "gold": gold,
            "reasoning": reasoning,
            "evidence_quotes": ev,
            "shifted_up": shifted,
        }

    n_shifted = sum(1 for r in results.values() if r["shifted_up"])
    print("=" * 80)
    print(f"Result: {n_shifted}/5 items shifted 1 → 2")
    print(f"Total cost: ${total_cost:.4f}")
    print()
    # Thresholds are the pre-committed criteria from the DECISIONS "Plan 3A"
    # entry (module docstring): >=3 fixed, 1-2 partial, 0 not fixed.
    if n_shifted >= 3:
        print("→ FIXED (per pre-committed criteria). Re-run on full 26 disputed items.")
    elif n_shifted >= 1:
        print("→ PARTIALLY FIXED. Re-run on full 26 disputed items for clean number.")
    else:
        print("→ NOT FIXED. Escalate to 4A (GPT-4o full).")

    out_path = REPO / "measurements/2026-05-06-3a-paraphrase-recency-probe.jsonl"
    # Ensure the artifact directory exists (fresh checkouts may lack it).
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w") as f:
        for iid, r in results.items():
            f.write(json.dumps({"item_id": iid, **r}) + "\n")
    print(f"\nProbe artifact: {out_path}")


# Script entry point: drive the async probe on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())