| """Plan 3A probe: recency-positioned paraphrase instruction on 5 disputed | |
| completeness items. | |
| Sends the same rubric body, gold reference, and system answer as the | |
| production CompletenessJudge prompt, with one extra sentence inserted | |
| between the system answer and the score instruction: | |
| "Note: a paraphrase that captures the same meaning as a gold-answer | |
| point counts as covered. Score on content equivalence, not surface | |
| form." | |
| Prior scores (from the v1 jury sidecar): all 5 disputed items scored 1 | |
| by gpt-4o-mini-2024-07-18; gold=2 on all 5; Haiku scored 2 on all 5. | |
| Pre-committed criteria (DECISIONS "Plan 3A" entry): | |
| - Fixed: β₯3/5 shift from 1 β 2 | |
| - Partial: 1β2/5 shift | |
| - Not fix: 0/5 shift | |
| Run: | |
| OPENAI_API_KEY=... python scripts/_dev/probe_3a_paraphrase_recency.py | |
| """ | |
from __future__ import annotations

import asyncio
import json
import sys
from pathlib import Path
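
# Resolve the repo root and put it on sys.path so the agent_bench imports
# below work without an installed package.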
REPO = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(REPO))

from agent_bench.core.provider import OpenAIProvider  # noqa: E402
from agent_bench.core.types import Message, Role  # noqa: E402
from agent_bench.evaluation.judges.base import (  # noqa: E402
    Rubric,
    _strip_markdown_fence,
)

DISPUTED_IDS = ["q006", "q011", "k8s_002", "k8s_006", "k8s_018"]
PRIOR_SCORES = {iid: 1 for iid in DISPUTED_IDS}  # all five scored 1 in the v1 sidecar
GOLD_SCORES = {iid: 2 for iid in DISPUTED_IDS}  # gold = 2 on all five
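
# The exact sentence inserted between the system answer and the score
# instruction -- the only difference from the production CompletenessJudge
# prompt.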
PARAPHRASE_RECENCY_CLAUSE = (
    "Note: a paraphrase that captures the same meaning as a gold-answer "
    "point counts as covered. Score on content equivalence, not surface form."
)
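

# Each record in the v1 calibration outputs carries at least item_id,
# reference_answer, and answer; key by item_id for direct lookup.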
def _load_outputs() -> dict[str, dict]:
    raw = (REPO / "results/calibration_v1_system_outputs.json").read_text()
    return {r["item_id"]: r for r in json.loads(raw)}


def _build_prompt(rubric: Rubric, item_record: dict) -> str:
    """Mirror CompletenessJudge.score's prompt construction, with the
    recency clause inserted between the system answer and the score
    instruction."""
    schema_clause = (
        'JSON object: {"reasoning": "...", "evidence_quotes": [...], '
        '"score": 0 or 1 or 2 or "Unknown"}.'
    )
    return (
        f"{rubric.render_prompt(level_permutation_seed=0)}\n\n"
        f"---\n\n"
        f"## Reference answer (gold)\n{item_record['reference_answer']}\n\n"
        f"## Answer to score\n{item_record['answer']}\n\n"
        f"{PARAPHRASE_RECENCY_CLAUSE}\n\n"
        f"Score this answer against the rubric above. Respond with ONLY a "
        f"{schema_clause}"
    )


def _parse_score(content: str) -> tuple[int | str, str, list[str]]:
    """Mirror _call_judge_with_retry's parse path: fence-strip, then
    json.loads; return (score, reasoning, evidence_quotes)."""
    stripped = _strip_markdown_fence(content)
    data = json.loads(stripped)
    return (
        data["score"],
        str(data.get("reasoning", "")),
        list(data.get("evidence_quotes", [])),
    )
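

# Probe driver: one deterministic (temperature=0) judge call per disputed
# item, compared against the v1 prior and the gold score.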
async def main() -> None:
    rubric = Rubric.from_markdown_file(
        REPO / "agent_bench/evaluation/rubrics/completeness.md"
    )
    outputs = _load_outputs()
    provider = OpenAIProvider(model="gpt-4o-mini-2024-07-18")

    print("=" * 80)
    print("Plan 3A - recency-positioned paraphrase instruction probe")
    print("=" * 80)
    print(f"prior scores: {PRIOR_SCORES}")
    print(f"gold scores:  {GOLD_SCORES}")
    print(f"intervention:\n  {PARAPHRASE_RECENCY_CLAUSE!r}\n")

    results: dict[str, dict] = {}
    total_cost = 0.0

    for iid in DISPUTED_IDS:
        item = outputs[iid]
        prompt = _build_prompt(rubric, item)
        response = await provider.complete(
            [Message(role=Role.USER, content=prompt)],
            temperature=0.0,
            max_tokens=1024,
        )
        # Count the call's cost even if the response fails to parse below.
        total_cost += response.usage.estimated_cost_usd
        try:
            score, reasoning, ev = _parse_score(response.content)
        except (json.JSONDecodeError, KeyError) as e:
            print(f"  {iid}: PARSE FAILED → {e}; raw={response.content[:200]!r}")
            continue
        prior = PRIOR_SCORES[iid]
        gold = GOLD_SCORES[iid]
        shifted = isinstance(score, int) and score > prior
        if shifted:
            marker = "✅ SHIFTED 1→2"
        elif score == prior:
            marker = "✗ unchanged"
        else:
            marker = f"⚠ shifted to {score}"
        print(f"  {iid}: prior={prior} new={score} gold={gold} {marker}")
        print(f"    reasoning: {reasoning[:300]}{'...' if len(reasoning) > 300 else ''}")
        print(f"    evidence_quotes: {ev}")
        print()
        results[iid] = {
            "prior": prior,
            "new": score,
            "gold": gold,
            "reasoning": reasoning,
            "evidence_quotes": ev,
            "shifted_up": shifted,
        }

    n_shifted = sum(1 for r in results.values() if r["shifted_up"])
    print("=" * 80)
    print(f"Result: {n_shifted}/5 items shifted 1 → 2")
    print(f"Total cost: ${total_cost:.4f}")
    print()
    if n_shifted >= 3:
        print("✅ FIXED (per pre-committed criteria). Re-run on full 26 disputed items.")
    elif n_shifted >= 1:
        print("⚠ PARTIALLY FIXED. Re-run on full 26 disputed items for a clean number.")
    else:
        print("❌ NOT FIXED. Escalate to 4A (GPT-4o full).")
    out_path = REPO / "measurements/2026-05-06-3a-paraphrase-recency-probe.jsonl"
    with out_path.open("w") as f:
        for iid, r in results.items():
            f.write(json.dumps({"item_id": iid, **r}) + "\n")
    print(f"\nProbe artifact: {out_path}")


if __name__ == "__main__":
    asyncio.run(main())