Spaces:

Nomearod
/

agentbench

Running

App Files Files Community

agentbench / scripts /_dev /reaggregate_jury_v1_1.py

Nomearod

calibrate(jury): v1.1+v1.1.1 — fix weighting bugs; recency-position paraphrase clause

ab0e054 24 days ago

raw

history blame contribute delete

11.6 kB

	"""Plan B (v1.1 jury rescue): re-aggregate the existing 164 member-rows
	in `results/calibration_v1_judge_jury_kappa_weighted_members.jsonl` with
	corrected κ-derived weights, no new API spend.

	Maps the resulting jury κ on completeness to the predefined outcome
	criteria committed in DECISIONS.md ("v1.1 jury rescue" entry):
	- Outcome 1: jury κ ≥ Haiku-baseline + 0.05 → A+B sufficient
	- Outcome 2: jury κ within ±0.05 of Haiku → soft exclusion via weighting
	- Outcome 3: jury κ < Haiku-baseline - 0.05 → escalate to per-dim exclusion (C)

	Run:
	python scripts/_dev/reaggregate_jury_v1_1.py
	"""

	from __future__ import annotations

	import json
	from collections import defaultdict
	from pathlib import Path

	REPO = Path(__file__).resolve().parents[2]
	SIDECAR = REPO / "results/calibration_v1_judge_jury_kappa_weighted_members.jsonl"
	LABELS = REPO / "measurements/2026-05-04-judge-calibration-labels.jsonl"
	HAIKU_BASELINE_COMPLETENESS_KAPPA = 0.416 # from kappa_table.md

	# Mirror agent_bench.evaluation.variance.jury._discretize_mean
	def _discretize_mean(mean: float, scale: str) -> int:
	if scale == "binary":
	return 1 if mean > 0.5 else 0
	floor = int(mean)
	frac = mean - floor
	return floor + 1 if frac > 0.5 else floor


	def _load_labels(path: Path, dimension: str) -> dict[str, int]:
	out: dict[str, int] = {}
	for line in path.read_text().splitlines():
	if not line.strip():
	continue
	rec = json.loads(line)
	if rec.get("dimension") != dimension or rec.get("abstained"):
	continue
	out[rec["system_output_hash"]] = int(rec["score"])
	return out


	def _load_predictions_by_judge(
	path: Path, dimension: str
	) -> dict[str, dict[str, int \| str]]:
	"""Return {judge_id: {hash: score}} for the dimension.

	The sidecar is append-only; if there are duplicate (judge, hash)
	pairs from re-runs, the last write wins (mirrors what generate-table
	sees from the JSON output file path which is overwritten per row).
	"""
	by_judge: dict[str, dict[str, int \| str]] = defaultdict(dict)
	for line in path.read_text().splitlines():
	if not line.strip():
	continue
	rec = json.loads(line)
	if not rec["judge_id"].endswith(f"_{dimension}"):
	continue
	by_judge[rec["judge_id"]][rec["system_output_hash"]] = rec["score"]
	return by_judge


	def _kappa(y1: list[int], y2: list[int]) -> float:
	from agent_bench.evaluation.calibration.metrics import cohen_kappa
	return cohen_kappa(y1, y2)


	def _per_judge_kappa(
	by_judge: dict[str, dict[str, int \| str]], labels: dict[str, int]
	) -> dict[str, tuple[float, int]]:
	out: dict[str, tuple[float, int]] = {}
	for jid, preds in by_judge.items():
	y_lab: list[int] = []
	y_pred: list[int] = []
	for h, score in preds.items():
	if score == "Unknown":
	continue
	if h not in labels:
	continue
	y_lab.append(labels[h])
	y_pred.append(int(score))
	if not y_lab:
	continue
	out[jid] = (_kappa(y_lab, y_pred), len(y_lab))
	return out


	def _load_full_member_rows(path: Path, dimension: str) -> list[dict]:
	"""Return the most-recent record per (judge_id, system_output_hash) for
	the dimension. The sidecar is append-only; if there are duplicates from
	re-runs, the later record wins (mirrors how the JSON output file would
	reflect the last successful run)."""
	by_key: dict[tuple[str, str], dict] = {}
	for line in path.read_text().splitlines():
	if not line.strip():
	continue
	rec = json.loads(line)
	if not rec["judge_id"].endswith(f"_{dimension}"):
	continue
	by_key[(rec["judge_id"], rec["system_output_hash"])] = rec
	return list(by_key.values())


	def _aggregate_jury(
	by_judge: dict[str, dict[str, int \| str]],
	labels: dict[str, int],
	weights: dict[str, float],
	scale: str,
	) -> tuple[list[int], list[int], int]:
	"""Strict quorum: any member abstain on an item → jury abstain (skipped).

	Returns (y_lab, y_pred, abstained_count) where each list element is
	one item that survived strict quorum.
	"""
	judge_ids = list(by_judge.keys())
	# Common item set: hashes scored by every judge (any judge abstaining
	# on an item also drops it under strict quorum).
	all_hashes = set.intersection(*[set(d.keys()) for d in by_judge.values()])
	y_lab: list[int] = []
	y_pred: list[int] = []
	abstained = 0
	for h in sorted(all_hashes):
	scores = [by_judge[jid][h] for jid in judge_ids]
	if any(s == "Unknown" for s in scores):
	abstained += 1
	continue
	if h not in labels:
	continue
	int_scores = [int(s) for s in scores]
	wts = [weights[jid] for jid in judge_ids]
	weighted_sum = sum(s * w for s, w in zip(int_scores, wts))
	weight_total = sum(wts)
	if weight_total <= 0:
	abstained += 1
	continue
	agg = _discretize_mean(weighted_sum / weight_total, scale)
	y_lab.append(labels[h])
	y_pred.append(agg)
	return y_lab, y_pred, abstained


	def _hash_to_item_id_map(labels_path: Path) -> dict[str, str]:
	"""Recover hash → item_id from the labels file, since the sidecar
	JSONL was written before the v1.1 item_id backfill (which only
	touched the per-row JSON output files, not the sidecar)."""
	out: dict[str, str] = {}
	for line in labels_path.read_text().splitlines():
	if not line.strip():
	continue
	rec = json.loads(line)
	out[rec["system_output_hash"]] = rec["item_id"]
	return out


	def _build_v1_1_jury_predictions(
	by_judge: dict[str, dict[str, int \| str]],
	member_rows: list[dict],
	weights: dict[str, float],
	scale: str,
	dimension: str,
	hash_to_item: dict[str, str],
	) -> list[dict]:
	"""Per-item jury verdicts for the κ-table-format output. Pulls metadata
	(rubric_version, item_id) from member rows; aggregates score/cost/latency
	via the same rules as the production Jury class."""
	judge_ids = list(by_judge.keys())
	by_judge_hash_row = {
	(r["judge_id"], r["system_output_hash"]): r for r in member_rows
	}
	common_hashes = set.intersection(*[set(d.keys()) for d in by_judge.values()])
	out: list[dict] = []
	for h in sorted(common_hashes):
	scores = [by_judge[jid][h] for jid in judge_ids]
	member_meta = [by_judge_hash_row[(jid, h)] for jid in judge_ids]
	rubric_version = member_meta[0]["rubric_version"]
	item_id = member_meta[0].get("item_id") or hash_to_item.get(h)
	if item_id is None:
	# Sidecar + labels both lack mapping for this hash — drop,
	# since κ-table can't join without item_id.
	continue
	cost = sum(r.get("cost_usd", 0.0) for r in member_meta)
	latency = max(r.get("latency_ms", 0.0) for r in member_meta)

	if any(s == "Unknown" for s in scores):
	out.append({
	"item_id": item_id,
	"dimension": dimension,
	"reasoning": (
	f"jury_below_quorum: 1+ member abstain (members="
	f"{[s for s in scores]})"
	),
	"evidence_quotes": [],
	"score": "Unknown",
	"judge_id": "jury_v1_1_kappa_weighted",
	"rubric_version": rubric_version,
	"prompt_seed": 0,
	"system_output_hash": h,
	"cost_usd": cost,
	"latency_ms": latency,
	})
	continue
	int_scores = [int(s) for s in scores]
	wts = [weights[jid] for jid in judge_ids]
	weighted_sum = sum(s * w for s, w in zip(int_scores, wts))
	weight_total = sum(wts)
	weighted_mean = weighted_sum / weight_total if weight_total > 0 else 0.0
	agg = _discretize_mean(weighted_mean, scale)
	out.append({
	"item_id": item_id,
	"dimension": dimension,
	"reasoning": (
	f"jury_kappa_weighted_v1_1: members={int_scores}, weights={wts}"
	),
	"evidence_quotes": [],
	"score": agg,
	"judge_id": "jury_v1_1_kappa_weighted",
	"rubric_version": rubric_version,
	"prompt_seed": 0,
	"system_output_hash": h,
	"cost_usd": cost,
	"latency_ms": latency,
	})
	return out


	def _classify_outcome(jury_k: float, baseline_k: float) -> str:
	delta = jury_k - baseline_k
	if delta >= 0.05:
	return f"OUTCOME 1 (Δ={delta:+.3f}, ≥+0.05) — A+B sufficient; writeup as 'weights bug masked aggregation'"
	if delta > -0.05:
	return f"OUTCOME 2 (Δ={delta:+.3f}, within ±0.05) — soft exclusion via weighting"
	return f"OUTCOME 3 (Δ={delta:+.3f}, <-0.05) — escalate to per-dim exclusion (C)"


	def main(write_output: bool = False) -> None:
	print("=" * 78)
	print("v1.1 jury rescue — Plan B re-aggregation")
	print("=" * 78)

	all_predictions: list[dict] = []
	for dim, scale in [
	("completeness", "three_point"),
	("groundedness", "binary"),
	("relevance", "three_point"),
	]:
	print(f"\n--- dimension: {dim} (scale={scale}) ---")
	labels = _load_labels(LABELS, dim)
	by_judge = _load_predictions_by_judge(SIDECAR, dim)
	if not by_judge:
	print(f" no predictions for {dim} in sidecar — skipping")
	continue

	# Per-judge κ → weight (negative κ clipped to 0)
	per_judge = _per_judge_kappa(by_judge, labels)
	print(f" Gold labels (non-abstain): {len(labels)}")
	for jid, (k, n) in sorted(per_judge.items()):
	w = max(0.0, k)
	print(f" per-judge κ: {jid} κ={k:+.3f} n={n} → weight={w:.3f}")
	weights = {jid: max(0.0, k) for jid, (k, _) in per_judge.items()}

	# Jury aggregate with corrected weights
	y_lab, y_pred, abstained = _aggregate_jury(by_judge, labels, weights, scale)
	if len(y_lab) < 2:
	print(f" insufficient data after strict-quorum filter (n={len(y_lab)})")
	continue
	jury_k = _kappa(y_lab, y_pred)
	# Raw agreement
	raw_agree = sum(1 for a, b in zip(y_lab, y_pred) if a == b) / len(y_lab)
	print(
	f" JURY (corrected weights): κ={jury_k:+.3f} "
	f"raw={raw_agree:.3f} n={len(y_lab)} abstained={abstained}"
	)
	if dim == "completeness":
	print(f"\n Haiku-baseline completeness κ = {HAIKU_BASELINE_COMPLETENESS_KAPPA}")
	print(f" → {_classify_outcome(jury_k, HAIKU_BASELINE_COMPLETENESS_KAPPA)}")

	if write_output:
	member_rows = _load_full_member_rows(SIDECAR, dim)
	hash_to_item = _hash_to_item_id_map(LABELS)
	all_predictions.extend(
	_build_v1_1_jury_predictions(
	by_judge, member_rows, weights, scale, dim, hash_to_item
	)
	)

	if write_output:
	out_path = REPO / "results/calibration_v1_judge_jury_kappa_weighted_v1_1.json"
	out_path.write_text(json.dumps(all_predictions, indent=2) + "\n")
	print(f"\nwrote {len(all_predictions)} v1.1-jury predictions to {out_path}")


	if __name__ == "__main__":
	import sys

	sys.path.insert(0, str(REPO))
	main(write_output="--write-output" in sys.argv)