Spaces:

Nomearod
/

agentbench

Running

App Files Files Community

agentbench/tests/evaluation/test_jury_aggregation.py

Nomearod

calibrate(jury): v1.1+v1.1.1 — fix weighting bugs; recency-position paraphrase clause

ab0e054 about 1 month ago

raw

history blame contribute delete

10.9 kB

	"""Tests for PermutedJudge and Jury — aggregation, quorum, sidecar."""

	from __future__ import annotations

	import json
	from unittest.mock import AsyncMock

	import pytest

	from agent_bench.agents.orchestrator import AgentResponse, SourceReference
	from agent_bench.core.provider import LLMProvider
	from agent_bench.core.types import CompletionResponse, TokenUsage
	from agent_bench.evaluation.harness import GoldenQuestion
	from agent_bench.evaluation.judges.base import Rubric
	from agent_bench.evaluation.judges.relevance import RelevanceJudge


	def _mk_response(content: str) -> CompletionResponse:
	    """Wrap ``content`` in a CompletionResponse carrying fixed mock metadata.

	    Everything except ``content`` is constant: no tool calls, a 10-in/10-out
	    token usage with an estimated cost of 0.0001 USD, provider ``"mock"``,
	    model ``"m"`` and a latency of 1.0 ms.
	    """
	    mock_usage = TokenUsage(
	        input_tokens=10,
	        output_tokens=10,
	        estimated_cost_usd=0.0001,
	    )
	    return CompletionResponse(
	        content=content,
	        tool_calls=[],
	        usage=mock_usage,
	        provider="mock",
	        model="m",
	        latency_ms=1.0,
	    )


	def _vj(score) -> str:
	    """Serialize a judge verdict carrying ``score`` to a JSON string.

	    The reasoning is always ``"r"`` and the evidence-quote list is always
	    empty; only the score varies between calls.
	    """
	    # Insertion order fixes the key order of the emitted JSON.
	    verdict = {"reasoning": "r", "evidence_quotes": []}
	    verdict["score"] = score
	    return json.dumps(verdict)


	def _item(item_id: str = "i1") -> GoldenQuestion:
	    """Return a minimal GoldenQuestion whose id is ``item_id``.

	    The question text is ``"?"``, the expected keyword and source lists are
	    empty, the category is ``"retrieval"``, the difficulty is ``"easy"`` and
	    no calculator is required.
	    """
	    question_fields = {
	        "id": item_id,
	        "question": "?",
	        "expected_answer_keywords": [],
	        "expected_sources": [],
	        "category": "retrieval",
	        "difficulty": "easy",
	        "requires_calculator": False,
	    }
	    return GoldenQuestion(**question_fields)


	def _output(answer: str = "A.") -> AgentResponse:
	    """Return an AgentResponse with ``answer`` and otherwise fixed fields.

	    The response cites a single source, ``"x.md"``, reports one iteration
	    and has zeroed token usage, cost and latency.
	    """
	    cited_sources = [SourceReference(source="x.md")]
	    zero_usage = TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0)
	    return AgentResponse(
	        answer=answer,
	        sources=cited_sources,
	        iterations=1,
	        usage=zero_usage,
	        latency_ms=0,
	    )


	def _relevance_judge_with_responses(responses: list[str]) -> RelevanceJudge:
	    """Build a RelevanceJudge backed by a mocked provider.

	    Each string in ``responses`` is wrapped with ``_mk_response`` and the
	    resulting list is installed as the ``side_effect`` of the mock's
	    ``complete`` method. The rubric is loaded from the relevance rubric
	    markdown file and the judge is created with ``model_id="m"``.
	    """
	    relevance_rubric = Rubric.from_markdown_file(
	        "agent_bench/evaluation/rubrics/relevance.md"
	    )
	    mock_provider = AsyncMock(spec=LLMProvider)
	    mock_provider.complete.side_effect = list(map(_mk_response, responses))
	    return RelevanceJudge(
	        judge_provider=mock_provider,
	        rubric=relevance_rubric,
	        model_id="m",
	    )


	class TestPermutedJudge:
	    """Checks on the judge wrapper returned by ``rubric_permute``."""

	    @pytest.mark.asyncio
	    async def test_runs_n_permutations_and_means(self, tmp_path):
	        """Scores 1 and 2 from two seeds aggregate to a single score of 1."""
	        from agent_bench.evaluation.variance.rubric_permute import rubric_permute

	        # The two permutations score 1 and 2, so the mean is 1.5; ties go
	        # to the lower value, hence the expected aggregate of 1.
	        base_judge = _relevance_judge_with_responses([_vj(1), _vj(2)])
	        side_file = tmp_path / "side.jsonl"
	        wrapped = rubric_permute(
	            base_judge,
	            n=2,
	            seeds=[1, 2],
	            sidecar_path=side_file,
	        )
	        verdict = await wrapped.score(_item(), _output())

	        assert verdict.score == 1
	        assert verdict.judge_id == "m_relevance_perm2"
	        assert verdict.prompt_seed == 0

	    @pytest.mark.asyncio
	    async def test_any_abstain_propagates_unknown(self, tmp_path):
	        """One "Unknown" permutation makes the aggregate abstain."""
	        from agent_bench.evaluation.variance.rubric_permute import rubric_permute

	        canned = [_vj(1), _vj("Unknown")]
	        base_judge = _relevance_judge_with_responses(canned)
	        side_file = tmp_path / "side.jsonl"
	        wrapped = rubric_permute(
	            base_judge,
	            n=2,
	            seeds=[1, 2],
	            sidecar_path=side_file,
	        )
	        verdict = await wrapped.score(_item(), _output())

	        assert verdict.score == "Unknown"
	        assert verdict.abstained

	    @pytest.mark.asyncio
	    async def test_writes_per_permutation_sidecar(self, tmp_path):
	        """The sidecar gets one JSON line per permutation, tagged by seed."""
	        from agent_bench.evaluation.variance.rubric_permute import rubric_permute

	        sidecar = tmp_path / "perm_members.jsonl"
	        base_judge = _relevance_judge_with_responses([_vj(2), _vj(2)])
	        wrapped = rubric_permute(base_judge, n=2, seeds=[5, 7], sidecar_path=sidecar)
	        await wrapped.score(_item(), _output())

	        raw_lines = sidecar.read_text().strip().split("\n")
	        assert len(raw_lines) == 2
	        parsed = list(map(json.loads, raw_lines))
	        seen_seeds = {record["prompt_seed"] for record in parsed}
	        assert seen_seeds == {5, 7}


	class TestJury:
	    """Aggregation, quorum, sidecar and error-propagation checks for ``jury``."""

	    # Judge ids assigned to the two mocked panel members.
	    _HAIKU = "claude-haiku_relevance"
	    _GPT_MINI = "gpt-4o-mini_relevance"

	    @staticmethod
	    def _member(judge_id, responses):
	        """Return a mocked relevance judge relabelled with ``judge_id``."""
	        member = _relevance_judge_with_responses(responses)
	        member.judge_id = judge_id
	        return member

	    @pytest.mark.asyncio
	    async def test_mean_aggregation_two_judges(self, tmp_path):
	        """Two members that both score 2 produce a mean-aggregated 2."""
	        from agent_bench.evaluation.variance.jury import jury

	        members = [
	            self._member(self._HAIKU, [_vj(2)]),
	            self._member(self._GPT_MINI, [_vj(2)]),
	        ]
	        panel = jury(
	            judges=members,
	            aggregation="mean",
	            sidecar_path=tmp_path / "jury.jsonl",
	        )
	        verdict = await panel.score(_item(), _output())

	        assert verdict.score == 2
	        assert verdict.judge_id == "jury_v1_mean"

	    @pytest.mark.asyncio
	    async def test_strict_quorum_default_abstains_on_one_failure(self, tmp_path):
	        """With default settings, one abstaining member of two yields Unknown."""
	        from agent_bench.evaluation.variance.jury import jury

	        healthy = self._member(self._HAIKU, [_vj(1)])
	        # Garbage on both attempts: the member abstains once the schema
	        # parse has failed again after the retry.
	        broken = self._member(self._GPT_MINI, ["garbage", "garbage"])

	        panel = jury(
	            judges=[healthy, broken],
	            aggregation="mean",
	            sidecar_path=tmp_path / "jury.jsonl",
	        )
	        verdict = await panel.score(_item(), _output())

	        assert verdict.score == "Unknown"
	        assert "jury_below_quorum" in verdict.reasoning
	        assert "1/2" in verdict.reasoning

	    @pytest.mark.asyncio
	    async def test_sidecar_captures_both_members_including_abstain(self, tmp_path):
	        """The sidecar records every member, the abstaining one included."""
	        from agent_bench.evaluation.variance.jury import jury

	        healthy = self._member(self._HAIKU, [_vj(1)])
	        broken = self._member(self._GPT_MINI, ["garbage", "garbage"])

	        sidecar = tmp_path / "jury.jsonl"
	        panel = jury(judges=[healthy, broken], aggregation="mean", sidecar_path=sidecar)
	        await panel.score(_item(), _output())

	        raw_text = sidecar.read_text().strip()
	        records = list(map(json.loads, raw_text.split("\n")))
	        assert len(records) == 2
	        member_scores = [record["score"] for record in records]
	        assert 1 in member_scores
	        assert "Unknown" in member_scores

	    @pytest.mark.asyncio
	    async def test_kappa_weighted_requires_weights(self, tmp_path):
	        """Building a kappa_weighted jury without weights raises ValueError."""
	        from agent_bench.evaluation.variance.jury import jury

	        lone_judge = _relevance_judge_with_responses([_vj(2)])
	        with pytest.raises(ValueError, match="weights"):
	            jury(judges=[lone_judge], aggregation="kappa_weighted")

	    @pytest.mark.asyncio
	    async def test_kappa_weighted_with_equal_weights_matches_mean(self, tmp_path):
	        """Regression: kappa_weighted must break ties the way mean does.

	        Two members score 1 and 2 with equal weights, so the weighted mean
	        is 1.5. The mean path resolves that to 1 (ties-to-lower) and the
	        kappa_weighted path has to agree; banker's rounding would give 2
	        and silently violate the policy.
	        """
	        from agent_bench.evaluation.variance.jury import jury

	        members = [
	            self._member(self._HAIKU, [_vj(1)]),
	            self._member(self._GPT_MINI, [_vj(2)]),
	        ]
	        equal_weights = dict.fromkeys((self._HAIKU, self._GPT_MINI), 1.0)

	        panel = jury(
	            judges=members,
	            aggregation="kappa_weighted",
	            weights=equal_weights,
	            sidecar_path=tmp_path / "jury.jsonl",
	        )
	        verdict = await panel.score(_item(), _output())

	        failure_note = (
	            f"kappa_weighted with equal weights on [1, 2] returned "
	            f"{verdict.score}; expected 1 (ties-to-lower per "
	            f"_aggregate_scores policy). banker's-rounding bug?"
	        )
	        assert verdict.score == 1, failure_note

	    @pytest.mark.asyncio
	    async def test_kappa_weighted_reasoning_reports_applied_weights(
	        self, tmp_path
	    ):
	        """The reasoning text must show the weights that were applied.

	        Surfacing each member's weight keeps the aggregation auditable from
	        the sidecar alone, without re-deriving weights from the source.
	        """
	        from agent_bench.evaluation.variance.jury import jury

	        members = [
	            self._member(self._HAIKU, [_vj(2)]),
	            self._member(self._GPT_MINI, [_vj(2)]),
	        ]
	        applied = {self._HAIKU: 5.0, self._GPT_MINI: 0.25}

	        panel = jury(
	            judges=members,
	            aggregation="kappa_weighted",
	            weights=applied,
	            sidecar_path=tmp_path / "jury.jsonl",
	        )
	        verdict = await panel.score(_item(), _output())

	        # Checked in the same order as before: 5.0 first, then 0.25.
	        for shown in ("5.0", "0.25"):
	            assert shown in verdict.reasoning, (
	                f"applied weight {shown} missing from reasoning: "
	                f"{verdict.reasoning!r}"
	            )

	    @pytest.mark.asyncio
	    async def test_kappa_weighted_hard_errors_on_missing_weight(self, tmp_path):
	        """v1.1 regression: a member without a weight entry is a hard error.

	        v1 silently fell back to a weight of 1.0, which let an asymmetric
	        weights source amplify the unweighted member; the calibration
	        evidence is in the DECISIONS "v1.1 jury rescue" entry.
	        """
	        from agent_bench.evaluation.variance.jury import jury

	        members = [
	            self._member(self._HAIKU, [_vj(1)]),
	            self._member(self._GPT_MINI, [_vj(1)]),
	        ]
	        # Only the first member has a weight; the second is left out on purpose.
	        partial_weights = {self._HAIKU: 1.0}

	        panel = jury(
	            judges=members,
	            aggregation="kappa_weighted",
	            weights=partial_weights,
	            sidecar_path=tmp_path / "jury.jsonl",
	        )
	        with pytest.raises(ValueError, match="missing entries.*gpt-4o-mini"):
	            await panel.score(_item(), _output())

	    @pytest.mark.asyncio
	    async def test_cancel_on_non_retryable(self, tmp_path):
	        """A non-retryable exception from any member must propagate at once."""
	        from agent_bench.evaluation.judges.base import Rubric
	        from agent_bench.evaluation.variance.jury import jury

	        rubric_path = "agent_bench/evaluation/rubrics/relevance.md"
	        rubric = Rubric.from_markdown_file(rubric_path)

	        # First member: the provider raises ValueError, a caller bug that
	        # is not part of the retryable taxonomy.
	        failing_provider = AsyncMock(spec=LLMProvider)
	        failing_provider.complete.side_effect = ValueError("auth_error")
	        failing_judge = RelevanceJudge(
	            judge_provider=failing_provider, rubric=rubric, model_id="m1"
	        )

	        # Second member: would return a valid score of 1 if it were run.
	        working_provider = AsyncMock(spec=LLMProvider)
	        working_provider.complete.return_value = _mk_response(_vj(1))
	        working_judge = RelevanceJudge(
	            judge_provider=working_provider, rubric=rubric, model_id="m2"
	        )

	        panel = jury(
	            judges=[failing_judge, working_judge],
	            aggregation="mean",
	            sidecar_path=tmp_path / "jury.jsonl",
	        )
	        with pytest.raises(ValueError, match="auth_error"):
	            await panel.score(_item(), _output())