# File: agentbench/tests/evaluation/test_jury_aggregation.py
# Last commit (Nomearod, ab0e054):
#   calibrate(jury): v1.1+v1.1.1 — fix weighting bugs; recency-position paraphrase clause
"""Tests for PermutedJudge and Jury — aggregation, quorum, sidecar."""
from __future__ import annotations
import json
from unittest.mock import AsyncMock
import pytest
from agent_bench.agents.orchestrator import AgentResponse, SourceReference
from agent_bench.core.provider import LLMProvider
from agent_bench.core.types import CompletionResponse, TokenUsage
from agent_bench.evaluation.harness import GoldenQuestion
from agent_bench.evaluation.judges.base import Rubric
from agent_bench.evaluation.judges.relevance import RelevanceJudge
def _mk_response(content: str) -> CompletionResponse:
    """Build a canned provider completion whose body is ``content``.

    Every other field is a fixed placeholder (no tool calls, 10 input and
    10 output tokens, provider ``"mock"``, model ``"m"``), so tests only
    vary the text the judge has to parse.
    """
    return CompletionResponse(
        content=content,
        tool_calls=[],
        usage=TokenUsage(
            input_tokens=10, output_tokens=10, estimated_cost_usd=0.0001
        ),
        provider="mock",
        model="m",
        latency_ms=1.0,
    )
def _vj(score: int | str) -> str:
    """Return a JSON-encoded judge verdict carrying ``score``.

    The payload always has ``reasoning`` set to ``"r"`` and an empty
    ``evidence_quotes`` list; ``score`` is passed through unchanged, so the
    tests in this module can supply either an integer or the string
    ``"Unknown"``.
    """
    return json.dumps({"reasoning": "r", "evidence_quotes": [], "score": score})
def _item(item_id: str = "i1") -> GoldenQuestion:
    """Build a minimal ``GoldenQuestion`` fixture with the given ``item_id``.

    All remaining fields are fixed placeholders: question ``"?"``, no
    expected keywords or sources, category ``"retrieval"``, difficulty
    ``"easy"``, and no calculator requirement.
    """
    return GoldenQuestion(
        id=item_id,
        question="?",
        expected_answer_keywords=[],
        expected_sources=[],
        category="retrieval",
        difficulty="easy",
        requires_calculator=False,
    )
def _output(answer: str = "A.") -> AgentResponse:
    """Build a minimal ``AgentResponse`` fixture whose answer is ``answer``.

    The response cites a single source (``"x.md"``), reports one iteration,
    and carries zeroed usage and latency values.
    """
    return AgentResponse(
        answer=answer,
        sources=[SourceReference(source="x.md")],
        iterations=1,
        usage=TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0),
        latency_ms=0,
    )
def _relevance_judge_with_responses(responses: list[str]) -> RelevanceJudge:
    """Build a ``RelevanceJudge`` backed by a mock provider.

    Each entry of ``responses`` is wrapped with ``_mk_response`` and queued as
    the mock's ``complete.side_effect``, so successive ``complete`` calls
    return the entries in order (one entry is consumed per call).

    The rubric is loaded from a path relative to the current working
    directory.
    """
    rubric = Rubric.from_markdown_file("agent_bench/evaluation/rubrics/relevance.md")
    provider = AsyncMock(spec=LLMProvider)
    provider.complete.side_effect = [_mk_response(r) for r in responses]
    return RelevanceJudge(judge_provider=provider, rubric=rubric, model_id="m")
class TestPermutedJudge:
    """Tests for the judge wrapper returned by ``rubric_permute``."""

    @pytest.mark.asyncio
    async def test_runs_n_permutations_and_means(self, tmp_path):
        """Two permutations scoring 1 and 2 aggregate to a single score of 1."""
        from agent_bench.evaluation.variance.rubric_permute import rubric_permute

        # Two seeds produce two scores: 1 and 2; mean=1.5; ties→lower → 1
        judge = _relevance_judge_with_responses([_vj(1), _vj(2)])
        permuted = rubric_permute(
            judge, n=2, seeds=[1, 2], sidecar_path=tmp_path / "side.jsonl"
        )
        result = await permuted.score(_item(), _output())
        assert result.score == 1
        assert result.judge_id == "m_relevance_perm2"
        assert result.prompt_seed == 0

    @pytest.mark.asyncio
    async def test_any_abstain_propagates_unknown(self, tmp_path):
        """One "Unknown" permutation makes the aggregate "Unknown"/abstained."""
        from agent_bench.evaluation.variance.rubric_permute import rubric_permute

        judge = _relevance_judge_with_responses([_vj(1), _vj("Unknown")])
        permuted = rubric_permute(
            judge, n=2, seeds=[1, 2], sidecar_path=tmp_path / "side.jsonl"
        )
        result = await permuted.score(_item(), _output())
        assert result.score == "Unknown"
        assert result.abstained

    @pytest.mark.asyncio
    async def test_writes_per_permutation_sidecar(self, tmp_path):
        """Scoring writes one JSONL sidecar record per permutation seed."""
        from agent_bench.evaluation.variance.rubric_permute import rubric_permute

        sidecar = tmp_path / "perm_members.jsonl"
        judge = _relevance_judge_with_responses([_vj(2), _vj(2)])
        permuted = rubric_permute(judge, n=2, seeds=[5, 7], sidecar_path=sidecar)
        await permuted.score(_item(), _output())
        lines = sidecar.read_text().strip().split("\n")
        assert len(lines) == 2
        records = [json.loads(line) for line in lines]
        # Each record must carry the seed it was produced with.
        assert {r["prompt_seed"] for r in records} == {5, 7}
class TestJury:
    """Tests for ``jury``: aggregation modes, quorum, sidecar, error handling."""

    @pytest.mark.asyncio
    async def test_mean_aggregation_two_judges(self, tmp_path):
        """Two members both scoring 2 aggregate to 2 under ``"mean"``."""
        from agent_bench.evaluation.variance.jury import jury

        j1 = _relevance_judge_with_responses([_vj(2)])
        j2 = _relevance_judge_with_responses([_vj(2)])
        j1.judge_id = "claude-haiku_relevance"
        j2.judge_id = "gpt-4o-mini_relevance"
        ju = jury(
            judges=[j1, j2], aggregation="mean", sidecar_path=tmp_path / "jury.jsonl"
        )
        result = await ju.score(_item(), _output())
        assert result.score == 2
        assert result.judge_id == "jury_v1_mean"

    @pytest.mark.asyncio
    async def test_strict_quorum_default_abstains_on_one_failure(self, tmp_path):
        """With no quorum argument, one abstaining member of two yields "Unknown"."""
        from agent_bench.evaluation.variance.jury import jury

        j1 = _relevance_judge_with_responses([_vj(1)])
        j1.judge_id = "claude-haiku_relevance"
        # Both attempts return garbage → abstain via schema-parse-after-retry
        j2 = _relevance_judge_with_responses(["garbage", "garbage"])
        j2.judge_id = "gpt-4o-mini_relevance"
        ju = jury(
            judges=[j1, j2], aggregation="mean", sidecar_path=tmp_path / "jury.jsonl"
        )
        result = await ju.score(_item(), _output())
        assert result.score == "Unknown"
        assert "jury_below_quorum" in result.reasoning
        assert "1/2" in result.reasoning

    @pytest.mark.asyncio
    async def test_sidecar_captures_both_members_including_abstain(self, tmp_path):
        """The sidecar holds one record per member, including the abstainer."""
        from agent_bench.evaluation.variance.jury import jury

        j1 = _relevance_judge_with_responses([_vj(1)])
        j1.judge_id = "claude-haiku_relevance"
        j2 = _relevance_judge_with_responses(["garbage", "garbage"])
        j2.judge_id = "gpt-4o-mini_relevance"
        sidecar = tmp_path / "jury.jsonl"
        ju = jury(judges=[j1, j2], aggregation="mean", sidecar_path=sidecar)
        await ju.score(_item(), _output())
        records = [
            json.loads(line) for line in sidecar.read_text().strip().split("\n")
        ]
        assert len(records) == 2
        scores = [r["score"] for r in records]
        assert 1 in scores
        assert "Unknown" in scores

    @pytest.mark.asyncio
    async def test_kappa_weighted_requires_weights(self, tmp_path):
        """Building a ``"kappa_weighted"`` jury without weights raises ValueError."""
        from agent_bench.evaluation.variance.jury import jury

        j1 = _relevance_judge_with_responses([_vj(2)])
        with pytest.raises(ValueError, match="weights"):
            jury(judges=[j1], aggregation="kappa_weighted")

    @pytest.mark.asyncio
    async def test_kappa_weighted_with_equal_weights_matches_mean(self, tmp_path):
        """Regression for ties-to-lower divergence between mean and
        kappa_weighted paths. Two judges score [1, 2] with equal weights;
        weighted mean == 1.5. The mean path returns 1 (ties-to-lower); the
        kappa_weighted path must also return 1 — banker's rounding would
        return 2 and silently violate the policy.
        """
        from agent_bench.evaluation.variance.jury import jury

        j1 = _relevance_judge_with_responses([_vj(1)])
        j1.judge_id = "claude-haiku_relevance"
        j2 = _relevance_judge_with_responses([_vj(2)])
        j2.judge_id = "gpt-4o-mini_relevance"
        weights = {"claude-haiku_relevance": 1.0, "gpt-4o-mini_relevance": 1.0}
        ju = jury(
            judges=[j1, j2],
            aggregation="kappa_weighted",
            weights=weights,
            sidecar_path=tmp_path / "jury.jsonl",
        )
        result = await ju.score(_item(), _output())
        assert result.score == 1, (
            f"kappa_weighted with equal weights on [1, 2] returned "
            f"{result.score}; expected 1 (ties-to-lower per "
            f"_aggregate_scores policy). banker's-rounding bug?"
        )

    @pytest.mark.asyncio
    async def test_kappa_weighted_reasoning_reports_applied_weights(
        self, tmp_path
    ):
        """The reasoning string must surface the per-member weights actually
        used so the aggregation is auditable from the sidecar alone (no need
        to re-derive weights from the source).
        """
        from agent_bench.evaluation.variance.jury import jury

        j1 = _relevance_judge_with_responses([_vj(2)])
        j1.judge_id = "claude-haiku_relevance"
        j2 = _relevance_judge_with_responses([_vj(2)])
        j2.judge_id = "gpt-4o-mini_relevance"
        weights = {
            "claude-haiku_relevance": 5.0,
            "gpt-4o-mini_relevance": 0.25,
        }
        ju = jury(
            judges=[j1, j2],
            aggregation="kappa_weighted",
            weights=weights,
            sidecar_path=tmp_path / "jury.jsonl",
        )
        result = await ju.score(_item(), _output())
        assert "5.0" in result.reasoning, (
            f"applied weight 5.0 missing from reasoning: {result.reasoning!r}"
        )
        assert "0.25" in result.reasoning, (
            f"applied weight 0.25 missing from reasoning: {result.reasoning!r}"
        )

    @pytest.mark.asyncio
    async def test_kappa_weighted_hard_errors_on_missing_weight(self, tmp_path):
        """v1.1 regression: a member judge_id missing from the weights dict
        is a hard error, not a silent fallback to 1.0. v1's silent fallback
        let an asymmetric weights source amplify the unweighted member —
        see DECISIONS "v1.1 jury rescue" entry for the calibration evidence.
        """
        from agent_bench.evaluation.variance.jury import jury

        j1 = _relevance_judge_with_responses([_vj(1)])
        j1.judge_id = "claude-haiku_relevance"
        j2 = _relevance_judge_with_responses([_vj(1)])
        j2.judge_id = "gpt-4o-mini_relevance"
        weights = {"claude-haiku_relevance": 1.0}  # j2 missing
        ju = jury(
            judges=[j1, j2],
            aggregation="kappa_weighted",
            weights=weights,
            sidecar_path=tmp_path / "jury.jsonl",
        )
        # The error surfaces when scoring, not when the jury is constructed.
        with pytest.raises(ValueError, match="missing entries.*gpt-4o-mini"):
            await ju.score(_item(), _output())

    @pytest.mark.asyncio
    async def test_cancel_on_non_retryable(self, tmp_path):
        """Non-retryable exception in any member must propagate immediately."""
        from agent_bench.evaluation.variance.jury import jury

        # ``Rubric`` comes from the module-level import; the judges are built
        # by hand here because each one needs its own provider behavior.
        rubric = Rubric.from_markdown_file(
            "agent_bench/evaluation/rubrics/relevance.md"
        )
        # j1 raises ValueError (caller bug — not in retryable taxonomy)
        provider1 = AsyncMock(spec=LLMProvider)
        provider1.complete.side_effect = ValueError("auth_error")
        j1 = RelevanceJudge(judge_provider=provider1, rubric=rubric, model_id="m1")
        # j2 would succeed if it ran
        provider2 = AsyncMock(spec=LLMProvider)
        provider2.complete.return_value = _mk_response(_vj(1))
        j2 = RelevanceJudge(judge_provider=provider2, rubric=rubric, model_id="m2")
        ju = jury(
            judges=[j1, j2], aggregation="mean", sidecar_path=tmp_path / "jury.jsonl"
        )
        with pytest.raises(ValueError, match="auth_error"):
            await ju.score(_item(), _output())