Spaces:
Running
Running
| """Tests for PermutedJudge and Jury — aggregation, quorum, sidecar.""" | |
| from __future__ import annotations | |
| import json | |
| from unittest.mock import AsyncMock | |
| import pytest | |
| from agent_bench.agents.orchestrator import AgentResponse, SourceReference | |
| from agent_bench.core.provider import LLMProvider | |
| from agent_bench.core.types import CompletionResponse, TokenUsage | |
| from agent_bench.evaluation.harness import GoldenQuestion | |
| from agent_bench.evaluation.judges.base import Rubric | |
| from agent_bench.evaluation.judges.relevance import RelevanceJudge | |
def _mk_response(content: str) -> CompletionResponse:
    """Wrap ``content`` in a canned ``CompletionResponse`` for the mock provider.

    Everything except the text payload is a fixed placeholder: no tool calls,
    10 input / 10 output tokens at a nominal cost, provider ``"mock"``,
    model ``"m"`` and a 1 ms latency.
    """
    token_usage = TokenUsage(
        input_tokens=10,
        output_tokens=10,
        estimated_cost_usd=0.0001,
    )
    return CompletionResponse(
        content=content,
        tool_calls=[],
        usage=token_usage,
        provider="mock",
        model="m",
        latency_ms=1.0,
    )
| def _vj(score) -> str: | |
| return json.dumps({"reasoning": "r", "evidence_quotes": [], "score": score}) | |
def _item(item_id: str = "i1") -> GoldenQuestion:
    """Return a bare-bones ``GoldenQuestion`` identified by ``item_id``.

    The question text is ``"?"``, there are no expected keywords or sources,
    and the item is tagged as an easy retrieval question that does not need
    the calculator.
    """
    fields = {
        "id": item_id,
        "question": "?",
        "expected_answer_keywords": [],
        "expected_sources": [],
        "category": "retrieval",
        "difficulty": "easy",
        "requires_calculator": False,
    }
    return GoldenQuestion(**fields)
def _output(answer: str = "A.") -> AgentResponse:
    """Return a stub ``AgentResponse`` whose answer text is ``answer``.

    The response cites the single source ``x.md``, reports one iteration,
    and carries zeroed token usage and latency.
    """
    cited_sources = [SourceReference(source="x.md")]
    zero_usage = TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0)
    return AgentResponse(
        answer=answer,
        sources=cited_sources,
        iterations=1,
        usage=zero_usage,
        latency_ms=0,
    )
def _relevance_judge_with_responses(responses: list[str]) -> RelevanceJudge:
    """Build a ``RelevanceJudge`` backed by a mocked provider.

    Each string in ``responses`` is wrapped via ``_mk_response`` and queued as
    the mock's ``complete`` side effect, so successive calls yield them in
    order. The rubric is loaded from the repository-relative relevance
    rubric file, and the judge is created with model id ``"m"``.
    """
    scripted_provider = AsyncMock(spec=LLMProvider)
    scripted_provider.complete.side_effect = list(map(_mk_response, responses))
    relevance_rubric = Rubric.from_markdown_file(
        "agent_bench/evaluation/rubrics/relevance.md"
    )
    return RelevanceJudge(
        judge_provider=scripted_provider,
        rubric=relevance_rubric,
        model_id="m",
    )
class TestPermutedJudge:
    """Checks for the judge wrapper returned by ``rubric_permute``."""

    async def test_runs_n_permutations_and_means(self, tmp_path):
        """Two permutations scoring 1 and 2 are expected to aggregate to 1."""
        from agent_bench.evaluation.variance.rubric_permute import rubric_permute

        # Seeds 1 and 2 yield scores 1 and 2; the mean is 1.5 and the tie is
        # expected to resolve to the lower value, i.e. 1.
        base_judge = _relevance_judge_with_responses([_vj(1), _vj(2)])
        sidecar_file = tmp_path / "side.jsonl"
        permuted = rubric_permute(
            base_judge, n=2, seeds=[1, 2], sidecar_path=sidecar_file
        )

        verdict = await permuted.score(_item(), _output())

        assert verdict.score == 1
        assert verdict.judge_id == "m_relevance_perm2"
        assert verdict.prompt_seed == 0

    async def test_any_abstain_propagates_unknown(self, tmp_path):
        """One ``"Unknown"`` permutation must make the combined result abstain."""
        from agent_bench.evaluation.variance.rubric_permute import rubric_permute

        base_judge = _relevance_judge_with_responses([_vj(1), _vj("Unknown")])
        sidecar_file = tmp_path / "side.jsonl"
        permuted = rubric_permute(
            base_judge, n=2, seeds=[1, 2], sidecar_path=sidecar_file
        )

        verdict = await permuted.score(_item(), _output())

        assert verdict.score == "Unknown"
        assert verdict.abstained

    async def test_writes_per_permutation_sidecar(self, tmp_path):
        """The sidecar must hold one JSON line per permutation, tagged by seed."""
        from agent_bench.evaluation.variance.rubric_permute import rubric_permute

        sidecar = tmp_path / "perm_members.jsonl"
        base_judge = _relevance_judge_with_responses([_vj(2), _vj(2)])
        permuted = rubric_permute(base_judge, n=2, seeds=[5, 7], sidecar_path=sidecar)

        await permuted.score(_item(), _output())

        raw_lines = sidecar.read_text().strip().split("\n")
        assert len(raw_lines) == 2
        records = [json.loads(raw) for raw in raw_lines]
        recorded_seeds = {record["prompt_seed"] for record in records}
        assert recorded_seeds == {5, 7}
class TestJury:
    """Checks for ``jury``: aggregation modes, quorum, sidecar, error paths."""

    # Judge ids shared by the two-member juries built in these tests.
    _HAIKU_ID = "claude-haiku_relevance"
    _MINI_ID = "gpt-4o-mini_relevance"

    @staticmethod
    def _named_judge(judge_id: str, responses: list[str]) -> RelevanceJudge:
        """Return a mocked relevance judge replaying ``responses`` under ``judge_id``."""
        member = _relevance_judge_with_responses(responses)
        member.judge_id = judge_id
        return member

    async def test_mean_aggregation_two_judges(self, tmp_path):
        """Two members both scoring 2 must yield 2 under the mean jury id."""
        from agent_bench.evaluation.variance.jury import jury

        first = self._named_judge(self._HAIKU_ID, [_vj(2)])
        second = self._named_judge(self._MINI_ID, [_vj(2)])
        panel = jury(
            judges=[first, second],
            aggregation="mean",
            sidecar_path=tmp_path / "jury.jsonl",
        )

        verdict = await panel.score(_item(), _output())

        assert verdict.score == 2
        assert verdict.judge_id == "jury_v1_mean"

    async def test_strict_quorum_default_abstains_on_one_failure(self, tmp_path):
        """With default quorum, one abstaining member makes the jury abstain."""
        from agent_bench.evaluation.variance.jury import jury

        first = self._named_judge(self._HAIKU_ID, [_vj(1)])
        # Both attempts return garbage, so this member abstains after its
        # schema-parse retry.
        second = self._named_judge(self._MINI_ID, ["garbage", "garbage"])
        panel = jury(
            judges=[first, second],
            aggregation="mean",
            sidecar_path=tmp_path / "jury.jsonl",
        )

        verdict = await panel.score(_item(), _output())

        assert verdict.score == "Unknown"
        assert "jury_below_quorum" in verdict.reasoning
        assert "1/2" in verdict.reasoning

    async def test_sidecar_captures_both_members_including_abstain(self, tmp_path):
        """The sidecar must record every member, including the abstaining one."""
        from agent_bench.evaluation.variance.jury import jury

        first = self._named_judge(self._HAIKU_ID, [_vj(1)])
        second = self._named_judge(self._MINI_ID, ["garbage", "garbage"])
        sidecar = tmp_path / "jury.jsonl"
        panel = jury(judges=[first, second], aggregation="mean", sidecar_path=sidecar)

        await panel.score(_item(), _output())

        raw_lines = sidecar.read_text().strip().split("\n")
        records = [json.loads(raw) for raw in raw_lines]
        assert len(records) == 2
        member_scores = [record["score"] for record in records]
        assert 1 in member_scores
        assert "Unknown" in member_scores

    async def test_kappa_weighted_requires_weights(self, tmp_path):
        """Building a kappa-weighted jury without weights must raise ValueError."""
        from agent_bench.evaluation.variance.jury import jury

        lone_member = _relevance_judge_with_responses([_vj(2)])
        with pytest.raises(ValueError, match="weights"):
            jury(judges=[lone_member], aggregation="kappa_weighted")

    async def test_kappa_weighted_with_equal_weights_matches_mean(self, tmp_path):
        """Guard against the mean and kappa_weighted paths rounding differently.

        Two judges score [1, 2] with equal weights, so the weighted mean is
        1.5. The mean path returns 1 (ties-to-lower), and the kappa_weighted
        path must agree; banker's rounding would give 2 and silently break
        the policy.
        """
        from agent_bench.evaluation.variance.jury import jury

        first = self._named_judge(self._HAIKU_ID, [_vj(1)])
        second = self._named_judge(self._MINI_ID, [_vj(2)])
        weights = {self._HAIKU_ID: 1.0, self._MINI_ID: 1.0}
        panel = jury(
            judges=[first, second],
            aggregation="kappa_weighted",
            weights=weights,
            sidecar_path=tmp_path / "jury.jsonl",
        )

        result = await panel.score(_item(), _output())

        assert result.score == 1, (
            f"kappa_weighted with equal weights on [1, 2] returned "
            f"{result.score}; expected 1 (ties-to-lower per "
            f"_aggregate_scores policy). banker's-rounding bug?"
        )

    async def test_kappa_weighted_reasoning_reports_applied_weights(
        self, tmp_path
    ):
        """The reasoning text must expose the per-member weights that were used.

        This keeps the aggregation auditable from the sidecar alone, without
        re-deriving the weights from their source.
        """
        from agent_bench.evaluation.variance.jury import jury

        first = self._named_judge(self._HAIKU_ID, [_vj(2)])
        second = self._named_judge(self._MINI_ID, [_vj(2)])
        weights = {self._HAIKU_ID: 5.0, self._MINI_ID: 0.25}
        panel = jury(
            judges=[first, second],
            aggregation="kappa_weighted",
            weights=weights,
            sidecar_path=tmp_path / "jury.jsonl",
        )

        result = await panel.score(_item(), _output())

        assert "5.0" in result.reasoning, (
            f"applied weight 5.0 missing from reasoning: {result.reasoning!r}"
        )
        assert "0.25" in result.reasoning, (
            f"applied weight 0.25 missing from reasoning: {result.reasoning!r}"
        )

    async def test_kappa_weighted_hard_errors_on_missing_weight(self, tmp_path):
        """v1.1 regression: a member absent from ``weights`` is a hard error.

        v1 silently fell back to a weight of 1.0, which let an asymmetric
        weights source amplify the unweighted member; see the DECISIONS
        "v1.1 jury rescue" entry for the calibration evidence.
        """
        from agent_bench.evaluation.variance.jury import jury

        first = self._named_judge(self._HAIKU_ID, [_vj(1)])
        second = self._named_judge(self._MINI_ID, [_vj(1)])
        weights = {self._HAIKU_ID: 1.0}  # the second member is deliberately absent
        panel = jury(
            judges=[first, second],
            aggregation="kappa_weighted",
            weights=weights,
            sidecar_path=tmp_path / "jury.jsonl",
        )

        with pytest.raises(ValueError, match="missing entries.*gpt-4o-mini"):
            await panel.score(_item(), _output())

    async def test_cancel_on_non_retryable(self, tmp_path):
        """A non-retryable exception from any member must propagate immediately."""
        from agent_bench.evaluation.judges.base import Rubric
        from agent_bench.evaluation.variance.jury import jury

        rubric = Rubric.from_markdown_file(
            "agent_bench/evaluation/rubrics/relevance.md"
        )

        # First member: its provider raises ValueError, a caller bug that is
        # not part of the retryable taxonomy.
        failing_provider = AsyncMock(spec=LLMProvider)
        failing_provider.complete.side_effect = ValueError("auth_error")
        failing_judge = RelevanceJudge(
            judge_provider=failing_provider, rubric=rubric, model_id="m1"
        )

        # Second member: would succeed if it ran.
        healthy_provider = AsyncMock(spec=LLMProvider)
        healthy_provider.complete.return_value = _mk_response(_vj(1))
        healthy_judge = RelevanceJudge(
            judge_provider=healthy_provider, rubric=rubric, model_id="m2"
        )

        panel = jury(
            judges=[failing_judge, healthy_judge],
            aggregation="mean",
            sidecar_path=tmp_path / "jury.jsonl",
        )

        with pytest.raises(ValueError, match="auth_error"):
            await panel.score(_item(), _output())