Spaces:

Nomearod
/

agentbench

Running

Nomearod Claude Opus 4.7 (1M context) committed on May 4

Commit

9255fb5

1 Parent(s): 508e5ef

fix(judges): four review-blocking bugs (review items 1–4 + 8)

#1 — harness.py CompletenessJudge gate restored. Pre-supersession code
gated correctness on `if q.reference_answer:`; the new per-dimension
loop iterated all dims unconditionally, burning tokens on guaranteed-
noisy verdicts when reference_answer was empty. Now: skip completeness
when reference_answer is falsy, matching the prior contract. Test
asserts the gate by mocking a judge_provider and confirming
'completeness' is absent from judge_scores when reference is "".

#2 — Rubric loader was fence-blind. `## Score N` literals inside
fenced code blocks in anchored examples were counted as structural
level headers, producing arity-mismatch errors on rubrics that wanted
to quote header-shaped strings (which the design encourages). Fix:
mask fenced regions with same-length whitespace before scanning for
level headers, then slice the original body at the masked-text header
positions to recover level bodies with their fenced content intact.
New fixture rubrics_valid_with_fenced_examples.md exercises the case;
test was failing before this change.

#3 — Jury kappa_weighted contradicted ties-to-lower policy. The
`mean` aggregation path discretizes via _aggregate_scores (frac > 0.5
→ ceil, else floor; ties go to floor). The `kappa_weighted` path went
through int(round(weighted_mean)) which is Python's banker's rounding
(0.5 → 0, 1.5 → 2). Result: two judges scoring [1, 2] with equal
weights returned 1 under `mean` and 2 under `kappa_weighted`. Now:
extracted _discretize_mean helper that mirrors _aggregate_scores
exactly. Test pins the equivalence at the half-integer boundary.

#4 — Jury reasoning string concealed the silent weight fallback.
When the kappa_weighted weights dict was missing a member's judge_id,
runtime fell back to 1.0 silently — but the reasoning string printed
the constructor's dict (`list(self.weights.values())`), so anyone
debugging a calibration row saw the configured weights, not the
applied ones. Now: reasoning reports per-successful-member applied
weights; a structlog WARN ('jury_missing_weight_fallback_to_one')
fires for each fallback so operators notice the contract violation.
Two regression tests: applied-weights-in-reasoning, warn-on-missing.

#8 — Hoisted vestigial inline imports in harness.py from the
TYPE_CHECKING attempt. ScoreResult is already module-top imported,
no cycle risk. _JUDGE_CLASS_BY_DIMENSION is now a module-level
constant.

All 514 tests pass; ruff clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (7) hide show

agent_bench/evaluation/harness.py +19 -14
agent_bench/evaluation/judges/base.py +34 -7
agent_bench/evaluation/variance/jury.py +46 -12
tests/evaluation/fixtures/rubrics_valid_with_fenced_examples.md +43 -0
tests/evaluation/test_harness_migration.py +80 -0
tests/evaluation/test_jury_aggregation.py +90 -0
tests/evaluation/test_rubric_loading.py +15 -0

agent_bench/evaluation/harness.py CHANGED Viewed

@@ -8,9 +8,13 @@ from pathlib import Path
 from pydantic import BaseModel, Field
 from agent_bench.agents.orchestrator import Orchestrator
 from agent_bench.core.provider import LLMProvider
 from agent_bench.core.types import TokenUsage
-from agent_bench.evaluation.judges.base import ScoreResult
 from agent_bench.evaluation.metrics import (
     calculator_used_when_expected,
     citation_accuracy,
@@ -22,6 +26,12 @@ from agent_bench.evaluation.metrics import (
     tool_call_count,
 )
 class GoldenQuestion(BaseModel):
     id: str
@@ -155,24 +165,19 @@ async def run_evaluation(
         # behavior); the q.category != 'out_of_scope' gate is preserved
         # (L2 doesn't apply to refusals — that's L1's job).
         if judge_provider is not None and q.category != "out_of_scope":
-            from agent_bench.core.config import load_config
-            from agent_bench.evaluation.judges.base import Rubric
-            from agent_bench.evaluation.judges.completeness import CompletenessJudge
-            from agent_bench.evaluation.judges.groundedness import GroundednessJudge
-            from agent_bench.evaluation.judges.relevance import RelevanceJudge
             cfg = load_config()
             rubric_dir = Path(__file__).resolve().parent / "rubrics"
-            judge_class = {
-                "groundedness": GroundednessJudge,
-                "relevance": RelevanceJudge,
-                "completeness": CompletenessJudge,
-            }
             for dim in cfg.evaluation.judge_dimensions:
-                if dim not in judge_class:
                     continue  # citation_faithfulness opt-in; not in default loop
                 rubric = Rubric.from_markdown_file(rubric_dir / f"{dim}.md")
-                judge = judge_class[dim](
                     judge_provider=judge_provider,
                     rubric=rubric,
                     model_id=getattr(judge_provider, "model", "unknown"),

 from pydantic import BaseModel, Field
 from agent_bench.agents.orchestrator import Orchestrator
+from agent_bench.core.config import load_config
 from agent_bench.core.provider import LLMProvider
 from agent_bench.core.types import TokenUsage
+from agent_bench.evaluation.judges.base import Rubric, ScoreResult
+from agent_bench.evaluation.judges.completeness import CompletenessJudge
+from agent_bench.evaluation.judges.groundedness import GroundednessJudge
+from agent_bench.evaluation.judges.relevance import RelevanceJudge
 from agent_bench.evaluation.metrics import (
     calculator_used_when_expected,
     citation_accuracy,
     tool_call_count,
 )
# Judge class to instantiate for each configured judge dimension. The
# evaluation loop skips any configured dimension that has no entry here.
_JUDGE_CLASS_BY_DIMENSION: dict[str, type] = {
    "groundedness": GroundednessJudge,
    "relevance": RelevanceJudge,
    "completeness": CompletenessJudge,
}
 class GoldenQuestion(BaseModel):
     id: str
         # behavior); the q.category != 'out_of_scope' gate is preserved
         # (L2 doesn't apply to refusals — that's L1's job).
         if judge_provider is not None and q.category != "out_of_scope":
             cfg = load_config()
             rubric_dir = Path(__file__).resolve().parent / "rubrics"
             for dim in cfg.evaluation.judge_dimensions:
+                if dim not in _JUDGE_CLASS_BY_DIMENSION:
                     continue  # citation_faithfulness opt-in; not in default loop
+                # CompletenessJudge is reference-based on q.reference_answer;
+                # scoring an empty reference is guaranteed-noisy and burns
+                # tokens. Pre-supersession code had the same gate (correctness
+                # was conditional on reference_answer being non-empty).
+                if dim == "completeness" and not q.reference_answer:
+                    continue
                 rubric = Rubric.from_markdown_file(rubric_dir / f"{dim}.md")
+                judge = _JUDGE_CLASS_BY_DIMENSION[dim](
                     judge_provider=judge_provider,
                     rubric=rubric,
                     model_id=getattr(judge_provider, "model", "unknown"),

agent_bench/evaluation/judges/base.py CHANGED Viewed

@@ -77,6 +77,22 @@ class ScoreResult(BaseModel):
         return self.score == "Unknown"
 class RubricLevel(BaseModel):
     """One score level in a rubric, with anchored examples.
@@ -150,14 +166,25 @@ class Rubric(BaseModel):
                 f"must be 'binary' or 'three_point'"
             )
-        # Parse levels by ## Score N headers
         body_no_fm = fm_match.group(2)
-        level_pattern = re.compile(
-            r"^## Score (\d+)\n(.*?)(?=^## Score |\Z)", re.MULTILINE | re.DOTALL
-        )
-        raw_levels: list[tuple[int, str]] = [
-            (int(m.group(1)), m.group(2)) for m in level_pattern.finditer(body_no_fm)
-        ]
         expected_arity = 2 if scale == "binary" else 3
         if len(raw_levels) != expected_arity:

         return self.score == "Unknown"
# Matches one whole fenced code block, following the CommonMark rules:
#   * opening fence: up to three spaces of indent, then 3+ backticks or 3+
#     tildes. A backtick fence's info string may not contain a backtick, so
#     an inline span such as ```x``` at the start of a line is not a fence.
#   * closing fence: the same character, at least as long as the opener,
#     alone on its line (trailing blanks allowed).
#   * a fence that is never closed runs to the end of the text.
_FENCE_PATTERN = re.compile(
    r"^ {0,3}(?:(?P<bt>`{3,})[^`\n]*|(?P<td>~{3,})[^\n]*)(?:\n|\Z)"
    r".*?"
    r"(?:^ {0,3}(?:(?P=bt)`*|(?P=td)~*)[ \t]*\r?(?:\n|\Z)|\Z)",
    re.MULTILINE | re.DOTALL,
)
_NON_NEWLINE_PATTERN = re.compile(r"[^\n]")


def _mask_code_fences(text: str) -> str:
    """Blank out fenced code blocks while keeping every character offset.

    Each character of a fenced block (fence lines included) is replaced by a
    single space, except ``"\\n"`` which is kept. The result therefore has
    the same length and the same line structure as ``text``, so a match
    position found in the masked string can be used to slice the original.
    Offsets are character (``str`` index) offsets, not encoded-byte offsets.

    Used by the rubric loader to ignore ``## Score N`` literals that appear
    inside fenced examples when scanning for structural level headers.

    Args:
        text: Markdown text that may contain fenced code blocks.

    Returns:
        A string of identical length with all fenced regions masked. Text
        outside fences is returned unchanged.
    """

    def _blank(match: re.Match[str]) -> str:
        # One-for-one substitution keeps offsets aligned with the original.
        return _NON_NEWLINE_PATTERN.sub(" ", match.group(0))

    return _FENCE_PATTERN.sub(_blank, text)
 class RubricLevel(BaseModel):
     """One score level in a rubric, with anchored examples.
                 f"must be 'binary' or 'three_point'"
             )
+        # Parse levels by ## Score N headers. Mask fenced code blocks first
+        # so a literal "## Score N" inside an example's code fence is not
+        # interpreted as a structural level header. The mask preserves byte
+        # offsets (replacing non-newline chars with spaces) so we can slice
+        # the original `body_no_fm` at the masked-text header positions to
+        # recover level bodies with their fenced content intact.
         body_no_fm = fm_match.group(2)
+        masked_body = _mask_code_fences(body_no_fm)
+        header_pattern = re.compile(r"^## Score (\d+)\n", re.MULTILINE)
+        header_matches = list(header_pattern.finditer(masked_body))
+        raw_levels: list[tuple[int, str]] = []
+        for i, m in enumerate(header_matches):
+            start = m.end()
+            end = (
+                header_matches[i + 1].start()
+                if i + 1 < len(header_matches)
+                else len(body_no_fm)
+            )
+            raw_levels.append((int(m.group(1)), body_no_fm[start:end]))
         expected_arity = 2 if scale == "binary" else 3
         if len(raw_levels) != expected_arity:

agent_bench/evaluation/variance/jury.py CHANGED Viewed

@@ -6,6 +6,8 @@ import asyncio
 from pathlib import Path
 from typing import TYPE_CHECKING, Literal
 from agent_bench.evaluation.judges.base import Judge, ScoreResult
 from agent_bench.evaluation.variance.rubric_permute import _aggregate_scores
@@ -15,6 +17,21 @@ if TYPE_CHECKING:
 _DEFAULT_SIDECAR_TEMPLATE = "results/calibration_v1_judge_{aggregation}_members.jsonl"
 class Jury:
     """Aggregates a list of Judge instances into one ScoreResult per item.
@@ -94,21 +111,38 @@ class Jury:
         # Aggregate over successful members
         scores = [int(r.score) for r in successful]
         scale = self.judges[0].rubric.scale
         if self.aggregation == "mean":
             agg = _aggregate_scores(scores, scale)
         else:  # kappa_weighted
-            # Weight successful members by judge_id; missing weights → 1.0 (mean fallback)
-            ws = [self.weights.get(r.judge_id, 1.0) for r in successful]
-            weighted_sum = sum(s * w for s, w in zip(scores, ws))
-            weight_total = sum(ws)
-            mean = weighted_sum / weight_total if weight_total > 0 else 0.0
-            agg = _aggregate_scores([int(round(mean))], scale)
-        weights_str = (
-            list(self.weights.values())
-            if self.aggregation == "kappa_weighted"
-            else "n/a"
-        )
         return ScoreResult(
             reasoning=(
                 f"jury_{self.aggregation}: "

 from pathlib import Path
 from typing import TYPE_CHECKING, Literal
+import structlog
 from agent_bench.evaluation.judges.base import Judge, ScoreResult
 from agent_bench.evaluation.variance.rubric_permute import _aggregate_scores
 _DEFAULT_SIDECAR_TEMPLATE = "results/calibration_v1_judge_{aggregation}_members.jsonl"
+logger = structlog.get_logger()
def _discretize_mean(
    mean: float, scale: str, *, tie_tolerance: float = 1e-9
) -> int:
    """Discretize a float mean to a score level, breaking ties downward.

    Intended to follow the same ties-to-lower policy as ``_aggregate_scores``
    (fractional part above one half rounds up, otherwise down). It avoids
    ``int(round(...))``, whose round-half-to-even behaviour would send 1.5
    to 2.

    A weighted mean is computed in floating point, so a value that is
    mathematically an exact half can come out one ulp high. For example
    ``(0.1 * 1 + 0.1 * 2) / (0.1 + 0.1)`` evaluates to
    ``1.5000000000000002``. Comparing against exactly 0.5 would round that
    up and silently break the tie rule, so values within ``tie_tolerance``
    of the half-way point are treated as ties.

    Args:
        mean: The (possibly weighted) mean of the member scores.
        scale: ``"binary"`` selects the 0/1 threshold rule; any other value
            is discretized to the nearest integer level with ties going
            down.
        tie_tolerance: Absolute slack around the half-way point within which
            a value still counts as a tie. Pass ``0.0`` for an exact
            comparison.

    Returns:
        The discrete score level.
    """
    threshold = 0.5 + tie_tolerance
    if scale == "binary":
        return 1 if mean > threshold else 0
    lower = int(mean // 1)
    return lower + 1 if mean - lower > threshold else lower
 class Jury:
     """Aggregates a list of Judge instances into one ScoreResult per item.
         # Aggregate over successful members
         scores = [int(r.score) for r in successful]
         scale = self.judges[0].rubric.scale
+        applied_weights: list[float] = []
         if self.aggregation == "mean":
             agg = _aggregate_scores(scores, scale)
         else:  # kappa_weighted
+            # Weight successful members by judge_id; missing weights → 1.0
+            # (mean fallback). Warn loudly when this fallback fires —
+            # `kappa_weighted` is supposed to use explicit weights, and
+            # silently substituting 1.0 violates that contract.
+            for r in successful:
+                if r.judge_id not in self.weights:
+                    logger.warning(
+                        "jury_missing_weight_fallback_to_one",
+                        judge_id=r.judge_id,
+                        aggregation=self.aggregation,
+                        configured_weights=sorted(self.weights.keys()),
+                    )
+                applied_weights.append(self.weights.get(r.judge_id, 1.0))
+            weighted_sum = sum(s * w for s, w in zip(scores, applied_weights))
+            weight_total = sum(applied_weights)
+            weighted_mean = (
+                weighted_sum / weight_total if weight_total > 0 else 0.0
+            )
+            # Discretize via the shared ties-to-lower policy (NOT int(round())
+            # which uses banker's rounding and would diverge from the `mean`
+            # path on half-integer aggregates).
+            agg = _discretize_mean(weighted_mean, scale)
+        # Reasoning string reports the per-member weights actually applied
+        # (not the constructor's dict — the dict may be missing entries that
+        # silently fell back to 1.0; printing the constructor's dict would
+        # conceal that fallback from anyone debugging a calibration row).
+        weights_str = applied_weights if self.aggregation == "kappa_weighted" else "n/a"
         return ScoreResult(
             reasoning=(
                 f"jury_{self.aggregation}: "

tests/evaluation/fixtures/rubrics_valid_with_fenced_examples.md ADDED Viewed

	@@ -0,0 +1,43 @@

+---
+dimension: groundedness
+scale: binary
+reference_based: true
+abstain_allowed: true
+---
+# Groundedness with fenced code examples
+## Score 0
+Answer adds an unsupported claim.
+### Example A — answer references nonexistent score in a code fence
+The agent's answer might contain markdown that LOOKS like a section header
+but is actually inside a code fence. Example output:
+```markdown
+## Score 7
+This isn't a real rubric level — it's a string that happens to match the
+level-header pattern, embedded in a code-fence example.
+```
+Score=0 because the cited claim above is fabricated; the rubric loader
+must not interpret the fenced `## Score 7` as a real level.
+## Score 1
+Every claim is supported.
+### Example B — fenced reference excerpt
+The agent might quote a config snippet with a header inside:
+```yaml
+# Config heading
+## Score handler
+score_handler: default
+```
+Score=1 because the fenced YAML is illustrative, not a rubric-structural
+header.

tests/evaluation/test_harness_migration.py CHANGED Viewed

@@ -2,7 +2,14 @@
 from __future__ import annotations
 from agent_bench.core.config import EvaluationConfig
 class TestJudgeProviderConfigPreserved:
@@ -33,3 +40,76 @@ class TestEvalResultJudgeScores:
         assert "judge_scores" in fields, (
             "judge_scores: dict[str, ScoreResult] should be added"
         )

 from __future__ import annotations
+from unittest.mock import AsyncMock
+import pytest
+from agent_bench.agents.orchestrator import AgentResponse, SourceReference
 from agent_bench.core.config import EvaluationConfig
+from agent_bench.core.provider import LLMProvider
+from agent_bench.core.types import CompletionResponse, TokenUsage
 class TestJudgeProviderConfigPreserved:
         assert "judge_scores" in fields, (
             "judge_scores: dict[str, ScoreResult] should be added"
         )
def _mk_judge_response(score: int) -> CompletionResponse:
    """Build a canned judge completion whose JSON content carries ``score``."""
    import json

    verdict = {"reasoning": "r", "evidence_quotes": [], "score": score}
    token_usage = TokenUsage(
        input_tokens=10, output_tokens=10, estimated_cost_usd=0.0
    )
    return CompletionResponse(
        content=json.dumps(verdict),
        tool_calls=[],
        usage=token_usage,
        provider="mock",
        model="m",
        latency_ms=1.0,
    )
class TestCompletenessGatedOnReferenceAnswer:
    """Guard the reference-answer gate on the completeness dimension.

    The pre-supersession harness judged correctness only under
    `if q.reference_answer:`. The per-dimension loop has to keep that gate
    so an empty reference never spends judge tokens on a guaranteed-noisy
    verdict.
    """

    @pytest.mark.asyncio
    async def test_empty_reference_answer_skips_completeness_judge(self, tmp_path):
        from agent_bench.agents.orchestrator import Orchestrator
        from agent_bench.evaluation.harness import run_evaluation

        # A single golden question whose reference_answer is the empty string.
        golden_file = tmp_path / "golden.json"
        golden_file.write_text(
            '[{"id": "q1", "question": "?", "expected_answer_keywords": [],'
            ' "expected_sources": [], "category": "retrieval",'
            ' "difficulty": "easy", "requires_calculator": false,'
            ' "reference_answer": ""}]'
        )

        # Orchestrator stub that always returns the same canned response.
        zero_usage = TokenUsage(
            input_tokens=0, output_tokens=0, estimated_cost_usd=0.0
        )
        canned_response = AgentResponse(
            answer="Some answer.",
            sources=[SourceReference(source="a.md")],
            ranked_sources=["a.md"],
            source_chunks=["chunk a"],
            iterations=1,
            usage=zero_usage,
            latency_ms=0.0,
        )
        orchestrator_stub = AsyncMock(spec=Orchestrator)
        orchestrator_stub.run.return_value = canned_response

        # Judge provider stub; every completion scores 1.
        provider_stub = AsyncMock(spec=LLMProvider)
        provider_stub.model = "test-model"
        provider_stub.complete.return_value = _mk_judge_response(1)

        results = await run_evaluation(
            orchestrator=orchestrator_stub,
            system_prompt="x",
            golden_path=golden_file,
            judge_provider=provider_stub,
        )

        assert len(results) == 1
        verdicts = results[0].judge_scores
        # The empty reference must suppress completeness only; the other two
        # dimensions still run.
        assert "completeness" not in verdicts, (
            "CompletenessJudge ran with empty reference_answer — "
            "should be gated on q.reference_answer truthiness"
        )
        assert "groundedness" in verdicts
        assert "relevance" in verdicts

tests/evaluation/test_jury_aggregation.py CHANGED Viewed

@@ -165,6 +165,96 @@ class TestJury:
         with pytest.raises(ValueError, match="weights"):
             jury(judges=[j1], aggregation="kappa_weighted")
     @pytest.mark.asyncio
     async def test_cancel_on_non_retryable(self, tmp_path):
         """Non-retryable exception in any member must propagate immediately."""

         with pytest.raises(ValueError, match="weights"):
             jury(judges=[j1], aggregation="kappa_weighted")
+    @pytest.mark.asyncio
+    async def test_kappa_weighted_with_equal_weights_matches_mean(self, tmp_path):
+        """Regression for ties-to-lower divergence between mean and
+        kappa_weighted paths. Two judges score [1, 2] with equal weights;
+        weighted mean == 1.5. The mean path returns 1 (ties-to-lower); the
+        kappa_weighted path must also return 1 — banker's rounding would
+        return 2 and silently violate the policy.
+        """
+        from agent_bench.evaluation.variance.jury import jury
+        j1 = _relevance_judge_with_responses([_vj(1)])
+        j1.judge_id = "claude-haiku_relevance"
+        j2 = _relevance_judge_with_responses([_vj(2)])
+        j2.judge_id = "gpt-4o-mini_relevance"
+        weights = {"claude-haiku_relevance": 1.0, "gpt-4o-mini_relevance": 1.0}
+        ju = jury(
+            judges=[j1, j2],
+            aggregation="kappa_weighted",
+            weights=weights,
+            sidecar_path=tmp_path / "jury.jsonl",
+        )
+        result = await ju.score(_item(), _output())
+        assert result.score == 1, (
+            f"kappa_weighted with equal weights on [1, 2] returned "
+            f"{result.score}; expected 1 (ties-to-lower per "
+            f"_aggregate_scores policy). banker's-rounding bug?"
+        )
+    @pytest.mark.asyncio
+    async def test_kappa_weighted_reasoning_reports_applied_weights_not_dict(
+        self, tmp_path
+    ):
+        """Regression: when the weights dict is missing a member's judge_id,
+        the runtime applies 1.0 silently. The reasoning string MUST report
+        the per-member weights actually used (so the fallback is visible),
+        not the constructor's dict (which would conceal it).
+        """
+        from agent_bench.evaluation.variance.jury import jury
+        j1 = _relevance_judge_with_responses([_vj(2)])
+        j1.judge_id = "claude-haiku_relevance"
+        j2 = _relevance_judge_with_responses([_vj(2)])
+        j2.judge_id = "gpt-4o-mini_relevance"
+        # weights dict only covers j1 — j2 should fall back to 1.0
+        weights = {"claude-haiku_relevance": 5.0}
+        ju = jury(
+            judges=[j1, j2],
+            aggregation="kappa_weighted",
+            weights=weights,
+            sidecar_path=tmp_path / "jury.jsonl",
+        )
+        result = await ju.score(_item(), _output())
+        # Reasoning must surface BOTH applied weights (5.0 and 1.0)
+        assert "5.0" in result.reasoning, (
+            f"applied weight 5.0 missing from reasoning: {result.reasoning!r}"
+        )
+        assert "1.0" in result.reasoning, (
+            f"fallback weight 1.0 missing from reasoning: {result.reasoning!r}"
+        )
+    @pytest.mark.asyncio
+    async def test_kappa_weighted_logs_warning_on_missing_weight(self, tmp_path):
+        """Regression: silent 1.0 substitution for a missing judge_id should
+        emit a structlog WARN so the operator notices a contract violation.
+        """
+        import structlog
+        from agent_bench.evaluation.variance.jury import jury
+        j1 = _relevance_judge_with_responses([_vj(1)])
+        j1.judge_id = "claude-haiku_relevance"
+        j2 = _relevance_judge_with_responses([_vj(1)])
+        j2.judge_id = "gpt-4o-mini_relevance"
+        weights = {"claude-haiku_relevance": 1.0}  # j2 missing
+        ju = jury(
+            judges=[j1, j2],
+            aggregation="kappa_weighted",
+            weights=weights,
+            sidecar_path=tmp_path / "jury.jsonl",
+        )
+        with structlog.testing.capture_logs() as logs:
+            await ju.score(_item(), _output())
+        assert any(
+            entry.get("event") == "jury_missing_weight_fallback_to_one"
+            for entry in logs
+        ), f"no missing-weight warning in {logs!r}"
     @pytest.mark.asyncio
     async def test_cancel_on_non_retryable(self, tmp_path):
         """Non-retryable exception in any member must propagate immediately."""

tests/evaluation/test_rubric_loading.py CHANGED Viewed

@@ -26,6 +26,21 @@ class TestRubricLoading:
         assert r.scale == "three_point"
         assert len(r.levels) == 3
 class TestRubricValidationErrors:
     @pytest.mark.parametrize(

         assert r.scale == "three_point"
         assert len(r.levels) == 3
+    def test_fenced_code_examples_do_not_break_level_count(self):
+        """Regression: the level-pattern regex must skip ``## Score N`` strings
+        that appear inside fenced code blocks. A binary rubric whose
+        Example A contains a code-fenced ``## Score 7`` literal should still
+        load as a 2-level binary rubric, not be rejected with arity mismatch.
+        """
+        r = Rubric.from_markdown_file(
+            FIXTURES / "rubrics_valid_with_fenced_examples.md"
+        )
+        assert r.dimension == "groundedness"
+        assert r.scale == "binary"
+        assert len(r.levels) == 2, (
+            f"fenced ## Score 7 leaked into level count; got {len(r.levels)} levels"
+        )
 class TestRubricValidationErrors:
     @pytest.mark.parametrize(