Spaces:

Nomearod
/

agentbench

Running

Nomearod Claude Opus 4.7 (1M context) committed on May 4

Commit

2192305

1 Parent(s): 7b72b2c

feat(judges): Judge ABC with judge_id derived from model + dimension

Judge is abstract — concrete subclasses (groundedness, relevance,
completeness, citation_faithfulness) land in Phase 2 as thin
~30-line classes per the no-shared-base-method discipline.

judge_id format: '{model_id}_{rubric.dimension}', e.g.
'claude-haiku-4-5_groundedness'. The format is load-bearing for
the calibration report's per-judge κ breakdown — the report
groups by judge_id when computing per-judge agreement against
the human labels.

Cross-package types (GoldenQuestion, AgentResponse, LLMProvider)
are imported under TYPE_CHECKING to keep judges/base.py loadable
before harness.py is migrated in Phase 8 (avoiding a circular
import once harness imports back from judges).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (2) hide show

agent_bench/evaluation/judges/base.py +42 -1
tests/evaluation/test_judges.py +28 -0

agent_bench/evaluation/judges/base.py CHANGED Viewed

@@ -11,12 +11,18 @@ from __future__ import annotations
 import hashlib
 import random
 import re
 from pathlib import Path
-from typing import Literal, Self
 import yaml
 from pydantic import BaseModel, Field
 # --- Abstain-reason constants ---
 #
 # Failure-as-abstain ScoreResults carry a reasoning string with one of
@@ -199,3 +205,38 @@ class Rubric(BaseModel):
             for lvl in permuted_levels
         )
         return permuted_body

 import hashlib
 import random
 import re
+from abc import ABC, abstractmethod
 from pathlib import Path
+from typing import TYPE_CHECKING, Literal, Self
 import yaml
 from pydantic import BaseModel, Field
+if TYPE_CHECKING:
+    from agent_bench.agents.orchestrator import AgentResponse
+    from agent_bench.core.provider import LLMProvider
+    from agent_bench.evaluation.harness import GoldenQuestion
 # --- Abstain-reason constants ---
 #
 # Failure-as-abstain ScoreResults carry a reasoning string with one of
             for lvl in permuted_levels
         )
         return permuted_body
+class Judge(ABC):
+    """Per-dimension LLM judge. Concrete subclasses implement score()
+    for one rubric dimension; they are thin (~30 lines) and not
+    factored against a shared base method (see design doc for why).
+
+    The class is abstract: score() is the only abstract method, so a
+    subclass becomes instantiable once it defines score().
+
+    Instance attributes (all set in __init__):
+        judge_provider: the provider object passed in, stored as given.
+        rubric: the Rubric passed in, stored as given.
+        model_id: the model identifier string passed in.
+        judge_id: '{model_id}_{rubric.dimension}', e.g.
+            'claude-haiku-4-5_groundedness'.
+    """
+    def __init__(
+        self,
+        judge_provider: "LLMProvider",
+        rubric: Rubric,
+        model_id: str,
+    ) -> None:
+        """Store the constructor arguments and derive judge_id.
+
+        Args:
+            judge_provider: provider used by the judge. The annotation is
+                quoted because LLMProvider is imported only under
+                TYPE_CHECKING in this module.
+            rubric: rubric for this judge; its ``dimension`` attribute
+                becomes the suffix of judge_id.
+            model_id: model identifier; becomes the prefix of judge_id.
+        """
+        self.judge_provider = judge_provider
+        self.rubric = rubric
+        self.model_id = model_id
+        # judge_id is computed once here and never updated afterwards;
+        # reassigning model_id or rubric later does not change it.
+        # NOTE(review): the '{model}_{dimension}' format is reportedly
+        # relied on by the calibration report's per-judge grouping —
+        # confirm before changing the separator or ordering.
+        self.judge_id = f"{model_id}_{rubric.dimension}"
+    @abstractmethod
+    async def score(
+        self,
+        item: "GoldenQuestion",
+        output: "AgentResponse",
+        *,
+        prompt_seed: int = 0,
+    ) -> ScoreResult:
+        """Score one (item, output) pair against this judge's rubric.
+        Returns a ScoreResult whose system_output_hash is computed from
+        (item.id, output.answer, sorted(output.sources)). Failures map
+        to abstain via the abstain-reason constants; provider non-
+        retryable errors raise (caller bug, not noise).
+
+        Args:
+            item: the question being judged.
+            output: the agent's response to that question.
+            prompt_seed: keyword-only, defaults to 0. Presumably seeds
+                the rubric prompt permutation — TODO confirm against the
+                concrete subclasses.
+
+        The contract above is what subclasses are expected to honour;
+        this base method has no implementation of its own.
+        """
+        ...

tests/evaluation/test_judges.py CHANGED Viewed

@@ -69,3 +69,31 @@ class TestScoreResult:
     def test_score_rejects_other_strings(self):
         """Constructing a ScoreResult with score="maybe" must raise ValueError."""
         with pytest.raises(ValueError):
             ScoreResult(score="maybe", **self._base_kwargs())  # type: ignore[arg-type]

     def test_score_rejects_other_strings(self):
         """Constructing a ScoreResult with score="maybe" must raise ValueError."""
         with pytest.raises(ValueError):
             ScoreResult(score="maybe", **self._base_kwargs())  # type: ignore[arg-type]
+from abc import ABC
+from pathlib import Path
+from agent_bench.evaluation.judges.base import Judge
+class TestJudgeABC:
+    """Tests for the abstract Judge base class: abstractness and judge_id."""
+    def test_judge_is_abstract(self):
+        """Judge subclasses ABC and cannot be instantiated directly."""
+        assert issubclass(Judge, ABC)
+        # Cannot instantiate directly — score is abstract
+        # The None arguments are never used: instantiation is expected to
+        # fail with a TypeError whose message matches "abstract".
+        with pytest.raises(TypeError, match="abstract"):
+            Judge(judge_provider=None, rubric=None, model_id="test")  # type: ignore[abstract,arg-type]
+    def test_judge_id_built_from_model_and_dimension(self):
+        """judge_id is '<model_id>_<rubric.dimension>' on a concrete subclass."""
+        # Concrete subclass that satisfies the abstract method
+        class _ConcreteJudge(Judge):
+            async def score(self, item, output, *, prompt_seed=0):
+                # Never awaited in this test; only its presence matters so
+                # that the subclass can be instantiated.
+                raise NotImplementedError
+        from agent_bench.evaluation.judges.base import Rubric
+        # Load a rubric from the fixtures directory next to this test file.
+        # The final assertion implies this fixture's dimension is
+        # "groundedness".
+        rubric = Rubric.from_markdown_file(
+            Path(__file__).parent / "fixtures" / "rubrics_valid_binary.md"
+        )
+        # judge_provider is None because this test never calls the provider.
+        j = _ConcreteJudge(judge_provider=None, rubric=rubric, model_id="claude-haiku-4-5")  # type: ignore[arg-type]
+        assert j.judge_id == "claude-haiku-4-5_groundedness"