Spaces:

Nomearod
/

agentbench

Sleeping

Nomearod Claude Opus 4.7 (1M context) committed on May 4

Commit

12cb8b7

1 Parent(s): 1d47106

feat(config): add evaluation.judge_dimensions field

Defaults to ['groundedness', 'relevance', 'completeness'] — the v1
dimensions that already have rubrics, judges, and calibration coverage.
citation_faithfulness is opt-in in v1 (default-on in v1.1) so that the
deterministic-vs-LLM citation head-to-head comparison is decoupled from
the harness migration.

The judge_provider field is unchanged, which preserves the YAML knob across
configs/{default,production,anthropic,selfhosted_local,
selfhosted_modal}.yaml. No user-facing config migration is required.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (2) hide show

agent_bench/core/config.py +3 -0
tests/evaluation/test_harness_migration.py +35 -0

agent_bench/core/config.py CHANGED Viewed

@@ -88,6 +88,9 @@ class MemoryConfig(BaseModel):
 class EvaluationConfig(BaseModel):
     judge_provider: str = "openai"
     golden_dataset: str = "agent_bench/evaluation/datasets/tech_docs_golden.json"
 _VALID_TIERS = {"heuristic", "classifier"}

 class EvaluationConfig(BaseModel):
     judge_provider: str = "openai"
     golden_dataset: str = "agent_bench/evaluation/datasets/tech_docs_golden.json"
+    # New in judge-layer v1: which dimensions to score with L2 LLM judges.
+    # citation_faithfulness is opt-in v1 (default-on v1.1).
+    judge_dimensions: list[str] = ["groundedness", "relevance", "completeness"]
 _VALID_TIERS = {"heuristic", "classifier"}

tests/evaluation/test_harness_migration.py ADDED Viewed

	@@ -0,0 +1,35 @@

+"""Tests for the harness migration to the new judge layer."""
+from __future__ import annotations
+from agent_bench.core.config import EvaluationConfig
+class TestJudgeProviderConfigPreserved:
+    def test_judge_provider_field_still_exists_with_default(self):
+        # Regression — the judge_provider knob must not be removed/renamed
+        # (5 YAML configs reference it).
+        c = EvaluationConfig()
+        assert c.judge_provider == "openai"
+    def test_judge_dimensions_default_is_three(self):
+        c = EvaluationConfig()
+        assert c.judge_dimensions == ["groundedness", "relevance", "completeness"]
+        # citation_faithfulness is opt-in v1, default-on v1.1
+        assert "citation_faithfulness" not in c.judge_dimensions
+class TestEvalResultJudgeScores:
+    def test_eval_result_no_longer_has_faithfulness_field(self):
+        from agent_bench.evaluation.harness import EvalResult
+        fields = EvalResult.model_fields
+        assert "faithfulness" not in fields, (
+            "faithfulness field should be removed in the supersession"
+        )
+        assert "correctness" not in fields, (
+            "correctness field should be removed in the supersession"
+        )
+        assert "judge_scores" in fields, (
+            "judge_scores: dict[str, ScoreResult] should be added"
+        )