Nomearod Claude Opus 4.7 (1M context) committed on
Commit
12cb8b7
·
1 Parent(s): 1d47106

feat(config): add evaluation.judge_dimensions field

Browse files

Defaults to ['groundedness', 'relevance', 'completeness'] — the v1
dimensions that already have rubrics, judges, and calibration coverage.
citation_faithfulness is opt-in in v1 (default-on in v1.1) so that the
deterministic-vs-LLM citation head-to-head is decoupled from the
harness migration.

The judge_provider field is unchanged, which preserves the YAML knob across
configs/{default,production,anthropic,selfhosted_local,
selfhosted_modal}.yaml. No user-facing config migration is required.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

agent_bench/core/config.py CHANGED
@@ -88,6 +88,9 @@ class MemoryConfig(BaseModel):
88
  class EvaluationConfig(BaseModel):
89
  judge_provider: str = "openai"
90
  golden_dataset: str = "agent_bench/evaluation/datasets/tech_docs_golden.json"
 
 
 
91
 
92
 
93
  _VALID_TIERS = {"heuristic", "classifier"}
 
88
  class EvaluationConfig(BaseModel):
89
  judge_provider: str = "openai"
90
  golden_dataset: str = "agent_bench/evaluation/datasets/tech_docs_golden.json"
91
+ # New in judge-layer v1: which dimensions to score with L2 LLM judges.
92
+ # citation_faithfulness is opt-in v1 (default-on v1.1).
93
+ judge_dimensions: list[str] = ["groundedness", "relevance", "completeness"]
94
 
95
 
96
  _VALID_TIERS = {"heuristic", "classifier"}
tests/evaluation/test_harness_migration.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the harness migration to the new judge layer."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from agent_bench.core.config import EvaluationConfig
6
+
7
+
8
+ class TestJudgeProviderConfigPreserved:
9
+ def test_judge_provider_field_still_exists_with_default(self):
10
+ # Regression — the judge_provider knob must not be removed/renamed
11
+ # (5 YAML configs reference it).
12
+ c = EvaluationConfig()
13
+ assert c.judge_provider == "openai"
14
+
15
+ def test_judge_dimensions_default_is_three(self):
16
+ c = EvaluationConfig()
17
+ assert c.judge_dimensions == ["groundedness", "relevance", "completeness"]
18
+ # citation_faithfulness is opt-in v1, default-on v1.1
19
+ assert "citation_faithfulness" not in c.judge_dimensions
20
+
21
+
22
+ class TestEvalResultJudgeScores:
23
+ def test_eval_result_no_longer_has_faithfulness_field(self):
24
+ from agent_bench.evaluation.harness import EvalResult
25
+
26
+ fields = EvalResult.model_fields
27
+ assert "faithfulness" not in fields, (
28
+ "faithfulness field should be removed in the supersession"
29
+ )
30
+ assert "correctness" not in fields, (
31
+ "correctness field should be removed in the supersession"
32
+ )
33
+ assert "judge_scores" in fields, (
34
+ "judge_scores: dict[str, ScoreResult] should be added"
35
+ )