Spaces:
Sleeping
Sleeping
feat(config): add evaluation.judge_dimensions field
Browse files
Default ['groundedness', 'relevance', 'completeness'] — the v1
dimensions that have rubrics + judges + calibration coverage.
citation_faithfulness is opt-in in v1 (default-on in v1.1) so that the
deterministic-vs-LLM citation head-to-head is decoupled from the
harness migration.
The judge_provider field is unchanged, which preserves the YAML knob across
configs/{default,production,anthropic,selfhosted_local,
selfhosted_modal}.yaml. No user-facing config migration is required.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
agent_bench/core/config.py
CHANGED
|
@@ -88,6 +88,9 @@ class MemoryConfig(BaseModel):
|
|
| 88 |
class EvaluationConfig(BaseModel):
|
| 89 |
judge_provider: str = "openai"
|
| 90 |
golden_dataset: str = "agent_bench/evaluation/datasets/tech_docs_golden.json"
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
|
| 93 |
_VALID_TIERS = {"heuristic", "classifier"}
|
|
|
|
| 88 |
class EvaluationConfig(BaseModel):
|
| 89 |
judge_provider: str = "openai"
|
| 90 |
golden_dataset: str = "agent_bench/evaluation/datasets/tech_docs_golden.json"
|
| 91 |
+
# New in judge-layer v1: which dimensions to score with L2 LLM judges.
|
| 92 |
+
# citation_faithfulness is opt-in v1 (default-on v1.1).
|
| 93 |
+
judge_dimensions: list[str] = ["groundedness", "relevance", "completeness"]
|
| 94 |
|
| 95 |
|
| 96 |
_VALID_TIERS = {"heuristic", "classifier"}
|
tests/evaluation/test_harness_migration.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for the harness migration to the new judge layer."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from agent_bench.core.config import EvaluationConfig
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class TestJudgeProviderConfigPreserved:
    """Guard the EvaluationConfig defaults across the judge-layer migration."""

    def test_judge_provider_field_still_exists_with_default(self):
        """A default-constructed config still exposes judge_provider == "openai"."""
        # Regression guard: the judge_provider knob must keep its name,
        # because 5 YAML configs reference it.
        config = EvaluationConfig()
        assert config.judge_provider == "openai"

    def test_judge_dimensions_default_is_three(self):
        """The default judge_dimensions list is exactly the three v1 dimensions."""
        dimensions = EvaluationConfig().judge_dimensions
        assert dimensions == ["groundedness", "relevance", "completeness"]
        # citation_faithfulness is opt-in in v1 and default-on in v1.1,
        # so it must be absent from the defaults here.
        assert "citation_faithfulness" not in dimensions
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class TestEvalResultJudgeScores:
    """Check the EvalResult field set after the judge-layer supersession."""

    def test_eval_result_no_longer_has_faithfulness_field(self):
        """The legacy score fields are gone and judge_scores is declared."""
        # Kept as a function-scope import, matching the original test.
        from agent_bench.evaluation.harness import EvalResult

        declared = EvalResult.model_fields

        # The two legacy per-metric fields must no longer be declared.
        assert "faithfulness" not in declared, (
            "faithfulness field should be removed in the supersession"
        )
        assert "correctness" not in declared, (
            "correctness field should be removed in the supersession"
        )

        # Their replacement must be present.
        assert "judge_scores" in declared, (
            "judge_scores: dict[str, ScoreResult] should be added"
        )
|