Spaces:

Nomearod
/

agentbench

Running

Nomearod Claude Opus 4.7 (1M context) committed 27 days ago

Commit

80be2d8

1 Parent(s): b170eb6

feat(judges): CompletenessJudge + three-point reference-based rubric

Three-point rubric (none / partial / full) scored against the gold
reference_answer. Coverage-of-facts framing: score only on what
fraction of the reference's points are present, not on additional
correct facts. Two anchored examples per level.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (3) hide show

agent_bench/evaluation/judges/completeness.py +48 -0
agent_bench/evaluation/rubrics/completeness.md +71 -0
tests/evaluation/test_judges.py +42 -0

agent_bench/evaluation/judges/completeness.py ADDED Viewed

	@@ -0,0 +1,48 @@

+"""CompletenessJudge — three-point, reference-based on item.reference_answer."""
+from __future__ import annotations
+from typing import TYPE_CHECKING
+from agent_bench.evaluation.judges.base import (
+    Judge,
+    ScoreResult,
+    _call_judge_with_retry,
+)
+from agent_bench.evaluation.judges.groundedness import _system_output_hash
+if TYPE_CHECKING:
+    from agent_bench.agents.orchestrator import AgentResponse
+    from agent_bench.evaluation.harness import GoldenQuestion
+class CompletenessJudge(Judge):
+    async def score(
+        self,
+        item: "GoldenQuestion",
+        output: "AgentResponse",
+        *,
+        prompt_seed: int = 0,
+    ) -> ScoreResult:
+        prompt = (
+            f"{self.rubric.render_prompt(level_permutation_seed=prompt_seed)}\n\n"
+            f"---\n\n"
+            f"## Reference answer (gold)\n{item.reference_answer}\n\n"
+            f"## Answer to score\n{output.answer}\n\n"
+            f"Score this answer against the rubric above. Respond with ONLY a "
+            f'JSON object: {{"reasoning": "...", "evidence_quotes": [...], '
+            f'"score": 0 or 1 or 2 or "Unknown"}}.'
+        )
+        return await _call_judge_with_retry(
+            provider=self.judge_provider,
+            prompt=prompt,
+            valid_scores={0, 1, 2},
+            judge_id=self.judge_id,
+            rubric_version=self.rubric.source_hash,
+            prompt_seed=prompt_seed,
+            system_output_hash=_system_output_hash(
+                item.id, output.answer, [s.source for s in output.sources]
+            ),
+            item_id=item.id,
+            abstain_allowed=self.rubric.abstain_allowed,
+        )

agent_bench/evaluation/rubrics/completeness.md ADDED Viewed

	@@ -0,0 +1,71 @@

+---
+dimension: completeness
+scale: three_point
+reference_based: true
+abstain_allowed: true
+---
+# Completeness (three-point)
+Score how much of the gold reference answer is covered by the agent's
+answer. This is reference-based — the judge sees the gold reference
+and the agent's answer; score on **coverage of facts** in the
+reference, not on additional facts the agent may have included.
+The judge does not penalize the agent for adding correct extra detail
+(that's a separate concern). Score only on what fraction of the
+reference's points are present.
+## Score 0
+None of the reference's key points are present in the answer.
+### Example A — answer addresses different facts
+Reference: "StatefulSet pods receive ordinal indices, stable hostnames, and persistent storage."
+Answer: "Kubernetes uses YAML manifests to declare resources."
+Score=0 — none of the three reference points (ordinal, hostname, storage) appear.
+### Example B — refusal that covers nothing
+Reference: "The default port is 8080."
+Answer: "I cannot find that information."
+Score=0 — the reference's single point (port=8080) is not in the answer.
+## Score 1
+Some but not all of the reference's points are present.
+### Example C — partial coverage
+Reference: "StatefulSet pods receive ordinal indices, stable hostnames, and persistent storage."
+Answer: "StatefulSet pods get ordinal indices."
+Score=1 — one of three points covered.
+### Example D — half a comparison
+Reference: "Deployments manage stateless replicas; StatefulSets manage stateful pods with stable identities."
+Answer: "Deployments manage stateless replicas with rolling updates."
+Score=1 — Deployment side covered, StatefulSet side missing.
+## Score 2
+All of the reference's key points are present (paraphrase allowed).
+### Example E — full coverage with paraphrase
+Reference: "StatefulSet pods receive ordinal indices, stable hostnames, and persistent storage."
+Answer: "Each pod gets an ordinal number, a stable DNS name, and storage that survives restarts."
+Score=2 — all three points covered with paraphrase.
+### Example F — full coverage of single-fact reference
+Reference: "The default port is 8080."
+Answer: "Port 8080."
+Score=2 — the only reference point is covered.

tests/evaluation/test_judges.py CHANGED Viewed

@@ -449,3 +449,45 @@ class TestRelevanceJudge:
         assert result.score == 2
         sent_prompt = provider.complete.await_args.args[0][0].content
         assert "What's the default kubelet port?" in sent_prompt

         assert result.score == 2
         sent_prompt = provider.complete.await_args.args[0][0].content
         assert "What's the default kubelet port?" in sent_prompt
+class TestCompletenessJudge:
+    @pytest.mark.asyncio
+    async def test_reference_answer_in_prompt(self):
+        from agent_bench.agents.orchestrator import AgentResponse
+        from agent_bench.core.types import TokenUsage
+        from agent_bench.evaluation.harness import GoldenQuestion
+        from agent_bench.evaluation.judges.base import Rubric
+        from agent_bench.evaluation.judges.completeness import CompletenessJudge
+        rubric = Rubric.from_markdown_file(
+            "agent_bench/evaluation/rubrics/completeness.md"
+        )
+        provider = AsyncMock(spec=LLMProvider)
+        provider.complete.return_value = _mk_response(_valid_json(2))
+        judge = CompletenessJudge(judge_provider=provider, rubric=rubric, model_id="m")
+        item = GoldenQuestion(
+            id="k8s_003",
+            question="?",
+            expected_answer_keywords=[],
+            expected_sources=[],
+            category="retrieval",
+            difficulty="easy",
+            requires_calculator=False,
+            reference_answer="The default port is 8080.",
+        )
+        output = AgentResponse(
+            answer="Port 8080.",
+            sources=[],
+            iterations=1,
+            usage=TokenUsage(
+                input_tokens=0, output_tokens=0, estimated_cost_usd=0
+            ),
+            latency_ms=0,
+        )
+        result = await judge.score(item, output)
+        assert result.score == 2
+        assert result.judge_id == "m_completeness"
+        sent_prompt = provider.complete.await_args.args[0][0].content
+        assert "The default port is 8080." in sent_prompt