File size: 10,175 Bytes
3d027cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4454894
 
3d027cb
4454894
3d027cb
 
 
 
4454894
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3d027cb
4454894
 
 
 
 
520796c
4454894
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
520796c
3d027cb
4454894
3d027cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
"""Tests for evaluation metrics, harness, and report generation."""

from __future__ import annotations

import pytest

from agent_bench.agents.orchestrator import AgentResponse, SourceReference
from agent_bench.core.types import TokenUsage
from agent_bench.evaluation.harness import EvalResult, load_golden_dataset
from agent_bench.evaluation.metrics import (
    calculator_used_when_expected,
    citation_accuracy,
    grounded_refusal,
    keyword_hit_rate,
    retrieval_precision_at_k,
    retrieval_recall_at_k,
    source_presence,
)
from agent_bench.evaluation.report import generate_report

# --- Metrics tests ---


class TestRetrievalMetrics:
    """Checks for ``retrieval_precision_at_k`` and ``retrieval_recall_at_k``."""

    def test_precision_at_k_perfect(self):
        """Retrieved and expected lists are identical, so precision is 1.0."""
        retrieved = ["a.md", "b.md"]
        expected = ["a.md", "b.md"]
        assert retrieval_precision_at_k(retrieved, expected) == 1.0

    def test_precision_at_k_partial(self):
        """One expected source among three retrieved gives precision 1/3."""
        retrieved = ["a.md", "b.md", "c.md"]
        score = retrieval_precision_at_k(retrieved, ["a.md"])
        assert score == pytest.approx(1 / 3)

    def test_precision_at_k_empty_retrieved(self):
        """Nothing retrieved yields precision 0.0."""
        score = retrieval_precision_at_k([], ["a.md"])
        assert score == 0.0

    def test_recall_at_k_perfect(self):
        """Both expected sources appear among the retrieved ones: recall 1.0."""
        retrieved = ["a.md", "b.md", "c.md"]
        expected = ["a.md", "b.md"]
        assert retrieval_recall_at_k(retrieved, expected) == 1.0

    def test_recall_at_k_partial(self):
        """Only one of two expected sources was retrieved: recall 0.5."""
        expected = ["a.md", "b.md"]
        score = retrieval_recall_at_k(["a.md"], expected)
        assert score == 0.5

    def test_recall_at_k_empty_expected(self):
        """An empty expected list yields recall 0.0."""
        score = retrieval_recall_at_k(["a.md"], [])
        assert score == 0.0

    def test_precision_uses_ranked_sources_with_duplicates(self):
        """Duplicate entries in the ranked list each count toward precision."""
        # "a.md" occupies two of the five ranked slots.
        ranked = ["a.md", "a.md", "b.md", "c.md", "d.md"]
        relevant = ["a.md"]
        score = retrieval_precision_at_k(ranked, relevant, k=5)
        assert score == pytest.approx(2 / 5)


class TestKeywordMetrics:
    """Checks for ``keyword_hit_rate``."""

    def test_keyword_hit_rate_all_match(self):
        """Both keywords occur in the answer text, so the rate is 1.0."""
        answer = "curly braces in path"
        keywords = ["curly braces", "path"]
        assert keyword_hit_rate(answer, keywords) == 1.0

    def test_keyword_hit_rate_none_match(self):
        """Neither keyword occurs in the answer text, so the rate is 0.0."""
        answer = "something else"
        keywords = ["curly", "braces"]
        assert keyword_hit_rate(answer, keywords) == 0.0

    def test_keyword_hit_rate_case_insensitive(self):
        """A keyword differing from the answer only in case still counts."""
        answer = "CORSMiddleware"
        keywords = ["corsmiddleware"]
        assert keyword_hit_rate(answer, keywords) == 1.0


class TestSourcePresence:
    """Checks for ``source_presence`` on responses with and without sources."""

    @staticmethod
    def _response(sources):
        """Build a minimal ``AgentResponse`` carrying the given ``sources``."""
        zero_usage = TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0)
        return AgentResponse(
            answer="test",
            sources=sources,
            iterations=1,
            usage=zero_usage,
            latency_ms=1.0,
        )

    def test_has_sources(self):
        """A response with one source reference reports sources as present."""
        with_source = self._response([SourceReference(source="a.md")])
        assert source_presence(with_source) is True

    def test_no_sources(self):
        """A response with an empty source list reports no sources."""
        without_source = self._response([])
        assert source_presence(without_source) is False


class TestGroundedRefusal:
    """Checks for ``grounded_refusal`` across refusal phrasings and categories."""

    def test_out_of_scope_with_refusal_no_citations(self):
        """A refusal phrase with no [source:] citation in the answer passes."""
        answer = "The documentation does not contain this info."
        assert grounded_refusal(answer, "out_of_scope") is True

    def test_out_of_scope_without_refusal(self):
        """An out-of-scope answer with no refusal phrase fails."""
        answer = "Here is how you do it..."
        assert grounded_refusal(answer, "out_of_scope") is False

    def test_out_of_scope_refusal_with_citation_in_answer_fails(self):
        """A refusal phrase followed by a [source:] citation is NOT grounded.

        Citations are looked for in the answer text, not in the
        retrieved-sources list: a correct flavor-A refusal retrieves
        candidates, inspects them, and explicitly declines to cite any of
        them, and that is the behavior this metric is designed to measure.
        """
        refusal = "The documentation does not contain this info. "
        citation = "[source: some_doc.md]"
        assert grounded_refusal(refusal + citation, "out_of_scope") is False

    def test_out_of_scope_refusal_no_citation_passes_even_with_retrieval(self):
        """Flavor-A refusal: candidates were retrieved but the answer cites none."""
        # With the previous signature this case would have failed, because
        # the retrieved-sources list was non-empty. Checking the answer text
        # instead looks where the actual citations live.
        answer = (
            "The retrieved context does not contain information about "
            "Jaeger sidecar injection. I cannot provide an answer."
        )
        assert grounded_refusal(answer, "out_of_scope") is True

    def test_canonical_refusal_phrasing_recognized(self):
        """The system-prompt shape "not in the {label} documentation" passes.

        core/prompts.py:17-18 tells the agent to say "the answer is not in
        the {corpus_label} documentation and stop" for out-of-scope queries,
        so the metric has to accept that canonical form.
        """
        answer = "The answer is not in the Kubernetes documentation."
        assert grounded_refusal(answer, "out_of_scope") is True

    def test_not_in_the_is_not_substring_refusal(self):
        """A bare "not in the" fragment must NOT be treated as a refusal.

        This pins the design choice of matching the canonical shape with a
        narrow regex anchored on "documentation" instead of a loose
        substring. Widening the matcher to the substring "not in the" in a
        later refactor will break this test, which is intentional.
        """
        answer = "The rate limit is not in the same scope as the request timeout."
        assert grounded_refusal(answer, "out_of_scope") is False

    def test_in_scope_always_true(self):
        """For a non-out-of-scope category the metric passes for any answer."""
        answer = "any answer"
        assert grounded_refusal(answer, "retrieval") is True


class TestCitationAccuracy:
    """Checks for ``citation_accuracy`` against a list of known sources."""

    def test_all_citations_valid(self):
        """Both cited sources are in the known list, so accuracy is 1.0."""
        known_sources = ["a.md", "b.md"]
        answer = "Info from [source: a.md] and [source: b.md]."
        assert citation_accuracy(answer, known_sources) == 1.0

    def test_hallucinated_citation(self):
        """A citation to a source outside the known list scores 0.0."""
        known_sources = ["a.md"]
        answer = "Info from [source: fake.md]."
        assert citation_accuracy(answer, known_sources) == 0.0

    def test_no_citations(self):
        """An answer without any citation scores 1.0."""
        answer = "No citations here."
        assert citation_accuracy(answer, ["a.md"]) == 1.0


class TestCalculatorMetric:
    """Checks for ``calculator_used_when_expected``."""

    @staticmethod
    def _response(answer, tools_used, iterations):
        """Build an ``AgentResponse`` with zeroed usage and the given tool trace."""
        zero_usage = TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0)
        return AgentResponse(
            answer=answer,
            tools_used=tools_used,
            iterations=iterations,
            usage=zero_usage,
            latency_ms=1.0,
        )

    def test_calculator_used_when_required(self):
        """Calculator is required and appears in the tools used: passes."""
        resp = self._response("9", ["search_documents", "calculator"], 2)
        outcome = calculator_used_when_expected(resp, requires_calculator=True)
        assert outcome is True

    def test_calculator_not_used_when_required(self):
        """Calculator is required but only search was used: fails."""
        resp = self._response("9", ["search_documents"], 1)
        outcome = calculator_used_when_expected(resp, requires_calculator=True)
        assert outcome is False

    def test_not_required_always_true(self):
        """Calculator is not required, so an empty tool list still passes."""
        resp = self._response("test", [], 1)
        outcome = calculator_used_when_expected(resp, requires_calculator=False)
        assert outcome is True


# --- Golden dataset loading ---


class TestGoldenDataset:
    """Sanity checks on the tech-docs golden dataset file."""

    def test_load_golden_dataset(self):
        """The dataset loads with the expected size, category mix, and fields."""
        # NOTE(review): the path is relative, so this presumably relies on the
        # tests being run from the repository root — confirm.
        dataset_path = "agent_bench/evaluation/datasets/tech_docs_golden.json"
        entries = load_golden_dataset(dataset_path)
        assert len(entries) == 27

        # Category distribution.
        seen_categories = [entry.category for entry in entries]
        assert seen_categories.count("out_of_scope") == 5
        assert seen_categories.count("calculation") == 3

        # Every entry must carry an id, question text, and expected keywords.
        for entry in entries:
            assert entry.id
            assert entry.question
            assert entry.expected_answer_keywords


# --- Report generation ---


class TestReportGeneration:
    """Checks that ``generate_report`` output has the expected headings and labels."""

    def _make_results(self) -> list[EvalResult]:
        """Return two fixture results: one retrieval and one out-of-scope question."""
        usage = TokenUsage(input_tokens=100, output_tokens=50, estimated_cost_usd=0.001)
        # Field values that are the same for both fixture results.
        shared = {
            "difficulty": "easy",
            "grounded_refusal": True,
            "citation_accuracy": 1.0,
            "calculator_used_correctly": True,
            "tokens_used": usage,
        }
        retrieval_result = EvalResult(
            question_id="q001",
            question="Test question?",
            category="retrieval",
            retrieval_precision=0.8,
            retrieval_recall=1.0,
            keyword_hit_rate=0.75,
            has_source_citation=True,
            tool_calls_made=2,
            latency_ms=100.0,
            answer="Test answer",
            retrieved_sources=["a.md"],
            **shared,
        )
        out_of_scope_result = EvalResult(
            question_id="q002",
            question="Out of scope?",
            category="out_of_scope",
            retrieval_precision=0.0,
            retrieval_recall=0.0,
            keyword_hit_rate=0.5,
            has_source_citation=False,
            tool_calls_made=1,
            latency_ms=50.0,
            answer="Does not contain",
            retrieved_sources=[],
            **shared,
        )
        return [retrieval_result, out_of_scope_result]

    def test_report_contains_required_sections(self):
        """Every required section heading appears in the generated report."""
        report = generate_report(self._make_results(), provider_name="test")
        required_headings = (
            "## Aggregate Metrics",
            "## By Category",
            "## By Difficulty",
            "## Chunking Strategy Comparison",
            "## Failure Analysis",
            "## Per-Question Results",
        )
        for heading in required_headings:
            assert heading in report

    def test_report_contains_metrics(self):
        """The main metric labels appear in the generated report."""
        report = generate_report(self._make_results(), provider_name="test")
        metric_labels = (
            "Retrieval P@5",
            "Grounded Refusal Rate",
            "Citation Accuracy",
        )
        for label in metric_labels:
            assert label in report