| """Tests for evaluation metrics, harness, and report generation.""" | |
| from __future__ import annotations | |
| import pytest | |
| from agent_bench.agents.orchestrator import AgentResponse, SourceReference | |
| from agent_bench.core.types import TokenUsage | |
| from agent_bench.evaluation.harness import EvalResult, load_golden_dataset | |
| from agent_bench.evaluation.metrics import ( | |
| calculator_used_when_expected, | |
| citation_accuracy, | |
| grounded_refusal, | |
| keyword_hit_rate, | |
| retrieval_precision_at_k, | |
| retrieval_recall_at_k, | |
| source_presence, | |
| ) | |
| from agent_bench.evaluation.report import generate_report | |

# --- Metrics tests ---


class TestRetrievalMetrics:
    def test_precision_at_k_perfect(self):
        assert retrieval_precision_at_k(["a.md", "b.md"], ["a.md", "b.md"]) == 1.0

    def test_precision_at_k_partial(self):
        assert retrieval_precision_at_k(["a.md", "b.md", "c.md"], ["a.md"]) == pytest.approx(1 / 3)

    def test_precision_at_k_empty_retrieved(self):
        assert retrieval_precision_at_k([], ["a.md"]) == 0.0

    def test_recall_at_k_perfect(self):
        assert retrieval_recall_at_k(["a.md", "b.md", "c.md"], ["a.md", "b.md"]) == 1.0

    def test_recall_at_k_partial(self):
        assert retrieval_recall_at_k(["a.md"], ["a.md", "b.md"]) == 0.5

    def test_recall_at_k_empty_expected(self):
        assert retrieval_recall_at_k(["a.md"], []) == 0.0

    def test_precision_uses_ranked_sources_with_duplicates(self):
        """Ranked sources may have duplicates; precision should count correctly."""
        retrieved = ["a.md", "a.md", "b.md", "c.md", "d.md"]
        expected = ["a.md"]
        # 2 out of 5 retrieved are "a.md"
        assert retrieval_precision_at_k(retrieved, expected, k=5) == pytest.approx(2 / 5)
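

# Illustrative only: a minimal sketch of the precision/recall semantics the
# tests above pin down. This is an assumption about behaviour, not the actual
# agent_bench implementation; the default k=5 is likewise assumed (the report
# labels the metric "Retrieval P@5").
def _precision_at_k_sketch(retrieved: list[str], expected: list[str], k: int = 5) -> float:
    top = retrieved[:k]  # duplicates in the ranked list are counted as-is
    if not top:
        return 0.0
    return sum(1 for source in top if source in expected) / len(top)


def _recall_at_k_sketch(retrieved: list[str], expected: list[str], k: int = 5) -> float:
    if not expected:  # nothing expected: treated as 0.0, matching the test above
        return 0.0
    return sum(1 for source in expected if source in retrieved[:k]) / len(expected)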


class TestKeywordMetrics:
    def test_keyword_hit_rate_all_match(self):
        assert keyword_hit_rate("curly braces in path", ["curly braces", "path"]) == 1.0

    def test_keyword_hit_rate_none_match(self):
        assert keyword_hit_rate("something else", ["curly", "braces"]) == 0.0

    def test_keyword_hit_rate_case_insensitive(self):
        assert keyword_hit_rate("CORSMiddleware", ["corsmiddleware"]) == 1.0


class TestSourcePresence:
    def test_has_sources(self):
        resp = AgentResponse(
            answer="test",
            sources=[SourceReference(source="a.md")],
            iterations=1,
            usage=TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0),
            latency_ms=1.0,
        )
        assert source_presence(resp) is True

    def test_no_sources(self):
        resp = AgentResponse(
            answer="test",
            sources=[],
            iterations=1,
            usage=TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0),
            latency_ms=1.0,
        )
        assert source_presence(resp) is False


class TestGroundedRefusal:
    def test_out_of_scope_with_refusal_no_citations(self):
        """Refusal phrase + no [source:] citations in answer text = passes."""
        assert (
            grounded_refusal("The documentation does not contain this info.", "out_of_scope")
            is True
        )

    def test_out_of_scope_without_refusal(self):
        assert grounded_refusal("Here is how you do it...", "out_of_scope") is False

    def test_out_of_scope_refusal_with_citation_in_answer_fails(self):
        """Refusal phrase + [source:] citation in answer text = NOT a grounded refusal.

        The metric inspects the answer text for citations rather than the
        retrieved-sources list: a correct flavor-A refusal retrieves
        candidates, inspects them, and explicitly declines to cite any of
        them, which is the behavior the metric is designed to measure.
        """
        answer = (
            "The documentation does not contain this info. "
            "[source: some_doc.md]"
        )
        assert grounded_refusal(answer, "out_of_scope") is False

    def test_out_of_scope_refusal_no_citation_passes_even_with_retrieval(self):
        """Flavor-A refusal: agent retrieved candidates but answer cites none."""
        answer = (
            "The retrieved context does not contain information about Jaeger "
            "sidecar injection. I cannot provide an answer."
        )
        # Under the old signature this test would have failed because the
        # retrieved-sources list was non-empty. The fix moves the check to
        # the answer text where the actual citations live.
        assert grounded_refusal(answer, "out_of_scope") is True

    def test_canonical_refusal_phrasing_recognized(self):
        """System-prompt-taught shape "not in the {label} documentation" passes.

        core/prompts.py:17-18 instructs the agent to say "the answer is not
        in the {corpus_label} documentation and stop" on out-of-scope queries.
        The metric must recognize that canonical form.
        """
        answer = "The answer is not in the Kubernetes documentation."
        assert grounded_refusal(answer, "out_of_scope") is True

    def test_not_in_the_is_not_substring_refusal(self):
        """Bare "not in the" fragment must NOT count as refusal.

        Pins the design choice to match the canonical shape via a narrow
        regex anchored on "documentation" rather than a loose substring.
        A future refactor that widens the matcher to substring "not in the"
        will break this test; that is the point.
        """
        answer = "The rate limit is not in the same scope as the request timeout."
        assert grounded_refusal(answer, "out_of_scope") is False

    def test_in_scope_always_true(self):
        assert grounded_refusal("any answer", "retrieval") is True


class TestCitationAccuracy:
    def test_all_citations_valid(self):
        answer = "Info from [source: a.md] and [source: b.md]."
        assert citation_accuracy(answer, ["a.md", "b.md"]) == 1.0

    def test_hallucinated_citation(self):
        answer = "Info from [source: fake.md]."
        assert citation_accuracy(answer, ["a.md"]) == 0.0

    def test_no_citations(self):
        assert citation_accuracy("No citations here.", ["a.md"]) == 1.0


class TestCalculatorMetric:
    def test_calculator_used_when_required(self):
        resp = AgentResponse(
            answer="9",
            tools_used=["search_documents", "calculator"],
            iterations=2,
            usage=TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0),
            latency_ms=1.0,
        )
        assert calculator_used_when_expected(resp, requires_calculator=True) is True

    def test_calculator_not_used_when_required(self):
        resp = AgentResponse(
            answer="9",
            tools_used=["search_documents"],
            iterations=1,
            usage=TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0),
            latency_ms=1.0,
        )
        assert calculator_used_when_expected(resp, requires_calculator=True) is False

    def test_not_required_always_true(self):
        resp = AgentResponse(
            answer="test",
            tools_used=[],
            iterations=1,
            usage=TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0),
            latency_ms=1.0,
        )
        assert calculator_used_when_expected(resp, requires_calculator=False) is True


# --- Golden dataset loading ---


class TestGoldenDataset:
    def test_load_golden_dataset(self):
        questions = load_golden_dataset("agent_bench/evaluation/datasets/tech_docs_golden.json")
        assert len(questions) == 27

        # Check distribution
        categories = [q.category for q in questions]
        assert categories.count("out_of_scope") == 5
        assert categories.count("calculation") == 3

        # All have required fields
        for q in questions:
            assert q.id
            assert q.question
            assert q.expected_answer_keywords
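

# For orientation only: one golden-dataset entry presumably looks roughly like
# the dict below. Field names follow the assertions above and the EvalResult
# fields used later; the values (and the "expected_sources" key) are invented
# for illustration, not taken from tech_docs_golden.json.
_EXAMPLE_GOLDEN_ENTRY_SKETCH = {
    "id": "q001",
    "question": "How do I enable CORS in FastAPI?",
    "category": "retrieval",
    "difficulty": "easy",
    "expected_answer_keywords": ["CORSMiddleware"],
    "expected_sources": ["fastapi_cors.md"],
}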


# --- Report generation ---


class TestReportGeneration:
    def _make_results(self) -> list[EvalResult]:
        usage = TokenUsage(input_tokens=100, output_tokens=50, estimated_cost_usd=0.001)
        return [
            EvalResult(
                question_id="q001",
                question="Test question?",
                category="retrieval",
                difficulty="easy",
                retrieval_precision=0.8,
                retrieval_recall=1.0,
                keyword_hit_rate=0.75,
                has_source_citation=True,
                grounded_refusal=True,
                citation_accuracy=1.0,
                calculator_used_correctly=True,
                tool_calls_made=2,
                latency_ms=100.0,
                tokens_used=usage,
                answer="Test answer",
                retrieved_sources=["a.md"],
            ),
            EvalResult(
                question_id="q002",
                question="Out of scope?",
                category="out_of_scope",
                difficulty="easy",
                retrieval_precision=0.0,
                retrieval_recall=0.0,
                keyword_hit_rate=0.5,
                has_source_citation=False,
                grounded_refusal=True,
                citation_accuracy=1.0,
                calculator_used_correctly=True,
                tool_calls_made=1,
                latency_ms=50.0,
                tokens_used=usage,
                answer="Does not contain",
                retrieved_sources=[],
            ),
        ]

    def test_report_contains_required_sections(self):
        report = generate_report(self._make_results(), provider_name="test")
        assert "## Aggregate Metrics" in report
        assert "## By Category" in report
        assert "## By Difficulty" in report
        assert "## Chunking Strategy Comparison" in report
        assert "## Failure Analysis" in report
        assert "## Per-Question Results" in report

    def test_report_contains_metrics(self):
        report = generate_report(self._make_results(), provider_name="test")
        assert "Retrieval P@5" in report
        assert "Grounded Refusal Rate" in report
        assert "Citation Accuracy" in report