# Spaces: Nomearod / agentbench (Running) | File size: 10,175 Bytes

"""Tests for evaluation metrics, harness, and report generation."""

from __future__ import annotations

import pytest

from agent_bench.agents.orchestrator import AgentResponse, SourceReference
from agent_bench.core.types import TokenUsage
from agent_bench.evaluation.harness import EvalResult, load_golden_dataset
from agent_bench.evaluation.metrics import (
    calculator_used_when_expected,
    citation_accuracy,
    grounded_refusal,
    keyword_hit_rate,
    retrieval_precision_at_k,
    retrieval_recall_at_k,
    source_presence,
)
from agent_bench.evaluation.report import generate_report

# --- Metrics tests ---


class TestRetrievalMetrics:
    """Precision@k and recall@k checks on small hand-built source lists."""

    def test_precision_at_k_perfect(self):
        """Every retrieved source is expected, so precision is 1.0."""
        retrieved = ["a.md", "b.md"]
        expected = ["a.md", "b.md"]
        assert retrieval_precision_at_k(retrieved, expected) == 1.0

    def test_precision_at_k_partial(self):
        """One relevant source among three retrieved gives 1/3."""
        retrieved = ["a.md", "b.md", "c.md"]
        score = retrieval_precision_at_k(retrieved, ["a.md"])
        assert score == pytest.approx(1 / 3)

    def test_precision_at_k_empty_retrieved(self):
        """Nothing retrieved yields a precision of 0.0."""
        score = retrieval_precision_at_k([], ["a.md"])
        assert score == 0.0

    def test_recall_at_k_perfect(self):
        """All expected sources appear among the retrieved ones."""
        retrieved = ["a.md", "b.md", "c.md"]
        expected = ["a.md", "b.md"]
        assert retrieval_recall_at_k(retrieved, expected) == 1.0

    def test_recall_at_k_partial(self):
        """Only one of two expected sources was retrieved."""
        score = retrieval_recall_at_k(["a.md"], ["a.md", "b.md"])
        assert score == 0.5

    def test_recall_at_k_empty_expected(self):
        """An empty expected list yields a recall of 0.0."""
        score = retrieval_recall_at_k(["a.md"], [])
        assert score == 0.0

    def test_precision_uses_ranked_sources_with_duplicates(self):
        """Duplicate entries in the ranked list are each counted by precision."""
        ranked = ["a.md", "a.md", "b.md", "c.md", "d.md"]
        relevant = ["a.md"]
        # "a.md" occupies two of the five ranked slots.
        score = retrieval_precision_at_k(ranked, relevant, k=5)
        assert score == pytest.approx(2 / 5)


class TestKeywordMetrics:
    """Checks for keyword_hit_rate on short answer strings."""

    def test_keyword_hit_rate_all_match(self):
        """Both keywords occur in the answer, so the rate is 1.0."""
        keywords = ["curly braces", "path"]
        rate = keyword_hit_rate("curly braces in path", keywords)
        assert rate == 1.0

    def test_keyword_hit_rate_none_match(self):
        """Neither keyword occurs in the answer, so the rate is 0.0."""
        keywords = ["curly", "braces"]
        rate = keyword_hit_rate("something else", keywords)
        assert rate == 0.0

    def test_keyword_hit_rate_case_insensitive(self):
        """A lowercase keyword matches a mixed-case answer."""
        rate = keyword_hit_rate("CORSMiddleware", ["corsmiddleware"])
        assert rate == 1.0


class TestSourcePresence:
    """source_presence should reflect whether the response lists any sources."""

    @staticmethod
    def _response_with(sources: list[SourceReference]) -> AgentResponse:
        """Build a minimal AgentResponse that differs only in its sources."""
        zero_usage = TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0)
        return AgentResponse(
            answer="test",
            sources=sources,
            iterations=1,
            usage=zero_usage,
            latency_ms=1.0,
        )

    def test_has_sources(self):
        """A response carrying one source reference reports True."""
        response = self._response_with([SourceReference(source="a.md")])
        assert source_presence(response) is True

    def test_no_sources(self):
        """A response with an empty sources list reports False."""
        response = self._response_with([])
        assert source_presence(response) is False


class TestGroundedRefusal:
    """grounded_refusal checks, driven by answer text and question category."""

    def test_out_of_scope_with_refusal_no_citations(self):
        """A refusal phrase with no [source:] citation in the answer passes."""
        answer = "The documentation does not contain this info."
        verdict = grounded_refusal(answer, "out_of_scope")
        assert verdict is True

    def test_out_of_scope_without_refusal(self):
        """An out-of-scope answer lacking any refusal phrase fails."""
        verdict = grounded_refusal("Here is how you do it...", "out_of_scope")
        assert verdict is False

    def test_out_of_scope_refusal_with_citation_in_answer_fails(self):
        """A refusal phrase followed by a [source:] citation is NOT grounded.

        Citations are looked for in the answer text, not in the list of
        retrieved sources. A correct flavor-A refusal retrieves candidates,
        inspects them, and explicitly declines to cite any of them; that is
        the behavior this metric is designed to measure.
        """
        refusal = "The documentation does not contain this info. "
        citation = "[source: some_doc.md]"
        verdict = grounded_refusal(refusal + citation, "out_of_scope")
        assert verdict is False

    def test_out_of_scope_refusal_no_citation_passes_even_with_retrieval(self):
        """Flavor-A refusal: candidates were retrieved but the answer cites none."""
        answer = (
            "The retrieved context does not contain information about Jaeger "
            "sidecar injection. I cannot provide an answer."
        )
        # With the old signature this case failed because the list of
        # retrieved sources was non-empty. The check now looks at the answer
        # text, which is where the actual citations live.
        verdict = grounded_refusal(answer, "out_of_scope")
        assert verdict is True

    def test_canonical_refusal_phrasing_recognized(self):
        """The prompt-taught shape "not in the {label} documentation" passes.

        core/prompts.py:17-18 tells the agent to say "the answer is not in
        the {corpus_label} documentation and stop" for out-of-scope queries,
        so the metric has to recognize that canonical form.
        """
        canonical = "The answer is not in the Kubernetes documentation."
        verdict = grounded_refusal(canonical, "out_of_scope")
        assert verdict is True

    def test_not_in_the_is_not_substring_refusal(self):
        """A bare "not in the" fragment must NOT be treated as a refusal.

        This pins the design choice of matching the canonical shape with a
        narrow regex anchored on "documentation" instead of a loose
        substring. If a later refactor widens the matcher to the substring
        "not in the", this test breaks, which is intentional.
        """
        lookalike = "The rate limit is not in the same scope as the request timeout."
        verdict = grounded_refusal(lookalike, "out_of_scope")
        assert verdict is False

    def test_in_scope_always_true(self):
        """A "retrieval" category answer passes without refusal phrasing."""
        verdict = grounded_refusal("any answer", "retrieval")
        assert verdict is True


class TestCitationAccuracy:
    """citation_accuracy checks against a list of known sources."""

    def test_all_citations_valid(self):
        """Both cited files are in the known-source list, so the score is 1.0."""
        known_sources = ["a.md", "b.md"]
        text = "Info from [source: a.md] and [source: b.md]."
        assert citation_accuracy(text, known_sources) == 1.0

    def test_hallucinated_citation(self):
        """A citation to a file outside the known sources scores 0.0."""
        text = "Info from [source: fake.md]."
        score = citation_accuracy(text, ["a.md"])
        assert score == 0.0

    def test_no_citations(self):
        """An answer without any citation scores 1.0."""
        score = citation_accuracy("No citations here.", ["a.md"])
        assert score == 1.0


class TestCalculatorMetric:
    """calculator_used_when_expected checks over the response's tools_used."""

    @staticmethod
    def _response(*, answer: str, tools_used: list[str], iterations: int) -> AgentResponse:
        """Build an AgentResponse with zeroed usage and a fixed 1.0 ms latency."""
        zero_usage = TokenUsage(input_tokens=0, output_tokens=0, estimated_cost_usd=0)
        return AgentResponse(
            answer=answer,
            tools_used=tools_used,
            iterations=iterations,
            usage=zero_usage,
            latency_ms=1.0,
        )

    def test_calculator_used_when_required(self):
        """Calculator required and present in tools_used gives True."""
        response = self._response(
            answer="9",
            tools_used=["search_documents", "calculator"],
            iterations=2,
        )
        outcome = calculator_used_when_expected(response, requires_calculator=True)
        assert outcome is True

    def test_calculator_not_used_when_required(self):
        """Calculator required but absent from tools_used gives False."""
        response = self._response(
            answer="9",
            tools_used=["search_documents"],
            iterations=1,
        )
        outcome = calculator_used_when_expected(response, requires_calculator=True)
        assert outcome is False

    def test_not_required_always_true(self):
        """Calculator not required gives True even with no tools used."""
        response = self._response(answer="test", tools_used=[], iterations=1)
        outcome = calculator_used_when_expected(response, requires_calculator=False)
        assert outcome is True


# --- Golden dataset loading ---


class TestGoldenDataset:
    """Sanity checks on the bundled tech-docs golden dataset."""

    def test_load_golden_dataset(self):
        """Load the golden dataset and pin its size, category mix, and fields.

        The repo-relative path is tried first, so behavior is unchanged when
        pytest runs from the repository root. If that file is not found from
        the current working directory, the dataset is looked up inside the
        imported ``agent_bench.evaluation`` package instead, so the test no
        longer depends on where pytest was started.
        """
        # Local imports keep this block self-contained.
        from pathlib import Path

        import agent_bench.evaluation as evaluation_pkg

        dataset_path = Path("agent_bench/evaluation/datasets/tech_docs_golden.json")
        if not dataset_path.is_file():
            # Assumes the datasets directory sits inside the evaluation
            # package, as the repo-relative path above implies.
            package_dir = Path(next(iter(evaluation_pkg.__path__)))
            dataset_path = package_dir / "datasets" / dataset_path.name

        # The loader was originally called with a plain string path; keep that.
        questions = load_golden_dataset(str(dataset_path))
        assert len(questions) == 27
        # Check distribution
        categories = [q.category for q in questions]
        assert categories.count("out_of_scope") == 5
        assert categories.count("calculation") == 3
        # All have required fields
        for q in questions:
            assert q.id
            assert q.question
            assert q.expected_answer_keywords


# --- Report generation ---


class TestReportGeneration:
    """generate_report output checks over a two-result fixture."""

    def _make_results(self) -> list[EvalResult]:
        """Return two EvalResult fixtures: one retrieval, one out-of-scope."""
        # Fields that are the same for both fixture rows.
        shared = {
            "difficulty": "easy",
            "grounded_refusal": True,
            "citation_accuracy": 1.0,
            "calculator_used_correctly": True,
            "tokens_used": TokenUsage(
                input_tokens=100, output_tokens=50, estimated_cost_usd=0.001
            ),
        }
        retrieval_result = EvalResult(
            question_id="q001",
            question="Test question?",
            category="retrieval",
            retrieval_precision=0.8,
            retrieval_recall=1.0,
            keyword_hit_rate=0.75,
            has_source_citation=True,
            tool_calls_made=2,
            latency_ms=100.0,
            answer="Test answer",
            retrieved_sources=["a.md"],
            **shared,
        )
        out_of_scope_result = EvalResult(
            question_id="q002",
            question="Out of scope?",
            category="out_of_scope",
            retrieval_precision=0.0,
            retrieval_recall=0.0,
            keyword_hit_rate=0.5,
            has_source_citation=False,
            tool_calls_made=1,
            latency_ms=50.0,
            answer="Does not contain",
            retrieved_sources=[],
            **shared,
        )
        return [retrieval_result, out_of_scope_result]

    def test_report_contains_required_sections(self):
        """Each expected section heading appears in the generated report."""
        report = generate_report(self._make_results(), provider_name="test")
        required_headings = (
            "## Aggregate Metrics",
            "## By Category",
            "## By Difficulty",
            "## Chunking Strategy Comparison",
            "## Failure Analysis",
            "## Per-Question Results",
        )
        for heading in required_headings:
            assert heading in report

    def test_report_contains_metrics(self):
        """The headline metric labels appear in the generated report."""
        report = generate_report(self._make_results(), provider_name="test")
        metric_labels = ("Retrieval P@5", "Grounded Refusal Rate", "Citation Accuracy")
        for label in metric_labels:
            assert label in report