File size: 7,002 Bytes

c7b7c5e

"""Tests for the HallucinationGrader scoring system."""
import pytest
from server.grader import HallucinationGrader


class TestGraderScoreRange:
    """Tests that grader returns valid score ranges."""

    def test_grader_returns_score_in_range(self):
        """Grader should return score between 0.0 and 1.0."""
        grader = HallucinationGrader()
        result = grader.grade(
            question="What is 2+2?",
            context="2+2 equals 4.",
            answer="4",
            ground_truth="4"
        )

        assert 0.0 <= result <= 1.0

    def test_grader_with_exact_match(self):
        """Exact match should score high."""
        grader = HallucinationGrader()
        result = grader.grade(
            question="What is the capital of France?",
            context="The capital of France is Paris.",
            answer="Paris",
            ground_truth="Paris"
        )

        assert result >= 0.7

    def test_grader_with_wrong_answer(self):
        """Wrong answer should score low."""
        grader = HallucinationGrader()
        result = grader.grade(
            question="What is the capital of France?",
            context="The capital of France is Paris.",
            answer="London",
            ground_truth="Paris"
        )

        assert result < 0.5

    def test_grader_with_partial_match(self):
        """Partial match should score moderately."""
        grader = HallucinationGrader()
        result = grader.grade(
            question="Who wrote Romeo and Juliet?",
            context="Romeo and Juliet was written by William Shakespeare.",
            answer="Shakespeare",
            ground_truth="William Shakespeare"
        )

        assert 0.3 <= result <= 0.9


class TestHallucinationDetection:
    """Tests for hallucination detection."""

    def test_detects_fabricated_fact(self):
        """Grader should detect fabricated facts."""
        grader = HallucinationGrader()
        result = grader.grade(
            question="What is the population of Tokyo?",
            context="Tokyo is a major city in Japan.",
            answer="Tokyo has 50 million people.",  # Not in context
            ground_truth="Not mentioned"
        )

        assert result < 0.5

    def test_detects_false_citation(self):
        """Grader should detect false citations."""
        grader = HallucinationGrader()
        result = grader.grade(
            question="What color is the sky?",
            context="The sky appears blue during clear days.",
            answer="The sky is green.",
            ground_truth="blue",
            source_quote="The sky appears green"  # Not in context
        )

        assert result < 0.5

    def test_overconfident_wrong_answer(self):
        """High confidence on wrong answer should be penalized."""
        grader = HallucinationGrader()
        result_confident = grader.grade(
            question="What is 5+5?",
            context="Basic arithmetic.",
            answer="20",  # Wrong
            ground_truth="10",
            confidence=0.95  # High confidence
        )
        result_uncertain = grader.grade(
            question="What is 5+5?",
            context="Basic arithmetic.",
            answer="20",  # Wrong
            ground_truth="10",
            confidence=0.3  # Low confidence
        )

        # Confident wrong answer should score lower
        assert result_confident < result_uncertain


class TestSourceGrounding:
    """Tests for source grounding verification."""

    def test_answer_grounded_in_context(self):
        """Answer supported by context should score higher."""
        grader = HallucinationGrader()
        result_grounded = grader.grade(
            question="What is Python?",
            context="Python is a programming language created by Guido van Rossum.",
            answer="Python is a programming language.",
            ground_truth="programming language"
        )

        assert result_grounded >= 0.5

    def test_answer_not_in_context(self):
        """Answer not supported by context should be penalized."""
        grader = HallucinationGrader()
        result = grader.grade(
            question="Who created Python?",
            context="Python is a programming language.",
            answer="Guido van Rossum created Python in 1991.",  # Details not in context
            ground_truth="Not mentioned"
        )

        assert result < 0.7


class TestConfidenceCalibration:
    """Tests for confidence calibration."""

    def test_confident_correct_answer(self):
        """High confidence on correct answer should be rewarded."""
        grader = HallucinationGrader()
        result = grader.grade(
            question="What is 1+1?",
            context="Basic math.",
            answer="2",
            ground_truth="2",
            confidence=0.95
        )

        assert result >= 0.7

    def test_uncertain_correct_answer(self):
        """Low confidence on correct answer should be slightly penalized."""
        grader = HallucinationGrader()
        result_high_conf = grader.grade(
            question="What is 1+1?",
            context="Basic math.",
            answer="2",
            ground_truth="2",
            confidence=0.95
        )
        result_low_conf = grader.grade(
            question="What is 1+1?",
            context="Basic math.",
            answer="2",
            ground_truth="2",
            confidence=0.3
        )

        # High confidence on correct answer should score higher
        assert result_high_conf >= result_low_conf


class TestGraderDeterminism:
    """Tests for grader determinism."""

    def test_grader_is_deterministic(self):
        """Same inputs should produce same output."""
        grader = HallucinationGrader()

        result1 = grader.grade(
            question="What is the capital of France?",
            context="The capital of France is Paris.",
            answer="Paris",
            ground_truth="Paris"
        )
        result2 = grader.grade(
            question="What is the capital of France?",
            context="The capital of France is Paris.",
            answer="Paris",
            ground_truth="Paris"
        )

        assert result1 == result2

    def test_grader_handles_empty_answer(self):
        """Grader should handle empty answer gracefully."""
        grader = HallucinationGrader()
        result = grader.grade(
            question="What is the capital of France?",
            context="The capital of France is Paris.",
            answer="",
            ground_truth="Paris"
        )

        assert 0.0 <= result <= 1.0

    def test_grader_handles_empty_context(self):
        """Grader should handle empty context gracefully."""
        grader = HallucinationGrader()
        result = grader.grade(
            question="What is the capital of France?",
            context="",
            answer="Paris",
            ground_truth="Paris"
        )

        assert 0.0 <= result <= 1.0