| """Tests for the HallucinationGrader scoring system.""" |
| import pytest |
| from server.grader import HallucinationGrader |
|
|
|
|
class TestGraderScoreRange:
    """Check that ``grade()`` results fall inside the expected numeric bands."""

    def test_grader_returns_score_in_range(self):
        """A simple matching answer must produce a score within [0.0, 1.0]."""
        scorer = HallucinationGrader()
        inputs = {
            "question": "What is 2+2?",
            "context": "2+2 equals 4.",
            "answer": "4",
            "ground_truth": "4",
        }

        score = scorer.grade(**inputs)

        assert 0.0 <= score <= 1.0

    def test_grader_with_exact_match(self):
        """An answer identical to the ground truth must score at least 0.7."""
        scorer = HallucinationGrader()
        inputs = {
            "question": "What is the capital of France?",
            "context": "The capital of France is Paris.",
            "answer": "Paris",
            "ground_truth": "Paris",
        }

        score = scorer.grade(**inputs)

        assert score >= 0.7

    def test_grader_with_wrong_answer(self):
        """An answer that contradicts the ground truth must score below 0.5."""
        scorer = HallucinationGrader()
        inputs = {
            "question": "What is the capital of France?",
            "context": "The capital of France is Paris.",
            "answer": "London",
            "ground_truth": "Paris",
        }

        score = scorer.grade(**inputs)

        assert score < 0.5

    def test_grader_with_partial_match(self):
        """An answer covering part of the ground truth must land in [0.3, 0.9]."""
        scorer = HallucinationGrader()
        inputs = {
            "question": "Who wrote Romeo and Juliet?",
            "context": "Romeo and Juliet was written by William Shakespeare.",
            "answer": "Shakespeare",
            "ground_truth": "William Shakespeare",
        }

        score = scorer.grade(**inputs)

        assert 0.3 <= score <= 0.9
|
|
|
|
class TestHallucinationDetection:
    """Check that unsupported or contradictory answers receive low scores."""

    def test_detects_fabricated_fact(self):
        """A figure absent from the context must score below 0.5."""
        scorer = HallucinationGrader()
        inputs = {
            "question": "What is the population of Tokyo?",
            "context": "Tokyo is a major city in Japan.",
            "answer": "Tokyo has 50 million people.",
            "ground_truth": "Not mentioned",
        }

        score = scorer.grade(**inputs)

        assert score < 0.5

    def test_detects_false_citation(self):
        """A source quote that does not appear in the context must score below 0.5."""
        scorer = HallucinationGrader()
        inputs = {
            "question": "What color is the sky?",
            "context": "The sky appears blue during clear days.",
            "answer": "The sky is green.",
            "ground_truth": "blue",
            # The quote swaps "blue" for "green", so it is not in the context.
            "source_quote": "The sky appears green",
        }

        score = scorer.grade(**inputs)

        assert score < 0.5

    def test_overconfident_wrong_answer(self):
        """A wrong answer at confidence 0.95 must score strictly below the same answer at 0.3."""
        scorer = HallucinationGrader()
        # Both calls share everything except the reported confidence.
        wrong_answer = {
            "question": "What is 5+5?",
            "context": "Basic arithmetic.",
            "answer": "20",
            "ground_truth": "10",
        }

        score_confident = scorer.grade(**wrong_answer, confidence=0.95)
        score_uncertain = scorer.grade(**wrong_answer, confidence=0.3)

        assert score_confident < score_uncertain
|
|
|
|
class TestSourceGrounding:
    """Check how scores relate to whether the context supports the answer."""

    def test_answer_grounded_in_context(self):
        """An answer restating the context must score at least 0.5."""
        scorer = HallucinationGrader()
        inputs = {
            "question": "What is Python?",
            "context": "Python is a programming language created by Guido van Rossum.",
            "answer": "Python is a programming language.",
            "ground_truth": "programming language",
        }

        score_grounded = scorer.grade(**inputs)

        assert score_grounded >= 0.5

    def test_answer_not_in_context(self):
        """An answer adding details the context lacks must score below 0.7."""
        scorer = HallucinationGrader()
        inputs = {
            "question": "Who created Python?",
            "context": "Python is a programming language.",
            "answer": "Guido van Rossum created Python in 1991.",
            "ground_truth": "Not mentioned",
        }

        score = scorer.grade(**inputs)

        assert score < 0.7
|
|
|
|
class TestConfidenceCalibration:
    """Check how the reported confidence affects the score of a correct answer."""

    def test_confident_correct_answer(self):
        """A correct answer given with confidence 0.95 must score at least 0.7."""
        scorer = HallucinationGrader()
        inputs = {
            "question": "What is 1+1?",
            "context": "Basic math.",
            "answer": "2",
            "ground_truth": "2",
            "confidence": 0.95,
        }

        score = scorer.grade(**inputs)

        assert score >= 0.7

    def test_uncertain_correct_answer(self):
        """A correct answer at confidence 0.95 must not score below the same answer at 0.3.

        The comparison is non-strict, so equal scores also pass.
        """
        scorer = HallucinationGrader()
        # Both calls share everything except the reported confidence.
        correct_answer = {
            "question": "What is 1+1?",
            "context": "Basic math.",
            "answer": "2",
            "ground_truth": "2",
        }

        score_high_conf = scorer.grade(**correct_answer, confidence=0.95)
        score_low_conf = scorer.grade(**correct_answer, confidence=0.3)

        assert score_high_conf >= score_low_conf
|
|
|
|
class TestGraderDeterminism:
    """Check repeatability of ``grade()`` and its handling of empty inputs."""

    def test_grader_is_deterministic(self):
        """Two calls with identical arguments on one grader must return equal scores."""
        scorer = HallucinationGrader()
        inputs = {
            "question": "What is the capital of France?",
            "context": "The capital of France is Paris.",
            "answer": "Paris",
            "ground_truth": "Paris",
        }

        first_score = scorer.grade(**inputs)
        second_score = scorer.grade(**inputs)

        assert first_score == second_score

    def test_grader_handles_empty_answer(self):
        """An empty answer string must still yield a score within [0.0, 1.0]."""
        scorer = HallucinationGrader()
        inputs = {
            "question": "What is the capital of France?",
            "context": "The capital of France is Paris.",
            "answer": "",
            "ground_truth": "Paris",
        }

        score = scorer.grade(**inputs)

        assert 0.0 <= score <= 1.0

    def test_grader_handles_empty_context(self):
        """An empty context string must still yield a score within [0.0, 1.0]."""
        scorer = HallucinationGrader()
        inputs = {
            "question": "What is the capital of France?",
            "context": "",
            "answer": "Paris",
            "ground_truth": "Paris",
        }

        score = scorer.grade(**inputs)

        assert 0.0 <= score <= 1.0