File size: 7,002 Bytes
"""Tests for the HallucinationGrader scoring system."""
import pytest
from server.grader import HallucinationGrader
class TestGraderScoreRange:
    """Score-band checks for ``HallucinationGrader.grade`` on simple Q&A inputs."""

    def test_grader_returns_score_in_range(self):
        """The score for a matching arithmetic answer must lie in [0.0, 1.0]."""
        score = HallucinationGrader().grade(
            question="What is 2+2?",
            context="2+2 equals 4.",
            answer="4",
            ground_truth="4",
        )
        assert 0.0 <= score <= 1.0

    def test_grader_with_exact_match(self):
        """An answer identical to the ground truth must score at least 0.7."""
        score = HallucinationGrader().grade(
            question="What is the capital of France?",
            context="The capital of France is Paris.",
            answer="Paris",
            ground_truth="Paris",
        )
        assert score >= 0.7

    def test_grader_with_wrong_answer(self):
        """An answer that differs from the ground truth must score below 0.5."""
        score = HallucinationGrader().grade(
            question="What is the capital of France?",
            context="The capital of France is Paris.",
            answer="London",
            ground_truth="Paris",
        )
        assert score < 0.5

    def test_grader_with_partial_match(self):
        """A surname-only answer must land in the 0.3 to 0.9 band (inclusive)."""
        score = HallucinationGrader().grade(
            question="Who wrote Romeo and Juliet?",
            context="Romeo and Juliet was written by William Shakespeare.",
            answer="Shakespeare",
            ground_truth="William Shakespeare",
        )
        assert 0.3 <= score <= 0.9
class TestHallucinationDetection:
    """Checks that unsupported or contradicted answers receive low scores."""

    def test_detects_fabricated_fact(self):
        """An answer stating a figure absent from the context scores below 0.5."""
        score = HallucinationGrader().grade(
            question="What is the population of Tokyo?",
            context="Tokyo is a major city in Japan.",
            # The context gives no population figure at all.
            answer="Tokyo has 50 million people.",
            ground_truth="Not mentioned",
        )
        assert score < 0.5

    def test_detects_false_citation(self):
        """A quoted source that does not appear in the context scores below 0.5."""
        score = HallucinationGrader().grade(
            question="What color is the sky?",
            context="The sky appears blue during clear days.",
            answer="The sky is green.",
            ground_truth="blue",
            # The quote says "green" while the context says "blue".
            source_quote="The sky appears green",
        )
        assert score < 0.5

    def test_overconfident_wrong_answer(self):
        """The same wrong answer must score lower at 0.95 confidence than at 0.3."""
        grader = HallucinationGrader()
        # Identical wrong answer ("20" vs. ground truth "10") for both calls;
        # only the stated confidence varies.
        wrong_case = {
            "question": "What is 5+5?",
            "context": "Basic arithmetic.",
            "answer": "20",
            "ground_truth": "10",
        }
        score_when_sure = grader.grade(**wrong_case, confidence=0.95)
        score_when_unsure = grader.grade(**wrong_case, confidence=0.3)
        assert score_when_sure < score_when_unsure
class TestSourceGrounding:
    """Checks how the score relates to whether the context backs the answer."""

    def test_answer_grounded_in_context(self):
        """An answer restating the context must score at least 0.5."""
        grounded_score = HallucinationGrader().grade(
            question="What is Python?",
            context="Python is a programming language created by Guido van Rossum.",
            answer="Python is a programming language.",
            ground_truth="programming language",
        )
        assert grounded_score >= 0.5

    def test_answer_not_in_context(self):
        """An answer adding details missing from the context scores below 0.7."""
        ungrounded_score = HallucinationGrader().grade(
            question="Who created Python?",
            context="Python is a programming language.",
            # Neither the creator nor the year appears in the context.
            answer="Guido van Rossum created Python in 1991.",
            ground_truth="Not mentioned",
        )
        assert ungrounded_score < 0.7
class TestConfidenceCalibration:
    """Checks how the stated confidence affects the score of a correct answer."""

    def test_confident_correct_answer(self):
        """A correct answer given with 0.95 confidence must score at least 0.7."""
        score = HallucinationGrader().grade(
            question="What is 1+1?",
            context="Basic math.",
            answer="2",
            ground_truth="2",
            confidence=0.95,
        )
        assert score >= 0.7

    def test_uncertain_correct_answer(self):
        """A correct answer at 0.95 confidence scores no lower than at 0.3.

        The comparison is non-strict, so equal scores also pass.
        """
        grader = HallucinationGrader()
        # Same correct answer for both calls; only the confidence differs.
        correct_case = {
            "question": "What is 1+1?",
            "context": "Basic math.",
            "answer": "2",
            "ground_truth": "2",
        }
        score_high_conf = grader.grade(**correct_case, confidence=0.95)
        score_low_conf = grader.grade(**correct_case, confidence=0.3)
        assert score_high_conf >= score_low_conf
class TestGraderDeterminism:
    """Repeatability check plus range checks for empty answer/context inputs."""

    def test_grader_is_deterministic(self):
        """Two calls on one grader with identical inputs return equal scores."""
        grader = HallucinationGrader()
        inputs = {
            "question": "What is the capital of France?",
            "context": "The capital of France is Paris.",
            "answer": "Paris",
            "ground_truth": "Paris",
        }
        first_score = grader.grade(**inputs)
        second_score = grader.grade(**inputs)
        assert first_score == second_score

    def test_grader_handles_empty_answer(self):
        """An empty answer string still yields a score within [0.0, 1.0]."""
        score = HallucinationGrader().grade(
            question="What is the capital of France?",
            context="The capital of France is Paris.",
            answer="",
            ground_truth="Paris",
        )
        assert 0.0 <= score <= 1.0

    def test_grader_handles_empty_context(self):
        """An empty context string still yields a score within [0.0, 1.0]."""
        score = HallucinationGrader().grade(
            question="What is the capital of France?",
            context="",
            answer="Paris",
            ground_truth="Paris",
        )
        assert 0.0 <= score <= 1.0