hallucination-guard-env / tests /test_grader.py
SamSankar's picture
Upload folder using huggingface_hub
c7b7c5e verified
"""Tests for the HallucinationGrader scoring system."""
import pytest
from server.grader import HallucinationGrader
class TestGraderScoreRange:
"""Tests that grader returns valid score ranges."""
def test_grader_returns_score_in_range(self):
"""Grader should return score between 0.0 and 1.0."""
grader = HallucinationGrader()
result = grader.grade(
question="What is 2+2?",
context="2+2 equals 4.",
answer="4",
ground_truth="4"
)
assert 0.0 <= result <= 1.0
def test_grader_with_exact_match(self):
"""Exact match should score high."""
grader = HallucinationGrader()
result = grader.grade(
question="What is the capital of France?",
context="The capital of France is Paris.",
answer="Paris",
ground_truth="Paris"
)
assert result >= 0.7
def test_grader_with_wrong_answer(self):
"""Wrong answer should score low."""
grader = HallucinationGrader()
result = grader.grade(
question="What is the capital of France?",
context="The capital of France is Paris.",
answer="London",
ground_truth="Paris"
)
assert result < 0.5
def test_grader_with_partial_match(self):
"""Partial match should score moderately."""
grader = HallucinationGrader()
result = grader.grade(
question="Who wrote Romeo and Juliet?",
context="Romeo and Juliet was written by William Shakespeare.",
answer="Shakespeare",
ground_truth="William Shakespeare"
)
assert 0.3 <= result <= 0.9
class TestHallucinationDetection:
"""Tests for hallucination detection."""
def test_detects_fabricated_fact(self):
"""Grader should detect fabricated facts."""
grader = HallucinationGrader()
result = grader.grade(
question="What is the population of Tokyo?",
context="Tokyo is a major city in Japan.",
answer="Tokyo has 50 million people.", # Not in context
ground_truth="Not mentioned"
)
assert result < 0.5
def test_detects_false_citation(self):
"""Grader should detect false citations."""
grader = HallucinationGrader()
result = grader.grade(
question="What color is the sky?",
context="The sky appears blue during clear days.",
answer="The sky is green.",
ground_truth="blue",
source_quote="The sky appears green" # Not in context
)
assert result < 0.5
def test_overconfident_wrong_answer(self):
"""High confidence on wrong answer should be penalized."""
grader = HallucinationGrader()
result_confident = grader.grade(
question="What is 5+5?",
context="Basic arithmetic.",
answer="20", # Wrong
ground_truth="10",
confidence=0.95 # High confidence
)
result_uncertain = grader.grade(
question="What is 5+5?",
context="Basic arithmetic.",
answer="20", # Wrong
ground_truth="10",
confidence=0.3 # Low confidence
)
# Confident wrong answer should score lower
assert result_confident < result_uncertain
class TestSourceGrounding:
"""Tests for source grounding verification."""
def test_answer_grounded_in_context(self):
"""Answer supported by context should score higher."""
grader = HallucinationGrader()
result_grounded = grader.grade(
question="What is Python?",
context="Python is a programming language created by Guido van Rossum.",
answer="Python is a programming language.",
ground_truth="programming language"
)
assert result_grounded >= 0.5
def test_answer_not_in_context(self):
"""Answer not supported by context should be penalized."""
grader = HallucinationGrader()
result = grader.grade(
question="Who created Python?",
context="Python is a programming language.",
answer="Guido van Rossum created Python in 1991.", # Details not in context
ground_truth="Not mentioned"
)
assert result < 0.7
class TestConfidenceCalibration:
"""Tests for confidence calibration."""
def test_confident_correct_answer(self):
"""High confidence on correct answer should be rewarded."""
grader = HallucinationGrader()
result = grader.grade(
question="What is 1+1?",
context="Basic math.",
answer="2",
ground_truth="2",
confidence=0.95
)
assert result >= 0.7
def test_uncertain_correct_answer(self):
"""Low confidence on correct answer should be slightly penalized."""
grader = HallucinationGrader()
result_high_conf = grader.grade(
question="What is 1+1?",
context="Basic math.",
answer="2",
ground_truth="2",
confidence=0.95
)
result_low_conf = grader.grade(
question="What is 1+1?",
context="Basic math.",
answer="2",
ground_truth="2",
confidence=0.3
)
# High confidence on correct answer should score higher
assert result_high_conf >= result_low_conf
class TestGraderDeterminism:
"""Tests for grader determinism."""
def test_grader_is_deterministic(self):
"""Same inputs should produce same output."""
grader = HallucinationGrader()
result1 = grader.grade(
question="What is the capital of France?",
context="The capital of France is Paris.",
answer="Paris",
ground_truth="Paris"
)
result2 = grader.grade(
question="What is the capital of France?",
context="The capital of France is Paris.",
answer="Paris",
ground_truth="Paris"
)
assert result1 == result2
def test_grader_handles_empty_answer(self):
"""Grader should handle empty answer gracefully."""
grader = HallucinationGrader()
result = grader.grade(
question="What is the capital of France?",
context="The capital of France is Paris.",
answer="",
ground_truth="Paris"
)
assert 0.0 <= result <= 1.0
def test_grader_handles_empty_context(self):
"""Grader should handle empty context gracefully."""
grader = HallucinationGrader()
result = grader.grade(
question="What is the capital of France?",
context="",
answer="Paris",
ground_truth="Paris"
)
assert 0.0 <= result <= 1.0