# AutoClean-Ai/tests/test_grader.py
# NOTE: Hugging Face Hub page residue converted to a comment so the module imports cleanly.
# (Uploaded by sairaj2 via huggingface_hub, commit 61da702, verified.)
"""Tests for the 9-component reward system and hallucination detection."""
import sys, os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import pytest
from server.grader import (
calculate_reward,
detect_hallucination_advanced,
compute_calibration_error,
is_refusal_answer,
normalize_text,
check_quote_in_context_advanced,
check_factual_accuracy_advanced,
compute_rouge,
compute_bertscore,
HallucinationType,
HallucinationSeverity,
)
class TestRewardRange:
    """The reward function must always return a value inside [0, 1]."""

    @pytest.mark.parametrize("difficulty", ["beginner", "intermediate", "advanced", "expert"])
    def test_reward_in_range_correct_answer(self, difficulty):
        # A fully grounded, correct answer must stay in range at every difficulty level.
        score, _details = calculate_reward(
            answer="Paris is the capital of France.",
            confidence=0.9,
            source_quote="Paris is the capital of France.",
            context="Paris is the capital of France. It is located in northern France.",
            ground_truth="Paris",
            difficulty_level=difficulty,
        )
        assert 0.0 <= score <= 1.0, f"Reward {score} out of range for {difficulty}"

    def test_reward_in_range_wrong_answer(self):
        # An incorrect answer may be penalized, but never outside the bounds.
        score, _details = calculate_reward(
            answer="London is the capital of France.",
            confidence=0.9,
            source_quote="London is the capital of France.",
            context="Paris is the capital of France.",
            ground_truth="Paris",
        )
        assert 0.0 <= score <= 1.0

    def test_reward_in_range_empty_answer(self):
        # Degenerate case: empty answer and empty quote still yield a bounded reward.
        score, _details = calculate_reward(
            answer="",
            confidence=0.5,
            source_quote="",
            context="Some context here.",
            ground_truth="Some answer",
        )
        assert 0.0 <= score <= 1.0

    def test_reward_in_range_refusal(self):
        # Refusals take a separate scoring path; it must respect the bounds too.
        score, _details = calculate_reward(
            answer="I cannot answer from the provided context.",
            confidence=0.3,
            source_quote="",
            context="Some unrelated context.",
            ground_truth="not mentioned in context",
        )
        assert 0.0 <= score <= 1.0
class TestRefusalHandling:
    """Refusing should earn reward only when the question is truly unanswerable."""

    def test_proper_refusal_rewarded(self):
        # Ground truth marks the question unanswerable, so a refusal is correct.
        score, details = calculate_reward(
            answer="I cannot answer from the provided context.",
            confidence=0.3,
            source_quote="",
            context="The sky is blue.",
            ground_truth="not mentioned in context",
        )
        assert score >= 0.5, f"Proper refusal should get reward >= 0.5, got {score}"
        assert details.get("is_refusal") is True

    def test_underconfident_refusal_penalized(self):
        """Refusing when the answer IS in context should be penalized."""
        score, _details = calculate_reward(
            answer="I cannot determine the answer from the context.",
            confidence=0.3,
            source_quote="",
            context="The capital of France is Paris.",
            ground_truth="Paris",
        )
        assert score <= 0.4, f"Underconfident refusal should be penalized, got {score}"

    def test_overconfident_refusal(self):
        """High confidence refusal on answerable question should be penalized."""
        score, _details = calculate_reward(
            answer="I don't know the answer.",
            confidence=0.9,
            source_quote="",
            context="The capital of France is Paris.",
            ground_truth="Paris",
        )
        assert score <= 0.5
class TestHallucinationDetection:
    """The detector must separate grounded answers from fabricated ones."""

    def test_no_hallucination_for_grounded_answer(self):
        # The answer restates the context verbatim, so the score should be low.
        score, _htype, _severity, _analysis = detect_hallucination_advanced(
            answer="Paris is the capital of France.",
            context="Paris is the capital of France.",
            ground_truth="Paris",
            confidence=0.9,
        )
        assert score < 0.3, f"Grounded answer should have low hallucination score, got {score}"

    def test_fabricated_fact_detected(self):
        # "Berlin" contradicts both context and ground truth → should score high.
        score, _htype, _severity, _analysis = detect_hallucination_advanced(
            answer="Berlin is the capital of France.",
            context="Paris is the capital of France.",
            ground_truth="Paris",
            confidence=0.9,
        )
        assert score > 0.3, f"Fabricated fact should have high hallucination score, got {score}"

    def test_numerical_fabrication_detected(self):
        # 8.7 never appears in the context, so the numeric-fabrication signal must fire.
        _score, _htype, _severity, analysis = detect_hallucination_advanced(
            answer="The population is 8.7 million.",
            context="The population is 2.1 million people.",
            ground_truth="2.1 million",
            confidence=0.8,
        )
        assert analysis.get("numerical_fabrication", 0) > 0, \
            f"Fabricated number 8.7 should be detected, got {analysis}"
class TestCitationAccuracy:
    """Source quote verification should work correctly."""

    def test_exact_quote_match(self):
        # A quote lifted verbatim from the context must score a perfect 1.0.
        quote = "Paris is the capital of France."
        passage = "Paris is the capital of France. It is a beautiful city."
        score, _analysis = check_quote_in_context_advanced(quote, passage)
        assert score == 1.0, f"Exact quote should score 1.0, got {score}"

    def test_no_quote(self):
        # An empty quote cannot be grounded anywhere → score must be zero.
        score, _analysis = check_quote_in_context_advanced("", "Some context here.")
        assert score == 0.0

    def test_partial_quote(self):
        # A substring of a context sentence should still earn substantial credit.
        score, _analysis = check_quote_in_context_advanced(
            "capital of France",
            "Paris is the capital of France.",
        )
        assert score > 0.5, f"Partial quote should score > 0.5, got {score}"
class TestCalibrationError:
    """Calibration error should penalize overconfidence."""

    def test_perfect_calibration(self):
        # Confidence exactly matching accuracy is zero error by definition.
        assert compute_calibration_error(0.9, 0.9) == 0.0

    def test_overconfidence_penalized(self):
        # High confidence paired with low accuracy is the dangerous case.
        error = compute_calibration_error(0.95, 0.3)
        assert error > 0.5, f"Overconfidence should be heavily penalized, got {error}"

    def test_underconfidence_safe(self):
        # The penalty must be asymmetric: underconfidence is the milder failure.
        underconfident = compute_calibration_error(0.3, 0.9)
        overconfident = compute_calibration_error(0.95, 0.3)
        assert underconfident < overconfident, \
            "Overconfidence should be penalized more than underconfidence"
class TestBERTScoreEdgeCases:
    """BERTScore must degrade gracefully on degenerate inputs."""

    def test_empty_strings(self):
        # Two empty strings carry no signal; F1 must be exactly zero.
        assert compute_bertscore("", "")["f1"] == 0.0

    def test_identical_strings(self):
        sentence = "The cat sat on the mat."
        result = compute_bertscore(sentence, sentence)
        assert result["f1"] > 0.8, f"Identical strings should have high BERTScore, got {result['f1']}"

    def test_short_strings(self):
        # Single-token inputs are a classic crash case; only require a result dict.
        result = compute_bertscore("yes", "no")
        assert "f1" in result  # Should not crash
class TestROUGE:
    """ROUGE scores should be computed correctly."""

    def test_identical_strings(self):
        # Self-comparison is the upper bound: ROUGE-L must be exactly 1.0.
        sentence = "The cat sat on the mat."
        scores = compute_rouge(sentence, sentence)
        assert scores["rougeL"] == 1.0

    def test_completely_different(self):
        # Disjoint content should keep the longest-common-subsequence score low.
        scores = compute_rouge("The cat sat on the mat.", "Dogs run in the park.")
        assert scores["rougeL"] < 0.5

    def test_empty_strings(self):
        # No tokens on either side → unigram overlap must be zero, not an error.
        scores = compute_rouge("", "")
        assert scores["rouge1"] == 0.0
class TestFactualAccuracy:
    """Factual accuracy should handle various answer types."""

    def test_exact_match(self):
        # Answer identical to the ground truth is the easy case: near-perfect score.
        context = "Paris is the capital of France."
        score, _analysis = check_factual_accuracy_advanced("Paris", "Paris", context)
        assert score >= 0.9, f"Exact match should score high, got {score}"

    def test_wrong_answer(self):
        # A contradicting answer must land clearly below the midpoint.
        context = "Paris is the capital of France."
        score, _analysis = check_factual_accuracy_advanced("London", "Paris", context)
        assert score < 0.5, f"Wrong answer should score low, got {score}"

    def test_contains_truth(self):
        # A verbose answer that embeds the ground truth should still score high.
        score, _analysis = check_factual_accuracy_advanced(
            "The capital is Paris, which is in northern France.",
            "Paris",
            "Paris is the capital of France.",
        )
        assert score >= 0.8, f"Answer containing truth should score high, got {score}"
class TestNormalizeText:
    """Text normalization should handle edge cases."""

    def test_empty_string(self):
        # Empty in, empty out — no padding or sentinel text.
        assert normalize_text("") == ""

    def test_whitespace_normalization(self):
        # Runs of whitespace must be collapsed: no double space survives.
        # NOTE(review): literals assume two-space runs in the fixture — confirm against repo copy.
        normalized = normalize_text("  The cat  sat  ")
        assert "  " not in normalized

    def test_case_normalization(self):
        # Output should already be lower-cased, so lowering it again is a no-op.
        normalized = normalize_text("PARIS IS THE CAPITAL")
        assert normalized == normalized.lower()