# NOTE(review): removed non-Python artifact lines ("Spaces:", "Sleeping") left by file extraction.
"""Tests for the 9-component reward system and hallucination detection."""
import os
import sys

# Make the repository root importable so `server.grader` resolves when the
# tests are run directly from this directory.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import pytest

from server.grader import (
    HallucinationSeverity,
    HallucinationType,
    calculate_reward,
    check_factual_accuracy_advanced,
    check_quote_in_context_advanced,
    compute_bertscore,
    compute_calibration_error,
    compute_rouge,
    detect_hallucination_advanced,
    is_refusal_answer,
    normalize_text,
)
class TestRewardRange:
    """Rewards must always be in [0, 1]."""

    # BUG FIX: the original method declared a `difficulty` argument with no
    # fixture or parametrization behind it, so pytest aborted this test with
    # "fixture 'difficulty' not found".  Parametrize it explicitly instead.
    # NOTE(review): assumes calculate_reward's `difficulty_level` accepts
    # these labels — confirm against server.grader.
    @pytest.mark.parametrize("difficulty", ["easy", "medium", "hard"])
    def test_reward_in_range_correct_answer(self, difficulty):
        # A fully correct, well-cited answer stays within [0, 1] at every
        # difficulty level.
        reward, info = calculate_reward(
            answer="Paris is the capital of France.",
            confidence=0.9,
            source_quote="Paris is the capital of France.",
            context="Paris is the capital of France. It is located in northern France.",
            ground_truth="Paris",
            difficulty_level=difficulty,
        )
        assert 0.0 <= reward <= 1.0, f"Reward {reward} out of range for {difficulty}"

    def test_reward_in_range_wrong_answer(self):
        # A confidently wrong answer must still yield a bounded reward.
        reward, info = calculate_reward(
            answer="London is the capital of France.",
            confidence=0.9,
            source_quote="London is the capital of France.",
            context="Paris is the capital of France.",
            ground_truth="Paris",
        )
        assert 0.0 <= reward <= 1.0

    def test_reward_in_range_empty_answer(self):
        # Degenerate input (empty answer and quote) must not push the
        # reward outside its range.
        reward, info = calculate_reward(
            answer="",
            confidence=0.5,
            source_quote="",
            context="Some context here.",
            ground_truth="Some answer",
        )
        assert 0.0 <= reward <= 1.0

    def test_reward_in_range_refusal(self):
        # A refusal on an unanswerable question also stays within bounds.
        reward, info = calculate_reward(
            answer="I cannot answer from the provided context.",
            confidence=0.3,
            source_quote="",
            context="Some unrelated context.",
            ground_truth="not mentioned in context",
        )
        assert 0.0 <= reward <= 1.0
class TestRefusalHandling:
    """Proper refusals on unanswerable questions should be rewarded."""

    def test_proper_refusal_rewarded(self):
        # Honest refusal + unanswerable ground truth -> decent reward, and
        # the grader must flag the answer as a refusal.
        call_args = dict(
            answer="I cannot answer from the provided context.",
            confidence=0.3,
            source_quote="",
            context="The sky is blue.",
            ground_truth="not mentioned in context",
        )
        reward, info = calculate_reward(**call_args)
        assert reward >= 0.5, f"Proper refusal should get reward >= 0.5, got {reward}"
        assert info.get("is_refusal") is True

    def test_underconfident_refusal_penalized(self):
        """Refusing when the answer IS in context should be penalized."""
        reward, _info = calculate_reward(
            answer="I cannot determine the answer from the context.",
            confidence=0.3,
            source_quote="",
            context="The capital of France is Paris.",
            ground_truth="Paris",
        )
        assert reward <= 0.4, f"Underconfident refusal should be penalized, got {reward}"

    def test_overconfident_refusal(self):
        """High confidence refusal on answerable question should be penalized."""
        reward, _info = calculate_reward(
            answer="I don't know the answer.",
            confidence=0.9,
            source_quote="",
            context="The capital of France is Paris.",
            ground_truth="Paris",
        )
        assert reward <= 0.5
class TestHallucinationDetection:
    """Hallucination detection should classify types correctly."""

    def test_no_hallucination_for_grounded_answer(self):
        # An answer that restates the context verbatim should score low.
        score, _htype, _severity, _analysis = detect_hallucination_advanced(
            answer="Paris is the capital of France.",
            context="Paris is the capital of France.",
            ground_truth="Paris",
            confidence=0.9,
        )
        assert score < 0.3, f"Grounded answer should have low hallucination score, got {score}"

    def test_fabricated_fact_detected(self):
        # A claim contradicting the context should score high.
        score, _htype, _severity, _analysis = detect_hallucination_advanced(
            answer="Berlin is the capital of France.",
            context="Paris is the capital of France.",
            ground_truth="Paris",
            confidence=0.9,
        )
        assert score > 0.3, f"Fabricated fact should have high hallucination score, got {score}"

    def test_numerical_fabrication_detected(self):
        # A number absent from the context should be reported in the
        # analysis under "numerical_fabrication".
        _score, _htype, _severity, analysis = detect_hallucination_advanced(
            answer="The population is 8.7 million.",
            context="The population is 2.1 million people.",
            ground_truth="2.1 million",
            confidence=0.8,
        )
        assert analysis.get("numerical_fabrication", 0) > 0, \
            f"Fabricated number 8.7 should be detected, got {analysis}"
class TestCitationAccuracy:
    """Source quote verification should work correctly."""

    def test_exact_quote_match(self):
        # A quote copied verbatim from the context earns a perfect score.
        quote = "Paris is the capital of France."
        passage = "Paris is the capital of France. It is a beautiful city."
        score, _analysis = check_quote_in_context_advanced(quote, passage)
        assert score == 1.0, f"Exact quote should score 1.0, got {score}"

    def test_no_quote(self):
        # An empty quote can never be verified against the context.
        score, _analysis = check_quote_in_context_advanced("", "Some context here.")
        assert score == 0.0

    def test_partial_quote(self):
        # A substring of a context sentence still earns partial credit.
        score, _analysis = check_quote_in_context_advanced(
            "capital of France", "Paris is the capital of France."
        )
        assert score > 0.5, f"Partial quote should score > 0.5, got {score}"
class TestCalibrationError:
    """Calibration error should penalize overconfidence."""

    def test_perfect_calibration(self):
        # Confidence exactly matching accuracy yields zero error.
        assert compute_calibration_error(0.9, 0.9) == 0.0

    def test_overconfidence_penalized(self):
        err = compute_calibration_error(0.95, 0.3)
        assert err > 0.5, f"Overconfidence should be heavily penalized, got {err}"

    def test_underconfidence_safe(self):
        # The penalty is asymmetric: overconfidence costs more.
        under = compute_calibration_error(0.3, 0.9)
        over = compute_calibration_error(0.95, 0.3)
        assert under < over, \
            "Overconfidence should be penalized more than underconfidence"
class TestBERTScoreEdgeCases:
    """BERTScore should not crash on edge cases."""

    def test_empty_strings(self):
        # No content to compare -> F1 must be exactly zero.
        assert compute_bertscore("", "")["f1"] == 0.0

    def test_identical_strings(self):
        sentence = "The cat sat on the mat."
        scores = compute_bertscore(sentence, sentence)
        assert scores["f1"] > 0.8, f"Identical strings should have high BERTScore, got {scores['f1']}"

    def test_short_strings(self):
        scores = compute_bertscore("yes", "no")
        assert "f1" in scores  # Should not crash
class TestROUGE:
    """ROUGE scores should be computed correctly."""

    def test_identical_strings(self):
        # Self-comparison gives a perfect longest-common-subsequence score.
        text = "The cat sat on the mat."
        assert compute_rouge(text, text)["rougeL"] == 1.0

    def test_completely_different(self):
        scores = compute_rouge("The cat sat on the mat.", "Dogs run in the park.")
        assert scores["rougeL"] < 0.5

    def test_empty_strings(self):
        assert compute_rouge("", "")["rouge1"] == 0.0
class TestFactualAccuracy:
    """Factual accuracy should handle various answer types."""

    def test_exact_match(self):
        context = "Paris is the capital of France."
        score, _analysis = check_factual_accuracy_advanced("Paris", "Paris", context)
        assert score >= 0.9, f"Exact match should score high, got {score}"

    def test_wrong_answer(self):
        context = "Paris is the capital of France."
        score, _analysis = check_factual_accuracy_advanced("London", "Paris", context)
        assert score < 0.5, f"Wrong answer should score low, got {score}"

    def test_contains_truth(self):
        # A longer answer that embeds the ground truth still scores high.
        verbose_answer = "The capital is Paris, which is in northern France."
        score, _analysis = check_factual_accuracy_advanced(
            verbose_answer,
            "Paris",
            "Paris is the capital of France.",
        )
        assert score >= 0.8, f"Answer containing truth should score high, got {score}"
class TestNormalizeText:
    """Text normalization should handle edge cases."""

    # NOTE(review): the literals below may have had repeated spaces collapsed
    # by file extraction — verify the intended whitespace against version
    # control before relying on these exact strings.

    def test_empty_string(self):
        assert normalize_text("") == ""

    def test_whitespace_normalization(self):
        normalized = normalize_text(" The cat sat ")
        assert " " not in normalized

    def test_case_normalization(self):
        normalized = normalize_text("PARIS IS THE CAPITAL")
        assert normalized == normalized.lower()