# AutoClean-Ai/tests/test_grader.py
# NOTE: Hugging Face Hub page residue converted to a comment so the module imports cleanly.
# (Uploaded by sairaj2 via huggingface_hub, commit 61da702, verified.)
"""Tests for the 9-component reward system and hallucination detection."""
import sys, os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import pytest
from server.grader import (
calculate_reward,
detect_hallucination_advanced,
compute_calibration_error,
is_refusal_answer,
normalize_text,
check_quote_in_context_advanced,
check_factual_accuracy_advanced,
compute_rouge,
compute_bertscore,
HallucinationType,
HallucinationSeverity,
)
class TestRewardRange:
    """The reward function must always return a value inside [0, 1]."""

    @pytest.mark.parametrize("difficulty", ["beginner", "intermediate", "advanced", "expert"])
    def test_reward_in_range_correct_answer(self, difficulty):
        # A fully grounded, correct answer must stay in range at every difficulty level.
        score, _details = calculate_reward(
            answer="Paris is the capital of France.",
            confidence=0.9,
            source_quote="Paris is the capital of France.",
            context="Paris is the capital of France. It is located in northern France.",
            ground_truth="Paris",
            difficulty_level=difficulty,
        )
        assert 0.0 <= score <= 1.0, f"Reward {score} out of range for {difficulty}"

    def test_reward_in_range_wrong_answer(self):
        # An incorrect answer may be penalized, but never outside the bounds.
        score, _details = calculate_reward(
            answer="London is the capital of France.",
            confidence=0.9,
            source_quote="London is the capital of France.",
            context="Paris is the capital of France.",
            ground_truth="Paris",
        )
        assert 0.0 <= score <= 1.0

    def test_reward_in_range_empty_answer(self):
        # Degenerate case: empty answer and empty quote still yield a bounded reward.
        score, _details = calculate_reward(
            answer="",
            confidence=0.5,
            source_quote="",
            context="Some context here.",
            ground_truth="Some answer",
        )
        assert 0.0 <= score <= 1.0

    def test_reward_in_range_refusal(self):
        # Refusals take a separate scoring path; it must respect the bounds too.
        score, _details = calculate_reward(
            answer="I cannot answer from the provided context.",
            confidence=0.3,
            source_quote="",
            context="Some unrelated context.",
            ground_truth="not mentioned in context",
        )
        assert 0.0 <= score <= 1.0
class TestRefusalHandling:
    """Refusing should earn reward only when the question is truly unanswerable."""

    def test_proper_refusal_rewarded(self):
        # Ground truth marks the question unanswerable, so a refusal is correct.
        score, details = calculate_reward(
            answer="I cannot answer from the provided context.",
            confidence=0.3,
            source_quote="",
            context="The sky is blue.",
            ground_truth="not mentioned in context",
        )
        assert score >= 0.5, f"Proper refusal should get reward >= 0.5, got {score}"
        assert details.get("is_refusal") is True

    def test_underconfident_refusal_penalized(self):
        """Refusing when the answer IS in context should be penalized."""
        score, _details = calculate_reward(
            answer="I cannot determine the answer from the context.",
            confidence=0.3,
            source_quote="",
            context="The capital of France is Paris.",
            ground_truth="Paris",
        )
        assert score <= 0.4, f"Underconfident refusal should be penalized, got {score}"

    def test_overconfident_refusal(self):
        """High confidence refusal on answerable question should be penalized."""
        score, _details = calculate_reward(
            answer="I don't know the answer.",
            confidence=0.9,
            source_quote="",
            context="The capital of France is Paris.",
            ground_truth="Paris",
        )
        assert score <= 0.5
class TestHallucinationDetection:
    """The detector must separate grounded answers from fabricated ones."""

    def test_no_hallucination_for_grounded_answer(self):
        # The answer restates the context verbatim, so the score should be low.
        score, _htype, _severity, _analysis = detect_hallucination_advanced(
            answer="Paris is the capital of France.",
            context="Paris is the capital of France.",
            ground_truth="Paris",
            confidence=0.9,
        )
        assert score < 0.3, f"Grounded answer should have low hallucination score, got {score}"

    def test_fabricated_fact_detected(self):
        # "Berlin" contradicts both context and ground truth → should score high.
        score, _htype, _severity, _analysis = detect_hallucination_advanced(
            answer="Berlin is the capital of France.",
            context="Paris is the capital of France.",
            ground_truth="Paris",
            confidence=0.9,
        )
        assert score > 0.3, f"Fabricated fact should have high hallucination score, got {score}"

    def test_numerical_fabrication_detected(self):
        # 8.7 never appears in the context, so the numeric-fabrication signal must fire.
        _score, _htype, _severity, analysis = detect_hallucination_advanced(
            answer="The population is 8.7 million.",
            context="The population is 2.1 million people.",
            ground_truth="2.1 million",
            confidence=0.8,
        )
        assert analysis.get("numerical_fabrication", 0) > 0, \
            f"Fabricated number 8.7 should be detected, got {analysis}"
class TestCitationAccuracy:
    """Source quote verification should work correctly."""

    def test_exact_quote_match(self):
        # A quote lifted verbatim from the context must score a perfect 1.0.
        quote = "Paris is the capital of France."
        passage = "Paris is the capital of France. It is a beautiful city."
        score, _analysis = check_quote_in_context_advanced(quote, passage)
        assert score == 1.0, f"Exact quote should score 1.0, got {score}"

    def test_no_quote(self):
        # An empty quote cannot be grounded anywhere → score must be zero.
        score, _analysis = check_quote_in_context_advanced("", "Some context here.")
        assert score == 0.0

    def test_partial_quote(self):
        # A substring of a context sentence should still earn substantial credit.
        score, _analysis = check_quote_in_context_advanced(
            "capital of France",
            "Paris is the capital of France.",
        )
        assert score > 0.5, f"Partial quote should score > 0.5, got {score}"
class TestCalibrationError:
    """Calibration error should penalize overconfidence."""

    def test_perfect_calibration(self):
        # Confidence exactly matching accuracy is zero error by definition.
        assert compute_calibration_error(0.9, 0.9) == 0.0

    def test_overconfidence_penalized(self):
        # High confidence paired with low accuracy is the dangerous case.
        error = compute_calibration_error(0.95, 0.3)
        assert error > 0.5, f"Overconfidence should be heavily penalized, got {error}"

    def test_underconfidence_safe(self):
        # The penalty must be asymmetric: underconfidence is the milder failure.
        underconfident = compute_calibration_error(0.3, 0.9)
        overconfident = compute_calibration_error(0.95, 0.3)
        assert underconfident < overconfident, \
            "Overconfidence should be penalized more than underconfidence"
class TestBERTScoreEdgeCases:
    """BERTScore must degrade gracefully on degenerate inputs."""

    def test_empty_strings(self):
        # Two empty strings carry no signal; F1 must be exactly zero.
        assert compute_bertscore("", "")["f1"] == 0.0

    def test_identical_strings(self):
        sentence = "The cat sat on the mat."
        result = compute_bertscore(sentence, sentence)
        assert result["f1"] > 0.8, f"Identical strings should have high BERTScore, got {result['f1']}"

    def test_short_strings(self):
        # Single-token inputs are a classic crash case; only require a result dict.
        result = compute_bertscore("yes", "no")
        assert "f1" in result  # Should not crash
class TestROUGE:
    """ROUGE scores should be computed correctly."""

    def test_identical_strings(self):
        # Self-comparison is the upper bound: ROUGE-L must be exactly 1.0.
        sentence = "The cat sat on the mat."
        scores = compute_rouge(sentence, sentence)
        assert scores["rougeL"] == 1.0

    def test_completely_different(self):
        # Disjoint content should keep the longest-common-subsequence score low.
        scores = compute_rouge("The cat sat on the mat.", "Dogs run in the park.")
        assert scores["rougeL"] < 0.5

    def test_empty_strings(self):
        # No tokens on either side → unigram overlap must be zero, not an error.
        scores = compute_rouge("", "")
        assert scores["rouge1"] == 0.0
class TestFactualAccuracy:
    """Factual accuracy should handle various answer types."""

    def test_exact_match(self):
        # Answer identical to the ground truth is the easy case: near-perfect score.
        context = "Paris is the capital of France."
        score, _analysis = check_factual_accuracy_advanced("Paris", "Paris", context)
        assert score >= 0.9, f"Exact match should score high, got {score}"

    def test_wrong_answer(self):
        # A contradicting answer must land clearly below the midpoint.
        context = "Paris is the capital of France."
        score, _analysis = check_factual_accuracy_advanced("London", "Paris", context)
        assert score < 0.5, f"Wrong answer should score low, got {score}"

    def test_contains_truth(self):
        # A verbose answer that embeds the ground truth should still score high.
        score, _analysis = check_factual_accuracy_advanced(
            "The capital is Paris, which is in northern France.",
            "Paris",
            "Paris is the capital of France.",
        )
        assert score >= 0.8, f"Answer containing truth should score high, got {score}"
class TestNormalizeText:
    """Text normalization should handle edge cases."""

    def test_empty_string(self):
        # Empty in, empty out — no padding or sentinel text.
        assert normalize_text("") == ""

    def test_whitespace_normalization(self):
        # Runs of whitespace must be collapsed: no double space survives.
        # NOTE(review): literals assume two-space runs in the fixture — confirm against repo copy.
        normalized = normalize_text("  The cat  sat  ")
        assert "  " not in normalized

    def test_case_normalization(self):
        # Output should already be lower-cased, so lowering it again is a no-op.
        normalized = normalize_text("PARIS IS THE CAPITAL")
        assert normalized == normalized.lower()