Upload folder using huggingface_hub

c7b7c5e verified about 2 months ago

7 kB

	"""Tests for the HallucinationGrader scoring system."""
	import pytest
	from server.grader import HallucinationGrader


	class TestGraderScoreRange:
	"""Tests that grader returns valid score ranges."""

	def test_grader_returns_score_in_range(self):
	"""Grader should return score between 0.0 and 1.0."""
	grader = HallucinationGrader()
	result = grader.grade(
	question="What is 2+2?",
	context="2+2 equals 4.",
	answer="4",
	ground_truth="4"
	)

	assert 0.0 <= result <= 1.0

	def test_grader_with_exact_match(self):
	"""Exact match should score high."""
	grader = HallucinationGrader()
	result = grader.grade(
	question="What is the capital of France?",
	context="The capital of France is Paris.",
	answer="Paris",
	ground_truth="Paris"
	)

	assert result >= 0.7

	def test_grader_with_wrong_answer(self):
	"""Wrong answer should score low."""
	grader = HallucinationGrader()
	result = grader.grade(
	question="What is the capital of France?",
	context="The capital of France is Paris.",
	answer="London",
	ground_truth="Paris"
	)

	assert result < 0.5

	def test_grader_with_partial_match(self):
	"""Partial match should score moderately."""
	grader = HallucinationGrader()
	result = grader.grade(
	question="Who wrote Romeo and Juliet?",
	context="Romeo and Juliet was written by William Shakespeare.",
	answer="Shakespeare",
	ground_truth="William Shakespeare"
	)

	assert 0.3 <= result <= 0.9


	class TestHallucinationDetection:
	"""Tests for hallucination detection."""

	def test_detects_fabricated_fact(self):
	"""Grader should detect fabricated facts."""
	grader = HallucinationGrader()
	result = grader.grade(
	question="What is the population of Tokyo?",
	context="Tokyo is a major city in Japan.",
	answer="Tokyo has 50 million people.", # Not in context
	ground_truth="Not mentioned"
	)

	assert result < 0.5

	def test_detects_false_citation(self):
	"""Grader should detect false citations."""
	grader = HallucinationGrader()
	result = grader.grade(
	question="What color is the sky?",
	context="The sky appears blue during clear days.",
	answer="The sky is green.",
	ground_truth="blue",
	source_quote="The sky appears green" # Not in context
	)

	assert result < 0.5

	def test_overconfident_wrong_answer(self):
	"""High confidence on wrong answer should be penalized."""
	grader = HallucinationGrader()
	result_confident = grader.grade(
	question="What is 5+5?",
	context="Basic arithmetic.",
	answer="20", # Wrong
	ground_truth="10",
	confidence=0.95 # High confidence
	)
	result_uncertain = grader.grade(
	question="What is 5+5?",
	context="Basic arithmetic.",
	answer="20", # Wrong
	ground_truth="10",
	confidence=0.3 # Low confidence
	)

	# Confident wrong answer should score lower
	assert result_confident < result_uncertain


	class TestSourceGrounding:
	"""Tests for source grounding verification."""

	def test_answer_grounded_in_context(self):
	"""Answer supported by context should score higher."""
	grader = HallucinationGrader()
	result_grounded = grader.grade(
	question="What is Python?",
	context="Python is a programming language created by Guido van Rossum.",
	answer="Python is a programming language.",
	ground_truth="programming language"
	)

	assert result_grounded >= 0.5

	def test_answer_not_in_context(self):
	"""Answer not supported by context should be penalized."""
	grader = HallucinationGrader()
	result = grader.grade(
	question="Who created Python?",
	context="Python is a programming language.",
	answer="Guido van Rossum created Python in 1991.", # Details not in context
	ground_truth="Not mentioned"
	)

	assert result < 0.7


	class TestConfidenceCalibration:
	"""Tests for confidence calibration."""

	def test_confident_correct_answer(self):
	"""High confidence on correct answer should be rewarded."""
	grader = HallucinationGrader()
	result = grader.grade(
	question="What is 1+1?",
	context="Basic math.",
	answer="2",
	ground_truth="2",
	confidence=0.95
	)

	assert result >= 0.7

	def test_uncertain_correct_answer(self):
	"""Low confidence on correct answer should be slightly penalized."""
	grader = HallucinationGrader()
	result_high_conf = grader.grade(
	question="What is 1+1?",
	context="Basic math.",
	answer="2",
	ground_truth="2",
	confidence=0.95
	)
	result_low_conf = grader.grade(
	question="What is 1+1?",
	context="Basic math.",
	answer="2",
	ground_truth="2",
	confidence=0.3
	)

	# High confidence on correct answer should score higher
	assert result_high_conf >= result_low_conf


	class TestGraderDeterminism:
	"""Tests for grader determinism."""

	def test_grader_is_deterministic(self):
	"""Same inputs should produce same output."""
	grader = HallucinationGrader()

	result1 = grader.grade(
	question="What is the capital of France?",
	context="The capital of France is Paris.",
	answer="Paris",
	ground_truth="Paris"
	)
	result2 = grader.grade(
	question="What is the capital of France?",
	context="The capital of France is Paris.",
	answer="Paris",
	ground_truth="Paris"
	)

	assert result1 == result2

	def test_grader_handles_empty_answer(self):
	"""Grader should handle empty answer gracefully."""
	grader = HallucinationGrader()
	result = grader.grade(
	question="What is the capital of France?",
	context="The capital of France is Paris.",
	answer="",
	ground_truth="Paris"
	)

	assert 0.0 <= result <= 1.0

	def test_grader_handles_empty_context(self):
	"""Grader should handle empty context gracefully."""
	grader = HallucinationGrader()
	result = grader.grade(
	question="What is the capital of France?",
	context="",
	answer="Paris",
	ground_truth="Paris"
	)

	assert 0.0 <= result <= 1.0