Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

tests/__init__.py +1 -0
tests/test_dataset_loader.py +135 -0
tests/test_environment.py +196 -0
tests/test_grader.py +218 -0

tests/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # HallucinationGuard-Env Test Suite

tests/test_dataset_loader.py ADDED Viewed

	@@ -0,0 +1,135 @@

+"""Tests for the DatasetLoader."""
+import pytest
+from server.dataset_loader import DatasetLoader
+class TestDatasetLoaderInit:
+    """Tests for DatasetLoader initialization."""
+    def test_loader_initializes(self):
+        """DatasetLoader should initialize without errors."""
+        loader = DatasetLoader()
+        assert loader is not None
+class TestDatasetSampling:
+    """Tests for dataset sampling."""
+    def test_get_sample_returns_data(self):
+        """get_sample should return a sample with required fields."""
+        loader = DatasetLoader()
+        sample = loader.get_sample("task_1_factual_grounding")
+        assert sample is not None
+        assert hasattr(sample, 'question') or 'question' in sample
+    def test_get_sample_has_context(self):
+        """Sample should include context."""
+        loader = DatasetLoader()
+        sample = loader.get_sample("task_1_factual_grounding")
+        assert hasattr(sample, 'context') or 'context' in sample
+    def test_get_sample_has_ground_truth(self):
+        """Sample should include ground truth."""
+        loader = DatasetLoader()
+        sample = loader.get_sample("task_1_factual_grounding")
+        assert hasattr(sample, 'ground_truth') or 'ground_truth' in sample
+    def test_get_sample_for_task_1(self):
+        """get_sample for task_1 should work."""
+        loader = DatasetLoader()
+        sample = loader.get_sample("task_1_factual_grounding")
+        assert sample is not None
+    def test_get_sample_for_task_2(self):
+        """get_sample for task_2 should work."""
+        loader = DatasetLoader()
+        sample = loader.get_sample("task_2_multi_hop_synthesis")
+        assert sample is not None
+    def test_get_sample_for_task_3(self):
+        """get_sample for task_3 should work."""
+        loader = DatasetLoader()
+        sample = loader.get_sample("task_3_adversarial_resistance")
+        assert sample is not None
+class TestDatasetStats:
+    """Tests for dataset statistics."""
+    def test_get_available_datasets(self):
+        """Should be able to get list of available datasets."""
+        loader = DatasetLoader()
+        if hasattr(loader, 'get_available_datasets'):
+            datasets = loader.get_available_datasets()
+            assert isinstance(datasets, list)
+            assert len(datasets) > 0
+    def test_get_dataset_size(self):
+        """Should be able to get dataset size info."""
+        loader = DatasetLoader()
+        if hasattr(loader, 'get_dataset_size'):
+            size = loader.get_dataset_size("task_1_factual_grounding")
+            assert size is None or isinstance(size, int)
+class TestDatasetCaching:
+    """Tests for dataset caching behavior."""
+    def test_multiple_samples_dont_crash(self):
+        """Requesting multiple samples should work."""
+        loader = DatasetLoader()
+        samples = []
+        for _ in range(5):
+            sample = loader.get_sample("task_1_factual_grounding")
+            samples.append(sample)
+        assert len(samples) == 5
+    def test_loader_handles_missing_dataset(self):
+        """Loader should handle request for missing dataset gracefully."""
+        loader = DatasetLoader()
+        try:
+            sample = loader.get_sample("nonexistent_dataset")
+            # If it returns None, that's acceptable
+            assert sample is None or sample is not None
+        except Exception as e:
+            # Should raise a reasonable exception, not crash
+            assert "not found" in str(e).lower() or "invalid" in str(e).lower() or "error" in str(e).lower()
+class TestTaskDifficulty:
+    """Tests for task difficulty mapping."""
+    def test_task_1_is_beginner(self):
+        """Task 1 should map to beginner difficulty."""
+        loader = DatasetLoader()
+        if hasattr(loader, 'get_task_difficulty'):
+            difficulty = loader.get_task_difficulty("task_1_factual_grounding")
+            assert difficulty in ["beginner", "easy", 1, "1"]
+    def test_task_2_is_intermediate(self):
+        """Task 2 should map to intermediate difficulty."""
+        loader = DatasetLoader()
+        if hasattr(loader, 'get_task_difficulty'):
+            difficulty = loader.get_task_difficulty("task_2_multi_hop_synthesis")
+            assert difficulty in ["intermediate", "medium", 2, "2"]
+    def test_task_3_is_advanced(self):
+        """Task 3 should map to advanced difficulty."""
+        loader = DatasetLoader()
+        if hasattr(loader, 'get_task_difficulty'):
+            difficulty = loader.get_task_difficulty("task_3_adversarial_resistance")
+            assert difficulty in ["advanced", "hard", 3, "3"]

tests/test_environment.py ADDED Viewed

	@@ -0,0 +1,196 @@

+"""Tests for the HallucinationGuard environment."""
+import pytest
+from server.environment import HallucinationGuardEnvironment
+class TestEnvironmentReset:
+    """Tests for environment reset functionality."""
+    def test_reset_returns_observation(self):
+        """Reset should return a valid observation."""
+        env = HallucinationGuardEnvironment()
+        obs = env.reset()
+        assert obs is not None
+        assert hasattr(obs, 'question')
+        assert hasattr(obs, 'context')
+        assert hasattr(obs, 'reward')
+        assert hasattr(obs, 'done')
+    def test_reset_sets_initial_reward_to_zero(self):
+        """Initial reward should be zero."""
+        env = HallucinationGuardEnvironment()
+        obs = env.reset()
+        assert obs.reward == 0.0
+    def test_reset_sets_done_to_false(self):
+        """Episode should not be done after reset."""
+        env = HallucinationGuardEnvironment()
+        obs = env.reset()
+        assert obs.done is False
+    def test_reset_provides_attempts_remaining(self):
+        """Reset should indicate attempts remaining."""
+        env = HallucinationGuardEnvironment()
+        obs = env.reset()
+        assert obs.attempts_remaining > 0
+    def test_reset_with_task_id(self):
+        """Reset with specific task ID should work."""
+        env = HallucinationGuardEnvironment()
+        obs = env.reset(task_id="task_1_factual_grounding")
+        assert obs is not None
+    def test_reset_clears_previous_state(self):
+        """Multiple resets should produce clean state each time."""
+        env = HallucinationGuardEnvironment()
+        env.reset()
+        obs = env.reset()
+        assert obs.reward == 0.0
+        assert obs.done is False
+class TestEnvironmentStep:
+    """Tests for environment step functionality."""
+    def test_step_returns_observation(self):
+        """Step should return a valid observation."""
+        env = HallucinationGuardEnvironment()
+        env.reset()
+        action = {
+            "answer": "test answer",
+            "confidence": 0.8,
+            "source_quote": "",
+            "reasoning": "",
+            "uncertainty_flags": []
+        }
+        obs = env.step(action)
+        assert obs is not None
+        assert hasattr(obs, 'reward')
+    def test_step_reward_in_valid_range(self):
+        """Step reward should be in [0.0, 1.0] range."""
+        env = HallucinationGuardEnvironment()
+        env.reset()
+        action = {
+            "answer": "test answer",
+            "confidence": 0.5,
+            "source_quote": "",
+            "reasoning": "",
+            "uncertainty_flags": []
+        }
+        obs = env.step(action)
+        assert -1.0 <= obs.reward <= 1.0
+    def test_step_with_high_confidence(self):
+        """Step with high confidence should work."""
+        env = HallucinationGuardEnvironment()
+        env.reset()
+        action = {
+            "answer": "test answer",
+            "confidence": 1.0,
+            "source_quote": "",
+            "reasoning": "",
+            "uncertainty_flags": []
+        }
+        obs = env.step(action)
+        assert obs is not None
+    def test_step_with_low_confidence(self):
+        """Step with low confidence should work."""
+        env = HallucinationGuardEnvironment()
+        env.reset()
+        action = {
+            "answer": "test answer",
+            "confidence": 0.1,
+            "source_quote": "",
+            "reasoning": "",
+            "uncertainty_flags": []
+        }
+        obs = env.step(action)
+        assert obs is not None
+    def test_step_updates_attempts(self):
+        """Step should decrement attempts remaining."""
+        env = HallucinationGuardEnvironment()
+        obs1 = env.reset()
+        action = {
+            "answer": "test",
+            "confidence": 0.5,
+            "source_quote": "",
+            "reasoning": "",
+            "uncertainty_flags": []
+        }
+        obs2 = env.step(action)
+        assert obs2.attempts_remaining < obs1.attempts_remaining
+class TestEnvironmentState:
+    """Tests for environment state functionality."""
+    def test_state_returns_metadata(self):
+        """State should return episode metadata."""
+        env = HallucinationGuardEnvironment()
+        env.reset()
+        state = env.state()
+        assert state is not None
+        assert hasattr(state, 'episode_id') or hasattr(state, 'step_count') or 'episode_id' in state or 'step_count' in state
+    def test_state_tracks_step_count(self):
+        """State should track step count."""
+        env = HallucinationGuardEnvironment()
+        env.reset()
+        action = {
+            "answer": "test",
+            "confidence": 0.5,
+            "source_quote": "",
+            "reasoning": "",
+            "uncertainty_flags": []
+        }
+        env.step(action)
+        state = env.state()
+        # State should reflect that a step was taken
+        assert state is not None
+class TestTaskSelection:
+    """Tests for task selection."""
+    def test_reset_with_task_1(self):
+        """Reset with task_1_factual_grounding should work."""
+        env = HallucinationGuardEnvironment()
+        obs = env.reset(task_id="task_1_factual_grounding")
+        assert obs is not None
+    def test_reset_with_task_2(self):
+        """Reset with task_2_multi_hop_synthesis should work."""
+        env = HallucinationGuardEnvironment()
+        obs = env.reset(task_id="task_2_multi_hop_synthesis")
+        assert obs is not None
+    def test_reset_with_task_3(self):
+        """Reset with task_3_adversarial_resistance should work."""
+        env = HallucinationGuardEnvironment()
+        obs = env.reset(task_id="task_3_adversarial_resistance")
+        assert obs is not None

tests/test_grader.py ADDED Viewed

	@@ -0,0 +1,218 @@

+"""Tests for the HallucinationGrader scoring system."""
+import pytest
+from server.grader import HallucinationGrader
+class TestGraderScoreRange:
+    """Tests that grader returns valid score ranges."""
+    def test_grader_returns_score_in_range(self):
+        """Grader should return score between 0.0 and 1.0."""
+        grader = HallucinationGrader()
+        result = grader.grade(
+            question="What is 2+2?",
+            context="2+2 equals 4.",
+            answer="4",
+            ground_truth="4"
+        )
+        assert 0.0 <= result <= 1.0
+    def test_grader_with_exact_match(self):
+        """Exact match should score high."""
+        grader = HallucinationGrader()
+        result = grader.grade(
+            question="What is the capital of France?",
+            context="The capital of France is Paris.",
+            answer="Paris",
+            ground_truth="Paris"
+        )
+        assert result >= 0.7
+    def test_grader_with_wrong_answer(self):
+        """Wrong answer should score low."""
+        grader = HallucinationGrader()
+        result = grader.grade(
+            question="What is the capital of France?",
+            context="The capital of France is Paris.",
+            answer="London",
+            ground_truth="Paris"
+        )
+        assert result < 0.5
+    def test_grader_with_partial_match(self):
+        """Partial match should score moderately."""
+        grader = HallucinationGrader()
+        result = grader.grade(
+            question="Who wrote Romeo and Juliet?",
+            context="Romeo and Juliet was written by William Shakespeare.",
+            answer="Shakespeare",
+            ground_truth="William Shakespeare"
+        )
+        assert 0.3 <= result <= 0.9
+class TestHallucinationDetection:
+    """Tests for hallucination detection."""
+    def test_detects_fabricated_fact(self):
+        """Grader should detect fabricated facts."""
+        grader = HallucinationGrader()
+        result = grader.grade(
+            question="What is the population of Tokyo?",
+            context="Tokyo is a major city in Japan.",
+            answer="Tokyo has 50 million people.",  # Not in context
+            ground_truth="Not mentioned"
+        )
+        assert result < 0.5
+    def test_detects_false_citation(self):
+        """Grader should detect false citations."""
+        grader = HallucinationGrader()
+        result = grader.grade(
+            question="What color is the sky?",
+            context="The sky appears blue during clear days.",
+            answer="The sky is green.",
+            ground_truth="blue",
+            source_quote="The sky appears green"  # Not in context
+        )
+        assert result < 0.5
+    def test_overconfident_wrong_answer(self):
+        """High confidence on wrong answer should be penalized."""
+        grader = HallucinationGrader()
+        result_confident = grader.grade(
+            question="What is 5+5?",
+            context="Basic arithmetic.",
+            answer="20",  # Wrong
+            ground_truth="10",
+            confidence=0.95  # High confidence
+        )
+        result_uncertain = grader.grade(
+            question="What is 5+5?",
+            context="Basic arithmetic.",
+            answer="20",  # Wrong
+            ground_truth="10",
+            confidence=0.3  # Low confidence
+        )
+        # Confident wrong answer should score lower
+        assert result_confident < result_uncertain
+class TestSourceGrounding:
+    """Tests for source grounding verification."""
+    def test_answer_grounded_in_context(self):
+        """Answer supported by context should score higher."""
+        grader = HallucinationGrader()
+        result_grounded = grader.grade(
+            question="What is Python?",
+            context="Python is a programming language created by Guido van Rossum.",
+            answer="Python is a programming language.",
+            ground_truth="programming language"
+        )
+        assert result_grounded >= 0.5
+    def test_answer_not_in_context(self):
+        """Answer not supported by context should be penalized."""
+        grader = HallucinationGrader()
+        result = grader.grade(
+            question="Who created Python?",
+            context="Python is a programming language.",
+            answer="Guido van Rossum created Python in 1991.",  # Details not in context
+            ground_truth="Not mentioned"
+        )
+        assert result < 0.7
+class TestConfidenceCalibration:
+    """Tests for confidence calibration."""
+    def test_confident_correct_answer(self):
+        """High confidence on correct answer should be rewarded."""
+        grader = HallucinationGrader()
+        result = grader.grade(
+            question="What is 1+1?",
+            context="Basic math.",
+            answer="2",
+            ground_truth="2",
+            confidence=0.95
+        )
+        assert result >= 0.7
+    def test_uncertain_correct_answer(self):
+        """Low confidence on correct answer should be slightly penalized."""
+        grader = HallucinationGrader()
+        result_high_conf = grader.grade(
+            question="What is 1+1?",
+            context="Basic math.",
+            answer="2",
+            ground_truth="2",
+            confidence=0.95
+        )
+        result_low_conf = grader.grade(
+            question="What is 1+1?",
+            context="Basic math.",
+            answer="2",
+            ground_truth="2",
+            confidence=0.3
+        )
+        # High confidence on correct answer should score higher
+        assert result_high_conf >= result_low_conf
+class TestGraderDeterminism:
+    """Tests for grader determinism."""
+    def test_grader_is_deterministic(self):
+        """Same inputs should produce same output."""
+        grader = HallucinationGrader()
+        result1 = grader.grade(
+            question="What is the capital of France?",
+            context="The capital of France is Paris.",
+            answer="Paris",
+            ground_truth="Paris"
+        )
+        result2 = grader.grade(
+            question="What is the capital of France?",
+            context="The capital of France is Paris.",
+            answer="Paris",
+            ground_truth="Paris"
+        )
+        assert result1 == result2
+    def test_grader_handles_empty_answer(self):
+        """Grader should handle empty answer gracefully."""
+        grader = HallucinationGrader()
+        result = grader.grade(
+            question="What is the capital of France?",
+            context="The capital of France is Paris.",
+            answer="",
+            ground_truth="Paris"
+        )
+        assert 0.0 <= result <= 1.0
+    def test_grader_handles_empty_context(self):
+        """Grader should handle empty context gracefully."""
+        grader = HallucinationGrader()
+        result = grader.grade(
+            question="What is the capital of France?",
+            context="",
+            answer="Paris",
+            ground_truth="Paris"
+        )
+        assert 0.0 <= result <= 1.0