Upload folder using huggingface_hub
Browse files- tests/__init__.py +1 -0
- tests/test_dataset_loader.py +135 -0
- tests/test_environment.py +196 -0
- tests/test_grader.py +218 -0
tests/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# HallucinationGuard-Env Test Suite
|
tests/test_dataset_loader.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for the DatasetLoader."""
|
| 2 |
+
import pytest
|
| 3 |
+
from server.dataset_loader import DatasetLoader
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class TestDatasetLoaderInit:
    """Smoke tests for constructing a DatasetLoader."""

    def test_loader_initializes(self):
        """Building a DatasetLoader with no arguments yields an object."""
        assert DatasetLoader() is not None
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class TestDatasetSampling:
    """Checks on the samples returned by ``DatasetLoader.get_sample``."""

    def test_get_sample_returns_data(self):
        """A task-1 sample exists and exposes a ``question`` field."""
        sample = DatasetLoader().get_sample("task_1_factual_grounding")

        assert sample is not None
        # Accept either attribute-style or mapping-style samples.
        if not hasattr(sample, "question"):
            assert "question" in sample

    def test_get_sample_has_context(self):
        """A task-1 sample exposes a ``context`` field."""
        sample = DatasetLoader().get_sample("task_1_factual_grounding")

        if not hasattr(sample, "context"):
            assert "context" in sample

    def test_get_sample_has_ground_truth(self):
        """A task-1 sample exposes a ``ground_truth`` field."""
        sample = DatasetLoader().get_sample("task_1_factual_grounding")

        if not hasattr(sample, "ground_truth"):
            assert "ground_truth" in sample

    def test_get_sample_for_task_1(self):
        """Sampling the factual-grounding task returns something."""
        assert DatasetLoader().get_sample("task_1_factual_grounding") is not None

    def test_get_sample_for_task_2(self):
        """Sampling the multi-hop-synthesis task returns something."""
        assert DatasetLoader().get_sample("task_2_multi_hop_synthesis") is not None

    def test_get_sample_for_task_3(self):
        """Sampling the adversarial-resistance task returns something."""
        assert DatasetLoader().get_sample("task_3_adversarial_resistance") is not None
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
class TestDatasetStats:
    """Tests for the optional statistics accessors on DatasetLoader.

    Both accessors are treated as optional. When one is absent the test is
    reported as skipped rather than silently passing without checking
    anything.
    """

    def test_get_available_datasets(self):
        """``get_available_datasets``, when present, returns a non-empty list."""
        loader = DatasetLoader()
        if not hasattr(loader, "get_available_datasets"):
            pytest.skip("DatasetLoader does not provide get_available_datasets")

        datasets = loader.get_available_datasets()
        assert isinstance(datasets, list)
        assert len(datasets) > 0

    def test_get_dataset_size(self):
        """``get_dataset_size``, when present, returns ``None`` or an int."""
        loader = DatasetLoader()
        if not hasattr(loader, "get_dataset_size"):
            pytest.skip("DatasetLoader does not provide get_dataset_size")

        size = loader.get_dataset_size("task_1_factual_grounding")
        assert size is None or isinstance(size, int)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class TestDatasetCaching:
    """Tests for repeated sampling and for requests naming an unknown dataset."""

    def test_multiple_samples_dont_crash(self):
        """Five consecutive ``get_sample`` calls on one loader all complete."""
        loader = DatasetLoader()

        samples = [
            loader.get_sample("task_1_factual_grounding") for _ in range(5)
        ]

        assert len(samples) == 5

    def test_loader_handles_missing_dataset(self):
        """An unknown dataset name either returns or raises a descriptive error.

        Any return value is accepted; the previous
        ``sample is None or sample is not None`` assertion was a tautology
        and has been removed. If the loader raises instead, the message must
        mention "not found", "invalid" or "error".
        """
        loader = DatasetLoader()

        try:
            loader.get_sample("nonexistent_dataset")
        except Exception as exc:  # deliberately broad: only the message is checked
            message = str(exc).lower()
            assert any(
                marker in message for marker in ("not found", "invalid", "error")
            ), f"unhelpful error for missing dataset: {exc!r}"
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
class TestTaskDifficulty:
    """Tests for the optional ``get_task_difficulty`` mapping on DatasetLoader."""

    @staticmethod
    def _difficulty_or_skip(task_id):
        """Return the loader's difficulty for *task_id*, or skip the test.

        The test is skipped (not silently passed) when the loader has no
        ``get_task_difficulty`` attribute.
        """
        loader = DatasetLoader()
        if not hasattr(loader, "get_task_difficulty"):
            pytest.skip("DatasetLoader does not provide get_task_difficulty")
        return loader.get_task_difficulty(task_id)

    def test_task_1_is_beginner(self):
        """Task 1 should map to a beginner-level difficulty value."""
        difficulty = self._difficulty_or_skip("task_1_factual_grounding")
        assert difficulty in ["beginner", "easy", 1, "1"]

    def test_task_2_is_intermediate(self):
        """Task 2 should map to an intermediate-level difficulty value."""
        difficulty = self._difficulty_or_skip("task_2_multi_hop_synthesis")
        assert difficulty in ["intermediate", "medium", 2, "2"]

    def test_task_3_is_advanced(self):
        """Task 3 should map to an advanced-level difficulty value."""
        difficulty = self._difficulty_or_skip("task_3_adversarial_resistance")
        assert difficulty in ["advanced", "hard", 3, "3"]
|
tests/test_environment.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for the HallucinationGuard environment."""
|
| 2 |
+
import pytest
|
| 3 |
+
from server.environment import HallucinationGuardEnvironment
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class TestEnvironmentReset:
    """Checks on the observation returned by ``reset``."""

    def test_reset_returns_observation(self):
        """The reset observation carries the four fields the tests rely on."""
        obs = HallucinationGuardEnvironment().reset()

        assert obs is not None
        for field in ("question", "context", "reward", "done"):
            assert hasattr(obs, field)

    def test_reset_sets_initial_reward_to_zero(self):
        """A fresh episode starts with a reward of 0.0."""
        obs = HallucinationGuardEnvironment().reset()

        assert obs.reward == 0.0

    def test_reset_sets_done_to_false(self):
        """A fresh episode is not finished."""
        obs = HallucinationGuardEnvironment().reset()

        assert obs.done is False

    def test_reset_provides_attempts_remaining(self):
        """A fresh episode reports a positive number of attempts remaining."""
        obs = HallucinationGuardEnvironment().reset()

        assert obs.attempts_remaining > 0

    def test_reset_with_task_id(self):
        """Passing an explicit ``task_id`` to reset yields an observation."""
        environment = HallucinationGuardEnvironment()

        assert environment.reset(task_id="task_1_factual_grounding") is not None

    def test_reset_clears_previous_state(self):
        """A second reset on the same environment starts clean again."""
        environment = HallucinationGuardEnvironment()
        environment.reset()
        second_obs = environment.reset()

        assert second_obs.reward == 0.0
        assert second_obs.done is False
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class TestEnvironmentStep:
    """Tests for the observation returned by ``step``."""

    @staticmethod
    def _make_action(answer, confidence):
        """Build the action payload these tests pass to ``env.step``.

        Only *answer* and *confidence* vary between tests; the remaining
        fields are always empty.
        """
        return {
            "answer": answer,
            "confidence": confidence,
            "source_quote": "",
            "reasoning": "",
            "uncertainty_flags": [],
        }

    def test_step_returns_observation(self):
        """Step returns an observation that has a ``reward`` attribute."""
        env = HallucinationGuardEnvironment()
        env.reset()

        obs = env.step(self._make_action("test answer", 0.8))

        assert obs is not None
        assert hasattr(obs, "reward")

    def test_step_reward_in_valid_range(self):
        """Step reward should lie in the [-1.0, 1.0] range.

        NOTE(review): the docstring previously said [0.0, 1.0] while the
        assertion allowed negative rewards. The documentation now matches
        the assertion; confirm which bound the environment guarantees.
        """
        env = HallucinationGuardEnvironment()
        env.reset()

        obs = env.step(self._make_action("test answer", 0.5))

        assert -1.0 <= obs.reward <= 1.0

    def test_step_with_high_confidence(self):
        """Step accepts an action with confidence 1.0."""
        env = HallucinationGuardEnvironment()
        env.reset()

        obs = env.step(self._make_action("test answer", 1.0))

        assert obs is not None

    def test_step_with_low_confidence(self):
        """Step accepts an action with confidence 0.1."""
        env = HallucinationGuardEnvironment()
        env.reset()

        obs = env.step(self._make_action("test answer", 0.1))

        assert obs is not None

    def test_step_updates_attempts(self):
        """Attempts remaining after a step is lower than after reset."""
        env = HallucinationGuardEnvironment()
        obs1 = env.reset()

        obs2 = env.step(self._make_action("test", 0.5))

        assert obs2.attempts_remaining < obs1.attempts_remaining
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
class TestEnvironmentState:
    """Checks on the value returned by ``env.state()``."""

    def test_state_returns_metadata(self):
        """State exposes ``episode_id`` or ``step_count`` (attribute or key)."""
        environment = HallucinationGuardEnvironment()
        environment.reset()
        state = environment.state()

        assert state is not None
        metadata_keys = ("episode_id", "step_count")
        # Attribute access is tried first, then mapping-style membership.
        as_attribute = any(hasattr(state, key) for key in metadata_keys)
        assert as_attribute or any(key in state for key in metadata_keys)

    def test_state_tracks_step_count(self):
        """State is still available after one step has been taken."""
        environment = HallucinationGuardEnvironment()
        environment.reset()

        environment.step(
            {
                "answer": "test",
                "confidence": 0.5,
                "source_quote": "",
                "reasoning": "",
                "uncertainty_flags": [],
            }
        )
        state = environment.state()

        # NOTE(review): only the presence of a state is checked here; the
        # step count itself is not asserted.
        assert state is not None
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
class TestTaskSelection:
    """Checks that reset accepts each of the three task identifiers."""

    def test_reset_with_task_1(self):
        """Reset with ``task_1_factual_grounding`` yields an observation."""
        environment = HallucinationGuardEnvironment()

        assert environment.reset(task_id="task_1_factual_grounding") is not None

    def test_reset_with_task_2(self):
        """Reset with ``task_2_multi_hop_synthesis`` yields an observation."""
        environment = HallucinationGuardEnvironment()

        assert environment.reset(task_id="task_2_multi_hop_synthesis") is not None

    def test_reset_with_task_3(self):
        """Reset with ``task_3_adversarial_resistance`` yields an observation."""
        environment = HallucinationGuardEnvironment()

        assert environment.reset(task_id="task_3_adversarial_resistance") is not None
|
tests/test_grader.py
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for the HallucinationGrader scoring system."""
|
| 2 |
+
import pytest
|
| 3 |
+
from server.grader import HallucinationGrader
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class TestGraderScoreRange:
    """Checks the score bands ``HallucinationGrader.grade`` is expected to hit."""

    def test_grader_returns_score_in_range(self):
        """A trivially correct answer is scored within [0.0, 1.0]."""
        sample = {
            "question": "What is 2+2?",
            "context": "2+2 equals 4.",
            "answer": "4",
            "ground_truth": "4",
        }
        score = HallucinationGrader().grade(**sample)

        assert 0.0 <= score <= 1.0

    def test_grader_with_exact_match(self):
        """An answer identical to the ground truth scores at least 0.7."""
        sample = {
            "question": "What is the capital of France?",
            "context": "The capital of France is Paris.",
            "answer": "Paris",
            "ground_truth": "Paris",
        }
        score = HallucinationGrader().grade(**sample)

        assert score >= 0.7

    def test_grader_with_wrong_answer(self):
        """An answer that contradicts the ground truth scores below 0.5."""
        sample = {
            "question": "What is the capital of France?",
            "context": "The capital of France is Paris.",
            "answer": "London",
            "ground_truth": "Paris",
        }
        score = HallucinationGrader().grade(**sample)

        assert score < 0.5

    def test_grader_with_partial_match(self):
        """An answer covering part of the ground truth lands in [0.3, 0.9]."""
        sample = {
            "question": "Who wrote Romeo and Juliet?",
            "context": "Romeo and Juliet was written by William Shakespeare.",
            "answer": "Shakespeare",
            "ground_truth": "William Shakespeare",
        }
        score = HallucinationGrader().grade(**sample)

        assert 0.3 <= score <= 0.9
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class TestHallucinationDetection:
    """Checks that unsupported or miscited answers receive low scores."""

    def test_detects_fabricated_fact(self):
        """An answer stating a figure absent from the context scores below 0.5."""
        sample = {
            "question": "What is the population of Tokyo?",
            "context": "Tokyo is a major city in Japan.",
            "answer": "Tokyo has 50 million people.",  # Not in context
            "ground_truth": "Not mentioned",
        }
        score = HallucinationGrader().grade(**sample)

        assert score < 0.5

    def test_detects_false_citation(self):
        """A wrong answer backed by a quote not in the context scores below 0.5."""
        sample = {
            "question": "What color is the sky?",
            "context": "The sky appears blue during clear days.",
            "answer": "The sky is green.",
            "ground_truth": "blue",
            "source_quote": "The sky appears green",  # Not in context
        }
        score = HallucinationGrader().grade(**sample)

        assert score < 0.5

    def test_overconfident_wrong_answer(self):
        """The same wrong answer scores lower at confidence 0.95 than at 0.3."""
        grader = HallucinationGrader()
        wrong_answer = {
            "question": "What is 5+5?",
            "context": "Basic arithmetic.",
            "answer": "20",  # Wrong
            "ground_truth": "10",
        }

        score_confident = grader.grade(**wrong_answer, confidence=0.95)
        score_uncertain = grader.grade(**wrong_answer, confidence=0.3)

        # Confident wrong answer should score lower
        assert score_confident < score_uncertain
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
class TestSourceGrounding:
    """Checks how context support for an answer affects its score."""

    def test_answer_grounded_in_context(self):
        """An answer restating the context scores at least 0.5."""
        sample = {
            "question": "What is Python?",
            "context": "Python is a programming language created by Guido van Rossum.",
            "answer": "Python is a programming language.",
            "ground_truth": "programming language",
        }
        score = HallucinationGrader().grade(**sample)

        assert score >= 0.5

    def test_answer_not_in_context(self):
        """An answer adding details absent from the context scores below 0.7."""
        sample = {
            "question": "Who created Python?",
            "context": "Python is a programming language.",
            # Details not in context
            "answer": "Guido van Rossum created Python in 1991.",
            "ground_truth": "Not mentioned",
        }
        score = HallucinationGrader().grade(**sample)

        assert score < 0.7
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
class TestConfidenceCalibration:
    """Checks how stated confidence affects the score of a correct answer."""

    def test_confident_correct_answer(self):
        """A correct answer given at confidence 0.95 scores at least 0.7."""
        score = HallucinationGrader().grade(
            question="What is 1+1?",
            context="Basic math.",
            answer="2",
            ground_truth="2",
            confidence=0.95,
        )

        assert score >= 0.7

    def test_uncertain_correct_answer(self):
        """A correct answer at confidence 0.95 scores no lower than at 0.3."""
        grader = HallucinationGrader()
        correct_answer = {
            "question": "What is 1+1?",
            "context": "Basic math.",
            "answer": "2",
            "ground_truth": "2",
        }

        score_high_conf = grader.grade(**correct_answer, confidence=0.95)
        score_low_conf = grader.grade(**correct_answer, confidence=0.3)

        # High confidence on correct answer should score higher
        assert score_high_conf >= score_low_conf
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
class TestGraderDeterminism:
    """Checks repeatability of grading and tolerance of empty inputs."""

    def test_grader_is_deterministic(self):
        """Grading identical inputs twice on one grader gives equal results."""
        grader = HallucinationGrader()
        sample = {
            "question": "What is the capital of France?",
            "context": "The capital of France is Paris.",
            "answer": "Paris",
            "ground_truth": "Paris",
        }

        first = grader.grade(**sample)
        second = grader.grade(**sample)

        assert first == second

    def test_grader_handles_empty_answer(self):
        """An empty answer string still yields a score within [0.0, 1.0]."""
        score = HallucinationGrader().grade(
            question="What is the capital of France?",
            context="The capital of France is Paris.",
            answer="",
            ground_truth="Paris",
        )

        assert 0.0 <= score <= 1.0

    def test_grader_handles_empty_context(self):
        """An empty context string still yields a score within [0.0, 1.0]."""
        score = HallucinationGrader().grade(
            question="What is the capital of France?",
            context="",
            answer="Paris",
            ground_truth="Paris",
        )

        assert 0.0 <= score <= 1.0
|