SamSankar committed on
Commit
c7b7c5e
·
verified ·
1 Parent(s): 4c72eb0

Upload folder using huggingface_hub

Browse files
tests/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # HallucinationGuard-Env Test Suite
tests/test_dataset_loader.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the DatasetLoader."""
2
+ import pytest
3
+ from server.dataset_loader import DatasetLoader
4
+
5
+
6
class TestDatasetLoaderInit:
    """Smoke checks for constructing a DatasetLoader."""

    def test_loader_initializes(self):
        """A DatasetLoader built with no arguments yields a non-None object."""
        # Construction itself is the real check: any exception fails the test.
        assert DatasetLoader() is not None
13
+
14
+
15
class TestDatasetSampling:
    """Tests for drawing samples from the DatasetLoader."""

    @staticmethod
    def _has_field(sample, name):
        """Return True if *sample* exposes *name* as an attribute or as a key.

        The sample type is not pinned down by these tests, so both access
        styles are accepted. Objects that do not support the ``in`` operator
        are reported as lacking the field instead of raising ``TypeError``,
        so a missing field surfaces as a normal assertion failure.
        """
        if hasattr(sample, name):
            return True
        try:
            return name in sample
        except TypeError:
            return False

    def test_get_sample_returns_data(self):
        """get_sample should return a sample carrying a question."""
        loader = DatasetLoader()
        sample = loader.get_sample("task_1_factual_grounding")

        assert sample is not None
        assert self._has_field(sample, 'question'), "sample has no 'question'"

    def test_get_sample_has_context(self):
        """Sample should include context."""
        loader = DatasetLoader()
        sample = loader.get_sample("task_1_factual_grounding")

        assert self._has_field(sample, 'context'), "sample has no 'context'"

    def test_get_sample_has_ground_truth(self):
        """Sample should include ground truth."""
        loader = DatasetLoader()
        sample = loader.get_sample("task_1_factual_grounding")

        assert self._has_field(sample, 'ground_truth'), (
            "sample has no 'ground_truth'"
        )

    def test_get_sample_for_task_1(self):
        """get_sample for task_1 should return a sample."""
        loader = DatasetLoader()
        sample = loader.get_sample("task_1_factual_grounding")

        assert sample is not None

    def test_get_sample_for_task_2(self):
        """get_sample for task_2 should return a sample."""
        loader = DatasetLoader()
        sample = loader.get_sample("task_2_multi_hop_synthesis")

        assert sample is not None

    def test_get_sample_for_task_3(self):
        """get_sample for task_3 should return a sample."""
        loader = DatasetLoader()
        sample = loader.get_sample("task_3_adversarial_resistance")

        assert sample is not None
60
+
61
+
62
class TestDatasetStats:
    """Tests for the optional dataset statistics helpers."""

    def test_get_available_datasets(self):
        """get_available_datasets, when provided, returns a non-empty list."""
        loader = DatasetLoader()

        # Report a skip instead of a silent pass when the optional API is
        # missing, so the test report shows that nothing was verified.
        if not hasattr(loader, 'get_available_datasets'):
            pytest.skip("DatasetLoader has no get_available_datasets()")

        datasets = loader.get_available_datasets()
        assert isinstance(datasets, list)
        assert len(datasets) > 0

    def test_get_dataset_size(self):
        """get_dataset_size, when provided, returns None or an int."""
        loader = DatasetLoader()

        if not hasattr(loader, 'get_dataset_size'):
            pytest.skip("DatasetLoader has no get_dataset_size()")

        size = loader.get_dataset_size("task_1_factual_grounding")
        assert size is None or isinstance(size, int)
81
+
82
+
83
class TestDatasetCaching:
    """Tests for repeated sampling and unknown-dataset handling."""

    def test_multiple_samples_dont_crash(self):
        """Requesting several samples in a row should not raise."""
        loader = DatasetLoader()

        samples = [
            loader.get_sample("task_1_factual_grounding") for _ in range(5)
        ]

        assert len(samples) == 5

    def test_loader_handles_missing_dataset(self):
        """Loader should handle a request for an unknown dataset gracefully.

        Any return value is accepted. If the loader raises instead, the
        message must contain "not found", "invalid" or "error".
        """
        loader = DatasetLoader()

        try:
            loader.get_sample("nonexistent_dataset")
        except Exception as exc:
            # The concrete exception type is not specified by the loader's
            # contract as seen from here, so only the message is checked.
            message = str(exc).lower()
            assert any(
                marker in message
                for marker in ("not found", "invalid", "error")
            ), f"unhelpful error for missing dataset: {exc!r}"
108
+
109
+
110
class TestTaskDifficulty:
    """Tests for the optional task-to-difficulty mapping."""

    @staticmethod
    def _difficulty_of(task_id):
        """Return the loader's difficulty for *task_id*, or skip the test.

        get_task_difficulty is treated as optional. When it is absent the
        calling test is reported as skipped instead of silently passing.
        """
        loader = DatasetLoader()
        if not hasattr(loader, 'get_task_difficulty'):
            pytest.skip("DatasetLoader has no get_task_difficulty()")
        return loader.get_task_difficulty(task_id)

    def test_task_1_is_beginner(self):
        """Task 1 should map to beginner difficulty."""
        difficulty = self._difficulty_of("task_1_factual_grounding")
        assert difficulty in ["beginner", "easy", 1, "1"]

    def test_task_2_is_intermediate(self):
        """Task 2 should map to intermediate difficulty."""
        difficulty = self._difficulty_of("task_2_multi_hop_synthesis")
        assert difficulty in ["intermediate", "medium", 2, "2"]

    def test_task_3_is_advanced(self):
        """Task 3 should map to advanced difficulty."""
        difficulty = self._difficulty_of("task_3_adversarial_resistance")
        assert difficulty in ["advanced", "hard", 3, "3"]
tests/test_environment.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the HallucinationGuard environment."""
2
+ import pytest
3
+ from server.environment import HallucinationGuardEnvironment
4
+
5
+
6
class TestEnvironmentReset:
    """Checks on the observation produced by HallucinationGuardEnvironment.reset."""

    def test_reset_returns_observation(self):
        """The reset observation exposes question, context, reward and done."""
        observation = HallucinationGuardEnvironment().reset()

        assert observation is not None
        for field in ('question', 'context', 'reward', 'done'):
            assert hasattr(observation, field)

    def test_reset_sets_initial_reward_to_zero(self):
        """A fresh episode starts with a reward of exactly 0.0."""
        observation = HallucinationGuardEnvironment().reset()

        assert observation.reward == 0.0

    def test_reset_sets_done_to_false(self):
        """A fresh episode is not finished."""
        observation = HallucinationGuardEnvironment().reset()

        assert observation.done is False

    def test_reset_provides_attempts_remaining(self):
        """A fresh episode reports a positive number of attempts."""
        observation = HallucinationGuardEnvironment().reset()

        assert observation.attempts_remaining > 0

    def test_reset_with_task_id(self):
        """reset accepts an explicit task_id keyword."""
        environment = HallucinationGuardEnvironment()

        assert environment.reset(task_id="task_1_factual_grounding") is not None

    def test_reset_clears_previous_state(self):
        """Resetting twice still yields zero reward and done == False."""
        environment = HallucinationGuardEnvironment()
        environment.reset()
        second = environment.reset()

        assert second.reward == 0.0
        assert second.done is False
56
+
57
+
58
class TestEnvironmentStep:
    """Tests for environment step functionality."""

    @staticmethod
    def _make_action(answer="test answer", confidence=0.5):
        """Build an action dict with empty quote, reasoning and flags.

        Only the answer text and the confidence vary between tests; every
        other field is left empty.
        """
        return {
            "answer": answer,
            "confidence": confidence,
            "source_quote": "",
            "reasoning": "",
            "uncertainty_flags": [],
        }

    def test_step_returns_observation(self):
        """Step should return an observation that carries a reward."""
        env = HallucinationGuardEnvironment()
        env.reset()

        obs = env.step(self._make_action(confidence=0.8))

        assert obs is not None
        assert hasattr(obs, 'reward')

    def test_step_reward_in_valid_range(self):
        """Step reward should be in the [-1.0, 1.0] range.

        NOTE(review): the previous docstring said [0.0, 1.0] while the
        assertion allowed negative rewards. The assertion is kept as is;
        confirm against the environment whether negative rewards are valid.
        """
        env = HallucinationGuardEnvironment()
        env.reset()

        obs = env.step(self._make_action(confidence=0.5))

        assert -1.0 <= obs.reward <= 1.0

    def test_step_with_high_confidence(self):
        """Step with maximum confidence should return an observation."""
        env = HallucinationGuardEnvironment()
        env.reset()

        obs = env.step(self._make_action(confidence=1.0))

        assert obs is not None

    def test_step_with_low_confidence(self):
        """Step with low confidence should return an observation."""
        env = HallucinationGuardEnvironment()
        env.reset()

        obs = env.step(self._make_action(confidence=0.1))

        assert obs is not None

    def test_step_updates_attempts(self):
        """Step should decrement attempts remaining."""
        env = HallucinationGuardEnvironment()
        obs1 = env.reset()

        obs2 = env.step(self._make_action(answer="test", confidence=0.5))

        assert obs2.attempts_remaining < obs1.attempts_remaining
141
+
142
+
143
class TestEnvironmentState:
    """Checks on the value returned by HallucinationGuardEnvironment.state()."""

    def test_state_returns_metadata(self):
        """state() exposes episode_id or step_count, as attribute or key."""
        environment = HallucinationGuardEnvironment()
        environment.reset()
        snapshot = environment.state()

        assert snapshot is not None
        names = ('episode_id', 'step_count')
        # Attribute access is tried for both names before key membership.
        assert (
            any(hasattr(snapshot, name) for name in names)
            or any(name in snapshot for name in names)
        )

    def test_state_tracks_step_count(self):
        """state() is still available after one step has been taken.

        NOTE(review): only non-None is asserted; the step count itself is
        not inspected here.
        """
        environment = HallucinationGuardEnvironment()
        environment.reset()

        environment.step(dict(
            answer="test",
            confidence=0.5,
            source_quote="",
            reasoning="",
            uncertainty_flags=[],
        ))
        snapshot = environment.state()

        assert snapshot is not None
172
+
173
+
174
class TestTaskSelection:
    """Checks that reset accepts each of the three task identifiers."""

    def test_reset_with_task_1(self):
        """reset(task_id="task_1_factual_grounding") yields an observation."""
        environment = HallucinationGuardEnvironment()

        assert environment.reset(task_id="task_1_factual_grounding") is not None

    def test_reset_with_task_2(self):
        """reset(task_id="task_2_multi_hop_synthesis") yields an observation."""
        environment = HallucinationGuardEnvironment()

        assert environment.reset(task_id="task_2_multi_hop_synthesis") is not None

    def test_reset_with_task_3(self):
        """reset(task_id="task_3_adversarial_resistance") yields an observation."""
        environment = HallucinationGuardEnvironment()

        assert environment.reset(task_id="task_3_adversarial_resistance") is not None
tests/test_grader.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the HallucinationGrader scoring system."""
2
+ import pytest
3
+ from server.grader import HallucinationGrader
4
+
5
+
6
class TestGraderScoreRange:
    """Checks the score HallucinationGrader.grade returns for simple cases."""

    def test_grader_returns_score_in_range(self):
        """A trivial correct answer is scored within [0.0, 1.0]."""
        inputs = {
            "question": "What is 2+2?",
            "context": "2+2 equals 4.",
            "answer": "4",
            "ground_truth": "4",
        }
        score = HallucinationGrader().grade(**inputs)

        assert 0.0 <= score <= 1.0

    def test_grader_with_exact_match(self):
        """An answer identical to the ground truth scores at least 0.7."""
        inputs = {
            "question": "What is the capital of France?",
            "context": "The capital of France is Paris.",
            "answer": "Paris",
            "ground_truth": "Paris",
        }
        score = HallucinationGrader().grade(**inputs)

        assert score >= 0.7

    def test_grader_with_wrong_answer(self):
        """An answer contradicting the ground truth scores below 0.5."""
        inputs = {
            "question": "What is the capital of France?",
            "context": "The capital of France is Paris.",
            "answer": "London",
            "ground_truth": "Paris",
        }
        score = HallucinationGrader().grade(**inputs)

        assert score < 0.5

    def test_grader_with_partial_match(self):
        """An answer covering part of the ground truth scores in [0.3, 0.9]."""
        inputs = {
            "question": "Who wrote Romeo and Juliet?",
            "context": "Romeo and Juliet was written by William Shakespeare.",
            "answer": "Shakespeare",
            "ground_truth": "William Shakespeare",
        }
        score = HallucinationGrader().grade(**inputs)

        assert 0.3 <= score <= 0.9
56
+
57
+
58
class TestHallucinationDetection:
    """Checks that unsupported or miscited answers receive low scores."""

    def test_detects_fabricated_fact(self):
        """An answer stating a figure absent from the context scores below 0.5."""
        inputs = {
            "question": "What is the population of Tokyo?",
            "context": "Tokyo is a major city in Japan.",
            # The population figure does not appear in the context.
            "answer": "Tokyo has 50 million people.",
            "ground_truth": "Not mentioned",
        }
        score = HallucinationGrader().grade(**inputs)

        assert score < 0.5

    def test_detects_false_citation(self):
        """A source quote that is not in the context scores below 0.5."""
        inputs = {
            "question": "What color is the sky?",
            "context": "The sky appears blue during clear days.",
            "answer": "The sky is green.",
            "ground_truth": "blue",
            # The quoted text does not appear in the context.
            "source_quote": "The sky appears green",
        }
        score = HallucinationGrader().grade(**inputs)

        assert score < 0.5

    def test_overconfident_wrong_answer(self):
        """A wrong answer at confidence 0.95 scores below the same at 0.3."""
        wrong_answer = {
            "question": "What is 5+5?",
            "context": "Basic arithmetic.",
            "answer": "20",
            "ground_truth": "10",
        }
        grader = HallucinationGrader()

        # Same grader instance, high-confidence call first.
        score_confident = grader.grade(confidence=0.95, **wrong_answer)
        score_uncertain = grader.grade(confidence=0.3, **wrong_answer)

        assert score_confident < score_uncertain
106
+
107
+
108
class TestSourceGrounding:
    """Checks how the score reflects support for the answer in the context."""

    def test_answer_grounded_in_context(self):
        """An answer restating the context scores at least 0.5."""
        inputs = {
            "question": "What is Python?",
            "context": "Python is a programming language created by Guido van Rossum.",
            "answer": "Python is a programming language.",
            "ground_truth": "programming language",
        }
        score = HallucinationGrader().grade(**inputs)

        assert score >= 0.5

    def test_answer_not_in_context(self):
        """An answer adding details absent from the context scores below 0.7."""
        inputs = {
            "question": "Who created Python?",
            "context": "Python is a programming language.",
            # Creator and year are not present in the context.
            "answer": "Guido van Rossum created Python in 1991.",
            "ground_truth": "Not mentioned",
        }
        score = HallucinationGrader().grade(**inputs)

        assert score < 0.7
134
+
135
+
136
class TestConfidenceCalibration:
    """Checks how stated confidence interacts with a correct answer."""

    def test_confident_correct_answer(self):
        """A correct answer given at confidence 0.95 scores at least 0.7."""
        inputs = {
            "question": "What is 1+1?",
            "context": "Basic math.",
            "answer": "2",
            "ground_truth": "2",
            "confidence": 0.95,
        }
        score = HallucinationGrader().grade(**inputs)

        assert score >= 0.7

    def test_uncertain_correct_answer(self):
        """A correct answer at confidence 0.95 scores no lower than at 0.3."""
        correct_answer = {
            "question": "What is 1+1?",
            "context": "Basic math.",
            "answer": "2",
            "ground_truth": "2",
        }
        grader = HallucinationGrader()

        # Same grader instance, high-confidence call first.
        score_high = grader.grade(confidence=0.95, **correct_answer)
        score_low = grader.grade(confidence=0.3, **correct_answer)

        assert score_high >= score_low
172
+
173
+
174
class TestGraderDeterminism:
    """Checks repeatability and empty-input handling of the grader."""

    def test_grader_is_deterministic(self):
        """Grading identical inputs twice on one grader gives equal scores."""
        inputs = {
            "question": "What is the capital of France?",
            "context": "The capital of France is Paris.",
            "answer": "Paris",
            "ground_truth": "Paris",
        }
        grader = HallucinationGrader()

        first = grader.grade(**inputs)
        second = grader.grade(**inputs)

        assert first == second

    def test_grader_handles_empty_answer(self):
        """An empty answer is still scored within [0.0, 1.0]."""
        inputs = {
            "question": "What is the capital of France?",
            "context": "The capital of France is Paris.",
            "answer": "",
            "ground_truth": "Paris",
        }
        score = HallucinationGrader().grade(**inputs)

        assert 0.0 <= score <= 1.0

    def test_grader_handles_empty_context(self):
        """An empty context is still scored within [0.0, 1.0]."""
        inputs = {
            "question": "What is the capital of France?",
            "context": "",
            "answer": "Paris",
            "ground_truth": "Paris",
        }
        score = HallucinationGrader().grade(**inputs)

        assert 0.0 <= score <= 1.0
217
+
218
+ assert 0.0 <= result <= 1.0