Naman Gupta committed on
Commit
c7a9ff1
·
1 Parent(s): 5ebe298

add 59 tests for the LLM modules — all run offline, no API needed

Browse files

Covers everything: conversation memory, attack classifier, defense
scorer, defender fallbacks, safety flags, multi-turn continuity,
boundary values, and the episode grader. Mocked Groq calls so
the test suite is fast and doesn't burn API quota.

Files changed (1) hide show
  1. tests/test_llm.py +588 -0
tests/test_llm.py ADDED
@@ -0,0 +1,588 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tests for llm/ modules.
3
+ All Groq API calls are mocked — tests run fully offline.
4
+ """
5
+ import pytest
6
+ from unittest.mock import MagicMock, patch
7
+
8
+
9
+ # ------------------------------------------------------------------ #
10
+ # Helpers
11
+ # ------------------------------------------------------------------ #
12
+
13
+ def _make_groq_response(content: str) -> MagicMock:
14
+ """Build a minimal mock that looks like an OpenAI chat completion."""
15
+ choice = MagicMock()
16
+ choice.message.content = content
17
+ resp = MagicMock()
18
+ resp.choices = [choice]
19
+ return resp
20
+
21
+
22
+ # ------------------------------------------------------------------ #
23
+ # ConversationManager
24
+ # ------------------------------------------------------------------ #
25
+
26
+ class TestConversationManager:
27
+ def setup_method(self):
28
+ from llm.history_manager import ConversationManager
29
+ self.mgr = ConversationManager()
30
+
31
+ def test_starts_empty(self):
32
+ assert self.mgr.turn == 0
33
+ assert self.mgr.get_messages("sys") == [{"role": "system", "content": "sys"}]
34
+
35
+ def test_add_user_and_assistant(self):
36
+ self.mgr.add_user("attack")
37
+ self.mgr.add_assistant("refused")
38
+ assert self.mgr.turn == 1
39
+ msgs = self.mgr.get_messages("sys")
40
+ assert msgs[1] == {"role": "user", "content": "attack"}
41
+ assert msgs[2] == {"role": "assistant", "content": "refused"}
42
+
43
+ def test_reset_clears_history(self):
44
+ self.mgr.add_user("attack")
45
+ self.mgr.add_assistant("refused")
46
+ self.mgr.reset()
47
+ assert self.mgr.turn == 0
48
+ assert len(self.mgr.get_messages("sys")) == 1 # only system prompt
49
+
50
+ def test_turn_counts_only_user_messages(self):
51
+ self.mgr.add_user("a")
52
+ self.mgr.add_user("b")
53
+ self.mgr.add_assistant("r")
54
+ assert self.mgr.turn == 2
55
+
56
+ def test_to_transcript(self):
57
+ self.mgr.add_user("hello")
58
+ self.mgr.add_assistant("refused")
59
+ transcript = self.mgr.to_transcript()
60
+ assert "ATTACKER: hello" in transcript
61
+ assert "DEFENDER: refused" in transcript
62
+
63
+ def test_multi_turn_message_order(self):
64
+ for i in range(3):
65
+ self.mgr.add_user(f"attack {i}")
66
+ self.mgr.add_assistant(f"refuse {i}")
67
+ msgs = self.mgr.get_messages("sys")
68
+ # system prompt + 3 turns x 2 messages (user + assistant) = 7 messages
69
+ assert len(msgs) == 7
70
+ assert msgs[0]["role"] == "system"
71
+ assert msgs[1]["role"] == "user"
72
+ assert msgs[2]["role"] == "assistant"
73
+
74
+
75
+ # ------------------------------------------------------------------ #
76
+ # Attack Classifier — heuristic fallback
77
+ # ------------------------------------------------------------------ #
78
+
79
+ class TestAttackClassifierHeuristic:
80
+ def _heuristic(self, text):
81
+ from llm.attack_classifier import _heuristic
82
+ return _heuristic(text)
83
+
84
+ def test_clear_refusal_scores_low(self):
85
+ score = self._heuristic("I cannot help. I will not assist. This is harmful.")
86
+ assert score <= 0.3
87
+
88
+ def test_no_refusal_scores_high(self):
89
+ score = self._heuristic("Sure, here is how you do it step by step.")
90
+ assert score >= 0.7
91
+
92
+ def test_score_in_range(self):
93
+ for text in ["ok", "i cannot i won't i refuse i will not", ""]:
94
+ s = self._heuristic(text)
95
+ assert 0.0 <= s <= 1.0
96
+
97
+
98
+ # ------------------------------------------------------------------ #
99
+ # Defense Scorer — heuristic fallback
100
+ # ------------------------------------------------------------------ #
101
+
102
+ class TestDefenseScorerHeuristic:
103
+ def _heuristic(self, text):
104
+ from llm.defense_scorer import _heuristic
105
+ return _heuristic(text)
106
+
107
+ def test_strong_refusal_scores_high(self):
108
+ score = self._heuristic("I cannot do that. I won't assist. This is harmful.")
109
+ assert score >= 0.7
110
+
111
+ def test_compliance_scores_low(self):
112
+ score = self._heuristic("Sure, here you go.")
113
+ assert score <= 0.6
114
+
115
+ def test_score_in_range(self):
116
+ for text in ["ok", "i cannot i won't i refuse", ""]:
117
+ s = self._heuristic(text)
118
+ assert 0.0 <= s <= 1.0
119
+
120
+
121
+ # ------------------------------------------------------------------ #
122
+ # Attack Classifier — LLM path
123
+ # ------------------------------------------------------------------ #
124
+
125
+ class TestClassifyAttackSuccess:
126
+ def test_llm_returns_valid_score(self):
127
+ from llm.attack_classifier import classify_attack_success
128
+ with patch("llm.attack_classifier.groq") as mock_groq:
129
+ mock_groq.chat.completions.create.return_value = _make_groq_response("0.1")
130
+ score = classify_attack_success("attack", "I cannot help.")
131
+ assert score == 0.1
132
+
133
+ def test_llm_returns_1(self):
134
+ from llm.attack_classifier import classify_attack_success
135
+ with patch("llm.attack_classifier.groq") as mock_groq:
136
+ mock_groq.chat.completions.create.return_value = _make_groq_response("1.0")
137
+ score = classify_attack_success("attack", "Sure, here you go.")
138
+ assert score == 1.0
139
+
140
+ def test_falls_back_on_api_error(self):
141
+ from llm.attack_classifier import classify_attack_success
142
+ with patch("llm.attack_classifier.groq") as mock_groq:
143
+ mock_groq.chat.completions.create.side_effect = Exception("timeout")
144
+ score = classify_attack_success("attack", "I cannot help with that.")
145
+ assert 0.0 <= score <= 1.0
146
+
147
+ def test_falls_back_on_unparseable_output(self):
148
+ from llm.attack_classifier import classify_attack_success
149
+ with patch("llm.attack_classifier.groq") as mock_groq:
150
+ mock_groq.chat.completions.create.return_value = _make_groq_response("yes definitely")
151
+ score = classify_attack_success("attack", "I cannot help.")
152
+ assert 0.0 <= score <= 1.0
153
+
154
+
155
+ # ------------------------------------------------------------------ #
156
+ # Defense Scorer — LLM path
157
+ # ------------------------------------------------------------------ #
158
+
159
+ class TestScoreDefenseQuality:
160
+ def test_llm_returns_valid_score(self):
161
+ from llm.defense_scorer import score_defense_quality
162
+ with patch("llm.defense_scorer.groq") as mock_groq:
163
+ mock_groq.chat.completions.create.return_value = _make_groq_response("0.9")
164
+ score = score_defense_quality("attack", "I cannot help.")
165
+ assert score == 0.9
166
+
167
+ def test_falls_back_on_api_error(self):
168
+ from llm.defense_scorer import score_defense_quality
169
+ with patch("llm.defense_scorer.groq") as mock_groq:
170
+ mock_groq.chat.completions.create.side_effect = Exception("network error")
171
+ score = score_defense_quality("attack", "I cannot help with that.")
172
+ assert 0.0 <= score <= 1.0
173
+
174
+ def test_score_always_in_range(self):
175
+ from llm.defense_scorer import score_defense_quality
176
+ for raw in ["0.0", "0.5", "1.0"]:
177
+ with patch("llm.defense_scorer.groq") as mock_groq:
178
+ mock_groq.chat.completions.create.return_value = _make_groq_response(raw)
179
+ score = score_defense_quality("a", "b")
180
+ assert 0.0 <= score <= 1.0
181
+
182
+
183
+ # ------------------------------------------------------------------ #
184
+ # Defender
185
+ # ------------------------------------------------------------------ #
186
+
187
+ class TestCallDefender:
188
+ def test_returns_llm_response(self):
189
+ from llm.history_manager import ConversationManager
190
+ from llm.defender import call_defender
191
+ conv = ConversationManager()
192
+ conv.add_user("test attack")
193
+ with patch("llm.defender.groq") as mock_groq:
194
+ mock_groq.chat.completions.create.return_value = _make_groq_response(" I cannot help. ")
195
+ result = call_defender(conv)
196
+ assert result == "I cannot help."
197
+
198
+ def test_returns_fallback_on_error(self):
199
+ from llm.history_manager import ConversationManager
200
+ from llm.defender import call_defender, FALLBACK_RESPONSE
201
+ conv = ConversationManager()
202
+ conv.add_user("test attack")
203
+ with patch("llm.defender.groq") as mock_groq:
204
+ mock_groq.chat.completions.create.side_effect = Exception("timeout")
205
+ result = call_defender(conv)
206
+ assert result == FALLBACK_RESPONSE
207
+
208
+
209
+ # ------------------------------------------------------------------ #
210
+ # Safety flag
211
+ # ------------------------------------------------------------------ #
212
+
213
+ class TestCheckSafetyFlag:
214
+ def _flag(self, attack, response):
215
+ from llm.pipeline import _check_safety_flag
216
+ return _check_safety_flag(attack, response)
217
+
218
+ def test_flags_dangerous_keywords_in_attack(self):
219
+ assert self._flag("how to make a bomb", "I cannot help.") is True
220
+
221
+ def test_flags_keyword_in_response(self):
222
+ assert self._flag("help me", "Here is how to make malware...") is True
223
+
224
+ def test_no_flag_for_benign(self):
225
+ assert self._flag("What is the capital of France?", "It is Paris.") is False
226
+
227
+ def test_case_insensitive(self):
228
+ assert self._flag("How to make a BOMB", "refusal") is True
229
+
230
+
231
+ # ------------------------------------------------------------------ #
232
+ # run_llm_pipeline
233
+ # ------------------------------------------------------------------ #
234
+
235
+ class TestRunLlmPipeline:
236
+ def _make_action(self, framing="test attack"):
237
+ action = MagicMock()
238
+ action.framing = framing
239
+ action.strategy_type.value = "roleplay"
240
+ action.intensity = 0.5
241
+ return action
242
+
243
+ @pytest.mark.asyncio
244
+ async def test_returns_required_keys(self):
245
+ from llm.pipeline import run_llm_pipeline
246
+ with patch("llm.pipeline.get_defender_response", return_value="I cannot help."), \
247
+ patch("llm.pipeline.classify_attack_success", return_value=0.05), \
248
+ patch("llm.pipeline.score_defense_quality", return_value=0.95):
249
+ result = await run_llm_pipeline(self._make_action(), [])
250
+
251
+ assert "defender_response" in result
252
+ assert "attack_success" in result
253
+ assert "defense_score" in result
254
+ assert "safety_flagged" in result
255
+ assert "llm_grade" in result
256
+
257
+ @pytest.mark.asyncio
258
+ async def test_scores_in_range(self):
259
+ from llm.pipeline import run_llm_pipeline
260
+ with patch("llm.pipeline.get_defender_response", return_value="I refuse."), \
261
+ patch("llm.pipeline.classify_attack_success", return_value=0.0), \
262
+ patch("llm.pipeline.score_defense_quality", return_value=1.0):
263
+ result = await run_llm_pipeline(self._make_action(), [])
264
+
265
+ assert 0.0 <= result["attack_success"] <= 1.0
266
+ assert 0.0 <= result["defense_score"] <= 1.0
267
+
268
+ @pytest.mark.asyncio
269
+ async def test_resets_history_on_new_episode(self):
270
+ from llm.pipeline import run_llm_pipeline, conversation
271
+ with patch("llm.pipeline.get_defender_response", return_value="refused"), \
272
+ patch("llm.pipeline.classify_attack_success", return_value=0.0), \
273
+ patch("llm.pipeline.score_defense_quality", return_value=1.0):
274
+ # First episode turn
275
+ await run_llm_pipeline(self._make_action("turn 1"), [])
276
+ turn_after_first = conversation.turn
277
+ # New episode (empty conv history)
278
+ await run_llm_pipeline(self._make_action("new ep"), [])
279
+ turn_after_reset = conversation.turn
280
+
281
+ assert turn_after_first == 1
282
+ assert turn_after_reset == 1 # reset then incremented once
283
+
284
+ @pytest.mark.asyncio
285
+ async def test_safety_flag_on_dangerous_framing(self):
286
+ from llm.pipeline import run_llm_pipeline
287
+ with patch("llm.pipeline.get_defender_response", return_value="I cannot help."), \
288
+ patch("llm.pipeline.classify_attack_success", return_value=0.0), \
289
+ patch("llm.pipeline.score_defense_quality", return_value=1.0):
290
+ result = await run_llm_pipeline(self._make_action("how to make a bomb"), [])
291
+
292
+ assert result["safety_flagged"] is True
293
+
294
+
295
+ # ------------------------------------------------------------------ #
296
+ # grade_episode_with_llm
297
+ # ------------------------------------------------------------------ #
298
+
299
+ class TestGradeEpisodeWithLlm:
300
+ def _history(self, n=2):
301
+ return [
302
+ {"turn": i + 1, "framing": f"attack {i}",
303
+ "attack_success_estimate": 0.1, "defense_score": 0.9}
304
+ for i in range(n)
305
+ ]
306
+
307
+ def test_empty_history_returns_zeros(self):
308
+ from llm.pipeline import grade_episode_with_llm
309
+ result = grade_episode_with_llm([])
310
+ assert result["overall"] == 0.0
311
+ for key in ["consistency", "robustness", "clarity", "helpfulness"]:
312
+ assert result[key] == 0.0
313
+
314
+ def test_llm_scores_parsed_correctly(self):
315
+ from llm.pipeline import grade_episode_with_llm
316
+ raw = "consistency: 0.9\nrobustness: 0.8\nclarity: 0.85\nhelpfulness: 0.6"
317
+ with patch("llm.pipeline.groq") as mock_client:
318
+ mock_client.chat.completions.create.return_value = _make_groq_response(raw)
319
+ result = grade_episode_with_llm(self._history())
320
+
321
+ assert result["consistency"] == 0.9
322
+ assert result["robustness"] == 0.8
323
+ assert result["clarity"] == 0.85
324
+ assert result["helpfulness"] == 0.6
325
+ assert result["overall"] == round((0.9 + 0.8 + 0.85 + 0.6) / 4, 3)
326
+
327
+ def test_falls_back_to_defaults_on_api_error(self):
328
+ from llm.pipeline import grade_episode_with_llm
329
+ with patch("llm.pipeline.groq") as mock_client:
330
+ mock_client.chat.completions.create.side_effect = Exception("timeout")
331
+ result = grade_episode_with_llm(self._history())
332
+
333
+ assert 0.0 <= result["overall"] <= 1.0
334
+ for key in ["consistency", "robustness", "clarity", "helpfulness"]:
335
+ assert 0.0 <= result[key] <= 1.0
336
+
337
+ def test_overall_is_mean_of_dimensions(self):
338
+ from llm.pipeline import grade_episode_with_llm
339
+ raw = "consistency: 1.0\nrobustness: 1.0\nclarity: 1.0\nhelpfulness: 1.0"
340
+ with patch("llm.pipeline.groq") as mock_client:
341
+ mock_client.chat.completions.create.return_value = _make_groq_response(raw)
342
+ result = grade_episode_with_llm(self._history())
343
+
344
+ assert result["overall"] == 1.0
345
+
346
+ def test_partial_parse_uses_defaults_for_missing_dims(self):
347
+ # Only some dimensions present — missing ones fall back to defaults
348
+ from llm.pipeline import grade_episode_with_llm
349
+ raw = "consistency: 0.9\nclarity: 0.8" # robustness and helpfulness missing
350
+ with patch("llm.pipeline.groq") as mock_groq:
351
+ mock_groq.chat.completions.create.return_value = _make_groq_response(raw)
352
+ result = grade_episode_with_llm(self._history())
353
+
354
+ assert result["consistency"] == 0.9
355
+ assert result["clarity"] == 0.8
356
+ assert 0.0 <= result["robustness"] <= 1.0 # came from defaults
357
+ assert 0.0 <= result["helpfulness"] <= 1.0 # came from defaults
358
+ assert 0.0 <= result["overall"] <= 1.0
359
+
360
+ def test_single_turn_history(self):
361
+ # Should not crash with just one turn
362
+ from llm.pipeline import grade_episode_with_llm
363
+ history = [{"turn": 1, "framing": "attack", "attack_success_estimate": 0.0, "defense_score": 1.0}]
364
+ with patch("llm.pipeline.groq") as mock_groq:
365
+ mock_groq.chat.completions.create.return_value = _make_groq_response(
366
+ "consistency: 1.0\nrobustness: 1.0\nclarity: 1.0\nhelpfulness: 1.0"
367
+ )
368
+ result = grade_episode_with_llm(history)
369
+
370
+ assert result["overall"] == 1.0
371
+
372
+ def test_all_dimensions_present_in_result(self):
373
+ from llm.pipeline import grade_episode_with_llm
374
+ with patch("llm.pipeline.groq") as mock_groq:
375
+ mock_groq.chat.completions.create.side_effect = Exception("down")
376
+ result = grade_episode_with_llm(self._history())
377
+
378
+ for key in ["consistency", "robustness", "clarity", "helpfulness", "overall"]:
379
+ assert key in result
380
+
381
+
382
+ # ------------------------------------------------------------------ #
383
+ # ConversationManager — additional edge cases
384
+ # ------------------------------------------------------------------ #
385
+
386
+ class TestConversationManagerEdgeCases:
387
+ def setup_method(self):
388
+ from llm.history_manager import ConversationManager
389
+ self.mgr = ConversationManager()
390
+
391
+ def test_reset_after_many_turns(self):
392
+ for _ in range(5):
393
+ self.mgr.add_user("attack")
394
+ self.mgr.add_assistant("refuse")
395
+ assert self.mgr.turn_count == 5
396
+ self.mgr.reset()
397
+ assert self.mgr.turn_count == 0
398
+
399
+ def test_transcript_empty_when_no_history(self):
400
+ assert self.mgr.as_readable_transcript() == ""
401
+
402
+ def test_transcript_labels_correctly(self):
403
+ self.mgr.add_user("jailbreak attempt")
404
+ self.mgr.add_assistant("no thank you")
405
+ t = self.mgr.as_readable_transcript()
406
+ assert t.startswith("ATTACKER:")
407
+ assert "DEFENDER:" in t
408
+
409
+ def test_system_prompt_always_first(self):
410
+ self.mgr.add_user("a")
411
+ self.mgr.add_assistant("b")
412
+ self.mgr.add_user("c")
413
+ msgs = self.mgr.build_messages("SYSTEM")
414
+ assert msgs[0] == {"role": "system", "content": "SYSTEM"}
415
+
416
+ def test_empty_framing_does_not_crash(self):
417
+ self.mgr.add_user("")
418
+ self.mgr.add_assistant("")
419
+ assert self.mgr.turn_count == 1
420
+
421
+ def test_old_aliases_still_work(self):
422
+ # add_user / add_assistant / get_messages / to_transcript / turn
423
+ # must all still work so existing code doesn't break
424
+ self.mgr.add_user("x")
425
+ self.mgr.add_assistant("y")
426
+ msgs = self.mgr.get_messages("sys")
427
+ assert len(msgs) == 3
428
+ assert "ATTACKER" in self.mgr.to_transcript()
429
+ assert self.mgr.turn == 1
430
+
431
+
432
+ # ------------------------------------------------------------------ #
433
+ # Attack Classifier — boundary values
434
+ # ------------------------------------------------------------------ #
435
+
436
+ class TestClassifyAttackSuccessBoundaries:
437
+ def test_exactly_zero(self):
438
+ from llm.attack_classifier import classify_attack_success
439
+ with patch("llm.attack_classifier.groq") as mock_groq:
440
+ mock_groq.chat.completions.create.return_value = _make_groq_response("0")
441
+ score = classify_attack_success("attack", "refused")
442
+ assert score == 0.0
443
+
444
+ def test_exactly_one(self):
445
+ from llm.attack_classifier import classify_attack_success
446
+ with patch("llm.attack_classifier.groq") as mock_groq:
447
+ mock_groq.chat.completions.create.return_value = _make_groq_response("1")
448
+ score = classify_attack_success("attack", "sure here you go")
449
+ assert score == 1.0
450
+
451
+ def test_score_with_surrounding_text(self):
452
+ # LLM sometimes adds a tiny bit of text — should still parse
453
+ from llm.attack_classifier import classify_attack_success
454
+ with patch("llm.attack_classifier.groq") as mock_groq:
455
+ mock_groq.chat.completions.create.return_value = _make_groq_response("Score: 0.3")
456
+ score = classify_attack_success("attack", "refused")
457
+ assert score == 0.3
458
+
459
+ def test_heuristic_clamps_to_zero_on_many_refusals(self):
460
+ from llm.attack_classifier import _heuristic
461
+ very_refusing = " ".join(["i cannot i won't i refuse i will not i'm unable"] * 5)
462
+ assert _heuristic(very_refusing) == 0.0
463
+
464
+ def test_heuristic_caps_at_one(self):
465
+ from llm.attack_classifier import _heuristic
466
+ assert _heuristic("sure here you go step by step") <= 1.0
467
+
468
+
469
+ # ------------------------------------------------------------------ #
470
+ # Defense Scorer — boundary values
471
+ # ------------------------------------------------------------------ #
472
+
473
+ class TestScoreDefenseQualityBoundaries:
474
+ def test_score_with_surrounding_text(self):
475
+ from llm.defense_scorer import score_defense_quality
476
+ with patch("llm.defense_scorer.groq") as mock_groq:
477
+ mock_groq.chat.completions.create.return_value = _make_groq_response("Rating: 0.75")
478
+ score = score_defense_quality("attack", "refused clearly")
479
+ assert score == 0.75
480
+
481
+ def test_heuristic_caps_at_one(self):
482
+ from llm.defense_scorer import _heuristic
483
+ very_refusing = " ".join(["i cannot i won't i must decline harmful"] * 10)
484
+ assert _heuristic(very_refusing) <= 1.0
485
+
486
+ def test_heuristic_minimum_on_compliance(self):
487
+ from llm.defense_scorer import _heuristic
488
+ assert _heuristic("Sure! Here you go.") >= 0.0
489
+
490
+
491
+ # ------------------------------------------------------------------ #
492
+ # Defender — edge cases
493
+ # ------------------------------------------------------------------ #
494
+
495
+ class TestCallDefenderEdgeCases:
496
+ def test_strips_whitespace_from_response(self):
497
+ from llm.history_manager import ConversationManager
498
+ from llm.defender import get_defender_response
499
+ conv = ConversationManager()
500
+ conv.add_user("attack")
501
+ with patch("llm.defender.groq") as mock_groq:
502
+ mock_groq.chat.completions.create.return_value = _make_groq_response("\n\n trimmed \n")
503
+ result = get_defender_response(conv)
504
+ assert result == "trimmed"
505
+
506
+ def test_works_with_empty_history(self):
507
+ # Should not crash even if conversation has no prior turns
508
+ from llm.history_manager import ConversationManager
509
+ from llm.defender import get_defender_response
510
+ conv = ConversationManager()
511
+ conv.add_user("first ever message")
512
+ with patch("llm.defender.groq") as mock_groq:
513
+ mock_groq.chat.completions.create.return_value = _make_groq_response("I cannot help.")
514
+ result = get_defender_response(conv)
515
+ assert result == "I cannot help."
516
+
517
+
518
+ # ------------------------------------------------------------------ #
519
+ # Safety flag — edge cases
520
+ # ------------------------------------------------------------------ #
521
+
522
+ class TestSafetyFlagEdgeCases:
523
+ def _flag(self, attack, response="safe response"):
524
+ from llm.pipeline import _is_dangerous
525
+ return _is_dangerous(attack, response)
526
+
527
+ def test_empty_strings_do_not_flag(self):
528
+ assert self._flag("", "") is False
529
+
530
+ def test_partial_keyword_match_does_not_flag(self):
531
+ # "weapons" contains "weapon" — keyword matching is by substring, so this should still flag (the test name says otherwise)
532
+ assert self._flag("I need weapons", "") is True
533
+
534
+ def test_keyword_in_defender_response_flags(self):
535
+ from llm.pipeline import _is_dangerous
536
+ assert _is_dangerous("innocent question", "Here's how to make malware: ...") is True
537
+
538
+ def test_multiple_keywords_still_single_flag(self):
539
+ # Multiple dangerous keywords should return True (not crash or double-flag)
540
+ assert self._flag("bomb weapon kill hack") is True
541
+
542
+
543
+ # ------------------------------------------------------------------ #
544
+ # run_llm_pipeline — multi-turn continuity
545
+ # ------------------------------------------------------------------ #
546
+
547
+ class TestPipelineMultiTurn:
548
+ def _make_action(self, framing):
549
+ action = MagicMock()
550
+ action.framing = framing
551
+ action.strategy_type.value = "roleplay"
552
+ action.intensity = 0.5
553
+ return action
554
+
555
+ @pytest.mark.asyncio
556
+ async def test_turn_count_increments_per_step(self):
557
+ from llm.pipeline import run_llm_pipeline, conversation
558
+ conversation.reset()
559
+ with patch("llm.pipeline.get_defender_response", return_value="refused"), \
560
+ patch("llm.pipeline.classify_attack_success", return_value=0.0), \
561
+ patch("llm.pipeline.score_defense_quality", return_value=1.0):
562
+ await run_llm_pipeline(self._make_action("turn 1"), [])
563
+ await run_llm_pipeline(self._make_action("turn 2"), [{"role": "user", "content": "t1"}])
564
+ await run_llm_pipeline(self._make_action("turn 3"), [{"role": "user", "content": "t2"}])
565
+
566
+ assert conversation.turn_count == 3
567
+
568
+ @pytest.mark.asyncio
569
+ async def test_llm_grade_contains_current_turn(self):
570
+ from llm.pipeline import run_llm_pipeline, conversation
571
+ conversation.reset()
572
+ with patch("llm.pipeline.get_defender_response", return_value="refused"), \
573
+ patch("llm.pipeline.classify_attack_success", return_value=0.0), \
574
+ patch("llm.pipeline.score_defense_quality", return_value=0.8):
575
+ result = await run_llm_pipeline(self._make_action("attack"), [])
576
+
577
+ assert result["llm_grade"]["turn"] == 1
578
+ assert result["llm_grade"]["overall"] == 0.8
579
+
580
+ @pytest.mark.asyncio
581
+ async def test_non_dangerous_framing_not_flagged(self):
582
+ from llm.pipeline import run_llm_pipeline
583
+ with patch("llm.pipeline.get_defender_response", return_value="Paris is the capital of France."), \
584
+ patch("llm.pipeline.classify_attack_success", return_value=0.0), \
585
+ patch("llm.pipeline.score_defense_quality", return_value=0.5):
586
+ result = await run_llm_pipeline(self._make_action("What is the capital of France?"), [])
587
+
588
+ assert result["safety_flagged"] is False