"""
Tests for llm/ modules.
All Groq API calls are mocked — tests run fully offline.
"""
import pytest
from unittest.mock import MagicMock, patch
# ------------------------------------------------------------------ #
# Helpers
# ------------------------------------------------------------------ #
def _make_groq_response(content: str) -> MagicMock:
"""Build a minimal mock that looks like an OpenAI chat completion."""
choice = MagicMock()
choice.message.content = content
resp = MagicMock()
resp.choices = [choice]
return resp
# ------------------------------------------------------------------ #
# ConversationManager
# ------------------------------------------------------------------ #
class TestConversationManager:
    """Basic behaviour of ``ConversationManager`` through its short-name API."""

    def setup_method(self):
        """Give every test its own freshly constructed manager."""
        from llm.history_manager import ConversationManager

        self.mgr = ConversationManager()

    def test_starts_empty(self):
        """A new manager reports turn 0 and yields only the system message."""
        system_only = [{"role": "system", "content": "sys"}]
        assert self.mgr.turn == 0
        assert self.mgr.get_messages("sys") == system_only

    def test_add_user_and_assistant(self):
        """One user/assistant exchange is one turn, stored after the system message."""
        self.mgr.add_user("attack")
        self.mgr.add_assistant("refused")
        assert self.mgr.turn == 1
        messages = self.mgr.get_messages("sys")
        assert messages[1] == {"role": "user", "content": "attack"}
        assert messages[2] == {"role": "assistant", "content": "refused"}

    def test_reset_clears_history(self):
        """``reset`` zeroes the turn counter and drops stored messages."""
        self.mgr.add_user("attack")
        self.mgr.add_assistant("refused")
        self.mgr.reset()
        assert self.mgr.turn == 0
        remaining = self.mgr.get_messages("sys")
        assert len(remaining) == 1  # only system prompt

    def test_turn_counts_only_user_messages(self):
        """Two user messages and one assistant message count as two turns."""
        for user_text in ("a", "b"):
            self.mgr.add_user(user_text)
        self.mgr.add_assistant("r")
        assert self.mgr.turn == 2

    def test_to_transcript(self):
        """The transcript labels user text ATTACKER and assistant text DEFENDER."""
        self.mgr.add_user("hello")
        self.mgr.add_assistant("refused")
        transcript = self.mgr.to_transcript()
        for expected_line in ("ATTACKER: hello", "DEFENDER: refused"):
            assert expected_line in transcript

    def test_multi_turn_message_order(self):
        """Three exchanges give seven messages starting system, user, assistant."""
        for i in range(3):
            self.mgr.add_user(f"attack {i}")
            self.mgr.add_assistant(f"refuse {i}")
        messages = self.mgr.get_messages("sys")
        # system + 3 user/assistant pairs = 7 messages
        assert len(messages) == 7
        leading_roles = [message["role"] for message in messages[:3]]
        assert leading_roles == ["system", "user", "assistant"]
# ------------------------------------------------------------------ #
# Attack Classifier — heuristic fallback
# ------------------------------------------------------------------ #
class TestAttackClassifierHeuristic:
    """Direction and range checks for ``llm.attack_classifier._heuristic``."""

    def _heuristic(self, text):
        """Run the attack classifier's heuristic on ``text`` and return its score."""
        from llm.attack_classifier import _heuristic as heuristic_fn

        return heuristic_fn(text)

    def test_clear_refusal_scores_low(self):
        """A response full of refusal phrases scores at most 0.3."""
        refusal = "I cannot help. I will not assist. This is harmful."
        assert self._heuristic(refusal) <= 0.3

    def test_no_refusal_scores_high(self):
        """A compliant response scores at least 0.7."""
        compliance = "Sure, here is how you do it step by step."
        assert self._heuristic(compliance) >= 0.7

    def test_score_in_range(self):
        """Short, refusal-heavy and empty inputs all score within [0, 1]."""
        samples = ["ok", "i cannot i won't i refuse i will not", ""]
        for sample in samples:
            value = self._heuristic(sample)
            assert 0.0 <= value <= 1.0
# ------------------------------------------------------------------ #
# Defense Scorer — heuristic fallback
# ------------------------------------------------------------------ #
class TestDefenseScorerHeuristic:
    """Direction and range checks for ``llm.defense_scorer._heuristic``."""

    def _heuristic(self, text):
        """Run the defense scorer's heuristic on ``text`` and return its score."""
        from llm.defense_scorer import _heuristic as heuristic_fn

        return heuristic_fn(text)

    def test_strong_refusal_scores_high(self):
        """A response full of refusal phrases scores at least 0.7."""
        refusal = "I cannot do that. I won't assist. This is harmful."
        assert self._heuristic(refusal) >= 0.7

    def test_compliance_scores_low(self):
        """A compliant response scores at most 0.6."""
        compliance = "Sure, here you go."
        assert self._heuristic(compliance) <= 0.6

    def test_score_in_range(self):
        """Short, refusal-heavy and empty inputs all score within [0, 1]."""
        samples = ["ok", "i cannot i won't i refuse", ""]
        for sample in samples:
            value = self._heuristic(sample)
            assert 0.0 <= value <= 1.0
# ------------------------------------------------------------------ #
# Attack Classifier — LLM path
# ------------------------------------------------------------------ #
class TestClassifyAttackSuccess:
    """``classify_attack_success`` with the module-level ``groq`` client patched."""

    def test_llm_returns_valid_score(self):
        """A bare numeric reply of 0.1 is returned as the score."""
        from llm.attack_classifier import classify_attack_success

        with patch("llm.attack_classifier.groq") as groq_mock:
            create = groq_mock.chat.completions.create
            create.return_value = _make_groq_response("0.1")
            result = classify_attack_success("attack", "I cannot help.")
        assert result == 0.1

    def test_llm_returns_1(self):
        """A reply of 1.0 is returned as the score."""
        from llm.attack_classifier import classify_attack_success

        with patch("llm.attack_classifier.groq") as groq_mock:
            create = groq_mock.chat.completions.create
            create.return_value = _make_groq_response("1.0")
            result = classify_attack_success("attack", "Sure, here you go.")
        assert result == 1.0

    def test_falls_back_on_api_error(self):
        """An exception from the client still yields a score within [0, 1]."""
        from llm.attack_classifier import classify_attack_success

        with patch("llm.attack_classifier.groq") as groq_mock:
            create = groq_mock.chat.completions.create
            create.side_effect = Exception("timeout")
            result = classify_attack_success("attack", "I cannot help with that.")
        assert 0.0 <= result <= 1.0

    def test_falls_back_on_unparseable_output(self):
        """A non-numeric reply still yields a score within [0, 1]."""
        from llm.attack_classifier import classify_attack_success

        with patch("llm.attack_classifier.groq") as groq_mock:
            create = groq_mock.chat.completions.create
            create.return_value = _make_groq_response("yes definitely")
            result = classify_attack_success("attack", "I cannot help.")
        assert 0.0 <= result <= 1.0
# ------------------------------------------------------------------ #
# Defense Scorer — LLM path
# ------------------------------------------------------------------ #
class TestScoreDefenseQuality:
    """``score_defense_quality`` with the module-level ``groq`` client patched."""

    def test_llm_returns_valid_score(self):
        """A bare numeric reply of 0.9 is returned as the score."""
        from llm.defense_scorer import score_defense_quality

        with patch("llm.defense_scorer.groq") as groq_mock:
            create = groq_mock.chat.completions.create
            create.return_value = _make_groq_response("0.9")
            result = score_defense_quality("attack", "I cannot help.")
        assert result == 0.9

    def test_falls_back_on_api_error(self):
        """An exception from the client still yields a score within [0, 1]."""
        from llm.defense_scorer import score_defense_quality

        with patch("llm.defense_scorer.groq") as groq_mock:
            create = groq_mock.chat.completions.create
            create.side_effect = Exception("network error")
            result = score_defense_quality("attack", "I cannot help with that.")
        assert 0.0 <= result <= 1.0

    def test_score_always_in_range(self):
        """Replies at the low end, middle and high end all stay within [0, 1]."""
        from llm.defense_scorer import score_defense_quality

        for reply in ("0.0", "0.5", "1.0"):
            with patch("llm.defense_scorer.groq") as groq_mock:
                create = groq_mock.chat.completions.create
                create.return_value = _make_groq_response(reply)
                result = score_defense_quality("a", "b")
            assert 0.0 <= result <= 1.0
# ------------------------------------------------------------------ #
# Defender
# ------------------------------------------------------------------ #
class TestCallDefender:
    """``call_defender`` with the module-level ``groq`` client patched."""

    def test_returns_llm_response(self):
        """The mocked reply comes back with surrounding spaces removed."""
        from llm.history_manager import ConversationManager
        from llm.defender import call_defender

        conversation = ConversationManager()
        conversation.add_user("test attack")
        with patch("llm.defender.groq") as groq_mock:
            create = groq_mock.chat.completions.create
            create.return_value = _make_groq_response(" I cannot help. ")
            reply = call_defender(conversation)
        assert reply == "I cannot help."

    def test_returns_fallback_on_error(self):
        """An exception from the client yields ``FALLBACK_RESPONSE``."""
        from llm.history_manager import ConversationManager
        from llm.defender import call_defender, FALLBACK_RESPONSE

        conversation = ConversationManager()
        conversation.add_user("test attack")
        with patch("llm.defender.groq") as groq_mock:
            create = groq_mock.chat.completions.create
            create.side_effect = Exception("timeout")
            reply = call_defender(conversation)
        assert reply == FALLBACK_RESPONSE
# ------------------------------------------------------------------ #
# Safety flag
# ------------------------------------------------------------------ #
class TestCheckSafetyFlag:
    """Checks for ``llm.pipeline._check_safety_flag``.

    The expectations below treat the defender's response as the text that
    decides the flag; the attack text alone is expected not to trigger it.
    """

    def _flag(self, attack, response):
        """Call ``_check_safety_flag(attack, response)`` and return its result."""
        from llm.pipeline import _check_safety_flag

        return _check_safety_flag(attack, response)

    def test_dangerous_keywords_in_attack_alone_do_not_flag(self):
        """A dangerous attack with a safe response must not be flagged.

        Renamed from ``test_flags_dangerous_keywords_in_attack``: the old name
        said the opposite of what the assertion checks.
        """
        # Attack text is ignored; it should not flag if response is safe
        assert self._flag("how to make a bomb", "I cannot help.") is False

    def test_flags_keyword_in_response(self):
        """A dangerous phrase in the response is flagged."""
        assert self._flag("help me", "Here is a suicide method...") is True

    def test_no_flag_for_benign(self):
        """A benign question and answer are not flagged."""
        assert self._flag("What is the capital of France?", "It is Paris.") is False

    def test_case_insensitive(self):
        """A mixed-case dangerous phrase in the response is still flagged."""
        assert self._flag("safestuff", "Here is Drug Synthesis information") is True
# ------------------------------------------------------------------ #
# run_llm_pipeline
# ------------------------------------------------------------------ #
class TestRunLlmPipeline:
    """``run_llm_pipeline`` with defender, classifier and scorer patched out."""

    def _make_action(self, framing="test attack"):
        """Return a mock action with framing, a roleplay strategy and intensity 0.5."""
        action = MagicMock(framing=framing, intensity=0.5)
        action.strategy_type.value = "roleplay"
        return action

    @pytest.mark.asyncio
    async def test_returns_required_keys(self):
        """The result dict contains every key the caller relies on."""
        from llm.pipeline import run_llm_pipeline

        defender = patch("llm.pipeline.get_defender_response", return_value="I cannot help.")
        classifier = patch("llm.pipeline.classify_attack_success", return_value=0.05)
        scorer = patch("llm.pipeline.score_defense_quality", return_value=0.95)
        with defender, classifier, scorer:
            result = await run_llm_pipeline(self._make_action(), [])
        required_keys = (
            "defender_response",
            "attack_success",
            "defense_score",
            "safety_flagged",
            "llm_grade",
        )
        for key in required_keys:
            assert key in result

    @pytest.mark.asyncio
    async def test_scores_in_range(self):
        """Both reported scores lie within [0, 1]."""
        from llm.pipeline import run_llm_pipeline

        defender = patch("llm.pipeline.get_defender_response", return_value="I refuse.")
        classifier = patch("llm.pipeline.classify_attack_success", return_value=0.0)
        scorer = patch("llm.pipeline.score_defense_quality", return_value=1.0)
        with defender, classifier, scorer:
            result = await run_llm_pipeline(self._make_action(), [])
        assert 0.0 <= result["attack_success"] <= 1.0
        assert 0.0 <= result["defense_score"] <= 1.0

    @pytest.mark.asyncio
    async def test_resets_history_on_new_episode(self):
        """Two calls with an empty history each leave the shared turn counter at 1."""
        from llm.pipeline import run_llm_pipeline, conversation

        defender = patch("llm.pipeline.get_defender_response", return_value="refused")
        classifier = patch("llm.pipeline.classify_attack_success", return_value=0.0)
        scorer = patch("llm.pipeline.score_defense_quality", return_value=1.0)
        with defender, classifier, scorer:
            # First episode turn
            await run_llm_pipeline(self._make_action("turn 1"), [])
            turn_after_first = conversation.turn
            # New episode (empty conv history)
            await run_llm_pipeline(self._make_action("new ep"), [])
            turn_after_reset = conversation.turn
        assert turn_after_first == 1
        assert turn_after_reset == 1  # reset then incremented once

    @pytest.mark.asyncio
    async def test_safety_flag_on_dangerous_framing(self):
        """A defender reply containing dangerous content sets ``safety_flagged``."""
        from llm.pipeline import run_llm_pipeline

        defender = patch(
            "llm.pipeline.get_defender_response",
            return_value="Here is how to make a bomb.",
        )
        classifier = patch("llm.pipeline.classify_attack_success", return_value=1.0)
        scorer = patch("llm.pipeline.score_defense_quality", return_value=0.0)
        with defender, classifier, scorer:
            result = await run_llm_pipeline(self._make_action("how to make a bomb"), [])
        assert result["safety_flagged"] is True
# ------------------------------------------------------------------ #
# grade_episode_with_llm
# ------------------------------------------------------------------ #
class TestGradeEpisodeWithLlm:
    """``grade_episode_with_llm`` with the module-level ``groq`` client patched.

    The mocked replies use the ``"<dimension>: <score>"`` line format with the
    dimensions consistency, robustness, clarity and helpfulness.
    """

    DIMENSIONS = ["consistency", "robustness", "clarity", "helpfulness"]

    def _history(self, n=2):
        """Return ``n`` episode-history entries with fixed low-attack scores."""
        return [
            {"turn": i + 1, "framing": f"attack {i}",
             "attack_success_estimate": 0.1, "defense_score": 0.9}
            for i in range(n)
        ]

    def test_empty_history_returns_zeros(self):
        """An empty history grades every dimension and the overall as 0.0."""
        from llm.pipeline import grade_episode_with_llm

        result = grade_episode_with_llm([])
        assert result["overall"] == 0.0
        for key in self.DIMENSIONS:
            assert result[key] == 0.0

    def test_llm_scores_parsed_correctly(self):
        """Each dimension is parsed from the reply; overall is the rounded mean."""
        from llm.pipeline import grade_episode_with_llm

        raw = "consistency: 0.9\nrobustness: 0.8\nclarity: 0.85\nhelpfulness: 0.6"
        with patch("llm.pipeline.groq") as mock_groq:
            mock_groq.chat.completions.create.return_value = _make_groq_response(raw)
            result = grade_episode_with_llm(self._history())
        assert result["consistency"] == 0.9
        assert result["robustness"] == 0.8
        assert result["clarity"] == 0.85
        assert result["helpfulness"] == 0.6
        expected_overall = round(sum([0.9, 0.8, 0.85, 0.6]) / 4, 3)
        assert result["overall"] == expected_overall

    def test_falls_back_to_defaults_on_api_error(self):
        """An exception from the client still yields scores within [0, 1]."""
        from llm.pipeline import grade_episode_with_llm

        with patch("llm.pipeline.groq") as mock_groq:
            mock_groq.chat.completions.create.side_effect = Exception("timeout")
            result = grade_episode_with_llm(self._history())
        assert 0.0 <= result["overall"] <= 1.0
        for key in self.DIMENSIONS:
            assert 0.0 <= result[key] <= 1.0

    def test_overall_is_mean_of_dimensions(self):
        """Overall equals the arithmetic mean of the four dimension scores.

        The scores are deliberately distinct: with identical inputs the mean
        is indistinguishable from min, max, median or product, so the test
        would pass for the wrong aggregation.
        """
        from llm.pipeline import grade_episode_with_llm

        raw = "consistency: 0.2\nrobustness: 0.4\nclarity: 0.6\nhelpfulness: 1.0"
        with patch("llm.pipeline.groq") as mock_groq:
            mock_groq.chat.completions.create.return_value = _make_groq_response(raw)
            result = grade_episode_with_llm(self._history())
        # mean = 0.55; min = 0.2, max = 1.0, median = 0.5, product = 0.048
        assert result["overall"] == 0.55

    def test_partial_parse_uses_defaults_for_missing_dims(self):
        """Parsed dimensions are kept; missing ones still land within [0, 1]."""
        # Only some dimensions present — missing ones fall back to defaults
        from llm.pipeline import grade_episode_with_llm

        raw = "consistency: 0.9\nclarity: 0.8"  # robustness and helpfulness missing
        with patch("llm.pipeline.groq") as mock_groq:
            mock_groq.chat.completions.create.return_value = _make_groq_response(raw)
            result = grade_episode_with_llm(self._history())
        assert result["consistency"] == 0.9
        assert result["clarity"] == 0.8
        assert 0.0 <= result["robustness"] <= 1.0  # came from defaults
        assert 0.0 <= result["helpfulness"] <= 1.0  # came from defaults
        assert 0.0 <= result["overall"] <= 1.0

    def test_single_turn_history(self):
        """A one-entry history is graded without error."""
        # Should not crash with just one turn
        from llm.pipeline import grade_episode_with_llm

        history = [{"turn": 1, "framing": "attack", "attack_success_estimate": 0.0, "defense_score": 1.0}]
        with patch("llm.pipeline.groq") as mock_groq:
            mock_groq.chat.completions.create.return_value = _make_groq_response(
                "consistency: 1.0\nrobustness: 1.0\nclarity: 1.0\nhelpfulness: 1.0"
            )
            result = grade_episode_with_llm(history)
        assert result["overall"] == 1.0

    def test_all_dimensions_present_in_result(self):
        """Even on client failure the result carries every dimension plus overall."""
        from llm.pipeline import grade_episode_with_llm

        with patch("llm.pipeline.groq") as mock_groq:
            mock_groq.chat.completions.create.side_effect = Exception("down")
            result = grade_episode_with_llm(self._history())
        for key in self.DIMENSIONS + ["overall"]:
            assert key in result
# ------------------------------------------------------------------ #
# ConversationManager — additional edge cases
# ------------------------------------------------------------------ #
class TestConversationManagerEdgeCases:
    """Edge cases for ``ConversationManager`` using its long-name API."""

    def setup_method(self):
        """Give every test its own freshly constructed manager."""
        from llm.history_manager import ConversationManager

        self.mgr = ConversationManager()

    def test_reset_after_many_turns(self):
        """Five exchanges count as five turns, and ``reset`` returns to zero."""
        exchanges = 5
        for _ in range(exchanges):
            self.mgr.add_user("attack")
            self.mgr.add_assistant("refuse")
        assert self.mgr.turn_count == 5
        self.mgr.reset()
        assert self.mgr.turn_count == 0

    def test_transcript_empty_when_no_history(self):
        """With nothing recorded the readable transcript is the empty string."""
        transcript = self.mgr.as_readable_transcript()
        assert transcript == ""

    def test_transcript_labels_correctly(self):
        """The transcript opens with ATTACKER: and also contains DEFENDER:."""
        self.mgr.add_user("jailbreak attempt")
        self.mgr.add_assistant("no thank you")
        transcript = self.mgr.as_readable_transcript()
        assert transcript.startswith("ATTACKER:")
        assert "DEFENDER:" in transcript

    def test_system_prompt_always_first(self):
        """The system message leads the list even when a user message is last."""
        self.mgr.add_user("a")
        self.mgr.add_assistant("b")
        self.mgr.add_user("c")
        first_message = self.mgr.build_messages("SYSTEM")[0]
        assert first_message == {"role": "system", "content": "SYSTEM"}

    def test_empty_framing_does_not_crash(self):
        """Empty strings are accepted and still count as one turn."""
        self.mgr.add_user("")
        self.mgr.add_assistant("")
        assert self.mgr.turn_count == 1

    def test_old_aliases_still_work(self):
        """The short-name API remains usable alongside the long names."""
        # add_user / add_assistant / get_messages / to_transcript / turn
        # must all still work so existing code doesn't break
        self.mgr.add_user("x")
        self.mgr.add_assistant("y")
        messages = self.mgr.get_messages("sys")
        assert len(messages) == 3
        assert "ATTACKER" in self.mgr.to_transcript()
        assert self.mgr.turn == 1
# ------------------------------------------------------------------ #
# Attack Classifier — boundary values
# ------------------------------------------------------------------ #
class TestClassifyAttackSuccessBoundaries:
    """Boundary inputs for ``classify_attack_success`` and its heuristic."""

    def test_exactly_zero(self):
        """An integer-looking reply of 0 is returned as 0.0."""
        from llm.attack_classifier import classify_attack_success

        with patch("llm.attack_classifier.groq") as groq_mock:
            create = groq_mock.chat.completions.create
            create.return_value = _make_groq_response("0")
            result = classify_attack_success("attack", "refused")
        assert result == 0.0

    def test_exactly_one(self):
        """An integer-looking reply of 1 is returned as 1.0."""
        from llm.attack_classifier import classify_attack_success

        with patch("llm.attack_classifier.groq") as groq_mock:
            create = groq_mock.chat.completions.create
            create.return_value = _make_groq_response("1")
            result = classify_attack_success("attack", "sure here you go")
        assert result == 1.0

    def test_score_with_surrounding_text(self):
        """A number embedded in a short label is still extracted."""
        # LLM sometimes adds a tiny bit of text — should still parse
        from llm.attack_classifier import classify_attack_success

        with patch("llm.attack_classifier.groq") as groq_mock:
            create = groq_mock.chat.completions.create
            create.return_value = _make_groq_response("Score: 0.3")
            result = classify_attack_success("attack", "refused")
        assert result == 0.3

    def test_heuristic_clamps_to_zero_on_many_refusals(self):
        """Heavily repeated refusal phrases drive the heuristic to exactly 0.0."""
        from llm.attack_classifier import _heuristic

        phrase = "i cannot i won't i refuse i will not i'm unable"
        very_refusing = " ".join([phrase] * 5)
        assert _heuristic(very_refusing) == 0.0

    def test_heuristic_caps_at_one(self):
        """A fully compliant response never scores above 1.0."""
        from llm.attack_classifier import _heuristic

        value = _heuristic("sure here you go step by step")
        assert value <= 1.0
# ------------------------------------------------------------------ #
# Defense Scorer — boundary values
# ------------------------------------------------------------------ #
class TestScoreDefenseQualityBoundaries:
    """Boundary inputs for ``score_defense_quality`` and its heuristic."""

    def test_score_with_surrounding_text(self):
        """A number embedded in a short label is still extracted."""
        from llm.defense_scorer import score_defense_quality

        with patch("llm.defense_scorer.groq") as groq_mock:
            create = groq_mock.chat.completions.create
            create.return_value = _make_groq_response("Rating: 0.75")
            result = score_defense_quality("attack", "refused clearly")
        assert result == 0.75

    def test_heuristic_caps_at_one(self):
        """Heavily repeated refusal phrases never push the score above 1.0."""
        from llm.defense_scorer import _heuristic

        phrase = "i cannot i won't i must decline harmful"
        very_refusing = " ".join([phrase] * 10)
        assert _heuristic(very_refusing) <= 1.0

    def test_heuristic_minimum_on_compliance(self):
        """A compliant response never scores below 0.0."""
        from llm.defense_scorer import _heuristic

        value = _heuristic("Sure! Here you go.")
        assert value >= 0.0
# ------------------------------------------------------------------ #
# Defender — edge cases
# ------------------------------------------------------------------ #
class TestCallDefenderEdgeCases:
    """Edge cases for ``get_defender_response`` with the ``groq`` client patched."""

    def test_strips_whitespace_from_response(self):
        """Leading and trailing newlines and spaces are removed from the reply."""
        from llm.history_manager import ConversationManager
        from llm.defender import get_defender_response

        conversation = ConversationManager()
        conversation.add_user("attack")
        with patch("llm.defender.groq") as groq_mock:
            create = groq_mock.chat.completions.create
            create.return_value = _make_groq_response("\n\n trimmed \n")
            reply = get_defender_response(conversation)
        assert reply == "trimmed"

    def test_works_with_empty_history(self):
        """A conversation holding only its first user message still gets a reply."""
        # Should not crash even if conversation has no prior turns
        from llm.history_manager import ConversationManager
        from llm.defender import get_defender_response

        conversation = ConversationManager()
        conversation.add_user("first ever message")
        with patch("llm.defender.groq") as groq_mock:
            create = groq_mock.chat.completions.create
            create.return_value = _make_groq_response("I cannot help.")
            reply = get_defender_response(conversation)
        assert reply == "I cannot help."
# ------------------------------------------------------------------ #
# Safety flag — edge cases
# ------------------------------------------------------------------ #
class TestSafetyFlagEdgeCases:
    """Edge cases for ``llm.pipeline._is_dangerous``."""

    def _flag(self, attack, response="safe response"):
        """Call ``_is_dangerous(attack, response)`` and return its result."""
        from llm.pipeline import _is_dangerous as is_dangerous

        return is_dangerous(attack, response)

    def test_empty_strings_do_not_flag(self):
        """Empty attack and response are not flagged."""
        flagged = self._flag("", "")
        assert flagged is False

    def test_partial_keyword_match_does_not_flag(self):
        """Only part of a dangerous phrase in the response is not enough to flag."""
        # "drug" does not match "drug synthesis"
        flagged = self._flag("", "I have a drug for you")
        assert flagged is False

    def test_keyword_in_defender_response_flags(self):
        """A dangerous phrase in the response flags even an innocent question."""
        flagged = self._flag("innocent question", "Here's step-by-step bomb instructions")
        assert flagged is True

    def test_multiple_keywords_still_single_flag(self):
        """Several dangerous phrases together still give a plain ``True``."""
        # Multiple dangerous keywords should return True (not crash or double-flag)
        flagged = self._flag("", "drug synthesis and suicide method")
        assert flagged is True
# ------------------------------------------------------------------ #
# run_llm_pipeline — multi-turn continuity
# ------------------------------------------------------------------ #
class TestPipelineMultiTurn:
    """Multi-turn behaviour of ``run_llm_pipeline`` with its collaborators patched."""

    def _make_action(self, framing):
        """Return a mock action with framing, a roleplay strategy and intensity 0.5."""
        action = MagicMock(framing=framing, intensity=0.5)
        action.strategy_type.value = "roleplay"
        return action

    @pytest.mark.asyncio
    async def test_turn_count_increments_per_step(self):
        """Three consecutive steps of one episode leave the turn count at 3."""
        from llm.pipeline import run_llm_pipeline, conversation

        conversation.reset()
        # Only the first step passes an empty history; later steps pass prior context.
        steps = [
            ("turn 1", []),
            ("turn 2", [{"role": "user", "content": "t1"}]),
            ("turn 3", [{"role": "user", "content": "t2"}]),
        ]
        defender = patch("llm.pipeline.get_defender_response", return_value="refused")
        classifier = patch("llm.pipeline.classify_attack_success", return_value=0.0)
        scorer = patch("llm.pipeline.score_defense_quality", return_value=1.0)
        with defender, classifier, scorer:
            for framing, history in steps:
                await run_llm_pipeline(self._make_action(framing), history)
        assert conversation.turn_count == 3

    @pytest.mark.asyncio
    async def test_llm_grade_contains_current_turn(self):
        """``llm_grade`` reports turn 1 and the patched defense score as overall."""
        from llm.pipeline import run_llm_pipeline, conversation

        conversation.reset()
        defender = patch("llm.pipeline.get_defender_response", return_value="refused")
        classifier = patch("llm.pipeline.classify_attack_success", return_value=0.0)
        scorer = patch("llm.pipeline.score_defense_quality", return_value=0.8)
        with defender, classifier, scorer:
            result = await run_llm_pipeline(self._make_action("attack"), [])
        grade = result["llm_grade"]
        assert grade["turn"] == 1
        assert grade["overall"] == 0.8

    @pytest.mark.asyncio
    async def test_non_dangerous_framing_not_flagged(self):
        """A benign question with a benign answer is not safety-flagged."""
        from llm.pipeline import run_llm_pipeline

        defender = patch(
            "llm.pipeline.get_defender_response",
            return_value="Paris is the capital of France.",
        )
        classifier = patch("llm.pipeline.classify_attack_success", return_value=0.0)
        scorer = patch("llm.pipeline.score_defense_quality", return_value=0.5)
        with defender, classifier, scorer:
            result = await run_llm_pipeline(
                self._make_action("What is the capital of France?"), []
            )
        assert result["safety_flagged"] is False
|