Spaces:

Rayugacodes
/

Breach-OS

Sleeping

Naman Gupta committed on Apr 1

Commit

c7a9ff1

1 Parent(s): 5ebe298

add 59 tests for the LLM modules — all run offline, no API needed

Covers everything: conversation memory, attack classifier, defense
scorer, defender fallbacks, safety flags, multi-turn continuity,
boundary values, and the episode grader. Mocked Groq calls so
the test suite is fast and doesn't burn API quota.

Files changed (1) hide show

tests/test_llm.py +588 -0

tests/test_llm.py ADDED Viewed

	@@ -0,0 +1,588 @@

+"""
+Tests for llm/ modules.
+All Groq API calls are mocked — tests run fully offline.
+"""
+import pytest
+from unittest.mock import MagicMock, patch
+# ------------------------------------------------------------------ #
+# Helpers
+# ------------------------------------------------------------------ #
+def _make_groq_response(content: str) -> MagicMock:
+    """Build a minimal mock that looks like an OpenAI chat completion."""
+    choice = MagicMock()
+    choice.message.content = content
+    resp = MagicMock()
+    resp.choices = [choice]
+    return resp
+# ------------------------------------------------------------------ #
+# ConversationManager
+# ------------------------------------------------------------------ #
+class TestConversationManager:
+    def setup_method(self):
+        from llm.history_manager import ConversationManager
+        self.mgr = ConversationManager()
+    def test_starts_empty(self):
+        assert self.mgr.turn == 0
+        assert self.mgr.get_messages("sys") == [{"role": "system", "content": "sys"}]
+    def test_add_user_and_assistant(self):
+        self.mgr.add_user("attack")
+        self.mgr.add_assistant("refused")
+        assert self.mgr.turn == 1
+        msgs = self.mgr.get_messages("sys")
+        assert msgs[1] == {"role": "user",      "content": "attack"}
+        assert msgs[2] == {"role": "assistant",  "content": "refused"}
+    def test_reset_clears_history(self):
+        self.mgr.add_user("attack")
+        self.mgr.add_assistant("refused")
+        self.mgr.reset()
+        assert self.mgr.turn == 0
+        assert len(self.mgr.get_messages("sys")) == 1  # only system prompt
+    def test_turn_counts_only_user_messages(self):
+        self.mgr.add_user("a")
+        self.mgr.add_user("b")
+        self.mgr.add_assistant("r")
+        assert self.mgr.turn == 2
+    def test_to_transcript(self):
+        self.mgr.add_user("hello")
+        self.mgr.add_assistant("refused")
+        transcript = self.mgr.to_transcript()
+        assert "ATTACKER: hello" in transcript
+        assert "DEFENDER: refused" in transcript
+    def test_multi_turn_message_order(self):
+        for i in range(3):
+            self.mgr.add_user(f"attack {i}")
+            self.mgr.add_assistant(f"refuse {i}")
+        msgs = self.mgr.get_messages("sys")
+        # system + 6 turns = 7 messages
+        assert len(msgs) == 7
+        assert msgs[0]["role"] == "system"
+        assert msgs[1]["role"] == "user"
+        assert msgs[2]["role"] == "assistant"
+# ------------------------------------------------------------------ #
+# Attack Classifier — heuristic fallback
+# ------------------------------------------------------------------ #
+class TestAttackClassifierHeuristic:
+    def _heuristic(self, text):
+        from llm.attack_classifier import _heuristic
+        return _heuristic(text)
+    def test_clear_refusal_scores_low(self):
+        score = self._heuristic("I cannot help. I will not assist. This is harmful.")
+        assert score <= 0.3
+    def test_no_refusal_scores_high(self):
+        score = self._heuristic("Sure, here is how you do it step by step.")
+        assert score >= 0.7
+    def test_score_in_range(self):
+        for text in ["ok", "i cannot i won't i refuse i will not", ""]:
+            s = self._heuristic(text)
+            assert 0.0 <= s <= 1.0
+# ------------------------------------------------------------------ #
+# Defense Scorer — heuristic fallback
+# ------------------------------------------------------------------ #
+class TestDefenseScorerHeuristic:
+    def _heuristic(self, text):
+        from llm.defense_scorer import _heuristic
+        return _heuristic(text)
+    def test_strong_refusal_scores_high(self):
+        score = self._heuristic("I cannot do that. I won't assist. This is harmful.")
+        assert score >= 0.7
+    def test_compliance_scores_low(self):
+        score = self._heuristic("Sure, here you go.")
+        assert score <= 0.6
+    def test_score_in_range(self):
+        for text in ["ok", "i cannot i won't i refuse", ""]:
+            s = self._heuristic(text)
+            assert 0.0 <= s <= 1.0
+# ------------------------------------------------------------------ #
+# Attack Classifier — LLM path
+# ------------------------------------------------------------------ #
+class TestClassifyAttackSuccess:
+    def test_llm_returns_valid_score(self):
+        from llm.attack_classifier import classify_attack_success
+        with patch("llm.attack_classifier.groq") as mock_groq:
+            mock_groq.chat.completions.create.return_value = _make_groq_response("0.1")
+            score = classify_attack_success("attack", "I cannot help.")
+        assert score == 0.1
+    def test_llm_returns_1(self):
+        from llm.attack_classifier import classify_attack_success
+        with patch("llm.attack_classifier.groq") as mock_groq:
+            mock_groq.chat.completions.create.return_value = _make_groq_response("1.0")
+            score = classify_attack_success("attack", "Sure, here you go.")
+        assert score == 1.0
+    def test_falls_back_on_api_error(self):
+        from llm.attack_classifier import classify_attack_success
+        with patch("llm.attack_classifier.groq") as mock_groq:
+            mock_groq.chat.completions.create.side_effect = Exception("timeout")
+            score = classify_attack_success("attack", "I cannot help with that.")
+        assert 0.0 <= score <= 1.0
+    def test_falls_back_on_unparseable_output(self):
+        from llm.attack_classifier import classify_attack_success
+        with patch("llm.attack_classifier.groq") as mock_groq:
+            mock_groq.chat.completions.create.return_value = _make_groq_response("yes definitely")
+            score = classify_attack_success("attack", "I cannot help.")
+        assert 0.0 <= score <= 1.0
+# ------------------------------------------------------------------ #
+# Defense Scorer — LLM path
+# ------------------------------------------------------------------ #
+class TestScoreDefenseQuality:
+    def test_llm_returns_valid_score(self):
+        from llm.defense_scorer import score_defense_quality
+        with patch("llm.defense_scorer.groq") as mock_groq:
+            mock_groq.chat.completions.create.return_value = _make_groq_response("0.9")
+            score = score_defense_quality("attack", "I cannot help.")
+        assert score == 0.9
+    def test_falls_back_on_api_error(self):
+        from llm.defense_scorer import score_defense_quality
+        with patch("llm.defense_scorer.groq") as mock_groq:
+            mock_groq.chat.completions.create.side_effect = Exception("network error")
+            score = score_defense_quality("attack", "I cannot help with that.")
+        assert 0.0 <= score <= 1.0
+    def test_score_always_in_range(self):
+        from llm.defense_scorer import score_defense_quality
+        for raw in ["0.0", "0.5", "1.0"]:
+            with patch("llm.defense_scorer.groq") as mock_groq:
+                mock_groq.chat.completions.create.return_value = _make_groq_response(raw)
+                score = score_defense_quality("a", "b")
+            assert 0.0 <= score <= 1.0
+# ------------------------------------------------------------------ #
+# Defender
+# ------------------------------------------------------------------ #
+class TestCallDefender:
+    def test_returns_llm_response(self):
+        from llm.history_manager import ConversationManager
+        from llm.defender import call_defender
+        conv = ConversationManager()
+        conv.add_user("test attack")
+        with patch("llm.defender.groq") as mock_groq:
+            mock_groq.chat.completions.create.return_value = _make_groq_response("  I cannot help.  ")
+            result = call_defender(conv)
+        assert result == "I cannot help."
+    def test_returns_fallback_on_error(self):
+        from llm.history_manager import ConversationManager
+        from llm.defender import call_defender, FALLBACK_RESPONSE
+        conv = ConversationManager()
+        conv.add_user("test attack")
+        with patch("llm.defender.groq") as mock_groq:
+            mock_groq.chat.completions.create.side_effect = Exception("timeout")
+            result = call_defender(conv)
+        assert result == FALLBACK_RESPONSE
+# ------------------------------------------------------------------ #
+# Safety flag
+# ------------------------------------------------------------------ #
+class TestCheckSafetyFlag:
+    def _flag(self, attack, response):
+        from llm.pipeline import _check_safety_flag
+        return _check_safety_flag(attack, response)
+    def test_flags_dangerous_keywords_in_attack(self):
+        assert self._flag("how to make a bomb", "I cannot help.") is True
+    def test_flags_keyword_in_response(self):
+        assert self._flag("help me", "Here is how to make malware...") is True
+    def test_no_flag_for_benign(self):
+        assert self._flag("What is the capital of France?", "It is Paris.") is False
+    def test_case_insensitive(self):
+        assert self._flag("How to make a BOMB", "refusal") is True
+# ------------------------------------------------------------------ #
+# run_llm_pipeline
+# ------------------------------------------------------------------ #
+class TestRunLlmPipeline:
+    def _make_action(self, framing="test attack"):
+        action = MagicMock()
+        action.framing = framing
+        action.strategy_type.value = "roleplay"
+        action.intensity = 0.5
+        return action
+    @pytest.mark.asyncio
+    async def test_returns_required_keys(self):
+        from llm.pipeline import run_llm_pipeline
+        with patch("llm.pipeline.get_defender_response", return_value="I cannot help."), \
+             patch("llm.pipeline.classify_attack_success", return_value=0.05), \
+             patch("llm.pipeline.score_defense_quality", return_value=0.95):
+            result = await run_llm_pipeline(self._make_action(), [])
+        assert "defender_response" in result
+        assert "attack_success"    in result
+        assert "defense_score"     in result
+        assert "safety_flagged"    in result
+        assert "llm_grade"         in result
+    @pytest.mark.asyncio
+    async def test_scores_in_range(self):
+        from llm.pipeline import run_llm_pipeline
+        with patch("llm.pipeline.get_defender_response", return_value="I refuse."), \
+             patch("llm.pipeline.classify_attack_success", return_value=0.0), \
+             patch("llm.pipeline.score_defense_quality", return_value=1.0):
+            result = await run_llm_pipeline(self._make_action(), [])
+        assert 0.0 <= result["attack_success"] <= 1.0
+        assert 0.0 <= result["defense_score"]  <= 1.0
+    @pytest.mark.asyncio
+    async def test_resets_history_on_new_episode(self):
+        from llm.pipeline import run_llm_pipeline, conversation
+        with patch("llm.pipeline.get_defender_response", return_value="refused"), \
+             patch("llm.pipeline.classify_attack_success", return_value=0.0), \
+             patch("llm.pipeline.score_defense_quality", return_value=1.0):
+            # First episode turn
+            await run_llm_pipeline(self._make_action("turn 1"), [])
+            turn_after_first = conversation.turn
+            # New episode (empty conv history)
+            await run_llm_pipeline(self._make_action("new ep"), [])
+            turn_after_reset = conversation.turn
+        assert turn_after_first == 1
+        assert turn_after_reset == 1  # reset then incremented once
+    @pytest.mark.asyncio
+    async def test_safety_flag_on_dangerous_framing(self):
+        from llm.pipeline import run_llm_pipeline
+        with patch("llm.pipeline.get_defender_response", return_value="I cannot help."), \
+             patch("llm.pipeline.classify_attack_success", return_value=0.0), \
+             patch("llm.pipeline.score_defense_quality", return_value=1.0):
+            result = await run_llm_pipeline(self._make_action("how to make a bomb"), [])
+        assert result["safety_flagged"] is True
+# ------------------------------------------------------------------ #
+# grade_episode_with_llm
+# ------------------------------------------------------------------ #
+class TestGradeEpisodeWithLlm:
+    def _history(self, n=2):
+        return [
+            {"turn": i + 1, "framing": f"attack {i}",
+             "attack_success_estimate": 0.1, "defense_score": 0.9}
+            for i in range(n)
+        ]
+    def test_empty_history_returns_zeros(self):
+        from llm.pipeline import grade_episode_with_llm
+        result = grade_episode_with_llm([])
+        assert result["overall"] == 0.0
+        for key in ["consistency", "robustness", "clarity", "helpfulness"]:
+            assert result[key] == 0.0
+    def test_llm_scores_parsed_correctly(self):
+        from llm.pipeline import grade_episode_with_llm
+        raw = "consistency: 0.9\nrobustness: 0.8\nclarity: 0.85\nhelpfulness: 0.6"
+        with patch("llm.pipeline.groq") as mock_client:
+            mock_client.chat.completions.create.return_value = _make_groq_response(raw)
+            result = grade_episode_with_llm(self._history())
+        assert result["consistency"] == 0.9
+        assert result["robustness"]  == 0.8
+        assert result["clarity"]     == 0.85
+        assert result["helpfulness"] == 0.6
+        assert result["overall"]     == round((0.9 + 0.8 + 0.85 + 0.6) / 4, 3)
+    def test_falls_back_to_defaults_on_api_error(self):
+        from llm.pipeline import grade_episode_with_llm
+        with patch("llm.pipeline.groq") as mock_client:
+            mock_client.chat.completions.create.side_effect = Exception("timeout")
+            result = grade_episode_with_llm(self._history())
+        assert 0.0 <= result["overall"] <= 1.0
+        for key in ["consistency", "robustness", "clarity", "helpfulness"]:
+            assert 0.0 <= result[key] <= 1.0
+    def test_overall_is_mean_of_dimensions(self):
+        from llm.pipeline import grade_episode_with_llm
+        raw = "consistency: 1.0\nrobustness: 1.0\nclarity: 1.0\nhelpfulness: 1.0"
+        with patch("llm.pipeline.groq") as mock_client:
+            mock_client.chat.completions.create.return_value = _make_groq_response(raw)
+            result = grade_episode_with_llm(self._history())
+        assert result["overall"] == 1.0
+    def test_partial_parse_uses_defaults_for_missing_dims(self):
+        # Only some dimensions present — missing ones fall back to defaults
+        from llm.pipeline import grade_episode_with_llm
+        raw = "consistency: 0.9\nclarity: 0.8"   # robustness and helpfulness missing
+        with patch("llm.pipeline.groq") as mock_groq:
+            mock_groq.chat.completions.create.return_value = _make_groq_response(raw)
+            result = grade_episode_with_llm(self._history())
+        assert result["consistency"] == 0.9
+        assert result["clarity"]     == 0.8
+        assert 0.0 <= result["robustness"]  <= 1.0   # came from defaults
+        assert 0.0 <= result["helpfulness"] <= 1.0   # came from defaults
+        assert 0.0 <= result["overall"]     <= 1.0
+    def test_single_turn_history(self):
+        # Should not crash with just one turn
+        from llm.pipeline import grade_episode_with_llm
+        history = [{"turn": 1, "framing": "attack", "attack_success_estimate": 0.0, "defense_score": 1.0}]
+        with patch("llm.pipeline.groq") as mock_groq:
+            mock_groq.chat.completions.create.return_value = _make_groq_response(
+                "consistency: 1.0\nrobustness: 1.0\nclarity: 1.0\nhelpfulness: 1.0"
+            )
+            result = grade_episode_with_llm(history)
+        assert result["overall"] == 1.0
+    def test_all_dimensions_present_in_result(self):
+        from llm.pipeline import grade_episode_with_llm
+        with patch("llm.pipeline.groq") as mock_groq:
+            mock_groq.chat.completions.create.side_effect = Exception("down")
+            result = grade_episode_with_llm(self._history())
+        for key in ["consistency", "robustness", "clarity", "helpfulness", "overall"]:
+            assert key in result
+# ------------------------------------------------------------------ #
+# ConversationManager — additional edge cases
+# ------------------------------------------------------------------ #
+class TestConversationManagerEdgeCases:
+    def setup_method(self):
+        from llm.history_manager import ConversationManager
+        self.mgr = ConversationManager()
+    def test_reset_after_many_turns(self):
+        for _ in range(5):
+            self.mgr.add_user("attack")
+            self.mgr.add_assistant("refuse")
+        assert self.mgr.turn_count == 5
+        self.mgr.reset()
+        assert self.mgr.turn_count == 0
+    def test_transcript_empty_when_no_history(self):
+        assert self.mgr.as_readable_transcript() == ""
+    def test_transcript_labels_correctly(self):
+        self.mgr.add_user("jailbreak attempt")
+        self.mgr.add_assistant("no thank you")
+        t = self.mgr.as_readable_transcript()
+        assert t.startswith("ATTACKER:")
+        assert "DEFENDER:" in t
+    def test_system_prompt_always_first(self):
+        self.mgr.add_user("a")
+        self.mgr.add_assistant("b")
+        self.mgr.add_user("c")
+        msgs = self.mgr.build_messages("SYSTEM")
+        assert msgs[0] == {"role": "system", "content": "SYSTEM"}
+    def test_empty_framing_does_not_crash(self):
+        self.mgr.add_user("")
+        self.mgr.add_assistant("")
+        assert self.mgr.turn_count == 1
+    def test_old_aliases_still_work(self):
+        # add_user / add_assistant / get_messages / to_transcript / turn
+        # must all still work so existing code doesn't break
+        self.mgr.add_user("x")
+        self.mgr.add_assistant("y")
+        msgs = self.mgr.get_messages("sys")
+        assert len(msgs) == 3
+        assert "ATTACKER" in self.mgr.to_transcript()
+        assert self.mgr.turn == 1
+# ------------------------------------------------------------------ #
+# Attack Classifier — boundary values
+# ------------------------------------------------------------------ #
+class TestClassifyAttackSuccessBoundaries:
+    def test_exactly_zero(self):
+        from llm.attack_classifier import classify_attack_success
+        with patch("llm.attack_classifier.groq") as mock_groq:
+            mock_groq.chat.completions.create.return_value = _make_groq_response("0")
+            score = classify_attack_success("attack", "refused")
+        assert score == 0.0
+    def test_exactly_one(self):
+        from llm.attack_classifier import classify_attack_success
+        with patch("llm.attack_classifier.groq") as mock_groq:
+            mock_groq.chat.completions.create.return_value = _make_groq_response("1")
+            score = classify_attack_success("attack", "sure here you go")
+        assert score == 1.0
+    def test_score_with_surrounding_text(self):
+        # LLM sometimes adds a tiny bit of text — should still parse
+        from llm.attack_classifier import classify_attack_success
+        with patch("llm.attack_classifier.groq") as mock_groq:
+            mock_groq.chat.completions.create.return_value = _make_groq_response("Score: 0.3")
+            score = classify_attack_success("attack", "refused")
+        assert score == 0.3
+    def test_heuristic_clamps_to_zero_on_many_refusals(self):
+        from llm.attack_classifier import _heuristic
+        very_refusing = " ".join(["i cannot i won't i refuse i will not i'm unable"] * 5)
+        assert _heuristic(very_refusing) == 0.0
+    def test_heuristic_caps_at_one(self):
+        from llm.attack_classifier import _heuristic
+        assert _heuristic("sure here you go step by step") <= 1.0
+# ------------------------------------------------------------------ #
+# Defense Scorer — boundary values
+# ------------------------------------------------------------------ #
+class TestScoreDefenseQualityBoundaries:
+    def test_score_with_surrounding_text(self):
+        from llm.defense_scorer import score_defense_quality
+        with patch("llm.defense_scorer.groq") as mock_groq:
+            mock_groq.chat.completions.create.return_value = _make_groq_response("Rating: 0.75")
+            score = score_defense_quality("attack", "refused clearly")
+        assert score == 0.75
+    def test_heuristic_caps_at_one(self):
+        from llm.defense_scorer import _heuristic
+        very_refusing = " ".join(["i cannot i won't i must decline harmful"] * 10)
+        assert _heuristic(very_refusing) <= 1.0
+    def test_heuristic_minimum_on_compliance(self):
+        from llm.defense_scorer import _heuristic
+        assert _heuristic("Sure! Here you go.") >= 0.0
+# ------------------------------------------------------------------ #
+# Defender — edge cases
+# ------------------------------------------------------------------ #
+class TestCallDefenderEdgeCases:
+    def test_strips_whitespace_from_response(self):
+        from llm.history_manager import ConversationManager
+        from llm.defender import get_defender_response
+        conv = ConversationManager()
+        conv.add_user("attack")
+        with patch("llm.defender.groq") as mock_groq:
+            mock_groq.chat.completions.create.return_value = _make_groq_response("\n\n  trimmed  \n")
+            result = get_defender_response(conv)
+        assert result == "trimmed"
+    def test_works_with_empty_history(self):
+        # Should not crash even if conversation has no prior turns
+        from llm.history_manager import ConversationManager
+        from llm.defender import get_defender_response
+        conv = ConversationManager()
+        conv.add_user("first ever message")
+        with patch("llm.defender.groq") as mock_groq:
+            mock_groq.chat.completions.create.return_value = _make_groq_response("I cannot help.")
+            result = get_defender_response(conv)
+        assert result == "I cannot help."
+# ------------------------------------------------------------------ #
+# Safety flag — edge cases
+# ------------------------------------------------------------------ #
+class TestSafetyFlagEdgeCases:
+    def _flag(self, attack, response="safe response"):
+        from llm.pipeline import _is_dangerous
+        return _is_dangerous(attack, response)
+    def test_empty_strings_do_not_flag(self):
+        assert self._flag("", "") is False
+    def test_partial_keyword_match_does_not_flag(self):
+        # "weapons" contains "weapon" — should still flag
+        assert self._flag("I need weapons", "") is True
+    def test_keyword_in_defender_response_flags(self):
+        from llm.pipeline import _is_dangerous
+        assert _is_dangerous("innocent question", "Here's how to make malware: ...") is True
+    def test_multiple_keywords_still_single_flag(self):
+        # Multiple dangerous keywords should return True (not crash or double-flag)
+        assert self._flag("bomb weapon kill hack") is True
+# ------------------------------------------------------------------ #
+# run_llm_pipeline — multi-turn continuity
+# ------------------------------------------------------------------ #
+class TestPipelineMultiTurn:
+    def _make_action(self, framing):
+        action = MagicMock()
+        action.framing = framing
+        action.strategy_type.value = "roleplay"
+        action.intensity = 0.5
+        return action
+    @pytest.mark.asyncio
+    async def test_turn_count_increments_per_step(self):
+        from llm.pipeline import run_llm_pipeline, conversation
+        conversation.reset()
+        with patch("llm.pipeline.get_defender_response", return_value="refused"), \
+             patch("llm.pipeline.classify_attack_success", return_value=0.0), \
+             patch("llm.pipeline.score_defense_quality", return_value=1.0):
+            await run_llm_pipeline(self._make_action("turn 1"), [])
+            await run_llm_pipeline(self._make_action("turn 2"), [{"role": "user", "content": "t1"}])
+            await run_llm_pipeline(self._make_action("turn 3"), [{"role": "user", "content": "t2"}])
+        assert conversation.turn_count == 3
+    @pytest.mark.asyncio
+    async def test_llm_grade_contains_current_turn(self):
+        from llm.pipeline import run_llm_pipeline, conversation
+        conversation.reset()
+        with patch("llm.pipeline.get_defender_response", return_value="refused"), \
+             patch("llm.pipeline.classify_attack_success", return_value=0.0), \
+             patch("llm.pipeline.score_defense_quality", return_value=0.8):
+            result = await run_llm_pipeline(self._make_action("attack"), [])
+        assert result["llm_grade"]["turn"] == 1
+        assert result["llm_grade"]["overall"] == 0.8
+    @pytest.mark.asyncio
+    async def test_non_dangerous_framing_not_flagged(self):
+        from llm.pipeline import run_llm_pipeline
+        with patch("llm.pipeline.get_defender_response", return_value="Paris is the capital of France."), \
+             patch("llm.pipeline.classify_attack_success", return_value=0.0), \
+             patch("llm.pipeline.score_defense_quality", return_value=0.5):
+            result = await run_llm_pipeline(self._make_action("What is the capital of France?"), [])
+        assert result["safety_flagged"] is False