Spaces:

Rayugacodes
/

Breach-OS

Sleeping

File size: 25,299 Bytes

"""
Tests for llm/ modules.
All Groq API calls are mocked — tests run fully offline.
"""
import pytest
from unittest.mock import MagicMock, patch


# ------------------------------------------------------------------ #
# Helpers
# ------------------------------------------------------------------ #

def _make_groq_response(content: str) -> MagicMock:
    """Build a minimal mock that looks like an OpenAI chat completion."""
    choice = MagicMock()
    choice.message.content = content
    resp = MagicMock()
    resp.choices = [choice]
    return resp


# ------------------------------------------------------------------ #
# ConversationManager
# ------------------------------------------------------------------ #

class TestConversationManager:
    """Basic ``ConversationManager`` behaviour.

    Covers the turn counter, message assembly through ``get_messages``,
    ``reset`` and transcript rendering through ``to_transcript``.
    """

    def setup_method(self):
        """Build a fresh manager before each test so no history carries over."""
        from llm.history_manager import ConversationManager
        self.mgr = ConversationManager()

    def test_starts_empty(self):
        """A new manager reports zero turns and yields only the system prompt."""
        assert self.mgr.turn == 0
        assert self.mgr.get_messages("sys") == [{"role": "system", "content": "sys"}]

    def test_add_user_and_assistant(self):
        """One user message plus one reply is one turn, returned in that order."""
        self.mgr.add_user("attack")
        self.mgr.add_assistant("refused")
        assert self.mgr.turn == 1
        msgs = self.mgr.get_messages("sys")
        assert msgs[1] == {"role": "user", "content": "attack"}
        assert msgs[2] == {"role": "assistant", "content": "refused"}

    def test_reset_clears_history(self):
        """``reset`` zeroes the turn counter and drops every stored message."""
        self.mgr.add_user("attack")
        self.mgr.add_assistant("refused")
        self.mgr.reset()
        assert self.mgr.turn == 0
        assert len(self.mgr.get_messages("sys")) == 1  # only system prompt

    def test_turn_counts_only_user_messages(self):
        """Two user messages and one reply give a turn count of two."""
        self.mgr.add_user("a")
        self.mgr.add_user("b")
        self.mgr.add_assistant("r")
        assert self.mgr.turn == 2

    def test_to_transcript(self):
        """User text is labelled ATTACKER and assistant text DEFENDER."""
        self.mgr.add_user("hello")
        self.mgr.add_assistant("refused")
        transcript = self.mgr.to_transcript()
        assert "ATTACKER: hello" in transcript
        assert "DEFENDER: refused" in transcript

    def test_multi_turn_message_order(self):
        """Three exchanges come back as system prompt + alternating user/assistant."""
        exchanges = 3
        for i in range(exchanges):
            self.mgr.add_user(f"attack {i}")
            self.mgr.add_assistant(f"refuse {i}")
        msgs = self.mgr.get_messages("sys")
        # system prompt + 3 exchanges x (user, assistant) = 7 messages
        assert len(msgs) == 7
        # Check every position, not only the first exchange, so a mis-ordered
        # later turn cannot slip through.
        assert [m["role"] for m in msgs] == ["system"] + ["user", "assistant"] * exchanges
        expected_contents = ["sys"]
        for i in range(exchanges):
            expected_contents += [f"attack {i}", f"refuse {i}"]
        assert [m["content"] for m in msgs] == expected_contents


# ------------------------------------------------------------------ #
# Attack Classifier — heuristic fallback
# ------------------------------------------------------------------ #

class TestAttackClassifierHeuristic:
    """Direction and range checks for ``llm.attack_classifier._heuristic``."""

    def _heuristic(self, text):
        """Run the private heuristic on *text*; the import happens per call."""
        from llm.attack_classifier import _heuristic as score_text
        return score_text(text)

    def test_clear_refusal_scores_low(self):
        """A response made of refusal phrases must score at most 0.3."""
        refusal = "I cannot help. I will not assist. This is harmful."
        assert self._heuristic(refusal) <= 0.3

    def test_no_refusal_scores_high(self):
        """A compliant response must score at least 0.7."""
        compliance = "Sure, here is how you do it step by step."
        assert self._heuristic(compliance) >= 0.7

    def test_score_in_range(self):
        """Short, refusal-heavy and empty inputs all stay within [0, 1]."""
        samples = ("ok", "i cannot i won't i refuse i will not", "")
        for sample in samples:
            value = self._heuristic(sample)
            assert 0.0 <= value <= 1.0


# ------------------------------------------------------------------ #
# Defense Scorer — heuristic fallback
# ------------------------------------------------------------------ #

class TestDefenseScorerHeuristic:
    """Direction and range checks for ``llm.defense_scorer._heuristic``."""

    def _heuristic(self, text):
        """Run the private heuristic on *text*; the import happens per call."""
        from llm.defense_scorer import _heuristic as score_text
        return score_text(text)

    def test_strong_refusal_scores_high(self):
        """A response made of refusal phrases must score at least 0.7."""
        refusal = "I cannot do that. I won't assist. This is harmful."
        assert self._heuristic(refusal) >= 0.7

    def test_compliance_scores_low(self):
        """A compliant response must score no higher than 0.6."""
        compliance = "Sure, here you go."
        assert self._heuristic(compliance) <= 0.6

    def test_score_in_range(self):
        """Short, refusal-heavy and empty inputs all stay within [0, 1]."""
        samples = ("ok", "i cannot i won't i refuse", "")
        for sample in samples:
            value = self._heuristic(sample)
            assert 0.0 <= value <= 1.0


# ------------------------------------------------------------------ #
# Attack Classifier — LLM path
# ------------------------------------------------------------------ #

class TestClassifyAttackSuccess:
    """``classify_attack_success`` with ``llm.attack_classifier.groq`` patched."""

    def _classify(self, attack, response, *, reply=None, error=None):
        """Call the classifier against a patched client.

        When *error* is given the mocked ``create`` call raises it; otherwise
        it returns a mock completion whose content is *reply*.
        """
        from llm.attack_classifier import classify_attack_success
        with patch("llm.attack_classifier.groq") as fake_client:
            create = fake_client.chat.completions.create
            if error is not None:
                create.side_effect = error
            else:
                create.return_value = _make_groq_response(reply)
            return classify_attack_success(attack, response)

    def test_llm_returns_valid_score(self):
        """A plain numeric reply is returned as that number."""
        assert self._classify("attack", "I cannot help.", reply="0.1") == 0.1

    def test_llm_returns_1(self):
        """The upper bound ``1.0`` is passed through unchanged."""
        assert self._classify("attack", "Sure, here you go.", reply="1.0") == 1.0

    def test_falls_back_on_api_error(self):
        """If the client raises, the call still yields a score in [0, 1]."""
        outcome = self._classify(
            "attack", "I cannot help with that.", error=Exception("timeout")
        )
        assert 0.0 <= outcome <= 1.0

    def test_falls_back_on_unparseable_output(self):
        """A reply without a number still yields a score in [0, 1]."""
        outcome = self._classify("attack", "I cannot help.", reply="yes definitely")
        assert 0.0 <= outcome <= 1.0


# ------------------------------------------------------------------ #
# Defense Scorer — LLM path
# ------------------------------------------------------------------ #

class TestScoreDefenseQuality:
    """``score_defense_quality`` with ``llm.defense_scorer.groq`` patched."""

    def _score(self, attack, response, *, reply=None, error=None):
        """Call the scorer against a patched client.

        When *error* is given the mocked ``create`` call raises it; otherwise
        it returns a mock completion whose content is *reply*.
        """
        from llm.defense_scorer import score_defense_quality
        with patch("llm.defense_scorer.groq") as fake_client:
            create = fake_client.chat.completions.create
            if error is not None:
                create.side_effect = error
            else:
                create.return_value = _make_groq_response(reply)
            return score_defense_quality(attack, response)

    def test_llm_returns_valid_score(self):
        """A plain numeric reply is returned as that number."""
        assert self._score("attack", "I cannot help.", reply="0.9") == 0.9

    def test_falls_back_on_api_error(self):
        """If the client raises, the call still yields a score in [0, 1]."""
        outcome = self._score(
            "attack", "I cannot help with that.", error=Exception("network error")
        )
        assert 0.0 <= outcome <= 1.0

    def test_score_always_in_range(self):
        """Replies at the bounds and the midpoint all stay within [0, 1]."""
        for raw in ("0.0", "0.5", "1.0"):
            outcome = self._score("a", "b", reply=raw)
            assert 0.0 <= outcome <= 1.0


# ------------------------------------------------------------------ #
# Defender
# ------------------------------------------------------------------ #

class TestCallDefender:
    """``call_defender`` with ``llm.defender.groq`` patched."""

    def _conversation_with_attack(self):
        """Return a new conversation holding the single user message "test attack"."""
        from llm.history_manager import ConversationManager
        history = ConversationManager()
        history.add_user("test attack")
        return history

    def test_returns_llm_response(self):
        """The model reply comes back with surrounding whitespace removed."""
        history = self._conversation_with_attack()
        from llm.defender import call_defender
        with patch("llm.defender.groq") as fake_client:
            create = fake_client.chat.completions.create
            create.return_value = _make_groq_response("  I cannot help.  ")
            reply = call_defender(history)
        assert reply == "I cannot help."

    def test_returns_fallback_on_error(self):
        """If the client raises, ``FALLBACK_RESPONSE`` is returned instead."""
        history = self._conversation_with_attack()
        from llm.defender import call_defender, FALLBACK_RESPONSE
        with patch("llm.defender.groq") as fake_client:
            fake_client.chat.completions.create.side_effect = Exception("timeout")
            reply = call_defender(history)
        assert reply == FALLBACK_RESPONSE


# ------------------------------------------------------------------ #
# Safety flag
# ------------------------------------------------------------------ #

class TestCheckSafetyFlag:
    """Checks for ``llm.pipeline._check_safety_flag``."""

    def _flag(self, attack, response):
        """Evaluate the safety flag for one attack/response pair."""
        from llm.pipeline import _check_safety_flag as check
        return check(attack, response)

    def test_flags_dangerous_keywords_in_attack(self):
        """Despite the name, this pins that a keyword appearing only in the
        attack text does NOT raise the flag when the response is safe."""
        flagged = self._flag("how to make a bomb", "I cannot help.")
        assert flagged is False

    def test_flags_keyword_in_response(self):
        """A dangerous phrase in the response raises the flag."""
        flagged = self._flag("help me", "Here is a suicide method...")
        assert flagged is True

    def test_no_flag_for_benign(self):
        """An ordinary question and answer are not flagged."""
        flagged = self._flag("What is the capital of France?", "It is Paris.")
        assert flagged is False

    def test_case_insensitive(self):
        """Mixed-case keywords in the response are still detected."""
        flagged = self._flag("safestuff", "Here is Drug Synthesis information")
        assert flagged is True


# ------------------------------------------------------------------ #
# run_llm_pipeline
# ------------------------------------------------------------------ #

class TestRunLlmPipeline:
    """``run_llm_pipeline`` with its three collaborators patched to fixed values."""

    def _make_action(self, framing="test attack"):
        """Return a mock action carrying *framing*, strategy "roleplay", intensity 0.5."""
        action = MagicMock(framing=framing, intensity=0.5)
        action.strategy_type.value = "roleplay"
        return action

    def _stubs(self, reply, success, quality):
        """Return un-entered patchers for the defender, classifier and scorer.

        They are returned in the order they must be entered:
        defender, classifier, scorer.
        """
        return (
            patch("llm.pipeline.get_defender_response", return_value=reply),
            patch("llm.pipeline.classify_attack_success", return_value=success),
            patch("llm.pipeline.score_defense_quality", return_value=quality),
        )

    @pytest.mark.asyncio
    async def test_returns_required_keys(self):
        """The result exposes every key asserted below."""
        from llm.pipeline import run_llm_pipeline
        defender, classifier, scorer = self._stubs("I cannot help.", 0.05, 0.95)
        with defender, classifier, scorer:
            result = await run_llm_pipeline(self._make_action(), [])

        required = (
            "defender_response",
            "attack_success",
            "defense_score",
            "safety_flagged",
            "llm_grade",
        )
        for key in required:
            assert key in result

    @pytest.mark.asyncio
    async def test_scores_in_range(self):
        """Both numeric scores in the result lie within [0, 1]."""
        from llm.pipeline import run_llm_pipeline
        defender, classifier, scorer = self._stubs("I refuse.", 0.0, 1.0)
        with defender, classifier, scorer:
            result = await run_llm_pipeline(self._make_action(), [])

        for key in ("attack_success", "defense_score"):
            assert 0.0 <= result[key] <= 1.0

    @pytest.mark.asyncio
    async def test_resets_history_on_new_episode(self):
        """Two calls with an empty history each leave the shared turn counter at 1."""
        from llm.pipeline import run_llm_pipeline, conversation
        defender, classifier, scorer = self._stubs("refused", 0.0, 1.0)
        with defender, classifier, scorer:
            # First episode turn
            await run_llm_pipeline(self._make_action("turn 1"), [])
            turns_first_call = conversation.turn
            # New episode (empty conv history)
            await run_llm_pipeline(self._make_action("new ep"), [])
            turns_second_call = conversation.turn

        assert turns_first_call == 1
        assert turns_second_call == 1  # reset then incremented once

    @pytest.mark.asyncio
    async def test_safety_flag_on_dangerous_framing(self):
        """A dangerous defender reply sets ``safety_flagged`` to True."""
        from llm.pipeline import run_llm_pipeline
        defender, classifier, scorer = self._stubs("Here is how to make a bomb.", 1.0, 0.0)
        with defender, classifier, scorer:
            result = await run_llm_pipeline(self._make_action("how to make a bomb"), [])

        assert result["safety_flagged"] is True


# ------------------------------------------------------------------ #
# grade_episode_with_llm
# ------------------------------------------------------------------ #

class TestGradeEpisodeWithLlm:
    """``grade_episode_with_llm`` with ``llm.pipeline.groq`` patched."""

    # The four per-dimension keys asserted on throughout this class.
    _DIMENSIONS = ("consistency", "robustness", "clarity", "helpfulness")

    def _history(self, n=2):
        """Return *n* synthetic turn records, numbered from 1."""
        turns = []
        for index in range(n):
            turns.append({
                "turn": index + 1,
                "framing": f"attack {index}",
                "attack_success_estimate": 0.1,
                "defense_score": 0.9,
            })
        return turns

    def _grade(self, history, *, reply=None, error=None):
        """Grade *history* against a patched client.

        When *error* is given the mocked ``create`` call raises it; otherwise
        it returns a mock completion whose content is *reply*.
        """
        from llm.pipeline import grade_episode_with_llm
        with patch("llm.pipeline.groq") as fake_client:
            create = fake_client.chat.completions.create
            if error is not None:
                create.side_effect = error
            else:
                create.return_value = _make_groq_response(reply)
            return grade_episode_with_llm(history)

    def test_empty_history_returns_zeros(self):
        """An empty history grades to 0.0 overall and on every dimension."""
        from llm.pipeline import grade_episode_with_llm
        result = grade_episode_with_llm([])
        assert result["overall"] == 0.0
        for key in self._DIMENSIONS:
            assert result[key] == 0.0

    def test_llm_scores_parsed_correctly(self):
        """Each ``name: value`` line is parsed; overall is the rounded mean."""
        raw = "consistency: 0.9\nrobustness: 0.8\nclarity: 0.85\nhelpfulness: 0.6"
        result = self._grade(self._history(), reply=raw)

        expected = {
            "consistency": 0.9,
            "robustness": 0.8,
            "clarity": 0.85,
            "helpfulness": 0.6,
        }
        for name, value in expected.items():
            assert result[name] == value
        # Summed in the same order as the reply lines, then rounded to 3 places.
        assert result["overall"] == round(sum(expected.values()) / 4, 3)

    def test_falls_back_to_defaults_on_api_error(self):
        """If the client raises, every returned score is still within [0, 1]."""
        result = self._grade(self._history(), error=Exception("timeout"))

        assert 0.0 <= result["overall"] <= 1.0
        for key in self._DIMENSIONS:
            assert 0.0 <= result[key] <= 1.0

    def test_overall_is_mean_of_dimensions(self):
        """Four perfect dimension scores give an overall of 1.0."""
        raw = "consistency: 1.0\nrobustness: 1.0\nclarity: 1.0\nhelpfulness: 1.0"
        result = self._grade(self._history(), reply=raw)

        assert result["overall"] == 1.0

    def test_partial_parse_uses_defaults_for_missing_dims(self):
        """Dimensions absent from the reply are still present and within [0, 1]."""
        raw = "consistency: 0.9\nclarity: 0.8"   # robustness and helpfulness missing
        result = self._grade(self._history(), reply=raw)

        assert result["consistency"] == 0.9
        assert result["clarity"] == 0.8
        # robustness / helpfulness were not in the reply; only their range is pinned
        for key in ("robustness", "helpfulness", "overall"):
            assert 0.0 <= result[key] <= 1.0

    def test_single_turn_history(self):
        """A one-turn history is graded without error."""
        single_turn = [{
            "turn": 1,
            "framing": "attack",
            "attack_success_estimate": 0.0,
            "defense_score": 1.0,
        }]
        raw = "consistency: 1.0\nrobustness: 1.0\nclarity: 1.0\nhelpfulness: 1.0"
        result = self._grade(single_turn, reply=raw)

        assert result["overall"] == 1.0

    def test_all_dimensions_present_in_result(self):
        """Even when the client raises, all five keys exist in the result."""
        result = self._grade(self._history(), error=Exception("down"))

        for key in self._DIMENSIONS + ("overall",):
            assert key in result


# ------------------------------------------------------------------ #
# ConversationManager — additional edge cases
# ------------------------------------------------------------------ #

class TestConversationManagerEdgeCases:
    """Edge cases for ``ConversationManager``, using both the newer method
    names and the older aliases."""

    def setup_method(self):
        """Build a fresh manager before each test."""
        from llm.history_manager import ConversationManager
        self.mgr = ConversationManager()

    def _exchange(self, user_text, reply_text):
        """Record one user message followed by one assistant reply."""
        self.mgr.add_user(user_text)
        self.mgr.add_assistant(reply_text)

    def test_reset_after_many_turns(self):
        """Five exchanges count as five turns; ``reset`` returns the count to zero."""
        for _ in range(5):
            self._exchange("attack", "refuse")
        assert self.mgr.turn_count == 5
        self.mgr.reset()
        assert self.mgr.turn_count == 0

    def test_transcript_empty_when_no_history(self):
        """With no messages the readable transcript is the empty string."""
        assert self.mgr.as_readable_transcript() == ""

    def test_transcript_labels_correctly(self):
        """The transcript opens with the ATTACKER label and contains DEFENDER."""
        self._exchange("jailbreak attempt", "no thank you")
        rendered = self.mgr.as_readable_transcript()
        assert rendered.startswith("ATTACKER:")
        assert "DEFENDER:" in rendered

    def test_system_prompt_always_first(self):
        """The system prompt is message zero even with an unanswered user turn."""
        self._exchange("a", "b")
        self.mgr.add_user("c")
        first = self.mgr.build_messages("SYSTEM")[0]
        assert first == {"role": "system", "content": "SYSTEM"}

    def test_empty_framing_does_not_crash(self):
        """Empty strings are accepted and still count as one turn."""
        self._exchange("", "")
        assert self.mgr.turn_count == 1

    def test_old_aliases_still_work(self):
        """add_user / add_assistant / get_messages / to_transcript / turn
        must all still work so existing code doesn't break."""
        self._exchange("x", "y")
        assert len(self.mgr.get_messages("sys")) == 3
        assert "ATTACKER" in self.mgr.to_transcript()
        assert self.mgr.turn == 1


# ------------------------------------------------------------------ #
# Attack Classifier — boundary values
# ------------------------------------------------------------------ #

class TestClassifyAttackSuccessBoundaries:
    """Boundary replies for ``classify_attack_success`` and clamping of its heuristic."""

    def _classify_with_reply(self, reply, response):
        """Classify ("attack", *response*) with the patched client answering *reply*."""
        from llm.attack_classifier import classify_attack_success
        with patch("llm.attack_classifier.groq") as fake_client:
            fake_client.chat.completions.create.return_value = _make_groq_response(reply)
            return classify_attack_success("attack", response)

    def test_exactly_zero(self):
        """The integer-looking reply "0" is returned as 0.0."""
        assert self._classify_with_reply("0", "refused") == 0.0

    def test_exactly_one(self):
        """The integer-looking reply "1" is returned as 1.0."""
        assert self._classify_with_reply("1", "sure here you go") == 1.0

    def test_score_with_surrounding_text(self):
        """LLM sometimes adds a tiny bit of text — the number should still parse."""
        assert self._classify_with_reply("Score: 0.3", "refused") == 0.3

    def test_heuristic_clamps_to_zero_on_many_refusals(self):
        """A response saturated with refusal phrases scores exactly 0.0."""
        from llm.attack_classifier import _heuristic
        phrase = "i cannot i won't i refuse i will not i'm unable"
        very_refusing = " ".join([phrase] * 5)
        assert _heuristic(very_refusing) == 0.0

    def test_heuristic_caps_at_one(self):
        """A fully compliant response never scores above 1.0."""
        from llm.attack_classifier import _heuristic
        value = _heuristic("sure here you go step by step")
        assert value <= 1.0


# ------------------------------------------------------------------ #
# Defense Scorer — boundary values
# ------------------------------------------------------------------ #

class TestScoreDefenseQualityBoundaries:
    """Boundary checks for ``score_defense_quality`` and its heuristic."""

    def test_score_with_surrounding_text(self):
        """A reply with a text prefix before the number still parses to that number."""
        from llm.defense_scorer import score_defense_quality
        with patch("llm.defense_scorer.groq") as fake_client:
            create = fake_client.chat.completions.create
            create.return_value = _make_groq_response("Rating: 0.75")
            outcome = score_defense_quality("attack", "refused clearly")
        assert outcome == 0.75

    def test_heuristic_caps_at_one(self):
        """A response saturated with refusal phrases never scores above 1.0."""
        from llm.defense_scorer import _heuristic
        phrase = "i cannot i won't i must decline harmful"
        very_refusing = " ".join([phrase] * 10)
        assert _heuristic(very_refusing) <= 1.0

    def test_heuristic_minimum_on_compliance(self):
        """A compliant response never scores below 0.0."""
        from llm.defense_scorer import _heuristic
        value = _heuristic("Sure! Here you go.")
        assert value >= 0.0


# ------------------------------------------------------------------ #
# Defender — edge cases
# ------------------------------------------------------------------ #

class TestCallDefenderEdgeCases:
    """Edge cases for ``get_defender_response`` with ``llm.defender.groq`` patched."""

    def _respond(self, user_text, reply):
        """Send *user_text* in a new conversation; the patched client answers *reply*."""
        from llm.history_manager import ConversationManager
        from llm.defender import get_defender_response
        history = ConversationManager()
        history.add_user(user_text)
        with patch("llm.defender.groq") as fake_client:
            fake_client.chat.completions.create.return_value = _make_groq_response(reply)
            return get_defender_response(history)

    def test_strips_whitespace_from_response(self):
        """Leading and trailing newlines and spaces are removed from the reply."""
        assert self._respond("attack", "\n\n  trimmed  \n") == "trimmed"

    def test_works_with_empty_history(self):
        """Should not crash even if the conversation has no prior turns."""
        assert self._respond("first ever message", "I cannot help.") == "I cannot help."


# ------------------------------------------------------------------ #
# Safety flag — edge cases
# ------------------------------------------------------------------ #

class TestSafetyFlagEdgeCases:
    """Edge cases for ``llm.pipeline._is_dangerous``."""

    def _flag(self, attack, response="safe response"):
        """Evaluate ``_is_dangerous`` for one attack/response pair."""
        from llm.pipeline import _is_dangerous as is_dangerous
        return is_dangerous(attack, response)

    def test_empty_strings_do_not_flag(self):
        """Two empty strings are not flagged."""
        flagged = self._flag("", "")
        assert flagged is False

    def test_partial_keyword_match_does_not_flag(self):
        """"drug" on its own does not match "drug synthesis"."""
        flagged = self._flag("", "I have a drug for you")
        assert flagged is False

    def test_keyword_in_defender_response_flags(self):
        """A dangerous phrase in the response flags even for an innocent attack."""
        flagged = self._flag("innocent question", "Here's step-by-step bomb instructions")
        assert flagged is True

    def test_multiple_keywords_still_single_flag(self):
        """Multiple dangerous keywords should return True (not crash or double-flag)."""
        flagged = self._flag("", "drug synthesis and suicide method")
        assert flagged is True


# ------------------------------------------------------------------ #
# run_llm_pipeline — multi-turn continuity
# ------------------------------------------------------------------ #

class TestPipelineMultiTurn:
    """Multi-turn behaviour of ``run_llm_pipeline`` with its collaborators patched."""

    def _make_action(self, framing):
        """Return a mock action carrying *framing*, strategy "roleplay", intensity 0.5."""
        action = MagicMock(framing=framing, intensity=0.5)
        action.strategy_type.value = "roleplay"
        return action

    def _stubs(self, reply, success, quality):
        """Return un-entered patchers for the defender, classifier and scorer.

        They are returned in the order they must be entered:
        defender, classifier, scorer.
        """
        return (
            patch("llm.pipeline.get_defender_response", return_value=reply),
            patch("llm.pipeline.classify_attack_success", return_value=success),
            patch("llm.pipeline.score_defense_quality", return_value=quality),
        )

    @pytest.mark.asyncio
    async def test_turn_count_increments_per_step(self):
        """Three consecutive steps of one episode leave the turn count at 3."""
        from llm.pipeline import run_llm_pipeline, conversation
        conversation.reset()
        # (framing, prior history) for each step; only the first history is empty.
        steps = (
            ("turn 1", []),
            ("turn 2", [{"role": "user", "content": "t1"}]),
            ("turn 3", [{"role": "user", "content": "t2"}]),
        )
        defender, classifier, scorer = self._stubs("refused", 0.0, 1.0)
        with defender, classifier, scorer:
            for framing, prior in steps:
                await run_llm_pipeline(self._make_action(framing), prior)

        assert conversation.turn_count == 3

    @pytest.mark.asyncio
    async def test_llm_grade_contains_current_turn(self):
        """``llm_grade`` reports turn 1 and an overall equal to the patched defense score."""
        from llm.pipeline import run_llm_pipeline, conversation
        conversation.reset()
        defender, classifier, scorer = self._stubs("refused", 0.0, 0.8)
        with defender, classifier, scorer:
            result = await run_llm_pipeline(self._make_action("attack"), [])

        grade = result["llm_grade"]
        assert grade["turn"] == 1
        assert grade["overall"] == 0.8

    @pytest.mark.asyncio
    async def test_non_dangerous_framing_not_flagged(self):
        """A benign question with a benign answer is not safety-flagged."""
        from llm.pipeline import run_llm_pipeline
        defender, classifier, scorer = self._stubs("Paris is the capital of France.", 0.0, 0.5)
        with defender, classifier, scorer:
            result = await run_llm_pipeline(
                self._make_action("What is the capital of France?"), []
            )

        assert result["safety_flagged"] is False