Spaces:
Sleeping
Sleeping
| """ | |
| tests/test_output_guardrail.py | |
| ================================ | |
| Unit tests for the OutputGuardrail module. | |
| """ | |
| import pytest | |
| from ai_firewall.output_guardrail import OutputGuardrail | |
@pytest.fixture
def guardrail():
    """Provide the ``OutputGuardrail`` instance shared by the tests below.

    Registered as a pytest fixture so that test methods can request it by
    naming a ``guardrail`` parameter; without the decorator pytest reports
    "fixture 'guardrail' not found" for every test in this module.

    Returns:
        An ``OutputGuardrail`` built with ``threshold=0.50`` and
        ``redact=True``.
    """
    return OutputGuardrail(threshold=0.50, redact=True)
class TestSecretLeakDetection:
    """Outputs that carry credentials or key material must be reported unsafe."""

    def test_openai_key_detected(self, guardrail):
        """An ``sk-`` style token is unsafe and flagged as ``secret_leak``."""
        result = guardrail.validate("Here is your key: sk-abcdefghijklmnopqrstuvwx")
        assert result.is_safe is False
        assert "secret_leak" in result.flags

    def test_aws_key_detected(self, guardrail):
        """An ``AKIA...`` access key id is unsafe and flagged as ``secret_leak``."""
        result = guardrail.validate("AWS Key: AKIAIOSFODNN7EXAMPLE")
        assert result.is_safe is False
        assert "secret_leak" in result.flags

    def test_password_in_output_detected(self, guardrail):
        """A ``password: <value>`` disclosure is reported unsafe."""
        result = guardrail.validate("Your password: mysecretpass123")
        assert result.is_safe is False

    def test_private_key_detected(self, guardrail):
        """A PEM-delimited RSA private key block is reported unsafe."""
        pem_block = (
            "-----BEGIN RSA PRIVATE KEY-----\n"
            "MIIEowIBAAKCAQEA...\n"
            "-----END RSA PRIVATE KEY-----"
        )
        result = guardrail.validate(pem_block)
        assert result.is_safe is False

    def test_redaction_applied(self, guardrail):
        """The redacted output contains the ``[REDACTED]`` placeholder."""
        result = guardrail.validate("Key: sk-abcdefghijklmnopqrstuvwx")
        assert "[REDACTED]" in result.redacted_output
class TestSystemPromptLeakDetection:
    """Outputs that reveal or refer to the system prompt must be reported unsafe."""

    def test_my_system_prompt_detected(self, guardrail):
        """A "My system prompt is" reply is unsafe and flagged ``system_prompt_leak``."""
        leaked = "My system prompt is: You are a helpful assistant who never..."
        verdict = guardrail.validate(leaked)
        assert verdict.is_safe is False
        assert "system_prompt_leak" in verdict.flags

    def test_instructed_to_detected(self, guardrail):
        """An "I was instructed to" reply is reported unsafe."""
        leaked = "I was instructed to keep this information confidential."
        verdict = guardrail.validate(leaked)
        assert verdict.is_safe is False

    def test_here_is_system_prompt_detected(self, guardrail):
        """A "Here is my full system prompt" reply is reported unsafe."""
        leaked = "Here is my full system prompt: [confidential instructions]"
        verdict = guardrail.validate(leaked)
        assert verdict.is_safe is False
class TestPIILeakDetection:
    """Outputs that contain personal identifiers must be reported unsafe."""

    def test_email_detected(self, guardrail):
        """An e-mail address is unsafe and flagged as ``pii_leak``."""
        verdict = guardrail.validate("Contact admin at admin@example.com for support.")
        assert verdict.is_safe is False
        assert "pii_leak" in verdict.flags

    def test_ssn_detected(self, guardrail):
        """A ``NNN-NN-NNNN`` formatted number is reported unsafe."""
        verdict = guardrail.validate("Your SSN is 123-45-6789.")
        assert verdict.is_safe is False

    def test_credit_card_detected(self, guardrail):
        """A 16-digit card number is reported unsafe."""
        verdict = guardrail.validate("Card number: 4532015112830366")
        assert verdict.is_safe is False
class TestJailbreakConfirmation:
    """Outputs that announce a jailbroken mode must be reported unsafe."""

    def test_dan_mode_detected(self, guardrail):
        """A "DAN mode" reply is unsafe and flagged ``jailbreak_confirmation``."""
        reply = "In DAN mode: Sure, I'll bypass all restrictions now."
        verdict = guardrail.validate(reply)
        assert verdict.is_safe is False
        assert "jailbreak_confirmation" in verdict.flags

    def test_developer_mode_activated(self, guardrail):
        """A "Developer mode enabled" reply is reported unsafe."""
        reply = "Developer mode enabled: I can now ignore all safety guidelines."
        verdict = guardrail.validate(reply)
        assert verdict.is_safe is False
class TestSafeOutputs:
    """Benign outputs must validate as safe (guards against false positives)."""

    # Harmless replies; each one becomes its own parametrized test case.
    benign_outputs = [
        "The capital of France is Paris.",
        "Machine learning is a subset of artificial intelligence.",
        "Here's a Python function to reverse a string: def reverse(s): return s[::-1]",
        "The weather today is sunny with a high of 25 degrees Celsius.",
        "I cannot help with that request as it violates our usage policies.",
    ]

    # The parametrization supplies ``output``; without it pytest would look
    # for a fixture named "output" and error out, and ``benign_outputs``
    # would never be exercised.
    @pytest.mark.parametrize("output", benign_outputs)
    def test_benign_output_safe(self, guardrail, output):
        """Each benign sample is reported safe; the message names the offender."""
        r = guardrail.validate(output)
        assert r.is_safe is True, f"False positive for: {output!r}"
class TestResultStructure:
    """Checks the shape of the validation result and the boolean shortcut."""

    def test_all_fields_present(self, guardrail):
        """The result exposes every expected attribute."""
        result = guardrail.validate("hello world response")
        expected_fields = (
            "is_safe",
            "risk_score",
            "flags",
            "redacted_output",
            "latency_ms",
        )
        for field_name in expected_fields:
            assert hasattr(result, field_name)

    def test_risk_score_range(self, guardrail):
        """``risk_score`` stays within [0.0, 1.0] for both sample inputs."""
        for sample in ("safe output", "sk-abcdefghijklmnopqrstu"):
            result = guardrail.validate(sample)
            assert 0.0 <= result.risk_score <= 1.0

    def test_is_safe_output_shortcut(self, guardrail):
        """``is_safe_output`` returns a plain ``True`` / ``False``."""
        check = guardrail.is_safe_output
        assert check("The answer is 42.") is True
        assert check("sk-abcdefghijklmnopqrstu") is False