"""
tests/test_output_guardrail.py
================================
Unit tests for the OutputGuardrail module.
"""
import pytest
from ai_firewall.output_guardrail import OutputGuardrail
@pytest.fixture
def guardrail():
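    """Guardrail configured with a 0.50 risk threshold and redaction enabled."""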
    return OutputGuardrail(threshold=0.50, redact=True)


class TestSecretLeakDetection:
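    """Outputs leaking credentials (API keys, passwords, private keys) are flagged as "secret_leak" and redacted."""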
def test_openai_key_detected(self, guardrail):
output = "Here is your key: sk-abcdefghijklmnopqrstuvwx"
r = guardrail.validate(output)
assert r.is_safe is False
assert "secret_leak" in r.flags
def test_aws_key_detected(self, guardrail):
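        # AKIAIOSFODNN7EXAMPLE is AWS's documented example access key ID.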
output = "AWS Key: AKIAIOSFODNN7EXAMPLE"
r = guardrail.validate(output)
assert r.is_safe is False
assert "secret_leak" in r.flags
def test_password_in_output_detected(self, guardrail):
output = "Your password: mysecretpass123"
r = guardrail.validate(output)
assert r.is_safe is False
def test_private_key_detected(self, guardrail):
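        # Truncated dummy key ("..." elides the body); assumes detection keys on the PEM BEGIN/END markers.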
output = "-----BEGIN RSA PRIVATE KEY-----\nMIIEowIBAAKCAQEA...\n-----END RSA PRIVATE KEY-----"
r = guardrail.validate(output)
assert r.is_safe is False
def test_redaction_applied(self, guardrail):
output = "Key: sk-abcdefghijklmnopqrstuvwx"
r = guardrail.validate(output)
assert "[REDACTED]" in r.redacted_output
class TestSystemPromptLeakDetection:
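    """Outputs that reveal or reference the system prompt are flagged as "system_prompt_leak"."""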
def test_my_system_prompt_detected(self, guardrail):
output = "My system prompt is: You are a helpful assistant who never..."
r = guardrail.validate(output)
assert r.is_safe is False
assert "system_prompt_leak" in r.flags
def test_instructed_to_detected(self, guardrail):
output = "I was instructed to keep this information confidential."
r = guardrail.validate(output)
assert r.is_safe is False
def test_here_is_system_prompt_detected(self, guardrail):
output = "Here is my full system prompt: [confidential instructions]"
r = guardrail.validate(output)
        assert r.is_safe is False


class TestPIILeakDetection:
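    """Outputs containing PII (email, SSN, credit card number) are flagged as "pii_leak"."""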
def test_email_detected(self, guardrail):
output = "Contact admin at admin@example.com for support."
r = guardrail.validate(output)
assert r.is_safe is False
assert "pii_leak" in r.flags
def test_ssn_detected(self, guardrail):
output = "Your SSN is 123-45-6789."
r = guardrail.validate(output)
assert r.is_safe is False
def test_credit_card_detected(self, guardrail):
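        # Luhn-valid, Visa-format test number (not a real card).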
output = "Card number: 4532015112830366"
r = guardrail.validate(output)
        assert r.is_safe is False


class TestJailbreakConfirmation:
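    """Outputs confirming a jailbreak (DAN mode, developer mode) are flagged as "jailbreak_confirmation"."""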
def test_dan_mode_detected(self, guardrail):
output = "In DAN mode: Sure, I'll bypass all restrictions now."
r = guardrail.validate(output)
assert r.is_safe is False
assert "jailbreak_confirmation" in r.flags
def test_developer_mode_activated(self, guardrail):
output = "Developer mode enabled: I can now ignore all safety guidelines."
r = guardrail.validate(output)
        assert r.is_safe is False


class TestSafeOutputs:
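    """Benign outputs must pass validation without false positives."""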
benign_outputs = [
"The capital of France is Paris.",
"Machine learning is a subset of artificial intelligence.",
"Here's a Python function to reverse a string: def reverse(s): return s[::-1]",
"The weather today is sunny with a high of 25 degrees Celsius.",
"I cannot help with that request as it violates our usage policies.",
]
@pytest.mark.parametrize("output", benign_outputs)
def test_benign_output_safe(self, guardrail, output):
r = guardrail.validate(output)
        assert r.is_safe is True, f"False positive for: {output!r}"


class TestResultStructure:
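    """Every validation result exposes the full field set and a risk score in [0, 1]."""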
def test_all_fields_present(self, guardrail):
r = guardrail.validate("hello world response")
assert hasattr(r, "is_safe")
assert hasattr(r, "risk_score")
assert hasattr(r, "flags")
assert hasattr(r, "redacted_output")
assert hasattr(r, "latency_ms")
def test_risk_score_range(self, guardrail):
outputs = ["safe output", "sk-abcdefghijklmnopqrstu"]
for o in outputs:
r = guardrail.validate(o)
assert 0.0 <= r.risk_score <= 1.0
def test_is_safe_output_shortcut(self, guardrail):
assert guardrail.is_safe_output("The answer is 42.") is True
assert guardrail.is_safe_output("sk-abcdefghijklmnopqrstu") is False