File size: 8,781 Bytes
efd15e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f665498
 
 
 
 
 
 
 
 
 
7d3f664
 
 
 
 
 
 
 
 
f665498
 
 
 
 
 
 
 
 
efd15e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6ca375c
 
 
 
 
 
 
 
 
4dc3e01
 
 
 
 
 
 
 
 
6ca375c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
efd15e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
"""Tests for output validation gate."""

from __future__ import annotations

import pytest

from agent_bench.security.output_validator import OutputValidator


class TestPIILeakage:
    """Validation must fail when the LLM output contains PII."""

    @pytest.fixture
    def validator(self):
        """Validator with the PII check on, URL check off, empty blocklist."""
        return OutputValidator(pii_check=True, url_check=False, blocklist=[])

    def test_detects_email_in_output(self, validator):
        """An e-mail address fails validation with a ``pii_leakage`` entry."""
        answer = "Contact john@example.com for help."
        result = validator.validate(output=answer, retrieved_chunks=[])

        assert result.passed is False
        assert any("pii_leakage" in entry for entry in result.violations)

    def test_detects_ssn_in_output(self, validator):
        """A social-security-number-shaped string fails validation."""
        answer = "His SSN is 123-45-6789."
        result = validator.validate(output=answer, retrieved_chunks=[])

        assert result.passed is False

    def test_clean_output_passes(self, validator):
        """Output without PII passes and reports no violations at all."""
        answer = "FastAPI uses path parameters with curly braces."
        result = validator.validate(output=answer, retrieved_chunks=[])

        assert result.passed is True
        assert result.violations == []


class TestURLValidation:
    """A URL in the output is accepted only if the retrieved chunks contain it."""

    @pytest.fixture
    def validator(self):
        """Validator with the URL check on, PII check off, empty blocklist."""
        return OutputValidator(pii_check=False, url_check=True, blocklist=[])

    @staticmethod
    def _run(validator, answer, chunks):
        """Validate *answer* against *chunks* and return the verdict."""
        return validator.validate(output=answer, retrieved_chunks=chunks)

    def test_url_from_chunks_passes(self, validator):
        """A URL that also appears verbatim in a chunk is accepted."""
        result = self._run(
            validator,
            "See https://fastapi.tiangolo.com for details.",
            ["Visit https://fastapi.tiangolo.com for docs."],
        )
        assert result.passed is True

    def test_hallucinated_url_fails(self, validator):
        """A URL absent from every chunk yields a ``url_hallucination`` entry."""
        result = self._run(
            validator,
            "See https://malicious-site.com for details.",
            ["FastAPI is a modern framework."],
        )
        assert result.passed is False
        assert any("url_hallucination" in entry for entry in result.violations)

    def test_trailing_slash_normalization(self, validator):
        """Chunk URL ends in a slash, output URL does not: still a match."""
        result = self._run(
            validator,
            "See https://fastapi.tiangolo.com for details.",
            ["Visit https://fastapi.tiangolo.com/ for docs."],
        )
        assert result.passed is True
        assert result.violations == []

    def test_trailing_slash_with_sentence_punctuation(self, validator):
        """A sentence-ending period right after the chunk URL's trailing
        slash must not stop it from matching the same URL in the output."""
        result = self._run(
            validator,
            "See https://fastapi.tiangolo.com/ for details.",
            ["Visit https://fastapi.tiangolo.com/."],
        )
        assert result.passed is True

    def test_trailing_slash_normalization_reverse(self, validator):
        """Output URL ends in a slash, chunk URL does not: still a match."""
        result = self._run(
            validator,
            "See https://fastapi.tiangolo.com/ for details.",
            ["Visit https://fastapi.tiangolo.com for docs."],
        )
        assert result.passed is True

    def test_no_urls_passes(self, validator):
        """Output that contains no URL at all passes."""
        result = self._run(
            validator,
            "Path parameters use curly braces.",
            ["Some chunk."],
        )
        assert result.passed is True


class TestBlocklist:
    """Output matching a configured blocklist pattern must fail validation."""

    @staticmethod
    def _make_validator(patterns):
        """Build a validator with PII and URL checks off and *patterns* as
        its blocklist."""
        return OutputValidator(
            pii_check=False, url_check=False,
            blocklist=patterns,
        )

    def test_blocklist_match(self):
        """A key-shaped string matching a blocklist regex is reported."""
        checker = self._make_validator(
            ["sk-[a-zA-Z0-9]{20,}", "SYSTEM_PROMPT"],
        )
        result = checker.validate(
            output="Here is the key: sk-abcdefghijklmnopqrstuvwxyz",
            retrieved_chunks=[],
        )
        assert result.passed is False
        assert any("blocklist" in entry for entry in result.violations)

    def test_system_prompt_fragment(self):
        """A regex with an optional group catches a quoted prompt fragment."""
        checker = self._make_validator(
            ["You are a (?:helpful |test )?assistant"],
        )
        result = checker.validate(
            output="My instructions say: You are a helpful assistant",
            retrieved_chunks=[],
        )
        assert result.passed is False

    def test_no_blocklist_match(self):
        """Output that matches no blocklist pattern passes."""
        checker = self._make_validator(["FORBIDDEN_TERM"])
        result = checker.validate(
            output="A perfectly normal answer.",
            retrieved_chunks=[],
        )
        assert result.passed is True


class TestSecretLeakage:
    """Secret patterns in LLM output must be blocked (fail closed)."""

    @pytest.fixture
    def validator(self):
        """Validator with the secret check on, PII and URL checks off, and
        an empty blocklist."""
        return OutputValidator(
            pii_check=False, url_check=False, secret_check=True, blocklist=[],
        )

    # No static Google API key literal appears in this list. It was removed
    # following the 2026-04-14/15 credential-exposure incident (see
    # DECISIONS.md): the validator's regex is \bAIza[0-9A-Za-z_\-]{35}\b,
    # which is identical to GitHub secret-scanning's Google API Key
    # detection pattern, so any static literal that satisfies the validator
    # also triggers GitHub push protection. Google API key format coverage
    # lives in test_blocks_google_api_key_format below, which assembles the
    # key at test time so it never lands as a literal in source.
    # Validator regex unchanged.
    @pytest.mark.parametrize("output", [
        "Your key is sk-abcdefghijklmnopqrstuvwxyz1234",
        "here: sk-proj-ABCDEFGHIJKLMNOP0123456789",
        "key=sk-ant-abcdefghijklmnopqrstuvwxyz",
        "aws key AKIAIOSFODNN7EXAMPLE",
        "use Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.abc",
        "env: OPENAI_API_KEY=sk-test123",
        "set ANTHROPIC_API_KEY=sk-ant-xyz",
    ])
    def test_blocks_known_secret_formats(self, validator, output):
        """Each known key format or env-var assignment is blocked with a
        ``secret_leakage`` violation."""
        verdict = validator.validate(output=output, retrieved_chunks=[])
        assert verdict.passed is False, f"Should block: {output!r}"
        assert any("secret_leakage" in v for v in verdict.violations)
        assert verdict.action == "block"

    def test_blocks_google_api_key_format(self, validator):
        """A Google-API-key-shaped string is blocked like the other formats.

        The key is concatenated at run time so that no literal matching the
        detection pattern is ever committed to source.
        """
        # "AIza" followed by exactly 35 characters (7 * 5) from the allowed
        # class, ending on an alphanumeric so the pattern's trailing word
        # boundary is satisfied at end of string.
        fake_key = "AIza" + "a1B2c3D" * 5
        output = f"google key {fake_key}"
        verdict = validator.validate(output=output, retrieved_chunks=[])
        assert verdict.passed is False, f"Should block: {output!r}"
        assert any("secret_leakage" in v for v in verdict.violations)
        assert verdict.action == "block"

    @pytest.mark.parametrize("output", [
        "FastAPI uses path parameters with curly braces.",
        "You can store secrets in environment variables.",
        "To configure the OpenAI client, set your API key in OPENAI_API_KEY env var.",
        "Use a .env file for local development.",
        "Kubernetes Secrets store sensitive configuration.",
    ])
    def test_allows_benign_credential_adjacent_output(self, validator, output):
        """Educational content about secrets should pass — only literal
        key formats and env-var assignments are blocked."""
        verdict = validator.validate(output=output, retrieved_chunks=[])
        assert verdict.passed is True, (
            f"False positive on: {output!r} -> {verdict.violations}"
        )

    def test_secret_check_can_be_disabled(self):
        """When secret_check=False, literal keys pass through."""
        validator = OutputValidator(
            pii_check=False, url_check=False, secret_check=False, blocklist=[],
        )
        verdict = validator.validate(
            output="sk-abcdefghijklmnopqrstuvwxyz1234",
            retrieved_chunks=[],
        )
        assert verdict.passed is True


class TestCombinedChecks:
    """Behaviour when several checks are configured on one validator."""

    @staticmethod
    def _pii_url_secret_word_validator():
        """Build a validator with PII and URL checks on and ``SECRET``
        as its single blocklist pattern."""
        return OutputValidator(
            pii_check=True, url_check=True,
            blocklist=["SECRET"],
        )

    def test_multiple_violations(self):
        """Output tripping several checks is blocked with multiple entries."""
        checker = self._pii_url_secret_word_validator()
        result = checker.validate(
            output="Email john@test.com, see https://evil.com, also SECRET.",
            retrieved_chunks=["No URLs here."],
        )
        assert result.passed is False
        assert len(result.violations) >= 2  # PII + URL at minimum
        assert result.action == "block"

    def test_all_checks_pass(self):
        """Clean output passes every check and the action is ``pass``."""
        checker = self._pii_url_secret_word_validator()
        result = checker.validate(
            output="FastAPI supports path parameters.",
            retrieved_chunks=["FastAPI supports path parameters."],
        )
        assert result.passed is True
        assert result.action == "pass"

    def test_disabled_checks(self):
        """With PII and URL checks off and an empty blocklist, output
        containing an e-mail address and an unknown URL still passes."""
        checker = OutputValidator(pii_check=False, url_check=False, blocklist=[])
        result = checker.validate(
            output="Email: a@b.com, URL: https://evil.com",
            retrieved_chunks=[],
        )
        assert result.passed is True