File size: 2,179 Bytes
51dbc62
 
fd32873
51dbc62
 
 
 
 
 
 
 
fd32873
 
 
 
 
 
 
 
 
 
 
 
 
 
d512522
 
 
 
 
 
 
 
 
 
 
 
b81d32a
 
 
 
 
 
 
 
 
613f214
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import pytest
from server.environment import RedTeamEnvironment
from models import AttackAction, StrategyType, TargetCategory

@pytest.mark.asyncio
async def test_reset():
    """Resetting a new environment yields turn 0, an episode id, and an active episode."""
    environment = RedTeamEnvironment()
    first_observation = await environment.reset()

    # The observation returned by reset() describes the start of the episode.
    assert first_observation.turn == 0
    assert first_observation.episode_id is not None
    # The environment itself must report that an episode is in progress.
    assert environment.is_active is True

@pytest.mark.asyncio
async def test_step():
    """A single step after reset moves the turn to 1 and records one attack."""
    environment = RedTeamEnvironment()
    await environment.reset()

    # Build the action from an explicit field mapping.
    attack_fields = {
        "strategy_type": "roleplay",
        "target_category": "privacy",
        "intensity": 0.5,
        "framing": "Hello",
    }
    attack = AttackAction(**attack_fields)

    step_result = await environment.step(attack)

    assert step_result.observation.turn == 1
    assert len(environment.attack_history) == 1

@pytest.mark.asyncio
async def test_guards():
    """``step`` must raise ``ValueError`` before any reset and when ``turn`` is 10."""
    environment = RedTeamEnvironment()
    attack = AttackAction(
        strategy_type="roleplay",
        target_category="privacy",
        intensity=0.5,
        framing="H",
    )

    # Guard 1: stepping an environment that has never been reset.
    with pytest.raises(ValueError, match="Episode not active"):
        await environment.step(attack)

    # Guard 2: after a reset, force the turn counter to 10 by hand.
    # NOTE(review): 10 is presumably the episode's turn limit -- confirm
    # against RedTeamEnvironment.
    await environment.reset()
    environment.turn = 10
    with pytest.raises(ValueError, match="Episode already complete"):
        await environment.step(attack)

def test_framing_sanitization():
    """``AttackAction.framing`` must not keep script tags or ``javascript:`` text."""
    hostile_framing = "<script>alert(1)</script> javascript:void(0)"
    attack = AttackAction(
        strategy_type="roleplay",
        target_category="privacy",
        intensity=0.5,
        framing=hostile_framing,
    )

    # The dangerous fragments are gone and a filter marker is present instead.
    assert "<script>" not in attack.framing
    assert "javascript:" not in attack.framing
    assert "[FILTERED]" in attack.framing

@pytest.mark.asyncio
async def test_injections():
    """Injected reward and LLM callables must both be invoked by a step."""
    environment = RedTeamEnvironment()

    # Shared record of which injected callables were actually invoked.
    invoked = {"reward": False, "llm": False}

    def stub_reward(**kwargs):
        invoked["reward"] = True
        return {"total_reward": 1.0}

    async def stub_llm(*args):
        invoked["llm"] = True
        return {"defender_response": "ok", "attack_success": 0.5}

    environment.set_reward_computer(stub_reward)
    environment.set_llm_pipeline(stub_llm)

    await environment.reset()
    attack = AttackAction(
        strategy_type="roleplay",
        target_category="privacy",
        intensity=0.5,
        framing="H",
    )
    await environment.step(attack)

    assert invoked["reward"] is True
    assert invoked["llm"] is True