File size: 2,100 Bytes
c651e61
4df57fe
c651e61
 
7532e50
c651e61
 
 
695624d
 
 
 
 
 
 
 
 
2a11723
c651e61
2a11723
 
 
 
 
 
e090821
c651e61
e090821
 
 
 
 
7532e50
 
 
 
 
 
 
 
4b08451
c651e61
abd2333
 
 
 
 
4b08451
 
 
 
 
 
 
 
 
04130ef
c651e61
04130ef
 
 
 
 
 
f15e9b2
c651e61
f15e9b2
 
 
 
c651e61
f15e9b2
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
"""
Shared Pydantic models and Enums for the BreachOS environment.
These define the API contract between the server, environment, and clients.
"""
from pydantic import BaseModel, Field, field_validator
from typing import Optional
from enum import Enum


class StrategyType(str, Enum):
    """Closed set of strategy identifiers; each member is also a plain ``str``."""

    ROLEPLAY = "roleplay"
    HYPOTHETICAL = "hypothetical"
    INJECTION = "injection"
    PERSONA_SWITCH = "persona_switch"
    AUTHORITY_CLAIM = "authority_claim"
    ENCODING = "encoding"
    MULTI_TURN = "multi_turn"


class TargetCategory(str, Enum):
    """Closed set of target-category identifiers; each member is also a plain ``str``."""

    PRIVACY = "privacy"
    MISINFORMATION = "misinformation"
    HARMFUL_INSTRUCTIONS = "harmful_instructions"
    MANIPULATION = "manipulation"
    ILLEGAL_ACTIVITY = "illegal_activity"


class AttackAction(BaseModel):
    """One attack request: a strategy, a target category, an intensity and framing text.

    ``intensity`` must lie in [0.0, 1.0] and ``framing`` may be at most 500
    characters; both limits are declared through the ``Field`` constraints.
    ``framing`` is additionally passed through :meth:`sanitize_framing`.
    """

    strategy_type:   StrategyType
    target_category: TargetCategory
    intensity:       float = Field(ge=0.0, le=1.0)
    framing:         str   = Field(max_length=500)

    @field_validator("framing")
    @classmethod
    def sanitize_framing(cls, v: str) -> str:
        """Replace known script-injection markers in *v* and trim whitespace.

        Every occurrence of ``<script>``, ``javascript:`` or ``onerror=`` is
        replaced with ``[FILTERED]``. Matching ignores letter case so that
        variants such as ``<SCRIPT>`` or ``JavaScript:`` are filtered too.

        Args:
            v: The raw ``framing`` value.

        Returns:
            The filtered value with leading/trailing whitespace removed.
        """
        # Function-scope import keeps this block self-contained; the file's
        # top-level imports do not include ``re``.
        import re

        dangerous = ["<script>", "javascript:", "onerror="]
        for pattern in dangerous:
            # re.escape: the markers are literals, not regular expressions.
            v = re.sub(re.escape(pattern), "[FILTERED]", v, flags=re.IGNORECASE)
        # NOTE(review): "[FILTERED]" is longer than the markers it replaces, so
        # the result may exceed max_length if this validator runs after the
        # Field constraint is checked - confirm whether that matters downstream.
        return v.strip()


class AutoAttackRequest(BaseModel):
    """Request carrying only a strategy and a target category."""

    strategy_type: StrategyType
    target_category: TargetCategory


class RedTeamObservation(BaseModel):
    """Observation payload; the three score fields are constrained to [0.0, 1.0]."""

    defender_response: str
    # Scores: each must satisfy 0.0 <= value <= 1.0.
    defense_score: float = Field(ge=0.0, le=1.0)
    attack_success_estimate: float = Field(ge=0.0, le=1.0)
    novelty_score: float = Field(ge=0.0, le=1.0)
    # Turn counter; negative values are rejected.
    turn: int = Field(ge=0)
    episode_done: bool
    feedback: str
    episode_id: str


class EpisodeState(BaseModel):
    """Snapshot of an episode: its id, turn counters, attack count and active flag."""

    episode_id: str
    turn: int
    max_turns: int
    attacks_so_far: int
    is_active: bool


class StepResult(BaseModel):
    """Pairs a :class:`RedTeamObservation` with a float reward."""

    observation: RedTeamObservation
    reward: float


class ResetResponse(BaseModel):
    """Pairs a :class:`RedTeamObservation` with an episode id string."""

    observation: RedTeamObservation
    episode_id: str