Breach-OS / models.py
subhdotsol's picture
refactor(models): clean up imports and add module docstring
c651e61
raw
history blame
1.99 kB
"""
Shared Pydantic models and Enums for the RedTeamOS environment.
These define the API contract between the server, environment, and clients.
"""
import re
from enum import Enum
from typing import Optional

from pydantic import BaseModel, Field, field_validator
class StrategyType(str, Enum):
    """Closed set of strategy identifiers accepted in ``strategy_type`` fields.

    Members subclass ``str``, so each one compares equal to its lowercase
    wire value (for example ``StrategyType.ROLEPLAY == "roleplay"``) and can
    be looked up from that value with ``StrategyType("roleplay")``.
    """

    ROLEPLAY = "roleplay"
    HYPOTHETICAL = "hypothetical"
    INJECTION = "injection"
    PERSONA_SWITCH = "persona_switch"
    AUTHORITY_CLAIM = "authority_claim"
    ENCODING = "encoding"
    MULTI_TURN = "multi_turn"
class TargetCategory(str, Enum):
    """Closed set of category identifiers accepted in ``target_category`` fields.

    Members subclass ``str``, so each one compares equal to its lowercase
    wire value (for example ``TargetCategory.PRIVACY == "privacy"``) and can
    be looked up from that value with ``TargetCategory("privacy")``.
    """

    PRIVACY = "privacy"
    MISINFORMATION = "misinformation"
    HARMFUL_INSTRUCTIONS = "harmful_instructions"
    MANIPULATION = "manipulation"
    ILLEGAL_ACTIVITY = "illegal_activity"
class AttackAction(BaseModel):
    """Request model describing one attack action.

    Fields:
        strategy_type: A ``StrategyType`` member (or its string value).
        target_category: A ``TargetCategory`` member (or its string value).
        intensity: Float constrained to the closed range [0.0, 1.0].
        framing: Free text limited to 500 characters; passed through
            ``sanitize_framing`` during validation.
    """

    strategy_type: StrategyType
    target_category: TargetCategory
    intensity: float = Field(ge=0.0, le=1.0)
    framing: str = Field(max_length=500)

    @field_validator("framing")
    @classmethod
    def sanitize_framing(cls, v: str) -> str:
        """Replace known script-injection markers and trim whitespace.

        Each occurrence of ``<script>``, ``javascript:`` or ``onerror=`` is
        replaced with the literal ``[FILTERED]``. Matching ignores case, so
        variants such as ``<SCRIPT>`` or ``JavaScript:`` are filtered too.

        Args:
            v: The ``framing`` value being validated.

        Returns:
            The filtered text with leading and trailing whitespace removed.
        """
        # NOTE(review): this is a small deny-list, not full HTML escaping;
        # consumers that render the text should still escape it.
        # NOTE(review): "[FILTERED]" is longer than two of the markers, so the
        # returned text can exceed the declared max_length if that limit is
        # checked before this validator runs - confirm against pydantic's
        # validator ordering.
        dangerous = ["<script>", "javascript:", "onerror="]
        for pattern in dangerous:
            # re.escape keeps the marker literal; IGNORECASE closes the
            # mixed-case bypass that a plain str.replace leaves open.
            v = re.sub(re.escape(pattern), "[FILTERED]", v, flags=re.IGNORECASE)
        return v.strip()
class RedTeamObservation(BaseModel):
    """Observation payload carried by ``StepResult`` and ``ResetResponse``.

    The comments below state only the declared types and constraints; the
    precise meaning of each score is defined by the code that fills it in.
    """

    defender_response: str
    # The three scores are validated to lie in the closed range [0.0, 1.0].
    defense_score: float = Field(ge=0.0, le=1.0)
    attack_success_estimate: float = Field(ge=0.0, le=1.0)
    novelty_score: float = Field(ge=0.0, le=1.0)
    # Non-negative turn counter.
    turn: int = Field(ge=0)
    episode_done: bool
    feedback: str
    episode_id: str
class EpisodeState(BaseModel):
    """Snapshot of an episode's bookkeeping counters and status flag.

    No range constraints are declared here; unlike
    ``RedTeamObservation.turn``, ``turn`` is not validated as non-negative.
    """

    episode_id: str
    turn: int
    max_turns: int
    attacks_so_far: int
    is_active: bool
class StepResult(BaseModel):
    """Pairs a ``RedTeamObservation`` with a float ``reward``."""

    observation: RedTeamObservation
    # Unconstrained float; no bounds are declared on the reward.
    reward: float
class ResetResponse(BaseModel):
    """Pairs a ``RedTeamObservation`` with an ``episode_id`` string."""

    observation: RedTeamObservation
    # NOTE(review): the observation also carries an episode_id; presumably the
    # two match - confirm against the code that builds this response.
    episode_id: str