red_teaming_env / models.py
Huggingansuman's picture
Upload red_teaming_env Space files
410276d verified
"""Typed models for the RedTeamEnv OpenEnv environment."""
from __future__ import annotations
from typing import Any, Literal
from openenv.core.env_server.types import Action, Observation, State
from pydantic import Field
RedTeamStrategy = Literal[
"roleplay",
"obfuscation",
"decomposition",
"narrative_wrap",
"multilingual",
"direct",
]
class RedTeamAction(Action):
"""Fixed-vocabulary red-teaming action."""
strategy: RedTeamStrategy = Field(
..., description="The fixed strategy selected for the current turn."
)
prompt: str = Field(
...,
min_length=1,
description="The adversarial prompt sent to the target model.",
)
turn_context: str = Field(
default="",
description="Optional framing or grooming context carried into the turn.",
)
class RedTeamObservation(Observation):
"""Observation returned after a target-model interaction."""
target_response: str = Field(
default="", description="The target model response for the last turn."
)
compliance_flags: list[str] = Field(
default_factory=list,
description="Verified compliance or policy-violation flags for the turn.",
)
attack_score: float = Field(
default=0.0, ge=0.0, le=1.0, description="Turn-level attack quality score."
)
strategy_used: str = Field(
default="", description="Echo of the action strategy used for the turn."
)
attempts_remaining: int = Field(
default=0, ge=0, description="Remaining turns in the active episode."
)
conversation_history: list[dict[str, Any]] = Field(
default_factory=list,
description="Full multi-turn conversation transcript for the episode.",
)
message: str = Field(
default="", description="Human-readable status message for the step."
)
class RedTeamState(State):
"""Environment state returned by the `/state` endpoint."""
task_name: str = Field(
default="",
description="stereotype_probe | policy_bypass | obfuscated_extraction",
)
target_model: str = Field(
default="", description="Model identifier used by the target wrapper."
)
target_temperature: float = Field(
default=0.7, ge=0.0, le=1.0, description="Episode temperature sample."
)
strategies_used: list[str] = Field(
default_factory=list,
description="Unique strategies used so far in the episode.",
)
flags_found: list[str] = Field(
default_factory=list,
description="Cumulative verified compliance flags found this episode.",
)
defense_drift_active: bool = Field(
default=False,
description="Whether adaptive defense drift has activated for the target.",
)
audit_this_episode: bool = Field(
default=False,
description="Whether detailed auditing is enabled for this episode.",
)
max_turns: int = Field(
default=0, ge=0, description="Maximum steps for the active task."
)
seed: int | None = Field(
default=None, description="Episode seed used to initialize stochasticity."
)
reward_history: list[float] = Field(
default_factory=list, description="Per-step reward history for the episode."
)
judge_history: list[float] = Field(
default_factory=list,
description="Per-step judge scores used for dense reward shaping.",
)
audit_log: list[dict[str, Any]] = Field(
default_factory=list,
description="Detailed verifier and reward traces captured during audit.",
)
last_success: bool = Field(
default=False, description="Whether the most recent terminal state succeeded."
)
# Backward-compatible aliases for the scaffolded names.
RedTeamingAction = RedTeamAction
RedTeamingObservation = RedTeamObservation
RedTeamingState = RedTeamState