Spaces:

Huggingansuman
/

red_teaming_env

Running

App Files Files Community

red_teaming_env / models.py

Huggingansuman

Upload red_teaming_env Space files

410276d verified 8 days ago

raw

history blame contribute delete

3.9 kB

	"""Typed models for the RedTeamEnv OpenEnv environment."""

	from __future__ import annotations

	from typing import Any, Literal

	from openenv.core.env_server.types import Action, Observation, State
	from pydantic import Field

	RedTeamStrategy = Literal[
	"roleplay",
	"obfuscation",
	"decomposition",
	"narrative_wrap",
	"multilingual",
	"direct",
	]


	class RedTeamAction(Action):
	"""Fixed-vocabulary red-teaming action."""

	strategy: RedTeamStrategy = Field(
	..., description="The fixed strategy selected for the current turn."
	)
	prompt: str = Field(
	...,
	min_length=1,
	description="The adversarial prompt sent to the target model.",
	)
	turn_context: str = Field(
	default="",
	description="Optional framing or grooming context carried into the turn.",
	)


	class RedTeamObservation(Observation):
	"""Observation returned after a target-model interaction."""

	target_response: str = Field(
	default="", description="The target model response for the last turn."
	)
	compliance_flags: list[str] = Field(
	default_factory=list,
	description="Verified compliance or policy-violation flags for the turn.",
	)
	attack_score: float = Field(
	default=0.0, ge=0.0, le=1.0, description="Turn-level attack quality score."
	)
	strategy_used: str = Field(
	default="", description="Echo of the action strategy used for the turn."
	)
	attempts_remaining: int = Field(
	default=0, ge=0, description="Remaining turns in the active episode."
	)
	conversation_history: list[dict[str, Any]] = Field(
	default_factory=list,
	description="Full multi-turn conversation transcript for the episode.",
	)
	message: str = Field(
	default="", description="Human-readable status message for the step."
	)


	class RedTeamState(State):
	"""Environment state returned by the `/state` endpoint."""

	task_name: str = Field(
	default="",
	description="stereotype_probe \| policy_bypass \| obfuscated_extraction",
	)
	target_model: str = Field(
	default="", description="Model identifier used by the target wrapper."
	)
	target_temperature: float = Field(
	default=0.7, ge=0.0, le=1.0, description="Episode temperature sample."
	)
	strategies_used: list[str] = Field(
	default_factory=list,
	description="Unique strategies used so far in the episode.",
	)
	flags_found: list[str] = Field(
	default_factory=list,
	description="Cumulative verified compliance flags found this episode.",
	)
	defense_drift_active: bool = Field(
	default=False,
	description="Whether adaptive defense drift has activated for the target.",
	)
	audit_this_episode: bool = Field(
	default=False,
	description="Whether detailed auditing is enabled for this episode.",
	)
	max_turns: int = Field(
	default=0, ge=0, description="Maximum steps for the active task."
	)
	seed: int \| None = Field(
	default=None, description="Episode seed used to initialize stochasticity."
	)
	reward_history: list[float] = Field(
	default_factory=list, description="Per-step reward history for the episode."
	)
	judge_history: list[float] = Field(
	default_factory=list,
	description="Per-step judge scores used for dense reward shaping.",
	)
	audit_log: list[dict[str, Any]] = Field(
	default_factory=list,
	description="Detailed verifier and reward traces captured during audit.",
	)
	last_success: bool = Field(
	default=False, description="Whether the most recent terminal state succeeded."
	)


	# Backward-compatible aliases for the scaffolded names.
	RedTeamingAction = RedTeamAction
	RedTeamingObservation = RedTeamObservation
	RedTeamingState = RedTeamState