| |
| |
| |
| |
| |
|
|
"""
Data models for the SafeSpace Content Moderation Environment.

SafeSpace is an RL environment where an AI agent acts as a content moderator,
investigating reported posts and making structured moderation decisions.
"""
|
|
| from typing import Any, Dict, List, Literal, Optional |
|
|
| from openenv.core.env_server.types import Action, Observation, State |
| from pydantic import BaseModel, ConfigDict, Field |
|
|
# Closed string vocabularies used as field annotations by the models in this
# module. Each alias enumerates the only values accepted for that field.

# Actions an agent may submit: seven context requests plus the terminal "decide".
ActionType = Literal[
    "request_author_profile",
    "request_author_violations",
    "request_thread_context",
    "request_community_rules",
    "request_linked_content",
    "request_similar_precedents",
    "request_reporter_credibility",
    "decide",
]
# Moderation outcomes that can accompany a "decide" action.
DecisionType = Literal["approve", "remove", "escalate", "warn"]
# Severity labels, listed from "none" up to "critical".
SeverityType = Literal["none", "low", "medium", "high", "critical"]
# Ways a content item can enter the moderation queue.
TriggerType = Literal["user_report", "auto_flag", "appeal", "proactive_audit"]
# Media composition of a post.
MediaType = Literal["text", "text+image", "text+link"]
# Scenario difficulty tiers.
DifficultyType = Literal["easy", "medium", "hard"]
|
|
|
|
| |
| |
| |
|
|
|
|
class ContentItem(BaseModel):
    """A content item (post) that needs moderation review."""

    # Required fields: ``Field(...)`` declares them without a default.
    post_id: str = Field(..., description="Unique identifier for the post")
    text: str = Field(..., description="The text content of the post")
    author_id: str = Field(..., description="Unique identifier of the author")
    community: str = Field(
        ..., description="Community where the post was made (e.g., 'gaming', 'health')"
    )
    # Kept as a plain string; no datetime parsing is applied by this model.
    timestamp: str = Field(..., description="ISO timestamp when the post was created")
    media_type: MediaType = Field(
        ..., description="Type of media: 'text', 'text+image', or 'text+link'"
    )
    # Optional companion to ``media_type``; defaults to None.
    media_description: Optional[str] = Field(
        default=None, description="Text description of image/link if present"
    )
|
|
|
|
class TriggerInfo(BaseModel):
    """How this content entered the moderation queue."""

    # The only required field; every other field below has a default.
    trigger_type: TriggerType = Field(
        ...,
        description="One of: 'user_report', 'auto_flag', 'appeal', 'proactive_audit'",
    )

    # Report details -- presumably populated for 'user_report' triggers
    # (TODO confirm against the scenario loader).
    report_count: int = Field(default=0, description="Number of reports received")
    report_categories: List[str] = Field(
        default_factory=list, description="Categories selected by reporters"
    )
    sample_report_reason: Optional[str] = Field(
        default=None, description="Example report reason from a user"
    )

    # Automated-flag details -- presumably populated for 'auto_flag' triggers.
    auto_flag_reason: Optional[str] = Field(
        default=None, description="Why automated system flagged this content"
    )

    # Appeal details -- presumably populated for 'appeal' triggers.
    # NOTE(review): typed as a free-form string, not DecisionType.
    original_decision: Optional[str] = Field(
        default=None, description="The original moderation decision being appealed"
    )
    appeal_text: Optional[str] = Field(
        default=None, description="User's appeal message"
    )

    # Audit details -- presumably populated for 'proactive_audit' triggers.
    audit_reason: Optional[str] = Field(
        default=None, description="Why this content was selected for audit"
    )
|
|
|
|
class GatheredContext(BaseModel):
    """Context gathered through investigation actions. Starts empty."""

    # Every field defaults to None, so a freshly constructed instance carries
    # no context. The seven fields line up with the seven "request_*" values
    # of ActionType; the payloads are loosely typed (dicts / lists / strings).
    author_profile: Optional[Dict[str, Any]] = Field(
        default=None, description="Author's bio, account age, follower count"
    )
    author_violations: Optional[List[Dict[str, Any]]] = Field(
        default=None, description="Author's past moderation violations"
    )
    thread_context: Optional[List[Dict[str, Any]]] = Field(
        default=None, description="Full conversation thread"
    )
    community_rules: Optional[str] = Field(
        default=None, description="Community-specific moderation guidelines"
    )
    linked_content_summary: Optional[str] = Field(
        default=None, description="What the linked content contains"
    )
    similar_precedents: Optional[List[Dict[str, Any]]] = Field(
        default=None, description="How similar posts were moderated before"
    )
    reporter_credibility: Optional[Dict[str, Any]] = Field(
        default=None, description="Reporter's history of accurate vs false reports"
    )
|
|
|
|
class BreakdownComponent(BaseModel):
    """Typed reward or grading component with room for structured details."""

    # extra="allow": keys not declared below are kept instead of rejected.
    model_config = ConfigDict(extra="allow")

    # Normalized values.
    score: Optional[float] = Field(default=None, description="Component score")
    max: Optional[float] = Field(default=None, description="Maximum component score")
    min: Optional[float] = Field(default=None, description="Minimum component score")

    # Raw (pre-normalization) counterparts of the three fields above.
    raw_score: Optional[float] = Field(
        default=None, description="Raw component score before normalization"
    )
    raw_max: Optional[float] = Field(
        default=None, description="Raw maximum component score before normalization"
    )
    raw_min: Optional[float] = Field(
        default=None, description="Raw minimum component score before normalization"
    )

    weight: Optional[float] = Field(
        default=None, description="Normalized weighting used by the task grade"
    )
    # default_factory gives each instance its own dict.
    details: Dict[str, Any] = Field(
        default_factory=dict,
        description="Structured details for the component calculation",
    )
|
|
|
|
class RewardBreakdown(BaseModel):
    """Typed reward breakdown returned on reset, intermediate, and terminal steps."""

    # extra="allow": keys not declared below are kept instead of rejected.
    model_config = ConfigDict(extra="allow")

    # --- Headline values -------------------------------------------------
    reward_type: str = Field(default="unknown", description="Reward breakdown category")
    total: float = Field(default=0.0, description="Total reward for this step")
    raw_total: Optional[float] = Field(
        default=None, description="Raw total reward for this step before normalization"
    )

    # --- Scores: each normalized field is paired with a raw_* counterpart --
    score: Optional[float] = Field(
        default=None, description="Normalized score for simple cases"
    )
    raw_score: Optional[float] = Field(
        default=None, description="Raw score for simple cases before normalization"
    )
    requested_score: Optional[float] = Field(
        default=None, description="Normalized uncapped score requested by the reward rule"
    )
    raw_requested_score: Optional[float] = Field(
        default=None,
        description="Raw uncapped score requested by the reward rule before normalization",
    )
    applied_score: Optional[float] = Field(
        default=None, description="Normalized score applied after caps or bounds"
    )
    raw_applied_score: Optional[float] = Field(
        default=None,
        description="Raw score applied after caps or bounds before normalization",
    )

    # --- Running totals (step / trajectory / episode / cumulative) --------
    step_total: Optional[float] = Field(
        default=None,
        description="Normalized combined step reward in multi-part terminal cases",
    )
    raw_step_total: Optional[float] = Field(
        default=None,
        description="Raw combined step reward in multi-part terminal cases before normalization",
    )
    trajectory_total: Optional[float] = Field(
        default=None, description="Normalized cumulative trajectory shaping reward"
    )
    raw_trajectory_total: Optional[float] = Field(
        default=None,
        description="Raw cumulative trajectory shaping reward before normalization",
    )
    episode_total: Optional[float] = Field(
        default=None, description="Normalized running episode reward after this step"
    )
    raw_episode_total: Optional[float] = Field(
        default=None,
        description="Raw running episode reward after this step before normalization",
    )
    cumulative_total: Optional[float] = Field(
        default=None,
        description="Normalized episode reward total after terminal application",
    )
    raw_cumulative_total: Optional[float] = Field(
        default=None,
        description="Raw episode reward total after terminal application before normalization",
    )

    # --- Theoretical terminal bounds --------------------------------------
    theoretical_terminal_max: Optional[float] = Field(
        default=None, description="Normalized maximum possible terminal reward"
    )
    theoretical_terminal_min: Optional[float] = Field(
        default=None, description="Normalized minimum possible terminal reward"
    )
    raw_theoretical_terminal_max: Optional[float] = Field(
        default=None,
        description="Raw maximum possible terminal reward before normalization",
    )
    raw_theoretical_terminal_min: Optional[float] = Field(
        default=None,
        description="Raw minimum possible terminal reward before normalization",
    )

    # --- Context-request metadata -----------------------------------------
    context_field: Optional[str] = Field(
        default=None, description="Context source involved in the reward"
    )
    context_needed: List[str] = Field(
        default_factory=list, description="Ground-truth context sources needed"
    )
    is_needed: Optional[bool] = Field(
        default=None, description="Whether the requested context was useful"
    )
    retrieved: Optional[bool] = Field(
        default=None, description="Whether the context source had retrievable data"
    )
    reason: Optional[str] = Field(default=None, description="Machine-readable reason")
    # NOTE(review): plain string here, not the ActionType literal.
    action_type: Optional[str] = Field(
        default=None, description="Action type involved in the reward"
    )
    trajectory_cap: Optional[float] = Field(
        default=None, description="Trajectory reward cap in effect"
    )

    # --- Typed scoring components -----------------------------------------
    decision: Optional[BreakdownComponent] = Field(
        default=None, description="Decision-scoring component"
    )
    factor: Optional[BreakdownComponent] = Field(
        default=None, description="Factor overlap component"
    )
    efficiency: Optional[BreakdownComponent] = Field(
        default=None, description="Efficiency component"
    )
    calibration: Optional[BreakdownComponent] = Field(
        default=None, description="Calibration component"
    )

    # --- Untyped nested payloads ------------------------------------------
    trajectory: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Nested trajectory reward payload for no-decision terminal cases",
    )
    no_decision: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Nested no-decision penalty payload when the budget is exhausted",
    )
    last_terminal_breakdown: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Previous terminal reward payload when guarding completed episodes",
    )
|
|
|
|
class TaskGradeBreakdown(BaseModel):
    """Typed normalized grader breakdown returned on terminal steps."""

    # extra="allow": keys not declared below are kept instead of rejected.
    model_config = ConfigDict(extra="allow")

    # Four optional grading components, each defaulting to None.
    decision: Optional[BreakdownComponent] = Field(
        default=None, description="Decision grading component"
    )
    factor_overlap: Optional[BreakdownComponent] = Field(
        default=None, description="Factor-overlap grading component"
    )
    efficiency: Optional[BreakdownComponent] = Field(
        default=None, description="Efficiency grading component"
    )
    calibration: Optional[BreakdownComponent] = Field(
        default=None, description="Calibration grading component"
    )
    # NOTE(review): the description promises the open interval (0, 1), but the
    # field carries no bounds and defaults to 0.0 -- the producer must enforce it.
    total: float = Field(
        default=0.0,
        description="Normalized task grade in the open interval (0, 1)",
    )
|
|
|
|
| |
| |
| |
|
|
|
|
class ModerationAction(Action):
    """
    Action to be executed in the SafeSpace environment.

    Investigation actions (cost 1 action each):
    - request_author_profile
    - request_author_violations
    - request_thread_context
    - request_community_rules
    - request_linked_content
    - request_similar_precedents
    - request_reporter_credibility

    Terminal action:
    - decide (requires decision fields)
    """

    # The only required field; selects which action is performed.
    action_type: ActionType = Field(
        ...,
        description=(
            "One of: 'request_author_profile', 'request_author_violations', "
            "'request_thread_context', 'request_community_rules', "
            "'request_linked_content', 'request_similar_precedents', "
            "'request_reporter_credibility', 'decide'"
        ),
    )

    # Decision fields. All are optional at the schema level and default to None.
    # NOTE(review): nothing in this model requires them when action_type is
    # 'decide'; presumably the environment validates that -- confirm.
    decision: Optional[DecisionType] = Field(
        default=None,
        description="One of: 'approve', 'remove', 'escalate', 'warn'",
    )
    primary_violation: Optional[str] = Field(
        default=None,
        description="Policy section ID (e.g., '1.0', '2.1', '3.1') or 'none'",
    )
    severity: Optional[SeverityType] = Field(
        default=None,
        description="One of: 'none', 'low', 'medium', 'high', 'critical'",
    )
    # Constrained to the closed range [0.0, 1.0] when provided.
    confidence: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description="Agent's confidence in the decision (0.0 to 1.0)",
    )
    key_factors: Optional[List[str]] = Field(
        default=None,
        description="Selected factors from the FACTOR_LIST that influenced the decision",
    )
|
|
|
|
class ModerationObservation(Observation):
    """
    Observation returned from the SafeSpace environment.

    Contains the content to moderate, trigger information, gathered context,
    platform policy, and episode progress.
    """

    # What is being moderated and why it was queued.
    content_item: Optional[ContentItem] = Field(
        default=None, description="The content item being moderated"
    )
    trigger_info: Optional[TriggerInfo] = Field(
        default=None, description="How this content entered the moderation queue"
    )

    # Investigation results; a fresh, empty GatheredContext per observation.
    gathered_context: GatheredContext = Field(
        default_factory=GatheredContext,
        description="Context gathered through investigation actions",
    )

    # Reference material for the agent.
    platform_policy: str = Field(
        default="", description="The platform's content moderation policy document"
    )
    available_factors: List[str] = Field(
        default_factory=list,
        description="List of factors the agent can cite in its decision",
    )

    # Episode progress and feedback on the last action.
    actions_taken: int = Field(
        default=0, description="Number of actions taken this episode"
    )
    max_actions: int = Field(
        default=8, description="Maximum actions allowed per episode"
    )
    action_history: List[str] = Field(
        default_factory=list, description="List of actions taken so far"
    )
    feedback: str = Field(
        default="", description="Feedback message from the last action"
    )
    error_code: Optional[str] = Field(
        default=None,
        description="Structured error code for invalid or rejected actions",
    )

    # Reward and grading information.
    reward_breakdown: Optional[RewardBreakdown] = Field(
        default=None,
        description="Breakdown of reward components for the last step",
    )
    # NOTE(review): the bounds are inclusive (ge/le), so exactly 0.0 and 1.0
    # validate even though the description says "strictly between" -- confirm
    # whether gt/lt was intended.
    task_grade: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description="Deterministic normalized task grade for the current episode, strictly between 0 and 1 on terminal steps",
    )
    grade_breakdown: Optional[TaskGradeBreakdown] = Field(
        default=None,
        description="Breakdown of normalized task-grade components",
    )
|
|
|
|
class ModerationState(State):
    """
    State of the SafeSpace environment.

    Tracks episode metadata and progress.
    """

    # Episode identity and step counter.
    # NOTE(review): the base ``State`` may already declare these -- confirm.
    episode_id: Optional[str] = Field(
        default=None, description="Unique identifier for this episode"
    )
    step_count: int = Field(default=0, description="Number of steps taken")

    # Scenario metadata.
    scenario_id: Optional[str] = Field(
        default=None, description="Current scenario ID"
    )
    task_id: Optional[str] = Field(
        default=None, description="Task ID used to load this scenario"
    )
    difficulty: Optional[DifficultyType] = Field(
        default=None, description="Scenario difficulty: easy, medium, or hard"
    )
    trigger_type: Optional[TriggerType] = Field(
        default=None, description="How this content entered the moderation queue"
    )

    # Progress tracking.
    actions_taken: int = Field(
        default=0, description="Number of investigation actions taken"
    )
    max_actions: int = Field(
        default=8, description="Maximum actions allowed per episode"
    )
    context_requested: List[str] = Field(
        default_factory=list, description="List of context types requested"
    )
    decision_made: bool = Field(
        default=False, description="Whether a terminal decision has been made"
    )
    # Reward is tracked both normalized and raw.
    episode_reward: float = Field(
        default=0.0, description="Normalized total reward for episode"
    )
    raw_episode_reward: float = Field(
        default=0.0, description="Raw total reward for episode before normalization"
    )
    done: bool = Field(default=False, description="Whether the episode is terminal")
    last_error_code: Optional[str] = Field(
        default=None,
        description="Structured error code from the most recent rejected action",
    )
|
|