# Source: Hugging Face Space "Ishangtxl/SafeSpace" (status: Sleeping).
# File size: 17,166 bytes.

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
Data models for the SafeSpace Content Moderation Environment.

SafeSpace is an RL environment where an AI agent acts as a content moderator,
investigating reported posts and making structured moderation decisions.
"""

from typing import Any, Dict, List, Literal, Optional

from openenv.core.env_server.types import Action, Observation, State
from pydantic import BaseModel, ConfigDict, Field

# Closed string vocabularies used as field annotations by the models in this
# module. Member order is kept stable because it is visible via typing.get_args.

# Everything an agent may submit: seven context requests plus the terminal
# "decide" action.
ActionType = Literal[
    "request_author_profile",
    "request_author_violations",
    "request_thread_context",
    "request_community_rules",
    "request_linked_content",
    "request_similar_precedents",
    "request_reporter_credibility",
    "decide",
]

# Outcomes an agent can choose when it decides.
DecisionType = Literal[
    "approve",
    "remove",
    "escalate",
    "warn",
]

# Severity labels attached to a decision.
SeverityType = Literal[
    "none",
    "low",
    "medium",
    "high",
    "critical",
]

# Ways a post can enter the moderation queue.
TriggerType = Literal[
    "user_report",
    "auto_flag",
    "appeal",
    "proactive_audit",
]

# Media combinations a content item can carry.
MediaType = Literal[
    "text",
    "text+image",
    "text+link",
]

# Scenario difficulty tiers.
DifficultyType = Literal[
    "easy",
    "medium",
    "hard",
]


# ============================================================================
# Supporting Models (nested in Observation)
# ============================================================================


class ContentItem(BaseModel):
    """A content item (post) that needs moderation review."""

    # Required fields: `...` as the default marks each one as mandatory.
    post_id: str = Field(
        ...,
        description="Unique identifier for the post",
    )
    text: str = Field(
        ...,
        description="The text content of the post",
    )
    author_id: str = Field(
        ...,
        description="Unique identifier of the author",
    )
    community: str = Field(
        ...,
        description="Community where the post was made (e.g., 'gaming', 'health')",
    )
    timestamp: str = Field(
        ...,
        description="ISO timestamp when the post was created",
    )
    media_type: MediaType = Field(
        ...,
        description="Type of media: 'text', 'text+image', or 'text+link'",
    )

    # Optional companion to media_type; stays None unless a description is given.
    media_description: Optional[str] = Field(
        None,
        description="Text description of image/link if present",
    )


class TriggerInfo(BaseModel):
    """How this content entered the moderation queue."""

    # The only required field; the groups below are keyed off its value.
    trigger_type: TriggerType = Field(
        ...,
        description="One of: 'user_report', 'auto_flag', 'appeal', 'proactive_audit'",
    )

    # --- 'user_report' details ---
    report_count: int = Field(
        0,
        description="Number of reports received",
    )
    report_categories: List[str] = Field(
        # A factory gives each instance its own list.
        default_factory=list,
        description="Categories selected by reporters",
    )
    sample_report_reason: Optional[str] = Field(
        None,
        description="Example report reason from a user",
    )

    # --- 'auto_flag' details ---
    auto_flag_reason: Optional[str] = Field(
        None,
        description="Why automated system flagged this content",
    )

    # --- 'appeal' details ---
    original_decision: Optional[str] = Field(
        None,
        description="The original moderation decision being appealed",
    )
    appeal_text: Optional[str] = Field(
        None,
        description="User's appeal message",
    )

    # --- 'proactive_audit' details ---
    audit_reason: Optional[str] = Field(
        None,
        description="Why this content was selected for audit",
    )


class GatheredContext(BaseModel):
    """Context gathered through investigation actions. Starts empty."""

    # Every slot defaults to None, which is what "starts empty" means here.

    # Author-related context.
    author_profile: Optional[Dict[str, Any]] = Field(
        None,
        description="Author's bio, account age, follower count",
    )
    author_violations: Optional[List[Dict[str, Any]]] = Field(
        None,
        description="Author's past moderation violations",
    )

    # Conversation and community context.
    thread_context: Optional[List[Dict[str, Any]]] = Field(
        None,
        description="Full conversation thread",
    )
    community_rules: Optional[str] = Field(
        None,
        description="Community-specific moderation guidelines",
    )
    linked_content_summary: Optional[str] = Field(
        None,
        description="What the linked content contains",
    )

    # Precedent and reporter context.
    similar_precedents: Optional[List[Dict[str, Any]]] = Field(
        None,
        description="How similar posts were moderated before",
    )
    reporter_credibility: Optional[Dict[str, Any]] = Field(
        None,
        description="Reporter's history of accurate vs false reports",
    )


class BreakdownComponent(BaseModel):
    """Typed reward or grading component with room for structured details."""

    # extra="allow": keys not declared below are kept on the instance rather
    # than rejected.
    model_config = ConfigDict(extra="allow")

    # Score and its bounds.
    score: Optional[float] = Field(
        None,
        description="Component score",
    )
    max: Optional[float] = Field(
        None,
        description="Maximum component score",
    )
    min: Optional[float] = Field(
        None,
        description="Minimum component score",
    )

    # The same three quantities before normalization.
    raw_score: Optional[float] = Field(
        None,
        description="Raw component score before normalization",
    )
    raw_max: Optional[float] = Field(
        None,
        description="Raw maximum component score before normalization",
    )
    raw_min: Optional[float] = Field(
        None,
        description="Raw minimum component score before normalization",
    )

    weight: Optional[float] = Field(
        None,
        description="Normalized weighting used by the task grade",
    )

    # Free-form payload; a factory gives each instance its own dict.
    details: Dict[str, Any] = Field(
        default_factory=dict,
        description="Structured details for the component calculation",
    )


class RewardBreakdown(BaseModel):
    """Typed reward breakdown returned on reset, intermediate, and terminal steps."""

    # extra="allow": keys not declared below are kept on the instance rather
    # than rejected.
    model_config = ConfigDict(extra="allow")

    # --- Headline values (the only two fields with non-None scalar defaults) ---
    reward_type: str = Field(
        "unknown",
        description="Reward breakdown category",
    )
    total: float = Field(
        0.0,
        description="Total reward for this step",
    )
    raw_total: Optional[float] = Field(
        None,
        description="Raw total reward for this step before normalization",
    )

    # --- Per-step scores; each normalized value is followed by its raw twin ---
    score: Optional[float] = Field(
        None,
        description="Normalized score for simple cases",
    )
    raw_score: Optional[float] = Field(
        None,
        description="Raw score for simple cases before normalization",
    )
    requested_score: Optional[float] = Field(
        None,
        description="Normalized uncapped score requested by the reward rule",
    )
    raw_requested_score: Optional[float] = Field(
        None,
        description="Raw uncapped score requested by the reward rule before normalization",
    )
    applied_score: Optional[float] = Field(
        None,
        description="Normalized score applied after caps or bounds",
    )
    raw_applied_score: Optional[float] = Field(
        None,
        description="Raw score applied after caps or bounds before normalization",
    )

    # --- Aggregated totals ---
    step_total: Optional[float] = Field(
        None,
        description="Normalized combined step reward in multi-part terminal cases",
    )
    raw_step_total: Optional[float] = Field(
        None,
        description="Raw combined step reward in multi-part terminal cases before normalization",
    )
    trajectory_total: Optional[float] = Field(
        None,
        description="Normalized cumulative trajectory shaping reward",
    )
    raw_trajectory_total: Optional[float] = Field(
        None,
        description="Raw cumulative trajectory shaping reward before normalization",
    )
    episode_total: Optional[float] = Field(
        None,
        description="Normalized running episode reward after this step",
    )
    raw_episode_total: Optional[float] = Field(
        None,
        description="Raw running episode reward after this step before normalization",
    )
    cumulative_total: Optional[float] = Field(
        None,
        description="Normalized episode reward total after terminal application",
    )
    raw_cumulative_total: Optional[float] = Field(
        None,
        description="Raw episode reward total after terminal application before normalization",
    )

    # --- Theoretical terminal bounds ---
    theoretical_terminal_max: Optional[float] = Field(
        None,
        description="Normalized maximum possible terminal reward",
    )
    theoretical_terminal_min: Optional[float] = Field(
        None,
        description="Normalized minimum possible terminal reward",
    )
    raw_theoretical_terminal_max: Optional[float] = Field(
        None,
        description="Raw maximum possible terminal reward before normalization",
    )
    raw_theoretical_terminal_min: Optional[float] = Field(
        None,
        description="Raw minimum possible terminal reward before normalization",
    )

    # --- Context-request bookkeeping ---
    context_field: Optional[str] = Field(
        None,
        description="Context source involved in the reward",
    )
    context_needed: List[str] = Field(
        # A factory gives each instance its own list.
        default_factory=list,
        description="Ground-truth context sources needed",
    )
    is_needed: Optional[bool] = Field(
        None,
        description="Whether the requested context was useful",
    )
    retrieved: Optional[bool] = Field(
        None,
        description="Whether the context source had retrievable data",
    )
    reason: Optional[str] = Field(
        None,
        description="Machine-readable reason",
    )
    action_type: Optional[str] = Field(
        None,
        description="Action type involved in the reward",
    )
    trajectory_cap: Optional[float] = Field(
        None,
        description="Trajectory reward cap in effect",
    )

    # --- Typed scoring components ---
    decision: Optional[BreakdownComponent] = Field(
        None,
        description="Decision-scoring component",
    )
    factor: Optional[BreakdownComponent] = Field(
        None,
        description="Factor overlap component",
    )
    efficiency: Optional[BreakdownComponent] = Field(
        None,
        description="Efficiency component",
    )
    calibration: Optional[BreakdownComponent] = Field(
        None,
        description="Calibration component",
    )

    # --- Untyped nested payloads ---
    trajectory: Optional[Dict[str, Any]] = Field(
        None,
        description="Nested trajectory reward payload for no-decision terminal cases",
    )
    no_decision: Optional[Dict[str, Any]] = Field(
        None,
        description="Nested no-decision penalty payload when the budget is exhausted",
    )
    last_terminal_breakdown: Optional[Dict[str, Any]] = Field(
        None,
        description="Previous terminal reward payload when guarding completed episodes",
    )


class TaskGradeBreakdown(BaseModel):
    """Typed normalized grader breakdown returned on terminal steps."""

    # extra="allow": keys not declared below are kept on the instance rather
    # than rejected.
    model_config = ConfigDict(extra="allow")

    # One optional component per grading dimension.
    decision: Optional[BreakdownComponent] = Field(
        None,
        description="Decision grading component",
    )
    factor_overlap: Optional[BreakdownComponent] = Field(
        None,
        description="Factor-overlap grading component",
    )
    efficiency: Optional[BreakdownComponent] = Field(
        None,
        description="Efficiency grading component",
    )
    calibration: Optional[BreakdownComponent] = Field(
        None,
        description="Calibration grading component",
    )

    # NOTE(review): the default of 0.0 lies outside the open interval named in
    # the description, and no bound is enforced on this field — confirm intent.
    total: float = Field(
        0.0,
        description="Normalized task grade in the open interval (0, 1)",
    )


# ============================================================================
# Core OpenEnv Models
# ============================================================================


class ModerationAction(Action):
    """
    Action to be executed in the SafeSpace environment.

    Investigation actions (cost 1 action each):
    - request_author_profile
    - request_author_violations
    - request_thread_context
    - request_community_rules
    - request_linked_content
    - request_similar_precedents
    - request_reporter_credibility

    Terminal action:
    - decide (requires decision fields)
    """

    # The single required field; it selects which kind of action this is.
    action_type: ActionType = Field(
        ...,
        description=(
            "One of: "
            "'request_author_profile', "
            "'request_author_violations', "
            "'request_thread_context', "
            "'request_community_rules', "
            "'request_linked_content', "
            "'request_similar_precedents', "
            "'request_reporter_credibility', "
            "'decide'"
        ),
    )

    # Decision payload. Per the original note these are required ONLY when
    # action_type == "decide"; every one defaults to None and nothing in this
    # class enforces the requirement.
    decision: Optional[DecisionType] = Field(
        None,
        description="One of: 'approve', 'remove', 'escalate', 'warn'",
    )
    primary_violation: Optional[str] = Field(
        None,
        description="Policy section ID (e.g., '1.0', '2.1', '3.1') or 'none'",
    )
    severity: Optional[SeverityType] = Field(
        None,
        description="One of: 'none', 'low', 'medium', 'high', 'critical'",
    )
    confidence: Optional[float] = Field(
        None,
        # Inclusive bounds: 0.0 and 1.0 are both accepted.
        ge=0.0,
        le=1.0,
        description="Agent's confidence in the decision (0.0 to 1.0)",
    )
    key_factors: Optional[List[str]] = Field(
        None,
        description="Selected factors from the FACTOR_LIST that influenced the decision",
    )


class ModerationObservation(Observation):
    """
    Observation returned from the SafeSpace environment.

    Contains the content to moderate, trigger information, gathered context,
    platform policy, and episode progress.
    """

    # --- What is under review and why it was queued ---
    content_item: Optional[ContentItem] = Field(
        None,
        description="The content item being moderated",
    )
    trigger_info: Optional[TriggerInfo] = Field(
        None,
        description="How this content entered the moderation queue",
    )

    # --- Investigation results; defaults to a fresh, empty GatheredContext ---
    gathered_context: GatheredContext = Field(
        default_factory=GatheredContext,
        description="Context gathered through investigation actions",
    )

    # --- Policy text and the factors an agent may cite ---
    platform_policy: str = Field(
        "",
        description="The platform's content moderation policy document",
    )
    available_factors: List[str] = Field(
        default_factory=list,
        description="List of factors the agent can cite in its decision",
    )

    # --- Episode progress ---
    actions_taken: int = Field(
        0,
        description="Number of actions taken this episode",
    )
    max_actions: int = Field(
        8,
        description="Maximum actions allowed per episode",
    )
    action_history: List[str] = Field(
        default_factory=list,
        description="List of actions taken so far",
    )
    feedback: str = Field(
        "",
        description="Feedback message from the last action",
    )
    error_code: Optional[str] = Field(
        None,
        description="Structured error code for invalid or rejected actions",
    )

    # --- Reward and grading output ---
    reward_breakdown: Optional[RewardBreakdown] = Field(
        None,
        description="Breakdown of reward components for the last step",
    )
    task_grade: Optional[float] = Field(
        None,
        # Validation bounds are inclusive (ge/le), so 0.0 and 1.0 are accepted
        # even though the description speaks of a strict interval.
        ge=0.0,
        le=1.0,
        description=(
            "Deterministic normalized task grade for the current episode, "
            "strictly between 0 and 1 on terminal steps"
        ),
    )
    grade_breakdown: Optional[TaskGradeBreakdown] = Field(
        None,
        description="Breakdown of normalized task-grade components",
    )


class ModerationState(State):
    """
    State of the SafeSpace environment.

    Tracks episode metadata and progress.
    """

    # --- Re-declared here; the original note says these override base State fields ---
    episode_id: Optional[str] = Field(
        None,
        description="Unique identifier for this episode",
    )
    step_count: int = Field(
        0,
        description="Number of steps taken",
    )

    # --- Which scenario this episode is running ---
    scenario_id: Optional[str] = Field(
        None,
        description="Current scenario ID",
    )
    task_id: Optional[str] = Field(
        None,
        description="Task ID used to load this scenario",
    )
    difficulty: Optional[DifficultyType] = Field(
        None,
        description="Scenario difficulty: easy, medium, or hard",
    )
    trigger_type: Optional[TriggerType] = Field(
        None,
        description="How this content entered the moderation queue",
    )

    # --- SafeSpace-specific public progress fields ---
    actions_taken: int = Field(
        0,
        description="Number of investigation actions taken",
    )
    max_actions: int = Field(
        8,
        description="Maximum actions allowed per episode",
    )
    context_requested: List[str] = Field(
        # A factory gives each instance its own list.
        default_factory=list,
        description="List of context types requested",
    )
    decision_made: bool = Field(
        False,
        description="Whether a terminal decision has been made",
    )

    # --- Reward accounting: normalized value plus its raw counterpart ---
    episode_reward: float = Field(
        0.0,
        description="Normalized total reward for episode",
    )
    raw_episode_reward: float = Field(
        0.0,
        description="Raw total reward for episode before normalization",
    )

    # --- Termination and error status ---
    done: bool = Field(
        False,
        description="Whether the episode is terminal",
    )
    last_error_code: Optional[str] = Field(
        None,
        description="Structured error code from the most recent rejected action",
    )