Spaces:

Ishangtxl
/

SafeSpace

Sleeping

App Files Files Community

SafeSpace / models.py

Ishangtxl

Upload folder using huggingface_hub

1ccd052 verified about 1 month ago

raw

history blame contribute delete

17.2 kB

	# Copyright (c) Meta Platforms, Inc. and affiliates.
	# All rights reserved.
	#
	# This source code is licensed under the BSD-style license found in the
	# LICENSE file in the root directory of this source tree.

	"""
	Data models for the SafeSpace Content Moderation Environment.

	SafeSpace is an RL environment where an AI agent acts as a content moderator,
	investigating reported posts and making structured moderation decisions.
	"""

	from typing import Any, Dict, List, Literal, Optional

	from openenv.core.env_server.types import Action, Observation, State
	from pydantic import BaseModel, ConfigDict, Field

	# Closed string vocabularies shared by the models in this module.

	# Actions an agent may submit: the seven investigation requests followed by
	# the single terminal action.
	ActionType = Literal[
	    # Investigation requests
	    "request_author_profile",
	    "request_author_violations",
	    "request_thread_context",
	    "request_community_rules",
	    "request_linked_content",
	    "request_similar_precedents",
	    "request_reporter_credibility",
	    # Terminal action
	    "decide",
	]

	# Outcomes accepted by ModerationAction.decision.
	DecisionType = Literal[
	    "approve",
	    "remove",
	    "escalate",
	    "warn",
	]

	# Severity labels accepted by ModerationAction.severity.
	SeverityType = Literal[
	    "none",
	    "low",
	    "medium",
	    "high",
	    "critical",
	]

	# Ways a content item can enter the moderation queue.
	TriggerType = Literal[
	    "user_report",
	    "auto_flag",
	    "appeal",
	    "proactive_audit",
	]

	# Media kinds a ContentItem can carry.
	MediaType = Literal[
	    "text",
	    "text+image",
	    "text+link",
	]

	# Scenario difficulty levels recorded on ModerationState.
	DifficultyType = Literal[
	    "easy",
	    "medium",
	    "hard",
	]


	# ============================================================================
	# Supporting Models (nested in Observation)
	# ============================================================================


	class ContentItem(BaseModel):
	    """A content item (post) that needs moderation review."""

	    # Identity of the post and where it was published. All required.
	    post_id: str = Field(
	        ...,
	        description="Unique identifier for the post",
	    )
	    text: str = Field(
	        ...,
	        description="The text content of the post",
	    )
	    author_id: str = Field(
	        ...,
	        description="Unique identifier of the author",
	    )
	    community: str = Field(
	        ...,
	        description="Community where the post was made (e.g., 'gaming', 'health')",
	    )
	    # Kept as a plain string; no datetime parsing happens in this model.
	    timestamp: str = Field(
	        ...,
	        description="ISO timestamp when the post was created",
	    )

	    # Media attached to the post. Only the description is optional.
	    media_type: MediaType = Field(
	        ...,
	        description="Type of media: 'text', 'text+image', or 'text+link'",
	    )
	    media_description: Optional[str] = Field(
	        default=None,
	        description="Text description of image/link if present",
	    )


	class TriggerInfo(BaseModel):
	    """How this content entered the moderation queue."""

	    # Only the trigger type is required; every trigger-specific field below
	    # has a default, so it may be left unset for other trigger types.
	    trigger_type: TriggerType = Field(
	        ...,
	        description="One of: 'user_report', 'auto_flag', 'appeal', 'proactive_audit'",
	    )

	    # Fields for user_report
	    report_count: int = Field(
	        default=0,
	        description="Number of reports received",
	    )
	    report_categories: List[str] = Field(
	        default_factory=list,
	        description="Categories selected by reporters",
	    )
	    sample_report_reason: Optional[str] = Field(
	        default=None,
	        description="Example report reason from a user",
	    )

	    # Fields for auto_flag
	    auto_flag_reason: Optional[str] = Field(
	        default=None,
	        description="Why automated system flagged this content",
	    )

	    # Fields for appeal
	    original_decision: Optional[str] = Field(
	        default=None,
	        description="The original moderation decision being appealed",
	    )
	    appeal_text: Optional[str] = Field(
	        default=None,
	        description="User's appeal message",
	    )

	    # Fields for proactive_audit
	    audit_reason: Optional[str] = Field(
	        default=None,
	        description="Why this content was selected for audit",
	    )


	class GatheredContext(BaseModel):
	    """Context gathered through investigation actions. Starts empty."""

	    # One slot per "request_*" action type. Each slot defaults to None,
	    # which is what makes a freshly constructed instance "empty".

	    # About the author
	    author_profile: Optional[Dict[str, Any]] = Field(
	        default=None,
	        description="Author's bio, account age, follower count",
	    )
	    author_violations: Optional[List[Dict[str, Any]]] = Field(
	        default=None,
	        description="Author's past moderation violations",
	    )

	    # About the post and its surroundings
	    thread_context: Optional[List[Dict[str, Any]]] = Field(
	        default=None,
	        description="Full conversation thread",
	    )
	    community_rules: Optional[str] = Field(
	        default=None,
	        description="Community-specific moderation guidelines",
	    )
	    linked_content_summary: Optional[str] = Field(
	        default=None,
	        description="What the linked content contains",
	    )

	    # About prior moderation and the reporter
	    similar_precedents: Optional[List[Dict[str, Any]]] = Field(
	        default=None,
	        description="How similar posts were moderated before",
	    )
	    reporter_credibility: Optional[Dict[str, Any]] = Field(
	        default=None,
	        description="Reporter's history of accurate vs false reports",
	    )


	class BreakdownComponent(BaseModel):
	    """Typed reward or grading component with room for structured details."""

	    # extra="allow": keys not declared below are kept instead of rejected.
	    model_config = ConfigDict(extra="allow")

	    # Component score and its bounds. The names `max` and `min` shadow the
	    # builtins inside this class body only; they are part of the payload
	    # shape and must not be renamed.
	    score: Optional[float] = Field(
	        default=None,
	        description="Component score",
	    )
	    max: Optional[float] = Field(
	        default=None,
	        description="Maximum component score",
	    )
	    min: Optional[float] = Field(
	        default=None,
	        description="Minimum component score",
	    )

	    # The same three values before normalization.
	    raw_score: Optional[float] = Field(
	        default=None,
	        description="Raw component score before normalization",
	    )
	    raw_max: Optional[float] = Field(
	        default=None,
	        description="Raw maximum component score before normalization",
	    )
	    raw_min: Optional[float] = Field(
	        default=None,
	        description="Raw minimum component score before normalization",
	    )

	    # Weighting and free-form calculation details.
	    weight: Optional[float] = Field(
	        default=None,
	        description="Normalized weighting used by the task grade",
	    )
	    details: Dict[str, Any] = Field(
	        default_factory=dict,
	        description="Structured details for the component calculation",
	    )


	class RewardBreakdown(BaseModel):
	    """Typed reward breakdown returned on reset, intermediate, and terminal steps."""

	    # extra="allow": keys not declared below are kept instead of rejected.
	    model_config = ConfigDict(extra="allow")

	    # Most numeric fields come in pairs: a normalized value and a "raw_"
	    # twin holding the value before normalization. Only `reward_type` and
	    # `total` have non-None scalar defaults.

	    # --- Category and step total ------------------------------------------
	    reward_type: str = Field(
	        default="unknown",
	        description="Reward breakdown category",
	    )
	    total: float = Field(
	        default=0.0,
	        description="Total reward for this step",
	    )
	    raw_total: Optional[float] = Field(
	        default=None,
	        description="Raw total reward for this step before normalization",
	    )

	    # --- Simple-case score --------------------------------------------------
	    score: Optional[float] = Field(
	        default=None,
	        description="Normalized score for simple cases",
	    )
	    raw_score: Optional[float] = Field(
	        default=None,
	        description="Raw score for simple cases before normalization",
	    )

	    # --- Requested vs. applied score ----------------------------------------
	    requested_score: Optional[float] = Field(
	        default=None,
	        description="Normalized uncapped score requested by the reward rule",
	    )
	    raw_requested_score: Optional[float] = Field(
	        default=None,
	        description="Raw uncapped score requested by the reward rule before normalization",
	    )
	    applied_score: Optional[float] = Field(
	        default=None,
	        description="Normalized score applied after caps or bounds",
	    )
	    raw_applied_score: Optional[float] = Field(
	        default=None,
	        description="Raw score applied after caps or bounds before normalization",
	    )

	    # --- Step, trajectory, episode and cumulative totals --------------------
	    step_total: Optional[float] = Field(
	        default=None,
	        description="Normalized combined step reward in multi-part terminal cases",
	    )
	    raw_step_total: Optional[float] = Field(
	        default=None,
	        description="Raw combined step reward in multi-part terminal cases before normalization",
	    )
	    trajectory_total: Optional[float] = Field(
	        default=None,
	        description="Normalized cumulative trajectory shaping reward",
	    )
	    raw_trajectory_total: Optional[float] = Field(
	        default=None,
	        description="Raw cumulative trajectory shaping reward before normalization",
	    )
	    episode_total: Optional[float] = Field(
	        default=None,
	        description="Normalized running episode reward after this step",
	    )
	    raw_episode_total: Optional[float] = Field(
	        default=None,
	        description="Raw running episode reward after this step before normalization",
	    )
	    cumulative_total: Optional[float] = Field(
	        default=None,
	        description="Normalized episode reward total after terminal application",
	    )
	    raw_cumulative_total: Optional[float] = Field(
	        default=None,
	        description="Raw episode reward total after terminal application before normalization",
	    )

	    # --- Theoretical terminal bounds ----------------------------------------
	    theoretical_terminal_max: Optional[float] = Field(
	        default=None,
	        description="Normalized maximum possible terminal reward",
	    )
	    theoretical_terminal_min: Optional[float] = Field(
	        default=None,
	        description="Normalized minimum possible terminal reward",
	    )
	    raw_theoretical_terminal_max: Optional[float] = Field(
	        default=None,
	        description="Raw maximum possible terminal reward before normalization",
	    )
	    raw_theoretical_terminal_min: Optional[float] = Field(
	        default=None,
	        description="Raw minimum possible terminal reward before normalization",
	    )

	    # --- Context-request bookkeeping ----------------------------------------
	    context_field: Optional[str] = Field(
	        default=None,
	        description="Context source involved in the reward",
	    )
	    context_needed: List[str] = Field(
	        default_factory=list,
	        description="Ground-truth context sources needed",
	    )
	    is_needed: Optional[bool] = Field(
	        default=None,
	        description="Whether the requested context was useful",
	    )
	    retrieved: Optional[bool] = Field(
	        default=None,
	        description="Whether the context source had retrievable data",
	    )
	    reason: Optional[str] = Field(
	        default=None,
	        description="Machine-readable reason",
	    )
	    action_type: Optional[str] = Field(
	        default=None,
	        description="Action type involved in the reward",
	    )
	    trajectory_cap: Optional[float] = Field(
	        default=None,
	        description="Trajectory reward cap in effect",
	    )

	    # --- Typed scoring components -------------------------------------------
	    decision: Optional[BreakdownComponent] = Field(
	        default=None,
	        description="Decision-scoring component",
	    )
	    factor: Optional[BreakdownComponent] = Field(
	        default=None,
	        description="Factor overlap component",
	    )
	    efficiency: Optional[BreakdownComponent] = Field(
	        default=None,
	        description="Efficiency component",
	    )
	    calibration: Optional[BreakdownComponent] = Field(
	        default=None,
	        description="Calibration component",
	    )

	    # --- Nested payloads, left as untyped dicts -----------------------------
	    trajectory: Optional[Dict[str, Any]] = Field(
	        default=None,
	        description="Nested trajectory reward payload for no-decision terminal cases",
	    )
	    no_decision: Optional[Dict[str, Any]] = Field(
	        default=None,
	        description="Nested no-decision penalty payload when the budget is exhausted",
	    )
	    last_terminal_breakdown: Optional[Dict[str, Any]] = Field(
	        default=None,
	        description="Previous terminal reward payload when guarding completed episodes",
	    )


	class TaskGradeBreakdown(BaseModel):
	    """Typed normalized grader breakdown returned on terminal steps."""

	    # extra="allow": keys not declared below are kept instead of rejected.
	    model_config = ConfigDict(extra="allow")

	    # The four grading components; each is absent (None) unless supplied.
	    decision: Optional[BreakdownComponent] = Field(
	        default=None,
	        description="Decision grading component",
	    )
	    factor_overlap: Optional[BreakdownComponent] = Field(
	        default=None,
	        description="Factor-overlap grading component",
	    )
	    efficiency: Optional[BreakdownComponent] = Field(
	        default=None,
	        description="Efficiency grading component",
	    )
	    calibration: Optional[BreakdownComponent] = Field(
	        default=None,
	        description="Calibration grading component",
	    )

	    # NOTE(review): the 0.0 default lies outside the open interval (0, 1)
	    # named in the description, and no bound is enforced here - confirm
	    # that callers always set this explicitly.
	    total: float = Field(
	        default=0.0,
	        description="Normalized task grade in the open interval (0, 1)",
	    )


	# ============================================================================
	# Core OpenEnv Models
	# ============================================================================


	class ModerationAction(Action):
	    """
	    Action to be executed in the SafeSpace environment.

	    Investigation actions (cost 1 action each):
	    - request_author_profile
	    - request_author_violations
	    - request_thread_context
	    - request_community_rules
	    - request_linked_content
	    - request_similar_precedents
	    - request_reporter_credibility

	    Terminal action:
	    - decide (requires decision fields)
	    """

	    # The only required field: which action is being submitted.
	    action_type: ActionType = Field(
	        ...,
	        description=(
	            "One of: "
	            "'request_author_profile', "
	            "'request_author_violations', "
	            "'request_thread_context', "
	            "'request_community_rules', "
	            "'request_linked_content', "
	            "'request_similar_precedents', "
	            "'request_reporter_credibility', "
	            "'decide'"
	        ),
	    )

	    # === Decision fields (required ONLY when action_type == "decide") ===
	    # NOTE(review): nothing in this model enforces that requirement; every
	    # field below defaults to None. Presumably the environment rejects
	    # incomplete "decide" actions itself - confirm against the step logic.

	    decision: Optional[DecisionType] = Field(
	        default=None,
	        description="One of: 'approve', 'remove', 'escalate', 'warn'",
	    )
	    primary_violation: Optional[str] = Field(
	        default=None,
	        description="Policy section ID (e.g., '1.0', '2.1', '3.1') or 'none'",
	    )
	    severity: Optional[SeverityType] = Field(
	        default=None,
	        description="One of: 'none', 'low', 'medium', 'high', 'critical'",
	    )
	    # Bounded to the closed interval [0.0, 1.0] by ge/le.
	    confidence: Optional[float] = Field(
	        default=None,
	        ge=0.0,
	        le=1.0,
	        description="Agent's confidence in the decision (0.0 to 1.0)",
	    )
	    key_factors: Optional[List[str]] = Field(
	        default=None,
	        description="Selected factors from the FACTOR_LIST that influenced the decision",
	    )


	class ModerationObservation(Observation):
	    """
	    Observation returned from the SafeSpace environment.

	    Contains the content to moderate, trigger information, gathered context,
	    platform policy, and episode progress.
	    """

	    # --- What is being moderated and why it was queued ----------------------
	    content_item: Optional[ContentItem] = Field(
	        default=None,
	        description="The content item being moderated",
	    )
	    trigger_info: Optional[TriggerInfo] = Field(
	        default=None,
	        description="How this content entered the moderation queue",
	    )

	    # --- Investigation results ----------------------------------------------
	    # Defaults to a fresh, empty GatheredContext per observation.
	    gathered_context: GatheredContext = Field(
	        default_factory=GatheredContext,
	        description="Context gathered through investigation actions",
	    )

	    # --- Policy text and citable factors ------------------------------------
	    platform_policy: str = Field(
	        default="",
	        description="The platform's content moderation policy document",
	    )
	    available_factors: List[str] = Field(
	        default_factory=list,
	        description="List of factors the agent can cite in its decision",
	    )

	    # --- Episode progress ---------------------------------------------------
	    actions_taken: int = Field(
	        default=0,
	        description="Number of actions taken this episode",
	    )
	    max_actions: int = Field(
	        default=8,
	        description="Maximum actions allowed per episode",
	    )
	    action_history: List[str] = Field(
	        default_factory=list,
	        description="List of actions taken so far",
	    )
	    feedback: str = Field(
	        default="",
	        description="Feedback message from the last action",
	    )
	    error_code: Optional[str] = Field(
	        default=None,
	        description="Structured error code for invalid or rejected actions",
	    )

	    # --- Reward and grading -------------------------------------------------
	    reward_breakdown: Optional[RewardBreakdown] = Field(
	        default=None,
	        description="Breakdown of reward components for the last step",
	    )
	    # NOTE(review): ge/le make the bounds inclusive, while the description
	    # says "strictly between 0 and 1" - confirm which one is intended.
	    task_grade: Optional[float] = Field(
	        default=None,
	        ge=0.0,
	        le=1.0,
	        description=(
	            "Deterministic normalized task grade for the current episode, "
	            "strictly between 0 and 1 on terminal steps"
	        ),
	    )
	    grade_breakdown: Optional[TaskGradeBreakdown] = Field(
	        default=None,
	        description="Breakdown of normalized task-grade components",
	    )


	class ModerationState(State):
	    """
	    State of the SafeSpace environment.

	    Tracks episode metadata and progress.
	    """

	    # --- Override base State fields -----------------------------------------
	    episode_id: Optional[str] = Field(
	        default=None,
	        description="Unique identifier for this episode",
	    )
	    step_count: int = Field(
	        default=0,
	        description="Number of steps taken",
	    )

	    # --- Episode identification ---------------------------------------------
	    scenario_id: Optional[str] = Field(
	        default=None,
	        description="Current scenario ID",
	    )
	    task_id: Optional[str] = Field(
	        default=None,
	        description="Task ID used to load this scenario",
	    )
	    difficulty: Optional[DifficultyType] = Field(
	        default=None,
	        description="Scenario difficulty: easy, medium, or hard",
	    )
	    trigger_type: Optional[TriggerType] = Field(
	        default=None,
	        description="How this content entered the moderation queue",
	    )

	    # --- SafeSpace-specific public progress fields --------------------------
	    actions_taken: int = Field(
	        default=0,
	        description="Number of investigation actions taken",
	    )
	    max_actions: int = Field(
	        default=8,
	        description="Maximum actions allowed per episode",
	    )
	    context_requested: List[str] = Field(
	        default_factory=list,
	        description="List of context types requested",
	    )
	    decision_made: bool = Field(
	        default=False,
	        description="Whether a terminal decision has been made",
	    )

	    # Running reward, kept both normalized and raw.
	    episode_reward: float = Field(
	        default=0.0,
	        description="Normalized total reward for episode",
	    )
	    raw_episode_reward: float = Field(
	        default=0.0,
	        description="Raw total reward for episode before normalization",
	    )

	    # --- Termination and last error -----------------------------------------
	    done: bool = Field(
	        default=False,
	        description="Whether the episode is terminal",
	    )
	    last_error_code: Optional[str] = Field(
	        default=None,
	        description="Structured error code from the most recent rejected action",
	    )