File size: 17,166 Bytes
65b5532
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c7a9ff7
 
 
 
 
 
 
 
 
65b5532
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c7a9ff7
 
 
 
 
 
 
 
 
65b5532
c7a9ff7
 
 
 
 
65b5532
 
c7a9ff7
 
 
 
 
65b5532
 
c7a9ff7
 
 
 
 
65b5532
 
c7a9ff7
 
 
 
 
65b5532
 
c7a9ff7
 
 
 
 
65b5532
 
c7a9ff7
 
 
 
 
 
65b5532
 
c7a9ff7
65b5532
 
c7a9ff7
 
 
 
 
 
 
 
 
65b5532
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ccd052
65b5532
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ccd052
65b5532
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c7a9ff7
 
 
 
 
 
65b5532
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
Data models for the SafeSpace Content Moderation Environment.

SafeSpace is an RL environment where an AI agent acts as a content moderator,
investigating reported posts and making structured moderation decisions.
"""

from typing import Any, Dict, List, Literal, Optional

from openenv.core.env_server.types import Action, Observation, State
from pydantic import BaseModel, ConfigDict, Field

# Closed vocabularies shared by the models below. Each alias is a
# ``typing.Literal``, so Pydantic rejects any value outside the listed strings.

# Everything an agent may submit: seven "request_*" context lookups followed by
# the "decide" action.
ActionType = Literal[
    "request_author_profile",
    "request_author_violations",
    "request_thread_context",
    "request_community_rules",
    "request_linked_content",
    "request_similar_precedents",
    "request_reporter_credibility",
    "decide",
]

# Outcome the agent can choose for a post.
DecisionType = Literal[
    "approve",
    "remove",
    "escalate",
    "warn",
]

# Severity labels, listed from "none" up to "critical".
SeverityType = Literal[
    "none",
    "low",
    "medium",
    "high",
    "critical",
]

# How a post entered the moderation queue.
TriggerType = Literal[
    "user_report",
    "auto_flag",
    "appeal",
    "proactive_audit",
]

# Media attached to a post; every variant includes text.
MediaType = Literal[
    "text",
    "text+image",
    "text+link",
]

# Scenario difficulty tiers.
DifficultyType = Literal[
    "easy",
    "medium",
    "hard",
]


# ============================================================================
# Supporting Models (nested in Observation)
# ============================================================================


class ContentItem(BaseModel):
    """A content item (post) that needs moderation review."""

    # Every field except ``media_description`` is required (declared with
    # ``...``), so a ContentItem cannot be built from a partial record.
    post_id: str = Field(
        ...,
        description="Unique identifier for the post",
    )
    text: str = Field(
        ...,
        description="The text content of the post",
    )
    author_id: str = Field(
        ...,
        description="Unique identifier of the author",
    )
    community: str = Field(
        ...,
        description="Community where the post was made (e.g., 'gaming', 'health')",
    )
    # Kept as a plain string; no datetime parsing or format validation is
    # applied by this model.
    timestamp: str = Field(
        ...,
        description="ISO timestamp when the post was created",
    )
    media_type: MediaType = Field(
        ...,
        description="Type of media: 'text', 'text+image', or 'text+link'",
    )
    # The only optional field; defaults to None.
    media_description: Optional[str] = Field(
        default=None,
        description="Text description of image/link if present",
    )


class TriggerInfo(BaseModel):
    """How this content entered the moderation queue."""

    # Only ``trigger_type`` is required. The remaining fields are grouped by the
    # trigger they relate to; the model does not validate that a group matches
    # the chosen trigger_type, and unused groups keep their defaults.
    trigger_type: TriggerType = Field(
        ...,
        description="One of: 'user_report', 'auto_flag', 'appeal', 'proactive_audit'",
    )

    # --- trigger_type == "user_report" ---
    report_count: int = Field(
        default=0,
        description="Number of reports received",
    )
    report_categories: List[str] = Field(
        default_factory=list,
        description="Categories selected by reporters",
    )
    sample_report_reason: Optional[str] = Field(
        default=None,
        description="Example report reason from a user",
    )

    # --- trigger_type == "auto_flag" ---
    auto_flag_reason: Optional[str] = Field(
        default=None,
        description="Why automated system flagged this content",
    )

    # --- trigger_type == "appeal" ---
    # Free-form string here, not restricted to DecisionType.
    original_decision: Optional[str] = Field(
        default=None,
        description="The original moderation decision being appealed",
    )
    appeal_text: Optional[str] = Field(
        default=None,
        description="User's appeal message",
    )

    # --- trigger_type == "proactive_audit" ---
    audit_reason: Optional[str] = Field(
        default=None,
        description="Why this content was selected for audit",
    )


class GatheredContext(BaseModel):
    """Context gathered through investigation actions. Starts empty."""

    # Each field defaults to None. The payloads are loosely typed
    # (dicts / lists of dicts / strings); their inner keys are not defined here.

    # --- About the author ---
    author_profile: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Author's bio, account age, follower count",
    )
    author_violations: Optional[List[Dict[str, Any]]] = Field(
        default=None,
        description="Author's past moderation violations",
    )

    # --- About the post and where it was made ---
    thread_context: Optional[List[Dict[str, Any]]] = Field(
        default=None,
        description="Full conversation thread",
    )
    community_rules: Optional[str] = Field(
        default=None,
        description="Community-specific moderation guidelines",
    )
    linked_content_summary: Optional[str] = Field(
        default=None,
        description="What the linked content contains",
    )

    # --- Supporting evidence ---
    similar_precedents: Optional[List[Dict[str, Any]]] = Field(
        default=None,
        description="How similar posts were moderated before",
    )
    reporter_credibility: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Reporter's history of accurate vs false reports",
    )


class BreakdownComponent(BaseModel):
    """Typed reward or grading component with room for structured details."""

    # extra="allow": keys not declared below are accepted and kept on the
    # instance rather than rejected.
    model_config = ConfigDict(extra="allow")

    # --- Component score and its bounds ---
    score: Optional[float] = Field(
        default=None,
        description="Component score",
    )
    max: Optional[float] = Field(
        default=None,
        description="Maximum component score",
    )
    min: Optional[float] = Field(
        default=None,
        description="Minimum component score",
    )

    # --- The same three values before normalization ---
    raw_score: Optional[float] = Field(
        default=None,
        description="Raw component score before normalization",
    )
    raw_max: Optional[float] = Field(
        default=None,
        description="Raw maximum component score before normalization",
    )
    raw_min: Optional[float] = Field(
        default=None,
        description="Raw minimum component score before normalization",
    )

    weight: Optional[float] = Field(
        default=None,
        description="Normalized weighting used by the task grade",
    )
    # default_factory gives every instance its own empty dict.
    details: Dict[str, Any] = Field(
        default_factory=dict,
        description="Structured details for the component calculation",
    )


class RewardBreakdown(BaseModel):
    """Typed reward breakdown returned on reset, intermediate, and terminal steps."""

    # extra="allow": keys not declared below are accepted and kept on the
    # instance rather than rejected.
    model_config = ConfigDict(extra="allow")

    # Most numeric fields come in pairs: ``name`` and ``raw_name``. Per the
    # field descriptions, the ``raw_`` member is the value before normalization.
    # Only ``reward_type``, ``total`` and ``context_needed`` have non-None
    # defaults.

    # --- Headline values ---
    reward_type: str = Field(
        default="unknown",
        description="Reward breakdown category",
    )
    total: float = Field(
        default=0.0,
        description="Total reward for this step",
    )
    raw_total: Optional[float] = Field(
        default=None,
        description="Raw total reward for this step before normalization",
    )

    # --- Simple score ---
    score: Optional[float] = Field(
        default=None,
        description="Normalized score for simple cases",
    )
    raw_score: Optional[float] = Field(
        default=None,
        description="Raw score for simple cases before normalization",
    )

    # --- Requested vs. applied score (before / after caps) ---
    requested_score: Optional[float] = Field(
        default=None,
        description="Normalized uncapped score requested by the reward rule",
    )
    raw_requested_score: Optional[float] = Field(
        default=None,
        description="Raw uncapped score requested by the reward rule before normalization",
    )
    applied_score: Optional[float] = Field(
        default=None,
        description="Normalized score applied after caps or bounds",
    )
    raw_applied_score: Optional[float] = Field(
        default=None,
        description="Raw score applied after caps or bounds before normalization",
    )

    # --- Step, trajectory, episode and cumulative totals ---
    step_total: Optional[float] = Field(
        default=None,
        description="Normalized combined step reward in multi-part terminal cases",
    )
    raw_step_total: Optional[float] = Field(
        default=None,
        description="Raw combined step reward in multi-part terminal cases before normalization",
    )
    trajectory_total: Optional[float] = Field(
        default=None,
        description="Normalized cumulative trajectory shaping reward",
    )
    raw_trajectory_total: Optional[float] = Field(
        default=None,
        description="Raw cumulative trajectory shaping reward before normalization",
    )
    episode_total: Optional[float] = Field(
        default=None,
        description="Normalized running episode reward after this step",
    )
    raw_episode_total: Optional[float] = Field(
        default=None,
        description="Raw running episode reward after this step before normalization",
    )
    cumulative_total: Optional[float] = Field(
        default=None,
        description="Normalized episode reward total after terminal application",
    )
    raw_cumulative_total: Optional[float] = Field(
        default=None,
        description="Raw episode reward total after terminal application before normalization",
    )

    # --- Theoretical bounds of the terminal reward ---
    theoretical_terminal_max: Optional[float] = Field(
        default=None,
        description="Normalized maximum possible terminal reward",
    )
    theoretical_terminal_min: Optional[float] = Field(
        default=None,
        description="Normalized minimum possible terminal reward",
    )
    raw_theoretical_terminal_max: Optional[float] = Field(
        default=None,
        description="Raw maximum possible terminal reward before normalization",
    )
    raw_theoretical_terminal_min: Optional[float] = Field(
        default=None,
        description="Raw minimum possible terminal reward before normalization",
    )

    # --- Metadata about the context request / action being rewarded ---
    context_field: Optional[str] = Field(
        default=None,
        description="Context source involved in the reward",
    )
    context_needed: List[str] = Field(
        default_factory=list,
        description="Ground-truth context sources needed",
    )
    is_needed: Optional[bool] = Field(
        default=None,
        description="Whether the requested context was useful",
    )
    retrieved: Optional[bool] = Field(
        default=None,
        description="Whether the context source had retrievable data",
    )
    reason: Optional[str] = Field(
        default=None,
        description="Machine-readable reason",
    )
    # Plain string here, not restricted to ActionType.
    action_type: Optional[str] = Field(
        default=None,
        description="Action type involved in the reward",
    )
    trajectory_cap: Optional[float] = Field(
        default=None,
        description="Trajectory reward cap in effect",
    )

    # --- Typed scoring components ---
    decision: Optional[BreakdownComponent] = Field(
        default=None,
        description="Decision-scoring component",
    )
    factor: Optional[BreakdownComponent] = Field(
        default=None,
        description="Factor overlap component",
    )
    efficiency: Optional[BreakdownComponent] = Field(
        default=None,
        description="Efficiency component",
    )
    calibration: Optional[BreakdownComponent] = Field(
        default=None,
        description="Calibration component",
    )

    # --- Untyped nested payloads (plain dicts; inner keys not defined here) ---
    trajectory: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Nested trajectory reward payload for no-decision terminal cases",
    )
    no_decision: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Nested no-decision penalty payload when the budget is exhausted",
    )
    last_terminal_breakdown: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Previous terminal reward payload when guarding completed episodes",
    )


class TaskGradeBreakdown(BaseModel):
    """Typed normalized grader breakdown returned on terminal steps."""

    # extra="allow": keys not declared below are accepted and kept on the
    # instance rather than rejected.
    model_config = ConfigDict(extra="allow")

    # --- Per-component grades; each stays None unless supplied ---
    decision: Optional[BreakdownComponent] = Field(
        default=None,
        description="Decision grading component",
    )
    factor_overlap: Optional[BreakdownComponent] = Field(
        default=None,
        description="Factor-overlap grading component",
    )
    efficiency: Optional[BreakdownComponent] = Field(
        default=None,
        description="Efficiency grading component",
    )
    calibration: Optional[BreakdownComponent] = Field(
        default=None,
        description="Calibration grading component",
    )

    # NOTE(review): the description promises the open interval (0, 1), but the
    # default is 0.0 and no gt/lt bound is declared, so the range is not
    # enforced by this model.
    total: float = Field(
        default=0.0,
        description="Normalized task grade in the open interval (0, 1)",
    )


# ============================================================================
# Core OpenEnv Models
# ============================================================================


class ModerationAction(Action):
    """
    Action to be executed in the SafeSpace environment.

    Investigation actions (cost 1 action each):
    - request_author_profile
    - request_author_violations
    - request_thread_context
    - request_community_rules
    - request_linked_content
    - request_similar_precedents
    - request_reporter_credibility

    Terminal action:
    - decide (requires decision fields)
    """

    # The only required field; the Literal type restricts it to the values
    # spelled out in the description.
    action_type: ActionType = Field(
        ...,
        description=(
            "One of: "
            "'request_author_profile', "
            "'request_author_violations', "
            "'request_thread_context', "
            "'request_community_rules', "
            "'request_linked_content', "
            "'request_similar_precedents', "
            "'request_reporter_credibility', "
            "'decide'"
        ),
    )

    # --- Payload of a "decide" action ---
    # All of these default to None. This model has no validator tying them to
    # action_type == "decide", so the "required" rule in the docstring is not
    # enforced here.
    decision: Optional[DecisionType] = Field(
        default=None,
        description="One of: 'approve', 'remove', 'escalate', 'warn'",
    )
    # Free-form string; the section ID format is not validated.
    primary_violation: Optional[str] = Field(
        default=None,
        description="Policy section ID (e.g., '1.0', '2.1', '3.1') or 'none'",
    )
    severity: Optional[SeverityType] = Field(
        default=None,
        description="One of: 'none', 'low', 'medium', 'high', 'critical'",
    )
    # When provided, must lie in the closed range [0.0, 1.0].
    confidence: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description="Agent's confidence in the decision (0.0 to 1.0)",
    )
    # Entries are not checked against any factor list by this model.
    key_factors: Optional[List[str]] = Field(
        default=None,
        description="Selected factors from the FACTOR_LIST that influenced the decision",
    )


class ModerationObservation(Observation):
    """
    Observation returned from the SafeSpace environment.

    Contains the content to moderate, trigger information, gathered context,
    platform policy, and episode progress.
    """

    # Every field has a default, so an observation can be constructed with no
    # arguments beyond whatever the base Observation requires.

    # --- What is being moderated and why it was queued ---
    content_item: Optional[ContentItem] = Field(
        default=None,
        description="The content item being moderated",
    )
    trigger_info: Optional[TriggerInfo] = Field(
        default=None,
        description="How this content entered the moderation queue",
    )

    # --- Results of investigation actions ---
    # default_factory builds a fresh GatheredContext (all fields None).
    gathered_context: GatheredContext = Field(
        default_factory=GatheredContext,
        description="Context gathered through investigation actions",
    )

    # --- Reference material for the decision ---
    platform_policy: str = Field(
        default="",
        description="The platform's content moderation policy document",
    )
    available_factors: List[str] = Field(
        default_factory=list,
        description="List of factors the agent can cite in its decision",
    )

    # --- Progress through the episode ---
    actions_taken: int = Field(
        default=0,
        description="Number of actions taken this episode",
    )
    max_actions: int = Field(
        default=8,
        description="Maximum actions allowed per episode",
    )
    action_history: List[str] = Field(
        default_factory=list,
        description="List of actions taken so far",
    )

    # --- Outcome of the most recent action ---
    feedback: str = Field(
        default="",
        description="Feedback message from the last action",
    )
    error_code: Optional[str] = Field(
        default=None,
        description="Structured error code for invalid or rejected actions",
    )

    # --- Reward and grading details ---
    reward_breakdown: Optional[RewardBreakdown] = Field(
        default=None,
        description="Breakdown of reward components for the last step",
    )
    # NOTE(review): the bounds below are inclusive (ge/le), while the
    # description says "strictly between 0 and 1"; exactly 0.0 or 1.0 passes
    # validation.
    task_grade: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description="Deterministic normalized task grade for the current episode, strictly between 0 and 1 on terminal steps",
    )
    grade_breakdown: Optional[TaskGradeBreakdown] = Field(
        default=None,
        description="Breakdown of normalized task-grade components",
    )


class ModerationState(State):
    """
    State of the SafeSpace environment.

    Tracks episode metadata and progress.
    """

    # Override base State fields.
    # Redeclaring a field in a Pydantic subclass replaces the inherited field
    # definition wholesale, including any constraints the parent declared.
    episode_id: Optional[str] = Field(
        default=None, description="Unique identifier for this episode"
    )
    # ge=0 is restated so the override does not drop the non-negativity bound.
    # NOTE(review): the OpenEnv base State is understood to declare
    # step_count with ge=0 -- confirm against the installed openenv version.
    step_count: int = Field(
        default=0,
        ge=0,
        description="Number of steps taken",
    )

    # Episode identification (all None until set)
    scenario_id: Optional[str] = Field(
        default=None, description="Current scenario ID"
    )
    task_id: Optional[str] = Field(
        default=None, description="Task ID used to load this scenario"
    )
    difficulty: Optional[DifficultyType] = Field(
        default=None, description="Scenario difficulty: easy, medium, or hard"
    )
    trigger_type: Optional[TriggerType] = Field(
        default=None, description="How this content entered the moderation queue"
    )

    # SafeSpace-specific public progress fields
    actions_taken: int = Field(
        default=0, description="Number of investigation actions taken"
    )
    max_actions: int = Field(
        default=8, description="Maximum actions allowed per episode"
    )
    context_requested: List[str] = Field(
        default_factory=list, description="List of context types requested"
    )
    decision_made: bool = Field(
        default=False, description="Whether a terminal decision has been made"
    )

    # Reward accumulators: normalized value and its pre-normalization twin.
    episode_reward: float = Field(
        default=0.0, description="Normalized total reward for episode"
    )
    raw_episode_reward: float = Field(
        default=0.0, description="Raw total reward for episode before normalization"
    )

    done: bool = Field(default=False, description="Whether the episode is terminal")
    last_error_code: Optional[str] = Field(
        default=None,
        description="Structured error code from the most recent rejected action",
    )