File size: 2,714 Bytes
aa6f797
 
 
 
 
 
5231b2a
 
 
 
 
a447d83
4f199dc
5231b2a
4f199dc
5231b2a
aa6f797
4f199dc
aa6f797
 
4f199dc
5231b2a
4f199dc
5231b2a
aa6f797
4f199dc
aa6f797
 
4f199dc
5231b2a
4f199dc
5231b2a
aa6f797
4f199dc
aa6f797
a447d83
aa6f797
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a447d83
 
 
aa6f797
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# Manifest header: identifies the environment and states how it is served.
spec_version: 1
name: social_stream_moderation
type: environment
runtime: docker
# Quoted defensively because the value contains a colon. Presumably a
# "module:attribute" reference to the served app object - confirm against
# whatever launches the server.
app: 'server.app:app'
port: 7860
# Folded scalar: single line breaks collapse to spaces and exactly one
# trailing newline is kept. Avoid blank or extra-indented lines inside it,
# since those would be preserved as real newlines.
description: >
  A content-moderation RL environment where an agent must classify
  social-media posts as safe or harmful under varying policy regimes,
  with tasks spanning basic safety, contextual nuance, and fairness.

# Task catalogue, one entry per difficulty tier (easy / medium / hard).
# Each task's `grader_id` matches an `id` in the top-level `graders` list.
# The `name` values contain ": " and therefore must stay quoted.
tasks:
  # Easy tier.
  - id: clear_cut_moderation
    name: 'Task 1: Basic Safety'
    difficulty: easy
    description: 'Moderate a stream of social posts with obvious violations and safe content.'
    grader: deterministic
    grader_id: basic_safety_grader
    scoring: '0.0-1.0 reward based on action-label match'

  # Medium tier.
  - id: nuanced_sarcastic
    name: 'Task 2: Context & Nuance'
    difficulty: medium
    description: 'Handle sarcastic content and quotes of harmful material with condemnation.'
    grader: deterministic
    grader_id: context_nuance_grader
    scoring: '0.0-1.0 reward with context-aware adjustments'

  # Hard tier.
  - id: policy_fairness
    name: 'Task 3: Fairness & Bias'
    difficulty: hard
    description: 'Ensure fairness across user groups and adhere to stricter policy regimes.'
    grader: deterministic
    grader_id: fairness_bias_grader
    scoring: '0.0-1.0 reward with fairness penalty'

# Grader registry. Every `id` here is referenced by exactly one task's
# `grader_id` in the `tasks` list.
# `entry_point` values are quoted defensively because they contain a colon;
# presumably "module.path:ClassName" references - confirm against the loader.
graders:
  - id: basic_safety_grader
    description: 'Grader for basic safety checks'
    type: deterministic
    entry_point: 'envs.social_stream_moderation.graders:BasicSafetyGrader'
  - id: context_nuance_grader
    description: 'Grader for contextual and sarcastic content'
    type: deterministic
    entry_point: 'envs.social_stream_moderation.graders:ContextNuanceGrader'
  - id: fairness_bias_grader
    description: 'Grader for fairness and bias parity'
    type: deterministic
    entry_point: 'envs.social_stream_moderation.graders:FairnessBiasGrader'

# Observation fields, each mapped to the name of its declared type.
# The type names are plain strings ('string' / 'integer'), not YAML tags.
observation_space:
  # Textual fields.
  post_id: 'string'
  text: 'string'
  user_history_summary: 'string'
  context_type: 'string'
  platform_policy_mode: 'string'
  user_group: 'string'
  # Numeric fields.
  step_index: 'integer'
  total_steps: 'integer'

# Action space: a single string value restricted to the members listed
# under `enum`.
action_space:
  type: 'string'
  enum:
    - 'ALLOW'
    - 'ALLOW_WITH_WARNING'
    - 'SOFT_HIDE'
    - 'ESCALATE_HUMAN'
    - 'BAN_USER'

# Reward signal: a continuous value; `range` lists the lower bound followed
# by the upper bound, matching the "0.0-1.0" scoring stated for each task.
reward:
  type: continuous
  range:
    - 0.0
    - 1.0

# HTTP routes declared for the environment server: path, verb, and a short
# human-readable summary for each.
endpoints:
  # Episode control.
  - path: '/reset'
    method: POST
    description: 'Start a new episode'
  - path: '/step'
    method: POST
    description: 'Submit a moderation action'
  # Read-only queries.
  - path: '/state'
    method: GET
    description: 'Get current episode state'
  - path: '/tasks'
    method: GET
    description: 'List all tasks with grader info'
  - path: '/grader'
    method: GET
    description: 'Get grader score for current episode'
  - path: '/health'
    method: GET
    description: 'Health check'

# Relative file locations for the inference script and the server module.
# Presumably resolved against the directory holding this manifest - confirm
# with the consuming tool.
paths:
  inference: './inference.py'
  app: './server/app.py'