---
spec_version: 1
name: social_stream_moderation
type: environment
runtime: docker
app: server.app:app
port: 7860
description: >
A content-moderation RL environment where an agent must classify social-media
posts as safe or harmful under varying policy regimes, with tasks spanning
basic safety, contextual nuance, and fairness.
tasks:
- id: clear_cut_moderation
name: "Task 1: Basic Safety"
difficulty: easy
description: "Moderate a stream of social posts with obvious violations and safe content."
grader: deterministic
grader_id: basic_safety_grader
scoring: "0.0-1.0 reward based on action-label match"
- id: nuanced_sarcastic
name: "Task 2: Context & Nuance"
difficulty: medium
description: "Handle sarcastic content and quotes of harmful material with condemnation."
grader: deterministic
grader_id: context_nuance_grader
scoring: "0.0-1.0 reward with context-aware adjustments"
- id: policy_fairness
name: "Task 3: Fairness & Bias"
difficulty: hard
description: "Ensure fairness across user groups and adhere to stricter policy regimes."
grader: deterministic
grader_id: fairness_bias_grader
scoring: "0.0-1.0 reward with fairness penalty"
graders:
- id: basic_safety_grader
description: "Grader for basic safety checks"
type: deterministic
entry_point: envs.social_stream_moderation.graders:BasicSafetyGrader
- id: context_nuance_grader
description: "Grader for contextual and sarcastic content"
type: deterministic
entry_point: envs.social_stream_moderation.graders:ContextNuanceGrader
- id: fairness_bias_grader
description: "Grader for fairness and bias parity"
type: deterministic
entry_point: envs.social_stream_moderation.graders:FairnessBiasGrader
observation_space:
post_id: "string"
text: "string"
user_history_summary: "string"
context_type: "string"
platform_policy_mode: "string"
user_group: "string"
step_index: "integer"
total_steps: "integer"
action_space:
type: string
enum:
- ALLOW
- ALLOW_WITH_WARNING
- SOFT_HIDE
- ESCALATE_HUMAN
- BAN_USER
reward:
type: continuous
range: [0.0, 1.0]
endpoints:
- path: /reset
method: POST
description: Start a new episode
- path: /step
method: POST
description: Submit a moderation action
- path: /state
method: GET
description: Get current episode state
- path: /tasks
method: GET
description: List all tasks with grader info
- path: /grader
method: GET
description: Get grader score for current episode
- path: /health
method: GET
description: Health check
paths:
inference: ./inference.py
  app: ./server/app.py