---
# Environment manifest for the social-stream content-moderation environment.
# Declares runtime settings, the task list, the graders those tasks refer to,
# the observation/action/reward schema, and the HTTP endpoints.

spec_version: 1
name: social_stream_moderation
type: environment

# Runtime settings.
# NOTE(review): `app` looks like a "module:attribute" import path for the
# server application — confirm against the consuming tool.
runtime: docker
app: server.app:app
port: 7860

description: >
  A content-moderation RL environment where an agent must classify
  social-media posts as safe or harmful under varying policy regimes,
  with tasks spanning basic safety, contextual nuance, and fairness.

# Tasks, listed from easy to hard. Each `grader_id` matches the `id` of an
# entry under `graders` below.
tasks:
  - id: clear_cut_moderation
    name: "Task 1: Basic Safety"
    difficulty: easy
    description: "Moderate a stream of social posts with obvious violations and safe content."
    grader: deterministic
    grader_id: basic_safety_grader
    scoring: "0.0-1.0 reward based on action-label match"
  - id: nuanced_sarcastic
    name: "Task 2: Context & Nuance"
    difficulty: medium
    description: "Handle sarcastic content and quotes of harmful material with condemnation."
    grader: deterministic
    grader_id: context_nuance_grader
    scoring: "0.0-1.0 reward with context-aware adjustments"
  - id: policy_fairness
    name: "Task 3: Fairness & Bias"
    difficulty: hard
    description: "Ensure fairness across user groups and adhere to stricter policy regimes."
    grader: deterministic
    grader_id: fairness_bias_grader
    scoring: "0.0-1.0 reward with fairness penalty"

# Grader definitions referenced by `tasks[].grader_id`.
# NOTE(review): `entry_point` values look like "module.path:ClassName"
# references — confirm the loader's expected format.
graders:
  - id: basic_safety_grader
    description: "Grader for basic safety checks"
    type: deterministic
    entry_point: envs.social_stream_moderation.graders:BasicSafetyGrader
  - id: context_nuance_grader
    description: "Grader for contextual and sarcastic content"
    type: deterministic
    entry_point: envs.social_stream_moderation.graders:ContextNuanceGrader
  - id: fairness_bias_grader
    description: "Grader for fairness and bias parity"
    type: deterministic
    entry_point: envs.social_stream_moderation.graders:FairnessBiasGrader

# Observation fields and their declared types.
observation_space:
  post_id: "string"
  text: "string"
  user_history_summary: "string"
  context_type: "string"
  platform_policy_mode: "string"
  user_group: "string"
  step_index: "integer"
  total_steps: "integer"

# The action is a single string drawn from the values in `enum`.
action_space:
  type: string
  enum:
    - ALLOW
    - ALLOW_WITH_WARNING
    - SOFT_HIDE
    - ESCALATE_HUMAN
    - BAN_USER

# Reward is declared continuous with range 0.0 to 1.0.
reward:
  type: continuous
  range: [0.0, 1.0]

# HTTP endpoints declared for the environment.
endpoints:
  - path: /reset
    method: POST
    description: Start a new episode
  - path: /step
    method: POST
    description: Submit a moderation action
  - path: /state
    method: GET
    description: Get current episode state
  - path: /tasks
    method: GET
    description: List all tasks with grader info
  - path: /grader
    method: GET
    description: Get grader score for current episode
  - path: /health
    method: GET
    description: Health check

# File locations for the inference script and the server app.
# NOTE(review): presumably relative to the repository root — confirm.
paths:
  inference: ./inference.py
  app: ./server/app.py