""" Task definitions for ContentGuardEnv. Mirrors Meta's actual content moderation decision pipeline. """ VIOLATION_TYPES = [ "hate_speech", "misinformation", "spam", "sexual_content", "violence", "harassment", "safe", ] MODERATION_ACTIONS = [ "remove", "restrict_reach", "add_warning_label", "escalate_to_human", "no_action", ] TASKS: dict = { "easy": { "name": "Violation Type Detection", "description": ( "You are a content moderation AI. Analyze the social media post " "and classify it into exactly ONE violation category.\n\n" "Categories:\n" " hate_speech — content targeting people based on identity\n" " misinformation — demonstrably false or misleading claims\n" " spam — unsolicited commercial/repetitive content\n" " sexual_content — explicit or suggestive sexual material\n" " violence — graphic violence or promotion of harm\n" " harassment — targeted personal attacks on individuals\n" " safe — no policy violation\n\n" "Respond ONLY with JSON: {\"violation\": \"\"}" ), "action_space": { "type": "object", "properties": { "violation": { "type": "string", "enum": VIOLATION_TYPES, "description": "The detected policy violation category.", } }, "required": ["violation"], }, "observation_space": { "type": "object", "description": "Social media post with content, metadata, and platform context.", }, }, "medium": { "name": "Moderation Action Decision", "description": ( "You are a senior content moderator AI. Given the post AND its " "confirmed violation type, decide the correct enforcement action.\n\n" "Actions:\n" " remove — take content down immediately\n" " restrict_reach — reduce distribution (shadowban)\n" " add_warning_label — keep up with a fact-check/warning overlay\n" " escalate_to_human — too complex/sensitive, needs human review\n" " no_action — borderline, no enforcement warranted\n\n" "Also rate severity 1–5 (1=minor, 5=critical).\n\n" "Respond ONLY with JSON: " "{\"action\": \"\", \"severity\": <1-5>, \"reasoning\": \"\"}" ), "action_space": { "type": "object", "properties": { "action": { "type": "string", "enum": MODERATION_ACTIONS, }, "severity": { "type": "integer", "minimum": 1, "maximum": 5, }, "reasoning": { "type": "string", "description": "One-sentence justification for the decision.", }, }, "required": ["action", "severity", "reasoning"], }, "observation_space": { "type": "object", "description": "Post + violation type + user account history + platform context.", }, }, "hard": { "name": "User Appeal Moderation Report", "description": ( "You are a Trust & Safety AI writing the official response to a user " "who has appealed their content moderation decision.\n\n" "The report must include:\n" " ruling — 'upheld' (action stands) or 'overturned' (action reversed)\n" " policy_references— list of specific Community Standard policies violated\n" " explanation — clear explanation of what violated policy and why\n" " user_guidance — what the user should do to avoid future violations\n\n" "Respond ONLY with JSON:\n" "{\n" " \"ruling\": \"upheld\" | \"overturned\",\n" " \"policy_references\": [\"policy_name_1\", ...],\n" " \"explanation\": \"<2-3 sentence explanation>\",\n" " \"user_guidance\": \"\"\n" "}" ), "action_space": { "type": "object", "properties": { "ruling": {"type": "string", "enum": ["upheld", "overturned"]}, "policy_references": { "type": "array", "items": {"type": "string"}, }, "explanation": {"type": "string"}, "user_guidance": {"type": "string"}, }, "required": ["ruling", "policy_references", "explanation", "user_guidance"], }, "observation_space": { "type": "object", "description": "Full case: post + violation + action taken + user appeal message.", }, }, }