# Spaces: Sleeping
# File size: 5,189 Bytes
# Source revision: 7743c15
"""
Task definitions for ContentGuardEnv.
Mirrors Meta's actual content moderation decision pipeline.
"""
# Closed set of labels a post can be classified into.
# "safe" is the label for posts with no policy violation.
VIOLATION_TYPES = [
    # identity- and person-directed abuse is listed under two separate labels
    "hate_speech",
    "misinformation",
    "spam",
    "sexual_content",
    "violence",
    "harassment",
    # no violation
    "safe",
]
# Closed set of enforcement actions a moderator can choose from.
MODERATION_ACTIONS = [
    "remove",
    "restrict_reach",
    "add_warning_label",
    "escalate_to_human",
    # explicit "do nothing" outcome
    "no_action",
]
# Registry of task definitions keyed by difficulty tier ("easy", "medium", "hard").
# Every entry carries:
#   name              - short human-readable title
#   description       - the prompt text, ending with the required JSON reply shape
#   action_space      - JSON-Schema-style dict describing the expected reply
#   observation_space - JSON-Schema-style dict describing the task input
TASKS: dict = {
    # Easy tier: pick exactly one label from VIOLATION_TYPES.
    "easy": {
        "name": "Violation Type Detection",
        "description": (
            "You are a content moderation AI. "
            "Analyze the social media post and classify it into exactly ONE violation category.\n\n"
            "Categories:\n"
            " hate_speech — content targeting people based on identity\n"
            " misinformation — demonstrably false or misleading claims\n"
            " spam — unsolicited commercial/repetitive content\n"
            " sexual_content — explicit or suggestive sexual material\n"
            " violence — graphic violence or promotion of harm\n"
            " harassment — targeted personal attacks on individuals\n"
            " safe — no policy violation\n\n"
            # Single-quoted literal so the embedded JSON needs no backslash escapes.
            'Respond ONLY with JSON: {"violation": "<category>"}'
        ),
        "action_space": {
            "type": "object",
            "properties": {
                "violation": {
                    "type": "string",
                    # Same list object as the module-level constant.
                    "enum": VIOLATION_TYPES,
                    "description": "The detected policy violation category.",
                }
            },
            "required": ["violation"],
        },
        "observation_space": {
            "type": "object",
            "description": "Social media post with content, metadata, and platform context.",
        },
    },
    # Medium tier: pick an action from MODERATION_ACTIONS, rate severity 1-5, justify.
    "medium": {
        "name": "Moderation Action Decision",
        "description": (
            "You are a senior content moderator AI. "
            "Given the post AND its confirmed violation type, decide the correct enforcement action.\n\n"
            "Actions:\n"
            " remove — take content down immediately\n"
            " restrict_reach — reduce distribution (shadowban)\n"
            " add_warning_label — keep up with a fact-check/warning overlay\n"
            " escalate_to_human — too complex/sensitive, needs human review\n"
            " no_action — borderline, no enforcement warranted\n\n"
            "Also rate severity 1–5 (1=minor, 5=critical).\n\n"
            "Respond ONLY with JSON: "
            '{"action": "<action>", "severity": <1-5>, "reasoning": "<brief reasoning>"}'
        ),
        "action_space": {
            "type": "object",
            "properties": {
                "action": {
                    "type": "string",
                    # Same list object as the module-level constant.
                    "enum": MODERATION_ACTIONS,
                },
                "severity": {
                    "type": "integer",
                    "minimum": 1,
                    "maximum": 5,
                },
                "reasoning": {
                    "type": "string",
                    "description": "One-sentence justification for the decision.",
                },
            },
            "required": ["action", "severity", "reasoning"],
        },
        "observation_space": {
            "type": "object",
            "description": "Post + violation type + user account history + platform context.",
        },
    },
    # Hard tier: write a structured appeal response with four required fields.
    "hard": {
        "name": "User Appeal Moderation Report",
        "description": (
            "You are a Trust & Safety AI writing the official response "
            "to a user who has appealed their content moderation decision.\n\n"
            "The report must include:\n"
            " ruling — 'upheld' (action stands) or 'overturned' (action reversed)\n"
            " policy_references— list of specific Community Standard policies violated\n"
            " explanation — clear explanation of what violated policy and why\n"
            " user_guidance — what the user should do to avoid future violations\n\n"
            "Respond ONLY with JSON:\n"
            "{\n"
            ' "ruling": "upheld" | "overturned",\n'
            ' "policy_references": ["policy_name_1", ...],\n'
            ' "explanation": "<2-3 sentence explanation>",\n'
            ' "user_guidance": "<actionable guidance>"\n'
            "}"
        ),
        "action_space": {
            "type": "object",
            "properties": {
                "ruling": {
                    "type": "string",
                    "enum": ["upheld", "overturned"],
                },
                "policy_references": {
                    "type": "array",
                    "items": {"type": "string"},
                },
                "explanation": {"type": "string"},
                "user_guidance": {"type": "string"},
            },
            "required": ["ruling", "policy_references", "explanation", "user_guidance"],
        },
        "observation_space": {
            "type": "object",
            "description": "Full case: post + violation + action taken + user appeal message.",
        },
    },
}