File size: 5,189 Bytes
7743c15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
"""
Task definitions for ContentGuardEnv.
Mirrors Meta's actual content moderation decision pipeline.
"""

# Labels a post can be classified under. Order matters to anything that
# indexes or displays this list, so keep it stable.
VIOLATION_TYPES = [
    # Categories that represent a policy violation.
    "hate_speech", "misinformation", "spam",
    "sexual_content", "violence", "harassment",
    # Label for content with no policy violation.
    "safe",
]

# Enforcement decisions available for a post. Order matters to anything that
# indexes or displays this list, so keep it stable.
MODERATION_ACTIONS = [
    # Actions that change how the content is served.
    "remove", "restrict_reach", "add_warning_label",
    # Hand the case over to a human reviewer.
    "escalate_to_human",
    # Leave the content untouched.
    "no_action",
]

def _task_spec(title, prompt, properties, required, observation):
    """Build one task entry in the layout shared by all difficulty tiers.

    Args:
        title: Human-readable task name, stored under ``"name"``.
        prompt: Instruction text, stored under ``"description"``.
        properties: Mapping of response field name to its schema fragment.
        required: Names of the response fields that must be present.
        observation: Text describing what the observation contains.

    Returns:
        A dict with the keys ``name``, ``description``, ``action_space`` and
        ``observation_space`` (in that order).
    """
    return {
        "name": title,
        "description": prompt,
        "action_space": {
            "type": "object",
            "properties": properties,
            "required": required,
        },
        "observation_space": {
            "type": "object",
            "description": observation,
        },
    }


# Instruction text for the "easy" tier: single-label classification.
_EASY_PROMPT = (
    "You are a content moderation AI. Analyze the social media post "
    "and classify it into exactly ONE violation category.\n\n"
    "Categories:\n"
    "  hate_speech     — content targeting people based on identity\n"
    "  misinformation  — demonstrably false or misleading claims\n"
    "  spam            — unsolicited commercial/repetitive content\n"
    "  sexual_content  — explicit or suggestive sexual material\n"
    "  violence        — graphic violence or promotion of harm\n"
    "  harassment      — targeted personal attacks on individuals\n"
    "  safe            — no policy violation\n\n"
    'Respond ONLY with JSON: {"violation": "<category>"}'
)

# Instruction text for the "medium" tier: enforcement action plus severity.
_MEDIUM_PROMPT = (
    "You are a senior content moderator AI. Given the post AND its "
    "confirmed violation type, decide the correct enforcement action.\n\n"
    "Actions:\n"
    "  remove              — take content down immediately\n"
    "  restrict_reach      — reduce distribution (shadowban)\n"
    "  add_warning_label   — keep up with a fact-check/warning overlay\n"
    "  escalate_to_human   — too complex/sensitive, needs human review\n"
    "  no_action           — borderline, no enforcement warranted\n\n"
    "Also rate severity 1–5 (1=minor, 5=critical).\n\n"
    "Respond ONLY with JSON: "
    '{"action": "<action>", "severity": <1-5>, "reasoning": "<brief reasoning>"}'
)

# Instruction text for the "hard" tier: written response to a user appeal.
_HARD_PROMPT = (
    "You are a Trust & Safety AI writing the official response to a user "
    "who has appealed their content moderation decision.\n\n"
    "The report must include:\n"
    "  ruling           — 'upheld' (action stands) or 'overturned' (action reversed)\n"
    "  policy_references— list of specific Community Standard policies violated\n"
    "  explanation      — clear explanation of what violated policy and why\n"
    "  user_guidance    — what the user should do to avoid future violations\n\n"
    "Respond ONLY with JSON:\n"
    "{\n"
    '  "ruling": "upheld" | "overturned",\n'
    '  "policy_references": ["policy_name_1", ...],\n'
    '  "explanation": "<2-3 sentence explanation>",\n'
    '  "user_guidance": "<actionable guidance>"\n'
    "}"
)

# Task registry keyed by difficulty tier. Each entry carries the prompt, the
# schema of the expected JSON response, and a description of the observation.
TASKS: dict = {
    "easy": _task_spec(
        "Violation Type Detection",
        _EASY_PROMPT,
        {
            "violation": {
                "type": "string",
                # Same list object as the module constant, not a copy.
                "enum": VIOLATION_TYPES,
                "description": "The detected policy violation category.",
            },
        },
        ["violation"],
        "Social media post with content, metadata, and platform context.",
    ),
    "medium": _task_spec(
        "Moderation Action Decision",
        _MEDIUM_PROMPT,
        {
            "action": {
                "type": "string",
                # Same list object as the module constant, not a copy.
                "enum": MODERATION_ACTIONS,
            },
            "severity": {
                "type": "integer",
                "minimum": 1,
                "maximum": 5,
            },
            "reasoning": {
                "type": "string",
                "description": "One-sentence justification for the decision.",
            },
        },
        ["action", "severity", "reasoning"],
        "Post + violation type + user account history + platform context.",
    ),
    "hard": _task_spec(
        "User Appeal Moderation Report",
        _HARD_PROMPT,
        {
            "ruling": {"type": "string", "enum": ["upheld", "overturned"]},
            "policy_references": {
                "type": "array",
                "items": {"type": "string"},
            },
            "explanation": {"type": "string"},
            "user_guidance": {"type": "string"},
        },
        ["ruling", "policy_references", "explanation", "user_guidance"],
        "Full case: post + violation + action taken + user appeal message.",
    ),
}