Spaces:

mj064
/

ContentGuardEnv

Sleeping

File size: 5,189 Bytes

7743c15

"""
Task definitions for ContentGuardEnv.
Mirrors Meta's actual content moderation decision pipeline.
"""

VIOLATION_TYPES = [
    "hate_speech",
    "misinformation",
    "spam",
    "sexual_content",
    "violence",
    "harassment",
    "safe",
]

MODERATION_ACTIONS = [
    "remove",
    "restrict_reach",
    "add_warning_label",
    "escalate_to_human",
    "no_action",
]

TASKS: dict = {
    "easy": {
        "name": "Violation Type Detection",
        "description": (
            "You are a content moderation AI. Analyze the social media post "
            "and classify it into exactly ONE violation category.\n\n"
            "Categories:\n"
            "  hate_speech     — content targeting people based on identity\n"
            "  misinformation  — demonstrably false or misleading claims\n"
            "  spam            — unsolicited commercial/repetitive content\n"
            "  sexual_content  — explicit or suggestive sexual material\n"
            "  violence        — graphic violence or promotion of harm\n"
            "  harassment      — targeted personal attacks on individuals\n"
            "  safe            — no policy violation\n\n"
            "Respond ONLY with JSON: {\"violation\": \"<category>\"}"
        ),
        "action_space": {
            "type": "object",
            "properties": {
                "violation": {
                    "type": "string",
                    "enum": VIOLATION_TYPES,
                    "description": "The detected policy violation category.",
                }
            },
            "required": ["violation"],
        },
        "observation_space": {
            "type": "object",
            "description": "Social media post with content, metadata, and platform context.",
        },
    },

    "medium": {
        "name": "Moderation Action Decision",
        "description": (
            "You are a senior content moderator AI. Given the post AND its "
            "confirmed violation type, decide the correct enforcement action.\n\n"
            "Actions:\n"
            "  remove              — take content down immediately\n"
            "  restrict_reach      — reduce distribution (shadowban)\n"
            "  add_warning_label   — keep up with a fact-check/warning overlay\n"
            "  escalate_to_human   — too complex/sensitive, needs human review\n"
            "  no_action           — borderline, no enforcement warranted\n\n"
            "Also rate severity 1–5 (1=minor, 5=critical).\n\n"
            "Respond ONLY with JSON: "
            "{\"action\": \"<action>\", \"severity\": <1-5>, \"reasoning\": \"<brief reasoning>\"}"
        ),
        "action_space": {
            "type": "object",
            "properties": {
                "action": {
                    "type": "string",
                    "enum": MODERATION_ACTIONS,
                },
                "severity": {
                    "type": "integer",
                    "minimum": 1,
                    "maximum": 5,
                },
                "reasoning": {
                    "type": "string",
                    "description": "One-sentence justification for the decision.",
                },
            },
            "required": ["action", "severity", "reasoning"],
        },
        "observation_space": {
            "type": "object",
            "description": "Post + violation type + user account history + platform context.",
        },
    },

    "hard": {
        "name": "User Appeal Moderation Report",
        "description": (
            "You are a Trust & Safety AI writing the official response to a user "
            "who has appealed their content moderation decision.\n\n"
            "The report must include:\n"
            "  ruling           — 'upheld' (action stands) or 'overturned' (action reversed)\n"
            "  policy_references— list of specific Community Standard policies violated\n"
            "  explanation      — clear explanation of what violated policy and why\n"
            "  user_guidance    — what the user should do to avoid future violations\n\n"
            "Respond ONLY with JSON:\n"
            "{\n"
            "  \"ruling\": \"upheld\" | \"overturned\",\n"
            "  \"policy_references\": [\"policy_name_1\", ...],\n"
            "  \"explanation\": \"<2-3 sentence explanation>\",\n"
            "  \"user_guidance\": \"<actionable guidance>\"\n"
            "}"
        ),
        "action_space": {
            "type": "object",
            "properties": {
                "ruling": {"type": "string", "enum": ["upheld", "overturned"]},
                "policy_references": {
                    "type": "array",
                    "items": {"type": "string"},
                },
                "explanation": {"type": "string"},
                "user_guidance": {"type": "string"},
            },
            "required": ["ruling", "policy_references", "explanation", "user_guidance"],
        },
        "observation_space": {
            "type": "object",
            "description": "Full case: post + violation + action taken + user appeal message.",
        },
    },
}