Spaces:

mj064
/

ContentGuardEnv

Sleeping

Mridul Jain

Elite v3.5 Submission

7743c15 about 1 month ago

5.19 kB

	"""
	Task definitions for ContentGuardEnv.
	Mirrors Meta's actual content moderation decision pipeline.
	"""

	VIOLATION_TYPES = [
	"hate_speech",
	"misinformation",
	"spam",
	"sexual_content",
	"violence",
	"harassment",
	"safe",
	]

	MODERATION_ACTIONS = [
	"remove",
	"restrict_reach",
	"add_warning_label",
	"escalate_to_human",
	"no_action",
	]

	TASKS: dict = {
	"easy": {
	"name": "Violation Type Detection",
	"description": (
	"You are a content moderation AI. Analyze the social media post "
	"and classify it into exactly ONE violation category.\n\n"
	"Categories:\n"
	" hate_speech — content targeting people based on identity\n"
	" misinformation — demonstrably false or misleading claims\n"
	" spam — unsolicited commercial/repetitive content\n"
	" sexual_content — explicit or suggestive sexual material\n"
	" violence — graphic violence or promotion of harm\n"
	" harassment — targeted personal attacks on individuals\n"
	" safe — no policy violation\n\n"
	"Respond ONLY with JSON: {\"violation\": \"<category>\"}"
	),
	"action_space": {
	"type": "object",
	"properties": {
	"violation": {
	"type": "string",
	"enum": VIOLATION_TYPES,
	"description": "The detected policy violation category.",
	}
	},
	"required": ["violation"],
	},
	"observation_space": {
	"type": "object",
	"description": "Social media post with content, metadata, and platform context.",
	},
	},

	"medium": {
	"name": "Moderation Action Decision",
	"description": (
	"You are a senior content moderator AI. Given the post AND its "
	"confirmed violation type, decide the correct enforcement action.\n\n"
	"Actions:\n"
	" remove — take content down immediately\n"
	" restrict_reach — reduce distribution (shadowban)\n"
	" add_warning_label — keep up with a fact-check/warning overlay\n"
	" escalate_to_human — too complex/sensitive, needs human review\n"
	" no_action — borderline, no enforcement warranted\n\n"
	"Also rate severity 1–5 (1=minor, 5=critical).\n\n"
	"Respond ONLY with JSON: "
	"{\"action\": \"<action>\", \"severity\": <1-5>, \"reasoning\": \"<brief reasoning>\"}"
	),
	"action_space": {
	"type": "object",
	"properties": {
	"action": {
	"type": "string",
	"enum": MODERATION_ACTIONS,
	},
	"severity": {
	"type": "integer",
	"minimum": 1,
	"maximum": 5,
	},
	"reasoning": {
	"type": "string",
	"description": "One-sentence justification for the decision.",
	},
	},
	"required": ["action", "severity", "reasoning"],
	},
	"observation_space": {
	"type": "object",
	"description": "Post + violation type + user account history + platform context.",
	},
	},

	"hard": {
	"name": "User Appeal Moderation Report",
	"description": (
	"You are a Trust & Safety AI writing the official response to a user "
	"who has appealed their content moderation decision.\n\n"
	"The report must include:\n"
	" ruling — 'upheld' (action stands) or 'overturned' (action reversed)\n"
	" policy_references— list of specific Community Standard policies violated\n"
	" explanation — clear explanation of what violated policy and why\n"
	" user_guidance — what the user should do to avoid future violations\n\n"
	"Respond ONLY with JSON:\n"
	"{\n"
	" \"ruling\": \"upheld\" \| \"overturned\",\n"
	" \"policy_references\": [\"policy_name_1\", ...],\n"
	" \"explanation\": \"<2-3 sentence explanation>\",\n"
	" \"user_guidance\": \"<actionable guidance>\"\n"
	"}"
	),
	"action_space": {
	"type": "object",
	"properties": {
	"ruling": {"type": "string", "enum": ["upheld", "overturned"]},
	"policy_references": {
	"type": "array",
	"items": {"type": "string"},
	},
	"explanation": {"type": "string"},
	"user_guidance": {"type": "string"},
	},
	"required": ["ruling", "policy_references", "explanation", "user_guidance"],
	},
	"observation_space": {
	"type": "object",
	"description": "Full case: post + violation + action taken + user appeal message.",
	},
	},
	}