Spaces:

jdsb06
/

content-moderation-env

Sleeping

Soham Banerjee

Fix campaign IDs: load dynamically from env instead of hardcoded values

e8094c5 about 2 months ago

19.9 kB

	"""
	app.py — Gradio UI for ContentModerationEnv (Hugging Face Spaces)
	=================================================================
	Live interactive demo + API endpoint for the OpenEnv benchmark.

	Tabs
	----
	1. Try It — step through individual scenarios
	2. Baseline — run the lexical agent over all 128 scenarios
	3. API Docs — Python / shell examples
	4. Campaign Detection — deterministic campaign episodes (reset(campaign_id=...))
	"""

	import json
	import sys
	from pathlib import Path

	import gradio as gr

	# Directory two levels above this file; it is prepended to sys.path before the
	# project-local imports below (presumably so they resolve from the repo root).
	SCRIPT_DIR = Path(__file__).parent.parent
	sys.path.insert(0, str(SCRIPT_DIR))

	from content_moderation_env import ContentModerationEnv, CampaignModerationEnv
	from baseline_inference import run_baseline

	# ── env singleton ──────────────────────────────────────────────────────────────
	# Both benchmark files are looked up directly under SCRIPT_DIR.
	SCENARIOS_PATH = SCRIPT_DIR / "moderation_benchmark.json"
	CAMPAIGNS_PATH = SCRIPT_DIR / "campaign_benchmark.json"
	# One shared instance of each environment serves the UI callbacks and the
	# HTTP routes in this module.
	env = ContentModerationEnv(str(SCENARIOS_PATH), seed=42)
	campaign_env = CampaignModerationEnv(str(CAMPAIGNS_PATH), seed=42)
	# ID lists used to populate the dropdowns.
	ALL_IDS = env.scenario_ids
	# NOTE(review): reads a private attribute of CampaignModerationEnv; a public
	# accessor (like `scenario_ids` above) would be safer if one exists.
	CAMPAIGN_IDS = campaign_env._campaign_ids

	# ── helpers ───────────────────────────────────────────────────────────────────

	def _fmt_state(s: dict) -> str:
	lines = [f"Text: {s['text']}"]
	if s.get("audio_transcript"):
	lines.append(f"Audio: {s['audio_transcript']}")
	if s.get("visual_tags"):
	lines.append(f"Visual tags: {', '.join(s['visual_tags'])}")
	lines.append(f"Previous flags: {s['previous_flags']} \| Policy: {s['platform_policy']}")
	return "\n\n".join(lines)


	def _reward_bar(reward: float) -> str:
	filled = int(reward * 20)
	bar = "█" * filled + "░" * (20 - filled)
	emoji = "✅" if reward >= 0.8 else ("🟡" if reward >= 0.4 else "❌")
	return f"{emoji} [{bar}] {reward:.2f}"


	# ── Tab 1: Try It ─────────────────────────────────────────────────────────────

	def load_scenario(scenario_id: str):
	    """Reset the shared env to ``scenario_id`` and describe it for the UI.

	    Returns a 3-tuple: the formatted observation, a Markdown tier label, and a
	    visibility update for the severity slider (shown only for the hard tier).
	    If the reset fails, the first element carries the error message, the
	    second is empty, and the slider is hidden.
	    """
	    try:
	        observation = env.reset(scenario_id)
	    except Exception as e:
	        return f"Error: {e}", "", gr.update(visible=False)

	    difficulty = env._current_scenario["tier"]
	    return (
	        _fmt_state(observation),
	        f"Tier: `{difficulty}`",
	        gr.update(visible=(difficulty == "hard")),
	    )


	def submit_action(scenario_id: str, label: str, action: str, severity: int, rationale: str):
	    """Score one moderation decision against the selected scenario.

	    The shared env is reset to ``scenario_id`` first, then the decision is
	    submitted through ``env.step()``.

	    Args:
	        scenario_id: Scenario to score against.
	        label: Chosen content label.
	        action: Chosen moderation action.
	        severity: Chosen severity value.
	        rationale: Free-text explanation.

	    Returns:
	        A 2-tuple of Markdown strings: a result summary (reward bar, score
	        table, ground truth) and the raw step result as a fenced JSON block.
	        On failure the first element is an error message and the second is
	        empty.
	    """
	    try:
	        env.reset(scenario_id)
	    except Exception as e:
	        return f"Error resetting: {e}", ""

	    act_dict = {"label": label, "action": action, "severity": severity, "rationale": rationale}
	    try:
	        result = env.step(act_dict)
	    except Exception as e:
	        return f"Error in step(): {e}", ""

	    info = result["info"]
	    gt = info["ground_truth"]
	    bd = info["score_breakdown"]
	    reward = result["reward"]

	    # The Markdown is assembled from explicit lines so that source indentation
	    # cannot leak into it, and the table uses plain pipes (escaped pipes are
	    # not column separators, so the table would not render).
	    out_md = "\n".join([
	        "",
	        "### Result",
	        "",
	        _reward_bar(reward),
	        "",
	        "| Component | Score |",
	        "|-----------|-------|",
	        f"| Label correct | `{bd.get('label_correct', 'n/a')}` |",
	        f"| Action correct | `{bd.get('action_correct', 'n/a')}` |",
	        f"| Severity ±1 | `{bd.get('severity_within_1', 'n/a')}` |",
	        "",
	        f"Ground truth: label=`{gt['label']}` action=`{gt['action']}` severity=`{gt.get('severity', 'n/a')}`",
	        "",
	        f"> {gt.get('rationale', '')}",
	        "",
	    ])
	    # default=str keeps the dump from failing on values json cannot encode.
	    raw = json.dumps(result, indent=2, default=str)
	    return out_md, f"```json\n{raw}\n```"


	# ── Tab 2: Baseline ───────────────────────────────────────────────────────────

	def _summary_row(name, rewards):
	    """Build one table row: name, count, mean (3 dp), #perfect, #zero."""
	    mean = sum(rewards) / len(rewards) if rewards else 0.0
	    return [
	        name,
	        len(rewards),
	        f"{mean:.3f}",
	        sum(1 for r in rewards if r == 1.0),
	        sum(1 for r in rewards if r == 0.0),
	    ]


	def run_baseline_tab(tier_filter: str):
	    """Run the baseline agent and summarise its rewards per tier.

	    Args:
	        tier_filter: ``"all"`` to evaluate every tier, otherwise the tier name
	            passed through to ``run_baseline``.

	    Returns:
	        A 2-tuple ``(rows, status)``. ``rows`` has one entry per tier that
	        produced results (in easy/medium/hard order) followed by an
	        ``OVERALL`` row; each row is
	        ``[tier, n, mean reward, perfect count, zero count]``. ``status`` is a
	        one-line completion message with the overall mean.
	    """
	    tf = None if tier_filter == "all" else tier_filter
	    results = run_baseline(tier_filter=tf, seed=42, verbose=False)

	    rows = []
	    for tier in ("easy", "medium", "hard"):
	        tier_rewards = [r["reward"] for r in results if r["tier"] == tier]
	        # Tiers with no results are left out of the table entirely.
	        if tier_rewards:
	            rows.append(_summary_row(tier, tier_rewards))

	    all_rw = [r["reward"] for r in results]
	    # Guard the division so an empty run reports 0.000 instead of raising.
	    overall = sum(all_rw) / len(all_rw) if all_rw else 0.0
	    rows.append(_summary_row("OVERALL", all_rw))

	    return rows, f"Baseline complete. Overall mean reward: {overall:.3f}"


	# ── Tab 4: Campaign Detection ────────────────────────────────────────────────

	def load_campaign(campaign_id=None):
	    """Load a campaign scenario for the Campaign Detection tab.

	    Args:
	        campaign_id: Campaign to load; passed through to
	            ``campaign_env.reset(campaign_id=...)``.

	    Returns:
	        A 2-tuple of Markdown strings: a one-line campaign header and the
	        rendered posts, each followed by a horizontal rule. If the reset
	        fails, the tuple is an error message and a failure notice.
	    """
	    try:
	        state = campaign_env.reset(campaign_id=campaign_id)
	    except Exception as e:
	        return f"Error: {e}", "Failed to load campaign."

	    # Collect fragments and join once instead of growing a string with "+=".
	    # Separators are plain pipes; backslash-pipe is an invalid Python escape.
	    parts = []
	    for i, p in enumerate(state.get("posts", []), 1):
	        parts.append(
	            f"Post {i} — account: `{p.get('account_id', 'N/A')}`"
	            f"  |  +{p.get('posted_at_offset_minutes', 0)} min"
	            f"  |  platform: `{p.get('platform', 'unknown')}`\n\n"
	        )
	        parts.append(f"> {p.get('text', '')}\n\n")
	        if p.get("visual_tags"):
	            parts.append(f"Visual signals: {', '.join(p['visual_tags'])}\n\n")
	        parts.append("---\n\n")

	    header = (
	        f"Campaign: `{state.get('campaign_id', 'N/A')}`  |"
	        f"  {state.get('num_posts', 0)} posts\n"
	    )
	    return header, "".join(parts)


	def submit_campaign(campaign_id, is_coord_str, action, reasoning):
	    """Submit a campaign detection decision and render the outcome.

	    The shared campaign env is reset to ``campaign_id`` first, then the
	    decision is submitted through ``campaign_env.step()``.

	    Args:
	        campaign_id: Campaign to score against.
	        is_coord_str: ``"true"`` or ``"false"`` from the dropdown; only the
	            exact string ``"true"`` counts as coordinated.
	        action: Chosen moderation action.
	        reasoning: Free-text explanation.

	    Returns:
	        A Markdown string with a reward bar, the ground truth and the score
	        breakdown, or an error message if reset or step fails.
	    """
	    try:
	        campaign_env.reset(campaign_id=campaign_id)
	    except Exception as e:
	        return f"Error resetting campaign: {e}"

	    action_dict = {
	        "is_coordinated": is_coord_str == "true",
	        "action": action,
	        "reasoning": reasoning,
	    }
	    # Report a failing step() as a message, the same way submit_action does,
	    # instead of letting the exception escape the callback.
	    try:
	        result = campaign_env.step(action_dict)
	    except Exception as e:
	        return f"Error in step(): {e}"

	    r = result.get("reward", 0.0)
	    info = result.get("info", {})
	    gt = info.get("ground_truth", {})
	    bd = info.get("score_breakdown", {})

	    # Rewards can be negative (false-positive penalty); clamp only the bar fill
	    # so the bar is always exactly 20 cells wide.
	    filled = int(min(max(r, 0.0), 1.0) * 20)
	    bar = "█" * filled + "░" * (20 - filled)
	    emoji = "✅" if r >= 0.8 else ("🟡" if r >= 0.4 else "❌")

	    # .get() with the original fallbacks, so a partial ground-truth dict no
	    # longer raises KeyError.
	    pieces = [
	        f"{emoji} [{bar}] {r:.2f}\n\n",
	        f"Ground truth: coordinated=`{gt.get('is_coordinated', False)}`",
	        f" action=`{gt.get('correct_action', 'None')}`\n\n",
	        "Score breakdown:\n\n",
	    ]
	    pieces.extend(f" - `{k}`: `{v}`\n" for k, v in bd.items())
	    return "".join(pieces)


	# ── Tab 3: API examples ───────────────────────────────────────────────────────

	# Shell example text displayed in the "Shell usage" textbox of the API Docs
	# tab. It drives the environment through `python -c` one-liners.
	# NOTE(review): the UI heading calls this a "curl equivalent", but the snippet
	# contains no curl call, and the captured STATE variable is never used.
	API_CURL = """\
	# 1. Reset (load a random scenario)
	STATE=$(python -c "
	import json, sys
	sys.path.insert(0, '.')
	from content_moderation_env import ContentModerationEnv
	env = ContentModerationEnv('moderation_benchmark.json', seed=42)
	state = env.reset()
	print(json.dumps(state, indent=2))
	")

	# 2. Step (submit your action)
	python -c "
	import json, sys
	sys.path.insert(0, '.')
	from content_moderation_env import ContentModerationEnv
	env = ContentModerationEnv('moderation_benchmark.json', seed=42)
	env.reset('scen_hard_1')
	result = env.step({
	'label': 'toxic',
	'action': 'escalate',
	'severity': 5,
	'rationale': 'Coordinated physical threat.'
	})
	print(json.dumps(result, indent=2))
	"
	"""

	# Python usage example displayed in the code viewer of the API Docs tab:
	# instantiate the env, reset to a scenario, submit one action, print the
	# reward and score breakdown.
	API_PYTHON = """\
	from content_moderation_env import ContentModerationEnv

	# Instantiate
	env = ContentModerationEnv("moderation_benchmark.json", seed=42)
	print(f"Loaded {env.num_scenarios} scenarios")

	# Episode
	state = env.reset() # random
	# state = env.reset("scen_hard_1") # specific
	print(state["text"])

	result = env.step({
	"label": "toxic",
	"action": "escalate",
	"severity": 4,
	"rationale": "Threat indicators detected."
	})
	print(f"Reward: {result['reward']}")
	print(f"Breakdown: {result['info']['score_breakdown']}")
	"""

	# ── Build UI ──────────────────────────────────────────────────────────────────

	# Gradio theme passed to gr.Blocks: the Soft theme with emerald/zinc hues and
	# the Inter font (with generic sans-serif fallbacks), then a handful of
	# overrides for primary-button fills and block borders/background.
	THEME = gr.themes.Soft(
	primary_hue="emerald",
	neutral_hue="zinc",
	font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
	).set(
	button_primary_background_fill="*primary_500",
	button_primary_background_fill_hover="*primary_600",
	block_radius="12px",
	block_border_width="1px",
	block_border_color="*neutral_200",
	block_border_color_dark="*neutral_700",
	block_background_fill="*background_fill_secondary",
	)

	# Custom stylesheet passed to gr.Blocks. The class names styled here
	# (header, action-btn, observation-card) are the ones attached to components
	# through `elem_classes` in the UI definition; `.dark` variants restyle the
	# header and observation card for dark mode.
	CSS = """
	.gradio-container {
	max-width: 1100px !important;
	margin: 0 auto;
	}
	.header {
	text-align: center;
	padding: 3rem 0 2rem;
	margin-bottom: 2rem;
	background: linear-gradient(135deg, rgba(16,185,129,0.1) 0%, rgba(59,130,246,0.1) 100%);
	border-radius: 12px;
	border: 1px solid rgba(0,0,0,0.05);
	}
	.dark .header {
	background: linear-gradient(135deg, rgba(16,185,129,0.05) 0%, rgba(59,130,246,0.05) 100%);
	border-color: rgba(255,255,255,0.05);
	}
	.action-btn {
	font-weight: 600 !important;
	font-size: 1.1rem !important;
	padding: 0.75rem !important;
	box-shadow: 0 4px 6px -1px rgba(0,0,0,0.1) !important;
	transition: transform 0.1s ease !important;
	}
	.action-btn:hover {
	transform: translateY(-1px) !important;
	}
	.observation-card {
	background: white;
	padding: 1.5rem;
	border-radius: 12px;
	border: 1px solid #e5e7eb;
	box-shadow: 0 1px 3px 0 rgba(0,0,0,0.05);
	}
	.dark .observation-card {
	background: #1f2937;
	border-color: #374151;
	}
	"""

	# Gradio UI definition: a header, four tabs (Try It, Baseline, API Docs,
	# Campaign Detection) and a footer.
	# NOTE(review): the nesting of Rows/Columns below is reconstructed from
	# statement order — confirm the layout against the rendered app.
	# Markdown tables use plain pipes; escaped pipes are not column separators
	# and would stop the tables from rendering.
	with gr.Blocks(theme=THEME, css=CSS, title="ContentModerationEnv — OpenEnv Benchmark") as demo:

	    with gr.Column(elem_classes=["header"]):
	        gr.Markdown("""
	# 🛡️ ContentModerationEnv
	### An OpenEnv benchmark for evaluating AI content moderation agents

	> 128 scenarios across 3 difficulty tiers (easy / medium / hard) ·
	> Partial-credit scoring (0.0 – 1.0) · Full OpenEnv API
	> `reset()` · `step()` · `state()` · `appeal()` · campaign mode · adversarial scenarios
	""")

	    with gr.Tabs():

	        # ── Tab 1: Try It ─────────────────────────────────────────────────────
	        with gr.Tab("🎮 Try It"):
	            with gr.Row():
	                with gr.Column(scale=1):
	                    sid_dd = gr.Dropdown(
	                        choices=ALL_IDS,
	                        # Guard against an empty benchmark instead of raising
	                        # IndexError while the UI is being built.
	                        value=ALL_IDS[0] if ALL_IDS else None,
	                        label="Select Scenario ID to Load",
	                        interactive=True,
	                    )
	                    tier_md = gr.Markdown()

	                with gr.Column(scale=2):
	                    state_md = gr.Markdown("Select a scenario from the dropdown to begin →", label="Observation", elem_classes=["observation-card"])

	            gr.Markdown("### Your moderation decision")
	            with gr.Row():
	                label_dd = gr.Dropdown(
	                    choices=["safe", "toxic", "spam", "misleading"],
	                    value="safe", label="Label"
	                )
	                action_dd = gr.Dropdown(
	                    choices=["allow", "warn", "remove", "shadowban", "escalate"],
	                    value="allow", label="Action"
	                )
	                # Hidden by default; load_scenario reveals it for hard-tier scenarios.
	                sev_slider = gr.Slider(1, 5, value=3, step=1,
	                                       label="Severity (hard tier)", visible=False)

	            rationale_tb = gr.Textbox(label="Rationale (optional)", lines=2,
	                                      placeholder="Brief explanation …")
	            step_btn = gr.Button("Submit → env.step()", variant="primary", elem_classes=["action-btn"])
	            result_md = gr.Markdown()
	            result_raw = gr.Markdown()

	            sid_dd.change(
	                load_scenario,
	                inputs=[sid_dd],
	                outputs=[state_md, tier_md, sev_slider],
	            )
	            step_btn.click(
	                submit_action,
	                inputs=[sid_dd, label_dd, action_dd, sev_slider, rationale_tb],
	                outputs=[result_md, result_raw],
	            )

	        # ── Tab 2: Baseline ───────────────────────────────────────────────────
	        with gr.Tab("📊 Baseline"):
	            gr.Markdown("""
	### Lexical Rule-Based Baseline

	A deterministic, no-LLM agent that uses regex patterns to classify content
	and policy-based rules to choose an action. Run it to verify the environment
	and as a comparison floor for LLM agents.
	""")
	            tier_radio = gr.Radio(
	                choices=["all", "easy", "medium", "hard"],
	                value="all", label="Tier to evaluate"
	            )
	            run_btn = gr.Button("Run Baseline", variant="primary")
	            status_md = gr.Markdown()
	            result_tbl = gr.Dataframe(
	                headers=["Tier", "N", "Mean Reward", "Perfect (1.0)", "Zero (0.0)"],
	                interactive=False,
	            )
	            run_btn.click(
	                run_baseline_tab,
	                inputs=[tier_radio],
	                outputs=[result_tbl, status_md],
	            )

	        # ── Tab 3: API Docs ───────────────────────────────────────────────────
	        with gr.Tab("📖 API Docs"):
	            gr.Markdown("""
	## Quick Start

	```bash
	git clone https://huggingface.co/spaces/sohambanerjee/content-moderation-env
	cd content-moderation-env
	pip install -r requirements.txt
	```

	### Python API
	""")
	            gr.Code(API_PYTHON, language="python", label="Python usage")
	            gr.Markdown("### Shell / curl equivalent")
	            gr.Textbox(API_CURL, label="Shell usage", lines=20, interactive=False)

	            gr.Markdown("""
	## Action Space

	| Field | Type | Required | Values |
	|-------|------|----------|--------|
	| `label` | str | ✅ | `safe` · `toxic` · `spam` · `misleading` |
	| `action` | str | ✅ | `allow` · `warn` · `remove` · `shadowban` · `escalate` |
	| `severity` | int 1-5 | ❌ (scored in hard) | `1` (mild) → `5` (critical) |
	| `rationale` | str | ❌ | Free text explanation |

	## Reward Function

	| Tier | Label | Action | Severity ±1 |
	|------|-------|--------|-------------|
	| easy / medium | 0.5 | 0.5 | — |
	| hard | 0.4 | 0.4 | 0.2 |

	## Baseline Scores (lexical agent, seed=42)

	| Tier | N | Mean Reward |
	|------|---|-------------|
	| easy | 52 | 0.375 |
	| medium | 25 | 0.460 |
	| hard | 51 | 0.144 |
	| overall | 128 | 0.300 |
	""")

	        # ── Tab 4: Campaign Detection ────────────────────────────────────
	        with gr.Tab("🎯 Campaign Detection"):
	            gr.Markdown("""
	### Coordinated Campaign Detection
	Review 3 posts from different accounts and determine whether they form
	a coordinated inauthentic behavior campaign.

	| Field | Description |
	|-------|-------------|
	| `is_coordinated` | True if posts are from a coordinated operation |
	| `action` | `allow` / `remove` / `shadowban` / `escalate` |
	| Reward | +0.5 coordination detected · +0.5 action correct · -0.2 false positive |
	""")
	            with gr.Row():
	                with gr.Column(scale=1):
	                    camp_sid_dd = gr.Dropdown(
	                        choices=CAMPAIGN_IDS,
	                        # Same empty-list guard as the scenario dropdown.
	                        value=CAMPAIGN_IDS[0] if CAMPAIGN_IDS else None,
	                        label="Select Campaign to Load",
	                        interactive=True,
	                    )
	                    camp_type_md = gr.Markdown()
	                with gr.Column(scale=2):
	                    camp_posts_md = gr.Markdown("Select a campaign from the dropdown to begin →", elem_classes=["observation-card"])

	            with gr.Row():
	                is_coord_dd = gr.Dropdown(
	                    choices=["true", "false"],
	                    value="false",
	                    label="Is Coordinated?"
	                )
	                camp_action_dd = gr.Dropdown(
	                    choices=["allow", "remove", "shadowban", "escalate"],
	                    value="allow",
	                    label="Action"
	                )
	            reasoning_tb = gr.Textbox(
	                label="Reasoning (optional)", lines=2,
	                placeholder="Explain your coordination assessment..."
	            )
	            camp_submit_btn = gr.Button(
	                "Submit → campaign_env.step()", variant="primary", elem_classes=["action-btn"]
	            )
	            camp_result_md = gr.Markdown()

	            camp_sid_dd.change(
	                load_campaign,
	                inputs=[camp_sid_dd],
	                outputs=[camp_type_md, camp_posts_md]
	            )
	            camp_submit_btn.click(
	                submit_campaign,
	                inputs=[camp_sid_dd, is_coord_dd, camp_action_dd, reasoning_tb],
	                outputs=[camp_result_md]
	            )

	    gr.Markdown("""
	---
	<p style="text-align:center; color: #888; font-size: 0.85rem;">
	ContentModerationEnv v2.0 · OpenEnv · MIT License
	</p>
	""")


	# ── OpenEnv HTTP API routes ───────────────────────────────────────────────────
	# The JSON routes are registered on a FastAPI app created here; the Gradio UI
	# is mounted onto that same app after the routes are declared. This lets
	# POST /reset return HTTP 200, satisfying the HF Space validator check.

	from fastapi import FastAPI, Request
	from fastapi.responses import JSONResponse
	import uvicorn

	# Host application for the /reset, /step and /state routes below.
	app = FastAPI()

	@app.post("/reset")
	@app.post("/reset/")
	async def api_reset(request: Request):
	    """POST /reset → initial observation, HTTP 200.

	    An optional JSON body may carry ``scenario_id``. A missing, non-JSON or
	    unparsable body is treated as empty. Any failure while resetting is
	    reported as HTTP 400 with an ``error`` field.
	    """
	    payload: dict = {}
	    try:
	        content_type = request.headers.get("content-type", "")
	        if content_type.startswith("application/json"):
	            payload = await request.json()
	    except Exception:
	        # Best effort: an unreadable body falls back to a default reset.
	        payload = {}

	    requested_id = payload.get("scenario_id", None) if isinstance(payload, dict) else None
	    try:
	        observation = env.reset(scenario_id=requested_id)
	        return JSONResponse({"state": observation, "status": "ok"})
	    except Exception as exc:
	        return JSONResponse({"error": str(exc)}, status_code=400)


	@app.post("/step")
	@app.post("/step/")
	async def api_step(request: Request):
	    """POST /step → takes action dict, returns result.

	    Two body shapes are accepted:

	    * wrapped: ``{"action": {"label": ..., "action": ..., ...}}``
	    * flat: ``{"label": ..., "action": "remove", ...}``

	    The wrapper key and the action field share the name ``action``, so a flat
	    body used to be misread as a bare string; it is now passed to
	    ``env.step()`` as-is. An unparsable or non-object body results in an empty
	    action. Any failure in ``env.step()`` is reported as HTTP 400 with an
	    ``error`` field.
	    """
	    try:
	        body = await request.json()
	    except Exception:
	        body = {}

	    if not isinstance(body, dict):
	        action = {}
	    elif isinstance(body.get("action"), dict):
	        # Wrapped form: the nested dict is the action.
	        action = body["action"]
	    elif "action" in body:
	        # Flat form: "action" is a field of the action itself.
	        action = body
	    else:
	        action = {}

	    try:
	        result = env.step(action)
	        return JSONResponse(result)
	    except Exception as exc:
	        return JSONResponse({"error": str(exc)}, status_code=400)


	@app.get("/state")
	@app.get("/state/")
	async def api_state():
	    """GET /state → current environment state.

	    Responds with ``{"state": ..., "status": "ok"}``; any failure is reported
	    as HTTP 400 with an ``error`` field.
	    """
	    try:
	        current = env.state()
	        payload = {"state": current, "status": "ok"}
	        return JSONResponse(payload)
	    except Exception as exc:
	        failure = {"error": str(exc)}
	        return JSONResponse(failure, status_code=400)


	# Mount the Gradio UI at the root path of the FastAPI app and rebind `app` to
	# the result; this happens after the JSON routes above have been registered.
	app = gr.mount_gradio_app(app, demo, path="/")

	def main():
	    """Serve the combined FastAPI + Gradio app on 0.0.0.0:7860."""
	    # Hand uvicorn the app object rather than the import string
	    # "server.app:app": with the string form, running this file as a script
	    # makes uvicorn import the module a second time, which rebuilds the
	    # environments and the UI.
	    uvicorn.run(app, host="0.0.0.0", port=7860)

	if __name__ == "__main__":
	    main()