ANI00 committed on
Commit
f3ed76a
·
verified ·
1 Parent(s): c669dad

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. .gitattributes +35 -35
  2. .gitignore +6 -6
  3. README.md +920 -920
  4. uv.lock +236 -236
.gitattributes CHANGED
@@ -1,35 +1,35 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1,7 +1,7 @@
1
- __pycache__
2
- *.pyc
3
- .env
4
-
5
- .sixth
6
- .pytest_cache
7
  .coverage
 
1
+ __pycache__
2
+ *.pyc
3
+ .env
4
+
5
+ .sixth
6
+ .pytest_cache
7
  .coverage
README.md CHANGED
@@ -1,920 +1,920 @@
1
- ---
2
- title: Content Moderation OpenEnv
3
- emoji: 🛡️
4
- colorFrom: blue
5
- colorTo: indigo
6
- sdk: docker
7
- app_file: server/main.py
8
- pinned: false
9
- ---
10
-
11
- # Content Moderation OpenEnv
12
-
13
- An AI content moderation environment built to the OpenEnv specification. Agents triage real-world content — spam emails, harmful social media posts, and AI-generated deepfakes — using a standard `step()` / `reset()` / `state()` API.
14
-
15
- [![OpenEnv Spec](https://img.shields.io/badge/OpenEnv-Spec-blue)](https://github.com/openenv-core/spec)
16
- [![Python 3.11+](https://img.shields.io/badge/Python-3.11+-blue.svg)](https://www.python.org/downloads/)
17
- [![FastAPI](https://img.shields.io/badge/FastAPI-0.111.0-green.svg)](https://fastapi.tiangolo.com/)
18
- [![Docker](https://img.shields.io/badge/Docker-Ready-blue.svg)](https://www.docker.com/)
19
- [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
20
-
21
- ---
22
-
23
- ## 📋 Table of Contents
24
-
25
- - [Environment Description & Motivation](#environment-description--motivation)
26
- - [Task Descriptions](#task-descriptions)
27
- - [Observation Space](#observation-space)
28
- - [Action Space](#action-space)
29
- - [Reward Functions](#reward-functions)
30
- - [Baseline Scores](#baseline-scores)
31
- - [Setup & Usage](#setup--usage)
32
- - [Requirements](#requirements)
33
- - [Local Installation](#local-installation)
34
- - [Docker Deployment](#docker-deployment)
35
- - [HuggingFace Spaces Deployment](#huggingface-spaces-deployment)
36
- - [Running the Inference Script](#running-the-inference-script)
37
- - [API Reference](#api-reference)
38
- - [Project Structure](#project-structure)
39
- - [Environment Variables](#environment-variables)
40
- - [Running Tests](#running-tests)
41
- - [Troubleshooting](#troubleshooting)
42
- - [Citation](#citation)
43
- - [Acknowledgements](#acknowledgements)
44
-
45
- ---
46
-
47
- ## Environment Description & Motivation
48
-
49
- Content moderation is a high-stakes, high-volume real-world task. Human moderators review millions of items daily across platforms and inboxes. This environment simulates a realistic moderation pipeline across three difficulty levels, enabling AI agents to learn decision-making strategies under resource constraints.
50
-
51
- **Key Challenges:**
52
- - Multi-label classification with imbalanced datasets
53
- - Confidence calibration under uncertainty
54
- - Real-world content variability (spam, deepfakes, policy violations)
55
- - Escalation vs. immediate action tradeoffs
56
-
57
- | Task | Difficulty | Content Type | Metrics | Description |
58
- |---|---|---|---|---|
59
- | `text_spam` | Easy | Email / SMS | Binary classification + confidence | Spam vs. legitimate email filtering with sender reputation signals |
60
- | `content_moderation` | Medium | Social media posts | Multi-label (up to 11 labels) | Detect policy violations (hate speech, harassment, violence, etc.) |
61
- | `deepfake_detection` | Hard | Multimodal (image + text) | Detection accuracy + labels | Identify AI-manipulated media using vision transformer + metadata |
62
-
63
- **Models Used:**
64
- - **Deepfake Detection**: `dima806/deepfake_vs_real_image_detection` (ViT, ~300MB)
65
- - **Language Model**: `meta-llama/Llama-3.1-8B-Instruct` (8B params, via transformers or HF API)
66
- - **Infrastructure**: FastAPI + Uvicorn, Docker containerization
67
-
68
- ---
69
-
70
- ## Task Descriptions
71
-
72
- ### Task 1: Text Spam Classification (Easy)
73
- **Objective:** Binary classification of emails/SMS as spam or legitimate.
74
-
75
- **Dataset:**
76
- - 50 items (30 spam, 20 legitimate)
77
- - Features: text content, sender reputation score, link count, source
78
- - Ground truth: decision + labels
79
-
80
- **Example:**
81
- ```json
82
- {
83
- "content_id": "ts_001",
84
- "content_type": "text",
85
- "text": "CONGRATULATIONS! You've won $1,000,000! Click here NOW to claim your prize!!!",
86
- "metadata": {"source": "email", "sender_reputation": 0.05, "link_count": 3},
87
- "ground_truth": {
88
- "decision": "reject",
89
- "labels": ["spam", "scam"],
90
- "is_harmful": true
91
- }
92
- }
93
- ```
94
-
95
- ---
96
-
97
- ### Task 2: Content Moderation (Medium)
98
- **Objective:** Multi-label classification of social media posts for policy violations.
99
-
100
- **Dataset:**
101
- - 40 items across diverse platforms
102
- - Labels: spam, scam, phishing, hate_speech, violence, harassment, misinformation, adult_content, deepfake, political_manipulation, fraud
103
- - Features: post text, engagement metrics, user reputation, report count
104
-
105
- **Violation Categories:**
106
- | Category | Definition | Examples |
107
- |---|---|---|
108
- | Hate Speech | Dehumanizing content targeting identity | Slurs, discrimination, incitement |
109
- | Violence | Threats or glorification of violence | Physical harm, weapon promotion |
110
- | Harassment | Coordinated or severe personal attacks | Doxxing, targeted campaigns |
111
- | Misinformation | False claims with societal impact | Election fraud claims, health hoaxes |
112
-
113
- ---
114
-
115
- ### Task 3: Deepfake Detection (Hard)
116
- **Objective:** Detect AI-manipulated media and classify content appropriately.
117
-
118
- **Dataset:**
119
- - 30 items (multimodal: images + descriptions)
120
- - Deepfake detection model outputs raw confidence scores (0-1)
121
- - Features: image description, detector_score, metadata
122
-
123
- **Detector Score Interpretation:**
124
- - `0.0-0.3`: Likely real/authentic
125
- - `0.3-0.7`: Uncertain, may require additional analysis
126
- - `0.7-1.0`: Likely deepfake/manipulated
127
-
128
- **Example:**
129
- ```json
130
- {
131
- "content_id": "df_001",
132
- "content_type": "multimodal",
133
- "image_description": "Portrait of person in business attire, lighting appears natural",
134
- "detector_score": 0.82,
135
- "metadata": {"platform": "social_media", "report_count": 3}
136
- }
137
- ```
138
-
139
- ---
140
-
141
- ## Observation Space
142
-
143
- Every step returns a `ContentObservation` with the following structure:
144
-
145
- ```json
146
- {
147
- "content_id": "string",
148
- "content_type": "text | multimodal",
149
- "text": "string (optional, for text tasks)",
150
- "image_description": "string (optional, deepfake task only)",
151
- "detector_score": 0.0-1.0 (optional, deepfake task only),
152
- "metadata": {
153
- "source": "email | social_media | platform",
154
- "sender_reputation": 0.0-1.0,
155
- "link_count": 0,
156
- "report_count": 0,
157
- "timestamp": "ISO8601"
158
- },
159
- "step_num": 1,
160
- "total_steps": 10
161
- }
162
- ```
163
-
164
- | Field | Type | Task | Description |
165
- |---|---|---|---|
166
- | `content_id` | string | All | Unique identifier for the content item |
167
- | `content_type` | string | All | Type of content: `text` or `multimodal` |
168
- | `text` | string | text_spam, content_moderation | The actual email/post body |
169
- | `image_description` | string | deepfake_detection | AI-generated description of the image |
170
- | `detector_score` | float | deepfake_detection | Raw output from deepfake model (0-1) |
171
- | `metadata` | object | All | Platform-specific signals (reputation, reports, etc.) |
172
- | `step_num` | int | All | Current step in episode |
173
- | `total_steps` | int | All | Total steps in this episode |
174
-
175
- ---
176
-
177
- ## Action Space
178
-
179
- Agents must respond with a `ModerationAction` object:
180
-
181
- ```json
182
- {
183
- "decision": "approve | reject | escalate | flag",
184
- "reason": "string (max 200 chars, explanation)",
185
- "confidence": 0.0-1.0,
186
- "labels": ["spam", "scam", "phishing", "hate_speech", "violence",
187
- "harassment", "misinformation", "adult_content",
188
- "deepfake", "political_manipulation", "fraud"]
189
- }
190
- ```
191
-
192
- | Field | Type | Range | Description |
193
- |---|---|---|---|
194
- | `decision` | enum | {approve, reject, escalate, flag} | **approve**: Safe content; **reject**: Clear violation, remove immediately; **flag**: Borderline, needs review; **escalate**: Uncertain, route to specialist |
195
- | `reason` | string | 0-200 chars | Brief explanation of the decision |
196
- | `confidence` | float | [0.0, 1.0] | Agent's confidence in this decision. Calibration is rewarded. |
197
- | `labels` | array | 0-11 labels | Detected violation categories. Must be valid from the allowed set. |
198
-
199
- **Valid Labels:**
200
- - `spam` — Unsolicited bulk messaging
201
- - `scam` — Financial or identity theft attempts
202
- - `phishing` — Social engineering attacks
203
- - `hate_speech` — Dehumanizing content targeting identity groups
204
- - `violence` — Threats, glorification, or instruction of violence
205
- - `harassment` — Personal attacks, doxxing, coordinated abuse
206
- - `misinformation` — False claims with societal impact
207
- - `adult_content` — NSFW or sexually explicit material
208
- - `deepfake` — AI-manipulated media
209
- - `political_manipulation` — Coordinated inauthentic behavior
210
- - `fraud` — Financial scams, false claims
211
-
212
- ---
213
-
214
- ## Reward Functions
215
-
216
- Rewards are computed per task based on decision accuracy, label coverage (F1), and confidence calibration.
217
-
218
- ### text_spam (Easy)
219
-
220
- | Component | Reward | Condition |
221
- |---|---|---|
222
- | Correct decision | **+0.65** | `decision` matches ground truth |
223
- | Escalate on harmful | **+0.30** | Harmful content + escalate/flag (partial credit) |
224
- | Label F1 contribution | **+0.20** | F1 score of predicted vs. true labels |
225
- | Confidence calibration | **±0.10** | Bonus if confident on correct, penalty if confident on wrong |
226
- | **Max per step** | **1.00** | Sum of components (capped) |
227
-
228
- ### content_moderation (Medium)
229
-
230
- | Component | Reward | Condition |
231
- |---|---|---|
232
- | Correct decision | **+0.50** | `decision` matches ground truth |
233
- | Partial credit | **+0.25** | Harmful content + flag/escalate (conservative approach) |
234
- | Label F1 contribution | **+0.35** | Multi-label F1 score (up to 11 labels) |
235
- | Confidence calibration | **±0.10** | Brier score penalty for miscalibration |
236
- | **Max per step** | **1.00** | Sum of components (capped) |
237
-
238
- ### deepfake_detection (Hard)
239
-
240
- | Component | Reward | Condition |
241
- |---|---|---|
242
- | Correct decision | **+0.40** | `decision` matches ground truth |
243
- | Deepfake detection | **+0.30** | Accuracy vs. detector_score threshold |
244
- | Detector alignment | **+0.10** | Bonus for leveraging model signals |
245
- | Label F1 contribution | **+0.20** | Multi-label F1 (fewer labels than medium task) |
246
- | Confidence calibration | **±0.10** | Calibration error penalty |
247
- | **Max per step** | **1.00** | Sum of components (capped) |
248
-
249
- **Calibration Bonus Formula:**
250
- ```
251
- bonus = 0.1 × (confidence if correct else -confidence)
252
- ```
253
-
254
- ---
255
-
256
- ## Baseline Scores
257
-
258
- Scores reported for **Llama-3.1-8B-Instruct** with `temperature=0.2` and `top_p=0.95`:
259
-
260
- | Task | Score | Steps | Notes |
261
- |---|---|---|---|
262
- | `text_spam` | **0.72** | 5 | Strong on obvious spam; struggles with phishing disguised as legitimate |
263
- | `content_moderation` | **0.58** | 8 | Good binary decisions; incomplete label coverage (F1 ≈0.52) |
264
- | `deepfake_detection` | **0.44** | 10 | Relies on image descriptions; independent detector signals underutilized |
265
-
266
- ---
267
-
268
- ## Setup & Usage
269
-
270
- ### Requirements
271
-
272
- - **Python**: 3.11 or higher
273
- - **Docker** (optional, for containerized deployment)
274
- - **GPU** (optional, recommended for deepfake models): CUDA 12.1+
275
- - **Memory**: 8GB+ RAM (16GB recommended for local LLM inference)
276
- - **Disk**: 10GB+ (models cached in `~/.cache/huggingface/`)
277
-
278
- ### Local Installation
279
-
280
- 1. **Clone and navigate:**
281
- ```bash
282
- git clone https://github.com/Anidipta/Content-Moderation-env.git
283
- cd Content-Moderation-env
284
- ```
285
-
286
- 2. **Create virtual environment:**
287
- ```bash
288
- python -m venv venv
289
- source venv/bin/activate # On Windows: venv\Scripts\activate
290
- ```
291
-
292
- 3. **Install dependencies:**
293
- ```bash
294
- pip install -r server/requirements.txt
295
- ```
296
-
297
- 4. **Start the server:**
298
- ```bash
299
- uvicorn server.main:app --host 0.0.0.0 --port 7860
300
- ```
301
-
302
- Server runs at `http://localhost:7860`
303
-
304
- 5. **Access API documentation:**
305
- - Swagger UI: `http://localhost:7860/docs`
306
- - ReDoc: `http://localhost:7860/redoc`
307
-
308
- ### Docker Deployment
309
-
310
- #### Build the Image
311
-
312
- ```bash
313
- # Basic build
314
- docker build -f server/Dockerfile -t content-moderation-env .
315
-
316
- # Build with memory allocation (recommended)
317
- docker build --memory=4g -f server/Dockerfile -t content-moderation-env .
318
-
319
- # Build with progress output
320
- docker build --progress=plain -f server/Dockerfile -t content-moderation-env .
321
- ```
322
-
323
- #### Run the Container
324
-
325
- ```bash
326
- # Basic run
327
- docker run -p 7860:7860 content-moderation-env
328
-
329
- # Run with environment variables
330
- docker run -p 7860:7860 \
331
- -e API_BASE_URL="https://router.huggingface.co/v1" \
332
- -e MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct" \
333
- -e HF_TOKEN="hf_your_token_here" \
334
- content-moderation-env
335
-
336
- # Run with GPU support
337
- docker run --gpus all -p 7860:7860 content-moderation-env
338
-
339
- # Run with volume mounts (cache models locally)
340
- docker run -p 7860:7860 \
341
- -v ~/.cache/huggingface:/app/.cache/huggingface \
342
- content-moderation-env
343
-
344
- # Run in background
345
- docker run -d -p 7860:7860 --name moderation-env content-moderation-env
346
-
347
- # Check logs
348
- docker logs moderation-env
349
-
350
- # Stop container
351
- docker stop moderation-env
352
- ```
353
-
354
- #### Dockerfile Details
355
-
356
- The [server/Dockerfile](server/Dockerfile) uses:
357
- - **Base Image**: `python:3.11-slim` (~300MB) — minimal footprint with Python runtime
358
- - **System Dependencies**: `libgl1 libglib2.0-0 curl` — required for vision models and health checks
359
- - **Dependencies Installation**: Multi-stage approach with pip cache optimization
360
- - **Model Preloading**: Deepfake detection model downloaded during build for faster startup
361
- - **Environment Setup**: HuggingFace cache directories and Python settings pre-configured
362
- - **Entry Point**: FastAPI app via Uvicorn on port 7860
363
-
364
- ```text
365
- # Key optimizations:
366
- - --no-cache-dir: Reduces image size by 50%
367
- - --no-build-isolation: Prevents memory spikes during pip install
368
- - Pre-downloaded models: Eliminates first-run delays
369
- - Minimal dependencies: Only libraries needed for the environment
370
- ```
371
-
372
- #### Deployment to Production
373
-
374
- **Docker Compose:**
375
- ```yaml
376
- version: '3.8'
377
- services:
378
- moderation-api:
379
- build:
380
- context: .
381
- dockerfile: server/Dockerfile
382
- ports:
383
- - "7860:7860"
384
- environment:
385
- - API_BASE_URL=https://router.huggingface.co/v1
386
- - MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct
387
- - HF_TOKEN=${HF_TOKEN}
388
- volumes:
389
- - ~/.cache/huggingface:/app/.cache/huggingface
390
- restart: unless-stopped
391
- healthcheck:
392
- test: ["CMD", "curl", "-f", "http://localhost:7860/health"]
393
- interval: 30s
394
- timeout: 10s
395
- retries: 3
396
- ```
397
-
398
- Run with: `docker-compose up -d`
399
-
400
- ### HuggingFace Spaces Deployment
401
-
402
- 1. Create a new Space with Docker SDK
403
- 2. Add Secrets (Settings → Repository secrets):
404
- - `HF_TOKEN`: Your HuggingFace API token
405
- 3. Add Variables (Settings → Repository variables):
406
- - `API_BASE_URL`: `https://router.huggingface.co/v1`
407
- - `MODEL_NAME`: `meta-llama/Llama-3.1-8B-Instruct`
408
- 4. Push this repository to the Space
409
- 5. Space URL becomes your `PING_URL` for validation scripts
410
-
411
- ---
412
-
413
- ## Running the Inference Script
414
-
415
- ```bash
416
- # API mode (HF inference endpoint)
417
- export API_BASE_URL="https://router.huggingface.co/v1"
418
- export MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct"
419
- export HF_TOKEN="hf_your_token_here"
420
- export SERVER_URL="http://localhost:7860"
421
- export TASK_NAME="text_spam"
422
-
423
- python inference.py
424
-
425
- # Local transformers pipeline mode
426
- export USE_LOCAL_MODEL="true"
427
- python inference.py
428
- ```
429
-
430
- ### Output Format
431
-
432
- ```
433
- [START] task=text_spam env=content_moderation_env model=meta-llama/Llama-3.1-8B-Instruct
434
- [STEP] step=1 action={"decision":"reject","confidence":0.9,"labels":["spam"]} reward=0.85 done=false error=null
435
- [STEP] step=2 action={"decision":"approve","confidence":0.8,"labels":[]} reward=0.75 done=false error=null
436
- [STEP] step=3 action={"decision":"escalate","confidence":0.5,"labels":["scam"]} reward=0.30 done=false error=null
437
- [STEP] step=4 action={"decision":"reject","confidence":0.85,"labels":["phishing"]} reward=0.70 done=false error=null
438
- [STEP] step=5 action={"decision":"approve","confidence":0.88,"labels":[]} reward=0.75 done=true error=null
439
- [END] success=true steps=5 score=0.720 rewards=0.85,0.75,0.30,0.70,0.75
440
- ```
441
-
442
- | Field | Type | Description |
443
- |---|---|---|
444
- | `task` | string | The task being evaluated |
445
- | `step` | int | Current step number in episode |
446
- | `decision` | string | Agent's moderation decision |
447
- | `confidence` | float | Agent's confidence (0-1) |
448
- | `labels` | array | Detected violation labels |
449
- | `reward` | float | Reward received for this step |
450
- | `done` | boolean | Episode completion flag |
451
- | `error` | string/null | Error message if applicable |
452
- | `score` | float | Final episode score |
453
-
454
- ---
455
-
456
- ## API Reference
457
-
458
- ### Server Endpoints
459
-
460
- All endpoints are JSON-based with FastAPI's automatic validation.
461
-
462
- #### 1. Reset Episode
463
- **POST** `/reset`
464
-
465
- Start a new moderation episode.
466
-
467
- **Request Body:**
468
- ```json
469
- {
470
- "task": "text_spam"
471
- }
472
- ```
473
-
474
- **Response (200 OK):**
475
- ```json
476
- {
477
- "observation": {
478
- "content_id": "ts_001",
479
- "content_type": "text",
480
- "text": "CONGRATULATIONS! You've won $1,000,000!...",
481
- "metadata": {"source": "email", "sender_reputation": 0.05, "link_count": 3},
482
- "step_num": 1,
483
- "total_steps": 10
484
- },
485
- "info": {}
486
- }
487
- ```
488
-
489
- **Error (400):**
490
- ```json
491
- {
492
- "detail": "Unknown task 'invalid_task'. Valid: ['text_spam', 'content_moderation', 'deepfake_detection']"
493
- }
494
- ```
495
-
496
- ---
497
-
498
- #### 2. Submit Action
499
- **POST** `/step`
500
-
501
- Submit a moderation action for the current content.
502
-
503
- **Request Body:**
504
- ```json
505
- {
506
- "decision": "reject",
507
- "reason": "Email contains typical spam patterns and suspicious links",
508
- "confidence": 0.92,
509
- "labels": ["spam", "scam"]
510
- }
511
- ```
512
-
513
- **Response (200 OK):**
514
- ```json
515
- {
516
- "observation": {
517
- "content_id": "ts_002",
518
- "content_type": "text",
519
- "text": "Hi Sarah, confirming our meeting tomorrow...",
520
- "metadata": {"source": "email", "sender_reputation": 0.92, "link_count": 0},
521
- "step_num": 2,
522
- "total_steps": 10
523
- },
524
- "reward": 0.85,
525
- "done": false,
526
- "info": {}
527
- }
528
- ```
529
-
530
- ---
531
-
532
- #### 3. Get Current State
533
- **GET** `/state`
534
-
535
- Retrieve the current episode state without taking an action.
536
-
537
- **Response (200 OK):**
538
- ```json
539
- {
540
- "observation": {...},
541
- "reward": 0.85,
542
- "done": false,
543
- "info": {
544
- "task": "text_spam",
545
- "items_completed": 2,
546
- "total_items": 10,
547
- "cumulative_reward": 1.60
548
- }
549
- }
550
- ```
551
-
552
- ---
553
-
554
- #### 4. Close Episode
555
- **POST** `/close`
556
-
557
- Explicitly close the episode and clean up resources.
558
-
559
- **Response (200 OK):**
560
- ```json
561
- {
562
- "status": "closed",
563
- "final_reward": 7.20,
564
- "steps_completed": 10
565
- }
566
- ```
567
-
568
- ---
569
-
570
- #### 5. List Available Tasks
571
- **GET** `/tasks`
572
-
573
- Get metadata about all available tasks.
574
-
575
- **Response (200 OK):**
576
- ```json
577
- {
578
- "text_spam": {
579
- "description": "Classify email/message content as spam or legitimate",
580
- "difficulty": "easy",
581
- "num_items": 50,
582
- "content_type": "text"
583
- },
584
- "content_moderation": {
585
- "description": "Detect policy violations in social media posts",
586
- "difficulty": "medium",
587
- "num_items": 40,
588
- "content_type": "text"
589
- },
590
- "deepfake_detection": {
591
- "description": "Identify AI-manipulated media",
592
- "difficulty": "hard",
593
- "num_items": 30,
594
- "content_type": "multimodal"
595
- }
596
- }
597
- ```
598
-
599
- ---
600
-
601
- #### 6. Health Check
602
- **GET** `/health`
603
-
604
- Check server health and status.
605
-
606
- **Response (200 OK):**
607
- ```json
608
- {
609
- "status": "ok"
610
- }
611
- ```
612
-
613
- ---
614
-
615
- #### 7. Root Endpoint
616
- **GET** `/`
617
-
618
- Redirects to interactive Swagger UI documentation.
619
-
620
- ---
621
-
622
- ## Project Structure
623
-
624
- ```
625
- content-moderation-env/
626
-
627
- ├── README.md # This file
628
- ├── uv.lock # Dependency lock file (UV package manager)
629
- ├── inference.py # Baseline agent script (235 lines)
630
- │ # Demonstrates LLM agent interaction
631
- │ # Supports HF API and local inference modes
632
-
633
- ├── server/ # FastAPI application (core)
634
- │ ├── __init__.py # Package marker (empty)
635
- │ │
636
- │ ├── main.py # FastAPI app & HTTP endpoints (57 lines)
637
- │ │ # Defines: /reset, /step, /state, /close
638
- │ │ # /tasks, /health, / endpoints
639
- │ │
640
- │ ├── env.py # OpenEnv environment implementation (122 lines)
641
- │ │ # Core logic: reset(), step(), state(), close()
642
- │ │ # Thread-safe with locks for concurrency
643
- │ │
644
- │ ├── models.py # Pydantic data models
645
- │ │ # Defines: ContentObservation, ModerationAction
646
- │ │ # StepResult, ResetResult, EnvState
647
- │ │
648
- │ ├── tasks.py # Task datasets & ground truth (193 lines)
649
- │ │ # Contains: text_spam, content_moderation,
650
- │ │ # deepfake_detection task definitions & items
651
- │ │
652
- │ ├── graders.py # Reward functions per task (95 lines)
653
- │ │ # Implements: label F1, calibration bonus,
654
- │ │ # decision accuracy scoring logic
655
- │ │
656
- │ ├── deepfake_model.py # HF deepfake detection pipeline (90 lines)
657
- │ │ # Lazy-loads: dima806/deepfake_vs_real...
658
- │ │ # Caches model in HF_HOME for reuse
659
- │ │
660
- │ ├── openenv.yaml # OpenEnv specification metadata
661
- │ │ # Declares task specs, observation/action space
662
- │ │
663
- │ ├── Dockerfile # Docker container definition
664
- │ │ # Base: python:3.11-slim (~300MB)
665
- │ │ # Installs system deps, pip packages,
666
- │ │ # pre-downloads deepfake model
667
- │ │
668
- │ └── requirements.txt # Python dependencies (12 packages)
669
- │ # Key: fastapi, uvicorn, transformers,
670
- │ # torch, openai, python-dotenv
671
-
672
- ├── test/ # Test suite
673
- │ └── test.py # pytest tests (20+ test cases)
674
- │ # Coverage: tasks, endpoints, rewards
675
-
676
- └── .env # Environment variables (git-ignored)
677
- # Stores: HF_TOKEN, API_BASE_URL, etc.
678
- ```
679
-
680
- ---
681
-
682
- ## Environment Variables
683
-
684
- Configuration is controlled via environment variables. Create a `.env` file in the project root:
685
-
686
- ```env
687
- # ============ API Configuration ============
688
- API_BASE_URL=https://router.huggingface.co/v1
689
- # URL of the LLM inference endpoint
690
- # Default: HuggingFace router (requires HF_TOKEN)
691
-
692
- MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct
693
- # Which LLM to use for agent inference
694
- # Other options: gpt-3.5-turbo, claude-3-opus, mistral-large, etc.
695
-
696
- HF_TOKEN=hf_your_token_here
697
- # HuggingFace API token for authenticated requests
698
- # Get from: https://huggingface.co/settings/tokens
699
-
700
- # ============ Server Configuration ============
701
- SERVER_URL=http://localhost:7860
702
- # Where the OpenEnv API server runs
703
- # Used by inference.py to connect to environment
704
-
705
- # ============ Task & Inference Configuration ============
706
- TASK_NAME=text_spam
707
- # Which task to run: text_spam, content_moderation, deepfake_detection
708
-
709
- USE_LOCAL_MODEL=false
710
- # If true: Load Llama-3.1-8B locally via transformers
711
- # If false: Use remote API (requires HF_TOKEN)
712
- # Local mode requires 16GB+ RAM
713
-
714
- # ============ HuggingFace Model Caching ============
715
- HF_HOME=/app/.cache/huggingface
716
- # Directory for cached HF models and datasets
717
- # Mounted as volume in Docker for persistence
718
-
719
- TRANSFORMERS_CACHE=/app/.cache/huggingface
720
- # Legacy env var for the transformers cache (deprecated; HF_HOME is preferred)
721
-
722
- # ============ Python Configuration ============
723
- PYTHONDONTWRITEBYTECODE=1
724
- # Don't create __pycache__ directories
725
-
726
- PYTHONUNBUFFERED=1
727
- # Stream logs immediately (useful in Docker)
728
-
729
- # ============ Logging ============
730
- LOG_LEVEL=INFO
731
- # Log level: DEBUG, INFO, WARNING, ERROR, CRITICAL
732
- ```
733
-
734
- ### Variable Precedence
735
-
736
- 1. Environment variables (highest priority)
737
- 2. `.env` file
738
- 3. Hardcoded defaults in code (lowest priority)
739
-
740
- Example override:
741
- ```bash
742
- export HF_TOKEN="hf_custom_token" && python inference.py
743
- # Uses custom token instead of .env value
744
- ```
745
-
746
- ---
747
-
748
- ## Running Tests
749
-
750
- The project includes a comprehensive test suite using pytest.
751
-
752
- ### Setup
753
-
754
- ```bash
755
- pip install pytest pytest-cov
756
- ```
757
-
758
- ### Run All Tests
759
-
760
- ```bash
761
- pytest test/test.py -v
762
- ```
763
-
764
- ### Run Specific Test Class
765
-
766
- ```bash
767
- pytest test/test.py::TestTasks -v
768
- ```
769
-
770
- ### Run with Coverage Report
771
-
772
- ```bash
773
- pytest test/test.py --cov=server --cov-report=html
774
- # Writes the report to htmlcov/index.html; open that file in a browser to view coverage
775
- ```
776
-
777
- ### Test Categories
778
-
779
- | Test | Coverage | Status |
780
- |---|---|---|
781
- | Task loading | All 3 tasks initialize correctly | ✓ |
782
- | API endpoints | /reset, /step, /state, /close, /tasks, /health | ✓ |
783
- | Reward grading | text_spam, content_moderation, deepfake_detection | ✓ |
784
- | Input validation | Action schema validation, label validation | ✓ |
785
- | Edge cases | Empty labels, out-of-range confidence, etc. | ✓ |
786
-
787
- ---
788
-
789
- ## Troubleshooting
790
-
791
- ### Installation Issues
792
-
793
- **Problem:** `ImportError: No module named 'openai'`
794
- ```bash
795
- Solution: pip install "openai>=1.40.0"
796
- ```
797
-
798
- **Problem:** `ImportError: No module named 'torch'`
799
- ```bash
800
- Solution: pip install torch torchvision
801
- # For GPU: pip install torch torchvision -f https://download.pytorch.org/whl/cu121/torch_stable.html
802
- ```
803
-
804
- **Problem:** `FileNotFoundError: requirements.txt`
805
- ```bash
806
- Solution: Ensure you're in the project root: cd Content-Moderation-env/
807
- # Then: pip install -r server/requirements.txt
808
- ```
809
-
810
- ### Docker Issues
811
-
812
- **Problem:** `Segmentation fault (core dumped)` during build
813
- ```
814
- Solution: Allocate more memory to Docker build:
815
- docker build --memory=8g -f server/Dockerfile -t content-moderation-env .
816
- ```
817
-
818
- **Problem:** `failed to solve: failed to compute cache key`
819
- ```
820
- Solution: Ensure requirements.txt is in server/ directory:
821
- # Current: server/requirements.txt (correct)
822
- # Wrong: ./requirements.txt
823
- ```
824
-
825
- **Problem:** Port 7860 already in use
826
- ```bash
827
- Solution: Use different port:
828
- docker run -p 8000:7860 content-moderation-env
829
- # Now access at http://localhost:8000
830
- ```
831
-
832
- ### Runtime Issues
833
-
834
- **Problem:** `Connection refused: localhost:7860`
835
- ```bash
836
- Solution: Ensure server is running:
837
- uvicorn server.main:app --host 0.0.0.0 --port 7860
838
-
839
- In Docker, use: docker logs <container_id>
840
- ```
841
-
842
- **Problem:** `Client.__init__() got an unexpected keyword argument 'proxies'`
843
- ```bash
844
- Solution: Update OpenAI client:
845
- pip install --upgrade openai
846
- ```
847
-
848
- **Problem:** HuggingFace models downloading very slowly
849
- ```bash
850
- Solution: Check internet connection and verify HF_TOKEN:
851
- export HF_TOKEN="hf_your_token_here"
852
- # Or download models ahead of time
853
- python -c "from transformers import pipeline; pipeline('image-classification', model='dima806/deepfake_vs_real_image_detection')"
854
- ```
855
-
856
- ### API Issues
857
-
858
- **Problem:** Invalid request to `/step` without `/reset`
859
- ```json
860
- Error: "Environment not initialized. Call /reset first."
861
- Solution: Always call POST /reset before any /step requests
862
- ```
863
-
864
- **Problem:** Invalid label in action
865
- ```json
866
- Error: {"detail": "Invalid label: 'unknown_label'"}
867
- Solution: Use only valid labels from the specification
868
- ```
869
-
870
- **Problem:** Confidence out of range
871
- ```
872
- Solution: Ensure confidence is between 0.0 and 1.0
873
- ```
874
-
875
- ---
876
-
877
- ## Citation
878
-
879
- If you use this environment in your research, please cite:
880
-
881
- ```bibtex
882
- @software{content_moderation_openenv_2025,
883
- title={Content Moderation OpenEnv: A Real-World AI Triage Environment},
884
- author={Anidipta},
885
- year={2025},
886
- url={https://github.com/Anidipta/Content-Moderation-env},
887
- note={OpenEnv Specification Compliant}
888
- }
889
- ```
890
-
891
- ---
892
-
893
- ## Acknowledgements
894
-
895
- 🙏 Built for the **OpenEnv Hackathon 2025**.
896
-
897
- **Special Thanks To:**
898
- - OpenEnv community for the specification and framework
899
- - HuggingFace for model hosting and inference APIs
900
- - Meta for the Llama-3.1-8B-Instruct model
901
- - Contributors and testers who improved the environment
902
-
903
- **Dataset & Content Note:**
904
- The email and content corpus is entirely **synthetic** and does not represent any real individuals, companies, organizations, or actual events. All examples are generated for demonstration and testing purposes only.
905
-
906
- **License:** MIT License — See [LICENSE](LICENSE) file for details
907
-
908
- **Questions?** Open an issue on GitHub or contact the maintainers.
909
-
910
- ---
911
-
912
- **Last Updated:** April 8, 2026 | **OpenEnv Spec Version:** 1.0
913
- colorTo: green
914
- sdk: docker
915
- pinned: false
916
- license: mit
917
- ---
918
-
919
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
920
- >>>>>>> f6dee02010a32ba1936311cbb3790fa087282e74
 
1
+ ---
2
+ title: Content Moderation OpenEnv
3
+ emoji: 🛡️
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: docker
7
+ app_file: server/main.py
8
+ pinned: false
9
+ ---
10
+
11
+ # Content Moderation OpenEnv
12
+
13
+ An AI content moderation environment built to the OpenEnv specification. Agents triage real-world content — spam emails, harmful social media posts, and AI-generated deepfakes — using a standard `step()` / `reset()` / `state()` API.
14
+
15
+ [![OpenEnv Spec](https://img.shields.io/badge/OpenEnv-Spec-blue)](https://github.com/openenv-core/spec)
16
+ [![Python 3.11+](https://img.shields.io/badge/Python-3.11+-blue.svg)](https://www.python.org/downloads/)
17
+ [![FastAPI](https://img.shields.io/badge/FastAPI-0.111.0-green.svg)](https://fastapi.tiangolo.com/)
18
+ [![Docker](https://img.shields.io/badge/Docker-Ready-blue.svg)](https://www.docker.com/)
19
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
20
+
21
+ ---
22
+
23
+ ## 📋 Table of Contents
24
+
25
+ - [Environment Description & Motivation](#environment-description--motivation)
26
+ - [Task Descriptions](#task-descriptions)
27
+ - [Observation Space](#observation-space)
28
+ - [Action Space](#action-space)
29
+ - [Reward Functions](#reward-functions)
30
+ - [Baseline Scores](#baseline-scores)
31
+ - [Setup & Usage](#setup--usage)
32
+ - [Requirements](#requirements)
33
+ - [Local Installation](#local-installation)
34
+ - [Docker Deployment](#docker-deployment)
35
+ - [HuggingFace Spaces Deployment](#huggingface-spaces-deployment)
36
+ - [Running the Inference Script](#running-the-inference-script)
37
+ - [API Reference](#api-reference)
38
+ - [Project Structure](#project-structure)
39
+ - [Environment Variables](#environment-variables)
40
+ - [Running Tests](#running-tests)
41
+ - [Troubleshooting](#troubleshooting)
42
+ - [Citation](#citation)
43
+ - [Acknowledgements](#acknowledgements)
44
+
45
+ ---
46
+
47
+ ## Environment Description & Motivation
48
+
49
+ Content moderation is a high-stakes, high-volume real-world task. Human moderators review millions of items daily across platforms and inboxes. This environment simulates a realistic moderation pipeline across three difficulty levels, enabling AI agents to learn decision-making strategies under resource constraints.
50
+
51
+ **Key Challenges:**
52
+ - Multi-label classification with imbalanced datasets
53
+ - Confidence calibration under uncertainty
54
+ - Real-world content variability (spam, deepfakes, policy violations)
55
+ - Escalation vs. immediate action tradeoffs
56
+
57
+ | Task | Difficulty | Content Type | Metrics | Description |
58
+ |---|---|---|---|---|
59
+ | `text_spam` | Easy | Email / SMS | Binary classification + confidence | Spam vs. legitimate email filtering with sender reputation signals |
60
+ | `content_moderation` | Medium | Social media posts | Multi-label (up to 11 labels) | Detect policy violations (hate speech, harassment, violence, etc.) |
61
+ | `deepfake_detection` | Hard | Multimodal (image + text) | Detection accuracy + labels | Identify AI-manipulated media using vision transformer + metadata |
62
+
63
+ **Models Used:**
64
+ - **Deepfake Detection**: `dima806/deepfake_vs_real_image_detection` (ViT, ~300MB)
65
+ - **Language Model**: `meta-llama/Llama-3.1-8B-Instruct` (8B params, via transformers or HF API)
66
+ - **Infrastructure**: FastAPI + Uvicorn, Docker containerization
67
+
68
+ ---
69
+
70
+ ## Task Descriptions
71
+
72
+ ### Task 1: Text Spam Classification (Easy)
73
+ **Objective:** Binary classification of emails/SMS as spam or legitimate.
74
+
75
+ **Dataset:**
76
+ - 50 items (30 spam, 20 legitimate)
77
+ - Features: text content, sender reputation score, link count, source
78
+ - Ground truth: decision + labels
79
+
80
+ **Example:**
81
+ ```json
82
+ {
83
+ "content_id": "ts_001",
84
+ "content_type": "text",
85
+ "text": "CONGRATULATIONS! You've won $1,000,000! Click here NOW to claim your prize!!!",
86
+ "metadata": {"source": "email", "sender_reputation": 0.05, "link_count": 3},
87
+ "ground_truth": {
88
+ "decision": "reject",
89
+ "labels": ["spam", "scam"],
90
+ "is_harmful": true
91
+ }
92
+ }
93
+ ```
94
+
95
+ ---
96
+
97
+ ### Task 2: Content Moderation (Medium)
98
+ **Objective:** Multi-label classification of social media posts for policy violations.
99
+
100
+ **Dataset:**
101
+ - 40 items across diverse platforms
102
+ - Labels: spam, scam, phishing, hate_speech, violence, harassment, misinformation, adult_content, deepfake, political_manipulation, fraud
103
+ - Features: post text, engagement metrics, user reputation, report count
104
+
105
+ **Violation Categories:**
106
+ | Category | Definition | Examples |
107
+ |---|---|---|
108
+ | Hate Speech | Dehumanizing content targeting identity | Slurs, discrimination, incitement |
109
+ | Violence | Threats or glorification of violence | Physical harm, weapon promotion |
110
+ | Harassment | Coordinated or severe personal attacks | Doxxing, targeted campaigns |
111
+ | Misinformation | False claims with societal impact | Election fraud claims, health hoaxes |
112
+
113
+ ---
114
+
115
+ ### Task 3: Deepfake Detection (Hard)
116
+ **Objective:** Detect AI-manipulated media and classify content appropriately.
117
+
118
+ **Dataset:**
119
+ - 30 items (multimodal: images + descriptions)
120
+ - Deepfake detection model outputs raw confidence scores (0-1)
121
+ - Features: image description, detector_score, metadata
122
+
123
+ **Detector Score Interpretation:**
124
+ - `0.0-0.3`: Likely real/authentic
125
+ - `0.3-0.7`: Uncertain, may require additional analysis
126
+ - `0.7-1.0`: Likely deepfake/manipulated
127
+
128
+ **Example:**
129
+ ```json
130
+ {
131
+ "content_id": "df_001",
132
+ "content_type": "multimodal",
133
+ "image_description": "Portrait of person in business attire, lighting appears natural",
134
+ "detector_score": 0.82,
135
+ "metadata": {"platform": "social_media", "report_count": 3}
136
+ }
137
+ ```
138
+
139
+ ---
140
+
141
+ ## Observation Space
142
+
143
+ Every step returns a `ContentObservation` with the following structure:
144
+
145
+ ```json
146
+ {
147
+ "content_id": "string",
148
+ "content_type": "text | multimodal",
149
+ "text": "string (optional, for text tasks)",
150
+ "image_description": "string (optional, deepfake task only)",
151
+ "detector_score": 0.0-1.0 (optional, deepfake task only),
152
+ "metadata": {
153
+ "source": "email | social_media | platform",
154
+ "sender_reputation": 0.0-1.0,
155
+ "link_count": 0,
156
+ "report_count": 0,
157
+ "timestamp": "ISO8601"
158
+ },
159
+ "step_num": 1,
160
+ "total_steps": 10
161
+ }
162
+ ```
163
+
164
+ | Field | Type | Task | Description |
165
+ |---|---|---|---|
166
+ | `content_id` | string | All | Unique identifier for the content item |
167
+ | `content_type` | string | All | Type of content: `text` or `multimodal` |
168
+ | `text` | string | text_spam, content_moderation | The actual email/post body |
169
+ | `image_description` | string | deepfake_detection | AI-generated description of the image |
170
+ | `detector_score` | float | deepfake_detection | Raw output from deepfake model (0-1) |
171
+ | `metadata` | object | All | Platform-specific signals (reputation, reports, etc.) |
172
+ | `step_num` | int | All | Current step in episode |
173
+ | `total_steps` | int | All | Total steps in this episode |
174
+
175
+ ---
176
+
177
+ ## Action Space
178
+
179
+ Agents must respond with a `ModerationAction` object:
180
+
181
+ ```json
182
+ {
183
+ "decision": "approve | reject | escalate | flag",
184
+ "reason": "string (max 200 chars, explanation)",
185
+ "confidence": 0.0-1.0,
186
+ "labels": ["spam", "scam", "phishing", "hate_speech", "violence",
187
+ "harassment", "misinformation", "adult_content",
188
+ "deepfake", "political_manipulation", "fraud"]
189
+ }
190
+ ```
191
+
192
+ | Field | Type | Range | Description |
193
+ |---|---|---|---|
194
+ | `decision` | enum | {approve, reject, escalate, flag} | **approve**: Safe content; **reject**: Clear violation, remove immediately; **flag**: Borderline, needs review; **escalate**: Uncertain, route to specialist |
195
+ | `reason` | string | 0-200 chars | Brief explanation of the decision |
196
+ | `confidence` | float | [0.0, 1.0] | Agent's confidence in this decision. Calibration is rewarded. |
197
+ | `labels` | array | 0-11 labels | Detected violation categories. Must be valid from the allowed set. |
198
+
199
+ **Valid Labels:**
200
+ - `spam` — Unsolicited bulk messaging
201
+ - `scam` — Financial or identity theft attempts
202
+ - `phishing` — Social engineering attacks
203
+ - `hate_speech` — Dehumanizing content targeting identity groups
204
+ - `violence` — Threats, glorification, or instruction of violence
205
+ - `harassment` — Personal attacks, doxxing, coordinated abuse
206
+ - `misinformation` — False claims with societal impact
207
+ - `adult_content` — NSFW or sexually explicit material
208
+ - `deepfake` — AI-manipulated media
209
+ - `political_manipulation` — Coordinated inauthentic behavior
210
+ - `fraud` — Financial scams, false claims
211
+
212
+ ---
213
+
214
+ ## Reward Functions
215
+
216
+ Rewards are computed per task based on decision accuracy, label coverage (F1), and confidence calibration.
217
+
218
+ ### text_spam (Easy)
219
+
220
+ | Component | Reward | Condition |
221
+ |---|---|---|
222
+ | Correct decision | **+0.65** | `decision` matches ground truth |
223
+ | Escalate on harmful | **+0.30** | Harmful content + escalate/flag (partial credit) |
224
+ | Label F1 contribution | **+0.20** | F1 score of predicted vs. true labels |
225
+ | Confidence calibration | **±0.10** | Bonus if confident on correct, penalty if confident on wrong |
226
+ | **Max per step** | **1.00** | Sum of components (capped) |
227
+
228
+ ### content_moderation (Medium)
229
+
230
+ | Component | Reward | Condition |
231
+ |---|---|---|
232
+ | Correct decision | **+0.50** | `decision` matches ground truth |
233
+ | Partial credit | **+0.25** | Harmful content + flag/escalate (conservative approach) |
234
+ | Label F1 contribution | **+0.35** | Multi-label F1 score (up to 11 labels) |
235
+ | Confidence calibration | **±0.10** | Brier score penalty for miscalibration |
236
+ | **Max per step** | **1.00** | Sum of components (capped) |
237
+
238
+ ### deepfake_detection (Hard)
239
+
240
+ | Component | Reward | Condition |
241
+ |---|---|---|
242
+ | Correct decision | **+0.40** | `decision` matches ground truth |
243
+ | Deepfake detection | **+0.30** | Accuracy vs. detector_score threshold |
244
+ | Detector alignment | **+0.10** | Bonus for leveraging model signals |
245
+ | Label F1 contribution | **+0.20** | Multi-label F1 (fewer labels than medium task) |
246
+ | Confidence calibration | **±0.10** | Calibration error penalty |
247
+ | **Max per step** | **1.00** | Sum of components (capped) |
248
+
249
+ **Calibration Bonus Formula:**
250
+ ```
251
+ bonus = 0.1 × (confidence if correct else -confidence)
252
+ ```
253
+
254
+ ---
255
+
256
+ ## Baseline Scores
257
+
258
+ Scores reported for **Llama-3.1-8B-Instruct** with `temperature=0.2` and `top_p=0.95`:
259
+
260
+ | Task | Score | Steps | Notes |
261
+ |---|---|---|---|
262
+ | `text_spam` | **0.72** | 5 | Strong on obvious spam; struggles with phishing disguised as legitimate |
263
+ | `content_moderation` | **0.58** | 8 | Good binary decisions; incomplete label coverage (F1 ≈0.52) |
264
+ | `deepfake_detection` | **0.44** | 10 | Relies on image descriptions; independent detector signals underutilized |
265
+
266
+ ---
267
+
268
+ ## Setup & Usage
269
+
270
+ ### Requirements
271
+
272
+ - **Python**: 3.11 or higher
273
+ - **Docker** (optional, for containerized deployment)
274
+ - **GPU** (optional, recommended for deepfake models): CUDA 12.1+
275
+ - **Memory**: 8GB+ RAM (16GB recommended for local LLM inference)
276
+ - **Disk**: 10GB+ (models cached in `~/.cache/huggingface/`)
277
+
278
+ ### Local Installation
279
+
280
+ 1. **Clone and navigate:**
281
+ ```bash
282
+ git clone https://github.com/Anidipta/Content-Moderation-env.git
283
+ cd Content-Moderation-env
284
+ ```
285
+
286
+ 2. **Create virtual environment:**
287
+ ```bash
288
+ python -m venv venv
289
+ source venv/bin/activate # On Windows: venv\Scripts\activate
290
+ ```
291
+
292
+ 3. **Install dependencies:**
293
+ ```bash
294
+ pip install -r server/requirements.txt
295
+ ```
296
+
297
+ 4. **Start the server:**
298
+ ```bash
299
+ uvicorn server.main:app --host 0.0.0.0 --port 7860
300
+ ```
301
+
302
+ Server runs at `http://localhost:7860`
303
+
304
+ 5. **Access API documentation:**
305
+ - Swagger UI: `http://localhost:7860/docs`
306
+ - ReDoc: `http://localhost:7860/redoc`
307
+
308
+ ### Docker Deployment
309
+
310
+ #### Build the Image
311
+
312
+ ```bash
313
+ # Basic build
314
+ docker build -f server/Dockerfile -t content-moderation-env .
315
+
316
+ # Build with memory allocation (recommended)
317
+ docker build --memory=4g -f server/Dockerfile -t content-moderation-env .
318
+
319
+ # Build with progress output
320
+ docker build --progress=plain -f server/Dockerfile -t content-moderation-env .
321
+ ```
322
+
323
+ #### Run the Container
324
+
325
+ ```bash
326
+ # Basic run
327
+ docker run -p 7860:7860 content-moderation-env
328
+
329
+ # Run with environment variables
330
+ docker run -p 7860:7860 \
331
+ -e API_BASE_URL="https://router.huggingface.co/v1" \
332
+ -e MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct" \
333
+ -e HF_TOKEN="hf_your_token_here" \
334
+ content-moderation-env
335
+
336
+ # Run with GPU support
337
+ docker run --gpus all -p 7860:7860 content-moderation-env
338
+
339
+ # Run with volume mounts (cache models locally)
340
+ docker run -p 7860:7860 \
341
+ -v ~/.cache/huggingface:/app/.cache/huggingface \
342
+ content-moderation-env
343
+
344
+ # Run in background
345
+ docker run -d -p 7860:7860 --name moderation-env content-moderation-env
346
+
347
+ # Check logs
348
+ docker logs moderation-env
349
+
350
+ # Stop container
351
+ docker stop moderation-env
352
+ ```
353
+
354
+ #### Dockerfile Details
355
+
356
+ The [server/Dockerfile](server/Dockerfile) uses:
357
+ - **Base Image**: `python:3.11-slim` (~300MB) — minimal footprint with Python runtime
358
+ - **System Dependencies**: `libgl1 libglib2.0-0 curl` — required for vision models and health checks
359
+ - **Dependencies Installation**: Multi-stage approach with pip cache optimization
360
+ - **Model Preloading**: Deepfake detection model downloaded during build for faster startup
361
+ - **Environment Setup**: HuggingFace cache directories and Python settings pre-configured
362
+ - **Entry Point**: FastAPI app via Uvicorn on port 7860
363
+
364
+ ```dockerfile
365
+ # Key optimizations:
366
+ - --no-cache-dir: Reduces image size by 50%
367
+ - --no-build-isolation: Prevents memory spikes during pip install
368
+ - Pre-downloaded models: Eliminates first-run delays
369
+ - Minimal dependencies: Only libraries needed for the environment
370
+ ```
371
+
372
+ #### Deployment to Production
373
+
374
+ **Docker Compose:**
375
+ ```yaml
376
+ version: '3.8'
377
+ services:
378
+   moderation-api:
379
+     build:
380
+       context: .
381
+       dockerfile: server/Dockerfile
382
+     ports:
383
+       - "7860:7860"
384
+     environment:
385
+       - API_BASE_URL=https://router.huggingface.co/v1
386
+       - MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct
387
+       - HF_TOKEN=${HF_TOKEN}
388
+     volumes:
389
+       - ~/.cache/huggingface:/app/.cache/huggingface
390
+     restart: unless-stopped
391
+     healthcheck:
392
+       test: ["CMD", "curl", "-f", "http://localhost:7860/health"]
393
+       interval: 30s
394
+       timeout: 10s
395
+       retries: 3
396
+ ```
397
+
398
+ Run with: `docker-compose up -d`
399
+
400
+ ### HuggingFace Spaces Deployment
401
+
402
+ 1. Create a new Space with Docker SDK
403
+ 2. Add Secrets (Settings → Repository secrets):
404
+ - `HF_TOKEN`: Your HuggingFace API token
405
+ 3. Add Variables (Settings → Repository variables):
406
+ - `API_BASE_URL`: `https://router.huggingface.co/v1`
407
+ - `MODEL_NAME`: `meta-llama/Llama-3.1-8B-Instruct`
408
+ 4. Push this repository to the Space
409
+ 5. Space URL becomes your `PING_URL` for validation scripts
410
+
411
+ ---
412
+
413
+ ## Running the Inference Script
414
+
415
+ ```bash
416
+ # API mode (HF inference endpoint)
417
+ export API_BASE_URL="https://router.huggingface.co/v1"
418
+ export MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct"
419
+ export HF_TOKEN="hf_your_token_here"
420
+ export SERVER_URL="http://localhost:7860"
421
+ export TASK_NAME="text_spam"
422
+
423
+ python inference.py
424
+
425
+ # Local transformers pipeline mode
426
+ export USE_LOCAL_MODEL="true"
427
+ python inference.py
428
+ ```
429
+
430
+ ### Output Format
431
+
432
+ ```
433
+ [START] task=text_spam env=content_moderation_env model=meta-llama/Llama-3.1-8B-Instruct
434
+ [STEP] step=1 action={"decision":"reject","confidence":0.9,"labels":["spam"]} reward=0.85 done=false error=null
435
+ [STEP] step=2 action={"decision":"approve","confidence":0.8,"labels":[]} reward=0.75 done=false error=null
436
+ [STEP] step=3 action={"decision":"escalate","confidence":0.5,"labels":["scam"]} reward=0.30 done=false error=null
437
+ [STEP] step=4 action={"decision":"reject","confidence":0.85,"labels":["phishing"]} reward=0.70 done=false error=null
438
+ [STEP] step=5 action={"decision":"approve","confidence":0.88,"labels":[]} reward=0.75 done=true error=null
439
+ [END] success=true steps=5 score=0.720 rewards=0.85,0.75,0.30,0.70,0.75
440
+ ```
441
+
442
+ | Field | Type | Description |
443
+ |---|---|---|
444
+ | `task` | string | The task being evaluated |
445
+ | `step` | int | Current step number in episode |
446
+ | `decision` | string | Agent's moderation decision |
447
+ | `confidence` | float | Agent's confidence (0-1) |
448
+ | `labels` | array | Detected violation labels |
449
+ | `reward` | float | Reward received for this step |
450
+ | `done` | boolean | Episode completion flag |
451
+ | `error` | string/null | Error message if applicable |
452
+ | `score` | float | Final episode score |
453
+
454
+ ---
455
+
456
+ ## API Reference
457
+
458
+ ### Server Endpoints
459
+
460
+ All endpoints are JSON-based with FastAPI's automatic validation.
461
+
462
+ #### 1. Reset Episode
463
+ **POST** `/reset`
464
+
465
+ Start a new moderation episode.
466
+
467
+ **Request Body:**
468
+ ```json
469
+ {
470
+ "task": "text_spam"
471
+ }
472
+ ```
473
+
474
+ **Response (200 OK):**
475
+ ```json
476
+ {
477
+ "observation": {
478
+ "content_id": "ts_001",
479
+ "content_type": "text",
480
+ "text": "CONGRATULATIONS! You've won $1,000,000!...",
481
+ "metadata": {"source": "email", "sender_reputation": 0.05, "link_count": 3},
482
+ "step_num": 1,
483
+ "total_steps": 10
484
+ },
485
+ "info": {}
486
+ }
487
+ ```
488
+
489
+ **Error (400):**
490
+ ```json
491
+ {
492
+ "detail": "Unknown task 'invalid_task'. Valid: ['text_spam', 'content_moderation', 'deepfake_detection']"
493
+ }
494
+ ```
495
+
496
+ ---
497
+
498
+ #### 2. Submit Action
499
+ **POST** `/step`
500
+
501
+ Submit a moderation action for the current content.
502
+
503
+ **Request Body:**
504
+ ```json
505
+ {
506
+ "decision": "reject",
507
+ "reason": "Email contains typical spam patterns and suspicious links",
508
+ "confidence": 0.92,
509
+ "labels": ["spam", "scam"]
510
+ }
511
+ ```
512
+
513
+ **Response (200 OK):**
514
+ ```json
515
+ {
516
+ "observation": {
517
+ "content_id": "ts_002",
518
+ "content_type": "text",
519
+ "text": "Hi Sarah, confirming our meeting tomorrow...",
520
+ "metadata": {"source": "email", "sender_reputation": 0.92, "link_count": 0},
521
+ "step_num": 2,
522
+ "total_steps": 10
523
+ },
524
+ "reward": 0.85,
525
+ "done": false,
526
+ "info": {}
527
+ }
528
+ ```
529
+
530
+ ---
531
+
532
+ #### 3. Get Current State
533
+ **GET** `/state`
534
+
535
+ Retrieve the current episode state without taking an action.
536
+
537
+ **Response (200 OK):**
538
+ ```json
539
+ {
540
+ "observation": {...},
541
+ "reward": 0.85,
542
+ "done": false,
543
+ "info": {
544
+ "task": "text_spam",
545
+ "items_completed": 2,
546
+ "total_items": 10,
547
+ "cumulative_reward": 1.60
548
+ }
549
+ }
550
+ ```
551
+
552
+ ---
553
+
554
+ #### 4. Close Episode
555
+ **POST** `/close`
556
+
557
+ Explicitly close the episode and clean up resources.
558
+
559
+ **Response (200 OK):**
560
+ ```json
561
+ {
562
+ "status": "closed",
563
+ "final_reward": 7.20,
564
+ "steps_completed": 10
565
+ }
566
+ ```
567
+
568
+ ---
569
+
570
+ #### 5. List Available Tasks
571
+ **GET** `/tasks`
572
+
573
+ Get metadata about all available tasks.
574
+
575
+ **Response (200 OK):**
576
+ ```json
577
+ {
578
+ "text_spam": {
579
+ "description": "Classify email/message content as spam or legitimate",
580
+ "difficulty": "easy",
581
+ "num_items": 50,
582
+ "content_type": "text"
583
+ },
584
+ "content_moderation": {
585
+ "description": "Detect policy violations in social media posts",
586
+ "difficulty": "medium",
587
+ "num_items": 40,
588
+ "content_type": "text"
589
+ },
590
+ "deepfake_detection": {
591
+ "description": "Identify AI-manipulated media",
592
+ "difficulty": "hard",
593
+ "num_items": 30,
594
+ "content_type": "multimodal"
595
+ }
596
+ }
597
+ ```
598
+
599
+ ---
600
+
601
+ #### 6. Health Check
602
+ **GET** `/health`
603
+
604
+ Check server health and status.
605
+
606
+ **Response (200 OK):**
607
+ ```json
608
+ {
609
+ "status": "ok"
610
+ }
611
+ ```
612
+
613
+ ---
614
+
615
+ #### 7. Root Endpoint
616
+ **GET** `/`
617
+
618
+ Redirects to interactive Swagger UI documentation.
619
+
620
+ ---
621
+
622
+ ## Project Structure
623
+
624
+ ```
625
+ content-moderation-env/
626
+
627
+ ├── README.md # This file
628
+ ├── uv.lock # Dependency lock file (UV package manager)
629
+ ├── inference.py # Baseline agent script (235 lines)
630
+ │ # Demonstrates LLM agent interaction
631
+ │ # Supports HF API and local inference modes
632
+
633
+ ├── server/ # FastAPI application (core)
634
+ │ ├── __init__.py # Package marker (empty)
635
+ │ │
636
+ │ ├── main.py # FastAPI app & HTTP endpoints (57 lines)
637
+ │ │ # Defines: /reset, /step, /state, /close
638
+ │ │ # /tasks, /health, / endpoints
639
+ │ │
640
+ │ ├── env.py # OpenEnv environment implementation (122 lines)
641
+ │ │ # Core logic: reset(), step(), state(), close()
642
+ │ │ # Thread-safe with locks for concurrency
643
+ │ │
644
+ │ ├── models.py # Pydantic data models
645
+ │ │ # Defines: ContentObservation, ModerationAction
646
+ │ │ # StepResult, ResetResult, EnvState
647
+ │ │
648
+ │ ├── tasks.py # Task datasets & ground truth (193 lines)
649
+ │ │ # Contains: text_spam, content_moderation,
650
+ │ │ # deepfake_detection task definitions & items
651
+ │ │
652
+ │ ├── graders.py # Reward functions per task (95 lines)
653
+ │ │ # Implements: label F1, calibration bonus,
654
+ │ │ # decision accuracy scoring logic
655
+ │ │
656
+ │ ├── deepfake_model.py # HF deepfake detection pipeline (90 lines)
657
+ │ │ # Lazy-loads: dima806/deepfake_vs_real...
658
+ │ │ # Caches model in HF_HOME for reuse
659
+ │ │
660
+ │ ├── openenv.yaml # OpenEnv specification metadata
661
+ │ │ # Declares task specs, observation/action space
662
+ │ │
663
+ │ ├── Dockerfile # Docker container definition
664
+ │ │ # Base: python:3.11-slim (~300MB)
665
+ │ │ # Installs system deps, pip packages,
666
+ │ │ # pre-downloads deepfake model
667
+ │ │
668
+ │ └── requirements.txt # Python dependencies (12 packages)
669
+ │ # Key: fastapi, uvicorn, transformers,
670
+ │ # torch, openai, python-dotenv
671
+
672
+ ├── test/ # Test suite
673
+ │ └── test.py # pytest tests (20+ test cases)
674
+ │ # Coverage: tasks, endpoints, rewards
675
+
676
+ └── .env # Environment variables (git-ignored)
677
+ # Stores: HF_TOKEN, API_BASE_URL, etc.
678
+ ```
679
+
680
+ ---
681
+
682
+ ## Environment Variables
683
+
684
+ Configuration is controlled via environment variables. Create a `.env` file in the project root:
685
+
686
+ ```env
687
+ # ============ API Configuration ============
688
+ API_BASE_URL=https://router.huggingface.co/v1
689
+ # URL of the LLM inference endpoint
690
+ # Default: HuggingFace router (requires HF_TOKEN)
691
+
692
+ MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct
693
+ # Which LLM to use for agent inference
694
+ # Other options: gpt-3.5-turbo, claude-3-opus, mistral-large, etc.
695
+
696
+ HF_TOKEN=hf_your_token_here
697
+ # HuggingFace API token for authenticated requests
698
+ # Get from: https://huggingface.co/settings/tokens
699
+
700
+ # ============ Server Configuration ============
701
+ SERVER_URL=http://localhost:7860
702
+ # Where the OpenEnv API server runs
703
+ # Used by inference.py to connect to environment
704
+
705
+ # ============ Task & Inference Configuration ============
706
+ TASK_NAME=text_spam
707
+ # Which task to run: text_spam, content_moderation, deepfake_detection
708
+
709
+ USE_LOCAL_MODEL=false
710
+ # If true: Load Llama-3.1-8B locally via transformers
711
+ # If false: Use remote API (requires HF_TOKEN)
712
+ # Local mode requires 16GB+ RAM
713
+
714
+ # ============ HuggingFace Model Caching ============
715
+ HF_HOME=/app/.cache/huggingface
716
+ # Directory for cached HF models and datasets
717
+ # Mounted as volume in Docker for persistence
718
+
719
+ TRANSFORMERS_CACHE=/app/.cache/huggingface
720
+ # Legacy cache variable for the transformers library (deprecated in recent releases; HF_HOME is preferred)
721
+
722
+ # ============ Python Configuration ============
723
+ PYTHONDONTWRITEBYTECODE=1
724
+ # Don't create __pycache__ directories
725
+
726
+ PYTHONUNBUFFERED=1
727
+ # Stream logs immediately (useful in Docker)
728
+
729
+ # ============ Logging ============
730
+ LOG_LEVEL=INFO
731
+ # Log level: DEBUG, INFO, WARNING, ERROR, CRITICAL
732
+ ```
733
+
734
+ ### Variable Precedence
735
+
736
+ 1. Environment variables (highest priority)
737
+ 2. `.env` file
738
+ 3. Hardcoded defaults in code (lowest priority)
739
+
740
+ Example override:
741
+ ```bash
742
+ export HF_TOKEN="hf_custom_token" && python inference.py
743
+ # Uses custom token instead of .env value
744
+ ```
745
+
746
+ ---
747
+
748
+ ## Running Tests
749
+
750
+ The project includes a comprehensive test suite using pytest.
751
+
752
+ ### Setup
753
+
754
+ ```bash
755
+ pip install pytest pytest-cov
756
+ ```
757
+
758
+ ### Run All Tests
759
+
760
+ ```bash
761
+ pytest test/test.py -v
762
+ ```
763
+
764
+ ### Run Specific Test Class
765
+
766
+ ```bash
767
+ pytest test/test.py::TestTasks -v
768
+ ```
769
+
770
+ ### Run with Coverage Report
771
+
772
+ ```bash
773
+ pytest test/test.py --cov=server --cov-report=html
774
+ # Writes the HTML report to htmlcov/; open htmlcov/index.html in a browser to view coverage
775
+ ```
776
+
777
+ ### Test Categories
778
+
779
+ | Test | Coverage | Status |
780
+ |---|---|---|
781
+ | Task loading | All 3 tasks initialize correctly | ✓ |
782
+ | API endpoints | /reset, /step, /state, /close, /tasks, /health | ✓ |
783
+ | Reward grading | text_spam, content_moderation, deepfake_detection | ✓ |
784
+ | Input validation | Action schema validation, label validation | ✓ |
785
+ | Edge cases | Empty labels, out-of-range confidence, etc. | ✓ |
786
+
787
+ ---
788
+
789
+ ## Troubleshooting
790
+
791
+ ### Installation Issues
792
+
793
+ **Problem:** `ImportError: No module named 'openai'`
794
+ ```bash
795
+ Solution: pip install "openai>=1.40.0"
796
+ ```
797
+
798
+ **Problem:** `ImportError: No module named 'torch'`
799
+ ```bash
800
+ Solution: pip install torch torchvision
801
+ # For GPU: pip install torch torchvision -f https://download.pytorch.org/whl/cu121/torch_stable.html
802
+ ```
803
+
804
+ **Problem:** `FileNotFoundError: requirements.txt`
805
+ ```bash
806
+ Solution: Ensure you're in the project root: cd Content-Moderation-env/
807
+ # Then: pip install -r server/requirements.txt
808
+ ```
809
+
810
+ ### Docker Issues
811
+
812
+ **Problem:** `Segmentation fault (core dumped)` during build
813
+ ```
814
+ Solution: Allocate more memory to Docker build:
815
+ docker build --memory=8g -f server/Dockerfile -t content-moderation-env .
816
+ ```
817
+
818
+ **Problem:** `failed to solve: failed to compute cache key`
819
+ ```
820
+ Solution: Ensure requirements.txt is in server/ directory:
821
+ # Current: server/requirements.txt (correct)
822
+ # Wrong: ./requirements.txt
823
+ ```
824
+
825
+ **Problem:** Port 7860 already in use
826
+ ```bash
827
+ Solution: Use different port:
828
+ docker run -p 8000:7860 content-moderation-env
829
+ # Now access at http://localhost:8000
830
+ ```
831
+
832
+ ### Runtime Issues
833
+
834
+ **Problem:** `Connection refused: localhost:7860`
835
+ ```bash
836
+ Solution: Ensure server is running:
837
+ uvicorn server.main:app --host 0.0.0.0 --port 7860
838
+
839
+ In Docker, use: docker logs <container_id>
840
+ ```
841
+
842
+ **Problem:** `Client.__init__() got an unexpected keyword argument 'proxies'`
843
+ ```bash
844
+ Solution: Update OpenAI client:
845
+ pip install --upgrade openai
846
+ ```
847
+
848
+ **Problem:** HuggingFace models downloading very slowly
849
+ ```bash
850
+ Solution: Check internet connection and verify HF_TOKEN:
851
+ export HF_TOKEN="hf_your_token_here"
852
+ # Or download models ahead of time
853
+ python -c "from transformers import pipeline; pipeline('image-classification', model='dima806/deepfake_vs_real_image_detection')"
854
+ ```
855
+
856
+ ### API Issues
857
+
858
+ **Problem:** Invalid request to `/step` without `/reset`
859
+ ```json
860
+ Error: "Environment not initialized. Call /reset first."
861
+ Solution: Always call POST /reset before any /step requests
862
+ ```
863
+
864
+ **Problem:** Invalid label in action
865
+ ```json
866
+ Error: {"detail": "Invalid label: 'unknown_label'"}
867
+ Solution: Use only valid labels from the specification
868
+ ```
869
+
870
+ **Problem:** Confidence out of range
871
+ ```
872
+ Solution: Ensure confidence is between 0.0 and 1.0
873
+ ```
874
+
875
+ ---
876
+
877
+ ## Citation
878
+
879
+ If you use this environment in your research, please cite:
880
+
881
+ ```bibtex
882
+ @software{content_moderation_openenv_2025,
883
+ title={Content Moderation OpenEnv: A Real-World AI Triage Environment},
884
+ author={Anidipta},
885
+ year={2025},
886
+ url={https://github.com/Anidipta/Content-Moderation-env},
887
+ note={OpenEnv Specification Compliant}
888
+ }
889
+ ```
890
+
891
+ ---
892
+
893
+ ## Acknowledgements
894
+
895
+ 🙏 Built for the **OpenEnv Hackathon 2025**.
896
+
897
+ **Special Thanks To:**
898
+ - OpenEnv community for the specification and framework
899
+ - HuggingFace for model hosting and inference APIs
900
+ - Meta for the Llama-3.1-8B-Instruct model
901
+ - Contributors and testers who improved the environment
902
+
903
+ **Dataset & Content Note:**
904
+ The email and content corpus is entirely **synthetic** and does not represent any real individuals, companies, organizations, or actual events. All examples are generated for demonstration and testing purposes only.
905
+
906
+ **License:** MIT License — See [LICENSE](LICENSE) file for details
907
+
908
+ **Questions?** Open an issue on GitHub or contact the maintainers.
909
+
910
+ ---
911
+
912
+ **Last Updated:** April 8, 2026 | **OpenEnv Spec Version:** 1.0
913
+ colorTo: green
914
+ sdk: docker
915
+ pinned: false
916
+ license: mit
917
+ ---
918
+
919
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
920
+
uv.lock CHANGED
@@ -1,236 +1,236 @@
1
- # This file is automatically @generated by uv and should not be manually edited.
2
- # Use `uv lock` to regenerate this file.
3
-
4
- version = 1
5
- requires-python = ">=3.11"
6
-
7
- [[package]]
8
- name = "accelerate"
9
- version = "0.30.0"
10
- source = { registry = "https://pypi.org/simple" }
11
- requires-python = ">=3.8"
12
- dependencies = [
13
- { name = "numpy", version = ">=1.17" },
14
- { name = "packaging", version = ">=20.0" },
15
- { name = "psutil" },
16
- { name = "pyyaml" },
17
- { name = "torch", version = ">=1.4.0" },
18
- ]
19
-
20
- [[package]]
21
- name = "anyio"
22
- version = "4.1.1"
23
- source = { registry = "https://pypi.org/simple" }
24
- requires-python = ">=3.8"
25
- dependencies = [
26
- { name = "sniffio", version = ">=1.1" },
27
- ]
28
-
29
- [[package]]
30
- name = "certifi"
31
- version = "2024.2.2"
32
- source = { registry = "https://pypi.org/simple" }
33
- requires-python = ">=3.6"
34
-
35
- [[package]]
36
- name = "charset-normalizer"
37
- version = "3.3.2"
38
- source = { registry = "https://pypi.org/simple" }
39
- requires-python = ">=3.7"
40
-
41
- [[package]]
42
- name = "click"
43
- version = "8.1.7"
44
- source = { registry = "https://pypi.org/simple" }
45
- requires-python = ">=3.7"
46
-
47
- [[package]]
48
- name = "fastapi"
49
- version = "0.111.0"
50
- source = { registry = "https://pypi.org/simple" }
51
- requires-python = ">=3.8"
52
- dependencies = [
53
- { name = "pydantic", version = "!=1.8.1,>=1.7.4" },
54
- { name = "starlette", version = "==0.37.0" },
55
- { name = "typing-extensions", version = ">=4.8.0" },
56
- ]
57
-
58
- [[package]]
59
- name = "filelock"
60
- version = "3.13.1"
61
- source = { registry = "https://pypi.org/simple" }
62
- requires-python = ">=3.8"
63
-
64
- [[package]]
65
- name = "huggingface-hub"
66
- version = "0.21.4"
67
- source = { registry = "https://pypi.org/simple" }
68
- requires-python = ">=3.8"
69
- dependencies = [
70
- { name = "filelock" },
71
- { name = "fsspec" },
72
- { name = "requests" },
73
- { name = "tqdm" },
74
- { name = "typing-extensions" },
75
- { name = "urllib3" },
76
- { name = "pyyaml", version = ">=5.1" },
77
- ]
78
-
79
- [[package]]
80
- name = "idna"
81
- version = "3.6"
82
- source = { registry = "https://pypi.org/simple" }
83
- requires-python = ">=3.5"
84
-
85
- [[package]]
86
- name = "numpy"
87
- version = "1.26.4"
88
- source = { registry = "https://pypi.org/simple" }
89
- requires-python = ">=3.9"
90
-
91
- [[package]]
92
- name = "openai"
93
- version = "1.42.0"
94
- source = { registry = "https://pypi.org/simple" }
95
- requires-python = ">=3.7"
96
- dependencies = [
97
- { name = "anyio", version = "<5,>=3.5.0" },
98
- { name = "distro", version = "<2,>=1.7.0" },
99
- { name = "httpx", version = "<1,>=0.23.1" },
100
- { name = "pydantic", version = "<3,>=1.9.0" },
101
- { name = "sniffio" },
102
- { name = "typing-extensions", version = "<5,>=4.11" },
103
- ]
104
-
105
- [[package]]
106
- name = "packaging"
107
- version = "24.0"
108
- source = { registry = "https://pypi.org/simple" }
109
- requires-python = ">=3.7"
110
-
111
- [[package]]
112
- name = "pillow"
113
- version = "10.3.0"
114
- source = { registry = "https://pypi.org/simple" }
115
- requires-python = ">=3.8"
116
-
117
- [[package]]
118
- name = "pydantic"
119
- version = "2.7.1"
120
- source = { registry = "https://pypi.org/simple" }
121
- requires-python = ">=3.8"
122
- dependencies = [
123
- { name = "pydantic-core", version = "==2.18.1" },
124
- { name = "typing-extensions", version = "!=4.7.0,>=4.6.1" },
125
- { name = "annotated-types", version = ">=0.4.0" },
126
- ]
127
-
128
- [[package]]
129
- name = "pydantic-core"
130
- version = "2.18.1"
131
- source = { registry = "https://pypi.org/simple" }
132
- requires-python = ">=3.8"
133
-
134
- [[package]]
135
- name = "python-dotenv"
136
- version = "1.0.0"
137
- source = { registry = "https://pypi.org/simple" }
138
- requires-python = ">=3.5"
139
-
140
- [[package]]
141
- name = "pyyaml"
142
- version = "6.0.1"
143
- source = { registry = "https://pypi.org/simple" }
144
- requires-python = ">=3.6"
145
-
146
- [[package]]
147
- name = "requests"
148
- version = "2.31.0"
149
- source = { registry = "https://pypi.org/simple" }
150
- requires-python = ">=3.7"
151
- dependencies = [
152
- { name = "charset-normalizer", version = ">=2,<4" },
153
- { name = "idna", version = ">=2.5,<4" },
154
- { name = "urllib3", version = ">=1.21.1,<3" },
155
- { name = "certifi", version = ">=2017.4.17" },
156
- ]
157
-
158
- [[package]]
159
- name = "sniffio"
160
- version = "1.3.1"
161
- source = { registry = "https://pypi.org/simple" }
162
- requires-python = ">=3.7"
163
-
164
- [[package]]
165
- name = "starlette"
166
- version = "0.37.0"
167
- source = { registry = "https://pypi.org/simple" }
168
- requires-python = ">=3.8"
169
- dependencies = [
170
- { name = "anyio", version = "<5,>=3.4.0" },
171
- ]
172
-
173
- [[package]]
174
- name = "torch"
175
- version = "2.3.0"
176
- source = { registry = "https://pypi.org/simple" }
177
- requires-python = ">=3.8"
178
-
179
- [[package]]
180
- name = "torchvision"
181
- version = "0.18.0"
182
- source = { registry = "https://pypi.org/simple" }
183
- requires-python = ">=3.8"
184
- dependencies = [
185
- { name = "numpy" },
186
- { name = "pillow", version = "!=8.3.0,>=5.3.0" },
187
- { name = "torch", version = "==2.3.0" },
188
- { name = "requests" },
189
- ]
190
-
191
- [[package]]
192
- name = "tqdm"
193
- version = "4.66.2"
194
- source = { registry = "https://pypi.org/simple" }
195
- requires-python = ">=3.7"
196
-
197
- [[package]]
198
- name = "transformers"
199
- version = "4.41.2"
200
- source = { registry = "https://pypi.org/simple" }
201
- requires-python = ">=3.8"
202
- dependencies = [
203
- { name = "filelock" },
204
- { name = "huggingface-hub", version = ">=0.21.1" },
205
- { name = "numpy", version = ">=1.17" },
206
- { name = "packaging", version = ">=20.0" },
207
- { name = "pyyaml", version = ">=5.1" },
208
- { name = "regex", version = "!=2019.12.17" },
209
- { name = "requests" },
210
- { name = "safetensors", version = ">=0.4.1" },
211
- { name = "tokenizers", version = ">=0.14,<0.15" },
212
- { name = "torch", version = ">=1.9" },
213
- { name = "tqdm", version = ">=4.27" },
214
- ]
215
-
216
- [[package]]
217
- name = "typing-extensions"
218
- version = "4.10.0"
219
- source = { registry = "https://pypi.org/simple" }
220
- requires-python = ">=3.8"
221
-
222
- [[package]]
223
- name = "urllib3"
224
- version = "2.1.0"
225
- source = { registry = "https://pypi.org/simple" }
226
- requires-python = ">=3.8"
227
-
228
- [[package]]
229
- name = "uvicorn"
230
- version = "0.29.0"
231
- source = { registry = "https://pypi.org/simple" }
232
- requires-python = ">=3.8"
233
- dependencies = [
234
- { name = "click", version = ">=7.0" },
235
- { name = "h11", version = ">=0.8" },
236
- ]
 
1
+ # This file is automatically @generated by uv and should not be manually edited.
2
+ # Use `uv lock` to regenerate this file.
3
+
4
+ version = 1
5
+ requires-python = ">=3.11"
6
+
7
+ [[package]]
8
+ name = "accelerate"
9
+ version = "0.30.0"
10
+ source = { registry = "https://pypi.org/simple" }
11
+ requires-python = ">=3.8"
12
+ dependencies = [
13
+ { name = "numpy", version = ">=1.17" },
14
+ { name = "packaging", version = ">=20.0" },
15
+ { name = "psutil" },
16
+ { name = "pyyaml" },
17
+ { name = "torch", version = ">=1.4.0" },
18
+ ]
19
+
20
+ [[package]]
21
+ name = "anyio"
22
+ version = "4.1.1"
23
+ source = { registry = "https://pypi.org/simple" }
24
+ requires-python = ">=3.8"
25
+ dependencies = [
26
+ { name = "sniffio", version = ">=1.1" },
27
+ ]
28
+
29
+ [[package]]
30
+ name = "certifi"
31
+ version = "2024.2.2"
32
+ source = { registry = "https://pypi.org/simple" }
33
+ requires-python = ">=3.6"
34
+
35
+ [[package]]
36
+ name = "charset-normalizer"
37
+ version = "3.3.2"
38
+ source = { registry = "https://pypi.org/simple" }
39
+ requires-python = ">=3.7"
40
+
41
+ [[package]]
42
+ name = "click"
43
+ version = "8.1.7"
44
+ source = { registry = "https://pypi.org/simple" }
45
+ requires-python = ">=3.7"
46
+
47
+ [[package]]
48
+ name = "fastapi"
49
+ version = "0.111.0"
50
+ source = { registry = "https://pypi.org/simple" }
51
+ requires-python = ">=3.8"
52
+ dependencies = [
53
+ { name = "pydantic", version = "!=1.8.1,>=1.7.4" },
54
+ { name = "starlette", version = "==0.37.0" },
55
+ { name = "typing-extensions", version = ">=4.8.0" },
56
+ ]
57
+
58
+ [[package]]
59
+ name = "filelock"
60
+ version = "3.13.1"
61
+ source = { registry = "https://pypi.org/simple" }
62
+ requires-python = ">=3.8"
63
+
64
+ [[package]]
65
+ name = "huggingface-hub"
66
+ version = "0.21.4"
67
+ source = { registry = "https://pypi.org/simple" }
68
+ requires-python = ">=3.8"
69
+ dependencies = [
70
+ { name = "filelock" },
71
+ { name = "fsspec" },
72
+ { name = "requests" },
73
+ { name = "tqdm" },
74
+ { name = "typing-extensions" },
75
+ { name = "urllib3" },
76
+ { name = "pyyaml", version = ">=5.1" },
77
+ ]
78
+
79
+ [[package]]
80
+ name = "idna"
81
+ version = "3.6"
82
+ source = { registry = "https://pypi.org/simple" }
83
+ requires-python = ">=3.5"
84
+
85
+ [[package]]
86
+ name = "numpy"
87
+ version = "1.26.4"
88
+ source = { registry = "https://pypi.org/simple" }
89
+ requires-python = ">=3.9"
90
+
91
+ [[package]]
92
+ name = "openai"
93
+ version = "1.42.0"
94
+ source = { registry = "https://pypi.org/simple" }
95
+ requires-python = ">=3.7"
96
+ dependencies = [
97
+ { name = "anyio", version = "<5,>=3.5.0" },
98
+ { name = "distro", version = "<2,>=1.7.0" },
99
+ { name = "httpx", version = "<1,>=0.23.1" },
100
+ { name = "pydantic", version = "<3,>=1.9.0" },
101
+ { name = "sniffio" },
102
+ { name = "typing-extensions", version = "<5,>=4.11" },
103
+ ]
104
+
105
+ [[package]]
106
+ name = "packaging"
107
+ version = "24.0"
108
+ source = { registry = "https://pypi.org/simple" }
109
+ requires-python = ">=3.7"
110
+
111
+ [[package]]
112
+ name = "pillow"
113
+ version = "10.3.0"
114
+ source = { registry = "https://pypi.org/simple" }
115
+ requires-python = ">=3.8"
116
+
117
+ [[package]]
118
+ name = "pydantic"
119
+ version = "2.7.1"
120
+ source = { registry = "https://pypi.org/simple" }
121
+ requires-python = ">=3.8"
122
+ dependencies = [
123
+ { name = "pydantic-core", version = "==2.18.1" },
124
+ { name = "typing-extensions", version = "!=4.7.0,>=4.6.1" },
125
+ { name = "annotated-types", version = ">=0.4.0" },
126
+ ]
127
+
128
+ [[package]]
129
+ name = "pydantic-core"
130
+ version = "2.18.1"
131
+ source = { registry = "https://pypi.org/simple" }
132
+ requires-python = ">=3.8"
133
+
134
+ [[package]]
135
+ name = "python-dotenv"
136
+ version = "1.0.0"
137
+ source = { registry = "https://pypi.org/simple" }
138
+ requires-python = ">=3.5"
139
+
140
+ [[package]]
141
+ name = "pyyaml"
142
+ version = "6.0.1"
143
+ source = { registry = "https://pypi.org/simple" }
144
+ requires-python = ">=3.6"
145
+
146
+ [[package]]
147
+ name = "requests"
148
+ version = "2.31.0"
149
+ source = { registry = "https://pypi.org/simple" }
150
+ requires-python = ">=3.7"
151
+ dependencies = [
152
+ { name = "charset-normalizer", version = ">=2,<4" },
153
+ { name = "idna", version = ">=2.5,<4" },
154
+ { name = "urllib3", version = ">=1.21.1,<3" },
155
+ { name = "certifi", version = ">=2017.4.17" },
156
+ ]
157
+
158
+ [[package]]
159
+ name = "sniffio"
160
+ version = "1.3.1"
161
+ source = { registry = "https://pypi.org/simple" }
162
+ requires-python = ">=3.7"
163
+
164
+ [[package]]
165
+ name = "starlette"
166
+ version = "0.37.0"
167
+ source = { registry = "https://pypi.org/simple" }
168
+ requires-python = ">=3.8"
169
+ dependencies = [
170
+ { name = "anyio", version = "<5,>=3.4.0" },
171
+ ]
172
+
173
+ [[package]]
174
+ name = "torch"
175
+ version = "2.3.0"
176
+ source = { registry = "https://pypi.org/simple" }
177
+ requires-python = ">=3.8"
178
+
179
+ [[package]]
180
+ name = "torchvision"
181
+ version = "0.18.0"
182
+ source = { registry = "https://pypi.org/simple" }
183
+ requires-python = ">=3.8"
184
+ dependencies = [
185
+ { name = "numpy" },
186
+ { name = "pillow", version = "!=8.3.0,>=5.3.0" },
187
+ { name = "torch", version = "==2.3.0" },
188
+ { name = "requests" },
189
+ ]
190
+
191
+ [[package]]
192
+ name = "tqdm"
193
+ version = "4.66.2"
194
+ source = { registry = "https://pypi.org/simple" }
195
+ requires-python = ">=3.7"
196
+
197
+ [[package]]
198
+ name = "transformers"
199
+ version = "4.41.2"
200
+ source = { registry = "https://pypi.org/simple" }
201
+ requires-python = ">=3.8"
202
+ dependencies = [
203
+ { name = "filelock" },
204
+ { name = "huggingface-hub", version = ">=0.21.1" },
205
+ { name = "numpy", version = ">=1.17" },
206
+ { name = "packaging", version = ">=20.0" },
207
+ { name = "pyyaml", version = ">=5.1" },
208
+ { name = "regex", version = "!=2019.12.17" },
209
+ { name = "requests" },
210
+ { name = "safetensors", version = ">=0.4.1" },
211
+ { name = "tokenizers", version = ">=0.14,<0.15" },
212
+ { name = "torch", version = ">=1.9" },
213
+ { name = "tqdm", version = ">=4.27" },
214
+ ]
215
+
216
+ [[package]]
217
+ name = "typing-extensions"
218
+ version = "4.10.0"
219
+ source = { registry = "https://pypi.org/simple" }
220
+ requires-python = ">=3.8"
221
+
222
+ [[package]]
223
+ name = "urllib3"
224
+ version = "2.1.0"
225
+ source = { registry = "https://pypi.org/simple" }
226
+ requires-python = ">=3.8"
227
+
228
+ [[package]]
229
+ name = "uvicorn"
230
+ version = "0.29.0"
231
+ source = { registry = "https://pypi.org/simple" }
232
+ requires-python = ">=3.8"
233
+ dependencies = [
234
+ { name = "click", version = ">=7.0" },
235
+ { name = "h11", version = ">=0.8" },
236
+ ]