Mirae Kang committed on
Commit
e317d56
·
1 Parent(s): f0b240d

feat: update UI using VITE+React without streamlit, #22

Browse files
.dockerignore CHANGED
@@ -20,6 +20,7 @@ tests
20
  !README.md
21
  .env
22
  .env.*
 
23
  frontend/dist
24
  models/checkpoints
25
  models/**/checkpoints
@@ -28,4 +29,3 @@ models/roberta_hate_results
28
  models/distilbert_results
29
  models/best_distilbert
30
  models/nb08_*
31
- models/*_frozen
 
20
  !README.md
21
  .env
22
  .env.*
23
+ !.env.example
24
  frontend/dist
25
  models/checkpoints
26
  models/**/checkpoints
 
29
  models/distilbert_results
30
  models/best_distilbert
31
  models/nb08_*
 
.env.example CHANGED
@@ -1,15 +1,18 @@
1
- # Copy to .env for local development: cp .env.example .env
2
- # Docker Compose reads these via environment (optional).
3
 
4
- # YouTube Data API v3 (optional — /predict-video and scraping)
5
  # https://console.cloud.google.com/apis/credentials
6
  YOUTUBE_API_KEY=
7
 
8
- # Active model (must match a key in ModelService.AVAILABLE_MODELS)
9
  MODEL_NAME=LR + TF-IDF (local)
10
 
11
  # development | production
12
- ENV=production
13
 
14
- # Used by Streamlit when calling the API from another host (Docker sets this automatically)
15
- API_URL=http://localhost:8000
 
 
 
 
1
+ # Copy to .env: cp .env.example .env
2
+ # Docker Compose reads YOUTUBE_API_KEY from your environment.
3
 
4
+ # YouTube Data API v3 — required for real suggested videos and /predict-video
5
  # https://console.cloud.google.com/apis/credentials
6
  YOUTUBE_API_KEY=
7
 
8
+ # Active model (key from configs/model_catalog.yaml)
9
  MODEL_NAME=LR + TF-IDF (local)
10
 
11
  # development | production
12
+ ENV=development
13
 
14
+ # Optional: API base URL for frontend dev when the API runs on another host (leave empty to use the Vite proxy)
15
+ VITE_API_BASE_URL=
16
+
17
+ # Docker only: build with Hugging Face models (see README)
18
+ # INSTALL_HF=1 docker compose build --build-arg INSTALL_HF=1
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.12
Dockerfile CHANGED
@@ -1,30 +1,40 @@
1
- # youtube_hate_detector — shared image for FastAPI + Streamlit services
 
 
 
 
 
 
 
2
  FROM python:3.12-slim-bookworm
3
 
 
 
4
  ENV PYTHONDONTWRITEBYTECODE=1 \
5
  PYTHONUNBUFFERED=1 \
6
  PYTHONPATH=/app \
7
  NLTK_DATA=/app/nltk_data \
8
  MODEL_NAME="LR + TF-IDF (local)" \
9
- ENV=production
 
10
 
11
  WORKDIR /app
12
 
13
- # System deps for spaCy / sklearn wheels
14
  RUN apt-get update \
15
- && apt-get install -y --no-install-recommends build-essential curl \
16
  && rm -rf /var/lib/apt/lists/*
17
 
18
- COPY requirements.txt .
19
 
20
- # CPU-only PyTorch keeps the image smaller; sufficient for the default local LR model
21
- RUN pip install --no-cache-dir --upgrade pip \
22
- && pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu \
23
- && pip install --no-cache-dir -r requirements.txt \
24
- && python -m spacy download en_core_web_sm
 
25
 
26
- # NLTK corpora used by TextPreprocessor
27
- RUN python - <<'PY'
28
  import nltk
29
  for pkg in ("stopwords", "punkt"):
30
  nltk.download(pkg, download_dir="/app/nltk_data")
@@ -33,8 +43,13 @@ PY
33
  COPY configs/ configs/
34
  COPY src/ src/
35
  COPY models/final_model.joblib models/final_model.joblib
 
 
 
 
 
36
 
37
- # Default env template (overridden by docker-compose)
38
- COPY env.example .env.example
39
 
40
- EXPOSE 8000 8501
 
1
+ # youtube_hate_detector — multi-stage: React + FastAPI (uv)
2
+ FROM node:22-bookworm-slim AS frontend-build
3
+ WORKDIR /app/frontend
4
+ COPY frontend/package.json frontend/package-lock.json* ./
5
+ RUN npm ci 2>/dev/null || npm install
6
+ COPY frontend/ ./
7
+ RUN npm run build
8
+
9
  FROM python:3.12-slim-bookworm
10
 
11
+ ARG INSTALL_HF=0
12
+
13
  ENV PYTHONDONTWRITEBYTECODE=1 \
14
  PYTHONUNBUFFERED=1 \
15
  PYTHONPATH=/app \
16
  NLTK_DATA=/app/nltk_data \
17
  MODEL_NAME="LR + TF-IDF (local)" \
18
+ ENV=production \
19
+ INSTALL_HF=${INSTALL_HF}
20
 
21
  WORKDIR /app
22
 
 
23
  RUN apt-get update \
24
+ && apt-get install -y --no-install-recommends curl \
25
  && rm -rf /var/lib/apt/lists/*
26
 
27
+ COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
28
 
29
+ COPY pyproject.toml uv.lock* README.md ./
30
+ RUN if [ "$INSTALL_HF" = "1" ]; then \
31
+ uv sync --frozen --no-dev --extra hf 2>/dev/null || uv sync --no-dev --extra hf; \
32
+ else \
33
+ uv sync --frozen --no-dev 2>/dev/null || uv sync --no-dev; \
34
+ fi
35
 
36
+ RUN uv run python -m spacy download en_core_web_sm \
37
+ && uv run python - <<'PY'
38
  import nltk
39
  for pkg in ("stopwords", "punkt"):
40
  nltk.download(pkg, download_dir="/app/nltk_data")
 
43
  COPY configs/ configs/
44
  COPY src/ src/
45
  COPY models/final_model.joblib models/final_model.joblib
46
+ COPY models/finetuned_hf/ models/finetuned_hf/
47
+ COPY --from=frontend-build /app/frontend/dist frontend/dist
48
+ COPY .env.example .env.example
49
+
50
+ EXPOSE 8000
51
 
52
+ HEALTHCHECK --interval=10s --timeout=5s --retries=12 --start-period=60s \
53
+ CMD curl -f http://localhost:8000/health || exit 1
54
 
55
+ CMD ["uv", "run", "uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000"]
README.md CHANGED
@@ -1,247 +1,143 @@
1
- # YouTube Toxic Comment Detector (SignalMod)
2
 
3
  [![Python](https://img.shields.io/badge/python-3.12-blue.svg)](https://www.python.org/downloads/)
4
  [![FastAPI](https://img.shields.io/badge/FastAPI-0.136-009688.svg)](https://fastapi.tiangolo.com/)
5
- [![Streamlit](https://img.shields.io/badge/Streamlit-UI-FF4B4B.svg)](https://streamlit.io/)
6
  [![Docker](https://img.shields.io/badge/docker-compose-2496ED.svg)](https://docs.docker.com/compose/)
 
7
  **Español:** [README.es.md](README.es.md)
8
 
9
- Automated **Safe vs Toxic** classification for YouTube-style comments. The production stack is **FastAPI** (REST inference) plus **Streamlit** (watch-page style UI). The default model is **Logistic Regression + TF-IDF** (`models/final_model.joblib`).
10
 
11
  ---
12
 
13
- ## Project description
14
-
15
- | Item | Detail |
16
- |------|--------|
17
- | **Goal** | Help moderation teams flag toxic comments quickly |
18
- | **Dataset** | `data/raw/youtoxic_english_1000.csv` (~1k English comments) |
19
- | **Target** | `IsToxic` → **Safe (0)** / **Toxic (1)** |
20
- | **Primary metric** | Weighted F1 and ROC-AUC (imbalanced classes) |
21
- | **Overfitting check** | \|CV F1 − test F1\| &lt; 5 percentage points (project rubric) |
22
-
23
- ---
24
 
25
- ## Architecture
 
 
 
26
 
27
  ```
28
  youtube_hate_detector/
29
- ├── configs/ # YAML: pipeline, features, models, best_params
30
- ├── data/raw/ # Source CSV (not committed if gitignored)
31
  ├── models/ # final_model.joblib, experiments/
32
- ├── reports/ # summary.csv, plots, pipeline artifacts
33
  ├── src/
34
- │ ├── api/ # FastAPI — /predict, /predict-batch, …
35
- ── app/ # Streamlit UI (src/app/app.py)
36
- ├── data/ # load_raw_data, scraping helpers
37
- ├── evaluation/ # Evaluator — metrics, ROC, confusion matrix
38
- │ ├── features/ # TextPreprocessor, Vectorizer
39
- │ ├── models/ # LR, RF, XGBoost baselines
40
- │ ├── pipeline/ # run_pipeline.py — train end-to-end
41
- │ └── service/ # ModelService — shared inference layer
42
- ├── tests/
43
- ├── Dockerfile
44
  └── docker-compose.yml
45
  ```
46
 
47
- **Runtime flow**
48
-
49
- 1. **Training:** `load_raw_data` → `TextPreprocessor` → `build_model().fit()` → `Evaluator` → `reports/summary.csv`
50
- 2. **API:** `uvicorn` loads `ModelService` → `POST /predict`
51
- 3. **Streamlit:** `ModelService.predict()` in-process (same models as API catalog)
52
-
53
- See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for more detail.
54
-
55
  ---
56
 
57
- ## Installation
58
-
59
- **Requirements:** Python 3.12+, ~2 GB disk for dependencies (optional PyTorch if using Hugging Face models in the UI).
60
-
61
- ```bash
62
- git clone https://github.com/Bootcamp-IA-P6/Project_9_Equipo3.git
63
- cd Project_9_Equipo3 # or your local folder name
64
-
65
- python -m venv .venv
66
- source .venv/bin/activate # Windows: .venv\Scripts\activate
67
-
68
- pip install -r requirements.txt
69
- python -m spacy download en_core_web_sm
70
- ```
71
-
72
- **Data:** place `youtoxic_english_1000.csv` under `data/raw/` (path in `configs/pipeline.yaml`).
73
 
74
- **Environment:**
75
 
76
  ```bash
77
  cp .env.example .env
78
- # Optional: YOUTUBE_API_KEY for /predict-video
79
- # MODEL_NAME must match a key in ModelService (default: LR + TF-IDF (local))
80
- ```
81
-
82
- ---
83
-
84
- ## Training pipeline
85
-
86
- End-to-end training and evaluation:
87
-
88
- ```bash
89
- python -m src.pipeline.run_pipeline --model lr
90
- # Options: lr | rf | xgboost
91
  ```
92
 
93
- **Phases:** load data stratified split → spaCy/NLTK preprocessing → train → 5-fold CV test metrics → save `models/experiments/{model}/` → MLflow → update [`reports/summary.csv`](reports/summary.csv) and plots under `reports/pipeline/{model}/`.
94
-
95
- Config files:
96
-
97
- | File | Purpose |
98
- |------|---------|
99
- | `configs/pipeline.yaml` | Paths, `IsToxic`, test_size, CV folds |
100
- | `configs/features.yaml` | Preprocessing + TF-IDF |
101
- | `configs/models.yaml` | Classifier hyperparameters |
102
- | `configs/best_params.yaml` | Optuna winner (LR) |
103
-
104
- Details: [docs/PIPELINE.md](docs/PIPELINE.md)
105
-
106
- ---
107
 
108
- ## Run with Docker
 
 
 
109
 
110
- ```bash
111
- docker compose up --build
112
- ```
113
 
114
- | Service | URL |
115
- |---------|-----|
116
- | Streamlit | http://localhost:8501 |
117
- | FastAPI | http://localhost:8000 |
118
- | Swagger | http://localhost:8000/docs |
 
 
 
119
 
120
- ```bash
121
- export YOUTUBE_API_KEY=your_key # optional
122
- docker compose down # stop
123
- ```
124
 
125
- Containers: `youtube_hate_detector-api`, `youtube_hate_detector-streamlit`.
126
 
127
  ---
128
 
129
- ## Local run (without Docker)
130
 
131
  ```bash
132
  # Terminal 1 — API
133
- uvicorn src.api.main:app --reload --host 0.0.0.0 --port 8000
134
 
135
- # Terminal 2 — Streamlit
136
- streamlit run src/app/app.py --server.port 8501
137
  ```
138
 
139
- ---
140
-
141
- ## API examples
142
 
143
- Full reference: [docs/API.md](docs/API.md)
144
 
145
- **Health check**
146
 
147
  ```bash
148
- curl -s http://localhost:8000/ | python -m json.tool
149
- ```
150
 
151
- **Single prediction**
152
-
153
- ```bash
154
- curl -s -X POST http://localhost:8000/predict \
155
- -H "Content-Type: application/json" \
156
- -d '{"text": "This video is amazing, thanks for sharing!", "threshold": 0.5}'
157
  ```
158
 
159
- Example response:
160
-
161
- ```json
162
- {
163
- "text": "This video is amazing, thanks for sharing!",
164
- "is_toxic": false,
165
- "probability": 0.08,
166
- "labels": [],
167
- "model_used": "LR + TF-IDF (local)",
168
- "latency_ms": 12.5
169
- }
170
- ```
171
 
172
- **Batch**
173
 
174
- ```bash
175
- curl -s -X POST http://localhost:8000/predict-batch \
176
- -H "Content-Type: application/json" \
177
- -d '{"texts": ["Great content!", "You are an idiot"], "threshold": 0.5}'
178
- ```
179
 
180
- **List / switch models**
181
 
182
  ```bash
183
- curl -s http://localhost:8000/models
184
- curl -s -X PUT http://localhost:8000/model/DistilBERT%20Toxicity
185
  ```
186
 
187
- ---
188
-
189
- ## Results
190
-
191
- Best **sklearn** model on the project test split (from `configs/best_params.yaml`):
192
-
193
- | Metric | Value |
194
- |--------|-------|
195
- | F1 (weighted, test) | **0.7579** |
196
- | ROC-AUC | **0.81** |
197
- | False positives | 18 |
198
- | False negatives | 30 |
199
- | CV–test gap | **4.76 pp** (within 5 pp target) |
200
- | Train–test gap | 14.07 pp |
201
-
202
- Plots and EDA: `reports/v2/`. Per-run artifacts: `reports/pipeline/{lr,rf,xgboost}/`.
203
 
204
  ---
205
 
206
- ## Technical results report
207
-
208
- Full write-up (decisions, metrics, error analysis, limitations, roadmap):
209
-
210
- - **English:** [reports/final_report.md](reports/final_report.md)
211
- - **Español:** [reports/final_report.es.md](reports/final_report.es.md)
212
 
213
- ## Model comparison
214
-
215
- Canonical table: [`reports/summary.csv`](reports/summary.csv)
216
- Human-readable: [docs/RESULTS.md](docs/RESULTS.md)
217
-
218
- | Model | Family | F1 (test) | ROC-AUC | FP | FN | Production default |
219
- |-------|--------|-----------|---------|----|----|--------------------|
220
- | LR + TF-IDF (tuned) | sklearn | 0.7579 | 0.81 | 18 | 30 | Yes |
221
- | LR + TF-IDF (local) | sklearn | 0.7579 | 0.81 | 18 | 30 | Yes (`final_model.joblib`) |
222
- | RF / XGBoost | sklearn | — | — | — | — | Run pipeline to fill |
223
- | DistilBERT / toxic-bert / RoBERTa | Hugging Face | — | — | — | — | Optional via API/UI |
224
-
225
- Re-run `python -m src.pipeline.run_pipeline --model rf` to append RF metrics to `summary.csv`.
226
 
227
  ---
228
 
229
  ## Tests
230
 
231
  ```bash
232
- pytest tests/ -v
 
233
  ```
234
 
235
- Covers preprocessor, vectorizer, model binary output, and `/predict` response shape.
236
-
237
  ---
238
 
239
- ## Documentation index
 
 
 
 
 
 
240
 
241
- | English | Español |
242
- |---------|---------|
243
- | [docs/API.md](docs/API.md) | [docs/API.es.md](docs/API.es.md) |
244
- | [docs/PIPELINE.md](docs/PIPELINE.md) | [docs/PIPELINE.es.md](docs/PIPELINE.es.md) |
245
- | [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) | [docs/ARCHITECTURE.es.md](docs/ARCHITECTURE.es.md) |
246
- | [docs/RESULTS.md](docs/RESULTS.md) | [docs/RESULTS.es.md](docs/RESULTS.es.md) |
247
- | [reports/final_report.md](reports/final_report.md) | [reports/final_report.es.md](reports/final_report.es.md) |
 
1
+ # YouTube Toxic Comment Detector (youtube_hate_detector)
2
 
3
  [![Python](https://img.shields.io/badge/python-3.12-blue.svg)](https://www.python.org/downloads/)
4
  [![FastAPI](https://img.shields.io/badge/FastAPI-0.136-009688.svg)](https://fastapi.tiangolo.com/)
5
+ [![React](https://img.shields.io/badge/React-UI-61DAFB.svg)](https://react.dev/)
6
  [![Docker](https://img.shields.io/badge/docker-compose-2496ED.svg)](https://docs.docker.com/compose/)
7
+
8
  **Español:** [README.es.md](README.es.md)
9
 
10
+ Automated **Safe vs Toxic** classification for YouTube-style comments. Production stack: **FastAPI** (REST) + **React** (YouTube Watch UI). Default model: **Logistic Regression + TF-IDF** (`models/final_model.joblib`).
11
 
12
  ---
13
 
14
+ ## Clone and layout
 
 
 
 
 
 
 
 
 
 
15
 
16
+ ```bash
17
+ git clone <your-repo-url>
18
+ cd youtube_hate_detector # use this folder name locally (team convention)
19
+ ```
20
 
21
  ```
22
  youtube_hate_detector/
23
+ ├── configs/ # pipeline, features, model_catalog, suggested_videos
24
+ ├── frontend/ # React SPA (Vite)
25
  ├── models/ # final_model.joblib, experiments/
 
26
  ├── src/
27
+ │ ├── api/ # FastAPI routes
28
+ │ └── service/ # ModelService (inference)
29
+ ├── pyproject.toml # uv dependencies
30
+ ├── uv.lock
 
 
 
 
 
 
31
  └── docker-compose.yml
32
  ```
33
 
 
 
 
 
 
 
 
 
34
  ---
35
 
36
+ ## How to use FastAPI
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
+ The API loads `ModelService` once at startup and serves JSON only (the React app is the UI).
39
 
40
  ```bash
41
  cp .env.example .env
42
+ uv sync # baseline (LR model only)
43
+ uv sync --extra hf # required for DistilBERT / toxic-bert / RoBERTa / fine-tuned HF models
44
+ uv run uvicorn src.api.main:app --reload --port 8000
 
 
 
 
 
 
 
 
 
 
45
  ```
46
 
47
+ Verify HF deps: `uv run python -c "import transformers; print('ok')"`.
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
+ | Resource | URL |
50
+ |----------|-----|
51
+ | Swagger | http://localhost:8000/docs |
52
+ | Health | http://localhost:8000/health |
53
 
54
+ **Main endpoints**
 
 
55
 
56
+ | Method | Path | Description |
57
+ |--------|------|-------------|
58
+ | `POST` | `/predict` | Score one comment `{ "text", "threshold" }` |
59
+ | `POST` | `/predict-video` | Fetch YouTube comments + score `{ "url", "max_comments", "threshold" }` |
60
+ | `GET` | `/videos/suggested` | Metadata for right-rail videos (from `configs/suggested_videos.yaml`) |
61
+ | `GET` | `/models` | Available models |
62
+ | `GET` | `/models/status` | Per-model availability (HF deps, local weights) |
63
+ | `PUT` | `/model/{name}` | Switch active model (warmup-validated) |
64
 
65
+ Set `YOUTUBE_API_KEY` in `.env` for real comments and suggested-video thumbnails.
 
 
 
66
 
67
+ **Change models without UI changes:** edit [`configs/model_catalog.yaml`](configs/model_catalog.yaml), then restart the API or use Settings in the app.
68
 
69
  ---
70
 
71
+ ## React UI (local dev)
72
 
73
  ```bash
74
  # Terminal 1 — API
75
+ uv run uvicorn src.api.main:app --reload --port 8000
76
 
77
+ # Terminal 2 — frontend (proxies API)
78
+ cd frontend && npm install && npm run dev
79
  ```
80
 
81
+ Open http://localhost:5173 — Watch page with staged demo player, real suggested videos (click to load comments), English UI.
 
 
82
 
83
+ ---
84
 
85
+ ## Docker
86
 
87
  ```bash
88
+ export YOUTUBE_API_KEY=your_key # optional but recommended
89
+ docker compose up --build # LR model only (default)
90
 
91
+ # Hugging Face models (transformers + torch; larger image):
92
+ INSTALL_HF=1 docker compose build --build-arg INSTALL_HF=1
93
+ INSTALL_HF=1 docker compose up
 
 
 
94
  ```
95
 
96
+ | URL | Service |
97
+ |-----|---------|
98
+ | http://localhost:8000 | API + built React SPA |
99
+ | http://localhost:8000/docs | Swagger |
 
 
 
 
 
 
 
 
100
 
101
+ Container: `youtube_hate_detector-app`.
102
 
103
+ ---
 
 
 
 
104
 
105
+ ## Training (unchanged)
106
 
107
  ```bash
108
+ uv run python -m src.pipeline.run_pipeline --model lr
 
109
  ```
110
 
111
+ See [docs/PIPELINE.md](docs/PIPELINE.md).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
  ---
114
 
115
+ ## Configuration
 
 
 
 
 
116
 
117
+ | File | Purpose |
118
+ |------|---------|
119
+ | `.env` | Local settings and secrets (`YOUTUBE_API_KEY`, `MODEL_NAME`) |
120
+ | `configs/model_catalog.yaml` | Inference models for API/UI |
121
+ | `configs/suggested_videos.yaml` | YouTube IDs for the suggested rail |
122
+ | `configs/pipeline.yaml` | Training data paths |
 
 
 
 
 
 
 
123
 
124
  ---
125
 
126
  ## Tests
127
 
128
  ```bash
129
+ uv sync --extra dev --extra hf
130
+ uv run pytest
131
  ```
132
 
 
 
133
  ---
134
 
135
+ ## Briefing vs team stack
136
+
137
+ | Topic | Briefing | This repo |
138
+ |-------|----------|-----------|
139
+ | UI | Streamlit | **React** |
140
+ | API | FastAPI | **FastAPI** |
141
+ | Package manager | varies | **`uv`** |
142
 
143
+ Legacy Streamlit (`src/app/`) has been removed.
 
 
 
 
 
 
configs/model_catalog.yaml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "LR + TF-IDF (local)":
2
+ type: local
3
+ icon: "⚡"
4
+ description: "Project baseline. No GPU, instant inference."
5
+ speed: "< 50ms"
6
+ accuracy: "F1 0.76"
7
+ requires: "joblib only"
8
+
9
+ "DistilBERT Toxicity":
10
+ type: hf_remote
11
+ icon: "🤖"
12
+ model_id: martin-ha/toxic-comment-model
13
+ description: "DistilBERT fine-tuned on toxic comments (Hugging Face Hub)."
14
+ speed: "~200ms CPU"
15
+ accuracy: "F1 0.85"
16
+ requires: "uv sync --extra hf"
17
+
18
+ "toxic-bert (multilabel)":
19
+ type: hf_remote
20
+ icon: "🧠"
21
+ model_id: unitary/toxic-bert
22
+ description: "BERT multi-label (Jigsaw). Six toxicity categories (Hugging Face Hub)."
23
+ speed: "~400ms CPU"
24
+ accuracy: "F1 0.88"
25
+ requires: "uv sync --extra hf"
26
+
27
+ "RoBERTa Toxicity":
28
+ type: hf_remote
29
+ icon: "🔬"
30
+ model_id: s-nlp/roberta_toxicity_classifier
31
+ description: "RoBERTa fine-tuned for general toxicity (Hugging Face Hub)."
32
+ speed: "~350ms CPU"
33
+ accuracy: "F1 0.87"
34
+ requires: "uv sync --extra hf"
35
+
36
+ "Fine-tuned (local HF)":
37
+ type: hf_local
38
+ icon: "✨"
39
+ model_path: models/finetuned_hf
40
+ description: "Locally fine-tuned Hugging Face model (models/finetuned_hf)."
41
+ speed: "Hardware dependent"
42
+ accuracy: "TBD"
43
+ requires: "uv sync --extra hf"
configs/suggested_videos.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Suggested videos for the watch-page right rail (edit ids only).
2
+ # Prefer embed-friendly videos with comments enabled (avoid Vevo music IDs).
3
+ max_comments: 50
4
+
5
+ videos:
6
+ - id: jNQXAC9IVRw
7
+ note: Me at the zoo — first YouTube upload; comments enabled
8
+ - id: IEEhzQoKtQU
9
+ note: 3Blue1Brown — embed-friendly educational
10
+ - id: dQw4w9WgXcQ
11
+ note: Rick Astley — usually embeddable
12
+ - id: e-z0xWm0xK0
13
+ note: Kurzgesagt — educational, comments on
14
+ - id: aKydtOUFkeg
15
+ note: TED-style talk — embed-friendly
docker-compose.yml CHANGED
@@ -1,21 +1,18 @@
1
- # youtube_hate_detector — API + Streamlit UI
2
- # Start everything: docker compose up --build
3
- # Stop: docker compose down
4
 
5
  name: youtube_hate_detector
6
 
7
  services:
8
- api:
9
- build: .
 
 
 
 
10
  image: youtube_hate_detector:latest
11
- container_name: youtube_hate_detector-api
12
- command:
13
- - uvicorn
14
- - src.api.main:app
15
- - --host
16
- - "0.0.0.0"
17
- - --port
18
- - "8000"
19
  ports:
20
  - "8000:8000"
21
  environment:
@@ -24,34 +21,9 @@ services:
24
  YOUTUBE_API_KEY: ${YOUTUBE_API_KEY:-}
25
  NLTK_DATA: /app/nltk_data
26
  healthcheck:
27
- test: ["CMD", "curl", "-f", "http://localhost:8000/"]
28
  interval: 10s
29
  timeout: 5s
30
  retries: 12
31
- start_period: 40s
32
- restart: unless-stopped
33
-
34
- streamlit:
35
- # Reuses the image built by `api` — do not add `build:` here (parallel builds race on the same tag)
36
- image: youtube_hate_detector:latest
37
- container_name: youtube_hate_detector-streamlit
38
- command:
39
- - streamlit
40
- - run
41
- - src/app/app.py
42
- - --server.port=8501
43
- - --server.address=0.0.0.0
44
- - --server.headless=true
45
- - --browser.gatherUsageStats=false
46
- ports:
47
- - "8501:8501"
48
- environment:
49
- MODEL_NAME: "LR + TF-IDF (local)"
50
- ENV: production
51
- API_URL: http://api:8000
52
- YOUTUBE_API_KEY: ${YOUTUBE_API_KEY:-}
53
- NLTK_DATA: /app/nltk_data
54
- depends_on:
55
- api:
56
- condition: service_healthy
57
  restart: unless-stopped
 
1
+ # youtube_hate_detector — FastAPI + React (single service)
2
+ # Start: docker compose up --build
3
+ # Stop: docker compose down
4
 
5
  name: youtube_hate_detector
6
 
7
  services:
8
+ app:
9
+ build:
10
+ context: .
11
+ args:
12
+ # Set INSTALL_HF=1 for Hugging Face models (larger image, ~1–2 GB extra)
13
+ INSTALL_HF: ${INSTALL_HF:-0}
14
  image: youtube_hate_detector:latest
15
+ container_name: youtube_hate_detector-app
 
 
 
 
 
 
 
16
  ports:
17
  - "8000:8000"
18
  environment:
 
21
  YOUTUBE_API_KEY: ${YOUTUBE_API_KEY:-}
22
  NLTK_DATA: /app/nltk_data
23
  healthcheck:
24
+ test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
25
  interval: 10s
26
  timeout: 5s
27
  retries: 12
28
+ start_period: 60s
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  restart: unless-stopped
docs/ARCHITECTURE.md CHANGED
@@ -1,66 +1,46 @@
1
- # System architecture
2
 
3
- ## Components
4
 
5
  ```mermaid
6
- flowchart TB
7
- subgraph data [Data layer]
8
- CSV[data/raw/youtoxic_english_1000.csv]
9
- CFG[configs/*.yaml]
10
- end
11
-
12
- subgraph training [Training]
13
- PIPE[run_pipeline.py]
14
- PRE[TextPreprocessor]
15
- BL[build_model LR RF XGB]
16
- EV[Evaluator]
17
- CSV --> PIPE
18
- CFG --> PIPE
19
- PIPE --> PRE --> BL --> EV
20
- EV --> SUM[reports/summary.csv]
21
- BL --> JOB[models/experiments/]
22
- end
23
-
24
- subgraph inference [Inference]
25
- MS[ModelService]
26
- JOB2[models/final_model.joblib]
27
- JOB2 --> MS
28
- API[FastAPI src/api/main.py]
29
- UI[Streamlit src/app/app.py]
30
- MS --> API
31
- MS --> UI
32
- end
33
  ```
34
 
35
- ## Module map
 
 
 
36
 
37
- | Module | Responsibility |
38
- |--------|----------------|
39
- | `src/data/loader.py` | Load raw CSV, optional processed paths |
40
- | `src/features/text_preprocessor.py` | Clean and lemmatize text |
41
- | `src/features/vectorizer.py` | Standalone TF-IDF (notebooks); baselines embed TF-IDF in sklearn `Pipeline` |
42
- | `src/models/baseline.py` | `LRModel`, `RFModel`, `XGBModel`, `build_model()` |
43
- | `src/evaluation/evaluator.py` | Metrics, ROC, confusion matrix, error analysis, `summary.csv` |
44
- | `src/pipeline/run_pipeline.py` | Orchestrates training + evaluation |
45
- | `src/service/model_service.py` | Loads joblib or Hugging Face models; `predict(text)` |
46
- | `src/api/main.py` | REST endpoints, lifespan model load |
47
- | `src/app/app.py` | Streamlit UI; calls `ModelService` directly |
48
 
49
- ## Label strategy
50
-
51
- - **Binary default:** column `IsToxic` Safe `0`, Toxic `1`
52
- - User-facing strings: **Safe** / **Toxic** (not “hate” or “harmful” in the UI copy)
53
- - API returns `is_toxic` and `probability` (P(toxic))
54
 
55
  ## Docker
56
 
57
- [`docker-compose.yml`](../docker-compose.yml) runs two containers from one image:
58
-
59
- - `youtube_hate_detector-api` — uvicorn port 8000
60
- - `youtube_hate_detector-streamlit` — port 8501
61
 
62
- Both include `final_model.joblib`, configs, spaCy, and NLTK data baked into the image.
63
 
64
- ## Tests
65
-
66
- [`tests/`](../tests/) preprocessor, vectorizer, model binary outputs, `/predict` schema (mocked service).
 
 
 
 
 
 
 
 
 
 
 
1
+ # Architecture — youtube_hate_detector
2
 
3
+ ## Runtime (production)
4
 
5
  ```mermaid
6
+ flowchart LR
7
+ Browser[React SPA]
8
+ API[FastAPI :8000]
9
+ MS[ModelService]
10
+ YT[YouTube Data API]
11
+ Browser -->|HTTP JSON| API
12
+ API --> MS
13
+ API --> YT
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  ```
15
 
16
+ - **UI:** `frontend/` built to `frontend/dist`, served by FastAPI `StaticFiles` in production.
17
+ - **Inference:** Only `ModelService` in `src/service/` loads models.
18
+ - **Catalog:** `configs/model_catalog.yaml` — add models without React changes.
19
+ - **Suggested videos:** `configs/suggested_videos.yaml` — YouTube video IDs for the right rail.
20
 
21
+ ## Local development
 
 
 
 
 
 
 
 
 
 
22
 
23
+ | Process | Command | Port |
24
+ |---------|---------|------|
25
+ | API | `uv run uvicorn src.api.main:app --reload` | 8000 |
26
+ | UI | `cd frontend && npm run dev` | 5173 (proxies API) |
 
27
 
28
  ## Docker
29
 
30
+ Single service `youtube_hate_detector-app` on port **8000** (API + static UI).
 
 
 
31
 
32
+ ## API layout
33
 
34
+ ```
35
+ src/api/
36
+ main.py # app factory, CORS, static mount
37
+ schemas.py # Pydantic models
38
+ services.py # predict helpers
39
+ youtube.py # comment fetch + metadata
40
+ state.py # shared app state
41
+ routes/
42
+ health.py
43
+ models.py
44
+ predict.py
45
+ videos.py
46
+ ```
pyproject.toml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "youtube_hate_detector"
3
+ version = "1.0.0"
4
+ description = "YouTube toxic comment detector — FastAPI + React"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12,<3.13"
7
+ dependencies = [
8
+ "fastapi>=0.136.1",
9
+ "uvicorn[standard]>=0.47.0",
10
+ "scikit-learn>=1.8.0",
11
+ "spacy>=3.8.14",
12
+ "nltk>=3.9.4",
13
+ "pandas>=3.0.2",
14
+ "PyYAML>=6.0.3",
15
+ "python-dotenv>=1.2.2",
16
+ "joblib>=1.5.3",
17
+ "pydantic>=2.13.4",
18
+ "httpx>=0.28.1",
19
+ "google-api-python-client>=2.100.0",
20
+ ]
21
+
22
+ [project.optional-dependencies]
23
+ hf = [
24
+ "transformers>=5.9.0",
25
+ "torch>=2.0.0",
26
+ "sentencepiece>=0.2.0",
27
+ "accelerate>=0.30.0",
28
+ ]
29
+ dev = [
30
+ "pytest>=8.0.0",
31
+ ]
32
+ [build-system]
33
+ requires = ["hatchling"]
34
+ build-backend = "hatchling.build"
35
+
36
+ [tool.hatch.build.targets.wheel]
37
+ packages = ["src"]
38
+
39
+ [tool.pytest.ini_options]
40
+ testpaths = ["tests"]
41
+ pythonpath = ["."]
requirements.txt DELETED
@@ -1,17 +0,0 @@
1
- # Runtime dependencies for API + Streamlit (Docker and local installs)
2
- fastapi==0.136.1
3
- uvicorn[standard]==0.47.0
4
- streamlit>=1.41.0,<2
5
- scikit-learn==1.8.0
6
- spacy==3.8.14
7
- nltk==3.9.4
8
- pandas==3.0.2
9
- PyYAML==6.0.3
10
- python-dotenv==1.2.2
11
- joblib==1.5.3
12
- pydantic==2.13.4
13
- transformers==5.9.0
14
- httpx==0.28.1
15
- matplotlib>=3.8.0
16
- seaborn>=0.13.0
17
- mlflow>=2.0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/api/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """FastAPI application package."""
src/api/main.py CHANGED
@@ -1,462 +1,114 @@
1
  """
2
- src/api/main.py
3
 
4
- API REST de producción para detección de hate speech.
5
- Ejecutar con: uvicorn src.api.main:app --reload --port 8000
6
-
7
- Documentación automática en:
8
- http://localhost:8000/docs (Swagger UI)
9
- http://localhost:8000/redoc (ReDoc)
10
-
11
- Endpoints:
12
- GET / → health check
13
- GET /model-info → info del modelo activo
14
- GET /models → lista de modelos disponibles
15
- POST /predict → predice un comentario
16
- POST /predict-batch → predice una lista de comentarios
17
- POST /predict-video → dado URL de YouTube, predice todos sus comentarios
18
- PUT /model/{name} → cambia el modelo activo
19
  """
20
 
 
 
21
  import os
22
- import sys
23
  import time
24
- import logging
25
- from pathlib import Path
26
- from typing import Optional
27
  from contextlib import asynccontextmanager
28
- from dotenv import load_dotenv
29
- load_dotenv()
30
 
31
- from fastapi import FastAPI, HTTPException, BackgroundTasks, Query
 
32
  from fastapi.middleware.cors import CORSMiddleware
33
- from pydantic import BaseModel, Field, field_validator
 
34
 
35
- # ── Setup path ────────────────────────────────────────────────────────────────
36
- PROJECT_ROOT = Path(__file__).resolve().parents[2]
37
- sys.path.insert(0, str(PROJECT_ROOT))
38
 
39
- from src.service.model_service import ModelService, AVAILABLE_MODELS
 
 
40
  from src.utils.logger import get_logger
41
 
42
  logger = get_logger(__name__)
43
 
44
- # ── Estado global de la app ───────────────────────────────────────────────────
45
- # El modelo se carga una sola vez al iniciar la API y se reutiliza.
46
- # Esto evita cargar el modelo en cada request (costoso en tiempo).
47
- _state: dict = {
48
- "service" : None,
49
- "model_name" : None,
50
- "startup_time" : None,
51
- "predictions_served": 0,
52
- }
53
 
54
 
55
- # ══════════════════════════════════════════════════════════════════════════════
56
- # LIFESPAN — carga del modelo al iniciar la API
57
- # ══════════════════════════════════════════════════════════════════════════════
58
  @asynccontextmanager
59
  async def lifespan(app: FastAPI):
60
- """
61
- Lifespan context manager de FastAPI.
62
- Carga el modelo al iniciar la app y libera recursos al cerrarla.
63
- """
64
- # Startup
65
- model_name = os.getenv("MODEL_NAME", list(AVAILABLE_MODELS.keys())[0])
66
- logger.info(f"Iniciando API cargando modelo: {model_name}")
67
- _state["service"] = ModelService(model_name, PROJECT_ROOT)
68
- _state["model_name"] = model_name
69
- _state["startup_time"] = time.time()
 
 
 
 
 
 
 
70
 
71
- # Warm-up: predecir un texto de prueba para que el modelo quede en memoria
72
  try:
73
- _state["service"].predict("test warmup text")
74
- logger.info("Modelo cargado y warm-up completado ✅")
75
- except Exception as e:
76
- logger.warning(f"Warm-up falló (no crítico): {e}")
77
 
78
- yield # La API está lista
79
 
80
- # Shutdown
81
- logger.info("API cerrándose — limpiando recursos")
82
- _state["service"] = None
83
 
84
 
85
- # ══════════════════════════════════════════════════════════════════════════════
86
- # APP
87
- # ══════════════════════════════════════════════════════════════════════════════
88
  app = FastAPI(
89
- title = "SignalMod API",
90
- description = "API de detección de hate speech en comentarios de YouTube",
91
- version = "1.0.0",
92
- lifespan = lifespan,
93
  )
94
 
95
- # CORS: permite que el Streamlit (puerto 8501) llame a la API (puerto 8000)
96
  app.add_middleware(
97
  CORSMiddleware,
98
- allow_origins = ["*"],
99
- allow_methods = ["*"],
100
- allow_headers = ["*"],
 
 
 
 
 
101
  )
102
 
 
 
 
 
103
 
104
- # ══════════════════════════════════════════════════════════════════════════════
105
- # SCHEMAS — Pydantic valida automáticamente los datos de entrada/salida
106
- # ══════════════════════════════════════════════════════════════════════════════
107
- class PredictRequest(BaseModel):
108
- """Cuerpo del request para predecir un comentario."""
109
- text : str = Field(..., min_length=1, max_length=5000,
110
- description="Comentario a analizar")
111
- threshold: float = Field(0.5, ge=0.0, le=1.0,
112
- description="Umbral de probabilidad para clasificar como tóxico")
113
-
114
- @field_validator("text")
115
- @classmethod
116
- def text_not_empty(cls, v):
117
- if not v.strip():
118
- raise ValueError("El texto no puede estar vacío")
119
- return v.strip()
120
-
121
-
122
- class PredictResponse(BaseModel):
123
- """Respuesta de la predicción."""
124
- text : str
125
- is_toxic : bool
126
- probability: float = Field(..., ge=0.0, le=1.0)
127
- labels : list[str]
128
- model_used : str
129
- latency_ms : float
130
-
131
-
132
- class BatchPredictRequest(BaseModel):
133
- """Request para predecir múltiples comentarios."""
134
- texts : list[str] = Field(..., min_length=1, max_length=100)
135
- threshold: float = Field(0.5, ge=0.0, le=1.0)
136
-
137
-
138
- class BatchPredictResponse(BaseModel):
139
- """Respuesta de predicción batch."""
140
- results : list[PredictResponse]
141
- total : int
142
- toxic_count : int
143
- latency_ms : float
144
 
 
145
 
146
- class VideoRequest(BaseModel):
147
- """Request para analizar comentarios de un video de YouTube."""
148
- url : str = Field(..., description="URL del video de YouTube")
149
- max_comments: int = Field(50, ge=1, le=200,
150
- description="Número máximo de comentarios a analizar")
151
- threshold : float = Field(0.5, ge=0.0, le=1.0)
152
-
153
-
154
- class VideoResponse(BaseModel):
155
- """Respuesta del análisis de un video de YouTube."""
156
- video_url : str
157
- total_fetched: int
158
- toxic_count : int
159
- toxic_rate : float
160
- results : list[PredictResponse]
161
- error : Optional[str] = None
162
-
163
-
164
- class ModelInfo(BaseModel):
165
- """Información sobre el modelo activo."""
166
- name : str
167
- type : str
168
- description : str
169
- speed : str
170
- accuracy : str
171
- uptime_s : float
172
- predictions_served: int
173
-
174
-
175
- # ══════════════════════════════════════════════════════════════════════════════
176
- # HELPERS
177
- # ══════════════════════════════════════════════════════════════════════════════
178
- def _get_service() -> ModelService:
179
- """Devuelve el servicio activo o lanza 503 si no está listo."""
180
- if _state["service"] is None:
181
- raise HTTPException(status_code=503, detail="Modelo no cargado. Intenta en unos segundos.")
182
- return _state["service"]
183
-
184
-
185
- def _predict_single(text: str, threshold: float) -> tuple[dict, float]:
186
- """Predice un texto y devuelve (result, latency_ms)."""
187
- t0 = time.perf_counter()
188
- result = _get_service().predict(text)
189
- ms = round((time.perf_counter() - t0) * 1000, 2)
190
-
191
- # Aplicar umbral personalizado
192
- result["is_toxic"] = result["probability"] >= threshold
193
- if not result["is_toxic"]:
194
- result["labels"] = []
195
-
196
- _state["predictions_served"] += 1
197
- return result, ms
198
-
199
-
200
- def _scrape_youtube_comments(url: str, max_comments: int) -> list[str]:
201
- """
202
- Obtiene comentarios de un video de YouTube.
203
-
204
- Estrategia:
205
- 1. Intentar con YouTube Data API v3 (si hay API key en .env)
206
- 2. Fallback: BeautifulSoup (sin autenticación, limitado)
207
- """
208
- api_key = os.getenv("YOUTUBE_API_KEY", "")
209
-
210
- if api_key:
211
- return _fetch_via_api(url, api_key, max_comments)
212
- else:
213
- return _fetch_via_scraper(url, max_comments)
214
-
215
-
216
- def _fetch_via_api(url: str, api_key: str, max_comments: int) -> list[str]:
217
- """Obtiene comentarios usando YouTube Data API v3."""
218
- try:
219
- import re
220
- from googleapiclient.discovery import build
221
 
222
- # Extraer video_id de la URL
223
- patterns = [
224
- r"youtube\.com/watch\?v=([a-zA-Z0-9_-]{11})",
225
- r"youtu\.be/([a-zA-Z0-9_-]{11})",
226
- r"youtube\.com/embed/([a-zA-Z0-9_-]{11})",
227
- ]
228
- video_id = None
229
- for pattern in patterns:
230
- match = re.search(pattern, url)
231
- if match:
232
- video_id = match.group(1)
233
- break
234
 
235
- if not video_id:
236
- raise ValueError(f"No se pudo extraer video_id de: {url}")
237
-
238
- youtube = build("youtube", "v3", developerKey=api_key)
239
- comments = []
240
- page_token = None
241
-
242
- while len(comments) < max_comments:
243
- request = youtube.commentThreads().list(
244
- part = "snippet",
245
- videoId = video_id,
246
- maxResults = min(100, max_comments - len(comments)),
247
- pageToken = page_token,
248
- textFormat = "plainText",
249
- )
250
- response = request.execute()
251
-
252
- for item in response.get("items", []):
253
- text = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
254
- comments.append(text)
255
-
256
- page_token = response.get("nextPageToken")
257
- if not page_token:
258
- break
259
-
260
- logger.info(f"YouTube API: {len(comments)} comentarios obtenidos")
261
- return comments[:max_comments]
262
-
263
- except Exception as e:
264
- logger.warning(f"YouTube API falló: {e} — usando fallback")
265
- return _fetch_via_scraper(url, max_comments)
266
-
267
-
268
- def _fetch_via_scraper(url: str, max_comments: int) -> list[str]:
269
- """
270
- Fallback: simula comentarios si no hay API key.
271
- En producción real debería usar BeautifulSoup + Selenium.
272
- """
273
- logger.warning(
274
- "YOUTUBE_API_KEY no configurada. "
275
- "Configura tu API key en .env para obtener comentarios reales. "
276
- "Usando comentarios de ejemplo."
277
- )
278
- # Comentarios de ejemplo para demo sin API key
279
- example_comments = [
280
- "This video is really informative, thanks for sharing!",
281
- "You are all stupid idiots, get out of here!",
282
- "Great content, I learned a lot from this.",
283
- "These people should be eliminated from society.",
284
- "I agree with the presenter's point of view.",
285
- "What a bunch of racist criminals!",
286
- "Thank you for this analysis, very helpful.",
287
- "Kill them all, they don't deserve to live.",
288
- "Interesting perspective on the topic.",
289
- "This is absolute bullshit propaganda!",
290
- "I think we need to look at both sides.",
291
- "Black people are thugs and criminals.",
292
- "The data presented here is compelling.",
293
- "Go back to where you came from!",
294
- "Well researched video, good job.",
295
- ]
296
- return example_comments[:max_comments]
297
-
298
-
299
- # ══════════════════════════════════════════════════════════════════════════════
300
- # ENDPOINTS
301
- # ══════════════════════════════════════════════════════════════════════════════
302
-
303
- @app.get("/", tags=["Health"])
304
- async def health_check():
305
- """
306
- Verifica que la API está funcionando.
307
- Útil para Docker healthcheck y load balancers.
308
- """
309
- service = _state["service"]
310
- return {
311
- "status" : "ok" if service else "loading",
312
- "model" : _state["model_name"],
313
- "uptime_s": round(time.time() - _state["startup_time"], 1)
314
- if _state["startup_time"] else 0,
315
- }
316
-
317
-
318
- @app.get("/model-info", response_model=ModelInfo, tags=["Model"])
319
- async def get_model_info():
320
- """Devuelve información sobre el modelo activo."""
321
- service = _get_service()
322
- info = service.get_model_info()
323
- return ModelInfo(
324
- name = _state["model_name"],
325
- type = info.get("type", "unknown"),
326
- description = info.get("description", ""),
327
- speed = info.get("speed", ""),
328
- accuracy = info.get("accuracy", ""),
329
- uptime_s = round(time.time() - _state["startup_time"], 1),
330
- predictions_served= _state["predictions_served"],
331
- )
332
-
333
-
334
- @app.get("/models", tags=["Model"])
335
- async def list_models():
336
- """Lista todos los modelos disponibles."""
337
- return {
338
- "available": list(AVAILABLE_MODELS.keys()),
339
- "active" : _state["model_name"],
340
- }
341
-
342
-
343
- @app.put("/model/{model_name}", tags=["Model"])
344
- async def switch_model(model_name: str):
345
- """
346
- Cambia el modelo activo.
347
- El nuevo modelo se carga de forma lazy en el siguiente request de predicción.
348
- """
349
- if model_name not in AVAILABLE_MODELS:
350
- raise HTTPException(
351
- status_code=400,
352
- detail=f"Modelo '{model_name}' no disponible. "
353
- f"Opciones: {list(AVAILABLE_MODELS.keys())}",
354
- )
355
- _state["service"] = ModelService(model_name, PROJECT_ROOT)
356
- _state["model_name"] = model_name
357
- logger.info(f"Modelo cambiado a: {model_name}")
358
- return {"message": f"Modelo cambiado a '{model_name}'", "model": model_name}
359
-
360
-
361
- @app.post("/predict", response_model=PredictResponse, tags=["Prediction"])
362
- async def predict(request: PredictRequest):
363
- """
364
- Predice si un comentario es tóxico.
365
-
366
- - **text**: el comentario a analizar
367
- - **threshold**: umbral de probabilidad (default 0.5)
368
-
369
- Devuelve la probabilidad, si es tóxico y las categorías detectadas.
370
- """
371
- result, ms = _predict_single(request.text, request.threshold)
372
-
373
- if "error" in result:
374
- raise HTTPException(status_code=500, detail=result["error"])
375
-
376
- return PredictResponse(
377
- text = request.text,
378
- is_toxic = result["is_toxic"],
379
- probability= round(result["probability"], 4),
380
- labels = result["labels"],
381
- model_used = result["model_used"],
382
- latency_ms = ms,
383
- )
384
-
385
-
386
- @app.post("/predict-batch", response_model=BatchPredictResponse, tags=["Prediction"])
387
- async def predict_batch(request: BatchPredictRequest):
388
- """
389
- Predice una lista de comentarios en un solo request.
390
- Más eficiente que llamar /predict N veces.
391
- Máximo 100 comentarios por request.
392
- """
393
- t0 = time.perf_counter()
394
- results = []
395
-
396
- for text in request.texts:
397
- if not text.strip():
398
- continue
399
- result, _ = _predict_single(text, request.threshold)
400
- results.append(PredictResponse(
401
- text = text,
402
- is_toxic = result["is_toxic"],
403
- probability= round(result["probability"], 4),
404
- labels = result["labels"],
405
- model_used = result["model_used"],
406
- latency_ms = 0.0,
407
- ))
408
-
409
- total_ms = round((time.perf_counter() - t0) * 1000, 2)
410
- toxic_count = sum(1 for r in results if r.is_toxic)
411
-
412
- return BatchPredictResponse(
413
- results = results,
414
- total = len(results),
415
- toxic_count = toxic_count,
416
- latency_ms = total_ms,
417
- )
418
-
419
-
420
- @app.post("/predict-video", response_model=VideoResponse, tags=["Prediction"])
421
- async def predict_video(request: VideoRequest):
422
- """
423
- Dado un URL de YouTube, obtiene los comentarios y predice su toxicidad.
424
-
425
- Requiere YOUTUBE_API_KEY en el archivo .env para obtener comentarios reales.
426
- Sin API key usa comentarios de ejemplo para la demo.
427
- """
428
- # Obtener comentarios
429
- try:
430
- comments = _scrape_youtube_comments(request.url, request.max_comments)
431
- except Exception as e:
432
- raise HTTPException(status_code=422, detail=f"Error al obtener comentarios: {e}")
433
 
434
- if not comments:
435
- raise HTTPException(status_code=404, detail="No se encontraron comentarios en el video")
 
 
 
436
 
437
- # Predecir batch
438
- t0 = time.perf_counter()
439
- results = []
440
- for text in comments:
441
- if not text.strip():
442
- continue
443
- result, _ = _predict_single(text, request.threshold)
444
- results.append(PredictResponse(
445
- text = text,
446
- is_toxic = result["is_toxic"],
447
- probability= round(result["probability"], 4),
448
- labels = result["labels"],
449
- model_used = result["model_used"],
450
- latency_ms = 0.0,
451
- ))
452
 
453
- total_ms = round((time.perf_counter() - t0) * 1000, 2)
454
- toxic_count = sum(1 for r in results if r.is_toxic)
455
 
456
- return VideoResponse(
457
- video_url = request.url,
458
- total_fetched= len(results),
459
- toxic_count = toxic_count,
460
- toxic_rate = round(toxic_count / len(results), 4) if results else 0.0,
461
- results = results,
462
- )
 
1
  """
2
+ youtube_hate_detector API
3
 
4
+ Run: uv run uvicorn src.api.main:app --reload --port 8000
5
+ Docs: http://localhost:8000/docs
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  """
7
 
8
+ from __future__ import annotations
9
+
10
  import os
 
11
  import time
 
 
 
12
  from contextlib import asynccontextmanager
13
+ from pathlib import Path
 
14
 
15
+ from dotenv import load_dotenv
16
+ from fastapi import FastAPI
17
  from fastapi.middleware.cors import CORSMiddleware
18
+ from fastapi.responses import FileResponse
19
+ from fastapi.staticfiles import StaticFiles
20
 
21
+ load_dotenv()
 
 
22
 
23
+ from src.api.routes import health, models, predict, videos
24
+ from src.api.state import PROJECT_ROOT, get_state
25
+ from src.service.model_service import AVAILABLE_MODELS, ModelService, check_model_availability
26
  from src.utils.logger import get_logger
27
 
28
  logger = get_logger(__name__)
29
 
30
+ FRONTEND_DIST = PROJECT_ROOT / "frontend" / "dist"
 
 
 
 
 
 
 
 
31
 
32
 
 
 
 
33
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the configured model on startup and drop it on shutdown.

    The model named by the MODEL_NAME environment variable is used when
    ``check_model_availability`` accepts it; otherwise the first entry of
    ``AVAILABLE_MODELS`` is used instead. A failing warm-up prediction is
    logged and does not stop the application from starting.
    """
    shared = get_state()
    default_model = next(iter(AVAILABLE_MODELS))
    model_name = os.getenv("MODEL_NAME", default_model)

    available, reason = check_model_availability(model_name, PROJECT_ROOT)
    if not available:
        logger.warning(
            "MODEL_NAME '%s' unavailable (%s) using '%s'",
            model_name,
            reason,
            default_model,
        )
        model_name = default_model

    logger.info("Starting youtube_hate_detector API — model: %s", model_name)
    service = ModelService(model_name, PROJECT_ROOT)
    shared["service"] = service
    shared["model_name"] = model_name
    shared["startup_time"] = time.time()
    shared["predictions_served"] = 0

    # One throwaway prediction so the first real request does not pay the
    # first-call cost; failure here is deliberately non-fatal.
    try:
        service.predict("warmup")
    except Exception as exc:
        logger.warning("Warm-up failed (non-critical): %s", exc)
    else:
        logger.info("Model warm-up complete")

    yield

    shared["service"] = None
    logger.info("API shutdown")
 
63
 
64
 
 
 
 
65
  app = FastAPI(
66
+ title="youtube_hate_detector API",
67
+ description="Toxic comment detection for YouTube-style moderation demos",
68
+ version="1.0.0",
69
+ lifespan=lifespan,
70
  )
71
 
 
72
  app.add_middleware(
73
  CORSMiddleware,
74
+ allow_origins=[
75
+ "http://localhost:5173",
76
+ "http://127.0.0.1:5173",
77
+ "http://localhost:8000",
78
+ ],
79
+ allow_credentials=True,
80
+ allow_methods=["*"],
81
+ allow_headers=["*"],
82
  )
83
 
84
+ app.include_router(health.router)
85
+ app.include_router(models.router)
86
+ app.include_router(predict.router)
87
+ app.include_router(videos.router)
88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
+ _API_GET_PREFIXES = ("models", "model", "videos", "predict", "health", "docs", "redoc", "openapi")
91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
def _mount_frontend() -> None:
    """Serve the built frontend from ``FRONTEND_DIST`` when it exists.

    Mounts ``/assets`` as static files (if that directory is present) and
    registers a catch-all GET route that returns ``index.html`` for any
    path that does not start with one of ``_API_GET_PREFIXES``.
    """
    if not FRONTEND_DIST.is_dir():
        return

    assets_dir = FRONTEND_DIST / "assets"
    if assets_dir.is_dir():
        app.mount("/assets", StaticFiles(directory=assets_dir), name="assets")

    @app.get("/{full_path:path}", include_in_schema=False)
    async def spa_fallback(full_path: str):
        from fastapi import HTTPException

        # API-looking paths must never be answered with the SPA shell.
        # The prefix test runs first, so the file system is only touched
        # for non-API paths.
        index_html = FRONTEND_DIST / "index.html"
        if full_path.startswith(_API_GET_PREFIXES) or not index_html.exists():
            raise HTTPException(status_code=404, detail="Not found")
        return FileResponse(index_html)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
 
 
113
 
114
+ _mount_frontend()
 
 
 
 
 
 
src/api/routes/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """API route modules."""
src/api/routes/health.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+
3
+ from fastapi import APIRouter
4
+
5
+ from src.api.state import get_state
6
+
7
+ router = APIRouter(tags=["Health"])
8
+
9
+
10
@router.get("/health")
async def health_check():
    # Liveness probe: reports "loading" until a model service is stored in
    # the shared state, and an uptime of 0.0 until startup_time is set.
    state = get_state()
    started_at = state["startup_time"]
    return {
        "status": "ok" if state["service"] else "loading",
        "model": state["model_name"],
        "uptime_s": round(time.time() - started_at, 1) if started_at else 0.0,
        "project": "youtube_hate_detector",
    }
src/api/routes/models.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+
3
+ from fastapi import APIRouter, HTTPException
4
+
5
+ from src.api.schemas import ModelInfo, ModelStatusEntry, ModelsStatusResponse
6
+ from src.api.services import get_service
7
+ from src.api.state import PROJECT_ROOT, get_state
8
+ from src.service.model_service import AVAILABLE_MODELS, ModelService, check_model_availability
9
+
10
+ router = APIRouter(tags=["Model"])
11
+
12
+
13
@router.get("/model-info", response_model=ModelInfo)
async def get_model_info():
    # Describes the active model; get_service() answers 503 when no model
    # is loaded, so everything below can assume one exists.
    details = get_service().get_model_info()
    state = get_state()

    started_at = state["startup_time"]
    if started_at:
        uptime = round(time.time() - started_at, 1)
    else:
        uptime = 0.0

    return ModelInfo(
        name=state["model_name"],
        type=details.get("type", "unknown"),
        description=details.get("description", ""),
        speed=details.get("speed", ""),
        accuracy=details.get("accuracy", ""),
        uptime_s=uptime,
        predictions_served=state.get("predictions_served", 0),
    )
28
+
29
+
30
@router.get("/models/status", response_model=ModelsStatusResponse)
async def models_status():
    # One availability entry per catalogued model, plus the active model
    # name (empty string when none has been set yet).
    state = get_state()
    checked = (
        (name, cfg, check_model_availability(name, PROJECT_ROOT))
        for name, cfg in AVAILABLE_MODELS.items()
    )
    entries: list[ModelStatusEntry] = [
        ModelStatusEntry(
            name=name,
            available=available,
            reason=reason,
            type=cfg.get("type", "unknown"),
        )
        for name, cfg, (available, reason) in checked
    ]
    return ModelsStatusResponse(models=entries, active=state["model_name"] or "")
45
+
46
+
47
@router.get("/models")
async def list_models():
    # Names of every catalogued model and the one currently active.
    active_name = get_state()["model_name"]
    return {"available": list(AVAILABLE_MODELS), "active": active_name}
51
+
52
+
53
@router.put("/model/{model_name}")
async def switch_model(model_name: str):
    # Replace the active model with `model_name`.
    #
    # Responds 400 when the name is not in AVAILABLE_MODELS, when
    # check_model_availability rejects it, or when the model cannot be
    # constructed / warmed up. The shared state is only written after the
    # new service has produced a warm-up prediction without an error, so a
    # failed switch leaves the previously active model untouched.
    if model_name not in AVAILABLE_MODELS:
        raise HTTPException(
            status_code=400,
            detail=f"Model '{model_name}' not available. Options: {list(AVAILABLE_MODELS.keys())}",
        )

    available, reason = check_model_availability(model_name, PROJECT_ROOT)
    if not available:
        raise HTTPException(status_code=400, detail=reason or "Model unavailable")

    # Loading or warming up may raise (startup guards its own warm-up the
    # same way); report that as a failed switch instead of an opaque 500.
    try:
        new_service = ModelService(model_name, PROJECT_ROOT)
        warmup = new_service.predict("warmup")
    except Exception as exc:
        raise HTTPException(
            status_code=400,
            detail=f"Failed to load model '{model_name}': {exc}",
        ) from exc

    if warmup.get("error"):
        raise HTTPException(status_code=400, detail=str(warmup["error"]))

    state = get_state()
    state["service"] = new_service
    state["model_name"] = model_name
    return {"message": f"Active model set to '{model_name}'", "model": model_name}
src/api/routes/predict.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+
3
+ from fastapi import APIRouter, HTTPException
4
+
5
+ from src.api.schemas import (
6
+ BatchPredictRequest,
7
+ BatchPredictResponse,
8
+ PredictRequest,
9
+ PredictResponse,
10
+ VideoRequest,
11
+ VideoResponse,
12
+ )
13
+ from src.api.services import predict_single, to_predict_response
14
+ from src.api.state import get_state
15
+ from src.api.youtube import CommentsFetchError, fetch_comments
16
+ router = APIRouter(tags=["Prediction"])
17
+
18
+
19
@router.post("/predict", response_model=PredictResponse)
async def predict(request: PredictRequest):
    # Score a single comment with the active model.
    response = predict_single(request.text, request.threshold)
    return response
22
+
23
+
24
@router.post("/predict-batch", response_model=BatchPredictResponse)
async def predict_batch(request: BatchPredictRequest):
    # Score every non-blank text in the request; blank entries are dropped,
    # so `total` may be smaller than the number of texts submitted.
    started = time.perf_counter()
    stripped_texts = (raw.strip() for raw in request.texts)
    results: list[PredictResponse] = [
        predict_single(text, request.threshold) for text in stripped_texts if text
    ]
    elapsed_ms = round((time.perf_counter() - started) * 1000, 2)

    return BatchPredictResponse(
        results=results,
        total=len(results),
        toxic_count=sum(item.is_toxic for item in results),
        latency_ms=elapsed_ms,
    )
40
+
41
+
42
@router.post("/predict-video", response_model=VideoResponse)
async def predict_video(request: VideoRequest):
    # Fetch the comments of a YouTube video and score each non-blank one.
    #
    # Responses: 503 when no model is loaded, 422 when the comments cannot
    # be fetched, 404 when the video yields no comments.
    state = get_state()

    # Check the model first so an unready service does not cost a
    # comment-fetch round trip.
    service = state["service"]
    if service is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    try:
        comments, source = fetch_comments(request.url, request.max_comments)
    except CommentsFetchError as exc:
        raise HTTPException(status_code=422, detail=str(exc)) from exc
    except Exception as exc:
        raise HTTPException(status_code=422, detail=f"Failed to fetch comments: {exc}") from exc

    if not comments:
        raise HTTPException(status_code=404, detail="No comments found for this video")

    # Per-comment latency is not measured on this path, hence the 0.0.
    # NOTE(review): unlike predict_single, an "error" key in a prediction
    # result is not inspected here — confirm whether that is intentional.
    results: list[PredictResponse] = [
        to_predict_response(text, service.predict(text), 0.0, request.threshold)
        for text in comments
        if text.strip()
    ]
    toxic_count = sum(1 for item in results if item.is_toxic)
    state["predictions_served"] = state.get("predictions_served", 0) + len(results)

    return VideoResponse(
        video_url=request.url,
        total_fetched=len(results),
        toxic_count=toxic_count,
        toxic_rate=round(toxic_count / len(results), 4) if results else 0.0,
        results=results,
        source=source,
    )
src/api/routes/videos.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter
2
+
3
+ from src.api.schemas import SuggestedVideo, SuggestedVideosResponse
4
+ from src.api.youtube import fetch_video_metadata, load_suggested_config
5
+
6
+ router = APIRouter(tags=["Videos"])
7
+
8
+
9
@router.get("/videos/suggested", response_model=SuggestedVideosResponse)
async def suggested_videos():
    # Resolve the configured video ids into display metadata. Config
    # entries may be mappings carrying an "id" key or bare id values.
    config = load_suggested_config()
    max_comments = int(config.get("max_comments", 50))

    video_ids = []
    for entry in config.get("videos") or []:
        video_ids.append(entry["id"] if isinstance(entry, dict) else str(entry))

    videos = [SuggestedVideo(**item) for item in fetch_video_metadata(video_ids)]
    return SuggestedVideosResponse(videos=videos, max_comments=max_comments)
src/api/schemas.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pydantic request/response models for the API."""
2
+
3
+ from typing import Literal, Optional
4
+
5
+ from pydantic import BaseModel, Field, field_validator
6
+
7
+
8
class PredictRequest(BaseModel):
    # Request body for scoring one comment.
    # text: 1-5000 characters; stored with surrounding whitespace removed.
    # threshold: probability in [0, 1], default 0.5.
    text: str = Field(..., min_length=1, max_length=5000)
    threshold: float = Field(0.5, ge=0.0, le=1.0)

    @field_validator("text")
    @classmethod
    def text_not_empty(cls, v: str) -> str:
        # Reject whitespace-only input, which min_length alone lets through.
        stripped = v.strip()
        if not stripped:
            raise ValueError("Text cannot be empty")
        return stripped
18
+
19
+
20
class PredictResponse(BaseModel):
    # Result of scoring one comment.
    # probability is constrained to [0, 1]; status is one of the two
    # literals below; mode is fixed to "binary".
    text: str
    is_toxic: bool
    probability: float = Field(..., ge=0.0, le=1.0)
    status: Literal["Safe", "Toxic"]
    mode: Literal["binary"] = "binary"
    labels: list[str]
    model_used: str
    latency_ms: float
29
+
30
+
31
class BatchPredictRequest(BaseModel):
    # Request body for scoring several comments at once:
    # between 1 and 100 texts, with one threshold (default 0.5) for all.
    texts: list[str] = Field(..., min_length=1, max_length=100)
    threshold: float = Field(0.5, ge=0.0, le=1.0)
34
+
35
+
36
class BatchPredictResponse(BaseModel):
    # Aggregate result of a batch prediction: the per-text results, how
    # many there are, how many are toxic, and the elapsed time in ms.
    results: list[PredictResponse]
    total: int
    toxic_count: int
    latency_ms: float
41
+
42
+
43
class VideoRequest(BaseModel):
    # Request body for analysing a video's comments.
    # max_comments: 1-200, default 50. threshold: [0, 1], default 0.5.
    url: str
    max_comments: int = Field(50, ge=1, le=200)
    threshold: float = Field(0.5, ge=0.0, le=1.0)
47
+
48
+
49
class VideoResponse(BaseModel):
    # Result of analysing a video's comments.
    # source defaults to "demo" when the caller does not supply it;
    # reason and error are optional free-text fields.
    video_url: str
    total_fetched: int
    toxic_count: int
    toxic_rate: float
    results: list[PredictResponse]
    source: Literal["youtube", "demo", "unavailable"] = "demo"
    reason: Optional[str] = None
    error: Optional[str] = None
58
+
59
+
60
class ModelStatusEntry(BaseModel):
    # Availability of one catalogued model; reason is optional and type
    # falls back to "unknown".
    name: str
    available: bool
    reason: Optional[str] = None
    type: str = "unknown"
65
+
66
+
67
class ModelsStatusResponse(BaseModel):
    # Availability entries for all models plus the active model's name.
    models: list[ModelStatusEntry]
    active: str
70
+
71
+
72
class ModelInfo(BaseModel):
    # Descriptive metadata for the active model together with process
    # uptime (seconds) and the running count of predictions served.
    name: str
    type: str
    description: str
    speed: str
    accuracy: str
    uptime_s: float
    predictions_served: int
80
+
81
+
82
class SuggestedVideo(BaseModel):
    # Display metadata for one suggested video; embeddable defaults to True.
    id: str
    title: str
    channel_title: str
    thumbnail_url: str
    watch_url: str
    embeddable: bool = True
89
+
90
+
91
class SuggestedVideosResponse(BaseModel):
    # Suggested videos plus the configured max_comments value.
    videos: list[SuggestedVideo]
    max_comments: int
src/api/services.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Prediction helpers used by route handlers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import time
6
+
7
+ from fastapi import HTTPException
8
+
9
+ from src.api.schemas import PredictResponse
10
+ from src.api.state import get_state
11
+ from src.service.model_service import ModelService
12
+
13
+
14
def get_service() -> ModelService:
    """Return the active model service.

    Raises:
        HTTPException: 503 when no service is stored in the shared state.
    """
    service = get_state()["service"]
    if service is None:
        raise HTTPException(status_code=503, detail="Model not loaded. Try again shortly.")
    return service
19
+
20
+
21
def to_predict_response(text: str, result: dict, latency_ms: float, threshold: float) -> PredictResponse:
    """Convert a raw prediction dict into a ``PredictResponse``.

    The toxic/safe decision is made here by comparing the result's
    ``probability`` with ``threshold``; labels are kept only for toxic
    results.
    """
    probability = float(result["probability"])
    flagged = probability >= threshold
    if flagged:
        status, labels = "Toxic", result.get("labels", [])
    else:
        status, labels = "Safe", []

    return PredictResponse(
        text=text,
        is_toxic=flagged,
        probability=round(probability, 4),
        status=status,
        mode="binary",
        labels=labels,
        model_used=result.get("model_used", ""),
        latency_ms=latency_ms,
    )
35
+
36
+
37
def predict_single(text: str, threshold: float) -> PredictResponse:
    """Score one text with the active model and time the call.

    Raises:
        HTTPException: 503 when no model is loaded (via ``get_service``),
            500 when the prediction result carries an ``"error"`` key.
    """
    shared = get_state()

    started = time.perf_counter()
    raw_result = get_service().predict(text)
    elapsed_ms = round((time.perf_counter() - started) * 1000, 2)

    if "error" in raw_result:
        raise HTTPException(status_code=500, detail=raw_result["error"])

    # Only successful predictions count towards the served total.
    shared["predictions_served"] = shared.get("predictions_served", 0) + 1
    return to_predict_response(text, raw_result, elapsed_ms, threshold)
src/api/state.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Application state shared across routes."""
2
+
3
+ from pathlib import Path
4
+
5
+ PROJECT_ROOT = Path(__file__).resolve().parents[2]
6
+
7
+ _state: dict = {
8
+ "service": None,
9
+ "model_name": None,
10
+ "startup_time": None,
11
+ "predictions_served": 0,
12
+ }
13
+
14
+
15
def get_state() -> dict:
    """Return the module-level state dict itself (not a copy)."""
    return _state
src/api/youtube.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """YouTube comment fetch and suggested-video metadata."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import re
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ import yaml
11
+
12
+ from src.utils.logger import get_logger
13
+
14
# Logger obtained from the project's logging helper, keyed by this module's name.
logger = get_logger(__name__)

# parents[2] of this file's resolved path (two levels above its directory).
PROJECT_ROOT = Path(__file__).resolve().parents[2]
# YAML file that load_suggested_config() reads.
SUGGESTED_CONFIG = PROJECT_ROOT / "configs" / "suggested_videos.yaml"
18
+
19
+ _VIDEO_ID_PATTERNS = (
20
+ r"youtube\.com/watch\?v=([a-zA-Z0-9_-]{11})",
21
+ r"youtu\.be/([a-zA-Z0-9_-]{11})",
22
+ r"youtube\.com/embed/([a-zA-Z0-9_-]{11})",
23
+ )
24
+
25
+
26
+ class CommentsFetchError(Exception):
27
+ """Raised when comments cannot be fetched and demo fallback must not be used."""
28
+
29
+
30
+ def extract_video_id(url: str) -> str | None:
31
+ for pattern in _VIDEO_ID_PATTERNS:
32
+ match = re.search(pattern, url)
33
+ if match:
34
+ return match.group(1)
35
+ return None
36
+
37
+
38
def load_suggested_config() -> dict[str, Any]:
    """Load the suggested-videos configuration from ``SUGGESTED_CONFIG``.

    Returns:
        The parsed YAML content, or an empty dict when the file parses to a
        falsy value. When the file does not exist, a built-in fallback with
        ``max_comments`` of 50 and a single video entry is returned instead.
    """
    if SUGGESTED_CONFIG.exists():
        with SUGGESTED_CONFIG.open(encoding="utf-8") as handle:
            parsed = yaml.safe_load(handle)
        return parsed or {}
    return {"max_comments": 50, "videos": [{"id": "jNQXAC9IVRw"}]}
43
+
44
+
45
+ def _parse_youtube_error(exc: Exception) -> str:
46
+ err_text = str(exc)
47
+ if "commentsDisabled" in err_text:
48
+ return "Comments are disabled on this video"
49
+ if "disabled comments" in err_text.lower():
50
+ return "Comments are disabled on this video"
51
+ if "quota" in err_text.lower():
52
+ return "YouTube API quota exceeded"
53
+ try:
54
+ from googleapiclient.errors import HttpError
55
+
56
+ if isinstance(exc, HttpError):
57
+ for detail in getattr(exc, "error_details", []) or []:
58
+ reason = detail.get("reason") if isinstance(detail, dict) else None
59
+ if reason == "commentsDisabled":
60
+ return "Comments are disabled on this video"
61
+ except ImportError:
62
+ pass
63
+ return err_text
64
+
65
+
66
def fetch_comments(url: str, max_comments: int) -> tuple[list[str], str]:
    """Return comments for the video at *url* together with their source tag.

    When the ``YOUTUBE_API_KEY`` environment variable is non-blank, the
    comments come from ``_fetch_via_api``. Otherwise built-in demo comments
    are returned and the source tag is ``"demo"``. A URL whose id cannot be
    extracted is handled under the id ``"unknown"``.
    """
    video_id = extract_video_id(url) or "unknown"
    api_key = os.getenv("YOUTUBE_API_KEY", "").strip()
    if not api_key:
        return _demo_comments(video_id, max_comments), "demo"
    return _fetch_via_api(url, api_key, max_comments, video_id)
72
+
73
+
74
def _fetch_via_api(
    url: str, api_key: str, max_comments: int, video_id: str
) -> tuple[list[str], str]:
    """Fetch up to *max_comments* top-level comments via the YouTube Data API.

    Args:
        url: Original URL; only used in the error message for an unparsed id.
        api_key: YouTube Data API key passed to the client as ``developerKey``.
        max_comments: Upper bound on the number of comments returned.
        video_id: Video id to query, or ``"unknown"`` when none was parsed.

    Returns:
        ``(comments, "youtube")``.

    Raises:
        CommentsFetchError: When the id is ``"unknown"``, when the client
            library cannot be imported or constructed, when any API call
            fails, or when no comments were collected.
    """
    # Validate before touching the client library so a bad URL is reported as
    # such even when the dependency is not installed.
    if video_id == "unknown":
        raise CommentsFetchError(f"Could not parse video id from: {url}")

    comments: list[str] = []
    page_token = None

    try:
        # Import and client construction live inside the try so that their
        # failures are reported as CommentsFetchError like any other failure.
        from googleapiclient.discovery import build

        youtube = build("youtube", "v3", developerKey=api_key)

        while len(comments) < max_comments:
            response = (
                youtube.commentThreads()
                .list(
                    part="snippet",
                    videoId=video_id,
                    # Request only what is still needed, capped at 100 per call.
                    maxResults=min(100, max_comments - len(comments)),
                    pageToken=page_token,
                    textFormat="plainText",
                )
                .execute()
            )
            comments.extend(
                item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
                for item in response.get("items", [])
            )
            page_token = response.get("nextPageToken")
            if not page_token:
                break
    except Exception as exc:
        message = _parse_youtube_error(exc)
        logger.warning("YouTube API failed for %s: %s", video_id, message)
        raise CommentsFetchError(message) from exc

    if not comments:
        raise CommentsFetchError("No comments found for this video")

    logger.info("YouTube API: fetched %s comments for %s", len(comments), video_id)
    return comments[:max_comments], "youtube"
115
+
116
+
117
def fetch_video_metadata(video_ids: list[str]) -> list[dict[str, Any]]:
    """Return one metadata dict per id in *video_ids*, in the same order.

    With a non-blank ``YOUTUBE_API_KEY`` the data comes from a single
    ``videos().list`` call. Ids missing from the response get placeholder
    metadata. Without a key, with an empty id list, or when anything in the
    API path raises, every id gets placeholder metadata (failures are logged
    as warnings, not raised).
    """
    api_key = os.getenv("YOUTUBE_API_KEY", "").strip()
    if not (api_key and video_ids):
        return [_placeholder_meta(vid) for vid in video_ids]

    try:
        from googleapiclient.discovery import build

        client = build("youtube", "v3", developerKey=api_key)
        request = client.videos().list(part="snippet,status", id=",".join(video_ids))
        payload = request.execute()

        found: dict[str, dict[str, Any]] = {}
        for entry in payload.get("items", []):
            vid = entry["id"]
            snippet = entry["snippet"]
            status = entry.get("status", {})
            thumbnails = snippet.get("thumbnails", {})
            # Prefer the medium thumbnail, then the default one.
            chosen = thumbnails.get("medium") or thumbnails.get("default") or {}
            found[vid] = {
                "id": vid,
                "title": snippet.get("title", vid),
                "channel_title": snippet.get("channelTitle", "Unknown"),
                "thumbnail_url": chosen.get("url", f"https://i.ytimg.com/vi/{vid}/mqdefault.jpg"),
                "watch_url": f"https://www.youtube.com/watch?v={vid}",
                "embeddable": bool(status.get("embeddable", True)),
            }
        return [found.get(vid, _placeholder_meta(vid)) for vid in video_ids]
    except Exception as exc:
        logger.warning("YouTube metadata failed: %s", exc)
        return [_placeholder_meta(vid) for vid in video_ids]
151
+
152
+
153
+ def _placeholder_meta(video_id: str) -> dict[str, Any]:
154
+ return {
155
+ "id": video_id,
156
+ "title": f"Video {video_id}",
157
+ "channel_title": "YouTube",
158
+ "thumbnail_url": f"https://i.ytimg.com/vi/{video_id}/mqdefault.jpg",
159
+ "watch_url": f"https://www.youtube.com/watch?v={video_id}",
160
+ "embeddable": True,
161
+ }
162
+
163
+
164
def _demo_comments(video_id: str, max_comments: int) -> list[str]:
    """Return built-in sample comments for use when no API key is configured.

    The fixed sample list is shuffled by a random generator seeded with
    *video_id*, then sliced to ``[:max_comments]``.
    """
    import random

    logger.info("Using demo comments for %s (no YOUTUBE_API_KEY)", video_id)
    pool = [
        "This video is really informative, thanks for sharing!",
        "You are all stupid idiots, get out of here!",
        "Great content, I learned a lot from this.",
        "These people should be eliminated from society.",
        "I agree with the presenter's point of view.",
        "What a bunch of racist criminals!",
        "Thank you for this analysis, very helpful.",
        "Kill them all, they don't deserve to live.",
        "Interesting perspective on the topic.",
        "This is absolute bullshit propaganda!",
        "I think we need to look at both sides.",
        "Well researched video, good job.",
        "Go back to where you came from!",
        "The data presented here is compelling.",
    ]
    # The list is rebuilt on every call, so shuffling it in place is safe.
    random.Random(video_id).shuffle(pool)
    return pool[:max_comments]
src/app/app.py DELETED
@@ -1,764 +0,0 @@
1
- """
2
- src/app/streamlit_app.py
3
-
4
- App SignalMod — detección de hate speech estilo YouTube.
5
- Ejecutar: streamlit run src/app/streamlit_app.py
6
- """
7
-
8
- import html
9
- import sys
10
- import random
11
- import datetime
12
- from pathlib import Path
13
-
14
- import streamlit as st
15
- import pandas as pd
16
-
17
- from transformers.utils import logging
18
- logging.set_verbosity_error()
19
-
20
- # ── Paths ─────────────────────────────────────────────────────────────────────
21
- PROJECT_ROOT = Path(__file__).resolve().parents[2]
22
- sys.path.insert(0, str(PROJECT_ROOT))
23
-
24
- try:
25
- from src.service.model_service import ModelService, AVAILABLE_MODELS
26
- except ImportError:
27
- sys.path.insert(0, str(Path(__file__).parent.parent))
28
- from service.model_service import ModelService, AVAILABLE_MODELS
29
-
30
- # ── Config ────────────────────────────────────────────────────────────────────
31
- st.set_page_config(
32
- page_title="SignalMod",
33
- page_icon="🎬",
34
- layout="wide",
35
- initial_sidebar_state="expanded",
36
- )
37
-
38
- # ── CSS ───────────────────────────────────────────────────────────────────────
39
- # Nota: NO ocultamos el header completo para preservar el botón de toggle del sidebar.
40
- # Solo ocultamos el menú hamburguesa y el footer de Streamlit.
41
- st.markdown("""
42
- <style>
43
- @import url('https://fonts.googleapis.com/css2?family=YouTube+Sans:wght@400;600;700&display=swap');
44
-
45
- /* ── Ocultar solo elementos de branding, NO el header completo ── */
46
- #MainMenu { visibility: hidden; }
47
- footer { visibility: hidden; }
48
-
49
- /* ── Fondo de la app: blanco limpio ── */
50
- .stApp { background: #ffffff; }
51
-
52
- /* ── Sidebar oscuro (como YouTube) ── */
53
- section[data-testid="stSidebar"] {
54
- background-color: #0f0f0f !important;
55
- }
56
- section[data-testid="stSidebar"] > div {
57
- background-color: #0f0f0f !important;
58
- }
59
- /* Texto del sidebar en blanco */
60
- section[data-testid="stSidebar"] p,
61
- section[data-testid="stSidebar"] span,
62
- section[data-testid="stSidebar"] label,
63
- section[data-testid="stSidebar"] div {
64
- color: #ffffff !important;
65
- }
66
- /* Botones del sidebar */
67
- section[data-testid="stSidebar"] .stButton button {
68
- background: transparent !important;
69
- color: #e0e0e0 !important;
70
- border: none !important;
71
- text-align: left !important;
72
- justify-content: flex-start !important;
73
- border-radius: 10px !important;
74
- padding: 0.5rem 0.75rem !important;
75
- font-size: 0.9rem !important;
76
- font-weight: 400 !important;
77
- width: 100% !important;
78
- }
79
- section[data-testid="stSidebar"] .stButton button:hover {
80
- background: rgba(255,255,255,0.1) !important;
81
- color: #ffffff !important;
82
- }
83
- /* Botón activo en el sidebar */
84
- section[data-testid="stSidebar"] .stButton button[data-active="true"] {
85
- background: rgba(255,255,255,0.15) !important;
86
- color: #ffffff !important;
87
- font-weight: 600 !important;
88
- }
89
- /* Divider del sidebar */
90
- section[data-testid="stSidebar"] hr {
91
- border-color: rgba(255,255,255,0.15) !important;
92
- }
93
- /* Badge de modelo activo en sidebar */
94
- .sidebar-model-info {
95
- background: rgba(255,255,255,0.08);
96
- border-radius: 8px;
97
- padding: 8px 12px;
98
- margin: 8px 0;
99
- font-size: 0.75rem;
100
- color: #aaaaaa;
101
- }
102
- .sidebar-model-info strong { color: #ffffff; }
103
-
104
- /* ── Área principal: fondo blanco, texto oscuro ── */
105
- .main-area { background: #ffffff; }
106
-
107
- /* ── Video thumbnail ── */
108
- .video-thumb {
109
- background: linear-gradient(135deg, #0d0d1a 0%, #1a0a2e 50%, #0d1a1a 100%);
110
- border-radius: 12px;
111
- height: 340px;
112
- display: flex;
113
- align-items: center;
114
- justify-content: center;
115
- }
116
- .play-btn {
117
- width: 72px; height: 72px;
118
- background: rgba(255,255,255,0.9);
119
- border-radius: 50%;
120
- display: flex; align-items: center; justify-content: center;
121
- font-size: 2rem; cursor: pointer;
122
- box-shadow: 0 4px 20px rgba(0,0,0,0.4);
123
- }
124
-
125
- /* ── Títulos de video ── */
126
- .video-title {
127
- font-size: 1.15rem; font-weight: 700;
128
- color: #0f0f0f; margin: 0.75rem 0 0.3rem;
129
- line-height: 1.4;
130
- }
131
- .video-meta { font-size: 0.82rem; color: #606060; }
132
- .channel-name { font-weight: 600; font-size: 0.9rem; color: #0f0f0f; }
133
-
134
- /* ── Badges ── */
135
- .badge {
136
- display: inline-block;
137
- padding: 2px 9px; border-radius: 12px;
138
- font-size: 0.72rem; font-weight: 700;
139
- margin-left: 6px; vertical-align: middle;
140
- }
141
- .badge-toxic { background: #cc0000; color: #ffffff; }
142
- .badge-safe { background: #00c853; color: #ffffff; }
143
-
144
- /* ── Comentarios ── */
145
- .comment-wrap {
146
- display: flex; gap: 12px;
147
- padding: 12px 0; border-bottom: 1px solid #f0f0f0;
148
- }
149
- .c-avatar {
150
- width: 36px; height: 36px; min-width: 36px;
151
- border-radius: 50%; background: #cc0000;
152
- display: flex; align-items: center; justify-content: center;
153
- color: #ffffff; font-weight: 700; font-size: 0.85rem;
154
- flex-shrink: 0;
155
- }
156
- .c-avatar.safe { background: #606060; }
157
- .c-body { flex: 1; min-width: 0; }
158
- .c-header { display: flex; align-items: center; flex-wrap: wrap; gap: 4px; }
159
- .c-user { font-size: 0.84rem; font-weight: 600; color: #0f0f0f; }
160
- .c-time { font-size: 0.75rem; color: #909090; margin-left: 4px; }
161
- .c-text { font-size: 0.88rem; color: #2d2d2d; margin-top: 4px; line-height: 1.55; }
162
- .c-text.toxic {
163
- background: #fff5f5;
164
- border-left: 3px solid #cc0000;
165
- padding: 6px 10px; border-radius: 0 6px 6px 0;
166
- margin-top: 6px;
167
- }
168
- .c-flagged { font-size: 0.77rem; color: #cc0000; font-weight: 500; margin-top: 4px; }
169
-
170
- /* ── Toxicity bar inline ── */
171
- .tox-row {
172
- display: flex; align-items: center; gap: 8px;
173
- font-size: 0.8rem; color: #606060; margin-top: 6px; flex-wrap: wrap;
174
- }
175
- .tox-bar-bg {
176
- flex: 1; max-width: 120px;
177
- background: #e5e5e5; border-radius: 4px; height: 6px;
178
- }
179
- .tox-bar-fill { height: 6px; border-radius: 4px; }
180
-
181
- /* ── Sugeridos ── */
182
- .sug-card {
183
- display: flex; gap: 8px; margin-bottom: 10px;
184
- cursor: pointer;
185
- }
186
- .sug-thumb {
187
- width: 120px; min-width: 120px; height: 68px;
188
- background: #1a1a2e; border-radius: 6px;
189
- display: flex; align-items: center; justify-content: center;
190
- font-size: 1.4rem; flex-shrink: 0;
191
- }
192
- .sug-title { font-size: 0.82rem; font-weight: 600; color: #0f0f0f; line-height: 1.3; }
193
- .sug-ch { font-size: 0.75rem; color: #606060; margin-top: 2px; }
194
- .sug-meta { font-size: 0.72rem; color: #909090; }
195
-
196
- /* ── Section header ── */
197
- .sec-title {
198
- font-size: 1rem; font-weight: 700; color: #0f0f0f;
199
- margin: 1.25rem 0 0.75rem; padding-bottom: 0.5rem;
200
- border-bottom: 1px solid #e5e5e5;
201
- }
202
-
203
- /* ── Modal body fixes ── */
204
- [data-testid="stDialog"] { background: #ffffff; }
205
-
206
- /* ── Hub cards ── */
207
- .hub-card {
208
- background: #ffffff; border: 1px solid #e5e5e5;
209
- border-radius: 12px; padding: 1rem;
210
- }
211
- .hub-kpi-label { font-size: 0.72rem; color: #606060; text-transform: uppercase;
212
- letter-spacing: 0.5px; margin-bottom: 4px; }
213
- .hub-kpi-val { font-size: 1.8rem; font-weight: 700; color: #0f0f0f; }
214
-
215
- /* ── Model cards (settings) ── */
216
- .model-card {
217
- background: #ffffff; border: 1.5px solid #e5e5e5;
218
- border-radius: 10px; padding: 14px 16px; margin-bottom: 8px;
219
- }
220
- .model-card.active {
221
- border-color: #cc0000; background: #fff5f5;
222
- }
223
- .model-card-name { font-size: 0.95rem; font-weight: 600; color: #0f0f0f; }
224
- .model-card-desc { font-size: 0.8rem; color: #606060; margin-top: 3px; }
225
- .model-pill {
226
- display: inline-block; background: #f0f0f0; color: #333;
227
- border-radius: 6px; padding: 2px 8px; font-size: 0.73rem; margin-right: 4px;
228
- }
229
- </style>
230
- """, unsafe_allow_html=True)
231
-
232
-
233
- # ── Session state init ────────────────────────────────────────────────────────
234
- def _init_state():
235
- defaults = {
236
- "page" : "Home",
237
- "selected_model": list(AVAILABLE_MODELS.keys())[0],
238
- "threshold" : 0.5,
239
- "pending_modal" : None, # dict con el comentario pendiente de decisión
240
- "comments": [
241
- {"user": "user_prime", "initial": "U",
242
- "text": "Excelente video, muy informativo!", "time": "1 h",
243
- "is_toxic": False, "probability": 0.04, "labels": []},
244
- {"user": "troll_master", "initial": "T",
245
- "text": "Esto es una basura completa", "time": "30 min",
246
- "is_toxic": True, "probability": 0.91, "labels": ["Insulto","Agresividad"]},
247
- {"user": "curious_viewer", "initial": "C",
248
- "text": "¿Alguien puede explicar esto mejor?", "time": "15 min",
249
- "is_toxic": False, "probability": 0.07, "labels": []},
250
- ],
251
- "hub_history": [
252
- {"Usuario": "@user_992", "Comentario": '"No puedo creer que seas tan..."', "Score": 0.94, "Acción": "🚫 Bloqueado"},
253
- {"Usuario": "@alpha_mod", "Comentario": '"Spam repetitivo de enlaces."', "Score": 0.82, "Acción": "🚩 Revisión"},
254
- {"Usuario": "@anon_404", "Comentario": '"Discurso de odio en contexto."', "Score": 0.98, "Acción": "📋 Archivado"},
255
- {"Usuario": "@user_123", "Comentario": '"¡Gran contenido, sigan!"', "Score": 0.03, "Acción": "✅ Aprobado"},
256
- {"Usuario": "@viewer_x", "Comentario": '"Esta gente debería desaparecer."',"Score": 0.97, "Acción": "🚫 Bloqueado"},
257
- ],
258
- }
259
- for k, v in defaults.items():
260
- if k not in st.session_state:
261
- st.session_state[k] = v
262
-
263
- _init_state()
264
-
265
-
266
- # ── Model cache ───────────────────────────────────────────────────────────────
267
- @st.cache_resource(show_spinner="Cargando modelo...")
268
- def get_service(model_name: str) -> ModelService:
269
- return ModelService(model_name, PROJECT_ROOT)
270
-
271
-
272
- # ══════════════════════════════════════════════════════════════════════════════
273
- # SIDEBAR
274
- # ══════════════════════════════════════════════════════════════════════════════
275
- def render_sidebar():
276
- with st.sidebar:
277
- # Logo
278
- st.markdown(
279
- "<div style='padding:0.5rem 0 0.25rem; font-size:1.3rem; font-weight:700;'>"
280
- "🎬 <span style='color:#cc0000'>Signal</span>Mod</div>"
281
- "<div style='font-size:0.65rem; color:#aaa; margin-bottom:1.2rem;'>"
282
- "Signal within the Noise</div>",
283
- unsafe_allow_html=True,
284
- )
285
-
286
- nav = {"Home": "🏠", "Moderator Hub": "📊", "Settings": "⚙️"}
287
- for page, icon in nav.items():
288
- label = f"{icon} {page}"
289
- clicked = st.button(label, key=f"nav_{page}", use_container_width=True)
290
- if clicked:
291
- st.session_state.page = page
292
- st.rerun()
293
-
294
- st.divider()
295
-
296
- # Info modelo activo
297
- model_short = st.session_state.selected_model.split("(")[0].strip()
298
- tox_cnt = sum(1 for c in st.session_state.comments if c["is_toxic"])
299
- total_c = len(st.session_state.comments)
300
-
301
- st.markdown(
302
- f"<div class='sidebar-model-info'>"
303
- f"Modelo activo<br><strong>{html.escape(model_short)}</strong>"
304
- f"<br><br>Comentarios: <strong>{total_c}</strong>"
305
- f" · Tóxicos: <strong style='color:#cc0000'>{tox_cnt}</strong>"
306
- f"</div>",
307
- unsafe_allow_html=True,
308
- )
309
-
310
-
311
- # ══════════════════════════════════════════════════════════════════════════════
312
- # MODAL — toxicidad detectada
313
- # ══════════════════════════════════════════════════════════════════════════════
314
- @st.dialog("⚠️ Aviso de Toxicidad Detectada")
315
- def show_toxicity_modal():
316
- """
317
- @st.dialog crea una ventana modal nativa de Streamlit (1.32+).
318
- Cuando se llama a la función decorada, Streamlit renderiza el contenido
319
- dentro de un overlay modal y pausa la ejecución normal del script.
320
- """
321
- data = st.session_state.pending_modal
322
- if not data:
323
- st.rerun()
324
- return
325
-
326
- text = data["text"]
327
- prob = data["probability"]
328
- lbls = data["labels"]
329
- pct = int(prob * 100)
330
- color = "#cc0000" if pct >= 70 else "#ff6d00" if pct >= 40 else "#f5a623"
331
-
332
- st.markdown(
333
- "<div style='text-align:center; font-size:3rem; color:#cc0000'>⚠️</div>",
334
- unsafe_allow_html=True,
335
- )
336
- st.markdown(
337
- f"<div style='background:#f8f8f8; border-radius:8px; padding:12px 16px;"
338
- f"font-style:italic; color:#333; text-align:center; margin:8px 0;'>"
339
- f"&quot;{html.escape(text[:140])}{'...' if len(text)>140 else ''}&quot;</div>",
340
- unsafe_allow_html=True,
341
- )
342
-
343
- # Barra de toxicidad
344
- st.markdown(
345
- f"<div style='display:flex; justify-content:space-between; "
346
- f"font-size:0.82rem; color:#606060; margin-top:12px;'>"
347
- f"<span>ÍNDICE DE TOXICIDAD</span>"
348
- f"<span style='color:{color}; font-weight:700'>{pct}%</span></div>"
349
- f"<div style='background:#e5e5e5; border-radius:4px; height:8px; margin-top:4px;'>"
350
- f"<div style='width:{pct}%; background:{color}; height:8px; border-radius:4px;'></div>"
351
- f"</div>",
352
- unsafe_allow_html=True,
353
- )
354
-
355
- # Etiquetas
356
- if lbls:
357
- tags = " ".join(
358
- f"<span style='background:#ffe5e5; color:#cc0000; border-radius:14px;"
359
- f"padding:3px 10px; font-size:0.76rem; font-weight:600; margin:3px;'>"
360
- f"🚩 {html.escape(l)}</span>"
361
- for l in lbls
362
- )
363
- st.markdown(f"<div style='margin-top:10px'>{tags}</div>", unsafe_allow_html=True)
364
-
365
- st.markdown("<br>", unsafe_allow_html=True)
366
-
367
- col1, col2 = st.columns(2)
368
- with col1:
369
- if st.button("✏️ Editar comentario", use_container_width=True, type="primary"):
370
- st.session_state.pending_modal = None
371
- st.rerun()
372
- with col2:
373
- if st.button("Publicar de todas maneras", use_container_width=True):
374
- # Publicar aunque sea tóxico
375
- c = st.session_state.pending_modal
376
- st.session_state.comments.append(c)
377
- st.session_state.hub_history.insert(0, {
378
- "Usuario" : "@usuario",
379
- "Comentario": f'"{c["text"][:45]}..."',
380
- "Score" : round(c["probability"], 2),
381
- "Acción" : "⚠️ Override usuario",
382
- })
383
- st.session_state.pending_modal = None
384
- st.rerun()
385
-
386
-
387
- # ══════════════════════════════════════════════════════════════════════════════
388
- # HOME — interfaz estilo YouTube
389
- # ══════════════════════════════════════════════════════════════════════════════
390
- def render_home():
391
- # Disparar modal si hay comentario pendiente
392
- if st.session_state.pending_modal:
393
- show_toxicity_modal()
394
-
395
- col_main, col_right = st.columns([2.8, 1], gap="large")
396
-
397
- with col_main:
398
- # Video
399
- st.markdown(
400
- "<div class='video-thumb'><div class='play-btn'>▶</div></div>",
401
- unsafe_allow_html=True,
402
- )
403
- st.markdown(
404
- "<div class='video-title'>AI Moderation Demo — Detección de Hate Speech en tiempo real</div>"
405
- "<div class='video-meta'>15k vistas · 2 horas atrás</div>",
406
- unsafe_allow_html=True,
407
- )
408
- row_ch, row_sub = st.columns([3, 1])
409
- with row_ch:
410
- st.markdown(
411
- "<div style='display:flex; align-items:center; gap:10px; margin:10px 0;'>"
412
- "<div style='width:36px; height:36px; border-radius:50%; background:#cc0000;"
413
- "display:flex; align-items:center; justify-content:center; color:#fff;"
414
- "font-weight:700;'>S</div>"
415
- "<div><div class='channel-name'>SignalMod AI</div>"
416
- "<div class='video-meta'>1.2M suscriptores</div></div></div>",
417
- unsafe_allow_html=True,
418
- )
419
-
420
- st.divider()
421
-
422
- # ── Comentarios ────────────────────────────────────────────────────
423
- tox_cnt = sum(1 for c in st.session_state.comments if c["is_toxic"])
424
- st.markdown(
425
- f"<div class='sec-title'>{len(st.session_state.comments)} Comentarios "
426
- f"<span style='font-size:0.8rem; color:#cc0000;'>· {tox_cnt} detectados</span></div>",
427
- unsafe_allow_html=True,
428
- )
429
-
430
- # Input de nuevo comentario
431
- new_text = st.text_area(
432
- "Escribe un comentario...",
433
- height=80, label_visibility="collapsed",
434
- key="comment_input",
435
- placeholder="Escribe un comentario...",
436
- )
437
-
438
- # Análisis en tiempo real (solo cuando hay texto)
439
- analysis = None
440
- if new_text.strip():
441
- svc = get_service(st.session_state.selected_model)
442
- analysis = svc.predict(new_text)
443
- pct = int(analysis["probability"] * 100)
444
- color = "#cc0000" if pct >= 70 else "#f5a623" if pct >= 40 else "#00c853"
445
- verdict = "TÓXICO" if analysis["is_toxic"] else "SEGURO"
446
- v_color = "#cc0000" if analysis["is_toxic"] else "#00c853"
447
- st.markdown(
448
- f"<div class='tox-row'>"
449
- f"<span>🔍 Analizando...</span>"
450
- f"<span style='background:{v_color}; color:#fff; border-radius:10px;"
451
- f"padding:1px 9px; font-size:0.72rem; font-weight:700;'>{verdict}</span>"
452
- f"<span style='color:{color}; font-weight:600;'>Toxicidad: {pct}%</span>"
453
- f"<div class='tox-bar-bg'>"
454
- f"<div class='tox-bar-fill' style='width:{pct}%; background:{color};'></div>"
455
- f"</div></div>",
456
- unsafe_allow_html=True,
457
- )
458
-
459
- col_c, col_p = st.columns([1, 1])
460
- with col_c:
461
- if st.button("Cancelar", use_container_width=True):
462
- st.rerun()
463
- with col_p:
464
- post = st.button("Comentar", type="primary", use_container_width=True)
465
-
466
- # Procesar envío
467
- if post and new_text.strip():
468
- if analysis is None:
469
- svc = get_service(st.session_state.selected_model)
470
- analysis = svc.predict(new_text)
471
-
472
- comment_obj = {
473
- "user" : "usuario",
474
- "initial" : "U",
475
- "text" : new_text.strip(),
476
- "time" : "ahora",
477
- "is_toxic" : analysis["is_toxic"],
478
- "probability": analysis["probability"],
479
- "labels" : analysis["labels"],
480
- }
481
-
482
- if analysis["is_toxic"]:
483
- # Guardar en pendiente y mostrar modal en el próximo render
484
- st.session_state.pending_modal = comment_obj
485
- st.rerun()
486
- else:
487
- # Publicar directamente
488
- st.session_state.comments.append(comment_obj)
489
- st.session_state.hub_history.insert(0, {
490
- "Usuario" : "@usuario",
491
- "Comentario": f'"{new_text.strip()[:45]}{"..." if len(new_text)>45 else ""}"',
492
- "Score" : round(analysis["probability"], 2),
493
- "Acción" : "✅ Aprobado",
494
- })
495
- st.rerun()
496
-
497
- # ── Lista de comentarios ───────────────────────────────────────────
498
- for c in reversed(st.session_state.comments):
499
- is_tox = c["is_toxic"]
500
- pct = int(c["probability"] * 100)
501
- av_class = "c-avatar" if is_tox else "c-avatar safe"
502
- badge = (
503
- "<span class='badge badge-toxic'>TÓXICO</span>" if is_tox
504
- else "<span class='badge badge-safe'>SEGURO</span>"
505
- )
506
- text_class = "c-text toxic" if is_tox else "c-text"
507
- flagged = "<div class='c-flagged'>🚩 Flagged for review</div>" if is_tox else ""
508
-
509
- # html.escape() protege contra caracteres que rompen el HTML
510
- safe_text = html.escape(c["text"])
511
- safe_user = html.escape(c["user"])
512
- initial = html.escape(c.get("initial", c["user"][0].upper()))
513
-
514
- st.markdown(
515
- f"<div class='comment-wrap'>"
516
- f" <div class='{av_class}'>{initial}</div>"
517
- f" <div class='c-body'>"
518
- f" <div class='c-header'>"
519
- f" <span class='c-user'>@{safe_user}</span>"
520
- f" <span class='c-time'>{c['time']}</span>"
521
- f" {badge}"
522
- f" </div>"
523
- f" <div class='{text_class}'>{safe_text}</div>"
524
- f" {flagged}"
525
- f" </div>"
526
- f"</div>",
527
- unsafe_allow_html=True,
528
- )
529
-
530
- # ── Columna derecha ────────────────────────────────────────────────────
531
- with col_right:
532
- st.markdown("**Sugeridos**")
533
- suggested = [
534
- ("🤖", "Understanding Transformer Models...", "Neural Systems", "89k · 1 día"),
535
- ("🎓", "The Future of Content Moderation", "Tech Ethics Pro", "1.4M · 2 sem"),
536
- ("📡", "Signal vs Noise: SignalMod Deep Dive","SignalMod AI", "250k · 3 días"),
537
- ("💡", "Why AI Moderation is Harder Than...", "Ethics in Code", "45k · 5 h"),
538
- ("🔬", "Hate Speech Detection 2024", "AI Research Lab", "12k · 1 sem"),
539
- ]
540
- for emoji, title, ch, meta in suggested:
541
- st.markdown(
542
- f"<div class='sug-card'>"
543
- f" <div class='sug-thumb'>{emoji}</div>"
544
- f" <div>"
545
- f" <div class='sug-title'>{html.escape(title)}</div>"
546
- f" <div class='sug-ch'>{html.escape(ch)}</div>"
547
- f" <div class='sug-meta'>{html.escape(meta)}</div>"
548
- f" </div>"
549
- f"</div>",
550
- unsafe_allow_html=True,
551
- )
552
-
553
-
554
- # ══════════════════════════════════════════════════════════════════════════════
555
- # MODERATOR HUB
556
- # ══════════════════════════════════════════════════════════════════════════════
557
- def render_hub():
558
- try:
559
- import plotly.graph_objects as go
560
- except ImportError:
561
- st.error("Instala plotly: pip install plotly")
562
- return
563
-
564
- st.markdown("## 📊 Panel de Estadísticas")
565
-
566
- # ── Cards de configuración ──────────────────────────────────────────────
567
- model_short = st.session_state.selected_model.split("(")[0].strip()
568
- c1, c2, c3 = st.columns(3)
569
- for col, label, val in [
570
- (c1, "MODEL ARCHITECTURE", model_short),
571
- (c2, "CONFIDENCE THRESHOLD", f"{st.session_state.threshold:.2f} Alpha"),
572
- (c3, "LANGUAGE COVERAGE", "English"),
573
- ]:
574
- with col:
575
- st.markdown(
576
- f"<div class='hub-card'>"
577
- f"<div class='hub-kpi-label'>{label}</div>"
578
- f"<div style='font-weight:600; font-size:0.95rem; color:#0f0f0f;'>"
579
- f"{html.escape(str(val))}</div></div>",
580
- unsafe_allow_html=True,
581
- )
582
-
583
- st.write("")
584
-
585
- # ── KPIs ────────��──────────────────────────────────────────────────────
586
- total = len(st.session_state.comments) + 100
587
- tox_cnt = sum(1 for c in st.session_state.comments if c["is_toxic"]) + 5
588
- tox_rate = tox_cnt / total * 100
589
- m1, m2, m3 = st.columns(3)
590
- m1.metric("💬 Total comentarios", f"{total:,}", "+12%")
591
- m2.metric("☠️ Tasa de toxicidad", f"{tox_rate:.1f}%",
592
- f"+0.8%", delta_color="inverse")
593
- m3.metric("🎯 F1 Score", "0.7579", "Stable")
594
-
595
- st.divider()
596
-
597
- # ── Gráficos ───────────────────────────────────────────────────────────
598
- gcol, pcol = st.columns([2.2, 1])
599
-
600
- with gcol:
601
- days = ["Lun","Mar","Mié","Jue","Vie","Sáb","Dom"]
602
- vals = [random.randint(30, 80) for _ in days]
603
- vals[3] = max(vals) + 25
604
- colors = ["#cc0000" if i == 3 else "#b3c6ff" for i in range(7)]
605
- fig = go.Figure(go.Bar(x=days, y=vals, marker_color=colors, width=0.55))
606
- fig.update_layout(
607
- title="Tendencias de Toxicidad (7D)",
608
- paper_bgcolor="#ffffff", plot_bgcolor="#ffffff",
609
- margin=dict(l=20, r=20, t=40, b=20), height=260,
610
- font=dict(size=11, color="#0f0f0f"),
611
- )
612
- fig.update_yaxes(showgrid=True, gridcolor="#f0f0f0", zeroline=False)
613
- fig.update_xaxes(showgrid=False)
614
- st.plotly_chart(fig, use_container_width=True)
615
-
616
- with pcol:
617
- fig2 = go.Figure(go.Pie(
618
- labels=["Hate Speech","Insulto","Agresividad"],
619
- values=[45, 35, 20],
620
- hole=0.58,
621
- marker_colors=["#cc0000","#0f0f0f","#909090"],
622
- textfont_size=11,
623
- ))
624
- fig2.update_layout(
625
- title="Categorías",
626
- paper_bgcolor="#ffffff",
627
- margin=dict(l=10, r=10, t=40, b=10), height=260,
628
- legend=dict(font=dict(size=10), orientation="v"),
629
- font=dict(size=11, color="#0f0f0f"),
630
- )
631
- st.plotly_chart(fig2, use_container_width=True)
632
-
633
- # ── Historial ──────────────────────────────────────────────────────────
634
- st.markdown("### Historial Reciente")
635
- df = pd.DataFrame(st.session_state.hub_history)
636
- if not df.empty:
637
- st.dataframe(
638
- df, use_container_width=True, hide_index=True,
639
- column_config={
640
- "Score": st.column_config.ProgressColumn(
641
- "Score", min_value=0, max_value=1, format="%.2f"
642
- )
643
- },
644
- )
645
-
646
-
647
- # ══════════════════════════════════════════════════════════════════════════════
648
- # SETTINGS
649
- # ══════════════════════════════════════════════════════════════════════════════
650
- def render_settings():
651
- st.markdown("## ⚙️ Ajustes")
652
-
653
- # ── Selección de modelo ─────────────────────────────────────────────────
654
- st.markdown("### 🤖 Modelo de detección",)
655
- st.caption(
656
- "Los modelos HuggingFace se descargan la primera vez (~300–600 MB). "
657
- "Requieren: `pip install transformers torch sentencepiece`"
658
- )
659
- st.write("")
660
-
661
- # Usamos st.radio para la selección — sin bugs de HTML
662
- model_names = list(AVAILABLE_MODELS.keys())
663
- current_idx = model_names.index(st.session_state.selected_model) \
664
- if st.session_state.selected_model in model_names else 0
665
-
666
- chosen = st.radio(
667
- "Seleccionar modelo",
668
- model_names,
669
- index=current_idx,
670
- label_visibility="collapsed",
671
- )
672
-
673
- if chosen != st.session_state.selected_model:
674
- st.session_state.selected_model = chosen
675
- st.rerun()
676
-
677
- # Ficha del modelo seleccionado
678
- info = AVAILABLE_MODELS[st.session_state.selected_model]
679
- st.markdown(
680
- f"<div class='model-card active'>"
681
- f"<div class='model-card-name'>{info['icon']} {html.escape(st.session_state.selected_model)}</div>"
682
- f"<div class='model-card-desc'>{html.escape(info['description'])}</div>"
683
- f"<div style='margin-top:8px;'>"
684
- f"<span class='model-pill'>⚡ {html.escape(info['speed'])}</span>"
685
- f"<span class='model-pill'>🎯 {html.escape(info['accuracy'])}</span>"
686
- f"<span class='model-pill'>📦 {html.escape(info['requires'])}</span>"
687
- f"</div></div>",
688
- unsafe_allow_html=True,
689
- )
690
-
691
- # Info sobre modelo fine-tuneado
692
- if st.session_state.selected_model == "Modelo fine-tuneado (local)":
693
- path = PROJECT_ROOT / "models" / "finetuned_hf"
694
- if path.exists():
695
- st.success(f"✅ Modelo encontrado en `{path}`")
696
- else:
697
- st.warning(
698
- f"⚠️ No se encontró el modelo en `{path}`. "
699
- f"Ejecuta el **notebook 08** para generar el modelo fine-tuneado."
700
- )
701
-
702
- st.divider()
703
-
704
- # ── Umbral de confianza ─────────────────────────────────────────────────
705
- st.markdown("### 🎚️ Umbral de confianza")
706
- st.caption("Probabilidad mínima para marcar un comentario como tóxico.")
707
-
708
- new_thr = st.slider(
709
- "Umbral",
710
- min_value=0.3, max_value=0.9, step=0.05,
711
- value=st.session_state.threshold,
712
- label_visibility="collapsed",
713
- format="%.2f",
714
- )
715
- if new_thr != st.session_state.threshold:
716
- st.session_state.threshold = new_thr
717
- st.info(f"Umbral actualizado: **{new_thr:.2f}**")
718
-
719
- ta, tb = st.columns(2)
720
- ta.info(f"⬇️ **{new_thr:.2f}** bajo → más FP (más censura)", icon="⚠️")
721
- tb.info(f"⬆️ **{new_thr:.2f}** alto → más FN (más escapes)", icon="⚠️")
722
-
723
- st.divider()
724
-
725
- # ── Test rápido ─────────────────────────────────────────────────────────
726
- st.markdown("### 🧪 Probar modelo")
727
- test_txt = st.text_input(
728
- "Texto a analizar",
729
- placeholder="Ej: This is absolutely stupid and racist...",
730
- label_visibility="collapsed",
731
- )
732
- if st.button("Analizar", type="primary") and test_txt.strip():
733
- with st.spinner("Analizando..."):
734
- svc = get_service(st.session_state.selected_model)
735
- res = svc.predict(test_txt)
736
-
737
- pct = int(res["probability"] * 100)
738
- verdict = "🔴 TÓXICO" if res["is_toxic"] else "🟢 SEGURO"
739
- st.markdown(f"**{verdict}** — {pct}% de toxicidad")
740
- st.progress(res["probability"])
741
- if res["labels"]:
742
- st.markdown(f"**Categorías:** {', '.join(res['labels'])}")
743
- if "error" in res:
744
- st.error(f"Error: {res['error']}")
745
- st.caption(f"Modelo: {res['model_used']}")
746
-
747
-
748
- # ══════════════════════════════════════════════════════════════════════════════
749
- # MAIN
750
- # ══════════════════════════════════════════════════════════════════════════════
751
- def main():
752
- render_sidebar()
753
-
754
- page = st.session_state.page
755
- if page == "Home":
756
- render_home()
757
- elif page == "Moderator Hub":
758
- render_hub()
759
- elif page == "Settings":
760
- render_settings()
761
-
762
-
763
- if __name__ == "__main__":
764
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/evaluation/.gitkeep DELETED
File without changes
src/service/model_catalog.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Load inference model catalog from configs/model_catalog.yaml."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ import yaml
9
+
10
+ PROJECT_ROOT = Path(__file__).resolve().parents[2]
11
+ CATALOG_PATH = PROJECT_ROOT / "configs" / "model_catalog.yaml"
12
+
13
+ _DEFAULT_CATALOG: dict[str, dict[str, Any]] = {
14
+ "LR + TF-IDF (local)": {
15
+ "type": "local",
16
+ "icon": "⚡",
17
+ "description": "Project baseline.",
18
+ "speed": "< 50ms",
19
+ "accuracy": "F1 0.76",
20
+ "requires": "joblib only",
21
+ },
22
+ }
23
+
24
+
25
def load_model_catalog() -> dict[str, dict[str, Any]]:
    """Return the inference model catalog, falling back to the built-in default.

    The catalog is read from ``CATALOG_PATH``. The built-in default is used
    when that file does not exist, is empty, or does not hold a non-empty
    mapping at its top level.

    Returns:
        Mapping of model display name to its configuration entry. The
        fallback is a deep copy, so callers that mutate an entry cannot
        alter ``_DEFAULT_CATALOG`` itself.
    """
    import copy  # local import: only needed on the fallback paths

    if not CATALOG_PATH.exists():
        return copy.deepcopy(_DEFAULT_CATALOG)
    with CATALOG_PATH.open(encoding="utf-8") as f:
        # An empty YAML document parses to None; normalise it to {}.
        data = yaml.safe_load(f) or {}
    if not isinstance(data, dict) or not data:
        return copy.deepcopy(_DEFAULT_CATALOG)
    return data
src/service/model_service.py CHANGED
@@ -1,99 +1,99 @@
1
- """
2
- src/services/model_service.py
3
 
4
- Servicio centralizado de predicción de toxicidad.
5
-
6
- Modelos soportados:
7
- local → models/final_model.joblib (LR + TF-IDF, instantáneo)
8
- hf_remote → HuggingFace Hub (requiere internet + transformers)
9
- hf_local → modelo HF fine-tuneado localmente (notebook 08)
10
-
11
- Instalación para modelos HF:
12
- pip install transformers torch sentencepiece accelerate
13
- """
14
 
15
  import re
16
- import yaml
17
- import joblib
18
  from pathlib import Path
19
- from typing import Optional
20
-
21
- # ─── Catálogo de modelos ──────────────────────────────────────────────────────
22
- AVAILABLE_MODELS = {
23
- "LR + TF-IDF (local)": {
24
- "type" : "local",
25
- "icon" : "⚡",
26
- "description": "Modelo del proyecto. Sin GPU, instantáneo.",
27
- "speed" : "< 50ms",
28
- "accuracy" : "F1 0.76",
29
- "requires" : "Solo joblib",
30
- },
31
- "DistilBERT Toxicity": {
32
- "type" : "hf_remote",
33
- "icon" : "🤖",
34
- "model_id" : "martin-ha/toxic-comment-model",
35
- "description": "DistilBERT fine-tuned en comentarios tóxicos.",
36
- "speed" : "~200ms CPU",
37
- "accuracy" : "F1 0.85",
38
- "requires" : "transformers torch",
39
- },
40
- "toxic-bert (multilabel)": {
41
- "type" : "hf_remote",
42
- "icon" : "🧠",
43
- "model_id" : "unitary/toxic-bert",
44
- "description": "BERT multi-label (Jigsaw). Detecta 6 categorías.",
45
- "speed" : "~400ms CPU",
46
- "accuracy" : "F1 0.88",
47
- "requires" : "transformers torch",
48
- },
49
- "RoBERTa Toxicity": {
50
- "type" : "hf_remote",
51
- "icon" : "🔬",
52
- "model_id" : "s-nlp/roberta_toxicity_classifier",
53
- "description": "RoBERTa fine-tuned para toxicidad general.",
54
- "speed" : "~350ms CPU",
55
- "accuracy" : "F1 0.87",
56
- "requires" : "transformers torch",
57
- },
58
- "Modelo fine-tuneado (local)": {
59
- "type" : "hf_local",
60
- "icon" : "✨",
61
- "model_path" : "models/finetuned_hf",
62
- "description": "Tu modelo fine-tuneado en el notebook 08.",
63
- "speed" : "Depende del hardware",
64
- "accuracy" : "A evaluar",
65
- "requires" : "transformers torch",
66
- },
67
- }
 
 
 
 
 
68
 
69
  HF_LABEL_MAP = {
70
- "toxic": "Tóxico", "severe_toxic": "Muy ofensivo",
71
- "obscene": "Obsceno", "threat": "Amenaza",
72
- "insult": "Insulto", "identity_hate": "Odio racial",
73
- "label_1": "Tóxico",
 
 
 
74
  }
75
 
76
  _KEYWORD_LABELS = {
77
- "Insulto" : ["idiot","stupid","dumb","fool","moron","loser"],
78
- "Odio racial": ["thug","racist","race","criminal"],
79
- "Amenaza" : ["kill","shoot","die","dead","hurt","attack"],
80
- "Obsceno" : ["fuck","shit","ass","bitch","cunt","bastard"],
81
- "Agresividad": ["hate","despise","disgusting","pathetic","worthless"],
82
  }
83
 
84
 
85
- def _labels_from_keywords(text: str, probability: float) -> list:
86
  t = text.lower()
87
  found = [lbl for lbl, kws in _KEYWORD_LABELS.items() if any(k in t for k in kws)]
88
- return found if found else (["Contenido ofensivo"] if probability >= 0.5 else [])
89
 
90
 
91
  class _FallbackPreprocessor:
92
- _SW = {"the","a","an","and","or","but","in","on","at","to","for",
93
- "of","with","is","it","this","that","are","was","be","have",
94
- "has","he","she","they","we","you","i","not","do","did",
95
- "will","can","would","should","could","from","by","as","if"}
96
- def transform(self, text):
 
 
 
97
  t = re.sub(r"http\S+|www\.\S+|@\w+", " ", str(text).lower())
98
  t = re.sub(r"[^\x00-\x7F]+", " ", t)
99
  t = re.sub(r"[^a-z\s]", " ", t)
@@ -103,10 +103,10 @@ class _FallbackPreprocessor:
103
 
104
  class ModelService:
105
  def __init__(self, model_name: str, project_root: Optional[Path] = None):
106
- self.model_name = model_name
107
- self.cfg = AVAILABLE_MODELS.get(model_name) or list(AVAILABLE_MODELS.values())[0]
108
  self.project_root = project_root or Path.cwd()
109
- self._model = None
110
  self._preprocessor = None
111
 
112
  def _get_model(self):
@@ -119,84 +119,95 @@ class ModelService:
119
  elif t == "hf_local":
120
  path = self.project_root / self.cfg["model_path"]
121
  if not path.exists():
122
- raise FileNotFoundError(
123
- f"Modelo no encontrado en {path}. Ejecuta el notebook 08 primero."
124
- )
125
  self._load_hf(str(path))
126
  return self._model
127
 
128
- def _load_local(self):
129
- for name in ["final_model.joblib","lr_tuned.joblib",
130
- "lr_baseline.joblib","best_ensemble.joblib"]:
131
  p = self.project_root / "models" / name
132
  if p.exists():
133
  self._model = joblib.load(p)
134
  break
135
  if self._model is None:
136
- raise FileNotFoundError(f"No hay modelo en {self.project_root / 'models'}")
137
  try:
138
- import sys; sys.path.insert(0, str(self.project_root))
139
  from src.features.text_preprocessor import TextPreprocessor
 
140
  self._preprocessor = TextPreprocessor(
141
  config_path=str(self.project_root / "configs" / "features.yaml")
142
  )
143
  except Exception:
144
  self._preprocessor = _FallbackPreprocessor()
145
 
146
- def _load_hf(self, model_id_or_path: str):
147
  try:
148
  from transformers import pipeline as hf_pipeline
149
- except ImportError:
150
- raise ImportError("Instala: pip install transformers torch sentencepiece")
151
  self._model = hf_pipeline(
152
- "text-classification", model=model_id_or_path,
153
- return_all_scores=True, truncation=True, max_length=512,
 
 
 
154
  )
155
 
156
  def predict(self, text: str) -> dict:
157
  if not text or not text.strip():
158
- return {"is_toxic": False, "probability": 0.0,
159
- "labels": [], "model_used": self.model_name}
160
  try:
161
  model = self._get_model()
162
  if self.cfg["type"] == "local":
163
  return self._pred_local(text, model)
164
  return self._pred_hf(text, model)
165
  except Exception as e:
166
- return {"is_toxic": False, "probability": 0.0,
167
- "labels": [], "model_used": self.model_name, "error": str(e)}
168
-
169
- def _pred_local(self, text, model):
 
 
 
 
 
170
  clean = self._preprocessor.transform(text) or text
171
  proba = float(model.predict_proba([clean])[0][1])
172
- tox = proba >= 0.5
173
- return {"is_toxic": tox, "probability": proba,
174
- "labels": _labels_from_keywords(text, proba) if tox else [],
175
- "model_used": self.model_name}
176
-
177
- def _pred_hf(self, text, pipeline_fn):
178
- raw = pipeline_fn(text[:512])
179
- smap = {s["label"].lower(): s["score"] for s in (raw[0] if isinstance(raw[0], list) else raw)}
180
- for key in ("label_1","toxic","toxic_1"):
 
 
 
 
181
  if key in smap:
182
- proba = smap[key]; break
 
183
  else:
184
- neg = {"label_0","non_toxic","not_toxic","not toxic"}
185
- vals = [v for k,v in smap.items() if k not in neg]
186
  proba = max(vals) if vals else 0.0
187
  tox = proba >= 0.5
188
- labels = []
189
  if tox:
190
- for k,v in smap.items():
191
- if k not in ("label_0","non_toxic") and v >= 0.35:
192
- friendly = HF_LABEL_MAP.get(k, k.replace("_"," ").title())
193
- if "no tóxico" not in friendly.lower():
194
- labels.append(friendly)
195
  if not labels:
196
- labels = ["Contenido ofensivo"]
197
- return {"is_toxic": tox, "probability": proba,
198
- "labels": labels, "model_used": self.model_name}
199
 
200
  @staticmethod
201
- def get_available_models(): return AVAILABLE_MODELS
202
- def get_model_info(self): return self.cfg
 
 
 
 
1
+ """Centralized toxicity prediction service."""
 
2
 
3
+ from __future__ import annotations
 
 
 
 
 
 
 
 
 
4
 
5
  import re
6
+ import sys
 
7
  from pathlib import Path
8
+ from typing import Any, Optional
9
+
10
+ import joblib
11
+
12
+ from src.service.model_catalog import load_model_catalog
13
+
14
+ AVAILABLE_MODELS: dict[str, dict[str, Any]] = load_model_catalog()
15
+
16
+ _HF_DEPS_MSG = "Install HF deps: uv sync --extra hf"
17
+
18
+
19
def hf_deps_available() -> bool:
    """Report whether the ``transformers`` package can be imported."""
    try:
        import transformers  # noqa: F401
    except ImportError:
        return False
    return True
26
+
27
+
28
def check_model_availability(name: str, project_root: Path | None = None) -> tuple[bool, str | None]:
    """Return ``(available, reason)`` for a catalog model name.

    Args:
        name: Key to look up in ``AVAILABLE_MODELS``.
        project_root: Directory that model paths are resolved against;
            defaults to the current working directory.

    Returns:
        ``(True, None)`` when the model looks loadable, otherwise
        ``(False, reason)`` with a short human-readable explanation.
    """
    cfg = AVAILABLE_MODELS.get(name)
    if not cfg:
        return False, "Unknown model"

    root = project_root or Path.cwd()
    model_type = cfg.get("type", "local")

    if model_type == "local":
        models_dir = root / "models"
        candidates = (
            "final_model.joblib",
            "lr_tuned.joblib",
            "lr_baseline.joblib",
            "best_ensemble.joblib",
        )
        if any((models_dir / filename).exists() for filename in candidates):
            return True, None
        return False, f"No model in {models_dir}"

    if model_type in ("hf_local", "hf_remote"):
        # Both Hugging Face variants need the optional dependencies first.
        if not hf_deps_available():
            return False, _HF_DEPS_MSG
        if model_type == "hf_remote":
            return True, None
        # The catalog is loaded from YAML, so an entry may lack the key;
        # report that as "unavailable" instead of raising KeyError.
        model_path = cfg.get("model_path")
        if not model_path:
            return False, "Catalog entry has no model_path"
        path = root / model_path
        if not path.exists():
            return False, f"Model not found at {path}."
        return True, None

    return False, f"Unsupported model type: {model_type}"
62
 
63
  HF_LABEL_MAP = {
64
+ "toxic": "Toxic",
65
+ "severe_toxic": "Severely offensive",
66
+ "obscene": "Obscene",
67
+ "threat": "Threat",
68
+ "insult": "Insult",
69
+ "identity_hate": "Identity hate",
70
+ "label_1": "Toxic",
71
  }
72
 
73
  _KEYWORD_LABELS = {
74
+ "Insult": ["idiot", "stupid", "dumb", "fool", "moron", "loser"],
75
+ "Identity hate": ["thug", "racist", "race", "criminal"],
76
+ "Threat": ["kill", "shoot", "die", "dead", "hurt", "attack"],
77
+ "Obscene": ["fuck", "shit", "ass", "bitch", "cunt", "bastard"],
78
+ "Aggression": ["hate", "despise", "disgusting", "pathetic", "worthless"],
79
  }
80
 
81
 
82
+ def _labels_from_keywords(text: str, probability: float) -> list[str]:
83
  t = text.lower()
84
  found = [lbl for lbl, kws in _KEYWORD_LABELS.items() if any(k in t for k in kws)]
85
+ return found if found else (["Offensive content"] if probability >= 0.5 else [])
86
 
87
 
88
  class _FallbackPreprocessor:
89
+ _SW = {
90
+ "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
91
+ "of", "with", "is", "it", "this", "that", "are", "was", "be", "have",
92
+ "has", "he", "she", "they", "we", "you", "i", "not", "do", "did",
93
+ "will", "can", "would", "should", "could", "from", "by", "as", "if",
94
+ }
95
+
96
+ def transform(self, text: str) -> str:
97
  t = re.sub(r"http\S+|www\.\S+|@\w+", " ", str(text).lower())
98
  t = re.sub(r"[^\x00-\x7F]+", " ", t)
99
  t = re.sub(r"[^a-z\s]", " ", t)
 
103
 
104
  class ModelService:
105
  def __init__(self, model_name: str, project_root: Optional[Path] = None):
106
+ self.model_name = model_name
107
+ self.cfg = AVAILABLE_MODELS.get(model_name) or next(iter(AVAILABLE_MODELS.values()))
108
  self.project_root = project_root or Path.cwd()
109
+ self._model = None
110
  self._preprocessor = None
111
 
112
  def _get_model(self):
 
119
  elif t == "hf_local":
120
  path = self.project_root / self.cfg["model_path"]
121
  if not path.exists():
122
+ raise FileNotFoundError(f"Model not found at {path}.")
 
 
123
  self._load_hf(str(path))
124
  return self._model
125
 
126
+ def _load_local(self) -> None:
127
+ for name in ("final_model.joblib", "lr_tuned.joblib", "lr_baseline.joblib", "best_ensemble.joblib"):
 
128
  p = self.project_root / "models" / name
129
  if p.exists():
130
  self._model = joblib.load(p)
131
  break
132
  if self._model is None:
133
+ raise FileNotFoundError(f"No model in {self.project_root / 'models'}")
134
  try:
135
+ sys.path.insert(0, str(self.project_root))
136
  from src.features.text_preprocessor import TextPreprocessor
137
+
138
  self._preprocessor = TextPreprocessor(
139
  config_path=str(self.project_root / "configs" / "features.yaml")
140
  )
141
  except Exception:
142
  self._preprocessor = _FallbackPreprocessor()
143
 
144
+ def _load_hf(self, model_id_or_path: str) -> None:
145
  try:
146
  from transformers import pipeline as hf_pipeline
147
+ except ImportError as exc:
148
+ raise ImportError("Install HF deps: uv sync --extra hf") from exc
149
  self._model = hf_pipeline(
150
+ "text-classification",
151
+ model=model_id_or_path,
152
+ return_all_scores=True,
153
+ truncation=True,
154
+ max_length=512,
155
  )
156
 
157
  def predict(self, text: str) -> dict:
158
  if not text or not text.strip():
159
+ return {"is_toxic": False, "probability": 0.0, "labels": [], "model_used": self.model_name}
 
160
  try:
161
  model = self._get_model()
162
  if self.cfg["type"] == "local":
163
  return self._pred_local(text, model)
164
  return self._pred_hf(text, model)
165
  except Exception as e:
166
+ return {
167
+ "is_toxic": False,
168
+ "probability": 0.0,
169
+ "labels": [],
170
+ "model_used": self.model_name,
171
+ "error": str(e),
172
+ }
173
+
174
+ def _pred_local(self, text: str, model) -> dict:
175
  clean = self._preprocessor.transform(text) or text
176
  proba = float(model.predict_proba([clean])[0][1])
177
+ tox = proba >= 0.5
178
+ return {
179
+ "is_toxic": tox,
180
+ "probability": proba,
181
+ "labels": _labels_from_keywords(text, proba) if tox else [],
182
+ "model_used": self.model_name,
183
+ }
184
+
185
+ def _pred_hf(self, text: str, pipeline_fn) -> dict:
186
+ raw = pipeline_fn(text[:512])
187
+ smap = {s["label"].lower(): s["score"] for s in (raw[0] if isinstance(raw[0], list) else raw)}
188
+ proba = 0.0
189
+ for key in ("label_1", "toxic", "toxic_1"):
190
  if key in smap:
191
+ proba = smap[key]
192
+ break
193
  else:
194
+ neg = {"label_0", "non_toxic", "not_toxic", "not toxic"}
195
+ vals = [v for k, v in smap.items() if k not in neg]
196
  proba = max(vals) if vals else 0.0
197
  tox = proba >= 0.5
198
+ labels: list[str] = []
199
  if tox:
200
+ for k, v in smap.items():
201
+ if k not in ("label_0", "non_toxic") and v >= 0.35:
202
+ friendly = HF_LABEL_MAP.get(k, k.replace("_", " ").title())
203
+ labels.append(friendly)
 
204
  if not labels:
205
+ labels = ["Offensive content"]
206
+ return {"is_toxic": tox, "probability": proba, "labels": labels, "model_used": self.model_name}
 
207
 
208
  @staticmethod
209
+ def get_available_models() -> dict:
210
+ return AVAILABLE_MODELS
211
+
212
+ def get_model_info(self) -> dict:
213
+ return self.cfg
tests/test_api.py CHANGED
@@ -1,4 +1,4 @@
1
- """Tests del endpoint POST /predict."""
2
 
3
  from unittest.mock import MagicMock
4
 
@@ -6,11 +6,14 @@ import pytest
6
  from fastapi.testclient import TestClient
7
 
8
  from src.api import main as api_main
 
9
 
10
  PREDICT_RESPONSE_KEYS = {
11
  "text",
12
  "is_toxic",
13
  "probability",
 
 
14
  "labels",
15
  "model_used",
16
  "latency_ms",
@@ -28,13 +31,16 @@ def client():
28
  }
29
 
30
  with TestClient(api_main.app) as test_client:
31
- api_main._state["service"] = mock_service
32
- api_main._state["model_name"] = "LR + TF-IDF (local)"
33
- api_main._state["predictions_served"] = 0
 
 
34
  yield test_client
35
 
36
- api_main._state["service"] = None
37
- api_main._state["model_name"] = None
 
38
 
39
 
40
  def test_predict_returns_correct_structure(client: TestClient):
@@ -47,14 +53,79 @@ def test_predict_returns_correct_structure(client: TestClient):
47
  data = response.json()
48
  assert PREDICT_RESPONSE_KEYS <= set(data.keys())
49
  assert data["text"] == "This is a sample comment"
 
 
50
  assert isinstance(data["is_toxic"], bool)
51
  assert 0.0 <= data["probability"] <= 1.0
52
- assert isinstance(data["labels"], list)
53
- assert isinstance(data["model_used"], str)
54
- assert isinstance(data["latency_ms"], (int, float))
55
 
56
 
57
  def test_predict_rejects_empty_text(client: TestClient):
58
  response = client.post("/predict", json={"text": " "})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  assert response.status_code == 422
 
 
1
+ """Tests for POST /predict."""
2
 
3
  from unittest.mock import MagicMock
4
 
 
6
  from fastapi.testclient import TestClient
7
 
8
  from src.api import main as api_main
9
+ from src.api.state import get_state
10
 
11
  PREDICT_RESPONSE_KEYS = {
12
  "text",
13
  "is_toxic",
14
  "probability",
15
+ "status",
16
+ "mode",
17
  "labels",
18
  "model_used",
19
  "latency_ms",
 
31
  }
32
 
33
  with TestClient(api_main.app) as test_client:
34
+ state = get_state()
35
+ state["service"] = mock_service
36
+ state["model_name"] = "LR + TF-IDF (local)"
37
+ state["predictions_served"] = 0
38
+ state["startup_time"] = 0.0
39
  yield test_client
40
 
41
+ state = get_state()
42
+ state["service"] = None
43
+ state["model_name"] = None
44
 
45
 
46
  def test_predict_returns_correct_structure(client: TestClient):
 
53
  data = response.json()
54
  assert PREDICT_RESPONSE_KEYS <= set(data.keys())
55
  assert data["text"] == "This is a sample comment"
56
+ assert data["status"] == "Safe"
57
+ assert data["mode"] == "binary"
58
  assert isinstance(data["is_toxic"], bool)
59
  assert 0.0 <= data["probability"] <= 1.0
 
 
 
60
 
61
 
62
def test_predict_rejects_empty_text(client: TestClient):
    """A whitespace-only body is rejected with 422."""
    blank_payload = {"text": " "}
    response = client.post("/predict", json=blank_payload)
    assert response.status_code == 422
65
+
66
+
67
def test_health_includes_project_name(client: TestClient):
    """GET /health succeeds and reports the project identifier."""
    health = client.get("/health")
    assert health.status_code == 200
    payload = health.json()
    assert payload["project"] == "youtube_hate_detector"
71
+
72
+
73
def test_predict_video_demo_comments_differ_by_url(client: TestClient, monkeypatch):
    """With no API key set, demo comments differ between two video URLs."""
    monkeypatch.delenv("YOUTUBE_API_KEY", raising=False)

    urls = (
        "https://www.youtube.com/watch?v=jNQXAC9IVRw",
        "https://www.youtube.com/watch?v=IEEhzQoKtQU",
    )
    # Issue both requests before asserting anything.
    responses = [
        client.post(
            "/predict-video",
            json={"url": url, "max_comments": 5, "threshold": 0.5},
        )
        for url in urls
    ]

    for response in responses:
        assert response.status_code == 200
    first, second = (response.json() for response in responses)
    assert first["source"] == "demo"
    assert second["source"] == "demo"
    assert first["results"][0]["text"] != second["results"][0]["text"]
100
+
101
 
102
def test_models_status_lists_catalog(client: TestClient):
    """GET /models/status returns a non-empty list that includes the baseline."""
    response = client.get("/models/status")
    assert response.status_code == 200
    payload = response.json()
    assert "models" in payload
    models = payload["models"]
    assert len(models) >= 1
    assert "LR + TF-IDF (local)" in {entry["name"] for entry in models}
110
+
111
+
112
def test_predict_video_comments_disabled_raises_422(client: TestClient, monkeypatch):
    """A CommentsFetchError from the fetcher is returned as a 422 with its message."""
    from src.api.youtube import CommentsFetchError

    monkeypatch.setenv("YOUTUBE_API_KEY", "fake-key")

    def _raise_disabled(*_args, **_kwargs):
        raise CommentsFetchError("Comments are disabled on this video")

    # Patch the name the route module resolves when it fetches comments.
    monkeypatch.setattr("src.api.routes.predict.fetch_comments", _raise_disabled)

    payload = {
        "url": "https://www.youtube.com/watch?v=disabled123",
        "max_comments": 5,
        "threshold": 0.5,
    }
    response = client.post("/predict-video", json=payload)

    assert response.status_code == 422
    detail = response.json()["detail"]
    assert "disabled" in detail.lower()
uv.lock ADDED
The diff for this file is too large to render. See raw diff