Mirae Kang committed on
Commit ·
e317d56
1
Parent(s): f0b240d
feat: update UI using VITE+React without streamlit, #22
Browse files- .dockerignore +1 -1
- .env.example +10 -7
- .python-version +1 -0
- Dockerfile +30 -15
- README.md +74 -178
- configs/model_catalog.yaml +43 -0
- configs/suggested_videos.yaml +15 -0
- docker-compose.yml +12 -40
- docs/ARCHITECTURE.md +34 -54
- pyproject.toml +41 -0
- requirements.txt +0 -17
- src/api/__init__.py +1 -0
- src/api/main.py +73 -421
- src/api/routes/__init__.py +1 -0
- src/api/routes/health.py +22 -0
- src/api/routes/models.py +78 -0
- src/api/routes/predict.py +77 -0
- src/api/routes/videos.py +17 -0
- src/api/schemas.py +93 -0
- src/api/services.py +47 -0
- src/api/state.py +16 -0
- src/api/youtube.py +187 -0
- src/app/app.py +0 -764
- src/evaluation/.gitkeep +0 -0
- src/service/model_catalog.py +32 -0
- src/service/model_service.py +135 -124
- tests/test_api.py +80 -9
- uv.lock +0 -0
.dockerignore
CHANGED
|
@@ -20,6 +20,7 @@ tests
|
|
| 20 |
!README.md
|
| 21 |
.env
|
| 22 |
.env.*
|
|
|
|
| 23 |
frontend/dist
|
| 24 |
models/checkpoints
|
| 25 |
models/**/checkpoints
|
|
@@ -28,4 +29,3 @@ models/roberta_hate_results
|
|
| 28 |
models/distilbert_results
|
| 29 |
models/best_distilbert
|
| 30 |
models/nb08_*
|
| 31 |
-
models/*_frozen
|
|
|
|
| 20 |
!README.md
|
| 21 |
.env
|
| 22 |
.env.*
|
| 23 |
+
!.env.example
|
| 24 |
frontend/dist
|
| 25 |
models/checkpoints
|
| 26 |
models/**/checkpoints
|
|
|
|
| 29 |
models/distilbert_results
|
| 30 |
models/best_distilbert
|
| 31 |
models/nb08_*
|
|
|
.env.example
CHANGED
|
@@ -1,15 +1,18 @@
|
|
| 1 |
-
# Copy to .env
|
| 2 |
-
# Docker Compose reads
|
| 3 |
|
| 4 |
-
# YouTube Data API v3
|
| 5 |
# https://console.cloud.google.com/apis/credentials
|
| 6 |
YOUTUBE_API_KEY=
|
| 7 |
|
| 8 |
-
# Active model (
|
| 9 |
MODEL_NAME=LR + TF-IDF (local)
|
| 10 |
|
| 11 |
# development | production
|
| 12 |
-
ENV=
|
| 13 |
|
| 14 |
-
#
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copy to .env: cp .env.example .env
|
| 2 |
+
# Docker Compose reads YOUTUBE_API_KEY from your environment.
|
| 3 |
|
| 4 |
+
# YouTube Data API v3 — required for real suggested videos and /predict-video
|
| 5 |
# https://console.cloud.google.com/apis/credentials
|
| 6 |
YOUTUBE_API_KEY=
|
| 7 |
|
| 8 |
+
# Active model (key from configs/model_catalog.yaml)
|
| 9 |
MODEL_NAME=LR + TF-IDF (local)
|
| 10 |
|
| 11 |
# development | production
|
| 12 |
+
ENV=development
|
| 13 |
|
| 14 |
+
# Optional: frontend dev when API is on another host (default uses Vite proxy)
|
| 15 |
+
VITE_API_BASE_URL=
|
| 16 |
+
|
| 17 |
+
# Docker only: build with Hugging Face models (see README)
|
| 18 |
+
# INSTALL_HF=1 docker compose build --build-arg INSTALL_HF=1
|
.python-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
3.12
|
Dockerfile
CHANGED
|
@@ -1,30 +1,40 @@
|
|
| 1 |
-
# youtube_hate_detector —
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
FROM python:3.12-slim-bookworm
|
| 3 |
|
|
|
|
|
|
|
| 4 |
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 5 |
PYTHONUNBUFFERED=1 \
|
| 6 |
PYTHONPATH=/app \
|
| 7 |
NLTK_DATA=/app/nltk_data \
|
| 8 |
MODEL_NAME="LR + TF-IDF (local)" \
|
| 9 |
-
ENV=production
|
|
|
|
| 10 |
|
| 11 |
WORKDIR /app
|
| 12 |
|
| 13 |
-
# System deps for spaCy / sklearn wheels
|
| 14 |
RUN apt-get update \
|
| 15 |
-
&& apt-get install -y --no-install-recommends
|
| 16 |
&& rm -rf /var/lib/apt/lists/*
|
| 17 |
|
| 18 |
-
COPY
|
| 19 |
|
| 20 |
-
|
| 21 |
-
RUN
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
|
|
|
| 25 |
|
| 26 |
-
|
| 27 |
-
|
| 28 |
import nltk
|
| 29 |
for pkg in ("stopwords", "punkt"):
|
| 30 |
nltk.download(pkg, download_dir="/app/nltk_data")
|
|
@@ -33,8 +43,13 @@ PY
|
|
| 33 |
COPY configs/ configs/
|
| 34 |
COPY src/ src/
|
| 35 |
COPY models/final_model.joblib models/final_model.joblib
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
-
|
| 38 |
-
|
| 39 |
|
| 40 |
-
|
|
|
|
| 1 |
+
# youtube_hate_detector — multi-stage: React + FastAPI (uv)
|
| 2 |
+
FROM node:22-bookworm-slim AS frontend-build
|
| 3 |
+
WORKDIR /app/frontend
|
| 4 |
+
COPY frontend/package.json frontend/package-lock.json* ./
|
| 5 |
+
RUN npm ci 2>/dev/null || npm install
|
| 6 |
+
COPY frontend/ ./
|
| 7 |
+
RUN npm run build
|
| 8 |
+
|
| 9 |
FROM python:3.12-slim-bookworm
|
| 10 |
|
| 11 |
+
ARG INSTALL_HF=0
|
| 12 |
+
|
| 13 |
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 14 |
PYTHONUNBUFFERED=1 \
|
| 15 |
PYTHONPATH=/app \
|
| 16 |
NLTK_DATA=/app/nltk_data \
|
| 17 |
MODEL_NAME="LR + TF-IDF (local)" \
|
| 18 |
+
ENV=production \
|
| 19 |
+
INSTALL_HF=${INSTALL_HF}
|
| 20 |
|
| 21 |
WORKDIR /app
|
| 22 |
|
|
|
|
| 23 |
RUN apt-get update \
|
| 24 |
+
&& apt-get install -y --no-install-recommends curl \
|
| 25 |
&& rm -rf /var/lib/apt/lists/*
|
| 26 |
|
| 27 |
+
COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
|
| 28 |
|
| 29 |
+
COPY pyproject.toml uv.lock* README.md ./
|
| 30 |
+
RUN if [ "$INSTALL_HF" = "1" ]; then \
|
| 31 |
+
uv sync --frozen --no-dev --extra hf 2>/dev/null || uv sync --no-dev --extra hf; \
|
| 32 |
+
else \
|
| 33 |
+
uv sync --frozen --no-dev 2>/dev/null || uv sync --no-dev; \
|
| 34 |
+
fi
|
| 35 |
|
| 36 |
+
RUN uv run python -m spacy download en_core_web_sm \
|
| 37 |
+
&& uv run python - <<'PY'
|
| 38 |
import nltk
|
| 39 |
for pkg in ("stopwords", "punkt"):
|
| 40 |
nltk.download(pkg, download_dir="/app/nltk_data")
|
|
|
|
| 43 |
COPY configs/ configs/
|
| 44 |
COPY src/ src/
|
| 45 |
COPY models/final_model.joblib models/final_model.joblib
|
| 46 |
+
COPY models/finetuned_hf/ models/finetuned_hf/
|
| 47 |
+
COPY --from=frontend-build /app/frontend/dist frontend/dist
|
| 48 |
+
COPY .env.example .env.example
|
| 49 |
+
|
| 50 |
+
EXPOSE 8000
|
| 51 |
|
| 52 |
+
HEALTHCHECK --interval=10s --timeout=5s --retries=12 --start-period=60s \
|
| 53 |
+
CMD curl -f http://localhost:8000/health || exit 1
|
| 54 |
|
| 55 |
+
CMD ["uv", "run", "uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
README.md
CHANGED
|
@@ -1,247 +1,143 @@
|
|
| 1 |
-
# YouTube Toxic Comment Detector (
|
| 2 |
|
| 3 |
[](https://www.python.org/downloads/)
|
| 4 |
[](https://fastapi.tiangolo.com/)
|
| 5 |
-
[](https://docs.docker.com/compose/)
|
|
|
|
| 7 |
**Español:** [README.es.md](README.es.md)
|
| 8 |
|
| 9 |
-
Automated **Safe vs Toxic** classification for YouTube-style comments.
|
| 10 |
|
| 11 |
---
|
| 12 |
|
| 13 |
-
##
|
| 14 |
-
|
| 15 |
-
| Item | Detail |
|
| 16 |
-
|------|--------|
|
| 17 |
-
| **Goal** | Help moderation teams flag toxic comments quickly |
|
| 18 |
-
| **Dataset** | `data/raw/youtoxic_english_1000.csv` (~1k English comments) |
|
| 19 |
-
| **Target** | `IsToxic` → **Safe (0)** / **Toxic (1)** |
|
| 20 |
-
| **Primary metric** | Weighted F1 and ROC-AUC (imbalanced classes) |
|
| 21 |
-
| **Overfitting check** | \|CV F1 − test F1\| < 5 percentage points (project rubric) |
|
| 22 |
-
|
| 23 |
-
---
|
| 24 |
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
```
|
| 28 |
youtube_hate_detector/
|
| 29 |
-
├── configs/ #
|
| 30 |
-
├──
|
| 31 |
├── models/ # final_model.joblib, experiments/
|
| 32 |
-
├── reports/ # summary.csv, plots, pipeline artifacts
|
| 33 |
├── src/
|
| 34 |
-
│ ├── api/ # FastAPI
|
| 35 |
-
│
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
│ ├── features/ # TextPreprocessor, Vectorizer
|
| 39 |
-
│ ├── models/ # LR, RF, XGBoost baselines
|
| 40 |
-
│ ├── pipeline/ # run_pipeline.py — train end-to-end
|
| 41 |
-
│ └── service/ # ModelService — shared inference layer
|
| 42 |
-
├── tests/
|
| 43 |
-
├── Dockerfile
|
| 44 |
└── docker-compose.yml
|
| 45 |
```
|
| 46 |
|
| 47 |
-
**Runtime flow**
|
| 48 |
-
|
| 49 |
-
1. **Training:** `load_raw_data` → `TextPreprocessor` → `build_model().fit()` → `Evaluator` → `reports/summary.csv`
|
| 50 |
-
2. **API:** `uvicorn` loads `ModelService` → `POST /predict`
|
| 51 |
-
3. **Streamlit:** `ModelService.predict()` in-process (same models as API catalog)
|
| 52 |
-
|
| 53 |
-
See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for more detail.
|
| 54 |
-
|
| 55 |
---
|
| 56 |
|
| 57 |
-
##
|
| 58 |
-
|
| 59 |
-
**Requirements:** Python 3.12+, ~2 GB disk for dependencies (optional PyTorch if using Hugging Face models in the UI).
|
| 60 |
-
|
| 61 |
-
```bash
|
| 62 |
-
git clone https://github.com/Bootcamp-IA-P6/Project_9_Equipo3.git
|
| 63 |
-
cd Project_9_Equipo3 # or your local folder name
|
| 64 |
-
|
| 65 |
-
python -m venv .venv
|
| 66 |
-
source .venv/bin/activate # Windows: .venv\Scripts\activate
|
| 67 |
-
|
| 68 |
-
pip install -r requirements.txt
|
| 69 |
-
python -m spacy download en_core_web_sm
|
| 70 |
-
```
|
| 71 |
-
|
| 72 |
-
**Data:** place `youtoxic_english_1000.csv` under `data/raw/` (path in `configs/pipeline.yaml`).
|
| 73 |
|
| 74 |
-
|
| 75 |
|
| 76 |
```bash
|
| 77 |
cp .env.example .env
|
| 78 |
-
#
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
---
|
| 83 |
-
|
| 84 |
-
## Training pipeline
|
| 85 |
-
|
| 86 |
-
End-to-end training and evaluation:
|
| 87 |
-
|
| 88 |
-
```bash
|
| 89 |
-
python -m src.pipeline.run_pipeline --model lr
|
| 90 |
-
# Options: lr | rf | xgboost
|
| 91 |
```
|
| 92 |
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
Config files:
|
| 96 |
-
|
| 97 |
-
| File | Purpose |
|
| 98 |
-
|------|---------|
|
| 99 |
-
| `configs/pipeline.yaml` | Paths, `IsToxic`, test_size, CV folds |
|
| 100 |
-
| `configs/features.yaml` | Preprocessing + TF-IDF |
|
| 101 |
-
| `configs/models.yaml` | Classifier hyperparameters |
|
| 102 |
-
| `configs/best_params.yaml` | Optuna winner (LR) |
|
| 103 |
-
|
| 104 |
-
Details: [docs/PIPELINE.md](docs/PIPELINE.md)
|
| 105 |
-
|
| 106 |
-
---
|
| 107 |
|
| 108 |
-
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
-
|
| 111 |
-
docker compose up --build
|
| 112 |
-
```
|
| 113 |
|
| 114 |
-
|
|
| 115 |
-
|---------|-----|
|
| 116 |
-
|
|
| 117 |
-
|
|
| 118 |
-
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
-
```
|
| 121 |
-
export YOUTUBE_API_KEY=your_key # optional
|
| 122 |
-
docker compose down # stop
|
| 123 |
-
```
|
| 124 |
|
| 125 |
-
|
| 126 |
|
| 127 |
---
|
| 128 |
|
| 129 |
-
##
|
| 130 |
|
| 131 |
```bash
|
| 132 |
# Terminal 1 — API
|
| 133 |
-
uvicorn src.api.main:app --reload --
|
| 134 |
|
| 135 |
-
# Terminal 2 —
|
| 136 |
-
|
| 137 |
```
|
| 138 |
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
## API examples
|
| 142 |
|
| 143 |
-
|
| 144 |
|
| 145 |
-
|
| 146 |
|
| 147 |
```bash
|
| 148 |
-
|
| 149 |
-
|
| 150 |
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
curl -s -X POST http://localhost:8000/predict \
|
| 155 |
-
-H "Content-Type: application/json" \
|
| 156 |
-
-d '{"text": "This video is amazing, thanks for sharing!", "threshold": 0.5}'
|
| 157 |
```
|
| 158 |
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
"text": "This video is amazing, thanks for sharing!",
|
| 164 |
-
"is_toxic": false,
|
| 165 |
-
"probability": 0.08,
|
| 166 |
-
"labels": [],
|
| 167 |
-
"model_used": "LR + TF-IDF (local)",
|
| 168 |
-
"latency_ms": 12.5
|
| 169 |
-
}
|
| 170 |
-
```
|
| 171 |
|
| 172 |
-
|
| 173 |
|
| 174 |
-
|
| 175 |
-
curl -s -X POST http://localhost:8000/predict-batch \
|
| 176 |
-
-H "Content-Type: application/json" \
|
| 177 |
-
-d '{"texts": ["Great content!", "You are an idiot"], "threshold": 0.5}'
|
| 178 |
-
```
|
| 179 |
|
| 180 |
-
|
| 181 |
|
| 182 |
```bash
|
| 183 |
-
|
| 184 |
-
curl -s -X PUT http://localhost:8000/model/DistilBERT%20Toxicity
|
| 185 |
```
|
| 186 |
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
## Results
|
| 190 |
-
|
| 191 |
-
Best **sklearn** model on the project test split (from `configs/best_params.yaml`):
|
| 192 |
-
|
| 193 |
-
| Metric | Value |
|
| 194 |
-
|--------|-------|
|
| 195 |
-
| F1 (weighted, test) | **0.7579** |
|
| 196 |
-
| ROC-AUC | **0.81** |
|
| 197 |
-
| False positives | 18 |
|
| 198 |
-
| False negatives | 30 |
|
| 199 |
-
| CV–test gap | **4.76 pp** (within 5 pp target) |
|
| 200 |
-
| Train–test gap | 14.07 pp |
|
| 201 |
-
|
| 202 |
-
Plots and EDA: `reports/v2/`. Per-run artifacts: `reports/pipeline/{lr,rf,xgboost}/`.
|
| 203 |
|
| 204 |
---
|
| 205 |
|
| 206 |
-
##
|
| 207 |
-
|
| 208 |
-
Full write-up (decisions, metrics, error analysis, limitations, roadmap):
|
| 209 |
-
|
| 210 |
-
- **English:** [reports/final_report.md](reports/final_report.md)
|
| 211 |
-
- **Español:** [reports/final_report.es.md](reports/final_report.es.md)
|
| 212 |
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
|
| 219 |
-
|-------|--------|-----------|---------|----|----|--------------------|
|
| 220 |
-
| LR + TF-IDF (tuned) | sklearn | 0.7579 | 0.81 | 18 | 30 | Yes |
|
| 221 |
-
| LR + TF-IDF (local) | sklearn | 0.7579 | 0.81 | 18 | 30 | Yes (`final_model.joblib`) |
|
| 222 |
-
| RF / XGBoost | sklearn | — | — | — | — | Run pipeline to fill |
|
| 223 |
-
| DistilBERT / toxic-bert / RoBERTa | Hugging Face | — | — | — | — | Optional via API/UI |
|
| 224 |
-
|
| 225 |
-
Re-run `python -m src.pipeline.run_pipeline --model rf` to append RF metrics to `summary.csv`.
|
| 226 |
|
| 227 |
---
|
| 228 |
|
| 229 |
## Tests
|
| 230 |
|
| 231 |
```bash
|
| 232 |
-
|
|
|
|
| 233 |
```
|
| 234 |
|
| 235 |
-
Covers preprocessor, vectorizer, model binary output, and `/predict` response shape.
|
| 236 |
-
|
| 237 |
---
|
| 238 |
|
| 239 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
-
|
| 242 |
-
|---------|---------|
|
| 243 |
-
| [docs/API.md](docs/API.md) | [docs/API.es.md](docs/API.es.md) |
|
| 244 |
-
| [docs/PIPELINE.md](docs/PIPELINE.md) | [docs/PIPELINE.es.md](docs/PIPELINE.es.md) |
|
| 245 |
-
| [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) | [docs/ARCHITECTURE.es.md](docs/ARCHITECTURE.es.md) |
|
| 246 |
-
| [docs/RESULTS.md](docs/RESULTS.md) | [docs/RESULTS.es.md](docs/RESULTS.es.md) |
|
| 247 |
-
| [reports/final_report.md](reports/final_report.md) | [reports/final_report.es.md](reports/final_report.es.md) |
|
|
|
|
| 1 |
+
# YouTube Toxic Comment Detector (youtube_hate_detector)
|
| 2 |
|
| 3 |
[](https://www.python.org/downloads/)
|
| 4 |
[](https://fastapi.tiangolo.com/)
|
| 5 |
+
[](https://react.dev/)
|
| 6 |
[](https://docs.docker.com/compose/)
|
| 7 |
+
|
| 8 |
**Español:** [README.es.md](README.es.md)
|
| 9 |
|
| 10 |
+
Automated **Safe vs Toxic** classification for YouTube-style comments. Production stack: **FastAPI** (REST) + **React** (YouTube Watch UI). Default model: **Logistic Regression + TF-IDF** (`models/final_model.joblib`).
|
| 11 |
|
| 12 |
---
|
| 13 |
|
| 14 |
+
## Clone and layout
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
+
```bash
|
| 17 |
+
git clone <your-repo-url>
|
| 18 |
+
cd youtube_hate_detector # use this folder name locally (team convention)
|
| 19 |
+
```
|
| 20 |
|
| 21 |
```
|
| 22 |
youtube_hate_detector/
|
| 23 |
+
├── configs/ # pipeline, features, model_catalog, suggested_videos
|
| 24 |
+
├── frontend/ # React SPA (Vite)
|
| 25 |
├── models/ # final_model.joblib, experiments/
|
|
|
|
| 26 |
├── src/
|
| 27 |
+
│ ├── api/ # FastAPI routes
|
| 28 |
+
│ └── service/ # ModelService (inference)
|
| 29 |
+
├── pyproject.toml # uv dependencies
|
| 30 |
+
├── uv.lock
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
└── docker-compose.yml
|
| 32 |
```
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
---
|
| 35 |
|
| 36 |
+
## How to use FastAPI
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
+
The API loads `ModelService` once at startup and serves JSON only (the React app is the UI).
|
| 39 |
|
| 40 |
```bash
|
| 41 |
cp .env.example .env
|
| 42 |
+
uv sync # baseline (LR model only)
|
| 43 |
+
uv sync --extra hf # required for DistilBERT / toxic-bert / RoBERTa / fine-tuned local HF models
|
| 44 |
+
uv run uvicorn src.api.main:app --reload --port 8000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
```
|
| 46 |
|
| 47 |
+
Verify HF deps: `uv run python -c "import transformers; print('ok')"`.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
+
| Resource | URL |
|
| 50 |
+
|----------|-----|
|
| 51 |
+
| Swagger | http://localhost:8000/docs |
|
| 52 |
+
| Health | http://localhost:8000/health |
|
| 53 |
|
| 54 |
+
**Main endpoints**
|
|
|
|
|
|
|
| 55 |
|
| 56 |
+
| Method | Path | Description |
|
| 57 |
+
|--------|------|-------------|
|
| 58 |
+
| `POST` | `/predict` | Score one comment `{ "text", "threshold" }` |
|
| 59 |
+
| `POST` | `/predict-video` | Fetch YouTube comments + score `{ "url", "max_comments", "threshold" }` |
|
| 60 |
+
| `GET` | `/videos/suggested` | Metadata for right-rail videos (from `configs/suggested_videos.yaml`) |
|
| 61 |
+
| `GET` | `/models` | Available models |
|
| 62 |
+
| `GET` | `/models/status` | Per-model availability (HF deps, local weights) |
|
| 63 |
+
| `PUT` | `/model/{name}` | Switch active model (warmup-validated) |
|
| 64 |
|
| 65 |
+
Set `YOUTUBE_API_KEY` in `.env` for real comments and suggested-video thumbnails.
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
+
**Change models without UI changes:** edit [`configs/model_catalog.yaml`](configs/model_catalog.yaml), then restart the API or use Settings in the app.
|
| 68 |
|
| 69 |
---
|
| 70 |
|
| 71 |
+
## React UI (local dev)
|
| 72 |
|
| 73 |
```bash
|
| 74 |
# Terminal 1 — API
|
| 75 |
+
uv run uvicorn src.api.main:app --reload --port 8000
|
| 76 |
|
| 77 |
+
# Terminal 2 — frontend (proxies API)
|
| 78 |
+
cd frontend && npm install && npm run dev
|
| 79 |
```
|
| 80 |
|
| 81 |
+
Open http://localhost:5173 to see the Watch page: a staged demo player, real suggested videos (click one to load its comments), and an English UI.
|
|
|
|
|
|
|
| 82 |
|
| 83 |
+
---
|
| 84 |
|
| 85 |
+
## Docker
|
| 86 |
|
| 87 |
```bash
|
| 88 |
+
export YOUTUBE_API_KEY=your_key # optional but recommended
|
| 89 |
+
docker compose up --build # LR model only (default)
|
| 90 |
|
| 91 |
+
# Hugging Face models (transformers + torch; larger image):
|
| 92 |
+
INSTALL_HF=1 docker compose build --build-arg INSTALL_HF=1
|
| 93 |
+
INSTALL_HF=1 docker compose up
|
|
|
|
|
|
|
|
|
|
| 94 |
```
|
| 95 |
|
| 96 |
+
| URL | Service |
|
| 97 |
+
|-----|---------|
|
| 98 |
+
| http://localhost:8000 | API + built React SPA |
|
| 99 |
+
| http://localhost:8000/docs | Swagger |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
+
Container: `youtube_hate_detector-app`.
|
| 102 |
|
| 103 |
+
---
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
+
## Training (unchanged)
|
| 106 |
|
| 107 |
```bash
|
| 108 |
+
uv run python -m src.pipeline.run_pipeline --model lr
|
|
|
|
| 109 |
```
|
| 110 |
|
| 111 |
+
See [docs/PIPELINE.md](docs/PIPELINE.md).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
---
|
| 114 |
|
| 115 |
+
## Configuration
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
+
| File | Purpose |
|
| 118 |
+
|------|---------|
|
| 119 |
+
| `.env` | Secrets and runtime settings (`YOUTUBE_API_KEY`, `MODEL_NAME`, `ENV`) |
|
| 120 |
+
| `configs/model_catalog.yaml` | Inference models for API/UI |
|
| 121 |
+
| `configs/suggested_videos.yaml` | YouTube IDs for the suggested rail |
|
| 122 |
+
| `configs/pipeline.yaml` | Training data paths |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
---
|
| 125 |
|
| 126 |
## Tests
|
| 127 |
|
| 128 |
```bash
|
| 129 |
+
uv sync --extra dev --extra hf
|
| 130 |
+
uv run pytest
|
| 131 |
```
|
| 132 |
|
|
|
|
|
|
|
| 133 |
---
|
| 134 |
|
| 135 |
+
## Briefing vs team stack
|
| 136 |
+
|
| 137 |
+
| Topic | Briefing | This repo |
|
| 138 |
+
|-------|----------|-----------|
|
| 139 |
+
| UI | Streamlit | **React** |
|
| 140 |
+
| API | FastAPI | **FastAPI** |
|
| 141 |
+
| Package manager | varies | **`uv`** |
|
| 142 |
|
| 143 |
+
Legacy Streamlit (`src/app/`) has been removed.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
configs/model_catalog.yaml
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"LR + TF-IDF (local)":
|
| 2 |
+
type: local
|
| 3 |
+
icon: "⚡"
|
| 4 |
+
description: "Project baseline. No GPU, instant inference."
|
| 5 |
+
speed: "< 50ms"
|
| 6 |
+
accuracy: "F1 0.76"
|
| 7 |
+
requires: "joblib only"
|
| 8 |
+
|
| 9 |
+
"DistilBERT Toxicity":
|
| 10 |
+
type: hf_remote
|
| 11 |
+
icon: "🤖"
|
| 12 |
+
model_id: martin-ha/toxic-comment-model
|
| 13 |
+
description: "DistilBERT fine-tuned on toxic comments (Hugging Face Hub)."
|
| 14 |
+
speed: "~200ms CPU"
|
| 15 |
+
accuracy: "F1 0.85"
|
| 16 |
+
requires: "uv sync --extra hf"
|
| 17 |
+
|
| 18 |
+
"toxic-bert (multilabel)":
|
| 19 |
+
type: hf_remote
|
| 20 |
+
icon: "🧠"
|
| 21 |
+
model_id: unitary/toxic-bert
|
| 22 |
+
description: "BERT multi-label (Jigsaw). Six toxicity categories (Hugging Face Hub)."
|
| 23 |
+
speed: "~400ms CPU"
|
| 24 |
+
accuracy: "F1 0.88"
|
| 25 |
+
requires: "uv sync --extra hf"
|
| 26 |
+
|
| 27 |
+
"RoBERTa Toxicity":
|
| 28 |
+
type: hf_remote
|
| 29 |
+
icon: "🔬"
|
| 30 |
+
model_id: s-nlp/roberta_toxicity_classifier
|
| 31 |
+
description: "RoBERTa fine-tuned for general toxicity (Hugging Face Hub)."
|
| 32 |
+
speed: "~350ms CPU"
|
| 33 |
+
accuracy: "F1 0.87"
|
| 34 |
+
requires: "uv sync --extra hf"
|
| 35 |
+
|
| 36 |
+
"Fine-tuned (local HF)":
|
| 37 |
+
type: hf_local
|
| 38 |
+
icon: "✨"
|
| 39 |
+
model_path: models/finetuned_hf
|
| 40 |
+
description: "Locally fine-tuned Hugging Face model (models/finetuned_hf)."
|
| 41 |
+
speed: "Hardware dependent"
|
| 42 |
+
accuracy: "TBD"
|
| 43 |
+
requires: "uv sync --extra hf"
|
configs/suggested_videos.yaml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Suggested videos for the watch-page right rail (edit IDs only).
|
| 2 |
+
# Prefer embed-friendly videos with comments enabled (avoid Vevo music IDs).
|
| 3 |
+
max_comments: 50
|
| 4 |
+
|
| 5 |
+
videos:
|
| 6 |
+
- id: jNQXAC9IVRw
|
| 7 |
+
note: Me at the zoo — first YouTube upload; comments enabled
|
| 8 |
+
- id: IEEhzQoKtQU
|
| 9 |
+
note: 3Blue1Brown — embed-friendly educational
|
| 10 |
+
- id: dQw4w9WgXcQ
|
| 11 |
+
note: Rick Astley — usually embeddable
|
| 12 |
+
- id: e-z0xWm0xK0
|
| 13 |
+
note: Kurzgesagt — educational, comments on
|
| 14 |
+
- id: aKydtOUFkeg
|
| 15 |
+
note: TED-style talk — embed-friendly
|
docker-compose.yml
CHANGED
|
@@ -1,21 +1,18 @@
|
|
| 1 |
-
# youtube_hate_detector —
|
| 2 |
-
# Start
|
| 3 |
-
# Stop:
|
| 4 |
|
| 5 |
name: youtube_hate_detector
|
| 6 |
|
| 7 |
services:
|
| 8 |
-
|
| 9 |
-
build:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
image: youtube_hate_detector:latest
|
| 11 |
-
container_name: youtube_hate_detector-
|
| 12 |
-
command:
|
| 13 |
-
- uvicorn
|
| 14 |
-
- src.api.main:app
|
| 15 |
-
- --host
|
| 16 |
-
- "0.0.0.0"
|
| 17 |
-
- --port
|
| 18 |
-
- "8000"
|
| 19 |
ports:
|
| 20 |
- "8000:8000"
|
| 21 |
environment:
|
|
@@ -24,34 +21,9 @@ services:
|
|
| 24 |
YOUTUBE_API_KEY: ${YOUTUBE_API_KEY:-}
|
| 25 |
NLTK_DATA: /app/nltk_data
|
| 26 |
healthcheck:
|
| 27 |
-
test: ["CMD", "curl", "-f", "http://localhost:8000/"]
|
| 28 |
interval: 10s
|
| 29 |
timeout: 5s
|
| 30 |
retries: 12
|
| 31 |
-
start_period:
|
| 32 |
-
restart: unless-stopped
|
| 33 |
-
|
| 34 |
-
streamlit:
|
| 35 |
-
# Reuses the image built by `api` — do not add `build:` here (parallel builds race on the same tag)
|
| 36 |
-
image: youtube_hate_detector:latest
|
| 37 |
-
container_name: youtube_hate_detector-streamlit
|
| 38 |
-
command:
|
| 39 |
-
- streamlit
|
| 40 |
-
- run
|
| 41 |
-
- src/app/app.py
|
| 42 |
-
- --server.port=8501
|
| 43 |
-
- --server.address=0.0.0.0
|
| 44 |
-
- --server.headless=true
|
| 45 |
-
- --browser.gatherUsageStats=false
|
| 46 |
-
ports:
|
| 47 |
-
- "8501:8501"
|
| 48 |
-
environment:
|
| 49 |
-
MODEL_NAME: "LR + TF-IDF (local)"
|
| 50 |
-
ENV: production
|
| 51 |
-
API_URL: http://api:8000
|
| 52 |
-
YOUTUBE_API_KEY: ${YOUTUBE_API_KEY:-}
|
| 53 |
-
NLTK_DATA: /app/nltk_data
|
| 54 |
-
depends_on:
|
| 55 |
-
api:
|
| 56 |
-
condition: service_healthy
|
| 57 |
restart: unless-stopped
|
|
|
|
| 1 |
+
# youtube_hate_detector — FastAPI + React (single service)
|
| 2 |
+
# Start: docker compose up --build
|
| 3 |
+
# Stop: docker compose down
|
| 4 |
|
| 5 |
name: youtube_hate_detector
|
| 6 |
|
| 7 |
services:
|
| 8 |
+
app:
|
| 9 |
+
build:
|
| 10 |
+
context: .
|
| 11 |
+
args:
|
| 12 |
+
# Set INSTALL_HF=1 for Hugging Face models (larger image, ~1–2 GB extra)
|
| 13 |
+
INSTALL_HF: ${INSTALL_HF:-0}
|
| 14 |
image: youtube_hate_detector:latest
|
| 15 |
+
container_name: youtube_hate_detector-app
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
ports:
|
| 17 |
- "8000:8000"
|
| 18 |
environment:
|
|
|
|
| 21 |
YOUTUBE_API_KEY: ${YOUTUBE_API_KEY:-}
|
| 22 |
NLTK_DATA: /app/nltk_data
|
| 23 |
healthcheck:
|
| 24 |
+
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
|
| 25 |
interval: 10s
|
| 26 |
timeout: 5s
|
| 27 |
retries: 12
|
| 28 |
+
start_period: 60s
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
restart: unless-stopped
|
docs/ARCHITECTURE.md
CHANGED
|
@@ -1,66 +1,46 @@
|
|
| 1 |
-
#
|
| 2 |
|
| 3 |
-
##
|
| 4 |
|
| 5 |
```mermaid
|
| 6 |
-
flowchart
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
PRE[TextPreprocessor]
|
| 15 |
-
BL[build_model LR RF XGB]
|
| 16 |
-
EV[Evaluator]
|
| 17 |
-
CSV --> PIPE
|
| 18 |
-
CFG --> PIPE
|
| 19 |
-
PIPE --> PRE --> BL --> EV
|
| 20 |
-
EV --> SUM[reports/summary.csv]
|
| 21 |
-
BL --> JOB[models/experiments/]
|
| 22 |
-
end
|
| 23 |
-
|
| 24 |
-
subgraph inference [Inference]
|
| 25 |
-
MS[ModelService]
|
| 26 |
-
JOB2[models/final_model.joblib]
|
| 27 |
-
JOB2 --> MS
|
| 28 |
-
API[FastAPI src/api/main.py]
|
| 29 |
-
UI[Streamlit src/app/app.py]
|
| 30 |
-
MS --> API
|
| 31 |
-
MS --> UI
|
| 32 |
-
end
|
| 33 |
```
|
| 34 |
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
-
|
| 38 |
-
|--------|----------------|
|
| 39 |
-
| `src/data/loader.py` | Load raw CSV, optional processed paths |
|
| 40 |
-
| `src/features/text_preprocessor.py` | Clean and lemmatize text |
|
| 41 |
-
| `src/features/vectorizer.py` | Standalone TF-IDF (notebooks); baselines embed TF-IDF in sklearn `Pipeline` |
|
| 42 |
-
| `src/models/baseline.py` | `LRModel`, `RFModel`, `XGBModel`, `build_model()` |
|
| 43 |
-
| `src/evaluation/evaluator.py` | Metrics, ROC, confusion matrix, error analysis, `summary.csv` |
|
| 44 |
-
| `src/pipeline/run_pipeline.py` | Orchestrates training + evaluation |
|
| 45 |
-
| `src/service/model_service.py` | Loads joblib or Hugging Face models; `predict(text)` |
|
| 46 |
-
| `src/api/main.py` | REST endpoints, lifespan model load |
|
| 47 |
-
| `src/app/app.py` | Streamlit UI; calls `ModelService` directly |
|
| 48 |
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
- API returns `is_toxic` and `probability` (P(toxic))
|
| 54 |
|
| 55 |
## Docker
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
- `youtube_hate_detector-api` — uvicorn port 8000
|
| 60 |
-
- `youtube_hate_detector-streamlit` — port 8501
|
| 61 |
|
| 62 |
-
|
| 63 |
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Architecture — youtube_hate_detector
|
| 2 |
|
| 3 |
+
## Runtime (production)
|
| 4 |
|
| 5 |
```mermaid
|
| 6 |
+
flowchart LR
|
| 7 |
+
Browser[React SPA]
|
| 8 |
+
API[FastAPI :8000]
|
| 9 |
+
MS[ModelService]
|
| 10 |
+
YT[YouTube Data API]
|
| 11 |
+
Browser -->|HTTP JSON| API
|
| 12 |
+
API --> MS
|
| 13 |
+
API --> YT
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
```
|
| 15 |
|
| 16 |
+
- **UI:** `frontend/` built to `frontend/dist`, served by FastAPI `StaticFiles` in production.
|
| 17 |
+
- **Inference:** Only `ModelService` in `src/service/` loads models.
|
| 18 |
+
- **Catalog:** `configs/model_catalog.yaml` — add models without React changes.
|
| 19 |
+
- **Suggested videos:** `configs/suggested_videos.yaml` — YouTube video IDs for the right rail.
|
| 20 |
|
| 21 |
+
## Local development
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
+
| Process | Command | Port |
|
| 24 |
+
|---------|---------|------|
|
| 25 |
+
| API | `uv run uvicorn src.api.main:app --reload` | 8000 |
|
| 26 |
+
| UI | `cd frontend && npm run dev` | 5173 (proxies API) |
|
|
|
|
| 27 |
|
| 28 |
## Docker
|
| 29 |
|
| 30 |
+
Single service `youtube_hate_detector-app` on port **8000** (API + static UI).
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
+
## API layout
|
| 33 |
|
| 34 |
+
```
|
| 35 |
+
src/api/
|
| 36 |
+
main.py # app factory, CORS, static mount
|
| 37 |
+
schemas.py # Pydantic models
|
| 38 |
+
services.py # predict helpers
|
| 39 |
+
youtube.py # comment fetch + metadata
|
| 40 |
+
state.py # shared app state
|
| 41 |
+
routes/
|
| 42 |
+
health.py
|
| 43 |
+
models.py
|
| 44 |
+
predict.py
|
| 45 |
+
videos.py
|
| 46 |
+
```
|
pyproject.toml
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "youtube_hate_detector"
|
| 3 |
+
version = "1.0.0"
|
| 4 |
+
description = "YouTube toxic comment detector — FastAPI + React"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.12,<3.13"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"fastapi>=0.136.1",
|
| 9 |
+
"uvicorn[standard]>=0.47.0",
|
| 10 |
+
"scikit-learn>=1.8.0",
|
| 11 |
+
"spacy>=3.8.14",
|
| 12 |
+
"nltk>=3.9.4",
|
| 13 |
+
"pandas>=3.0.2",
|
| 14 |
+
"PyYAML>=6.0.3",
|
| 15 |
+
"python-dotenv>=1.2.2",
|
| 16 |
+
"joblib>=1.5.3",
|
| 17 |
+
"pydantic>=2.13.4",
|
| 18 |
+
"httpx>=0.28.1",
|
| 19 |
+
"google-api-python-client>=2.100.0",
|
| 20 |
+
]
|
| 21 |
+
|
| 22 |
+
[project.optional-dependencies]
|
| 23 |
+
hf = [
|
| 24 |
+
"transformers>=5.9.0",
|
| 25 |
+
"torch>=2.0.0",
|
| 26 |
+
"sentencepiece>=0.2.0",
|
| 27 |
+
"accelerate>=0.30.0",
|
| 28 |
+
]
|
| 29 |
+
dev = [
|
| 30 |
+
"pytest>=8.0.0",
|
| 31 |
+
]
|
| 32 |
+
[build-system]
|
| 33 |
+
requires = ["hatchling"]
|
| 34 |
+
build-backend = "hatchling.build"
|
| 35 |
+
|
| 36 |
+
[tool.hatch.build.targets.wheel]
|
| 37 |
+
packages = ["src"]
|
| 38 |
+
|
| 39 |
+
[tool.pytest.ini_options]
|
| 40 |
+
testpaths = ["tests"]
|
| 41 |
+
pythonpath = ["."]
|
requirements.txt
DELETED
|
@@ -1,17 +0,0 @@
|
|
| 1 |
-
# Runtime dependencies for API + Streamlit (Docker and local installs)
|
| 2 |
-
fastapi==0.136.1
|
| 3 |
-
uvicorn[standard]==0.47.0
|
| 4 |
-
streamlit>=1.41.0,<2
|
| 5 |
-
scikit-learn==1.8.0
|
| 6 |
-
spacy==3.8.14
|
| 7 |
-
nltk==3.9.4
|
| 8 |
-
pandas==3.0.2
|
| 9 |
-
PyYAML==6.0.3
|
| 10 |
-
python-dotenv==1.2.2
|
| 11 |
-
joblib==1.5.3
|
| 12 |
-
pydantic==2.13.4
|
| 13 |
-
transformers==5.9.0
|
| 14 |
-
httpx==0.28.1
|
| 15 |
-
matplotlib>=3.8.0
|
| 16 |
-
seaborn>=0.13.0
|
| 17 |
-
mlflow>=2.0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/api/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI application package."""
|
src/api/main.py
CHANGED
|
@@ -1,462 +1,114 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
Documentación automática en:
|
| 8 |
-
http://localhost:8000/docs (Swagger UI)
|
| 9 |
-
http://localhost:8000/redoc (ReDoc)
|
| 10 |
-
|
| 11 |
-
Endpoints:
|
| 12 |
-
GET / → health check
|
| 13 |
-
GET /model-info → info del modelo activo
|
| 14 |
-
GET /models → lista de modelos disponibles
|
| 15 |
-
POST /predict → predice un comentario
|
| 16 |
-
POST /predict-batch → predice una lista de comentarios
|
| 17 |
-
POST /predict-video → dado URL de YouTube, predice todos sus comentarios
|
| 18 |
-
PUT /model/{name} → cambia el modelo activo
|
| 19 |
"""
|
| 20 |
|
|
|
|
|
|
|
| 21 |
import os
|
| 22 |
-
import sys
|
| 23 |
import time
|
| 24 |
-
import logging
|
| 25 |
-
from pathlib import Path
|
| 26 |
-
from typing import Optional
|
| 27 |
from contextlib import asynccontextmanager
|
| 28 |
-
from
|
| 29 |
-
load_dotenv()
|
| 30 |
|
| 31 |
-
from
|
|
|
|
| 32 |
from fastapi.middleware.cors import CORSMiddleware
|
| 33 |
-
from
|
|
|
|
| 34 |
|
| 35 |
-
|
| 36 |
-
PROJECT_ROOT = Path(__file__).resolve().parents[2]
|
| 37 |
-
sys.path.insert(0, str(PROJECT_ROOT))
|
| 38 |
|
| 39 |
-
from src.
|
|
|
|
|
|
|
| 40 |
from src.utils.logger import get_logger
|
| 41 |
|
| 42 |
logger = get_logger(__name__)
|
| 43 |
|
| 44 |
-
|
| 45 |
-
# El modelo se carga una sola vez al iniciar la API y se reutiliza.
|
| 46 |
-
# Esto evita cargar el modelo en cada request (costoso en tiempo).
|
| 47 |
-
_state: dict = {
|
| 48 |
-
"service" : None,
|
| 49 |
-
"model_name" : None,
|
| 50 |
-
"startup_time" : None,
|
| 51 |
-
"predictions_served": 0,
|
| 52 |
-
}
|
| 53 |
|
| 54 |
|
| 55 |
-
# ══════════════════════════════════════════════════════════════════════════════
|
| 56 |
-
# LIFESPAN — carga del modelo al iniciar la API
|
| 57 |
-
# ══════════════════════════════════════════════════════════════════════════════
|
| 58 |
@asynccontextmanager
|
| 59 |
async def lifespan(app: FastAPI):
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
-
# Warm-up: predecir un texto de prueba para que el modelo quede en memoria
|
| 72 |
try:
|
| 73 |
-
|
| 74 |
-
logger.info("
|
| 75 |
-
except Exception as
|
| 76 |
-
logger.warning(
|
| 77 |
|
| 78 |
-
yield
|
| 79 |
|
| 80 |
-
|
| 81 |
-
logger.info("API
|
| 82 |
-
_state["service"] = None
|
| 83 |
|
| 84 |
|
| 85 |
-
# ══════════════════════════════════════════════════════════════════════════════
|
| 86 |
-
# APP
|
| 87 |
-
# ══════════════════════════════════════════════════════════════════════════════
|
| 88 |
app = FastAPI(
|
| 89 |
-
title
|
| 90 |
-
description
|
| 91 |
-
version
|
| 92 |
-
lifespan
|
| 93 |
)
|
| 94 |
|
| 95 |
-
# CORS: permite que el Streamlit (puerto 8501) llame a la API (puerto 8000)
|
| 96 |
app.add_middleware(
|
| 97 |
CORSMiddleware,
|
| 98 |
-
allow_origins
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
)
|
| 102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
-
# ══════════════════════════════════════════════════════════════════════════════
|
| 105 |
-
# SCHEMAS — Pydantic valida automáticamente los datos de entrada/salida
|
| 106 |
-
# ══════════════════════════════════════════════════════════════════════════════
|
| 107 |
-
class PredictRequest(BaseModel):
|
| 108 |
-
"""Cuerpo del request para predecir un comentario."""
|
| 109 |
-
text : str = Field(..., min_length=1, max_length=5000,
|
| 110 |
-
description="Comentario a analizar")
|
| 111 |
-
threshold: float = Field(0.5, ge=0.0, le=1.0,
|
| 112 |
-
description="Umbral de probabilidad para clasificar como tóxico")
|
| 113 |
-
|
| 114 |
-
@field_validator("text")
|
| 115 |
-
@classmethod
|
| 116 |
-
def text_not_empty(cls, v):
|
| 117 |
-
if not v.strip():
|
| 118 |
-
raise ValueError("El texto no puede estar vacío")
|
| 119 |
-
return v.strip()
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
class PredictResponse(BaseModel):
|
| 123 |
-
"""Respuesta de la predicción."""
|
| 124 |
-
text : str
|
| 125 |
-
is_toxic : bool
|
| 126 |
-
probability: float = Field(..., ge=0.0, le=1.0)
|
| 127 |
-
labels : list[str]
|
| 128 |
-
model_used : str
|
| 129 |
-
latency_ms : float
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
class BatchPredictRequest(BaseModel):
|
| 133 |
-
"""Request para predecir múltiples comentarios."""
|
| 134 |
-
texts : list[str] = Field(..., min_length=1, max_length=100)
|
| 135 |
-
threshold: float = Field(0.5, ge=0.0, le=1.0)
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
class BatchPredictResponse(BaseModel):
|
| 139 |
-
"""Respuesta de predicción batch."""
|
| 140 |
-
results : list[PredictResponse]
|
| 141 |
-
total : int
|
| 142 |
-
toxic_count : int
|
| 143 |
-
latency_ms : float
|
| 144 |
|
|
|
|
| 145 |
|
| 146 |
-
class VideoRequest(BaseModel):
|
| 147 |
-
"""Request para analizar comentarios de un video de YouTube."""
|
| 148 |
-
url : str = Field(..., description="URL del video de YouTube")
|
| 149 |
-
max_comments: int = Field(50, ge=1, le=200,
|
| 150 |
-
description="Número máximo de comentarios a analizar")
|
| 151 |
-
threshold : float = Field(0.5, ge=0.0, le=1.0)
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
class VideoResponse(BaseModel):
|
| 155 |
-
"""Respuesta del análisis de un video de YouTube."""
|
| 156 |
-
video_url : str
|
| 157 |
-
total_fetched: int
|
| 158 |
-
toxic_count : int
|
| 159 |
-
toxic_rate : float
|
| 160 |
-
results : list[PredictResponse]
|
| 161 |
-
error : Optional[str] = None
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
class ModelInfo(BaseModel):
|
| 165 |
-
"""Información sobre el modelo activo."""
|
| 166 |
-
name : str
|
| 167 |
-
type : str
|
| 168 |
-
description : str
|
| 169 |
-
speed : str
|
| 170 |
-
accuracy : str
|
| 171 |
-
uptime_s : float
|
| 172 |
-
predictions_served: int
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
# ══════════════════════════════════════════════════════════════════════════════
|
| 176 |
-
# HELPERS
|
| 177 |
-
# ══════════════════════════════════════════════════════════════════════════════
|
| 178 |
-
def _get_service() -> ModelService:
|
| 179 |
-
"""Devuelve el servicio activo o lanza 503 si no está listo."""
|
| 180 |
-
if _state["service"] is None:
|
| 181 |
-
raise HTTPException(status_code=503, detail="Modelo no cargado. Intenta en unos segundos.")
|
| 182 |
-
return _state["service"]
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
def _predict_single(text: str, threshold: float) -> tuple[dict, float]:
|
| 186 |
-
"""Predice un texto y devuelve (result, latency_ms)."""
|
| 187 |
-
t0 = time.perf_counter()
|
| 188 |
-
result = _get_service().predict(text)
|
| 189 |
-
ms = round((time.perf_counter() - t0) * 1000, 2)
|
| 190 |
-
|
| 191 |
-
# Aplicar umbral personalizado
|
| 192 |
-
result["is_toxic"] = result["probability"] >= threshold
|
| 193 |
-
if not result["is_toxic"]:
|
| 194 |
-
result["labels"] = []
|
| 195 |
-
|
| 196 |
-
_state["predictions_served"] += 1
|
| 197 |
-
return result, ms
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
def _scrape_youtube_comments(url: str, max_comments: int) -> list[str]:
|
| 201 |
-
"""
|
| 202 |
-
Obtiene comentarios de un video de YouTube.
|
| 203 |
-
|
| 204 |
-
Estrategia:
|
| 205 |
-
1. Intentar con YouTube Data API v3 (si hay API key en .env)
|
| 206 |
-
2. Fallback: BeautifulSoup (sin autenticación, limitado)
|
| 207 |
-
"""
|
| 208 |
-
api_key = os.getenv("YOUTUBE_API_KEY", "")
|
| 209 |
-
|
| 210 |
-
if api_key:
|
| 211 |
-
return _fetch_via_api(url, api_key, max_comments)
|
| 212 |
-
else:
|
| 213 |
-
return _fetch_via_scraper(url, max_comments)
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
def _fetch_via_api(url: str, api_key: str, max_comments: int) -> list[str]:
|
| 217 |
-
"""Obtiene comentarios usando YouTube Data API v3."""
|
| 218 |
-
try:
|
| 219 |
-
import re
|
| 220 |
-
from googleapiclient.discovery import build
|
| 221 |
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
video_id = None
|
| 229 |
-
for pattern in patterns:
|
| 230 |
-
match = re.search(pattern, url)
|
| 231 |
-
if match:
|
| 232 |
-
video_id = match.group(1)
|
| 233 |
-
break
|
| 234 |
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
comments = []
|
| 240 |
-
page_token = None
|
| 241 |
-
|
| 242 |
-
while len(comments) < max_comments:
|
| 243 |
-
request = youtube.commentThreads().list(
|
| 244 |
-
part = "snippet",
|
| 245 |
-
videoId = video_id,
|
| 246 |
-
maxResults = min(100, max_comments - len(comments)),
|
| 247 |
-
pageToken = page_token,
|
| 248 |
-
textFormat = "plainText",
|
| 249 |
-
)
|
| 250 |
-
response = request.execute()
|
| 251 |
-
|
| 252 |
-
for item in response.get("items", []):
|
| 253 |
-
text = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
|
| 254 |
-
comments.append(text)
|
| 255 |
-
|
| 256 |
-
page_token = response.get("nextPageToken")
|
| 257 |
-
if not page_token:
|
| 258 |
-
break
|
| 259 |
-
|
| 260 |
-
logger.info(f"YouTube API: {len(comments)} comentarios obtenidos")
|
| 261 |
-
return comments[:max_comments]
|
| 262 |
-
|
| 263 |
-
except Exception as e:
|
| 264 |
-
logger.warning(f"YouTube API falló: {e} — usando fallback")
|
| 265 |
-
return _fetch_via_scraper(url, max_comments)
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
def _fetch_via_scraper(url: str, max_comments: int) -> list[str]:
|
| 269 |
-
"""
|
| 270 |
-
Fallback: simula comentarios si no hay API key.
|
| 271 |
-
En producción real debería usar BeautifulSoup + Selenium.
|
| 272 |
-
"""
|
| 273 |
-
logger.warning(
|
| 274 |
-
"YOUTUBE_API_KEY no configurada. "
|
| 275 |
-
"Configura tu API key en .env para obtener comentarios reales. "
|
| 276 |
-
"Usando comentarios de ejemplo."
|
| 277 |
-
)
|
| 278 |
-
# Comentarios de ejemplo para demo sin API key
|
| 279 |
-
example_comments = [
|
| 280 |
-
"This video is really informative, thanks for sharing!",
|
| 281 |
-
"You are all stupid idiots, get out of here!",
|
| 282 |
-
"Great content, I learned a lot from this.",
|
| 283 |
-
"These people should be eliminated from society.",
|
| 284 |
-
"I agree with the presenter's point of view.",
|
| 285 |
-
"What a bunch of racist criminals!",
|
| 286 |
-
"Thank you for this analysis, very helpful.",
|
| 287 |
-
"Kill them all, they don't deserve to live.",
|
| 288 |
-
"Interesting perspective on the topic.",
|
| 289 |
-
"This is absolute bullshit propaganda!",
|
| 290 |
-
"I think we need to look at both sides.",
|
| 291 |
-
"Black people are thugs and criminals.",
|
| 292 |
-
"The data presented here is compelling.",
|
| 293 |
-
"Go back to where you came from!",
|
| 294 |
-
"Well researched video, good job.",
|
| 295 |
-
]
|
| 296 |
-
return example_comments[:max_comments]
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
# ══════════════════════════════════════════════════════════════════════════════
|
| 300 |
-
# ENDPOINTS
|
| 301 |
-
# ══════════════════════════════════════════════════════════════════════════════
|
| 302 |
-
|
| 303 |
-
@app.get("/", tags=["Health"])
|
| 304 |
-
async def health_check():
|
| 305 |
-
"""
|
| 306 |
-
Verifica que la API está funcionando.
|
| 307 |
-
Útil para Docker healthcheck y load balancers.
|
| 308 |
-
"""
|
| 309 |
-
service = _state["service"]
|
| 310 |
-
return {
|
| 311 |
-
"status" : "ok" if service else "loading",
|
| 312 |
-
"model" : _state["model_name"],
|
| 313 |
-
"uptime_s": round(time.time() - _state["startup_time"], 1)
|
| 314 |
-
if _state["startup_time"] else 0,
|
| 315 |
-
}
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
@app.get("/model-info", response_model=ModelInfo, tags=["Model"])
|
| 319 |
-
async def get_model_info():
|
| 320 |
-
"""Devuelve información sobre el modelo activo."""
|
| 321 |
-
service = _get_service()
|
| 322 |
-
info = service.get_model_info()
|
| 323 |
-
return ModelInfo(
|
| 324 |
-
name = _state["model_name"],
|
| 325 |
-
type = info.get("type", "unknown"),
|
| 326 |
-
description = info.get("description", ""),
|
| 327 |
-
speed = info.get("speed", ""),
|
| 328 |
-
accuracy = info.get("accuracy", ""),
|
| 329 |
-
uptime_s = round(time.time() - _state["startup_time"], 1),
|
| 330 |
-
predictions_served= _state["predictions_served"],
|
| 331 |
-
)
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
@app.get("/models", tags=["Model"])
|
| 335 |
-
async def list_models():
|
| 336 |
-
"""Lista todos los modelos disponibles."""
|
| 337 |
-
return {
|
| 338 |
-
"available": list(AVAILABLE_MODELS.keys()),
|
| 339 |
-
"active" : _state["model_name"],
|
| 340 |
-
}
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
@app.put("/model/{model_name}", tags=["Model"])
|
| 344 |
-
async def switch_model(model_name: str):
|
| 345 |
-
"""
|
| 346 |
-
Cambia el modelo activo.
|
| 347 |
-
El nuevo modelo se carga de forma lazy en el siguiente request de predicción.
|
| 348 |
-
"""
|
| 349 |
-
if model_name not in AVAILABLE_MODELS:
|
| 350 |
-
raise HTTPException(
|
| 351 |
-
status_code=400,
|
| 352 |
-
detail=f"Modelo '{model_name}' no disponible. "
|
| 353 |
-
f"Opciones: {list(AVAILABLE_MODELS.keys())}",
|
| 354 |
-
)
|
| 355 |
-
_state["service"] = ModelService(model_name, PROJECT_ROOT)
|
| 356 |
-
_state["model_name"] = model_name
|
| 357 |
-
logger.info(f"Modelo cambiado a: {model_name}")
|
| 358 |
-
return {"message": f"Modelo cambiado a '{model_name}'", "model": model_name}
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
@app.post("/predict", response_model=PredictResponse, tags=["Prediction"])
|
| 362 |
-
async def predict(request: PredictRequest):
|
| 363 |
-
"""
|
| 364 |
-
Predice si un comentario es tóxico.
|
| 365 |
-
|
| 366 |
-
- **text**: el comentario a analizar
|
| 367 |
-
- **threshold**: umbral de probabilidad (default 0.5)
|
| 368 |
-
|
| 369 |
-
Devuelve la probabilidad, si es tóxico y las categorías detectadas.
|
| 370 |
-
"""
|
| 371 |
-
result, ms = _predict_single(request.text, request.threshold)
|
| 372 |
-
|
| 373 |
-
if "error" in result:
|
| 374 |
-
raise HTTPException(status_code=500, detail=result["error"])
|
| 375 |
-
|
| 376 |
-
return PredictResponse(
|
| 377 |
-
text = request.text,
|
| 378 |
-
is_toxic = result["is_toxic"],
|
| 379 |
-
probability= round(result["probability"], 4),
|
| 380 |
-
labels = result["labels"],
|
| 381 |
-
model_used = result["model_used"],
|
| 382 |
-
latency_ms = ms,
|
| 383 |
-
)
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
@app.post("/predict-batch", response_model=BatchPredictResponse, tags=["Prediction"])
|
| 387 |
-
async def predict_batch(request: BatchPredictRequest):
|
| 388 |
-
"""
|
| 389 |
-
Predice una lista de comentarios en un solo request.
|
| 390 |
-
Más eficiente que llamar /predict N veces.
|
| 391 |
-
Máximo 100 comentarios por request.
|
| 392 |
-
"""
|
| 393 |
-
t0 = time.perf_counter()
|
| 394 |
-
results = []
|
| 395 |
-
|
| 396 |
-
for text in request.texts:
|
| 397 |
-
if not text.strip():
|
| 398 |
-
continue
|
| 399 |
-
result, _ = _predict_single(text, request.threshold)
|
| 400 |
-
results.append(PredictResponse(
|
| 401 |
-
text = text,
|
| 402 |
-
is_toxic = result["is_toxic"],
|
| 403 |
-
probability= round(result["probability"], 4),
|
| 404 |
-
labels = result["labels"],
|
| 405 |
-
model_used = result["model_used"],
|
| 406 |
-
latency_ms = 0.0,
|
| 407 |
-
))
|
| 408 |
-
|
| 409 |
-
total_ms = round((time.perf_counter() - t0) * 1000, 2)
|
| 410 |
-
toxic_count = sum(1 for r in results if r.is_toxic)
|
| 411 |
-
|
| 412 |
-
return BatchPredictResponse(
|
| 413 |
-
results = results,
|
| 414 |
-
total = len(results),
|
| 415 |
-
toxic_count = toxic_count,
|
| 416 |
-
latency_ms = total_ms,
|
| 417 |
-
)
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
@app.post("/predict-video", response_model=VideoResponse, tags=["Prediction"])
|
| 421 |
-
async def predict_video(request: VideoRequest):
|
| 422 |
-
"""
|
| 423 |
-
Dado un URL de YouTube, obtiene los comentarios y predice su toxicidad.
|
| 424 |
-
|
| 425 |
-
Requiere YOUTUBE_API_KEY en el archivo .env para obtener comentarios reales.
|
| 426 |
-
Sin API key usa comentarios de ejemplo para la demo.
|
| 427 |
-
"""
|
| 428 |
-
# Obtener comentarios
|
| 429 |
-
try:
|
| 430 |
-
comments = _scrape_youtube_comments(request.url, request.max_comments)
|
| 431 |
-
except Exception as e:
|
| 432 |
-
raise HTTPException(status_code=422, detail=f"Error al obtener comentarios: {e}")
|
| 433 |
|
| 434 |
-
|
| 435 |
-
|
|
|
|
|
|
|
|
|
|
| 436 |
|
| 437 |
-
|
| 438 |
-
t0 = time.perf_counter()
|
| 439 |
-
results = []
|
| 440 |
-
for text in comments:
|
| 441 |
-
if not text.strip():
|
| 442 |
-
continue
|
| 443 |
-
result, _ = _predict_single(text, request.threshold)
|
| 444 |
-
results.append(PredictResponse(
|
| 445 |
-
text = text,
|
| 446 |
-
is_toxic = result["is_toxic"],
|
| 447 |
-
probability= round(result["probability"], 4),
|
| 448 |
-
labels = result["labels"],
|
| 449 |
-
model_used = result["model_used"],
|
| 450 |
-
latency_ms = 0.0,
|
| 451 |
-
))
|
| 452 |
|
| 453 |
-
total_ms = round((time.perf_counter() - t0) * 1000, 2)
|
| 454 |
-
toxic_count = sum(1 for r in results if r.is_toxic)
|
| 455 |
|
| 456 |
-
|
| 457 |
-
video_url = request.url,
|
| 458 |
-
total_fetched= len(results),
|
| 459 |
-
toxic_count = toxic_count,
|
| 460 |
-
toxic_rate = round(toxic_count / len(results), 4) if results else 0.0,
|
| 461 |
-
results = results,
|
| 462 |
-
)
|
|
|
|
| 1 |
"""
|
| 2 |
+
youtube_hate_detector API
|
| 3 |
|
| 4 |
+
Run: uv run uvicorn src.api.main:app --reload --port 8000
|
| 5 |
+
Docs: http://localhost:8000/docs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
"""
|
| 7 |
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
import os
|
|
|
|
| 11 |
import time
|
|
|
|
|
|
|
|
|
|
| 12 |
from contextlib import asynccontextmanager
|
| 13 |
+
from pathlib import Path
|
|
|
|
| 14 |
|
| 15 |
+
from dotenv import load_dotenv
|
| 16 |
+
from fastapi import FastAPI
|
| 17 |
from fastapi.middleware.cors import CORSMiddleware
|
| 18 |
+
from fastapi.responses import FileResponse
|
| 19 |
+
from fastapi.staticfiles import StaticFiles
|
| 20 |
|
| 21 |
+
load_dotenv()
|
|
|
|
|
|
|
| 22 |
|
| 23 |
+
from src.api.routes import health, models, predict, videos
|
| 24 |
+
from src.api.state import PROJECT_ROOT, get_state
|
| 25 |
+
from src.service.model_service import AVAILABLE_MODELS, ModelService, check_model_availability
|
| 26 |
from src.utils.logger import get_logger
|
| 27 |
|
| 28 |
logger = get_logger(__name__)
|
| 29 |
|
| 30 |
+
FRONTEND_DIST = PROJECT_ROOT / "frontend" / "dist"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
|
|
|
|
|
|
|
|
|
|
| 33 |
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the active model on startup and release it on shutdown.

    The model name is read from the ``MODEL_NAME`` environment variable and
    defaults to the first entry of ``AVAILABLE_MODELS``. If the requested
    model is reported unavailable, the first catalog entry is used instead.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).

    Yields:
        Control to the running application once the shared state is set up.
    """
    state = get_state()
    default_model = next(iter(AVAILABLE_MODELS.keys()))
    model_name = os.getenv("MODEL_NAME", default_model)

    available, reason = check_model_availability(model_name, PROJECT_ROOT)
    if not available:
        logger.warning(
            "MODEL_NAME '%s' unavailable (%s) — using '%s'",
            model_name,
            reason,
            default_model,
        )
        model_name = default_model

    logger.info("Starting youtube_hate_detector API — model: %s", model_name)
    state["service"] = ModelService(model_name, PROJECT_ROOT)
    state["model_name"] = model_name
    state["startup_time"] = time.time()
    state["predictions_served"] = 0

    # Warm-up is best-effort: a failure is logged but never blocks startup.
    try:
        warmup = state["service"].predict("warmup")
    except Exception as exc:
        logger.warning("Warm-up failed (non-critical): %s", exc)
    else:
        # predict() can also report a failure through an "error" entry in its
        # result (the model-switch route checks the same key), so do not log
        # success in that case.
        if isinstance(warmup, dict) and warmup.get("error"):
            logger.warning("Warm-up failed (non-critical): %s", warmup["error"])
        else:
            logger.info("Model warm-up complete")

    try:
        yield
    finally:
        # Run teardown even when an exception is thrown into the context manager.
        state["service"] = None
        logger.info("API shutdown")
|
|
|
|
| 63 |
|
| 64 |
|
|
|
|
|
|
|
|
|
|
| 65 |
# Application object; start-up and shutdown work is delegated to `lifespan`.
app = FastAPI(
    title="youtube_hate_detector API",
    description="Toxic comment detection for YouTube-style moderation demos",
    version="1.0.0",
    lifespan=lifespan,
)

# CORS: only these browser origins may call the API with credentials.
# NOTE(review): port 5173 is presumably the Vite dev server — confirm.
app.add_middleware(
    CORSMiddleware,
    allow_origins=[
        "http://localhost:5173",
        "http://127.0.0.1:5173",
        "http://localhost:8000",
    ],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Register the route modules in the same order as before.
for _route_module in (health, models, predict, videos):
    app.include_router(_route_module.router)
|
| 88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
+
_API_GET_PREFIXES = ("models", "model", "videos", "predict", "health", "docs", "redoc", "openapi")
|
| 91 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
+
def _mount_frontend() -> None:
    """Serve the built frontend from ``FRONTEND_DIST`` when it exists.

    Does nothing if the build directory is missing. Otherwise it mounts the
    ``assets`` sub-directory (when present) under ``/assets`` and registers a
    catch-all GET route that answers with ``index.html``. Paths that start
    with one of ``_API_GET_PREFIXES`` are answered with 404 instead of the
    page.
    """
    if not FRONTEND_DIST.is_dir():
        return

    # HTTPException is not imported at module level in this file.
    from fastapi import HTTPException

    assets_dir = FRONTEND_DIST / "assets"
    if assets_dir.is_dir():
        app.mount("/assets", StaticFiles(directory=assets_dir), name="assets")

    index_file = FRONTEND_DIST / "index.html"

    @app.get("/{full_path:path}", include_in_schema=False)
    async def spa_fallback(full_path: str):
        # str.startswith accepts a tuple, so this is one prefix test per entry.
        looks_like_api = full_path.startswith(_API_GET_PREFIXES)
        if not looks_like_api and index_file.exists():
            return FileResponse(index_file)
        raise HTTPException(status_code=404, detail="Not found")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
|
|
|
|
|
|
| 113 |
|
| 114 |
+
_mount_frontend()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/api/routes/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""API route modules."""
|
src/api/routes/health.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
|
| 3 |
+
from fastapi import APIRouter
|
| 4 |
+
|
| 5 |
+
from src.api.state import get_state
|
| 6 |
+
|
| 7 |
+
router = APIRouter(tags=["Health"])
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@router.get("/health")
async def health_check():
    """Report whether a model service is loaded, plus model name and uptime."""
    state = get_state()
    started_at = state["startup_time"]
    # Uptime stays 0.0 until a start-up timestamp has been recorded.
    uptime = round(time.time() - started_at, 1) if started_at else 0.0
    status = "ok" if state["service"] else "loading"
    return {
        "status": status,
        "model": state["model_name"],
        "uptime_s": uptime,
        "project": "youtube_hate_detector",
    }
|
src/api/routes/models.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
|
| 3 |
+
from fastapi import APIRouter, HTTPException
|
| 4 |
+
|
| 5 |
+
from src.api.schemas import ModelInfo, ModelStatusEntry, ModelsStatusResponse
|
| 6 |
+
from src.api.services import get_service
|
| 7 |
+
from src.api.state import PROJECT_ROOT, get_state
|
| 8 |
+
from src.service.model_service import AVAILABLE_MODELS, ModelService, check_model_availability
|
| 9 |
+
|
| 10 |
+
router = APIRouter(tags=["Model"])
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@router.get("/model-info", response_model=ModelInfo)
async def get_model_info():
    """Return metadata for the active model together with service counters."""
    details = get_service().get_model_info()
    state = get_state()

    started_at = state["startup_time"]
    if started_at:
        uptime = round(time.time() - started_at, 1)
    else:
        uptime = 0.0

    # Missing metadata keys fall back to neutral placeholders.
    return ModelInfo(
        name=state["model_name"],
        type=details.get("type", "unknown"),
        description=details.get("description", ""),
        speed=details.get("speed", ""),
        accuracy=details.get("accuracy", ""),
        uptime_s=uptime,
        predictions_served=state.get("predictions_served", 0),
    )
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@router.get("/models/status", response_model=ModelsStatusResponse)
async def models_status():
    """List every catalog model with its availability and the active model name."""
    state = get_state()

    def _describe(name, cfg):
        # One availability check per catalog entry.
        is_available, why = check_model_availability(name, PROJECT_ROOT)
        return ModelStatusEntry(
            name=name,
            available=is_available,
            reason=why,
            type=cfg.get("type", "unknown"),
        )

    entries = [_describe(name, cfg) for name, cfg in AVAILABLE_MODELS.items()]
    # An unset active model is reported as an empty string.
    return ModelsStatusResponse(models=entries, active=state["model_name"] or "")
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
@router.get("/models")
async def list_models():
    """Return the catalog model names and the name of the active model."""
    active_name = get_state()["model_name"]
    catalog_names = list(AVAILABLE_MODELS.keys())
    return {"available": catalog_names, "active": active_name}
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@router.put("/model/{model_name}")
async def switch_model(model_name: str):
    """Make ``model_name`` the active model after a successful warm-up.

    The shared state is only updated once the new service has produced a
    warm-up prediction without an ``error`` entry, so a failed switch leaves
    the previously active model in place.

    Args:
        model_name: Key of the model in ``AVAILABLE_MODELS``.

    Returns:
        A dict with a confirmation ``message`` and the new ``model`` name.

    Raises:
        HTTPException: 400 when the name is unknown, the model is reported
            unavailable, or the warm-up prediction reports an error.
    """
    if model_name not in AVAILABLE_MODELS:
        raise HTTPException(
            status_code=400,
            detail=f"Model '{model_name}' not available. Options: {list(AVAILABLE_MODELS.keys())}",
        )

    available, reason = check_model_availability(model_name, PROJECT_ROOT)
    if not available:
        raise HTTPException(status_code=400, detail=reason or "Model unavailable")

    # Build and warm up the candidate before touching shared state; nothing
    # needs rolling back if this fails.
    candidate = ModelService(model_name, PROJECT_ROOT)
    warmup = candidate.predict("warmup")
    if warmup.get("error"):
        raise HTTPException(status_code=400, detail=str(warmup["error"]))

    state = get_state()
    state["service"] = candidate
    state["model_name"] = model_name
    return {"message": f"Active model set to '{model_name}'", "model": model_name}
|
src/api/routes/predict.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
|
| 3 |
+
from fastapi import APIRouter, HTTPException
|
| 4 |
+
|
| 5 |
+
from src.api.schemas import (
|
| 6 |
+
BatchPredictRequest,
|
| 7 |
+
BatchPredictResponse,
|
| 8 |
+
PredictRequest,
|
| 9 |
+
PredictResponse,
|
| 10 |
+
VideoRequest,
|
| 11 |
+
VideoResponse,
|
| 12 |
+
)
|
| 13 |
+
from src.api.services import predict_single, to_predict_response
|
| 14 |
+
from src.api.state import get_state
|
| 15 |
+
from src.api.youtube import CommentsFetchError, fetch_comments
|
| 16 |
+
router = APIRouter(tags=["Prediction"])
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
@router.post("/predict", response_model=PredictResponse)
async def predict(request: PredictRequest):
    """Classify a single comment using the request's text and threshold."""
    response = predict_single(request.text, request.threshold)
    return response
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@router.post("/predict-batch", response_model=BatchPredictResponse)
async def predict_batch(request: BatchPredictRequest):
    """Classify several comments in one request.

    Whitespace-only entries are skipped, so ``total`` counts the analysed
    comments rather than the submitted ones. ``latency_ms`` covers the whole
    batch.
    """
    started = time.perf_counter()

    # Strip once, drop blanks, then predict on the stripped text.
    stripped_texts = (text.strip() for text in request.texts)
    results = [
        predict_single(text, request.threshold)
        for text in stripped_texts
        if text
    ]

    elapsed_ms = round((time.perf_counter() - started) * 1000, 2)
    flagged = sum(1 for item in results if item.is_toxic)
    return BatchPredictResponse(
        results=results,
        total=len(results),
        toxic_count=flagged,
        latency_ms=elapsed_ms,
    )
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@router.post("/predict-video", response_model=VideoResponse)
async def predict_video(request: VideoRequest):
    """Fetch the comments of a video and classify each non-blank one.

    Args:
        request: Video URL, maximum number of comments, and threshold.

    Returns:
        A ``VideoResponse`` with per-comment results, the toxic count and
        rate, and the ``source`` value reported by ``fetch_comments``.
        ``total_fetched`` counts the analysed (non-blank) comments.

    Raises:
        HTTPException: 422 when fetching comments fails, 404 when no comments
            are returned, 503 when no model service is loaded.
    """
    try:
        comments, source = fetch_comments(request.url, request.max_comments)
    except CommentsFetchError as exc:
        raise HTTPException(status_code=422, detail=str(exc)) from exc
    except Exception as exc:
        raise HTTPException(status_code=422, detail=f"Failed to fetch comments: {exc}") from exc

    if not comments:
        raise HTTPException(status_code=404, detail="No comments found for this video")

    state = get_state()
    service = state["service"]
    if service is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    # Per-comment latency is not measured here, hence the fixed 0.0.
    results: list[PredictResponse] = [
        to_predict_response(text, service.predict(text), 0.0, request.threshold)
        for text in comments
        if text.strip()
    ]

    toxic_count = sum(1 for r in results if r.is_toxic)
    # Counted in bulk because this route calls the service directly.
    state["predictions_served"] = state.get("predictions_served", 0) + len(results)

    return VideoResponse(
        video_url=request.url,
        total_fetched=len(results),
        toxic_count=toxic_count,
        toxic_rate=round(toxic_count / len(results), 4) if results else 0.0,
        results=results,
        source=source,
    )
|
src/api/routes/videos.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter
|
| 2 |
+
|
| 3 |
+
from src.api.schemas import SuggestedVideo, SuggestedVideosResponse
|
| 4 |
+
from src.api.youtube import fetch_video_metadata, load_suggested_config
|
| 5 |
+
|
| 6 |
+
router = APIRouter(tags=["Videos"])
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
@router.get("/videos/suggested", response_model=SuggestedVideosResponse)
async def suggested_videos():
    """Return the configured suggested videos with their metadata.

    ``max_comments`` comes from the configuration and defaults to 50.
    """
    config = load_suggested_config()
    comment_limit = int(config.get("max_comments", 50))

    # Entries may be dicts carrying an "id" key or bare values.
    configured = config.get("videos") or []
    video_ids = [
        entry["id"] if isinstance(entry, dict) else str(entry)
        for entry in configured
    ]

    suggestions = [SuggestedVideo(**item) for item in fetch_video_metadata(video_ids)]
    return SuggestedVideosResponse(videos=suggestions, max_comments=comment_limit)
|
src/api/schemas.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pydantic request/response models for the API."""
|
| 2 |
+
|
| 3 |
+
from typing import Literal, Optional
|
| 4 |
+
|
| 5 |
+
from pydantic import BaseModel, Field, field_validator
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class PredictRequest(BaseModel):
    """Request body for classifying a single text."""

    # Text to classify: 1 to 5000 characters.
    text: str = Field(..., min_length=1, max_length=5000)
    # Must lie in [0, 1]; defaults to 0.5.
    threshold: float = Field(0.5, ge=0.0, le=1.0)

    @field_validator("text")
    @classmethod
    def text_not_empty(cls, v: str) -> str:
        """Return ``v`` stripped of surrounding whitespace.

        Raises:
            ValueError: if nothing is left after stripping.
        """
        stripped = v.strip()
        if stripped == "":
            raise ValueError("Text cannot be empty")
        return stripped
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class PredictResponse(BaseModel):
    """Classification result for one text."""

    # ``model_used`` starts with Pydantic's protected ``model_`` prefix, which
    # triggers a namespace-conflict UserWarning on affected Pydantic 2.x
    # releases. The field name is part of the API, so the check is disabled
    # for this model instead of renaming the field.
    model_config = {"protected_namespaces": ()}

    # The text that was classified.
    text: str
    is_toxic: bool
    # Must lie in [0, 1].
    probability: float = Field(..., ge=0.0, le=1.0)
    status: Literal["Safe", "Toxic"]
    # Only "binary" is accepted.
    mode: Literal["binary"] = "binary"
    labels: list[str]
    model_used: str
    latency_ms: float
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class BatchPredictRequest(BaseModel):
    """Request body for classifying several texts in one call."""

    # Between 1 and 100 texts per request.
    texts: list[str] = Field(..., min_length=1, max_length=100)
    # Must lie in [0, 1]; defaults to 0.5.
    threshold: float = Field(0.5, ge=0.0, le=1.0)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class BatchPredictResponse(BaseModel):
    """Response body for a batch classification request."""

    # One entry per classified text.
    results: list[PredictResponse]
    total: int
    toxic_count: int
    latency_ms: float
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class VideoRequest(BaseModel):
    """Request body for analysing the comments of a video."""

    # URL of the video to analyse.
    url: str
    # Between 1 and 200; defaults to 50.
    max_comments: int = Field(50, ge=1, le=200)
    # Must lie in [0, 1]; defaults to 0.5.
    threshold: float = Field(0.5, ge=0.0, le=1.0)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class VideoResponse(BaseModel):
    """Response body for a video comment analysis."""

    video_url: str
    total_fetched: int
    toxic_count: int
    toxic_rate: float
    # One entry per classified comment.
    results: list[PredictResponse]
    # Origin of the analysed comments; defaults to "demo".
    source: Literal["youtube", "demo", "unavailable"] = "demo"
    # Optional free-text fields, absent (None) by default.
    reason: Optional[str] = None
    error: Optional[str] = None
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class ModelStatusEntry(BaseModel):
    """Availability record for one model."""

    name: str
    available: bool
    # Optional explanation; None by default.
    reason: Optional[str] = None
    # Defaults to "unknown" when not supplied.
    type: str = "unknown"
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class ModelsStatusResponse(BaseModel):
    """Response listing model status entries plus the active model."""

    models: list[ModelStatusEntry]
    # Presumably the name of the currently active model; confirm against the route.
    active: str
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class ModelInfo(BaseModel):
    """Descriptive and runtime information about a model."""

    # Descriptive fields (all free-form strings).
    name: str
    type: str
    description: str
    speed: str
    accuracy: str
    # Runtime counters.
    uptime_s: float
    predictions_served: int
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
class SuggestedVideo(BaseModel):
    """Display metadata for one suggested video."""

    id: str
    title: str
    channel_title: str
    thumbnail_url: str
    watch_url: str
    # Defaults to True when not supplied.
    embeddable: bool = True
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
class SuggestedVideosResponse(BaseModel):
    """Response carrying the suggested videos and a comment limit."""

    videos: list[SuggestedVideo]
    max_comments: int
|
src/api/services.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Prediction helpers used by route handlers."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import time
|
| 6 |
+
|
| 7 |
+
from fastapi import HTTPException
|
| 8 |
+
|
| 9 |
+
from src.api.schemas import PredictResponse
|
| 10 |
+
from src.api.state import get_state
|
| 11 |
+
from src.service.model_service import ModelService
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def get_service() -> ModelService:
    """Return the model service stored in the shared state.

    Raises:
        HTTPException: 503 when no service has been stored yet.
    """
    service = get_state()["service"]
    if service is not None:
        return service
    raise HTTPException(status_code=503, detail="Model not loaded. Try again shortly.")
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def to_predict_response(text: str, result: dict, latency_ms: float, threshold: float) -> PredictResponse:
    """Convert a raw prediction dict into a ``PredictResponse``.

    Args:
        text: The text that was classified.
        result: Raw prediction; must contain ``"probability"`` and may contain
            ``"labels"`` and ``"model_used"``.
        latency_ms: Latency value to report in the response.
        threshold: A probability at or above this value counts as toxic.

    Returns:
        The response model. Labels are reported only for toxic results.
    """
    probability = float(result["probability"])
    flagged = probability >= threshold
    if flagged:
        verdict = "Toxic"
        reported_labels = result.get("labels", [])
    else:
        verdict = "Safe"
        reported_labels = []
    return PredictResponse(
        text=text,
        is_toxic=flagged,
        probability=round(probability, 4),
        status=verdict,
        mode="binary",
        labels=reported_labels,
        model_used=result.get("model_used", ""),
        latency_ms=latency_ms,
    )
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def predict_single(text: str, threshold: float) -> PredictResponse:
    """Classify one text, time the call and update the served counter.

    Args:
        text: The text to classify.
        threshold: Probability cut-off passed on to ``to_predict_response``.

    Raises:
        HTTPException: 500 when the prediction dict contains an ``"error"``
            key; whatever ``get_service`` raises when no model is loaded.
    """
    shared = get_state()

    started = time.perf_counter()
    outcome = get_service().predict(text)
    elapsed_ms = round((time.perf_counter() - started) * 1000, 2)

    if "error" in outcome:
        raise HTTPException(status_code=500, detail=outcome["error"])

    # Only successful predictions are counted.
    served_so_far = shared.get("predictions_served", 0)
    shared["predictions_served"] = served_so_far + 1
    return to_predict_response(text, outcome, elapsed_ms, threshold)
|
src/api/state.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Application state shared across routes."""
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
PROJECT_ROOT = Path(__file__).resolve().parents[2]
|
| 6 |
+
|
| 7 |
+
# Module-level mutable state; every caller of get_state() sees the same dict.
_state: dict = dict(
    service=None,
    model_name=None,
    startup_time=None,
    predictions_served=0,
)


def get_state() -> dict:
    """Return the shared state dictionary itself (not a copy)."""
    return _state
|
src/api/youtube.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""YouTube comment fetch and suggested-video metadata."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import re
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Any
|
| 9 |
+
|
| 10 |
+
import yaml
|
| 11 |
+
|
| 12 |
+
from src.utils.logger import get_logger
|
| 13 |
+
|
| 14 |
+
logger = get_logger(__name__)
|
| 15 |
+
|
| 16 |
+
PROJECT_ROOT = Path(__file__).resolve().parents[2]
|
| 17 |
+
SUGGESTED_CONFIG = PROJECT_ROOT / "configs" / "suggested_videos.yaml"
|
| 18 |
+
|
| 19 |
+
# Regexes tried in order; each captures the 11-character video id in group 1.
_VIDEO_ID_PATTERNS = (
    # Watch page; ``v`` may come after other query parameters.
    r"youtube\.com/watch\?(?:[^#\s]*?&)?v=([a-zA-Z0-9_-]{11})",
    # Short share link.
    r"youtu\.be/([a-zA-Z0-9_-]{11})",
    # Path-style URLs: embeds (including the youtube-nocookie host),
    # Shorts and live pages.
    r"youtube(?:-nocookie)?\.com/(?:embed|shorts|live)/([a-zA-Z0-9_-]{11})",
)


class CommentsFetchError(Exception):
    """Raised when comments cannot be fetched and demo fallback must not be used."""


def extract_video_id(url: str) -> str | None:
    """Return the 11-character video id found in ``url``.

    Args:
        url: Any string that may contain a YouTube URL.

    Returns:
        The id captured by the first matching pattern, or ``None`` when no
        pattern matches.
    """
    for pattern in _VIDEO_ID_PATTERNS:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def load_suggested_config() -> dict[str, Any]:
    """Load the suggested-videos configuration from ``SUGGESTED_CONFIG``.

    Returns:
        The parsed YAML mapping. A built-in default (one video, 50 comments)
        is returned when the file does not exist. An empty dict is returned
        when the file is empty or its top level is not a mapping.
    """
    # Open directly instead of checking existence first, so a file removed
    # between the check and the open cannot raise.
    try:
        with SUGGESTED_CONFIG.open(encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except FileNotFoundError:
        return {"max_comments": 50, "videos": [{"id": "jNQXAC9IVRw"}]}

    if not data:
        return {}
    if not isinstance(data, dict):
        # Callers use ``.get`` on the result, so a list or scalar document
        # would fail later with an AttributeError.
        logger.warning(
            "Ignoring %s: expected a mapping at the top level, got %s",
            SUGGESTED_CONFIG,
            type(data).__name__,
        )
        return {}
    return data
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _parse_youtube_error(exc: Exception) -> str:
|
| 46 |
+
err_text = str(exc)
|
| 47 |
+
if "commentsDisabled" in err_text:
|
| 48 |
+
return "Comments are disabled on this video"
|
| 49 |
+
if "disabled comments" in err_text.lower():
|
| 50 |
+
return "Comments are disabled on this video"
|
| 51 |
+
if "quota" in err_text.lower():
|
| 52 |
+
return "YouTube API quota exceeded"
|
| 53 |
+
try:
|
| 54 |
+
from googleapiclient.errors import HttpError
|
| 55 |
+
|
| 56 |
+
if isinstance(exc, HttpError):
|
| 57 |
+
for detail in getattr(exc, "error_details", []) or []:
|
| 58 |
+
reason = detail.get("reason") if isinstance(detail, dict) else None
|
| 59 |
+
if reason == "commentsDisabled":
|
| 60 |
+
return "Comments are disabled on this video"
|
| 61 |
+
except ImportError:
|
| 62 |
+
pass
|
| 63 |
+
return err_text
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def fetch_comments(url: str, max_comments: int) -> tuple[list[str], str]:
    """Return comments for ``url`` together with a label naming their source.

    With a non-blank ``YOUTUBE_API_KEY`` environment variable the work is
    delegated to ``_fetch_via_api``. Without one, demo comments are returned
    with the source label ``"demo"``.
    """
    parsed_id = extract_video_id(url)
    video_id = parsed_id if parsed_id else "unknown"
    key = os.getenv("YOUTUBE_API_KEY", "").strip()
    if not key:
        return _demo_comments(video_id, max_comments), "demo"
    return _fetch_via_api(url, key, max_comments, video_id)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def _fetch_via_api(
    url: str, api_key: str, max_comments: int, video_id: str
) -> tuple[list[str], str]:
    """Fetch top-level comments for ``video_id`` through the YouTube client.

    Pages are requested until ``max_comments`` comments are collected or the
    response carries no ``nextPageToken``.

    Returns:
        ``(comments, "youtube")`` with at most ``max_comments`` comments.

    Raises:
        CommentsFetchError: if ``video_id`` is ``"unknown"``, if any request
            or response access fails, or if no comments were collected.
    """
    from googleapiclient.discovery import build

    if video_id == "unknown":
        raise CommentsFetchError(f"Could not parse video id from: {url}")

    client = build("youtube", "v3", developerKey=api_key)
    collected: list[str] = []
    next_page = None

    try:
        while len(collected) < max_comments:
            still_needed = max_comments - len(collected)
            request = client.commentThreads().list(
                part="snippet",
                videoId=video_id,
                # Ask for no more than is still needed, capped at 100 per page.
                maxResults=min(100, still_needed),
                pageToken=next_page,
                textFormat="plainText",
            )
            payload = request.execute()
            collected.extend(
                thread["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
                for thread in payload.get("items", [])
            )
            next_page = payload.get("nextPageToken")
            if not next_page:
                break
    except Exception as exc:
        message = _parse_youtube_error(exc)
        logger.warning("YouTube API failed for %s: %s", video_id, message)
        raise CommentsFetchError(message) from exc

    if not collected:
        raise CommentsFetchError("No comments found for this video")

    logger.info("YouTube API: fetched %s comments for %s", len(collected), video_id)
    return collected[:max_comments], "youtube"
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def fetch_video_metadata(video_ids: list[str]) -> list[dict[str, Any]]:
    """Return one metadata dict per id in ``video_ids``, in the same order.

    Placeholder metadata is used for every id when ``YOUTUBE_API_KEY`` is
    blank, when ``video_ids`` is empty, or when anything in the lookup fails.
    It is also used for each individual id missing from the API response.
    """
    api_key = os.getenv("YOUTUBE_API_KEY", "").strip()
    if not (api_key and video_ids):
        return [_placeholder_meta(vid) for vid in video_ids]

    try:
        from googleapiclient.discovery import build

        client = build("youtube", "v3", developerKey=api_key)
        request = client.videos().list(part="snippet,status", id=",".join(video_ids))
        payload = request.execute()

        found: dict[str, dict[str, Any]] = {}
        for item in payload.get("items", []):
            vid = item["id"]
            snippet = item["snippet"]
            status = item.get("status", {})
            thumbnails = snippet.get("thumbnails", {})
            # Prefer the medium thumbnail, then the default one.
            chosen = thumbnails.get("medium") or thumbnails.get("default") or {}
            found[vid] = {
                "id": vid,
                "title": snippet.get("title", vid),
                "channel_title": snippet.get("channelTitle", "Unknown"),
                "thumbnail_url": chosen.get("url", f"https://i.ytimg.com/vi/{vid}/mqdefault.jpg"),
                "watch_url": f"https://www.youtube.com/watch?v={vid}",
                "embeddable": bool(status.get("embeddable", True)),
            }
        return [found.get(vid, _placeholder_meta(vid)) for vid in video_ids]
    except Exception as exc:
        # Best effort: metadata is cosmetic, so fall back instead of failing.
        logger.warning("YouTube metadata failed: %s", exc)
        return [_placeholder_meta(vid) for vid in video_ids]
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def _placeholder_meta(video_id: str) -> dict[str, Any]:
|
| 154 |
+
return {
|
| 155 |
+
"id": video_id,
|
| 156 |
+
"title": f"Video {video_id}",
|
| 157 |
+
"channel_title": "YouTube",
|
| 158 |
+
"thumbnail_url": f"https://i.ytimg.com/vi/{video_id}/mqdefault.jpg",
|
| 159 |
+
"watch_url": f"https://www.youtube.com/watch?v={video_id}",
|
| 160 |
+
"embeddable": True,
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def _demo_comments(video_id: str, max_comments: int) -> list[str]:
    """Return up to ``max_comments`` canned comments in a per-video order.

    The fixed example list is shuffled with a random generator seeded by
    ``video_id``, so the same id always produces the same order.
    """
    import random

    logger.info("Using demo comments for %s (no YOUTUBE_API_KEY)", video_id)
    canned = (
        "This video is really informative, thanks for sharing!",
        "You are all stupid idiots, get out of here!",
        "Great content, I learned a lot from this.",
        "These people should be eliminated from society.",
        "I agree with the presenter's point of view.",
        "What a bunch of racist criminals!",
        "Thank you for this analysis, very helpful.",
        "Kill them all, they don't deserve to live.",
        "Interesting perspective on the topic.",
        "This is absolute bullshit propaganda!",
        "I think we need to look at both sides.",
        "Well researched video, good job.",
        "Go back to where you came from!",
        "The data presented here is compelling.",
    )
    shuffled = list(canned)
    random.Random(video_id).shuffle(shuffled)
    return shuffled[:max_comments]
|
src/app/app.py
DELETED
|
@@ -1,764 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
src/app/streamlit_app.py
|
| 3 |
-
|
| 4 |
-
App SignalMod — detección de hate speech estilo YouTube.
|
| 5 |
-
Ejecutar: streamlit run src/app/streamlit_app.py
|
| 6 |
-
"""
|
| 7 |
-
|
| 8 |
-
import html
|
| 9 |
-
import sys
|
| 10 |
-
import random
|
| 11 |
-
import datetime
|
| 12 |
-
from pathlib import Path
|
| 13 |
-
|
| 14 |
-
import streamlit as st
|
| 15 |
-
import pandas as pd
|
| 16 |
-
|
| 17 |
-
from transformers.utils import logging
|
| 18 |
-
logging.set_verbosity_error()
|
| 19 |
-
|
| 20 |
-
# ── Paths ─────────────────────────────────────────────────────────────────────
|
| 21 |
-
PROJECT_ROOT = Path(__file__).resolve().parents[2]
|
| 22 |
-
sys.path.insert(0, str(PROJECT_ROOT))
|
| 23 |
-
|
| 24 |
-
try:
|
| 25 |
-
from src.service.model_service import ModelService, AVAILABLE_MODELS
|
| 26 |
-
except ImportError:
|
| 27 |
-
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 28 |
-
from service.model_service import ModelService, AVAILABLE_MODELS
|
| 29 |
-
|
| 30 |
-
# ── Config ────────────────────────────────────────────────────────────────────
|
| 31 |
-
st.set_page_config(
|
| 32 |
-
page_title="SignalMod",
|
| 33 |
-
page_icon="🎬",
|
| 34 |
-
layout="wide",
|
| 35 |
-
initial_sidebar_state="expanded",
|
| 36 |
-
)
|
| 37 |
-
|
| 38 |
-
# ── CSS ───────────────────────────────────────────────────────────────────────
|
| 39 |
-
# Nota: NO ocultamos el header completo para preservar el botón de toggle del sidebar.
|
| 40 |
-
# Solo ocultamos el menú hamburguesa y el footer de Streamlit.
|
| 41 |
-
st.markdown("""
|
| 42 |
-
<style>
|
| 43 |
-
@import url('https://fonts.googleapis.com/css2?family=YouTube+Sans:wght@400;600;700&display=swap');
|
| 44 |
-
|
| 45 |
-
/* ── Ocultar solo elementos de branding, NO el header completo ── */
|
| 46 |
-
#MainMenu { visibility: hidden; }
|
| 47 |
-
footer { visibility: hidden; }
|
| 48 |
-
|
| 49 |
-
/* ── Fondo de la app: blanco limpio ── */
|
| 50 |
-
.stApp { background: #ffffff; }
|
| 51 |
-
|
| 52 |
-
/* ── Sidebar oscuro (como YouTube) ── */
|
| 53 |
-
section[data-testid="stSidebar"] {
|
| 54 |
-
background-color: #0f0f0f !important;
|
| 55 |
-
}
|
| 56 |
-
section[data-testid="stSidebar"] > div {
|
| 57 |
-
background-color: #0f0f0f !important;
|
| 58 |
-
}
|
| 59 |
-
/* Texto del sidebar en blanco */
|
| 60 |
-
section[data-testid="stSidebar"] p,
|
| 61 |
-
section[data-testid="stSidebar"] span,
|
| 62 |
-
section[data-testid="stSidebar"] label,
|
| 63 |
-
section[data-testid="stSidebar"] div {
|
| 64 |
-
color: #ffffff !important;
|
| 65 |
-
}
|
| 66 |
-
/* Botones del sidebar */
|
| 67 |
-
section[data-testid="stSidebar"] .stButton button {
|
| 68 |
-
background: transparent !important;
|
| 69 |
-
color: #e0e0e0 !important;
|
| 70 |
-
border: none !important;
|
| 71 |
-
text-align: left !important;
|
| 72 |
-
justify-content: flex-start !important;
|
| 73 |
-
border-radius: 10px !important;
|
| 74 |
-
padding: 0.5rem 0.75rem !important;
|
| 75 |
-
font-size: 0.9rem !important;
|
| 76 |
-
font-weight: 400 !important;
|
| 77 |
-
width: 100% !important;
|
| 78 |
-
}
|
| 79 |
-
section[data-testid="stSidebar"] .stButton button:hover {
|
| 80 |
-
background: rgba(255,255,255,0.1) !important;
|
| 81 |
-
color: #ffffff !important;
|
| 82 |
-
}
|
| 83 |
-
/* Botón activo en el sidebar */
|
| 84 |
-
section[data-testid="stSidebar"] .stButton button[data-active="true"] {
|
| 85 |
-
background: rgba(255,255,255,0.15) !important;
|
| 86 |
-
color: #ffffff !important;
|
| 87 |
-
font-weight: 600 !important;
|
| 88 |
-
}
|
| 89 |
-
/* Divider del sidebar */
|
| 90 |
-
section[data-testid="stSidebar"] hr {
|
| 91 |
-
border-color: rgba(255,255,255,0.15) !important;
|
| 92 |
-
}
|
| 93 |
-
/* Badge de modelo activo en sidebar */
|
| 94 |
-
.sidebar-model-info {
|
| 95 |
-
background: rgba(255,255,255,0.08);
|
| 96 |
-
border-radius: 8px;
|
| 97 |
-
padding: 8px 12px;
|
| 98 |
-
margin: 8px 0;
|
| 99 |
-
font-size: 0.75rem;
|
| 100 |
-
color: #aaaaaa;
|
| 101 |
-
}
|
| 102 |
-
.sidebar-model-info strong { color: #ffffff; }
|
| 103 |
-
|
| 104 |
-
/* ── Área principal: fondo blanco, texto oscuro ── */
|
| 105 |
-
.main-area { background: #ffffff; }
|
| 106 |
-
|
| 107 |
-
/* ── Video thumbnail ── */
|
| 108 |
-
.video-thumb {
|
| 109 |
-
background: linear-gradient(135deg, #0d0d1a 0%, #1a0a2e 50%, #0d1a1a 100%);
|
| 110 |
-
border-radius: 12px;
|
| 111 |
-
height: 340px;
|
| 112 |
-
display: flex;
|
| 113 |
-
align-items: center;
|
| 114 |
-
justify-content: center;
|
| 115 |
-
}
|
| 116 |
-
.play-btn {
|
| 117 |
-
width: 72px; height: 72px;
|
| 118 |
-
background: rgba(255,255,255,0.9);
|
| 119 |
-
border-radius: 50%;
|
| 120 |
-
display: flex; align-items: center; justify-content: center;
|
| 121 |
-
font-size: 2rem; cursor: pointer;
|
| 122 |
-
box-shadow: 0 4px 20px rgba(0,0,0,0.4);
|
| 123 |
-
}
|
| 124 |
-
|
| 125 |
-
/* ── Títulos de video ── */
|
| 126 |
-
.video-title {
|
| 127 |
-
font-size: 1.15rem; font-weight: 700;
|
| 128 |
-
color: #0f0f0f; margin: 0.75rem 0 0.3rem;
|
| 129 |
-
line-height: 1.4;
|
| 130 |
-
}
|
| 131 |
-
.video-meta { font-size: 0.82rem; color: #606060; }
|
| 132 |
-
.channel-name { font-weight: 600; font-size: 0.9rem; color: #0f0f0f; }
|
| 133 |
-
|
| 134 |
-
/* ── Badges ── */
|
| 135 |
-
.badge {
|
| 136 |
-
display: inline-block;
|
| 137 |
-
padding: 2px 9px; border-radius: 12px;
|
| 138 |
-
font-size: 0.72rem; font-weight: 700;
|
| 139 |
-
margin-left: 6px; vertical-align: middle;
|
| 140 |
-
}
|
| 141 |
-
.badge-toxic { background: #cc0000; color: #ffffff; }
|
| 142 |
-
.badge-safe { background: #00c853; color: #ffffff; }
|
| 143 |
-
|
| 144 |
-
/* ── Comentarios ── */
|
| 145 |
-
.comment-wrap {
|
| 146 |
-
display: flex; gap: 12px;
|
| 147 |
-
padding: 12px 0; border-bottom: 1px solid #f0f0f0;
|
| 148 |
-
}
|
| 149 |
-
.c-avatar {
|
| 150 |
-
width: 36px; height: 36px; min-width: 36px;
|
| 151 |
-
border-radius: 50%; background: #cc0000;
|
| 152 |
-
display: flex; align-items: center; justify-content: center;
|
| 153 |
-
color: #ffffff; font-weight: 700; font-size: 0.85rem;
|
| 154 |
-
flex-shrink: 0;
|
| 155 |
-
}
|
| 156 |
-
.c-avatar.safe { background: #606060; }
|
| 157 |
-
.c-body { flex: 1; min-width: 0; }
|
| 158 |
-
.c-header { display: flex; align-items: center; flex-wrap: wrap; gap: 4px; }
|
| 159 |
-
.c-user { font-size: 0.84rem; font-weight: 600; color: #0f0f0f; }
|
| 160 |
-
.c-time { font-size: 0.75rem; color: #909090; margin-left: 4px; }
|
| 161 |
-
.c-text { font-size: 0.88rem; color: #2d2d2d; margin-top: 4px; line-height: 1.55; }
|
| 162 |
-
.c-text.toxic {
|
| 163 |
-
background: #fff5f5;
|
| 164 |
-
border-left: 3px solid #cc0000;
|
| 165 |
-
padding: 6px 10px; border-radius: 0 6px 6px 0;
|
| 166 |
-
margin-top: 6px;
|
| 167 |
-
}
|
| 168 |
-
.c-flagged { font-size: 0.77rem; color: #cc0000; font-weight: 500; margin-top: 4px; }
|
| 169 |
-
|
| 170 |
-
/* ── Toxicity bar inline ── */
|
| 171 |
-
.tox-row {
|
| 172 |
-
display: flex; align-items: center; gap: 8px;
|
| 173 |
-
font-size: 0.8rem; color: #606060; margin-top: 6px; flex-wrap: wrap;
|
| 174 |
-
}
|
| 175 |
-
.tox-bar-bg {
|
| 176 |
-
flex: 1; max-width: 120px;
|
| 177 |
-
background: #e5e5e5; border-radius: 4px; height: 6px;
|
| 178 |
-
}
|
| 179 |
-
.tox-bar-fill { height: 6px; border-radius: 4px; }
|
| 180 |
-
|
| 181 |
-
/* ── Sugeridos ── */
|
| 182 |
-
.sug-card {
|
| 183 |
-
display: flex; gap: 8px; margin-bottom: 10px;
|
| 184 |
-
cursor: pointer;
|
| 185 |
-
}
|
| 186 |
-
.sug-thumb {
|
| 187 |
-
width: 120px; min-width: 120px; height: 68px;
|
| 188 |
-
background: #1a1a2e; border-radius: 6px;
|
| 189 |
-
display: flex; align-items: center; justify-content: center;
|
| 190 |
-
font-size: 1.4rem; flex-shrink: 0;
|
| 191 |
-
}
|
| 192 |
-
.sug-title { font-size: 0.82rem; font-weight: 600; color: #0f0f0f; line-height: 1.3; }
|
| 193 |
-
.sug-ch { font-size: 0.75rem; color: #606060; margin-top: 2px; }
|
| 194 |
-
.sug-meta { font-size: 0.72rem; color: #909090; }
|
| 195 |
-
|
| 196 |
-
/* ── Section header ── */
|
| 197 |
-
.sec-title {
|
| 198 |
-
font-size: 1rem; font-weight: 700; color: #0f0f0f;
|
| 199 |
-
margin: 1.25rem 0 0.75rem; padding-bottom: 0.5rem;
|
| 200 |
-
border-bottom: 1px solid #e5e5e5;
|
| 201 |
-
}
|
| 202 |
-
|
| 203 |
-
/* ── Modal body fixes ── */
|
| 204 |
-
[data-testid="stDialog"] { background: #ffffff; }
|
| 205 |
-
|
| 206 |
-
/* ── Hub cards ── */
|
| 207 |
-
.hub-card {
|
| 208 |
-
background: #ffffff; border: 1px solid #e5e5e5;
|
| 209 |
-
border-radius: 12px; padding: 1rem;
|
| 210 |
-
}
|
| 211 |
-
.hub-kpi-label { font-size: 0.72rem; color: #606060; text-transform: uppercase;
|
| 212 |
-
letter-spacing: 0.5px; margin-bottom: 4px; }
|
| 213 |
-
.hub-kpi-val { font-size: 1.8rem; font-weight: 700; color: #0f0f0f; }
|
| 214 |
-
|
| 215 |
-
/* ── Model cards (settings) ── */
|
| 216 |
-
.model-card {
|
| 217 |
-
background: #ffffff; border: 1.5px solid #e5e5e5;
|
| 218 |
-
border-radius: 10px; padding: 14px 16px; margin-bottom: 8px;
|
| 219 |
-
}
|
| 220 |
-
.model-card.active {
|
| 221 |
-
border-color: #cc0000; background: #fff5f5;
|
| 222 |
-
}
|
| 223 |
-
.model-card-name { font-size: 0.95rem; font-weight: 600; color: #0f0f0f; }
|
| 224 |
-
.model-card-desc { font-size: 0.8rem; color: #606060; margin-top: 3px; }
|
| 225 |
-
.model-pill {
|
| 226 |
-
display: inline-block; background: #f0f0f0; color: #333;
|
| 227 |
-
border-radius: 6px; padding: 2px 8px; font-size: 0.73rem; margin-right: 4px;
|
| 228 |
-
}
|
| 229 |
-
</style>
|
| 230 |
-
""", unsafe_allow_html=True)
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
# ── Session state init ────────────────────────────────────────────────────────
|
| 234 |
-
def _init_state():
    """Seed ``st.session_state`` with defaults for keys that are not set yet."""
    seed_comments = [
        {"user": "user_prime", "initial": "U",
         "text": "Excelente video, muy informativo!", "time": "1 h",
         "is_toxic": False, "probability": 0.04, "labels": []},
        {"user": "troll_master", "initial": "T",
         "text": "Esto es una basura completa", "time": "30 min",
         "is_toxic": True, "probability": 0.91, "labels": ["Insulto", "Agresividad"]},
        {"user": "curious_viewer", "initial": "C",
         "text": "¿Alguien puede explicar esto mejor?", "time": "15 min",
         "is_toxic": False, "probability": 0.07, "labels": []},
    ]
    seed_history = [
        {"Usuario": "@user_992", "Comentario": '"No puedo creer que seas tan..."', "Score": 0.94, "Acción": "🚫 Bloqueado"},
        {"Usuario": "@alpha_mod", "Comentario": '"Spam repetitivo de enlaces."', "Score": 0.82, "Acción": "🚩 Revisión"},
        {"Usuario": "@anon_404", "Comentario": '"Discurso de odio en contexto."', "Score": 0.98, "Acción": "📋 Archivado"},
        {"Usuario": "@user_123", "Comentario": '"¡Gran contenido, sigan!"', "Score": 0.03, "Acción": "✅ Aprobado"},
        {"Usuario": "@viewer_x", "Comentario": '"Esta gente debería desaparecer."', "Score": 0.97, "Acción": "🚫 Bloqueado"},
    ]
    initial_values = {
        "page": "Home",
        "selected_model": list(AVAILABLE_MODELS)[0],
        "threshold": 0.5,
        # Dict holding the comment that is awaiting the user's decision.
        "pending_modal": None,
        "comments": seed_comments,
        "hub_history": seed_history,
    }
    for key, value in initial_values.items():
        if key in st.session_state:
            continue  # keep whatever an earlier run stored
        st.session_state[key] = value
|
| 262 |
-
|
| 263 |
-
_init_state()
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
# ── Model cache ───────────────────────────────────────────────────────────────
|
| 267 |
-
@st.cache_resource(show_spinner="Cargando modelo...")
def get_service(model_name: str) -> ModelService:
    """Build the ``ModelService`` for ``model_name`` rooted at ``PROJECT_ROOT``.

    Decorated with ``st.cache_resource``, so the result is cached by Streamlit.
    """
    service = ModelService(model_name, PROJECT_ROOT)
    return service
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
# ══════════════════════════════════════════════════════════════════════════════
|
| 273 |
-
# SIDEBAR
|
| 274 |
-
# ══════════════════════════════════════════════════════════════════════════════
|
| 275 |
-
def render_sidebar():
    """Draw the sidebar: logo, page navigation and an active-model summary."""
    with st.sidebar:
        # Logo
        logo_html = (
            "<div style='padding:0.5rem 0 0.25rem; font-size:1.3rem; font-weight:700;'>"
            "🎬 <span style='color:#cc0000'>Signal</span>Mod</div>"
            "<div style='font-size:0.65rem; color:#aaa; margin-bottom:1.2rem;'>"
            "Signal within the Noise</div>"
        )
        st.markdown(logo_html, unsafe_allow_html=True)

        # One button per page; a click switches the page and reruns the script.
        pages = {"Home": "🏠", "Moderator Hub": "📊", "Settings": "⚙️"}
        for page, icon in pages.items():
            if st.button(f"{icon}  {page}", key=f"nav_{page}", use_container_width=True):
                st.session_state.page = page
                st.rerun()

        st.divider()

        # Active model info
        comments = st.session_state.comments
        model_short = st.session_state.selected_model.split("(")[0].strip()
        toxic_total = sum(1 for c in comments if c["is_toxic"])
        comment_total = len(comments)

        summary_html = (
            f"<div class='sidebar-model-info'>"
            f"Modelo activo<br><strong>{html.escape(model_short)}</strong>"
            f"<br><br>Comentarios: <strong>{comment_total}</strong>"
            f"  ·  Tóxicos: <strong style='color:#cc0000'>{toxic_total}</strong>"
            f"</div>"
        )
        st.markdown(summary_html, unsafe_allow_html=True)
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
# ══════════════════════════════════════════════════════════════════════════════
|
| 312 |
-
# MODAL — toxicidad detectada
|
| 313 |
-
# ══════════════════════════════════════════════════════════════════════════════
|
| 314 |
-
@st.dialog("⚠️ Aviso de Toxicidad Detectada")
def show_toxicity_modal():
    """Show the pending toxic comment in a modal and let the user decide.

    ``@st.dialog`` renders this function's content inside Streamlit's modal
    overlay. The comment is read from ``st.session_state.pending_modal``.
    The user can go back and edit it (the pending entry is cleared) or
    publish it anyway (it is appended to the comment list and logged at the
    top of the hub history). Both choices trigger a rerun.
    """
    data = st.session_state.pending_modal
    if not data:
        # Nothing pending: rerun so the dialog is not shown.
        st.rerun()
        return

    text = data["text"]
    prob = data["probability"]
    lbls = data["labels"]
    pct = int(prob * 100)
    # Red from 70 %, orange from 40 %, amber below that.
    color = "#cc0000" if pct >= 70 else "#ff6d00" if pct >= 40 else "#f5a623"

    st.markdown(
        "<div style='text-align:center; font-size:3rem; color:#cc0000'>⚠️</div>",
        unsafe_allow_html=True,
    )
    # The quotes around the excerpt are written as &quot; entities: a bare
    # double quote here would end the Python string literal early.
    st.markdown(
        f"<div style='background:#f8f8f8; border-radius:8px; padding:12px 16px;"
        f"font-style:italic; color:#333; text-align:center; margin:8px 0;'>"
        f"&quot;{html.escape(text[:140])}{'...' if len(text)>140 else ''}&quot;</div>",
        unsafe_allow_html=True,
    )

    # Toxicity bar
    st.markdown(
        f"<div style='display:flex; justify-content:space-between; "
        f"font-size:0.82rem; color:#606060; margin-top:12px;'>"
        f"<span>ÍNDICE DE TOXICIDAD</span>"
        f"<span style='color:{color}; font-weight:700'>{pct}%</span></div>"
        f"<div style='background:#e5e5e5; border-radius:4px; height:8px; margin-top:4px;'>"
        f"<div style='width:{pct}%; background:{color}; height:8px; border-radius:4px;'></div>"
        f"</div>",
        unsafe_allow_html=True,
    )

    # Labels
    if lbls:
        tags = " ".join(
            f"<span style='background:#ffe5e5; color:#cc0000; border-radius:14px;"
            f"padding:3px 10px; font-size:0.76rem; font-weight:600; margin:3px;'>"
            f"🚩 {html.escape(label)}</span>"
            for label in lbls
        )
        st.markdown(f"<div style='margin-top:10px'>{tags}</div>", unsafe_allow_html=True)

    st.markdown("<br>", unsafe_allow_html=True)

    col1, col2 = st.columns(2)
    with col1:
        if st.button("✏️ Editar comentario", use_container_width=True, type="primary"):
            st.session_state.pending_modal = None
            st.rerun()
    with col2:
        if st.button("Publicar de todas maneras", use_container_width=True):
            # Publish even though the comment was flagged as toxic.
            c = st.session_state.pending_modal
            st.session_state.comments.append(c)
            st.session_state.hub_history.insert(0, {
                "Usuario": "@usuario",
                "Comentario": f'"{c["text"][:45]}..."',
                "Score": round(c["probability"], 2),
                "Acción": "⚠️ Override usuario",
            })
            st.session_state.pending_modal = None
            st.rerun()
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
# ══════════════════════════════════════════════════════════════════════════════
|
| 388 |
-
# HOME — interfaz estilo YouTube
|
| 389 |
-
# ══════════════════════════════════════════════════════════════════════════════
|
| 390 |
-
def render_home():
|
| 391 |
-
# Disparar modal si hay comentario pendiente
|
| 392 |
-
if st.session_state.pending_modal:
|
| 393 |
-
show_toxicity_modal()
|
| 394 |
-
|
| 395 |
-
col_main, col_right = st.columns([2.8, 1], gap="large")
|
| 396 |
-
|
| 397 |
-
with col_main:
|
| 398 |
-
# Video
|
| 399 |
-
st.markdown(
|
| 400 |
-
"<div class='video-thumb'><div class='play-btn'>▶</div></div>",
|
| 401 |
-
unsafe_allow_html=True,
|
| 402 |
-
)
|
| 403 |
-
st.markdown(
|
| 404 |
-
"<div class='video-title'>AI Moderation Demo — Detección de Hate Speech en tiempo real</div>"
|
| 405 |
-
"<div class='video-meta'>15k vistas · 2 horas atrás</div>",
|
| 406 |
-
unsafe_allow_html=True,
|
| 407 |
-
)
|
| 408 |
-
row_ch, row_sub = st.columns([3, 1])
|
| 409 |
-
with row_ch:
|
| 410 |
-
st.markdown(
|
| 411 |
-
"<div style='display:flex; align-items:center; gap:10px; margin:10px 0;'>"
|
| 412 |
-
"<div style='width:36px; height:36px; border-radius:50%; background:#cc0000;"
|
| 413 |
-
"display:flex; align-items:center; justify-content:center; color:#fff;"
|
| 414 |
-
"font-weight:700;'>S</div>"
|
| 415 |
-
"<div><div class='channel-name'>SignalMod AI</div>"
|
| 416 |
-
"<div class='video-meta'>1.2M suscriptores</div></div></div>",
|
| 417 |
-
unsafe_allow_html=True,
|
| 418 |
-
)
|
| 419 |
-
|
| 420 |
-
st.divider()
|
| 421 |
-
|
| 422 |
-
# ── Comentarios ────────────────────────────────────────────────────
|
| 423 |
-
tox_cnt = sum(1 for c in st.session_state.comments if c["is_toxic"])
|
| 424 |
-
st.markdown(
|
| 425 |
-
f"<div class='sec-title'>{len(st.session_state.comments)} Comentarios "
|
| 426 |
-
f"<span style='font-size:0.8rem; color:#cc0000;'>· {tox_cnt} detectados</span></div>",
|
| 427 |
-
unsafe_allow_html=True,
|
| 428 |
-
)
|
| 429 |
-
|
| 430 |
-
# Input de nuevo comentario
|
| 431 |
-
new_text = st.text_area(
|
| 432 |
-
"Escribe un comentario...",
|
| 433 |
-
height=80, label_visibility="collapsed",
|
| 434 |
-
key="comment_input",
|
| 435 |
-
placeholder="Escribe un comentario...",
|
| 436 |
-
)
|
| 437 |
-
|
| 438 |
-
# Análisis en tiempo real (solo cuando hay texto)
|
| 439 |
-
analysis = None
|
| 440 |
-
if new_text.strip():
|
| 441 |
-
svc = get_service(st.session_state.selected_model)
|
| 442 |
-
analysis = svc.predict(new_text)
|
| 443 |
-
pct = int(analysis["probability"] * 100)
|
| 444 |
-
color = "#cc0000" if pct >= 70 else "#f5a623" if pct >= 40 else "#00c853"
|
| 445 |
-
verdict = "TÓXICO" if analysis["is_toxic"] else "SEGURO"
|
| 446 |
-
v_color = "#cc0000" if analysis["is_toxic"] else "#00c853"
|
| 447 |
-
st.markdown(
|
| 448 |
-
f"<div class='tox-row'>"
|
| 449 |
-
f"<span>🔍 Analizando...</span>"
|
| 450 |
-
f"<span style='background:{v_color}; color:#fff; border-radius:10px;"
|
| 451 |
-
f"padding:1px 9px; font-size:0.72rem; font-weight:700;'>{verdict}</span>"
|
| 452 |
-
f"<span style='color:{color}; font-weight:600;'>Toxicidad: {pct}%</span>"
|
| 453 |
-
f"<div class='tox-bar-bg'>"
|
| 454 |
-
f"<div class='tox-bar-fill' style='width:{pct}%; background:{color};'></div>"
|
| 455 |
-
f"</div></div>",
|
| 456 |
-
unsafe_allow_html=True,
|
| 457 |
-
)
|
| 458 |
-
|
| 459 |
-
col_c, col_p = st.columns([1, 1])
|
| 460 |
-
with col_c:
|
| 461 |
-
if st.button("Cancelar", use_container_width=True):
|
| 462 |
-
st.rerun()
|
| 463 |
-
with col_p:
|
| 464 |
-
post = st.button("Comentar", type="primary", use_container_width=True)
|
| 465 |
-
|
| 466 |
-
# Procesar envío
|
| 467 |
-
if post and new_text.strip():
|
| 468 |
-
if analysis is None:
|
| 469 |
-
svc = get_service(st.session_state.selected_model)
|
| 470 |
-
analysis = svc.predict(new_text)
|
| 471 |
-
|
| 472 |
-
comment_obj = {
|
| 473 |
-
"user" : "usuario",
|
| 474 |
-
"initial" : "U",
|
| 475 |
-
"text" : new_text.strip(),
|
| 476 |
-
"time" : "ahora",
|
| 477 |
-
"is_toxic" : analysis["is_toxic"],
|
| 478 |
-
"probability": analysis["probability"],
|
| 479 |
-
"labels" : analysis["labels"],
|
| 480 |
-
}
|
| 481 |
-
|
| 482 |
-
if analysis["is_toxic"]:
|
| 483 |
-
# Guardar en pendiente y mostrar modal en el próximo render
|
| 484 |
-
st.session_state.pending_modal = comment_obj
|
| 485 |
-
st.rerun()
|
| 486 |
-
else:
|
| 487 |
-
# Publicar directamente
|
| 488 |
-
st.session_state.comments.append(comment_obj)
|
| 489 |
-
st.session_state.hub_history.insert(0, {
|
| 490 |
-
"Usuario" : "@usuario",
|
| 491 |
-
"Comentario": f'"{new_text.strip()[:45]}{"..." if len(new_text)>45 else ""}"',
|
| 492 |
-
"Score" : round(analysis["probability"], 2),
|
| 493 |
-
"Acción" : "✅ Aprobado",
|
| 494 |
-
})
|
| 495 |
-
st.rerun()
|
| 496 |
-
|
| 497 |
-
# ── Lista de comentarios ───────────────────────────────────────────
|
| 498 |
-
for c in reversed(st.session_state.comments):
|
| 499 |
-
is_tox = c["is_toxic"]
|
| 500 |
-
pct = int(c["probability"] * 100)
|
| 501 |
-
av_class = "c-avatar" if is_tox else "c-avatar safe"
|
| 502 |
-
badge = (
|
| 503 |
-
"<span class='badge badge-toxic'>TÓXICO</span>" if is_tox
|
| 504 |
-
else "<span class='badge badge-safe'>SEGURO</span>"
|
| 505 |
-
)
|
| 506 |
-
text_class = "c-text toxic" if is_tox else "c-text"
|
| 507 |
-
flagged = "<div class='c-flagged'>🚩 Flagged for review</div>" if is_tox else ""
|
| 508 |
-
|
| 509 |
-
# html.escape() protege contra caracteres que rompen el HTML
|
| 510 |
-
safe_text = html.escape(c["text"])
|
| 511 |
-
safe_user = html.escape(c["user"])
|
| 512 |
-
initial = html.escape(c.get("initial", c["user"][0].upper()))
|
| 513 |
-
|
| 514 |
-
st.markdown(
|
| 515 |
-
f"<div class='comment-wrap'>"
|
| 516 |
-
f" <div class='{av_class}'>{initial}</div>"
|
| 517 |
-
f" <div class='c-body'>"
|
| 518 |
-
f" <div class='c-header'>"
|
| 519 |
-
f" <span class='c-user'>@{safe_user}</span>"
|
| 520 |
-
f" <span class='c-time'>{c['time']}</span>"
|
| 521 |
-
f" {badge}"
|
| 522 |
-
f" </div>"
|
| 523 |
-
f" <div class='{text_class}'>{safe_text}</div>"
|
| 524 |
-
f" {flagged}"
|
| 525 |
-
f" </div>"
|
| 526 |
-
f"</div>",
|
| 527 |
-
unsafe_allow_html=True,
|
| 528 |
-
)
|
| 529 |
-
|
| 530 |
-
# ── Columna derecha ────────────────────────────────────────────────────
|
| 531 |
-
with col_right:
|
| 532 |
-
st.markdown("**Sugeridos**")
|
| 533 |
-
suggested = [
|
| 534 |
-
("🤖", "Understanding Transformer Models...", "Neural Systems", "89k · 1 día"),
|
| 535 |
-
("🎓", "The Future of Content Moderation", "Tech Ethics Pro", "1.4M · 2 sem"),
|
| 536 |
-
("📡", "Signal vs Noise: SignalMod Deep Dive","SignalMod AI", "250k · 3 días"),
|
| 537 |
-
("💡", "Why AI Moderation is Harder Than...", "Ethics in Code", "45k · 5 h"),
|
| 538 |
-
("🔬", "Hate Speech Detection 2024", "AI Research Lab", "12k · 1 sem"),
|
| 539 |
-
]
|
| 540 |
-
for emoji, title, ch, meta in suggested:
|
| 541 |
-
st.markdown(
|
| 542 |
-
f"<div class='sug-card'>"
|
| 543 |
-
f" <div class='sug-thumb'>{emoji}</div>"
|
| 544 |
-
f" <div>"
|
| 545 |
-
f" <div class='sug-title'>{html.escape(title)}</div>"
|
| 546 |
-
f" <div class='sug-ch'>{html.escape(ch)}</div>"
|
| 547 |
-
f" <div class='sug-meta'>{html.escape(meta)}</div>"
|
| 548 |
-
f" </div>"
|
| 549 |
-
f"</div>",
|
| 550 |
-
unsafe_allow_html=True,
|
| 551 |
-
)
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
# ══════════════════════════════════════════════════════════════════════════════
|
| 555 |
-
# MODERATOR HUB
|
| 556 |
-
# ══════════════════════════════════════════════════════════════════════════════
|
| 557 |
-
def render_hub():
|
| 558 |
-
try:
|
| 559 |
-
import plotly.graph_objects as go
|
| 560 |
-
except ImportError:
|
| 561 |
-
st.error("Instala plotly: pip install plotly")
|
| 562 |
-
return
|
| 563 |
-
|
| 564 |
-
st.markdown("## 📊 Panel de Estadísticas")
|
| 565 |
-
|
| 566 |
-
# ── Cards de configuración ──────────────────────────────────────────────
|
| 567 |
-
model_short = st.session_state.selected_model.split("(")[0].strip()
|
| 568 |
-
c1, c2, c3 = st.columns(3)
|
| 569 |
-
for col, label, val in [
|
| 570 |
-
(c1, "MODEL ARCHITECTURE", model_short),
|
| 571 |
-
(c2, "CONFIDENCE THRESHOLD", f"{st.session_state.threshold:.2f} Alpha"),
|
| 572 |
-
(c3, "LANGUAGE COVERAGE", "English"),
|
| 573 |
-
]:
|
| 574 |
-
with col:
|
| 575 |
-
st.markdown(
|
| 576 |
-
f"<div class='hub-card'>"
|
| 577 |
-
f"<div class='hub-kpi-label'>{label}</div>"
|
| 578 |
-
f"<div style='font-weight:600; font-size:0.95rem; color:#0f0f0f;'>"
|
| 579 |
-
f"{html.escape(str(val))}</div></div>",
|
| 580 |
-
unsafe_allow_html=True,
|
| 581 |
-
)
|
| 582 |
-
|
| 583 |
-
st.write("")
|
| 584 |
-
|
| 585 |
-
# ── KPIs ────────��──────────────────────────────────────────────────────
|
| 586 |
-
total = len(st.session_state.comments) + 100
|
| 587 |
-
tox_cnt = sum(1 for c in st.session_state.comments if c["is_toxic"]) + 5
|
| 588 |
-
tox_rate = tox_cnt / total * 100
|
| 589 |
-
m1, m2, m3 = st.columns(3)
|
| 590 |
-
m1.metric("💬 Total comentarios", f"{total:,}", "+12%")
|
| 591 |
-
m2.metric("☠️ Tasa de toxicidad", f"{tox_rate:.1f}%",
|
| 592 |
-
f"+0.8%", delta_color="inverse")
|
| 593 |
-
m3.metric("🎯 F1 Score", "0.7579", "Stable")
|
| 594 |
-
|
| 595 |
-
st.divider()
|
| 596 |
-
|
| 597 |
-
# ── Gráficos ───────────────────────────────────────────────────────────
|
| 598 |
-
gcol, pcol = st.columns([2.2, 1])
|
| 599 |
-
|
| 600 |
-
with gcol:
|
| 601 |
-
days = ["Lun","Mar","Mié","Jue","Vie","Sáb","Dom"]
|
| 602 |
-
vals = [random.randint(30, 80) for _ in days]
|
| 603 |
-
vals[3] = max(vals) + 25
|
| 604 |
-
colors = ["#cc0000" if i == 3 else "#b3c6ff" for i in range(7)]
|
| 605 |
-
fig = go.Figure(go.Bar(x=days, y=vals, marker_color=colors, width=0.55))
|
| 606 |
-
fig.update_layout(
|
| 607 |
-
title="Tendencias de Toxicidad (7D)",
|
| 608 |
-
paper_bgcolor="#ffffff", plot_bgcolor="#ffffff",
|
| 609 |
-
margin=dict(l=20, r=20, t=40, b=20), height=260,
|
| 610 |
-
font=dict(size=11, color="#0f0f0f"),
|
| 611 |
-
)
|
| 612 |
-
fig.update_yaxes(showgrid=True, gridcolor="#f0f0f0", zeroline=False)
|
| 613 |
-
fig.update_xaxes(showgrid=False)
|
| 614 |
-
st.plotly_chart(fig, use_container_width=True)
|
| 615 |
-
|
| 616 |
-
with pcol:
|
| 617 |
-
fig2 = go.Figure(go.Pie(
|
| 618 |
-
labels=["Hate Speech","Insulto","Agresividad"],
|
| 619 |
-
values=[45, 35, 20],
|
| 620 |
-
hole=0.58,
|
| 621 |
-
marker_colors=["#cc0000","#0f0f0f","#909090"],
|
| 622 |
-
textfont_size=11,
|
| 623 |
-
))
|
| 624 |
-
fig2.update_layout(
|
| 625 |
-
title="Categorías",
|
| 626 |
-
paper_bgcolor="#ffffff",
|
| 627 |
-
margin=dict(l=10, r=10, t=40, b=10), height=260,
|
| 628 |
-
legend=dict(font=dict(size=10), orientation="v"),
|
| 629 |
-
font=dict(size=11, color="#0f0f0f"),
|
| 630 |
-
)
|
| 631 |
-
st.plotly_chart(fig2, use_container_width=True)
|
| 632 |
-
|
| 633 |
-
# ── Historial ──────────────────────────────────────────────────────────
|
| 634 |
-
st.markdown("### Historial Reciente")
|
| 635 |
-
df = pd.DataFrame(st.session_state.hub_history)
|
| 636 |
-
if not df.empty:
|
| 637 |
-
st.dataframe(
|
| 638 |
-
df, use_container_width=True, hide_index=True,
|
| 639 |
-
column_config={
|
| 640 |
-
"Score": st.column_config.ProgressColumn(
|
| 641 |
-
"Score", min_value=0, max_value=1, format="%.2f"
|
| 642 |
-
)
|
| 643 |
-
},
|
| 644 |
-
)
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
# ══════════════════════════════════════════════════════════════════════════════
|
| 648 |
-
# SETTINGS
|
| 649 |
-
# ══════════════════════════════════════════════════════════════════════════════
|
| 650 |
-
def render_settings():
|
| 651 |
-
st.markdown("## ⚙️ Ajustes")
|
| 652 |
-
|
| 653 |
-
# ── Selección de modelo ─────────────────────────────────────────────────
|
| 654 |
-
st.markdown("### 🤖 Modelo de detección",)
|
| 655 |
-
st.caption(
|
| 656 |
-
"Los modelos HuggingFace se descargan la primera vez (~300–600 MB). "
|
| 657 |
-
"Requieren: `pip install transformers torch sentencepiece`"
|
| 658 |
-
)
|
| 659 |
-
st.write("")
|
| 660 |
-
|
| 661 |
-
# Usamos st.radio para la selección — sin bugs de HTML
|
| 662 |
-
model_names = list(AVAILABLE_MODELS.keys())
|
| 663 |
-
current_idx = model_names.index(st.session_state.selected_model) \
|
| 664 |
-
if st.session_state.selected_model in model_names else 0
|
| 665 |
-
|
| 666 |
-
chosen = st.radio(
|
| 667 |
-
"Seleccionar modelo",
|
| 668 |
-
model_names,
|
| 669 |
-
index=current_idx,
|
| 670 |
-
label_visibility="collapsed",
|
| 671 |
-
)
|
| 672 |
-
|
| 673 |
-
if chosen != st.session_state.selected_model:
|
| 674 |
-
st.session_state.selected_model = chosen
|
| 675 |
-
st.rerun()
|
| 676 |
-
|
| 677 |
-
# Ficha del modelo seleccionado
|
| 678 |
-
info = AVAILABLE_MODELS[st.session_state.selected_model]
|
| 679 |
-
st.markdown(
|
| 680 |
-
f"<div class='model-card active'>"
|
| 681 |
-
f"<div class='model-card-name'>{info['icon']} {html.escape(st.session_state.selected_model)}</div>"
|
| 682 |
-
f"<div class='model-card-desc'>{html.escape(info['description'])}</div>"
|
| 683 |
-
f"<div style='margin-top:8px;'>"
|
| 684 |
-
f"<span class='model-pill'>⚡ {html.escape(info['speed'])}</span>"
|
| 685 |
-
f"<span class='model-pill'>🎯 {html.escape(info['accuracy'])}</span>"
|
| 686 |
-
f"<span class='model-pill'>📦 {html.escape(info['requires'])}</span>"
|
| 687 |
-
f"</div></div>",
|
| 688 |
-
unsafe_allow_html=True,
|
| 689 |
-
)
|
| 690 |
-
|
| 691 |
-
# Info sobre modelo fine-tuneado
|
| 692 |
-
if st.session_state.selected_model == "Modelo fine-tuneado (local)":
|
| 693 |
-
path = PROJECT_ROOT / "models" / "finetuned_hf"
|
| 694 |
-
if path.exists():
|
| 695 |
-
st.success(f"✅ Modelo encontrado en `{path}`")
|
| 696 |
-
else:
|
| 697 |
-
st.warning(
|
| 698 |
-
f"⚠️ No se encontró el modelo en `{path}`. "
|
| 699 |
-
f"Ejecuta el **notebook 08** para generar el modelo fine-tuneado."
|
| 700 |
-
)
|
| 701 |
-
|
| 702 |
-
st.divider()
|
| 703 |
-
|
| 704 |
-
# ── Umbral de confianza ─────────────────────────────────────────────────
|
| 705 |
-
st.markdown("### 🎚️ Umbral de confianza")
|
| 706 |
-
st.caption("Probabilidad mínima para marcar un comentario como tóxico.")
|
| 707 |
-
|
| 708 |
-
new_thr = st.slider(
|
| 709 |
-
"Umbral",
|
| 710 |
-
min_value=0.3, max_value=0.9, step=0.05,
|
| 711 |
-
value=st.session_state.threshold,
|
| 712 |
-
label_visibility="collapsed",
|
| 713 |
-
format="%.2f",
|
| 714 |
-
)
|
| 715 |
-
if new_thr != st.session_state.threshold:
|
| 716 |
-
st.session_state.threshold = new_thr
|
| 717 |
-
st.info(f"Umbral actualizado: **{new_thr:.2f}**")
|
| 718 |
-
|
| 719 |
-
ta, tb = st.columns(2)
|
| 720 |
-
ta.info(f"⬇️ **{new_thr:.2f}** bajo → más FP (más censura)", icon="⚠️")
|
| 721 |
-
tb.info(f"⬆️ **{new_thr:.2f}** alto → más FN (más escapes)", icon="⚠️")
|
| 722 |
-
|
| 723 |
-
st.divider()
|
| 724 |
-
|
| 725 |
-
# ── Test rápido ─────────────────────────────────────────────────────────
|
| 726 |
-
st.markdown("### 🧪 Probar modelo")
|
| 727 |
-
test_txt = st.text_input(
|
| 728 |
-
"Texto a analizar",
|
| 729 |
-
placeholder="Ej: This is absolutely stupid and racist...",
|
| 730 |
-
label_visibility="collapsed",
|
| 731 |
-
)
|
| 732 |
-
if st.button("Analizar", type="primary") and test_txt.strip():
|
| 733 |
-
with st.spinner("Analizando..."):
|
| 734 |
-
svc = get_service(st.session_state.selected_model)
|
| 735 |
-
res = svc.predict(test_txt)
|
| 736 |
-
|
| 737 |
-
pct = int(res["probability"] * 100)
|
| 738 |
-
verdict = "🔴 TÓXICO" if res["is_toxic"] else "🟢 SEGURO"
|
| 739 |
-
st.markdown(f"**{verdict}** — {pct}% de toxicidad")
|
| 740 |
-
st.progress(res["probability"])
|
| 741 |
-
if res["labels"]:
|
| 742 |
-
st.markdown(f"**Categorías:** {', '.join(res['labels'])}")
|
| 743 |
-
if "error" in res:
|
| 744 |
-
st.error(f"Error: {res['error']}")
|
| 745 |
-
st.caption(f"Modelo: {res['model_used']}")
|
| 746 |
-
|
| 747 |
-
|
| 748 |
-
# ══════════════════════════════════════════════════════════════════════════════
|
| 749 |
-
# MAIN
|
| 750 |
-
# ══════════════════════════════════════════════════════════════════════════════
|
| 751 |
-
def main():
|
| 752 |
-
render_sidebar()
|
| 753 |
-
|
| 754 |
-
page = st.session_state.page
|
| 755 |
-
if page == "Home":
|
| 756 |
-
render_home()
|
| 757 |
-
elif page == "Moderator Hub":
|
| 758 |
-
render_hub()
|
| 759 |
-
elif page == "Settings":
|
| 760 |
-
render_settings()
|
| 761 |
-
|
| 762 |
-
|
| 763 |
-
if __name__ == "__main__":
|
| 764 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/evaluation/.gitkeep
DELETED
|
File without changes
|
src/service/model_catalog.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Load inference model catalog from configs/model_catalog.yaml."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
import yaml
|
| 9 |
+
|
| 10 |
+
PROJECT_ROOT = Path(__file__).resolve().parents[2]
|
| 11 |
+
CATALOG_PATH = PROJECT_ROOT / "configs" / "model_catalog.yaml"
|
| 12 |
+
|
| 13 |
+
_DEFAULT_CATALOG: dict[str, dict[str, Any]] = {
|
| 14 |
+
"LR + TF-IDF (local)": {
|
| 15 |
+
"type": "local",
|
| 16 |
+
"icon": "⚡",
|
| 17 |
+
"description": "Project baseline.",
|
| 18 |
+
"speed": "< 50ms",
|
| 19 |
+
"accuracy": "F1 0.76",
|
| 20 |
+
"requires": "joblib only",
|
| 21 |
+
},
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def load_model_catalog() -> dict[str, dict[str, Any]]:
|
| 26 |
+
if not CATALOG_PATH.exists():
|
| 27 |
+
return dict(_DEFAULT_CATALOG)
|
| 28 |
+
with CATALOG_PATH.open(encoding="utf-8") as f:
|
| 29 |
+
data = yaml.safe_load(f) or {}
|
| 30 |
+
if not isinstance(data, dict) or not data:
|
| 31 |
+
return dict(_DEFAULT_CATALOG)
|
| 32 |
+
return data
|
src/service/model_service.py
CHANGED
|
@@ -1,99 +1,99 @@
|
|
| 1 |
-
"""
|
| 2 |
-
src/services/model_service.py
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
Modelos soportados:
|
| 7 |
-
local → models/final_model.joblib (LR + TF-IDF, instantáneo)
|
| 8 |
-
hf_remote → HuggingFace Hub (requiere internet + transformers)
|
| 9 |
-
hf_local → modelo HF fine-tuneado localmente (notebook 08)
|
| 10 |
-
|
| 11 |
-
Instalación para modelos HF:
|
| 12 |
-
pip install transformers torch sentencepiece accelerate
|
| 13 |
-
"""
|
| 14 |
|
| 15 |
import re
|
| 16 |
-
import
|
| 17 |
-
import joblib
|
| 18 |
from pathlib import Path
|
| 19 |
-
from typing import Optional
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
"
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
"
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
HF_LABEL_MAP = {
|
| 70 |
-
"toxic": "
|
| 71 |
-
"
|
| 72 |
-
"
|
| 73 |
-
"
|
|
|
|
|
|
|
|
|
|
| 74 |
}
|
| 75 |
|
| 76 |
_KEYWORD_LABELS = {
|
| 77 |
-
"
|
| 78 |
-
"
|
| 79 |
-
"
|
| 80 |
-
"
|
| 81 |
-
"
|
| 82 |
}
|
| 83 |
|
| 84 |
|
| 85 |
-
def _labels_from_keywords(text: str, probability: float) -> list:
|
| 86 |
t = text.lower()
|
| 87 |
found = [lbl for lbl, kws in _KEYWORD_LABELS.items() if any(k in t for k in kws)]
|
| 88 |
-
return found if found else (["
|
| 89 |
|
| 90 |
|
| 91 |
class _FallbackPreprocessor:
|
| 92 |
-
_SW = {
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
| 97 |
t = re.sub(r"http\S+|www\.\S+|@\w+", " ", str(text).lower())
|
| 98 |
t = re.sub(r"[^\x00-\x7F]+", " ", t)
|
| 99 |
t = re.sub(r"[^a-z\s]", " ", t)
|
|
@@ -103,10 +103,10 @@ class _FallbackPreprocessor:
|
|
| 103 |
|
| 104 |
class ModelService:
|
| 105 |
def __init__(self, model_name: str, project_root: Optional[Path] = None):
|
| 106 |
-
self.model_name
|
| 107 |
-
self.cfg
|
| 108 |
self.project_root = project_root or Path.cwd()
|
| 109 |
-
self._model
|
| 110 |
self._preprocessor = None
|
| 111 |
|
| 112 |
def _get_model(self):
|
|
@@ -119,84 +119,95 @@ class ModelService:
|
|
| 119 |
elif t == "hf_local":
|
| 120 |
path = self.project_root / self.cfg["model_path"]
|
| 121 |
if not path.exists():
|
| 122 |
-
raise FileNotFoundError(
|
| 123 |
-
f"Modelo no encontrado en {path}. Ejecuta el notebook 08 primero."
|
| 124 |
-
)
|
| 125 |
self._load_hf(str(path))
|
| 126 |
return self._model
|
| 127 |
|
| 128 |
-
def _load_local(self):
|
| 129 |
-
for name in
|
| 130 |
-
"lr_baseline.joblib","best_ensemble.joblib"]:
|
| 131 |
p = self.project_root / "models" / name
|
| 132 |
if p.exists():
|
| 133 |
self._model = joblib.load(p)
|
| 134 |
break
|
| 135 |
if self._model is None:
|
| 136 |
-
raise FileNotFoundError(f"No
|
| 137 |
try:
|
| 138 |
-
|
| 139 |
from src.features.text_preprocessor import TextPreprocessor
|
|
|
|
| 140 |
self._preprocessor = TextPreprocessor(
|
| 141 |
config_path=str(self.project_root / "configs" / "features.yaml")
|
| 142 |
)
|
| 143 |
except Exception:
|
| 144 |
self._preprocessor = _FallbackPreprocessor()
|
| 145 |
|
| 146 |
-
def _load_hf(self, model_id_or_path: str):
|
| 147 |
try:
|
| 148 |
from transformers import pipeline as hf_pipeline
|
| 149 |
-
except ImportError:
|
| 150 |
-
raise ImportError("
|
| 151 |
self._model = hf_pipeline(
|
| 152 |
-
"text-classification",
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
| 154 |
)
|
| 155 |
|
| 156 |
def predict(self, text: str) -> dict:
|
| 157 |
if not text or not text.strip():
|
| 158 |
-
return {"is_toxic": False, "probability": 0.0,
|
| 159 |
-
"labels": [], "model_used": self.model_name}
|
| 160 |
try:
|
| 161 |
model = self._get_model()
|
| 162 |
if self.cfg["type"] == "local":
|
| 163 |
return self._pred_local(text, model)
|
| 164 |
return self._pred_hf(text, model)
|
| 165 |
except Exception as e:
|
| 166 |
-
return {
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
clean = self._preprocessor.transform(text) or text
|
| 171 |
proba = float(model.predict_proba([clean])[0][1])
|
| 172 |
-
tox
|
| 173 |
-
return {
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
if key in smap:
|
| 182 |
-
proba = smap[key]
|
|
|
|
| 183 |
else:
|
| 184 |
-
neg
|
| 185 |
-
vals = [v for k,v in smap.items() if k not in neg]
|
| 186 |
proba = max(vals) if vals else 0.0
|
| 187 |
tox = proba >= 0.5
|
| 188 |
-
labels = []
|
| 189 |
if tox:
|
| 190 |
-
for k,v in smap.items():
|
| 191 |
-
if k not in ("label_0","non_toxic") and v >= 0.35:
|
| 192 |
-
friendly = HF_LABEL_MAP.get(k, k.replace("_"," ").title())
|
| 193 |
-
|
| 194 |
-
labels.append(friendly)
|
| 195 |
if not labels:
|
| 196 |
-
labels = ["
|
| 197 |
-
return {"is_toxic": tox, "probability": proba,
|
| 198 |
-
"labels": labels, "model_used": self.model_name}
|
| 199 |
|
| 200 |
@staticmethod
|
| 201 |
-
def get_available_models()
|
| 202 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Centralized toxicity prediction service."""
|
|
|
|
| 2 |
|
| 3 |
+
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
import re
|
| 6 |
+
import sys
|
|
|
|
| 7 |
from pathlib import Path
|
| 8 |
+
from typing import Any, Optional
|
| 9 |
+
|
| 10 |
+
import joblib
|
| 11 |
+
|
| 12 |
+
from src.service.model_catalog import load_model_catalog
|
| 13 |
+
|
| 14 |
+
AVAILABLE_MODELS: dict[str, dict[str, Any]] = load_model_catalog()
|
| 15 |
+
|
| 16 |
+
_HF_DEPS_MSG = "Install HF deps: uv sync --extra hf"
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def hf_deps_available() -> bool:
|
| 20 |
+
try:
|
| 21 |
+
import transformers # noqa: F401
|
| 22 |
+
|
| 23 |
+
return True
|
| 24 |
+
except ImportError:
|
| 25 |
+
return False
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def check_model_availability(name: str, project_root: Path | None = None) -> tuple[bool, str | None]:
|
| 29 |
+
"""Return (available, reason) for a catalog model name."""
|
| 30 |
+
cfg = AVAILABLE_MODELS.get(name)
|
| 31 |
+
if not cfg:
|
| 32 |
+
return False, "Unknown model"
|
| 33 |
+
|
| 34 |
+
root = project_root or Path.cwd()
|
| 35 |
+
model_type = cfg.get("type", "local")
|
| 36 |
+
|
| 37 |
+
if model_type == "local":
|
| 38 |
+
models_dir = root / "models"
|
| 39 |
+
if any((models_dir / n).exists() for n in (
|
| 40 |
+
"final_model.joblib",
|
| 41 |
+
"lr_tuned.joblib",
|
| 42 |
+
"lr_baseline.joblib",
|
| 43 |
+
"best_ensemble.joblib",
|
| 44 |
+
)):
|
| 45 |
+
return True, None
|
| 46 |
+
return False, f"No model in {models_dir}"
|
| 47 |
+
|
| 48 |
+
if model_type == "hf_local":
|
| 49 |
+
if not hf_deps_available():
|
| 50 |
+
return False, _HF_DEPS_MSG
|
| 51 |
+
path = root / cfg["model_path"]
|
| 52 |
+
if not path.exists():
|
| 53 |
+
return False, f"Model not found at {path}."
|
| 54 |
+
return True, None
|
| 55 |
+
|
| 56 |
+
if model_type == "hf_remote":
|
| 57 |
+
if not hf_deps_available():
|
| 58 |
+
return False, _HF_DEPS_MSG
|
| 59 |
+
return True, None
|
| 60 |
+
|
| 61 |
+
return False, f"Unsupported model type: {model_type}"
|
| 62 |
|
| 63 |
HF_LABEL_MAP = {
|
| 64 |
+
"toxic": "Toxic",
|
| 65 |
+
"severe_toxic": "Severely offensive",
|
| 66 |
+
"obscene": "Obscene",
|
| 67 |
+
"threat": "Threat",
|
| 68 |
+
"insult": "Insult",
|
| 69 |
+
"identity_hate": "Identity hate",
|
| 70 |
+
"label_1": "Toxic",
|
| 71 |
}
|
| 72 |
|
| 73 |
_KEYWORD_LABELS = {
|
| 74 |
+
"Insult": ["idiot", "stupid", "dumb", "fool", "moron", "loser"],
|
| 75 |
+
"Identity hate": ["thug", "racist", "race", "criminal"],
|
| 76 |
+
"Threat": ["kill", "shoot", "die", "dead", "hurt", "attack"],
|
| 77 |
+
"Obscene": ["fuck", "shit", "ass", "bitch", "cunt", "bastard"],
|
| 78 |
+
"Aggression": ["hate", "despise", "disgusting", "pathetic", "worthless"],
|
| 79 |
}
|
| 80 |
|
| 81 |
|
| 82 |
+
def _labels_from_keywords(text: str, probability: float) -> list[str]:
|
| 83 |
t = text.lower()
|
| 84 |
found = [lbl for lbl, kws in _KEYWORD_LABELS.items() if any(k in t for k in kws)]
|
| 85 |
+
return found if found else (["Offensive content"] if probability >= 0.5 else [])
|
| 86 |
|
| 87 |
|
| 88 |
class _FallbackPreprocessor:
|
| 89 |
+
_SW = {
|
| 90 |
+
"the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
|
| 91 |
+
"of", "with", "is", "it", "this", "that", "are", "was", "be", "have",
|
| 92 |
+
"has", "he", "she", "they", "we", "you", "i", "not", "do", "did",
|
| 93 |
+
"will", "can", "would", "should", "could", "from", "by", "as", "if",
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
def transform(self, text: str) -> str:
|
| 97 |
t = re.sub(r"http\S+|www\.\S+|@\w+", " ", str(text).lower())
|
| 98 |
t = re.sub(r"[^\x00-\x7F]+", " ", t)
|
| 99 |
t = re.sub(r"[^a-z\s]", " ", t)
|
|
|
|
| 103 |
|
| 104 |
class ModelService:
|
| 105 |
def __init__(self, model_name: str, project_root: Optional[Path] = None):
|
| 106 |
+
self.model_name = model_name
|
| 107 |
+
self.cfg = AVAILABLE_MODELS.get(model_name) or next(iter(AVAILABLE_MODELS.values()))
|
| 108 |
self.project_root = project_root or Path.cwd()
|
| 109 |
+
self._model = None
|
| 110 |
self._preprocessor = None
|
| 111 |
|
| 112 |
def _get_model(self):
|
|
|
|
| 119 |
elif t == "hf_local":
|
| 120 |
path = self.project_root / self.cfg["model_path"]
|
| 121 |
if not path.exists():
|
| 122 |
+
raise FileNotFoundError(f"Model not found at {path}.")
|
|
|
|
|
|
|
| 123 |
self._load_hf(str(path))
|
| 124 |
return self._model
|
| 125 |
|
| 126 |
+
def _load_local(self) -> None:
|
| 127 |
+
for name in ("final_model.joblib", "lr_tuned.joblib", "lr_baseline.joblib", "best_ensemble.joblib"):
|
|
|
|
| 128 |
p = self.project_root / "models" / name
|
| 129 |
if p.exists():
|
| 130 |
self._model = joblib.load(p)
|
| 131 |
break
|
| 132 |
if self._model is None:
|
| 133 |
+
raise FileNotFoundError(f"No model in {self.project_root / 'models'}")
|
| 134 |
try:
|
| 135 |
+
sys.path.insert(0, str(self.project_root))
|
| 136 |
from src.features.text_preprocessor import TextPreprocessor
|
| 137 |
+
|
| 138 |
self._preprocessor = TextPreprocessor(
|
| 139 |
config_path=str(self.project_root / "configs" / "features.yaml")
|
| 140 |
)
|
| 141 |
except Exception:
|
| 142 |
self._preprocessor = _FallbackPreprocessor()
|
| 143 |
|
| 144 |
+
def _load_hf(self, model_id_or_path: str) -> None:
    """Create a text-classification pipeline and store it on ``self._model``.

    Args:
        model_id_or_path: Value passed as the pipeline's ``model`` argument.

    Raises:
        ImportError: If ``transformers`` cannot be imported; the message
            names the extra to install.
    """
    try:
        from transformers import pipeline as hf_pipeline
    except ImportError as exc:
        raise ImportError("Install HF deps: uv sync --extra hf") from exc

    pipeline_options = {
        "model": model_id_or_path,
        "return_all_scores": True,
        "truncation": True,
        "max_length": 512,
    }
    self._model = hf_pipeline("text-classification", **pipeline_options)
|
| 156 |
|
| 157 |
def predict(self, text: str) -> dict:
|
| 158 |
if not text or not text.strip():
|
| 159 |
+
return {"is_toxic": False, "probability": 0.0, "labels": [], "model_used": self.model_name}
|
|
|
|
| 160 |
try:
|
| 161 |
model = self._get_model()
|
| 162 |
if self.cfg["type"] == "local":
|
| 163 |
return self._pred_local(text, model)
|
| 164 |
return self._pred_hf(text, model)
|
| 165 |
except Exception as e:
|
| 166 |
+
return {
|
| 167 |
+
"is_toxic": False,
|
| 168 |
+
"probability": 0.0,
|
| 169 |
+
"labels": [],
|
| 170 |
+
"model_used": self.model_name,
|
| 171 |
+
"error": str(e),
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
def _pred_local(self, text: str, model) -> dict:
|
| 175 |
clean = self._preprocessor.transform(text) or text
|
| 176 |
proba = float(model.predict_proba([clean])[0][1])
|
| 177 |
+
tox = proba >= 0.5
|
| 178 |
+
return {
|
| 179 |
+
"is_toxic": tox,
|
| 180 |
+
"probability": proba,
|
| 181 |
+
"labels": _labels_from_keywords(text, proba) if tox else [],
|
| 182 |
+
"model_used": self.model_name,
|
| 183 |
+
}
|
| 184 |
+
|
| 185 |
+
def _pred_hf(self, text: str, pipeline_fn) -> dict:
|
| 186 |
+
raw = pipeline_fn(text[:512])
|
| 187 |
+
smap = {s["label"].lower(): s["score"] for s in (raw[0] if isinstance(raw[0], list) else raw)}
|
| 188 |
+
proba = 0.0
|
| 189 |
+
for key in ("label_1", "toxic", "toxic_1"):
|
| 190 |
if key in smap:
|
| 191 |
+
proba = smap[key]
|
| 192 |
+
break
|
| 193 |
else:
|
| 194 |
+
neg = {"label_0", "non_toxic", "not_toxic", "not toxic"}
|
| 195 |
+
vals = [v for k, v in smap.items() if k not in neg]
|
| 196 |
proba = max(vals) if vals else 0.0
|
| 197 |
tox = proba >= 0.5
|
| 198 |
+
labels: list[str] = []
|
| 199 |
if tox:
|
| 200 |
+
for k, v in smap.items():
|
| 201 |
+
if k not in ("label_0", "non_toxic") and v >= 0.35:
|
| 202 |
+
friendly = HF_LABEL_MAP.get(k, k.replace("_", " ").title())
|
| 203 |
+
labels.append(friendly)
|
|
|
|
| 204 |
if not labels:
|
| 205 |
+
labels = ["Offensive content"]
|
| 206 |
+
return {"is_toxic": tox, "probability": proba, "labels": labels, "model_used": self.model_name}
|
|
|
|
| 207 |
|
| 208 |
@staticmethod
def get_available_models() -> dict:
    """Return the module-level ``AVAILABLE_MODELS`` catalog (not a copy)."""
    return AVAILABLE_MODELS
|
| 211 |
+
|
| 212 |
+
def get_model_info(self) -> dict:
|
| 213 |
+
return self.cfg
|
tests/test_api.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""Tests
|
| 2 |
|
| 3 |
from unittest.mock import MagicMock
|
| 4 |
|
|
@@ -6,11 +6,14 @@ import pytest
|
|
| 6 |
from fastapi.testclient import TestClient
|
| 7 |
|
| 8 |
from src.api import main as api_main
|
|
|
|
| 9 |
|
| 10 |
PREDICT_RESPONSE_KEYS = {
|
| 11 |
"text",
|
| 12 |
"is_toxic",
|
| 13 |
"probability",
|
|
|
|
|
|
|
| 14 |
"labels",
|
| 15 |
"model_used",
|
| 16 |
"latency_ms",
|
|
@@ -28,13 +31,16 @@ def client():
|
|
| 28 |
}
|
| 29 |
|
| 30 |
with TestClient(api_main.app) as test_client:
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
| 34 |
yield test_client
|
| 35 |
|
| 36 |
-
|
| 37 |
-
|
|
|
|
| 38 |
|
| 39 |
|
| 40 |
def test_predict_returns_correct_structure(client: TestClient):
|
|
@@ -47,14 +53,79 @@ def test_predict_returns_correct_structure(client: TestClient):
|
|
| 47 |
data = response.json()
|
| 48 |
assert PREDICT_RESPONSE_KEYS <= set(data.keys())
|
| 49 |
assert data["text"] == "This is a sample comment"
|
|
|
|
|
|
|
| 50 |
assert isinstance(data["is_toxic"], bool)
|
| 51 |
assert 0.0 <= data["probability"] <= 1.0
|
| 52 |
-
assert isinstance(data["labels"], list)
|
| 53 |
-
assert isinstance(data["model_used"], str)
|
| 54 |
-
assert isinstance(data["latency_ms"], (int, float))
|
| 55 |
|
| 56 |
|
| 57 |
def test_predict_rejects_empty_text(client: TestClient):
|
| 58 |
response = client.post("/predict", json={"text": " "})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
assert response.status_code == 422
|
|
|
|
|
|
| 1 |
+
"""Tests for POST /predict."""
|
| 2 |
|
| 3 |
from unittest.mock import MagicMock
|
| 4 |
|
|
|
|
| 6 |
from fastapi.testclient import TestClient
|
| 7 |
|
| 8 |
from src.api import main as api_main
|
| 9 |
+
from src.api.state import get_state
|
| 10 |
|
| 11 |
PREDICT_RESPONSE_KEYS = {
|
| 12 |
"text",
|
| 13 |
"is_toxic",
|
| 14 |
"probability",
|
| 15 |
+
"status",
|
| 16 |
+
"mode",
|
| 17 |
"labels",
|
| 18 |
"model_used",
|
| 19 |
"latency_ms",
|
|
|
|
| 31 |
}
|
| 32 |
|
| 33 |
with TestClient(api_main.app) as test_client:
|
| 34 |
+
state = get_state()
|
| 35 |
+
state["service"] = mock_service
|
| 36 |
+
state["model_name"] = "LR + TF-IDF (local)"
|
| 37 |
+
state["predictions_served"] = 0
|
| 38 |
+
state["startup_time"] = 0.0
|
| 39 |
yield test_client
|
| 40 |
|
| 41 |
+
state = get_state()
|
| 42 |
+
state["service"] = None
|
| 43 |
+
state["model_name"] = None
|
| 44 |
|
| 45 |
|
| 46 |
def test_predict_returns_correct_structure(client: TestClient):
|
|
|
|
| 53 |
data = response.json()
|
| 54 |
assert PREDICT_RESPONSE_KEYS <= set(data.keys())
|
| 55 |
assert data["text"] == "This is a sample comment"
|
| 56 |
+
assert data["status"] == "Safe"
|
| 57 |
+
assert data["mode"] == "binary"
|
| 58 |
assert isinstance(data["is_toxic"], bool)
|
| 59 |
assert 0.0 <= data["probability"] <= 1.0
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
|
| 62 |
def test_predict_rejects_empty_text(client: TestClient):
    """A whitespace-only ``text`` must be rejected with HTTP 422."""
    blank_payload = {"text": " "}
    result = client.post("/predict", json=blank_payload)
    assert result.status_code == 422
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def test_health_includes_project_name(client: TestClient):
    """``GET /health`` succeeds and reports the project identifier."""
    health = client.get("/health")
    assert health.status_code == 200
    body = health.json()
    assert body["project"] == "youtube_hate_detector"
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def test_predict_video_demo_comments_differ_by_url(client: TestClient, monkeypatch):
    """Without an API key, demo comments are served and vary per video URL."""
    monkeypatch.delenv("YOUTUBE_API_KEY", raising=False)

    def _analyze(video_url):
        # Same request shape for both videos; only the URL differs.
        return client.post(
            "/predict-video",
            json={"url": video_url, "max_comments": 5, "threshold": 0.5},
        )

    first = _analyze("https://www.youtube.com/watch?v=jNQXAC9IVRw")
    second = _analyze("https://www.youtube.com/watch?v=IEEhzQoKtQU")

    assert first.status_code == 200
    assert second.status_code == 200
    first_body = first.json()
    second_body = second.json()
    assert first_body["source"] == "demo"
    assert second_body["source"] == "demo"
    assert first_body["results"][0]["text"] != second_body["results"][0]["text"]
|
| 100 |
+
|
| 101 |
|
| 102 |
+
def test_models_status_lists_catalog(client: TestClient):
    """``GET /models/status`` lists at least the local LR + TF-IDF model."""
    response = client.get("/models/status")
    assert response.status_code == 200
    payload = response.json()
    assert "models" in payload
    catalog = payload["models"]
    assert len(catalog) >= 1
    listed_names = {entry["name"] for entry in catalog}
    assert "LR + TF-IDF (local)" in listed_names
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def test_predict_video_comments_disabled_raises_422(client: TestClient, monkeypatch):
    """A ``CommentsFetchError`` from the fetcher surfaces as HTTP 422."""
    from src.api.youtube import CommentsFetchError

    # A key must be present so the route takes the real-fetch path.
    monkeypatch.setenv("YOUTUBE_API_KEY", "fake-key")

    def _raise_disabled(*_args, **_kwargs):
        raise CommentsFetchError("Comments are disabled on this video")

    monkeypatch.setattr("src.api.routes.predict.fetch_comments", _raise_disabled)

    request_body = {
        "url": "https://www.youtube.com/watch?v=disabled123",
        "max_comments": 5,
        "threshold": 0.5,
    }
    response = client.post("/predict-video", json=request_body)

    assert response.status_code == 422
    detail = response.json()["detail"]
    assert "disabled" in detail.lower()
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|