Spaces:

devrup404
/

SignalMod

Running

App Files Files Community

Mirae Kang committed 4 days ago

Commit

e317d56

1 Parent(s): f0b240d

feat: update UI using VITE+React without streamlit, #22

Browse files

Files changed (28) hide show

.dockerignore +1 -1
.env.example +10 -7
.python-version +1 -0
Dockerfile +30 -15
README.md +74 -178
configs/model_catalog.yaml +43 -0
configs/suggested_videos.yaml +15 -0
docker-compose.yml +12 -40
docs/ARCHITECTURE.md +34 -54
pyproject.toml +41 -0
requirements.txt +0 -17
src/api/__init__.py +1 -0
src/api/main.py +73 -421
src/api/routes/__init__.py +1 -0
src/api/routes/health.py +22 -0
src/api/routes/models.py +78 -0
src/api/routes/predict.py +77 -0
src/api/routes/videos.py +17 -0
src/api/schemas.py +93 -0
src/api/services.py +47 -0
src/api/state.py +16 -0
src/api/youtube.py +187 -0
src/app/app.py +0 -764
src/evaluation/.gitkeep +0 -0
src/service/model_catalog.py +32 -0
src/service/model_service.py +135 -124
tests/test_api.py +80 -9
uv.lock +0 -0

.dockerignore CHANGED Viewed

@@ -20,6 +20,7 @@ tests
 !README.md
 .env
 .env.*
 frontend/dist
 models/checkpoints
 models/**/checkpoints
@@ -28,4 +29,3 @@ models/roberta_hate_results
 models/distilbert_results
 models/best_distilbert
 models/nb08_*
-models/*_frozen

 !README.md
 .env
 .env.*
+!.env.example
 frontend/dist
 models/checkpoints
 models/**/checkpoints
 models/distilbert_results
 models/best_distilbert
 models/nb08_*

.env.example CHANGED Viewed

@@ -1,15 +1,18 @@
-# Copy to .env for local development:  cp .env.example .env
-# Docker Compose reads these via environment (optional).
-# YouTube Data API v3 (optional — /predict-video and scraping)
 # https://console.cloud.google.com/apis/credentials
 YOUTUBE_API_KEY=
-# Active model (must match a key in ModelService.AVAILABLE_MODELS)
 MODEL_NAME=LR + TF-IDF (local)
 # development | production
-ENV=production
-# Used by Streamlit when calling the API from another host (Docker sets this automatically)
-API_URL=http://localhost:8000

+# Copy to .env:  cp .env.example .env
+# Docker Compose reads YOUTUBE_API_KEY from your environment.
+# YouTube Data API v3 — required for real suggested videos and /predict-video
 # https://console.cloud.google.com/apis/credentials
 YOUTUBE_API_KEY=
+# Active model (key from configs/model_catalog.yaml)
 MODEL_NAME=LR + TF-IDF (local)
 # development | production
+ENV=development
+# Optional: frontend dev when API is on another host (default uses Vite proxy)
+VITE_API_BASE_URL=
+# Docker only: build with Hugging Face models (see README)
+# INSTALL_HF=1 docker compose build --build-arg INSTALL_HF=1

.python-version ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3.12

Dockerfile CHANGED Viewed

@@ -1,30 +1,40 @@
-# youtube_hate_detector — shared image for FastAPI + Streamlit services
 FROM python:3.12-slim-bookworm
 ENV PYTHONDONTWRITEBYTECODE=1 \
     PYTHONUNBUFFERED=1 \
     PYTHONPATH=/app \
     NLTK_DATA=/app/nltk_data \
     MODEL_NAME="LR + TF-IDF (local)" \
-    ENV=production
 WORKDIR /app
-# System deps for spaCy / sklearn wheels
 RUN apt-get update \
-    && apt-get install -y --no-install-recommends build-essential curl \
     && rm -rf /var/lib/apt/lists/*
-COPY requirements.txt .
-# CPU-only PyTorch keeps the image smaller; sufficient for the default local LR model
-RUN pip install --no-cache-dir --upgrade pip \
-    && pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu \
-    && pip install --no-cache-dir -r requirements.txt \
-    && python -m spacy download en_core_web_sm
-# NLTK corpora used by TextPreprocessor
-RUN python - <<'PY'
 import nltk
 for pkg in ("stopwords", "punkt"):
     nltk.download(pkg, download_dir="/app/nltk_data")
@@ -33,8 +43,13 @@ PY
 COPY configs/ configs/
 COPY src/ src/
 COPY models/final_model.joblib models/final_model.joblib
-# Default env template (overridden by docker-compose)
-COPY env.example .env.example
-EXPOSE 8000 8501

+# youtube_hate_detector — multi-stage: React + FastAPI (uv)
+FROM node:22-bookworm-slim AS frontend-build
+WORKDIR /app/frontend
+COPY frontend/package.json frontend/package-lock.json* ./
+RUN npm ci 2>/dev/null || npm install
+COPY frontend/ ./
+RUN npm run build
 FROM python:3.12-slim-bookworm
+ARG INSTALL_HF=0
 ENV PYTHONDONTWRITEBYTECODE=1 \
     PYTHONUNBUFFERED=1 \
     PYTHONPATH=/app \
     NLTK_DATA=/app/nltk_data \
     MODEL_NAME="LR + TF-IDF (local)" \
+    ENV=production \
+    INSTALL_HF=${INSTALL_HF}
 WORKDIR /app
 RUN apt-get update \
+    && apt-get install -y --no-install-recommends curl \
     && rm -rf /var/lib/apt/lists/*
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
+COPY pyproject.toml uv.lock* README.md ./
+RUN if [ "$INSTALL_HF" = "1" ]; then \
+      uv sync --frozen --no-dev --extra hf 2>/dev/null || uv sync --no-dev --extra hf; \
+    else \
+      uv sync --frozen --no-dev 2>/dev/null || uv sync --no-dev; \
+    fi
+RUN uv run python -m spacy download en_core_web_sm \
+    && uv run python - <<'PY'
 import nltk
 for pkg in ("stopwords", "punkt"):
     nltk.download(pkg, download_dir="/app/nltk_data")
 COPY configs/ configs/
 COPY src/ src/
 COPY models/final_model.joblib models/final_model.joblib
+COPY models/finetuned_hf/ models/finetuned_hf/
+COPY --from=frontend-build /app/frontend/dist frontend/dist
+COPY .env.example .env.example
+EXPOSE 8000
+HEALTHCHECK --interval=10s --timeout=5s --retries=12 --start-period=60s \
+  CMD curl -f http://localhost:8000/health || exit 1
+CMD ["uv", "run", "uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000"]

README.md CHANGED Viewed

@@ -1,247 +1,143 @@
-# YouTube Toxic Comment Detector (SignalMod)
 [![Python](https://img.shields.io/badge/python-3.12-blue.svg)](https://www.python.org/downloads/)
 [![FastAPI](https://img.shields.io/badge/FastAPI-0.136-009688.svg)](https://fastapi.tiangolo.com/)
-[![Streamlit](https://img.shields.io/badge/Streamlit-UI-FF4B4B.svg)](https://streamlit.io/)
 [![Docker](https://img.shields.io/badge/docker-compose-2496ED.svg)](https://docs.docker.com/compose/)
 **Español:** [README.es.md](README.es.md)
-Automated **Safe vs Toxic** classification for YouTube-style comments. The production stack is **FastAPI** (REST inference) plus **Streamlit** (watch-page style UI). The default model is **Logistic Regression + TF-IDF** (`models/final_model.joblib`).
 ---
-## Project description
-| Item | Detail |
-|------|--------|
-| **Goal** | Help moderation teams flag toxic comments quickly |
-| **Dataset** | `data/raw/youtoxic_english_1000.csv` (~1k English comments) |
-| **Target** | `IsToxic` → **Safe (0)** / **Toxic (1)** |
-| **Primary metric** | Weighted F1 and ROC-AUC (imbalanced classes) |
-| **Overfitting check** | \|CV F1 − test F1\| &lt; 5 percentage points (project rubric) |
----
-## Architecture
 ```
 youtube_hate_detector/
-├── configs/              # YAML: pipeline, features, models, best_params
-├── data/raw/             # Source CSV (not committed if gitignored)
 ├── models/               # final_model.joblib, experiments/
-├── reports/              # summary.csv, plots, pipeline artifacts
 ├── src/
-│   ├── api/              # FastAPI — /predict, /predict-batch, …
-│   ├── app/              # Streamlit UI (src/app/app.py)
-│   ├── data/             # load_raw_data, scraping helpers
-│   ├── evaluation/       # Evaluator — metrics, ROC, confusion matrix
-│   ├── features/         # TextPreprocessor, Vectorizer
-│   ├── models/           # LR, RF, XGBoost baselines
-│   ├── pipeline/         # run_pipeline.py — train end-to-end
-│   └── service/          # ModelService — shared inference layer
-├── tests/
-├── Dockerfile
 └── docker-compose.yml
 ```
-**Runtime flow**
-1. **Training:** `load_raw_data` → `TextPreprocessor` → `build_model().fit()` → `Evaluator` → `reports/summary.csv`
-2. **API:** `uvicorn` loads `ModelService` → `POST /predict`
-3. **Streamlit:** `ModelService.predict()` in-process (same models as API catalog)
-See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for more detail.
 ---
-## Installation
-**Requirements:** Python 3.12+, ~2 GB disk for dependencies (optional PyTorch if using Hugging Face models in the UI).
-```bash
-git clone https://github.com/Bootcamp-IA-P6/Project_9_Equipo3.git
-cd Project_9_Equipo3   # or your local folder name
-python -m venv .venv
-source .venv/bin/activate   # Windows: .venv\Scripts\activate
-pip install -r requirements.txt
-python -m spacy download en_core_web_sm
-```
-**Data:** place `youtoxic_english_1000.csv` under `data/raw/` (path in `configs/pipeline.yaml`).
-**Environment:**
 ```bash
 cp .env.example .env
-# Optional: YOUTUBE_API_KEY for /predict-video
-# MODEL_NAME must match a key in ModelService (default: LR + TF-IDF (local))
-```
----
-## Training pipeline
-End-to-end training and evaluation:
-```bash
-python -m src.pipeline.run_pipeline --model lr
-# Options: lr | rf | xgboost
 ```
-**Phases:** load data → stratified split → spaCy/NLTK preprocessing → train → 5-fold CV → test metrics → save `models/experiments/{model}/` → MLflow → update [`reports/summary.csv`](reports/summary.csv) and plots under `reports/pipeline/{model}/`.
-Config files:
-| File | Purpose |
-|------|---------|
-| `configs/pipeline.yaml` | Paths, `IsToxic`, test_size, CV folds |
-| `configs/features.yaml` | Preprocessing + TF-IDF |
-| `configs/models.yaml` | Classifier hyperparameters |
-| `configs/best_params.yaml` | Optuna winner (LR) |
-Details: [docs/PIPELINE.md](docs/PIPELINE.md)
----
-## Run with Docker
-```bash
-docker compose up --build
-```
-| Service | URL |
-|---------|-----|
-| Streamlit | http://localhost:8501 |
-| FastAPI | http://localhost:8000 |
-| Swagger | http://localhost:8000/docs |
-```bash
-export YOUTUBE_API_KEY=your_key   # optional
-docker compose down               # stop
-```
-Containers: `youtube_hate_detector-api`, `youtube_hate_detector-streamlit`.
 ---
-## Local run (without Docker)
 ```bash
 # Terminal 1 — API
-uvicorn src.api.main:app --reload --host 0.0.0.0 --port 8000
-# Terminal 2 — Streamlit
-streamlit run src/app/app.py --server.port 8501
 ```
----
-## API examples
-Full reference: [docs/API.md](docs/API.md)
-**Health check**
 ```bash
-curl -s http://localhost:8000/ | python -m json.tool
-```
-**Single prediction**
-```bash
-curl -s -X POST http://localhost:8000/predict \
-  -H "Content-Type: application/json" \
-  -d '{"text": "This video is amazing, thanks for sharing!", "threshold": 0.5}'
 ```
-Example response:
-```json
-{
-  "text": "This video is amazing, thanks for sharing!",
-  "is_toxic": false,
-  "probability": 0.08,
-  "labels": [],
-  "model_used": "LR + TF-IDF (local)",
-  "latency_ms": 12.5
-}
-```
-**Batch**
-```bash
-curl -s -X POST http://localhost:8000/predict-batch \
-  -H "Content-Type: application/json" \
-  -d '{"texts": ["Great content!", "You are an idiot"], "threshold": 0.5}'
-```
-**List / switch models**
 ```bash
-curl -s http://localhost:8000/models
-curl -s -X PUT http://localhost:8000/model/DistilBERT%20Toxicity
 ```
----
-## Results
-Best **sklearn** model on the project test split (from `configs/best_params.yaml`):
-| Metric | Value |
-|--------|-------|
-| F1 (weighted, test) | **0.7579** |
-| ROC-AUC | **0.81** |
-| False positives | 18 |
-| False negatives | 30 |
-| CV–test gap | **4.76 pp** (within 5 pp target) |
-| Train–test gap | 14.07 pp |
-Plots and EDA: `reports/v2/`. Per-run artifacts: `reports/pipeline/{lr,rf,xgboost}/`.
 ---
-## Technical results report
-Full write-up (decisions, metrics, error analysis, limitations, roadmap):
-- **English:** [reports/final_report.md](reports/final_report.md)
-- **Español:** [reports/final_report.es.md](reports/final_report.es.md)
-## Model comparison
-Canonical table: [`reports/summary.csv`](reports/summary.csv)
-Human-readable: [docs/RESULTS.md](docs/RESULTS.md)
-| Model | Family | F1 (test) | ROC-AUC | FP | FN | Production default |
-|-------|--------|-----------|---------|----|----|--------------------|
-| LR + TF-IDF (tuned) | sklearn | 0.7579 | 0.81 | 18 | 30 | Yes |
-| LR + TF-IDF (local) | sklearn | 0.7579 | 0.81 | 18 | 30 | Yes (`final_model.joblib`) |
-| RF / XGBoost | sklearn | — | — | — | — | Run pipeline to fill |
-| DistilBERT / toxic-bert / RoBERTa | Hugging Face | — | — | — | — | Optional via API/UI |
-Re-run `python -m src.pipeline.run_pipeline --model rf` to append RF metrics to `summary.csv`.
 ---
 ## Tests
 ```bash
-pytest tests/ -v
 ```
-Covers preprocessor, vectorizer, model binary output, and `/predict` response shape.
 ---
-## Documentation index
-| English | Español |
-|---------|---------|
-| [docs/API.md](docs/API.md) | [docs/API.es.md](docs/API.es.md) |
-| [docs/PIPELINE.md](docs/PIPELINE.md) | [docs/PIPELINE.es.md](docs/PIPELINE.es.md) |
-| [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) | [docs/ARCHITECTURE.es.md](docs/ARCHITECTURE.es.md) |
-| [docs/RESULTS.md](docs/RESULTS.md) | [docs/RESULTS.es.md](docs/RESULTS.es.md) |
-| [reports/final_report.md](reports/final_report.md) | [reports/final_report.es.md](reports/final_report.es.md) |

+# YouTube Toxic Comment Detector (youtube_hate_detector)
 [![Python](https://img.shields.io/badge/python-3.12-blue.svg)](https://www.python.org/downloads/)
 [![FastAPI](https://img.shields.io/badge/FastAPI-0.136-009688.svg)](https://fastapi.tiangolo.com/)
+[![React](https://img.shields.io/badge/React-UI-61DAFB.svg)](https://react.dev/)
 [![Docker](https://img.shields.io/badge/docker-compose-2496ED.svg)](https://docs.docker.com/compose/)
 **Español:** [README.es.md](README.es.md)
+Automated **Safe vs Toxic** classification for YouTube-style comments. Production stack: **FastAPI** (REST) + **React** (YouTube Watch UI). Default model: **Logistic Regression + TF-IDF** (`models/final_model.joblib`).
 ---
+## Clone and layout
+```bash
+git clone <your-repo-url>
+cd youtube_hate_detector   # use this folder name locally (team convention)
+```
 ```
 youtube_hate_detector/
+├── configs/              # pipeline, features, model_catalog, suggested_videos
+├── frontend/             # React SPA (Vite)
 ├── models/               # final_model.joblib, experiments/
 ├── src/
+│   ├── api/              # FastAPI routes
+│   └── service/          # ModelService (inference)
+├── pyproject.toml        # uv dependencies
+├── uv.lock
 └── docker-compose.yml
 ```
 ---
+## How to use FastAPI
+The API loads `ModelService` once at startup and its endpoints return JSON; the React app provides the UI (in production, FastAPI also serves the built SPA from `frontend/dist`).
 ```bash
 cp .env.example .env
+uv sync                    # baseline (LR model only)
+uv sync --extra hf         # required for DistilBERT / toxic-bert / RoBERTa / fine-tuned local HF models
+uv run uvicorn src.api.main:app --reload --port 8000
 ```
+Verify HF deps: `uv run python -c "import transformers; print('ok')"`.
+| Resource | URL |
+|----------|-----|
+| Swagger | http://localhost:8000/docs |
+| Health | http://localhost:8000/health |
+**Main endpoints**
+| Method | Path | Description |
+|--------|------|-------------|
+| `POST` | `/predict` | Score one comment `{ "text", "threshold" }` |
+| `POST` | `/predict-video` | Fetch YouTube comments + score `{ "url", "max_comments", "threshold" }` |
+| `GET` | `/videos/suggested` | Metadata for right-rail videos (from `configs/suggested_videos.yaml`) |
+| `GET` | `/models` | Available models |
+| `GET` | `/models/status` | Per-model availability (HF deps, local weights) |
+| `PUT` | `/model/{name}` | Switch active model (warmup-validated) |
+Set `YOUTUBE_API_KEY` in `.env` for real comments and suggested-video thumbnails.
+**Change models without UI changes:** edit [`configs/model_catalog.yaml`](configs/model_catalog.yaml), then restart the API or use Settings in the app.
 ---
+## React UI (local dev)
 ```bash
 # Terminal 1 — API
+uv run uvicorn src.api.main:app --reload --port 8000
+# Terminal 2 — frontend (proxies API)
+cd frontend && npm install && npm run dev
 ```
+Open http://localhost:5173 to see the Watch page: a staged demo player, real suggested videos (click one to load its comments), and an English-language UI.
+---
+## Docker
 ```bash
+export YOUTUBE_API_KEY=your_key   # optional but recommended
+docker compose up --build         # LR model only (default)
+# Hugging Face models (transformers + torch; larger image):
+INSTALL_HF=1 docker compose build --build-arg INSTALL_HF=1
+INSTALL_HF=1 docker compose up
 ```
+| URL | Service |
+|-----|---------|
+| http://localhost:8000 | API + built React SPA |
+| http://localhost:8000/docs | Swagger |
+Container: `youtube_hate_detector-app`.
+---
+## Training (unchanged)
 ```bash
+uv run python -m src.pipeline.run_pipeline --model lr
 ```
+See [docs/PIPELINE.md](docs/PIPELINE.md).
 ---
+## Configuration
+| File | Purpose |
+|------|---------|
+| `.env` | Secrets and runtime settings (`YOUTUBE_API_KEY`, `MODEL_NAME`) |
+| `configs/model_catalog.yaml` | Inference models for API/UI |
+| `configs/suggested_videos.yaml` | YouTube IDs for the suggested rail |
+| `configs/pipeline.yaml` | Training data paths |
 ---
 ## Tests
 ```bash
+uv sync --extra dev --extra hf
+uv run pytest
 ```
 ---
+## Briefing vs team stack
+| Topic | Briefing | This repo |
+|-------|----------|-----------|
+| UI | Streamlit | **React** |
+| API | FastAPI | **FastAPI** |
+| Package manager | varies | **`uv`** |
+Legacy Streamlit (`src/app/`) has been removed.

configs/model_catalog.yaml ADDED Viewed

	@@ -0,0 +1,43 @@

+"LR + TF-IDF (local)":
+  type: local
+  icon: "⚡"
+  description: "Project baseline. No GPU, instant inference."
+  speed: "< 50ms"
+  accuracy: "F1 0.76"
+  requires: "joblib only"
+"DistilBERT Toxicity":
+  type: hf_remote
+  icon: "🤖"
+  model_id: martin-ha/toxic-comment-model
+  description: "DistilBERT fine-tuned on toxic comments (Hugging Face Hub)."
+  speed: "~200ms CPU"
+  accuracy: "F1 0.85"
+  requires: "uv sync --extra hf"
+"toxic-bert (multilabel)":
+  type: hf_remote
+  icon: "🧠"
+  model_id: unitary/toxic-bert
+  description: "BERT multi-label (Jigsaw). Six toxicity categories (Hugging Face Hub)."
+  speed: "~400ms CPU"
+  accuracy: "F1 0.88"
+  requires: "uv sync --extra hf"
+"RoBERTa Toxicity":
+  type: hf_remote
+  icon: "🔬"
+  model_id: s-nlp/roberta_toxicity_classifier
+  description: "RoBERTa fine-tuned for general toxicity (Hugging Face Hub)."
+  speed: "~350ms CPU"
+  accuracy: "F1 0.87"
+  requires: "uv sync --extra hf"
+"Fine-tuned (local HF)":
+  type: hf_local
+  icon: "✨"
+  model_path: models/finetuned_hf
+  description: "Locally fine-tuned Hugging Face model (models/finetuned_hf)."
+  speed: "Hardware dependent"
+  accuracy: "TBD"
+  requires: "uv sync --extra hf"

configs/suggested_videos.yaml ADDED Viewed

	@@ -0,0 +1,15 @@

+# Suggested videos for the watch-page right rail (edit ids only).
+# Prefer embed-friendly videos with comments enabled (avoid Vevo music IDs).
+max_comments: 50
+videos:
+  - id: jNQXAC9IVRw
+    note: Me at the zoo — first YouTube upload; comments enabled
+  - id: IEEhzQoKtQU
+    note: 3Blue1Brown — embed-friendly educational
+  - id: dQw4w9WgXcQ
+    note: Rick Astley — usually embeddable
+  - id: e-z0xWm0xK0
+    note: Kurzgesagt — educational, comments on
+  - id: aKydtOUFkeg
+    note: TED-style talk — embed-friendly

docker-compose.yml CHANGED Viewed

@@ -1,21 +1,18 @@
-# youtube_hate_detector — API + Streamlit UI
-# Start everything:  docker compose up --build
-# Stop:             docker compose down
 name: youtube_hate_detector
 services:
-  api:
-    build: .
     image: youtube_hate_detector:latest
-    container_name: youtube_hate_detector-api
-    command:
-      - uvicorn
-      - src.api.main:app
-      - --host
-      - "0.0.0.0"
-      - --port
-      - "8000"
     ports:
       - "8000:8000"
     environment:
@@ -24,34 +21,9 @@ services:
       YOUTUBE_API_KEY: ${YOUTUBE_API_KEY:-}
       NLTK_DATA: /app/nltk_data
     healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:8000/"]
       interval: 10s
       timeout: 5s
       retries: 12
-      start_period: 40s
-    restart: unless-stopped
-  streamlit:
-    # Reuses the image built by `api` — do not add `build:` here (parallel builds race on the same tag)
-    image: youtube_hate_detector:latest
-    container_name: youtube_hate_detector-streamlit
-    command:
-      - streamlit
-      - run
-      - src/app/app.py
-      - --server.port=8501
-      - --server.address=0.0.0.0
-      - --server.headless=true
-      - --browser.gatherUsageStats=false
-    ports:
-      - "8501:8501"
-    environment:
-      MODEL_NAME: "LR + TF-IDF (local)"
-      ENV: production
-      API_URL: http://api:8000
-      YOUTUBE_API_KEY: ${YOUTUBE_API_KEY:-}
-      NLTK_DATA: /app/nltk_data
-    depends_on:
-      api:
-        condition: service_healthy
     restart: unless-stopped

+# youtube_hate_detector — FastAPI + React (single service)
+# Start:  docker compose up --build
+# Stop:   docker compose down
 name: youtube_hate_detector
 services:
+  app:
+    build:
+      context: .
+      args:
+        # Set INSTALL_HF=1 for Hugging Face models (larger image, ~1–2 GB extra)
+        INSTALL_HF: ${INSTALL_HF:-0}
     image: youtube_hate_detector:latest
+    container_name: youtube_hate_detector-app
     ports:
       - "8000:8000"
     environment:
       YOUTUBE_API_KEY: ${YOUTUBE_API_KEY:-}
       NLTK_DATA: /app/nltk_data
     healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
       interval: 10s
       timeout: 5s
       retries: 12
+      start_period: 60s
     restart: unless-stopped

docs/ARCHITECTURE.md CHANGED Viewed

@@ -1,66 +1,46 @@
-# System architecture
-## Components
 ```mermaid
-flowchart TB
-  subgraph data [Data layer]
-    CSV[data/raw/youtoxic_english_1000.csv]
-    CFG[configs/*.yaml]
-  end
-  subgraph training [Training]
-    PIPE[run_pipeline.py]
-    PRE[TextPreprocessor]
-    BL[build_model LR RF XGB]
-    EV[Evaluator]
-    CSV --> PIPE
-    CFG --> PIPE
-    PIPE --> PRE --> BL --> EV
-    EV --> SUM[reports/summary.csv]
-    BL --> JOB[models/experiments/]
-  end
-  subgraph inference [Inference]
-    MS[ModelService]
-    JOB2[models/final_model.joblib]
-    JOB2 --> MS
-    API[FastAPI src/api/main.py]
-    UI[Streamlit src/app/app.py]
-    MS --> API
-    MS --> UI
-  end
 ```
-## Module map
-| Module | Responsibility |
-|--------|----------------|
-| `src/data/loader.py` | Load raw CSV, optional processed paths |
-| `src/features/text_preprocessor.py` | Clean and lemmatize text |
-| `src/features/vectorizer.py` | Standalone TF-IDF (notebooks); baselines embed TF-IDF in sklearn `Pipeline` |
-| `src/models/baseline.py` | `LRModel`, `RFModel`, `XGBModel`, `build_model()` |
-| `src/evaluation/evaluator.py` | Metrics, ROC, confusion matrix, error analysis, `summary.csv` |
-| `src/pipeline/run_pipeline.py` | Orchestrates training + evaluation |
-| `src/service/model_service.py` | Loads joblib or Hugging Face models; `predict(text)` |
-| `src/api/main.py` | REST endpoints, lifespan model load |
-| `src/app/app.py` | Streamlit UI; calls `ModelService` directly |
-## Label strategy
-- **Binary default:** column `IsToxic` → Safe `0`, Toxic `1`
-- User-facing strings: **Safe** / **Toxic** (not “hate” or “harmful” in the UI copy)
-- API returns `is_toxic` and `probability` (P(toxic))
 ## Docker
-[`docker-compose.yml`](../docker-compose.yml) runs two containers from one image:
-- `youtube_hate_detector-api` — uvicorn port 8000
-- `youtube_hate_detector-streamlit` — port 8501
-Both include `final_model.joblib`, configs, spaCy, and NLTK data baked into the image.
-## Tests
-[`tests/`](../tests/) — preprocessor, vectorizer, model binary outputs, `/predict` schema (mocked service).

+# Architecture — youtube_hate_detector
+## Runtime (production)
 ```mermaid
+flowchart LR
+  Browser[React SPA]
+  API[FastAPI :8000]
+  MS[ModelService]
+  YT[YouTube Data API]
+  Browser -->|HTTP JSON| API
+  API --> MS
+  API --> YT
 ```
+- **UI:** `frontend/` built to `frontend/dist`, served by FastAPI `StaticFiles` in production.
+- **Inference:** Only `ModelService` in `src/service/` loads models.
+- **Catalog:** `configs/model_catalog.yaml` — add models without React changes.
+- **Suggested videos:** `configs/suggested_videos.yaml` — YouTube video IDs for the right rail.
+## Local development
+| Process | Command | Port |
+|---------|---------|------|
+| API | `uv run uvicorn src.api.main:app --reload` | 8000 |
+| UI | `cd frontend && npm run dev` | 5173 (proxies API) |
 ## Docker
+Single service `app` (container `youtube_hate_detector-app`) on port **8000** (API + static UI).
+## API layout
+```
+src/api/
+  main.py           # app factory, CORS, static mount
+  schemas.py        # Pydantic models
+  services.py       # predict helpers
+  youtube.py        # comment fetch + metadata
+  state.py          # shared app state
+  routes/
+    health.py
+    models.py
+    predict.py
+    videos.py
+```

pyproject.toml ADDED Viewed

	@@ -0,0 +1,41 @@

+[project]
+name = "youtube_hate_detector"
+version = "1.0.0"
+description = "YouTube toxic comment detector — FastAPI + React"
+readme = "README.md"
+requires-python = ">=3.12,<3.13"
+dependencies = [
+    "fastapi>=0.136.1",
+    "uvicorn[standard]>=0.47.0",
+    "scikit-learn>=1.8.0",
+    "spacy>=3.8.14",
+    "nltk>=3.9.4",
+    "pandas>=3.0.2",
+    "PyYAML>=6.0.3",
+    "python-dotenv>=1.2.2",
+    "joblib>=1.5.3",
+    "pydantic>=2.13.4",
+    "httpx>=0.28.1",
+    "google-api-python-client>=2.100.0",
+]
+[project.optional-dependencies]
+hf = [
+    "transformers>=5.9.0",
+    "torch>=2.0.0",
+    "sentencepiece>=0.2.0",
+    "accelerate>=0.30.0",
+]
+dev = [
+    "pytest>=8.0.0",
+]
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[tool.hatch.build.targets.wheel]
+packages = ["src"]
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+pythonpath = ["."]

requirements.txt DELETED Viewed

@@ -1,17 +0,0 @@
-# Runtime dependencies for API + Streamlit (Docker and local installs)
-fastapi==0.136.1
-uvicorn[standard]==0.47.0
-streamlit>=1.41.0,<2
-scikit-learn==1.8.0
-spacy==3.8.14
-nltk==3.9.4
-pandas==3.0.2
-PyYAML==6.0.3
-python-dotenv==1.2.2
-joblib==1.5.3
-pydantic==2.13.4
-transformers==5.9.0
-httpx==0.28.1
-matplotlib>=3.8.0
-seaborn>=0.13.0
-mlflow>=2.0.0

src/api/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """FastAPI application package."""

src/api/main.py CHANGED Viewed

@@ -1,462 +1,114 @@
 """
-src/api/main.py
-API REST de producción para detección de hate speech.
-Ejecutar con: uvicorn src.api.main:app --reload --port 8000
-Documentación automática en:
-    http://localhost:8000/docs      (Swagger UI)
-    http://localhost:8000/redoc     (ReDoc)
-Endpoints:
-    GET  /                  → health check
-    GET  /model-info        → info del modelo activo
-    GET  /models            → lista de modelos disponibles
-    POST /predict           → predice un comentario
-    POST /predict-batch     → predice una lista de comentarios
-    POST /predict-video     → dado URL de YouTube, predice todos sus comentarios
-    PUT  /model/{name}      → cambia el modelo activo
 """
 import os
-import sys
 import time
-import logging
-from pathlib import Path
-from typing import Optional
 from contextlib import asynccontextmanager
-from dotenv import load_dotenv
-load_dotenv()
-from fastapi import FastAPI, HTTPException, BackgroundTasks, Query
 from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel, Field, field_validator
-# ── Setup path ────────────────────────────────────────────────────────────────
-PROJECT_ROOT = Path(__file__).resolve().parents[2]
-sys.path.insert(0, str(PROJECT_ROOT))
-from src.service.model_service import ModelService, AVAILABLE_MODELS
 from src.utils.logger import get_logger
 logger = get_logger(__name__)
-# ── Estado global de la app ───────────────────────────────────────────────────
-# El modelo se carga una sola vez al iniciar la API y se reutiliza.
-# Esto evita cargar el modelo en cada request (costoso en tiempo).
-_state: dict = {
-    "service"      : None,
-    "model_name"   : None,
-    "startup_time" : None,
-    "predictions_served": 0,
-}
-# ══════════════════════════════════════════════════════════════════════════════
-# LIFESPAN — carga del modelo al iniciar la API
-# ══════════════════════════════════════════════════════════════════════════════
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    """
-    Lifespan context manager de FastAPI.
-    Carga el modelo al iniciar la app y libera recursos al cerrarla.
-    """
-    # Startup
-    model_name = os.getenv("MODEL_NAME", list(AVAILABLE_MODELS.keys())[0])
-    logger.info(f"Iniciando API — cargando modelo: {model_name}")
-    _state["service"]      = ModelService(model_name, PROJECT_ROOT)
-    _state["model_name"]   = model_name
-    _state["startup_time"] = time.time()
-    # Warm-up: predecir un texto de prueba para que el modelo quede en memoria
     try:
-        _state["service"].predict("test warmup text")
-        logger.info("Modelo cargado y warm-up completado ✅")
-    except Exception as e:
-        logger.warning(f"Warm-up falló (no crítico): {e}")
-    yield  # La API está lista
-    # Shutdown
-    logger.info("API cerrándose — limpiando recursos")
-    _state["service"] = None
-# ══════════════════════════════════════════════════════════════════════════════
-# APP
-# ══════════════════════════════════════════════════════════════════════════════
 app = FastAPI(
-    title       = "SignalMod API",
-    description = "API de detección de hate speech en comentarios de YouTube",
-    version     = "1.0.0",
-    lifespan    = lifespan,
 )
-# CORS: permite que el Streamlit (puerto 8501) llame a la API (puerto 8000)
 app.add_middleware(
     CORSMiddleware,
-    allow_origins  = ["*"],
-    allow_methods  = ["*"],
-    allow_headers  = ["*"],
 )
-# ══════════════════════════════════════════════════════════════════════════════
-# SCHEMAS — Pydantic valida automáticamente los datos de entrada/salida
-# ══════════════════════════════════════════════════════════════════════════════
-class PredictRequest(BaseModel):
-    """Cuerpo del request para predecir un comentario."""
-    text     : str  = Field(..., min_length=1, max_length=5000,
-                            description="Comentario a analizar")
-    threshold: float = Field(0.5, ge=0.0, le=1.0,
-                             description="Umbral de probabilidad para clasificar como tóxico")
-    @field_validator("text")
-    @classmethod
-    def text_not_empty(cls, v):
-        if not v.strip():
-            raise ValueError("El texto no puede estar vacío")
-        return v.strip()
-class PredictResponse(BaseModel):
-    """Respuesta de la predicción."""
-    text       : str
-    is_toxic   : bool
-    probability: float = Field(..., ge=0.0, le=1.0)
-    labels     : list[str]
-    model_used : str
-    latency_ms : float
-class BatchPredictRequest(BaseModel):
-    """Request para predecir múltiples comentarios."""
-    texts    : list[str] = Field(..., min_length=1, max_length=100)
-    threshold: float      = Field(0.5, ge=0.0, le=1.0)
-class BatchPredictResponse(BaseModel):
-    """Respuesta de predicción batch."""
-    results      : list[PredictResponse]
-    total        : int
-    toxic_count  : int
-    latency_ms   : float
-class VideoRequest(BaseModel):
-    """Request para analizar comentarios de un video de YouTube."""
-    url        : str   = Field(..., description="URL del video de YouTube")
-    max_comments: int  = Field(50, ge=1, le=200,
-                               description="Número máximo de comentarios a analizar")
-    threshold  : float = Field(0.5, ge=0.0, le=1.0)
-class VideoResponse(BaseModel):
-    """Respuesta del análisis de un video de YouTube."""
-    video_url   : str
-    total_fetched: int
-    toxic_count : int
-    toxic_rate  : float
-    results     : list[PredictResponse]
-    error       : Optional[str] = None
-class ModelInfo(BaseModel):
-    """Información sobre el modelo activo."""
-    name        : str
-    type        : str
-    description : str
-    speed       : str
-    accuracy    : str
-    uptime_s    : float
-    predictions_served: int
-# ══════════════════════════════════════════════════════════════════════════════
-# HELPERS
-# ══════════════════════════════════════════════════════════════════════════════
-def _get_service() -> ModelService:
-    """Devuelve el servicio activo o lanza 503 si no está listo."""
-    if _state["service"] is None:
-        raise HTTPException(status_code=503, detail="Modelo no cargado. Intenta en unos segundos.")
-    return _state["service"]
-def _predict_single(text: str, threshold: float) -> tuple[dict, float]:
-    """Predice un texto y devuelve (result, latency_ms)."""
-    t0     = time.perf_counter()
-    result = _get_service().predict(text)
-    ms     = round((time.perf_counter() - t0) * 1000, 2)
-    # Aplicar umbral personalizado
-    result["is_toxic"] = result["probability"] >= threshold
-    if not result["is_toxic"]:
-        result["labels"] = []
-    _state["predictions_served"] += 1
-    return result, ms
-def _scrape_youtube_comments(url: str, max_comments: int) -> list[str]:
-    """
-    Obtiene comentarios de un video de YouTube.
-    Estrategia:
-    1. Intentar con YouTube Data API v3 (si hay API key en .env)
-    2. Fallback: BeautifulSoup (sin autenticación, limitado)
-    """
-    api_key = os.getenv("YOUTUBE_API_KEY", "")
-    if api_key:
-        return _fetch_via_api(url, api_key, max_comments)
-    else:
-        return _fetch_via_scraper(url, max_comments)
-def _fetch_via_api(url: str, api_key: str, max_comments: int) -> list[str]:
-    """Obtiene comentarios usando YouTube Data API v3."""
-    try:
-        import re
-        from googleapiclient.discovery import build
-        # Extraer video_id de la URL
-        patterns = [
-            r"youtube\.com/watch\?v=([a-zA-Z0-9_-]{11})",
-            r"youtu\.be/([a-zA-Z0-9_-]{11})",
-            r"youtube\.com/embed/([a-zA-Z0-9_-]{11})",
-        ]
-        video_id = None
-        for pattern in patterns:
-            match = re.search(pattern, url)
-            if match:
-                video_id = match.group(1)
-                break
-        if not video_id:
-            raise ValueError(f"No se pudo extraer video_id de: {url}")
-        youtube  = build("youtube", "v3", developerKey=api_key)
-        comments = []
-        page_token = None
-        while len(comments) < max_comments:
-            request = youtube.commentThreads().list(
-                part       = "snippet",
-                videoId    = video_id,
-                maxResults = min(100, max_comments - len(comments)),
-                pageToken  = page_token,
-                textFormat = "plainText",
-            )
-            response = request.execute()
-            for item in response.get("items", []):
-                text = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
-                comments.append(text)
-            page_token = response.get("nextPageToken")
-            if not page_token:
-                break
-        logger.info(f"YouTube API: {len(comments)} comentarios obtenidos")
-        return comments[:max_comments]
-    except Exception as e:
-        logger.warning(f"YouTube API falló: {e} — usando fallback")
-        return _fetch_via_scraper(url, max_comments)
-def _fetch_via_scraper(url: str, max_comments: int) -> list[str]:
-    """
-    Fallback: simula comentarios si no hay API key.
-    En producción real debería usar BeautifulSoup + Selenium.
-    """
-    logger.warning(
-        "YOUTUBE_API_KEY no configurada. "
-        "Configura tu API key en .env para obtener comentarios reales. "
-        "Usando comentarios de ejemplo."
-    )
-    # Comentarios de ejemplo para demo sin API key
-    example_comments = [
-        "This video is really informative, thanks for sharing!",
-        "You are all stupid idiots, get out of here!",
-        "Great content, I learned a lot from this.",
-        "These people should be eliminated from society.",
-        "I agree with the presenter's point of view.",
-        "What a bunch of racist criminals!",
-        "Thank you for this analysis, very helpful.",
-        "Kill them all, they don't deserve to live.",
-        "Interesting perspective on the topic.",
-        "This is absolute bullshit propaganda!",
-        "I think we need to look at both sides.",
-        "Black people are thugs and criminals.",
-        "The data presented here is compelling.",
-        "Go back to where you came from!",
-        "Well researched video, good job.",
-    ]
-    return example_comments[:max_comments]
-# ══════════════════════════════════════════════════════════════════════════════
-# ENDPOINTS
-# ══════════════════════════════════════════════════════════════════════════════
-@app.get("/", tags=["Health"])
-async def health_check():
-    """
-    Verifica que la API está funcionando.
-    Útil para Docker healthcheck y load balancers.
-    """
-    service = _state["service"]
-    return {
-        "status"  : "ok" if service else "loading",
-        "model"   : _state["model_name"],
-        "uptime_s": round(time.time() - _state["startup_time"], 1)
-                    if _state["startup_time"] else 0,
-    }
-@app.get("/model-info", response_model=ModelInfo, tags=["Model"])
-async def get_model_info():
-    """Devuelve información sobre el modelo activo."""
-    service = _get_service()
-    info    = service.get_model_info()
-    return ModelInfo(
-        name              = _state["model_name"],
-        type              = info.get("type", "unknown"),
-        description       = info.get("description", ""),
-        speed             = info.get("speed", ""),
-        accuracy          = info.get("accuracy", ""),
-        uptime_s          = round(time.time() - _state["startup_time"], 1),
-        predictions_served= _state["predictions_served"],
-    )
-@app.get("/models", tags=["Model"])
-async def list_models():
-    """Lista todos los modelos disponibles."""
-    return {
-        "available": list(AVAILABLE_MODELS.keys()),
-        "active"   : _state["model_name"],
-    }
-@app.put("/model/{model_name}", tags=["Model"])
-async def switch_model(model_name: str):
-    """
-    Cambia el modelo activo.
-    El nuevo modelo se carga de forma lazy en el siguiente request de predicción.
-    """
-    if model_name not in AVAILABLE_MODELS:
-        raise HTTPException(
-            status_code=400,
-            detail=f"Modelo '{model_name}' no disponible. "
-                   f"Opciones: {list(AVAILABLE_MODELS.keys())}",
-        )
-    _state["service"]  = ModelService(model_name, PROJECT_ROOT)
-    _state["model_name"] = model_name
-    logger.info(f"Modelo cambiado a: {model_name}")
-    return {"message": f"Modelo cambiado a '{model_name}'", "model": model_name}
-@app.post("/predict", response_model=PredictResponse, tags=["Prediction"])
-async def predict(request: PredictRequest):
-    """
-    Predice si un comentario es tóxico.
-    - **text**: el comentario a analizar
-    - **threshold**: umbral de probabilidad (default 0.5)
-    Devuelve la probabilidad, si es tóxico y las categorías detectadas.
-    """
-    result, ms = _predict_single(request.text, request.threshold)
-    if "error" in result:
-        raise HTTPException(status_code=500, detail=result["error"])
-    return PredictResponse(
-        text       = request.text,
-        is_toxic   = result["is_toxic"],
-        probability= round(result["probability"], 4),
-        labels     = result["labels"],
-        model_used = result["model_used"],
-        latency_ms = ms,
-    )
-@app.post("/predict-batch", response_model=BatchPredictResponse, tags=["Prediction"])
-async def predict_batch(request: BatchPredictRequest):
-    """
-    Predice una lista de comentarios en un solo request.
-    Más eficiente que llamar /predict N veces.
-    Máximo 100 comentarios por request.
-    """
-    t0      = time.perf_counter()
-    results = []
-    for text in request.texts:
-        if not text.strip():
-            continue
-        result, _ = _predict_single(text, request.threshold)
-        results.append(PredictResponse(
-            text       = text,
-            is_toxic   = result["is_toxic"],
-            probability= round(result["probability"], 4),
-            labels     = result["labels"],
-            model_used = result["model_used"],
-            latency_ms = 0.0,
-        ))
-    total_ms     = round((time.perf_counter() - t0) * 1000, 2)
-    toxic_count  = sum(1 for r in results if r.is_toxic)
-    return BatchPredictResponse(
-        results     = results,
-        total       = len(results),
-        toxic_count = toxic_count,
-        latency_ms  = total_ms,
-    )
-@app.post("/predict-video", response_model=VideoResponse, tags=["Prediction"])
-async def predict_video(request: VideoRequest):
-    """
-    Dado un URL de YouTube, obtiene los comentarios y predice su toxicidad.
-    Requiere YOUTUBE_API_KEY en el archivo .env para obtener comentarios reales.
-    Sin API key usa comentarios de ejemplo para la demo.
-    """
-    # Obtener comentarios
-    try:
-        comments = _scrape_youtube_comments(request.url, request.max_comments)
-    except Exception as e:
-        raise HTTPException(status_code=422, detail=f"Error al obtener comentarios: {e}")
-    if not comments:
-        raise HTTPException(status_code=404, detail="No se encontraron comentarios en el video")
-    # Predecir batch
-    t0      = time.perf_counter()
-    results = []
-    for text in comments:
-        if not text.strip():
-            continue
-        result, _ = _predict_single(text, request.threshold)
-        results.append(PredictResponse(
-            text       = text,
-            is_toxic   = result["is_toxic"],
-            probability= round(result["probability"], 4),
-            labels     = result["labels"],
-            model_used = result["model_used"],
-            latency_ms = 0.0,
-        ))
-    total_ms    = round((time.perf_counter() - t0) * 1000, 2)
-    toxic_count = sum(1 for r in results if r.is_toxic)
-    return VideoResponse(
-        video_url    = request.url,
-        total_fetched= len(results),
-        toxic_count  = toxic_count,
-        toxic_rate   = round(toxic_count / len(results), 4) if results else 0.0,
-        results      = results,
-    )

 """
+youtube_hate_detector API
+Run: uv run uvicorn src.api.main:app --reload --port 8000
+Docs: http://localhost:8000/docs
 """
+from __future__ import annotations
 import os
 import time
 from contextlib import asynccontextmanager
+from pathlib import Path
+from dotenv import load_dotenv
+from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import FileResponse
+from fastapi.staticfiles import StaticFiles
+load_dotenv()
+from src.api.routes import health, models, predict, videos
+from src.api.state import PROJECT_ROOT, get_state
+from src.service.model_service import AVAILABLE_MODELS, ModelService, check_model_availability
 from src.utils.logger import get_logger
 logger = get_logger(__name__)
+FRONTEND_DIST = PROJECT_ROOT / "frontend" / "dist"
 @asynccontextmanager
 async def lifespan(app: FastAPI):
+    """Load the configured model at startup and release it at shutdown.
+
+    The model named by the ``MODEL_NAME`` environment variable is used when
+    ``check_model_availability`` accepts it; otherwise the first entry of
+    ``AVAILABLE_MODELS`` is used and a warning is logged. A failed warm-up is
+    logged but does not prevent the API from starting.
+    """
+    state = get_state()
+    default_model = next(iter(AVAILABLE_MODELS))
+    model_name = os.getenv("MODEL_NAME", default_model)
+    available, reason = check_model_availability(model_name, PROJECT_ROOT)
+    if not available:
+        logger.warning(
+            "MODEL_NAME '%s' unavailable (%s) — using '%s'",
+            model_name,
+            reason,
+            default_model,
+        )
+        model_name = default_model
+    logger.info("Starting youtube_hate_detector API — model: %s", model_name)
+    state["service"] = ModelService(model_name, PROJECT_ROOT)
+    state["model_name"] = model_name
+    state["startup_time"] = time.time()
+    state["predictions_served"] = 0
     try:
+        warmup = state["service"].predict("warmup")
+        # predict() can report a failure through an "error" key instead of
+        # raising, so a clean return alone does not prove the model works.
+        if isinstance(warmup, dict) and warmup.get("error"):
+            logger.warning("Warm-up failed (non-critical): %s", warmup["error"])
+        else:
+            logger.info("Model warm-up complete")
+    except Exception as exc:
+        logger.warning("Warm-up failed (non-critical): %s", exc)
+    yield
+    state["service"] = None
+    logger.info("API shutdown")
+# Application instance; `lifespan` loads the model on startup and clears it on shutdown.
 app = FastAPI(
+    title="youtube_hate_detector API",
+    description="Toxic comment detection for YouTube-style moderation demos",
+    version="1.0.0",
+    lifespan=lifespan,
 )
+# CORS: only the listed localhost origins (ports 5173 and 8000) may call the API
+# from a browser.
 app.add_middleware(
     CORSMiddleware,
+    allow_origins=[
+        "http://localhost:5173",
+        "http://127.0.0.1:5173",
+        "http://localhost:8000",
+    ],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
 )
+# Routers are registered before the SPA catch-all route defined below.
+app.include_router(health.router)
+app.include_router(models.router)
+app.include_router(predict.router)
+app.include_router(videos.router)
+# Path prefixes treated as API/documentation routes by the SPA catch-all.
+_API_GET_PREFIXES = ("models", "model", "videos", "predict", "health", "docs", "redoc", "openapi")
+def _mount_frontend() -> None:
+    """Serve the built React UI from ``frontend/dist`` when that build exists.
+
+    Mounts ``/assets`` as static files and registers a catch-all GET route:
+    paths starting with an API prefix get a JSON 404, files that exist inside
+    the build directory are returned as-is, and every other path receives
+    ``index.html`` so client-side routes resolve. Does nothing when the
+    frontend has not been built.
+    """
+    # Local import: this module only imports ``FastAPI`` from fastapi at top level.
+    from fastapi import HTTPException
+    if not FRONTEND_DIST.is_dir():
+        return
+    dist_root = FRONTEND_DIST.resolve()
+    assets = FRONTEND_DIST / "assets"
+    if assets.is_dir():
+        app.mount("/assets", StaticFiles(directory=assets), name="assets")
+    @app.get("/{full_path:path}", include_in_schema=False)
+    async def spa_fallback(full_path: str):
+        if full_path.startswith(_API_GET_PREFIXES):
+            raise HTTPException(status_code=404, detail="Not found")
+        # Files the build places next to index.html (favicon, manifest, ...)
+        # must be returned themselves rather than replaced by the HTML shell.
+        # The containment check rejects ".." or absolute-path escapes.
+        try:
+            candidate = (dist_root / full_path).resolve()
+        except (OSError, ValueError):
+            candidate = None
+        if candidate is not None and candidate.is_relative_to(dist_root) and candidate.is_file():
+            return FileResponse(candidate)
+        index = dist_root / "index.html"
+        if index.exists():
+            return FileResponse(index)
+        raise HTTPException(status_code=404, detail="Not found")
+_mount_frontend()

src/api/routes/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """API route modules."""

src/api/routes/health.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import time
+from fastapi import APIRouter
+from src.api.state import get_state
+router = APIRouter(tags=["Health"])
+@router.get("/health")
+async def health_check():
+    """Report liveness, the active model name and seconds since startup."""
+    state = get_state()
+    started_at = state["startup_time"]
+    # Uptime stays at 0.0 until the lifespan hook has recorded a start time.
+    uptime = round(time.time() - started_at, 1) if started_at else 0.0
+    status = "ok" if state["service"] else "loading"
+    return {
+        "status": status,
+        "model": state["model_name"],
+        "uptime_s": uptime,
+        "project": "youtube_hate_detector",
+    }

src/api/routes/models.py ADDED Viewed

	@@ -0,0 +1,78 @@

+import time
+from fastapi import APIRouter, HTTPException
+from src.api.schemas import ModelInfo, ModelStatusEntry, ModelsStatusResponse
+from src.api.services import get_service
+from src.api.state import PROJECT_ROOT, get_state
+from src.service.model_service import AVAILABLE_MODELS, ModelService, check_model_availability
+router = APIRouter(tags=["Model"])
+@router.get("/model-info", response_model=ModelInfo)
+async def get_model_info():
+    """Describe the active model together with uptime and the prediction counter."""
+    # get_service() raises 503 when no model is loaded, before state is read.
+    details = get_service().get_model_info()
+    state = get_state()
+    started_at = state["startup_time"]
+    if started_at:
+        uptime = round(time.time() - started_at, 1)
+    else:
+        uptime = 0.0
+    return ModelInfo(
+        name=state["model_name"],
+        type=details.get("type", "unknown"),
+        description=details.get("description", ""),
+        speed=details.get("speed", ""),
+        accuracy=details.get("accuracy", ""),
+        uptime_s=uptime,
+        predictions_served=state.get("predictions_served", 0),
+    )
+@router.get("/models/status", response_model=ModelsStatusResponse)
+async def models_status():
+    """List every catalogued model with its availability and the active model name."""
+    def _entry(name: str, cfg: dict) -> ModelStatusEntry:
+        # One availability probe per catalogued model.
+        is_available, why = check_model_availability(name, PROJECT_ROOT)
+        return ModelStatusEntry(
+            name=name,
+            available=is_available,
+            reason=why,
+            type=cfg.get("type", "unknown"),
+        )
+    entries: list[ModelStatusEntry] = [_entry(name, cfg) for name, cfg in AVAILABLE_MODELS.items()]
+    active = get_state()["model_name"] or ""
+    return ModelsStatusResponse(models=entries, active=active)
+@router.get("/models")
+async def list_models():
+    """Return every model name in ``AVAILABLE_MODELS`` plus the active model name."""
+    state = get_state()
+    return {"available": list(AVAILABLE_MODELS.keys()), "active": state["model_name"]}
+@router.put("/model/{model_name}")
+async def switch_model(model_name: str):
+    """Make ``model_name`` the active model after it loads and warms up.
+
+    Raises:
+        HTTPException: 400 when the name is not in ``AVAILABLE_MODELS``, when
+            ``check_model_availability`` rejects it, or when loading or the
+            warm-up prediction fails. The previously active model stays in
+            place in every failure case.
+    """
+    if model_name not in AVAILABLE_MODELS:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Model '{model_name}' not available. Options: {list(AVAILABLE_MODELS.keys())}",
+        )
+    available, reason = check_model_availability(model_name, PROJECT_ROOT)
+    if not available:
+        raise HTTPException(status_code=400, detail=reason or "Model unavailable")
+    # Build and warm up the candidate before touching shared state, so a failed
+    # switch needs no rollback.
+    try:
+        new_service = ModelService(model_name, PROJECT_ROOT)
+        warmup = new_service.predict("warmup")
+    except Exception as exc:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Failed to load model '{model_name}': {exc}",
+        ) from exc
+    if warmup.get("error"):
+        raise HTTPException(status_code=400, detail=str(warmup["error"]))
+    state = get_state()
+    state["service"] = new_service
+    state["model_name"] = model_name
+    return {"message": f"Active model set to '{model_name}'", "model": model_name}

src/api/routes/predict.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import time
+from fastapi import APIRouter, HTTPException
+from src.api.schemas import (
+    BatchPredictRequest,
+    BatchPredictResponse,
+    PredictRequest,
+    PredictResponse,
+    VideoRequest,
+    VideoResponse,
+)
+from src.api.services import predict_single, to_predict_response
+from src.api.state import get_state
+from src.api.youtube import CommentsFetchError, fetch_comments
+router = APIRouter(tags=["Prediction"])
+@router.post("/predict", response_model=PredictResponse)
+async def predict(request: PredictRequest):
+    """Classify one comment using the request's threshold; delegates to ``predict_single``."""
+    return predict_single(request.text, request.threshold)
+@router.post("/predict-batch", response_model=BatchPredictResponse)
+async def predict_batch(request: BatchPredictRequest):
+    """Classify each non-blank text in the request and summarise the results."""
+    started = time.perf_counter()
+    # Blank entries are dropped; the rest are scored in their stripped form.
+    cleaned = (raw_text.strip() for raw_text in request.texts)
+    results: list[PredictResponse] = [
+        predict_single(text, request.threshold) for text in cleaned if text
+    ]
+    elapsed_ms = round((time.perf_counter() - started) * 1000, 2)
+    flagged = len([item for item in results if item.is_toxic])
+    return BatchPredictResponse(
+        results=results,
+        total=len(results),
+        toxic_count=flagged,
+        latency_ms=elapsed_ms,
+    )
+@router.post("/predict-video", response_model=VideoResponse)
+async def predict_video(request: VideoRequest):
+    """Fetch comments for a YouTube URL and classify each non-blank one.
+
+    Raises:
+        HTTPException: 503 when no model is loaded, 422 when comments cannot
+            be fetched, 404 when the fetch returns no comments, and 500 when
+            the model reports an error for a comment.
+    """
+    state = get_state()
+    service = state["service"]
+    # Check the model first so an unready API does not spend a YouTube request
+    # on comments it cannot score.
+    if service is None:
+        raise HTTPException(status_code=503, detail="Model not loaded")
+    try:
+        comments, source = fetch_comments(request.url, request.max_comments)
+    except CommentsFetchError as exc:
+        raise HTTPException(status_code=422, detail=str(exc)) from exc
+    except Exception as exc:
+        raise HTTPException(status_code=422, detail=f"Failed to fetch comments: {exc}") from exc
+    if not comments:
+        raise HTTPException(status_code=404, detail="No comments found for this video")
+    results: list[PredictResponse] = []
+    for text in comments:
+        if not text.strip():
+            continue
+        raw = service.predict(text)
+        # Same contract as predict_single: an "error" key means the model
+        # failed, and such a result carries no usable probability.
+        if "error" in raw:
+            raise HTTPException(status_code=500, detail=raw["error"])
+        # Per-comment latency is not measured on this path, hence 0.0.
+        results.append(to_predict_response(text, raw, 0.0, request.threshold))
+    toxic_count = sum(1 for r in results if r.is_toxic)
+    state["predictions_served"] = state.get("predictions_served", 0) + len(results)
+    return VideoResponse(
+        video_url=request.url,
+        total_fetched=len(results),
+        toxic_count=toxic_count,
+        toxic_rate=round(toxic_count / len(results), 4) if results else 0.0,
+        results=results,
+        source=source,
+    )

src/api/routes/videos.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from fastapi import APIRouter
+from src.api.schemas import SuggestedVideo, SuggestedVideosResponse
+from src.api.youtube import fetch_video_metadata, load_suggested_config
+router = APIRouter(tags=["Videos"])
+@router.get("/videos/suggested", response_model=SuggestedVideosResponse)
+async def suggested_videos():
+    """Return metadata for the configured suggested videos and the comment limit."""
+    cfg = load_suggested_config()
+    # Falls back to 50 when the config has no "max_comments" key.
+    max_comments = int(cfg.get("max_comments", 50))
+    entries = cfg.get("videos") or []
+    # Each entry is either a mapping with an "id" key or a bare id value.
+    ids = [e["id"] if isinstance(e, dict) else str(e) for e in entries]
+    # NOTE(review): presumably returns dicts whose keys match SuggestedVideo's
+    # fields; confirm against fetch_video_metadata.
+    meta = fetch_video_metadata(ids)
+    videos = [SuggestedVideo(**m) for m in meta]
+    return SuggestedVideosResponse(videos=videos, max_comments=max_comments)

src/api/schemas.py ADDED Viewed

	@@ -0,0 +1,93 @@

+"""Pydantic request/response models for the API."""
+from typing import Literal, Optional
+from pydantic import BaseModel, Field, field_validator
+class PredictRequest(BaseModel):
+    """Request body for classifying a single comment."""
+    # Comment text, 1-5000 characters; the validator below also strips it.
+    text: str = Field(..., min_length=1, max_length=5000)
+    # Probability at or above which the comment is reported as toxic.
+    threshold: float = Field(0.5, ge=0.0, le=1.0)
+    @field_validator("text")
+    @classmethod
+    def text_not_empty(cls, v: str) -> str:
+        """Reject whitespace-only text and return the stripped value."""
+        if not v.strip():
+            raise ValueError("Text cannot be empty")
+        return v.strip()
+class PredictResponse(BaseModel):
+    """Classification result for one comment."""
+    text: str
+    is_toxic: bool
+    # Toxicity probability, constrained to the range 0.0-1.0.
+    probability: float = Field(..., ge=0.0, le=1.0)
+    # Text form of the verdict: "Toxic" or "Safe".
+    status: Literal["Safe", "Toxic"]
+    mode: Literal["binary"] = "binary"
+    labels: list[str]
+    model_used: str
+    latency_ms: float
+class BatchPredictRequest(BaseModel):
+    """Request body for classifying several comments with one threshold."""
+    # Between 1 and 100 texts per request.
+    texts: list[str] = Field(..., min_length=1, max_length=100)
+    threshold: float = Field(0.5, ge=0.0, le=1.0)
+class BatchPredictResponse(BaseModel):
+    """Per-comment results plus totals and overall latency for a batch request."""
+    results: list[PredictResponse]
+    total: int
+    toxic_count: int
+    latency_ms: float
+class VideoRequest(BaseModel):
+    """Request body for analysing the comments of a YouTube video."""
+    url: str
+    # Upper bound on comments to fetch, 1-200.
+    max_comments: int = Field(50, ge=1, le=200)
+    threshold: float = Field(0.5, ge=0.0, le=1.0)
+class VideoResponse(BaseModel):
+    """Aggregate and per-comment results for a video analysis."""
+    video_url: str
+    total_fetched: int
+    toxic_count: int
+    toxic_rate: float
+    results: list[PredictResponse]
+    # Origin of the analysed comments; defaults to "demo".
+    source: Literal["youtube", "demo", "unavailable"] = "demo"
+    # NOTE(review): reason and error are optional free-text fields; confirm
+    # which callers populate them.
+    reason: Optional[str] = None
+    error: Optional[str] = None
+class ModelStatusEntry(BaseModel):
+    """Availability record for one catalogued model."""
+    name: str
+    available: bool
+    # Optional explanation accompanying the availability flag.
+    reason: Optional[str] = None
+    type: str = "unknown"
+class ModelsStatusResponse(BaseModel):
+    """Availability of every catalogued model plus the active model name."""
+    models: list[ModelStatusEntry]
+    active: str
+class ModelInfo(BaseModel):
+    """Description of the active model with uptime and prediction counter."""
+    name: str
+    type: str
+    description: str
+    speed: str
+    accuracy: str
+    uptime_s: float
+    predictions_served: int
+class SuggestedVideo(BaseModel):
+    """Display metadata for one suggested video."""
+    id: str
+    title: str
+    channel_title: str
+    thumbnail_url: str
+    watch_url: str
+    embeddable: bool = True
+class SuggestedVideosResponse(BaseModel):
+    """Suggested videos together with the configured comment limit."""
+    videos: list[SuggestedVideo]
+    max_comments: int

src/api/services.py ADDED Viewed

	@@ -0,0 +1,47 @@

+"""Prediction helpers used by route handlers."""
+from __future__ import annotations
+import time
+from fastapi import HTTPException
+from src.api.schemas import PredictResponse
+from src.api.state import get_state
+from src.service.model_service import ModelService
+def get_service() -> ModelService:
+    """Return the loaded model service, or raise HTTP 503 when none is loaded."""
+    service = get_state()["service"]
+    if service is not None:
+        return service
+    raise HTTPException(status_code=503, detail="Model not loaded. Try again shortly.")
+def to_predict_response(text: str, result: dict, latency_ms: float, threshold: float) -> PredictResponse:
+    """Turn a raw model result into a ``PredictResponse`` using ``threshold``.
+
+    A comment counts as toxic when its probability is at or above the
+    threshold; labels are reported only for toxic comments.
+    """
+    probability = float(result["probability"])
+    flagged = probability >= threshold
+    if flagged:
+        verdict = "Toxic"
+        labels = result.get("labels", [])
+    else:
+        verdict = "Safe"
+        labels = []
+    return PredictResponse(
+        text=text,
+        is_toxic=flagged,
+        probability=round(probability, 4),
+        status=verdict,
+        mode="binary",
+        labels=labels,
+        model_used=result.get("model_used", ""),
+        latency_ms=latency_ms,
+    )
+def predict_single(text: str, threshold: float) -> PredictResponse:
+    """Classify one text with the active model and time the call.
+
+    Raises:
+        HTTPException: 503 (via ``get_service``) when no model is loaded, and
+            500 when the model result contains an "error" key.
+    """
+    state = get_state()
+    t0 = time.perf_counter()
+    result = get_service().predict(text)
+    ms = round((time.perf_counter() - t0) * 1000, 2)
+    if "error" in result:
+        raise HTTPException(status_code=500, detail=result["error"])
+    # Only successful predictions are counted.
+    state["predictions_served"] = state.get("predictions_served", 0) + 1
+    return to_predict_response(text, result, ms, threshold)

src/api/state.py ADDED Viewed

	@@ -0,0 +1,16 @@

+"""Application state shared across routes."""
+from pathlib import Path
+PROJECT_ROOT = Path(__file__).resolve().parents[2]
+# Module-level state dictionary handed out by get_state().
+_state: dict = {
+    "service": None,
+    "model_name": None,
+    "startup_time": None,
+    "predictions_served": 0,
+}
+def get_state() -> dict:
+    """Return the shared state dictionary itself (not a copy), so writes are visible to all callers."""
+    return _state

src/api/youtube.py ADDED Viewed

	@@ -0,0 +1,187 @@

+"""YouTube comment fetch and suggested-video metadata."""
+from __future__ import annotations
+import os
+import re
+from pathlib import Path
+from typing import Any
+import yaml
+from src.utils.logger import get_logger
+logger = get_logger(__name__)
+PROJECT_ROOT = Path(__file__).resolve().parents[2]
+SUGGESTED_CONFIG = PROJECT_ROOT / "configs" / "suggested_videos.yaml"
+# Regexes that capture the 11-character video id from supported YouTube URL
+# forms. The watch pattern accepts "v" anywhere in the query string (e.g.
+# "watch?feature=share&v=ID"), not only as the first parameter.
+_VIDEO_ID_PATTERNS = (
+    r"youtube\.com/watch\?(?:[^#\s]*&)?v=([a-zA-Z0-9_-]{11})",
+    r"youtu\.be/([a-zA-Z0-9_-]{11})",
+    r"youtube\.com/embed/([a-zA-Z0-9_-]{11})",
+    r"youtube\.com/shorts/([a-zA-Z0-9_-]{11})",
+    r"youtube\.com/live/([a-zA-Z0-9_-]{11})",
+)
+class CommentsFetchError(Exception):
+    """Raised when comments cannot be fetched and demo fallback must not be used.
+
+    The message is written to be shown to the API client.
+    """
+def extract_video_id(url: str) -> str | None:
+    """Return the video id captured by the first matching URL pattern, else ``None``."""
+    attempts = (re.search(pattern, url) for pattern in _VIDEO_ID_PATTERNS)
+    # Patterns are tried in declaration order; the first hit wins.
+    hit = next((found for found in attempts if found), None)
+    if hit is None:
+        return None
+    return hit.group(1)
+def load_suggested_config() -> dict[str, Any]:
+    """Load the suggested-videos YAML config, with a built-in default if the file is absent.
+
+    An existing but empty file yields an empty dict.
+    """
+    if SUGGESTED_CONFIG.exists():
+        with SUGGESTED_CONFIG.open(encoding="utf-8") as handle:
+            loaded = yaml.safe_load(handle)
+        return loaded or {}
+    return {"max_comments": 50, "videos": [{"id": "jNQXAC9IVRw"}]}
+def _parse_youtube_error(exc: Exception) -> str:
+    """Map a YouTube API exception to a short message for the client.
+
+    Recognises disabled comments and quota errors; any other exception is
+    returned as its plain string form.
+    """
+    err_text = str(exc)
+    # Cheap text checks first: they work without the Google client installed.
+    if "commentsDisabled" in err_text:
+        return "Comments are disabled on this video"
+    if "disabled comments" in err_text.lower():
+        return "Comments are disabled on this video"
+    if "quota" in err_text.lower():
+        return "YouTube API quota exceeded"
+    try:
+        from googleapiclient.errors import HttpError
+        if isinstance(exc, HttpError):
+            # Structured details may carry the reason when the message text does not.
+            for detail in getattr(exc, "error_details", []) or []:
+                reason = detail.get("reason") if isinstance(detail, dict) else None
+                if reason == "commentsDisabled":
+                    return "Comments are disabled on this video"
+    except ImportError:
+        # Google client not installed: fall through to the raw message.
+        pass
+    return err_text
+def fetch_comments(url: str, max_comments: int) -> tuple[list[str], str]:
+    """Fetch comments for the video at *url*, returning ``(comments, source)``.
+
+    With a non-blank ``YOUTUBE_API_KEY`` in the environment the request goes
+    through ``_fetch_via_api``; otherwise canned demo comments are returned
+    with source ``"demo"``.
+    """
+    key = os.getenv("YOUTUBE_API_KEY", "").strip()
+    # "unknown" is the sentinel _fetch_via_api checks for an unparseable URL.
+    vid = extract_video_id(url) or "unknown"
+    if not key:
+        return _demo_comments(vid, max_comments), "demo"
+    return _fetch_via_api(url, key, max_comments, vid)
+def _fetch_via_api(
+    url: str, api_key: str, max_comments: int, video_id: str
+) -> tuple[list[str], str]:
+    """Fetch top-level comments for *video_id* through the YouTube Data API.
+
+    Args:
+        url: Original URL; used only in the error message for a bad id.
+        api_key: YouTube Data API key.
+        max_comments: Upper bound on the number of comments returned.
+        video_id: Parsed video id, or ``"unknown"`` if parsing failed.
+
+    Returns:
+        ``(comments, "youtube")`` with at most *max_comments* plain-text comments.
+
+    Raises:
+        CommentsFetchError: If the id is unknown, the API call fails, or no
+            comments come back.
+    """
+    from googleapiclient.discovery import build
+    if video_id == "unknown":
+        raise CommentsFetchError(f"Could not parse video id from: {url}")
+    youtube = build("youtube", "v3", developerKey=api_key)
+    comments: list[str] = []
+    page_token = None
+    try:
+        while len(comments) < max_comments:
+            response = (
+                youtube.commentThreads()
+                .list(
+                    part="snippet",
+                    videoId=video_id,
+                    # Ask only for what is still missing, capped at 100 per page.
+                    maxResults=min(100, max_comments - len(comments)),
+                    pageToken=page_token,
+                    textFormat="plainText",
+                )
+                .execute()
+            )
+            comments.extend(
+                item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
+                for item in response.get("items", [])
+            )
+            page_token = response.get("nextPageToken")
+            if not page_token:
+                break
+    except Exception as exc:
+        message = _parse_youtube_error(exc)
+        logger.warning("YouTube API failed for %s: %s", video_id, message)
+        raise CommentsFetchError(message) from exc
+    finally:
+        # Release the HTTP connections held by the client on every exit path.
+        youtube.close()
+    if not comments:
+        raise CommentsFetchError("No comments found for this video")
+    logger.info("YouTube API: fetched %s comments for %s", len(comments), video_id)
+    return comments[:max_comments], "youtube"
+def fetch_video_metadata(video_ids: list[str]) -> list[dict[str, Any]]:
+    """Return display metadata for each id in *video_ids*, in the same order.
+
+    Best-effort: without an API key, on any API failure, or for ids missing
+    from the response, placeholder metadata is returned instead of raising.
+    """
+    api_key = os.getenv("YOUTUBE_API_KEY", "").strip()
+    if not api_key or not video_ids:
+        return [_placeholder_meta(vid) for vid in video_ids]
+    try:
+        from googleapiclient.discovery import build
+        youtube = build("youtube", "v3", developerKey=api_key)
+        try:
+            response = (
+                youtube.videos()
+                .list(part="snippet,status", id=",".join(video_ids))
+                .execute()
+            )
+        finally:
+            # Release the client's HTTP connections even when the request raises.
+            youtube.close()
+        by_id: dict[str, dict[str, Any]] = {}
+        for item in response.get("items", []):
+            vid = item["id"]
+            snip = item["snippet"]
+            status = item.get("status", {})
+            thumbs = snip.get("thumbnails", {})
+            # Prefer the medium thumbnail, then the default one.
+            thumb = thumbs.get("medium") or thumbs.get("default") or {}
+            embeddable = status.get("embeddable", True)
+            by_id[vid] = {
+                "id": vid,
+                "title": snip.get("title", vid),
+                "channel_title": snip.get("channelTitle", "Unknown"),
+                "thumbnail_url": thumb.get("url", f"https://i.ytimg.com/vi/{vid}/mqdefault.jpg"),
+                "watch_url": f"https://www.youtube.com/watch?v={vid}",
+                "embeddable": bool(embeddable),
+            }
+        # Ids absent from the response still get an entry, so the output
+        # lines up with the input list.
+        return [by_id.get(vid, _placeholder_meta(vid)) for vid in video_ids]
+    except Exception as exc:
+        logger.warning("YouTube metadata failed: %s", exc)
+        return [_placeholder_meta(vid) for vid in video_ids]
+def _placeholder_meta(video_id: str) -> dict[str, Any]:
+    """Build stand-in metadata for *video_id* derived from the id alone."""
+    thumbnail = f"https://i.ytimg.com/vi/{video_id}/mqdefault.jpg"
+    watch = f"https://www.youtube.com/watch?v={video_id}"
+    return dict(
+        id=video_id,
+        title=f"Video {video_id}",
+        channel_title="YouTube",
+        thumbnail_url=thumbnail,
+        watch_url=watch,
+        embeddable=True,
+    )
+def _demo_comments(video_id: str, max_comments: int) -> list[str]:
+    """Return canned sample comments, shuffled deterministically per *video_id*.
+
+    The shuffle is seeded with the id, so one id always yields the same order.
+    The result is the ``[:max_comments]`` slice of the shuffled list.
+    """
+    import random
+    logger.info("Using demo comments for %s (no YOUTUBE_API_KEY)", video_id)
+    pool = [
+        "This video is really informative, thanks for sharing!",
+        "You are all stupid idiots, get out of here!",
+        "Great content, I learned a lot from this.",
+        "These people should be eliminated from society.",
+        "I agree with the presenter's point of view.",
+        "What a bunch of racist criminals!",
+        "Thank you for this analysis, very helpful.",
+        "Kill them all, they don't deserve to live.",
+        "Interesting perspective on the topic.",
+        "This is absolute bullshit propaganda!",
+        "I think we need to look at both sides.",
+        "Well researched video, good job.",
+        "Go back to where you came from!",
+        "The data presented here is compelling.",
+    ]
+    # The list is local, so shuffling it in place is safe.
+    random.Random(video_id).shuffle(pool)
+    return pool[:max_comments]

src/app/app.py DELETED Viewed

@@ -1,764 +0,0 @@
-"""
-src/app/app.py
-SignalMod app — YouTube-style hate speech detection.
-Run: streamlit run src/app/app.py
-"""
-import html
-import sys
-import random
-import datetime
-from pathlib import Path
-import streamlit as st
-import pandas as pd
-from transformers.utils import logging
-logging.set_verbosity_error()
-# ── Paths ─────────────────────────────────────────────────────────────────────
-# Two directory levels above this file's folder; put first on sys.path so the
-# absolute ``src.*`` import below can resolve.
-PROJECT_ROOT = Path(__file__).resolve().parents[2]
-sys.path.insert(0, str(PROJECT_ROOT))
-try:
-    from src.service.model_service import ModelService, AVAILABLE_MODELS
-except ImportError:
-    # Fallback: add this file's grandparent directory and import without the
-    # ``src.`` prefix.
-    sys.path.insert(0, str(Path(__file__).parent.parent))
-    from service.model_service import ModelService, AVAILABLE_MODELS
-# ── Config ────────────────────────────────────────────────────────────────────
-st.set_page_config(
-    page_title="SignalMod",
-    page_icon="🎬",
-    layout="wide",
-    initial_sidebar_state="expanded",
-)
-# ── CSS ───────────────────────────────────────────────────────────────────────
-# Note: we do NOT hide the whole header, so the sidebar toggle button is preserved.
-# Only Streamlit's hamburger menu and footer are hidden.
-st.markdown("""
-<style>
-@import url('https://fonts.googleapis.com/css2?family=YouTube+Sans:wght@400;600;700&display=swap');
-/* ── Ocultar solo elementos de branding, NO el header completo ── */
-#MainMenu { visibility: hidden; }
-footer    { visibility: hidden; }
-/* ── Fondo de la app: blanco limpio ── */
-.stApp { background: #ffffff; }
-/* ── Sidebar oscuro (como YouTube) ── */
-section[data-testid="stSidebar"] {
-    background-color: #0f0f0f !important;
-}
-section[data-testid="stSidebar"] > div {
-    background-color: #0f0f0f !important;
-}
-/* Texto del sidebar en blanco */
-section[data-testid="stSidebar"] p,
-section[data-testid="stSidebar"] span,
-section[data-testid="stSidebar"] label,
-section[data-testid="stSidebar"] div {
-    color: #ffffff !important;
-}
-/* Botones del sidebar */
-section[data-testid="stSidebar"] .stButton button {
-    background: transparent !important;
-    color: #e0e0e0 !important;
-    border: none !important;
-    text-align: left !important;
-    justify-content: flex-start !important;
-    border-radius: 10px !important;
-    padding: 0.5rem 0.75rem !important;
-    font-size: 0.9rem !important;
-    font-weight: 400 !important;
-    width: 100% !important;
-}
-section[data-testid="stSidebar"] .stButton button:hover {
-    background: rgba(255,255,255,0.1) !important;
-    color: #ffffff !important;
-}
-/* Botón activo en el sidebar */
-section[data-testid="stSidebar"] .stButton button[data-active="true"] {
-    background: rgba(255,255,255,0.15) !important;
-    color: #ffffff !important;
-    font-weight: 600 !important;
-}
-/* Divider del sidebar */
-section[data-testid="stSidebar"] hr {
-    border-color: rgba(255,255,255,0.15) !important;
-}
-/* Badge de modelo activo en sidebar */
-.sidebar-model-info {
-    background: rgba(255,255,255,0.08);
-    border-radius: 8px;
-    padding: 8px 12px;
-    margin: 8px 0;
-    font-size: 0.75rem;
-    color: #aaaaaa;
-}
-.sidebar-model-info strong { color: #ffffff; }
-/* ── Área principal: fondo blanco, texto oscuro ── */
-.main-area { background: #ffffff; }
-/* ── Video thumbnail ── */
-.video-thumb {
-    background: linear-gradient(135deg, #0d0d1a 0%, #1a0a2e 50%, #0d1a1a 100%);
-    border-radius: 12px;
-    height: 340px;
-    display: flex;
-    align-items: center;
-    justify-content: center;
-}
-.play-btn {
-    width: 72px; height: 72px;
-    background: rgba(255,255,255,0.9);
-    border-radius: 50%;
-    display: flex; align-items: center; justify-content: center;
-    font-size: 2rem; cursor: pointer;
-    box-shadow: 0 4px 20px rgba(0,0,0,0.4);
-}
-/* ── Títulos de video ── */
-.video-title {
-    font-size: 1.15rem; font-weight: 700;
-    color: #0f0f0f; margin: 0.75rem 0 0.3rem;
-    line-height: 1.4;
-}
-.video-meta { font-size: 0.82rem; color: #606060; }
-.channel-name { font-weight: 600; font-size: 0.9rem; color: #0f0f0f; }
-/* ── Badges ── */
-.badge {
-    display: inline-block;
-    padding: 2px 9px; border-radius: 12px;
-    font-size: 0.72rem; font-weight: 700;
-    margin-left: 6px; vertical-align: middle;
-}
-.badge-toxic { background: #cc0000; color: #ffffff; }
-.badge-safe  { background: #00c853; color: #ffffff; }
-/* ── Comentarios ── */
-.comment-wrap {
-    display: flex; gap: 12px;
-    padding: 12px 0; border-bottom: 1px solid #f0f0f0;
-}
-.c-avatar {
-    width: 36px; height: 36px; min-width: 36px;
-    border-radius: 50%; background: #cc0000;
-    display: flex; align-items: center; justify-content: center;
-    color: #ffffff; font-weight: 700; font-size: 0.85rem;
-    flex-shrink: 0;
-}
-.c-avatar.safe { background: #606060; }
-.c-body { flex: 1; min-width: 0; }
-.c-header { display: flex; align-items: center; flex-wrap: wrap; gap: 4px; }
-.c-user { font-size: 0.84rem; font-weight: 600; color: #0f0f0f; }
-.c-time { font-size: 0.75rem; color: #909090; margin-left: 4px; }
-.c-text { font-size: 0.88rem; color: #2d2d2d; margin-top: 4px; line-height: 1.55; }
-.c-text.toxic {
-    background: #fff5f5;
-    border-left: 3px solid #cc0000;
-    padding: 6px 10px; border-radius: 0 6px 6px 0;
-    margin-top: 6px;
-}
-.c-flagged { font-size: 0.77rem; color: #cc0000; font-weight: 500; margin-top: 4px; }
-/* ── Toxicity bar inline ── */
-.tox-row {
-    display: flex; align-items: center; gap: 8px;
-    font-size: 0.8rem; color: #606060; margin-top: 6px; flex-wrap: wrap;
-}
-.tox-bar-bg {
-    flex: 1; max-width: 120px;
-    background: #e5e5e5; border-radius: 4px; height: 6px;
-}
-.tox-bar-fill { height: 6px; border-radius: 4px; }
-/* ── Sugeridos ── */
-.sug-card {
-    display: flex; gap: 8px; margin-bottom: 10px;
-    cursor: pointer;
-}
-.sug-thumb {
-    width: 120px; min-width: 120px; height: 68px;
-    background: #1a1a2e; border-radius: 6px;
-    display: flex; align-items: center; justify-content: center;
-    font-size: 1.4rem; flex-shrink: 0;
-}
-.sug-title  { font-size: 0.82rem; font-weight: 600; color: #0f0f0f; line-height: 1.3; }
-.sug-ch     { font-size: 0.75rem; color: #606060; margin-top: 2px; }
-.sug-meta   { font-size: 0.72rem; color: #909090; }
-/* ── Section header ── */
-.sec-title {
-    font-size: 1rem; font-weight: 700; color: #0f0f0f;
-    margin: 1.25rem 0 0.75rem; padding-bottom: 0.5rem;
-    border-bottom: 1px solid #e5e5e5;
-}
-/* ── Modal body fixes ── */
-[data-testid="stDialog"] { background: #ffffff; }
-/* ── Hub cards ── */
-.hub-card {
-    background: #ffffff; border: 1px solid #e5e5e5;
-    border-radius: 12px; padding: 1rem;
-}
-.hub-kpi-label { font-size: 0.72rem; color: #606060; text-transform: uppercase;
-                  letter-spacing: 0.5px; margin-bottom: 4px; }
-.hub-kpi-val   { font-size: 1.8rem; font-weight: 700; color: #0f0f0f; }
-/* ── Model cards (settings) ── */
-.model-card {
-    background: #ffffff; border: 1.5px solid #e5e5e5;
-    border-radius: 10px; padding: 14px 16px; margin-bottom: 8px;
-}
-.model-card.active {
-    border-color: #cc0000; background: #fff5f5;
-}
-.model-card-name { font-size: 0.95rem; font-weight: 600; color: #0f0f0f; }
-.model-card-desc { font-size: 0.8rem; color: #606060; margin-top: 3px; }
-.model-pill {
-    display: inline-block; background: #f0f0f0; color: #333;
-    border-radius: 6px; padding: 2px 8px; font-size: 0.73rem; margin-right: 4px;
-}
-</style>
-""", unsafe_allow_html=True)
-# ── Session state init ────────────────────────────────────────────────────────
-def _init_state():
-    """Seed ``st.session_state`` with defaults for any key not already present."""
-    defaults = {
-        "page"          : "Home",
-        "selected_model": list(AVAILABLE_MODELS.keys())[0],
-        "threshold"     : 0.5,
-        "pending_modal" : None,   # dict holding the comment awaiting a decision
-        "comments": [
-            {"user": "user_prime", "initial": "U",
-             "text": "Excelente video, muy informativo!", "time": "1 h",
-             "is_toxic": False, "probability": 0.04, "labels": []},
-            {"user": "troll_master", "initial": "T",
-             "text": "Esto es una basura completa", "time": "30 min",
-             "is_toxic": True, "probability": 0.91, "labels": ["Insulto","Agresividad"]},
-            {"user": "curious_viewer", "initial": "C",
-             "text": "¿Alguien puede explicar esto mejor?", "time": "15 min",
-             "is_toxic": False, "probability": 0.07, "labels": []},
-        ],
-        "hub_history": [
-            {"Usuario": "@user_992",   "Comentario": '"No puedo creer que seas tan..."', "Score": 0.94, "Acción": "🚫 Bloqueado"},
-            {"Usuario": "@alpha_mod",  "Comentario": '"Spam repetitivo de enlaces."',    "Score": 0.82, "Acción": "🚩 Revisión"},
-            {"Usuario": "@anon_404",   "Comentario": '"Discurso de odio en contexto."',  "Score": 0.98, "Acción": "📋 Archivado"},
-            {"Usuario": "@user_123",   "Comentario": '"¡Gran contenido, sigan!"',        "Score": 0.03, "Acción": "✅ Aprobado"},
-            {"Usuario": "@viewer_x",   "Comentario": '"Esta gente debería desaparecer."',"Score": 0.97, "Acción": "🚫 Bloqueado"},
-        ],
-    }
-    # Only fill missing keys so values set on earlier runs are kept.
-    for k, v in defaults.items():
-        if k not in st.session_state:
-            st.session_state[k] = v
-_init_state()
-# ── Model cache ───────────────────────────────────────────────────────────────
-@st.cache_resource(show_spinner="Cargando modelo...")
-def get_service(model_name: str) -> ModelService:
-    """Build a ``ModelService`` for *model_name*, wrapped in ``st.cache_resource``."""
-    return ModelService(model_name, PROJECT_ROOT)
-# ══════════════════════════════════════════════════════════════════════════════
-# SIDEBAR
-# ══════════════════════════════════════════════════════════════════════════════
-def render_sidebar():
-    """Render the sidebar: logo, page navigation buttons and an active-model summary."""
-    with st.sidebar:
-        # Logo
-        st.markdown(
-            "<div style='padding:0.5rem 0 0.25rem; font-size:1.3rem; font-weight:700;'>"
-            "🎬 <span style='color:#cc0000'>Signal</span>Mod</div>"
-            "<div style='font-size:0.65rem; color:#aaa; margin-bottom:1.2rem;'>"
-            "Signal within the Noise</div>",
-            unsafe_allow_html=True,
-        )
-        nav = {"Home": "🏠", "Moderator Hub": "📊", "Settings": "⚙️"}
-        for page, icon in nav.items():
-            label = f"{icon}  {page}"
-            clicked = st.button(label, key=f"nav_{page}", use_container_width=True)
-            if clicked:
-                # Switch page and rerun the script.
-                st.session_state.page = page
-                st.rerun()
-        st.divider()
-        # Active model info
-        model_short = st.session_state.selected_model.split("(")[0].strip()
-        tox_cnt     = sum(1 for c in st.session_state.comments if c["is_toxic"])
-        total_c     = len(st.session_state.comments)
-        st.markdown(
-            f"<div class='sidebar-model-info'>"
-            f"Modelo activo<br><strong>{html.escape(model_short)}</strong>"
-            f"<br><br>Comentarios: <strong>{total_c}</strong>"
-            f" · Tóxicos: <strong style='color:#cc0000'>{tox_cnt}</strong>"
-            f"</div>",
-            unsafe_allow_html=True,
-        )
-# ══════════════════════════════════════════════════════════════════════════════
-# MODAL — toxicity detected
-# ══════════════════════════════════════════════════════════════════════════════
-@st.dialog("⚠️  Aviso de Toxicidad Detectada")
-def show_toxicity_modal():
-    """
-    @st.dialog creates a native Streamlit modal window (1.32+).
-    When the decorated function is called, Streamlit renders the content
-    inside a modal overlay and pauses normal script execution.
-
-    Reads the pending comment from ``st.session_state.pending_modal`` and lets
-    the user either go back and edit it or publish it anyway.
-    """
-    data  = st.session_state.pending_modal
-    if not data:
-        st.rerun()
-        return
-    text  = data["text"]
-    prob  = data["probability"]
-    lbls  = data["labels"]
-    pct   = int(prob * 100)
-    color = "#cc0000" if pct >= 70 else "#ff6d00" if pct >= 40 else "#f5a623"
-    st.markdown(
-        "<div style='text-align:center; font-size:3rem; color:#cc0000'>⚠️</div>",
-        unsafe_allow_html=True,
-    )
-    st.markdown(
-        f"<div style='background:#f8f8f8; border-radius:8px; padding:12px 16px;"
-        f"font-style:italic; color:#333; text-align:center; margin:8px 0;'>"
-        f"&quot;{html.escape(text[:140])}{'...' if len(text)>140 else ''}&quot;</div>",
-        unsafe_allow_html=True,
-    )
-    # Toxicity bar
-    st.markdown(
-        f"<div style='display:flex; justify-content:space-between; "
-        f"font-size:0.82rem; color:#606060; margin-top:12px;'>"
-        f"<span>ÍNDICE DE TOXICIDAD</span>"
-        f"<span style='color:{color}; font-weight:700'>{pct}%</span></div>"
-        f"<div style='background:#e5e5e5; border-radius:4px; height:8px; margin-top:4px;'>"
-        f"<div style='width:{pct}%; background:{color}; height:8px; border-radius:4px;'></div>"
-        f"</div>",
-        unsafe_allow_html=True,
-    )
-    # Labels
-    if lbls:
-        tags = " ".join(
-            f"<span style='background:#ffe5e5; color:#cc0000; border-radius:14px;"
-            f"padding:3px 10px; font-size:0.76rem; font-weight:600; margin:3px;'>"
-            f"🚩 {html.escape(l)}</span>"
-            for l in lbls
-        )
-        st.markdown(f"<div style='margin-top:10px'>{tags}</div>", unsafe_allow_html=True)
-    st.markdown("<br>", unsafe_allow_html=True)
-    col1, col2 = st.columns(2)
-    with col1:
-        if st.button("✏️  Editar comentario", use_container_width=True, type="primary"):
-            # Drop the pending comment without publishing it.
-            st.session_state.pending_modal = None
-            st.rerun()
-    with col2:
-        if st.button("Publicar de todas maneras", use_container_width=True):
-            # Publish even though it is toxic
-            c = st.session_state.pending_modal
-            st.session_state.comments.append(c)
-            st.session_state.hub_history.insert(0, {
-                "Usuario"  : "@usuario",
-                "Comentario": f'"{c["text"][:45]}..."',
-                "Score"    : round(c["probability"], 2),
-                "Acción"   : "⚠️ Override usuario",
-            })
-            st.session_state.pending_modal = None
-            st.rerun()
-# ══════════════════════════════════════════════════════════════════════════════
-# HOME — YouTube-style interface
-# ══════════════════════════════════════════════════════════════════════════════
-def render_home():
-    """Render the Home page: mock video, comment box with live scoring, comment list, suggestions."""
-    # Show the modal if a comment is pending
-    if st.session_state.pending_modal:
-        show_toxicity_modal()
-    col_main, col_right = st.columns([2.8, 1], gap="large")
-    with col_main:
-        # Video
-        st.markdown(
-            "<div class='video-thumb'><div class='play-btn'>▶</div></div>",
-            unsafe_allow_html=True,
-        )
-        st.markdown(
-            "<div class='video-title'>AI Moderation Demo — Detección de Hate Speech en tiempo real</div>"
-            "<div class='video-meta'>15k vistas · 2 horas atrás</div>",
-            unsafe_allow_html=True,
-        )
-        row_ch, row_sub = st.columns([3, 1])
-        with row_ch:
-            st.markdown(
-                "<div style='display:flex; align-items:center; gap:10px; margin:10px 0;'>"
-                "<div style='width:36px; height:36px; border-radius:50%; background:#cc0000;"
-                "display:flex; align-items:center; justify-content:center; color:#fff;"
-                "font-weight:700;'>S</div>"
-                "<div><div class='channel-name'>SignalMod AI</div>"
-                "<div class='video-meta'>1.2M suscriptores</div></div></div>",
-                unsafe_allow_html=True,
-            )
-        st.divider()
-        # ── Comments ───────────────────────────────────────────────────────
-        tox_cnt = sum(1 for c in st.session_state.comments if c["is_toxic"])
-        st.markdown(
-            f"<div class='sec-title'>{len(st.session_state.comments)} Comentarios "
-            f"<span style='font-size:0.8rem; color:#cc0000;'>· {tox_cnt} detectados</span></div>",
-            unsafe_allow_html=True,
-        )
-        # New comment input
-        new_text = st.text_area(
-            "Escribe un comentario...",
-            height=80, label_visibility="collapsed",
-            key="comment_input",
-            placeholder="Escribe un comentario...",
-        )
-        # Real-time analysis (only when there is text)
-        analysis = None
-        if new_text.strip():
-            svc      = get_service(st.session_state.selected_model)
-            analysis = svc.predict(new_text)
-            pct      = int(analysis["probability"] * 100)
-            color    = "#cc0000" if pct >= 70 else "#f5a623" if pct >= 40 else "#00c853"
-            verdict  = "TÓXICO" if analysis["is_toxic"] else "SEGURO"
-            v_color  = "#cc0000" if analysis["is_toxic"] else "#00c853"
-            st.markdown(
-                f"<div class='tox-row'>"
-                f"<span>🔍 Analizando...</span>"
-                f"<span style='background:{v_color}; color:#fff; border-radius:10px;"
-                f"padding:1px 9px; font-size:0.72rem; font-weight:700;'>{verdict}</span>"
-                f"<span style='color:{color}; font-weight:600;'>Toxicidad: {pct}%</span>"
-                f"<div class='tox-bar-bg'>"
-                f"<div class='tox-bar-fill' style='width:{pct}%; background:{color};'></div>"
-                f"</div></div>",
-                unsafe_allow_html=True,
-            )
-        col_c, col_p = st.columns([1, 1])
-        with col_c:
-            if st.button("Cancelar", use_container_width=True):
-                st.rerun()
-        with col_p:
-            post = st.button("Comentar", type="primary", use_container_width=True)
-        # Handle submission
-        if post and new_text.strip():
-            if analysis is None:
-                svc      = get_service(st.session_state.selected_model)
-                analysis = svc.predict(new_text)
-            comment_obj = {
-                "user"       : "usuario",
-                "initial"    : "U",
-                "text"       : new_text.strip(),
-                "time"       : "ahora",
-                "is_toxic"   : analysis["is_toxic"],
-                "probability": analysis["probability"],
-                "labels"     : analysis["labels"],
-            }
-            if analysis["is_toxic"]:
-                # Store as pending and show the modal on the next render
-                st.session_state.pending_modal = comment_obj
-                st.rerun()
-            else:
-                # Publish directly
-                st.session_state.comments.append(comment_obj)
-                st.session_state.hub_history.insert(0, {
-                    "Usuario"   : "@usuario",
-                    "Comentario": f'"{new_text.strip()[:45]}{"..." if len(new_text)>45 else ""}"',
-                    "Score"     : round(analysis["probability"], 2),
-                    "Acción"    : "✅ Aprobado",
-                })
-                st.rerun()
-        # ── Comment list (newest first) ────────────────────────────────────
-        for c in reversed(st.session_state.comments):
-            is_tox   = c["is_toxic"]
-            pct      = int(c["probability"] * 100)
-            av_class = "c-avatar" if is_tox else "c-avatar safe"
-            badge    = (
-                "<span class='badge badge-toxic'>TÓXICO</span>" if is_tox
-                else "<span class='badge badge-safe'>SEGURO</span>"
-            )
-            text_class = "c-text toxic" if is_tox else "c-text"
-            flagged    = "<div class='c-flagged'>🚩 Flagged for review</div>" if is_tox else ""
-            # html.escape() guards against characters that would break the HTML
-            safe_text = html.escape(c["text"])
-            safe_user = html.escape(c["user"])
-            initial   = html.escape(c.get("initial", c["user"][0].upper()))
-            st.markdown(
-                f"<div class='comment-wrap'>"
-                f"  <div class='{av_class}'>{initial}</div>"
-                f"  <div class='c-body'>"
-                f"    <div class='c-header'>"
-                f"      <span class='c-user'>@{safe_user}</span>"
-                f"      <span class='c-time'>{c['time']}</span>"
-                f"      {badge}"
-                f"    </div>"
-                f"    <div class='{text_class}'>{safe_text}</div>"
-                f"    {flagged}"
-                f"  </div>"
-                f"</div>",
-                unsafe_allow_html=True,
-            )
-    # ── Right column ───────────────────────────────────────────────────────
-    with col_right:
-        st.markdown("**Sugeridos**")
-        suggested = [
-            ("🤖", "Understanding Transformer Models...", "Neural Systems", "89k · 1 día"),
-            ("🎓", "The Future of Content Moderation",   "Tech Ethics Pro", "1.4M · 2 sem"),
-            ("📡", "Signal vs Noise: SignalMod Deep Dive","SignalMod AI",    "250k · 3 días"),
-            ("💡", "Why AI Moderation is Harder Than...", "Ethics in Code",  "45k · 5 h"),
-            ("🔬", "Hate Speech Detection 2024",          "AI Research Lab", "12k · 1 sem"),
-        ]
-        for emoji, title, ch, meta in suggested:
-            st.markdown(
-                f"<div class='sug-card'>"
-                f"  <div class='sug-thumb'>{emoji}</div>"
-                f"  <div>"
-                f"    <div class='sug-title'>{html.escape(title)}</div>"
-                f"    <div class='sug-ch'>{html.escape(ch)}</div>"
-                f"    <div class='sug-meta'>{html.escape(meta)}</div>"
-                f"  </div>"
-                f"</div>",
-                unsafe_allow_html=True,
-            )
-# ══════════════════════════════════════════════════════════════════════════════
-# MODERATOR HUB
-# ══════════════════════════════════════════════════════════════════════════════
-def render_hub():
-    """Render the Moderator Hub page: config cards, KPI metrics, two charts and the history table."""
-    try:
-        import plotly.graph_objects as go
-    except ImportError:
-        st.error("Instala plotly: pip install plotly")
-        return
-    st.markdown("## 📊 Panel de Estadísticas")
-    # ── Configuration cards ─────────────────────────────────────────────────
-    model_short = st.session_state.selected_model.split("(")[0].strip()
-    c1, c2, c3 = st.columns(3)
-    for col, label, val in [
-        (c1, "MODEL ARCHITECTURE",  model_short),
-        (c2, "CONFIDENCE THRESHOLD", f"{st.session_state.threshold:.2f} Alpha"),
-        (c3, "LANGUAGE COVERAGE",    "English"),
-    ]:
-        with col:
-            st.markdown(
-                f"<div class='hub-card'>"
-                f"<div class='hub-kpi-label'>{label}</div>"
-                f"<div style='font-weight:600; font-size:0.95rem; color:#0f0f0f;'>"
-                f"{html.escape(str(val))}</div></div>",
-                unsafe_allow_html=True,
-            )
-    st.write("")
-    # ── KPIs ────────────────────────────────────────────────────────────────
-    # Hard-coded offsets (+100 / +5) are added on top of the session's counts.
-    total    = len(st.session_state.comments) + 100
-    tox_cnt  = sum(1 for c in st.session_state.comments if c["is_toxic"]) + 5
-    tox_rate = tox_cnt / total * 100
-    m1, m2, m3 = st.columns(3)
-    m1.metric("💬 Total comentarios", f"{total:,}", "+12%")
-    m2.metric("☠️ Tasa de toxicidad",  f"{tox_rate:.1f}%",
-              f"+0.8%", delta_color="inverse")
-    m3.metric("🎯 F1 Score",           "0.7579", "Stable")
-    st.divider()
-    # ── Charts ──────────────────────────────────────────────────────────────
-    gcol, pcol = st.columns([2.2, 1])
-    with gcol:
-        days = ["Lun","Mar","Mié","Jue","Vie","Sáb","Dom"]
-        # Bar heights are random placeholders; index 3 is forced to be the peak.
-        vals = [random.randint(30, 80) for _ in days]
-        vals[3] = max(vals) + 25
-        colors = ["#cc0000" if i == 3 else "#b3c6ff" for i in range(7)]
-        fig = go.Figure(go.Bar(x=days, y=vals, marker_color=colors, width=0.55))
-        fig.update_layout(
-            title="Tendencias de Toxicidad (7D)",
-            paper_bgcolor="#ffffff", plot_bgcolor="#ffffff",
-            margin=dict(l=20, r=20, t=40, b=20), height=260,
-            font=dict(size=11, color="#0f0f0f"),
-        )
-        fig.update_yaxes(showgrid=True, gridcolor="#f0f0f0", zeroline=False)
-        fig.update_xaxes(showgrid=False)
-        st.plotly_chart(fig, use_container_width=True)
-    with pcol:
-        # Category shares are fixed literals, not computed from the comments.
-        fig2 = go.Figure(go.Pie(
-            labels=["Hate Speech","Insulto","Agresividad"],
-            values=[45, 35, 20],
-            hole=0.58,
-            marker_colors=["#cc0000","#0f0f0f","#909090"],
-            textfont_size=11,
-        ))
-        fig2.update_layout(
-            title="Categorías",
-            paper_bgcolor="#ffffff",
-            margin=dict(l=10, r=10, t=40, b=10), height=260,
-            legend=dict(font=dict(size=10), orientation="v"),
-            font=dict(size=11, color="#0f0f0f"),
-        )
-        st.plotly_chart(fig2, use_container_width=True)
-    # ── History ─────────────────────────────────────────────────────────────
-    st.markdown("### Historial Reciente")
-    df = pd.DataFrame(st.session_state.hub_history)
-    if not df.empty:
-        st.dataframe(
-            df, use_container_width=True, hide_index=True,
-            column_config={
-                "Score": st.column_config.ProgressColumn(
-                    "Score", min_value=0, max_value=1, format="%.2f"
-                )
-            },
-        )
-# ══════════════════════════════════════════════════════════════════════════════
-# SETTINGS
-# ══════════════════════════════════════════════════════════════════════════════
-def render_settings():
-    st.markdown("## ⚙️ Ajustes")
-    # ── Selección de modelo ─────────────────────────────────────────────────
-    st.markdown("### 🤖 Modelo de detección",)
-    st.caption(
-        "Los modelos HuggingFace se descargan la primera vez (~300–600 MB). "
-        "Requieren: `pip install transformers torch sentencepiece`"
-    )
-    st.write("")
-    # Usamos st.radio para la selección — sin bugs de HTML
-    model_names = list(AVAILABLE_MODELS.keys())
-    current_idx = model_names.index(st.session_state.selected_model) \
-                  if st.session_state.selected_model in model_names else 0
-    chosen = st.radio(
-        "Seleccionar modelo",
-        model_names,
-        index=current_idx,
-        label_visibility="collapsed",
-    )
-    if chosen != st.session_state.selected_model:
-        st.session_state.selected_model = chosen
-        st.rerun()
-    # Ficha del modelo seleccionado
-    info = AVAILABLE_MODELS[st.session_state.selected_model]
-    st.markdown(
-        f"<div class='model-card active'>"
-        f"<div class='model-card-name'>{info['icon']}  {html.escape(st.session_state.selected_model)}</div>"
-        f"<div class='model-card-desc'>{html.escape(info['description'])}</div>"
-        f"<div style='margin-top:8px;'>"
-        f"<span class='model-pill'>⚡ {html.escape(info['speed'])}</span>"
-        f"<span class='model-pill'>🎯 {html.escape(info['accuracy'])}</span>"
-        f"<span class='model-pill'>📦 {html.escape(info['requires'])}</span>"
-        f"</div></div>",
-        unsafe_allow_html=True,
-    )
-    # Info sobre modelo fine-tuneado
-    if st.session_state.selected_model == "Modelo fine-tuneado (local)":
-        path = PROJECT_ROOT / "models" / "finetuned_hf"
-        if path.exists():
-            st.success(f"✅ Modelo encontrado en `{path}`")
-        else:
-            st.warning(
-                f"⚠️ No se encontró el modelo en `{path}`. "
-                f"Ejecuta el **notebook 08** para generar el modelo fine-tuneado."
-            )
-    st.divider()
-    # ── Umbral de confianza ─────────────────────────────────────────────────
-    st.markdown("### 🎚️ Umbral de confianza")
-    st.caption("Probabilidad mínima para marcar un comentario como tóxico.")
-    new_thr = st.slider(
-        "Umbral",
-        min_value=0.3, max_value=0.9, step=0.05,
-        value=st.session_state.threshold,
-        label_visibility="collapsed",
-        format="%.2f",
-    )
-    if new_thr != st.session_state.threshold:
-        st.session_state.threshold = new_thr
-        st.info(f"Umbral actualizado: **{new_thr:.2f}**")
-    ta, tb = st.columns(2)
-    ta.info(f"⬇️ **{new_thr:.2f}** bajo → más FP (más censura)", icon="⚠️")
-    tb.info(f"⬆️ **{new_thr:.2f}** alto → más FN (más escapes)", icon="⚠️")
-    st.divider()
-    # ── Test rápido ─────────────────────────────────────────────────────────
-    st.markdown("### 🧪 Probar modelo")
-    test_txt = st.text_input(
-        "Texto a analizar",
-        placeholder="Ej: This is absolutely stupid and racist...",
-        label_visibility="collapsed",
-    )
-    if st.button("Analizar", type="primary") and test_txt.strip():
-        with st.spinner("Analizando..."):
-            svc = get_service(st.session_state.selected_model)
-            res = svc.predict(test_txt)
-        pct     = int(res["probability"] * 100)
-        verdict = "🔴 TÓXICO" if res["is_toxic"] else "🟢 SEGURO"
-        st.markdown(f"**{verdict}** — {pct}% de toxicidad")
-        st.progress(res["probability"])
-        if res["labels"]:
-            st.markdown(f"**Categorías:** {', '.join(res['labels'])}")
-        if "error" in res:
-            st.error(f"Error: {res['error']}")
-        st.caption(f"Modelo: {res['model_used']}")
-# ══════════════════════════════════════════════════════════════════════════════
-# MAIN
-# ══════════════════════════════════════════════════════════════════════════════
-def main():
-    render_sidebar()
-    page = st.session_state.page
-    if page == "Home":
-        render_home()
-    elif page == "Moderator Hub":
-        render_hub()
-    elif page == "Settings":
-        render_settings()
-if __name__ == "__main__":
-    main()

src/evaluation/.gitkeep DELETED Viewed

File without changes

src/service/model_catalog.py ADDED Viewed

	@@ -0,0 +1,32 @@

+"""Load inference model catalog from configs/model_catalog.yaml."""
+from __future__ import annotations
+from pathlib import Path
+from typing import Any
+import yaml
+PROJECT_ROOT = Path(__file__).resolve().parents[2]
+CATALOG_PATH = PROJECT_ROOT / "configs" / "model_catalog.yaml"
+_DEFAULT_CATALOG: dict[str, dict[str, Any]] = {
+    "LR + TF-IDF (local)": {
+        "type": "local",
+        "icon": "⚡",
+        "description": "Project baseline.",
+        "speed": "< 50ms",
+        "accuracy": "F1 0.76",
+        "requires": "joblib only",
+    },
+}
def load_model_catalog() -> dict[str, dict[str, Any]]:
    """Return the inference model catalog, keyed by model display name.

    The catalog is read from ``CATALOG_PATH``. The built-in
    ``_DEFAULT_CATALOG`` is used instead when the file is missing, empty,
    not a mapping at the top level, or has no usable entries.

    Entries whose value is not a mapping are dropped, because an entry is
    only useful when it can be indexed like a dict (``cfg["type"]``,
    ``cfg.get(...)``); a scalar or list entry would fail later with a less
    helpful error.

    Returns:
        A new ``name -> config`` dict. Default entries are copied, so a
        caller mutating the result never alters ``_DEFAULT_CATALOG``.

    Raises:
        yaml.YAMLError: If the catalog file exists but is not valid YAML.
    """

    def _defaults() -> dict[str, dict[str, Any]]:
        # Copy one level deep: dict(_DEFAULT_CATALOG) alone would share the
        # per-model dicts with the module-level constant.
        return {name: dict(cfg) for name, cfg in _DEFAULT_CATALOG.items()}

    if not CATALOG_PATH.exists():
        return _defaults()
    with CATALOG_PATH.open(encoding="utf-8") as f:
        data = yaml.safe_load(f)
    if not isinstance(data, dict):
        # Covers an empty file (None) as well as a top-level list/scalar.
        return _defaults()
    catalog = {name: cfg for name, cfg in data.items() if isinstance(cfg, dict)}
    return catalog or _defaults()

src/service/model_service.py CHANGED Viewed

@@ -1,99 +1,99 @@
-"""
-src/services/model_service.py
-Servicio centralizado de predicción de toxicidad.
-Modelos soportados:
-  local      → models/final_model.joblib  (LR + TF-IDF, instantáneo)
-  hf_remote  → HuggingFace Hub (requiere internet + transformers)
-  hf_local   → modelo HF fine-tuneado localmente (notebook 08)
-Instalación para modelos HF:
-  pip install transformers torch sentencepiece accelerate
-"""
 import re
-import yaml
-import joblib
 from pathlib import Path
-from typing import Optional
-# ─── Catálogo de modelos ──────────────────────────────────────────────────────
-AVAILABLE_MODELS = {
-    "LR + TF-IDF (local)": {
-        "type"       : "local",
-        "icon"       : "⚡",
-        "description": "Modelo del proyecto. Sin GPU, instantáneo.",
-        "speed"      : "< 50ms",
-        "accuracy"   : "F1 0.76",
-        "requires"   : "Solo joblib",
-    },
-    "DistilBERT Toxicity": {
-        "type"       : "hf_remote",
-        "icon"       : "🤖",
-        "model_id"   : "martin-ha/toxic-comment-model",
-        "description": "DistilBERT fine-tuned en comentarios tóxicos.",
-        "speed"      : "~200ms CPU",
-        "accuracy"   : "F1 0.85",
-        "requires"   : "transformers torch",
-    },
-    "toxic-bert (multilabel)": {
-        "type"       : "hf_remote",
-        "icon"       : "🧠",
-        "model_id"   : "unitary/toxic-bert",
-        "description": "BERT multi-label (Jigsaw). Detecta 6 categorías.",
-        "speed"      : "~400ms CPU",
-        "accuracy"   : "F1 0.88",
-        "requires"   : "transformers torch",
-    },
-    "RoBERTa Toxicity": {
-        "type"       : "hf_remote",
-        "icon"       : "🔬",
-        "model_id"   : "s-nlp/roberta_toxicity_classifier",
-        "description": "RoBERTa fine-tuned para toxicidad general.",
-        "speed"      : "~350ms CPU",
-        "accuracy"   : "F1 0.87",
-        "requires"   : "transformers torch",
-    },
-    "Modelo fine-tuneado (local)": {
-        "type"       : "hf_local",
-        "icon"       : "✨",
-        "model_path" : "models/finetuned_hf",
-        "description": "Tu modelo fine-tuneado en el notebook 08.",
-        "speed"      : "Depende del hardware",
-        "accuracy"   : "A evaluar",
-        "requires"   : "transformers torch",
-    },
-}
 HF_LABEL_MAP = {
-    "toxic": "Tóxico", "severe_toxic": "Muy ofensivo",
-    "obscene": "Obsceno", "threat": "Amenaza",
-    "insult": "Insulto", "identity_hate": "Odio racial",
-    "label_1": "Tóxico",
 }
 _KEYWORD_LABELS = {
-    "Insulto"    : ["idiot","stupid","dumb","fool","moron","loser"],
-    "Odio racial": ["thug","racist","race","criminal"],
-    "Amenaza"    : ["kill","shoot","die","dead","hurt","attack"],
-    "Obsceno"    : ["fuck","shit","ass","bitch","cunt","bastard"],
-    "Agresividad": ["hate","despise","disgusting","pathetic","worthless"],
 }
-def _labels_from_keywords(text: str, probability: float) -> list:
     t = text.lower()
     found = [lbl for lbl, kws in _KEYWORD_LABELS.items() if any(k in t for k in kws)]
-    return found if found else (["Contenido ofensivo"] if probability >= 0.5 else [])
 class _FallbackPreprocessor:
-    _SW = {"the","a","an","and","or","but","in","on","at","to","for",
-           "of","with","is","it","this","that","are","was","be","have",
-           "has","he","she","they","we","you","i","not","do","did",
-           "will","can","would","should","could","from","by","as","if"}
-    def transform(self, text):
         t = re.sub(r"http\S+|www\.\S+|@\w+", " ", str(text).lower())
         t = re.sub(r"[^\x00-\x7F]+", " ", t)
         t = re.sub(r"[^a-z\s]", " ", t)
@@ -103,10 +103,10 @@ class _FallbackPreprocessor:
 class ModelService:
     def __init__(self, model_name: str, project_root: Optional[Path] = None):
-        self.model_name   = model_name
-        self.cfg          = AVAILABLE_MODELS.get(model_name) or list(AVAILABLE_MODELS.values())[0]
         self.project_root = project_root or Path.cwd()
-        self._model       = None
         self._preprocessor = None
     def _get_model(self):
@@ -119,84 +119,95 @@ class ModelService:
             elif t == "hf_local":
                 path = self.project_root / self.cfg["model_path"]
                 if not path.exists():
-                    raise FileNotFoundError(
-                        f"Modelo no encontrado en {path}. Ejecuta el notebook 08 primero."
-                    )
                 self._load_hf(str(path))
         return self._model
-    def _load_local(self):
-        for name in ["final_model.joblib","lr_tuned.joblib",
-                     "lr_baseline.joblib","best_ensemble.joblib"]:
             p = self.project_root / "models" / name
             if p.exists():
                 self._model = joblib.load(p)
                 break
         if self._model is None:
-            raise FileNotFoundError(f"No hay modelo en {self.project_root / 'models'}")
         try:
-            import sys; sys.path.insert(0, str(self.project_root))
             from src.features.text_preprocessor import TextPreprocessor
             self._preprocessor = TextPreprocessor(
                 config_path=str(self.project_root / "configs" / "features.yaml")
             )
         except Exception:
             self._preprocessor = _FallbackPreprocessor()
-    def _load_hf(self, model_id_or_path: str):
         try:
             from transformers import pipeline as hf_pipeline
-        except ImportError:
-            raise ImportError("Instala: pip install transformers torch sentencepiece")
         self._model = hf_pipeline(
-            "text-classification", model=model_id_or_path,
-            return_all_scores=True, truncation=True, max_length=512,
         )
     def predict(self, text: str) -> dict:
         if not text or not text.strip():
-            return {"is_toxic": False, "probability": 0.0,
-                    "labels": [], "model_used": self.model_name}
         try:
             model = self._get_model()
             if self.cfg["type"] == "local":
                 return self._pred_local(text, model)
             return self._pred_hf(text, model)
         except Exception as e:
-            return {"is_toxic": False, "probability": 0.0,
-                    "labels": [], "model_used": self.model_name, "error": str(e)}
-    def _pred_local(self, text, model):
         clean = self._preprocessor.transform(text) or text
         proba = float(model.predict_proba([clean])[0][1])
-        tox   = proba >= 0.5
-        return {"is_toxic": tox, "probability": proba,
-                "labels": _labels_from_keywords(text, proba) if tox else [],
-                "model_used": self.model_name}
-    def _pred_hf(self, text, pipeline_fn):
-        raw   = pipeline_fn(text[:512])
-        smap  = {s["label"].lower(): s["score"] for s in (raw[0] if isinstance(raw[0], list) else raw)}
-        for key in ("label_1","toxic","toxic_1"):
             if key in smap:
-                proba = smap[key]; break
         else:
-            neg  = {"label_0","non_toxic","not_toxic","not toxic"}
-            vals = [v for k,v in smap.items() if k not in neg]
             proba = max(vals) if vals else 0.0
         tox = proba >= 0.5
-        labels = []
         if tox:
-            for k,v in smap.items():
-                if k not in ("label_0","non_toxic") and v >= 0.35:
-                    friendly = HF_LABEL_MAP.get(k, k.replace("_"," ").title())
-                    if "no tóxico" not in friendly.lower():
-                        labels.append(friendly)
             if not labels:
-                labels = ["Contenido ofensivo"]
-        return {"is_toxic": tox, "probability": proba,
-                "labels": labels, "model_used": self.model_name}
     @staticmethod
-    def get_available_models(): return AVAILABLE_MODELS
-    def get_model_info(self):   return self.cfg

+"""Centralized toxicity prediction service."""
+from __future__ import annotations
 import re
+import sys
 from pathlib import Path
+from typing import Any, Optional
+import joblib
+from src.service.model_catalog import load_model_catalog
+AVAILABLE_MODELS: dict[str, dict[str, Any]] = load_model_catalog()
+_HF_DEPS_MSG = "Install HF deps: uv sync --extra hf"
def hf_deps_available() -> bool:
    """Report whether the optional ``transformers`` package can be imported.

    Only ``transformers`` itself is probed; an ``ImportError`` raised while
    importing it counts as "not available".
    """
    try:
        import transformers  # noqa: F401
    except ImportError:
        return False
    else:
        return True
def check_model_availability(name: str, project_root: Path | None = None) -> tuple[bool, str | None]:
    """Return ``(available, reason)`` for a catalog model name.

    Args:
        name: Key to look up in ``AVAILABLE_MODELS``.
        project_root: Directory against which ``models/`` and a relative
            ``model_path`` are resolved. Defaults to the current working
            directory.

    Returns:
        ``(True, None)`` when the model looks usable, otherwise
        ``(False, reason)`` with a human-readable explanation. The check is
        cheap: it looks for files and importable dependencies but never
        loads a model; for ``hf_remote`` only the dependencies are checked.
    """
    cfg = AVAILABLE_MODELS.get(name)
    if not cfg:
        return False, "Unknown model"
    root = project_root or Path.cwd()
    model_type = cfg.get("type", "local")

    if model_type == "local":
        models_dir = root / "models"
        # Same file names, in the same order, that ModelService._load_local tries.
        candidates = (
            "final_model.joblib",
            "lr_tuned.joblib",
            "lr_baseline.joblib",
            "best_ensemble.joblib",
        )
        if any((models_dir / filename).exists() for filename in candidates):
            return True, None
        return False, f"No model in {models_dir}"

    if model_type in ("hf_local", "hf_remote"):
        if not hf_deps_available():
            return False, _HF_DEPS_MSG
        if model_type == "hf_remote":
            return True, None
        # The catalog is user-edited YAML, so a missing path is reported as
        # "unavailable" instead of escaping as a KeyError.
        model_path = cfg.get("model_path")
        if not model_path:
            return False, f"Catalog entry {name!r} has no model_path"
        path = root / model_path
        if not path.exists():
            return False, f"Model not found at {path}."
        return True, None

    return False, f"Unsupported model type: {model_type}"
 HF_LABEL_MAP = {
+    "toxic": "Toxic",
+    "severe_toxic": "Severely offensive",
+    "obscene": "Obscene",
+    "threat": "Threat",
+    "insult": "Insult",
+    "identity_hate": "Identity hate",
+    "label_1": "Toxic",
 }
 _KEYWORD_LABELS = {
+    "Insult": ["idiot", "stupid", "dumb", "fool", "moron", "loser"],
+    "Identity hate": ["thug", "racist", "race", "criminal"],
+    "Threat": ["kill", "shoot", "die", "dead", "hurt", "attack"],
+    "Obscene": ["fuck", "shit", "ass", "bitch", "cunt", "bastard"],
+    "Aggression": ["hate", "despise", "disgusting", "pathetic", "worthless"],
 }
+def _labels_from_keywords(text: str, probability: float) -> list[str]:
     t = text.lower()
     found = [lbl for lbl, kws in _KEYWORD_LABELS.items() if any(k in t for k in kws)]
+    return found if found else (["Offensive content"] if probability >= 0.5 else [])
 class _FallbackPreprocessor:
+    _SW = {
+        "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
+        "of", "with", "is", "it", "this", "that", "are", "was", "be", "have",
+        "has", "he", "she", "they", "we", "you", "i", "not", "do", "did",
+        "will", "can", "would", "should", "could", "from", "by", "as", "if",
+    }
+    def transform(self, text: str) -> str:
         t = re.sub(r"http\S+|www\.\S+|@\w+", " ", str(text).lower())
         t = re.sub(r"[^\x00-\x7F]+", " ", t)
         t = re.sub(r"[^a-z\s]", " ", t)
 class ModelService:
     def __init__(self, model_name: str, project_root: Optional[Path] = None):
+        self.model_name = model_name
+        self.cfg = AVAILABLE_MODELS.get(model_name) or next(iter(AVAILABLE_MODELS.values()))
         self.project_root = project_root or Path.cwd()
+        self._model = None
         self._preprocessor = None
     def _get_model(self):
             elif t == "hf_local":
                 path = self.project_root / self.cfg["model_path"]
                 if not path.exists():
+                    raise FileNotFoundError(f"Model not found at {path}.")
                 self._load_hf(str(path))
         return self._model
+    def _load_local(self) -> None:
+        for name in ("final_model.joblib", "lr_tuned.joblib", "lr_baseline.joblib", "best_ensemble.joblib"):
             p = self.project_root / "models" / name
             if p.exists():
                 self._model = joblib.load(p)
                 break
         if self._model is None:
+            raise FileNotFoundError(f"No model in {self.project_root / 'models'}")
         try:
+            sys.path.insert(0, str(self.project_root))
             from src.features.text_preprocessor import TextPreprocessor
             self._preprocessor = TextPreprocessor(
                 config_path=str(self.project_root / "configs" / "features.yaml")
             )
         except Exception:
             self._preprocessor = _FallbackPreprocessor()
+    def _load_hf(self, model_id_or_path: str) -> None:
         try:
             from transformers import pipeline as hf_pipeline
+        except ImportError as exc:
+            raise ImportError("Install HF deps: uv sync --extra hf") from exc
         self._model = hf_pipeline(
+            "text-classification",
+            model=model_id_or_path,
+            return_all_scores=True,
+            truncation=True,
+            max_length=512,
         )
     def predict(self, text: str) -> dict:
         if not text or not text.strip():
+            return {"is_toxic": False, "probability": 0.0, "labels": [], "model_used": self.model_name}
         try:
             model = self._get_model()
             if self.cfg["type"] == "local":
                 return self._pred_local(text, model)
             return self._pred_hf(text, model)
         except Exception as e:
+            return {
+                "is_toxic": False,
+                "probability": 0.0,
+                "labels": [],
+                "model_used": self.model_name,
+                "error": str(e),
+            }
+    def _pred_local(self, text: str, model) -> dict:
         clean = self._preprocessor.transform(text) or text
         proba = float(model.predict_proba([clean])[0][1])
+        tox = proba >= 0.5
+        return {
+            "is_toxic": tox,
+            "probability": proba,
+            "labels": _labels_from_keywords(text, proba) if tox else [],
+            "model_used": self.model_name,
+        }
+    def _pred_hf(self, text: str, pipeline_fn) -> dict:
+        raw = pipeline_fn(text[:512])
+        smap = {s["label"].lower(): s["score"] for s in (raw[0] if isinstance(raw[0], list) else raw)}
+        proba = 0.0
+        for key in ("label_1", "toxic", "toxic_1"):
             if key in smap:
+                proba = smap[key]
+                break
         else:
+            neg = {"label_0", "non_toxic", "not_toxic", "not toxic"}
+            vals = [v for k, v in smap.items() if k not in neg]
             proba = max(vals) if vals else 0.0
         tox = proba >= 0.5
+        labels: list[str] = []
         if tox:
+            for k, v in smap.items():
+                if k not in ("label_0", "non_toxic") and v >= 0.35:
+                    friendly = HF_LABEL_MAP.get(k, k.replace("_", " ").title())
+                    labels.append(friendly)
             if not labels:
+                labels = ["Offensive content"]
+        return {"is_toxic": tox, "probability": proba, "labels": labels, "model_used": self.model_name}
     @staticmethod
+    def get_available_models() -> dict:
+        return AVAILABLE_MODELS
+    def get_model_info(self) -> dict:
+        return self.cfg

tests/test_api.py CHANGED Viewed

@@ -1,4 +1,4 @@
-"""Tests del endpoint POST /predict."""
 from unittest.mock import MagicMock
@@ -6,11 +6,14 @@ import pytest
 from fastapi.testclient import TestClient
 from src.api import main as api_main
 PREDICT_RESPONSE_KEYS = {
     "text",
     "is_toxic",
     "probability",
     "labels",
     "model_used",
     "latency_ms",
@@ -28,13 +31,16 @@ def client():
     }
     with TestClient(api_main.app) as test_client:
-        api_main._state["service"] = mock_service
-        api_main._state["model_name"] = "LR + TF-IDF (local)"
-        api_main._state["predictions_served"] = 0
         yield test_client
-    api_main._state["service"] = None
-    api_main._state["model_name"] = None
 def test_predict_returns_correct_structure(client: TestClient):
@@ -47,14 +53,79 @@ def test_predict_returns_correct_structure(client: TestClient):
     data = response.json()
     assert PREDICT_RESPONSE_KEYS <= set(data.keys())
     assert data["text"] == "This is a sample comment"
     assert isinstance(data["is_toxic"], bool)
     assert 0.0 <= data["probability"] <= 1.0
-    assert isinstance(data["labels"], list)
-    assert isinstance(data["model_used"], str)
-    assert isinstance(data["latency_ms"], (int, float))
 def test_predict_rejects_empty_text(client: TestClient):
     response = client.post("/predict", json={"text": "   "})
     assert response.status_code == 422

+"""Tests for POST /predict."""
 from unittest.mock import MagicMock
 from fastapi.testclient import TestClient
 from src.api import main as api_main
+from src.api.state import get_state
 PREDICT_RESPONSE_KEYS = {
     "text",
     "is_toxic",
     "probability",
+    "status",
+    "mode",
     "labels",
     "model_used",
     "latency_ms",
     }
     with TestClient(api_main.app) as test_client:
+        state = get_state()
+        state["service"] = mock_service
+        state["model_name"] = "LR + TF-IDF (local)"
+        state["predictions_served"] = 0
+        state["startup_time"] = 0.0
         yield test_client
+    state = get_state()
+    state["service"] = None
+    state["model_name"] = None
 def test_predict_returns_correct_structure(client: TestClient):
     data = response.json()
     assert PREDICT_RESPONSE_KEYS <= set(data.keys())
     assert data["text"] == "This is a sample comment"
+    assert data["status"] == "Safe"
+    assert data["mode"] == "binary"
     assert isinstance(data["is_toxic"], bool)
     assert 0.0 <= data["probability"] <= 1.0
 def test_predict_rejects_empty_text(client: TestClient):
     response = client.post("/predict", json={"text": "   "})
+    assert response.status_code == 422
def test_health_includes_project_name(client: TestClient):
    """GET /health succeeds and reports the project name."""
    response = client.get("/health")
    assert response.status_code == 200
    body = response.json()
    assert body["project"] == "youtube_hate_detector"
def test_predict_video_demo_comments_differ_by_url(client: TestClient, monkeypatch):
    """With no YOUTUBE_API_KEY set, /predict-video serves demo data that differs per video URL."""
    monkeypatch.delenv("YOUTUBE_API_KEY", raising=False)
    urls = (
        "https://www.youtube.com/watch?v=jNQXAC9IVRw",
        "https://www.youtube.com/watch?v=IEEhzQoKtQU",
    )
    # Same request body for both videos; only the URL changes.
    responses = [
        client.post(
            "/predict-video",
            json={"url": url, "max_comments": 5, "threshold": 0.5},
        )
        for url in urls
    ]
    for response in responses:
        assert response.status_code == 200
    first, second = (response.json() for response in responses)
    assert first["source"] == "demo"
    assert second["source"] == "demo"
    assert first["results"][0]["text"] != second["results"][0]["text"]
def test_models_status_lists_catalog(client: TestClient):
    """GET /models/status returns a non-empty ``models`` list that includes the baseline model."""
    response = client.get("/models/status")
    assert response.status_code == 200
    payload = response.json()
    assert "models" in payload
    models = payload["models"]
    assert len(models) >= 1
    assert "LR + TF-IDF (local)" in {entry["name"] for entry in models}
def test_predict_video_comments_disabled_raises_422(client: TestClient, monkeypatch):
    """A CommentsFetchError raised by the comment fetcher surfaces as HTTP 422 with its message."""
    from src.api.youtube import CommentsFetchError

    def _raise_disabled(*_args, **_kwargs):
        raise CommentsFetchError("Comments are disabled on this video")

    # A key must be present so the route calls the (patched) fetcher.
    monkeypatch.setenv("YOUTUBE_API_KEY", "fake-key")
    monkeypatch.setattr("src.api.routes.predict.fetch_comments", _raise_disabled)

    payload = {
        "url": "https://www.youtube.com/watch?v=disabled123",
        "max_comments": 5,
        "threshold": 0.5,
    }
    response = client.post("/predict-video", json=payload)

    assert response.status_code == 422
    detail = response.json()["detail"]
    assert "disabled" in detail.lower()

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff