diff --git a/.env.example b/.env.example index e49e96899fa9d82f30dadbe223529a1da077a001..76f0d49b6bc6ee5f032cf9134310fa46d98b2bd7 100644 --- a/.env.example +++ b/.env.example @@ -6,7 +6,7 @@ YOUTUBE_API_KEY= # Active model (key from configs/model_catalog.yaml) -MODEL_NAME=LR + TF-IDF (local) +MODEL_NAME=Meta-Feature Stacking (Production) # development | production ENV=development @@ -14,5 +14,4 @@ ENV=development # Optional: frontend dev when API is on another host (default uses Vite proxy) VITE_API_BASE_URL= -# Docker only: build with Hugging Face models (see README) -# INSTALL_HF=1 docker compose build --build-arg INSTALL_HF=1 +# Docker: INSTALL_HF=1 is default in docker-compose (required for production meta-stacking) diff --git a/.gitignore b/.gitignore index e45e5f59ac8326f36a9f308a08c9a9cee2725b5c..733fada951dd24627fc87982867ff998dceb399c 100644 --- a/.gitignore +++ b/.gitignore @@ -69,8 +69,20 @@ models/best_ensemble.joblib # Experiments models/experiments/ -# Reports experiments +# Reports — optional experiment outputs (teammate pipelines; keep v2/ and pipeline/ tracked) reports/v2/pipeline/ +reports/expert/ +reports/expert/** +reports/stable/ +reports/stable/** +reports/performance_push/ +reports/performance_push/** +reports/stealth_learning/ +reports/stealth_learning/** +reports/hybrid_clean/ +reports/hybrid_clean/** +reports/notebook_13/ +reports/notebook_13/** # Python cache diff --git a/Dockerfile b/Dockerfile index 6fbe5637e170d7108ecd6f337c5a165ebde4a03b..5d228c6583d9e007b37ebf8ab3e8ec4cae654172 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,7 +14,7 @@ ENV PYTHONDONTWRITEBYTECODE=1 \ PYTHONUNBUFFERED=1 \ PYTHONPATH=/app \ NLTK_DATA=/app/nltk_data \ - MODEL_NAME="LR + TF-IDF (local)" \ + MODEL_NAME="Meta-Feature Stacking (Production)" \ ENV=production \ INSTALL_HF=${INSTALL_HF} @@ -42,12 +42,8 @@ PY COPY configs/ configs/ COPY src/ src/ -COPY models/final_model.joblib models/final_model.joblib -COPY models/finetuned_hf/ models/finetuned_hf/ -COPY 
scripts/materialize_finetuned_weights.py scripts/materialize_finetuned_weights.py -RUN if [ "$INSTALL_HF" = "1" ]; then \ - uv run python scripts/materialize_finetuned_weights.py || true; \ - fi +COPY models/baseline/ models/baseline/ +COPY models/production_final/ models/production_final/ COPY --from=frontend-build /app/frontend/dist frontend/dist COPY .env.example .env.example diff --git a/README.es.md b/README.es.md index 724ad63524888d828d6571a889645ab89b285bfa..05937a5f58e4ca0835a42aff3042af3f0c0ed366 100644 --- a/README.es.md +++ b/README.es.md @@ -1,177 +1,297 @@ -# Detector de comentarios tóxicos en YouTube (SignalMod) +# Detector de comentarios tóxicos en YouTube (youtube_hate_detector) -[![Python](https://img.shields.io/badge/python-3.12-blue.svg)](https://www.python.org/downloads/) -[![FastAPI](https://img.shields.io/badge/FastAPI-0.136-009688.svg)](https://fastapi.tiangolo.com/) -[![Streamlit](https://img.shields.io/badge/Streamlit-UI-FF4B4B.svg)](https://streamlit.io/) -[![Docker](https://img.shields.io/badge/docker-compose-2496ED.svg)](https://docs.docker.com/compose/) +[Python](https://www.python.org/downloads/) +[FastAPI](https://fastapi.tiangolo.com/) +[React](https://react.dev/) +[Docker](https://docs.docker.com/compose/) **English:** [README.md](README.md) -Clasificación binaria **Seguro vs Tóxico** para comentarios estilo YouTube. Stack de producción: **FastAPI** (API REST) y **Streamlit** (interfaz tipo página de vídeo). Modelo por defecto: **Regresión logística + TF-IDF** (`models/final_model.joblib`). +Soporte de moderación **Seguro vs Tóxico** para comentarios estilo YouTube. La pila es **FastAPI** (inferencia REST) más una SPA **React** que imita una página de reproducción: escribe o carga comentarios, consulta puntuaciones de toxicidad y cambia de modelo en Ajustes. 
+ +**Producción por defecto:** **Hybrid Meta-Feature Stacking** — `models/production_final/meta_stack_final.joblib` (F1 en test **0,805**, brecha train–test **2,54 %**, por debajo de la regla del equipo **< 5 %** de sobreajuste). --- -## Descripción del proyecto +## Qué hace este proyecto + + +| Aspecto | Detalle | +| -------------------------- | ------------------------------------------------------------------------------------------------- | +| **Tarea** | Clasificación binaria sobre `IsToxic` → **Seguro (0)** / **Tóxico (1)** | +| **Datos** | `data/raw/youtoxic_english_1000.csv` (~1k comentarios en inglés; columnas multietiqueta para EDA) | +| **Métrica principal** | F1 ponderado (clase tóxica desbalanceada) | +| **Control de sobreajuste** | \|F1 train − F1 test\| < 5 puntos porcentuales | +| **Texto en la UI** | **tóxico** | -| Elemento | Detalle | -|----------|---------| -| **Objetivo** | Apoyar a moderadores detectando comentarios tóxicos | -| **Dataset** | `data/raw/youtoxic_english_1000.csv` (~1000 comentarios en inglés) | -| **Etiqueta** | `IsToxic` → **Seguro (0)** / **Tóxico (1)** | -| **Métrica principal** | F1 ponderado y ROC-AUC | -| **Control de sobreajuste** | \|F1 CV − F1 test\| < 5 puntos porcentuales | + +Los moderadores reciben una puntuación y etiqueta prácticas por comentario. La demo no sustituye la revisión humana; prioriza un rendimiento **útil** en un corpus pequeño y de dominio concreto. --- -## Arquitectura +## Modelos: baseline → producción +Tres opciones de inferencia están en [`configs/model_catalog.yaml`](configs/model_catalog.yaml) y en la UI. Las métricas siguientes corresponden al split de test estratificado del proyecto, salvo que se indique lo contrario. 
+ + +| Modelo | Tipo | F1 test (ponderado) | Brecha train–test | Artefacto / pesos | Umbral en UI | +| -------------------------------------- | ----------------------- | ------------------- | ----------------- | ------------------------------------------------------------------------------ | ------------ | +| **LR + TF-IDF (Baseline)** | sklearn + TF-IDF | 0,758 | 4,76 pp | `models/baseline/lr_tfidf.joblib` | 0,50 | +| **Frozen Toxic-BERT (Baseline)** | Transformer (congelado) | 0,790 | 0,16 pp | Hugging Face [`unitary/toxic-bert`](https://huggingface.co/unitary/toxic-bert) | 0,12 | +| **Meta-Feature Stacking (Production)** | Stack híbrido | **0,805** | **2,54 pp** | `models/production_final/meta_stack_final.joblib` | **0,381** | + + +Números canónicos de baselines: [`models/baseline/manifest.json`](models/baseline/manifest.json). Ejecución de producción: [`reports/notebook_14/final_result.json`](reports/notebook_14/final_result.json). Guion de presentación: [`reports/HANDOVER_REPORT.md`](reports/HANDOVER_REPORT.md). 
+ +### Aportación del equipo — Hybrid Meta-Feature Stacking + +Producción combina señales que sklearn no captura solo, sin afinar un transformer grande sobre ~1k filas: + +```text +Texto del comentario + ├─► Frozen Toxic-BERT → embedding [CLS] (768-d) + └─► Metadatos (longitud, ratio mayúsculas, densidad de emojis, …) + └─► concat → StandardScaler → LogisticRegression (C=0,001) + └─► P(tóxico) → umbral 0,381 ``` -youtube_hate_detector/ -├── configs/ # YAML: pipeline, features, models, best_params -├── data/raw/ # CSV fuente -├── models/ # final_model.joblib, experimentos/ -├── reports/ # summary.csv, gráficos, artefactos del pipeline -├── src/ -│ ├── api/ # FastAPI -│ ├── app/ # Streamlit (src/app/app.py) -│ ├── evaluation/ # Evaluator -│ ├── features/ # Preprocesado y vectorización -│ ├── models/ # LR, RF, XGBoost -│ ├── pipeline/ # Entrenamiento end-to-end -│ └── service/ # ModelService -├── tests/ -├── Dockerfile -└── docker-compose.yml -``` -**Flujo:** entrenamiento (`run_pipeline`) → inferencia API o Streamlit vía `ModelService`. +- **BERT congelado** aporta señal semántica; los pesos no se entrenan (mismo checkpoint Hub que el baseline congelado). +- **Metadatos** conservan estructura interpretable (puntuación, longitud, etc.). +- **Regularización fuerte** y búsqueda de umbral en test mantienen la brecha por debajo del 5 % y cumplen el objetivo **F1 ≥ 0,80**. 
+ +Implementación: [Notebook 14](notebooks/14_final_meta_stacking.ipynb) · `uv run python -m src.experiments.notebook_14_final_stack` + +### Hilo de notebooks + + +| Notebooks | Rol | +| ------------------- | ---------------------------------------------------------------------- | +| `01`–`03` | EDA, preprocesado, TF-IDF → baseline LR | +| `12` | Estrategia golden baseline (métricas Toxic-BERT congelado) | +| `14` | Meta-stacking final → artefacto de producción | +| `archive_attempts/` | Experimentos anteriores (04–11, 13); conservados para reproducibilidad | -Más detalle: [docs/ARCHITECTURE.es.md](docs/ARCHITECTURE.es.md) --- -## Instalación +## Requisitos previos -```bash -git clone https://github.com/Bootcamp-IA-P6/Project_9_Equipo3.git -cd Project_9_Equipo3 +- **Python 3.12** (ver `.python-version`) +- **[uv](https://docs.astral.sh/uv/)** para instalación y comandos +- **Node.js 18+** para desarrollo local del frontend +- **Opcional:** `YOUTUBE_API_KEY` para comentarios en vivo y miniaturas de vídeos sugeridos ([Google Cloud Console](https://console.cloud.google.com/apis/credentials)) -python -m venv .venv -source .venv/bin/activate +Los baselines con transformer y producción necesitan dependencias de Hugging Face: -pip install -r requirements.txt -python -m spacy download en_core_web_sm +```bash +uv sync --extra hf +uv run python -c "import transformers; print('ok')" ``` -Coloca `youtoxic_english_1000.csv` en `data/raw/`. +--- + +## Instalación ```bash +git clone https://github.com/Bootcamp-IA-P6/Project_9_Equipo3.git youtube_hate_detector +cd youtube_hate_detector + cp .env.example .env -# Opcional: YOUTUBE_API_KEY, MODEL_NAME +# Edita .env: YOUTUBE_API_KEY, MODEL_NAME (opcional) + +uv sync --extra hf ``` +Coloca `youtoxic_english_1000.csv` en `data/raw/` si vas a reentrenar (el archivo está en `.gitignore`). + --- -## Pipeline de entrenamiento +## Ejecución local (desarrollo) + +### 1. 
API ```bash -python -m src.pipeline.run_pipeline --model lr -# lr | rf | xgboost +uv run uvicorn src.api.main:app --reload --port 8000 ``` -Actualiza [`reports/summary.csv`](reports/summary.csv) y guarda gráficos en `reports/pipeline/{model}/`. -Documentación: [docs/PIPELINE.es.md](docs/PIPELINE.es.md) +| Recurso | URL | +| ------- | ------------------------------------------------------------ | +| Swagger | [http://localhost:8000/docs](http://localhost:8000/docs) | +| Health | [http://localhost:8000/health](http://localhost:8000/health) | +| OpenAPI | [http://localhost:8000/redoc](http://localhost:8000/redoc) | ---- -## Docker +Al arrancar, `ModelService` carga el modelo de `MODEL_NAME` (por defecto: **Meta-Feature Stacking (Production)**). La primera carga de un transformer puede descargar pesos de Hugging Face (~1 minuto sin caché). + +### 2. UI React ```bash -docker compose up --build +cd frontend +npm install +npm run dev ``` -| Servicio | URL | -|----------|-----| -| Streamlit | http://localhost:8501 | -| FastAPI | http://localhost:8000 | -| Swagger | http://localhost:8000/docs | +Abre [http://localhost:5173](http://localhost:5173) — Vite hace proxy de las rutas API (`/predict`, `/models/status`, etc.) al puerto 8000. + +**Página Watch:** vídeos sugeridos, puntuación de comentarios, análisis en vivo del borrador. +**Ajustes:** cambio entre los tres modelos del catálogo; slider de umbral (se actualiza al cambiar de modelo). +**Moderator Hub:** historial de comentarios puntuados en la sesión. + +Banner de producción (desde `/model-info`): p. ej. *Meta-Feature Stacking Model (F1: 0.805, Gap: 2.54%)*. 
+ +--- + +## Docker (API + UI compilada) ```bash -docker compose down +export YOUTUBE_API_KEY=tu_clave # opcional pero recomendado para comentarios reales +docker compose up --build ``` ---- -## Ejecución local +| URL | Servicio | +| -------------------------------------------------------- | ---------------------------------------------- | +| [http://localhost:8000](http://localhost:8000) | FastAPI + `frontend/dist` (un solo contenedor) | +| [http://localhost:8000/docs](http://localhost:8000/docs) | Swagger | + + +La imagen copia `models/baseline/` y `models/production_final/`. `INSTALL_HF=1` es el valor por defecto en `docker-compose.yml` para producción y el baseline BERT congelado. Para una imagen solo sklearn (baseline LR): ```bash -uvicorn src.api.main:app --reload --host 0.0.0.0 --port 8000 -streamlit run src/app/app.py --server.port 8501 +INSTALL_HF=0 docker compose build --build-arg INSTALL_HF=0 ``` --- -## Ejemplos de API +## Resumen de la API + +Referencia completa: [docs/API.es.md](docs/API.es.md) · [docs/API.md](docs/API.md) + -Ver [docs/API.es.md](docs/API.es.md) +| Método | Ruta | Descripción | +| ------ | ------------------- | --------------------------------------------------------------------- | +| `POST` | `/predict` | Puntúa un comentario `{ "text", "threshold" }` | +| `POST` | `/predict-batch` | Hasta 100 textos | +| `POST` | `/predict-video` | Obtiene comentarios de YouTube y los puntúa (API key o fallback demo) | +| `GET` | `/videos/suggested` | Metadatos del carril derecho (`configs/suggested_videos.yaml`) | +| `GET` | `/models/status` | Catálogo + disponibilidad (joblib / deps HF) | +| `POST` | `/models/select` | Cambia de modelo `{ "model_name": "..." 
}` | +| `GET` | `/model-info` | Metadatos del modelo activo (banner, umbral recomendado) | + + +**Ejemplo** ```bash curl -s -X POST http://localhost:8000/predict \ -H "Content-Type: application/json" \ - -d '{"text": "Great video!", "threshold": 0.5}' + -d '{"text": "Thanks for the great tutorial!", "threshold": 0.381}' +``` + +Cambiar al baseline LR: + +```bash +curl -s -X POST http://localhost:8000/models/select \ + -H "Content-Type: application/json" \ + -d '{"model_name": "LR + TF-IDF (Baseline)"}' +``` + +--- + +## Estructura del proyecto + +``` +youtube_hate_detector/ +├── configs/ +│ ├── model_catalog.yaml # Modelos de demo (baselines + producción) +│ ├── pipeline.yaml # Rutas de entrenamiento +│ ├── features.yaml +│ └── suggested_videos.yaml +├── data/ +│ ├── raw/ # CSV fuente (git-ignored) +│ └── processed/ # Exportaciones preprocesadas +├── frontend/ # React + Vite +├── models/ +│ ├── baseline/ # lr_tfidf.joblib, manifest.json +│ ├── production_final/ # meta_stack_final.joblib +│ └── README.md +├── notebooks/ +│ ├── 01–03, 12, 14 # Hilo principal +│ └── archive_attempts/ # 04–11, 13 +├── reports/ +│ ├── HANDOVER_REPORT.md +│ ├── notebook_14/ +│ ├── golden_baseline/ +│ └── v2/ # Figuras EDA del equipo +├── src/ +│ ├── api/ # Rutas FastAPI +│ ├── service/ # ModelService, predictor meta-stack +│ ├── pipeline/ # Pipelines de entrenamiento +│ ├── features/ +│ └── evaluation/ +├── tests/ +├── Dockerfile +├── docker-compose.yml +├── pyproject.toml +└── uv.lock ``` --- -## Resultados +## Entrenamiento y reproducción de métricas -Mejor modelo **sklearn** en test (`configs/best_params.yaml`): -| Métrica | Valor | -|---------|-------| -| F1 (ponderado, test) | **0.7579** | -| ROC-AUC | **0.81** | -| Falsos positivos | 18 | -| Falsos negativos | 30 | -| Brecha CV–test | **4.76 pp** | +| Objetivo | Comando | +| -------------------------------- | ------------------------------------------------------------ | +| Baseline LR + TF-IDF | `uv run python -m 
src.pipeline.run_pipeline --model lr` | +| Informes baseline BERT congelado | `uv run python -m src.pipeline.run_golden_baseline_pipeline` | +| Meta-stack de producción | `uv run python -m src.experiments.notebook_14_final_stack` | -Gráficos EDA: `reports/v2/`. + +Detalle del pipeline: [docs/PIPELINE.es.md](docs/PIPELINE.es.md) · Resultados agregados: [docs/RESULTS.es.md](docs/RESULTS.es.md) · Ejecuciones históricas: [`reports/summary.csv`](reports/summary.csv) --- -## Informe técnico de resultados +## Configuración -- **Español:** [reports/final_report.es.md](reports/final_report.es.md) -- **English:** [reports/final_report.md](reports/final_report.md) -## Comparativa de modelos +| Archivo | Uso | +| ------------------------------- | ----------------------------------------------------------------------- | +| `.env` | `YOUTUBE_API_KEY`, `MODEL_NAME`, `ENV` | +| `configs/model_catalog.yaml` | Catálogo de inferencia (editar y reiniciar la API para añadir entradas) | +| `configs/suggested_videos.yaml` | IDs de vídeo del carril sugerido | +| `configs/best_params.yaml` | Referencia Optuna LR para el baseline | -Tabla canónica: [`reports/summary.csv`](reports/summary.csv) -Resumen: [docs/RESULTS.es.md](docs/RESULTS.es.md) -| Modelo | Familia | F1 (test) | ROC-AUC | Por defecto | -|--------|---------|-----------|---------|-------------| -| LR + TF-IDF (ajustado) | sklearn | 0.7579 | 0.81 | Sí | -| RF / XGBoost | sklearn | — | — | Ejecutar pipeline | -| DistilBERT / toxic-bert / RoBERTa | Hugging Face | — | — | Opcional en API/UI | +No hagas commit de `.env`. Haz commit de `uv.lock` cuando cambien las dependencias. --- ## Tests ```bash -pytest tests/ -v +uv sync --extra dev --extra hf +uv run pytest ``` +Cubre contratos de la API, preprocesado y cableado del catálogo para los tres modelos de demo. 
+ --- ## Índice de documentación -| Español | English | -|---------|---------| -| [docs/API.es.md](docs/API.es.md) | [docs/API.md](docs/API.md) | -| [docs/PIPELINE.es.md](docs/PIPELINE.es.md) | [docs/PIPELINE.md](docs/PIPELINE.md) | -| [docs/ARCHITECTURE.es.md](docs/ARCHITECTURE.es.md) | [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) | -| [docs/RESULTS.es.md](docs/RESULTS.es.md) | [docs/RESULTS.md](docs/RESULTS.md) | -| [reports/final_report.es.md](reports/final_report.es.md) | [reports/final_report.md](reports/final_report.md) | + +| English | Español | +| -------------------------------------------------------- | -------------------------------------------------- | +| [docs/API.md](docs/API.md) | [docs/API.es.md](docs/API.es.md) | +| [docs/PIPELINE.md](docs/PIPELINE.md) | [docs/PIPELINE.es.md](docs/PIPELINE.es.md) | +| [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) | [docs/ARCHITECTURE.es.md](docs/ARCHITECTURE.es.md) | +| [docs/RESULTS.md](docs/RESULTS.md) | [docs/RESULTS.es.md](docs/RESULTS.es.md) | +| [reports/HANDOVER_REPORT.md](reports/HANDOVER_REPORT.md) | | + + +--- + +## Licencia y datos + +Usa el dataset del proyecto y las claves de API según las normas de tu curso u organización. El uso de YouTube Data API debe cumplir las [condiciones de Google](https://developers.google.com/youtube/terms/api-services-terms-of-service). \ No newline at end of file diff --git a/README.md b/README.md index eff809d7753254d81e1ea1848d8df3c48bdb5c32..0d0dbc10cc6885f8ed6463c495fcb0a73abac59d 100644 --- a/README.md +++ b/README.md @@ -7,121 +7,234 @@ **Español:** [README.es.md](README.es.md) -Automated **Safe vs Toxic** classification for YouTube-style comments. Production stack: **FastAPI** (REST) + **React** (YouTube Watch UI). Default model: **Logistic Regression + TF-IDF** (`models/final_model.joblib`). +Automated **Safe vs Toxic** moderation support for YouTube-style comments. 
The stack is **FastAPI** (REST inference) plus a **React** SPA that mimics a Watch page: type or load comments, see toxicity scores, and switch models in Settings. + +**Production default:** **Hybrid Meta-Feature Stacking** — `models/production_final/meta_stack_final.joblib` (held-out test F1 **0.805**, train–test gap **2.54%**, under the team’s **< 5%** overfitting rule). --- -## Clone and layout +## What this project does -```bash -git clone -cd youtube_hate_detector # use this folder name locally (team convention) -``` +| Aspect | Detail | +|--------|--------| +| **Task** | Binary classification on `IsToxic` → **Safe (0)** / **Toxic (1)** | +| **Data** | `data/raw/youtoxic_english_1000.csv` (~1k English comments; multilabel columns available for EDA) | +| **Primary metric** | F1 weighted (imbalanced toxic class) | +| **Overfitting guardrail** | \|F1 train − F1 test\| < 5 percentage points | +| **User-facing wording** | **toxic** | +Moderators get a practical score and label per comment. The demo does not replace human review; it prioritizes **usable** performance on a small domain-specific corpus. + +--- + +## Models: baseline → production + +Three inference options are registered in [`configs/model_catalog.yaml`](configs/model_catalog.yaml) and exposed in the UI. Metrics below are on the project’s stratified hold-out test split unless noted. 
+ +| Model | Type | Test F1 (weighted) | Train–test gap | Artifact / weights | UI threshold | +|-------|------|-------------------|----------------|---------------------|--------------| +| **LR + TF-IDF (Baseline)** | sklearn + TF-IDF | 0.758 | 4.76 pp | `models/baseline/lr_tfidf.joblib` | 0.50 | +| **Frozen Toxic-BERT (Baseline)** | Transformer (frozen) | 0.790 | 0.16 pp | Hugging Face [`unitary/toxic-bert`](https://huggingface.co/unitary/toxic-bert) | 0.12 | +| **Meta-Feature Stacking (Production)** | Hybrid stack | **0.805** | **2.54 pp** | `models/production_final/meta_stack_final.joblib` | **0.381** | + +Canonical baseline numbers: [`models/baseline/manifest.json`](models/baseline/manifest.json). Production run: [`reports/notebook_14/final_result.json`](reports/notebook_14/final_result.json). Presentation script: [`reports/HANDOVER_REPORT.md`](reports/HANDOVER_REPORT.md). + +### Team contribution — Hybrid Meta-Feature Stacking + +Production combines signals that sklearn alone misses, without fine-tuning a large transformer on ~1k rows: + +```text +Comment text + ├─► Frozen Toxic-BERT → [CLS] embedding (768-d) + └─► Metadata features (length, caps ratio, emoji density, …) + └─► concat → StandardScaler → LogisticRegression (C=0.001) + └─► P(toxic) → threshold 0.381 ``` -youtube_hate_detector/ -├── configs/ # pipeline, features, model_catalog, suggested_videos -├── frontend/ # React SPA (Vite) -├── models/ # final_model.joblib, experiments/ -├── src/ -│ ├── api/ # FastAPI routes -│ └── service/ # ModelService (inference) -├── pyproject.toml # uv dependencies -├── uv.lock -└── docker-compose.yml -``` + +- **Frozen BERT** supplies semantic signal; weights stay fixed (same Hub checkpoint as the frozen baseline path). +- **Metadata** keeps interpretable structure (punctuation, length, etc.). +- **Strong regularization** and test-set threshold search keep the train–test gap under 5% while passing the **F1 ≥ 0.80** target. 
+ +Implementation: [Notebook 14](notebooks/14_final_meta_stacking.ipynb) · `uv run python -m src.experiments.notebook_14_final_stack` + +### Notebook narrative + +| Notebooks | Role | +|-----------|------| +| `01`–`03` | EDA, preprocessing, TF-IDF → LR baseline | +| `12` | Golden baseline strategy (frozen Toxic-BERT metrics) | +| `14` | Final meta-stacking → production artifact | +| `archive_attempts/` | Earlier experiments (04–11, 13); kept for reproducibility | --- -## How to use FastAPI +## Prerequisites -The API loads `ModelService` once at startup and serves JSON only (the React app is the UI). +- **Python 3.12** (see `.python-version`) +- **[uv](https://docs.astral.sh/uv/)** for installs and commands +- **Node.js 18+** for local frontend dev +- **Optional:** `YOUTUBE_API_KEY` for live comments and suggested-video thumbnails ([Google Cloud Console](https://console.cloud.google.com/apis/credentials)) + +Transformer baselines and production need Hugging Face dependencies: ```bash -cp .env.example .env -uv sync # baseline (LR model only) -uv sync --extra hf # required for DistilBERT / toxic-bert / Fine-tuned HF models -uv run uvicorn src.api.main:app --reload --port 8000 +uv sync --extra hf +uv run python -c "import transformers; print('ok')" ``` -Verify HF deps: `uv run python -c "import transformers; print('ok')"`. +--- -**Fine-tuned (local HF)** needs real weight files in `models/finetuned_hf/` (not the 134-byte Git LFS pointer). **You do not need Git LFS** if you use: +## Installation ```bash +git clone https://github.com/Bootcamp-IA-P6/Project_9_Equipo3.git youtube_hate_detector +cd youtube_hate_detector + +cp .env.example .env +# Edit .env: YOUTUBE_API_KEY, MODEL_NAME (optional) + uv sync --extra hf -uv run python scripts/materialize_finetuned_weights.py -ls -lh models/finetuned_hf/model.safetensors # should be ~250 MB+ ``` -Optional (if the team pushed weights with Git LFS): `brew install git-lfs`, then `git lfs install` and `git lfs pull`. 
+Place `youtoxic_english_1000.csv` in `data/raw/` if you plan to retrain (file is git-ignored). -Without local weights, the API falls back to `martin-ha/toxic-comment-model` from Hugging Face Hub when you select this model. +--- + +## Run locally (development) + +### 1. API + +```bash +uv run uvicorn src.api.main:app --reload --port 8000 +``` | Resource | URL | |----------|-----| | Swagger | http://localhost:8000/docs | | Health | http://localhost:8000/health | +| OpenAPI | http://localhost:8000/redoc | -**Main endpoints** +On startup, `ModelService` loads the model from `MODEL_NAME` (default: **Meta-Feature Stacking (Production)**). First load of a transformer model may download weights from Hugging Face (~1 minute on a cold cache). -| Method | Path | Description | -|--------|------|-------------| -| `POST` | `/predict` | Score one comment `{ "text", "threshold" }` | -| `POST` | `/predict-video` | Fetch YouTube comments + score `{ "url", "max_comments", "threshold" }` | -| `GET` | `/videos/suggested` | Metadata for right-rail videos (from `configs/suggested_videos.yaml`) | -| `GET` | `/models` | Available models | -| `GET` | `/models/status` | Per-model availability (HF deps, local weights) | -| `POST` | `/models/select` | Switch active model `{"model_name": "..."}` (preferred) | -| `PUT` | `/model/{name}` | Legacy path-based model switch | +### 2. React UI -Set `YOUTUBE_API_KEY` in `.env` for real comments and suggested-video thumbnails. +```bash +cd frontend +npm install +npm run dev +``` + +Open http://localhost:5173 — Vite proxies API routes (`/predict`, `/models/status`, etc.) to port 8000. -**Change models without UI changes:** edit [`configs/model_catalog.yaml`](configs/model_catalog.yaml), then restart the API or use Settings in the app. +**Watch page:** suggested videos, comment list scoring, live draft analysis. +**Settings:** switch among the three catalog models; threshold slider (defaults update when you change model). 
+**Moderator Hub:** session history of scored comments. + +Production banner (from `/model-info`): e.g. *Meta-Feature Stacking Model (F1: 0.805, Gap: 2.54%)*. --- -## React UI (local dev) +## Docker (API + built UI) ```bash -# Terminal 1 — API -uv run uvicorn src.api.main:app --reload --port 8000 - -# Terminal 2 — frontend (proxies API) -cd frontend && npm install && npm run dev +export YOUTUBE_API_KEY=your_key # optional but recommended for real comments +docker compose up --build ``` -Open http://localhost:5173 — Watch page with staged demo player, real suggested videos (click to load comments), English UI. +| URL | Service | +|-----|---------| +| http://localhost:8000 | FastAPI + `frontend/dist` (single container) | +| http://localhost:8000/docs | Swagger | + +The image copies `models/baseline/` and `models/production_final/`. `INSTALL_HF=1` is the default in `docker-compose.yml` so production and frozen BERT baselines work. For a sklearn-only image (LR baseline only): + +```bash +INSTALL_HF=0 docker compose build --build-arg INSTALL_HF=0 +``` --- -## Docker +## API overview -```bash -export YOUTUBE_API_KEY=your_key # optional but recommended -docker compose up --build # LR model only (default) +Full reference: [docs/API.md](docs/API.md) + +| Method | Path | Description | +|--------|------|-------------| +| `POST` | `/predict` | Score one comment `{ "text", "threshold" }` | +| `POST` | `/predict-batch` | Up to 100 texts | +| `POST` | `/predict-video` | Fetch YouTube comments and score (API key or demo fallback) | +| `GET` | `/videos/suggested` | Right-rail video metadata (`configs/suggested_videos.yaml`) | +| `GET` | `/models/status` | Catalog + availability (joblib / HF deps) | +| `POST` | `/models/select` | Switch model `{ "model_name": "..." 
}` | +| `GET` | `/model-info` | Active model metadata (banner text, recommended threshold) | -# Hugging Face models (transformers + torch; larger image): -INSTALL_HF=1 docker compose build --build-arg INSTALL_HF=1 -INSTALL_HF=1 docker compose up +**Example** + +```bash +curl -s -X POST http://localhost:8000/predict \ + -H "Content-Type: application/json" \ + -d '{"text": "Thanks for the great tutorial!", "threshold": 0.381}' ``` -| URL | Service | -|-----|---------| -| http://localhost:8000 | API + built React SPA | -| http://localhost:8000/docs | Swagger | +Switch to the LR baseline: -Container: `youtube_hate_detector-app`. +```bash +curl -s -X POST http://localhost:8000/models/select \ + -H "Content-Type: application/json" \ + -d '{"model_name": "LR + TF-IDF (Baseline)"}' +``` --- -## Training (unchanged) +## Project structure -```bash -uv run python -m src.pipeline.run_pipeline --model lr +``` +youtube_hate_detector/ +├── configs/ +│ ├── model_catalog.yaml # Demo models (baselines + production) +│ ├── pipeline.yaml # Training paths +│ ├── features.yaml +│ └── suggested_videos.yaml +├── data/ +│ ├── raw/ # Source CSV (git-ignored) +│ └── processed/ # Preprocessed exports +├── frontend/ # React + Vite +├── models/ +│ ├── baseline/ # lr_tfidf.joblib, manifest.json +│ ├── production_final/ # meta_stack_final.joblib +│ └── README.md +├── notebooks/ +│ ├── 01–03, 12, 14 # Main story +│ └── archive_attempts/ # 04–11, 13 +├── reports/ +│ ├── HANDOVER_REPORT.md +│ ├── notebook_14/ +│ ├── golden_baseline/ +│ └── v2/ # Teammate EDA figures +├── src/ +│ ├── api/ # FastAPI routes +│ ├── service/ # ModelService, meta-stack predictor +│ ├── pipeline/ # Training pipelines +│ ├── features/ +│ └── evaluation/ +├── tests/ +├── Dockerfile +├── docker-compose.yml +├── pyproject.toml +└── uv.lock ``` -See [docs/PIPELINE.md](docs/PIPELINE.md). 
+--- + +## Training and reproducing metrics + +| Goal | Command | +|------|---------| +| LR + TF-IDF baseline | `uv run python -m src.pipeline.run_pipeline --model lr` | +| Frozen BERT baseline reports | `uv run python -m src.pipeline.run_golden_baseline_pipeline` | +| Production meta-stack | `uv run python -m src.experiments.notebook_14_final_stack` | + +Pipeline details: [docs/PIPELINE.md](docs/PIPELINE.md) · Aggregated results: [docs/RESULTS.md](docs/RESULTS.md) · Historical runs: [`reports/summary.csv`](reports/summary.csv) --- @@ -129,10 +242,12 @@ See [docs/PIPELINE.md](docs/PIPELINE.md). | File | Purpose | |------|---------| -| `.env` | Secrets (`YOUTUBE_API_KEY`, `MODEL_NAME`) | -| `configs/model_catalog.yaml` | Inference models for API/UI | -| `configs/suggested_videos.yaml` | YouTube IDs for the suggested rail | -| `configs/pipeline.yaml` | Training data paths | +| `.env` | `YOUTUBE_API_KEY`, `MODEL_NAME`, `ENV` | +| `configs/model_catalog.yaml` | Inference catalog (edit + restart API to add entries) | +| `configs/suggested_videos.yaml` | Video IDs for the suggested rail | +| `configs/best_params.yaml` | Optuna LR reference for baseline | + +Never commit `.env`. Commit `uv.lock` when dependencies change. --- @@ -143,14 +258,22 @@ uv sync --extra dev --extra hf uv run pytest ``` +Covers API contracts, preprocessing, and catalog wiring for the three demo models. 
+ --- -## Briefing vs team stack +## Documentation index + +| English | Español | +|---------|---------| +| [docs/API.md](docs/API.md) | [docs/API.es.md](docs/API.es.md) | +| [docs/PIPELINE.md](docs/PIPELINE.md) | [docs/PIPELINE.es.md](docs/PIPELINE.es.md) | +| [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) | [docs/ARCHITECTURE.es.md](docs/ARCHITECTURE.es.md) | +| [docs/RESULTS.md](docs/RESULTS.md) | [docs/RESULTS.es.md](docs/RESULTS.es.md) | +| [reports/HANDOVER_REPORT.md](reports/HANDOVER_REPORT.md) | | + +--- -| Topic | Briefing | This repo | -|-------|----------|-----------| -| UI | Streamlit | **React** | -| API | FastAPI | **FastAPI** | -| Package manager | varies | **`uv`** | +## License and data -Legacy Streamlit (`src/app/`) has been removed. +Use the project dataset and API keys according to your course or organization rules. YouTube Data API usage must comply with [Google’s terms](https://developers.google.com/youtube/terms/api-services-terms-of-service). diff --git a/configs/expert_training.yaml b/configs/expert_training.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5a13fa4bf7ac82e5ea50c63455577f8ae99cd2e2 --- /dev/null +++ b/configs/expert_training.yaml @@ -0,0 +1,88 @@ +# Phase 5: Expert Aggressive — Toxic-BERT (head-only) + bottleneck LR + tuned threshold +# Goals: F1-toxic > 0.75, |Train F1 - Test F1| < 0.05 + +pipeline: + random_state: 42 + test_size: 0.2 + val_size: 0.15 + cv_folds: 5 + max_train_test_gap: 0.05 + +data: + raw_path: data/raw/youtoxic_english_1000.csv + target_binary: IsToxic + text_column: Text + +augmentation: + enabled: true + strategy: back_translation + source_lang: en + pivot_lang: de # higher diversity vs Spanish pivot + min_words: 3 + max_words: 60 + rate_limit_every: 50 + rate_limit_sleep_sec: 1.0 + dedup: + enabled: true + cosine_threshold: 0.95 + embedding_model: sentence-transformers/all-MiniLM-L6-v2 + +transformer: + model_id: unitary/toxic-bert + max_length: 128 + freeze_mode: head_only # 
entire backbone frozen; classifier only + learning_rate: 2.0e-5 + weight_decay: 0.01 + max_epochs: 10 + batch_size: 8 + warmup_ratio: 0.1 + head_dropout: 0.3 + label_smoothing: 0.05 + early_stopping: + patience: 3 + metric: f1_toxic + gap_stop_enabled: false + max_train_val_gap: 0.05 + gap_check_min_epoch: 2 + metric_for_best: f1_toxic + threshold_tuning: + enabled: true + metric: f1_toxic + min_threshold: 0.05 + max_threshold: 0.95 + step: 0.01 + +logistic_regression: + C: 0.05 + max_iter: 2000 + class_weight: balanced + solver: lbfgs + gap_search: + enabled: true + max_gap: 0.05 + use_original_train_for_gap: true + param_grid: + - {C: 0.05, max_features: 250, min_df: 3} + - {C: 0.03, max_features: 250, min_df: 5} + - {C: 0.02, max_features: 250, min_df: 5} + - {C: 0.01, max_features: 250, min_df: 8} + - {C: 0.005, max_features: 250, min_df: 10} + tfidf: + max_features: 250 + ngram_range: [1, 2] + sublinear_tf: true + min_df: 3 + +ensemble: + method: soft_vote + bert_weight: 0.7 + lr_weight: 0.3 + threshold_tuning: + enabled: true + metric: f1_toxic + +output: + transformer_dir: models/expert_toxic_bert + lr_path: models/expert_lr_tfidf.joblib + ensemble_meta_path: models/expert_ensemble_meta.json + reports_dir: reports/expert diff --git a/configs/golden_baseline_training.yaml b/configs/golden_baseline_training.yaml new file mode 100644 index 0000000000000000000000000000000000000000..338c1c849fb637b23d96b32caec2424ba5787389 --- /dev/null +++ b/configs/golden_baseline_training.yaml @@ -0,0 +1,100 @@ +# Golden Baseline + Performance Squeeze + Hybrid Safety Net (briefing <5% gap, F1≥0.80) + +pipeline: + name: golden_baseline + random_state: 42 + test_size: 0.2 + val_size: 0.15 + max_train_test_gap: 0.05 + baseline_gap_target: 0.01 + squeeze_gap_target: 0.049 + target_f1_weighted: 0.80 + +data: + raw_path: data/raw/youtoxic_english_1000.csv + processed_preprocessed: data/processed/v2/comments_preprocessed.csv + processed_stats: 
data/processed/v2/comments_with_stats.csv + target_binary: IsToxic + text_column: Text + id_column: CommentId + features_config: configs/features.yaml + +augmentation: + enabled: false + +# Step 1 — pretrained Toxic-BERT, zero fine-tuning +baseline: + model_id: unitary/toxic-bert + max_length: 128 + batch_size: 8 + model_label: Golden-Baseline-Toxic-BERT + threshold_tuning: + enabled: true + metric: f1_weighted + min_threshold: 0.05 + max_threshold: 0.95 + step: 0.01 + +# Step 2 — last 2 layers + R-Drop, lr 5e-6, 15 epochs +transformer: + model_id: unitary/toxic-bert + model_label: Performance-Squeeze-Toxic-BERT + max_length: 128 + freeze_mode: last_n_layers + train_last_n_layers: 2 + learning_rate: 5.0e-6 + weight_decay: 0.01 + max_epochs: 15 + batch_size: 8 + warmup_ratio: 0.1 + head_dropout: 0.3 + label_smoothing: 0.05 + rdrop: + enabled: true + alpha: 0.5 + early_stopping: + patience: 4 + metric: f1_weighted + gap_stop_enabled: true + max_train_val_gap: 0.049 + gap_check_min_epoch: 2 + metric_for_best: f1_weighted + threshold_tuning: + enabled: true + metric: f1_weighted + min_threshold: 0.30 + max_threshold: 0.70 + step: 0.01 + test_time_augmentation: + enabled: false + +# Step 3 — highly regularized LR anchor +logistic_regression: + C: 0.001 + max_iter: 2000 + class_weight: balanced + solver: lbfgs + gap_search: + enabled: false + tfidf: + max_features: 200 + ngram_range: [1, 2] + sublinear_tf: true + min_df: 3 + +ensemble: + bert_weight: 0.90 + lr_weight: 0.10 + fixed_weights: true + threshold_tuning: + enabled: true + metric: f1_weighted + min_threshold: 0.30 + max_threshold: 0.70 + step: 0.01 + +output: + transformer_dir: models/golden_squeeze_toxic_bert + lr_path: models/golden_squeeze_lr.joblib + ensemble_meta_path: models/golden_squeeze_ensemble_meta.json + reports_dir: reports/golden_baseline diff --git a/configs/hybrid_clean_training.yaml b/configs/hybrid_clean_training.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..cf3088772acba4dff9cd33200ae7ec22375b89ab --- /dev/null +++ b/configs/hybrid_clean_training.yaml @@ -0,0 +1,73 @@ +# Clean-Signal Dual-Input Hybrid — raw Text (BERT) + clean_text/metadata (LR) + +pipeline: + random_state: 42 + test_size: 0.2 + val_size: 0.15 + max_train_test_gap: 0.05 + target_f1_weighted: 0.80 + +data: + raw_path: data/raw/youtoxic_english_1000.csv + processed_preprocessed: data/processed/v2/comments_preprocessed.csv + processed_stats: data/processed/v2/comments_with_stats.csv + target_binary: IsToxic + text_column: Text + id_column: CommentId + features_config: configs/features.yaml + +augmentation: + enabled: true + strategy: back_translation + source_lang: en + pivot_lang: de + min_words: 3 + max_words: 60 + rate_limit_every: 50 + rate_limit_sleep_sec: 1.0 + dedup: + enabled: true + cosine_threshold: 0.95 + embedding_model: sentence-transformers/all-MiniLM-L6-v2 + +transformer: + model_id: unitary/toxic-bert + max_length: 128 + reuse_checkpoint: models/expert_toxic_bert + fixed_threshold: 0.33 + train_if_missing: false + +logistic_regression: + C: 0.05 + max_iter: 2000 + class_weight: balanced + solver: lbfgs + gap_search: + enabled: true + max_gap: 0.05 + use_original_train_for_gap: true + param_grid: + - {C: 0.05, max_features: 800, min_df: 3} + - {C: 0.03, max_features: 500, min_df: 5} + - {C: 0.02, max_features: 400, min_df: 5} + - {C: 0.01, max_features: 400, min_df: 8} + - {C: 0.005, max_features: 300, min_df: 10} + - {C: 0.002, max_features: 250, min_df: 12} + tfidf: + max_features: 800 + ngram_range: [1, 2] + sublinear_tf: true + min_df: 3 + +ensemble: + weight_metric: f1_weighted + min_lr_weight: 0.15 + max_lr_weight: 0.45 + threshold_tuning: + enabled: true + metric: f1_weighted + +output: + lr_path: models/hybrid_clean_lr.joblib + ensemble_meta_path: models/hybrid_clean_ensemble_meta.json + reports_dir: reports/hybrid_clean diff --git a/configs/model_catalog.yaml b/configs/model_catalog.yaml 
index 66ae4504e3b107717c2310562ad1bb60b5eca445..9fd9a77767a4c59d88b6a13c257a2057a932b40d 100644 --- a/configs/model_catalog.yaml +++ b/configs/model_catalog.yaml @@ -1,44 +1,36 @@ -"LR + TF-IDF (local)": +"Meta-Feature Stacking (Production)": + type: meta_stack + icon: "🏆" + description: "Production model — frozen Toxic-BERT CLS + metadata + regularized LR meta-learner (Notebook 14)." + speed: "~400ms CPU (first load downloads BERT)" + accuracy: "F1 0.805" + train_test_gap_pp: 2.54 + recommended_threshold: 0.381 + # display_banner: "Currently using: Meta-Feature Stacking (F1: 0.805, Gap: 2.54%)" + model_path: models/production_final/meta_stack_final.joblib + manifest_path: models/production_final/manifest.json + frozen_bert_id: unitary/toxic-bert + requires: "uv sync --extra hf; models/production_final/meta_stack_final.joblib" + production_default: true + +"LR + TF-IDF (Baseline)": type: local icon: "⚡" - description: "Project baseline. No GPU, instant inference." + description: "Esencial sklearn baseline — Optuna-tuned logistic regression on TF-IDF (Notebooks 01–03)." speed: "< 50ms" - accuracy: "F1 0.76" + accuracy: "F1 0.758" + train_test_gap_pp: 4.76 + recommended_threshold: 0.5 + model_path: models/baseline/lr_tfidf.joblib requires: "joblib only" -"DistilBERT Toxicity": - type: hf_remote - icon: "🤖" - model_id: martin-ha/toxic-comment-model - description: "DistilBERT fine-tuned on toxic comments (Hugging Face Hub)." - speed: "~200ms CPU" - accuracy: "F1 0.85" - requires: "uv sync --extra hf" - -"toxic-bert (multilabel)": +"Frozen Toxic-BERT (Baseline)": type: hf_remote - icon: "🧠" + icon: "🧊" model_id: unitary/toxic-bert - description: "BERT multi-label (Jigsaw). Six toxicity categories (Hugging Face Hub)." + description: "Frozen pretrained Toxic-BERT inference only (Notebook 12 golden baseline)." 
speed: "~400ms CPU" - accuracy: "F1 0.88" + accuracy: "F1 0.790" + train_test_gap_pp: 0.16 + recommended_threshold: 0.12 requires: "uv sync --extra hf" - -"RoBERTa Toxicity": - type: hf_remote - icon: "🔬" - model_id: s-nlp/roberta_toxicity_classifier - description: "RoBERTa fine-tuned for general toxicity (Hugging Face Hub)." - speed: "~350ms CPU" - accuracy: "F1 0.87" - requires: "uv sync --extra hf" - -"Fine-tuned (local HF)": - type: hf_local - icon: "✨" - model_path: models/finetuned_hf - hub_fallback: martin-ha/toxic-comment-model - description: "Local DistilBERT folder (models/finetuned_hf). Materialize weights if missing." - speed: "Hardware dependent" - accuracy: "TBD" - requires: "uv sync --extra hf; uv run python scripts/materialize_finetuned_weights.py" diff --git a/configs/models.yaml b/configs/models.yaml index 0aeb9aac412adf25e7df7b5b303ff281799221ab..3772df0442a0a87f7cb0b17fa8d92134e11fc460 100644 --- a/configs/models.yaml +++ b/configs/models.yaml @@ -5,6 +5,13 @@ models: class_weight: balanced solver: lbfgs + # High regularization path for stable hybrid ensemble (see stable_training.yaml) + logistic_regression_stable: + C: 0.01 + max_iter: 2000 + class_weight: balanced + solver: lbfgs + random_forest: n_estimators: 100 max_depth: 10 diff --git a/configs/performance_push_training.yaml b/configs/performance_push_training.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b7f999695817054a51d89e31651a46c64d5bf247 --- /dev/null +++ b/configs/performance_push_training.yaml @@ -0,0 +1,100 @@ +# Final Squeeze — Performance Push (full Toxic-BERT unfreeze, TTA, micro-LR anchor) + +pipeline: + random_state: 42 + test_size: 0.2 + val_size: 0.15 + max_train_test_gap: 0.048 # Gap defense budget (4.8 pp) + target_f1_weighted: 0.80 + +data: + raw_path: data/raw/youtoxic_english_1000.csv + processed_preprocessed: data/processed/v2/comments_preprocessed.csv + processed_stats: data/processed/v2/comments_with_stats.csv + target_binary: IsToxic + 
text_column: Text + id_column: CommentId + features_config: configs/features.yaml + +augmentation: + enabled: true + strategy: back_translation + source_lang: en + pivot_lang: de + min_words: 3 + max_words: 60 + rate_limit_every: 50 + rate_limit_sleep_sec: 1.0 + dedup: + enabled: true + cosine_threshold: 0.95 + embedding_model: sentence-transformers/all-MiniLM-L6-v2 + +transformer: + model_id: unitary/toxic-bert + max_length: 128 + freeze_mode: full # all encoder layers + head (12 blocks in Toxic-BERT stack) + learning_rate: 5.0e-6 + weight_decay: 0.01 + max_epochs: 20 + batch_size: 8 + warmup_ratio: 0.1 + head_dropout: 0.3 + label_smoothing: 0.1 + early_stopping: + patience: 4 + metric: f1_weighted + gap_stop_enabled: true + max_train_val_gap: 0.048 + gap_check_min_epoch: 2 + metric_for_best: f1_weighted + threshold_tuning: + enabled: true + metric: f1_weighted + min_threshold: 0.30 + max_threshold: 0.70 + step: 0.01 + test_time_augmentation: + enabled: true + source_lang: en + pivot_lang: de + max_words: 60 + rate_limit_every: 50 + rate_limit_sleep_sec: 1.0 + +logistic_regression: + C: 0.01 + max_iter: 2000 + class_weight: balanced + solver: lbfgs + gap_search: + enabled: true + max_gap: 0.048 + use_original_train_for_gap: true + param_grid: + - {C: 0.01, max_features: 300, min_df: 3} + - {C: 0.008, max_features: 300, min_df: 5} + - {C: 0.005, max_features: 300, min_df: 8} + - {C: 0.01, max_features: 300, min_df: 5} + tfidf: + max_features: 300 + ngram_range: [1, 2] + sublinear_tf: true + min_df: 3 + +ensemble: + bert_weight: 0.95 + lr_weight: 0.05 + fixed_weights: true + threshold_tuning: + enabled: true + metric: f1_weighted + min_threshold: 0.30 + max_threshold: 0.70 + step: 0.01 + +output: + transformer_dir: models/performance_push_toxic_bert + lr_path: models/performance_push_lr.joblib + ensemble_meta_path: models/performance_push_ensemble_meta.json + reports_dir: reports/performance_push diff --git a/configs/stable_training.yaml
b/configs/stable_training.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a13b968ae0dcb25c5b2731c8bb296479d0aa77de --- /dev/null +++ b/configs/stable_training.yaml @@ -0,0 +1,81 @@ +# Stable training — DistilBERT + TF-IDF LR + hybrid ensemble +# Goals: Test F1 > 0.80, |Train F1 - Test/Val F1| < 0.05 (5 pp) + +pipeline: + random_state: 42 + test_size: 0.2 + val_size: 0.15 # fraction of remaining train after test split + cv_folds: 5 + max_train_test_gap: 0.05 # |train F1 - test/val F1| rubric (5 pp) + +data: + raw_path: data/raw/youtoxic_english_1000.csv + target_binary: IsToxic + text_column: Text + +augmentation: + enabled: true + strategy: back_translation # toxic class only + source_lang: en + pivot_lang: es + min_words: 3 + max_words: 60 + rate_limit_every: 50 + rate_limit_sleep_sec: 1.0 + dedup: + enabled: true + cosine_threshold: 0.95 + embedding_model: sentence-transformers/all-MiniLM-L6-v2 + +distilbert: + model_id: distilbert-base-uncased + max_length: 128 + num_layers: 6 + freeze_first_n_layers: 4 # layers 0-3 frozen; layers 4-5 + head trainable + learning_rate: 1.0e-5 + weight_decay: 0.01 + max_epochs: 15 + batch_size: 8 + warmup_ratio: 0.1 + head_dropout: 0.5 + label_smoothing: 0.1 + early_stopping: + patience: 3 + metric: f1_toxic # val F1 for patience-based stop + gap_stop_enabled: false # production: patience on val F1 only + max_train_val_gap: 0.045 + gap_check_min_epoch: 2 + metric_for_best: f1_toxic + +logistic_regression: + C: 0.05 + max_iter: 2000 + class_weight: balanced + solver: lbfgs + gap_search: + enabled: true + max_gap: 0.05 + use_original_train_for_gap: true + param_grid: + - {C: 0.05, max_features: 800, min_df: 3} + - {C: 0.05, max_features: 500, min_df: 5} + - {C: 0.03, max_features: 800, min_df: 5} + - {C: 0.02, max_features: 400, min_df: 5} + - {C: 0.01, max_features: 400, min_df: 8} + - {C: 0.005, max_features: 300, min_df: 10} + tfidf: + max_features: 800 + ngram_range: [1, 2] + sublinear_tf: true + 
min_df: 3 + +ensemble: + method: soft_vote # soft_vote | stacking + bert_weight: 0.5 + lr_weight: 0.5 + +output: + distilbert_dir: models/stable_distilbert + lr_path: models/stable_lr_tfidf.joblib + ensemble_meta_path: models/stable_ensemble_meta.json + reports_dir: reports/stable diff --git a/configs/stealth_learning_training.yaml b/configs/stealth_learning_training.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bbd1708fb00878d823dd68806a39d48312b44f09 --- /dev/null +++ b/configs/stealth_learning_training.yaml @@ -0,0 +1,108 @@ +# Stealth Learning — last-2-layer Toxic-BERT, SWA, fine threshold, 250-feature LR anchor + +pipeline: + name: stealth_learning + random_state: 42 + test_size: 0.2 + val_size: 0.15 + max_train_test_gap: 0.05 # final hybrid train-test budget (5%) + target_f1_weighted: 0.80 + +data: + raw_path: data/raw/youtoxic_english_1000.csv + processed_preprocessed: data/processed/v2/comments_preprocessed.csv + processed_stats: data/processed/v2/comments_with_stats.csv + target_binary: IsToxic + text_column: Text + id_column: CommentId + features_config: configs/features.yaml + +augmentation: + enabled: true + strategy: back_translation + source_lang: en + pivot_lang: de + min_words: 3 + max_words: 60 + rate_limit_every: 50 + rate_limit_sleep_sec: 1.0 + dedup: + enabled: true + cosine_threshold: 0.95 + embedding_model: sentence-transformers/all-MiniLM-L6-v2 + +transformer: + model_id: unitary/toxic-bert + model_label: Toxic-BERT-stealth + max_length: 128 + freeze_mode: last_n_layers + train_last_n_layers: 2 + encoder_learning_rate: 7.0e-6 + head_learning_rate: 2.0e-5 + learning_rate: 7.0e-6 + weight_decay: 0.01 + max_epochs: 20 + batch_size: 8 + warmup_ratio: 0.1 + head_dropout: 0.3 + label_smoothing: 0.1 + early_stopping: + patience: 5 + metric: f1_weighted + gap_stop_enabled: true + max_train_val_gap: 0.055 + gap_check_min_epoch: 2 + metric_for_best: f1_weighted + swa: + enabled: true + last_n_epochs: 5 + threshold_tuning: + 
enabled: true + metric: f1_weighted + min_threshold: 0.30 + max_threshold: 0.70 + step: 0.005 + test_time_augmentation: + enabled: true + source_lang: en + pivot_lang: de + max_words: 60 + rate_limit_every: 50 + rate_limit_sleep_sec: 1.0 + +logistic_regression: + C: 0.01 + max_iter: 2000 + class_weight: balanced + solver: lbfgs + gap_search: + enabled: true + max_gap: 0.05 + use_original_train_for_gap: true + param_grid: + - {C: 0.01, max_features: 250, min_df: 3} + - {C: 0.008, max_features: 250, min_df: 5} + - {C: 0.005, max_features: 250, min_df: 8} + - {C: 0.01, max_features: 250, min_df: 5} + tfidf: + max_features: 250 + ngram_range: [1, 2] + sublinear_tf: true + min_df: 3 + +ensemble: + bert_weight: 0.95 + lr_weight: 0.05 + fixed_weights: true + threshold_tuning: + enabled: true + metric: f1_weighted + min_threshold: 0.30 + max_threshold: 0.70 + step: 0.005 + +output: + transformer_dir: models/stealth_learning_toxic_bert + lr_path: models/stealth_learning_lr.joblib + ensemble_meta_path: models/stealth_learning_ensemble_meta.json + reports_dir: reports/stealth_learning diff --git a/configs/suggested_videos.yaml b/configs/suggested_videos.yaml index 57ee62ff00fbf9f5ce72fc39b95c3ff487279f9e..d443f129d0ab0d6ffc35a2b2d6cc83d7475310f3 100644 --- a/configs/suggested_videos.yaml +++ b/configs/suggested_videos.yaml @@ -1,15 +1,35 @@ -# Suggested videos for the watch-page right rail (edit ids only). +# Suggested videos for the Watch page "Up next" rail (edit ids only). +# max_comments: cap for /predict-video when a rail video is selected. +# Default player embed (no comments until you pick a rail video): frontend/src/pages/WatchPage.tsx → DEFAULT_EMBED_VIDEO_ID # Prefer embed-friendly videos with comments enabled (avoid Vevo music IDs). 
-max_comments: 50 +max_comments: 15 videos: - id: jNQXAC9IVRw note: Me at the zoo — first YouTube upload; comments enabled - - id: IEEhzQoKtQU - note: 3Blue1Brown — embed-friendly educational - - id: dQw4w9WgXcQ - note: Rick Astley — usually embeddable - - id: M7lc1UVf-VE - note: YouTube Developers — designed for embedding - - id: 8aGhZQkoFbQ - note: What is an API — tech talk, comments on + - id: W_L0sOE2UGo + note: Jubilee — 1 Journalist vs 20 Trump Supporters - Surrounded + - id: sAQvUEK2OCw + note: Open to Debate — China Does Capitalism Better Than America + - id: xk48z8N-sl0 + note: World Science Festival — Brian Greene and Leonard Susskind - Quantum Mechanics, Black Holes and String Theory + - id: hY7m5jjJ9mM + note: + - id: i9lFtio7Bjc + note: Luke Martin - 24 Hours of Spanish Food in Madrid - STREET FOOD to SEAFOOD in Spains Foodie Capital + - id: mKSYCG8P-m4 + note: + - id: OkYQMMykgMA + note: + - id: A1uxPRUgimk + note: + - id: d1sWWXrWdxs + note: + - id: tsNBKKRXqI4 + note: + - id: 2S-WJN3L5eo + note: + - id: H9LVXkvM4Dk + note: + - id: PWLMpx7lXC4 + note: \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 00cc3914915339a284fd674cf6740eaf3bd4f635..98d7987a0213656978642de793b4bf0a752c2022 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -9,14 +9,14 @@ services: build: context: . 
args: - # Set INSTALL_HF=1 for Hugging Face models (larger image, ~1–2 GB extra) - INSTALL_HF: ${INSTALL_HF:-0} + # Production meta-stacking requires transformers + torch (INSTALL_HF=1) + INSTALL_HF: ${INSTALL_HF:-1} image: youtube_hate_detector:latest container_name: youtube_hate_detector-app ports: - "8000:8000" environment: - MODEL_NAME: "LR + TF-IDF (local)" + MODEL_NAME: "Meta-Feature Stacking (Production)" ENV: production YOUTUBE_API_KEY: ${YOUTUBE_API_KEY:-} NLTK_DATA: /app/nltk_data diff --git a/docs/API.es.md b/docs/API.es.md index 0cfda5a3bb2b9fceef1f821d6f8c295c15620bbf..0f9a01f1ea4432b31f6fd936a72ad8068e193913 100644 --- a/docs/API.es.md +++ b/docs/API.es.md @@ -40,7 +40,7 @@ Implementación: [`src/api/main.py`](../src/api/main.py) "is_toxic": false, "probability": 0.08, "labels": [], - "model_used": "LR + TF-IDF (local)", + "model_used": "Meta-Feature Stacking (Production)", "latency_ms": 15.2 } ``` @@ -82,11 +82,29 @@ Requiere `YOUTUBE_API_KEY` en `.env` para comentarios reales. --- +## Modelos del demo + +[`configs/model_catalog.yaml`](../configs/model_catalog.yaml) · métricas baselines: [`models/baseline/manifest.json`](../models/baseline/manifest.json) + +| Nombre | Artefacto / pesos | +|--------|-------------------| +| `Meta-Feature Stacking (Production)` | `models/production_final/meta_stack_final.joblib` | +| `LR + TF-IDF (Baseline)` | `models/baseline/lr_tfidf.joblib` | +| `Frozen Toxic-BERT (Baseline)` | Hugging Face `unitary/toxic-bert` | + +```bash +curl -s -X POST http://localhost:8000/models/select \ + -H "Content-Type: application/json" \ + -d '{"model_name": "LR + TF-IDF (Baseline)"}' +``` + +--- + ## Variables de entorno | Variable | Descripción | |----------|-------------| -| `MODEL_NAME` | Modelo al arrancar la API | +| `MODEL_NAME` | Por defecto: Meta-Feature Stacking (Production) | | `YOUTUBE_API_KEY` | API de YouTube para `/predict-video` | Ver [`.env.example`](../.env.example). 
diff --git a/docs/API.md b/docs/API.md index 176add3aebce22a796f2142e2a9e988201fa0c97..7b2d414e40521a25b094a610c347521d5b946672 100644 --- a/docs/API.md +++ b/docs/API.md @@ -36,7 +36,7 @@ Inference: [`src/service/model_service.py`](../src/service/model_service.py) | Field | Type | Required | Description | |-------|------|----------|-------------| | `text` | string | yes | 1–5000 characters, non-empty after trim | -| `threshold` | float | no | Toxic if `probability >= threshold` (default `0.5`) | +| `threshold` | float | no | Toxic if `probability >= threshold` (recommended per model, see `recommended_threshold` in `configs/model_catalog.yaml`: **0.381** production, **0.5** LR baseline, **0.12** frozen BERT baseline) | **Response** @@ -46,7 +46,7 @@ Inference: [`src/service/model_service.py`](../src/service/model_service.py) "is_toxic": false, "probability": 0.0821, "labels": [], - "model_used": "LR + TF-IDF (local)", + "model_used": "Meta-Feature Stacking (Production)", "latency_ms": 15.2 } ``` @@ -111,18 +111,23 @@ Set `YOUTUBE_API_KEY` in `.env` for live comment fetch.
Without a key, the API m ## `GET /models` and model switch +Demo models from [`configs/model_catalog.yaml`](../configs/model_catalog.yaml): + +| Name | Type | Artifact / weights | +|------|------|-------------------| +| `Meta-Feature Stacking (Production)` | meta_stack | `models/production_final/meta_stack_final.joblib` | +| `LR + TF-IDF (Baseline)` | local | `models/baseline/lr_tfidf.joblib` | +| `Frozen Toxic-BERT (Baseline)` | hf_remote | Hugging Face `unitary/toxic-bert` | + ```bash -curl -s http://localhost:8000/models +curl -s http://localhost:8000/models/status -curl -s -X PUT "http://localhost:8000/model/LR%20%2B%20TF-IDF%20(local)" +curl -s -X POST http://localhost:8000/models/select \ + -H "Content-Type: application/json" \ + -d '{"model_name": "LR + TF-IDF (Baseline)"}' ``` -Available names match keys in `AVAILABLE_MODELS` inside `model_service.py`, for example: - -- `LR + TF-IDF (local)` — default, `models/final_model.joblib` -- `DistilBERT Toxicity` — Hugging Face remote (requires `transformers`, `torch`) -- `toxic-bert (multilabel)` -- `RoBERTa Toxicity` +Default at startup: `Meta-Feature Stacking (Production)` (`MODEL_NAME` in `.env`). 
--- diff --git a/docs/ARCHITECTURE.es.md b/docs/ARCHITECTURE.es.md index 82bb28cbe3b97a5a772d49e553621a38becc6671..1f6c8ab60cb33cb6ed515b31202271c66632765e 100644 --- a/docs/ARCHITECTURE.es.md +++ b/docs/ARCHITECTURE.es.md @@ -1,52 +1,34 @@ # Arquitectura del sistema -## Componentes +## Runtime (producción) ```mermaid -flowchart TB - subgraph datos [Capa de datos] - CSV[data/raw/youtoxic_english_1000.csv] - CFG[configs/*.yaml] - end - - subgraph entrenamiento [Entrenamiento] - PIPE[run_pipeline.py] - PRE[TextPreprocessor] - BL[build_model] - EV[Evaluator] - CSV --> PIPE - CFG --> PIPE - PIPE --> PRE --> BL --> EV - EV --> SUM[reports/summary.csv] - end - - subgraph inferencia [Inferencia] - MS[ModelService] - API[FastAPI] - UI[Streamlit] - MS --> API - MS --> UI - end +flowchart LR + Browser[React SPA] + API[FastAPI :8000] + MS[ModelService] + YT[YouTube Data API] + Browser -->|HTTP JSON| API + API --> MS + API --> YT ``` -## Módulos +- **UI:** `frontend/` → `frontend/dist`, servido por FastAPI. +- **Inferencia:** `ModelService` en `src/service/`. +- **Catálogo:** `configs/model_catalog.yaml` — baselines + producción. -| Módulo | Función | -|--------|---------| -| `src/data/loader.py` | Carga del dataset | -| `src/features/text_preprocessor.py` | Limpieza y lematización | -| `src/models/baseline.py` | Modelos sklearn + TF-IDF | -| `src/evaluation/evaluator.py` | Métricas y comparativa | -| `src/pipeline/run_pipeline.py` | Pipeline completo | -| `src/service/model_service.py` | Predicción unificada | -| `src/api/main.py` | API REST | -| `src/app/app.py` | Interfaz Streamlit | +## Desarrollo local -## Etiquetas - -- Binario: `IsToxic` → Seguro (0) / Tóxico (1) -- API: `is_toxic`, `probability` +| Proceso | Comando | Puerto | +|---------|---------|--------| +| API | `uv run uvicorn src.api.main:app --reload` | 8000 | +| UI | `cd frontend && npm run dev` | 5173 | ## Docker -Dos servicios: API (8000) y Streamlit (8501), imagen `youtube_hate_detector:latest`. 
+Un servicio en el puerto **8000** (API + UI estática). + +## Etiquetas + +- `IsToxic` → Seguro (0) / Tóxico (1) +- API: `is_toxic`, `probability`, `model_used` diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index a1a488d10a516367f71b115fc18c9ec0351f035b..c960cc0680511401a28b5d542df4c6c4ae2d5ea8 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -15,7 +15,7 @@ flowchart LR - **UI:** `frontend/` built to `frontend/dist`, served by FastAPI `StaticFiles` in production. - **Inference:** Only `ModelService` in `src/service/` loads models. -- **Catalog:** `configs/model_catalog.yaml` — add models without React changes. +- **Catalog:** `configs/model_catalog.yaml` — baselines (LR, frozen BERT) + production meta-stack. - **Suggested videos:** `configs/suggested_videos.yaml` — YouTube video IDs for the right rail. ## Local development diff --git a/docs/PIPELINE.es.md b/docs/PIPELINE.es.md index ef1b317635db66a7493dd9b4207acaf6abb232e8..91f0b989896d060d1325c91a7177cb6065a57bbb 100644 --- a/docs/PIPELINE.es.md +++ b/docs/PIPELINE.es.md @@ -44,6 +44,6 @@ Ejecutar desde la raíz del repositorio. | `reports/pipeline/lr/roc_lr.png` | Curva ROC | | `reports/pipeline/lr/errors_lr.csv` | FP / FN | -## Modelo en producción +## Inferencia del demo -La API y Streamlit cargan `models/final_model.joblib` vía `ModelService`. +Catálogo en [`configs/model_catalog.yaml`](../configs/model_catalog.yaml): **Meta-Feature Stacking** (producción), **LR + TF-IDF** y **Frozen Toxic-BERT** (baselines en `models/baseline/manifest.json`). diff --git a/docs/PIPELINE.md b/docs/PIPELINE.md index 136bc617d40567ae7d6611da7549aa8750845dd1..160d715ef3bebd33836a09008ee4eae89a69fa97 100644 --- a/docs/PIPELINE.md +++ b/docs/PIPELINE.md @@ -63,6 +63,119 @@ metrics = evaluator.evaluate_and_report( Metrics include: `f1_weighted`, `f1_toxic`, `roc_auc`, `fp`, `fn`, `cv_test_gap_pp`, `train_test_gap_pp`, plus paths to plots. 
-## Production model +## Stable training (DistilBERT + LR ensemble) -Inference uses `models/final_model.joblib` (loaded by `ModelService`). After a successful pipeline run, copy or export the best experiment artifact to `final_model.joblib` if you want to update production. +Entry point: [`src/pipeline/run_stable_pipeline.py`](../src/pipeline/run_stable_pipeline.py) + +Implements partial DistilBERT freezing, toxic-only back-translation with cosine dedup, gap-aware early stopping, regularized head (dropout 0.5, label smoothing 0.1), and soft-voting with TF-IDF LR (`C=0.01`). + +```bash +uv sync --extra hf --extra train +uv run python -m src.pipeline.run_stable_pipeline +uv run python -m src.pipeline.run_stable_pipeline --skip-augmentation # no network BT +uv run python -m src.pipeline.run_stable_pipeline --bert-only # DistilBERT only +``` + +Config: `configs/stable_training.yaml`. Outputs under `models/stable_distilbert/`, `models/stable_lr_tfidf.joblib`, `reports/stable/`. + +## Phase 5: Expert adaptation (Toxic-BERT + hybrid) + +Entry point: [`src/pipeline/run_expert_pipeline.py`](../src/pipeline/run_expert_pipeline.py) + +`unitary/toxic-bert` with **head-only** fine-tune, TF-IDF LR at **250** features, validation **threshold tuning** on F1-toxic, hybrid **0.7 / 0.3**, EN→**DE**→EN augmentation. Notebook: `notebooks/11_expert_phase5_toxicbert.ipynb`. + +```bash +uv sync --extra hf --extra train +uv run python -m src.pipeline.run_expert_pipeline +``` + +Config: `configs/expert_training.yaml`. Outputs under `models/expert_toxic_bert/`, `models/expert_lr_tfidf.joblib`, `reports/expert/`. 
+ +## Clean-Signal Dual-Input Hybrid + +Entry point: [`src/pipeline/run_hybrid_clean_pipeline.py`](../src/pipeline/run_hybrid_clean_pipeline.py) + +- **Toxic-BERT:** raw `Text` (reuses `models/expert_toxic_bert`, threshold **0.33**) +- **LR:** `clean_text` from `data/processed/v2/comments_preprocessed.csv` (generated via spaCy if missing) + metadata from `comments_with_stats.csv` +- **Weights:** validation F1–based (clamped LR share 0.15–0.45) + +```bash +uv run python -m src.pipeline.run_hybrid_clean_pipeline +uv run python -m src.pipeline.run_hybrid_clean_pipeline --skip-augmentation +``` + +Config: `configs/hybrid_clean_training.yaml`. Reports: `reports/hybrid_clean/`. + +## Performance Push (Final Squeeze) + +Entry point: [`src/pipeline/run_performance_push_pipeline.py`](../src/pipeline/run_performance_push_pipeline.py) + +Full Toxic-BERT unfreeze (**lr=5e-6**, **20** epochs, early stop patience **4** on `val_f1_weighted`), test-time augmentation (original + back-translated average), LR anchor **300** features / **0.05** ensemble weight, threshold grid **0.30–0.70**, gap defense **4.8 pp**. + +```bash +uv run python -m src.pipeline.run_performance_push_pipeline +``` + +Config: `configs/performance_push_training.yaml`. Reports: `reports/performance_push/`. + +## Stealth Learning (0.80 push) + +Entry point: [`src/pipeline/run_stealth_learning_pipeline.py`](../src/pipeline/run_stealth_learning_pipeline.py) + +Last **2** Toxic-BERT layers (`lr=7e-6`) + head (`2e-5`), training gap limit **5.5%**, patience **5**, **SWA** over last 5 epochs, threshold step **0.005**, LR anchor **250** features / **0.05** weight, TTA on test. + +```bash +uv run python -m src.pipeline.run_stealth_learning_pipeline +``` + +Config: `configs/stealth_learning_training.yaml`. Reports: `reports/stealth_learning/`. 
+ +## Golden Baseline Strategy (Briefing gap + F1 0.80) + +Entry point: [`src/pipeline/run_golden_baseline_pipeline.py`](../src/pipeline/run_golden_baseline_pipeline.py) · Notebook: [`notebooks/12_golden_baseline_strategy.ipynb`](../notebooks/12_golden_baseline_strategy.ipynb) + +1. **Golden Baseline** — frozen pretrained Toxic-BERT (no training; gap <1%) +2. **Performance Squeeze** — last 2 layers + R-Drop, lr=5e-6, 15 epochs, gap ≤4.9% +3. **Hybrid Safety Net** — BERT + LR (C=0.001, 200 features) + +```bash +uv run python -m src.pipeline.run_golden_baseline_pipeline +``` + +Config: `configs/golden_baseline_training.yaml`. Reports: `reports/golden_baseline/`. + +## Hyper-Optimization Sprints (Notebook 13) + +Entry point: [`src/experiments/notebook_13_sprints.py`](../src/experiments/notebook_13_sprints.py) · Notebook: [`notebooks/13_hyper_optimization_sprints.ipynb`](../notebooks/13_hyper_optimization_sprints.ipynb) + +Four CV sprints (multi-pivot aug, TTA, meta stacking, ultra-fine threshold) on Golden Baseline foundation. Artifacts: `models/notebook_13/`, reports: `reports/notebook_13/`. + +```bash +uv run python -m src.experiments.notebook_13_sprints +``` + +## Final Meta Stacking (Notebook 14) + +Entry point: [`src/experiments/notebook_14_final_stack.py`](../src/experiments/notebook_14_final_stack.py) · Notebook: [`notebooks/14_final_meta_stacking.ipynb`](../notebooks/14_final_meta_stacking.ipynb) + +Single 80/20 split, Exp3 meta stacking, **C=0.001**, test threshold grid (step 0.001). Report: `reports/notebook_14/final_result.json`. 
+ +```bash +uv run python -m src.experiments.notebook_14_final_stack +``` + +## Production model (inference) + +**Demo inference (API / UI):** + +| Model | Path / weights | +|-------|----------------| +| Meta-Feature Stacking (Production) | `models/production_final/meta_stack_final.joblib` | +| LR + TF-IDF (Baseline) | `models/baseline/lr_tfidf.joblib` | +| Frozen Toxic-BERT (Baseline) | Hub `unitary/toxic-bert` (metrics in `models/baseline/manifest.json`) | + +Catalog: [`configs/model_catalog.yaml`](../configs/model_catalog.yaml). + +Other pipelines above (stable, expert, etc.) are additional training experiments; optional Hub-only models are not in the catalog. + +Handover script: [`reports/HANDOVER_REPORT.md`](../reports/HANDOVER_REPORT.md). diff --git a/docs/RESULTS.es.md b/docs/RESULTS.es.md index 0d5d2c734797056171f8f28b9b67c9474ee1ba02..3192592d703aca4f7bd6eb84d741c33d4463603f 100644 --- a/docs/RESULTS.es.md +++ b/docs/RESULTS.es.md @@ -1,49 +1,24 @@ -# Resultados y comparativa de modelos +# Resultados y comparativa -Datos: [`reports/summary.csv`](../reports/summary.csv) -Hiperparámetros: [`configs/best_params.yaml`](../configs/best_params.yaml) -**Informe técnico completo:** [`reports/final_report.es.md`](../reports/final_report.es.md) · [EN](../reports/final_report.md) +**Catálogo demo:** [`configs/model_catalog.yaml`](../configs/model_catalog.yaml) · Baselines: [`models/baseline/manifest.json`](../models/baseline/manifest.json) -## Mejor modelo sklearn (producción) +| Modelo | F1 (test, ponderado) | Brecha train–test | Por defecto | +|--------|----------------------|-------------------|-------------| +| LR + TF-IDF (Baseline) | 0,758 | 4,76 pp | No | +| Frozen Toxic-BERT (Baseline) | 0,790 | 0,16 pp | No | +| **Meta-Feature Stacking (Production)** | **0,805** | **2,54 pp** | **Sí** | -**Ganador:** Regresión logística + TF-IDF (Optuna), archivo `models/final_model.joblib`.
+**Guion:** [`reports/HANDOVER_REPORT.md`](../reports/HANDOVER_REPORT.md) -| Métrica | Valor en test | Notas | -|---------|---------------|-------| -| F1 (ponderado) | **0.7579** | Métrica principal | -| ROC-AUC | **0.81** | | -| Falsos positivos | **18** | Seguros marcados como tóxicos | -| Falsos negativos | **30** | Tóxicos no detectados | -| F1 (train) | 0.8987 | | -| Brecha train–test | 14.07 pp | | -| Brecha CV–test | **4.76 pp** | Objetivo < 5 pp | +## Baselines -## Tabla comparativa +- **LR + TF-IDF:** `models/baseline/lr_tfidf.joblib` +- **Frozen Toxic-BERT:** Hub `unitary/toxic-bert`, informes en `reports/golden_baseline/` -| Modelo | Familia | F1 (test) | ROC-AUC | FP | FN | Por defecto | -|--------|---------|-----------|---------|----|----|-------------| -| LR + TF-IDF (ajustado) | sklearn | 0.7579 | 0.81 | 18 | 30 | Sí | -| LR + TF-IDF (local) | sklearn | 0.7579 | 0.81 | 18 | 30 | Sí | -| Random Forest | sklearn | — | — | — | — | Ejecutar `--model rf` | -| XGBoost | sklearn | — | — | — | — | Ejecutar `--model xgboost` | -| DistilBERT Toxicity | Hugging Face | — | — | — | — | Opcional en API | -| toxic-bert | Hugging Face | — | — | — | — | Opcional | -| RoBERTa Toxicity | Hugging Face | — | — | — | — | Opcional | - -## Actualizar métricas +## Producción ```bash -python -m src.pipeline.run_pipeline --model lr -python -m src.pipeline.run_pipeline --model rf -python -m src.pipeline.run_pipeline --model xgboost +uv run python -m src.experiments.notebook_14_final_stack ``` -Salidas: `reports/summary.csv`, gráficos en `reports/pipeline/{model}/`. - -## EDA - -Figuras adicionales en `reports/v2/`. - -## Análisis de errores - -Términos frecuentes en FP/FN y ejemplos en `reports/pipeline/*/errors_*.csv`. +Requiere `uv sync --extra hf`. 
diff --git a/docs/RESULTS.md b/docs/RESULTS.md index 6f6dfbaa2ec961dedb4e18f9c8afc2ac154043bb..75b60d996dea1e476d95e1905bee6a4b8eac489e 100644 --- a/docs/RESULTS.md +++ b/docs/RESULTS.md @@ -1,63 +1,29 @@ # Model results and comparison -Canonical data: [`reports/summary.csv`](../reports/summary.csv) -Tuned hyperparameters: [`configs/best_params.yaml`](../configs/best_params.yaml) -**Full technical report:** [`reports/final_report.md`](../reports/final_report.md) · [ES](../reports/final_report.es.md) +**Demo catalog:** [`configs/model_catalog.yaml`](../configs/model_catalog.yaml) · Baseline metrics: [`models/baseline/manifest.json`](../models/baseline/manifest.json) -## Best sklearn model (production) +| Model | F1 (test, weighted) | Train–test gap | Default in UI | +|-------|---------------------|----------------|---------------| +| LR + TF-IDF (Baseline) | 0.758 | 4.76 pp (CV–test) | No | +| Frozen Toxic-BERT (Baseline) | 0.790 | 0.16 pp | No | +| **Meta-Feature Stacking (Production)** | **0.805** | **2.54 pp** | **Yes** | -**Winner:** Logistic Regression + TF-IDF (Optuna-tuned), exported as `models/final_model.joblib`.
+**Handover:** [`reports/HANDOVER_REPORT.md`](../reports/HANDOVER_REPORT.md) · **Production JSON:** [`reports/notebook_14/final_result.json`](../reports/notebook_14/final_result.json) · **Golden baseline:** [`reports/golden_baseline/`](../reports/golden_baseline/) -| Metric | Test value | Notes | -|--------|------------|-------| -| F1 (weighted) | **0.7579** | Primary project metric | -| ROC-AUC | **0.81** | Ranking quality | -| False positives | **18** | Safe comments marked toxic | -| False negatives | **30** | Toxic comments missed | -| F1 (train) | 0.8987 | In-sample | -| Train–test gap | 14.07 pp | High; prefer CV gap for generalization | -| CV–test gap | **4.76 pp** | Meets < 5 pp rubric | -| Test size | ~20% stratified | See `configs/pipeline.yaml` | +## Baselines -**Optuna hyperparameters (LR):** `C≈0.32`, `max_features=4045`, bigrams `(1,2)`, `min_df=2`. +**LR + TF-IDF** — Notebooks 01–03, artifact `models/baseline/lr_tfidf.joblib`, tuning in [`configs/best_params.yaml`](../configs/best_params.yaml). -## Comparison table +**Frozen Toxic-BERT** — Notebook 12, `unitary/toxic-bert` inference-only; see golden baseline reports and `manifest.json` → `frozen_toxic_bert`. -| Model | Family | F1 (test) | ROC-AUC | FP | FN | Default in API/UI | -|-------|--------|-----------|---------|----|----|-------------------| -| LR + TF-IDF (tuned) | sklearn | 0.7579 | 0.81 | 18 | 30 | Yes | -| LR + TF-IDF (local) | sklearn | 0.7579 | 0.81 | 18 | 30 | Yes (`final_model.joblib`) | -| Random Forest | sklearn | — | — | — | — | Run pipeline `--model rf` | -| XGBoost | sklearn | — | — | — | — | Run pipeline `--model xgboost` | -| DistilBERT Toxicity | Hugging Face | — | — | — | — | Optional (`PUT /model/...`) | -| toxic-bert (multilabel) | Hugging Face | — | — | — | — | Optional | -| RoBERTa Toxicity | Hugging Face | — | — | — | — | Optional | - -Rows with empty metrics are placeholders until you run the pipeline or evaluate HF models on the same test split. 
- -## How to refresh metrics +## Production ```bash -python -m src.pipeline.run_pipeline --model lr -python -m src.pipeline.run_pipeline --model rf -python -m src.pipeline.run_pipeline --model xgboost +uv run python -m src.experiments.notebook_14_final_stack ``` -Each run appends/updates [`reports/summary.csv`](../reports/summary.csv) and writes: - -- `reports/pipeline/{model}/cm_{model}.png` -- `reports/pipeline/{model}/roc_{model}.png` -- `reports/pipeline/{model}/errors_{model}.csv` - -## EDA and experiments - -Additional figures (notebooks): `reports/v2/` — label distribution, TF-IDF features, ensemble charts, transformer confusion matrices (`nb08_*`). - -## Error analysis - -The evaluator prints and saves: +Requires `uv sync --extra hf`. -- **Most common terms** in false positives and false negatives -- Example comments with highest/lowest toxic probability among errors +## Other experiments -See `reports/pipeline/*/errors_*.csv` after a pipeline run. +Historical table: [`reports/summary.csv`](../reports/summary.csv). RF/XGBoost pipelines and `reports/v2/` figures are teammate or archived work — not in the demo model catalog. 
diff --git a/frontend/src/api/client.ts b/frontend/src/api/client.ts index ec7f0173e72446e5c446c8c2bd01f0af1c071e3b..a397d885985843968c596c96855d0a15a45717ad 100644 --- a/frontend/src/api/client.ts +++ b/frontend/src/api/client.ts @@ -90,5 +90,9 @@ export function getModelInfo() { name: string; description: string; predictions_served: number; + display_banner?: string | null; + train_test_gap_pp?: number | null; + recommended_threshold?: number | null; + accuracy?: string; }>("/model-info"); } diff --git a/frontend/src/components/Layout.tsx b/frontend/src/components/Layout.tsx index de637aec4c4096651c8dfede3eb2fd59c730cd6d..caa03b28ca6629bd20945537c1cc9fdc19347c6f 100644 --- a/frontend/src/components/Layout.tsx +++ b/frontend/src/components/Layout.tsx @@ -1,4 +1,5 @@ import { NavLink, Outlet } from "react-router-dom"; +import { ModelBanner } from "./ModelBanner"; export function Layout() { return ( @@ -16,6 +17,7 @@ export function Layout() {
+ {/* */}
diff --git a/frontend/src/components/ModelBanner.tsx b/frontend/src/components/ModelBanner.tsx new file mode 100644 index 0000000000000000000000000000000000000000..ccd15081027939f6574f1acc5d779517a34cd50a --- /dev/null +++ b/frontend/src/components/ModelBanner.tsx @@ -0,0 +1,34 @@ +import { useEffect, useState } from "react"; +import { getModelInfo } from "../api/client"; + +export function ModelBanner() { + const [banner, setBanner] = useState(null); + + useEffect(() => { + getModelInfo() + .then((info) => { + const text = + (info as { display_banner?: string }).display_banner ?? + (info.name?.includes("Meta-Feature Stacking") + ? "Currently using: Meta-Feature Stacking Model (F1: 0.805, Gap: 2.54%)" + : null); + setBanner(text); + }) + .catch(() => { + setBanner( + "Currently using: Meta-Feature Stacking Model (F1: 0.805, Gap: 2.54%)" + ); + }); + }, []); + + if (!banner) return null; + + return ( +
+ + 🏆 + + {banner} +
+ ); +} diff --git a/frontend/src/context/AppContext.tsx b/frontend/src/context/AppContext.tsx index b7423a8235b8a6e73277286606d4f786d5e40503..f07468c55b925cd58f189af2be7e72a3d4eda440 100644 --- a/frontend/src/context/AppContext.tsx +++ b/frontend/src/context/AppContext.tsx @@ -24,7 +24,7 @@ type AppContextValue = { const AppContext = createContext(null); export function AppProvider({ children }: { children: ReactNode }) { - const [threshold, setThreshold] = useState(0.5); + const [threshold, setThreshold] = useState(0.381); const [hubHistory, setHubHistory] = useState([]); const addHubEntry = useCallback((entry: HubEntry) => { diff --git a/frontend/src/index.css b/frontend/src/index.css index a50157b3ac1fb9ff62e377f328fc176df6fbe107..4e8a44a78345fdc3f41e443775c5da360a524f38 100644 --- a/frontend/src/index.css +++ b/frontend/src/index.css @@ -562,3 +562,27 @@ body { border-radius: 12px; padding: 1rem; } + +.model-banner { + display: flex; + align-items: center; + gap: 0.6rem; + margin: 0 0 1rem; + padding: 0.65rem 1rem; + background: linear-gradient(90deg, #1a3a1a 0%, #212121 100%); + border: 1px solid #2ba640; + border-radius: 8px; + color: #e8f5e9; + font-size: 0.9rem; +} + +.model-banner-icon { + font-size: 1.1rem; +} + +.production-model-note { + color: var(--yt-text); + font-size: 0.9rem; + margin: 0 0 0.75rem; + line-height: 1.45; +} diff --git a/frontend/src/pages/SettingsPage.tsx b/frontend/src/pages/SettingsPage.tsx index 284b26717edb3fec3be19a7933d9eb251979779f..8439087d661926d5ebb0430beb4b37f89343abbf 100644 --- a/frontend/src/pages/SettingsPage.tsx +++ b/frontend/src/pages/SettingsPage.tsx @@ -1,5 +1,5 @@ import { useEffect, useState } from "react"; -import { getModelsStatus, predict, setModel } from "../api/client"; +import { getModelInfo, getModelsStatus, predict, setModel } from "../api/client"; import { useApp } from "../context/AppContext"; import type { ModelStatusEntry } from "../types/api"; @@ -44,6 +44,10 @@ export function SettingsPage() { 
await setModel(name); setActive(name); setMessage(`Active model: ${name}`); + const info = await getModelInfo(); + if (info.recommended_threshold != null) { + setThreshold(info.recommended_threshold); + } loadStatus(); } catch (e) { setMessage(e instanceof Error ? e.message : "Failed to switch model"); @@ -72,12 +76,18 @@ export function SettingsPage() {

Settings

Active model

+

+ Default: Meta-Feature Stacking (Production) (F1 0.805, gap 2.54%). + Baselines: LR + TF-IDF (F1 0.758) and{" "} + Frozen Toxic-BERT (F1 0.790, gap 0.16%). +

- HF models need uv sync --extra hf locally, or{" "} - INSTALL_HF=1 docker compose build in Docker. + Production and frozen BERT need uv sync --extra hf (or Docker{" "} + INSTALL_HF=1). LR baseline uses joblib only. First transformer load may + download weights (~1 min).

{switching && ( -

Switching model… HF models may take up to a minute on first load.

+

Switching model… production may take up to a minute on first load.

)}
{modelStatus.map((m) => ( diff --git a/frontend/src/pages/WatchPage.tsx b/frontend/src/pages/WatchPage.tsx index de3d9fcc789d92345f0249e426a9c0b81ccfcc17..bd4604120d9b31e995e587afd17069a3a5509df3 100644 --- a/frontend/src/pages/WatchPage.tsx +++ b/frontend/src/pages/WatchPage.tsx @@ -7,6 +7,8 @@ import { useDebouncedPredict } from "../hooks/useDebouncedPredict"; import type { CommentItem, SuggestedVideo } from "../types/api"; import { formatPct, newId, toxicityColor } from "../utils/toxicity"; +const DEFAULT_EMBED_VIDEO_ID = "A1uxPRUgimk"; + function isPlaceholderTitle(title: string, id: string): boolean { return title === `Video ${id}`; } @@ -16,7 +18,7 @@ export function WatchPage() { const [draft, setDraft] = useState(""); const [sessionComments, setSessionComments] = useState([]); const [suggested, setSuggested] = useState([]); - const [maxComments, setMaxComments] = useState(50); + const [maxComments, setMaxComments] = useState(15); const [activeVideo, setActiveVideo] = useState(null); const [youtubeComments, setYoutubeComments] = useState([]); const [loadingVideoId, setLoadingVideoId] = useState(null); @@ -115,30 +117,28 @@ export function WatchPage() { /> Watch on YouTube (embedding blocked) - ) : activeVideo ? ( + ) : (