JonnyBP commited on
Commit ·
6cda091
1
Parent(s): b53eb67
backup stable api and model service before pipeline testing
Browse files- .gitignore +13 -1
- configs/.gitkeep +0 -0
- data/.gitkeep +0 -0
- env.example +9 -0
- notebooks/.gitkeep +0 -0
- reports/.gitkeep +0 -0
- src/.gitkeep +0 -0
- src/api/main.py +462 -0
- src/app/.gitkeep +0 -0
- src/data/.gitkeep +0 -0
- src/data/loader.py +120 -0
- src/evaluation/evaluator.py +264 -0
- src/features/.gitkeep +0 -0
- src/features/text_preprocessor.py +135 -0
- src/features/vectorizer.py +78 -0
- src/models/baseline.py +282 -0
- src/pipeline/.gitkeep +0 -0
- src/pipeline/run_pipeline.py +231 -0
- src/utils/.gitkeep +0 -0
- src/utils/config_loader.py +36 -0
- src/utils/logger.py +47 -0
.gitignore
CHANGED
|
@@ -63,4 +63,16 @@ models/nb08_roberta_hate/
|
|
| 63 |
models/nb08_toxic_distilbert/
|
| 64 |
|
| 65 |
models/lr_baseline.joblib
|
| 66 |
-
models/best_ensemble.joblib
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
models/nb08_toxic_distilbert/
|
| 64 |
|
| 65 |
models/lr_baseline.joblib
|
| 66 |
+
models/best_ensemble.joblib
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
# Experiments
|
| 70 |
+
models/experiments/
|
| 71 |
+
|
| 72 |
+
# Reports experiments
|
| 73 |
+
reports/v2/pipeline/
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# Python cache
|
| 77 |
+
__pycache__/
|
| 78 |
+
*.pyc
|
configs/.gitkeep
DELETED
|
File without changes
|
data/.gitkeep
DELETED
|
File without changes
|
env.example
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copia este archivo como .env y rellena los valores
|
| 2 |
+
# cp .env.example .env
|
| 3 |
+
|
| 4 |
+
# YouTube Data API v3
|
| 5 |
+
# Obtener en: https://console.cloud.google.com/apis/credentials
|
| 6 |
+
YOUTUBE_API_KEY=your_youtube_api_key_here
|
| 7 |
+
|
| 8 |
+
# Entorno
|
| 9 |
+
ENV=development # development | production
|
notebooks/.gitkeep
DELETED
|
File without changes
|
reports/.gitkeep
DELETED
|
File without changes
|
src/.gitkeep
DELETED
|
File without changes
|
src/api/main.py
ADDED
|
@@ -0,0 +1,462 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
src/api/main.py
|
| 3 |
+
|
| 4 |
+
API REST de producción para detección de hate speech.
|
| 5 |
+
Ejecutar con: uvicorn src.api.main:app --reload --port 8000
|
| 6 |
+
|
| 7 |
+
Documentación automática en:
|
| 8 |
+
http://localhost:8000/docs (Swagger UI)
|
| 9 |
+
http://localhost:8000/redoc (ReDoc)
|
| 10 |
+
|
| 11 |
+
Endpoints:
|
| 12 |
+
GET / → health check
|
| 13 |
+
GET /model-info → info del modelo activo
|
| 14 |
+
GET /models → lista de modelos disponibles
|
| 15 |
+
POST /predict → predice un comentario
|
| 16 |
+
POST /predict-batch → predice una lista de comentarios
|
| 17 |
+
POST /predict-video → dado URL de YouTube, predice todos sus comentarios
|
| 18 |
+
PUT /model/{name} → cambia el modelo activo
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
import os
|
| 22 |
+
import sys
|
| 23 |
+
import time
|
| 24 |
+
import logging
|
| 25 |
+
from pathlib import Path
|
| 26 |
+
from typing import Optional
|
| 27 |
+
from contextlib import asynccontextmanager
|
| 28 |
+
from dotenv import load_dotenv
|
| 29 |
+
load_dotenv()
|
| 30 |
+
|
| 31 |
+
from fastapi import FastAPI, HTTPException, BackgroundTasks, Query
|
| 32 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 33 |
+
from pydantic import BaseModel, Field, field_validator
|
| 34 |
+
|
| 35 |
+
# ── Setup path ────────────────────────────────────────────────────────────────
|
| 36 |
+
PROJECT_ROOT = Path(__file__).resolve().parents[2]
|
| 37 |
+
sys.path.insert(0, str(PROJECT_ROOT))
|
| 38 |
+
|
| 39 |
+
from src.service.model_service import ModelService, AVAILABLE_MODELS
|
| 40 |
+
from src.utils.logger import get_logger
|
| 41 |
+
|
| 42 |
+
logger = get_logger(__name__)
|
| 43 |
+
|
| 44 |
+
# ── Estado global de la app ───────────────────────────────────────────────────
|
| 45 |
+
# El modelo se carga una sola vez al iniciar la API y se reutiliza.
|
| 46 |
+
# Esto evita cargar el modelo en cada request (costoso en tiempo).
|
| 47 |
+
_state: dict = {
|
| 48 |
+
"service" : None,
|
| 49 |
+
"model_name" : None,
|
| 50 |
+
"startup_time" : None,
|
| 51 |
+
"predictions_served": 0,
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 56 |
+
# LIFESPAN — carga del modelo al iniciar la API
|
| 57 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 58 |
+
@asynccontextmanager
|
| 59 |
+
async def lifespan(app: FastAPI):
|
| 60 |
+
"""
|
| 61 |
+
Lifespan context manager de FastAPI.
|
| 62 |
+
Carga el modelo al iniciar la app y libera recursos al cerrarla.
|
| 63 |
+
"""
|
| 64 |
+
# Startup
|
| 65 |
+
model_name = os.getenv("MODEL_NAME", list(AVAILABLE_MODELS.keys())[0])
|
| 66 |
+
logger.info(f"Iniciando API — cargando modelo: {model_name}")
|
| 67 |
+
_state["service"] = ModelService(model_name, PROJECT_ROOT)
|
| 68 |
+
_state["model_name"] = model_name
|
| 69 |
+
_state["startup_time"] = time.time()
|
| 70 |
+
|
| 71 |
+
# Warm-up: predecir un texto de prueba para que el modelo quede en memoria
|
| 72 |
+
try:
|
| 73 |
+
_state["service"].predict("test warmup text")
|
| 74 |
+
logger.info("Modelo cargado y warm-up completado ✅")
|
| 75 |
+
except Exception as e:
|
| 76 |
+
logger.warning(f"Warm-up falló (no crítico): {e}")
|
| 77 |
+
|
| 78 |
+
yield # La API está lista
|
| 79 |
+
|
| 80 |
+
# Shutdown
|
| 81 |
+
logger.info("API cerrándose — limpiando recursos")
|
| 82 |
+
_state["service"] = None
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 86 |
+
# APP
|
| 87 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 88 |
+
app = FastAPI(
|
| 89 |
+
title = "SignalMod API",
|
| 90 |
+
description = "API de detección de hate speech en comentarios de YouTube",
|
| 91 |
+
version = "1.0.0",
|
| 92 |
+
lifespan = lifespan,
|
| 93 |
+
)
|
| 94 |
+
|
| 95 |
+
# CORS: permite que el Streamlit (puerto 8501) llame a la API (puerto 8000)
|
| 96 |
+
app.add_middleware(
|
| 97 |
+
CORSMiddleware,
|
| 98 |
+
allow_origins = ["*"],
|
| 99 |
+
allow_methods = ["*"],
|
| 100 |
+
allow_headers = ["*"],
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 105 |
+
# SCHEMAS — Pydantic valida automáticamente los datos de entrada/salida
|
| 106 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 107 |
+
class PredictRequest(BaseModel):
|
| 108 |
+
"""Cuerpo del request para predecir un comentario."""
|
| 109 |
+
text : str = Field(..., min_length=1, max_length=5000,
|
| 110 |
+
description="Comentario a analizar")
|
| 111 |
+
threshold: float = Field(0.5, ge=0.0, le=1.0,
|
| 112 |
+
description="Umbral de probabilidad para clasificar como tóxico")
|
| 113 |
+
|
| 114 |
+
@field_validator("text")
|
| 115 |
+
@classmethod
|
| 116 |
+
def text_not_empty(cls, v):
|
| 117 |
+
if not v.strip():
|
| 118 |
+
raise ValueError("El texto no puede estar vacío")
|
| 119 |
+
return v.strip()
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
class PredictResponse(BaseModel):
|
| 123 |
+
"""Respuesta de la predicción."""
|
| 124 |
+
text : str
|
| 125 |
+
is_toxic : bool
|
| 126 |
+
probability: float = Field(..., ge=0.0, le=1.0)
|
| 127 |
+
labels : list[str]
|
| 128 |
+
model_used : str
|
| 129 |
+
latency_ms : float
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
class BatchPredictRequest(BaseModel):
|
| 133 |
+
"""Request para predecir múltiples comentarios."""
|
| 134 |
+
texts : list[str] = Field(..., min_length=1, max_length=100)
|
| 135 |
+
threshold: float = Field(0.5, ge=0.0, le=1.0)
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
class BatchPredictResponse(BaseModel):
|
| 139 |
+
"""Respuesta de predicción batch."""
|
| 140 |
+
results : list[PredictResponse]
|
| 141 |
+
total : int
|
| 142 |
+
toxic_count : int
|
| 143 |
+
latency_ms : float
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
class VideoRequest(BaseModel):
|
| 147 |
+
"""Request para analizar comentarios de un video de YouTube."""
|
| 148 |
+
url : str = Field(..., description="URL del video de YouTube")
|
| 149 |
+
max_comments: int = Field(50, ge=1, le=200,
|
| 150 |
+
description="Número máximo de comentarios a analizar")
|
| 151 |
+
threshold : float = Field(0.5, ge=0.0, le=1.0)
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
class VideoResponse(BaseModel):
|
| 155 |
+
"""Respuesta del análisis de un video de YouTube."""
|
| 156 |
+
video_url : str
|
| 157 |
+
total_fetched: int
|
| 158 |
+
toxic_count : int
|
| 159 |
+
toxic_rate : float
|
| 160 |
+
results : list[PredictResponse]
|
| 161 |
+
error : Optional[str] = None
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
class ModelInfo(BaseModel):
|
| 165 |
+
"""Información sobre el modelo activo."""
|
| 166 |
+
name : str
|
| 167 |
+
type : str
|
| 168 |
+
description : str
|
| 169 |
+
speed : str
|
| 170 |
+
accuracy : str
|
| 171 |
+
uptime_s : float
|
| 172 |
+
predictions_served: int
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 176 |
+
# HELPERS
|
| 177 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 178 |
+
def _get_service() -> ModelService:
|
| 179 |
+
"""Devuelve el servicio activo o lanza 503 si no está listo."""
|
| 180 |
+
if _state["service"] is None:
|
| 181 |
+
raise HTTPException(status_code=503, detail="Modelo no cargado. Intenta en unos segundos.")
|
| 182 |
+
return _state["service"]
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def _predict_single(text: str, threshold: float) -> tuple[dict, float]:
|
| 186 |
+
"""Predice un texto y devuelve (result, latency_ms)."""
|
| 187 |
+
t0 = time.perf_counter()
|
| 188 |
+
result = _get_service().predict(text)
|
| 189 |
+
ms = round((time.perf_counter() - t0) * 1000, 2)
|
| 190 |
+
|
| 191 |
+
# Aplicar umbral personalizado
|
| 192 |
+
result["is_toxic"] = result["probability"] >= threshold
|
| 193 |
+
if not result["is_toxic"]:
|
| 194 |
+
result["labels"] = []
|
| 195 |
+
|
| 196 |
+
_state["predictions_served"] += 1
|
| 197 |
+
return result, ms
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def _scrape_youtube_comments(url: str, max_comments: int) -> list[str]:
|
| 201 |
+
"""
|
| 202 |
+
Obtiene comentarios de un video de YouTube.
|
| 203 |
+
|
| 204 |
+
Estrategia:
|
| 205 |
+
1. Intentar con YouTube Data API v3 (si hay API key en .env)
|
| 206 |
+
2. Fallback: BeautifulSoup (sin autenticación, limitado)
|
| 207 |
+
"""
|
| 208 |
+
api_key = os.getenv("YOUTUBE_API_KEY", "")
|
| 209 |
+
|
| 210 |
+
if api_key:
|
| 211 |
+
return _fetch_via_api(url, api_key, max_comments)
|
| 212 |
+
else:
|
| 213 |
+
return _fetch_via_scraper(url, max_comments)
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
def _fetch_via_api(url: str, api_key: str, max_comments: int) -> list[str]:
|
| 217 |
+
"""Obtiene comentarios usando YouTube Data API v3."""
|
| 218 |
+
try:
|
| 219 |
+
import re
|
| 220 |
+
from googleapiclient.discovery import build
|
| 221 |
+
|
| 222 |
+
# Extraer video_id de la URL
|
| 223 |
+
patterns = [
|
| 224 |
+
r"youtube\.com/watch\?v=([a-zA-Z0-9_-]{11})",
|
| 225 |
+
r"youtu\.be/([a-zA-Z0-9_-]{11})",
|
| 226 |
+
r"youtube\.com/embed/([a-zA-Z0-9_-]{11})",
|
| 227 |
+
]
|
| 228 |
+
video_id = None
|
| 229 |
+
for pattern in patterns:
|
| 230 |
+
match = re.search(pattern, url)
|
| 231 |
+
if match:
|
| 232 |
+
video_id = match.group(1)
|
| 233 |
+
break
|
| 234 |
+
|
| 235 |
+
if not video_id:
|
| 236 |
+
raise ValueError(f"No se pudo extraer video_id de: {url}")
|
| 237 |
+
|
| 238 |
+
youtube = build("youtube", "v3", developerKey=api_key)
|
| 239 |
+
comments = []
|
| 240 |
+
page_token = None
|
| 241 |
+
|
| 242 |
+
while len(comments) < max_comments:
|
| 243 |
+
request = youtube.commentThreads().list(
|
| 244 |
+
part = "snippet",
|
| 245 |
+
videoId = video_id,
|
| 246 |
+
maxResults = min(100, max_comments - len(comments)),
|
| 247 |
+
pageToken = page_token,
|
| 248 |
+
textFormat = "plainText",
|
| 249 |
+
)
|
| 250 |
+
response = request.execute()
|
| 251 |
+
|
| 252 |
+
for item in response.get("items", []):
|
| 253 |
+
text = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
|
| 254 |
+
comments.append(text)
|
| 255 |
+
|
| 256 |
+
page_token = response.get("nextPageToken")
|
| 257 |
+
if not page_token:
|
| 258 |
+
break
|
| 259 |
+
|
| 260 |
+
logger.info(f"YouTube API: {len(comments)} comentarios obtenidos")
|
| 261 |
+
return comments[:max_comments]
|
| 262 |
+
|
| 263 |
+
except Exception as e:
|
| 264 |
+
logger.warning(f"YouTube API falló: {e} — usando fallback")
|
| 265 |
+
return _fetch_via_scraper(url, max_comments)
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
def _fetch_via_scraper(url: str, max_comments: int) -> list[str]:
|
| 269 |
+
"""
|
| 270 |
+
Fallback: simula comentarios si no hay API key.
|
| 271 |
+
En producción real debería usar BeautifulSoup + Selenium.
|
| 272 |
+
"""
|
| 273 |
+
logger.warning(
|
| 274 |
+
"YOUTUBE_API_KEY no configurada. "
|
| 275 |
+
"Configura tu API key en .env para obtener comentarios reales. "
|
| 276 |
+
"Usando comentarios de ejemplo."
|
| 277 |
+
)
|
| 278 |
+
# Comentarios de ejemplo para demo sin API key
|
| 279 |
+
example_comments = [
|
| 280 |
+
"This video is really informative, thanks for sharing!",
|
| 281 |
+
"You are all stupid idiots, get out of here!",
|
| 282 |
+
"Great content, I learned a lot from this.",
|
| 283 |
+
"These people should be eliminated from society.",
|
| 284 |
+
"I agree with the presenter's point of view.",
|
| 285 |
+
"What a bunch of racist criminals!",
|
| 286 |
+
"Thank you for this analysis, very helpful.",
|
| 287 |
+
"Kill them all, they don't deserve to live.",
|
| 288 |
+
"Interesting perspective on the topic.",
|
| 289 |
+
"This is absolute bullshit propaganda!",
|
| 290 |
+
"I think we need to look at both sides.",
|
| 291 |
+
"Black people are thugs and criminals.",
|
| 292 |
+
"The data presented here is compelling.",
|
| 293 |
+
"Go back to where you came from!",
|
| 294 |
+
"Well researched video, good job.",
|
| 295 |
+
]
|
| 296 |
+
return example_comments[:max_comments]
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 300 |
+
# ENDPOINTS
|
| 301 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 302 |
+
|
| 303 |
+
@app.get("/", tags=["Health"])
|
| 304 |
+
async def health_check():
|
| 305 |
+
"""
|
| 306 |
+
Verifica que la API está funcionando.
|
| 307 |
+
Útil para Docker healthcheck y load balancers.
|
| 308 |
+
"""
|
| 309 |
+
service = _state["service"]
|
| 310 |
+
return {
|
| 311 |
+
"status" : "ok" if service else "loading",
|
| 312 |
+
"model" : _state["model_name"],
|
| 313 |
+
"uptime_s": round(time.time() - _state["startup_time"], 1)
|
| 314 |
+
if _state["startup_time"] else 0,
|
| 315 |
+
}
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
@app.get("/model-info", response_model=ModelInfo, tags=["Model"])
|
| 319 |
+
async def get_model_info():
|
| 320 |
+
"""Devuelve información sobre el modelo activo."""
|
| 321 |
+
service = _get_service()
|
| 322 |
+
info = service.get_model_info()
|
| 323 |
+
return ModelInfo(
|
| 324 |
+
name = _state["model_name"],
|
| 325 |
+
type = info.get("type", "unknown"),
|
| 326 |
+
description = info.get("description", ""),
|
| 327 |
+
speed = info.get("speed", ""),
|
| 328 |
+
accuracy = info.get("accuracy", ""),
|
| 329 |
+
uptime_s = round(time.time() - _state["startup_time"], 1),
|
| 330 |
+
predictions_served= _state["predictions_served"],
|
| 331 |
+
)
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
@app.get("/models", tags=["Model"])
|
| 335 |
+
async def list_models():
|
| 336 |
+
"""Lista todos los modelos disponibles."""
|
| 337 |
+
return {
|
| 338 |
+
"available": list(AVAILABLE_MODELS.keys()),
|
| 339 |
+
"active" : _state["model_name"],
|
| 340 |
+
}
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
@app.put("/model/{model_name}", tags=["Model"])
|
| 344 |
+
async def switch_model(model_name: str):
|
| 345 |
+
"""
|
| 346 |
+
Cambia el modelo activo.
|
| 347 |
+
El nuevo modelo se carga de forma lazy en el siguiente request de predicción.
|
| 348 |
+
"""
|
| 349 |
+
if model_name not in AVAILABLE_MODELS:
|
| 350 |
+
raise HTTPException(
|
| 351 |
+
status_code=400,
|
| 352 |
+
detail=f"Modelo '{model_name}' no disponible. "
|
| 353 |
+
f"Opciones: {list(AVAILABLE_MODELS.keys())}",
|
| 354 |
+
)
|
| 355 |
+
_state["service"] = ModelService(model_name, PROJECT_ROOT)
|
| 356 |
+
_state["model_name"] = model_name
|
| 357 |
+
logger.info(f"Modelo cambiado a: {model_name}")
|
| 358 |
+
return {"message": f"Modelo cambiado a '{model_name}'", "model": model_name}
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
@app.post("/predict", response_model=PredictResponse, tags=["Prediction"])
|
| 362 |
+
async def predict(request: PredictRequest):
|
| 363 |
+
"""
|
| 364 |
+
Predice si un comentario es tóxico.
|
| 365 |
+
|
| 366 |
+
- **text**: el comentario a analizar
|
| 367 |
+
- **threshold**: umbral de probabilidad (default 0.5)
|
| 368 |
+
|
| 369 |
+
Devuelve la probabilidad, si es tóxico y las categorías detectadas.
|
| 370 |
+
"""
|
| 371 |
+
result, ms = _predict_single(request.text, request.threshold)
|
| 372 |
+
|
| 373 |
+
if "error" in result:
|
| 374 |
+
raise HTTPException(status_code=500, detail=result["error"])
|
| 375 |
+
|
| 376 |
+
return PredictResponse(
|
| 377 |
+
text = request.text,
|
| 378 |
+
is_toxic = result["is_toxic"],
|
| 379 |
+
probability= round(result["probability"], 4),
|
| 380 |
+
labels = result["labels"],
|
| 381 |
+
model_used = result["model_used"],
|
| 382 |
+
latency_ms = ms,
|
| 383 |
+
)
|
| 384 |
+
|
| 385 |
+
|
| 386 |
+
@app.post("/predict-batch", response_model=BatchPredictResponse, tags=["Prediction"])
|
| 387 |
+
async def predict_batch(request: BatchPredictRequest):
|
| 388 |
+
"""
|
| 389 |
+
Predice una lista de comentarios en un solo request.
|
| 390 |
+
Más eficiente que llamar /predict N veces.
|
| 391 |
+
Máximo 100 comentarios por request.
|
| 392 |
+
"""
|
| 393 |
+
t0 = time.perf_counter()
|
| 394 |
+
results = []
|
| 395 |
+
|
| 396 |
+
for text in request.texts:
|
| 397 |
+
if not text.strip():
|
| 398 |
+
continue
|
| 399 |
+
result, _ = _predict_single(text, request.threshold)
|
| 400 |
+
results.append(PredictResponse(
|
| 401 |
+
text = text,
|
| 402 |
+
is_toxic = result["is_toxic"],
|
| 403 |
+
probability= round(result["probability"], 4),
|
| 404 |
+
labels = result["labels"],
|
| 405 |
+
model_used = result["model_used"],
|
| 406 |
+
latency_ms = 0.0,
|
| 407 |
+
))
|
| 408 |
+
|
| 409 |
+
total_ms = round((time.perf_counter() - t0) * 1000, 2)
|
| 410 |
+
toxic_count = sum(1 for r in results if r.is_toxic)
|
| 411 |
+
|
| 412 |
+
return BatchPredictResponse(
|
| 413 |
+
results = results,
|
| 414 |
+
total = len(results),
|
| 415 |
+
toxic_count = toxic_count,
|
| 416 |
+
latency_ms = total_ms,
|
| 417 |
+
)
|
| 418 |
+
|
| 419 |
+
|
| 420 |
+
@app.post("/predict-video", response_model=VideoResponse, tags=["Prediction"])
|
| 421 |
+
async def predict_video(request: VideoRequest):
|
| 422 |
+
"""
|
| 423 |
+
Dado un URL de YouTube, obtiene los comentarios y predice su toxicidad.
|
| 424 |
+
|
| 425 |
+
Requiere YOUTUBE_API_KEY en el archivo .env para obtener comentarios reales.
|
| 426 |
+
Sin API key usa comentarios de ejemplo para la demo.
|
| 427 |
+
"""
|
| 428 |
+
# Obtener comentarios
|
| 429 |
+
try:
|
| 430 |
+
comments = _scrape_youtube_comments(request.url, request.max_comments)
|
| 431 |
+
except Exception as e:
|
| 432 |
+
raise HTTPException(status_code=422, detail=f"Error al obtener comentarios: {e}")
|
| 433 |
+
|
| 434 |
+
if not comments:
|
| 435 |
+
raise HTTPException(status_code=404, detail="No se encontraron comentarios en el video")
|
| 436 |
+
|
| 437 |
+
# Predecir batch
|
| 438 |
+
t0 = time.perf_counter()
|
| 439 |
+
results = []
|
| 440 |
+
for text in comments:
|
| 441 |
+
if not text.strip():
|
| 442 |
+
continue
|
| 443 |
+
result, _ = _predict_single(text, request.threshold)
|
| 444 |
+
results.append(PredictResponse(
|
| 445 |
+
text = text,
|
| 446 |
+
is_toxic = result["is_toxic"],
|
| 447 |
+
probability= round(result["probability"], 4),
|
| 448 |
+
labels = result["labels"],
|
| 449 |
+
model_used = result["model_used"],
|
| 450 |
+
latency_ms = 0.0,
|
| 451 |
+
))
|
| 452 |
+
|
| 453 |
+
total_ms = round((time.perf_counter() - t0) * 1000, 2)
|
| 454 |
+
toxic_count = sum(1 for r in results if r.is_toxic)
|
| 455 |
+
|
| 456 |
+
return VideoResponse(
|
| 457 |
+
video_url = request.url,
|
| 458 |
+
total_fetched= len(results),
|
| 459 |
+
toxic_count = toxic_count,
|
| 460 |
+
toxic_rate = round(toxic_count / len(results), 4) if results else 0.0,
|
| 461 |
+
results = results,
|
| 462 |
+
)
|
src/app/.gitkeep
DELETED
|
File without changes
|
src/data/.gitkeep
DELETED
|
File without changes
|
src/data/loader.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
src/data/loader.py
|
| 3 |
+
|
| 4 |
+
Carga y valida el dataset de comentarios de YouTube.
|
| 5 |
+
Responsabilidad única: leer el CSV y verificar que tiene las columnas esperadas.
|
| 6 |
+
El preprocesamiento viene después (src/features/text_preprocessor.py).
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import pandas as pd
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from src.utils.logger import get_logger
|
| 12 |
+
|
| 13 |
+
logger = get_logger(__name__)
|
| 14 |
+
|
| 15 |
+
# Columnas obligatorias en el dataset
|
| 16 |
+
REQUIRED_COLUMNS = {"Text", "IsToxic"}
|
| 17 |
+
|
| 18 |
+
# Sublabels opcionales (pueden no estar presentes)
|
| 19 |
+
SUBLABEL_COLUMNS = {
|
| 20 |
+
"IsAbusive", "IsProvocative", "IsHatespeech",
|
| 21 |
+
"IsRacist", "IsObscene", "IsThreat",
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def load_raw_data(path: str | Path) -> pd.DataFrame:
|
| 26 |
+
"""
|
| 27 |
+
Carga el CSV crudo de comentarios de YouTube.
|
| 28 |
+
|
| 29 |
+
Args:
|
| 30 |
+
path: Ruta al archivo CSV.
|
| 31 |
+
|
| 32 |
+
Returns:
|
| 33 |
+
DataFrame validado y limpio a nivel estructural.
|
| 34 |
+
|
| 35 |
+
Raises:
|
| 36 |
+
FileNotFoundError: si el archivo no existe.
|
| 37 |
+
ValueError: si faltan columnas obligatorias.
|
| 38 |
+
"""
|
| 39 |
+
path = Path(path)
|
| 40 |
+
if not path.exists():
|
| 41 |
+
raise FileNotFoundError(f"Dataset no encontrado: {path}")
|
| 42 |
+
|
| 43 |
+
logger.info(f"Cargando dataset: {path}")
|
| 44 |
+
df = pd.read_csv(path)
|
| 45 |
+
logger.info(f" Shape: {df.shape}")
|
| 46 |
+
|
| 47 |
+
_validate_columns(df)
|
| 48 |
+
df = _clean_structure(df)
|
| 49 |
+
|
| 50 |
+
logger.info(f" Toxicos: {df['IsToxic'].sum()} ({df['IsToxic'].mean()*100:.1f}%)")
|
| 51 |
+
return df
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def load_preprocessed_data(path: str | Path) -> pd.DataFrame:
|
| 55 |
+
"""
|
| 56 |
+
Carga el CSV preprocesado (con columna clean_text).
|
| 57 |
+
Generado por el notebook 02 o por run_pipeline.
|
| 58 |
+
|
| 59 |
+
Args:
|
| 60 |
+
path: Ruta al CSV preprocesado.
|
| 61 |
+
|
| 62 |
+
Returns:
|
| 63 |
+
DataFrame con columna clean_text lista para vectorizar.
|
| 64 |
+
"""
|
| 65 |
+
path = Path(path)
|
| 66 |
+
if not path.exists():
|
| 67 |
+
raise FileNotFoundError(
|
| 68 |
+
f"Datos preprocesados no encontrados: {path}\n"
|
| 69 |
+
f"Ejecuta: python -m src.pipeline.run_pipeline"
|
| 70 |
+
)
|
| 71 |
+
|
| 72 |
+
df = pd.read_csv(path)
|
| 73 |
+
if "clean_text" not in df.columns:
|
| 74 |
+
raise ValueError("El CSV no tiene columna 'clean_text'. Regenera el preprocesamiento.")
|
| 75 |
+
|
| 76 |
+
logger.info(f"Datos preprocesados cargados: {df.shape}")
|
| 77 |
+
return df
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
# ── Funciones internas ────────────────────────────────────────────────────────
|
| 81 |
+
|
| 82 |
+
def _validate_columns(df: pd.DataFrame) -> None:
|
| 83 |
+
"""Verifica que el dataset tenga las columnas obligatorias."""
|
| 84 |
+
missing = REQUIRED_COLUMNS - set(df.columns)
|
| 85 |
+
if missing:
|
| 86 |
+
raise ValueError(
|
| 87 |
+
f"Columnas obligatorias ausentes: {missing}\n"
|
| 88 |
+
f"Columnas encontradas: {list(df.columns)}"
|
| 89 |
+
)
|
| 90 |
+
logger.info(f" Columnas validadas ✅")
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def _clean_structure(df: pd.DataFrame) -> pd.DataFrame:
|
| 94 |
+
"""
|
| 95 |
+
Limpieza estructural mínima:
|
| 96 |
+
- Elimina filas con Text vacío
|
| 97 |
+
- Convierte IsToxic a bool
|
| 98 |
+
- Convierte sublabels a bool si existen
|
| 99 |
+
"""
|
| 100 |
+
df = df.copy()
|
| 101 |
+
|
| 102 |
+
# Texto
|
| 103 |
+
df["Text"] = df["Text"].fillna("").astype(str).str.strip()
|
| 104 |
+
df = df[df["Text"] != ""].reset_index(drop=True)
|
| 105 |
+
|
| 106 |
+
# Target binario
|
| 107 |
+
df["IsToxic"] = df["IsToxic"].astype(bool)
|
| 108 |
+
|
| 109 |
+
# Sublabels
|
| 110 |
+
for col in SUBLABEL_COLUMNS:
|
| 111 |
+
if col in df.columns:
|
| 112 |
+
df[col] = df[col].astype(bool)
|
| 113 |
+
|
| 114 |
+
# Eliminar duplicados
|
| 115 |
+
n_before = len(df)
|
| 116 |
+
df = df.drop_duplicates(subset=["Text"]).reset_index(drop=True)
|
| 117 |
+
if len(df) < n_before:
|
| 118 |
+
logger.warning(f" {n_before - len(df)} duplicados eliminados")
|
| 119 |
+
|
| 120 |
+
return df
|
src/evaluation/evaluator.py
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
src/evaluation/evaluator.py
|
| 3 |
+
|
| 4 |
+
Evaluación estandarizada de modelos.
|
| 5 |
+
Genera métricas, visualizaciones e informes JSON.
|
| 6 |
+
|
| 7 |
+
Uso:
|
| 8 |
+
evaluator = Evaluator(output_dir="reports/pipeline")
|
| 9 |
+
metrics = evaluator.evaluate(model, X_test, y_test, model_name="LR")
|
| 10 |
+
evaluator.error_analysis(X_test, y_test, preds, probs)
|
| 11 |
+
evaluator.save_summary(all_metrics, path="reports/summary.csv")
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import json
|
| 15 |
+
import numpy as np
|
| 16 |
+
import pandas as pd
|
| 17 |
+
import matplotlib.pyplot as plt
|
| 18 |
+
import seaborn as sns
|
| 19 |
+
from datetime import datetime
|
| 20 |
+
from pathlib import Path
|
| 21 |
+
from sklearn.metrics import (
|
| 22 |
+
f1_score, precision_score, recall_score,
|
| 23 |
+
roc_auc_score, accuracy_score,
|
| 24 |
+
confusion_matrix, classification_report,
|
| 25 |
+
RocCurveDisplay,
|
| 26 |
+
)
|
| 27 |
+
from src.utils.logger import get_logger
|
| 28 |
+
|
| 29 |
+
logger = get_logger(__name__)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class Evaluator:
|
| 33 |
+
"""
|
| 34 |
+
Evaluador estandarizado de modelos de clasificación binaria.
|
| 35 |
+
|
| 36 |
+
Genera:
|
| 37 |
+
- Métricas completas (F1, Precision, Recall, ROC-AUC)
|
| 38 |
+
- Ambas métricas de gap (train-test y CV-test)
|
| 39 |
+
- Matriz de confusión (PNG)
|
| 40 |
+
- Curva ROC (PNG)
|
| 41 |
+
- Análisis de errores (FP y FN más comunes)
|
| 42 |
+
- Informe JSON por experimento
|
| 43 |
+
- CSV resumen de todos los experimentos
|
| 44 |
+
"""
|
| 45 |
+
|
| 46 |
+
def __init__(self, output_dir: str | Path = "reports/pipeline"):
|
| 47 |
+
self.output_dir = Path(output_dir)
|
| 48 |
+
self.output_dir.mkdir(parents=True, exist_ok=True)
|
| 49 |
+
|
| 50 |
+
# ── Evaluación principal ─────────────────────────────────────────────────
|
| 51 |
+
def evaluate(
|
| 52 |
+
self,
|
| 53 |
+
model,
|
| 54 |
+
X_test,
|
| 55 |
+
y_test,
|
| 56 |
+
model_name: str,
|
| 57 |
+
X_train=None,
|
| 58 |
+
y_train=None,
|
| 59 |
+
cv_results: dict = None,
|
| 60 |
+
) -> dict:
|
| 61 |
+
"""
|
| 62 |
+
Evalúa un modelo sobre el test set.
|
| 63 |
+
|
| 64 |
+
Args:
|
| 65 |
+
model: objeto con método predict() y predict_proba()
|
| 66 |
+
X_test, y_test: datos de test
|
| 67 |
+
model_name: nombre para los reports
|
| 68 |
+
X_train, y_train: opcional — para calcular train_test_gap
|
| 69 |
+
cv_results: opcional — dict con cv_f1_mean para calcular cv_test_gap
|
| 70 |
+
|
| 71 |
+
Returns:
|
| 72 |
+
Dict con todas las métricas.
|
| 73 |
+
"""
|
| 74 |
+
logger.info(f"Evaluando: {model_name}")
|
| 75 |
+
|
| 76 |
+
y_pred = model.predict(X_test)
|
| 77 |
+
y_proba = model.predict_proba(X_test)[:, 1]
|
| 78 |
+
y_test_arr = np.array(y_test)
|
| 79 |
+
|
| 80 |
+
# ── Métricas test ────────────────────────────────────────────────────
|
| 81 |
+
metrics = {
|
| 82 |
+
"model" : model_name,
|
| 83 |
+
"timestamp" : datetime.now().isoformat(),
|
| 84 |
+
"f1_weighted": round(f1_score(y_test_arr, y_pred, average="weighted"), 4),
|
| 85 |
+
"f1_toxic" : round(f1_score(y_test_arr, y_pred, pos_label=1), 4),
|
| 86 |
+
"precision" : round(precision_score(y_test_arr, y_pred, average="weighted"), 4),
|
| 87 |
+
"recall" : round(recall_score(y_test_arr, y_pred, average="weighted"), 4),
|
| 88 |
+
"accuracy" : round(accuracy_score(y_test_arr, y_pred), 4),
|
| 89 |
+
"roc_auc" : round(roc_auc_score(y_test_arr, y_proba), 4),
|
| 90 |
+
"fp" : int(((y_test_arr == 0) & (y_pred == 1)).sum()),
|
| 91 |
+
"fn" : int(((y_test_arr == 1) & (y_pred == 0)).sum()),
|
| 92 |
+
"n_test" : len(y_test_arr),
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
# ── Train-test gap (in-sample vs OOS) ────────────────────────────────
|
| 96 |
+
if X_train is not None and y_train is not None:
|
| 97 |
+
y_train_pred = model.predict(X_train)
|
| 98 |
+
f1_train = f1_score(np.array(y_train), y_train_pred, average="weighted")
|
| 99 |
+
metrics["f1_train"] = round(f1_train, 4)
|
| 100 |
+
metrics["train_test_gap_pp"]= round((f1_train - metrics["f1_weighted"]) * 100, 2)
|
| 101 |
+
|
| 102 |
+
# ── CV-test gap (OOS vs OOS — métrica correcta para la rúbrica) ──────
|
| 103 |
+
if cv_results and "cv_f1_mean" in cv_results:
|
| 104 |
+
cv_mean = cv_results["cv_f1_mean"]
|
| 105 |
+
metrics["cv_f1_mean"] = round(cv_mean, 4)
|
| 106 |
+
metrics["cv_f1_std"] = round(cv_results.get("cv_f1_std", 0), 4)
|
| 107 |
+
metrics["cv_test_gap_pp"]= round(abs(cv_mean - metrics["f1_weighted"]) * 100, 2)
|
| 108 |
+
|
| 109 |
+
self._print_summary(metrics)
|
| 110 |
+
return metrics
|
| 111 |
+
|
| 112 |
+
# ── Visualizaciones ──────────────────────────────────────────────────────
|
| 113 |
+
def plot_confusion_matrix(
|
| 114 |
+
self,
|
| 115 |
+
y_test,
|
| 116 |
+
y_pred,
|
| 117 |
+
model_name: str,
|
| 118 |
+
save: bool = True,
|
| 119 |
+
) -> Path | None:
|
| 120 |
+
"""Genera y guarda la matriz de confusión."""
|
| 121 |
+
cm = confusion_matrix(y_test, y_pred)
|
| 122 |
+
fig, ax = plt.subplots(figsize=(5, 4))
|
| 123 |
+
sns.heatmap(
|
| 124 |
+
cm, annot=True, fmt="d", cmap="Blues", ax=ax,
|
| 125 |
+
xticklabels=["No tóxico", "Tóxico"],
|
| 126 |
+
yticklabels=["No tóxico", "Tóxico"],
|
| 127 |
+
linewidths=0.5,
|
| 128 |
+
)
|
| 129 |
+
ax.set_title(f"{model_name} — Confusion Matrix", fontweight="bold")
|
| 130 |
+
ax.set_xlabel("Predicción")
|
| 131 |
+
ax.set_ylabel("Real")
|
| 132 |
+
plt.tight_layout()
|
| 133 |
+
|
| 134 |
+
if save:
|
| 135 |
+
safe = model_name.lower().replace(" ", "_").replace("/", "_")
|
| 136 |
+
path = self.output_dir / f"cm_{safe}.png"
|
| 137 |
+
plt.savefig(path, dpi=150, bbox_inches="tight")
|
| 138 |
+
plt.show()
|
| 139 |
+
logger.info(f"Confusion matrix guardada: {path}")
|
| 140 |
+
return path
|
| 141 |
+
|
| 142 |
+
plt.show()
|
| 143 |
+
return None
|
| 144 |
+
|
| 145 |
+
def plot_roc_curve(
|
| 146 |
+
self,
|
| 147 |
+
y_test,
|
| 148 |
+
y_proba,
|
| 149 |
+
model_name: str,
|
| 150 |
+
save: bool = True,
|
| 151 |
+
) -> Path | None:
|
| 152 |
+
"""Genera y guarda la curva ROC."""
|
| 153 |
+
fig, ax = plt.subplots(figsize=(6, 5))
|
| 154 |
+
RocCurveDisplay.from_predictions(
|
| 155 |
+
y_test, y_proba, ax=ax, name=model_name, color="#7F77DD"
|
| 156 |
+
)
|
| 157 |
+
ax.plot([0, 1], [0, 1], "--", color="gray", alpha=0.5, label="Random")
|
| 158 |
+
ax.set_title(f"{model_name} — Curva ROC", fontweight="bold")
|
| 159 |
+
ax.legend()
|
| 160 |
+
plt.tight_layout()
|
| 161 |
+
|
| 162 |
+
if save:
|
| 163 |
+
safe = model_name.lower().replace(" ", "_").replace("/", "_")
|
| 164 |
+
path = self.output_dir / f"roc_{safe}.png"
|
| 165 |
+
plt.savefig(path, dpi=150, bbox_inches="tight")
|
| 166 |
+
plt.show()
|
| 167 |
+
logger.info(f"Curva ROC guardada: {path}")
|
| 168 |
+
return path
|
| 169 |
+
|
| 170 |
+
plt.show()
|
| 171 |
+
return None
|
| 172 |
+
|
| 173 |
+
# ── Análisis de errores ──────────────────────────────────────────────────
|
| 174 |
+
def error_analysis(
|
| 175 |
+
self,
|
| 176 |
+
X_test,
|
| 177 |
+
y_test,
|
| 178 |
+
y_pred,
|
| 179 |
+
y_proba,
|
| 180 |
+
n_examples: int = 5,
|
| 181 |
+
) -> dict:
|
| 182 |
+
"""
|
| 183 |
+
Analiza los falsos positivos y falsos negativos más relevantes.
|
| 184 |
+
|
| 185 |
+
FP → comentarios OK que el modelo censura (peor UX)
|
| 186 |
+
FN → hate speech que se escapa (peor para el objetivo del proyecto)
|
| 187 |
+
"""
|
| 188 |
+
texts = np.array(X_test) if not isinstance(X_test, np.ndarray) else X_test
|
| 189 |
+
y_arr = np.array(y_test)
|
| 190 |
+
|
| 191 |
+
error_df = pd.DataFrame({
|
| 192 |
+
"text" : texts,
|
| 193 |
+
"real" : y_arr,
|
| 194 |
+
"pred" : y_pred,
|
| 195 |
+
"prob_toxic": y_proba,
|
| 196 |
+
})
|
| 197 |
+
|
| 198 |
+
fp = error_df[(error_df["real"] == 0) & (error_df["pred"] == 1)]
|
| 199 |
+
fn = error_df[(error_df["real"] == 1) & (error_df["pred"] == 0)]
|
| 200 |
+
|
| 201 |
+
logger.info(f"Errores: FP={len(fp)} | FN={len(fn)}")
|
| 202 |
+
|
| 203 |
+
print(f"\n{'='*65}")
|
| 204 |
+
print(f"FALSOS NEGATIVOS — hate speech que NO detectó ({len(fn)} total)")
|
| 205 |
+
print(f"{'='*65}")
|
| 206 |
+
for _, row in fn.nsmallest(n_examples, "prob_toxic").iterrows():
|
| 207 |
+
print(f" Prob: {row['prob_toxic']:.3f} | {row['text'][:110]}")
|
| 208 |
+
print()
|
| 209 |
+
|
| 210 |
+
print(f"{'='*65}")
|
| 211 |
+
print(f"FALSOS POSITIVOS — comentarios OK censurados ({len(fp)} total)")
|
| 212 |
+
print(f"{'='*65}")
|
| 213 |
+
for _, row in fp.nlargest(n_examples, "prob_toxic").iterrows():
|
| 214 |
+
print(f" Prob: {row['prob_toxic']:.3f} | {row['text'][:110]}")
|
| 215 |
+
print()
|
| 216 |
+
|
| 217 |
+
return {"fp_examples": fp.head(n_examples).to_dict("records"),
|
| 218 |
+
"fn_examples": fn.head(n_examples).to_dict("records")}
|
| 219 |
+
|
| 220 |
+
# ── Reports ──────────────────────────────────────────────────────────────
|
| 221 |
+
def save_report(self, metrics: dict, experiment_id: str) -> Path:
|
| 222 |
+
"""Guarda las métricas de un experimento en JSON."""
|
| 223 |
+
path = self.output_dir / f"{experiment_id}.json"
|
| 224 |
+
with open(path, "w") as f:
|
| 225 |
+
json.dump(metrics, f, indent=2)
|
| 226 |
+
logger.info(f"Report guardado: {path}")
|
| 227 |
+
return path
|
| 228 |
+
|
| 229 |
+
def save_summary(self, all_metrics: list[dict], path: str | Path = None) -> Path:
|
| 230 |
+
"""
|
| 231 |
+
Guarda un CSV con todos los experimentos para comparar.
|
| 232 |
+
Este es el 'reports/summary.csv' que mencionaba el roadmap.
|
| 233 |
+
"""
|
| 234 |
+
path = Path(path or self.output_dir / "summary.csv")
|
| 235 |
+
df = pd.DataFrame(all_metrics)
|
| 236 |
+
|
| 237 |
+
# Ordenar por F1 descendente
|
| 238 |
+
if "f1_weighted" in df.columns:
|
| 239 |
+
df = df.sort_values("f1_weighted", ascending=False)
|
| 240 |
+
|
| 241 |
+
df.to_csv(path, index=False)
|
| 242 |
+
logger.info(f"Summary guardado: {path}")
|
| 243 |
+
print(df[["model", "f1_weighted", "roc_auc", "fp", "fn"]].to_string(index=False))
|
| 244 |
+
return path
|
| 245 |
+
|
| 246 |
+
# ── Interno ──────────────────────────────────────────────────────────────
|
| 247 |
+
def _print_summary(self, metrics: dict) -> None:
|
| 248 |
+
gap_str = ""
|
| 249 |
+
if "cv_test_gap_pp" in metrics:
|
| 250 |
+
ok = "✅" if metrics["cv_test_gap_pp"] < 5 else "⚠️"
|
| 251 |
+
gap_str = f"CV-test gap: {metrics['cv_test_gap_pp']:.2f}pp {ok}"
|
| 252 |
+
elif "train_test_gap_pp" in metrics:
|
| 253 |
+
ok = "✅" if metrics["train_test_gap_pp"] < 5 else "⚠️"
|
| 254 |
+
gap_str = f"Train-test gap: {metrics['train_test_gap_pp']:.2f}pp {ok}"
|
| 255 |
+
|
| 256 |
+
print(f"\n{'='*55}")
|
| 257 |
+
print(f"RESULTADOS — {metrics['model']}")
|
| 258 |
+
print(f"{'='*55}")
|
| 259 |
+
print(f" F1 weighted : {metrics['f1_weighted']:.4f}")
|
| 260 |
+
print(f" ROC-AUC : {metrics['roc_auc']:.4f}")
|
| 261 |
+
print(f" FP / FN : {metrics['fp']} / {metrics['fn']}")
|
| 262 |
+
if gap_str:
|
| 263 |
+
print(f" {gap_str}")
|
| 264 |
+
print(f"{'='*55}")
|
src/features/.gitkeep
DELETED
|
File without changes
|
src/features/text_preprocessor.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
src/features/text_preprocessor.py
|
| 3 |
+
|
| 4 |
+
Pipeline de preprocesamiento NLP.
|
| 5 |
+
Traducción directa del notebook 02 a código de producción.
|
| 6 |
+
|
| 7 |
+
Pasos:
|
| 8 |
+
1. Lowercase
|
| 9 |
+
2. Regex: URLs, @menciones, \\xa0, apostrofes, números
|
| 10 |
+
3. spaCy: lematización (en_core_web_sm)
|
| 11 |
+
4. NLTK: filtrado stopwords english + custom
|
| 12 |
+
|
| 13 |
+
Uso:
|
| 14 |
+
preprocessor = TextPreprocessor()
|
| 15 |
+
clean_series = preprocessor.transform(df["Text"])
|
| 16 |
+
clean_text = preprocessor.transform("texto crudo aqui")
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
import re
|
| 20 |
+
import yaml
|
| 21 |
+
import nltk
|
| 22 |
+
import spacy
|
| 23 |
+
import pandas as pd
|
| 24 |
+
from pathlib import Path
|
| 25 |
+
from nltk.corpus import stopwords
|
| 26 |
+
from src.utils.logger import get_logger
|
| 27 |
+
|
| 28 |
+
logger = get_logger(__name__)
|
| 29 |
+
|
| 30 |
+
# Descargar recursos NLTK si no existen
|
| 31 |
+
for resource in ["stopwords", "punkt"]:
|
| 32 |
+
nltk.download(resource, quiet=True)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class TextPreprocessor:
|
| 36 |
+
"""
|
| 37 |
+
Pipeline NLP para hate speech detection.
|
| 38 |
+
Lee su configuración de configs/features.yaml.
|
| 39 |
+
"""
|
| 40 |
+
|
| 41 |
+
# Stopwords custom: palabras frecuentes sin valor discriminante
|
| 42 |
+
# en el dominio YouTube. No son stopwords generales.
|
| 43 |
+
CUSTOM_STOPWORDS = {
|
| 44 |
+
"youtube", "video", "watch", "like", "comment",
|
| 45 |
+
"channel", "click", "subscribe", "link",
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
def __init__(self, config_path: str = "configs/features.yaml"):
|
| 49 |
+
# Cargar config
|
| 50 |
+
with open(config_path) as f:
|
| 51 |
+
cfg = yaml.safe_load(f)["preprocessing"]
|
| 52 |
+
self.cfg = cfg
|
| 53 |
+
|
| 54 |
+
# Stopwords: NLTK + custom
|
| 55 |
+
self.stop_words = set(stopwords.words("english")) | self.CUSTOM_STOPWORDS
|
| 56 |
+
self.min_len = cfg.get("min_token_length", 2)
|
| 57 |
+
|
| 58 |
+
# Cargar modelo spaCy
|
| 59 |
+
# disable=["parser","ner"] → solo usamos el lemmatizer, más rápido
|
| 60 |
+
self.nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
|
| 61 |
+
logger.info(f"TextPreprocessor iniciado — spaCy {self.nlp.meta['version']}")
|
| 62 |
+
|
| 63 |
+
# ── Pasos individuales ────────────────────────────────────────────────────
|
| 64 |
+
|
| 65 |
+
def _lowercase(self, text: str) -> str:
|
| 66 |
+
"""Paso 1: minúsculas. 'BLACK' y 'black' son la misma feature."""
|
| 67 |
+
return str(text).lower()
|
| 68 |
+
|
| 69 |
+
def _clean_regex(self, text: str) -> str:
|
| 70 |
+
"""
|
| 71 |
+
Paso 2: elimina ruido estructural con regex.
|
| 72 |
+
Orden importante: primero lo más específico, luego lo general.
|
| 73 |
+
"""
|
| 74 |
+
text = re.sub(r"http\S+|www\.\S+", "", text) # URLs
|
| 75 |
+
text = re.sub(r"@\w+", "", text) # @menciones
|
| 76 |
+
text = re.sub(r"[\n\t\r]", " ", text) # saltos de línea
|
| 77 |
+
text = re.sub(r"[^\x00-\x7F]+", " ", text) # \xa0, emojis
|
| 78 |
+
text = re.sub(r"'", "", text) # apóstrofes
|
| 79 |
+
text = re.sub(r"\b\d+\b", "", text) # números solos
|
| 80 |
+
text = re.sub(r"\s+", " ", text) # espacios múltiples
|
| 81 |
+
return text.strip()
|
| 82 |
+
|
| 83 |
+
def _lemmatize(self, text: str) -> str:
|
| 84 |
+
"""
|
| 85 |
+
Paso 3+4: lematización con spaCy + filtrado de stopwords con NLTK.
|
| 86 |
+
|
| 87 |
+
Por qué spaCy para lematizar:
|
| 88 |
+
Entiende gramática: 'running'→'run', 'cops'→'cop'
|
| 89 |
+
Un stemmer de NLTK simplemente corta: 'running'→'runn'
|
| 90 |
+
|
| 91 |
+
Por qué NLTK para stopwords:
|
| 92 |
+
Lista curada de 179 palabras funcionales.
|
| 93 |
+
Más fácil de personalizar que la lista interna de spaCy.
|
| 94 |
+
|
| 95 |
+
DECISIÓN del EDA: NO eliminar 'black','white','police','cop'
|
| 96 |
+
→ Aparecen en ambas clases con contexto distinto.
|
| 97 |
+
El modelo necesita verlas para aprender por bigrams.
|
| 98 |
+
"""
|
| 99 |
+
doc = self.nlp(text)
|
| 100 |
+
tokens = [
|
| 101 |
+
token.lemma_
|
| 102 |
+
for token in doc
|
| 103 |
+
if not token.is_punct
|
| 104 |
+
and not token.is_space
|
| 105 |
+
and len(token.text) >= self.min_len
|
| 106 |
+
and token.lemma_ not in self.stop_words
|
| 107 |
+
]
|
| 108 |
+
return " ".join(tokens)
|
| 109 |
+
|
| 110 |
+
def _transform_one(self, text: str) -> str:
|
| 111 |
+
text = self._lowercase(text)
|
| 112 |
+
text = self._clean_regex(text)
|
| 113 |
+
text = self._lemmatize(text)
|
| 114 |
+
return text
|
| 115 |
+
|
| 116 |
+
# ── Interfaz pública ──────────────────────────────────────────────────────
|
| 117 |
+
|
| 118 |
+
def transform(self, data) -> str | pd.Series:
|
| 119 |
+
"""
|
| 120 |
+
Preprocesa un texto o una Serie completa.
|
| 121 |
+
|
| 122 |
+
Args:
|
| 123 |
+
data: str o pd.Series con textos crudos.
|
| 124 |
+
|
| 125 |
+
Returns:
|
| 126 |
+
str o pd.Series con textos limpios y lematizados.
|
| 127 |
+
"""
|
| 128 |
+
if isinstance(data, pd.Series):
|
| 129 |
+
logger.info(f"Preprocesando {len(data)} textos...")
|
| 130 |
+
result = data.apply(self._transform_one)
|
| 131 |
+
empty = (result == "").sum()
|
| 132 |
+
if empty > 0:
|
| 133 |
+
logger.warning(f" {empty} textos quedaron vacíos tras limpieza")
|
| 134 |
+
return result
|
| 135 |
+
return self._transform_one(data)
|
src/features/vectorizer.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
src/features/vectorizer.py
|
| 3 |
+
|
| 4 |
+
Vectorizador configurable desde YAML.
|
| 5 |
+
Traducción directa del notebook 03 a código de producción.
|
| 6 |
+
|
| 7 |
+
Decisión del proyecto: TF-IDF con ngram=(1,2) y max_features=5000.
|
| 8 |
+
Justificación:
|
| 9 |
+
- Bigramas capturan contexto: 'black thug' es distinto a 'black' solo
|
| 10 |
+
- max_features=5000 equilibra vocabulario vs overfitting (800 muestras train)
|
| 11 |
+
- sublinear_tf=True evita que repetir una palabra infle artificialmente su peso
|
| 12 |
+
|
| 13 |
+
Uso:
|
| 14 |
+
vec = Vectorizer()
|
| 15 |
+
X_train_vec = vec.fit_transform(X_train_text)
|
| 16 |
+
X_test_vec = vec.transform(X_test_text)
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
import yaml
|
| 20 |
+
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
|
| 21 |
+
from src.utils.logger import get_logger
|
| 22 |
+
|
| 23 |
+
logger = get_logger(__name__)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class Vectorizer:
|
| 27 |
+
"""
|
| 28 |
+
Wrapper sobre TfidfVectorizer / CountVectorizer.
|
| 29 |
+
Parámetros controlados por configs/features.yaml.
|
| 30 |
+
|
| 31 |
+
Regla crítica: fit() SOLO sobre train, transform() sobre train y test.
|
| 32 |
+
Si se hace fit sobre todo el dataset antes del split → data leakage.
|
| 33 |
+
"""
|
| 34 |
+
|
| 35 |
+
def __init__(self, config_path: str = "configs/features.yaml", method: str = None):
|
| 36 |
+
with open(config_path) as f:
|
| 37 |
+
cfg = yaml.safe_load(f)["vectorization"]
|
| 38 |
+
|
| 39 |
+
self.method = method or cfg.get("method", "tfidf")
|
| 40 |
+
c = cfg[self.method]
|
| 41 |
+
|
| 42 |
+
if self.method == "tfidf":
|
| 43 |
+
self.vectorizer = TfidfVectorizer(
|
| 44 |
+
max_features = c["max_features"],
|
| 45 |
+
ngram_range = tuple(c["ngram_range"]),
|
| 46 |
+
sublinear_tf = c.get("sublinear_tf", True),
|
| 47 |
+
min_df = c.get("min_df", 3),
|
| 48 |
+
analyzer = "word",
|
| 49 |
+
strip_accents = "unicode",
|
| 50 |
+
)
|
| 51 |
+
else:
|
| 52 |
+
self.vectorizer = CountVectorizer(
|
| 53 |
+
max_features = c["max_features"],
|
| 54 |
+
ngram_range = tuple(c["ngram_range"]),
|
| 55 |
+
min_df = c.get("min_df", 3),
|
| 56 |
+
analyzer = "word",
|
| 57 |
+
strip_accents = "unicode",
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
+
logger.info(f"Vectorizer: {self.method} | max_features={c['max_features']} | ngram={c['ngram_range']}")
|
| 61 |
+
|
| 62 |
+
def fit_transform(self, X_train):
|
| 63 |
+
"""Ajusta el vocabulario y transforma el train set."""
|
| 64 |
+
logger.info("Vectorizando train set...")
|
| 65 |
+
matrix = self.vectorizer.fit_transform(X_train)
|
| 66 |
+
logger.info(f" Shape: {matrix.shape} | Sparsidad: {1 - matrix.nnz/(matrix.shape[0]*matrix.shape[1]):.1%}")
|
| 67 |
+
return matrix
|
| 68 |
+
|
| 69 |
+
def transform(self, X):
|
| 70 |
+
"""Transforma sin ajustar (para test/producción)."""
|
| 71 |
+
return self.vectorizer.transform(X)
|
| 72 |
+
|
| 73 |
+
def get_feature_names(self):
|
| 74 |
+
return self.vectorizer.get_feature_names_out()
|
| 75 |
+
|
| 76 |
+
@property
|
| 77 |
+
def vocabulary_size(self) -> int:
|
| 78 |
+
return len(self.vectorizer.vocabulary_)
|
src/models/baseline.py
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
src/models/baseline.py
|
| 3 |
+
|
| 4 |
+
Modelos clásicos de ML para clasificación de texto.
|
| 5 |
+
Traducción directa de notebooks 04 y 05.
|
| 6 |
+
|
| 7 |
+
Todos los modelos siguen la misma interfaz:
|
| 8 |
+
model.fit(X_train, y_train)
|
| 9 |
+
model.predict(X)
|
| 10 |
+
model.predict_proba(X)
|
| 11 |
+
model.save(path)
|
| 12 |
+
Model.load(path)
|
| 13 |
+
|
| 14 |
+
Uso desde el pipeline:
|
| 15 |
+
model = build_model("lr", config_path="configs/models.yaml")
|
| 16 |
+
model.fit(X_train_vec, y_train)
|
| 17 |
+
preds = model.predict(X_test_vec)
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
import yaml
|
| 21 |
+
import joblib
|
| 22 |
+
import numpy as np
|
| 23 |
+
from pathlib import Path
|
| 24 |
+
from sklearn.linear_model import LogisticRegression
|
| 25 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 26 |
+
from sklearn.pipeline import Pipeline
|
| 27 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 28 |
+
from sklearn.model_selection import StratifiedKFold, cross_validate
|
| 29 |
+
from src.utils.logger import get_logger
|
| 30 |
+
|
| 31 |
+
logger = get_logger(__name__)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
# ── Clase base ────────────────────────────────────────────────────────────────
|
| 35 |
+
class BaseSklearnModel:
|
| 36 |
+
"""
|
| 37 |
+
Interfaz común para todos los modelos sklearn del proyecto.
|
| 38 |
+
Hereda LRModel y EnsembleModel.
|
| 39 |
+
"""
|
| 40 |
+
|
| 41 |
+
def __init__(self):
|
| 42 |
+
self.pipeline = None # sklearn Pipeline (TF-IDF + clf)
|
| 43 |
+
self.is_fitted = False
|
| 44 |
+
|
| 45 |
+
def fit(self, X_train, y_train) -> "BaseSklearnModel":
|
| 46 |
+
"""Entrena el pipeline completo."""
|
| 47 |
+
logger.info(f"Entrenando {self.__class__.__name__}...")
|
| 48 |
+
self.pipeline.fit(X_train, y_train)
|
| 49 |
+
self.is_fitted = True
|
| 50 |
+
logger.info(" Entrenamiento completado")
|
| 51 |
+
return self
|
| 52 |
+
|
| 53 |
+
def predict(self, X) -> np.ndarray:
|
| 54 |
+
self._check_fitted()
|
| 55 |
+
return self.pipeline.predict(X)
|
| 56 |
+
|
| 57 |
+
def predict_proba(self, X) -> np.ndarray:
|
| 58 |
+
self._check_fitted()
|
| 59 |
+
return self.pipeline.predict_proba(X)
|
| 60 |
+
|
| 61 |
+
def cross_validate(self, X_train, y_train, cv_folds: int = 5, rand: int = 42) -> dict:
|
| 62 |
+
"""
|
| 63 |
+
Evaluación con StratifiedKFold.
|
| 64 |
+
Devuelve medias y desviaciones estándar de las métricas.
|
| 65 |
+
"""
|
| 66 |
+
cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=rand)
|
| 67 |
+
results = cross_validate(
|
| 68 |
+
self.pipeline, X_train, y_train,
|
| 69 |
+
cv=cv,
|
| 70 |
+
scoring={"f1": "f1_weighted", "roc_auc": "roc_auc"},
|
| 71 |
+
return_train_score=True,
|
| 72 |
+
n_jobs=-1,
|
| 73 |
+
)
|
| 74 |
+
summary = {
|
| 75 |
+
"cv_f1_mean" : results["test_f1"].mean(),
|
| 76 |
+
"cv_f1_std" : results["test_f1"].std(),
|
| 77 |
+
"cv_roc_mean" : results["test_roc_auc"].mean(),
|
| 78 |
+
"train_f1_mean" : results["train_f1"].mean(),
|
| 79 |
+
"gap_pp" : (results["train_f1"].mean() - results["test_f1"].mean()) * 100,
|
| 80 |
+
}
|
| 81 |
+
logger.info(
|
| 82 |
+
f" CV F1: {summary['cv_f1_mean']:.4f} ± {summary['cv_f1_std']:.4f} | "
|
| 83 |
+
f"Gap: {summary['gap_pp']:.1f}pp"
|
| 84 |
+
)
|
| 85 |
+
return summary
|
| 86 |
+
|
| 87 |
+
def save(self, path: str | Path) -> None:
|
| 88 |
+
path = Path(path)
|
| 89 |
+
path.parent.mkdir(parents=True, exist_ok=True)
|
| 90 |
+
joblib.dump(self.pipeline, path)
|
| 91 |
+
logger.info(f"Modelo guardado: {path}")
|
| 92 |
+
|
| 93 |
+
@classmethod
|
| 94 |
+
def load(cls, path: str | Path) -> "BaseSklearnModel":
|
| 95 |
+
path = Path(path)
|
| 96 |
+
if not path.exists():
|
| 97 |
+
raise FileNotFoundError(f"Modelo no encontrado: {path}")
|
| 98 |
+
instance = cls.__new__(cls)
|
| 99 |
+
instance.pipeline = joblib.load(path)
|
| 100 |
+
instance.is_fitted = True
|
| 101 |
+
logger.info(f"Modelo cargado: {path}")
|
| 102 |
+
return instance
|
| 103 |
+
|
| 104 |
+
def _check_fitted(self):
|
| 105 |
+
if not self.is_fitted:
|
| 106 |
+
raise RuntimeError("El modelo no está entrenado. Llama a .fit() primero.")
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
# ── Logistic Regression ────────────────────────────────────────────────────────
|
| 110 |
+
class LRModel(BaseSklearnModel):
|
| 111 |
+
"""
|
| 112 |
+
Logistic Regression + TF-IDF.
|
| 113 |
+
|
| 114 |
+
Mejor modelo del proyecto (notebook 06):
|
| 115 |
+
F1 test = 0.7579 | CV-test gap = 4.76pp
|
| 116 |
+
Parámetros optimizados con Optuna sobre configs/best_params.yaml.
|
| 117 |
+
"""
|
| 118 |
+
|
| 119 |
+
def __init__(
|
| 120 |
+
self,
|
| 121 |
+
config_path: str = "configs/models.yaml",
|
| 122 |
+
feat_config_path: str = "configs/features.yaml",
|
| 123 |
+
best_params_path: str = "configs/best_params.yaml",
|
| 124 |
+
):
|
| 125 |
+
super().__init__()
|
| 126 |
+
|
| 127 |
+
# Intentar cargar best_params.yaml (resultado de Optuna)
|
| 128 |
+
try:
|
| 129 |
+
import yaml as _yaml
|
| 130 |
+
with open(best_params_path) as f:
|
| 131 |
+
best = _yaml.safe_load(f)
|
| 132 |
+
bp = best.get("hyperparameters", {})
|
| 133 |
+
logger.info("Parámetros cargados desde best_params.yaml")
|
| 134 |
+
except FileNotFoundError:
|
| 135 |
+
bp = {}
|
| 136 |
+
logger.warning("best_params.yaml no encontrado — usando config por defecto")
|
| 137 |
+
|
| 138 |
+
# Config base
|
| 139 |
+
with open(config_path) as f:
|
| 140 |
+
mod_cfg = yaml.safe_load(f)["models"]["logistic_regression"]
|
| 141 |
+
with open(feat_config_path) as f:
|
| 142 |
+
vec_cfg = yaml.safe_load(f)["vectorization"]["tfidf"]
|
| 143 |
+
|
| 144 |
+
# Prioridad: best_params > yaml config
|
| 145 |
+
ngram_str = str(bp.get("ngram_range", "1_2"))
|
| 146 |
+
ngram = (1, 1) if ngram_str == "1_1" else (1, 2)
|
| 147 |
+
|
| 148 |
+
self.pipeline = Pipeline([
|
| 149 |
+
("tfidf", TfidfVectorizer(
|
| 150 |
+
max_features = bp.get("max_features", vec_cfg["max_features"]),
|
| 151 |
+
ngram_range = ngram,
|
| 152 |
+
sublinear_tf = bp.get("sublinear_tf", vec_cfg["sublinear_tf"]),
|
| 153 |
+
min_df = bp.get("min_df", vec_cfg["min_df"]),
|
| 154 |
+
analyzer = "word",
|
| 155 |
+
strip_accents = "unicode",
|
| 156 |
+
)),
|
| 157 |
+
("clf", LogisticRegression(
|
| 158 |
+
C = bp.get("C", mod_cfg["C"]),
|
| 159 |
+
max_iter = mod_cfg["max_iter"],
|
| 160 |
+
class_weight = mod_cfg["class_weight"],
|
| 161 |
+
solver = mod_cfg["solver"],
|
| 162 |
+
random_state = 42,
|
| 163 |
+
)),
|
| 164 |
+
])
|
| 165 |
+
logger.info(f"LRModel creado — C={bp.get('C', mod_cfg['C']):.4f} | ngram={ngram}")
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
# ── Random Forest ──────────────────────────────────────────────────────────────
|
| 169 |
+
class RFModel(BaseSklearnModel):
|
| 170 |
+
"""
|
| 171 |
+
Random Forest + TF-IDF.
|
| 172 |
+
Parámetros desde configs/models.yaml.
|
| 173 |
+
"""
|
| 174 |
+
|
| 175 |
+
def __init__(
|
| 176 |
+
self,
|
| 177 |
+
config_path: str = "configs/models.yaml",
|
| 178 |
+
feat_config_path: str = "configs/features.yaml",
|
| 179 |
+
):
|
| 180 |
+
super().__init__()
|
| 181 |
+
|
| 182 |
+
with open(config_path) as f:
|
| 183 |
+
rf_cfg = yaml.safe_load(f)["models"]["random_forest"]
|
| 184 |
+
with open(feat_config_path) as f:
|
| 185 |
+
vec_cfg = yaml.safe_load(f)["vectorization"]["tfidf"]
|
| 186 |
+
|
| 187 |
+
self.pipeline = Pipeline([
|
| 188 |
+
("tfidf", TfidfVectorizer(
|
| 189 |
+
max_features = vec_cfg["max_features"],
|
| 190 |
+
ngram_range = (1, 1), # RF + bigramas es muy lento
|
| 191 |
+
sublinear_tf = vec_cfg["sublinear_tf"],
|
| 192 |
+
min_df = vec_cfg["min_df"],
|
| 193 |
+
analyzer = "word",
|
| 194 |
+
strip_accents = "unicode",
|
| 195 |
+
)),
|
| 196 |
+
("clf", RandomForestClassifier(
|
| 197 |
+
n_estimators = rf_cfg["n_estimators"],
|
| 198 |
+
max_depth = rf_cfg.get("max_depth", 8),
|
| 199 |
+
min_samples_leaf = rf_cfg.get("min_samples_leaf", 4),
|
| 200 |
+
max_features = "sqrt",
|
| 201 |
+
class_weight = rf_cfg["class_weight"],
|
| 202 |
+
random_state = 42,
|
| 203 |
+
n_jobs = -1,
|
| 204 |
+
)),
|
| 205 |
+
])
|
| 206 |
+
logger.info("RFModel creado")
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
# ── XGBoost ───────────────────────────────────────────────────────────────────
|
| 210 |
+
class XGBModel(BaseSklearnModel):
|
| 211 |
+
"""
|
| 212 |
+
XGBoost + TF-IDF.
|
| 213 |
+
Requiere: pip install xgboost
|
| 214 |
+
"""
|
| 215 |
+
|
| 216 |
+
def __init__(
|
| 217 |
+
self,
|
| 218 |
+
config_path: str = "configs/models.yaml",
|
| 219 |
+
feat_config_path: str = "configs/features.yaml",
|
| 220 |
+
):
|
| 221 |
+
super().__init__()
|
| 222 |
+
|
| 223 |
+
try:
|
| 224 |
+
from xgboost import XGBClassifier
|
| 225 |
+
except ImportError:
|
| 226 |
+
raise ImportError("Instala XGBoost: pip install xgboost")
|
| 227 |
+
|
| 228 |
+
with open(config_path) as f:
|
| 229 |
+
xgb_cfg = yaml.safe_load(f)["models"]["xgboost"]
|
| 230 |
+
with open(feat_config_path) as f:
|
| 231 |
+
vec_cfg = yaml.safe_load(f)["vectorization"]["tfidf"]
|
| 232 |
+
|
| 233 |
+
self.pipeline = Pipeline([
|
| 234 |
+
("tfidf", TfidfVectorizer(
|
| 235 |
+
max_features = vec_cfg["max_features"],
|
| 236 |
+
ngram_range = (1, 1),
|
| 237 |
+
sublinear_tf = True,
|
| 238 |
+
min_df = vec_cfg["min_df"],
|
| 239 |
+
analyzer = "word",
|
| 240 |
+
strip_accents = "unicode",
|
| 241 |
+
)),
|
| 242 |
+
("clf", XGBClassifier(
|
| 243 |
+
n_estimators = xgb_cfg.get("n_estimators", 200),
|
| 244 |
+
max_depth = xgb_cfg.get("max_depth", 3),
|
| 245 |
+
learning_rate = xgb_cfg.get("learning_rate", 0.05),
|
| 246 |
+
subsample = xgb_cfg.get("subsample", 0.8),
|
| 247 |
+
colsample_bytree = xgb_cfg.get("colsample_bytree", 0.8),
|
| 248 |
+
use_label_encoder= False,
|
| 249 |
+
eval_metric = "logloss",
|
| 250 |
+
random_state = 42,
|
| 251 |
+
verbosity = 0,
|
| 252 |
+
)),
|
| 253 |
+
])
|
| 254 |
+
logger.info("XGBModel creado")
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
# ── Factory ───────────────────────────────────────────────────────────────────
|
| 258 |
+
def build_model(
|
| 259 |
+
model_type: str,
|
| 260 |
+
config_path: str = "configs/models.yaml",
|
| 261 |
+
feat_config_path: str = "configs/features.yaml",
|
| 262 |
+
best_params_path: str = "configs/best_params.yaml",
|
| 263 |
+
) -> BaseSklearnModel:
|
| 264 |
+
"""
|
| 265 |
+
Construye el modelo indicado en la configuración.
|
| 266 |
+
|
| 267 |
+
Args:
|
| 268 |
+
model_type: "lr" | "rf" | "xgboost"
|
| 269 |
+
|
| 270 |
+
Returns:
|
| 271 |
+
Instancia del modelo listo para .fit()
|
| 272 |
+
"""
|
| 273 |
+
builders = {
|
| 274 |
+
"lr" : lambda: LRModel(config_path, feat_config_path, best_params_path),
|
| 275 |
+
"rf" : lambda: RFModel(config_path, feat_config_path),
|
| 276 |
+
"xgboost": lambda: XGBModel(config_path, feat_config_path),
|
| 277 |
+
}
|
| 278 |
+
if model_type not in builders:
|
| 279 |
+
raise ValueError(f"model_type debe ser uno de: {list(builders.keys())}")
|
| 280 |
+
|
| 281 |
+
logger.info(f"Construyendo modelo: {model_type}")
|
| 282 |
+
return builders[model_type]()
|
src/pipeline/.gitkeep
DELETED
|
File without changes
|
src/pipeline/run_pipeline.py
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
src/pipeline/run_pipeline.py
|
| 3 |
+
|
| 4 |
+
Pipeline end-to-end de entrenamiento y evaluación.
|
| 5 |
+
Ejecutar con: python -m src.pipeline.run_pipeline [--model lr|rf|xgboost]
|
| 6 |
+
|
| 7 |
+
Fases:
|
| 8 |
+
1. Carga de datos
|
| 9 |
+
2. Split train/test
|
| 10 |
+
3. Preprocesamiento (spaCy + NLTK)
|
| 11 |
+
4. Entrenamiento (modelo elegido desde config o argumento)
|
| 12 |
+
5. Cross-validation
|
| 13 |
+
6. Evaluación final en test
|
| 14 |
+
7. Guardado del modelo
|
| 15 |
+
8. Registro en MLflow
|
| 16 |
+
9. Informe JSON + CSV resumen
|
| 17 |
+
|
| 18 |
+
Todo se controla desde:
|
| 19 |
+
configs/pipeline.yaml → rutas, split, cv_folds
|
| 20 |
+
configs/features.yaml → preprocesamiento y vectorización
|
| 21 |
+
configs/models.yaml → hiperparámetros
|
| 22 |
+
configs/best_params.yaml → resultado de Optuna (si existe)
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
import argparse
|
| 26 |
+
import sys
|
| 27 |
+
import yaml
|
| 28 |
+
import mlflow
|
| 29 |
+
import mlflow.sklearn
|
| 30 |
+
from pathlib import Path
|
| 31 |
+
from datetime import datetime
|
| 32 |
+
from sklearn.model_selection import train_test_split
|
| 33 |
+
|
| 34 |
+
# ── Setup path ────────────────────────────────────────────────────────────────
|
| 35 |
+
PROJECT_ROOT = Path(__file__).resolve().parents[2]
|
| 36 |
+
sys.path.insert(0, str(PROJECT_ROOT))
|
| 37 |
+
|
| 38 |
+
from src.data.loader import load_raw_data
|
| 39 |
+
from src.features.text_preprocessor import TextPreprocessor
|
| 40 |
+
from src.models.baseline import build_model
|
| 41 |
+
from src.evaluation.evaluator import Evaluator
|
| 42 |
+
from src.utils.logger import get_logger
|
| 43 |
+
|
| 44 |
+
logger = get_logger(__name__)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 48 |
+
# PIPELINE
|
| 49 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 50 |
+
def run_pipeline(model_type: str = "lr") -> dict:
|
| 51 |
+
"""
|
| 52 |
+
Ejecuta el pipeline completo de ML.
|
| 53 |
+
|
| 54 |
+
Args:
|
| 55 |
+
model_type: "lr" | "rf" | "xgboost"
|
| 56 |
+
|
| 57 |
+
Returns:
|
| 58 |
+
Dict con las métricas del modelo entrenado.
|
| 59 |
+
"""
|
| 60 |
+
run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 61 |
+
logger.info("=" * 60)
|
| 62 |
+
logger.info(f"🚀 PIPELINE — model={model_type} | run={run_id}")
|
| 63 |
+
logger.info("=" * 60)
|
| 64 |
+
|
| 65 |
+
# ── Cargar configuración ──────────────────────────────────────────────────
|
| 66 |
+
cfg_pipe = yaml.safe_load(open(PROJECT_ROOT / "configs" / "pipeline.yaml"))
|
| 67 |
+
cfg_feat = yaml.safe_load(open(PROJECT_ROOT / "configs" / "features.yaml"))
|
| 68 |
+
|
| 69 |
+
TARGET = cfg_pipe["data"]["target_binary"]
|
| 70 |
+
RAND = cfg_pipe["pipeline"]["random_state"]
|
| 71 |
+
TEST_SIZE = cfg_pipe["pipeline"]["test_size"]
|
| 72 |
+
CV_FOLDS = cfg_pipe["pipeline"]["cv_folds"]
|
| 73 |
+
RAW_PATH = PROJECT_ROOT / cfg_pipe["data"]["raw_path"]
|
| 74 |
+
MODELS_DIR = PROJECT_ROOT / "models"
|
| 75 |
+
#MODELS_DIR.mkdir(exist_ok=True)
|
| 76 |
+
# Carpeta segura para experimentos
|
| 77 |
+
EXPERIMENTS_DIR = MODELS_DIR / "experiments" / model_type
|
| 78 |
+
EXPERIMENTS_DIR.mkdir(parents=True, exist_ok=True)
|
| 79 |
+
|
| 80 |
+
# ── FASE 1: Carga de datos ────────────────────────────────────────────────
|
| 81 |
+
logger.info("FASE 1 — Carga de datos")
|
| 82 |
+
df = load_raw_data(RAW_PATH)
|
| 83 |
+
logger.info(f" {len(df)} comentarios cargados")
|
| 84 |
+
|
| 85 |
+
# ── FASE 2: Split ─────────────────────────────────────────────────────────
|
| 86 |
+
logger.info("FASE 2 — Split train/test")
|
| 87 |
+
X = df["Text"]
|
| 88 |
+
y = df[TARGET]
|
| 89 |
+
|
| 90 |
+
X_train, X_test, y_train, y_test = train_test_split(
|
| 91 |
+
X, y, test_size=TEST_SIZE, random_state=RAND, stratify=y
|
| 92 |
+
)
|
| 93 |
+
logger.info(f" Train: {len(X_train)} | Test: {len(X_test)}")
|
| 94 |
+
|
| 95 |
+
# ── FASE 3: Preprocesamiento ──────────────────────────────────────────────
|
| 96 |
+
logger.info("FASE 3 — Preprocesamiento NLP")
|
| 97 |
+
preprocessor = TextPreprocessor(
|
| 98 |
+
config_path=str(PROJECT_ROOT / "configs" / "features.yaml")
|
| 99 |
+
)
|
| 100 |
+
X_train_clean = preprocessor.transform(X_train)
|
| 101 |
+
X_test_clean = preprocessor.transform(X_test)
|
| 102 |
+
|
| 103 |
+
# Reemplazar vacíos con texto original (evitar pérdida de muestras)
|
| 104 |
+
X_train_clean = X_train_clean.where(X_train_clean != "", X_train)
|
| 105 |
+
X_test_clean = X_test_clean.where(X_test_clean != "", X_test)
|
| 106 |
+
|
| 107 |
+
logger.info(f" Preprocesamiento completado")
|
| 108 |
+
|
| 109 |
+
# ── FASE 4: Entrenamiento ─────────────────────────────────────────────────
|
| 110 |
+
logger.info(f"FASE 4 — Entrenamiento ({model_type.upper()})")
|
| 111 |
+
model = build_model(
|
| 112 |
+
model_type,
|
| 113 |
+
config_path = str(PROJECT_ROOT / "configs" / "models.yaml"),
|
| 114 |
+
feat_config_path = str(PROJECT_ROOT / "configs" / "features.yaml"),
|
| 115 |
+
best_params_path = str(PROJECT_ROOT / "configs" / "best_params.yaml"),
|
| 116 |
+
)
|
| 117 |
+
model.fit(X_train_clean, y_train)
|
| 118 |
+
|
| 119 |
+
# ── FASE 5: Cross-validation ──────────────────────────────────────────────
|
| 120 |
+
logger.info(f"FASE 5 — Cross-validation ({CV_FOLDS} folds)")
|
| 121 |
+
cv_results = model.cross_validate(X_train_clean, y_train, cv_folds=CV_FOLDS, rand=RAND)
|
| 122 |
+
|
| 123 |
+
# ── FASE 6: Evaluación en test ────────────────────────────────────────────
|
| 124 |
+
logger.info("FASE 6 — Evaluación en test")
|
| 125 |
+
evaluator = Evaluator(output_dir=PROJECT_ROOT / "reports" / "v2" / "pipeline")
|
| 126 |
+
|
| 127 |
+
y_pred = model.predict(X_test_clean)
|
| 128 |
+
y_proba = model.predict_proba(X_test_clean)[:, 1]
|
| 129 |
+
|
| 130 |
+
metrics = evaluator.evaluate(
|
| 131 |
+
model, X_test_clean, y_test,
|
| 132 |
+
model_name = model_type.upper(),
|
| 133 |
+
X_train = X_train_clean,
|
| 134 |
+
y_train = y_train,
|
| 135 |
+
cv_results = cv_results,
|
| 136 |
+
)
|
| 137 |
+
|
| 138 |
+
# Visualizaciones
|
| 139 |
+
evaluator.plot_confusion_matrix(y_test, y_pred, model_type.upper())
|
| 140 |
+
evaluator.plot_roc_curve(y_test, y_proba, model_type.upper())
|
| 141 |
+
evaluator.error_analysis(X_test_clean, y_test, y_pred, y_proba)
|
| 142 |
+
|
| 143 |
+
# ── FASE 7: Guardado del modelo ───────────────────────────────────────────
|
| 144 |
+
logger.info("FASE 7 — Guardado del modelo")
|
| 145 |
+
model_path = EXPERIMENTS_DIR / f"{model_type}_pipeline_{run_id}.joblib"
|
| 146 |
+
model.save(model_path)
|
| 147 |
+
|
| 148 |
+
"""
|
| 149 |
+
# Actualizar final_model.joblib si es el modelo por defecto del proyecto
|
| 150 |
+
final_path = MODELS_DIR / "pipeline" / "final_model.joblib"
|
| 151 |
+
model.save(final_path)
|
| 152 |
+
logger.info(f" Modelo de producción actualizado: {final_path}")
|
| 153 |
+
"""
|
| 154 |
+
|
| 155 |
+
# ── FASE 8: MLflow ────────────────────────────────────────────────────────
|
| 156 |
+
logger.info("FASE 8 — Registro en MLflow")
|
| 157 |
+
_log_mlflow(metrics, cv_results, model, model_path, run_id, model_type)
|
| 158 |
+
|
| 159 |
+
# ── FASE 9: Informe ───────────────────────────────────────────────────────
|
| 160 |
+
logger.info("FASE 9 — Generando informes")
|
| 161 |
+
metrics["run_id"] = run_id
|
| 162 |
+
metrics["model_path"]= str(model_path)
|
| 163 |
+
evaluator.save_report(metrics, f"exp_{run_id}_{model_type}")
|
| 164 |
+
evaluator.save_summary([metrics])
|
| 165 |
+
|
| 166 |
+
logger.info("=" * 60)
|
| 167 |
+
logger.info(f"✅ Pipeline completado — F1={metrics['f1_weighted']:.4f}")
|
| 168 |
+
logger.info("=" * 60)
|
| 169 |
+
|
| 170 |
+
return metrics
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
# ── MLflow logging ────────────────────────────────────────────────────────────
|
| 174 |
+
def _log_mlflow(metrics, cv_results, model, model_path, run_id, model_type):
|
| 175 |
+
"""Registra el experimento en MLflow."""
|
| 176 |
+
try:
|
| 177 |
+
mlflow_dir = PROJECT_ROOT / "mlruns"
|
| 178 |
+
mlflow.set_tracking_uri(f"file://{mlflow_dir}")
|
| 179 |
+
mlflow.set_experiment("Youtube_project_experiment_pipeline")
|
| 180 |
+
|
| 181 |
+
with mlflow.start_run(run_name=f"{model_type}_{run_id}"):
|
| 182 |
+
# Parámetros
|
| 183 |
+
mlflow.log_param("model_type", model_type)
|
| 184 |
+
mlflow.log_param("run_id", run_id)
|
| 185 |
+
|
| 186 |
+
# Métricas del pipeline
|
| 187 |
+
mlflow.log_metric("test_f1", metrics["f1_weighted"])
|
| 188 |
+
mlflow.log_metric("test_roc_auc", metrics["roc_auc"])
|
| 189 |
+
mlflow.log_metric("test_fp", metrics["fp"])
|
| 190 |
+
mlflow.log_metric("test_fn", metrics["fn"])
|
| 191 |
+
mlflow.log_metric("cv_f1_mean", cv_results["cv_f1_mean"])
|
| 192 |
+
mlflow.log_metric("cv_f1_std", cv_results["cv_f1_std"])
|
| 193 |
+
mlflow.log_metric("train_test_gap_pp", metrics.get("train_test_gap_pp", 0))
|
| 194 |
+
|
| 195 |
+
if "cv_test_gap_pp" in metrics:
|
| 196 |
+
mlflow.log_metric("cv_test_gap_pp", metrics["cv_test_gap_pp"])
|
| 197 |
+
|
| 198 |
+
# Modelo como artefacto
|
| 199 |
+
mlflow.sklearn.log_model(model.pipeline, f"{model_type}_pipeline")
|
| 200 |
+
|
| 201 |
+
logger.info(f" MLflow run registrado: {model_type}_{run_id}")
|
| 202 |
+
|
| 203 |
+
except Exception as e:
|
| 204 |
+
logger.warning(f"MLflow no disponible: {e}")
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 208 |
+
# ENTRY POINT
|
| 209 |
+
# ═════════════════════════════════════════════��════════════════════════════════
|
| 210 |
+
def _parse_args():
|
| 211 |
+
parser = argparse.ArgumentParser(
|
| 212 |
+
description="Pipeline de entrenamiento — YouTube Hate Speech Detection"
|
| 213 |
+
)
|
| 214 |
+
parser.add_argument(
|
| 215 |
+
"--model",
|
| 216 |
+
type=str,
|
| 217 |
+
default="lr",
|
| 218 |
+
choices=["lr", "rf", "xgboost"],
|
| 219 |
+
help="Tipo de modelo a entrenar (default: lr)",
|
| 220 |
+
)
|
| 221 |
+
return parser.parse_args()
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
def main():
|
| 225 |
+
args = _parse_args()
|
| 226 |
+
metrics = run_pipeline(model_type=args.model)
|
| 227 |
+
return metrics
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
if __name__ == "__main__":
|
| 231 |
+
main()
|
src/utils/.gitkeep
DELETED
|
File without changes
|
src/utils/config_loader.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Utilidad para cargar archivos de configuración YAML.
|
| 3 |
+
Todos los módulos deben usar esto en lugar de hardcodear valores.
|
| 4 |
+
"""
|
| 5 |
+
import yaml
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def load_config(config_path: str) -> dict:
|
| 10 |
+
"""
|
| 11 |
+
Carga un archivo YAML de configuración.
|
| 12 |
+
|
| 13 |
+
Args:
|
| 14 |
+
config_path: Ruta al archivo YAML (relativa a la raíz del proyecto).
|
| 15 |
+
|
| 16 |
+
Returns:
|
| 17 |
+
Diccionario con la configuración.
|
| 18 |
+
"""
|
| 19 |
+
path = Path(config_path)
|
| 20 |
+
if not path.exists():
|
| 21 |
+
raise FileNotFoundError(f"Config file not found: {config_path}")
|
| 22 |
+
|
| 23 |
+
with open(path, "r") as f:
|
| 24 |
+
return yaml.safe_load(f)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def load_pipeline_config() -> dict:
|
| 28 |
+
return load_config("configs/pipeline.yaml")
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def load_features_config() -> dict:
|
| 32 |
+
return load_config("configs/features.yaml")
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def load_models_config() -> dict:
|
| 36 |
+
return load_config("configs/models.yaml")
|
src/utils/logger.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Sistema de logging centralizado para todo el proyecto.
|
| 3 |
+
Uso: from src.utils.logger import get_logger; logger = get_logger(__name__)
|
| 4 |
+
"""
|
| 5 |
+
import logging
|
| 6 |
+
import sys
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def get_logger(name: str, level: int = logging.INFO) -> logging.Logger:
|
| 12 |
+
"""
|
| 13 |
+
Devuelve un logger configurado con salida a consola y archivo.
|
| 14 |
+
|
| 15 |
+
Args:
|
| 16 |
+
name: Nombre del logger (usar __name__ en cada módulo).
|
| 17 |
+
level: Nivel de logging.
|
| 18 |
+
|
| 19 |
+
Returns:
|
| 20 |
+
Logger configurado.
|
| 21 |
+
"""
|
| 22 |
+
logger = logging.getLogger(name)
|
| 23 |
+
|
| 24 |
+
if logger.handlers:
|
| 25 |
+
return logger
|
| 26 |
+
|
| 27 |
+
logger.setLevel(level)
|
| 28 |
+
|
| 29 |
+
formatter = logging.Formatter(
|
| 30 |
+
"%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
|
| 31 |
+
datefmt="%Y-%m-%d %H:%M:%S",
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
# Handler consola
|
| 35 |
+
console_handler = logging.StreamHandler(sys.stdout)
|
| 36 |
+
console_handler.setFormatter(formatter)
|
| 37 |
+
logger.addHandler(console_handler)
|
| 38 |
+
|
| 39 |
+
# Handler archivo
|
| 40 |
+
log_dir = Path("logs")
|
| 41 |
+
log_dir.mkdir(exist_ok=True)
|
| 42 |
+
log_file = log_dir / f"pipeline_{datetime.now().strftime('%Y%m%d')}.log"
|
| 43 |
+
file_handler = logging.FileHandler(log_file)
|
| 44 |
+
file_handler.setFormatter(formatter)
|
| 45 |
+
logger.addHandler(file_handler)
|
| 46 |
+
|
| 47 |
+
return logger
|