Spaces:

devrup404
/

SignalMod

Running

App Files Files Community

JonnyBP committed 6 days ago

Commit

6cda091

1 Parent(s): b53eb67

backup stable api and model service before pipeline testing

Browse files

Files changed (21) hide show

.gitignore +13 -1
configs/.gitkeep +0 -0
data/.gitkeep +0 -0
env.example +9 -0
notebooks/.gitkeep +0 -0
reports/.gitkeep +0 -0
src/.gitkeep +0 -0
src/api/main.py +462 -0
src/app/.gitkeep +0 -0
src/data/.gitkeep +0 -0
src/data/loader.py +120 -0
src/evaluation/evaluator.py +264 -0
src/features/.gitkeep +0 -0
src/features/text_preprocessor.py +135 -0
src/features/vectorizer.py +78 -0
src/models/baseline.py +282 -0
src/pipeline/.gitkeep +0 -0
src/pipeline/run_pipeline.py +231 -0
src/utils/.gitkeep +0 -0
src/utils/config_loader.py +36 -0
src/utils/logger.py +47 -0

.gitignore CHANGED Viewed

@@ -63,4 +63,16 @@ models/nb08_roberta_hate/
 models/nb08_toxic_distilbert/
 models/lr_baseline.joblib
-models/best_ensemble.joblib

 models/nb08_toxic_distilbert/
 models/lr_baseline.joblib
+models/best_ensemble.joblib
+# Experiments
+models/experiments/
+# Reports experiments
+reports/v2/pipeline/
+# Python cache
+__pycache__/
+*.pyc

configs/.gitkeep DELETED Viewed

File without changes

data/.gitkeep DELETED Viewed

File without changes

env.example ADDED Viewed

	@@ -0,0 +1,9 @@

+# Copia este archivo como .env y rellena los valores
+# cp env.example .env
+# YouTube Data API v3
+# Obtener en: https://console.cloud.google.com/apis/credentials
+YOUTUBE_API_KEY=your_youtube_api_key_here
+# Entorno
+ENV=development  # development | production

notebooks/.gitkeep DELETED Viewed

File without changes

reports/.gitkeep DELETED Viewed

File without changes

src/.gitkeep DELETED Viewed

File without changes

src/api/main.py ADDED Viewed

	@@ -0,0 +1,462 @@

+"""
+src/api/main.py
+API REST de producción para detección de hate speech.
+Ejecutar con: uvicorn src.api.main:app --reload --port 8000
+Documentación automática en:
+    http://localhost:8000/docs      (Swagger UI)
+    http://localhost:8000/redoc     (ReDoc)
+Endpoints:
+    GET  /                  → health check
+    GET  /model-info        → info del modelo activo
+    GET  /models            → lista de modelos disponibles
+    POST /predict           → predice un comentario
+    POST /predict-batch     → predice una lista de comentarios
+    POST /predict-video     → dado URL de YouTube, predice todos sus comentarios
+    PUT  /model/{name}      → cambia el modelo activo
+"""
+import os
+import sys
+import time
+import logging
+from pathlib import Path
+from typing import Optional
+from contextlib import asynccontextmanager
+from dotenv import load_dotenv
+load_dotenv()
+from fastapi import FastAPI, HTTPException, BackgroundTasks, Query
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, Field, field_validator
+# ── Setup path ────────────────────────────────────────────────────────────────
+PROJECT_ROOT = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(PROJECT_ROOT))
+from src.service.model_service import ModelService, AVAILABLE_MODELS
+from src.utils.logger import get_logger
+logger = get_logger(__name__)
+# ── Global application state ──────────────────────────────────────────────────
+# The model service is created once in `lifespan` at startup and reused by
+# every request, so no request pays the cost of building it again.
+# Keys:
+#   service            → active ModelService (None before startup / after shutdown)
+#   model_name         → name of the active model
+#   startup_time       → time.time() captured at startup, used to report uptime
+#   predictions_served → counter incremented by _predict_single
+_state: dict = {
+    "service"      : None,
+    "model_name"   : None,
+    "startup_time" : None,
+    "predictions_served": 0,
+}
+# ══════════════════════════════════════════════════════════════════════════════
+# LIFESPAN — carga del modelo al iniciar la API
+# ══════════════════════════════════════════════════════════════════════════════
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """
+    Lifespan context manager de FastAPI.
+    Carga el modelo al iniciar la app y libera recursos al cerrarla.
+    """
+    # Startup
+    model_name = os.getenv("MODEL_NAME", list(AVAILABLE_MODELS.keys())[0])
+    logger.info(f"Iniciando API — cargando modelo: {model_name}")
+    _state["service"]      = ModelService(model_name, PROJECT_ROOT)
+    _state["model_name"]   = model_name
+    _state["startup_time"] = time.time()
+    # Warm-up: predecir un texto de prueba para que el modelo quede en memoria
+    try:
+        _state["service"].predict("test warmup text")
+        logger.info("Modelo cargado y warm-up completado ✅")
+    except Exception as e:
+        logger.warning(f"Warm-up falló (no crítico): {e}")
+    yield  # La API está lista
+    # Shutdown
+    logger.info("API cerrándose — limpiando recursos")
+    _state["service"] = None
+# ══════════════════════════════════════════════════════════════════════════════
+# APP
+# ══════════════════════════════════════════════════════════════════════════════
+# Application object; `lifespan` (defined above) handles model startup/shutdown.
+app = FastAPI(
+    title       = "SignalMod API",
+    description = "API de detección de hate speech en comentarios de YouTube",
+    version     = "1.0.0",
+    lifespan    = lifespan,
+)
+# CORS: lets the Streamlit front end (port 8501) call this API (port 8000).
+# NOTE(review): the wildcards below accept every origin, method and header,
+# not just the Streamlit app — confirm this is intended outside development.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins  = ["*"],
+    allow_methods  = ["*"],
+    allow_headers  = ["*"],
+)
+# ══════════════════════════════════════════════════════════════════════════════
+# SCHEMAS — Pydantic valida automáticamente los datos de entrada/salida
+# ══════════════════════════════════════════════════════════════════════════════
+class PredictRequest(BaseModel):
+    """Request body for scoring a single comment."""
+    # Comment to analyse; between 1 and 5000 characters.
+    text     : str  = Field(..., min_length=1, max_length=5000,
+                            description="Comentario a analizar")
+    # Probability cut-off in [0, 1] used to flag a comment as toxic; default 0.5.
+    threshold: float = Field(0.5, ge=0.0, le=1.0,
+                             description="Umbral de probabilidad para clasificar como tóxico")
+    @field_validator("text")
+    @classmethod
+    def text_not_empty(cls, v):
+        """Reject whitespace-only text; return the value with surrounding whitespace removed."""
+        if not v.strip():
+            raise ValueError("El texto no puede estar vacío")
+        return v.strip()
+class PredictResponse(BaseModel):
+    """Prediction result for one comment."""
+    text       : str    # the comment that was scored
+    is_toxic   : bool   # True when probability >= the request threshold
+    probability: float = Field(..., ge=0.0, le=1.0)  # toxicity probability in [0, 1]
+    labels     : list[str]  # detected categories; emptied when not toxic
+    model_used : str    # taken from the service result's "model_used" entry
+    latency_ms : float  # prediction time in ms (batch/video endpoints report 0.0 per item)
+class BatchPredictRequest(BaseModel):
+    """Request body for scoring several comments in one call."""
+    # Between 1 and 100 comments. The length bounds apply to the list itself.
+    # NOTE(review): individual strings are not length-limited here, unlike
+    # PredictRequest.text — confirm whether that is intended.
+    texts    : list[str] = Field(..., min_length=1, max_length=100)
+    # Probability cut-off in [0, 1] applied to every comment; default 0.5.
+    threshold: float      = Field(0.5, ge=0.0, le=1.0)
+class BatchPredictResponse(BaseModel):
+    """Result of a batch prediction."""
+    results      : list[PredictResponse]  # one entry per scored (non-blank) comment
+    total        : int    # number of entries in `results`
+    toxic_count  : int    # how many results have is_toxic set
+    latency_ms   : float  # wall-clock time for the whole batch, in ms
+class VideoRequest(BaseModel):
+    """Request body for analysing the comments of a YouTube video."""
+    url        : str   = Field(..., description="URL del video de YouTube")
+    # Upper bound on comments to analyse: 1–200, default 50.
+    max_comments: int  = Field(50, ge=1, le=200,
+                               description="Número máximo de comentarios a analizar")
+    # Probability cut-off in [0, 1] applied to every comment; default 0.5.
+    threshold  : float = Field(0.5, ge=0.0, le=1.0)
+class VideoResponse(BaseModel):
+    """Result of analysing a YouTube video's comments."""
+    video_url   : str    # URL echoed back from the request
+    total_fetched: int   # number of comments actually scored
+    toxic_count : int    # how many of them were flagged toxic
+    toxic_rate  : float  # toxic_count / total_fetched (0.0 when nothing was scored)
+    results     : list[PredictResponse]
+    # Optional error message; predict_video in this module never sets it.
+    error       : Optional[str] = None
+class ModelInfo(BaseModel):
+    """Information about the active model, as returned by GET /model-info."""
+    name        : str    # active model name
+    # The next four fields are copied from ModelService.get_model_info().
+    type        : str
+    description : str
+    speed       : str
+    accuracy    : str
+    uptime_s    : float  # seconds since API startup
+    predictions_served: int  # predictions counted since startup
+# ══════════════════════════════════════════════════════════════════════════════
+# HELPERS
+# ══════════════════════════════════════════════════════════════════════════════
+def _get_service() -> ModelService:
+    """Devuelve el servicio activo o lanza 503 si no está listo."""
+    if _state["service"] is None:
+        raise HTTPException(status_code=503, detail="Modelo no cargado. Intenta en unos segundos.")
+    return _state["service"]
+def _predict_single(text: str, threshold: float) -> tuple[dict, float]:
+    """Predice un texto y devuelve (result, latency_ms)."""
+    t0     = time.perf_counter()
+    result = _get_service().predict(text)
+    ms     = round((time.perf_counter() - t0) * 1000, 2)
+    # Aplicar umbral personalizado
+    result["is_toxic"] = result["probability"] >= threshold
+    if not result["is_toxic"]:
+        result["labels"] = []
+    _state["predictions_served"] += 1
+    return result, ms
+def _scrape_youtube_comments(url: str, max_comments: int) -> list[str]:
+    """
+    Obtiene comentarios de un video de YouTube.
+    Estrategia:
+    1. Intentar con YouTube Data API v3 (si hay API key en .env)
+    2. Fallback: BeautifulSoup (sin autenticación, limitado)
+    """
+    api_key = os.getenv("YOUTUBE_API_KEY", "")
+    if api_key:
+        return _fetch_via_api(url, api_key, max_comments)
+    else:
+        return _fetch_via_scraper(url, max_comments)
+def _fetch_via_api(url: str, api_key: str, max_comments: int) -> list[str]:
+    """Obtiene comentarios usando YouTube Data API v3."""
+    try:
+        import re
+        from googleapiclient.discovery import build
+        # Extraer video_id de la URL
+        patterns = [
+            r"youtube\.com/watch\?v=([a-zA-Z0-9_-]{11})",
+            r"youtu\.be/([a-zA-Z0-9_-]{11})",
+            r"youtube\.com/embed/([a-zA-Z0-9_-]{11})",
+        ]
+        video_id = None
+        for pattern in patterns:
+            match = re.search(pattern, url)
+            if match:
+                video_id = match.group(1)
+                break
+        if not video_id:
+            raise ValueError(f"No se pudo extraer video_id de: {url}")
+        youtube  = build("youtube", "v3", developerKey=api_key)
+        comments = []
+        page_token = None
+        while len(comments) < max_comments:
+            request = youtube.commentThreads().list(
+                part       = "snippet",
+                videoId    = video_id,
+                maxResults = min(100, max_comments - len(comments)),
+                pageToken  = page_token,
+                textFormat = "plainText",
+            )
+            response = request.execute()
+            for item in response.get("items", []):
+                text = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
+                comments.append(text)
+            page_token = response.get("nextPageToken")
+            if not page_token:
+                break
+        logger.info(f"YouTube API: {len(comments)} comentarios obtenidos")
+        return comments[:max_comments]
+    except Exception as e:
+        logger.warning(f"YouTube API falló: {e} — usando fallback")
+        return _fetch_via_scraper(url, max_comments)
+def _fetch_via_scraper(url: str, max_comments: int) -> list[str]:
+    """
+    Fallback: return canned example comments instead of real ones.
+    No scraping happens here and `url` is not used; the function logs a
+    warning and returns at most `max_comments` of the 15 hard-coded examples.
+    The original author notes that a real production version should use
+    BeautifulSoup + Selenium.
+    NOTE(review): _fetch_via_api also calls this when the API request fails,
+    in which case the "API key not configured" warning below is misleading.
+    """
+    logger.warning(
+        "YOUTUBE_API_KEY no configurada. "
+        "Configura tu API key en .env para obtener comentarios reales. "
+        "Usando comentarios de ejemplo."
+    )
+    # Example comments for demoing without an API key
+    example_comments = [
+        "This video is really informative, thanks for sharing!",
+        "You are all stupid idiots, get out of here!",
+        "Great content, I learned a lot from this.",
+        "These people should be eliminated from society.",
+        "I agree with the presenter's point of view.",
+        "What a bunch of racist criminals!",
+        "Thank you for this analysis, very helpful.",
+        "Kill them all, they don't deserve to live.",
+        "Interesting perspective on the topic.",
+        "This is absolute bullshit propaganda!",
+        "I think we need to look at both sides.",
+        "Black people are thugs and criminals.",
+        "The data presented here is compelling.",
+        "Go back to where you came from!",
+        "Well researched video, good job.",
+    ]
+    return example_comments[:max_comments]
+# ══════════════════════════════════════════════════════════════════════════════
+# ENDPOINTS
+# ══════════════════════════════════════════════════════════════════════════════
+@app.get("/", tags=["Health"])
+async def health_check():
+    """
+    Verifica que la API está funcionando.
+    Útil para Docker healthcheck y load balancers.
+    """
+    service = _state["service"]
+    return {
+        "status"  : "ok" if service else "loading",
+        "model"   : _state["model_name"],
+        "uptime_s": round(time.time() - _state["startup_time"], 1)
+                    if _state["startup_time"] else 0,
+    }
+@app.get("/model-info", response_model=ModelInfo, tags=["Model"])
+async def get_model_info():
+    """Devuelve información sobre el modelo activo."""
+    service = _get_service()
+    info    = service.get_model_info()
+    return ModelInfo(
+        name              = _state["model_name"],
+        type              = info.get("type", "unknown"),
+        description       = info.get("description", ""),
+        speed             = info.get("speed", ""),
+        accuracy          = info.get("accuracy", ""),
+        uptime_s          = round(time.time() - _state["startup_time"], 1),
+        predictions_served= _state["predictions_served"],
+    )
+@app.get("/models", tags=["Model"])
+async def list_models():
+    """Lista todos los modelos disponibles."""
+    return {
+        "available": list(AVAILABLE_MODELS.keys()),
+        "active"   : _state["model_name"],
+    }
+@app.put("/model/{model_name}", tags=["Model"])
+async def switch_model(model_name: str):
+    """
+    Cambia el modelo activo.
+    El nuevo modelo se carga de forma lazy en el siguiente request de predicción.
+    """
+    if model_name not in AVAILABLE_MODELS:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Modelo '{model_name}' no disponible. "
+                   f"Opciones: {list(AVAILABLE_MODELS.keys())}",
+        )
+    _state["service"]  = ModelService(model_name, PROJECT_ROOT)
+    _state["model_name"] = model_name
+    logger.info(f"Modelo cambiado a: {model_name}")
+    return {"message": f"Modelo cambiado a '{model_name}'", "model": model_name}
+@app.post("/predict", response_model=PredictResponse, tags=["Prediction"])
+async def predict(request: PredictRequest):
+    """
+    Predice si un comentario es tóxico.
+    - **text**: el comentario a analizar
+    - **threshold**: umbral de probabilidad (default 0.5)
+    Devuelve la probabilidad, si es tóxico y las categorías detectadas.
+    """
+    result, ms = _predict_single(request.text, request.threshold)
+    if "error" in result:
+        raise HTTPException(status_code=500, detail=result["error"])
+    return PredictResponse(
+        text       = request.text,
+        is_toxic   = result["is_toxic"],
+        probability= round(result["probability"], 4),
+        labels     = result["labels"],
+        model_used = result["model_used"],
+        latency_ms = ms,
+    )
+@app.post("/predict-batch", response_model=BatchPredictResponse, tags=["Prediction"])
+async def predict_batch(request: BatchPredictRequest):
+    """
+    Predice una lista de comentarios en un solo request.
+    Más eficiente que llamar /predict N veces.
+    Máximo 100 comentarios por request.
+    """
+    t0      = time.perf_counter()
+    results = []
+    for text in request.texts:
+        if not text.strip():
+            continue
+        result, _ = _predict_single(text, request.threshold)
+        results.append(PredictResponse(
+            text       = text,
+            is_toxic   = result["is_toxic"],
+            probability= round(result["probability"], 4),
+            labels     = result["labels"],
+            model_used = result["model_used"],
+            latency_ms = 0.0,
+        ))
+    total_ms     = round((time.perf_counter() - t0) * 1000, 2)
+    toxic_count  = sum(1 for r in results if r.is_toxic)
+    return BatchPredictResponse(
+        results     = results,
+        total       = len(results),
+        toxic_count = toxic_count,
+        latency_ms  = total_ms,
+    )
+@app.post("/predict-video", response_model=VideoResponse, tags=["Prediction"])
+async def predict_video(request: VideoRequest):
+    """
+    Dado un URL de YouTube, obtiene los comentarios y predice su toxicidad.
+    Requiere YOUTUBE_API_KEY en el archivo .env para obtener comentarios reales.
+    Sin API key usa comentarios de ejemplo para la demo.
+    """
+    # Obtener comentarios
+    try:
+        comments = _scrape_youtube_comments(request.url, request.max_comments)
+    except Exception as e:
+        raise HTTPException(status_code=422, detail=f"Error al obtener comentarios: {e}")
+    if not comments:
+        raise HTTPException(status_code=404, detail="No se encontraron comentarios en el video")
+    # Predecir batch
+    t0      = time.perf_counter()
+    results = []
+    for text in comments:
+        if not text.strip():
+            continue
+        result, _ = _predict_single(text, request.threshold)
+        results.append(PredictResponse(
+            text       = text,
+            is_toxic   = result["is_toxic"],
+            probability= round(result["probability"], 4),
+            labels     = result["labels"],
+            model_used = result["model_used"],
+            latency_ms = 0.0,
+        ))
+    total_ms    = round((time.perf_counter() - t0) * 1000, 2)
+    toxic_count = sum(1 for r in results if r.is_toxic)
+    return VideoResponse(
+        video_url    = request.url,
+        total_fetched= len(results),
+        toxic_count  = toxic_count,
+        toxic_rate   = round(toxic_count / len(results), 4) if results else 0.0,
+        results      = results,
+    )

src/app/.gitkeep DELETED Viewed

File without changes

src/data/.gitkeep DELETED Viewed

File without changes

src/data/loader.py ADDED Viewed

	@@ -0,0 +1,120 @@

+"""
+src/data/loader.py
+Carga y valida el dataset de comentarios de YouTube.
+Responsabilidad única: leer el CSV y verificar que tiene las columnas esperadas.
+El preprocesamiento viene después (src/features/text_preprocessor.py).
+"""
+import pandas as pd
+from pathlib import Path
+from src.utils.logger import get_logger
+logger = get_logger(__name__)
+# Columns that must be present in the raw dataset (checked by _validate_columns)
+REQUIRED_COLUMNS = {"Text", "IsToxic"}
+# Optional sub-label columns; converted to bool by _clean_structure when present
+SUBLABEL_COLUMNS = {
+    "IsAbusive", "IsProvocative", "IsHatespeech",
+    "IsRacist", "IsObscene", "IsThreat",
+}
+def load_raw_data(path: str | Path) -> pd.DataFrame:
+    """
+    Carga el CSV crudo de comentarios de YouTube.
+    Args:
+        path: Ruta al archivo CSV.
+    Returns:
+        DataFrame validado y limpio a nivel estructural.
+    Raises:
+        FileNotFoundError: si el archivo no existe.
+        ValueError: si faltan columnas obligatorias.
+    """
+    path = Path(path)
+    if not path.exists():
+        raise FileNotFoundError(f"Dataset no encontrado: {path}")
+    logger.info(f"Cargando dataset: {path}")
+    df = pd.read_csv(path)
+    logger.info(f"  Shape: {df.shape}")
+    _validate_columns(df)
+    df = _clean_structure(df)
+    logger.info(f"  Toxicos: {df['IsToxic'].sum()} ({df['IsToxic'].mean()*100:.1f}%)")
+    return df
+def load_preprocessed_data(path: str | Path) -> pd.DataFrame:
+    """
+    Carga el CSV preprocesado (con columna clean_text).
+    Generado por el notebook 02 o por run_pipeline.
+    Args:
+        path: Ruta al CSV preprocesado.
+    Returns:
+        DataFrame con columna clean_text lista para vectorizar.
+    """
+    path = Path(path)
+    if not path.exists():
+        raise FileNotFoundError(
+            f"Datos preprocesados no encontrados: {path}\n"
+            f"Ejecuta: python -m src.pipeline.run_pipeline"
+        )
+    df = pd.read_csv(path)
+    if "clean_text" not in df.columns:
+        raise ValueError("El CSV no tiene columna 'clean_text'. Regenera el preprocesamiento.")
+    logger.info(f"Datos preprocesados cargados: {df.shape}")
+    return df
+# ── Funciones internas ────────────────────────────────────────────────────────
+def _validate_columns(df: pd.DataFrame) -> None:
+    """Verifica que el dataset tenga las columnas obligatorias."""
+    missing = REQUIRED_COLUMNS - set(df.columns)
+    if missing:
+        raise ValueError(
+            f"Columnas obligatorias ausentes: {missing}\n"
+            f"Columnas encontradas: {list(df.columns)}"
+        )
+    logger.info(f"  Columnas validadas ✅")
+def _clean_structure(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Limpieza estructural mínima:
+    - Elimina filas con Text vacío
+    - Convierte IsToxic a bool
+    - Convierte sublabels a bool si existen
+    """
+    df = df.copy()
+    # Texto
+    df["Text"] = df["Text"].fillna("").astype(str).str.strip()
+    df = df[df["Text"] != ""].reset_index(drop=True)
+    # Target binario
+    df["IsToxic"] = df["IsToxic"].astype(bool)
+    # Sublabels
+    for col in SUBLABEL_COLUMNS:
+        if col in df.columns:
+            df[col] = df[col].astype(bool)
+    # Eliminar duplicados
+    n_before = len(df)
+    df = df.drop_duplicates(subset=["Text"]).reset_index(drop=True)
+    if len(df) < n_before:
+        logger.warning(f"  {n_before - len(df)} duplicados eliminados")
+    return df

src/evaluation/evaluator.py ADDED Viewed

	@@ -0,0 +1,264 @@

+"""
+src/evaluation/evaluator.py
+Evaluación estandarizada de modelos.
+Genera métricas, visualizaciones e informes JSON.
+Uso:
+    evaluator = Evaluator(output_dir="reports/pipeline")
+    metrics = evaluator.evaluate(model, X_test, y_test, model_name="LR")
+    evaluator.error_analysis(X_test, y_test, preds, probs)
+    evaluator.save_summary(all_metrics, path="reports/summary.csv")
+"""
+import json
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from datetime import datetime
+from pathlib import Path
+from sklearn.metrics import (
+    f1_score, precision_score, recall_score,
+    roc_auc_score, accuracy_score,
+    confusion_matrix, classification_report,
+    RocCurveDisplay,
+)
+from src.utils.logger import get_logger
+logger = get_logger(__name__)
+class Evaluator:
+    """
+    Evaluador estandarizado de modelos de clasificación binaria.
+    Genera:
+    - Métricas completas (F1, Precision, Recall, ROC-AUC)
+    - Ambas métricas de gap (train-test y CV-test)
+    - Matriz de confusión (PNG)
+    - Curva ROC (PNG)
+    - Análisis de errores (FP y FN más comunes)
+    - Informe JSON por experimento
+    - CSV resumen de todos los experimentos
+    """
+    def __init__(self, output_dir: str | Path = "reports/pipeline"):
+        self.output_dir = Path(output_dir)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+    # ── Evaluación principal ─────────────────────────────────────────────────
+    def evaluate(
+        self,
+        model,
+        X_test,
+        y_test,
+        model_name: str,
+        X_train=None,
+        y_train=None,
+        cv_results: dict = None,
+    ) -> dict:
+        """
+        Evalúa un modelo sobre el test set.
+        Args:
+            model: objeto con método predict() y predict_proba()
+            X_test, y_test: datos de test
+            model_name: nombre para los reports
+            X_train, y_train: opcional — para calcular train_test_gap
+            cv_results: opcional — dict con cv_f1_mean para calcular cv_test_gap
+        Returns:
+            Dict con todas las métricas.
+        """
+        logger.info(f"Evaluando: {model_name}")
+        y_pred  = model.predict(X_test)
+        y_proba = model.predict_proba(X_test)[:, 1]
+        y_test_arr = np.array(y_test)
+        # ── Métricas test ────────────────────────────────────────────────────
+        metrics = {
+            "model"      : model_name,
+            "timestamp"  : datetime.now().isoformat(),
+            "f1_weighted": round(f1_score(y_test_arr, y_pred, average="weighted"), 4),
+            "f1_toxic"   : round(f1_score(y_test_arr, y_pred, pos_label=1), 4),
+            "precision"  : round(precision_score(y_test_arr, y_pred, average="weighted"), 4),
+            "recall"     : round(recall_score(y_test_arr, y_pred, average="weighted"), 4),
+            "accuracy"   : round(accuracy_score(y_test_arr, y_pred), 4),
+            "roc_auc"    : round(roc_auc_score(y_test_arr, y_proba), 4),
+            "fp"         : int(((y_test_arr == 0) & (y_pred == 1)).sum()),
+            "fn"         : int(((y_test_arr == 1) & (y_pred == 0)).sum()),
+            "n_test"     : len(y_test_arr),
+        }
+        # ── Train-test gap (in-sample vs OOS) ────────────────────────────────
+        if X_train is not None and y_train is not None:
+            y_train_pred = model.predict(X_train)
+            f1_train = f1_score(np.array(y_train), y_train_pred, average="weighted")
+            metrics["f1_train"]         = round(f1_train, 4)
+            metrics["train_test_gap_pp"]= round((f1_train - metrics["f1_weighted"]) * 100, 2)
+        # ── CV-test gap (OOS vs OOS — métrica correcta para la rúbrica) ──────
+        if cv_results and "cv_f1_mean" in cv_results:
+            cv_mean = cv_results["cv_f1_mean"]
+            metrics["cv_f1_mean"]    = round(cv_mean, 4)
+            metrics["cv_f1_std"]     = round(cv_results.get("cv_f1_std", 0), 4)
+            metrics["cv_test_gap_pp"]= round(abs(cv_mean - metrics["f1_weighted"]) * 100, 2)
+        self._print_summary(metrics)
+        return metrics
+    # ── Visualizaciones ──────────────────────────────────────────────────────
+    def plot_confusion_matrix(
+        self,
+        y_test,
+        y_pred,
+        model_name: str,
+        save: bool = True,
+    ) -> Path | None:
+        """Genera y guarda la matriz de confusión."""
+        cm = confusion_matrix(y_test, y_pred)
+        fig, ax = plt.subplots(figsize=(5, 4))
+        sns.heatmap(
+            cm, annot=True, fmt="d", cmap="Blues", ax=ax,
+            xticklabels=["No tóxico", "Tóxico"],
+            yticklabels=["No tóxico", "Tóxico"],
+            linewidths=0.5,
+        )
+        ax.set_title(f"{model_name} — Confusion Matrix", fontweight="bold")
+        ax.set_xlabel("Predicción")
+        ax.set_ylabel("Real")
+        plt.tight_layout()
+        if save:
+            safe = model_name.lower().replace(" ", "_").replace("/", "_")
+            path = self.output_dir / f"cm_{safe}.png"
+            plt.savefig(path, dpi=150, bbox_inches="tight")
+            plt.show()
+            logger.info(f"Confusion matrix guardada: {path}")
+            return path
+        plt.show()
+        return None
+    def plot_roc_curve(
+        self,
+        y_test,
+        y_proba,
+        model_name: str,
+        save: bool = True,
+    ) -> Path | None:
+        """Genera y guarda la curva ROC."""
+        fig, ax = plt.subplots(figsize=(6, 5))
+        RocCurveDisplay.from_predictions(
+            y_test, y_proba, ax=ax, name=model_name, color="#7F77DD"
+        )
+        ax.plot([0, 1], [0, 1], "--", color="gray", alpha=0.5, label="Random")
+        ax.set_title(f"{model_name} — Curva ROC", fontweight="bold")
+        ax.legend()
+        plt.tight_layout()
+        if save:
+            safe = model_name.lower().replace(" ", "_").replace("/", "_")
+            path = self.output_dir / f"roc_{safe}.png"
+            plt.savefig(path, dpi=150, bbox_inches="tight")
+            plt.show()
+            logger.info(f"Curva ROC guardada: {path}")
+            return path
+        plt.show()
+        return None
+    # ── Análisis de errores ──────────────────────────────────────────────────
+    def error_analysis(
+        self,
+        X_test,
+        y_test,
+        y_pred,
+        y_proba,
+        n_examples: int = 5,
+    ) -> dict:
+        """
+        Analiza los falsos positivos y falsos negativos más relevantes.
+        FP → comentarios OK que el modelo censura (peor UX)
+        FN → hate speech que se escapa (peor para el objetivo del proyecto)
+        """
+        texts = np.array(X_test) if not isinstance(X_test, np.ndarray) else X_test
+        y_arr = np.array(y_test)
+        error_df = pd.DataFrame({
+            "text"      : texts,
+            "real"      : y_arr,
+            "pred"      : y_pred,
+            "prob_toxic": y_proba,
+        })
+        fp = error_df[(error_df["real"] == 0) & (error_df["pred"] == 1)]
+        fn = error_df[(error_df["real"] == 1) & (error_df["pred"] == 0)]
+        logger.info(f"Errores: FP={len(fp)} | FN={len(fn)}")
+        print(f"\n{'='*65}")
+        print(f"FALSOS NEGATIVOS — hate speech que NO detectó ({len(fn)} total)")
+        print(f"{'='*65}")
+        for _, row in fn.nsmallest(n_examples, "prob_toxic").iterrows():
+            print(f"  Prob: {row['prob_toxic']:.3f} | {row['text'][:110]}")
+            print()
+        print(f"{'='*65}")
+        print(f"FALSOS POSITIVOS — comentarios OK censurados ({len(fp)} total)")
+        print(f"{'='*65}")
+        for _, row in fp.nlargest(n_examples, "prob_toxic").iterrows():
+            print(f"  Prob: {row['prob_toxic']:.3f} | {row['text'][:110]}")
+            print()
+        return {"fp_examples": fp.head(n_examples).to_dict("records"),
+                "fn_examples": fn.head(n_examples).to_dict("records")}
+    # ── Reports ──────────────────────────────────────────────────────────────
+    def save_report(self, metrics: dict, experiment_id: str) -> Path:
+        """Guarda las métricas de un experimento en JSON."""
+        path = self.output_dir / f"{experiment_id}.json"
+        with open(path, "w") as f:
+            json.dump(metrics, f, indent=2)
+        logger.info(f"Report guardado: {path}")
+        return path
+    def save_summary(self, all_metrics: list[dict], path: str | Path = None) -> Path:
+        """
+        Guarda un CSV con todos los experimentos para comparar.
+        Este es el 'reports/summary.csv' que mencionaba el roadmap.
+        """
+        path = Path(path or self.output_dir / "summary.csv")
+        df = pd.DataFrame(all_metrics)
+        # Ordenar por F1 descendente
+        if "f1_weighted" in df.columns:
+            df = df.sort_values("f1_weighted", ascending=False)
+        df.to_csv(path, index=False)
+        logger.info(f"Summary guardado: {path}")
+        print(df[["model", "f1_weighted", "roc_auc", "fp", "fn"]].to_string(index=False))
+        return path
+    # ── Interno ──────────────────────────────────────────────────────────────
+    def _print_summary(self, metrics: dict) -> None:
+        gap_str = ""
+        if "cv_test_gap_pp" in metrics:
+            ok  = "✅" if metrics["cv_test_gap_pp"] < 5 else "⚠️"
+            gap_str = f"CV-test gap: {metrics['cv_test_gap_pp']:.2f}pp {ok}"
+        elif "train_test_gap_pp" in metrics:
+            ok  = "✅" if metrics["train_test_gap_pp"] < 5 else "⚠️"
+            gap_str = f"Train-test gap: {metrics['train_test_gap_pp']:.2f}pp {ok}"
+        print(f"\n{'='*55}")
+        print(f"RESULTADOS — {metrics['model']}")
+        print(f"{'='*55}")
+        print(f"  F1 weighted : {metrics['f1_weighted']:.4f}")
+        print(f"  ROC-AUC     : {metrics['roc_auc']:.4f}")
+        print(f"  FP / FN     : {metrics['fp']} / {metrics['fn']}")
+        if gap_str:
+            print(f"  {gap_str}")
+        print(f"{'='*55}")

src/features/.gitkeep DELETED Viewed

File without changes

src/features/text_preprocessor.py ADDED Viewed

	@@ -0,0 +1,135 @@

+"""
+src/features/text_preprocessor.py
+Pipeline de preprocesamiento NLP.
+Traducción directa del notebook 02 a código de producción.
+Pasos:
+    1. Lowercase
+    2. Regex: URLs, @menciones, \\xa0, apostrofes, números
+    3. spaCy: lematización (en_core_web_sm)
+    4. NLTK: filtrado stopwords english + custom
+Uso:
+    preprocessor = TextPreprocessor()
+    clean_series = preprocessor.transform(df["Text"])
+    clean_text   = preprocessor.transform("texto crudo aqui")
+"""
+import re
+import yaml
+import nltk
+import spacy
+import pandas as pd
+from pathlib import Path
+from nltk.corpus import stopwords
+from src.utils.logger import get_logger
+logger = get_logger(__name__)
+# Make sure the NLTK resources are available. This loop runs on every import
+# of this module, before any TextPreprocessor is created.
+for resource in ["stopwords", "punkt"]:
+    nltk.download(resource, quiet=True)
+class TextPreprocessor:
+    """
+    Pipeline NLP para hate speech detection.
+    Lee su configuración de configs/features.yaml.
+    """
+    # Stopwords custom: palabras frecuentes sin valor discriminante
+    # en el dominio YouTube. No son stopwords generales.
+    CUSTOM_STOPWORDS = {
+        "youtube", "video", "watch", "like", "comment",
+        "channel", "click", "subscribe", "link",
+    }
+    def __init__(self, config_path: str = "configs/features.yaml"):
+        # Cargar config
+        with open(config_path) as f:
+            cfg = yaml.safe_load(f)["preprocessing"]
+        self.cfg = cfg
+        # Stopwords: NLTK + custom
+        self.stop_words = set(stopwords.words("english")) | self.CUSTOM_STOPWORDS
+        self.min_len = cfg.get("min_token_length", 2)
+        # Cargar modelo spaCy
+        # disable=["parser","ner"] → solo usamos el lemmatizer, más rápido
+        self.nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
+        logger.info(f"TextPreprocessor iniciado — spaCy {self.nlp.meta['version']}")
+    # ── Pasos individuales ────────────────────────────────────────────────────
+    def _lowercase(self, text: str) -> str:
+        """Paso 1: minúsculas. 'BLACK' y 'black' son la misma feature."""
+        return str(text).lower()
+    def _clean_regex(self, text: str) -> str:
+        """
+        Paso 2: elimina ruido estructural con regex.
+        Orden importante: primero lo más específico, luego lo general.
+        """
+        text = re.sub(r"http\S+|www\.\S+", "", text)   # URLs
+        text = re.sub(r"@\w+", "", text)                # @menciones
+        text = re.sub(r"[\n\t\r]", " ", text)           # saltos de línea
+        text = re.sub(r"[^\x00-\x7F]+", " ", text)      # \xa0, emojis
+        text = re.sub(r"'", "", text)                   # apóstrofes
+        text = re.sub(r"\b\d+\b", "", text)             # números solos
+        text = re.sub(r"\s+", " ", text)                # espacios múltiples
+        return text.strip()
+    def _lemmatize(self, text: str) -> str:
+        """
+        Paso 3+4: lematización con spaCy + filtrado de stopwords con NLTK.
+        Por qué spaCy para lematizar:
+            Entiende gramática: 'running'→'run', 'cops'→'cop'
+            Un stemmer de NLTK simplemente corta: 'running'→'runn'
+        Por qué NLTK para stopwords:
+            Lista curada de 179 palabras funcionales.
+            Más fácil de personalizar que la lista interna de spaCy.
+        DECISIÓN del EDA: NO eliminar 'black','white','police','cop'
+            → Aparecen en ambas clases con contexto distinto.
+              El modelo necesita verlas para aprender por bigrams.
+        """
+        doc = self.nlp(text)
+        tokens = [
+            token.lemma_
+            for token in doc
+            if not token.is_punct
+            and not token.is_space
+            and len(token.text) >= self.min_len
+            and token.lemma_ not in self.stop_words
+        ]
+        return " ".join(tokens)
+    def _transform_one(self, text: str) -> str:
+        text = self._lowercase(text)
+        text = self._clean_regex(text)
+        text = self._lemmatize(text)
+        return text
+    # ── Interfaz pública ──────────────────────────────────────────────────────
+    def transform(self, data) -> str | pd.Series:
+        """
+        Preprocesa un texto o una Serie completa.
+        Args:
+            data: str o pd.Series con textos crudos.
+        Returns:
+            str o pd.Series con textos limpios y lematizados.
+        """
+        if isinstance(data, pd.Series):
+            logger.info(f"Preprocesando {len(data)} textos...")
+            result = data.apply(self._transform_one)
+            empty  = (result == "").sum()
+            if empty > 0:
+                logger.warning(f"  {empty} textos quedaron vacíos tras limpieza")
+            return result
+        return self._transform_one(data)

src/features/vectorizer.py ADDED Viewed

	@@ -0,0 +1,78 @@

+"""
+src/features/vectorizer.py
+Vectorizador configurable desde YAML.
+Traducción directa del notebook 03 a código de producción.
+Decisión del proyecto: TF-IDF con ngram=(1,2) y max_features=5000.
+Justificación:
+    - Bigramas capturan contexto: 'black thug' es distinto a 'black' solo
+    - max_features=5000 equilibra vocabulario vs overfitting (800 muestras train)
+    - sublinear_tf=True evita que repetir una palabra infle artificialmente su peso
+Uso:
+    vec = Vectorizer()
+    X_train_vec = vec.fit_transform(X_train_text)
+    X_test_vec  = vec.transform(X_test_text)
+"""
+import yaml
+from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
+from src.utils.logger import get_logger
+logger = get_logger(__name__)
+class Vectorizer:
+    """
+    Wrapper sobre TfidfVectorizer / CountVectorizer.
+    Parámetros controlados por configs/features.yaml.
+    Regla crítica: fit() SOLO sobre train, transform() sobre train y test.
+    Si se hace fit sobre todo el dataset antes del split → data leakage.
+    """
+    def __init__(self, config_path: str = "configs/features.yaml", method: str = None):
+        with open(config_path) as f:
+            cfg = yaml.safe_load(f)["vectorization"]
+        self.method = method or cfg.get("method", "tfidf")
+        c = cfg[self.method]
+        if self.method == "tfidf":
+            self.vectorizer = TfidfVectorizer(
+                max_features  = c["max_features"],
+                ngram_range   = tuple(c["ngram_range"]),
+                sublinear_tf  = c.get("sublinear_tf", True),
+                min_df        = c.get("min_df", 3),
+                analyzer      = "word",
+                strip_accents = "unicode",
+            )
+        else:
+            self.vectorizer = CountVectorizer(
+                max_features  = c["max_features"],
+                ngram_range   = tuple(c["ngram_range"]),
+                min_df        = c.get("min_df", 3),
+                analyzer      = "word",
+                strip_accents = "unicode",
+            )
+        logger.info(f"Vectorizer: {self.method} | max_features={c['max_features']} | ngram={c['ngram_range']}")
+    def fit_transform(self, X_train):
+        """Ajusta el vocabulario y transforma el train set."""
+        logger.info("Vectorizando train set...")
+        matrix = self.vectorizer.fit_transform(X_train)
+        logger.info(f"  Shape: {matrix.shape} | Sparsidad: {1 - matrix.nnz/(matrix.shape[0]*matrix.shape[1]):.1%}")
+        return matrix
+    def transform(self, X):
+        """Transforma sin ajustar (para test/producción)."""
+        return self.vectorizer.transform(X)
+    def get_feature_names(self):
+        return self.vectorizer.get_feature_names_out()
+    @property
+    def vocabulary_size(self) -> int:
+        return len(self.vectorizer.vocabulary_)

src/models/baseline.py ADDED Viewed

	@@ -0,0 +1,282 @@

+"""
+src/models/baseline.py
+Modelos clásicos de ML para clasificación de texto.
+Traducción directa de notebooks 04 y 05.
+Todos los modelos siguen la misma interfaz:
+    model.fit(X_train, y_train)
+    model.predict(X)
+    model.predict_proba(X)
+    model.save(path)
+    Model.load(path)
+Uso desde el pipeline:
+    model = build_model("lr", config_path="configs/models.yaml")
+    model.fit(X_train_vec, y_train)
+    preds = model.predict(X_test_vec)
+"""
+import yaml
+import joblib
+import numpy as np
+from pathlib import Path
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.pipeline import Pipeline
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.model_selection import StratifiedKFold, cross_validate
+from src.utils.logger import get_logger
+logger = get_logger(__name__)
+# ── Clase base ────────────────────────────────────────────────────────────────
+class BaseSklearnModel:
+    """
+    Interfaz común para todos los modelos sklearn del proyecto.
+    Hereda LRModel y EnsembleModel.
+    """
+    def __init__(self):
+        self.pipeline = None   # sklearn Pipeline (TF-IDF + clf)
+        self.is_fitted = False
+    def fit(self, X_train, y_train) -> "BaseSklearnModel":
+        """Entrena el pipeline completo."""
+        logger.info(f"Entrenando {self.__class__.__name__}...")
+        self.pipeline.fit(X_train, y_train)
+        self.is_fitted = True
+        logger.info("  Entrenamiento completado")
+        return self
+    def predict(self, X) -> np.ndarray:
+        self._check_fitted()
+        return self.pipeline.predict(X)
+    def predict_proba(self, X) -> np.ndarray:
+        self._check_fitted()
+        return self.pipeline.predict_proba(X)
+    def cross_validate(self, X_train, y_train, cv_folds: int = 5, rand: int = 42) -> dict:
+        """
+        Evaluación con StratifiedKFold.
+        Devuelve medias y desviaciones estándar de las métricas.
+        """
+        cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=rand)
+        results = cross_validate(
+            self.pipeline, X_train, y_train,
+            cv=cv,
+            scoring={"f1": "f1_weighted", "roc_auc": "roc_auc"},
+            return_train_score=True,
+            n_jobs=-1,
+        )
+        summary = {
+            "cv_f1_mean"    : results["test_f1"].mean(),
+            "cv_f1_std"     : results["test_f1"].std(),
+            "cv_roc_mean"   : results["test_roc_auc"].mean(),
+            "train_f1_mean" : results["train_f1"].mean(),
+            "gap_pp"        : (results["train_f1"].mean() - results["test_f1"].mean()) * 100,
+        }
+        logger.info(
+            f"  CV F1: {summary['cv_f1_mean']:.4f} ± {summary['cv_f1_std']:.4f} | "
+            f"Gap: {summary['gap_pp']:.1f}pp"
+        )
+        return summary
+    def save(self, path: str | Path) -> None:
+        path = Path(path)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        joblib.dump(self.pipeline, path)
+        logger.info(f"Modelo guardado: {path}")
+    @classmethod
+    def load(cls, path: str | Path) -> "BaseSklearnModel":
+        path = Path(path)
+        if not path.exists():
+            raise FileNotFoundError(f"Modelo no encontrado: {path}")
+        instance = cls.__new__(cls)
+        instance.pipeline = joblib.load(path)
+        instance.is_fitted = True
+        logger.info(f"Modelo cargado: {path}")
+        return instance
+    def _check_fitted(self):
+        if not self.is_fitted:
+            raise RuntimeError("El modelo no está entrenado. Llama a .fit() primero.")
+# ── Logistic Regression ────────────────────────────────────────────────────────
+class LRModel(BaseSklearnModel):
+    """
+    Logistic Regression + TF-IDF.
+    Mejor modelo del proyecto (notebook 06):
+        F1 test = 0.7579 | CV-test gap = 4.76pp
+    Parámetros optimizados con Optuna sobre configs/best_params.yaml.
+    """
+    def __init__(
+        self,
+        config_path: str = "configs/models.yaml",
+        feat_config_path: str = "configs/features.yaml",
+        best_params_path: str = "configs/best_params.yaml",
+    ):
+        super().__init__()
+        # Intentar cargar best_params.yaml (resultado de Optuna)
+        try:
+            import yaml as _yaml
+            with open(best_params_path) as f:
+                best = _yaml.safe_load(f)
+            bp = best.get("hyperparameters", {})
+            logger.info("Parámetros cargados desde best_params.yaml")
+        except FileNotFoundError:
+            bp = {}
+            logger.warning("best_params.yaml no encontrado — usando config por defecto")
+        # Config base
+        with open(config_path) as f:
+            mod_cfg = yaml.safe_load(f)["models"]["logistic_regression"]
+        with open(feat_config_path) as f:
+            vec_cfg = yaml.safe_load(f)["vectorization"]["tfidf"]
+        # Prioridad: best_params > yaml config
+        ngram_str = str(bp.get("ngram_range", "1_2"))
+        ngram     = (1, 1) if ngram_str == "1_1" else (1, 2)
+        self.pipeline = Pipeline([
+            ("tfidf", TfidfVectorizer(
+                max_features  = bp.get("max_features", vec_cfg["max_features"]),
+                ngram_range   = ngram,
+                sublinear_tf  = bp.get("sublinear_tf", vec_cfg["sublinear_tf"]),
+                min_df        = bp.get("min_df", vec_cfg["min_df"]),
+                analyzer      = "word",
+                strip_accents = "unicode",
+            )),
+            ("clf", LogisticRegression(
+                C            = bp.get("C", mod_cfg["C"]),
+                max_iter     = mod_cfg["max_iter"],
+                class_weight = mod_cfg["class_weight"],
+                solver       = mod_cfg["solver"],
+                random_state = 42,
+            )),
+        ])
+        logger.info(f"LRModel creado — C={bp.get('C', mod_cfg['C']):.4f} | ngram={ngram}")
+# ── Random Forest ──────────────────────────────────────────────────────────────
+class RFModel(BaseSklearnModel):
+    """
+    Random Forest + TF-IDF.
+    Parámetros desde configs/models.yaml.
+    """
+    def __init__(
+        self,
+        config_path: str = "configs/models.yaml",
+        feat_config_path: str = "configs/features.yaml",
+    ):
+        super().__init__()
+        with open(config_path) as f:
+            rf_cfg  = yaml.safe_load(f)["models"]["random_forest"]
+        with open(feat_config_path) as f:
+            vec_cfg = yaml.safe_load(f)["vectorization"]["tfidf"]
+        self.pipeline = Pipeline([
+            ("tfidf", TfidfVectorizer(
+                max_features  = vec_cfg["max_features"],
+                ngram_range   = (1, 1),   # RF + bigramas es muy lento
+                sublinear_tf  = vec_cfg["sublinear_tf"],
+                min_df        = vec_cfg["min_df"],
+                analyzer      = "word",
+                strip_accents = "unicode",
+            )),
+            ("clf", RandomForestClassifier(
+                n_estimators     = rf_cfg["n_estimators"],
+                max_depth        = rf_cfg.get("max_depth", 8),
+                min_samples_leaf = rf_cfg.get("min_samples_leaf", 4),
+                max_features     = "sqrt",
+                class_weight     = rf_cfg["class_weight"],
+                random_state     = 42,
+                n_jobs           = -1,
+            )),
+        ])
+        logger.info("RFModel creado")
+# ── XGBoost ───────────────────────────────────────────────────────────────────
+class XGBModel(BaseSklearnModel):
+    """
+    XGBoost + TF-IDF.
+    Requiere: pip install xgboost
+    """
+    def __init__(
+        self,
+        config_path: str = "configs/models.yaml",
+        feat_config_path: str = "configs/features.yaml",
+    ):
+        super().__init__()
+        try:
+            from xgboost import XGBClassifier
+        except ImportError:
+            raise ImportError("Instala XGBoost: pip install xgboost")
+        with open(config_path) as f:
+            xgb_cfg = yaml.safe_load(f)["models"]["xgboost"]
+        with open(feat_config_path) as f:
+            vec_cfg = yaml.safe_load(f)["vectorization"]["tfidf"]
+        self.pipeline = Pipeline([
+            ("tfidf", TfidfVectorizer(
+                max_features  = vec_cfg["max_features"],
+                ngram_range   = (1, 1),
+                sublinear_tf  = True,
+                min_df        = vec_cfg["min_df"],
+                analyzer      = "word",
+                strip_accents = "unicode",
+            )),
+            ("clf", XGBClassifier(
+                n_estimators     = xgb_cfg.get("n_estimators", 200),
+                max_depth        = xgb_cfg.get("max_depth", 3),
+                learning_rate    = xgb_cfg.get("learning_rate", 0.05),
+                subsample        = xgb_cfg.get("subsample", 0.8),
+                colsample_bytree = xgb_cfg.get("colsample_bytree", 0.8),
+                use_label_encoder= False,
+                eval_metric      = "logloss",
+                random_state     = 42,
+                verbosity        = 0,
+            )),
+        ])
+        logger.info("XGBModel creado")
+# ── Factory ───────────────────────────────────────────────────────────────────
+def build_model(
+    model_type: str,
+    config_path: str = "configs/models.yaml",
+    feat_config_path: str = "configs/features.yaml",
+    best_params_path: str = "configs/best_params.yaml",
+) -> BaseSklearnModel:
+    """
+    Construye el modelo indicado en la configuración.
+    Args:
+        model_type: "lr" | "rf" | "xgboost"
+    Returns:
+        Instancia del modelo listo para .fit()
+    """
+    builders = {
+        "lr"     : lambda: LRModel(config_path, feat_config_path, best_params_path),
+        "rf"     : lambda: RFModel(config_path, feat_config_path),
+        "xgboost": lambda: XGBModel(config_path, feat_config_path),
+    }
+    if model_type not in builders:
+        raise ValueError(f"model_type debe ser uno de: {list(builders.keys())}")
+    logger.info(f"Construyendo modelo: {model_type}")
+    return builders[model_type]()

src/pipeline/.gitkeep DELETED Viewed

File without changes

src/pipeline/run_pipeline.py ADDED Viewed

	@@ -0,0 +1,231 @@

+"""
+src/pipeline/run_pipeline.py
+Pipeline end-to-end de entrenamiento y evaluación.
+Ejecutar con: python -m src.pipeline.run_pipeline [--model lr|rf|xgboost]
+Fases:
+    1. Carga de datos
+    2. Split train/test
+    3. Preprocesamiento (spaCy + NLTK)
+    4. Entrenamiento (modelo elegido desde config o argumento)
+    5. Cross-validation
+    6. Evaluación final en test
+    7. Guardado del modelo
+    8. Registro en MLflow
+    9. Informe JSON + CSV resumen
+Todo se controla desde:
+    configs/pipeline.yaml  → rutas, split, cv_folds
+    configs/features.yaml  → preprocesamiento y vectorización
+    configs/models.yaml    → hiperparámetros
+    configs/best_params.yaml → resultado de Optuna (si existe)
+"""
+import argparse
+import sys
+import yaml
+import mlflow
+import mlflow.sklearn
+from pathlib import Path
+from datetime import datetime
+from sklearn.model_selection import train_test_split
+# ── Setup path ────────────────────────────────────────────────────────────────
+PROJECT_ROOT = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(PROJECT_ROOT))
+from src.data.loader import load_raw_data
+from src.features.text_preprocessor import TextPreprocessor
+from src.models.baseline import build_model
+from src.evaluation.evaluator import Evaluator
+from src.utils.logger import get_logger
+logger = get_logger(__name__)
+# ══════════════════════════════════════════════════════════════════════════════
+# PIPELINE
+# ══════════════════════════════════════════════════════════════════════════════
+def run_pipeline(model_type: str = "lr") -> dict:
+    """
+    Ejecuta el pipeline completo de ML.
+    Args:
+        model_type: "lr" | "rf" | "xgboost"
+    Returns:
+        Dict con las métricas del modelo entrenado.
+    """
+    run_id  = datetime.now().strftime("%Y%m%d_%H%M%S")
+    logger.info("=" * 60)
+    logger.info(f"🚀 PIPELINE — model={model_type} | run={run_id}")
+    logger.info("=" * 60)
+    # ── Cargar configuración ──────────────────────────────────────────────────
+    cfg_pipe = yaml.safe_load(open(PROJECT_ROOT / "configs" / "pipeline.yaml"))
+    cfg_feat = yaml.safe_load(open(PROJECT_ROOT / "configs" / "features.yaml"))
+    TARGET     = cfg_pipe["data"]["target_binary"]
+    RAND       = cfg_pipe["pipeline"]["random_state"]
+    TEST_SIZE  = cfg_pipe["pipeline"]["test_size"]
+    CV_FOLDS   = cfg_pipe["pipeline"]["cv_folds"]
+    RAW_PATH   = PROJECT_ROOT / cfg_pipe["data"]["raw_path"]
+    MODELS_DIR = PROJECT_ROOT / "models"
+    #MODELS_DIR.mkdir(exist_ok=True)
+    # Carpeta segura para experimentos
+    EXPERIMENTS_DIR = MODELS_DIR / "experiments" / model_type
+    EXPERIMENTS_DIR.mkdir(parents=True, exist_ok=True)
+    # ── FASE 1: Carga de datos ────────────────────────────────────────────────
+    logger.info("FASE 1 — Carga de datos")
+    df = load_raw_data(RAW_PATH)
+    logger.info(f"  {len(df)} comentarios cargados")
+    # ── FASE 2: Split ─────────────────────────────────────────────────────────
+    logger.info("FASE 2 — Split train/test")
+    X = df["Text"]
+    y = df[TARGET]
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=TEST_SIZE, random_state=RAND, stratify=y
+    )
+    logger.info(f"  Train: {len(X_train)} | Test: {len(X_test)}")
+    # ── FASE 3: Preprocesamiento ──────────────────────────────────────────────
+    logger.info("FASE 3 — Preprocesamiento NLP")
+    preprocessor = TextPreprocessor(
+        config_path=str(PROJECT_ROOT / "configs" / "features.yaml")
+    )
+    X_train_clean = preprocessor.transform(X_train)
+    X_test_clean  = preprocessor.transform(X_test)
+    # Reemplazar vacíos con texto original (evitar pérdida de muestras)
+    X_train_clean = X_train_clean.where(X_train_clean != "", X_train)
+    X_test_clean  = X_test_clean.where(X_test_clean != "", X_test)
+    logger.info(f"  Preprocesamiento completado")
+    # ── FASE 4: Entrenamiento ─────────────────────────────────────────────────
+    logger.info(f"FASE 4 — Entrenamiento ({model_type.upper()})")
+    model = build_model(
+        model_type,
+        config_path      = str(PROJECT_ROOT / "configs" / "models.yaml"),
+        feat_config_path = str(PROJECT_ROOT / "configs" / "features.yaml"),
+        best_params_path = str(PROJECT_ROOT / "configs" / "best_params.yaml"),
+    )
+    model.fit(X_train_clean, y_train)
+    # ── FASE 5: Cross-validation ──────────────────────────────────────────────
+    logger.info(f"FASE 5 — Cross-validation ({CV_FOLDS} folds)")
+    cv_results = model.cross_validate(X_train_clean, y_train, cv_folds=CV_FOLDS, rand=RAND)
+    # ── FASE 6: Evaluación en test ────────────────────────────────────────────
+    logger.info("FASE 6 — Evaluación en test")
+    evaluator = Evaluator(output_dir=PROJECT_ROOT / "reports" / "v2" / "pipeline")
+    y_pred  = model.predict(X_test_clean)
+    y_proba = model.predict_proba(X_test_clean)[:, 1]
+    metrics = evaluator.evaluate(
+        model, X_test_clean, y_test,
+        model_name  = model_type.upper(),
+        X_train     = X_train_clean,
+        y_train     = y_train,
+        cv_results  = cv_results,
+    )
+    # Visualizaciones
+    evaluator.plot_confusion_matrix(y_test, y_pred, model_type.upper())
+    evaluator.plot_roc_curve(y_test, y_proba, model_type.upper())
+    evaluator.error_analysis(X_test_clean, y_test, y_pred, y_proba)
+    # ── FASE 7: Guardado del modelo ───────────────────────────────────────────
+    logger.info("FASE 7 — Guardado del modelo")
+    model_path = EXPERIMENTS_DIR / f"{model_type}_pipeline_{run_id}.joblib"
+    model.save(model_path)
+    """
+    # Actualizar final_model.joblib si es el modelo por defecto del proyecto
+    final_path = MODELS_DIR / "pipeline" / "final_model.joblib"
+    model.save(final_path)
+    logger.info(f"  Modelo de producción actualizado: {final_path}")
+    """
+    # ── FASE 8: MLflow ────────────────────────────────────────────────────────
+    logger.info("FASE 8 — Registro en MLflow")
+    _log_mlflow(metrics, cv_results, model, model_path, run_id, model_type)
+    # ── FASE 9: Informe ───────────────────────────────────────────────────────
+    logger.info("FASE 9 — Generando informes")
+    metrics["run_id"]    = run_id
+    metrics["model_path"]= str(model_path)
+    evaluator.save_report(metrics, f"exp_{run_id}_{model_type}")
+    evaluator.save_summary([metrics])
+    logger.info("=" * 60)
+    logger.info(f"✅ Pipeline completado — F1={metrics['f1_weighted']:.4f}")
+    logger.info("=" * 60)
+    return metrics
+# ── MLflow logging ────────────────────────────────────────────────────────────
+def _log_mlflow(metrics, cv_results, model, model_path, run_id, model_type):
+    """Registra el experimento en MLflow."""
+    try:
+        mlflow_dir = PROJECT_ROOT / "mlruns"
+        mlflow.set_tracking_uri(f"file://{mlflow_dir}")
+        mlflow.set_experiment("Youtube_project_experiment_pipeline")
+        with mlflow.start_run(run_name=f"{model_type}_{run_id}"):
+            # Parámetros
+            mlflow.log_param("model_type",  model_type)
+            mlflow.log_param("run_id",      run_id)
+            # Métricas del pipeline
+            mlflow.log_metric("test_f1",          metrics["f1_weighted"])
+            mlflow.log_metric("test_roc_auc",      metrics["roc_auc"])
+            mlflow.log_metric("test_fp",           metrics["fp"])
+            mlflow.log_metric("test_fn",           metrics["fn"])
+            mlflow.log_metric("cv_f1_mean",        cv_results["cv_f1_mean"])
+            mlflow.log_metric("cv_f1_std",         cv_results["cv_f1_std"])
+            mlflow.log_metric("train_test_gap_pp", metrics.get("train_test_gap_pp", 0))
+            if "cv_test_gap_pp" in metrics:
+                mlflow.log_metric("cv_test_gap_pp", metrics["cv_test_gap_pp"])
+            # Modelo como artefacto
+            mlflow.sklearn.log_model(model.pipeline, f"{model_type}_pipeline")
+            logger.info(f"  MLflow run registrado: {model_type}_{run_id}")
+    except Exception as e:
+        logger.warning(f"MLflow no disponible: {e}")
+# ══════════════════════════════════════════════════════════════════════════════
+# ENTRY POINT
+# ══════════════════════════════════════════════════════════════════════════════
+def _parse_args():
+    parser = argparse.ArgumentParser(
+        description="Pipeline de entrenamiento — YouTube Hate Speech Detection"
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="lr",
+        choices=["lr", "rf", "xgboost"],
+        help="Tipo de modelo a entrenar (default: lr)",
+    )
+    return parser.parse_args()
+def main():
+    args = _parse_args()
+    metrics = run_pipeline(model_type=args.model)
+    return metrics
+if __name__ == "__main__":
+    main()

src/utils/.gitkeep DELETED Viewed

File without changes

src/utils/config_loader.py ADDED Viewed

	@@ -0,0 +1,36 @@

+"""
+Utilidad para cargar archivos de configuración YAML.
+Todos los módulos deben usar esto en lugar de hardcodear valores.
+"""
+import yaml
+from pathlib import Path
+def load_config(config_path: str) -> dict:
+    """
+    Carga un archivo YAML de configuración.
+    Args:
+        config_path: Ruta al archivo YAML (relativa a la raíz del proyecto).
+    Returns:
+        Diccionario con la configuración.
+    """
+    path = Path(config_path)
+    if not path.exists():
+        raise FileNotFoundError(f"Config file not found: {config_path}")
+    with open(path, "r") as f:
+        return yaml.safe_load(f)
+def load_pipeline_config() -> dict:
+    return load_config("configs/pipeline.yaml")
+def load_features_config() -> dict:
+    return load_config("configs/features.yaml")
+def load_models_config() -> dict:
+    return load_config("configs/models.yaml")

src/utils/logger.py ADDED Viewed

	@@ -0,0 +1,47 @@

+"""
+Sistema de logging centralizado para todo el proyecto.
+Uso: from src.utils.logger import get_logger; logger = get_logger(__name__)
+"""
+import logging
+import sys
+from pathlib import Path
+from datetime import datetime
+def get_logger(name: str, level: int = logging.INFO) -> logging.Logger:
+    """
+    Devuelve un logger configurado con salida a consola y archivo.
+    Args:
+        name: Nombre del logger (usar __name__ en cada módulo).
+        level: Nivel de logging.
+    Returns:
+        Logger configurado.
+    """
+    logger = logging.getLogger(name)
+    if logger.handlers:
+        return logger
+    logger.setLevel(level)
+    formatter = logging.Formatter(
+        "%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+    # Handler consola
+    console_handler = logging.StreamHandler(sys.stdout)
+    console_handler.setFormatter(formatter)
+    logger.addHandler(console_handler)
+    # Handler archivo
+    log_dir = Path("logs")
+    log_dir.mkdir(exist_ok=True)
+    log_file = log_dir / f"pipeline_{datetime.now().strftime('%Y%m%d')}.log"
+    file_handler = logging.FileHandler(log_file)
+    file_handler.setFormatter(formatter)
+    logger.addHandler(file_handler)
+    return logger