JonnyBP commited on
Commit
6cda091
·
1 Parent(s): b53eb67

backup stable api and model service before pipeline testing

Browse files
.gitignore CHANGED
@@ -63,4 +63,16 @@ models/nb08_roberta_hate/
63
  models/nb08_toxic_distilbert/
64
 
65
  models/lr_baseline.joblib
66
- models/best_ensemble.joblib
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  models/nb08_toxic_distilbert/
64
 
65
  models/lr_baseline.joblib
66
+ models/best_ensemble.joblib
67
+
68
+
69
+ # Experiments
70
+ models/experiments/
71
+
72
+ # Reports experiments
73
+ reports/v2/pipeline/
74
+
75
+
76
+ # Python cache
77
+ __pycache__/
78
+ *.pyc
configs/.gitkeep DELETED
File without changes
data/.gitkeep DELETED
File without changes
env.example ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # Copia este archivo como .env y rellena los valores
2
+ # cp .env.example .env
3
+
4
+ # YouTube Data API v3
5
+ # Obtener en: https://console.cloud.google.com/apis/credentials
6
+ YOUTUBE_API_KEY=your_youtube_api_key_here
7
+
8
+ # Entorno
9
+ ENV=development # development | production
notebooks/.gitkeep DELETED
File without changes
reports/.gitkeep DELETED
File without changes
src/.gitkeep DELETED
File without changes
src/api/main.py ADDED
@@ -0,0 +1,462 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ src/api/main.py
3
+
4
+ API REST de producción para detección de hate speech.
5
+ Ejecutar con: uvicorn src.api.main:app --reload --port 8000
6
+
7
+ Documentación automática en:
8
+ http://localhost:8000/docs (Swagger UI)
9
+ http://localhost:8000/redoc (ReDoc)
10
+
11
+ Endpoints:
12
+ GET / → health check
13
+ GET /model-info → info del modelo activo
14
+ GET /models → lista de modelos disponibles
15
+ POST /predict → predice un comentario
16
+ POST /predict-batch → predice una lista de comentarios
17
+ POST /predict-video → dado URL de YouTube, predice todos sus comentarios
18
+ PUT /model/{name} → cambia el modelo activo
19
+ """
20
+
21
+ import os
22
+ import sys
23
+ import time
24
+ import logging
25
+ from pathlib import Path
26
+ from typing import Optional
27
+ from contextlib import asynccontextmanager
28
+ from dotenv import load_dotenv
29
+ load_dotenv()
30
+
31
+ from fastapi import FastAPI, HTTPException, BackgroundTasks, Query
32
+ from fastapi.middleware.cors import CORSMiddleware
33
+ from pydantic import BaseModel, Field, field_validator
34
+
35
+ # ── Setup path ────────────────────────────────────────────────────────────────
36
+ PROJECT_ROOT = Path(__file__).resolve().parents[2]
37
+ sys.path.insert(0, str(PROJECT_ROOT))
38
+
39
+ from src.service.model_service import ModelService, AVAILABLE_MODELS
40
+ from src.utils.logger import get_logger
41
+
42
+ logger = get_logger(__name__)
43
+
44
+ # ── Estado global de la app ───────────────────────────────────────────────────
45
+ # El modelo se carga una sola vez al iniciar la API y se reutiliza.
46
+ # Esto evita cargar el modelo en cada request (costoso en tiempo).
47
+ _state: dict = {
48
+ "service" : None,
49
+ "model_name" : None,
50
+ "startup_time" : None,
51
+ "predictions_served": 0,
52
+ }
53
+
54
+
55
+ # ══════════════════════════════════════════════════════════════════════════════
56
+ # LIFESPAN — carga del modelo al iniciar la API
57
+ # ══════════════════════════════════════════════════════════════════════════════
58
+ @asynccontextmanager
59
+ async def lifespan(app: FastAPI):
60
+ """
61
+ Lifespan context manager de FastAPI.
62
+ Carga el modelo al iniciar la app y libera recursos al cerrarla.
63
+ """
64
+ # Startup
65
+ model_name = os.getenv("MODEL_NAME", list(AVAILABLE_MODELS.keys())[0])
66
+ logger.info(f"Iniciando API — cargando modelo: {model_name}")
67
+ _state["service"] = ModelService(model_name, PROJECT_ROOT)
68
+ _state["model_name"] = model_name
69
+ _state["startup_time"] = time.time()
70
+
71
+ # Warm-up: predecir un texto de prueba para que el modelo quede en memoria
72
+ try:
73
+ _state["service"].predict("test warmup text")
74
+ logger.info("Modelo cargado y warm-up completado ✅")
75
+ except Exception as e:
76
+ logger.warning(f"Warm-up falló (no crítico): {e}")
77
+
78
+ yield # La API está lista
79
+
80
+ # Shutdown
81
+ logger.info("API cerrándose — limpiando recursos")
82
+ _state["service"] = None
83
+
84
+
85
+ # ══════════════════════════════════════════════════════════════════════════════
86
+ # APP
87
+ # ══════════════════════════════════════════════════════════════════════════════
88
+ app = FastAPI(
89
+ title = "SignalMod API",
90
+ description = "API de detección de hate speech en comentarios de YouTube",
91
+ version = "1.0.0",
92
+ lifespan = lifespan,
93
+ )
94
+
95
+ # CORS: permite que el Streamlit (puerto 8501) llame a la API (puerto 8000)
96
+ app.add_middleware(
97
+ CORSMiddleware,
98
+ allow_origins = ["*"],
99
+ allow_methods = ["*"],
100
+ allow_headers = ["*"],
101
+ )
102
+
103
+
104
+ # ══════════════════════════════════════════════════════════════════════════════
105
+ # SCHEMAS — Pydantic valida automáticamente los datos de entrada/salida
106
+ # ══════════════════════════════════════════════════════════════════════════════
107
+ class PredictRequest(BaseModel):
108
+ """Cuerpo del request para predecir un comentario."""
109
+ text : str = Field(..., min_length=1, max_length=5000,
110
+ description="Comentario a analizar")
111
+ threshold: float = Field(0.5, ge=0.0, le=1.0,
112
+ description="Umbral de probabilidad para clasificar como tóxico")
113
+
114
+ @field_validator("text")
115
+ @classmethod
116
+ def text_not_empty(cls, v):
117
+ if not v.strip():
118
+ raise ValueError("El texto no puede estar vacío")
119
+ return v.strip()
120
+
121
+
122
+ class PredictResponse(BaseModel):
123
+ """Respuesta de la predicción."""
124
+ text : str
125
+ is_toxic : bool
126
+ probability: float = Field(..., ge=0.0, le=1.0)
127
+ labels : list[str]
128
+ model_used : str
129
+ latency_ms : float
130
+
131
+
132
+ class BatchPredictRequest(BaseModel):
133
+ """Request para predecir múltiples comentarios."""
134
+ texts : list[str] = Field(..., min_length=1, max_length=100)
135
+ threshold: float = Field(0.5, ge=0.0, le=1.0)
136
+
137
+
138
+ class BatchPredictResponse(BaseModel):
139
+ """Respuesta de predicción batch."""
140
+ results : list[PredictResponse]
141
+ total : int
142
+ toxic_count : int
143
+ latency_ms : float
144
+
145
+
146
+ class VideoRequest(BaseModel):
147
+ """Request para analizar comentarios de un video de YouTube."""
148
+ url : str = Field(..., description="URL del video de YouTube")
149
+ max_comments: int = Field(50, ge=1, le=200,
150
+ description="Número máximo de comentarios a analizar")
151
+ threshold : float = Field(0.5, ge=0.0, le=1.0)
152
+
153
+
154
+ class VideoResponse(BaseModel):
155
+ """Respuesta del análisis de un video de YouTube."""
156
+ video_url : str
157
+ total_fetched: int
158
+ toxic_count : int
159
+ toxic_rate : float
160
+ results : list[PredictResponse]
161
+ error : Optional[str] = None
162
+
163
+
164
+ class ModelInfo(BaseModel):
165
+ """Información sobre el modelo activo."""
166
+ name : str
167
+ type : str
168
+ description : str
169
+ speed : str
170
+ accuracy : str
171
+ uptime_s : float
172
+ predictions_served: int
173
+
174
+
175
+ # ══════════════════════════════════════════════════════════════════════════════
176
+ # HELPERS
177
+ # ══════════════════════════════════════════════════════════════════════════════
178
+ def _get_service() -> ModelService:
179
+ """Devuelve el servicio activo o lanza 503 si no está listo."""
180
+ if _state["service"] is None:
181
+ raise HTTPException(status_code=503, detail="Modelo no cargado. Intenta en unos segundos.")
182
+ return _state["service"]
183
+
184
+
185
+ def _predict_single(text: str, threshold: float) -> tuple[dict, float]:
186
+ """Predice un texto y devuelve (result, latency_ms)."""
187
+ t0 = time.perf_counter()
188
+ result = _get_service().predict(text)
189
+ ms = round((time.perf_counter() - t0) * 1000, 2)
190
+
191
+ # Aplicar umbral personalizado
192
+ result["is_toxic"] = result["probability"] >= threshold
193
+ if not result["is_toxic"]:
194
+ result["labels"] = []
195
+
196
+ _state["predictions_served"] += 1
197
+ return result, ms
198
+
199
+
200
+ def _scrape_youtube_comments(url: str, max_comments: int) -> list[str]:
201
+ """
202
+ Obtiene comentarios de un video de YouTube.
203
+
204
+ Estrategia:
205
+ 1. Intentar con YouTube Data API v3 (si hay API key en .env)
206
+ 2. Fallback: BeautifulSoup (sin autenticación, limitado)
207
+ """
208
+ api_key = os.getenv("YOUTUBE_API_KEY", "")
209
+
210
+ if api_key:
211
+ return _fetch_via_api(url, api_key, max_comments)
212
+ else:
213
+ return _fetch_via_scraper(url, max_comments)
214
+
215
+
216
+ def _fetch_via_api(url: str, api_key: str, max_comments: int) -> list[str]:
217
+ """Obtiene comentarios usando YouTube Data API v3."""
218
+ try:
219
+ import re
220
+ from googleapiclient.discovery import build
221
+
222
+ # Extraer video_id de la URL
223
+ patterns = [
224
+ r"youtube\.com/watch\?v=([a-zA-Z0-9_-]{11})",
225
+ r"youtu\.be/([a-zA-Z0-9_-]{11})",
226
+ r"youtube\.com/embed/([a-zA-Z0-9_-]{11})",
227
+ ]
228
+ video_id = None
229
+ for pattern in patterns:
230
+ match = re.search(pattern, url)
231
+ if match:
232
+ video_id = match.group(1)
233
+ break
234
+
235
+ if not video_id:
236
+ raise ValueError(f"No se pudo extraer video_id de: {url}")
237
+
238
+ youtube = build("youtube", "v3", developerKey=api_key)
239
+ comments = []
240
+ page_token = None
241
+
242
+ while len(comments) < max_comments:
243
+ request = youtube.commentThreads().list(
244
+ part = "snippet",
245
+ videoId = video_id,
246
+ maxResults = min(100, max_comments - len(comments)),
247
+ pageToken = page_token,
248
+ textFormat = "plainText",
249
+ )
250
+ response = request.execute()
251
+
252
+ for item in response.get("items", []):
253
+ text = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
254
+ comments.append(text)
255
+
256
+ page_token = response.get("nextPageToken")
257
+ if not page_token:
258
+ break
259
+
260
+ logger.info(f"YouTube API: {len(comments)} comentarios obtenidos")
261
+ return comments[:max_comments]
262
+
263
+ except Exception as e:
264
+ logger.warning(f"YouTube API falló: {e} — usando fallback")
265
+ return _fetch_via_scraper(url, max_comments)
266
+
267
+
268
+ def _fetch_via_scraper(url: str, max_comments: int) -> list[str]:
269
+ """
270
+ Fallback: simula comentarios si no hay API key.
271
+ En producción real debería usar BeautifulSoup + Selenium.
272
+ """
273
+ logger.warning(
274
+ "YOUTUBE_API_KEY no configurada. "
275
+ "Configura tu API key en .env para obtener comentarios reales. "
276
+ "Usando comentarios de ejemplo."
277
+ )
278
+ # Comentarios de ejemplo para demo sin API key
279
+ example_comments = [
280
+ "This video is really informative, thanks for sharing!",
281
+ "You are all stupid idiots, get out of here!",
282
+ "Great content, I learned a lot from this.",
283
+ "These people should be eliminated from society.",
284
+ "I agree with the presenter's point of view.",
285
+ "What a bunch of racist criminals!",
286
+ "Thank you for this analysis, very helpful.",
287
+ "Kill them all, they don't deserve to live.",
288
+ "Interesting perspective on the topic.",
289
+ "This is absolute bullshit propaganda!",
290
+ "I think we need to look at both sides.",
291
+ "Black people are thugs and criminals.",
292
+ "The data presented here is compelling.",
293
+ "Go back to where you came from!",
294
+ "Well researched video, good job.",
295
+ ]
296
+ return example_comments[:max_comments]
297
+
298
+
299
+ # ══════════════════════════════════════════════════════════════════════════════
300
+ # ENDPOINTS
301
+ # ══════════════════════════════════════════════════════════════════════════════
302
+
303
+ @app.get("/", tags=["Health"])
304
+ async def health_check():
305
+ """
306
+ Verifica que la API está funcionando.
307
+ Útil para Docker healthcheck y load balancers.
308
+ """
309
+ service = _state["service"]
310
+ return {
311
+ "status" : "ok" if service else "loading",
312
+ "model" : _state["model_name"],
313
+ "uptime_s": round(time.time() - _state["startup_time"], 1)
314
+ if _state["startup_time"] else 0,
315
+ }
316
+
317
+
318
+ @app.get("/model-info", response_model=ModelInfo, tags=["Model"])
319
+ async def get_model_info():
320
+ """Devuelve información sobre el modelo activo."""
321
+ service = _get_service()
322
+ info = service.get_model_info()
323
+ return ModelInfo(
324
+ name = _state["model_name"],
325
+ type = info.get("type", "unknown"),
326
+ description = info.get("description", ""),
327
+ speed = info.get("speed", ""),
328
+ accuracy = info.get("accuracy", ""),
329
+ uptime_s = round(time.time() - _state["startup_time"], 1),
330
+ predictions_served= _state["predictions_served"],
331
+ )
332
+
333
+
334
+ @app.get("/models", tags=["Model"])
335
+ async def list_models():
336
+ """Lista todos los modelos disponibles."""
337
+ return {
338
+ "available": list(AVAILABLE_MODELS.keys()),
339
+ "active" : _state["model_name"],
340
+ }
341
+
342
+
343
+ @app.put("/model/{model_name}", tags=["Model"])
344
+ async def switch_model(model_name: str):
345
+ """
346
+ Cambia el modelo activo.
347
+ El nuevo modelo se carga de forma lazy en el siguiente request de predicción.
348
+ """
349
+ if model_name not in AVAILABLE_MODELS:
350
+ raise HTTPException(
351
+ status_code=400,
352
+ detail=f"Modelo '{model_name}' no disponible. "
353
+ f"Opciones: {list(AVAILABLE_MODELS.keys())}",
354
+ )
355
+ _state["service"] = ModelService(model_name, PROJECT_ROOT)
356
+ _state["model_name"] = model_name
357
+ logger.info(f"Modelo cambiado a: {model_name}")
358
+ return {"message": f"Modelo cambiado a '{model_name}'", "model": model_name}
359
+
360
+
361
+ @app.post("/predict", response_model=PredictResponse, tags=["Prediction"])
362
+ async def predict(request: PredictRequest):
363
+ """
364
+ Predice si un comentario es tóxico.
365
+
366
+ - **text**: el comentario a analizar
367
+ - **threshold**: umbral de probabilidad (default 0.5)
368
+
369
+ Devuelve la probabilidad, si es tóxico y las categorías detectadas.
370
+ """
371
+ result, ms = _predict_single(request.text, request.threshold)
372
+
373
+ if "error" in result:
374
+ raise HTTPException(status_code=500, detail=result["error"])
375
+
376
+ return PredictResponse(
377
+ text = request.text,
378
+ is_toxic = result["is_toxic"],
379
+ probability= round(result["probability"], 4),
380
+ labels = result["labels"],
381
+ model_used = result["model_used"],
382
+ latency_ms = ms,
383
+ )
384
+
385
+
386
+ @app.post("/predict-batch", response_model=BatchPredictResponse, tags=["Prediction"])
387
+ async def predict_batch(request: BatchPredictRequest):
388
+ """
389
+ Predice una lista de comentarios en un solo request.
390
+ Más eficiente que llamar /predict N veces.
391
+ Máximo 100 comentarios por request.
392
+ """
393
+ t0 = time.perf_counter()
394
+ results = []
395
+
396
+ for text in request.texts:
397
+ if not text.strip():
398
+ continue
399
+ result, _ = _predict_single(text, request.threshold)
400
+ results.append(PredictResponse(
401
+ text = text,
402
+ is_toxic = result["is_toxic"],
403
+ probability= round(result["probability"], 4),
404
+ labels = result["labels"],
405
+ model_used = result["model_used"],
406
+ latency_ms = 0.0,
407
+ ))
408
+
409
+ total_ms = round((time.perf_counter() - t0) * 1000, 2)
410
+ toxic_count = sum(1 for r in results if r.is_toxic)
411
+
412
+ return BatchPredictResponse(
413
+ results = results,
414
+ total = len(results),
415
+ toxic_count = toxic_count,
416
+ latency_ms = total_ms,
417
+ )
418
+
419
+
420
+ @app.post("/predict-video", response_model=VideoResponse, tags=["Prediction"])
421
+ async def predict_video(request: VideoRequest):
422
+ """
423
+ Dado un URL de YouTube, obtiene los comentarios y predice su toxicidad.
424
+
425
+ Requiere YOUTUBE_API_KEY en el archivo .env para obtener comentarios reales.
426
+ Sin API key usa comentarios de ejemplo para la demo.
427
+ """
428
+ # Obtener comentarios
429
+ try:
430
+ comments = _scrape_youtube_comments(request.url, request.max_comments)
431
+ except Exception as e:
432
+ raise HTTPException(status_code=422, detail=f"Error al obtener comentarios: {e}")
433
+
434
+ if not comments:
435
+ raise HTTPException(status_code=404, detail="No se encontraron comentarios en el video")
436
+
437
+ # Predecir batch
438
+ t0 = time.perf_counter()
439
+ results = []
440
+ for text in comments:
441
+ if not text.strip():
442
+ continue
443
+ result, _ = _predict_single(text, request.threshold)
444
+ results.append(PredictResponse(
445
+ text = text,
446
+ is_toxic = result["is_toxic"],
447
+ probability= round(result["probability"], 4),
448
+ labels = result["labels"],
449
+ model_used = result["model_used"],
450
+ latency_ms = 0.0,
451
+ ))
452
+
453
+ total_ms = round((time.perf_counter() - t0) * 1000, 2)
454
+ toxic_count = sum(1 for r in results if r.is_toxic)
455
+
456
+ return VideoResponse(
457
+ video_url = request.url,
458
+ total_fetched= len(results),
459
+ toxic_count = toxic_count,
460
+ toxic_rate = round(toxic_count / len(results), 4) if results else 0.0,
461
+ results = results,
462
+ )
src/app/.gitkeep DELETED
File without changes
src/data/.gitkeep DELETED
File without changes
src/data/loader.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ src/data/loader.py
3
+
4
+ Carga y valida el dataset de comentarios de YouTube.
5
+ Responsabilidad única: leer el CSV y verificar que tiene las columnas esperadas.
6
+ El preprocesamiento viene después (src/features/text_preprocessor.py).
7
+ """
8
+
9
+ import pandas as pd
10
+ from pathlib import Path
11
+ from src.utils.logger import get_logger
12
+
13
+ logger = get_logger(__name__)
14
+
15
+ # Columnas obligatorias en el dataset
16
+ REQUIRED_COLUMNS = {"Text", "IsToxic"}
17
+
18
+ # Sublabels opcionales (pueden no estar presentes)
19
+ SUBLABEL_COLUMNS = {
20
+ "IsAbusive", "IsProvocative", "IsHatespeech",
21
+ "IsRacist", "IsObscene", "IsThreat",
22
+ }
23
+
24
+
25
+ def load_raw_data(path: str | Path) -> pd.DataFrame:
26
+ """
27
+ Carga el CSV crudo de comentarios de YouTube.
28
+
29
+ Args:
30
+ path: Ruta al archivo CSV.
31
+
32
+ Returns:
33
+ DataFrame validado y limpio a nivel estructural.
34
+
35
+ Raises:
36
+ FileNotFoundError: si el archivo no existe.
37
+ ValueError: si faltan columnas obligatorias.
38
+ """
39
+ path = Path(path)
40
+ if not path.exists():
41
+ raise FileNotFoundError(f"Dataset no encontrado: {path}")
42
+
43
+ logger.info(f"Cargando dataset: {path}")
44
+ df = pd.read_csv(path)
45
+ logger.info(f" Shape: {df.shape}")
46
+
47
+ _validate_columns(df)
48
+ df = _clean_structure(df)
49
+
50
+ logger.info(f" Toxicos: {df['IsToxic'].sum()} ({df['IsToxic'].mean()*100:.1f}%)")
51
+ return df
52
+
53
+
54
+ def load_preprocessed_data(path: str | Path) -> pd.DataFrame:
55
+ """
56
+ Carga el CSV preprocesado (con columna clean_text).
57
+ Generado por el notebook 02 o por run_pipeline.
58
+
59
+ Args:
60
+ path: Ruta al CSV preprocesado.
61
+
62
+ Returns:
63
+ DataFrame con columna clean_text lista para vectorizar.
64
+ """
65
+ path = Path(path)
66
+ if not path.exists():
67
+ raise FileNotFoundError(
68
+ f"Datos preprocesados no encontrados: {path}\n"
69
+ f"Ejecuta: python -m src.pipeline.run_pipeline"
70
+ )
71
+
72
+ df = pd.read_csv(path)
73
+ if "clean_text" not in df.columns:
74
+ raise ValueError("El CSV no tiene columna 'clean_text'. Regenera el preprocesamiento.")
75
+
76
+ logger.info(f"Datos preprocesados cargados: {df.shape}")
77
+ return df
78
+
79
+
80
+ # ── Funciones internas ────────────────────────────────────────────────────────
81
+
82
+ def _validate_columns(df: pd.DataFrame) -> None:
83
+ """Verifica que el dataset tenga las columnas obligatorias."""
84
+ missing = REQUIRED_COLUMNS - set(df.columns)
85
+ if missing:
86
+ raise ValueError(
87
+ f"Columnas obligatorias ausentes: {missing}\n"
88
+ f"Columnas encontradas: {list(df.columns)}"
89
+ )
90
+ logger.info(f" Columnas validadas ✅")
91
+
92
+
93
+ def _clean_structure(df: pd.DataFrame) -> pd.DataFrame:
94
+ """
95
+ Limpieza estructural mínima:
96
+ - Elimina filas con Text vacío
97
+ - Convierte IsToxic a bool
98
+ - Convierte sublabels a bool si existen
99
+ """
100
+ df = df.copy()
101
+
102
+ # Texto
103
+ df["Text"] = df["Text"].fillna("").astype(str).str.strip()
104
+ df = df[df["Text"] != ""].reset_index(drop=True)
105
+
106
+ # Target binario
107
+ df["IsToxic"] = df["IsToxic"].astype(bool)
108
+
109
+ # Sublabels
110
+ for col in SUBLABEL_COLUMNS:
111
+ if col in df.columns:
112
+ df[col] = df[col].astype(bool)
113
+
114
+ # Eliminar duplicados
115
+ n_before = len(df)
116
+ df = df.drop_duplicates(subset=["Text"]).reset_index(drop=True)
117
+ if len(df) < n_before:
118
+ logger.warning(f" {n_before - len(df)} duplicados eliminados")
119
+
120
+ return df
src/evaluation/evaluator.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ src/evaluation/evaluator.py
3
+
4
+ Evaluación estandarizada de modelos.
5
+ Genera métricas, visualizaciones e informes JSON.
6
+
7
+ Uso:
8
+ evaluator = Evaluator(output_dir="reports/pipeline")
9
+ metrics = evaluator.evaluate(model, X_test, y_test, model_name="LR")
10
+ evaluator.error_analysis(X_test, y_test, preds, probs)
11
+ evaluator.save_summary(all_metrics, path="reports/summary.csv")
12
+ """
13
+
14
+ import json
15
+ import numpy as np
16
+ import pandas as pd
17
+ import matplotlib.pyplot as plt
18
+ import seaborn as sns
19
+ from datetime import datetime
20
+ from pathlib import Path
21
+ from sklearn.metrics import (
22
+ f1_score, precision_score, recall_score,
23
+ roc_auc_score, accuracy_score,
24
+ confusion_matrix, classification_report,
25
+ RocCurveDisplay,
26
+ )
27
+ from src.utils.logger import get_logger
28
+
29
+ logger = get_logger(__name__)
30
+
31
+
32
+ class Evaluator:
33
+ """
34
+ Evaluador estandarizado de modelos de clasificación binaria.
35
+
36
+ Genera:
37
+ - Métricas completas (F1, Precision, Recall, ROC-AUC)
38
+ - Ambas métricas de gap (train-test y CV-test)
39
+ - Matriz de confusión (PNG)
40
+ - Curva ROC (PNG)
41
+ - Análisis de errores (FP y FN más comunes)
42
+ - Informe JSON por experimento
43
+ - CSV resumen de todos los experimentos
44
+ """
45
+
46
+ def __init__(self, output_dir: str | Path = "reports/pipeline"):
47
+ self.output_dir = Path(output_dir)
48
+ self.output_dir.mkdir(parents=True, exist_ok=True)
49
+
50
+ # ── Evaluación principal ─────────────────────────────────────────────────
51
+ def evaluate(
52
+ self,
53
+ model,
54
+ X_test,
55
+ y_test,
56
+ model_name: str,
57
+ X_train=None,
58
+ y_train=None,
59
+ cv_results: dict = None,
60
+ ) -> dict:
61
+ """
62
+ Evalúa un modelo sobre el test set.
63
+
64
+ Args:
65
+ model: objeto con método predict() y predict_proba()
66
+ X_test, y_test: datos de test
67
+ model_name: nombre para los reports
68
+ X_train, y_train: opcional — para calcular train_test_gap
69
+ cv_results: opcional — dict con cv_f1_mean para calcular cv_test_gap
70
+
71
+ Returns:
72
+ Dict con todas las métricas.
73
+ """
74
+ logger.info(f"Evaluando: {model_name}")
75
+
76
+ y_pred = model.predict(X_test)
77
+ y_proba = model.predict_proba(X_test)[:, 1]
78
+ y_test_arr = np.array(y_test)
79
+
80
+ # ── Métricas test ────────────────────────────────────────────────────
81
+ metrics = {
82
+ "model" : model_name,
83
+ "timestamp" : datetime.now().isoformat(),
84
+ "f1_weighted": round(f1_score(y_test_arr, y_pred, average="weighted"), 4),
85
+ "f1_toxic" : round(f1_score(y_test_arr, y_pred, pos_label=1), 4),
86
+ "precision" : round(precision_score(y_test_arr, y_pred, average="weighted"), 4),
87
+ "recall" : round(recall_score(y_test_arr, y_pred, average="weighted"), 4),
88
+ "accuracy" : round(accuracy_score(y_test_arr, y_pred), 4),
89
+ "roc_auc" : round(roc_auc_score(y_test_arr, y_proba), 4),
90
+ "fp" : int(((y_test_arr == 0) & (y_pred == 1)).sum()),
91
+ "fn" : int(((y_test_arr == 1) & (y_pred == 0)).sum()),
92
+ "n_test" : len(y_test_arr),
93
+ }
94
+
95
+ # ── Train-test gap (in-sample vs OOS) ────────────────────────────────
96
+ if X_train is not None and y_train is not None:
97
+ y_train_pred = model.predict(X_train)
98
+ f1_train = f1_score(np.array(y_train), y_train_pred, average="weighted")
99
+ metrics["f1_train"] = round(f1_train, 4)
100
+ metrics["train_test_gap_pp"]= round((f1_train - metrics["f1_weighted"]) * 100, 2)
101
+
102
+ # ── CV-test gap (OOS vs OOS — métrica correcta para la rúbrica) ──────
103
+ if cv_results and "cv_f1_mean" in cv_results:
104
+ cv_mean = cv_results["cv_f1_mean"]
105
+ metrics["cv_f1_mean"] = round(cv_mean, 4)
106
+ metrics["cv_f1_std"] = round(cv_results.get("cv_f1_std", 0), 4)
107
+ metrics["cv_test_gap_pp"]= round(abs(cv_mean - metrics["f1_weighted"]) * 100, 2)
108
+
109
+ self._print_summary(metrics)
110
+ return metrics
111
+
112
+ # ── Visualizaciones ──────────────────────────────────────────────────────
113
+ def plot_confusion_matrix(
114
+ self,
115
+ y_test,
116
+ y_pred,
117
+ model_name: str,
118
+ save: bool = True,
119
+ ) -> Path | None:
120
+ """Genera y guarda la matriz de confusión."""
121
+ cm = confusion_matrix(y_test, y_pred)
122
+ fig, ax = plt.subplots(figsize=(5, 4))
123
+ sns.heatmap(
124
+ cm, annot=True, fmt="d", cmap="Blues", ax=ax,
125
+ xticklabels=["No tóxico", "Tóxico"],
126
+ yticklabels=["No tóxico", "Tóxico"],
127
+ linewidths=0.5,
128
+ )
129
+ ax.set_title(f"{model_name} — Confusion Matrix", fontweight="bold")
130
+ ax.set_xlabel("Predicción")
131
+ ax.set_ylabel("Real")
132
+ plt.tight_layout()
133
+
134
+ if save:
135
+ safe = model_name.lower().replace(" ", "_").replace("/", "_")
136
+ path = self.output_dir / f"cm_{safe}.png"
137
+ plt.savefig(path, dpi=150, bbox_inches="tight")
138
+ plt.show()
139
+ logger.info(f"Confusion matrix guardada: {path}")
140
+ return path
141
+
142
+ plt.show()
143
+ return None
144
+
145
+ def plot_roc_curve(
146
+ self,
147
+ y_test,
148
+ y_proba,
149
+ model_name: str,
150
+ save: bool = True,
151
+ ) -> Path | None:
152
+ """Genera y guarda la curva ROC."""
153
+ fig, ax = plt.subplots(figsize=(6, 5))
154
+ RocCurveDisplay.from_predictions(
155
+ y_test, y_proba, ax=ax, name=model_name, color="#7F77DD"
156
+ )
157
+ ax.plot([0, 1], [0, 1], "--", color="gray", alpha=0.5, label="Random")
158
+ ax.set_title(f"{model_name} — Curva ROC", fontweight="bold")
159
+ ax.legend()
160
+ plt.tight_layout()
161
+
162
+ if save:
163
+ safe = model_name.lower().replace(" ", "_").replace("/", "_")
164
+ path = self.output_dir / f"roc_{safe}.png"
165
+ plt.savefig(path, dpi=150, bbox_inches="tight")
166
+ plt.show()
167
+ logger.info(f"Curva ROC guardada: {path}")
168
+ return path
169
+
170
+ plt.show()
171
+ return None
172
+
173
+ # ── Análisis de errores ──────────────────────────────────────────────────
174
+ def error_analysis(
175
+ self,
176
+ X_test,
177
+ y_test,
178
+ y_pred,
179
+ y_proba,
180
+ n_examples: int = 5,
181
+ ) -> dict:
182
+ """
183
+ Analiza los falsos positivos y falsos negativos más relevantes.
184
+
185
+ FP → comentarios OK que el modelo censura (peor UX)
186
+ FN → hate speech que se escapa (peor para el objetivo del proyecto)
187
+ """
188
+ texts = np.array(X_test) if not isinstance(X_test, np.ndarray) else X_test
189
+ y_arr = np.array(y_test)
190
+
191
+ error_df = pd.DataFrame({
192
+ "text" : texts,
193
+ "real" : y_arr,
194
+ "pred" : y_pred,
195
+ "prob_toxic": y_proba,
196
+ })
197
+
198
+ fp = error_df[(error_df["real"] == 0) & (error_df["pred"] == 1)]
199
+ fn = error_df[(error_df["real"] == 1) & (error_df["pred"] == 0)]
200
+
201
+ logger.info(f"Errores: FP={len(fp)} | FN={len(fn)}")
202
+
203
+ print(f"\n{'='*65}")
204
+ print(f"FALSOS NEGATIVOS — hate speech que NO detectó ({len(fn)} total)")
205
+ print(f"{'='*65}")
206
+ for _, row in fn.nsmallest(n_examples, "prob_toxic").iterrows():
207
+ print(f" Prob: {row['prob_toxic']:.3f} | {row['text'][:110]}")
208
+ print()
209
+
210
+ print(f"{'='*65}")
211
+ print(f"FALSOS POSITIVOS — comentarios OK censurados ({len(fp)} total)")
212
+ print(f"{'='*65}")
213
+ for _, row in fp.nlargest(n_examples, "prob_toxic").iterrows():
214
+ print(f" Prob: {row['prob_toxic']:.3f} | {row['text'][:110]}")
215
+ print()
216
+
217
+ return {"fp_examples": fp.head(n_examples).to_dict("records"),
218
+ "fn_examples": fn.head(n_examples).to_dict("records")}
219
+
220
+ # ── Reports ──────────────────────────────────────────────────────────────
221
+ def save_report(self, metrics: dict, experiment_id: str) -> Path:
222
+ """Guarda las métricas de un experimento en JSON."""
223
+ path = self.output_dir / f"{experiment_id}.json"
224
+ with open(path, "w") as f:
225
+ json.dump(metrics, f, indent=2)
226
+ logger.info(f"Report guardado: {path}")
227
+ return path
228
+
229
+ def save_summary(self, all_metrics: list[dict], path: str | Path = None) -> Path:
230
+ """
231
+ Guarda un CSV con todos los experimentos para comparar.
232
+ Este es el 'reports/summary.csv' que mencionaba el roadmap.
233
+ """
234
+ path = Path(path or self.output_dir / "summary.csv")
235
+ df = pd.DataFrame(all_metrics)
236
+
237
+ # Ordenar por F1 descendente
238
+ if "f1_weighted" in df.columns:
239
+ df = df.sort_values("f1_weighted", ascending=False)
240
+
241
+ df.to_csv(path, index=False)
242
+ logger.info(f"Summary guardado: {path}")
243
+ print(df[["model", "f1_weighted", "roc_auc", "fp", "fn"]].to_string(index=False))
244
+ return path
245
+
246
+ # ── Interno ──────────────────────────────────────────────────────────────
247
+ def _print_summary(self, metrics: dict) -> None:
248
+ gap_str = ""
249
+ if "cv_test_gap_pp" in metrics:
250
+ ok = "✅" if metrics["cv_test_gap_pp"] < 5 else "⚠️"
251
+ gap_str = f"CV-test gap: {metrics['cv_test_gap_pp']:.2f}pp {ok}"
252
+ elif "train_test_gap_pp" in metrics:
253
+ ok = "✅" if metrics["train_test_gap_pp"] < 5 else "⚠️"
254
+ gap_str = f"Train-test gap: {metrics['train_test_gap_pp']:.2f}pp {ok}"
255
+
256
+ print(f"\n{'='*55}")
257
+ print(f"RESULTADOS — {metrics['model']}")
258
+ print(f"{'='*55}")
259
+ print(f" F1 weighted : {metrics['f1_weighted']:.4f}")
260
+ print(f" ROC-AUC : {metrics['roc_auc']:.4f}")
261
+ print(f" FP / FN : {metrics['fp']} / {metrics['fn']}")
262
+ if gap_str:
263
+ print(f" {gap_str}")
264
+ print(f"{'='*55}")
src/features/.gitkeep DELETED
File without changes
src/features/text_preprocessor.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ src/features/text_preprocessor.py
3
+
4
+ Pipeline de preprocesamiento NLP.
5
+ Traducción directa del notebook 02 a código de producción.
6
+
7
+ Pasos:
8
+ 1. Lowercase
9
+ 2. Regex: URLs, @menciones, \\xa0, apostrofes, números
10
+ 3. spaCy: lematización (en_core_web_sm)
11
+ 4. NLTK: filtrado stopwords english + custom
12
+
13
+ Uso:
14
+ preprocessor = TextPreprocessor()
15
+ clean_series = preprocessor.transform(df["Text"])
16
+ clean_text = preprocessor.transform("texto crudo aqui")
17
+ """
18
+
19
+ import re
20
+ import yaml
21
+ import nltk
22
+ import spacy
23
+ import pandas as pd
24
+ from pathlib import Path
25
+ from nltk.corpus import stopwords
26
+ from src.utils.logger import get_logger
27
+
28
+ logger = get_logger(__name__)
29
+
30
+ # Descargar recursos NLTK si no existen
31
+ for resource in ["stopwords", "punkt"]:
32
+ nltk.download(resource, quiet=True)
33
+
34
+
35
+ class TextPreprocessor:
36
+ """
37
+ Pipeline NLP para hate speech detection.
38
+ Lee su configuración de configs/features.yaml.
39
+ """
40
+
41
+ # Stopwords custom: palabras frecuentes sin valor discriminante
42
+ # en el dominio YouTube. No son stopwords generales.
43
+ CUSTOM_STOPWORDS = {
44
+ "youtube", "video", "watch", "like", "comment",
45
+ "channel", "click", "subscribe", "link",
46
+ }
47
+
48
+ def __init__(self, config_path: str = "configs/features.yaml"):
49
+ # Cargar config
50
+ with open(config_path) as f:
51
+ cfg = yaml.safe_load(f)["preprocessing"]
52
+ self.cfg = cfg
53
+
54
+ # Stopwords: NLTK + custom
55
+ self.stop_words = set(stopwords.words("english")) | self.CUSTOM_STOPWORDS
56
+ self.min_len = cfg.get("min_token_length", 2)
57
+
58
+ # Cargar modelo spaCy
59
+ # disable=["parser","ner"] → solo usamos el lemmatizer, más rápido
60
+ self.nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
61
+ logger.info(f"TextPreprocessor iniciado — spaCy {self.nlp.meta['version']}")
62
+
63
+ # ── Pasos individuales ────────────────────────────────────────────────────
64
+
65
+ def _lowercase(self, text: str) -> str:
66
+ """Paso 1: minúsculas. 'BLACK' y 'black' son la misma feature."""
67
+ return str(text).lower()
68
+
69
+ def _clean_regex(self, text: str) -> str:
70
+ """
71
+ Paso 2: elimina ruido estructural con regex.
72
+ Orden importante: primero lo más específico, luego lo general.
73
+ """
74
+ text = re.sub(r"http\S+|www\.\S+", "", text) # URLs
75
+ text = re.sub(r"@\w+", "", text) # @menciones
76
+ text = re.sub(r"[\n\t\r]", " ", text) # saltos de línea
77
+ text = re.sub(r"[^\x00-\x7F]+", " ", text) # \xa0, emojis
78
+ text = re.sub(r"'", "", text) # apóstrofes
79
+ text = re.sub(r"\b\d+\b", "", text) # números solos
80
+ text = re.sub(r"\s+", " ", text) # espacios múltiples
81
+ return text.strip()
82
+
83
+ def _lemmatize(self, text: str) -> str:
84
+ """
85
+ Paso 3+4: lematización con spaCy + filtrado de stopwords con NLTK.
86
+
87
+ Por qué spaCy para lematizar:
88
+ Entiende gramática: 'running'→'run', 'cops'→'cop'
89
+ Un stemmer de NLTK simplemente corta: 'running'→'runn'
90
+
91
+ Por qué NLTK para stopwords:
92
+ Lista curada de 179 palabras funcionales.
93
+ Más fácil de personalizar que la lista interna de spaCy.
94
+
95
+ DECISIÓN del EDA: NO eliminar 'black','white','police','cop'
96
+ → Aparecen en ambas clases con contexto distinto.
97
+ El modelo necesita verlas para aprender por bigrams.
98
+ """
99
+ doc = self.nlp(text)
100
+ tokens = [
101
+ token.lemma_
102
+ for token in doc
103
+ if not token.is_punct
104
+ and not token.is_space
105
+ and len(token.text) >= self.min_len
106
+ and token.lemma_ not in self.stop_words
107
+ ]
108
+ return " ".join(tokens)
109
+
110
+ def _transform_one(self, text: str) -> str:
111
+ text = self._lowercase(text)
112
+ text = self._clean_regex(text)
113
+ text = self._lemmatize(text)
114
+ return text
115
+
116
+ # ── Interfaz pública ──────────────────────────────────────────────────────
117
+
118
+ def transform(self, data) -> str | pd.Series:
119
+ """
120
+ Preprocesa un texto o una Serie completa.
121
+
122
+ Args:
123
+ data: str o pd.Series con textos crudos.
124
+
125
+ Returns:
126
+ str o pd.Series con textos limpios y lematizados.
127
+ """
128
+ if isinstance(data, pd.Series):
129
+ logger.info(f"Preprocesando {len(data)} textos...")
130
+ result = data.apply(self._transform_one)
131
+ empty = (result == "").sum()
132
+ if empty > 0:
133
+ logger.warning(f" {empty} textos quedaron vacíos tras limpieza")
134
+ return result
135
+ return self._transform_one(data)
src/features/vectorizer.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ src/features/vectorizer.py
3
+
4
+ Vectorizador configurable desde YAML.
5
+ Traducción directa del notebook 03 a código de producción.
6
+
7
+ Decisión del proyecto: TF-IDF con ngram=(1,2) y max_features=5000.
8
+ Justificación:
9
+ - Bigramas capturan contexto: 'black thug' es distinto a 'black' solo
10
+ - max_features=5000 equilibra vocabulario vs overfitting (800 muestras train)
11
+ - sublinear_tf=True evita que repetir una palabra infle artificialmente su peso
12
+
13
+ Uso:
14
+ vec = Vectorizer()
15
+ X_train_vec = vec.fit_transform(X_train_text)
16
+ X_test_vec = vec.transform(X_test_text)
17
+ """
18
+
19
+ import yaml
20
+ from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
21
+ from src.utils.logger import get_logger
22
+
23
+ logger = get_logger(__name__)
24
+
25
+
26
+ class Vectorizer:
27
+ """
28
+ Wrapper sobre TfidfVectorizer / CountVectorizer.
29
+ Parámetros controlados por configs/features.yaml.
30
+
31
+ Regla crítica: fit() SOLO sobre train, transform() sobre train y test.
32
+ Si se hace fit sobre todo el dataset antes del split → data leakage.
33
+ """
34
+
35
+ def __init__(self, config_path: str = "configs/features.yaml", method: str = None):
36
+ with open(config_path) as f:
37
+ cfg = yaml.safe_load(f)["vectorization"]
38
+
39
+ self.method = method or cfg.get("method", "tfidf")
40
+ c = cfg[self.method]
41
+
42
+ if self.method == "tfidf":
43
+ self.vectorizer = TfidfVectorizer(
44
+ max_features = c["max_features"],
45
+ ngram_range = tuple(c["ngram_range"]),
46
+ sublinear_tf = c.get("sublinear_tf", True),
47
+ min_df = c.get("min_df", 3),
48
+ analyzer = "word",
49
+ strip_accents = "unicode",
50
+ )
51
+ else:
52
+ self.vectorizer = CountVectorizer(
53
+ max_features = c["max_features"],
54
+ ngram_range = tuple(c["ngram_range"]),
55
+ min_df = c.get("min_df", 3),
56
+ analyzer = "word",
57
+ strip_accents = "unicode",
58
+ )
59
+
60
+ logger.info(f"Vectorizer: {self.method} | max_features={c['max_features']} | ngram={c['ngram_range']}")
61
+
62
+ def fit_transform(self, X_train):
63
+ """Ajusta el vocabulario y transforma el train set."""
64
+ logger.info("Vectorizando train set...")
65
+ matrix = self.vectorizer.fit_transform(X_train)
66
+ logger.info(f" Shape: {matrix.shape} | Sparsidad: {1 - matrix.nnz/(matrix.shape[0]*matrix.shape[1]):.1%}")
67
+ return matrix
68
+
69
+ def transform(self, X):
70
+ """Transforma sin ajustar (para test/producción)."""
71
+ return self.vectorizer.transform(X)
72
+
73
+ def get_feature_names(self):
74
+ return self.vectorizer.get_feature_names_out()
75
+
76
+ @property
77
+ def vocabulary_size(self) -> int:
78
+ return len(self.vectorizer.vocabulary_)
src/models/baseline.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ src/models/baseline.py
3
+
4
+ Modelos clásicos de ML para clasificación de texto.
5
+ Traducción directa de notebooks 04 y 05.
6
+
7
+ Todos los modelos siguen la misma interfaz:
8
+ model.fit(X_train, y_train)
9
+ model.predict(X)
10
+ model.predict_proba(X)
11
+ model.save(path)
12
+ Model.load(path)
13
+
14
+ Uso desde el pipeline:
15
+ model = build_model("lr", config_path="configs/models.yaml")
16
+ model.fit(X_train_vec, y_train)
17
+ preds = model.predict(X_test_vec)
18
+ """
19
+
20
+ import yaml
21
+ import joblib
22
+ import numpy as np
23
+ from pathlib import Path
24
+ from sklearn.linear_model import LogisticRegression
25
+ from sklearn.ensemble import RandomForestClassifier
26
+ from sklearn.pipeline import Pipeline
27
+ from sklearn.feature_extraction.text import TfidfVectorizer
28
+ from sklearn.model_selection import StratifiedKFold, cross_validate
29
+ from src.utils.logger import get_logger
30
+
31
+ logger = get_logger(__name__)
32
+
33
+
34
+ # ── Clase base ────────────────────────────────────────────────────────────────
35
+ class BaseSklearnModel:
36
+ """
37
+ Interfaz común para todos los modelos sklearn del proyecto.
38
+ Hereda LRModel y EnsembleModel.
39
+ """
40
+
41
+ def __init__(self):
42
+ self.pipeline = None # sklearn Pipeline (TF-IDF + clf)
43
+ self.is_fitted = False
44
+
45
+ def fit(self, X_train, y_train) -> "BaseSklearnModel":
46
+ """Entrena el pipeline completo."""
47
+ logger.info(f"Entrenando {self.__class__.__name__}...")
48
+ self.pipeline.fit(X_train, y_train)
49
+ self.is_fitted = True
50
+ logger.info(" Entrenamiento completado")
51
+ return self
52
+
53
+ def predict(self, X) -> np.ndarray:
54
+ self._check_fitted()
55
+ return self.pipeline.predict(X)
56
+
57
+ def predict_proba(self, X) -> np.ndarray:
58
+ self._check_fitted()
59
+ return self.pipeline.predict_proba(X)
60
+
61
+ def cross_validate(self, X_train, y_train, cv_folds: int = 5, rand: int = 42) -> dict:
62
+ """
63
+ Evaluación con StratifiedKFold.
64
+ Devuelve medias y desviaciones estándar de las métricas.
65
+ """
66
+ cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=rand)
67
+ results = cross_validate(
68
+ self.pipeline, X_train, y_train,
69
+ cv=cv,
70
+ scoring={"f1": "f1_weighted", "roc_auc": "roc_auc"},
71
+ return_train_score=True,
72
+ n_jobs=-1,
73
+ )
74
+ summary = {
75
+ "cv_f1_mean" : results["test_f1"].mean(),
76
+ "cv_f1_std" : results["test_f1"].std(),
77
+ "cv_roc_mean" : results["test_roc_auc"].mean(),
78
+ "train_f1_mean" : results["train_f1"].mean(),
79
+ "gap_pp" : (results["train_f1"].mean() - results["test_f1"].mean()) * 100,
80
+ }
81
+ logger.info(
82
+ f" CV F1: {summary['cv_f1_mean']:.4f} ± {summary['cv_f1_std']:.4f} | "
83
+ f"Gap: {summary['gap_pp']:.1f}pp"
84
+ )
85
+ return summary
86
+
87
+ def save(self, path: str | Path) -> None:
88
+ path = Path(path)
89
+ path.parent.mkdir(parents=True, exist_ok=True)
90
+ joblib.dump(self.pipeline, path)
91
+ logger.info(f"Modelo guardado: {path}")
92
+
93
+ @classmethod
94
+ def load(cls, path: str | Path) -> "BaseSklearnModel":
95
+ path = Path(path)
96
+ if not path.exists():
97
+ raise FileNotFoundError(f"Modelo no encontrado: {path}")
98
+ instance = cls.__new__(cls)
99
+ instance.pipeline = joblib.load(path)
100
+ instance.is_fitted = True
101
+ logger.info(f"Modelo cargado: {path}")
102
+ return instance
103
+
104
+ def _check_fitted(self):
105
+ if not self.is_fitted:
106
+ raise RuntimeError("El modelo no está entrenado. Llama a .fit() primero.")
107
+
108
+
109
+ # ── Logistic Regression ────────────────────────────────────────────────────────
110
+ class LRModel(BaseSklearnModel):
111
+ """
112
+ Logistic Regression + TF-IDF.
113
+
114
+ Mejor modelo del proyecto (notebook 06):
115
+ F1 test = 0.7579 | CV-test gap = 4.76pp
116
+ Parámetros optimizados con Optuna sobre configs/best_params.yaml.
117
+ """
118
+
119
+ def __init__(
120
+ self,
121
+ config_path: str = "configs/models.yaml",
122
+ feat_config_path: str = "configs/features.yaml",
123
+ best_params_path: str = "configs/best_params.yaml",
124
+ ):
125
+ super().__init__()
126
+
127
+ # Intentar cargar best_params.yaml (resultado de Optuna)
128
+ try:
129
+ import yaml as _yaml
130
+ with open(best_params_path) as f:
131
+ best = _yaml.safe_load(f)
132
+ bp = best.get("hyperparameters", {})
133
+ logger.info("Parámetros cargados desde best_params.yaml")
134
+ except FileNotFoundError:
135
+ bp = {}
136
+ logger.warning("best_params.yaml no encontrado — usando config por defecto")
137
+
138
+ # Config base
139
+ with open(config_path) as f:
140
+ mod_cfg = yaml.safe_load(f)["models"]["logistic_regression"]
141
+ with open(feat_config_path) as f:
142
+ vec_cfg = yaml.safe_load(f)["vectorization"]["tfidf"]
143
+
144
+ # Prioridad: best_params > yaml config
145
+ ngram_str = str(bp.get("ngram_range", "1_2"))
146
+ ngram = (1, 1) if ngram_str == "1_1" else (1, 2)
147
+
148
+ self.pipeline = Pipeline([
149
+ ("tfidf", TfidfVectorizer(
150
+ max_features = bp.get("max_features", vec_cfg["max_features"]),
151
+ ngram_range = ngram,
152
+ sublinear_tf = bp.get("sublinear_tf", vec_cfg["sublinear_tf"]),
153
+ min_df = bp.get("min_df", vec_cfg["min_df"]),
154
+ analyzer = "word",
155
+ strip_accents = "unicode",
156
+ )),
157
+ ("clf", LogisticRegression(
158
+ C = bp.get("C", mod_cfg["C"]),
159
+ max_iter = mod_cfg["max_iter"],
160
+ class_weight = mod_cfg["class_weight"],
161
+ solver = mod_cfg["solver"],
162
+ random_state = 42,
163
+ )),
164
+ ])
165
+ logger.info(f"LRModel creado — C={bp.get('C', mod_cfg['C']):.4f} | ngram={ngram}")
166
+
167
+
168
+ # ── Random Forest ──────────────────────────────────────────────────────────────
169
+ class RFModel(BaseSklearnModel):
170
+ """
171
+ Random Forest + TF-IDF.
172
+ Parámetros desde configs/models.yaml.
173
+ """
174
+
175
+ def __init__(
176
+ self,
177
+ config_path: str = "configs/models.yaml",
178
+ feat_config_path: str = "configs/features.yaml",
179
+ ):
180
+ super().__init__()
181
+
182
+ with open(config_path) as f:
183
+ rf_cfg = yaml.safe_load(f)["models"]["random_forest"]
184
+ with open(feat_config_path) as f:
185
+ vec_cfg = yaml.safe_load(f)["vectorization"]["tfidf"]
186
+
187
+ self.pipeline = Pipeline([
188
+ ("tfidf", TfidfVectorizer(
189
+ max_features = vec_cfg["max_features"],
190
+ ngram_range = (1, 1), # RF + bigramas es muy lento
191
+ sublinear_tf = vec_cfg["sublinear_tf"],
192
+ min_df = vec_cfg["min_df"],
193
+ analyzer = "word",
194
+ strip_accents = "unicode",
195
+ )),
196
+ ("clf", RandomForestClassifier(
197
+ n_estimators = rf_cfg["n_estimators"],
198
+ max_depth = rf_cfg.get("max_depth", 8),
199
+ min_samples_leaf = rf_cfg.get("min_samples_leaf", 4),
200
+ max_features = "sqrt",
201
+ class_weight = rf_cfg["class_weight"],
202
+ random_state = 42,
203
+ n_jobs = -1,
204
+ )),
205
+ ])
206
+ logger.info("RFModel creado")
207
+
208
+
209
+ # ── XGBoost ───────────────────────────────────────────────────────────────────
210
+ class XGBModel(BaseSklearnModel):
211
+ """
212
+ XGBoost + TF-IDF.
213
+ Requiere: pip install xgboost
214
+ """
215
+
216
+ def __init__(
217
+ self,
218
+ config_path: str = "configs/models.yaml",
219
+ feat_config_path: str = "configs/features.yaml",
220
+ ):
221
+ super().__init__()
222
+
223
+ try:
224
+ from xgboost import XGBClassifier
225
+ except ImportError:
226
+ raise ImportError("Instala XGBoost: pip install xgboost")
227
+
228
+ with open(config_path) as f:
229
+ xgb_cfg = yaml.safe_load(f)["models"]["xgboost"]
230
+ with open(feat_config_path) as f:
231
+ vec_cfg = yaml.safe_load(f)["vectorization"]["tfidf"]
232
+
233
+ self.pipeline = Pipeline([
234
+ ("tfidf", TfidfVectorizer(
235
+ max_features = vec_cfg["max_features"],
236
+ ngram_range = (1, 1),
237
+ sublinear_tf = True,
238
+ min_df = vec_cfg["min_df"],
239
+ analyzer = "word",
240
+ strip_accents = "unicode",
241
+ )),
242
+ ("clf", XGBClassifier(
243
+ n_estimators = xgb_cfg.get("n_estimators", 200),
244
+ max_depth = xgb_cfg.get("max_depth", 3),
245
+ learning_rate = xgb_cfg.get("learning_rate", 0.05),
246
+ subsample = xgb_cfg.get("subsample", 0.8),
247
+ colsample_bytree = xgb_cfg.get("colsample_bytree", 0.8),
248
+ use_label_encoder= False,
249
+ eval_metric = "logloss",
250
+ random_state = 42,
251
+ verbosity = 0,
252
+ )),
253
+ ])
254
+ logger.info("XGBModel creado")
255
+
256
+
257
+ # ── Factory ───────────────────────────────────────────────────────────────────
258
+ def build_model(
259
+ model_type: str,
260
+ config_path: str = "configs/models.yaml",
261
+ feat_config_path: str = "configs/features.yaml",
262
+ best_params_path: str = "configs/best_params.yaml",
263
+ ) -> BaseSklearnModel:
264
+ """
265
+ Construye el modelo indicado en la configuración.
266
+
267
+ Args:
268
+ model_type: "lr" | "rf" | "xgboost"
269
+
270
+ Returns:
271
+ Instancia del modelo listo para .fit()
272
+ """
273
+ builders = {
274
+ "lr" : lambda: LRModel(config_path, feat_config_path, best_params_path),
275
+ "rf" : lambda: RFModel(config_path, feat_config_path),
276
+ "xgboost": lambda: XGBModel(config_path, feat_config_path),
277
+ }
278
+ if model_type not in builders:
279
+ raise ValueError(f"model_type debe ser uno de: {list(builders.keys())}")
280
+
281
+ logger.info(f"Construyendo modelo: {model_type}")
282
+ return builders[model_type]()
src/pipeline/.gitkeep DELETED
File without changes
src/pipeline/run_pipeline.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ src/pipeline/run_pipeline.py
3
+
4
+ Pipeline end-to-end de entrenamiento y evaluación.
5
+ Ejecutar con: python -m src.pipeline.run_pipeline [--model lr|rf|xgboost]
6
+
7
+ Fases:
8
+ 1. Carga de datos
9
+ 2. Split train/test
10
+ 3. Preprocesamiento (spaCy + NLTK)
11
+ 4. Entrenamiento (modelo elegido desde config o argumento)
12
+ 5. Cross-validation
13
+ 6. Evaluación final en test
14
+ 7. Guardado del modelo
15
+ 8. Registro en MLflow
16
+ 9. Informe JSON + CSV resumen
17
+
18
+ Todo se controla desde:
19
+ configs/pipeline.yaml → rutas, split, cv_folds
20
+ configs/features.yaml → preprocesamiento y vectorización
21
+ configs/models.yaml → hiperparámetros
22
+ configs/best_params.yaml → resultado de Optuna (si existe)
23
+ """
24
+
25
+ import argparse
26
+ import sys
27
+ import yaml
28
+ import mlflow
29
+ import mlflow.sklearn
30
+ from pathlib import Path
31
+ from datetime import datetime
32
+ from sklearn.model_selection import train_test_split
33
+
34
+ # ── Setup path ────────────────────────────────────────────────────────────────
35
+ PROJECT_ROOT = Path(__file__).resolve().parents[2]
36
+ sys.path.insert(0, str(PROJECT_ROOT))
37
+
38
+ from src.data.loader import load_raw_data
39
+ from src.features.text_preprocessor import TextPreprocessor
40
+ from src.models.baseline import build_model
41
+ from src.evaluation.evaluator import Evaluator
42
+ from src.utils.logger import get_logger
43
+
44
+ logger = get_logger(__name__)
45
+
46
+
47
+ # ══════════════════════════════════════════════════════════════════════════════
48
+ # PIPELINE
49
+ # ══════════════════════════════════════════════════════════════════════════════
50
+ def run_pipeline(model_type: str = "lr") -> dict:
51
+ """
52
+ Ejecuta el pipeline completo de ML.
53
+
54
+ Args:
55
+ model_type: "lr" | "rf" | "xgboost"
56
+
57
+ Returns:
58
+ Dict con las métricas del modelo entrenado.
59
+ """
60
+ run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
61
+ logger.info("=" * 60)
62
+ logger.info(f"🚀 PIPELINE — model={model_type} | run={run_id}")
63
+ logger.info("=" * 60)
64
+
65
+ # ── Cargar configuración ──────────────────────────────────────────────────
66
+ cfg_pipe = yaml.safe_load(open(PROJECT_ROOT / "configs" / "pipeline.yaml"))
67
+ cfg_feat = yaml.safe_load(open(PROJECT_ROOT / "configs" / "features.yaml"))
68
+
69
+ TARGET = cfg_pipe["data"]["target_binary"]
70
+ RAND = cfg_pipe["pipeline"]["random_state"]
71
+ TEST_SIZE = cfg_pipe["pipeline"]["test_size"]
72
+ CV_FOLDS = cfg_pipe["pipeline"]["cv_folds"]
73
+ RAW_PATH = PROJECT_ROOT / cfg_pipe["data"]["raw_path"]
74
+ MODELS_DIR = PROJECT_ROOT / "models"
75
+ #MODELS_DIR.mkdir(exist_ok=True)
76
+ # Carpeta segura para experimentos
77
+ EXPERIMENTS_DIR = MODELS_DIR / "experiments" / model_type
78
+ EXPERIMENTS_DIR.mkdir(parents=True, exist_ok=True)
79
+
80
+ # ── FASE 1: Carga de datos ────────────────────────────────────────────────
81
+ logger.info("FASE 1 — Carga de datos")
82
+ df = load_raw_data(RAW_PATH)
83
+ logger.info(f" {len(df)} comentarios cargados")
84
+
85
+ # ── FASE 2: Split ─────────────────────────────────────────────────────────
86
+ logger.info("FASE 2 — Split train/test")
87
+ X = df["Text"]
88
+ y = df[TARGET]
89
+
90
+ X_train, X_test, y_train, y_test = train_test_split(
91
+ X, y, test_size=TEST_SIZE, random_state=RAND, stratify=y
92
+ )
93
+ logger.info(f" Train: {len(X_train)} | Test: {len(X_test)}")
94
+
95
+ # ── FASE 3: Preprocesamiento ──────────────────────────────────────────────
96
+ logger.info("FASE 3 — Preprocesamiento NLP")
97
+ preprocessor = TextPreprocessor(
98
+ config_path=str(PROJECT_ROOT / "configs" / "features.yaml")
99
+ )
100
+ X_train_clean = preprocessor.transform(X_train)
101
+ X_test_clean = preprocessor.transform(X_test)
102
+
103
+ # Reemplazar vacíos con texto original (evitar pérdida de muestras)
104
+ X_train_clean = X_train_clean.where(X_train_clean != "", X_train)
105
+ X_test_clean = X_test_clean.where(X_test_clean != "", X_test)
106
+
107
+ logger.info(f" Preprocesamiento completado")
108
+
109
+ # ── FASE 4: Entrenamiento ─────────────────────────────────────────────────
110
+ logger.info(f"FASE 4 — Entrenamiento ({model_type.upper()})")
111
+ model = build_model(
112
+ model_type,
113
+ config_path = str(PROJECT_ROOT / "configs" / "models.yaml"),
114
+ feat_config_path = str(PROJECT_ROOT / "configs" / "features.yaml"),
115
+ best_params_path = str(PROJECT_ROOT / "configs" / "best_params.yaml"),
116
+ )
117
+ model.fit(X_train_clean, y_train)
118
+
119
+ # ── FASE 5: Cross-validation ──────────────────────────────────────────────
120
+ logger.info(f"FASE 5 — Cross-validation ({CV_FOLDS} folds)")
121
+ cv_results = model.cross_validate(X_train_clean, y_train, cv_folds=CV_FOLDS, rand=RAND)
122
+
123
+ # ── FASE 6: Evaluación en test ────────────────────────────────────────────
124
+ logger.info("FASE 6 — Evaluación en test")
125
+ evaluator = Evaluator(output_dir=PROJECT_ROOT / "reports" / "v2" / "pipeline")
126
+
127
+ y_pred = model.predict(X_test_clean)
128
+ y_proba = model.predict_proba(X_test_clean)[:, 1]
129
+
130
+ metrics = evaluator.evaluate(
131
+ model, X_test_clean, y_test,
132
+ model_name = model_type.upper(),
133
+ X_train = X_train_clean,
134
+ y_train = y_train,
135
+ cv_results = cv_results,
136
+ )
137
+
138
+ # Visualizaciones
139
+ evaluator.plot_confusion_matrix(y_test, y_pred, model_type.upper())
140
+ evaluator.plot_roc_curve(y_test, y_proba, model_type.upper())
141
+ evaluator.error_analysis(X_test_clean, y_test, y_pred, y_proba)
142
+
143
+ # ── FASE 7: Guardado del modelo ───────────────────────────────────────────
144
+ logger.info("FASE 7 — Guardado del modelo")
145
+ model_path = EXPERIMENTS_DIR / f"{model_type}_pipeline_{run_id}.joblib"
146
+ model.save(model_path)
147
+
148
+ """
149
+ # Actualizar final_model.joblib si es el modelo por defecto del proyecto
150
+ final_path = MODELS_DIR / "pipeline" / "final_model.joblib"
151
+ model.save(final_path)
152
+ logger.info(f" Modelo de producción actualizado: {final_path}")
153
+ """
154
+
155
+ # ── FASE 8: MLflow ────────────────────────────────────────────────────────
156
+ logger.info("FASE 8 — Registro en MLflow")
157
+ _log_mlflow(metrics, cv_results, model, model_path, run_id, model_type)
158
+
159
+ # ── FASE 9: Informe ───────────────────────────────────────────────────────
160
+ logger.info("FASE 9 — Generando informes")
161
+ metrics["run_id"] = run_id
162
+ metrics["model_path"]= str(model_path)
163
+ evaluator.save_report(metrics, f"exp_{run_id}_{model_type}")
164
+ evaluator.save_summary([metrics])
165
+
166
+ logger.info("=" * 60)
167
+ logger.info(f"✅ Pipeline completado — F1={metrics['f1_weighted']:.4f}")
168
+ logger.info("=" * 60)
169
+
170
+ return metrics
171
+
172
+
173
+ # ── MLflow logging ────────────────────────────────────────────────────────────
174
+ def _log_mlflow(metrics, cv_results, model, model_path, run_id, model_type):
175
+ """Registra el experimento en MLflow."""
176
+ try:
177
+ mlflow_dir = PROJECT_ROOT / "mlruns"
178
+ mlflow.set_tracking_uri(f"file://{mlflow_dir}")
179
+ mlflow.set_experiment("Youtube_project_experiment_pipeline")
180
+
181
+ with mlflow.start_run(run_name=f"{model_type}_{run_id}"):
182
+ # Parámetros
183
+ mlflow.log_param("model_type", model_type)
184
+ mlflow.log_param("run_id", run_id)
185
+
186
+ # Métricas del pipeline
187
+ mlflow.log_metric("test_f1", metrics["f1_weighted"])
188
+ mlflow.log_metric("test_roc_auc", metrics["roc_auc"])
189
+ mlflow.log_metric("test_fp", metrics["fp"])
190
+ mlflow.log_metric("test_fn", metrics["fn"])
191
+ mlflow.log_metric("cv_f1_mean", cv_results["cv_f1_mean"])
192
+ mlflow.log_metric("cv_f1_std", cv_results["cv_f1_std"])
193
+ mlflow.log_metric("train_test_gap_pp", metrics.get("train_test_gap_pp", 0))
194
+
195
+ if "cv_test_gap_pp" in metrics:
196
+ mlflow.log_metric("cv_test_gap_pp", metrics["cv_test_gap_pp"])
197
+
198
+ # Modelo como artefacto
199
+ mlflow.sklearn.log_model(model.pipeline, f"{model_type}_pipeline")
200
+
201
+ logger.info(f" MLflow run registrado: {model_type}_{run_id}")
202
+
203
+ except Exception as e:
204
+ logger.warning(f"MLflow no disponible: {e}")
205
+
206
+
207
+ # ══════════════════════════════════════════════════════════════════════════════
208
+ # ENTRY POINT
209
+ # ═════════════════════════════════════════════��════════════════════════════════
210
+ def _parse_args():
211
+ parser = argparse.ArgumentParser(
212
+ description="Pipeline de entrenamiento — YouTube Hate Speech Detection"
213
+ )
214
+ parser.add_argument(
215
+ "--model",
216
+ type=str,
217
+ default="lr",
218
+ choices=["lr", "rf", "xgboost"],
219
+ help="Tipo de modelo a entrenar (default: lr)",
220
+ )
221
+ return parser.parse_args()
222
+
223
+
224
+ def main():
225
+ args = _parse_args()
226
+ metrics = run_pipeline(model_type=args.model)
227
+ return metrics
228
+
229
+
230
+ if __name__ == "__main__":
231
+ main()
src/utils/.gitkeep DELETED
File without changes
src/utils/config_loader.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utilidad para cargar archivos de configuración YAML.
3
+ Todos los módulos deben usar esto en lugar de hardcodear valores.
4
+ """
5
+ import yaml
6
+ from pathlib import Path
7
+
8
+
9
+ def load_config(config_path: str) -> dict:
10
+ """
11
+ Carga un archivo YAML de configuración.
12
+
13
+ Args:
14
+ config_path: Ruta al archivo YAML (relativa a la raíz del proyecto).
15
+
16
+ Returns:
17
+ Diccionario con la configuración.
18
+ """
19
+ path = Path(config_path)
20
+ if not path.exists():
21
+ raise FileNotFoundError(f"Config file not found: {config_path}")
22
+
23
+ with open(path, "r") as f:
24
+ return yaml.safe_load(f)
25
+
26
+
27
+ def load_pipeline_config() -> dict:
28
+ return load_config("configs/pipeline.yaml")
29
+
30
+
31
+ def load_features_config() -> dict:
32
+ return load_config("configs/features.yaml")
33
+
34
+
35
+ def load_models_config() -> dict:
36
+ return load_config("configs/models.yaml")
src/utils/logger.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sistema de logging centralizado para todo el proyecto.
3
+ Uso: from src.utils.logger import get_logger; logger = get_logger(__name__)
4
+ """
5
+ import logging
6
+ import sys
7
+ from pathlib import Path
8
+ from datetime import datetime
9
+
10
+
11
+ def get_logger(name: str, level: int = logging.INFO) -> logging.Logger:
12
+ """
13
+ Devuelve un logger configurado con salida a consola y archivo.
14
+
15
+ Args:
16
+ name: Nombre del logger (usar __name__ en cada módulo).
17
+ level: Nivel de logging.
18
+
19
+ Returns:
20
+ Logger configurado.
21
+ """
22
+ logger = logging.getLogger(name)
23
+
24
+ if logger.handlers:
25
+ return logger
26
+
27
+ logger.setLevel(level)
28
+
29
+ formatter = logging.Formatter(
30
+ "%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
31
+ datefmt="%Y-%m-%d %H:%M:%S",
32
+ )
33
+
34
+ # Handler consola
35
+ console_handler = logging.StreamHandler(sys.stdout)
36
+ console_handler.setFormatter(formatter)
37
+ logger.addHandler(console_handler)
38
+
39
+ # Handler archivo
40
+ log_dir = Path("logs")
41
+ log_dir.mkdir(exist_ok=True)
42
+ log_file = log_dir / f"pipeline_{datetime.now().strftime('%Y%m%d')}.log"
43
+ file_handler = logging.FileHandler(log_file)
44
+ file_handler.setFormatter(formatter)
45
+ logger.addHandler(file_handler)
46
+
47
+ return logger