fix: change the name of the experiment. #3
Browse files
notebooks/02_preprocessing_v2.ipynb
CHANGED
|
@@ -6,10 +6,8 @@
|
|
| 6 |
"source": [
|
| 7 |
"# 🔧 Notebook 02 — Preprocesamiento de Texto\n",
|
| 8 |
"\n",
|
| 9 |
-
"### ¿Qué hace este notebook?\n",
|
| 10 |
"Construimos y validamos el pipeline de limpieza de texto **paso a paso**.\n",
|
| 11 |
"\n",
|
| 12 |
-
"### ¿Por qué se hace así?\n",
|
| 13 |
"El texto crudo de YouTube tiene ruido que engaña al modelo: URLs, menciones, caracteres raros (`\\xa0`), contracciones rotas (`don t`).\n",
|
| 14 |
"Antes de vectorizar necesitamos texto limpio y normalizado.\n",
|
| 15 |
"\n",
|
|
@@ -17,12 +15,7 @@
|
|
| 17 |
"- **`re`** → expresiones regulares para limpiar ruido estructural\n",
|
| 18 |
"- **`NLTK`** → lista curada de 179 stopwords en inglés\n",
|
| 19 |
"- **`spaCy`** → lematización con modelo de lenguaje real `en_core_web_sm`\n",
|
| 20 |
-
"- **`MLflow`** → registrar qué configuración de preprocesamiento usamos
|
| 21 |
-
"\n",
|
| 22 |
-
"### Output de este notebook\n",
|
| 23 |
-
"- Columna `clean_text` lista para vectorizar\n",
|
| 24 |
-
"- `data/processed/v2/comments_preprocessed.csv`\n",
|
| 25 |
-
"- Experimento registrado en MLflow: `Youtube_project_experiment`"
|
| 26 |
]
|
| 27 |
},
|
| 28 |
{
|
|
@@ -36,7 +29,7 @@
|
|
| 36 |
},
|
| 37 |
{
|
| 38 |
"cell_type": "code",
|
| 39 |
-
"execution_count":
|
| 40 |
"metadata": {},
|
| 41 |
"outputs": [
|
| 42 |
{
|
|
@@ -72,7 +65,7 @@
|
|
| 72 |
"import warnings\n",
|
| 73 |
"warnings.filterwarnings('ignore')\n",
|
| 74 |
"\n",
|
| 75 |
-
"# Ruta raiz —
|
| 76 |
"PROJECT_ROOT = Path.cwd().parent\n",
|
| 77 |
"sys.path.insert(0, str(PROJECT_ROOT))\n",
|
| 78 |
"\n",
|
|
@@ -289,7 +282,7 @@
|
|
| 289 |
"# ── PASO 2: Limpieza con Regex ────────────────────────────────────────────\n",
|
| 290 |
"# Por que regex: hay ruido sistematico en comentarios de YouTube.\n",
|
| 291 |
"# El EDA mostro: \\xa0 embebidos, saltos de linea, URLs, @menciones.\n",
|
| 292 |
-
"
|
| 293 |
"\n",
|
| 294 |
"def clean_regex(text: str) -> str:\n",
|
| 295 |
" \"\"\"Limpieza con expresiones regulares.\"\"\"\n",
|
|
@@ -359,11 +352,7 @@
|
|
| 359 |
"# Por que NLTK para STOPWORDS:\n",
|
| 360 |
"# Lista curada de 179 palabras funcionales (the, is, at, which...)\n",
|
| 361 |
"# Mas explicita y facil de personalizar que la lista interna de spaCy\n",
|
| 362 |
-
"
|
| 363 |
-
"# DECISION CRITICA del EDA:\n",
|
| 364 |
-
"# NO anadir 'black', 'white', 'police', 'cop' a stopwords.\n",
|
| 365 |
-
"# Aparecen en ambas clases pero con contexto DISTINTO.\n",
|
| 366 |
-
"# El modelo necesita verlas para discriminar por bigrams.\n",
|
| 367 |
"\n",
|
| 368 |
"STOP_WORDS = set(stopwords.words('english'))\n",
|
| 369 |
"\n",
|
|
@@ -375,7 +364,7 @@
|
|
| 375 |
"\n",
|
| 376 |
"def lemmatize_and_filter(text: str) -> str:\n",
|
| 377 |
" \"\"\"Lematiza con spaCy y filtra stopwords con NLTK.\"\"\"\n",
|
| 378 |
-
" doc = nlp(text)\n",
|
| 379 |
" tokens = [\n",
|
| 380 |
" token.lemma_\n",
|
| 381 |
" for token in doc\n",
|
|
@@ -386,7 +375,8 @@
|
|
| 386 |
" ]\n",
|
| 387 |
" return ' '.join(tokens)\n",
|
| 388 |
"\n",
|
| 389 |
-
"
|
|
|
|
| 390 |
"print('PASO 3+4 — Lematizacion (spaCy) + Filtrado (NLTK)')\n",
|
| 391 |
"print('-' * 65)\n",
|
| 392 |
"test_texts = [\n",
|
|
@@ -741,7 +731,7 @@
|
|
| 741 |
},
|
| 742 |
{
|
| 743 |
"cell_type": "code",
|
| 744 |
-
"execution_count":
|
| 745 |
"metadata": {},
|
| 746 |
"outputs": [
|
| 747 |
{
|
|
@@ -804,9 +794,6 @@
|
|
| 804 |
" spaCy para lemma, no NLTK Stemmer\n",
|
| 805 |
" -> stemmer corta letras, lemma entiende gramatica\n",
|
| 806 |
"\n",
|
| 807 |
-
"Archivo guardado:\n",
|
| 808 |
-
" data/processed/comments_preprocessed.csv\n",
|
| 809 |
-
"\n",
|
| 810 |
"\"\"\")"
|
| 811 |
]
|
| 812 |
}
|
|
|
|
| 6 |
"source": [
|
| 7 |
"# 🔧 Notebook 02 — Preprocesamiento de Texto\n",
|
| 8 |
"\n",
|
|
|
|
| 9 |
"Construimos y validamos el pipeline de limpieza de texto **paso a paso**.\n",
|
| 10 |
"\n",
|
|
|
|
| 11 |
"El texto crudo de YouTube tiene ruido que engaña al modelo: URLs, menciones, caracteres raros (`\\xa0`), contracciones rotas (`don t`).\n",
|
| 12 |
"Antes de vectorizar necesitamos texto limpio y normalizado.\n",
|
| 13 |
"\n",
|
|
|
|
| 15 |
"- **`re`** → expresiones regulares para limpiar ruido estructural\n",
|
| 16 |
"- **`NLTK`** → lista curada de 179 stopwords en inglés\n",
|
| 17 |
"- **`spaCy`** → lematización con modelo de lenguaje real `en_core_web_sm`\n",
|
| 18 |
+
"- **`MLflow`** → registrar qué configuración de preprocesamiento usamos"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
]
|
| 20 |
},
|
| 21 |
{
|
|
|
|
| 29 |
},
|
| 30 |
{
|
| 31 |
"cell_type": "code",
|
| 32 |
+
"execution_count": null,
|
| 33 |
"metadata": {},
|
| 34 |
"outputs": [
|
| 35 |
{
|
|
|
|
| 65 |
"import warnings\n",
|
| 66 |
"warnings.filterwarnings('ignore')\n",
|
| 67 |
"\n",
|
| 68 |
+
"# Ruta raiz del proyecto — directorio padre del directorio de trabajo actual\n",
|
| 69 |
"PROJECT_ROOT = Path.cwd().parent\n",
|
| 70 |
"sys.path.insert(0, str(PROJECT_ROOT))\n",
|
| 71 |
"\n",
|
|
|
|
| 282 |
"# ── PASO 2: Limpieza con Regex ────────────────────────────────────────────\n",
|
| 283 |
"# Por que regex: hay ruido sistematico en comentarios de YouTube.\n",
|
| 284 |
"# El EDA mostro: \\xa0 embebidos, saltos de linea, URLs, @menciones.\n",
|
| 285 |
+
"\n",
|
| 286 |
"\n",
|
| 287 |
"def clean_regex(text: str) -> str:\n",
|
| 288 |
" \"\"\"Limpieza con expresiones regulares.\"\"\"\n",
|
|
|
|
| 352 |
"# Por que NLTK para STOPWORDS:\n",
|
| 353 |
"# Lista curada de 179 palabras funcionales (the, is, at, which...)\n",
|
| 354 |
"# Mas explicita y facil de personalizar que la lista interna de spaCy\n",
|
| 355 |
+
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 356 |
"\n",
|
| 357 |
"STOP_WORDS = set(stopwords.words('english'))\n",
|
| 358 |
"\n",
|
|
|
|
| 364 |
"\n",
|
| 365 |
"def lemmatize_and_filter(text: str) -> str:\n",
|
| 366 |
" \"\"\"Lematiza con spaCy y filtra stopwords con NLTK.\"\"\"\n",
|
| 367 |
+
" doc = nlp(text) # Separación de palabras\n",
|
| 368 |
" tokens = [\n",
|
| 369 |
" token.lemma_\n",
|
| 370 |
" for token in doc\n",
|
|
|
|
| 375 |
" ]\n",
|
| 376 |
" return ' '.join(tokens)\n",
|
| 377 |
"\n",
|
| 378 |
+
"\n",
|
| 379 |
+
"# Validacion: ver exactamente que hace la lematizacion (ejemplo)\n",
|
| 380 |
"print('PASO 3+4 — Lematizacion (spaCy) + Filtrado (NLTK)')\n",
|
| 381 |
"print('-' * 65)\n",
|
| 382 |
"test_texts = [\n",
|
|
|
|
| 731 |
},
|
| 732 |
{
|
| 733 |
"cell_type": "code",
|
| 734 |
+
"execution_count": null,
|
| 735 |
"metadata": {},
|
| 736 |
"outputs": [
|
| 737 |
{
|
|
|
|
| 794 |
" spaCy para lemma, no NLTK Stemmer\n",
|
| 795 |
" -> stemmer corta letras, lemma entiende gramatica\n",
|
| 796 |
"\n",
|
|
|
|
|
|
|
|
|
|
| 797 |
"\"\"\")"
|
| 798 |
]
|
| 799 |
}
|