feat: add vectorization notebook. #4
Browse files
notebooks/01_eda_v2.ipynb
CHANGED
|
@@ -515,6 +515,40 @@
|
|
| 515 |
"print(df['word_count'].describe().round(1))\n"
|
| 516 |
]
|
| 517 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 518 |
{
|
| 519 |
"cell_type": "code",
|
| 520 |
"execution_count": 32,
|
|
@@ -932,7 +966,7 @@
|
|
| 932 |
"id": "b90fab71",
|
| 933 |
"metadata": {},
|
| 934 |
"source": [
|
| 935 |
-
"## 7. Análisis por VideoId\n",
|
| 936 |
"\n",
|
| 937 |
"¿Algunos vídeos tienen más comentarios de odio? ¿El origen del vídeo sesga el dataset?\n"
|
| 938 |
]
|
|
|
|
| 515 |
"print(df['word_count'].describe().round(1))\n"
|
| 516 |
]
|
| 517 |
},
|
| 518 |
+
{
|
| 519 |
+
"cell_type": "code",
|
| 520 |
+
"execution_count": 2,
|
| 521 |
+
"id": "c239ed0f",
|
| 522 |
+
"metadata": {},
|
| 523 |
+
"outputs": [
|
| 524 |
+
{
|
| 525 |
+
"name": "stdout",
|
| 526 |
+
"output_type": "stream",
|
| 527 |
+
"text": [
|
| 528 |
+
" -- CARACTERES --\n",
|
| 529 |
+
"La media > mediana = distribución sesgada\n",
|
| 530 |
+
"Hay comentarios MUY largos (hasta 4421 caracteres)\n",
|
| 531 |
+
"Pero no representan el comportamiento típico\n",
|
| 532 |
+
" \n",
|
| 533 |
+
" -- PALABRAS --\n",
|
| 534 |
+
"Igual patrón: muchos textos cortos, pocos muy largos\n",
|
| 535 |
+
"815 palabras es un outlier fuerte\n",
|
| 536 |
+
"\n"
|
| 537 |
+
]
|
| 538 |
+
}
|
| 539 |
+
],
|
| 540 |
+
"source": [
|
| 541 |
+
"print(\"\"\" -- CARACTERES --\n",
|
| 542 |
+
"La media > mediana = distribución sesgada\n",
|
| 543 |
+
"Hay comentarios MUY largos (hasta 4421 caracteres)\n",
|
| 544 |
+
"Pero no representan el comportamiento típico\n",
|
| 545 |
+
" \n",
|
| 546 |
+
" -- PALABRAS --\n",
|
| 547 |
+
"Igual patrón: muchos textos cortos, pocos muy largos\n",
|
| 548 |
+
"815 palabras es un outlier fuerte\n",
|
| 549 |
+
"\"\"\")"
|
| 550 |
+
]
|
| 551 |
+
},
|
| 552 |
{
|
| 553 |
"cell_type": "code",
|
| 554 |
"execution_count": 32,
|
|
|
|
| 966 |
"id": "b90fab71",
|
| 967 |
"metadata": {},
|
| 968 |
"source": [
|
| 969 |
+
"## 7. Análisis por VideoId (Por ahora no tiene relevancia)\n",
|
| 970 |
"\n",
|
| 971 |
"¿Algunos vídeos tienen más comentarios de odio? ¿El origen del vídeo sesga el dataset?\n"
|
| 972 |
]
|
notebooks/02_preprocessing_v2.ipynb
CHANGED
|
@@ -656,7 +656,7 @@
|
|
| 656 |
},
|
| 657 |
{
|
| 658 |
"cell_type": "code",
|
| 659 |
-
"execution_count":
|
| 660 |
"metadata": {},
|
| 661 |
"outputs": [
|
| 662 |
{
|
|
@@ -680,7 +680,7 @@
|
|
| 680 |
"source": [
|
| 681 |
"# ── Configuración MLflow ──\n",
|
| 682 |
"MLFLOW_DIR = PROJECT_ROOT / 'mlruns'\n",
|
| 683 |
-
"EXPERIMENT_NAME = '
|
| 684 |
"\n",
|
| 685 |
"mlflow.set_tracking_uri(f\"file:{MLFLOW_DIR}\")\n",
|
| 686 |
"mlflow.set_experiment(EXPERIMENT_NAME)\n",
|
|
@@ -718,7 +718,7 @@
|
|
| 718 |
" run_id = mlflow.active_run().info.run_id\n",
|
| 719 |
" print(f'MLflow run registrado')\n",
|
| 720 |
" print(f' Run ID : {run_id}')\n",
|
| 721 |
-
" print(f' Experimento:
|
| 722 |
" print(f' Ver UI : mlflow ui --backend-store-uri file://{MLFLOW_DIR}')"
|
| 723 |
]
|
| 724 |
},
|
|
|
|
| 656 |
},
|
| 657 |
{
|
| 658 |
"cell_type": "code",
|
| 659 |
+
"execution_count": null,
|
| 660 |
"metadata": {},
|
| 661 |
"outputs": [
|
| 662 |
{
|
|
|
|
| 680 |
"source": [
|
| 681 |
"# ── Configuración MLflow ──\n",
|
| 682 |
"MLFLOW_DIR = PROJECT_ROOT / 'mlruns'\n",
|
| 683 |
+
"EXPERIMENT_NAME = 'Youtube_project_data'\n",
|
| 684 |
"\n",
|
| 685 |
"mlflow.set_tracking_uri(f\"file:{MLFLOW_DIR}\")\n",
|
| 686 |
"mlflow.set_experiment(EXPERIMENT_NAME)\n",
|
|
|
|
| 718 |
" run_id = mlflow.active_run().info.run_id\n",
|
| 719 |
" print(f'MLflow run registrado')\n",
|
| 720 |
" print(f' Run ID : {run_id}')\n",
|
| 721 |
+
" print(f' Experimento: Youtube_project_data')\n",
|
| 722 |
" print(f' Ver UI : mlflow ui --backend-store-uri file://{MLFLOW_DIR}')"
|
| 723 |
]
|
| 724 |
},
|
notebooks/03_vectorization_v2.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
reports/v2/08_tfidf_top_features.png
ADDED
|
Git LFS Details
|