JonnyBP committed on
Commit
79ecfdc
·
1 Parent(s): df4616d

feat: add vectorization notebook. #4

Browse files
notebooks/01_eda_v2.ipynb CHANGED
@@ -515,6 +515,40 @@
515
  "print(df['word_count'].describe().round(1))\n"
516
  ]
517
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
518
  {
519
  "cell_type": "code",
520
  "execution_count": 32,
@@ -932,7 +966,7 @@
932
  "id": "b90fab71",
933
  "metadata": {},
934
  "source": [
935
- "## 7. Análisis por VideoId\n",
936
  "\n",
937
  "¿Algunos vídeos tienen más comentarios de odio? ¿El origen del vídeo sesga el dataset?\n"
938
  ]
 
515
  "print(df['word_count'].describe().round(1))\n"
516
  ]
517
  },
518
+ {
519
+ "cell_type": "code",
520
+ "execution_count": 2,
521
+ "id": "c239ed0f",
522
+ "metadata": {},
523
+ "outputs": [
524
+ {
525
+ "name": "stdout",
526
+ "output_type": "stream",
527
+ "text": [
528
+ " -- CARACTERES --\n",
529
+ "La media > mediana = distribución sesgada\n",
530
+ "Hay comentarios MUY largos (hasta 4421 caracteres)\n",
531
+ "Pero no representan el comportamiento típico\n",
532
+ " \n",
533
+ " -- PALABRAS --\n",
534
+ "Igual patrón: muchos textos cortos, pocos muy largos\n",
535
+ "815 palabras es un outlier fuerte\n",
536
+ "\n"
537
+ ]
538
+ }
539
+ ],
540
+ "source": [
541
+ "print(\"\"\" -- CARACTERES --\n",
542
+ "La media > mediana = distribución sesgada\n",
543
+ "Hay comentarios MUY largos (hasta 4421 caracteres)\n",
544
+ "Pero no representan el comportamiento típico\n",
545
+ " \n",
546
+ " -- PALABRAS --\n",
547
+ "Igual patrón: muchos textos cortos, pocos muy largos\n",
548
+ "815 palabras es un outlier fuerte\n",
549
+ "\"\"\")"
550
+ ]
551
+ },
552
  {
553
  "cell_type": "code",
554
  "execution_count": 32,
 
966
  "id": "b90fab71",
967
  "metadata": {},
968
  "source": [
969
+ "## 7. Análisis por VideoId (Por ahora no tiene relevancia)\n",
970
  "\n",
971
  "¿Algunos vídeos tienen más comentarios de odio? ¿El origen del vídeo sesga el dataset?\n"
972
  ]
notebooks/02_preprocessing_v2.ipynb CHANGED
@@ -656,7 +656,7 @@
656
  },
657
  {
658
  "cell_type": "code",
659
- "execution_count": 13,
660
  "metadata": {},
661
  "outputs": [
662
  {
@@ -680,7 +680,7 @@
680
  "source": [
681
  "# ── Configuración MLflow ──\n",
682
  "MLFLOW_DIR = PROJECT_ROOT / 'mlruns'\n",
683
- "EXPERIMENT_NAME = 'Youtube_project_experiment'\n",
684
  "\n",
685
  "mlflow.set_tracking_uri(f\"file:{MLFLOW_DIR}\")\n",
686
  "mlflow.set_experiment(EXPERIMENT_NAME)\n",
@@ -718,7 +718,7 @@
718
  " run_id = mlflow.active_run().info.run_id\n",
719
  " print(f'MLflow run registrado')\n",
720
  " print(f' Run ID : {run_id}')\n",
721
- " print(f' Experimento: Youtube_project_experiments')\n",
722
  " print(f' Ver UI : mlflow ui --backend-store-uri file://{MLFLOW_DIR}')"
723
  ]
724
  },
 
656
  },
657
  {
658
  "cell_type": "code",
659
+ "execution_count": null,
660
  "metadata": {},
661
  "outputs": [
662
  {
 
680
  "source": [
681
  "# ── Configuración MLflow ──\n",
682
  "MLFLOW_DIR = PROJECT_ROOT / 'mlruns'\n",
683
+ "EXPERIMENT_NAME = 'Youtube_project_data'\n",
684
  "\n",
685
  "mlflow.set_tracking_uri(f\"file:{MLFLOW_DIR}\")\n",
686
  "mlflow.set_experiment(EXPERIMENT_NAME)\n",
 
718
  " run_id = mlflow.active_run().info.run_id\n",
719
  " print(f'MLflow run registrado')\n",
720
  " print(f' Run ID : {run_id}')\n",
721
+ " print(f' Experimento: Youtube_project_data')\n",
722
  " print(f' Ver UI : mlflow ui --backend-store-uri file://{MLFLOW_DIR}')"
723
  ]
724
  },
notebooks/03_vectorization_v2.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
reports/v2/08_tfidf_top_features.png ADDED

Git LFS Details

  • SHA256: fee0436db93240ad29f5f6d513e1fd54c79f1586a5245433c88634ad7bca5927
  • Pointer size: 130 Bytes
  • Size of remote file: 92.8 kB