RasaBh committed on
Commit
137c887
·
1 Parent(s): 98360d1

Improved regression task

Browse files
A4/A4_Classification.ipynb CHANGED
@@ -61,8 +61,7 @@
61
  ")\n",
62
  "from sklearn.pipeline import Pipeline\n",
63
  "from sklearn.ensemble import (\n",
64
- " RandomForestClassifier,\n",
65
- " GradientBoostingClassifier\n",
66
  ")\n",
67
  "from imblearn.over_sampling import SMOTE\n",
68
  "import pickle\n",
@@ -543,7 +542,7 @@
543
  "champion_region_f1 = results_region_df.iloc[0]['F1_mean']\n",
544
  "\n",
545
  "print(f\"\\nChampion (Stage 1): {champion_region}\")\n",
546
- "print(f\"F1-Score: {champion_region_f1:.4f} ± {results_region_df.iloc[0]['F1_std']:.4f}\")"
547
  ]
548
  },
549
  {
@@ -611,7 +610,7 @@
611
  "champion_14class_f1 = results_14class_df.iloc[0]['F1_mean']\n",
612
  "\n",
613
  "print(f\"\\nChampion (Single Model): {champion_14class}\")\n",
614
- "print(f\"F1-Score: {champion_14class_f1:.4f} ± {results_14class_df.iloc[0]['F1_std']:.4f}\")"
615
  ]
616
  },
617
  {
@@ -735,15 +734,23 @@
735
  "print(\"Comparison of Pipeline vs single model\")\n",
736
  "\n",
737
  "print(f\"\\nSingle Model ({champion_14class}):\")\n",
738
- "print(f\"F1-Score: {champion_14class_f1:.4f} ± {results_14class_df.iloc[0]['F1_std']:.4f}\")\n",
739
  "\n",
740
  "print(f\"\\nHierarchical Pipeline:\")\n",
741
- "print(f\"F1-Score: {pipeline_f1_scores.mean():.4f} ± {pipeline_f1_scores.std():.4f}\")\n",
742
  "\n",
743
  "improvement = pipeline_f1_scores.mean() - champion_14class_f1\n",
744
  "print(f\"\\nDifference: {improvement:+.4f} ({100*improvement/champion_14class_f1:+.2f}%)\")"
745
  ]
746
  },
 
 
 
 
 
 
 
 
747
  {
748
  "cell_type": "code",
749
  "execution_count": 16,
@@ -763,7 +770,6 @@
763
  }
764
  ],
765
  "source": [
766
- "# Statistical test\n",
767
  "def corrected_resampled_ttest(scores_a, scores_b, n_train, n_test):\n",
768
  " k = len(scores_a)\n",
769
  " differences = scores_a - scores_b\n",
@@ -791,6 +797,14 @@
791
  " print(f\"Result: NOT statistically significant\")\n"
792
  ]
793
  },
 
 
 
 
 
 
 
 
794
  {
795
  "cell_type": "code",
796
  "execution_count": 17,
@@ -809,7 +823,6 @@
809
  }
810
  ],
811
  "source": [
812
- "# Visualization\n",
813
  "fig, ax = plt.subplots(figsize=(8, 6))\n",
814
  "\n",
815
  "models = [f'Single Model\\n({champion_14class})', 'Hierarchical\\nPipeline']\n",
@@ -828,7 +841,7 @@
828
  "for i, (bar, mean, std) in enumerate(zip(bars, means, stds)):\n",
829
  " height = bar.get_height()\n",
830
  " ax.text(bar.get_x() + bar.get_width()/2., height + std + 0.02,\n",
831
- " f'{mean:.4f}±{std:.4f}',\n",
832
  " ha='center', va='bottom', fontsize=11, fontweight='bold')\n",
833
  "\n",
834
  "plt.tight_layout()\n",
@@ -938,6 +951,14 @@
938
  " }"
939
  ]
940
  },
 
 
 
 
 
 
 
 
941
  {
942
  "cell_type": "code",
943
  "execution_count": 19,
@@ -974,11 +995,18 @@
974
  }
975
  ],
976
  "source": [
977
- "# Classification report\n",
978
  "print(\"\\nClassification Report:\")\n",
979
  "print(classification_report(y_test_14, y_pred_test, zero_division=0))"
980
  ]
981
  },
 
 
 
 
 
 
 
 
982
  {
983
  "cell_type": "code",
984
  "execution_count": 20,
@@ -1007,7 +1035,6 @@
1007
  }
1008
  ],
1009
  "source": [
1010
- "# Class imbalance analysis - Using Random forest\n",
1011
  "\n",
1012
  "# Train Random Forest\n",
1013
  "rf_model = RandomForestClassifier(\n",
@@ -1046,6 +1073,14 @@
1046
  "print(f\"Improvement vs LDA: {improvement_vs_lda:+.2f}%\")"
1047
  ]
1048
  },
 
 
 
 
 
 
 
 
1049
  {
1050
  "cell_type": "code",
1051
  "execution_count": 21,
@@ -1079,7 +1114,7 @@
1079
  }
1080
  ],
1081
  "source": [
1082
- "# Final comparison\n",
1083
  "\n",
1084
  "# Collect all results\n",
1085
  "results_all = [\n",
@@ -1181,7 +1216,6 @@
1181
  "source": [
1182
  "# Detailed analysis of the champion model\n",
1183
  "\n",
1184
- "# Determine which model won\n",
1185
  "if best_approach['Approach'] == 'Random Forest':\n",
1186
  " best_model = rf_model\n",
1187
  " best_predictions = y_pred_rf\n",
 
61
  ")\n",
62
  "from sklearn.pipeline import Pipeline\n",
63
  "from sklearn.ensemble import (\n",
64
+ " RandomForestClassifier\n",
 
65
  ")\n",
66
  "from imblearn.over_sampling import SMOTE\n",
67
  "import pickle\n",
 
542
  "champion_region_f1 = results_region_df.iloc[0]['F1_mean']\n",
543
  "\n",
544
  "print(f\"\\nChampion (Stage 1): {champion_region}\")\n",
545
+ "print(f\"F1-Score: {champion_region_f1:.4f} +/- {results_region_df.iloc[0]['F1_std']:.4f}\")"
546
  ]
547
  },
548
  {
 
610
  "champion_14class_f1 = results_14class_df.iloc[0]['F1_mean']\n",
611
  "\n",
612
  "print(f\"\\nChampion (Single Model): {champion_14class}\")\n",
613
+ "print(f\"F1-Score: {champion_14class_f1:.4f} +/- {results_14class_df.iloc[0]['F1_std']:.4f}\")"
614
  ]
615
  },
616
  {
 
734
  "print(\"Comparison of Pipeline vs single model\")\n",
735
  "\n",
736
  "print(f\"\\nSingle Model ({champion_14class}):\")\n",
737
+ "print(f\"F1-Score: {champion_14class_f1:.4f} +/- {results_14class_df.iloc[0]['F1_std']:.4f}\")\n",
738
  "\n",
739
  "print(f\"\\nHierarchical Pipeline:\")\n",
740
+ "print(f\"F1-Score: {pipeline_f1_scores.mean():.4f} +/- {pipeline_f1_scores.std():.4f}\")\n",
741
  "\n",
742
  "improvement = pipeline_f1_scores.mean() - champion_14class_f1\n",
743
  "print(f\"\\nDifference: {improvement:+.4f} ({100*improvement/champion_14class_f1:+.2f}%)\")"
744
  ]
745
  },
746
+ {
747
+ "cell_type": "markdown",
748
+ "id": "d284b4e6-8596-4806-847b-84db5f41d1dd",
749
+ "metadata": {},
750
+ "source": [
751
+ "### Statistical test"
752
+ ]
753
+ },
754
  {
755
  "cell_type": "code",
756
  "execution_count": 16,
 
770
  }
771
  ],
772
  "source": [
 
773
  "def corrected_resampled_ttest(scores_a, scores_b, n_train, n_test):\n",
774
  " k = len(scores_a)\n",
775
  " differences = scores_a - scores_b\n",
 
797
  " print(f\"Result: NOT statistically significant\")\n"
798
  ]
799
  },
800
+ {
801
+ "cell_type": "markdown",
802
+ "id": "281ae294-b939-41af-bdda-030e77aefe08",
803
+ "metadata": {},
804
+ "source": [
805
+ "### Visualization"
806
+ ]
807
+ },
808
  {
809
  "cell_type": "code",
810
  "execution_count": 17,
 
823
  }
824
  ],
825
  "source": [
 
826
  "fig, ax = plt.subplots(figsize=(8, 6))\n",
827
  "\n",
828
  "models = [f'Single Model\\n({champion_14class})', 'Hierarchical\\nPipeline']\n",
 
841
  "for i, (bar, mean, std) in enumerate(zip(bars, means, stds)):\n",
842
  " height = bar.get_height()\n",
843
  " ax.text(bar.get_x() + bar.get_width()/2., height + std + 0.02,\n",
844
+ " f'{mean:.4f}+/-{std:.4f}',\n",
845
  " ha='center', va='bottom', fontsize=11, fontweight='bold')\n",
846
  "\n",
847
  "plt.tight_layout()\n",
 
951
  " }"
952
  ]
953
  },
954
+ {
955
+ "cell_type": "markdown",
956
+ "id": "898490a6-59e2-4c42-bc98-943d9ac1c6c0",
957
+ "metadata": {},
958
+ "source": [
959
+ "### Classification report"
960
+ ]
961
+ },
962
  {
963
  "cell_type": "code",
964
  "execution_count": 19,
 
995
  }
996
  ],
997
  "source": [
 
998
  "print(\"\\nClassification Report:\")\n",
999
  "print(classification_report(y_test_14, y_pred_test, zero_division=0))"
1000
  ]
1001
  },
1002
+ {
1003
+ "cell_type": "markdown",
1004
+ "id": "d8004001-9372-4c40-85e7-179f80ce6a7a",
1005
+ "metadata": {},
1006
+ "source": [
1007
+ "### Class imbalance analysis - Using Random forest\n"
1008
+ ]
1009
+ },
1010
  {
1011
  "cell_type": "code",
1012
  "execution_count": 20,
 
1035
  }
1036
  ],
1037
  "source": [
 
1038
  "\n",
1039
  "# Train Random Forest\n",
1040
  "rf_model = RandomForestClassifier(\n",
 
1073
  "print(f\"Improvement vs LDA: {improvement_vs_lda:+.2f}%\")"
1074
  ]
1075
  },
1076
+ {
1077
+ "cell_type": "markdown",
1078
+ "id": "0b16e409-7698-4f98-9c2e-20eec70213c4",
1079
+ "metadata": {},
1080
+ "source": [
1081
+ "### Final comparison"
1082
+ ]
1083
+ },
1084
  {
1085
  "cell_type": "code",
1086
  "execution_count": 21,
 
1114
  }
1115
  ],
1116
  "source": [
1117
+ "\n",
1118
  "\n",
1119
  "# Collect all results\n",
1120
  "results_all = [\n",
 
1216
  "source": [
1217
  "# Detailed analysis of the champion model\n",
1218
  "\n",
 
1219
  "if best_approach['Approach'] == 'Random Forest':\n",
1220
  " best_model = rf_model\n",
1221
  " best_predictions = y_pred_rf\n",
A4/A4_Regression.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
A4/models/aimoscores_improved.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d63069b096d996ba193bb6fa6d1a73a01351617c122c16b9ed34f0c64c113df
3
+ size 9630522