Spaces:
Sleeping
Sleeping
Improved regression task
Browse files- A4/A4_Classification.ipynb +47 -13
- A4/A4_Regression.ipynb +0 -0
- A4/models/aimoscores_improved.pkl +3 -0
A4/A4_Classification.ipynb
CHANGED
|
@@ -61,8 +61,7 @@
|
|
| 61 |
")\n",
|
| 62 |
"from sklearn.pipeline import Pipeline\n",
|
| 63 |
"from sklearn.ensemble import (\n",
|
| 64 |
-
" RandomForestClassifier
|
| 65 |
-
" GradientBoostingClassifier\n",
|
| 66 |
")\n",
|
| 67 |
"from imblearn.over_sampling import SMOTE\n",
|
| 68 |
"import pickle\n",
|
|
@@ -543,7 +542,7 @@
|
|
| 543 |
"champion_region_f1 = results_region_df.iloc[0]['F1_mean']\n",
|
| 544 |
"\n",
|
| 545 |
"print(f\"\\nChampion (Stage 1): {champion_region}\")\n",
|
| 546 |
-
"print(f\"F1-Score: {champion_region_f1:.4f}
|
| 547 |
]
|
| 548 |
},
|
| 549 |
{
|
|
@@ -611,7 +610,7 @@
|
|
| 611 |
"champion_14class_f1 = results_14class_df.iloc[0]['F1_mean']\n",
|
| 612 |
"\n",
|
| 613 |
"print(f\"\\nChampion (Single Model): {champion_14class}\")\n",
|
| 614 |
-
"print(f\"F1-Score: {champion_14class_f1:.4f}
|
| 615 |
]
|
| 616 |
},
|
| 617 |
{
|
|
@@ -735,15 +734,23 @@
|
|
| 735 |
"print(\"Comparison of Pipeline vs single model\")\n",
|
| 736 |
"\n",
|
| 737 |
"print(f\"\\nSingle Model ({champion_14class}):\")\n",
|
| 738 |
-
"print(f\"F1-Score: {champion_14class_f1:.4f}
|
| 739 |
"\n",
|
| 740 |
"print(f\"\\nHierarchical Pipeline:\")\n",
|
| 741 |
-
"print(f\"F1-Score: {pipeline_f1_scores.mean():.4f}
|
| 742 |
"\n",
|
| 743 |
"improvement = pipeline_f1_scores.mean() - champion_14class_f1\n",
|
| 744 |
"print(f\"\\nDifference: {improvement:+.4f} ({100*improvement/champion_14class_f1:+.2f}%)\")"
|
| 745 |
]
|
| 746 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 747 |
{
|
| 748 |
"cell_type": "code",
|
| 749 |
"execution_count": 16,
|
|
@@ -763,7 +770,6 @@
|
|
| 763 |
}
|
| 764 |
],
|
| 765 |
"source": [
|
| 766 |
-
"# Statistical test\n",
|
| 767 |
"def corrected_resampled_ttest(scores_a, scores_b, n_train, n_test):\n",
|
| 768 |
" k = len(scores_a)\n",
|
| 769 |
" differences = scores_a - scores_b\n",
|
|
@@ -791,6 +797,14 @@
|
|
| 791 |
" print(f\"Result: NOT statistically significant\")\n"
|
| 792 |
]
|
| 793 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 794 |
{
|
| 795 |
"cell_type": "code",
|
| 796 |
"execution_count": 17,
|
|
@@ -809,7 +823,6 @@
|
|
| 809 |
}
|
| 810 |
],
|
| 811 |
"source": [
|
| 812 |
-
"# Visualization\n",
|
| 813 |
"fig, ax = plt.subplots(figsize=(8, 6))\n",
|
| 814 |
"\n",
|
| 815 |
"models = [f'Single Model\\n({champion_14class})', 'Hierarchical\\nPipeline']\n",
|
|
@@ -828,7 +841,7 @@
|
|
| 828 |
"for i, (bar, mean, std) in enumerate(zip(bars, means, stds)):\n",
|
| 829 |
" height = bar.get_height()\n",
|
| 830 |
" ax.text(bar.get_x() + bar.get_width()/2., height + std + 0.02,\n",
|
| 831 |
-
" f'{mean:.4f}
|
| 832 |
" ha='center', va='bottom', fontsize=11, fontweight='bold')\n",
|
| 833 |
"\n",
|
| 834 |
"plt.tight_layout()\n",
|
|
@@ -938,6 +951,14 @@
|
|
| 938 |
" }"
|
| 939 |
]
|
| 940 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 941 |
{
|
| 942 |
"cell_type": "code",
|
| 943 |
"execution_count": 19,
|
|
@@ -974,11 +995,18 @@
|
|
| 974 |
}
|
| 975 |
],
|
| 976 |
"source": [
|
| 977 |
-
"# Classification report\n",
|
| 978 |
"print(\"\\nClassification Report:\")\n",
|
| 979 |
"print(classification_report(y_test_14, y_pred_test, zero_division=0))"
|
| 980 |
]
|
| 981 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 982 |
{
|
| 983 |
"cell_type": "code",
|
| 984 |
"execution_count": 20,
|
|
@@ -1007,7 +1035,6 @@
|
|
| 1007 |
}
|
| 1008 |
],
|
| 1009 |
"source": [
|
| 1010 |
-
"# Class imbalance analysis - Using Random forest\n",
|
| 1011 |
"\n",
|
| 1012 |
"# Train Random Forest\n",
|
| 1013 |
"rf_model = RandomForestClassifier(\n",
|
|
@@ -1046,6 +1073,14 @@
|
|
| 1046 |
"print(f\"Improvement vs LDA: {improvement_vs_lda:+.2f}%\")"
|
| 1047 |
]
|
| 1048 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1049 |
{
|
| 1050 |
"cell_type": "code",
|
| 1051 |
"execution_count": 21,
|
|
@@ -1079,7 +1114,7 @@
|
|
| 1079 |
}
|
| 1080 |
],
|
| 1081 |
"source": [
|
| 1082 |
-
"
|
| 1083 |
"\n",
|
| 1084 |
"# Collect all results\n",
|
| 1085 |
"results_all = [\n",
|
|
@@ -1181,7 +1216,6 @@
|
|
| 1181 |
"source": [
|
| 1182 |
"# Detailed analysis of the champion model\n",
|
| 1183 |
"\n",
|
| 1184 |
-
"# Determine which model won\n",
|
| 1185 |
"if best_approach['Approach'] == 'Random Forest':\n",
|
| 1186 |
" best_model = rf_model\n",
|
| 1187 |
" best_predictions = y_pred_rf\n",
|
|
|
|
| 61 |
")\n",
|
| 62 |
"from sklearn.pipeline import Pipeline\n",
|
| 63 |
"from sklearn.ensemble import (\n",
|
| 64 |
+
" RandomForestClassifier\n",
|
|
|
|
| 65 |
")\n",
|
| 66 |
"from imblearn.over_sampling import SMOTE\n",
|
| 67 |
"import pickle\n",
|
|
|
|
| 542 |
"champion_region_f1 = results_region_df.iloc[0]['F1_mean']\n",
|
| 543 |
"\n",
|
| 544 |
"print(f\"\\nChampion (Stage 1): {champion_region}\")\n",
|
| 545 |
+
"print(f\"F1-Score: {champion_region_f1:.4f} +/- {results_region_df.iloc[0]['F1_std']:.4f}\")"
|
| 546 |
]
|
| 547 |
},
|
| 548 |
{
|
|
|
|
| 610 |
"champion_14class_f1 = results_14class_df.iloc[0]['F1_mean']\n",
|
| 611 |
"\n",
|
| 612 |
"print(f\"\\nChampion (Single Model): {champion_14class}\")\n",
|
| 613 |
+
"print(f\"F1-Score: {champion_14class_f1:.4f} +/- {results_14class_df.iloc[0]['F1_std']:.4f}\")"
|
| 614 |
]
|
| 615 |
},
|
| 616 |
{
|
|
|
|
| 734 |
"print(\"Comparison of Pipeline vs single model\")\n",
|
| 735 |
"\n",
|
| 736 |
"print(f\"\\nSingle Model ({champion_14class}):\")\n",
|
| 737 |
+
"print(f\"F1-Score: {champion_14class_f1:.4f} +/- {results_14class_df.iloc[0]['F1_std']:.4f}\")\n",
|
| 738 |
"\n",
|
| 739 |
"print(f\"\\nHierarchical Pipeline:\")\n",
|
| 740 |
+
"print(f\"F1-Score: {pipeline_f1_scores.mean():.4f} +/- {pipeline_f1_scores.std():.4f}\")\n",
|
| 741 |
"\n",
|
| 742 |
"improvement = pipeline_f1_scores.mean() - champion_14class_f1\n",
|
| 743 |
"print(f\"\\nDifference: {improvement:+.4f} ({100*improvement/champion_14class_f1:+.2f}%)\")"
|
| 744 |
]
|
| 745 |
},
|
| 746 |
+
{
|
| 747 |
+
"cell_type": "markdown",
|
| 748 |
+
"id": "d284b4e6-8596-4806-847b-84db5f41d1dd",
|
| 749 |
+
"metadata": {},
|
| 750 |
+
"source": [
|
| 751 |
+
"### Statistical test"
|
| 752 |
+
]
|
| 753 |
+
},
|
| 754 |
{
|
| 755 |
"cell_type": "code",
|
| 756 |
"execution_count": 16,
|
|
|
|
| 770 |
}
|
| 771 |
],
|
| 772 |
"source": [
|
|
|
|
| 773 |
"def corrected_resampled_ttest(scores_a, scores_b, n_train, n_test):\n",
|
| 774 |
" k = len(scores_a)\n",
|
| 775 |
" differences = scores_a - scores_b\n",
|
|
|
|
| 797 |
" print(f\"Result: NOT statistically significant\")\n"
|
| 798 |
]
|
| 799 |
},
|
| 800 |
+
{
|
| 801 |
+
"cell_type": "markdown",
|
| 802 |
+
"id": "281ae294-b939-41af-bdda-030e77aefe08",
|
| 803 |
+
"metadata": {},
|
| 804 |
+
"source": [
|
| 805 |
+
"### Visualization"
|
| 806 |
+
]
|
| 807 |
+
},
|
| 808 |
{
|
| 809 |
"cell_type": "code",
|
| 810 |
"execution_count": 17,
|
|
|
|
| 823 |
}
|
| 824 |
],
|
| 825 |
"source": [
|
|
|
|
| 826 |
"fig, ax = plt.subplots(figsize=(8, 6))\n",
|
| 827 |
"\n",
|
| 828 |
"models = [f'Single Model\\n({champion_14class})', 'Hierarchical\\nPipeline']\n",
|
|
|
|
| 841 |
"for i, (bar, mean, std) in enumerate(zip(bars, means, stds)):\n",
|
| 842 |
" height = bar.get_height()\n",
|
| 843 |
" ax.text(bar.get_x() + bar.get_width()/2., height + std + 0.02,\n",
|
| 844 |
+
" f'{mean:.4f}+/-{std:.4f}',\n",
|
| 845 |
" ha='center', va='bottom', fontsize=11, fontweight='bold')\n",
|
| 846 |
"\n",
|
| 847 |
"plt.tight_layout()\n",
|
|
|
|
| 951 |
" }"
|
| 952 |
]
|
| 953 |
},
|
| 954 |
+
{
|
| 955 |
+
"cell_type": "markdown",
|
| 956 |
+
"id": "898490a6-59e2-4c42-bc98-943d9ac1c6c0",
|
| 957 |
+
"metadata": {},
|
| 958 |
+
"source": [
|
| 959 |
+
"### Classification report"
|
| 960 |
+
]
|
| 961 |
+
},
|
| 962 |
{
|
| 963 |
"cell_type": "code",
|
| 964 |
"execution_count": 19,
|
|
|
|
| 995 |
}
|
| 996 |
],
|
| 997 |
"source": [
|
|
|
|
| 998 |
"print(\"\\nClassification Report:\")\n",
|
| 999 |
"print(classification_report(y_test_14, y_pred_test, zero_division=0))"
|
| 1000 |
]
|
| 1001 |
},
|
| 1002 |
+
{
|
| 1003 |
+
"cell_type": "markdown",
|
| 1004 |
+
"id": "d8004001-9372-4c40-85e7-179f80ce6a7a",
|
| 1005 |
+
"metadata": {},
|
| 1006 |
+
"source": [
|
| 1007 |
+
"### Class imbalance analysis - Using Random forest\n"
|
| 1008 |
+
]
|
| 1009 |
+
},
|
| 1010 |
{
|
| 1011 |
"cell_type": "code",
|
| 1012 |
"execution_count": 20,
|
|
|
|
| 1035 |
}
|
| 1036 |
],
|
| 1037 |
"source": [
|
|
|
|
| 1038 |
"\n",
|
| 1039 |
"# Train Random Forest\n",
|
| 1040 |
"rf_model = RandomForestClassifier(\n",
|
|
|
|
| 1073 |
"print(f\"Improvement vs LDA: {improvement_vs_lda:+.2f}%\")"
|
| 1074 |
]
|
| 1075 |
},
|
| 1076 |
+
{
|
| 1077 |
+
"cell_type": "markdown",
|
| 1078 |
+
"id": "0b16e409-7698-4f98-9c2e-20eec70213c4",
|
| 1079 |
+
"metadata": {},
|
| 1080 |
+
"source": [
|
| 1081 |
+
"### Final comparison"
|
| 1082 |
+
]
|
| 1083 |
+
},
|
| 1084 |
{
|
| 1085 |
"cell_type": "code",
|
| 1086 |
"execution_count": 21,
|
|
|
|
| 1114 |
}
|
| 1115 |
],
|
| 1116 |
"source": [
|
| 1117 |
+
"\n",
|
| 1118 |
"\n",
|
| 1119 |
"# Collect all results\n",
|
| 1120 |
"results_all = [\n",
|
|
|
|
| 1216 |
"source": [
|
| 1217 |
"# Detailed analysis of the champion model\n",
|
| 1218 |
"\n",
|
|
|
|
| 1219 |
"if best_approach['Approach'] == 'Random Forest':\n",
|
| 1220 |
" best_model = rf_model\n",
|
| 1221 |
" best_predictions = y_pred_rf\n",
|
A4/A4_Regression.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
A4/models/aimoscores_improved.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7d63069b096d996ba193bb6fa6d1a73a01351617c122c16b9ed34f0c64c113df
|
| 3 |
+
size 9630522
|