AmrGaberr committed on
Commit
de7c20a
·
verified ·
1 Parent(s): 0d8a6c0

Upload 3 files

Browse files
deployment/model/CalibrateLikelihood.ipynb ADDED
@@ -0,0 +1,389 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": []
7
+ },
8
+ "kernelspec": {
9
+ "name": "python3",
10
+ "display_name": "Python 3"
11
+ },
12
+ "language_info": {
13
+ "name": "python"
14
+ }
15
+ },
16
+ "cells": [
17
+ {
18
+ "cell_type": "markdown",
19
+ "source": [
20
+ "# Calibrate Injury Likelihood Mapping\n",
21
+ "\n",
22
+ "This notebook calibrates the injury likelihood mapping for the injury risk prediction system. We use the training dataset (`Refined_Sports_Injury_Dataset.csv`) to map the model's predicted probabilities to true injury probabilities.\n",
23
+ "\n",
24
+ "## Objectives\n",
25
+ "- Load the dataset and trained models.\n",
26
+ "- Preprocess the data consistently with the training pipeline.\n",
27
+ "- Use the models to predict probabilities on a calibration set.\n",
28
+ "- Fit a logistic regression model to map predicted probabilities to true injury probabilities.\n",
29
+ "- Save the calibration model for use in `predict.py`."
30
+ ],
31
+ "metadata": {
32
+ "id": "v4eUbEr3u9Rm"
33
+ }
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "source": [
38
+ "# Import libraries\n",
39
+ "import pandas as pd\n",
40
+ "import numpy as np\n",
41
+ "import joblib\n",
42
+ "from sklearn.model_selection import train_test_split\n",
43
+ "from sklearn.linear_model import LogisticRegression\n",
44
+ "from sklearn.metrics import brier_score_loss\n",
45
+ "import matplotlib.pyplot as plt\n",
46
+ "\n",
47
+ "# Set random seed\n",
48
+ "np.random.seed(42)"
49
+ ],
50
+ "metadata": {
51
+ "id": "Y_dhli46u9Ro"
52
+ },
53
+ "execution_count": 1,
54
+ "outputs": []
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "source": [
59
+ "# ---------------------- 1. Load Data and Models ----------------------\n",
60
+ "# Load the dataset\n",
61
+ "data_path = \"/content/Refined_Sports_Injury_Dataset.csv\"\n",
62
+ "try:\n",
63
+ " df = pd.read_csv(data_path)\n",
64
+ "except FileNotFoundError:\n",
65
+ " raise FileNotFoundError(f\"Dataset file not found at {data_path}. Please ensure the file exists.\")\n",
66
+ "print(\"Dataset loaded. Shape:\", df.shape)\n",
67
+ "\n",
68
+ "# Load the trained models with error handling\n",
69
+ "model_dir = \"/content/model\"\n",
70
+ "try:\n",
71
+ " rf_model = joblib.load(f\"{model_dir}/rf_injury_model.pkl\")\n",
72
+ "except Exception as e:\n",
73
+ " raise FileNotFoundError(f\"Failed to load RandomForest model: {str(e)}. Ensure RandomForest.ipynb has been run successfully to generate {model_dir}/rf_injury_model.pkl.\")\n",
74
+ "\n",
75
+ "try:\n",
76
+ " xgb_model = joblib.load(f\"{model_dir}/xgboost_injury_model.pkl\")\n",
77
+ "except Exception as e:\n",
78
+ " raise FileNotFoundError(f\"Failed to load XGBoost model: {str(e)}. Ensure XGBOOST.ipynb has been run successfully to generate {model_dir}/xgboost_injury_model.pkl.\")\n",
79
+ "\n",
80
+ "print(\"Models loaded.\")"
81
+ ],
82
+ "metadata": {
83
+ "colab": {
84
+ "base_uri": "https://localhost:8080/"
85
+ },
86
+ "id": "NPDVk8WLu9Rp",
87
+ "outputId": "303e0a70-70b3-4796-d887-0b845c219e9a"
88
+ },
89
+ "execution_count": 2,
90
+ "outputs": [
91
+ {
92
+ "output_type": "stream",
93
+ "name": "stdout",
94
+ "text": [
95
+ "Dataset loaded. Shape: (10000, 18)\n",
96
+ "Models loaded.\n"
97
+ ]
98
+ }
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "code",
103
+ "source": [
104
+ "# ---------------------- 2. Preprocess Data ----------------------\n",
105
+ "# Encode categorical columns (consistent with training notebooks)\n",
106
+ "gender_mapping = {\"Male\": 0, \"Female\": 1}\n",
107
+ "experience_mapping = {\"Beginner\": 0, \"Intermediate\": 1, \"Advanced\": 2, \"Professional\": 3}\n",
108
+ "injury_type_mapping = {\"None\": 0, \"Sprain\": 1, \"Ligament Tear\": 2, \"Tendonitis\": 3, \"Strain\": 4, \"Fracture\": 5}\n",
109
+ "\n",
110
+ "# Handle NaN values in Previous_Injury_Type\n",
111
+ "df[\"Previous_Injury_Type\"] = df[\"Previous_Injury_Type\"].fillna(\"None\")\n",
112
+ "\n",
113
+ "df[\"Gender\"] = df[\"Gender\"].map(gender_mapping).fillna(0).astype(int)\n",
114
+ "df[\"Sport_Type\"] = df[\"Sport_Type\"].astype(\"category\").cat.codes\n",
115
+ "df[\"Experience_Level\"] = df[\"Experience_Level\"].map(experience_mapping).fillna(0).astype(int)\n",
116
+ "df[\"Previous_Injury_Type\"] = df[\"Previous_Injury_Type\"].map(injury_type_mapping).fillna(0).astype(int)\n",
117
+ "\n",
118
+ "# Replace 0 with 0.1 in Total_Weekly_Training_Hours\n",
119
+ "df[\"Total_Weekly_Training_Hours\"] = df[\"Total_Weekly_Training_Hours\"].replace(0, 0.1)\n",
120
+ "\n",
121
+ "# Create derived features\n",
122
+ "df[\"Intensity_Ratio\"] = df[\"High_Intensity_Training_Hours\"] / df[\"Total_Weekly_Training_Hours\"]\n",
123
+ "df[\"Recovery_Per_Training\"] = df[\"Recovery_Time_Between_Sessions\"] / df[\"Total_Weekly_Training_Hours\"]\n",
124
+ "\n",
125
+ "# Create Injury_Occurred column probabilistically based on Injury_Risk_Level\n",
126
+ "print(\"Injury_Risk_Level distribution:\\n\", df[\"Injury_Risk_Level\"].value_counts())\n",
127
+ "\n",
128
+ "# Define probabilities of injury occurrence based on risk level\n",
129
+ "injury_probabilities = {\n",
130
+ " \"High\": 0.95, # 95% chance of injury\n",
131
+ " \"Medium\": 0.5, # 50% chance of injury\n",
132
+ " \"Low\": 0.05 # 5% chance of injury\n",
133
+ "}\n",
134
+ "\n",
135
+ "# Generate Injury_Occurred using random sampling based on Injury_Risk_Level\n",
136
+ "df[\"Injury_Occurred\"] = df[\"Injury_Risk_Level\"].apply(\n",
137
+ " lambda x: np.random.binomial(1, injury_probabilities[x])\n",
138
+ ")\n",
139
+ "\n",
140
+ "# Check the distribution of Injury_Occurred\n",
141
+ "print(\"Injury_Occurred distribution (full dataset):\\n\", df[\"Injury_Occurred\"].value_counts())\n",
142
+ "\n",
143
+ "# Ensure both classes are present\n",
144
+ "if len(df[\"Injury_Occurred\"].unique()) < 2:\n",
145
+ " raise ValueError(\"Injury_Occurred contains only one class after probabilistic assignment. Adjust probabilities or dataset.\")\n",
146
+ "\n",
147
+ "# Define features\n",
148
+ "features = [\n",
149
+ " \"Age\", \"Gender\", \"Sport_Type\", \"Experience_Level\", \"Flexibility_Score\",\n",
150
+ " \"Total_Weekly_Training_Hours\", \"High_Intensity_Training_Hours\", \"Strength_Training_Frequency\",\n",
151
+ " \"Recovery_Time_Between_Sessions\", \"Training_Load_Score\", \"Sprint_Speed\", \"Endurance_Score\",\n",
152
+ " \"Agility_Score\", \"Fatigue_Level\", \"Previous_Injury_Count\", \"Previous_Injury_Type\",\n",
153
+ " \"Intensity_Ratio\", \"Recovery_Per_Training\"\n",
154
+ "]\n",
155
+ "\n",
156
+ "# Prepare features and target\n",
157
+ "X = df[features]\n",
158
+ "y_outcome = df[\"Injury_Occurred\"]\n",
159
+ "\n",
160
+ "print(\"Features prepared:\", features)"
161
+ ],
162
+ "metadata": {
163
+ "colab": {
164
+ "base_uri": "https://localhost:8080/"
165
+ },
166
+ "id": "rEUcpksxu9Rq",
167
+ "outputId": "70eee7a9-b2f5-4d3c-a668-504cd0c9dd81"
168
+ },
169
+ "execution_count": 3,
170
+ "outputs": [
171
+ {
172
+ "output_type": "stream",
173
+ "name": "stdout",
174
+ "text": [
175
+ "Injury_Risk_Level distribution:\n",
176
+ " Injury_Risk_Level\n",
177
+ "Medium 6016\n",
178
+ "Low 2827\n",
179
+ "High 1157\n",
180
+ "Name: count, dtype: int64\n",
181
+ "Injury_Occurred distribution (full dataset):\n",
182
+ " Injury_Occurred\n",
183
+ "0 5801\n",
184
+ "1 4199\n",
185
+ "Name: count, dtype: int64\n",
186
+ "Features prepared: ['Age', 'Gender', 'Sport_Type', 'Experience_Level', 'Flexibility_Score', 'Total_Weekly_Training_Hours', 'High_Intensity_Training_Hours', 'Strength_Training_Frequency', 'Recovery_Time_Between_Sessions', 'Training_Load_Score', 'Sprint_Speed', 'Endurance_Score', 'Agility_Score', 'Fatigue_Level', 'Previous_Injury_Count', 'Previous_Injury_Type', 'Intensity_Ratio', 'Recovery_Per_Training']\n"
187
+ ]
188
+ }
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "source": [
194
+ "# ---------------------- 3. Split Data for Calibration ----------------------\n",
195
+ "# Split data into training and calibration sets\n",
196
+ "X_train, X_calib, y_train_outcome, y_calib_outcome = train_test_split(\n",
197
+ " X, y_outcome, test_size=0.2, stratify=y_outcome, random_state=42\n",
198
+ ")\n",
199
+ "\n",
200
+ "print(\"Calibration set size:\", X_calib.shape)\n",
201
+ "print(\"Calibration Injury_Occurred distribution:\\n\", y_calib_outcome.value_counts())\n",
202
+ "\n",
203
+ "# Check if y_calib_outcome has both classes\n",
204
+ "if len(y_calib_outcome.unique()) < 2:\n",
205
+ " raise ValueError(\"Calibration set contains only one class in Injury_Occurred. Cannot proceed with calibration.\")"
206
+ ],
207
+ "metadata": {
208
+ "colab": {
209
+ "base_uri": "https://localhost:8080/"
210
+ },
211
+ "id": "kbGVsNq4u9Rr",
212
+ "outputId": "732a5e13-f52b-4001-ee03-71a1b929fac7"
213
+ },
214
+ "execution_count": 4,
215
+ "outputs": [
216
+ {
217
+ "output_type": "stream",
218
+ "name": "stdout",
219
+ "text": [
220
+ "Calibration set size: (2000, 18)\n",
221
+ "Calibration Injury_Occurred distribution:\n",
222
+ " Injury_Occurred\n",
223
+ "0 1160\n",
224
+ "1 840\n",
225
+ "Name: count, dtype: int64\n"
226
+ ]
227
+ }
228
+ ]
229
+ },
230
+ {
231
+ "cell_type": "code",
232
+ "source": [
233
+ "# ---------------------- 4. Predict Probabilities with Ensemble ----------------------\n",
234
+ "# Use the ensemble (average of RandomForest and XGBoost) to predict probabilities\n",
235
+ "rf_probs = rf_model.predict_proba(X_calib)\n",
236
+ "xgb_probs = xgb_model.predict_proba(X_calib)\n",
237
+ "\n",
238
+ "# Average the probabilities (same as in predict.py)\n",
239
+ "avg_probs = (rf_probs + xgb_probs) / 2\n",
240
+ "\n",
241
+ "# Get the predicted class and confidence\n",
242
+ "predicted_classes = np.argmax(avg_probs, axis=1)\n",
243
+ "confidences = np.max(avg_probs, axis=1)\n",
244
+ "\n",
245
+ "print(\"Sample of predicted confidences:\\n\", confidences[:5])\n",
246
+ "print(\"Sample of predicted classes:\\n\", predicted_classes[:5])\n",
247
+ "\n",
248
+ "# Inspect the relationship between predicted classes and Injury_Occurred\n",
249
+ "calib_df = pd.DataFrame({\n",
250
+ " \"Predicted_Class\": predicted_classes,\n",
251
+ " \"Injury_Occurred\": y_calib_outcome\n",
252
+ "})\n",
253
+ "print(\"Distribution of Injury_Occurred by Predicted Class:\\n\", calib_df.groupby(\"Predicted_Class\")[\"Injury_Occurred\"].value_counts())"
254
+ ],
255
+ "metadata": {
256
+ "colab": {
257
+ "base_uri": "https://localhost:8080/"
258
+ },
259
+ "id": "-Qyc_ZYOu9Rr",
260
+ "outputId": "10e3592e-c51c-49cb-9ff4-6e157701811e"
261
+ },
262
+ "execution_count": 5,
263
+ "outputs": [
264
+ {
265
+ "output_type": "stream",
266
+ "name": "stdout",
267
+ "text": [
268
+ "Sample of predicted confidences:\n",
269
+ " [0.98814357 0.9881264 0.97794891 0.97335743 0.97383904]\n",
270
+ "Sample of predicted classes:\n",
271
+ " [2 2 2 1 1]\n",
272
+ "Distribution of Injury_Occurred by Predicted Class:\n",
273
+ " Predicted_Class Injury_Occurred\n",
274
+ "0 1 216\n",
275
+ " 0 14\n",
276
+ "1 0 540\n",
277
+ " 1 40\n",
278
+ "2 0 606\n",
279
+ " 1 584\n",
280
+ "Name: count, dtype: int64\n"
281
+ ]
282
+ }
283
+ ]
284
+ },
285
+ {
286
+ "cell_type": "code",
287
+ "source": [
288
+ "# ---------------------- 5. Map Probabilities to Injury Probabilities ----------------------\n",
289
+ "# Use the raw probabilities as features for calibration\n",
290
+ "calib_data = pd.DataFrame({\n",
291
+ " \"prob_high\": avg_probs[:, 0], # Probability of High risk\n",
292
+ " \"prob_low\": avg_probs[:, 1], # Probability of Low risk\n",
293
+ " \"prob_medium\": avg_probs[:, 2] # Probability of Medium risk\n",
294
+ "})\n",
295
+ "\n",
296
+ "# Fit logistic regression with regularization\n",
297
+ "lr_calib = LogisticRegression(max_iter=1000, penalty='l2', C=1.0)\n",
298
+ "lr_calib.fit(calib_data, y_calib_outcome)\n",
299
+ "\n",
300
+ "# Predict calibrated probabilities\n",
301
+ "calibrated_probs = lr_calib.predict_proba(calib_data)[:, 1]\n",
302
+ "\n",
303
+ "# Evaluate calibration using Brier score\n",
304
+ "brier_score = brier_score_loss(y_calib_outcome, calibrated_probs)\n",
305
+ "print(f\"Brier Score (lower is better): {brier_score:.4f}\")\n",
306
+ "\n",
307
+ "# Inspect the mapping\n",
308
+ "calib_results = pd.DataFrame({\n",
309
+ " \"Predicted_Class\": predicted_classes,\n",
310
+ " \"Confidence\": confidences,\n",
311
+ " \"Calibrated_Probability\": calibrated_probs,\n",
312
+ " \"True_Injury_Occurred\": y_calib_outcome\n",
313
+ "})\n",
314
+ "print(\"Average Calibrated Probability by Predicted Class:\\n\", calib_results.groupby(\"Predicted_Class\")[\"Calibrated_Probability\"].mean())\n",
315
+ "\n",
316
+ "# Plot calibration curve\n",
317
+ "plt.figure(figsize=(8, 6))\n",
318
+ "plt.scatter(confidences, calibrated_probs, alpha=0.5)\n",
319
+ "plt.plot([0, 1], [0, 1], 'k--', label=\"Perfectly Calibrated\")\n",
320
+ "plt.xlabel(\"Original Confidence (Ensemble)\")\n",
321
+ "plt.ylabel(\"Calibrated Injury Probability (P(Injury_Occurred=1))\")\n",
322
+ "plt.title(\"Calibration Curve: Confidence vs. Injury Probability\")\n",
323
+ "plt.legend()\n",
324
+ "plt.savefig(f\"{model_dir}/calibration_curve.png\")\n",
325
+ "plt.close()\n",
326
+ "\n",
327
+ "print(f\"Calibration curve saved to {model_dir}/calibration_curve.png\")"
328
+ ],
329
+ "metadata": {
330
+ "colab": {
331
+ "base_uri": "https://localhost:8080/"
332
+ },
333
+ "id": "oITFxyA4u9Rs",
334
+ "outputId": "98d1b529-b387-43ee-a7a8-9a7244701dd3"
335
+ },
336
+ "execution_count": 6,
337
+ "outputs": [
338
+ {
339
+ "output_type": "stream",
340
+ "name": "stdout",
341
+ "text": [
342
+ "Brier Score (lower is better): 0.1734\n",
343
+ "Average Calibrated Probability by Predicted Class:\n",
344
+ " Predicted_Class\n",
345
+ "0 0.923188\n",
346
+ "1 0.075402\n",
347
+ "2 0.490621\n",
348
+ "Name: Calibrated_Probability, dtype: float64\n",
349
+ "Calibration curve saved to /content/model/calibration_curve.png\n"
350
+ ]
351
+ }
352
+ ]
353
+ },
354
+ {
355
+ "cell_type": "code",
356
+ "source": [
357
+ "# ---------------------- 6. Save Calibration Model ----------------------\n",
358
+ "# Save the logistic regression model for use in predict.py\n",
359
+ "import os\n",
360
+ "os.makedirs(model_dir, exist_ok=True)\n",
361
+ "joblib.dump(lr_calib, f\"{model_dir}/likelihood_calibrator.pkl\")\n",
362
+ "print(f\"Calibration model saved to {model_dir}/likelihood_calibrator.pkl\")\n",
363
+ "print(\"Note: You are running this in Google Colab. The file is saved to /content/model/likelihood_calibrator.pkl.\")\n",
364
+ "print(\"Please download it and move it to C:/Users/amrHa/Desktop/final 3/deployment/model/ for deployment.\")\n",
365
+ "print(\"Alternatively, if running locally, update model_dir to 'C:/Users/amrHa/Desktop/final 3/deployment/model'.\")"
366
+ ],
367
+ "metadata": {
368
+ "colab": {
369
+ "base_uri": "https://localhost:8080/"
370
+ },
371
+ "id": "S8IY2YCZu9Rs",
372
+ "outputId": "4585d640-eb1b-49d2-ec60-2727704b2de5"
373
+ },
374
+ "execution_count": 7,
375
+ "outputs": [
376
+ {
377
+ "output_type": "stream",
378
+ "name": "stdout",
379
+ "text": [
380
+ "Calibration model saved to /content/model/likelihood_calibrator.pkl\n",
381
+ "Note: You are running this in Google Colab. The file is saved to /content/model/likelihood_calibrator.pkl.\n",
382
+ "Please download it and move it to C:/Users/amrHa/Desktop/final 3/deployment/model/ for deployment.\n",
383
+ "Alternatively, if running locally, update model_dir to 'C:/Users/amrHa/Desktop/final 3/deployment/model'.\n"
384
+ ]
385
+ }
386
+ ]
387
+ }
388
+ ]
389
+ }
deployment/model/RandomForest.ipynb ADDED
@@ -0,0 +1,637 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "zh2cG0-8e5J3"
7
+ },
8
+ "source": [
9
+ "# RandomForest Model for Injury Risk Prediction\n",
10
+ "\n",
11
+ "This notebook trains a RandomForest model to predict injury risk levels (Low, Medium, High) for athletes based on their training and physical attributes. The pipeline includes data loading, preprocessing, feature engineering, model training with hyperparameter tuning, probability calibration, evaluation, and testing.\n",
12
+ "\n",
13
+ "## Objectives\n",
14
+ "- Load and preprocess the dataset.\n",
15
+ "- Perform feature engineering to create meaningful features.\n",
16
+ "- Train a RandomForest model with optimized hyperparameter tuning.\n",
17
+ "- Calibrate probabilities to ensure reliable confidence scores.\n",
18
+ "- Evaluate the model using appropriate metrics and visualizations.\n",
19
+ "- Save the model and encoder for use in the prediction pipeline.\n",
20
+ "- Test the saved model on the test set or new data."
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 1,
26
+ "metadata": {
27
+ "colab": {
28
+ "base_uri": "https://localhost:8080/"
29
+ },
30
+ "id": "9mkm6h_OfNaL",
31
+ "outputId": "536d54a0-46b7-4e91-fb9e-208fb1aa0983"
32
+ },
33
+ "outputs": [
34
+ {
35
+ "name": "stdout",
36
+ "output_type": "stream",
37
+ "text": [
38
+ "Requirement already satisfied: scikit-learn==1.6.1 in /usr/local/lib/python3.11/dist-packages (1.6.1)\n",
39
+ "Requirement already satisfied: joblib in /usr/local/lib/python3.11/dist-packages (1.4.2)\n",
40
+ "Requirement already satisfied: imbalanced-learn in /usr/local/lib/python3.11/dist-packages (0.12.4)\n",
41
+ "Requirement already satisfied: numpy>=1.19.5 in /usr/local/lib/python3.11/dist-packages (from scikit-learn==1.6.1) (2.0.2)\n",
42
+ "Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn==1.6.1) (1.14.1)\n",
43
+ "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn==1.6.1) (3.6.0)\n"
44
+ ]
45
+ }
46
+ ],
47
+ "source": [
48
+ "# Install required packages\n",
49
+ "!pip install -U scikit-learn==1.6.1 joblib imbalanced-learn"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": 2,
55
+ "metadata": {
56
+ "id": "zh2cG0-8e5J3"
57
+ },
58
+ "outputs": [],
59
+ "source": [
60
+ "# Import libraries\n",
61
+ "import os\n",
62
+ "import pandas as pd\n",
63
+ "import numpy as np\n",
64
+ "import joblib\n",
65
+ "import matplotlib.pyplot as plt\n",
66
+ "import seaborn as sns\n",
67
+ "import random\n",
68
+ "from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, RandomizedSearchCV\n",
69
+ "from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score\n",
70
+ "from sklearn.ensemble import RandomForestClassifier\n",
71
+ "from imblearn.over_sampling import SMOTE\n",
72
+ "from sklearn.preprocessing import LabelEncoder\n",
73
+ "from sklearn.calibration import CalibratedClassifierCV\n",
74
+ "\n",
75
+ "# Set seed for reproducibility across all random processes\n",
76
+ "SEED = 42\n",
77
+ "np.random.seed(SEED)\n",
78
+ "random.seed(SEED)\n",
79
+ "os.environ['PYTHONHASHSEED'] = str(SEED)"
80
+ ]
81
+ },
82
+ {
83
+ "cell_type": "code",
84
+ "execution_count": 3,
85
+ "metadata": {
86
+ "colab": {
87
+ "base_uri": "https://localhost:8080/"
88
+ },
89
+ "id": "lUBXg0QGe_19",
90
+ "outputId": "916a5487-cdca-45ed-b6f3-d330f258189f"
91
+ },
92
+ "outputs": [
93
+ {
94
+ "name": "stdout",
95
+ "output_type": "stream",
96
+ "text": [
97
+ "Loading dataset...\n",
98
+ "Dataset loaded. Shape: (10000, 18)\n",
99
+ "\n",
100
+ "Dataset Info:\n",
101
+ "<class 'pandas.core.frame.DataFrame'>\n",
102
+ "RangeIndex: 10000 entries, 0 to 9999\n",
103
+ "Data columns (total 18 columns):\n",
104
+ " # Column Non-Null Count Dtype \n",
105
+ "--- ------ -------------- ----- \n",
106
+ " 0 Age 10000 non-null int64 \n",
107
+ " 1 Gender 10000 non-null object \n",
108
+ " 2 Sport_Type 10000 non-null object \n",
109
+ " 3 Experience_Level 10000 non-null object \n",
110
+ " 4 Flexibility_Score 10000 non-null float64\n",
111
+ " 5 Total_Weekly_Training_Hours 10000 non-null float64\n",
112
+ " 6 High_Intensity_Training_Hours 10000 non-null float64\n",
113
+ " 7 Strength_Training_Frequency 10000 non-null int64 \n",
114
+ " 8 Recovery_Time_Between_Sessions 10000 non-null float64\n",
115
+ " 9 Training_Load_Score 10000 non-null float64\n",
116
+ " 10 Sprint_Speed 10000 non-null float64\n",
117
+ " 11 Endurance_Score 10000 non-null float64\n",
118
+ " 12 Agility_Score 10000 non-null float64\n",
119
+ " 13 Fatigue_Level 10000 non-null int64 \n",
120
+ " 14 Previous_Injury_Count 10000 non-null int64 \n",
121
+ " 15 Previous_Injury_Type 10000 non-null object \n",
122
+ " 16 Injury_Risk_Level 10000 non-null object \n",
123
+ " 17 Injury_Outcome 10000 non-null int64 \n",
124
+ "dtypes: float64(8), int64(5), object(5)\n",
125
+ "memory usage: 1.4+ MB\n",
126
+ "None\n",
127
+ "\n",
128
+ "Class Distribution:\n",
129
+ "Injury_Risk_Level\n",
130
+ "Medium 6014\n",
131
+ "Low 2832\n",
132
+ "High 1154\n",
133
+ "Name: count, dtype: int64\n"
134
+ ]
135
+ }
136
+ ],
137
+ "source": [
138
+ "# ---------------------- 1. Load Data ----------------------\n",
139
+ "print(\"Loading dataset...\")\n",
140
+ "data_path = \"/content/Refined_Sports_Injury_Dataset.csv\"\n",
141
+ "\n",
142
+ "# Check if file exists\n",
143
+ "if not os.path.exists(data_path):\n",
144
+ " raise FileNotFoundError(f\"Dataset file not found at {data_path}. Please ensure the file exists.\")\n",
145
+ "\n",
146
+ "df = pd.read_csv(data_path)\n",
147
+ "\n",
148
+ "# Validate dataset structure\n",
149
+ "expected_columns = [\n",
150
+ " \"Age\", \"Gender\", \"Sport_Type\", \"Experience_Level\", \"Flexibility_Score\",\n",
151
+ " \"Total_Weekly_Training_Hours\", \"High_Intensity_Training_Hours\", \"Strength_Training_Frequency\",\n",
152
+ " \"Recovery_Time_Between_Sessions\", \"Training_Load_Score\", \"Sprint_Speed\", \"Endurance_Score\",\n",
153
+ " \"Agility_Score\", \"Fatigue_Level\", \"Previous_Injury_Count\", \"Previous_Injury_Type\",\n",
154
+ " \"Injury_Risk_Level\"\n",
155
+ "]\n",
156
+ "if not all(col in df.columns for col in expected_columns):\n",
157
+ " missing_cols = [col for col in expected_columns if col not in df.columns]\n",
158
+ " raise ValueError(f\"Dataset is missing required columns: {missing_cols}\")\n",
159
+ "\n",
160
+ "# Display dataset info\n",
161
+ "print(\"Dataset loaded. Shape:\", df.shape)\n",
162
+ "print(\"\\nDataset Info:\")\n",
163
+ "print(df.info())\n",
164
+ "print(\"\\nClass Distribution:\")\n",
165
+ "print(df[\"Injury_Risk_Level\"].value_counts())"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "execution_count": 4,
171
+ "metadata": {
172
+ "colab": {
173
+ "base_uri": "https://localhost:8080/"
174
+ },
175
+ "id": "5NyQ1yuhLLRW",
176
+ "outputId": "ab2a5946-d752-46ba-ea42-69df3e6d8c4a"
177
+ },
178
+ "outputs": [
179
+ {
180
+ "name": "stdout",
181
+ "output_type": "stream",
182
+ "text": [
183
+ "\n",
184
+ "🔠 Encoding categorical columns...\n",
185
+ "Encoded class mapping: {'High': 0, 'Low': 1, 'Medium': 2}\n",
186
+ "Sample of encoded data:\n",
187
+ " Age Gender Sport_Type Experience_Level Flexibility_Score \\\n",
188
+ "0 34 0 2 0 7.2 \n",
189
+ "1 29 1 4 3 8.5 \n",
190
+ "2 31 0 2 2 6.8 \n",
191
+ "3 27 1 0 1 7.9 \n",
192
+ "4 33 0 3 2 6.5 \n",
193
+ "\n",
194
+ " Total_Weekly_Training_Hours High_Intensity_Training_Hours \\\n",
195
+ "0 12.0 4.0 \n",
196
+ "1 8.0 2.0 \n",
197
+ "2 15.0 6.0 \n",
198
+ "3 10.0 3.0 \n",
199
+ "4 9.0 3.0 \n",
200
+ "\n",
201
+ " Strength_Training_Frequency Recovery_Time_Between_Sessions \\\n",
202
+ "0 3 48.0 \n",
203
+ "1 2 72.0 \n",
204
+ "2 4 36.0 \n",
205
+ "3 2 60.0 \n",
206
+ "4 3 48.0 \n",
207
+ "\n",
208
+ " Training_Load_Score Sprint_Speed Endurance_Score Agility_Score \\\n",
209
+ "0 65.0 6.8 7.5 7.0 \n",
210
+ "1 45.0 7.2 8.0 8.2 \n",
211
+ "2 80.0 6.5 7.0 6.8 \n",
212
+ "3 55.0 7.0 7.8 7.5 \n",
213
+ "4 60.0 6.7 7.2 6.9 \n",
214
+ "\n",
215
+ " Fatigue_Level Previous_Injury_Count Previous_Injury_Type \\\n",
216
+ "0 4 1 1 \n",
217
+ "1 2 0 0 \n",
218
+ "2 6 2 2 \n",
219
+ "3 3 0 0 \n",
220
+ "4 5 1 3 \n",
221
+ "\n",
222
+ " Injury_Risk_Level Injury_Outcome \n",
223
+ "0 2 0 \n",
224
+ "1 1 0 \n",
225
+ "2 0 1 \n",
226
+ "3 1 0 \n",
227
+ "4 2 0 \n"
228
+ ]
229
+ }
230
+ ],
231
+ "source": [
232
+ "# ---------------------- 2. Data Preprocessing & Encoding ----------------------\n",
233
+ "print(\"\\n🔠 Encoding categorical columns...\")\n",
234
+ "\n",
235
+ "# Define mappings for categorical variables\n",
236
+ "gender_mapping = {\"Male\": 0, \"Female\": 1}\n",
237
+ "experience_mapping = {\"Beginner\": 0, \"Intermediate\": 1, \"Advanced\": 2, \"Professional\": 3}\n",
238
+ "injury_type_mapping = {\"None\": 0, \"Sprain\": 1, \"Ligament Tear\": 2, \"Tendonitis\": 3, \"Strain\": 4, \"Fracture\": 5}\n",
239
+ "\n",
240
+ "# Handle NaN values in Previous_Injury_Type by treating them as 'None'\n",
241
+ "df[\"Previous_Injury_Type\"] = df[\"Previous_Injury_Type\"].fillna(\"None\")\n",
242
+ "\n",
243
+ "# Validate categorical columns before encoding\n",
244
+ "for col, mapping in [(\"Gender\", gender_mapping), (\"Experience_Level\", experience_mapping), (\"Previous_Injury_Type\", injury_type_mapping)]:\n",
245
+ " invalid_values = set(df[col]) - set(mapping.keys())\n",
246
+ " if invalid_values:\n",
247
+ " raise ValueError(f\"Invalid values found in {col}: {invalid_values}. Expected values: {list(mapping.keys())}\")\n",
248
+ "\n",
249
+ "# Encode Gender\n",
250
+ "df[\"Gender\"] = df[\"Gender\"].map(gender_mapping).fillna(0).astype(int)\n",
251
+ "\n",
252
+ "# Encode Sport_Type dynamically\n",
253
+ "df[\"Sport_Type\"] = df[\"Sport_Type\"].astype(\"category\").cat.codes\n",
254
+ "\n",
255
+ "# Encode Experience_Level\n",
256
+ "df[\"Experience_Level\"] = df[\"Experience_Level\"].map(experience_mapping).fillna(0).astype(int)\n",
257
+ "\n",
258
+ "# Encode Previous_Injury_Type\n",
259
+ "df[\"Previous_Injury_Type\"] = df[\"Previous_Injury_Type\"].map(injury_type_mapping).fillna(0).astype(int)\n",
260
+ "\n",
261
+ "# Encode target variable (Injury_Risk_Level)\n",
262
+ "le = LabelEncoder()\n",
263
+ "df[\"Injury_Risk_Level\"] = le.fit_transform(df[\"Injury_Risk_Level\"].astype(str))\n",
264
+ "\n",
265
+ "# Verify encoding\n",
266
+ "print(\"Encoded class mapping:\", dict(zip(le.classes_, range(len(le.classes_)))))\n",
267
+ "print(\"Sample of encoded data:\\n\", df.head())"
268
+ ]
269
+ },
270
+ {
271
+ "cell_type": "code",
272
+ "execution_count": null,
273
+ "metadata": {
274
+ "colab": {
275
+ "base_uri": "https://localhost:8080/"
276
+ },
277
+ "id": "Y4373JooLLit",
278
+ "outputId": "b9760f3d-3d2b-41b1-a63f-a6930099e0f7"
279
+ },
280
+ "outputs": [
281
+ {
282
+ "name": "stdout",
283
+ "output_type": "stream",
284
+ "text": [
285
+ "\n",
286
+ "🛠️ Creating derived features...\n",
287
+ "Features created: ['Age', 'Gender', 'Sport_Type', 'Experience_Level', 'Flexibility_Score', 'Total_Weekly_Training_Hours', 'High_Intensity_Training_Hours', 'Strength_Training_Frequency', 'Recovery_Time_Between_Sessions', 'Training_Load_Score', 'Sprint_Speed', 'Endurance_Score', 'Agility_Score', 'Fatigue_Level', 'Previous_Injury_Count', 'Previous_Injury_Type', 'Intensity_Ratio', 'Recovery_Per_Training']\n",
288
+ "Sample of features:\n",
289
+ " Age Gender Sport_Type Experience_Level Flexibility_Score \\\n",
290
+ "0 34 0 2 0 7.2 \n",
291
+ "1 29 1 4 3 8.5 \n",
292
+ "2 31 0 2 2 6.8 \n",
293
+ "3 27 1 0 1 7.9 \n",
294
+ "4 33 0 3 2 6.5 \n",
295
+ "\n",
296
+ " Total_Weekly_Training_Hours High_Intensity_Training_Hours \\\n",
297
+ "0 12.0 4.0 \n",
298
+ "1 8.0 2.0 \n",
299
+ "2 15.0 6.0 \n",
300
+ "3 10.0 3.0 \n",
301
+ "4 9.0 3.0 \n",
302
+ "\n",
303
+ " Strength_Training_Frequency Recovery_Time_Between_Sessions \\\n",
304
+ "0 3 48.0 \n",
305
+ "1 2 72.0 \n",
306
+ "2 4 36.0 \n",
307
+ "3 2 60.0 \n",
308
+ "4 3 48.0 \n",
309
+ "\n",
310
+ " Training_Load_Score Sprint_Speed Endurance_Score Agility_Score \\\n",
311
+ "0 65.0 6.8 7.5 7.0 \n",
312
+ "1 45.0 7.2 8.0 8.2 \n",
313
+ "2 80.0 6.5 7.0 6.8 \n",
314
+ "3 55.0 7.0 7.8 7.5 \n",
315
+ "4 60.0 6.7 7.2 6.9 \n",
316
+ "\n",
317
+ " Fatigue_Level Previous_Injury_Count Previous_Injury_Type Intensity_Ratio \\\n",
318
+ "0 4 1 1 0.333333 \n",
319
+ "1 2 0 0 0.250000 \n",
320
+ "2 6 2 2 0.400000 \n",
321
+ "3 3 0 0 0.300000 \n",
322
+ "4 5 1 3 0.333333 \n",
323
+ "\n",
324
+ " Recovery_Per_Training \n",
325
+ "0 4.0 \n",
326
+ "1 9.0 \n",
327
+ "2 2.4 \n",
328
+ "3 6.0 \n",
329
+ "4 5.3 \n"
330
+ ]
331
+ }
332
+ ],
333
+ "source": [
334
+ "# ---------------------- 3. Feature Engineering ----------------------\n",
335
+ "print(\"\\n🛠️ Creating derived features...\")\n",
336
+ "\n",
337
+ "# Replace 0 with 0.1 in Total_Weekly_Training_Hours to avoid division by zero\n",
338
+ "df[\"Total_Weekly_Training_Hours\"] = df[\"Total_Weekly_Training_Hours\"].replace(0, 0.1)\n",
339
+ "\n",
340
+ "# Create derived features\n",
341
+ "df[\"Intensity_Ratio\"] = df[\"High_Intensity_Training_Hours\"] / df[\"Total_Weekly_Training_Hours\"]\n",
342
+ "df[\"Recovery_Per_Training\"] = df[\"Recovery_Time_Between_Sessions\"] / df[\"Total_Weekly_Training_Hours\"]\n",
343
+ "\n",
344
+ "# Check for NaN or infinite values in derived features\n",
345
+ "if df[[\"Intensity_Ratio\", \"Recovery_Per_Training\"]].isna().any().any() or np.isinf(df[[\"Intensity_Ratio\", \"Recovery_Per_Training\"]]).any().any():\n",
346
+ " raise ValueError(\"NaN or infinite values found in derived features. Check data for inconsistencies.\")\n",
347
+ "\n",
348
+ "# Define features\n",
349
+ "features = [\n",
350
+ " \"Age\", \"Gender\", \"Sport_Type\", \"Experience_Level\", \"Flexibility_Score\",\n",
351
+ " \"Total_Weekly_Training_Hours\", \"High_Intensity_Training_Hours\", \"Strength_Training_Frequency\",\n",
352
+ " \"Recovery_Time_Between_Sessions\", \"Training_Load_Score\", \"Sprint_Speed\", \"Endurance_Score\",\n",
353
+ " \"Agility_Score\", \"Fatigue_Level\", \"Previous_Injury_Count\", \"Previous_Injury_Type\",\n",
354
+ " \"Intensity_Ratio\", \"Recovery_Per_Training\"\n",
355
+ "]\n",
356
+ "\n"
357
+ ]
358
+ },
359
+ {
360
+ "cell_type": "code",
361
+ "execution_count": null,
362
+ "metadata": {},
363
+ "outputs": [],
364
+ "source": [
365
+ "# Prepare features and target\n",
366
+ "X = df[features]\n",
367
+ "y = df[\"Injury_Risk_Level\"]\n",
368
+ "\n",
369
+ "# Verify features\n",
370
+ "print(\"Features created:\", features)\n",
371
+ "print(\"Sample of features:\\n\", X.head())"
372
+ ]
373
+ },
374
+ {
375
+ "cell_type": "code",
376
+ "execution_count": 6,
377
+ "metadata": {
378
+ "colab": {
379
+ "base_uri": "https://localhost:8080/"
380
+ },
381
+ "id": "MORGKXZULLsQ",
382
+ "outputId": "8c993957-7909-400b-a311-eab5fda2127b"
383
+ },
384
+ "outputs": [
385
+ {
386
+ "name": "stdout",
387
+ "output_type": "stream",
388
+ "text": [
389
+ "\n",
390
+ "📊 Splitting data & applying SMOTE...\n",
391
+ "Training set class distribution after SMOTE:\n",
392
+ "0 4811\n",
393
+ "1 4811\n",
394
+ "2 4811\n",
395
+ "Name: count, dtype: int64\n"
396
+ ]
397
+ }
398
+ ],
399
+ "source": [
400
+ "# ---------------------- 4. Train/Test Split & SMOTE ----------------------\n",
401
+ "print(\"\\n📊 Splitting data & applying SMOTE...\")\n",
402
+ "\n",
403
+ "# Split data into train and test sets\n",
404
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=SEED)\n",
405
+ "\n",
406
+ "# Apply SMOTE to balance the training set\n",
407
+ "try:\n",
408
+ " sm = SMOTE(random_state=SEED)\n",
409
+ " X_train_res, y_train_res = sm.fit_resample(X_train, y_train)\n",
410
+ "except Exception as e:\n",
411
+ " raise RuntimeError(f\"SMOTE failed: {str(e)}. Check for invalid data in X_train or y_train.\")\n",
412
+ "\n",
413
+ "# Print class distribution after SMOTE\n",
414
+ "print(\"Training set class distribution after SMOTE:\")\n",
415
+ "print(pd.Series(y_train_res).value_counts())"
416
+ ]
417
+ },
418
+ {
419
+ "cell_type": "code",
420
+ "execution_count": 7,
421
+ "metadata": {
422
+ "colab": {
423
+ "base_uri": "https://localhost:8080/"
424
+ },
425
+ "id": "sKycZa1YLL11",
426
+ "outputId": "94940989-75fe-4cae-aee9-908a26d63bc1"
427
+ },
428
+ "outputs": [
429
+ {
430
+ "name": "stdout",
431
+ "output_type": "stream",
432
+ "text": [
433
+ "\n",
434
+ "🔍 Performing hyperparameter tuning...\n",
435
+ "Best hyperparameters: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 25}\n",
436
+ "Best cross-validation F1-score: 0.947693499624901\n"
437
+ ]
438
+ }
439
+ ],
440
+ "source": [
441
+ "# ---------------------- 5. Hyperparameter Tuning with RandomizedSearchCV ----------------------\n",
442
+ "print(\"\\n🔍 Performing hyperparameter tuning...\")\n",
443
+ "\n",
444
+ "# Compute class weights to prioritize 'High' class\n",
445
+ "class_weights = {0: 2.0, 1: 1.0, 2: 1.0} # Higher weight for 'High' (encoded as 0)\n",
446
+ "\n",
447
+ "# Define RandomForest model\n",
448
+ "rf = RandomForestClassifier(random_state=SEED, class_weight=class_weights)\n",
449
+ "\n",
450
+ "# Define reduced hyperparameter grid for faster tuning\n",
451
+ "param_dist = {\n",
452
+ " 'n_estimators': [100, 200, 300],\n",
453
+ " 'max_depth': [15, 20, 25],\n",
454
+ " 'min_samples_split': [2, 5],\n",
455
+ " 'min_samples_leaf': [1, 2]\n",
456
+ "}\n",
457
+ "\n",
458
+ "# Perform randomized search with cross-validation\n",
459
+ "cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)\n",
460
+ "random_search = RandomizedSearchCV(\n",
461
+ " estimator=rf, param_distributions=param_dist, n_iter=15, cv=cv,\n",
462
+ " scoring='f1_macro', n_jobs=1, random_state=SEED\n",
463
+ ")\n",
464
+ "random_search.fit(X_train_res, y_train_res)\n",
465
+ "\n",
466
+ "# Get best model\n",
467
+ "best_rf = random_search.best_estimator_\n",
468
+ "print(\"Best hyperparameters:\", random_search.best_params_)\n",
469
+ "print(\"Best cross-validation F1-score:\", random_search.best_score_)"
470
+ ]
471
+ },
472
+ {
473
+ "cell_type": "code",
474
+ "execution_count": 8,
475
+ "metadata": {
476
+ "colab": {
477
+ "base_uri": "https://localhost:8080/"
478
+ },
479
+ "id": "KJCINYHCLL_O",
480
+ "outputId": "d03379d8-2c2a-4db6-997f-73ad2dd86616"
481
+ },
482
+ "outputs": [
483
+ {
484
+ "name": "stdout",
485
+ "output_type": "stream",
486
+ "text": [
487
+ "\n",
488
+ "📏 Calibrating probabilities...\n",
489
+ "\n",
490
+ "📈 Model Evaluation on Test Set:\n",
491
+ "F1 Score (Macro): 0.9141359042280297\n",
492
+ "Accuracy: 0.929\n",
493
+ "\n",
494
+ "Classification Report:\n",
495
+ " precision recall f1-score support\n",
496
+ "\n",
497
+ " High 0.83 0.92 0.87 231\n",
498
+ " Low 0.92 0.95 0.93 566\n",
499
+ " Medium 0.96 0.92 0.94 1203\n",
500
+ "\n",
501
+ " accuracy 0.93 2000\n",
502
+ " macro avg 0.90 0.93 0.91 2000\n",
503
+ "weighted avg 0.93 0.93 0.93 2000\n",
504
+ "\n",
505
+ "Cross-Validation F1 Scores: [0.94929183 0.94712258 0.94513455 0.94776378 0.95005877]\n",
506
+ "Mean CV F1 Score: 0.9479 ± 0.0017\n"
507
+ ]
508
+ }
509
+ ],
510
+ "source": [
511
+ "# ---------------------- 6. Calibrate Probabilities ----------------------\n",
512
+ "print(\"\\n📏 Calibrating probabilities...\")\n",
513
+ "\n",
514
+ "# Calibrate the best model using CalibratedClassifierCV with reduced folds\n",
515
+ "calibrated_rf = CalibratedClassifierCV(best_rf, method='sigmoid', cv=3, ensemble=True)\n",
516
+ "calibrated_rf.fit(X_train_res, y_train_res)\n",
517
+ "\n",
518
+ "# Evaluate calibrated model on test set\n",
519
+ "y_pred = calibrated_rf.predict(X_test)\n",
520
+ "y_proba = calibrated_rf.predict_proba(X_test)\n",
521
+ "\n",
522
+ "# Print evaluation metrics\n",
523
+ "print(\"\\n📈 Model Evaluation on Test Set:\")\n",
524
+ "print(\"F1 Score (Macro):\", f1_score(y_test, y_pred, average=\"macro\"))\n",
525
+ "print(\"Accuracy:\", accuracy_score(y_test, y_pred))\n",
526
+ "print(\"\\nClassification Report:\\n\", classification_report(y_test, y_pred, target_names=le.classes_))\n",
527
+ "\n",
528
+ "# Cross-validation on calibrated model\n",
529
+ "cv_scores = cross_val_score(calibrated_rf, X_train_res, y_train_res, cv=cv, scoring=\"f1_macro\")\n",
530
+ "print(f\"Cross-Validation F1 Scores: {cv_scores}\")\n",
531
+ "print(f\"Mean CV F1 Score: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}\")"
532
+ ]
533
+ },
534
+ {
535
+ "cell_type": "code",
536
+ "execution_count": 9,
537
+ "metadata": {
538
+ "colab": {
539
+ "base_uri": "https://localhost:8080/"
540
+ },
541
+ "id": "lB69j9OWLMJP",
542
+ "outputId": "66942bbc-dcaf-4441-97d5-5b2715f9c508"
543
+ },
544
+ "outputs": [
545
+ {
546
+ "name": "stdout",
547
+ "output_type": "stream",
548
+ "text": [
549
+ "\n",
550
+ "📉 Generating visualizations...\n",
551
+ "Visuals saved to model/ directory.\n"
552
+ ]
553
+ }
554
+ ],
555
+ "source": [
556
+ "# ---------------------- 7. Visual Insights ----------------------\n",
557
+ "print(\"\\n📉 Generating visualizations...\")\n",
558
+ "\n",
559
+ "# Ensure model directory exists\n",
560
+ "os.makedirs(\"model\", exist_ok=True)\n",
561
+ "\n",
562
+ "# Confusion Matrix\n",
563
+ "conf_matrix = confusion_matrix(y_test, y_pred)\n",
564
+ "plt.figure(figsize=(8, 6))\n",
565
+ "sns.heatmap(conf_matrix, annot=True, fmt=\"g\", cmap=\"Blues\", xticklabels=le.classes_, yticklabels=le.classes_)\n",
566
+ "plt.title(\"Confusion Matrix - RandomForest\")\n",
567
+ "plt.xlabel(\"Predicted\")\n",
568
+ "plt.ylabel(\"True\")\n",
569
+ "plt.savefig(\"model/rf_confusion_matrix.png\")\n",
570
+ "plt.close()\n",
571
+ "\n",
572
+ "# Feature Importance\n",
573
+ "feat_imp = pd.DataFrame({\"Feature\": features, \"Importance\": best_rf.feature_importances_})\n",
574
+ "feat_imp.sort_values(\"Importance\", ascending=False, inplace=True)\n",
575
+ "plt.figure(figsize=(10, 6))\n",
576
+ "sns.barplot(x=\"Importance\", y=\"Feature\", data=feat_imp)\n",
577
+ "plt.title(\"Feature Importances - RandomForest\")\n",
578
+ "plt.tight_layout()\n",
579
+ "plt.savefig(\"model/rf_feature_importance.png\")\n",
580
+ "plt.close()\n",
581
+ "\n",
582
+ "print(\"Visuals saved to model/ directory.\")"
583
+ ]
584
+ },
585
+ {
586
+ "cell_type": "code",
587
+ "execution_count": 10,
588
+ "metadata": {
589
+ "colab": {
590
+ "base_uri": "https://localhost:8080/"
591
+ },
592
+ "id": "KJCINYHCLL_O",
593
+ "outputId": "d03379d8-2c2a-4db6-997f-73ad2dd86616"
594
+ },
595
+ "outputs": [
596
+ {
597
+ "name": "stdout",
598
+ "output_type": "stream",
599
+ "text": [
600
+ "\n",
601
+ "💾 Saving model and encoder...\n",
602
+ "Model and encoder saved to model/ directory.\n"
603
+ ]
604
+ }
605
+ ],
606
+ "source": [
607
+ "# ---------------------- 8. Save Model and Encoder ----------------------\n",
608
+ "print(\"\\n💾 Saving model and encoder...\")\n",
609
+ "\n",
610
+ "# Ensure model directory exists\n",
611
+ "os.makedirs(\"model\", exist_ok=True)\n",
612
+ "\n",
613
+ "# Save the calibrated model\n",
614
+ "joblib.dump(calibrated_rf, \"model/rf_injury_model.pkl\")\n",
615
+ "\n",
616
+ "# Save the label encoder\n",
617
+ "joblib.dump(le, \"model/rf_target_encoder.pkl\")\n",
618
+ "\n",
619
+ "print(\"Model and encoder saved to model/ directory.\")"
620
+ ]
621
+ }
622
+ ],
623
+ "metadata": {
624
+ "colab": {
625
+ "provenance": []
626
+ },
627
+ "kernelspec": {
628
+ "display_name": "Python 3",
629
+ "name": "python3"
630
+ },
631
+ "language_info": {
632
+ "name": "python"
633
+ }
634
+ },
635
+ "nbformat": 4,
636
+ "nbformat_minor": 0
637
+ }
deployment/model/XGBOOST.ipynb ADDED
@@ -0,0 +1,546 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": []
7
+ },
8
+ "kernelspec": {
9
+ "name": "python3",
10
+ "display_name": "Python 3"
11
+ },
12
+ "language_info": {
13
+ "name": "python"
14
+ }
15
+ },
16
+ "cells": [
17
+ {
18
+ "cell_type": "markdown",
19
+ "source": [
20
+ "# XGBoost Model for Injury Risk Prediction\n",
21
+ "\n",
22
+ "This notebook trains an XGBoost model to predict injury risk levels (Low, Medium, High) for athletes based on their training and physical attributes. The pipeline includes data loading, preprocessing, feature engineering, model training with hyperparameter tuning, probability calibration, and evaluation.\n",
23
+ "\n",
24
+ "## Objectives\n",
25
+ "- Load and preprocess the dataset consistently with the RandomForest model.\n",
26
+ "- Perform feature engineering to create meaningful features.\n",
27
+ "- Train an XGBoost model with hyperparameter tuning.\n",
28
+ "- Calibrate probabilities to ensure reliable confidence scores.\n",
29
+ "- Evaluate the model using appropriate metrics and visualizations.\n",
30
+ "- Save the model and encoder for use in the prediction pipeline."
31
+ ],
32
+ "metadata": {
33
+ "id": "X4Q8pyQ9mKOa"
34
+ }
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": 1,
39
+ "metadata": {
40
+ "id": "X4Q8pyQ9mKOa"
41
+ },
42
+ "outputs": [],
43
+ "source": [
44
+ "# Import libraries\n",
45
+ "import os\n",
46
+ "import pandas as pd\n",
47
+ "import numpy as np\n",
48
+ "import xgboost as xgb\n",
49
+ "import joblib\n",
50
+ "import matplotlib.pyplot as plt\n",
51
+ "import seaborn as sns\n",
52
+ "from sklearn.model_selection import train_test_split, GridSearchCV\n",
53
+ "from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score\n",
54
+ "from sklearn.preprocessing import LabelEncoder\n",
55
+ "from imblearn.over_sampling import SMOTE\n",
56
+ "from sklearn.calibration import CalibratedClassifierCV\n",
57
+ "\n",
58
+ "# Set seed for reproducibility\n",
59
+ "np.random.seed(42)"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "source": [
65
+ "# ---------------------- 1. Load Dataset ----------------------\n",
66
+ "# Load the dataset\n",
67
+ "df = pd.read_csv(\"/content/Refined_Sports_Injury_Dataset.csv\")\n",
68
+ "\n",
69
+ "# Display dataset info\n",
70
+ "print(\"Dataset loaded. Shape:\", df.shape)\n",
71
+ "print(\"\\nDataset Info:\")\n",
72
+ "print(df.info())\n",
73
+ "print(\"\\nClass Distribution:\")\n",
74
+ "print(df[\"Injury_Risk_Level\"].value_counts())"
75
+ ],
76
+ "metadata": {
77
+ "id": "AkHrpR7omsxv",
78
+ "colab": {
79
+ "base_uri": "https://localhost:8080/"
80
+ },
81
+ "outputId": "e685149d-3638-4c1c-9261-be48b842afab"
82
+ },
83
+ "execution_count": 2,
84
+ "outputs": [
85
+ {
86
+ "output_type": "stream",
87
+ "name": "stdout",
88
+ "text": [
89
+ "Dataset loaded. Shape: (10000, 18)\n",
90
+ "\n",
91
+ "Dataset Info:\n",
92
+ "<class 'pandas.core.frame.DataFrame'>\n",
93
+ "RangeIndex: 10000 entries, 0 to 9999\n",
94
+ "Data columns (total 18 columns):\n",
95
+ " # Column Non-Null Count Dtype \n",
96
+ "--- ------ -------------- ----- \n",
97
+ " 0 Age 10000 non-null int64 \n",
98
+ " 1 Gender 10000 non-null object \n",
99
+ " 2 Sport_Type 10000 non-null object \n",
100
+ " 3 Experience_Level 10000 non-null object \n",
101
+ " 4 Flexibility_Score 10000 non-null float64\n",
102
+ " 5 Total_Weekly_Training_Hours 10000 non-null float64\n",
103
+ " 6 High_Intensity_Training_Hours 10000 non-null float64\n",
104
+ " 7 Strength_Training_Frequency 10000 non-null int64 \n",
105
+ " 8 Recovery_Time_Between_Sessions 10000 non-null float64\n",
106
+ " 9 Training_Load_Score 10000 non-null float64\n",
107
+ " 10 Sprint_Speed 10000 non-null float64\n",
108
+ " 11 Endurance_Score 10000 non-null float64\n",
109
+ " 12 Agility_Score 10000 non-null float64\n",
110
+ " 13 Fatigue_Level 10000 non-null int64 \n",
111
+ " 14 Previous_Injury_Count 10000 non-null int64 \n",
112
+ " 15 Previous_Injury_Type 10000 non-null object \n",
113
+ " 16 Injury_Risk_Level 10000 non-null object \n",
114
+ " 17 Injury_Outcome 10000 non-null int64 \n",
115
+ "dtypes: float64(8), int64(5), object(5)\n",
116
+ "memory usage: 1.4+ MB\n",
117
+ "None\n",
118
+ "\n",
119
+ "Class Distribution:\n",
120
+ "Injury_Risk_Level\n",
121
+ "Medium 6014\n",
122
+ "Low 2832\n",
123
+ "High 1154\n",
124
+ "Name: count, dtype: int64\n"
125
+ ]
126
+ }
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "source": [
132
+ "# ---------------------- 2. Encode Categorical Columns ----------------------\n",
133
+ "print(\"\\n🔠 Encoding categorical columns...\")\n",
134
+ "\n",
135
+ "# Define mappings for categorical variables (consistent with RandomForest)\n",
136
+ "gender_mapping = {\"Male\": 0, \"Female\": 1}\n",
137
+ "experience_mapping = {\"Beginner\": 0, \"Intermediate\": 1, \"Advanced\": 2, \"Professional\": 3}\n",
138
+ "injury_type_mapping = {\"None\": 0, \"Sprain\": 1, \"Ligament Tear\": 2, \"Tendonitis\": 3, \"Strain\": 4, \"Fracture\": 5}\n",
139
+ "\n",
140
+ "# Encode Gender\n",
141
+ "df[\"Gender\"] = df[\"Gender\"].map(gender_mapping).fillna(0).astype(int)\n",
142
+ "\n",
143
+ "# Encode Sport_Type dynamically\n",
144
+ "df[\"Sport_Type\"] = df[\"Sport_Type\"].astype(\"category\").cat.codes\n",
145
+ "\n",
146
+ "# Encode Experience_Level\n",
147
+ "df[\"Experience_Level\"] = df[\"Experience_Level\"].map(experience_mapping).fillna(0).astype(int)\n",
148
+ "\n",
149
+ "# Encode Previous_Injury_Type\n",
150
+ "df[\"Previous_Injury_Type\"] = df[\"Previous_Injury_Type\"].map(injury_type_mapping).fillna(0).astype(int)\n",
151
+ "\n",
152
+ "# Encode Target\n",
153
+ "le = LabelEncoder()\n",
154
+ "df[\"Injury_Risk_Level\"] = le.fit_transform(df[\"Injury_Risk_Level\"].astype(str))\n",
155
+ "\n",
156
+ "# Verify encoding\n",
157
+ "print(\"Encoded class mapping:\", dict(zip(le.classes_, range(len(le.classes_)))))\n",
158
+ "print(\"Sample of encoded data:\\n\", df.head())"
159
+ ],
160
+ "metadata": {
161
+ "id": "B4MoE4YtKU0d",
162
+ "colab": {
163
+ "base_uri": "https://localhost:8080/"
164
+ },
165
+ "outputId": "e685149d-3638-4c1c-9261-be48b842afab"
166
+ },
167
+ "execution_count": 3,
168
+ "outputs": [
169
+ {
170
+ "output_type": "stream",
171
+ "name": "stdout",
172
+ "text": [
173
+ "\n",
174
+ "🔠 Encoding categorical columns...\n",
175
+ "Encoded class mapping: {'High': 0, 'Low': 1, 'Medium': 2}\n",
176
+ "Sample of encoded data:\n",
177
+ " Age Gender Sport_Type Experience_Level Flexibility_Score \\\n",
178
+ "0 34 0 2 0 7.2 \n",
179
+ "1 29 1 4 3 8.5 \n",
180
+ "2 31 0 2 2 6.8 \n",
181
+ "3 27 1 0 1 7.9 \n",
182
+ "4 33 0 3 2 6.5 \n",
183
+ "\n",
184
+ " Total_Weekly_Training_Hours High_Intensity_Training_Hours \\\n",
185
+ "0 12.0 4.0 \n",
186
+ "1 8.0 2.0 \n",
187
+ "2 15.0 6.0 \n",
188
+ "3 10.0 3.0 \n",
189
+ "4 9.0 3.0 \n",
190
+ "\n",
191
+ " Strength_Training_Frequency Recovery_Time_Between_Sessions \\\n",
192
+ "0 3 48.0 \n",
193
+ "1 2 72.0 \n",
194
+ "2 4 36.0 \n",
195
+ "3 2 60.0 \n",
196
+ "4 3 48.0 \n",
197
+ "\n",
198
+ " Training_Load_Score Sprint_Speed Endurance_Score Agility_Score \\\n",
199
+ "0 65.0 6.8 7.5 7.0 \n",
200
+ "1 45.0 7.2 8.0 8.2 \n",
201
+ "2 80.0 6.5 7.0 6.8 \n",
202
+ "3 55.0 7.0 7.8 7.5 \n",
203
+ "4 60.0 6.7 7.2 6.9 \n",
204
+ "\n",
205
+ " Fatigue_Level Previous_Injury_Count Previous_Injury_Type \\\n",
206
+ "0 4 1 1 \n",
207
+ "1 2 0 0 \n",
208
+ "2 6 2 2 \n",
209
+ "3 3 0 0 \n",
210
+ "4 5 1 3 \n",
211
+ "\n",
212
+ " Injury_Risk_Level Injury_Outcome \n",
213
+ "0 2 0 \n",
214
+ "1 1 0 \n",
215
+ "2 0 1 \n",
216
+ "3 1 0 \n",
217
+ "4 2 0 \n"
218
+ ]
219
+ }
220
+ ]
221
+ },
222
+ {
223
+ "cell_type": "code",
224
+ "source": [
225
+ "# ---------------------- 3. Feature Engineering ----------------------\n",
226
+ "print(\"\\n🛠️ Creating derived features...\")\n",
227
+ "\n",
228
+ "# Replace 0 with 0.1 in Total_Weekly_Training_Hours to avoid division by zero\n",
229
+ "df[\"Total_Weekly_Training_Hours\"] = df[\"Total_Weekly_Training_Hours\"].replace(0, 0.1)\n",
230
+ "\n",
231
+ "# Create derived features\n",
232
+ "df[\"Intensity_Ratio\"] = df[\"High_Intensity_Training_Hours\"] / df[\"Total_Weekly_Training_Hours\"]\n",
233
+ "df[\"Recovery_Per_Training\"] = df[\"Recovery_Time_Between_Sessions\"] / df[\"Total_Weekly_Training_Hours\"]\n",
234
+ "\n",
235
+ "# Define features (removed Predicted_Injury_Type)\n",
236
+ "features = [\n",
237
+ " \"Age\", \"Gender\", \"Sport_Type\", \"Experience_Level\", \"Flexibility_Score\",\n",
238
+ " \"Total_Weekly_Training_Hours\", \"High_Intensity_Training_Hours\", \"Strength_Training_Frequency\",\n",
239
+ " \"Recovery_Time_Between_Sessions\", \"Training_Load_Score\", \"Sprint_Speed\", \"Endurance_Score\",\n",
240
+ " \"Agility_Score\", \"Fatigue_Level\", \"Previous_Injury_Count\", \"Previous_Injury_Type\",\n",
241
+ " \"Intensity_Ratio\", \"Recovery_Per_Training\"\n",
242
+ "]\n",
243
+ "\n",
244
+ "# Prepare features and target\n",
245
+ "X = df[features]\n",
246
+ "y = df[\"Injury_Risk_Level\"]\n",
247
+ "\n",
248
+ "# Verify features\n",
249
+ "print(\"Features created:\", features)\n",
250
+ "print(\"Sample of features:\\n\", X.head())"
251
+ ],
252
+ "metadata": {
253
+ "id": "M4xvzYqzKU-9",
254
+ "colab": {
255
+ "base_uri": "https://localhost:8080/"
256
+ },
257
+ "outputId": "e685149d-3638-4c1c-9261-be48b842afab"
258
+ },
259
+ "execution_count": 4,
260
+ "outputs": [
261
+ {
262
+ "output_type": "stream",
263
+ "name": "stdout",
264
+ "text": [
265
+ "\n",
266
+ "🛠️ Creating derived features...\n",
267
+ "Features created: ['Age', 'Gender', 'Sport_Type', 'Experience_Level', 'Flexibility_Score', 'Total_Weekly_Training_Hours', 'High_Intensity_Training_Hours', 'Strength_Training_Frequency', 'Recovery_Time_Between_Sessions', 'Training_Load_Score', 'Sprint_Speed', 'Endurance_Score', 'Agility_Score', 'Fatigue_Level', 'Previous_Injury_Count', 'Previous_Injury_Type', 'Intensity_Ratio', 'Recovery_Per_Training']\n",
268
+ "Sample of features:\n",
269
+ " Age Gender Sport_Type Experience_Level Flexibility_Score \\\n",
270
+ "0 34 0 2 0 7.2 \n",
271
+ "1 29 1 4 3 8.5 \n",
272
+ "2 31 0 2 2 6.8 \n",
273
+ "3 27 1 0 1 7.9 \n",
274
+ "4 33 0 3 2 6.5 \n",
275
+ "\n",
276
+ " Total_Weekly_Training_Hours High_Intensity_Training_Hours \\\n",
277
+ "0 12.0 4.0 \n",
278
+ "1 8.0 2.0 \n",
279
+ "2 15.0 6.0 \n",
280
+ "3 10.0 3.0 \n",
281
+ "4 9.0 3.0 \n",
282
+ "\n",
283
+ " Strength_Training_Frequency Recovery_Time_Between_Sessions \\\n",
284
+ "0 3 48.0 \n",
285
+ "1 2 72.0 \n",
286
+ "2 4 36.0 \n",
287
+ "3 2 60.0 \n",
288
+ "4 3 48.0 \n",
289
+ "\n",
290
+ " Training_Load_Score Sprint_Speed Endurance_Score Agility_Score \\\n",
291
+ "0 65.0 6.8 7.5 7.0 \n",
292
+ "1 45.0 7.2 8.0 8.2 \n",
293
+ "2 80.0 6.5 7.0 6.8 \n",
294
+ "3 55.0 7.0 7.8 7.5 \n",
295
+ "4 60.0 6.7 7.2 6.9 \n",
296
+ "\n",
297
+ " Fatigue_Level Previous_Injury_Count Previous_Injury_Type Intensity_Ratio \\\n",
298
+ "0 4 1 1 0.333333 \n",
299
+ "1 2 0 0 0.250000 \n",
300
+ "2 6 2 2 0.400000 \n",
301
+ "3 3 0 0 0.300000 \n",
302
+ "4 5 1 3 0.333333 \n",
303
+ "\n",
304
+ " Recovery_Per_Training \n",
305
+ "0 4.0 \n",
306
+ "1 9.0 \n",
307
+ "2 2.4 \n",
308
+ "3 6.0 \n",
309
+ "4 5.3 \n"
310
+ ]
311
+ }
312
+ ]
313
+ },
314
+ {
315
+ "cell_type": "code",
316
+ "source": [
317
+ "# ---------------------- 4. Train/Test Split & SMOTE ----------------------\n",
318
+ "print(\"\\n📊 Splitting data & applying SMOTE...\")\n",
319
+ "\n",
320
+ "# Split data into train and test sets\n",
321
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)\n",
322
+ "\n",
323
+ "# Apply SMOTE to balance the training set\n",
324
+ "X_train_res, y_train_res = SMOTE(random_state=42).fit_resample(X_train, y_train)\n",
325
+ "\n",
326
+ "# Print class distribution after SMOTE\n",
327
+ "print(\"Training set class distribution after SMOTE:\")\n",
328
+ "print(pd.Series(y_train_res).value_counts())"
329
+ ],
330
+ "metadata": {
331
+ "id": "GtyXbFTsKVUS",
332
+ "colab": {
333
+ "base_uri": "https://localhost:8080/"
334
+ },
335
+ "outputId": "e685149d-3638-4c1c-9261-be48b842afab"
336
+ },
337
+ "execution_count": 5,
338
+ "outputs": [
339
+ {
340
+ "output_type": "stream",
341
+ "name": "stdout",
342
+ "text": [
343
+ "\n",
344
+ "📊 Splitting data & applying SMOTE...\n",
345
+ "Training set class distribution after SMOTE:\n",
346
+ "0 4811\n",
347
+ "1 4811\n",
348
+ "2 4811\n",
349
+ "Name: count, dtype: int64\n"
350
+ ]
351
+ }
352
+ ]
353
+ },
354
+ {
355
+ "cell_type": "code",
356
+ "source": [
357
+ "# ---------------------- 5. Hyperparameter Tuning with GridSearchCV ----------------------\n",
358
+ "print(\"\\n🔍 Performing hyperparameter tuning...\")\n",
359
+ "\n",
360
+ "# Define XGBoost model\n",
361
+ "xgb_model = xgb.XGBClassifier(\n",
362
+ " objective=\"multi:softprob\",\n",
363
+ " eval_metric=\"mlogloss\",\n",
364
+ " num_class=len(le.classes_),\n",
365
+ " random_state=42\n",
366
+ ")\n",
367
+ "\n",
368
+ "# Define hyperparameter grid\n",
369
+ "param_grid = {\n",
370
+ " 'n_estimators': [100, 200, 300],\n",
371
+ " 'max_depth': [5, 10, 15],\n",
372
+ " 'learning_rate': [0.01, 0.1, 0.3],\n",
373
+ " 'subsample': [0.8, 1.0]\n",
374
+ "}\n",
375
+ "\n",
376
+ "# Perform grid search with cross-validation\n",
377
+ "grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='f1_macro', n_jobs=-1)\n",
378
+ "grid_search.fit(X_train_res, y_train_res)\n",
379
+ "\n",
380
+ "# Get best model\n",
381
+ "best_xgb = grid_search.best_estimator_\n",
382
+ "print(\"Best hyperparameters:\", grid_search.best_params_)\n",
383
+ "print(\"Best cross-validation F1-score:\", grid_search.best_score_)"
384
+ ],
385
+ "metadata": {
386
+ "id": "CFoUgTwZKVf3",
387
+ "colab": {
388
+ "base_uri": "https://localhost:8080/"
389
+ },
390
+ "outputId": "aae9b818-ef59-48e9-bb06-e079913edaf5"
391
+ },
392
+ "execution_count": 6,
393
+ "outputs": [
394
+ {
395
+ "output_type": "stream",
396
+ "name": "stdout",
397
+ "text": [
398
+ "\n",
399
+ "🔍 Performing hyperparameter tuning...\n",
400
+ "Best hyperparameters: {'learning_rate': 0.1, 'max_depth': 15, 'n_estimators': 300, 'subsample': 1.0}\n",
401
+ "Best cross-validation F1-score: 0.9561784041412857\n"
402
+ ]
403
+ }
404
+ ]
405
+ },
406
+ {
407
+ "cell_type": "code",
408
+ "source": [
409
+ "# ---------------------- 6. Calibrate Probabilities ----------------------\n",
410
+ "print(\"\\n📏 Calibrating probabilities...\")\n",
411
+ "\n",
412
+ "# Calibrate the best model using CalibratedClassifierCV\n",
413
+ "calibrated_xgb = CalibratedClassifierCV(best_xgb, method='sigmoid', cv=5)\n",
414
+ "calibrated_xgb.fit(X_train_res, y_train_res)\n",
415
+ "\n",
416
+ "# Evaluate calibrated model on test set\n",
417
+ "y_pred = calibrated_xgb.predict(X_test)\n",
418
+ "y_proba = calibrated_xgb.predict_proba(X_test)\n",
419
+ "\n",
420
+ "# Print evaluation metrics\n",
421
+ "print(\"\\n📈 Model Evaluation on Test Set:\")\n",
422
+ "print(\"F1 Score (Macro):\", f1_score(y_test, y_pred, average=\"macro\"))\n",
423
+ "print(\"Accuracy:\", accuracy_score(y_test, y_pred))\n",
424
+ "print(\"\\nClassification Report:\\n\", classification_report(y_test, y_pred, target_names=le.classes_))"
425
+ ],
426
+ "metadata": {
427
+ "id": "5Fgs6oHUKVwK",
428
+ "colab": {
429
+ "base_uri": "https://localhost:8080/"
430
+ },
431
+ "outputId": "c2f9145f-8f25-40ef-b1d9-26d85332cc2f"
432
+ },
433
+ "execution_count": 7,
434
+ "outputs": [
435
+ {
436
+ "output_type": "stream",
437
+ "name": "stdout",
438
+ "text": [
439
+ "\n",
440
+ "📏 Calibrating probabilities...\n",
441
+ "\n",
442
+ "📈 Model Evaluation on Test Set:\n",
443
+ "F1 Score (Macro): 0.9154651493598866\n",
444
+ "Accuracy: 0.929\n",
445
+ "\n",
446
+ "Classification Report:\n",
447
+ " precision recall f1-score support\n",
448
+ "\n",
449
+ " High 0.82 0.90 0.86 231\n",
450
+ " Low 0.93 0.94 0.94 566\n",
451
+ " Medium 0.95 0.93 0.94 1203\n",
452
+ "\n",
453
+ " accuracy 0.93 2000\n",
454
+ " macro avg 0.90 0.93 0.92 2000\n",
455
+ "weighted avg 0.93 0.93 0.93 2000\n"
456
+ ]
457
+ }
458
+ ]
459
+ },
460
+ {
461
+ "cell_type": "code",
462
+ "source": [
463
+ "# ---------------------- 7. Visual Insights ----------------------\n",
464
+ "print(\"\\n📉 Generating visualizations...\")\n",
465
+ "\n",
466
+ "# Confusion Matrix\n",
467
+ "conf_matrix = confusion_matrix(y_test, y_pred)\n",
468
+ "plt.figure(figsize=(8, 6))\n",
469
+ "sns.heatmap(conf_matrix, annot=True, fmt=\"g\", cmap=\"Blues\", xticklabels=le.classes_, yticklabels=le.classes_)\n",
470
+ "plt.title(\"Confusion Matrix - XGBoost\")\n",
471
+ "plt.xlabel(\"Predicted\")\n",
472
+ "plt.ylabel(\"True\")\n",
473
+ "plt.savefig(\"model/xgb_confusion_matrix.png\")\n",
474
+ "plt.close()\n",
475
+ "\n",
476
+ "# Feature Importance\n",
477
+ "feat_imp = pd.DataFrame({\"Feature\": features, \"Importance\": best_xgb.feature_importances_})\n",
478
+ "feat_imp.sort_values(\"Importance\", ascending=False, inplace=True)\n",
479
+ "plt.figure(figsize=(10, 6))\n",
480
+ "sns.barplot(x=\"Importance\", y=\"Feature\", data=feat_imp)\n",
481
+ "plt.title(\"Feature Importances - XGBoost\")\n",
482
+ "plt.tight_layout()\n",
483
+ "plt.savefig(\"model/xgb_feature_importance.png\")\n",
484
+ "plt.close()\n",
485
+ "\n",
486
+ "print(\"Visuals saved to model/ directory.\")"
487
+ ],
488
+ "metadata": {
489
+ "id": "5Fgs6oHUKVwK",
490
+ "colab": {
491
+ "base_uri": "https://localhost:8080/"
492
+ },
493
+ "outputId": "c2f9145f-8f25-40ef-b1d9-26d85332cc2f"
494
+ },
495
+ "execution_count": 8,
496
+ "outputs": [
497
+ {
498
+ "output_type": "stream",
499
+ "name": "stdout",
500
+ "text": [
501
+ "\n",
502
+ "📉 Generating visualizations...\n",
503
+ "Visuals saved to model/ directory.\n"
504
+ ]
505
+ }
506
+ ]
507
+ },
508
+ {
509
+ "cell_type": "code",
510
+ "source": [
511
+ "# ---------------------- 8. Save Model and Encoder ----------------------\n",
512
+ "print(\"\\n💾 Saving model and encoder...\")\n",
513
+ "\n",
514
+ "# Create model directory if it doesn't exist\n",
515
+ "os.makedirs(\"model\", exist_ok=True)\n",
516
+ "\n",
517
+ "# Save the calibrated model\n",
518
+ "joblib.dump(calibrated_xgb, \"model/xgboost_injury_model.pkl\")\n",
519
+ "\n",
520
+ "# Save the label encoder\n",
521
+ "joblib.dump(le, \"model/xgb_target_encoder.pkl\")\n",
522
+ "\n",
523
+ "print(\"Model and encoder saved to model/ directory.\")"
524
+ ],
525
+ "metadata": {
526
+ "id": "4bvmzxoJKlI1",
527
+ "colab": {
528
+ "base_uri": "https://localhost:8080/"
529
+ },
530
+ "outputId": "5753ab2c-7494-48f8-98df-ad70c45377b7"
531
+ },
532
+ "execution_count": 9,
533
+ "outputs": [
534
+ {
535
+ "output_type": "stream",
536
+ "name": "stdout",
537
+ "text": [
538
+ "\n",
539
+ "💾 Saving model and encoder...\n",
540
+ "Model and encoder saved to model/ directory.\n"
541
+ ]
542
+ }
543
+ ]
544
+ }
545
+ ]
546
+ }