{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# Calibrate Injury Likelihood Mapping\n",
        "\n",
        "This notebook calibrates the injury likelihood mapping for the injury risk prediction system. We use the training dataset (`Refined_Sports_Injury_Dataset.csv`) to map the models' predicted probabilities to injury probabilities.\n",
        "\n",
        "## Objectives\n",
        "- Load the dataset and trained models.\n",
        "- Preprocess the data consistently with the training pipeline.\n",
        "- Use the models to predict probabilities on a calibration set.\n",
        "- Fit a logistic regression model to map predicted probabilities to true injury probabilities.\n",
        "- Save the calibration model for use in `predict.py`.\n",
        "\n",
        "## Caveats\n",
        "- `Injury_Occurred` is not read from the dataset: it is simulated in the preprocessing step by sampling from a fixed probability per `Injury_Risk_Level` (High 0.95, Medium 0.5, Low 0.05). The calibrator therefore learns this simulated relationship.\n",
        "- The Brier score printed in step 5 is computed on the same rows the calibrator was fitted on, so it is an in-sample (optimistic) figure.\n",
        "- The calibration rows come from the CSV described above as the training dataset; if the base models saw these rows during training, their probabilities here may be over-confident."
      ],
      "metadata": {
        "id": "v4eUbEr3u9Rm"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Import libraries (all imports live in this cell so a fresh kernel can Run All)\n",
        "import os\n",
        "\n",
        "import joblib\n",
        "import matplotlib.pyplot as plt\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "from sklearn.metrics import brier_score_loss\n",
        "from sklearn.model_selection import train_test_split\n",
        "\n",
        "# Seed shared by every stochastic step (label simulation, train/calibration split)\n",
        "SEED = 42\n",
        "\n",
        "# Set random seed (legacy global generator, kept for any library code drawing from np.random)\n",
        "np.random.seed(SEED)"
      ],
      "metadata": {
        "id": "Y_dhli46u9Ro"
      },
      "execution_count": 1,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# ---------------------- 1. Load Data and Models ----------------------\n",
        "# Load the dataset. It is kept untouched as df_raw so the preprocessing cell can be re-run safely.\n",
        "data_path = \"/content/Refined_Sports_Injury_Dataset.csv\"\n",
        "try:\n",
        "    df_raw = pd.read_csv(data_path)\n",
        "except FileNotFoundError as e:\n",
        "    raise FileNotFoundError(f\"Dataset file not found at {data_path}. Please ensure the file exists.\") from e\n",
        "print(\"Dataset loaded. Shape:\", df_raw.shape)\n",
        "\n",
        "# Load the trained models with error handling\n",
        "model_dir = \"/content/model\"\n",
        "\n",
        "\n",
        "def load_model(file_name, label, notebook):\n",
        "    \"\"\"Load a pickled model from model_dir.\n",
        "\n",
        "    Args:\n",
        "        file_name: Name of the pickle file inside model_dir.\n",
        "        label: Human-readable model name used in the error message.\n",
        "        notebook: Training notebook expected to have produced the file.\n",
        "\n",
        "    Returns:\n",
        "        The object stored in the pickle file.\n",
        "\n",
        "    Raises:\n",
        "        FileNotFoundError: If loading fails for any reason; the original\n",
        "            exception is chained as the cause.\n",
        "    \"\"\"\n",
        "    model_path = f\"{model_dir}/{file_name}\"\n",
        "    try:\n",
        "        # NOTE: joblib.load unpickles arbitrary objects; only load files you produced yourself.\n",
        "        return joblib.load(model_path)\n",
        "    except Exception as e:\n",
        "        raise FileNotFoundError(\n",
        "            f\"Failed to load {label} model: {str(e)}. \"\n",
        "            f\"Ensure {notebook} has been run successfully to generate {model_path}.\"\n",
        "        ) from e\n",
        "\n",
        "\n",
        "rf_model = load_model(\"rf_injury_model.pkl\", \"RandomForest\", \"RandomForest.ipynb\")\n",
        "xgb_model = load_model(\"xgboost_injury_model.pkl\", \"XGBoost\", \"XGBOOST.ipynb\")\n",
        "\n",
        "print(\"Models loaded.\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "NPDVk8WLu9Rp",
        "outputId": "303e0a70-70b3-4796-d887-0b845c219e9a"
      },
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Dataset loaded. Shape: (10000, 18)\n",
            "Models loaded.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# ---------------------- 2. Preprocess Data ----------------------\n",
        "# Start from a fresh copy of the raw frame. The category mappings below are not\n",
        "# idempotent (mapping an already-encoded column gives NaN, which fillna turns into 0),\n",
        "# so mutating the loaded frame in place would corrupt the data if this cell were re-run.\n",
        "df = df_raw.copy()\n",
        "\n",
        "# Encode categorical columns (consistent with training notebooks)\n",
        "gender_mapping = {\"Male\": 0, \"Female\": 1}\n",
        "experience_mapping = {\"Beginner\": 0, \"Intermediate\": 1, \"Advanced\": 2, \"Professional\": 3}\n",
        "injury_type_mapping = {\"None\": 0, \"Sprain\": 1, \"Ligament Tear\": 2, \"Tendonitis\": 3, \"Strain\": 4, \"Fracture\": 5}\n",
        "\n",
        "# Handle NaN values in Previous_Injury_Type\n",
        "df[\"Previous_Injury_Type\"] = df[\"Previous_Injury_Type\"].fillna(\"None\")\n",
        "\n",
        "# NOTE: values that are missing from a mapping are silently encoded as 0 by fillna(0)\n",
        "df[\"Gender\"] = df[\"Gender\"].map(gender_mapping).fillna(0).astype(int)\n",
        "df[\"Sport_Type\"] = df[\"Sport_Type\"].astype(\"category\").cat.codes\n",
        "df[\"Experience_Level\"] = df[\"Experience_Level\"].map(experience_mapping).fillna(0).astype(int)\n",
        "df[\"Previous_Injury_Type\"] = df[\"Previous_Injury_Type\"].map(injury_type_mapping).fillna(0).astype(int)\n",
        "\n",
        "# Replace 0 with 0.1 in Total_Weekly_Training_Hours (it is the denominator of the ratios below)\n",
        "df[\"Total_Weekly_Training_Hours\"] = df[\"Total_Weekly_Training_Hours\"].replace(0, 0.1)\n",
        "\n",
        "# Create derived features\n",
        "df[\"Intensity_Ratio\"] = df[\"High_Intensity_Training_Hours\"] / df[\"Total_Weekly_Training_Hours\"]\n",
        "df[\"Recovery_Per_Training\"] = df[\"Recovery_Time_Between_Sessions\"] / df[\"Total_Weekly_Training_Hours\"]\n",
        "\n",
        "# Create Injury_Occurred column probabilistically based on Injury_Risk_Level\n",
        "print(\"Injury_Risk_Level distribution:\\n\", df[\"Injury_Risk_Level\"].value_counts())\n",
        "\n",
        "# Define probabilities of injury occurrence based on risk level\n",
        "injury_probabilities = {\n",
        "    \"High\": 0.95,   # 95% chance of injury\n",
        "    \"Medium\": 0.5,  # 50% chance of injury\n",
        "    \"Low\": 0.05     # 5% chance of injury\n",
        "}\n",
        "\n",
        "# Fail with a clear message (instead of a bare KeyError) on missing or unknown risk levels\n",
        "risk_probability = df[\"Injury_Risk_Level\"].map(injury_probabilities)\n",
        "if risk_probability.isna().any():\n",
        "    unexpected = df.loc[risk_probability.isna(), \"Injury_Risk_Level\"].unique().tolist()\n",
        "    raise ValueError(f\"Unexpected Injury_Risk_Level values: {unexpected}. Expected one of {list(injury_probabilities)}.\")\n",
        "\n",
        "# Generate Injury_Occurred with one Bernoulli draw per row, in row order.\n",
        "# A dedicated generator seeded here makes the labels the same every time this cell runs,\n",
        "# instead of depending on how much of the global np.random stream was already consumed.\n",
        "label_rng = np.random.RandomState(SEED)\n",
        "df[\"Injury_Occurred\"] = label_rng.binomial(1, risk_probability.to_numpy())\n",
        "\n",
        "# Check the distribution of Injury_Occurred\n",
        "print(\"Injury_Occurred distribution (full dataset):\\n\", df[\"Injury_Occurred\"].value_counts())\n",
        "\n",
        "# Ensure both classes are present\n",
        "if len(df[\"Injury_Occurred\"].unique()) < 2:\n",
        "    raise ValueError(\"Injury_Occurred contains only one class after probabilistic assignment. Adjust probabilities or dataset.\")\n",
        "\n",
        "# Define features\n",
        "features = [\n",
        "    \"Age\", \"Gender\", \"Sport_Type\", \"Experience_Level\", \"Flexibility_Score\",\n",
        "    \"Total_Weekly_Training_Hours\", \"High_Intensity_Training_Hours\", \"Strength_Training_Frequency\",\n",
        "    \"Recovery_Time_Between_Sessions\", \"Training_Load_Score\", \"Sprint_Speed\", \"Endurance_Score\",\n",
        "    \"Agility_Score\", \"Fatigue_Level\", \"Previous_Injury_Count\", \"Previous_Injury_Type\",\n",
        "    \"Intensity_Ratio\", \"Recovery_Per_Training\"\n",
        "]\n",
        "\n",
        "# Prepare features and target\n",
        "X = df[features]\n",
        "y_outcome = df[\"Injury_Occurred\"]\n",
        "\n",
        "print(\"Features prepared:\", features)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "rEUcpksxu9Rq",
        "outputId": "70eee7a9-b2f5-4d3c-a668-504cd0c9dd81"
      },
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Injury_Risk_Level distribution:\n",
            " Injury_Risk_Level\n",
            "Medium 6016\n",
            "Low 2827\n",
            "High 1157\n",
            "Name: count, dtype: int64\n",
            "Injury_Occurred distribution (full dataset):\n",
            " Injury_Occurred\n",
            "0 5801\n",
            "1 4199\n",
            "Name: count, dtype: int64\n",
            "Features prepared: ['Age', 'Gender', 'Sport_Type', 'Experience_Level', 'Flexibility_Score', 'Total_Weekly_Training_Hours', 'High_Intensity_Training_Hours', 'Strength_Training_Frequency', 'Recovery_Time_Between_Sessions', 'Training_Load_Score', 'Sprint_Speed', 'Endurance_Score', 'Agility_Score', 'Fatigue_Level', 'Previous_Injury_Count', 'Previous_Injury_Type', 'Intensity_Ratio', 'Recovery_Per_Training']\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# ---------------------- 3. Split Data for Calibration ----------------------\n",
        "# Split data into training and calibration sets.\n",
        "# Only the 20% calibration part is used below; the 80% part is not used in this notebook.\n",
        "X_train, X_calib, y_train_outcome, y_calib_outcome = train_test_split(\n",
        "    X, y_outcome, test_size=0.2, stratify=y_outcome, random_state=SEED\n",
        ")\n",
        "\n",
        "print(\"Calibration set size:\", X_calib.shape)\n",
        "print(\"Calibration Injury_Occurred distribution:\\n\", y_calib_outcome.value_counts())\n",
        "\n",
        "# Check if y_calib_outcome has both classes\n",
        "if len(y_calib_outcome.unique()) < 2:\n",
        "    raise ValueError(\"Calibration set contains only one class in Injury_Occurred. Cannot proceed with calibration.\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "kbGVsNq4u9Rr",
        "outputId": "732a5e13-f52b-4001-ee03-71a1b929fac7"
      },
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Calibration set size: (2000, 18)\n",
            "Calibration Injury_Occurred distribution:\n",
            " Injury_Occurred\n",
            "0 1160\n",
            "1 840\n",
            "Name: count, dtype: int64\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# ---------------------- 4. Predict Probabilities with Ensemble ----------------------\n",
        "# Use the ensemble (average of RandomForest and XGBoost) to predict probabilities\n",
        "rf_probs = rf_model.predict_proba(X_calib)\n",
        "xgb_probs = xgb_model.predict_proba(X_calib)\n",
        "\n",
        "# Averaging, and the three named columns used in step 5, require both models\n",
        "# to return the same three class columns.\n",
        "if rf_probs.shape != xgb_probs.shape or rf_probs.shape[1] != 3:\n",
        "    raise ValueError(\n",
        "        \"Expected both models to return 3 class probabilities per row; \"\n",
        "        f\"got shapes {rf_probs.shape} (RandomForest) and {xgb_probs.shape} (XGBoost).\"\n",
        "    )\n",
        "\n",
        "# Average the probabilities (same as in predict.py)\n",
        "avg_probs = (rf_probs + xgb_probs) / 2\n",
        "\n",
        "# Get the predicted class and confidence\n",
        "predicted_classes = np.argmax(avg_probs, axis=1)\n",
        "confidences = np.max(avg_probs, axis=1)\n",
        "\n",
        "print(\"Sample of predicted confidences:\\n\", confidences[:5])\n",
        "print(\"Sample of predicted classes:\\n\", predicted_classes[:5])\n",
        "\n",
        "# Inspect the relationship between predicted classes and Injury_Occurred.\n",
        "# The frame takes its index from y_calib_outcome; predicted_classes is aligned by position.\n",
        "calib_df = pd.DataFrame({\n",
        "    \"Predicted_Class\": predicted_classes,\n",
        "    \"Injury_Occurred\": y_calib_outcome\n",
        "})\n",
        "print(\"Distribution of Injury_Occurred by Predicted Class:\\n\", calib_df.groupby(\"Predicted_Class\")[\"Injury_Occurred\"].value_counts())"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "-Qyc_ZYOu9Rr",
        "outputId": "10e3592e-c51c-49cb-9ff4-6e157701811e"
      },
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Sample of predicted confidences:\n",
            " [0.98814357 0.9881264 0.97794891 0.97335743 0.97383904]\n",
            "Sample of predicted classes:\n",
            " [2 2 2 1 1]\n",
            "Distribution of Injury_Occurred by Predicted Class:\n",
            " Predicted_Class Injury_Occurred\n",
            "0 1 216\n",
            " 0 14\n",
            "1 0 540\n",
            " 1 40\n",
            "2 0 606\n",
            " 1 584\n",
            "Name: count, dtype: int64\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# ---------------------- 5. Map Probabilities to Injury Probabilities ----------------------\n",
        "# Use the raw probabilities as features for calibration.\n",
        "# NOTE(review): the column names assume the models' class order is 0=High, 1=Low, 2=Medium\n",
        "# (alphabetical label encoding) - confirm against the training notebooks.\n",
        "calib_data = pd.DataFrame({\n",
        "    \"prob_high\": avg_probs[:, 0],   # Probability of High risk\n",
        "    \"prob_low\": avg_probs[:, 1],    # Probability of Low risk\n",
        "    \"prob_medium\": avg_probs[:, 2]  # Probability of Medium risk\n",
        "})\n",
        "\n",
        "# Fit logistic regression with regularization\n",
        "lr_calib = LogisticRegression(max_iter=1000, penalty='l2', C=1.0)\n",
        "lr_calib.fit(calib_data, y_calib_outcome)\n",
        "\n",
        "# Predict calibrated probabilities\n",
        "calibrated_probs = lr_calib.predict_proba(calib_data)[:, 1]\n",
        "\n",
        "# Evaluate calibration using Brier score.\n",
        "# This is an in-sample score: it is computed on the rows the calibrator was just fitted on.\n",
        "brier_score = brier_score_loss(y_calib_outcome, calibrated_probs)\n",
        "print(f\"Brier Score (lower is better): {brier_score:.4f}\")\n",
        "\n",
        "# Inspect the mapping\n",
        "calib_results = pd.DataFrame({\n",
        "    \"Predicted_Class\": predicted_classes,\n",
        "    \"Confidence\": confidences,\n",
        "    \"Calibrated_Probability\": calibrated_probs,\n",
        "    \"True_Injury_Occurred\": y_calib_outcome\n",
        "})\n",
        "print(\"Average Calibrated Probability by Predicted Class:\\n\", calib_results.groupby(\"Predicted_Class\")[\"Calibrated_Probability\"].mean())\n",
        "\n",
        "# Plot calibration curve (ensemble confidence of the predicted class vs. calibrated injury probability)\n",
        "fig, ax = plt.subplots(figsize=(8, 6))\n",
        "ax.scatter(confidences, calibrated_probs, alpha=0.5)\n",
        "ax.plot([0, 1], [0, 1], 'k--', label=\"Perfectly Calibrated\")\n",
        "ax.set_xlabel(\"Original Confidence (Ensemble)\")\n",
        "ax.set_ylabel(\"Calibrated Injury Probability (P(Injury_Occurred=1))\")\n",
        "ax.set_title(\"Calibration Curve: Confidence vs. Injury Probability\")\n",
        "ax.legend()\n",
        "\n",
        "# Make sure the output directory exists before the first file is written to it\n",
        "os.makedirs(model_dir, exist_ok=True)\n",
        "fig.savefig(f\"{model_dir}/calibration_curve.png\")\n",
        "plt.close(fig)\n",
        "\n",
        "print(f\"Calibration curve saved to {model_dir}/calibration_curve.png\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "oITFxyA4u9Rs",
        "outputId": "98d1b529-b387-43ee-a7a8-9a7244701dd3"
      },
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Brier Score (lower is better): 0.1734\n",
            "Average Calibrated Probability by Predicted Class:\n",
            " Predicted_Class\n",
            "0 0.923188\n",
            "1 0.075402\n",
            "2 0.490621\n",
            "Name: Calibrated_Probability, dtype: float64\n",
            "Calibration curve saved to /content/model/calibration_curve.png\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# ---------------------- 6. Save Calibration Model ----------------------\n",
        "# Save the logistic regression model for use in predict.py\n",
        "os.makedirs(model_dir, exist_ok=True)\n",
        "joblib.dump(lr_calib, f\"{model_dir}/likelihood_calibrator.pkl\")\n",
        "print(f\"Calibration model saved to {model_dir}/likelihood_calibrator.pkl\")\n",
        "print(\"Note: You are running this in Google Colab. The file is saved to /content/model/likelihood_calibrator.pkl.\")\n",
        "print(\"Please download it and move it to C:/Users/amrHa/Desktop/final 3/deployment/model/ for deployment.\")\n",
        "print(\"Alternatively, if running locally, update model_dir to 'C:/Users/amrHa/Desktop/final 3/deployment/model'.\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "S8IY2YCZu9Rs",
        "outputId": "4585d640-eb1b-49d2-ec60-2727704b2de5"
      },
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Calibration model saved to /content/model/likelihood_calibrator.pkl\n",
            "Note: You are running this in Google Colab. The file is saved to /content/model/likelihood_calibrator.pkl.\n",
            "Please download it and move it to C:/Users/amrHa/Desktop/final 3/deployment/model/ for deployment.\n",
            "Alternatively, if running locally, update model_dir to 'C:/Users/amrHa/Desktop/final 3/deployment/model'.\n"
          ]
        }
      ]
    }
  ]
}