{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
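    {
      "cell_type": "markdown",
      "source": [
        "# Calibrate Injury Likelihood Mapping\n",
        "\n",
        "This notebook calibrates the injury likelihood mapping for the injury risk prediction system. We use the training dataset (`Refined_Sports_Injury_Dataset.csv`) to map the ensemble's predicted risk-level probabilities to injury probabilities. Injury outcomes (`Injury_Occurred`) are simulated from `Injury_Risk_Level` in the preprocessing step, so the calibrated values reflect that simulation rather than recorded injuries.\n",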
        "\n",
        "## Objectives\n",
        "- Load the dataset and trained models.\n",
        "- Preprocess the data consistently with the training pipeline.\n",
        "- Use the models to predict probabilities on a calibration set.\n",
        "- Fit a logistic regression model to map predicted probabilities to true injury probabilities.\n",
        "- Save the calibration model for use in `predict.py`."
      ],
      "metadata": {
        "id": "v4eUbEr3u9Rm"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Import libraries\n",
        "import os\n",
        "\n",
        "import joblib\n",
        "import matplotlib.pyplot as plt\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "from sklearn.metrics import brier_score_loss\n",
        "from sklearn.model_selection import train_test_split\n",
        "\n",
        "# Set random seed\n",
        "np.random.seed(42)"
      ],
      "metadata": {
        "id": "Y_dhli46u9Ro"
      },
      "execution_count": 1,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# ---------------------- 1. Load Data and Models ----------------------\n",
        "# Load the dataset\n",
        "data_path = \"/content/Refined_Sports_Injury_Dataset.csv\"\n",
        "try:\n",
        "    df = pd.read_csv(data_path)\n",
        "except FileNotFoundError:\n",
        "    raise FileNotFoundError(f\"Dataset file not found at {data_path}. Please ensure the file exists.\")\n",
        "print(\"Dataset loaded. Shape:\", df.shape)\n",
        "\n",
        "# Load the trained models with error handling\n",
        "model_dir = \"/content/model\"\n",
        "try:\n",
        "    rf_model = joblib.load(f\"{model_dir}/rf_injury_model.pkl\")\n",
        "except Exception as e:\n",
        "    raise FileNotFoundError(f\"Failed to load RandomForest model: {str(e)}. Ensure RandomForest.ipynb has been run successfully to generate {model_dir}/rf_injury_model.pkl.\")\n",
        "\n",
        "try:\n",
        "    xgb_model = joblib.load(f\"{model_dir}/xgboost_injury_model.pkl\")\n",
        "except Exception as e:\n",
        "    raise FileNotFoundError(f\"Failed to load XGBoost model: {str(e)}. Ensure XGBOOST.ipynb has been run successfully to generate {model_dir}/xgboost_injury_model.pkl.\")\n",
        "\n",
        "print(\"Models loaded.\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "NPDVk8WLu9Rp",
        "outputId": "303e0a70-70b3-4796-d887-0b845c219e9a"
      },
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Dataset loaded. Shape: (10000, 18)\n",
            "Models loaded.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# ---------------------- 2. Preprocess Data ----------------------\n",
        "# Encode categorical columns (consistent with training notebooks)\n",
        "gender_mapping = {\"Male\": 0, \"Female\": 1}\n",
        "experience_mapping = {\"Beginner\": 0, \"Intermediate\": 1, \"Advanced\": 2, \"Professional\": 3}\n",
        "injury_type_mapping = {\"None\": 0, \"Sprain\": 1, \"Ligament Tear\": 2, \"Tendonitis\": 3, \"Strain\": 4, \"Fracture\": 5}\n",
        "\n",
        "# Handle NaN values in Previous_Injury_Type\n",
        "df[\"Previous_Injury_Type\"] = df[\"Previous_Injury_Type\"].fillna(\"None\")\n",
        "\n",
        "df[\"Gender\"] = df[\"Gender\"].map(gender_mapping).fillna(0).astype(int)\n",
        "df[\"Sport_Type\"] = df[\"Sport_Type\"].astype(\"category\").cat.codes\n",
        "df[\"Experience_Level\"] = df[\"Experience_Level\"].map(experience_mapping).fillna(0).astype(int)\n",
        "df[\"Previous_Injury_Type\"] = df[\"Previous_Injury_Type\"].map(injury_type_mapping).fillna(0).astype(int)\n",
        "\n",
        "# Replace 0 with 0.1 in Total_Weekly_Training_Hours\n",
        "df[\"Total_Weekly_Training_Hours\"] = df[\"Total_Weekly_Training_Hours\"].replace(0, 0.1)\n",
        "\n",
        "# Create derived features\n",
        "df[\"Intensity_Ratio\"] = df[\"High_Intensity_Training_Hours\"] / df[\"Total_Weekly_Training_Hours\"]\n",
        "df[\"Recovery_Per_Training\"] = df[\"Recovery_Time_Between_Sessions\"] / df[\"Total_Weekly_Training_Hours\"]\n",
        "\n",
        "# Create Injury_Occurred column probabilistically based on Injury_Risk_Level\n",
        "print(\"Injury_Risk_Level distribution:\\n\", df[\"Injury_Risk_Level\"].value_counts())\n",
        "\n",
        "# Define probabilities of injury occurrence based on risk level\n",
        "injury_probabilities = {\n",
        "    \"High\": 0.95,  # 95% chance of injury\n",
        "    \"Medium\": 0.5, # 50% chance of injury\n",
        "    \"Low\": 0.05    # 5% chance of injury\n",
        "}\n",
        "\n",
        "# Generate Injury_Occurred using random sampling based on Injury_Risk_Level\n",
        "df[\"Injury_Occurred\"] = df[\"Injury_Risk_Level\"].apply(\n",
        "    lambda x: np.random.binomial(1, injury_probabilities[x])\n",
        ")\n",
        "\n",
        "# Check the distribution of Injury_Occurred\n",
        "print(\"Injury_Occurred distribution (full dataset):\\n\", df[\"Injury_Occurred\"].value_counts())\n",
        "\n",
        "# Ensure both classes are present\n",
        "if len(df[\"Injury_Occurred\"].unique()) < 2:\n",
        "    raise ValueError(\"Injury_Occurred contains only one class after probabilistic assignment. Adjust probabilities or dataset.\")\n",
        "\n",
        "# Define features\n",
        "features = [\n",
        "    \"Age\", \"Gender\", \"Sport_Type\", \"Experience_Level\", \"Flexibility_Score\",\n",
        "    \"Total_Weekly_Training_Hours\", \"High_Intensity_Training_Hours\", \"Strength_Training_Frequency\",\n",
        "    \"Recovery_Time_Between_Sessions\", \"Training_Load_Score\", \"Sprint_Speed\", \"Endurance_Score\",\n",
        "    \"Agility_Score\", \"Fatigue_Level\", \"Previous_Injury_Count\", \"Previous_Injury_Type\",\n",
        "    \"Intensity_Ratio\", \"Recovery_Per_Training\"\n",
        "]\n",
        "\n",
        "# Prepare features and target\n",
        "X = df[features]\n",
        "y_outcome = df[\"Injury_Occurred\"]\n",
        "\n",
        "print(\"Features prepared:\", features)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "rEUcpksxu9Rq",
        "outputId": "70eee7a9-b2f5-4d3c-a668-504cd0c9dd81"
      },
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Injury_Risk_Level distribution:\n",
            " Injury_Risk_Level\n",
            "Medium    6016\n",
            "Low       2827\n",
            "High      1157\n",
            "Name: count, dtype: int64\n",
            "Injury_Occurred distribution (full dataset):\n",
            " Injury_Occurred\n",
            "0    5801\n",
            "1    4199\n",
            "Name: count, dtype: int64\n",
            "Features prepared: ['Age', 'Gender', 'Sport_Type', 'Experience_Level', 'Flexibility_Score', 'Total_Weekly_Training_Hours', 'High_Intensity_Training_Hours', 'Strength_Training_Frequency', 'Recovery_Time_Between_Sessions', 'Training_Load_Score', 'Sprint_Speed', 'Endurance_Score', 'Agility_Score', 'Fatigue_Level', 'Previous_Injury_Count', 'Previous_Injury_Type', 'Intensity_Ratio', 'Recovery_Per_Training']\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# ---------------------- 3. Split Data for Calibration ----------------------\n",
        "# Split data into training and calibration sets\n",
        "X_train, X_calib, y_train_outcome, y_calib_outcome = train_test_split(\n",
        "    X, y_outcome, test_size=0.2, stratify=y_outcome, random_state=42\n",
        ")\n",
        "\n",
        "print(\"Calibration set size:\", X_calib.shape)\n",
        "print(\"Calibration Injury_Occurred distribution:\\n\", y_calib_outcome.value_counts())\n",
        "\n",
        "# Check if y_calib_outcome has both classes\n",
        "if len(y_calib_outcome.unique()) < 2:\n",
        "    raise ValueError(\"Calibration set contains only one class in Injury_Occurred. Cannot proceed with calibration.\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "kbGVsNq4u9Rr",
        "outputId": "732a5e13-f52b-4001-ee03-71a1b929fac7"
      },
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Calibration set size: (2000, 18)\n",
            "Calibration Injury_Occurred distribution:\n",
            " Injury_Occurred\n",
            "0    1160\n",
            "1     840\n",
            "Name: count, dtype: int64\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# ---------------------- 4. Predict Probabilities with Ensemble ----------------------\n",
        "# Use the ensemble (average of RandomForest and XGBoost) to predict probabilities\n",
        "rf_probs = rf_model.predict_proba(X_calib)\n",
        "xgb_probs = xgb_model.predict_proba(X_calib)\n",
        "\n",
        "# Average the probabilities (same as in predict.py)\n",
        "avg_probs = (rf_probs + xgb_probs) / 2\n",
        "\n",
        "# Get the predicted class and confidence\n",
        "predicted_classes = np.argmax(avg_probs, axis=1)\n",
        "confidences = np.max(avg_probs, axis=1)\n",
        "\n",
        "print(\"Sample of predicted confidences:\\n\", confidences[:5])\n",
        "print(\"Sample of predicted classes:\\n\", predicted_classes[:5])\n",
        "\n",
        "# Inspect the relationship between predicted classes and Injury_Occurred\n",
        "calib_df = pd.DataFrame({\n",
        "    \"Predicted_Class\": predicted_classes,\n",
        "    \"Injury_Occurred\": y_calib_outcome\n",
        "})\n",
        "print(\"Distribution of Injury_Occurred by Predicted Class:\\n\", calib_df.groupby(\"Predicted_Class\")[\"Injury_Occurred\"].value_counts())"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "-Qyc_ZYOu9Rr",
        "outputId": "10e3592e-c51c-49cb-9ff4-6e157701811e"
      },
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Sample of predicted confidences:\n",
            " [0.98814357 0.9881264  0.97794891 0.97335743 0.97383904]\n",
            "Sample of predicted classes:\n",
            " [2 2 2 1 1]\n",
            "Distribution of Injury_Occurred by Predicted Class:\n",
            " Predicted_Class  Injury_Occurred\n",
            "0                1                  216\n",
            "                 0                   14\n",
            "1                0                  540\n",
            "                 1                   40\n",
            "2                0                  606\n",
            "                 1                  584\n",
            "Name: count, dtype: int64\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# ---------------------- 5. Map Probabilities to Injury Probabilities ----------------------\n",
        "# Use the raw probabilities as features for calibration\n",
        "calib_data = pd.DataFrame({\n",
        "    \"prob_high\": avg_probs[:, 0],  # Probability of High risk\n",
        "    \"prob_low\": avg_probs[:, 1],   # Probability of Low risk\n",
        "    \"prob_medium\": avg_probs[:, 2] # Probability of Medium risk\n",
        "})\n",
        "\n",
        "# Fit logistic regression with regularization\n",
        "lr_calib = LogisticRegression(max_iter=1000, penalty='l2', C=1.0)\n",
        "lr_calib.fit(calib_data, y_calib_outcome)\n",
        "\n",
        "# Predict calibrated probabilities\n",
        "calibrated_probs = lr_calib.predict_proba(calib_data)[:, 1]\n",
        "\n",
        "# Evaluate calibration using Brier score\n",
        "brier_score = brier_score_loss(y_calib_outcome, calibrated_probs)\n",
        "print(f\"Brier Score (lower is better): {brier_score:.4f}\")\n",
        "\n",
        "# Inspect the mapping\n",
        "calib_results = pd.DataFrame({\n",
        "    \"Predicted_Class\": predicted_classes,\n",
        "    \"Confidence\": confidences,\n",
        "    \"Calibrated_Probability\": calibrated_probs,\n",
        "    \"True_Injury_Occurred\": y_calib_outcome\n",
        "})\n",
        "print(\"Average Calibrated Probability by Predicted Class:\\n\", calib_results.groupby(\"Predicted_Class\")[\"Calibrated_Probability\"].mean())\n",
        "\n",
        "# Plot calibration curve\n",
        "plt.figure(figsize=(8, 6))\n",
        "plt.scatter(confidences, calibrated_probs, alpha=0.5)\n",
        "plt.plot([0, 1], [0, 1], 'k--', label=\"Perfectly Calibrated\")\n",
        "plt.xlabel(\"Original Confidence (Ensemble)\")\n",
        "plt.ylabel(\"Calibrated Injury Probability (P(Injury_Occurred=1))\")\n",
        "plt.title(\"Calibration Curve: Confidence vs. Injury Probability\")\n",
        "plt.legend()\n",
        "plt.savefig(f\"{model_dir}/calibration_curve.png\")\n",
        "plt.close()\n",
        "\n",
        "print(f\"Calibration curve saved to {model_dir}/calibration_curve.png\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "oITFxyA4u9Rs",
        "outputId": "98d1b529-b387-43ee-a7a8-9a7244701dd3"
      },
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Brier Score (lower is better): 0.1734\n",
            "Average Calibrated Probability by Predicted Class:\n",
            " Predicted_Class\n",
            "0    0.923188\n",
            "1    0.075402\n",
            "2    0.490621\n",
            "Name: Calibrated_Probability, dtype: float64\n",
            "Calibration curve saved to /content/model/calibration_curve.png\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# ---------------------- 6. Save Calibration Model ----------------------\n",
        "# Save the logistic regression model for use in predict.py\n",
        "import os\n",
        "os.makedirs(model_dir, exist_ok=True)\n",
        "joblib.dump(lr_calib, f\"{model_dir}/likelihood_calibrator.pkl\")\n",
        "print(f\"Calibration model saved to {model_dir}/likelihood_calibrator.pkl\")\n",
        "print(\"Note: You are running this in Google Colab. The file is saved to /content/model/likelihood_calibrator.pkl.\")\n",
        "print(\"Please download it and move it to C:/Users/amrHa/Desktop/final 3/deployment/model/ for deployment.\")\n",
        "print(\"Alternatively, if running locally, update model_dir to 'C:/Users/amrHa/Desktop/final 3/deployment/model'.\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "S8IY2YCZu9Rs",
        "outputId": "4585d640-eb1b-49d2-ec60-2727704b2de5"
      },
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Calibration model saved to /content/model/likelihood_calibrator.pkl\n",
            "Note: You are running this in Google Colab. The file is saved to /content/model/likelihood_calibrator.pkl.\n",
            "Please download it and move it to C:/Users/amrHa/Desktop/final 3/deployment/model/ for deployment.\n",
            "Alternatively, if running locally, update model_dir to 'C:/Users/amrHa/Desktop/final 3/deployment/model'.\n"
          ]
        }
      ]
    }
  ]
}