Spaces:

AmrGaberr
/

Injury_Prediction_System

Sleeping

File size: 5,875 Bytes

05851ff

import pandas as pd
import numpy as np
import joblib
import os
from recommendation import generate_recommendations

# Lookup tables for categorical values (kept consistent with CalibrateLikelihood.ipynb).
# Each label's integer code is its position in the tuple it is listed in.
gender_mapping = {label: code for code, label in enumerate(("Male", "Female"))}
experience_mapping = {
    label: code
    for code, label in enumerate(("Beginner", "Intermediate", "Advanced", "Professional"))
}
injury_type_mapping = {
    label: code
    for code, label in enumerate(
        ("None", "Sprain", "Ligament Tear", "Tendonitis", "Strain", "Fracture")
    )
}
sport_type_mapping = {
    label: code
    for code, label in enumerate(("Football", "Basketball", "Swimming", "Tennis", "Running"))
}
# Reverse direction: class index -> risk label.
risk_level_mapping = dict(enumerate(("High", "Low", "Medium")))

# Directory holding the serialized artifacts, resolved relative to this file
# so the lookup does not depend on the current working directory.
MODEL_DIR = os.path.join(os.path.dirname(__file__), "..", "model")

# Load both classifiers, the likelihood calibrator, both target encoders and
# the calibration threshold once, at import time.
# NOTE(review): joblib.load unpickles its input, so these files must come
# from a trusted source.
try:
    rf_model = joblib.load(os.path.join(MODEL_DIR, "rf_injury_model.pkl"))
    xgb_model = joblib.load(os.path.join(MODEL_DIR, "xgboost_injury_model.pkl"))
    calibrator = joblib.load(os.path.join(MODEL_DIR, "likelihood_calibrator.pkl"))
    rf_encoder = joblib.load(os.path.join(MODEL_DIR, "rf_target_encoder.pkl"))
    xgb_encoder = joblib.load(os.path.join(MODEL_DIR, "xgb_target_encoder.pkl"))
    low_threshold = joblib.load(os.path.join(MODEL_DIR, "calibration_threshold.pkl"))
except FileNotFoundError as e:
    # Chain the original error so the traceback still shows the failing path.
    raise FileNotFoundError(
        f"Model file not found: {str(e)}. Ensure all model files are in {MODEL_DIR}."
    ) from e

# Verify encoder consistency: the two models' probability columns are averaged
# position by position, so both encoders must describe the same classes.
# np.array_equal returns False for arrays of different shapes, whereas an
# element-wise `==` is not guaranteed to yield an array to call .all() on.
if not np.array_equal(rf_encoder.classes_, xgb_encoder.classes_):
    raise ValueError("RandomForest and XGBoost encoders have inconsistent class mappings.")

def preprocess_data(data_dict):
    """
    Preprocess the input data consistently with CalibrateLikelihood.ipynb.

    Categorical fields are converted to integer codes through the module-level
    mappings (values that are missing or not present in a mapping fall back to
    code 0), two ratio features are derived from the training-hour fields, and
    the columns are returned in the fixed order of the feature list below.

    Args:
        data_dict (dict): Input dictionary containing athlete data. It must
            provide every feature in the list below except the two derived
            ones ("Intensity_Ratio" and "Recovery_Per_Training").

    Returns:
        pd.DataFrame: Single-row frame of preprocessed features ready for prediction.

    Raises:
        Exception: If required fields are missing or any preprocessing step
            fails. The underlying error is chained as ``__cause__``.
    """
    features = [
        "Age", "Gender", "Sport_Type", "Experience_Level", "Flexibility_Score",
        "Total_Weekly_Training_Hours", "High_Intensity_Training_Hours", "Strength_Training_Frequency",
        "Recovery_Time_Between_Sessions", "Training_Load_Score", "Sprint_Speed", "Endurance_Score",
        "Agility_Score", "Fatigue_Level", "Previous_Injury_Count", "Previous_Injury_Type",
        "Intensity_Ratio", "Recovery_Per_Training"
    ]
    # Columns computed below rather than supplied by the caller.
    derived_features = {"Intensity_Ratio", "Recovery_Per_Training"}

    try:
        df = pd.DataFrame([data_dict])

        # Validate before touching any column so that every absent field is
        # reported by name instead of surfacing as a bare KeyError from the
        # first column access.
        missing_features = [
            f for f in features
            if f not in derived_features and f not in df.columns
        ]
        if missing_features:
            raise ValueError(f"Missing required features: {missing_features}")

        # Encode categoricals; unknown labels become NaN and are filled with 0.
        df["Gender"] = df["Gender"].map(gender_mapping).fillna(0).astype(int)
        df["Sport_Type"] = df["Sport_Type"].map(sport_type_mapping).fillna(0).astype(int)
        df["Experience_Level"] = df["Experience_Level"].map(experience_mapping).fillna(0).astype(int)
        df["Previous_Injury_Type"] = df["Previous_Injury_Type"].fillna("None")
        df["Previous_Injury_Type"] = df["Previous_Injury_Type"].map(injury_type_mapping).fillna(0).astype(int)

        # Swap a zero denominator for 0.1 so the ratios below stay finite.
        df["Total_Weekly_Training_Hours"] = df["Total_Weekly_Training_Hours"].replace(0, 0.1)

        df["Intensity_Ratio"] = df["High_Intensity_Training_Hours"] / df["Total_Weekly_Training_Hours"]
        df["Recovery_Per_Training"] = df["Recovery_Time_Between_Sessions"] / df["Total_Weekly_Training_Hours"]

        return df[features]
    except Exception as e:
        # Same exception type and message as before; chaining keeps the
        # original traceback available for debugging.
        raise Exception(f"Error in preprocessing data: {str(e)}") from e

def predict_injury_risk(user_input: dict) -> dict:
    """
    Predict injury risk using the ensemble of RandomForest and XGBoost models with calibrated probabilities.
    Uses a data-driven threshold from calibration to classify Low vs. Medium risks.

    The class probabilities of both models are averaged; the arg-max class is
    decoded with the RandomForest target encoder. The averaged probabilities
    are also passed to the likelihood calibrator to obtain an injury likelihood
    percentage. If the averaged Low probability exceeds ``low_threshold`` the
    label is forced to "Low", and the reported class probability is then the
    Low probability so that it matches the returned label.

    Diagnostic information is printed to stdout at each step.

    Args:
        user_input (dict): Input dictionary containing athlete data.

    Returns:
        dict: Prediction results with keys "predicted_risk_level",
        "injury_likelihood_percent", "model_class_probability" (both numeric
        values are built-in floats rounded to two decimals) and
        "recommendations".

    Raises:
        Exception: Propagated from ``preprocess_data`` when the input cannot
            be preprocessed.
    """
    print("User Input Received:\n", user_input)

    # Preprocess input
    features = preprocess_data(user_input)
    print("FINAL Features After Preprocessing:", list(features.columns))
    print("Preprocessed Input:\n", features)

    # Predict probabilities from both models
    # NOTE(review): the column order (High, Low, Medium) is assumed here and
    # below, as in the printed labels — confirm against the encoders' classes_.
    rf_probs = rf_model.predict_proba(features)
    xgb_probs = xgb_model.predict_proba(features)
    print("RandomForest Probabilities (High, Low, Medium):", rf_probs)
    print("XGBoost Probabilities (High, Low, Medium):", xgb_probs)

    # Ensemble average
    avg_probs = (rf_probs + xgb_probs) / 2
    predicted_class = np.argmax(avg_probs, axis=1)[0]
    confidence = np.max(avg_probs, axis=1)[0]
    predicted_label = rf_encoder.inverse_transform([predicted_class])[0]
    print("Ensemble Probabilities (High, Low, Medium):", avg_probs)
    print("Predicted Class:", predicted_class, "Label:", predicted_label, "Confidence:", confidence)

    # Single input row -> one row of averaged probabilities.
    row = avg_probs[0]
    prob_high = row[0]
    prob_low = row[1]
    prob_medium = row[2]

    # Calibrate the probability using the likelihood calibrator
    calib_data = pd.DataFrame({
        "prob_high": [prob_high],
        "prob_low": [prob_low],
        "prob_medium": [prob_medium]
    })
    injury_likelihood = calibrator.predict_proba(calib_data)[:, 1][0] * 100
    print("Calibrated Injury Likelihood (%):", injury_likelihood)

    # Adjust prediction using dynamic threshold based on raw prob_low
    if prob_low > low_threshold and predicted_label != "Low":
        print(f"⚠️ Calibration adjustment: Low probability ({prob_low:.2f}) above threshold ({low_threshold:.2f}) — classifying as Low.")
        predicted_label = "Low"
        # Keep the reported probability consistent with the overridden label;
        # otherwise it would still describe the class that was just replaced.
        confidence = prob_low

    # Generate recommendations
    recommendations = generate_recommendations(user_input)

    # Cast NumPy scalars to built-in floats so the result holds plain Python
    # numbers regardless of the dtype the models return.
    return {
        "predicted_risk_level": predicted_label,
        "injury_likelihood_percent": round(float(injury_likelihood), 2),
        "model_class_probability": round(float(confidence) * 100, 2),
        "recommendations": recommendations
    }