File size: 5,875 Bytes
05851ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import pandas as pd
import numpy as np
import joblib
import os
from recommendation import generate_recommendations

# Integer codes for the categorical inputs and the label for each risk class index
# (intended to stay in sync with CalibrateLikelihood.ipynb). Each code is simply the
# position of the category in the ordered tuple it is built from.
gender_mapping = dict(Male=0, Female=1)
experience_mapping = {
    level: code
    for code, level in enumerate(("Beginner", "Intermediate", "Advanced", "Professional"))
}
injury_type_mapping = {
    injury: code
    for code, injury in enumerate(
        ("None", "Sprain", "Ligament Tear", "Tendonitis", "Strain", "Fracture")
    )
}
sport_type_mapping = {
    sport: code
    for code, sport in enumerate(("Football", "Basketball", "Swimming", "Tennis", "Running"))
}
risk_level_mapping = dict(enumerate(("High", "Low", "Medium")))

# Directory holding the serialized artifacts: the "model" folder one level above
# the directory that contains this file.
MODEL_DIR = os.path.join(os.path.dirname(__file__), "..", "model")

# Load both classifiers, the likelihood calibrator, both target encoders and the
# calibration threshold once, at import time.
# NOTE(review): joblib.load unpickles these files, which can execute arbitrary
# code — only point MODEL_DIR at artifacts from a trusted source.
try:
    rf_model = joblib.load(os.path.join(MODEL_DIR, "rf_injury_model.pkl"))
    xgb_model = joblib.load(os.path.join(MODEL_DIR, "xgboost_injury_model.pkl"))
    calibrator = joblib.load(os.path.join(MODEL_DIR, "likelihood_calibrator.pkl"))
    rf_encoder = joblib.load(os.path.join(MODEL_DIR, "rf_target_encoder.pkl"))
    xgb_encoder = joblib.load(os.path.join(MODEL_DIR, "xgb_target_encoder.pkl"))
    low_threshold = joblib.load(os.path.join(MODEL_DIR, "calibration_threshold.pkl"))
except FileNotFoundError as e:
    # Re-raise with a hint about where the files are expected, keeping the
    # original error attached as the explicit cause.
    raise FileNotFoundError(
        f"Model file not found: {str(e)}. Ensure all model files are in {MODEL_DIR}."
    ) from e

# The two models' probability columns are averaged position by position, so both
# encoders must list the same classes in the same order. np.array_equal returns
# False for arrays of different shapes, whereas an elementwise `==` followed by
# `.all()` fails on a shape mismatch before this error message can be raised.
if not np.array_equal(rf_encoder.classes_, xgb_encoder.classes_):
    raise ValueError("RandomForest and XGBoost encoders have inconsistent class mappings.")

def preprocess_data(data_dict):
    """
    Convert one athlete record into the single-row feature frame used for prediction.

    The steps are intended to mirror the preprocessing in CalibrateLikelihood.ipynb:
    categorical fields are integer-encoded with the module-level mappings, a zero
    weekly training total is replaced by 0.1, and two ratio features are derived.

    Args:
        data_dict (dict): Input dictionary containing athlete data. It must
            provide every raw feature listed below; ``Intensity_Ratio`` and
            ``Recovery_Per_Training`` are computed here and need not be supplied.

    Returns:
        pd.DataFrame: One row holding the model features, in the column order
        of the ``features`` list.

    Raises:
        Exception: For any preprocessing failure, with a message prefixed by
            "Error in preprocessing data:". Missing inputs are reported together
            as "Missing required features: [...]". The underlying error is
            attached as the cause.
    """
    # Columns computed below rather than read from the input.
    derived_features = ["Intensity_Ratio", "Recovery_Per_Training"]
    features = [
        "Age", "Gender", "Sport_Type", "Experience_Level", "Flexibility_Score",
        "Total_Weekly_Training_Hours", "High_Intensity_Training_Hours", "Strength_Training_Frequency",
        "Recovery_Time_Between_Sessions", "Training_Load_Score", "Sprint_Speed", "Endurance_Score",
        "Agility_Score", "Fatigue_Level", "Previous_Injury_Count", "Previous_Injury_Type",
        "Intensity_Ratio", "Recovery_Per_Training"
    ]

    try:
        df = pd.DataFrame([data_dict])

        # Validate before touching any column, so an absent field is reported by
        # name in one list instead of surfacing as a bare KeyError message.
        missing_features = [
            f for f in features if f not in derived_features and f not in df.columns
        ]
        if missing_features:
            raise ValueError(f"Missing required features: {missing_features}")

        # Values that are not in a mapping become NaN and fall back to code 0.
        df["Gender"] = df["Gender"].map(gender_mapping).fillna(0).astype(int)
        df["Sport_Type"] = df["Sport_Type"].map(sport_type_mapping).fillna(0).astype(int)
        df["Experience_Level"] = df["Experience_Level"].map(experience_mapping).fillna(0).astype(int)
        # A null injury type is treated as the "None" category before encoding.
        df["Previous_Injury_Type"] = df["Previous_Injury_Type"].fillna("None")
        df["Previous_Injury_Type"] = df["Previous_Injury_Type"].map(injury_type_mapping).fillna(0).astype(int)

        # Guard the two divisions below against a zero weekly total.
        df["Total_Weekly_Training_Hours"] = df["Total_Weekly_Training_Hours"].replace(0, 0.1)

        df["Intensity_Ratio"] = df["High_Intensity_Training_Hours"] / df["Total_Weekly_Training_Hours"]
        df["Recovery_Per_Training"] = df["Recovery_Time_Between_Sessions"] / df["Total_Weekly_Training_Hours"]

        return df[features]
    except Exception as e:
        # Keep the generic Exception type and message prefix callers already see,
        # and chain the original error for debugging.
        raise Exception(f"Error in preprocessing data: {str(e)}") from e

def predict_injury_risk(user_input: dict) -> dict:
    """
    Score one athlete record with the RandomForest + XGBoost ensemble.

    The class probabilities of the two models are averaged, the averaged vector
    is passed to the likelihood calibrator to obtain an injury likelihood, and
    the winning label is overridden to "Low" whenever the averaged "Low"
    probability exceeds the stored calibration threshold. Intermediate values
    are printed to stdout along the way.

    Args:
        user_input (dict): Raw athlete data; it is forwarded both to
            preprocess_data and to generate_recommendations.

    Returns:
        dict: ``predicted_risk_level``, ``injury_likelihood_percent`` and
        ``model_class_probability`` (the latter two rounded to two decimals),
        plus ``recommendations``.
    """
    print("User Input Received:\n", user_input)

    # Build the single-row feature frame.
    model_input = preprocess_data(user_input)
    print("FINAL Features After Preprocessing:", list(model_input.columns))
    print("Preprocessed Input:\n", model_input)

    # Per-model class probabilities. Columns 0/1/2 are treated as
    # High/Low/Medium throughout — presumably the encoders' class order; verify.
    forest_probs = rf_model.predict_proba(model_input)
    boosted_probs = xgb_model.predict_proba(model_input)
    print("RandomForest Probabilities (High, Low, Medium):", forest_probs)
    print("XGBoost Probabilities (High, Low, Medium):", boosted_probs)

    # Unweighted mean of the two models, then pick the strongest class.
    ensemble_probs = (forest_probs + boosted_probs) / 2
    top_index = np.argmax(ensemble_probs, axis=1)[0]
    top_probability = np.max(ensemble_probs, axis=1)[0]
    risk_label = rf_encoder.inverse_transform([top_index])[0]
    print("Ensemble Probabilities (High, Low, Medium):", ensemble_probs)
    print("Predicted Class:", top_index, "Label:", risk_label, "Confidence:", top_probability)

    high_probability = ensemble_probs[0][0]
    low_probability = ensemble_probs[0][1]
    medium_probability = ensemble_probs[0][2]

    # The calibrator takes the three averaged probabilities as named columns;
    # the second column of its output is scaled to a percentage.
    calibrator_input = pd.DataFrame({
        "prob_high": [high_probability],
        "prob_low": [low_probability],
        "prob_medium": [medium_probability],
    })
    positive_column = calibrator.predict_proba(calibrator_input)[:, 1]
    injury_likelihood = positive_column[0] * 100
    print("Calibrated Injury Likelihood (%):", injury_likelihood)

    # Threshold override driven by the raw (uncalibrated) "Low" probability.
    # NOTE(review): after an override, model_class_probability below still
    # reports the probability of the originally winning class — confirm intent.
    if low_probability > low_threshold and risk_label != "Low":
        print(f"⚠️ Calibration adjustment: Low probability ({low_probability:.2f}) above threshold ({low_threshold:.2f}) — classifying as Low.")
        risk_label = "Low"

    advice = generate_recommendations(user_input)

    result = {
        "predicted_risk_level": risk_label,
        "injury_likelihood_percent": round(injury_likelihood, 2),
        "model_class_probability": round(top_probability * 100, 2),
        "recommendations": advice,
    }
    return result