# Provenance (from the hosting page this file was downloaded from):
#   uploader: AmrGaberr — "Upload 10 files" — commit 05851ff (verified)
#   raw / history / blame — 5.88 kB
import pandas as pd
import numpy as np
import joblib
import os
from recommendation import generate_recommendations
# Define mappings for categorical variables (consistent with CalibrateLikelihood.ipynb)
# These integer codes must match the encoding used at training time; unknown
# categories are mapped to 0 downstream in preprocess_data().
gender_mapping = {"Male": 0, "Female": 1}
experience_mapping = {"Beginner": 0, "Intermediate": 1, "Advanced": 2, "Professional": 3}
injury_type_mapping = {"None": 0, "Sprain": 1, "Ligament Tear": 2, "Tendonitis": 3, "Strain": 4, "Fracture": 5}
sport_type_mapping = {"Football": 0, "Basketball": 1, "Swimming": 2, "Tennis": 3, "Running": 4}
# Class-index -> label map; order is alphabetical (High=0, Low=1, Medium=2),
# presumably matching the LabelEncoder's sorted classes_ — verify against the
# saved rf_target_encoder.pkl.
risk_level_mapping = {0: "High", 1: "Low", 2: "Medium"}
# Define model directory using relative path (../model next to this package dir)
MODEL_DIR = os.path.join(os.path.dirname(__file__), "..", "model")
# Load models, encoders, and calibration threshold once at import time.
try:
    rf_model = joblib.load(os.path.join(MODEL_DIR, "rf_injury_model.pkl"))
    xgb_model = joblib.load(os.path.join(MODEL_DIR, "xgboost_injury_model.pkl"))
    calibrator = joblib.load(os.path.join(MODEL_DIR, "likelihood_calibrator.pkl"))
    rf_encoder = joblib.load(os.path.join(MODEL_DIR, "rf_target_encoder.pkl"))
    xgb_encoder = joblib.load(os.path.join(MODEL_DIR, "xgb_target_encoder.pkl"))
    low_threshold = joblib.load(os.path.join(MODEL_DIR, "calibration_threshold.pkl"))
except FileNotFoundError as e:
    # Chain the original exception so the exact missing path stays in the traceback.
    raise FileNotFoundError(
        f"Model file not found: {str(e)}. Ensure all model files are in {MODEL_DIR}."
    ) from e

# Verify encoder consistency. np.array_equal is safe even when the two
# classes_ arrays have different lengths, where an elementwise `==` would
# return a bare bool and `.all()` would raise AttributeError.
if not np.array_equal(rf_encoder.classes_, xgb_encoder.classes_):
    raise ValueError("RandomForest and XGBoost encoders have inconsistent class mappings.")
def preprocess_data(data_dict: dict) -> pd.DataFrame:
    """
    Preprocess the input data consistently with CalibrateLikelihood.ipynb.

    Args:
        data_dict (dict): Input dictionary containing athlete data.

    Returns:
        pd.DataFrame: Single-row frame with the 18 features the models expect,
        in training order.

    Raises:
        ValueError: If required raw features are missing, or if any
            transformation fails (original cause is chained).
    """
    # Final feature order the models were trained on.
    features = [
        "Age", "Gender", "Sport_Type", "Experience_Level", "Flexibility_Score",
        "Total_Weekly_Training_Hours", "High_Intensity_Training_Hours", "Strength_Training_Frequency",
        "Recovery_Time_Between_Sessions", "Training_Load_Score", "Sprint_Speed", "Endurance_Score",
        "Agility_Score", "Fatigue_Level", "Previous_Injury_Count", "Previous_Injury_Type",
        "Intensity_Ratio", "Recovery_Per_Training",
    ]
    # These two are derived below, so they are not required in the raw input.
    derived = {"Intensity_Ratio", "Recovery_Per_Training"}

    df = pd.DataFrame([data_dict])

    # Validate BEFORE transforming: previously a missing raw column surfaced as
    # a cryptic KeyError from .map() instead of this clear message.
    missing_features = [f for f in features if f not in derived and f not in df.columns]
    if missing_features:
        raise ValueError(f"Missing required features: {missing_features}")

    try:
        # Encode categoricals; unknown/unmapped values fall back to code 0.
        df["Gender"] = df["Gender"].map(gender_mapping).fillna(0).astype(int)
        df["Sport_Type"] = df["Sport_Type"].map(sport_type_mapping).fillna(0).astype(int)
        df["Experience_Level"] = df["Experience_Level"].map(experience_mapping).fillna(0).astype(int)
        df["Previous_Injury_Type"] = df["Previous_Injury_Type"].fillna("None")
        df["Previous_Injury_Type"] = df["Previous_Injury_Type"].map(injury_type_mapping).fillna(0).astype(int)

        # Guard against division by zero in the two derived ratios.
        df["Total_Weekly_Training_Hours"] = df["Total_Weekly_Training_Hours"].replace(0, 0.1)
        df["Intensity_Ratio"] = df["High_Intensity_Training_Hours"] / df["Total_Weekly_Training_Hours"]
        df["Recovery_Per_Training"] = df["Recovery_Time_Between_Sessions"] / df["Total_Weekly_Training_Hours"]

        return df[features]
    except Exception as e:
        # ValueError (still an Exception for existing callers) with the cause
        # chained, instead of the original bare `raise Exception(...)`.
        raise ValueError(f"Error in preprocessing data: {str(e)}") from e
def predict_injury_risk(user_input: dict) -> dict:
    """
    Predict injury risk with a RandomForest + XGBoost ensemble and a calibrated
    likelihood score.

    A data-driven threshold (learned during calibration) on the ensemble's raw
    "Low" probability can override the argmax label to "Low".

    Args:
        user_input (dict): Input dictionary containing athlete data.

    Returns:
        dict: Prediction results including risk level, likelihood, and recommendations.
    """
    print("User Input Received:\n", user_input)

    # Preprocess input into the training feature order.
    features = preprocess_data(user_input)
    print("FINAL Features After Preprocessing:", list(features.columns))
    print("Preprocessed Input:\n", features)

    # Per-model class probability rows (columns follow the shared encoder order).
    rf_probs = rf_model.predict_proba(features)
    xgb_probs = xgb_model.predict_proba(features)
    print("RandomForest Probabilities (High, Low, Medium):", rf_probs)
    print("XGBoost Probabilities (High, Low, Medium):", xgb_probs)

    # Unweighted average of the two models.
    avg_probs = (rf_probs + xgb_probs) / 2
    row = avg_probs[0]
    predicted_class = row.argmax()
    confidence = row.max()
    predicted_label = rf_encoder.inverse_transform([predicted_class])[0]
    print("Ensemble Probabilities (High, Low, Medium):", avg_probs)
    print("Predicted Class:", predicted_class, "Label:", predicted_label, "Confidence:", confidence)

    # Map the ensemble row into the column names the calibrator was fitted on,
    # then take P(injury) from its positive class.
    calib_data = pd.DataFrame([{
        "prob_high": row[0],
        "prob_low": row[1],
        "prob_medium": row[2],
    }])
    injury_likelihood = calibrator.predict_proba(calib_data)[0][1] * 100
    print("Calibrated Injury Likelihood (%):", injury_likelihood)

    # Data-driven override: a sufficiently high raw "Low" probability wins.
    if predicted_label != "Low" and avg_probs[0][1] > low_threshold:
        print(f"⚠️ Calibration adjustment: Low probability ({avg_probs[0][1]:.2f}) above threshold ({low_threshold:.2f}) — classifying as Low.")
        predicted_label = "Low"

    # Rule-based guidance derived from the raw input.
    recommendations = generate_recommendations(user_input)

    return {
        "predicted_risk_level": predicted_label,
        "injury_likelihood_percent": round(injury_likelihood, 2),
        "model_class_probability": round(confidence * 100, 2),
        "recommendations": recommendations,
    }