import pandas as pd import numpy as np import joblib import os from recommendation import generate_recommendations # Define mappings for categorical variables (consistent with CalibrateLikelihood.ipynb) gender_mapping = {"Male": 0, "Female": 1} experience_mapping = {"Beginner": 0, "Intermediate": 1, "Advanced": 2, "Professional": 3} injury_type_mapping = {"None": 0, "Sprain": 1, "Ligament Tear": 2, "Tendonitis": 3, "Strain": 4, "Fracture": 5} sport_type_mapping = {"Football": 0, "Basketball": 1, "Swimming": 2, "Tennis": 3, "Running": 4} risk_level_mapping = {0: "High", 1: "Low", 2: "Medium"} # Define model directory using relative path MODEL_DIR = os.path.join(os.path.dirname(__file__), "..", "model") # Load models, encoders, and calibration threshold try: rf_model = joblib.load(os.path.join(MODEL_DIR, "rf_injury_model.pkl")) xgb_model = joblib.load(os.path.join(MODEL_DIR, "xgboost_injury_model.pkl")) calibrator = joblib.load(os.path.join(MODEL_DIR, "likelihood_calibrator.pkl")) rf_encoder = joblib.load(os.path.join(MODEL_DIR, "rf_target_encoder.pkl")) xgb_encoder = joblib.load(os.path.join(MODEL_DIR, "xgb_target_encoder.pkl")) low_threshold = joblib.load(os.path.join(MODEL_DIR, "calibration_threshold.pkl")) except FileNotFoundError as e: raise FileNotFoundError(f"Model file not found: {str(e)}. Ensure all model files are in {MODEL_DIR}.") # Verify encoder consistency if not (rf_encoder.classes_ == xgb_encoder.classes_).all(): raise ValueError("RandomForest and XGBoost encoders have inconsistent class mappings.") def preprocess_data(data_dict): """ Preprocess the input data consistently with CalibrateLikelihood.ipynb. Args: data_dict (dict): Input dictionary containing athlete data. Returns: pd.DataFrame: Preprocessed features ready for prediction. """ try: df = pd.DataFrame([data_dict]) df["Gender"] = df["Gender"].map(gender_mapping).fillna(0).astype(int) df["Sport_Type"] = df["Sport_Type"].map(sport_type_mapping).fillna(0).astype(int) df["Experience_Level"] = df["Experience_Level"].map(experience_mapping).fillna(0).astype(int) df["Previous_Injury_Type"] = df["Previous_Injury_Type"].fillna("None") df["Previous_Injury_Type"] = df["Previous_Injury_Type"].map(injury_type_mapping).fillna(0).astype(int) df["Total_Weekly_Training_Hours"] = df["Total_Weekly_Training_Hours"].replace(0, 0.1) df["Intensity_Ratio"] = df["High_Intensity_Training_Hours"] / df["Total_Weekly_Training_Hours"] df["Recovery_Per_Training"] = df["Recovery_Time_Between_Sessions"] / df["Total_Weekly_Training_Hours"] features = [ "Age", "Gender", "Sport_Type", "Experience_Level", "Flexibility_Score", "Total_Weekly_Training_Hours", "High_Intensity_Training_Hours", "Strength_Training_Frequency", "Recovery_Time_Between_Sessions", "Training_Load_Score", "Sprint_Speed", "Endurance_Score", "Agility_Score", "Fatigue_Level", "Previous_Injury_Count", "Previous_Injury_Type", "Intensity_Ratio", "Recovery_Per_Training" ] missing_features = [f for f in features if f not in df.columns] if missing_features: raise ValueError(f"Missing required features: {missing_features}") X = df[features] return X except Exception as e: raise Exception(f"Error in preprocessing data: {str(e)}") def predict_injury_risk(user_input: dict) -> dict: """ Predict injury risk using the ensemble of RandomForest and XGBoost models with calibrated probabilities. Uses a data-driven threshold from calibration to classify Low vs. Medium risks. Args: user_input (dict): Input dictionary containing athlete data. Returns: dict: Prediction results including risk level, likelihood, and recommendations. """ print("User Input Received:\n", user_input) # Preprocess input features = preprocess_data(user_input) print("FINAL Features After Preprocessing:", list(features.columns)) print("Preprocessed Input:\n", features) # Predict probabilities from both models rf_probs = rf_model.predict_proba(features) xgb_probs = xgb_model.predict_proba(features) print("RandomForest Probabilities (High, Low, Medium):", rf_probs) print("XGBoost Probabilities (High, Low, Medium):", xgb_probs) # Ensemble average avg_probs = (rf_probs + xgb_probs) / 2 predicted_class = np.argmax(avg_probs, axis=1)[0] confidence = np.max(avg_probs, axis=1)[0] predicted_label = rf_encoder.inverse_transform([predicted_class])[0] print("Ensemble Probabilities (High, Low, Medium):", avg_probs) print("Predicted Class:", predicted_class, "Label:", predicted_label, "Confidence:", confidence) # Calibrate the probability using the likelihood calibrator calib_data = pd.DataFrame({ "prob_high": [avg_probs[0][0]], "prob_low": [avg_probs[0][1]], "prob_medium": [avg_probs[0][2]] }) injury_likelihood = calibrator.predict_proba(calib_data)[:, 1][0] * 100 print("Calibrated Injury Likelihood (%):", injury_likelihood) # Adjust prediction using dynamic threshold based on raw prob_low if avg_probs[0][1] > low_threshold and predicted_label != "Low": print(f"⚠️ Calibration adjustment: Low probability ({avg_probs[0][1]:.2f}) above threshold ({low_threshold:.2f}) — classifying as Low.") predicted_label = "Low" # Generate recommendations recommendations = generate_recommendations(user_input) return { "predicted_risk_level": predicted_label, "injury_likelihood_percent": round(injury_likelihood, 2), "model_class_probability": round(confidence * 100, 2), "recommendations": recommendations }