Spaces:
Sleeping
Sleeping
| import pandas as pd | |
def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Preprocess input data for injury risk prediction.

    Validates that every required raw column is present, fills missing
    numeric values with the per-column median, guards the training-hours
    denominator against zero, and appends two ratio features.

    Args:
        df (pd.DataFrame): Input DataFrame with raw features. It is not
            modified; all work happens on a copy.

    Returns:
        pd.DataFrame: A new DataFrame holding the required raw columns
        followed by ``Intensity_Ratio`` and ``Recovery_Per_Training``.
        Any other columns present in ``df`` are dropped.

    Raises:
        ValueError: If one or more required columns are missing from ``df``.
    """
    required_columns = [
        "Age", "Gender", "Sport_Type", "Experience_Level", "Flexibility_Score",
        "Total_Weekly_Training_Hours", "High_Intensity_Training_Hours",
        "Strength_Training_Frequency", "Recovery_Time_Between_Sessions",
        "Training_Load_Score", "Sprint_Speed", "Endurance_Score",
        "Agility_Score", "Fatigue_Level", "Previous_Injury_Count",
        "Previous_Injury_Type",
    ]

    # Fail fast with the full list of absent columns.
    missing_cols = [col for col in required_columns if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    # Work on a copy so the caller's DataFrame is left untouched.
    df_processed = df.copy()

    # Fill missing values with the column median. numeric_only=True is
    # required: without it, pandas >= 2.0 raises TypeError as soon as the
    # frame contains a non-numeric column. Missing values in non-numeric
    # columns are left as they are.
    numeric_medians = df_processed.median(numeric_only=True)
    df_processed = df_processed.fillna(numeric_medians)

    # Replace 0 with 0.1 in Total_Weekly_Training_Hours so the ratios below
    # never divide by zero.
    df_processed["Total_Weekly_Training_Hours"] = (
        df_processed["Total_Weekly_Training_Hours"].replace(0, 0.1)
    )

    # Derived features, both normalised by total weekly training hours.
    weekly_hours = df_processed["Total_Weekly_Training_Hours"]
    df_processed["Intensity_Ratio"] = (
        df_processed["High_Intensity_Training_Hours"] / weekly_hours
    )
    df_processed["Recovery_Per_Training"] = (
        df_processed["Recovery_Time_Between_Sessions"] / weekly_hours
    )

    # Final feature set: the required raw columns plus the engineered ones
    # (Predicted_Injury_Type and any other extra columns are excluded).
    model_features = required_columns + ["Intensity_Ratio", "Recovery_Per_Training"]
    return df_processed[model_features]