Spaces:
Sleeping
Sleeping
File size: 2,218 Bytes
05851ff |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
import pandas as pd
def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Preprocess input data for injury risk prediction.

    Steps performed, in order:
      1. Verify that every required raw column is present.
      2. Work on a copy so the caller's DataFrame is never modified.
      3. Fill missing values in numeric columns with that column's median.
         Non-numeric columns have no median and are left as-is.
      4. Replace 0 with 0.1 in ``Total_Weekly_Training_Hours`` so it can be
         used as a divisor.
      5. Add the derived ``Intensity_Ratio`` and ``Recovery_Per_Training``
         columns.

    Args:
        df (pd.DataFrame): Input DataFrame with raw features.

    Returns:
        pd.DataFrame: A new DataFrame holding the required columns (in the
        order listed below) followed by the two engineered features. Any
        other columns present in ``df`` are dropped.

    Raises:
        ValueError: If one or more required columns are missing from ``df``.
    """
    # Input validation
    required_columns = [
        "Age", "Gender", "Sport_Type", "Experience_Level", "Flexibility_Score",
        "Total_Weekly_Training_Hours", "High_Intensity_Training_Hours", "Strength_Training_Frequency",
        "Recovery_Time_Between_Sessions", "Training_Load_Score", "Sprint_Speed", "Endurance_Score",
        "Agility_Score", "Fatigue_Level", "Previous_Injury_Count", "Previous_Injury_Type",
    ]

    # Check for missing columns
    missing_cols = [col for col in required_columns if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    # Create a copy to avoid modifying the original DataFrame
    df_processed = df.copy()

    # Handle missing values (fill with median). numeric_only=True is required:
    # without it, median() raises TypeError on string columns in pandas >= 2.0.
    # fillna() with a Series only touches the columns named in its index.
    numeric_medians = df_processed.median(numeric_only=True)
    df_processed = df_processed.fillna(numeric_medians)

    # Replace 0 with 0.1 in Total_Weekly_Training_Hours to avoid division by zero
    weekly_hours = df_processed["Total_Weekly_Training_Hours"].replace(0, 0.1)
    df_processed["Total_Weekly_Training_Hours"] = weekly_hours

    # Create derived features
    df_processed["Intensity_Ratio"] = (
        df_processed["High_Intensity_Training_Hours"] / weekly_hours
    )
    df_processed["Recovery_Per_Training"] = (
        df_processed["Recovery_Time_Between_Sessions"] / weekly_hours
    )

    # Define final feature set (excluding Predicted_Injury_Type): the raw
    # inputs followed by the engineered columns. Deriving it from
    # required_columns keeps the two lists from drifting apart.
    model_features = required_columns + ["Intensity_Ratio", "Recovery_Per_Training"]
    return df_processed[model_features]