AmrGaberr's picture
Upload 10 files
05851ff verified
raw
history blame
2.22 kB
import pandas as pd
def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Preprocess input data for injury risk prediction.

    Checks that every required raw column is present, fills missing values
    in numeric columns with that column's median, and adds two ratio
    features: ``Intensity_Ratio`` and ``Recovery_Per_Training``.

    Args:
        df (pd.DataFrame): Input DataFrame with raw features. It is not
            modified; all work is done on a copy.

    Returns:
        pd.DataFrame: Processed DataFrame containing the required columns
            (in the order listed below) followed by the two engineered
            features.

    Raises:
        ValueError: If any required column is missing from ``df``.

    Note:
        Zeros in ``Total_Weekly_Training_Hours`` are replaced with 0.1, and
        that replacement is visible in the returned column as well as in
        the derived ratios.
    """
    # Input validation
    required_columns = [
        "Age", "Gender", "Sport_Type", "Experience_Level", "Flexibility_Score",
        "Total_Weekly_Training_Hours", "High_Intensity_Training_Hours", "Strength_Training_Frequency",
        "Recovery_Time_Between_Sessions", "Training_Load_Score", "Sprint_Speed", "Endurance_Score",
        "Agility_Score", "Fatigue_Level", "Previous_Injury_Count", "Previous_Injury_Type",
    ]

    # Check for missing columns
    missing_cols = [col for col in required_columns if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    # Create a copy to avoid modifying the original DataFrame
    df_processed = df.copy()

    # Handle missing values (fill with median). numeric_only=True restricts
    # the median to numeric columns: without it, recent pandas versions raise
    # TypeError when a column holds text (e.g. Gender or Sport_Type as
    # strings). Non-numeric columns are left as they are.
    numeric_medians = df_processed.median(numeric_only=True)
    df_processed = df_processed.fillna(numeric_medians)

    # Replace 0 with 0.1 in Total_Weekly_Training_Hours to avoid division by zero
    df_processed["Total_Weekly_Training_Hours"] = (
        df_processed["Total_Weekly_Training_Hours"].replace(0, 0.1)
    )

    # Create derived features
    weekly_hours = df_processed["Total_Weekly_Training_Hours"]
    df_processed["Intensity_Ratio"] = (
        df_processed["High_Intensity_Training_Hours"] / weekly_hours
    )
    df_processed["Recovery_Per_Training"] = (
        df_processed["Recovery_Time_Between_Sessions"] / weekly_hours
    )

    # Define final feature set (excluding Predicted_Injury_Type): the raw
    # required columns in their declared order, then the engineered features.
    model_features = required_columns + ["Intensity_Ratio", "Recovery_Per_Training"]
    return df_processed[model_features]