# F1_Race_Predictor / feature_engineering.py
# Tryfonas's picture
# Upload 7 files
# bcb0385 verified
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
import logging
# Configure the root logger at import time so INFO-level messages are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module, used by FeatureEngineer below.
logger = logging.getLogger(__name__)
class FeatureEngineer:
    """Turn raw race results into scaled per-driver "recent form" features.

    Input frames must provide the columns ``DriverId``, ``EventDate``,
    ``Position``, ``GridPosition`` and ``Status``.

    Attributes:
        n_races: Number of most recent races summarised per driver.
        driver_encoder: ``LabelEncoder`` instance (not used by the methods of
            this class; kept for callers that may access it).
        scaler: ``StandardScaler`` fitted by :meth:`prepare_features` and
            reused by :meth:`prepare_prediction_features`.
        training_columns: Feature column order recorded at fit time, or
            ``None`` until :meth:`prepare_features` has run.
    """

    # Fallback feature values, used when a driver has no races to summarise
    # or when an individual metric cannot be computed (all values missing).
    _DEFAULT_FEATURES = {
        'avg_recent_position': 20.0,
        'avg_recent_grid': 20.0,
        'recent_dnf_rate': 1.0,
        'recent_overtakes': 0.0,
    }

    def __init__(self, n_races=5):
        """Create an unfitted feature engineer.

        Args:
            n_races: How many of a driver's most recent races to summarise.
        """
        self.n_races = n_races
        self.driver_encoder = LabelEncoder()
        self.scaler = StandardScaler()
        self.training_columns = None

    @staticmethod
    def _is_dnf(status) -> bool:
        """Return True when the status text contains neither 'Finished' nor '+'."""
        # NOTE(review): '+' presumably matches lapped classifications such as
        # "+1 Lap" -- confirm against the statuses the data source emits.
        text = str(status)
        return 'Finished' not in text and '+' not in text

    def _summarise_races(self, races: pd.DataFrame) -> pd.Series:
        """Aggregate a set of race rows into the four recent-form features.

        The input frame is not modified.

        Args:
            races: Rows to summarise (any number, possibly zero).

        Returns:
            A Series indexed by the keys of ``_DEFAULT_FEATURES``. A metric
            that cannot be computed falls back to its default value.
        """
        if races.empty:
            return pd.Series(self._DEFAULT_FEATURES)

        # Non-numeric entries (e.g. a retirement marker) become NaN and are
        # skipped by ``mean``.
        position = pd.to_numeric(races['Position'], errors='coerce')
        grid = pd.to_numeric(races['GridPosition'], errors='coerce')

        metrics = {
            'avg_recent_position': position.mean(),
            'avg_recent_grid': grid.mean(),
            'recent_dnf_rate': races['Status'].apply(self._is_dnf).mean(),
            # Positive means places gained between the grid and the flag.
            'recent_overtakes': (grid - position).mean(),
        }
        return pd.Series({
            name: value if pd.notna(value) else self._DEFAULT_FEATURES[name]
            for name, value in metrics.items()
        })

    def _calculate_recent_performance(self, results_df: pd.DataFrame, driver_id: str) -> pd.Series:
        """Calculate simple performance metrics for one driver.

        Args:
            results_df: Race results; ``EventDate`` must be sortable.
            driver_id: Value of ``DriverId`` to select.

        Returns:
            Metrics over the driver's ``n_races`` most recent rows, or the
            default values when the driver has no rows.
        """
        driver_rows = results_df[results_df['DriverId'] == driver_id]
        # Stable sort so rows sharing an EventDate keep a deterministic order.
        latest_first = driver_rows.sort_values('EventDate', ascending=False, kind='mergesort')
        return self._summarise_races(latest_first.head(self.n_races))

    def prepare_features(self, race_results_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
        """Prepare simple features for training and fit the scaler.

        One sample is produced per driver: the target is the driver's
        ``Position`` in their latest race, and the features summarise the
        ``n_races`` races *before* that race. The target race is excluded
        from the features so the label does not leak into its own inputs,
        which also matches prediction time, where the race being predicted
        is never part of the history. Rows with a missing ``DriverId`` are
        ignored. The caller's frame is not modified.

        Args:
            race_results_df: Historical race results.

        Returns:
            ``(X, y)``: the scaled feature frame and the target positions
            (``y`` is NaN where the latest ``Position`` is not numeric).

        Raises:
            ValueError: If the input is empty or has no usable driver rows.
        """
        logger.info("Preparing simple features from %d race results...", len(race_results_df))
        if race_results_df.empty:
            raise ValueError("Input race_results_df cannot be empty.")

        # ``assign`` returns a new frame, leaving the caller's data untouched.
        results = race_results_df.assign(
            EventDate=pd.to_datetime(race_results_df['EventDate']),
            Position=pd.to_numeric(race_results_df['Position'], errors='coerce'),
        )

        samples = []
        # sort=False keeps drivers in order of first appearance.
        for _, driver_rows in results.groupby('DriverId', sort=False):
            chronological = driver_rows.sort_values('EventDate', kind='mergesort')
            history = chronological.iloc[:-1].tail(self.n_races)
            sample = self._summarise_races(history)
            sample['TargetPosition'] = chronological['Position'].iloc[-1]
            samples.append(sample)

        if not samples:
            raise ValueError("Input race_results_df contains no rows with a valid DriverId.")

        features_df = pd.DataFrame(samples)
        y = features_df['TargetPosition']
        X = features_df.drop(columns=['TargetPosition'])

        # Defensive: fill any remaining gaps with the column median.
        X = X.fillna(X.median())

        self.training_columns = X.columns.tolist()
        X = pd.DataFrame(self.scaler.fit_transform(X), columns=self.training_columns)
        logger.info("Generated simple features shape: %s, Target shape: %s", X.shape, y.shape)
        return X, y

    def prepare_prediction_features(self, last_5_races_df: pd.DataFrame) -> pd.DataFrame:
        """Prepare simple prediction features using the fitted scaler.

        Rows of the result follow the order of
        ``last_5_races_df['DriverId'].unique()``. The caller's frame is not
        modified.

        Args:
            last_5_races_df: Recent race results for the drivers to predict.

        Returns:
            Scaled features with the columns recorded during training.

        Raises:
            ValueError: If :meth:`prepare_features` has not been called yet,
                or if the input is empty.
        """
        logger.info("Preparing simple features for prediction...")
        if self.training_columns is None:
            raise ValueError("Model has not been trained yet.")
        if last_5_races_df.empty:
            raise ValueError("Input last_5_races_df cannot be empty.")

        # Normalise dates exactly as in training so "most recent" is decided
        # chronologically rather than by string comparison.
        results = last_5_races_df.assign(
            EventDate=pd.to_datetime(last_5_races_df['EventDate']),
        )

        per_driver = [
            self._calculate_recent_performance(results, driver_id)
            for driver_id in results['DriverId'].unique()
        ]
        X_pred = pd.DataFrame(per_driver).reindex(columns=self.training_columns, fill_value=0)
        X_pred = pd.DataFrame(self.scaler.transform(X_pred), columns=self.training_columns)
        logger.info("Generated simple prediction features shape: %s", X_pred.shape)
        return X_pred