Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| from sklearn.preprocessing import StandardScaler, LabelEncoder | |
| import logging | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
| class FeatureEngineer: | |
| def __init__(self, n_races=5): | |
| self.n_races = n_races | |
| self.driver_encoder = LabelEncoder() | |
| self.scaler = StandardScaler() | |
| self.training_columns = None | |
| def _calculate_recent_performance(self, results_df: pd.DataFrame, driver_id: str) -> pd.Series: | |
| """Calculate simple performance metrics.""" | |
| driver_results = results_df[results_df['DriverId'] == driver_id].sort_values('EventDate', ascending=False) | |
| recent_races = driver_results.head(self.n_races).copy() # Use the user-selected number! | |
| if recent_races.empty: | |
| return pd.Series({ | |
| 'avg_recent_position': 20.0, | |
| 'avg_recent_grid': 20.0, | |
| 'recent_dnf_rate': 1.0, | |
| 'recent_overtakes': 0.0 | |
| }) | |
| recent_races['Position'] = pd.to_numeric(recent_races['Position'], errors='coerce') | |
| recent_races['GridPosition'] = pd.to_numeric(recent_races['GridPosition'], errors='coerce') | |
| avg_pos = recent_races['Position'].mean() | |
| avg_grid = recent_races['GridPosition'].mean() | |
| dnf_rate = recent_races['Status'].apply(lambda x: 'Finished' not in str(x) and '+' not in str(x)).mean() | |
| overtakes = (recent_races['GridPosition'] - recent_races['Position']).mean() | |
| return pd.Series({ | |
| 'avg_recent_position': avg_pos if pd.notna(avg_pos) else 20.0, | |
| 'avg_recent_grid': avg_grid if pd.notna(avg_grid) else 20.0, | |
| 'recent_dnf_rate': dnf_rate if pd.notna(dnf_rate) else 1.0, | |
| 'recent_overtakes': overtakes if pd.notna(overtakes) else 0.0 | |
| }) | |
| def prepare_features(self, race_results_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]: | |
| """Prepare simple features for training.""" | |
| logger.info(f"Preparing simple features from {len(race_results_df)} race results...") | |
| if race_results_df.empty: | |
| raise ValueError("Input race_results_df cannot be empty.") | |
| race_results_df['EventDate'] = pd.to_datetime(race_results_df['EventDate']) | |
| race_results_df['Position'] = pd.to_numeric(race_results_df['Position'], errors='coerce') | |
| unique_drivers = race_results_df['DriverId'].unique() | |
| all_features = [] | |
| for driver_id in unique_drivers: | |
| driver_features = self._calculate_recent_performance(race_results_df, driver_id) | |
| driver_features['DriverId'] = driver_id | |
| driver_features['TargetPosition'] = race_results_df[race_results_df['DriverId'] == driver_id].sort_values('EventDate').iloc[-1]['Position'] | |
| all_features.append(driver_features) | |
| features_df = pd.DataFrame(all_features) | |
| y = features_df['TargetPosition'] | |
| X = features_df.drop(columns=['TargetPosition', 'DriverId']) | |
| for col in X.columns: | |
| if X[col].isnull().any(): | |
| median_val = X[col].median() | |
| X[col] = X[col].fillna(median_val) | |
| self.training_columns = X.columns.tolist() | |
| X_scaled = self.scaler.fit_transform(X) | |
| X = pd.DataFrame(X_scaled, columns=self.training_columns) | |
| logger.info(f"Generated simple features shape: {X.shape}, Target shape: {y.shape}") | |
| return X, y | |
| def prepare_prediction_features(self, last_5_races_df: pd.DataFrame) -> pd.DataFrame: | |
| """Prepare simple prediction features.""" | |
| logger.info("Preparing simple features for prediction...") | |
| if self.training_columns is None: | |
| raise ValueError("Model has not been trained yet.") | |
| unique_drivers = last_5_races_df['DriverId'].unique() | |
| prediction_features_list = [] | |
| for driver_id in unique_drivers: | |
| driver_perf = self._calculate_recent_performance(last_5_races_df, driver_id) | |
| prediction_features_list.append(driver_perf) | |
| X_pred = pd.DataFrame(prediction_features_list) | |
| X_pred = X_pred.reindex(columns=self.training_columns, fill_value=0) | |
| X_pred_scaled = self.scaler.transform(X_pred) | |
| X_pred = pd.DataFrame(X_pred_scaled, columns=self.training_columns) | |
| logger.info(f"Generated simple prediction features shape: {X_pred.shape}") | |
| return X_pred | |