File size: 4,539 Bytes
bcb0385
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
import logging

# Import-time side effect: sets the root logger level to INFO (basicConfig
# does nothing if the root logger already has handlers configured).
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module; used by FeatureEngineer below.
logger = logging.getLogger(__name__)

class FeatureEngineer:
    """Build per-driver "recent form" features from race results.

    Each driver is summarised by four metrics computed over up to ``n_races``
    of their most recent results: mean finishing position, mean grid
    position, DNF rate and mean positions gained (grid minus finish).

    Input frames must provide the columns ``DriverId``, ``EventDate``,
    ``Position``, ``GridPosition`` and ``Status``.

    The scaler is fitted by :meth:`prepare_features` and reused by
    :meth:`prepare_prediction_features`, so the former must be called first.
    """

    # Fallback value per feature, used when a driver has no usable history
    # (no rows at all, or a metric whose values are all non-numeric).
    # Key order defines the feature column order.
    _DEFAULT_FEATURES = {
        'avg_recent_position': 20.0,
        'avg_recent_grid': 20.0,
        'recent_dnf_rate': 1.0,
        'recent_overtakes': 0.0,
    }

    def __init__(self, n_races=5):
        """Create an engineer that summarises the last ``n_races`` per driver."""
        self.n_races = n_races
        # NOTE(review): not used by any method visible in this class.
        self.driver_encoder = LabelEncoder()
        self.scaler = StandardScaler()
        # Feature column names fixed at training time; None until trained.
        self.training_columns = None

    @staticmethod
    def _normalize_results(results_df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy with ``EventDate`` as datetime and numeric ``Position``.

        Working on a copy keeps the caller's DataFrame untouched.
        """
        normalized = results_df.copy()
        normalized['EventDate'] = pd.to_datetime(normalized['EventDate'])
        normalized['Position'] = pd.to_numeric(normalized['Position'], errors='coerce')
        return normalized

    @staticmethod
    def _is_dnf(status) -> bool:
        """Return True unless the status text contains 'Finished' or '+'.

        Presumably '+' marks lapped classifications such as '+1 Lap' -- confirm
        against the data source.
        """
        text = str(status)
        return 'Finished' not in text and '+' not in text

    def _summarize_races(self, recent_races: pd.DataFrame) -> pd.Series:
        """Reduce a set of race rows to the four feature values.

        Any metric that cannot be computed (no rows, or only non-numeric
        values) falls back to its entry in ``_DEFAULT_FEATURES``.
        """
        if recent_races.empty:
            return pd.Series(self._DEFAULT_FEATURES)

        position = pd.to_numeric(recent_races['Position'], errors='coerce')
        grid = pd.to_numeric(recent_races['GridPosition'], errors='coerce')

        metrics = {
            'avg_recent_position': position.mean(),
            'avg_recent_grid': grid.mean(),
            'recent_dnf_rate': recent_races['Status'].apply(self._is_dnf).mean(),
            # Positive means the driver finished ahead of their grid slot.
            'recent_overtakes': (grid - position).mean(),
        }
        return pd.Series({
            name: value if pd.notna(value) else self._DEFAULT_FEATURES[name]
            for name, value in metrics.items()
        })

    def _calculate_recent_performance(self, results_df: pd.DataFrame, driver_id: str) -> pd.Series:
        """Calculate simple performance metrics for one driver.

        Args:
            results_df: Race results; may contain rows for other drivers.
            driver_id: Value of ``DriverId`` to select.

        Returns:
            Series with the four feature values, computed over the driver's
            ``n_races`` most recent rows (by ``EventDate``), or the defaults
            when the driver has no rows.
        """
        driver_results = results_df[results_df['DriverId'] == driver_id]
        recent_races = driver_results.sort_values('EventDate', ascending=False).head(self.n_races)
        return self._summarize_races(recent_races)

    def _build_training_table(self, race_results_df: pd.DataFrame) -> pd.DataFrame:
        """Return one unscaled row per driver: feature columns plus ``TargetPosition``.

        The target is the driver's position in their latest race. Features are
        computed only from the races *before* that one, so the target never
        leaks into its own features and training matches prediction time,
        where features describe races preceding the one being predicted.
        Rows whose target is not numeric are kept here (as NaN).
        """
        results = self._normalize_results(race_results_df)

        rows = []
        # sort=False keeps drivers in order of first appearance.
        for driver_id, history in results.groupby('DriverId', sort=False):
            history = history.sort_values('EventDate')
            row = self._calculate_recent_performance(history.iloc[:-1], driver_id).to_dict()
            row['TargetPosition'] = history['Position'].iloc[-1]
            rows.append(row)

        return pd.DataFrame(rows, columns=[*self._DEFAULT_FEATURES, 'TargetPosition'])

    def prepare_features(self, race_results_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
        """Prepare simple features for training and fit the scaler.

        Args:
            race_results_df: Historical race results. Not modified.

        Returns:
            ``(X, y)`` where ``X`` holds the standardised features (one row per
            driver) and ``y`` the driver's position in their latest race.
            Drivers whose latest position is not numeric are dropped.

        Raises:
            ValueError: If the input is empty or no driver has a numeric
                position in their latest race.
        """
        if race_results_df.empty:
            raise ValueError("Input race_results_df cannot be empty.")

        logger.info(f"Preparing simple features from {len(race_results_df)} race results...")

        table = self._build_training_table(race_results_df)

        # A NaN target cannot be learned from; drop those samples explicitly
        # instead of handing NaNs to the caller's estimator.
        missing_target = table['TargetPosition'].isna()
        if missing_target.any():
            logger.warning(
                "Dropping %d driver(s) whose latest race has no numeric Position.",
                int(missing_target.sum()),
            )
            table = table.loc[~missing_target].reset_index(drop=True)
        if table.empty:
            raise ValueError("No driver has a numeric Position in their latest race.")

        y = table['TargetPosition']
        X = table.drop(columns=['TargetPosition'])
        # Defensive: fill any remaining gaps with the column median.
        X = X.fillna(X.median())

        self.training_columns = X.columns.tolist()
        X = pd.DataFrame(self.scaler.fit_transform(X), columns=self.training_columns)

        logger.info(f"Generated simple features shape: {X.shape}, Target shape: {y.shape}")
        return X, y

    def prepare_prediction_features(self, last_5_races_df: pd.DataFrame) -> pd.DataFrame:
        """Prepare simple prediction features using the fitted scaler.

        Args:
            last_5_races_df: Recent race results to summarise per driver (the
                number of races actually used per driver is ``n_races``).
                Not modified.

        Returns:
            Standardised features, one row per driver in order of first
            appearance, with the same columns as the training features.

        Raises:
            ValueError: If :meth:`prepare_features` has not been called yet or
                the input is empty.
        """
        if self.training_columns is None:
            raise ValueError("Model has not been trained yet.")
        if last_5_races_df.empty:
            raise ValueError("Input last_5_races_df cannot be empty.")

        logger.info("Preparing simple features for prediction...")

        # Same normalisation as training so dates sort chronologically.
        results = self._normalize_results(last_5_races_df)
        per_driver = [
            self._calculate_recent_performance(results, driver_id)
            for driver_id in results['DriverId'].unique()
        ]

        X_pred = pd.DataFrame(per_driver).reindex(columns=self.training_columns, fill_value=0)
        X_pred = pd.DataFrame(self.scaler.transform(X_pred), columns=self.training_columns)

        logger.info(f"Generated simple prediction features shape: {X_pred.shape}")
        return X_pred