File size: 1,875 Bytes
bb21b5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
"""Feature engineering ported from notebooks/EDA.ipynb (add_features cell).

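All grouped features are scoped to (Year, Race, Driver, Stint) so nothing leaks
across pit stops, races, or years. Rolling features apply shift(1) before
rolling so the current lap is never part of its own feature. The exceptions are
StintMin_sofar and LapTime_vs_StintMin, which are built from a running minimum
that includes the current lap.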
"""

from __future__ import annotations

import pandas as pd

# Columns that together identify one stint; every grouped feature is computed
# inside these groups.
GROUP_KEY = ["Year", "Race", "Driver", "Stint"]

# Row ordering used before grouping: stint identity first, then lap order.
SORT_KEY = [*GROUP_KEY, "LapNumber"]

# Arguments handed to pd.cut for the TyreLife bucket feature. pd.cut uses
# right-closed intervals by default, so the -0.1 lower edge lets a tyre life
# of 0 fall into the first bucket. There is one label per interval
# (10 edges -> 9 labels).
TYRELIFE_BUCKETS = dict(
    bins=[-0.1, 5, 10, 15, 20, 25, 30, 40, 60, 200],
    labels=[
        "0_5",
        "6_10",
        "11_15",
        "16_20",
        "21_25",
        "26_30",
        "31_40",
        "41_60",
        "60plus",
    ],
)


def apply_feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Return a sorted copy of *df* with lag, rolling and stint features added.

    The frame is sorted by ``SORT_KEY`` and re-indexed from 0, then grouped by
    ``GROUP_KEY`` so that every lag/rolling/cumulative feature is computed
    within a single group only.

    Columns read: the ``SORT_KEY`` columns, ``"LapTime (s)"``,
    ``"LapTime_Delta"``, ``"Position"``, ``"TyreLife"`` and ``"RaceProgress"``.

    Columns added, in this order:

    * ``LapTime_lag{n}``, ``LapTime_Delta_lag{n}``, ``Position_lag{n}`` for
      n = 1, 2, 3 -- the value from n rows earlier in the same group.
    * ``LapTime_roll{w}_mean``, ``LapTime_roll{w}_std``,
      ``LapTimeDelta_roll{w}_mean`` for w = 3, 5 -- statistics over up to
      ``w`` previous rows of the group (current row excluded).
    * ``TyreLife_bucket`` -- ``TyreLife`` binned with ``TYRELIFE_BUCKETS`` and
      converted to ``str``.
    * ``IsEarlyRace`` / ``IsLateRace`` -- 0/1 flags for ``RaceProgress`` below
      0.25 / above 0.70.
    * ``StintMin_sofar`` and ``LapTime_vs_StintMin`` -- running minimum of
      ``"LapTime (s)"`` in the group and the current row's gap to it.

    Args:
        df: Input frame holding the columns listed above.

    Returns:
        A new DataFrame; the ``sort_values``/``reset_index`` calls produce a
        fresh object, so the columns are not added to the caller's frame.
    """
    laps = df.sort_values(SORT_KEY).reset_index(drop=True)
    # sort=False keeps groups in the order produced by the sort above.
    per_stint = laps.groupby(GROUP_KEY, sort=False)

    def _prior_window(series: pd.Series, window: int, min_periods: int):
        # Shift by one row first so the window ends on the previous row and
        # never contains the row the feature is attached to.
        return series.shift(1).rolling(window, min_periods=min_periods)

    # (output column prefix, source column) pairs for the plain lag features.
    lag_sources = (
        ("LapTime", "LapTime (s)"),
        ("LapTime_Delta", "LapTime_Delta"),
        ("Position", "Position"),
    )
    for n_back in range(1, 4):
        for prefix, column in lag_sources:
            laps[f"{prefix}_lag{n_back}"] = per_stint[column].shift(n_back)

    for window in (3, 5):
        laps[f"LapTime_roll{window}_mean"] = per_stint["LapTime (s)"].transform(
            lambda s: _prior_window(s, window, 1).mean()
        )
        # A standard deviation needs at least two observations.
        laps[f"LapTime_roll{window}_std"] = per_stint["LapTime (s)"].transform(
            lambda s: _prior_window(s, window, 2).std()
        )
        laps[f"LapTimeDelta_roll{window}_mean"] = per_stint["LapTime_Delta"].transform(
            lambda s: _prior_window(s, window, 1).mean()
        )

    # Values that fall outside the configured bins come back from pd.cut as
    # missing; astype(str) then turns every entry, missing ones included, into
    # a plain string.
    tyre_bucket = pd.cut(
        laps["TyreLife"],
        bins=TYRELIFE_BUCKETS["bins"],
        labels=TYRELIFE_BUCKETS["labels"],
    )
    laps["TyreLife_bucket"] = tyre_bucket.astype(str)

    race_progress = laps["RaceProgress"]
    laps["IsEarlyRace"] = race_progress.lt(0.25).astype(int)
    laps["IsLateRace"] = race_progress.gt(0.70).astype(int)

    # NOTE(review): cummin() is not shifted, so the running minimum (and the
    # gap derived from it) includes the current row's own lap time -- confirm
    # these two columns are not used as inputs when predicting that lap time.
    laps["StintMin_sofar"] = per_stint["LapTime (s)"].cummin()
    laps["LapTime_vs_StintMin"] = laps["LapTime (s)"] - laps["StintMin_sofar"]

    return laps