File size: 2,849 Bytes
bb21b5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"""Preprocess task: split raw by Year, apply feature engineering, build ColumnTransformer."""

from __future__ import annotations

import logging

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

from src import config
from src.feature_engineering import apply_feature_engineering

logger = logging.getLogger(__name__)


# Column names handed to the "numeric" step of the ColumnTransformer built
# below. The order here is the order in which the columns are selected.
NUMERIC_FEATURES = [
    # per-lap columns
    "PitStop",
    "LapNumber",
    "Stint",
    "TyreLife",
    "Position",
    "LapTime (s)",
    "LapTime_Delta",
    "Cumulative_Degradation",
    "RaceProgress",
    "Position_Change",
    # *_lag1..3 columns
    "LapTime_lag1",
    "LapTime_lag2",
    "LapTime_lag3",
    "LapTime_Delta_lag1",
    "LapTime_Delta_lag2",
    "LapTime_Delta_lag3",
    "Position_lag1",
    "Position_lag2",
    "Position_lag3",
    # *_roll3_* / *_roll5_* columns
    "LapTime_roll3_mean",
    "LapTime_roll3_std",
    "LapTime_roll5_mean",
    "LapTime_roll5_std",
    "LapTimeDelta_roll3_mean",
    "LapTimeDelta_roll5_mean",
    # stint-minimum columns
    "StintMin_sofar",
    "LapTime_vs_StintMin",
    # race-phase columns
    "IsEarlyRace",
    "IsLateRace",
]

# Column names handed to the one-hot "categorical" step.
CATEGORICAL_FEATURES = [
    "Compound",
    "TyreLife_bucket",
]

# Every model input column: numeric columns first, then categorical ones.
FEATURE_COLUMNS = [*NUMERIC_FEATURES, *CATEGORICAL_FEATURES]


def run() -> tuple[str, str]:
    """Split the raw parquet by Year, feature-engineer each split, write both.

    Reads ``config.RAW_PARQUET``, keeps the rows whose ``Year`` is listed in
    ``config.TRAIN_YEARS`` / ``config.TEST_YEARS``, passes each split through
    ``apply_feature_engineering`` separately, and writes the results to
    ``config.PROCESSED_TRAIN`` / ``config.PROCESSED_TEST``.

    Suspicious conditions are logged as warnings rather than raised, so both
    files are still written: years listed in both year sets, a split with no
    rows, and engineered frames lacking columns named in ``FEATURE_COLUMNS``.

    Returns:
        The train and test output paths, converted to ``str``.
    """
    config.ensure_dirs()
    logger.info("Reading %s", config.RAW_PARQUET)
    df = pd.read_parquet(config.RAW_PARQUET)

    train = df[df["Year"].isin(config.TRAIN_YEARS)].reset_index(drop=True)
    test = df[df["Year"].isin(config.TEST_YEARS)].reset_index(drop=True)
    logger.info("Year split -> train=%s, test=%s", len(train), len(test))

    # A year present in both sets puts the same rows in train and test.
    shared_years = set(config.TRAIN_YEARS) & set(config.TEST_YEARS)
    if shared_years:
        logger.warning(
            "Years present in both TRAIN_YEARS and TEST_YEARS: %s", shared_years
        )
    for split_name, split in (("train", train), ("test", test)):
        if split.empty:
            logger.warning("The %s split contains no rows", split_name)

    train_fe = apply_feature_engineering(train)
    test_fe = apply_feature_engineering(test)

    # Surface a feature-list / feature-engineering mismatch here instead of
    # when the ColumnTransformer later tries to select the columns.
    for split_name, frame in (("train", train_fe), ("test", test_fe)):
        missing = [col for col in FEATURE_COLUMNS if col not in frame.columns]
        if missing:
            logger.warning(
                "Engineered %s frame is missing feature columns: %s",
                split_name,
                missing,
            )

    train_fe.to_parquet(config.PROCESSED_TRAIN, index=False)
    test_fe.to_parquet(config.PROCESSED_TEST, index=False)
    logger.info("Wrote %s and %s", config.PROCESSED_TRAIN, config.PROCESSED_TEST)
    return str(config.PROCESSED_TRAIN), str(config.PROCESSED_TEST)


def build_preprocessor() -> ColumnTransformer:
    """Return an unfitted ColumnTransformer over the model feature columns.

    Columns in ``NUMERIC_FEATURES`` are passed through unchanged, columns in
    ``CATEGORICAL_FEATURES`` are one-hot encoded (dense output, unknown
    categories ignored), and every other column is dropped.
    """
    one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    numeric_step = ("numeric", "passthrough", NUMERIC_FEATURES)
    categorical_step = ("categorical", one_hot, CATEGORICAL_FEATURES)
    return ColumnTransformer(
        transformers=[numeric_step, categorical_step],
        remainder="drop",
    )


if __name__ == "__main__":
    # Script entry point: run the task, then print a summary of its outputs.
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(name)s %(message)s")
    train_path, test_path = run()

    # Re-read the written files so the reported shapes describe what is on disk.
    train_shape = pd.read_parquet(train_path).shape
    test_shape = pd.read_parquet(test_path).shape

    print("\nPreprocess complete:")
    print(f"  Train: {train_shape} -> {train_path}")
    print(f"  Test:  {test_shape}  -> {test_path}")

    print(f"\nFeature columns to model ({len(FEATURE_COLUMNS)}):")
    for label, columns in (
        ("numeric", NUMERIC_FEATURES),
        ("categorical", CATEGORICAL_FEATURES),
    ):
        print(f"  {label} ({len(columns)}): {columns}")