"""Preprocess task: split raw by Year, apply feature engineering, build ColumnTransformer."""
from __future__ import annotations
import logging
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from src import config
from src.feature_engineering import apply_feature_engineering
logger = logging.getLogger(__name__)
# Column names selected for modelling. Order is significant: these lists are
# handed to the ColumnTransformer in build_preprocessor() exactly as written.
# NOTE(review): the derived columns (lags, rolling stats, flags) are presumably
# created by apply_feature_engineering -- confirm against that module.
NUMERIC_FEATURES = [
    # Per-lap columns.
    "PitStop",
    "LapNumber",
    "Stint",
    "TyreLife",
    "Position",
    "LapTime (s)",
    "LapTime_Delta",
    "Cumulative_Degradation",
    "RaceProgress",
    "Position_Change",
    # *_lagN columns.
    "LapTime_lag1",
    "LapTime_lag2",
    "LapTime_lag3",
    "LapTime_Delta_lag1",
    "LapTime_Delta_lag2",
    "LapTime_Delta_lag3",
    "Position_lag1",
    "Position_lag2",
    "Position_lag3",
    # *_rollN_* columns.
    "LapTime_roll3_mean",
    "LapTime_roll3_std",
    "LapTime_roll5_mean",
    "LapTime_roll5_std",
    "LapTimeDelta_roll3_mean",
    "LapTimeDelta_roll5_mean",
    # Stint-relative columns.
    "StintMin_sofar",
    "LapTime_vs_StintMin",
    # Race-phase flag columns.
    "IsEarlyRace",
    "IsLateRace",
]

# Columns that build_preprocessor() routes through the one-hot encoder.
CATEGORICAL_FEATURES = [
    "Compound",
    "TyreLife_bucket",
]

# Every model input column: numeric columns first, then categorical.
FEATURE_COLUMNS = [*NUMERIC_FEATURES, *CATEGORICAL_FEATURES]
def run() -> tuple[str, str]:
    """Split the raw parquet by ``Year``, engineer features, and persist both halves.

    Reads ``config.RAW_PARQUET``, keeps rows whose ``Year`` is in
    ``config.TRAIN_YEARS`` / ``config.TEST_YEARS`` respectively, runs
    ``apply_feature_engineering`` on each subset independently, and writes the
    results to ``config.PROCESSED_TRAIN`` and ``config.PROCESSED_TEST``.

    Returns:
        ``(train_path, test_path)`` -- the two output locations as strings.
    """
    config.ensure_dirs()

    logger.info("Reading %s", config.RAW_PARQUET)
    raw = pd.read_parquet(config.RAW_PARQUET)

    # Boolean row masks for each split, keyed on the "Year" column.
    is_train_year = raw["Year"].isin(config.TRAIN_YEARS)
    is_test_year = raw["Year"].isin(config.TEST_YEARS)

    # Re-index each subset from zero so it does not carry the raw row labels.
    train_raw = raw.loc[is_train_year].reset_index(drop=True)
    test_raw = raw.loc[is_test_year].reset_index(drop=True)
    logger.info("Year split -> train=%s, test=%s", len(train_raw), len(test_raw))

    # Engineer both subsets before writing anything, so a failure on either
    # one leaves no freshly written output behind.
    train_features = apply_feature_engineering(train_raw)
    test_features = apply_feature_engineering(test_raw)

    train_features.to_parquet(config.PROCESSED_TRAIN, index=False)
    test_features.to_parquet(config.PROCESSED_TEST, index=False)
    logger.info("Wrote %s and %s", config.PROCESSED_TRAIN, config.PROCESSED_TEST)

    return str(config.PROCESSED_TRAIN), str(config.PROCESSED_TEST)
def build_preprocessor() -> ColumnTransformer:
    """Return an unfitted ColumnTransformer over the model's feature columns.

    Columns in ``NUMERIC_FEATURES`` are passed through unchanged; columns in
    ``CATEGORICAL_FEATURES`` are one-hot encoded to a dense array, with
    categories unseen at fit time ignored at transform time. Any column not
    named in either list is dropped.
    """
    numeric_step = ("numeric", "passthrough", NUMERIC_FEATURES)

    one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    categorical_step = ("categorical", one_hot, CATEGORICAL_FEATURES)

    return ColumnTransformer(
        transformers=[numeric_step, categorical_step],
        remainder="drop",
    )
if __name__ == "__main__":
    # Script entry point: run the preprocess step, then print a short summary
    # of what was written and which columns are intended for the model.
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(name)s %(message)s")

    train_path, test_path = run()

    # Re-read the written files so the reported shapes reflect what is on disk.
    train_shape = pd.read_parquet(train_path).shape
    test_shape = pd.read_parquet(test_path).shape

    print("\nPreprocess complete:")
    print(f" Train: {train_shape} -> {train_path}")
    print(f" Test: {test_shape} -> {test_path}")

    n_numeric = len(NUMERIC_FEATURES)
    n_categorical = len(CATEGORICAL_FEATURES)
    print(f"\nFeature columns to model ({len(FEATURE_COLUMNS)}):")
    print(f" numeric ({n_numeric}): {NUMERIC_FEATURES}")
    print(f" categorical ({n_categorical}): {CATEGORICAL_FEATURES}")
|