f1-pit-predictor / src /preprocess.py
T0MYYY's picture
Deploy full-stack FastAPI + dashboard with CSV batch inference
bb21b5d verified
"""Preprocess task: split raw by Year, apply feature engineering, build ColumnTransformer."""
from __future__ import annotations
import logging
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from src import config
from src.feature_engineering import apply_feature_engineering
logger = logging.getLogger(__name__)
# Columns handed to the model without any transformation
# (build_preprocessor registers them as "passthrough").
# NOTE(review): "PitStop" is used as a model input here — confirm it is not
# also the prediction target.
NUMERIC_FEATURES: list[str] = [
    # Per-lap columns.
    "PitStop",
    "LapNumber",
    "Stint",
    "TyreLife",
    "Position",
    "LapTime (s)",
    "LapTime_Delta",
    "Cumulative_Degradation",
    "RaceProgress",
    "Position_Change",
    # *_lagN columns — presumably produced by apply_feature_engineering; confirm there.
    "LapTime_lag1",
    "LapTime_lag2",
    "LapTime_lag3",
    "LapTime_Delta_lag1",
    "LapTime_Delta_lag2",
    "LapTime_Delta_lag3",
    "Position_lag1",
    "Position_lag2",
    "Position_lag3",
    # *_rollN_* columns — presumably rolling-window statistics; confirm in feature engineering.
    "LapTime_roll3_mean",
    "LapTime_roll3_std",
    "LapTime_roll5_mean",
    "LapTime_roll5_std",
    "LapTimeDelta_roll3_mean",
    "LapTimeDelta_roll5_mean",
    # Stint-relative columns.
    "StintMin_sofar",
    "LapTime_vs_StintMin",
    # Race-phase indicator columns.
    "IsEarlyRace",
    "IsLateRace",
]

# Columns that build_preprocessor routes through a OneHotEncoder.
CATEGORICAL_FEATURES: list[str] = [
    "Compound",
    "TyreLife_bucket",
]

# Full model input, numeric columns first, then categorical ones.
FEATURE_COLUMNS: list[str] = [*NUMERIC_FEATURES, *CATEGORICAL_FEATURES]
def run() -> tuple[str, str]:
    """Split the raw dataset by year, engineer features, and persist both splits.

    Reads ``config.RAW_PARQUET``, keeps the rows whose ``Year`` is listed in
    ``config.TRAIN_YEARS`` (train) or ``config.TEST_YEARS`` (test), runs
    ``apply_feature_engineering`` on each split separately, and writes the
    results to ``config.PROCESSED_TRAIN`` and ``config.PROCESSED_TEST``.
    Rows whose year appears in neither list are left out of both outputs.

    Returns:
        The train and test output paths, in that order, as strings.
    """
    config.ensure_dirs()

    logger.info("Reading %s", config.RAW_PARQUET)
    raw = pd.read_parquet(config.RAW_PARQUET)

    # Select each split by year membership and renumber its rows from zero.
    in_train = raw["Year"].isin(config.TRAIN_YEARS)
    in_test = raw["Year"].isin(config.TEST_YEARS)
    train_rows = raw.loc[in_train].reset_index(drop=True)
    test_rows = raw.loc[in_test].reset_index(drop=True)
    logger.info("Year split -> train=%s, test=%s", len(train_rows), len(test_rows))

    # Engineer both splits before writing either, so a failure on one split
    # leaves no output file behind from this run.
    train_ready = apply_feature_engineering(train_rows)
    test_ready = apply_feature_engineering(test_rows)

    train_out = config.PROCESSED_TRAIN
    test_out = config.PROCESSED_TEST
    train_ready.to_parquet(train_out, index=False)
    test_ready.to_parquet(test_out, index=False)
    logger.info("Wrote %s and %s", train_out, test_out)

    return str(train_out), str(test_out)
def build_preprocessor() -> ColumnTransformer:
    """Create the (unfitted) column transformer used ahead of the model.

    Columns in ``NUMERIC_FEATURES`` are forwarded unchanged; columns in
    ``CATEGORICAL_FEATURES`` are one-hot encoded into a dense array. Any
    column not named in either list is dropped.

    Returns:
        A new, unfitted ``ColumnTransformer``.
    """
    # handle_unknown="ignore": a category not seen during fit is encoded as
    # all zeros at transform time instead of raising.
    one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

    numeric_step = ("numeric", "passthrough", NUMERIC_FEATURES)
    categorical_step = ("categorical", one_hot, CATEGORICAL_FEATURES)

    return ColumnTransformer(
        transformers=[numeric_step, categorical_step],
        remainder="drop",
    )
if __name__ == "__main__":
    # Script entry point: run the preprocessing step and print a short summary.
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(name)s %(message)s")
    train_path, test_path = run()

    # Read both outputs back before printing anything, so a failed read
    # produces no partial summary.
    written = [
        (label, path, pd.read_parquet(path).shape)
        for label, path in (("Train", train_path), ("Test", test_path))
    ]

    print("\nPreprocess complete:")
    for label, path, shape in written:
        print(f" {label}: {shape} -> {path}")

    print(f"\nFeature columns to model ({len(FEATURE_COLUMNS)}):")
    for kind, columns in (
        ("numeric", NUMERIC_FEATURES),
        ("categorical", CATEGORICAL_FEATURES),
    ):
        print(f" {kind} ({len(columns)}): {columns}")