Spaces:
Sleeping
Sleeping
| """Preprocess task: split raw by Year, apply feature engineering, build ColumnTransformer.""" | |
| from __future__ import annotations | |
| import logging | |
| import pandas as pd | |
| from sklearn.compose import ColumnTransformer | |
| from sklearn.preprocessing import OneHotEncoder | |
| from src import config | |
| from src.feature_engineering import apply_feature_engineering | |
| logger = logging.getLogger(__name__) | |
# Columns handed to the model unchanged. The order here is the order in which
# build_preprocessor() lists them, so do not reorder casually.
NUMERIC_FEATURES: list[str] = [
    # per-lap base columns
    "PitStop",
    "LapNumber",
    "Stint",
    "TyreLife",
    "Position",
    "LapTime (s)",
    "LapTime_Delta",
    "Cumulative_Degradation",
    "RaceProgress",
    "Position_Change",
    # lagged values (1-3 steps back)
    "LapTime_lag1",
    "LapTime_lag2",
    "LapTime_lag3",
    "LapTime_Delta_lag1",
    "LapTime_Delta_lag2",
    "LapTime_Delta_lag3",
    "Position_lag1",
    "Position_lag2",
    "Position_lag3",
    # rolling-window statistics (windows of 3 and 5)
    "LapTime_roll3_mean",
    "LapTime_roll3_std",
    "LapTime_roll5_mean",
    "LapTime_roll5_std",
    "LapTimeDelta_roll3_mean",
    "LapTimeDelta_roll5_mean",
    # stint-relative columns
    "StintMin_sofar",
    "LapTime_vs_StintMin",
    # race-phase indicator columns
    "IsEarlyRace",
    "IsLateRace",
]

# Columns that build_preprocessor() routes through one-hot encoding.
CATEGORICAL_FEATURES: list[str] = [
    "Compound",
    "TyreLife_bucket",
]

# Full model input: numeric columns first, then categorical ones.
FEATURE_COLUMNS: list[str] = [*NUMERIC_FEATURES, *CATEGORICAL_FEATURES]
def run() -> tuple[str, str]:
    """Split the raw parquet by ``Year``, engineer features, and write both splits.

    Rows whose ``Year`` is in ``config.TRAIN_YEARS`` form the train split and
    rows whose ``Year`` is in ``config.TEST_YEARS`` form the test split.
    ``apply_feature_engineering`` is called on each split separately, after
    its index has been reset.

    Returns:
        The train and test output paths (``config.PROCESSED_TRAIN`` and
        ``config.PROCESSED_TEST``) as strings, in that order.
    """
    config.ensure_dirs()

    logger.info("Reading %s", config.RAW_PARQUET)
    raw = pd.read_parquet(config.RAW_PARQUET)

    # Boolean row masks for each side of the year split.
    in_train_years = raw["Year"].isin(config.TRAIN_YEARS)
    in_test_years = raw["Year"].isin(config.TEST_YEARS)
    train_rows = raw.loc[in_train_years].reset_index(drop=True)
    test_rows = raw.loc[in_test_years].reset_index(drop=True)
    logger.info(
        "Year split -> train=%s, test=%s", len(train_rows), len(test_rows)
    )

    # Engineer both splits before writing either one, so nothing is written
    # if feature engineering fails on either split.
    train_features = apply_feature_engineering(train_rows)
    test_features = apply_feature_engineering(test_rows)

    train_features.to_parquet(config.PROCESSED_TRAIN, index=False)
    test_features.to_parquet(config.PROCESSED_TEST, index=False)
    logger.info(
        "Wrote %s and %s", config.PROCESSED_TRAIN, config.PROCESSED_TEST
    )

    return str(config.PROCESSED_TRAIN), str(config.PROCESSED_TEST)
def build_preprocessor() -> ColumnTransformer:
    """Build the unfitted ``ColumnTransformer`` used in front of the model.

    ``NUMERIC_FEATURES`` are passed through untouched, ``CATEGORICAL_FEATURES``
    are one-hot encoded into a dense array, and any other column is dropped.

    Returns:
        A new, unfitted ``ColumnTransformer``.
    """
    # handle_unknown="ignore": categories not seen during fit do not raise
    # at transform time.
    one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    steps = [
        ("numeric", "passthrough", NUMERIC_FEATURES),
        ("categorical", one_hot, CATEGORICAL_FEATURES),
    ]
    return ColumnTransformer(transformers=steps, remainder="drop")
if __name__ == "__main__":
    # Script entry point: run the preprocessing step and print a short summary.
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(name)s %(message)s")
    train_path, test_path = run()

    # run() only returns paths, so read the written files back to report the
    # shapes of what is actually on disk.
    train_fe = pd.read_parquet(train_path)
    test_fe = pd.read_parquet(test_path)

    # Plain string: this line has no placeholders, so no f-prefix is needed.
    print("\nPreprocess complete:")
    print(f" Train: {train_fe.shape} -> {train_path}")
    print(f" Test: {test_fe.shape} -> {test_path}")
    print(f"\nFeature columns to model ({len(FEATURE_COLUMNS)}):")
    print(f" numeric ({len(NUMERIC_FEATURES)}): {NUMERIC_FEATURES}")
    print(f" categorical ({len(CATEGORICAL_FEATURES)}): {CATEGORICAL_FEATURES}")