Spaces:

T0MYYY
/

f1-pit-predictor

Sleeping

App Files Files Community

f1-pit-predictor / src /preprocess.py

T0MYYY

Deploy full-stack FastAPI + dashboard with CSV batch inference

bb21b5d verified 12 days ago

raw

history blame contribute delete

2.85 kB

	"""Preprocess task: split raw by Year, apply feature engineering, build ColumnTransformer."""

	from __future__ import annotations

	import logging

	import pandas as pd
	from sklearn.compose import ColumnTransformer
	from sklearn.preprocessing import OneHotEncoder

	from src import config
	from src.feature_engineering import apply_feature_engineering

	logger = logging.getLogger(__name__)


	# Numeric model inputs; build_preprocessor() passes these columns through
	# unchanged. The groups below are concatenated in the order written, so the
	# resulting list order is exactly the order of the literals.
	NUMERIC_FEATURES = (
	    # Per-lap columns without a lag/rolling suffix.
	    [
	        "PitStop",
	        "LapNumber",
	        "Stint",
	        "TyreLife",
	        "Position",
	        "LapTime (s)",
	        "LapTime_Delta",
	        "Cumulative_Degradation",
	        "RaceProgress",
	        "Position_Change",
	    ]
	    # Lag columns (suffix _lag1 .. _lag3).
	    + [
	        "LapTime_lag1",
	        "LapTime_lag2",
	        "LapTime_lag3",
	        "LapTime_Delta_lag1",
	        "LapTime_Delta_lag2",
	        "LapTime_Delta_lag3",
	        "Position_lag1",
	        "Position_lag2",
	        "Position_lag3",
	    ]
	    # Rolling-window columns (suffix _roll3_* / _roll5_*).
	    + [
	        "LapTime_roll3_mean",
	        "LapTime_roll3_std",
	        "LapTime_roll5_mean",
	        "LapTime_roll5_std",
	        "LapTimeDelta_roll3_mean",
	        "LapTimeDelta_roll5_mean",
	    ]
	    # Stint-minimum columns.
	    + [
	        "StintMin_sofar",
	        "LapTime_vs_StintMin",
	    ]
	    # Race-phase flag columns.
	    + [
	        "IsEarlyRace",
	        "IsLateRace",
	    ]
	)

	# Categorical model inputs; build_preprocessor() one-hot encodes these.
	CATEGORICAL_FEATURES = [
	    "Compound",
	    "TyreLife_bucket",
	]

	# Every column the model consumes: numeric first, then categorical.
	FEATURE_COLUMNS = [*NUMERIC_FEATURES, *CATEGORICAL_FEATURES]


	def run() -> tuple[str, str]:
	    """Split the raw data by year, engineer features, and persist both splits.

	    Reads ``config.RAW_PARQUET``, keeps the rows whose ``Year`` is in
	    ``config.TRAIN_YEARS`` (train) or ``config.TEST_YEARS`` (test), runs
	    ``apply_feature_engineering`` on each split independently, and writes the
	    results to ``config.PROCESSED_TRAIN`` / ``config.PROCESSED_TEST`` without
	    the index.

	    Returns:
	        ``(train_path, test_path)`` — the two output locations as strings.
	    """
	    config.ensure_dirs()
	    logger.info("Reading %s", config.RAW_PARQUET)
	    raw = pd.read_parquet(config.RAW_PARQUET)

	    # Select rows by year membership; indices are reset so each split starts
	    # at 0 regardless of where its rows sat in the raw frame.
	    year = raw["Year"]
	    train_rows = raw.loc[year.isin(config.TRAIN_YEARS)].reset_index(drop=True)
	    test_rows = raw.loc[year.isin(config.TEST_YEARS)].reset_index(drop=True)
	    logger.info("Year split -> train=%s, test=%s", len(train_rows), len(test_rows))

	    # Feature engineering sees each split on its own, never the combined frame.
	    train_features = apply_feature_engineering(train_rows)
	    test_features = apply_feature_engineering(test_rows)

	    train_out = config.PROCESSED_TRAIN
	    test_out = config.PROCESSED_TEST
	    train_features.to_parquet(train_out, index=False)
	    test_features.to_parquet(test_out, index=False)
	    logger.info("Wrote %s and %s", train_out, test_out)

	    return str(train_out), str(test_out)


	def build_preprocessor() -> ColumnTransformer:
	    """Create the unfitted column transformer for the model inputs.

	    Columns in ``NUMERIC_FEATURES`` are passed through untouched. Columns in
	    ``CATEGORICAL_FEATURES`` are one-hot encoded into a dense array
	    (``sparse_output=False``), with ``handle_unknown="ignore"`` so categories
	    not seen at fit time do not raise at transform time. Any column not named
	    in either list is dropped.

	    Returns:
	        A new ``ColumnTransformer``; the caller is responsible for fitting it.
	    """
	    one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
	    column_steps = [
	        ("numeric", "passthrough", NUMERIC_FEATURES),
	        ("categorical", one_hot, CATEGORICAL_FEATURES),
	    ]
	    return ColumnTransformer(transformers=column_steps, remainder="drop")


	if __name__ == "__main__":
	    # Script entry point: run the preprocessing step, then print a short
	    # summary of what was written and which columns the model will use.
	    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(name)s %(message)s")
	    train_path, test_path = run()
	    # run() returns only the output paths, so the files are read back here
	    # to report their shapes.
	    train_fe = pd.read_parquet(train_path)
	    test_fe = pd.read_parquet(test_path)
	    # Plain string: this line has no placeholders, so it needs no f-prefix.
	    print("\nPreprocess complete:")
	    print(f" Train: {train_fe.shape} -> {train_path}")
	    print(f" Test: {test_fe.shape} -> {test_path}")
	    print(f"\nFeature columns to model ({len(FEATURE_COLUMNS)}):")
	    print(f" numeric ({len(NUMERIC_FEATURES)}): {NUMERIC_FEATURES}")
	    print(f" categorical ({len(CATEGORICAL_FEATURES)}): {CATEGORICAL_FEATURES}")