import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler


def prepare_data(df: pd.DataFrame):
    """Split *df* into train/test sets and build an unfitted preprocessor.

    The ``"AQI"`` column is the target. ``"AQI_Bucket"`` and ``"Date"`` are
    removed from the features when present (treated as leakage / unnecessary
    columns). The input frame itself is not modified.

    Args:
        df: Frame containing an ``"AQI"`` column plus feature columns.

    Returns:
        A 5-tuple ``(X_train, X_test, y_train, y_test, preprocessor)``.
        The split holds out 20% of the rows with ``random_state=42``.
        ``preprocessor`` is an unfitted ``ColumnTransformer`` that applies
        ``StandardScaler`` to numeric columns and
        ``OneHotEncoder(handle_unknown="ignore")`` to object/category columns.

    Raises:
        KeyError: If *df* has no ``"AQI"`` column.
    """
    if "AQI" not in df.columns:
        raise KeyError("prepare_data requires an 'AQI' target column")

    # NOTE(review): rows whose AQI is missing are not filtered out here;
    # confirm the caller cleans the target before fitting a model.
    target = df["AQI"]

    # One drop call covers the target and the optional leakage columns;
    # errors="ignore" keeps the absence of AQI_Bucket / Date harmless.
    features = df.drop(columns=["AQI", "AQI_Bucket", "Date"], errors="ignore")

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=0.2,
        random_state=42,
    )

    numeric_cols = list(features.select_dtypes(include=["number"]).columns)
    # Include pandas "category" dtype as well as object: a column listed in
    # neither transformer falls under ColumnTransformer's default
    # remainder="drop" and would silently disappear from the model input.
    categorical_cols = list(
        features.select_dtypes(include=["object", "category"]).columns
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), numeric_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ]
    )

    return X_train, X_test, y_train, y_test, preprocessor