import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


def load_csv(path):
    """Load a CSV file into a DataFrame.

    Args:
        path: Filesystem path (or URL) of the CSV file to read.

    Returns:
        pandas.DataFrame with the file's contents.
    """
    # BUG FIX: the original ignored `path` and always read the hard-coded
    # "/content/merged.csv" (a Colab-notebook leftover). Honor the argument.
    return pd.read_csv(path)


def build_preprocessing_pipeline(df, categorical_cols=None, numeric_cols=None, scale=True):
    """Build a ColumnTransformer that imputes/encodes/scales the given columns.

    Args:
        df: DataFrame used only to auto-detect column dtypes when the
            explicit column lists are not supplied.
        categorical_cols: Optional list of categorical column names; when
            None, object/category dtype columns of ``df`` are used.
        numeric_cols: Optional list of numeric column names; when None, all
            numeric dtype columns of ``df`` are used. NOTE: a numeric target
            or index column must be excluded by the caller.
        scale: When True, standard-scale numeric columns after imputation.

    Returns:
        Tuple of (preprocessor, numeric_cols, categorical_cols) where
        ``preprocessor`` is an unfitted ColumnTransformer that drops any
        column not listed and produces a dense array (sparse_threshold=0).
    """
    # Auto-detect column groups if not given.
    if categorical_cols is None:
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    if numeric_cols is None:
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    # Numeric branch: median-impute, then optionally scale.
    # BUG FIX: the original initialized this to [] and only built the
    # Pipeline when numeric_cols was non-empty — but an empty list is not a
    # valid ColumnTransformer transformer spec, so an all-categorical frame
    # crashed at fit time. Mirror the categorical branch: fall back to
    # 'passthrough' when the column list is empty.
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler() if scale else 'passthrough')
    ]) if numeric_cols else 'passthrough'

    # Categorical branch: mode-impute, then one-hot encode (unknown
    # categories at transform time are ignored rather than raising).
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]) if categorical_cols else 'passthrough'

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ], remainder='drop', sparse_threshold=0)
    return preprocessor, numeric_cols, categorical_cols


def split_features_target(df, target_col, test_size=0.2, random_state=42):
    """Split a DataFrame into train/test features and target.

    Args:
        df: Input DataFrame containing features and the target column.
        target_col: Name of the target column to separate out.
        test_size: Fraction of rows assigned to the test split.
        random_state: Seed for reproducible shuffling.

    Returns:
        Tuple (X_train, X_test, y_train, y_test).
    """
    X = df.drop(columns=[target_col])
    y = df[target_col]
    # Stratify only when y has more than one distinct value; train_test_split
    # raises otherwise. NOTE(review): stratification can still fail for
    # continuous targets or classes with a single member — confirm callers
    # only pass classification targets here.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        stratify=y if len(np.unique(y)) > 1 else None,
        test_size=test_size,
        random_state=random_state,
    )
    return X_train, X_test, y_train, y_test