Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.impute import SimpleImputer | |
| from sklearn.preprocessing import StandardScaler, OneHotEncoder | |
| from sklearn.compose import ColumnTransformer | |
| from sklearn.pipeline import Pipeline | |
def load_csv(path):
    """Load a CSV file into a pandas DataFrame.

    Args:
        path: Location of the CSV file. It is forwarded unchanged to
            ``pandas.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    # Read from the caller-supplied location instead of a fixed file so the
    # ``path`` argument is actually honoured.
    return pd.read_csv(path)
def build_preprocessing_pipeline(df, categorical_cols=None, numeric_cols=None, scale=True):
    """Build an unfitted ``ColumnTransformer`` for numeric and categorical columns.

    Numeric columns are median-imputed and, when ``scale`` is true,
    standardised. Categorical columns are imputed with the most frequent
    value and one-hot encoded with ``handle_unknown='ignore'``. Columns that
    appear in neither list are dropped, and ``sparse_threshold=0`` requests
    dense output.

    Args:
        df: DataFrame used only to auto-detect column lists that are not given.
        categorical_cols: Names of the categorical columns. ``None`` selects
            every ``object``/``category`` dtype column of ``df``.
        numeric_cols: Names of the numeric columns. ``None`` selects every
            numeric-dtype column of ``df``.
        scale: Whether to apply ``StandardScaler`` after numeric imputation.

    Returns:
        A tuple ``(preprocessor, numeric_cols, categorical_cols)``: the
        unfitted transformer and the column lists it was built with.

    Note:
        Auto-detection does not exclude the target or index-like columns;
        the caller must remove them from ``df`` or pass explicit lists.
    """
    # Auto-detect column groups that were not supplied.
    if categorical_cols is None:
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    if numeric_cols is None:
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    # Numeric preprocessing.
    if numeric_cols:
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler() if scale else 'passthrough'),
        ])
    else:
        # ColumnTransformer only accepts an estimator, 'drop' or
        # 'passthrough'; a bare empty list would be rejected when fitting.
        # With no columns selected, 'passthrough' contributes nothing.
        numeric_transformer = 'passthrough'

    # Categorical preprocessing.
    if categorical_cols:
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ])
    else:
        categorical_transformer = 'passthrough'

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_cols),
            ('cat', categorical_transformer, categorical_cols),
        ],
        remainder='drop',
        sparse_threshold=0,
    )
    return preprocessor, numeric_cols, categorical_cols
def split_features_target(df, target_col, test_size=0.2, random_state=42):
    """Split a DataFrame into train/test feature and target sets.

    The split is stratified on the target only when that is feasible: the
    target has more than one distinct value and every distinct value occurs
    at least twice. Otherwise (for example a continuous target, or a class
    with a single row) a plain shuffled split is used instead of failing.

    Args:
        df: DataFrame containing both the features and the target column.
        target_col: Name of the target column; it is removed from the features.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        KeyError: If ``target_col`` is not a column of ``df``.
    """
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Stratified splitting needs at least two rows per class; asking for it
    # on a target with singleton classes makes train_test_split raise.
    class_counts = y.value_counts(dropna=False)
    can_stratify = len(class_counts) > 1 and class_counts.min() >= 2

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        stratify=y if can_stratify else None,
        test_size=test_size,
        random_state=random_state,
    )
    return X_train, X_test, y_train, y_test