Spaces:
Sleeping
Sleeping
File size: 1,946 Bytes
20fdb7e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
def load_csv(path):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    # Bug fix: the original ignored `path` entirely and always read the
    # hard-coded Colab path "/content/merged.csv".
    return pd.read_csv(path)
def build_preprocessing_pipeline(df, categorical_cols=None, numeric_cols=None, scale=True):
    """Build a ColumnTransformer that imputes/encodes categorical columns
    and imputes/scales numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Used only to auto-detect column types when the column lists are None.
    categorical_cols : list of str, optional
        Columns to mode-impute and one-hot encode; auto-detected from
        object/category dtypes when None.
    numeric_cols : list of str, optional
        Columns to median-impute (and optionally scale); auto-detected from
        numeric dtypes when None.
    scale : bool, default True
        If True, standard-scale numeric columns after imputation.

    Returns
    -------
    (preprocessor, numeric_cols, categorical_cols)
        The unfitted ColumnTransformer plus the resolved column lists.

    Notes
    -----
    Numeric columns that are actually targets or indices must be removed
    by the caller before (or after) calling this function.
    """
    # Auto-detect column groups if not given.
    if categorical_cols is None:
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    if numeric_cols is None:
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    # Numeric preprocessing: median imputation, then optional scaling.
    # Bug fix: the original initialized this to an empty list, which is not a
    # valid ColumnTransformer transformer — with no numeric columns, fit()
    # would raise. 'drop' is the correct no-op specifier.
    if numeric_cols:
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler() if scale else 'passthrough')
        ])
    else:
        numeric_transformer = 'drop'

    # Categorical preprocessing: mode imputation, then one-hot encoding
    # (unknown categories at transform time are ignored, not errors).
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]) if categorical_cols else 'passthrough'

    # sparse_threshold=0 forces a dense output array even when the one-hot
    # encoding would otherwise produce a sparse matrix.
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ], remainder='drop', sparse_threshold=0)
    return preprocessor, numeric_cols, categorical_cols
def split_features_target(df, target_col, test_size=0.2, random_state=42):
    """Separate the target column from the features and make a train/test split.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset containing both features and the target column.
    target_col : str
        Name of the target column to split off.
    test_size : float, default 0.2
        Fraction of rows assigned to the test set.
    random_state : int, default 42
        Seed for reproducible splits.

    Returns
    -------
    (X_train, X_test, y_train, y_test)
        Stratified on the target when it has more than one distinct value.
    """
    features = df.drop(columns=[target_col])
    target = df[target_col]
    # Stratification requires at least two classes; fall back to a plain
    # random split for a constant target.
    stratify_on = target if len(np.unique(target)) > 1 else None
    return train_test_split(
        features,
        target,
        stratify=stratify_on,
        test_size=test_size,
        random_state=random_state,
    )
|