# File size: 1,946 Bytes
# 20fdb7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def load_csv(path):
    """Read a CSV file into a DataFrame.

    Args:
        path: Location of the CSV file; anything ``pandas.read_csv``
            accepts (path string, ``os.PathLike``, or file-like object).

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    # Honor the caller's argument; a fixed location used to be read
    # regardless of what was passed in.
    return pd.read_csv(path)

def build_preprocessing_pipeline(df, categorical_cols=None, numeric_cols=None, scale=True):
    """Build a ``ColumnTransformer`` that imputes, scales and one-hot encodes.

    Numeric columns are median-imputed and, when ``scale`` is true,
    standardized. Categorical columns are imputed with the most frequent
    value and one-hot encoded (categories unseen at fit time are ignored).
    Columns in neither list are dropped from the output.

    Args:
        df: DataFrame used only to auto-detect column lists by dtype.
        categorical_cols: Categorical column names. When ``None``, every
            ``object``/``category`` column of ``df`` is used.
        numeric_cols: Numeric column names. When ``None``, every numeric
            column of ``df`` is used.
        scale: Whether to apply ``StandardScaler`` to numeric columns.

    Returns:
        A tuple ``(preprocessor, numeric_cols, categorical_cols)`` holding
        the unfitted transformer and the column lists it was built with.

    Note:
        Auto-detection does not exclude anything: if ``df`` still contains
        the target or an index-like column, the caller must remove it from
        ``df`` (or pass explicit column lists) beforehand.
    """
    # Auto-detect by dtype when the caller gave no explicit lists.
    if categorical_cols is None:
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    if numeric_cols is None:
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    # Numeric preprocessing. With no numeric columns, fall back to the
    # 'passthrough' placeholder: a bare list is not a valid transformer and
    # makes ColumnTransformer raise when it validates its transformers.
    if numeric_cols:
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler() if scale else 'passthrough'),
        ])
    else:
        numeric_transformer = 'passthrough'

    # Categorical preprocessing, with the same placeholder when empty.
    if categorical_cols:
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ])
    else:
        categorical_transformer = 'passthrough'

    # sparse_threshold=0 makes the combined output always dense, even
    # though the one-hot encoder may emit a sparse matrix.
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols),
    ], remainder='drop', sparse_threshold=0)

    return preprocessor, numeric_cols, categorical_cols

def split_features_target(df, target_col, test_size=0.2, random_state=42):
    """Split a DataFrame into train/test features and target.

    The split is stratified on the target when that is possible, i.e. the
    target has more than one distinct value and every value occurs at least
    twice. Otherwise (single-class target, continuous target, or a class
    with a single row) a plain random split is used instead of letting
    ``train_test_split`` raise.

    Args:
        df: DataFrame containing both the features and the target column.
        target_col: Name of the target column in ``df``.
        test_size: Test-set size, forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Stratification needs at least two classes and at least two rows per
    # class; missing values are counted as a class of their own.
    class_counts = y.value_counts(dropna=False)
    can_stratify = len(class_counts) > 1 and class_counts.min() >= 2

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y if can_stratify else None,
        test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test