File size: 6,405 Bytes
bd9910a 9995a6a bd9910a 9995a6a bd9910a 9995a6a 2a3fc10 9995a6a 2a3fc10 9995a6a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
from numpy import nan, ndarray
from pandas import DataFrame, concat
from scipy.sparse import spmatrix
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
def preprocess_data(train_df: DataFrame, test_df: DataFrame) -> tuple[ndarray, ndarray]:
    """
    Pre process data for modeling. Receives train and test dataframes, cleans them up, and returns ndarrays with feature engineering already performed.

    Every transformer (encoders, imputer, scaler) is fit on the training split
    only and then applied to both splits, so no information leaks from test to
    train.

    Args:
        train_df (DataFrame): The training dataframe.
        test_df (DataFrame): The test dataframe.
    Returns:
        tuple[ndarray, ndarray]: A tuple with the preprocessed train and test data as ndarrays
    """
    # Work on copies so the caller's dataframes are left untouched.
    aux_train_df = train_df.copy()
    aux_test_df = test_df.copy()

    # [1] Correct outliers/anomalous values in numerical columns.
    # 365243 is a sentinel in DAYS_EMPLOYED; convert it to NaN so the imputer
    # below fills it instead of it skewing the scaler's range.
    aux_train_df["DAYS_EMPLOYED"] = aux_train_df["DAYS_EMPLOYED"].replace({365243: nan})
    aux_test_df["DAYS_EMPLOYED"] = aux_test_df["DAYS_EMPLOYED"].replace({365243: nan})

    # [2] Encode string categorical features (split by cardinality on train).
    categorical_cols = aux_train_df.select_dtypes(include="object").columns
    binary_cols = [col for col in categorical_cols if aux_train_df[col].nunique() == 2]
    multi_cols = [col for col in categorical_cols if aux_train_df[col].nunique() > 2]

    # [2.1] Encode Binary Categorical Features.
    # handle_unknown/unknown_value mirror the OneHotEncoder below: a category
    # that appears only in the test set becomes NaN (later filled by the
    # median imputer in step [3]) instead of raising ValueError at transform.
    ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=nan)
    ordinal_encoder.fit(aux_train_df[binary_cols])
    aux_train_df[binary_cols] = ordinal_encoder.transform(aux_train_df[binary_cols])
    aux_test_df[binary_cols] = ordinal_encoder.transform(aux_test_df[binary_cols])

    # [2.2] Encode Multi Categorical Features.
    one_hot_encoder = OneHotEncoder(
        handle_unknown="ignore",  # Prevents errors when test set contains categories that didn't appear in train dataframe
        sparse_output=False,  # Returns a dense array instead of a sparse matrix
    )
    one_hot_encoder.fit(aux_train_df[multi_cols])
    ohe_train = one_hot_encoder.transform(aux_train_df[multi_cols])
    ohe_test = one_hot_encoder.transform(aux_test_df[multi_cols])
    # Get column names derived from the fitted categories.
    ohe_cols = one_hot_encoder.get_feature_names_out(input_features=multi_cols)
    # Convert arrays to DataFrames, keeping the original row index so the
    # concat below aligns rows correctly.
    ohe_train_df = DataFrame(data=ohe_train, columns=ohe_cols, index=aux_train_df.index)  # type: ignore
    ohe_test_df = DataFrame(data=ohe_test, columns=ohe_cols, index=aux_test_df.index)  # type: ignore
    # Drop original multi category columns and append the encoded ones.
    aux_train_df.drop(columns=multi_cols, inplace=True)
    aux_test_df.drop(columns=multi_cols, inplace=True)
    aux_train_df = concat([aux_train_df, ohe_train_df], axis=1)
    aux_test_df = concat([aux_test_df, ohe_test_df], axis=1)

    # [3] Impute values for columns with missing data (train medians applied
    # to both splits).
    imputer = SimpleImputer(strategy="median")
    imputer.fit(aux_train_df)
    imputer_train = imputer.transform(aux_train_df)
    imputer_test = imputer.transform(aux_test_df)
    aux_train_df = DataFrame(
        data=imputer_train,  # type: ignore
        columns=aux_train_df.columns,
        index=aux_train_df.index,
    )
    aux_test_df = DataFrame(
        data=imputer_test,  # type: ignore
        columns=aux_test_df.columns,
        index=aux_test_df.index,
    )

    # [4] Feature Scaling with Min-Max Scaler (fit on train only; test values
    # outside the train min/max will fall outside [0, 1]).
    scaler = MinMaxScaler()
    scaler.fit(aux_train_df)
    scaler_train = scaler.transform(aux_train_df)
    scaler_test = scaler.transform(aux_test_df)
    return scaler_train, scaler_test
def preprocess_data_pipeline(
    train_df: DataFrame, test_df: DataFrame
) -> tuple[ndarray | spmatrix, ndarray | spmatrix]:
    """
    Pre process data for modeling. Receives train and test dataframes, cleans them up, and returns ndarrays with feature engineering already performed.

    Same transformations as `preprocess_data`, expressed as a single
    ColumnTransformer fit on the training split only.

    Args:
        train_df (DataFrame): The training dataframe.
        test_df (DataFrame): The test dataframe.
    Returns:
        tuple[ndarray | spmatrix, ndarray | spmatrix]: A tuple with the preprocessed train and test data (dense here, but sparse if sklearn's output heuristics change)
    """
    # Create copies to avoid modifying original dataframes
    aux_train_df = train_df.copy()
    aux_test_df = test_df.copy()

    # [1] Correct outliers/anomalous values in numerical columns.
    # 365243 is a sentinel in DAYS_EMPLOYED; convert it to NaN so the imputers
    # in the pipelines below fill it instead of it skewing the scaler's range.
    aux_train_df["DAYS_EMPLOYED"] = aux_train_df["DAYS_EMPLOYED"].replace({365243: nan})
    aux_test_df["DAYS_EMPLOYED"] = aux_test_df["DAYS_EMPLOYED"].replace({365243: nan})

    # [2] Define column types for the ColumnTransformer (cardinality judged
    # on the training split).
    numerical_cols = aux_train_df.select_dtypes(include="number").columns.to_list()
    categorical_cols = aux_train_df.select_dtypes(include="object").columns.to_list()
    binary_cols = [col for col in categorical_cols if aux_train_df[col].nunique() == 2]
    multi_cols = [col for col in categorical_cols if aux_train_df[col].nunique() > 2]

    # [3] Build the preprocessing pipeline using ColumnTransformer
    # Pipeline for numerical columns: impute missing values, then scale.
    numerical_pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", MinMaxScaler()),
        ]
    )
    # Pipeline for binary categorical columns.
    # handle_unknown/unknown_value mirror the OneHotEncoder below: a category
    # that appears only in the test set becomes NaN instead of raising a
    # ValueError at transform time (MinMaxScaler passes NaN through).
    binary_pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ordinal", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=nan)),
            ("scaler", MinMaxScaler()),
        ]
    )
    # Pipeline for multi-category columns: impute, one-hot encode, scale.
    multi_pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
            ("scaler", MinMaxScaler()),
        ]
    )
    # Create a ColumnTransformer object with the defined pipelines and transformers
    preprocessor = ColumnTransformer(
        transformers=[
            # Tuple format: ('name', transformer, list_of_columns)
            ("binary", binary_pipeline, binary_cols),
            ("multi", multi_pipeline, multi_cols),
            ("numerical", numerical_pipeline, numerical_cols),
        ],
        remainder="passthrough",  # columns matching no selector are forwarded untransformed
    )

    # [4] Fit on train only, then transform both splits (no test leakage).
    preprocessor.fit(aux_train_df)
    train_preprocessed = preprocessor.transform(aux_train_df)
    test_preprocessed = preprocessor.transform(aux_test_df)
    return train_preprocessed, test_preprocessed
|