| import pandas as pd
|
| import numpy as np
|
| import pickle
|
| from sklearn.base import BaseEstimator, TransformerMixin
|
| from sklearn.preprocessing import MinMaxScaler
|
| from sklearn.preprocessing import LabelEncoder
|
| from sklearn.preprocessing import RobustScaler
|
|
|
class AutoPreprocess(BaseEstimator, TransformerMixin):
    """Column-wise preprocessing learned from a DataFrame.

    ``fit`` inspects each selected column and records:

    * a fill value for missing entries (most frequent value for text and
      boolean columns, median otherwise);
    * a fitted scaler for non-text, non-boolean columns that are not already
      made up solely of 0/1 values (``MinMaxScaler`` for integer columns with
      at most 10 distinct values, ``RobustScaler`` otherwise);
    * the observed categories of text columns, used for one-hot expansion.

    ``transform`` applies those steps and returns only the columns listed in
    ``final_field_names``.

    Attributes:
        scaler: maps field name -> fitted scaler (only for scaled fields).
        fillna_value: maps field name -> value used to fill missing entries.
        onehotencode_value: maps text field name -> observed categories.
        field_names: input fields handled by this instance.
        final_field_names: output column names, in output order.
        field_dtype: maps field name -> dtype seen during ``fit``.
    """

    def __init__(self):
        self._reset_state()

    def _reset_state(self):
        """Drop everything learned so far so that ``fit`` starts clean."""
        self.scaler = {}
        self.fillna_value = {}
        self.onehotencode_value = {}
        self.field_names = []
        self.final_field_names = []
        self.field_dtype = {}

    @staticmethod
    def _field_kind(column):
        """Classify a column as ``"string"``, ``"bool"`` or ``"numeric"``.

        The check runs on the non-null values only: on pandas versions where
        ``is_string_dtype`` inspects the elements of object columns, a missing
        value would otherwise make a text column look non-text.
        """
        observed = column.dropna()
        if pd.api.types.is_string_dtype(observed):
            return "string"
        if pd.api.types.is_bool_dtype(observed):
            return "bool"
        return "numeric"

    def _fit_scaler(self, X, fname):
        """Fit and store a scaler for numeric field ``fname`` when one is needed."""
        column = X[fname]
        # Columns holding only 0/1 are left unscaled.
        if column.isin([0, 1]).all():
            return
        if pd.api.types.is_integer_dtype(column) and column.nunique() <= 10:
            scaler = MinMaxScaler()
        else:
            scaler = RobustScaler()
        scaler.fit(X[[fname]])
        self.scaler[fname] = scaler

    def fit(self, X, y=None, field_names=None):
        """Learn fill values, scalers and one-hot categories from ``X``.

        Args:
            X: DataFrame containing the training data.
            y: ignored; accepted for scikit-learn API compatibility.
            field_names: columns to process; defaults to every column of ``X``.

        Returns:
            self
        """
        self._reset_state()
        if field_names is None:
            self.field_names = X.columns.tolist()
        else:
            # Copy so later changes to the caller's list cannot affect us.
            self.field_names = list(field_names)

        for fname in self.field_names:
            column = X[fname]
            # Keyed per field; the mapping must not be replaced by one dtype.
            self.field_dtype[fname] = column.dtype
            kind = self._field_kind(column)

            if kind == "string":
                self.fillna_value[fname] = column.mode()[0]
                categories = column.value_counts().index
                self.onehotencode_value[fname] = categories
                # One output column per observed category.
                self.final_field_names.extend(
                    f"{fname}_{value}" for value in categories
                )
            elif kind == "bool":
                self.fillna_value[fname] = column.mode()[0]
                self.final_field_names.append(fname)
            else:
                self.fillna_value[fname] = column.median()
                self._fit_scaler(X, fname)
                self.final_field_names.append(fname)

        return self

    def transform(self, X):
        """Apply the fitted preprocessing to ``X``.

        Args:
            X: a DataFrame, or a dict describing a single record
                (field name -> value). Fields missing from the dict are
                filled with the value learned in ``fit``. The dict is not
                modified.

        Returns:
            DataFrame restricted to ``final_field_names``.
        """
        if isinstance(X, dict):
            # Build a fresh one-row mapping: every value is wrapped in a list
            # so the frame can be built even when no fitted field is present,
            # and the caller's dict is left untouched.
            record = {
                fname: [X[fname]] if fname in X else [self.fillna_value[fname]]
                for fname in self.field_names
            }
            data = pd.DataFrame(record)
        else:
            data = X.copy()

        for fname in self.field_names:
            if data[fname].isnull().any():
                data[fname] = data[fname].fillna(self.fillna_value[fname])

            if fname in self.scaler:
                data[fname] = self.scaler[fname].transform(data[[fname]])

            if fname in self.onehotencode_value:
                # Decided by what was learned in fit, not by the incoming
                # dtype, so every expected indicator column always exists;
                # categories unseen during fit yield all-zero indicators.
                for value in self.onehotencode_value[fname]:
                    data[f"{fname}_{value}"] = (data[fname] == value).astype("int8")
            elif pd.api.types.is_bool_dtype(data[fname]):
                data[fname] = data[fname].astype(int)

        return data[self.final_field_names]

    def save(self, file_name):
        """Pickle this instance to ``file_name``."""
        with open(file_name, "wb") as f:
            pickle.dump(self, f)

    @staticmethod
    def load(file_name):
        """Unpickle and return an instance previously written by ``save``.

        NOTE(security): ``pickle.load`` can execute arbitrary code; only load
        files that come from a trusted source.
        """
        with open(file_name, "rb") as f:
            return pickle.load(f)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|