import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, RobustScaler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sksurv.util import Surv
class Preprocessor:
    """Download a tabular churn dataset and prepare train/test splits.

    Two pipelines are provided:
      * ``process_cp``   -- churn prediction (single target column), with
        optional over-/under-sampling and feature scaling.
      * ``process_ttcp`` -- time-to-churn (survival) preparation: the pair of
        target columns (event, time) is converted to ``sksurv`` structured
        arrays after splitting and scaling.

    Parameters
    ----------
    dataset_url : str
        URL or path accepted by ``pandas.read_csv``.
    target_column : str
        Name of the classification target column.
    target_columnn_ttc : sequence of str
        (event_column, time_column) pair for the survival pipeline.
        NOTE(review): the parameter name keeps its original misspelling
        ('columnn') for backward compatibility with existing callers.
    resampling : str or None
        'over', 'under', or None (no resampling). Classification pipeline only.
    scaling : str
        One of 'minmax' (default), 'standard', 'robust' (case-insensitive).
    test_size : float
        Fraction of rows held out for the test split.
    random_state : int
        Seed used for splitting and resampling.
    """

    _VALID_SCALERS = ('standard', 'robust', 'minmax')

    def __init__(self, dataset_url, target_column, target_columnn_ttc,
                 resampling=None, scaling='minmax', test_size=0.2, random_state=42):
        self.url = dataset_url
        self.target_column = target_column
        self.target_column_ttc = target_columnn_ttc
        self.resampling = resampling
        self.scaling = scaling
        self.test_size = test_size
        self.random_state = random_state
        self.label_encoders = {}   # column name -> fitted LabelEncoder
        self.scaler = None         # fitted scaler, set by the train pass

    def _download_dataset(self):
        """Read the CSV at ``self.url`` and drop identifier columns.

        'id', 'Surname' and 'CustomerId' carry no predictive signal.
        (The original also bound ``d.id`` to an unused variable; removed.)
        """
        df = pd.read_csv(self.url)
        df.drop(['id', 'Surname', 'CustomerId'], axis=1, inplace=True)
        return df

    def _encode_categorical(self):
        """Label-encode every object/category column of ``self.df`` in place.

        Fitted encoders are kept in ``self.label_encoders`` so the mapping
        can be inverted later.
        """
        categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns
        for col in categorical_cols:
            le = LabelEncoder()
            self.df[col] = le.fit_transform(self.df[col].astype(str))
            self.label_encoders[col] = le

    def _build_scaler(self, name):
        """Instantiate a scaler from its (already lowercased) name."""
        if name == 'standard':
            return StandardScaler()
        if name == 'robust':
            return RobustScaler()
        return MinMaxScaler(feature_range=(0, 1))

    def _apply_scaling(self, df, scaler, feature_cols, df_type):
        """Scale ``df``; fit on 'train', transform-only otherwise.

        ``scaler`` is either a name string (train pass) or an already-fitted
        scaler object (test pass). Returns ``(scaled_df, scaler)``.

        BUGFIX: the original validated the scaler name case-insensitively but
        then branched with case-SENSITIVE comparisons, so e.g. 'MinMax'
        passed validation, no scaler was ever built, and the later
        ``fit_transform`` call crashed on a plain string. The name is now
        normalized before dispatch.
        """
        if isinstance(scaler, str):
            name = scaler.lower()
            if name not in self._VALID_SCALERS:
                # Preserve original best-effort behavior: warn and return
                # the data unscaled rather than raising.
                print("Invalid scaler. Choose one between 'standard', 'robust' or 'minmax'")
                return df, scaler
            scaler = self._build_scaler(name)
        if df_type == 'train':
            scaled = scaler.fit_transform(df)
            self.scaler = scaler
        else:
            scaled = scaler.transform(df)
        return pd.DataFrame(scaled, columns=feature_cols, index=df.index), scaler

    def _scale_features(self, df, scaler, df_type='train'):
        """Scale the classification feature matrix (all columns but the target)."""
        feature_cols = self.df.drop(columns=[self.target_column]).columns
        return self._apply_scaling(df, scaler, feature_cols, df_type)

    def _scale_features_ttc(self, df, scaler, df_type='train'):
        """Scale the survival feature matrix (all columns but the two targets)."""
        feature_cols = self.df.drop(list(self.target_column_ttc), axis=1).columns
        return self._apply_scaling(df, scaler, feature_cols, df_type)

    def _make_sampler(self):
        """Return the resampler requested by ``self.resampling``, or None.

        BUGFIX: the original hard-coded ``random_state=42`` here, ignoring
        the ``random_state`` passed to the constructor.
        """
        if isinstance(self.resampling, str):
            mode = self.resampling.lower()
            if mode == 'over':
                return RandomOverSampler(sampling_strategy='auto',
                                         random_state=self.random_state)
            if mode == 'under':
                return RandomUnderSampler(sampling_strategy='auto',
                                          random_state=self.random_state)
        return None

    def _split_data(self):
        """Optionally resample, then split into train/test for classification."""
        X = self.df.drop(columns=[self.target_column])
        y = self.df[self.target_column]
        sampler = self._make_sampler()
        # NOTE(review): resampling BEFORE the split lets duplicated/removed
        # rows leak across the train/test boundary; kept for backward
        # compatibility, but consider resampling the training fold only.
        if sampler is not None:
            X, y = sampler.fit_resample(X, y)
        return train_test_split(X, y, test_size=self.test_size,
                                random_state=self.random_state)

    def _split_data_ttc(self):
        """Train/test split for the survival pipeline (no resampling --
        the multi-column y rules it out, as the original's dead code noted).
        Debug prints and the commented-out resampling block were removed.
        """
        target_cols = list(self.target_column_ttc)
        X = self.df.drop(target_cols, axis=1)
        y = self.df[self.target_column_ttc]
        return train_test_split(X, y, test_size=self.test_size,
                                random_state=self.random_state)

    def process_cp(self):
        """Run the full churn-prediction pipeline.

        Returns
        -------
        (X_train, X_test, y_train, y_test,
         X_train_df, X_test_df, y_train_df, y_test_df)
            Scaled splits followed by unscaled copies of the same splits.
        """
        self.df = self._download_dataset()
        self._encode_categorical()
        X_train, X_test, y_train, y_test = self._split_data()
        # Keep unscaled copies for inspection / inverse lookups.
        X_train_df, X_test_df = X_train.copy(), X_test.copy()
        y_train_df, y_test_df = y_train.copy(), y_test.copy()
        X_train, scaler = self._scale_features(X_train, self.scaling, 'train')
        X_test, _ = self._scale_features(X_test, scaler, 'test')
        return (X_train, X_test, y_train, y_test,
                X_train_df, X_test_df, y_train_df, y_test_df)

    def process_ttcp(self):
        """Run the full time-to-churn (survival) pipeline.

        Returns the same 8-tuple shape as ``process_cp``, except the scaled
        ``y_train``/``y_test`` are ``sksurv`` structured arrays built from the
        (event, time) column pair.
        """
        self.df = self._download_dataset()
        self._encode_categorical()
        X_train, X_test, y_train, y_test = self._split_data_ttc()
        X_train_df, X_test_df = X_train.copy(), X_test.copy()
        y_train_df, y_test_df = y_train.copy(), y_test.copy()
        X_train, scaler = self._scale_features_ttc(X_train, self.scaling, 'train')
        X_test, _ = self._scale_features_ttc(X_test, scaler, 'test')
        event_col, time_col = self.target_column_ttc[0], self.target_column_ttc[1]
        y_train = Surv.from_dataframe(event_col, time_col, y_train)
        y_test = Surv.from_dataframe(event_col, time_col, y_test)
        return (X_train, X_test, y_train, y_test,
                X_train_df, X_test_df, y_train_df, y_test_df)