# File size: 6,476 Bytes
# f8da90e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, RobustScaler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sksurv.util import Surv

class Preprocessor:
    """End-to-end tabular preprocessing for churn prediction.

    Two pipelines are provided:

    * ``process_cp``   -- classification: single binary target column.
    * ``process_ttcp`` -- time-to-churn (survival): event + time target
      columns, converted to a ``sksurv`` structured array.

    Both pipelines download the CSV, label-encode categorical columns,
    split into train/test and scale the feature columns.
    """

    #: scaler names accepted by the ``scaling`` argument
    _SCALER_NAMES = ('standard', 'robust', 'minmax')

    def __init__(self, dataset_url, target_column, target_columnn_ttc,
                 resampling=None, scaling='minmax', test_size=0.2, random_state=42):
        """
        Parameters
        ----------
        dataset_url : str
            URL or path of the CSV dataset; must contain 'id', 'Surname'
            and 'CustomerId' columns, which are dropped on load.
        target_column : str
            Name of the binary classification target.
        target_columnn_ttc : sequence of str
            Target columns for the survival pipeline, ordered
            ``(event_column, time_column)``.  (The parameter keeps its
            historical misspelling for backward compatibility with
            keyword callers.)
        resampling : str or None
            'over' for random over-sampling, 'under' for random
            under-sampling; anything else disables resampling.
        scaling : str
            One of 'standard', 'robust', 'minmax' (case-insensitive).
        test_size : float
            Fraction of the data held out for testing.
        random_state : int
            Seed used for resampling and the train/test split.
        """
        self.url = dataset_url
        self.target_column = target_column
        self.target_column_ttc = target_columnn_ttc
        self.resampling = resampling
        self.scaling = scaling
        self.test_size = test_size
        self.random_state = random_state
        self.label_encoders = {}
        self.scaler = None

    def _download_dataset(self):
        """Load the CSV and drop the identifier columns nobody trains on."""
        df = pd.read_csv(self.url)
        df.drop(columns=['id', 'Surname', 'CustomerId'], inplace=True)
        return df

    def _encode_categorical(self):
        """Label-encode every object/category column of ``self.df`` in place.

        The fitted encoders are kept in ``self.label_encoders`` so the
        mapping can be inverted later.
        """
        categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns
        for col in categorical_cols:
            le = LabelEncoder()
            # astype(str) guards against mixed/NaN values in the column
            self.df[col] = le.fit_transform(self.df[col].astype(str))
            self.label_encoders[col] = le

    def _build_scaler(self, name):
        """Map a (lower-cased) scaler name to a fresh scaler instance."""
        if name == 'standard':
            return StandardScaler()
        if name == 'minmax':
            return MinMaxScaler(feature_range=(0, 1))
        return RobustScaler()

    def _apply_scaler(self, df, scaler, feature_cols, df_type):
        """Fit (train) or apply (test) a scaler to *df*.

        Parameters
        ----------
        df : pandas.DataFrame
            Feature matrix to scale.
        scaler : str or fitted scaler
            A scaler name on the training pass; the fitted scaler object
            returned by that pass on the test pass.
        feature_cols : sequence of str
            Column names for the resulting DataFrame.
        df_type : str
            'train' fits the scaler (and stores it on ``self.scaler``);
            anything else only transforms.

        Returns
        -------
        (pandas.DataFrame, scaler)
            Scaled features (same index) and the scaler used.
        """
        if isinstance(scaler, str):
            # validate case-insensitively, then dispatch on the lower-cased
            # name -- the original dispatched case-sensitively, so a value
            # like 'MinMax' passed validation but crashed on fit_transform
            if scaler.lower() not in self._SCALER_NAMES:
                print("Invalid scaler. Choose one between 'standard', 'robust' or 'minmax'")
                return df, scaler
            scaler = self._build_scaler(scaler.lower())

        if df_type == 'train':
            values = scaler.fit_transform(df)
            self.scaler = scaler
        else:
            values = scaler.transform(df)
        return pd.DataFrame(values, columns=feature_cols, index=df.index), scaler

    def _scale_features(self, df, scaler, df_type='train'):
        """Scale the classification features of *df*; see ``_apply_scaler``."""
        feature_cols = self.df.drop(columns=[self.target_column]).columns
        return self._apply_scaler(df, scaler, feature_cols, df_type)

    def _scale_features_ttc(self, df, scaler, df_type='train'):
        """Scale the survival-pipeline features of *df*; see ``_apply_scaler``."""
        feature_cols = self.df.drop(columns=list(self.target_column_ttc)).columns
        return self._apply_scaler(df, scaler, feature_cols, df_type)

    def _split_data(self):
        """Optionally resample, then split into train/test for classification.

        NOTE(review): resampling happens *before* the split, so duplicated
        over-sampled rows can land in both train and test sets (leakage).
        Kept as-is to preserve the pipeline's behaviour.
        """
        X = self.df.drop(columns=[self.target_column])
        y = self.df[self.target_column]

        mode = self.resampling.lower() if isinstance(self.resampling, str) else None
        if mode == 'over':
            sampler = RandomOverSampler(sampling_strategy='auto', random_state=self.random_state)
        elif mode == 'under':
            sampler = RandomUnderSampler(sampling_strategy='auto', random_state=self.random_state)
        else:
            sampler = None
        if sampler is not None:
            X, y = sampler.fit_resample(X, y)

        return train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state)

    def _split_data_ttc(self):
        """Split into train/test for the survival pipeline.

        No resampling is applied here: the targets are two columns
        (event, time) and the samplers only support a single target.
        """
        target_cols = list(self.target_column_ttc)
        X = self.df.drop(columns=target_cols)
        y = self.df[self.target_column_ttc]
        return train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state)

    def process_cp(self):
        """Run the classification pipeline.

        Returns
        -------
        tuple
            ``(X_train, X_test, y_train, y_test,
               X_train_df, X_test_df, y_train_df, y_test_df)`` where the
            first four are scaled (features) / encoded splits and the
            ``*_df`` copies preserve the unscaled values.
        """
        # 1) download the dataset
        self.df = self._download_dataset()
        # 2) encode categorical features
        self._encode_categorical()
        # 3) divide into train and test sets
        X_train, X_test, y_train, y_test = self._split_data()
        # keep unscaled copies for inspection / inverse lookups
        X_train_df = X_train.copy()
        X_test_df = X_test.copy()
        y_train_df = y_train.copy()
        y_test_df = y_test.copy()
        # 4) scale features (fit on train, reuse the fitted scaler on test)
        X_train, scaler = self._scale_features(X_train, self.scaling, 'train')
        X_test, scaler = self._scale_features(X_test, scaler, 'test')
        return X_train, X_test, y_train, y_test, X_train_df, X_test_df, y_train_df, y_test_df

    def process_ttcp(self):
        """Run the time-to-churn (survival) pipeline.

        Returns
        -------
        tuple
            Same layout as ``process_cp``, except ``y_train``/``y_test``
            are ``sksurv`` structured arrays built from the
            (event, time) target columns.
        """
        # 1) download the dataset
        self.df = self._download_dataset()
        # 2) encode categorical features
        self._encode_categorical()
        # 3) divide into train and test sets
        X_train, X_test, y_train, y_test = self._split_data_ttc()
        # keep unscaled copies for inspection / inverse lookups
        X_train_df = X_train.copy()
        X_test_df = X_test.copy()
        y_train_df = y_train.copy()
        y_test_df = y_test.copy()
        # 4) scale features (fit on train, reuse the fitted scaler on test)
        X_train, scaler = self._scale_features_ttc(X_train, self.scaling, 'train')
        X_test, scaler = self._scale_features_ttc(X_test, scaler, 'test')

        # convert the two target columns into a structured survival array:
        # target_column_ttc[0] = event indicator, [1] = time to event
        y_train = Surv.from_dataframe(self.target_column_ttc[0], self.target_column_ttc[1], y_train)
        y_test = Surv.from_dataframe(self.target_column_ttc[0], self.target_column_ttc[1], y_test)
        return X_train, X_test, y_train, y_test, X_train_df, X_test_df, y_train_df, y_test_df