In [1]:
import os

import numpy as np
import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer

from feature_engineering import discretization as dc

# plt.style.use('seaborn-colorblind')
# %matplotlib inline
#from feature_cleaning import rare_values as ra

## Load Dataset

In [2]:
# Only these columns are read from the CSV: five features plus the target.
use_cols = ['Pclass', 'Sex', 'Age', 'Fare', 'SibSp', 'Survived']

# Load the Titanic data, restricted to the columns listed above.
data = pd.read_csv(
    './data/titanic.csv',
    usecols=use_cols,
)


In [3]:
# Preview the first three rows of the loaded data
data.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
0,0,3,male,22.0,1,7.25
1,1,1,female,38.0,1,71.2833
2,1,3,female,26.0,0,7.925


In [4]:
# The target column is deliberately kept inside X_train / X_test because the
# supervised discretization needs it. This differs from the usual
# train/test split, where X holds the feature columns only.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    data['Survived'],
    test_size=0.3,
    random_state=0,
)
X_train.shape, X_test.shape

((623, 6), (268, 6))

## Equal width binning
Divides the range of possible values into N bins of equal width.

In [5]:
# Fit a 3-bin equal-width ('uniform') discretizer on the training fares only.
# KBinsDiscretizer is imported in the notebook's top import cell.
enc_equal_width = KBinsDiscretizer(
    n_bins=3,
    encode='ordinal',
    strategy='uniform',
).fit(X_train[['Fare']])

In [6]:
# The edges are evenly spaced, so every bin has the same width
enc_equal_width.bin_edges_

array([array([  0.    , 170.7764, 341.5528, 512.3292])], dtype=object)

In [7]:
# Count how many training fares fall into each equal-width bin.
result = enc_equal_width.transform(X_train[['Fare']])
pd.DataFrame(result).iloc[:, 0].value_counts()

0.0    610
1.0     11
2.0      2
Name: 0, dtype: int64

In [8]:
# Attach the equal-width bin code as a new column on a copy of X_train,
# leaving X_train itself unchanged.
X_train_copy = X_train.copy(deep=True).assign(
    Fare_equal_width=enc_equal_width.transform(X_train[['Fare']])
)
print(X_train_copy.head(10))

     Survived  Pclass     Sex   Age  SibSp      Fare  Fare_equal_width
857         1       1    male  51.0      0   26.5500               0.0
52          1       1  female  49.0      1   76.7292               0.0
386         0       3    male   1.0      5   46.9000               0.0
124         0       1    male  54.0      0   77.2875               0.0
578         0       3  female   NaN      1   14.4583               0.0
549         1       2    male   8.0      1   36.7500               0.0
118         0       1    male  24.0      0  247.5208               1.0
12          0       3    male  20.0      0    8.0500               0.0
157         0       3    male  30.0      0    8.0500               0.0
127         1       3    male  24.0      0    7.1417               0.0


## Equal frequency binning
Divides the range of possible values of the variable into N bins,
where each bin contains approximately the same number of observations.

In [9]:
# Fit a 3-bin equal-frequency ('quantile') discretizer on the training fares.
enc_equal_freq = KBinsDiscretizer(
    n_bins=3,
    encode='ordinal',
    strategy='quantile',
).fit(X_train[['Fare']])

In [10]:
# Inspect the bin edges chosen by the quantile (equal-frequency) strategy
enc_equal_freq.bin_edges_

array([array([  0.        ,   8.69303333,  26.2875    , 512.3292    ])],
      dtype=object)

In [11]:
# Each bin should hold roughly the same number of cases.
result = enc_equal_freq.transform(X_train[['Fare']])
pd.DataFrame(result).iloc[:, 0].value_counts()

2.0    209
0.0    208
1.0    206
Name: 0, dtype: int64

In [12]:
# Attach the equal-frequency bin code as a new column on a fresh copy of
# X_train, leaving X_train itself unchanged.
X_train_copy = X_train.copy(deep=True).assign(
    Fare_equal_freq=enc_equal_freq.transform(X_train[['Fare']])
)
print(X_train_copy.head(10))

     Survived  Pclass     Sex   Age  SibSp      Fare  Fare_equal_freq
857         1       1    male  51.0      0   26.5500              2.0
52          1       1  female  49.0      1   76.7292              2.0
386         0       3    male   1.0      5   46.9000              2.0
124         0       1    male  54.0      0   77.2875              2.0
578         0       3  female   NaN      1   14.4583              1.0
549         1       2    male   8.0      1   36.7500              2.0
118         0       1    male  24.0      0  247.5208              2.0
12          0       3    male  20.0      0    8.0500              0.0
157         0       3    male  30.0      0    8.0500              0.0
127         1       3    male  24.0      0    7.1417              0.0


## K-means binning
Uses k-means clustering to partition the values into clusters; each cluster becomes a bin.

In [13]:
# Fit a 3-bin discretizer whose edges come from the 'kmeans' strategy.
enc_kmeans = KBinsDiscretizer(
    n_bins=3,
    encode='ordinal',
    strategy='kmeans',
).fit(X_train[['Fare']])

In [14]:
# Inspect the bin edges chosen by the k-means strategy
enc_kmeans.bin_edges_

array([array([  0.        ,  93.5271531 , 338.08506324, 512.3292    ])],
      dtype=object)

In [15]:
# Count how many training fares fall into each k-means bin.
result = enc_kmeans.transform(X_train[['Fare']])
pd.DataFrame(result).iloc[:, 0].value_counts()

0.0    587
1.0     34
2.0      2
Name: 0, dtype: int64

In [16]:
# Attach the k-means bin code as a new column on a fresh copy of X_train,
# leaving X_train itself unchanged.
X_train_copy = X_train.copy(deep=True).assign(
    Fare_kmeans=enc_kmeans.transform(X_train[['Fare']])
)
print(X_train_copy.head(10))

     Survived  Pclass     Sex   Age  SibSp      Fare  Fare_kmeans
857         1       1    male  51.0      0   26.5500          0.0
52          1       1  female  49.0      1   76.7292          0.0
386         0       3    male   1.0      5   46.9000          0.0
124         0       1    male  54.0      0   77.2875          0.0
578         0       3  female   NaN      1   14.4583          0.0
549         1       2    male   8.0      1   36.7500          0.0
118         0       1    male  24.0      0  247.5208          1.0
12          0       3    male  20.0      0    8.0500          0.0
157         0       3    male  30.0      0    8.0500          0.0
127         1       3    male  24.0      0    7.1417          0.0


## Discretisation with Decision Tree
Uses a decision tree to identify the optimal split points that determine the bins.

In [17]:
# Fit the tree-based discretizer on 'Fare' with a fixed max_depth of 2,
# supervised by the training target.
enc1 = dc.DiscretizeByDecisionTree(
    col='Fare',
    max_depth=2,
).fit(X=X_train, y=y_train)

In [18]:
# Display the decision tree held by the discretizer
enc1.tree_model

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [19]:
# Apply the tree-based discretizer to the full dataset
data1 = enc1.transform(data)

In [20]:
# Look at the new column Fare_tree_discret: its values correspond to the
# probability predicted by the tree.
print(data1.head())

# Distinct values taken by the discretized column
print(data1['Fare_tree_discret'].unique())

   Survived  Pclass     Sex   Age  SibSp     Fare  Fare_tree_discret
0         0       3    male  22.0      1   7.2500           0.107143
1         1       1  female  38.0      1  71.2833           0.442308
2         1       3  female  26.0      0   7.9250           0.255319
3         1       1  female  35.0      1  53.1000           0.442308
4         0       3    male  35.0      0   8.0500           0.255319
[0.10714286 0.44230769 0.25531915 0.74626866]


In [21]:
# See how the bins are cut.
# A tree of depth d has at most 2**d leaves, so max_depth=2 yields
# at most 2**2 = 4 bins.
col = 'Fare'
# Group once and aggregate both statistics together, so the result columns are
# labelled 'min' and 'max' instead of two columns that are both named 'Fare'.
bins = data1.groupby(col + '_tree_discret')[col].agg(['min', 'max'])
print(bins)

# Each row gives the lowest and highest 'Fare' mapped to that value of
# 'Fare_tree_discret': e.g. all fares from 0 to 7.5208 in the original
# variable are given the new value 0.107143, and so on.

                      Fare      Fare
Fare_tree_discret                   
0.107143            0.0000    7.5208
0.255319            7.5500   10.5167
0.442308           11.1333   73.5000
0.746269           75.2500  512.3292


## Discretisation with Decision Tree with optimal depth search

In [22]:
# Search tree depths 2 through 7 for the best one; the printed table shows
# that depth=2 gives the best mean ROC-AUC.
enc2 = dc.DiscretizeByDecisionTree(
    col='Fare',
    max_depth=list(range(2, 8)),
).fit(X=X_train, y=y_train)

result ROC-AUC for each depth
   depth  roc_auc_mean  roc_auc_std
0      2      0.662132     0.026253
1      3      0.647950     0.045010
2      4      0.650984     0.035127
3      5      0.651180     0.027663
4      6      0.653961     0.037421
5      7      0.643688     0.033513
optimal_depth: [2]


In [23]:
# The model is trained with the optimal depth (2), so the result is the same as the previous tree
enc2.tree_model

DecisionTreeClassifier(class_weight=None, criterion='gini',
            max_depth=array([2], dtype=int64), max_features=None,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [24]:
# Discretize the full dataset with the depth-searched tree and preview the result
data2 = enc2.transform(data)
data2.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Fare_tree_discret
0,0,3,male,22.0,1,7.25,0.107143
1,1,1,female,38.0,1,71.2833,0.442308
2,1,3,female,26.0,0,7.925,0.255319
3,1,1,female,35.0,1,53.1,0.442308
4,0,3,male,35.0,0,8.05,0.255319


## Discretisation with ChiMerge
A supervised, hierarchical, bottom-up (merge) method that locally applies the chi-square criterion to decide whether two adjacent intervals are similar enough to be merged.

In [25]:
# Fit ChiMerge on 'Fare', requesting 5 bins; the target is passed by column name.
enc3 = dc.ChiMerge(
    col='Fare',
    num_of_bins=5,
).fit(X=X_train, y='Survived')

Interval for variable Fare
  variable       interval  flag_0  flag_1
0     Fare     -inf,7.875    94.0    28.0
1     Fare   7.875,7.8792     0.0     3.0
2     Fare  7.8792,7.8958    25.0     1.0
3     Fare    7.8958,73.5   245.0   160.0
4     Fare          73.5+    17.0    50.0


In [26]:
# The bin boundaries created by ChiMerge

enc3.bins

[-0.1, 7.875, 7.8792, 7.8958, 73.5, 512.3292]

In [27]:
# Apply the ChiMerge discretizer to the full dataset
data3 = enc3.transform(data)

In [28]:
# Show the first rows, including the new Fare_chimerge interval column
print(data3.head(5))

   Survived  Pclass     Sex   Age  SibSp     Fare    Fare_chimerge
0         0       3    male  22.0      1   7.2500  (-0.101, 7.875]
1         1       1  female  38.0      1  71.2833    (7.896, 73.5]
2         1       3  female  26.0      0   7.9250    (7.896, 73.5]
3         1       1  female  35.0      1  53.1000    (7.896, 73.5]
4         0       3    male  35.0      0   8.0500    (7.896, 73.5]


In [29]:
# Every fare has been assigned to one of 5 intervals.
data3['Fare_chimerge'].unique()

[(-0.101, 7.875], (7.896, 73.5], (73.5, 512.329], (7.875, 7.879], (7.879, 7.896]]
Categories (5, interval[float64]): [(-0.101, 7.875] < (7.875, 7.879] < (7.879, 7.896] < (7.896, 73.5] < (73.5, 512.329]]