In [1]:
import pandas as pd
import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve,  roc_auc_score

# plt.style.use('seaborn-colorblind')
# %matplotlib inline
#from feature_cleaning import rare_values as ra

## Load Dataset

In [2]:
use_cols = [
    'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',
    'Survived'
]

data = pd.read_csv('./data/titanic.csv', usecols=use_cols)


In [3]:
data.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
0,0,3,male,22.0,1,7.25
1,1,1,female,38.0,1,71.2833
2,1,3,female,26.0,0,7.925


In [4]:
# Note that we include target variable in the X_train 
# because we need it to supervise our discretization
# this is not the standard way of using train-test-split
X_train, X_test, y_train, y_test = train_test_split(data, data.Survived, test_size=0.3,
                                                    random_state=0)
X_train.shape, X_test.shape

((623, 6), (268, 6))

## Polynomial Expansion

generate a new feature set consisting of all polynomial combinations of the features with degree less than or equal to the specified degree

In [5]:
# create polynomial combinations of feature 'Pclass','SibSp' with degree 2
from sklearn.preprocessing import PolynomialFeatures
pf = PolynomialFeatures(degree=2,include_bias=False).fit(X_train[['Pclass','SibSp']])
tmp = pf.transform(X_train[['Pclass','SibSp']])
X_train_copy = pd.DataFrame(tmp,columns=pf.get_feature_names(['Pclass','SibSp']))
print(X_train_copy.head(6))

   Pclass  SibSp  Pclass^2  Pclass SibSp  SibSp^2
0     1.0    0.0       1.0           0.0      0.0
1     1.0    1.0       1.0           1.0      1.0
2     3.0    5.0       9.0          15.0     25.0
3     1.0    0.0       1.0           0.0      0.0
4     3.0    1.0       9.0           3.0      1.0
5     2.0    1.0       4.0           2.0      1.0


## Feature Learning by Trees
GBDT derived feature + LR

In [6]:
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

gbdt = GradientBoostingClassifier(n_estimators=20)
one_hot = OneHotEncoder()

X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)
X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)

gbdt.fit(X_train, y_train)

X_leaf_index = gbdt.apply(X_train)[:, :, 0]  # apply return the node index on each tree 
print("sample's belonging node of each base tree \n'",X_leaf_index)
# fit one-hot encoder
one_hot.fit(X_leaf_index)   
X_one_hot = one_hot.transform(X_leaf_index)  


from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='lbfgs', max_iter=1000)
lr.fit(X_one_hot,y_train)
y_pred = lr.predict_proba(
    one_hot.transform(gbdt.apply(X_test)[:, :, 0]))[:,1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)
print("AUC for GBDT derived feature + LR：", roc_auc_score(y_test, y_pred))


sample's belonging node of each base tree 
' [[ 7.  7.  6. ...  4.  7.  4.]
 [ 7.  7.  6. ... 14.  7.  7.]
 [11. 11. 11. ...  4.  6. 11.]
 ...
 [10. 10. 10. ...  4.  6. 10.]
 [13. 14. 13. ...  4.  7. 13.]
 [ 7.  7.  6. ...  6.  7.  7.]]
AUC for GBDT derived feature + LR： 0.7746130952380953


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


## Feature Learning by Trees
RandomForest derived feature + LR

In [7]:
rf = RandomForestClassifier(n_estimators=20)
one_hot = OneHotEncoder()

X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)
X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)

rf.fit(X_train, y_train)

X_leaf_index = rf.apply(X_train)  # apply return the node index on each tree 
print("sample's belonging node of each base tree \n'",X_leaf_index)
# fit one-hot encoder
one_hot.fit(X_leaf_index)   
X_one_hot = one_hot.transform(X_leaf_index)  


lr = LogisticRegression(solver='lbfgs', max_iter=1000)
lr.fit(X_one_hot,y_train)
y_pred = lr.predict_proba(
    one_hot.transform(rf.apply(X_test)))[:,1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)
print("AUC for RandomForest derived feature + LR：", roc_auc_score(y_test, y_pred))


sample's belonging node of each base tree 
' [[212  35  79 ... 146  60  46]
 [307 165 266 ... 136 132  44]
 [285 285 320 ... 301 294 300]
 ...
 [ 13 177 133 ... 186 169 117]
 [190 296 311 ... 282 289 297]
 [264 165 243 ... 152 110 314]]
AUC for RandomForest derived feature + LR： 0.759672619047619


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


##  Feature Learning by Trees
GBDT derived feature + Raw feature +LR

In [8]:
from scipy.sparse import hstack

X_train_ext = hstack([one_hot.transform(gbdt.apply(X_train)[:, :, 0]), X_train])
X_test_ext = hstack([one_hot.transform(gbdt.apply(X_test)[:, :, 0]), X_test])
lr = LogisticRegression(solver='lbfgs', max_iter=1000)
lr.fit(X_train_ext,y_train)
y_pred = lr.predict_proba(X_test_ext)[:,1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)
print("AUC for GBDT derived feature + Raw feature +LR：", roc_auc_score(y_test, y_pred))


AUC for GBDT derived feature + Raw feature +LR： 0.7603571428571428


##  Feature Learning by Trees
RandomForest derived feature + Raw feature +LR

In [9]:
X_train_ext = hstack([one_hot.transform(rf.apply(X_train)), X_train])
X_test_ext = hstack([one_hot.transform(rf.apply(X_test)), X_test])
lr = LogisticRegression(solver='lbfgs', max_iter=1000)
lr.fit(X_train_ext,y_train)
y_pred = lr.predict_proba(X_test_ext)[:,1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)
print("AUC for RandomForest derived feature + Raw feature + LR：", roc_auc_score(y_test, y_pred))


AUC for RandomForest derived feature + Raw feature + LR： 0.76


##  Feature Learning by Trees
Use only Raw Feature + LR

In [10]:
lr = LogisticRegression(solver='lbfgs', max_iter=1000)
lr.fit(X_train,y_train)
y_pred = lr.predict_proba(X_test)[:,1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)
print("AUC for RandomForest derived feature + LR：", roc_auc_score(y_test, y_pred))


AUC for RandomForest derived feature + LR： 0.6988690476190476


## Feature Learning by Trees

Use only Raw Feature + GBDT

In [13]:
gbdt = GradientBoostingClassifier(n_estimators=20)

X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)
X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)

gbdt.fit(X_train, y_train)
y_pred = gbdt.predict_proba(X_test)[:,1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)
print("AUC for Raw feature + GBDT：", roc_auc_score(y_test, y_pred))


AUC for Raw feature + GBDT： 0.7613988095238096


## Feature Learning by Trees

Use only Raw Feature + RF


In [16]:
rf = RandomForestClassifier(n_estimators=20)

X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)
X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)

rf.fit(X_train, y_train)
y_pred = rf.predict_proba(X_test)[:,1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)
print("AUC for Raw feature + RF：", roc_auc_score(y_test, y_pred))

AUC for Raw feature + RF： 0.7235119047619047


#### Without tuning, we can see GBDT derived feature + LR get the best result