In [1]:
import pandas as pd
import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
# plt.style.use('seaborn-colorblind')
# %matplotlib inline
from sklearn.feature_selection import RFE
from feature_selection import hybrid


## Load Dataset

In [2]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
data = pd.DataFrame(np.c_[data['data'], data['target']],
                  columns= np.append(data['feature_names'], ['target']))

In [3]:
data.head(5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


In [4]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(labels=['target'], axis=1), 
                                                    data.target, test_size=0.2,
                                                    random_state=0)
X_train.shape, X_test.shape

((455, 30), (114, 30))

##  Recursive Feature Elimination 
### with Random Forests Importance


### Example 1
This method is slightly **different from the guide**, as it use a different stopping criterion: the desired number of features to select is eventually reached.

In [5]:
#  n_features_to_select decide the stopping criterion
# we stop till 10 features remaining

sel_ = RFE(RandomForestClassifier(n_estimators=20), n_features_to_select=10)
sel_.fit(X_train.fillna(0), y_train)

RFE(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
  n_features_to_select=10, step=1, verbose=0)

In [6]:
selected_feat = X_train.columns[(sel_.get_support())]
print(selected_feat)

Index(['mean texture', 'mean perimeter', 'mean area', 'mean concavity',
       'mean concave points', 'worst radius', 'worst perimeter', 'worst area',
       'worst concave points', 'worst symmetry'],
      dtype='object')


### Example 2
recursive feature elimination with RandomForest
with the method same as the guide
1. Rank the features according to their importance derived from a machine learning algorithm: it can be tree importance, or LASSO / Ridge,  or the linear / logistic regression coefficients.
2. Remove one feature -the least important- and build a machine learning algorithm utilizing the remaining features.
3. Calculate a performance metric of your choice: roc-auc, mse, rmse, accuracy.
4. If the metric decreases by more of an arbitrarily set threshold, then that feature is important and should be kept. Otherwise, we can remove that feature.
5. Repeat steps 2-4 until all features have been removed (and therefore evaluated) and the drop in performance assessed.


In [7]:
# tol decide whether we should drop or keep the feature in current round
features_to_keep = hybrid.recursive_feature_elimination_rf(X_train=X_train,
                                                           y_train=y_train,
                                                           X_test=X_test,
                                                           y_test=y_test,
                                                           tol=0.001)


testing feature:  mean radius  which is feature  1  out of  30
New Test ROC AUC=0.9941251190854239
All features Test ROC AUC=0.9968243886948238
Drop in ROC AUC=0.0026992696093999236
keep:  mean radius

testing feature:  mean texture  which is feature  2  out of  30
New Test ROC AUC=0.9936487773896475
All features Test ROC AUC=0.9968243886948238
Drop in ROC AUC=0.0031756113051762958
keep:  mean texture

testing feature:  mean perimeter  which is feature  3  out of  30
New Test ROC AUC=0.9968243886948238
All features Test ROC AUC=0.9968243886948238
Drop in ROC AUC=0.0
remove:  mean perimeter

testing feature:  mean area  which is feature  4  out of  30
New Test ROC AUC=0.9960304858685297
All features Test ROC AUC=0.9968243886948238
Drop in ROC AUC=0.0007939028262941017
remove:  mean area

testing feature:  mean smoothness  which is feature  5  out of  30
New Test ROC AUC=0.9965068275643061
All features Test ROC AUC=0.9960304858685297
Drop in ROC AUC=-0.0004763416957763722
remove:  mean 

In [8]:
features_to_keep

['mean radius',
 'mean texture',
 'mean compactness',
 'mean symmetry',
 'mean fractal dimension',
 'radius error',
 'texture error',
 'perimeter error',
 'area error',
 'smoothness error',
 'compactness error',
 'concavity error',
 'concave points error',
 'fractal dimension error',
 'worst radius',
 'worst area',
 'worst smoothness',
 'worst concave points']

## Recursive Feature Addition
### with Random Forests Importance

### Example 1
recursive feature addition with RandomForest
with the method same as the guide
1. Rank the features according to their importance derived from a  machine learning algorithm: it can be tree importance, or LASSO / Ridge,  or the linear / logistic regression coefficients.
2. Build a machine learning model with only 1 feature, the most important one, and calculate the model metric for performance.
3. Add one feature -the most important- and build a machine learning  algorithm utilizing the added and any feature from previous rounds.
4. Calculate a performance metric of your choice: roc-auc, mse, rmse, accuracy.
5. If the metric increases by more than an arbitrarily set threshold,  then that feature is important and should be kept. Otherwise, we can  remove that feature.
6. Repeat steps 2-5 until all features have been removed (and therefore evaluated) and the drop in performance assessed.


In [9]:
features_to_keep = hybrid.recursive_feature_addition_rf(X_train=X_train,
                                                        y_train=y_train,
                                                        X_test=X_test,
                                                        y_test=y_test,
                                                        tol=0.001)


testing feature:  mean texture  which is feature  1  out of  30
New Test ROC AUC=0.9558590028580501
All features Test ROC AUC=0.9009209272785013
Increase in ROC AUC=0.054938075579548884
keep:  mean texture

testing feature:  mean perimeter  which is feature  2  out of  30
New Test ROC AUC=0.9609399809463322
All features Test ROC AUC=0.9558590028580501
Increase in ROC AUC=0.005080978088282007
keep:  mean perimeter

testing feature:  mean area  which is feature  3  out of  30
New Test ROC AUC=0.9609399809463322
All features Test ROC AUC=0.9609399809463322
Increase in ROC AUC=0.0
remove:  mean area

testing feature:  mean smoothness  which is feature  4  out of  30
New Test ROC AUC=0.9684026675134964
All features Test ROC AUC=0.9609399809463322
Increase in ROC AUC=0.007462686567164201
keep:  mean smoothness

testing feature:  mean compactness  which is feature  5  out of  30
New Test ROC AUC=0.9750714512543665
All features Test ROC AUC=0.9684026675134964
Increase in ROC AUC=0.00666878374

In [10]:
features_to_keep

['mean radius',
 'mean texture',
 'mean perimeter',
 'mean smoothness',
 'mean compactness',
 'mean concavity',
 'worst concave points']