File size: 1,697 Bytes
0ab7b0c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
import pandas as pd
#import numpy as np
from sklearn.ensemble import RandomForestClassifier #, RandomForestRegressor
from sklearn.metrics import roc_auc_score #, mean_squared_error
# 2018.11.28 Created by Eamon.Zhang
def feature_shuffle_rf(X_train,y_train,max_depth=None,class_weight=None,top_n=15,n_estimators=50,random_state=0):
    """Rank features by the drop in ROC-AUC when each one is shuffled.

    A RandomForestClassifier is fitted on the training data and its ROC-AUC
    on that same data is taken as the baseline. Each column is then shuffled
    on its own (the other columns stay untouched), the already-fitted model
    re-scores the data, and the difference ``baseline - shuffled`` is
    recorded as that feature's importance.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; ``.columns`` is iterated, so a DataFrame is
        required.
    y_train : array-like
        Binary target aligned row-by-row with ``X_train``. The probability
        of the second class (``predict_proba(...)[:, 1]``) is scored.
    max_depth, class_weight, n_estimators
        Forwarded unchanged to ``RandomForestClassifier``.
    top_n : int
        Currently unused; kept so existing callers keep working.
    random_state : int
        Seeds both the forest and every column shuffle.

    Returns
    -------
    auc_drop : pandas.DataFrame
        Columns ``'feature'`` and ``'auc_drop'``, sorted by ``auc_drop`` in
        descending order.
    selected_features : pandas.Series
        The ``'feature'`` values whose ``auc_drop`` is strictly positive.
    """
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
        class_weight=class_weight,
        n_jobs=-1,
    )
    model.fit(X_train, y_train)
    train_auc = roc_auc_score(y_train, model.predict_proba(X_train)[:, 1])

    # One positional-index working copy for the whole loop. The index is
    # reset so the shuffled column (also re-indexed 0..n-1) is assigned by
    # position instead of being re-aligned back into its original order.
    X_work = X_train.copy().reset_index(drop=True)

    feature_dict = {}
    for feature in X_train.columns:
        # Keep an independent copy so the column can be restored exactly,
        # whatever the pandas version does internally on assignment.
        original_column = X_work[feature].copy()

        # Shuffle only this feature; every other column keeps its values.
        X_work[feature] = original_column.sample(
            frac=1, random_state=random_state
        ).reset_index(drop=True)

        # roc_auc_score compares labels and scores positionally, so y_train
        # can be passed as-is, exactly as for the baseline above.
        shuff_auc = roc_auc_score(y_train, model.predict_proba(X_work)[:, 1])

        # Undo the shuffle before moving on to the next feature.
        X_work[feature] = original_column

        # A large positive drop means the model relied on this feature.
        feature_dict[feature] = train_auc - shuff_auc

    auc_drop = pd.Series(feature_dict).reset_index()
    auc_drop.columns = ['feature', 'auc_drop']
    auc_drop = auc_drop.sort_values(by=['auc_drop'], ascending=False)
    selected_features = auc_drop[auc_drop.auc_drop > 0]['feature']
    return auc_drop, selected_features
|