|
|
|
|
|
|
|
|
|
|
|
from sklearn.ensemble import RandomForestClassifier |
|
|
from sklearn.metrics import roc_auc_score |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def recursive_feature_elimination_rf(X_train,y_train,X_test,y_test,
                                     tol=0.001,max_depth=None,
                                     class_weight=None,
                                     top_n=15,n_estimators=50,random_state=0):
    """Select features by recursive elimination with a random forest.

    A baseline forest is fit on all features, then each feature is
    tentatively dropped in column order. If removing it lowers the test
    ROC AUC by less than ``tol``, the feature is permanently removed and
    the reference score is updated; otherwise it is kept.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Train / test predictors (column-labelled).
    y_train, y_test : array-like
        Binary targets for train / test.
    tol : float
        Minimum drop in ROC AUC required to keep a feature.
    max_depth, class_weight, n_estimators, random_state
        Forwarded to ``RandomForestClassifier``.
    top_n : int
        Unused; kept only for backward compatibility with callers.

    Returns
    -------
    list
        Names of the columns to keep, in original column order.
    """

    def _score_with(columns):
        # Single place to build, fit and score the forest — the original
        # duplicated this block verbatim for the baseline and the loop.
        model = RandomForestClassifier(n_estimators=n_estimators,
                                       max_depth=max_depth,
                                       random_state=random_state,
                                       class_weight=class_weight,
                                       n_jobs=-1)
        model.fit(X_train[columns], y_train)
        preds = model.predict_proba(X_test[columns])[:, 1]
        return roc_auc_score(y_test, preds)

    features_to_remove = []
    all_columns = list(X_train.columns)

    # Baseline: ROC AUC with every feature present.
    auc_score_all = _score_with(all_columns)

    for count, feature in enumerate(all_columns, start=1):
        print()
        print('testing feature: ', feature, ' which is feature ', count,
              ' out of ', len(all_columns))

        # Score the model without this feature and without those
        # already removed in earlier iterations.
        remaining = [c for c in all_columns
                     if c != feature and c not in features_to_remove]
        auc_score_int = _score_with(remaining)
        print(f'New Test ROC AUC={auc_score_int}')
        print(f'All features Test ROC AUC={auc_score_all}')

        # Positive diff means performance dropped when the feature was removed.
        diff_auc = auc_score_all - auc_score_int

        if diff_auc >= tol:
            print(f'Drop in ROC AUC={diff_auc}')
            print('keep: ', feature)
        else:
            print(f'Drop in ROC AUC={diff_auc}')
            print('remove: ', feature)
            # The reduced model becomes the new reference for later features.
            auc_score_all = auc_score_int
            features_to_remove.append(feature)

    print('DONE!!')
    print('total features to remove: ', len(features_to_remove))
    features_to_keep = [c for c in all_columns if c not in features_to_remove]
    print('total features to keep: ', len(features_to_keep))

    return features_to_keep
|
|
|
|
|
|
|
|
def recursive_feature_addition_rf(X_train,y_train,X_test,y_test,
                                  tol=0.001,max_depth=None,
                                  class_weight=None,
                                  top_n=15,n_estimators=50,random_state=0):
    """Select features by recursive addition with a random forest.

    A baseline forest is fit on the first column only, then each remaining
    feature is tentatively added in column order. If adding it raises the
    test ROC AUC by at least ``tol``, the feature is kept and the reference
    score is updated; otherwise it is discarded.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Train / test predictors (column-labelled, at least one column).
    y_train, y_test : array-like
        Binary targets for train / test.
    tol : float
        Minimum increase in ROC AUC required to keep a feature.
    max_depth, class_weight, n_estimators, random_state
        Forwarded to ``RandomForestClassifier``.
    top_n : int
        Unused; kept only for backward compatibility with callers.

    Returns
    -------
    list
        Names of the selected columns (always includes the first column).
    """

    def _score_with(columns):
        # Single place to build, fit and score the forest — the original
        # duplicated this block verbatim for the baseline and the loop.
        model = RandomForestClassifier(n_estimators=n_estimators,
                                       max_depth=max_depth,
                                       random_state=random_state,
                                       class_weight=class_weight,
                                       n_jobs=-1)
        model.fit(X_train[columns], y_train)
        preds = model.predict_proba(X_test[columns])[:, 1]
        return roc_auc_score(y_test, preds)

    all_columns = list(X_train.columns)

    # The first column is always kept and provides the baseline score.
    features_to_keep = [all_columns[0]]
    auc_score_all = _score_with(features_to_keep)

    # start=2: the loop begins at the second column, so the original
    # counter (starting at 1) announced each feature one position early.
    for count, feature in enumerate(all_columns[1:], start=2):
        print()
        print('testing feature: ', feature, ' which is feature ', count,
              ' out of ', len(all_columns))

        # Score the model with this feature added to those already kept.
        auc_score_int = _score_with(features_to_keep + [feature])
        print(f'New Test ROC AUC={auc_score_int}')
        print(f'All features Test ROC AUC={auc_score_all}')

        # Positive diff means performance improved when the feature was added.
        diff_auc = auc_score_int - auc_score_all

        if diff_auc >= tol:
            print(f'Increase in ROC AUC={diff_auc}')
            print('keep: ', feature)
            # The enlarged model becomes the new reference for later features.
            auc_score_all = auc_score_int
            features_to_keep.append(feature)
        else:
            print(f'Increase in ROC AUC={diff_auc}')
            print('remove: ', feature)

    print('DONE!!')
    print('total features to keep: ', len(features_to_keep))

    return features_to_keep