#import pandas as pd import numpy as np import matplotlib.pyplot as plt #import seaborn as sns #from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier #RandomForestRegressor #from sklearn.feature_selection import SelectFromModel # 2018.11.27 Created by Eamon.Zhang def rf_importance(X_train,y_train,max_depth=10,class_weight=None,top_n=15,n_estimators=50,random_state=0): model = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth, random_state=random_state,class_weight=class_weight, n_jobs=-1) model.fit(X_train, y_train) importances = model.feature_importances_ indices = np.argsort(importances)[::-1] feat_labels = X_train.columns std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0) # inter-trees variability. print("Feature ranking:") # l1,l2,l3,l4 = [],[],[],[] for f in range(X_train.shape[1]): print("%d. feature no:%d feature name:%s (%f)" % (f + 1, indices[f], feat_labels[indices[f]], importances[indices[f]])) # l1.append(f+1) # l2.append(indices[f]) # l3.append(feat_labels[indices[f]]) # l4.append(importances[indices[f]]) #feature_rank = pd.Dataframe(zip(l1,l2,l3,l4),columns=['id','indice','feature','importances']) # plotting indices = indices[0:top_n] plt.figure() plt.title("Feature importances top %d" % top_n) plt.bar(range(top_n), importances[indices], color="r", yerr=std[indices], align="center") plt.xticks(range(top_n), indices) plt.xlim([-1,top_n]) plt.show() return model def gbt_importance(X_train,y_train,max_depth=10,top_n=15,n_estimators=50,random_state=0): model = GradientBoostingClassifier(n_estimators=n_estimators,max_depth=max_depth, random_state=random_state) model.fit(X_train, y_train) importances = model.feature_importances_ indices = np.argsort(importances)[::-1] feat_labels = X_train.columns std = np.std([tree[0].feature_importances_ for tree in model.estimators_], axis=0) # inter-trees variability. print("Feature ranking:") # l1,l2,l3,l4 = [],[],[],[] for f in range(X_train.shape[1]): print("%d. feature no:%d feature name:%s (%f)" % (f + 1, indices[f], feat_labels[indices[f]], importances[indices[f]])) # l1.append(f+1) # l2.append(indices[f]) # l3.append(feat_labels[indices[f]]) # l4.append(importances[indices[f]]) # feature_rank = pd.Dataframe(zip(l1,l2,l3,l4),columns=['id','indice','feature','importances']) # plotting indices = indices[0:top_n] plt.figure() plt.title("Feature importances top %d" % top_n) plt.bar(range(top_n), importances[indices], color="r", yerr=std[indices], align="center") plt.xticks(range(top_n), indices) plt.xlim([-1,top_n]) plt.show() return model