# aitek230telu's picture
# Upload 52 files
# 0ab7b0c verified
#import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as sns
#from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier #RandomForestRegressor
#from sklearn.feature_selection import SelectFromModel
# 2018.11.27 Created by Eamon.Zhang
def rf_importance(X_train, y_train, max_depth=10, class_weight=None, top_n=15,
                  n_estimators=50, random_state=0):
    """Fit a random forest, print the feature ranking and plot the top features.

    A ``RandomForestClassifier`` is fitted on ``(X_train, y_train)`` using all
    available cores (``n_jobs=-1``).  Every feature is printed in descending
    order of ``feature_importances_``; the ``top_n`` most important ones are
    drawn as a bar chart whose error bars are the standard deviation of the
    per-tree importances.

    Args:
        X_train: Training features.  If it exposes a ``columns`` attribute
            (e.g. a DataFrame) those names are printed; otherwise the
            positional feature index is used as the name.
        y_train: Training labels.
        max_depth: Forwarded to ``RandomForestClassifier``.
        class_weight: Forwarded to ``RandomForestClassifier``.
        top_n: Number of features to plot.  Clamped to the number of
            available features so the chart never asks for more bars than
            there are features.
        n_estimators: Forwarded to ``RandomForestClassifier``.
        random_state: Forwarded to ``RandomForestClassifier``.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
        class_weight=class_weight,
        n_jobs=-1,
    )
    model.fit(X_train, y_train)

    importances = model.feature_importances_
    n_features = importances.shape[0]
    indices = np.argsort(importances)[::-1]  # most important first

    # Plain arrays have no column names; fall back to positional labels.
    feat_labels = getattr(X_train, "columns", None)
    if feat_labels is None:
        feat_labels = np.arange(n_features)

    # Inter-tree variability of the importances, used for the error bars.
    std = np.std([tree.feature_importances_ for tree in model.estimators_],
                 axis=0)

    print("Feature ranking:")
    for rank, idx in enumerate(indices, start=1):
        print("%d. feature no:%d feature name:%s (%f)"
              % (rank, idx, feat_labels[idx], importances[idx]))

    # Plotting.  Without the clamp, top_n > n_features makes range(top_n)
    # longer than the sliced data and matplotlib raises a shape mismatch.
    top_n = max(0, min(top_n, n_features))
    indices = indices[:top_n]
    positions = range(top_n)
    plt.figure()
    plt.title("Feature importances top %d" % top_n)
    plt.bar(positions, importances[indices],
            color="r", yerr=std[indices], align="center")
    plt.xticks(positions, indices)
    plt.xlim([-1, top_n])
    plt.show()

    return model
def gbt_importance(X_train, y_train, max_depth=10, top_n=15, n_estimators=50,
                   random_state=0):
    """Fit gradient-boosted trees, print the feature ranking and plot the top features.

    A ``GradientBoostingClassifier`` is fitted on ``(X_train, y_train)``.
    Every feature is printed in descending order of ``feature_importances_``;
    the ``top_n`` most important ones are drawn as a bar chart whose error
    bars are the standard deviation of the per-tree importances.

    Args:
        X_train: Training features.  If it exposes a ``columns`` attribute
            (e.g. a DataFrame) those names are printed; otherwise the
            positional feature index is used as the name.
        y_train: Training labels.
        max_depth: Forwarded to ``GradientBoostingClassifier``.
        top_n: Number of features to plot.  Clamped to the number of
            available features so the chart never asks for more bars than
            there are features.
        n_estimators: Forwarded to ``GradientBoostingClassifier``.
        random_state: Forwarded to ``GradientBoostingClassifier``.

    Returns:
        The fitted ``GradientBoostingClassifier``.
    """
    model = GradientBoostingClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
    )
    model.fit(X_train, y_train)

    importances = model.feature_importances_
    n_features = importances.shape[0]
    indices = np.argsort(importances)[::-1]  # most important first

    # Plain arrays have no column names; fall back to positional labels.
    feat_labels = getattr(X_train, "columns", None)
    if feat_labels is None:
        feat_labels = np.arange(n_features)

    # Inter-tree variability of the importances, used for the error bars.
    # estimators_ is a 2-D array (boosting stage x trees per stage); flatten
    # it so multiclass models contribute every tree, not only the first
    # column.  With a single tree per stage this equals taking tree[0].
    std = np.std(
        [tree.feature_importances_ for tree in model.estimators_.ravel()],
        axis=0,
    )

    print("Feature ranking:")
    for rank, idx in enumerate(indices, start=1):
        print("%d. feature no:%d feature name:%s (%f)"
              % (rank, idx, feat_labels[idx], importances[idx]))

    # Plotting.  Without the clamp, top_n > n_features makes range(top_n)
    # longer than the sliced data and matplotlib raises a shape mismatch.
    top_n = max(0, min(top_n, n_features))
    indices = indices[:top_n]
    positions = range(top_n)
    plt.figure()
    plt.title("Feature importances top %d" % top_n)
    plt.bar(positions, importances[indices],
            color="r", yerr=std[indices], align="center")
    plt.xticks(positions, indices)
    plt.xlim([-1, top_n])
    plt.show()

    return model