import pandas as pd
import numpy as np
# from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import mutual_info_classif, chi2
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import roc_auc_score, mean_squared_error

# 2018.11.17 Created by Eamon.Zhang
def constant_feature_detect(data, threshold=0.98):
    """ detect features that show the same value for the
    majority/all of the observations (constant/quasi-constant features)

    Parameters
    ----------
    data : pd.DataFrame
    threshold : threshold to identify the variable as constant

    Returns
    -------
    list of variable names
    """
    data_copy = data.copy(deep=True)
    quasi_constant_feature = []
    for feature in data_copy.columns:
        # share of the most frequent value of this feature
        # (np.float was removed in NumPy 1.20; plain float works)
        predominant = (data_copy[feature].value_counts() / float(
            len(data_copy))).sort_values(ascending=False).values[0]
        if predominant >= threshold:
            quasi_constant_feature.append(feature)
    print(len(quasi_constant_feature), ' variables are found to be almost constant')
    return quasi_constant_feature
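
# Usage sketch (illustrative, not part of the original module): a toy
# DataFrame whose column 'a' takes a single value for 99% of the rows.
# Column names and values below are made up for demonstration.
# >>> df = pd.DataFrame({'a': [1] * 99 + [2], 'b': list(range(100))})
# >>> constant_feature_detect(df, threshold=0.98)
# 1  variables are found to be almost constant
# ['a']
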
def corr_feature_detect(data, threshold=0.8):
    """ detect highly-correlated features of a DataFrame

    Parameters
    ----------
    data : pd.DataFrame
    threshold : threshold to identify the variable as correlated

    Returns
    -------
    groups (blocks) of correlated variables
    """
    corrmat = data.corr()
    corrmat = corrmat.abs().unstack()  # absolute value of corr coef
    corrmat = corrmat.sort_values(ascending=False)
    corrmat = corrmat[corrmat >= threshold]
    corrmat = corrmat[corrmat < 1]  # remove the diagonal (self-correlations)
    corrmat = pd.DataFrame(corrmat).reset_index()
    corrmat.columns = ['feature1', 'feature2', 'corr']

    grouped_feature_ls = []
    correlated_groups = []
    for feature in corrmat.feature1.unique():
        if feature not in grouped_feature_ls:
            # find all features correlated to a single feature
            correlated_block = corrmat[corrmat.feature1 == feature]
            grouped_feature_ls = grouped_feature_ls + list(
                correlated_block.feature2.unique()) + [feature]
            # append the block of features to the list
            correlated_groups.append(correlated_block)
    return correlated_groups
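
# Usage sketch (illustrative, not part of the original module): two noisy
# copies of the same signal form one correlated group; the names and the
# 0.1 noise scale are made up for demonstration. Note that exactly
# identical columns (corr == 1) are filtered out with the diagonal.
# >>> rng = np.random.RandomState(0)
# >>> x = rng.normal(size=100)
# >>> df = pd.DataFrame({'x': x,
# ...                    'y': x + rng.normal(scale=0.1, size=100),
# ...                    'z': rng.normal(size=100)})
# >>> blocks = corr_feature_detect(df, threshold=0.8)
# >>> len(blocks)   # one group: {'x', 'y'}; 'z' is uncorrelated
# 1
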
def mutual_info(X, y, select_k=10):
    """ select features based on their mutual information with the target.
    select_k >= 1 is interpreted as a number of features to keep,
    0 < select_k < 1 as a fraction of features to keep.
    """
    # mi = mutual_info_classif(X,y)
    # mi = pd.Series(mi)
    # mi.index = X.columns
    # mi.sort_values(ascending=False)
    if select_k >= 1:
        sel_ = SelectKBest(mutual_info_classif, k=select_k).fit(X, y)
        col = X.columns[sel_.get_support()]
    elif 0 < select_k < 1:
        sel_ = SelectPercentile(mutual_info_classif, percentile=select_k * 100).fit(X, y)
        col = X.columns[sel_.get_support()]
    else:
        raise ValueError("select_k must be a positive number")
    return col
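
# Usage sketch (illustrative, not part of the original module), assuming
# scikit-learn's bundled iris dataset; select_k=2 is an arbitrary choice.
# >>> from sklearn.datasets import load_iris
# >>> iris = load_iris(as_frame=True)
# >>> mutual_info(iris.data, iris.target, select_k=2)
# Index(['petal length (cm)', 'petal width (cm)'], dtype='object')
# (typical result; MI estimates can vary slightly between runs)
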
# 2018.11.27 edit Chi-square test
def chi_square_test(X, y, select_k=10):
    """
    Compute chi-squared stats between each non-negative feature and class.
    This score should be used to evaluate categorical variables in a
    classification task. select_k >= 1 keeps that many features,
    0 < select_k < 1 keeps that fraction of features.
    """
    if select_k >= 1:
        sel_ = SelectKBest(chi2, k=select_k).fit(X, y)
        col = X.columns[sel_.get_support()]
    elif 0 < select_k < 1:
        sel_ = SelectPercentile(chi2, percentile=select_k * 100).fit(X, y)
        col = X.columns[sel_.get_support()]
    else:
        raise ValueError("select_k must be a positive number")
    return col
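
# Usage sketch (illustrative, not part of the original module): chi2
# requires non-negative features, which holds for the iris measurements.
# >>> from sklearn.datasets import load_iris
# >>> iris = load_iris(as_frame=True)
# >>> chi_square_test(iris.data, iris.target, select_k=2)
# Index(['petal length (cm)', 'petal width (cm)'], dtype='object')
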
def univariate_roc_auc(X_train, y_train, X_test, y_test, threshold):
    """
    First, it builds one decision tree per feature, to predict the target
    Second, it makes predictions using the decision tree and the mentioned feature
    Third, it ranks the features according to the machine learning metric (roc-auc)
    It keeps the features whose roc-auc lies above the given threshold
    """
    roc_values = []
    for feature in X_train.columns:
        clf = DecisionTreeClassifier()
        clf.fit(X_train[feature].to_frame(), y_train)
        y_scored = clf.predict_proba(X_test[feature].to_frame())
        roc_values.append(roc_auc_score(y_test, y_scored[:, 1]))
    roc_values = pd.Series(roc_values, index=X_train.columns)
    print(roc_values.sort_values(ascending=False))
    print(len(roc_values[roc_values > threshold]),
          'out of the %s features are kept' % len(X_train.columns))
    keep_col = roc_values[roc_values > threshold]
    return keep_col
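
# Usage sketch (illustrative, not part of the original module), assuming
# scikit-learn's breast cancer dataset; threshold=0.8 is an arbitrary cut.
# >>> from sklearn.datasets import load_breast_cancer
# >>> from sklearn.model_selection import train_test_split
# >>> cancer = load_breast_cancer(as_frame=True)
# >>> X_tr, X_te, y_tr, y_te = train_test_split(
# ...     cancer.data, cancer.target, random_state=0)
# >>> kept = univariate_roc_auc(X_tr, y_tr, X_te, y_te, threshold=0.8)
# >>> kept.index  # the per-feature roc-auc Series, filtered by the cut
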
def univariate_mse(X_train, y_train, X_test, y_test, threshold):
    """
    First, it builds one decision tree per feature, to predict the target
    Second, it makes predictions using the decision tree and the mentioned feature
    Third, it ranks the features according to the machine learning metric (mse)
    It returns the features whose mse lies above the given threshold
    (note that for mse, unlike roc-auc, smaller values mean a better fit)
    """
    mse_values = []
    for feature in X_train.columns:
        clf = DecisionTreeRegressor()
        clf.fit(X_train[feature].to_frame(), y_train)
        y_scored = clf.predict(X_test[feature].to_frame())
        mse_values.append(mean_squared_error(y_test, y_scored))
    mse_values = pd.Series(mse_values, index=X_train.columns)
    print(mse_values.sort_values(ascending=False))
    print(len(mse_values[mse_values > threshold]),
          'out of the %s features are kept' % len(X_train.columns))
    keep_col = mse_values[mse_values > threshold]
    return keep_col
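
# Usage sketch (illustrative, not part of the original module): a synthetic
# regression problem; the threshold value is made up and should be chosen
# from the printed per-feature mse values.
# >>> from sklearn.datasets import make_regression
# >>> from sklearn.model_selection import train_test_split
# >>> Xa, ya = make_regression(n_samples=200, n_features=5, random_state=0)
# >>> Xdf = pd.DataFrame(Xa, columns=['f%d' % i for i in range(5)])
# >>> X_tr, X_te, y_tr, y_te = train_test_split(Xdf, ya, random_state=0)
# >>> kept = univariate_mse(X_tr, y_tr, X_te, y_te, threshold=5000)
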