"""Filter-style feature selection helpers (constant, correlated, univariate)."""

import pandas as pd
import numpy as np
#from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import mutual_info_classif,chi2
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import roc_auc_score, mean_squared_error

# 2018.11.17 Created by Eamon.Zhang


def constant_feature_detect(data, threshold=0.98):
    """Detect constant / quasi-constant features.

    A feature is reported when its most frequent (non-missing) value covers
    at least ``threshold`` of ALL rows (missing values are counted in the
    denominator but never as the predominant value).

    Parameters
    ----------
    data : pd.DataFrame
    threshold : float
        Minimum share of rows taken by the predominant value for the
        variable to be flagged as (quasi-)constant.

    Returns
    -------
    list
        Names of the flagged variables, in column order.
    """
    n_rows = len(data)
    quasi_constant_feature = []
    # With no rows there is nothing to measure (and dividing by 0 is undefined).
    if n_rows:
        for feature in data.columns:
            counts = data[feature].value_counts()
            if counts.empty:
                # Column holds only missing values: no predominant value.
                continue
            # Builtin float instead of np.float (removed in NumPy 1.24).
            predominant = counts.max() / float(n_rows)
            if predominant >= threshold:
                quasi_constant_feature.append(feature)
    print(len(quasi_constant_feature),' variables are found to be almost constant')
    return quasi_constant_feature


def corr_feature_detect(data, threshold=0.8):
    """Detect groups of highly correlated features of a DataFrame.

    Parameters
    ----------
    data : pd.DataFrame
        Only numeric / boolean columns take part in the correlation.
    threshold : float
        Minimum absolute correlation coefficient for two variables to be
        considered correlated.

    Returns
    -------
    list of pd.DataFrame
        One frame per group, with columns ``feature1``, ``feature2`` and
        ``corr``; every row pairs the group's anchor feature (``feature1``)
        with one feature correlated to it. Empty list when nothing is found.
    """
    # DataFrame.corr() raises on non-numeric columns in pandas >= 2.0, so
    # restrict to the columns that can actually be correlated.
    numeric = data.select_dtypes(include=["number", "bool"])
    if numeric.shape[1] < 2:
        return []

    # Absolute correlation of every ordered pair, strongest first.
    pairs = numeric.corr().abs().unstack().sort_values(ascending=False)
    pairs = pairs[pairs >= threshold]
    corrmat = pd.DataFrame({
        'feature1': pairs.index.get_level_values(0),
        'feature2': pairs.index.get_level_values(1),
        'corr': pairs.values,
    })
    # Remove the diagonal by name rather than with ``corr < 1`` so that
    # perfectly correlated (e.g. duplicated) features are still reported.
    corrmat = corrmat[corrmat.feature1 != corrmat.feature2]
    corrmat = corrmat.reset_index(drop=True)

    already_grouped = set()
    correlated_groups = []
    for feature in corrmat.feature1.unique():
        if feature in already_grouped:
            continue
        # find all features correlated to a single feature
        correlated_block = corrmat[corrmat.feature1 == feature]
        already_grouped.update(correlated_block.feature2.unique())
        already_grouped.add(feature)
        # append the block of features to the list
        correlated_groups.append(correlated_block)
    return correlated_groups


def _select_columns(score_func, X, y, select_k):
    """Return the columns of X kept by a univariate scikit-learn selector.

    ``select_k >= 1`` keeps that many best features (SelectKBest);
    ``0 < select_k < 1`` keeps that fraction of features (SelectPercentile);
    anything else raises ValueError.
    """
    if select_k >= 1:
        selector = SelectKBest(score_func, k=select_k)
    elif 0 < select_k < 1:
        selector = SelectPercentile(score_func, percentile=select_k*100)
    else:
        raise ValueError("select_k must be a positive number")
    selector.fit(X, y)
    return X.columns[selector.get_support()]


def mutual_info(X, y, select_k=10):
    """Select features by mutual information with a classification target.

    Parameters
    ----------
    X : pd.DataFrame
    y : array-like target
    select_k : int or float
        ``>= 1``: number of top features to keep;
        between 0 and 1 (exclusive): fraction of features to keep.

    Returns
    -------
    pd.Index
        Names of the selected columns.

    Raises
    ------
    ValueError
        If ``select_k`` is not a positive number.
    """
#    mi = mutual_info_classif(X,y)
#    mi = pd.Series(mi)
#    mi.index = X.columns
#    mi.sort_values(ascending=False)
    return _select_columns(mutual_info_classif, X, y, select_k)


# 2018.11.27 edit Chi-square test
def chi_square_test(X, y, select_k=10):
    """Select features by chi-squared statistic against the class.

    Compute chi-squared stats between each non-negative feature and class.
    This score should be used to evaluate categorical variables in a
    classification task.

    Parameters
    ----------
    X : pd.DataFrame of non-negative features
    y : array-like target
    select_k : int or float
        ``>= 1``: number of top features to keep;
        between 0 and 1 (exclusive): fraction of features to keep.

    Returns
    -------
    pd.Index
        Names of the selected columns.

    Raises
    ------
    ValueError
        If ``select_k`` is not a positive number.
    """
    return _select_columns(chi2, X, y, select_k)


def univariate_roc_auc(X_train, y_train, X_test, y_test, threshold):
    """Rank features by the ROC-AUC of a single-feature decision tree.

    First, it builds one decision tree per feature, to predict the target.
    Second, it makes predictions using the decision tree and that feature.
    Third, it ranks the features according to roc-auc on the test set and
    keeps the ones scoring strictly above ``threshold``.

    The score uses the second column of ``predict_proba``, i.e. a binary
    target is expected.

    Returns
    -------
    pd.Series
        ROC-AUC of the kept features, indexed by feature name.
    """
    roc_values = []
    for feature in X_train.columns:
        clf = DecisionTreeClassifier()
        clf.fit(X_train[feature].to_frame(), y_train)
        y_scored = clf.predict_proba(X_test[feature].to_frame())
        roc_values.append(roc_auc_score(y_test, y_scored[:, 1]))
    roc_values = pd.Series(roc_values, index=X_train.columns, dtype=float)
    print(roc_values.sort_values(ascending=False))
    keep_col = roc_values[roc_values > threshold]
    print(len(keep_col),'out of the %s featues are kept'% len(X_train.columns))
    return keep_col


def univariate_mse(X_train, y_train, X_test, y_test, threshold):
    """Rank features by the MSE of a single-feature decision tree regressor.

    First, it builds one decision tree per feature, to predict the target.
    Second, it makes predictions using the decision tree and that feature.
    Third, it ranks the features according to mse on the test set and
    keeps the ones whose mse is strictly above ``threshold``.

    Returns
    -------
    pd.Series
        MSE of the kept features, indexed by feature name.
    """
    mse_values = []
    for feature in X_train.columns:
        clf = DecisionTreeRegressor()
        clf.fit(X_train[feature].to_frame(), y_train)
        y_scored = clf.predict(X_test[feature].to_frame())
        mse_values.append(mean_squared_error(y_test, y_scored))
    mse_values = pd.Series(mse_values, index=X_train.columns, dtype=float)
    print(mse_values.sort_values(ascending=False))
    # NOTE(review): a lower MSE is better, yet features ABOVE the threshold
    # are kept; preserved as-is for compatibility -- confirm the intent.
    keep_col = mse_values[mse_values > threshold]
    print(len(keep_col),'out of the %s featues are kept'% len(X_train.columns))
    return keep_col