|
|
import pandas as pd |
|
|
import numpy as np |
|
|
|
|
|
from sklearn.feature_selection import mutual_info_classif,chi2 |
|
|
from sklearn.feature_selection import SelectKBest, SelectPercentile |
|
|
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor |
|
|
from sklearn.metrics import roc_auc_score, mean_squared_error |
|
|
|
|
|
|
|
|
|
|
|
def constant_feature_detect(data, threshold=0.98):
    """Detect constant / quasi-constant features.

    A feature is reported when its most frequent (non-null) value accounts
    for at least ``threshold`` of all rows of ``data``.

    Parameters
    ----------
    data : pd.DataFrame
    threshold : float, default 0.98
        Minimum share of rows taken by the predominant value for the
        variable to be considered constant.

    Returns
    -------
    list of variable names, in column order

    Notes
    -----
    ``value_counts`` ignores missing values, while the share is computed
    against the total row count, so a column holding only NaN (or a frame
    with no rows) has a predominant share of 0 and is not reported for any
    positive threshold.
    """
    n_rows = len(data)
    quasi_constant_feature = []
    for feature in data.columns:
        counts = data[feature].value_counts()
        # An empty count table means there is no non-null value at all;
        # treat the predominant share as 0 instead of indexing into it.
        predominant = counts.max() / n_rows if len(counts) else 0.0
        if predominant >= threshold:
            quasi_constant_feature.append(feature)
    print(len(quasi_constant_feature),' variables are found to be almost constant')
    return quasi_constant_feature
|
|
|
|
|
|
|
|
def corr_feature_detect(data, threshold=0.8):
    """Detect groups of highly-correlated features of a DataFrame.

    Parameters
    ----------
    data : pd.DataFrame
    threshold : float, default 0.8
        Minimum absolute correlation (as computed by ``data.corr()``) for
        two variables to be considered correlated.

    Returns
    -------
    list of pd.DataFrame
        One frame per group, with columns ``feature1``, ``feature2`` and
        ``corr``. Each frame lists the partners (``feature2``) of a single
        anchor feature (``feature1``); a feature already placed in an
        earlier group does not start a group of its own.

    Notes
    -----
    Self-pairs are removed by comparing the feature names, not by
    filtering ``corr < 1``: the latter would also discard distinct
    features that are perfectly correlated (e.g. duplicated columns).
    """
    pairs = data.corr().abs().unstack()
    pairs = pairs.sort_values(ascending=False)
    pairs = pairs[pairs >= threshold]

    # Drop the diagonal (a feature paired with itself) by name.
    first = pairs.index.get_level_values(0)
    second = pairs.index.get_level_values(1)
    pairs = pairs[first != second]

    pairs = pd.DataFrame(pairs).reset_index()
    pairs.columns = ['feature1', 'feature2', 'corr']

    already_grouped = set()
    correlated_groups = []

    for feature in pairs.feature1.unique():
        if feature in already_grouped:
            continue

        # All partners of this anchor feature form one group.
        correlated_block = pairs[pairs.feature1 == feature]
        already_grouped.update(correlated_block.feature2.unique())
        already_grouped.add(feature)

        correlated_groups.append(correlated_block)

    return correlated_groups
|
|
|
|
|
|
|
|
def mutual_info(X, y, select_k=10):
    """Select features by their mutual information with the target.

    Parameters
    ----------
    X : feature table exposing ``.columns`` (presumably a pd.DataFrame)
    y : target passed straight to the selector's ``fit``
    select_k : number, default 10
        ``>= 1``  keeps that many best features (``SelectKBest``);
        between 0 and 1 (exclusive) keeps that fraction of the features
        (``SelectPercentile`` with ``select_k * 100``).

    Returns
    -------
    the entries of ``X.columns`` retained by the selector

    Raises
    ------
    ValueError
        if ``select_k`` is neither ``>= 1`` nor strictly between 0 and 1.
    """
    if select_k >= 1:
        selector = SelectKBest(mutual_info_classif, k=select_k)
    elif 0 < select_k < 1:
        selector = SelectPercentile(mutual_info_classif, percentile=select_k*100)
    else:
        raise ValueError("select_k must be a positive number")

    selector = selector.fit(X, y)
    return X.columns[selector.get_support()]
|
|
|
|
|
|
|
|
|
|
|
def chi_square_test(X, y, select_k=10):
    """Select features by their chi-squared statistic against the class.

    The chi-squared score is computed between each non-negative feature
    and the class, and is meant for evaluating categorical variables in a
    classification task.

    Parameters
    ----------
    X : feature table exposing ``.columns`` (presumably a pd.DataFrame)
    y : target passed straight to the selector's ``fit``
    select_k : number, default 10
        ``>= 1``  keeps that many best features (``SelectKBest``);
        between 0 and 1 (exclusive) keeps that fraction of the features
        (``SelectPercentile`` with ``select_k * 100``).

    Returns
    -------
    the entries of ``X.columns`` retained by the selector

    Raises
    ------
    ValueError
        if ``select_k`` is neither ``>= 1`` nor strictly between 0 and 1.
    """
    if select_k >= 1:
        selector = SelectKBest(chi2, k=select_k)
    elif 0 < select_k < 1:
        selector = SelectPercentile(chi2, percentile=select_k*100)
    else:
        raise ValueError("select_k must be a positive number")

    selector = selector.fit(X, y)
    return X.columns[selector.get_support()]
|
|
|
|
|
|
|
|
def univariate_roc_auc(X_train, y_train, X_test, y_test, threshold):
    """Rank features by the ROC-AUC of a one-feature decision tree.

    For every column of ``X_train`` a fresh ``DecisionTreeClassifier`` is
    fitted on that column alone, used to score the same column of
    ``X_test``, and evaluated with ``roc_auc_score`` against ``y_test``.
    The ranking is printed (best first) together with the number of
    features kept.

    Parameters
    ----------
    X_train, X_test : feature tables (presumably pd.DataFrame with the
        same columns)
    y_train, y_test : targets for fitting and evaluation
    threshold : features whose ROC-AUC is strictly greater are kept

    Returns
    -------
    pd.Series of ROC-AUC values, indexed by feature name, restricted to
    the features above ``threshold``
    """
    def _single_feature_auc(name):
        tree = DecisionTreeClassifier()
        tree.fit(X_train[name].to_frame(), y_train)
        proba = tree.predict_proba(X_test[name].to_frame())
        # Column 1 of the probabilities is the score
        # (presumably the positive class of a binary target).
        return roc_auc_score(y_test, proba[:, 1])

    auc_by_feature = pd.Series(
        [_single_feature_auc(name) for name in X_train.columns])
    auc_by_feature.index = X_train.columns

    print(auc_by_feature.sort_values(ascending=False))

    selected = auc_by_feature[auc_by_feature > threshold]
    print(len(selected),'out of the %s featues are kept'% len(X_train.columns))
    return selected
|
|
|
|
|
|
|
|
def univariate_mse(X_train, y_train, X_test, y_test, threshold):
    """Rank features by the MSE of a one-feature decision tree.

    For every column of ``X_train`` a fresh ``DecisionTreeRegressor`` is
    fitted on that column alone, used to predict from the same column of
    ``X_test``, and evaluated with ``mean_squared_error`` against
    ``y_test``. The ranking is printed together with the number of
    features kept.

    Parameters
    ----------
    X_train, X_test : feature tables (presumably pd.DataFrame with the
        same columns)
    y_train, y_test : targets for fitting and evaluation
    threshold : features whose MSE is strictly lower are kept

    Returns
    -------
    pd.Series of MSE values, indexed by feature name, restricted to the
    features below ``threshold``

    Notes
    -----
    MSE is an error measure: the smaller, the better. The highest ranked
    features are therefore those with the *lowest* MSE, so the selection
    keeps ``mse < threshold`` and the ranking is printed in ascending
    order (best first).
    """
    mse_values = []
    for feature in X_train.columns:
        reg = DecisionTreeRegressor()
        reg.fit(X_train[feature].to_frame(), y_train)
        y_pred = reg.predict(X_test[feature].to_frame())
        mse_values.append(mean_squared_error(y_test, y_pred))

    # Explicit float dtype keeps the comparison well-defined even when
    # there are no columns at all.
    mse_values = pd.Series(mse_values, index=X_train.columns, dtype=float)

    print(mse_values.sort_values(ascending=True))

    keep_col = mse_values[mse_values < threshold]
    print(len(keep_col),'out of the %s featues are kept'% len(X_train.columns))
    return keep_col
|
|
|