Spaces:
Sleeping
Sleeping
import logging

import numpy as np
import pandas as pd
from sklearn.feature_selection import (
    chi2,
    f_classif,
    f_regression,
    mutual_info_classif,
    mutual_info_regression,
)

#from utils.decision import decide_pipeline
def detect_target_type(y, max_classes=10):
    """Classify a target column as ``"categorical"`` or ``"numerical"``.

    Args:
        y: pandas Series holding the target values.
        max_classes: Largest number of distinct values a numeric target may
            have while still being treated as class labels. Defaults to 10.

    Returns:
        ``"categorical"`` for object, category and boolean targets, for
        numeric targets with at most ``max_classes`` distinct values, and for
        any other dtype; ``"numerical"`` for numeric targets with more than
        ``max_classes`` distinct values.
    """
    dtype = y.dtype

    # Object, category and boolean targets are always class labels.
    if dtype == "object" or str(dtype) in ("category", "bool"):
        return "categorical"

    if pd.api.types.is_numeric_dtype(y):
        # A numeric target with only a few distinct values is a binary or
        # multiclass label rather than a continuous quantity.
        if y.nunique() <= max_classes:
            return "categorical"
        return "numerical"

    # Any remaining non-numeric dtype (e.g. datetimes) is handled as labels.
    return "categorical"
def clean_numeric_scores(series):
    """Return the usable scores of ``series`` ordered from highest to lowest.

    Positive and negative infinity are converted to NaN, every NaN entry is
    dropped, and the remaining values are sorted in descending order. The
    original index labels are kept on the surviving entries.
    """
    without_infinities = series.replace([np.inf, -np.inf], np.nan)
    usable = without_infinities.dropna()
    return usable.sort_values(ascending=False)
def _store_scores(results, key, index, compute):
    """Run one scoring step and store its cleaned, non-empty result under ``key``."""
    try:
        scores = clean_numeric_scores(pd.Series(compute(), index=index))
    except Exception as exc:
        # Scoring is best-effort: one failing test must not prevent the other
        # scores from being returned, but the failure is reported, not hidden.
        logging.getLogger(__name__).warning(
            "feature_selection: %s skipped: %s", key, exc
        )
        return
    if len(scores) > 0:
        results[key] = scores


def feature_selection(df, target, decisions=None):
    """Score every usable feature of ``df`` against the ``target`` column.

    The target is classified with ``detect_target_type`` and the scoring
    methods are chosen accordingly:

    * numerical target: absolute correlation and mutual information for
      numeric features, univariate regression F-test for one-hot encoded
      categorical features;
    * categorical target: ANOVA F-test and mutual information for numeric
      features, chi-squared test for one-hot encoded categorical features.

    Rows whose target is missing are ignored. Numeric features are the
    ``int64``/``float64`` columns; missing values are replaced by the column
    median before ANOVA / mutual information, and columns with no values at
    all are skipped. Categorical features are the ``object``/``category``/
    ``bool`` columns with at most 50 distinct values.

    Args:
        df: pandas DataFrame containing the features and the target.
        target: Name of the target column.
        decisions: Not used by this function; accepted for call compatibility.

    Returns:
        dict mapping a score name (``"numerical_correlation"``,
        ``"numerical_mutual_info"``, ``"numerical_anova"``,
        ``"categorical_anova"``, ``"categorical_chi2"``) to a pandas Series of
        scores sorted in descending order. A score is omitted when it cannot
        be computed or is empty; the dict is empty when ``target`` is not a
        column of ``df`` or no row has a target value.
    """
    results = {}
    if target not in df.columns:
        return results

    # Rows without a target value carry no supervised signal; keeping them
    # would add a spurious class (factorize code -1) or make scorers fail.
    data = df[df[target].notna()]
    if data.empty:
        return results
    y = data[target].copy()

    # An all-missing numeric column cannot be median-imputed and would make
    # the scorers fail for every numeric column, so it is left out.
    numeric_cols = [
        col
        for col in data.select_dtypes(include=["int64", "float64"]).columns
        if col != target and data[col].notna().any()
    ]
    # High-cardinality categorical columns are excluded.
    categorical_cols = [
        col
        for col in data.select_dtypes(
            include=["object", "category", "bool"]
        ).columns
        if col != target and data[col].nunique(dropna=True) <= 50
    ]

    target_type = detect_target_type(y)

    # Case 1: numerical target (regression).
    if target_type == "numerical":
        if numeric_cols:
            X_raw = data[numeric_cols]
            corr_scores = clean_numeric_scores(X_raw.corrwith(y).abs())
            if len(corr_scores) > 0:
                results["numerical_correlation"] = corr_scores

            X_num = X_raw.fillna(X_raw.median())
            # Fixed seed: the estimator adds random noise to the features, so
            # scores would otherwise change from one call to the next.
            _store_scores(
                results,
                "numerical_mutual_info",
                numeric_cols,
                lambda: mutual_info_regression(X_num, y, random_state=0),
            )

        if categorical_cols:
            X_cat = pd.get_dummies(data[categorical_cols], drop_first=True)
            if X_cat.shape[1] > 0:
                # The target is continuous, so the regression F-test is used;
                # f_classif would treat every distinct target value as a class.
                _store_scores(
                    results,
                    "categorical_anova",
                    X_cat.columns,
                    lambda: f_regression(X_cat, y)[0],
                )

    # Case 2: categorical target (classification).
    else:
        y_encoded = pd.factorize(y)[0]

        if numeric_cols:
            X_raw = data[numeric_cols]
            X_num = X_raw.fillna(X_raw.median())
            _store_scores(
                results,
                "numerical_anova",
                numeric_cols,
                lambda: f_classif(X_num, y_encoded)[0],
            )
            _store_scores(
                results,
                "numerical_mutual_info",
                numeric_cols,
                lambda: mutual_info_classif(X_num, y_encoded, random_state=0),
            )

        if categorical_cols:
            X_cat = pd.get_dummies(data[categorical_cols], drop_first=True)
            if X_cat.shape[1] > 0:
                _store_scores(
                    results,
                    "categorical_chi2",
                    X_cat.columns,
                    lambda: chi2(X_cat, y_encoded)[0],
                )

    return results