data_analysis_agent / utils /feature_selection.py
Shrouk04's picture
Upload 36 files
f73646a verified
Raw
History Blame Contribute Delete
5.32 kB
import warnings

import numpy as np
import pandas as pd
from sklearn.feature_selection import chi2, f_classif, f_regression, mutual_info_classif, mutual_info_regression
#from utils.decision import decide_pipeline
def detect_target_type(y, max_categories=10):
    """Classify a target column as ``"categorical"`` or ``"numerical"``.

    Args:
        y: pandas Series holding the target values.
        max_categories: A numeric target with at most this many distinct
            non-null values is treated as a set of class labels rather than
            a continuous quantity. Defaults to 10.

    Returns:
        ``"numerical"`` only for a non-boolean numeric Series with more than
        ``max_categories`` distinct values; ``"categorical"`` for everything
        else (object, category, boolean, low-cardinality numeric, and any
        other dtype).
    """
    # Object, category and boolean targets are class labels no matter how
    # many distinct values they hold.
    if str(y.dtype) in ("object", "category") or pd.api.types.is_bool_dtype(y):
        return "categorical"

    if pd.api.types.is_numeric_dtype(y):
        # Few distinct numbers -> binary / multiclass labels encoded as ints.
        if y.nunique() <= max_categories:
            return "categorical"
        return "numerical"

    # Anything else (e.g. datetimes, strings) is not usable as a regression target.
    return "categorical"
def clean_numeric_scores(series):
    """Return the usable scores of *series*, highest first.

    Positive and negative infinity are turned into NaN, every NaN entry is
    then discarded, and what remains is sorted in descending order.
    """
    infinities = [np.inf, -np.inf]
    without_inf = series.replace(infinities, np.nan)
    usable = without_inf.dropna()
    ranked = usable.sort_values(ascending=False)
    return ranked
def _median_imputed(features):
    """Return *features* with missing values replaced by each column's median.

    Columns without a single observed value have no median and would leave
    NaNs behind, which scikit-learn's scorers reject for the whole matrix,
    so those columns are dropped instead.
    """
    observed = features.dropna(axis=1, how="all")
    return observed.fillna(observed.median())


def _score_features(score_func, features, labels, description):
    """Score every column of *features* against *labels* with *score_func*.

    Returns the finite scores sorted in descending order, or ``None`` when
    there is nothing to score, the scorer fails, or no usable score remains.
    """
    if features.shape[1] == 0:
        return None
    try:
        raw = score_func(features, labels)
    except Exception as exc:
        # Scoring is best-effort: one failing test must not abort the others,
        # but the failure is reported instead of being silently discarded.
        warnings.warn(
            f"feature_selection: skipping {description} ({exc})",
            RuntimeWarning,
            stacklevel=3,
        )
        return None
    # The F-test / chi2 scorers return (statistic, p_value); the mutual
    # information scorers return the scores directly.
    if isinstance(raw, tuple):
        raw = raw[0]
    ranked = clean_numeric_scores(pd.Series(raw, index=features.columns))
    return ranked if len(ranked) > 0 else None


def feature_selection(df, target, decisions=None, *, max_cardinality=50):
    """Rank the features of *df* by their statistical relation to *target*.

    The scoring methods depend on the target type reported by
    ``detect_target_type``:

    * numerical target: absolute correlation and mutual information for
      numeric features, and a univariate regression F-test for one-hot
      encoded categorical features;
    * categorical target: ANOVA F-test and mutual information for numeric
      features, and chi-squared for one-hot encoded categorical features.

    Rows whose target value is missing are ignored. Numeric features are
    the ``int64`` / ``float64`` columns; categorical features are the
    ``object`` / ``category`` / ``bool`` columns.

    Args:
        df: Input DataFrame; it is not modified.
        target: Name of the target column.
        decisions: Not consulted by this function; accepted so existing
            callers that pass it keep working.
        max_cardinality: Categorical columns with more distinct non-null
            values than this are skipped. Defaults to 50.

    Returns:
        A dict mapping a method name (``"numerical_correlation"``,
        ``"numerical_mutual_info"``, ``"categorical_anova"``,
        ``"numerical_anova"``, ``"categorical_chi2"``) to a pandas Series of
        finite scores sorted in descending order. Methods that fail or yield
        no usable score are omitted; the dict is empty when *target* is not
        a column of *df* or no row has a target value.
    """
    results = {}
    if target not in df.columns:
        return results

    # A row without a target value cannot inform any score, and NaN targets
    # make the scikit-learn scorers fail (or become a spurious extra class).
    df = df[df[target].notna()]
    if df.empty:
        return results

    numeric_cols = [
        col
        for col in df.select_dtypes(include=["int64", "float64"]).columns
        if col != target
    ]
    # High-cardinality categoricals are skipped: one-hot encoding them would
    # produce a very wide, sparse matrix.
    categorical_cols = [
        col
        for col in df.select_dtypes(include=["object", "category", "bool"]).columns
        if col != target and df[col].nunique(dropna=True) <= max_cardinality
    ]

    y = df[target]

    X_cat = None
    if categorical_cols:
        dummies = pd.get_dummies(df[categorical_cols], drop_first=True)
        # drop_first can leave nothing behind (e.g. single-valued columns).
        if dummies.shape[1] > 0:
            X_cat = dummies

    # Each job is (result key, scorer, feature matrix, labels).
    jobs = []

    if detect_target_type(y) == "numerical":
        # Case 1: numerical target (regression).
        if numeric_cols:
            X_num = df[numeric_cols]
            # corrwith ignores missing values pairwise, so no imputation here.
            correlation = clean_numeric_scores(X_num.corrwith(y).abs())
            if len(correlation) > 0:
                results["numerical_correlation"] = correlation
            jobs.append(
                ("numerical_mutual_info", mutual_info_regression, _median_imputed(X_num), y)
            )
        if X_cat is not None:
            # f_classif would treat every distinct target value as its own
            # class; for a continuous target the per-dummy F-test comes from
            # f_regression.
            jobs.append(("categorical_anova", f_regression, X_cat, y))
    else:
        # Case 2: categorical target (classification).
        y_encoded = pd.factorize(y)[0]
        if numeric_cols:
            X_num = _median_imputed(df[numeric_cols])
            jobs.append(("numerical_anova", f_classif, X_num, y_encoded))
            jobs.append(("numerical_mutual_info", mutual_info_classif, X_num, y_encoded))
        if X_cat is not None:
            jobs.append(("categorical_chi2", chi2, X_cat, y_encoded))

    for key, scorer, features, labels in jobs:
        scores = _score_features(scorer, features, labels, key)
        if scores is not None:
            results[key] = scores

    return results