# File size: 5,851 Bytes
import logging

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression, SelectFromModel

# ✅ FIX 1: remove duplicate import, make optional
try:
    import featuretools as ft
except Exception:
    ft = None
class ManagedFeatureEngine:
    """Generate Featuretools features and prune them to a row-count-based budget.

    The engine runs depth-1 deep feature synthesis on the non-target columns,
    imputes missing values, and keeps the numeric features that rank highly by
    mutual information and are also retained by a random-forest
    ``SelectFromModel``.  Every stage is best-effort: when a stage fails the
    failure is logged and the caller gets the original frame back.

    Args:
        target_col: Name of the target column in the frames passed in.
        task_type: ``"classification"`` selects the classifier variants of the
            scorers; any other value selects the regression variants.
        max_features: Upper bound on the number of selected features.
        random_state: Seed forwarded to the mutual-information estimator and
            the random forest.  ``None`` (default) keeps them unseeded.
    """

    _logger = logging.getLogger(__name__)

    # (exclusive upper bound on row count, feature budget for that tier)
    _BUDGET_TIERS = ((1000, 300), (10000, 800))

    # Below this many ensemble-selected features, fall back to MI ranking only.
    _MIN_ENSEMBLE_FEATURES = 5

    def __init__(self, target_col, task_type="classification", max_features=1000,
                 random_state=None):
        self.target_col = target_col
        self.task_type = task_type
        self.max_features = max_features
        self.random_state = random_state
        self.selected_features = []

    def get_dynamic_budget(self, n_rows):
        """Return the feature budget for a dataset with ``n_rows`` rows.

        Fewer than 1000 rows gives 300, fewer than 10000 gives 800, anything
        larger gives ``max_features``.  The tier value never exceeds
        ``max_features`` (a ``max_features`` of ``None`` disables the cap).
        """
        for row_limit, tier_budget in self._BUDGET_TIERS:
            if n_rows < row_limit:
                if self.max_features is None:
                    return tier_budget
                return min(tier_budget, self.max_features)
        return self.max_features

    def generate_features(self, df):
        """Use Featuretools to generate automated features within budget.

        Args:
            df: Input frame containing ``target_col``.

        Returns:
            A frame holding the selected numeric features plus ``target_col``.
            ``df`` itself is returned unchanged when it is ``None``/empty, lacks
            the target column, Featuretools is unavailable, no numeric feature
            is produced, or synthesis / mutual-information scoring fails.
        """
        if df is None or df.empty:
            return df
        if self.target_col not in df.columns:
            return df
        # ``ft`` is the module-level featuretools handle; None if import failed.
        if ft is None:
            return df

        budget = self.get_dynamic_budget(len(df))
        base = self._normalize_datetime_columns(
            df.drop(columns=[self.target_col], errors="ignore").copy()
        )

        try:
            feature_matrix = self._run_dfs(base)
        except Exception:
            self._logger.warning(
                "Deep feature synthesis failed; returning input unchanged", exc_info=True
            )
            return df

        try:
            # Positional re-attachment: relies on the matrix keeping df's row order.
            feature_matrix[self.target_col] = df[self.target_col].values
        except Exception:
            self._logger.warning(
                "Could not re-attach target column %r; returning input unchanged",
                self.target_col,
                exc_info=True,
            )
            return df

        # NOTE(review): the target column is imputed together with the features,
        # so a missing numeric target becomes 0 -- confirm this is intended.
        self._impute_missing(feature_matrix)

        X = feature_matrix.drop(columns=[self.target_col], errors="ignore")
        y = feature_matrix[self.target_col]
        X_numeric = X.select_dtypes(include=["number"])
        if X_numeric.empty:
            return df

        is_classification = self.task_type == "classification"

        # Mutual information ranking.
        try:
            score_fn = mutual_info_classif if is_classification else mutual_info_regression
            mi_scores = score_fn(X_numeric, y, random_state=self.random_state)
        except Exception:
            self._logger.warning(
                "Mutual-information scoring failed; returning input unchanged", exc_info=True
            )
            return df
        mi_series = pd.Series(mi_scores, index=X_numeric.columns).sort_values(ascending=False)

        # Tree-based selection; on failure every numeric column counts as selected.
        try:
            forest_cls = RandomForestClassifier if is_classification else RandomForestRegressor
            selector = SelectFromModel(
                forest_cls(n_estimators=50, max_depth=5, random_state=self.random_state)
            )
            selector.fit(X_numeric, y)
            tree_selected = X_numeric.columns[selector.get_support()]
        except Exception:
            self._logger.warning(
                "Tree-based selection failed; falling back to all numeric features",
                exc_info=True,
            )
            tree_selected = X_numeric.columns

        # Ensemble: keep top-MI features that the forest also selected, walking
        # the MI ranking so the result order is deterministic (a set
        # intersection would order columns by hash).
        top_mi = mi_series.head(budget).index
        tree_set = set(tree_selected)
        final_features = [name for name in top_mi if name in tree_set]
        if len(final_features) < self._MIN_ENSEMBLE_FEATURES:
            final_features = list(top_mi)

        self.selected_features = final_features
        # Every name comes from X_numeric.columns, so all exist in the matrix.
        return feature_matrix[final_features + [self.target_col]]

    def detect_leakage(self, df, threshold=0.99):
        """Return numeric columns whose correlation with the target is suspiciously high.

        Args:
            df: Frame containing ``target_col``.
            threshold: Absolute correlation above which a column is reported.

        Returns:
            Column names (target excluded) with ``abs(corr) > threshold``,
            ordered by descending absolute correlation.  Empty when ``df`` is
            ``None``/empty, the target is missing or non-numeric, there is no
            other numeric column, or the correlation cannot be computed.
        """
        if df is None or df.empty:
            return []
        if self.target_col not in df.columns:
            return []

        numeric_df = df.select_dtypes(include=[np.number])
        if self.target_col not in numeric_df.columns or numeric_df.shape[1] < 2:
            return []

        try:
            correlations = (
                numeric_df.corr()[self.target_col].abs().sort_values(ascending=False)
            )
        except Exception:
            self._logger.warning("Correlation computation failed", exc_info=True)
            return []

        leaks = correlations[correlations > threshold].index.tolist()
        return [col for col in leaks if col != self.target_col]

    @staticmethod
    def _normalize_datetime_columns(frame):
        """Convert object columns that mostly parse as datetimes; return ``frame``."""
        for col in frame.select_dtypes(include=["object"]).columns:
            # Probe the first 50 non-null values instead of parsing the column twice.
            sample = frame[col].dropna().head(50).astype(str)
            if sample.empty:
                continue
            parsed_sample = pd.to_datetime(sample, format="mixed", errors="coerce")
            if parsed_sample.notna().mean() >= 0.8:
                frame[col] = pd.to_datetime(frame[col], format="mixed", errors="coerce")
        return frame

    @staticmethod
    def _run_dfs(base):
        """Run depth-1 deep feature synthesis on ``base`` and return the feature matrix."""
        es = ft.EntitySet(id="dataset")
        es = es.add_dataframe(
            dataframe_name="data",
            dataframe=base,
            index="id",
            make_index=True,
        )
        feature_matrix, _feature_defs = ft.dfs(
            entityset=es,
            target_dataframe_name="data",
            max_depth=1,
            verbose=False,
        )
        return feature_matrix

    def _impute_missing(self, feature_matrix):
        """Fill missing values in place, using a sentinel suited to each dtype family."""
        num_cols = feature_matrix.select_dtypes(include=[np.number]).columns
        dt_cols = feature_matrix.select_dtypes(include=["datetime64[ns]", "datetimetz"]).columns
        cat_cols = feature_matrix.select_dtypes(include=["category"]).columns
        obj_cols = feature_matrix.select_dtypes(include=["object"]).columns

        if len(num_cols):
            feature_matrix[num_cols] = feature_matrix[num_cols].fillna(0)

        for col in dt_cols:
            # Match the column's timezone so a tz-aware column is never filled
            # with a tz-naive timestamp.
            tz = feature_matrix[col].dt.tz
            if tz is None:
                epoch = pd.Timestamp("1970-01-01")
            else:
                epoch = pd.Timestamp("1970-01-01", tz="UTC").tz_convert(tz)
            feature_matrix[col] = feature_matrix[col].fillna(epoch)

        for col in cat_cols:
            try:
                if "missing" not in feature_matrix[col].cat.categories:
                    feature_matrix[col] = feature_matrix[col].cat.add_categories(["missing"])
                feature_matrix[col] = feature_matrix[col].fillna("missing")
            except Exception:
                # Best effort: categorical columns are not used by the numeric selection.
                self._logger.debug(
                    "Could not impute categorical column %r", col, exc_info=True
                )

        if len(obj_cols):
            feature_matrix[obj_cols] = feature_matrix[obj_cols].fillna("missing")