File size: 5,851 Bytes
2c29579
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import numpy as np
import pandas as pd

# ✅ FIX 1: remove duplicate import, make optional
try:
    import featuretools as ft
except Exception:
    ft = None

from sklearn.feature_selection import mutual_info_classif, mutual_info_regression, SelectFromModel
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor


class ManagedFeatureEngine:
    """Generate automated features for one target column and prune them to a budget.

    ``generate_features`` runs Featuretools deep feature synthesis on every
    column except the target, fills missing values, and keeps the numeric
    features that rank well on mutual information and are also selected by a
    random forest. ``detect_leakage`` flags numeric columns that are almost
    perfectly correlated with the target.

    Every stage of ``generate_features`` is best-effort: if a stage cannot
    run, the original frame is returned unchanged and the reason is logged.
    """

    # (row-count upper bound, feature budget) pairs, checked in order.
    _BUDGET_TIERS = ((1000, 300), (10000, 800))
    # Below this many ensemble-selected features, fall back to MI ranking alone.
    _MIN_ENSEMBLE_FEATURES = 5
    # Absolute correlation with the target above which a column is reported.
    _LEAKAGE_THRESHOLD = 0.99
    # Share of sampled values that must parse for a column to become datetime.
    _DATETIME_PARSE_RATIO = 0.8

    def __init__(self, target_col, task_type="classification", max_features=1000,
                 random_state=None):
        """Store the configuration.

        Args:
            target_col: Name of the column to predict.
            task_type: ``"classification"`` selects the classifier variants of
                the scorers; any other value selects the regression variants.
            max_features: Upper bound on the number of features kept.
            random_state: Optional seed forwarded to the mutual-information
                estimator and the random forest. ``None`` (the default) leaves
                them unseeded, as before.
        """
        self.target_col = target_col
        self.task_type = task_type
        self.max_features = max_features
        self.random_state = random_state
        self.selected_features = []

    def get_dynamic_budget(self, n_rows):
        """Return the feature budget for a dataset with ``n_rows`` rows.

        Smaller datasets get a smaller budget (300 below 1,000 rows, 800 below
        10,000 rows). The result never exceeds ``max_features``.
        """
        ceiling = self.max_features
        for row_limit, tier_budget in self._BUDGET_TIERS:
            if n_rows < row_limit:
                # A tier must not hand out more than the configured ceiling.
                return tier_budget if ceiling is None else min(tier_budget, ceiling)
        return ceiling

    def generate_features(self, df):
        """Use Featuretools to generate automated features within budget.

        Args:
            df: Input frame that contains ``target_col``.

        Returns:
            A frame holding the selected generated features plus the target
            column. ``df`` itself is returned unchanged when it is ``None`` or
            empty, when the target column is absent, when Featuretools is not
            installed, when no numeric feature is produced, or when feature
            synthesis or mutual-information scoring fails.
        """
        if df is None or df.empty:
            return df
        if self.target_col not in df.columns:
            return df
        if ft is None:  # optional dependency is not installed
            return df

        budget = self.get_dynamic_budget(len(df))

        # Work on a copy so the caller's frame is never mutated.
        base = df.drop(columns=[self.target_col], errors="ignore").copy()
        self._coerce_datetime_columns(base)

        try:
            es = ft.EntitySet(id="dataset")
            es = es.add_dataframe(
                dataframe_name="data",
                dataframe=base,
                index="id",
                make_index=True,
            )
            feature_matrix, _feature_defs = ft.dfs(
                entityset=es,
                target_dataframe_name="data",
                max_depth=1,
                verbose=False,
            )
        except Exception as exc:  # best-effort boundary around a third-party call
            self._log_fallback("deep feature synthesis", exc)
            return df

        # Re-attach the target positionally (the matrix is indexed by the new "id").
        try:
            feature_matrix[self.target_col] = df[self.target_col].values
        except Exception as exc:
            self._log_fallback("re-attaching the target column", exc)
            return df

        self._fill_missing(feature_matrix)

        selected = self._select_features(feature_matrix, budget)
        if selected is None:
            return df

        self.selected_features = selected
        valid_features = [f for f in selected if f in feature_matrix.columns]
        return feature_matrix[valid_features + [self.target_col]]

    def detect_leakage(self, df):
        """Return numeric columns whose absolute correlation with the target exceeds 0.99.

        Returns an empty list when ``df`` is ``None`` or empty, when the target
        is missing or not numeric, when there is no other numeric column, or
        when the correlation cannot be computed.
        """
        if df is None or df.empty:
            return []
        if self.target_col not in df.columns:
            return []

        numeric_df = df.select_dtypes(include=[np.number])
        if self.target_col not in numeric_df.columns or numeric_df.shape[1] < 2:
            return []

        try:
            correlations = (
                numeric_df.corr()[self.target_col].abs().sort_values(ascending=False)
            )
        except Exception as exc:
            self._log_fallback("leakage correlation", exc)
            return []

        leaks = correlations[correlations > self._LEAKAGE_THRESHOLD].index.tolist()
        return [c for c in leaks if c != self.target_col]

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _log_fallback(stage, exc):
        """Log why a best-effort stage was abandoned instead of hiding the error."""
        import logging  # function-scope import keeps the module header untouched

        logging.getLogger(__name__).warning(
            "ManagedFeatureEngine: %s failed (%s: %s); using fallback.",
            stage, type(exc).__name__, exc,
        )

    def _coerce_datetime_columns(self, frame):
        """Convert, in place, object columns whose sampled values mostly parse as dates."""
        for col in frame.select_dtypes(include=["object"]).columns:
            sample = frame[col].dropna().head(50).astype(str)
            if sample.empty:
                continue
            try:
                parsed_sample = pd.to_datetime(sample, format="mixed", errors="coerce")
                if parsed_sample.notna().mean() >= self._DATETIME_PARSE_RATIO:
                    frame[col] = pd.to_datetime(
                        frame[col], format="mixed", errors="coerce"
                    )
            except (ValueError, TypeError) as exc:
                # Leave the column as-is rather than abort the whole pipeline.
                self._log_fallback("datetime parsing of column %r" % (col,), exc)

    def _fill_missing(self, feature_matrix):
        """Fill missing values in place, with a fill value chosen per dtype family."""
        num_cols = feature_matrix.select_dtypes(include=[np.number]).columns
        dt_cols = feature_matrix.select_dtypes(
            include=["datetime64[ns]", "datetimetz"]
        ).columns
        cat_cols = feature_matrix.select_dtypes(include=["category"]).columns
        obj_cols = feature_matrix.select_dtypes(include=["object"]).columns

        if len(num_cols):
            feature_matrix[num_cols] = feature_matrix[num_cols].fillna(0)

        for col in dt_cols:
            # Give the fill value the column's own timezone so that tz-aware
            # columns keep their datetime dtype.
            tz = getattr(feature_matrix[col].dtype, "tz", None)
            feature_matrix[col] = feature_matrix[col].fillna(
                pd.Timestamp("1970-01-01", tz=tz)
            )

        for col in cat_cols:
            try:
                # A categorical can only be filled with an existing category.
                if "missing" not in feature_matrix[col].cat.categories:
                    feature_matrix[col] = feature_matrix[col].cat.add_categories(
                        ["missing"]
                    )
                feature_matrix[col] = feature_matrix[col].fillna("missing")
            except Exception as exc:
                self._log_fallback("filling categorical column %r" % (col,), exc)

        if len(obj_cols):
            feature_matrix[obj_cols] = feature_matrix[obj_cols].fillna("missing")

    def _select_features(self, feature_matrix, budget):
        """Return the chosen feature names, best mutual information first.

        Returns ``None`` when there is no numeric feature or when
        mutual-information scoring fails.
        """
        X = feature_matrix.drop(columns=[self.target_col], errors="ignore")
        y = feature_matrix[self.target_col]
        X_numeric = X.select_dtypes(include=["number"])
        if X_numeric.empty:
            return None

        is_classification = self.task_type == "classification"

        try:
            score_fn = (
                mutual_info_classif if is_classification else mutual_info_regression
            )
            mi_scores = score_fn(X_numeric, y, random_state=self.random_state)
        except Exception as exc:
            self._log_fallback("mutual-information scoring", exc)
            return None

        mi_series = pd.Series(mi_scores, index=X_numeric.columns).sort_values(
            ascending=False
        )

        try:
            forest_cls = (
                RandomForestClassifier if is_classification else RandomForestRegressor
            )
            selector = SelectFromModel(
                forest_cls(
                    n_estimators=50, max_depth=5, random_state=self.random_state
                )
            )
            selector.fit(X_numeric, y)
            tree_selected = X_numeric.columns[selector.get_support()]
        except Exception as exc:
            # Without a tree-based vote, every numeric feature stays eligible.
            self._log_fallback("tree-based selection", exc)
            tree_selected = X_numeric.columns

        top_mi = mi_series.head(budget).index
        tree_votes = set(tree_selected)
        # Walk the MI ranking so the result is ordered and reproducible.
        final_features = [f for f in top_mi if f in tree_votes]

        if len(final_features) < self._MIN_ENSEMBLE_FEATURES:
            final_features = list(top_mi)

        return final_features[:budget]