File size: 11,087 Bytes
9eecab5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
from utils.logger import logger
import numpy as np
from sklearn.ensemble import RandomForestClassifier

class AnalysisAgent:

    def __init__(self, registry):
        """Create an agent bound to a dataset registry.

        Args:
            registry: Object used to discover and load data. Methods of this
                class call ``list_datasets()`` and ``load_dataframe(name)``
                on it.
        """
        self.registry = registry

    # ---------------------------
    # Proper dataset extraction
    # ---------------------------
    def _extract_dataset(self, text):
        print("๐Ÿ” Running dataset analysis...")
        datasets = self.registry.list_datasets()
        words = str(text).lower().split()

        for word in words:
            for d in datasets:
                if word == d.lower():
                    return d
        return None

    # ---------------------------
    # Remove ID-like columns
    # ---------------------------
    def _remove_id_like_columns(self, df):
        cols_to_drop = []

        for col in df.columns:
            unique_ratio = df[col].nunique() / len(df)

            if unique_ratio > 0.9:
                cols_to_drop.append(col)

        df_clean = df.drop(columns=cols_to_drop)

        return df_clean, cols_to_drop

    # ---------------------------
    # Select target column
    # ---------------------------
    def _select_target(self, df):
        candidates = []

        for col in df.columns:
            unique_count = df[col].nunique()
            unique_ratio = unique_count / len(df)

            # Skip obvious bad columns
            if any(k in col.lower() for k in ["id", "name", "email", "phone"]):
                continue

            #  Skip high-cardinality
            if unique_ratio > 0.5:
                continue

            # Prefer categorical / classification targets
            if unique_count <= 20:
                candidates.append((col, unique_count))

        #  Pick best candidate (lowest unique count but >1)
        if candidates:
            candidates = sorted(candidates, key=lambda x: x[1])
            return candidates[0][0]

        return None

    # ---------------------------
    # Feature importance
    # ---------------------------
    def _compute_feature_importance(self, df):

        df_clean, dropped_cols = self._remove_id_like_columns(df)
        if len(df.columns) <= 2:
            return None, dropped_cols, "Dataset too small for feature importance."
        target = self._select_target(df_clean)
        if not target:
            return None, dropped_cols, "No suitable target column found."
        
        y = df_clean[target]

        # Fix NaN issue
        if y.isnull().sum() > 0:
            return None, dropped_cols, "Target contains missing values. Cannot compute feature importance."
        if not target:
            return None, dropped_cols, "No suitable target column found."

        #  Prevent sklearn warning
        if df_clean[target].nunique() > 0.5 * len(df_clean):
            return None, dropped_cols, "Target not suitable for classification."

        X = df_clean.drop(columns=[target])
        y = df_clean[target]

        # Encode categoricals
        X = X.apply(lambda col: col.astype('category').cat.codes)

        try:
            model = RandomForestClassifier(n_estimators=50)
            model.fit(X, y)

            importances = dict(zip(X.columns, model.feature_importances_))
            sorted_imp = sorted(importances.items(), key=lambda x: x[1], reverse=True)

            return sorted_imp[:5], dropped_cols, None

        except Exception as e:
            return None, dropped_cols, str(e)

    # ---------------------------
    # Optional explanation layer
    # ---------------------------
    def _explain_feature(self, col):
        return f"{col} shows strong predictive signal based on dataset patterns."
    
    #----------------------------
    # Outlier Detection
    #----------------------------
    def _detect_outliers(self, df):
        try:
            numeric_df = df.select_dtypes(include="number")

            outlier_summary = {}

            for col in numeric_df.columns:
                q1 = numeric_df[col].quantile(0.25)
                q3 = numeric_df[col].quantile(0.75)
                iqr = q3 - q1

                lower = q1 - 1.5 * iqr
                upper = q3 + 1.5 * iqr

                outliers = numeric_df[(numeric_df[col] < lower) | (numeric_df[col] > upper)]

                if len(outliers) > 0:
                    outlier_summary[col] = len(outliers)

            return outlier_summary, None

        except Exception as e:
            logger.error(f"Outlier detection failed | {e}")
            return None, str(e)
    
    #---------------------------
    # Correlation analysis
    #---------------------------
    def _compute_correlation(self, df):
        try:
            numeric_df = df.select_dtypes(include="number")

            if numeric_df.shape[1] < 2:
                return None, "Not enough numeric columns for correlation."

            # corr = numeric_df.corr()

            # Get top correlations (excluding self)
            corr_matrix = numeric_df.corr().abs()

            upper = corr_matrix.where(
                np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
            )

            top_pairs = (
                upper.unstack()
                .dropna()
                .sort_values(ascending=False)
                .head(5)
            )
            return top_pairs.to_dict(), None

        except Exception as e:
            logger.error(f"Correlation failed | {e}")
            return None, str(e)
        
    #-------------------------
    #Saving report    
    #-------------------------
    def _export_report(self, dataset, content):
        """Write the report text to ``output/report_<dataset>.txt``.

        The target directory is created when it does not exist yet, so the
        export also works on a fresh checkout. The path is relative to the
        current working directory. Export is best-effort: any failure is
        logged and reported through the return value instead of raising.

        Args:
            dataset: Dataset name, interpolated into the file name.
            content: Report text; written as UTF-8, replacing any old file.

        Returns:
            The relative path of the written file, or ``None`` on failure.
        """
        import os  # local import keeps this method self-contained

        try:
            path = f"output/report_{dataset}.txt"

            # open(..., "w") does not create missing parent directories.
            os.makedirs(os.path.dirname(path), exist_ok=True)

            with open(path, "w", encoding="utf-8") as f:
                f.write(content)

            logger.info(f"Report exported: {path}")

            return path

        except Exception as e:
            logger.error(f"Report export failed | {e}")
            return None

    # ---------------------------
    # MAIN HANDLER
    # ---------------------------
    def handle(self, dataset=None):

        try:
            # ---- HANDLE "analyze people" CASE ----
            if isinstance(dataset, str):
                extracted = self._extract_dataset(dataset)
                if extracted:
                    dataset = extracted

            # ---- STRICT DATASET CHECK ----
            if not dataset:
                return "Please specify a dataset (e.g., 'analyze people')"

            df = self.registry.load_dataframe(dataset)

        except Exception as e:
            logger.error(f"Failed loading dataset | {e}")
            return f"Failed to load dataset: {dataset}"

        try:
            # ---------- OUTPUT ----------
            output = []
            rows, cols = df.shape
            print("๐Ÿงน Checking duplicates...")
            # ---------- DATA QUALITY ----------
            total_missing = df.isnull().sum().sum()
            duplicates = df.duplicated().sum()

            missing_by_column = df.isnull().sum()
            missing_by_column = missing_by_column[missing_by_column > 0]

            # ---------- COLUMN TYPES ----------
            numeric_cols = df.select_dtypes(include="number").columns.tolist()
            categorical_cols = df.select_dtypes(exclude="number").columns.tolist()

            # ---------- WARNINGS ----------
            print("โš ๏ธ Generating warnings...")
            warnings = []

            for col in df.columns:
                if len(df) == 0:
                    continue

                unique_ratio = df[col].nunique() / len(df)

                if unique_ratio > 0.95 and "id" in col.lower():
                    warnings.append(f"{col} looks like an ID column")

                missing_ratio = df[col].isnull().sum() / len(df)
                if missing_ratio > 0.5:
                    warnings.append(f"{col} has {missing_ratio:.2%} missing values")

                if df[col].nunique() == 1:
                    warnings.append(f"{col} is constant (no variance)")

            # ---------- FEATURE IMPORTANCE (NEW CLEAN VERSION) ----------
            print("๐Ÿ“ˆ Looking for potential feature importance...")
            fi, dropped_cols, error = self._compute_feature_importance(df)

            # ---------- CORRELATION ANALYSIS ----------
            print("๐Ÿ“Š Computing correlation...")
            corr_pairs, corr_error = self._compute_correlation(df)

            output= []

            output.append(f"\nDataset Analysis: {dataset}")
            output.append("=" * 40)

            output.append(f"Rows: {rows}")
            output.append(f"Columns: {cols}")

            output.append("\nData Quality")
            output.append("-" * 20)
            output.append(f"Total Missing Values : {total_missing}")
            output.append(f"Duplicate Rows       : {duplicates}")
            # ---------- CORRELATION OUTPUT ----------
            output.append("\nTop Correlations")
            output.append("-" * 20)


            if corr_error:
                output.append(corr_error)
            elif corr_pairs is not None:
                for (col1, col2), val in corr_pairs.items():
                    output.append(f"{col1} โ†” {col2}: {val:.3f}")
            else:
                output.append("No correlation data available.")
            if not missing_by_column.empty:
                output.append("\nMissing by Column")
                output.append("-" * 20)
                for col, val in missing_by_column.items():
                    output.append(f"{col}: {val}")

            output.append("\nColumn Types")
            output.append("-" * 20)
            output.append(f"Numeric      : {', '.join(numeric_cols) if numeric_cols else 'None'}")
            output.append(f"Categorical  : {', '.join(categorical_cols) if categorical_cols else 'None'}")

            if warnings:
                output.append("\nโš ๏ธ Data Warnings")
                output.append("-" * 20)
                for w in warnings[:5]:
                    output.append(f"- {w}")

            # ---------- FEATURE IMPORTANCE OUTPUT ----------
            output.append("\nPotential Feature Importance")
            output.append("-" * 20)

            if error:
                output.append(error)
            else:
                for col, score in fi:
                    explanation = self._explain_feature(col)
                    output.append(f"{col}: {score:.4f} โ†’ {explanation}")

            # ---------- DROPPED COLUMNS ----------
            if dropped_cols:
                output.append("\nโš ๏ธ Ignored high-cardinality columns:")
                for col in dropped_cols:
                    output.append(f"- {col}")

             # ---------- EXPORT (ONLY ONCE) ----------
            report_path = self._export_report(dataset, "\n".join(output))

            if report_path:
                output.append(f"\n๐Ÿ“ Report saved to: {report_path}")

            return "\n".join(output)

        except Exception as e:
            logger.error(f"Analysis failed | {e}")
            return "Analysis agent error."