File size: 7,513 Bytes
2c29579
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
import logging

import numpy as np
import pandas as pd


_logger = logging.getLogger(__name__)

# Above this many rows, per-column statistics are computed on a random sample.
_SAMPLE_LIMIT = 100000

# Lower-cased column names accepted as the target on an exact match.
_TARGET_NAMES = frozenset({
    'target', 'label', 'class', 'outcome', 'result', 'churn', 'price',
    'default', 'y', 'survived', 'fraud', 'output', 'pred', 'prediction',
    'target_class', 'target_label',
})

# Substrings that disqualify a column from the "last column" target fallback.
_ID_HINTS = ('id', 'uuid', 'uid', 'index', 'idx', 'timestamp', 'date', 'time', 'row_id')


def _suggest_target(columns):
    """Return the most plausible target column label, or None if there are no columns."""
    # 1) First column whose lower-cased name is a well-known target name.
    for col in columns:
        if str(col).lower() in _TARGET_NAMES:
            return col
    # 2) Otherwise the right-most column that does not look like an ID/timestamp.
    for col in reversed(columns):
        name = str(col).lower()
        if not any(hint in name for hint in _ID_HINTS):
            return col
    # 3) Every column looks like an identifier: fall back to the last one.
    return columns[-1] if columns else None


def _target_imbalance(target_values):
    """Return "High ⚠️" when the top class is >3x the runner-up, else "Low"."""
    counts = target_values.value_counts()
    # value_counts() sorts descending, so iloc[0] >= iloc[1] and the ratio is
    # always >= 1. The runner-up count can still be 0 for categorical dtypes,
    # which list unobserved categories.
    if len(counts) >= 2 and counts.iloc[1] != 0:
        if counts.iloc[0] / counts.iloc[1] > 3:
            return "High ⚠️"
    return "Low"


def _semantic_type(col, series, unique_count, unique_pct, is_numeric):
    """Classify a column into a coarse semantic type from its name, dtype and cardinality."""
    if unique_count == 2:
        return "Binary"
    name = str(col).lower()
    if unique_pct > 0.8 and any(hint in name for hint in ('id', 'uuid', 'index')):
        return "ID/Index"
    if is_numeric:
        # Integer-like columns with few distinct values are treated as ordinal.
        if pd.api.types.is_float_dtype(series) or unique_count >= 20:
            return "Continuous"
        return "Discrete/Ordinal"
    if pd.api.types.is_datetime64_any_dtype(series):
        return "DateTime"
    return "Nominal Category"


def _add_numeric_stats(stats, series, is_sampled):
    """Add mean/std/min/max/skew and IQR outlier counts for *series* to *stats* in place."""
    stats["mean"] = round(float(series.mean()), 4)
    stats["std"] = round(float(series.std()), 4)
    stats["min"] = round(float(series.min()), 4)
    stats["max"] = round(float(series.max()), 4)
    stats["skew"] = "N/A (Sampled)" if is_sampled else round(float(series.skew()), 1)

    # Outliers by the 1.5 * IQR rule; a zero IQR would flag every value that
    # differs from the quartiles, so such columns keep the default of 0.
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    if iqr != 0:
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        outlier_count = int(((series < lower_bound) | (series > upper_bound)).sum())
        stats["outliers"] = outlier_count
        stats["outlier_pct"] = round((outlier_count / len(series)) * 100, 2)


def _infer_task_type(target_values):
    """Return "regression" or "classification" for the given target column."""
    target_values = target_values.dropna()
    if target_values.empty or not pd.api.types.is_numeric_dtype(target_values):
        return "classification"
    if pd.api.types.is_float_dtype(target_values):
        return "regression"
    # Non-float numeric target: classification only when there are few
    # distinct values relative to the number of rows.
    unique_count = target_values.nunique(dropna=True)
    unique_ratio = unique_count / max(len(target_values), 1)
    if unique_count <= 20 and unique_ratio <= 0.2:
        return "classification"
    return "regression"


def profile_dataset(df: pd.DataFrame) -> dict:
    """Build a summary profile of a tabular dataset.

    Args:
        df: The dataset to profile. ``None`` or an empty frame (no rows or
            no columns) is accepted.

    Returns:
        ``{"error": "Empty dataset"}`` for ``None``/empty input. Otherwise a
        dict with the keys ``rows``, ``cols``, ``size``, ``missing_pct``,
        ``missing_values``, ``columns``, ``num_cols``, ``cat_cols``,
        ``imbalance``, ``suggested_target``, ``task_type``,
        ``suggested_model``, ``column_stats``, ``is_sampled``,
        ``sample_size`` and ``health``.

        Missing-value figures are computed on the full frame. Per-column
        statistics, the numeric/categorical split and ``imbalance`` are
        computed on a fixed-seed random sample of 100000 rows when the frame
        has more rows than that. ``task_type`` is always derived from the
        full target column. ``health`` holds the result of
        ``core.health_score.compute_health_score`` or
        ``{"error": "health_score unavailable"}`` if that call fails.
    """
    if df is None or df.empty:
        return {"error": "Empty dataset"}

    # df is non-empty from here on, so rows and cols are both >= 1.
    rows, cols = df.shape

    if rows < 1000:
        size = "Small"
    elif rows < 100000:
        size = "Medium"
    else:
        size = "Large"

    # Missing values (always on the full frame).
    missing_by_col = df.isnull().sum()
    missing_cells = int(missing_by_col.sum())
    missing_pct = round((missing_cells / (rows * cols)) * 100, 2)

    # Sampling keeps the per-column statistics cheap on large frames; the
    # fixed seed makes repeated profiles of the same data identical.
    is_sampled = rows > _SAMPLE_LIMIT
    df_stats = df.sample(n=_SAMPLE_LIMIT, random_state=42) if is_sampled else df
    sample_size = len(df_stats)

    columns = df.columns.tolist()

    # The target may legitimately be a falsy label (0, "", False), so it is
    # compared against None below rather than tested for truthiness.
    suggested_target = _suggest_target(columns)

    # Feature types
    num_cols = df_stats.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = df_stats.select_dtypes(include=['object', 'category']).columns.tolist()
    numeric_set = set(num_cols)
    categorical_set = set(cat_cols)

    # Model suggestion
    suggested_model = "Tree-based (Random Forest / XGBoost)"
    if rows > 10000 and len(num_cols) > len(cat_cols):
        suggested_model = "Gradient Boosting (XGBoost/LightGBM)"
    elif len(num_cols) > 0 and len(cat_cols) == 0 and rows < 5000:
        suggested_model = "Linear Model / SVM"

    # Imbalance
    imbalance = "Low"
    if suggested_target is not None:
        imbalance = _target_imbalance(df_stats[suggested_target])

    column_stats = {}
    for col in columns:
        series = df_stats[col]
        unique_count = int(series.nunique())
        unique_pct = unique_count / sample_size
        missing = int(missing_by_col.get(col, 0))
        is_numeric = col in numeric_set

        stats = {
            "dtype": str(series.dtype),
            "missing": missing,
            "missing_pct": round(float(missing / rows * 100), 1),
            "unique": unique_count,
            "unique_pct": unique_pct,
            "outliers": 0,
            "semantic_type": _semantic_type(
                col, series, unique_count, unique_pct, is_numeric
            ),
        }

        if is_numeric and not series.isnull().all():
            # Best effort: a column whose values cannot be summarised (for
            # example complex numbers) must not abort the whole profile.
            try:
                _add_numeric_stats(stats, series, is_sampled)
            except Exception:
                _logger.warning(
                    "Could not compute numeric statistics for column %r",
                    col, exc_info=True,
                )
        elif col in categorical_set:
            try:
                stats["top_values"] = series.value_counts().head(3).index.tolist()
            except Exception:
                _logger.warning(
                    "Could not compute top values for column %r",
                    col, exc_info=True,
                )
                stats["top_values"] = []

        column_stats[col] = stats

    task_type = "classification"
    if suggested_target is not None:
        task_type = _infer_task_type(df[suggested_target])

    # Health score is optional: the scoring module may be missing or may
    # reject the metadata, and the profile is still useful without it.
    try:
        from core.health_score import compute_health_score

        health_metadata = compute_health_score({
            "rows": rows,
            "cols": cols,
            "missing_pct": missing_pct,
            "imbalance": imbalance,
            "num_cols": num_cols,
            "cat_cols": cat_cols,
            "column_stats": column_stats
        })
    except Exception:
        _logger.warning("Health score unavailable", exc_info=True)
        health_metadata = {"error": "health_score unavailable"}

    return {
        "rows": rows,
        "cols": cols,
        "size": size,
        "missing_pct": missing_pct,
        "missing_values": missing_cells,
        "columns": columns,
        "num_cols": num_cols,
        "cat_cols": cat_cols,
        "imbalance": imbalance,
        "suggested_target": suggested_target,
        "task_type": task_type,
        "suggested_model": suggested_model,
        "column_stats": column_stats,
        "is_sampled": is_sampled,
        "sample_size": sample_size,
        "health": health_metadata
    }