File size: 17,444 Bytes
13bf251
 
 
 
 
 
6753c42
13bf251
6753c42
 
13bf251
 
 
3e930db
13bf251
3e930db
e791e5b
 
 
 
 
2fe6c63
 
 
eaca5d0
 
5e1bd42
13bf251
eaca5d0
 
 
 
 
5e1bd42
 
e791e5b
5715ac3
5e1bd42
eaca5d0
 
e791e5b
 
 
eaca5d0
 
 
 
 
 
 
13bf251
5715ac3
eaca5d0
 
e791e5b
 
13bf251
e791e5b
 
13bf251
e791e5b
 
 
eaca5d0
 
13bf251
6753c42
3e930db
e791e5b
 
 
6753c42
 
eaca5d0
13bf251
6753c42
864aed7
 
 
 
13bf251
e791e5b
 
 
 
 
 
 
 
 
 
755fb3a
 
 
3e930db
e791e5b
3e930db
755fb3a
3e930db
 
 
 
 
 
 
2fe6c63
 
3e930db
 
 
2fe6c63
 
 
3e930db
 
 
 
 
 
 
 
 
 
 
2fe6c63
e791e5b
2fe6c63
e791e5b
755fb3a
e791e5b
 
 
 
 
 
755fb3a
2fe6c63
 
 
 
 
 
 
 
 
 
 
 
 
e791e5b
864aed7
e791e5b
 
 
 
 
 
 
 
 
 
 
 
3e930db
e791e5b
 
 
 
 
 
 
 
 
eaca5d0
e791e5b
 
 
 
 
13bf251
6753c42
13bf251
 
3e930db
e791e5b
 
 
 
 
 
eaca5d0
e791e5b
 
 
 
 
 
 
 
 
3e930db
e791e5b
 
 
 
 
5b239bf
 
e791e5b
 
5b239bf
e791e5b
 
 
5b239bf
13bf251
6753c42
13bf251
 
3e930db
e791e5b
 
13bf251
6753c42
13bf251
5b239bf
2fe6c63
 
 
 
5b239bf
 
 
 
 
 
 
2fe6c63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5715ac3
2fe6c63
 
 
 
 
 
 
 
5715ac3
 
 
 
 
2fe6c63
 
 
5715ac3
 
 
 
3ee9608
2fe6c63
 
 
6f355e0
2fe6c63
6f355e0
 
5715ac3
3ee9608
5715ac3
 
2fe6c63
 
13bf251
eaca5d0
ee936fb
 
 
 
e791e5b
ee936fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2fe6c63
ee936fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2fe6c63
1ed57a6
 
 
 
 
 
 
2fe6c63
6f355e0
1ed57a6
6f355e0
1ed57a6
 
 
 
 
 
6f355e0
1ed57a6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2fe6c63
ee936fb
 
 
5715ac3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
import pandas as pd
import numpy as np
import gradio as gr
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, r2_score
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import f_classif
import matplotlib.pyplot as plt
import seaborn as sns
import io
from PIL import Image

# ---------------------------------------------------------------------------
# Tunable settings
# ---------------------------------------------------------------------------
RANDOM_STATE = 42           # seed for the train/test split, the forests, KMeans and PCA
MIN_ROWS = 10               # fewest rows that must survive dropna() before analysing
MIN_COLS = 2                # the label column plus at least one feature column
MAX_FEATURES_TO_SHOW = 10   # number of bars drawn in the ranking plots

# Module-level cache filled in by analyze_file() and read by
# predict_interactive(); every slot starts out as None until a model is trained.
global_data = dict.fromkeys(('model', 'scaler', 'X_columns', 'y_type', 'uniques'))

def update_dropdown(file):
    """Refresh the label dropdown with the column names of the uploaded file.

    Args:
        file: Value delivered by the ``gr.File`` component. May be ``None``
            (nothing uploaded), an object exposing the path as ``.name``, or a
            plain path string.

    Returns:
        A ``gr.update`` payload whose ``choices`` are the file's column names.
        The choices are empty when there is no file, the extension is not
        ``.csv``/``.xlsx``, or the file cannot be read. The selected value is
        always reset to ``None``.
    """
    if file is None:
        return gr.update(choices=[], value=None)

    # Accept both tempfile-style wrappers (path in .name) and bare path strings.
    path = getattr(file, "name", file)
    extension_probe = str(path).lower()  # so ".CSV" / ".XLSX" are recognised too
    try:
        # nrows=0 parses only the header row; the data itself is not needed here.
        if extension_probe.endswith('.csv'):
            header = pd.read_csv(path, nrows=0)
        elif extension_probe.endswith('.xlsx'):
            header = pd.read_excel(path, nrows=0)
        else:
            return gr.update(choices=[], value=None)
        return gr.update(choices=list(header.columns), value=None)
    except Exception as e:
        # UI callback boundary: log and fall back to an empty dropdown.
        print(f"Error in update_dropdown: {e}")  # Debug logging
        return gr.update(choices=[], value=None)

def _early_exit(message):
    """Build the 6-slot result for a run that stops with only a status message."""
    return (message, None, None, None, None, None)


def _figure_to_image(fig):
    """Render *fig* to an in-memory PNG and return it as a PIL image."""
    buffer = io.BytesIO()
    fig.savefig(buffer, format="png", bbox_inches="tight")
    buffer.seek(0)
    return Image.open(buffer)


def _fit_regressor(X, y):
    """Fit a random-forest regressor on a 70/30 split and summarise its test metrics."""
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=RANDOM_STATE)
    model = RandomForestRegressor(random_state=RANDOM_STATE)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    summary = (
        "Regression Results:\n"
        f"- MSE: {mse:.3f}\n"
        f"- R²: {r2:.3f}\n"
        "\nCheck the 'Feature Importances' tab to see the top features impacting predictions.\n"
    )
    return model, summary, X_test, y_test, y_pred


def _plot_regression(model, X_test, y_test, y_pred):
    """Draw top-3-feature-vs-prediction scatters plus a true-vs-predicted panel."""
    ranked = pd.Series(model.feature_importances_, index=X_test.columns).sort_values(ascending=False)
    top_features = ranked.head(3).index
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    try:
        axes = axes.flatten()
        for ax, feature in zip(axes, top_features):
            ax.scatter(X_test[feature], y_pred, alpha=0.5)
            ax.set_xlabel(feature)
            ax.set_ylabel('Predicted Value')
            ax.set_title(f'{feature} vs Predicted')
        # With fewer than three features some panels have nothing to show.
        for unused in axes[len(top_features):3]:
            unused.axis('off')
        ax = axes[3]
        ax.scatter(y_test, y_pred, alpha=0.5)
        ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', label='Perfect Prediction')
        ax.set_xlabel('True Value')
        ax.set_ylabel('Predicted Value')
        ax.set_title('True vs Predicted')
        low = min(y_test.min(), y_pred.min())
        high = max(y_test.max(), y_pred.max())
        ax.set_xlim(low, high)
        ax.set_ylim(low, high)
        ax.legend()
        fig.tight_layout()
        return _figure_to_image(fig)
    finally:
        plt.close(fig)  # release the figure even if drawing failed


def _fit_classifier(X, y):
    """Fit a random-forest classifier on a 70/30 split; return model, classes, report, confusion matrix."""
    y_encoded, uniques = pd.factorize(y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.3, random_state=RANDOM_STATE)
    model = RandomForestClassifier(random_state=RANDOM_STATE)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Pin the label set explicitly: a class missing from the test split would
    # otherwise make the report's label count disagree with target_names.
    class_ids = np.arange(len(uniques))
    class_names = [str(u) for u in uniques]
    report = classification_report(
        y_test, y_pred, labels=class_ids, target_names=class_names, zero_division=0)
    summary = "Classification Results:\n" + report + "\n"
    matrix = confusion_matrix(y_test, y_pred, labels=class_ids)
    return model, uniques, summary, matrix


def _plot_confusion_matrix(matrix, uniques):
    """Draw the confusion matrix as an annotated heatmap."""
    class_names = [str(u) for u in uniques]
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_names, yticklabels=class_names, ax=ax)
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
        ax.set_title('Confusion Matrix')
        return _figure_to_image(fig)
    finally:
        plt.close(fig)


def _plot_feature_importances(model, columns):
    """Draw a horizontal bar chart of the model's highest feature importances."""
    fi = pd.Series(model.feature_importances_, index=columns).sort_values(ascending=False).head(MAX_FEATURES_TO_SHOW)
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.barplot(x=fi.values, y=fi.index, ax=ax)
        ax.set_title("Top 10 Feature Importances")
        ax.set_xlabel("Importance")
        ax.set_ylabel("Feature")
        return _figure_to_image(fig)
    finally:
        plt.close(fig)


def _project_to_2d(X_scaled):
    """Project the scaled features onto at most two principal components.

    Returns the (n_samples, 2) coordinates and the summed explained-variance
    ratio. A single-feature input is padded with a zero second axis so it can
    still be drawn as a scatter plot.
    """
    n_components = min(2, X_scaled.shape[1])
    pca = PCA(n_components=n_components, random_state=RANDOM_STATE)
    coords = pca.fit_transform(X_scaled)
    if n_components == 1:
        coords = np.column_stack([coords[:, 0], np.zeros(len(coords))])
    return coords, sum(pca.explained_variance_ratio_)


def _plot_clusters(coords, cluster_labels, n_clusters, title, cmap):
    """Scatter the 2-D projection coloured by cluster assignment."""
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        scatter = ax.scatter(coords[:, 0], coords[:, 1], c=cluster_labels, cmap=cmap, alpha=0.7)
        ax.set_xlabel("PCA 1")
        ax.set_ylabel("PCA 2")
        ax.set_title(title)
        fig.colorbar(scatter, ax=ax, ticks=range(n_clusters))
        return _figure_to_image(fig)
    finally:
        plt.close(fig)


def _plot_cluster_differences(X, cluster_labels):
    """Rank features by ANOVA F-score across clusters and draw the top ones."""
    f_scores, _ = f_classif(X, cluster_labels)
    # Constant features yield NaN/inf scores; treat them as non-informative.
    f_scores = np.nan_to_num(f_scores, nan=0.0, posinf=0.0)
    top = pd.Series(f_scores, index=X.columns).sort_values(ascending=False).head(MAX_FEATURES_TO_SHOW)
    frame = pd.DataFrame({"Feature": [str(c) for c in top.index], "F-score": top.values})
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        # hue + legend=False keeps per-bar colours without the palette warning.
        sns.barplot(data=frame, x="Feature", y="F-score", hue="Feature", legend=False, ax=ax)
        ax.set_title("Top 10 Differentiating Features (ANOVA F-scores)")
        # Features are on the x axis and scores on the y axis.
        ax.set_xlabel("Feature")
        ax.set_ylabel("F-score")
        ax.tick_params(axis='x', labelrotation=45)
        return _figure_to_image(fig)
    finally:
        plt.close(fig)


def analyze_file(file, label_col, n_clusters):
    """Train a random forest on the uploaded table and cluster its features.

    The label column decides the task: a numeric dtype is treated as
    regression, anything else as classification. Rows containing missing
    values are dropped, features are one-hot encoded with ``pd.get_dummies``,
    and the forest is fitted on those unscaled features. A standardised copy
    of the features is used only for KMeans, agglomerative clustering and the
    PCA projection.

    On successful training the model, scaler, encoded column index, task type,
    class labels and the raw feature names (``'feature_columns'``) are stored
    in ``global_data``.

    Args:
        file: Uploaded file; an object exposing the path as ``.name`` or a
            plain path string. Only ``.csv`` and ``.xlsx`` are accepted.
        label_col: Name of the column to predict.
        n_clusters: Number of clusters for both clustering methods.

    Returns:
        A 6-tuple ``(results_text, model_img, fi_img, kmeans_img, agg_img,
        diff_img)``. The images are PIL images, or ``None`` for steps that
        were skipped or failed. Validation failures return the message with
        five ``None`` values. Failures in individual analysis steps are
        appended to ``results_text`` instead of being raised.
    """
    if file is None:
        return _early_exit("Please upload a file.")

    # Accept both tempfile-style wrappers (path in .name) and bare path strings.
    path = getattr(file, "name", file)
    try:
        extension_probe = str(path).lower()  # so ".CSV" / ".XLSX" are recognised too
        if extension_probe.endswith('.csv'):
            df = pd.read_csv(path)
        elif extension_probe.endswith('.xlsx'):
            df = pd.read_excel(path)
        else:
            return _early_exit("Unsupported file type. Please upload a CSV or XLSX file.")
    except Exception as e:
        print(f"Error reading file: {e}")  # Debug logging
        return _early_exit(f"Error reading file: {e}")

    if df.empty:
        return _early_exit("File is empty.")
    if label_col not in df.columns:
        return _early_exit(f"Label column '{label_col}' not found.")

    df = df.dropna()
    if df.shape[0] < MIN_ROWS:
        return _early_exit(f"Not enough data rows (less than {MIN_ROWS}) after removing missing values.")
    if df.shape[1] < MIN_COLS:
        return _early_exit("Need at least one feature and one label column.")

    y = df[label_col]
    X = df.drop(columns=[label_col])
    X_processed = pd.get_dummies(X)
    if X_processed.shape[1] == 0:
        return _early_exit("No valid features after preprocessing.")

    is_regression = pd.api.types.is_numeric_dtype(y)
    if not is_regression and y.nunique() < 2:
        return _early_exit("Label must have at least 2 unique values.")

    # The slider may deliver a float; the estimators and range() need an int.
    n_clusters = int(n_clusters)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_processed)

    results_text = ""
    model = None
    model_img = fi_img = kmeans_img = agg_img = diff_img = None

    # --- Supervised model -------------------------------------------------
    try:
        if is_regression:
            model, summary, X_test, y_test, y_pred = _fit_regressor(X_processed, y)
            results_text += summary
            model_img = _plot_regression(model, X_test, y_test, y_pred)
            y_type, uniques = 'regression', None
        else:
            model, uniques, summary, matrix = _fit_classifier(X_processed, y)
            results_text += summary
            model_img = _plot_confusion_matrix(matrix, uniques)
            y_type = 'classification'
        global_data.update({
            'model': model,
            'scaler': scaler,
            'X_columns': X_processed.columns,
            'y_type': y_type,
            'uniques': uniques,
            # Raw (pre-encoding) feature names, in input order.
            'feature_columns': list(X.columns),
        })
    except Exception as e:
        results_text += f"\nError during model training: {e}"

    # --- Feature importances ----------------------------------------------
    if model is None:
        results_text += "\nWarning: Could not compute feature importance: no trained model is available"
    else:
        try:
            fi_img = _plot_feature_importances(model, X_processed.columns)
        except Exception as e:
            results_text += f"\nWarning: Could not compute feature importance: {e}"

    # --- Shared 2-D projection for both cluster plots -----------------------
    X_pca = explained_var = None
    try:
        X_pca, explained_var = _project_to_2d(X_scaled)
    except Exception as e:
        results_text += f"\nWarning: PCA projection failed: {e}"

    # --- KMeans -------------------------------------------------------------
    clusters_kmeans = None
    try:
        clusters_kmeans = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE).fit_predict(X_scaled)
        if X_pca is not None:
            kmeans_img = _plot_clusters(
                X_pca, clusters_kmeans, n_clusters,
                f"KMeans Clustering (PCA, {explained_var:.2%} variance explained)", "viridis")
    except Exception as e:
        results_text += f"\nWarning: KMeans clustering failed: {e}"

    # --- Agglomerative clustering (independent of the KMeans outcome) ------
    try:
        clusters_agg = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(X_scaled)
        if X_pca is not None:
            agg_img = _plot_clusters(
                X_pca, clusters_agg, n_clusters,
                f"Agglomerative Clustering (PCA, {explained_var:.2%} variance explained)", "plasma")
    except Exception as e:
        results_text += f"\nWarning: Agglomerative clustering failed: {e}"

    # --- Features that separate the KMeans clusters -------------------------
    if clusters_kmeans is None:
        results_text += "\nWarning: Could not compute differentiating features: KMeans cluster labels are unavailable"
    else:
        try:
            diff_img = _plot_cluster_differences(X_processed, clusters_kmeans)
        except Exception as e:
            results_text += f"\nWarning: Could not compute differentiating features: {e}"

    return results_text, model_img, fi_img, kmeans_img, agg_img, diff_img

def predict_interactive(*args):
    """Predict the label for one manually entered row using the cached model.

    Args:
        *args: One value per input widget, in feature order. The values are
            matched positionally to the raw feature names stored under
            ``global_data['feature_columns']`` when that key is present;
            otherwise they are matched to ``global_data['X_columns']``.

    Returns:
        A human-readable string: the predicted class or value, a prompt to
        train a model first, or an error description. Exceptions are reported
        in the returned string rather than raised.
    """
    model = global_data['model']
    if model is None:
        return "Please analyze a file first to train the model."

    try:
        trained_columns = global_data['X_columns']

        # Prefer the raw feature names if the cache provides them; fall back to
        # the encoded column index otherwise.
        input_columns = global_data.get('feature_columns')
        if input_columns is None:
            input_columns = trained_columns if trained_columns is not None else []

        row = dict(zip(input_columns, args))
        if not row:
            return ("Error in prediction: no input values were provided. "
                    "Please ensure all inputs are valid numbers or categories.")

        # An empty widget would otherwise be silently encoded as 0.
        missing = [str(col) for col, value in row.items() if value is None or value == ""]
        if missing:
            return (f"Error in prediction: missing value for {', '.join(missing)}. "
                    "Please ensure all inputs are valid numbers or categories.")

        # One-hot encode the single row, then align it with the training
        # layout: absent dummy columns become 0, unseen categories are dropped.
        encoded = pd.get_dummies(pd.DataFrame([row]))
        encoded = encoded.reindex(columns=trained_columns, fill_value=0)

        # The forest was fitted on the unscaled get_dummies frame (the scaler
        # is only fitted for the clustering step), so the row must not be
        # standardised here. Passing a DataFrame keeps the feature names.
        prediction = model.predict(encoded)

        if global_data['y_type'] == 'classification':
            pred_value = global_data['uniques'][int(prediction[0])]
            return f"Predicted class: {pred_value}"
        return f"Predicted value: {prediction[0]:.3f}"
    except Exception as e:
        return f"Error in prediction: {str(e)}. Please ensure all inputs are valid numbers or categories."

def create_interactive_inputs(file, label_col):
    """Create one Gradio input widget per feature column of the uploaded file.

    Numeric columns get a ``gr.Number``; all other columns get a
    ``gr.Dropdown`` listing the column's distinct non-null values. Each label
    shows up to three randomly sampled example values.

    Args:
        file: Uploaded file; an object exposing the path as ``.name`` or a
            plain path string. Only ``.csv`` and ``.xlsx`` are accepted.
        label_col: Name of the column to predict; it is excluded from the
            generated inputs.

    Returns:
        A list of Gradio components in feature-column order. The list is empty
        when there is no file or label, the file type is unsupported, the
        label is not a column, no features remain, or any error occurs.
    """
    if file is None or label_col is None:
        print("No file or label column provided")  # Debug logging
        return []

    # Accept both tempfile-style wrappers (path in .name) and bare path strings.
    path = getattr(file, "name", file)
    try:
        extension_probe = str(path).lower()  # so ".CSV" / ".XLSX" are recognised too
        if extension_probe.endswith('.csv'):
            df = pd.read_csv(path)
        elif extension_probe.endswith('.xlsx'):
            df = pd.read_excel(path)
        else:
            print("Unsupported file type")  # Debug logging
            return []

        if df.empty or label_col not in df.columns:
            print(f"Empty DataFrame or invalid label column: {label_col}")  # Debug logging
            return []

        X = df.drop(columns=[label_col])
        if X.empty:
            print("No features available after dropping label column")  # Debug logging
            return []

        components = []
        for col in X.columns:
            present = X[col].dropna()  # computed once, reused for examples and choices
            examples = present.sample(min(3, len(present))).tolist()
            label = f"{col} (e.g., {', '.join(map(str, examples))})"
            if pd.api.types.is_numeric_dtype(X[col]):
                components.append(gr.Number(label=label, value=None))
            else:
                components.append(gr.Dropdown(label=label, choices=present.unique().tolist(), value=None))
        print(f"Generated {len(components)} input components")  # Debug logging
        return components
    except Exception as e:
        # UI callback boundary: log and show no inputs rather than crash.
        print(f"Error in create_interactive_inputs: {e}")  # Debug logging
        return []

# ---------------------------------------------------------------------------
# Gradio UI: upload + options row, an Analyze button, one tab per result, and
# an "Interactive" tab whose inputs are generated from the uploaded file.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## Data Analysis Explorer")
    gr.Markdown("Upload a CSV or XLSX file to explore classification, regression, and clustering. Select a column to predict and the number of clusters!")

    with gr.Row():
        file_input = gr.File(label="Upload CSV or XLSX", file_types=[".csv", ".xlsx"])
        label_dropdown = gr.Dropdown(label="Select Column to Predict", choices=[], interactive=True)
        clusters_slider = gr.Slider(minimum=2, maximum=10, step=1, value=3, label="Number of Clusters")

    # A new upload repopulates the label dropdown with the file's columns.
    file_input.change(fn=update_dropdown, inputs=file_input, outputs=label_dropdown)
    analyze_btn = gr.Button("Analyze")

    with gr.Tabs():
        with gr.TabItem("Prediction Results"):
            gr.Markdown("### Classification or Regression")
            gr.Markdown("""
            - **Regression**: Predicts numbers (e.g., sales). Uses Random Forest.
            - **Classification**: Predicts categories (e.g., yes/no). Uses Random Forest.
            - Rows with missing values are removed. 70% of data trains the model; 30% tests it.
            """)
            results_textbox = gr.Textbox(label="Performance Metrics", lines=10)

        with gr.TabItem("Prediction Plot"):
            gr.Markdown("### Prediction Visualization")
            gr.Markdown("For regression: scatter plots of top 3 features vs. predicted values and true vs. predicted. For classification: confusion matrix.")
            model_img_output = gr.Image(label="Prediction Output")

        with gr.TabItem("Feature Importances"):
            gr.Markdown("### Top 10 Key Features")
            gr.Markdown("Shows the most important features for predictions. Higher bars mean bigger impact.")
            fi_output = gr.Image(label="Feature Importances")

        with gr.TabItem("KMeans Clustering"):
            gr.Markdown("### KMeans Clustering")
            gr.Markdown("Groups similar data points without using the selected column. Colors show clusters in 2D (PCA projection).")
            kmeans_output = gr.Image(label="KMeans Clusters")

        with gr.TabItem("Agglomerative Clustering"):
            gr.Markdown("### Agglomerative Clustering")
            gr.Markdown("Another way to group data hierarchically. Compare with KMeans to see differences!")
            agg_output = gr.Image(label="Agglomerative Clusters")

        with gr.TabItem("Cluster Differences"):
            gr.Markdown("### Top 10 Cluster-Differentiating Features")
            gr.Markdown("Shows features that vary most between clusters, helping explain the groupings.")
            diff_output = gr.Image(label="Differentiating Features")

        with gr.TabItem("Interactive"):
            gr.Markdown("### Interactive Prediction")
            gr.Markdown("Enter values for each feature to get a prediction based on the trained model.")
            with gr.Column():
                # Components cannot be delivered through an event's return
                # value, so the per-feature inputs are built in a render
                # function that re-runs whenever the file or label changes.
                # Widgets created during the call are placed at this position.
                @gr.render(inputs=[file_input, label_dropdown])
                def render_feature_inputs(file, label_col):
                    print(f"Updating inputs with file: {file}, label_col: {label_col}")  # Debug logging
                    components = create_interactive_inputs(file, label_col)
                    if not components:
                        gr.Markdown("Upload a file and select a column to predict to see the feature inputs.")
                        predict_btn.click(
                            fn=lambda: "Please upload a file and select a column to predict first.",
                            inputs=None,
                            outputs=prediction_output
                        )
                        return
                    # Listeners that read render-created components must be
                    # attached inside the render function; each widget's value
                    # becomes one positional argument of predict_interactive.
                    predict_btn.click(
                        fn=predict_interactive,
                        inputs=components,
                        outputs=prediction_output
                    )

                predict_btn = gr.Button("Predict")
                prediction_output = gr.Textbox(label="Prediction Result")

    analyze_btn.click(fn=analyze_file, inputs=[file_input, label_dropdown, clusters_slider],
                      outputs=[results_textbox, model_img_output, fi_output, kmeans_output, agg_output, diff_output])

demo.launch(debug=True)  # Enable debug mode for more detailed error logging