jaker86 committed on
Commit
e791e5b
·
verified ·
1 Parent(s): ee936fb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -108
app.py CHANGED
@@ -12,8 +12,14 @@ import matplotlib.pyplot as plt
12
  import seaborn as sns
13
  import io
14
 
 
 
 
 
 
 
15
  def update_dropdown(file):
16
- """Update the dropdown choices with column names from the uploaded file."""
17
  if file is None:
18
  return gr.Dropdown.update(choices=[], value=None)
19
  try:
@@ -24,12 +30,16 @@ def update_dropdown(file):
24
  else:
25
  return gr.Dropdown.update(choices=[], value=None)
26
  return gr.Dropdown.update(choices=list(df.columns), value=None)
27
- except Exception:
28
  return gr.Dropdown.update(choices=[], value=None)
29
 
30
  def analyze_file(file, label_col, n_clusters):
31
  """Analyze the uploaded file with ML techniques and return results and plots."""
32
- # Read the file based on its extension
 
 
 
 
33
  try:
34
  if file.name.endswith('.csv'):
35
  df = pd.read_csv(file.name)
@@ -40,147 +50,173 @@ def analyze_file(file, label_col, n_clusters):
40
  except Exception as e:
41
  return (f"Error reading file: {e}", None, None, None, None, None)
42
 
43
- # Validate label column
 
 
44
  if label_col not in df.columns:
45
- return (f"Label column '{label_col}' not found. Please select a valid column.", None, None, None, None, None)
46
-
47
- # Clean data and validate size
48
  df = df.dropna()
49
- if df.shape[0] < 10:
50
- return ("Not enough data rows (less than 10) after removing missing values.", None, None, None, None, None)
51
- if df.shape[1] < 2:
52
  return ("Need at least one feature and one label column.", None, None, None, None, None)
53
 
54
  # Separate features and target
55
  y = df[label_col]
56
  X = df.drop(columns=[label_col])
57
  X_processed = pd.get_dummies(X) # One-hot encode categorical features
 
 
 
 
58
  scaler = StandardScaler()
59
  X_scaled = scaler.fit_transform(X_processed)
60
 
61
  results_text = ""
62
  model_img = None
63
 
64
- # Prediction: regression or classification based on target type
65
- if pd.api.types.is_numeric_dtype(y):
66
- # Regression
67
- X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.3, random_state=42)
68
- model = RandomForestRegressor(random_state=42)
69
- model.fit(X_train, y_train)
70
- y_pred = model.predict(X_test)
71
- mse = mean_squared_error(y_test, y_pred)
72
- r2 = r2_score(y_test, y_pred)
73
- results_text += (
74
- "Regression Results (predicting numeric values):\n"
75
- f"- Mean Squared Error (MSE): {mse:.3f} (lower is better)\n"
76
- f"- Score: {r2:.3f} (0 to 1, higher is better)\n"
77
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  plt.figure(figsize=(8, 6))
79
- plt.scatter(y_test, y_pred, alpha=0.7)
80
- plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
81
- plt.xlabel("True Values")
82
- plt.ylabel("Predicted Values")
83
- plt.title("Regression: True vs Predicted")
84
  buf = io.BytesIO()
85
  plt.savefig(buf, format="png", bbox_inches="tight")
86
  plt.close()
87
  buf.seek(0)
88
- model_img = buf
89
- else:
90
- # Classification
91
- y_encoded, uniques = pd.factorize(y)
92
- X_train, X_test, y_train, y_test = train_test_split(X_processed, y_encoded, test_size=0.3, random_state=42)
93
- model = RandomForestClassifier(random_state=42)
94
- model.fit(X_train, y_train)
95
- y_pred = model.predict(X_test)
96
- cm = confusion_matrix(y_test, y_pred)
97
- cr = classification_report(y_test, y_pred, target_names=[str(u) for u in uniques])
98
- results_text += "Classification Results (predicting categories):\n" + cr + "\n"
99
  plt.figure(figsize=(8, 6))
100
- sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=uniques, yticklabels=uniques)
101
- plt.xlabel("Predicted")
102
- plt.ylabel("True")
103
- plt.title("Confusion Matrix")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  buf = io.BytesIO()
105
  plt.savefig(buf, format="png", bbox_inches="tight")
106
  plt.close()
107
  buf.seek(0)
108
- model_img = buf
109
-
110
- # Feature importance (top 10)
111
- fi = pd.Series(model.feature_importances_, index=X_processed.columns).sort_values(ascending=False).head(10)
112
- plt.figure(figsize=(10, 6))
113
- sns.barplot(x=fi.values, y=fi.index)
114
- plt.title("Top 10 Feature Importances")
115
- plt.xlabel("Importance")
116
- plt.ylabel("Feature")
117
- buf = io.BytesIO()
118
- plt.savefig(buf, format="png", bbox_inches="tight")
119
- plt.close()
120
- buf.seek(0)
121
- fi_img = buf
122
-
123
- # KMeans clustering
124
- kmeans = KMeans(n_clusters=n_clusters, random_state=42)
125
- clusters_kmeans = kmeans.fit_predict(X_scaled)
126
- pca = PCA(n_components=2, random_state=42)
127
- X_pca = pca.fit_transform(X_scaled)
128
- explained_var = sum(pca.explained_variance_ratio_)
129
- plt.figure(figsize=(8, 6))
130
- scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters_kmeans, cmap="viridis", alpha=0.7)
131
- plt.xlabel("PCA 1")
132
- plt.ylabel("PCA 2")
133
- plt.title(f"KMeans Clustering (PCA, {explained_var:.2%} variance explained)")
134
- plt.colorbar(scatter, ticks=range(n_clusters))
135
- buf = io.BytesIO()
136
- plt.savefig(buf, format="png", bbox_inches="tight")
137
- plt.close()
138
- buf.seek(0)
139
- kmeans_img = buf
140
-
141
- # Agglomerative clustering
142
- agg = AgglomerativeClustering(n_clusters=n_clusters)
143
- clusters_agg = agg.fit_predict(X_scaled)
144
- plt.figure(figsize=(8, 6))
145
- scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters_agg, cmap="plasma", alpha=0.7)
146
- plt.xlabel("PCA 1")
147
- plt.ylabel("PCA 2")
148
- plt.title(f"Agglomerative Clustering (PCA, {explained_var:.2%} variance explained)")
149
- plt.colorbar(scatter, ticks=range(n_clusters))
150
- buf = io.BytesIO()
151
- plt.savefig(buf, format="png", bbox_inches="tight")
152
- plt.close()
153
- buf.seek(0)
154
- agg_img = buf
155
-
156
- # Differentiating features (top 10)
157
- f_scores, _ = f_classif(X_processed, clusters_kmeans)
158
- f_series = pd.Series(f_scores, index=X_processed.columns).sort_values(ascending=False).head(10)
159
- plt.figure(figsize=(10, 6))
160
- sns.barplot(x=f_series.values, y=f_series.index, palette="mako")
161
- plt.title("Top 10 Differentiating Features (ANOVA F-scores)")
162
- plt.xlabel("F-score")
163
- plt.ylabel("Feature")
164
- buf = io.BytesIO()
165
- plt.savefig(buf, format="png", bbox_inches="tight")
166
- plt.close()
167
- buf.seek(0)
168
- diff_img = buf
169
 
170
  return results_text, model_img, fi_img, kmeans_img, agg_img, diff_img
171
 
 
172
  with gr.Blocks() as demo:
173
  gr.Markdown("## Data Analysis Explorer")
174
  gr.Markdown("Upload a CSV or XLSX file to explore classification, regression, and clustering. Select a column to predict and the number of clusters!")
175
 
176
  with gr.Row():
177
  file_input = gr.File(label="Upload CSV or XLSX", file_types=[".csv", ".xlsx"])
178
- label_dropdown = gr.Dropdown(label="Select Column to Predict", interactive=True)
179
  clusters_slider = gr.Slider(minimum=2, maximum=10, step=1, value=3, label="Number of Clusters")
180
 
181
- # Event handler to update dropdown when file is uploaded
182
  file_input.change(fn=update_dropdown, inputs=file_input, outputs=label_dropdown)
183
-
184
  analyze_btn = gr.Button("Analyze")
185
 
186
  with gr.Tabs():
 
12
  import seaborn as sns
13
  import io
14
 
15
+ # Constants for reproducibility and configuration
16
+ RANDOM_STATE = 42
17
+ MIN_ROWS = 10
18
+ MIN_COLS = 2
19
+ MAX_FEATURES_TO_SHOW = 10
20
+
21
  def update_dropdown(file):
22
+ """Update dropdown choices with column names from the uploaded file."""
23
  if file is None:
24
  return gr.Dropdown.update(choices=[], value=None)
25
  try:
 
30
  else:
31
  return gr.Dropdown.update(choices=[], value=None)
32
  return gr.Dropdown.update(choices=list(df.columns), value=None)
33
+ except Exception as e:
34
  return gr.Dropdown.update(choices=[], value=None)
35
 
36
  def analyze_file(file, label_col, n_clusters):
37
  """Analyze the uploaded file with ML techniques and return results and plots."""
38
+ # Validate file input
39
+ if file is None:
40
+ return ("Please upload a file.", None, None, None, None, None)
41
+
42
+ # Read file based on extension
43
  try:
44
  if file.name.endswith('.csv'):
45
  df = pd.read_csv(file.name)
 
50
  except Exception as e:
51
  return (f"Error reading file: {e}", None, None, None, None, None)
52
 
53
+ # Validate data shape and label column
54
+ if df.empty:
55
+ return ("File is empty.", None, None, None, None, None)
56
  if label_col not in df.columns:
57
+ return (f"Label column '{label_col}' not found.", None, None, None, None, None)
58
+
59
+ # Clean data and check minimum requirements
60
  df = df.dropna()
61
+ if df.shape[0] < MIN_ROWS:
62
+ return (f"Not enough data rows (less than {MIN_ROWS}) after removing missing values.", None, None, None, None, None)
63
+ if df.shape[1] < MIN_COLS:
64
  return ("Need at least one feature and one label column.", None, None, None, None, None)
65
 
66
  # Separate features and target
67
  y = df[label_col]
68
  X = df.drop(columns=[label_col])
69
  X_processed = pd.get_dummies(X) # One-hot encode categorical features
70
+ if X_processed.shape[1] == 0:
71
+ return ("No valid features after preprocessing.", None, None, None, None, None)
72
+
73
+ # Scale features
74
  scaler = StandardScaler()
75
  X_scaled = scaler.fit_transform(X_processed)
76
 
77
  results_text = ""
78
  model_img = None
79
 
80
+ # Prediction: Regression or Classification
81
+ try:
82
+ if pd.api.types.is_numeric_dtype(y):
83
+ # Regression
84
+ X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.3, random_state=RANDOM_STATE)
85
+ model = RandomForestRegressor(random_state=RANDOM_STATE)
86
+ model.fit(X_train, y_train)
87
+ y_pred = model.predict(X_test)
88
+ mse = mean_squared_error(y_test, y_pred)
89
+ r2 = r2_score(y_test, y_pred)
90
+ results_text += (
91
+ "Regression Results (predicting numeric values):\n"
92
+ f"- Mean Squared Error (MSE): {mse:.3f} (lower is better)\n"
93
+ f"- R² Score: {r2:.3f} (0 to 1, higher is better)\n"
94
+ )
95
+ plt.figure(figsize=(8, 6))
96
+ plt.scatter(y_test, y_pred, alpha=0.7)
97
+ plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
98
+ plt.xlabel("True Values")
99
+ plt.ylabel("Predicted Values")
100
+ plt.title("Regression: True vs Predicted")
101
+ buf = io.BytesIO()
102
+ plt.savefig(buf, format="png", bbox_inches="tight")
103
+ plt.close()
104
+ buf.seek(0)
105
+ model_img = buf
106
+ else:
107
+ # Classification
108
+ if len(y.unique()) < 2:
109
+ return ("Label column must have at least 2 unique values for classification.", None, None, None, None, None)
110
+ y_encoded, uniques = pd.factorize(y)
111
+ X_train, X_test, y_train, y_test = train_test_split(X_processed, y_encoded, test_size=0.3, random_state=RANDOM_STATE)
112
+ model = RandomForestClassifier(random_state=RANDOM_STATE)
113
+ model.fit(X_train, y_train)
114
+ y_pred = model.predict(X_test)
115
+ cm = confusion_matrix(y_test, y_pred)
116
+ cr = classification_report(y_test, y_pred, target_names=[str(u) for u in uniques])
117
+ results_text += "Classification Results (predicting categories):\n" + cr + "\n"
118
+ plt.figure(figsize=(8, 6))
119
+ sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=uniques, yticklabels=uniques)
120
+ plt.xlabel("Predicted")
121
+ plt.ylabel("True")
122
+ plt.title("Confusion Matrix")
123
+ buf = io.BytesIO()
124
+ plt.savefig(buf, format="png", bbox_inches="tight")
125
+ plt.close()
126
+ buf.seek(0)
127
+ model_img = buf
128
+ except Exception as e:
129
+ return (f"Error during model training: {e}", None, None, None, None, None)
130
+
131
+ # Feature Importance
132
+ try:
133
+ fi = pd.Series(model.feature_importances_, index=X_processed.columns).sort_values(ascending=False).head(MAX_FEATURES_TO_SHOW)
134
+ plt.figure(figsize=(10, 6))
135
+ sns.barplot(x=fi.values, y=fi.index)
136
+ plt.title("Top 10 Feature Importances")
137
+ plt.xlabel("Importance")
138
+ plt.ylabel("Feature")
139
+ buf = io.BytesIO()
140
+ plt.savefig(buf, format="png", bbox_inches="tight")
141
+ plt.close()
142
+ buf.seek(0)
143
+ fi_img = buf
144
+ except Exception as e:
145
+ fi_img = None
146
+ results_text += f"\nWarning: Could not compute feature importance: {e}"
147
+
148
+ # KMeans Clustering
149
+ try:
150
+ kmeans = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE)
151
+ clusters_kmeans = kmeans.fit_predict(X_scaled)
152
+ pca = PCA(n_components=2, random_state=RANDOM_STATE)
153
+ X_pca = pca.fit_transform(X_scaled)
154
+ explained_var = sum(pca.explained_variance_ratio_)
155
  plt.figure(figsize=(8, 6))
156
+ scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters_kmeans, cmap="viridis", alpha=0.7)
157
+ plt.xlabel("PCA 1")
158
+ plt.ylabel("PCA 2")
159
+ plt.title(f"KMeans Clustering (PCA, {explained_var:.2%} variance explained)")
160
+ plt.colorbar(scatter, ticks=range(n_clusters))
161
  buf = io.BytesIO()
162
  plt.savefig(buf, format="png", bbox_inches="tight")
163
  plt.close()
164
  buf.seek(0)
165
+ kmeans_img = buf
166
+ except Exception as e:
167
+ kmeans_img = None
168
+ results_text += f"\nWarning: KMeans clustering failed: {e}"
169
+
170
+ # Agglomerative Clustering
171
+ try:
172
+ agg = AgglomerativeClustering(n_clusters=n_clusters)
173
+ clusters_agg = agg.fit_predict(X_scaled)
 
 
174
  plt.figure(figsize=(8, 6))
175
+ scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters_agg, cmap="plasma", alpha=0.7)
176
+ plt.xlabel("PCA 1")
177
+ plt.ylabel("PCA 2")
178
+ plt.title(f"Agglomerative Clustering (PCA, {explained_var:.2%} variance explained)")
179
+ plt.colorbar(scatter, ticks=range(n_clusters))
180
+ buf = io.BytesIO()
181
+ plt.savefig(buf, format="png", bbox_inches="tight")
182
+ plt.close()
183
+ buf.seek(0)
184
+ agg_img = buf
185
+ except Exception as e:
186
+ agg_img = None
187
+ results_text += f"\nWarning: Agglomerative clustering failed: {e}"
188
+
189
+ # Differentiating Features
190
+ try:
191
+ f_scores, _ = f_classif(X_processed, clusters_kmeans)
192
+ f_series = pd.Series(f_scores, index=X_processed.columns).sort_values(ascending=False).head(MAX_FEATURES_TO_SHOW)
193
+ plt.figure(figsize=(10, 6))
194
+ sns.barplot(x=f_series.values, y=f_series.index, palette="mako")
195
+ plt.title("Top 10 Differentiating Features (ANOVA F-scores)")
196
+ plt.xlabel("F-score")
197
+ plt.ylabel("Feature")
198
  buf = io.BytesIO()
199
  plt.savefig(buf, format="png", bbox_inches="tight")
200
  plt.close()
201
  buf.seek(0)
202
+ diff_img = buf
203
+ except Exception as e:
204
+ diff_img = None
205
+ results_text += f"\nWarning: Could not compute differentiating features: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
  return results_text, model_img, fi_img, kmeans_img, agg_img, diff_img
208
 
209
+ # Gradio Interface
210
  with gr.Blocks() as demo:
211
  gr.Markdown("## Data Analysis Explorer")
212
  gr.Markdown("Upload a CSV or XLSX file to explore classification, regression, and clustering. Select a column to predict and the number of clusters!")
213
 
214
  with gr.Row():
215
  file_input = gr.File(label="Upload CSV or XLSX", file_types=[".csv", ".xlsx"])
216
+ label_dropdown = gr.Dropdown(label="Select Column to Predict", choices=[], interactive=True)
217
  clusters_slider = gr.Slider(minimum=2, maximum=10, step=1, value=3, label="Number of Clusters")
218
 
 
219
  file_input.change(fn=update_dropdown, inputs=file_input, outputs=label_dropdown)
 
220
  analyze_btn = gr.Button("Analyze")
221
 
222
  with gr.Tabs():