jaker86 committed on
Commit
3e930db
·
verified ·
1 Parent(s): 755fb3a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -44
app.py CHANGED
@@ -11,16 +11,15 @@ from sklearn.feature_selection import f_classif
11
  import matplotlib.pyplot as plt
12
  import seaborn as sns
13
  import io
14
- from PIL import Image # For converting BytesIO to PIL Image
15
 
16
- # Constants for reproducibility and configuration
17
  RANDOM_STATE = 42
18
  MIN_ROWS = 10
19
  MIN_COLS = 2
20
  MAX_FEATURES_TO_SHOW = 10
21
 
22
  def update_dropdown(file):
23
- """Update dropdown choices with column names from the uploaded file."""
24
  if file is None:
25
  return gr.update(choices=[], value=None)
26
  try:
@@ -35,12 +34,9 @@ def update_dropdown(file):
35
  return gr.update(choices=[], value=None)
36
 
37
  def analyze_file(file, label_col, n_clusters):
38
- """Analyze the uploaded file with ML techniques and return results and plots."""
39
- # Validate file input
40
  if file is None:
41
  return ("Please upload a file.", None, None, None, None, None)
42
 
43
- # Read file based on extension
44
  try:
45
  if file.name.endswith('.csv'):
46
  df = pd.read_csv(file.name)
@@ -51,27 +47,23 @@ def analyze_file(file, label_col, n_clusters):
51
  except Exception as e:
52
  return (f"Error reading file: {e}", None, None, None, None, None)
53
 
54
- # Validate data shape and label column
55
  if df.empty:
56
  return ("File is empty.", None, None, None, None, None)
57
  if label_col not in df.columns:
58
  return (f"Label column '{label_col}' not found.", None, None, None, None, None)
59
 
60
- # Clean data and check minimum requirements
61
  df = df.dropna()
62
  if df.shape[0] < MIN_ROWS:
63
  return (f"Not enough data rows (less than {MIN_ROWS}) after removing missing values.", None, None, None, None, None)
64
  if df.shape[1] < MIN_COLS:
65
  return ("Need at least one feature and one label column.", None, None, None, None, None)
66
 
67
- # Separate features and target
68
  y = df[label_col]
69
  X = df.drop(columns=[label_col])
70
- X_processed = pd.get_dummies(X) # One-hot encode categorical features
71
  if X_processed.shape[1] == 0:
72
  return ("No valid features after preprocessing.", None, None, None, None, None)
73
 
74
- # Scale features
75
  scaler = StandardScaler()
76
  X_scaled = scaler.fit_transform(X_processed)
77
 
@@ -82,7 +74,6 @@ def analyze_file(file, label_col, n_clusters):
82
  agg_img = None
83
  diff_img = None
84
 
85
- # Prediction: Regression or Classification
86
  try:
87
  if pd.api.types.is_numeric_dtype(y):
88
  # Regression
@@ -96,29 +87,38 @@ def analyze_file(file, label_col, n_clusters):
96
  "Regression Results:\n"
97
  f"- MSE: {mse:.3f}\n"
98
  f"- R²: {r2:.3f}\n"
 
99
  )
100
- # 3D Plot with next two most important features
101
  fi = pd.Series(model.feature_importances_, index=X_processed.columns).sort_values(ascending=False)
102
- if len(fi) < 3:
103
- results_text += "\nNot enough features for a 3D plot with the next two most important features."
104
- else:
105
- next_two_features = fi.index[1:3] # Second and third most important features
106
- fig = plt.figure(figsize=(10, 8))
107
- ax = fig.add_subplot(111, projection='3d')
108
- ax.scatter(X_test[next_two_features[0]], X_test[next_two_features[1]], y_test, c='blue', marker='o', label='True Values')
109
- ax.scatter(X_test[next_two_features[0]], X_test[next_two_features[1]], y_pred, c='red', marker='^', label='Predicted Values')
110
- ax.set_xlabel(next_two_features[0])
111
- ax.set_ylabel(next_two_features[1])
112
- ax.set_zlabel(label_col)
113
- ax.set_title("3D Plot: Label vs Next Two Most Important Features")
114
- ax.legend()
115
- buf = io.BytesIO()
116
- plt.savefig(buf, format="png", bbox_inches="tight")
117
- plt.close()
118
- buf.seek(0)
119
- model_img = Image.open(buf)
 
 
 
 
 
 
 
 
120
  else:
121
- # Classification
122
  if len(y.unique()) < 2:
123
  return ("Label must have at least 2 unique values.", None, None, None, None, None)
124
  y_encoded, uniques = pd.factorize(y)
@@ -128,12 +128,11 @@ def analyze_file(file, label_col, n_clusters):
128
  y_pred = model.predict(X_test)
129
  cr = classification_report(y_test, y_pred, target_names=[str(u) for u in uniques])
130
  results_text += "Classification Results:\n" + cr + "\n"
131
- # 3D Plot with next two most important features
132
  fi = pd.Series(model.feature_importances_, index=X_processed.columns).sort_values(ascending=False)
133
  if len(fi) < 3:
134
  results_text += "\nNot enough features for a 3D plot with the next two most important features."
135
  else:
136
- next_two_features = fi.index[1:3] # Second and third most important features
137
  fig = plt.figure(figsize=(10, 8))
138
  ax = fig.add_subplot(111, projection='3d')
139
  scatter = ax.scatter(X_test[next_two_features[0]], X_test[next_two_features[1]], y_test, c=y_test, cmap='viridis', marker='o')
@@ -149,7 +148,6 @@ def analyze_file(file, label_col, n_clusters):
149
  except Exception as e:
150
  results_text += f"\nError during model training: {e}"
151
 
152
- # Feature Importance
153
  try:
154
  fi = pd.Series(model.feature_importances_, index=X_processed.columns).sort_values(ascending=False).head(MAX_FEATURES_TO_SHOW)
155
  plt.figure(figsize=(10, 6))
@@ -161,11 +159,10 @@ def analyze_file(file, label_col, n_clusters):
161
  plt.savefig(buf, format="png", bbox_inches="tight")
162
  plt.close()
163
  buf.seek(0)
164
- fi_img = Image.open(buf) # Convert to PIL Image
165
  except Exception as e:
166
  results_text += f"\nWarning: Could not compute feature importance: {e}"
167
 
168
- # KMeans Clustering
169
  try:
170
  kmeans = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE)
171
  clusters_kmeans = kmeans.fit_predict(X_scaled)
@@ -182,11 +179,10 @@ def analyze_file(file, label_col, n_clusters):
182
  plt.savefig(buf, format="png", bbox_inches="tight")
183
  plt.close()
184
  buf.seek(0)
185
- kmeans_img = Image.open(buf) # Convert to PIL Image
186
  except Exception as e:
187
  results_text += f"\nWarning: KMeans clustering failed: {e}"
188
 
189
- # Agglomerative Clustering
190
  try:
191
  agg = AgglomerativeClustering(n_clusters=n_clusters)
192
  clusters_agg = agg.fit_predict(X_scaled)
@@ -200,11 +196,10 @@ def analyze_file(file, label_col, n_clusters):
200
  plt.savefig(buf, format="png", bbox_inches="tight")
201
  plt.close()
202
  buf.seek(0)
203
- agg_img = Image.open(buf) # Convert to PIL Image
204
  except Exception as e:
205
  results_text += f"\nWarning: Agglomerative clustering failed: {e}"
206
 
207
- # Differentiating Features
208
  try:
209
  f_scores, _ = f_classif(X_processed, clusters_kmeans)
210
  f_series = pd.Series(f_scores, index=X_processed.columns).sort_values(ascending=False).head(MAX_FEATURES_TO_SHOW)
@@ -217,13 +212,12 @@ def analyze_file(file, label_col, n_clusters):
217
  plt.savefig(buf, format="png", bbox_inches="tight")
218
  plt.close()
219
  buf.seek(0)
220
- diff_img = Image.open(buf) # Convert to PIL Image
221
  except Exception as e:
222
  results_text += f"\nWarning: Could not compute differentiating features: {e}"
223
 
224
  return results_text, model_img, fi_img, kmeans_img, agg_img, diff_img
225
 
226
- # Gradio Interface
227
  with gr.Blocks() as demo:
228
  gr.Markdown("## Data Analysis Explorer")
229
  gr.Markdown("Upload a CSV or XLSX file to explore classification, regression, and clustering. Select a column to predict and the number of clusters!")
@@ -248,7 +242,7 @@ with gr.Blocks() as demo:
248
 
249
  with gr.TabItem("Prediction Plot"):
250
  gr.Markdown("### Prediction Visualization")
251
- gr.Markdown("Regression shows true vs. predicted values. Classification shows a confusion matrix of correct/incorrect predictions.")
252
  model_img_output = gr.Image(label="Prediction Output")
253
 
254
  with gr.TabItem("Feature Importances"):
 
11
  import matplotlib.pyplot as plt
12
  import seaborn as sns
13
  import io
14
+ from PIL import Image
15
 
16
+ # Constants
17
  RANDOM_STATE = 42
18
  MIN_ROWS = 10
19
  MIN_COLS = 2
20
  MAX_FEATURES_TO_SHOW = 10
21
 
22
  def update_dropdown(file):
 
23
  if file is None:
24
  return gr.update(choices=[], value=None)
25
  try:
 
34
  return gr.update(choices=[], value=None)
35
 
36
  def analyze_file(file, label_col, n_clusters):
 
 
37
  if file is None:
38
  return ("Please upload a file.", None, None, None, None, None)
39
 
 
40
  try:
41
  if file.name.endswith('.csv'):
42
  df = pd.read_csv(file.name)
 
47
  except Exception as e:
48
  return (f"Error reading file: {e}", None, None, None, None, None)
49
 
 
50
  if df.empty:
51
  return ("File is empty.", None, None, None, None, None)
52
  if label_col not in df.columns:
53
  return (f"Label column '{label_col}' not found.", None, None, None, None, None)
54
 
 
55
  df = df.dropna()
56
  if df.shape[0] < MIN_ROWS:
57
  return (f"Not enough data rows (less than {MIN_ROWS}) after removing missing values.", None, None, None, None, None)
58
  if df.shape[1] < MIN_COLS:
59
  return ("Need at least one feature and one label column.", None, None, None, None, None)
60
 
 
61
  y = df[label_col]
62
  X = df.drop(columns=[label_col])
63
+ X_processed = pd.get_dummies(X)
64
  if X_processed.shape[1] == 0:
65
  return ("No valid features after preprocessing.", None, None, None, None, None)
66
 
 
67
  scaler = StandardScaler()
68
  X_scaled = scaler.fit_transform(X_processed)
69
 
 
74
  agg_img = None
75
  diff_img = None
76
 
 
77
  try:
78
  if pd.api.types.is_numeric_dtype(y):
79
  # Regression
 
87
  "Regression Results:\n"
88
  f"- MSE: {mse:.3f}\n"
89
  f"- R²: {r2:.3f}\n"
90
+ "\nCheck the 'Feature Importances' tab to see the top features impacting predictions.\n"
91
  )
92
+ # 2D Plots: Top 3 features vs predicted and true vs predicted
93
  fi = pd.Series(model.feature_importances_, index=X_processed.columns).sort_values(ascending=False)
94
+ top_features = fi.head(3).index
95
+ fig, axes = plt.subplots(2, 2, figsize=(12, 10))
96
+ axes = axes.flatten()
97
+ for i, feature in enumerate(top_features):
98
+ ax = axes[i]
99
+ ax.scatter(X_test[feature], y_pred, alpha=0.5)
100
+ ax.set_xlabel(feature)
101
+ ax.set_ylabel('Predicted SalePrice')
102
+ ax.set_title(f'{feature} vs Predicted SalePrice')
103
+ ax = axes[3]
104
+ ax.scatter(y_test, y_pred, alpha=0.5)
105
+ ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', label='Perfect Prediction')
106
+ ax.set_xlabel('True SalePrice')
107
+ ax.set_ylabel('Predicted SalePrice')
108
+ ax.set_title('True vs Predicted SalePrice')
109
+ min_val = min(y_test.min(), y_pred.min())
110
+ max_val = max(y_test.max(), y_pred.max())
111
+ ax.set_xlim(min_val, max_val)
112
+ ax.set_ylim(min_val, max_val)
113
+ ax.legend()
114
+ plt.tight_layout()
115
+ buf = io.BytesIO()
116
+ plt.savefig(buf, format="png", bbox_inches="tight")
117
+ plt.close()
118
+ buf.seek(0)
119
+ model_img = Image.open(buf)
120
  else:
121
+ # Classification (unchanged)
122
  if len(y.unique()) < 2:
123
  return ("Label must have at least 2 unique values.", None, None, None, None, None)
124
  y_encoded, uniques = pd.factorize(y)
 
128
  y_pred = model.predict(X_test)
129
  cr = classification_report(y_test, y_pred, target_names=[str(u) for u in uniques])
130
  results_text += "Classification Results:\n" + cr + "\n"
 
131
  fi = pd.Series(model.feature_importances_, index=X_processed.columns).sort_values(ascending=False)
132
  if len(fi) < 3:
133
  results_text += "\nNot enough features for a 3D plot with the next two most important features."
134
  else:
135
+ next_two_features = fi.index[1:3]
136
  fig = plt.figure(figsize=(10, 8))
137
  ax = fig.add_subplot(111, projection='3d')
138
  scatter = ax.scatter(X_test[next_two_features[0]], X_test[next_two_features[1]], y_test, c=y_test, cmap='viridis', marker='o')
 
148
  except Exception as e:
149
  results_text += f"\nError during model training: {e}"
150
 
 
151
  try:
152
  fi = pd.Series(model.feature_importances_, index=X_processed.columns).sort_values(ascending=False).head(MAX_FEATURES_TO_SHOW)
153
  plt.figure(figsize=(10, 6))
 
159
  plt.savefig(buf, format="png", bbox_inches="tight")
160
  plt.close()
161
  buf.seek(0)
162
+ fi_img = Image.open(buf)
163
  except Exception as e:
164
  results_text += f"\nWarning: Could not compute feature importance: {e}"
165
 
 
166
  try:
167
  kmeans = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE)
168
  clusters_kmeans = kmeans.fit_predict(X_scaled)
 
179
  plt.savefig(buf, format="png", bbox_inches="tight")
180
  plt.close()
181
  buf.seek(0)
182
+ kmeans_img = Image.open(buf)
183
  except Exception as e:
184
  results_text += f"\nWarning: KMeans clustering failed: {e}"
185
 
 
186
  try:
187
  agg = AgglomerativeClustering(n_clusters=n_clusters)
188
  clusters_agg = agg.fit_predict(X_scaled)
 
196
  plt.savefig(buf, format="png", bbox_inches="tight")
197
  plt.close()
198
  buf.seek(0)
199
+ agg_img = Image.open(buf)
200
  except Exception as e:
201
  results_text += f"\nWarning: Agglomerative clustering failed: {e}"
202
 
 
203
  try:
204
  f_scores, _ = f_classif(X_processed, clusters_kmeans)
205
  f_series = pd.Series(f_scores, index=X_processed.columns).sort_values(ascending=False).head(MAX_FEATURES_TO_SHOW)
 
212
  plt.savefig(buf, format="png", bbox_inches="tight")
213
  plt.close()
214
  buf.seek(0)
215
+ diff_img = Image.open(buf)
216
  except Exception as e:
217
  results_text += f"\nWarning: Could not compute differentiating features: {e}"
218
 
219
  return results_text, model_img, fi_img, kmeans_img, agg_img, diff_img
220
 
 
221
  with gr.Blocks() as demo:
222
  gr.Markdown("## Data Analysis Explorer")
223
  gr.Markdown("Upload a CSV or XLSX file to explore classification, regression, and clustering. Select a column to predict and the number of clusters!")
 
242
 
243
  with gr.TabItem("Prediction Plot"):
244
  gr.Markdown("### Prediction Visualization")
245
+ gr.Markdown("For regression, shows scatter plots of the top three features vs. predicted values and a plot of true vs. predicted values. For classification, shows a 3D plot of the label vs. next two features.")
246
  model_img_output = gr.Image(label="Prediction Output")
247
 
248
  with gr.TabItem("Feature Importances"):