jaker86 committed on
Commit
eaca5d0
·
verified ·
1 Parent(s): 157d716

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -96
app.py CHANGED
@@ -12,95 +12,125 @@ import matplotlib.pyplot as plt
12
  import seaborn as sns
13
  import io
14
 
15
- def analyze_csv(file, label_col, n_clusters):
 
 
 
16
  try:
17
- df = pd.read_csv(file.name if hasattr(file, "name") else file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  except Exception as e:
19
- return (None,)*6 + (f"error reading csv: {e}",)
20
-
 
21
  if label_col not in df.columns:
22
- return (None,)*6 + (f"label column '{label_col}' not in data",)
23
-
 
24
  df = df.dropna()
25
- # separate target and features
 
 
 
 
 
26
  y = df[label_col]
27
  X = df.drop(columns=[label_col])
28
- # create one-hot encodings for non-numeric columns
29
- X_processed = pd.get_dummies(X)
30
-
31
- # scale features for clustering methods
32
  scaler = StandardScaler()
33
  X_scaled = scaler.fit_transform(X_processed)
34
-
35
  results_text = ""
36
  model_img = None
37
 
38
- # model training & evaluation: regression if y numeric, classification otherwise
39
  if pd.api.types.is_numeric_dtype(y):
40
- # regression
41
  X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.3, random_state=42)
42
  model = RandomForestRegressor(random_state=42)
43
  model.fit(X_train, y_train)
44
  y_pred = model.predict(X_test)
45
  mse = mean_squared_error(y_test, y_pred)
46
  r2 = r2_score(y_test, y_pred)
47
- results_text += f"regression results:\nmse: {mse:.3f}\nr2: {r2:.3f}\n"
48
- # scatter plot: true vs predicted with y=x line
49
- plt.figure(figsize=(6,4))
 
 
 
50
  plt.scatter(y_test, y_pred, alpha=0.7)
51
  plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
52
- plt.xlabel("true values")
53
- plt.ylabel("predicted values")
54
- plt.title("regression: true vs predicted")
55
  buf = io.BytesIO()
56
  plt.savefig(buf, format="png", bbox_inches="tight")
57
  plt.close()
58
  buf.seek(0)
59
  model_img = buf
60
  else:
61
- # classification
62
  y_encoded, uniques = pd.factorize(y)
63
  X_train, X_test, y_train, y_test = train_test_split(X_processed, y_encoded, test_size=0.3, random_state=42)
64
  model = RandomForestClassifier(random_state=42)
65
  model.fit(X_train, y_train)
66
  y_pred = model.predict(X_test)
67
  cm = confusion_matrix(y_test, y_pred)
68
- cr = classification_report(y_test, y_pred)
69
- results_text += f"classification results:\n{cr}\n"
70
- plt.figure(figsize=(6,4))
71
- sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
72
- plt.xlabel("predicted")
73
- plt.ylabel("true")
74
- plt.title("confusion matrix")
75
  buf = io.BytesIO()
76
  plt.savefig(buf, format="png", bbox_inches="tight")
77
  plt.close()
78
  buf.seek(0)
79
  model_img = buf
80
 
81
- # feature importance plot (from the model)
82
- fi = pd.Series(model.feature_importances_, index=X_processed.columns).sort_values(ascending=False)
83
- plt.figure(figsize=(8,4))
84
  sns.barplot(x=fi.values, y=fi.index)
85
- plt.title("feature importances")
86
- plt.xlabel("importance")
87
- plt.ylabel("feature")
88
  buf = io.BytesIO()
89
  plt.savefig(buf, format="png", bbox_inches="tight")
90
  plt.close()
91
  buf.seek(0)
92
  fi_img = buf
93
 
94
- # clustering with kmeans
95
  kmeans = KMeans(n_clusters=n_clusters, random_state=42)
96
  clusters_kmeans = kmeans.fit_predict(X_scaled)
97
  pca = PCA(n_components=2, random_state=42)
98
  X_pca = pca.fit_transform(X_scaled)
99
- plt.figure(figsize=(6,4))
100
- scatter = plt.scatter(X_pca[:,0], X_pca[:,1], c=clusters_kmeans, cmap="viridis", alpha=0.7)
101
- plt.xlabel("pca 1")
102
- plt.ylabel("pca 2")
103
- plt.title(f"kmeans clustering (k={n_clusters})")
 
104
  plt.colorbar(scatter, ticks=range(n_clusters))
105
  buf = io.BytesIO()
106
  plt.savefig(buf, format="png", bbox_inches="tight")
@@ -108,14 +138,14 @@ def analyze_csv(file, label_col, n_clusters):
108
  buf.seek(0)
109
  kmeans_img = buf
110
 
111
- # clustering with agglomerative clustering
112
  agg = AgglomerativeClustering(n_clusters=n_clusters)
113
  clusters_agg = agg.fit_predict(X_scaled)
114
- plt.figure(figsize=(6,4))
115
- scatter = plt.scatter(X_pca[:,0], X_pca[:,1], c=clusters_agg, cmap="plasma", alpha=0.7)
116
- plt.xlabel("pca 1")
117
- plt.ylabel("pca 2")
118
- plt.title(f"agglomerative clustering (k={n_clusters})")
119
  plt.colorbar(scatter, ticks=range(n_clusters))
120
  buf = io.BytesIO()
121
  plt.savefig(buf, format="png", bbox_inches="tight")
@@ -123,15 +153,14 @@ def analyze_csv(file, label_col, n_clusters):
123
  buf.seek(0)
124
  agg_img = buf
125
 
126
- # differentiating features among clusters (using kmeans clusters)
127
- f_scores, p_vals = f_classif(X_processed, clusters_kmeans)
128
- f_series = pd.Series(f_scores, index=X_processed.columns).sort_values(ascending=False)
129
- top_features = f_series.head(10)
130
- plt.figure(figsize=(8,4))
131
- sns.barplot(x=top_features.values, y=top_features.index, palette="mako")
132
- plt.title("top differentiating features (anova f-scores)")
133
- plt.xlabel("f-score")
134
- plt.ylabel("feature")
135
  buf = io.BytesIO()
136
  plt.savefig(buf, format="png", bbox_inches="tight")
137
  plt.close()
@@ -140,45 +169,6 @@ def analyze_csv(file, label_col, n_clusters):
140
 
141
  return results_text, model_img, fi_img, kmeans_img, agg_img, diff_img
142
 
143
- def update_dropdown(file):
144
- if file is None:
145
- return gr.Dropdown.update(choices=[], value=None)
146
- try:
147
- df = pd.read_csv(file.name if hasattr(file, "name") else file)
148
- return gr.Dropdown.update(choices=list(df.columns), value=None)
149
- except:
150
- return gr.Dropdown.update(choices=[], value=None)
151
-
152
  with gr.Blocks() as demo:
153
- gr.Markdown("## csv analysis app")
154
- with gr.Row():
155
- file_input = gr.File(label="upload csv", file_types=[".csv"])
156
- label_dropdown = gr.Dropdown(label="select label column", interactive=True)
157
-
158
- file_input.change(
159
- fn=update_dropdown,
160
- inputs=file_input,
161
- outputs=label_dropdown
162
- )
163
- clusters_slider = gr.Slider(minimum=2, maximum=10, step=1, value=3, label="number of clusters")
164
-
165
-
166
- analyze_btn = gr.Button("analyze")
167
- with gr.Tabs():
168
- with gr.TabItem("results"):
169
- results_textbox = gr.Textbox(label="metrics & descriptions", lines=10)
170
- with gr.TabItem("model visualization"):
171
- model_img_output = gr.Image(label="model output (confusion matrix or regression scatter)")
172
- with gr.TabItem("feature importances"):
173
- fi_output = gr.Image(label="feature importances")
174
- with gr.TabItem("kmeans clustering"):
175
- kmeans_output = gr.Image(label="kmeans clustering (pca projection)")
176
- with gr.TabItem("agglomerative clustering"):
177
- agg_output = gr.Image(label="agglomerative clustering (pca projection)")
178
- with gr.TabItem("cluster differentiation"):
179
- diff_output = gr.Image(label="differentiating features among clusters")
180
-
181
- analyze_btn.click(fn=analyze_csv, inputs=[file_input, label_dropdown, clusters_slider],
182
- outputs=[results_textbox, model_img_output, fi_output, kmeans_output, agg_output, diff_output])
183
-
184
- demo.launch()
 
12
  import seaborn as sns
13
  import io
14
 
15
+ def update_dropdown(file):
16
+ """Update the dropdown choices with column names from the uploaded file."""
17
+ if file is None:
18
+ return gr.Dropdown.update(choices=[], value=None)
19
  try:
20
+ if file.name.endswith('.csv'):
21
+ df = pd.read_csv(file.name)
22
+ elif file.name.endswith('.xlsx'):
23
+ df = pd.read_excel(file.name)
24
+ else:
25
+ return gr.Dropdown.update(choices=[], value=None)
26
+ return gr.Dropdown.update(choices=list(df.columns), value=None)
27
+ except Exception:
28
+ return gr.Dropdown.update(choices=[], value=None)
29
+
30
+ def analyze_file(file, label_col, n_clusters):
31
+ """Analyze the uploaded file with ML techniques and return results and plots."""
32
+ # Read the file based on its extension
33
+ try:
34
+ if file.name.endswith('.csv'):
35
+ df = pd.read_csv(file.name)
36
+ elif file.name.endswith('.xlsx'):
37
+ df = pd.read_excel(file.name)
38
+ else:
39
+ return ("Unsupported file type. Please upload a CSV or XLSX file.", None, None, None, None, None)
40
  except Exception as e:
41
+ return (f"Error reading file: {e}", None, None, None, None, None)
42
+
43
+ # Validate label column
44
  if label_col not in df.columns:
45
+ return (f"Label column '{label_col}' not found. Please select a valid column.", None, None, None, None, None)
46
+
47
+ # Clean data and validate size
48
  df = df.dropna()
49
+ if df.shape[0] < 10:
50
+ return ("Not enough data rows (less than 10) after removing missing values.", None, None, None, None, None)
51
+ if df.shape[1] < 2:
52
+ return ("Need at least one feature and one label column.", None, None, None, None, None)
53
+
54
+ # Separate features and target
55
  y = df[label_col]
56
  X = df.drop(columns=[label_col])
57
+ X_processed = pd.get_dummies(X) # One-hot encode categorical features
 
 
 
58
  scaler = StandardScaler()
59
  X_scaled = scaler.fit_transform(X_processed)
60
+
61
  results_text = ""
62
  model_img = None
63
 
64
+ # Prediction: regression or classification based on target type
65
  if pd.api.types.is_numeric_dtype(y):
66
+ # Regression
67
  X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.3, random_state=42)
68
  model = RandomForestRegressor(random_state=42)
69
  model.fit(X_train, y_train)
70
  y_pred = model.predict(X_test)
71
  mse = mean_squared_error(y_test, y_pred)
72
  r2 = r2_score(y_test, y_pred)
73
+ results_text += (
74
+ "Regression Results (predicting numeric values):\n"
75
+ f"- Mean Squared Error (MSE): {mse:.3f} (lower is better)\n"
76
+ f"- R² Score: {r2:.3f} (0 to 1, higher is better)\n"
77
+ )
78
+ plt.figure(figsize=(8, 6))
79
  plt.scatter(y_test, y_pred, alpha=0.7)
80
  plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
81
+ plt.xlabel("True Values")
82
+ plt.ylabel("Predicted Values")
83
+ plt.title("Regression: True vs Predicted")
84
  buf = io.BytesIO()
85
  plt.savefig(buf, format="png", bbox_inches="tight")
86
  plt.close()
87
  buf.seek(0)
88
  model_img = buf
89
  else:
90
+ # Classification
91
  y_encoded, uniques = pd.factorize(y)
92
  X_train, X_test, y_train, y_test = train_test_split(X_processed, y_encoded, test_size=0.3, random_state=42)
93
  model = RandomForestClassifier(random_state=42)
94
  model.fit(X_train, y_train)
95
  y_pred = model.predict(X_test)
96
  cm = confusion_matrix(y_test, y_pred)
97
+ cr = classification_report(y_test, y_pred, target_names=[str(u) for u in uniques])
98
+ results_text += "Classification Results (predicting categories):\n" + cr + "\n"
99
+ plt.figure(figsize=(8, 6))
100
+ sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=uniques, yticklabels=uniques)
101
+ plt.xlabel("Predicted")
102
+ plt.ylabel("True")
103
+ plt.title("Confusion Matrix")
104
  buf = io.BytesIO()
105
  plt.savefig(buf, format="png", bbox_inches="tight")
106
  plt.close()
107
  buf.seek(0)
108
  model_img = buf
109
 
110
+ # Feature importance (top 10)
111
+ fi = pd.Series(model.feature_importances_, index=X_processed.columns).sort_values(ascending=False).head(10)
112
+ plt.figure(figsize=(10, 6))
113
  sns.barplot(x=fi.values, y=fi.index)
114
+ plt.title("Top 10 Feature Importances")
115
+ plt.xlabel("Importance")
116
+ plt.ylabel("Feature")
117
  buf = io.BytesIO()
118
  plt.savefig(buf, format="png", bbox_inches="tight")
119
  plt.close()
120
  buf.seek(0)
121
  fi_img = buf
122
 
123
+ # KMeans clustering
124
  kmeans = KMeans(n_clusters=n_clusters, random_state=42)
125
  clusters_kmeans = kmeans.fit_predict(X_scaled)
126
  pca = PCA(n_components=2, random_state=42)
127
  X_pca = pca.fit_transform(X_scaled)
128
+ explained_var = sum(pca.explained_variance_ratio_)
129
+ plt.figure(figsize=(8, 6))
130
+ scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters_kmeans, cmap="viridis", alpha=0.7)
131
+ plt.xlabel("PCA 1")
132
+ plt.ylabel("PCA 2")
133
+ plt.title(f"KMeans Clustering (PCA, {explained_var:.2%} variance explained)")
134
  plt.colorbar(scatter, ticks=range(n_clusters))
135
  buf = io.BytesIO()
136
  plt.savefig(buf, format="png", bbox_inches="tight")
 
138
  buf.seek(0)
139
  kmeans_img = buf
140
 
141
+ # Agglomerative clustering
142
  agg = AgglomerativeClustering(n_clusters=n_clusters)
143
  clusters_agg = agg.fit_predict(X_scaled)
144
+ plt.figure(figsize=(8, 6))
145
+ scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters_agg, cmap="plasma", alpha=0.7)
146
+ plt.xlabel("PCA 1")
147
+ plt.ylabel("PCA 2")
148
+ plt.title(f"Agglomerative Clustering (PCA, {explained_var:.2%} variance explained)")
149
  plt.colorbar(scatter, ticks=range(n_clusters))
150
  buf = io.BytesIO()
151
  plt.savefig(buf, format="png", bbox_inches="tight")
 
153
  buf.seek(0)
154
  agg_img = buf
155
 
156
+ # Differentiating features (top 10)
157
+ f_scores, _ = f_classif(X_processed, clusters_kmeans)
158
+ f_series = pd.Series(f_scores, index=X_processed.columns).sort_values(ascending=False).head(10)
159
+ plt.figure(figsize=(10, 6))
160
+ sns.barplot(x=f_series.values, y=f_series.index, palette="mako")
161
+ plt.title("Top 10 Differentiating Features (ANOVA F-scores)")
162
+ plt.xlabel("F-score")
163
+ plt.ylabel("Feature")
 
164
  buf = io.BytesIO()
165
  plt.savefig(buf, format="png", bbox_inches="tight")
166
  plt.close()
 
169
 
170
  return results_text, model_img, fi_img, kmeans_img, agg_img, diff_img
171
 
 
 
 
 
 
 
 
 
 
172
  with gr.Blocks() as demo:
173
+ gr.Markdown("## Data Analysis Explorer")
174
+ gr.Markdown("Upload a CSV or XLSX file to explore classification, regression, and clustering. Select a column to predict and the number of