tanish78 committed on
Commit
ee4d135
·
verified ·
1 Parent(s): 2e20c8e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -29
app.py CHANGED
@@ -177,25 +177,7 @@ def silhouette_analysis(X, labels, num_clusters):
177
  img = Image.open(buf)
178
  return img
179
 
180
- def find_optimal_clusters(X, max_clusters):
181
- silhouette_scores = []
182
- davies_bouldin_scores = []
183
-
184
- for n_clusters in range(2, max_clusters + 1):
185
- kmeans = KMeans(n_clusters=n_clusters, random_state=0)
186
- labels = kmeans.fit_predict(X)
187
- silhouette_avg = silhouette_score(X, labels)
188
- davies_bouldin = davies_bouldin_score(X, labels)
189
-
190
- silhouette_scores.append(silhouette_avg)
191
- davies_bouldin_scores.append(davies_bouldin)
192
-
193
- print(f"Clusters: {n_clusters}, Silhouette Score: {silhouette_avg}, Davies-Bouldin Index: {davies_bouldin}")
194
-
195
- optimal_clusters = np.argmax(silhouette_scores) + 2
196
- return optimal_clusters, silhouette_scores, davies_bouldin_scores
197
-
198
- def main(file, max_clusters_to_display):
199
  try:
200
  df = pd.read_csv(file)
201
 
@@ -203,13 +185,7 @@ def main(file, max_clusters_to_display):
203
  df = df[(df['Answer'] == 'Fallback Message shown')]
204
 
205
  df = preprocess_data(df)
206
- vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=0.85, min_df=2)
207
- X = vectorizer.fit_transform(df['texts'])
208
- X = normalize(X)
209
-
210
- optimal_clusters, silhouette_scores, davies_bouldin_scores = find_optimal_clusters(X, max_clusters_to_display)
211
-
212
- df, X, kmeans = cluster_data(df, num_clusters=optimal_clusters)
213
 
214
  cluster_plot = visualize_clusters(df)
215
 
@@ -220,14 +196,14 @@ def main(file, max_clusters_to_display):
220
 
221
  # Filter out the largest cluster and get the next largest clusters
222
  largest_cluster = sorted_clusters[0]
223
- filtered_clusters = sorted_clusters[1:max_clusters_to_display+1]
224
 
225
  df = df[df['Cluster'].isin(filtered_clusters)]
226
  df['Cluster'] = pd.Categorical(df['Cluster'], categories=filtered_clusters, ordered=True)
227
  df = df.sort_values('Cluster')
228
 
229
  silhouette_avg = silhouette_score(X, kmeans.labels_)
230
- silhouette_plot = silhouette_analysis(X, kmeans.labels_, num_clusters=optimal_clusters)
231
 
232
  davies_bouldin = davies_bouldin_score(X, kmeans.labels_)
233
 
@@ -245,7 +221,7 @@ interface = gr.Interface(
245
  fn=main,
246
  inputs=[
247
  gr.File(label="Upload CSV File (.csv)"),
248
- gr.Slider(label="Max Clusters to Display", minimum=2, maximum=50, step=1, value=10)
249
  ],
250
  outputs=[
251
  gr.File(label="Clustered Data CSV"),
 
177
  img = Image.open(buf)
178
  return img
179
 
180
+ def main(file, num_clusters_to_display):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  try:
182
  df = pd.read_csv(file)
183
 
 
185
  df = df[(df['Answer'] == 'Fallback Message shown')]
186
 
187
  df = preprocess_data(df)
188
+ df, X, kmeans = cluster_data(df, num_clusters=15)
 
 
 
 
 
 
189
 
190
  cluster_plot = visualize_clusters(df)
191
 
 
196
 
197
  # Filter out the largest cluster and get the next largest clusters
198
  largest_cluster = sorted_clusters[0]
199
+ filtered_clusters = sorted_clusters[1:num_clusters_to_display+1]
200
 
201
  df = df[df['Cluster'].isin(filtered_clusters)]
202
  df['Cluster'] = pd.Categorical(df['Cluster'], categories=filtered_clusters, ordered=True)
203
  df = df.sort_values('Cluster')
204
 
205
  silhouette_avg = silhouette_score(X, kmeans.labels_)
206
+ silhouette_plot = silhouette_analysis(X, kmeans.labels_, num_clusters=15)
207
 
208
  davies_bouldin = davies_bouldin_score(X, kmeans.labels_)
209
 
 
221
  fn=main,
222
  inputs=[
223
  gr.File(label="Upload CSV File (.csv)"),
224
+ gr.Slider(label="Number of Categories to Display", minimum=1, maximum=10, step=1, value=5)
225
  ],
226
  outputs=[
227
  gr.File(label="Clustered Data CSV"),