tanish78 committed on
Commit
13d2652
·
verified ·
1 Parent(s): 5983f67

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -14
app.py CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
2
  import pandas as pd
3
  from sklearn.feature_extraction.text import TfidfVectorizer
4
  from sklearn.cluster import KMeans
 
5
  import matplotlib.pyplot as plt
6
  from sklearn.decomposition import PCA
7
  import re
@@ -9,8 +10,6 @@ from io import BytesIO
9
  import tempfile
10
  import numpy as np
11
  from PIL import Image
12
- from wordcloud import WordCloud
13
-
14
 
15
  def preprocess_data(df):
16
  df.rename(columns={'Question Asked': 'texts'}, inplace=True)
@@ -126,12 +125,46 @@ def cluster_data(df, num_clusters):
126
 
127
  return df, X, kmeans
128
 
129
- def generate_wordcloud(df):
130
- text = " ".join(df['texts'].tolist())
131
- wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
132
- plt.figure(figsize=(10, 5))
133
- plt.imshow(wordcloud, interpolation='bilinear')
134
- plt.axis('off')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  buf = BytesIO()
136
  plt.savefig(buf, format='png')
137
  buf.seek(0)
@@ -148,6 +181,8 @@ def main(file, num_clusters_to_display):
148
  df = preprocess_data(df)
149
  df, X, kmeans = cluster_data(df, num_clusters=15)
150
 
 
 
151
  cluster_sizes = df['Cluster'].value_counts()
152
  sorted_clusters = cluster_sizes.index.tolist()
153
  df['Cluster'] = pd.Categorical(df['Cluster'], categories=sorted_clusters, ordered=True)
@@ -161,16 +196,18 @@ def main(file, num_clusters_to_display):
161
  df['Cluster'] = pd.Categorical(df['Cluster'], categories=filtered_clusters, ordered=True)
162
  df = df.sort_values('Cluster')
163
 
164
- wordcloud_img = generate_wordcloud(df)
 
 
 
 
165
 
166
  with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
167
  df.to_csv(tmpfile.name, index=False)
168
- csv_file_path = tmpfile.name
169
-
170
- return csv_file_path, wordcloud_img
171
  except Exception as e:
172
  print(f"Error: {e}")
173
- return str(e), None
174
 
175
  interface = gr.Interface(
176
  fn=main,
@@ -180,10 +217,13 @@ interface = gr.Interface(
180
  ],
181
  outputs=[
182
  gr.File(label="Clustered Data CSV"),
183
- gr.Image(label="Word Cloud")
 
 
184
  ],
185
  title="Unanswered User Queries Clustering",
186
  description="Unanswered User Query Categorization"
187
  )
188
 
189
  interface.launch()
 
 
2
  import pandas as pd
3
  from sklearn.feature_extraction.text import TfidfVectorizer
4
  from sklearn.cluster import KMeans
5
+ from sklearn.metrics import silhouette_score, silhouette_samples
6
  import matplotlib.pyplot as plt
7
  from sklearn.decomposition import PCA
8
  import re
 
10
  import tempfile
11
  import numpy as np
12
  from PIL import Image
 
 
13
 
14
  def preprocess_data(df):
15
  df.rename(columns={'Question Asked': 'texts'}, inplace=True)
 
125
 
126
  return df, X, kmeans
127
 
128
+ def visualize_clusters(df):
129
+ plt.figure(figsize=(10, 6))
130
+ scatter = plt.scatter(df['PCA1'], df['PCA2'], c=df['Cluster'], cmap='viridis')
131
+ plt.legend(*scatter.legend_elements(), title="Clusters")
132
+ plt.title('Clusters of User Queries')
133
+ plt.xlabel('PCA Component 1')
134
+ plt.ylabel('PCA Component 2')
135
+ buf = BytesIO()
136
+ plt.savefig(buf, format='png')
137
+ buf.seek(0)
138
+ img = Image.open(buf)
139
+ return img
140
+
141
+ def silhouette_analysis(X, labels, num_clusters):
142
+ fig, ax1 = plt.subplots(1, 1)
143
+ fig.set_size_inches(10, 6)
144
+
145
+ ax1.set_xlim([-0.1, 1])
146
+ ax1.set_ylim([0, X.shape[0] + (num_clusters + 1) * 10])
147
+
148
+ sample_silhouette_values = silhouette_samples(X, labels)
149
+ y_lower = 10
150
+ for i in range(num_clusters):
151
+ ith_cluster_silhouette_values = sample_silhouette_values[labels == i]
152
+ ith_cluster_silhouette_values.sort()
153
+ size_cluster_i = ith_cluster_silhouette_values.shape[0]
154
+ y_upper = y_lower + size_cluster_i
155
+ color = plt.cm.nipy_spectral(float(i) / num_clusters)
156
+ ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values,
157
+ facecolor=color, edgecolor=color, alpha=0.7)
158
+ ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
159
+ y_lower = y_upper + 10
160
+
161
+ ax1.set_title("The silhouette plot for the various clusters.")
162
+ ax1.set_xlabel("The silhouette coefficient values")
163
+ ax1.set_ylabel("Cluster label")
164
+ ax1.axvline(x=np.mean(sample_silhouette_values), color="red", linestyle="--")
165
+ ax1.set_yticks([])
166
+ ax1.set_xticks([i/10.0 for i in range(-1, 11)])
167
+
168
  buf = BytesIO()
169
  plt.savefig(buf, format='png')
170
  buf.seek(0)
 
181
  df = preprocess_data(df)
182
  df, X, kmeans = cluster_data(df, num_clusters=15)
183
 
184
+ cluster_plot = visualize_clusters(df)
185
+
186
  cluster_sizes = df['Cluster'].value_counts()
187
  sorted_clusters = cluster_sizes.index.tolist()
188
  df['Cluster'] = pd.Categorical(df['Cluster'], categories=sorted_clusters, ordered=True)
 
196
  df['Cluster'] = pd.Categorical(df['Cluster'], categories=filtered_clusters, ordered=True)
197
  df = df.sort_values('Cluster')
198
 
199
+ silhouette_avg = silhouette_score(X, kmeans.labels_)
200
+ silhouette_plot = silhouette_analysis(X, kmeans.labels_, num_clusters=15)
201
+
202
+ # Convert silhouette score to percentage
203
+ silhouette_percentage = (silhouette_avg + 1) * 50
204
 
205
  with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
206
  df.to_csv(tmpfile.name, index=False)
207
+ return tmpfile.name, silhouette_percentage, cluster_plot, silhouette_plot
 
 
208
  except Exception as e:
209
  print(f"Error: {e}")
210
+ return str(e), None, None, None
211
 
212
  interface = gr.Interface(
213
  fn=main,
 
217
  ],
218
  outputs=[
219
  gr.File(label="Clustered Data CSV"),
220
+ gr.Number(label="Clustering Quality (%)"),
221
+ gr.Image(label="Cluster Plot"),
222
+ gr.Image(label="Silhouette Plot")
223
  ],
224
  title="Unanswered User Queries Clustering",
225
  description="Unanswered User Query Categorization"
226
  )
227
 
228
  interface.launch()
229
+