tanish78 committed on
Commit
f1b7c2a
·
verified ·
1 Parent(s): 90eec1e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -105
app.py CHANGED
@@ -2,14 +2,11 @@ import gradio as gr
2
  import pandas as pd
3
  from sklearn.feature_extraction.text import TfidfVectorizer
4
  from sklearn.cluster import KMeans
5
- from sklearn.metrics import silhouette_score, silhouette_samples
6
- import matplotlib.pyplot as plt
7
- from sklearn.decomposition import PCA
8
  import re
9
- from io import BytesIO
10
- import tempfile
11
- import numpy as np
12
- from PIL import Image
13
 
14
  def preprocess_data(df):
15
  df.rename(columns={'Question Asked': 'texts'}, inplace=True)
@@ -17,19 +14,6 @@ def preprocess_data(df):
17
  df['texts'] = df['texts'].str.lower()
18
  df['texts'] = df['texts'].apply(lambda text: re.sub(r'https?://\S+|www\.\S+', '', text))
19
 
20
- def remove_emoji(string):
21
- emoji_pattern = re.compile("["
22
- u"\U0001F600-\U0001F64F"
23
- u"\U0001F300-\U0001F5FF"
24
- u"\U0001F680-\U0001F6FF"
25
- u"\U0001F1E0-\U0001F1FF"
26
- u"\U00002702-\U000027B0"
27
- u"\U000024C2-\U0001F251"
28
- "]+", flags=re.UNICODE)
29
- return emoji_pattern.sub(r'', string) if isinstance(string, str) else string
30
-
31
- df['texts'] = df['texts'].apply(remove_emoji)
32
-
33
  custom_synonyms = {
34
  'application': ['form'],
35
  'apply': ['fill', 'applied'],
@@ -113,113 +97,64 @@ def preprocess_data(df):
113
  def cluster_data(df, num_clusters):
114
  vectorizer = TfidfVectorizer(stop_words='english')
115
  X = vectorizer.fit_transform(df['texts'])
 
116
 
117
  kmeans = KMeans(n_clusters=num_clusters, random_state=0)
118
  kmeans.fit(X)
119
  df['Cluster'] = kmeans.labels_
120
 
121
- pca = PCA(n_components=2)
122
- principal_components = pca.fit_transform(X.toarray())
123
- df['PCA1'] = principal_components[:, 0]
124
- df['PCA2'] = principal_components[:, 1]
125
-
126
  return df, X, kmeans
127
 
128
- def visualize_clusters(df):
129
- plt.figure(figsize=(10, 6))
130
- scatter = plt.scatter(df['PCA1'], df['PCA2'], c=df['Cluster'], cmap='viridis')
131
- plt.legend(*scatter.legend_elements(), title="Clusters")
132
- plt.title('Clusters of User Queries')
133
- plt.xlabel('PCA Component 1')
134
- plt.ylabel('PCA Component 2')
135
  buf = BytesIO()
136
  plt.savefig(buf, format='png')
137
  buf.seek(0)
138
- img = Image.open(buf)
139
- return img
140
 
141
- def silhouette_analysis(X, labels, num_clusters):
142
- fig, ax1 = plt.subplots(1, 1)
143
- fig.set_size_inches(10, 6)
144
 
145
- ax1.set_xlim([-0.1, 1])
146
- ax1.set_ylim([0, X.shape[0] + (num_clusters + 1) * 10])
147
 
148
- sample_silhouette_values = silhouette_samples(X, labels)
149
- y_lower = 10
150
- for i in range(num_clusters):
151
- ith_cluster_silhouette_values = sample_silhouette_values[labels == i]
152
- ith_cluster_silhouette_values.sort()
153
- size_cluster_i = ith_cluster_silhouette_values.shape[0]
154
- y_upper = y_lower + size_cluster_i
155
- color = plt.cm.nipy_spectral(float(i) / num_clusters)
156
- ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values,
157
- facecolor=color, edgecolor=color, alpha=0.7)
158
- ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
159
- y_lower = y_upper + 10
160
 
161
- ax1.set_title("The silhouette plot for the various clusters.")
162
- ax1.set_xlabel("The silhouette coefficient values")
163
- ax1.set_ylabel("Cluster label")
164
- ax1.axvline(x=np.mean(sample_silhouette_values), color="red", linestyle="--")
165
- ax1.set_yticks([])
166
- ax1.set_xticks([i/10.0 for i in range(-1, 11)])
 
 
 
 
167
 
168
- buf = BytesIO()
169
- plt.savefig(buf, format='png')
170
- buf.seek(0)
171
- img = Image.open(buf)
172
- return img
173
-
174
- def main(file, num_clusters_to_display):
175
- try:
176
- df = pd.read_csv(file)
177
-
178
- # Filter by 'Fallback Message shown'
179
- df = df[(df['Answer'] == 'Fallback Message shown')]
180
-
181
- df = preprocess_data(df)
182
- df, X, kmeans = cluster_data(df, num_clusters=15)
183
-
184
- cluster_plot = visualize_clusters(df)
185
-
186
- cluster_sizes = df['Cluster'].value_counts()
187
- sorted_clusters = cluster_sizes.index.tolist()
188
- df['Cluster'] = pd.Categorical(df['Cluster'], categories=sorted_clusters, ordered=True)
189
- df = df.sort_values('Cluster')
190
-
191
- # Filter out the largest cluster and get the next largest clusters
192
- largest_cluster = sorted_clusters[0]
193
- filtered_clusters = sorted_clusters[1:num_clusters_to_display+1]
194
-
195
- df = df[df['Cluster'].isin(filtered_clusters)]
196
- df['Cluster'] = pd.Categorical(df['Cluster'], categories=filtered_clusters, ordered=True)
197
- df = df.sort_values('Cluster')
198
-
199
- silhouette_avg = silhouette_score(X, kmeans.labels_)
200
- silhouette_plot = silhouette_analysis(X, kmeans.labels_, num_clusters=15)
201
-
202
- # Convert silhouette score to percentage
203
- silhouette_percentage = (silhouette_avg + 1) * 50
204
-
205
- with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
206
- df.to_csv(tmpfile.name, index=False)
207
- return tmpfile.name, silhouette_percentage, cluster_plot, silhouette_plot
208
- except Exception as e:
209
- print(f"Error: {e}")
210
- return str(e), None, None, None
211
 
212
  interface = gr.Interface(
213
  fn=main,
214
  inputs=[
215
  gr.File(label="Upload CSV File (.csv)"),
216
- gr.Slider(label="Number of Categories to Display", minimum=1, maximum=10, step=1, value=5)
217
  ],
218
  outputs=[
219
- gr.File(label="Clustered Data CSV"),
220
- gr.Number(label="Clustering Quality (%)"),
221
- gr.Image(label="Cluster Plot"),
222
- gr.Image(label="Silhouette Plot")
223
  ],
224
  title="Unanswered User Queries Clustering",
225
  description="Unanswered User Query Categorization"
 
2
  import pandas as pd
3
  from sklearn.feature_extraction.text import TfidfVectorizer
4
  from sklearn.cluster import KMeans
5
+ from sklearn.metrics import silhouette_score
6
+ from sklearn.preprocessing import normalize
 
7
  import re
8
+ from wordcloud import WordCloud
9
+ import matplotlib.pyplot as plt
 
 
10
 
11
  def preprocess_data(df):
12
  df.rename(columns={'Question Asked': 'texts'}, inplace=True)
 
14
  df['texts'] = df['texts'].str.lower()
15
  df['texts'] = df['texts'].apply(lambda text: re.sub(r'https?://\S+|www\.\S+', '', text))
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  custom_synonyms = {
18
  'application': ['form'],
19
  'apply': ['fill', 'applied'],
 
97
def cluster_data(df, num_clusters):
    """Group the preprocessed queries into `num_clusters` clusters.

    TF-IDF-vectorizes df['texts'], L2-normalizes the resulting matrix
    (so KMeans' Euclidean distance behaves like cosine distance on the
    raw vectors), and writes each row's cluster label into a new
    'Cluster' column.

    Returns (df, X, kmeans): the labeled frame, the normalized TF-IDF
    matrix, and the fitted KMeans model.
    """
    tfidf = TfidfVectorizer(stop_words='english')
    features = normalize(tfidf.fit_transform(df['texts']))

    model = KMeans(n_clusters=num_clusters, random_state=0)
    # fit_predict(X) is equivalent to fit(X).labels_ for KMeans.
    df['Cluster'] = model.fit_predict(features)

    return df, features, model
107
 
108
def generate_wordcloud(texts):
    """Render a word cloud PNG for one cluster's texts.

    Parameters
    ----------
    texts : list[str]
        The queries belonging to a single cluster. The joined text must
        contain at least one word, or WordCloud raises ValueError.

    Returns
    -------
    io.BytesIO
        An in-memory PNG of the rendered word cloud, seeked to byte 0.
    """
    # Local import: the module-level `from io import BytesIO` was removed
    # in this revision, so the original body raised NameError here.
    from io import BytesIO

    wordcloud = WordCloud(width=800, height=400,
                          background_color='white').generate(" ".join(texts))
    fig = plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')

    buf = BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    # Close the figure explicitly: this runs once per cluster per request,
    # and matplotlib keeps open figures alive, leaking memory otherwise.
    plt.close(fig)
    return buf
 
117
 
118
def main(file, num_clusters):
    """Gradio entry point: cluster unanswered queries and build word clouds.

    Parameters
    ----------
    file : str or file-like
        Uploaded CSV; must contain 'Question Asked' and 'Answer' columns.
    num_clusters : int
        Number of KMeans clusters (slider value; coerced to int).

    Returns
    -------
    (list[tuple[str, io.BytesIO]], pandas.DataFrame)
        One (label, word-cloud PNG buffer) pair per cluster, and the
        'Cluster'/'texts' rows belonging to the largest clusters.

    Raises
    ------
    ValueError
        If no rows remain after filtering for fallback answers.
    """
    df = pd.read_csv(file)

    # Keep only the queries the bot failed to answer.
    df = df[df['Answer'] == 'Fallback Message shown']
    if df.empty:
        # Fail early with a clear message instead of letting KMeans crash
        # on an empty feature matrix.
        raise ValueError("No 'Fallback Message shown' rows found in the CSV.")

    # Gradio sliders can deliver floats; KMeans needs an int.
    num_clusters = int(num_clusters)

    df = preprocess_data(df)
    df, X, kmeans = cluster_data(df, num_clusters)

    wordclouds = []
    for cluster in df['Cluster'].unique():
        texts = df[df['Cluster'] == cluster]['texts'].tolist()
        # NOTE(review): gr.Gallery documents (image, caption) tuple order;
        # the (label, buffer) order here is preserved from the original —
        # confirm against the Gradio version in use.
        wordclouds.append((f"Cluster {cluster}", generate_wordcloud(texts)))

    # Rows of the `num_clusters` largest clusters — i.e. all of them,
    # unless KMeans produced fewer non-empty clusters than requested.
    cluster_sizes = df['Cluster'].value_counts()
    top_clusters = cluster_sizes.head(num_clusters).index
    top_queries = df[df['Cluster'].isin(top_clusters)][['Cluster', 'texts']]

    return wordclouds, top_queries
139
+
140
def display_results(wordclouds, top_queries):
    """Debug helper: open each cluster's word cloud and print top queries.

    Parameters
    ----------
    wordclouds : list[tuple[str, io.BytesIO]]
        (label, PNG buffer) pairs as produced by main().
    top_queries : pandas.DataFrame
        'Cluster'/'texts' rows to print.

    Not wired into the Gradio interface; intended for local inspection only.
    """
    for cluster, wordcloud in wordclouds:
        # Local import: the module-level `from PIL import Image` was removed
        # in this revision, so the original body raised NameError. Importing
        # inside the loop keeps the PIL dependency out of the empty-list path.
        from PIL import Image

        print(cluster)
        img = Image.open(wordcloud)
        img.show()

    print("Top Queries by Cluster:")
    print(top_queries.to_string(index=False))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
  interface = gr.Interface(
150
  fn=main,
151
  inputs=[
152
  gr.File(label="Upload CSV File (.csv)"),
153
+ gr.Slider(label="Number of Clusters", minimum=2, maximum=20, step=1, value=5)
154
  ],
155
  outputs=[
156
+ gr.Gallery(label="Word Clouds of Clusters"),
157
+ gr.Dataframe(label="Top Queries by Cluster")
 
 
158
  ],
159
  title="Unanswered User Queries Clustering",
160
  description="Unanswered User Query Categorization"