tanish78 committed on
Commit
c563933
·
verified ·
1 Parent(s): 85d1dc7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -9
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
2
  import pandas as pd
3
  from sklearn.feature_extraction.text import TfidfVectorizer
4
  from sklearn.cluster import KMeans
5
- from sklearn.metrics import silhouette_score, silhouette_samples
6
  import matplotlib.pyplot as plt
7
  from sklearn.decomposition import PCA
8
  import re
@@ -10,12 +10,17 @@ from io import BytesIO
10
  import tempfile
11
  import numpy as np
12
  from PIL import Image
 
 
13
 
14
  def preprocess_data(df):
15
  df.rename(columns={'Question Asked': 'texts'}, inplace=True)
16
  df['texts'] = df['texts'].astype(str)
17
  df['texts'] = df['texts'].str.lower()
18
  df['texts'] = df['texts'].apply(lambda text: re.sub(r'https?://\S+|www\.\S+', '', text))
 
 
 
19
 
20
  def remove_emoji(string):
21
  emoji_pattern = re.compile("["
@@ -111,8 +116,9 @@ def preprocess_data(df):
111
  return df
112
 
113
  def cluster_data(df, num_clusters):
114
- vectorizer = TfidfVectorizer(stop_words='english')
115
  X = vectorizer.fit_transform(df['texts'])
 
116
 
117
  kmeans = KMeans(n_clusters=num_clusters, random_state=0)
118
  kmeans.fit(X)
@@ -171,7 +177,25 @@ def silhouette_analysis(X, labels, num_clusters):
171
  img = Image.open(buf)
172
  return img
173
 
174
- def main(file, num_clusters_to_display):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  try:
176
  df = pd.read_csv(file)
177
 
@@ -179,7 +203,13 @@ def main(file, num_clusters_to_display):
179
  df = df[(df['Answer'] == 'Fallback Message shown')]
180
 
181
  df = preprocess_data(df)
182
- df, X, kmeans = cluster_data(df, num_clusters=20)
 
 
 
 
 
 
183
 
184
  cluster_plot = visualize_clusters(df)
185
 
@@ -190,34 +220,37 @@ def main(file, num_clusters_to_display):
190
 
191
  # Filter out the largest cluster and get the next largest clusters
192
  largest_cluster = sorted_clusters[0]
193
- filtered_clusters = sorted_clusters[1:num_clusters_to_display+1]
194
 
195
  df = df[df['Cluster'].isin(filtered_clusters)]
196
  df['Cluster'] = pd.Categorical(df['Cluster'], categories=filtered_clusters, ordered=True)
197
  df = df.sort_values('Cluster')
198
 
199
  silhouette_avg = silhouette_score(X, kmeans.labels_)
200
- silhouette_plot = silhouette_analysis(X, kmeans.labels_, num_clusters=15)
 
 
201
 
202
  # Convert silhouette score to percentage
203
  silhouette_percentage = (silhouette_avg + 1) * 50
204
 
205
  with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
206
  df.to_csv(tmpfile.name, index=False)
207
- return tmpfile.name, silhouette_percentage, cluster_plot, silhouette_plot
208
  except Exception as e:
209
  print(f"Error: {e}")
210
- return str(e), None, None, None
211
 
212
  interface = gr.Interface(
213
  fn=main,
214
  inputs=[
215
  gr.File(label="Upload CSV File (.csv)"),
216
- gr.Slider(label="Number of Categories to Display", minimum=1, maximum=10, step=1, value=5)
217
  ],
218
  outputs=[
219
  gr.File(label="Clustered Data CSV"),
220
  gr.Number(label="Clustering Quality (%)"),
 
221
  gr.Image(label="Cluster Plot"),
222
  gr.Image(label="Silhouette Plot")
223
  ],
 
2
  import pandas as pd
3
  from sklearn.feature_extraction.text import TfidfVectorizer
4
  from sklearn.cluster import KMeans
5
+ from sklearn.metrics import silhouette_score, silhouette_samples, davies_bouldin_score
6
  import matplotlib.pyplot as plt
7
  from sklearn.decomposition import PCA
8
  import re
 
10
  import tempfile
11
  import numpy as np
12
  from PIL import Image
13
+ from nltk.stem import WordNetLemmatizer
14
+ from sklearn.preprocessing import normalize
15
 
16
  def preprocess_data(df):
17
  df.rename(columns={'Question Asked': 'texts'}, inplace=True)
18
  df['texts'] = df['texts'].astype(str)
19
  df['texts'] = df['texts'].str.lower()
20
  df['texts'] = df['texts'].apply(lambda text: re.sub(r'https?://\S+|www\.\S+', '', text))
21
+
22
+ lemmatizer = WordNetLemmatizer()
23
+ df['texts'] = df['texts'].apply(lambda text: ' '.join([lemmatizer.lemmatize(word) for word in text.split()]))
24
 
25
  def remove_emoji(string):
26
  emoji_pattern = re.compile("["
 
116
  return df
117
 
118
  def cluster_data(df, num_clusters):
119
+ vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=0.85, min_df=2)
120
  X = vectorizer.fit_transform(df['texts'])
121
+ X = normalize(X)
122
 
123
  kmeans = KMeans(n_clusters=num_clusters, random_state=0)
124
  kmeans.fit(X)
 
177
  img = Image.open(buf)
178
  return img
179
 
180
+ def find_optimal_clusters(X, max_clusters):
181
+ silhouette_scores = []
182
+ davies_bouldin_scores = []
183
+
184
+ for n_clusters in range(2, max_clusters + 1):
185
+ kmeans = KMeans(n_clusters=n_clusters, random_state=0)
186
+ labels = kmeans.fit_predict(X)
187
+ silhouette_avg = silhouette_score(X, labels)
188
+ davies_bouldin = davies_bouldin_score(X, labels)
189
+
190
+ silhouette_scores.append(silhouette_avg)
191
+ davies_bouldin_scores.append(davies_bouldin)
192
+
193
+ print(f"Clusters: {n_clusters}, Silhouette Score: {silhouette_avg}, Davies-Bouldin Index: {davies_bouldin}")
194
+
195
+ optimal_clusters = np.argmax(silhouette_scores) + 2
196
+ return optimal_clusters, silhouette_scores, davies_bouldin_scores
197
+
198
+ def main(file, max_clusters_to_display):
199
  try:
200
  df = pd.read_csv(file)
201
 
 
203
  df = df[(df['Answer'] == 'Fallback Message shown')]
204
 
205
  df = preprocess_data(df)
206
+ vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=0.85, min_df=2)
207
+ X = vectorizer.fit_transform(df['texts'])
208
+ X = normalize(X)
209
+
210
+ optimal_clusters, silhouette_scores, davies_bouldin_scores = find_optimal_clusters(X, max_clusters_to_display)
211
+
212
+ df, X, kmeans = cluster_data(df, num_clusters=optimal_clusters)
213
 
214
  cluster_plot = visualize_clusters(df)
215
 
 
220
 
221
  # Filter out the largest cluster and get the next largest clusters
222
  largest_cluster = sorted_clusters[0]
223
+ filtered_clusters = sorted_clusters[1:max_clusters_to_display+1]
224
 
225
  df = df[df['Cluster'].isin(filtered_clusters)]
226
  df['Cluster'] = pd.Categorical(df['Cluster'], categories=filtered_clusters, ordered=True)
227
  df = df.sort_values('Cluster')
228
 
229
  silhouette_avg = silhouette_score(X, kmeans.labels_)
230
+ silhouette_plot = silhouette_analysis(X, kmeans.labels_, num_clusters=optimal_clusters)
231
+
232
+ davies_bouldin = davies_bouldin_score(X, kmeans.labels_)
233
 
234
  # Convert silhouette score to percentage
235
  silhouette_percentage = (silhouette_avg + 1) * 50
236
 
237
  with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
238
  df.to_csv(tmpfile.name, index=False)
239
+ return tmpfile.name, silhouette_percentage, davies_bouldin, cluster_plot, silhouette_plot
240
  except Exception as e:
241
  print(f"Error: {e}")
242
+ return str(e), None, None, None, None
243
 
244
  interface = gr.Interface(
245
  fn=main,
246
  inputs=[
247
  gr.File(label="Upload CSV File (.csv)"),
248
+ gr.Slider(label="Max Clusters to Display", minimum=2, maximum=20, step=1, value=10)
249
  ],
250
  outputs=[
251
  gr.File(label="Clustered Data CSV"),
252
  gr.Number(label="Clustering Quality (%)"),
253
+ gr.Number(label="Davies-Bouldin Index"),
254
  gr.Image(label="Cluster Plot"),
255
  gr.Image(label="Silhouette Plot")
256
  ],