tanish78 committed on
Commit
8cfbb65
·
verified ·
1 Parent(s): 13d2652

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -138
app.py CHANGED
@@ -2,19 +2,17 @@ import gradio as gr
2
  import pandas as pd
3
  from sklearn.feature_extraction.text import TfidfVectorizer
4
  from sklearn.cluster import KMeans
5
- from sklearn.metrics import silhouette_score, silhouette_samples
6
  import matplotlib.pyplot as plt
7
- from sklearn.decomposition import PCA
8
  import re
9
  from io import BytesIO
10
  import tempfile
11
  import numpy as np
12
  from PIL import Image
 
13
 
14
  def preprocess_data(df):
15
  df.rename(columns={'Question Asked': 'texts'}, inplace=True)
16
- df['texts'] = df['texts'].astype(str)
17
- df['texts'] = df['texts'].str.lower()
18
  df['texts'] = df['texts'].apply(lambda text: re.sub(r'https?://\S+|www\.\S+', '', text))
19
 
20
  def remove_emoji(string):
@@ -26,7 +24,7 @@ def preprocess_data(df):
26
  u"\U00002702-\U000027B0"
27
  u"\U000024C2-\U0001F251"
28
  "]+", flags=re.UNICODE)
29
- return emoji_pattern.sub(r'', string) if isinstance(string, str) else string
30
 
31
  df['texts'] = df['texts'].apply(remove_emoji)
32
 
@@ -45,67 +43,22 @@ def preprocess_data(df):
45
 
46
  for original_word, synonym_list in custom_synonyms.items():
47
  for synonym in synonym_list:
48
- pattern = r"\b" + synonym + r"\b(?!\s*\()"
49
  df['texts'] = df['texts'].str.replace(pattern, original_word, regex=True)
50
- pattern = r"\b" + synonym + r"\s+you" + r"\b(?!\s*\()"
51
- df['texts'] = df['texts'].str.replace(pattern, original_word + ' ', regex=True)
52
 
53
  spam_list = ["click here", "free", "recharge", "limited", "discount", "money back guarantee", "aaj", "kal", "mein",
54
  "how can i help you", "how can we help you", "how we can help you", "follow", "king", "contacting", "gar",
55
- "kirke", "subscribe", "youtube", "jio", "insta", "make money", "b2b","sent using truecaller"]
56
 
57
- rows_to_remove = set()
58
  for spam_phrase in spam_list:
59
  pattern = r"\b" + re.escape(spam_phrase) + r"\b"
60
- spam_rows = df['texts'].str.contains(pattern)
61
- rows_to_remove.update(df.index[spam_rows].tolist())
62
-
63
- df = df.drop(rows_to_remove)
64
-
65
- greet_variations = ["hello", "hy", "hey", "hii", "hi", "heyyy", "bie", "bye"]
66
- for greet_var in greet_variations:
67
- pattern = r"(?<!\S)" + greet_var + r"(?!\S)|\b" + greet_var + r"\b"
68
- df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
69
-
70
- okay_variations = ["ok", "k", "kay", "okay", "okie", "kk", "ohhhk","t","r"]
71
- for okay_var in okay_variations:
72
- pattern = r"(?<!\S)" + okay_var + r"(?!\S)|\b" + okay_var + r"\b"
73
- df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
74
-
75
- yes_variations = ["yes", "yeah", "yep", "yup", "yuh", "ya", "yes got it", "yeah it is", "yesss", "yea","no"]
76
- for yes_var in yes_variations:
77
- pattern = r"(?<!\S)" + yes_var + r"(?!\S)|\b" + yes_var + r"\b"
78
- df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
79
-
80
- remove_phrases = ["i'm all set","ask a question","apply the survey","videos (2-8 min)","long reads (> 8 min)",
81
- "short reads (3-8 min)","not a student alumni","mock","share feedback","bite size (< 2 min)",
82
- "actually no","next steps","i'm a student alumni","i have questions"]
83
-
84
- for phrase in remove_phrases:
85
- df['texts'] = df['texts'].str.replace(phrase, '')
86
-
87
- general_variations = ["good morning", "good evening", "good afternoon", "good night", "done", "sorry", "top", "query",
88
- "stop", "sir", "sure", "oh", "wow", "aaa", "maam", "mam", "ma&#39;am","i'm all set","ask a question","apply the survey",
89
- "videos (2-8 min)","long reads (> 8 min)","short reads (3-8 min)","not a student alumni","mock","share feedback","bite size (< 2 min)",
90
- "actually no","next steps","i'm a student alumni","i have questions"]
91
- for gen_var in general_variations:
92
- pattern = r"(?<!\S)" + gen_var + r"(?!\S)|\b" + gen_var + r"\b(?=\W|$)"
93
- df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
94
 
95
  def remove_punctuations(text):
96
  return re.sub(r'[^\w\s]', '', text)
97
- df['texts'] = df['texts'].apply(remove_punctuations)
98
-
99
- remove_morephrases = ["short reads 38 min","bite size 2 min","videos 28 min","long reads 8 min"]
100
-
101
- for phrase in remove_morephrases:
102
- df['texts'] = df['texts'].str.replace(phrase, '')
103
-
104
- df = df[~df['texts'].str.contains(r'\b\d{10}\b')]
105
 
 
106
  df['texts'] = df['texts'].str.strip()
107
-
108
- df['texts'] = df['texts'].apply(lambda x: x.strip())
109
  df = df[df['texts'] != '']
110
 
111
  return df
@@ -118,96 +71,47 @@ def cluster_data(df, num_clusters):
118
  kmeans.fit(X)
119
  df['Cluster'] = kmeans.labels_
120
 
121
- pca = PCA(n_components=2)
122
- principal_components = pca.fit_transform(X.toarray())
123
- df['PCA1'] = principal_components[:, 0]
124
- df['PCA2'] = principal_components[:, 1]
125
-
126
- return df, X, kmeans
127
 
128
- def visualize_clusters(df):
129
- plt.figure(figsize=(10, 6))
130
- scatter = plt.scatter(df['PCA1'], df['PCA2'], c=df['Cluster'], cmap='viridis')
131
- plt.legend(*scatter.legend_elements(), title="Clusters")
132
- plt.title('Clusters of User Queries')
133
- plt.xlabel('PCA Component 1')
134
- plt.ylabel('PCA Component 2')
135
  buf = BytesIO()
136
  plt.savefig(buf, format='png')
137
  buf.seek(0)
138
  img = Image.open(buf)
139
  return img
140
 
141
- def silhouette_analysis(X, labels, num_clusters):
142
- fig, ax1 = plt.subplots(1, 1)
143
- fig.set_size_inches(10, 6)
144
-
145
- ax1.set_xlim([-0.1, 1])
146
- ax1.set_ylim([0, X.shape[0] + (num_clusters + 1) * 10])
147
 
148
- sample_silhouette_values = silhouette_samples(X, labels)
149
- y_lower = 10
150
- for i in range(num_clusters):
151
- ith_cluster_silhouette_values = sample_silhouette_values[labels == i]
152
- ith_cluster_silhouette_values.sort()
153
- size_cluster_i = ith_cluster_silhouette_values.shape[0]
154
- y_upper = y_lower + size_cluster_i
155
- color = plt.cm.nipy_spectral(float(i) / num_clusters)
156
- ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values,
157
- facecolor=color, edgecolor=color, alpha=0.7)
158
- ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
159
- y_lower = y_upper + 10
160
 
161
- ax1.set_title("The silhouette plot for the various clusters.")
162
- ax1.set_xlabel("The silhouette coefficient values")
163
- ax1.set_ylabel("Cluster label")
164
- ax1.axvline(x=np.mean(sample_silhouette_values), color="red", linestyle="--")
165
- ax1.set_yticks([])
166
- ax1.set_xticks([i/10.0 for i in range(-1, 11)])
167
 
168
- buf = BytesIO()
169
- plt.savefig(buf, format='png')
170
- buf.seek(0)
171
- img = Image.open(buf)
172
- return img
173
 
174
- def main(file, num_clusters_to_display):
175
- try:
176
- df = pd.read_csv(file)
177
-
178
- # Filter by 'Fallback Message shown'
179
- df = df[(df['Answer'] == 'Fallback Message shown')]
180
-
181
- df = preprocess_data(df)
182
- df, X, kmeans = cluster_data(df, num_clusters=15)
183
-
184
- cluster_plot = visualize_clusters(df)
185
-
186
- cluster_sizes = df['Cluster'].value_counts()
187
- sorted_clusters = cluster_sizes.index.tolist()
188
- df['Cluster'] = pd.Categorical(df['Cluster'], categories=sorted_clusters, ordered=True)
189
- df = df.sort_values('Cluster')
190
-
191
- # Filter out the largest cluster and get the next largest clusters
192
- largest_cluster = sorted_clusters[0]
193
- filtered_clusters = sorted_clusters[1:num_clusters_to_display+1]
194
-
195
- df = df[df['Cluster'].isin(filtered_clusters)]
196
- df['Cluster'] = pd.Categorical(df['Cluster'], categories=filtered_clusters, ordered=True)
197
- df = df.sort_values('Cluster')
198
-
199
- silhouette_avg = silhouette_score(X, kmeans.labels_)
200
- silhouette_plot = silhouette_analysis(X, kmeans.labels_, num_clusters=15)
201
-
202
- # Convert silhouette score to percentage
203
- silhouette_percentage = (silhouette_avg + 1) * 50
204
-
205
- with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
206
- df.to_csv(tmpfile.name, index=False)
207
- return tmpfile.name, silhouette_percentage, cluster_plot, silhouette_plot
208
- except Exception as e:
209
- print(f"Error: {e}")
210
- return str(e), None, None, None
211
 
212
  interface = gr.Interface(
213
  fn=main,
@@ -217,13 +121,10 @@ interface = gr.Interface(
217
  ],
218
  outputs=[
219
  gr.File(label="Clustered Data CSV"),
220
- gr.Number(label="Clustering Quality (%)"),
221
- gr.Image(label="Cluster Plot"),
222
- gr.Image(label="Silhouette Plot")
223
  ],
224
  title="Unanswered User Queries Clustering",
225
  description="Unanswered User Query Categorization"
226
  )
227
 
228
- interface.launch()
229
-
 
2
  import pandas as pd
3
  from sklearn.feature_extraction.text import TfidfVectorizer
4
  from sklearn.cluster import KMeans
 
5
  import matplotlib.pyplot as plt
 
6
  import re
7
  from io import BytesIO
8
  import tempfile
9
  import numpy as np
10
  from PIL import Image
11
+ from wordcloud import WordCloud
12
 
13
  def preprocess_data(df):
14
  df.rename(columns={'Question Asked': 'texts'}, inplace=True)
15
+ df['texts'] = df['texts'].astype(str).str.lower()
 
16
  df['texts'] = df['texts'].apply(lambda text: re.sub(r'https?://\S+|www\.\S+', '', text))
17
 
18
  def remove_emoji(string):
 
24
  u"\U00002702-\U000027B0"
25
  u"\U000024C2-\U0001F251"
26
  "]+", flags=re.UNICODE)
27
+ return emoji_pattern.sub(r'', string)
28
 
29
  df['texts'] = df['texts'].apply(remove_emoji)
30
 
 
43
 
44
  for original_word, synonym_list in custom_synonyms.items():
45
  for synonym in synonym_list:
46
+ pattern = r"\b" + synonym + r"\b"
47
  df['texts'] = df['texts'].str.replace(pattern, original_word, regex=True)
 
 
48
 
49
  spam_list = ["click here", "free", "recharge", "limited", "discount", "money back guarantee", "aaj", "kal", "mein",
50
  "how can i help you", "how can we help you", "how we can help you", "follow", "king", "contacting", "gar",
51
+ "kirke", "subscribe", "youtube", "jio", "insta", "make money", "b2b", "sent using truecaller"]
52
 
 
53
  for spam_phrase in spam_list:
54
  pattern = r"\b" + re.escape(spam_phrase) + r"\b"
55
+ df = df[~df['texts'].str.contains(pattern)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
  def remove_punctuations(text):
58
  return re.sub(r'[^\w\s]', '', text)
 
 
 
 
 
 
 
 
59
 
60
+ df['texts'] = df['texts'].apply(remove_punctuations)
61
  df['texts'] = df['texts'].str.strip()
 
 
62
  df = df[df['texts'] != '']
63
 
64
  return df
 
71
  kmeans.fit(X)
72
  df['Cluster'] = kmeans.labels_
73
 
74
+ return df, kmeans
 
 
 
 
 
75
 
76
def generate_wordcloud(df):
    """Render a word cloud of the 'texts' column and return it as a PIL image.

    Args:
        df: DataFrame with a 'texts' column of preprocessed query strings.

    Returns:
        PIL.Image.Image containing the rendered word cloud (800x400 canvas
        drawn on a 10x5-inch figure, white background).

    NOTE(review): ``WordCloud.generate`` raises ``ValueError`` when the joined
    text contains no words (i.e. every row was filtered out upstream) — TODO
    confirm upstream guarantees a non-empty DataFrame.
    """
    text = " ".join(df['texts'].tolist())
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

    fig = plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')

    buf = BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    img = Image.open(buf)
    # Close the figure explicitly: the original leaked one open figure per
    # call, which accumulates across repeated Gradio invocations.
    plt.close(fig)
    return img
87
 
88
def main(file, num_clusters_to_display):
    """Cluster unanswered user queries from an uploaded CSV export.

    Args:
        file: Path to an uploaded CSV containing at least the columns
            'Question Asked' and 'Answer'.
        num_clusters_to_display: Number of clusters to keep in the output,
            counted after dropping the single largest cluster. Gradio Number
            inputs arrive as floats, so the value is coerced to int before
            it is used as a slice bound.

    Returns:
        Tuple of (path to the clustered CSV file, word-cloud PIL image).
    """
    df = pd.read_csv(file)

    # Keep only the queries the bot failed to answer.
    df = df[df['Answer'] == 'Fallback Message shown']

    df = preprocess_data(df)
    df, kmeans = cluster_data(df, num_clusters=15)

    # Cluster ids ordered by size, largest first.
    sorted_clusters = df['Cluster'].value_counts().index.tolist()

    # Drop the largest cluster (presumably a catch-all bucket — the original
    # comment says to filter it out) and keep the next N largest. Coerce to
    # int: gr.Number delivers a float, and a float slice bound raises
    # TypeError.
    num_to_show = int(num_clusters_to_display)
    filtered_clusters = sorted_clusters[1:num_to_show + 1]

    df = df[df['Cluster'].isin(filtered_clusters)]
    # Ordered categorical so sort_values orders rows by cluster size rank.
    df['Cluster'] = pd.Categorical(df['Cluster'], categories=filtered_clusters, ordered=True)
    df = df.sort_values('Cluster')

    wordcloud_img = generate_wordcloud(df)

    # delete=False so the file survives for Gradio to serve after the
    # context manager closes it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
        df.to_csv(tmpfile.name, index=False)
        csv_file_path = tmpfile.name

    return csv_file_path, wordcloud_img
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
  interface = gr.Interface(
117
  fn=main,
 
121
  ],
122
  outputs=[
123
  gr.File(label="Clustered Data CSV"),
124
+ gr.Image(label="Word Cloud")
 
 
125
  ],
126
  title="Unanswered User Queries Clustering",
127
  description="Unanswered User Query Categorization"
128
  )
129
 
130
+ interface.launch(share=True)