tanish78 committed on
Commit
b5afc59
·
verified ·
1 Parent(s): 1356191

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +145 -64
app.py CHANGED
@@ -5,10 +5,8 @@ from sklearn.cluster import KMeans
5
  import matplotlib.pyplot as plt
6
  from sklearn.decomposition import PCA
7
  import re
8
- from io import BytesIO
9
- import tempfile
10
 
11
- def preprocess_data(df):
12
  df.rename(columns={'Queries': 'texts'}, inplace=True)
13
  df['texts'] = df['texts'].astype(str)
14
  df['texts'] = df['texts'].str.lower()
@@ -16,13 +14,13 @@ def preprocess_data(df):
16
 
17
  def remove_emoji(string):
18
  emoji_pattern = re.compile("["
19
- u"\U0001F600-\U0001F64F"
20
- u"\U0001F300-\U0001F5FF"
21
- u"\U0001F680-\U0001F6FF"
22
- u"\U0001F1E0-\U0001F1FF"
23
- u"\U00002702-\U000027B0"
24
- u"\U000024C2-\U0001F251"
25
- "]+", flags=re.UNICODE)
26
  return emoji_pattern.sub(r'', string) if isinstance(string, str) else string
27
 
28
  df['texts'] = df['texts'].apply(remove_emoji)
@@ -49,7 +47,7 @@ def preprocess_data(df):
49
 
50
  spam_list = ["click here", "free", "recharge", "limited", "discount", "money back guarantee", "aaj", "kal", "mein",
51
  "how can i help you", "how can we help you", "how we can help you", "follow", "king", "contacting", "gar",
52
- "kirke", "subscribe", "youtube", "jio", "insta", "make money", "b2b","sent using truecaller"]
53
 
54
  rows_to_remove = set()
55
  for spam_phrase in spam_list:
@@ -64,27 +62,28 @@ def preprocess_data(df):
64
  pattern = r"(?<!\S)" + greet_var + r"(?!\S)|\b" + greet_var + r"\b"
65
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
66
 
67
- okay_variations = ["ok", "k", "kay", "okay", "okie", "kk", "ohhhk","t","r"]
68
  for okay_var in okay_variations:
69
  pattern = r"(?<!\S)" + okay_var + r"(?!\S)|\b" + okay_var + r"\b"
70
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
71
 
72
- yes_variations = ["yes", "yeah", "yep", "yup", "yuh", "ya", "yes got it", "yeah it is", "yesss", "yea","no"]
73
  for yes_var in yes_variations:
74
  pattern = r"(?<!\S)" + yes_var + r"(?!\S)|\b" + yes_var + r"\b"
75
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
76
 
77
- remove_phrases = ["i'm all set","ask a question","apply the survey","videos (2-8 min)","long reads (> 8 min)",
78
- "short reads (3-8 min)","not a student alumni","mock","share feedback","bite size (< 2 min)",
79
- "actually no","next steps","i'm a student alumni","i have questions"]
80
 
81
  for phrase in remove_phrases:
82
  df['texts'] = df['texts'].str.replace(phrase, '')
83
 
84
  general_variations = ["good morning", "good evening", "good afternoon", "good night", "done", "sorry", "top", "query",
85
- "stop", "sir", "sure", "oh", "wow", "aaa", "maam", "mam", "ma&#39;am","i'm all set","ask a question","apply the survey",
86
- "videos (2-8 min)","long reads (> 8 min)","short reads (3-8 min)","not a student alumni","mock","share feedback","bite size (< 2 min)",
87
- "actually no","next steps","i'm a student alumni","i have questions"]
 
88
  for gen_var in general_variations:
89
  pattern = r"(?<!\S)" + gen_var + r"(?!\S)|\b" + gen_var + r"\b(?=\W|$)"
90
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
@@ -93,7 +92,7 @@ def preprocess_data(df):
93
  return re.sub(r'[^\w\s]', '', text)
94
  df['texts'] = df['texts'].apply(remove_punctuations)
95
 
96
- remove_morephrases = ["short reads 38 min","bite size 2 min","videos 28 min","long reads 8 min"]
97
 
98
  for phrase in remove_morephrases:
99
  df['texts'] = df['texts'].str.replace(phrase, '')
@@ -107,66 +106,148 @@ def preprocess_data(df):
107
 
108
  return df
109
 
110
- def cluster_data(df):
111
- num_clusters = 10 # Set the number of clusters to 15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  vectorizer = TfidfVectorizer(stop_words='english')
113
  X = vectorizer.fit_transform(df['texts'])
114
 
115
- kmeans = KMeans(n_clusters=num_clusters, random_state=0)
116
  kmeans.fit(X)
117
  df['Cluster'] = kmeans.labels_
118
 
119
- pca = PCA(n_components=2)
120
- principal_components = pca.fit_transform(X.toarray())
121
- df['PCA1'] = principal_components[:, 0]
122
- df['PCA2'] = principal_components[:, 1]
123
 
124
- return df
125
-
126
- def visualize_clusters(df):
127
- plt.figure(figsize=(10, 6))
128
- scatter = plt.scatter(df['PCA1'], df['PCA2'], c=df['Cluster'], cmap='viridis')
129
- plt.legend(*scatter.legend_elements(), title="Clusters")
130
- plt.title('Clusters of User Queries')
131
- plt.xlabel('PCA Component 1')
132
- plt.ylabel('PCA Component 2')
133
- plt.show()
134
 
135
- def main(file, num_clusters_to_display):
136
  try:
137
- df = pd.read_excel(file)
138
- df = preprocess_data(df)
139
- df = cluster_data(df)
140
- visualize_clusters(df)
141
-
142
- cluster_sizes = df['Cluster'].value_counts()
143
- sorted_clusters = cluster_sizes.index.tolist()
144
- df['Cluster'] = pd.Categorical(df['Cluster'], categories=sorted_clusters, ordered=True)
145
- df = df.sort_values('Cluster')
146
-
147
- # Filter out cluster 0 and get the largest clusters
148
- filtered_clusters = [cluster for cluster in sorted_clusters if cluster != 0]
149
- top_clusters = filtered_clusters[:num_clusters_to_display]
150
-
151
- df = df[df['Cluster'].isin(top_clusters)]
152
- df['Cluster'] = pd.Categorical(df['Cluster'], categories=top_clusters, ordered=True)
153
  df = df.sort_values('Cluster')
154
 
155
- with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
156
- df.to_csv(tmpfile.name, index=False)
157
- return tmpfile.name
158
  except Exception as e:
159
  return str(e)
160
 
161
- interface = gr.Interface(
162
  fn=main,
163
  inputs=[
164
- gr.File(label="Upload Excel File (.xlsx)"),
165
- gr.Slider(1, 10, step=1, label="Number of Categories to Display")
166
  ],
167
- outputs=gr.File(label="Clustered Data CSV"),
168
- title="Unanswered User Queries Clustering",
169
- description="Upload an Excel file (.xlsx)"
170
  )
171
 
172
- interface.launch()
 
 
5
  import matplotlib.pyplot as plt
6
  from sklearn.decomposition import PCA
7
  import re
 
 
8
 
9
+ def preprocess_excel_data(df):
10
  df.rename(columns={'Queries': 'texts'}, inplace=True)
11
  df['texts'] = df['texts'].astype(str)
12
  df['texts'] = df['texts'].str.lower()
 
14
 
15
  def remove_emoji(string):
16
  emoji_pattern = re.compile("["
17
+ u"\U0001F600-\U0001F64F"
18
+ u"\U0001F300-\U0001F5FF"
19
+ u"\U0001F680-\U0001F6FF"
20
+ u"\U0001F1E0-\U0001F1FF"
21
+ u"\U00002702-\U000027B0"
22
+ u"\U000024C2-\U0001F251"
23
+ "]+", flags=re.UNICODE)
24
  return emoji_pattern.sub(r'', string) if isinstance(string, str) else string
25
 
26
  df['texts'] = df['texts'].apply(remove_emoji)
 
47
 
48
  spam_list = ["click here", "free", "recharge", "limited", "discount", "money back guarantee", "aaj", "kal", "mein",
49
  "how can i help you", "how can we help you", "how we can help you", "follow", "king", "contacting", "gar",
50
+ "kirke", "subscribe", "youtube", "jio", "insta", "make money", "b2b", "sent using truecaller"]
51
 
52
  rows_to_remove = set()
53
  for spam_phrase in spam_list:
 
62
  pattern = r"(?<!\S)" + greet_var + r"(?!\S)|\b" + greet_var + r"\b"
63
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
64
 
65
+ okay_variations = ["ok", "k", "kay", "okay", "okie", "kk", "ohhhk", "t", "r"]
66
  for okay_var in okay_variations:
67
  pattern = r"(?<!\S)" + okay_var + r"(?!\S)|\b" + okay_var + r"\b"
68
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
69
 
70
+ yes_variations = ["yes", "yeah", "yep", "yup", "yuh", "ya", "yes got it", "yeah it is", "yesss", "yea", "no"]
71
  for yes_var in yes_variations:
72
  pattern = r"(?<!\S)" + yes_var + r"(?!\S)|\b" + yes_var + r"\b"
73
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
74
 
75
+ remove_phrases = ["i'm all set", "ask a question", "apply the survey", "videos (2-8 min)", "long reads (> 8 min)",
76
+ "short reads (3-8 min)", "not a student alumni", "mock", "share feedback", "bite size (< 2 min)",
77
+ "actually no", "next steps", "i'm a student alumni", "i have questions"]
78
 
79
  for phrase in remove_phrases:
80
  df['texts'] = df['texts'].str.replace(phrase, '')
81
 
82
  general_variations = ["good morning", "good evening", "good afternoon", "good night", "done", "sorry", "top", "query",
83
+ "stop", "sir", "sure", "oh", "wow", "aaa", "maam", "mam", "ma'am", "i'm all set", "ask a question",
84
+ "apply the survey", "videos (2-8 min)", "long reads (> 8 min)", "short reads (3-8 min)", "not a student alumni",
85
+ "mock", "share feedback", "bite size (< 2 min)", "actually no", "next steps", "i'm a student alumni", "i have questions"]
86
+
87
  for gen_var in general_variations:
88
  pattern = r"(?<!\S)" + gen_var + r"(?!\S)|\b" + gen_var + r"\b(?=\W|$)"
89
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
 
92
  return re.sub(r'[^\w\s]', '', text)
93
  df['texts'] = df['texts'].apply(remove_punctuations)
94
 
95
+ remove_morephrases = ["short reads 38 min", "bite size 2 min", "videos 28 min", "long reads 8 min"]
96
 
97
  for phrase in remove_morephrases:
98
  df['texts'] = df['texts'].str.replace(phrase, '')
 
106
 
107
  return df
108
 
109
def preprocess_csv_data(df):
    """Clean the 'User Query' column of a chatbot-export CSV for clustering.

    Keeps only rows whose 'Answer' is the fallback message, renames
    'User Query' -> 'texts', then lower-cases the text, strips URLs,
    emoji and punctuation, canonicalises common paraphrases, drops
    spam rows, and removes greeting/acknowledgement filler words.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Answer' and 'User Query' columns.

    Returns
    -------
    pandas.DataFrame
        Filtered frame with a cleaned, non-empty 'texts' column.
        The caller's frame is left untouched.
    """
    # Work on a copy: filtering then renaming in place on a slice triggers
    # SettingWithCopyWarning and can mutate the caller's frame.
    df = df[df['Answer'] == 'Fallback Message shown'].copy()
    df.rename(columns={'User Query': 'texts'}, inplace=True)
    df['texts'] = df['texts'].astype(str).str.lower()

    # Strip URLs before any word-level processing.
    df['texts'] = df['texts'].str.replace(r'https?://\S+|www\.\S+', '', regex=True)

    # Remove emoji / pictographic codepoints.
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"   # emoticons
        u"\U0001F300-\U0001F5FF"   # symbols & pictographs
        u"\U0001F680-\U0001F6FF"   # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"   # flags
        u"\U00002702-\U000027B0"   # dingbats
        u"\U000024C2-\U0001F251"   # enclosed characters
        "]+",
        flags=re.UNICODE,
    )
    df['texts'] = df['texts'].str.replace(emoji_pattern, '', regex=True)

    # Map paraphrases onto one canonical token so TF-IDF groups them together.
    custom_synonyms = {
        'application': ['form'],
        'apply': ['fill', 'applied'],
        'work': ['job'],
        'salary': ['stipend', 'pay', 'payment', 'paid'],
        'test': ['online test', 'amcat test', 'exam', 'assessment'],
        'pass': ['clear', 'selected', 'pass or not'],
        'result': ['outcome', 'mark', 'marks'],
        'thanks': ["thanks a lot to you", "thankyou so much", "thank you so much", "tysm", "thank you",
                   "okaythank", "thx", "ty", "thankyou", "thank", "thank u"],
        'interview': ["pi"],
    }
    for original_word, synonym_list in custom_synonyms.items():
        for synonym in synonym_list:
            esc = re.escape(synonym)
            # Handle "<synonym> you" first; running the plain replacement
            # first would consume the synonym and leave this branch dead.
            # The "(?!\s*\()" lookahead skips matches followed by "(...)".
            df['texts'] = df['texts'].str.replace(
                r"\b" + esc + r"\s+you\b(?!\s*\()", original_word + ' ', regex=True)
            df['texts'] = df['texts'].str.replace(
                r"\b" + esc + r"\b(?!\s*\()", original_word, regex=True)

    # Drop whole rows containing promotional / spam phrases.
    spam_list = ["click here", "free", "recharge", "limited", "discount", "money back guarantee", "aaj", "kal", "mein",
                 "how can i help you", "how can we help you", "how we can help you", "follow", "king", "contacting", "gar",
                 "kirke", "subscribe", "youtube", "jio", "insta", "make money", "b2b", "sent using truecaller"]
    spam_pattern = r"\b(?:" + "|".join(re.escape(p) for p in spam_list) + r")\b"
    df = df[~df['texts'].str.contains(spam_pattern, regex=True)]

    # Single-word noise: greetings, acknowledgements, yes/no fillers.
    # Order preserved from the original three lists (greet, okay, yes).
    word_variations = (
        ["hello", "hy", "hey", "hii", "hi", "heyyy", "bie", "bye"]
        + ["ok", "k", "kay", "okay", "okie", "kk", "ohhhk", "t", "r"]
        + ["yes", "yeah", "yep", "yup", "yuh", "ya", "yes got it",
           "yeah it is", "yesss", "yea", "no"]
    )
    for word in word_variations:
        esc = re.escape(word)
        pattern = r"(?<!\S)" + esc + r"(?!\S)|\b" + esc + r"\b"
        df['texts'] = df['texts'].str.replace(pattern, '', regex=True)

    # Survey/UI boilerplate removed as literal substrings; several phrases
    # contain regex metacharacters ("(2-8 min)"), so regex=False is required.
    remove_phrases = ["i'm all set", "ask a question", "apply the survey", "videos (2-8 min)", "long reads (> 8 min)",
                      "short reads (3-8 min)", "not a student alumni", "mock", "share feedback", "bite size (< 2 min)",
                      "actually no", "next steps", "i'm a student alumni", "i have questions"]
    for phrase in remove_phrases:
        df['texts'] = df['texts'].str.replace(phrase, '', regex=False)

    general_variations = ["good morning", "good evening", "good afternoon", "good night", "done", "sorry", "top", "query",
                          "stop", "sir", "sure", "oh", "wow", "aaa", "maam", "mam", "ma'am", "i'm all set", "ask a question",
                          "apply the survey", "videos (2-8 min)", "long reads (> 8 min)", "short reads (3-8 min)", "not a student alumni",
                          "mock", "share feedback", "bite size (< 2 min)", "actually no", "next steps", "i'm a student alumni", "i have questions"]
    for gen_var in general_variations:
        # re.escape: phrases like "(2-8 min)" would otherwise be treated
        # as regex groups/classes and never match literally.
        esc = re.escape(gen_var)
        pattern = r"(?<!\S)" + esc + r"(?!\S)|\b" + esc + r"\b(?=\W|$)"
        df['texts'] = df['texts'].str.replace(pattern, '', regex=True)

    # Strip punctuation, then the now-unpunctuated UI phrases.
    df['texts'] = df['texts'].str.replace(r'[^\w\s]', '', regex=True)
    remove_morephrases = ["short reads 38 min", "bite size 2 min", "videos 28 min", "long reads 8 min"]
    for phrase in remove_morephrases:
        df['texts'] = df['texts'].str.replace(phrase, '', regex=False)

    # Drop rows containing 10-digit phone numbers, then empty leftovers.
    df = df[~df['texts'].str.contains(r'\b\d{10}\b', regex=True)]
    df['texts'] = df['texts'].str.strip()
    df = df[df['texts'] != '']

    return df
209
+
210
def kmeans_clustering(df, num_clusters):
    """Cluster the 'texts' column with TF-IDF features and KMeans.

    Adds a 'Cluster' label column and returns only the rows belonging to
    the ``num_clusters`` largest clusters, ranked by cluster size.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a non-empty 'texts' column of strings.
    num_clusters : int
        Requested number of clusters (clamped to the row count).

    Returns
    -------
    pandas.DataFrame
        Labelled, filtered copy; the caller's frame is not mutated.
    """
    df = df.copy()  # avoid writing the 'Cluster' column into the caller's frame

    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(df['texts'])

    # KMeans requires n_samples >= n_clusters; clamp so tiny uploads
    # don't crash. n_init pinned to 10 (the historical sklearn default)
    # for reproducible results across sklearn versions.
    n_clusters = max(1, min(num_clusters, len(df)))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    kmeans.fit(X)
    df['Cluster'] = kmeans.labels_

    # Keep the largest clusters. Note: with at most `num_clusters` labels
    # this filter only bites when n_clusters was clamped below the request.
    cluster_counts = df['Cluster'].value_counts().sort_values(ascending=False)
    top_labels = cluster_counts.head(num_clusters).index
    return df[df['Cluster'].isin(top_labels)]
 
 
 
 
 
 
 
 
 
222
 
223
def main(file, num_clusters):
    """Gradio entry point: preprocess an uploaded file, cluster, export CSV.

    Parameters
    ----------
    file : gradio file object or str
        The uploaded spreadsheet; depending on the Gradio version this is
        a tempfile-like object exposing ``.name`` or a plain path string.
    num_clusters : int
        Number of clusters/categories requested via the UI slider.

    Returns
    -------
    str
        Path of the generated CSV on success, otherwise an error message
        (Gradio displays the returned string either way).
    """
    try:
        # Accept both a file object (.name) and a bare path string.
        path = getattr(file, 'name', file)
        lowered = str(path).lower()  # case-insensitive extension check

        if lowered.endswith('.xlsx'):
            df = preprocess_excel_data(pd.read_excel(path))
        elif lowered.endswith('.csv'):
            df = preprocess_csv_data(pd.read_csv(path))
        else:
            return "Invalid file format. Please upload an Excel or CSV file."

        df = kmeans_clustering(df, num_clusters)
        df = df.sort_values('Cluster')

        output_file = "filtered_clusters.csv"
        df.to_csv(output_file, index=False)
        return output_file
    except Exception as e:
        # UI boundary handler: surface the message instead of crashing the app.
        return str(e)
242
 
243
# --- Gradio UI wiring ---------------------------------------------------
# Two inputs (the uploaded spreadsheet and the cluster-count slider) feed
# `main`; the result comes back to the user as a downloadable CSV file.
_file_input = gr.File(label="Upload Excel or CSV File (.xlsx or .csv)")
_cluster_slider = gr.Slider(minimum=1, maximum=20, step=1,
                            label="Number of Categories to Display")
_csv_output = gr.File(label="Filtered CSV File")

iface = gr.Interface(fn=main,
                     inputs=[_file_input, _cluster_slider],
                     outputs=_csv_output)

if __name__ == "__main__":
    # Start the web app only when executed as a script, not on import.
    iface.launch()