tanish78 committed on
Commit
162880f
·
verified ·
1 Parent(s): b5afc59

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -149
app.py CHANGED
@@ -5,127 +5,33 @@ from sklearn.cluster import KMeans
5
  import matplotlib.pyplot as plt
6
  from sklearn.decomposition import PCA
7
  import re
8
-
9
- def preprocess_excel_data(df):
10
- df.rename(columns={'Queries': 'texts'}, inplace=True)
11
- df['texts'] = df['texts'].astype(str)
12
- df['texts'] = df['texts'].str.lower()
13
- df['texts'] = df['texts'].apply(lambda text: re.sub(r'https?://\S+|www\.\S+', '', text))
14
-
15
- def remove_emoji(string):
16
- emoji_pattern = re.compile("["
17
- u"\U0001F600-\U0001F64F"
18
- u"\U0001F300-\U0001F5FF"
19
- u"\U0001F680-\U0001F6FF"
20
- u"\U0001F1E0-\U0001F1FF"
21
- u"\U00002702-\U000027B0"
22
- u"\U000024C2-\U0001F251"
23
- "]+", flags=re.UNICODE)
24
- return emoji_pattern.sub(r'', string) if isinstance(string, str) else string
25
-
26
- df['texts'] = df['texts'].apply(remove_emoji)
27
-
28
- custom_synonyms = {
29
- 'application': ['form'],
30
- 'apply': ['fill', 'applied'],
31
- 'work': ['job'],
32
- 'salary': ['stipend', 'pay', 'payment', 'paid'],
33
- 'test': ['online test', 'amcat test', 'exam', 'assessment'],
34
- 'pass': ['clear', 'selected', 'pass or not'],
35
- 'result': ['outcome', 'mark', 'marks'],
36
- 'thanks': ["thanks a lot to you", "thankyou so much", "thank you so much", "tysm", "thank you",
37
- "okaythank", "thx", "ty", "thankyou", "thank", "thank u"],
38
- 'interview': ["pi"]
39
- }
40
-
41
- for original_word, synonym_list in custom_synonyms.items():
42
- for synonym in synonym_list:
43
- pattern = r"\b" + synonym + r"\b(?!\s*\()"
44
- df['texts'] = df['texts'].str.replace(pattern, original_word, regex=True)
45
- pattern = r"\b" + synonym + r"\s+you" + r"\b(?!\s*\()"
46
- df['texts'] = df['texts'].str.replace(pattern, original_word + ' ', regex=True)
47
-
48
- spam_list = ["click here", "free", "recharge", "limited", "discount", "money back guarantee", "aaj", "kal", "mein",
49
- "how can i help you", "how can we help you", "how we can help you", "follow", "king", "contacting", "gar",
50
- "kirke", "subscribe", "youtube", "jio", "insta", "make money", "b2b", "sent using truecaller"]
51
-
52
- rows_to_remove = set()
53
- for spam_phrase in spam_list:
54
- pattern = r"\b" + re.escape(spam_phrase) + r"\b"
55
- spam_rows = df['texts'].str.contains(pattern)
56
- rows_to_remove.update(df.index[spam_rows].tolist())
57
-
58
- df = df.drop(rows_to_remove)
59
-
60
- greet_variations = ["hello", "hy", "hey", "hii", "hi", "heyyy", "bie", "bye"]
61
- for greet_var in greet_variations:
62
- pattern = r"(?<!\S)" + greet_var + r"(?!\S)|\b" + greet_var + r"\b"
63
- df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
64
-
65
- okay_variations = ["ok", "k", "kay", "okay", "okie", "kk", "ohhhk", "t", "r"]
66
- for okay_var in okay_variations:
67
- pattern = r"(?<!\S)" + okay_var + r"(?!\S)|\b" + okay_var + r"\b"
68
- df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
69
-
70
- yes_variations = ["yes", "yeah", "yep", "yup", "yuh", "ya", "yes got it", "yeah it is", "yesss", "yea", "no"]
71
- for yes_var in yes_variations:
72
- pattern = r"(?<!\S)" + yes_var + r"(?!\S)|\b" + yes_var + r"\b"
73
- df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
74
-
75
- remove_phrases = ["i'm all set", "ask a question", "apply the survey", "videos (2-8 min)", "long reads (> 8 min)",
76
- "short reads (3-8 min)", "not a student alumni", "mock", "share feedback", "bite size (< 2 min)",
77
- "actually no", "next steps", "i'm a student alumni", "i have questions"]
78
-
79
- for phrase in remove_phrases:
80
- df['texts'] = df['texts'].str.replace(phrase, '')
81
-
82
- general_variations = ["good morning", "good evening", "good afternoon", "good night", "done", "sorry", "top", "query",
83
- "stop", "sir", "sure", "oh", "wow", "aaa", "maam", "mam", "ma'am", "i'm all set", "ask a question",
84
- "apply the survey", "videos (2-8 min)", "long reads (> 8 min)", "short reads (3-8 min)", "not a student alumni",
85
- "mock", "share feedback", "bite size (< 2 min)", "actually no", "next steps", "i'm a student alumni", "i have questions"]
86
-
87
- for gen_var in general_variations:
88
- pattern = r"(?<!\S)" + gen_var + r"(?!\S)|\b" + gen_var + r"\b(?=\W|$)"
89
- df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
90
-
91
- def remove_punctuations(text):
92
- return re.sub(r'[^\w\s]', '', text)
93
- df['texts'] = df['texts'].apply(remove_punctuations)
94
-
95
- remove_morephrases = ["short reads 38 min", "bite size 2 min", "videos 28 min", "long reads 8 min"]
96
-
97
- for phrase in remove_morephrases:
98
- df['texts'] = df['texts'].str.replace(phrase, '')
99
-
100
- df = df[~df['texts'].str.contains(r'\b\d{10}\b')]
101
-
102
- df['texts'] = df['texts'].str.strip()
103
-
104
- df['texts'] = df['texts'].apply(lambda x: x.strip())
105
- df = df[df['texts'] != '']
106
-
107
- return df
108
-
109
- def preprocess_csv_data(df):
110
- df = df[df['Answer'] == 'Fallback Message shown']
111
- df.rename(columns={'User Query': 'texts'}, inplace=True)
112
  df['texts'] = df['texts'].astype(str)
113
  df['texts'] = df['texts'].str.lower()
114
  df['texts'] = df['texts'].apply(lambda text: re.sub(r'https?://\S+|www\.\S+', '', text))
115
 
116
  def remove_emoji(string):
117
  emoji_pattern = re.compile("["
118
- u"\U0001F600-\U0001F64F"
119
- u"\U0001F300-\U0001F5FF"
120
- u"\U0001F680-\U0001F6FF"
121
- u"\U0001F1E0-\U0001F1FF"
122
- u"\U00002702-\U000027B0"
123
- u"\U000024C2-\U0001F251"
124
- "]+", flags=re.UNICODE)
125
- return emoji_pattern.sub(r'', string) if isinstance(string, str) else string
126
 
127
  df['texts'] = df['texts'].apply(remove_emoji)
128
-
129
  custom_synonyms = {
130
  'application': ['form'],
131
  'apply': ['fill', 'applied'],
@@ -148,7 +54,7 @@ def preprocess_csv_data(df):
148
 
149
  spam_list = ["click here", "free", "recharge", "limited", "discount", "money back guarantee", "aaj", "kal", "mein",
150
  "how can i help you", "how can we help you", "how we can help you", "follow", "king", "contacting", "gar",
151
- "kirke", "subscribe", "youtube", "jio", "insta", "make money", "b2b", "sent using truecaller"]
152
 
153
  rows_to_remove = set()
154
  for spam_phrase in spam_list:
@@ -163,28 +69,27 @@ def preprocess_csv_data(df):
163
  pattern = r"(?<!\S)" + greet_var + r"(?!\S)|\b" + greet_var + r"\b"
164
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
165
 
166
- okay_variations = ["ok", "k", "kay", "okay", "okie", "kk", "ohhhk", "t", "r"]
167
  for okay_var in okay_variations:
168
  pattern = r"(?<!\S)" + okay_var + r"(?!\S)|\b" + okay_var + r"\b"
169
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
170
 
171
- yes_variations = ["yes", "yeah", "yep", "yup", "yuh", "ya", "yes got it", "yeah it is", "yesss", "yea", "no"]
172
  for yes_var in yes_variations:
173
  pattern = r"(?<!\S)" + yes_var + r"(?!\S)|\b" + yes_var + r"\b"
174
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
175
 
176
- remove_phrases = ["i'm all set", "ask a question", "apply the survey", "videos (2-8 min)", "long reads (> 8 min)",
177
- "short reads (3-8 min)", "not a student alumni", "mock", "share feedback", "bite size (< 2 min)",
178
- "actually no", "next steps", "i'm a student alumni", "i have questions"]
179
 
180
  for phrase in remove_phrases:
181
  df['texts'] = df['texts'].str.replace(phrase, '')
182
 
183
  general_variations = ["good morning", "good evening", "good afternoon", "good night", "done", "sorry", "top", "query",
184
- "stop", "sir", "sure", "oh", "wow", "aaa", "maam", "mam", "ma'am", "i'm all set", "ask a question",
185
- "apply the survey", "videos (2-8 min)", "long reads (> 8 min)", "short reads (3-8 min)", "not a student alumni",
186
- "mock", "share feedback", "bite size (< 2 min)", "actually no", "next steps", "i'm a student alumni", "i have questions"]
187
-
188
  for gen_var in general_variations:
189
  pattern = r"(?<!\S)" + gen_var + r"(?!\S)|\b" + gen_var + r"\b(?=\W|$)"
190
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
@@ -193,7 +98,7 @@ def preprocess_csv_data(df):
193
  return re.sub(r'[^\w\s]', '', text)
194
  df['texts'] = df['texts'].apply(remove_punctuations)
195
 
196
- remove_morephrases = ["short reads 38 min", "bite size 2 min", "videos 28 min", "long reads 8 min"]
197
 
198
  for phrase in remove_morephrases:
199
  df['texts'] = df['texts'].str.replace(phrase, '')
@@ -207,47 +112,66 @@ def preprocess_csv_data(df):
207
 
208
  return df
209
 
210
- def kmeans_clustering(df, num_clusters):
 
211
  vectorizer = TfidfVectorizer(stop_words='english')
212
  X = vectorizer.fit_transform(df['texts'])
213
 
214
- kmeans = KMeans(n_clusters=num_clusters, random_state=42)
215
  kmeans.fit(X)
216
  df['Cluster'] = kmeans.labels_
217
 
218
- cluster_counts = df['Cluster'].value_counts().sort_values(ascending=False)
219
- df_filtered = df[df['Cluster'].isin(cluster_counts.head(num_clusters).index)]
 
 
220
 
221
- return df_filtered
 
 
 
 
 
 
 
 
 
222
 
223
- def main(file, num_clusters):
224
  try:
225
- if file.name.endswith('.xlsx'):
226
- df = pd.read_excel(file.name)
227
- df = preprocess_excel_data(df)
228
- elif file.name.endswith('.csv'):
229
- df = pd.read_csv(file.name)
230
- df = preprocess_csv_data(df)
231
- else:
232
- return "Invalid file format. Please upload an Excel or CSV file."
233
-
234
- df = kmeans_clustering(df, num_clusters)
 
 
 
 
 
 
235
  df = df.sort_values('Cluster')
236
 
237
- output_file = "filtered_clusters.csv"
238
- df.to_csv(output_file, index=False)
239
- return output_file
240
  except Exception as e:
241
  return str(e)
242
 
243
- iface = gr.Interface(
244
  fn=main,
245
  inputs=[
246
- gr.File(label="Upload Excel or CSV File (.xlsx or .csv)"),
247
- gr.Slider(minimum=1, maximum=20, step=1, label="Number of Categories to Display")
248
  ],
249
- outputs=gr.File(label="Filtered CSV File")
 
 
250
  )
251
 
252
- if __name__ == "__main__":
253
- iface.launch()
 
5
  import matplotlib.pyplot as plt
6
  from sklearn.decomposition import PCA
7
  import re
8
+ from io import BytesIO
9
+ import tempfile
10
+ from datetime import datetime
11
+
12
+ def preprocess_data(df):
13
+ # Filter based on the 'Answer' column and the date
14
+ df = df[(df['Answer'] == 'Fallback Message shown') & (pd.to_datetime(df['Date'], dayfirst=True) > datetime(2024, 7, 1))]
15
+
16
+ # Rename and preprocess the 'Question Asked' column
17
+ df.rename(columns={'Question Asked': 'texts'}, inplace=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  df['texts'] = df['texts'].astype(str)
19
  df['texts'] = df['texts'].str.lower()
20
  df['texts'] = df['texts'].apply(lambda text: re.sub(r'https?://\S+|www\.\S+', '', text))
21
 
22
  def remove_emoji(string):
23
  emoji_pattern = re.compile("["
24
+ u"\U0001F600-\U0001F64F" # emoticons
25
+ u"\U0001F300-\U0001F5FF" # symbols & pictographs
26
+ u"\U0001F680-\U0001F6FF" # transport & map symbols
27
+ u"\U0001F1E0-\U0001F1FF" # flags (iOS)
28
+ u"\U00002702-\U000027B0"
29
+ u"\U000024C2-\U0001F251"
30
+ "]+", flags=re.UNICODE)
31
+ return emoji_pattern.sub(r'', string)
32
 
33
  df['texts'] = df['texts'].apply(remove_emoji)
34
+
35
  custom_synonyms = {
36
  'application': ['form'],
37
  'apply': ['fill', 'applied'],
 
54
 
55
  spam_list = ["click here", "free", "recharge", "limited", "discount", "money back guarantee", "aaj", "kal", "mein",
56
  "how can i help you", "how can we help you", "how we can help you", "follow", "king", "contacting", "gar",
57
+ "kirke", "subscribe", "youtube", "jio", "insta", "make money", "b2b","sent using truecaller"]
58
 
59
  rows_to_remove = set()
60
  for spam_phrase in spam_list:
 
69
  pattern = r"(?<!\S)" + greet_var + r"(?!\S)|\b" + greet_var + r"\b"
70
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
71
 
72
+ okay_variations = ["ok", "k", "kay", "okay", "okie", "kk", "ohhhk","t","r"]
73
  for okay_var in okay_variations:
74
  pattern = r"(?<!\S)" + okay_var + r"(?!\S)|\b" + okay_var + r"\b"
75
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
76
 
77
+ yes_variations = ["yes", "yeah", "yep", "yup", "yuh", "ya", "yes got it", "yeah it is", "yesss", "yea","no"]
78
  for yes_var in yes_variations:
79
  pattern = r"(?<!\S)" + yes_var + r"(?!\S)|\b" + yes_var + r"\b"
80
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
81
 
82
+ remove_phrases = ["i'm all set","ask a question","apply the survey","videos (2-8 min)","long reads (> 8 min)",
83
+ "short reads (3-8 min)","not a student alumni","mock","share feedback","bite size (< 2 min)",
84
+ "actually no","next steps","i'm a student alumni","i have questions"]
85
 
86
  for phrase in remove_phrases:
87
  df['texts'] = df['texts'].str.replace(phrase, '')
88
 
89
  general_variations = ["good morning", "good evening", "good afternoon", "good night", "done", "sorry", "top", "query",
90
+ "stop", "sir", "sure", "oh", "wow", "aaa", "maam", "mam", "ma&#39;am","i'm all set","ask a question","apply the survey",
91
+ "videos (2-8 min)","long reads (> 8 min)","short reads (3-8 min)","not a student alumni","mock","share feedback","bite size (< 2 min)",
92
+ "actually no","next steps","i'm a student alumni","i have questions"]
 
93
  for gen_var in general_variations:
94
  pattern = r"(?<!\S)" + gen_var + r"(?!\S)|\b" + gen_var + r"\b(?=\W|$)"
95
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
 
98
  return re.sub(r'[^\w\s]', '', text)
99
  df['texts'] = df['texts'].apply(remove_punctuations)
100
 
101
+ remove_morephrases = ["short reads 38 min","bite size 2 min","videos 28 min","long reads 8 min"]
102
 
103
  for phrase in remove_morephrases:
104
  df['texts'] = df['texts'].str.replace(phrase, '')
 
112
 
113
  return df
114
 
115
def cluster_data(df, num_clusters=10):
    """Cluster the queries with TF-IDF + K-Means and attach 2-D PCA coordinates.

    Args:
        df: DataFrame with a ``texts`` column holding the query strings.
        num_clusters: Upper bound on the number of K-Means clusters. Defaults
            to 10, the value that used to be hard-coded here. It is clamped
            to the number of rows because K-Means refuses to fit more
            clusters than it has samples.

    Returns:
        A copy of ``df`` with three added columns: ``Cluster`` (K-Means
        label), ``PCA1`` and ``PCA2`` (first two principal components of the
        TF-IDF matrix).

    Raises:
        ValueError: If ``num_clusters`` is smaller than 1 or ``df`` has no rows.
    """
    if num_clusters < 1:
        raise ValueError("num_clusters must be at least 1")
    if df.empty:
        raise ValueError("No queries left to cluster")

    # Work on a copy so that adding columns never writes into a filtered
    # slice of the caller's frame.
    df = df.copy()

    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(df['texts'])

    kmeans = KMeans(n_clusters=min(num_clusters, X.shape[0]), random_state=0)
    kmeans.fit(X)
    df['Cluster'] = kmeans.labels_

    # PCA needs a dense array; the sparse TF-IDF matrix is densified here.
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(X.toarray())
    df['PCA1'] = principal_components[:, 0]
    df['PCA2'] = principal_components[:, 1]

    return df
130
+
131
def visualize_clusters(df):
    """Show a scatter plot of the queries in PCA space, coloured by cluster.

    Args:
        df: DataFrame with ``PCA1``, ``PCA2`` and ``Cluster`` columns.

    Returns:
        None. The figure is shown and then released.
    """
    fig = plt.figure(figsize=(10, 6))
    try:
        scatter = plt.scatter(df['PCA1'], df['PCA2'], c=df['Cluster'], cmap='viridis')
        plt.legend(*scatter.legend_elements(), title="Clusters")
        plt.title('Clusters of User Queries')
        plt.xlabel('PCA Component 1')
        plt.ylabel('PCA Component 2')
        plt.show()
    finally:
        # pyplot keeps every figure registered until it is closed; without
        # this, each call would leave another figure behind.
        plt.close(fig)
139
 
140
def main(file, num_clusters_to_display):
    """Cluster the queries of an uploaded CSV and export the largest clusters.

    Args:
        file: Uploaded file object; only its ``name`` attribute (the path on
            disk) is read.
        num_clusters_to_display: How many of the largest clusters to keep.
            Cluster 0 is always left out.

    Returns:
        Path of a temporary ``.csv`` file containing the rows of the selected
        clusters, ordered from the largest cluster to the smallest.

    Raises:
        gr.Error: If any step fails. The output component is a file, so an
            error message cannot be returned as a value; raising lets Gradio
            display it instead of treating the text as a file path.
    """
    try:
        df = pd.read_csv(file.name)
        df = preprocess_data(df)
        df = cluster_data(df)
        visualize_clusters(df)

        # value_counts() orders by frequency, so the index lists the cluster
        # ids from the largest cluster to the smallest.
        sorted_clusters = df['Cluster'].value_counts().index.tolist()

        # Cluster 0 is deliberately excluded from the ranking.
        filtered_clusters = [cluster for cluster in sorted_clusters if cluster != 0]
        # The slider value may arrive as a float; slicing needs an int.
        top_clusters = filtered_clusters[:int(num_clusters_to_display)]

        # Select rows by cluster id. Matching the 'texts' column against the
        # integer ids can never succeed and produced an empty export.
        df = df[df['Cluster'].isin(top_clusters)].copy()
        df['Cluster'] = pd.Categorical(df['Cluster'], categories=top_clusters, ordered=True)
        df = df.sort_values('Cluster')

        # Reserve a path, close the handle, then write: reopening a
        # still-open NamedTemporaryFile by name is not portable.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
            output_path = tmpfile.name
        df.to_csv(output_path, index=False)
        return output_path
    except Exception as e:
        raise gr.Error(str(e)) from e
165
 
166
# Gradio front end: one CSV upload and a slider choosing how many of the
# largest clusters to export; `main` returns the path of the resulting CSV.
interface = gr.Interface(
    fn=main,
    inputs=[
        gr.File(label="Upload CSV File (.csv)"),
        gr.Slider(minimum=1, maximum=10, step=1, label="Number of Categories to Display"),
    ],
    outputs=gr.File(label="Clustered Data CSV"),
    title="Unanswered User Queries Clustering",
    description="Upload a CSV file (.csv) and select the number of largest clusters to display (excluding cluster 0)",
)

# Start the web app.
interface.launch()