tanish78 committed on
Commit
8c44bd7
·
verified ·
1 Parent(s): a8cb37f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -35
app.py CHANGED
@@ -16,13 +16,13 @@ def preprocess_data(df):
16
 
17
  def remove_emoji(string):
18
  emoji_pattern = re.compile("["
19
- u"\U0001F600-\U0001F64F"
20
- u"\U0001F300-\U0001F5FF"
21
- u"\U0001F680-\U0001F6FF"
22
- u"\U0001F1E0-\U0001F1FF"
23
- u"\U00002702-\U000027B0"
24
- u"\U000024C2-\U0001F251"
25
- "]+", flags=re.UNICODE)
26
  return emoji_pattern.sub(r'', string) if isinstance(string, str) else string
27
 
28
  df['texts'] = df['texts'].apply(remove_emoji)
@@ -49,7 +49,7 @@ def preprocess_data(df):
49
 
50
  spam_list = ["click here", "free", "recharge", "limited", "discount", "money back guarantee", "aaj", "kal", "mein",
51
  "how can i help you", "how can we help you", "how we can help you", "follow", "king", "contacting", "gar",
52
- "kirke", "subscribe", "youtube", "jio", "insta", "make money", "b2b", "sent using truecaller"]
53
 
54
  rows_to_remove = set()
55
  for spam_phrase in spam_list:
@@ -64,28 +64,27 @@ def preprocess_data(df):
64
  pattern = r"(?<!\S)" + greet_var + r"(?!\S)|\b" + greet_var + r"\b"
65
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
66
 
67
- okay_variations = ["ok", "k", "kay", "okay", "okie", "kk", "ohhhk", "t", "r"]
68
  for okay_var in okay_variations:
69
  pattern = r"(?<!\S)" + okay_var + r"(?!\S)|\b" + okay_var + r"\b"
70
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
71
 
72
- yes_variations = ["yes", "yeah", "yep", "yup", "yuh", "ya", "yes got it", "yeah it is", "yesss", "yea", "no"]
73
  for yes_var in yes_variations:
74
  pattern = r"(?<!\S)" + yes_var + r"(?!\S)|\b" + yes_var + r"\b"
75
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
76
 
77
- remove_phrases = ["i'm all set", "ask a question", "apply the survey", "videos (2-8 min)", "long reads (> 8 min)",
78
- "short reads (3-8 min)", "not a student alumni", "mock", "share feedback", "bite size (< 2 min)",
79
- "actually no", "next steps", "i'm a student alumni", "i have questions"]
80
 
81
  for phrase in remove_phrases:
82
  df['texts'] = df['texts'].str.replace(phrase, '')
83
 
84
  general_variations = ["good morning", "good evening", "good afternoon", "good night", "done", "sorry", "top", "query",
85
- "stop", "sir", "sure", "oh", "wow", "aaa", "maam", "mam", "ma'am", "i'm all set", "ask a question",
86
- "apply the survey", "videos (2-8 min)", "long reads (> 8 min)", "short reads (3-8 min)", "not a student alumni",
87
- "mock", "share feedback", "bite size (< 2 min)", "actually no", "next steps", "i'm a student alumni", "i have questions"]
88
-
89
  for gen_var in general_variations:
90
  pattern = r"(?<!\S)" + gen_var + r"(?!\S)|\b" + gen_var + r"\b(?=\W|$)"
91
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
@@ -94,7 +93,7 @@ def preprocess_data(df):
94
  return re.sub(r'[^\w\s]', '', text)
95
  df['texts'] = df['texts'].apply(remove_punctuations)
96
 
97
- remove_morephrases = ["short reads 38 min", "bite size 2 min", "videos 28 min", "long reads 8 min"]
98
 
99
  for phrase in remove_morephrases:
100
  df['texts'] = df['texts'].str.replace(phrase, '')
@@ -135,15 +134,8 @@ def visualize_clusters(df):
135
 
136
  def main(file, num_clusters_to_display):
137
  try:
138
- # Detect if the file is CSV or Excel
139
- if file.name.endswith('.csv'):
140
- df = pd.read_csv(file.name)
141
- df = df[df['Answer'] == 'Fallback Message shown'] # Filter for 'Fallback Message shown' in 'Answer' column
142
- df.rename(columns={'User Query': 'texts'}, inplace=True) # Rename column to 'texts' for processing
143
- else:
144
- df = pd.read_excel(file.name)
145
- df = preprocess_data(df)
146
-
147
  df = cluster_data(df)
148
  visualize_clusters(df)
149
 
@@ -156,23 +148,25 @@ def main(file, num_clusters_to_display):
156
  filtered_clusters = [cluster for cluster in sorted_clusters if cluster != 0]
157
  top_clusters = filtered_clusters[:num_clusters_to_display]
158
 
159
- df = df[df['texts'] != '']
160
  df = df[df['Cluster'].isin(top_clusters)]
161
  df['Cluster'] = pd.Categorical(df['Cluster'], categories=top_clusters, ordered=True)
162
  df = df.sort_values('Cluster')
163
 
164
- return df.to_csv(index=False)
 
 
165
  except Exception as e:
166
  return str(e)
167
 
168
- iface = gr.Interface(
169
  fn=main,
170
  inputs=[
171
- gr.File(label="Upload Excel or CSV File (.xlsx or .csv)"),
172
- gr.Slider(minimum=1, maximum=20, step=1, label="Number of Categories to Display")
173
  ],
174
- outputs=gr.File(label="Filtered CSV File")
 
 
175
  )
176
 
177
- if __name__ == "__main__":
178
- iface.launch()
 
16
 
17
  def remove_emoji(string):
18
  emoji_pattern = re.compile("["
19
+ u"\U0001F600-\U0001F64F"
20
+ u"\U0001F300-\U0001F5FF"
21
+ u"\U0001F680-\U0001F6FF"
22
+ u"\U0001F1E0-\U0001F1FF"
23
+ u"\U00002702-\U000027B0"
24
+ u"\U000024C2-\U0001F251"
25
+ "]+", flags=re.UNICODE)
26
  return emoji_pattern.sub(r'', string) if isinstance(string, str) else string
27
 
28
  df['texts'] = df['texts'].apply(remove_emoji)
 
49
 
50
  spam_list = ["click here", "free", "recharge", "limited", "discount", "money back guarantee", "aaj", "kal", "mein",
51
  "how can i help you", "how can we help you", "how we can help you", "follow", "king", "contacting", "gar",
52
+ "kirke", "subscribe", "youtube", "jio", "insta", "make money", "b2b","sent using truecaller"]
53
 
54
  rows_to_remove = set()
55
  for spam_phrase in spam_list:
 
64
  pattern = r"(?<!\S)" + greet_var + r"(?!\S)|\b" + greet_var + r"\b"
65
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
66
 
67
+ okay_variations = ["ok", "k", "kay", "okay", "okie", "kk", "ohhhk","t","r"]
68
  for okay_var in okay_variations:
69
  pattern = r"(?<!\S)" + okay_var + r"(?!\S)|\b" + okay_var + r"\b"
70
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
71
 
72
+ yes_variations = ["yes", "yeah", "yep", "yup", "yuh", "ya", "yes got it", "yeah it is", "yesss", "yea","no"]
73
  for yes_var in yes_variations:
74
  pattern = r"(?<!\S)" + yes_var + r"(?!\S)|\b" + yes_var + r"\b"
75
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
76
 
77
+ remove_phrases = ["i'm all set","ask a question","apply the survey","videos (2-8 min)","long reads (> 8 min)",
78
+ "short reads (3-8 min)","not a student alumni","mock","share feedback","bite size (< 2 min)",
79
+ "actually no","next steps","i'm a student alumni","i have questions"]
80
 
81
  for phrase in remove_phrases:
82
  df['texts'] = df['texts'].str.replace(phrase, '')
83
 
84
  general_variations = ["good morning", "good evening", "good afternoon", "good night", "done", "sorry", "top", "query",
85
+ "stop", "sir", "sure", "oh", "wow", "aaa", "maam", "mam", "ma'am","i'm all set","ask a question","apply the survey",
86
+ "videos (2-8 min)","long reads (> 8 min)","short reads (3-8 min)","not a student alumni","mock","share feedback","bite size (< 2 min)",
87
+ "actually no","next steps","i'm a student alumni","i have questions"]
 
88
  for gen_var in general_variations:
89
  pattern = r"(?<!\S)" + gen_var + r"(?!\S)|\b" + gen_var + r"\b(?=\W|$)"
90
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
 
93
  return re.sub(r'[^\w\s]', '', text)
94
  df['texts'] = df['texts'].apply(remove_punctuations)
95
 
96
+ remove_morephrases = ["short reads 38 min","bite size 2 min","videos 28 min","long reads 8 min"]
97
 
98
  for phrase in remove_morephrases:
99
  df['texts'] = df['texts'].str.replace(phrase, '')
 
134
 
135
  def main(file, num_clusters_to_display):
136
  try:
137
+ df = pd.read_excel(file)
138
+ df = preprocess_data(df)
 
 
 
 
 
 
 
139
  df = cluster_data(df)
140
  visualize_clusters(df)
141
 
 
148
  filtered_clusters = [cluster for cluster in sorted_clusters if cluster != 0]
149
  top_clusters = filtered_clusters[:num_clusters_to_display]
150
 
 
151
  df = df[df['Cluster'].isin(top_clusters)]
152
  df['Cluster'] = pd.Categorical(df['Cluster'], categories=top_clusters, ordered=True)
153
  df = df.sort_values('Cluster')
154
 
155
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
156
+ df.to_csv(tmpfile.name, index=False)
157
+ return tmpfile.name
158
  except Exception as e:
159
  return str(e)
160
 
161
+ interface = gr.Interface(
162
  fn=main,
163
  inputs=[
164
+ gr.File(label="Upload Excel File (.xlsx)"),
165
+ gr.Slider(1, 10, step=1, label="Number of Categories to Display")
166
  ],
167
+ outputs=gr.File(label="Clustered Data CSV"),
168
+ title="Unanswered User Queries Clustering",
169
+ description="Upload an Excel file (.xlsx) and select the number of largest clusters to display (excluding cluster 0)"
170
  )
171
 
172
+ interface.launch()