tanish78 committed on
Commit
a8cb37f
·
verified ·
1 Parent(s): c108323

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -58
app.py CHANGED
@@ -16,13 +16,13 @@ def preprocess_data(df):
16
 
17
  def remove_emoji(string):
18
  emoji_pattern = re.compile("["
19
- u"\U0001F600-\U0001F64F"
20
- u"\U0001F300-\U0001F5FF"
21
- u"\U0001F680-\U0001F6FF"
22
- u"\U0001F1E0-\U0001F1FF"
23
- u"\U00002702-\U000027B0"
24
- u"\U000024C2-\U0001F251"
25
- "]+", flags=re.UNICODE)
26
  return emoji_pattern.sub(r'', string) if isinstance(string, str) else string
27
 
28
  df['texts'] = df['texts'].apply(remove_emoji)
@@ -49,7 +49,7 @@ def preprocess_data(df):
49
 
50
  spam_list = ["click here", "free", "recharge", "limited", "discount", "money back guarantee", "aaj", "kal", "mein",
51
  "how can i help you", "how can we help you", "how we can help you", "follow", "king", "contacting", "gar",
52
- "kirke", "subscribe", "youtube", "jio", "insta", "make money", "b2b","sent using truecaller"]
53
 
54
  rows_to_remove = set()
55
  for spam_phrase in spam_list:
@@ -64,27 +64,28 @@ def preprocess_data(df):
64
  pattern = r"(?<!\S)" + greet_var + r"(?!\S)|\b" + greet_var + r"\b"
65
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
66
 
67
- okay_variations = ["ok", "k", "kay", "okay", "okie", "kk", "ohhhk","t","r"]
68
  for okay_var in okay_variations:
69
  pattern = r"(?<!\S)" + okay_var + r"(?!\S)|\b" + okay_var + r"\b"
70
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
71
 
72
- yes_variations = ["yes", "yeah", "yep", "yup", "yuh", "ya", "yes got it", "yeah it is", "yesss", "yea","no"]
73
  for yes_var in yes_variations:
74
  pattern = r"(?<!\S)" + yes_var + r"(?!\S)|\b" + yes_var + r"\b"
75
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
76
 
77
- remove_phrases = ["i'm all set","ask a question","apply the survey","videos (2-8 min)","long reads (> 8 min)",
78
- "short reads (3-8 min)","not a student alumni","mock","share feedback","bite size (< 2 min)",
79
- "actually no","next steps","i'm a student alumni","i have questions"]
80
 
81
  for phrase in remove_phrases:
82
  df['texts'] = df['texts'].str.replace(phrase, '')
83
 
84
  general_variations = ["good morning", "good evening", "good afternoon", "good night", "done", "sorry", "top", "query",
85
- "stop", "sir", "sure", "oh", "wow", "aaa", "maam", "mam", "ma&#39;am","i'm all set","ask a question","apply the survey",
86
- "videos (2-8 min)","long reads (> 8 min)","short reads (3-8 min)","not a student alumni","mock","share feedback","bite size (< 2 min)",
87
- "actually no","next steps","i'm a student alumni","i have questions"]
 
88
  for gen_var in general_variations:
89
  pattern = r"(?<!\S)" + gen_var + r"(?!\S)|\b" + gen_var + r"\b(?=\W|$)"
90
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
@@ -93,7 +94,7 @@ def preprocess_data(df):
93
  return re.sub(r'[^\w\s]', '', text)
94
  df['texts'] = df['texts'].apply(remove_punctuations)
95
 
96
- remove_morephrases = ["short reads 38 min","bite size 2 min","videos 28 min","long reads 8 min"]
97
 
98
  for phrase in remove_morephrases:
99
  df['texts'] = df['texts'].str.replace(phrase, '')
@@ -108,7 +109,7 @@ def preprocess_data(df):
108
  return df
109
 
110
  def cluster_data(df):
111
- num_clusters = 15 # Set the number of clusters to 15
112
  vectorizer = TfidfVectorizer(stop_words='english')
113
  X = vectorizer.fit_transform(df['texts'])
114
 
@@ -134,25 +135,15 @@ def visualize_clusters(df):
134
 
135
  def main(file, num_clusters_to_display):
136
  try:
137
- # Determine the file type
138
- file_extension = file.name.split('.')[-1].lower()
139
-
140
- # Load the file
141
- if file_extension == 'xlsx':
142
- df = pd.read_excel(file)
143
- elif file_extension == 'csv':
144
- df = pd.read_csv(file)
145
  else:
146
- return "Unsupported file type. Please upload an Excel or CSV file."
147
-
148
- # Process CSV file specifically
149
- if file_extension == 'csv':
150
- # Keep only rows where 'Answer' is 'Fallback Message shown'
151
- df = df[df['Answer'] == 'Fallback Message shown']
152
- # Focus on 'Query' column for text processing
153
- df.rename(columns={'Query': 'texts'}, inplace=True)
154
-
155
- df = preprocess_data(df)
156
  df = cluster_data(df)
157
  visualize_clusters(df)
158
 
@@ -165,35 +156,23 @@ def main(file, num_clusters_to_display):
165
  filtered_clusters = [cluster for cluster in sorted_clusters if cluster != 0]
166
  top_clusters = filtered_clusters[:num_clusters_to_display]
167
 
 
168
  df = df[df['Cluster'].isin(top_clusters)]
169
  df['Cluster'] = pd.Categorical(df['Cluster'], categories=top_clusters, ordered=True)
170
  df = df.sort_values('Cluster')
171
 
172
- # Save the resulting DataFrame to a CSV file
173
- with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
174
- df.to_csv(tmpfile.name, index=False)
175
- tmpfile.flush()
176
-
177
- return tmpfile.name
178
-
179
  except Exception as e:
180
- return f"An error occurred: {str(e)}"
181
-
182
- def upload_file(file, num_clusters_to_display):
183
- result = main(file, num_clusters_to_display)
184
- if result.endswith(".csv"):
185
- return result
186
- else:
187
- return f"Error: {result}"
188
 
189
- interface = gr.Interface(
190
- fn=upload_file,
191
  inputs=[
192
- gr.inputs.File(label="Upload Excel or CSV File (.xlsx or .csv)", type="file"),
193
- gr.inputs.Slider(minimum=1, maximum=20, step=1, default=5, label="Number of Categories to Display")
194
  ],
195
- outputs=gr.outputs.File(label="Output CSV File"),
196
- title="Unanswered User Queries Clustering"
197
  )
198
 
199
- interface.launch(debug=True)
 
 
16
 
17
  def remove_emoji(string):
18
  emoji_pattern = re.compile("["
19
+ u"\U0001F600-\U0001F64F"
20
+ u"\U0001F300-\U0001F5FF"
21
+ u"\U0001F680-\U0001F6FF"
22
+ u"\U0001F1E0-\U0001F1FF"
23
+ u"\U00002702-\U000027B0"
24
+ u"\U000024C2-\U0001F251"
25
+ "]+", flags=re.UNICODE)
26
  return emoji_pattern.sub(r'', string) if isinstance(string, str) else string
27
 
28
  df['texts'] = df['texts'].apply(remove_emoji)
 
49
 
50
  spam_list = ["click here", "free", "recharge", "limited", "discount", "money back guarantee", "aaj", "kal", "mein",
51
  "how can i help you", "how can we help you", "how we can help you", "follow", "king", "contacting", "gar",
52
+ "kirke", "subscribe", "youtube", "jio", "insta", "make money", "b2b", "sent using truecaller"]
53
 
54
  rows_to_remove = set()
55
  for spam_phrase in spam_list:
 
64
  pattern = r"(?<!\S)" + greet_var + r"(?!\S)|\b" + greet_var + r"\b"
65
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
66
 
67
+ okay_variations = ["ok", "k", "kay", "okay", "okie", "kk", "ohhhk", "t", "r"]
68
  for okay_var in okay_variations:
69
  pattern = r"(?<!\S)" + okay_var + r"(?!\S)|\b" + okay_var + r"\b"
70
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
71
 
72
+ yes_variations = ["yes", "yeah", "yep", "yup", "yuh", "ya", "yes got it", "yeah it is", "yesss", "yea", "no"]
73
  for yes_var in yes_variations:
74
  pattern = r"(?<!\S)" + yes_var + r"(?!\S)|\b" + yes_var + r"\b"
75
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
76
 
77
+ remove_phrases = ["i'm all set", "ask a question", "apply the survey", "videos (2-8 min)", "long reads (> 8 min)",
78
+ "short reads (3-8 min)", "not a student alumni", "mock", "share feedback", "bite size (< 2 min)",
79
+ "actually no", "next steps", "i'm a student alumni", "i have questions"]
80
 
81
  for phrase in remove_phrases:
82
  df['texts'] = df['texts'].str.replace(phrase, '')
83
 
84
  general_variations = ["good morning", "good evening", "good afternoon", "good night", "done", "sorry", "top", "query",
85
+ "stop", "sir", "sure", "oh", "wow", "aaa", "maam", "mam", "ma'am", "i'm all set", "ask a question",
86
+ "apply the survey", "videos (2-8 min)", "long reads (> 8 min)", "short reads (3-8 min)", "not a student alumni",
87
+ "mock", "share feedback", "bite size (< 2 min)", "actually no", "next steps", "i'm a student alumni", "i have questions"]
88
+
89
  for gen_var in general_variations:
90
  pattern = r"(?<!\S)" + gen_var + r"(?!\S)|\b" + gen_var + r"\b(?=\W|$)"
91
  df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
 
94
  return re.sub(r'[^\w\s]', '', text)
95
  df['texts'] = df['texts'].apply(remove_punctuations)
96
 
97
+ remove_morephrases = ["short reads 38 min", "bite size 2 min", "videos 28 min", "long reads 8 min"]
98
 
99
  for phrase in remove_morephrases:
100
  df['texts'] = df['texts'].str.replace(phrase, '')
 
109
  return df
110
 
111
  def cluster_data(df):
112
+ num_clusters = 10 # Set the number of clusters to 10
113
  vectorizer = TfidfVectorizer(stop_words='english')
114
  X = vectorizer.fit_transform(df['texts'])
115
 
 
135
 
136
  def main(file, num_clusters_to_display):
137
  try:
138
+ # Detect if the file is CSV or Excel
139
+ if file.name.endswith('.csv'):
140
+ df = pd.read_csv(file.name)
141
+ df = df[df['Answer'] == 'Fallback Message shown'] # Filter for 'Fallback Message shown' in 'Answer' column
142
+ df.rename(columns={'User Query': 'texts'}, inplace=True) # Rename column to 'texts' for processing
 
 
 
143
  else:
144
+ df = pd.read_excel(file.name)
145
+ df = preprocess_data(df)
146
+
 
 
 
 
 
 
 
147
  df = cluster_data(df)
148
  visualize_clusters(df)
149
 
 
156
  filtered_clusters = [cluster for cluster in sorted_clusters if cluster != 0]
157
  top_clusters = filtered_clusters[:num_clusters_to_display]
158
 
159
+ df = df[df['texts'] != '']
160
  df = df[df['Cluster'].isin(top_clusters)]
161
  df['Cluster'] = pd.Categorical(df['Cluster'], categories=top_clusters, ordered=True)
162
  df = df.sort_values('Cluster')
163
 
164
+ return df.to_csv(index=False)
 
 
 
 
 
 
165
  except Exception as e:
166
+ return str(e)
 
 
 
 
 
 
 
167
 
168
+ iface = gr.Interface(
169
+ fn=main,
170
  inputs=[
171
+ gr.File(label="Upload Excel or CSV File (.xlsx or .csv)"),
172
+ gr.Slider(minimum=1, maximum=20, step=1, label="Number of Categories to Display")
173
  ],
174
+ outputs=gr.File(label="Filtered CSV File")
 
175
  )
176
 
177
+ if __name__ == "__main__":
178
+ iface.launch()