tanish78 committed on
Commit
34a9249
·
verified ·
1 Parent(s): 4e72933

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -12
app.py CHANGED
@@ -42,23 +42,23 @@ categories_keywords = {
42
  def categorize_question(question):
43
  words = question.split()
44
 
45
- # List of words to exclude from 'End of Conversation'
46
  exclusion_words = {'is', 'please', 'not resolved', 'unresolved', 'problem', 'help', 'issue', 'webinar', 'office', 'leave', 'approved', 'notice', 'period', 'good morning', 'when', 'where', 'why', 'how', 'which', 'and when'}
47
 
48
- # Check if the question has only one word
49
  if len(words) == 1:
50
  single_word = words[0].lower()
51
  # Check if the single word fits into any other category
52
  for category, keywords in categories_keywords.items():
53
  if any(single_word in keyword for keyword in keywords):
54
  return category
55
- # If it doesn't fit into any other category, check if it should be 'End of Conversation'
56
  if any(single_word in keyword for keyword in categories_keywords["End of Conversation"]):
57
  return "End of Conversation"
58
  else:
59
  return "Miscellaneous"
60
 
61
- # Categorization of other queries
62
  for category, keywords in categories_keywords.items():
63
  if any(keyword.lower() in question.lower() for keyword in keywords):
64
  return category
@@ -121,7 +121,7 @@ def preprocess_data(df):
121
  df['texts'] = df['texts'].str.strip()
122
  df = df[df['texts'] != '']
123
 
124
- # Categorize the texts
125
  df['Category'] = df['texts'].apply(categorize_question)
126
 
127
  return df
@@ -164,7 +164,7 @@ def generate_wordcloud(df):
164
  return img
165
 
166
  def generate_bar_chart(df, num_clusters_to_display):
167
- # Exclude common words from the top words
168
  common_words = {'i', 'you', 'thanks', 'thank', 'ok', 'okay', 'sure', 'done'}
169
 
170
  top_categories = df['Category'].value_counts().index[:num_clusters_to_display]
@@ -195,30 +195,30 @@ def main(file, num_clusters_to_display):
195
 
196
  df = preprocess_data(df)
197
 
198
- # Perform clustering before categorization
199
  num_clusters = 5 # Or any other appropriate number of clusters
200
  df, kmeans = cluster_data(df, num_clusters)
201
 
202
- # Categorize the texts after clustering
203
  df['Category'] = df['texts'].apply(categorize_question)
204
 
205
  df = df[df['Category'] != 'Miscellaneous']
206
 
207
- # Get category sizes and sort by size in ascending order
208
  category_sizes = df['Category'].value_counts().reset_index()
209
  category_sizes.columns = ['Category', 'Count']
210
  sorted_categories = category_sizes.sort_values(by='Count', ascending=False)['Category'].tolist()
211
  sorted_categories_sm = category_sizes.sort_values(by='Count', ascending=True)['Category'].tolist()
212
 
213
- # Get the largest x categories as specified by num_clusters_to_display
214
  largest_categories = sorted_categories[:num_clusters_to_display]
215
  smallest_categories = sorted_categories_sm[:num_clusters_to_display]
216
 
217
- # Filter the dataframe to include only the largest categories
218
  filtered_df = df[df['Category'].isin(largest_categories)]
219
  filtered_cloud_df = df[df['Category'].isin(smallest_categories)]
220
 
221
- # Sort the dataframe by Category and Cluster
222
  filtered_df = filtered_df.sort_values(by=['Category', 'Cluster'])
223
  filtered_cloud_df = filtered_cloud_df.sort_values(by='Category')
224
 
 
42
  def categorize_question(question):
43
  words = question.split()
44
 
45
+ # Words to exclude from 'End of Conversation'
46
  exclusion_words = {'is', 'please', 'not resolved', 'unresolved', 'problem', 'help', 'issue', 'webinar', 'office', 'leave', 'approved', 'notice', 'period', 'good morning', 'when', 'where', 'why', 'how', 'which', 'and when'}
47
 
48
+ # Single word user query
49
  if len(words) == 1:
50
  single_word = words[0].lower()
51
  # Check if the single word fits into any other category
52
  for category, keywords in categories_keywords.items():
53
  if any(single_word in keyword for keyword in keywords):
54
  return category
55
+
56
  if any(single_word in keyword for keyword in categories_keywords["End of Conversation"]):
57
  return "End of Conversation"
58
  else:
59
  return "Miscellaneous"
60
 
61
+ # Categorization of all remaining (non-single-word) queries by keyword match
62
  for category, keywords in categories_keywords.items():
63
  if any(keyword.lower() in question.lower() for keyword in keywords):
64
  return category
 
121
  df['texts'] = df['texts'].str.strip()
122
  df = df[df['texts'] != '']
123
 
124
+ # Categorize
125
  df['Category'] = df['texts'].apply(categorize_question)
126
 
127
  return df
 
164
  return img
165
 
166
  def generate_bar_chart(df, num_clusters_to_display):
167
+ # Exclude common words
168
  common_words = {'i', 'you', 'thanks', 'thank', 'ok', 'okay', 'sure', 'done'}
169
 
170
  top_categories = df['Category'].value_counts().index[:num_clusters_to_display]
 
195
 
196
  df = preprocess_data(df)
197
 
198
+ # Clustering
199
  num_clusters = 5 # Or any other appropriate number of clusters
200
  df, kmeans = cluster_data(df, num_clusters)
201
 
202
+ # Categorization
203
  df['Category'] = df['texts'].apply(categorize_question)
204
 
205
  df = df[df['Category'] != 'Miscellaneous']
206
 
207
+ # Count rows per category, then rank categories largest-first and smallest-first
208
  category_sizes = df['Category'].value_counts().reset_index()
209
  category_sizes.columns = ['Category', 'Count']
210
  sorted_categories = category_sizes.sort_values(by='Count', ascending=False)['Category'].tolist()
211
  sorted_categories_sm = category_sizes.sort_values(by='Count', ascending=True)['Category'].tolist()
212
 
213
+ # Display (according to input slider)
214
  largest_categories = sorted_categories[:num_clusters_to_display]
215
  smallest_categories = sorted_categories_sm[:num_clusters_to_display]
216
 
217
+ # Filtering (according to input slider)
218
  filtered_df = df[df['Category'].isin(largest_categories)]
219
  filtered_cloud_df = df[df['Category'].isin(smallest_categories)]
220
 
221
+ # Sort the output file by Category and Cluster
222
  filtered_df = filtered_df.sort_values(by=['Category', 'Cluster'])
223
  filtered_cloud_df = filtered_cloud_df.sort_values(by='Category')
224