Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -42,23 +42,23 @@ categories_keywords = {
|
|
| 42 |
def categorize_question(question):
|
| 43 |
words = question.split()
|
| 44 |
|
| 45 |
-
#
|
| 46 |
exclusion_words = {'is', 'please', 'not resolved', 'unresolved', 'problem', 'help', 'issue', 'webinar', 'office', 'leave', 'approved', 'notice', 'period', 'good morning', 'when', 'where', 'why', 'how', 'which', 'and when'}
|
| 47 |
|
| 48 |
-
#
|
| 49 |
if len(words) == 1:
|
| 50 |
single_word = words[0].lower()
|
| 51 |
# Check if the single word fits into any other category
|
| 52 |
for category, keywords in categories_keywords.items():
|
| 53 |
if any(single_word in keyword for keyword in keywords):
|
| 54 |
return category
|
| 55 |
-
|
| 56 |
if any(single_word in keyword for keyword in categories_keywords["End of Conversation"]):
|
| 57 |
return "End of Conversation"
|
| 58 |
else:
|
| 59 |
return "Miscellaneous"
|
| 60 |
|
| 61 |
-
# Categorization of
|
| 62 |
for category, keywords in categories_keywords.items():
|
| 63 |
if any(keyword.lower() in question.lower() for keyword in keywords):
|
| 64 |
return category
|
|
@@ -121,7 +121,7 @@ def preprocess_data(df):
|
|
| 121 |
df['texts'] = df['texts'].str.strip()
|
| 122 |
df = df[df['texts'] != '']
|
| 123 |
|
| 124 |
-
# Categorize
|
| 125 |
df['Category'] = df['texts'].apply(categorize_question)
|
| 126 |
|
| 127 |
return df
|
|
@@ -164,7 +164,7 @@ def generate_wordcloud(df):
|
|
| 164 |
return img
|
| 165 |
|
| 166 |
def generate_bar_chart(df, num_clusters_to_display):
|
| 167 |
-
# Exclude common words
|
| 168 |
common_words = {'i', 'you', 'thanks', 'thank', 'ok', 'okay', 'sure', 'done'}
|
| 169 |
|
| 170 |
top_categories = df['Category'].value_counts().index[:num_clusters_to_display]
|
|
@@ -195,30 +195,30 @@ def main(file, num_clusters_to_display):
|
|
| 195 |
|
| 196 |
df = preprocess_data(df)
|
| 197 |
|
| 198 |
-
#
|
| 199 |
num_clusters = 5 # Or any other appropriate number of clusters
|
| 200 |
df, kmeans = cluster_data(df, num_clusters)
|
| 201 |
|
| 202 |
-
#
|
| 203 |
df['Category'] = df['texts'].apply(categorize_question)
|
| 204 |
|
| 205 |
df = df[df['Category'] != 'Miscellaneous']
|
| 206 |
|
| 207 |
-
#
|
| 208 |
category_sizes = df['Category'].value_counts().reset_index()
|
| 209 |
category_sizes.columns = ['Category', 'Count']
|
| 210 |
sorted_categories = category_sizes.sort_values(by='Count', ascending=False)['Category'].tolist()
|
| 211 |
sorted_categories_sm = category_sizes.sort_values(by='Count', ascending=True)['Category'].tolist()
|
| 212 |
|
| 213 |
-
#
|
| 214 |
largest_categories = sorted_categories[:num_clusters_to_display]
|
| 215 |
smallest_categories = sorted_categories_sm[:num_clusters_to_display]
|
| 216 |
|
| 217 |
-
#
|
| 218 |
filtered_df = df[df['Category'].isin(largest_categories)]
|
| 219 |
filtered_cloud_df = df[df['Category'].isin(smallest_categories)]
|
| 220 |
|
| 221 |
-
# Sort the
|
| 222 |
filtered_df = filtered_df.sort_values(by=['Category', 'Cluster'])
|
| 223 |
filtered_cloud_df = filtered_cloud_df.sort_values(by='Category')
|
| 224 |
|
|
|
|
| 42 |
def categorize_question(question):
|
| 43 |
words = question.split()
|
| 44 |
|
| 45 |
+
# Words to exclude from the "End of Conversation" category
|
| 46 |
exclusion_words = {'is', 'please', 'not resolved', 'unresolved', 'problem', 'help', 'issue', 'webinar', 'office', 'leave', 'approved', 'notice', 'period', 'good morning', 'when', 'where', 'why', 'how', 'which', 'and when'}
|
| 47 |
|
| 48 |
+
# Single word user query
|
| 49 |
if len(words) == 1:
|
| 50 |
single_word = words[0].lower()
|
| 51 |
# Check if the single word fits into any other category
|
| 52 |
for category, keywords in categories_keywords.items():
|
| 53 |
if any(single_word in keyword for keyword in keywords):
|
| 54 |
return category
|
| 55 |
+
|
| 56 |
if any(single_word in keyword for keyword in categories_keywords["End of Conversation"]):
|
| 57 |
return "End of Conversation"
|
| 58 |
else:
|
| 59 |
return "Miscellaneous"
|
| 60 |
|
| 61 |
+
# Categorization of non-ending queries
|
| 62 |
for category, keywords in categories_keywords.items():
|
| 63 |
if any(keyword.lower() in question.lower() for keyword in keywords):
|
| 64 |
return category
|
|
|
|
| 121 |
df['texts'] = df['texts'].str.strip()
|
| 122 |
df = df[df['texts'] != '']
|
| 123 |
|
| 124 |
+
# Categorize
|
| 125 |
df['Category'] = df['texts'].apply(categorize_question)
|
| 126 |
|
| 127 |
return df
|
|
|
|
| 164 |
return img
|
| 165 |
|
| 166 |
def generate_bar_chart(df, num_clusters_to_display):
|
| 167 |
+
# Exclude common words
|
| 168 |
common_words = {'i', 'you', 'thanks', 'thank', 'ok', 'okay', 'sure', 'done'}
|
| 169 |
|
| 170 |
top_categories = df['Category'].value_counts().index[:num_clusters_to_display]
|
|
|
|
| 195 |
|
| 196 |
df = preprocess_data(df)
|
| 197 |
|
| 198 |
+
# Clustering
|
| 199 |
num_clusters = 5 # Or any other appropriate number of clusters
|
| 200 |
df, kmeans = cluster_data(df, num_clusters)
|
| 201 |
|
| 202 |
+
# Categorization
|
| 203 |
df['Category'] = df['texts'].apply(categorize_question)
|
| 204 |
|
| 205 |
df = df[df['Category'] != 'Miscellaneous']
|
| 206 |
|
| 207 |
+
# Sort categories by count (descending for the largest, ascending for the smallest)
|
| 208 |
category_sizes = df['Category'].value_counts().reset_index()
|
| 209 |
category_sizes.columns = ['Category', 'Count']
|
| 210 |
sorted_categories = category_sizes.sort_values(by='Count', ascending=False)['Category'].tolist()
|
| 211 |
sorted_categories_sm = category_sizes.sort_values(by='Count', ascending=True)['Category'].tolist()
|
| 212 |
|
| 213 |
+
# Display (according to input slider)
|
| 214 |
largest_categories = sorted_categories[:num_clusters_to_display]
|
| 215 |
smallest_categories = sorted_categories_sm[:num_clusters_to_display]
|
| 216 |
|
| 217 |
+
# Filtering (according to input slider)
|
| 218 |
filtered_df = df[df['Category'].isin(largest_categories)]
|
| 219 |
filtered_cloud_df = df[df['Category'].isin(smallest_categories)]
|
| 220 |
|
| 221 |
+
# Sort the output file by Category and Cluster
|
| 222 |
filtered_df = filtered_df.sort_values(by=['Category', 'Cluster'])
|
| 223 |
filtered_cloud_df = filtered_cloud_df.sort_values(by='Category')
|
| 224 |
|