Spaces:

TFI
/

K-Means_Clustering_Algorithm

Sleeping

App Files Community

tanish78 committed on Aug 1, 2024

Commit

34a9249

verified ·

1 Parent(s): 4e72933

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -12

app.py CHANGED Viewed

@@ -42,23 +42,23 @@ categories_keywords = {
 def categorize_question(question):
     words = question.split()
-    # List of words to exclude from 'End of Conversation'
     exclusion_words = {'is', 'please', 'not resolved', 'unresolved', 'problem', 'help', 'issue', 'webinar', 'office', 'leave', 'approved', 'notice', 'period', 'good morning', 'when', 'where', 'why', 'how', 'which', 'and when'}
-    # Check if the question has only one word
     if len(words) == 1:
         single_word = words[0].lower()
         # Check if the single word fits into any other category
         for category, keywords in categories_keywords.items():
             if any(single_word in keyword for keyword in keywords):
                 return category
-        # If it doesn't fit into any other category, check if it should be 'End of Conversation'
         if any(single_word in keyword for keyword in categories_keywords["End of Conversation"]):
             return "End of Conversation"
         else:
             return "Miscellaneous"
-    # Categorization of other queries
     for category, keywords in categories_keywords.items():
         if any(keyword.lower() in question.lower() for keyword in keywords):
             return category
@@ -121,7 +121,7 @@ def preprocess_data(df):
     df['texts'] = df['texts'].str.strip()
     df = df[df['texts'] != '']
-    # Categorize the texts
     df['Category'] = df['texts'].apply(categorize_question)
     return df
@@ -164,7 +164,7 @@ def generate_wordcloud(df):
     return img
 def generate_bar_chart(df, num_clusters_to_display):
-    # Exclude common words from the top words
     common_words = {'i', 'you', 'thanks', 'thank', 'ok', 'okay', 'sure', 'done'}
     top_categories = df['Category'].value_counts().index[:num_clusters_to_display]
@@ -195,30 +195,30 @@ def main(file, num_clusters_to_display):
         df = preprocess_data(df)
-        # Perform clustering before categorization
         num_clusters = 5  # Or any other appropriate number of clusters
         df, kmeans = cluster_data(df, num_clusters)
-        # Categorize the texts after clustering
         df['Category'] = df['texts'].apply(categorize_question)
         df = df[df['Category'] != 'Miscellaneous']
-        # Get category sizes and sort by size in ascending order
         category_sizes = df['Category'].value_counts().reset_index()
         category_sizes.columns = ['Category', 'Count']
         sorted_categories = category_sizes.sort_values(by='Count', ascending=False)['Category'].tolist()
         sorted_categories_sm = category_sizes.sort_values(by='Count', ascending=True)['Category'].tolist()
-        # Get the largest x categories as specified by num_clusters_to_display
         largest_categories = sorted_categories[:num_clusters_to_display]
         smallest_categories = sorted_categories_sm[:num_clusters_to_display]
-        # Filter the dataframe to include only the largest categories
         filtered_df = df[df['Category'].isin(largest_categories)]
         filtered_cloud_df = df[df['Category'].isin(smallest_categories)]
-        # Sort the dataframe by Category and Cluster
         filtered_df = filtered_df.sort_values(by=['Category', 'Cluster'])
         filtered_cloud_df = filtered_cloud_df.sort_values(by='Category')

 def categorize_question(question):
     words = question.split()
+    # words to exclude from End Conversation
     exclusion_words = {'is', 'please', 'not resolved', 'unresolved', 'problem', 'help', 'issue', 'webinar', 'office', 'leave', 'approved', 'notice', 'period', 'good morning', 'when', 'where', 'why', 'how', 'which', 'and when'}
+    # Single word user query
     if len(words) == 1:
         single_word = words[0].lower()
         # Check if the single word fits into any other category
         for category, keywords in categories_keywords.items():
             if any(single_word in keyword for keyword in keywords):
                 return category
         if any(single_word in keyword for keyword in categories_keywords["End of Conversation"]):
             return "End of Conversation"
         else:
             return "Miscellaneous"
+    # Categorization of non-ending queries
     for category, keywords in categories_keywords.items():
         if any(keyword.lower() in question.lower() for keyword in keywords):
             return category
     df['texts'] = df['texts'].str.strip()
     df = df[df['texts'] != '']
+    # Categorize
     df['Category'] = df['texts'].apply(categorize_question)
     return df
     return img
 def generate_bar_chart(df, num_clusters_to_display):
+    # Exclude common words
     common_words = {'i', 'you', 'thanks', 'thank', 'ok', 'okay', 'sure', 'done'}
     top_categories = df['Category'].value_counts().index[:num_clusters_to_display]
         df = preprocess_data(df)
+        # Clustering
         num_clusters = 5  # Or any other appropriate number of clusters
         df, kmeans = cluster_data(df, num_clusters)
+        # Categorization
         df['Category'] = df['texts'].apply(categorize_question)
         df = df[df['Category'] != 'Miscellaneous']
+        # Sorting (ascending order)
         category_sizes = df['Category'].value_counts().reset_index()
         category_sizes.columns = ['Category', 'Count']
         sorted_categories = category_sizes.sort_values(by='Count', ascending=False)['Category'].tolist()
         sorted_categories_sm = category_sizes.sort_values(by='Count', ascending=True)['Category'].tolist()
+        # Display (according to input slider)
         largest_categories = sorted_categories[:num_clusters_to_display]
         smallest_categories = sorted_categories_sm[:num_clusters_to_display]
+        # Filtering (according to input slider)
         filtered_df = df[df['Category'].isin(largest_categories)]
         filtered_cloud_df = df[df['Category'].isin(smallest_categories)]
+        # Sort the output file by Category and Cluster
         filtered_df = filtered_df.sort_values(by=['Category', 'Cluster'])
         filtered_cloud_df = filtered_cloud_df.sort_values(by='Category')