Spaces:

TFI
/

K-Means_Clustering_Algorithm

Sleeping

App Files Files Community

tanish78 committed on Jul 26, 2024

Commit

b268803

verified ·

1 Parent(s): d731f11

Update app.py

Browse files

Files changed (1) hide show

app.py +3 -27

app.py CHANGED Viewed

@@ -39,26 +39,19 @@ categories_keywords = {
 }
 def categorize_question(question):
-    # Split the question into words
     words = question.split()
-    # Check if the question has only one word
     if len(words) == 1:
         single_word = words[0].lower()
-        # Check if the single word is in the Start of Conversation category
         if any(single_word in keyword for keyword in categories_keywords["Start of Conversation"]):
             return "Start of Conversation"
         else:
             return "End of Conversation"
-    # Categorization of other queries
     for category, keywords in categories_keywords.items():
         if any(keyword.lower() in question.lower() for keyword in keywords):
             return category
     return "Miscellaneous"
 def preprocess_data(df):
     df.rename(columns={'Question Asked': 'texts'}, inplace=True)
     df['texts'] = df['texts'].astype(str).str.lower()
@@ -110,7 +103,6 @@ def preprocess_data(df):
     df['texts'] = df['texts'].str.strip()
     df = df[df['texts'] != '']
-    # Categorize the texts
     df['Category'] = df['texts'].apply(categorize_question)
     return df
@@ -153,7 +145,6 @@ def generate_wordcloud(df):
     return img
 def generate_bar_chart(df, num_clusters_to_display):
-    # Exclude common words from the top words
     common_words = {'i', 'you', 'thanks', 'thank', 'ok', 'okay', 'sure', 'done'}
     top_categories = df['Category'].value_counts().index[:num_clusters_to_display]
@@ -179,30 +170,15 @@ def main(file, num_clusters_to_display):
     try:
         df = pd.read_csv(file)
-        # Filter by 'Fallback Message shown'
         df = df[df['Answer'] == 'Fallback Message shown']
         df = preprocess_data(df)
-        # Get category sizes and sort by size in ascending order
-        category_sizes = df['Category'].value_counts().reset_index()
-        category_sizes.columns = ['Category', 'Count']
-        sorted_categories = category_sizes.sort_values(by='Count', ascending=True)['Category'].tolist()
-        # Get the largest x categories as specified by num_clusters_to_display
-        largest_categories = sorted_categories[:num_clusters_to_display]
-        # Filter the dataframe to include only the largest categories
-        filtered_df = df[df['Category'].isin(largest_categories)]
-        # Sort the dataframe by Category
-        filtered_df = filtered_df.sort_values(by='Category')
-        wordcloud_img = generate_wordcloud(filtered_df)
-        bar_chart_img = generate_bar_chart(filtered_df, num_clusters_to_display)
         with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
-            filtered_df.to_csv(tmpfile.name, index=False)
             csv_file_path = tmpfile.name
         return csv_file_path, wordcloud_img, bar_chart_img

 }
 def categorize_question(question):
     """Return the category label for a single question string.
     A question made of exactly one whitespace-separated word is labelled
     "Start of Conversation" when that word, lowercased, occurs as a
     substring of any keyword listed under that category in
     ``categories_keywords``; otherwise it is "End of Conversation".
     Any other question gets the first category (in dict iteration order)
     that has a keyword whose lowercase form occurs in the lowercased
     question, or "Miscellaneous" when no keyword matches.
     """
     tokens = question.split()
     if len(tokens) == 1:
         lone_word = tokens[0].lower()
         # NOTE(review): the test is "word is contained in keyword", not the
         # other way round -- confirm this direction is intended.
         for greeting in categories_keywords["Start of Conversation"]:
             if lone_word in greeting:
                 return "Start of Conversation"
         return "End of Conversation"
     # Lowercase the question once instead of once per keyword.
     lowered_question = question.lower()
     for category, keywords in categories_keywords.items():
         for keyword in keywords:
             if keyword.lower() in lowered_question:
                 return category
     return "Miscellaneous"
 def preprocess_data(df):
     df.rename(columns={'Question Asked': 'texts'}, inplace=True)
     df['texts'] = df['texts'].astype(str).str.lower()
     df['texts'] = df['texts'].str.strip()
     df = df[df['texts'] != '']
     df['Category'] = df['texts'].apply(categorize_question)
     return df
     return img
 def generate_bar_chart(df, num_clusters_to_display):
     common_words = {'i', 'you', 'thanks', 'thank', 'ok', 'okay', 'sure', 'done'}
     top_categories = df['Category'].value_counts().index[:num_clusters_to_display]
     try:
         df = pd.read_csv(file)
         df = df[df['Answer'] == 'Fallback Message shown']
         df = preprocess_data(df)
+        wordcloud_img = generate_wordcloud(df)
+        bar_chart_img = generate_bar_chart(df, num_clusters_to_display)
         with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
+            df.to_csv(tmpfile.name, index=False)
             csv_file_path = tmpfile.name
         return csv_file_path, wordcloud_img, bar_chart_img