Spaces:

TFI
/

K-Means_Clustering_Algorithm

Sleeping

App Files Community

tanish78 committed on Jul 26, 2024

Commit

1927724

verified ·

1 Parent(s): d563311

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -48

app.py CHANGED Viewed

@@ -10,29 +10,8 @@ import matplotlib.pyplot as plt
 import plotly.express as px
 from PIL import Image
-# Define categories for Teach For India Bot and Firki Bot
-categories_keywords_tfi = {
-    'Start of Conversation': ['hi', 'hello', 'hi I have a query', 'query', 'good morning', 'good afternoon', 'good evening'],
-    'Application Status': ['application', 'applied', 'update on my application', 'result of my application', 'selected', 'selection process', 'apply', 'fellow', 'lesson plan', 'status of my application', 'application update', 'application status', 'applied for'],
-    'Volunteering': ['volunteering', 'volunteer', 'volunteering certificate', 'resume my volunteering', 'volunteering journey', 'volunteering with TFI', 'volunteering opportunities', 'volunteer work', 'volunteer program'],
-    'Certificates': ['certificate', 'certificates', 'certificate of completion', 'volunteer certificate', 'issue certificate'],
-    'Job Opportunities': ['job', 'vacancy', 'Talent Acquisition Executive job', 'opportunity', 'job opening', 'job position', 'career opportunities'],
-    'Surveys and Forms': ['survey', 'form', 'fill out the survey', 'application form', 'survey link', 'survey form', 'form submission'],
-    'General Queries': ['query', 'queries', 'questions', 'feedback', 'loved', 'overwhelming', 'general question', 'inquiry', 'query about'],
-    'Spam': ['free recharge', 'offer', 'click the link', 'https'],
-    'Rescheduling and Postponing': ['reschedule', 'postpone', 'cancellation', 'date', 'time slot', 'change date', 'change time', 'reschedule appointment'],
-    'Contact and Communication Issues': ['call', 'phone', 'contact', 'not received', 'contact support', 'phone call', 'call back', 'internet'],
-    'Email and Credentials Issues': ['email', 'credentials', 'received', 'email issue', 'email problem', 'credential issue', 'login problem'],
-    'Timing and Scheduling': ['session', 'time', 'interview', 'baje', 'schedule time', 'meeting time', 'appointment time'],
-    'Salary and Benefits': ['salary', 'increment', 'accommodation', 'training period', 'reside', 'stipend', 'pay', 'wage', 'salary details', 'benefits information'],
-    'Technical Issues': ['network issues', 'zoom meeting', 'passcode', 'technical', 'issue','technical problem', 'system issue', 'technical support'],
-    'Complaint Handling': ['help', 'i need help', 'Help me', 'complaint', 'issue is unresolved', 'unsatisfied', 'bad experience'],
-    'User Feedback': ['feedback', 'loved', 'dissapointed', 'hated', 'it was good', 'it was bad', 'helpful',],
-    'End of Conversation': ['thanks', 'thankss', 'thank u', 'thank you', 'ok', 'okay', 'done', 'joining', 'sounds good', 'goodbye', 'end chat', 'end'],
-    'Miscellaneous': []
-}
-categories_keywords_firki = {
     "Application Status": ["application status", "application", "status", "submitted", "processing", "pending", "approval", "rejected", "accepted"],
     "Volunteering": ["volunteer", "volunteering", "help out", "assist", "volunteer work", "volunteer opportunities"],
     "Certificates": ["certificate", "certificates", "completion", "certification", "accreditation", "proof", "document", "certified"],
@@ -55,23 +34,32 @@ categories_keywords_firki = {
     "Service Requests": ["service", "support", "request", "assistance", "help", "aid", "maintenance"],
     "Account Issues": ["account", "profile", "update", "activation", "deactivation", "credentials", "reset"],
     "Product Information": ["product", "service", "details", "info", "information", "specifications", "features"],
     "Order Status": ["order", "status", "tracking", "shipment", "delivery", "purchase", "dispatch"],
     "Miscellaneous": ["miscellaneous", "other", "various", "random", "general", "unknown", "unsorted"]
 }
 # Initialize
-categories_keywords = categories_keywords_tfi
-def categorize_question(question, categories_keywords):
     for category, keywords in categories_keywords.items():
-        for keyword in keywords:
-            if keyword.lower() in question.lower():
-                if category == 'End of Conversation':
-                    return category
-                if category != 'End of Conversation':
-                    return category
-    return 'Miscellaneous'
 def preprocess_data(df, categories_keywords):
     df.rename(columns={'Question Asked': 'texts'}, inplace=True)
@@ -187,28 +175,27 @@ def generate_bar_chart(df, num_clusters_to_display):
     img = Image.open(buf)
     return img
-def main(file, bot_name, num_clusters_to_display):
     try:
-        global categories_keywords
-        if bot_name == "Teach For India":
-            categories_keywords = categories_keywords_tfi
-        else:
-            categories_keywords = categories_keywords_firki
-        df = pd.read_csv(file.name)
         df = df[df['Answer'] == 'Fallback Message shown']
-        df = preprocess_data(df, categories_keywords)
         category_sizes = df['Category'].value_counts().reset_index()
         category_sizes.columns = ['Category', 'Count']
         sorted_categories = category_sizes.sort_values(by='Count', ascending=True)['Category'].tolist()
         largest_categories = sorted_categories[:num_clusters_to_display]
         filtered_df = df[df['Category'].isin(largest_categories)]
         filtered_df = filtered_df.sort_values(by='Category')
         wordcloud_img = generate_wordcloud(filtered_df)
@@ -223,13 +210,9 @@ def main(file, bot_name, num_clusters_to_display):
         print(f"Error: {e}")
         return str(e), None, None
-def categorize_unanswered_queries(bot_name, file, num_clusters_to_display):
-    return main(file, bot_name, num_clusters_to_display)
 interface = gr.Interface(
-    fn=categorize_unanswered_queries,
     inputs=[
-        gr.Radio(["Teach For India", "Firki"], label="Select ChatBot"),
         gr.File(label="Upload CSV File (.csv)"),
         gr.Slider(label="Number of Categories to Display", minimum=1, maximum=10, step=1, value=5)
     ],
@@ -239,7 +222,7 @@ interface = gr.Interface(
         gr.Image(label="Bar Chart")
     ],
     title="Unanswered User Queries Categorization",
-    description="Select the bot, upload the CSV file, and specify the number of categories to display to categorize unanswered user queries."
 )
-interface.launch()

 import plotly.express as px
 from PIL import Image
+# Define categories
+categories_keywords = {
     "Application Status": ["application status", "application", "status", "submitted", "processing", "pending", "approval", "rejected", "accepted"],
     "Volunteering": ["volunteer", "volunteering", "help out", "assist", "volunteer work", "volunteer opportunities"],
     "Certificates": ["certificate", "certificates", "completion", "certification", "accreditation", "proof", "document", "certified"],
     "Service Requests": ["service", "support", "request", "assistance", "help", "aid", "maintenance"],
     "Account Issues": ["account", "profile", "update", "activation", "deactivation", "credentials", "reset"],
     "Product Information": ["product", "service", "details", "info", "information", "specifications", "features"],
+    "Account Management": ["login", "account", "new account", "create account", "log out", "google", "access"],
     "Order Status": ["order", "status", "tracking", "shipment", "delivery", "purchase", "dispatch"],
     "Miscellaneous": ["miscellaneous", "other", "various", "random", "general", "unknown", "unsorted"]
 }
 # Initialize
+def categorize_question(question):
+    # Split the question into words
+    words = question.split()
+    # Check if the question has only one word
+    if len(words) == 1:
+        single_word = words[0].lower()
+        # Check if the single word is in the Start of Conversation category
+        if any(single_word in keyword for keyword in categories_keywords["Start of Conversation"]):
+            return "Start of Conversation"
+        else:
+            return "End of Conversation"
+    # General categorization based on multiple words
     for category, keywords in categories_keywords.items():
+        if any(keyword.lower() in question.lower() for keyword in keywords):
+            return category
+    return "Miscellaneous"
 def preprocess_data(df, categories_keywords):
     df.rename(columns={'Question Asked': 'texts'}, inplace=True)
     img = Image.open(buf)
     return img
+def main(file, num_clusters_to_display):
     try:
+        df = pd.read_csv(file)
+        # Filter by 'Fallback Message shown'
         df = df[df['Answer'] == 'Fallback Message shown']
+        df = preprocess_data(df)
+        # Get category sizes and sort by size in ascending order
         category_sizes = df['Category'].value_counts().reset_index()
         category_sizes.columns = ['Category', 'Count']
         sorted_categories = category_sizes.sort_values(by='Count', ascending=True)['Category'].tolist()
+        # Get the largest x categories as specified by num_clusters_to_display
         largest_categories = sorted_categories[:num_clusters_to_display]
+        # Filter the dataframe to include only the largest categories
         filtered_df = df[df['Category'].isin(largest_categories)]
+        # Sort the dataframe by Category
         filtered_df = filtered_df.sort_values(by='Category')
         wordcloud_img = generate_wordcloud(filtered_df)
         print(f"Error: {e}")
         return str(e), None, None
 interface = gr.Interface(
+    fn=main,
     inputs=[
         gr.File(label="Upload CSV File (.csv)"),
         gr.Slider(label="Number of Categories to Display", minimum=1, maximum=10, step=1, value=5)
     ],
         gr.Image(label="Bar Chart")
     ],
     title="Unanswered User Queries Categorization",
+    description="Categorize unanswered user queries into predefined categories"
 )
+interface.launch(share=True)