Spaces:

TFI
/

K-Means_Clustering_Algorithm

Sleeping

App Files Files Community

tanish78 committed on Jul 20, 2024

Commit

c246bae

verified ·

1 Parent(s): db51a4a

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -5

app.py CHANGED Viewed

@@ -10,6 +10,31 @@ import matplotlib.pyplot as plt
 import plotly.express as px
 from PIL import Image
 def preprocess_data(df):
     df.rename(columns={'Question Asked': 'texts'}, inplace=True)
     df['texts'] = df['texts'].astype(str).str.lower()
@@ -61,8 +86,12 @@ def preprocess_data(df):
     df['texts'] = df['texts'].str.strip()
     df = df[df['texts'] != '']
     return df
 def cluster_data(df, num_clusters):
     vectorizer = TfidfVectorizer(stop_words='english')
     X = vectorizer.fit_transform(df['texts'])
@@ -142,11 +171,14 @@ def main(file, num_clusters_to_display):
         df['Cluster'] = pd.Categorical(df['Cluster'], categories=filtered_clusters, ordered=True)
         df = df.sort_values('Cluster')
         wordcloud_img = generate_wordcloud(df)
         bar_chart_img = generate_bar_chart(df, num_clusters_to_display)
         with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
-            df.to_csv(tmpfile.name, index=False)
             csv_file_path = tmpfile.name
         return csv_file_path, wordcloud_img, bar_chart_img
@@ -154,6 +186,7 @@ def main(file, num_clusters_to_display):
         print(f"Error: {e}")
         return str(e), None, None
 interface = gr.Interface(
     fn=main,
     inputs=[
@@ -161,12 +194,12 @@ interface = gr.Interface(
         gr.Slider(label="Number of Categories to Display", minimum=1, maximum=10, step=1, value=5)
     ],
     outputs=[
-        gr.File(label="Clustered Data CSV"),
         gr.Image(label="Word Cloud"),
         gr.Image(label="Bar Chart")
     ],
-    title="Unanswered User Queries Clustering",
-    description="Unanswered User Query Categorization"
 )
-interface.launch(share=True)

 import plotly.express as px
 from PIL import Image
import re  # needed by the whole-word keyword matcher defined below

# Keyword / phrase lists for each predefined category.
# Categories are tried in insertion order and the first one with a matching
# keyword wins, so earlier entries take precedence over later ones (e.g.
# 'ok' and 'thank' under 'General Queries' shadow the same words under 'Spam').
# 'Miscellaneous' has no keywords; it is the fallback label.
categories_keywords = {
    'Application Status': ['application', 'applied', 'update on my application', 'result of my application', 'selected', 'selection process', 'apply', 'fellow', 'lesson plan'],
    'Volunteering': ['volunteering', 'volunteer', 'volunteering certificate', 'resume my volunteering', 'volunteering journey', 'volunteering with TFI'],
    'Certificates': ['certificate', 'certificates'],
    'Job Opportunities': ['job', 'vacancy', 'Talent Acquisition Executive job', 'opportunity'],
    'Surveys and Forms': ['survey', 'form', 'fill out the survey', 'application form'],
    'General Queries': ['query', 'queries', 'questions', 'thank', 'thanks', 'ok', 'ok thank you', 'thankyou', 'no thank you', 'feedback', 'loved', 'overwhelming'],
    'Spam': ['free recharge', 'offer', 'click the link', 'https', 'sorry', 'yes', 'no', 'ok', 'K', 'Sorry', 'yes.', 'noo', 'thnku', 'thx', 'thank'],
    'Rescheduling and Postponing': ['reschedule', 'postpone', 'cancellation', 'date', 'time slot'],
    'Contact and Communication Issues': ['call', 'phone', 'contact', 'not received'],
    'Email and Credentials Issues': ['email', 'credentials', 'received'],
    'Timing and Scheduling': ['session', 'time', 'interview', '6 baje', '23 feb', '12 april'],
    'Salary and Benefits': ['salary', 'increment', 'accommodation', 'training period', 'reside'],
    'Technical Issues': ['network issues', 'zoom meeting', 'passcode', 'technical', 'issue'],
    'Miscellaneous': []
}


def _compile_category_patterns(keywords_by_category):
    """Build one whole-word regex per category that has at least one keyword.

    Args:
        keywords_by_category: Mapping of category name to a list of keyword
            or phrase strings.

    Returns:
        dict: Category name -> compiled pattern, in the mapping's own order.
        Categories with no keywords are left out so they can never match.
    """
    patterns = {}
    for category, keywords in keywords_by_category.items():
        escaped = [re.escape(keyword.lower()) for keyword in keywords if keyword]
        if not escaped:
            continue
        # Lookarounds instead of \b so keywords that end in punctuation
        # (e.g. 'yes.') still get a sensible boundary check.
        patterns[category] = re.compile(
            r"(?<!\w)(?:" + "|".join(escaped) + r")(?!\w)"
        )
    return patterns


# Compiled once at import time from categories_keywords; rebuild it with
# _compile_category_patterns() if the keyword table is changed afterwards.
_CATEGORY_PATTERNS = _compile_category_patterns(categories_keywords)


def categorize_question(question):
    """Return the first category whose keyword occurs in *question*.

    Keywords are matched case-insensitively as whole words/phrases. Plain
    substring matching mislabels text: 'K' would hit every question that
    contains the letter "k", 'no' would hit "not"/"know", and 'form' would
    hit "information".

    Args:
        question: The query text. Non-string values are converted with str().

    Returns:
        str: The matching category name, or 'Miscellaneous' when no keyword
        matches.
    """
    text = str(question).lower()
    for category, pattern in _CATEGORY_PATTERNS.items():
        if pattern.search(text):
            return category
    return 'Miscellaneous'
 def preprocess_data(df):
     df.rename(columns={'Question Asked': 'texts'}, inplace=True)
     df['texts'] = df['texts'].astype(str).str.lower()
     df['texts'] = df['texts'].str.strip()
     df = df[df['texts'] != '']
+    # Categorize the texts
+    df['Category'] = df['texts'].apply(categorize_question)
     return df
 def cluster_data(df, num_clusters):
     vectorizer = TfidfVectorizer(stop_words='english')
     X = vectorizer.fit_transform(df['texts'])
         df['Cluster'] = pd.Categorical(df['Cluster'], categories=filtered_clusters, ordered=True)
         df = df.sort_values('Cluster')
+        # Generate categorized output
+        categorized_df = df[['texts', 'Cluster', 'Category']].copy()
         wordcloud_img = generate_wordcloud(df)
         bar_chart_img = generate_bar_chart(df, num_clusters_to_display)
         with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
+            categorized_df.to_csv(tmpfile.name, index=False)
             csv_file_path = tmpfile.name
         return csv_file_path, wordcloud_img, bar_chart_img
         print(f"Error: {e}")
         return str(e), None, None
 interface = gr.Interface(
     fn=main,
     inputs=[
         gr.Slider(label="Number of Categories to Display", minimum=1, maximum=10, step=1, value=5)
     ],
     outputs=[
+        gr.File(label="Categorized Data CSV"),
         gr.Image(label="Word Cloud"),
         gr.Image(label="Bar Chart")
     ],
+    title="Unanswered User Queries Categorization",
+    description="Categorize unanswered user queries into predefined categories"
 )
+interface.launch(share=True)