# Hugging Face Space: categorizes unanswered chatbot queries from an uploaded CSV log.
| import gradio as gr | |
| import pandas as pd | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.cluster import KMeans | |
| import re | |
| from io import BytesIO | |
| import tempfile | |
| from wordcloud import WordCloud, STOPWORDS | |
| import matplotlib.pyplot as plt | |
| import plotly.express as px | |
| from PIL import Image | |
# Keyword lists per category; matching is case-insensitive substring matching,
# so dict order decides ties (earlier categories win).
categories_keywords = {
    "Application Status": ["application status", "application", "status", "submitted", "processing", "pending", "approval", "rejected", "accepted", "apply", "how to apply", "can I apply"],
    "Follow-Ups": ['update', 'updates', 'any updates', 'any news', 'response from you', 'any reply'],
    "Firki": ['firki'],
    "Interviews": ['interview', 'set up interview', 'phone interview'],
    "Volunteering": ["volunteer", "volunteering", "help out", "assist", "volunteer work", "volunteer opportunities"],
    "Certificates": ["certificate", "certificates", "completion", "certification", "accreditation", "proof", "document", "certified"],
    "Job Opportunities": ["job", "opportunity", "career", "vacancy", "position", "employment", "hiring", "recruitment", "internship", "post", "posts", "available", "teacher", "teaching", "opportunities", "looking for"],
    "Surveys and Forms": ["survey", "form", "forms", "questionnaire", "feedback form", "response", "fill out", "submission"],
    "Spam": ["spam", "unsubscribe", "remove", "stop", "junk", "block", "opt-out"],
    "Rescheduling and Postponing": ["reschedule", "postpone", "delay", "change date", "new time", "rearrange", "shift", "adjust timing"],
    "Contact and Communication Issues": ["contact", "communicate", "communication", "reach out", "phone", "email", "address", "details"],
    "Email and Credentials Issues": ["email", "credentials", "login", "password", "gmail", "username", "verification", "reset"],
    "Timing and Scheduling": ["timing", "schedule", "scheduling", "time", "appointment", "availability", "calendar", "book", "slot"],
    "Salary and Benefits": ["salary", "benefits", "pay", "compensation", "wages", "earnings", "package", "remuneration", "incentives"],
    "Technical Issues": ["technical", "issue", "problem", "error", "bug", "glitch", "fix", "troubleshoot", "support"],
    "End of Conversation": ["bye", "thank you", "thanks", "goodbye", "end conversation", "ok", "ok thanks"],
    "Feedback": ["feedback", "comments", "review", "opinion", "suggestion", "critique", "rating"],
    "Event Inquiries": ["event", "webinar", "meeting", "conference", "session", "seminar", "workshop", "invitation"],
    "Payment Issues": ["payment", "billing", "transaction", "charge", "fee", "invoice", "refund", "receipt"],
    "Registration Issues": ["registration", "register", "sign up", "enroll", "join", "signup", "enrollment"],
    "Service Requests": ["service", "support", "request", "assistance", "help", "aid", "maintenance"],
    "Account Issues": ["account", "profile", "update", "activation", "deactivation", "reset", "account password"],
    "Product Information": ["product", "service", "details", "info", "information", "specifications", "features"],
    "Order Status": ["order", "status", "tracking", "shipment", "delivery", "purchase", "dispatch"],
    "Miscellaneous": []
}


def categorize_question(question):
    """Return the first category in ``categories_keywords`` whose keywords
    match *question* (case-insensitive substring match).

    Falls back to a secondary literal "end of conversation" probe (suppressed
    when any exclusion phrase appears) and finally to "Miscellaneous".
    """
    # Hoisted: the original recomputed question.lower() once per keyword and
    # also built an unused `words` list.
    q = question.lower()
    # Phrases that disqualify a message from the sign-off fallback below.
    exclusion_words = {'is', 'please', 'not resolved', 'unresolved', 'problem',
                       'help', 'issue', 'webinar', 'office', 'leave', 'approved',
                       'notice', 'period', 'good morning', 'when', 'where',
                       'why', 'how', 'which', 'and when'}
    for category, keywords in categories_keywords.items():
        if any(keyword.lower() in q for keyword in keywords):
            return category
    # Secondary check for 'End of Conversation': only reached when no keyword
    # list matched at all.
    if "end of conversation" in q and not any(w in q for w in exclusion_words):
        return "End of Conversation"
    return "Miscellaneous"
def preprocess_data(df):
    """Clean the 'Question Asked' column and tag each row with a category.

    Steps run in a deliberate order (synonym folding must happen before
    punctuation removal so multi-word phrases still match):
      1. rename 'Question Asked' -> 'texts' and lowercase;
      2. strip URLs, then emoji;
      3. fold synonyms onto canonical words;
      4. drop rows containing spam phrases;
      5. remove punctuation, trim whitespace, drop now-empty rows;
      6. assign df['Category'] via categorize_question.

    NOTE(review): rename(inplace=True) mutates the caller's DataFrame —
    confirm callers do not rely on the original column name afterwards.

    Returns the filtered DataFrame (row count may shrink).
    """
    df.rename(columns={'Question Asked': 'texts'}, inplace=True)
    df['texts'] = df['texts'].astype(str).str.lower()
    # Strip URLs first so they never match category keywords later.
    df['texts'] = df['texts'].apply(lambda text: re.sub(r'https?://\S+|www\.\S+', '', text))
    def remove_emoji(string):
        # Ranges cover emoticons, symbols & pictographs, transport, flags and
        # assorted dingbats/enclosed characters.
        emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"
            u"\U0001F300-\U0001F5FF"
            u"\U0001F680-\U0001F6FF"
            u"\U0001F1E0-\U0001F1FF"
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', string)
    df['texts'] = df['texts'].apply(remove_emoji)
    # Canonical word -> variants that should be folded onto it.
    custom_synonyms = {
        'application': ['form'],
        'apply': ['fill', 'applied'],
        'work': ['job'],
        'salary': ['stipend', 'pay', 'payment', 'paid'],
        'test': ['online test', 'amcat test', 'exam', 'assessment'],
        'pass': ['clear', 'selected', 'pass or not'],
        'result': ['outcome', 'mark', 'marks'],
        'thanks': ["thanks a lot to you", "thankyou so much", "thank you so much", "tysm", "thank you",
                   "okaythank", "thx", "ty", "thankyou", "thank", "thank u"],
        'interview': ["pi"]
    }
    for original_word, synonym_list in custom_synonyms.items():
        for synonym in synonym_list:
            # Word-boundary match so e.g. 'pi' does not hit inside 'topic'.
            pattern = r"\b" + synonym + r"\b"
            df['texts'] = df['texts'].str.replace(pattern, original_word, regex=True)
    # Rows containing any of these phrases are dropped entirely.
    spam_list = ["click here", "free", "recharge", "limited", "discount", "money back guarantee", "aaj", "kal", "mein",
                 "how can i help you", "how can we help you", "how we can help you", "follow", "king", "contacting", "gar",
                 "kirke", "subscribe", "youtube", "jio", "insta", "make money", "b2b", "sent using truecaller"]
    for spam_phrase in spam_list:
        pattern = r"\b" + re.escape(spam_phrase) + r"\b"
        df = df[~df['texts'].str.contains(pattern)]
    def remove_punctuations(text):
        # Keep only word characters and whitespace.
        return re.sub(r'[^\w\s]', '', text)
    df['texts'] = df['texts'].apply(remove_punctuations)
    df['texts'] = df['texts'].str.strip()
    df = df[df['texts'] != '']
    # Categorize the cleaned text.
    df['Category'] = df['texts'].apply(categorize_question)
    return df
def cluster_data(df, num_clusters):
    """Attach a 'Cluster' label to every row via TF-IDF + K-Means.

    Returns the labelled DataFrame and the fitted KMeans model.
    """
    tfidf = TfidfVectorizer(stop_words='english')
    features = tfidf.fit_transform(df['texts'])
    model = KMeans(n_clusters=num_clusters, random_state=0)
    # fit_predict == fit(...) followed by reading .labels_
    df['Cluster'] = model.fit_predict(features)
    return df, model
def generate_wordcloud(df):
    """Render a word cloud of all 'texts' rows and return it as a PIL image.

    Layout is deterministic (fixed random_state) so repeated runs on the same
    data produce the same picture.
    """
    text = " ".join(df['texts'].tolist())
    stopwords = set(STOPWORDS)
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        max_words=300,
        collocations=False,   # no bigram pairing; count words individually
        min_font_size=10,
        max_font_size=200,
        stopwords=stopwords,
        prefer_horizontal=1.0,
        scale=2,
        relative_scaling=0.5,
        random_state=42
    ).generate(text)
    fig = plt.figure(figsize=(15, 7))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    buf = BytesIO()
    plt.savefig(buf, format='png')
    # FIX: the figure was never closed, so every Gradio invocation leaked a
    # matplotlib figure (unbounded memory growth in a long-running app).
    plt.close(fig)
    buf.seek(0)
    img = Image.open(buf)
    return img
def generate_bar_chart(df, num_clusters_to_display):
    """Bar chart of the most frequent categories, each bar annotated with that
    category's three most common non-filler words; returned as a PIL image."""
    # Filler words never shown as a category's "top words".
    common_words = {'i', 'you', 'thanks', 'thank', 'ok', 'okay', 'sure', 'done',
                    'to', 'for', 'and', 'but', 'so'}
    keep = df['Category'].value_counts().index[:num_clusters_to_display]
    subset = df[df['Category'].isin(keep)]

    def top_three_words(joined_text):
        # Rank words by frequency, skip fillers, keep the first three.
        ranked = pd.Series(joined_text.split()).value_counts().index
        return ' '.join([w for w in ranked if w not in common_words][:3])

    grouped = subset.groupby('Category', observed=False)['texts'].apply(lambda x: ' '.join(x)).reset_index()
    grouped['top_word'] = grouped['texts'].apply(top_three_words)

    sizes = subset['Category'].value_counts().reset_index()
    sizes.columns = ['Category', 'Count']
    sizes = sizes.merge(grouped[['Category', 'top_word']], on='Category')

    fig = px.bar(sizes, x='Category', y='Count', text='top_word',
                 title='Category Frequency with Top Words')
    fig.update_traces(textposition='outside')
    fig.update_layout(xaxis_title='Category', yaxis_title='Frequency', showlegend=False)

    buf = BytesIO()
    fig.write_image(buf, format='png')
    buf.seek(0)
    return Image.open(buf)
def main(file, num_clusters_to_display):
    """Gradio entry point: load a chat-log CSV, keep only fallback
    ("unanswered") rows, categorize + cluster them, and return
    (csv_path, wordcloud_image, bar_chart_image).

    On any failure the error text is returned in the CSV slot so it is
    visible in the UI instead of crashing the app.
    """
    try:
        df = pd.read_csv(file)
        # Keep only the questions the bot failed to answer.
        df = df[df['Answer'] == 'Fallback Message shown']
        # Cleans text AND assigns df['Category'] (see preprocess_data).
        df = preprocess_data(df)
        # Clustering — fixed count; the slider only limits what is displayed.
        num_clusters = 12
        df, kmeans = cluster_data(df, num_clusters)
        # FIX: removed a redundant second categorize_question pass here;
        # preprocess_data has already categorized every row identically.
        df = df[df['Category'] != 'Miscellaneous']
        # Rank categories by frequency: descending for the bar chart/CSV,
        # ascending so the word cloud highlights the rarest categories.
        category_sizes = df['Category'].value_counts().reset_index()
        category_sizes.columns = ['Category', 'Count']
        sorted_categories = category_sizes.sort_values(by='Count', ascending=False)['Category'].tolist()
        sorted_categories_sm = category_sizes.sort_values(by='Count', ascending=True)['Category'].tolist()
        # Limit to the slider-selected number of categories.
        largest_categories = sorted_categories[:num_clusters_to_display]
        smallest_categories = sorted_categories_sm[:num_clusters_to_display]
        filtered_df = df[df['Category'].isin(largest_categories)]
        filtered_cloud_df = df[df['Category'].isin(smallest_categories)]
        # Sort the downloadable CSV by Category then Cluster for readability.
        filtered_df = filtered_df.sort_values(by=['Category', 'Cluster'])
        filtered_cloud_df = filtered_cloud_df.sort_values(by='Category')
        wordcloud_img = generate_wordcloud(filtered_cloud_df)
        bar_chart_img = generate_bar_chart(df, num_clusters_to_display)
        # Persist the filtered rows so Gradio can offer them as a download.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
            filtered_df.to_csv(tmpfile.name, index=False)
            csv_file_path = tmpfile.name
        return csv_file_path, wordcloud_img, bar_chart_img
    except Exception as e:
        # Top-level UI boundary: log and surface the error to the user.
        print(f"Error: {e}")
        return str(e), None, None
# Gradio UI wiring: a CSV upload plus a slider controlling how many
# categories appear in the outputs.
file_input = gr.File(label="Upload CSV File (.csv)")
category_slider = gr.Slider(label="Number of Categories to Display",
                            minimum=1, maximum=15, step=1, value=5)
interface = gr.Interface(
    fn=main,
    inputs=[file_input, category_slider],
    outputs=[
        gr.File(label="Categorized Data CSV"),
        gr.Image(label="Word Cloud"),
        gr.Image(label="Bar Chart"),
    ],
    title="Unanswered User Queries Categorization",
)
interface.launch(share=True)