Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import pandas as pd | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.cluster import KMeans | |
| import re | |
| from io import BytesIO | |
| import tempfile | |
| from wordcloud import WordCloud, STOPWORDS | |
| import matplotlib.pyplot as plt | |
| import plotly.express as px | |
| from PIL import Image | |
# Keyword tables used to bucket unanswered questions, one table per bot.
# Each key is a category name and each value is the list of keywords/phrases
# that map a question to that category.  Dictionary order is significant:
# categorize_question() returns the first category with a matching keyword.
categories_keywords_bot1 = {
    'Application Status': [
        'application', 'applied', 'update on my application',
        'result of my application', 'selected', 'selection process', 'apply',
        'fellow', 'lesson plan', 'status of my application',
        'application update', 'application status', 'applied for',
    ],
    'Volunteering': [
        'volunteering', 'volunteer', 'volunteering certificate',
        'resume my volunteering', 'volunteering journey',
        'volunteering with TFI', 'volunteering opportunities',
        'volunteer work', 'volunteer program',
    ],
    'Certificates': [
        'certificate', 'certificates', 'certificate of completion',
        'volunteer certificate', 'issue certificate',
    ],
    'Job Opportunities': [
        'job', 'vacancy', 'Talent Acquisition Executive job', 'opportunity',
        'job opening', 'job position', 'career opportunities',
    ],
    'Surveys and Forms': [
        'survey', 'form', 'fill out the survey', 'application form',
        'survey link', 'survey form', 'form submission',
    ],
    'General Queries': [
        'query', 'queries', 'questions', 'feedback', 'loved', 'overwhelming',
        'general question', 'inquiry', 'query about',
    ],
    'Spam': ['free recharge', 'offer', 'click the link', 'https'],
    'Rescheduling and Postponing': [
        'reschedule', 'postpone', 'cancellation', 'date', 'time slot',
        'change date', 'change time', 'reschedule appointment',
    ],
    'Contact and Communication Issues': [
        'call', 'phone', 'contact', 'not received', 'contact support',
        'phone call', 'call back', 'internet',
    ],
    'Email and Credentials Issues': [
        'email', 'credentials', 'received', 'email issue', 'email problem',
        'credential issue', 'login problem',
    ],
    'Timing and Scheduling': [
        'session', 'time', 'interview', 'baje', 'schedule time',
        'meeting time', 'appointment time',
    ],
    'Salary and Benefits': [
        'salary', 'increment', 'accommodation', 'training period', 'reside',
        'stipend', 'pay', 'wage', 'salary details', 'benefits information',
    ],
    'Technical Issues': [
        'network issues', 'zoom meeting', 'passcode', 'technical', 'issue',
        'technical problem', 'system issue', 'technical support',
    ],
    'Complaint Handling': [
        'help', 'i need help', 'Help me', 'complaint', 'issue is unresolved',
        'unsatisfied', 'bad experience',
    ],
    'End of Conversation': [
        'thanks', 'thankss', 'thank u', 'thank you', 'ok', 'okay', 'done',
        'joining', 'sounds good', 'goodbye', 'end chat', 'end',
    ],
    # No keywords: this is the fallback label when nothing else matches.
    'Miscellaneous': [],
}

categories_keywords_bot2 = {
    'Service Issues': ['service', 'support', 'help', 'assistance'],
    'Billing': ['bill', 'billing', 'invoice', 'payment', 'charge'],
    'Technical Problems': ['technical', 'error', 'problem', 'issue'],
    'Account Management': ['account', 'login', 'credentials', 'password'],
    'Product Information': ['product', 'information', 'details', 'specifications'],
    # No keywords: this is the fallback label when nothing else matches.
    'Miscellaneous': [],
}

# Active keyword table; starts out as Bot 1's and is rebound by select_bot().
categories_keywords = categories_keywords_bot1
def categorize_question(question, categories_keywords):
    """Return the first category whose keyword occurs in ``question``.

    Matching is a case-insensitive *substring* test, so the keyword ``form``
    also matches ``information``.  Categories are tried in the mapping's
    iteration order and the first hit wins.

    Args:
        question: The question text.  Anything that is not a ``str`` (for
            example ``None`` or a NaN float from a missing CSV cell) is
            treated as unmatched.
        categories_keywords: Mapping of category name to an iterable of
            keyword strings.

    Returns:
        The matching category name, or ``'Miscellaneous'`` when no keyword
        matches.
    """
    if not isinstance(question, str):
        return 'Miscellaneous'

    # Lower-case once instead of once per keyword.
    lowered_question = question.lower()
    for category, keywords in categories_keywords.items():
        if any(keyword.lower() in lowered_question for keyword in keywords):
            return category
    return 'Miscellaneous'
# Matches http(s):// and www. links up to the next whitespace.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')

# Code-point ranges stripped as "emoji".
# NOTE(review): the last range (U+24C2-U+1F251) is very wide and appears to
# cover far more than emoji (e.g. CJK text would fall inside it) -- confirm
# whether that is intended before narrowing it.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F1E0-\U0001F1FF"
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)

# Anything that is neither a word character nor whitespace.
_PUNCTUATION_PATTERN = re.compile(r'[^\w\s]')

# canonical word -> phrases rewritten to it.  Replacements run sequentially in
# this exact order, so earlier rewrites are visible to later ones.
_CUSTOM_SYNONYMS = {
    'application': ['form'],
    'apply': ['fill', 'applied'],
    'work': ['job'],
    'salary': ['stipend', 'pay', 'payment', 'paid'],
    'test': ['online test', 'amcat test', 'exam', 'assessment'],
    'pass': ['clear', 'selected', 'pass or not'],
    'result': ['outcome', 'mark', 'marks'],
    'thanks': ["thanks a lot to you", "thankyou so much", "thank you so much", "tysm", "thank you",
               "okaythank", "thx", "ty", "thankyou", "thank", "thank u"],
    'interview': ["pi"],
}

# Rows containing any of these whole words/phrases are discarded.
_SPAM_PHRASES = [
    "click here", "free", "recharge", "limited", "discount", "money back guarantee", "aaj", "kal", "mein",
    "how can i help you", "how can we help you", "how we can help you", "follow", "king", "contacting", "gar",
    "kirke", "subscribe", "youtube", "jio", "insta", "make money", "b2b", "sent using truecaller",
]

# One alternation instead of one DataFrame pass per phrase; a row matches this
# exactly when it matches at least one of the individual \bphrase\b patterns.
_SPAM_PATTERN = r"\b(?:" + "|".join(re.escape(phrase) for phrase in _SPAM_PHRASES) + r")\b"


def preprocess_data(df, categories_keywords):
    """Clean the question text and attach a ``Category`` column.

    Steps, in order: rename ``'Question Asked'`` to ``'texts'``; drop rows
    with a missing question; lower-case; strip URLs and emoji; rewrite
    synonyms to their canonical word; drop rows containing a spam phrase;
    strip punctuation and surrounding whitespace; drop rows left empty;
    categorize what remains with ``categorize_question``.

    The input frame is not modified; a new frame is returned.

    Args:
        df: DataFrame with a ``'Question Asked'`` (or already renamed
            ``'texts'``) column.
        categories_keywords: Mapping of category name to keyword list,
            forwarded to ``categorize_question``.

    Returns:
        A new DataFrame holding the surviving rows, with cleaned ``'texts'``
        and a ``'Category'`` column.

    Raises:
        KeyError: If neither ``'Question Asked'`` nor ``'texts'`` is present.
    """
    # rename() without inplace returns a new frame, so the caller's object is
    # left untouched.
    df = df.rename(columns={'Question Asked': 'texts'})
    if 'texts' not in df.columns:
        raise KeyError("Input data must contain a 'Question Asked' column")

    # Drop missing questions first; astype(str) would otherwise turn them into
    # the literal text "nan" and they would be categorized like real input.
    df = df.dropna(subset=['texts']).copy()

    df['texts'] = df['texts'].astype(str).str.lower()
    df['texts'] = df['texts'].apply(
        lambda text: _EMOJI_PATTERN.sub('', _URL_PATTERN.sub('', text))
    )

    for canonical_word, synonyms in _CUSTOM_SYNONYMS.items():
        for synonym in synonyms:
            # Escaped for consistency with the spam patterns; the current
            # synonyms contain no regex metacharacters, so matching is unchanged.
            synonym_pattern = r"\b" + re.escape(synonym) + r"\b"
            df['texts'] = df['texts'].str.replace(synonym_pattern, canonical_word, regex=True)

    is_spam = df['texts'].str.contains(_SPAM_PATTERN, regex=True)
    df = df[~is_spam].copy()

    df['texts'] = df['texts'].apply(lambda text: _PUNCTUATION_PATTERN.sub('', text))
    df['texts'] = df['texts'].str.strip()
    df = df[df['texts'] != ''].copy()

    df['Category'] = df['texts'].apply(lambda text: categorize_question(text, categories_keywords))
    return df
def cluster_data(df, num_clusters):
    """Cluster the ``'texts'`` column with TF-IDF features and K-Means.

    Args:
        df: DataFrame with a ``'texts'`` column; a ``'Cluster'`` column holding
            each row's cluster label is added to it in place.
        num_clusters: Number of clusters passed to ``KMeans``.

    Returns:
        A ``(df, kmeans)`` tuple: the same DataFrame and the fitted model.
    """
    tfidf_matrix = TfidfVectorizer(stop_words='english').fit_transform(df['texts'])
    # fit() returns the estimator itself, so the fitted model can be bound directly.
    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(tfidf_matrix)
    df['Cluster'] = kmeans.labels_
    return df, kmeans
def generate_wordcloud(df):
    """Render a word cloud of all ``'texts'`` values as a PIL image.

    Args:
        df: DataFrame with a ``'texts'`` column of strings.

    Returns:
        A ``PIL.Image`` opened from the PNG rendering of the word cloud.
    """
    text = " ".join(df['texts'].tolist())
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        max_words=300,
        collocations=False,
        min_font_size=10,
        max_font_size=200,
        stopwords=set(STOPWORDS),
        prefer_horizontal=1.0,
        scale=2,
        relative_scaling=0.5,
        random_state=42,
    ).generate(text)

    fig, ax = plt.subplots(figsize=(15, 7))
    try:
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        buf = BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # pyplot keeps every figure alive until it is closed; without this a
        # new figure would accumulate on each call.
        plt.close(fig)

    buf.seek(0)
    return Image.open(buf)
def generate_bar_chart(df, num_clusters_to_display):
    """Render a bar chart of category frequencies labelled with top words.

    Args:
        df: DataFrame with ``'Category'`` and ``'texts'`` columns.
        num_clusters_to_display: How many of the most frequent categories to
            include.

    Returns:
        A ``PIL.Image`` opened from the PNG export of the Plotly figure.
    """
    ignored_words = {'i', 'you', 'thanks', 'thank', 'ok', 'okay', 'sure', 'done'}

    def join_texts(texts):
        # Concatenate every question of one category into a single string.
        return ' '.join(texts)

    def top_three_words(joined_text):
        # Words ranked by frequency, skipping filler words, first three kept.
        ranked_words = pd.Series(joined_text.split()).value_counts().index
        kept = [word for word in ranked_words if word not in ignored_words]
        return ' '.join(kept[:3])

    leading_categories = df['Category'].value_counts().index[:num_clusters_to_display]
    subset = df[df['Category'].isin(leading_categories)]

    words_per_category = (
        subset.groupby('Category', observed=False)['texts']
        .apply(join_texts)
        .reset_index()
    )
    words_per_category['top_word'] = words_per_category['texts'].apply(top_three_words)

    counts = subset['Category'].value_counts().reset_index()
    counts.columns = ['Category', 'Count']
    counts = counts.merge(words_per_category[['Category', 'top_word']], on='Category')

    fig = px.bar(
        counts,
        x='Category',
        y='Count',
        text='top_word',
        title='Category Frequency with Top Words',
    )
    fig.update_traces(textposition='outside')
    fig.update_layout(xaxis_title='Category', yaxis_title='Frequency', showlegend=False)

    png_buffer = BytesIO()
    fig.write_image(png_buffer, format='png')
    png_buffer.seek(0)
    return Image.open(png_buffer)
def main(file, num_clusters_to_display, categories_keywords):
    """Categorize fallback questions from a CSV and build the report artefacts.

    Reads the CSV, keeps rows whose ``'Answer'`` is ``'Fallback Message
    shown'``, preprocesses and categorizes them, keeps only the most frequent
    categories, and produces a CSV export, a word cloud and a bar chart.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.
        num_clusters_to_display: Number of most frequent categories to keep.
        categories_keywords: Mapping of category name to keyword list.

    Returns:
        ``(csv_file_path, wordcloud_img, bar_chart_img)`` on success, or
        ``(error_message, None, None)`` if anything raised.
    """
    try:
        top_n = int(num_clusters_to_display)  # slice bounds must be integers

        df = pd.read_csv(file)
        # Copy so later column assignments do not act on a slice of the raw frame.
        df = df[df['Answer'] == 'Fallback Message shown'].copy()
        df = preprocess_data(df, categories_keywords)

        # value_counts() orders by descending frequency, so the first N index
        # entries are the N largest categories.
        largest_categories = df['Category'].value_counts().index[:top_n]
        filtered_df = df[df['Category'].isin(largest_categories)]
        filtered_df = filtered_df.sort_values(by='Category')

        wordcloud_img = generate_wordcloud(filtered_df)
        bar_chart_img = generate_bar_chart(filtered_df, top_n)

        # Reserve a persistent temp path, then write after the handle is
        # closed: reopening a still-open NamedTemporaryFile by name is not
        # possible on every platform.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
            csv_file_path = tmpfile.name
        filtered_df.to_csv(csv_file_path, index=False)

        return csv_file_path, wordcloud_img, bar_chart_img
    except Exception as e:
        # NOTE(review): the message is returned in the slot that the UI binds
        # to a file output -- confirm this is how errors should be surfaced.
        print(f"Error: {e}")
        return str(e), None, None
def select_bot(bot_name):
    """Switch the module-level keyword table to the chosen bot's.

    ``"Bot 1"`` selects ``categories_keywords_bot1``; any other value selects
    ``categories_keywords_bot2``.

    Args:
        bot_name: The bot label chosen by the user.

    Returns:
        A confirmation string of the form ``"Selected <bot_name>"``.
    """
    global categories_keywords
    categories_keywords = (
        categories_keywords_bot1 if bot_name == "Bot 1" else categories_keywords_bot2
    )
    return f"Selected {bot_name}"
def categorize_unanswered_queries(file, num_clusters_to_display):
    """Run ``main`` with whichever keyword table is currently active.

    Args:
        file: The uploaded CSV, forwarded to ``main``.
        num_clusters_to_display: Number of categories to show, forwarded to
            ``main``.

    Returns:
        Whatever ``main`` returns.
    """
    # Read the module-level table at call time so a prior select_bot() applies.
    active_keywords = categories_keywords
    return main(file, num_clusters_to_display, active_keywords)
# Tab 1: choose which bot's keyword table subsequent categorization uses.
bot_selection_interface = gr.Interface(
    fn=select_bot,
    inputs=[gr.Radio(["Bot 1", "Bot 2"], label="Select Bot")],
    outputs=[gr.Textbox(label="Selected Bot")],
    title="Select Bot",
    description="Select the bot for categorizing unanswered queries.",
)

# Tab 2: upload a CSV and receive the categorized CSV plus two charts.
categorize_interface = gr.Interface(
    fn=categorize_unanswered_queries,
    inputs=[
        gr.File(label="Upload CSV File (.csv)"),
        gr.Slider(
            label="Number of Categories to Display",
            minimum=1,
            maximum=10,
            step=1,
            value=5,
        ),
    ],
    outputs=[
        gr.File(label="Categorized Data CSV"),
        gr.Image(label="Word Cloud"),
        gr.Image(label="Bar Chart"),
    ],
    title="Unanswered User Queries Categorization",
    description="Categorize unanswered user queries into predefined categories",
)

# Combine both tabs into one app and start serving it.
main_interface = gr.TabbedInterface(
    [bot_selection_interface, categorize_interface],
    tab_names=["Select Bot", "Categorize Queries"],
)
main_interface.launch(share=True)