# NOTE(review): this file has been mangled by an extraction/conversion pass and
# is NOT runnable as-is.  Two kinds of damage are visible:
#   1. All statements were collapsed onto two physical lines, destroying the
#      indentation that Python needs for block structure.
#   2. Text between '<' and '>' characters was deleted (HTML-tag-stripping
#      artifact): the greeting-removal regex after `pattern = r"(?` is missing
#      its body and the code jumps mid-list into `remove_phrases`; the chunk
#      also ends truncated mid-statement at a second `pattern = r"(?`.
# The bytes below are preserved exactly, pending recovery of the original
# source file — do not attempt to "fix" the syntax without it.
#
# What the visible code shows preprocess_data(df) doing (confirm against the
# original file before relying on any of this):
#   * renames column 'Question Asked' -> 'texts' in place, casts to str,
#     lowercases
#   * strips URLs (http/https and www.) with re.sub
#   * lemmatizes each whitespace-separated token via nltk WordNetLemmatizer
#   * removes emoji through a compiled unicode-range regex (remove_emoji,
#     defensive isinstance check for non-str values)
#   * canonicalizes synonyms (custom_synonyms map) using \b-anchored regex
#     replacements with a (?!\s*\() negative lookahead
#       NOTE(review): the synonym text is interpolated into the pattern
#       without re.escape, unlike the spam phrases below — phrases such as
#       'pass or not' contain spaces only, but this is inconsistent and
#       fragile if a synonym ever contains a regex metacharacter; verify
#       intent against the original source.
#   * drops entire rows whose text matches any spam phrase (spam_list);
#     these patterns ARE re.escape'd, and matching rows are collected into
#     rows_to_remove before a single df.drop
#       NOTE(review): str.contains is called without na= handling — presumably
#       safe because of the earlier astype(str), but confirm.
#   * appears to then strip greeting words (greet_variations) and generic
#     filler phrases (remove_phrases / general_variations, e.g. survey menu
#     options like "videos (2-8 min)") — the regexes driving this were eaten
#     by the extraction and cannot be reconstructed from this view.
#   * return statement (if any) is beyond the truncation point — unknown
#     whether the function returns df or mutates in place only.
import gradio as gr import pandas as pd from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.cluster import KMeans from sklearn.metrics import silhouette_score, silhouette_samples, davies_bouldin_score import matplotlib.pyplot as plt from sklearn.decomposition import PCA import re from io import BytesIO import tempfile import numpy as np from PIL import Image from nltk.stem import WordNetLemmatizer from sklearn.preprocessing import normalize def preprocess_data(df): df.rename(columns={'Question Asked': 'texts'}, inplace=True) df['texts'] = df['texts'].astype(str) df['texts'] = df['texts'].str.lower() df['texts'] = df['texts'].apply(lambda text: re.sub(r'https?://\S+|www\.\S+', '', text)) lemmatizer = WordNetLemmatizer() df['texts'] = df['texts'].apply(lambda text: ' '.join([lemmatizer.lemmatize(word) for word in text.split()])) def remove_emoji(string): emoji_pattern = re.compile("[" u"\U0001F600-\U0001F64F" u"\U0001F300-\U0001F5FF" u"\U0001F680-\U0001F6FF" u"\U0001F1E0-\U0001F1FF" u"\U00002702-\U000027B0" u"\U000024C2-\U0001F251" "]+", flags=re.UNICODE) return emoji_pattern.sub(r'', string) if isinstance(string, str) else string df['texts'] = df['texts'].apply(remove_emoji) custom_synonyms = { 'application': ['form'], 'apply': ['fill', 'applied'], 'work': ['job'], 'salary': ['stipend', 'pay', 'payment', 'paid'], 'test': ['online test', 'amcat test', 'exam', 'assessment'], 'pass': ['clear', 'selected', 'pass or not'], 'result': ['outcome', 'mark', 'marks'], 'thanks': ["thanks a lot to you", "thankyou so much", "thank you so much", "tysm", "thank you", "okaythank", "thx", "ty", "thankyou", "thank", "thank u"], 'interview': ["pi"] } for original_word, synonym_list in custom_synonyms.items(): for synonym in synonym_list: pattern = r"\b" + synonym + r"\b(?!\s*\()" df['texts'] = df['texts'].str.replace(pattern, original_word, regex=True) pattern = r"\b" + synonym + r"\s+you" + r"\b(?!\s*\()" df['texts'] = df['texts'].str.replace(pattern, 
original_word + ' ', regex=True) spam_list = ["click here", "free", "recharge", "limited", "discount", "money back guarantee", "aaj", "kal", "mein", "how can i help you", "how can we help you", "how we can help you", "follow", "king", "contacting", "gar", "kirke", "subscribe", "youtube", "jio", "insta", "make money", "b2b","sent using truecaller"] rows_to_remove = set() for spam_phrase in spam_list: pattern = r"\b" + re.escape(spam_phrase) + r"\b" spam_rows = df['texts'].str.contains(pattern) rows_to_remove.update(df.index[spam_rows].tolist()) df = df.drop(rows_to_remove) greet_variations = ["hello", "hy", "hey", "hii", "hi", "heyyy", "bie", "bye"] for greet_var in greet_variations: pattern = r"(? 8 min)", "short reads (3-8 min)","not a student alumni","mock","share feedback","bite size (< 2 min)", "actually no","next steps","i'm a student alumni","i have questions"] for phrase in remove_phrases: df['texts'] = df['texts'].str.replace(phrase, '') general_variations = ["good morning", "good evening", "good afternoon", "good night", "done", "sorry", "top", "query", "stop", "sir", "sure", "oh", "wow", "aaa", "maam", "mam", "ma'am","i'm all set","ask a question","apply the survey", "videos (2-8 min)","long reads (> 8 min)","short reads (3-8 min)","not a student alumni","mock","share feedback","bite size (< 2 min)", "actually no","next steps","i'm a student alumni","i have questions"] for gen_var in general_variations: pattern = r"(?