Spaces:
Sleeping
Sleeping
File size: 7,485 Bytes
b3eb597 9d114a6 6847e76 b3eb597 148df09 b3eb597 148df09 b3eb597 6847e76 b3eb597 6847e76 b3eb597 6847e76 b3eb597 5d93751 904a011 b3eb597 ace8a63 faeddba e897e9d ae43fca e897e9d 5d93751 e897e9d 904a011 faeddba 6847e76 e897e9d 6925228 e897e9d 6925228 b3eb597 faeddba ab5743a 6925228 faeddba b3eb597 ab5743a | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 | import gradio as gr
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import re
from io import BytesIO
import tempfile
# Matches http(s):// and www. links so they can be stripped from queries.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')

# Code-point ranges treated as emoji. NOTE: the last range (U+24C2..U+1F251)
# is very wide and also covers non-emoji characters (e.g. CJK ideographs);
# it is kept as-is so existing results do not change.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F1E0-\U0001F1FF"
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)

_PUNCTUATION_PATTERN = re.compile(r'[^\w\s]')

# canonical word -> variants that are rewritten to it (applied in this order).
_CUSTOM_SYNONYMS = {
    'application': ['form'],
    'apply': ['fill', 'applied'],
    'work': ['job'],
    'salary': ['stipend', 'pay', 'payment', 'paid'],
    'test': ['online test', 'amcat test', 'exam', 'assessment'],
    'pass': ['clear', 'selected', 'pass or not'],
    'result': ['outcome', 'mark', 'marks'],
    'thanks': ["thanks a lot to you", "thankyou so much", "thank you so much", "tysm", "thank you",
               "okaythank", "thx", "ty", "thankyou", "thank", "thank u"],
    'interview': ["pi"],
}

# A row containing any of these (as whole words) is discarded entirely.
_SPAM_PHRASES = [
    "click here", "free", "recharge", "limited", "discount", "money back guarantee", "aaj", "kal", "mein",
    "how can i help you", "how can we help you", "how we can help you", "follow", "king", "contacting", "gar",
    "kirke", "subscribe", "youtube", "jio", "insta", "make money", "b2b", "sent using truecaller",
]

# Filler tokens deleted from the text (the row itself is kept).
_GREETINGS = ["hello", "hy", "hey", "hii", "hi", "heyyy", "bie", "bye"]
_ACKNOWLEDGEMENTS = ["ok", "k", "kay", "okay", "okie", "kk", "ohhhk", "t", "r"]
_YES_NO = ["yes", "yeah", "yep", "yup", "yuh", "ya", "yes got it", "yeah it is", "yesss", "yea", "no"]

# Phrases deleted verbatim (plain substring match, no regex).
_LITERAL_PHRASES = [
    "i'm all set", "ask a question", "apply the survey", "videos (2-8 min)", "long reads (> 8 min)",
    "short reads (3-8 min)", "not a student alumni", "mock", "share feedback", "bite size (< 2 min)",
    "actually no", "next steps", "i'm a student alumni", "i have questions",
]

_GENERAL_PHRASES = [
    "good morning", "good evening", "good afternoon", "good night", "done", "sorry", "top", "query",
    "stop", "sir", "sure", "oh", "wow", "aaa", "maam", "mam", "ma'am", "i'm all set", "ask a question",
    "apply the survey", "videos (2-8 min)", "long reads (> 8 min)", "short reads (3-8 min)",
    "not a student alumni", "mock", "share feedback", "bite size (< 2 min)",
    "actually no", "next steps", "i'm a student alumni", "i have questions",
]

# Forms some of the literal phrases take once punctuation has been stripped.
_POST_PUNCTUATION_PHRASES = ["short reads 38 min", "bite size 2 min", "videos 28 min", "long reads 8 min"]


def _remove_tokens(texts, tokens, tail=""):
    """Delete each token from *texts* when it stands alone (whitespace- or word-bounded)."""
    for token in tokens:
        # Tokens are literal text; escape them so characters such as "(" or
        # "<" are not interpreted as regex syntax.
        escaped = re.escape(token)
        pattern = r"(?<!\S)" + escaped + r"(?!\S)|\b" + escaped + r"\b" + tail
        texts = texts.str.replace(pattern, '', regex=True)
    return texts


def preprocess_data(df):
    """Clean the query text and drop rows that carry no usable query.

    The text column is lower-cased, stripped of URLs, emoji, filler words,
    canned phrases and punctuation, and a set of synonyms is collapsed to
    canonical words. Rows are dropped when they contain a spam phrase,
    contain a standalone 10-digit number (presumably a phone number), or
    are empty after cleaning.

    Args:
        df: DataFrame with a ``Queries`` column (a column already named
            ``texts`` is accepted too). It is not modified.

    Returns:
        A new DataFrame whose ``texts`` column holds the cleaned queries.
        Surviving rows keep their original index labels.

    Raises:
        KeyError: if neither a ``Queries`` nor a ``texts`` column exists.
    """
    # rename() without inplace returns a new frame, so the caller's
    # DataFrame is left untouched.
    df = df.rename(columns={'Queries': 'texts'})
    if 'texts' not in df.columns:
        raise KeyError("expected a 'Queries' (or 'texts') column in the input data")

    texts = df['texts'].astype(str).str.lower()
    texts = texts.apply(lambda text: _URL_PATTERN.sub('', text))
    texts = texts.apply(lambda text: _EMOJI_PATTERN.sub('', text))

    for canonical, synonyms in _CUSTOM_SYNONYMS.items():
        for synonym in synonyms:
            escaped = re.escape(synonym)
            # The negative lookahead skips a variant directly followed by "(".
            texts = texts.str.replace(r"\b" + escaped + r"\b(?!\s*\()", canonical, regex=True)
            # Second pass kept from the original: "<variant> you" -> "<canonical> ".
            texts = texts.str.replace(
                r"\b" + escaped + r"\s+you" + r"\b(?!\s*\()", canonical + ' ', regex=True
            )

    # One alternation instead of one scan per phrase; the non-capturing group
    # also avoids pandas' "pattern has match groups" warning.
    spam_pattern = r"\b(?:" + "|".join(re.escape(phrase) for phrase in _SPAM_PHRASES) + r")\b"
    is_spam = texts.str.contains(spam_pattern, regex=True)

    texts = _remove_tokens(texts, _GREETINGS)
    texts = _remove_tokens(texts, _ACKNOWLEDGEMENTS)
    texts = _remove_tokens(texts, _YES_NO)

    # regex=False is explicit because the default of Series.str.replace
    # differs between pandas versions and these phrases contain "(", ")",
    # "<" and ">" that must be matched literally.
    for phrase in _LITERAL_PHRASES:
        texts = texts.str.replace(phrase, '', regex=False)

    texts = _remove_tokens(texts, _GENERAL_PHRASES, tail=r"(?=\W|$)")

    texts = texts.apply(lambda text: _PUNCTUATION_PATTERN.sub('', text))
    for phrase in _POST_PUNCTUATION_PHRASES:
        texts = texts.str.replace(phrase, '', regex=False)

    has_ten_digit_number = texts.str.contains(r'\b\d{10}\b', regex=True)
    texts = texts.str.strip()

    keep = ~(is_spam | has_ten_digit_number) & (texts != '')
    df['texts'] = texts
    # .copy() hands back an independent frame, so adding columns to it later
    # does not raise SettingWithCopyWarning.
    return df[keep].copy()
def cluster_data(df, num_clusters=15):
    """Cluster the cleaned queries with TF-IDF + KMeans and add a 2-D PCA projection.

    Args:
        df: DataFrame with a ``texts`` column of strings. It is not modified.
        num_clusters: Requested number of KMeans clusters (default 15). It is
            capped at the number of rows, since KMeans cannot fit more
            clusters than samples.

    Returns:
        A copy of ``df`` with three extra columns: ``Cluster`` (KMeans label),
        ``PCA1`` and ``PCA2`` (first two principal components of the TF-IDF
        matrix). ``PCA2`` is 0.0 when only one component can be computed.

    Raises:
        ValueError: if ``df`` has no rows. TfidfVectorizer may also raise
            ValueError when no vocabulary can be built from the texts.
    """
    if df.empty:
        raise ValueError("no queries left to cluster after preprocessing")

    # Work on a copy so the caller's frame (possibly a filtered slice) is not
    # mutated when the result columns are added.
    df = df.copy()

    vectorizer = TfidfVectorizer(stop_words='english')
    features = vectorizer.fit_transform(df['texts'])
    n_samples, n_features = features.shape

    n_clusters = max(1, min(int(num_clusters), n_samples))
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(features)
    df['Cluster'] = kmeans.labels_

    # PCA cannot produce more components than min(n_samples, n_features).
    n_components = min(2, n_samples, n_features)
    projected = PCA(n_components=n_components).fit_transform(features.toarray())
    df['PCA1'] = projected[:, 0]
    df['PCA2'] = projected[:, 1] if n_components > 1 else 0.0
    return df
def visualize_clusters(df):
    """Show a scatter plot of the PCA projection, coloured by cluster label.

    Args:
        df: DataFrame with ``PCA1``, ``PCA2`` and ``Cluster`` columns.

    Returns:
        None. The figure is displayed with ``plt.show()`` and then closed.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        scatter = ax.scatter(df['PCA1'], df['PCA2'], c=df['Cluster'], cmap='viridis')
        ax.legend(*scatter.legend_elements(), title="Clusters")
        ax.set_title('Clusters of User Queries')
        ax.set_xlabel('PCA Component 1')
        ax.set_ylabel('PCA Component 2')
        plt.show()
    finally:
        # pyplot keeps every figure alive until it is closed; without this,
        # each call leaves one more open figure behind.
        plt.close(fig)
def main(file, num_clusters_to_display):
    """Run the pipeline on an uploaded Excel file and write the result to CSV.

    The file is read with ``pd.read_excel``, cleaned, clustered and plotted.
    The rows of the largest clusters are then written to a temporary CSV.

    Args:
        file: Path or file-like object accepted by ``pd.read_excel``.
        num_clusters_to_display: How many of the largest clusters to keep.
            It is converted with ``int()`` before being used as a slice bound.

    Returns:
        Path of the temporary ``.csv`` file on success. On any failure the
        exception message is returned as a string instead.
    """
    try:
        if file is None:
            raise ValueError("No Excel file was uploaded.")

        df = pd.read_excel(file)
        df = preprocess_data(df)
        df = cluster_data(df)
        visualize_clusters(df)

        # value_counts() lists labels from the most to the least frequent.
        clusters_by_size = df['Cluster'].value_counts().index.tolist()

        # Label 0 is skipped, as the interface description promises.
        # NOTE(review): KMeans labels look arbitrary here, so confirm that
        # excluding label 0 is really intended.
        candidates = [label for label in clusters_by_size if label != 0]
        top_clusters = candidates[:int(num_clusters_to_display)]

        # .copy() so the column assignment below targets an independent frame.
        df = df[df['Cluster'].isin(top_clusters)].copy()
        df['Cluster'] = pd.Categorical(df['Cluster'], categories=top_clusters, ordered=True)
        # A stable sort keeps the input row order inside each cluster.
        df = df.sort_values('Cluster', kind='stable')

        # delete=False: the file must outlive this function so it can be served.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
            csv_path = tmpfile.name
        df.to_csv(csv_path, index=False)
        return csv_path
    except Exception as e:
        # NOTE(review): the output component is a gr.File, so this message is
        # returned where a file path is expected. Consider raising gr.Error
        # instead; confirm against the Gradio version in use.
        return str(e)
# Input widgets: the spreadsheet to analyse and how many clusters to keep.
excel_upload = gr.File(label="Upload Excel File (.xlsx)")
cluster_count = gr.Slider(1, 10, step=1, label="Number of Largest Clusters to Display")

# Output widget: the CSV written by main().
csv_download = gr.File(label="Clustered Data CSV")

interface = gr.Interface(
    fn=main,
    inputs=[excel_upload, cluster_count],
    outputs=csv_download,
    title="Unanswered User Queries Clustering",
    description="Upload an Excel file (.xlsx) and select the number of largest clusters to display (excluding cluster 0)",
)

# Start the web app as soon as the module runs.
interface.launch()
|