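# NOTE: the six lines below are file-viewer page residue captured together
# with the source; they are kept as comments so the module still parses.
# tanish78's picture
# Update app.py
# d0a4478 verified
# raw
# history blame
# 5.19 kB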
import gradio as gr
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import re
from io import BytesIO
import tempfile
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from PIL import Image
# Code-point ranges stripped from every question before clustering.
# NOTE(review): the last range (U+24C2..U+1F251) is very wide and also covers
# many non-emoji characters (e.g. CJK ideographs); kept as-is to preserve the
# existing filtering behaviour.
_EMOJI_RE = re.compile(
    "["
    u"\U0001F600-\U0001F64F"
    u"\U0001F300-\U0001F5FF"
    u"\U0001F680-\U0001F6FF"
    u"\U0001F1E0-\U0001F1FF"
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_PUNCTUATION_RE = re.compile(r'[^\w\s]')

# canonical word -> variants that are rewritten to it (applied in this order).
_CUSTOM_SYNONYMS = {
    'application': ['form'],
    'apply': ['fill', 'applied'],
    'work': ['job'],
    'salary': ['stipend', 'pay', 'payment', 'paid'],
    'test': ['online test', 'amcat test', 'exam', 'assessment'],
    'pass': ['clear', 'selected', 'pass or not'],
    'result': ['outcome', 'mark', 'marks'],
    'thanks': ["thanks a lot to you", "thankyou so much", "thank you so much", "tysm", "thank you",
               "okaythank", "thx", "ty", "thankyou", "thank", "thank u"],
    'interview': ["pi"],
}

# Any row containing one of these phrases (as whole words) is discarded.
_SPAM_PHRASES = (
    "click here", "free", "recharge", "limited", "discount", "money back guarantee", "aaj", "kal", "mein",
    "how can i help you", "how can we help you", "how we can help you", "follow", "king", "contacting", "gar",
    "kirke", "subscribe", "youtube", "jio", "insta", "make money", "b2b", "sent using truecaller",
)
# One alternation instead of one DataFrame pass per phrase; the group is
# non-capturing so pandas does not warn about match groups.
_SPAM_PATTERN = r"\b(?:" + "|".join(re.escape(phrase) for phrase in _SPAM_PHRASES) + r")\b"


def preprocess_data(df):
    """Return a cleaned copy of *df* with the question text in a ``texts`` column.

    The ``'Question Asked'`` column is renamed to ``'texts'`` and each value is,
    in order: cast to ``str`` and lower-cased, stripped of URLs, stripped of
    the characters matched by ``_EMOJI_RE``, rewritten through
    ``_CUSTOM_SYNONYMS``, dropped entirely if it contains a spam phrase,
    stripped of punctuation, and whitespace-trimmed. Rows whose text ends up
    empty are removed.

    Args:
        df: DataFrame with a ``'Question Asked'`` (or already ``'texts'``)
            column. It is not modified.

    Returns:
        A new DataFrame (original index labels retained) containing only the
        surviving rows, with the cleaned text in ``'texts'``.

    Raises:
        KeyError: if neither ``'Question Asked'`` nor ``'texts'`` is present.
    """
    # rename() without inplace=True returns a new frame, so the caller's
    # DataFrame (typically a filtered slice) is left untouched.
    df = df.rename(columns={'Question Asked': 'texts'})

    texts = df['texts'].astype(str).str.lower()
    texts = texts.apply(lambda text: _URL_RE.sub('', text))
    texts = texts.apply(lambda text: _EMOJI_RE.sub('', text))

    # Replacements are sequential on purpose: later entries see the output of
    # earlier ones (e.g. "thank you so much" becomes "thanks" before the
    # shorter "thank you" variant is tried).
    for original_word, synonym_list in _CUSTOM_SYNONYMS.items():
        for synonym in synonym_list:
            pattern = r"\b" + re.escape(synonym) + r"\b"
            texts = texts.str.replace(pattern, original_word, regex=True)

    # Spam detection runs before punctuation removal, as the phrases are
    # matched against the text with punctuation still in place.
    keep = ~texts.str.contains(_SPAM_PATTERN, regex=True)
    df = df[keep].copy()
    texts = texts[keep]

    texts = texts.apply(lambda text: _PUNCTUATION_RE.sub('', text)).str.strip()
    df['texts'] = texts
    return df[df['texts'] != '']
def cluster_data(df, num_clusters):
    """Cluster the rows of *df* by TF-IDF similarity of their ``texts`` column.

    Args:
        df: DataFrame with a ``'texts'`` column of strings. A ``'Cluster'``
            column holding each row's k-means label is added to it in place.
        num_clusters: Requested number of clusters. If there are fewer rows
            than this, the number of rows is used instead.

    Returns:
        A ``(df, kmeans)`` tuple: the same DataFrame (now with ``'Cluster'``)
        and the fitted ``KMeans`` estimator.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(df['texts'])

    # KMeans refuses to fit when n_clusters exceeds the number of samples, so
    # cap the request; otherwise small uploads can never be clustered.
    n_clusters = min(num_clusters, X.shape[0])

    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(X)
    df['Cluster'] = kmeans.labels_
    return df, kmeans
def generate_wordcloud(df):
    """Render a word cloud of every ``texts`` value in *df* as a PIL image.

    Args:
        df: DataFrame with a ``'texts'`` column of strings; all values are
            joined with spaces to form the word-cloud input.

    Returns:
        A ``PIL.Image`` opened from the PNG rendering of the figure.
    """
    text = " ".join(df['texts'].tolist())
    stopwords = set(STOPWORDS)
    wordcloud = WordCloud(
        width=1000,
        height=500,
        background_color='white',
        max_words=300,
        collocations=False,
        min_font_size=5,
        max_font_size=100,
        stopwords=stopwords,
        prefer_horizontal=1.0,
        scale=3,
        contour_width=1,
        contour_color='steelblue',
    ).generate(text)

    fig = plt.figure(figsize=(15, 7))
    try:
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        buf = BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # pyplot keeps every figure alive until it is closed; without this
        # each request would leave another 15x7 figure in memory.
        plt.close(fig)

    buf.seek(0)
    return Image.open(buf)
def main(file, num_clusters_to_display):
    """Gradio callback: cluster the unanswered queries in an uploaded CSV.

    Rows whose ``'Answer'`` is ``'Fallback Message shown'`` are cleaned with
    ``preprocess_data`` and clustered into (up to) 15 groups. The largest
    cluster is skipped and the next ``num_clusters_to_display`` clusters, by
    size, are kept, ordered from largest to smallest.

    Args:
        file: The uploaded CSV, in whatever form ``gr.File`` hands it over
            (passed straight to ``pandas.read_csv``).
        num_clusters_to_display: How many clusters (after the largest) to
            include in the outputs.

    Returns:
        ``(csv_file_path, wordcloud_img)``: path of a temporary CSV with the
        selected rows, and the word-cloud image built from them.

    Raises:
        gr.Error: if no file was uploaded or any processing step fails. The
            first output is a ``gr.File``, so handing the error text back as
            a return value would present it as a file path; raising lets the
            UI show the message instead.
    """
    if file is None:
        raise gr.Error("Please upload a CSV file.")

    try:
        df = pd.read_csv(file)

        # Only queries the bot could not answer are of interest.
        df = df[df['Answer'] == 'Fallback Message shown']
        df = preprocess_data(df)
        df, _ = cluster_data(df, num_clusters=15)

        # value_counts() orders clusters from most to least populated.
        sorted_clusters = df['Cluster'].value_counts().index.tolist()

        # Skip the largest cluster and take the next N; int() guards the
        # slice against a float coming from the slider.
        shown = int(num_clusters_to_display)
        filtered_clusters = sorted_clusters[1:shown + 1]

        # .copy() so the categorical assignment below acts on an independent
        # frame rather than on a slice of the clustered data.
        df = df[df['Cluster'].isin(filtered_clusters)].copy()
        df['Cluster'] = pd.Categorical(df['Cluster'], categories=filtered_clusters, ordered=True)
        df = df.sort_values('Cluster')

        wordcloud_img = generate_wordcloud(df)

        # Reserve a path, close the handle, then write: re-opening a still
        # open NamedTemporaryFile by name is not portable across platforms.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
            csv_file_path = tmpfile.name
        df.to_csv(csv_file_path, index=False)

        return csv_file_path, wordcloud_img
    except Exception as e:
        # Top-level boundary of the callback: log, then surface to the user.
        print(f"Error: {e}")
        raise gr.Error(str(e)) from e
# UI wiring: a CSV upload and a slider feed `main`; it answers with the
# clustered CSV and the word-cloud image. Components are created in the same
# order as they appear in the interface (inputs first, then outputs).
_csv_upload = gr.File(label="Upload CSV File (.csv)")
_category_slider = gr.Slider(
    label="Number of Categories to Display",
    minimum=1,
    maximum=10,
    step=1,
    value=5,
)
_clustered_csv = gr.File(label="Clustered Data CSV")
_wordcloud_view = gr.Image(label="Word Cloud")

interface = gr.Interface(
    fn=main,
    inputs=[_csv_upload, _category_slider],
    outputs=[_clustered_csv, _wordcloud_view],
    title="Unanswered User Queries Clustering",
    description="Unanswered User Query Categorization",
)

# Start the app and request a public share link.
interface.launch(share=True)