tanish78 committed on
Commit
ae6433b
·
verified ·
1 Parent(s): cbf3516

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -80
app.py CHANGED
@@ -2,11 +2,12 @@ import gradio as gr
2
  import pandas as pd
3
  from sklearn.feature_extraction.text import TfidfVectorizer
4
  from sklearn.cluster import KMeans
5
- from sklearn.metrics import silhouette_score
6
  from sklearn.preprocessing import normalize
7
- import re
8
  from wordcloud import WordCloud
9
  import matplotlib.pyplot as plt
 
 
 
10
 
11
  def preprocess_data(df):
12
  df.rename(columns={'Question Asked': 'texts'}, inplace=True)
@@ -34,64 +35,6 @@ def preprocess_data(df):
34
  pattern = r"\b" + synonym + r"\s+you" + r"\b(?!\s*\()"
35
  df['texts'] = df['texts'].str.replace(pattern, original_word + ' ', regex=True)
36
 
37
- spam_list = ["click here", "free", "recharge", "limited", "discount", "money back guarantee", "aaj", "kal", "mein",
38
- "how can i help you", "how can we help you", "how we can help you", "follow", "king", "contacting", "gar",
39
- "kirke", "subscribe", "youtube", "jio", "insta", "make money", "b2b","sent using truecaller"]
40
-
41
- rows_to_remove = set()
42
- for spam_phrase in spam_list:
43
- pattern = r"\b" + re.escape(spam_phrase) + r"\b"
44
- spam_rows = df['texts'].str.contains(pattern)
45
- rows_to_remove.update(df.index[spam_rows].tolist())
46
-
47
- df = df.drop(rows_to_remove)
48
-
49
- greet_variations = ["hello", "hy", "hey", "hii", "hi", "heyyy", "bie", "bye"]
50
- for greet_var in greet_variations:
51
- pattern = r"(?<!\S)" + greet_var + r"(?!\S)|\b" + greet_var + r"\b"
52
- df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
53
-
54
- okay_variations = ["ok", "k", "kay", "okay", "okie", "kk", "ohhhk","t","r"]
55
- for okay_var in okay_variations:
56
- pattern = r"(?<!\S)" + okay_var + r"(?!\S)|\b" + okay_var + r"\b"
57
- df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
58
-
59
- yes_variations = ["yes", "yeah", "yep", "yup", "yuh", "ya", "yes got it", "yeah it is", "yesss", "yea","no"]
60
- for yes_var in yes_variations:
61
- pattern = r"(?<!\S)" + yes_var + r"(?!\S)|\b" + yes_var + r"\b"
62
- df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
63
-
64
- remove_phrases = ["i'm all set","ask a question","apply the survey","videos (2-8 min)","long reads (> 8 min)",
65
- "short reads (3-8 min)","not a student alumni","mock","share feedback","bite size (< 2 min)",
66
- "actually no","next steps","i'm a student alumni","i have questions"]
67
-
68
- for phrase in remove_phrases:
69
- df['texts'] = df['texts'].str.replace(phrase, '')
70
-
71
- general_variations = ["good morning", "good evening", "good afternoon", "good night", "done", "sorry", "top", "query",
72
- "stop", "sir", "sure", "oh", "wow", "aaa", "maam", "mam", "ma&#39;am","i'm all set","ask a question","apply the survey",
73
- "videos (2-8 min)","long reads (> 8 min)","short reads (3-8 min)","not a student alumni","mock","share feedback","bite size (< 2 min)",
74
- "actually no","next steps","i'm a student alumni","i have questions"]
75
- for gen_var in general_variations:
76
- pattern = r"(?<!\S)" + gen_var + r"(?!\S)|\b" + gen_var + r"\b(?=\W|$)"
77
- df['texts'] = df['texts'].str.replace(pattern, '', regex=True)
78
-
79
- def remove_punctuations(text):
80
- return re.sub(r'[^\w\s]', '', text)
81
- df['texts'] = df['texts'].apply(remove_punctuations)
82
-
83
- remove_morephrases = ["short reads 38 min","bite size 2 min","videos 28 min","long reads 8 min"]
84
-
85
- for phrase in remove_morephrases:
86
- df['texts'] = df['texts'].str.replace(phrase, '')
87
-
88
- df = df[~df['texts'].str.contains(r'\b\d{10}\b')]
89
-
90
- df['texts'] = df['texts'].str.strip()
91
-
92
- df['texts'] = df['texts'].apply(lambda x: x.strip())
93
- df = df[df['texts'] != '']
94
-
95
  return df
96
 
97
  def cluster_data(df, num_clusters):
@@ -103,7 +46,7 @@ def cluster_data(df, num_clusters):
103
  kmeans.fit(X)
104
  df['Cluster'] = kmeans.labels_
105
 
106
- return df, X, kmeans
107
 
108
  def generate_wordcloud(texts):
109
  wordcloud = WordCloud(width=800, height=400, background_color='white').generate(" ".join(texts))
@@ -115,46 +58,50 @@ def generate_wordcloud(texts):
115
  buf.seek(0)
116
  return buf
117
 
118
- def main(file, num_clusters):
119
  df = pd.read_csv(file)
120
 
121
  # Filter by 'Fallback Message shown'
122
  df = df[df['Answer'] == 'Fallback Message shown']
123
 
124
  df = preprocess_data(df)
125
- df, X, kmeans = cluster_data(df, num_clusters)
126
 
127
- clusters = df['Cluster'].unique()
 
 
 
 
 
 
 
 
 
 
 
 
128
  wordclouds = []
129
- for cluster in clusters:
130
  texts = df[df['Cluster'] == cluster]['texts'].tolist()
131
  wordcloud_image = generate_wordcloud(texts)
132
  wordclouds.append((f"Cluster {cluster}", wordcloud_image))
133
 
134
- cluster_sizes = df['Cluster'].value_counts()
135
- top_clusters = cluster_sizes.head(num_clusters).index
136
- top_queries = df[df['Cluster'].isin(top_clusters)][['Cluster', 'texts']]
137
-
138
- return wordclouds, top_queries
139
 
140
- def display_results(wordclouds, top_queries):
141
- for cluster, wordcloud in wordclouds:
142
- print(cluster)
143
- img = Image.open(wordcloud)
144
- img.show()
145
-
146
- print("Top Queries by Cluster:")
147
- print(top_queries.to_string(index=False))
148
 
149
  interface = gr.Interface(
150
  fn=main,
151
  inputs=[
152
  gr.File(label="Upload CSV File (.csv)"),
153
- gr.Slider(label="Number of Clusters", minimum=2, maximum=20, step=1, value=5)
 
154
  ],
155
  outputs=[
156
  gr.Gallery(label="Word Clouds of Clusters"),
157
- gr.Dataframe(label="Top Queries by Cluster")
158
  ],
159
  title="Unanswered User Queries Clustering",
160
  description="Unanswered User Query Categorization"
 
2
  import pandas as pd
3
  from sklearn.feature_extraction.text import TfidfVectorizer
4
  from sklearn.cluster import KMeans
 
5
  from sklearn.preprocessing import normalize
 
6
  from wordcloud import WordCloud
7
  import matplotlib.pyplot as plt
8
+ from io import BytesIO
9
+ import re
10
+ import tempfile
11
 
12
  def preprocess_data(df):
13
  df.rename(columns={'Question Asked': 'texts'}, inplace=True)
 
35
  pattern = r"\b" + synonym + r"\s+you" + r"\b(?!\s*\()"
36
  df['texts'] = df['texts'].str.replace(pattern, original_word + ' ', regex=True)
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  return df
39
 
40
  def cluster_data(df, num_clusters):
 
46
  kmeans.fit(X)
47
  df['Cluster'] = kmeans.labels_
48
 
49
+ return df
50
 
51
  def generate_wordcloud(texts):
52
  wordcloud = WordCloud(width=800, height=400, background_color='white').generate(" ".join(texts))
 
58
  buf.seek(0)
59
  return buf
60
 
61
+ def main(file, num_clusters, num_clusters_to_display):
62
  df = pd.read_csv(file)
63
 
64
  # Filter by 'Fallback Message shown'
65
  df = df[df['Answer'] == 'Fallback Message shown']
66
 
67
  df = preprocess_data(df)
68
+ df = cluster_data(df, num_clusters)
69
 
70
+ cluster_sizes = df['Cluster'].value_counts()
71
+ sorted_clusters = cluster_sizes.index.tolist()
72
+ df['Cluster'] = pd.Categorical(df['Cluster'], categories=sorted_clusters, ordered=True)
73
+ df = df.sort_values('Cluster')
74
+
75
+ # Filter out the largest cluster and get the next largest clusters
76
+ largest_cluster = sorted_clusters[0]
77
+ filtered_clusters = sorted_clusters[1:num_clusters_to_display+1]
78
+
79
+ df = df[df['Cluster'].isin(filtered_clusters)]
80
+ df['Cluster'] = pd.Categorical(df['Cluster'], categories=filtered_clusters, ordered=True)
81
+ df = df.sort_values('Cluster')
82
+
83
  wordclouds = []
84
+ for cluster in filtered_clusters:
85
  texts = df[df['Cluster'] == cluster]['texts'].tolist()
86
  wordcloud_image = generate_wordcloud(texts)
87
  wordclouds.append((f"Cluster {cluster}", wordcloud_image))
88
 
89
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
90
+ df.to_csv(tmpfile.name, index=False)
91
+ csv_file_path = tmpfile.name
 
 
92
 
93
+ return wordclouds, csv_file_path
 
 
 
 
 
 
 
94
 
95
  interface = gr.Interface(
96
  fn=main,
97
  inputs=[
98
  gr.File(label="Upload CSV File (.csv)"),
99
+ gr.Slider(label="Number of Clusters", minimum=2, maximum=20, step=1, value=5),
100
+ gr.Slider(label="Number of Categories to Display", minimum=1, maximum=10, step=1, value=5)
101
  ],
102
  outputs=[
103
  gr.Gallery(label="Word Clouds of Clusters"),
104
+ gr.File(label="Clustered Data CSV")
105
  ],
106
  title="Unanswered User Queries Clustering",
107
  description="Unanswered User Query Categorization"