tanish78 committed on
Commit
60c5bda
·
verified ·
1 Parent(s): 0575dff

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -50
app.py CHANGED
@@ -38,6 +38,7 @@ categories_keywords = {
38
  "Miscellaneous": []
39
  }
40
 
 
41
  def categorize_question(question):
42
  words = question.split()
43
 
@@ -68,6 +69,7 @@ def categorize_question(question):
68
 
69
  return "Miscellaneous"
70
 
 
71
  def preprocess_data(df):
72
  df.rename(columns={'Question Asked': 'texts'}, inplace=True)
73
  df['texts'] = df['texts'].astype(str).str.lower()
@@ -119,6 +121,9 @@ def preprocess_data(df):
119
  df['texts'] = df['texts'].str.strip()
120
  df = df[df['texts'] != '']
121
 
 
 
 
122
  return df
123
 
124
  def cluster_data(df, num_clusters):
@@ -152,58 +157,89 @@ def generate_wordcloud(df):
152
  plt.figure(figsize=(15, 7))
153
  plt.imshow(wordcloud, interpolation='bilinear')
154
  plt.axis('off')
155
- plt.show()
156
-
157
- def generate_barchart(df):
158
- category_counts = df['Category'].value_counts().reset_index()
159
- category_counts.columns = ['Category', 'Count']
160
- fig = px.bar(category_counts, x='Category', y='Count', title='Number of Queries per Category', color='Count', color_continuous_scale='Viridis')
161
- fig.show()
162
-
163
- def process_and_analyze(file, num_clusters):
164
- df = pd.read_csv(file)
165
- df = preprocess_data(df)
166
-
167
- df, kmeans = cluster_data(df, num_clusters)
168
-
169
- df['Category'] = df['texts'].apply(categorize_question)
170
-
171
- df = df.sort_values(by=['Category', 'Cluster'])
172
-
173
- with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
174
- temp_filename = tmp_file.name
175
- df.to_csv(temp_filename, index=False)
176
-
177
- generate_wordcloud(df)
178
- generate_barchart(df)
179
-
180
- return temp_filename
181
-
182
- def save_file(file):
183
- with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp_file:
184
- temp_filename = tmp_file.name
185
- with open(temp_filename, 'wb') as f:
186
- f.write(file.read())
187
- return temp_filename
188
-
189
- def process_and_return(file, num_clusters):
190
- temp_filename = save_file(file)
191
- output_filename = process_and_analyze(temp_filename, num_clusters)
192
-
193
- with open(output_filename, 'rb') as f:
194
- csv_bytes = BytesIO(f.read())
195
 
196
- return csv_bytes
197
-
198
- iface = gr.Interface(
199
- fn=process_and_return,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  inputs=[
201
- gr.inputs.File(label="Upload CSV File"),
202
- gr.inputs.Slider(2, 10, step=1, default=3, label="Number of Clusters")
203
  ],
204
- outputs=gr.outputs.File(label="Processed CSV File"),
205
- title="Query Categorization and Clustering",
206
- description="Upload a CSV file containing the queries. This tool will categorize and cluster the queries, then return a processed CSV file."
 
 
 
207
  )
208
 
209
- iface.launch()
 
 
38
  "Miscellaneous": []
39
  }
40
 
41
+
42
  def categorize_question(question):
43
  words = question.split()
44
 
 
69
 
70
  return "Miscellaneous"
71
 
72
+
73
  def preprocess_data(df):
74
  df.rename(columns={'Question Asked': 'texts'}, inplace=True)
75
  df['texts'] = df['texts'].astype(str).str.lower()
 
121
  df['texts'] = df['texts'].str.strip()
122
  df = df[df['texts'] != '']
123
 
124
+ # Categorize the texts
125
+ df['Category'] = df['texts'].apply(categorize_question)
126
+
127
  return df
128
 
129
  def cluster_data(df, num_clusters):
 
157
  plt.figure(figsize=(15, 7))
158
  plt.imshow(wordcloud, interpolation='bilinear')
159
  plt.axis('off')
160
+ buf = BytesIO()
161
+ plt.savefig(buf, format='png')
162
+ buf.seek(0)
163
+ img = Image.open(buf)
164
+ return img
165
+
166
+ def generate_bar_chart(df, num_clusters_to_display):
167
+ # Exclude common words from the top words
168
+ common_words = {'i', 'you', 'thanks', 'thank', 'ok', 'okay', 'sure', 'done'}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
+ top_categories = df['Category'].value_counts().index[:num_clusters_to_display]
171
+ df_top_categories = df[df['Category'].isin(top_categories)]
172
+
173
+ category_top_words = df_top_categories.groupby('Category', observed=False)['texts'].apply(lambda x: ' '.join(x)).reset_index()
174
+ category_top_words['top_word'] = category_top_words['texts'].apply(lambda x: ' '.join([word for word in pd.Series(x.split()).value_counts().index if word not in common_words][:3]))
175
+ category_sizes = df_top_categories['Category'].value_counts().reset_index()
176
+ category_sizes.columns = ['Category', 'Count']
177
+ category_sizes = category_sizes.merge(category_top_words[['Category', 'top_word']], on='Category')
178
+
179
+ fig = px.bar(category_sizes, x='Category', y='Count', text='top_word', title='Category Frequency with Top Words')
180
+ fig.update_traces(textposition='outside')
181
+ fig.update_layout(xaxis_title='Category', yaxis_title='Frequency', showlegend=False)
182
+
183
+ buf = BytesIO()
184
+ fig.write_image(buf, format='png')
185
+ buf.seek(0)
186
+ img = Image.open(buf)
187
+ return img
188
+
189
+ def main(file, num_clusters_to_display):
190
+ try:
191
+ df = pd.read_csv(file)
192
+
193
+ # Filter by 'Fallback Message shown'
194
+ df = df[df['Answer'] == 'Fallback Message shown']
195
+
196
+ df = preprocess_data(df)
197
+
198
+ df = df[df['Category'] != 'Miscellaneous']
199
+
200
+ # Get category sizes and sort them by count, in both descending and ascending order
201
+ category_sizes = df['Category'].value_counts().reset_index()
202
+ category_sizes.columns = ['Category', 'Count']
203
+ sorted_categories = category_sizes.sort_values(by='Count', ascending=False)['Category'].tolist()
204
+ sorted_categories_sm = category_sizes.sort_values(by='Count', ascending=True)['Category'].tolist()
205
+
206
+ # Get the largest and the smallest x categories, where x is num_clusters_to_display
207
+ largest_categories = sorted_categories[:num_clusters_to_display]
208
+ smallest_categories = sorted_categories_sm[:num_clusters_to_display]
209
+
210
+ # Filter the dataframe: largest categories for the CSV output, smallest categories for the word cloud
211
+ filtered_df = df[df['Category'].isin(largest_categories)]
212
+ filtered_cloud_df = df[df['Category'].isin(smallest_categories)]
213
+
214
+ # Sort the dataframe by Category
215
+ filtered_df = filtered_df.sort_values(by='Category')
216
+ filtered_cloud_df = filtered_cloud_df.sort_values(by='Category')
217
+
218
+ wordcloud_img = generate_wordcloud(filtered_cloud_df)
219
+ bar_chart_img = generate_bar_chart(df, num_clusters_to_display)
220
+
221
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
222
+ filtered_df.to_csv(tmpfile.name, index=False)
223
+ csv_file_path = tmpfile.name
224
+
225
+ return csv_file_path, wordcloud_img, bar_chart_img
226
+ except Exception as e:
227
+ print(f"Error: {e}")
228
+ return str(e), None, None
229
+
230
+ interface = gr.Interface(
231
+ fn=main,
232
  inputs=[
233
+ gr.File(label="Upload CSV File (.csv)"),
234
+ gr.Slider(label="Number of Categories to Display", minimum=1, maximum=15, step=1, value=5)
235
  ],
236
+ outputs=[
237
+ gr.File(label="Categorized Data CSV"),
238
+ gr.Image(label="Word Cloud"),
239
+ gr.Image(label="Bar Chart")
240
+ ],
241
+ title="Unanswered User Queries Categorization",
242
  )
243
 
244
+ interface.launch(share=True)
245
+