tanish78 committed on
Commit
0575dff
·
verified ·
1 Parent(s): 1762079

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -85
app.py CHANGED
@@ -38,7 +38,6 @@ categories_keywords = {
38
  "Miscellaneous": []
39
  }
40
 
41
-
42
  def categorize_question(question):
43
  words = question.split()
44
 
@@ -69,7 +68,6 @@ def categorize_question(question):
69
 
70
  return "Miscellaneous"
71
 
72
-
73
  def preprocess_data(df):
74
  df.rename(columns={'Question Asked': 'texts'}, inplace=True)
75
  df['texts'] = df['texts'].astype(str).str.lower()
@@ -121,9 +119,6 @@ def preprocess_data(df):
121
  df['texts'] = df['texts'].str.strip()
122
  df = df[df['texts'] != '']
123
 
124
- # Categorize the texts
125
- df['Category'] = df['texts'].apply(categorize_question)
126
-
127
  return df
128
 
129
  def cluster_data(df, num_clusters):
@@ -157,88 +152,58 @@ def generate_wordcloud(df):
157
  plt.figure(figsize=(15, 7))
158
  plt.imshow(wordcloud, interpolation='bilinear')
159
  plt.axis('off')
160
- buf = BytesIO()
161
- plt.savefig(buf, format='png')
162
- buf.seek(0)
163
- img = Image.open(buf)
164
- return img
165
-
166
- def generate_bar_chart(df, num_clusters_to_display):
167
- # Exclude common words from the top words
168
- common_words = {'i', 'you', 'thanks', 'thank', 'ok', 'okay', 'sure', 'done'}
169
-
170
- top_categories = df['Category'].value_counts().index[:num_clusters_to_display]
171
- df_top_categories = df[df['Category'].isin(top_categories)]
172
-
173
- category_top_words = df_top_categories.groupby('Category', observed=False)['texts'].apply(lambda x: ' '.join(x)).reset_index()
174
- category_top_words['top_word'] = category_top_words['texts'].apply(lambda x: ' '.join([word for word in pd.Series(x.split()).value_counts().index if word not in common_words][:3]))
175
- category_sizes = df_top_categories['Category'].value_counts().reset_index()
176
- category_sizes.columns = ['Category', 'Count']
177
- category_sizes = category_sizes.merge(category_top_words[['Category', 'top_word']], on='Category')
178
-
179
- fig = px.bar(category_sizes, x='Category', y='Count', text='top_word', title='Category Frequency with Top Words')
180
- fig.update_traces(textposition='outside')
181
- fig.update_layout(xaxis_title='Category', yaxis_title='Frequency', showlegend=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
 
183
- buf = BytesIO()
184
- fig.write_image(buf, format='png')
185
- buf.seek(0)
186
- img = Image.open(buf)
187
- return img
188
-
189
- def main(file, num_clusters_to_display):
190
- try:
191
- df = pd.read_csv(file)
192
-
193
- # Filter by 'Fallback Message shown'
194
- df = df[df['Answer'] == 'Fallback Message shown']
195
-
196
- df = preprocess_data(df)
197
-
198
- df = df[df['Category'] != 'Miscellaneous']
199
-
200
- # Get category sizes and sort by size in ascending order
201
- category_sizes = df['Category'].value_counts().reset_index()
202
- category_sizes.columns = ['Category', 'Count']
203
- sorted_categories = category_sizes.sort_values(by='Count', ascending=False)['Category'].tolist()
204
- sorted_categories_sm = category_sizes.sort_values(by='Count', ascending=True)['Category'].tolist()
205
-
206
- # Get the largest x categories as specified by num_clusters_to_display
207
- largest_categories = sorted_categories[:num_clusters_to_display]
208
- smallest_categories = sorted_categories_sm[:num_clusters_to_display]
209
-
210
- # Filter the dataframe to include only the largest categories
211
- filtered_df = df[df['Category'].isin(largest_categories)]
212
- filtered_cloud_df = df[df['Category'].isin(smallest_categories)]
213
-
214
- # Sort the dataframe by Category
215
- filtered_df = filtered_df.sort_values(by='Category')
216
- filtered_cloud_df = filtered_cloud_df.sort_values(by='Category')
217
-
218
- wordcloud_img = generate_wordcloud(filtered_cloud_df)
219
- bar_chart_img = generate_bar_chart(df, num_clusters_to_display)
220
-
221
- with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
222
- filtered_df.to_csv(tmpfile.name, index=False)
223
- csv_file_path = tmpfile.name
224
-
225
- return csv_file_path, wordcloud_img, bar_chart_img
226
- except Exception as e:
227
- print(f"Error: {e}")
228
- return str(e), None, None
229
-
230
- interface = gr.Interface(
231
- fn=main,
232
  inputs=[
233
- gr.File(label="Upload CSV File (.csv)"),
234
- gr.Slider(label="Number of Categories to Display", minimum=1, maximum=15, step=1, value=5)
235
- ],
236
- outputs=[
237
- gr.File(label="Categorized Data CSV"),
238
- gr.Image(label="Word Cloud"),
239
- gr.Image(label="Bar Chart")
240
  ],
241
- title="Unanswered User Queries Categorization",
 
 
242
  )
243
 
244
- interface.launch(share=True)
 
38
  "Miscellaneous": []
39
  }
40
 
 
41
  def categorize_question(question):
42
  words = question.split()
43
 
 
68
 
69
  return "Miscellaneous"
70
 
 
71
  def preprocess_data(df):
72
  df.rename(columns={'Question Asked': 'texts'}, inplace=True)
73
  df['texts'] = df['texts'].astype(str).str.lower()
 
119
  df['texts'] = df['texts'].str.strip()
120
  df = df[df['texts'] != '']
121
 
 
 
 
122
  return df
123
 
124
  def cluster_data(df, num_clusters):
 
152
  plt.figure(figsize=(15, 7))
153
  plt.imshow(wordcloud, interpolation='bilinear')
154
  plt.axis('off')
155
+ plt.show()
156
+
157
+ def generate_barchart(df):
158
+ category_counts = df['Category'].value_counts().reset_index()
159
+ category_counts.columns = ['Category', 'Count']
160
+ fig = px.bar(category_counts, x='Category', y='Count', title='Number of Queries per Category', color='Count', color_continuous_scale='Viridis')
161
+ fig.show()
162
+
163
+ def process_and_analyze(file, num_clusters):
164
+ df = pd.read_csv(file)
165
+ df = preprocess_data(df)
166
+
167
+ df, kmeans = cluster_data(df, num_clusters)
168
+
169
+ df['Category'] = df['texts'].apply(categorize_question)
170
+
171
+ df = df.sort_values(by=['Category', 'Cluster'])
172
+
173
+ with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
174
+ temp_filename = tmp_file.name
175
+ df.to_csv(temp_filename, index=False)
176
+
177
+ generate_wordcloud(df)
178
+ generate_barchart(df)
179
+
180
+ return temp_filename
181
+
182
+ def save_file(file):
183
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp_file:
184
+ temp_filename = tmp_file.name
185
+ with open(temp_filename, 'wb') as f:
186
+ f.write(file.read())
187
+ return temp_filename
188
+
189
+ def process_and_return(file, num_clusters):
190
+ temp_filename = save_file(file)
191
+ output_filename = process_and_analyze(temp_filename, num_clusters)
192
+
193
+ with open(output_filename, 'rb') as f:
194
+ csv_bytes = BytesIO(f.read())
195
 
196
+ return csv_bytes
197
+
198
+ iface = gr.Interface(
199
+ fn=process_and_return,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  inputs=[
201
+ gr.inputs.File(label="Upload CSV File"),
202
+ gr.inputs.Slider(2, 10, step=1, default=3, label="Number of Clusters")
 
 
 
 
 
203
  ],
204
+ outputs=gr.outputs.File(label="Processed CSV File"),
205
+ title="Query Categorization and Clustering",
206
+ description="Upload a CSV file containing the queries. This tool will categorize and cluster the queries, then return a processed CSV file."
207
  )
208
 
209
+ iface.launch()