tanish78 committed on
Commit
6eb917c
·
verified ·
1 Parent(s): f0823ea

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -30
app.py CHANGED
@@ -2,13 +2,12 @@ import gradio as gr
2
  import pandas as pd
3
  from sklearn.feature_extraction.text import TfidfVectorizer
4
  from sklearn.cluster import KMeans
5
- import matplotlib.pyplot as plt
6
  import re
7
  from io import BytesIO
8
  import tempfile
9
- import numpy as np
10
- from PIL import Image
11
  from wordcloud import WordCloud
 
 
12
 
13
  def preprocess_data(df):
14
  df.rename(columns={'Question Asked': 'texts'}, inplace=True)
@@ -75,7 +74,7 @@ def cluster_data(df, num_clusters):
75
 
76
  def generate_wordcloud(df):
77
  text = " ".join(df['texts'].tolist())
78
- wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
79
  plt.figure(figsize=(10, 5))
80
  plt.imshow(wordcloud, interpolation='bilinear')
81
  plt.axis('off')
@@ -86,32 +85,37 @@ def generate_wordcloud(df):
86
  return img
87
 
88
  def main(file, num_clusters_to_display):
89
- df = pd.read_csv(file)
90
-
91
- # Filter by 'Fallback Message shown'
92
- df = df[df['Answer'] == 'Fallback Message shown']
93
-
94
- df = preprocess_data(df)
95
- df, kmeans = cluster_data(df, num_clusters=20)
96
-
97
- cluster_sizes = df['Cluster'].value_counts()
98
- sorted_clusters = cluster_sizes.index.tolist()
99
-
100
- # Filter out the largest cluster and get the next largest clusters
101
- largest_cluster = sorted_clusters[0]
102
- filtered_clusters = sorted_clusters[1:num_clusters_to_display+1]
103
-
104
- df = df[df['Cluster'].isin(filtered_clusters)]
105
- df['Cluster'] = pd.Categorical(df['Cluster'], categories=filtered_clusters, ordered=True)
106
- df = df.sort_values('Cluster')
107
-
108
- wordcloud_img = generate_wordcloud(df)
109
-
110
- with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
111
- df.to_csv(tmpfile.name, index=False)
112
- csv_file_path = tmpfile.name
113
-
114
- return csv_file_path, wordcloud_img
 
 
 
 
 
115
 
116
  interface = gr.Interface(
117
  fn=main,
 
2
  import pandas as pd
3
  from sklearn.feature_extraction.text import TfidfVectorizer
4
  from sklearn.cluster import KMeans
 
5
  import re
6
  from io import BytesIO
7
  import tempfile
 
 
8
  from wordcloud import WordCloud
9
+ import matplotlib.pyplot as plt
10
+ from PIL import Image
11
 
12
  def preprocess_data(df):
13
  df.rename(columns={'Question Asked': 'texts'}, inplace=True)
 
74
 
75
  def generate_wordcloud(df):
76
  text = " ".join(df['texts'].tolist())
77
+ wordcloud = WordCloud(width=800, height=400, background_color='white', max_words=200, collocations=False).generate(text)
78
  plt.figure(figsize=(10, 5))
79
  plt.imshow(wordcloud, interpolation='bilinear')
80
  plt.axis('off')
 
85
  return img
86
 
def main(file, num_clusters_to_display):
    """Cluster fallback questions from a CSV and export the top clusters.

    Reads ``file`` with ``pd.read_csv``, keeps only the rows whose
    ``'Answer'`` column equals ``'Fallback Message shown'``, runs them through
    ``preprocess_data`` and ``cluster_data`` (15 clusters), drops the single
    largest cluster, and keeps the next ``num_clusters_to_display`` clusters
    ordered from largest to smallest.

    Args:
        file: Anything ``pd.read_csv`` accepts (path or file-like object).
        num_clusters_to_display: How many clusters (after the largest one)
            to keep. Converted with ``int()`` so a float coming from a
            numeric UI widget can be used as a slice bound.

    Returns:
        ``(csv_file_path, wordcloud_img)`` on success, where
        ``csv_file_path`` is the name of a temporary ``.csv`` file that is
        NOT deleted automatically and ``wordcloud_img`` is whatever
        ``generate_wordcloud`` returns.
        ``(error_message, None)`` if any step raises.
    """
    try:
        df = pd.read_csv(file)

        # Filter by 'Fallback Message shown'
        df = df[df['Answer'] == 'Fallback Message shown']

        df = preprocess_data(df)
        # The fitted model returned alongside the frame is not needed here.
        df, _ = cluster_data(df, num_clusters=15)

        # value_counts() sorts by size descending, so index 0 is the largest
        # cluster; it is skipped and the next N clusters are kept.
        sorted_clusters = df['Cluster'].value_counts().index.tolist()
        top_n = int(num_clusters_to_display)
        filtered_clusters = sorted_clusters[1:top_n + 1]

        # Drop empty texts and unwanted clusters in one pass; .copy() makes
        # the result an independent frame so the column assignment below
        # does not trigger pandas' chained-assignment warning.
        keep = (df['texts'] != '') & df['Cluster'].isin(filtered_clusters)
        df = df[keep].copy()
        df['Cluster'] = pd.Categorical(
            df['Cluster'], categories=filtered_clusters, ordered=True
        )
        df = df.sort_values('Cluster')

        wordcloud_img = generate_wordcloud(df)

        # Write through the already-open handle instead of re-opening the
        # file by name, which fails on platforms that lock open temp files.
        with tempfile.NamedTemporaryFile(
            mode='w', delete=False, suffix=".csv", newline='', encoding='utf-8'
        ) as tmpfile:
            df.to_csv(tmpfile, index=False)
            csv_file_path = tmpfile.name

        return csv_file_path, wordcloud_img
    except Exception as e:
        # Top-level UI boundary: report the failure instead of crashing.
        # NOTE(review): the first return slot normally carries a file path;
        # confirm the interface's output component tolerates an error string.
        print(f"Error: {e}")
        return str(e), None
119
 
120
  interface = gr.Interface(
121
  fn=main,