tanish78 commited on
Commit
29692d0
·
verified ·
1 Parent(s): c6ae83d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -25
app.py CHANGED
@@ -8,7 +8,6 @@ import re
8
  from io import BytesIO
9
 
10
  def preprocess_data(df):
11
-
12
  # Renaming the 'Queries' column to 'texts'
13
  df.rename(columns={'Queries': 'texts'}, inplace=True)
14
 
@@ -21,7 +20,6 @@ def preprocess_data(df):
21
  # Remove URL from text
22
  df['texts'] = df['texts'].apply(lambda text: re.sub(r'https?://\S+|www\.\S+', '', text))
23
 
24
-
25
  # Remove emojis from text
26
  def remove_emoji(string):
27
  emoji_pattern = re.compile("["
@@ -98,7 +96,6 @@ def preprocess_data(df):
98
  for phrase in remove_phrases:
99
  df['texts'] = df['texts'].str.replace(phrase, '')
100
 
101
-
102
  # Drop rows containing any general words from response and its variations
103
  general_variations = ["good morning", "good evening", "good afternoon", "good night", "done", "sorry", "top", "query",
104
  "stop", "sir", "sure", "oh", "wow", "aaa", "maam", "mam", "ma'am","i'm all set","ask a question","apply the survey",
@@ -128,12 +125,9 @@ def preprocess_data(df):
128
  df['texts'] = df['texts'].apply(lambda x: x.strip()) # Remove leading and trailing whitespaces
129
  df = df[df['texts'] != '']
130
 
131
-
132
-
133
  return df
134
 
135
  def cluster_data(df, num_clusters=5):
136
-
137
  # Vectorize the text data
138
  vectorizer = TfidfVectorizer(stop_words='english')
139
  X = vectorizer.fit_transform(df['texts'])
@@ -149,10 +143,8 @@ def cluster_data(df, num_clusters=5):
149
  df['PCA1'] = principal_components[:, 0]
150
  df['PCA2'] = principal_components[:, 1]
151
 
152
-
153
  return df
154
 
155
-
156
  def visualize_clusters(df):
157
  plt.figure(figsize=(10, 6))
158
  scatter = plt.scatter(df['PCA1'], df['PCA2'], c=df['Cluster'], cmap='viridis')
@@ -162,31 +154,25 @@ def visualize_clusters(df):
162
  plt.ylabel('PCA Component 2')
163
  plt.show()
164
 
165
-
166
  def main(file, num_clusters):
167
  try:
168
  df = pd.read_excel(file)
169
  df = preprocess_data(df)
170
  df = cluster_data(df, num_clusters)
171
  visualize_clusters(df)
172
-
173
- output = BytesIO()
174
- df.to_csv(output, index=False)
175
- output.seek(0)
176
-
177
- return output
178
  except Exception as e:
179
- return str(e)
 
180
 
181
- interface = gr.Interface(
182
  fn=main,
183
- inputs=[
184
- gr.File(label="Upload Excel File (.xlsx)"),
185
- gr.Number(value=5, label="Number of Clusters")
186
- ],
187
- outputs=gr.File(label="Download Clustered Data as CSV"),
188
- title="Unanswered User Queries Clustering",
189
- description="Upload an Excel file (.xlsx)"
190
  )
191
 
192
- interface.launch()
 
 
8
  from io import BytesIO
9
 
10
  def preprocess_data(df):
 
11
  # Renaming the 'Queries' column to 'texts'
12
  df.rename(columns={'Queries': 'texts'}, inplace=True)
13
 
 
20
  # Remove URL from text
21
  df['texts'] = df['texts'].apply(lambda text: re.sub(r'https?://\S+|www\.\S+', '', text))
22
 
 
23
  # Remove emojis from text
24
  def remove_emoji(string):
25
  emoji_pattern = re.compile("["
 
96
  for phrase in remove_phrases:
97
  df['texts'] = df['texts'].str.replace(phrase, '')
98
 
 
99
  # Drop rows containing any general words from response and its variations
100
  general_variations = ["good morning", "good evening", "good afternoon", "good night", "done", "sorry", "top", "query",
101
  "stop", "sir", "sure", "oh", "wow", "aaa", "maam", "mam", "ma'am","i'm all set","ask a question","apply the survey",
 
125
  df['texts'] = df['texts'].apply(lambda x: x.strip()) # Remove leading and trailing whitespaces
126
  df = df[df['texts'] != '']
127
 
 
 
128
  return df
129
 
130
  def cluster_data(df, num_clusters=5):
 
131
  # Vectorize the text data
132
  vectorizer = TfidfVectorizer(stop_words='english')
133
  X = vectorizer.fit_transform(df['texts'])
 
143
  df['PCA1'] = principal_components[:, 0]
144
  df['PCA2'] = principal_components[:, 1]
145
 
 
146
  return df
147
 
 
148
  def visualize_clusters(df):
149
  plt.figure(figsize=(10, 6))
150
  scatter = plt.scatter(df['PCA1'], df['PCA2'], c=df['Cluster'], cmap='viridis')
 
154
  plt.ylabel('PCA Component 2')
155
  plt.show()
156
 
 
157
def main(file, num_clusters):
    """Cluster user queries from an uploaded Excel file.

    Reads the spreadsheet, cleans the query text, K-Means-clusters it,
    renders the cluster scatter plot, and writes the clustered data to a
    CSV for download.

    Parameters
    ----------
    file : file-like or str
        The uploaded .xlsx file as handed over by the Gradio File input.
    num_clusters : int or float
        Number of clusters to form (Gradio's Number component delivers a
        float, so it is coerced to int here).

    Returns
    -------
    str
        Path to a temporary CSV file on success. On failure, the full
        traceback as a string so the error is visible in the UI.
    """
    try:
        import tempfile  # local import, matching the file's style for traceback below

        df = pd.read_excel(file)
        df = preprocess_data(df)
        # gr.Number yields a float; KMeans requires an integer n_clusters.
        df = cluster_data(df, int(num_clusters))
        visualize_clusters(df)

        # A gr.File output expects a filesystem path, not an in-memory
        # BytesIO object -- persist the CSV to a temp file and return its path.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".csv", delete=False, newline=""
        ) as tmp:
            df.to_csv(tmp, index=False)
            return tmp.name
    except Exception:
        # Best-effort error reporting: surface the full traceback in the UI
        # rather than swallowing the failure.
        import traceback
        return traceback.format_exc()
170
 
171
# Gradio UI: upload an .xlsx, pick the cluster count, download the clustered CSV.
# NOTE: the gr.inputs.* / gr.outputs.* namespaces were removed in Gradio 3.x;
# the top-level component classes (gr.File, gr.Number) are the supported API.
iface = gr.Interface(
    fn=main,
    inputs=[
        gr.File(label="Upload an Excel File (.xlsx)"),
        # Default of 5 matches cluster_data's num_clusters=5 default.
        gr.Number(value=5, label="Number of Clusters"),
    ],
    outputs=gr.File(label="Download Clustered Data as CSV"),
    title="Unanswered User Queries Clustering",
    description="Upload an Excel file (.xlsx)",
)

if __name__ == "__main__":
    iface.launch()