tanish78 committed on
Commit
c108323
·
verified ·
1 Parent(s): 1a4e870

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -24
app.py CHANGED
@@ -107,13 +107,8 @@ def preprocess_data(df):
107
 
108
  return df
109
 
110
- def preprocess_csv_data(df):
111
- df = df[df['Answer'] == 'Fallback Message shown']
112
- df.rename(columns={'Question': 'texts'}, inplace=True)
113
- df['texts'] = df['texts'].astype(str)
114
- return preprocess_data(df)
115
-
116
- def cluster_data(df, num_clusters):
117
  vectorizer = TfidfVectorizer(stop_words='english')
118
  X = vectorizer.fit_transform(df['texts'])
119
 
@@ -139,18 +134,26 @@ def visualize_clusters(df):
139
 
140
  def main(file, num_clusters_to_display):
141
  try:
142
- file_ext = file.name.split('.')[-1].lower()
143
- if file_ext == 'xlsx':
 
 
 
144
  df = pd.read_excel(file)
145
- df = preprocess_data(df)
146
- elif file_ext == 'csv':
147
  df = pd.read_csv(file)
148
- df = preprocess_csv_data(df)
149
  else:
150
- return "Unsupported file format. Please upload an Excel (.xlsx) or CSV (.csv) file."
151
-
152
- num_clusters = 10 # Set the number of clusters
153
- df = cluster_data(df, num_clusters)
 
 
 
 
 
 
 
154
  visualize_clusters(df)
155
 
156
  cluster_sizes = df['Cluster'].value_counts()
@@ -166,21 +169,31 @@ def main(file, num_clusters_to_display):
166
  df['Cluster'] = pd.Categorical(df['Cluster'], categories=top_clusters, ordered=True)
167
  df = df.sort_values('Cluster')
168
 
 
169
  with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
170
  df.to_csv(tmpfile.name, index=False)
 
 
171
  return tmpfile.name
 
172
  except Exception as e:
173
- return str(e)
 
 
 
 
 
 
 
174
 
175
  interface = gr.Interface(
176
- fn=main,
177
  inputs=[
178
- gr.File(label="Upload Excel or CSV File (.xlsx or .csv)"),
179
- gr.Slider(1, 10, step=1, label="Number of Categories to Display")
180
  ],
181
- outputs=gr.File(label="Clustered Data CSV"),
182
- title="Unanswered User Queries Clustering",
183
- description="Upload an Excel or CSV file and select the number of largest clusters to display (excluding cluster 0)"
184
  )
185
 
186
- interface.launch()
 
107
 
108
  return df
109
 
110
+ def cluster_data(df):
111
+ num_clusters = 15 # Set the number of clusters to 15
 
 
 
 
 
112
  vectorizer = TfidfVectorizer(stop_words='english')
113
  X = vectorizer.fit_transform(df['texts'])
114
 
 
134
 
135
  def main(file, num_clusters_to_display):
136
  try:
137
+ # Determine the file type
138
+ file_extension = file.name.split('.')[-1].lower()
139
+
140
+ # Load the file
141
+ if file_extension == 'xlsx':
142
  df = pd.read_excel(file)
143
+ elif file_extension == 'csv':
 
144
  df = pd.read_csv(file)
 
145
  else:
146
+ return "Unsupported file type. Please upload an Excel or CSV file."
147
+
148
+ # Process CSV file specifically
149
+ if file_extension == 'csv':
150
+ # Keep only rows where 'Answer' is 'Fallback Message shown'
151
+ df = df[df['Answer'] == 'Fallback Message shown']
152
+ # Focus on 'Query' column for text processing
153
+ df.rename(columns={'Query': 'texts'}, inplace=True)
154
+
155
+ df = preprocess_data(df)
156
+ df = cluster_data(df)
157
  visualize_clusters(df)
158
 
159
  cluster_sizes = df['Cluster'].value_counts()
 
169
  df['Cluster'] = pd.Categorical(df['Cluster'], categories=top_clusters, ordered=True)
170
  df = df.sort_values('Cluster')
171
 
172
+ # Save the resulting DataFrame to a CSV file
173
  with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
174
  df.to_csv(tmpfile.name, index=False)
175
+ tmpfile.flush()
176
+
177
  return tmpfile.name
178
+
179
  except Exception as e:
180
+ return f"An error occurred: {str(e)}"
181
+
182
+ def upload_file(file, num_clusters_to_display):
183
+ result = main(file, num_clusters_to_display)
184
+ if result.endswith(".csv"):
185
+ return result
186
+ else:
187
+ return f"Error: {result}"
188
 
189
  interface = gr.Interface(
190
+ fn=upload_file,
191
  inputs=[
192
+ gr.inputs.File(label="Upload Excel or CSV File (.xlsx or .csv)", type="file"),
193
+ gr.inputs.Slider(minimum=1, maximum=20, step=1, default=5, label="Number of Categories to Display")
194
  ],
195
+ outputs=gr.outputs.File(label="Output CSV File"),
196
+ title="Unanswered User Queries Clustering"
 
197
  )
198
 
199
+ interface.launch(debug=True)