tanish78 committed on
Commit
1a4e870
·
verified ·
1 Parent(s): a554929

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -10
app.py CHANGED
@@ -107,8 +107,13 @@ def preprocess_data(df):
107
 
108
  return df
109
 
110
- def cluster_data(df):
111
- num_clusters = 15 # Set the number of clusters to 15
 
 
 
 
 
112
  vectorizer = TfidfVectorizer(stop_words='english')
113
  X = vectorizer.fit_transform(df['texts'])
114
 
@@ -134,16 +139,18 @@ def visualize_clusters(df):
134
 
135
  def main(file, num_clusters_to_display):
136
  try:
137
- # Check the file extension to determine how to read the file
138
- if file.name.endswith('.xlsx'):
139
  df = pd.read_excel(file)
140
- elif file.name.endswith('.csv'):
 
141
  df = pd.read_csv(file)
 
142
  else:
143
- return "Unsupported file format. Please upload a .xlsx or .csv file."
144
-
145
- df = preprocess_data(df)
146
- df = cluster_data(df)
147
  visualize_clusters(df)
148
 
149
  cluster_sizes = df['Cluster'].value_counts()
@@ -168,7 +175,7 @@ def main(file, num_clusters_to_display):
168
  interface = gr.Interface(
169
  fn=main,
170
  inputs=[
171
- gr.File(label="Upload Excel or CSV File (.xlsx, .csv)"),
172
  gr.Slider(1, 10, step=1, label="Number of Categories to Display")
173
  ],
174
  outputs=gr.File(label="Clustered Data CSV"),
 
107
 
108
  return df
109
 
110
+ def preprocess_csv_data(df):
111
+ df = df[df['Answer'] == 'Fallback Message shown']
112
+ df.rename(columns={'Question': 'texts'}, inplace=True)
113
+ df['texts'] = df['texts'].astype(str)
114
+ return preprocess_data(df)
115
+
116
+ def cluster_data(df, num_clusters):
117
  vectorizer = TfidfVectorizer(stop_words='english')
118
  X = vectorizer.fit_transform(df['texts'])
119
 
 
139
 
140
  def main(file, num_clusters_to_display):
141
  try:
142
+ file_ext = file.name.split('.')[-1].lower()
143
+ if file_ext == 'xlsx':
144
  df = pd.read_excel(file)
145
+ df = preprocess_data(df)
146
+ elif file_ext == 'csv':
147
  df = pd.read_csv(file)
148
+ df = preprocess_csv_data(df)
149
  else:
150
+ return "Unsupported file format. Please upload an Excel (.xlsx) or CSV (.csv) file."
151
+
152
+ num_clusters = 10 # Set the number of clusters
153
+ df = cluster_data(df, num_clusters)
154
  visualize_clusters(df)
155
 
156
  cluster_sizes = df['Cluster'].value_counts()
 
175
  interface = gr.Interface(
176
  fn=main,
177
  inputs=[
178
+ gr.File(label="Upload Excel or CSV File (.xlsx or .csv)"),
179
  gr.Slider(1, 10, step=1, label="Number of Categories to Display")
180
  ],
181
  outputs=gr.File(label="Clustered Data CSV"),