tanish78 committed on
Commit
cdb1b12
·
verified ·
1 Parent(s): a48299a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -43
app.py CHANGED
@@ -107,22 +107,8 @@ def preprocess_data(df):
107
 
108
  return df
109
 
110
- def elbow_method(X):
111
- distortions = []
112
- K = range(1, 20)
113
- for k in K:
114
- kmeanModel = KMeans(n_clusters=k)
115
- kmeanModel.fit(X)
116
- distortions.append(kmeanModel.inertia_)
117
-
118
- plt.figure(figsize=(10, 6))
119
- plt.plot(K, distortions, 'bx-')
120
- plt.xlabel('k')
121
- plt.ylabel('Distortion')
122
- plt.title('The Elbow Method showing the optimal k')
123
- plt.show()
124
-
125
- def cluster_data(df, num_clusters=15):
126
  vectorizer = TfidfVectorizer(stop_words='english')
127
  X = vectorizer.fit_transform(df['texts'])
128
 
@@ -131,45 +117,54 @@ def cluster_data(df, num_clusters=15):
131
  df['Cluster'] = kmeans.labels_
132
 
133
  pca = PCA(n_components=2)
134
- scatter_plot_points = pca.fit_transform(X.toarray())
135
- colors = ['r', 'b', 'c', 'y', 'm', 'g']
 
136
 
137
- x_axis = [o[0] for o in scatter_plot_points]
138
- y_axis = [o[1] for o in scatter_plot_points]
139
 
 
140
  plt.figure(figsize=(10, 6))
141
- plt.scatter(x_axis, y_axis, c=[colors[d] for d in kmeans.labels_])
 
 
 
 
142
  plt.show()
143
 
144
- return df
145
-
146
- def sort_and_filter_clusters(df, num_display_clusters):
147
- cluster_sizes = df['Cluster'].value_counts().sort_values(ascending=False)
148
- largest_clusters = cluster_sizes[cluster_sizes.index != 0].head(num_display_clusters).index
149
- filtered_df = df[df['Cluster'].isin(largest_clusters)]
150
 
151
- return filtered_df
 
 
 
152
 
153
- def main(file, num_display_clusters):
154
- df = pd.read_excel(file)
155
- df = preprocess_data(df)
156
- df = cluster_data(df, num_clusters=15)
157
- df = sort_and_filter_clusters(df, num_display_clusters)
158
-
159
- with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
160
- df.to_csv(tmp.name, index=False)
161
- tmp.seek(0)
162
- return tmp.name
163
 
164
- input_file = gr.inputs.File(label="Upload an Excel file (.xlsx)")
165
- num_display_clusters = gr.inputs.Slider(minimum=1, maximum=10, step=1, default=5, label="Number of Largest Clusters to Display")
 
166
 
167
- output_file = gr.outputs.File(label="Clustered Data CSV")
 
 
 
 
168
 
169
  interface = gr.Interface(
170
  fn=main,
171
- inputs=[input_file, num_display_clusters],
172
- outputs=output_file,
 
 
 
173
  title="Unanswered User Queries Clustering",
174
  description="Upload an Excel file (.xlsx) and select the number of largest clusters to display (excluding cluster 0)"
175
  )
 
107
 
108
  return df
109
 
110
+ def cluster_data(df):
111
+ num_clusters = 20 # Set the number of clusters to 20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  vectorizer = TfidfVectorizer(stop_words='english')
113
  X = vectorizer.fit_transform(df['texts'])
114
 
 
117
  df['Cluster'] = kmeans.labels_
118
 
119
  pca = PCA(n_components=2)
120
+ principal_components = pca.fit_transform(X.toarray())
121
+ df['PCA1'] = principal_components[:, 0]
122
+ df['PCA2'] = principal_components[:, 1]
123
 
124
+ return df
 
125
 
126
+ def visualize_clusters(df):
127
  plt.figure(figsize=(10, 6))
128
+ scatter = plt.scatter(df['PCA1'], df['PCA2'], c=df['Cluster'], cmap='viridis')
129
+ plt.legend(*scatter.legend_elements(), title="Clusters")
130
+ plt.title('Clusters of User Queries')
131
+ plt.xlabel('PCA Component 1')
132
+ plt.ylabel('PCA Component 2')
133
  plt.show()
134
 
135
+ def main(file, num_clusters_to_display):
136
+ try:
137
+ df = pd.read_excel(file)
138
+ df = preprocess_data(df)
139
+ df = cluster_data(df)
140
+ visualize_clusters(df)
141
 
142
+ cluster_sizes = df['Cluster'].value_counts()
143
+ sorted_clusters = cluster_sizes.index.tolist()
144
+ df['Cluster'] = pd.Categorical(df['Cluster'], categories=sorted_clusters, ordered=True)
145
+ df = df.sort_values('Cluster')
146
 
147
+ # Filter out cluster 0 and get the largest clusters
148
+ filtered_clusters = [cluster for cluster in sorted_clusters if cluster != 0]
149
+ top_clusters = filtered_clusters[:num_clusters_to_display]
 
 
 
 
 
 
 
150
 
151
+ df = df[df['Cluster'].isin(top_clusters)]
152
+ df['Cluster'] = pd.Categorical(df['Cluster'], categories=top_clusters, ordered=True)
153
+ df = df.sort_values('Cluster')
154
 
155
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
156
+ df.to_csv(tmpfile.name, index=False)
157
+ return tmpfile.name
158
+ except Exception as e:
159
+ return str(e)
160
 
161
  interface = gr.Interface(
162
  fn=main,
163
+ inputs=[
164
+ gr.File(label="Upload Excel File (.xlsx)"),
165
+ gr.Slider(1, 10, step=1, label="Number of Categories to Display")
166
+ ],
167
+ outputs=gr.File(label="Clustered Data CSV"),
168
  title="Unanswered User Queries Clustering",
169
  description="Upload an Excel file (.xlsx) and select the number of largest clusters to display (excluding cluster 0)"
170
  )