tanish78 committed on
Commit
a48299a
·
verified ·
1 Parent(s): faeddba

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -38
app.py CHANGED
@@ -107,8 +107,22 @@ def preprocess_data(df):
107
 
108
  return df
109
 
110
- def cluster_data(df):
111
- num_clusters = 15 # Set the number of clusters to 15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  vectorizer = TfidfVectorizer(stop_words='english')
113
  X = vectorizer.fit_transform(df['texts'])
114
 
@@ -117,54 +131,45 @@ def cluster_data(df):
117
  df['Cluster'] = kmeans.labels_
118
 
119
  pca = PCA(n_components=2)
120
- principal_components = pca.fit_transform(X.toarray())
121
- df['PCA1'] = principal_components[:, 0]
122
- df['PCA2'] = principal_components[:, 1]
123
 
124
- return df
 
125
 
126
- def visualize_clusters(df):
127
  plt.figure(figsize=(10, 6))
128
- scatter = plt.scatter(df['PCA1'], df['PCA2'], c=df['Cluster'], cmap='viridis')
129
- plt.legend(*scatter.legend_elements(), title="Clusters")
130
- plt.title('Clusters of User Queries')
131
- plt.xlabel('PCA Component 1')
132
- plt.ylabel('PCA Component 2')
133
  plt.show()
134
 
135
- def main(file, num_clusters_to_display):
136
- try:
137
- df = pd.read_excel(file)
138
- df = preprocess_data(df)
139
- df = cluster_data(df)
140
- visualize_clusters(df)
141
 
142
- cluster_sizes = df['Cluster'].value_counts()
143
- sorted_clusters = cluster_sizes.index.tolist()
144
- df['Cluster'] = pd.Categorical(df['Cluster'], categories=sorted_clusters, ordered=True)
145
- df = df.sort_values('Cluster')
146
 
147
- # Filter out cluster 0 and get the largest clusters
148
- filtered_clusters = [cluster for cluster in sorted_clusters if cluster != 0]
149
- top_clusters = filtered_clusters[:num_clusters_to_display]
 
 
 
 
 
 
 
150
 
151
- df = df[df['Cluster'].isin(top_clusters)]
152
- df['Cluster'] = pd.Categorical(df['Cluster'], categories=top_clusters, ordered=True)
153
- df = df.sort_values('Cluster')
154
 
155
- with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
156
- df.to_csv(tmpfile.name, index=False)
157
- return tmpfile.name
158
- except Exception as e:
159
- return str(e)
160
 
161
  interface = gr.Interface(
162
  fn=main,
163
- inputs=[
164
- gr.File(label="Upload Excel File (.xlsx)"),
165
- gr.Slider(1, 10, step=1, label="Number of Largest Clusters to Display")
166
- ],
167
- outputs=gr.File(label="Clustered Data CSV"),
168
  title="Unanswered User Queries Clustering",
169
  description="Upload an Excel file (.xlsx) and select the number of largest clusters to display (excluding cluster 0)"
170
  )
 
107
 
108
  return df
109
 
110
def elbow_method(X, max_k=20):
    """Plot K-Means inertia against k to help choose the number of clusters.

    One KMeans model is fitted per candidate k and its ``inertia_`` is
    recorded; the "elbow" of the resulting curve suggests a reasonable k.

    Args:
        X: Feature matrix with one row per sample (anything exposing
            ``.shape`` that ``KMeans.fit`` accepts).
        max_k: Exclusive upper bound on the candidate k values. The default
            of 20 keeps the original 1..19 sweep.

    Returns:
        The list of inertia values, one per k tried (k = 1, 2, ...).
    """
    # KMeans refuses n_clusters > n_samples, so cap the sweep for small inputs.
    n_samples = X.shape[0]
    candidate_ks = range(1, min(max_k, n_samples + 1))

    distortions = []
    for k in candidate_ks:
        # A fixed seed makes the curve reproducible between runs; n_init is
        # spelled out so the result does not depend on the library default.
        model = KMeans(n_clusters=k, n_init=10, random_state=42)
        model.fit(X)
        distortions.append(model.inertia_)

    plt.figure(figsize=(10, 6))
    plt.plot(candidate_ks, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method showing the optimal k')
    plt.show()

    return distortions
124
+
125
+ def cluster_data(df, num_clusters=15):
126
  vectorizer = TfidfVectorizer(stop_words='english')
127
  X = vectorizer.fit_transform(df['texts'])
128
 
 
131
  df['Cluster'] = kmeans.labels_
132
 
133
  pca = PCA(n_components=2)
134
+ scatter_plot_points = pca.fit_transform(X.toarray())
135
+ colors = ['r', 'b', 'c', 'y', 'm', 'g']
 
136
 
137
+ x_axis = [o[0] for o in scatter_plot_points]
138
+ y_axis = [o[1] for o in scatter_plot_points]
139
 
 
140
  plt.figure(figsize=(10, 6))
141
+ plt.scatter(x_axis, y_axis, c=[colors[d] for d in kmeans.labels_])
 
 
 
 
142
  plt.show()
143
 
144
+ return df
145
+
146
def sort_and_filter_clusters(df, num_display_clusters):
    """Keep only the rows of the largest clusters, ordered largest cluster first.

    Cluster 0 is always excluded. Rows are grouped by cluster in descending
    order of cluster size; within a cluster the original row order is kept.

    Args:
        df: DataFrame with a ``'Cluster'`` column of cluster labels.
        num_display_clusters: Number of largest clusters to keep. Coerced to
            ``int`` because UI sliders may deliver a float; values below 1
            yield an empty result.

    Returns:
        A new DataFrame (a copy, so callers can modify it freely) containing
        only the rows of the selected clusters.
    """
    limit = max(int(num_display_clusters), 0)

    # value_counts() already returns sizes in descending order.
    cluster_sizes = df['Cluster'].value_counts()
    largest_clusters = cluster_sizes[cluster_sizes.index != 0].head(limit).index

    filtered_df = df[df['Cluster'].isin(largest_clusters)].copy()

    # Rank each kept cluster by size and stable-sort the rows on that rank, so
    # the output is grouped largest-first without shuffling rows in a cluster.
    size_rank = {cluster: rank for rank, cluster in enumerate(largest_clusters)}
    row_order = filtered_df['Cluster'].map(size_rank).to_numpy().argsort(kind='stable')

    return filtered_df.iloc[row_order]
 
 
 
152
 
153
def main(file, num_display_clusters):
    """Cluster the uploaded spreadsheet and export the largest clusters as CSV.

    Args:
        file: The uploaded Excel file, given either as a filesystem path or as
            an object exposing the path through ``.name``.
            NOTE(review): which form Gradio passes depends on its version;
            both are accepted here.
        num_display_clusters: How many of the largest clusters to keep.

    Returns:
        Path of a temporary ``.csv`` file holding the filtered rows. The file
        is created with ``delete=False`` and is not removed by this function.

    Raises:
        ValueError: If no file was uploaded.
    """
    if file is None:
        raise ValueError("No file uploaded; please provide an Excel (.xlsx) file.")

    excel_path = getattr(file, "name", file)
    df = pd.read_excel(excel_path)
    df = preprocess_data(df)
    df = cluster_data(df, num_clusters=15)
    df = sort_and_filter_clusters(df, num_display_clusters)

    # Only reserve the path here and close the handle before writing: a
    # NamedTemporaryFile that is still open cannot be reopened by name on
    # Windows, which is what to_csv(path) does.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
        csv_path = tmp.name
    df.to_csv(csv_path, index=False)

    return csv_path
163
 
164
# UI components. The top-level gr.File / gr.Slider classes are used instead of
# the deprecated gr.inputs.* / gr.outputs.* namespaces, and the slider's
# initial position is given with `value=` (the replacement for `default=`).
input_file = gr.File(label="Upload an Excel file (.xlsx)")
num_display_clusters = gr.Slider(
    minimum=1,
    maximum=10,
    step=1,
    value=5,
    label="Number of Largest Clusters to Display",
)
output_file = gr.File(label="Clustered Data CSV")

# Wire the components to main(): the uploaded file and the slider value go
# in, and the path of the generated CSV comes out.
interface = gr.Interface(
    fn=main,
    inputs=[input_file, num_display_clusters],
    outputs=output_file,
    title="Unanswered User Queries Clustering",
    description="Upload an Excel file (.xlsx) and select the number of largest clusters to display (excluding cluster 0)"
)