Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -107,8 +107,22 @@ def preprocess_data(df):
|
|
| 107 |
|
| 108 |
return df
|
| 109 |
|
| 110 |
-
def
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
vectorizer = TfidfVectorizer(stop_words='english')
|
| 113 |
X = vectorizer.fit_transform(df['texts'])
|
| 114 |
|
|
@@ -117,54 +131,45 @@ def cluster_data(df):
|
|
| 117 |
df['Cluster'] = kmeans.labels_
|
| 118 |
|
| 119 |
pca = PCA(n_components=2)
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
df['PCA2'] = principal_components[:, 1]
|
| 123 |
|
| 124 |
-
|
|
|
|
| 125 |
|
| 126 |
-
def visualize_clusters(df):
|
| 127 |
plt.figure(figsize=(10, 6))
|
| 128 |
-
|
| 129 |
-
plt.legend(*scatter.legend_elements(), title="Clusters")
|
| 130 |
-
plt.title('Clusters of User Queries')
|
| 131 |
-
plt.xlabel('PCA Component 1')
|
| 132 |
-
plt.ylabel('PCA Component 2')
|
| 133 |
plt.show()
|
| 134 |
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
|
| 142 |
-
|
| 143 |
-
sorted_clusters = cluster_sizes.index.tolist()
|
| 144 |
-
df['Cluster'] = pd.Categorical(df['Cluster'], categories=sorted_clusters, ordered=True)
|
| 145 |
-
df = df.sort_values('Cluster')
|
| 146 |
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
df = df.sort_values('Cluster')
|
| 154 |
|
| 155 |
-
|
| 156 |
-
df.to_csv(tmpfile.name, index=False)
|
| 157 |
-
return tmpfile.name
|
| 158 |
-
except Exception as e:
|
| 159 |
-
return str(e)
|
| 160 |
|
| 161 |
interface = gr.Interface(
|
| 162 |
fn=main,
|
| 163 |
-
inputs=[
|
| 164 |
-
|
| 165 |
-
gr.Slider(1, 10, step=1, label="Number of Largest Clusters to Display")
|
| 166 |
-
],
|
| 167 |
-
outputs=gr.File(label="Clustered Data CSV"),
|
| 168 |
title="Unanswered User Queries Clustering",
|
| 169 |
description="Upload an Excel file (.xlsx) and select the number of largest clusters to display (excluding cluster 0)"
|
| 170 |
)
|
|
|
|
| 107 |
|
| 108 |
return df
|
| 109 |
|
| 110 |
+
def elbow_method(X, max_k=20):
    """Plot KMeans inertia against the number of clusters (elbow method).

    Args:
        X: Feature matrix exposing ``shape``; rows are treated as samples
            (it is passed straight to ``KMeans.fit``).
        max_k: Exclusive upper bound of the sweep. The default of 20 keeps
            the original sweep of k = 1..19.

    Returns:
        The list of inertia values, one per k that was fitted.
    """
    # KMeans raises when n_clusters exceeds the number of samples, so cap the
    # sweep for small inputs instead of crashing part-way through the loop.
    n_samples = X.shape[0]
    K = range(1, min(max_k, n_samples + 1))

    # ``fit`` returns the fitted estimator, so inertia can be read directly.
    distortions = [KMeans(n_clusters=k).fit(X).inertia_ for k in K]

    plt.figure(figsize=(10, 6))
    plt.plot(K, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method showing the optimal k')
    plt.show()

    return distortions
|
| 124 |
+
|
| 125 |
+
def cluster_data(df, num_clusters=15):
|
| 126 |
vectorizer = TfidfVectorizer(stop_words='english')
|
| 127 |
X = vectorizer.fit_transform(df['texts'])
|
| 128 |
|
|
|
|
| 131 |
df['Cluster'] = kmeans.labels_
|
| 132 |
|
| 133 |
pca = PCA(n_components=2)
|
| 134 |
+
scatter_plot_points = pca.fit_transform(X.toarray())
|
| 135 |
+
colors = ['r', 'b', 'c', 'y', 'm', 'g']
|
|
|
|
| 136 |
|
| 137 |
+
x_axis = [o[0] for o in scatter_plot_points]
|
| 138 |
+
y_axis = [o[1] for o in scatter_plot_points]
|
| 139 |
|
|
|
|
| 140 |
plt.figure(figsize=(10, 6))
|
| 141 |
+
plt.scatter(x_axis, y_axis, c=[colors[d] for d in kmeans.labels_])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
plt.show()
|
| 143 |
|
| 144 |
+
return df
|
| 145 |
+
|
| 146 |
+
def sort_and_filter_clusters(df, num_display_clusters):
    """Keep the rows of the largest clusters, ordered by cluster size.

    Cluster 0 is always excluded from the selection.

    Args:
        df: DataFrame with a ``'Cluster'`` column of cluster labels.
        num_display_clusters: How many of the largest clusters to keep.
            Coerced to ``int`` because UI sliders may deliver a float.

    Returns:
        A DataFrame containing only rows of the selected clusters, with the
        largest cluster first; rows inside a cluster keep their input order.
    """
    cluster_sizes = df['Cluster'].value_counts().sort_values(ascending=False)
    largest_clusters = (
        cluster_sizes[cluster_sizes.index != 0]
        .head(int(num_display_clusters))
        .index
    )
    filtered_df = df[df['Cluster'].isin(largest_clusters)]

    # Rank each selected label by its position in the size ordering, then
    # sort on that rank. Mergesort is stable, so rows within one cluster stay
    # in their original relative order.
    rank = {label: position for position, label in enumerate(largest_clusters)}
    return filtered_df.sort_values(
        'Cluster',
        key=lambda labels: labels.map(rank),
        kind='mergesort',
    )
|
|
|
|
|
|
|
|
|
|
| 152 |
|
| 153 |
+
def main(file, num_display_clusters):
    """Cluster the uploaded spreadsheet and return the path of a CSV export.

    Args:
        file: The uploaded Excel file, passed as-is to ``pd.read_excel``.
        num_display_clusters: Number of largest clusters to keep in the
            export.

    Returns:
        Filesystem path of a temporary ``.csv`` file with the filtered rows.
        The file is created with ``delete=False`` so it outlives this call.
    """
    df = pd.read_excel(file)
    df = preprocess_data(df)
    df = cluster_data(df, num_clusters=15)
    df = sort_and_filter_clusters(df, num_display_clusters)

    # Only reserve the path here and close the handle before writing:
    # re-opening a still-open NamedTemporaryFile by name fails on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
        csv_path = tmp.name
    df.to_csv(csv_path, index=False)
    return csv_path
|
| 163 |
|
| 164 |
+
# Gradio UI wiring. Components come from the top-level ``gr`` namespace; the
# ``gr.inputs`` / ``gr.outputs`` namespaces are deprecated, and the slider's
# initial position is given with ``value`` rather than ``default``.
input_file = gr.File(label="Upload an Excel file (.xlsx)")
num_display_clusters = gr.Slider(
    minimum=1,
    maximum=10,
    step=1,
    value=5,
    label="Number of Largest Clusters to Display",
)
output_file = gr.File(label="Clustered Data CSV")

interface = gr.Interface(
    fn=main,
    inputs=[input_file, num_display_clusters],
    outputs=output_file,
    title="Unanswered User Queries Clustering",
    description="Upload an Excel file (.xlsx) and select the number of largest clusters to display (excluding cluster 0)"
)
|