Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -107,22 +107,8 @@ def preprocess_data(df):
|
|
| 107 |
|
| 108 |
return df
|
| 109 |
|
| 110 |
-
def
|
| 111 |
-
|
| 112 |
-
K = range(1, 20)
|
| 113 |
-
for k in K:
|
| 114 |
-
kmeanModel = KMeans(n_clusters=k)
|
| 115 |
-
kmeanModel.fit(X)
|
| 116 |
-
distortions.append(kmeanModel.inertia_)
|
| 117 |
-
|
| 118 |
-
plt.figure(figsize=(10, 6))
|
| 119 |
-
plt.plot(K, distortions, 'bx-')
|
| 120 |
-
plt.xlabel('k')
|
| 121 |
-
plt.ylabel('Distortion')
|
| 122 |
-
plt.title('The Elbow Method showing the optimal k')
|
| 123 |
-
plt.show()
|
| 124 |
-
|
| 125 |
-
def cluster_data(df, num_clusters=15):
|
| 126 |
vectorizer = TfidfVectorizer(stop_words='english')
|
| 127 |
X = vectorizer.fit_transform(df['texts'])
|
| 128 |
|
|
@@ -131,45 +117,54 @@ def cluster_data(df, num_clusters=15):
|
|
| 131 |
df['Cluster'] = kmeans.labels_
|
| 132 |
|
| 133 |
pca = PCA(n_components=2)
|
| 134 |
-
|
| 135 |
-
|
|
|
|
| 136 |
|
| 137 |
-
|
| 138 |
-
y_axis = [o[1] for o in scatter_plot_points]
|
| 139 |
|
|
|
|
| 140 |
plt.figure(figsize=(10, 6))
|
| 141 |
-
plt.scatter(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
plt.show()
|
| 143 |
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
|
| 151 |
-
|
|
|
|
|
|
|
|
|
|
| 152 |
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
df = cluster_data(df, num_clusters=15)
|
| 157 |
-
df = sort_and_filter_clusters(df, num_display_clusters)
|
| 158 |
-
|
| 159 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
|
| 160 |
-
df.to_csv(tmp.name, index=False)
|
| 161 |
-
tmp.seek(0)
|
| 162 |
-
return tmp.name
|
| 163 |
|
| 164 |
-
|
| 165 |
-
|
|
|
|
| 166 |
|
| 167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
interface = gr.Interface(
|
| 170 |
fn=main,
|
| 171 |
-
inputs=[
|
| 172 |
-
|
|
|
|
|
|
|
|
|
|
| 173 |
title="Unanswered User Queries Clustering",
|
| 174 |
description="Upload an Excel file (.xlsx) and select the number of largest clusters to display (excluding cluster 0)"
|
| 175 |
)
|
|
|
|
| 107 |
|
| 108 |
return df
|
| 109 |
|
| 110 |
+
def cluster_data(df):
|
| 111 |
+
num_clusters = 20 # Set the number of clusters to 20
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
vectorizer = TfidfVectorizer(stop_words='english')
|
| 113 |
X = vectorizer.fit_transform(df['texts'])
|
| 114 |
|
|
|
|
| 117 |
df['Cluster'] = kmeans.labels_
|
| 118 |
|
| 119 |
pca = PCA(n_components=2)
|
| 120 |
+
principal_components = pca.fit_transform(X.toarray())
|
| 121 |
+
df['PCA1'] = principal_components[:, 0]
|
| 122 |
+
df['PCA2'] = principal_components[:, 1]
|
| 123 |
|
| 124 |
+
return df
|
|
|
|
| 125 |
|
| 126 |
+
def visualize_clusters(df):
    """Draw a 2-D scatter plot of the clustered rows.

    Args:
        df: DataFrame providing the 'PCA1' and 'PCA2' columns (used as the
            x/y coordinates) and the 'Cluster' column (used as the colour
            value of each point).

    Returns:
        None. The figure is displayed with ``plt.show()`` and then closed.
    """
    fig = plt.figure(figsize=(10, 6))
    try:
        scatter = plt.scatter(df['PCA1'], df['PCA2'], c=df['Cluster'], cmap='viridis')
        # legend_elements() yields one (handle, label) pair per colour value.
        plt.legend(*scatter.legend_elements(), title="Clusters")
        plt.title('Clusters of User Queries')
        plt.xlabel('PCA Component 1')
        plt.ylabel('PCA Component 2')
        plt.show()
    finally:
        # pyplot keeps every figure registered until it is closed explicitly;
        # without this, each call leaves another figure alive in memory.
        plt.close(fig)
|
| 134 |
|
| 135 |
+
def main(file, num_clusters_to_display):
    """Cluster the uploaded queries and export the largest clusters as CSV.

    Args:
        file: Excel workbook to load; handed straight to ``pd.read_excel``.
        num_clusters_to_display: How many of the largest clusters (cluster 0
            excluded) to keep. Integer-valued floats are accepted; ``None``
            keeps every cluster.

    Returns:
        The path of a temporary ``.csv`` file holding the filtered rows, or,
        when any step raises, the exception message as a string.
    """
    try:
        df = pd.read_excel(file)
        df = preprocess_data(df)
        df = cluster_data(df)
        visualize_clusters(df)

        # value_counts() lists the labels from most to least frequent, so the
        # index order is the "largest cluster first" order.
        cluster_sizes = df['Cluster'].value_counts()
        sorted_clusters = cluster_sizes.index.tolist()
        df['Cluster'] = pd.Categorical(df['Cluster'], categories=sorted_clusters, ordered=True)
        df = df.sort_values('Cluster')

        # Filter out cluster 0 and get the largest clusters.
        filtered_clusters = [cluster for cluster in sorted_clusters if cluster != 0]
        # The slider may deliver a float (e.g. 3.0), which is not a valid
        # slice bound; None keeps its "no limit" slicing meaning.
        limit = None if num_clusters_to_display is None else int(num_clusters_to_display)
        top_clusters = filtered_clusters[:limit]

        # .copy() makes the filtered frame independent, so the column
        # assignment below does not trigger a chained-assignment warning.
        df = df[df['Cluster'].isin(top_clusters)].copy()
        df['Cluster'] = pd.Categorical(df['Cluster'], categories=top_clusters, ordered=True)
        df = df.sort_values('Cluster')

        # delete=False: the file must outlive this function because only its
        # path is returned. Nothing here removes it afterwards.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmpfile:
            df.to_csv(tmpfile.name, index=False)
            return tmpfile.name
    except Exception as e:
        # NOTE(review): the Interface wires this function to a gr.File output,
        # so this message is handed to a file component rather than shown as
        # text. Kept as-is to preserve the existing return contract; consider
        # surfacing errors through Gradio's error mechanism instead.
        return str(e)
|
| 160 |
|
| 161 |
# Gradio front end: an Excel upload and a cluster-count slider are passed to
# main(); whatever main() returns is offered through the file output.
interface = gr.Interface(
    title="Unanswered User Queries Clustering",
    description="Upload an Excel file (.xlsx) and select the number of largest clusters to display (excluding cluster 0)",
    fn=main,
    inputs=[
        gr.File(label="Upload Excel File (.xlsx)"),
        gr.Slider(minimum=1, maximum=10, step=1, label="Number of Categories to Display"),
    ],
    outputs=gr.File(label="Clustered Data CSV"),
)
|