Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -213,7 +213,9 @@ def text_processing_for_domain(text):
|
|
| 213 |
from sentence_transformers import SentenceTransformer
|
| 214 |
from sklearn.cluster import AgglomerativeClustering
|
| 215 |
from sklearn.metrics import silhouette_score
|
| 216 |
-
from bertopic import BERTopic
|
|
|
|
|
|
|
| 217 |
|
| 218 |
# def optimal_Problem_clustering(df, text_column='Problem_Description', new_column_name="Problem_Cluster" ,cluster_range=(30, 70)):
|
| 219 |
# def extract_problem_domains(df, text_column='Problem_Description', cluster_range=(10, 50), top_words=17):
|
|
@@ -239,30 +241,66 @@ def extract_problem_domains(df,
|
|
| 239 |
|
| 240 |
# Determine the optimal number of clusters
|
| 241 |
optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
|
| 242 |
-
|
| 243 |
-
#
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
cluster_representations = {}
|
| 249 |
-
for
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
|
|
|
|
|
|
| 256 |
|
| 257 |
# Map cluster labels to representative words
|
| 258 |
-
df["Problem_Cluster"] =
|
| 259 |
-
df['Problem_Category_Words'] = [cluster_representations
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
|
|
|
|
| 261 |
return df, optimal_n_clusters
|
| 262 |
|
| 263 |
|
| 264 |
|
| 265 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
|
| 267 |
|
| 268 |
# Usage
|
|
|
|
| 213 |
from sentence_transformers import SentenceTransformer
|
| 214 |
from sklearn.cluster import AgglomerativeClustering
|
| 215 |
from sklearn.metrics import silhouette_score
|
| 216 |
+
# from bertopic import BERTopic
|
| 217 |
+
from collections import Counter
|
| 218 |
+
|
| 219 |
|
| 220 |
# def optimal_Problem_clustering(df, text_column='Problem_Description', new_column_name="Problem_Cluster" ,cluster_range=(30, 70)):
|
| 221 |
# def extract_problem_domains(df, text_column='Problem_Description', cluster_range=(10, 50), top_words=17):
|
|
|
|
| 241 |
|
| 242 |
# Determine the optimal number of clusters
|
| 243 |
optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
|
| 244 |
+
|
| 245 |
+
# Perform clustering with the optimal number of clusters
|
| 246 |
+
clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
|
| 247 |
+
cluster_labels = clustering.fit_predict(embeddings)
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
# # BERTopic for topic modelling
|
| 251 |
+
# # topic_model = BERTopic(num_topics=optimal_n_clusters)
|
| 252 |
+
# # topics, _ = topic_model.fit_transform(df[text_column].tolist())
|
| 253 |
+
# topic_model = BERTopic()
|
| 254 |
+
# topics, _ = topic_model.fit_transform(df[text_column].tolist())
|
| 255 |
+
# topic_model.reduce_topics(df[text_column].tolist(), nr_topics=optimal_n_clusters)
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
# # Get representative words for each cluster
|
| 259 |
+
# cluster_representations = {}
|
| 260 |
+
# for i in range(optimal_n_clusters):
|
| 261 |
+
# # cluster_representations[i] = topic_model.get_topic_info(i)['words'][:top_words]
|
| 262 |
+
# cluster_representations[i] = topic_model.get_topic_info(i).get('words', [])[:top_words]
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
# Get representative words for each cluster (without BERTopic)
|
| 267 |
cluster_representations = {}
|
| 268 |
+
for i in range(optimal_n_clusters):
|
| 269 |
+
# Use the most common words in each cluster as representative words
|
| 270 |
+
cluster_words = df.loc[cluster_labels == i, text_column].str.cat(sep=' ').split()
|
| 271 |
+
cluster_representations[i] = [word for word, _ in Counter(cluster_words).most_common(top_words)]
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
|
| 277 |
|
| 278 |
# Map cluster labels to representative words
|
| 279 |
+
df["Problem_Cluster"] = cluster_labels
|
| 280 |
+
df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
|
| 281 |
+
|
| 282 |
+
# # Print clusters and their representative words
|
| 283 |
+
# for cluster_label, words in cluster_representations.items():
|
| 284 |
+
# print(f"Domain {cluster_label}: {', '.join(words)}")
|
| 285 |
+
|
| 286 |
+
# return df.assign(cluster=cluster_labels), optimal_n_clusters
|
| 287 |
|
| 288 |
+
# df[new_column_name] = clustering.fit_predict(embeddings)
|
| 289 |
return df, optimal_n_clusters
|
| 290 |
|
| 291 |
|
| 292 |
|
| 293 |
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
|
| 304 |
|
| 305 |
|
| 306 |
# Usage
|