Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -216,7 +216,11 @@ from sklearn.metrics import silhouette_score
|
|
| 216 |
from bertopic import BERTopic
|
| 217 |
|
| 218 |
# def optimal_Problem_clustering(df, text_column='Problem_Description', new_column_name="Problem_Cluster" ,cluster_range=(30, 70)):
|
| 219 |
-
def extract_problem_domains(df, text_column='Problem_Description', cluster_range=(10, 50), top_words=17):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
|
| 221 |
# Select Model (can we also optimize model selection automatically?)
|
| 222 |
# model = SentenceTransformer('all-MiniLM-L6-v2')
|
|
@@ -235,38 +239,27 @@ def extract_problem_domains(df, text_column='Problem_Description', cluster_range
|
|
| 235 |
|
| 236 |
# Determine the optimal number of clusters
|
| 237 |
optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
|
| 238 |
-
|
| 239 |
-
#
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
# BERTopic for topic modelling
|
| 245 |
-
# topic_model = BERTopic(num_topics=optimal_n_clusters)
|
| 246 |
-
# topics, _ = topic_model.fit_transform(df[text_column].tolist())
|
| 247 |
-
topic_model = BERTopic()
|
| 248 |
-
topics, _ = topic_model.fit_transform(df[text_column].tolist())
|
| 249 |
-
topic_model.reduce_topics(df[text_column].tolist(), nr_topics=optimal_n_clusters)
|
| 250 |
-
|
| 251 |
-
|
| 252 |
# Get representative words for each cluster
|
| 253 |
cluster_representations = {}
|
| 254 |
-
for
|
| 255 |
-
|
| 256 |
-
cluster_representations[
|
| 257 |
|
| 258 |
# Map cluster labels to representative words
|
| 259 |
-
df["Problem_Cluster"] =
|
| 260 |
-
df['Problem_Category_Words'] = [cluster_representations[
|
| 261 |
|
| 262 |
-
|
| 263 |
-
# for cluster_label, words in cluster_representations.items():
|
| 264 |
-
# print(f"Domain {cluster_label}: {', '.join(words)}")
|
| 265 |
|
| 266 |
-
# return df.assign(cluster=cluster_labels), optimal_n_clusters
|
| 267 |
|
| 268 |
-
|
| 269 |
-
|
|
|
|
| 270 |
|
| 271 |
# Usage
|
| 272 |
# clustered_df, optimal_n_clusters = optimal_Problem_clustering(processed_df)
|
|
|
|
| 216 |
from bertopic import BERTopic
|
| 217 |
|
| 218 |
# def optimal_Problem_clustering(df, text_column='Problem_Description', new_column_name="Problem_Cluster" ,cluster_range=(30, 70)):
|
| 219 |
+
# def extract_problem_domains(df, text_column='Problem_Description', cluster_range=(10, 50), top_words=17):
|
| 220 |
+
def extract_problem_domains(df,
|
| 221 |
+
text_column='Problem_Description',
|
| 222 |
+
cluster_range=(10, 50),
|
| 223 |
+
top_words=17):
|
| 224 |
|
| 225 |
# Select Model (can we also optimize model selection automatically?)
|
| 226 |
# model = SentenceTransformer('all-MiniLM-L6-v2')
|
|
|
|
| 239 |
|
| 240 |
# Determine the optimal number of clusters
|
| 241 |
optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
|
| 242 |
+
|
| 243 |
+
# BERTopic for topic modeling
|
| 244 |
+
topic_model = BERTopic(nr_topics=optimal_n_clusters)
|
| 245 |
+
topics, probs = topic_model.fit_transform(df[text_column].tolist())
|
| 246 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
# Get representative words for each cluster
|
| 248 |
cluster_representations = {}
|
| 249 |
+
for topic in range(len(topic_model.get_topic_info())):
|
| 250 |
+
words, _ = zip(*topic_model.get_topic(topic))
|
| 251 |
+
cluster_representations[topic] = list(words)[:top_words]
|
| 252 |
|
| 253 |
# Map cluster labels to representative words
|
| 254 |
+
df["Problem_Cluster"] = topics
|
| 255 |
+
df['Problem_Category_Words'] = [cluster_representations.get(label, []) for label in topics]
|
| 256 |
|
| 257 |
+
return df, optimal_n_clusters
|
|
|
|
|
|
|
| 258 |
|
|
|
|
| 259 |
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
|
| 263 |
|
| 264 |
# Usage
|
| 265 |
# clustered_df, optimal_n_clusters = optimal_Problem_clustering(processed_df)
|