Spaces:

SantanuBanerjee
/

TaxDirection

Sleeping

App Files Files Community

SantanuBanerjee committed on Aug 4, 2024

Commit

ee949ff

verified ·

1 Parent(s): 46a0ef6

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -26

app.py CHANGED Viewed

@@ -216,7 +216,11 @@ from sklearn.metrics import silhouette_score
 from bertopic import BERTopic
 # def optimal_Problem_clustering(df, text_column='Problem_Description', new_column_name="Problem_Cluster" ,cluster_range=(30, 70)):
-def extract_problem_domains(df, text_column='Problem_Description', cluster_range=(10, 50), top_words=17):
     # Select Model (can we also optimize model selection automatically?)
     # model = SentenceTransformer('all-MiniLM-L6-v2')
@@ -235,38 +239,27 @@ def extract_problem_domains(df, text_column='Problem_Description', cluster_range
     # Determine the optimal number of clusters
     optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
-    # Perform clustering with the optimal number of clusters
-    clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
-    cluster_labels = clustering.fit_predict(embeddings)
-    # BERTopic for topic modelling
-    # topic_model = BERTopic(num_topics=optimal_n_clusters)
-    # topics, _ = topic_model.fit_transform(df[text_column].tolist())
-    topic_model = BERTopic()
-    topics, _ = topic_model.fit_transform(df[text_column].tolist())
-    topic_model.reduce_topics(df[text_column].tolist(), nr_topics=optimal_n_clusters)
     # Get representative words for each cluster
     cluster_representations = {}
-    for i in range(optimal_n_clusters):
-        # cluster_representations[i] = topic_model.get_topic_info(i)['words'][:top_words]
-        cluster_representations[i] = topic_model.get_topic_info(i).get('words', [])[:top_words]
     # Map cluster labels to representative words
-    df["Problem_Cluster"] = cluster_labels
-    df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
-    # # Print clusters and their representative words
-    # for cluster_label, words in cluster_representations.items():
-    #     print(f"Domain {cluster_label}: {', '.join(words)}")
-    # return df.assign(cluster=cluster_labels), optimal_n_clusters
-    # df[new_column_name] = clustering.fit_predict(embeddings)
-    return df, optimal_n_clusters
 # Usage
 # clustered_df, optimal_n_clusters = optimal_Problem_clustering(processed_df)

 from bertopic import BERTopic
 # def optimal_Problem_clustering(df, text_column='Problem_Description', new_column_name="Problem_Cluster" ,cluster_range=(30, 70)):
+# def extract_problem_domains(df, text_column='Problem_Description', cluster_range=(10, 50), top_words=17):
+def extract_problem_domains(df,
+                            text_column='Problem_Description',
+                            cluster_range=(10, 50),
+                            top_words=17):
     # Select Model (can we also optimize model selection automatically?)
     # model = SentenceTransformer('all-MiniLM-L6-v2')
     # Determine the optimal number of clusters
     optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
+    # BERTopic for topic modeling
+    topic_model = BERTopic(nr_topics=optimal_n_clusters)
+    topics, probs = topic_model.fit_transform(df[text_column].tolist())
     # Get representative words for each cluster
     cluster_representations = {}
+    for topic in range(len(topic_model.get_topic_info())):
+        words, _ = zip(*topic_model.get_topic(topic))
+        cluster_representations[topic] = list(words)[:top_words]
     # Map cluster labels to representative words
+    df["Problem_Cluster"] = topics
+    df['Problem_Category_Words'] = [cluster_representations.get(label, []) for label in topics]
+    return df, optimal_n_clusters
 # Usage
 # clustered_df, optimal_n_clusters = optimal_Problem_clustering(processed_df)