Spaces:

SantanuBanerjee
/

TaxDirection

Sleeping

App Files Community

SantanuBanerjee committed on Aug 4, 2024

Commit

0e7ae0f

verified ·

1 Parent(s): b187752

Update app.py

Browse files

Files changed (1) hide show

app.py +54 -16

app.py CHANGED Viewed

@@ -213,7 +213,9 @@ def text_processing_for_domain(text):
 from sentence_transformers import SentenceTransformer
 from sklearn.cluster import AgglomerativeClustering
 from sklearn.metrics import silhouette_score
-from bertopic import BERTopic
 # def optimal_Problem_clustering(df, text_column='Problem_Description', new_column_name="Problem_Cluster" ,cluster_range=(30, 70)):
 # def extract_problem_domains(df, text_column='Problem_Description', cluster_range=(10, 50), top_words=17):
@@ -239,30 +241,66 @@ def extract_problem_domains(df,
     # Determine the optimal number of clusters
     optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
-    # BERTopic for topic modeling
-    topic_model = BERTopic(nr_topics=optimal_n_clusters)
-    topics, probs = topic_model.fit_transform(df[text_column].tolist())
-    # Get representative words for each cluster
     cluster_representations = {}
-    for topic in range(len(topic_model.get_topic_info())):
-        topic_words = topic_model.get_topic(topic)
-        if isinstance(topic_words, list) and len(topic_words) > 0 and isinstance(topic_words[0], tuple):
-            words = [word for word, _ in topic_words[:top_words]]
-        else:
-            words = []
-        cluster_representations[topic] = words
     # Map cluster labels to representative words
-    df["Problem_Cluster"] = topics
-    df['Problem_Category_Words'] = [cluster_representations.get(label, []) for label in topics]
     return df, optimal_n_clusters
 # Usage

 from sentence_transformers import SentenceTransformer
 from sklearn.cluster import AgglomerativeClustering
 from sklearn.metrics import silhouette_score
+# from bertopic import BERTopic
+from collections import Counter
 # def optimal_Problem_clustering(df, text_column='Problem_Description', new_column_name="Problem_Cluster" ,cluster_range=(30, 70)):
 # def extract_problem_domains(df, text_column='Problem_Description', cluster_range=(10, 50), top_words=17):
     # Determine the optimal number of clusters
     optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
+    # Perform clustering with the optimal number of clusters
+    clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
+    cluster_labels = clustering.fit_predict(embeddings)
+    # # BERTopic for topic modelling
+    # # topic_model = BERTopic(num_topics=optimal_n_clusters)
+    # # topics, _ = topic_model.fit_transform(df[text_column].tolist())
+    # topic_model = BERTopic()
+    # topics, _ = topic_model.fit_transform(df[text_column].tolist())
+    # topic_model.reduce_topics(df[text_column].tolist(), nr_topics=optimal_n_clusters)
+    # # Get representative words for each cluster
+    # cluster_representations = {}
+    # for i in range(optimal_n_clusters):
+    #     # cluster_representations[i] = topic_model.get_topic_info(i)['words'][:top_words]
+    #     cluster_representations[i] = topic_model.get_topic_info(i).get('words', [])[:top_words]
+    # Get representative words for each cluster (without BERTopic)
     cluster_representations = {}
+    for i in range(optimal_n_clusters):
+        # Use the most common words in each cluster as representative words
+        cluster_words = df.loc[cluster_labels == i, text_column].str.cat(sep=' ').split()
+        cluster_representations[i] = [word for word, _ in Counter(cluster_words).most_common(top_words)]
     # Map cluster labels to representative words
+    df["Problem_Cluster"] = cluster_labels
+    df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
+    # # Print clusters and their representative words
+    # for cluster_label, words in cluster_representations.items():
+    #     print(f"Domain {cluster_label}: {', '.join(words)}")
+    # return df.assign(cluster=cluster_labels), optimal_n_clusters
+    # df[new_column_name] = clustering.fit_predict(embeddings)
     return df, optimal_n_clusters
 # Usage