Spaces:

SantanuBanerjee
/

TaxDirection

Sleeping

App Files Files Community

SantanuBanerjee committed on Aug 4, 2024

Commit

887a7c1

verified ·

1 Parent(s): 9d1c4e8

Update app.py

Browse files

Files changed (1) hide show

app.py +140 -25

app.py CHANGED Viewed

@@ -261,6 +261,140 @@ def text_processing_for_domain(text):
 # Hyperparameter Tuning: Include a parameter to adjust the number of top words to display for each cluster.
 from sentence_transformers import SentenceTransformer
 from sklearn.cluster import AgglomerativeClustering, KMeans
@@ -268,18 +402,15 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics import silhouette_score
 from bertopic import BERTopic
 from collections import Counter
 def extract_problem_domains(df,
                             text_column='Processed_ProblemDescription_forDomainExtraction',
-                            # text_column='Problem_Description',
                             cluster_range=(5, 15),
                             top_words=10,
-                            # method='sentence_transformers'
                             method='tfidf_kmeans'
                            ):
     console_messages.append("Extracting Problem Domains...")
     if method == 'sentence_transformers':
@@ -339,32 +470,19 @@ def extract_problem_domains(df,
         feature_names = vectorizer.get_feature_names_out()
         cluster_representations = {}
         for i in range(optimal_n_clusters):
-            # center = kmeans.cluster_centers_[i]
-            # # print(f"top_words: {top_words}, type: {type(top_words)}")
-            # # print(f"center.argsort(): {center.argsort()}, type: {type(center.argsort())}")
-            # console_messages.append(f"top_words: {top_words}, type: {type(top_words)}")
-            # console_messages.append(f"center.argsort(): {center.argsort()}, type: {type(center.argsort())}")
-            # # top_word_indices = center.argsort()[-top_words:][::-1]
-            # top_word_indices = center.argsort()[-top_words:][::-1].tolist() # Indexes of top words
-            # top_words = [feature_names[index] for index in top_word_indices]
-            # cluster_representations[i] = top_words
             try:
                 center = kmeans.cluster_centers_[i]
                 console_messages.append(f"Processing cluster {i}")
                 console_messages.append(f"Center shape: {center.shape}, type: {type(center)}")
-                if not isinstance(center, np.ndarray):
                     center = np.array(center)
                 # Remove NaN values
-                center = center[~np.isnan(center)]
-                sorted_indices = np.array(center.argsort())
                 top_word_indices = sorted_indices[-top_words:][::-1]
@@ -381,8 +499,6 @@ def extract_problem_domains(df,
                 console_messages.append(f"Error processing cluster {i}: {str(e)}")
                 console_messages.append(f"Center: {center}")
         console_messages.append(f"Number of clusters: {optimal_n_clusters}")
         console_messages.append(f"Sample cluster words: {cluster_representations[0][:5]}...")
@@ -390,10 +506,9 @@ def extract_problem_domains(df,
     df["Problem_Cluster"] = cluster_labels
     df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
-    # console_messages.append("Returning from Problem Domain Extraction function.")
     console_messages.append("Problem Domain Extraction completed.")
     return df, optimal_n_clusters
 # Usage
 # clustered_df, optimal_n_clusters = extract_problem_domains(processed_df)

 # Hyperparameter Tuning: Include a parameter to adjust the number of top words to display for each cluster.
+# From here Sanban
+# from sentence_transformers import SentenceTransformer
+# from sklearn.cluster import AgglomerativeClustering, KMeans
+# from sklearn.feature_extraction.text import TfidfVectorizer
+# from sklearn.metrics import silhouette_score
+# from bertopic import BERTopic
+# from collections import Counter
+# def extract_problem_domains(df,
+#                             text_column='Processed_ProblemDescription_forDomainExtraction',
+#                             # text_column='Problem_Description',
+#                             cluster_range=(5, 15),
+#                             top_words=10,
+#                             # method='sentence_transformers'
+#                             method='tfidf_kmeans'
+#                            ):
+#     console_messages.append("Extracting Problem Domains...")
+#     if method == 'sentence_transformers':
+#         # Sentence Transformers approach
+#         model = SentenceTransformer('all-mpnet-base-v2')
+#         embeddings = model.encode(df[text_column].tolist())
+#         # Perform hierarchical clustering with Silhouette Analysis
+#         silhouette_scores = []
+#         for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
+#             clustering = AgglomerativeClustering(n_clusters=n_clusters)
+#             cluster_labels = clustering.fit_predict(embeddings)
+#             silhouette_avg = silhouette_score(embeddings, cluster_labels)
+#             silhouette_scores.append(silhouette_avg)
+#         # Determine the optimal number of clusters
+#         optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
+#         # Perform clustering with the optimal number of clusters
+#         clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
+#         cluster_labels = clustering.fit_predict(embeddings)
+#     elif method == 'tfidf_kmeans':
+#         # TF-IDF Vectorization and K-Means approach
+#         vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
+#         X = vectorizer.fit_transform(df[text_column])
+#         # Perform K-Means clustering with Silhouette Analysis
+#         silhouette_scores = []
+#         for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
+#             kmeans = KMeans(n_clusters=n_clusters)#, random_state=42)
+#             cluster_labels = kmeans.fit_predict(X)
+#             silhouette_avg = silhouette_score(X, cluster_labels)
+#             silhouette_scores.append(silhouette_avg)
+#         # Determine the optimal number of clusters
+#         optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
+#         # Perform final clustering with optimal number of clusters
+#         kmeans = KMeans(n_clusters=optimal_n_clusters) #, random_state=42)
+#         cluster_labels = kmeans.fit_predict(X)
+#     # # BERTopic approach (commented out)
+#     console_messages.append("BERT is currently commented...")
+#     # topic_model = BERTopic()
+#     # topics, _ = topic_model.fit_transform(df[text_column].tolist())
+#     # topic_model.reduce_topics(df[text_column].tolist(), nr_topics=optimal_n_clusters)
+#     # cluster_labels = topics
+#     # Get representative words for each cluster
+#     if method == 'sentence_transformers':
+#         cluster_representations = {}
+#         for i in range(optimal_n_clusters):
+#             cluster_words = df.loc[cluster_labels == i, text_column].str.cat(sep=' ').split()
+#             cluster_representations[i] = [word for word, _ in Counter(cluster_words).most_common(top_words)]
+#     elif method == 'tfidf_kmeans':
+#         feature_names = vectorizer.get_feature_names_out()
+#         cluster_representations = {}
+#         for i in range(optimal_n_clusters):
+#             # center = kmeans.cluster_centers_[i]
+#             # # print(f"top_words: {top_words}, type: {type(top_words)}")
+#             # # print(f"center.argsort(): {center.argsort()}, type: {type(center.argsort())}")
+#             # console_messages.append(f"top_words: {top_words}, type: {type(top_words)}")
+#             # console_messages.append(f"center.argsort(): {center.argsort()}, type: {type(center.argsort())}")
+#             # # top_word_indices = center.argsort()[-top_words:][::-1]
+#             # top_word_indices = center.argsort()[-top_words:][::-1].tolist() # Indexes of top words
+#             # top_words = [feature_names[index] for index in top_word_indices]
+#             # cluster_representations[i] = top_words
+#             try:
+#                 center = kmeans.cluster_centers_[i]
+#                 console_messages.append(f"Processing cluster {i}")
+#                 console_messages.append(f"Center shape: {center.shape}, type: {type(center)}")
+#                 if not isinstance(center, np.ndarray):
+#                     center = np.array(center)
+#                 # Remove NaN values
+#                 center = center[~np.isnan(center)]
+#                 sorted_indices = np.array(center.argsort())
+#                 top_word_indices = sorted_indices[-top_words:][::-1]
+#                 # Check for valid indices
+#                 if np.any(top_word_indices < 0) or np.any(top_word_indices >= len(feature_names)):
+#                     console_messages.append(f"Invalid top word indices for cluster {i}")
+#                     continue
+#                 top_words = [feature_names[index] for index in top_word_indices]
+#                 console_messages.append(f"Top words: {top_words}")
+#                 cluster_representations[i] = top_words
+#             except Exception as e:
+#                 console_messages.append(f"Error processing cluster {i}: {str(e)}")
+#                 console_messages.append(f"Center: {center}")
+#         console_messages.append(f"Number of clusters: {optimal_n_clusters}")
+#         console_messages.append(f"Sample cluster words: {cluster_representations[0][:5]}...")
+#     # Map cluster labels to representative words
+#     df["Problem_Cluster"] = cluster_labels
+#     df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
+#     # console_messages.append("Returning from Problem Domain Extraction function.")
+#     console_messages.append("Problem Domain Extraction completed.")
+#     return df, optimal_n_clusters
+# Till here sanban
 from sentence_transformers import SentenceTransformer
 from sklearn.cluster import AgglomerativeClustering, KMeans
 from sklearn.metrics import silhouette_score
 from bertopic import BERTopic
 from collections import Counter
+import numpy as np
 def extract_problem_domains(df,
                             text_column='Processed_ProblemDescription_forDomainExtraction',
                             cluster_range=(5, 15),
                             top_words=10,
                             method='tfidf_kmeans'
                            ):
     console_messages.append("Extracting Problem Domains...")
     if method == 'sentence_transformers':
         feature_names = vectorizer.get_feature_names_out()
         cluster_representations = {}
         for i in range(optimal_n_clusters):
             try:
                 center = kmeans.cluster_centers_[i]
                 console_messages.append(f"Processing cluster {i}")
                 console_messages.append(f"Center shape: {center.shape}, type: {type(center)}")
+                if isinstance(center, list):
                     center = np.array(center)
                 # Replace NaN values with 0 so indices stay aligned with feature_names
+                if np.any(np.isnan(center)):
+                    center = np.nan_to_num(center)
+                sorted_indices = np.argsort(center)
                 top_word_indices = sorted_indices[-top_words:][::-1]
                 console_messages.append(f"Error processing cluster {i}: {str(e)}")
                 console_messages.append(f"Center: {center}")
         console_messages.append(f"Number of clusters: {optimal_n_clusters}")
         console_messages.append(f"Sample cluster words: {cluster_representations[0][:5]}...")
     df["Problem_Cluster"] = cluster_labels
     df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
     console_messages.append("Problem Domain Extraction completed.")
     return df, optimal_n_clusters
 # Usage
 # clustered_df, optimal_n_clusters = extract_problem_domains(processed_df)