Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -261,6 +261,140 @@ def text_processing_for_domain(text):
|
|
| 261 |
# Hyperparameter Tuning: Include a parameter to adjust the number of top words to display for each cluster.
|
| 262 |
|
| 263 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
|
| 265 |
from sentence_transformers import SentenceTransformer
|
| 266 |
from sklearn.cluster import AgglomerativeClustering, KMeans
|
|
@@ -268,18 +402,15 @@ from sklearn.feature_extraction.text import TfidfVectorizer
|
|
| 268 |
from sklearn.metrics import silhouette_score
|
| 269 |
from bertopic import BERTopic
|
| 270 |
from collections import Counter
|
| 271 |
-
|
| 272 |
|
| 273 |
def extract_problem_domains(df,
|
| 274 |
text_column='Processed_ProblemDescription_forDomainExtraction',
|
| 275 |
-
# text_column='Problem_Description',
|
| 276 |
cluster_range=(5, 15),
|
| 277 |
top_words=10,
|
| 278 |
-
# method='sentence_transformers'
|
| 279 |
method='tfidf_kmeans'
|
| 280 |
):
|
| 281 |
|
| 282 |
-
|
| 283 |
console_messages.append("Extracting Problem Domains...")
|
| 284 |
|
| 285 |
if method == 'sentence_transformers':
|
|
@@ -339,32 +470,19 @@ def extract_problem_domains(df,
|
|
| 339 |
feature_names = vectorizer.get_feature_names_out()
|
| 340 |
cluster_representations = {}
|
| 341 |
for i in range(optimal_n_clusters):
|
| 342 |
-
# center = kmeans.cluster_centers_[i]
|
| 343 |
-
|
| 344 |
-
# # print(f"top_words: {top_words}, type: {type(top_words)}")
|
| 345 |
-
# # print(f"center.argsort(): {center.argsort()}, type: {type(center.argsort())}")
|
| 346 |
-
|
| 347 |
-
# console_messages.append(f"top_words: {top_words}, type: {type(top_words)}")
|
| 348 |
-
# console_messages.append(f"center.argsort(): {center.argsort()}, type: {type(center.argsort())}")
|
| 349 |
-
|
| 350 |
-
# # top_word_indices = center.argsort()[-top_words:][::-1]
|
| 351 |
-
# top_word_indices = center.argsort()[-top_words:][::-1].tolist() # Indexes of top words
|
| 352 |
-
|
| 353 |
-
# top_words = [feature_names[index] for index in top_word_indices]
|
| 354 |
-
# cluster_representations[i] = top_words
|
| 355 |
-
|
| 356 |
try:
|
| 357 |
center = kmeans.cluster_centers_[i]
|
| 358 |
console_messages.append(f"Processing cluster {i}")
|
| 359 |
console_messages.append(f"Center shape: {center.shape}, type: {type(center)}")
|
| 360 |
|
| 361 |
-
if
|
| 362 |
center = np.array(center)
|
| 363 |
|
| 364 |
# Remove NaN values
|
| 365 |
-
|
|
|
|
| 366 |
|
| 367 |
-
sorted_indices = np.
|
| 368 |
|
| 369 |
top_word_indices = sorted_indices[-top_words:][::-1]
|
| 370 |
|
|
@@ -381,8 +499,6 @@ def extract_problem_domains(df,
|
|
| 381 |
console_messages.append(f"Error processing cluster {i}: {str(e)}")
|
| 382 |
console_messages.append(f"Center: {center}")
|
| 383 |
|
| 384 |
-
|
| 385 |
-
|
| 386 |
console_messages.append(f"Number of clusters: {optimal_n_clusters}")
|
| 387 |
console_messages.append(f"Sample cluster words: {cluster_representations[0][:5]}...")
|
| 388 |
|
|
@@ -390,10 +506,9 @@ def extract_problem_domains(df,
|
|
| 390 |
df["Problem_Cluster"] = cluster_labels
|
| 391 |
df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
|
| 392 |
|
| 393 |
-
# console_messages.append("Returning from Problem Domain Extraction function.")
|
| 394 |
console_messages.append("Problem Domain Extraction completed.")
|
| 395 |
return df, optimal_n_clusters
|
| 396 |
-
|
| 397 |
|
| 398 |
# Usage
|
| 399 |
# clustered_df, optimal_n_clusters = extract_problem_domains(processed_df)
|
|
|
|
| 261 |
# Hyperparameter Tuning: Include a parameter to adjust the number of top words to display for each cluster.
|
| 262 |
|
| 263 |
|
| 264 |
+
# Start of commented-out previous version of extract_problem_domains (Sanban)
|
| 265 |
+
# from sentence_transformers import SentenceTransformer
|
| 266 |
+
# from sklearn.cluster import AgglomerativeClustering, KMeans
|
| 267 |
+
# from sklearn.feature_extraction.text import TfidfVectorizer
|
| 268 |
+
# from sklearn.metrics import silhouette_score
|
| 269 |
+
# from bertopic import BERTopic
|
| 270 |
+
# from collections import Counter
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
# def extract_problem_domains(df,
|
| 274 |
+
# text_column='Processed_ProblemDescription_forDomainExtraction',
|
| 275 |
+
# # text_column='Problem_Description',
|
| 276 |
+
# cluster_range=(5, 15),
|
| 277 |
+
# top_words=10,
|
| 278 |
+
# # method='sentence_transformers'
|
| 279 |
+
# method='tfidf_kmeans'
|
| 280 |
+
# ):
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
# console_messages.append("Extracting Problem Domains...")
|
| 284 |
+
|
| 285 |
+
# if method == 'sentence_transformers':
|
| 286 |
+
# # Sentence Transformers approach
|
| 287 |
+
# model = SentenceTransformer('all-mpnet-base-v2')
|
| 288 |
+
# embeddings = model.encode(df[text_column].tolist())
|
| 289 |
+
|
| 290 |
+
# # Perform hierarchical clustering with Silhouette Analysis
|
| 291 |
+
# silhouette_scores = []
|
| 292 |
+
# for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
|
| 293 |
+
# clustering = AgglomerativeClustering(n_clusters=n_clusters)
|
| 294 |
+
# cluster_labels = clustering.fit_predict(embeddings)
|
| 295 |
+
# silhouette_avg = silhouette_score(embeddings, cluster_labels)
|
| 296 |
+
# silhouette_scores.append(silhouette_avg)
|
| 297 |
+
|
| 298 |
+
# # Determine the optimal number of clusters
|
| 299 |
+
# optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
|
| 300 |
+
|
| 301 |
+
# # Perform clustering with the optimal number of clusters
|
| 302 |
+
# clustering = AgglomerativeClustering(n_clusters=optimal_n_clusters)
|
| 303 |
+
# cluster_labels = clustering.fit_predict(embeddings)
|
| 304 |
+
|
| 305 |
+
# elif method == 'tfidf_kmeans':
|
| 306 |
+
# # TF-IDF Vectorization and K-Means approach
|
| 307 |
+
# vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
|
| 308 |
+
# X = vectorizer.fit_transform(df[text_column])
|
| 309 |
+
|
| 310 |
+
# # Perform K-Means clustering with Silhouette Analysis
|
| 311 |
+
# silhouette_scores = []
|
| 312 |
+
# for n_clusters in range(cluster_range[0], cluster_range[1] + 1):
|
| 313 |
+
# kmeans = KMeans(n_clusters=n_clusters)#, random_state=42)
|
| 314 |
+
# cluster_labels = kmeans.fit_predict(X)
|
| 315 |
+
# silhouette_avg = silhouette_score(X, cluster_labels)
|
| 316 |
+
# silhouette_scores.append(silhouette_avg)
|
| 317 |
+
|
| 318 |
+
# # Determine the optimal number of clusters
|
| 319 |
+
# optimal_n_clusters = cluster_range[0] + silhouette_scores.index(max(silhouette_scores))
|
| 320 |
+
|
| 321 |
+
# # Perform final clustering with optimal number of clusters
|
| 322 |
+
# kmeans = KMeans(n_clusters=optimal_n_clusters) #, random_state=42)
|
| 323 |
+
# cluster_labels = kmeans.fit_predict(X)
|
| 324 |
+
|
| 325 |
+
# # # BERTopic approach (commented out)
|
| 326 |
+
# console_messages.append("BERT is currently commented...")
|
| 327 |
+
# # topic_model = BERTopic()
|
| 328 |
+
# # topics, _ = topic_model.fit_transform(df[text_column].tolist())
|
| 329 |
+
# # topic_model.reduce_topics(df[text_column].tolist(), nr_topics=optimal_n_clusters)
|
| 330 |
+
# # cluster_labels = topics
|
| 331 |
+
|
| 332 |
+
# # Get representative words for each cluster
|
| 333 |
+
# if method == 'sentence_transformers':
|
| 334 |
+
# cluster_representations = {}
|
| 335 |
+
# for i in range(optimal_n_clusters):
|
| 336 |
+
# cluster_words = df.loc[cluster_labels == i, text_column].str.cat(sep=' ').split()
|
| 337 |
+
# cluster_representations[i] = [word for word, _ in Counter(cluster_words).most_common(top_words)]
|
| 338 |
+
# elif method == 'tfidf_kmeans':
|
| 339 |
+
# feature_names = vectorizer.get_feature_names_out()
|
| 340 |
+
# cluster_representations = {}
|
| 341 |
+
# for i in range(optimal_n_clusters):
|
| 342 |
+
# # center = kmeans.cluster_centers_[i]
|
| 343 |
+
|
| 344 |
+
# # # print(f"top_words: {top_words}, type: {type(top_words)}")
|
| 345 |
+
# # # print(f"center.argsort(): {center.argsort()}, type: {type(center.argsort())}")
|
| 346 |
+
|
| 347 |
+
# # console_messages.append(f"top_words: {top_words}, type: {type(top_words)}")
|
| 348 |
+
# # console_messages.append(f"center.argsort(): {center.argsort()}, type: {type(center.argsort())}")
|
| 349 |
+
|
| 350 |
+
# # # top_word_indices = center.argsort()[-top_words:][::-1]
|
| 351 |
+
# # top_word_indices = center.argsort()[-top_words:][::-1].tolist() # Indexes of top words
|
| 352 |
+
|
| 353 |
+
# # top_words = [feature_names[index] for index in top_word_indices]
|
| 354 |
+
# # cluster_representations[i] = top_words
|
| 355 |
+
|
| 356 |
+
# try:
|
| 357 |
+
# center = kmeans.cluster_centers_[i]
|
| 358 |
+
# console_messages.append(f"Processing cluster {i}")
|
| 359 |
+
# console_messages.append(f"Center shape: {center.shape}, type: {type(center)}")
|
| 360 |
+
|
| 361 |
+
# if not isinstance(center, np.ndarray):
|
| 362 |
+
# center = np.array(center)
|
| 363 |
+
|
| 364 |
+
# # Remove NaN values
|
| 365 |
+
# center = center[~np.isnan(center)]
|
| 366 |
+
|
| 367 |
+
# sorted_indices = np.array(center.argsort())
|
| 368 |
+
|
| 369 |
+
# top_word_indices = sorted_indices[-top_words:][::-1]
|
| 370 |
+
|
| 371 |
+
# # Check for valid indices
|
| 372 |
+
# if np.any(top_word_indices < 0) or np.any(top_word_indices >= len(feature_names)):
|
| 373 |
+
# console_messages.append(f"Invalid top word indices for cluster {i}")
|
| 374 |
+
# continue
|
| 375 |
+
|
| 376 |
+
# top_words = [feature_names[index] for index in top_word_indices]
|
| 377 |
+
# console_messages.append(f"Top words: {top_words}")
|
| 378 |
+
# cluster_representations[i] = top_words
|
| 379 |
+
|
| 380 |
+
# except Exception as e:
|
| 381 |
+
# console_messages.append(f"Error processing cluster {i}: {str(e)}")
|
| 382 |
+
# console_messages.append(f"Center: {center}")
|
| 383 |
+
|
| 384 |
+
|
| 385 |
+
|
| 386 |
+
# console_messages.append(f"Number of clusters: {optimal_n_clusters}")
|
| 387 |
+
# console_messages.append(f"Sample cluster words: {cluster_representations[0][:5]}...")
|
| 388 |
+
|
| 389 |
+
# # Map cluster labels to representative words
|
| 390 |
+
# df["Problem_Cluster"] = cluster_labels
|
| 391 |
+
# df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
|
| 392 |
+
|
| 393 |
+
# # console_messages.append("Returning from Problem Domain Extraction function.")
|
| 394 |
+
# console_messages.append("Problem Domain Extraction completed.")
|
| 395 |
+
# return df, optimal_n_clusters
|
| 396 |
+
# End of commented-out previous version of extract_problem_domains (Sanban)
|
| 397 |
+
|
| 398 |
|
| 399 |
from sentence_transformers import SentenceTransformer
|
| 400 |
from sklearn.cluster import AgglomerativeClustering, KMeans
|
|
|
|
| 402 |
from sklearn.metrics import silhouette_score
|
| 403 |
from bertopic import BERTopic
|
| 404 |
from collections import Counter
|
| 405 |
+
import numpy as np
|
| 406 |
|
| 407 |
def extract_problem_domains(df,
|
| 408 |
text_column='Processed_ProblemDescription_forDomainExtraction',
|
|
|
|
| 409 |
cluster_range=(5, 15),
|
| 410 |
top_words=10,
|
|
|
|
| 411 |
method='tfidf_kmeans'
|
| 412 |
):
|
| 413 |
|
|
|
|
| 414 |
console_messages.append("Extracting Problem Domains...")
|
| 415 |
|
| 416 |
if method == 'sentence_transformers':
|
|
|
|
| 470 |
feature_names = vectorizer.get_feature_names_out()
|
| 471 |
cluster_representations = {}
|
| 472 |
for i in range(optimal_n_clusters):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 473 |
try:
|
| 474 |
center = kmeans.cluster_centers_[i]
|
| 475 |
console_messages.append(f"Processing cluster {i}")
|
| 476 |
console_messages.append(f"Center shape: {center.shape}, type: {type(center)}")
|
| 477 |
|
| 478 |
+
if isinstance(center, list):
|
| 479 |
center = np.array(center)
|
| 480 |
|
| 481 |
# Replace NaN values with 0 (np.nan_to_num keeps the array length unchanged)
|
| 482 |
+
if np.any(np.isnan(center)):
|
| 483 |
+
center = np.nan_to_num(center)
|
| 484 |
|
| 485 |
+
sorted_indices = np.argsort(center)
|
| 486 |
|
| 487 |
top_word_indices = sorted_indices[-top_words:][::-1]
|
| 488 |
|
|
|
|
| 499 |
console_messages.append(f"Error processing cluster {i}: {str(e)}")
|
| 500 |
console_messages.append(f"Center: {center}")
|
| 501 |
|
|
|
|
|
|
|
| 502 |
console_messages.append(f"Number of clusters: {optimal_n_clusters}")
|
| 503 |
console_messages.append(f"Sample cluster words: {cluster_representations[0][:5]}...")
|
| 504 |
|
|
|
|
| 506 |
df["Problem_Cluster"] = cluster_labels
|
| 507 |
df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
|
| 508 |
|
|
|
|
| 509 |
console_messages.append("Problem Domain Extraction completed.")
|
| 510 |
return df, optimal_n_clusters
|
| 511 |
+
|
| 512 |
|
| 513 |
# Usage
|
| 514 |
# clustered_df, optimal_n_clusters = extract_problem_domains(processed_df)
|