BERTopic_AG_final

Running

App Files Files Community

anujjuna committed on Apr 26

Commit

c8c01fa

verified ·

1 Parent(s): 8c6e466

Update tools.py

Browse files

Files changed (1) hide show

tools.py +98 -36

tools.py CHANGED Viewed

@@ -2,15 +2,23 @@
 tools.py
 --------
 Topic modeling module using BERTopic for analyzing research paper abstracts and titles.
-Heavy imports are lazy-loaded inside functions to stay within 2GB RAM on free HF Spaces.
 """
 import re
 import logging
 import pandas as pd
-import numpy as np
 from typing import Optional
 # ---------------------------------------------------------------------------
 # Logging
 # ---------------------------------------------------------------------------
@@ -22,8 +30,6 @@ logger = logging.getLogger(__name__)
 # Setup
 # ---------------------------------------------------------------------------
 def _ensure_nltk_stopwords() -> None:
-    from nltk.corpus import stopwords
-    import nltk
     try:
         stopwords.words("english")
     except LookupError:
@@ -39,6 +45,7 @@ def load_csv(filepath: str) -> pd.DataFrame:
     missing = required_cols - set(df.columns.str.lower())
     if missing:
         raise ValueError(f"CSV is missing required column(s): {missing}")
     df.columns = df.columns.str.lower()
     logger.info("Loaded %d rows from '%s'.", len(df), filepath)
     return df
@@ -48,7 +55,6 @@ def load_csv(filepath: str) -> pd.DataFrame:
 # Preprocessing
 # ---------------------------------------------------------------------------
 def preprocess_text(texts: pd.Series) -> list[str]:
-    from nltk.corpus import stopwords
     _ensure_nltk_stopwords()
     stop_words = set(stopwords.words("english"))
@@ -67,10 +73,9 @@ def preprocess_text(texts: pd.Series) -> list[str]:
 # ---------------------------------------------------------------------------
 # Model Construction
 # ---------------------------------------------------------------------------
-def build_bertopic_model(embedding_model, min_topic_size: int = 5):
-    from bertopic import BERTopic
-    from umap import UMAP
-    from hdbscan import HDBSCAN
     umap_model = UMAP(
         n_neighbors=15,
@@ -80,6 +85,8 @@ def build_bertopic_model(embedding_model, min_topic_size: int = 5):
         random_state=42,
     )
     hdbscan_model = HDBSCAN(
         min_cluster_size=max(min_topic_size, 5),
         min_samples=2,
@@ -95,7 +102,7 @@ def build_bertopic_model(embedding_model, min_topic_size: int = 5):
         min_topic_size=max(min_topic_size, 5),
         verbose=False,
     )
-    logger.info("BERTopic model created (min_cluster_size=%d).", max(min_topic_size, 5))
     return model
@@ -110,8 +117,14 @@ def _get_cluster_sizes(topics: list[int]) -> dict[int, int]:
     return sizes
-def _split_large_cluster(topic_id, doc_indices, embeddings, topics, next_id):
-    from sklearn.cluster import KMeans
     if len(doc_indices) < 4:
         return next_id
     sub_embs = embeddings[doc_indices]
@@ -119,14 +132,19 @@ def _split_large_cluster(topic_id, doc_indices, embeddings, topics, next_id):
     labels = km.fit_predict(sub_embs)
     new_id = next_id
     for local_idx, global_idx in enumerate(doc_indices):
-        if labels[local_idx] == 1:
             topics[global_idx] = new_id
     logger.info("Split large cluster %d → kept %d, created %d.", topic_id, topic_id, new_id)
     return next_id + 1
-def _merge_small_cluster(topic_id, doc_indices, cluster_centroids, topics):
-    from sklearn.metrics.pairwise import cosine_similarity
     if not cluster_centroids:
         return
     src_centroid = cluster_centroids[topic_id].reshape(1, -1)
@@ -141,9 +159,25 @@ def _merge_small_cluster(topic_id, doc_indices, cluster_centroids, topics):
     logger.info("Merged small cluster %d → cluster %d.", topic_id, nearest)
-def balance_clusters(topics, documents, embedding_model, large_factor=2.0, small_threshold=3):
     try:
         embeddings = embedding_model.encode(documents, show_progress_bar=False)
         topics = list(topics)
         sizes = _get_cluster_sizes(topics)
         if not sizes:
@@ -153,51 +187,67 @@ def balance_clusters(topics, documents, embedding_model, large_factor=2.0, small
         median_size = float(np.median(counts))
         large_cutoff = large_factor * median_size
         cluster_docs: dict[int, list[int]] = {}
         for idx, tid in enumerate(topics):
             if tid != -1:
                 cluster_docs.setdefault(tid, []).append(idx)
-        centroids = {
             tid: embeddings[idxs].mean(axis=0)
             for tid, idxs in cluster_docs.items()
         }
         next_id = max(sizes.keys()) + 1
         for tid, size in list(sizes.items()):
             if size > large_cutoff:
-                next_id = _split_large_cluster(tid, cluster_docs[tid], embeddings, topics, next_id)
         sizes = _get_cluster_sizes(topics)
         cluster_docs = {}
         for idx, tid in enumerate(topics):
             if tid != -1:
                 cluster_docs.setdefault(tid, []).append(idx)
         for tid, size in list(sizes.items()):
             if size < small_threshold and tid in cluster_docs:
                 _merge_small_cluster(tid, cluster_docs[tid], centroids, topics)
         return topics
     except Exception as e:
-        logger.error("Cluster balancing error: %s", e)
         raise e
 # ---------------------------------------------------------------------------
 # Topic Extraction
 # ---------------------------------------------------------------------------
-def extract_topics(model, documents, embedding_model, label="documents") -> dict:
     valid_docs = [d if d.strip() else "empty" for d in documents]
     topics, _ = model.fit_transform(valid_docs)
     try:
         topics = balance_clusters(topics, valid_docs, embedding_model)
     except Exception as e:
-        logger.error("Cluster balancing failed (using original topics): %s", e)
-    topic_info = model.get_topic_info()
     topic_keywords: dict[int, list[tuple[str, float]]] = {}
     for topic_id in topic_info["Topic"].tolist():
@@ -207,9 +257,16 @@ def extract_topics(model, documents, embedding_model, label="documents") -> dict
         if words:
             topic_keywords[topic_id] = words
-    topic_freq = topic_info.set_index("Topic")["Count"].to_dict()
-    logger.info("Extracted %d topic(s) from %s.", len(topic_keywords), label)
     return {
         "topics": topics,
         "topic_info": topic_info,
@@ -219,28 +276,30 @@ def extract_topics(model, documents, embedding_model, label="documents") -> dict
 # ---------------------------------------------------------------------------
-# High-Level Pipeline  —  ALL heavy imports live here
 # ---------------------------------------------------------------------------
-def run_topic_modeling(filepath: str, min_topic_size: int = 5) -> dict:
-    # Heavy imports deferred to here so app.py startup stays lightweight
-    from sentence_transformers import SentenceTransformer
-    from bertopic import BERTopic  # noqa: F401  (ensures bertopic is cached)
     df = load_csv(filepath)
     clean_abstracts = preprocess_text(df["abstract"])
     clean_titles = preprocess_text(df["title"])
     embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
     abstract_model = build_bertopic_model(embedding_model, min_topic_size=min_topic_size)
-    title_model    = build_bertopic_model(embedding_model, min_topic_size=min_topic_size)
     abstract_results = extract_topics(abstract_model, clean_abstracts, embedding_model, label="abstracts")
-    title_results    = extract_topics(title_model,    clean_titles,    embedding_model, label="titles")
     return {
         "abstracts": abstract_results,
-        "titles":    title_results,
     }
@@ -253,15 +312,15 @@ def print_results(results: dict, top_n_keywords: int = 10) -> None:
         print(f"  Topic Modeling Results – {section.upper()}")
         print(f"{'='*60}")
-        keywords = data["topic_keywords"]
-        freq     = data["topic_freq"]
         if not keywords:
             print("  No topics found.")
             continue
         for topic_id, words in sorted(keywords.items()):
-            count  = freq.get(topic_id, 0)
             kw_str = ", ".join(w for w, _ in words[:top_n_keywords])
             print(f"\n  Topic {topic_id:>3}  |  docs: {count:>4}")
             print(f"  Keywords : {kw_str}")
@@ -276,10 +335,13 @@ def print_results(results: dict, top_n_keywords: int = 10) -> None:
 # ---------------------------------------------------------------------------
 if __name__ == "__main__":
     import sys
     if len(sys.argv) < 2:
         print("Usage: python tools.py <path_to_csv> [min_topic_size]")
         sys.exit(1)
     csv_path = sys.argv[1]
     mts = int(sys.argv[2]) if len(sys.argv) > 2 else 5
     output = run_topic_modeling(csv_path, min_topic_size=mts)
-    print_results(output)

 tools.py
 --------
 Topic modeling module using BERTopic for analyzing research paper abstracts and titles.
 """
 import re
 import logging
 import pandas as pd
 from typing import Optional
+from bertopic import BERTopic
+from sentence_transformers import SentenceTransformer
+from umap import UMAP
+from hdbscan import HDBSCAN                          # --- Cluster Balancing Logic ---
+from sklearn.cluster import KMeans
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+from nltk.corpus import stopwords
+import nltk
 # ---------------------------------------------------------------------------
 # Logging
 # ---------------------------------------------------------------------------
 # Setup
 # ---------------------------------------------------------------------------
 def _ensure_nltk_stopwords() -> None:
     try:
         stopwords.words("english")
     except LookupError:
     missing = required_cols - set(df.columns.str.lower())
     if missing:
         raise ValueError(f"CSV is missing required column(s): {missing}")
     df.columns = df.columns.str.lower()
     logger.info("Loaded %d rows from '%s'.", len(df), filepath)
     return df
 # Preprocessing
 # ---------------------------------------------------------------------------
 def preprocess_text(texts: pd.Series) -> list[str]:
     _ensure_nltk_stopwords()
     stop_words = set(stopwords.words("english"))
 # ---------------------------------------------------------------------------
 # Model Construction
 # ---------------------------------------------------------------------------
+def build_bertopic_model(embedding_model: SentenceTransformer, min_topic_size: int = 5) -> BERTopic:
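+    # Model construction: configure UMAP and HDBSCAN for the topic model.
+    # Cluster balancing is a separate post-processing step (see balance_clusters);
+    # embedding_model is passed in explicitly by run_topic_modeling.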
     umap_model = UMAP(
         n_neighbors=15,
         random_state=42,
     )
+    # Tuned HDBSCAN: min_cluster_size is floored at 5 (max(min_topic_size, 5)),
+    # and a low min_samples (2) makes the model less strict about labelling
+    # documents as noise.
     hdbscan_model = HDBSCAN(
         min_cluster_size=max(min_topic_size, 5),
         min_samples=2,
         min_topic_size=max(min_topic_size, 5),
         verbose=False,
     )
+    logger.info("BERTopic model created with tuned HDBSCAN (min_cluster_size=%d).", max(min_topic_size, 5))
     return model
     return sizes
+def _split_large_cluster(
+    topic_id: int,
+    doc_indices: list[int],
+    embeddings: np.ndarray,
+    topics: list[int],
+    next_id: int,
+) -> int:
+    """Split an oversized cluster into 2 sub-clusters via KMeans. Returns next available ID."""
     if len(doc_indices) < 4:
         return next_id
     sub_embs = embeddings[doc_indices]
     labels = km.fit_predict(sub_embs)
     new_id = next_id
     for local_idx, global_idx in enumerate(doc_indices):
+        if labels[local_idx] == 1:          # half goes to a new cluster ID
             topics[global_idx] = new_id
     logger.info("Split large cluster %d → kept %d, created %d.", topic_id, topic_id, new_id)
     return next_id + 1
+def _merge_small_cluster(
+    topic_id: int,
+    doc_indices: list[int],
+    cluster_centroids: dict[int, np.ndarray],
+    topics: list[int],
+) -> None:
+    """Merge a tiny cluster into the nearest cluster by cosine similarity."""
     if not cluster_centroids:
         return
     src_centroid = cluster_centroids[topic_id].reshape(1, -1)
     logger.info("Merged small cluster %d → cluster %d.", topic_id, nearest)
+def balance_clusters(
+    topics: list[int],
+    documents: list[str],
+    embedding_model: SentenceTransformer,
+    large_factor: float = 2.0,
+    small_threshold: int = 3,
+) -> list[int]:
+    """
+    --- Cluster Balancing Logic ---
+    Post-process HDBSCAN topic assignments to reduce extreme size imbalance.
+    - Splits clusters > large_factor × median size (via KMeans sub-split).
+    - Merges clusters < small_threshold into their nearest neighbour.
+    Does NOT enforce equal sizes.
+    """
     try:
+        # Embed every document once; the centroid computation and the split step
+        # below both operate on these vectors.
         embeddings = embedding_model.encode(documents, show_progress_bar=False)
         topics = list(topics)
         sizes = _get_cluster_sizes(topics)
         if not sizes:
         median_size = float(np.median(counts))
         large_cutoff = large_factor * median_size
+        # Build per-cluster document index lists
         cluster_docs: dict[int, list[int]] = {}
         for idx, tid in enumerate(topics):
             if tid != -1:
                 cluster_docs.setdefault(tid, []).append(idx)
+        # Compute centroids for merge step
+        centroids: dict[int, np.ndarray] = {
             tid: embeddings[idxs].mean(axis=0)
             for tid, idxs in cluster_docs.items()
         }
         next_id = max(sizes.keys()) + 1
+        # Split oversized clusters
         for tid, size in list(sizes.items()):
             if size > large_cutoff:
+                next_id = _split_large_cluster(
+                    tid, cluster_docs[tid], embeddings, topics, next_id
+                )
+        # Re-compute sizes after splits for merge step
         sizes = _get_cluster_sizes(topics)
         cluster_docs = {}
         for idx, tid in enumerate(topics):
             if tid != -1:
                 cluster_docs.setdefault(tid, []).append(idx)
+        # Merge undersized clusters
         for tid, size in list(sizes.items()):
             if size < small_threshold and tid in cluster_docs:
                 _merge_small_cluster(tid, cluster_docs[tid], centroids, topics)
         return topics
     except Exception as e:
+        print("Cluster balancing error:", e)
         raise e
 # ---------------------------------------------------------------------------
 # Topic Extraction
 # ---------------------------------------------------------------------------
+def extract_topics(
+    model: BERTopic,
+    documents: list[str],
+    embedding_model: SentenceTransformer,
+    label: str = "documents",
+) -> dict:
     valid_docs = [d if d.strip() else "empty" for d in documents]
     topics, _ = model.fit_transform(valid_docs)
+    # --- Cluster Balancing Logic ---
+    # Attempt to balance clusters but move ahead if it fails
     try:
         topics = balance_clusters(topics, valid_docs, embedding_model)
     except Exception as e:
+        logger.error("Cluster balancing failed (moving ahead with original topics): %s", e)
+    topic_info: pd.DataFrame = model.get_topic_info()
     topic_keywords: dict[int, list[tuple[str, float]]] = {}
     for topic_id in topic_info["Topic"].tolist():
         if words:
             topic_keywords[topic_id] = words
+    topic_freq: dict[int, int] = (
+        topic_info.set_index("Topic")["Count"].to_dict()
+    )
+    logger.info(
+        "Extracted %d topic(s) from %s.",
+        len(topic_keywords),
+        label,
+    )
     return {
         "topics": topics,
         "topic_info": topic_info,
 # ---------------------------------------------------------------------------
+# High-Level Pipeline
 # ---------------------------------------------------------------------------
+def run_topic_modeling(
+    filepath: str,
+    min_topic_size: int = 5,
+) -> dict:
     df = load_csv(filepath)
     clean_abstracts = preprocess_text(df["abstract"])
     clean_titles = preprocess_text(df["title"])
+    # Create embedding model once to be shared across steps
     embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
     abstract_model = build_bertopic_model(embedding_model, min_topic_size=min_topic_size)
+    title_model = build_bertopic_model(embedding_model, min_topic_size=min_topic_size)
     abstract_results = extract_topics(abstract_model, clean_abstracts, embedding_model, label="abstracts")
+    title_results = extract_topics(title_model, clean_titles, embedding_model, label="titles")
     return {
         "abstracts": abstract_results,
+        "titles": title_results,
     }
         print(f"  Topic Modeling Results – {section.upper()}")
         print(f"{'='*60}")
+        keywords: dict = data["topic_keywords"]
+        freq: dict = data["topic_freq"]
         if not keywords:
             print("  No topics found.")
             continue
         for topic_id, words in sorted(keywords.items()):
+            count = freq.get(topic_id, 0)
             kw_str = ", ".join(w for w, _ in words[:top_n_keywords])
             print(f"\n  Topic {topic_id:>3}  |  docs: {count:>4}")
             print(f"  Keywords : {kw_str}")
 # ---------------------------------------------------------------------------
 if __name__ == "__main__":
     import sys
     if len(sys.argv) < 2:
         print("Usage: python tools.py <path_to_csv> [min_topic_size]")
         sys.exit(1)
     csv_path = sys.argv[1]
     mts = int(sys.argv[2]) if len(sys.argv) > 2 else 5
     output = run_topic_modeling(csv_path, min_topic_size=mts)
+    print_results(output)