BERTopic_AG_final

Sleeping

App Files Files Community

anujjuna committed 18 days ago

Commit

0f5b08b

verified ·

1 Parent(s): 0092543

Update tools.py

Browse files

Files changed (1) hide show

tools.py +349 -444

tools.py CHANGED Viewed

@@ -1,522 +1,427 @@
 """
 tools.py
 --------
-Topic modeling module using BERTopic for analyzing research paper abstracts and titles.
 """
 import re
 import logging
 import pandas as pd
 from typing import Optional
-from bertopic import BERTopic
 from sentence_transformers import SentenceTransformer
 from umap import UMAP
-from hdbscan import HDBSCAN                          # --- Cluster Balancing Logic ---
-from sklearn.cluster import KMeans
 from sklearn.metrics.pairwise import cosine_similarity
-import numpy as np
-from nltk.corpus import stopwords
-import nltk
-from sklearn.feature_extraction.text import CountVectorizer
-from collections import defaultdict, Counter
 # ---------------------------------------------------------------------------
 # Logging
 # ---------------------------------------------------------------------------
 logging.basicConfig(level=logging.INFO, format="%(levelname)s | %(message)s")
 logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
-# Setup
-# ---------------------------------------------------------------------------
-def _ensure_nltk_stopwords() -> None:
-    try:
-        stopwords.words("english")
-    except LookupError:
-        nltk.download("stopwords", quiet=True)
-# ---------------------------------------------------------------------------
-# Data Loading
 # ---------------------------------------------------------------------------
 def load_csv(filepath: str) -> pd.DataFrame:
     df = pd.read_csv(filepath)
-    required_cols = {"title", "abstract"}
-    missing = required_cols - set(df.columns.str.lower())
-    if missing:
-        raise ValueError(f"CSV is missing required column(s): {missing}")
     df.columns = df.columns.str.lower()
     logger.info("Loaded %d rows from '%s'.", len(df), filepath)
     return df
 # ---------------------------------------------------------------------------
-# Preprocessing
 # ---------------------------------------------------------------------------
-def preprocess_text(texts: pd.Series) -> list[str]:
-    _ensure_nltk_stopwords()
-    stop_words = set(stopwords.words("english"))
-    cleaned: list[str] = []
-    for raw in texts.fillna(""):
-        text = raw.lower()
-        text = re.sub(r"[^a-z\s]", " ", text)
-        tokens = text.split()
-        tokens = [t for t in tokens if t not in stop_words and len(t) > 1]
-        cleaned.append(" ".join(tokens))
-    logger.info("Preprocessed %d documents.", len(cleaned))
-    return cleaned
 # ---------------------------------------------------------------------------
-# Model Construction
 # ---------------------------------------------------------------------------
-def build_bertopic_model(embedding_model: SentenceTransformer, min_topic_size: int = 5) -> BERTopic:
-    # --- Cluster Balancing Logic ---
-    # (embedding_model is passed explicitly from run_topic_modeling)
-    umap_model = UMAP(
-        n_neighbors=15,
-        n_components=5,
-        min_dist=0.0,
-        metric="cosine",
-        random_state=42,
-    )
-    # Updated HDBSCAN constraints
-    hdbscan_model = HDBSCAN(
-        min_cluster_size=5,
-        min_samples=3,
-        metric="euclidean",
-        cluster_selection_method="eom",
-        prediction_data=True,
-    )
-    model = BERTopic(
-        embedding_model=embedding_model,
-        umap_model=umap_model,
-        hdbscan_model=hdbscan_model,
-        min_topic_size=5,
-        verbose=False,
     )
-    logger.info("BERTopic model created with HDBSCAN (min_cluster_size=5, min_samples=3).")
-    return model
 # ---------------------------------------------------------------------------
-# Cluster Balancing Logic
 # ---------------------------------------------------------------------------
-def _get_cluster_sizes(topics: list[int]) -> dict[int, int]:
-    sizes: dict[int, int] = {}
-    for t in topics:
-        if t != -1:
-            sizes[t] = sizes.get(t, 0) + 1
-    return sizes
-def _split_large_cluster(
-    topic_id: int,
-    doc_indices: list[int],
-    embeddings: np.ndarray,
-    topics: list[int],
-    next_id: int,
-) -> int:
-    """Split an oversized cluster into 2 sub-clusters via KMeans. Returns next available ID."""
-    if len(doc_indices) < 10:  # Minimum threshold to split
-        return next_id
-    sub_embs = embeddings[doc_indices]
-    km = KMeans(n_clusters=2, random_state=42, n_init=5)
-    labels = km.fit_predict(sub_embs)
-    new_id = next_id
-    for local_idx, global_idx in enumerate(doc_indices):
-        if labels[local_idx] == 1:          # half goes to a new cluster ID
-            topics[global_idx] = new_id
-    logger.info("Split large cluster %d → kept %d, created %d.", topic_id, topic_id, new_id)
-    return next_id + 1
-def _merge_small_cluster(
-    topic_id: int,
-    doc_indices: list[int],
-    cluster_centroids: dict[int, np.ndarray],
-    topics: list[int],
-    similarity_threshold: float = 0.5,
-) -> bool:
-    """Merge a tiny cluster into the nearest cluster by cosine similarity if threshold met."""
-    if not cluster_centroids or topic_id not in cluster_centroids:
-        return False
-    src_centroid = cluster_centroids[topic_id].reshape(1, -1)
-    other_ids = [tid for tid in cluster_centroids if tid != topic_id]
-    if not other_ids:
-        return False
-    other_centroids = np.vstack([cluster_centroids[tid] for tid in other_ids])
-    sims = cosine_similarity(src_centroid, other_centroids)[0]
-    best_idx = int(np.argmax(sims))
-    max_sim = sims[best_idx]
-    if max_sim >= similarity_threshold:
-        nearest = other_ids[best_idx]
-        for idx in doc_indices:
-            topics[idx] = nearest
-        logger.info("Merged small cluster %d → cluster %d (sim=%.2f).", topic_id, nearest, max_sim)
-        return True
-    return False
-def balance_clusters(
-    topics: list[int],
-    documents: list[str],
-    embedding_model: SentenceTransformer,
-    embeddings: Optional[np.ndarray] = None,
-) -> list[int]:
-    """
-    Enforce cluster size limits: MIN=5, MAX=30.
-    """
     try:
-        if embeddings is None:
-            embeddings = embedding_model.encode(documents, show_progress_bar=False)
-        topics = list(topics)
-        MIN_CLUSTER_SIZE = 5
-        MAX_CLUSTER_SIZE = 30
-        for _ in range(3):  # Iterative refinement
-            sizes = _get_cluster_sizes(topics)
-            if not sizes:
-                break
-            cluster_docs: dict[int, list[int]] = {}
-            for idx, tid in enumerate(topics):
-                if tid != -1:
-                    cluster_docs.setdefault(tid, []).append(idx)
-            centroids: dict[int, np.ndarray] = {
-                tid: embeddings[idxs].mean(axis=0)
-                for tid, idxs in cluster_docs.items()
-            }
-            next_id = max(sizes.keys()) + 1 if sizes else 0
-            changed = False
-            # Split oversized clusters
-            for tid, size in list(sizes.items()):
-                if size > MAX_CLUSTER_SIZE:
-                    old_next_id = next_id
-                    next_id = _split_large_cluster(
-                        tid, cluster_docs[tid], embeddings, topics, next_id
-                    )
-                    if next_id > old_next_id:
-                        changed = True
-            # Merge undersized clusters
-            sizes = _get_cluster_sizes(topics)
-            for tid, size in list(sizes.items()):
-                if size < MIN_CLUSTER_SIZE and tid in cluster_docs:
-                    if _merge_small_cluster(tid, cluster_docs[tid], centroids, topics, similarity_threshold=0.5):
-                        changed = True
-            if not changed:
-                break
-        return topics
-    except Exception as e:
-        logger.error("Cluster balancing error: %s", e)
-        return topics
-def enforce_total_clusters(
-    topics: list[int],
-    embeddings: np.ndarray,
-    min_clusters: int = 15,
-    max_clusters: int = 30,
-) -> list[int]:
-    """Iteratively split or merge to keep total clusters between 15 and 30."""
-    topics = list(topics)
-    while True:
-        unique_clusters = [t for t in set(topics) if t != -1]
-        count = len(unique_clusters)
-        if min_clusters <= count <= max_clusters:
-            break
-        cluster_docs: dict[int, list[int]] = {}
-        for idx, tid in enumerate(topics):
-            if tid != -1:
-                cluster_docs.setdefault(tid, []).append(idx)
-        if not cluster_docs:
-            break
-        centroids: dict[int, np.ndarray] = {
-            tid: embeddings[idxs].mean(axis=0)
-            for tid, idxs in cluster_docs.items()
-        }
-        if count > max_clusters:
-            # Merge two closest clusters
-            ids = list(centroids.keys())
-            c_matrix = np.vstack([centroids[tid] for tid in ids])
-            sim_matrix = cosine_similarity(c_matrix)
-            np.fill_diagonal(sim_matrix, -1)
-            i, j = np.unravel_index(np.argmax(sim_matrix), sim_matrix.shape)
-            tid_i, tid_j = ids[i], ids[j]
-            # Merge tid_i into tid_j
-            for idx in cluster_docs[tid_i]:
-                topics[idx] = tid_j
-            logger.info("Reduced clusters: Merged %d and %d (count: %d -> %d)", tid_i, tid_j, count, count-1)
-        elif count < min_clusters:
-            # Split largest cluster
-            sizes = _get_cluster_sizes(topics)
-            largest_tid = max(sizes, key=sizes.get)
-            next_id = max(unique_clusters) + 1
-            _split_large_cluster(largest_tid, cluster_docs[largest_tid], embeddings, topics, next_id)
-            logger.info("Increased clusters: Split %d (count: %d -> %d)", largest_tid, count, count+1)
-    final_count = len([t for t in set(topics) if t != -1])
-    logger.info("Final cluster count: %d", final_count)
-    print(f"Final cluster count: {final_count}")
-    return topics
-def get_top_3_central_docs(
-    topics: list[int],
-    embeddings: np.ndarray,
-    documents: list[str],
-) -> dict[int, list[str]]:
-    """Select top 3 documents closest to centroid for each topic."""
-    cluster_docs_idx: dict[int, list[int]] = {}
-    for idx, tid in enumerate(topics):
-        if tid != -1:
-            cluster_docs_idx.setdefault(tid, []).append(idx)
-    representative_docs = {}
-    for tid, idxs in cluster_docs_idx.items():
-        cluster_embs = embeddings[idxs]
-        centroid = cluster_embs.mean(axis=0).reshape(1, -1)
-        sims = cosine_similarity(centroid, cluster_embs)[0]
-        # Get top 3 indices
-        top_local_idxs = np.argsort(sims)[-3:][::-1]
-        representative_docs[tid] = [documents[idxs[li]] for li in top_local_idxs]
-    return representative_docs
-def rebuild_topic_keywords(
-    topics: list[int],
-    documents: list[str],
-) -> dict[int, list[tuple[str, float]]]:
-    """
-    Rebuild topic keywords based on updated cluster assignments using CountVectorizer.
-    Skips clusters with fewer than 3 documents.
-    """
-    cluster_docs: dict = defaultdict(list)
-    for doc, t in zip(documents, topics):
-        if t != -1:
-            cluster_docs[t].append(doc)
-    topic_keywords = {}
-    for topic_id, docs in cluster_docs.items():
-        if len(docs) < 2:
-            continue
-        vectorizer = CountVectorizer(stop_words='english', max_features=50)
-        try:
-            X = vectorizer.fit_transform(docs)
-            words = vectorizer.get_feature_names_out()
-            scores = X.sum(axis=0).A1
-            top_idx = scores.argsort()[::-1][:10]
-            topic_keywords[topic_id] = [
-                (words[i], float(scores[i])) for i in top_idx
-            ]
-        except Exception as e:
-            logger.warning("rebuild_topic_keywords failed for topic %d: %s", topic_id, e)
-    return topic_keywords
-def reassign_outliers(
-    topics: list[int],
-    embeddings: np.ndarray,
-    similarity_threshold: float = 0.5,
-) -> list[int]:
-    """
-    Reassign outlier documents (topic == -1) to the nearest cluster centroid
-    if cosine similarity >= similarity_threshold AND cluster size < MAX_CLUSTER_SIZE.
-    Otherwise keep as -1.
-    """
-    topics = list(topics)
-    MAX_CLUSTER_SIZE = 100  # Per instructor spec: max 100 papers per cluster
-    # Build centroid map and current sizes
-    cluster_docs: dict[int, list[int]] = {}
-    current_sizes: dict[int, int] = {}
-    for idx, tid in enumerate(topics):
-        if tid != -1:
-            cluster_docs.setdefault(tid, []).append(idx)
-            current_sizes[tid] = current_sizes.get(tid, 0) + 1
-    if not cluster_docs:
-        return topics
-    cluster_ids = list(cluster_docs.keys())
-    centroids = np.vstack([
-        embeddings[cluster_docs[tid]].mean(axis=0)
-        for tid in cluster_ids
-    ])  # shape: (n_clusters, embed_dim)
-    outlier_indices = [idx for idx, tid in enumerate(topics) if tid == -1]
-    reassigned = 0
-    for idx in outlier_indices:
-        doc_emb = embeddings[idx].reshape(1, -1)
-        sims = cosine_similarity(doc_emb, centroids)[0]  # (n_clusters,)
-        best_i = int(np.argmax(sims))
-        target_tid = cluster_ids[best_i]
-        if sims[best_i] >= similarity_threshold and current_sizes.get(target_tid, 0) < MAX_CLUSTER_SIZE:
-            topics[idx] = target_tid
-            current_sizes[target_tid] = current_sizes.get(target_tid, 0) + 1
-            reassigned += 1
-    logger.info(
-        "Outlier reassignment: %d / %d outliers reassigned (threshold=%.2f, max_size=%d).",
-        reassigned, len(outlier_indices), similarity_threshold, MAX_CLUSTER_SIZE
-    )
-    return topics
 # ---------------------------------------------------------------------------
-# Topic Extraction
 # ---------------------------------------------------------------------------
-def extract_topics(
-    model: BERTopic,
-    documents: list[str],
-    embedding_model: SentenceTransformer,
 ) -> dict:
-    valid_docs = [d if d.strip() else "empty" for d in documents]
-    embeddings = embedding_model.encode(valid_docs, show_progress_bar=False)
-    topics, _ = model.fit_transform(valid_docs, embeddings=embeddings)
-    # 1. Balance cluster sizes (5-30)
-    topics = balance_clusters(topics, valid_docs, embedding_model, embeddings=embeddings)
-    # 2. Enforce total cluster count (15-30)
-    topics = enforce_total_clusters(topics, embeddings, min_clusters=15, max_clusters=30)
-    # 3. Reassign outliers to nearest cluster (threshold=0.55)
-    topics = reassign_outliers(topics, embeddings, similarity_threshold=0.55)
-    # 3.5 Re-balance after reassignment (Ensures clusters remain within limits)
-    topics = balance_clusters(topics, valid_docs, embedding_model, embeddings=embeddings)
-    # 4. Rebuild keywords from final cluster assignments
-    topic_keywords = rebuild_topic_keywords(topics, valid_docs)
-    # 5. Recompute topic_freq from FINAL topics
-    topic_freq = Counter(t for t in topics if t != -1)
-    # 6. Get top-3 central documents
-    representative_docs = get_top_3_central_docs(topics, embeddings, documents)
-    # Final Validation & Logs
-    total_docs = len(topics)
-    total_counted = sum(topic_freq.values())
-    print(f"total_docs = {total_docs}")
-    print(f"total_counted = {total_counted}")
-    final_cluster_count = len([t for t in set(topics) if t != -1])
-    final_topic_count = len(topic_keywords)
-    print(f"Cluster count: {final_cluster_count}")
-    print(f"Topic count: {final_topic_count}")
-    if final_cluster_count != final_topic_count:
-        logger.error(f"CONSISTENCY ERROR: {final_cluster_count} clusters != {final_topic_count} topics")
-    return {
-        "topics": topics,
-        "topic_keywords": topic_keywords,
-        "topic_freq": topic_freq,
-        "representative_docs": representative_docs,
-    }
 # ---------------------------------------------------------------------------
-# High-Level Pipeline
 # ---------------------------------------------------------------------------
-def run_topic_modeling(
-    filepath: str,
-    min_topic_size: int = 5,
-) -> dict:
-    df = load_csv(filepath)
-    # Combined column
-    df["combined"] = df["title"].fillna("") + ". " + df["abstract"].fillna("")
-    clean_docs = preprocess_text(df["combined"])
-    # New embedding model
-    embedding_model = SentenceTransformer("allenai/specter2_base")
-    model = build_bertopic_model(embedding_model, min_topic_size=min_topic_size)
-    results = extract_topics(model, clean_docs, embedding_model)
-    return {
-        "documents": results
-    }
 # ---------------------------------------------------------------------------
-# Pretty Printing Helper
 # ---------------------------------------------------------------------------
-def print_results(results: dict, top_n_keywords: int = 10) -> None:
-    for section, data in results.items():
-        print(f"\n{'='*60}")
-        print(f"  Topic Modeling Results – {section.upper()}")
-        print(f"{'='*60}")
-        keywords: dict = data["topic_keywords"]
-        freq: dict = data["topic_freq"]
-        if not keywords:
-            print("  No topics found.")
             continue
-        for topic_id, words in sorted(keywords.items()):
-            count = freq.get(topic_id, 0)
-            kw_str = ", ".join(w for w, _ in words[:top_n_keywords])
-            print(f"\n  Topic {topic_id:>3}  |  docs: {count:>4}")
-            print(f"  Keywords : {kw_str}")
-        outlier_count = freq.get(-1, 0)
-        if outlier_count:
-            print(f"\n  Outlier topic (-1): {outlier_count} document(s).")
 # ---------------------------------------------------------------------------
-# CLI Entry Point
 # ---------------------------------------------------------------------------
-if __name__ == "__main__":
-    import sys
-    if len(sys.argv) < 2:
-        print("Usage: python tools.py <path_to_csv> [min_topic_size]")
-        sys.exit(1)
-    csv_path = sys.argv[1]
-    mts = int(sys.argv[2]) if len(sys.argv) > 2 else 5
-    output = run_topic_modeling(csv_path, min_topic_size=mts)
-    print_results(output)

 """
 tools.py
 --------
+Topic-modelling pipeline: SPECTER-2 → UMAP → HDBSCAN
+with multi-objective Bayesian optimisation over UMAP + HDBSCAN
+parameters (§3.1–§3.6 of the methodology guide).
+No BERTopic wrapper — bare UMAP + HDBSCAN on SPECTER-2 embeddings.
 """
 import re
 import logging
 import pandas as pd
+import numpy as np
 from typing import Optional
+from collections import Counter, defaultdict
 from sentence_transformers import SentenceTransformer
 from umap import UMAP
+from hdbscan import HDBSCAN
+from keybert import KeyBERT
+from sklearn.metrics import adjusted_rand_score
 from sklearn.metrics.pairwise import cosine_similarity
+import optuna
 # ---------------------------------------------------------------------------
 # Logging
 # ---------------------------------------------------------------------------
 logging.basicConfig(level=logging.INFO, format="%(levelname)s | %(message)s")
 logger = logging.getLogger(__name__)
+optuna.logging.set_verbosity(optuna.logging.WARNING)
 # ---------------------------------------------------------------------------
+# Data Loading (unchanged)
 # ---------------------------------------------------------------------------
 def load_csv(filepath: str) -> pd.DataFrame:
     df = pd.read_csv(filepath)
     df.columns = df.columns.str.lower()
+    required = {"title", "abstract"}
+    missing = required - set(df.columns)
+    if missing:
+        raise ValueError(f"CSV missing column(s): {missing}")
     logger.info("Loaded %d rows from '%s'.", len(df), filepath)
     return df
 # ---------------------------------------------------------------------------
+# §3.1 — Input unit: title + abstract concatenation
 # ---------------------------------------------------------------------------
def prepare_documents(df: pd.DataFrame) -> list[str]:
    """Build the §3.1 input unit: one ``"<title>. <abstract>"`` string per row.

    Missing titles or abstracts are replaced by empty strings before the two
    columns are joined with ``". "``.
    """
    titles = df["title"].fillna("")
    abstracts = df["abstract"].fillna("")
    docs = (titles + ". " + abstracts).tolist()
    logger.info("Prepared %d title+abstract documents.", len(docs))
    return docs
 # ---------------------------------------------------------------------------
+# §3.1 — Embed with SPECTER-2
 # ---------------------------------------------------------------------------
def embed_documents(
    docs: list[str],
    model_name: str = "allenai/specter2_base",
) -> np.ndarray:
    """Encode every document with the SentenceTransformer named ``model_name``.

    The encoder is outside the tuned search space (§3.3); a fresh model
    instance is created on each call and run in batches of 32 with a
    progress bar.
    """
    encoder = SentenceTransformer(model_name)
    vectors = encoder.encode(docs, batch_size=32, show_progress_bar=True)
    logger.info("Embedded %d docs → %s", len(docs), vectors.shape)
    return vectors
+# ---------------------------------------------------------------------------
+# §3.2 — Cluster discipline checks
+# ---------------------------------------------------------------------------
def check_discipline(
    labels: np.ndarray,
    n_docs: int,
    *,
    max_mass_limit: float = 0.25,
    min_size_limit: int = 5,
) -> dict:
    """Evaluate the two hard cluster-discipline constraints (§3.2).

    Args:
        labels: Cluster label per document; ``-1`` marks noise.
        n_docs: Total number of documents, used as the mass denominator.
        max_mass_limit: Largest share of ``n_docs`` a single cluster may
            hold (default 25 %).
        min_size_limit: Smallest permitted cluster size (default 5).

    Returns:
        A dict with the keys ``max_mass_pct`` (a 0–1 fraction rounded to four
        decimals, despite the name), ``max_mass_ok``, ``min_size``,
        ``min_size_ok``, ``n_clusters``, ``n_noise`` and ``cluster_sizes``.
        When no non-noise cluster exists both ``*_ok`` flags are ``False``
        and ``cluster_sizes`` is empty, so every key is always present.
    """
    sizes = Counter(int(label) for label in labels)
    n_noise = sizes.pop(-1, 0)  # what remains are real clusters only
    if not sizes:
        return dict(
            max_mass_pct=0, max_mass_ok=False,
            min_size=0, min_size_ok=False,
            n_clusters=0, n_noise=n_noise,
            cluster_sizes={},
        )
    max_mass = max(sizes.values()) / n_docs
    smallest = min(sizes.values())
    return dict(
        max_mass_pct=round(max_mass, 4),
        max_mass_ok=max_mass <= max_mass_limit,
        min_size=int(smallest),
        min_size_ok=smallest >= min_size_limit,
        n_clusters=len(sizes),
        n_noise=n_noise,
        cluster_sizes=dict(sorted(sizes.items())),
    )
 # ---------------------------------------------------------------------------
+# §3.4 — Quality metrics
 # ---------------------------------------------------------------------------
def compute_persistence(clusterer: HDBSCAN) -> float:
    """Return the mean of ``clusterer.cluster_persistence_``.

    Best-effort metric: if the attribute is missing, empty, or cannot be
    averaged, ``0.0`` is returned. Failures are logged rather than silently
    discarded so a broken metric is visible in the run log.
    """
    try:
        persistence = getattr(clusterer, "cluster_persistence_", None)
        if persistence is None or len(persistence) == 0:
            return 0.0
        return float(np.mean(persistence))
    except Exception as exc:  # keep the optimisation loop alive
        logger.warning("Persistence unavailable: %s", exc)
        return 0.0
def compute_dbcv(reduced: np.ndarray, labels: np.ndarray) -> float:
    """Density-Based Cluster Validity index of ``labels`` on ``reduced``.

    Returns ``-1.0`` (the worst score) when fewer than two non-noise clusters
    exist, when ``hdbscan.validity.validity_index`` raises, or when it yields
    a non-finite value. The last case matters because a NaN objective value
    would otherwise reach the optimiser instead of a comparable score.
    """
    try:
        cluster_ids = set(labels)
        cluster_ids.discard(-1)
        if len(cluster_ids) < 2:
            return -1.0
        # Imported lazily, and only once the index is actually needed.
        from hdbscan.validity import validity_index
        score = float(validity_index(reduced.astype(np.float64), labels))
    except Exception as e:
        logger.warning("DBCV failed: %s", e)
        return -1.0
    if not np.isfinite(score):
        logger.warning("DBCV failed: %s", "non-finite score")
        return -1.0
    return score
def compute_stability(embeddings: np.ndarray, params: dict,
                      n_seeds: int = 5) -> float:
    """Seed-to-seed clustering agreement (§3.4).

    The UMAP → HDBSCAN pipeline described by ``params`` is refitted
    ``n_seeds`` times, varying only the UMAP ``random_state``
    (``1, 8, 15, …``). The result is the mean adjusted Rand index over every
    unordered pair of runs, or ``0.0`` when fewer than two runs exist.
    """
    runs = []
    for seed_idx in range(n_seeds):
        reducer = UMAP(
            n_neighbors=params["n_neighbors"],
            n_components=params["n_components"],
            min_dist=0.0,
            metric="cosine",
            random_state=seed_idx * 7 + 1,
        )
        projected = reducer.fit_transform(embeddings)
        clusterer = HDBSCAN(
            min_cluster_size=params["min_cluster_size"],
            min_samples=params["min_samples"],
            metric="euclidean",
            cluster_selection_method=params["csm"],
            cluster_selection_epsilon=params["cse"],
        )
        runs.append(clusterer.fit_predict(projected))
    pair_scores = [
        adjusted_rand_score(runs[a], runs[b])
        for a in range(len(runs))
        for b in range(a + 1, len(runs))
    ]
    if not pair_scores:
        return 0.0
    return float(np.mean(pair_scores))
+# ---------------------------------------------------------------------------
+# §3.4 — Bayesian optimisation objective
+# ---------------------------------------------------------------------------
def _objective(trial, embeddings, n_docs):
    """Run one Optuna trial of the UMAP → HDBSCAN pipeline.

    Samples the hyper-parameters, fits UMAP (fixed ``random_state=42``) and
    HDBSCAN, and records ``params``, ``discipline``, ``labels`` and ``pass``
    (plus ``persistence`` and ``dbcv`` for passing trials) as trial user
    attributes.

    Returns:
        ``(persistence, dbcv, stability_placeholder)``. Trials that violate a
        discipline constraint return ``(0.0, -1.0, 0.0)``; passing trials
        return a constant ``0.5`` in the third slot.
    """
    # NOTE: the order of the suggest_* calls feeds the seeded sampler; keep it.
    n_neighbors = trial.suggest_categorical("n_neighbors", [5, 10, 15, 30, 50])
    n_components = trial.suggest_int("n_components", 5, 10)
    # min_cluster_size is searched between ~1 % and ~5 % of the corpus, with
    # floors of 5 and 20 respectively.
    mcs = trial.suggest_int(
        "min_cluster_size",
        max(5, int(0.01 * n_docs)),
        max(20, int(0.05 * n_docs)),
    )
    # The upper bound of min_samples depends on the mcs value drawn above.
    ms = trial.suggest_int("min_samples", 1, mcs)
    csm = trial.suggest_categorical("csm", ["eom", "leaf"])
    cse = trial.suggest_float("cse", 0.0, 0.3, step=0.05)
    params = dict(n_neighbors=n_neighbors, n_components=n_components,
                  min_cluster_size=mcs, min_samples=ms, csm=csm, cse=cse)
    u = UMAP(n_neighbors=n_neighbors, n_components=n_components,
             min_dist=0.0, metric="cosine", random_state=42)
    red = u.fit_transform(embeddings)
    h = HDBSCAN(min_cluster_size=mcs, min_samples=ms, metric="euclidean",
                cluster_selection_method=csm,
                cluster_selection_epsilon=cse,
                allow_single_cluster=False, gen_min_span_tree=True)
    labels = h.fit_predict(red)
    disc = check_discipline(labels, n_docs)
    # Stored on the trial so the caller can read the winner's labels back.
    trial.set_user_attr("params", params)
    trial.set_user_attr("discipline", disc)
    trial.set_user_attr("labels", labels.tolist())
    # Hard-constraint violation → worst scores
    if not disc["max_mass_ok"] or not disc["min_size_ok"]:
        trial.set_user_attr("pass", False)
        return 0.0, -1.0, 0.0
    trial.set_user_attr("pass", True)
    pers = compute_persistence(h)
    dbcv = compute_dbcv(red, labels)
    trial.set_user_attr("persistence", pers)
    trial.set_user_attr("dbcv", dbcv)
    return pers, dbcv, 0.5          # stability computed only for winner
 # ---------------------------------------------------------------------------
+# §3.4 — Run the full Bayesian loop
 # ---------------------------------------------------------------------------
def run_bayesian_optimisation(
    embeddings: np.ndarray,
    n_trials: int = 50,
    progress_callback=None,
) -> dict:
    """Search UMAP + HDBSCAN parameters with a seeded multi-objective TPE study.

    Trials run one at a time so convergence can be checked after each. Once at
    least 20 trials have run, the loop stops early when the three most recent
    discipline-passing trials all have a persistence within 5 % of the best
    passing persistence.

    Args:
        embeddings: Document embeddings, one row per document.
        n_trials: Maximum number of trials; must be at least 1.
        progress_callback: Optional callable invoked after every trial as
            ``progress_callback(trials_done, n_trials, log_entry)``.

    Returns:
        A dict with ``best_params``, ``best_labels``, ``best_trial``,
        ``persistence``, ``dbcv``, ``stability``, ``discipline``,
        ``trial_log`` and ``n_trials_run``. If no trial satisfies the
        discipline constraints, the last trial is used.

    Raises:
        ValueError: If ``n_trials`` is smaller than 1.
    """
    if n_trials < 1:
        # Without a single trial there is nothing to select from.
        raise ValueError("n_trials must be at least 1")
    n_docs = len(embeddings)
    study = optuna.create_study(
        directions=["maximize", "maximize", "maximize"],
        sampler=optuna.samplers.TPESampler(seed=42, multivariate=True),
        study_name="specter2_umap_hdbscan",
    )
    trial_log = []

    def _cb(study, trial):
        # Flatten the trial's user attributes into one log record.
        d = trial.user_attrs.get("discipline", {})
        entry = dict(
            trial=trial.number,
            params=trial.user_attrs.get("params", {}),
            discipline_pass=trial.user_attrs.get("pass", False),
            persistence=trial.user_attrs.get("persistence", 0.0),
            dbcv=trial.user_attrs.get("dbcv", -1.0),
            n_clusters=d.get("n_clusters", 0),
            max_mass_pct=d.get("max_mass_pct", 0.0),
            min_size=d.get("min_size", 0),
            n_noise=d.get("n_noise", 0),
            values=list(trial.values) if trial.values else [],
        )
        trial_log.append(entry)
        if progress_callback:
            progress_callback(trial.number + 1, n_trials, entry)

    for i in range(n_trials):
        study.optimize(
            lambda t: _objective(t, embeddings, n_docs),
            n_trials=1, callbacks=[_cb], show_progress_bar=False,
        )
        # §3.6 convergence: 3 most recent passing trials within 5 % of best
        passing = [e for e in trial_log if e["discipline_pass"]]
        if len(passing) >= 3 and i >= 19:
            best_p = max(e["persistence"] for e in passing)
            if best_p > 0:
                last3 = passing[-3:]
                if all(abs(e["persistence"] - best_p) / best_p < 0.05
                       for e in last3):
                    logger.info("Converged at trial %d.", i + 1)
                    break

    # Select best passing trial (max persistence, then DBCV). A trial can be
    # flagged "pass" yet carry no objective values if the study rejected its
    # result, so those are excluded before indexing into ``values``.
    passing_trials = [
        t for t in study.trials
        if t.user_attrs.get("pass", False) and t.values is not None
    ]
    if passing_trials:
        best = max(passing_trials, key=lambda t: (t.values[0], t.values[1]))
    else:
        logger.warning("No trial passed discipline — using last trial.")
        best = study.trials[-1]
    bp = best.user_attrs["params"]
    labels = np.array(best.user_attrs["labels"])
    # Stability is expensive (five refits), so it is computed for the winner only.
    stability = compute_stability(embeddings, bp, n_seeds=5)
    return dict(
        best_params=bp, best_labels=labels,
        best_trial=best.number,
        persistence=best.user_attrs.get("persistence", 0.0),
        dbcv=best.user_attrs.get("dbcv", -1.0),
        stability=stability,
        discipline=best.user_attrs.get("discipline", {}),
        trial_log=trial_log,
        n_trials_run=len(trial_log),
    )
 # ---------------------------------------------------------------------------
+# §3.1 — 2-D UMAP for visualisation
 # ---------------------------------------------------------------------------
def compute_2d_umap(embeddings: np.ndarray, seed: int = 42) -> np.ndarray:
    """Project the embeddings to two UMAP components for visualisation (§3.1).

    Uses cosine distance with ``n_neighbors=15`` and ``min_dist=0.1``;
    ``seed`` is passed to UMAP as ``random_state``.
    """
    projector = UMAP(
        n_neighbors=15,
        n_components=2,
        min_dist=0.1,
        metric="cosine",
        random_state=seed,
    )
    return projector.fit_transform(embeddings)
 # ---------------------------------------------------------------------------
+# §3.1 — KeyBERT keyphrase extraction per cluster (3–5 phrases)
 # ---------------------------------------------------------------------------
def extract_keyphrases(docs: list[str], labels: np.ndarray,
                       top_n: int = 5) -> dict:
    """Extract up to ``top_n`` KeyBERT keyphrases per cluster (§3.1).

    All documents of a cluster are joined into one text and passed to
    ``KeyBERT.extract_keywords`` (1–3-gram phrases, English stop words,
    MMR with diversity 0.5). Noise documents (label ``-1``) are ignored.

    Returns:
        ``{cluster_id: keywords}`` where ``keywords`` is whatever
        ``extract_keywords`` returns (presumably ``(phrase, score)`` pairs —
        confirm against the KeyBERT version in use), or ``[]`` for a cluster
        whose extraction failed. An empty dict is returned, without loading
        the KeyBERT model, when there is no non-noise cluster.
    """
    cluster_docs = defaultdict(list)
    for doc, lab in zip(docs, labels):
        if lab != -1:
            cluster_docs[int(lab)].append(doc)
    if not cluster_docs:
        # Nothing to label: skip the model load entirely.
        return {}
    kw = KeyBERT(model="all-MiniLM-L6-v2")
    out = {}
    for cid, cdocs in cluster_docs.items():
        try:
            out[cid] = kw.extract_keywords(
                " ".join(cdocs), keyphrase_ngram_range=(1, 3),
                stop_words="english", top_n=top_n,
                use_mmr=True, diversity=0.5)
        except Exception as e:
            # Best effort: one failing cluster must not abort the others.
            logger.warning("KeyBERT cluster %d: %s", cid, e)
            out[cid] = []
    return out
+# ---------------------------------------------------------------------------
+# §3.1 — Strong / weak member counts via HDBSCAN probabilities
+# ---------------------------------------------------------------------------
def strong_weak_members(labels: np.ndarray,
                        probabilities: np.ndarray,
                        threshold: float = 0.5) -> dict:
    """Count strong and weak members of every cluster (§3.1).

    Args:
        labels: Cluster label per document; ``-1`` (noise) is skipped.
        probabilities: Membership probability per document, aligned with
            ``labels``.
        threshold: A document is "strong" when its probability is at least
            this value, otherwise "weak".

    Returns:
        ``{cluster_id: {"strong": int, "weak": int}}`` for each non-noise
        cluster that occurs in ``labels``.
    """
    tally = defaultdict(lambda: {"strong": 0, "weak": 0})
    for lab, prob in zip(labels, probabilities):
        if lab == -1:
            continue
        bucket = "strong" if prob >= threshold else "weak"
        tally[int(lab)][bucket] += 1
    return dict(tally)
+# ---------------------------------------------------------------------------
+# §3.2 — Outlier reduction: reassign noise to nearest cluster (≤ 25 %)
+# ---------------------------------------------------------------------------
def outlier_reduction(labels: np.ndarray, reduced: np.ndarray,
                      n_docs: int) -> np.ndarray:
    """Reassign noise points to the nearest cluster that still has room (§3.2).

    Works on a copy of ``labels``. Cluster centroids are computed once, in
    the ``reduced`` space, from the original members. Each noise point goes
    to the closest centroid (Euclidean) whose cluster holds fewer than
    ``int(0.25 * n_docs)`` documents; if every cluster is full the point
    stays ``-1``.
    """
    labels = labels.copy()
    cap = int(0.25 * n_docs)

    members = defaultdict(list)
    for pos, lab in enumerate(labels):
        if lab != -1:
            members[int(lab)].append(pos)
    if not members:
        return labels

    cluster_ids = list(members)
    centroids = np.vstack(
        [reduced[members[cid]].mean(axis=0) for cid in cluster_ids]
    )
    noise_positions = [pos for pos, lab in enumerate(labels) if lab == -1]

    moved = 0
    for pos in noise_positions:
        distances = np.linalg.norm(centroids - reduced[pos], axis=1)
        # Walk the clusters from nearest to farthest; take the first with room.
        target = None
        for rank in np.argsort(distances):
            candidate = cluster_ids[rank]
            if len(members[candidate]) < cap:
                target = candidate
                break
        if target is None:
            continue
        labels[pos] = target
        members[target].append(pos)
        moved += 1

    logger.info("Outlier reduction: %d / %d noise reassigned.", moved, len(noise_positions))
    return labels
 # ---------------------------------------------------------------------------
+# Representative docs (top-3 by centroid proximity)
 # ---------------------------------------------------------------------------
def get_representative_docs(labels, embeddings, docs, top_n=3):
    """Pick the ``top_n`` documents closest to each cluster centroid.

    Closeness is the cosine similarity between a document embedding and the
    mean embedding of its cluster. Noise (label ``-1``) is skipped.
    Returns ``{cluster_id: [doc, ...]}`` ordered from most to least similar.
    """
    members = defaultdict(list)
    for pos, lab in enumerate(labels):
        if lab == -1:
            continue
        members[int(lab)].append(pos)

    picks = {}
    for cid, positions in members.items():
        block = embeddings[positions]
        centroid = block.mean(axis=0).reshape(1, -1)
        sims = cosine_similarity(centroid, block)[0]
        ranked = np.argsort(sims)[-top_n:][::-1]
        picks[cid] = [docs[positions[r]] for r in ranked]
    return picks
+# ---------------------------------------------------------------------------
+# High-level pipeline entry point
+# ---------------------------------------------------------------------------
def run_topic_modeling(filepath: str, n_trials: int = 50,
                       progress_callback=None) -> dict:
    """Run the full pipeline on one CSV: load → embed → optimise → describe.

    Args:
        filepath: CSV with ``title`` and ``abstract`` columns.
        n_trials: Maximum number of optimisation trials.
        progress_callback: Forwarded to ``run_bayesian_optimisation``.

    Returns:
        A dict with ``documents``, ``labels``, ``keyphrases``,
        ``representative_docs``, ``membership``, ``umap_2d``, ``discipline``,
        ``best_params``, ``metrics`` (persistence, dbcv, stability),
        ``trial_log``, ``n_trials_run``, ``best_trial``, ``n_docs`` and
        ``embeddings``. ``labels`` and ``umap_2d`` are converted to lists;
        ``embeddings`` is returned as produced by ``embed_documents``.
    """
    # 1. Load
    df = load_csv(filepath)
    docs = prepare_documents(df)
    n_docs = len(docs)
    # 2. Embed (deterministic)
    embeddings = embed_documents(docs)
    # 3. Bayesian optimisation (§3.4)
    opt = run_bayesian_optimisation(embeddings, n_trials, progress_callback)
    bp = opt["best_params"]
    labels = opt["best_labels"]
    # 4. Re-fit the winning configuration (same random_state as the trials)
    #    to obtain the reduced space and a clusterer with probabilities_.
    u = UMAP(n_neighbors=bp["n_neighbors"], n_components=bp["n_components"],
             min_dist=0.0, metric="cosine", random_state=42)
    red = u.fit_transform(embeddings)
    h = HDBSCAN(min_cluster_size=bp["min_cluster_size"],
                min_samples=bp["min_samples"], metric="euclidean",
                cluster_selection_method=bp["csm"],
                cluster_selection_epsilon=bp["cse"],
                allow_single_cluster=False)
    h.fit(red)
    # 5. Outlier reduction (§3.2): noise points (-1) are moved to the nearest
    #    cluster that is still below 25 % of the corpus.
    labels = outlier_reduction(labels, red, n_docs)
    # 6. Strong / weak (§3.1) — uses the re-fit's probabilities with the
    #    post-reduction labels. NOTE(review): reassigned noise points keep
    #    their original probability, presumably landing in "weak" — confirm.
    sw = strong_weak_members(labels, h.probabilities_)
    # 7. 2-D UMAP (§3.1)
    umap_2d = compute_2d_umap(embeddings)
    # 8. KeyBERT keyphrases (§3.1)
    keyphrases = extract_keyphrases(docs, labels)
    # 9. Rep docs
    rep_docs = get_representative_docs(labels, embeddings, docs)
    # 10. Final discipline, re-checked on the post-reduction labels
    disc = check_discipline(labels, n_docs)
    return dict(
        documents=docs, labels=labels.tolist(),
        keyphrases=keyphrases, representative_docs=rep_docs,
        membership=sw, umap_2d=umap_2d.tolist(),
        discipline=disc, best_params=bp,
        metrics=dict(persistence=opt["persistence"],
                     dbcv=opt["dbcv"],
                     stability=opt["stability"]),
        trial_log=opt["trial_log"],
        n_trials_run=opt["n_trials_run"],
        best_trial=opt["best_trial"],
        n_docs=n_docs,
        embeddings=embeddings,
    )