Spaces:
Sleeping
Sleeping
Daksh C Jain
Upgrade pipeline: UMAP+HDBSCAN, Bayesian opt, 3+1 LLM labeling, coherence check
"""tools.py — Sentence-level BERTopic pipeline + Mistral LLM. Version 4.0.0 | May 2026.

PIPELINE:
    Paper → split sentences → embed (all-MiniLM-L6-v2, 384d)
    → LLM recommends UMAP + HDBSCAN params based on dataset stats
    → Bayesian optimization (Optuna) tunes min_cluster_size + min_samples
    → UMAP dim reduction → HDBSCAN clustering
    → 3 LLM labeling calls per cluster + 1 orchestration judge call
    → Abstract-title cosine similarity validation (hallucination check)
"""
| from langchain_core.tools import tool | |
| import os | |
| import json | |
| import re | |
| import numpy as np | |
| import pandas as pd | |
| from concurrent.futures import ThreadPoolExecutor, as_completed | |
# Console tracing switch; `debug` is `print` when enabled, otherwise a no-op
# callable that accepts and ignores any arguments.
DEBUG = True
debug = print if DEBUG else (lambda *a, **k: None)

# Directory for intermediate artifacts; created as an import-time side effect.
CHECKPOINT_DIR = "/tmp/checkpoints"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# How many sentences closest to a centroid are kept as evidence.
NEAREST_K = 5
# Split after ., ! or ? when followed by whitespace and an uppercase letter.
SENT_SPLIT_RE = r'(?<=[.!?])\s+(?=[A-Z])'
# Sentences whose stripped length is below this are discarded.
MIN_SENT_LEN = 30
# Run key -> CSV columns whose text is clustered for that run.
RUN_CONFIGS = {
    "abstract": ["Abstract"],
    "title": ["Title"],
}
# Module-level store shared by the tools (loaded DataFrame, per-run results).
_data = {}
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
| # HELPER: Split text into sentences | |
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
def _split_sentences(text):
    """Split *text* into sentences and drop short fragments.

    The input is coerced with ``str()`` and split on ``SENT_SPLIT_RE``. A piece
    is kept (unstripped) only when its stripped length is at least
    ``MIN_SENT_LEN`` characters.
    """
    pieces = re.split(SENT_SPLIT_RE, str(text))
    return [piece for piece in pieces if len(piece.strip()) >= MIN_SENT_LEN]
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
| # HELPER: Filter publisher boilerplate | |
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
# Case-insensitive alternation of publisher/licence phrases and of generic
# "limitations / implications are discussed" closers; sentences matching it
# are removed before clustering.
_BOILERPLATE_RE = re.compile(
    "|".join((
        r"Licensee MDPI",
        r"Published by Informa",
        r"Published by Elsevier",
        r"Taylor & Francis",
        r"Copyright Β©",
        r"Creative Commons",
        r"open access article",
        r"Inderscience Enterprises",
        r"All rights reserved",
        r"This is an open access",
        r"distributed under the terms",
        r"The Author\(s\)",
        r"Springer Nature",
        r"Emerald Publishing",
        r"limitations and future",
        r"limitations and implications",
        r"limitations are discussed",
        r"limitations have been discussed",
        r"implications are discussed",
        r"implications were discussed",
        r"implications are presented",
        r"concludes with .* implications",
    )),
    flags=re.IGNORECASE,
)
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
| # TOOL 1: Load Scopus CSV | |
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
def load_scopus_csv(filepath: str) -> str:
    """Load a Scopus CSV export and show preview. Call this first.

    The parsed DataFrame is stored in ``_data["df"]`` for the other tools.
    Statistics for the "Abstract" and "Title" columns are reported as 0 when
    the column is absent or the file has no rows.

    Args:
        filepath: Path to the uploaded .csv file.

    Returns:
        Row count, column names, and sample data.
    """
    debug(f"\n>>> TOOL: load_scopus_csv(filepath='{filepath}')")
    df = pd.read_csv(filepath, encoding="utf-8-sig")
    _data["df"] = df
    debug(f">>> Loaded {len(df)} rows, {len(df.columns)} columns")

    target_cols = [c for c in ("Title", "Abstract", "Author Keywords") if c in df.columns]
    sample = df[target_cols].head(3).to_string(max_colwidth=80)
    null_counts = ", ".join(
        f"{c}: {df[c].notna().sum()}/{len(df)}" for c in target_cols)

    # The sentence estimate extrapolates from the first five abstracts only.
    avg_abstract_sents = 0.0
    if "Abstract" in df.columns and len(df) > 0:
        sample_sents = df["Abstract"].head(5).apply(_split_sentences).apply(len)
        avg_abstract_sents = float(sample_sents.mean())
    est_abstract = int(avg_abstract_sents * len(df))
    title_count = int(df["Title"].notna().sum()) if "Title" in df.columns else 0

    return (f"π **Dataset Statistics:**\n"
            f"- **Papers:** {len(df)}\n"
            f"- **Abstract sentences:** ~{est_abstract} (~{avg_abstract_sents:.0f} per paper)\n"
            f"- **Title sentences:** {title_count} (1 per paper)\n"
            f"- **Non-null:** {null_counts}\n\n"
            f"Columns: {', '.join(list(df.columns)[:15])}\n\n"
            f"Sample:\n{sample}")
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
| # HELPER: LLM recommends UMAP + HDBSCAN params | |
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
def _coerce_clustering_params(raw: dict, defaults: dict) -> dict:
    """Overlay LLM values on *defaults*, coercing each numeric key to its type and minimum."""
    # key -> (type, smallest value that is still meaningful for UMAP / HDBSCAN)
    spec = {
        "umap_n_components": (int, 1),
        "umap_n_neighbors": (int, 2),
        "hdbscan_min_cluster_size": (int, 2),
        "hdbscan_min_samples": (int, 1),
        "hdbscan_cluster_selection_epsilon": (float, 0.0),
    }
    params = dict(raw)  # keep extra keys such as "reasoning"
    for key, (cast, floor) in spec.items():
        try:
            params[key] = max(floor, cast(raw[key]))
        except (KeyError, TypeError, ValueError, OverflowError):
            params[key] = defaults[key]
    params.setdefault("reasoning", "")
    return params


def _llm_recommend_params(n_sentences: int, n_papers: int, run_key: str) -> dict:
    """Call Mistral to recommend UMAP + HDBSCAN parameters based on dataset characteristics.

    Returns a dict with umap_n_components, umap_n_neighbors, hdbscan_min_cluster_size,
    hdbscan_min_samples, hdbscan_cluster_selection_epsilon (plus "reasoning").
    All five numeric keys are always present: a value the LLM omits or returns
    in an unusable form is replaced by its default. If the call itself fails,
    the full default set is returned.
    """
    debug(f">>> LLM param recommendation: {n_sentences} sentences, {n_papers} papers, run={run_key}")
    from langchain_mistralai import ChatMistralAI
    from langchain_core.output_parsers import JsonOutputParser

    defaults = {
        "umap_n_components": 10,
        "umap_n_neighbors": 15,
        "hdbscan_min_cluster_size": max(5, n_sentences // 100),
        "hdbscan_min_samples": 5,
        "hdbscan_cluster_selection_epsilon": 0.1,
        "reasoning": "defaults used (LLM call failed)",
    }
    prompt = (
        f"You are an NLP clustering expert. Given this academic paper dataset, "
        f"recommend optimal UMAP and HDBSCAN parameters.\n\n"
        f"Dataset:\n"
        f"- Total sentences to cluster: {n_sentences}\n"
        f"- Total papers: {n_papers}\n"
        f"- Run type: {run_key} (abstract=~10 sents/paper, title=1 sent/paper)\n"
        f"- Embedding dimension: 384 (all-MiniLM-L6-v2)\n\n"
        f"Rules:\n"
        f"- umap_n_components: 5-50 (lower for small datasets, higher for large)\n"
        f"- umap_n_neighbors: 5-50 (lower=local structure, higher=global)\n"
        f"- hdbscan_min_cluster_size: 5-50 (min papers per cluster; bigger=fewer clusters)\n"
        f"- hdbscan_min_samples: 1-20 (lower=more clusters, higher=more robust)\n"
        f"- hdbscan_cluster_selection_epsilon: 0.0-0.5 (merge nearby clusters)\n\n"
        f"Return ONLY valid JSON with exactly these keys:\n"
        f"{{\"umap_n_components\": int, \"umap_n_neighbors\": int, "
        f"\"hdbscan_min_cluster_size\": int, \"hdbscan_min_samples\": int, "
        f"\"hdbscan_cluster_selection_epsilon\": float, "
        f"\"reasoning\": \"one sentence explaining choices\"}}"
    )
    try:
        # Client construction is inside the try so a missing API key also
        # falls back to the defaults instead of aborting the pipeline.
        llm = ChatMistralAI(model="mistral-small-latest", temperature=0, timeout=120)
        raw = (llm | JsonOutputParser()).invoke(prompt)
        if not isinstance(raw, dict):
            raise ValueError(f"expected a JSON object, got {type(raw).__name__}")
    except Exception as e:  # deliberate best-effort: any failure -> defaults
        debug(f">>> LLM param recommendation failed ({e}), using defaults")
        return defaults
    params = _coerce_clustering_params(raw, defaults)
    debug(f">>> LLM recommended params: {params}")
    return params
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
| # HELPER: Bayesian optimization for HDBSCAN | |
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
def _bayesian_optimize_hdbscan(
    embeddings_2d: np.ndarray,
    llm_params: dict,
    n_trials: int = 30,
    seed: int = 42,
) -> dict:
    """Use Optuna to tune HDBSCAN min_cluster_size, min_samples and epsilon.

    Optimizes for silhouette score (non-noise points only) on the UMAP-reduced
    embeddings. Uses LLM-recommended params as starting point / search bounds.

    Args:
        embeddings_2d: Reduced embeddings, one row per sentence.
        llm_params: Dict that may hold ``hdbscan_min_cluster_size``,
            ``hdbscan_min_samples`` and ``hdbscan_cluster_selection_epsilon``.
        n_trials: Number of Optuna trials to run.
        seed: Seed for the Optuna sampler so repeated runs agree.

    Returns:
        Dict with ``min_cluster_size``, ``min_samples``,
        ``cluster_selection_epsilon``, ``silhouette_score`` and ``n_trials``.
        ``silhouette_score`` is None when no trial produced a valid clustering
        or when optimization failed (then ``n_trials`` is 0 and the LLM values
        are returned unchanged).
    """
    debug(">>> Bayesian optimization (Optuna) for HDBSCAN...")
    llm_mcs = llm_params.get("hdbscan_min_cluster_size", 10)
    llm_ms = llm_params.get("hdbscan_min_samples", 5)
    llm_eps = llm_params.get("hdbscan_cluster_selection_epsilon", 0.1)
    try:
        import optuna
        import hdbscan as hdbscan_lib
        from sklearn.metrics import silhouette_score
        optuna.logging.set_verbosity(optuna.logging.WARNING)

        seed_mcs = max(2, int(llm_mcs))
        seed_ms = max(1, int(llm_ms))
        seed_eps = max(0.0, float(llm_eps))

        # Bounds are computed once; each upper bound is forced to be >= its
        # lower bound so small datasets or a large epsilon cannot produce an
        # inverted range (which Optuna rejects).
        n_points = embeddings_2d.shape[0]
        mcs_low = max(2, seed_mcs // 2)
        mcs_high = max(mcs_low, min(n_points // 5, seed_mcs * 3))
        ms_low = max(1, seed_ms // 2)
        ms_high = max(2, seed_ms * 3)
        eps_low = max(0.0, seed_eps - 0.15)
        eps_high = max(eps_low, min(0.5, seed_eps + 0.25))

        def objective(trial):
            mcs = trial.suggest_int("min_cluster_size", mcs_low, mcs_high)
            ms = trial.suggest_int("min_samples", ms_low, ms_high)
            eps = trial.suggest_float("cluster_selection_epsilon", eps_low, eps_high)
            clusterer = hdbscan_lib.HDBSCAN(
                min_cluster_size=mcs,
                min_samples=ms,
                cluster_selection_epsilon=eps,
                metric="euclidean",
                cluster_selection_method="eom",
            )
            labels = clusterer.fit_predict(embeddings_2d)
            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            noise_ratio = int((labels == -1).sum()) / len(labels)
            # Penalize: fewer than 3 clusters, too many outliers (>40%)
            if n_clusters < 3 or noise_ratio > 0.4:
                return -1.0
            try:
                # Only score non-noise points
                mask = labels != -1
                return float(silhouette_score(
                    embeddings_2d[mask], labels[mask], metric="euclidean"))
            except Exception:  # best-effort: an unscorable trial is penalized
                return -1.0

        study = optuna.create_study(
            direction="maximize",
            sampler=optuna.samplers.TPESampler(seed=seed),
        )
        # Seed with LLM-recommended values first (clamped into the search range)
        study.enqueue_trial({
            "min_cluster_size": min(max(seed_mcs, mcs_low), mcs_high),
            "min_samples": min(max(seed_ms, ms_low), ms_high),
            "cluster_selection_epsilon": min(max(seed_eps, eps_low), eps_high),
        })
        study.optimize(objective, n_trials=n_trials, show_progress_bar=False)
        best = study.best_params
        best_score = study.best_value
        if best_score <= -1.0:
            # Every trial hit the penalty; -1.0 is a sentinel, not a real score.
            best_score = None
            debug(f">>> No trial produced a valid clustering; keeping params: {best}")
        else:
            debug(f">>> Best HDBSCAN params: {best}, silhouette={best_score:.4f}")
        return {**best, "silhouette_score": best_score, "n_trials": n_trials}
    except Exception as e:  # deliberate best-effort: fall back to LLM values
        debug(f">>> Bayesian optimization failed ({e}), using LLM defaults")
        return {
            "min_cluster_size": llm_mcs,
            "min_samples": llm_ms,
            "cluster_selection_epsilon": llm_eps,
            "silhouette_score": None,
            "n_trials": 0,
        }
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
| # HELPER: 3 LLM calls + 1 orchestration per cluster | |
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
def _label_single_cluster(cluster_summary: dict, run_key: str) -> dict:
    """For one cluster: make 3 independent Mistral labeling calls,
    then 1 orchestration/judge call to pick the final label.

    Args:
        cluster_summary: Topic summary dict; reads ``topic_id`` (required),
            ``nearest``, ``top_words``, ``sentence_count`` and ``paper_count``.
        run_key: Accepted for signature compatibility; not used here.

    Returns:
        The cluster dict enriched with the judge's fields (label, category,
        confidence, reasoning, ...) plus ``label_proposals``, the three
        proposals in temperature order (0.0, 0.3, 0.6). If the judge call
        fails, the first proposal is used.
    """
    from langchain_mistralai import ChatMistralAI
    from langchain_core.output_parsers import JsonOutputParser

    cid = cluster_summary["topic_id"]
    sentences = cluster_summary.get("nearest", [])
    sent_block = "\n".join(
        f" {i+1}. \"{e['sentence'][:200]}\"\n Paper: {e['title']}\n Keywords: {e['keywords']}"
        for i, e in enumerate(sentences)
    )
    top_words = cluster_summary.get("top_words", "")
    n_sent = cluster_summary.get("sentence_count", 0)
    n_paper = cluster_summary.get("paper_count", 0)
    base_prompt = (
        f"You are a research topic classifier for academic papers on Technology and Tourism.\n\n"
        f"Cluster {cid} ({n_sent} sentences from {n_paper} papers):\n"
        f"Top BERTopic words: {top_words}\n\n"
        f"5 nearest centroid sentences:\n{sent_block}\n\n"
        f"Return JSON with:\n"
        f"- label: short specific name (3-6 words, NOT generic)\n"
        f"- category: plain research area (e.g. 'AI and machine learning', 'virtual reality', "
        f"'sustainability', 'consumer behavior', 'social media marketing')\n"
        f"- confidence: high / medium / low\n"
        f"- reasoning: 1 sentence why\n"
        f"Return ONLY valid JSON."
    )
    parser = JsonOutputParser()

    def _one_call(temperature):
        # Any failure (client construction, network, unparsable or non-object
        # JSON) yields a placeholder proposal rather than aborting the cluster.
        try:
            model = ChatMistralAI(
                model="mistral-small-latest", temperature=temperature, timeout=120)
            proposal = (model | parser).invoke(base_prompt)
            if not isinstance(proposal, dict):
                raise ValueError(f"expected a JSON object, got {type(proposal).__name__}")
            return proposal
        except Exception as err:
            return {"label": f"Cluster {cid}", "category": "unknown",
                    "confidence": "low", "reasoning": str(err)}

    # 3 independent calls with slight temperature variation for diversity.
    # Executor.map keeps submission order, so "Option 1" is always the
    # temperature-0 proposal regardless of which call finishes first.
    with ThreadPoolExecutor(max_workers=3) as pool:
        proposals = list(pool.map(_one_call, [0.0, 0.3, 0.6]))
    debug(f">>> Cluster {cid}: got {len(proposals)} label proposals")

    # Orchestration / judge call: pick best label from 3 proposals
    judge_prompt = (
        f"You are an expert research label judge.\n\n"
        f"For cluster {cid} ({n_sent} sentences, {n_paper} papers on Tourism + Technology):\n"
        f"Top words: {top_words}\n\n"
        f"3 label proposals:\n"
        + "\n".join(f" Option {i+1}: {json.dumps(p)}" for i, p in enumerate(proposals))
        + f"\n\nPick the BEST label. It must be:\n"
        f"- Specific (not generic)\n"
        f"- Accurately describing the cluster content\n"
        f"- Clear for a research paper audience\n\n"
        f"Return JSON with the final chosen values:\n"
        f"{{\"label\": str, \"category\": str, \"confidence\": str, "
        f"\"reasoning\": str, \"chosen_option\": 1|2|3}}\n"
        f"Return ONLY valid JSON."
    )
    try:
        judge_llm = ChatMistralAI(model="mistral-small-latest", temperature=0, timeout=120)
        final = (judge_llm | parser).invoke(judge_prompt)
        if not isinstance(final, dict) or not final.get("label"):
            raise ValueError("judge returned no usable label")
    except Exception as e:
        debug(f">>> Judge call failed for cluster {cid}: {e}, using option 1")
        final = proposals[0]
    debug(f">>> Cluster {cid} final label: {final.get('label', '?')}")

    # LLM output must not overwrite the cluster's structural fields.
    protected = ("topic_id", "sentence_count", "paper_count", "top_words",
                 "nearest", "paper_titles", "label_proposals")
    verdict = {k: v for k, v in final.items() if k not in protected}
    return {**cluster_summary, **verdict, "label_proposals": proposals}
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
| # HELPER: Abstract-title cosine similarity check | |
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
def _validate_cluster_coherence(
    cluster_summary: dict,
    sent_df: pd.DataFrame,
    topics_arr: np.ndarray,
    embeddings: np.ndarray,
) -> dict:
    """Score how closely a cluster's abstract and title sentences agree.

    The score is the cosine similarity between the mean embedding of the
    cluster's rows tagged ``source_type == "abstract"`` and the mean embedding
    of those tagged ``"title"``.

    Args:
        cluster_summary: Cluster dict; ``topic_id`` selects the cluster.
        sent_df: Sentence-level frame whose rows align positionally with
            ``topics_arr`` and ``embeddings``.
        topics_arr: Topic id per sentence.
        embeddings: Embedding vector per sentence.

    Returns:
        A copy of ``cluster_summary`` with ``coherence_score`` (rounded to 4
        decimals, or None) and ``coherence_warning`` (a message, or None when
        the score is at least 0.5). The score is None when ``source_type`` is
        missing or the cluster contains only one source type.
    """
    cid = cluster_summary["topic_id"]
    # We need to know which sentences came from abstract vs title
    if "source_type" not in sent_df.columns:
        return {**cluster_summary, "coherence_score": None,
                "coherence_warning": "source_type column not available"}

    # Positional boolean masks keep rows and embeddings aligned without going
    # through index labels (which would break on a non-unique index).
    in_cluster = np.asarray(topics_arr) == cid
    source = sent_df["source_type"].to_numpy()
    abstract_rows = in_cluster & (source == "abstract")
    title_rows = in_cluster & (source == "title")
    if not abstract_rows.any() or not title_rows.any():
        return {**cluster_summary, "coherence_score": None,
                "coherence_warning": "only one source type in cluster (expected for single-run)"}

    abs_centroid = embeddings[abstract_rows].mean(axis=0)
    title_centroid = embeddings[title_rows].mean(axis=0)
    norm = np.linalg.norm(abs_centroid) * np.linalg.norm(title_centroid)
    # The small epsilon guards against a zero-norm centroid.
    score = float(np.dot(abs_centroid, title_centroid) / (norm + 1e-10))

    warning = None
    if score < 0.3:
        warning = f"LOW coherence ({score:.2f}) β abstract and title sentences diverge; cluster may be noisy"
    elif score < 0.5:
        warning = f"MODERATE coherence ({score:.2f}) β review cluster manually"
    debug(f">>> Cluster {cid} coherence score: {score:.3f}")
    return {**cluster_summary, "coherence_score": round(score, 4),
            "coherence_warning": warning}
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
| # TOOL 2: Sentence-Level BERTopic Pipeline | |
| # UMAP + HDBSCAN with LLM-guided params + Bayesian optimization | |
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
def run_bertopic_discovery(run_key: str, threshold: float = 0.7) -> str:
    """Sentence-level BERTopic: split papers -> embed -> LLM recommends params
    -> Bayesian optimization -> UMAP -> HDBSCAN -> centroid nearest 5 -> Plotly charts.

    Results (model, topics, embeddings, sentence frame) are stored in ``_data``
    under ``{run_key}_*`` keys. Embeddings, parameters and topic summaries are
    checkpointed under ``CHECKPOINT_DIR``.

    Args:
        run_key: One of 'abstract' or 'title'.
        threshold: Ignored (kept for API compatibility). Params now set by LLM + Bayesian optimization.

    Returns:
        Topic summary with sentence counts, paper counts, and 5 nearest centroid sentences.
        A short explanatory message is returned instead when the configured
        columns are missing or no sentence survives filtering.
    """
    debug(f"\n>>> TOOL: run_bertopic_discovery(run_key='{run_key}')")
    from bertopic import BERTopic
    from sentence_transformers import SentenceTransformer
    from umap import UMAP
    import hdbscan as hdbscan_lib
    from sklearn.preprocessing import FunctionTransformer

    df = _data["df"].copy()
    cols = RUN_CONFIGS[run_key]
    available = [c for c in cols if c in df.columns]
    debug(f">>> Columns: {available}")
    if not available:
        return f"[{run_key}] None of the columns {cols} are present in the loaded CSV."

    # -- Step 1: Assemble text per paper --
    df["_text"] = df[available].fillna("").astype(str).agg(" ".join, axis=1)
    df["_paper_id"] = df.index

    # -- Step 2: Split into sentences --
    debug(">>> Splitting into sentences...")
    df["_sentences"] = df["_text"].apply(_split_sentences)

    # -- Step 3: Explode to sentence-level DataFrame --
    meta_cols = ["_paper_id", "Title", "Abstract", "Author Keywords", "_sentences"]
    available_meta = [c for c in meta_cols if c in df.columns]
    sent_df = (
        df[available_meta]
        .explode("_sentences")
        .rename(columns={"_sentences": "text"})
        .dropna(subset=["text"])
        .reset_index(drop=True)
    )
    # Tag source type for coherence validation
    sent_df["source_type"] = run_key

    # -- Step 3b: Filter boilerplate --
    debug(">>> Filtering publisher boilerplate...")
    n_before = len(sent_df)
    clean_mask = ~sent_df["text"].str.contains(
        _BOILERPLATE_RE.pattern, case=False, regex=True, na=False)
    sent_df = sent_df[clean_mask].reset_index(drop=True)
    sent_df["sent_id"] = sent_df.groupby("_paper_id").cumcount()
    debug(f">>> Filtered: {n_before} β {len(sent_df)} sentences")
    n_sentences = len(sent_df)
    n_papers = len(df)
    if n_sentences == 0:
        return f"[{run_key}] No sentences left to cluster after splitting and filtering ({n_papers} papers)."

    # -- Step 4: Embed sentences (384d, L2-normalized) --
    debug(">>> Embedding with all-MiniLM-L6-v2...")
    docs = sent_df["text"].tolist()
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = embedder.encode(docs, show_progress_bar=False, normalize_embeddings=True)
    debug(f">>> Embeddings: {embeddings.shape}")
    np.save(f"{CHECKPOINT_DIR}/rq4_{run_key}_emb.npy", embeddings)

    # -- Step 5: LLM recommends UMAP + HDBSCAN params --
    debug(">>> Asking LLM for optimal clustering parameters...")
    llm_params = _llm_recommend_params(n_sentences, n_papers, run_key)
    debug(f">>> LLM params: {llm_params}")
    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_llm_params.json", "w") as fh:
        json.dump(llm_params, fh, indent=2)
    # The LLM may omit a key; fall back to the same defaults used when the
    # recommendation call fails.
    n_components = llm_params.get("umap_n_components", 10)
    n_neighbors = llm_params.get("umap_n_neighbors", 15)

    # -- Step 6: UMAP dim reduction --
    debug(f">>> UMAP: {embeddings.shape[1]}d β {n_components}d ...")
    umap_model = UMAP(
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=0.0,
        metric="cosine",
        random_state=42,
    )
    # Despite the name, this has n_components columns, not necessarily 2.
    embeddings_2d = umap_model.fit_transform(embeddings)
    debug(f">>> UMAP output shape: {embeddings_2d.shape}")

    # -- Step 7: Bayesian optimization for HDBSCAN params --
    best_params = _bayesian_optimize_hdbscan(embeddings_2d, llm_params)
    debug(f">>> Optimized HDBSCAN params: {best_params}")
    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_hdbscan_params.json", "w") as fh:
        json.dump(best_params, fh, indent=2)

    # -- Step 8: Fit BERTopic with UMAP + HDBSCAN --
    debug(">>> Fitting BERTopic with UMAP + HDBSCAN...")
    cluster_model = hdbscan_lib.HDBSCAN(
        min_cluster_size=best_params["min_cluster_size"],
        min_samples=best_params["min_samples"],
        cluster_selection_epsilon=best_params["cluster_selection_epsilon"],
        metric="euclidean",
        cluster_selection_method="eom",
        prediction_data=True,
    )
    # Pass identity UMAP since we already reduced
    no_umap = FunctionTransformer()
    topic_model = BERTopic(
        hdbscan_model=cluster_model,
        umap_model=no_umap,
    )
    topics, _ = topic_model.fit_transform(docs, embeddings_2d)
    topics_arr = np.array(topics)
    n_topics = len(set(topics)) - int(-1 in topics)
    n_outliers = int(np.sum(topics_arr == -1))
    debug(f">>> {n_topics} topics, {n_outliers} outliers ({100*n_outliers/len(topics):.1f}%)")
    _data[f"{run_key}_model"] = topic_model
    _data[f"{run_key}_topics"] = topics_arr
    _data[f"{run_key}_embeddings"] = embeddings
    _data[f"{run_key}_embeddings_2d"] = embeddings_2d
    _data[f"{run_key}_sent_df"] = sent_df

    # -- Step 9: Plotly visualizations --
    # Each chart is only attempted when the run has at least the number of
    # topics the original thresholds require.
    debug(">>> Generating visualizations...")
    if n_topics >= 3:
        topic_model.visualize_topics().write_html(
            f"/tmp/rq4_{run_key}_intertopic.html", include_plotlyjs="cdn")
    if n_topics >= 1:
        topic_model.visualize_barchart(
            top_n_topics=min(10, max(1, n_topics))).write_html(
            f"/tmp/rq4_{run_key}_bars.html", include_plotlyjs="cdn")
    if n_topics >= 2:
        topic_model.visualize_hierarchy().write_html(
            f"/tmp/rq4_{run_key}_hierarchy.html", include_plotlyjs="cdn")
        topic_model.visualize_heatmap().write_html(
            f"/tmp/rq4_{run_key}_heatmap.html", include_plotlyjs="cdn")

    # -- Step 10: Centroid nearest 5 sentences per topic --
    topic_info = topic_model.get_topic_info()
    valid_rows = [r for r in topic_info.to_dict("records") if r["Topic"] != -1]

    def _centroid_nearest(row):
        # Rank the topic's sentences by cosine distance to the topic's mean
        # embedding (full 384d space, not the reduced one).
        mask = topics_arr == row["Topic"]
        member_idx = np.where(mask)[0]
        member_embs = embeddings[mask]
        centroid = member_embs.mean(axis=0)
        norms = np.linalg.norm(member_embs, axis=1) * np.linalg.norm(centroid)
        cosine_sim = (member_embs @ centroid) / (norms + 1e-10)
        nearest = np.argsort(1 - cosine_sim)[:NEAREST_K]

        nearest_evidence = []
        for i in nearest:
            sent_row = sent_df.iloc[member_idx[i]]
            nearest_evidence.append({
                "sentence": str(sent_row["text"])[:250],
                "paper_id": int(sent_row["_paper_id"]),
                "title": str(sent_row.get("Title", ""))[:150],
                "keywords": str(sent_row.get("Author Keywords", ""))[:150],
            })

        topic_papers_df = sent_df.iloc[member_idx].drop_duplicates(subset=["_paper_id"])
        unique_papers = len(topic_papers_df)
        paper_titles = [
            str(topic_papers_df.iloc[pos].get("Title", ""))[:200]
            for pos in range(min(50, unique_papers))
        ]
        return {"topic_id": int(row["Topic"]),
                "sentence_count": int(row["Count"]),
                "paper_count": int(unique_papers),
                "top_words": str(row.get("Name", ""))[:100],
                "nearest": nearest_evidence,
                "paper_titles": paper_titles}

    summaries = [_centroid_nearest(row) for row in valid_rows]
    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_summaries.json", "w") as fh:
        json.dump(summaries, fh, indent=2, default=str)
    debug(f">>> {len(summaries)} topics saved")

    lines = [
        f" Topic {s['topic_id']} ({s['sentence_count']} sentences, {s['paper_count']} papers): {s['top_words']}"
        for s in summaries
    ]
    sil = best_params.get("silhouette_score")
    sil_str = f"{sil:.3f}" if sil is not None else "N/A"
    return (
        f"[{run_key}] {n_topics} topics from {n_sentences} sentences ({n_papers} papers, {n_outliers} outliers).\n\n"
        f"**Pipeline:**\n"
        f" Embeddings: all-MiniLM-L6-v2 (384d)\n"
        f" LLM-recommended params: n_components={n_components}, "
        f"n_neighbors={n_neighbors} | {llm_params.get('reasoning', '')}\n"
        f" Bayesian optimization ({best_params['n_trials']} trials): "
        f"min_cluster_size={best_params['min_cluster_size']}, "
        f"min_samples={best_params['min_samples']}, "
        f"epsilon={best_params['cluster_selection_epsilon']:.3f}, "
        f"silhouette={sil_str}\n\n"
        + "\n".join(lines)
        + f"\n\nVisualizations: /tmp/rq4_{run_key}_*.html"
        + f"\nCheckpoints: {CHECKPOINT_DIR}/rq4_{run_key}_emb.npy + summaries.json"
    )
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
| # TOOL 3: Label Topics β 3 LLM calls + 1 judge per cluster | |
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
def label_topics_with_llm(run_key: str) -> str:
    """For each cluster: 3 independent Mistral labeling calls + 1 orchestration judge call.
    Also runs abstract-title cosine similarity coherence check per cluster.

    Reads ``rq4_{run_key}_summaries.json`` from ``CHECKPOINT_DIR`` and writes
    ``rq4_{run_key}_labels.json`` there. Only the 100 largest clusters are sent
    to the LLM; the rest are kept with their top words as label.

    Args:
        run_key: One of 'abstract' or 'title'.

    Returns:
        Labeled topics with sentence-level evidence and coherence scores.
    """
    debug(f"\n>>> TOOL: label_topics_with_llm(run_key='{run_key}')")
    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_summaries.json") as fh:
        summaries = json.load(fh)
    debug(f">>> Loaded {len(summaries)} topics")

    # Top 100 by size to avoid API rate limits
    MAX_LABEL_TOPICS = 100
    sorted_summaries = sorted(summaries, key=lambda s: s.get("sentence_count", 0), reverse=True)
    summaries_to_label = sorted_summaries[:MAX_LABEL_TOPICS]
    skipped_summaries = sorted_summaries[MAX_LABEL_TOPICS:]
    skipped = max(0, len(summaries) - MAX_LABEL_TOPICS)
    debug(f">>> Labeling top {len(summaries_to_label)} (skipping {skipped} tiny clusters)")

    # Run 3+1 LLM calls per cluster in parallel (thread pool, max 5 concurrent)
    debug(">>> Running 3-call + 1 judge labeling pipeline per cluster...")
    labeled = []
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_summary = {
            executor.submit(_label_single_cluster, s, run_key): s
            for s in summaries_to_label
        }
        for future in as_completed(future_to_summary):
            try:
                labeled.append(future.result())
            except Exception as e:  # one failed cluster must not lose the rest
                s = future_to_summary[future]
                debug(f">>> Labeling failed for cluster {s.get('topic_id', '?')}: {e}")
                labeled.append({**s, "label": f"Cluster {s.get('topic_id', '?')}",
                                "category": "unknown", "confidence": "low",
                                "reasoning": str(e)})
    # Sort back by topic_id
    labeled.sort(key=lambda x: x.get("topic_id", 0))

    # -- Abstract-title coherence check --
    debug(">>> Running abstract-title coherence validation...")
    topics_arr = _data.get(f"{run_key}_topics")
    embeddings = _data.get(f"{run_key}_embeddings")
    sent_df = _data.get(f"{run_key}_sent_df")
    # Coherence check only meaningful for mixed-source runs
    # For single-source runs (abstract or title only) it gracefully returns None
    if topics_arr is not None and embeddings is not None and sent_df is not None:
        labeled = [
            _validate_cluster_coherence(topic, sent_df, topics_arr, embeddings)
            for topic in labeled
        ]

    # For small skipped clusters, copy without labeling
    unlabeled = [
        {**s, "label": s.get("top_words", f"Cluster {s['topic_id']}")[:50],
         "category": "unlabeled", "confidence": "low",
         "reasoning": "skipped (small cluster)", "coherence_score": None}
        for s in skipped_summaries
    ]
    labeled_all = labeled + unlabeled
    labeled_all.sort(key=lambda x: x.get("topic_id", 0))
    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json", "w") as fh:
        json.dump(labeled_all, fh, indent=2, default=str)
    debug(f">>> Labels saved: {CHECKPOINT_DIR}/rq4_{run_key}_labels.json")

    def _cs(topic):
        cs = topic.get("coherence_score")
        return f"{cs:.2f}" if cs is not None else "N/A"

    def _format_topic(topic):
        evidence = "\n".join(
            f" β \"{e['sentence'][:120]}...\" β _{e['title'][:60]}_"
            for e in topic.get("nearest", [])
        )
        return (
            f" **Topic {topic.get('topic_id', '?')}: {topic.get('label', '?')}** "
            f"[{topic.get('category', '?')}] conf={topic.get('confidence', '?')} "
            f"coherence={_cs(topic)}"
            + (f" β οΈ {topic['coherence_warning']}" if topic.get('coherence_warning') else "")
            + f" ({topic.get('sentence_count', 0)} sent, {topic.get('paper_count', 0)} papers)\n"
            + evidence
        )

    lines = [_format_topic(topic) for topic in labeled_all]
    return (f"[{run_key}] {len(labeled_all)} topics labeled (3 LLM calls + judge per cluster):\n\n"
            + "\n\n".join(lines))
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
| # TOOL 4: Generate Comparison CSV | |
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
def generate_comparison_csv() -> str:
    """Compare Mistral-labeled topics across completed runs.

    A run counts as completed when its ``rq4_{run}_labels.json`` checkpoint
    exists. All topics are written to ``/tmp/rq4_comparison.csv``.

    Returns:
        Comparison table + CSV path.
    """
    debug("\n>>> TOOL: generate_comparison_csv()")
    completed = [
        k for k in RUN_CONFIGS
        if os.path.exists(f"{CHECKPOINT_DIR}/rq4_{k}_labels.json")
    ]

    def _load_run(run_key):
        with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json") as fh:
            labels = json.load(fh)
        return [{
            "run": run_key, "topic_id": topic.get("topic_id", ""),
            "label": topic.get("label", ""), "category": topic.get("category", ""),
            "confidence": topic.get("confidence", ""), "niche": topic.get("niche", ""),
            "sentences": topic.get("sentence_count", 0),
            "papers": topic.get("paper_count", 0),
            "coherence_score": topic.get("coherence_score", ""),
            "coherence_warning": topic.get("coherence_warning", ""),
            "top_words": topic.get("top_words", ""),
        } for topic in labels]

    # Flatten run by run in one pass (sum(lists, []) re-copies on every step).
    all_rows = [row for run_key in completed for row in _load_run(run_key)]
    df = pd.DataFrame(all_rows)
    path = "/tmp/rq4_comparison.csv"
    df.to_csv(path, index=False)
    return f"Comparison saved: {path} ({len(completed)} runs, {len(df)} topics)\n\n{df.to_string(index=False)}"
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
| # TOOL 5: Export 500-Word Narrative | |
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
def export_narrative(run_key: str) -> str:
    """Generate 500-word narrative for research paper Section 7 via Mistral.

    Reads the run's labels checkpoint (required) and HDBSCAN-params checkpoint
    (optional) from ``CHECKPOINT_DIR`` and writes the narrative to
    ``/tmp/rq4_narrative.txt``.

    Args:
        run_key: One of 'abstract' or 'title'.

    Returns:
        500-word narrative + save path.
    """
    debug(f"\n>>> TOOL: export_narrative(run_key='{run_key}')")
    from langchain_mistralai import ChatMistralAI

    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json") as fh:
        labels = json.load(fh)
    hdbscan_params = {}
    hdbscan_path = f"{CHECKPOINT_DIR}/rq4_{run_key}_hdbscan_params.json"
    if os.path.exists(hdbscan_path):
        with open(hdbscan_path) as fh:
            hdbscan_params.update(json.load(fh))

    def _fmt_score(s):
        return f"{s:.3f}" if s is not None else "N/A"

    def _fmt_coherence(topic):
        cs = topic.get("coherence_score")
        return f"{cs:.2f}" if cs is not None else "N/A"

    topics_text = "\n".join(
        f"- {topic.get('label', '?')} ({topic.get('sentence_count', 0)} sentences from "
        f"{topic.get('paper_count', 0)} papers, category: {topic.get('category', '?')}, "
        f"confidence: {topic.get('confidence', '?')}, "
        f"coherence: {_fmt_coherence(topic)})"
        for topic in labels
    )
    sil_score_str = _fmt_score(hdbscan_params.get("silhouette_score"))

    # Describe the run that actually happened: paper count from the loaded
    # CSV and trial count from the checkpoint. The literals are the values the
    # prompt used to hard-code, kept as fallbacks when that data is absent.
    loaded_df = _data.get("df")
    n_papers = len(loaded_df) if loaded_df is not None else 1390
    n_trials = hdbscan_params.get("n_trials", 30)

    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.3, timeout=300)
    result = llm.invoke(
        f"Write exactly 500 words for a research paper Section 7 titled "
        f"'Topic Modeling Results β BERTopic Discovery'.\n\n"
        f"Dataset: {n_papers} Scopus papers on Tourism and AI.\n"
        f"Method: Sentence-level BERTopic β each abstract split into sentences, "
        f"embedded with all-MiniLM-L6-v2 (384d), UMAP dim reduction, HDBSCAN clustering.\n"
        f"Clustering optimized via Bayesian optimization (Optuna, {n_trials} trials): "
        f"min_cluster_size={hdbscan_params.get('min_cluster_size', 'N/A')}, "
        f"min_samples={hdbscan_params.get('min_samples', 'N/A')}, "
        f"silhouette={sil_score_str}.\n"
        f"Each cluster labeled via 3 independent LLM calls + 1 judge orchestration.\n"
        f"Coherence validated via abstract-title cosine similarity.\n"
        f"Run config: '{run_key}' columns.\n\n"
        f"Topics discovered:\n{topics_text}\n\n"
        f"Include: methodology justification, key themes, coherence findings, limitations, future work."
    )
    path = "/tmp/rq4_narrative.txt"
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(result.content)
    return f"Narrative saved: {path}\n\n{result.content}"
# ───────────────────────────────────────────────
# TOOL 6: Consolidate Round 1 Topics into Themes
# ───────────────────────────────────────────────
def consolidate_into_themes(run_key: str, theme_map: dict) -> str:
    """ROUND 2: Merge fine-grained Round 1 topics into broader themes.

    For each theme, collects every sentence whose Round 1 topic id is listed
    for that theme, picks the NEAREST_K sentences closest (by cosine
    distance) to the mean embedding of those sentences as evidence, and
    counts the distinct papers involved. The themes are saved to
    rq4_<run_key>_themes.json in CHECKPOINT_DIR.

    Args:
        run_key: 'abstract' or 'title'.
        theme_map: Dict mapping theme names to topic ID lists.
            Example: {"AI in Tourism": [0, 1, 5], "VR Tourism": [2, 3]}

    Returns:
        Consolidated themes with new 5-nearest sentence evidence per theme.

    Raises:
        KeyError: If the topics, embeddings or sentence frame for run_key
            have not been stored in the module-level _data cache.
    """
    debug(f"\n>>> TOOL: consolidate_into_themes(run_key='{run_key}', {len(theme_map)} themes)")
    topics_arr = _data[f"{run_key}_topics"]
    embeddings = _data[f"{run_key}_embeddings"]
    sent_df = _data[f"{run_key}_sent_df"]

    def _build_theme(theme_name, topic_ids):
        # Boolean mask over sentences: True where the sentence's topic id is
        # one of the ids merged into this theme.
        mask = np.isin(topics_arr, topic_ids)
        member_idx = np.where(mask)[0]
        members = sent_df.iloc[member_idx]

        nearest_evidence = []
        # A theme that matches no sentence has no centroid; skipping the math
        # avoids a NaN mean (and its RuntimeWarning) and leaves evidence empty.
        if member_idx.size:
            member_embs = embeddings[mask]
            centroid = member_embs.mean(axis=0)
            norms = np.linalg.norm(member_embs, axis=1) * np.linalg.norm(centroid)
            # 1e-10 keeps the division finite for zero-length vectors.
            cosine_sim = (member_embs @ centroid) / (norms + 1e-10)
            dists = 1 - cosine_sim
            for pos in np.argsort(dists)[:NEAREST_K]:
                row = members.iloc[pos]
                nearest_evidence.append({
                    "sentence": str(row["text"])[:250],
                    "paper_id": int(row["_paper_id"]),
                    "title": str(row.get("Title", ""))[:150],
                    "keywords": str(row.get("Author Keywords", ""))[:150],
                })

        unique_papers = members["_paper_id"].nunique()
        # One row per paper, so each title is listed once (capped at 50).
        topic_papers_df = members.drop_duplicates(subset=["_paper_id"])
        paper_titles = [
            str(topic_papers_df.iloc[pos].get("Title", ""))[:200]
            for pos in range(min(50, len(topic_papers_df)))
        ]
        return {"label": theme_name, "merged_topics": list(topic_ids),
                "sentence_count": int(mask.sum()), "paper_count": int(unique_papers),
                "nearest": nearest_evidence, "paper_titles": paper_titles}

    # Themes get sequential ids in theme_map iteration order.
    themes = [
        {**_build_theme(theme_name, topic_ids), "topic_id": theme_id}
        for theme_id, (theme_name, topic_ids) in enumerate(theme_map.items())
    ]
    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_themes.json", "w") as fh:
        json.dump(themes, fh, indent=2, default=str)

    # NOTE(review): the 'β' characters in the evidence line look like
    # mis-decoded symbols -- confirm against the original file; reproduced
    # here exactly as found.
    lines = [
        (f" **{t['label']}** ({t['sentence_count']} sentences, {t['paper_count']} papers)\n"
         f" Merged from topics: {t['merged_topics']}\n"
         f" Evidence:\n"
         + "\n".join(
             f" β \"{e['sentence'][:120]}...\" β _{e['title'][:60]}_"
             for e in t["nearest"]))
        for t in themes
    ]
    return f"[{run_key}] Round 2: {len(themes)} themes consolidated:\n\n" + "\n\n".join(lines)
# ───────────────────────────────────────────────
# TOOL 7: Compare Themes with PAJAIS Taxonomy
# ───────────────────────────────────────────────
# Reference category names that compare_with_taxonomy() lists in its prompt
# (one bullet per entry) as the taxonomy BERTopic themes are matched against.
PAJAIS_TAXONOMY = [
    "Electronic and Mobile Business / Social Commerce",
    "Human Behavior and IS / Human-Computer Interaction",
    "IS/IT Strategy, Leadership, Governance",
    "Business Intelligence and Data Analytics",
    "Design Science and IS",
    "Enterprise Systems and BPM",
    "IS Implementation, Adoption, and Diffusion",
    "Social Media and Business Impact",
    "Cultural and Global Issues in IS",
    "IS Security and Privacy",
    "IS Smart / IoT",
    "Knowledge Management",
    "ICT / Digital Platform / IT and Work",
    "IS Healthcare",
    "IT Project Management",
    "Service Science and IS",
    "Social and Organizational Aspects of IS",
    "Research Methods and Philosophy",
    "E-Finance / Economics of IS",
    "E-Government",
    "IS Education and Learning",
    "Green IT and Sustainability",
]
def compare_with_taxonomy(run_key: str) -> str:
    """Compare BERTopic themes against PAJAIS/PACIS taxonomy (Jiang et al., 2019).

    Uses the Round 2 themes checkpoint when it exists, otherwise the Round 1
    labels checkpoint. The theme list and PAJAIS_TAXONOMY are sent to the
    Mistral chat model, whose JSON answer is saved to
    rq4_<run_key>_taxonomy_map.json in CHECKPOINT_DIR.

    Args:
        run_key: 'abstract' or 'title'.

    Returns:
        Mapping table: BERTopic theme → PAJAIS category (or NOVEL).

    Raises:
        FileNotFoundError: If neither the themes nor the labels checkpoint
            exists for run_key.
    """
    debug(f"\n>>> TOOL: compare_with_taxonomy(run_key='{run_key}')")
    # Imported lazily so the module can be loaded without the LLM stack.
    from langchain_mistralai import ChatMistralAI
    from langchain_core.prompts import PromptTemplate
    from langchain_core.output_parsers import JsonOutputParser

    themes_path = f"{CHECKPOINT_DIR}/rq4_{run_key}_themes.json"
    labels_path = f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json"
    # Prefer consolidated themes; fall back to the per-topic labels.
    source_path = themes_path if os.path.exists(themes_path) else labels_path
    with open(source_path) as fh:
        themes = json.load(fh)

    # Entries carry 'paper_count' or, failing that, 'count'.
    themes_text = "\n".join(
        f"- {t.get('label', '?')} ({t.get('paper_count', t.get('count', '?'))} papers)"
        for t in themes
    )
    taxonomy_text = "\n".join(f"- {category}" for category in PAJAIS_TAXONOMY)

    prompt = PromptTemplate.from_template(
        "You are an IS research taxonomy expert.\n\n"
        "Compare each BERTopic theme against the established PAJAIS/PACIS taxonomy.\n"
        "For EACH theme, return a JSON ARRAY with:\n"
        "- label: the BERTopic theme name\n"
        "- pajais_match: closest PAJAIS category (or 'NOVEL' if no match)\n"
        "- match_confidence: high, medium, low, or none\n"
        "- reasoning: why this mapping (1 sentence)\n"
        "- is_novel: true if this theme represents an emerging area not in the taxonomy\n\n"
        "Return ONLY valid JSON array.\n\n"
        "BERTopic Themes:\n{themes}\n\n"
        "PAJAIS Taxonomy (Jiang et al., 2019):\n{taxonomy}")
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0, timeout=300)
    chain = prompt | llm | JsonOutputParser()
    # assumes the parsed reply is a list of dicts, as the prompt requests --
    # TODO confirm; the .get() calls below rely on it.
    mappings = chain.invoke({"themes": themes_text, "taxonomy": taxonomy_text})

    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_taxonomy_map.json", "w") as fh:
        json.dump(mappings, fh, indent=2, default=str)

    # Split once into themes flagged novel and themes mapped to a category.
    novel, mapped = [], []
    for entry in mappings:
        (novel if entry.get("is_novel", False) else mapped).append(entry)

    # NOTE(review): the 'β' and 'π' characters below look like mis-decoded
    # symbols -- confirm against the original file; reproduced exactly.
    mapped_lines = [
        f" β {m.get('label', '?')} β **{m.get('pajais_match', '?')}** "
        f"(conf={m.get('match_confidence', '?')}) _{m.get('reasoning', '')}_"
        for m in mapped
    ]
    novel_lines = [
        f" π **{m.get('label', '?')}** β NOVEL _{m.get('reasoning', '')}_"
        for m in novel
    ]
    return (f"[{run_key}] Taxonomy comparison (Jiang et al., 2019):\n\n"
            f"**Mapped to PAJAIS categories ({len(mapped)}):**\n" + "\n".join(mapped_lines) +
            f"\n\n**NOVEL / Emerging themes ({len(novel)}):**\n" + "\n".join(novel_lines) +
            f"\n\nSaved: {CHECKPOINT_DIR}/rq4_{run_key}_taxonomy_map.json")
# ───────────────────────────────────────────────
# GET ALL TOOLS
# ───────────────────────────────────────────────
def get_all_tools():
    """Return the pipeline's tools with `handle_tool_error` set to True on each.

    Returns:
        List of the seven tool callables defined in this module.
    """
    tools = [load_scopus_csv, run_bertopic_discovery, label_topics_with_llm,
             consolidate_into_themes, compare_with_taxonomy,
             generate_comparison_csv, export_narrative]
    # NOTE(review): handle_tool_error is presumably read by the LangChain tool
    # runtime so a failing tool reports its error instead of raising --
    # confirm these entries are LangChain tool objects.
    for tool_obj in tools:
        tool_obj.handle_tool_error = True
    debug(f">>> tools.py: {len(tools)} tools ready")
    return tools