"""tools.py — Sentence-level BERTopic pipeline + Mistral LLM.

Version 4.0.0 | May 2026.

PIPELINE:
    Paper → split sentences → embed (all-MiniLM-L6-v2, 384d)
    → LLM recommends UMAP + HDBSCAN params based on dataset stats
    → Bayesian optimization (Optuna) tunes min_cluster_size + min_samples
    → UMAP dim reduction → HDBSCAN clustering
    → 3 LLM labeling calls per cluster + 1 orchestration judge call
    → Abstract-title cosine similarity validation (hallucination check)
"""
from langchain_core.tools import tool
import os
import json
import re
import numpy as np
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

# Debug logging switch: `debug` is `print` when DEBUG is on, a no-op otherwise.
DEBUG = True
debug = print if DEBUG else (lambda *a, **k: None)

# Directory for intermediate artefacts (embeddings, params, summaries, labels).
CHECKPOINT_DIR = "/tmp/checkpoints"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Number of sentences closest to a cluster centroid kept as evidence.
NEAREST_K = 5
# Split after sentence-ending punctuation when followed by whitespace + capital.
SENT_SPLIT_RE = r'(?<=[.!?])\s+(?=[A-Z])'
# Fragments shorter than this (after stripping) are discarded.
MIN_SENT_LEN = 30

# Which CSV columns feed each run type.
RUN_CONFIGS = {
    "abstract": ["Abstract"],
    "title": ["Title"],
}

# In-process state shared between tools (loaded DataFrame, models, arrays).
_data = {}


# ═══════════════════════════════════════════════
# HELPER: Split text into sentences
# ═══════════════════════════════════════════════
def _split_sentences(text):
    """Split `text` into sentences, keeping only sufficiently long pieces.

    The input is coerced with `str()`, split on SENT_SPLIT_RE, and a piece is
    kept when its stripped length is at least MIN_SENT_LEN. Kept pieces are
    returned as produced by the split (they are not stripped).
    """
    pieces = re.split(SENT_SPLIT_RE, str(text))
    return [piece for piece in pieces if len(piece.strip()) >= MIN_SENT_LEN]


# ═══════════════════════════════════════════════
# HELPER: Filter publisher boilerplate
# ═══════════════════════════════════════════════
# Alternatives are OR-ed into one case-insensitive pattern; a sentence matching
# any of them is treated as publisher / structural boilerplate.
_BOILERPLATE_PATTERNS = (
    r"Licensee MDPI",
    r"Published by Informa",
    r"Published by Elsevier",
    r"Taylor & Francis",
    r"Copyright ©",
    r"Creative Commons",
    r"open access article",
    r"Inderscience Enterprises",
    r"All rights reserved",
    r"This is an open access",
    r"distributed under the terms",
    r"The Author\(s\)",
    r"Springer Nature",
    r"Emerald Publishing",
    r"limitations and future",
    r"limitations and implications",
    r"limitations are discussed",
    r"limitations have been discussed",
    r"implications are discussed",
    r"implications were discussed",
    r"implications are presented",
    r"concludes with .* implications",
)
_BOILERPLATE_RE = re.compile("|".join(_BOILERPLATE_PATTERNS), re.IGNORECASE)
# ═══════════════════════════════════════════════
# TOOL 1: Load Scopus CSV
# ═══════════════════════════════════════════════
@tool
def load_scopus_csv(filepath: str) -> str:
    """Load a Scopus CSV export and show preview. Call this first.

    Args:
        filepath: Path to the uploaded .csv file.

    Returns:
        Row count, column names, and sample data."""
    debug(f"\n>>> TOOL: load_scopus_csv(filepath='{filepath}')")
    df = pd.read_csv(filepath, encoding="utf-8-sig")
    _data["df"] = df  # shared with the downstream tools
    debug(f">>> Loaded {len(df)} rows, {len(df.columns)} columns")

    # Only preview the columns that are actually present in this export.
    target_cols = [c for c in ("Title", "Abstract", "Author Keywords") if c in df.columns]
    sample = df[target_cols].head(3).to_string(max_colwidth=80)
    null_counts = ", ".join(
        f"{c}: {df[c].notna().sum()}/{len(df)}" for c in target_cols)

    # Estimate the total sentence count from the first 5 abstracts. Guard
    # against a missing "Abstract" column (KeyError) and an empty frame (the
    # mean of nothing is NaN, and int(NaN) raises ValueError).
    avg_abstract_sents = 0.0
    if "Abstract" in df.columns and len(df) > 0:
        sample_sents = df["Abstract"].head(5).apply(_split_sentences).apply(len)
        avg_abstract_sents = float(sample_sents.mean())
    est_abstract = int(avg_abstract_sents * len(df))
    title_count = int(df["Title"].notna().sum()) if "Title" in df.columns else 0

    return (f"📊 **Dataset Statistics:**\n"
            f"- **Papers:** {len(df)}\n"
            f"- **Abstract sentences:** ~{est_abstract} (~{avg_abstract_sents:.0f} per paper)\n"
            f"- **Title sentences:** {title_count} (1 per paper)\n"
            f"- **Non-null:** {null_counts}\n\n"
            f"Columns: {', '.join(list(df.columns)[:15])}\n\n"
            f"Sample:\n{sample}")


# ═══════════════════════════════════════════════
# HELPER: LLM recommends UMAP + HDBSCAN params
# ═══════════════════════════════════════════════
# (type, low, high) per numeric parameter — the same ranges the prompt below
# gives to the model, enforced on whatever it actually returns.
_LLM_PARAM_BOUNDS = {
    "umap_n_components": (int, 5, 50),
    "umap_n_neighbors": (int, 5, 50),
    "hdbscan_min_cluster_size": (int, 5, 50),
    "hdbscan_min_samples": (int, 1, 20),
    "hdbscan_cluster_selection_epsilon": (float, 0.0, 0.5),
}


def _sanitize_llm_params(raw, defaults: dict) -> dict:
    """Coerce an LLM reply into a complete, range-checked parameter dict.

    Each key in _LLM_PARAM_BOUNDS is cast to its expected type and clamped to
    its range; a missing or unconvertible value is replaced by the matching
    entry of `defaults`. Any other keys in the reply are passed through.

    Raises:
        TypeError: if `raw` is not a JSON object (dict).
    """
    if not isinstance(raw, dict):
        raise TypeError(f"expected a JSON object, got {type(raw).__name__}")
    clean = dict(raw)
    for key, (cast, low, high) in _LLM_PARAM_BOUNDS.items():
        try:
            value = cast(raw[key])
        except (KeyError, TypeError, ValueError):
            clean[key] = defaults[key]
        else:
            clean[key] = min(high, max(low, value))
    clean.setdefault("reasoning", "")
    return clean


def _llm_recommend_params(n_sentences: int, n_papers: int, run_key: str) -> dict:
    """Call Mistral to recommend UMAP + HDBSCAN parameters based on dataset characteristics.

    Args:
        n_sentences: Number of sentences that will be clustered.
        n_papers: Number of papers the sentences come from.
        run_key: Run type, interpolated into the prompt.

    Returns:
        A dict with umap_n_components, umap_n_neighbors, hdbscan_min_cluster_size,
        hdbscan_min_samples, hdbscan_cluster_selection_epsilon and reasoning.
        All five numeric keys are always present: values from the LLM are
        type-coerced and clamped, and built-in defaults are used for anything
        missing or when the call fails entirely.
    """
    debug(f">>> LLM param recommendation: {n_sentences} sentences, {n_papers} papers, run={run_key}")
    from langchain_mistralai import ChatMistralAI
    from langchain_core.output_parsers import JsonOutputParser

    defaults = {
        "umap_n_components": 10,
        "umap_n_neighbors": 15,
        "hdbscan_min_cluster_size": max(5, n_sentences // 100),
        "hdbscan_min_samples": 5,
        "hdbscan_cluster_selection_epsilon": 0.1,
        "reasoning": "defaults used (LLM call failed)",
    }
    prompt = (
        f"You are an NLP clustering expert. Given this academic paper dataset, "
        f"recommend optimal UMAP and HDBSCAN parameters.\n\n"
        f"Dataset:\n"
        f"- Total sentences to cluster: {n_sentences}\n"
        f"- Total papers: {n_papers}\n"
        f"- Run type: {run_key} (abstract=~10 sents/paper, title=1 sent/paper)\n"
        f"- Embedding dimension: 384 (all-MiniLM-L6-v2)\n\n"
        f"Rules:\n"
        f"- umap_n_components: 5-50 (lower for small datasets, higher for large)\n"
        f"- umap_n_neighbors: 5-50 (lower=local structure, higher=global)\n"
        f"- hdbscan_min_cluster_size: 5-50 (min papers per cluster; bigger=fewer clusters)\n"
        f"- hdbscan_min_samples: 1-20 (lower=more clusters, higher=more robust)\n"
        f"- hdbscan_cluster_selection_epsilon: 0.0-0.5 (merge nearby clusters)\n\n"
        f"Return ONLY valid JSON with exactly these keys:\n"
        f"{{\"umap_n_components\": int, \"umap_n_neighbors\": int, "
        f"\"hdbscan_min_cluster_size\": int, \"hdbscan_min_samples\": int, "
        f"\"hdbscan_cluster_selection_epsilon\": float, "
        f"\"reasoning\": \"one sentence explaining choices\"}}"
    )
    try:
        # Client construction sits inside the try so that a configuration
        # problem (e.g. no API key) also falls back to the defaults.
        llm = ChatMistralAI(model="mistral-small-latest", temperature=0, timeout=120)
        raw = (llm | JsonOutputParser()).invoke(prompt)
        params = _sanitize_llm_params(raw, defaults)
        debug(f">>> LLM recommended params: {params}")
        return params
    except Exception as e:
        debug(f">>> LLM param recommendation failed ({e}), using defaults")
        return defaults
# ═══════════════════════════════════════════════
# HELPER: Bayesian optimization for HDBSCAN
# ═══════════════════════════════════════════════
def _bayesian_optimize_hdbscan(embeddings_2d: np.ndarray, llm_params: dict) -> dict:
    """Use Optuna to tune HDBSCAN min_cluster_size, min_samples and epsilon.

    Optimizes for silhouette score on the UMAP-reduced embeddings, using the
    LLM-recommended params as the first trial and as the centre of the search
    bounds.

    Args:
        embeddings_2d: UMAP-reduced embeddings, one row per sentence (despite
            the name, the number of columns is whatever UMAP was asked for).
        llm_params: Dict that may carry hdbscan_min_cluster_size,
            hdbscan_min_samples and hdbscan_cluster_selection_epsilon.

    Returns:
        Dict with min_cluster_size, min_samples, cluster_selection_epsilon,
        silhouette_score and n_trials. If anything fails (missing package,
        bad values, clustering error) the LLM values are returned with
        silhouette_score=None and n_trials=0.
    """
    debug(">>> Bayesian optimization (Optuna) for HDBSCAN...")
    n_trials = 30
    try:
        import optuna
        import hdbscan as hdbscan_lib
        from sklearn.metrics import silhouette_score

        optuna.logging.set_verbosity(optuna.logging.WARNING)

        llm_mcs = int(llm_params.get("hdbscan_min_cluster_size", 10))
        llm_ms = int(llm_params.get("hdbscan_min_samples", 5))
        llm_eps = float(llm_params.get("hdbscan_cluster_selection_epsilon", 0.1))

        # Search bounds. Each upper bound is forced to be >= its lower bound:
        # on a small dataset `n // 5` can drop below the lower bound, and
        # Optuna rejects an inverted range, which used to abort the whole
        # optimisation.
        mcs_low = max(2, llm_mcs // 2)
        mcs_high = max(mcs_low, min(embeddings_2d.shape[0] // 5, llm_mcs * 3))
        ms_low = max(1, llm_ms // 2)
        ms_high = max(ms_low, 2, llm_ms * 3)
        eps_low = max(0.0, llm_eps - 0.15)
        eps_high = max(eps_low, min(0.5, llm_eps + 0.25))

        def _clamp(value, low, high):
            return min(high, max(low, value))

        def objective(trial):
            mcs = trial.suggest_int("min_cluster_size", mcs_low, mcs_high)
            ms = trial.suggest_int("min_samples", ms_low, ms_high)
            eps = trial.suggest_float("cluster_selection_epsilon", eps_low, eps_high)
            clusterer = hdbscan_lib.HDBSCAN(
                min_cluster_size=mcs,
                min_samples=ms,
                cluster_selection_epsilon=eps,
                metric="euclidean",
                cluster_selection_method="eom",
            )
            labels = clusterer.fit_predict(embeddings_2d)
            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            n_noise = int((labels == -1).sum())
            noise_ratio = n_noise / len(labels)
            # Penalize: fewer than 3 clusters, too many outliers (>40%)
            if n_clusters < 3 or noise_ratio > 0.4:
                return -1.0
            try:
                # Only score non-noise points
                mask = labels != -1
                score = silhouette_score(embeddings_2d[mask], labels[mask],
                                         metric="euclidean")
                return float(score)
            except Exception:
                return -1.0

        # Seeded sampler so repeated runs agree, matching UMAP's random_state=42.
        study = optuna.create_study(
            direction="maximize",
            sampler=optuna.samplers.TPESampler(seed=42),
        )
        # Seed with LLM-recommended values first (clamped into the search range).
        study.enqueue_trial({
            "min_cluster_size": _clamp(llm_mcs, mcs_low, mcs_high),
            "min_samples": _clamp(llm_ms, ms_low, ms_high),
            "cluster_selection_epsilon": _clamp(llm_eps, eps_low, eps_high),
        })
        study.optimize(objective, n_trials=n_trials, show_progress_bar=False)

        best = study.best_params
        best_score = study.best_value
        debug(f">>> Best HDBSCAN params: {best}, silhouette={best_score:.4f}")
        return {**best, "silhouette_score": best_score, "n_trials": n_trials}
    except Exception as e:
        debug(f">>> Bayesian optimization failed ({e}), using LLM defaults")
        return {
            "min_cluster_size": llm_params.get("hdbscan_min_cluster_size", 10),
            "min_samples": llm_params.get("hdbscan_min_samples", 5),
            "cluster_selection_epsilon": llm_params.get("hdbscan_cluster_selection_epsilon", 0.1),
            "silhouette_score": None,
            "n_trials": 0,
        }


# ═══════════════════════════════════════════════
# HELPER: Shared JSON I/O, formatting and centroid evidence
# ═══════════════════════════════════════════════
def _read_json(path):
    """Load a JSON checkpoint, closing the file handle afterwards."""
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


def _write_json(path, payload, **dump_kwargs):
    """Write `payload` as JSON to `path`; extra kwargs go to `json.dump`."""
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, **dump_kwargs)


def _fmt_optional(value, spec):
    """Format a number with `spec`, or return "N/A" when it is None."""
    return format(value, spec) if value is not None else "N/A"


def _cluster_evidence(mask, embeddings, sent_df):
    """Collect centroid-nearest sentences and paper stats for a set of rows.

    Args:
        mask: Boolean array selecting the member sentences (positional, one
            entry per row of `embeddings` / `sent_df`).
        embeddings: Sentence embeddings, row-aligned with `sent_df`.
        sent_df: Sentence-level DataFrame with at least "text" and "_paper_id".

    Returns:
        (nearest_evidence, paper_count, paper_titles): up to NEAREST_K evidence
        dicts ordered by cosine distance to the member centroid, the number of
        distinct papers, and up to 50 of their titles. A selection with no
        members yields ([], 0, []).
    """
    member_idx = np.where(mask)[0]
    if member_idx.size == 0:
        return [], 0, []

    member_embs = embeddings[member_idx]
    centroid = member_embs.mean(axis=0)
    norms = np.linalg.norm(member_embs, axis=1) * np.linalg.norm(centroid)
    cosine_sim = (member_embs @ centroid) / (norms + 1e-10)
    nearest = np.argsort(1 - cosine_sim)[:NEAREST_K]

    members = sent_df.iloc[member_idx]
    nearest_evidence = []
    for i in nearest:
        row = members.iloc[i]
        nearest_evidence.append({
            "sentence": str(row["text"])[:250],
            "paper_id": int(row["_paper_id"]),
            "title": str(row.get("Title", ""))[:150],
            "keywords": str(row.get("Author Keywords", ""))[:150],
        })

    papers = members.drop_duplicates(subset=["_paper_id"])
    paper_titles = [
        str(papers.iloc[pos].get("Title", ""))[:200]
        for pos in range(min(50, len(papers)))
    ]
    return nearest_evidence, len(papers), paper_titles


# ═══════════════════════════════════════════════
# HELPER: 3 LLM calls + 1 orchestration per cluster
# ═══════════════════════════════════════════════
def _label_single_cluster(cluster_summary: dict, run_key: str) -> dict:
    """For one cluster: make 3 independent Mistral labeling calls, then 1
    orchestration/judge call to pick the final label.

    Args:
        cluster_summary: Summary dict with topic_id and, optionally, nearest,
            top_words, sentence_count and paper_count.
        run_key: Run type (currently not used in the prompts).

    Returns:
        The cluster dict enriched with the judge's fields (label, category,
        confidence, reasoning, ...) plus "label_proposals", the three proposals
        in temperature order (0.0, 0.3, 0.6). If the judge call fails, the
        first (temperature 0.0) proposal is used.
    """
    from langchain_mistralai import ChatMistralAI
    from langchain_core.output_parsers import JsonOutputParser

    cid = cluster_summary["topic_id"]
    sentences = cluster_summary.get("nearest", [])
    sent_block = "\n".join(
        f" {i+1}. \"{e['sentence'][:200]}\"\n Paper: {e['title']}\n Keywords: {e['keywords']}"
        for i, e in enumerate(sentences)
    )
    top_words = cluster_summary.get("top_words", "")
    n_sent = cluster_summary.get("sentence_count", 0)
    n_paper = cluster_summary.get("paper_count", 0)

    base_prompt = (
        f"You are a research topic classifier for academic papers on Technology and Tourism.\n\n"
        f"Cluster {cid} ({n_sent} sentences from {n_paper} papers):\n"
        f"Top BERTopic words: {top_words}\n\n"
        f"5 nearest centroid sentences:\n{sent_block}\n\n"
        f"Return JSON with:\n"
        f"- label: short specific name (3-6 words, NOT generic)\n"
        f"- category: plain research area (e.g. 'AI and machine learning', 'virtual reality', "
        f"'sustainability', 'consumer behavior', 'social media marketing')\n"
        f"- confidence: high / medium / low\n"
        f"- reasoning: 1 sentence why\n"
        f"Return ONLY valid JSON."
    )
    parser = JsonOutputParser()

    def _one_call(temperature):
        """One labeling attempt; never raises, returns a low-confidence stub on failure."""
        try:
            model = ChatMistralAI(model="mistral-small-latest",
                                  temperature=temperature, timeout=120)
            proposal = (model | parser).invoke(base_prompt)
            if not isinstance(proposal, dict):
                raise TypeError(f"expected a JSON object, got {type(proposal).__name__}")
            return proposal
        except Exception as ex:
            return {"label": f"Cluster {cid}", "category": "unknown",
                    "confidence": "low", "reasoning": str(ex)}

    # 3 independent calls with slight temperature variation for diversity.
    # Results are gathered in submission order (not completion order) so that
    # "Option 1/2/3" and the judge-failure fallback are deterministic.
    with ThreadPoolExecutor(max_workers=3) as pool:
        futures = [pool.submit(_one_call, t) for t in (0.0, 0.3, 0.6)]
        proposals = [f.result() for f in futures]
    debug(f">>> Cluster {cid}: got {len(proposals)} label proposals")

    # Orchestration / judge call: pick best label from 3 proposals
    judge_prompt = (
        f"You are an expert research label judge.\n\n"
        f"For cluster {cid} ({n_sent} sentences, {n_paper} papers on Tourism + Technology):\n"
        f"Top words: {top_words}\n\n"
        f"3 label proposals:\n"
        + "\n".join(f" Option {i+1}: {json.dumps(p)}" for i, p in enumerate(proposals))
        + f"\n\nPick the BEST label. It must be:\n"
        f"- Specific (not generic)\n"
        f"- Accurately describing the cluster content\n"
        f"- Clear for a research paper audience\n\n"
        f"Return JSON with the final chosen values:\n"
        f"{{\"label\": str, \"category\": str, \"confidence\": str, "
        f"\"reasoning\": str, \"chosen_option\": 1|2|3}}\n"
        f"Return ONLY valid JSON."
    )
    try:
        judge_llm = ChatMistralAI(model="mistral-small-latest", temperature=0, timeout=120)
        final = (judge_llm | parser).invoke(judge_prompt)
        if not isinstance(final, dict):
            raise TypeError(f"expected a JSON object, got {type(final).__name__}")
    except Exception as e:
        debug(f">>> Judge call failed for cluster {cid}: {e}, using option 1")
        final = proposals[0]

    debug(f">>> Cluster {cid} final label: {final.get('label', '?')}")
    return {**cluster_summary, **final, "label_proposals": proposals}


# ═══════════════════════════════════════════════
# HELPER: Abstract-title cosine similarity check
# ═══════════════════════════════════════════════
def _validate_cluster_coherence(
    cluster_summary: dict,
    sent_df: pd.DataFrame,
    topics_arr: np.ndarray,
    embeddings: np.ndarray,
) -> dict:
    """Compare a cluster's abstract-sentence centroid with its title-sentence centroid.

    Low cosine similarity between the two centroids suggests the cluster may be
    hallucinated or noisy.

    Args:
        cluster_summary: Cluster dict; must contain "topic_id".
        sent_df: Sentence-level DataFrame, row-aligned with `topics_arr` and
            `embeddings`; expected to carry a "source_type" column.
        topics_arr: Topic id per sentence.
        embeddings: Embedding per sentence.

    Returns:
        `cluster_summary` plus "coherence_score" (rounded float or None) and
        "coherence_warning" (str or None). The score is None when the inputs
        are misaligned, when "source_type" is missing, or when the cluster
        holds only one source type (expected for single-run data).
    """
    cid = cluster_summary["topic_id"]

    # The three inputs are matched row-by-row; bail out if they disagree.
    if len(sent_df) != len(topics_arr) or len(embeddings) != len(topics_arr):
        return {**cluster_summary, "coherence_score": None,
                "coherence_warning": "embedding alignment failed"}

    # We need to know which sentences came from abstract vs title;
    # sent_df has a 'source_type' column added during the pipeline.
    if "source_type" not in sent_df.columns:
        return {**cluster_summary, "coherence_score": None,
                "coherence_warning": "source_type column not available"}

    in_cluster = topics_arr == cid
    source = sent_df["source_type"].to_numpy()
    abstract_mask = in_cluster & (source == "abstract")
    title_mask = in_cluster & (source == "title")
    if not abstract_mask.any() or not title_mask.any():
        return {**cluster_summary, "coherence_score": None,
                "coherence_warning": "only one source type in cluster (expected for single-run)"}

    # Mean abstract centroid vs mean title centroid — cosine similarity
    abs_centroid = embeddings[abstract_mask].mean(axis=0)
    title_centroid = embeddings[title_mask].mean(axis=0)
    norm = np.linalg.norm(abs_centroid) * np.linalg.norm(title_centroid)
    score = float(np.dot(abs_centroid, title_centroid) / (norm + 1e-10))

    warning = None
    if score < 0.3:
        warning = f"LOW coherence ({score:.2f}) — abstract and title sentences diverge; cluster may be noisy"
    elif score < 0.5:
        warning = f"MODERATE coherence ({score:.2f}) — review cluster manually"

    debug(f">>> Cluster {cid} coherence score: {score:.3f}")
    return {**cluster_summary, "coherence_score": round(score, 4),
            "coherence_warning": warning}


# ═══════════════════════════════════════════════
# TOOL 2: Sentence-Level BERTopic Pipeline
# UMAP + HDBSCAN with LLM-guided params + Bayesian optimization
# ═══════════════════════════════════════════════
@tool
def run_bertopic_discovery(run_key: str, threshold: float = 0.7) -> str:
    """Sentence-level BERTopic: split papers → embed → LLM recommends params →
    Bayesian optimization → UMAP → HDBSCAN → centroid nearest 5 → Plotly charts.

    Args:
        run_key: One of 'abstract' or 'title'.
        threshold: Ignored (kept for API compatibility). Params now set by LLM + Bayesian optimization.

    Returns:
        Topic summary with sentence counts, paper counts, and 5 nearest centroid sentences."""
    debug(f"\n>>> TOOL: run_bertopic_discovery(run_key='{run_key}')")
    from bertopic import BERTopic
    from sentence_transformers import SentenceTransformer
    from umap import UMAP
    import hdbscan as hdbscan_lib
    from sklearn.preprocessing import FunctionTransformer

    df = _data["df"].copy()
    cols = RUN_CONFIGS[run_key]
    available = [c for c in cols if c in df.columns]
    debug(f">>> Columns: {available}")
    if not available:
        return f"[{run_key}] None of the required columns {cols} exist in the loaded CSV; nothing to cluster."

    # ── Step 1: Assemble text per paper ──
    df["_text"] = df[available].fillna("").agg(" ".join, axis=1)
    df["_paper_id"] = df.index

    # ── Step 2: Split into sentences ──
    debug(">>> Splitting into sentences...")
    df["_sentences"] = df["_text"].apply(_split_sentences)

    # ── Step 3: Explode to sentence-level DataFrame ──
    meta_cols = ["_paper_id", "Title", "Abstract", "Author Keywords", "_sentences"]
    available_meta = [c for c in meta_cols if c in df.columns]
    sent_df = (
        df[available_meta]
        .explode("_sentences")
        .rename(columns={"_sentences": "text"})
        .reset_index(drop=True)
    )
    # Papers with no usable sentence explode to a NaN row; drop those.
    sent_df = sent_df.dropna(subset=["text"]).reset_index(drop=True)
    # Tag source type for coherence validation
    sent_df["source_type"] = run_key

    # ── Step 3b: Filter boilerplate ──
    debug(">>> Filtering publisher boilerplate...")
    n_before = len(sent_df)
    clean_mask = ~sent_df["text"].str.contains(
        _BOILERPLATE_RE.pattern, case=False, regex=True, na=False)
    sent_df = sent_df[clean_mask].reset_index(drop=True)
    # Per-paper sentence counter, computed once on the final set of rows.
    sent_df["sent_id"] = sent_df.groupby("_paper_id").cumcount()
    debug(f">>> Filtered: {n_before} → {len(sent_df)} sentences")

    n_sentences = len(sent_df)
    n_papers = len(df)
    if n_sentences == 0:
        # Embedding / UMAP / HDBSCAN cannot run on an empty corpus.
        return (f"[{run_key}] No sentences of at least {MIN_SENT_LEN} characters remain "
                f"after splitting and boilerplate filtering; nothing to cluster.")

    # ── Step 4: Embed sentences (384d, L2-normalized) ──
    debug(">>> Embedding with all-MiniLM-L6-v2...")
    docs = sent_df["text"].tolist()
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = embedder.encode(docs, show_progress_bar=False, normalize_embeddings=True)
    debug(f">>> Embeddings: {embeddings.shape}")
    np.save(f"{CHECKPOINT_DIR}/rq4_{run_key}_emb.npy", embeddings)

    # ── Step 5: LLM recommends UMAP + HDBSCAN params ──
    debug(">>> Asking LLM for optimal clustering parameters...")
    llm_params = _llm_recommend_params(n_sentences, n_papers, run_key)
    debug(f">>> LLM params: {llm_params}")
    _write_json(f"{CHECKPOINT_DIR}/rq4_{run_key}_llm_params.json", llm_params, indent=2)

    # ── Step 6: UMAP dim reduction ──
    debug(f">>> UMAP: {embeddings.shape[1]}d → {llm_params['umap_n_components']}d ...")
    umap_model = UMAP(
        n_components=llm_params["umap_n_components"],
        n_neighbors=llm_params["umap_n_neighbors"],
        min_dist=0.0,
        metric="cosine",
        random_state=42,
    )
    embeddings_2d = umap_model.fit_transform(embeddings)
    debug(f">>> UMAP output shape: {embeddings_2d.shape}")

    # ── Step 7: Bayesian optimization for HDBSCAN params ──
    best_params = _bayesian_optimize_hdbscan(embeddings_2d, llm_params)
    debug(f">>> Optimized HDBSCAN params: {best_params}")
    _write_json(f"{CHECKPOINT_DIR}/rq4_{run_key}_hdbscan_params.json", best_params, indent=2)

    # ── Step 8: Fit BERTopic with UMAP + HDBSCAN ──
    debug(">>> Fitting BERTopic with UMAP + HDBSCAN...")
    cluster_model = hdbscan_lib.HDBSCAN(
        min_cluster_size=best_params["min_cluster_size"],
        min_samples=best_params["min_samples"],
        cluster_selection_epsilon=best_params["cluster_selection_epsilon"],
        metric="euclidean",
        cluster_selection_method="eom",
        prediction_data=True,
    )
    # Pass identity UMAP since we already reduced
    no_umap = FunctionTransformer()
    topic_model = BERTopic(
        hdbscan_model=cluster_model,
        umap_model=no_umap,
    )
    topics, probs = topic_model.fit_transform(docs, embeddings_2d)
    topics_arr = np.array(topics)

    n_topics = len(set(topics)) - int(-1 in topics)
    n_outliers = int(np.sum(topics_arr == -1))
    debug(f">>> {n_topics} topics, {n_outliers} outliers ({100*n_outliers/len(topics):.1f}%)")

    # Keep everything later tools need (labeling, consolidation) in memory.
    _data[f"{run_key}_model"] = topic_model
    _data[f"{run_key}_topics"] = topics_arr
    _data[f"{run_key}_embeddings"] = embeddings
    _data[f"{run_key}_embeddings_2d"] = embeddings_2d
    _data[f"{run_key}_sent_df"] = sent_df

    # ── Step 9: Plotly visualizations ──
    # Each chart is only produced when there are enough topics for it.
    debug(">>> Generating visualizations...")
    if n_topics >= 3:
        topic_model.visualize_topics().write_html(
            f"/tmp/rq4_{run_key}_intertopic.html", include_plotlyjs="cdn")
    if n_topics >= 1:
        topic_model.visualize_barchart(
            top_n_topics=min(10, max(1, n_topics))).write_html(
            f"/tmp/rq4_{run_key}_bars.html", include_plotlyjs="cdn")
    if n_topics >= 2:
        topic_model.visualize_hierarchy().write_html(
            f"/tmp/rq4_{run_key}_hierarchy.html", include_plotlyjs="cdn")
        topic_model.visualize_heatmap().write_html(
            f"/tmp/rq4_{run_key}_heatmap.html", include_plotlyjs="cdn")

    # ── Step 10: Centroid nearest 5 sentences per topic ──
    # Centroids are computed on the original embeddings, not the UMAP output.
    topic_info = topic_model.get_topic_info()
    summaries = []
    for row in topic_info.to_dict("records"):
        if row["Topic"] == -1:  # skip the outlier bucket
            continue
        nearest_evidence, unique_papers, paper_titles = _cluster_evidence(
            topics_arr == row["Topic"], embeddings, sent_df)
        summaries.append({
            "topic_id": int(row["Topic"]),
            "sentence_count": int(row["Count"]),
            "paper_count": int(unique_papers),
            "top_words": str(row.get("Name", ""))[:100],
            "nearest": nearest_evidence,
            "paper_titles": paper_titles,
        })

    _write_json(f"{CHECKPOINT_DIR}/rq4_{run_key}_summaries.json", summaries,
                indent=2, default=str)
    debug(f">>> {len(summaries)} topics saved")

    lines = [
        f" Topic {s['topic_id']} ({s['sentence_count']} sentences, {s['paper_count']} papers): {s['top_words']}"
        for s in summaries
    ]
    sil_str = _fmt_optional(best_params.get("silhouette_score"), ".3f")

    return (
        f"[{run_key}] {n_topics} topics from {n_sentences} sentences ({n_papers} papers, {n_outliers} outliers).\n\n"
        f"**Pipeline:**\n"
        f" Embeddings: all-MiniLM-L6-v2 (384d)\n"
        f" LLM-recommended params: n_components={llm_params['umap_n_components']}, "
        f"n_neighbors={llm_params['umap_n_neighbors']} | {llm_params.get('reasoning', '')}\n"
        f" Bayesian optimization ({best_params['n_trials']} trials): "
        f"min_cluster_size={best_params['min_cluster_size']}, "
        f"min_samples={best_params['min_samples']}, "
        f"epsilon={best_params['cluster_selection_epsilon']:.3f}, "
        f"silhouette={sil_str}\n\n"
        + "\n".join(lines)
        + f"\n\nVisualizations: /tmp/rq4_{run_key}_*.html"
        + f"\nCheckpoints: {CHECKPOINT_DIR}/rq4_{run_key}_emb.npy + summaries.json"
    )


# ═══════════════════════════════════════════════
# TOOL 3: Label Topics — 3 LLM calls + 1 judge per cluster
# ═══════════════════════════════════════════════
@tool
def label_topics_with_llm(run_key: str) -> str:
    """For each cluster: 3 independent Mistral labeling calls + 1 orchestration judge call.
    Also runs abstract-title cosine similarity coherence check per cluster.

    Args:
        run_key: One of 'abstract' or 'title'.

    Returns:
        Labeled topics with sentence-level evidence and coherence scores."""
    debug(f"\n>>> TOOL: label_topics_with_llm(run_key='{run_key}')")
    summaries = _read_json(f"{CHECKPOINT_DIR}/rq4_{run_key}_summaries.json")
    debug(f">>> Loaded {len(summaries)} topics")

    # Top 100 by size to avoid API rate limits
    MAX_LABEL_TOPICS = 100
    sorted_summaries = sorted(summaries, key=lambda s: s.get("sentence_count", 0),
                              reverse=True)
    summaries_to_label = sorted_summaries[:MAX_LABEL_TOPICS]
    skipped_summaries = sorted_summaries[MAX_LABEL_TOPICS:]
    debug(f">>> Labeling top {len(summaries_to_label)} (skipping {len(skipped_summaries)} tiny clusters)")

    # Run 3+1 LLM calls per cluster in parallel (thread pool, max 5 concurrent)
    debug(">>> Running 3-call + 1 judge labeling pipeline per cluster...")
    labeled = []
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_summary = {
            executor.submit(_label_single_cluster, s, run_key): s
            for s in summaries_to_label
        }
        for future in as_completed(future_to_summary):
            try:
                labeled.append(future.result())
            except Exception as e:
                # Keep the cluster in the output with a placeholder label.
                s = future_to_summary[future]
                debug(f">>> Labeling failed for cluster {s.get('topic_id', '?')}: {e}")
                labeled.append({**s, "label": f"Cluster {s.get('topic_id', '?')}",
                                "category": "unknown", "confidence": "low",
                                "reasoning": str(e)})

    # Sort back by topic_id
    labeled.sort(key=lambda x: x.get("topic_id", 0))

    # ── Abstract-title coherence check ──
    debug(">>> Running abstract-title coherence validation...")
    topics_arr = _data.get(f"{run_key}_topics")
    embeddings = _data.get(f"{run_key}_embeddings")
    sent_df = _data.get(f"{run_key}_sent_df")

    # Coherence check only meaningful for mixed-source runs
    # For single-source runs (abstract or title only) it gracefully returns None
    if topics_arr is not None and embeddings is not None and sent_df is not None:
        labeled = [
            _validate_cluster_coherence(entry, sent_df, topics_arr, embeddings)
            for entry in labeled
        ]

    # For small skipped clusters, copy without labeling
    labeled_all = labeled + [
        {**s,
         "label": s.get("top_words", f"Cluster {s['topic_id']}")[:50],
         "category": "unlabeled", "confidence": "low",
         "reasoning": "skipped (small cluster)",
         "coherence_score": None}
        for s in skipped_summaries
    ]
    labeled_all.sort(key=lambda x: x.get("topic_id", 0))

    _write_json(f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json", labeled_all,
                indent=2, default=str)
    debug(f">>> Labels saved: {CHECKPOINT_DIR}/rq4_{run_key}_labels.json")

    def _format_topic(entry):
        """Render one labeled topic with its evidence sentences."""
        warning = entry.get("coherence_warning")
        header = (
            f" **Topic {entry.get('topic_id', '?')}: {entry.get('label', '?')}** "
            f"[{entry.get('category', '?')}] conf={entry.get('confidence', '?')} "
            f"coherence={_fmt_optional(entry.get('coherence_score'), '.2f')}"
            + (f" ⚠️ {warning}" if warning else "")
            + f" ({entry.get('sentence_count', 0)} sent, {entry.get('paper_count', 0)} papers)\n"
        )
        evidence = "\n".join(
            f" → \"{e['sentence'][:120]}...\" — _{e['title'][:60]}_"
            for e in entry.get("nearest", [])
        )
        return header + evidence

    lines = [_format_topic(entry) for entry in labeled_all]
    return (f"[{run_key}] {len(labeled_all)} topics labeled (3 LLM calls + judge per cluster):\n\n"
            + "\n\n".join(lines))


# ═══════════════════════════════════════════════
# TOOL 4: Generate Comparison CSV
# ═══════════════════════════════════════════════
@tool
def generate_comparison_csv() -> str:
    """Compare Mistral-labeled topics across completed runs.

    Returns:
        Comparison table + CSV path."""
    debug("\n>>> TOOL: generate_comparison_csv()")
    # A run counts as completed once its labels checkpoint exists.
    completed = [
        k for k in RUN_CONFIGS
        if os.path.exists(f"{CHECKPOINT_DIR}/rq4_{k}_labels.json")
    ]

    def _load_run(run_key):
        """Flatten one run's labels checkpoint into CSV rows."""
        labels = _read_json(f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json")
        return [{
            "run": run_key,
            "topic_id": l.get("topic_id", ""),
            "label": l.get("label", ""),
            "category": l.get("category", ""),
            "confidence": l.get("confidence", ""),
            "niche": l.get("niche", ""),
            "sentences": l.get("sentence_count", 0),
            "papers": l.get("paper_count", 0),
            "coherence_score": l.get("coherence_score", ""),
            "coherence_warning": l.get("coherence_warning", ""),
            "top_words": l.get("top_words", ""),
        } for l in labels]

    all_rows = [row for run_key in completed for row in _load_run(run_key)]
    df = pd.DataFrame(all_rows)
    path = "/tmp/rq4_comparison.csv"
    df.to_csv(path, index=False)
    return f"Comparison saved: {path} ({len(completed)} runs, {len(df)} topics)\n\n{df.to_string(index=False)}"


# ═══════════════════════════════════════════════
# TOOL 5: Export 500-Word Narrative
# ═══════════════════════════════════════════════
@tool
def export_narrative(run_key: str) -> str:
    """Generate 500-word narrative for research paper Section 7 via Mistral.

    Args:
        run_key: One of 'abstract' or 'title'.

    Returns:
        500-word narrative + save path."""
    debug(f"\n>>> TOOL: export_narrative(run_key='{run_key}')")
    from langchain_mistralai import ChatMistralAI

    labels = _read_json(f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json")

    # Optimisation results are optional; without them the prompt shows N/A.
    hdbscan_params = {}
    hdbscan_path = f"{CHECKPOINT_DIR}/rq4_{run_key}_hdbscan_params.json"
    if os.path.exists(hdbscan_path):
        hdbscan_params.update(_read_json(hdbscan_path))

    topics_text = "\n".join(
        f"- {l.get('label', '?')} ({l.get('sentence_count', 0)} sentences from "
        f"{l.get('paper_count', 0)} papers, category: {l.get('category', '?')}, "
        f"confidence: {l.get('confidence', '?')}, "
        f"coherence: {_fmt_optional(l.get('coherence_score'), '.2f')})"
        for l in labels
    )
    sil_score_str = _fmt_optional(hdbscan_params.get("silhouette_score"), ".3f")

    # Report the size of the dataset actually loaded rather than a fixed
    # number; if no CSV is loaded in this process, make no size claim.
    loaded_df = _data.get("df")
    if loaded_df is not None:
        dataset_line = f"Dataset: {len(loaded_df)} Scopus papers on Tourism and AI.\n"
    else:
        dataset_line = "Dataset: Scopus papers on Tourism and AI.\n"

    # Describe the optimisation as it actually ran: n_trials is 0 in the
    # checkpoint when Optuna failed and the LLM values were used as-is.
    n_trials = hdbscan_params.get("n_trials", 30)
    if n_trials:
        tuning_intro = f"Clustering optimized via Bayesian optimization (Optuna, {n_trials} trials): "
    else:
        tuning_intro = "Clustering used LLM-recommended HDBSCAN parameters (Bayesian optimization did not run): "

    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.3, timeout=300)
    result = llm.invoke(
        f"Write exactly 500 words for a research paper Section 7 titled "
        f"'Topic Modeling Results — BERTopic Discovery'.\n\n"
        + dataset_line
        + f"Method: Sentence-level BERTopic — each abstract split into sentences, "
        f"embedded with all-MiniLM-L6-v2 (384d), UMAP dim reduction, HDBSCAN clustering.\n"
        + tuning_intro
        + f"min_cluster_size={hdbscan_params.get('min_cluster_size', 'N/A')}, "
        f"min_samples={hdbscan_params.get('min_samples', 'N/A')}, "
        f"silhouette={sil_score_str}.\n"
        f"Each cluster labeled via 3 independent LLM calls + 1 judge orchestration.\n"
        f"Coherence validated via abstract-title cosine similarity.\n"
        f"Run config: '{run_key}' columns.\n\n"
        f"Topics discovered:\n{topics_text}\n\n"
        f"Include: methodology justification, key themes, coherence findings, limitations, future work."
    )

    path = "/tmp/rq4_narrative.txt"
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(result.content)
    return f"Narrative saved: {path}\n\n{result.content}"


# ═══════════════════════════════════════════════
# TOOL 6: Consolidate Round 1 Topics into Themes
# ═══════════════════════════════════════════════
@tool
def consolidate_into_themes(run_key: str, theme_map: dict) -> str:
    """ROUND 2: Merge fine-grained Round 1 topics into broader themes.

    Args:
        run_key: 'abstract' or 'title'.
        theme_map: Dict mapping theme names to topic ID lists.
            Example: {"AI in Tourism": [0, 1, 5], "VR Tourism": [2, 3]}

    Returns:
        Consolidated themes with new 5-nearest sentence evidence per theme."""
    debug(f"\n>>> TOOL: consolidate_into_themes(run_key='{run_key}', {len(theme_map)} themes)")
    topics_arr = _data[f"{run_key}_topics"]
    embeddings = _data[f"{run_key}_embeddings"]
    sent_df = _data[f"{run_key}_sent_df"]

    themes = []
    for theme_id, (theme_name, topic_ids) in enumerate(theme_map.items()):
        # Ids may arrive as strings (e.g. from JSON); normalise to int so they
        # can match the integer topic array.
        topic_ids = [int(t) for t in topic_ids]
        mask = np.isin(topics_arr, topic_ids)
        nearest_evidence, unique_papers, paper_titles = _cluster_evidence(
            mask, embeddings, sent_df)
        themes.append({
            "label": theme_name,
            "merged_topics": topic_ids,
            "sentence_count": int(mask.sum()),
            "paper_count": int(unique_papers),
            "nearest": nearest_evidence,
            "paper_titles": paper_titles,
            "topic_id": theme_id,  # position of the theme in theme_map
        })

    _write_json(f"{CHECKPOINT_DIR}/rq4_{run_key}_themes.json", themes,
                indent=2, default=str)

    lines = [
        (f" **{t['label']}** ({t['sentence_count']} sentences, {t['paper_count']} papers)\n"
         f" Merged from topics: {t['merged_topics']}\n"
         f" Evidence:\n"
         + "\n".join(
             f" → \"{e['sentence'][:120]}...\" — _{e['title'][:60]}_"
             for e in t["nearest"]))
        for t in themes
    ]
    return f"[{run_key}] Round 2: {len(themes)} themes consolidated:\n\n" + "\n\n".join(lines)


# ═══════════════════════════════════════════════
# TOOL 7: Compare Themes with PAJAIS Taxonomy
# ═══════════════════════════════════════════════
PAJAIS_TAXONOMY = [
    "Electronic and Mobile Business / Social Commerce",
    "Human Behavior and IS / Human-Computer Interaction",
    "IS/IT Strategy, Leadership, Governance",
    "Business Intelligence and Data Analytics",
    "Design Science and IS",
    "Enterprise Systems and BPM",
    "IS Implementation, Adoption, and Diffusion",
    "Social Media and Business Impact",
    "Cultural and Global Issues in IS",
    "IS Security and Privacy",
    "IS Smart / IoT",
    "Knowledge Management",
    "ICT / Digital Platform / IT and Work",
    "IS Healthcare",
    "IT Project Management",
    "Service Science and IS",
    "Social and Organizational Aspects of IS",
    "Research Methods and Philosophy",
    "E-Finance / Economics of IS",
    "E-Government",
    "IS Education and Learning",
    "Green IT and Sustainability",
]


@tool
def compare_with_taxonomy(run_key: str) -> str:
    """Compare BERTopic themes against PAJAIS/PACIS taxonomy (Jiang et al., 2019).

    Args:
        run_key: 'abstract' or 'title'.

    Returns:
        Mapping table: BERTopic theme → PAJAIS category (or NOVEL)."""
    debug(f"\n>>> TOOL: compare_with_taxonomy(run_key='{run_key}')")
    from langchain_mistralai import ChatMistralAI
    from langchain_core.prompts import PromptTemplate
    from langchain_core.output_parsers import JsonOutputParser

    # Prefer Round 2 themes; fall back to Round 1 labels if none were built.
    themes_path = f"{CHECKPOINT_DIR}/rq4_{run_key}_themes.json"
    labels_path = f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json"
    source_path = themes_path if os.path.exists(themes_path) else labels_path
    themes = _read_json(source_path)

    themes_text = "\n".join(
        f"- {t.get('label', '?')} ({t.get('paper_count', t.get('count', '?'))} papers)"
        for t in themes)
    taxonomy_text = "\n".join(f"- {c}" for c in PAJAIS_TAXONOMY)

    prompt = PromptTemplate.from_template(
        "You are an IS research taxonomy expert.\n\n"
        "Compare each BERTopic theme against the established PAJAIS/PACIS taxonomy.\n"
        "For EACH theme, return a JSON ARRAY with:\n"
        "- label: the BERTopic theme name\n"
        "- pajais_match: closest PAJAIS category (or 'NOVEL' if no match)\n"
        "- match_confidence: high, medium, low, or none\n"
        "- reasoning: why this mapping (1 sentence)\n"
        "- is_novel: true if this theme represents an emerging area not in the taxonomy\n\n"
        "Return ONLY valid JSON array.\n\n"
        "BERTopic Themes:\n{themes}\n\n"
        "PAJAIS Taxonomy (Jiang et al., 2019):\n{taxonomy}")

    llm = ChatMistralAI(model="mistral-small-latest", temperature=0, timeout=300)
    chain = prompt | llm | JsonOutputParser()
    mappings = chain.invoke({"themes": themes_text, "taxonomy": taxonomy_text})

    # The model is asked for an array but may wrap it in an object or return a
    # single object; normalise to a list of dicts before calling .get on items.
    if isinstance(mappings, dict):
        nested = next((v for v in mappings.values() if isinstance(v, list)), None)
        mappings = nested if nested is not None else [mappings]
    mappings = [m for m in mappings if isinstance(m, dict)]

    _write_json(f"{CHECKPOINT_DIR}/rq4_{run_key}_taxonomy_map.json", mappings,
                indent=2, default=str)

    novel = [m for m in mappings if m.get("is_novel", False)]
    mapped = [m for m in mappings if not m.get("is_novel", False)]

    mapped_lines = [
        f" ✅ {m.get('label', '?')} → **{m.get('pajais_match', '?')}** "
        f"(conf={m.get('match_confidence', '?')}) _{m.get('reasoning', '')}_"
        for m in mapped
    ]
    novel_lines = [
        f" 🆕 **{m.get('label', '?')}** → NOVEL _{m.get('reasoning', '')}_"
        for m in novel
    ]

    return (f"[{run_key}] Taxonomy comparison (Jiang et al., 2019):\n\n"
            f"**Mapped to PAJAIS categories ({len(mapped)}):**\n"
            + "\n".join(mapped_lines)
            + f"\n\n**NOVEL / Emerging themes ({len(novel)}):**\n"
            + "\n".join(novel_lines)
            + f"\n\nSaved: {CHECKPOINT_DIR}/rq4_{run_key}_taxonomy_map.json")


# ═══════════════════════════════════════════════
# GET ALL TOOLS
# ═══════════════════════════════════════════════
def get_all_tools():
    """Return every tool defined in this module, with `handle_tool_error` enabled."""
    tools = [load_scopus_csv, run_bertopic_discovery, label_topics_with_llm,
             consolidate_into_themes, compare_with_taxonomy,
             generate_comparison_csv, export_narrative]
    for t in tools:
        t.handle_tool_error = True
    debug(f">>> tools.py: {len(tools)} tools ready")
    return tools