topic_modelling

Sleeping

Daksh C Jain

Upgrade pipeline: UMAP+HDBSCAN, Bayesian opt, 3+1 LLM labeling, coherence check

f38b6de 13 days ago

43.1 kB

	"""tools.py — Sentence-level BERTopic pipeline + Mistral LLM. Version 4.0.0 | May 2026.

	PIPELINE:
	Paper → split sentences → embed (all-MiniLM-L6-v2, 384d)
	→ LLM recommends UMAP + HDBSCAN params based on dataset stats
	→ Bayesian optimization (Optuna) tunes min_cluster_size + min_samples
	→ UMAP dim reduction → HDBSCAN clustering
	→ 3 LLM labeling calls per cluster + 1 orchestration judge call
	→ Abstract-title cosine similarity validation (hallucination check)
	"""
	from langchain_core.tools import tool
	import os
	import json
	import re
	import numpy as np
	import pandas as pd
	from concurrent.futures import ThreadPoolExecutor, as_completed

	# Debug switch: when True, `debug` is `print`; when False it is a silent no-op.
	DEBUG = True
	# The no-op accepts any positional/keyword arguments so it stays call-compatible
	# with `print` (e.g. `debug()` or `debug(x, end="")`) when DEBUG is turned off.
	debug = print if DEBUG else (lambda *args, **kwargs: None)

	# Directory for intermediate artifacts written by the tools; created at import time.
	CHECKPOINT_DIR = "/tmp/checkpoints"
	os.makedirs(CHECKPOINT_DIR, exist_ok=True)

	# Number of centroid-nearest sentences kept as evidence per topic/theme.
	NEAREST_K = 5
	# Split point: whitespace preceded by ., ! or ? and followed by an uppercase A-Z letter.
	SENT_SPLIT_RE = r'(?<=[.!?])\s+(?=[A-Z])'
	# Sentences whose stripped length is below this are discarded.
	MIN_SENT_LEN = 30

	# Maps a run key to the CSV columns whose text is used for that run.
	RUN_CONFIGS = {
	    "abstract": ["Abstract"],
	    "title": ["Title"],
	}

	# Module-level store shared by the tools (loaded DataFrame, fitted models, arrays).
	_data = {}


	# ═══════════════════════════════════════════════
	# HELPER: Split text into sentences
	# ═══════════════════════════════════════════════
	def _split_sentences(text):
	    """Split *text* into sentence strings, discarding short fragments.

	    The input is coerced with ``str`` and cut with ``SENT_SPLIT_RE``. A piece is
	    kept only when its stripped length is at least ``MIN_SENT_LEN``; kept pieces
	    are returned exactly as produced by the split (they are not stripped).
	    """
	    pieces = re.split(SENT_SPLIT_RE, str(text))
	    return [piece for piece in pieces if len(piece.strip()) >= MIN_SENT_LEN]


	# ═══════════════════════════════════════════════
	# HELPER: Filter publisher boilerplate
	# ═══════════════════════════════════════════════
	# Case-insensitive pattern for publisher/licence boilerplate and generic
	# "limitations / implications" phrases. The alternatives are joined with a bare
	# "|" so the result is a regex alternation; an escaped "\|" would instead require
	# a literal pipe character between the phrases and never match real text.
	_BOILERPLATE_RE = re.compile("|".join([
	    r"Licensee MDPI", r"Published by Informa", r"Published by Elsevier",
	    r"Taylor & Francis", r"Copyright ©", r"Creative Commons",
	    r"open access article", r"Inderscience Enterprises", r"All rights reserved",
	    r"This is an open access", r"distributed under the terms", r"The Author\(s\)",
	    r"Springer Nature", r"Emerald Publishing", r"limitations and future",
	    r"limitations and implications", r"limitations are discussed",
	    r"limitations have been discussed", r"implications are discussed",
	    r"implications were discussed", r"implications are presented",
	    r"concludes with .* implications",
	]), re.IGNORECASE)


	# ═══════════════════════════════════════════════
	# TOOL 1: Load Scopus CSV
	# ═══════════════════════════════════════════════
	@tool
	def load_scopus_csv(filepath: str) -> str:
	    """Load a Scopus CSV export and show preview. Call this first.

	    Args:
	        filepath: Path to the uploaded .csv file.

	    Returns:
	        Row count, column names, and sample data."""
	    debug(f"\n>>> TOOL: load_scopus_csv(filepath='{filepath}')")
	    df = pd.read_csv(filepath, encoding="utf-8-sig")
	    # Keep the frame in the module-level store so later tools can read it.
	    _data["df"] = df
	    n_rows = len(df)
	    debug(f">>> Loaded {n_rows} rows, {len(df.columns)} columns")

	    # Only preview the columns that actually exist in this export.
	    target_cols = [c for c in ("Title", "Abstract", "Author Keywords") if c in df.columns]
	    sample = df[target_cols].head(3).to_string(max_colwidth=80)
	    null_counts = ", ".join(
	        f"{c}: {df[c].notna().sum()}/{n_rows}" for c in target_cols)

	    # Sentence estimate is extrapolated from the first five abstracts. Guard
	    # against a missing "Abstract" column (KeyError) and an empty frame
	    # (mean of nothing is NaN, which int() cannot convert).
	    avg_abstract_sents = 0.0
	    if "Abstract" in df.columns and n_rows:
	        sample_sents = df["Abstract"].head(5).apply(_split_sentences).apply(len)
	        avg_abstract_sents = float(sample_sents.mean())
	    est_abstract = int(avg_abstract_sents * n_rows)
	    # A missing "Title" column is reported as zero titles instead of raising.
	    title_count = int(df["Title"].notna().sum()) if "Title" in df.columns else 0

	    return (f"📊 Dataset Statistics:\n"
	            f"- Papers: {n_rows}\n"
	            f"- Abstract sentences: ~{est_abstract} (~{avg_abstract_sents:.0f} per paper)\n"
	            f"- Title sentences: {title_count} (1 per paper)\n"
	            f"- Non-null: {null_counts}\n\n"
	            f"Columns: {', '.join(list(df.columns)[:15])}\n\n"
	            f"Sample:\n{sample}")


	# ═══════════════════════════════════════════════
	# HELPER: LLM recommends UMAP + HDBSCAN params
	# ═══════════════════════════════════════════════
	def _llm_recommend_params(n_sentences: int, n_papers: int, run_key: str) -> dict:
	    """Ask Mistral for UMAP + HDBSCAN parameters based on dataset characteristics.

	    Args:
	        n_sentences: Number of sentences that will be clustered.
	        n_papers: Number of papers in the dataset.
	        run_key: Run type inserted into the prompt ('abstract' or 'title').

	    Returns:
	        A dict that always contains umap_n_components, umap_n_neighbors,
	        hdbscan_min_cluster_size, hdbscan_min_samples,
	        hdbscan_cluster_selection_epsilon and reasoning. If the LLM call fails,
	        the built-in defaults are returned; if an individual key is missing or
	        cannot be converted to the expected numeric type, that key falls back
	        to its default.
	    """
	    debug(f">>> LLM param recommendation: {n_sentences} sentences, {n_papers} papers, run={run_key}")
	    from langchain_mistralai import ChatMistralAI
	    from langchain_core.output_parsers import JsonOutputParser

	    # Fallback values; the type of each default also defines the expected type.
	    defaults = {
	        "umap_n_components": 10,
	        "umap_n_neighbors": 15,
	        "hdbscan_min_cluster_size": max(5, n_sentences // 100),
	        "hdbscan_min_samples": 5,
	        "hdbscan_cluster_selection_epsilon": 0.1,
	    }

	    prompt = (
	        f"You are an NLP clustering expert. Given this academic paper dataset, "
	        f"recommend optimal UMAP and HDBSCAN parameters.\n\n"
	        f"Dataset:\n"
	        f"- Total sentences to cluster: {n_sentences}\n"
	        f"- Total papers: {n_papers}\n"
	        f"- Run type: {run_key} (abstract=~10 sents/paper, title=1 sent/paper)\n"
	        f"- Embedding dimension: 384 (all-MiniLM-L6-v2)\n\n"
	        f"Rules:\n"
	        f"- umap_n_components: 5-50 (lower for small datasets, higher for large)\n"
	        f"- umap_n_neighbors: 5-50 (lower=local structure, higher=global)\n"
	        f"- hdbscan_min_cluster_size: 5-50 (min papers per cluster; bigger=fewer clusters)\n"
	        f"- hdbscan_min_samples: 1-20 (lower=more clusters, higher=more robust)\n"
	        f"- hdbscan_cluster_selection_epsilon: 0.0-0.5 (merge nearby clusters)\n\n"
	        f"Return ONLY valid JSON with exactly these keys:\n"
	        f"{{\"umap_n_components\": int, \"umap_n_neighbors\": int, "
	        f"\"hdbscan_min_cluster_size\": int, \"hdbscan_min_samples\": int, "
	        f"\"hdbscan_cluster_selection_epsilon\": float, "
	        f"\"reasoning\": \"one sentence explaining choices\"}}"
	    )
	    try:
	        # Client construction is inside the try so that a configuration error
	        # (e.g. missing credentials) also leads to the defaults.
	        llm = ChatMistralAI(model="mistral-small-latest", temperature=0, timeout=120)
	        raw = (llm | JsonOutputParser()).invoke(prompt)
	        if not isinstance(raw, dict):
	            raise ValueError(f"expected a JSON object, got {type(raw).__name__}")
	    except Exception as e:
	        debug(f">>> LLM param recommendation failed ({e}), using defaults")
	        return {**defaults, "reasoning": "defaults used (LLM call failed)"}

	    # Callers index these keys directly, so make sure each one exists and has
	    # the numeric type of its default (int or float).
	    params = dict(raw)
	    for key, fallback in defaults.items():
	        try:
	            params[key] = type(fallback)(raw[key])
	        except (KeyError, TypeError, ValueError):
	            debug(f">>> LLM param '{key}' missing or invalid, using default {fallback}")
	            params[key] = fallback
	    params.setdefault("reasoning", "")
	    debug(f">>> LLM recommended params: {params}")
	    return params


	# ═══════════════════════════════════════════════
	# HELPER: Bayesian optimization for HDBSCAN
	# ═══════════════════════════════════════════════
	def _bayesian_optimize_hdbscan(embeddings_2d: np.ndarray, llm_params: dict) -> dict:
	    """Tune HDBSCAN min_cluster_size, min_samples and epsilon with Optuna.

	    The objective is the silhouette score of the non-noise points of the
	    reduced embeddings; a trial scores -1.0 when it yields fewer than three
	    clusters or more than 40% noise. The LLM-recommended values centre the
	    search ranges and seed the first trial.

	    Args:
	        embeddings_2d: Dimension-reduced embeddings, one row per sentence.
	        llm_params: Dict with the hdbscan_* recommendations (defaults are
	            used for missing keys).

	    Returns:
	        Dict with min_cluster_size, min_samples, cluster_selection_epsilon,
	        silhouette_score and n_trials. If anything in the optimization raises,
	        the LLM values are returned with silhouette_score None and n_trials 0.
	    """
	    debug(">>> Bayesian optimization (Optuna) for HDBSCAN...")
	    n_trials = 30
	    try:
	        import optuna
	        import hdbscan as hdbscan_lib
	        from sklearn.metrics import silhouette_score

	        optuna.logging.set_verbosity(optuna.logging.WARNING)

	        llm_mcs = int(llm_params.get("hdbscan_min_cluster_size", 10))
	        llm_ms = int(llm_params.get("hdbscan_min_samples", 5))
	        llm_eps = float(llm_params.get("hdbscan_cluster_selection_epsilon", 0.1))

	        # Compute the search bounds once. Each upper bound is forced to be at
	        # least the lower bound: with few points (n // 5 small) or a large
	        # epsilon the raw bounds invert, and Optuna rejects low > high.
	        n_points = embeddings_2d.shape[0]
	        mcs_low = max(2, llm_mcs // 2)
	        mcs_high = max(mcs_low, min(n_points // 5, llm_mcs * 3))
	        ms_low = max(1, llm_ms // 2)
	        ms_high = max(ms_low, 2, llm_ms * 3)
	        eps_low = max(0.0, llm_eps - 0.15)
	        eps_high = max(eps_low, min(0.5, llm_eps + 0.25))

	        def objective(trial):
	            mcs = trial.suggest_int("min_cluster_size", mcs_low, mcs_high)
	            ms = trial.suggest_int("min_samples", ms_low, ms_high)
	            eps = trial.suggest_float("cluster_selection_epsilon", eps_low, eps_high)
	            clusterer = hdbscan_lib.HDBSCAN(
	                min_cluster_size=mcs,
	                min_samples=ms,
	                cluster_selection_epsilon=eps,
	                metric="euclidean",
	                cluster_selection_method="eom",
	            )
	            labels = clusterer.fit_predict(embeddings_2d)
	            noise = labels == -1
	            n_clusters = len(set(labels)) - (1 if noise.any() else 0)
	            noise_ratio = int(noise.sum()) / len(labels)
	            # Penalize: fewer than 3 clusters, too many outliers (>40%)
	            if n_clusters < 3 or noise_ratio > 0.4:
	                return -1.0
	            try:
	                # Only score non-noise points
	                keep = ~noise
	                return float(silhouette_score(
	                    embeddings_2d[keep], labels[keep], metric="euclidean"))
	            except Exception:
	                return -1.0

	        study = optuna.create_study(direction="maximize")
	        # Seed with the LLM-recommended values, clamped into the search ranges.
	        study.enqueue_trial({
	            "min_cluster_size": min(max(llm_mcs, mcs_low), mcs_high),
	            "min_samples": min(max(llm_ms, ms_low), ms_high),
	            "cluster_selection_epsilon": min(max(llm_eps, eps_low), eps_high),
	        })
	        study.optimize(objective, n_trials=n_trials, show_progress_bar=False)

	        best = study.best_params
	        # NOTE: a best value of -1.0 means every trial hit the penalty branch.
	        best_score = study.best_value
	        debug(f">>> Best HDBSCAN params: {best}, silhouette={best_score:.4f}")
	        return {**best, "silhouette_score": best_score, "n_trials": n_trials}

	    except Exception as e:
	        debug(f">>> Bayesian optimization failed ({e}), using LLM defaults")
	        return {
	            "min_cluster_size": llm_params.get("hdbscan_min_cluster_size", 10),
	            "min_samples": llm_params.get("hdbscan_min_samples", 5),
	            "cluster_selection_epsilon": llm_params.get("hdbscan_cluster_selection_epsilon", 0.1),
	            "silhouette_score": None,
	            "n_trials": 0,
	        }


	# ═══════════════════════════════════════════════
	# HELPER: 3 LLM calls + 1 orchestration per cluster
	# ═══════════════════════════════════════════════
	def _label_single_cluster(cluster_summary: dict, run_key: str) -> dict:
	    """Label one cluster with three Mistral proposals plus one judge call.

	    Args:
	        cluster_summary: Cluster dict; reads topic_id (required) and nearest,
	            top_words, sentence_count, paper_count (optional).
	        run_key: Accepted for signature compatibility; not used in the body.

	    Returns:
	        A new dict: the cluster summary merged with the final label fields
	        (label, category, confidence, reasoning, ...) and a "label_proposals"
	        list holding the three proposals in temperature order (0.0, 0.3, 0.6).
	    """
	    from langchain_mistralai import ChatMistralAI
	    from langchain_core.output_parsers import JsonOutputParser

	    cid = cluster_summary["topic_id"]
	    sentences = cluster_summary.get("nearest", [])
	    sent_block = "\n".join(
	        f" {i+1}. \"{e['sentence'][:200]}\"\n Paper: {e['title']}\n Keywords: {e['keywords']}"
	        for i, e in enumerate(sentences)
	    )
	    top_words = cluster_summary.get("top_words", "")
	    n_sent = cluster_summary.get("sentence_count", 0)
	    n_paper = cluster_summary.get("paper_count", 0)

	    base_prompt = (
	        f"You are a research topic classifier for academic papers on Technology and Tourism.\n\n"
	        f"Cluster {cid} ({n_sent} sentences from {n_paper} papers):\n"
	        f"Top BERTopic words: {top_words}\n\n"
	        f"5 nearest centroid sentences:\n{sent_block}\n\n"
	        f"Return JSON with:\n"
	        f"- label: short specific name (3-6 words, NOT generic)\n"
	        f"- category: plain research area (e.g. 'AI and machine learning', 'virtual reality', "
	        f"'sustainability', 'consumer behavior', 'social media marketing')\n"
	        f"- confidence: high / medium / low\n"
	        f"- reasoning: 1 sentence why\n"
	        f"Return ONLY valid JSON."
	    )

	    parser = JsonOutputParser()

	    def _placeholder(reason):
	        # Stand-in proposal used when a labeling call fails.
	        return {"label": f"Cluster {cid}", "category": "unknown",
	                "confidence": "low", "reasoning": reason}

	    def _one_call(temperature):
	        # One labeling attempt; never raises, so one bad call cannot lose the others.
	        try:
	            m = ChatMistralAI(model="mistral-small-latest", temperature=temperature, timeout=120)
	            out = (m | parser).invoke(base_prompt)
	        except Exception as ex:
	            return _placeholder(str(ex))
	        return out if isinstance(out, dict) else _placeholder("LLM returned non-object JSON")

	    # 3 independent calls with slight temperature variation for diversity.
	    # map() yields results in submission order, so "Option 1" is always the
	    # temperature-0.0 proposal (completion order would make it arbitrary).
	    with ThreadPoolExecutor(max_workers=3) as pool:
	        proposals = list(pool.map(_one_call, [0.0, 0.3, 0.6]))

	    debug(f">>> Cluster {cid}: got {len(proposals)} label proposals")

	    # Orchestration / judge call: pick best label from 3 proposals
	    judge_prompt = (
	        f"You are an expert research label judge.\n\n"
	        f"For cluster {cid} ({n_sent} sentences, {n_paper} papers on Tourism + Technology):\n"
	        f"Top words: {top_words}\n\n"
	        f"3 label proposals:\n"
	        + "\n".join(f" Option {i+1}: {json.dumps(p)}" for i, p in enumerate(proposals))
	        + f"\n\nPick the BEST label. It must be:\n"
	        f"- Specific (not generic)\n"
	        f"- Accurately describing the cluster content\n"
	        f"- Clear for a research paper audience\n\n"
	        f"Return JSON with the final chosen values:\n"
	        f"{{\"label\": str, \"category\": str, \"confidence\": str, "
	        f"\"reasoning\": str, \"chosen_option\": 1|2|3}}\n"
	        f"Return ONLY valid JSON."
	    )
	    try:
	        judge_llm = ChatMistralAI(model="mistral-small-latest", temperature=0, timeout=120)
	        final = (judge_llm | parser).invoke(judge_prompt)
	        if not isinstance(final, dict):
	            raise ValueError("judge returned non-object JSON")
	    except Exception as e:
	        debug(f">>> Judge call failed for cluster {cid}: {e}, using option 1")
	        final = proposals[0]

	    debug(f">>> Cluster {cid} final label: {final.get('label', '?')}")
	    # Merge summary + chosen label fields; later keys override earlier ones.
	    return {**cluster_summary, **final, "label_proposals": proposals}


	# ═══════════════════════════════════════════════
	# HELPER: Abstract-title cosine similarity check
	# ═══════════════════════════════════════════════
	def _validate_cluster_coherence(
	    cluster_summary: dict,
	    sent_df: pd.DataFrame,
	    topics_arr: np.ndarray,
	    embeddings: np.ndarray,
	) -> dict:
	    """Score how well a cluster's abstract and title sentences agree.

	    The score is the cosine similarity between the mean embedding of the
	    cluster's rows tagged source_type == "abstract" and the mean embedding of
	    its rows tagged "title". Rows of sent_df, topics_arr and embeddings are
	    matched by position.

	    Args:
	        cluster_summary: Cluster dict; only "topic_id" is read.
	        sent_df: Sentence-level frame, optionally with a "source_type" column.
	        topics_arr: Topic id per sentence, same length as sent_df.
	        embeddings: Embedding per sentence, same row order as sent_df.

	    Returns:
	        A copy of cluster_summary with "coherence_score" (float rounded to 4
	        places, or None when it cannot be computed) and "coherence_warning"
	        (None, an explanation, or a LOW (<0.3) / MODERATE (<0.5) message).
	    """
	    cid = cluster_summary["topic_id"]

	    if "source_type" not in sent_df.columns:
	        return {**cluster_summary, "coherence_score": None,
	                "coherence_warning": "source_type column not available"}

	    # Positional boolean masks: no dependence on sent_df's index labels, so a
	    # non-unique or non-range index cannot misalign rows and embeddings.
	    in_cluster = np.asarray(topics_arr) == cid
	    source = sent_df["source_type"].to_numpy()
	    abstract_rows = in_cluster & (source == "abstract")
	    title_rows = in_cluster & (source == "title")

	    if not abstract_rows.any() or not title_rows.any():
	        return {**cluster_summary, "coherence_score": None,
	                "coherence_warning": "only one source type in cluster (expected for single-run)"}

	    # Mean abstract centroid vs mean title centroid — cosine similarity
	    abs_centroid = embeddings[abstract_rows].mean(axis=0)
	    title_centroid = embeddings[title_rows].mean(axis=0)
	    norm = np.linalg.norm(abs_centroid) * np.linalg.norm(title_centroid)
	    # Small epsilon avoids division by zero for all-zero centroids.
	    score = float(np.dot(abs_centroid, title_centroid) / (norm + 1e-10))

	    warning = None
	    if score < 0.3:
	        warning = f"LOW coherence ({score:.2f}) — abstract and title sentences diverge; cluster may be noisy"
	    elif score < 0.5:
	        warning = f"MODERATE coherence ({score:.2f}) — review cluster manually"

	    debug(f">>> Cluster {cid} coherence score: {score:.3f}")
	    return {**cluster_summary, "coherence_score": round(score, 4),
	            "coherence_warning": warning}


	# ═══════════════════════════════════════════════
	# TOOL 2: Sentence-Level BERTopic Pipeline
	# UMAP + HDBSCAN with LLM-guided params + Bayesian optimization
	# ═══════════════════════════════════════════════
	@tool
	def run_bertopic_discovery(run_key: str, threshold: float = 0.7) -> str:
	    """Sentence-level BERTopic: split papers → embed → LLM recommends params
	    → Bayesian optimization → UMAP → HDBSCAN → centroid nearest 5 → Plotly charts.

	    Args:
	        run_key: One of 'abstract' or 'title'.
	        threshold: Ignored (kept for API compatibility). Params now set by LLM + Bayesian optimization.

	    Returns:
	        Topic summary with sentence counts, paper counts, and 5 nearest centroid sentences."""
	    debug(f"\n>>> TOOL: run_bertopic_discovery(run_key='{run_key}')")
	    from bertopic import BERTopic
	    from sentence_transformers import SentenceTransformer
	    from umap import UMAP
	    import hdbscan as hdbscan_lib
	    from sklearn.preprocessing import FunctionTransformer

	    def _save_json(suffix, payload, **dump_kwargs):
	        # Write one checkpoint; the context manager closes (and flushes) the file.
	        with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_{suffix}.json", "w") as fh:
	            json.dump(payload, fh, indent=2, **dump_kwargs)

	    df = _data["df"].copy()
	    cols = RUN_CONFIGS[run_key]
	    available = [c for c in cols if c in df.columns]
	    debug(f">>> Columns: {available}")
	    if not available:
	        return (f"[{run_key}] Cannot run: none of the expected columns {cols} "
	                f"are present in the loaded CSV.")

	    # ── Step 1: Assemble text per paper ──
	    # astype(str) keeps the join from failing on non-string cells.
	    df["_text"] = df[available].fillna("").astype(str).agg(" ".join, axis=1)
	    df["_paper_id"] = df.index

	    # ── Step 2: Split into sentences ──
	    debug(">>> Splitting into sentences...")
	    df["_sentences"] = df["_text"].apply(_split_sentences)

	    # ── Step 3: Explode to sentence-level DataFrame ──
	    meta_cols = ["_paper_id", "Title", "Abstract", "Author Keywords", "_sentences"]
	    available_meta = [c for c in meta_cols if c in df.columns]
	    sent_df = (
	        df[available_meta]
	        .explode("_sentences")
	        .rename(columns={"_sentences": "text"})
	        .dropna(subset=["text"])  # papers with no usable sentence explode to NaN
	        .reset_index(drop=True)
	    )
	    # Tag source type for coherence validation
	    sent_df["source_type"] = run_key

	    # ── Step 3b: Filter boilerplate ──
	    debug(">>> Filtering publisher boilerplate...")
	    n_before = len(sent_df)
	    clean_mask = ~sent_df["text"].str.contains(_BOILERPLATE_RE.pattern, case=False, regex=True, na=False)
	    sent_df = sent_df[clean_mask].reset_index(drop=True)
	    # Per-paper sentence counter, computed once after filtering.
	    sent_df["sent_id"] = sent_df.groupby("_paper_id").cumcount()
	    debug(f">>> Filtered: {n_before} → {len(sent_df)} sentences")

	    n_sentences = len(sent_df)
	    n_papers = len(df)
	    if n_sentences == 0:
	        return (f"[{run_key}] Cannot run: no sentences of at least {MIN_SENT_LEN} characters "
	                f"remained after splitting and boilerplate filtering ({n_papers} papers).")

	    # ── Step 4: Embed sentences (384d, L2-normalized) ──
	    debug(">>> Embedding with all-MiniLM-L6-v2...")
	    docs = sent_df["text"].tolist()
	    embedder = SentenceTransformer("all-MiniLM-L6-v2")
	    embeddings = embedder.encode(docs, show_progress_bar=False, normalize_embeddings=True)
	    debug(f">>> Embeddings: {embeddings.shape}")
	    np.save(f"{CHECKPOINT_DIR}/rq4_{run_key}_emb.npy", embeddings)

	    # ── Step 5: LLM recommends UMAP + HDBSCAN params ──
	    debug(">>> Asking LLM for optimal clustering parameters...")
	    llm_params = _llm_recommend_params(n_sentences, n_papers, run_key)
	    debug(f">>> LLM params: {llm_params}")
	    _save_json("llm_params", llm_params)
	    # Tolerate an LLM reply that omitted a UMAP key.
	    n_components = llm_params.get("umap_n_components", 10)
	    n_neighbors = llm_params.get("umap_n_neighbors", 15)

	    # ── Step 6: UMAP dim reduction ──
	    debug(f">>> UMAP: {embeddings.shape[1]}d → {n_components}d ...")
	    umap_model = UMAP(
	        n_components=n_components,
	        n_neighbors=n_neighbors,
	        min_dist=0.0,
	        metric="cosine",
	        random_state=42,
	    )
	    embeddings_2d = umap_model.fit_transform(embeddings)
	    debug(f">>> UMAP output shape: {embeddings_2d.shape}")

	    # ── Step 7: Bayesian optimization for HDBSCAN params ──
	    best_params = _bayesian_optimize_hdbscan(embeddings_2d, llm_params)
	    debug(f">>> Optimized HDBSCAN params: {best_params}")
	    _save_json("hdbscan_params", best_params)

	    # ── Step 8: Fit BERTopic with UMAP + HDBSCAN ──
	    debug(">>> Fitting BERTopic with UMAP + HDBSCAN...")
	    cluster_model = hdbscan_lib.HDBSCAN(
	        min_cluster_size=best_params["min_cluster_size"],
	        min_samples=best_params["min_samples"],
	        cluster_selection_epsilon=best_params["cluster_selection_epsilon"],
	        metric="euclidean",
	        cluster_selection_method="eom",
	        prediction_data=True,
	    )
	    # Pass identity UMAP since we already reduced
	    no_umap = FunctionTransformer()
	    topic_model = BERTopic(
	        hdbscan_model=cluster_model,
	        umap_model=no_umap,
	    )
	    topics, probs = topic_model.fit_transform(docs, embeddings_2d)
	    topics_arr = np.array(topics)
	    n_topics = len(set(topics)) - int(-1 in topics)
	    n_outliers = int(np.sum(topics_arr == -1))
	    debug(f">>> {n_topics} topics, {n_outliers} outliers ({100*n_outliers/len(topics):.1f}%)")

	    _data[f"{run_key}_model"] = topic_model
	    _data[f"{run_key}_topics"] = topics_arr
	    _data[f"{run_key}_embeddings"] = embeddings
	    _data[f"{run_key}_embeddings_2d"] = embeddings_2d
	    _data[f"{run_key}_sent_df"] = sent_df

	    # ── Step 9: Plotly visualizations ──
	    # Each chart is only attempted from the topic count at which it was
	    # originally enabled (barchart ≥1, hierarchy/heatmap ≥2, intertopic ≥3).
	    debug(">>> Generating visualizations...")
	    if n_topics >= 3:
	        topic_model.visualize_topics().write_html(
	            f"/tmp/rq4_{run_key}_intertopic.html", include_plotlyjs="cdn")
	    if n_topics >= 1:
	        topic_model.visualize_barchart(
	            top_n_topics=min(10, max(1, n_topics))).write_html(
	            f"/tmp/rq4_{run_key}_bars.html", include_plotlyjs="cdn")
	    if n_topics >= 2:
	        topic_model.visualize_hierarchy().write_html(
	            f"/tmp/rq4_{run_key}_hierarchy.html", include_plotlyjs="cdn")
	        topic_model.visualize_heatmap().write_html(
	            f"/tmp/rq4_{run_key}_heatmap.html", include_plotlyjs="cdn")

	    # ── Step 10: Centroid nearest 5 sentences per topic ──
	    topic_info = topic_model.get_topic_info()
	    valid_rows = [r for r in topic_info.to_dict("records") if r["Topic"] != -1]

	    def _centroid_nearest(row):
	        # Rank the topic's sentences by cosine distance to the topic centroid
	        # (computed on the original embeddings, not the reduced ones).
	        mask = topics_arr == row["Topic"]
	        member_idx = np.where(mask)[0]
	        member_embs = embeddings[mask]
	        centroid = member_embs.mean(axis=0)
	        norms = np.linalg.norm(member_embs, axis=1) * np.linalg.norm(centroid)
	        dists = 1 - (member_embs @ centroid) / (norms + 1e-10)
	        nearest = np.argsort(dists)[:NEAREST_K]

	        def _evidence(local_i):
	            rec = sent_df.iloc[member_idx[local_i]]
	            return {
	                "sentence": str(rec["text"])[:250],
	                "paper_id": int(rec["_paper_id"]),
	                "title": str(rec.get("Title", ""))[:150],
	                "keywords": str(rec.get("Author Keywords", ""))[:150],
	            }

	        nearest_evidence = [_evidence(i) for i in nearest]

	        topic_papers_df = sent_df.iloc[member_idx].drop_duplicates(subset=["_paper_id"])
	        unique_papers = len(topic_papers_df)
	        # At most 50 paper titles are kept per topic.
	        paper_titles = [
	            str(topic_papers_df.iloc[i].get("Title", ""))[:200]
	            for i in range(min(50, unique_papers))
	        ]

	        return {"topic_id": int(row["Topic"]),
	                "sentence_count": int(row["Count"]),
	                "paper_count": int(unique_papers),
	                "top_words": str(row.get("Name", ""))[:100],
	                "nearest": nearest_evidence,
	                "paper_titles": paper_titles}

	    summaries = [_centroid_nearest(r) for r in valid_rows]
	    _save_json("summaries", summaries, default=str)
	    debug(f">>> {len(summaries)} topics saved")

	    lines = [
	        f" Topic {s['topic_id']} ({s['sentence_count']} sentences, {s['paper_count']} papers): {s['top_words']}"
	        for s in summaries
	    ]

	    sil = best_params.get("silhouette_score")
	    sil_str = f"{sil:.3f}" if sil is not None else "N/A"
	    return (
	        f"[{run_key}] {n_topics} topics from {n_sentences} sentences ({n_papers} papers, {n_outliers} outliers).\n\n"
	        f"Pipeline:\n"
	        f" Embeddings: all-MiniLM-L6-v2 (384d)\n"
	        f" LLM-recommended params: n_components={n_components}, "
	        f"n_neighbors={n_neighbors} | {llm_params.get('reasoning', '')}\n"
	        f" Bayesian optimization ({best_params['n_trials']} trials): "
	        f"min_cluster_size={best_params['min_cluster_size']}, "
	        f"min_samples={best_params['min_samples']}, "
	        f"epsilon={best_params['cluster_selection_epsilon']:.3f}, "
	        f"silhouette={sil_str}\n\n"
	        + "\n".join(lines)
	        + f"\n\nVisualizations: /tmp/rq4_{run_key}_*.html"
	        + f"\nCheckpoints: {CHECKPOINT_DIR}/rq4_{run_key}_emb.npy + summaries.json"
	    )


	# ═══════════════════════════════════════════════
	# TOOL 3: Label Topics — 3 LLM calls + 1 judge per cluster
	# ═══════════════════════════════════════════════
	@tool
	def label_topics_with_llm(run_key: str) -> str:
	    """For each cluster: 3 independent Mistral labeling calls + 1 orchestration judge call.
	    Also runs abstract-title cosine similarity coherence check per cluster.

	    Args:
	        run_key: One of 'abstract' or 'title'.

	    Returns:
	        Labeled topics with sentence-level evidence and coherence scores."""
	    debug(f"\n>>> TOOL: label_topics_with_llm(run_key='{run_key}')")

	    # Context managers close the checkpoint files deterministically.
	    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_summaries.json") as fh:
	        summaries = json.load(fh)
	    debug(f">>> Loaded {len(summaries)} topics")

	    # Top 100 by size to avoid API rate limits
	    MAX_LABEL_TOPICS = 100
	    sorted_summaries = sorted(summaries, key=lambda s: s.get("sentence_count", 0), reverse=True)
	    summaries_to_label = sorted_summaries[:MAX_LABEL_TOPICS]
	    skipped_summaries = sorted_summaries[MAX_LABEL_TOPICS:]
	    debug(f">>> Labeling top {len(summaries_to_label)} (skipping {len(skipped_summaries)} tiny clusters)")

	    # Run 3+1 LLM calls per cluster in parallel (thread pool, max 5 concurrent)
	    debug(">>> Running 3-call + 1 judge labeling pipeline per cluster...")
	    labeled = []
	    with ThreadPoolExecutor(max_workers=5) as executor:
	        future_to_summary = {
	            executor.submit(_label_single_cluster, s, run_key): s
	            for s in summaries_to_label
	        }
	        for future in as_completed(future_to_summary):
	            try:
	                labeled.append(future.result())
	            except Exception as e:
	                # Keep the cluster with a placeholder label rather than dropping it.
	                s = future_to_summary[future]
	                debug(f">>> Labeling failed for cluster {s.get('topic_id', '?')}: {e}")
	                labeled.append({**s, "label": f"Cluster {s.get('topic_id', '?')}",
	                                "category": "unknown", "confidence": "low",
	                                "reasoning": str(e)})

	    # Sort back by topic_id
	    labeled.sort(key=lambda x: x.get("topic_id", 0))

	    # ── Abstract-title coherence check ──
	    debug(">>> Running abstract-title coherence validation...")
	    topics_arr = _data.get(f"{run_key}_topics")
	    embeddings = _data.get(f"{run_key}_embeddings")
	    sent_df = _data.get(f"{run_key}_sent_df")

	    # Coherence check only meaningful for mixed-source runs
	    # For single-source runs (abstract or title only) it gracefully returns None
	    if topics_arr is not None and embeddings is not None and sent_df is not None:
	        labeled = [
	            _validate_cluster_coherence(topic, sent_df, topics_arr, embeddings)
	            for topic in labeled
	        ]

	    # For small skipped clusters, copy without labeling
	    labeled_all = labeled + [
	        {**s, "label": s.get("top_words", f"Cluster {s['topic_id']}")[:50],
	         "category": "unlabeled", "confidence": "low",
	         "reasoning": "skipped (small cluster)", "coherence_score": None}
	        for s in skipped_summaries
	    ]
	    labeled_all.sort(key=lambda x: x.get("topic_id", 0))

	    labels_path = f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json"
	    with open(labels_path, "w") as fh:
	        json.dump(labeled_all, fh, indent=2, default=str)
	    debug(f">>> Labels saved: {CHECKPOINT_DIR}/rq4_{run_key}_labels.json")

	    def _cs(topic):
	        cs = topic.get("coherence_score")
	        return f"{cs:.2f}" if cs is not None else "N/A"

	    def _format_topic(topic):
	        # One header line per topic followed by its evidence sentences.
	        warning = topic.get("coherence_warning")
	        header = (
	            f" Topic {topic.get('topic_id', '?')}: {topic.get('label', '?')} "
	            f"[{topic.get('category', '?')}] conf={topic.get('confidence', '?')} "
	            f"coherence={_cs(topic)}"
	            + (f" ⚠️ {warning}" if warning else "")
	            + f" ({topic.get('sentence_count', 0)} sent, {topic.get('paper_count', 0)} papers)\n"
	        )
	        evidence = "\n".join(
	            f" → \"{e['sentence'][:120]}...\" — _{e['title'][:60]}_"
	            for e in topic.get("nearest", [])
	        )
	        return header + evidence

	    lines = [_format_topic(topic) for topic in labeled_all]
	    return (f"[{run_key}] {len(labeled_all)} topics labeled (3 LLM calls + judge per cluster):\n\n"
	            + "\n\n".join(lines))


	# ═══════════════════════════════════════════════
	# TOOL 4: Generate Comparison CSV
	# ═══════════════════════════════════════════════
	@tool
	def generate_comparison_csv() -> str:
	    """Compare Mistral-labeled topics across completed runs.

	    Returns:
	        Comparison table + CSV path."""
	    debug("\n>>> TOOL: generate_comparison_csv()")
	    # A run counts as completed when its labels checkpoint exists.
	    completed = [
	        k for k in RUN_CONFIGS
	        if os.path.exists(f"{CHECKPOINT_DIR}/rq4_{k}_labels.json")
	    ]

	    # Explicit column order; also gives the CSV a header when no run exists yet.
	    columns = ["run", "topic_id", "label", "category", "confidence", "niche",
	               "sentences", "papers", "coherence_score", "coherence_warning",
	               "top_words"]

	    def _load_run(run_key):
	        # Flatten one run's labels file into comparison rows.
	        with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json") as fh:
	            labels = json.load(fh)
	        return [{
	            "run": run_key, "topic_id": t.get("topic_id", ""),
	            "label": t.get("label", ""), "category": t.get("category", ""),
	            "confidence": t.get("confidence", ""), "niche": t.get("niche", ""),
	            "sentences": t.get("sentence_count", 0),
	            "papers": t.get("paper_count", 0),
	            "coherence_score": t.get("coherence_score", ""),
	            "coherence_warning": t.get("coherence_warning", ""),
	            "top_words": t.get("top_words", ""),
	        } for t in labels]

	    # Single pass over all runs (avoids repeated list copies from sum(..., [])).
	    all_rows = [row for run_key in completed for row in _load_run(run_key)]
	    df = pd.DataFrame(all_rows, columns=columns)
	    path = "/tmp/rq4_comparison.csv"
	    df.to_csv(path, index=False)
	    return f"Comparison saved: {path} ({len(completed)} runs, {len(df)} topics)\n\n{df.to_string(index=False)}"


	# ═══════════════════════════════════════════════
	# TOOL 5: Export 500-Word Narrative
	# ═══════════════════════════════════════════════
	@tool
	def export_narrative(run_key: str) -> str:
	    """Generate 500-word narrative for research paper Section 7 via Mistral.

	    Args:
	        run_key: One of 'abstract' or 'title'.

	    Returns:
	        500-word narrative + save path."""
	    debug(f"\n>>> TOOL: export_narrative(run_key='{run_key}')")
	    from langchain_mistralai import ChatMistralAI

	    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json") as fh:
	        labels = json.load(fh)

	    # HDBSCAN params are optional: the prompt shows 'N/A' when the file is absent.
	    hdbscan_params = {}
	    hdbscan_path = f"{CHECKPOINT_DIR}/rq4_{run_key}_hdbscan_params.json"
	    if os.path.exists(hdbscan_path):
	        with open(hdbscan_path) as fh:
	            hdbscan_params.update(json.load(fh))

	    def _fmt_score(s):
	        return f"{s:.3f}" if s is not None else "N/A"

	    def _fmt_coherence(topic):
	        cs = topic.get("coherence_score")
	        return f"{cs:.2f}" if cs is not None else "N/A"

	    topics_text = "\n".join(
	        (f"- {t.get('label', '?')} ({t.get('sentence_count', 0)} sentences from "
	         f"{t.get('paper_count', 0)} papers, category: {t.get('category', '?')}, "
	         f"confidence: {t.get('confidence', '?')}, "
	         f"coherence: {_fmt_coherence(t)})")
	        for t in labels)

	    # Report the real dataset size and trial count when known; fall back to the
	    # previously hard-coded figures (1390 papers, 30 trials) otherwise.
	    loaded_df = _data.get("df")
	    n_papers = len(loaded_df) if loaded_df is not None else 1390
	    n_trials = hdbscan_params.get("n_trials", 30)

	    sil_score_str = _fmt_score(hdbscan_params.get("silhouette_score"))
	    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.3, timeout=300)
	    result = llm.invoke(
	        f"Write exactly 500 words for a research paper Section 7 titled "
	        f"'Topic Modeling Results — BERTopic Discovery'.\n\n"
	        f"Dataset: {n_papers} Scopus papers on Tourism and AI.\n"
	        f"Method: Sentence-level BERTopic — each abstract split into sentences, "
	        f"embedded with all-MiniLM-L6-v2 (384d), UMAP dim reduction, HDBSCAN clustering.\n"
	        f"Clustering optimized via Bayesian optimization (Optuna, {n_trials} trials): "
	        f"min_cluster_size={hdbscan_params.get('min_cluster_size', 'N/A')}, "
	        f"min_samples={hdbscan_params.get('min_samples', 'N/A')}, "
	        f"silhouette={sil_score_str}.\n"
	        f"Each cluster labeled via 3 independent LLM calls + 1 judge orchestration.\n"
	        f"Coherence validated via abstract-title cosine similarity.\n"
	        f"Run config: '{run_key}' columns.\n\n"
	        f"Topics discovered:\n{topics_text}\n\n"
	        f"Include: methodology justification, key themes, coherence findings, limitations, future work."
	    )

	    # NOTE: the output path does not include run_key, so each call overwrites it.
	    path = "/tmp/rq4_narrative.txt"
	    with open(path, "w", encoding="utf-8") as fh:
	        fh.write(result.content)
	    return f"Narrative saved: {path}\n\n{result.content}"


	# ═══════════════════════════════════════════════
	# TOOL 6: Consolidate Round 1 Topics into Themes
	# ═══════════════════════════════════════════════
	@tool
	def consolidate_into_themes(run_key: str, theme_map: dict) -> str:
	    """ROUND 2: Merge fine-grained Round 1 topics into broader themes.

	    Args:
	        run_key: 'abstract' or 'title'.
	        theme_map: Dict mapping theme names to topic ID lists.
	            Example: {"AI in Tourism": [0, 1, 5], "VR Tourism": [2, 3]}

	    Returns:
	        Consolidated themes with new 5-nearest sentence evidence per theme."""
	    debug(f"\n>>> TOOL: consolidate_into_themes(run_key='{run_key}', {len(theme_map)} themes)")

	    topics_arr = _data[f"{run_key}_topics"]
	    embeddings = _data[f"{run_key}_embeddings"]
	    sent_df = _data[f"{run_key}_sent_df"]

	    def _build_theme(item):
	        theme_name, topic_ids = item
	        # IDs may arrive as strings (e.g. "3"); np.isin would then match nothing
	        # against the integer topic array, so normalise them to int first.
	        topic_ids = [int(t) for t in topic_ids]
	        mask = np.isin(topics_arr, topic_ids)
	        member_idx = np.where(mask)[0]

	        nearest_evidence = []
	        if member_idx.size:
	            # Rank member sentences by cosine distance to the theme centroid.
	            # Skipped for an empty theme, where the mean would be NaN.
	            member_embs = embeddings[mask]
	            centroid = member_embs.mean(axis=0)
	            norms = np.linalg.norm(member_embs, axis=1) * np.linalg.norm(centroid)
	            dists = 1 - (member_embs @ centroid) / (norms + 1e-10)
	            for i in np.argsort(dists)[:NEAREST_K]:
	                rec = sent_df.iloc[member_idx[i]]
	                nearest_evidence.append({
	                    "sentence": str(rec["text"])[:250],
	                    "paper_id": int(rec["_paper_id"]),
	                    "title": str(rec.get("Title", ""))[:150],
	                    "keywords": str(rec.get("Author Keywords", ""))[:150],
	                })

	        members = sent_df.iloc[member_idx]
	        unique_papers = members["_paper_id"].nunique()
	        topic_papers_df = members.drop_duplicates(subset=["_paper_id"])
	        # At most 50 paper titles are kept per theme.
	        paper_titles = [
	            str(topic_papers_df.iloc[i].get("Title", ""))[:200]
	            for i in range(min(50, len(topic_papers_df)))
	        ]

	        return {"label": theme_name, "merged_topics": topic_ids,
	                "sentence_count": int(mask.sum()), "paper_count": int(unique_papers),
	                "nearest": nearest_evidence, "paper_titles": paper_titles}

	    # Themes get sequential ids in theme_map order.
	    themes = [
	        {**_build_theme(item), "topic_id": theme_id}
	        for theme_id, item in enumerate(theme_map.items())
	    ]
	    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_themes.json", "w") as fh:
	        json.dump(themes, fh, indent=2, default=str)

	    def _format_theme(t):
	        evidence = "\n".join(
	            f" → \"{e['sentence'][:120]}...\" — _{e['title'][:60]}_"
	            for e in t["nearest"])
	        return (f" {t['label']} ({t['sentence_count']} sentences, {t['paper_count']} papers)\n"
	                f" Merged from topics: {t['merged_topics']}\n"
	                f" Evidence:\n"
	                + evidence)

	    lines = [_format_theme(t) for t in themes]
	    return f"[{run_key}] Round 2: {len(themes)} themes consolidated:\n\n" + "\n\n".join(lines)


	# ═══════════════════════════════════════════════
	# TOOL 7: Compare Themes with PAJAIS Taxonomy
	# ═══════════════════════════════════════════════
	# Category names of the PAJAIS/PACIS taxonomy (Jiang et al., 2019).
	# compare_with_taxonomy() renders each entry as a "- <category>" bullet inside
	# its LLM prompt, so these strings reach the model exactly as written here.
	PAJAIS_TAXONOMY = [
	"Electronic and Mobile Business / Social Commerce",
	"Human Behavior and IS / Human-Computer Interaction",
	"IS/IT Strategy, Leadership, Governance",
	"Business Intelligence and Data Analytics",
	"Design Science and IS",
	"Enterprise Systems and BPM",
	"IS Implementation, Adoption, and Diffusion",
	"Social Media and Business Impact",
	"Cultural and Global Issues in IS",
	"IS Security and Privacy",
	"IS Smart / IoT",
	"Knowledge Management",
	"ICT / Digital Platform / IT and Work",
	"IS Healthcare",
	"IT Project Management",
	"Service Science and IS",
	"Social and Organizational Aspects of IS",
	"Research Methods and Philosophy",
	"E-Finance / Economics of IS",
	"E-Government",
	"IS Education and Learning",
	"Green IT and Sustainability",
	]


	@tool
	def compare_with_taxonomy(run_key: str) -> str:
	    """Compare BERTopic themes against PAJAIS/PACIS taxonomy (Jiang et al., 2019).

	    Args:
	        run_key: 'abstract' or 'title'.

	    Returns:
	        Mapping table: BERTopic theme → PAJAIS category (or NOVEL)."""
	    # NOTE: the docstring above is left unchanged on purpose — the @tool
	    # decorator exposes it to the agent as this tool's description.
	    #
	    # Side effects: reads rq4_<run_key>_themes.json (or rq4_<run_key>_labels.json
	    # when no themes checkpoint exists) from CHECKPOINT_DIR, calls the Mistral
	    # chat model once, and writes rq4_<run_key>_taxonomy_map.json.
	    # Raises FileNotFoundError (from open) when neither input checkpoint exists.
	    debug(f"\n>>> TOOL: compare_with_taxonomy(run_key='{run_key}')")
	    from langchain_mistralai import ChatMistralAI
	    from langchain_core.prompts import PromptTemplate
	    from langchain_core.output_parsers import JsonOutputParser

	    themes_path = f"{CHECKPOINT_DIR}/rq4_{run_key}_themes.json"
	    labels_path = f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json"
	    # Prefer the consolidated themes; fall back to the labels checkpoint.
	    source_path = themes_path if os.path.exists(themes_path) else labels_path
	    with open(source_path) as src:
	        themes = json.load(src)

	    # Labels entries may carry 'count' instead of 'paper_count'; '?' if neither.
	    themes_text = "\n".join(
	        f"- {t.get('label', '?')} ({t.get('paper_count', t.get('count', '?'))} papers)"
	        for t in themes)
	    taxonomy_text = "\n".join(f"- {c}" for c in PAJAIS_TAXONOMY)

	    prompt = PromptTemplate.from_template(
	        "You are an IS research taxonomy expert.\n\n"
	        "Compare each BERTopic theme against the established PAJAIS/PACIS taxonomy.\n"
	        "For EACH theme, return a JSON ARRAY with:\n"
	        "- label: the BERTopic theme name\n"
	        "- pajais_match: closest PAJAIS category (or 'NOVEL' if no match)\n"
	        "- match_confidence: high, medium, low, or none\n"
	        "- reasoning: why this mapping (1 sentence)\n"
	        "- is_novel: true if this theme represents an emerging area not in the taxonomy\n\n"
	        "Return ONLY valid JSON array.\n\n"
	        "BERTopic Themes:\n{themes}\n\n"
	        "PAJAIS Taxonomy (Jiang et al., 2019):\n{taxonomy}")

	    llm = ChatMistralAI(model="mistral-small-latest", temperature=0, timeout=300)
	    # Compose prompt -> model -> JSON parser into one runnable.
	    chain = prompt | llm | JsonOutputParser()
	    parsed = chain.invoke({"themes": themes_text, "taxonomy": taxonomy_text})

	    # The model is asked for a JSON array of objects, but nothing enforces it.
	    # If it wraps the array in an object, use the first list found inside;
	    # a lone object is treated as a one-element array. Non-object items are
	    # dropped so the .get() calls below cannot fail.
	    if isinstance(parsed, dict):
	        parsed = next((v for v in parsed.values() if isinstance(v, list)), [parsed])
	    mappings = [m for m in parsed if isinstance(m, dict)] if isinstance(parsed, list) else []

	    out_path = f"{CHECKPOINT_DIR}/rq4_{run_key}_taxonomy_map.json"
	    with open(out_path, "w") as out:
	        json.dump(mappings, out, indent=2, default=str)

	    novel = [m for m in mappings if m.get("is_novel", False)]
	    mapped = [m for m in mappings if not m.get("is_novel", False)]

	    mapped_lines = [
	        f" ✅ {m.get('label', '?')} → {m.get('pajais_match', '?')} "
	        f"(conf={m.get('match_confidence', '?')}) _{m.get('reasoning', '')}_"
	        for m in mapped]
	    novel_lines = [
	        f" 🆕 {m.get('label', '?')} → NOVEL _{m.get('reasoning', '')}_"
	        for m in novel]

	    return (f"[{run_key}] Taxonomy comparison (Jiang et al., 2019):\n\n"
	            f"Mapped to PAJAIS categories ({len(mapped)}):\n" + "\n".join(mapped_lines) +
	            f"\n\nNOVEL / Emerging themes ({len(novel)}):\n" + "\n".join(novel_lines) +
	            f"\n\nSaved: {out_path}")


	# ═══════════════════════════════════════════════
	# GET ALL TOOLS
	# ═══════════════════════════════════════════════
	def get_all_tools():
	    """Return the module's agent tools with ``handle_tool_error`` set to True.

	    Each tool object is mutated in place before the list is returned.
	    NOTE(review): ``handle_tool_error`` is presumably the LangChain flag that
	    reports tool errors back to the agent instead of raising — confirm.
	    """
	    registry = [
	        load_scopus_csv,
	        run_bertopic_discovery,
	        label_topics_with_llm,
	        consolidate_into_themes,
	        compare_with_taxonomy,
	        generate_comparison_csv,
	        export_narrative,
	    ]
	    for entry in registry:
	        entry.handle_tool_error = True
	    debug(f">>> tools.py: {len(registry)} tools ready")
	    return registry