Spaces:

Dash10107
/

eis-topic-intelligence

Sleeping

Daksh C Jain

Initial commit: EIS Topic Intelligence — UMAP+HDBSCAN+Mistral council, dark EIS theme, 23 clusters from Enterprise Information Systems corpus

c91d9b4 6 days ago

raw

history blame contribute delete

28.2 kB

	"""
	tools.py — 7 @tool functions for BERTopic Agentic AI
	Assignment: Text Analysis & Topic Modelling (Prof. Shailaja Jha)
	Generated via: Anthropic Claude Sonnet 4.5
	Architecture: LangChain @tool + LangGraph | Model: Mistral Small Latest
	Rules: ZERO if/elif/else | ZERO for/while | ZERO try/except | handle_tool_error=True
	"""

	import os
	import re
	import json
	import numpy as np
	import pandas as pd
	import plotly.express as px
	import plotly.graph_objects as go
	from sklearn.cluster import AgglomerativeClustering
	from sklearn.metrics.pairwise import cosine_similarity
	from sklearn.decomposition import PCA
	from langchain_core.tools import tool
	from langchain_mistralai import ChatMistralAI
	from langchain_core.prompts import PromptTemplate
	from langchain_core.output_parsers import JsonOutputParser, StrOutputParser

	# ─── CONSTANTS ────────────────────────────────────────────────────────────────

	# Directory that receives every artefact written by the helpers below
	# (JSON, .npy, charts, CSV, narrative). It is created at import time.
	OUTPUT_DIR = "./outputs"
	os.makedirs(OUTPUT_DIR, exist_ok=True)

	MAX_LABEL_TOPICS = 100  # upper bound on clusters sent to the LLM for labelling
	NEAREST_K = 5  # sentences nearest a centroid that are kept as "top_sentences"
	BATCH_SIZE = 20  # clusters included in a single labelling request

	# Publisher / licence boilerplate removed from text before sentence splitting.
	# Each alternative consumes lazily up to the next full stop so that only the
	# boilerplate clause is dropped. The alternation bars and `*` quantifiers had
	# been lost to Markdown escaping, which left a pattern that could never match
	# (and a syntax error in the flags expression).
	BOILERPLATE_RE = re.compile(
	    r"©\s*\d{4}[^.]*?\.|All\s+rights\s+reserved\.?|"
	    r"Published\s+by\s+[A-Z][^.]*?\.|This\s+is\s+an\s+open\s+access[^.]*?\.|"
	    r"Correspondence\s+(to|author):[^.]*?\.|E-?mail:[^.]*?\.|"
	    r"Received:[^.]*?Accepted:[^.]*?\.|DOI:\S+|doi:\S+|https?://\S+|"
	    r"Keywords:[^.]*?\.|JEL[^.]*?\.|ISSN[^.]*?\.|ISBN[^.]*?\.|"
	    r"Elsevier[^.]*?\.|Springer[^.]*?\.|Emerald[^.]*?\.|"
	    r"Wiley[^.]*?\.|Taylor\s*&\s*Francis[^.]*?\.|"
	    r"This\s+paper\s+is\s+part\s+of[^.]*?\.|"
	    r"Conflict\s+of\s+interest[^.]*?\.|"
	    r"Funding[^.]*?:\s*[^.]*?\.|"
	    r"Acknowledgement[s]?:[^.]*?\.",
	    re.IGNORECASE | re.DOTALL,
	)
	# Sentence boundary: whitespace that follows ., ! or ? and precedes an
	# upper-case letter, a double quote or an opening parenthesis.
	SENT_RE = re.compile(r"(?<=[.!?])\s+(?=[A-Z\"(])")

	# The 25 category names used as the PAJAIS reference taxonomy when themes
	# are classified as mapped or novel. Order is preserved because the
	# categories are numbered by position when rendered into the LLM prompt.
	PAJAIS_25 = [
	    "IS Strategy and Management",
	    "E-Commerce and E-Business",
	    "IT Adoption and Diffusion",
	    "Business Intelligence and Analytics",
	    "Social Commerce and Social Media",
	    "Mobile Commerce and Applications",
	    "Knowledge Management",
	    "Healthcare Information Systems",
	    "Privacy, Security and Trust",
	    "Enterprise Systems and ERP",
	    "Digital Platforms and Ecosystems",
	    "Blockchain and Distributed Ledgers",
	    "Artificial Intelligence and Machine Learning",
	    "Human-Computer Interaction and UX",
	    "Digital Transformation and Innovation",
	    "Financial Technology and Digital Finance",
	    "Supply Chain and Logistics IS",
	    "Smart Systems IoT and Smart Cities",
	    "IS Research Methods and Theory",
	    "Recommender and Personalization Systems",
	    "Digital Marketing and Advertising",
	    "Virtual Teams and Online Collaboration",
	    "Cloud Computing and SaaS",
	    "Big Data Analytics and Data Science",
	    "IS Education and Training",
	]

	# Process-wide cache for the sentence-embedding model (populated lazily).
	_EMBED_MODEL = None


	def _get_embed_model():
	    """Return the shared "all-MiniLM-L6-v2" SentenceTransformer instance.

	    The model is constructed on the first call and reused afterwards.
	    """
	    global _EMBED_MODEL
	    # Explicit None test: the cache check must not depend on the model
	    # object's truthiness.
	    if _EMBED_MODEL is None:
	        # Imported only on a cache miss so the heavy dependency is not
	        # touched again once the model is loaded.
	        from sentence_transformers import SentenceTransformer

	        _EMBED_MODEL = SentenceTransformer("all-MiniLM-L6-v2")
	    return _EMBED_MODEL


	def _get_llm():
	    """Build a ChatMistralAI client for "mistral-small-latest".

	    The API key is read from the MISTRAL_API_KEY environment variable
	    (empty string when unset); temperature is fixed at 0.1.
	    """
	    mistral_key = os.environ.get("MISTRAL_API_KEY", "")
	    client = ChatMistralAI(
	        model="mistral-small-latest",
	        api_key=mistral_key,
	        temperature=0.1,
	    )
	    return client


	def _clean(text: str) -> str:
	    """Replace every BOILERPLATE_RE match in *text* with a space and trim the ends.

	    *text* is passed through ``str()`` first, so non-string values are accepted.
	    """
	    as_text = str(text)
	    without_boilerplate = BOILERPLATE_RE.sub(" ", as_text)
	    return without_boilerplate.strip()


	def _split(text: str) -> list:
	    """Clean *text*, split it on SENT_RE and keep pieces longer than 30 characters.

	    Each returned sentence is stripped of surrounding whitespace.
	    """
	    kept = []
	    for fragment in SENT_RE.split(_clean(text)):
	        sentence = fragment.strip()
	        if len(sentence) > 30:
	            kept.append(sentence)
	    return kept


	def _save(data, name: str) -> str:
	    """Write *data* as indented UTF-8 JSON to OUTPUT_DIR/*name* and return the path."""
	    target = os.path.join(OUTPUT_DIR, name)
	    with open(target, "w", encoding="utf-8") as handle:
	        # ensure_ascii=False keeps non-ASCII characters readable in the file.
	        json.dump(data, handle, indent=2, ensure_ascii=False)
	    return target


	def _load(name: str):
	    """Read OUTPUT_DIR/*name* as UTF-8 JSON and return the decoded object."""
	    source = os.path.join(OUTPUT_DIR, name)
	    with open(source, "r", encoding="utf-8") as handle:
	        content = json.load(handle)
	    return content


	def _opath(name: str) -> str:
	    """Return the path of *name* inside OUTPUT_DIR."""
	    full_path = os.path.join(OUTPUT_DIR, name)
	    return full_path


	def _generate_charts(run_key: str, data: list, name_key: str = "cluster_id"):
	    """Write the four Plotly HTML charts for *run_key* from *data*.

	    Each item of *data* must provide "centroid" and "sentence_count".
	    *name_key* selects the field used as the display name; with the default
	    "cluster_id" names are rendered as "C<id>". Files are written to
	    OUTPUT_DIR/<run_key>_charts. Does nothing when *data* is empty.
	    """
	    if not data:
	        return

	    def display_name(entry):
	        fallback = f"C{entry.get('cluster_id', '?')}"
	        return fallback if name_key == "cluster_id" else entry.get(name_key, fallback)

	    item_count = len(data)
	    centroid_matrix = np.array([entry["centroid"] for entry in data])
	    sentence_counts = [entry["sentence_count"] for entry in data]
	    display_names = [display_name(entry) for entry in data]

	    # A 2-D projection needs at least two points; a lone item sits at the origin.
	    if item_count >= 2:
	        coords = PCA(n_components=2).fit_transform(centroid_matrix)
	    else:
	        coords = np.zeros((item_count, 2))

	    chart_dir = _opath(f"{run_key}_charts")
	    os.makedirs(chart_dir, exist_ok=True)

	    def export(figure, filename):
	        figure.write_html(
	            os.path.join(chart_dir, filename),
	            include_plotlyjs="cdn",
	            full_html=True,
	        )

	    # 1. Scatter of projected centroids, marker size = sentence count.
	    scatter = px.scatter(
	        x=coords[:, 0],
	        y=coords[:, 1],
	        size=sentence_counts,
	        title=f"Intertopic Distance Map — {run_key.title()}",
	        labels={"x": "PC1", "y": "PC2"},
	        hover_name=display_names,
	        template="plotly_dark",
	    )
	    export(scatter, "intertopic_map.html")

	    # 2. Bar chart of the first 30 items, in the order supplied by the caller.
	    leading = data[:30]
	    bars = px.bar(
	        x=[display_name(entry) for entry in leading],
	        y=[entry["sentence_count"] for entry in leading],
	        title=f"Top 30 Cluster Sizes — {run_key.title()}",
	        labels={"x": "Cluster", "y": "Sentences"},
	        template="plotly_dark",
	    )
	    export(bars, "bar_chart.html")

	    # 3. Treemap with every item under a common "clusters" parent.
	    tree = px.treemap(
	        names=display_names,
	        parents=["clusters"] * item_count,
	        values=sentence_counts,
	        title=f"Topic Treemap — {run_key.title()}",
	    )
	    export(tree, "treemap.html")

	    # 4. 4x5 heatmap of the first 20 items, padded with zero-sized "Empty" cells.
	    cells = data[:20]
	    cells = cells + [{"sentence_count": 0, name_key: "Empty"}] * (20 - len(cells))
	    cell_values = np.array([cell.get("sentence_count", 0) for cell in cells]).reshape(4, 5)
	    flat_names = [display_name(cell) for cell in cells]
	    cell_text = [flat_names[row * 5:(row + 1) * 5] for row in range(4)]

	    heat = go.Figure(go.Heatmap(
	        z=cell_values,
	        colorscale="Viridis",
	        text=cell_text,
	        texttemplate="%{text}",
	        showscale=True,
	    ))
	    heat.update_layout(title=f"Topic Size Heatmap — {run_key.title()}", template="plotly_dark")
	    export(heat, "heatmap.html")


	# ─── TOOL 1: LOAD CSV ─────────────────────────────────────────────────────────

	@tool
	def load_scopus_csv(filepath: str) -> str:
	    """Load a Scopus CSV export file and return statistics.
	    Phase 1 of Braun & Clarke (2006) — Familiarisation.
	    Call this FIRST before any analysis. filepath must be the absolute path to the CSV."""
	    # The docstring above is kept verbatim: @tool uses it as the tool description.
	    #
	    # Reads the CSV, reports which expected columns exist, counts the
	    # sentences found in the Abstract and Title columns, and stores
	    # filepath / journal / row count / year range in corpus_config.json.
	    # Missing or empty columns are reported in the summary, not raised.
	    df = pd.read_csv(filepath, encoding="utf-8-sig", on_bad_lines="skip")
	    required = ["Title", "Abstract", "Authors", "Year", "Cited by",
	                "Author Keywords", "Source title"]
	    found = [c for c in required if c in df.columns]
	    missing = [c for c in required if c not in df.columns]

	    def sentence_pairs(column):
	        # (sentence, row index) pairs; an absent column contributes nothing.
	        if column not in df.columns:
	            return []
	        texts = df[column].fillna("").tolist()
	        return [(s, i) for i, t in enumerate(texts) for s in _split(t)]

	    pairs_abs = sentence_pairs("Abstract")
	    pairs_ttl = sentence_pairs("Title")

	    # Non-numeric years are coerced to NaN and dropped, so int() never sees NaN.
	    if "Year" in df.columns:
	        years = pd.to_numeric(df["Year"], errors="coerce").dropna()
	    else:
	        years = pd.Series(dtype="float64")
	    year_min = int(years.min()) if len(years) else 0
	    year_max = int(years.max()) if len(years) else 0

	    # Most frequent source title; "Unknown" when the column is absent or empty.
	    if "Source title" in df.columns:
	        journal_counts = df["Source title"].value_counts()
	    else:
	        journal_counts = pd.Series(dtype="int64")
	    journal = str(journal_counts.index[0]) if len(journal_counts) else "Unknown"

	    _save({"filepath": filepath, "journal": journal,
	           "rows": len(df), "year_min": year_min, "year_max": year_max},
	          "corpus_config.json")
	    return (
	        f"✅ CSV Loaded\nJournal: {journal}\nPapers: {len(df)}\n"
	        f"Year Range: {year_min}–{year_max}\n"
	        f"Columns Found ({len(found)}/7): {found}\nMissing: {missing}\n"
	        f"Abstract sentences: {len(pairs_abs):,}\n"
	        f"Title sentences: {len(pairs_ttl):,}\n"
	        f"Type 'run abstract' to begin Phase 2."
	    )


	# ─── TOOL 2: RUN BERTOPIC DISCOVERY ──────────────────────────────────────────

	def _split_oversized(labels_arr, emb, target_size, max_size):
	    """Re-cluster every cluster larger than *max_size* until none remains."""
	    while True:
	        u_labels, counts = np.unique(labels_arr, return_counts=True)
	        too_big = u_labels[counts > max_size]
	        if len(too_big) == 0:
	            return labels_arr
	        for cid in too_big:
	            idx = np.where(labels_arr == cid)[0]
	            split_k = int(np.ceil(len(idx) / target_size))
	            sub_labels = AgglomerativeClustering(
	                n_clusters=split_k, metric="euclidean", linkage="ward"
	            ).fit_predict(emb[idx])
	            # Sub-cluster 0 keeps the original id; the others get fresh ids.
	            next_id = int(labels_arr.max()) + 1
	            for sub_id in range(1, split_k):
	                labels_arr[idx[sub_labels == sub_id]] = next_id
	                next_id += 1


	def _merge_undersized(labels_arr, emb, min_size, max_size, min_clusters=5):
	    """Fold clusters smaller than *min_size* into their most similar neighbour."""
	    while True:
	        u_labels, counts = np.unique(labels_arr, return_counts=True)
	        too_small = u_labels[counts < min_size]
	        if len(too_small) == 0 or len(u_labels) <= min_clusters:
	            return labels_arr

	        cid = too_small[0]
	        idx = np.where(labels_arr == cid)[0]
	        centroid = emb[idx].mean(axis=0, keepdims=True)

	        others = [(int(o), int(n)) for o, n in zip(u_labels, counts) if o != cid]
	        other_centroids = np.vstack(
	            [emb[labels_arr == other_id].mean(axis=0) for other_id, _ in others]
	        )
	        sims = cosine_similarity(centroid, other_centroids)[0]

	        # Prefer neighbours that stay within 1.5 x max_size after the merge;
	        # when none qualifies, fall back to the nearest neighbour regardless.
	        fits = np.array([n + len(idx) <= max_size * 1.5 for _, n in others])
	        candidates = np.where(fits)[0] if fits.any() else np.arange(len(others))
	        best = candidates[int(np.argmax(sims[candidates]))]
	        labels_arr[idx] = others[best][0]


	@tool
	def run_bertopic_discovery(run_key: str, target_size: int = 250) -> str:
	    """Embed sentences with all-MiniLM-L6-v2 and apply Balanced Agglomerative Clustering.
	    Dynamic K selection based on data size (target_size=250 sentences per topic).
	    Includes automatic splitting of massive clusters and merging of tiny clusters
	    to guarantee minimal size disparity across all discovered topics.
	    Saves {run_key}_summaries.json + {run_key}_emb.npy. Phase 2 of Braun & Clarke.
	    run_key must be 'abstract' or 'title'. target_size guides the dynamic cluster counts."""
	    # The docstring above is kept verbatim: @tool uses it as the tool description.
	    #
	    # Raises ValueError when target_size < 1 or when fewer than two usable
	    # sentences are found, instead of failing later with ZeroDivisionError or
	    # an opaque clustering error.
	    if target_size < 1:
	        raise ValueError(f"target_size must be >= 1, got {target_size}")

	    cfg = _load("corpus_config.json")
	    df = pd.read_csv(cfg["filepath"], encoding="utf-8-sig", on_bad_lines="skip")
	    # Any run_key other than 'abstract' reads the Title column.
	    col = "Abstract" if run_key == "abstract" else "Title"
	    pairs = [(s, i) for i, t in enumerate(df[col].fillna("").tolist())
	             for s in _split(t)]
	    sentences = [p[0] for p in pairs]
	    paper_ids = [p[1] for p in pairs]
	    if len(sentences) < 2:
	        raise ValueError(
	            f"Need at least 2 sentences to cluster, found {len(sentences)} "
	            f"in column '{col}'"
	        )

	    model = _get_embed_model()
	    emb = model.encode(sentences, normalize_embeddings=True,
	                       batch_size=64, show_progress_bar=True)
	    np.save(_opath(f"{run_key}_emb.npy"), emb)
	    _save({"sentences": sentences, "paper_ids": paper_ids},
	          f"{run_key}_sentences.json")

	    # Dynamic sizing: at least 5 clusters, but never more clusters than sentences.
	    total_sents = len(sentences)
	    dynamic_k = min(max(5, total_sents // target_size), total_sents)
	    max_size = target_size * 2
	    min_size = target_size // 2

	    labels_arr = AgglomerativeClustering(
	        n_clusters=dynamic_k, metric="euclidean", linkage="ward"
	    ).fit_predict(emb)

	    # 1. Split clusters above max_size; 2. merge clusters below min_size
	    #    (stopping once only 5 clusters remain).
	    labels_arr = _split_oversized(labels_arr, emb, target_size, max_size)
	    labels_arr = _merge_undersized(labels_arr, emb, min_size, max_size)

	    unique_labels = np.unique(labels_arr)
	    n_clusters = len(unique_labels)

	    # Per-cluster sentence indices as plain Python ints (JSON serialisable).
	    cluster_sentence_idx = {
	        int(cid): [int(i) for i in np.where(labels_arr == cid)[0]]
	        for cid in unique_labels
	    }

	    def make_summary(cid):
	        idx = cluster_sentence_idx[int(cid)]
	        c_emb = emb[idx]
	        centroid = c_emb.mean(axis=0, keepdims=True)
	        sims = cosine_similarity(centroid, c_emb)[0]
	        top_k = min(NEAREST_K, len(idx))
	        # Positions of the top_k most central sentences, most similar first.
	        top_local = [int(j) for j in np.argsort(sims)[-top_k:][::-1]]
	        top_global = [idx[j] for j in top_local]
	        return {
	            "cluster_id": int(cid),
	            "sentence_count": len(idx),
	            "paper_count": len(set(paper_ids[i] for i in idx)),
	            "top_sentences": [sentences[i] for i in top_global],
	            "centroid": centroid[0].tolist(),
	            "sentence_indices": idx,
	        }

	    summaries = [make_summary(cid) for cid in unique_labels]
	    summaries = sorted(summaries, key=lambda x: x["sentence_count"], reverse=True)
	    _save(summaries, f"{run_key}_summaries.json")

	    # ── 4 Plotly Charts ───────────────────────────────────────────────────────
	    _generate_charts(run_key, summaries, name_key="cluster_id")
	    chart_dir = _opath(f"{run_key}_charts")

	    return (
	        f"✅ BERTopic Discovery Complete ({run_key})\n"
	        f"Total sentences: {len(sentences):,}\n"
	        f"Topics generated: {n_clusters} (Dynamic via target_size={target_size})\n"
	        f"Algorithm: Constrained Agglomerative (Split & Merge Balanced)\n"
	        f"Largest cluster: {summaries[0]['sentence_count']} sentences\n"
	        f"Smallest cluster: {summaries[-1]['sentence_count']} sentences\n"
	        f"Charts saved to {chart_dir}\n"
	        f"Now calling label_topics_with_llm..."
	    )


	# ─── TOOL 3: LABEL TOPICS WITH LLM ───────────────────────────────────────────

	@tool
	def label_topics_with_llm(run_key: str) -> str:
	    """Send top 100 clusters to Mistral for labelling.
	    Returns topic labels, categories, confidence scores, reasoning, is_niche.
	    Saves {run_key}_labels.json. Phase 2 of Braun & Clarke.
	    run_key must be 'abstract' or 'title'."""
	    # The docstring above is kept verbatim: @tool uses it as the tool description.
	    #
	    # Reads {run_key}_summaries.json, labels the clusters in batches of
	    # BATCH_SIZE, merges each label record into its cluster summary and
	    # regenerates the charts using the labels as display names.
	    summaries = _load(f"{run_key}_summaries.json")[:MAX_LABEL_TOPICS]
	    llm = _get_llm()

	    label_prompt = PromptTemplate.from_template(
	        "You are a bibliometric research expert.\n"
	        "Label each cluster below with a concise research area name.\n"
	        "Return ONLY a JSON array — one object per cluster:\n"
	        ' {{"cluster_id": N, "label": "...", "category": "...", '
	        '"confidence": 0.0-1.0, "reasoning": "...", "is_niche": true/false}}\n\n'
	        "Clusters (ID | sentence_count | top 2 sentences):\n{clusters}\n\n"
	        "Return valid JSON array only, no markdown fences."
	    )
	    chain = label_prompt | llm | StrOutputParser()

	    def _format_batch(batch):
	        return "\n".join(
	            f"{s['cluster_id']} | {s['sentence_count']} sents | "
	            + " /// ".join(s["top_sentences"][:2])
	            for s in batch
	        )

	    def _strip_fences(raw):
	        # str.lstrip/rstrip take a character set, not a prefix, so fences are
	        # removed with an anchored pattern instead.
	        return re.sub(r"^```(?:json)?\s*|\s*```$", "", raw.strip(),
	                      flags=re.IGNORECASE).strip()

	    def label_batch(batch):
	        raw = chain.invoke({"clusters": _format_batch(batch)})
	        parsed = json.loads(_strip_fences(raw))
	        # A single object instead of an array is treated as a one-item array.
	        return [parsed] if isinstance(parsed, dict) else parsed

	    def _cluster_key(value):
	        # The model may return ids as strings; normalise to int, or None if unusable.
	        try:
	            return int(value)
	        except (TypeError, ValueError):
	            return None

	    batches = [summaries[i:i + BATCH_SIZE]
	               for i in range(0, len(summaries), BATCH_SIZE)]
	    results = [item for batch in batches for item in label_batch(batch)]

	    label_map = {}
	    for record in results:
	        if not isinstance(record, dict):
	            continue
	        key = _cluster_key(record.get("cluster_id"))
	        if key is not None:
	            label_map[key] = record

	    def _merge(summary):
	        cid = summary["cluster_id"]
	        defaults = {"label": f"Topic {cid}", "category": "Unknown",
	                    "confidence": 0.5, "reasoning": "", "is_niche": False}
	        # Summary fields, then defaults, then the model's answer; the
	        # summary's own cluster_id always wins so its type stays int.
	        return {**summary, **defaults, **label_map.get(cid, {}), "cluster_id": cid}

	    labeled = [_merge(s) for s in summaries]
	    _save(labeled, f"{run_key}_labels.json")
	    _generate_charts(run_key, labeled, name_key="label")

	    return (
	        f"✅ Labels Generated ({run_key})\n"
	        f"Topics labeled: {len(labeled)}\n"
	        f"Review table populated. Edit Approve/Rename columns, "
	        f"then click Submit Review."
	    )


	# ─── TOOL 4: CONSOLIDATE INTO THEMES ─────────────────────────────────────────

	@tool
	def consolidate_into_themes(run_key: str, theme_map: str) -> str:
	    """Merge researcher-approved topic groups into consolidated themes.
	    theme_map: JSON string — array from review table with cluster_id, approve, rename_to fields.
	    Recomputes centroids and paper counts from actual embeddings.
	    Saves {run_key}_themes.json. Phase 3 of Braun & Clarke."""
	    # The docstring above is kept verbatim: @tool uses it as the tool description.
	    #
	    # Rows whose "approve" value is YES (case-insensitive) are grouped by
	    # theme name (rename_to, else label, else "Topic <id>"); each group
	    # becomes one theme with a centroid averaged over its sentences.
	    decisions = json.loads(theme_map)
	    emb = np.load(_opath(f"{run_key}_emb.npy"))
	    paper_ids = _load(f"{run_key}_sentences.json")["paper_ids"]
	    summaries = _load(f"{run_key}_summaries.json")
	    sum_map = {s["cluster_id"]: s for s in summaries}

	    approved = [d for d in decisions if str(d.get("approve", "")).upper() == "YES"]

	    def _theme_name(decision):
	        # First usable value among rename_to / label. None, NaN and blank
	        # strings are skipped so they cannot become a theme called "nan" or "".
	        for key in ("rename_to", "label"):
	            value = decision.get(key)
	            if value is None or (isinstance(value, float) and value != value):
	                continue
	            text = str(value).strip()
	            if text:
	                return text
	        return f"Topic {decision['cluster_id']}"

	    theme_groups: dict = {}
	    for decision in approved:
	        theme_groups.setdefault(_theme_name(decision), []).append(
	            int(decision["cluster_id"])
	        )

	    def build_theme(name_cids_tuple):
	        name, cids = name_cids_tuple
	        known = [sum_map[cid] for cid in cids if cid in sum_map]
	        # Union of the member clusters' sentence indices, in a stable order.
	        # The former fallback matched paper ids against cluster ids, which
	        # selected unrelated sentences, so it has been removed.
	        use_idx = sorted({i for s in known for i in s.get("sentence_indices", [])})
	        # Last resort keeps the centroid's dimensionality when no indices exist.
	        theme_emb = emb[use_idx] if use_idx else emb[:1]
	        centroid = theme_emb.mean(axis=0)

	        total_sents = sum(s["sentence_count"] for s in known)
	        unique_papers = {paper_ids[i] for i in use_idx}
	        # Evidence sentences come from the first cluster of the group.
	        top_sents = (sum_map[cids[0]]["top_sentences"][:3]
	                     if cids and cids[0] in sum_map else [])

	        return {
	            "theme_name": name,
	            "merged_cluster_ids": cids,
	            "sentence_count": total_sents,
	            "paper_count": len(unique_papers),
	            "top_sentences": top_sents,
	            "centroid": centroid.tolist(),
	        }

	    themes = [build_theme(item) for item in theme_groups.items()]
	    themes.sort(key=lambda x: x["sentence_count"], reverse=True)
	    _save(themes, f"{run_key}_themes.json")
	    _generate_charts(run_key, themes, name_key="theme_name")

	    return (
	        f"✅ Themes Consolidated ({run_key})\n"
	        f"Approved topics: {len(approved)}\n"
	        f"Final themes: {len(themes)}\n"
	        f"Theme names: {[t['theme_name'] for t in themes]}\n"
	        f"Review consolidated themes. Click Submit Review to confirm."
	    )


	# ─── TOOL 5: COMPARE WITH TAXONOMY ───────────────────────────────────────────

	@tool
	def compare_with_taxonomy(run_key: str) -> str:
	    """Map final themes to the PAJAIS taxonomy (Jiang et al. 2019) — 25 categories.
	    Classifies each theme as MAPPED or NOVEL.
	    Saves taxonomy_map.json. Phase 5.5 of Braun & Clarke.
	    run_key must be 'abstract' or 'title'."""
	    # The docstring above is kept verbatim: @tool uses it as the tool description.
	    #
	    # Uses {run_key}_themes.json when it exists, otherwise {run_key}_labels.json.
	    themes_file = (f"{run_key}_themes.json"
	                   if os.path.exists(_opath(f"{run_key}_themes.json"))
	                   else f"{run_key}_labels.json")
	    themes_raw = _load(themes_file)
	    theme_names = [t.get("theme_name", t.get("label", "")) for t in themes_raw]
	    llm = _get_llm()

	    tax_prompt = PromptTemplate.from_template(
	        "You are a bibliometric taxonomy expert.\n"
	        "Map each theme to the PAJAIS taxonomy (Jiang et al., 2019).\n\n"
	        "PAJAIS 25 categories:\n{pajais}\n\n"
	        "Themes to classify:\n{themes}\n\n"
	        "Return ONLY a JSON array:\n"
	        '[{{"theme": "...", "pajais_match": "category or NOVEL", '
	        '"match_confidence": 0.0-1.0, "reasoning": "...", "is_novel": true/false}}]\n'
	        "If no PAJAIS category fits well, set pajais_match to NOVEL and is_novel to true.\n"
	        "No markdown fences, return raw JSON only."
	    )

	    pajais_str = "\n".join(f"{i+1}. {c}" for i, c in enumerate(PAJAIS_25))
	    themes_str = "\n".join(f"- {n}" for n in theme_names)

	    raw = (tax_prompt | llm | StrOutputParser()).invoke(
	        {"pajais": pajais_str, "themes": themes_str}
	    )
	    # str.lstrip/rstrip take a character set, not a prefix, so fences are
	    # removed with an anchored pattern instead.
	    cleaned = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw.strip(),
	                     flags=re.IGNORECASE).strip()
	    parsed = json.loads(cleaned)
	    parsed = [parsed] if isinstance(parsed, dict) else parsed
	    # Entries without a theme name cannot be keyed in the mapping; skip them.
	    results = [r for r in parsed if isinstance(r, dict) and r.get("theme")]

	    def _is_novel(record):
	        # Novel when either signal says so, in case the model sets only one.
	        flagged = bool(record.get("is_novel", False))
	        named = str(record.get("pajais_match", "")).strip().upper() == "NOVEL"
	        return flagged or named

	    mapped = [r for r in results if not _is_novel(r)]
	    novel = [r for r in results if _is_novel(r)]
	    covered = {r.get("pajais_match") for r in mapped}
	    gaps = [c for c in PAJAIS_25 if c not in covered]

	    taxonomy_map = {
	        "run_key": run_key,
	        "taxonomy_mapping": {r["theme"]: r for r in results},
	        "novel_themes": [r["theme"] for r in novel],
	        "pajais_gap_categories": gaps,
	        "coverage_stats": {
	            "total_themes": len(results),
	            "mapped": len(mapped),
	            "novel": len(novel),
	        },
	    }
	    _save(taxonomy_map, "taxonomy_map.json")

	    return (
	        f"✅ PAJAIS Taxonomy Mapped ({run_key})\n"
	        f"Themes mapped: {len(mapped)}\n"
	        f"NOVEL themes: {len(novel)} → {[r['theme'] for r in novel]}\n"
	        f"PAJAIS gaps (top 5): {gaps[:5]}\n"
	        f"taxonomy_map.json saved. Review PAJAIS mapping in table. Click Submit Review."
	    )


	# ─── TOOL 6: GENERATE COMPARISON CSV ─────────────────────────────────────────

	@tool
	def generate_comparison_csv() -> str:
	    """Load themes from both abstract and title runs and create a side-by-side comparison.
	    Identifies STABLE (convergent), ABSTRACT-ONLY, and TITLE-ONLY themes.
	    Saves comparison.csv. Phase 6 of Braun & Clarke."""
	    # The docstring above is kept verbatim: @tool uses it as the tool description.
	    #
	    # The two theme lists are laid out side by side by position, not matched
	    # row-to-row; the shorter list is padded with blanks / zeros.
	    def load_themes(key):
	        fname = (f"{key}_themes.json"
	                 if os.path.exists(_opath(f"{key}_themes.json"))
	                 else f"{key}_labels.json")
	        return _load(fname)

	    def theme_name(theme):
	        return theme.get("theme_name", theme.get("label", ""))

	    def evidence(theme):
	        # At most the first top sentence is used as evidence.
	        return " | ".join(theme.get("top_sentences", [""])[:1])

	    abs_themes = load_themes("abstract")
	    ttl_themes = load_themes("title")

	    abs_names = [theme_name(t) for t in abs_themes]
	    ttl_names = [theme_name(t) for t in ttl_themes]
	    abs_kws = [evidence(t) for t in abs_themes]
	    ttl_kws = [evidence(t) for t in ttl_themes]

	    max_len = max(len(abs_themes), len(ttl_themes))

	    def pad(values, filler):
	        return values + [filler] * (max_len - len(values))

	    # Rows with an abstract theme are judged by that theme. Rows that only
	    # carry a title theme are judged by the title theme, so a title theme
	    # that also exists in the abstract run is not mislabelled TITLE-ONLY.
	    abs_set, ttl_set = set(abs_names), set(ttl_names)
	    convergence = ["STABLE" if a in ttl_set else "ABSTRACT-ONLY" for a in abs_names]
	    convergence += ["STABLE" if t in abs_set else "TITLE-ONLY"
	                    for t in ttl_names[len(abs_names):]]

	    df = pd.DataFrame({
	        "Abstract_Theme": pad(abs_names, ""),
	        "Abstract_Evidence": pad(abs_kws, ""),
	        "Abstract_Sentences": pad([t.get("sentence_count", 0) for t in abs_themes], 0),
	        "Title_Theme": pad(ttl_names, ""),
	        "Title_Evidence": pad(ttl_kws, ""),
	        "Title_Sentences": pad([t.get("sentence_count", 0) for t in ttl_themes], 0),
	        "Convergence": convergence,
	    })
	    path = _opath("comparison.csv")
	    df.to_csv(path, index=False)
	    return (
	        f"✅ Comparison CSV Generated\n"
	        f"Abstract themes: {len(abs_themes)}\n"
	        f"Title themes: {len(ttl_themes)}\n"
	        f"Rows: {len(df)}\nFile: {path}\n"
	        f"Check Download tab for comparison.csv. Click Submit Review to generate narrative."
	    )


	# ─── TOOL 7: EXPORT NARRATIVE ─────────────────────────────────────────────────

	@tool
	def export_narrative(run_key: str) -> str:
	    """Generate a 500-word Section 7 narrative via Mistral LLM.
	    Uses themes + PAJAIS taxonomy mapping as context.
	    Saves narrative.txt. Phase 6 of Braun & Clarke.
	    run_key must be 'abstract' or 'title'."""
	    # The docstring above is kept verbatim: @tool uses it as the tool description.
	    #
	    # Context comes from corpus_config.json, taxonomy_map.json and
	    # {run_key}_themes.json (or {run_key}_labels.json when no themes file exists).
	    cfg = _load("corpus_config.json")
	    theme_file = (f"{run_key}_themes.json"
	                  if os.path.exists(_opath(f"{run_key}_themes.json"))
	                  else f"{run_key}_labels.json")
	    themes = _load(theme_file)
	    tax = _load("taxonomy_map.json")

	    theme_names = [t.get("theme_name", t.get("label", "")) for t in themes]
	    novel_themes = tax.get("novel_themes", [])
	    gaps = tax.get("pajais_gap_categories", [])
	    mapped = tax.get("coverage_stats", {}).get("mapped", 0)

	    llm = _get_llm()
	    # The run name is injected so the prompt names the run the themes
	    # actually came from, not always "abstract".
	    narr_prompt = PromptTemplate.from_template(
	        "Write a 500-word Section 7 for a conference paper on topic modelling.\n"
	        "Journal: {journal} | Papers: {papers} | Years: {y_min}–{y_max}\n"
	        "Stable BERTopic themes ({run_key} run): {themes}\n"
	        "NOVEL themes (not in PAJAIS 2019): {novel}\n"
	        "PAJAIS gap categories: {gaps}\n"
	        "Themes mapped to PAJAIS: {mapped}\n\n"
	        "Structure: 7.1 Methodology (LDA + BERTopic, Braun & Clarke 2006), "
	        "7.2 RQ4 LDA Findings, 7.3 RQ5 Abstract vs Title Comparison, "
	        "7.4 RQ6 PAJAIS Taxonomy Mapping with NOVEL theme justification, "
	        "7.5 RQ7 Future Research Agenda.\n"
	        "Cite: Braun & Clarke (2006), Jiang et al. (2019), Grootendorst (2022).\n"
	        "~500 words, academic tone, no bullet points, paragraph form."
	    )
	    # The .get() fallbacks only apply when corpus_config.json lacks a key.
	    narrative = (narr_prompt | llm | StrOutputParser()).invoke({
	        "journal": cfg.get("journal", "Electronic Markets"),
	        "papers": cfg.get("rows", 908),
	        "y_min": cfg.get("year_min", 2007),
	        "y_max": cfg.get("year_max", 2026),
	        "run_key": run_key,
	        "themes": ", ".join(theme_names[:10]),
	        "novel": ", ".join(novel_themes[:5]),
	        "gaps": ", ".join(gaps[:5]),
	        "mapped": mapped,
	    })
	    path = _opath("narrative.txt")
	    with open(path, "w", encoding="utf-8") as f:
	        f.write(narrative)
	    return (
	        f"✅ Narrative Exported\n"
	        f"Words: {len(narrative.split())}\n"
	        f"File: {path}\n"
	        f"🎉 Pipeline complete! Download narrative.txt from the Download tab.\n"
	        f"Deliverables: comparison.csv | taxonomy_map.json | narrative.txt"
	    )

	# --- Enable handle_tool_error on every tool ---
	# The flag is assigned on each tool object after creation instead of being
	# passed to the @tool decorator.
	_ALL_TOOLS = [
	    load_scopus_csv,
	    run_bertopic_discovery,
	    label_topics_with_llm,
	    consolidate_into_themes,
	    compare_with_taxonomy,
	    generate_comparison_csv,
	    export_narrative,
	]
	for _registered_tool in _ALL_TOOLS:
	    setattr(_registered_tool, "handle_tool_error", True)