Spaces:

anujjuna
/

THEMIS-BERTopic

Runtime error

App Files Files Community

THEMIS-BERTopic / tools.py

anujjuna

Update tools.py

b41e337 verified about 1 month ago

raw

history blame contribute delete

30.5 kB

	"""
	tools.py — 7 @tool functions for BERTopic Agentic Thematic Analysis
	Generated for: Braun & Clarke (2006) 6-Phase Framework Pipeline
	"""

	import json
	import os
	import re
	import numpy as np
	import pandas as pd
	import plotly.graph_objects as go
	import plotly.express as px
	from plotly.subplots import make_subplots

	from langchain_core.tools import tool
	from langchain_core.prompts import PromptTemplate
	from langchain_core.output_parsers import JsonOutputParser
	from langchain_mistralai import ChatMistralAI

	from sentence_transformers import SentenceTransformer
	from sklearn.cluster import AgglomerativeClustering
	from sklearn.metrics.pairwise import cosine_similarity
	from bertopic import BERTopic

	import nltk
	nltk.download('punkt', quiet=True)
	nltk.download('punkt_tab', quiet=True)
	from nltk.tokenize import sent_tokenize

	# ── Constants ──────────────────────────────────────────────────────────────────
	# Model name handed to SentenceTransformer() in _embed().
	EMBED_MODEL = "all-MiniLM-L6-v2"
	# Directory that holds every checkpoint file (CSV, JSON, .npy, .txt) written by the tools.
	CHECKPOINT_DIR = "checkpoints"
	# How many sentences closest to a topic/theme centroid are kept as evidence.
	NEAREST_K = 5
	# Upper bound on the number of topics sent to the LLM by label_topics_with_llm().
	MAX_LABEL_TOPICS = 100
	# Import-time side effect: make sure the checkpoint directory exists.
	os.makedirs(CHECKPOINT_DIR, exist_ok=True)

	# Reference taxonomy (25 category names). compare_with_taxonomy() renders it as
	# a numbered list inside the LLM prompt and asks the model to answer with one of
	# these exact strings or "NOVEL".
	PAJAIS_CATEGORIES = [
	"Artificial Intelligence & Machine Learning",
	"Big Data & Analytics",
	"Blockchain & Distributed Ledger",
	"Business Intelligence & Decision Support",
	"Cloud Computing & Infrastructure",
	"Cybersecurity & Privacy",
	"Digital Transformation & Strategy",
	"E-Commerce & Digital Markets",
	"Enterprise Systems & ERP",
	"Ethics & Governance in IS",
	"Health Informatics & eHealth",
	"Human-Computer Interaction & UX",
	"Information Systems Theory & Foundations",
	"Internet of Things & Cyber-Physical Systems",
	"IS Education & Curriculum",
	"IS in Developing Countries",
	"IS Project Management & Implementation",
	"Knowledge Management & Organizational Learning",
	"Mobile & Ubiquitous Computing",
	"Natural Language Processing & Text Mining",
	"Open Source & Collaborative Systems",
	"Platforms & Ecosystems",
	"Social Media & Online Communities",
	"Supply Chain & Logistics IS",
	"Virtual Reality & Immersive Technologies",
	]

	# Regular expressions for publisher boilerplate and filler phrases.
	# _clean_text() applies them in list order, case-insensitively, replacing every
	# match with a single space; the final pattern then collapses the whitespace
	# runs this leaves behind, so it must stay last.
	#
	# Alternations use a plain "|". An escaped "\|" inside a regex matches a literal
	# pipe character, which would make every alternative group below dead.
	BOILERPLATE_PATTERNS = [
	    # Copyright line, e.g. "© 2020 Elsevier Ltd." or "©2021 The Authors. Published by Wiley."
	    r"©\s*\d{4}.*?(elsevier|springer|wiley|taylor|emerald|sage|ieee|acm|informs).*?\.",
	    r"all rights reserved\.?",
	    r"published by.*?\.",
	    r"doi:\s*\S+",
	    r"http[s]?://\S+",
	    r"this article is protected by copyright.*?\.",
	    r"please cite this article.*?\.",
	    r"accepted manuscript.*?\.",
	    r"preprint.*?\.",
	    r"peer.reviewed.*?\.",
	    # Submission history, e.g. "Received: 12 March 2020 Accepted: 5 May 2020."
	    r"received:\s*\d+.*?accepted:\s*\d+.*?\.",
	    r"keywords:.*?\.",
	    r"jel classification.*?\.",
	    # Section headings that leak into exported abstracts.
	    r"abstract[-–—]?\s*",
	    r"introduction[-–—]?\s*$",
	    # Formulaic framing phrases that carry no topical signal.
	    r"in this (paper|study|article|research).*?we (propose|present|examine|investigate|explore)",
	    r"the purpose of this (paper|study|article)",
	    r"this (paper|study|article) (aims|seeks|investigates|examines|explores|presents)",
	    r"we (propose|present|examine|investigate|explore)",
	    # Discourse connectives.
	    r"\b(furthermore|moreover|however|nevertheless|therefore|thus|hence)\b",
	    # Text that consists only of a number (e.g. a stray page number).
	    r"^\s*\d+\s*$",
	    # Collapse whitespace runs left by the substitutions above.
	    r"\s{2,}",
	]

	# ── Helpers ────────────────────────────────────────────────────────────────────
	# In-process cache of the loaded Scopus dataframe, stored under the key "df"
	# (written by load_scopus_csv() and _load_df()).
	_df_cache: dict = {}
	# In-process cache of sentence embeddings, keyed "<run_key>_emb" by _embed().
	_embeddings_cache: dict = {}


	def _get_llm():
	    """Build the Mistral chat client used by the LLM-backed tools.

	    The API key is read from the MISTRAL_API_KEY environment variable; an
	    empty string is passed when the variable is not set.
	    """
	    mistral_key = os.environ.get("MISTRAL_API_KEY", "")
	    client_options = {
	        "model": "mistral-large-latest",
	        "temperature": 0.1,
	        "api_key": mistral_key,
	    }
	    return ChatMistralAI(**client_options)


	def _clean_text(text: str) -> str:
	    """Remove boilerplate from *text* and return the trimmed result.

	    Each regex in BOILERPLATE_PATTERNS is applied in order, ignoring case,
	    and every match is replaced by a single space. Non-string input yields
	    an empty string.
	    """
	    if isinstance(text, str):
	        scrubbed = text
	        for pattern in BOILERPLATE_PATTERNS:
	            scrubbed = re.sub(pattern, " ", scrubbed, flags=re.IGNORECASE)
	        return scrubbed.strip()
	    return ""


	def _load_df() -> pd.DataFrame:
	    """Return the Scopus dataframe, from memory if possible, else from the checkpoint CSV.

	    Raises:
	        FileNotFoundError: if nothing is cached and no checkpoint CSV exists.
	    """
	    try:
	        return _df_cache["df"]
	    except KeyError:
	        pass

	    csv_path = os.path.join(CHECKPOINT_DIR, "scopus_data.csv")
	    if not os.path.exists(csv_path):
	        raise FileNotFoundError("No CSV loaded. Please upload your Scopus CSV first.")

	    frame = pd.read_csv(csv_path)
	    _df_cache["df"] = frame
	    return frame


	def _get_sentences(run_key: str) -> list[str]:
	    """Collect the cleaned text units that a run operates on.

	    For run_key 'abstract' (case-insensitive) each abstract is cleaned, split
	    into sentences, and sentences longer than 30 characters are kept. For any
	    other run_key each cleaned cell is kept whole when it is longer than 10
	    characters; the column is "Title" for 'title' and "Abstract" otherwise.
	    """
	    df = _load_df()
	    key = run_key.lower()
	    column = {"abstract": "Abstract", "title": "Title"}.get(key, "Abstract")
	    split_into_sentences = key == "abstract"

	    collected = []
	    for raw in df[column].dropna():
	        cleaned = _clean_text(str(raw))
	        if split_into_sentences:
	            pieces = (piece.strip() for piece in sent_tokenize(cleaned))
	            collected.extend(piece for piece in pieces if len(piece) > 30)
	            continue
	        whole = cleaned.strip()
	        if len(whole) > 10:
	            collected.append(whole)
	    return collected


	def _embed(sentences: list[str], run_key: str) -> np.ndarray:
	    """Return one embedding row per sentence, reusing cached vectors when valid.

	    Lookup order is the in-memory cache, then ``<run_key>_emb.npy`` in the
	    checkpoint directory, then a fresh SentenceTransformer encode (which is
	    saved to both).

	    Cached arrays are keyed only by run_key, so a cache entry may belong to a
	    previously loaded CSV. An entry is reused only if it is 2-D and its row
	    count equals ``len(sentences)``; otherwise it is recomputed and
	    overwritten, which prevents index errors and clustering on the wrong
	    corpus.

	    Args:
	        sentences: Texts to embed, in the order their rows are returned.
	        run_key: Run identifier used to name the cache entry and .npy file.
	    """
	    cache_key = f"{run_key}_emb"
	    emb_path = os.path.join(CHECKPOINT_DIR, f"{run_key}_emb.npy")
	    expected_rows = len(sentences)

	    def _fits(candidate) -> bool:
	        # Row-count match is the cheapest available staleness check.
	        return (
	            isinstance(candidate, np.ndarray)
	            and candidate.ndim == 2
	            and candidate.shape[0] == expected_rows
	        )

	    cached = _embeddings_cache.get(cache_key)
	    if _fits(cached):
	        return cached

	    if os.path.exists(emb_path):
	        on_disk = np.load(emb_path)
	        if _fits(on_disk):
	            _embeddings_cache[cache_key] = on_disk
	            return on_disk

	    model = SentenceTransformer(EMBED_MODEL)
	    emb = model.encode(sentences, normalize_embeddings=True, show_progress_bar=False)
	    np.save(emb_path, emb)
	    _embeddings_cache[cache_key] = emb
	    return emb


	def _save_json(data, filename: str):
	    """Serialize *data* as indented UTF-8 JSON to *filename* inside CHECKPOINT_DIR."""
	    target = os.path.join(CHECKPOINT_DIR, filename)
	    with open(target, "w", encoding="utf-8") as handle:
	        json.dump(data, handle, indent=2, ensure_ascii=False)


	def _load_json(filename: str):
	    """Read *filename* from CHECKPOINT_DIR and return the parsed JSON, or None if absent."""
	    source = os.path.join(CHECKPOINT_DIR, filename)
	    if os.path.exists(source):
	        with open(source, "r", encoding="utf-8") as handle:
	            return json.load(handle)
	    return None


	# ── Tool 1: Load CSV ───────────────────────────────────────────────────────────
	@tool
	def load_scopus_csv(filepath: str) -> str:
	    """
	    Load a Scopus CSV export and prepare it for topic modelling.
	    Returns paper count, abstract sentence count, and title sentence count.
	    Saves the cleaned dataframe as a checkpoint for subsequent tools.

	    Args:
	        filepath: Path to the uploaded Scopus CSV file.
	    """
	    df = pd.read_csv(filepath)

	    # Normalize column names: trim whitespace (headers are coerced to str so a
	    # numeric header cannot break .strip()), then rename case-insensitive
	    # variants of the expected Scopus headers to their canonical spelling.
	    df.columns = [str(c).strip() for c in df.columns]
	    expected = ["Authors", "Title", "Abstract", "Author Keywords", "Cited by", "Source title", "Year"]
	    first_by_lower = {}
	    for existing in df.columns:
	        first_by_lower.setdefault(existing.lower(), existing)
	    renames = {
	        first_by_lower[col.lower()]: col
	        for col in expected
	        if col not in df.columns and col.lower() in first_by_lower
	    }
	    if renames:
	        df = df.rename(columns=renames)

	    # Save checkpoint and prime the in-memory cache used by _load_df().
	    save_path = os.path.join(CHECKPOINT_DIR, "scopus_data.csv")
	    df.to_csv(save_path, index=False)
	    _df_cache["df"] = df

	    n_papers = len(df)

	    # Count abstract sentences with the same helper the clustering step uses,
	    # so the number reported here matches what is later processed. A missing
	    # column counts as zero instead of raising KeyError.
	    abstract_sents = len(_get_sentences("abstract")) if "Abstract" in df.columns else 0

	    # Titles are counted raw (one per non-null cell).
	    title_sents = int(df["Title"].notna().sum()) if "Title" in df.columns else 0

	    # Year range: coerce to numbers and ignore blanks so an empty or
	    # non-numeric Year column yields "N/A" rather than crashing on int(NaN).
	    year_range = "N/A"
	    if "Year" in df.columns:
	        years = pd.to_numeric(df["Year"], errors="coerce").dropna()
	        if not years.empty:
	            year_range = f"{int(years.min())} – {int(years.max())}"

	    # Save summaries checkpoint (Phase 1 marker)
	    summary_data = {
	        "n_papers": n_papers,
	        "abstract_sentences": abstract_sents,
	        "title_sentences": title_sents,
	        "columns": list(df.columns),
	        "year_range": year_range,
	    }
	    _save_json(summary_data, "summaries.json")

	    return (
	        f"✅ CSV loaded successfully!\n\n"
	        f"📄 Papers: {n_papers:,}\n"
	        f"📝 Abstract sentences (after cleaning): {abstract_sents:,}\n"
	        f"🏷️ Title sentences: {title_sents:,}\n"
	        f"📅 Year range: {summary_data['year_range']}\n"
	        f"📊 Columns detected: {', '.join(df.columns.tolist())}\n\n"
	        f"Phase 1 (Familiarisation) complete. Type 'run abstract' to begin Phase 2."
	    )


	# ── Tool 2: Run BERTopic ───────────────────────────────────────────────────────
	def _build_discovery_charts(run_key, topic_summaries, topics, emb, n_clusters):
	    """Render the four discovery charts and return them as embeddable HTML snippets.

	    Args:
	        run_key: Run name, used in chart titles.
	        topic_summaries: Topic dicts sorted by size (largest first); must be non-empty.
	        topics: Mapping of topic id to the list of sentence row indices in *emb*.
	        emb: Embedding matrix with one row per sentence.
	        n_clusters: Total number of topics, shown in the map title.
	    """
	    from sklearn.decomposition import PCA

	    def centroid_of(topic_id):
	        return emb[topics[topic_id]].mean(axis=0)

	    # Chart 1: Intertopic distance map (PCA projection of the 50 largest topics).
	    map_topics = topic_summaries[:50]
	    ids = [t["topic_id"] for t in map_topics]
	    sizes = [t["count"] for t in map_topics]
	    centroids = np.array([centroid_of(i) for i in ids])
	    if len(centroids) >= 2:
	        coords = PCA(n_components=2).fit_transform(centroids)
	    else:
	        # PCA cannot extract 2 components from a single sample; plot it at the origin.
	        coords = np.zeros((len(centroids), 2))

	    fig1 = go.Figure()
	    fig1.add_trace(go.Scatter(
	        x=coords[:, 0], y=coords[:, 1],
	        mode="markers+text",
	        marker=dict(size=[max(10, s / 2) for s in sizes], color=sizes,
	                    colorscale="Viridis", showscale=True,
	                    colorbar=dict(title="Sentences")),
	        text=[f"T{i}" for i in ids],
	        textposition="top center",
	        hovertext=[f"Topic {i}<br>{s} sentences" for i, s in zip(ids, sizes)],
	    ))
	    fig1.update_layout(title=f"Intertopic Distance Map — {run_key.title()} ({n_clusters} topics)",
	                       template="plotly_dark", height=500,
	                       xaxis_title="PC1", yaxis_title="PC2")

	    # Chart 2: Topic size bar chart (30 largest topics).
	    top_n = topic_summaries[:30]
	    fig2 = px.bar(
	        x=[t["count"] for t in top_n],
	        y=[f"Topic {t['topic_id']}" for t in top_n],
	        orientation="h",
	        color=[t["count"] for t in top_n],
	        color_continuous_scale="Plasma",
	        title=f"Top 30 Topics by Size — {run_key.title()}",
	        labels={"x": "Sentence Count", "y": "Topic"},
	    )
	    fig2.update_layout(template="plotly_dark", height=600)

	    # Chart 3: Centroid similarity heatmap. Slicing (rather than assuming 20
	    # topics exist) keeps this working when fewer topics were discovered.
	    heat_topics = topic_summaries[:20]
	    heat_labels = [f"T{t['topic_id']}" for t in heat_topics]
	    heat_centroids = np.array([centroid_of(t["topic_id"]) for t in heat_topics])
	    sim_matrix = cosine_similarity(heat_centroids)

	    fig3 = go.Figure(go.Heatmap(
	        z=sim_matrix,
	        x=heat_labels,
	        y=heat_labels,
	        colorscale="RdBu", zmin=0, zmax=1,
	    ))
	    fig3.update_layout(title=f"Topic Similarity Heatmap (Top 20) — {run_key.title()}",
	                       template="plotly_dark", height=500)

	    # Chart 4: Distribution of topic sizes.
	    fig4 = px.histogram(
	        x=[t["count"] for t in topic_summaries],
	        nbins=30, title=f"Topic Size Distribution — {run_key.title()}",
	        labels={"x": "Sentences per Topic", "y": "Number of Topics"},
	        color_discrete_sequence=["#7C3AED"],
	    )
	    fig4.update_layout(template="plotly_dark", height=400)

	    return {
	        "intertopic": fig1.to_html(include_plotlyjs="cdn", full_html=False),
	        "bars": fig2.to_html(include_plotlyjs="cdn", full_html=False),
	        "heatmap": fig3.to_html(include_plotlyjs="cdn", full_html=False),
	        "distribution": fig4.to_html(include_plotlyjs="cdn", full_html=False),
	    }


	@tool
	def run_bertopic_discovery(run_key: str, threshold: float = 0.7) -> str:
	    """
	    Run BERTopic clustering on either abstracts or titles.
	    Uses SentenceTransformer embeddings in 384-dimensional space with
	    AgglomerativeClustering (cosine metric, average linkage). No UMAP reduction.
	    Generates 4 interactive Plotly charts. Saves summaries.json and emb.npy.

	    Args:
	        run_key: Either 'abstract' or 'title'
	        threshold: AgglomerativeClustering distance threshold (default 0.7)
	    """
	    sentences = _get_sentences(run_key)
	    if not sentences:
	        return f"No sentences found for run_key='{run_key}'. Check your CSV."

	    # Embed
	    emb = _embed(sentences, run_key)

	    # Cluster in 384d (no UMAP). Hierarchical clustering needs at least two
	    # samples, so a single sentence becomes its own topic.
	    if len(sentences) < 2:
	        labels = np.zeros(len(sentences), dtype=int)
	    else:
	        clustering = AgglomerativeClustering(
	            metric="cosine",
	            linkage="average",
	            distance_threshold=threshold,
	            n_clusters=None,
	        )
	        labels = clustering.fit_predict(emb)

	    # Group sentence row indices by topic id (-1 would mean noise and is skipped).
	    topics = {}
	    for idx, label in enumerate(labels):
	        if label == -1:
	            continue
	        topics.setdefault(int(label), []).append(idx)
	    n_clusters = len(topics)

	    # For every topic keep the NEAREST_K sentences closest to its centroid.
	    topic_summaries = []
	    for topic_id, sent_indices in sorted(topics.items()):
	        topic_embs = emb[sent_indices]
	        centroid = topic_embs.mean(axis=0, keepdims=True)
	        sims = cosine_similarity(centroid, topic_embs)[0]
	        top_k = np.argsort(sims)[::-1][:NEAREST_K]
	        topic_summaries.append({
	            "topic_id": int(topic_id),
	            "count": len(sent_indices),
	            "top_sentences": [sentences[sent_indices[i]] for i in top_k],
	            "label": None,
	            "category": None,
	            "confidence": None,
	            "reasoning": None,
	            "niche": None,
	            "approve": "",
	            "rename_to": "",
	            "user_reasoning": "",
	        })

	    # Sort by size descending
	    topic_summaries.sort(key=lambda x: x["count"], reverse=True)

	    # Save checkpoints. The threshold is recorded so the clustering can be
	    # reproduced from the checkpoint alone.
	    checkpoint = {"run_key": run_key, "threshold": threshold, "topics": topic_summaries}
	    _save_json(checkpoint, f"{run_key}_summaries.json")

	    # summaries.json may already hold dataset-level metadata (e.g. n_papers)
	    # written by load_scopus_csv; carry those keys over instead of erasing them.
	    try:
	        previous = _load_json("summaries.json")
	    except json.JSONDecodeError:
	        previous = None
	    carried = (
	        {k: v for k, v in previous.items() if k not in checkpoint}
	        if isinstance(previous, dict) else {}
	    )
	    _save_json({**carried, **checkpoint}, "summaries.json")

	    # Generate and save the Plotly charts.
	    charts = _build_discovery_charts(run_key, topic_summaries, topics, emb, n_clusters)
	    _save_json(charts, f"{run_key}_charts.json")

	    return (
	        f"✅ BERTopic clustering complete for {run_key}!\n\n"
	        f"🔢 Topics discovered: {n_clusters}\n"
	        f"📊 Sentences processed: {len(sentences):,}\n"
	        f"📐 Embedding dimensions: 384 (no UMAP reduction)\n"
	        f"📏 Distance threshold: {threshold}\n\n"
	        f"4 interactive charts saved. Calling label_topics_with_llm next..."
	    )


	# ── Tool 3: Label Topics ───────────────────────────────────────────────────────
	@tool
	def label_topics_with_llm(run_key: str) -> str:
	    """
	    Send top topics to Mistral LLM for labeling with research area names,
	    categories, confidence scores, reasoning, and niche flag.
	    Saves labels.json checkpoint.

	    Args:
	        run_key: Either 'abstract' or 'title'
	    """
	    data = _load_json(f"{run_key}_summaries.json") or _load_json("summaries.json")
	    # summaries.json can exist without a "topics" list (it is also written when
	    # the CSV is loaded), so check for the list itself.
	    if not data or not data.get("topics"):
	        return "No topic summaries found. Run run_bertopic_discovery first."

	    topics = data["topics"][:MAX_LABEL_TOPICS]
	    llm = _get_llm()

	    # Build prompt payload: up to three evidence sentences per topic, capped at
	    # 300 characters to bound the prompt size.
	    topic_texts = []
	    for t in topics:
	        sents = " | ".join(t["top_sentences"][:3])
	        topic_texts.append(f"Topic {t['topic_id']} ({t['count']} sentences): {sents[:300]}")

	    prompt_template = PromptTemplate.from_template(
	        """You are an expert academic researcher specializing in Information Systems and Computer Science.

	Analyze these research topics extracted from journal abstracts/titles and label each one.

	Topics:
	{topics}

	For each topic, respond with a JSON array. Each element must have:
	- topic_id: integer
	- label: concise research area name (3-7 words)
	- category: broad category (e.g., "AI & ML", "HCI", "Security", "Data Management")
	- confidence: float 0.0-1.0
	- reasoning: one sentence explaining the label
	- niche: boolean (true if highly specialized/narrow)

	Respond ONLY with a valid JSON array. No markdown, no preamble, no explanation."""
	    )

	    parser = JsonOutputParser()
	    chain = prompt_template | llm | parser

	    result = chain.invoke({"topics": "\n".join(topic_texts)})

	    # Tolerate an object wrapping the array, and items whose topic_id is a
	    # numeric string or missing; unusable items are skipped, not fatal.
	    if isinstance(result, dict):
	        result = next((v for v in result.values() if isinstance(v, list)), [result])
	    label_map = {}
	    for item in result or []:
	        if not isinstance(item, dict):
	            continue
	        try:
	            label_map[int(item["topic_id"])] = item
	        except (KeyError, TypeError, ValueError):
	            continue

	    # Merge labels back into all topics; topics the LLM did not label get
	    # placeholder values.
	    labeled_topics = []
	    for t in data["topics"]:
	        lbl = label_map.get(t["topic_id"], {})
	        labeled_topics.append({
	            **t,
	            "label": lbl.get("label", f"Topic {t['topic_id']}"),
	            "category": lbl.get("category", "Uncategorized"),
	            "confidence": lbl.get("confidence", 0.5),
	            "reasoning": lbl.get("reasoning", ""),
	            "niche": lbl.get("niche", False),
	        })

	    _save_json({"run_key": run_key, "topics": labeled_topics}, f"{run_key}_labels.json")
	    _save_json({"run_key": run_key, "topics": labeled_topics}, "labels.json")

	    # Count only topics the LLM actually labeled; placeholders do not count.
	    labeled_count = sum(1 for t in labeled_topics if t["topic_id"] in label_map)
	    return (
	        f"✅ Topics labeled by LLM!\n\n"
	        f"🏷️ Topics labeled: {labeled_count}\n"
	        f"📋 Review the table below — check labels, approve or rename topics.\n\n"
	        f"Phase 2 complete. Review the table, edit Approve/Rename columns, then click Submit Review."
	    )


	# ── Tool 4: Consolidate Themes ─────────────────────────────────────────────────
	@tool
	def consolidate_into_themes(run_key: str, theme_map: str) -> str:
	    """
	    Consolidate approved topics into researcher-defined themes.
	    Recomputes centroids for merged theme groups.
	    Saves themes.json checkpoint.

	    Args:
	        run_key: Either 'abstract' or 'title'
	        theme_map: JSON string mapping theme names to lists of topic IDs.
	            Example: '{"AI in Healthcare": [0, 3, 7], "Blockchain": [1, 5]}'
	    """
	    data = _load_json(f"{run_key}_labels.json") or _load_json("labels.json")
	    if not data:
	        return "No labeled topics found. Run label_topics_with_llm first."

	    try:
	        groupings = json.loads(theme_map) if isinstance(theme_map, str) else theme_map
	    except json.JSONDecodeError as e:
	        return f"Invalid theme_map JSON: {e}"
	    if not isinstance(groupings, dict):
	        return "Invalid theme_map JSON: expected an object mapping theme names to lists of topic IDs."

	    sentences = _get_sentences(run_key)
	    emb = _embed(sentences, run_key)

	    # Sentence-to-topic assignments are not stored in the checkpoints, so they
	    # are rebuilt by clustering the same embeddings again. The threshold is
	    # taken from the summaries checkpoint when it records one, else 0.7.
	    summaries = _load_json(f"{run_key}_summaries.json") or _load_json("summaries.json") or {}
	    threshold = summaries.get("threshold", 0.7)
	    if len(sentences) < 2:
	        # Hierarchical clustering needs at least two samples.
	        labels_arr = np.zeros(len(sentences), dtype=int)
	    else:
	        clustering = AgglomerativeClustering(
	            metric="cosine", linkage="average",
	            distance_threshold=threshold, n_clusters=None
	        )
	        labels_arr = clustering.fit_predict(emb)
	    all_topic_indices: dict = {}
	    for idx, lbl in enumerate(labels_arr):
	        all_topic_indices.setdefault(int(lbl), []).append(idx)

	    # NOTE(review): this is the corpus-wide paper count, reported for every
	    # theme as an approximation; there is no per-theme paper attribution here.
	    n_papers = len(_load_df())

	    themes = []
	    used_ids = set()
	    for theme_name, topic_ids in groupings.items():
	        # Accept a single id or a list, and ids given as numeric strings
	        # (easy to produce when the map is hand-written JSON); drop duplicates.
	        raw_ids = topic_ids if isinstance(topic_ids, (list, tuple)) else [topic_ids]
	        clean_ids = []
	        for tid in raw_ids:
	            try:
	                tid = int(tid)
	            except (TypeError, ValueError):
	                continue
	            if tid not in clean_ids:
	                clean_ids.append(tid)

	        merged_sentence_indices = []
	        for tid in clean_ids:
	            merged_sentence_indices.extend(all_topic_indices.get(tid, []))
	            used_ids.add(tid)

	        if not merged_sentence_indices:
	            continue

	        # Evidence: the NEAREST_K sentences closest to the merged centroid.
	        theme_embs = emb[merged_sentence_indices]
	        centroid = theme_embs.mean(axis=0, keepdims=True)
	        sims = cosine_similarity(centroid, theme_embs)[0]
	        top_k = np.argsort(sims)[::-1][:NEAREST_K]
	        top_sents = [sentences[merged_sentence_indices[i]] for i in top_k]

	        themes.append({
	            "theme_name": theme_name,
	            "topic_ids": clean_ids,
	            "sentence_count": len(merged_sentence_indices),
	            "paper_count": n_papers,
	            "top_sentences": top_sents,
	            "approve": "",
	            "rename_to": "",
	            "user_reasoning": "",
	            "pajais_match": None,
	            "is_novel": None,
	        })

	    # Topics not assigned to any theme are gathered into one catch-all theme.
	    uncategorized = [tid for tid in all_topic_indices if tid not in used_ids]
	    if uncategorized:
	        merged = []
	        for tid in uncategorized:
	            merged.extend(all_topic_indices[tid])
	        if merged:
	            themes.append({
	                "theme_name": "Uncategorized",
	                "topic_ids": uncategorized,
	                "sentence_count": len(merged),
	                "paper_count": 0,
	                "top_sentences": [sentences[i] for i in merged[:3]],
	                "approve": "",
	                "rename_to": "",
	                "user_reasoning": "",
	                "pajais_match": None,
	                "is_novel": None,
	            })

	    _save_json({"run_key": run_key, "themes": themes}, f"{run_key}_themes.json")
	    _save_json({"run_key": run_key, "themes": themes}, "themes.json")

	    return (
	        f"✅ Themes consolidated!\n\n"
	        f"🗂️ Themes created: {len(themes)}\n"
	        f"📊 Total sentences covered: {sum(t['sentence_count'] for t in themes):,}\n\n"
	        f"Phase 3 complete. Review consolidated themes in the table. Click Submit Review."
	    )


	# ── Tool 5: Compare with PAJAIS Taxonomy ──────────────────────────────────────
	@tool
	def compare_with_taxonomy(run_key: str) -> str:
	    """
	    Map final themes to PAJAIS 25-category taxonomy using Mistral LLM.
	    Identifies NOVEL themes not covered by existing taxonomy.
	    Saves taxonomy_map.json checkpoint.

	    Args:
	        run_key: Either 'abstract' or 'title'
	    """
	    data = _load_json(f"{run_key}_themes.json") or _load_json("themes.json")
	    if not data:
	        return "No themes found. Run consolidate_into_themes first."

	    themes = data.get("themes", [])
	    llm = _get_llm()

	    # Two evidence sentences per theme, capped at 250 characters.
	    theme_descriptions = []
	    for t in themes:
	        sents = " | ".join(t["top_sentences"][:2])
	        theme_descriptions.append(
	            f"Theme: {t['theme_name']}\nEvidence: {sents[:250]}"
	        )

	    prompt_template = PromptTemplate.from_template(
	        """You are an expert in Information Systems research taxonomy.

	Map each research theme to the PAJAIS (Pacific Asia Journal of the Association for Information Systems) taxonomy categories, or flag as NOVEL if no match exists.

	PAJAIS Categories:
	{categories}

	Themes to map:
	{themes}

	For each theme, respond with a JSON array. Each element must have:
	- theme_name: string (exact match from input)
	- pajais_match: string (exact PAJAIS category name, or "NOVEL")
	- match_confidence: float 0.0-1.0
	- reasoning: one sentence justification
	- is_novel: boolean (true if NOVEL)
	- evidence_summary: brief description of what the theme covers

	Respond ONLY with valid JSON array. No markdown."""
	    )

	    parser = JsonOutputParser()
	    chain = prompt_template | llm | parser

	    result = chain.invoke({
	        "categories": "\n".join(f"{i+1}. {c}" for i, c in enumerate(PAJAIS_CATEGORIES)),
	        "themes": "\n\n".join(theme_descriptions),
	    })

	    # Tolerate an object wrapping the array and skip items without a usable
	    # theme_name rather than failing the whole mapping.
	    if isinstance(result, dict):
	        result = next((v for v in result.values() if isinstance(v, list)), [result])
	    result_map = {
	        item["theme_name"]: item
	        for item in result or []
	        if isinstance(item, dict) and isinstance(item.get("theme_name"), str)
	    }

	    # Merge results; a theme the LLM did not return is treated as NOVEL.
	    taxonomy_themes = []
	    for t in themes:
	        mapping = result_map.get(t["theme_name"], {})
	        match = mapping.get("pajais_match", "NOVEL")
	        reasoning = mapping.get("reasoning", "")
	        taxonomy_themes.append({
	            **t,
	            "pajais_match": match,
	            "match_confidence": mapping.get("match_confidence", 0.0),
	            "reasoning": reasoning,
	            # If the model omitted the flag, derive it from the match value.
	            "is_novel": mapping.get("is_novel", match == "NOVEL"),
	            "evidence_summary": mapping.get("evidence_summary", ""),
	            # The first entry is a display line carrying the mapping, followed
	            # by two of the original evidence sentences.
	            "top_sentences": [
	                f"→ {match} | {reasoning}"
	            ] + t.get("top_sentences", [])[:2],
	        })

	    novel_count = len([t for t in taxonomy_themes if t.get("is_novel")])
	    mapped_count = len(taxonomy_themes) - novel_count

	    _save_json({"run_key": run_key, "themes": taxonomy_themes}, f"{run_key}_taxonomy_map.json")
	    _save_json({"run_key": run_key, "themes": taxonomy_themes}, "taxonomy_map.json")

	    return (
	        f"✅ PAJAIS taxonomy mapping complete!\n\n"
	        f"✅ MAPPED themes: {mapped_count}\n"
	        f"🆕 NOVEL themes: {novel_count}\n\n"
	        f"**Phase 5.5 complete. Review PAJAIS mapping in the table (Top Evidence column shows → PAJAIS match). "
	        f"Click Submit Review.**"
	    )


	# ── Tool 6: Generate Comparison CSV ───────────────────────────────────────────
	def _load_theme_index(prefix, allow_generic_fallback=False):
	    """Return {theme_name: theme} for one run, or None when no themes file exists.

	    Themes come from ``<prefix>_themes.json`` (optionally falling back to
	    ``themes.json``). The consolidation step stores pajais_match / is_novel as
	    None, so when ``<prefix>_taxonomy_map.json`` exists its values for those
	    two fields are overlaid onto themes with the same name.
	    """
	    themes_data = _load_json(f"{prefix}_themes.json")
	    if not themes_data and allow_generic_fallback:
	        themes_data = _load_json("themes.json")
	    if not themes_data:
	        return None

	    index = {t["theme_name"]: dict(t) for t in themes_data.get("themes", [])}
	    taxonomy = _load_json(f"{prefix}_taxonomy_map.json") or {}
	    for mapped in taxonomy.get("themes", []):
	        target = index.get(mapped.get("theme_name"))
	        if target is None:
	            continue
	        for field in ("pajais_match", "is_novel"):
	            if mapped.get(field) is not None:
	                target[field] = mapped[field]
	    return index


	@tool
	def generate_comparison_csv() -> str:
	    """
	    Compare abstract themes vs title themes side-by-side.
	    Creates a convergence/divergence analysis CSV.
	    Saves comparison.csv checkpoint.
	    """
	    abstract_themes = _load_theme_index("abstract", allow_generic_fallback=True)
	    title_themes = _load_theme_index("title")

	    # Compare against None: an existing file with an empty theme list is not
	    # the same as a missing analysis.
	    if abstract_themes is None:
	        return "Abstract themes not found. Complete abstract analysis first."
	    if title_themes is None:
	        return "Title themes not found. Complete title analysis first (run title analysis)."

	    def first_evidence(theme):
	        # Empty or missing evidence yields "" instead of an IndexError.
	        sentences = theme.get("top_sentences") or [""]
	        return str(sentences[0])[:200]

	    rows = []
	    for theme in sorted(set(abstract_themes) | set(title_themes)):
	        in_abstract = theme in abstract_themes
	        in_title = theme in title_themes
	        a = abstract_themes.get(theme, {})
	        t = title_themes.get(theme, {})
	        if in_abstract and in_title:
	            convergence = "CONVERGE"
	        elif in_abstract:
	            convergence = "ABSTRACT ONLY"
	        else:
	            convergence = "TITLE ONLY"
	        rows.append({
	            "Theme": theme,
	            "Abstract_Sentences": a.get("sentence_count", 0),
	            "Title_Sentences": t.get("sentence_count", 0),
	            # `or` also covers a stored None, not just a missing key.
	            "Abstract_PAJAIS": a.get("pajais_match") or "N/A",
	            "Title_PAJAIS": t.get("pajais_match") or "N/A",
	            "Abstract_Novel": bool(a.get("is_novel")),
	            "Title_Novel": bool(t.get("is_novel")),
	            "Convergence": convergence,
	            "Top_Abstract_Evidence": first_evidence(a) if a else "",
	            "Top_Title_Evidence": first_evidence(t) if t else "",
	        })

	    df = pd.DataFrame(rows)
	    save_path = os.path.join(CHECKPOINT_DIR, "comparison.csv")
	    df.to_csv(save_path, index=False)

	    converge = len([r for r in rows if r["Convergence"] == "CONVERGE"])
	    abstract_only = len([r for r in rows if r["Convergence"] == "ABSTRACT ONLY"])
	    title_only = len([r for r in rows if r["Convergence"] == "TITLE ONLY"])

	    return (
	        f"✅ Comparison CSV generated!\n\n"
	        f"🔄 Converging themes: {converge}\n"
	        f"📝 Abstract-only themes: {abstract_only}\n"
	        f"🏷️ Title-only themes: {title_only}\n\n"
	        f"Check the Download tab for comparison.csv. Click Submit Review to confirm."
	    )


	# ── Tool 7: Export Narrative ───────────────────────────────────────────────────
	@tool
	def export_narrative(run_key: str) -> str:
	    """
	    Generate a 500-word Section 7 literature review narrative using Mistral LLM.
	    References B&C methodology, key themes, PAJAIS mapping, and limitations.
	    Saves narrative.txt checkpoint.

	    Args:
	        run_key: Either 'abstract' or 'title'
	    """
	    taxonomy_data = _load_json(f"{run_key}_taxonomy_map.json") or _load_json("taxonomy_map.json")
	    if not taxonomy_data:
	        return "No taxonomy mapping found. Run compare_with_taxonomy first."

	    themes = taxonomy_data.get("themes", [])
	    llm = _get_llm()

	    # One bullet per theme for the prompt: name, mapping flag, short description.
	    theme_summary = []
	    for t in themes:
	        novel_flag = " [NOVEL]" if t.get("is_novel") else f" [→ {t.get('pajais_match', '')}]"
	        # `or` makes an empty evidence summary fall back to the reasoning text.
	        description = t.get("evidence_summary") or t.get("reasoning", "")
	        theme_summary.append(f"• {t['theme_name']}{novel_flag}: {description}")

	    # summaries.json is rewritten by the clustering step and may no longer
	    # contain n_papers; fall back to counting rows of the loaded dataset.
	    n_papers = (_load_json("summaries.json") or {}).get("n_papers")
	    if n_papers is None:
	        try:
	            n_papers = len(_load_df())
	        except FileNotFoundError:
	            n_papers = "N/A"

	    prompt_template = PromptTemplate.from_template(
	        """You are an academic writer drafting a Section 7 (Thematic Analysis Results) for a peer-reviewed Information Systems journal paper.

	Context:
	- Dataset: {n_papers} papers from Scopus
	- Method: BERTopic with AgglomerativeClustering (cosine metric, 384d embeddings, no UMAP), Braun & Clarke (2006) 6-phase framework
	- Analysis type: {run_key} analysis

	Themes discovered:
	{themes}

	Write a 500-word Section 7 that:
	1. Opens with methodology overview (BERTopic, B&C phases, embedding approach)
	2. Presents each major theme with evidence and paper count references
	3. Discusses PAJAIS taxonomy alignment (MAPPED vs NOVEL themes)
	4. Highlights the most significant NOVEL themes and their publication potential
	5. Acknowledges limitations (single journal, time period, computational constraints)
	6. Closes with implications for future research

	Write in formal academic style. Use hedged language where appropriate. Do not use bullet points — write in flowing paragraphs."""
	    )

	    chain = prompt_template | llm
	    response = chain.invoke({
	        "n_papers": n_papers,
	        "run_key": run_key,
	        "themes": "\n".join(theme_summary),
	    })

	    # Chat models return a message object; use its text content when present.
	    narrative_text = response.content if hasattr(response, "content") else str(response)
	    save_path = os.path.join(CHECKPOINT_DIR, "narrative.txt")
	    with open(save_path, "w", encoding="utf-8") as f:
	        f.write(narrative_text)

	    word_count = len(narrative_text.split())
	    return (
	        f"✅ Section 7 narrative exported!\n\n"
	        f"📝 Word count: {word_count}\n"
	        f"💾 Saved to: narrative.txt\n\n"
	        f"Phase 6 complete! All B&C phases finished. Check Download tab for all outputs.\n\n"
	        f"---\n\n{narrative_text[:500]}...\n\n(Full narrative in narrative.txt)"
	    )


	# ── All tools list ─────────────────────────────────────────────────────────────
	# The seven @tool-decorated functions defined in this module, collected in one
	# list and ordered as they are defined (Tool 1 through Tool 7).
	ALL_TOOLS = [
	load_scopus_csv,
	run_bertopic_discovery,
	label_topics_with_llm,
	consolidate_into_themes,
	compare_with_taxonomy,
	generate_comparison_csv,
	export_narrative,
	]