Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
| """tools.py β Sentence-level BERTopic pipeline + Mistral LLM. Version 3.0.0 | 4 April 2026. ZERO for/while/if. | |
| PIPELINE: | |
| Paper β split into sentences β each sentence gets paper_id + sent_id + metadata | |
| β embed sentences (384d) β AgglomerativeClustering cosine β centroid nearest 5 sentences | |
| β Mistral labels topics from sentence evidence + paper metadata | |
| β one paper can span MULTIPLE topics | |
| """ | |
| from langchain_core.tools import tool | |
| import os | |
| import json | |
| import re | |
| import numpy as np | |
| import pandas as pd | |
# ───────────────────────────────────────────────
# DEBUG + STATE + CONSTANTS
# ───────────────────────────────────────────────
DEBUG = True
# Branch-free debug printer (module convention: zero if statements):
# dict dispatch selects print or a no-op depending on the DEBUG flag.
debug = {True: print, False: lambda *a, **k: None}[DEBUG]
CHECKPOINT_DIR = "/tmp/checkpoints"  # all intermediate artifacts are written here
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
NEAREST_K = 5  # evidence sentences kept per topic centroid
SENT_SPLIT_RE = r'(?<=[.!?])\s+(?=[A-Z])'  # split after .!? when an uppercase letter follows
MIN_SENT_LEN = 30  # fragments shorter than this many chars are discarded
RUN_CONFIGS = {  # run_key -> source columns to split into sentences
    "abstract": ["Abstract"],
    "title": ["Title"],
}
_data = {}  # in-memory session state shared across tools (df, models, embeddings, sent_df)
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
| # HELPER: Split text into sentences (regex, no nltk) | |
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
def _split_sentences(text):
    """Break *text* into sentences and keep only substantial ones.

    Splitting is purely regex-based (no nltk dependency): a boundary is any
    whitespace run that follows '.', '!' or '?' and precedes an uppercase
    letter. Fragments shorter than MIN_SENT_LEN characters are dropped.
    """
    pieces = re.split(SENT_SPLIT_RE, str(text))
    long_enough = lambda fragment: len(fragment.strip()) >= MIN_SENT_LEN
    return list(filter(long_enough, pieces))
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
| # TOOL 1: Load Scopus CSV | |
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
def load_scopus_csv(filepath: str) -> str:
    """Load a Scopus CSV export and show a preview. Call this first.
    Args:
        filepath: Path to the uploaded .csv file.
    Returns:
        Row count, column names, null coverage, and sample data."""
    debug(f"\n>>> TOOL: load_scopus_csv(filepath='{filepath}')")
    df = pd.read_csv(filepath, encoding="utf-8-sig")
    _data["df"] = df  # shared state consumed by run_bertopic_discovery
    debug(f">>> Loaded {len(df)} rows, {len(df.columns)} columns")
    target_cols = list(filter(lambda c: c in df.columns, ["Title", "Abstract", "Author Keywords"]))
    sample = df[target_cols].head(3).to_string(max_colwidth=80)
    null_counts = ", ".join(list(map(
        lambda c: f"{c}: {df[c].notna().sum()}/{len(df)}", target_cols)))
    # Estimate sentence counts. df.get(...) keeps this from raising KeyError on
    # exports that lack "Abstract"/"Title" — the previous code indexed both
    # unconditionally even though target_cols already filtered for presence.
    abstracts = df.get("Abstract", pd.Series(dtype=object))
    sample_sents = abstracts.head(5).apply(_split_sentences).apply(len)
    # mean() of an empty sample is NaN; nan_to_num keeps int() below from raising.
    avg_abstract_sents = float(np.nan_to_num(sample_sents.mean()))
    est_abstract = int(avg_abstract_sents * len(df))
    title_count = int(df.get("Title", pd.Series(dtype=object)).notna().sum())
    return (f"π **Dataset Statistics:**\n"
            f"- **Papers:** {len(df)}\n"
            f"- **Abstract sentences:** ~{est_abstract} (~{avg_abstract_sents:.0f} per paper)\n"
            f"- **Title sentences:** {title_count} (1 per paper)\n"
            f"- **Non-null:** {null_counts}\n\n"
            f"Columns: {', '.join(list(df.columns)[:15])}\n\n"
            f"Sample:\n{sample}")
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
| # TOOL 2: Sentence-Level BERTopic Pipeline | |
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
def run_bertopic_discovery(run_key: str, threshold: float = 0.7) -> str:
    """Sentence-level BERTopic: split papers → embed sentences → cosine similarity clustering → centroid nearest 5 → Plotly charts.
    Each sentence keeps paper_id, sent_id, and metadata. One paper can span multiple topics.
    Uses AgglomerativeClustering with cosine distance — groups sentences by similarity threshold.
    Args:
        run_key: One of 'abstract' or 'title' — selects which columns to split into sentences.
        threshold: Cosine distance threshold (0.0-1.0). Lower = stricter = more topics.
            0.5 = very strict (~2000 topics), 0.7 = recommended (~100 topics, default), 0.8 = loose (~30 topics), 0.9 = very loose (~10 topics).
    Returns:
        Topic summary with sentence counts, paper counts, and 5 nearest centroid sentences."""
    debug(f"\n>>> TOOL: run_bertopic_discovery(run_key='{run_key}', threshold={threshold})")
    # Heavy imports deferred to call time so importing tools.py stays cheap.
    from bertopic import BERTopic
    from sentence_transformers import SentenceTransformer
    df = _data["df"].copy()  # requires a prior load_scopus_csv call (KeyError otherwise)
    cols = RUN_CONFIGS[run_key]
    available = list(filter(lambda c: c in df.columns, cols))
    debug(f">>> Columns: {available}")
    # ── Step 1: Assemble one text string per paper from the selected columns ──
    df["_text"] = df[available].fillna("").agg(" ".join, axis=1)
    df["_paper_id"] = df.index  # row index doubles as a stable paper id
    debug(f">>> {len(df)} papers assembled")
    # ── Step 2: Split into sentences — regex-based, no nltk ──
    debug(">>> Splitting into sentences...")
    df["_sentences"] = df["_text"].apply(_split_sentences)
    debug(f">>> Sentence counts: min={df['_sentences'].apply(len).min()}, "
          f"max={df['_sentences'].apply(len).max()}, "
          f"mean={df['_sentences'].apply(len).mean():.1f}")
    # ── Step 3: Explode to a sentence-level DataFrame (one row per sentence) ──
    meta_cols = ["_paper_id", "Title", "Author Keywords", "_sentences"]
    available_meta = list(filter(lambda c: c in df.columns, meta_cols))
    sent_df = df[available_meta].explode("_sentences").rename(
        columns={"_sentences": "text"}).reset_index(drop=True)
    sent_df = sent_df.dropna(subset=["text"]).reset_index(drop=True)
    sent_df["sent_id"] = sent_df.groupby("_paper_id").cumcount()
    # ── Step 3b: Filter out publisher boilerplate sentences ──
    # Scopus abstracts contain copyright/license noise that clustering picks up
    # as its own topics. These are NOT research content — remove before embedding.
    debug(">>> Filtering publisher boilerplate...")
    _n_before = len(sent_df)
    boilerplate_patterns = "|".join([
        r"Licensee MDPI",
        r"Published by Informa",
        r"Published by Elsevier",
        r"Taylor & Francis",
        r"Copyright Β©",
        r"Creative Commons",
        r"open access article",
        r"Inderscience Enterprises",
        r"All rights reserved",
        r"This is an open access",
        r"distributed under the terms",
        r"The Author\(s\)",
        r"Springer Nature",
        r"Emerald Publishing",
        r"limitations and future",
        r"limitations and implications",
        r"limitations are discussed",
        r"limitations have been discussed",
        r"implications are discussed",
        r"implications were discussed",
        r"implications are presented",
        r"concludes with .* implications",
    ])
    clean_mask = ~sent_df["text"].str.contains(boilerplate_patterns, case=False, regex=True, na=False)
    sent_df = sent_df[clean_mask].reset_index(drop=True)
    # Renumber sent_id so ids stay dense per paper after the filter.
    sent_df["sent_id"] = sent_df.groupby("_paper_id").cumcount()
    debug(f">>> Filtered: {_n_before} β {len(sent_df)} sentences ({_n_before - len(sent_df)} boilerplate removed)")
    n_sentences = len(sent_df)
    n_papers = len(df)
    debug(f">>> {n_sentences} sentences from {n_papers} papers")
    # ── Step 4: Embed sentences (384d, L2-normalized) ──
    # BERTopic FAQ: "normalize them first to force a cosine-related distance metric"
    # Math: for L2-normalized vectors, euclidean²(a,b) = 2(1 - cos(a,b)) — same clusters as cosine
    debug(">>> Embedding sentences with all-MiniLM-L6-v2 (L2-normalized)...")
    docs = sent_df["text"].tolist()
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = embedder.encode(docs, show_progress_bar=False, normalize_embeddings=True)
    debug(f">>> Embeddings: {embeddings.shape}, normalized: True")
    # Save checkpoint so later tools can run without re-embedding
    np.save(f"{CHECKPOINT_DIR}/rq4_{run_key}_emb.npy", embeddings)
    # ── Step 5: Agglomerative Clustering with COSINE similarity threshold ──
    # Groups sentences where cosine_distance < threshold — same cluster.
    # No dimension reduction. No density estimation. Pure similarity grouping.
    debug(f">>> AgglomerativeClustering cosine threshold={threshold} on 384d embeddings...")
    from sklearn.preprocessing import FunctionTransformer
    from sklearn.cluster import AgglomerativeClustering
    no_umap = FunctionTransformer()  # identity transform — disables BERTopic's UMAP step
    cluster_model = AgglomerativeClustering(
        n_clusters=None,  # cluster count emerges from distance_threshold
        metric="cosine",
        linkage="average",
        distance_threshold=threshold,
    )
    topic_model = BERTopic(
        hdbscan_model=cluster_model,  # BERTopic accepts any sklearn-style clusterer here
        umap_model=no_umap,
    )
    topics, probs = topic_model.fit_transform(docs, embeddings)  # probs unused downstream
    n_topics = len(set(topics)) - int(-1 in topics)  # -1 marks outliers, not a topic
    n_outliers = int(np.sum(np.array(topics) == -1))
    debug(f">>> {n_topics} topics, {n_outliers} outlier sentences")
    # Store for later tools (consolidate_into_themes reads these from _data)
    _data[f"{run_key}_model"] = topic_model
    _data[f"{run_key}_topics"] = np.array(topics)
    _data[f"{run_key}_embeddings"] = embeddings
    _data[f"{run_key}_sent_df"] = sent_df
    # ── Step 6: BERTopic Plotly visualizations (skip charts that need 3+ topics) ──
    # `(cond) and expr` short-circuits — a branch-free guard in the module's no-if style.
    debug(f">>> Generating visualizations ({n_topics} topics)...")
    # visualize_topics() uses UMAP internally — crashes with < 3 topics
    (n_topics >= 3) and topic_model.visualize_topics().write_html(
        f"/tmp/rq4_{run_key}_intertopic.html", include_plotlyjs="cdn")
    # barchart works with 1+ topics
    (n_topics >= 1) and topic_model.visualize_barchart(
        top_n_topics=min(10, max(1, n_topics))).write_html(
        f"/tmp/rq4_{run_key}_bars.html", include_plotlyjs="cdn")
    # hierarchy needs 2+ topics
    (n_topics >= 2) and topic_model.visualize_hierarchy().write_html(
        f"/tmp/rq4_{run_key}_hierarchy.html", include_plotlyjs="cdn")
    # heatmap needs 2+ topics
    (n_topics >= 2) and topic_model.visualize_heatmap().write_html(
        f"/tmp/rq4_{run_key}_heatmap.html", include_plotlyjs="cdn")
    debug(f">>> Visualizations saved (skipped charts needing more topics)")
    # ── Step 7: Centroid nearest 5 SENTENCES — COSINE similarity ──
    topics_arr = np.array(topics)
    topic_info = topic_model.get_topic_info()
    valid_rows = list(filter(lambda r: r["Topic"] != -1, topic_info.to_dict("records")))
    def _centroid_nearest(row):
        """Find 5 sentences nearest to topic centroid via cosine similarity."""
        mask = topics_arr == row["Topic"]
        member_idx = np.where(mask)[0]
        member_embs = embeddings[mask]
        centroid = member_embs.mean(axis=0)
        # Cosine distance: 1 - cos_sim. For normalized vectors: cos_sim = dot product.
        # The 1e-10 epsilon guards against a zero-norm division.
        norms = np.linalg.norm(member_embs, axis=1) * np.linalg.norm(centroid)
        cosine_sim = (member_embs @ centroid) / (norms + 1e-10)
        dists = 1 - cosine_sim
        nearest = np.argsort(dists)[:NEAREST_K]
        # 5 nearest sentences with paper metadata (truncated for prompt budget)
        nearest_evidence = list(map(lambda i: {
            "sentence": str(sent_df.iloc[member_idx[i]]["text"])[:250],
            "paper_id": int(sent_df.iloc[member_idx[i]]["_paper_id"]),
            "title": str(sent_df.iloc[member_idx[i]].get("Title", ""))[:150],
            "keywords": str(sent_df.iloc[member_idx[i]].get("Author Keywords", ""))[:150],
        }, nearest))
        # Count unique papers in this topic + collect their titles
        topic_papers_df = sent_df.iloc[member_idx].drop_duplicates(subset=["_paper_id"])
        unique_papers = len(topic_papers_df)
        paper_titles = list(map(
            lambda idx: str(topic_papers_df.iloc[idx].get("Title", ""))[:200],
            range(min(50, unique_papers))))  # cap at 50 titles per topic
        return {"topic_id": int(row["Topic"]),
                "sentence_count": int(row["Count"]),
                "paper_count": int(unique_papers),
                "top_words": str(row.get("Name", ""))[:100],
                "nearest": nearest_evidence,
                "paper_titles": paper_titles}
    summaries = list(map(_centroid_nearest, valid_rows))
    json.dump(summaries, open(f"{CHECKPOINT_DIR}/rq4_{run_key}_summaries.json", "w"), indent=2, default=str)
    debug(f">>> {len(summaries)} topics saved ({NEAREST_K} nearest sentences each)")
    # ── Format output ──
    lines = list(map(
        lambda s: f" Topic {s['topic_id']} ({s['sentence_count']} sentences, {s['paper_count']} papers): {s['top_words']}",
        summaries))
    return (f"[{run_key}] {n_topics} topics from {n_sentences} sentences ({n_papers} papers, {n_outliers} outliers).\n\n"
            + "\n".join(lines)
            + f"\n\nVisualizations: /tmp/rq4_{run_key}_*.html (4 files)"
            + f"\nCheckpoints: {CHECKPOINT_DIR}/rq4_{run_key}_emb.npy + summaries.json")
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
| # TOOL 3: Label Topics with Mistral (sentence evidence) | |
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
def label_topics_with_llm(run_key: str) -> str:
    """Send 5 nearest centroid sentences + paper metadata to Mistral for labeling.
    Each sentence shows which paper it came from (title + keywords).
    Args:
        run_key: One of 'abstract' or 'title'.
    Returns:
        Labeled topics with sentence-level evidence."""
    debug(f"\n>>> TOOL: label_topics_with_llm(run_key='{run_key}')")
    from langchain_mistralai import ChatMistralAI
    from langchain_core.prompts import PromptTemplate
    from langchain_core.output_parsers import JsonOutputParser
    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_summaries.json") as fh:
        summaries = json.load(fh)
    debug(f">>> Loaded {len(summaries)} topics ({NEAREST_K} sentences each)")
    # Label only the MAX_LABEL_TOPICS largest topics — prevents Mistral rate
    # limits / context overflow when a strict threshold yields 2000+ topics.
    MAX_LABEL_TOPICS = 100
    sorted_summaries = sorted(summaries, key=lambda s: s.get("sentence_count", 0), reverse=True)
    summaries_to_label = sorted_summaries[:MAX_LABEL_TOPICS]
    skipped = max(0, len(summaries) - MAX_LABEL_TOPICS)
    debug(f">>> Labeling top {len(summaries_to_label)} topics (skipped {skipped} small clusters)")
    # Format all topics — show sentence + paper metadata as evidence
    topics_block = "\n\n".join(list(map(
        lambda s: (f"Topic {s['topic_id']} ({s['sentence_count']} sentences from {s['paper_count']} papers):\n"
                   f" Top words: {s['top_words']}\n"
                   f" {NEAREST_K} nearest centroid sentences:\n"
                   + "\n".join(list(map(
                       lambda e: (f" - \"{e['sentence'][:200]}\"\n"
                                  f" Paper: \"{e['title']}\"\n"
                                  f" Keywords: {e['keywords']}"),
                       s["nearest"])))),
        summaries_to_label)))
    prompt = PromptTemplate.from_template(
        "You are a research topic classifier for academic papers about Technology and Tourism.\n\n"
        "For EACH topic below, you are given the 5 sentences nearest to the topic centroid,\n"
        "plus the paper title and author keywords each sentence came from.\n\n"
        "Return a JSON ARRAY with one object per topic:\n"
        "- topic_id: integer\n"
        "- label: short descriptive name (3-6 words, specific β NOT generic like 'tourism studies')\n"
        "- category: general research area (e.g., 'technology adoption', 'consumer behavior',\n"
        " 'virtual reality', 'social media marketing', 'sustainability', 'cultural heritage',\n"
        " 'AI and machine learning', 'online reviews', 'destination marketing',\n"
        " 'tourist psychology', 'hotel management', 'sharing economy',\n"
        " 'mobile applications', 'research methodology', 'data analytics')\n"
        " DO NOT use PACIS/ICIS categories β just plain descriptive research area.\n"
        "- confidence: high, medium, or low\n"
        "- reasoning: 1 sentence explaining WHY you chose this label based on the evidence sentences\n"
        "- niche: true or false (true = very specific sub-area with <20 sentences)\n\n"
        "CRITICAL: be SPECIFIC in labels. Do NOT use broad terms.\n"
        "Return ONLY valid JSON array, no markdown.\n\n"
        "Topics:\n{topics}")
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0, timeout=300)
    chain = prompt | llm | JsonOutputParser()
    debug(">>> Calling Mistral (single call, all topics)...")
    labels = chain.invoke({"topics": topics_block})
    debug(f">>> Got {len(labels)} labels")
    # Merge labels with summaries BY topic_id.
    # BUG FIX: the previous positional merge zipped the size-SORTED label list
    # against the UNSORTED summaries (padding with summaries themselves), so
    # labels landed on the wrong topics whenever sorting reordered them.
    # Unlabeled topics (beyond MAX_LABEL_TOPICS) simply keep their raw summary.
    label_by_id = dict(map(lambda lab: (lab.get("topic_id"), lab), labels))
    labeled = list(map(
        lambda s: {**s, **label_by_id.get(s.get("topic_id"), {})},
        summaries))
    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json", "w") as fh:
        json.dump(labeled, fh, indent=2, default=str)
    debug(f">>> Labels saved: {CHECKPOINT_DIR}/rq4_{run_key}_labels.json")
    # Format — show label + evidence sentences + paper source
    lines = list(map(
        lambda l: (f" **Topic {l.get('topic_id', '?')}: {l.get('label', '?')}** "
                   f"[{l.get('category', '?')}] conf={l.get('confidence', '?')} "
                   f"({l.get('sentence_count', 0)} sentences, {l.get('paper_count', 0)} papers)\n"
                   + "\n".join(list(map(
                       lambda e: f" β \"{e['sentence'][:120]}...\" β _{e['title'][:60]}_",
                       l.get("nearest", []))))),
        labeled))
    return f"[{run_key}] {len(labeled)} topics labeled by Mistral:\n\n" + "\n\n".join(lines)
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
| # TOOL 4: Generate Comparison Table | |
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
def generate_comparison_csv() -> str:
    """Compare Mistral-labeled topics across completed runs. Includes sentence + paper counts.
    Returns:
        Comparison table + CSV path."""
    debug(f"\n>>> TOOL: generate_comparison_csv()")
    from itertools import chain
    # A run is "completed" once its labels checkpoint exists on disk.
    completed = list(filter(
        lambda k: os.path.exists(f"{CHECKPOINT_DIR}/rq4_{k}_labels.json"), RUN_CONFIGS.keys()))
    debug(f">>> Completed runs: {completed}")
    def _load_run(run_key):
        """Load one run's labels JSON and flatten each topic to a comparison row."""
        with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json") as fh:
            labels = json.load(fh)
        return list(map(lambda l: {
            "run": run_key, "topic_id": l.get("topic_id", ""),
            "label": l.get("label", ""), "category": l.get("category", ""),
            "confidence": l.get("confidence", ""), "niche": l.get("niche", ""),
            "sentences": l.get("sentence_count", 0),
            "papers": l.get("paper_count", 0),
            "top_words": l.get("top_words", ""),
        }, labels))
    # chain.from_iterable is linear; sum(lists, []) re-copies the accumulator
    # on every step (quadratic in total rows).
    all_rows = list(chain.from_iterable(map(_load_run, completed)))
    df = pd.DataFrame(all_rows)
    path = "/tmp/rq4_comparison.csv"
    df.to_csv(path, index=False)
    debug(f">>> Comparison CSV: {path} ({len(df)} rows)")
    return f"Comparison saved: {path} ({len(completed)} runs, {len(df)} topics)\n\n{df.to_string(index=False)}"
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
| # TOOL 5: Export 500-Word Narrative | |
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
def export_narrative(run_key: str) -> str:
    """Generate 500-word narrative for research paper Section 7 via Mistral.
    Args:
        run_key: One of 'abstract' or 'title'.
    Returns:
        500-word narrative + save path."""
    debug(f"\n>>> TOOL: export_narrative(run_key='{run_key}')")
    from langchain_mistralai import ChatMistralAI
    with open(f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json") as fh:
        labels = json.load(fh)
    topics_text = "\n".join(list(map(
        lambda l: f"- {l.get('label', '?')} ({l.get('sentence_count', 0)} sentences from "
                  f"{l.get('paper_count', 0)} papers, category: {l.get('category', '?')}, "
                  f"confidence: {l.get('confidence', '?')}, niche: {l.get('niche', '?')})",
        labels)))
    # Use the real dataset size when the CSV is loaded in this session; fall back
    # to the previously hard-coded 1390 when running from checkpoints alone.
    n_papers = len(_data.get("df", [])) or 1390
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0.3, timeout=300)
    result = llm.invoke(
        f"Write exactly 500 words for a research paper Section 7 titled "
        f"'Topic Modeling Results β BERTopic Discovery'.\n\n"
        f"Dataset: {n_papers} Scopus papers on Tourism and AI.\n"
        f"Method: Sentence-level BERTopic β each abstract split into sentences,\n"
        f"embedded with all-MiniLM-L6-v2 (384d), clustered with AgglomerativeClustering (cosine).\n"
        f"Note: One paper can contribute sentences to MULTIPLE topics.\n"
        f"Run config: '{run_key}' columns.\n\n"
        f"Topics discovered:\n{topics_text}\n\n"
        f"Include: methodology justification for sentence-level approach,\n"
        f"key themes, emerging niches, limitations, future work.")
    path = "/tmp/rq4_narrative.txt"
    # with-block guarantees the handle is flushed and closed (previously leaked).
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(result.content)
    debug(f">>> Narrative saved: {path} ({len(result.content)} chars)")
    return f"Narrative saved: {path}\n\n{result.content}"
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
| # TOOL 6: Consolidate Round 1 Topics into Themes | |
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
def consolidate_into_themes(run_key: str, theme_map: dict) -> str:
    """ROUND 2: Merge fine-grained Round 1 topics into broader themes.
    Researcher decides which topics to group. Recomputes centroids and evidence.
    Args:
        run_key: 'abstract' or 'title'.
        theme_map: Dict mapping theme names to topic ID lists.
            Example: {"AI in Tourism": [0, 1, 5], "VR Tourism": [2, 3]}
    Returns:
        Consolidated themes with new 5-nearest sentence evidence per theme."""
    debug(f"\n>>> TOOL: consolidate_into_themes(run_key='{run_key}', {len(theme_map)} themes)")
    # In-memory artifacts produced by run_bertopic_discovery — this tool must
    # run in the same session (KeyError otherwise).
    topics_arr = _data[f"{run_key}_topics"]
    embeddings = _data[f"{run_key}_embeddings"]
    sent_df = _data[f"{run_key}_sent_df"]
    def _build_theme(item):
        """Merge listed topics into one theme. Recompute centroid + 5 nearest."""
        theme_name, topic_ids = item
        # Boolean mask over ALL sentences: True where the topic id is in this theme
        mask = np.isin(topics_arr, topic_ids)
        member_idx = np.where(mask)[0]
        member_embs = embeddings[mask]
        centroid = member_embs.mean(axis=0)
        # Cosine similarity of each member to the merged centroid (epsilon avoids /0)
        norms = np.linalg.norm(member_embs, axis=1) * np.linalg.norm(centroid)
        cosine_sim = (member_embs @ centroid) / (norms + 1e-10)
        dists = 1 - cosine_sim
        nearest = np.argsort(dists)[:NEAREST_K]
        # Evidence sentences carry paper metadata, truncated for prompt budget
        nearest_evidence = list(map(lambda i: {
            "sentence": str(sent_df.iloc[member_idx[i]]["text"])[:250],
            "paper_id": int(sent_df.iloc[member_idx[i]]["_paper_id"]),
            "title": str(sent_df.iloc[member_idx[i]].get("Title", ""))[:150],
            "keywords": str(sent_df.iloc[member_idx[i]].get("Author Keywords", ""))[:150],
        }, nearest))
        unique_papers = sent_df.iloc[member_idx]["_paper_id"].nunique()
        # Collect paper titles (up to 50)
        topic_papers_df = sent_df.iloc[member_idx].drop_duplicates(subset=["_paper_id"])
        paper_titles = list(map(
            lambda idx: str(topic_papers_df.iloc[idx].get("Title", ""))[:200],
            range(min(50, len(topic_papers_df)))))
        return {"label": theme_name, "merged_topics": list(topic_ids),
                "sentence_count": int(mask.sum()), "paper_count": int(unique_papers),
                "nearest": nearest_evidence, "paper_titles": paper_titles}
    # Add topic_id to each theme (sequential, in theme_map insertion order)
    themes_raw = list(map(_build_theme, theme_map.items()))
    themes = list(map(
        lambda pair: {**pair[1], "topic_id": pair[0]},
        enumerate(themes_raw)))
    json.dump(themes, open(f"{CHECKPOINT_DIR}/rq4_{run_key}_themes.json", "w"), indent=2, default=str)
    debug(f">>> {len(themes)} themes saved: {CHECKPOINT_DIR}/rq4_{run_key}_themes.json")
    # Format — show theme + merged topics + evidence
    lines = list(map(
        lambda t: (f" **{t['label']}** ({t['sentence_count']} sentences, {t['paper_count']} papers)\n"
                   f" Merged from topics: {t['merged_topics']}\n"
                   f" Evidence:\n"
                   + "\n".join(list(map(
                       lambda e: f" β \"{e['sentence'][:120]}...\" β _{e['title'][:60]}_",
                       t["nearest"])))),
        themes))
    return f"[{run_key}] Round 2: {len(themes)} themes consolidated:\n\n" + "\n\n".join(lines)
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
| # TOOL 7: Compare Themes with PAJAIS Taxonomy | |
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
# Established IS topic taxonomy from:
# Jiang, Liang & Tsai (2019) "Knowledge Profile in PAJAIS"
# Pacific Asia Journal of the AIS, 11(1), 1-24. doi:10.17705/1pais.11101
# Reference category list for compare_with_taxonomy(): themes matching none of
# these 22 categories are flagged NOVEL/EMERGING by the LLM.
PAJAIS_TAXONOMY = [
    "Electronic and Mobile Business / Social Commerce",
    "Human Behavior and IS / Human-Computer Interaction",
    "IS/IT Strategy, Leadership, Governance",
    "Business Intelligence and Data Analytics",
    "Design Science and IS",
    "Enterprise Systems and BPM",
    "IS Implementation, Adoption, and Diffusion",
    "Social Media and Business Impact",
    "Cultural and Global Issues in IS",
    "IS Security and Privacy",
    "IS Smart / IoT",
    "Knowledge Management",
    "ICT / Digital Platform / IT and Work",
    "IS Healthcare",
    "IT Project Management",
    "Service Science and IS",
    "Social and Organizational Aspects of IS",
    "Research Methods and Philosophy",
    "E-Finance / Economics of IS",
    "E-Government",
    "IS Education and Learning",
    "Green IT and Sustainability",
]
def compare_with_taxonomy(run_key: str) -> str:
    """Map BERTopic themes onto the established PAJAIS/PACIS taxonomy
    (Jiang, Liang & Tsai, 2019) via Mistral. Themes that fit no existing
    category are flagged NOVEL/EMERGING; the researcher reviews the mapping
    and approves any new theme consolidation.
    Args:
        run_key: 'abstract' or 'title'.
    Returns:
        Mapping table: BERTopic theme → PAJAIS category (or NOVEL)."""
    debug(f"\n>>> TOOL: compare_with_taxonomy(run_key='{run_key}')")
    from langchain_mistralai import ChatMistralAI
    from langchain_core.prompts import PromptTemplate
    from langchain_core.output_parsers import JsonOutputParser
    # Prefer Round-2 consolidated themes; otherwise fall back to Round-1 labels.
    themes_path = f"{CHECKPOINT_DIR}/rq4_{run_key}_themes.json"
    labels_path = f"{CHECKPOINT_DIR}/rq4_{run_key}_labels.json"
    source_path = {True: themes_path, False: labels_path}[os.path.exists(themes_path)]
    theme_rows = json.load(open(source_path))
    debug(f">>> Loaded {len(theme_rows)} themes from {source_path}")
    # One bullet per theme / taxonomy category for the prompt
    fmt_theme = lambda t: (f"- {t.get('label', '?')} "
                           f"({t.get('paper_count', t.get('count', '?'))} papers)")
    themes_text = "\n".join(list(map(fmt_theme, theme_rows)))
    taxonomy_text = "\n".join(list(map(lambda c: f"- {c}", PAJAIS_TAXONOMY)))
    prompt = PromptTemplate.from_template(
        "You are an IS research taxonomy expert.\n\n"
        "Compare each BERTopic theme against the established PAJAIS/PACIS taxonomy.\n"
        "For EACH theme, return a JSON ARRAY with:\n"
        "- label: the BERTopic theme name\n"
        "- pajais_match: closest PAJAIS category (or 'NOVEL' if no match)\n"
        "- match_confidence: high, medium, low, or none\n"
        "- reasoning: why this mapping (1 sentence)\n"
        "- is_novel: true if this theme represents an emerging area not in the taxonomy\n\n"
        "Return ONLY valid JSON array.\n\n"
        "BERTopic Themes:\n{themes}\n\n"
        "PAJAIS Taxonomy (Jiang et al., 2019):\n{taxonomy}")
    llm = ChatMistralAI(model="mistral-small-latest", temperature=0, timeout=300)
    chain = prompt | llm | JsonOutputParser()
    debug(">>> Calling Mistral for taxonomy comparison...")
    verdicts = chain.invoke({"themes": themes_text, "taxonomy": taxonomy_text})
    debug(f">>> Got {len(verdicts)} mappings")
    # Persist the raw mapping for later inspection
    json.dump(verdicts, open(f"{CHECKPOINT_DIR}/rq4_{run_key}_taxonomy_map.json", "w"), indent=2, default=str)
    # Partition into novel vs mapped themes
    novel = list(filter(lambda m: m.get("is_novel", False), verdicts))
    mapped = list(filter(lambda m: not m.get("is_novel", False), verdicts))
    fmt_mapped = lambda m: (f" β {m.get('label', '?')} β **{m.get('pajais_match', '?')}** "
                            f"(conf={m.get('match_confidence', '?')}) _{m.get('reasoning', '')}_")
    fmt_novel = lambda m: (f" π **{m.get('label', '?')}** β NOVEL "
                           f"_{m.get('reasoning', '')}_")
    mapped_lines = list(map(fmt_mapped, mapped))
    novel_lines = list(map(fmt_novel, novel))
    return (f"[{run_key}] Taxonomy comparison (Jiang et al., 2019):\n\n"
            f"**Mapped to PAJAIS categories ({len(mapped)}):**\n" + "\n".join(mapped_lines) +
            f"\n\n**NOVEL / Emerging themes ({len(novel)}):**\n" + "\n".join(novel_lines) +
            f"\n\nSaved: {CHECKPOINT_DIR}/rq4_{run_key}_taxonomy_map.json")
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
| # GET ALL TOOLS | |
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
def get_all_tools():
    """Return all 7 tools wrapped as LangChain tools, with error handling enabled.

    The functions in this module are plain callables; wrapping each with the
    `tool` decorator (imported at module top but previously never used) gives
    it a `.name` / `.description` derived from its signature and docstring.
    Without the wrap, the `t.name` debug line below raises AttributeError.
    Returns:
        list: LangChain tool objects, each with handle_tool_error=True.
    """
    funcs = [load_scopus_csv, run_bertopic_discovery, label_topics_with_llm,
             consolidate_into_themes, compare_with_taxonomy,
             generate_comparison_csv, export_narrative]
    tools = list(map(tool, funcs))
    list(map(lambda t: setattr(t, 'handle_tool_error', True), tools))
    debug(f">>> tools.py: {len(tools)} tools ready (handle_tool_error=True)")
    list(map(lambda t: debug(f">>> - {t.name}"), tools))
    return tools