"""
tools.py — BERTopic Thematic Analysis Pipeline Tools
=====================================================
Nine LangChain @tool functions implementing Braun & Clarke's (2006)
six-phase thematic analysis pipeline.
Conventions
-----------
- All tools accept / return plain Python dicts (JSON-serialisable).
- Artefacts are written to OUTPUT_DIR / run_key / <file>.
- Functional style preferred: map/filter/reduce, operator helpers, and numpy
  vectorised ops; explicit loops, try/except, and if/else appear only where
  they keep control flow clearer (retries, optimization rounds, PDF parsing).
Fixes applied (v2)
------------------
- BUG 1 : run_bertopic_discovery() now saves sent_labels.npy —
per-sentence cluster-label array required by Tool 4.
- BUG 1 : consolidate_into_themes() _build_theme() rewritten —
centroid computed from actual merged-cluster embeddings
via sent_labels.npy mask (no dead `if False` scaffolding).
- ISSUE 1: generate_comparison_csv() guards against missing runs
           (abstract/title/keywords) with .exists() checks instead of
           hard-crashing.
Dependencies
------------
pip install langchain langchain-core langchain-mistralai langchain-groq
            sentence-transformers scikit-learn plotly pandas numpy
            hdbscan umap-learn pymupdf
"""
# ---------------------------------------------------------------------------
# Stdlib
# ---------------------------------------------------------------------------
import json
import os
import re
import time
from functools import reduce
from pathlib import Path
from operator import itemgetter
# ---------------------------------------------------------------------------
# Third-party
# ---------------------------------------------------------------------------
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from scipy.spatial.distance import squareform
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from sentence_transformers import SentenceTransformer
import hdbscan
import umap
import fitz # PyMuPDF — text-only PDF extraction
from langchain_core.tools import tool
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_mistralai import ChatMistralAI
try:
from langchain_groq import ChatGroq # type: ignore[import-not-found]
except ImportError:
ChatGroq = None
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
MISTRAL_API_KEY: str = os.environ.get("MISTRAL_API_KEY", "")
MODEL_NAME: str = "mistral-small-latest"
GROQ_API_KEY: str = os.environ.get("GROQ_API_KEY", "")
GROQ_MODEL_NAME: str = os.environ.get("GROQ_MODEL_NAME", "llama-3.3-70b-versatile")
GROQ_OLLAMA_MODEL_NAME: str = os.environ.get("GROQ_OLLAMA_MODEL_NAME", "llama-3.3-70b-versatile")
GROQ_GPT_MODEL_NAME: str = os.environ.get("GROQ_GPT_MODEL_NAME", "openai/gpt-oss-120b")
GROQ_JUDGE_MODEL_NAME: str = os.environ.get("GROQ_JUDGE_MODEL_NAME", "llama-3.1-8b-instant")
EMBED_MODEL: str = "allenai/specter2_base"
BASE_DIR: Path = Path(__file__).resolve().parent
OUTPUT_DIR: Path = BASE_DIR / "outputs"
N_EVIDENCE: int = 5 # evidence sentences kept per cluster (nearest the centroid)
DISTANCE_THRESH: float = 0.35 # cosine-distance threshold (1 - similarity)
RANDOM_SEED: int = 42
LLM_TIMEOUT_S: int = 45
LLM_MAX_RETRIES: int = 3
MAX_LABEL_CLUSTERS: int = 60
MIN_CLUSTER_SIZE_FOR_LABEL: int = 20
MAX_TOOL_RETURN_PREVIEW: int = 12
PROVIDER_RETRY_ATTEMPTS: int = 4
PROVIDER_RETRY_BASE_DELAY_S: float = 2.0
PROVIDER_RETRY_RATE_LIMIT_DELAY_S: float = 6.0
PROVIDER_RETRY_MAX_DELAY_S: float = 18.0
HDBSCAN_MIN_CLUSTER_SIZE: int = 20
HDBSCAN_MIN_SAMPLES: int = 5
HDBSCAN_MAX_CLUSTER_SIZE: int = 120
UMAP_N_NEIGHBORS: int = 15
UMAP_MIN_DIST: float = 0.0
UMAP_N_COMPONENTS_CLUSTER: int = 5
UMAP_N_COMPONENTS_VIZ: int = 2
AUTO_OPTIMIZE_CLUSTERS: bool = True
OPTIMIZE_MAX_ITERS: int = 6
OPTIMIZE_STABLE_ROUNDS: int = 2
OPTIMIZE_MIN_IMPROVEMENT: float = 0.01
OPTIMIZE_TARGET_CLUSTER_MIN: int = 20
OPTIMIZE_TARGET_CLUSTER_MAX: int = 120
OPTIMIZE_TARGET_NOISE_MAX: float = 0.50
OPTIMIZE_MIN_CLUSTER_SIZE_MIN: int = 5
OPTIMIZE_MIN_CLUSTER_SIZE_MAX: int = 60
OPTIMIZE_MAX_CLUSTER_SIZE_MIN: int = 40
OPTIMIZE_MAX_CLUSTER_SIZE_MAX: int = 200
OPTIMIZE_MIN_SAMPLES_MIN: int = 1
OPTIMIZE_MIN_SAMPLES_MAX: int = 15
# Run configurations — keys map to source columns
RUN_CONFIGS: dict[str, list[str]] = {
"abstract": ["Abstract"],
"title": ["Title"],
"keywords": [
"Author Keywords",
"Author Keywords Plus",
"Index Keywords",
"Keywords",
"Author_Keywords",
],
}
# PAJAIS 25-category taxonomy (Pacific Asia Journal of the Association for Information Systems)
PAJAIS_TAXONOMY: list[str] = [
"Artificial Intelligence & Machine Learning",
"Big Data & Analytics",
"Blockchain & Distributed Ledger",
"Cloud Computing & Infrastructure",
"Cybersecurity & Privacy",
"Decision Support Systems",
"Digital Business & E-Commerce",
"Digital Health & Telemedicine",
"Digital Innovation & Transformation",
"Enterprise Systems & ERP",
"Fintech & Digital Finance",
"Green IS & Sustainability",
"Human-Computer Interaction",
"Information Systems Strategy",
"IT Governance & Management",
"Knowledge Management",
"Mobile Computing & IoT",
"Natural Language Processing & Text Mining",
"Organizational Behavior & IS",
"Platform Ecosystems & APIs",
"Privacy & Ethics in IS",
"Smart Cities & Digital Government",
"Social Media & Collaboration",
"Supply Chain & Logistics IS",
"Virtual Reality & Immersive Technologies",
]
# Boilerplate patterns to strip from abstracts
_BOILERPLATE_RE = re.compile(
r"(©\s*\d{4}.*?(?:rights reserved|elsevier|springer|wiley)[^.]*\.?)"
r"|(all rights reserved\.?)"
r"|(published by.*?(?:ltd|inc|llc)[^.]*\.?)"
r"|(doi:\s*\S+)",
re.IGNORECASE,
)
# Sentence splitter — split on sentence-boundary punctuation, keep >= 20 chars
_SENT_RE = re.compile(r"(?<=[.!?])\s+")
_KEYWORD_SPLIT_RE = re.compile(r"\s*[;|]\s*")
_KEYWORD_COMMA_RE = re.compile(r"\s*,\s*")
# ---------------------------------------------------------------------------
# Private helpers (pure functions, no side-effects)
# ---------------------------------------------------------------------------
def _ensure_dir(path: Path) -> Path:
path.mkdir(parents=True, exist_ok=True)
return path
def _run_dir(run_key: str) -> Path:
return _ensure_dir(OUTPUT_DIR / run_key)
def _clean_text(text: str) -> str:
return _BOILERPLATE_RE.sub("", str(text)).strip()
def _split_sentences(text: str) -> list[str]:
return list(filter(
lambda s: len(s.strip()) >= 20,
_SENT_RE.split(_clean_text(text)),
))
def _split_keywords(text: str) -> list[str]:
cleaned = _clean_text(text).replace("\n", " ").strip()
if not cleaned:
return []
primary = list(filter(None, map(str.strip, _KEYWORD_SPLIT_RE.split(cleaned))))
terms = (
primary
if len(primary) > 1
else list(filter(None, map(str.strip, _KEYWORD_COMMA_RE.split(cleaned))))
)
return list(dict.fromkeys(filter(lambda t: len(t) >= 2, terms)))
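# Illustrative behaviour of _split_keywords (input string is hypothetical):
#   _split_keywords("machine learning; fintech; machine learning")
#   -> ["machine learning", "fintech"]
# Semicolon/pipe delimiters take priority; the comma split is used only when
# the primary split yields a single term, and first-seen order is preserved.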
def _resolve_column_name(df: pd.DataFrame, candidates: list[str]) -> str | None:
normalised = {
str(col).strip().lower(): col
for col in df.columns
}
return next(
(normalised.get(str(c).strip().lower()) for c in candidates
if normalised.get(str(c).strip().lower()) is not None),
None,
)
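# Matching is case- and whitespace-insensitive: e.g. a candidate
# "Author Keywords" resolves a CSV header such as "author keywords " to its
# original column name.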
def _texts_for_candidates(df: pd.DataFrame, candidates: list[str]) -> tuple[list[str], str | None]:
col = _resolve_column_name(df, candidates)
return (
df[col].dropna().astype(str).tolist(),
col,
) if col else ([], None)
def _embed(sentences: list[str]) -> np.ndarray:
"""Encode sentences to L2-normalised SPECTER2 vectors."""
model = SentenceTransformer(EMBED_MODEL, trust_remote_code=True)
raw = model.encode(sentences, show_progress_bar=False, batch_size=64)
return normalize(raw, norm="l2") # unit-norm -> cosine = dot product
def _umap_reduce(embeddings: np.ndarray, n_components: int) -> np.ndarray:
reducer = umap.UMAP(
n_neighbors=UMAP_N_NEIGHBORS,
min_dist=UMAP_MIN_DIST,
n_components=n_components,
metric="cosine",
random_state=RANDOM_SEED,
)
return reducer.fit_transform(embeddings)
def _cluster(embeddings: np.ndarray,
min_cluster_size: int,
max_cluster_size: int,
min_samples: int) -> np.ndarray:
return hdbscan.HDBSCAN(
min_cluster_size=min_cluster_size,
min_samples=min_samples,
metric="euclidean",
cluster_selection_method="eom",
max_cluster_size=max_cluster_size,
).fit_predict(embeddings)
def _centroid(embeddings: np.ndarray) -> np.ndarray:
"""Mean-pool rows then re-normalise to unit length."""
return normalize(embeddings.mean(axis=0, keepdims=True), norm="l2")[0]
def _top_k_indices(embeddings: np.ndarray, centroid: np.ndarray, k: int) -> np.ndarray:
sims = cosine_similarity(embeddings, centroid.reshape(1, -1)).flatten()
return np.argsort(sims)[::-1][:k]
def _llm() -> ChatMistralAI:
return ChatMistralAI(
model=MODEL_NAME,
api_key=MISTRAL_API_KEY,
temperature=0.2,
random_seed=RANDOM_SEED,
timeout=LLM_TIMEOUT_S,
max_retries=LLM_MAX_RETRIES,
)
def _llm_groq(model_name: str):
if ChatGroq is None:
raise RuntimeError(
"langchain-groq is not installed. Install dependencies from requirements.txt "
"to enable Groq topic-label verification."
)
return ChatGroq(
model=model_name,
api_key=GROQ_API_KEY,
temperature=0.2,
timeout=LLM_TIMEOUT_S,
max_retries=LLM_MAX_RETRIES,
)
def _groq_ollama_enabled() -> bool:
return bool(GROQ_API_KEY) and ChatGroq is not None and bool(GROQ_OLLAMA_MODEL_NAME)
def _groq_gpt_enabled() -> bool:
return bool(GROQ_API_KEY) and ChatGroq is not None and bool(GROQ_GPT_MODEL_NAME)
def _groq_judge_enabled() -> bool:
return bool(GROQ_API_KEY) and ChatGroq is not None and bool(GROQ_JUDGE_MODEL_NAME)
def _to_float(value: object, fallback: float = 0.0) -> float:
try:
return float(value)
except (TypeError, ValueError):
return float(fallback)
def _clamp_int(value: object, low: int, high: int, fallback: int) -> int:
try:
casted = int(value)
except (TypeError, ValueError):
casted = int(fallback)
return max(low, min(high, casted))
def _cluster_metrics(labels: np.ndarray) -> dict:
labels_arr = np.array(labels, dtype=np.int32)
n_sentences = int(labels_arr.shape[0])
noise_count = int((labels_arr == -1).sum())
unique_ids = sorted(filter(lambda v: v != -1, set(labels_arr.tolist())))
sizes = list(map(lambda cid: int((labels_arr == cid).sum()), unique_ids))
if sizes:
min_size = float(np.min(sizes))
median_size = float(np.median(sizes))
mean_size = float(np.mean(sizes))
max_size = float(np.max(sizes))
else:
min_size = 0.0
median_size = 0.0
mean_size = 0.0
max_size = 0.0
return {
"n_sentences": n_sentences,
"n_clusters": int(len(unique_ids)),
"noise_ratio": float(noise_count) / float(max(1, n_sentences)),
"min_size": min_size,
"median_size": median_size,
"mean_size": mean_size,
"max_size": max_size,
}
def _heuristic_hdbscan_tweak(metrics: dict, params: dict) -> dict:
n_clusters = int(metrics.get("n_clusters", 0))
noise_ratio = float(metrics.get("noise_ratio", 0.0))
min_cluster_size = int(params.get("min_cluster_size", HDBSCAN_MIN_CLUSTER_SIZE))
max_cluster_size = int(params.get("max_cluster_size", HDBSCAN_MAX_CLUSTER_SIZE))
min_samples = int(params.get("min_samples", HDBSCAN_MIN_SAMPLES))
action = "accept"
reasoning = "Cluster metrics are within target ranges."
if n_clusters < OPTIMIZE_TARGET_CLUSTER_MIN:
min_cluster_size = max(
OPTIMIZE_MIN_CLUSTER_SIZE_MIN,
int(round(min_cluster_size * 0.8)),
)
min_samples = max(OPTIMIZE_MIN_SAMPLES_MIN, min_samples - 1)
action = "tweak"
reasoning = "Too few clusters; reducing min_cluster_size and min_samples."
elif n_clusters > OPTIMIZE_TARGET_CLUSTER_MAX:
min_cluster_size = min(
OPTIMIZE_MIN_CLUSTER_SIZE_MAX,
int(round(min_cluster_size * 1.2)),
)
min_samples = min(OPTIMIZE_MIN_SAMPLES_MAX, min_samples + 1)
action = "tweak"
reasoning = "Too many clusters; increasing min_cluster_size and min_samples."
elif noise_ratio > OPTIMIZE_TARGET_NOISE_MAX:
min_cluster_size = max(
OPTIMIZE_MIN_CLUSTER_SIZE_MIN,
int(round(min_cluster_size * 0.85)),
)
min_samples = max(OPTIMIZE_MIN_SAMPLES_MIN, min_samples - 1)
action = "tweak"
reasoning = "Noise ratio is high; lowering min_cluster_size and min_samples."
return {
"action": action,
"min_cluster_size": min_cluster_size,
"max_cluster_size": max_cluster_size,
"min_samples": min_samples,
"reasoning": reasoning,
}
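# Illustrative tweak (hypothetical metrics): with n_clusters=12 (below
# OPTIMIZE_TARGET_CLUSTER_MIN=20), min_cluster_size=20 and min_samples=5, the
# heuristic returns action="tweak" with min_cluster_size=16 (round(20 * 0.8))
# and min_samples=4; max_cluster_size is left unchanged.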
def _normalize_hdbscan_suggestion(suggestion: dict, current: dict) -> dict:
action = str(suggestion.get("action", "accept")).strip().lower()
action = action if action in {"accept", "tweak"} else "accept"
min_cluster_size = _clamp_int(
suggestion.get("min_cluster_size", current.get("min_cluster_size")),
OPTIMIZE_MIN_CLUSTER_SIZE_MIN,
OPTIMIZE_MIN_CLUSTER_SIZE_MAX,
current.get("min_cluster_size", HDBSCAN_MIN_CLUSTER_SIZE),
)
max_cluster_size = _clamp_int(
suggestion.get("max_cluster_size", current.get("max_cluster_size")),
OPTIMIZE_MAX_CLUSTER_SIZE_MIN,
OPTIMIZE_MAX_CLUSTER_SIZE_MAX,
current.get("max_cluster_size", HDBSCAN_MAX_CLUSTER_SIZE),
)
min_samples = _clamp_int(
suggestion.get("min_samples", current.get("min_samples")),
OPTIMIZE_MIN_SAMPLES_MIN,
OPTIMIZE_MIN_SAMPLES_MAX,
current.get("min_samples", HDBSCAN_MIN_SAMPLES),
)
if max_cluster_size < min_cluster_size:
max_cluster_size = min_cluster_size + 1
return {
"action": action,
"min_cluster_size": min_cluster_size,
"max_cluster_size": max_cluster_size,
"min_samples": min_samples,
"reasoning": str(suggestion.get("reasoning", "")).strip(),
}
def _metrics_in_target(metrics: dict) -> bool:
n_clusters = int(metrics.get("n_clusters", 0))
noise_ratio = float(metrics.get("noise_ratio", 1.0))
return (
OPTIMIZE_TARGET_CLUSTER_MIN <= n_clusters <= OPTIMIZE_TARGET_CLUSTER_MAX
and noise_ratio <= OPTIMIZE_TARGET_NOISE_MAX
)
def _optimization_score(metrics: dict) -> float:
n_clusters = int(metrics.get("n_clusters", 0))
noise_ratio = float(metrics.get("noise_ratio", 1.0))
if n_clusters < OPTIMIZE_TARGET_CLUSTER_MIN:
cluster_penalty = (OPTIMIZE_TARGET_CLUSTER_MIN - n_clusters) / max(
OPTIMIZE_TARGET_CLUSTER_MIN,
1,
)
elif n_clusters > OPTIMIZE_TARGET_CLUSTER_MAX:
cluster_penalty = (n_clusters - OPTIMIZE_TARGET_CLUSTER_MAX) / max(
OPTIMIZE_TARGET_CLUSTER_MAX,
1,
)
else:
cluster_penalty = 0.0
noise_penalty = max(0.0, noise_ratio - OPTIMIZE_TARGET_NOISE_MAX) / max(
OPTIMIZE_TARGET_NOISE_MAX,
1e-6,
)
return 1.0 - min(1.0, cluster_penalty + noise_penalty)
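# Worked example for _optimization_score: with n_clusters=10 and
# noise_ratio=0.60, cluster_penalty = (20 - 10) / 20 = 0.5 and
# noise_penalty = (0.60 - 0.50) / 0.50 = 0.2, so the score is
# 1 - min(1.0, 0.7) = 0.3; metrics inside both target ranges score 1.0.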
def _load_sentence_meta(run_key: str, sentences: list[str]) -> list[dict]:
    meta_path = OUTPUT_DIR / run_key / "sentence_meta.json"
    fallback = [
        {
            "sentence": s,
            "paper_title": "",
            "paper_id": None,
        }
        for s in sentences
    ]
    if not meta_path.exists():
        return fallback
    meta = _load_json(meta_path)
    # Reject malformed or stale metadata: wrong container type or a length
    # that no longer matches the sentences artefact.
    if not isinstance(meta, list) or len(meta) != len(sentences):
        return fallback
    return meta
def _top_papers_for_mask(meta: list[dict], mask: np.ndarray, k: int = 3) -> dict:
counts: dict[tuple[object, str], int] = {}
for idx, entry in enumerate(meta):
if not mask[idx]:
continue
paper_id = entry.get("paper_id")
title = str(entry.get("paper_title") or entry.get("title") or "").strip()
if not title:
title = f"Paper {paper_id}" if paper_id is not None else "Unknown"
key = (paper_id, title)
counts[key] = counts.get(key, 0) + 1
ordered = sorted(
counts.items(),
key=lambda kv: (-kv[1], str(kv[0][1]).lower()),
)
top = [
{"paper_id": pid, "paper_title": title, "count": count}
for (pid, title), count in ordered[:k]
]
return {
"paper_count": int(len(counts)),
"top_papers": top,
}
def _is_transient_provider_error(exc: Exception) -> bool:
"""Detect transient provider outages (Mistral/Groq) that should be retried."""
msg = str(exc).lower()
return (
"unreachable_backend" in msg
or "internal server error" in msg
or '"code":"1100"' in msg
or '"raw_status_code":503' in msg
or '"raw_status_code":502' in msg
or '"raw_status_code":504' in msg
or '"status":503' in msg
or '"status":502' in msg
or '"status":504' in msg
or '"status":429' in msg
or "too many requests" in msg
or "rate limit" in msg
or "gateway timeout" in msg
or "service unavailable" in msg
)
def _is_rate_limit_error(exc: Exception) -> bool:
msg = str(exc).lower()
return (
"rate limit" in msg
or "too many requests" in msg
or '"raw_status_code":429' in msg
or '"status":429' in msg
or "status code: 429" in msg
)
def _invoke_with_retries(fn):
"""Run an LLM call with bounded linear backoff on transient provider errors."""
last_exc: Exception | None = None
for attempt in range(PROVIDER_RETRY_ATTEMPTS):
try:
return fn()
except Exception as exc:
if not _is_transient_provider_error(exc):
raise
last_exc = exc
if attempt < PROVIDER_RETRY_ATTEMPTS - 1:
delay = PROVIDER_RETRY_BASE_DELAY_S * (attempt + 1)
if _is_rate_limit_error(exc):
delay = max(delay, PROVIDER_RETRY_RATE_LIMIT_DELAY_S * (attempt + 1))
time.sleep(min(PROVIDER_RETRY_MAX_DELAY_S, delay))
continue
    # All attempts exhausted; surface the last transient error seen.
    raise last_exc if last_exc else RuntimeError("Unexpected retry flow in _invoke_with_retries")
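# Retry schedule with the defaults above (PROVIDER_RETRY_ATTEMPTS=4): transient
# errors sleep 2s, 4s, 6s between attempts; rate-limit errors sleep 6s, 12s,
# 18s, each capped at PROVIDER_RETRY_MAX_DELAY_S. Non-transient errors
# re-raise immediately on the first failure.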
def _save_json(path: Path, data: object) -> None:
path.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
def _load_json(path: Path) -> object:
return json.loads(path.read_text(encoding="utf-8"))
# ---------------------------------------------------------------------------
# Plotly chart builders
# ---------------------------------------------------------------------------
def _chart_intertopic(summaries: list[dict]) -> go.Figure:
df = pd.DataFrame(summaries)
return px.scatter(
df,
x="cx", y="cy",
size="size",
text="cluster_id",
color="size",
color_continuous_scale="Blues",
title="Intertopic Distance Map",
labels={"cx": "Dim-1", "cy": "Dim-2", "size": "Sentences"},
template="plotly_dark",
)
def _chart_top_words(summaries: list[dict]) -> go.Figure:
df = (
pd.DataFrame(summaries)
.nlargest(20, "size")
.assign(label=lambda d: d["cluster_id"].astype(str))
)
return px.bar(
df,
x="size", y="label",
orientation="h",
title="Top Clusters by Sentence Count",
labels={"size": "Sentences", "label": "Cluster"},
color="size",
color_continuous_scale="Teal",
template="plotly_dark",
)
def _chart_hierarchy(labels: list[int], embeddings: np.ndarray) -> go.Figure:
unique = sorted(filter(lambda v: v != -1, set(labels)))
if not unique:
fig = go.Figure()
fig.update_layout(title="Cluster Hierarchy", template="plotly_dark")
return fig
labels_arr = np.array(labels)
centroids = np.vstack([
_centroid(embeddings[labels_arr == lbl])
for lbl in unique
])
    # create_dendrogram treats its input as raw observations, so pass the
    # centroids with an explicit cosine distfun rather than a precomputed
    # distance matrix (which would otherwise be re-distanced with euclidean).
    fig = ff.create_dendrogram(
        centroids,
        labels=[str(lbl) for lbl in unique],
        distfun=lambda x: squareform(1 - cosine_similarity(x), checks=False),
        colorscale=px.colors.sequential.Blues,
    )
fig.update_layout(title="Cluster Hierarchy", template="plotly_dark")
return fig
def _chart_heatmap(labels: list[int], embeddings: np.ndarray) -> go.Figure:
unique = sorted(filter(lambda v: v != -1, set(labels)))
if not unique:
fig = go.Figure()
fig.update_layout(title="Cluster Similarity Heatmap", template="plotly_dark")
return fig
labels_arr = np.array(labels)
centroids = np.vstack([
_centroid(embeddings[labels_arr == lbl])
for lbl in unique
])
sim_mat = cosine_similarity(centroids)
return px.imshow(
sim_mat,
x=[str(l) for l in unique],
y=[str(l) for l in unique],
color_continuous_scale="Blues",
title="Cluster Similarity Heatmap",
template="plotly_dark",
)
def _save_chart(fig: go.Figure, path: Path) -> str:
fig.write_html(str(path), full_html=True, include_plotlyjs="cdn")
return str(path)
_OPTIMIZE_PROMPT = PromptTemplate.from_template(
"""You are optimizing HDBSCAN clustering parameters for BERTopic.
Current parameters:
min_cluster_size: {min_cluster_size}
max_cluster_size: {max_cluster_size}
min_samples: {min_samples}
Clustering metrics:
n_sentences: {n_sentences}
n_clusters: {n_clusters}
noise_ratio: {noise_ratio}
min_size: {min_size}
median_size: {median_size}
mean_size: {mean_size}
max_size: {max_size}
Constraints:
- Only adjust min_cluster_size, max_cluster_size, min_samples.
- Keep min_cluster_size within [{min_cluster_size_min}, {min_cluster_size_max}].
- Keep max_cluster_size within [{max_cluster_size_min}, {max_cluster_size_max}].
- Keep min_samples within [{min_samples_min}, {min_samples_max}].
- Prefer n_clusters in [{target_cluster_min}, {target_cluster_max}].
- Prefer noise_ratio <= {target_noise_max}.
Return RAW JSON with exactly these keys:
action: "accept" or "tweak"
min_cluster_size: int
max_cluster_size: int
min_samples: int
reasoning: short sentence
If clustering already looks good, set action="accept" and repeat the current values.
Respond with RAW JSON only.
"""
)
def _recommend_hdbscan_params(metrics: dict, params: dict) -> dict:
if not MISTRAL_API_KEY:
return _normalize_hdbscan_suggestion(
_heuristic_hdbscan_tweak(metrics, params),
params,
)
chain = _OPTIMIZE_PROMPT | _llm() | JsonOutputParser()
payload = {
**metrics,
**params,
"min_cluster_size_min": OPTIMIZE_MIN_CLUSTER_SIZE_MIN,
"min_cluster_size_max": OPTIMIZE_MIN_CLUSTER_SIZE_MAX,
"max_cluster_size_min": OPTIMIZE_MAX_CLUSTER_SIZE_MIN,
"max_cluster_size_max": OPTIMIZE_MAX_CLUSTER_SIZE_MAX,
"min_samples_min": OPTIMIZE_MIN_SAMPLES_MIN,
"min_samples_max": OPTIMIZE_MIN_SAMPLES_MAX,
"target_cluster_min": OPTIMIZE_TARGET_CLUSTER_MIN,
"target_cluster_max": OPTIMIZE_TARGET_CLUSTER_MAX,
"target_noise_max": OPTIMIZE_TARGET_NOISE_MAX,
}
try:
suggestion = _invoke_with_retries(lambda: chain.invoke(payload))
except Exception:
suggestion = {}
if not isinstance(suggestion, dict) or not suggestion:
suggestion = _heuristic_hdbscan_tweak(metrics, params)
return _normalize_hdbscan_suggestion(suggestion, params)
# ============================================================================
# TOOL 1 — load_scopus_csv
# ============================================================================
@tool
def load_scopus_csv(filepath: str) -> dict:
"""
Load a Scopus-exported CSV and extract corpus statistics.
Parameters
----------
filepath : str
Absolute or relative path to the CSV file.
Returns
-------
dict with keys:
paper_count, abstract_sentence_count, title_sentence_count,
keywords_term_count,
columns, sample_abstracts, filepath
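    Example
    -------
    Illustrative invocation (the CSV path is hypothetical):
    >>> load_scopus_csv.invoke({"filepath": "data/scopus_export.csv"})  # doctest: +SKIP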
"""
df = pd.read_csv(filepath).rename(columns=str.strip)
abstract_texts, abstract_col = _texts_for_candidates(df, RUN_CONFIGS["abstract"])
title_texts, title_col = _texts_for_candidates(df, RUN_CONFIGS["title"])
keywords_texts, keywords_col = _texts_for_candidates(df, RUN_CONFIGS["keywords"])
titles_for_meta = (
df[title_col].fillna("").astype(str).tolist()
if title_col
else [""] * len(df)
)
def _build_sentences_and_meta(text_col: str | None, splitter) -> tuple[list[str], list[dict]]:
if not text_col:
return [], []
texts = df[text_col].fillna("").astype(str).tolist()
sentences: list[str] = []
meta: list[dict] = []
for idx, (text, title) in enumerate(zip(texts, titles_for_meta), start=1):
parts = splitter(text)
if not parts:
continue
sentences.extend(parts)
meta.extend(
{
"sentence": part,
"paper_title": title or f"Paper {idx}",
"paper_id": idx,
}
for part in parts
)
return sentences, meta
abstract_sentences, abstract_meta = _build_sentences_and_meta(
abstract_col, _split_sentences
)
title_sentences, title_meta = _build_sentences_and_meta(
title_col, _split_sentences
)
keywords_terms, keywords_meta = _build_sentences_and_meta(
keywords_col, _split_keywords
)
_ensure_dir(OUTPUT_DIR / "abstract")
_ensure_dir(OUTPUT_DIR / "title")
_ensure_dir(OUTPUT_DIR / "keywords")
_save_json(OUTPUT_DIR / "abstract" / "sentences.json", abstract_sentences)
_save_json(OUTPUT_DIR / "abstract" / "sentence_meta.json", abstract_meta)
_save_json(OUTPUT_DIR / "title" / "sentences.json", title_sentences)
_save_json(OUTPUT_DIR / "title" / "sentence_meta.json", title_meta)
_save_json(OUTPUT_DIR / "keywords" / "sentences.json", keywords_terms)
_save_json(OUTPUT_DIR / "keywords" / "sentence_meta.json", keywords_meta)
df.to_csv(OUTPUT_DIR / "corpus.csv", index=False)
return {
"paper_count": int(len(df)),
"abstract_sentence_count": int(len(abstract_sentences)),
"title_sentence_count": int(len(title_sentences)),
"keywords_term_count": int(len(keywords_terms)),
"detected_columns": {
"abstract": abstract_col,
"title": title_col,
"keywords": keywords_col,
},
"columns": df.columns.tolist(),
"sample_abstracts": abstract_texts[:3],
"filepath": str(filepath),
}
# ============================================================================
# TOOL 2 — run_bertopic_discovery
# ============================================================================
@tool
def run_bertopic_discovery(
run_key: str,
threshold: float = DISTANCE_THRESH,
min_cluster_size: int = HDBSCAN_MIN_CLUSTER_SIZE,
max_cluster_size: int = HDBSCAN_MAX_CLUSTER_SIZE,
min_samples: int = HDBSCAN_MIN_SAMPLES,
auto_optimize: bool = AUTO_OPTIMIZE_CLUSTERS,
max_optimize_iters: int = OPTIMIZE_MAX_ITERS,
) -> dict:
"""
Embed sentences, cluster with UMAP + HDBSCAN, extract evidence,
and generate four Plotly charts.
Saved artefacts
---------------
emb.npy : (N, D) float32 L2-normalised embeddings
sent_labels.npy : (N,) int32 per-sentence cluster label [BUG 1 FIX]
summaries.json : list of cluster dicts with evidence sentences
optimization.json : list of optimization rounds and metrics
Parameters
----------
run_key : str — "abstract" or "title" or "keywords"
threshold : float — legacy arg (ignored by HDBSCAN)
min_cluster_size : int — HDBSCAN minimum cluster size
max_cluster_size : int — HDBSCAN maximum cluster size
min_samples : int — HDBSCAN min_samples
auto_optimize : bool — run LLM-guided optimization loop
max_optimize_iters : int — max optimization rounds after initial run
Returns
-------
dict with keys:
run_key, n_clusters, n_sentences, threshold,
chart_paths, summaries_path, embeddings_path, optimization_path
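    Example
    -------
    Illustrative invocation with the default clustering parameters:
    >>> run_bertopic_discovery.invoke({"run_key": "abstract"})  # doctest: +SKIP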
"""
if run_key not in RUN_CONFIGS:
return {
"run_key": run_key,
"n_clusters": 0,
"n_sentences": 0,
"threshold": threshold,
"chart_paths": {},
"error": (
f"Unsupported run_key: {run_key}. "
f"Use one of: {', '.join(RUN_CONFIGS.keys())}."
),
}
rdir = _run_dir(run_key)
sent_path = OUTPUT_DIR / run_key / "sentences.json"
if not sent_path.exists():
return {
"run_key": run_key,
"n_clusters": 0,
"n_sentences": 0,
"threshold": threshold,
"chart_paths": {},
"error": (
f"Missing sentences artifact: {sent_path}. "
"Run load_scopus_csv first."
),
}
sentences = _load_json(sent_path)
if not sentences:
return {
"run_key": run_key,
"n_clusters": 0,
"n_sentences": 0,
"threshold": threshold,
"chart_paths": {},
"error": (
f"No sentences/terms found for run_key={run_key}. "
"Check that the corresponding source column exists in the CSV."
),
}
sentence_meta = _load_sentence_meta(run_key, sentences)
emb_path = rdir / "emb.npy"
embeddings = None
if emb_path.exists():
cached = np.load(str(emb_path))
if cached.shape[0] == len(sentences):
embeddings = cached
if embeddings is None:
embeddings = _embed(sentences)
np.save(str(emb_path), embeddings)
cluster_space = _umap_reduce(embeddings, UMAP_N_COMPONENTS_CLUSTER)
umap_2d = _umap_reduce(embeddings, UMAP_N_COMPONENTS_VIZ)
def _run_hdbscan(params: dict) -> tuple[list[int], dict]:
labels_local = _cluster(
cluster_space,
min_cluster_size=int(params.get("min_cluster_size", HDBSCAN_MIN_CLUSTER_SIZE)),
max_cluster_size=int(params.get("max_cluster_size", HDBSCAN_MAX_CLUSTER_SIZE)),
min_samples=int(params.get("min_samples", HDBSCAN_MIN_SAMPLES)),
).tolist()
return labels_local, _cluster_metrics(np.array(labels_local))
current_params = {
"min_cluster_size": int(min_cluster_size),
"max_cluster_size": int(max_cluster_size),
"min_samples": int(min_samples),
}
labels, metrics = _run_hdbscan(current_params)
optimization_log = [
{
"round": 0,
"params": current_params,
"metrics": metrics,
}
]
best_score = _optimization_score(metrics)
stable_rounds = 0
seen_params = {(
current_params["min_cluster_size"],
current_params["max_cluster_size"],
current_params["min_samples"],
)}
if bool(auto_optimize) and int(max_optimize_iters) > 0:
for round_idx in range(1, int(max_optimize_iters) + 1):
suggestion = _recommend_hdbscan_params(metrics, current_params)
if suggestion.get("action") == "accept":
optimization_log.append({
"round": round_idx,
"params": current_params,
"metrics": metrics,
"action": "accept",
"reasoning": suggestion.get("reasoning", ""),
})
break
next_params = {
"min_cluster_size": int(suggestion.get("min_cluster_size")),
"max_cluster_size": int(suggestion.get("max_cluster_size")),
"min_samples": int(suggestion.get("min_samples")),
}
next_key = (
next_params["min_cluster_size"],
next_params["max_cluster_size"],
next_params["min_samples"],
)
if next_key in seen_params:
optimization_log.append({
"round": round_idx,
"params": current_params,
"metrics": metrics,
"action": "stop",
"reasoning": "Repeated parameter set; stopping optimization.",
})
break
labels, metrics = _run_hdbscan(next_params)
optimization_log.append({
"round": round_idx,
"params": next_params,
"metrics": metrics,
"reasoning": suggestion.get("reasoning", ""),
})
current_params = next_params
seen_params.add(next_key)
score = _optimization_score(metrics)
if score <= best_score + OPTIMIZE_MIN_IMPROVEMENT:
stable_rounds += 1
else:
best_score = score
stable_rounds = 0
if _metrics_in_target(metrics):
break
if stable_rounds >= OPTIMIZE_STABLE_ROUNDS:
break
optimization_path = rdir / "optimization.json"
_save_json(optimization_path, optimization_log)
unique_ids = sorted(filter(lambda v: v != -1, set(labels)))
# FIX BUG 1 — persist per-sentence label array so Tool 4 can build
# correct cluster masks without any guesswork or scaffolding.
np.save(str(rdir / "sent_labels.npy"), np.array(labels, dtype=np.int32))
labels_arr = np.array(labels)
if not unique_ids:
_save_json(rdir / "summaries.json", [])
return {
"run_key": run_key,
"n_clusters": 0,
"n_sentences": int(len(sentences)),
"threshold": threshold,
"min_cluster_size": int(current_params["min_cluster_size"]),
"max_cluster_size": int(current_params["max_cluster_size"]),
"min_samples": int(current_params["min_samples"]),
"chart_paths": {},
"summaries_path": str(rdir / "summaries.json"),
"embeddings_path": str(rdir / "emb.npy"),
"optimization_path": str(optimization_path),
"error": "HDBSCAN produced no clusters (all points labeled as noise).",
}
def _cluster_summary(cid: int) -> dict:
mask = labels_arr == cid
c_emb = embeddings[mask]
c_umap = umap_2d[mask]
c_sent = list(np.array(sentences)[mask])
ctroid = _centroid(c_emb)
top_idx = _top_k_indices(c_emb, ctroid, N_EVIDENCE)
coords = (
c_umap.mean(axis=0)
if c_umap.shape[0] > 0
else np.zeros(UMAP_N_COMPONENTS_VIZ, dtype=np.float32)
)
paper_stats = _top_papers_for_mask(sentence_meta, mask, k=3)
return {
"cluster_id": int(cid),
"size": int(mask.sum()),
"cx": float(coords[0]),
"cy": float(coords[1]),
"evidence": list(np.array(c_sent)[top_idx]),
"paper_count": paper_stats.get("paper_count", 0),
"top_papers": paper_stats.get("top_papers", []),
}
summaries = list(map(_cluster_summary, unique_ids))
_save_json(rdir / "summaries.json", summaries)
chart_paths = {
"Intertopic Map": _save_chart(_chart_intertopic(summaries), rdir / "intertopic.html"),
"Top Words": _save_chart(_chart_top_words(summaries), rdir / "topwords.html"),
"Hierarchy": _save_chart(_chart_hierarchy(labels, embeddings), rdir / "hierarchy.html"),
"Heatmap": _save_chart(_chart_heatmap(labels, embeddings), rdir / "heatmap.html"),
}
return {
"run_key": run_key,
"n_clusters": int(len(unique_ids)),
"n_sentences": int(len(sentences)),
"threshold": threshold,
"min_cluster_size": int(current_params["min_cluster_size"]),
"max_cluster_size": int(current_params["max_cluster_size"]),
"min_samples": int(current_params["min_samples"]),
"chart_paths": chart_paths,
"summaries_path": str(rdir / "summaries.json"),
"embeddings_path": str(rdir / "emb.npy"),
"optimization_path": str(optimization_path),
}
# ============================================================================
# TOOL 3 — label_topics_with_llm
# ============================================================================
_LABEL_PROMPT = PromptTemplate.from_template(
"""You are an expert academic researcher specialising in Information Systems.
Given the following cluster of research sentences, return a JSON object with EXACTLY these keys:
label : short research-area name (<= 6 words)
category : broader IS research category
confidence : float 0.0-1.0
reasoning : one sentence explaining your choice
niche : boolean - true if highly specialised / narrow
Cluster ID : {cluster_id}
Sentence count: {size}
Evidence sentences:
{evidence}
Respond with RAW JSON only. No markdown, no explanation outside the JSON.
"""
)
_LABEL_JUDGE_PROMPT = PromptTemplate.from_template(
"""You are an expert label adjudicator. Choose the single best label from
the candidates below based on the evidence sentences.
Cluster ID : {cluster_id}
Sentence count: {size}
Evidence sentences:
{evidence}
Candidate labels:
1) Mistral
Label: {mistral_label}
Category: {mistral_category}
Confidence: {mistral_confidence}
Reasoning: {mistral_reasoning}
2) Groq-Ollama
Label: {groq_ollama_label}
Category: {groq_ollama_category}
Confidence: {groq_ollama_confidence}
Reasoning: {groq_ollama_reasoning}
3) Groq-GPT
Label: {groq_gpt_label}
Category: {groq_gpt_category}
Confidence: {groq_gpt_confidence}
Reasoning: {groq_gpt_reasoning}
Rules:
- Choose exactly one of the three labels. Do not invent a new label.
- Pick the label that best matches the evidence and is most specific.
- If two are equally good, prefer the one with higher confidence.
Return RAW JSON with exactly these keys:
best_label: string
best_category: string
chosen_source: string # one of: mistral, groq_ollama, groq_gpt
best_reasoning: string
Respond with RAW JSON only.
"""
)
@tool
def label_topics_with_llm(run_key: str) -> dict:
"""
Label each cluster with Mistral only (default Phase 2 labeling pass).
Parameters
----------
run_key : str — "abstract" or "title" or "keywords"
Returns
-------
dict with keys:
run_key, labels_path, labelled_count, labels_preview (list of dicts)
"""
rdir = _run_dir(run_key)
summaries_path = rdir / "summaries.json"
if not summaries_path.exists():
return {
"run_key": run_key,
"labels_path": str(rdir / "labels.json"),
"labelled_count": 0,
"total_clusters": 0,
"selected_clusters": 0,
"skipped_clusters": 0,
"labels_preview": [],
"error": (
f"Missing discovery artifact: {summaries_path}. "
"Run run_bertopic_discovery first for this run_key."
),
}
summaries = _load_json(summaries_path)
ranked = sorted(
filter(lambda s: s.get("size", 0) >= MIN_CLUSTER_SIZE_FOR_LABEL, summaries),
key=lambda s: s.get("size", 0),
reverse=True,
)
selected = ranked[:MAX_LABEL_CLUSTERS]
chain_mistral = _LABEL_PROMPT | _llm() | JsonOutputParser()
def _evidence_block(summary: dict) -> str:
return "\n".join(
f" {i+1}. {s}"
for i, s in enumerate(summary["evidence"])
)
def _label_one(summary: dict) -> dict:
result = _invoke_with_retries(lambda: chain_mistral.invoke({
"cluster_id": summary["cluster_id"],
"size": summary["size"],
"evidence": _evidence_block(summary),
}))
return {
**summary,
**result,
"mistral_label": result.get("label", ""),
"mistral_category": result.get("category", ""),
"mistral_confidence": _to_float(result.get("confidence"), 0.0),
"mistral_reasoning": result.get("reasoning", ""),
"mistral_niche": bool(result.get("niche", False)),
"groq_label": "",
"groq_category": "",
"groq_confidence": 0.0,
"groq_reasoning": "",
"groq_niche": False,
"groq_ollama_label": "",
"groq_ollama_category": "",
"groq_ollama_confidence": 0.0,
"groq_ollama_reasoning": "",
"groq_ollama_niche": False,
"groq_gpt_label": "",
"groq_gpt_category": "",
"groq_gpt_confidence": 0.0,
"groq_gpt_reasoning": "",
"groq_gpt_niche": False,
"verification_done": False,
"verification_done_ollama": False,
"verification_done_gpt": False,
"verification_note": (
"Run VERIFY in Phase 2 to compare with Groq-Ollama and Groq-GPT labels."
),
}
labelled = list(map(_label_one, selected))
_save_json(rdir / "labels.json", labelled)
# Keep tool output compact so the ReAct transcript does not overflow model context.
preview = list(map(
lambda r: {
"cluster_id": r.get("cluster_id"),
"label": r.get("label"),
"category": r.get("category"),
"confidence": r.get("confidence"),
"mistral_label": r.get("mistral_label", ""),
"groq_label": r.get("groq_label", ""),
"groq_ollama_label": r.get("groq_ollama_label", r.get("groq_label", "")),
"groq_gpt_label": r.get("groq_gpt_label", ""),
"size": r.get("size"),
"niche": r.get("niche", False),
},
labelled[:MAX_TOOL_RETURN_PREVIEW],
))
return {
"run_key": run_key,
"labels_path": str(rdir / "labels.json"),
"labelled_count": len(labelled),
"total_clusters": len(summaries),
"selected_clusters": len(selected),
"skipped_clusters": max(0, len(summaries) - len(selected)),
"groq_enabled": _groq_ollama_enabled() and _groq_gpt_enabled(),
"mode_note": "Single-model labeling complete (Mistral). Send VERIFY in Phase 2 to run Groq-Ollama and Groq-GPT verification.",
"labels_preview": preview,
}
@tool
def verify_topic_labels_with_groq(run_key: str) -> dict:
"""
    Run Groq topic labeling for already-labeled topics and append comparison
    fields into labels.json so the UI review table can show Mistral vs
    Groq-Ollama vs Groq-GPT labels, plus an adjudicated best label when
    GROQ_JUDGE_MODEL_NAME is configured.
Parameters
----------
run_key : str — "abstract" or "title" or "keywords"
Returns
-------
dict with keys:
run_key, labels_path, verification_path, verified_count, labels_preview
"""
rdir = _run_dir(run_key)
labels_path = rdir / "labels.json"
summaries_path = rdir / "summaries.json"
if not _groq_ollama_enabled() or not _groq_gpt_enabled():
return {
"run_key": run_key,
"labels_path": str(labels_path),
"verified_count": 0,
"labels_preview": [],
"error": (
"GROQ_API_KEY or Groq model config is missing, or langchain-groq is unavailable. "
"Set GROQ_API_KEY and GROQ_GPT_MODEL_NAME (and optionally GROQ_OLLAMA_MODEL_NAME) "
"and install requirements to use VERIFY."
),
}
if not labels_path.exists():
return {
"run_key": run_key,
"labels_path": str(labels_path),
"verified_count": 0,
"labels_preview": [],
"error": (
f"Missing labels artifact: {labels_path}. "
"Run label_topics_with_llm first."
),
}
if not summaries_path.exists():
return {
"run_key": run_key,
"labels_path": str(labels_path),
"verified_count": 0,
"labels_preview": [],
"error": (
f"Missing summaries artifact: {summaries_path}. "
"Run run_bertopic_discovery first."
),
}
labels_data = _load_json(labels_path)
summaries = _load_json(summaries_path)
summary_by_id = {
int(s.get("cluster_id", -1)): s
for s in summaries
}
target_rows = list(filter(
lambda r: int(r.get("cluster_id", -1)) in summary_by_id,
labels_data,
))
chain_groq_ollama = _LABEL_PROMPT | _llm_groq(GROQ_OLLAMA_MODEL_NAME) | JsonOutputParser()
chain_groq_gpt = _LABEL_PROMPT | _llm_groq(GROQ_GPT_MODEL_NAME) | JsonOutputParser()
chain_judge = (
_LABEL_JUDGE_PROMPT | _llm_groq(GROQ_JUDGE_MODEL_NAME) | JsonOutputParser()
if _groq_judge_enabled()
else None
)
def _evidence_block(summary: dict) -> str:
return "\n".join(
f" {i+1}. {s}"
for i, s in enumerate(summary.get("evidence", []))
)
def _label_with_groq(row: dict) -> tuple[int, dict, dict]:
cid = int(row.get("cluster_id", -1))
summary = summary_by_id[cid]
payload = {
"cluster_id": summary["cluster_id"],
"size": summary["size"],
"evidence": _evidence_block(summary),
}
groq_ollama = _invoke_with_retries(lambda: chain_groq_ollama.invoke(payload))
groq_gpt = _invoke_with_retries(lambda: chain_groq_gpt.invoke(payload))
return cid, groq_ollama, groq_gpt
groq_pairs = list(map(_label_with_groq, target_rows))
groq_ollama_by_id = {cid: data for cid, data, _ in groq_pairs}
groq_gpt_by_id = {cid: data for cid, _, data in groq_pairs}
def _judge_label(row: dict) -> tuple[int, dict]:
if chain_judge is None:
return int(row.get("cluster_id", -1)), {}
cid = int(row.get("cluster_id", -1))
summary = summary_by_id[cid]
groq_ollama = groq_ollama_by_id.get(cid, {})
groq_gpt = groq_gpt_by_id.get(cid, {})
payload = {
"cluster_id": summary.get("cluster_id"),
"size": summary.get("size"),
"evidence": _evidence_block(summary),
"mistral_label": str(row.get("mistral_label") or row.get("label", "")).strip(),
"mistral_category": str(row.get("mistral_category") or row.get("category", "")).strip(),
"mistral_confidence": _to_float(row.get("mistral_confidence", row.get("confidence", 0.0)), 0.0),
"mistral_reasoning": str(row.get("mistral_reasoning") or row.get("reasoning", "")).strip(),
"groq_ollama_label": str(groq_ollama.get("label", "")).strip(),
"groq_ollama_category": str(groq_ollama.get("category", "")).strip(),
"groq_ollama_confidence": _to_float(groq_ollama.get("confidence"), 0.0),
"groq_ollama_reasoning": str(groq_ollama.get("reasoning", "")).strip(),
"groq_gpt_label": str(groq_gpt.get("label", "")).strip(),
"groq_gpt_category": str(groq_gpt.get("category", "")).strip(),
"groq_gpt_confidence": _to_float(groq_gpt.get("confidence"), 0.0),
"groq_gpt_reasoning": str(groq_gpt.get("reasoning", "")).strip(),
}
try:
result = _invoke_with_retries(lambda: chain_judge.invoke(payload))
except Exception:
result = {}
return cid, result
judge_pairs = list(map(_judge_label, target_rows)) if chain_judge else []
judge_by_id = {cid: data for cid, data in judge_pairs}
def _merge_row(row: dict) -> dict:
cid = int(row.get("cluster_id", -1))
groq_ollama = groq_ollama_by_id.get(cid, {})
groq_gpt = groq_gpt_by_id.get(cid, {})
adjudicated = judge_by_id.get(cid, {})
has_groq_ollama = bool(groq_ollama)
has_groq_gpt = bool(groq_gpt)
mistral_label = str(row.get("mistral_label") or row.get("label", "")).strip()
groq_ollama_label = str(groq_ollama.get("label", "")).strip()
groq_gpt_label = str(groq_gpt.get("label", "")).strip()
adjudicated_label = str(adjudicated.get("best_label", "")).strip()
is_agreement = (
all([mistral_label, groq_ollama_label, groq_gpt_label])
and mistral_label.lower() == groq_ollama_label.lower()
and mistral_label.lower() == groq_gpt_label.lower()
)
return {
**row,
"mistral_label": mistral_label,
"mistral_category": row.get("mistral_category") or row.get("category", ""),
"mistral_confidence": _to_float(
row.get("mistral_confidence", row.get("confidence", 0.0)),
0.0,
),
"mistral_reasoning": row.get("mistral_reasoning") or row.get("reasoning", ""),
"mistral_niche": bool(row.get("mistral_niche", row.get("niche", False))),
"groq_label": groq_ollama_label,
"groq_category": groq_ollama.get("category", ""),
"groq_confidence": _to_float(groq_ollama.get("confidence"), 0.0),
"groq_reasoning": groq_ollama.get("reasoning", ""),
"groq_niche": bool(groq_ollama.get("niche", False)),
"groq_ollama_label": groq_ollama_label,
"groq_ollama_category": groq_ollama.get("category", ""),
"groq_ollama_confidence": _to_float(groq_ollama.get("confidence"), 0.0),
"groq_ollama_reasoning": groq_ollama.get("reasoning", ""),
"groq_ollama_niche": bool(groq_ollama.get("niche", False)),
"groq_gpt_label": groq_gpt_label,
"groq_gpt_category": groq_gpt.get("category", ""),
"groq_gpt_confidence": _to_float(groq_gpt.get("confidence"), 0.0),
"groq_gpt_reasoning": groq_gpt.get("reasoning", ""),
"groq_gpt_niche": bool(groq_gpt.get("niche", False)),
"adjudicated_label": adjudicated_label,
"adjudicated_category": str(adjudicated.get("best_category", "")).strip(),
"adjudicated_reasoning": str(adjudicated.get("best_reasoning", "")).strip(),
"adjudicated_source": str(adjudicated.get("chosen_source", "")).strip(),
"adjudication_done": bool(adjudicated_label),
"adjudication_note": (
"Adjudicated label available."
if adjudicated_label
else "Adjudication unavailable for this topic."
),
"verification_done": has_groq_ollama and has_groq_gpt,
"verification_done_ollama": has_groq_ollama,
"verification_done_gpt": has_groq_gpt,
"verification_note": (
"Mistral, Groq-Ollama, and Groq-GPT labels match."
if is_agreement
else "Model labels differ. Review before approval."
)
if has_groq_ollama and has_groq_gpt
else "Groq labeling unavailable for this topic.",
}
verified_rows = list(map(_merge_row, labels_data))
verification_path = rdir / "labels_verification.json"
_save_json(labels_path, verified_rows)
_save_json(verification_path, verified_rows)
preview = list(map(
lambda r: {
"cluster_id": r.get("cluster_id"),
"mistral_label": r.get("mistral_label", ""),
"groq_ollama_label": r.get("groq_ollama_label", r.get("groq_label", "")),
"groq_gpt_label": r.get("groq_gpt_label", ""),
"adjudicated_label": r.get("adjudicated_label", ""),
"verification_note": r.get("verification_note", ""),
},
verified_rows[:MAX_TOOL_RETURN_PREVIEW],
))
verified_count = sum(
1
for row in verified_rows
if row.get("groq_ollama_label") and row.get("groq_gpt_label")
)
return {
"run_key": run_key,
"labels_path": str(labels_path),
"verification_path": str(verification_path),
"verified_count": int(verified_count),
"labelled_count": int(len(verified_rows)),
"labels_preview": preview,
}
# ============================================================================
# TOOL 4 — consolidate_into_themes
# ============================================================================
@tool
def consolidate_into_themes(run_key: str, theme_map: dict) -> dict:
"""
Merge approved / renamed topics into consolidated themes and recompute
centroids from the actual merged-cluster embeddings.
Parameters
----------
run_key : str — "abstract" or "title" or "keywords"
theme_map : dict — {new_theme_name: [cluster_id, ...], ...}
Only approved topics need appear here.
Returns
-------
dict with keys:
run_key, theme_count, themes_path, themes_preview (list of dicts)
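    Example
    -------
    Illustrative invocation (theme names and cluster IDs are hypothetical):
    >>> consolidate_into_themes.invoke({  # doctest: +SKIP
    ...     "run_key": "abstract",
    ...     "theme_map": {"AI Adoption": [0, 4], "Digital Health": [2]},
    ... })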
"""
rdir = _run_dir(run_key)
labels_data = _load_json(rdir / "labels.json")
    embeddings = np.load(str(rdir / "emb.npy")) # (N, D) L2-normalised
sent_labels = np.load(str(rdir / "sent_labels.npy")) # (N,) — FIX BUG 1
# Index label dicts by cluster_id for O(1) lookup
label_idx = {item["cluster_id"]: item for item in labels_data}
def _build_theme(theme_name: str, cids: list[int]) -> dict:
"""
Build one consolidated theme from a list of cluster IDs.
Evidence : top-N sentences pooled across all merged clusters
Centroid : L2-normalised mean of all embeddings in the merged set
Size : total sentence count across merged clusters
"""
member_labels = list(map(label_idx.get, cids))
# Pool evidence sentences from all member clusters
all_evidence = reduce(
lambda acc, lbl: acc + lbl["evidence"],
filter(None, member_labels),
[],
)
# Total sentence count across merged clusters
total_size = reduce(
lambda acc, lbl: acc + lbl.get("size", 0),
filter(None, member_labels),
0,
)
# FIX BUG 1 — build correct cluster mask using persisted sent_labels
cluster_mask = np.isin(sent_labels, np.array(cids, dtype=np.int32))
        theme_embeddings = embeddings[cluster_mask] # (M, D)
# Guard: if mask is somehow empty fall back to zero vector
theme_centroid = (
_centroid(theme_embeddings)
if theme_embeddings.shape[0] > 0
else np.zeros(embeddings.shape[1], dtype=np.float32)
)
return {
"theme_name": theme_name,
"cluster_ids": cids,
"size": total_size,
"evidence": all_evidence[:N_EVIDENCE],
"centroid": theme_centroid.tolist(),
"sub_labels": list(map(
itemgetter("label"),
filter(None, member_labels),
)),
}
themes = list(map(
lambda kv: _build_theme(kv[0], kv[1]),
theme_map.items(),
))
_save_json(rdir / "themes.json", themes)
preview = list(map(
lambda t: {
"theme_name": t.get("theme_name"),
"size": t.get("size", 0),
"cluster_count": len(t.get("cluster_ids", [])),
},
themes[:MAX_TOOL_RETURN_PREVIEW],
))
return {
"run_key": run_key,
"theme_count": len(themes),
"themes_path": str(rdir / "themes.json"),
"themes_preview": preview,
}
# ============================================================================
# TOOL 5 — compare_with_taxonomy
# ============================================================================
_TAXONOMY_PROMPT = PromptTemplate.from_template(
"""You are an IS research taxonomist. Map the following research theme to the
PAJAIS taxonomy. Return RAW JSON with EXACTLY these keys:
theme_name : the input theme name (unchanged)
pajais_match : best matching PAJAIS category OR the string "NOVEL"
confidence : float 0.0-1.0
reasoning : one sentence
is_novel : boolean
PAJAIS categories:
{taxonomy}
Theme to map:
Name : {theme_name}
Evidence : {evidence}
Respond with RAW JSON only. No markdown.
"""
)
@tool
def compare_with_taxonomy(run_key: str) -> dict:
"""
Map consolidated themes to PAJAIS taxonomy via Mistral.
Parameters
----------
run_key : str — "abstract" or "title" or "keywords"
Returns
-------
dict with keys:
run_key, taxonomy_path, mapped_count, novel_count, mapping_preview
"""
rdir = _run_dir(run_key)
themes = _load_json(rdir / "themes.json")
chain = _TAXONOMY_PROMPT | _llm() | JsonOutputParser()
taxonomy_str = "\n".join(f" - {cat}" for cat in PAJAIS_TAXONOMY)
def _map_theme(theme: dict) -> dict:
result = _invoke_with_retries(lambda: chain.invoke({
"taxonomy": taxonomy_str,
"theme_name": theme["theme_name"],
"evidence": " | ".join(theme.get("evidence", [])[:3]),
}))
return {**theme, **result}
taxonomy_map = list(map(_map_theme, themes))
_save_json(rdir / "taxonomy_map.json", taxonomy_map)
novel_count = sum(1 for t in taxonomy_map if t.get("is_novel", False))
mapped_count = len(taxonomy_map) - novel_count
preview = list(map(
lambda t: {
"theme_name": t.get("theme_name"),
"pajais_match": t.get("pajais_match", "NOVEL"),
"confidence": t.get("confidence", 0),
"is_novel": t.get("is_novel", False),
},
taxonomy_map[:MAX_TOOL_RETURN_PREVIEW],
))
return {
"run_key": run_key,
"taxonomy_path": str(rdir / "taxonomy_map.json"),
"mapped_count": mapped_count,
"novel_count": novel_count,
"mapping_preview": preview,
}
@tool
def verify_taxonomy_mapping_with_groq(run_key: str) -> dict:
"""
Run Groq validation for PAJAIS taxonomy mappings and persist side-by-side
Mistral/Groq mapping fields for each theme.
Parameters
----------
run_key : str — "abstract" or "title" or "keywords"
Returns
-------
dict with keys:
run_key, taxonomy_path, verification_path,
verified_count, mapping_preview
"""
if not _groq_ollama_enabled():
return {
"run_key": run_key,
"taxonomy_path": str(_run_dir(run_key) / "taxonomy_map.json"),
"verified_count": 0,
"mapping_preview": [],
"error": (
"GROQ_API_KEY is missing or langchain-groq is unavailable. "
"Set GROQ_API_KEY and install requirements to use VERIFY."
),
}
rdir = _run_dir(run_key)
themes_path = rdir / "themes.json"
taxonomy_path = rdir / "taxonomy_map.json"
if not themes_path.exists():
return {
"run_key": run_key,
"taxonomy_path": str(taxonomy_path),
"verified_count": 0,
"mapping_preview": [],
"error": (
f"Missing themes artifact: {themes_path}. "
"Run consolidate_into_themes first."
),
}
if not taxonomy_path.exists():
return {
"run_key": run_key,
"taxonomy_path": str(taxonomy_path),
"verified_count": 0,
"mapping_preview": [],
"error": (
f"Missing taxonomy artifact: {taxonomy_path}. "
"Run compare_with_taxonomy first."
),
}
themes = _load_json(themes_path)
taxonomy_map = _load_json(taxonomy_path)
taxonomy_str = "\n".join(f" - {cat}" for cat in PAJAIS_TAXONOMY)
chain_groq = _TAXONOMY_PROMPT | _llm_groq(GROQ_OLLAMA_MODEL_NAME) | JsonOutputParser()
def _map_theme_with_groq(theme: dict) -> dict:
return _invoke_with_retries(lambda: chain_groq.invoke({
"taxonomy": taxonomy_str,
"theme_name": theme["theme_name"],
"evidence": " | ".join(theme.get("evidence", [])[:3]),
}))
groq_maps = list(map(_map_theme_with_groq, themes))
groq_by_theme = {
str(item.get("theme_name", "")).strip(): item
for item in groq_maps
}
def _merge_mappings(mistral_row: dict) -> dict:
theme_name = str(mistral_row.get("theme_name", "")).strip()
groq_row = groq_by_theme.get(theme_name, {})
groq_match = str(groq_row.get("pajais_match", "")).strip()
mistral_match = str(mistral_row.get("pajais_match", "")).strip()
is_same = bool(groq_match) and (groq_match.lower() == mistral_match.lower())
return {
**mistral_row,
"mistral_pajais_match": mistral_match,
"mistral_confidence": _to_float(
mistral_row.get("mistral_confidence", mistral_row.get("confidence", 0.0)),
0.0,
),
"mistral_reasoning": str(
mistral_row.get("mistral_reasoning", mistral_row.get("reasoning", ""))
),
"mistral_is_novel": bool(
mistral_row.get("mistral_is_novel", mistral_row.get("is_novel", False))
),
"groq_pajais_match": groq_match,
"groq_confidence": _to_float(groq_row.get("confidence"), 0.0),
"groq_reasoning": str(groq_row.get("reasoning", "")),
"groq_is_novel": bool(groq_row.get("is_novel", False)),
"taxonomy_verification_done": bool(groq_row),
"taxonomy_verification_note": (
"Mistral and Groq taxonomy mapping match."
if is_same
else "Mistral and Groq taxonomy mapping differ."
) if groq_row else "Groq taxonomy mapping unavailable for this theme.",
}
merged_rows = list(map(_merge_mappings, taxonomy_map))
verification_path = rdir / "taxonomy_verification.json"
_save_json(taxonomy_path, merged_rows)
_save_json(verification_path, merged_rows)
preview = list(map(
lambda row: {
"theme_name": row.get("theme_name", ""),
"mistral_pajais_match": row.get("mistral_pajais_match", row.get("pajais_match", "")),
"groq_pajais_match": row.get("groq_pajais_match", ""),
"taxonomy_verification_note": row.get("taxonomy_verification_note", ""),
},
merged_rows[:MAX_TOOL_RETURN_PREVIEW],
))
verified_count = sum(1 for row in merged_rows if row.get("groq_pajais_match"))
return {
"run_key": run_key,
"taxonomy_path": str(taxonomy_path),
"verification_path": str(verification_path),
"verified_count": int(verified_count),
"mapped_count": int(len(merged_rows)),
"mapping_preview": preview,
}
# ============================================================================
# TOOL 6 — generate_comparison_csv
# ============================================================================
@tool
def generate_comparison_csv() -> dict:
"""
Side-by-side comparison of abstract/title/keywords theme mappings.
Each run is optional. Missing runs produce empty columns.
Returns
-------
dict with keys:
csv_path, row_count, columns, preview (list of dicts)
"""
abstract_path = OUTPUT_DIR / "abstract" / "taxonomy_map.json"
title_path = OUTPUT_DIR / "title" / "taxonomy_map.json"
keywords_path = OUTPUT_DIR / "keywords" / "taxonomy_map.json"
abstract_map = _load_json(abstract_path) if abstract_path.exists() else []
title_map = _load_json(title_path) if title_path.exists() else []
keywords_map = _load_json(keywords_path) if keywords_path.exists() else []
if not (abstract_map or title_map or keywords_map):
return {
"csv_path": str(OUTPUT_DIR / "comparison.csv"),
"row_count": 0,
"columns": [],
"preview": [],
"error": (
"No taxonomy_map.json files found for abstract/title/keywords. "
"Run compare_with_taxonomy for at least one run first."
),
}
def _row(a_theme: dict | None, t_theme: dict | None, k_theme: dict | None) -> dict:
return {
"Abstract Theme": a_theme.get("theme_name", "") if a_theme else "",
"Abstract PAJAIS": a_theme.get("pajais_match", "") if a_theme else "",
"Abstract Confidence": a_theme.get("confidence", 0) if a_theme else 0,
"Abstract Novel": a_theme.get("is_novel", False) if a_theme else False,
"Title Theme": t_theme.get("theme_name", "") if t_theme else "",
"Title PAJAIS": t_theme.get("pajais_match", "") if t_theme else "",
"Title Confidence": t_theme.get("confidence", 0) if t_theme else 0,
"Title Novel": t_theme.get("is_novel", False) if t_theme else False,
"Keywords Theme": k_theme.get("theme_name", "") if k_theme else "",
"Keywords PAJAIS": k_theme.get("pajais_match", "") if k_theme else "",
"Keywords Confidence": k_theme.get("confidence", 0) if k_theme else 0,
"Keywords Novel": k_theme.get("is_novel", False) if k_theme else False,
}
max_len = max(len(abstract_map), len(title_map), len(keywords_map), 1)
padded_a = abstract_map + [{}] * (max_len - len(abstract_map))
padded_t = title_map + [{}] * (max_len - len(title_map))
padded_k = keywords_map + [{}] * (max_len - len(keywords_map))
rows = list(map(_row, padded_a, padded_t, padded_k))
df = pd.DataFrame(rows)
out_path = OUTPUT_DIR / "comparison.csv"
df.to_csv(out_path, index=False)
return {
"csv_path": str(out_path),
"row_count": len(df),
"columns": df.columns.tolist(),
"preview": df.head(5).to_dict(orient="records"),
}
# ============================================================================
# TOOL 7 — export_narrative
# ============================================================================
_NARRATIVE_PROMPT = PromptTemplate.from_template(
"""You are an academic researcher writing a methodology and findings section.
Write a 500-word academic narrative describing the thematic analysis results below.
Structure: (1) methodology overview, (2) major themes found across runs,
(3) PAJAIS alignment, (4) novel contributions, (5) limitations.
Use formal academic English. Do NOT use bullet points.
Abstract themes & taxonomy:
{abstract_themes}
Title themes & taxonomy:
{title_themes}
Keywords themes & taxonomy:
{keywords_themes}
Respond with plain text only.
"""
)
@tool
def export_narrative(run_key: str) -> dict:
"""
Generate a 500-word academic narrative and save to narrative.txt.
Parameters
----------
run_key : str — "abstract" or "title" or "keywords" (primary source)
Returns
-------
dict with keys:
narrative_path, word_count, preview (first 300 chars)
"""
rdir = _run_dir(run_key)
abstract_path = OUTPUT_DIR / "abstract" / "taxonomy_map.json"
title_path = OUTPUT_DIR / "title" / "taxonomy_map.json"
keywords_path = OUTPUT_DIR / "keywords" / "taxonomy_map.json"
abstract_map = _load_json(abstract_path) if abstract_path.exists() else []
title_map = _load_json(title_path) if title_path.exists() else []
keywords_map = _load_json(keywords_path) if keywords_path.exists() else []
if not (abstract_map or title_map or keywords_map):
return {
"narrative_path": str(rdir / "narrative.txt"),
"word_count": 0,
"preview": "",
"error": (
"No taxonomy mappings found for abstract/title/keywords. "
"Run compare_with_taxonomy before export_narrative."
),
}
def _theme_summary(t: dict) -> str:
return (
f" - {t.get('theme_name','?')} -> {t.get('pajais_match','?')} "
f"(conf={t.get('confidence',0):.2f}, novel={t.get('is_novel',False)})"
)
abstract_str = "\n".join(map(_theme_summary, abstract_map)) or "Not run."
title_str = "\n".join(map(_theme_summary, title_map)) or "Not run."
keywords_str = "\n".join(map(_theme_summary, keywords_map)) or "Not run."
chain = _NARRATIVE_PROMPT | _llm()
response = _invoke_with_retries(lambda: chain.invoke({
"abstract_themes": abstract_str,
"title_themes": title_str,
"keywords_themes": keywords_str,
}))
narrative = response.content if hasattr(response, "content") else str(response)
out_path = rdir / "narrative.txt"
out_path.write_text(narrative, encoding="utf-8")
return {
"narrative_path": str(out_path),
"word_count": len(narrative.split()),
"preview": narrative[:300],
}
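# Example (illustrative) — invoking the tool directly, assuming MISTRAL_API_KEY
# is set and compare_with_taxonomy has produced at least one taxonomy_map.json:
#   result = export_narrative.invoke({"run_key": "abstract"})
#   print(result["word_count"], result["narrative_path"])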
# ============================================================================
# METHOD EXTRACTION — Per-Paper Computational Method Identification
# ============================================================================
def _extract_text_from_pdf(pdf_path: str) -> str:
"""Extract all text from a PDF using PyMuPDF (text only, no images)."""
doc = fitz.open(pdf_path)  # fitz is imported at module level; no re-import needed
pages = [page.get_text("text") for page in doc]
doc.close()
return "\n".join(pages)
def _extract_title_from_pdf(full_text: str) -> str:
"""Try to extract the paper title from the first few lines of text."""
lines = full_text.strip().split("\n")
title_lines = []
for line in lines[:10]:
stripped = line.strip()
if not stripped:
if title_lines:
break
continue
low = stripped.lower()
if low.startswith("abstract") or low.startswith("keyword"):
break
if len(stripped) > 10:
title_lines.append(stripped)
if len(title_lines) >= 2:
break
return " ".join(title_lines)[:200] if title_lines else ""
def _chunk_text(text: str, chunk_size: int = 12000, overlap: int = 1000) -> list[str]:
"""Split text into chunks of `chunk_size` characters with `overlap`."""
if not text:
return []
overlap = min(overlap, chunk_size - 1)  # clamp: overlap >= chunk_size would never advance
chunks = []
start = 0
text_len = len(text)
while start < text_len:
end = start + chunk_size
chunks.append(text[start:end])
if end >= text_len:
break
start = end - overlap
return chunks
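# Example (illustrative): windows advance by chunk_size - overlap characters,
# so the tail of each chunk is repeated at the head of the next:
#   _chunk_text("abcdefgh", chunk_size=4, overlap=1)
#   -> ["abcd", "defg", "gh"]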
# LLM prompt — extracts computational methods from one chunk of a paper's text
_EXTRACT_METHODS_PROMPT = PromptTemplate.from_template(
"""You are an expert IS research methodologist. Read this excerpt from a research
paper and identify ALL computational techniques used.
The excerpt may come from methods or results. Use:
- explicit method statements ("this study uses", "we employed")
- analytical technique mentions in results (beta coefficients, BERT scores, LDA topics, network centrality)
- sample/data descriptions (N=, dataset, corpus)
- implicit method cues from results presentation (e.g., beta tables imply regression)
Do not guess beyond evidence in the excerpt.
A "computational method" or "analytical technique" refers to specific algorithms,
statistical tests, machine learning models, NLP techniques, network measures,
or simulation/optimization approaches.
Paper: {paper_name}
Excerpt text:
{method_text}
Return a JSON object with EXACTLY this key:
computational_methods : list of specific algorithms, models, or computational techniques found.
Be very specific. DO NOT just say "Machine Learning", name the algorithm.
Examples: ["Random Forest", "BERT", "K-means clustering", "LSTM", "XGBoost",
"LDA topic modeling", "PLS-SEM", "CB-SEM", "OLS Regression", "ANOVA",
"Network centrality", "Louvain community detection", "Sentiment Analysis (VADER)"]
Return an empty list [] if absolutely no specific computational
techniques or statistical models are mentioned.
Respond with RAW JSON only. No markdown, no explanation.
"""
)
@tool
def extract_methods_from_pdfs(pdf_dir: str) -> dict:
"""
Extract computational methods from each PDF paper.
For each PDF: extract text (no images) → split into overlapping chunks →
send each chunk to Mistral LLM → aggregate identified methods per paper.
Parameters
----------
pdf_dir : str — directory containing PDF files
Returns
-------
dict with keys:
n_papers, results (list of per-paper method dicts), csv_path
"""
pdf_dir_path = Path(pdf_dir)
if not pdf_dir_path.exists():
return {"error": f"PDF directory not found: {pdf_dir}"}
pdf_files = sorted(
{str(p) for p in pdf_dir_path.glob("*.pdf")}
| {str(p) for p in pdf_dir_path.glob("*.PDF")}  # set union: both globs can match the same file on case-insensitive filesystems
)
if not pdf_files:
return {"error": f"No PDF files found in {pdf_dir}"}
rdir = _ensure_dir(OUTPUT_DIR / "methods")
# Step 1: Extract full text from all PDFs and chunk them
paper_chunks = []
for idx, pdf_path in enumerate(pdf_files, start=1):
try:
full_text = _extract_text_from_pdf(pdf_path)
# prefer a title parsed from the text itself; fall back to the filename stem
title = _extract_title_from_pdf(full_text) or Path(pdf_path).stem
chunks = _chunk_text(full_text)
paper_chunks.append({
"paper_id": idx,
"paper_filename": Path(pdf_path).stem,
"paper_title": title,
"chunks": chunks,
})
except Exception as exc:
paper_chunks.append({
"paper_id": idx,
"paper_filename": Path(pdf_path).stem,
"paper_title": Path(pdf_path).stem,
"chunks": [],
"error": str(exc),
})
# Step 2: For each paper, use LLM on all chunks and aggregate
if not MISTRAL_API_KEY:
return {
"n_papers": len(pdf_files),
"results": paper_chunks,
"error": "MISTRAL_API_KEY not set — extracted text chunks but cannot identify methods via LLM.",
}
chain = _EXTRACT_METHODS_PROMPT | _llm() | JsonOutputParser()
paper_results = []
for entry in paper_chunks:
chunks = entry.get("chunks", [])
if not chunks:
paper_results.append({
"paper_id": entry["paper_id"],
"paper_filename": entry["paper_filename"],
"paper_title": entry.get("paper_title", ""),
"computational_methods": [],
"extraction_note": "No text extracted",
})
continue
all_comp_methods = set()
# Process each chunk
for chunk in chunks:
if len(chunk) < 50:
continue
try:
result = _invoke_with_retries(lambda c=chunk: chain.invoke({
"paper_name": entry.get("paper_title", entry.get("paper_filename", "")),
"method_text": c,
}))
# Collect computational methods
cm = result.get("computational_methods", [])
if isinstance(cm, list):
for item in cm:
if isinstance(item, str) and item.strip():
all_comp_methods.add(item.strip())
elif isinstance(cm, str) and cm.strip():
all_comp_methods.add(cm.strip())
except Exception:
pass  # skip failed chunks; methods from the remaining chunks still aggregate
paper_results.append({
"paper_id": entry["paper_id"],
"paper_filename": entry["paper_filename"],
"paper_title": entry.get("paper_title", ""),
"computational_methods": sorted(list(all_comp_methods)),
"chunks_processed": len(chunks)
})
# Save results
_save_json(rdir / "method_results.json", paper_results)
# Build CSV
rows = []
for r in paper_results:
comp_methods = r.get("computational_methods", [])
if isinstance(comp_methods, list):
comp_str = ", ".join(comp_methods)
else:
comp_str = str(comp_methods)
rows.append({
"Paper ID": r.get("paper_id", ""),
"Paper Title": r.get("paper_title", r.get("paper_filename", "")),
"Computational Methods": comp_str,
})
df = pd.DataFrame(rows)
csv_path = rdir / "method_summary.csv"
df.to_csv(csv_path, index=False)
def _clean_technique_name(name: str) -> str:
return re.sub(r"\s+", " ", name.strip())
def _normalize_technique_key(name: str) -> str:
cleaned = re.sub(r"[^a-z0-9+ ]", " ", name.lower())
cleaned = re.sub(r"\s+", " ", cleaned).strip()
cleaned = cleaned.replace("forests", "forest")
cleaned = cleaned.replace("trees", "tree")
cleaned = cleaned.replace("networks", "network")
cleaned = cleaned.replace("models", "model")
cleaned = cleaned.replace("transformers", "transformer")
cleaned = cleaned.replace("embeddings", "embedding")
cleaned = cleaned.replace("topics", "topic")
cleaned = cleaned.replace("measures", "measure")
return cleaned
canonical_patterns = [
(re.compile(r"\bbert\b"), "BERT"),
(re.compile(r"\broberta\b"), "RoBERTa"),
(re.compile(r"\bxlm[- ]?roberta\b"), "XLM-RoBERTa"),
(re.compile(r"\bgpt[- ]?[0-9]*\b"), "GPT"),
(re.compile(r"\bt5\b"), "T5"),
(re.compile(r"\bword2vec\b"), "Word2Vec"),
(re.compile(r"\bglove\b"), "GloVe"),
(re.compile(r"\bdoc2vec\b"), "Doc2Vec"),
(re.compile(r"\bfasttext\b"), "fastText"),
(re.compile(r"\bspecter\b"), "SPECTER"),
(re.compile(r"\bsentence[- ]?transformer"), "Sentence-Transformers"),
(re.compile(r"\btf[- ]?idf\b"), "TF-IDF"),
(re.compile(r"\bbm25\b"), "BM25"),
(re.compile(r"\bbag of words\b|\bbow\b"), "Bag-of-words"),
(re.compile(r"\blda\b|\blatent dirichlet allocation\b"), "LDA topic modeling"),
(re.compile(r"\bnmf\b|\bnon[- ]?negative matrix factorization\b"), "NMF topic modeling"),
(re.compile(r"\blsa\b|\blsi\b|\blatent semantic analysis\b"), "LSA"),
(re.compile(r"\bbertopic\b"), "BERTopic"),
(re.compile(r"\bk[- ]?means\b"), "K-means clustering"),
(re.compile(r"\bhierarchical clustering\b"), "Hierarchical clustering"),
(re.compile(r"\bdbscan\b"), "DBSCAN"),
(re.compile(r"\bhdbscan\b"), "HDBSCAN"),
(re.compile(r"\bgmm\b|\bgaussian mixture\b"), "Gaussian mixture model"),
(re.compile(r"\bpca\b|\bprincipal component analysis\b"), "PCA"),
(re.compile(r"\bsvd\b|\bsingular value decomposition\b"), "SVD"),
(re.compile(r"\btsne\b|\bt-sne\b"), "t-SNE"),
(re.compile(r"\bumap\b"), "UMAP"),
(re.compile(r"\bner\b|\bnamed entity recognition\b"), "Named entity recognition"),
(re.compile(r"\bsentiment\b"), "Sentiment analysis"),
(re.compile(r"\brandom forest\b"), "Random Forest"),
(re.compile(r"\bdecision tree\b"), "Decision Tree"),
(re.compile(r"\bgradient boosting\b|\bxgboost\b|\blightgbm\b|\bcatboost\b"), "Gradient boosting"),
(re.compile(r"\bsvm\b|\bsupport vector machine\b"), "SVM"),
(re.compile(r"\bknn\b|\bk[- ]?nearest neighbor\b"), "KNN"),
(re.compile(r"\bnaive bayes\b"), "Naive Bayes"),
(re.compile(r"\bneural network\b|\bdeep learning\b|\bmlp\b"), "Neural networks"),
(re.compile(r"\bcnn\b|\bconvolutional neural network\b"), "CNN"),
(re.compile(r"\brnn\b|\brecurrent neural network\b"), "RNN"),
(re.compile(r"\blstm\b"), "LSTM"),
(re.compile(r"\bgru\b"), "GRU"),
(re.compile(r"\bautoencoder\b"), "Autoencoder"),
(re.compile(r"\btransformer\b"), "Transformers"),
(re.compile(r"\bfine[- ]?tuning\b"), "Model fine-tuning"),
(re.compile(r"\bpls[- ]?sem\b|\bpartial least squares\b"), "PLS-SEM"),
(re.compile(r"\bcb[- ]?sem\b|\bcovariance[- ]?based sem\b"), "CB-SEM"),
(re.compile(r"\bsem\b|\bstructural equation model\b"), "SEM"),
(re.compile(r"\bglmm\b|\bgeneralized linear mixed model\b"), "GLMM"),
(re.compile(r"\birt\b|\bitem response theory\b"), "IRT"),
(re.compile(r"\bbayesian\b"), "Bayesian inference"),
(re.compile(r"\bmediation\b"), "Mediation analysis"),
(re.compile(r"\bmoderation\b"), "Moderation analysis"),
(re.compile(r"\bchi[- ]?square\b|\bchi square\b"), "Chi-square test"),
(re.compile(r"\banova\b"), "ANOVA"),
(re.compile(r"\bmanova\b"), "MANOVA"),
(re.compile(r"\bancova\b"), "ANCOVA"),
(re.compile(r"\bmancova\b"), "MANCOVA"),
(re.compile(r"\bt[- ]?test\b"), "t-test"),
(re.compile(r"\bwilcoxon\b"), "Wilcoxon test"),
(re.compile(r"\bkruskal[- ]?wallis\b"), "Kruskal-Wallis test"),
(re.compile(r"\bfactor analysis\b"), "Factor analysis"),
(re.compile(r"\btime[- ]?series\b"), "Time-series analysis"),
(re.compile(r"\barima\b"), "ARIMA"),
(re.compile(r"\bsarima\b"), "SARIMA"),
(re.compile(r"\bvar\b|\bvector autoregression\b"), "VAR"),
(re.compile(r"\bprophet\b"), "Prophet"),
(re.compile(r"\bpanel regression\b|\bpanel data\b"), "Panel regression"),
(re.compile(r"\bfixed effects\b"), "Fixed-effects regression"),
(re.compile(r"\brandom effects\b"), "Random-effects regression"),
(re.compile(r"\bmultilevel\b|\bhierarchical linear model\b|\bhlm\b|\bmixed effects\b"), "Multilevel / mixed-effects regression"),
(re.compile(r"\bglm\b|\bgeneralized linear model\b"), "Generalized linear model"),
(re.compile(r"\bgls\b|\bgeneralized least squares\b"), "Generalized least squares"),
(re.compile(r"\bgee\b|\bgeneralized estimating equation\b"), "GEE"),
(re.compile(r"\bgmm\b|\bgeneralized method of moments\b"), "GMM"),
(re.compile(r"\b2sls\b|\btwo[- ]?stage least squares\b"), "2SLS"),
(re.compile(r"\b3sls\b|\bthree[- ]?stage least squares\b"), "3SLS"),
(re.compile(r"\binstrumental variable\b|\biv\b"), "Instrumental variables"),
(re.compile(r"\btobit\b"), "Tobit regression"),
(re.compile(r"\bheckman\b"), "Heckman selection model"),
(re.compile(r"\bpoisson\b"), "Poisson regression"),
(re.compile(r"\bnegative binomial\b"), "Negative binomial regression"),
(re.compile(r"\bprobit\b"), "Probit regression"),
(re.compile(r"\bsurvival analysis\b|\bcox\b|\bhazard model\b|\bkaplan[- ]?meier\b"), "Survival analysis"),
(re.compile(r"\blatent class analysis\b|\blca\b"), "Latent class analysis"),
(re.compile(r"\blatent profile analysis\b|\blpa\b"), "Latent profile analysis"),
(re.compile(r"\blogistic regression\b"), "Logistic regression"),
(re.compile(r"\bols\b|\borderinary least squares\b|\blinear regression\b|\bmultiple regression\b"), "Linear regression (OLS)"),
(re.compile(r"\bridge regression\b|\bridge\b"), "Ridge regression"),
(re.compile(r"\blasso\b"), "LASSO regression"),
(re.compile(r"\belastic net\b"), "Elastic Net regression"),
(re.compile(r"\bregression\b"), "Regression"),
(re.compile(r"\bcentrality\b"), "Network centrality"),
(re.compile(r"\bcommunity detection\b|\blouvain\b|\bleiden\b"), "Community detection"),
(re.compile(r"\bergm\b|\bexponential random graph\b"), "ERGM"),
(re.compile(r"\blink prediction\b"), "Link prediction"),
(re.compile(r"\bpagerank\b|\bpage rank\b"), "PageRank"),
(re.compile(r"\bgraph neural network\b|\bgnn\b"), "Graph neural networks"),
(re.compile(r"\bhidden markov\b|\bhmm\b"), "Hidden Markov Model"),
(re.compile(r"\bmarkov chain\b|\bmarkov model\b"), "Markov models"),
(re.compile(r"\bkalman filter\b"), "Kalman filter"),
(re.compile(r"\bstate[- ]?space\b"), "State-space models"),
(re.compile(r"\bhawkes\b"), "Hawkes process"),
(re.compile(r"\brecommender\b|\bcollaborative filtering\b|\bmatrix factorization\b"), "Recommender systems"),
(re.compile(r"\bahp\b|\banalytic hierarchy process\b"), "AHP"),
(re.compile(r"\btopsis\b"), "TOPSIS"),
(re.compile(r"\bvikor\b"), "VIKOR"),
(re.compile(r"\bpromethee\b"), "PROMETHEE"),
(re.compile(r"\bdematel\b"), "DEMATEL"),
(re.compile(r"\bdea\b|\bdata envelopment analysis\b"), "DEA"),
(re.compile(r"\bsfa\b|\bstochastic frontier\b"), "SFA"),
(re.compile(r"\bagent[- ]?based\b"), "Agent-based simulation"),
(re.compile(r"\bmonte carlo\b"), "Monte Carlo simulation"),
(re.compile(r"\bbayesian optimization\b"), "Bayesian optimization"),
(re.compile(r"\blinear programming\b|\binteger programming\b|\bmixed integer\b"), "Mathematical optimization"),
(re.compile(r"\bgenetic algorithm\b"), "Genetic algorithms"),
(re.compile(r"\bsimulated annealing\b"), "Simulated annealing"),
]
def _canonicalize_technique(name: str) -> tuple[str, str]:
cleaned = _normalize_technique_key(name)
for pattern, canonical in canonical_patterns:
if pattern.search(cleaned):
return canonical, canonical.lower()
display = " ".join(word.capitalize() for word in cleaned.split())
display = display or _clean_technique_name(name)
return display, display.lower()
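# Example (illustrative): surface variants collapse onto one canonical label
# via _normalize_technique_key plus first-match patterns:
#   _canonicalize_technique("Random Forests")
#   -> ("Random Forest", "random forest")
#   _canonicalize_technique("latent dirichlet allocation")
#   -> ("LDA topic modeling", "lda topic modeling")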
category_patterns = [
(re.compile(r"\b(bert|roberta|xlm roberta|gpt|t5|transformer|fine[- ]?tuning)\b"), "Transformers"),
(re.compile(r"\b(word2vec|glove|doc2vec|fasttext|specter|sentence[- ]?transformer|embedding|tf[- ]?idf|bm25|bag of words|bow)\b"), "Embeddings / Representation"),
(re.compile(r"\b(topic modeling|lda|nmf|bertopic|lsa|lsi)\b"), "Topic Modeling"),
(re.compile(r"\b(k[- ]?means|hierarchical clustering|dbscan|hdbscan|gaussian mixture|gmm|clustering)\b"), "Clustering"),
(re.compile(r"\b(pca|svd|t-sne|tsne|umap|dimensionality reduction)\b"), "Dimensionality Reduction"),
(re.compile(r"\b(arima|sarima|var|prophet|time[- ]?series)\b"), "Time Series / Forecasting"),
(re.compile(r"\b(panel data|panel regression|fixed effects|random effects|multilevel|hierarchical linear model|hlm|mixed effects|glm|gls|gee|gmm|2sls|3sls|instrumental variable|tobit|heckman|poisson|negative binomial|probit|logit)\b"), "Econometric / Panel Models"),
(re.compile(r"\b(ols|linear regression|logistic regression|ridge|lasso|elastic net|regression)\b"), "Regression"),
(re.compile(r"\b(sem|pls[- ]?sem|cb[- ]?sem|structural equation|cfa|efa)\b"), "SEM"),
(re.compile(r"\b(latent class analysis|latent profile analysis|latent variable|mixture model)\b"), "Latent Variable Models"),
(re.compile(r"\b(grad(ient)? boosting|xgboost|lightgbm|catboost)\b"), "Boosting / Ensembles"),
(re.compile(r"\b(random forest|decision tree|svm|knn|naive bayes)\b"), "Classic ML"),
(re.compile(r"\b(neural network|deep learning|lstm|cnn|rnn|gru|mlp|autoencoder)\b"), "Deep Learning"),
(re.compile(r"\b(ner|named entity recognition|sentiment|nlp|text mining|tokenization|stemming|lemmatization|keyword extraction)\b"), "NLP / Text Mining"),
(re.compile(r"\b(network|centrality|community detection|louvain|leiden|ergm|link prediction|pagerank|graph neural network|gnn)\b"), "Network Analysis"),
(re.compile(r"\b(agent[- ]?based|monte carlo|bayesian optimization|linear programming|integer programming|genetic algorithm|simulated annealing)\b"), "Simulation / Optimization"),
(re.compile(r"\b(survival|cox|hazard|kaplan[- ]?meier)\b"), "Survival / Event History"),
(re.compile(r"\b(bayesian|mcmc|gibbs|variational)\b"), "Bayesian Methods"),
(re.compile(r"\b(anova|manova|ancova|mancova|t[- ]?test|chi[- ]?square|factor analysis|glmm|irt|mediation|moderation|wilcoxon|kruskal[- ]?wallis)\b"), "Statistical Tests / Models"),
(re.compile(r"\b(difference[- ]?in[- ]?differences|did|regression discontinuity|rdd|instrumental variable|iv|propensity score|matching)\b"), "Causal Inference"),
(re.compile(r"\b(recommender|collaborative filtering|matrix factorization)\b"), "Recommender Systems"),
(re.compile(r"\b(hidden markov|hmm|markov|kalman|state[- ]?space|hawkes)\b"), "Sequence / Stochastic Processes"),
(re.compile(r"\b(ahp|analytic hierarchy process|topsis|vikor|promethee|dematel)\b"), "Decision Analysis / MCDA"),
(re.compile(r"\b(dea|data envelopment analysis|stochastic frontier|sfa|frontier analysis)\b"), "Efficiency / Frontier Analysis"),
]
def _categorize_technique(*names: str) -> str:
for name in names:
if not name:
continue
key = _normalize_technique_key(name)
for pattern, category in category_patterns:
if pattern.search(key):
return category
fallback_keywords = [
("Classic ML", ["classifier", "classification", "predictive model", "prediction", "supervised"]),
("Clustering", ["cluster", "clustering"]),
("Topic Modeling", ["topic", "semantic"]),
("Embeddings / Representation", ["embedding", "vector", "tf idf", "bow", "bag of words"]),
("Regression", ["regression", "logit", "probit", "panel", "fixed effects", "random effects", "glm", "gls", "gee", "gmm"]),
("SEM", ["sem", "structural equation", "factor", "latent"]),
("Bayesian Methods", ["bayesian", "mcmc", "gibbs", "prior", "posterior"]),
("Time Series / Forecasting", ["time series", "forecast", "arima", "sarima", "var", "prophet"]),
("NLP / Text Mining", ["nlp", "text", "token", "lemma", "stem", "language"]),
("Network Analysis", ["network", "graph", "node", "edge"]),
("Simulation / Optimization", ["simulation", "optimi", "heuristic", "metaheuristic", "monte carlo", "agent-based"]),
]
for category, keywords in fallback_keywords:
if any(k in key for k in keywords):
return category
if any(token in key for token in ["model", "analysis", "estimation", "test"]):
return "Statistical Tests / Models"
return "Other"
category_map: dict[str, dict[str, object]] = {}
for r in paper_results:
paper_title = r.get("paper_title") or r.get("paper_filename") or ""
paper_id = r.get("paper_id", "")
paper_label = str(paper_title or paper_id)
methods = r.get("computational_methods", [])
if isinstance(methods, list):
techniques = {m.strip() for m in methods if isinstance(m, str) and m.strip()}
elif isinstance(methods, str) and methods.strip():
techniques = {m.strip() for m in re.split(r"[;,]", methods) if m.strip()}
else:
techniques = set()
for technique in techniques:
algorithm, _ = _canonicalize_technique(technique)
if not algorithm:
continue
category = _categorize_technique(technique, algorithm)
key = category.lower()
if key not in category_map:
category_map[key] = {
"name": category,
"algorithms": set(),
"papers": set(),
}
category_map[key]["algorithms"].add(algorithm)
category_map[key]["papers"].add(paper_label)
technique_rows = [
{
"Main Computational Technique": entry["name"],
"Algorithms": ", ".join(sorted(entry["algorithms"])),
"Papers": " | ".join(sorted(entry["papers"])),
}
for entry in sorted(category_map.values(), key=lambda v: str(v["name"]).lower())
]
technique_df = pd.DataFrame(
technique_rows,
columns=["Main Computational Technique", "Algorithms", "Papers"],
)
technique_csv_path = rdir / "technique_to_papers.csv"
technique_df.to_csv(technique_csv_path, index=False)
return {
"n_papers": len(pdf_files),
"n_extracted": len(paper_results),
"csv_path": str(csv_path),
"technique_csv_path": str(technique_csv_path),
"results": paper_results,
}
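# Example (illustrative) — assuming a local ./papers directory of PDFs and
# MISTRAL_API_KEY in the environment:
#   out = extract_methods_from_pdfs.invoke({"pdf_dir": "papers"})
#   print(out["n_papers"], out["csv_path"], out["technique_csv_path"])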
# ---------------------------------------------------------------------------
# Tool registry — imported by agent.py
# ---------------------------------------------------------------------------
ALL_TOOLS = [
load_scopus_csv,
run_bertopic_discovery,
label_topics_with_llm,
verify_topic_labels_with_groq,
consolidate_into_themes,
compare_with_taxonomy,
verify_taxonomy_mapping_with_groq,
generate_comparison_csv,
export_narrative,
extract_methods_from_pdfs,
]