Spaces:

Dash10107
/

eis-topic-intelligence

Sleeping

eis-topic-intelligence / topic_pipeline.py

Daksh C Jain

Initial commit: EIS Topic Intelligence — UMAP+HDBSCAN+Mistral council, dark EIS theme, 23 clusters from Enterprise Information Systems corpus

c91d9b4 6 days ago

raw

history blame contribute delete

53.3 kB

	import json
	import os
	import re
	from collections import Counter, defaultdict
	from datetime import datetime
	from typing import Any, Dict, List, Tuple

	import numpy as np
	import pandas as pd
	import plotly.express as px
	from sklearn.cluster import AgglomerativeClustering, KMeans
	from sklearn.decomposition import PCA, TruncatedSVD
	from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
	from sklearn.metrics import silhouette_score
	from sklearn.metrics.pairwise import cosine_similarity
	from sklearn.preprocessing import normalize


	OUTPUT_DIR = "./outputs"
	os.makedirs(OUTPUT_DIR, exist_ok=True)

	TARGET_MIN_CLUSTERS = 15
	TARGET_MAX_CLUSTERS = 25
	MIN_CLUSTER_SIZE = 5
	MAX_CLUSTER_SIZE = 100
	LABEL_STOP_WORDS = set("""
	abstract available research study studies paper papers article articles journal
	based using use uses used result results effect effects model models approach
	analysis data information system systems electronic markets market business
	""".split())

	PAJAIS_25 = [
	"IS Strategy and Management",
	"E-Commerce and E-Business",
	"IT Adoption and Diffusion",
	"Business Intelligence and Analytics",
	"Social Commerce and Social Media",
	"Mobile Commerce and Applications",
	"Knowledge Management",
	"Healthcare Information Systems",
	"Privacy, Security and Trust",
	"Enterprise Systems and ERP",
	"Digital Platforms and Ecosystems",
	"Blockchain and Distributed Ledgers",
	"Artificial Intelligence and Machine Learning",
	"Human-Computer Interaction and UX",
	"Digital Transformation and Innovation",
	"Financial Technology and Digital Finance",
	"Supply Chain and Logistics IS",
	"Smart Systems IoT and Smart Cities",
	"IS Research Methods and Theory",
	"Recommender and Personalization Systems",
	"Digital Marketing and Advertising",
	"Virtual Teams and Online Collaboration",
	"Cloud Computing and SaaS",
	"Big Data Analytics and Data Science",
	"IS Education and Training",
	]

	CATEGORY_TERMS = {
	"IS Strategy and Management": "strategy governance value performance management capability alignment",
	"E-Commerce and E-Business": "e-commerce marketplace online shopping electronic market platform transaction",
	"IT Adoption and Diffusion": "adoption acceptance intention use continuance utaut tam diffusion",
	"Business Intelligence and Analytics": "analytics data mining business intelligence decision support prediction",
	"Social Commerce and Social Media": "social media social commerce online community influencer live streaming",
	"Mobile Commerce and Applications": "mobile app smartphone m-commerce location based wearable",
	"Knowledge Management": "knowledge sharing knowledge management learning collaboration expertise",
	"Healthcare Information Systems": "health healthcare patient medical telemedicine ehealth",
	"Privacy, Security and Trust": "privacy security trust risk fraud identity protection",
	"Enterprise Systems and ERP": "erp enterprise system process integration organization",
	"Digital Platforms and Ecosystems": "platform ecosystem multi-sided digital platform complementor",
	"Blockchain and Distributed Ledgers": "blockchain distributed ledger smart contract token cryptocurrency",
	"Artificial Intelligence and Machine Learning": "artificial intelligence machine learning ai algorithm automation robot",
	"Human-Computer Interaction and UX": "user experience interface interaction usability design hci",
	"Digital Transformation and Innovation": "digital transformation innovation digitization disruption business model",
	"Financial Technology and Digital Finance": "fintech finance payment robo-advisor banking investment",
	"Supply Chain and Logistics IS": "supply chain logistics procurement inventory operations",
	"Smart Systems IoT and Smart Cities": "iot internet of things sensor smart city smart service",
	"IS Research Methods and Theory": "method theory literature review framework model research design",
	"Recommender and Personalization Systems": "recommendation recommender personalization preference choice",
	"Digital Marketing and Advertising": "advertising marketing consumer brand customer targeting",
	"Virtual Teams and Online Collaboration": "virtual team collaboration remote work crowd outsourcing",
	"Cloud Computing and SaaS": "cloud saas service computing infrastructure platform as a service",
	"Big Data Analytics and Data Science": "big data data science text mining deep learning analytics",
	"IS Education and Training": "education training learning student teaching mooc",
	}

	THEORY_PATTERNS = [
	"technology acceptance model", "tam", "utaut", "diffusion of innovation",
	"task technology fit", "social exchange theory", "institutional theory",
	"resource based view", "transaction cost", "information systems success",
	"expectation confirmation", "trust theory", "planned behavior",
	]

	METHOD_PATTERNS = [
	"survey", "experiment", "case study", "structural equation", "sem",
	"regression", "machine learning", "design science", "literature review",
	"qualitative", "interview", "content analysis", "simulation",
	]

	COMPUTATIONAL_PATTERNS = [
	"machine learning", "deep learning", "neural network", "random forest",
	"support vector", "svm", "classification", "clustering", "topic model",
	"lda", "natural language processing", "nlp", "text mining", "sentiment",
	"recommender", "algorithm", "prediction", "analytics", "optimization",
	]


	def _opath(name: str) -> str:
	return os.path.join(OUTPUT_DIR, name)


	def _save_json(data: Any, name: str) -> str:
	path = _opath(name)
	with open(path, "w", encoding="utf-8") as f:
	json.dump(data, f, indent=2, ensure_ascii=False)
	return path


	def _clean_text(text: Any) -> str:
	text = re.sub(r"\s+", " ", str(text or "")).strip()
	text = re.sub(r"©.*$", "", text).strip()
	return text


	def _first_existing(df: pd.DataFrame, candidates: List[str]) -> str:
	lowered = {c.lower(): c for c in df.columns}
	for name in candidates:
	if name.lower() in lowered:
	return lowered[name.lower()]
	return ""


	def load_corpus(filepath: str) -> Tuple[pd.DataFrame, Dict[str, Any]]:
	df = pd.read_csv(filepath, encoding="utf-8-sig", on_bad_lines="skip")
	title_col = _first_existing(df, ["Title"])
	abstract_col = _first_existing(df, ["Abstract"])
	doi_col = _first_existing(df, ["DOI"])
	year_col = _first_existing(df, ["Year"])
	journal_col = _first_existing(df, ["Source title", "Journal"])
	cited_col = _first_existing(df, ["Cited by", "Citations"])

	if not title_col or not abstract_col:
	raise ValueError("CSV must include Title and Abstract columns.")

	df = df.copy()
	df["__title"] = df[title_col].map(_clean_text)
	df["__abstract"] = df[abstract_col].map(_clean_text)
	df["__doi"] = df[doi_col].fillna("").map(str) if doi_col else ""
	df["__combined"] = (df["__title"] + ". " + df["__abstract"]).map(_clean_text)
	df = df[df["__combined"].str.len() > 80].reset_index(drop=True)
	df["__paper_id"] = np.arange(len(df))

	if cited_col:
	df["__cited_by"] = pd.to_numeric(df[cited_col], errors="coerce").fillna(0)
	else:
	df["__cited_by"] = 0

	years = pd.to_numeric(df[year_col], errors="coerce") if year_col else pd.Series(dtype=float)
	journal = df[journal_col].dropna().astype(str).mode().iloc[0] if journal_col and not df[journal_col].dropna().empty else "Unknown"
	config = {
	"filepath": filepath,
	"journal": journal,
	"rows": int(len(df)),
	"year_min": int(years.min()) if not years.dropna().empty else None,
	"year_max": int(years.max()) if not years.dropna().empty else None,
	"title_column": title_col,
	"abstract_column": abstract_col,
	"doi_column": doi_col or "missing",
	"combined_field": "Title + Abstract; DOI retained as paper identifier",
	"generated_at": datetime.now().isoformat(timespec="seconds"),
	}
	_save_json(config, "corpus_config.json")
	return df, config


	def _embed_documents(texts: List[str]) -> Tuple[np.ndarray, Dict[str, Any]]:
	errors = []
	try:
	import torch
	from transformers import AutoModel, AutoTokenizer

	model_name = "allenai/specter2_base"
	tokenizer = AutoTokenizer.from_pretrained(model_name)
	model = AutoModel.from_pretrained(model_name)
	model.eval()
	batches = []
	with torch.no_grad():
	for start in range(0, len(texts), 8):
	batch = texts[start:start + 8]
	encoded = tokenizer(
	batch,
	padding=True,
	truncation=True,
	max_length=512,
	return_tensors="pt",
	)
	output = model(**encoded)
	mask = encoded["attention_mask"].unsqueeze(-1)
	pooled = (output.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
	batches.append(pooled.cpu().numpy())
	vectors = normalize(np.vstack(batches))
	return np.asarray(vectors, dtype=np.float32), {
	"embedding_model": "allenai/specter2_base",
	"embedding_note": "SPECTER2 transformer embeddings from Title + Abstract; DOI retained as paper identifier.",
	}
	except Exception as exc:
	errors.append(f"transformers allenai/specter2_base: {exc.__class__.__name__}: {exc}")

	for model_name in ["allenai/specter2_base", "allenai-specter"]:
	try:
	from sentence_transformers import SentenceTransformer

	model = SentenceTransformer(model_name)
	vectors = model.encode(
	texts,
	normalize_embeddings=True,
	batch_size=16,
	show_progress_bar=True,
	)
	return np.asarray(vectors, dtype=np.float32), {
	"embedding_model": model_name,
	"embedding_note": "Transformer embeddings. SPECTER2 attempted first.",
	}
	except Exception as exc:
	errors.append(f"{model_name}: {exc.__class__.__name__}: {exc}")

	vectorizer = TfidfVectorizer(
	max_features=5000,
	ngram_range=(1, 2),
	min_df=2,
	max_df=0.85,
	stop_words="english",
	)
	tfidf = vectorizer.fit_transform(texts)
	n_components = min(256, max(2, min(tfidf.shape) - 1))
	svd = TruncatedSVD(n_components=n_components, random_state=42)
	vectors = normalize(svd.fit_transform(tfidf))
	meta = {
	"embedding_model": "TF-IDF + TruncatedSVD fallback",
	"embedding_note": "SPECTER2/Transformer loading failed; deterministic fallback kept the app runnable.",
	"embedding_errors": errors[-3:],
	}
	return np.asarray(vectors, dtype=np.float32), meta


	def _cluster_metrics(labels: np.ndarray, vectors: np.ndarray) -> Dict[str, Any]:
	valid = labels >= 0
	cluster_ids, counts = np.unique(labels[valid], return_counts=True)
	n_clusters = int(len(cluster_ids))
	noise_ratio = float(np.mean(~valid)) if len(labels) else 1.0
	too_small = int(np.sum(counts < MIN_CLUSTER_SIZE)) if len(counts) else 999
	too_large = int(np.sum(counts > MAX_CLUSTER_SIZE)) if len(counts) else 999
	silhouette = -1.0
	if n_clusters > 1 and np.sum(valid) > n_clusters:
	sample_vectors = vectors[valid]
	sample_labels = labels[valid]
	if len(sample_vectors) > 800:
	rng = np.random.default_rng(42)
	sample_idx = rng.choice(len(sample_vectors), 800, replace=False)
	sample_vectors = sample_vectors[sample_idx]
	sample_labels = sample_labels[sample_idx]
	try:
	silhouette = float(silhouette_score(sample_vectors, sample_labels, metric="cosine"))
	except Exception:
	silhouette = -1.0
	range_penalty = 0
	if n_clusters < TARGET_MIN_CLUSTERS:
	range_penalty = (TARGET_MIN_CLUSTERS - n_clusters) * 3
	if n_clusters > TARGET_MAX_CLUSTERS:
	range_penalty = (n_clusters - TARGET_MAX_CLUSTERS) * 3
	score = (
	range_penalty
	+ too_small * 2
	+ too_large * 4
	+ noise_ratio * 8
	- silhouette
	)
	return {
	"n_clusters": n_clusters,
	"noise_ratio": round(noise_ratio, 4),
	"min_size": int(counts.min()) if len(counts) else 0,
	"max_size": int(counts.max()) if len(counts) else 0,
	"too_small": too_small,
	"too_large": too_large,
	"silhouette_cosine": round(silhouette, 4),
	"score": round(float(score), 4),
	}


	def _compact_labels(labels: np.ndarray) -> np.ndarray:
	labels = np.asarray(labels, dtype=int).copy()
	positive = [int(x) for x in sorted(np.unique(labels)) if x >= 0]
	mapping = {old: new for new, old in enumerate(positive)}
	return np.asarray([mapping.get(int(x), -1) for x in labels], dtype=int)


	def _repair_labels(labels: np.ndarray, vectors: np.ndarray) -> np.ndarray:
	labels = _compact_labels(labels)
	if np.all(labels < 0):
	k = min(20, max(TARGET_MIN_CLUSTERS, len(vectors) // 35))
	return KMeans(n_clusters=k, random_state=42, n_init=20).fit_predict(vectors)

	noise_idx = np.where(labels < 0)[0]
	if len(noise_idx):
	valid_ids = [int(x) for x in sorted(np.unique(labels)) if x >= 0]
	centroids = np.asarray([vectors[labels == cid].mean(axis=0) for cid in valid_ids])
	nearest = cosine_similarity(vectors[noise_idx], centroids).argmax(axis=1)
	labels[noise_idx] = np.asarray([valid_ids[i] for i in nearest], dtype=int)

	labels = _compact_labels(labels)
	next_id = int(labels.max()) + 1
	for cid in list(sorted(np.unique(labels))):
	idx = np.where(labels == cid)[0]
	if len(idx) <= MAX_CLUSTER_SIZE:
	continue
	centroid = vectors[idx].mean(axis=0, keepdims=True)
	order = cosine_similarity(vectors[idx], centroid).ravel().argsort()[::-1]
	ordered_idx = idx[order]
	chunks = [ordered_idx[i:i + MAX_CLUSTER_SIZE] for i in range(0, len(ordered_idx), MAX_CLUSTER_SIZE)]
	labels[chunks[0]] = cid
	for chunk in chunks[1:]:
	labels[chunk] = next_id
	next_id += 1

	labels = _compact_labels(labels)
	changed = True
	while changed:
	changed = False
	ids, counts = np.unique(labels, return_counts=True)
	tiny = [int(cid) for cid, count in zip(ids, counts) if count < MIN_CLUSTER_SIZE]
	if not tiny or len(ids) <= TARGET_MIN_CLUSTERS:
	break
	for cid in tiny:
	idx = np.where(labels == cid)[0]
	other_ids = [int(x) for x in np.unique(labels) if int(x) != cid]
	if not other_ids:
	continue
	centroid = vectors[idx].mean(axis=0, keepdims=True)
	other_centroids = np.asarray([vectors[labels == oid].mean(axis=0) for oid in other_ids])
	nearest_order = cosine_similarity(centroid, other_centroids).ravel().argsort()[::-1]
	target = other_ids[int(nearest_order[0])]
	labels[idx] = target
	labels = _compact_labels(labels)
	changed = True
	break
	labels = _compact_labels(labels)
	while len(np.unique(labels)) < TARGET_MIN_CLUSTERS:
	ids, counts = np.unique(labels, return_counts=True)
	largest = int(ids[np.argmax(counts)])
	idx = np.where(labels == largest)[0]
	if len(idx) < MIN_CLUSTER_SIZE * 2:
	break
	centroid = vectors[idx].mean(axis=0, keepdims=True)
	order = cosine_similarity(vectors[idx], centroid).ravel().argsort()[::-1]
	split_at = len(idx) // 2
	if split_at < MIN_CLUSTER_SIZE or len(idx) - split_at < MIN_CLUSTER_SIZE:
	break
	labels[idx[order[split_at:]]] = int(labels.max()) + 1
	labels = _compact_labels(labels)
	return _compact_labels(labels)


	def _optimizer_recommendation(metrics: Dict[str, Any]) -> str:
	if metrics["n_clusters"] < TARGET_MIN_CLUSTERS:
	return "Increase UMAP n_neighbors separation pressure or lower HDBSCAN min_cluster_size."
	if metrics["n_clusters"] > TARGET_MAX_CLUSTERS:
	return "Raise HDBSCAN min_cluster_size or increase UMAP n_neighbors to merge nearby themes."
	if metrics["max_size"] > MAX_CLUSTER_SIZE:
	return "Split dominant clusters by lowering min_cluster_size or lowering min_samples."
	if metrics["noise_ratio"] > 0.25:
	return "Reduce min_samples and keep UMAP n_components at 5-10 to reduce noise."
	return "Keep this parameter set; it satisfies the 15-25 cluster target best."


	def _run_umap_hdbscan(vectors: np.ndarray) -> Tuple[np.ndarray, np.ndarray, Dict[str, Any], List[Dict[str, Any]]]:
	candidates = []
	best = None
	best_reduced = None
	best_params = None

	try:
	import umap
	try:
	import hdbscan as external_hdbscan
	hdbscan_backend = "external"
	except Exception:
	external_hdbscan = None
	from sklearn.cluster import HDBSCAN as SklearnHDBSCAN
	hdbscan_backend = "sklearn"

	for n_neighbors in [10, 20, 35]:
	for n_components in [5, 10]:
	reduced = umap.UMAP(
	n_neighbors=n_neighbors,
	n_components=n_components,
	min_dist=0.0,
	metric="cosine",
	random_state=42,
	).fit_transform(vectors)
	for min_cluster_size in [5, 8, 12, 16, 25]:
	for min_samples in [1, 3, None]:
	if hdbscan_backend == "external":
	clusterer = external_hdbscan.HDBSCAN(
	min_cluster_size=min_cluster_size,
	min_samples=min_samples,
	metric="euclidean",
	prediction_data=True,
	)
	else:
	clusterer = SklearnHDBSCAN(
	min_cluster_size=min_cluster_size,
	min_samples=min_samples,
	metric="euclidean",
	)
	labels = clusterer.fit_predict(reduced)
	labels = _repair_labels(labels, vectors)
	metrics = _cluster_metrics(labels, vectors)
	params = {
	"algorithm": f"UMAP + HDBSCAN ({hdbscan_backend})",
	"umap_n_neighbors": n_neighbors,
	"umap_n_components": n_components,
	"umap_metric": "cosine",
	"hdbscan_min_cluster_size": min_cluster_size,
	"hdbscan_min_samples": min_samples,
	"hdbscan_metric": "euclidean",
	}
	row = {params, metrics, "optimizer_recommendation": _optimizer_recommendation(metrics)}
	candidates.append(row)
	if best is None or metrics["score"] < best[1]["score"]:
	best = (labels, metrics, getattr(clusterer, "probabilities_", np.ones(len(labels))))
	best_reduced = reduced
	best_params = params
	if (
	TARGET_MIN_CLUSTERS <= metrics["n_clusters"] <= TARGET_MAX_CLUSTERS
	and metrics["too_small"] == 0
	and metrics["too_large"] == 0
	and metrics["noise_ratio"] <= 0.25
	):
	probs = getattr(clusterer, "probabilities_", np.ones(len(labels)))
	if len(probs) != len(labels):
	probs = np.ones(len(labels))
	return labels, probs, {params, metrics}, candidates
	except Exception as exc:
	candidates.append({
	"algorithm": "UMAP + HDBSCAN",
	"error": f"{exc.__class__.__name__}: {exc}",
	"optimizer_recommendation": "UMAP is unavailable in this Python install; try PCA manifold fallback with HDBSCAN.",
	})
	try:
	from sklearn.cluster import HDBSCAN as SklearnHDBSCAN

	for n_components in [5, 10, 20]:
	pca_components = min(n_components, max(2, min(vectors.shape) - 1))
	reduced = PCA(n_components=pca_components, random_state=42).fit_transform(vectors)
	for min_cluster_size in [5, 8, 12, 16, 25]:
	for min_samples in [1, 3, None]:
	clusterer = SklearnHDBSCAN(
	min_cluster_size=min_cluster_size,
	min_samples=min_samples,
	metric="euclidean",
	)
	labels = _repair_labels(clusterer.fit_predict(reduced), vectors)
	metrics = _cluster_metrics(labels, vectors)
	params = {
	"algorithm": "PCA manifold fallback + HDBSCAN (sklearn; UMAP unavailable locally)",
	"pca_n_components": pca_components,
	"hdbscan_min_cluster_size": min_cluster_size,
	"hdbscan_min_samples": min_samples,
	"hdbscan_metric": "euclidean",
	}
	row = {params, metrics, "optimizer_recommendation": _optimizer_recommendation(metrics)}
	candidates.append(row)
	probs = np.ones(len(labels))
	if best is None or metrics["score"] < best[1]["score"]:
	best = (labels, metrics, probs)
	best_params = params
	if (
	TARGET_MIN_CLUSTERS <= metrics["n_clusters"] <= TARGET_MAX_CLUSTERS
	and metrics["too_small"] == 0
	and metrics["too_large"] == 0
	):
	return labels, probs, {params, metrics}, candidates
	except Exception as fallback_exc:
	candidates.append({
	"algorithm": "PCA manifold fallback + HDBSCAN",
	"error": f"{fallback_exc.__class__.__name__}: {fallback_exc}",
	"optimizer_recommendation": "Use deterministic KMeans fallback so the pipeline still completes.",
	})

	if best is not None and TARGET_MIN_CLUSTERS <= best[1]["n_clusters"] <= TARGET_MAX_CLUSTERS:
	return best[0], best[2], {best_params, best[1]}, candidates

	n_clusters = min(20, max(TARGET_MIN_CLUSTERS, len(vectors) // 35))
	labels = KMeans(n_clusters=n_clusters, random_state=42, n_init=20).fit_predict(vectors)
	labels = _repair_labels(labels, vectors)
	metrics = _cluster_metrics(labels, vectors)
	params = {
	"algorithm": "KMeans fallback after UMAP/HDBSCAN optimization",
	"n_clusters": n_clusters,
	**metrics,
	}
	sims = np.zeros(len(labels), dtype=float)
	for cid in np.unique(labels):
	idx = np.where(labels == cid)[0]
	centroid = vectors[idx].mean(axis=0, keepdims=True)
	sims[idx] = cosine_similarity(vectors[idx], centroid).ravel()
	probs = np.clip((sims + 1) / 2, 0, 1)
	candidates.append({**params, "optimizer_recommendation": "Fallback used to guarantee a crisp 15-25 cluster solution."})
	return labels, probs, params, candidates


	def _top_terms(texts: List[str], top_n: int = 8) -> List[str]:
	if not texts:
	return []
	vec = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=1, max_features=1200)
	matrix = vec.fit_transform(texts)
	scores = np.asarray(matrix.sum(axis=0)).ravel()
	terms = np.asarray(vec.get_feature_names_out())
	order = scores.argsort()[::-1]
	cleaned = []
	for term in terms[order]:
	pieces = term.split()
	if (
	len(term) > 2
	and not term.isdigit()
	and term not in cleaned
	and not all(piece in LABEL_STOP_WORDS for piece in pieces)
	and not any(piece in {"doi", "s12525", "1007"} for piece in pieces)
	):
	cleaned.append(term)
	if len(cleaned) >= top_n:
	break
	return cleaned


	def _category_for_terms(terms: List[str]) -> Tuple[str, float]:
	query = " ".join(terms)
	docs = [query] + list(CATEGORY_TERMS.values())
	vec = TfidfVectorizer(stop_words="english").fit_transform(docs)
	sims = cosine_similarity(vec[0], vec[1:]).ravel()
	best = int(np.argmax(sims))
	return PAJAIS_25[best], float(sims[best])


	def _title_from_terms(terms: List[str], category: str) -> str:
	priority = [
	"artificial intelligence", "machine learning", "blockchain", "digital platform",
	"social commerce", "e-commerce", "privacy", "trust", "fintech", "robo advisor",
	"analytics", "mobile", "digital transformation", "recommender",
	]
	joined = " ".join(terms)
	for phrase in priority:
	if phrase in joined:
	return phrase.title().replace("Ai", "AI")
	if terms:
	return " ".join(t.title() for t in terms[:3])
	return category


	def _optional_mistral_label(cluster: Dict[str, Any]) -> str:
	api_key = os.environ.get("MISTRAL_API_KEY", "").strip()
	if not api_key:
	return ""
	try:
	from langchain_core.output_parsers import StrOutputParser
	from langchain_core.prompts import PromptTemplate
	from langchain_mistralai import ChatMistralAI

	prompt = PromptTemplate.from_template(
	"Name this academic topic cluster in 3-7 words. "
	"Use only the evidence, no markdown.\nKeywords: {keywords}\nTitles:\n{titles}"
	)
	llm = ChatMistralAI(model="mistral-small-latest", api_key=api_key, temperature=0.1)
	return (prompt \| llm \| StrOutputParser()).invoke({
	"keywords": ", ".join(cluster["keywords"]),
	"titles": "\n".join(cluster["top_titles"][:3]),
	}).strip().strip('"')[:80]
	except Exception:
	return ""


	def _optional_mistral_council(cluster: Dict[str, Any]) -> List[Dict[str, str]]:
	api_key = os.environ.get("MISTRAL_API_KEY", "").strip()
	if not api_key:
	return []
	try:
	from langchain_core.output_parsers import StrOutputParser
	from langchain_core.prompts import PromptTemplate
	from langchain_mistralai import ChatMistralAI

	personas = [
	("LLM Council A - Domain Labeler", "Name the Information Systems research theme."),
	("LLM Council B - Methods Skeptic", "Name the theme conservatively using only the three titles and keywords."),
	("LLM Council C - Taxonomy Judge", "Name the theme and prefer PAJAIS-style terminology when appropriate."),
	]
	llm = ChatMistralAI(model="mistral-small-latest", api_key=api_key, temperature=0.1)
	prompt = PromptTemplate.from_template(
	"{task}\nReturn one concise 3-7 word label only.\n"
	"Keywords: {keywords}\n"
	"High-probability paper titles:\n{titles}"
	)
	votes = []
	for member, task in personas:
	label = (prompt \| llm \| StrOutputParser()).invoke({
	"task": task,
	"keywords": ", ".join(cluster["keywords"]),
	"titles": "\n".join(cluster["top_titles"][:3]),
	}).strip().strip('"')[:80]
	if label:
	votes.append({"member": member, "label": label, "method": "Mistral LLM council vote using top-3 high-probability paper titles"})
	return votes
	except Exception:
	return []


	def _build_cluster_summaries(df: pd.DataFrame, vectors: np.ndarray, labels: np.ndarray, probabilities: np.ndarray) -> List[Dict[str, Any]]:
	summaries = []
	for cid in sorted([int(x) for x in np.unique(labels) if x >= 0]):
	idx = np.where(labels == cid)[0]
	texts = df.loc[idx, "__combined"].tolist()
	titles = df.loc[idx, "__title"].tolist()
	keywords = _top_terms(texts, 10)
	category, category_score = _category_for_terms(keywords + titles[:5])
	centroid = vectors[idx].mean(axis=0, keepdims=True)
	sims = cosine_similarity(vectors[idx], centroid).ravel()
	rank = np.lexsort((-sims, -probabilities[idx]))[::-1]
	top_idx = idx[rank[:3]]
	cluster_draft = {
	"keywords": keywords,
	"top_titles": df.loc[top_idx, "__title"].tolist(),
	}
	vote_keyword = _title_from_terms(keywords, category)
	vote_taxonomy = category
	llm_votes = _optional_mistral_council(cluster_draft)
	votes = llm_votes or [
	{"member": "Council A - Keyword Extractor", "label": vote_keyword, "method": "deterministic cluster TF-IDF terms"},
	{"member": "Council B - PAJAIS Mapper", "label": vote_taxonomy, "method": "taxonomy-term cosine validation"},
	{"member": "Council C - Local Semantic Judge", "label": vote_keyword, "method": "local fallback; configure MISTRAL_API_KEY for live 3-LLM council"},
	]
	normalized_votes = [v["label"].strip().lower() for v in votes if v["label"]]
	agreement = Counter(normalized_votes).most_common(1)[0][1] / max(1, len(normalized_votes))
	final_label = Counter([v["label"] for v in votes if v["label"]]).most_common(1)[0][0] if votes else vote_keyword
	summaries.append({
	"cluster_id": cid,
	"label": final_label,
	"category": category,
	"confidence": round(float(np.mean(probabilities[idx]) * 0.65 + agreement * 0.35), 3),
	"category_confidence": round(category_score, 3),
	"sentence_count": int(len(idx)),
	"paper_count": int(len(idx)),
	"top_sentences": df.loc[top_idx, "__abstract"].str[:350].tolist(),
	"top_titles": df.loc[top_idx, "__title"].tolist(),
	"keywords": keywords,
	"centroid": centroid.ravel().tolist(),
	"paper_indices": [int(i) for i in idx],
	"council_votes": votes,
	"agreement_score": round(float(agreement), 3),
	"is_niche": bool(len(idx) <= 8),
	"reasoning": f"{len(idx)} papers; top terms: {', '.join(keywords[:6])}; council agreement {agreement:.2f}.",
	})
	summaries.sort(key=lambda x: x["paper_count"], reverse=True)
	return summaries


	def _generate_charts(summaries: List[Dict[str, Any]]) -> None:
	chart_dir = _opath("combined_charts")
	os.makedirs(chart_dir, exist_ok=True)
	if not summaries:
	return
	centroids = np.asarray([s["centroid"] for s in summaries])
	sizes = [s["paper_count"] for s in summaries]
	labels = [s["label"] for s in summaries]
	coords = PCA(n_components=2, random_state=42).fit_transform(centroids) if len(summaries) > 1 else np.zeros((1, 2))
	fig = px.scatter(
	x=coords[:, 0],
	y=coords[:, 1],
	size=sizes,
	color=[s["category"] for s in summaries],
	hover_name=labels,
	title="Intertopic Map - Title + Abstract + DOI",
	template="plotly_dark",
	)
	fig.write_html(os.path.join(chart_dir, "intertopic_map.html"), include_plotlyjs="cdn", full_html=True)
	bar = px.bar(
	x=labels,
	y=sizes,
	title="Cluster Sizes",
	labels={"x": "Cluster", "y": "Papers"},
	template="plotly_dark",
	)
	bar.write_html(os.path.join(chart_dir, "bar_chart.html"), include_plotlyjs="cdn", full_html=True)
	tree = px.treemap(
	names=labels,
	parents=["clusters"] * len(labels),
	values=sizes,
	title="Topic Treemap",
	)
	tree.write_html(os.path.join(chart_dir, "treemap.html"), include_plotlyjs="cdn", full_html=True)


	def _taxonomy_map(summaries: List[Dict[str, Any]]) -> Dict[str, Any]:
	mapping = {}
	for s in summaries:
	is_novel = s["category_confidence"] < 0.08
	mapping[s["label"]] = {
	"theme": s["label"],
	"pajais_match": "NOVEL" if is_novel else s["category"],
	"match_confidence": s["category_confidence"],
	"reasoning": s["reasoning"],
	"is_novel": is_novel,
	}
	covered = {v["pajais_match"] for v in mapping.values() if not v["is_novel"]}
	novel = [k for k, v in mapping.items() if v["is_novel"]]
	return {
	"run_key": "combined",
	"taxonomy_mapping": mapping,
	"novel_themes": novel,
	"pajais_gap_categories": [c for c in PAJAIS_25 if c not in covered],
	"coverage_stats": {
	"total_themes": len(mapping),
	"mapped": len(mapping) - len(novel),
	"novel": len(novel),
	},
	}


	def _write_comparison_csv(summaries: List[Dict[str, Any]]) -> str:
	rows = []
	for s in summaries:
	rows.append({
	"Cluster_ID": s["cluster_id"],
	"Final_Label": s["label"],
	"PAJAIS_Category": s["category"],
	"Papers": s["paper_count"],
	"Confidence": s["confidence"],
	"Agreement": s["agreement_score"],
	"Top_Keywords": "; ".join(s["keywords"][:8]),
	"Top_3_Paper_Titles": " \| ".join(s["top_titles"][:3]),
	"Validation_Status": "VALIDATED" if s["confidence"] >= 0.55 else "REVIEW",
	})
	path = _opath("comparison.csv")
	pd.DataFrame(rows).to_csv(path, index=False)
	return path


	def _extract_matches(text: str, patterns: List[str]) -> List[str]:
	lower = text.lower()
	return sorted({p for p in patterns if re.search(r"\b" + re.escape(p.lower()) + r"\b", lower)})


	def _write_tccm_validation(df: pd.DataFrame) -> str:
	top = df.sort_values("__cited_by", ascending=False).head(100).copy()
	rows = []
	for rank, (_, row) in enumerate(top.iterrows(), start=1):
	text = f"{row['__title']} {row['__abstract']}"
	theories = _extract_matches(text, THEORY_PATTERNS)
	methods = _extract_matches(text, METHOD_PATTERNS)
	techniques = _extract_matches(text, COMPUTATIONAL_PATTERNS)
	context = []
	for category, terms in CATEGORY_TERMS.items():
	if any(term in text.lower() for term in terms.split()[:6]):
	context.append(category)
	rows.append({
	"Paper ID": rank,
	"Title": row["__title"],
	"DOI": row["__doi"],
	"Cited_By": row["__cited_by"],
	"Theory_Regex": "; ".join(theories),
	"Context_Taxonomy": "; ".join(context[:3]),
	"Characteristics_Constructs": "; ".join(_top_terms([text], 6)),
	"Method_Regex": "; ".join(methods),
	"Computational_Techniques_Regex": "; ".join(techniques),
	"Validation_Method_1": "dictionary/regex extraction",
	"Validation_Method_2": "cluster/category semantic match",
	"Validation_Status": "VALIDATED" if (methods or techniques or theories) else "NEEDS_FULL_TEXT_REVIEW",
	})
	path = _opath("tccm_validation.csv")
	pd.DataFrame(rows).to_csv(path, index=False)
	return path


	def parse_notebooklm_tccm_text(raw_text: str) -> str:
	"""Parse NotebookLM's copied table text into a Google-Sheets-ready CSV."""
	columns = [
	"Paper ID",
	"Paper Citation",
	"Study Type",
	"Dependent Variable(s) (DV)",
	"Independent Variable(s) (IVs)",
	"Mediator(s)",
	"Moderator(s)",
	"Relationship Direction",
	"Evidence Snippet",
	]
	skip = {c.lower() for c in columns}
	skip.update({
	"today • 14:09",
	"ask a question or create something",
	"notebooklm can be inaccurate; please double-check its responses.",
	"i want minimum 1 dependent and 1 independent variable",
	"i want minimum 1 dependernt and 1 independent variable",
	})
	lines = [
	re.sub(r"\s+", " ", line.strip())
	for line in str(raw_text or "").splitlines()
	if line.strip()
	]

	rows = []
	current_id = None
	fields = []

	def is_id(line: str) -> bool:
	return bool(re.fullmatch(r"\d{1,3}", line)) and 1 <= int(line) <= 100

	def finalise():
	if current_id is None or not fields:
	return
	cleaned = [f for f in fields if f.lower() not in skip]
	if len(cleaned) < 2:
	return
	fixed = cleaned[:7]
	evidence = " ".join(cleaned[7:]) if len(cleaned) > 7 else ""
	while len(fixed) < 7:
	fixed.append("NA")
	rows.append({
	"Paper ID": int(current_id),
	"Paper Citation": fixed[0],
	"Study Type": fixed[1],
	"Dependent Variable(s) (DV)": fixed[2],
	"Independent Variable(s) (IVs)": fixed[3],
	"Mediator(s)": fixed[4],
	"Moderator(s)": fixed[5],
	"Relationship Direction": fixed[6],
	"Evidence Snippet": evidence,
	})

	seen_header = False
	for line in lines:
	lower = line.lower()
	if lower in skip:
	seen_header = True
	continue
	if is_id(line):
	if seen_header or current_id is not None:
	finalise()
	current_id = int(line)
	fields = []
	continue
	if current_id is not None:
	fields.append(line)
	finalise()

	df = pd.DataFrame(rows, columns=columns)
	if not df.empty:
	df = df.sort_values("Paper ID").drop_duplicates("Paper ID", keep="last")
	path = _opath("notebooklm_extraction.csv")
	df.to_csv(path, index=False)
	return path


	def write_tccm_dual_validation(notebooklm_path: str = "", second_llm_path: str = "") -> str:
	base_path = _opath("tccm_validation.csv")
	base = pd.read_csv(base_path) if os.path.exists(base_path) else pd.DataFrame()

	def load_optional(path: str, prefix: str) -> pd.DataFrame:
	if not path or not os.path.exists(path):
	return pd.DataFrame()
	loaded = pd.read_csv(path, encoding="utf-8-sig", on_bad_lines="skip")
	rename = {}
	for col in loaded.columns:
	low = col.lower().strip()
	if low in {"paper id", "paper_id", "rank", "id"}:
	rename[col] = "Paper ID"
	elif low in {"title", "paper title", "article title"}:
	rename[col] = "Title"
	elif low in {"paper citation", "citation"}:
	rename[col] = f"{prefix}_Citation"
	elif "study type" in low:
	rename[col] = f"{prefix}_Study_Type"
	elif "independent" in low or low == "iv" or "ivs" in low:
	rename[col] = f"{prefix}_IV"
	elif "dependent" in low or low == "dv":
	rename[col] = f"{prefix}_DV"
	elif "mediator" in low:
	rename[col] = f"{prefix}_Mediator"
	elif "moderator" in low:
	rename[col] = f"{prefix}_Moderator"
	elif "relationship direction" in low or low == "direction":
	rename[col] = f"{prefix}_Relationship_Direction"
	elif "evidence" in low or "snippet" in low:
	rename[col] = f"{prefix}_Evidence"
	elif low in {"doi"}:
	rename[col] = "DOI"
	elif "theor" in low:
	rename[col] = f"{prefix}_Theory"
	elif "context" in low:
	rename[col] = f"{prefix}_Context"
	elif "method" in low:
	rename[col] = f"{prefix}_Method"
	elif "variable" in low or "construct" in low or "characteristic" in low:
	rename[col] = f"{prefix}_Variables"
	elif "technique" in low or "comput" in low:
	rename[col] = f"{prefix}_Computational_Techniques"
	loaded = loaded.rename(columns=rename)
	keep = [c for c in loaded.columns if c in {"Paper ID", "Title", "DOI"} or c.startswith(prefix)]
	return loaded[keep].copy()

	notebook = load_optional(notebooklm_path, "NotebookLM")
	second = load_optional(second_llm_path, "SecondLLM")

	if base.empty:
	merged = pd.DataFrame()
	else:
	merged = base.copy()
	if not notebook.empty:
	key = "Paper ID" if "Paper ID" in notebook.columns and "Paper ID" in merged.columns else ("DOI" if "DOI" in notebook.columns and merged["DOI"].astype(str).str.len().gt(0).any() else "Title")
	merged = merged.merge(notebook, how="left", on=key, suffixes=("", "_NotebookLM_Input"))
	if not second.empty:
	key = "Paper ID" if "Paper ID" in second.columns and "Paper ID" in merged.columns else ("DOI" if "DOI" in second.columns and merged["DOI"].astype(str).str.len().gt(0).any() else "Title")
	merged = merged.merge(second, how="left", on=key, suffixes=("", "_SecondLLM_Input"))

	if merged.empty:
	merged = pd.DataFrame([{
	"Compliance_Status": "PENDING",
	"Required_Action": "Run topic pipeline first, then upload NotebookLM and second-LLM extraction CSV files.",
	}])
	else:
	has_notebook = any(c.startswith("NotebookLM") for c in merged.columns)
	has_second = any(c.startswith("SecondLLM") for c in merged.columns)

	def row_status(row):
	regex_hit = any(
	str(row.get(c, "")).strip().lower() not in {"", "nan", "none"}
	for c in ["Theory_Regex", "Method_Regex", "Computational_Techniques_Regex", "Characteristics_Constructs"]
	)
	notebook_cols = [c for c in merged.columns if c.startswith("NotebookLM_") and c != "NotebookLM_File_Loaded"]
	second_cols = [c for c in merged.columns if c.startswith("SecondLLM_")]
	notebook_hit = any(str(row.get(c, "")).strip().lower() not in {"", "nan", "none", "false"} for c in notebook_cols)
	second_hit = any(str(row.get(c, "")).strip().lower() not in {"", "nan", "none", "false"} for c in second_cols)
	if notebook_hit and second_hit:
	return "COMPLIANT_NOTEBOOKLM_PLUS_SECOND_LLM"
	if notebook_hit and regex_hit:
	return "PARTIAL_NOTEBOOKLM_PLUS_REGEX"
	if second_hit and regex_hit:
	return "PARTIAL_SECOND_LLM_PLUS_REGEX"
	return "PENDING_NOTEBOOKLM_AND_SECOND_LLM"

	merged["NotebookLM_File_Loaded"] = has_notebook
	merged["Second_LLM_File_Loaded"] = has_second
	merged["Final_TCCM_Compliance_Status"] = merged.apply(row_status, axis=1)
	merged["Required_Action"] = np.where(
	merged["Final_TCCM_Compliance_Status"].eq("COMPLIANT_NOTEBOOKLM_PLUS_SECOND_LLM"),
	"Ready for mentor review with dual AI validation.",
	"Upload NotebookLM extraction and second LLM extraction from full-text PDFs before claiming final compliance.",
	)

	path = _opath("tccm_dual_validation.csv")
	merged.to_csv(path, index=False)
	return path


	def write_compliance_checklist(params: Dict[str, Any], meta: Dict[str, Any], summaries: List[Dict[str, Any]]) -> str:
	has_live_llm = any(
	"Mistral LLM council vote" in vote.get("method", "")
	for summary in summaries
	for vote in summary.get("council_votes", [])
	)
	notebook_loaded = os.path.exists(_opath("notebooklm_extraction.csv"))
	dual_path = _opath("tccm_dual_validation.csv")
	second_loaded = False
	if os.path.exists(dual_path):
	try:
	dual_df = pd.read_csv(dual_path)
	second_loaded = bool(dual_df.get("Second_LLM_File_Loaded", pd.Series([False])).astype(bool).any())
	except Exception:
	second_loaded = False
	rows = [
	{
	"Requirement": "15 to 25 crisp topic clusters",
	"Status": "PASS" if TARGET_MIN_CLUSTERS <= params.get("n_clusters", 0) <= TARGET_MAX_CLUSTERS else "FAIL",
	"Evidence": f"{params.get('n_clusters')} clusters generated.",
	"File": "comparison.csv / cluster_optimization_log.csv",
	},
	{
	"Requirement": "Minimum 5 and maximum 100 papers per cluster",
	"Status": "PASS" if params.get("min_size", 0) >= MIN_CLUSTER_SIZE and params.get("max_size", 999) <= MAX_CLUSTER_SIZE else "FAIL",
	"Evidence": f"min={params.get('min_size')}, max={params.get('max_size')}.",
	"File": "cluster_optimization_log.csv",
	},
	{
	"Requirement": "Cluster optimization loop with parameter recommendations",
	"Status": "PASS" if os.path.exists(_opath("cluster_optimization_log.csv")) else "FAIL",
	"Evidence": "Optimizer records attempted settings, scores, and recommendations.",
	"File": "cluster_optimization_log.csv",
	},
	{
	"Requirement": "Top three high-probability paper titles fed for labels",
	"Status": "PASS" if all(len(s.get("top_titles", [])) >= 3 for s in summaries) else "REVIEW",
	"Evidence": "Top_3_Paper_Titles included for every cluster.",
	"File": "comparison.csv",
	},
	{
	"Requirement": "LLM council visible in app, not just story text",
	"Status": "PASS" if os.path.exists(_opath("llm_council_validation.csv")) else "FAIL",
	"Evidence": "Animated council board and vote table in Council Validation tab.",
	"File": "llm_council_validation.csv / app.py",
	},
	{
	"Requirement": "Live 3-LLM council labels",
	"Status": "PASS" if has_live_llm else "CONFIG_REQUIRED",
	"Evidence": "Set MISTRAL_API_KEY in Space secrets to switch from local fallback to live Mistral council.",
	"File": "llm_council_validation.csv",
	},
	{
	"Requirement": "SPECTER2 paper-level embeddings",
	"Status": "PASS" if "specter2" in str(meta.get("embedding_model", "")).lower() else "ENV_FALLBACK",
	"Evidence": meta.get("embedding_note", ""),
	"File": "run_metadata.json",
	},
	{
	"Requirement": "UMAP + HDBSCAN density clustering",
	"Status": "PASS" if str(params.get("algorithm", "")).lower().startswith("umap + hdbscan") else "ENV_FALLBACK",
	"Evidence": str(params.get("algorithm", "")),
	"File": "run_metadata.json / cluster_optimization_log.csv",
	},
	{
	"Requirement": "TCCM corpus loaded and vectorised for computational techniques",
	"Status": "PASS" if os.path.exists(_opath("tccm_validation.csv")) else "FAIL",
	"Evidence": "Top-cited 100 papers exported with regex and semantic computational technique extraction.",
	"File": "tccm_validation.csv",
	},
	{
	"Requirement": "NotebookLM output plus another LLM method for TCCM",
	"Status": "PASS" if notebook_loaded and second_loaded else ("PARTIAL" if notebook_loaded else "INPUT_REQUIRED"),
	"Evidence": (
	"NotebookLM extraction loaded; second LLM extraction still required."
	if notebook_loaded and not second_loaded
	else ("NotebookLM and second LLM extraction loaded." if second_loaded else "Use TCCM Dual Validation tab to upload NotebookLM CSV and second LLM CSV.")
	),
	"File": "tccm_dual_validation.csv",
	},
	{
	"Requirement": "Formal mentor approval before final submission",
	"Status": "MANUAL_REQUIRED",
	"Evidence": "Cannot be automated; get faculty mentor approval.",
	"File": "mentor approval evidence",
	},
	]
	path = _opath("compliance_checklist.csv")
	pd.DataFrame(rows).to_csv(path, index=False)
	_save_json(rows, "compliance_checklist.json")
	return path


	def _write_validation_files(summaries: List[Dict[str, Any]], optimizer_log: List[Dict[str, Any]], params: Dict[str, Any], meta: Dict[str, Any]) -> None:
	council = []
	for s in summaries:
	for vote in s["council_votes"]:
	council.append({
	"cluster_id": s["cluster_id"],
	"final_label": s["label"],
	"member": vote["member"],
	"member_label": vote["label"],
	"method": vote["method"],
	"top_3_titles_used": " \| ".join(s.get("top_titles", [])[:3]),
	"agreement_score": s["agreement_score"],
	"confidence": s["confidence"],
	})
	pd.DataFrame(council).to_csv(_opath("llm_council_validation.csv"), index=False)
	_save_json(council, "llm_council.json")
	pd.DataFrame(optimizer_log).sort_values("score", na_position="last").to_csv(_opath("cluster_optimization_log.csv"), index=False)
	_save_json({"selected_parameters": params, "embedding": meta}, "run_metadata.json")


	def _write_report(config: Dict[str, Any], summaries: List[Dict[str, Any]], params: Dict[str, Any], meta: Dict[str, Any]) -> str:
	lines = [
	"# Topic Modelling Final Submission Report",
	"",
	f"Journal: {config.get('journal')}",
	f"Papers analysed: {config.get('rows')}",
	f"Years: {config.get('year_min')} to {config.get('year_max')}",
	"",
	"## Method",
	f"The model uses one vector per paper from {config.get('combined_field')}. "
	f"Embedding model: {meta.get('embedding_model')}. Clustering: {params.get('algorithm')}. "
	"The optimizer searches UMAP/HDBSCAN parameters and selects the lowest penalty solution "
	"against the required 15-25 clusters, 5 minimum papers per cluster, and 100 maximum papers per cluster.",
	"",
	"## Selected Parameters",
	"```json",
	json.dumps(params, indent=2),
	"```",
	"",
	"## Validated Clusters",
	]
	for s in summaries:
	lines.append(
	f"- C{s['cluster_id']}: {s['label']} ({s['paper_count']} papers, "
	f"confidence {s['confidence']}, PAJAIS: {s['category']}). "
	f"Evidence titles: {' \| '.join(s['top_titles'][:3])}"
	)
	lines.extend([
	"",
	"## Validation",
	"Labels are validated through the in-app council table: keyword extraction, PAJAIS semantic mapping, "
	"and an LLM labeler when MISTRAL_API_KEY is configured. Without a key, the third council member "
	"uses a deterministic local semantic fallback, so the app remains executable end to end.",
	"",
	"TCCM and computational technique extraction are exported in `tccm_validation.csv` for the top-cited 100 papers. "
	"Rows marked `NEEDS_FULL_TEXT_REVIEW` should be checked against PDFs before final academic submission. "
	"Full TCCM compliance requires uploading NotebookLM extraction and a second LLM extraction in the app's "
	"TCCM Dual Validation tab to generate `tccm_dual_validation.csv`.",
	])
	path = _opath("topic_model_report.md")
	with open(path, "w", encoding="utf-8") as f:
	f.write("\n".join(lines))
	with open(_opath("narrative.txt"), "w", encoding="utf-8") as f:
	f.write("\n".join(lines[:60]))
	return path


	def run_complete_pipeline(filepath: str) -> Dict[str, Any]:
	df, config = load_corpus(filepath)
	vectors, meta = _embed_documents(df["__combined"].tolist())
	np.save(_opath("combined_emb.npy"), vectors)
	labels, probabilities, params, optimizer_log = _run_umap_hdbscan(vectors)
	summaries = _build_cluster_summaries(df, vectors, labels, probabilities)

	_save_json(summaries, "combined_labels.json")
	_save_json(summaries, "abstract_labels.json")
	_save_json(summaries, "title_labels.json")
	_save_json({"sentences": df["__combined"].tolist(), "paper_ids": df["__paper_id"].astype(int).tolist()}, "combined_sentences.json")
	taxonomy = _taxonomy_map(summaries)
	_save_json(taxonomy, "taxonomy_map.json")
	_generate_charts(summaries)
	comparison_path = _write_comparison_csv(summaries)
	tccm_path = _write_tccm_validation(df)
	_write_validation_files(summaries, optimizer_log, params, meta)
	dual_tccm_path = write_tccm_dual_validation()
	checklist_path = write_compliance_checklist(params, meta, summaries)
	report_path = _write_report(config, summaries, params, meta)

	deliverables = [
	comparison_path,
	_opath("taxonomy_map.json"),
	_opath("topic_model_report.md"),
	_opath("narrative.txt"),
	_opath("cluster_optimization_log.csv"),
	_opath("llm_council_validation.csv"),
	_opath("tccm_validation.csv"),
	dual_tccm_path,
	checklist_path,
	_opath("run_metadata.json"),
	_opath("combined_labels.json"),
	]
	return {
	"config": config,
	"parameters": params,
	"embedding": meta,
	"clusters": summaries,
	"taxonomy": taxonomy,
	"deliverables": [p for p in deliverables if os.path.exists(p)],
	}