Spaces:

reyansh2005
/

topic-modelling-agent

Sleeping

App Files Files Community

topic-modelling-agent / tools.py

reyansh2005

nice

62e2807 about 1 month ago

raw

history blame contribute delete

30.6 kB

	"""
	tools.py — NLP + Topic Modelling Logic

	Core functions for:
	• Text preprocessing and cleaning
	• TF-IDF vectorization
	• NMF / LDA topic modelling
	• Keyword extraction
	• LLM-powered topic labeling (multi-provider: Groq / Mistral / OpenAI)
	• PAJAIS taxonomy mapping (keyword-overlap scoring)
	• Title vs abstract theme comparison
	• Narrative and reflection generation (LLM or template fallback)
	• Prompt storage (C9)
	"""

	from __future__ import annotations

	import os
	import json
	import time
	import numpy as np
	import pandas as pd
	import requests
	from pathlib import Path

	try:
	import regex as re # enhanced regex from requirements.txt
	except ImportError:
	import re # stdlib fallback

	import nltk
	from nltk.corpus import stopwords
	from sklearn.decomposition import NMF, LatentDirichletAllocation
	from sklearn.feature_extraction.text import TfidfVectorizer

	# ── Download NLTK data (silent) ───────────────────────────────────────────
	# Import-time side effect: requests the NLTK "stopwords" corpus that
	# clean_text() reads via stopwords.words("english"). quiet=True is passed
	# to keep the downloader from printing progress output.
	nltk.download("stopwords", quiet=True)


	# ════════════════════════════════════════════════════════════════════════════
	# Constants
	# ════════════════════════════════════════════════════════════════════════════

	# The 25 PAJAIS category names. map_to_taxonomy() uses this list as its
	# default taxonomy and scores each topic against the tokens of each name.
	PAJAIS_TAXONOMY: list[str] = [
	"Artificial Intelligence & Machine Learning",
	"Natural Language Processing & Text Mining",
	"Computer Vision & Image Processing",
	"Knowledge Representation & Reasoning",
	"Expert Systems & Decision Support",
	"Robotics & Autonomous Systems",
	"Human-Computer Interaction",
	"Information Retrieval & Recommendation Systems",
	"Data Mining & Big Data Analytics",
	"Blockchain & Distributed Ledger Technology",
	"Cloud Computing & Edge Computing",
	"Internet of Things & Sensor Networks",
	"Cybersecurity & Privacy",
	"Software Engineering & DevOps",
	"Database Systems & Data Management",
	"Network & Communication Systems",
	"Healthcare & Medical Informatics",
	"E-Commerce & Digital Business",
	"Smart Cities & Sustainability",
	"Education Technology & E-Learning",
	"Supply Chain & Logistics Management",
	"Financial Technology & FinTech",
	"Ethical, Legal & Social Aspects of IS",
	"Enterprise Systems & Business Intelligence",
	"Research Methods & Bibliometrics",
	]

	# ── Prompt Templates (C9 — stored and exported to prompts.txt) ────────────

	# Template used by label_topics_batch(). The single placeholder
	# {topics_block} is filled with one numbered "Keywords: ..." line per topic
	# in the batch; the reply is expected as numbered lines ("1. Label").
	PROMPT_TOPIC_LABELING = """You are a research librarian specializing in academic literature classification.
	For each topic below (defined by keywords extracted from academic papers), provide a concise
	3-6 word human-readable label that captures the topic's essence.

	Topics:
	{topics_block}

	Respond with ONLY numbered labels matching the topic numbers, one per line:
	1. [Label]
	2. [Label]
	...

	No explanations, no quotes, no additional text."""

	# Template for LLM-based taxonomy mapping. Placeholders: {taxonomy_categories}
	# and {topics_list}. Within this file it is only exported by save_prompts();
	# map_to_taxonomy() scores keyword overlap locally instead of calling an LLM.
	# The return-format line uses plain "|" separators (the previous "\|" put a
	# literal backslash into the prompt and is an invalid escape sequence).
	PROMPT_TAXONOMY_MAPPING = """You are a taxonomy specialist mapping research themes to the PAJAIS
	(Pacific Asia Journal of the Association for Information Systems) taxonomy.

	PAJAIS Categories:
	{taxonomy_categories}

	Research Topics to classify:
	{topics_list}

	For each topic, determine the closest PAJAIS category.
	If no category matches well (overlap score < 2 shared terms), classify as NOVEL.

	Return format — one per line:
	topic_id | pajais_category | MAPPED or NOVEL"""

	# Template used by generate_narrative(). Placeholders: {n_docs},
	# {themes_summary} and {taxonomy_gaps}.
	PROMPT_NARRATIVE = """You are an academic researcher writing the Results and Discussion section
	of a systematic literature review for an Information Systems journal.

	Write approximately 500 words in academic style (third person, present tense) covering:

	1. METHODOLOGY: Topic modelling using Non-negative Matrix Factorization (NMF) applied
	separately to paper titles and abstracts from a corpus of {n_docs} academic papers.
	TF-IDF vectorization was used for feature extraction.

	2. KEY THEMES: Summary of the major research themes identified:
	{themes_summary}

	3. TAXONOMY ALIGNMENT: How the identified themes map to the PAJAIS 25-category taxonomy,
	noting both well-mapped and novel themes that fall outside existing categories.

	4. RESEARCH GAPS: PAJAIS categories with limited or no coverage in the corpus:
	{taxonomy_gaps}

	5. IMPLICATIONS: Concluding observations on what these findings mean for future
	information systems research.

	Write ONLY the narrative text. No headings, no bullet points, no markdown formatting."""

	# Template used by generate_reflection(). Placeholders: {themes_data} and
	# {comparison_summary}.
	PROMPT_REFLECTION = """You are a research methodologist reflecting on the results of a
	computational topic modelling analysis of academic journal papers.

	Write exactly 250 words addressing these three specific areas:

	1. UNEXPECTED DISCOVERIES: What surprising or counter-intuitive themes emerged from
	the analysis? What patterns were not anticipated?

	2. PUBLISHABLE THEMES: Which of the identified themes present the strongest
	opportunities for publication? Why are they significant?

	3. TITLE vs ABSTRACT DIFFERENCES: How do the themes derived from paper titles differ
	from those extracted from abstracts? What does this divergence reveal about
	academic writing conventions?

	Analysis Context:
	{themes_data}

	Comparison Summary:
	{comparison_summary}

	Write in academic register, third person, present tense.
	No headings, no bullets, no markdown."""


	# ════════════════════════════════════════════════════════════════════════════
	# 1. Text Preprocessing
	# ════════════════════════════════════════════════════════════════════════════

	# Small built-in stopword list used only when the NLTK corpus cannot be loaded.
	_FALLBACK_STOPWORDS = frozenset({
	    "the", "a", "an", "is", "are", "was", "were", "in", "on", "at",
	    "to", "for", "of", "with", "by", "from", "this", "that", "it",
	    "its", "and", "or", "but", "not", "no", "as", "be", "has",
	    "have", "had", "do", "does", "did", "will", "would", "could",
	    "should", "may", "might", "can", "shall",
	})

	# Additional academic stopwords that add noise to topic models.
	_ACADEMIC_STOPWORDS = frozenset({
	    "using", "based", "study", "paper", "research", "approach",
	    "proposed", "results", "analysis", "method", "model", "new",
	    "also", "use", "used", "may", "one", "two", "three", "however",
	    "therefore", "presents", "present", "investigate", "investigated",
	    "examine", "examined", "show", "shown", "suggest", "suggests",
	})

	# Filled on the first successful NLTK load so the corpus is read only once.
	_STOP_WORDS_CACHE = None


	def _get_stop_words():
	    """Return the combined (base + academic) stopword set."""
	    global _STOP_WORDS_CACHE
	    if _STOP_WORDS_CACHE is not None:
	        return _STOP_WORDS_CACHE
	    try:
	        base = set(stopwords.words("english"))
	    except LookupError:
	        # Corpus not available: use the built-in list, but do not cache it,
	        # so a later call can still pick up the NLTK corpus.
	        return _FALLBACK_STOPWORDS | _ACADEMIC_STOPWORDS
	    _STOP_WORDS_CACHE = frozenset(base) | _ACADEMIC_STOPWORDS
	    return _STOP_WORDS_CACHE


	def clean_text(text: str) -> str:
	    """Clean and preprocess a single text string.

	    Steps: lowercase → replace every character that is not a-z or
	    whitespace with a space → drop stopwords → drop words of length <= 2.

	    Args:
	        text: Raw text. Non-string values and blank strings are accepted.

	    Returns:
	        The surviving words joined by single spaces, or ``""`` when *text*
	        is not a string or contains only whitespace.
	    """
	    if not isinstance(text, str) or not text.strip():
	        return ""

	    letters_only = re.sub(r"[^a-z\s]", " ", text.lower())
	    stop_words = _get_stop_words()

	    # str.split() with no argument already collapses whitespace runs, so no
	    # separate normalisation pass is needed before tokenising.
	    kept = [w for w in letters_only.split() if len(w) > 2 and w not in stop_words]
	    return " ".join(kept)


	def preprocess_dataframe(df: pd.DataFrame) -> pd.DataFrame:
	    """Return a copy of *df* with ``clean_title`` and ``clean_abstract`` added.

	    Missing values in the ``title`` and ``abstract`` columns are treated as
	    empty strings before each value is passed through ``clean_text``. The
	    input frame itself is not modified.
	    """
	    cleaned = df.copy()
	    for column in ("title", "abstract"):
	        cleaned[f"clean_{column}"] = cleaned[column].fillna("").apply(clean_text)
	    return cleaned


	# ════════════════════════════════════════════════════════════════════════════
	# 2. Vectorization & Topic Modelling
	# ════════════════════════════════════════════════════════════════════════════

	def vectorize_texts(
	    texts: list[str],
	    max_features: int = 5000,
	    min_df: int | None = None,
	    max_df: float = 0.95,
	) -> tuple:
	    """Vectorize cleaned texts using TF-IDF with adaptive parameters.

	    Args:
	        texts: Cleaned documents, one string per document.
	        max_features: Upper bound on vocabulary size passed to the vectorizer.
	        min_df: Minimum document frequency. When ``None`` it is chosen from
	            the corpus size: 1 for fewer than 80 documents, otherwise 2.
	        max_df: Maximum document frequency passed to the vectorizer.

	    Returns:
	        ``(matrix, vectorizer)`` — the fitted TF-IDF matrix and the fitted
	        ``TfidfVectorizer`` (unigrams and bigrams, sublinear TF scaling).
	    """
	    if min_df is None:
	        # Small corpora keep terms that occur in a single document.
	        min_df = 1 if len(texts) < 80 else 2

	    vectorizer = TfidfVectorizer(
	        max_features=max_features,
	        min_df=min_df,
	        max_df=max_df,
	        ngram_range=(1, 2),
	        sublinear_tf=True,
	    )
	    matrix = vectorizer.fit_transform(texts)
	    return matrix, vectorizer


	def run_topic_model(matrix, n_topics: int = 50, method: str = "nmf"):
	    """Fit an NMF or LDA topic model on the TF-IDF matrix.

	    Args:
	        matrix: 2-D document-term matrix; ``shape[0]`` is taken as the number
	            of samples and ``shape[1]`` as the number of features.
	        n_topics: Requested number of topics.
	        method: ``"nmf"`` (case-insensitive) selects NMF; any other value
	            selects Latent Dirichlet Allocation.

	    Returns:
	        ``(fitted_model, actual_n_topics)``. ``actual_n_topics`` is
	        *n_topics* limited to one less than the smaller matrix dimension,
	        raised to at least 5 where the matrix allows it, and never larger
	        than ``min(n_samples, n_features)``.
	    """
	    n_features = matrix.shape[1]
	    n_samples = matrix.shape[0]

	    # Guard: n_topics must not exceed matrix dimensions.
	    actual = min(n_topics, n_features - 1, n_samples - 1)
	    actual = max(actual, 5)  # prefer at least 5 topics

	    # The floor above must not undo the guard: NMF's "nndsvda" init rejects
	    # n_components > min(n_samples, n_features). Cap again, keeping >= 1.
	    actual = max(1, min(actual, n_samples, n_features))

	    use_nmf = isinstance(method, str) and method.strip().lower() == "nmf"
	    if use_nmf:
	        model = NMF(
	            n_components=actual,
	            random_state=42,
	            max_iter=1000,
	            init="nndsvda",
	            solver="mu",
	            beta_loss="frobenius",
	        )
	    else:
	        model = LatentDirichletAllocation(
	            n_components=actual,
	            random_state=42,
	            max_iter=50,
	            learning_method="online",
	            n_jobs=-1,
	        )

	    model.fit(matrix)
	    return model, actual


	def extract_keywords(model, vectorizer, n_words: int = 10) -> list[dict]:
	    """Collect the highest-weighted terms of every topic in a fitted model.

	    For each row of ``model.components_`` the *n_words* largest weights are
	    located and translated to terms through
	    ``vectorizer.get_feature_names_out()``.

	    Returns:
	        One dict per topic with ``topic_id`` (row index), ``keywords``
	        (terms, strongest first) and ``keyword_str`` (the terms joined
	        with ``", "``).
	    """
	    vocabulary = vectorizer.get_feature_names_out()

	    def describe(topic_id, weights):
	        # argsort is ascending: keep the tail, then flip to strongest-first.
	        ranked = weights.argsort()[-n_words:][::-1]
	        terms = [vocabulary[position] for position in ranked]
	        return {
	            "topic_id": topic_id,
	            "keywords": terms,
	            "keyword_str": ", ".join(terms),
	        }

	    return [
	        describe(topic_id, weights)
	        for topic_id, weights in enumerate(model.components_)
	    ]


	# ════════════════════════════════════════════════════════════════════════════
	# 3. Topic Labeling
	# ════════════════════════════════════════════════════════════════════════════

	def generate_label_from_keywords(keywords: list[str]) -> str:
	    """Build a heuristic, title-cased label from a topic's leading keywords.

	    Only the first four keywords are considered. Each is split into words
	    (underscores count as spaces), duplicates are dropped case-insensitively
	    and at most four distinct words are kept. One or two words are joined
	    with " & "; with more, the first two are joined with " & " and the rest
	    follow after " — ". An empty keyword list yields "General Topic".
	    """
	    if not keywords:
	        return "General Topic"

	    words: list[str] = []
	    used: set[str] = set()
	    fragments = (
	        fragment
	        for keyword in keywords[:4]
	        for fragment in keyword.replace("_", " ").split()
	    )
	    for fragment in fragments:
	        key = fragment.lower()
	        if key in used:
	            continue
	        used.add(key)
	        words.append(fragment.title())
	        if len(words) == 4:
	            break

	    head = " & ".join(words[:2])
	    tail = " ".join(words[2:4])
	    if not tail:
	        return head
	    return head + " — " + tail


	def call_llm(prompt: str, api_key: str | None = None, provider: str | None = None) -> str | None:
	    """Call an OpenAI-compatible chat-completions API with multi-provider support.

	    Candidate configurations are tried in this order until one succeeds:

	    1. The explicit *api_key*, against the endpoint named by *provider*
	       ("groq", "mistral" or "openai", case-insensitive). If *provider* is
	       missing or not recognised, the key is tried against every endpoint.
	    2. Keys found in the environment: ``GROQ_API_KEY``, ``MISTRAL_API_KEY``,
	       ``OPENAI_API_KEY`` (in that order).

	    Args:
	        prompt: User message sent as the only chat message.
	        api_key: Optional explicit API key; blank values are ignored.
	        provider: Optional provider name that the explicit key belongs to.

	    Returns:
	        The stripped response text of the first HTTP 200 reply, or ``None``
	        when no configuration is available or every attempt fails.
	    """
	    providers_info = [
	        ("groq", "GROQ_API_KEY",
	         "https://api.groq.com/openai/v1/chat/completions",
	         "llama-3.3-70b-versatile"),
	        ("mistral", "MISTRAL_API_KEY",
	         "https://api.mistral.ai/v1/chat/completions",
	         "mistral-large-latest"),
	        ("openai", "OPENAI_API_KEY",
	         "https://api.openai.com/v1/chat/completions",
	         "gpt-4o-mini"),
	    ]

	    configs: list[tuple[str, str, str, str]] = []

	    # 1. Explicit key: use the named provider's endpoint when it is known.
	    explicit_key = (api_key or "").strip()
	    if explicit_key:
	        wanted = (provider or "").strip().lower()
	        matched = [
	            (name, explicit_key, url, model)
	            for name, _env, url, model in providers_info
	            if name == wanted
	        ]
	        if matched:
	            configs.extend(matched)
	        else:
	            # NOTE(review): with no recognised provider the same key is sent
	            # to every provider's endpoint. Kept for compatibility, but it
	            # exposes the credential to services it was not issued by.
	            configs.extend(
	                (name, explicit_key, url, model)
	                for name, _env, url, model in providers_info
	            )

	    # 2. Keys from environment variables.
	    for name, env_var, url, model in providers_info:
	        env_key = os.getenv(env_var, "")
	        if env_key:
	            configs.append((name, env_key, url, model))

	    # 3. Try each distinct (endpoint, key) pair once, in priority order.
	    attempted: set[tuple[str, str]] = set()
	    for _name, key, url, model in configs:
	        if (url, key) in attempted:
	            continue
	        attempted.add((url, key))

	        try:
	            resp = requests.post(
	                url,
	                headers={
	                    "Authorization": f"Bearer {key}",
	                    "Content-Type": "application/json",
	                },
	                json={
	                    "model": model,
	                    "messages": [{"role": "user", "content": prompt}],
	                    "temperature": 0.3,
	                    "max_tokens": 2500,
	                },
	                timeout=90,
	            )
	            if resp.status_code != 200:
	                continue
	            content = resp.json()["choices"][0]["message"]["content"]
	        except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
	            # Network failure, invalid JSON or unexpected payload shape:
	            # best effort, move on to the next configuration.
	            continue

	        if isinstance(content, str):
	            return content.strip()

	    return None  # No LLM available


	# One numbered label line of an LLM reply, e.g. "1. Label" or "Topic 2: Label".
	_LABEL_LINE_RE = re.compile(r"(?:Topic\s+)?(\d+)[.:\-)\s]+(.+)")


	def _parse_numbered_labels(response: str) -> dict[int, str]:
	    """Map 0-based positions to the labels found in a numbered LLM reply."""
	    parsed: dict[int, str] = {}
	    for raw_line in response.strip().split("\n"):
	        line = raw_line.strip()
	        if not line:
	            continue
	        match = _LABEL_LINE_RE.match(line)
	        if match:
	            # Drop surrounding quotes / markdown emphasis in any nesting order.
	            label = match.group(2).strip().strip("\"'*").strip()
	            parsed[int(match.group(1)) - 1] = label
	    return parsed


	def label_topics_batch(
	    topics: list[dict],
	    batch_size: int = 10,
	    api_key: str | None = None,
	    provider: str | None = None,
	) -> list[dict]:
	    """Label topics in batches using an LLM, with heuristic fallback.

	    Each batch is sent to the LLM in a single call to reduce API calls
	    (100 topics with the default batch size → 10 calls instead of 100).
	    Topics the LLM did not label (no reply, unparsable line, empty label)
	    get a label from ``generate_label_from_keywords``.

	    Args:
	        topics: Topic dicts; each must have a ``"keywords"`` list.
	        batch_size: Topics per LLM call; values below 1 are treated as 1.
	        api_key: Optional API key forwarded to ``call_llm``.
	        provider: Optional provider name forwarded to ``call_llm``.

	    Returns:
	        A new list holding the same dict objects as *topics*. The
	        ``"label"`` key is written into those dicts in place, so the
	        caller's topic dicts are updated as well.
	    """
	    labelled = list(topics)  # new list, same dict objects
	    step = max(1, batch_size)  # range() rejects a zero step

	    for start in range(0, len(labelled), step):
	        batch = labelled[start : start + step]

	        topics_block = "\n".join(
	            f"{j + 1}. Keywords: {', '.join(t['keywords'][:6])}"
	            for j, t in enumerate(batch)
	        )
	        prompt = PROMPT_TOPIC_LABELING.format(topics_block=topics_block)

	        result = call_llm(prompt, api_key, provider)
	        parsed = _parse_numbered_labels(result) if result else {}

	        for j, t in enumerate(batch):
	            t["label"] = parsed.get(j) or generate_label_from_keywords(t["keywords"])

	        # Rate-limit protection between batches (only when the LLM answered).
	        if result and start + step < len(labelled):
	            time.sleep(2)

	    return labelled


	# ════════════════════════════════════════════════════════════════════════════
	# 4. PAJAIS Taxonomy Mapping
	# ════════════════════════════════════════════════════════════════════════════

	def _tokenize_for_matching(text: str) -> set[str]:
	    """Return the distinct significant tokens of *text* for overlap scoring.

	    A token is a run of three or more ASCII letters taken from the
	    lower-cased text. Common function words, plus "systems" and
	    "management", are discarded.
	    """
	    ignored = {
	        "and", "the", "for", "with", "from", "that", "this", "are", "was",
	        "has", "have", "been", "not", "but", "all", "can", "will", "may",
	        "systems", "management",  # too generic in IS context
	    }
	    candidates = re.findall(r"[a-z]{3,}", text.lower())
	    return {token for token in candidates if token not in ignored}


	def map_to_taxonomy(topics: list[dict], taxonomy: list[str] | None = None) -> list[dict]:
	    """Map topics to PAJAIS taxonomy using keyword-overlap scoring.

	    Each topic's keywords and label are tokenized and compared with the
	    tokens of every category name; the category sharing the most tokens
	    wins (the first one listed wins ties).

	    Scoring rules:
	    • overlap ≥ 2 significant tokens → MAPPED ("high" confidence at ≥ 3,
	      otherwise "medium")
	    • overlap < 2 → NOVEL (category and confidence are "—")

	    Args:
	        topics: Topic dicts with ``"topic_id"`` and ``"keywords"``; the
	            ``"label"``, ``"source"`` and ``"keyword_str"`` keys are optional.
	        taxonomy: Category names to match against; defaults to
	            ``PAJAIS_TAXONOMY``.

	    Returns:
	        One mapping dict per topic with the keys ``topic_id``, ``source``,
	        ``label``, ``keywords``, ``pajais_category``, ``status`` and
	        ``confidence``.
	    """
	    if taxonomy is None:
	        taxonomy = PAJAIS_TAXONOMY

	    # Pre-tokenize taxonomy categories once.
	    tax_tokens = {cat: _tokenize_for_matching(cat) for cat in taxonomy}

	    mappings: list[dict] = []
	    for t in topics:
	        # Combine keywords + label for matching; a label stored as None is
	        # treated like a missing label instead of breaking the concatenation.
	        label_text = t.get("label") or ""
	        topic_tokens = _tokenize_for_matching(" ".join(t["keywords"]) + " " + label_text)

	        best_cat = None
	        best_score = 0
	        for cat, cat_tokens in tax_tokens.items():
	            score = len(topic_tokens & cat_tokens)
	            if score > best_score:
	                best_score = score
	                best_cat = cat

	        if best_score >= 2:
	            status = "MAPPED"
	            confidence = "high" if best_score >= 3 else "medium"
	            category = best_cat
	        else:
	            status = "NOVEL"
	            confidence = "—"
	            category = "—"

	        mappings.append({
	            "topic_id": t["topic_id"],
	            "source": t.get("source", ""),
	            "label": t.get("label", ""),
	            "keywords": t.get("keyword_str", ""),
	            "pajais_category": category,
	            "status": status,
	            "confidence": confidence,
	        })

	    return mappings


	# ════════════════════════════════════════════════════════════════════════════
	# 5. Theme Comparison
	# ════════════════════════════════════════════════════════════════════════════

	def compare_title_abstract_themes(
	    title_topics: list[dict],
	    abstract_topics: list[dict],
	) -> pd.DataFrame:
	    """Build the side-by-side title vs abstract theme table (C6).

	    Row *i* pairs the i-th title topic with the i-th abstract topic. The
	    table has as many rows as the longer list; cells for the shorter list
	    (and for topics lacking ``label`` / ``keyword_str``) are empty strings.
	    ``topic_id`` is the 1-based row number.
	    """

	    def theme_and_keywords(topics: list[dict], position: int) -> tuple:
	        # Past the end of the list → blank cells.
	        if position >= len(topics):
	            return "", ""
	        entry = topics[position]
	        return entry.get("label", ""), entry.get("keyword_str", "")

	    row_count = max(len(title_topics), len(abstract_topics))
	    records: list[dict] = []
	    for position in range(row_count):
	        title_theme, title_keywords = theme_and_keywords(title_topics, position)
	        abstract_theme, abstract_keywords = theme_and_keywords(abstract_topics, position)
	        records.append({
	            "topic_id": position + 1,
	            "title_theme": title_theme,
	            "title_keywords": title_keywords,
	            "abstract_theme": abstract_theme,
	            "abstract_keywords": abstract_keywords,
	        })

	    return pd.DataFrame(records)


	# ════════════════════════════════════════════════════════════════════════════
	# 6. Narrative & Reflection Generation
	# ════════════════════════════════════════════════════════════════════════════

	def generate_narrative(
	    themes_summary: str,
	    taxonomy_gaps: str,
	    n_docs: int,
	    api_key: str | None = None,
	    provider: str | None = None,
	) -> str:
	    """Generate ~500-word academic narrative (C8). Uses LLM or template.

	    Args:
	        themes_summary: Text summarising the identified themes.
	        taxonomy_gaps: Text listing taxonomy categories with little or no
	            coverage.
	        n_docs: Number of documents in the corpus.
	        api_key: Optional API key forwarded to ``call_llm``.
	        provider: Optional provider name forwarded to ``call_llm``.

	    Returns:
	        The LLM response when it contains more than 200 whitespace-separated
	        words; otherwise the text produced by ``_narrative_fallback``.
	    """
	    prompt = PROMPT_NARRATIVE.format(
	        n_docs=n_docs,
	        themes_summary=themes_summary,
	        taxonomy_gaps=taxonomy_gaps,
	    )

	    result = call_llm(prompt, api_key, provider)

	    # Very short replies are treated as failed generations.
	    if result and len(result.split()) > 200:
	        return result

	    return _narrative_fallback(themes_summary, taxonomy_gaps, n_docs)


	def _narrative_fallback(themes_summary: str, taxonomy_gaps: str, n_docs: int) -> str:
	    """Fill the fixed narrative template used when no LLM text is available.

	    The template has eight paragraphs separated by blank lines;
	    *n_docs*, *themes_summary* and *taxonomy_gaps* are inserted into the
	    first, fourth and seventh paragraph respectively.
	    """
	    methodology = (
	        "This systematic literature review employs Non-negative Matrix Factorization "
	        f"(NMF) topic modelling to analyze a corpus of {n_docs} academic journal papers. "
	        "The analysis was conducted separately on both paper titles and abstracts to "
	        "capture different levels of thematic granularity, generating over 100 distinct "
	        "topics across both text sources. TF-IDF (Term Frequency–Inverse Document "
	        "Frequency) vectorization was employed as the feature extraction method, with "
	        "adaptive parameters calibrated to handle the varying lengths of titles and "
	        "abstracts effectively."
	    )
	    title_view = (
	        "The title-based analysis reveals high-level research themes that authors "
	        "consider most prominent when framing their contributions. These themes "
	        "represent the broad strokes of the academic discourse, capturing keywords and "
	        "phrases that researchers deliberately chose to highlight in their paper titles. "
	        "Title-derived topics tend to be more focused and concise, reflecting the "
	        "marketing function that titles serve in academic publishing — drawing readers' "
	        "attention to the most impactful aspects of the work."
	    )
	    abstract_view = (
	        "In contrast, the abstract-based analysis uncovers more nuanced and detailed "
	        "themes embedded within the research descriptions. Abstracts contain "
	        "methodological details, theoretical frameworks, and specific findings that do "
	        "not appear in titles, resulting in a richer and more diverse set of topics. "
	        "The abstract-derived themes capture the actual substance of the research "
	        "rather than its positioning, offering a deeper view into the intellectual "
	        "landscape of the field."
	    )
	    themes = (
	        "The identified themes include the following representative topics: "
	        f"{themes_summary}"
	    )
	    alignment = (
	        "The mapping of these themes to the PAJAIS (Pacific Asia Journal of the "
	        "Association for Information Systems) 25-category taxonomy reveals both strong "
	        "alignment in established research areas and notable divergences suggesting "
	        "emerging research directions. Themes related to core information systems "
	        "topics — artificial intelligence, machine learning, data analytics, and "
	        "cybersecurity — demonstrate strong mapping to existing taxonomy categories, "
	        "confirming these as well-established areas of scholarly inquiry within the "
	        "Pacific Asia region."
	    )
	    novelty = (
	        "However, several topics were classified as NOVEL, indicating themes that do "
	        "not map neatly to the predefined taxonomy categories. These novel themes "
	        "often represent interdisciplinary intersections or emerging research areas "
	        "that have yet to be formally recognized within traditional IS taxonomy "
	        "frameworks. The presence of novel themes underscores the dynamic and rapidly "
	        "evolving nature of information systems research."
	    )
	    gaps = (
	        "Research gaps identified through the taxonomy mapping include the following "
	        f"underrepresented or absent PAJAIS categories: {taxonomy_gaps}. These gaps "
	        "represent potential avenues for future investigation and may indicate either "
	        "genuinely emerging fields that have not yet gained critical mass in the "
	        "literature or established areas that are underrepresented in the analyzed "
	        "corpus."
	    )
	    implications = (
	        "The findings carry several implications for the research community. First, "
	        "the identified novel themes suggest opportunities for pioneering work at the "
	        "intersection of traditional IS categories. Second, the taxonomy gaps highlight "
	        "areas where increased scholarly attention may yield significant contributions. "
	        "Third, the systematic divergence between title-derived and abstract-derived "
	        "themes confirms that comprehensive literature reviews must analyze multiple "
	        "textual elements to capture the full spectrum of research activity. This "
	        "multi-source approach provides a more nuanced understanding of the current "
	        "landscape of information systems research and offers clear direction for "
	        "future scholarly inquiry."
	    )

	    paragraphs = (
	        methodology,
	        title_view,
	        abstract_view,
	        themes,
	        alignment,
	        novelty,
	        gaps,
	        implications,
	    )
	    return "\n\n".join(paragraphs)


	def generate_reflection(
	    themes_data: str,
	    comparison_summary: str,
	    api_key: str | None = None,
	    provider: str | None = None,
	) -> str:
	    """Generate ~250-word reflection (C10). Uses LLM or template fallback.

	    Args:
	        themes_data: Analysis context inserted into the LLM prompt.
	        comparison_summary: Title-vs-abstract comparison text; inserted into
	            the prompt and also into the fallback template.
	        api_key: Optional API key forwarded to ``call_llm``.
	        provider: Optional provider name forwarded to ``call_llm``.

	    Returns:
	        The LLM response when it contains more than 100 whitespace-separated
	        words; otherwise the text produced by ``_reflection_fallback``
	        (which does not use *themes_data*).
	    """
	    prompt = PROMPT_REFLECTION.format(
	        themes_data=themes_data,
	        comparison_summary=comparison_summary,
	    )

	    result = call_llm(prompt, api_key, provider)

	    # Very short replies are treated as failed generations.
	    if result and len(result.split()) > 100:
	        return result

	    return _reflection_fallback(comparison_summary)


	def _reflection_fallback(comparison_summary: str) -> str:
	    """Fill the fixed reflection template used when no LLM text is available.

	    The template has four paragraphs separated by blank lines; the third
	    paragraph is *comparison_summary* itself.
	    """
	    discoveries = (
	        "The topic modelling analysis of this academic corpus yields several "
	        "unexpected patterns that merit careful scholarly attention. Perhaps most "
	        "notably, the emergence of interdisciplinary themes that bridge traditional "
	        "information systems boundaries suggests a significant paradigm shift within "
	        "the field. The clustering algorithm identified topic groupings that combine "
	        "technical computing methodologies with domain-specific applications in ways "
	        "that conventional taxonomy frameworks do not anticipate. These hybrid topics "
	        "— merging, for instance, machine learning techniques with healthcare delivery "
	        "or blockchain architectures with supply chain transparency — represent "
	        "genuinely novel research frontiers that challenge existing disciplinary "
	        "categorizations."
	    )
	    publishable = (
	        "Among the identified themes, those situated at the intersection of emerging "
	        "technologies and underexplored application domains present the strongest "
	        "candidates for publication in high-impact venues. Topics demonstrating both "
	        "methodological innovation and clear practical relevance are particularly "
	        "compelling, as they satisfy the dual criteria that journal editors and peer "
	        "reviewers consistently prioritize. The themes combining artificial "
	        "intelligence with sector-specific challenges appear especially promising for "
	        "journals such as PAJAIS, MIS Quarterly, and Information Systems Research."
	    )
	    comparison = f"{comparison_summary}"
	    divergence = (
	        "The divergence between title-based and abstract-based themes reveals an "
	        "important methodological insight. Titles function primarily as signaling "
	        "devices, emphasizing broad and trending research areas to maximize "
	        "discoverability and reader engagement. Abstracts, conversely, provide "
	        "substantive detail about methodologies, datasets, and specific findings. "
	        "Consequently, title-derived topics cluster around popular terminology, while "
	        "abstract-derived topics expose the deeper technical and theoretical "
	        "foundations of the work. This systematic asymmetry confirms that relying on "
	        "a single text source for thematic analysis introduces bias, and multi-source "
	        "analysis produces a more faithful representation of the underlying research "
	        "landscape."
	    )

	    return "\n\n".join((discoveries, publishable, comparison, divergence))


	# ════════════════════════════════════════════════════════════════════════════
	# 7. Prompt Storage (C9)
	# ════════════════════════════════════════════════════════════════════════════

	def save_prompts(output_path: str = "prompts.txt") -> str:
	    """Write every prompt template of this module to a text file (C9).

	    The document consists of a title block, one banner-delimited section
	    per prompt template, the system-design meta-prompt and a closing
	    banner. It is written to *output_path* as UTF-8.

	    Returns:
	        *output_path*, unchanged.
	    """
	    rule = "=" * 70

	    def banner(title: str) -> str:
	        # A heading framed by two rule lines.
	        return f"{rule}\n{title}\n{rule}"

	    def section(title: str, body: str) -> str:
	        # Banner, one blank line, then the section text.
	        return f"{banner(title)}\n\n{body}"

	    intro = "\n".join([
	        "This file documents all prompt templates used by the AI-powered topic",
	        "modelling system for academic journal analysis.",
	    ])
	    design_prompt = "\n".join([
	        "The following meta-prompt was used to design and generate this system:",
	        "",
	        '"Build a complete AI-powered topic modelling web application for academic',
	        "journal analysis. The system must process a CSV dataset of journal papers,",
	        "perform NMF/LDA topic modelling separately on titles and abstracts,",
	        "generate 100+ topics with human-readable labels, map topics to the PAJAIS",
	        "25-category taxonomy (classifying each as MAPPED or NOVEL), compare title",
	        "vs abstract themes, and produce all required output files: comparison.csv,",
	        "taxonomy_map.json, narrative.txt, reflection.txt, and prompts.txt.",
	        "The system uses Gradio for UI, scikit-learn for topic modelling, and",
	        'optional LLM integration (Groq/Mistral/OpenAI) for enhanced labeling."',
	    ])

	    parts = [
	        section("PROMPTS USED IN TOPIC MODELLING SYSTEM (C9)", intro),
	        section("1. TOPIC LABELING PROMPT", f"{PROMPT_TOPIC_LABELING}"),
	        section("2. TAXONOMY MAPPING PROMPT", f"{PROMPT_TAXONOMY_MAPPING}"),
	        section("3. NARRATIVE GENERATION PROMPT (C8)", f"{PROMPT_NARRATIVE}"),
	        section("4. REFLECTION GENERATION PROMPT (C10)", f"{PROMPT_REFLECTION}"),
	        section("5. SYSTEM DESIGN PROMPT", design_prompt),
	        banner("END OF PROMPTS"),
	    ]

	    # Sections are separated by two blank lines.
	    document = "\n\n\n".join(parts)
	    Path(output_path).write_text(document.strip(), encoding="utf-8")
	    return output_path