reyansh2005's picture
nice
62e2807
"""
tools.py — NLP + Topic Modelling Logic
Core functions for:
• Text preprocessing and cleaning
• TF-IDF vectorization
• NMF / LDA topic modelling
• Keyword extraction
• LLM-powered topic labeling (multi-provider: Groq / Mistral / OpenAI)
• PAJAIS taxonomy mapping (keyword-overlap scoring)
• Title vs abstract theme comparison
• Narrative and reflection generation (LLM or template fallback)
• Prompt storage (C9)
"""
from __future__ import annotations
import os
import json
import time
import numpy as np
import pandas as pd
import requests
from pathlib import Path
try:
import regex as re # enhanced regex from requirements.txt
except ImportError:
import re # stdlib fallback
import nltk
from nltk.corpus import stopwords
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
# ── Download NLTK data (silent) ───────────────────────────────────────────
nltk.download("stopwords", quiet=True)
# ════════════════════════════════════════════════════════════════════════════
# Constants
# ════════════════════════════════════════════════════════════════════════════
# The 25 PAJAIS category names. map_to_taxonomy() falls back to this list when
# no explicit taxonomy is passed; matching is done on the words of each name,
# so the exact wording of an entry affects which topics map to it.
PAJAIS_TAXONOMY: list[str] = [
"Artificial Intelligence & Machine Learning",
"Natural Language Processing & Text Mining",
"Computer Vision & Image Processing",
"Knowledge Representation & Reasoning",
"Expert Systems & Decision Support",
"Robotics & Autonomous Systems",
"Human-Computer Interaction",
"Information Retrieval & Recommendation Systems",
"Data Mining & Big Data Analytics",
"Blockchain & Distributed Ledger Technology",
"Cloud Computing & Edge Computing",
"Internet of Things & Sensor Networks",
"Cybersecurity & Privacy",
"Software Engineering & DevOps",
"Database Systems & Data Management",
"Network & Communication Systems",
"Healthcare & Medical Informatics",
"E-Commerce & Digital Business",
"Smart Cities & Sustainability",
"Education Technology & E-Learning",
"Supply Chain & Logistics Management",
"Financial Technology & FinTech",
"Ethical, Legal & Social Aspects of IS",
"Enterprise Systems & Business Intelligence",
"Research Methods & Bibliometrics",
]
# ── Prompt Templates (C9 — stored and exported to prompts.txt) ────────────
# Template for batch topic labeling. label_topics_batch() fills the
# {topics_block} placeholder with one numbered "Keywords: ..." line per topic
# and parses the numbered labels out of the reply.
PROMPT_TOPIC_LABELING = """You are a research librarian specializing in academic literature classification.
For each topic below (defined by keywords extracted from academic papers), provide a concise
3-6 word human-readable label that captures the topic's essence.
Topics:
{topics_block}
Respond with ONLY numbered labels matching the topic numbers, one per line:
1. [Label]
2. [Label]
...
No explanations, no quotes, no additional text."""
# Template for LLM-based taxonomy mapping. Placeholders: {taxonomy_categories}
# and {topics_list}. Within this file it is only exported by save_prompts();
# map_to_taxonomy() scores keyword overlap locally and does not send it.
PROMPT_TAXONOMY_MAPPING = """You are a taxonomy specialist mapping research themes to the PAJAIS
(Pacific Asia Journal of the Association for Information Systems) taxonomy.
PAJAIS Categories:
{taxonomy_categories}
Research Topics to classify:
{topics_list}
For each topic, determine the closest PAJAIS category.
If no category matches well (overlap score < 2 shared terms), classify as NOVEL.
Return format — one per line:
topic_id | pajais_category | MAPPED or NOVEL"""
# Template for the ~500-word narrative. generate_narrative() fills {n_docs},
# {themes_summary} and {taxonomy_gaps} before sending it to the LLM.
PROMPT_NARRATIVE = """You are an academic researcher writing the Results and Discussion section
of a systematic literature review for an Information Systems journal.
Write approximately 500 words in academic style (third person, present tense) covering:
1. METHODOLOGY: Topic modelling using Non-negative Matrix Factorization (NMF) applied
separately to paper titles and abstracts from a corpus of {n_docs} academic papers.
TF-IDF vectorization was used for feature extraction.
2. KEY THEMES: Summary of the major research themes identified:
{themes_summary}
3. TAXONOMY ALIGNMENT: How the identified themes map to the PAJAIS 25-category taxonomy,
noting both well-mapped and novel themes that fall outside existing categories.
4. RESEARCH GAPS: PAJAIS categories with limited or no coverage in the corpus:
{taxonomy_gaps}
5. IMPLICATIONS: Concluding observations on what these findings mean for future
information systems research.
Write ONLY the narrative text. No headings, no bullet points, no markdown formatting."""
# Template for the ~250-word reflection. generate_reflection() fills
# {themes_data} and {comparison_summary} before sending it to the LLM.
PROMPT_REFLECTION = """You are a research methodologist reflecting on the results of a
computational topic modelling analysis of academic journal papers.
Write exactly 250 words addressing these three specific areas:
1. UNEXPECTED DISCOVERIES: What surprising or counter-intuitive themes emerged from
the analysis? What patterns were not anticipated?
2. PUBLISHABLE THEMES: Which of the identified themes present the strongest
opportunities for publication? Why are they significant?
3. TITLE vs ABSTRACT DIFFERENCES: How do the themes derived from paper titles differ
from those extracted from abstracts? What does this divergence reveal about
academic writing conventions?
Analysis Context:
{themes_data}
Comparison Summary:
{comparison_summary}
Write in academic register, third person, present tense.
No headings, no bullets, no markdown."""
# ════════════════════════════════════════════════════════════════════════════
# 1. Text Preprocessing
# ════════════════════════════════════════════════════════════════════════════
# Minimal English stop-word list used when the NLTK corpus cannot be loaded.
_FALLBACK_STOP_WORDS: frozenset[str] = frozenset({
    "the", "a", "an", "is", "are", "was", "were", "in", "on", "at",
    "to", "for", "of", "with", "by", "from", "this", "that", "it",
    "its", "and", "or", "but", "not", "no", "as", "be", "has",
    "have", "had", "do", "does", "did", "will", "would", "could",
    "should", "may", "might", "can", "shall",
})

# Additional academic stopwords that add noise to topic models.
_EXTRA_STOP_WORDS: frozenset[str] = frozenset({
    "using", "based", "study", "paper", "research", "approach",
    "proposed", "results", "analysis", "method", "model", "new",
    "also", "use", "used", "may", "one", "two", "three", "however",
    "therefore", "presents", "present", "investigate", "investigated",
    "examine", "examined", "show", "shown", "suggest", "suggests",
})

# Combined NLTK + academic stop words; filled on the first successful load.
_STOP_WORDS_CACHE = None


def _get_stop_words() -> frozenset[str]:
    """Return the stop-word set used by clean_text, loading NLTK's list once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is not None:
        return _STOP_WORDS_CACHE
    try:
        base = stopwords.words("english")
    except LookupError:
        # Corpus not installed: use the built-in list but do not cache it, so
        # a later call can still pick up the corpus once it becomes available.
        return _FALLBACK_STOP_WORDS | _EXTRA_STOP_WORDS
    _STOP_WORDS_CACHE = frozenset(base) | _EXTRA_STOP_WORDS
    return _STOP_WORDS_CACHE


def clean_text(text: str) -> str:
    """Clean and preprocess a single text string.

    Steps: lowercase → replace every character other than a-z/whitespace with
    a space → drop stop words → drop words of two characters or fewer.

    Args:
        text: Raw text. Non-string or blank values are accepted.

    Returns:
        The surviving words joined by single spaces, or "" when *text* is not
        a string or contains only whitespace.
    """
    if not isinstance(text, str) or not text.strip():
        return ""
    lowered = re.sub(r"[^a-z\s]", " ", text.lower())
    stop_words = _get_stop_words()
    # str.split() already collapses whitespace runs, so no separate
    # normalisation pass is needed before tokenising.
    return " ".join(
        word for word in lowered.split()
        if len(word) > 2 and word not in stop_words
    )
def preprocess_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with ``clean_title`` and ``clean_abstract`` added.

    Missing values in the ``title`` / ``abstract`` columns are treated as
    empty strings before each value is passed through ``clean_text``. The
    input frame itself is not modified.
    """
    cleaned = df.copy()
    for column in ("title", "abstract"):
        cleaned[f"clean_{column}"] = cleaned[column].fillna("").apply(clean_text)
    return cleaned
# ════════════════════════════════════════════════════════════════════════════
# 2. Vectorization & Topic Modelling
# ════════════════════════════════════════════════════════════════════════════
def vectorize_texts(
    texts: list[str],
    max_features: int = 5000,
    min_df: int | None = None,
    max_df: float = 0.95,
) -> tuple:
    """Build a TF-IDF matrix (unigrams + bigrams) from cleaned texts.

    When *min_df* is not given it is chosen from the corpus size: 1 for fewer
    than 80 texts, otherwise 2.

    Returns:
        ``(matrix, vectorizer)`` — the fitted document-term matrix and the
        ``TfidfVectorizer`` that produced it.
    """
    if min_df is None:
        min_df = 2 if len(texts) >= 80 else 1

    tfidf_options = {
        "max_features": max_features,
        "min_df": min_df,
        "max_df": max_df,
        "ngram_range": (1, 2),
        "sublinear_tf": True,
    }
    vectorizer = TfidfVectorizer(**tfidf_options)
    matrix = vectorizer.fit_transform(texts)
    return matrix, vectorizer
def run_topic_model(matrix, n_topics: int = 50, method: str = "nmf"):
    """Fit an NMF or LDA topic model on the TF-IDF matrix.

    Args:
        matrix: 2-D document-term matrix (rows = documents, columns = terms).
        n_topics: Requested number of topics.
        method: ``"nmf"`` selects NMF; any other value selects LDA.

    Returns:
        ``(fitted_model, actual_n_topics)``. The actual count is the request
        capped at one less than each matrix dimension, raised to a floor of 5,
        and finally limited to ``min(n_samples, n_features)``.

    Raises:
        ValueError: If the matrix has no rows or no columns.
    """
    n_samples = matrix.shape[0]
    n_features = matrix.shape[1]
    if n_samples < 1 or n_features < 1:
        raise ValueError(
            f"Cannot fit a topic model on an empty matrix of shape {matrix.shape}."
        )

    # Guard: n_topics must not exceed matrix dimensions, with a floor of 5.
    actual = max(min(n_topics, n_features - 1, n_samples - 1), 5)
    # The floor must not push the count back above the matrix dimensions:
    # NMF with an NNDSVD-based init rejects
    # n_components > min(n_samples, n_features).
    actual = min(actual, n_samples, n_features)

    if method == "nmf":
        model = NMF(
            n_components=actual,
            random_state=42,
            max_iter=1000,
            init="nndsvda",
            solver="mu",
            beta_loss="frobenius",
        )
    else:
        model = LatentDirichletAllocation(
            n_components=actual,
            random_state=42,
            max_iter=50,
            learning_method="online",
            n_jobs=-1,
        )
    model.fit(matrix)
    return model, actual
def extract_keywords(model, vectorizer, n_words: int = 10) -> list[dict]:
    """List the highest-weighted terms of every topic in a fitted model.

    Each row of ``model.components_`` is one topic; its *n_words* largest
    weights are looked up in the vectorizer's feature names, strongest first.

    Returns:
        One dict per topic with ``topic_id`` (row index), ``keywords`` (list
        of terms) and ``keyword_str`` (the terms joined by ", ").
    """
    vocabulary = vectorizer.get_feature_names_out()

    def describe(topic_id, weights) -> dict:
        ranked = weights.argsort()[-n_words:][::-1]
        terms = [vocabulary[pos] for pos in ranked]
        return {
            "topic_id": topic_id,
            "keywords": terms,
            "keyword_str": ", ".join(terms),
        }

    return [describe(i, row) for i, row in enumerate(model.components_)]
# ════════════════════════════════════════════════════════════════════════════
# 3. Topic Labeling
# ════════════════════════════════════════════════════════════════════════════
def generate_label_from_keywords(keywords: list[str]) -> str:
    """Build a heuristic, title-cased label from a topic's top keywords.

    The first four keywords are split into words (underscores count as
    spaces); up to four distinct words, compared case-insensitively, are kept
    in order of appearance.

    Returns:
        ``"General Topic"`` when no usable word is found; ``"A"`` or
        ``"A & B"`` for one or two words; otherwise ``"A & B — C D"``.
    """
    if not keywords:
        return "General Topic"

    # Flatten bigrams and deduplicate.
    seen: set[str] = set()
    unique: list[str] = []
    for kw in keywords[:4]:
        for part in kw.replace("_", " ").split():
            low = part.lower()
            if low in seen:
                continue
            seen.add(low)
            unique.append(part.title())
            if len(unique) >= 4:
                break
        if len(unique) >= 4:
            break

    if not unique:
        # Keywords were all blank: avoid returning an empty label.
        return "General Topic"
    if len(unique) <= 2:
        return " & ".join(unique)
    return " & ".join(unique[:2]) + " — " + " ".join(unique[2:4])
def call_llm(prompt: str, api_key: str | None = None, provider: str | None = None) -> str | None:
    """Call an LLM chat-completions API with multi-provider support.

    Candidate endpoints are tried in order until one returns usable text:
    first the explicit *api_key* (against the named *provider* only, or
    against every known endpoint when the provider is missing or unknown),
    then keys from the environment (Groq → Mistral → OpenAI).

    Args:
        prompt: User message sent as the only chat message.
        api_key: Optional explicit API key; blank values are ignored.
        provider: Optional provider name ("groq", "mistral", "openai");
            matched case-insensitively, ignoring surrounding whitespace.

    Returns:
        The stripped response text, or ``None`` when no key is configured or
        every attempt fails. Failures are logged, never raised.
    """
    import logging  # local import: only this function logs

    log = logging.getLogger(__name__)

    providers_info = [
        ("groq", "GROQ_API_KEY",
         "https://api.groq.com/openai/v1/chat/completions",
         "llama-3.3-70b-versatile"),
        ("mistral", "MISTRAL_API_KEY",
         "https://api.mistral.ai/v1/chat/completions",
         "mistral-large-latest"),
        ("openai", "OPENAI_API_KEY",
         "https://api.openai.com/v1/chat/completions",
         "gpt-4o-mini"),
    ]

    configs: list[tuple[str, str, str, str]] = []
    attempted: set[tuple[str, str]] = set()

    def add_config(name: str, key: str, url: str, model: str) -> None:
        # The same key may arrive both explicitly and via the environment;
        # retrying an identical (endpoint, key) pair cannot succeed.
        if (url, key) not in attempted:
            attempted.add((url, key))
            configs.append((name, key, url, model))

    # 1. Explicit key: use the named provider, else try the key everywhere.
    explicit_key = api_key.strip() if api_key else ""
    if explicit_key:
        wanted = provider.strip().lower() if provider else ""
        matching = [info for info in providers_info if info[0] == wanted]
        for name, _env, url, model in (matching or providers_info):
            add_config(name, explicit_key, url, model)

    # 2. Environment variables.
    for name, env_var, url, model in providers_info:
        env_key = os.getenv(env_var, "").strip()
        if env_key:
            add_config(name, env_key, url, model)

    # 3. Try each config until one works.
    for name, key, url, model in configs:
        try:
            resp = requests.post(
                url,
                headers={
                    "Authorization": f"Bearer {key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": model,
                    "messages": [{"role": "user", "content": prompt}],
                    "temperature": 0.3,
                    "max_tokens": 2500,
                },
                timeout=90,
            )
        except (requests.RequestException, ValueError) as exc:
            # Log only the exception type: some request errors echo header
            # values, which would leak the API key into the logs.
            log.warning("LLM provider %s: request failed (%s)", name, type(exc).__name__)
            continue

        if resp.status_code != 200:
            log.warning("LLM provider %s: HTTP %s", name, resp.status_code)
            continue

        try:
            content = resp.json()["choices"][0]["message"]["content"]
        except (ValueError, KeyError, IndexError, TypeError) as exc:
            log.warning("LLM provider %s: unexpected response body (%s)", name, type(exc).__name__)
            continue

        if isinstance(content, str) and content.strip():
            return content.strip()
        log.warning("LLM provider %s: empty completion", name)

    return None  # No LLM available
def label_topics_batch(
    topics: list[dict],
    batch_size: int = 10,
    api_key: str | None = None,
    provider: str | None = None,
) -> list[dict]:
    """Attach a ``label`` to every topic, batching the LLM requests.

    Topics are sent to the LLM *batch_size* at a time (first six keywords
    each). Numbered lines in the reply are matched back to the batch by
    position; any topic without a parsed label, and every topic of a batch
    whose call returned nothing, gets a keyword-based heuristic label.

    Note:
        The returned list is a new list, but the topic dicts inside it are
        the caller's objects and are updated in place.
    """
    labelled = list(topics)
    total = len(labelled)
    numbered_line = re.compile(r"(?:Topic\s+)?(\d+)[.:\-)\s]+(.+)")

    for start in range(0, total, batch_size):
        chunk = labelled[start:start + batch_size]
        listing = "\n".join(
            f"{pos}. Keywords: {', '.join(topic['keywords'][:6])}"
            for pos, topic in enumerate(chunk, start=1)
        )
        response = call_llm(
            PROMPT_TOPIC_LABELING.format(topics_block=listing), api_key, provider
        )

        # Position within the batch (0-based) → label parsed from the reply.
        llm_labels: dict[int, str] = {}
        if response:
            for raw in response.strip().split("\n"):
                candidate = raw.strip()
                if not candidate:
                    continue
                found = numbered_line.match(candidate)
                if found is None:
                    continue
                text = found.group(2).strip().strip('"').strip("'").strip("*")
                llm_labels[int(found.group(1)) - 1] = text

        for pos, topic in enumerate(chunk):
            topic["label"] = llm_labels.get(pos) or generate_label_from_keywords(
                topic["keywords"]
            )

        # Pause between batches only while the LLM is actually answering.
        if response and start + batch_size < total:
            time.sleep(2)

    return labelled
# ════════════════════════════════════════════════════════════════════════════
# 4. PAJAIS Taxonomy Mapping
# ════════════════════════════════════════════════════════════════════════════
def _tokenize_for_matching(text: str) -> set[str]:
"""Extract significant tokens (β‰₯3 chars, lowered) for overlap scoring."""
tokens = set(re.findall(r"[a-z]{3,}", text.lower()))
noise = {
"and", "the", "for", "with", "from", "that", "this", "are", "was",
"has", "have", "been", "not", "but", "all", "can", "will", "may",
"systems", "management", # too generic in IS context
}
return tokens - noise
def map_to_taxonomy(topics: list[dict], taxonomy: list[str] | None = None) -> list[dict]:
    """Map topics to taxonomy categories using keyword-overlap scoring.

    A topic's keywords and label are tokenized and compared with the tokens
    of each category name; the category sharing the most tokens wins (the
    earliest category wins ties).

    Scoring rules:
        • overlap ≥ 2 significant tokens → MAPPED ("high" confidence at
          ≥ 3 shared tokens, otherwise "medium")
        • overlap < 2 → NOVEL, with "—" as category and confidence

    Args:
        topics: Topic dicts; ``keywords`` and ``topic_id`` are required,
            ``label``, ``source`` and ``keyword_str`` are optional.
        taxonomy: Category names; defaults to ``PAJAIS_TAXONOMY``.

    Returns:
        One mapping dict per topic, in input order.
    """
    if taxonomy is None:
        taxonomy = PAJAIS_TAXONOMY

    # Pre-tokenize taxonomy categories once rather than per topic.
    tax_tokens = {cat: _tokenize_for_matching(cat) for cat in taxonomy}

    mappings: list[dict] = []
    for t in topics:
        # Combine keywords + label for matching; tolerate a missing/None label.
        topic_text = " ".join(t["keywords"]) + " " + (t.get("label") or "")
        topic_tokens = _tokenize_for_matching(topic_text)

        best_cat = None
        best_score = 0
        for cat, cat_tokens in tax_tokens.items():
            score = len(topic_tokens & cat_tokens)
            if score > best_score:
                best_score = score
                best_cat = cat

        if best_score >= 2:
            status = "MAPPED"
            confidence = "high" if best_score >= 3 else "medium"
            category = best_cat
        else:
            status = "NOVEL"
            confidence = "—"
            category = "—"

        mappings.append({
            "topic_id": t["topic_id"],
            "source": t.get("source", ""),
            "label": t.get("label", ""),
            "keywords": t.get("keyword_str", ""),
            "pajais_category": category,
            "status": status,
            "confidence": confidence,
        })
    return mappings
# ════════════════════════════════════════════════════════════════════════════
# 5. Theme Comparison
# ════════════════════════════════════════════════════════════════════════════
def compare_title_abstract_themes(
    title_topics: list[dict],
    abstract_topics: list[dict],
) -> pd.DataFrame:
    """Pair title and abstract topics by position into one table (C6).

    Row *i* (``topic_id`` = i + 1) holds the label and keyword string of the
    i-th title topic and the i-th abstract topic. Where one list is shorter,
    or a topic lacks a field, the cell is an empty string.
    """
    def theme_at(topics: list[dict], position: int) -> tuple:
        if position >= len(topics):
            return "", ""
        entry = topics[position]
        return entry.get("label", ""), entry.get("keyword_str", "")

    records: list[dict] = []
    for position in range(max(len(title_topics), len(abstract_topics))):
        title_theme, title_keywords = theme_at(title_topics, position)
        abstract_theme, abstract_keywords = theme_at(abstract_topics, position)
        records.append({
            "topic_id": position + 1,
            "title_theme": title_theme,
            "title_keywords": title_keywords,
            "abstract_theme": abstract_theme,
            "abstract_keywords": abstract_keywords,
        })
    return pd.DataFrame(records)
# ════════════════════════════════════════════════════════════════════════════
# 6. Narrative & Reflection Generation
# ════════════════════════════════════════════════════════════════════════════
def generate_narrative(
    themes_summary: str,
    taxonomy_gaps: str,
    n_docs: int,
    api_key: str | None = None,
    provider: str | None = None,
) -> str:
    """Produce the ~500-word academic narrative (C8).

    The LLM answer is used only when it is longer than 200 words; a missing
    or shorter answer is replaced by the built-in template narrative.
    """
    llm_text = call_llm(
        PROMPT_NARRATIVE.format(
            n_docs=n_docs,
            themes_summary=themes_summary,
            taxonomy_gaps=taxonomy_gaps,
        ),
        api_key,
        provider,
    )
    if not llm_text or len(llm_text.split()) <= 200:
        return _narrative_fallback(themes_summary, taxonomy_gaps, n_docs)
    return llm_text
def _narrative_fallback(themes_summary: str, taxonomy_gaps: str, n_docs: int) -> str:
"""Template-based narrative when no LLM is available."""
return (
f"This systematic literature review employs Non-negative Matrix Factorization "
f"(NMF) topic modelling to analyze a corpus of {n_docs} academic journal papers. "
f"The analysis was conducted separately on both paper titles and abstracts to "
f"capture different levels of thematic granularity, generating over 100 distinct "
f"topics across both text sources. TF-IDF (Term Frequency–Inverse Document "
f"Frequency) vectorization was employed as the feature extraction method, with "
f"adaptive parameters calibrated to handle the varying lengths of titles and "
f"abstracts effectively.\n\n"
f"The title-based analysis reveals high-level research themes that authors "
f"consider most prominent when framing their contributions. These themes "
f"represent the broad strokes of the academic discourse, capturing keywords and "
f"phrases that researchers deliberately chose to highlight in their paper titles. "
f"Title-derived topics tend to be more focused and concise, reflecting the "
f"marketing function that titles serve in academic publishing β€” drawing readers' "
f"attention to the most impactful aspects of the work.\n\n"
f"In contrast, the abstract-based analysis uncovers more nuanced and detailed "
f"themes embedded within the research descriptions. Abstracts contain "
f"methodological details, theoretical frameworks, and specific findings that do "
f"not appear in titles, resulting in a richer and more diverse set of topics. "
f"The abstract-derived themes capture the actual substance of the research "
f"rather than its positioning, offering a deeper view into the intellectual "
f"landscape of the field.\n\n"
f"The identified themes include the following representative topics: "
f"{themes_summary}\n\n"
f"The mapping of these themes to the PAJAIS (Pacific Asia Journal of the "
f"Association for Information Systems) 25-category taxonomy reveals both strong "
f"alignment in established research areas and notable divergences suggesting "
f"emerging research directions. Themes related to core information systems "
f"topics β€” artificial intelligence, machine learning, data analytics, and "
f"cybersecurity β€” demonstrate strong mapping to existing taxonomy categories, "
f"confirming these as well-established areas of scholarly inquiry within the "
f"Pacific Asia region.\n\n"
f"However, several topics were classified as NOVEL, indicating themes that do "
f"not map neatly to the predefined taxonomy categories. These novel themes "
f"often represent interdisciplinary intersections or emerging research areas "
f"that have yet to be formally recognized within traditional IS taxonomy "
f"frameworks. The presence of novel themes underscores the dynamic and rapidly "
f"evolving nature of information systems research.\n\n"
f"Research gaps identified through the taxonomy mapping include the following "
f"underrepresented or absent PAJAIS categories: {taxonomy_gaps}. These gaps "
f"represent potential avenues for future investigation and may indicate either "
f"genuinely emerging fields that have not yet gained critical mass in the "
f"literature or established areas that are underrepresented in the analyzed "
f"corpus.\n\n"
f"The findings carry several implications for the research community. First, "
f"the identified novel themes suggest opportunities for pioneering work at the "
f"intersection of traditional IS categories. Second, the taxonomy gaps highlight "
f"areas where increased scholarly attention may yield significant contributions. "
f"Third, the systematic divergence between title-derived and abstract-derived "
f"themes confirms that comprehensive literature reviews must analyze multiple "
f"textual elements to capture the full spectrum of research activity. This "
f"multi-source approach provides a more nuanced understanding of the current "
f"landscape of information systems research and offers clear direction for "
f"future scholarly inquiry."
)
def generate_reflection(
    themes_data: str,
    comparison_summary: str,
    api_key: str | None = None,
    provider: str | None = None,
) -> str:
    """Produce the ~250-word reflection (C10).

    The LLM answer is used only when it is longer than 100 words; a missing
    or shorter answer is replaced by the built-in template reflection.
    """
    llm_text = call_llm(
        PROMPT_REFLECTION.format(
            themes_data=themes_data,
            comparison_summary=comparison_summary,
        ),
        api_key,
        provider,
    )
    if not llm_text or len(llm_text.split()) <= 100:
        return _reflection_fallback(comparison_summary)
    return llm_text
def _reflection_fallback(comparison_summary: str) -> str:
"""Template-based reflection when no LLM is available."""
return (
f"The topic modelling analysis of this academic corpus yields several "
f"unexpected patterns that merit careful scholarly attention. Perhaps most "
f"notably, the emergence of interdisciplinary themes that bridge traditional "
f"information systems boundaries suggests a significant paradigm shift within "
f"the field. The clustering algorithm identified topic groupings that combine "
f"technical computing methodologies with domain-specific applications in ways "
f"that conventional taxonomy frameworks do not anticipate. These hybrid topics "
f"β€” merging, for instance, machine learning techniques with healthcare delivery "
f"or blockchain architectures with supply chain transparency β€” represent "
f"genuinely novel research frontiers that challenge existing disciplinary "
f"categorizations.\n\n"
f"Among the identified themes, those situated at the intersection of emerging "
f"technologies and underexplored application domains present the strongest "
f"candidates for publication in high-impact venues. Topics demonstrating both "
f"methodological innovation and clear practical relevance are particularly "
f"compelling, as they satisfy the dual criteria that journal editors and peer "
f"reviewers consistently prioritize. The themes combining artificial "
f"intelligence with sector-specific challenges appear especially promising for "
f"journals such as PAJAIS, MIS Quarterly, and Information Systems Research.\n\n"
f"{comparison_summary}\n\n"
f"The divergence between title-based and abstract-based themes reveals an "
f"important methodological insight. Titles function primarily as signaling "
f"devices, emphasizing broad and trending research areas to maximize "
f"discoverability and reader engagement. Abstracts, conversely, provide "
f"substantive detail about methodologies, datasets, and specific findings. "
f"Consequently, title-derived topics cluster around popular terminology, while "
f"abstract-derived topics expose the deeper technical and theoretical "
f"foundations of the work. This systematic asymmetry confirms that relying on "
f"a single text source for thematic analysis introduces bias, and multi-source "
f"analysis produces a more faithful representation of the underlying research "
f"landscape."
)
# ════════════════════════════════════════════════════════════════════════════
# 7. Prompt Storage (C9)
# ════════════════════════════════════════════════════════════════════════════
def save_prompts(output_path: str = "prompts.txt") -> str:
    """Write every prompt template used by the system to a text file (C9).

    The file holds a title banner, a short introduction, five numbered
    sections (the four prompt templates plus the system-design meta-prompt),
    each framed by separator rules, and an end marker. It is written as
    UTF-8 and any existing file is replaced.

    Returns:
        *output_path*, unchanged.
    """
    rule = "=" * 70
    design_prompt = (
        "The following meta-prompt was used to design and generate this system:\n"
        '"Build a complete AI-powered topic modelling web application for academic\n'
        "journal analysis. The system must process a CSV dataset of journal papers,\n"
        "perform NMF/LDA topic modelling separately on titles and abstracts,\n"
        "generate 100+ topics with human-readable labels, map topics to the PAJAIS\n"
        "25-category taxonomy (classifying each as MAPPED or NOVEL), compare title\n"
        "vs abstract themes, and produce all required output files: comparison.csv,\n"
        "taxonomy_map.json, narrative.txt, reflection.txt, and prompts.txt.\n"
        "The system uses Gradio for UI, scikit-learn for topic modelling, and\n"
        'optional LLM integration (Groq/Mistral/OpenAI) for enhanced labeling."'
    )
    sections = [
        ("1. TOPIC LABELING PROMPT", PROMPT_TOPIC_LABELING),
        ("2. TAXONOMY MAPPING PROMPT", PROMPT_TAXONOMY_MAPPING),
        ("3. NARRATIVE GENERATION PROMPT (C8)", PROMPT_NARRATIVE),
        ("4. REFLECTION GENERATION PROMPT (C10)", PROMPT_REFLECTION),
        ("5. SYSTEM DESIGN PROMPT", design_prompt),
    ]

    lines = [
        rule,
        "PROMPTS USED IN TOPIC MODELLING SYSTEM (C9)",
        rule,
        "This file documents all prompt templates used by the AI-powered topic",
        "modelling system for academic journal analysis.",
    ]
    for heading, body in sections:
        lines.extend([rule, heading, rule, body])
    lines.extend([rule, "END OF PROMPTS", rule])

    content = "\n".join(lines)
    Path(output_path).write_text(content.strip(), encoding="utf-8")
    return output_path