""" tools.py — NLP + Topic Modelling Logic Core functions for: • Text preprocessing and cleaning • TF-IDF vectorization • NMF / LDA topic modelling • Keyword extraction • LLM-powered topic labeling (multi-provider: Groq / Mistral / OpenAI) • PAJAIS taxonomy mapping (keyword-overlap scoring) • Title vs abstract theme comparison • Narrative and reflection generation (LLM or template fallback) • Prompt storage (C9) """ from __future__ import annotations import os import json import time import numpy as np import pandas as pd import requests from pathlib import Path try: import regex as re # enhanced regex from requirements.txt except ImportError: import re # stdlib fallback import nltk from nltk.corpus import stopwords from sklearn.decomposition import NMF, LatentDirichletAllocation from sklearn.feature_extraction.text import TfidfVectorizer # ── Download NLTK data (silent) ─────────────────────────────────────────── nltk.download("stopwords", quiet=True) # ════════════════════════════════════════════════════════════════════════════ # Constants # ════════════════════════════════════════════════════════════════════════════ PAJAIS_TAXONOMY: list[str] = [ "Artificial Intelligence & Machine Learning", "Natural Language Processing & Text Mining", "Computer Vision & Image Processing", "Knowledge Representation & Reasoning", "Expert Systems & Decision Support", "Robotics & Autonomous Systems", "Human-Computer Interaction", "Information Retrieval & Recommendation Systems", "Data Mining & Big Data Analytics", "Blockchain & Distributed Ledger Technology", "Cloud Computing & Edge Computing", "Internet of Things & Sensor Networks", "Cybersecurity & Privacy", "Software Engineering & DevOps", "Database Systems & Data Management", "Network & Communication Systems", "Healthcare & Medical Informatics", "E-Commerce & Digital Business", "Smart Cities & Sustainability", "Education Technology & E-Learning", "Supply Chain & Logistics Management", "Financial Technology & FinTech", "Ethical, 
Legal & Social Aspects of IS", "Enterprise Systems & Business Intelligence", "Research Methods & Bibliometrics", ] # ── Prompt Templates (C9 — stored and exported to prompts.txt) ──────────── PROMPT_TOPIC_LABELING = """You are a research librarian specializing in academic literature classification. For each topic below (defined by keywords extracted from academic papers), provide a concise 3-6 word human-readable label that captures the topic's essence. Topics: {topics_block} Respond with ONLY numbered labels matching the topic numbers, one per line: 1. [Label] 2. [Label] ... No explanations, no quotes, no additional text.""" PROMPT_TAXONOMY_MAPPING = """You are a taxonomy specialist mapping research themes to the PAJAIS (Pacific Asia Journal of the Association for Information Systems) taxonomy. PAJAIS Categories: {taxonomy_categories} Research Topics to classify: {topics_list} For each topic, determine the closest PAJAIS category. If no category matches well (overlap score < 2 shared terms), classify as NOVEL. Return format — one per line: topic_id | pajais_category | MAPPED or NOVEL""" PROMPT_NARRATIVE = """You are an academic researcher writing the Results and Discussion section of a systematic literature review for an Information Systems journal. Write approximately 500 words in academic style (third person, present tense) covering: 1. METHODOLOGY: Topic modelling using Non-negative Matrix Factorization (NMF) applied separately to paper titles and abstracts from a corpus of {n_docs} academic papers. TF-IDF vectorization was used for feature extraction. 2. KEY THEMES: Summary of the major research themes identified: {themes_summary} 3. TAXONOMY ALIGNMENT: How the identified themes map to the PAJAIS 25-category taxonomy, noting both well-mapped and novel themes that fall outside existing categories. 4. RESEARCH GAPS: PAJAIS categories with limited or no coverage in the corpus: {taxonomy_gaps} 5. 
IMPLICATIONS: Concluding observations on what these findings mean for future information systems research. Write ONLY the narrative text. No headings, no bullet points, no markdown formatting.""" PROMPT_REFLECTION = """You are a research methodologist reflecting on the results of a computational topic modelling analysis of academic journal papers. Write exactly 250 words addressing these three specific areas: 1. UNEXPECTED DISCOVERIES: What surprising or counter-intuitive themes emerged from the analysis? What patterns were not anticipated? 2. PUBLISHABLE THEMES: Which of the identified themes present the strongest opportunities for publication? Why are they significant? 3. TITLE vs ABSTRACT DIFFERENCES: How do the themes derived from paper titles differ from those extracted from abstracts? What does this divergence reveal about academic writing conventions? Analysis Context: {themes_data} Comparison Summary: {comparison_summary} Write in academic register, third person, present tense. No headings, no bullets, no markdown.""" # ════════════════════════════════════════════════════════════════════════════ # 1. Text Preprocessing # ════════════════════════════════════════════════════════════════════════════ def clean_text(text: str) -> str: """Clean and preprocess a single text string. Steps: lowercase → strip non-alpha → remove stopwords → remove short words. 
""" if not isinstance(text, str) or not text.strip(): return "" text = text.lower() text = re.sub(r"[^a-z\s]", " ", text) text = re.sub(r"\s+", " ", text).strip() try: stop_words = set(stopwords.words("english")) except LookupError: stop_words = { "the", "a", "an", "is", "are", "was", "were", "in", "on", "at", "to", "for", "of", "with", "by", "from", "this", "that", "it", "its", "and", "or", "but", "not", "no", "as", "be", "has", "have", "had", "do", "does", "did", "will", "would", "could", "should", "may", "might", "can", "shall", } # Additional academic stopwords that add noise to topic models extra_stops = { "using", "based", "study", "paper", "research", "approach", "proposed", "results", "analysis", "method", "model", "new", "also", "use", "used", "may", "one", "two", "three", "however", "therefore", "presents", "present", "investigate", "investigated", "examine", "examined", "show", "shown", "suggest", "suggests", } stop_words = stop_words | extra_stops words = [w for w in text.split() if w not in stop_words and len(w) > 2] return " ".join(words) def preprocess_dataframe(df: pd.DataFrame) -> pd.DataFrame: """Clean both title and abstract columns, adding clean_* variants.""" df = df.copy() df["clean_title"] = df["title"].fillna("").apply(clean_text) df["clean_abstract"] = df["abstract"].fillna("").apply(clean_text) return df # ════════════════════════════════════════════════════════════════════════════ # 2. 
# Vectorization & Topic Modelling
# ════════════════════════════════════════════════════════════════════════════

def vectorize_texts(
    texts: list[str],
    max_features: int = 5000,
    min_df: int | None = None,
    max_df: float = 0.95,
) -> tuple:
    """Vectorize cleaned texts using TF-IDF with adaptive parameters.

    Args:
        texts: Pre-cleaned documents.
        max_features: Vocabulary size cap passed to ``TfidfVectorizer``.
        min_df: Minimum document frequency. When ``None`` it is chosen from
            the corpus size: 1 for fewer than 80 documents, otherwise 2.
        max_df: Maximum document frequency passed to ``TfidfVectorizer``.

    Returns:
        ``(matrix, vectorizer)`` — the fitted TF-IDF matrix and the
        vectorizer that produced it. Unigrams and bigrams are extracted and
        sublinear TF scaling is enabled.
    """
    # Adaptive min_df based on corpus size
    if min_df is None:
        min_df = 1 if len(texts) < 80 else 2

    vectorizer = TfidfVectorizer(
        max_features=max_features,
        min_df=min_df,
        max_df=max_df,
        ngram_range=(1, 2),
        sublinear_tf=True,
    )
    matrix = vectorizer.fit_transform(texts)
    return matrix, vectorizer


def _resolve_n_topics(n_topics: int, n_samples: int, n_features: int) -> int:
    """Clamp the requested topic count to what the matrix can support.

    The count is capped at one below each matrix dimension and then raised to
    a floor of 5 topics. The floor itself is limited to the smaller matrix
    dimension, so it can never push the result past the matrix size. The
    result is always at least 1.
    """
    capped = min(n_topics, n_features - 1, n_samples - 1)
    floor = min(5, n_samples, n_features)
    return max(capped, floor, 1)


def run_topic_model(matrix, n_topics: int = 50, method: str = "nmf"):
    """Fit NMF or LDA topic model on the TF-IDF matrix.

    Args:
        matrix: Document-term matrix; only ``matrix.shape`` is read here
            before the matrix is handed to the model's ``fit``.
        n_topics: Requested number of topics.
        method: ``"nmf"`` selects NMF; any other value selects LDA.

    Returns:
        ``(fitted_model, actual_n_topics)`` — actual may be reduced if the
        matrix dimensions are smaller than *n_topics*. At least 5 topics are
        used whenever both matrix dimensions allow it.
    """
    n_samples, n_features = matrix.shape[0], matrix.shape[1]

    # Guard: n_topics must not exceed matrix dimensions. The 5-topic floor is
    # bounded by the matrix size too; an unbounded floor asks for more
    # components than a tiny matrix has rows or columns.
    actual = _resolve_n_topics(n_topics, n_samples, n_features)

    if method == "nmf":
        model = NMF(
            n_components=actual,
            random_state=42,
            max_iter=1000,
            init="nndsvda",
            solver="mu",
            beta_loss="frobenius",
        )
    else:
        model = LatentDirichletAllocation(
            n_components=actual,
            random_state=42,
            max_iter=50,
            learning_method="online",
            n_jobs=-1,
        )

    model.fit(matrix)
    return model, actual


def extract_keywords(model, vectorizer, n_words: int = 10) -> list[dict]:
    """Extract top *n_words* keywords for each topic from model components.

    Args:
        model: Fitted model exposing ``components_`` (one weight vector per
            topic).
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        n_words: Number of keywords per topic. Zero or a negative value gives
            empty keyword lists.

    Returns:
        One dict per topic with ``topic_id`` (row index in ``components_``),
        ``keywords`` (highest weight first) and ``keyword_str`` (the keywords
        joined with ``", "``).
    """
    feature_names = vectorizer.get_feature_names_out()
    topics: list[dict] = []

    for idx, topic_vec in enumerate(model.components_):
        if n_words > 0:
            top_indices = topic_vec.argsort()[-n_words:][::-1]
        else:
            # ``[-0:]`` would select the whole vector rather than nothing.
            top_indices = []
        keywords = [feature_names[i] for i in top_indices]
        topics.append({
            "topic_id": idx,
            "keywords": keywords,
            "keyword_str": ", ".join(keywords),
        })

    return topics


# ════════════════════════════════════════════════════════════════════════════
# 3.
# Topic Labeling
# ════════════════════════════════════════════════════════════════════════════

# (provider name, API-key environment variable, chat-completions URL, model)
# in the order they are tried.
_LLM_PROVIDERS: tuple[tuple[str, str, str, str], ...] = (
    ("groq", "GROQ_API_KEY",
     "https://api.groq.com/openai/v1/chat/completions",
     "llama-3.3-70b-versatile"),
    ("mistral", "MISTRAL_API_KEY",
     "https://api.mistral.ai/v1/chat/completions",
     "mistral-large-latest"),
    ("openai", "OPENAI_API_KEY",
     "https://api.openai.com/v1/chat/completions",
     "gpt-4o-mini"),
)

# Matches one numbered label line of an LLM reply, e.g. "3. Label".
_LABEL_LINE_RE = re.compile(r"(?:Topic\s+)?(\d+)[.:\-)\s]+(.+)")


def generate_label_from_keywords(keywords: list[str]) -> str:
    """Heuristic label: title-case the top keywords into a readable phrase.

    Only the first four keywords are considered. Each is split into words
    (underscores count as spaces) and de-duplicated case-insensitively, and
    at most four distinct words are kept.

    Returns:
        ``"General Topic"`` when no usable word exists; otherwise the words
        joined as ``"A & B"`` (one or two words) or ``"A & B — C D"``.
    """
    if not keywords:
        return "General Topic"

    # Flatten bigrams and deduplicate
    seen: set[str] = set()
    unique: list[str] = []
    for kw in keywords[:4]:
        for part in kw.replace("_", " ").split():
            low = part.lower()
            if low in seen:
                continue
            seen.add(low)
            unique.append(part.title())
            if len(unique) >= 4:
                break
        if len(unique) >= 4:
            break

    if not unique:
        # Keywords were present but blank; an empty label is never useful.
        return "General Topic"
    if len(unique) <= 2:
        return " & ".join(unique)
    return " & ".join(unique[:2]) + " — " + " ".join(unique[2:4])


def call_llm(prompt: str, api_key: str | None = None,
             provider: str | None = None) -> str | None:
    """Call an LLM API with multi-provider support.

    Priority: explicit api_key+provider → env vars (Groq → Mistral → OpenAI).
    An explicit key whose *provider* is missing or unrecognised is tried
    against every known endpoint.

    Args:
        prompt: User message sent as the only chat message.
        api_key: Optional explicit API key; blank values are ignored.
        provider: Optional provider name (``groq``, ``mistral``, ``openai``),
            matched case-insensitively.

    Returns:
        The stripped response text of the first endpoint that answers with
        HTTP 200 and non-empty content, or *None* if no LLM is available.
    """
    configs: list[tuple[str, str, str, str]] = []

    # 1. If explicit key + provider given, use that specific endpoint
    if api_key and api_key.strip():
        key = api_key.strip()
        wanted = provider.lower() if provider else None
        matching = [
            (name, key, url, model)
            for name, _env, url, model in _LLM_PROVIDERS
            if name == wanted
        ]
        if matching:
            configs.extend(matching)
        else:
            # No specific provider → try key with all endpoints
            configs.extend(
                (name, key, url, model)
                for name, _env, url, model in _LLM_PROVIDERS
            )

    # 2. Try environment variables
    for name, env_var, url, model in _LLM_PROVIDERS:
        env_key = os.getenv(env_var, "")
        if env_key:
            configs.append((name, env_key, url, model))

    # 3. Try each config until one works
    attempted: set[tuple[str, str]] = set()
    for _name, key, url, model in configs:
        # The same key can arrive both explicitly and via the environment;
        # repeating an identical request would only repeat its outcome.
        if (url, key) in attempted:
            continue
        attempted.add((url, key))

        try:
            resp = requests.post(
                url,
                headers={
                    "Authorization": f"Bearer {key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": model,
                    "messages": [{"role": "user", "content": prompt}],
                    "temperature": 0.3,
                    "max_tokens": 2500,
                },
                timeout=90,
            )
            if resp.status_code != 200:
                continue
            content = resp.json()["choices"][0]["message"]["content"].strip()
        except (OSError, ValueError, LookupError, TypeError, AttributeError):
            # Best-effort: transport failures (requests' exceptions derive
            # from OSError), invalid JSON, or an unexpected payload shape
            # move on to the next provider. Other errors are left to surface.
            continue

        if content:
            return content

    return None  # No LLM available


def _parse_numbered_labels(response: str) -> dict[int, str]:
    """Parse ``"<n>. <label>"`` lines into a ``{0-based index: label}`` map.

    Lines that do not match the numbered pattern are ignored. Surrounding
    quotes and Markdown asterisks are removed from each label, in any order.
    """
    parsed: dict[int, str] = {}
    for line in response.strip().split("\n"):
        line = line.strip()
        if not line:
            continue
        match = _LABEL_LINE_RE.match(line)
        if match:
            idx = int(match.group(1)) - 1  # convert to 0-based
            parsed[idx] = match.group(2).strip().strip("\"'*").strip()
    return parsed


def label_topics_batch(
    topics: list[dict],
    batch_size: int = 10,
    api_key: str | None = None,
    provider: str | None = None,
) -> list[dict]:
    """Label topics in batches using an LLM, with heuristic fallback.

    Each batch sends ~10 topics to the LLM in a single call to reduce API
    calls (100 topics → 10 calls instead of 100). A topic the LLM did not
    label, or every topic when no LLM answers, gets a label from
    ``generate_label_from_keywords``.

    Args:
        topics: Topic dicts, each with a ``keywords`` list.
        batch_size: Topics per LLM call; values below 1 are treated as 1.
        api_key: Forwarded to ``call_llm``.
        provider: Forwarded to ``call_llm``.

    Returns:
        A new list holding the same topic dicts, each with ``label`` set. The
        dicts are shared with *topics*, so labels are visible there as well.
    """
    labelled = list(topics)  # shallow copy
    step = max(1, batch_size)  # a zero step would make range() raise

    for start in range(0, len(labelled), step):
        batch = labelled[start:start + step]

        # Build prompt for this batch
        topics_block = "\n".join(
            f"{j + 1}. Keywords: {', '.join(t['keywords'][:6])}"
            for j, t in enumerate(batch)
        )
        prompt = PROMPT_TOPIC_LABELING.format(topics_block=topics_block)

        result = call_llm(prompt, api_key, provider)
        # No LLM → empty map → heuristic label for the entire batch
        parsed = _parse_numbered_labels(result) if result else {}

        for j, t in enumerate(batch):
            t["label"] = parsed.get(j) or generate_label_from_keywords(t["keywords"])

        # Rate-limit protection between batches
        if result and start + step < len(labelled):
            time.sleep(2)

    return labelled


# ════════════════════════════════════════════════════════════════════════════
# 4.
# PAJAIS Taxonomy Mapping
# ════════════════════════════════════════════════════════════════════════════

# Tokens ignored when scoring overlap: common function words plus two terms
# that are too generic in IS context.
_MATCHING_NOISE: frozenset[str] = frozenset({
    "and", "the", "for", "with", "from", "that", "this", "are", "was",
    "has", "have", "been", "not", "but", "all", "can", "will", "may",
    "systems", "management",  # too generic in IS context
})


def _tokenize_for_matching(text: str) -> set[str]:
    """Extract significant tokens (≥3 chars, lowered) for overlap scoring.

    Tokens are runs of ASCII letters; entries of ``_MATCHING_NOISE`` are
    dropped.
    """
    return set(re.findall(r"[a-z]{3,}", text.lower())) - _MATCHING_NOISE


def map_to_taxonomy(topics: list[dict], taxonomy: list[str] | None = None) -> list[dict]:
    """Map topics to PAJAIS taxonomy using keyword-overlap scoring.

    Scoring rules:
      • overlap ≥ 2 significant tokens → MAPPED
      • overlap < 2 → NOVEL

    A topic's tokens come from its keywords plus its label. The category
    sharing the most tokens wins, and ties go to the category listed first.
    MAPPED topics get confidence ``"high"`` for 3 or more shared tokens and
    ``"medium"`` for exactly 2.

    Args:
        topics: Topic dicts. ``topic_id`` is required; ``keywords``,
            ``label``, ``source`` and ``keyword_str`` are optional.
        taxonomy: Category names; defaults to ``PAJAIS_TAXONOMY``.

    Returns:
        One dict per topic with ``topic_id``, ``source``, ``label``,
        ``keywords``, ``pajais_category``, ``status`` and ``confidence``.
        NOVEL topics carry ``"—"`` as both category and confidence.
    """
    if taxonomy is None:
        taxonomy = PAJAIS_TAXONOMY

    # Pre-tokenize taxonomy categories
    tax_tokens = {cat: _tokenize_for_matching(cat) for cat in taxonomy}

    mappings: list[dict] = []
    for t in topics:
        # Combine keywords + label for matching. Missing or None values are
        # treated as empty so an unlabelled topic is scored rather than
        # raising.
        keywords = t.get("keywords") or []
        label_text = t.get("label") or ""
        topic_tokens = _tokenize_for_matching(" ".join(keywords) + " " + label_text)

        # Score against each taxonomy category
        best_cat = None
        best_score = 0
        for cat, cat_tokens in tax_tokens.items():
            score = len(topic_tokens & cat_tokens)
            if score > best_score:  # strict: the first best category wins
                best_score = score
                best_cat = cat

        if best_score >= 2:
            status = "MAPPED"
            confidence = "high" if best_score >= 3 else "medium"
            category = best_cat
        else:
            status = "NOVEL"
            confidence = "—"
            category = "—"

        mappings.append({
            "topic_id": t["topic_id"],
            "source": t.get("source", ""),
            "label": t.get("label", ""),
            "keywords": t.get("keyword_str", ""),
            "pajais_category": category,
            "status": status,
            "confidence": confidence,
        })

    return mappings


# ════════════════════════════════════════════════════════════════════════════
# 5.
# Theme Comparison
# ════════════════════════════════════════════════════════════════════════════

# Column order of the comparison table; also used when there are no rows.
_COMPARISON_COLUMNS: tuple[str, ...] = (
    "topic_id",
    "title_theme",
    "title_keywords",
    "abstract_theme",
    "abstract_keywords",
)


def _theme_cells(topics: list[dict], index: int) -> tuple[str, str]:
    """Return ``(label, keyword_str)`` of ``topics[index]``, or blanks past the end."""
    if index < len(topics):
        topic = topics[index]
        return topic.get("label", ""), topic.get("keyword_str", "")
    return "", ""


def compare_title_abstract_themes(
    title_topics: list[dict],
    abstract_topics: list[dict],
) -> pd.DataFrame:
    """Create a side-by-side comparison of title vs abstract themes (C6).

    Topics are paired by position. The shorter list is padded with empty
    strings, and ``topic_id`` is the 1-based row number.

    Returns:
        A DataFrame with the columns ``topic_id``, ``title_theme``,
        ``title_keywords``, ``abstract_theme`` and ``abstract_keywords``. The
        columns are present even when both inputs are empty, so an exported
        CSV always has a header.
    """
    rows: list[dict] = []
    for i in range(max(len(title_topics), len(abstract_topics))):
        title_theme, title_keywords = _theme_cells(title_topics, i)
        abstract_theme, abstract_keywords = _theme_cells(abstract_topics, i)
        rows.append({
            "topic_id": i + 1,
            "title_theme": title_theme,
            "title_keywords": title_keywords,
            "abstract_theme": abstract_theme,
            "abstract_keywords": abstract_keywords,
        })

    return pd.DataFrame(rows, columns=list(_COMPARISON_COLUMNS))


# ════════════════════════════════════════════════════════════════════════════
# 6. Narrative & Reflection Generation
# ════════════════════════════════════════════════════════════════════════════

def generate_narrative(
    themes_summary: str,
    taxonomy_gaps: str,
    n_docs: int,
    api_key: str | None = None,
    provider: str | None = None,
) -> str:
    """Generate ~500-word academic narrative (C8). Uses LLM or template.

    The LLM answer is accepted only when it is longer than 200 words;
    otherwise the template from ``_narrative_fallback`` is returned.
    """
    prompt = PROMPT_NARRATIVE.format(
        n_docs=n_docs,
        themes_summary=themes_summary,
        taxonomy_gaps=taxonomy_gaps,
    )
    result = call_llm(prompt, api_key, provider)
    if result and len(result.split()) > 200:
        return result

    return _narrative_fallback(themes_summary, taxonomy_gaps, n_docs)


def _narrative_fallback(themes_summary: str, taxonomy_gaps: str, n_docs: int) -> str:
    """Template-based narrative when no LLM is available.

    Only *n_docs*, *themes_summary* and *taxonomy_gaps* are interpolated.
    """
    # NOTE(review): the wording "over 100 distinct topics" is fixed text and
    # is not derived from the actual topic count — confirm it fits the run.
    return (
        f"This systematic literature review employs Non-negative Matrix Factorization "
        f"(NMF) topic modelling to analyze a corpus of {n_docs} academic journal papers. "
        f"The analysis was conducted separately on both paper titles and abstracts to "
        f"capture different levels of thematic granularity, generating over 100 distinct "
        f"topics across both text sources. TF-IDF (Term Frequency–Inverse Document "
        f"Frequency) vectorization was employed as the feature extraction method, with "
        f"adaptive parameters calibrated to handle the varying lengths of titles and "
        f"abstracts effectively.\n\n"
        f"The title-based analysis reveals high-level research themes that authors "
        f"consider most prominent when framing their contributions. These themes "
        f"represent the broad strokes of the academic discourse, capturing keywords and "
        f"phrases that researchers deliberately chose to highlight in their paper titles. "
        f"Title-derived topics tend to be more focused and concise, reflecting the "
        f"marketing function that titles serve in academic publishing — drawing readers' "
        f"attention to the most impactful aspects of the work.\n\n"
        f"In contrast, the abstract-based analysis uncovers more nuanced and detailed "
        f"themes embedded within the research descriptions. Abstracts contain "
        f"methodological details, theoretical frameworks, and specific findings that do "
        f"not appear in titles, resulting in a richer and more diverse set of topics. "
        f"The abstract-derived themes capture the actual substance of the research "
        f"rather than its positioning, offering a deeper view into the intellectual "
        f"landscape of the field.\n\n"
        f"The identified themes include the following representative topics: "
        f"{themes_summary}\n\n"
        f"The mapping of these themes to the PAJAIS (Pacific Asia Journal of the "
        f"Association for Information Systems) 25-category taxonomy reveals both strong "
        f"alignment in established research areas and notable divergences suggesting "
        f"emerging research directions. Themes related to core information systems "
        f"topics — artificial intelligence, machine learning, data analytics, and "
        f"cybersecurity — demonstrate strong mapping to existing taxonomy categories, "
        f"confirming these as well-established areas of scholarly inquiry within the "
        f"Pacific Asia region.\n\n"
        f"However, several topics were classified as NOVEL, indicating themes that do "
        f"not map neatly to the predefined taxonomy categories. These novel themes "
        f"often represent interdisciplinary intersections or emerging research areas "
        f"that have yet to be formally recognized within traditional IS taxonomy "
        f"frameworks. The presence of novel themes underscores the dynamic and rapidly "
        f"evolving nature of information systems research.\n\n"
        f"Research gaps identified through the taxonomy mapping include the following "
        f"underrepresented or absent PAJAIS categories: {taxonomy_gaps}. These gaps "
        f"represent potential avenues for future investigation and may indicate either "
        f"genuinely emerging fields that have not yet gained critical mass in the "
        f"literature or established areas that are underrepresented in the analyzed "
        f"corpus.\n\n"
        f"The findings carry several implications for the research community. First, "
        f"the identified novel themes suggest opportunities for pioneering work at the "
        f"intersection of traditional IS categories. Second, the taxonomy gaps highlight "
        f"areas where increased scholarly attention may yield significant contributions. "
        f"Third, the systematic divergence between title-derived and abstract-derived "
        f"themes confirms that comprehensive literature reviews must analyze multiple "
        f"textual elements to capture the full spectrum of research activity. This "
        f"multi-source approach provides a more nuanced understanding of the current "
        f"landscape of information systems research and offers clear direction for "
        f"future scholarly inquiry."
    )


def generate_reflection(
    themes_data: str,
    comparison_summary: str,
    api_key: str | None = None,
    provider: str | None = None,
) -> str:
    """Generate ~250-word reflection (C10). Uses LLM or template fallback.

    The LLM answer is accepted only when it is longer than 100 words;
    otherwise the template from ``_reflection_fallback`` is returned. The
    template uses *comparison_summary* only.
    """
    prompt = PROMPT_REFLECTION.format(
        themes_data=themes_data,
        comparison_summary=comparison_summary,
    )
    result = call_llm(prompt, api_key, provider)
    if result and len(result.split()) > 100:
        return result

    return _reflection_fallback(comparison_summary)


def _reflection_fallback(comparison_summary: str) -> str:
    """Template-based reflection when no LLM is available.

    *comparison_summary* is inserted verbatim as its own paragraph.
    """
    return (
        f"The topic modelling analysis of this academic corpus yields several "
        f"unexpected patterns that merit careful scholarly attention. Perhaps most "
        f"notably, the emergence of interdisciplinary themes that bridge traditional "
        f"information systems boundaries suggests a significant paradigm shift within "
        f"the field. The clustering algorithm identified topic groupings that combine "
        f"technical computing methodologies with domain-specific applications in ways "
        f"that conventional taxonomy frameworks do not anticipate. These hybrid topics "
        f"— merging, for instance, machine learning techniques with healthcare delivery "
        f"or blockchain architectures with supply chain transparency — represent "
        f"genuinely novel research frontiers that challenge existing disciplinary "
        f"categorizations.\n\n"
        f"Among the identified themes, those situated at the intersection of emerging "
        f"technologies and underexplored application domains present the strongest "
        f"candidates for publication in high-impact venues. Topics demonstrating both "
        f"methodological innovation and clear practical relevance are particularly "
        f"compelling, as they satisfy the dual criteria that journal editors and peer "
        f"reviewers consistently prioritize. The themes combining artificial "
        f"intelligence with sector-specific challenges appear especially promising for "
        f"journals such as PAJAIS, MIS Quarterly, and Information Systems Research.\n\n"
        f"{comparison_summary}\n\n"
        f"The divergence between title-based and abstract-based themes reveals an "
        f"important methodological insight. Titles function primarily as signaling "
        f"devices, emphasizing broad and trending research areas to maximize "
        f"discoverability and reader engagement. Abstracts, conversely, provide "
        f"substantive detail about methodologies, datasets, and specific findings. "
        f"Consequently, title-derived topics cluster around popular terminology, while "
        f"abstract-derived topics expose the deeper technical and theoretical "
        f"foundations of the work. This systematic asymmetry confirms that relying on "
        f"a single text source for thematic analysis introduces bias, and multi-source "
        f"analysis produces a more faithful representation of the underlying research "
        f"landscape."
    )


# ════════════════════════════════════════════════════════════════════════════
# 7. Prompt Storage (C9)
# ════════════════════════════════════════════════════════════════════════════

def save_prompts(output_path: str = "prompts.txt") -> str:
    """Save all prompt templates used by the system to a text file (C9).

    The four ``PROMPT_*`` templates and the system-design meta-prompt are
    written as numbered sections separated by rules of 70 ``=`` characters.
    The text is stripped of leading and trailing whitespace and written as
    UTF-8.

    Args:
        output_path: Destination file path.

    Returns:
        *output_path*, unchanged.
    """
    sep = "=" * 70
    content = f"""{sep}
PROMPTS USED IN TOPIC MODELLING SYSTEM (C9)
{sep}

This file documents all prompt templates used by the AI-powered
topic modelling system for academic journal analysis.

{sep}
1. TOPIC LABELING PROMPT
{sep}

{PROMPT_TOPIC_LABELING}

{sep}
2. TAXONOMY MAPPING PROMPT
{sep}

{PROMPT_TAXONOMY_MAPPING}

{sep}
3. NARRATIVE GENERATION PROMPT (C8)
{sep}

{PROMPT_NARRATIVE}

{sep}
4. REFLECTION GENERATION PROMPT (C10)
{sep}

{PROMPT_REFLECTION}

{sep}
5. SYSTEM DESIGN PROMPT
{sep}

The following meta-prompt was used to design and generate this system:

"Build a complete AI-powered topic modelling web application for academic
journal analysis. The system must process a CSV dataset of journal papers,
perform NMF/LDA topic modelling separately on titles and abstracts,
generate 100+ topics with human-readable labels, map topics to the PAJAIS
25-category taxonomy (classifying each as MAPPED or NOVEL), compare title
vs abstract themes, and produce all required output files: comparison.csv,
taxonomy_map.json, narrative.txt, reflection.txt, and prompts.txt.
The system uses Gradio for UI, scikit-learn for topic modelling, and
optional LLM integration (Groq/Mistral/OpenAI) for enhanced labeling."

{sep}
END OF PROMPTS
{sep}
"""
    Path(output_path).write_text(content.strip(), encoding="utf-8")
    return output_path