Spaces:
Sleeping
Sleeping
"""
tools.py — NLP + Topic Modelling Logic
Core functions for:
  • Text preprocessing and cleaning
  • TF-IDF vectorization
  • NMF / LDA topic modelling
  • Keyword extraction
  • LLM-powered topic labeling (multi-provider: Groq / Mistral / OpenAI)
  • PAJAIS taxonomy mapping (keyword-overlap scoring)
  • Title vs abstract theme comparison
  • Narrative and reflection generation (LLM or template fallback)
  • Prompt storage (C9)
"""
| from __future__ import annotations | |
| import os | |
| import json | |
| import time | |
| import numpy as np | |
| import pandas as pd | |
| import requests | |
| from pathlib import Path | |
| try: | |
| import regex as re # enhanced regex from requirements.txt | |
| except ImportError: | |
| import re # stdlib fallback | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from sklearn.decomposition import NMF, LatentDirichletAllocation | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
# ── Download NLTK data (silent) ───────────────────────────────────────────
# Import-time side effect: asks NLTK for the "stopwords" corpus, with
# quiet=True to suppress its console output. clean_text() carries a small
# built-in stopword list for the case where the corpus lookup still fails.
| nltk.download("stopwords", quiet=True) | |
# ────────────────────────────────────────────────────────────────────────────
# Constants
# ────────────────────────────────────────────────────────────────────────────
# The 25 PAJAIS category names. This list is the default taxonomy of
# map_to_taxonomy(), which scores topics by token overlap with these names,
# so rewording an entry changes which topics are reported as MAPPED.
| PAJAIS_TAXONOMY: list[str] = [ | |
| "Artificial Intelligence & Machine Learning", | |
| "Natural Language Processing & Text Mining", | |
| "Computer Vision & Image Processing", | |
| "Knowledge Representation & Reasoning", | |
| "Expert Systems & Decision Support", | |
| "Robotics & Autonomous Systems", | |
| "Human-Computer Interaction", | |
| "Information Retrieval & Recommendation Systems", | |
| "Data Mining & Big Data Analytics", | |
| "Blockchain & Distributed Ledger Technology", | |
| "Cloud Computing & Edge Computing", | |
| "Internet of Things & Sensor Networks", | |
| "Cybersecurity & Privacy", | |
| "Software Engineering & DevOps", | |
| "Database Systems & Data Management", | |
| "Network & Communication Systems", | |
| "Healthcare & Medical Informatics", | |
| "E-Commerce & Digital Business", | |
| "Smart Cities & Sustainability", | |
| "Education Technology & E-Learning", | |
| "Supply Chain & Logistics Management", | |
| "Financial Technology & FinTech", | |
| "Ethical, Legal & Social Aspects of IS", | |
| "Enterprise Systems & Business Intelligence", | |
| "Research Methods & Bibliometrics", | |
| ] | |
# ── Prompt Templates (C9 — stored and exported to prompts.txt) ────────────
# Template for batch topic labeling. The single placeholder {topics_block} is
# filled by label_topics_batch() with one numbered "Keywords: ..." line per
# topic. The numbered-list reply format requested here is what that function's
# line parser looks for, so keep the two in sync.
| PROMPT_TOPIC_LABELING = """You are a research librarian specializing in academic literature classification. | |
| For each topic below (defined by keywords extracted from academic papers), provide a concise | |
| 3-6 word human-readable label that captures the topic's essence. | |
| Topics: | |
| {topics_block} | |
| Respond with ONLY numbered labels matching the topic numbers, one per line: | |
| 1. [Label] | |
| 2. [Label] | |
| ... | |
| No explanations, no quotes, no additional text.""" | |
# Template for LLM-based taxonomy mapping. Placeholders: {taxonomy_categories}
# and {topics_list}. In the visible part of this module it is never formatted
# or sent to an LLM (map_to_taxonomy() uses keyword overlap); it is only
# written out by save_prompts().
# NOTE(review): the "β" in "Return format β one per line" looks like a
# mis-encoded dash inside the runtime string — confirm against the original file.
| PROMPT_TAXONOMY_MAPPING = """You are a taxonomy specialist mapping research themes to the PAJAIS | |
| (Pacific Asia Journal of the Association for Information Systems) taxonomy. | |
| PAJAIS Categories: | |
| {taxonomy_categories} | |
| Research Topics to classify: | |
| {topics_list} | |
| For each topic, determine the closest PAJAIS category. | |
| If no category matches well (overlap score < 2 shared terms), classify as NOVEL. | |
| Return format β one per line: | |
| topic_id | pajais_category | MAPPED or NOVEL""" | |
# Template for the ~500-word narrative. Placeholders: {n_docs},
# {themes_summary} and {taxonomy_gaps}, all filled by generate_narrative().
| PROMPT_NARRATIVE = """You are an academic researcher writing the Results and Discussion section | |
| of a systematic literature review for an Information Systems journal. | |
| Write approximately 500 words in academic style (third person, present tense) covering: | |
| 1. METHODOLOGY: Topic modelling using Non-negative Matrix Factorization (NMF) applied | |
| separately to paper titles and abstracts from a corpus of {n_docs} academic papers. | |
| TF-IDF vectorization was used for feature extraction. | |
| 2. KEY THEMES: Summary of the major research themes identified: | |
| {themes_summary} | |
| 3. TAXONOMY ALIGNMENT: How the identified themes map to the PAJAIS 25-category taxonomy, | |
| noting both well-mapped and novel themes that fall outside existing categories. | |
| 4. RESEARCH GAPS: PAJAIS categories with limited or no coverage in the corpus: | |
| {taxonomy_gaps} | |
| 5. IMPLICATIONS: Concluding observations on what these findings mean for future | |
| information systems research. | |
| Write ONLY the narrative text. No headings, no bullet points, no markdown formatting.""" | |
# Template for the ~250-word reflection. Placeholders: {themes_data} and
# {comparison_summary}, both filled by generate_reflection().
| PROMPT_REFLECTION = """You are a research methodologist reflecting on the results of a | |
| computational topic modelling analysis of academic journal papers. | |
| Write exactly 250 words addressing these three specific areas: | |
| 1. UNEXPECTED DISCOVERIES: What surprising or counter-intuitive themes emerged from | |
| the analysis? What patterns were not anticipated? | |
| 2. PUBLISHABLE THEMES: Which of the identified themes present the strongest | |
| opportunities for publication? Why are they significant? | |
| 3. TITLE vs ABSTRACT DIFFERENCES: How do the themes derived from paper titles differ | |
| from those extracted from abstracts? What does this divergence reveal about | |
| academic writing conventions? | |
| Analysis Context: | |
| {themes_data} | |
| Comparison Summary: | |
| {comparison_summary} | |
| Write in academic register, third person, present tense. | |
| No headings, no bullets, no markdown.""" | |
# ────────────────────────────────────────────────────────────────────────────
# 1. Text Preprocessing
# ────────────────────────────────────────────────────────────────────────────
# Lazily built by _get_stop_words(); None until the first non-empty text is cleaned.
_STOP_WORDS_CACHE = None


def _get_stop_words() -> frozenset:
    """Return the combined stopword set, building it once on first use."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        try:
            base = set(stopwords.words("english"))
        except LookupError:
            # NLTK corpus not installed: fall back to a small built-in list.
            base = {
                "the", "a", "an", "is", "are", "was", "were", "in", "on", "at",
                "to", "for", "of", "with", "by", "from", "this", "that", "it",
                "its", "and", "or", "but", "not", "no", "as", "be", "has",
                "have", "had", "do", "does", "did", "will", "would", "could",
                "should", "may", "might", "can", "shall",
            }
        # Additional academic stopwords that add noise to topic models
        extra = {
            "using", "based", "study", "paper", "research", "approach",
            "proposed", "results", "analysis", "method", "model", "new",
            "also", "use", "used", "may", "one", "two", "three", "however",
            "therefore", "presents", "present", "investigate", "investigated",
            "examine", "examined", "show", "shown", "suggest", "suggests",
        }
        _STOP_WORDS_CACHE = frozenset(base | extra)
    return _STOP_WORDS_CACHE


def clean_text(text: str) -> str:
    """Clean and preprocess a single text string.

    Steps: lowercase → strip non-alpha → remove stopwords → remove short words.

    Args:
        text: Raw text. Non-string or blank input yields ``""``.

    Returns:
        The remaining words (longer than two characters, not stopwords)
        joined by single spaces.

    Note:
        The stopword set is built once and reused; this function is applied
        per row by ``preprocess_dataframe``, so rebuilding it on every call
        was pure overhead. The trade-off is that the NLTK-vs-fallback choice
        is fixed at the first call.
    """
    if not isinstance(text, str) or not text.strip():
        return ""
    text = text.lower()
    # Only ASCII letters survive; digits, punctuation and accented letters become spaces.
    text = re.sub(r"[^a-z\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    stop_words = _get_stop_words()
    words = [w for w in text.split() if w not in stop_words and len(w) > 2]
    return " ".join(words)
def preprocess_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with ``clean_title`` and ``clean_abstract`` added.

    The input frame is left untouched. Missing values in the ``title`` and
    ``abstract`` columns are treated as empty strings before each value is
    passed through ``clean_text``.
    """
    result = df.copy()
    column_pairs = (("title", "clean_title"), ("abstract", "clean_abstract"))
    for raw_name, clean_name in column_pairs:
        filled = result[raw_name].fillna("")
        result[clean_name] = filled.apply(clean_text)
    return result
# ────────────────────────────────────────────────────────────────────────────
# 2. Vectorization & Topic Modelling
# ────────────────────────────────────────────────────────────────────────────
def vectorize_texts(
    texts: list[str],
    max_features: int = 5000,
    min_df: int | None = None,
    max_df: float = 0.95,
) -> tuple:
    """Build a TF-IDF matrix (unigrams and bigrams) from cleaned texts.

    When *min_df* is not given it adapts to corpus size: 1 for fewer than
    80 texts, otherwise 2.

    Returns:
        ``(matrix, vectorizer)`` — the fitted-and-transformed matrix and the
        ``TfidfVectorizer`` that produced it.
    """
    if min_df is None:
        # Small corpora cannot afford to drop terms seen in only one document.
        min_df = 2 if len(texts) >= 80 else 1

    tfidf = TfidfVectorizer(
        ngram_range=(1, 2),
        sublinear_tf=True,
        max_features=max_features,
        min_df=min_df,
        max_df=max_df,
    )
    return tfidf.fit_transform(texts), tfidf
def run_topic_model(matrix, n_topics: int = 50, method: str = "nmf"):
    """Fit an NMF or LDA topic model on a document-term matrix.

    Args:
        matrix: 2-D document-term matrix (rows are documents, columns are
            terms), e.g. the TF-IDF matrix returned by ``vectorize_texts``.
        n_topics: Requested number of topics.
        method: ``"nmf"`` (case-insensitive) selects NMF; any other value
            selects LDA.

    Returns:
        ``(fitted_model, actual_n_topics)``. The actual count is the request
        capped at ``min(n_samples, n_features) - 1`` and then raised to at
        least 5. For NMF it is additionally capped at
        ``min(n_samples, n_features)`` so the floor of 5 can no longer ask
        for more components than the matrix supports.
    """
    n_samples, n_features = matrix.shape

    # Guard: n_topics must not exceed matrix dimensions ...
    actual = min(n_topics, n_features - 1, n_samples - 1)
    # ... but keep at least 5 topics where the matrix allows it.
    actual = max(actual, 5)

    # Previously only the exact string "nmf" selected NMF, so "NMF" silently ran LDA.
    use_nmf = method.strip().lower() == "nmf"

    if use_nmf:
        # The "nndsvda" initialisation rejects n_components larger than
        # min(n_samples, n_features); the floor of 5 above could violate that
        # on tiny matrices, so re-apply a hard cap (never below 1).
        actual = max(1, min(actual, n_samples, n_features))
        model = NMF(
            n_components=actual,
            random_state=42,
            max_iter=1000,
            init="nndsvda",
            solver="mu",
            beta_loss="frobenius",
        )
    else:
        model = LatentDirichletAllocation(
            n_components=actual,
            random_state=42,
            max_iter=50,
            learning_method="online",
            n_jobs=-1,
        )
    model.fit(matrix)
    return model, actual
def extract_keywords(model, vectorizer, n_words: int = 10) -> list[dict]:
    """List the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Fitted topic model exposing ``components_`` (one weight row
            per topic).
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        n_words: Number of terms to keep per topic.

    Returns:
        One dict per topic with ``topic_id`` (row index), ``keywords``
        (terms ordered by descending weight) and ``keyword_str`` (the same
        terms joined by ``", "``).
    """
    vocabulary = vectorizer.get_feature_names_out()

    def _describe(topic_id: int, weights) -> dict:
        # argsort is ascending: take the tail, then flip to descending weight.
        ranked = weights.argsort()[-n_words:][::-1]
        terms = [vocabulary[pos] for pos in ranked]
        return {
            "topic_id": topic_id,
            "keywords": terms,
            "keyword_str": ", ".join(terms),
        }

    return [_describe(row, weights) for row, weights in enumerate(model.components_)]
# ────────────────────────────────────────────────────────────────────────────
# 3. Topic Labeling
# ────────────────────────────────────────────────────────────────────────────
def generate_label_from_keywords(keywords: list[str]) -> str:
    """Build a heuristic, title-cased label from a topic's top keywords.

    Only the first four keywords are considered. Multi-word keywords
    (bigrams, or terms joined with ``_``) are split into words, duplicates
    are dropped case-insensitively, and at most four words are kept.

    Returns:
        * ``"General Topic"`` when there are no usable words,
        * ``"A"`` or ``"A & B"`` for one or two words,
        * ``"A & B — C D"`` for three or four words.
    """
    if not keywords:
        return "General Topic"

    # Flatten bigrams and deduplicate, preserving first-seen order.
    seen: set[str] = set()
    unique: list[str] = []
    for kw in keywords[:4]:
        for part in kw.replace("_", " ").split():
            low = part.lower()
            if low not in seen:
                seen.add(low)
                unique.append(part.title())
    unique = unique[:4]

    if not unique:
        # Every keyword was blank: previously this produced an empty label.
        return "General Topic"
    if len(unique) <= 2:
        return " & ".join(unique)
    # The separator was a mis-encoded character ("β"); restored to an em dash.
    return " & ".join(unique[:2]) + " — " + " ".join(unique[2:])
def call_llm(prompt: str, api_key: str | None = None, provider: str | None = None) -> str | None:
    """Call an LLM chat-completions API with multi-provider support.

    Priority: explicit api_key+provider → env vars (Groq → Mistral → OpenAI).

    Args:
        prompt: User message sent as the only chat message.
        api_key: Optional explicit API key. Blank or whitespace is ignored.
        provider: Optional provider name (``"groq"``, ``"mistral"``,
            ``"openai"``; case-insensitive). Only used with *api_key*.

    Returns:
        The stripped response text of the first provider that answers with
        HTTP 200 and non-empty content, or ``None`` if no LLM is available.
        Failures are logged as warnings and never raised.
    """
    import logging  # local import: keeps this block self-contained

    log = logging.getLogger(__name__)

    providers_info = [
        ("groq", "GROQ_API_KEY",
         "https://api.groq.com/openai/v1/chat/completions",
         "llama-3.3-70b-versatile"),
        ("mistral", "MISTRAL_API_KEY",
         "https://api.mistral.ai/v1/chat/completions",
         "mistral-large-latest"),
        ("openai", "OPENAI_API_KEY",
         "https://api.openai.com/v1/chat/completions",
         "gpt-4o-mini"),
    ]

    configs: list[tuple[str, str, str, str]] = []
    seen: set[tuple[str, str]] = set()

    def _add(name: str, key: str, url: str, model: str) -> None:
        # Skip exact (provider, key) repeats, e.g. an explicit key equal to the env key.
        if (name, key) not in seen:
            seen.add((name, key))
            configs.append((name, key, url, model))

    # 1. If explicit key + provider given, use that specific endpoint
    explicit_key = api_key.strip() if api_key else ""
    if explicit_key:
        wanted = provider.strip().lower() if provider else ""
        matched = [info for info in providers_info if info[0] == wanted]
        # NOTE(security): with no (or an unrecognised) provider, the same key
        # is offered to every endpoint, which discloses it to services that
        # did not issue it. Kept for backward compatibility; pass `provider`
        # to avoid it.
        for name, _env, url, model in (matched or providers_info):
            _add(name, explicit_key, url, model)

    # 2. Try environment variables
    for name, env_var, url, model in providers_info:
        env_key = os.getenv(env_var, "").strip()
        if env_key:
            _add(name, env_key, url, model)

    # 3. Try each config until one works
    for name, key, url, model in configs:
        try:
            resp = requests.post(
                url,
                headers={
                    "Authorization": f"Bearer {key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": model,
                    "messages": [{"role": "user", "content": prompt}],
                    "temperature": 0.3,
                    "max_tokens": 2500,
                },
                timeout=90,
            )
            if resp.status_code != 200:
                log.warning("LLM provider %s returned HTTP %s", name, resp.status_code)
                continue
            content = resp.json()["choices"][0]["message"]["content"]
        except Exception as exc:
            # Deliberately broad: this is a best-effort boundary and callers
            # rely on None instead of an exception. Log instead of hiding it.
            log.warning("LLM provider %s failed: %s", name, exc)
            continue
        if isinstance(content, str) and content.strip():
            return content.strip()
        log.warning("LLM provider %s returned an empty response", name)
    return None  # No LLM available
def label_topics_batch(
    topics: list[dict],
    batch_size: int = 10,
    api_key: str | None = None,
    provider: str | None = None,
) -> list[dict]:
    """Label topics in batches using an LLM, with heuristic fallback.

    Each batch sends ~10 topics to the LLM in a single call to reduce
    API calls (100 topics → 10 calls instead of 100).

    Args:
        topics: Topic dicts; each must have a ``"keywords"`` list.
        batch_size: Topics per LLM call. Values below 1 are treated as 1.
        api_key: Optional explicit API key forwarded to ``call_llm``.
        provider: Optional provider name forwarded to ``call_llm``.

    Returns:
        A new list holding the SAME dict objects as *topics*: the list is
        copied but each dict is updated in place with a ``"label"`` key.
    """
    labelled = list(topics)  # new list, shared dicts (labels are written in place)
    step = max(1, int(batch_size))  # a step of 0 would make range() raise
    numbered_line = re.compile(r"(?:Topic\s+)?(\d+)[.:\-)\s]+(.+)")

    for start in range(0, len(labelled), step):
        batch = labelled[start : start + step]

        # Build prompt for this batch
        topics_block = "\n".join(
            f"{j + 1}. Keywords: {', '.join(t['keywords'][:6])}"
            for j, t in enumerate(batch)
        )
        prompt = PROMPT_TOPIC_LABELING.format(topics_block=topics_block)
        result = call_llm(prompt, api_key, provider)

        # Parse numbered labels from the LLM response (stays empty without one).
        parsed: dict[int, str] = {}
        if result:
            for line in result.strip().split("\n"):
                match = numbered_line.match(line.strip())
                if not match:
                    continue
                # Strip quotes/asterisks/backticks in any combination; the old
                # chained strip left the quotes on replies like **"Label"**.
                label = match.group(2).strip(" \t\"'*`")
                if label:
                    parsed[int(match.group(1)) - 1] = label  # 0-based position

        # Missing or empty labels fall back to the keyword heuristic.
        for j, t in enumerate(batch):
            t["label"] = parsed.get(j) or generate_label_from_keywords(t["keywords"])

        # Rate-limit protection between batches
        if start + step < len(labelled) and result:
            time.sleep(2)
    return labelled
# ────────────────────────────────────────────────────────────────────────────
# 4. PAJAIS Taxonomy Mapping
# ────────────────────────────────────────────────────────────────────────────
def _tokenize_for_matching(text: str) -> set[str]:
    """Return the significant lowercase tokens of *text* for overlap scoring.

    A token is a run of at least three ASCII letters. Common function words,
    plus "systems" and "management" (too generic in an IS context), are
    discarded.
    """
    ignored = {
        "and", "the", "for", "with", "from", "that", "this", "are", "was",
        "has", "have", "been", "not", "but", "all", "can", "will", "may",
        "systems", "management",  # too generic in IS context
    }
    candidates = re.findall(r"[a-z]{3,}", text.lower())
    return {token for token in candidates if token not in ignored}
def map_to_taxonomy(topics: list[dict], taxonomy: list[str] | None = None) -> list[dict]:
    """Map topics to PAJAIS taxonomy using keyword-overlap scoring.

    Scoring rules:
      • overlap ≥ 2 significant tokens → MAPPED
      • overlap < 2 → NOVEL

    Args:
        topics: Topic dicts with ``"topic_id"`` and ``"keywords"``; the
            optional ``"label"``, ``"source"`` and ``"keyword_str"`` are
            used when present.
        taxonomy: Category names to match against; defaults to
            ``PAJAIS_TAXONOMY``.

    Returns:
        One dict per topic with keys ``topic_id``, ``source``, ``label``,
        ``keywords``, ``pajais_category``, ``status`` (``"MAPPED"`` or
        ``"NOVEL"``) and ``confidence`` (``"high"`` for 3+ shared tokens,
        ``"medium"`` for 2, ``"—"`` for NOVEL). When several categories tie,
        the one listed first in *taxonomy* wins.
    """
    if taxonomy is None:
        taxonomy = PAJAIS_TAXONOMY

    # Pre-tokenize taxonomy categories
    tax_tokens = {cat: _tokenize_for_matching(cat) for cat in taxonomy}

    mappings: list[dict] = []
    for t in topics:
        # Combine keywords + label for matching; a missing or None label counts as empty.
        label = t.get("label") or ""
        topic_tokens = _tokenize_for_matching(" ".join(t["keywords"]) + " " + label)

        # Score against each taxonomy category (strict ">" keeps the first best).
        best_cat = None
        best_score = 0
        for cat, cat_tokens in tax_tokens.items():
            score = len(topic_tokens & cat_tokens)
            if score > best_score:
                best_score = score
                best_cat = cat

        if best_score >= 2:
            status = "MAPPED"
            confidence = "high" if best_score >= 3 else "medium"
            category = best_cat
        else:
            # Placeholders were mis-encoded ("β"); restored to an em dash.
            status = "NOVEL"
            confidence = "—"
            category = "—"

        mappings.append({
            "topic_id": t["topic_id"],
            "source": t.get("source", ""),
            "label": label,
            "keywords": t.get("keyword_str", ""),
            "pajais_category": category,
            "status": status,
            "confidence": confidence,
        })
    return mappings
# ────────────────────────────────────────────────────────────────────────────
# 5. Theme Comparison
# ────────────────────────────────────────────────────────────────────────────
def compare_title_abstract_themes(
    title_topics: list[dict],
    abstract_topics: list[dict],
) -> pd.DataFrame:
    """Pair title topics with abstract topics by position (C6).

    Row *i* holds the label and keyword string of the *i*-th title topic next
    to those of the *i*-th abstract topic. ``topic_id`` is 1-based. Where one
    list is shorter, its cells are empty strings.
    """

    def _cells(topics: list[dict], position: int) -> tuple:
        # (label, keyword_str) at `position`, or blanks past the end of the list.
        if position >= len(topics):
            return "", ""
        entry = topics[position]
        return entry.get("label", ""), entry.get("keyword_str", "")

    records: list[dict] = []
    for position in range(max(len(title_topics), len(abstract_topics))):
        title_theme, title_keywords = _cells(title_topics, position)
        abstract_theme, abstract_keywords = _cells(abstract_topics, position)
        records.append({
            "topic_id": position + 1,
            "title_theme": title_theme,
            "title_keywords": title_keywords,
            "abstract_theme": abstract_theme,
            "abstract_keywords": abstract_keywords,
        })
    return pd.DataFrame(records)
# ────────────────────────────────────────────────────────────────────────────
# 6. Narrative & Reflection Generation
# ────────────────────────────────────────────────────────────────────────────
def generate_narrative(
    themes_summary: str,
    taxonomy_gaps: str,
    n_docs: int,
    api_key: str | None = None,
    provider: str | None = None,
) -> str:
    """Produce the ~500-word academic narrative (C8).

    The LLM answer is used only when it has more than 200 whitespace-separated
    words; otherwise (no LLM, or a too-short reply) the built-in template
    text is returned instead.
    """
    llm_text = call_llm(
        PROMPT_NARRATIVE.format(
            n_docs=n_docs,
            themes_summary=themes_summary,
            taxonomy_gaps=taxonomy_gaps,
        ),
        api_key,
        provider,
    )
    if not llm_text or len(llm_text.split()) <= 200:
        return _narrative_fallback(themes_summary, taxonomy_gaps, n_docs)
    return llm_text
def _narrative_fallback(themes_summary: str, taxonomy_gaps: str, n_docs: int) -> str:
    """Template-based narrative used when no LLM text is available.

    Args:
        themes_summary: Text inserted after "representative topics:".
        taxonomy_gaps: Text inserted after "absent PAJAIS categories:".
        n_docs: Corpus size quoted in the opening sentence.

    Returns:
        Eight paragraphs separated by blank lines. All other wording is fixed
        and does not depend on the analysis (including the "over 100 distinct
        topics" and "25-category" statements).
    """
    # Dashes below were mis-encoded ("β") and are restored to "–" / "—".
    # Only the segments that interpolate a value need the f-prefix.
    return (
        "This systematic literature review employs Non-negative Matrix Factorization "
        f"(NMF) topic modelling to analyze a corpus of {n_docs} academic journal papers. "
        "The analysis was conducted separately on both paper titles and abstracts to "
        "capture different levels of thematic granularity, generating over 100 distinct "
        "topics across both text sources. TF-IDF (Term Frequency–Inverse Document "
        "Frequency) vectorization was employed as the feature extraction method, with "
        "adaptive parameters calibrated to handle the varying lengths of titles and "
        "abstracts effectively.\n\n"
        "The title-based analysis reveals high-level research themes that authors "
        "consider most prominent when framing their contributions. These themes "
        "represent the broad strokes of the academic discourse, capturing keywords and "
        "phrases that researchers deliberately chose to highlight in their paper titles. "
        "Title-derived topics tend to be more focused and concise, reflecting the "
        "marketing function that titles serve in academic publishing — drawing readers' "
        "attention to the most impactful aspects of the work.\n\n"
        "In contrast, the abstract-based analysis uncovers more nuanced and detailed "
        "themes embedded within the research descriptions. Abstracts contain "
        "methodological details, theoretical frameworks, and specific findings that do "
        "not appear in titles, resulting in a richer and more diverse set of topics. "
        "The abstract-derived themes capture the actual substance of the research "
        "rather than its positioning, offering a deeper view into the intellectual "
        "landscape of the field.\n\n"
        "The identified themes include the following representative topics: "
        f"{themes_summary}\n\n"
        "The mapping of these themes to the PAJAIS (Pacific Asia Journal of the "
        "Association for Information Systems) 25-category taxonomy reveals both strong "
        "alignment in established research areas and notable divergences suggesting "
        "emerging research directions. Themes related to core information systems "
        "topics — artificial intelligence, machine learning, data analytics, and "
        "cybersecurity — demonstrate strong mapping to existing taxonomy categories, "
        "confirming these as well-established areas of scholarly inquiry within the "
        "Pacific Asia region.\n\n"
        "However, several topics were classified as NOVEL, indicating themes that do "
        "not map neatly to the predefined taxonomy categories. These novel themes "
        "often represent interdisciplinary intersections or emerging research areas "
        "that have yet to be formally recognized within traditional IS taxonomy "
        "frameworks. The presence of novel themes underscores the dynamic and rapidly "
        "evolving nature of information systems research.\n\n"
        "Research gaps identified through the taxonomy mapping include the following "
        f"underrepresented or absent PAJAIS categories: {taxonomy_gaps}. These gaps "
        "represent potential avenues for future investigation and may indicate either "
        "genuinely emerging fields that have not yet gained critical mass in the "
        "literature or established areas that are underrepresented in the analyzed "
        "corpus.\n\n"
        "The findings carry several implications for the research community. First, "
        "the identified novel themes suggest opportunities for pioneering work at the "
        "intersection of traditional IS categories. Second, the taxonomy gaps highlight "
        "areas where increased scholarly attention may yield significant contributions. "
        "Third, the systematic divergence between title-derived and abstract-derived "
        "themes confirms that comprehensive literature reviews must analyze multiple "
        "textual elements to capture the full spectrum of research activity. This "
        "multi-source approach provides a more nuanced understanding of the current "
        "landscape of information systems research and offers clear direction for "
        "future scholarly inquiry."
    )
def generate_reflection(
    themes_data: str,
    comparison_summary: str,
    api_key: str | None = None,
    provider: str | None = None,
) -> str:
    """Produce the ~250-word reflection (C10).

    The LLM answer is used only when it has more than 100 whitespace-separated
    words; otherwise the template text is returned. The template receives
    *comparison_summary* only, so *themes_data* affects the LLM prompt alone.
    """
    llm_text = call_llm(
        PROMPT_REFLECTION.format(
            themes_data=themes_data,
            comparison_summary=comparison_summary,
        ),
        api_key,
        provider,
    )
    if not llm_text or len(llm_text.split()) <= 100:
        return _reflection_fallback(comparison_summary)
    return llm_text
def _reflection_fallback(comparison_summary: str) -> str:
    """Template-based reflection used when no LLM text is available.

    Args:
        comparison_summary: Inserted verbatim as its own paragraph between
            the second and the final paragraph. When it is empty or blank the
            paragraph is omitted, so the text has no stray blank paragraph.

    Returns:
        The reflection text, paragraphs separated by blank lines.
    """
    # Dashes below were mis-encoded ("β") and are restored to em dashes.
    discoveries = (
        "The topic modelling analysis of this academic corpus yields several "
        "unexpected patterns that merit careful scholarly attention. Perhaps most "
        "notably, the emergence of interdisciplinary themes that bridge traditional "
        "information systems boundaries suggests a significant paradigm shift within "
        "the field. The clustering algorithm identified topic groupings that combine "
        "technical computing methodologies with domain-specific applications in ways "
        "that conventional taxonomy frameworks do not anticipate. These hybrid topics "
        "— merging, for instance, machine learning techniques with healthcare delivery "
        "or blockchain architectures with supply chain transparency — represent "
        "genuinely novel research frontiers that challenge existing disciplinary "
        "categorizations."
    )
    publishable = (
        "Among the identified themes, those situated at the intersection of emerging "
        "technologies and underexplored application domains present the strongest "
        "candidates for publication in high-impact venues. Topics demonstrating both "
        "methodological innovation and clear practical relevance are particularly "
        "compelling, as they satisfy the dual criteria that journal editors and peer "
        "reviewers consistently prioritize. The themes combining artificial "
        "intelligence with sector-specific challenges appear especially promising for "
        "journals such as PAJAIS, MIS Quarterly, and Information Systems Research."
    )
    divergence = (
        "The divergence between title-based and abstract-based themes reveals an "
        "important methodological insight. Titles function primarily as signaling "
        "devices, emphasizing broad and trending research areas to maximize "
        "discoverability and reader engagement. Abstracts, conversely, provide "
        "substantive detail about methodologies, datasets, and specific findings. "
        "Consequently, title-derived topics cluster around popular terminology, while "
        "abstract-derived topics expose the deeper technical and theoretical "
        "foundations of the work. This systematic asymmetry confirms that relying on "
        "a single text source for thematic analysis introduces bias, and multi-source "
        "analysis produces a more faithful representation of the underlying research "
        "landscape."
    )

    paragraphs = [discoveries, publishable]
    if comparison_summary and str(comparison_summary).strip():
        paragraphs.append(f"{comparison_summary}")
    paragraphs.append(divergence)
    return "\n\n".join(paragraphs)
# ────────────────────────────────────────────────────────────────────────────
# 7. Prompt Storage (C9)
# ────────────────────────────────────────────────────────────────────────────
# save_prompts: write one plain-text document holding the four PROMPT_*
# templates plus a quoted "system design" meta-prompt, each under a numbered
# heading framed by 70-character "=" rules.
#   output_path: destination file path (default "prompts.txt").
#   returns:     output_path, unchanged.
# The templates are interpolated as values into the f-string below, so their
# own {placeholder} fields (e.g. {topics_block}) appear literally in the file.
| def save_prompts(output_path: str = "prompts.txt") -> str: | |
| """Save all prompt templates used by the system to a text file (C9).""" | |
| sep = "=" * 70 | |
| content = f"""{sep} | |
| PROMPTS USED IN TOPIC MODELLING SYSTEM (C9) | |
| {sep} | |
| This file documents all prompt templates used by the AI-powered topic | |
| modelling system for academic journal analysis. | |
| {sep} | |
| 1. TOPIC LABELING PROMPT | |
| {sep} | |
| {PROMPT_TOPIC_LABELING} | |
| {sep} | |
| 2. TAXONOMY MAPPING PROMPT | |
| {sep} | |
| {PROMPT_TAXONOMY_MAPPING} | |
| {sep} | |
| 3. NARRATIVE GENERATION PROMPT (C8) | |
| {sep} | |
| {PROMPT_NARRATIVE} | |
| {sep} | |
| 4. REFLECTION GENERATION PROMPT (C10) | |
| {sep} | |
| {PROMPT_REFLECTION} | |
| {sep} | |
| 5. SYSTEM DESIGN PROMPT | |
| {sep} | |
| The following meta-prompt was used to design and generate this system: | |
| "Build a complete AI-powered topic modelling web application for academic | |
| journal analysis. The system must process a CSV dataset of journal papers, | |
| perform NMF/LDA topic modelling separately on titles and abstracts, | |
| generate 100+ topics with human-readable labels, map topics to the PAJAIS | |
| 25-category taxonomy (classifying each as MAPPED or NOVEL), compare title | |
| vs abstract themes, and produce all required output files: comparison.csv, | |
| taxonomy_map.json, narrative.txt, reflection.txt, and prompts.txt. | |
| The system uses Gradio for UI, scikit-learn for topic modelling, and | |
| optional LLM integration (Groq/Mistral/OpenAI) for enhanced labeling." | |
| {sep} | |
| END OF PROMPTS | |
| {sep} | |
| """ | |
# strip() drops the leading/trailing whitespace of the assembled text before
# it is written as UTF-8.
| Path(output_path).write_text(content.strip(), encoding="utf-8") | |
| return output_path | |