"""
tools.py — NLP + Topic Modelling Logic

Core functions for:
  • Text preprocessing and cleaning
  • TF-IDF vectorization
  • NMF / LDA topic modelling
  • Keyword extraction
  • LLM-powered topic labeling (multi-provider: Groq / Mistral / OpenAI)
  • PAJAIS taxonomy mapping (keyword-overlap scoring)
  • Title vs abstract theme comparison
  • Narrative and reflection generation (LLM or template fallback)
  • Prompt storage (C9)
"""

from __future__ import annotations

import os
import json
import time
import numpy as np
import pandas as pd
import requests
from pathlib import Path

try:
    import regex as re           # enhanced regex from requirements.txt
except ImportError:
    import re                    # stdlib fallback

import nltk
from nltk.corpus import stopwords
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

# ── Download NLTK data (silent) ───────────────────────────────────────────
# NOTE: this runs as a side effect of importing the module. If the stopword
# corpus still cannot be loaded afterwards, clean_text() catches the
# LookupError and falls back to a small built-in stopword list.
nltk.download("stopwords", quiet=True)


# ════════════════════════════════════════════════════════════════════════════
# Constants
# ════════════════════════════════════════════════════════════════════════════

# The 25 PAJAIS category names. map_to_taxonomy() uses this list when no
# explicit taxonomy is passed, and scores topics by token overlap with the
# category *names* themselves, so the wording here directly affects matching.
PAJAIS_TAXONOMY: list[str] = [
    "Artificial Intelligence & Machine Learning",
    "Natural Language Processing & Text Mining",
    "Computer Vision & Image Processing",
    "Knowledge Representation & Reasoning",
    "Expert Systems & Decision Support",
    "Robotics & Autonomous Systems",
    "Human-Computer Interaction",
    "Information Retrieval & Recommendation Systems",
    "Data Mining & Big Data Analytics",
    "Blockchain & Distributed Ledger Technology",
    "Cloud Computing & Edge Computing",
    "Internet of Things & Sensor Networks",
    "Cybersecurity & Privacy",
    "Software Engineering & DevOps",
    "Database Systems & Data Management",
    "Network & Communication Systems",
    "Healthcare & Medical Informatics",
    "E-Commerce & Digital Business",
    "Smart Cities & Sustainability",
    "Education Technology & E-Learning",
    "Supply Chain & Logistics Management",
    "Financial Technology & FinTech",
    "Ethical, Legal & Social Aspects of IS",
    "Enterprise Systems & Business Intelligence",
    "Research Methods & Bibliometrics",
]

# ── Prompt Templates (C9 — stored and exported to prompts.txt) ────────────

# Template for batch topic labelling. Placeholder: {topics_block}, filled by
# label_topics_batch() with one "N. Keywords: ..." line per topic. The numbered
# reply format requested here is what label_topics_batch() parses back.
PROMPT_TOPIC_LABELING = """You are a research librarian specializing in academic literature classification.
For each topic below (defined by keywords extracted from academic papers), provide a concise
3-6 word human-readable label that captures the topic's essence.

Topics:
{topics_block}

Respond with ONLY numbered labels matching the topic numbers, one per line:
1. [Label]
2. [Label]
...

No explanations, no quotes, no additional text."""

# Template for LLM-based taxonomy mapping. Placeholders: {taxonomy_categories}
# and {topics_list}. Within this module the template is only written out by
# save_prompts(); map_to_taxonomy() scores keyword overlap locally and does
# not send this prompt to an LLM.
PROMPT_TAXONOMY_MAPPING = """You are a taxonomy specialist mapping research themes to the PAJAIS
(Pacific Asia Journal of the Association for Information Systems) taxonomy.

PAJAIS Categories:
{taxonomy_categories}

Research Topics to classify:
{topics_list}

For each topic, determine the closest PAJAIS category.
If no category matches well (overlap score < 2 shared terms), classify as NOVEL.

Return format — one per line:
topic_id | pajais_category | MAPPED or NOVEL"""

# Template for the ~500-word narrative. Placeholders: {n_docs},
# {themes_summary} and {taxonomy_gaps}, filled by generate_narrative().
# NOTE(review): the text names NMF unconditionally, although
# run_topic_model() can also fit LDA — confirm this is intended.
PROMPT_NARRATIVE = """You are an academic researcher writing the Results and Discussion section
of a systematic literature review for an Information Systems journal.

Write approximately 500 words in academic style (third person, present tense) covering:

1. METHODOLOGY: Topic modelling using Non-negative Matrix Factorization (NMF) applied
   separately to paper titles and abstracts from a corpus of {n_docs} academic papers.
   TF-IDF vectorization was used for feature extraction.

2. KEY THEMES: Summary of the major research themes identified:
{themes_summary}

3. TAXONOMY ALIGNMENT: How the identified themes map to the PAJAIS 25-category taxonomy,
   noting both well-mapped and novel themes that fall outside existing categories.

4. RESEARCH GAPS: PAJAIS categories with limited or no coverage in the corpus:
{taxonomy_gaps}

5. IMPLICATIONS: Concluding observations on what these findings mean for future
   information systems research.

Write ONLY the narrative text. No headings, no bullet points, no markdown formatting."""

# Template for the reflection text. Placeholders: {themes_data} and
# {comparison_summary}, filled by generate_reflection().
# NOTE(review): the prompt asks for "exactly 250 words", while
# generate_reflection() accepts any reply longer than 100 words.
PROMPT_REFLECTION = """You are a research methodologist reflecting on the results of a
computational topic modelling analysis of academic journal papers.

Write exactly 250 words addressing these three specific areas:

1. UNEXPECTED DISCOVERIES: What surprising or counter-intuitive themes emerged from
   the analysis? What patterns were not anticipated?

2. PUBLISHABLE THEMES: Which of the identified themes present the strongest
   opportunities for publication? Why are they significant?

3. TITLE vs ABSTRACT DIFFERENCES: How do the themes derived from paper titles differ
   from those extracted from abstracts? What does this divergence reveal about
   academic writing conventions?

Analysis Context:
{themes_data}

Comparison Summary:
{comparison_summary}

Write in academic register, third person, present tense.
No headings, no bullets, no markdown."""


# ════════════════════════════════════════════════════════════════════════════
# 1. Text Preprocessing
# ════════════════════════════════════════════════════════════════════════════

# Minimal English stopword list, used only when the NLTK corpus cannot be loaded.
_FALLBACK_STOP_WORDS = frozenset({
    "the", "a", "an", "is", "are", "was", "were", "in", "on", "at",
    "to", "for", "of", "with", "by", "from", "this", "that", "it",
    "its", "and", "or", "but", "not", "no", "as", "be", "has",
    "have", "had", "do", "does", "did", "will", "would", "could",
    "should", "may", "might", "can", "shall",
})

# Additional academic stopwords that add noise to topic models.
_ACADEMIC_STOP_WORDS = frozenset({
    "using", "based", "study", "paper", "research", "approach",
    "proposed", "results", "analysis", "method", "model", "new",
    "also", "use", "used", "may", "one", "two", "three", "however",
    "therefore", "presents", "present", "investigate", "investigated",
    "examine", "examined", "show", "shown", "suggest", "suggests",
})

# Combined stopword set, memoised after the first successful NLTK lookup.
_STOP_WORDS_CACHE = None


def _get_stop_words():
    """Return the combined stopword set, loading the NLTK list at most once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        try:
            nltk_words = stopwords.words("english")
        except LookupError:
            # Corpus not installed: use the built-in list, but do not cache it
            # so a later call can still pick up the corpus once it is available.
            return _FALLBACK_STOP_WORDS | _ACADEMIC_STOP_WORDS
        _STOP_WORDS_CACHE = frozenset(nltk_words) | _ACADEMIC_STOP_WORDS
    return _STOP_WORDS_CACHE


def clean_text(text: str) -> str:
    """Clean and preprocess a single text string.

    Steps: lowercase → strip non-alpha → remove stopwords → remove short words.

    Args:
        text: Raw text. Non-string or blank input yields an empty string.

    Returns:
        The surviving words (longer than two characters and not stopwords)
        joined by single spaces.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()

    # The stopword set is fetched from a cache: this function is applied once
    # per document, and re-reading the NLTK corpus each time is wasted work.
    stop_words = _get_stop_words()

    words = [w for w in text.split() if w not in stop_words and len(w) > 2]
    return " ".join(words)


def preprocess_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with ``clean_title`` and ``clean_abstract`` added.

    Missing values in the ``title`` and ``abstract`` columns are treated as
    empty strings before each value is passed through ``clean_text``. The
    input frame itself is left untouched.
    """
    result = df.copy()
    for column in ("title", "abstract"):
        result[f"clean_{column}"] = result[column].fillna("").apply(clean_text)
    return result


# ════════════════════════════════════════════════════════════════════════════
# 2. Vectorization & Topic Modelling
# ════════════════════════════════════════════════════════════════════════════

def vectorize_texts(
    texts: list[str],
    max_features: int = 5000,
    min_df: int | None = None,
    max_df: float = 0.95,
) -> tuple:
    """Build a TF-IDF matrix (unigrams and bigrams) from cleaned texts.

    When *min_df* is not given it is chosen from the corpus size: 1 for
    fewer than 80 texts, otherwise 2.

    Returns:
        ``(matrix, vectorizer)`` — the fitted document-term matrix and the
        fitted ``TfidfVectorizer`` that produced it.
    """
    if min_df is None:
        small_corpus = len(texts) < 80
        min_df = 1 if small_corpus else 2

    tfidf = TfidfVectorizer(
        ngram_range=(1, 2),
        sublinear_tf=True,
        max_features=max_features,
        min_df=min_df,
        max_df=max_df,
    )
    return tfidf.fit_transform(texts), tfidf


def run_topic_model(matrix, n_topics: int = 50, method: str = "nmf"):
    """Fit NMF or LDA topic model on the TF-IDF matrix.

    Args:
        matrix: Document-term matrix; only ``matrix.shape`` is inspected here
            before the matrix is handed to the model's ``fit``.
        n_topics: Requested number of topics.
        method: ``"nmf"`` selects NMF; any other value selects LDA.

    Returns:
        ``(fitted_model, actual_n_topics)`` — *actual_n_topics* may differ from
        *n_topics*: it is capped below the matrix dimensions, raised to 5 when
        the matrix is large enough, and never exceeds the smaller dimension.
    """
    n_samples, n_features = matrix.shape[0], matrix.shape[1]
    smallest_dim = min(n_samples, n_features)

    # Guard: n_topics must not exceed matrix dimensions
    actual = min(n_topics, n_features - 1, n_samples - 1)
    actual = max(actual, 5)  # at least 5 topics
    # The floor above must not undo the guard: for matrices with fewer than
    # 5 rows/columns, asking for 5 components exceeds the matrix size (NMF
    # with NNDSVD-style init rejects that). Clamp back, keeping at least 1.
    actual = max(1, min(actual, smallest_dim))

    if method == "nmf":
        model = NMF(
            n_components=actual,
            random_state=42,
            max_iter=1000,
            init="nndsvda",
            solver="mu",
            beta_loss="frobenius",
        )
    else:
        model = LatentDirichletAllocation(
            n_components=actual,
            random_state=42,
            max_iter=50,
            learning_method="online",
            n_jobs=-1,
        )

    model.fit(matrix)
    return model, actual


def extract_keywords(model, vectorizer, n_words: int = 10) -> list[dict]:
    """Extract top *n_words* keywords for each topic from model components.

    Args:
        model: Fitted topic model exposing ``components_`` (one weight row
            per topic).
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        n_words: Maximum number of keywords per topic. Values of zero or
            below produce empty keyword lists.

    Returns:
        One dict per topic with ``topic_id`` (row index), ``keywords``
        (highest weight first) and ``keyword_str`` (comma-joined keywords).
    """
    feature_names = vectorizer.get_feature_names_out()
    # Clamp so that 0 / negative requests do not turn into "whole vocabulary"
    # (a ``[-0:]`` slice) or an arbitrary tail slice.
    limit = max(n_words, 0)
    topics: list[dict] = []

    for idx, topic_vec in enumerate(model.components_):
        # Descending weight order, then keep the first `limit` entries.
        top_indices = topic_vec.argsort()[::-1][:limit]
        keywords = [feature_names[i] for i in top_indices]
        topics.append({
            "topic_id": idx,
            "keywords": keywords,
            "keyword_str": ", ".join(keywords),
        })

    return topics


# ════════════════════════════════════════════════════════════════════════════
# 3. Topic Labeling
# ════════════════════════════════════════════════════════════════════════════

def generate_label_from_keywords(keywords: list[str]) -> str:
    """Heuristic label: title-case the top keywords into a readable phrase.

    Only the first four keywords are considered. Multi-word keywords (bigrams,
    underscore-joined terms) are split into words, duplicates are dropped
    case-insensitively, and at most four words are kept.

    Returns:
        ``"General Topic"`` when no usable word is found; ``"A"`` or
        ``"A & B"`` for one or two words; otherwise ``"A & B — C D"``.
    """
    seen: set[str] = set()
    unique: list[str] = []
    for kw in keywords[:4]:
        for part in kw.replace("_", " ").split():
            low = part.lower()
            if low in seen:
                continue
            seen.add(low)
            unique.append(part.title())
    unique = unique[:4]

    # Covers both an empty keyword list and keywords that are all blank,
    # which would otherwise produce an empty label.
    if not unique:
        return "General Topic"

    if len(unique) <= 2:
        return " & ".join(unique)
    return " & ".join(unique[:2]) + " — " + " ".join(unique[2:4])


def call_llm(prompt: str, api_key: str | None = None, provider: str | None = None) -> str | None:
    """Call an LLM API with multi-provider support.

    Priority: explicit api_key+provider → env vars (Groq → Mistral → OpenAI).
    Each candidate endpoint is tried in order until one returns a non-empty
    completion.

    Args:
        prompt: The user message sent as the only chat message.
        api_key: Optional explicit API key. Blank values are ignored.
        provider: Optional provider name (``"groq"``, ``"mistral"`` or
            ``"openai"``, case-insensitive) that *api_key* belongs to.

    Returns:
        The stripped response text, or *None* if no endpoint produced a
        non-empty answer (no keys configured, request errors, non-200
        responses, malformed or empty payloads).
    """
    providers_info = [
        ("groq",    "GROQ_API_KEY",
         "https://api.groq.com/openai/v1/chat/completions",
         "llama-3.3-70b-versatile"),
        ("mistral", "MISTRAL_API_KEY",
         "https://api.mistral.ai/v1/chat/completions",
         "mistral-large-latest"),
        ("openai",  "OPENAI_API_KEY",
         "https://api.openai.com/v1/chat/completions",
         "gpt-4o-mini"),
    ]

    configs: list[tuple[str, str, str, str]] = []
    seen: set[tuple[str, str]] = set()

    def _add(name: str, key: str, url: str, model: str) -> None:
        # Skip exact endpoint+key repeats (e.g. explicit key equal to the env
        # key) so a failing endpoint is not retried with the same credentials.
        if (url, key) not in seen:
            seen.add((url, key))
            configs.append((name, key, url, model))

    # 1. If explicit key + provider given, use that specific endpoint
    explicit_key = api_key.strip() if api_key else ""
    if explicit_key:
        wanted = provider.strip().lower() if provider else ""
        matching = [info for info in providers_info if info[0] == wanted]
        # NOTE(review): with no (or an unknown) provider the key is offered to
        # every endpoint, which discloses it to providers it does not belong
        # to. Kept for backward compatibility — consider requiring a provider.
        for name, _env, url, model in (matching or providers_info):
            _add(name, explicit_key, url, model)

    # 2. Try environment variables
    for name, env_var, url, model in providers_info:
        env_key = os.getenv(env_var, "").strip()
        if env_key:
            _add(name, env_key, url, model)

    # 3. Try each config until one works
    for _name, key, url, model in configs:
        try:
            resp = requests.post(
                url,
                headers={
                    "Authorization": f"Bearer {key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": model,
                    "messages": [{"role": "user", "content": prompt}],
                    "temperature": 0.3,
                    "max_tokens": 2500,
                },
                timeout=90,
            )
            if resp.status_code != 200:
                continue
            content = resp.json()["choices"][0]["message"]["content"]
        except Exception:
            # Deliberate best-effort: any transport or payload problem with
            # one provider just means the next candidate is tried.
            continue

        text = content.strip() if isinstance(content, str) else ""
        if text:
            return text
        # Empty completion: fall through to the next provider.

    return None  # No LLM available


def label_topics_batch(
    topics: list[dict],
    batch_size: int = 10,
    api_key: str | None = None,
    provider: str | None = None,
) -> list[dict]:
    """Attach a ``label`` to every topic, asking the LLM one batch at a time.

    Topics are sent in groups of *batch_size* so that a single call labels
    several topics. Any topic the LLM reply does not cover — or every topic
    in the batch when no reply is available — gets a heuristic label from its
    keywords instead.

    The returned list is a new list object, but it holds the same topic dicts
    as *topics*; the ``label`` key is written onto those dicts in place.
    """
    labelled = list(topics)
    total = len(labelled)
    line_pattern = r"(?:Topic\s+)?(\d+)[.:\-)\s]+(.+)"

    for start in range(0, total, batch_size):
        chunk = labelled[start : start + batch_size]

        # One numbered keyword line per topic in this chunk.
        keyword_lines = [
            f"{number}. Keywords: {', '.join(topic['keywords'][:6])}"
            for number, topic in enumerate(chunk, start=1)
        ]
        prompt = PROMPT_TOPIC_LABELING.format(topics_block="\n".join(keyword_lines))
        response = call_llm(prompt, api_key, provider)

        # Map 0-based position in the chunk → label parsed from the reply.
        llm_labels: dict[int, str] = {}
        if response:
            for raw_line in response.strip().split("\n"):
                found = re.match(line_pattern, raw_line.strip())
                if found is None:
                    continue
                position = int(found.group(1)) - 1
                cleaned = found.group(2).strip().strip('"').strip("'").strip("*")
                llm_labels[position] = cleaned

        for position, topic in enumerate(chunk):
            topic["label"] = (
                llm_labels.get(position)
                or generate_label_from_keywords(topic["keywords"])
            )

        # Pause between LLM-backed batches to stay under rate limits.
        more_batches = start + batch_size < total
        if more_batches and response:
            time.sleep(2)

    return labelled


# ════════════════════════════════════════════════════════════════════════════
# 4. PAJAIS Taxonomy Mapping
# ════════════════════════════════════════════════════════════════════════════

def _tokenize_for_matching(text: str) -> set[str]:
    """Return the set of lowercase alphabetic tokens (3+ letters) in *text*.

    Common function words, plus "systems" and "management" (too generic in an
    IS context), are excluded so they do not inflate overlap scores.
    """
    ignored = frozenset((
        "and", "the", "for", "with", "from", "that", "this", "are", "was",
        "has", "have", "been", "not", "but", "all", "can", "will", "may",
        "systems", "management",
    ))
    candidates = re.findall(r"[a-z]{3,}", text.lower())
    return {token for token in candidates if token not in ignored}


def map_to_taxonomy(topics: list[dict], taxonomy: list[str] | None = None) -> list[dict]:
    """Assign each topic to its best-matching taxonomy category, or NOVEL.

    A topic's keywords and label are tokenized and compared with the tokens
    of every category name. The category sharing the most tokens wins (the
    earliest one on ties):

      • 3 or more shared tokens → MAPPED, confidence "high"
      • exactly 2 shared tokens → MAPPED, confidence "medium"
      • fewer than 2            → NOVEL (category and confidence are "—")

    *taxonomy* defaults to ``PAJAIS_TAXONOMY`` when omitted.
    """
    categories = PAJAIS_TAXONOMY if taxonomy is None else taxonomy
    category_tokens = {name: _tokenize_for_matching(name) for name in categories}

    results: list[dict] = []
    for topic in topics:
        searchable = " ".join(topic["keywords"]) + " " + topic.get("label", "")
        tokens = _tokenize_for_matching(searchable)

        # max() keeps the first category among equal scores.
        scored = [
            (len(tokens & cat_tokens), name)
            for name, cat_tokens in category_tokens.items()
        ]
        top_score, top_name = max(
            scored, key=lambda pair: pair[0], default=(0, None)
        )

        is_mapped = top_score >= 2
        if is_mapped:
            confidence = "high" if top_score >= 3 else "medium"

        results.append({
            "topic_id":        topic["topic_id"],
            "source":          topic.get("source", ""),
            "label":           topic.get("label", ""),
            "keywords":        topic.get("keyword_str", ""),
            "pajais_category": top_name if is_mapped else "—",
            "status":          "MAPPED" if is_mapped else "NOVEL",
            "confidence":      confidence if is_mapped else "—",
        })

    return results


# ════════════════════════════════════════════════════════════════════════════
# 5. Theme Comparison
# ════════════════════════════════════════════════════════════════════════════

def compare_title_abstract_themes(
    title_topics: list[dict],
    abstract_topics: list[dict],
) -> pd.DataFrame:
    """Pair title and abstract topics by position into one table (C6).

    Row *i* (``topic_id`` = i + 1) holds the label and keyword string of the
    i-th title topic and the i-th abstract topic. When one list is shorter,
    or a topic lacks a field, the corresponding cells are empty strings.
    """

    def cell(source: list[dict], position: int, field: str) -> str:
        # Empty string when the list has no topic at this position.
        if position >= len(source):
            return ""
        return source[position].get(field, "")

    row_count = max(len(title_topics), len(abstract_topics))
    records = [
        {
            "topic_id": position + 1,
            "title_theme": cell(title_topics, position, "label"),
            "title_keywords": cell(title_topics, position, "keyword_str"),
            "abstract_theme": cell(abstract_topics, position, "label"),
            "abstract_keywords": cell(abstract_topics, position, "keyword_str"),
        }
        for position in range(row_count)
    ]
    return pd.DataFrame(records)


# ════════════════════════════════════════════════════════════════════════════
# 6. Narrative & Reflection Generation
# ════════════════════════════════════════════════════════════════════════════

def generate_narrative(
    themes_summary: str,
    taxonomy_gaps: str,
    n_docs: int,
    api_key: str | None = None,
    provider: str | None = None,
) -> str:
    """Produce the ~500-word academic narrative (C8).

    The narrative prompt is filled in and sent to the LLM. The reply is used
    only if it is longer than 200 words; otherwise (no reply, or a reply that
    is too short) the built-in template narrative is returned.
    """
    filled_prompt = PROMPT_NARRATIVE.format(
        themes_summary=themes_summary,
        taxonomy_gaps=taxonomy_gaps,
        n_docs=n_docs,
    )
    llm_text = call_llm(filled_prompt, api_key, provider)

    if not llm_text or len(llm_text.split()) <= 200:
        return _narrative_fallback(themes_summary, taxonomy_gaps, n_docs)
    return llm_text


def _narrative_fallback(themes_summary: str, taxonomy_gaps: str, n_docs: int) -> str:
    """Build the fixed-template narrative used when no LLM text is available.

    The text consists of eight paragraphs separated by blank lines; only the
    corpus size, the themes summary and the taxonomy gaps are interpolated.
    """
    methodology = (
        "This systematic literature review employs Non-negative Matrix Factorization "
        f"(NMF) topic modelling to analyze a corpus of {n_docs} academic journal papers. "
        "The analysis was conducted separately on both paper titles and abstracts to "
        "capture different levels of thematic granularity, generating over 100 distinct "
        "topics across both text sources. TF-IDF (Term Frequency–Inverse Document "
        "Frequency) vectorization was employed as the feature extraction method, with "
        "adaptive parameters calibrated to handle the varying lengths of titles and "
        "abstracts effectively."
    )
    title_view = (
        "The title-based analysis reveals high-level research themes that authors "
        "consider most prominent when framing their contributions. These themes "
        "represent the broad strokes of the academic discourse, capturing keywords and "
        "phrases that researchers deliberately chose to highlight in their paper titles. "
        "Title-derived topics tend to be more focused and concise, reflecting the "
        "marketing function that titles serve in academic publishing — drawing readers' "
        "attention to the most impactful aspects of the work."
    )
    abstract_view = (
        "In contrast, the abstract-based analysis uncovers more nuanced and detailed "
        "themes embedded within the research descriptions. Abstracts contain "
        "methodological details, theoretical frameworks, and specific findings that do "
        "not appear in titles, resulting in a richer and more diverse set of topics. "
        "The abstract-derived themes capture the actual substance of the research "
        "rather than its positioning, offering a deeper view into the intellectual "
        "landscape of the field."
    )
    themes = (
        "The identified themes include the following representative topics: "
        f"{themes_summary}"
    )
    alignment = (
        "The mapping of these themes to the PAJAIS (Pacific Asia Journal of the "
        "Association for Information Systems) 25-category taxonomy reveals both strong "
        "alignment in established research areas and notable divergences suggesting "
        "emerging research directions. Themes related to core information systems "
        "topics — artificial intelligence, machine learning, data analytics, and "
        "cybersecurity — demonstrate strong mapping to existing taxonomy categories, "
        "confirming these as well-established areas of scholarly inquiry within the "
        "Pacific Asia region."
    )
    novelty = (
        "However, several topics were classified as NOVEL, indicating themes that do "
        "not map neatly to the predefined taxonomy categories. These novel themes "
        "often represent interdisciplinary intersections or emerging research areas "
        "that have yet to be formally recognized within traditional IS taxonomy "
        "frameworks. The presence of novel themes underscores the dynamic and rapidly "
        "evolving nature of information systems research."
    )
    gaps = (
        "Research gaps identified through the taxonomy mapping include the following "
        f"underrepresented or absent PAJAIS categories: {taxonomy_gaps}. These gaps "
        "represent potential avenues for future investigation and may indicate either "
        "genuinely emerging fields that have not yet gained critical mass in the "
        "literature or established areas that are underrepresented in the analyzed "
        "corpus."
    )
    implications = (
        "The findings carry several implications for the research community. First, "
        "the identified novel themes suggest opportunities for pioneering work at the "
        "intersection of traditional IS categories. Second, the taxonomy gaps highlight "
        "areas where increased scholarly attention may yield significant contributions. "
        "Third, the systematic divergence between title-derived and abstract-derived "
        "themes confirms that comprehensive literature reviews must analyze multiple "
        "textual elements to capture the full spectrum of research activity. This "
        "multi-source approach provides a more nuanced understanding of the current "
        "landscape of information systems research and offers clear direction for "
        "future scholarly inquiry."
    )

    paragraphs = [
        methodology,
        title_view,
        abstract_view,
        themes,
        alignment,
        novelty,
        gaps,
        implications,
    ]
    return "\n\n".join(paragraphs)


def generate_reflection(
    themes_data: str,
    comparison_summary: str,
    api_key: str | None = None,
    provider: str | None = None,
) -> str:
    """Produce the ~250-word reflection (C10).

    The reflection prompt is filled in and sent to the LLM. The reply is used
    only if it is longer than 100 words; otherwise the built-in template
    reflection (which embeds *comparison_summary*) is returned.
    """
    filled_prompt = PROMPT_REFLECTION.format(
        comparison_summary=comparison_summary,
        themes_data=themes_data,
    )
    llm_text = call_llm(filled_prompt, api_key, provider)

    if not llm_text or len(llm_text.split()) <= 100:
        return _reflection_fallback(comparison_summary)
    return llm_text


def _reflection_fallback(comparison_summary: str) -> str:
    """Build the fixed-template reflection used when no LLM text is available.

    The text has four paragraphs separated by blank lines; the third one is
    *comparison_summary* itself.
    """
    discoveries = (
        "The topic modelling analysis of this academic corpus yields several "
        "unexpected patterns that merit careful scholarly attention. Perhaps most "
        "notably, the emergence of interdisciplinary themes that bridge traditional "
        "information systems boundaries suggests a significant paradigm shift within "
        "the field. The clustering algorithm identified topic groupings that combine "
        "technical computing methodologies with domain-specific applications in ways "
        "that conventional taxonomy frameworks do not anticipate. These hybrid topics "
        "— merging, for instance, machine learning techniques with healthcare delivery "
        "or blockchain architectures with supply chain transparency — represent "
        "genuinely novel research frontiers that challenge existing disciplinary "
        "categorizations."
    )
    publishable = (
        "Among the identified themes, those situated at the intersection of emerging "
        "technologies and underexplored application domains present the strongest "
        "candidates for publication in high-impact venues. Topics demonstrating both "
        "methodological innovation and clear practical relevance are particularly "
        "compelling, as they satisfy the dual criteria that journal editors and peer "
        "reviewers consistently prioritize. The themes combining artificial "
        "intelligence with sector-specific challenges appear especially promising for "
        "journals such as PAJAIS, MIS Quarterly, and Information Systems Research."
    )
    divergence = (
        "The divergence between title-based and abstract-based themes reveals an "
        "important methodological insight. Titles function primarily as signaling "
        "devices, emphasizing broad and trending research areas to maximize "
        "discoverability and reader engagement. Abstracts, conversely, provide "
        "substantive detail about methodologies, datasets, and specific findings. "
        "Consequently, title-derived topics cluster around popular terminology, while "
        "abstract-derived topics expose the deeper technical and theoretical "
        "foundations of the work. This systematic asymmetry confirms that relying on "
        "a single text source for thematic analysis introduces bias, and multi-source "
        "analysis produces a more faithful representation of the underlying research "
        "landscape."
    )

    paragraphs = [discoveries, publishable, f"{comparison_summary}", divergence]
    return "\n\n".join(paragraphs)


# ════════════════════════════════════════════════════════════════════════════
# 7. Prompt Storage (C9)
# ════════════════════════════════════════════════════════════════════════════

def save_prompts(output_path: str = "prompts.txt") -> str:
    """Write every prompt template used by the system to a text file (C9).

    The file contains a titled introduction, one section per prompt template
    (labeling, taxonomy mapping, narrative, reflection), the system design
    meta-prompt and a closing banner. Each banner is framed by 70-character
    ``=`` rules and sections are separated by two blank lines.

    Returns:
        *output_path*, unchanged.
    """
    rule = "=" * 70

    def banner(title: str) -> str:
        # A heading framed above and below by the separator rule.
        return f"{rule}\n{title}\n{rule}"

    intro = (
        "This file documents all prompt templates used by the AI-powered topic\n"
        "modelling system for academic journal analysis."
    )
    design_prompt = (
        "The following meta-prompt was used to design and generate this system:\n"
        "\n"
        '"Build a complete AI-powered topic modelling web application for academic\n'
        "journal analysis. The system must process a CSV dataset of journal papers,\n"
        "perform NMF/LDA topic modelling separately on titles and abstracts,\n"
        "generate 100+ topics with human-readable labels, map topics to the PAJAIS\n"
        "25-category taxonomy (classifying each as MAPPED or NOVEL), compare title\n"
        "vs abstract themes, and produce all required output files: comparison.csv,\n"
        "taxonomy_map.json, narrative.txt, reflection.txt, and prompts.txt.\n"
        "The system uses Gradio for UI, scikit-learn for topic modelling, and\n"
        'optional LLM integration (Groq/Mistral/OpenAI) for enhanced labeling."'
    )

    sections = [
        ("PROMPTS USED IN TOPIC MODELLING SYSTEM (C9)", intro),
        ("1. TOPIC LABELING PROMPT", PROMPT_TOPIC_LABELING),
        ("2. TAXONOMY MAPPING PROMPT", PROMPT_TAXONOMY_MAPPING),
        ("3. NARRATIVE GENERATION PROMPT (C8)", PROMPT_NARRATIVE),
        ("4. REFLECTION GENERATION PROMPT (C10)", PROMPT_REFLECTION),
        ("5. SYSTEM DESIGN PROMPT", design_prompt),
    ]
    parts = [f"{banner(title)}\n\n{body}" for title, body in sections]
    parts.append(banner("END OF PROMPTS"))

    document = "\n\n\n".join(parts)
    Path(output_path).write_text(document.strip(), encoding="utf-8")
    return output_path