topic_modelling_final

Sleeping

App Files Files Community

luqman2520 committed on Apr 14

Commit

ccab3d4

verified ·

1 Parent(s): 2223108

Upload 4 files

Browse files

Files changed (4) hide show

agent.py +521 -0
app.py +259 -0
requirements.txt +13 -0
tools.py +544 -0

agent.py ADDED Viewed

	@@ -0,0 +1,521 @@

+"""
+agent.py — LangGraph ReAct Agent for BERTopic Agentic Thematic Analysis
+Uses ChatMistralAI + MemorySaver + all 7 tools from tools.py
+"""
+import json
+import os
+import re
+import pandas as pd
+from langgraph.prebuilt import create_react_agent
+from langgraph.checkpoint.memory import MemorySaver
+from langchain_mistralai import ChatMistralAI
+from tools import (
+    load_scopus_csv,
+    run_bertopic_discovery,
+    label_topics_with_llm,
+    consolidate_into_themes,
+    compare_with_taxonomy,
+    generate_comparison_csv,
+    export_narrative,
+)
+# Chat model shared by the agent. The API key is read from the MISTRAL_API_KEY
+# environment variable; an empty string is passed when it is unset.
+llm = ChatMistralAI(
+    model="mistral-large-latest",
+    temperature=0.2,
+    api_key=os.environ.get("MISTRAL_API_KEY", ""),
+)
+# LangGraph checkpointer handed to create_react_agent below and queried by
+# get_agent_state().
+memory = MemorySaver()
+SYSTEM_PROMPT = """
+You are an expert computational thematic analysis agent. You follow Braun & Clarke (2006)
+six-phase thematic analysis methodology, adapted for computational corpus analysis using
+BERTopic with sentence-transformer embeddings and agglomerative clustering.
+1. load_scopus_csv(file_path: str)
+   → Load the CSV. Count papers, abstract sentences, title sentences.
+   → Strip boilerplate text from abstracts.
+   → Saves cleaned_data.json to outputs/.
+   → Input: absolute file path string.
+2. run_bertopic_discovery(run_config: str)
+   → Embeds sentences using all-MiniLM-L6-v2.
+   → Clusters with AgglomerativeClustering (cosine, threshold=0.7).
+   → Extracts 5 nearest evidence sentences per cluster.
+   → Saves summaries_{tag}.json, embeddings_{tag}.npy, and 2 chart HTML files.
+   → Input JSON: {"columns": ["Abstract"]} or {"columns": ["Title"]}
+   → Run TWICE: once for Abstract (tag=abstract), once for Title (tag=title).
+3. label_topics_with_llm(labelling_input: str)
+   → You (the LLM) read the top_sentences for each cluster from summaries_{tag}.json,
+     then SELF-SUPPLY the llm_labels list with your best label, category,
+     confidence (0–1), and reasoning for each cluster.
+   → Input JSON: {
+       "tag": "abstract",
+       "llm_labels": [
+         {"cluster_id": 0, "label": "AI in Healthcare", "category": "Applied AI",
+          "confidence": 0.92, "reasoning": "Sentences discuss medical diagnostics..."},
+         ...
+       ]
+     }
+4. consolidate_into_themes(consolidation_input: str)
+   → Applies user approvals from the Review Table.
+   → Merges approved clusters into final themes with final labels.
+   → Saves themes_{tag}.json and chart_keywords.html.
+   → Input JSON: {
+       "tag": "abstract",
+       "approvals": [
+         {"cluster_id": 0, "approved": true, "rename_to": "AI in Medicine",
+          "reasoning": "Covers core domain"},
+         ...
+       ]
+     }
+5. compare_with_taxonomy(taxonomy_input: str)
+   → Maps each final theme to the PAJAIS taxonomy.
+   → Marks each theme as MAPPED or NOVEL.
+   → You self-supply the mappings list.
+   → Input JSON: {
+       "tag": "abstract",
+       "mappings": [
+         {"final_label": "AI in Medicine", "pajais_category": "Healthcare IS",
+          "mapped": true},
+         ...
+       ]
+     }
+6. generate_comparison_csv(comparison_input: str)
+   → Generates side-by-side CSV and Plotly chart comparing abstract vs title themes.
+   → Input JSON: {"tags": ["abstract", "title"]}
+7. export_narrative(narrative_input: str)
+   → You write the ~500-word Section 7 narrative yourself.
+   → Input JSON: {
+       "tag": "abstract",
+       "narrative": "...(your 500-word narrative here)...",
+       "researcher_name": "..."
+     }
+════════════════════════════════════════════════════════════════
+RUN CONFIGURATIONS
+════════════════════════════════════════════════════════════════
+• Abstract run: columns = ["Abstract"]   → tag = "abstract"
+• Title run:    columns = ["Title"]      → tag = "title"
+Always run BERTopic for BOTH configurations before Phase 3.
+════════════════════════════════════════════════════════════════
+BRAUN & CLARKE 6-PHASE WORKFLOW
+════════════════════════════════════════════════════════════════
+PHASE 1 — FAMILIARISATION
+  Goal: Understand the dataset.
+  Action:
+    1. Call load_scopus_csv(file_path) with the uploaded file path.
+    2. Report: total papers, abstract sentences, title sentences, column list.
+    3. Show 5 sample titles.
+  STOP after Phase 1. Say:
+  "✅ Phase 1 complete. Familiarisation done. Say 'Start Phase 2' to begin coding."
+──────────────────────────────────────────────────────────────
+PHASE 2 — INITIAL CODING
+  Goal: Generate initial semantic codes (clusters) from the corpus.
+  Actions:
+    1. Call run_bertopic_discovery({"columns": ["Abstract"]})
+    2. Call run_bertopic_discovery({"columns": ["Title"]})
+    3. Read outputs/summaries_abstract.json — list ALL cluster IDs and their top 2 sentences.
+    4. Analyse each cluster's top_sentences yourself.
+    5. Call label_topics_with_llm with your self-generated labels for the ABSTRACT run.
+    6. Call label_topics_with_llm with your self-generated labels for the TITLE run.
+    7. Build and present a REVIEW TABLE for the user (for abstract clusters):
+       Columns: [#, Topic Label, Top Evidence, Sentences, Papers, Approve, Rename To, Reasoning]
+       Fill Approve=True for confident clusters, Approve=False for weak/duplicate ones.
+  *** STOP GATE AFTER PHASE 2 ***
+  Say: "⏸️ STOP — Phase 2 complete. Review the table above.
+  Edit Approve/Rename To/Reasoning columns, then click Submit Review to proceed to Phase 3."
+──────────────────────────────────────────────────────────────
+PHASE 3 — SEARCHING FOR THEMES
+  Goal: Group related codes into broader themes.
+  Trigger: User submits the review table (message begins with [REVIEW_TABLE_SUBMITTED]).
+  Actions:
+    1. Parse the JSON review table from the user's message.
+    2. Call consolidate_into_themes with the parsed approvals for "abstract".
+    3. Call consolidate_into_themes with approvals for "title" (approve all by default).
+    4. Report the final theme list with counts.
+  *** STOP GATE AFTER PHASE 3 ***
+  Say: "⏸️ STOP — Phase 3 complete. [N] themes consolidated.
+  Review the theme list above. Say 'Proceed to Phase 4' when satisfied."
+──────────────────────────────────────────────────────────────
+PHASE 4 — REVIEWING THEMES
+  Goal: Theoretical saturation check.
+  Actions:
+    1. Analyse theme sizes and sentence counts.
+    2. Flag any theme with fewer than 3 sentences as POTENTIALLY WEAK.
+    3. Flag any two themes sharing >60% of their top keywords as POTENTIALLY OVERLAPPING.
+    4. Report saturation status: SATURATED or REQUIRES REVISION.
+    5. Recommend merges or splits if needed.
+  *** STOP GATE AFTER PHASE 4 ***
+  Say: "⏸️ STOP — Phase 4 complete. Saturation analysis done.
+  Say 'Proceed to Phase 5' to finalise theme names."
+──────────────────────────────────────────────────────────────
+PHASE 5 — DEFINING AND NAMING THEMES
+  Goal: Finalize descriptive theme names and definitions.
+  Actions:
+    1. For each theme, write a 1-sentence definition.
+    2. Present final theme names and definitions in a clean table.
+    3. Confirm with user.
+  (No STOP gate — flows directly into Phase 5.5)
+──────────────────────────────────────────────────────────────
+PHASE 5.5 — PAJAIS TAXONOMY MAPPING
+  Goal: Position themes within the IS research landscape.
+  Actions:
+    1. Call compare_with_taxonomy for the abstract run — self-supply your mappings.
+    2. Call compare_with_taxonomy for the title run — self-supply your mappings.
+    3. Present a table: Theme | PAJAIS Category | Status (MAPPED/NOVEL).
+  *** STOP GATE AFTER PHASE 5.5 ***
+  Say: "⏸️ STOP — Phase 5.5 complete. PAJAIS mapping done.
+  Say 'Generate Final Report' to proceed to Phase 6."
+──────────────────────────────────────────────────────────────
+PHASE 6 — WRITING UP (REPORT)
+  Goal: Generate the final deliverables.
+  Actions:
+    1. Call generate_comparison_csv({"tags": ["abstract", "title"]})
+    2. Write a ~500-word academic narrative (Section 7) covering:
+       - Research context
+       - Summary of each theme with evidence
+       - Comparison of abstract vs title themes
+       - PAJAIS taxonomy positioning
+       - Implications for IS research
+    3. Call export_narrative with your narrative text.
+    4. Tell the user: outputs are in the outputs/ folder, click Refresh Downloads.
+════════════════════════════════════════════════════════════════
+STRICT BEHAVIOURAL RULES
+════════════════════════════════════════════════════════════════
+• ONE PHASE PER MESSAGE. Never jump ahead.
+• At each STOP gate, wait for explicit user confirmation before proceeding.
+• Never skip a phase.
+• Always self-supply data for label_topics_with_llm, compare_with_taxonomy,
+  and export_narrative — do not ask the user for these.
+• When the user submits a review table ([REVIEW_TABLE_SUBMITTED]), parse it
+  and call consolidate_into_themes immediately.
+• Be concise. Avoid repeating instructions.
+• If a tool returns an error, report it clearly and ask the user how to proceed.
+• Keep all intermediate files in the outputs/ directory.
+════════════════════════════════════════════════════════════════
+PHASE PROGRESS HTML FORMAT
+════════════════════════════════════════════════════════════════
+After completing each phase, include in your response:
+[PHASE_PROGRESS: P1=done, P2=done, P3=pending, P4=pending, P5=pending, P5.5=pending, P6=pending]
+(Replace 'done'/'pending' accurately for the current state.)
+"""
+# ─── Agent ────────────────────────────────────────────────────────────────────
+# Tools exposed to the agent, listed in the same order the system prompt
+# numbers them (1-7).
+tools_list = [
+    load_scopus_csv,
+    run_bertopic_discovery,
+    label_topics_with_llm,
+    consolidate_into_themes,
+    compare_with_taxonomy,
+    generate_comparison_csv,
+    export_narrative,
+]
+# ReAct agent built from the Mistral model, the tool list, the in-memory
+# checkpointer and SYSTEM_PROMPT.
+agent = create_react_agent(
+    model=llm,
+    tools=tools_list,
+    checkpointer=memory,
+    prompt=SYSTEM_PROMPT,
+)
+# ─── Helpers for app.py ───────────────────────────────────────────────────────
+def _parse_phase_progress(text: str) -> "str | tuple[str, str]":
+    """
+    Render the agent's [PHASE_PROGRESS: ...] tag as an HTML progress bar.
+    The return shape depends on whether the tag is present, and callers
+    in this file rely on that (they test the result with isinstance):
+      * no tag    -> a single HTML string with every phase shown as pending;
+      * tag found -> a (html, clean_text) tuple, where clean_text is `text`
+                     with every tag removed and outer whitespace stripped.
+    Parameters
+    ----------
+    text : str
+        Agent reply that may contain a tag such as
+        "[PHASE_PROGRESS: P1=done, P2=pending, ...]".
+    """
+    tag_re = re.compile(r"\[PHASE_PROGRESS:(.*?)\]", re.DOTALL)
+    labels = ["P1", "P2", "P3", "P4", "P5", "P5.5", "P6"]
+    status_map = {
+        "done":    ("✅", "#22c55e"),
+        "pending": ("⬜", "#94a3b8"),
+        "active":  ("🔄", "#3b82f6"),
+    }
+    found = tag_re.search(text)
+    if found is None:
+        spans = " ".join(
+            f"<span style='margin-left:8px'>⬜ {name}</span>" for name in labels
+        )
+        return (
+            "<div style='padding:10px;background:#f0f4ff;border-radius:8px'>"
+            "<b>Phase Progress:</b> " + spans + "</div>"
+        )
+    # Collect "label=status" pairs; a chunk that does not split into exactly
+    # two parts on "=" is ignored.
+    state = {}
+    for chunk in found.group(1).split(","):
+        pieces = chunk.strip().split("=")
+        if len(pieces) == 2:
+            state[pieces[0].strip()] = pieces[1].strip()
+    badges = []
+    for name in labels:
+        # Unknown labels default to "pending"; unknown statuses get the
+        # pending icon and colour.
+        icon, color = status_map.get(state.get(name, "pending"), ("⬜", "#94a3b8"))
+        badges.append(
+            f"<span style='margin-left:8px;color:{color};font-weight:600'>"
+            f"{icon} {name}</span>"
+        )
+    html = (
+        "<div style='padding:10px;background:#f0f4ff;border-radius:8px;"
+        "font-family:sans-serif'>"
+        f"<b>Phase Progress:</b>{''.join(badges)}</div>"
+    )
+    clean = tag_re.sub("", text).strip()
+    return html, clean
+def _build_review_table(agent_text: str) -> list:
+    """
+    Parse the review table in an agent reply into a list of row dicts
+    for the Gradio Dataframe review table.
+    A markdown table is preferred: the first line that starts with '|' and
+    contains both '#' and 'Topic' is the header, an optional '|---|' row is
+    skipped, and the following '|' lines are the data. Otherwise the first
+    line containing '#' and 'Topic' is read as a tab / multi-space delimited
+    header and every later line as data.
+    Returns [] when no header line is found. Each row carries the keys
+    '#', 'Topic Label', 'Top Evidence', 'Sentences', 'Papers', 'Approve'
+    (bool), 'Rename To' and 'Reasoning'; 'Cluster ID' is added only when the
+    header has a matching column.
+    """
+    lines = agent_text.splitlines()
+    gap_re = re.compile(r"\t| {2,}")
+    def _split_cells(line):
+        # Markdown rows split on '|'; anything else on tabs / runs of spaces.
+        stripped = line.strip()
+        if stripped.startswith("|"):
+            return [c.strip() for c in stripped.strip("|").split("|")]
+        return gap_re.split(stripped)
+    def _find_header(require_pipe):
+        for i, ln in enumerate(lines):
+            if require_pipe and not ln.strip().startswith("|"):
+                continue
+            if "#" in ln and "Topic" in ln:
+                return i
+        return None
+    header_idx = _find_header(require_pipe=True)
+    if header_idx is not None:
+        j = header_idx + 1
+        # Skip a possible separator row like |---|.
+        if j < len(lines) and re.match(r"^\|[-\s:|]+\|$", lines[j].strip()):
+            j += 1
+        data_lines = []
+        while j < len(lines) and lines[j].strip().startswith("|"):
+            data_lines.append(lines[j])
+            j += 1
+    else:
+        header_idx = _find_header(require_pipe=False)
+        if header_idx is None:
+            return []
+        data_lines = lines[header_idx + 1:]
+    header_cells = _split_cells(lines[header_idx])
+    # (row key, substrings that must all occur in the lower-cased header).
+    # Order matters: the first matching rule wins for a given header cell.
+    rules = (
+        ("#", ("#",)),
+        ("Cluster ID", ("cluster", "id")),
+        ("Topic Label", ("topic", "label")),
+        ("Top Evidence", ("evidence",)),
+        ("Sentences", ("sentence",)),
+        ("Papers", ("paper",)),
+        ("Approve", ("approve",)),
+        ("Rename To", ("rename",)),
+        ("Reasoning", ("reason",)),
+    )
+    header_map = {}
+    for idx, cell in enumerate(header_cells):
+        lowered = cell.lower()
+        for name, needles in rules:
+            if all(needle in lowered for needle in needles):
+                header_map[name] = idx
+                break
+    truthy = ("true", "yes", "✅", "1", "y", "approve")
+    rows = []
+    for ln in data_lines:
+        cells = _split_cells(ln)
+        if len(cells) < 2:
+            continue
+        row = {"#": "", "Topic Label": "", "Top Evidence": "", "Sentences": "",
+               "Papers": "", "Approve": False, "Rename To": "", "Reasoning": ""}
+        for name, idx in header_map.items():
+            # Short rows simply yield "" for the missing trailing cells.
+            value = cells[idx] if idx < len(cells) else ""
+            if name == "Approve":
+                row[name] = value.lower() in truthy
+            elif name == "#":
+                row[name] = value or cells[0]
+            elif name == "Topic Label":
+                row[name] = value or cells[1]
+            else:
+                row[name] = value
+        rows.append(row)
+    return rows
+def get_agent_state(thread_id: str) -> dict:
+    """Return what the checkpointer holds for `thread_id`, or {} when that is falsy."""
+    snapshot = memory.get({"configurable": {"thread_id": thread_id}})
+    if snapshot:
+        return snapshot
+    return {}
+_ABSTRACT_SUMMARIES_PATH = "outputs/summaries_abstract.json"
+def _phase_bar_html(text: str) -> str:
+    """Return only the HTML part of _parse_phase_progress(text), which yields a tuple when a tag is present."""
+    parsed = _parse_phase_progress(text)
+    return parsed[0] if isinstance(parsed, tuple) else parsed
+def _load_abstract_summaries() -> list:
+    """Best-effort read of the abstract summaries file: [] when it is missing, unreadable or not a JSON list; non-dict entries are dropped."""
+    try:
+        with open(_ABSTRACT_SUMMARIES_PATH, encoding="utf-8") as f:
+            data = json.load(f)
+    except (OSError, ValueError):
+        # ValueError covers both malformed JSON and undecodable bytes.
+        return []
+    if not isinstance(data, list):
+        return []
+    return [entry for entry in data if isinstance(entry, dict)]
+def run_agent(user_message: str, context: dict, chat_history: list):
+    """
+    Invoke the agent with a user message and return:
+      (response_text, review_table_data, phase_bar_html)
+    Parameters
+    ----------
+    user_message : str
+        The user's message or [REVIEW_TABLE_SUBMITTED] payload.
+    context : dict
+        Read for 'file_path' (default "") and 'thread_id'
+        (default "thread-001").
+    chat_history : list
+        Accepted for the UI handlers' call signature; it is not read here.
+        The agent is invoked with the thread_id and the checkpointer.
+    """
+    file_path = context.get("file_path", "")
+    thread_id = context.get("thread_id", "thread-001")
+    # Shortcut: when Phase 2 summaries already exist, build the review table
+    # straight from them and skip the LLM call.
+    if user_message.strip().lower().startswith("start phase 2"):
+        summaries = _load_abstract_summaries()
+        if summaries:
+            # Largest clusters first, capped at 20 rows.
+            top = sorted(summaries, key=lambda s: s.get("size", 0), reverse=True)[:20]
+            md_lines = [
+                "| # | Cluster ID | Topic Label | Top Evidence | Sentences | Papers | Approve | Rename To | Reasoning |",
+                "|---|------------|-------------|--------------|-----------|--------|---------|-----------|-----------|",
+            ]
+            for i, s in enumerate(top, start=1):
+                top_ev = "; ".join(s.get("top_sentences", [])[:2])
+                md_lines.append(
+                    f"| {i} | {s.get('cluster_id')} | {s.get('label','')} | {top_ev} | {s.get('size',0)} | {len(s.get('papers',[]))} | ✅ |  |  |"
+                )
+            md_table = "\n".join(md_lines)
+            phase_html = _phase_bar_html(
+                "[PHASE_PROGRESS: P1=done, P2=done, P3=pending, P4=pending, P5=pending, P5.5=pending, P6=pending]"
+            )
+            return md_table, _build_review_table(md_table), phase_html
+        # No summaries yet: fall through to the agent. The system prompt tells
+        # the user to say 'Start Phase 2' to begin coding, and Phase 2 is the
+        # step that runs discovery, so refusing here would block the workflow.
+    if not os.environ.get("MISTRAL_API_KEY"):
+        return (
+            "Mistral API key is missing. Set the `MISTRAL_API_KEY` environment variable, "
+            "restart the app, and then try again.",
+            [],
+            _phase_bar_html(""),
+        )
+    # Prepend file path hint if present
+    full_message = (
+        f"[FILE_PATH: {file_path}]\n{user_message}"
+        if file_path
+        else user_message
+    )
+    config = {"configurable": {"thread_id": thread_id}}
+    try:
+        response = agent.invoke({"messages": [("human", full_message)]}, config=config)
+        ai_text = response["messages"][-1].content
+    except Exception as exc:
+        # Boundary with the UI: report any agent failure in the chat.
+        return (
+            f"Agent execution failed: {exc}",
+            [],
+            _phase_bar_html(""),
+        )
+    # Split the reply into the progress bar and the text without the tag.
+    parsed = _parse_phase_progress(ai_text)
+    if isinstance(parsed, tuple):
+        phase_html, clean_text = parsed
+    else:
+        phase_html, clean_text = parsed, ai_text
+    review_data = _build_review_table(clean_text)
+    # Fallback: if the agent emitted no review table but summaries exist,
+    # populate the table from them so the UI has something to review.
+    if not review_data:
+        try:
+            review_data = [
+                {
+                    "#": s.get("cluster_id", ""),
+                    "Topic Label": s.get("label", ""),
+                    "Top Evidence": ("; ").join(s.get("top_sentences", [])[:2]),
+                    "Sentences": s.get("size", 0),
+                    "Papers": len(s.get("papers", [])),
+                    "Approve": False,
+                    "Rename To": "",
+                    "Reasoning": "",
+                }
+                for s in _load_abstract_summaries()
+            ]
+        except (AttributeError, TypeError):
+            # Unexpected field types in the summaries file; keep the table empty.
+            review_data = []
+    return clean_text, review_data, phase_html

app.py ADDED Viewed

	@@ -0,0 +1,259 @@

+"""
+app.py — Gradio UI for BERTopic Agentic Thematic Analysis
+"""
+import gradio as gr
+import pandas as pd
+from agent import run_agent
+def format_chat_history(history):
+    """Return `history` unchanged (compatibility shim)."""
+    # Normalisation is done inside the message handlers, not here.
+    return history
+_REVIEW_COLUMNS = ["#", "Topic Label", "Top Evidence", "Sentences", "Papers",
+                   "Approve", "Rename To", "Reasoning"]
+def _history_to_pairs(hist):
+    """Collapse Gradio message dicts into (user, assistant) pairs; other history shapes are returned as-is."""
+    if not hist:
+        return []
+    if not isinstance(hist[0], dict):
+        return hist
+    pairs = []
+    i = 0
+    while i < len(hist) - 1:
+        first, second = hist[i], hist[i + 1]
+        if first.get("role", "") in ("user", "human") and second.get("role", "") in ("assistant", "ai"):
+            pairs.append((first.get("content", ""), second.get("content", "")))
+            i += 2
+        else:
+            # Not a user->assistant pair: slide the window by one message.
+            i += 1
+    return pairs
+def _pairs_to_messages(pairs):
+    """Expand (user, assistant) pairs into Gradio role/content message dicts."""
+    messages = []
+    for turn in pairs or []:
+        if isinstance(turn, (list, tuple)) and len(turn) >= 2:
+            messages.append({"role": "user", "content": turn[0]})
+            messages.append({"role": "assistant", "content": turn[1]})
+    return messages
+def _extend_chat(chat_history, pairs, user_text, assistant_text):
+    """Return a new message list: the prior conversation plus one user/assistant exchange."""
+    if isinstance(chat_history, list) and chat_history and isinstance(chat_history[0], dict):
+        new_chat = chat_history.copy()
+    else:
+        new_chat = _pairs_to_messages(pairs)
+    new_chat.append({"role": "user", "content": user_text})
+    new_chat.append({"role": "assistant", "content": assistant_text})
+    return new_chat
+def send_message(user_message, chat_history, file_path, thread_id):
+    """
+    Forward a user message to the agent.
+    Returns (chat, cleared_input, review_dataframe, phase_html). A blank or
+    missing message leaves the chat untouched and returns gr.update() for
+    the review table and the phase bar.
+    """
+    if not (user_message or "").strip():
+        return chat_history, "", gr.update(), gr.update()
+    agent_chat_history = _history_to_pairs(chat_history)
+    context = {"file_path": file_path, "thread_id": thread_id}
+    response, review_data, phase_html = run_agent(user_message, context, agent_chat_history)
+    new_chat = _extend_chat(chat_history, agent_chat_history, user_message, response)
+    # With no rows, show an empty table that still has the expected columns.
+    review_df = pd.DataFrame(review_data) if review_data else pd.DataFrame(columns=_REVIEW_COLUMNS)
+    return new_chat, "", review_df, phase_html
+def submit_review(review_df, chat_history, file_path, thread_id):
+    """
+    Send the edited review table back to the agent.
+    The table is serialised as JSON records behind a [REVIEW_TABLE_SUBMITTED]
+    marker. Returns (chat, review_dataframe, phase_html); the submitted
+    table is kept when the agent returns no new rows.
+    """
+    table_json = review_df.to_json(orient="records")
+    review_message = f"[REVIEW_TABLE_SUBMITTED]\n{table_json}"
+    context = {"file_path": file_path, "thread_id": thread_id}
+    agent_chat_history = _history_to_pairs(chat_history)
+    response, new_review_data, phase_html = run_agent(review_message, context, agent_chat_history)
+    # The raw JSON payload is not echoed into the chat; a short marker is.
+    new_chat = _extend_chat(chat_history, agent_chat_history, "(Review table submitted)", response)
+    new_df = pd.DataFrame(new_review_data) if new_review_data else review_df
+    return new_chat, new_df, phase_html
+def get_download_files():
+    """Return the CSV, JSON and TXT paths directly under outputs/ (in that order), or None when there are none."""
+    import glob
+    patterns = ("outputs/*.csv", "outputs/*.json", "outputs/*.txt")
+    files = [path for pattern in patterns for path in glob.glob(pattern)]
+    return files or None
+with gr.Blocks(title="BERTopic Agentic Thematic Analysis") as demo:
+    thread_id_state = gr.State("thread-001")
+    uploaded_path_state = gr.State(None)
+    gr.Markdown(
+        "# 🔬 BERTopic Agentic Thematic Analysis\n"
+        "Upload your Scopus CSV and follow the agent through Braun & Clarke's 6 phases."
+    )
+    phase_bar = gr.HTML(
+        value="""
+        <div style='padding:10px;background:#f0f4ff;border-radius:8px;font-family:sans-serif'>
+          <b>Phase Progress:</b>
+          <span style='margin-left:12px'>⬜ P1</span>
+          <span style='margin-left:8px'>⬜ P2</span>
+          <span style='margin-left:8px'>⬜ P3</span>
+          <span style='margin-left:8px'>⬜ P4</span>
+          <span style='margin-left:8px'>⬜ P5</span>
+          <span style='margin-left:8px'>⬜ P5.5</span>
+          <span style='margin-left:8px'>⬜ P6</span>
+        </div>
+        """,
+        label="Phase Tracker"
+    )
+    with gr.Group():
+        gr.Markdown("## 📁 Section 1: Upload Scopus CSV")
+        csv_upload = gr.File(
+            label="Upload Scopus CSV",
+            file_types=[".csv"],
+            type="filepath"
+        )
+        upload_status = gr.Textbox(label="Upload Status", interactive=False)
+        def handle_upload(filepath):
+            """Return (status message, path) for the upload widget; the path is None when nothing was uploaded."""
+            if filepath is not None:
+                return f"✅ File loaded: {filepath}", filepath
+            return "No file uploaded.", None
+        csv_upload.change(
+            fn=handle_upload,
+            inputs=[csv_upload],
+            outputs=[upload_status, uploaded_path_state]
+        )
+    with gr.Group():
+        gr.Markdown("## 💬 Section 2: Agent Chat")
+        gr.Markdown(
+            "_Start with:_ **'Start Phase 1'** to begin familiarisation, "
+            "then follow the agent's instructions phase by phase."
+        )
+        chatbot = gr.Chatbot(height=420, label="Agent Conversation")
+        with gr.Row():
+            user_input = gr.Textbox(
+                placeholder="Type your message or command here...",
+                label="Your Message",
+                scale=5
+            )
+            send_btn = gr.Button("Send ▶", variant="primary", scale=1)
+    with gr.Group():
+        gr.Markdown("## 📊 Section 3: Results")
+        # Review Table
+        gr.Markdown("### 🗂️ Topic Review Table")
+        gr.Markdown(
+            "Edit the **Approve** (True/False), **Rename To**, and **Reasoning** columns, "
+            "then click **Submit Review** to proceed."
+        )
+        review_table = gr.Dataframe(
+            headers=["#", "Topic Label", "Top Evidence", "Sentences",
+                     "Papers", "Approve", "Rename To", "Reasoning"],
+            datatype=["number", "str", "str", "number", "number", "bool", "str", "str"],
+            interactive=True,
+            label="Review Table",
+            wrap=True,
+            row_count=(5, "dynamic"),
+            column_count=(8, "fixed")
+        )
+        submit_review_btn = gr.Button("✅ Submit Review", variant="secondary")
+        gr.Markdown("### 📈 Topic Charts")
+        with gr.Row():
+            chart_selector = gr.Dropdown(
+                choices=["Topic Distribution", "Similarity Heatmap",
+                         "Top Keywords per Topic", "Abstract vs Title Comparison"],
+                label="Select Chart",
+                value="Topic Distribution"
+            )
+        chart_display = gr.HTML(label="Chart")
+        def load_chart(chart_name):
+            """Return an iframe embedding the saved chart HTML for `chart_name`, or a grey placeholder when no file exists."""
+            import os
+            import html as _html
+            chart_files = {
+                "Topic Distribution":         "outputs/chart_distribution.html",
+                "Similarity Heatmap":          "outputs/chart_heatmap.html",
+                "Top Keywords per Topic":      "outputs/chart_keywords.html",
+                "Abstract vs Title Comparison":"outputs/chart_comparison.html",
+            }
+            # Unknown chart names map to "", which never exists on disk.
+            chart_path = chart_files.get(chart_name, "")
+            if not os.path.exists(chart_path):
+                return "<p style='color:grey'>Chart not yet generated. Complete the relevant phase first.</p>"
+            with open(chart_path, "r", encoding="utf-8") as fh:
+                document = fh.read()
+            # The whole document goes into the iframe's srcdoc attribute, so
+            # it is HTML-escaped (quotes included) to stay inside the attribute.
+            escaped = _html.escape(document, quote=True)
+            return f"<iframe srcdoc=\"{escaped}\" style=\"border:0; width:100%; height:700px;\"></iframe>"
+        chart_selector.change(fn=load_chart, inputs=[chart_selector], outputs=[chart_display])
+        gr.Markdown("### 📥 Download Outputs")
+        download_btn = gr.Button("🔄 Refresh Download List")
+        download_files = gr.File(label="Available Output Files", file_count="multiple")
+        download_btn.click(fn=get_download_files, inputs=[], outputs=[download_files])
+    send_btn.click(
+        fn=send_message,
+        inputs=[user_input, chatbot, uploaded_path_state, thread_id_state],
+        outputs=[chatbot, user_input, review_table, phase_bar]
+    )
+    user_input.submit(
+        fn=send_message,
+        inputs=[user_input, chatbot, uploaded_path_state, thread_id_state],
+        outputs=[chatbot, user_input, review_table, phase_bar]
+    )
+    submit_review_btn.click(
+        fn=submit_review,
+        inputs=[review_table, chatbot, uploaded_path_state, thread_id_state],
+        outputs=[chatbot, review_table, phase_bar]
+    )
+# Start the Gradio server only when this file is run as a script.
+if __name__ == "__main__":
+    demo.launch(
+        share=False,
+        server_name="0.0.0.0",  # listen on all network interfaces
+        server_port=7860,
+        theme=gr.themes.Soft(),
+    )

requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+gradio
+langchain-core
+langchain-mistralai
+langgraph
+sentence-transformers
+scikit-learn
+bertopic
+plotly
+numpy
+pandas
+hdbscan
+umap-learn
+pynndescent

tools.py ADDED Viewed

	@@ -0,0 +1,544 @@

+"""
+tools.py — 7 Stateless LangChain Tools for BERTopic Agentic Thematic Analysis
+All tools are decorated with @tool and use handle_tool_error=True.
+No if/elif/else, no for/while loops, no try/except blocks.
+"""
+import json
+import os
+import re
+import numpy as np
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+from langchain_core.tools import tool
+from sentence_transformers import SentenceTransformer
+from sklearn.cluster import AgglomerativeClustering
+from sklearn.preprocessing import normalize
+from sklearn.metrics.pairwise import cosine_similarity
+# Create the output directory at import time; the tools in this module write
+# their JSON, NumPy, CSV, HTML and text artefacts into it.
+os.makedirs("outputs", exist_ok=True)
+# Regex fragments removed by _clean_text. They are OR-ed into one pattern and
+# applied case-insensitively to already lower-cased text.
+BOILERPLATE_PATTERNS = [
+    r"©\s*\d{4}.*",  # copyright sign + year, through the end of that line
+    r"all rights reserved.*",  # rights notice, through the end of that line
+    r"published by elsevier.*",  # publisher notice, through the end of that line
+    r"this paper (proposes|presents|investigates|aims)",  # stock opening phrase only
+    r"in this (paper|study|article|work)",  # stock framing phrase only
+    r"the purpose of this (paper|study)",  # stock framing phrase only
+]
+def _clean_text(text: str) -> str:
+    """Remove boilerplate from a single text string."""
+    text = str(text).lower().strip()
+    cleaned = re.sub("|".join(BOILERPLATE_PATTERNS), "", text, flags=re.IGNORECASE)
+    return cleaned.strip()
+def _split_sentences(text: str) -> list:
+    """Split text into sentences on '. ', '? ', '! '."""
+    raw = re.split(r"(?<=[.!?])\s+", str(text).strip())
+    return list(filter(lambda s: len(s.split()) > 4, raw))
+@tool
+def load_scopus_csv(file_path: str) -> str:
+    """
+    Load a Scopus CSV file and return a summary of its contents.
+    Args:
+        file_path: Absolute or relative path to the Scopus CSV file.
+    Returns:
+        JSON string with keys: papers, abstract_sentences, title_sentences,
+        columns, sample_titles, status.
+    """
+    # NOTE: the docstring above is kept verbatim because @tool exposes it to
+    # the agent as the tool description.
+    required = ["Title", "Abstract"]
+    frame = pd.read_csv(file_path, encoding="utf-8-sig")
+    absent = [name for name in required if name not in frame.columns]
+    if absent:
+        # Report the problem as a JSON payload instead of raising.
+        failure = {
+            "status": "error",
+            "message": f"Missing required columns: {', '.join(absent)}",
+            "columns": list(frame.columns),
+        }
+        return json.dumps(failure, indent=2)
+    # Drop rows where either required field is missing (NaN/None).
+    has_both = frame["Title"].notna() & frame["Abstract"].notna()
+    frame = frame[has_both].reset_index(drop=True)
+    frame["Abstract_Clean"] = frame["Abstract"].map(_clean_text)
+    frame["Title_Clean"] = frame["Title"].map(_clean_text)
+    # Sentence totals are counted on the cleaned text.
+    abstract_total = sum(len(_split_sentences(text)) for text in frame["Abstract_Clean"])
+    title_total = sum(len(_split_sentences(text)) for text in frame["Title_Clean"])
+    # Persist the cleaned frame; run_bertopic_discovery reads this file.
+    frame.to_json("outputs/cleaned_data.json", orient="records", indent=2)
+    summary = {
+        "status": "loaded",
+        "papers": int(len(frame)),
+        "abstract_sentences": int(abstract_total),
+        "title_sentences": int(title_total),
+        "columns": list(frame.columns),
+        "sample_titles": list(frame["Title"].head(5)),
+    }
+    return json.dumps(summary, indent=2)
+@tool
+def run_bertopic_discovery(run_config: str) -> str:
+    """
+    Embed sentences, cluster with AgglomerativeClustering (cosine distance,
+    average linkage, distance_threshold=0.3, i.e. roughly similarity 0.7),
+    extract top-5 evidence sentences per cluster, generate Plotly charts, and
+    save summaries_<tag>.json and embeddings_<tag>.npy in outputs/
+    (<tag> is the lower-cased column names joined by '_').
+    Args:
+        run_config: JSON string with key 'columns' — list of column names to use,
+                    e.g. '{"columns": ["Abstract"]}' or '{"columns": ["Title"]}'
+                    or '{"columns": ["Abstract", "Title"]}'.
+    Returns:
+        JSON string summarising clusters found.
+    """
+    config = json.loads(run_config)
+    columns = config.get("columns", ["Abstract"])
+    tag = "_".join(columns).lower()
+    summaries_path = f"outputs/summaries_{tag}.json"
+    cleaned_data_path = "outputs/cleaned_data.json"
+    if not os.path.exists(cleaned_data_path):
+        return json.dumps({
+            "status": "error",
+            "message": "Cleaned data file not found. Run load_scopus_csv first.",
+        }, indent=2)
+    df = pd.read_json(cleaned_data_path)
+    # Requested raw columns are redirected to their cleaned counterparts.
+    col_map = {"Abstract": "Abstract_Clean", "Title": "Title_Clean"}
+    use_cols = [col_map.get(c, c) for c in columns]
+    missing_columns = [c for c in use_cols if c not in df.columns]
+    if missing_columns:
+        return json.dumps({
+            "status": "error",
+            "message": f"Missing cleaned columns: {', '.join(missing_columns)}",
+            "available_columns": list(df.columns),
+        }, indent=2)
+    # One (sentence, paper_index) pair per sentence. Built in a single pass;
+    # repeatedly concatenating lists with sum(..., []) is quadratic.
+    all_pairs = [
+        (sentence, idx)
+        for idx, row in df.iterrows()
+        for sentence in _split_sentences(" ".join(str(row[c]) for c in use_cols))
+    ]
+    sentences = [pair[0] for pair in all_pairs]
+    paper_ids = [pair[1] for pair in all_pairs]
+    if not sentences:
+        # Nothing to embed: still write an (empty) summaries file so later
+        # phases find it.
+        with open(summaries_path, "w", encoding="utf-8") as f:
+            json.dump([], f, indent=2)
+        return json.dumps({
+            "status": "completed",
+            "tag": tag,
+            "n_clusters": 0,
+            "total_sentences": 0,
+            "summaries_file": summaries_path,
+            "message": "No sentences available after preprocessing.",
+        }, indent=2)
+    model = SentenceTransformer("all-MiniLM-L6-v2")
+    embeddings = model.encode(sentences, show_progress_bar=True, batch_size=64)
+    # Keep float32 and L2-normalise each row; the epsilon guards against a
+    # zero-length vector.
+    embeddings = np.asarray(embeddings, dtype=np.float32)
+    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
+    embeddings = (embeddings / (norms + 1e-12)).astype(np.float32, copy=False)
+    np.save(f"outputs/embeddings_{tag}.npy", embeddings)
+    if len(sentences) < 2:
+        # AgglomerativeClustering rejects fewer than two samples, so a single
+        # sentence becomes its own cluster instead of raising.
+        labels = np.zeros(len(sentences), dtype=int)
+    else:
+        clusterer = AgglomerativeClustering(
+            n_clusters=None,
+            metric="cosine",
+            linkage="average",
+            distance_threshold=0.3,  # cosine distance = 1 – similarity; 0.3 ≈ similarity 0.7
+        )
+        labels = np.asarray(clusterer.fit_predict(embeddings))
+    n_clusters = int(labels.max() + 1)
+    # Build the per-cluster summary and its centroid in the same pass so the
+    # membership mask and mean vector are computed only once.
+    summaries = []
+    centroid_rows = []
+    for cid in range(n_clusters):
+        mask = np.where(labels == cid)[0]
+        if mask.size == 0:
+            top_sents = []
+            top_pids = []
+            centroid_rows.append(np.zeros((embeddings.shape[1],), dtype=np.float32))
+        else:
+            vecs = embeddings[mask]
+            centroid = vecs.mean(axis=0, keepdims=True)
+            # Evidence = the five member sentences closest to the centroid.
+            sims = cosine_similarity(centroid, vecs)[0]
+            top5_idx = mask[np.argsort(sims)[::-1][:5]]
+            top_sents = [sentences[i] for i in top5_idx]
+            top_pids = sorted({int(paper_ids[i]) for i in top5_idx})
+            centroid_rows.append(centroid[0].astype(np.float32))
+        summaries.append({
+            "cluster_id":    cid,
+            "size":          int(mask.size),
+            "papers":        top_pids,
+            "top_sentences": top_sents,
+            "label":         f"Cluster_{cid}",
+            "approved":      False,
+            "rename_to":     "",
+            "reasoning":     "",
+        })
+    with open(summaries_path, "w", encoding="utf-8") as f:
+        json.dump(summaries, f, indent=2)
+    sizes = [s["size"] for s in summaries]
+    cids = [f"C{s['cluster_id']}" for s in summaries]
+    fig_dist = px.bar(x=cids, y=sizes, labels={"x": "Cluster", "y": "Sentences"},
+                      title=f"Topic Distribution ({tag})", color=sizes,
+                      color_continuous_scale="Blues")
+    fig_dist.write_html("outputs/chart_distribution.html")
+    centroids = np.vstack(centroid_rows).astype(np.float32)
+    # Avoid computing an enormous n_clusters x n_clusters heatmap which can OOM.
+    HEATMAP_MAX = 300
+    if centroids.shape[0] > HEATMAP_MAX:
+        with open("outputs/chart_heatmap.html", "w", encoding="utf-8") as f:
+            f.write(f"<p style='color:grey'>Heatmap skipped: {centroids.shape[0]} clusters exceeds safe limit ({HEATMAP_MAX}).</p>")
+    else:
+        sim_matrix = cosine_similarity(centroids)
+        fig_heat = go.Figure(go.Heatmap(z=sim_matrix, x=cids, y=cids,
+                                         colorscale="Viridis"))
+        fig_heat.update_layout(title=f"Cluster Similarity Heatmap ({tag})")
+        fig_heat.write_html("outputs/chart_heatmap.html")
+    return json.dumps({
+        "status": "completed",
+        "tag": tag,
+        "n_clusters": n_clusters,
+        "total_sentences": len(sentences),
+        "summaries_file": summaries_path,
+    }, indent=2)
+@tool
+def label_topics_with_llm(labelling_input: str) -> str:
+    """
+    Use the LLM to generate a human-readable label, category, confidence score,
+    and reasoning for each cluster based on its top evidence sentences.
+    Clusters not listed in 'llm_labels' keep their existing values, so labels
+    may be submitted in several batches.
+    Args:
+        labelling_input: JSON string with keys:
+            - 'tag': run tag (e.g. 'abstract' or 'title')
+            - 'llm_labels': list of dicts, each with keys
+              'cluster_id', 'label', 'category', 'confidence', 'reasoning'
+              as returned by the LLM's own analysis.
+    Returns:
+        JSON string confirming labels saved.
+    """
+    data = json.loads(labelling_input)
+    tag = data.get("tag", "abstract")
+    llm_labels = data.get("llm_labels", [])
+    summaries_path = f"outputs/summaries_{tag}.json"
+    with open(summaries_path) as f:
+        summaries = json.load(f)
+    label_map = {item["cluster_id"]: item for item in llm_labels}
+    updated = []
+    for summary in summaries:
+        update = label_map.get(summary["cluster_id"], {})
+        # Fall back to what is already stored, not to blanks: otherwise a
+        # later call that omits this cluster would erase its earlier labels.
+        updated.append({
+            **summary,
+            "label":      update.get("label",      summary["label"]),
+            "category":   update.get("category",   summary.get("category", "")),
+            "confidence": update.get("confidence", summary.get("confidence", 0.0)),
+            "reasoning":  update.get("reasoning",  summary.get("reasoning", "")),
+        })
+    with open(summaries_path, "w") as f:
+        json.dump(updated, f, indent=2)
+    # 'topics_labelled' is the number of cluster records written back, not
+    # the number of entries in 'llm_labels'.
+    return json.dumps({
+        "status":         "labelled",
+        "tag":            tag,
+        "topics_labelled": len(updated),
+    }, indent=2)
+# ══════════════════════════════════════════════════════════════════════════════
+# TOOL 4 — consolidate_into_themes
+# ══════════════════════════════════════════════════════════════════════════════
+@tool
+def consolidate_into_themes(consolidation_input: str) -> str:
+    """
+    Turn the clusters approved in the user review table into final themes,
+    applying any rename. Saves themes_<tag>.json and a keyword chart in
+    outputs/. Clusters renamed to the same name stay separate entries.
+    Args:
+        consolidation_input: JSON string with keys:
+            - 'tag': run tag
+            - 'approvals': list of dicts with keys
+              'cluster_id', 'approved' (bool), 'rename_to' (str), 'reasoning' (str)
+    Returns:
+        JSON string summarising final themes.
+    """
+    from collections import Counter
+
+    def _text(value):
+        # Review-table cells can arrive as null/NaN; treat any non-string as blank.
+        return value if isinstance(value, str) else ""
+
+    data = json.loads(consolidation_input)
+    tag = data.get("tag", "abstract")
+    approvals = data.get("approvals", [])
+    summaries_path = f"outputs/summaries_{tag}.json"
+    with open(summaries_path) as f:
+        summaries = json.load(f)
+    approval_map = {a["cluster_id"]: a for a in approvals}
+    themes = []
+    for summary in summaries:
+        approval = approval_map.get(summary["cluster_id"], {})
+        approved = approval.get("approved", False)
+        if not approved:
+            # Clusters without an approval entry are treated as rejected.
+            continue
+        rename_to = _text(approval.get("rename_to"))
+        themes.append({
+            **summary,
+            "approved":  approved,
+            "rename_to": rename_to,
+            # Keep the stored reasoning when the reviewer supplied none.
+            "reasoning": _text(approval.get("reasoning")) or summary.get("reasoning", ""),
+            # A non-blank rename wins; otherwise the current label is final.
+            "final_label": rename_to.strip() or summary["label"],
+        })
+    with open(f"outputs/themes_{tag}.json", "w") as f:
+        json.dump(themes, f, indent=2)
+    # ── Keyword chart per theme ────────────────────────────────────────────────
+    stop = {"the","a","an","of","in","and","to","is","for","with","that","this","on","are",
+            "by","as","from","be","was","at","it","or","has","have","been","which","their"}
+    kw_rows = []
+    for theme in themes:
+        # Candidate keywords: lower-case alphabetic words of four or more
+        # letters taken from the theme's evidence sentences.
+        words = re.findall(r"\b[a-z]{4,}\b",
+                           " ".join(theme.get("top_sentences", [])).lower())
+        counted = Counter(w for w in words if w not in stop).most_common(5)
+        kw_rows.extend(
+            {"theme": theme["final_label"], "word": word, "count": count}
+            for word, count in counted
+        )
+    kw_df = pd.DataFrame(kw_rows)
+    if len(kw_df) > 0:
+        fig_kw = px.bar(kw_df, x="count", y="word", color="theme",
+                        orientation="h", title="Top Keywords per Theme",
+                        barmode="group")
+        fig_kw.write_html("outputs/chart_keywords.html")
+    else:
+        with open("outputs/chart_keywords.html", "w", encoding="utf-8") as f:
+            f.write("<p style='color:grey'>No approved themes available yet.</p>")
+    return json.dumps({
+        "status":       "consolidated",
+        "tag":          tag,
+        "themes_count": len(themes),
+        "themes":       [t["final_label"] for t in themes],
+    }, indent=2)
+# ══════════════════════════════════════════════════════════════════════════════
+# TOOL 5 — compare_with_taxonomy
+# ══════════════════════════════════════════════════════════════════════════════
+@tool
+def compare_with_taxonomy(taxonomy_input: str) -> str:
+    """
+    Map final themes to the PAJAIS taxonomy. Identify MAPPED vs NOVEL themes.
+    Args:
+        taxonomy_input: JSON string with keys:
+            - 'tag': run tag
+            - 'mappings': list of dicts with keys
+              'final_label', 'pajais_category' (str or ''), 'mapped' (bool)
+    Returns:
+        JSON string with mapping results saved to taxonomy_mapping.json.
+    """
+    # NOTE: the docstring above is kept verbatim because @tool exposes it to
+    # the agent as the tool description.
+    # Reference category list; it is echoed in the response and is not used
+    # to validate the supplied 'pajais_category' values.
+    PAJAIS_TAXONOMY = [
+        "IS Strategy & Governance", "AI & Machine Learning Applications",
+        "Digital Transformation", "Human-Computer Interaction",
+        "Knowledge Management", "Information Security & Privacy",
+        "Business Intelligence & Analytics", "Enterprise Systems",
+        "E-Commerce & Digital Markets", "IT Adoption & Acceptance",
+        "Social Media & Collaboration", "Healthcare IS",
+        "IS Research Methods", "Emerging Technologies",
+    ]
+    request = json.loads(taxonomy_input)
+    tag = request.get("tag", "abstract")
+    supplied = request.get("mappings", [])
+    with open(f"outputs/themes_{tag}.json") as handle:
+        themes = json.load(handle)
+    # Index the supplied mappings by the theme label they refer to.
+    by_label = {entry["final_label"]: entry for entry in supplied}
+    mapped_themes = []
+    for theme in themes:
+        match = by_label.get(theme["final_label"], {})
+        # A theme with no mapping entry, or with a falsy 'mapped', is NOVEL.
+        if match.get("mapped", False):
+            status = "MAPPED"
+        else:
+            status = "NOVEL"
+        mapped_themes.append({
+            **theme,
+            "pajais_category": match.get("pajais_category", ""),
+            "mapping_status": status,
+        })
+    output_file = f"outputs/taxonomy_mapping_{tag}.json"
+    with open(output_file, "w") as handle:
+        json.dump(mapped_themes, handle, indent=2)
+    mapped_count = sum(1 for theme in mapped_themes if theme["mapping_status"] == "MAPPED")
+    result = {
+        "status":          "mapped",
+        "tag":             tag,
+        "total_themes":    len(mapped_themes),
+        "mapped_count":    mapped_count,
+        "novel_count":     len(mapped_themes) - mapped_count,
+        "pajais_taxonomy": PAJAIS_TAXONOMY,
+        "output_file":     output_file,
+    }
+    return json.dumps(result, indent=2)
+# ══════════════════════════════════════════════════════════════════════════════
+# TOOL 6 — generate_comparison_csv
+# ══════════════════════════════════════════════════════════════════════════════
+@tool
+def generate_comparison_csv(comparison_input: str) -> str:
+    """
+    Compare Abstract-derived themes vs Title-derived themes.
+    Produce a side-by-side CSV and a Plotly comparison chart.
+    Args:
+        comparison_input: JSON string with key 'tags' — list of two run tags,
+                          e.g. '{"tags": ["abstract", "title"]}'.
+    Returns:
+        JSON string with path to comparison CSV.
+    """
+    # NOTE: the docstring above is kept verbatim because @tool exposes it to
+    # the agent as the tool description.
+    csv_path = "outputs/theme_comparison.csv"
+    chart_path = "outputs/chart_comparison.html"
+    request = json.loads(comparison_input)
+    rows = []
+    # One row per theme, gathered from each run's themes file in tag order.
+    for tag in request.get("tags", ["abstract", "title"]):
+        with open(f"outputs/themes_{tag}.json") as handle:
+            loaded = json.load(handle)
+        rows.extend(
+            {
+                "tag":         tag,
+                "final_label": theme["final_label"],
+                "size":        theme["size"],
+                "papers":      len(theme.get("papers", [])),
+            }
+            for theme in loaded
+        )
+    frame = pd.DataFrame(rows)
+    frame.to_csv(csv_path, index=False)
+    if len(frame) == 0:
+        # No themes at all: write a placeholder page instead of a chart.
+        with open(chart_path, "w", encoding="utf-8") as handle:
+            handle.write("<p style='color:grey'>No theme comparison available yet.</p>")
+    else:
+        chart = px.bar(frame, x="final_label", y="size", color="tag", barmode="group",
+                       title="Abstract vs Title Theme Comparison",
+                       labels={"final_label": "Theme", "size": "Sentences", "tag": "Source"})
+        chart.write_html(chart_path)
+    result = {
+        "status":      "comparison_generated",
+        "csv_path":    csv_path,
+        "chart_path":  chart_path,
+        "total_rows":  len(frame),
+    }
+    return json.dumps(result, indent=2)
+# ══════════════════════════════════════════════════════════════════════════════
+# TOOL 7 — export_narrative
+# ══════════════════════════════════════════════════════════════════════════════
+@tool
+def export_narrative(narrative_input: str) -> str:
+    """
+    Generate a ~500-word Section 7 narrative report summarising all themes,
+    their PAJAIS mapping, and key insights. Save as narrative_report.txt.
+    Args:
+        narrative_input: JSON string with keys:
+            - 'tag': run tag to base report on
+            - 'narrative': the 500-word narrative text (written by the LLM)
+            - 'researcher_name': optional researcher name
+            - 'max_words': optional positive word limit for the narrative
+              (default 500); longer text is cut and ' ...' appended
+    Returns:
+        JSON string confirming report saved.
+    """
+    default_max_words = 500
+    data = json.loads(narrative_input)
+    tag = data.get("tag", "abstract")
+    # A JSON null narrative is treated as empty text rather than crashing.
+    narrative_text = str(data.get("narrative") or "")
+    researcher_name = data.get("researcher_name", "Researcher")
+    # Only the failures int() can actually produce are caught here.
+    try:
+        max_words = int(data.get("max_words", default_max_words))
+    except (TypeError, ValueError, OverflowError):
+        max_words = default_max_words
+    if max_words < 1:
+        # A zero or negative limit would slice the narrative from the wrong
+        # end (or empty it), so fall back to the default.
+        max_words = default_max_words
+    words = narrative_text.split()
+    trimmed = len(words) > max_words
+    if trimmed:
+        words = words[:max_words]
+        narrative_text = " ".join(words) + " ..."
+    mapping_path = f"outputs/taxonomy_mapping_{tag}.json"
+    with open(mapping_path) as f:
+        themes = json.load(f)
+    # compare_with_taxonomy always writes 'pajais_category' (as '' for themes
+    # without one), so test for a non-empty value rather than key presence.
+    theme_lines = [
+        f"  • {t['final_label']} [{t.get('mapping_status','?')}]"
+        f" — PAJAIS: {t.get('pajais_category') or 'N/A'}"
+        for t in themes
+    ]
+    full_report = "\n".join([
+        "=" * 60,
+        "SECTION 7: THEMATIC ANALYSIS NARRATIVE REPORT",
+        f"Researcher: {researcher_name}",
+        f"Source: {tag.upper()} columns",
+        "=" * 60,
+        "",
+        narrative_text,
+        "",
+        "─" * 60,
+        "THEME SUMMARY TABLE",
+        "─" * 60,
+        "\n".join(theme_lines),
+        "",
+        "=" * 60,
+    ])
+    report_path = "outputs/narrative_report.txt"
+    with open(report_path, "w", encoding="utf-8") as f:
+        f.write(full_report)
+    return json.dumps({
+        "status":      "report_saved",
+        "report_path": report_path,
+        # Counts narrative words only; the appended ellipsis is excluded.
+        "word_count":  len(words),
+        "trimmed":      trimmed,
+        "themes_in_report": len(themes),
+    }, indent=2)