Spaces:

nethra815
/

topic_modelling

Sleeping

App Files Files Community

nethra815 committed on Apr 14

Commit

e768563

verified ·

1 Parent(s): f5e6f5d

Initial commit

Browse files

Files changed (4) hide show

agent.py +205 -0
app.py +461 -0
requirements.txt +26 -0
tools.py +443 -0

agent.py ADDED Viewed

	@@ -0,0 +1,205 @@

+"""
+agent.py — LangGraph ReAct agent for Braun & Clarke (2006) thematic analysis.
+"""
+from __future__ import annotations
+from langgraph.prebuilt import create_react_agent
+from langgraph.checkpoint.memory import MemorySaver
+from langchain_mistralai import ChatMistralAI
+from tools import (
+    load_scopus_csv,
+    run_bertopic_discovery,
+    label_topics_with_llm,
+    consolidate_into_themes,
+    compare_with_taxonomy,
+    generate_comparison_csv,
+    export_narrative,
+)
+# ---------------------------------------------------------------------------
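+# System prompt
+# NOTE(review): the ROLE/CRITICAL RULES sections say approval happens only via the
+# review table, but Stop Gates 1 and 5 tell the user to type 'proceed ...' in chat.
+# Confirm which approval channel is intended before relying on the gates.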
+# ---------------------------------------------------------------------------
+SYSTEM_PROMPT = """
+You are a computational thematic analysis expert specialising in Braun & Clarke (2006)
+six-phase thematic analysis applied to systematic literature reviews. You work with
+Scopus CSV exports and guide researchers through a rigorous, reproducible analysis
+pipeline using BERTopic clustering and LLM-assisted labelling.
+═══════════════════════════════════════════════════════════════════
+ROLE
+═══════════════════════════════════════════════════════════════════
+- Expert in qualitative and computational thematic analysis
+- Familiar with PAJAIS (25 AI research categories) taxonomy
+- Methodologically rigorous: one phase per message, no skipping
+- You EXPLAIN what you did, what you found, and what the researcher should do next
+- You never proceed to the next phase without explicit user approval via the review table
+═══════════════════════════════════════════════════════════════════
+CRITICAL RULES
+═══════════════════════════════════════════════════════════════════
+1. Complete EXACTLY ONE phase per conversational turn, then STOP and wait.
+2. ALL topic approvals, renames, and groupings happen via the REVIEW TABLE — never via chat.
+3. Never ask the user to type topic labels or approvals into the chat.
+4. After every phase, output a clear STOP GATE message telling the user what to review.
+5. You must call the appropriate tool for each phase — do NOT fabricate results.
+6. Always report tool outputs clearly: total papers, sentences, clusters, themes.
+7. When showing the review table, list all columns: #, Topic Label, Top Evidence,
+   Sentences, Papers, Approve (Yes/No), Rename To, Reasoning.
+8. Progress is tracked in the phase progress bar — reference the current phase by name.
+═══════════════════════════════════════════════════════════════════
+AVAILABLE TOOLS
+═══════════════════════════════════════════════════════════════════
+1. load_scopus_csv        — Load CSV, count papers/sentences, apply boilerplate filter
+2. run_bertopic_discovery — Embed + cluster sentences, find centroids, generate 4 charts
+3. label_topics_with_llm  — Send top-100 topics to Mistral for human-readable labels
+4. consolidate_into_themes— Merge approved topic groups into named themes, recompute centroids
+5. compare_with_taxonomy  — Map final themes to PAJAIS 25 categories
+6. generate_comparison_csv— Abstract vs title side-by-side CSV export
+7. export_narrative       — Generate ~500-word Section 7 narrative via Mistral
+═══════════════════════════════════════════════════════════════════
+BRAUN & CLARKE (2006) — SIX PHASES
+═══════════════════════════════════════════════════════════════════
+──────────────────────────────────────────────────────────────────
+PHASE 1 — Familiarisation with the Data
+──────────────────────────────────────────────────────────────────
+Steps:
+  1. Call load_scopus_csv with the uploaded CSV path and run_config="abstract".
+  2. Report: total papers, total sentences after boilerplate filtering, columns used.
+  3. Show a brief sample of 3–5 cleaned abstracts.
+  4. Explain what boilerplate was removed and why.
+  5. Confirm the dataset is ready for initial coding.
+⛔ STOP GATE 1: After reporting statistics, STOP. Tell the user:
+   "Phase 1 complete. Please review the dataset statistics above. When ready,
+    type 'proceed to Phase 2' to begin BERTopic clustering."
+──────────────────────────────────────────────────────────────────
+PHASE 2 — Generating Initial Codes
+──────────────────────────────────────────────────────────────────
+Steps:
+  1. Call run_bertopic_discovery on the cleaned parquet file.
+  2. Call label_topics_with_llm to generate human-readable labels for top-100 clusters.
+  3. Populate the REVIEW TABLE with all labelled topics (columns: #, Topic Label,
+     Top Evidence, Sentences, Papers, Approve, Rename To, Reasoning).
+  4. Explain the clustering method (all-MiniLM-L6-v2 + AgglomerativeClustering cosine 0.7).
+  5. Show the 4 generated charts in the Charts tab.
+⛔ STOP GATE 2: After displaying the review table, STOP. Tell the user:
+   "Phase 2 complete. Please review the 100 topics in the Review Table.
+    For each topic: set Approve=Yes/No, optionally fill Rename To and Reasoning.
+    Group related topics by noting the same new label. When done, click 'Submit Review'."
+   DO NOT proceed until Submit Review is clicked.
+──────────────────────────────────────────────────────────────────
+PHASE 3 — Searching for Themes
+──────────────────────────────────────────────────────────────────
+Steps:
+  1. Parse the submitted review table to extract approved topics and their groupings.
+  2. Call consolidate_into_themes with the approved groups JSON.
+  3. Present the consolidated themes with: theme name, constituent topics, top sentences,
+     and sentence count.
+  4. Explain how topics were merged and centroids recomputed.
+⛔ STOP GATE 3: After showing consolidated themes, STOP. Tell the user:
+   "Phase 3 complete. Please review the consolidated themes in the Review Table.
+    Approve, rename, or merge themes as needed. Click 'Submit Review' when done."
+   DO NOT proceed until Submit Review is clicked.
+──────────────────────────────────────────────────────────────────
+PHASE 4 — Reviewing Themes (Saturation Check)
+──────────────────────────────────────────────────────────────────
+Steps:
+  1. Compute coverage: what % of total sentences are captured by approved themes.
+  2. Identify any sentences/topics NOT covered by a theme (orphan codes).
+  3. Report saturation metrics: coverage %, orphan count, theme overlap.
+  4. Suggest whether any orphan codes warrant a new theme or should be discarded.
+  5. Update the review table with coverage statistics per theme.
+⛔ STOP GATE 4: After reporting saturation, STOP. Tell the user:
+   "Phase 4 complete. Coverage is [X]%. Please review the saturation report.
+    Adjust theme groupings in the Review Table if needed. Click 'Submit Review'
+    to confirm final themes."
+   DO NOT proceed until Submit Review is clicked.
+──────────────────────────────────────────────────────────────────
+PHASE 5 — Defining and Naming Themes
+──────────────────────────────────────────────────────────────────
+Steps:
+  1. For each confirmed theme, generate: a definitive name, a 2-sentence definition,
+     and 3 exemplary quotes from the data.
+  2. Explain how the name captures the essence of the theme.
+  3. Ensure theme names are analytic (not merely descriptive).
+  4. Present the finalised theme map.
+⛔ STOP GATE 5 (implicit): Present the final theme map and ask:
+   "Phase 5 complete. Please confirm the final theme names and definitions above.
+    When satisfied, type 'proceed to PAJAIS mapping'."
+──────────────────────────────────────────────────────────────────
+PHASE 5.5 — PAJAIS Taxonomy Mapping
+──────────────────────────────────────────────────────────────────
+Steps:
+  1. Call compare_with_taxonomy to map each theme to PAJAIS 25 categories.
+  2. Present a mapping table: Theme → PAJAIS Category, Confidence, Rationale.
+  3. Highlight any themes that map to multiple categories (ambiguous cases).
+⛔ STOP GATE 5.5: After presenting the mapping, STOP. Tell the user:
+   "PAJAIS mapping complete. Please review the taxonomy mappings in the Review Table.
+    Adjust any incorrect mappings. Click 'Submit Review' to confirm."
+   DO NOT proceed until Submit Review is clicked.
+──────────────────────────────────────────────────────────────────
+PHASE 6 — Producing the Report
+──────────────────────────────────────────────────────────────────
+Steps:
+  1. Call generate_comparison_csv to produce the abstract vs title comparison.
+  2. Call export_narrative to generate the ~500-word Section 7 discussion.
+  3. Present the narrative inline and confirm all files are ready for download.
+  4. List all downloadable outputs: comparison CSV, narrative.md, topics.json,
+     themes.json, taxonomy_mapping.json, charts.
+  5. Congratulate the researcher and summarise the full analysis pipeline.
+No STOP GATE — Phase 6 is the final deliverable.
+═══════════════════════════════════════════════════════════════════
+OUTPUT FORMAT GUIDELINES
+═══════════════════════════════════════════════════════════════════
+- Always start your response with: **Phase X — [Phase Name]** and the progress %.
+- Use markdown tables for review tables.
+- Use code blocks for JSON snippets.
+- End every non-final phase with a clearly marked ⛔ STOP message.
+- When referencing tool outputs, always show the key numbers (papers, sentences, clusters).
+"""
+# ---------------------------------------------------------------------------
+# Agent construction
+# ---------------------------------------------------------------------------
+# Chat model that drives the ReAct loop; temperature=0 requests the least random decoding.
+_llm = ChatMistralAI(model="mistral-large-latest", temperature=0)
+# The seven pipeline tools imported from tools.py, listed in the order the prompt's phases use them.
+_tools = [
+    load_scopus_csv,
+    run_bertopic_discovery,
+    label_topics_with_llm,
+    consolidate_into_themes,
+    compare_with_taxonomy,
+    generate_comparison_csv,
+    export_narrative,
+]
+# Checkpointer handed to the agent so state is kept per thread_id between invoke() calls.
+# NOTE(review): MemorySaver presumably keeps checkpoints in process memory only, so
+# conversations would not survive a restart — confirm against the langgraph docs.
+_memory = MemorySaver()
+# Module-level agent instance shared by every importer of this module.
+agent = create_react_agent(
+    model=_llm,
+    tools=_tools,
+    checkpointer=_memory,
+    prompt=SYSTEM_PROMPT,
+)
+# Public names of this module.
+__all__ = ["agent", "SYSTEM_PROMPT"]

app.py ADDED Viewed

	@@ -0,0 +1,461 @@

+"""
+app.py — Gradio Blocks UI for the BERTopic Thematic Analysis Agent.
+Sections: (1) Data Input, (2) Agent Conversation, (3) Results
+"""
+from __future__ import annotations
+import json
+import uuid
+from pathlib import Path
+import os
+import gradio as gr
+import pandas as pd
+import plotly.io as pio
+from agent import agent
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+# Conversation thread id passed to the agent's checkpointer.
+# NOTE(review): generated once at import time, so every call made through AGENT_CONFIG
+# uses the same thread. If one process serves several browser sessions they would
+# share a single agent conversation — confirm whether per-session ids are needed.
+THREAD_ID = str(uuid.uuid4())
+# Config passed to every agent.invoke() call in this file.
+AGENT_CONFIG = {
+    "configurable": {"thread_id": THREAD_ID},
+    "recursion_limit": 100,
+}
+# Column order of the review table; the same names are used as DataFrame columns
+# and as the headers of the gr.Dataframe component.
+REVIEW_COLUMNS = [
+    "#",
+    "Topic Label",
+    "Top Evidence",
+    "Sentences",
+    "Papers",
+    "Approve",
+    "Rename To",
+    "Reasoning",
+]
+# (code, short name) for each step of the progress bar, in display order.
+# List positions are the indices used by _phase_bar_html and _detect_phase.
+PHASE_LABELS = [
+    ("Phase 1", "Familiarisation"),
+    ("Phase 2", "Initial Codes"),
+    ("Phase 3", "Themes"),
+    ("Phase 4", "Saturation"),
+    ("Phase 5", "Naming"),
+    ("Phase 5.5", "PAJAIS"),
+    ("Phase 6", "Report"),
+]
+# Labels shown in the chart dropdown.
+CHART_OPTIONS = [
+    "Bar — Top 20 Topics",
+    "Treemap — Topic Distribution",
+    "Scatter — Cluster PCA",
+    "Heatmap — Topic Similarity",
+]
+# Keys looked up in charts.json; paired positionally with CHART_OPTIONS in _get_chart_plot.
+_CHART_KEYS = ["bar_top20", "treemap", "scatter_pca", "heatmap"]
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def _phase_bar_html(active_index: int) -> str:
+    steps_html = ""
+    for i, (code, name) in enumerate(PHASE_LABELS):
+        if i < active_index:
+            state, bg, fg = "done", "#10b981", "#ffffff"
+        elif i == active_index:
+            state, bg, fg = "active", "#6366f1", "#ffffff"
+        else:
+            state, bg, fg = "pending", "#e5e7eb", "#6b7280"
+        steps_html += (
+            f'<div style="display:flex;flex-direction:column;align-items:center;gap:4px;flex:1;">'
+            f'<div style="width:32px;height:32px;border-radius:50%;background:{bg};'
+            f'color:{fg};display:flex;align-items:center;justify-content:center;'
+            f'font-size:11px;font-weight:600;">{i+1}</div>'
+            f'<span style="font-size:10px;color:#374151;text-align:center;line-height:1.2;">'
+            f'{code}<br>{name}</span>'
+            f'</div>'
+        )
+        if i < len(PHASE_LABELS) - 1:
+            line_bg = "#10b981" if i < active_index else "#e5e7eb"
+            steps_html += (
+                f'<div style="flex:1;height:2px;background:{line_bg};margin-top:16px;'
+                f'max-width:40px;"></div>'
+            )
+    return (
+        f'<div style="padding:16px 8px;background:#f9fafb;border-radius:12px;'
+        f'border:1px solid #e5e7eb;margin-bottom:8px;">'
+        f'<div style="display:flex;align-items:flex-start;justify-content:space-between;">'
+        f'{steps_html}</div></div>'
+    )
+def _empty_review_df() -> pd.DataFrame:
+    return pd.DataFrame(columns=REVIEW_COLUMNS)
+def _load_charts() -> dict:
+    p = Path("charts.json")
+    return json.loads(p.read_text()) if p.exists() else {}
+def _call_agent(message: str, history: list):
+    result = agent.invoke(
+        {"messages": [{"role": "user", "content": message}]},
+        config=AGENT_CONFIG,
+    )
+    ai_msg = result["messages"][-1].content
+    updated_history = history + [
+        {"role": "user", "content": message},
+        {"role": "assistant", "content": ai_msg},
+    ]
+    return updated_history, ""
+def _submit_review(
+    review_df: pd.DataFrame,
+    history: list,
+) -> tuple[list, str, pd.DataFrame]:
+    """Read table edits, serialise to JSON, send to agent."""
+    approved = review_df[
+        review_df["Approve"].astype(str).str.lower() == "yes"
+    ] if not review_df.empty else review_df
+    groups = {}
+    for _, row in approved.iterrows():
+        theme_name = str(
+            row.get("Rename To")
+            or row.get("Topic Label")
+            or f"Theme_{row['#']}"
+        )
+        topic_id = int(row["#"]) if str(row["#"]).isdigit() else 0
+        groups.setdefault(theme_name, []).append(topic_id)
+    groups_list = [
+        {"theme_name": k, "topic_ids": v}
+        for k, v in groups.items()
+    ]
+    summary = (
+        f"Review submitted. Approved topics: {len(approved)}.\n"
+        f"Groups formed: {len(groups_list)}.\n\n"
+        f"{json.dumps(groups_list, indent=2)}\n\n"
+        f"Please consolidate these groups into themes."
+    )
+    updated_history, _ = _call_agent(summary, history)
+    refreshed = _refresh_review_table()
+    return updated_history, "", refreshed
+def _upload_csv(file_obj):
+    if file_obj is None:
+        return "", "No file uploaded."
+    # 🔥 CLEAR OLD FILES
+    files_to_clear = [
+        "labelled_topics.json",
+        "summaries.json",
+        "taxonomy_mapping.json",
+        "comparison.csv",
+        "report.txt"
+    ]
+    list(map(lambda f: os.remove(f) if os.path.exists(f) else None, files_to_clear))
+    path = file_obj.name
+    return path, f"✅ File ready: `{path}`"
+def _start_analysis(csv_path: str, history: list) -> tuple[list, str, str, pd.DataFrame]:
+    if not csv_path:
+        return history, "", "⚠️ Please upload a CSV first.", _empty_review_df()
+    msg = (
+        f"I have uploaded a Scopus CSV at: {csv_path}\n"
+        f"Please begin Phase 1 — Familiarisation. Load the CSV, report statistics, "
+        f"and STOP after Phase 1."
+    )
+    updated_history, _ = _call_agent(msg, history)
+    phase_html = _phase_bar_html(0)
+    return updated_history, "", phase_html, _empty_review_df()
+def _send_message(user_msg: str, history: list, phase_html: str) -> tuple[list, str, str, pd.DataFrame]:
+    if not user_msg.strip():
+        return history, "", phase_html, _refresh_review_table()
+    updated_history, _ = _call_agent(user_msg, history)
+    last_ai = updated_history[-1]["content"] if updated_history else ""
+    new_phase = _detect_phase(last_ai, phase_html)
+    refreshed = _refresh_review_table()
+    return updated_history, "", new_phase, refreshed
+def _detect_phase(ai_text: str, current_html: str) -> str:
+    phase_map = {
+        "phase 1": 0, "phase 2": 1, "phase 3": 2,
+        "phase 4": 3, "phase 5.5": 5, "phase 5": 4, "phase 6": 6,
+    }
+    lower = ai_text.lower()
+    detected = current_html
+    for key, idx in sorted(phase_map.items(), key=lambda x: -len(x[0])):
+        if f"{key} complete" in lower or f"beginning {key}" in lower or f"starting {key}" in lower:
+            detected = _phase_bar_html(idx)
+            break
+    return detected
+def _get_chart_plot(chart_name: str):
+    charts = _load_charts()
+    key_map = dict(zip(CHART_OPTIONS, _CHART_KEYS))
+    key = key_map.get(chart_name, "")
+    payload = charts.get(key, "")
+    if not payload or str(payload).lstrip().startswith("<"):
+        return None
+    return pio.from_json(payload)
+def _get_download_files() -> list[str]:
+    candidates = [
+        "comparison_abstract_vs_title.csv",
+        "narrative.md",
+        "topics.json",
+        "labelled_topics.json",
+        "themes.json",
+        "taxonomy_mapping.json",
+        "summaries.json",
+    ]
+    return list(filter(lambda p: Path(p).exists(), candidates))
+def _refresh_review_table() -> pd.DataFrame:
+    themes_path = Path("themes.json")
+    if themes_path.exists():
+        themes = json.loads(themes_path.read_text())
+        rows = list(map(
+            lambda idx_theme: {
+                "#": idx_theme[0] + 1,
+                "Topic Label": idx_theme[1].get("theme_name", f"Theme {idx_theme[0] + 1}"),
+                "Top Evidence": " | ".join(idx_theme[1].get("top_sentences", [])[:2]),
+                "Sentences": len(idx_theme[1].get("top_sentences", [])),
+                "Papers": "",
+                "Approve": "Yes",
+                "Rename To": "",
+                "Reasoning": "",
+            },
+            list(enumerate(themes)),
+        ))
+        return pd.DataFrame(rows)
+    topics_path = Path("labelled_topics.json")
+    if not topics_path.exists():
+        return _empty_review_df()
+    topics = json.loads(topics_path.read_text())
+    rows = list(map(
+        lambda t: {
+            "#": t["topic_id"],
+            "Topic Label": t.get("label", f"Topic {t['topic_id']}"),
+            "Top Evidence": " | ".join(t.get("top_sentences", [])[:2]),
+            "Sentences": t.get("sentence_count", 0),
+            "Papers": "",
+            "Approve": "Yes",
+            "Rename To": "",
+            "Reasoning": t.get("reasoning", ""),
+        },
+        topics[:100],
+    ))
+    return pd.DataFrame(rows)
+def _refresh_downloads() -> list[str]:
+    return _get_download_files() or None
+# ---------------------------------------------------------------------------
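+# Build UI
+# NOTE(review): the `run_config` radio created in Section 1 is not listed as an input
+# of any event handler in this section, so the selected field never reaches
+# `_start_analysis` or the agent.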
+# ---------------------------------------------------------------------------
+with gr.Blocks(
+    title="BERTopic Thematic Analysis Agent",
+) as demo:
+    # ---- State ----
+    csv_path_state = gr.State("")
+    # ---- Header ----
+    gr.HTML(
+        '<div style="padding:24px 0 8px;">'
+        '<h1 style="font-size:1.6rem;font-weight:600;margin:0;color:#1e1b4b;">'
+        '📚 BERTopic Thematic Analysis Agent</h1>'
+        '<p style="color:#6b7280;margin:4px 0 0;font-size:0.95rem;">'
+        'Braun &amp; Clarke (2006) · Six-Phase Pipeline · PAJAIS Taxonomy</p>'
+        '</div>'
+    )
+    # ---- Phase Progress Bar ----
+    phase_bar = gr.HTML(value=_phase_bar_html(-1), label="Phase Progress")
+    # ════════════════════════════════════════════════════════
+    # SECTION 1 — Data Input
+    # ════════════════════════════════════════════════════════
+    with gr.Group():
+        gr.Markdown("## 1 · Data Input")
+        with gr.Row():
+            with gr.Column(scale=2):
+                file_upload = gr.File(
+                    label="Upload Scopus CSV",
+                    file_types=[".csv"],
+                    type="filepath",
+                )
+                file_status = gr.Markdown("_No file uploaded._")
+            with gr.Column(scale=1):
+                run_config = gr.Radio(
+                    choices=["abstract", "title"],
+                    value="abstract",
+                    label="Run Config (field to cluster)",
+                )
+                start_btn = gr.Button("▶ Start Analysis", variant="primary", size="lg")
+    # ════════════════════════════════════════════════════════
+    # SECTION 2 — Agent Conversation
+    # ════════════════════════════════════════════════════════
+    with gr.Group():
+        gr.Markdown("## 2 · Agent Conversation")
+        chatbot = gr.Chatbot(
+            label="Thematic Analysis Agent"
+        )
+        with gr.Row():
+            chat_input = gr.Textbox(
+                placeholder="Type a message or instruction… (e.g. 'proceed to Phase 2')",
+                label="",
+                scale=5,
+                show_label=False,
+                lines=1,
+            )
+            send_btn = gr.Button("Send", variant="primary", scale=1)
+    # ════════════════════════════════════════════════════════
+    # SECTION 3 — Results
+    # ════════════════════════════════════════════════════════
+    with gr.Group():
+        gr.Markdown("## 3 · Results")
+        with gr.Tabs():
+            # --- Tab 1: Review Table ---
+            with gr.TabItem("📋 Review Table"):
+                with gr.Row():
+                    refresh_table_btn = gr.Button("🔄 Refresh Table", size="sm")
+                review_table = gr.Dataframe(
+                    value=_empty_review_df(),
+                    headers=REVIEW_COLUMNS,
+                    datatype=[
+                        "number", "str", "str", "number",
+                        "str", "str", "str", "str",
+                    ],
+                    column_count=(8, "fixed"),
+                    interactive=True,
+                    wrap=True,
+                    label="Topic Review Table (edit Approve / Rename To / Reasoning)"
+                )
+                submit_review_btn = gr.Button(
+                    "✅ Submit Review", variant="primary", size="lg"
+                )
+            # --- Tab 2: Charts ---
+            with gr.TabItem("📊 Charts"):
+                chart_dropdown = gr.Dropdown(
+                    choices=CHART_OPTIONS,
+                    value=CHART_OPTIONS[0],
+                    label="Select Chart",
+                    interactive=True,
+                )
+                chart_display = gr.Plot(label="Chart")
+            # --- Tab 3: Download ---
+            with gr.TabItem("⬇ Download"):
+                refresh_dl_btn = gr.Button("🔄 Refresh Files", size="sm")
+                download_files = gr.File(
+                    label="Download Analysis Outputs",
+                    file_count="multiple",
+                    interactive=False,
+                    value=None,
+                )
+    # ════════════════════════════════════════════════════════
+    # Event wiring
+    # ════════════════════════════════════════════════════════
+    # Upload CSV → store path
+    file_upload.change(
+        fn=_upload_csv,
+        inputs=[file_upload],
+        outputs=[csv_path_state, file_status],
+    )
+    # Start analysis button
+    start_btn.click(
+        fn=_start_analysis,
+        inputs=[csv_path_state, chatbot],
+        outputs=[chatbot, chat_input, phase_bar, review_table],
+    )
+    # Send message (button)
+    send_btn.click(
+        fn=_send_message,
+        inputs=[chat_input, chatbot, phase_bar],
+        outputs=[chatbot, chat_input, phase_bar, review_table],
+    )
+    # Send message (Enter key)
+    chat_input.submit(
+        fn=_send_message,
+        inputs=[chat_input, chatbot, phase_bar],
+        outputs=[chatbot, chat_input, phase_bar, review_table],
+    )
+    # Submit review table
+    submit_review_btn.click(
+        fn=_submit_review,
+        inputs=[review_table, chatbot],
+        outputs=[chatbot, chat_input, review_table],
+    )
+    # Refresh review table
+    refresh_table_btn.click(
+        fn=_refresh_review_table,
+        inputs=[],
+        outputs=[review_table],
+    )
+    # Chart dropdown
+    chart_dropdown.change(
+        fn=_get_chart_plot,
+        inputs=[chart_dropdown],
+        outputs=[chart_display],
+    )
+    # Refresh downloads
+    refresh_dl_btn.click(
+        fn=_refresh_downloads,
+        inputs=[],
+        outputs=[download_files],
+    )
+# ---------------------------------------------------------------------------
+# Launch
+# ---------------------------------------------------------------------------
+# Start the web server only when this file is run as a script.
+if __name__ == "__main__":
+    # Listens on all interfaces at port 7860 and shows handler errors in the UI.
+    # NOTE(review): passing `theme=` to launch() looks like the Gradio 6 API — confirm
+    # the installed Gradio version accepts it here rather than on gr.Blocks().
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True,
+        theme=gr.themes.Soft(primary_hue="indigo"),
+    )

requirements.txt ADDED Viewed

	@@ -0,0 +1,26 @@

+# Core ML / NLP
+sentence-transformers==3.3.1
+scikit-learn==1.6.1
+numpy==1.26.4
+# LangChain / LangGraph
+langchain==0.3.18
+langchain-core==0.3.37
+langchain-mistralai==0.2.4
+langgraph==0.2.73
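+# Gradio UI
+# NOTE(review): app.py uses `column_count=` on gr.Dataframe and `theme=` on launch(),
+# which look like Gradio 6 API; confirm this 5.x pin matches the Space's SDK version.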
+gradio==5.16.0
+# Data handling
+pandas==2.2.3
+pyarrow==19.0.0
+# Visualisation
+plotly==5.24.1
+# Mistral SDK (pulled by langchain-mistralai, pinned for stability)
+mistralai==1.3.1
+# Utilities
+python-dotenv==1.0.1

tools.py ADDED Viewed

	@@ -0,0 +1,443 @@

+"""
+tools.py — 7 LangChain tool functions for BERTopic thematic analysis pipeline.
+Constraints: ZERO if/else, ZERO for/while, ZERO try/except.
+"""
+from __future__ import annotations
+import json
+import re
+import numpy as np
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+from pathlib import Path
+from langchain_core.tools import tool
+from sentence_transformers import SentenceTransformer
+from sklearn.cluster import AgglomerativeClustering
+from sklearn.metrics.pairwise import cosine_similarity
+from langchain_core.prompts import PromptTemplate
+from langchain_core.output_parsers import JsonOutputParser
+from langchain_mistralai import ChatMistralAI
+from dotenv import load_dotenv
+load_dotenv()  # read a local .env file before the Mistral client below is built (presumably supplies the API key)
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+# Regex fragments for publisher boilerplate; _clean_text joins them with "|" and
+# removes every match case-insensitively.
+BOILERPLATE_PATTERNS = [
+    r"©\s*\d{4}",
+    r"all rights reserved",
+    r"published by elsevier",
+    r"doi:\s*10\.\S+",
+    r"this article is protected",
+    r"www\.\S+\.com",
+    r"^\s*abstract\s*$",
+    r"please cite this article",
+    r"accepted manuscript",
+]
+# Maps a run_config name to the CSV columns whose text is analysed.
+RUN_CONFIGS = {
+    "abstract": ["Abstract"],
+    "title": ["Title"],
+}
+# The 25 category names used as the taxonomy.
+PAJAIS_CATEGORIES = [
+    "Artificial Intelligence", "Machine Learning", "Deep Learning",
+    "Natural Language Processing", "Computer Vision", "Robotics",
+    "Knowledge Representation", "Expert Systems", "Decision Support",
+    "Data Mining", "Information Retrieval", "Human-Computer Interaction",
+    "Ethics in AI", "Explainable AI", "Fairness and Bias",
+    "AI in Healthcare", "AI in Education", "AI in Finance",
+    "AI in Manufacturing", "AI in Agriculture", "AI Governance",
+    "Neural Networks", "Reinforcement Learning", "Federated Learning",
+    "AI Safety",
+]
+# Module-level Mistral chat client, constructed at import time.
+_MISTRAL = ChatMistralAI(model="mistral-large-latest", temperature=0)
+# ---------------------------------------------------------------------------
+# Helper — pure functions, no loops
+# ---------------------------------------------------------------------------
+def _clean_text(text: str) -> str:
+    combined = "|".join(BOILERPLATE_PATTERNS)
+    return re.sub(combined, "", text, flags=re.IGNORECASE).strip()
+def _sentences_from_series(series: pd.Series) -> list[str]:
+    raw = series.dropna().str.cat(sep=" ")
+    return list(filter(None, map(str.strip, re.split(r"(?<=[.!?])\s+", raw))))
+def _nearest_centroids(embeddings: np.ndarray, labels: np.ndarray, n: int = 5):
+    unique_labels = np.unique(labels)
+    centroids = np.array(list(map(
+        lambda lbl: embeddings[labels == lbl].mean(axis=0),
+        unique_labels,
+    )))
+    sim_matrix = cosine_similarity(centroids)
+    np.fill_diagonal(sim_matrix, -1)
+    nearest = list(map(
+        lambda i: unique_labels[np.argsort(sim_matrix[i])[::-1][:n]].tolist(),
+        range(len(unique_labels)),
+    ))
+    return dict(zip(unique_labels.tolist(), nearest))
+def _top_sentences(sentences: list[str], embeddings: np.ndarray,
+                   centroid: np.ndarray, k: int = 5) -> list[str]:
+    sims = cosine_similarity([centroid], embeddings)[0]
+    top_idx = np.argsort(sims)[::-1][:k]
+    return list(map(lambda i: sentences[i], top_idx))
+# ---------------------------------------------------------------------------
+# Tool 1 — load_scopus_csv
+# ---------------------------------------------------------------------------
+@tool
+def load_scopus_csv(csv_path: str, run_config: str = "abstract") -> str:
+    """Load a Scopus CSV file, count papers/sentences, apply boilerplate regex
+    filter, and return a JSON summary. run_config must be 'abstract' or 'title'."""
+    df = pd.read_csv(csv_path)
+    columns = RUN_CONFIGS[run_config]
+    available_cols = list(filter(lambda c: c in df.columns, columns))
+    texts = df[available_cols].fillna("").apply(
+        lambda row: " ".join(row.values.astype(str)), axis=1
+    )
+    import re
+    # Step 1: basic cleaning
+    cleaned = list(map(_clean_text, texts))
+    # Step 2: 🔥 remove boilerplate noise (ADD HERE)
+    cleaned = list(map(
+        lambda x: re.sub(
+            r"©.*|all rights reserved|copyright.*|palgrave.*",
+            "",
+            x,
+            flags=re.I
+        ),
+        cleaned
+    ))
+    sentences = _sentences_from_series(pd.Series(cleaned))
+    df["_cleaned_text"] = cleaned
+    df.to_parquet(csv_path.replace(".csv", "_cleaned.parquet"), index=False)
+    summary = {
+        "csv_path": csv_path,
+        "run_config": run_config,
+        "columns_used": available_cols,
+        "total_papers": int(len(df)),
+        "total_sentences": len(sentences),
+        "sample_titles": df["Title"].head(5).tolist() if "Title" in df.columns else [],
+    }
+    Path("summaries.json").write_text(json.dumps(summary, indent=2))
+    return json.dumps(summary)
+# ---------------------------------------------------------------------------
+# Tool 2 — run_bertopic_discovery
+# ---------------------------------------------------------------------------
+@tool
+def run_bertopic_discovery(parquet_path: str, run_config: str = "abstract") -> str:
+    """Embed sentences with all-MiniLM-L6-v2, cluster with AgglomerativeClustering
+    (cosine, threshold=0.7), find 5 nearest centroids per cluster, generate 4
+    Plotly charts. Saves summaries.json + emb.npy. Returns topic summaries JSON."""
+    df = pd.read_parquet(parquet_path)
+    columns = RUN_CONFIGS[run_config]
+    available_cols = list(filter(lambda c: c in df.columns, columns))
+    texts = df[available_cols].fillna("").apply(
+        lambda row: " ".join(row.values.astype(str)), axis=1
+    )
+    sentences = _sentences_from_series(texts)
+    model = SentenceTransformer("all-MiniLM-L6-v2")
+    embeddings = model.encode(sentences, normalize_embeddings=True, show_progress_bar=False)
+    np.save("emb.npy", embeddings)
+    clustering = AgglomerativeClustering(
+        metric="cosine",
+        linkage="average",
+        distance_threshold=0.7,
+        n_clusters=None,
+    )
+    labels = clustering.fit_predict(embeddings)
+    unique_labels, counts = np.unique(labels, return_counts=True)
+    nearest = _nearest_centroids(embeddings, labels)
+    topic_summaries = list(map(
+        lambda pair: {
+            "topic_id": int(pair[0]),
+            "sentence_count": int(pair[1]),
+            "nearest_topics": nearest.get(int(pair[0]), []),
+            "top_sentences": _top_sentences(
+                sentences, embeddings,
+                embeddings[labels == pair[0]].mean(axis=0),
+            ),
+        },
+        zip(unique_labels, counts),
+    ))
+    # Sort by sentence count desc
+    topic_summaries.sort(key=lambda t: t["sentence_count"], reverse=True)
+    top100 = topic_summaries[:100]
+    # ---- Chart 1: Bar chart — top 20 topics by sentence count ----
+    top20 = top100[:20]
+    fig1 = px.bar(
+        x=[f"T{t['topic_id']}" for t in top20],
+        y=[t["sentence_count"] for t in top20],
+        labels={"x": "Topic", "y": "Sentences"},
+        title="Top 20 Topics by Sentence Count",
+    )
+    # ---- Chart 2: Treemap ----
+    fig2 = px.treemap(
+        names=[f"Topic {t['topic_id']}" for t in top100],
+        parents=["All"] * len(top100),
+        values=[t["sentence_count"] for t in top100],
+        title="Topic Distribution Treemap",
+    )
+    # ---- Chart 3: Scatter (PCA 2D projection) ----
+    from sklearn.decomposition import PCA
+    pca = PCA(n_components=2)
+    coords = pca.fit_transform(embeddings)
+    fig3 = go.Figure(go.Scatter(
+        x=coords[:, 0], y=coords[:, 1],
+        mode="markers",
+        marker=dict(color=labels, colorscale="Viridis", size=4, opacity=0.6),
+    ))
+    fig3.update_layout(title="Sentence Clusters (PCA 2D)")
+    # ---- Chart 4: Heatmap — top 10 topic cosine similarity ----
+    top10_ids = [t["topic_id"] for t in top100[:10]]
+    centroids10 = np.array(list(map(
+        lambda lbl: embeddings[labels == lbl].mean(axis=0),
+        top10_ids,
+    )))
+    sim10 = cosine_similarity(centroids10)
+    fig4 = px.imshow(
+        sim10,
+        x=[f"T{i}" for i in top10_ids],
+        y=[f"T{i}" for i in top10_ids],
+        color_continuous_scale="Blues",
+        title="Top-10 Topic Cosine Similarity Heatmap",
+    )
+    charts = {
+        "bar_top20": fig1.to_json(),
+        "treemap": fig2.to_json(),
+        "scatter_pca": fig3.to_json(),
+        "heatmap": fig4.to_json(),
+    }
+    result = {
+        "total_clusters": int(len(unique_labels)),
+        "top100_topics": top100,
+        "charts_html": charts,
+    }
+    existing = json.loads(Path("summaries.json").read_text())
+    existing.update({"bertopic": {"total_clusters": result["total_clusters"]}})
+    Path("summaries.json").write_text(json.dumps(existing, indent=2))
+    Path("charts.json").write_text(json.dumps(charts, indent=2))
+    Path("topics.json").write_text(json.dumps(top100, indent=2))
+    return json.dumps({
+        "total_clusters": result["total_clusters"],
+        "top100_count": len(top100),
+        "charts_saved": list(charts.keys()),
+    })
+# ---------------------------------------------------------------------------
+# Tool 3 — label_topics_with_llm
+# ---------------------------------------------------------------------------
+@tool
+def label_topics_with_llm(topics_json_path: str = "topics.json") -> str:
+    """Send top-100 topics to Mistral via PromptTemplate + JsonOutputParser to
+    generate human-readable labels. Returns labelled topics JSON."""
+    topics = json.loads(Path(topics_json_path).read_text())
+    batch = topics[:100]
+    prompt = PromptTemplate.from_template(
+        "You are a qualitative research expert. Below are topic clusters from a "
+        "systematic literature review. For EACH topic assign a concise label "
+        "(3-6 words) and one sentence of reasoning.\n\n"
+        "Topics:\n{topics_text}\n\n"
+        "Return ONLY valid JSON: a list of objects with keys: "
+        "topic_id, label, reasoning. No markdown fences."
+    )
+    parser = JsonOutputParser()
+    chain = prompt | _MISTRAL | parser
+    topics_text = "\n".join(list(map(
+        lambda t: f"Topic {t['topic_id']} ({t['sentence_count']} sentences): "
+                  + " | ".join(t["top_sentences"][:2]),
+        batch,
+    )))
+    labelled = chain.invoke({"topics_text": topics_text})
+    label_map = {item["topic_id"]: item for item in labelled}
+    enriched = list(map(
+        lambda t: {**t, **label_map.get(t["topic_id"], {"label": f"Topic {t['topic_id']}", "reasoning": ""})},
+        batch,
+    ))
+    Path("labelled_topics.json").write_text(json.dumps(enriched, indent=2))
+    return json.dumps({"labelled_count": len(enriched), "path": "labelled_topics.json"})
+# ---------------------------------------------------------------------------
+# Tool 4 — consolidate_into_themes
+# ---------------------------------------------------------------------------
+@tool
+def consolidate_into_themes(approved_groups_json: str) -> str:
+    """Merge approved topic groups into themes, recompute centroids from emb.npy.
+    approved_groups_json: JSON list of {theme_name, topic_ids: [...]} objects."""
+    groups = json.loads(approved_groups_json)
+    embeddings = np.load("emb.npy")
+    topics = json.loads(Path("labelled_topics.json").read_text())
+    topic_id_to_sentences = {t["topic_id"]: t["top_sentences"] for t in topics}
+    themes = list(map(
+        lambda g: {
+            "theme_name": g["theme_name"],
+            "topic_ids": g["topic_ids"],
+            "top_sentences": sum(
+                list(map(lambda tid: topic_id_to_sentences.get(tid, []), g["topic_ids"])),
+                [],
+            )[:10],
+            "centroid": embeddings[
+                np.isin(np.arange(len(embeddings)), g["topic_ids"])
+            ].mean(axis=0).tolist(),
+        },
+        groups,
+    ))
+    Path("themes.json").write_text(json.dumps(themes, indent=2))
+    return json.dumps({"themes_count": len(themes), "theme_names": [t["theme_name"] for t in themes]})
+# ---------------------------------------------------------------------------
+# Tool 5 — compare_with_taxonomy
+# ---------------------------------------------------------------------------
+@tool
+def compare_with_taxonomy(themes_json_path: str = "themes.json") -> str:
+    """Map consolidated themes to PAJAIS 25 categories via Mistral.
+    Returns a mapping JSON."""
+    themes = json.loads(Path(themes_json_path).read_text())
+    prompt = PromptTemplate.from_template(
+        "You are an AI research taxonomist. Map each theme to the most relevant "
+        "PAJAIS category.\n\n"
+        "PAJAIS Categories:\n{categories}\n\n"
+        "Themes:\n{themes_text}\n\n"
+        "Return ONLY valid JSON: a list of objects with keys: "
+        "theme_name, pajais_category, confidence (0-1), rationale. No markdown."
+    )
+    parser = JsonOutputParser()
+    chain = prompt | _MISTRAL | parser
+    themes_text = "\n".join(list(map(
+        lambda t: f"- {t['theme_name']}: " + "; ".join(t["top_sentences"][:2]),
+        themes,
+    )))
+    mapping = chain.invoke({
+        "categories": "\n".join(list(map(lambda c: f"  • {c}", PAJAIS_CATEGORIES))),
+        "themes_text": themes_text,
+    })
+    Path("taxonomy_mapping.json").write_text(json.dumps(mapping, indent=2))
+    return json.dumps({"mapped_count": len(mapping), "path": "taxonomy_mapping.json"})
+# ---------------------------------------------------------------------------
+# Tool 6 — generate_comparison_csv
+# ---------------------------------------------------------------------------
+@tool
+def generate_comparison_csv(original_csv_path: str) -> str:
+    """Generate a side-by-side comparison CSV of abstract vs title clustering
+    results for each paper. Returns path to output CSV."""
+    df = pd.read_csv(original_csv_path)
+    abstract_col = "Abstract" if "Abstract" in df.columns else None
+    title_col = "Title" if "Title" in df.columns else None
+    comparison = df[[c for c in [title_col, abstract_col] if c is not None]].copy()
+    comparison.columns = list(map(
+        lambda c: c + "_text",
+        [c for c in [title_col, abstract_col] if c is not None],
+    ))
+    comparison.insert(0, "Paper_ID", range(1, len(df) + 1))
+    taxonomy_path = Path("taxonomy_mapping.json")
+    theme_label = list(map(
+        lambda _: "See themes.json for full mapping",
+        range(len(comparison)),
+    ))
+    comparison["Theme_Assignment"] = theme_label
+    out_path = "comparison_abstract_vs_title.csv"
+    comparison.to_csv(out_path, index=False)
+    return json.dumps({"output_csv": out_path, "rows": len(comparison), "columns": comparison.columns.tolist()})
+# ---------------------------------------------------------------------------
+# Tool 7 — export_narrative
+# ---------------------------------------------------------------------------
+@tool
+def export_narrative(context_json: str = "{}") -> str:
+    """Generate a ~500-word Section 7 narrative via Mistral, synthesising all
+    prior analysis. context_json may contain extra instructions. Returns the
+    narrative text and saves it to narrative.md."""
+    context = json.loads(context_json)
+    themes = json.loads(Path("themes.json").read_text()) if Path("themes.json").exists() else []
+    mapping = json.loads(Path("taxonomy_mapping.json").read_text()) if Path("taxonomy_mapping.json").exists() else []
+    summaries = json.loads(Path("summaries.json").read_text()) if Path("summaries.json").exists() else {}
+    themes_summary = "\n".join(list(map(
+        lambda t: f"- **{t['theme_name']}**: " + "; ".join(t["top_sentences"][:1]),
+        themes,
+    )))
+    mapping_summary = "\n".join(list(map(
+        lambda m: f"- {m.get('theme_name','?')} → {m.get('pajais_category','?')} "
+                  f"(confidence: {m.get('confidence', '?')})",
+        mapping,
+    )))
+    prompt = PromptTemplate.from_template(
+        "You are a senior academic researcher writing a systematic literature review. "
+        "Write Section 7 (Discussion & Synthesis) of approximately 500 words. "
+        "Use an academic tone, Braun & Clarke (2006) thematic analysis framing, "
+        "and reference the themes and PAJAIS taxonomy mappings provided.\n\n"
+        "Dataset summary:\n{summaries}\n\n"
+        "Themes identified:\n{themes}\n\n"
+        "PAJAIS taxonomy mapping:\n{mapping}\n\n"
+        "Extra context: {extra}\n\n"
+        "Write the section now. Use markdown headings."
+    )
+    chain = prompt | _MISTRAL
+    result = chain.invoke({
+        "summaries": json.dumps(summaries, indent=2),
+        "themes": themes_summary,
+        "mapping": mapping_summary,
+        "extra": context.get("extra_instructions", "None"),
+    })
+    narrative = result.content
+    Path("narrative.md").write_text(narrative)
+    return json.dumps({"narrative_path": "narrative.md", "word_count": len(narrative.split())})