Spaces:

aadisawant2912
/

topic_modelling

Sleeping

File size: 39,597 Bytes

ff882c0
ba97cd6
2097913
ba97cd6
 
 
 
 
 
 
 
 
ff882c0
 
 
 
 
 
 
6219c5c
ff882c0
 
6219c5c
ff882c0
 
 
 
 
ba97cd6
ff882c0
 
ba97cd6
 
ff882c0
ba97cd6
148c14b
ba97cd6
ff882c0
 
ba97cd6
 
952425b
 
ba97cd6
 
 
 
 
 
 
952425b
ba97cd6
ff882c0
6219c5c
ff882c0
 
2097913
 
 
 
 
ba97cd6
8f9a9ca
 
 
ff882c0
 
ba97cd6
 
 
 
 
 
 
ff882c0
148c14b
ff882c0
 
 
148c14b
ff882c0
ba97cd6
 
 
 
 
 
2097913
8f9a9ca
ba97cd6
 
8f9a9ca
ff882c0
ba97cd6
ff882c0
2097913
 
 
8f9a9ca
 
ff882c0
2097913
 
 
 
ba97cd6
ff882c0
2097913
ff882c0
ba97cd6
2097913
 
ba97cd6
2097913
 
 
ba97cd6
2097913
ba97cd6
2097913
 
 
 
ba97cd6
2097913
 
 
 
ba97cd6
 
2097913
ba97cd6
 
 
 
 
 
 
 
 
 
2097913
 
ff882c0
 
148c14b
 
 
 
 
 
 
 
6219c5c
 
148c14b
6219c5c
 
148c14b
 
 
ba97cd6
 
 
 
 
 
148c14b
2097913
 
 
 
 
ba97cd6
 
8f9a9ca
2097913
 
 
8f9a9ca
 
148c14b
ff882c0
148c14b
 
 
 
ff882c0
148c14b
 
8f9a9ca
148c14b
 
ba97cd6
148c14b
ff882c0
 
148c14b
6219c5c
622feba
ff882c0
 
6219c5c
 
 
 
 
 
 
 
 
 
ff882c0
 
6219c5c
2097913
148c14b
ff882c0
ba97cd6
2097913
 
 
ff882c0
6219c5c
148c14b
 
6219c5c
148c14b
ba97cd6
ff882c0
8f9a9ca
ff882c0
 
ba97cd6
 
 
 
 
 
 
 
 
 
 
 
 
952425b
1261605
952425b
ba97cd6
 
 
 
 
 
 
 
 
148c14b
ba97cd6
 
 
 
 
 
ff882c0
 
ba97cd6
 
2097913
 
6219c5c
 
 
 
 
 
ff882c0
ba97cd6
 
 
 
 
 
 
 
 
ff882c0
 
 
 
 
 
6219c5c
8f9a9ca
 
 
 
 
2097913
 
8f9a9ca
 
2097913
ba97cd6
ff882c0
 
7513bc8
 
952425b
 
 
 
7513bc8
 
 
ba97cd6
 
 
 
 
7513bc8
 
 
ba97cd6
7513bc8
 
 
ba97cd6
 
 
 
 
 
 
 
 
 
 
 
 
 
7513bc8
 
 
ba97cd6
 
ff882c0
 
 
ba97cd6
 
 
 
 
 
 
 
ff882c0
ba97cd6
ff882c0
 
 
 
 
 
8f9a9ca
 
 
2097913
 
ba97cd6
8f9a9ca
 
 
 
ba97cd6
7fcaca7
ba97cd6
 
 
 
 
 
 
 
7fcaca7
 
ba97cd6
7fcaca7
 
ba97cd6
7fcaca7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba97cd6
ff882c0
 
ba97cd6
860416f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2097913
 
 
 
 
ba97cd6
2097913
ba97cd6
2097913
ff882c0
ba97cd6
 
ff882c0
 
ba97cd6
 
148c14b
ba97cd6
 
 
 
ff882c0
 
2097913
ba97cd6
2097913
ba97cd6
2097913
8f9a9ca
 
2097913
8f9a9ca
 
ba97cd6
860416f
 
ba97cd6
860416f
2097913
ff882c0
 
 
ba97cd6
 
 
 
 
 
 
 
 
6219c5c
ff882c0
 
2097913
148c14b
ff882c0
148c14b
ba97cd6
 
148c14b
ba97cd6
 
 
 
ff882c0
 
 
2097913
 
 
 
 
ba97cd6
2097913
ba97cd6
2097913
ff882c0
2097913
 
ba97cd6
2097913
ff882c0
6219c5c
2097913
 
 
 
6219c5c
2097913
 
 
 
 
6219c5c
ff882c0
 
2097913
 
ba97cd6
2097913
ff882c0
2097913
 
ba97cd6
2097913
 
ff882c0
 
2097913
 
ba97cd6
2097913
ff882c0
6219c5c
8f9a9ca
ff882c0
ba97cd6
ff882c0
 
 
2097913
 
 
ba97cd6
 
8f9a9ca
ff882c0
 
 
2097913
ff882c0
 
 
ba97cd6
148c14b
ba97cd6
 
 
 
ff882c0
 
 
 
148c14b
6219c5c
148c14b
6219c5c
 
 
ff882c0
 
 
 
 
 
 
2097913
 
ff882c0
 
2097913
 
ff882c0
2097913
 
148c14b
ba97cd6
 
148c14b
 
ba97cd6
148c14b
ff882c0
ba97cd6
 
2097913
ba97cd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ff882c0
 
ba97cd6
2097913
ba97cd6
ff882c0
 
ba97cd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148c14b
ba97cd6
2097913
ba97cd6
 
2097913
ba97cd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148c14b
 
8f9a9ca
ff882c0
2097913
ba97cd6
 
 
 
 
 
 
ff882c0
2097913
 
ba97cd6
2097913
ba97cd6
ff882c0
 
ba97cd6
 
ff882c0
 
 
ba97cd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ff882c0
2097913
 
 
ba97cd6
 
 
 
 
 
2097913
 
 
ff882c0
 
7513bc8
 
 
ba97cd6
7513bc8
ba97cd6
7513bc8
 
 
 
ba97cd6
 
 
7513bc8
 
 
 
ba97cd6
7513bc8
 
 
 
ba97cd6
7513bc8
 
 
 
 
ba97cd6
7513bc8
 
 
 
 
ba97cd6
7513bc8
ba97cd6
ff882c0
 
 
 
8f9a9ca
2097913
 
ba97cd6
 
 
8f9a9ca
 
6219c5c
ff882c0
 
2097913
ba97cd6
ff882c0
 
 
 
 
 
 
148c14b
ba97cd6
 
 
 
 
 
 
148c14b
2097913
ba97cd6
148c14b
 
2097913
 
ff882c0
6219c5c
ff882c0
6219c5c
ba97cd6
 
ff882c0
6219c5c
ff882c0
ba97cd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148c14b
 
ff882c0
 
 
 
 
 
ba97cd6
ff882c0
 
6219c5c
ba97cd6
6219c5c
ff882c0
 
 
8f9a9ca
ba97cd6
 
 
 
 
952425b
 
8f9a9ca
ff882c0
 
 
 
ba97cd6
 
8f9a9ca
ba97cd6
 
ff882c0
 
7513bc8
 
ba97cd6
 
7513bc8
 
 
ba97cd6
 
 
 
 
 
 
 
ff882c0
 
2097913
ba97cd6
 
 
ff882c0
 
 
2097913
ba97cd6
 
 
ff882c0
 
 
2097913
ba97cd6
 
 
ff882c0
7513bc8
 
 
 
 
ff882c0
ba97cd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
952425b
ba97cd6
 
 
 
8f9a9ca
ff882c0
 
 
 
 
148c14b
ff882c0
2097913
148c14b
ff882c0
 
 
2097913
8c80ed1

"""
app.py - Gradio 6.x BERTopic / SPECTER2 Thematic Analysis Agent.

TWO MODES:
  Classic (v1): BERTopic + Mistral-small, abstract run then title run separately.
  SPECTER2 (v2): SPECTER2 embeddings + UMAP + HDBSCAN + council-of-3-LLMs,
                 one combined run on Title+Abstract per paper.

KEY DESIGN:
  - Abstract run and title run use SEPARATE thread IDs in v1.
  - v2 uses its own separate thread ID.
  - Mode switch keeps existing data intact; user can switch freely.
"""

from __future__ import annotations

# Startup trace, emitted before the remaining imports run.
print("Step 1: imports starting...")

import json
import shutil
import uuid
from pathlib import Path

import gradio as gr
import pandas as pd

# Second trace line: reached only once gradio has imported; shows its version.
print("Step 2: gradio imported, version =", gr.__version__)

# ── v1 agent ──────────────────────────────────────────────────────────────────
# Import the Classic (v1) agent. If anything goes wrong the module still
# loads: the flag is cleared, `agent` is None and the history cleaner is a no-op.
try:
    from agent import agent, clean_thread_history
    AGENT_V1_OK = True
    print("Step 3a: v1 agent imported OK")
except Exception as import_error:
    print("Step 3a FAILED:", import_error)
    AGENT_V1_OK = False
    agent = None

    def clean_thread_history(tid):
        """Stand-in used when the v1 agent could not be imported; does nothing."""
        return None

# ── v2 agent ──────────────────────────────────────────────────────────────────
# Import the SPECTER2 (v2) agent. On any failure the flag is cleared,
# `agent_v2` is None and both thread helpers become no-ops.
try:
    from agent_v2 import agent_v2, clean_thread_history_v2, reset_thread_v2
    AGENT_V2_OK = True
    print("Step 3b: v2 agent imported OK")
except Exception as import_error:
    print("Step 3b FAILED:", import_error)
    AGENT_V2_OK = False
    agent_v2 = None

    def clean_thread_history_v2(tid):
        """Stand-in used when the v2 agent could not be imported; does nothing."""
        return None

    def reset_thread_v2(tid):
        """Stand-in used when the v2 agent could not be imported; does nothing."""
        return None

# ── constants ──────────────────────────────────────────────────────────────────
# Root folder for the uploaded CSV and every run's artefacts; created on import.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

# Column order of the v1 review table.
REVIEW_COLUMNS = [
    "#",
    "Topic Label",
    "Top Evidence",
    "Sentences",
    "Papers",
    "Approve",
    "Rename To",
    "Reasoning",
]

# Chart kinds offered in Classic mode.
CHART_OPTIONS = ["bar", "histogram", "scatter", "treemap"]

# Progress chips for Classic mode (index = phase number passed to make_progress_html).
PHASE_LABELS_V1 = [
    "Phase 1 — Familiarisation",
    "Phase 2 — Initial Codes",
    "Phase 3 — Themes",
    "Phase 4 — Saturation",
    "Phase 5 — Naming",
    "Phase 5.5 — PAJAIS",
    "Phase 6 — Report",
]

# Progress chips for SPECTER2 mode.
PHASE_LABELS_V2 = [
    "Phase 1 — Load & Embed",
    "Phase 2 — UMAP+HDBSCAN",
    "Phase 3 — Council Labeling",
    "Phase 4 — PAJAIS Mapping",
    "Phase 5 — Final Outputs",
]


def new_thread_id() -> str:
    """Return a fresh random UUID4 rendered as a string."""
    fresh_id = uuid.uuid4()
    return str(fresh_id)


# ── helpers ────────────────────────────────────────────────────────────────────
def make_progress_html(current_phase: int, run_label: str = "", mode: str = "v1") -> str:
    """Render the progress widget (title, bar and one chip per phase) as HTML.

    Args:
        current_phase: Zero-based phase index; chips up to and including it are
            highlighted and the bar is filled to ``current_phase / phase count``.
        run_label: Optional text shown in an orange badge next to the title.
        mode: ``"v1"`` uses the Classic labels and colour; anything else uses
            the SPECTER2 ones.

    Returns:
        The HTML snippet as a single string.
    """
    is_classic = mode == "v1"
    phase_names = PHASE_LABELS_V1 if is_classic else PHASE_LABELS_V2
    percent = int((current_phase / len(phase_names)) * 100)
    accent = "#4f46e5" if is_classic else "#0891b2"

    chips = []
    for position, name in enumerate(phase_names):
        reached = position <= current_phase
        chip_bg = accent if reached else "#e5e7eb"
        chip_fg = "#fff" if reached else "#6b7280"
        chips.append(
            '<span style="padding:3px 8px;margin:2px;border-radius:10px;font-size:11px;'
            f'background:{chip_bg};color:{chip_fg};">{name}</span>'
        )
    chip_html = "".join(chips)

    run_badge = ""
    if run_label:
        run_badge = (
            ' <span style="background:#f59e0b;color:#fff;padding:2px 10px;'
            f'border-radius:10px;font-size:12px;">{run_label}</span>'
        )

    return (
        '<div style="font-family:sans-serif;padding:8px 0;">'
        '<div style="font-weight:600;color:#374151;margin-bottom:5px;">'
        f'Progress{run_badge}</div>'
        '<div style="background:#e5e7eb;border-radius:6px;height:8px;margin-bottom:6px;">'
        f'<div style="background:{accent};width:{percent}%;height:100%;border-radius:6px;"></div>'
        '</div>'
        f'<div style="display:flex;flex-wrap:wrap;gap:2px;">{chip_html}</div>'
        '</div>'
    )


def _run_status_html(mode: str = "v1") -> str:
    """Render the run-completion badges for the given mode as HTML.

    A run counts as done when its ``taxonomy.json`` exists under ``DATA_DIR``.
    Mode ``"v1"`` shows the abstract / title / combined badges, mode ``"v2"``
    shows the SPECTER2 badge; any other mode yields an empty wrapper.
    """
    def _pill(text, finished, done_color="#22c55e"):
        background = done_color if finished else "#9ca3af"
        icon = "✅" if finished else "⏳"
        return (
            f'<span style="background:{background};color:#fff;padding:3px 12px;'
            f'border-radius:10px;font-size:12px;margin:2px;">{icon} {text}</span>'
        )

    abstract_ready = (DATA_DIR / "abstract" / "taxonomy.json").exists()
    title_ready = (DATA_DIR / "title" / "taxonomy.json").exists()
    classic_ready = abstract_ready and title_ready
    specter_ready = (DATA_DIR / "v2" / "taxonomy.json").exists()

    classic_pills = "".join([
        _pill("Abstract Run", abstract_ready),
        _pill("Title Run", title_ready),
        _pill("V1 Outputs", classic_ready),
    ])
    specter_pills = _pill("SPECTER2 Run", specter_ready, "#0891b2")

    pieces = ['<div style="padding:6px 0;">']
    if mode == "v1":
        pieces.append(
            '<div style="display:flex;flex-wrap:wrap;gap:4px;margin-bottom:4px;">'
            + classic_pills + '</div>'
        )
    if mode == "v2":
        pieces.append(
            '<div style="display:flex;flex-wrap:wrap;gap:4px;">'
            + specter_pills + '</div>'
        )
    pieces.append('</div>')
    return "".join(pieces)


def _safe_read_csv(path):
    try:
        return pd.read_csv(path, encoding="utf-8")
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding="latin-1")


def _summaries_path(run_config: str) -> Path:
    """Location of ``summaries.json`` inside the given run's data folder."""
    return DATA_DIR.joinpath(run_config, "summaries.json")

def _charts_path(run_config: str) -> Path:
    """Location of ``charts.json`` inside the given run's data folder."""
    return DATA_DIR.joinpath(run_config, "charts.json")

def _papers_path(run_config: str) -> Path:
    """Location of ``papers.csv`` inside the given run's data folder."""
    return DATA_DIR.joinpath(run_config, "papers.csv")

def _v2_summaries_path() -> Path:
    """Location of ``summaries.json`` for the SPECTER2 (v2) run."""
    return DATA_DIR.joinpath("v2", "summaries.json")

def _v2_charts_path() -> Path:
    """Location of ``charts.json`` for the SPECTER2 (v2) run."""
    return DATA_DIR.joinpath("v2", "charts.json")


def _active_run_for_table() -> str:
    """Pick which v1 run ("title" or "abstract") the review table should show.

    A run is "in review" when it has ``summaries.json`` but no ``themes.json``.
    Priority: title in review, then abstract in review, then title if it has
    summaries at all, otherwise abstract.
    """
    abstract_summarised = _summaries_path("abstract").exists()
    title_summarised = _summaries_path("title").exists()
    abstract_themed = (DATA_DIR / "abstract" / "themes.json").exists()
    title_themed = (DATA_DIR / "title" / "themes.json").exists()

    if title_summarised and not title_themed:
        return "title"
    if abstract_summarised and not abstract_themed:
        return "abstract"
    if title_summarised:
        return "title"
    return "abstract"


def _count_papers_per_topic(run_config: str) -> dict:
    """Estimate how many distinct papers contribute sentences to each topic.

    Args:
        run_config: Run folder name under ``DATA_DIR`` (e.g. "abstract" or "title").

    Returns:
        ``{topic_id: paper_count}``. Empty when the run has no summaries file.
        When ``papers.csv`` is missing the count is a rough estimate
        (``size // 4``, at least 1). When no text column can be found every
        topic maps to 0. Otherwise the count is at least 1 per topic.
    """
    sp = _summaries_path(run_config)
    if not sp.exists():
        return {}
    summaries = json.loads(sp.read_text())

    pp = _papers_path(run_config)
    if not pp.exists():
        return {s["topic_id"]: max(s.get("size", 0) // 4, 1) for s in summaries}

    papers_df = _safe_read_csv(pp)
    candidates = [
        c for c in papers_df.columns
        if "abstract" in str(c).lower() or "title" in str(c).lower()
    ]
    # Prefer the column named after this run: a CSV that holds both a title and
    # an abstract column must not match abstract sentences against titles.
    wanted = str(run_config).lower()
    text_col = next(
        (c for c in candidates if wanted in str(c).lower()),
        candidates[0] if candidates else None,
    )
    if text_col is None:
        return {s["topic_id"]: 0 for s in summaries}

    def sentence_key(sentence):
        # Sentences are matched on their first 80 stripped characters.
        return str(sentence).strip()[:80]

    # Later papers overwrite earlier ones when two sentences share a key.
    sent_to_paper = {}
    for idx, text in enumerate(papers_df[text_col].fillna("")):
        for sent in str(text).split("."):
            key = sentence_key(sent)
            if key:
                sent_to_paper[key] = idx

    def count_papers(summary):
        keys = map(sentence_key, summary.get("sentences", []))
        paper_ids = {sent_to_paper[k] for k in keys if k in sent_to_paper}
        return max(len(paper_ids), 1)

    return {s["topic_id"]: count_papers(s) for s in summaries}


def _build_review_table(run_config: str = "abstract") -> pd.DataFrame:
    """Build the editable v1 review table from a run's ``summaries.json``.

    Returns an empty frame with ``REVIEW_COLUMNS`` when the file is missing or
    holds no summaries. "Approve" starts unchecked and "Rename To" blank.
    """
    summaries_file = _summaries_path(run_config)
    if not summaries_file.exists():
        return pd.DataFrame(columns=REVIEW_COLUMNS)
    topics = json.loads(summaries_file.read_text())
    if not topics:
        return pd.DataFrame(columns=REVIEW_COLUMNS)

    counts = _count_papers_per_topic(run_config)
    table_rows = []
    for topic in topics:
        topic_id = int(topic.get("topic_id", 0))
        default_label = "Topic {}".format(topic.get("topic_id", ""))
        evidence = topic.get("top_evidence", [])[:2]
        table_rows.append([
            topic_id,
            str(topic.get("label", default_label)),
            str(" | ".join(evidence)),
            int(len(topic.get("sentences", []))),
            int(counts.get(topic_id, 0)),
            False,
            "",
            str(topic.get("reasoning", "")),
        ])
    return pd.DataFrame(table_rows, columns=REVIEW_COLUMNS)


def _build_v2_cluster_table() -> pd.DataFrame:
    """Build a read-only display table for v2 clusters.

    Reads the v2 ``summaries.json``; returns an empty frame with the same
    columns when the file does not exist yet.
    """
    cols = ["#", "Cluster Label", "Papers", "Vote Agreement",
            "LLM1 Vote", "LLM2 Vote", "LLM3 Vote", "Top 3 Titles"]
    sp = _v2_summaries_path()
    if not sp.exists():
        return pd.DataFrame(columns=cols)
    summaries = json.loads(sp.read_text())
    rows = [
        [
            int(s.get("cluster_id", 0)),
            str(s.get("label", "Cluster {}".format(s.get("cluster_id", "")))),
            int(s.get("paper_count", 0)),
            str(s.get("vote_agreement", "")),
            str(s.get("llm_vote_1_MISTRAL", "")),
            str(s.get("llm_vote_2_GEMINI", "")),
            str(s.get("llm_vote_3_GROQ", "")),
            # The column is "Top 3 Titles", so show up to three (was cut to two).
            str(" | ".join(s.get("top3_titles", [])[:3])),
        ]
        for s in summaries
    ]
    return pd.DataFrame(rows, columns=cols)


def _load_chart(chart_type: str, run_config: str, mode: str = "v1") -> str:
    """Return the stored HTML for one chart, or a placeholder message.

    In mode ``"v2"`` the v2 ``charts.json`` is read; otherwise the one for
    ``run_config``. If ``chart_type`` is not stored, the first stored chart is
    returned instead.
    """
    charts_file = _v2_charts_path() if mode == "v2" else _charts_path(run_config)
    if not charts_file.exists():
        return "<p style='color:#9ca3af;padding:20px;'>Charts appear after clustering completes.</p>"

    stored = json.loads(charts_file.read_text())
    stored_names = list(stored.keys())
    # v2 stores scatter and bar; v1 stores bar, histogram, scatter and treemap.
    if chart_type in stored:
        chosen = chart_type
    elif stored_names:
        chosen = stored_names[0]
    else:
        chosen = "bar"
    return stored.get(chosen, "<p>Chart not found.</p>")


def _get_download_files(mode: str = "v1"):
    """List the output files that currently exist for the given mode.

    Returns a list of path strings, or ``None`` when none of the expected
    files exist.
    """
    if mode == "v2":
        relative = [
            ("comparison_v2.csv",),
            ("v2", "cluster_audit.csv"),
            ("v2", "narrative_v2.txt"),
            ("v2", "summaries.json"),
            ("v2", "taxonomy.json"),
        ]
    else:
        relative = [("comparison.csv",), ("narrative.txt",)]
        for run_name in ("abstract", "title"):
            for file_name in ("summaries.json", "themes.json", "taxonomy.json"):
                relative.append((run_name, file_name))

    found = []
    for parts in relative:
        candidate = DATA_DIR.joinpath(*parts)
        if candidate.exists():
            found.append(str(candidate))
    return found or None


def handle_file_upload(file_path) -> str:
    """Copy an uploaded CSV to ``DATA_DIR/uploaded.csv`` and describe it.

    Returns an empty string when no file was given; otherwise a status message
    with the row count and first eight column names (or just the destination
    path if the saved file cannot be read back).
    """
    if not file_path:
        return ""

    target = DATA_DIR / "uploaded.csv"
    source_abs = Path(file_path).resolve()
    target_abs = target.resolve()
    # Skip the copy when the upload already is the destination file.
    if source_abs != target_abs:
        shutil.copy(str(source_abs), str(target_abs))

    try:
        frame = _safe_read_csv(target)
        shown_columns = ", ".join(list(frame.columns[:8]))
        summary = "✅ CSV saved — {} rows, columns: {}. ".format(len(frame), shown_columns)
    except Exception:
        summary = "✅ CSV saved to {}. ".format(target)
    return summary + "Select a mode below and type the run command."


def reset_all_data() -> tuple:
    """Wipe ``DATA_DIR`` and return a fresh value for every UI output.

    Returns:
        A 14-tuple: chatbot history, chat input, progress HTML, run-status HTML,
        empty v1 review table, empty v2 cluster table, chart placeholder,
        downloads, three new thread ids (abstract, title, v2), current run,
        table-status HTML and file-status text.
    """
    try:
        # NOTE(review): this resets the literal thread id "default", not the
        # UUID held in the UI state. Confirm that is intended.
        reset_thread_v2("default")
    except Exception as exc:
        # Best effort: a failed v2 reset must not block clearing the data,
        # but it should be visible in the log rather than silently dropped.
        print("reset_all_data: v2 thread reset failed:", exc)

    if DATA_DIR.exists():
        shutil.rmtree(str(DATA_DIR))
    DATA_DIR.mkdir(exist_ok=True)

    empty_v1 = pd.DataFrame(columns=REVIEW_COLUMNS)
    empty_v2 = pd.DataFrame(columns=["#", "Cluster Label", "Papers",
                                     "Vote Agreement", "LLM1 Vote",
                                     "LLM2 Vote", "LLM3 Vote", "Top 3 Titles"])
    empty_chart = "<p style='color:#9ca3af;padding:20px;'>Charts appear after clustering.</p>"
    status_msg = (
        "<div style='padding:10px;background:#fef3c7;border-radius:6px;"
        "font-family:sans-serif;font-size:13px;'>"
        "🔄 <b>All data cleared.</b> Upload a new CSV and begin."
        "</div>"
    )
    return (
        [],                          # chatbot
        "",                          # chat input
        make_progress_html(0),       # progress
        _run_status_html("v1"),      # run status
        empty_v1,                    # v1 review table
        empty_v2,                    # v2 cluster table
        empty_chart,                 # chart
        None,                        # downloads
        new_thread_id(),             # abstract thread
        new_thread_id(),             # title thread
        new_thread_id(),             # v2 thread
        "abstract",                  # current_run (v1)
        status_msg,                  # table_status
        "",                          # file_status
    )


def _detect_phase(text: str, mode: str = "v1") -> int:
    phase_map_v1 = {
        "phase 5.5": 5, "phase 6": 6, "phase 5": 4,
        "phase 4": 3,   "phase 3": 2, "phase 2": 1, "phase 1": 0,
    }
    phase_map_v2 = {
        "phase 5": 4, "phase 4": 3, "phase 3": 2, "phase 2": 1, "phase 1": 0,
        "specter2 run complete": 4,
        "final outputs": 4,
        "pajais mapping": 3,
        "council": 2,
        "hdbscan": 1,
    }
    lower = text.lower()
    phase_map = phase_map_v1 if mode == "v1" else phase_map_v2
    for key, val in phase_map.items():
        if key in lower:
            return val
    return 0


def _detect_run_label(text: str) -> str:
    lower = text.lower()
    return (
        "TITLE RUN"    if "title run" in lower or "title phase" in lower else
        "ABSTRACT RUN" if "abstract run" in lower or "abstract phase" in lower else
        "SPECTER2 RUN" if "specter" in lower or "v2" in lower else
        ""
    )


def _stream_agent(user_message: str, thread_id: str, mode: str = "v1") -> str:
    """Send one user message to the selected agent and return its final reply.

    Args:
        user_message: Text to send as the user turn.
        thread_id: Conversation thread passed to the agent as
            ``configurable.thread_id``.
        mode: ``"v1"`` selects the Classic agent; anything else the SPECTER2 one.

    Returns:
        The last non-empty message content seen while streaming,
        ``"(no response)"`` if there was none, or an error string when the
        selected agent failed to import.
    """
    import time

    # Pause before the single retry; retrying a rate-limited call at once
    # is likely to hit the same limit again.
    retry_delay_seconds = 2.0

    agent_obj = agent if mode == "v1" else agent_v2
    clean_fn = clean_thread_history if mode == "v1" else clean_thread_history_v2
    agent_ok = AGENT_V1_OK if mode == "v1" else AGENT_V2_OK

    if not agent_ok:
        return "ERROR: {} agent not loaded. Check terminal.".format(
            "Classic" if mode == "v1" else "SPECTER2"
        )

    def _do_stream() -> str:
        clean_fn(thread_id)
        config = {"configurable": {"thread_id": thread_id}}
        full_reply = ""
        for chunk in agent_obj.stream(
            {"messages": [{"role": "user", "content": user_message}]},
            config=config,
            stream_mode="values",
        ):
            last_msg = chunk["messages"][-1]
            content = getattr(last_msg, "content", "")
            if isinstance(content, list):
                # Content given as a list of parts: join their text.
                content = " ".join(
                    c.get("text", "") if isinstance(c, dict) else str(c)
                    for c in content
                )
            if content:
                # Keep only the most recent non-empty content.
                full_reply = content
        return full_reply or "(no response)"

    result = _do_stream()
    # NOTE(review): this is a substring test, so a normal reply that happens to
    # contain "429" also triggers a second run. Confirm the error format.
    is_rate_limited = (
        "429" in result
        or "rate limit" in result.lower()
        or "rate_limited" in result.lower()
    )
    if not is_rate_limited:
        return result
    time.sleep(retry_delay_seconds)
    return _do_stream()


def _generate_final_v1_directly(history: list) -> str:
    """Invoke the comparison-CSV and narrative tools and summarise the result.

    ``history`` is accepted but not used. Each tool result is parsed as JSON
    only when it starts with ``{``; missing fields are shown as ``?``.
    """
    from tools import generate_comparison_csv, export_narrative

    def _as_dict(raw):
        return json.loads(raw) if raw.strip().startswith("{") else {}

    csv_raw = generate_comparison_csv.invoke({})
    narrative_raw = export_narrative.invoke({})
    csv_meta = _as_dict(csv_raw)
    narrative_meta = _as_dict(narrative_raw)

    row_count = csv_meta.get("rows", "?")
    column_preview = ", ".join(csv_meta.get("columns", [])[:5]) + "..."
    word_count = narrative_meta.get("word_count", "?")
    return (
        "Both runs complete. Final outputs generated. "
        f"comparison.csv has {row_count} rows with columns: {column_preview}. "
        f"narrative.txt has {word_count} words. "
        "Both files are in the Download tab."
    )


def run_agent(
    user_message: str,
    history: list,
    abstract_thread: str,
    title_thread: str,
    v2_thread: str,
    current_run: str,
    current_mode: str,
) -> tuple:
    """Handle one chat submission and return the refreshed UI state.

    A blank message only refreshes the widgets. Otherwise the message goes to
    the agent thread for the active mode and run, except in v1 when both
    taxonomies exist, ``comparison.csv`` does not, and the message contains one
    of the "final output" keywords: then the final files are generated directly.

    Returns:
        A 12-tuple: history, cleared input, progress HTML, run-status HTML,
        v1 review table, v2 cluster table, chart HTML, download files, the
        three thread ids unchanged, and the active v1 run.
    """
    mode = current_mode or "v1"

    def _ui_state(chat, progress_html, run_name):
        table_run = _active_run_for_table()
        return (
            chat, "",
            progress_html,
            _run_status_html(mode),
            _build_review_table(table_run),
            _build_v2_cluster_table(),
            _load_chart("bar", table_run, mode),
            _get_download_files(mode),
            abstract_thread, title_thread, v2_thread, run_name,
        )

    if not user_message or not user_message.strip():
        return _ui_state(history or [], make_progress_html(0, mode=mode), current_run)

    lowered = user_message.lower().strip()

    # An explicit "run title" / "run abstract" command switches the v1 run.
    if "run title" in lowered:
        active_run = "title"
    elif "run abstract" in lowered:
        active_run = "abstract"
    else:
        active_run = current_run

    # Conditions for the v1 shortcut that writes the final outputs directly.
    abstract_done = (DATA_DIR / "abstract" / "taxonomy.json").exists()
    title_done = (DATA_DIR / "title" / "taxonomy.json").exists()
    final_keywords = ("yes", "generate", "final", "comparison", "narrative", "output")
    finals_requested = any(word in lowered for word in final_keywords)

    chat = list(history or [])
    chat.append({"role": "user", "content": user_message})

    if (
        mode == "v1"
        and abstract_done
        and title_done
        and finals_requested
        and not (DATA_DIR / "comparison.csv").exists()
    ):
        reply = _generate_final_v1_directly(chat)
    else:
        if mode == "v2":
            target_thread = v2_thread
        elif active_run == "title":
            target_thread = title_thread
        else:
            target_thread = abstract_thread
        reply = _stream_agent(user_message, target_thread, mode=mode)

    chat.append({"role": "assistant", "content": reply})
    progress_html = make_progress_html(
        _detect_phase(reply, mode), _detect_run_label(reply), mode
    )
    return _ui_state(chat, progress_html, active_run)


def handle_submit_review(
    review_data,
    history: list,
    abstract_thread: str,
    title_thread: str,
    v2_thread: str,
    current_run: str,
    current_mode: str,
) -> tuple:
    """Turn the edited review table into a theme-consolidation request.

    ``review_data`` may be a dict with "data"/"headers", a DataFrame, or raw
    rows. Approved rows are grouped by theme name ("Rename To" when filled,
    else the topic label) and forwarded to the agent through ``run_agent``.
    With no approved rows, a how-to message is appended to the chat instead.

    Returns:
        The same 12-tuple as ``run_agent``.
    """
    def _forward(message):
        return run_agent(
            message,
            history, abstract_thread, title_thread, v2_thread, current_run, current_mode
        )

    if review_data is None:
        return _forward("Review table empty — waiting for Phase 2.")

    if isinstance(review_data, dict):
        df = pd.DataFrame(
            review_data.get("data", []),
            columns=review_data.get("headers", REVIEW_COLUMNS),
        )
    elif isinstance(review_data, pd.DataFrame):
        df = review_data.copy()
    else:
        df = pd.DataFrame(review_data, columns=REVIEW_COLUMNS)

    if df.empty:
        return _forward("Review table empty — waiting for Phase 2.")

    df.columns = pd.Index([str(name) for name in df.columns])

    def _find_column(matches, fallback=None):
        return next((name for name in df.columns if matches(name)), fallback)

    approve_col = _find_column(lambda name: "approve" in name.lower())
    id_col = _find_column(lambda name: name.strip() == "#", df.columns[0])
    label_col = _find_column(lambda name: "label" in name.lower(), df.columns[1])
    rename_col = _find_column(lambda name: "rename" in name.lower())

    if approve_col is None:
        return _forward("Cannot find Approve column in table.")

    # Checkbox cells may arrive as booleans or as text.
    truthy_text = ("true", "1", "yes", "x", "on", "✓")
    approved_mask = [
        cell is True or str(cell).strip().lower() in truthy_text
        for cell in list(df[approve_col])
    ]
    approved_df = df[pd.Series(approved_mask, index=df.index)]

    if len(approved_df) == 0:
        guide = (
            "⚠️ **No topics approved yet.**\n\n"
            "**To approve topics:**\n"
            "1. Click **🔄 Refresh Table** to load latest topics\n"
            "2. Click the checkbox ☐ in **Approve** column\n"
            "3. Fill **Rename To** with a theme name\n"
            "4. Click **✅ Submit Review** again"
        )
        chat = list(history or [])
        chat.append({"role": "assistant", "content": guide})
        table_run = _active_run_for_table()
        return (
            chat, "",
            make_progress_html(1),
            _run_status_html("v1"),
            _build_review_table(table_run),
            _build_v2_cluster_table(),
            _load_chart("bar", table_run, "v1"),
            _get_download_files("v1"),
            abstract_thread, title_thread, v2_thread, current_run,
        )

    # Group approved topic ids under their final theme name.
    theme_map: dict = {}
    blank_names = ("", "nan", "none")
    for position in range(len(approved_df)):
        row = approved_df.iloc[position]
        rename_val = str(row[rename_col]).strip() if rename_col else ""
        if rename_val and rename_val.lower() not in blank_names:
            theme = rename_val
        else:
            theme = str(row[label_col])
        try:
            topic_id = int(float(str(row[id_col])))
        except (ValueError, TypeError):
            # Unparseable id: fall back to the row's position among approved rows.
            topic_id = position
        if theme not in theme_map:
            theme_map[theme] = []
        theme_map[theme].append(topic_id)

    groups = [
        {"theme_name": theme_name, "topic_ids": topic_ids}
        for theme_name, topic_ids in theme_map.items()
    ]
    groups_json = json.dumps(groups, indent=2)

    msg = (
        f"Researcher submitted the Review Table for the {current_run} run.\n"
        f"{len(approved_df)} topics approved, {len(groups)} themes:\n\n"
        f"```json\n{groups_json}\n```\n\n"
        f"Call consolidate_into_themes with run_config='{current_run}' "
        "and the approved_groups JSON above. Then proceed to Phase 3."
    )
    return _forward(msg)


def switch_mode(new_mode: str, current_mode: str, abstract_thread: str, title_thread: str, v2_thread: str, current_run: str) -> tuple:
    """Switch between Classic and SPECTER2 modes, refreshing UI accordingly.

    Only ``new_mode`` is used; the other parameters are accepted but not read.

    Returns:
        A 9-tuple: progress HTML, run-status HTML, v1 review table, v2 cluster
        table, chart HTML, download files, mode description markdown, a
        dropdown update with the mode's chart choices, and the new mode.
    """
    cfg = _active_run_for_table()
    mode_label_text = (
        "### 🔬 Classic Mode (BERTopic)\n"
        "Run abstract analysis, then title analysis. 6 Braun & Clarke phases each.\n"
        "Commands: **run abstract** → review → **run title** → review → download"
        if new_mode == "v1" else
        "### 🧬 SPECTER2 Mode (Advanced)\n"
        "One combined run per paper (Title+Abstract). UMAP+HDBSCAN clustering. "
        "Council-of-3 LLM labeling with audit trail.\n"
        "Command: **run specter** or **run v2**"
    )
    chart_opts = CHART_OPTIONS if new_mode == "v1" else ["scatter", "bar"]
    # Load the chart the dropdown is reset to. It used to be always "bar",
    # which disagreed with the "scatter" selection shown in SPECTER2 mode.
    selected_chart = chart_opts[0]
    return (
        make_progress_html(0, mode=new_mode),
        _run_status_html(new_mode),
        _build_review_table(cfg),
        _build_v2_cluster_table(),
        _load_chart(selected_chart, cfg, new_mode),
        _get_download_files(new_mode),
        mode_label_text,
        gr.update(choices=chart_opts, value=selected_chart),
        new_mode,
    )


def manual_refresh_table(current_run: str, current_mode: str) -> tuple:
    """Rebuild both tables from disk; the two arguments are not used."""
    review_table = _build_review_table(_active_run_for_table())
    cluster_table = _build_v2_cluster_table()
    return review_table, cluster_table


def refresh_chart(chart_type: str, current_run: str, current_mode: str) -> str:
    """Reload the chosen chart for the active run; ``current_run`` is not used."""
    table_run = _active_run_for_table()
    return _load_chart(chart_type, table_run, current_mode or "v1")


def check_status(current_mode: str) -> str:
    """Return an HTML banner summarising the topics/clusters produced so far.

    For mode ``"v2"`` the file at ``_v2_summaries_path()`` is inspected; for
    any other value (an empty or ``None`` mode falls back to ``"v1"``) the
    summaries file of the active Classic run is used.  If the file does not
    exist a "nothing yet" hint is returned, otherwise a banner with the number
    of entries and how many of them carry a label.

    Args:
        current_mode: ``"v1"`` (Classic) or ``"v2"`` (SPECTER2).

    Returns:
        An HTML ``<div>`` string for the status component.
    """

    def _label_of(summary) -> str:
        # dict.get() only falls back to its default when the key is absent; a
        # stored null label would come back as None and break .strip() below.
        return summary.get("label") or ""

    mode = current_mode or "v1"

    if mode == "v2":
        sp = _v2_summaries_path()
        if not sp.exists():
            return (
                "<div style='padding:10px;background:#fef3c7;border-radius:6px;"
                "font-family:sans-serif;font-size:13px;'>"
                "⏳ No v2 clusters yet. Type <b>run specter</b> to begin."
                "</div>"
            )
        summaries = json.loads(sp.read_text())
        labeled = sum(1 for s in summaries if _label_of(s).strip())
        return (
            "<div style='padding:10px;background:#dcfce7;border-radius:6px;"
            "font-family:sans-serif;font-size:13px;'>"
            "✅ <b>{} clusters</b> in <code>data/v2/</code> ({} labeled). "
            "Click 🔄 Refresh to display."
            "</div>"
        ).format(len(summaries), labeled)

    cfg = _active_run_for_table()
    sp = _summaries_path(cfg)
    if not sp.exists():
        return (
            "<div style='padding:10px;background:#fef3c7;border-radius:6px;"
            "font-family:sans-serif;font-size:13px;'>"
            "⏳ No topics yet. Upload CSV then type <b>run abstract</b>."
            "</div>"
        )
    summaries = json.loads(sp.read_text())
    labels = [_label_of(s) for s in summaries]
    # Labels starting with "Topic " are not counted as LLM-labelled
    # (presumably auto-generated placeholder names — confirm with the writer).
    labeled = sum(
        1 for label in labels
        if label.strip() and not label.startswith("Topic ")
    )
    return (
        "<div style='padding:10px;background:#dcfce7;border-radius:6px;"
        "font-family:sans-serif;font-size:13px;'>"
        "✅ <b>{} topics</b> from <code>data/{}/</code> ({} LLM-labelled). "
        "Click 🔄 Refresh Table."
        "</div>"
    ).format(len(summaries), cfg, labeled)


# Startup progress marker printed while the module is being executed.
print("Step 4: building UI...")

# ── UI ─────────────────────────────────────────────────────────────────────────
# Everything inside this `with` block declares the Gradio layout and its event
# wiring; the resulting app object is bound to `demo`.
# NOTE(review): the two CSS classes below are not attached to any component in
# the visible part of this file — confirm they are still used.
with gr.Blocks(
    title="BERTopic / SPECTER2 Thematic Analysis Agent",
    css="""
    .mode-btn-active { border: 2px solid #4f46e5 !important; background: #eef2ff !important; }
    .mode-btn-v2-active { border: 2px solid #0891b2 !important; background: #ecfeff !important; }
    """
) as demo:

    # State holders passed to the callbacks: one thread id per run type, the
    # current run name and the current analysis mode.
    # NOTE(review): new_thread_id() is called once here, while the UI is built,
    # so the initial ids are fixed at that point — confirm that sharing them
    # across browser sessions is intended.
    abstract_thread_state = gr.State(new_thread_id())
    title_thread_state    = gr.State(new_thread_id())
    v2_thread_state       = gr.State(new_thread_id())
    current_run_state     = gr.State("abstract")
    current_mode_state    = gr.State("v1")

    gr.Markdown(
        "# 🔬 Thematic Analysis Agent\n"
        "**Braun & Clarke (2006)** · SPECTER2 · PAJAIS Taxonomy · Systematic Literature Review"
    )

    # Progress and run-status banners start in the Classic ("v1") state.
    progress_bar = gr.HTML(make_progress_html(0))
    run_status   = gr.HTML(_run_status_html("v1"))

    # ── MODE SELECTOR ──────────────────────────────────────────────────────────
    # Two buttons choose the analysis mode; their click handlers are wired
    # further down in this block.
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 🔀 Analysis Mode")
            with gr.Row():
                btn_v1 = gr.Button(
                    "📊 Classic (BERTopic)\nAbstract + Title runs",
                    variant="primary", size="sm",
                )
                btn_v2 = gr.Button(
                    "🧬 SPECTER2 (Advanced)\nCombined T+A · HDBSCAN · Council-3-LLMs",
                    variant="secondary", size="sm",
                )

    # Description of the active mode; switch_mode() returns replacement text
    # for this component.
    mode_description = gr.Markdown(
        "### 📊 Classic Mode (BERTopic)\n"
        "Run abstract analysis, then title analysis. 6 Braun & Clarke phases each.\n"
        "Commands: **run abstract** → review → **run title** → review → download"
    )

    # Static quick-start banner listing the steps of both workflows.
    gr.HTML("""
    <div style="background:#f0fdf4;border:1px solid #86efac;border-radius:8px;
                padding:10px 14px;font-family:sans-serif;font-size:13px;margin:4px 0;">
      <b>Classic:</b>
      1️⃣ Upload CSV &nbsp;→&nbsp; 2️⃣ <code>run abstract</code> &nbsp;→&nbsp;
      3️⃣ Review Table &nbsp;→&nbsp; 4️⃣ <code>run title</code> &nbsp;→&nbsp; 5️⃣ Download
      &nbsp;&nbsp;|&nbsp;&nbsp;
      <b>SPECTER2:</b>
      1️⃣ Upload CSV &nbsp;→&nbsp; 2️⃣ <code>run specter</code> &nbsp;→&nbsp; 3️⃣ Download
    </div>
    """)

    # ── Section 1 ─────────────────────────────────────────────────────────────
    with gr.Accordion("📂 Section 1 — Data Input", open=True):
        def _startup_msg():
            """Return the HTML banner describing data left over in ``DATA_DIR``.

            Looks for ``uploaded.csv`` and for a ``taxonomy.json`` under the
            ``abstract``, ``title`` and ``v2`` sub-directories.  If any of them
            exists a warning banner with one marker per item is returned,
            otherwise a "fresh session" banner.
            """
            abs_done = (DATA_DIR / "abstract" / "taxonomy.json").exists()
            title_done = (DATA_DIR / "title" / "taxonomy.json").exists()
            v2_done = (DATA_DIR / "v2" / "taxonomy.json").exists()
            csv_exists = (DATA_DIR / "uploaded.csv").exists()

            # Nothing on disk: no placeholders to fill in.
            if not (csv_exists or abs_done or title_done or v2_done):
                return (
                    "<div style='padding:10px;background:#f0fdf4;border:1px solid #86efac;"
                    "border-radius:8px;font-family:sans-serif;font-size:13px;'>"
                    "✅ Fresh session — upload your CSV to begin."
                    "</div>"
                )

            def _mark(present, missing="⏳"):
                # Tick when the artefact exists, otherwise the given marker.
                return "✅" if present else missing

            warning = (
                "<div style='padding:10px;background:#fef3c7;border:1px solid #fcd34d;"
                "border-radius:8px;font-family:sans-serif;font-size:13px;'>"
                "⚠️ <b>Previous session data detected.</b> "
                "Abstract: {abs} &nbsp;|&nbsp; Title: {title} &nbsp;|&nbsp; "
                "SPECTER2: {v2} &nbsp;|&nbsp; CSV: {csv}<br>"
                "Click <b>🗑️ Reset</b> to clear or continue from where you left off."
                "</div>"
            )
            return warning.format(
                abs=_mark(abs_done),
                title=_mark(title_done),
                v2=_mark(v2_done),
                csv=_mark(csv_exists, "❌"),
            )

        # The banner text is computed once, when the UI is built.
        startup_banner = gr.HTML(_startup_msg())
        with gr.Row():
            # CSV upload; the component passes a file path to its handler.
            file_input = gr.File(
                label="Upload Scopus CSV", file_types=[".csv"],
                type="filepath", scale=4,
            )
            # Click handler (reset_all_data) is attached in the wiring section.
            reset_btn = gr.Button(
                "🗑️ Reset & Start Fresh",
                variant="stop", scale=1, size="sm",
            )
        # Read-only box showing whatever handle_file_upload returns.
        file_status = gr.Textbox(label="Upload status", interactive=False, lines=2)
        file_input.change(fn=handle_file_upload, inputs=file_input, outputs=file_status)

    # ── Section 2 ─────────────────────────────────────────────────────────────
    # Chat panel: command cheat-sheet, conversation view and message input.
    with gr.Accordion("💬 Section 2 — Agent Conversation", open=True):
        # Static reminder of the text commands for each mode.
        gr.HTML("""
        <div style="background:#fafafa;border:1px solid #e5e7eb;border-radius:6px;
                    padding:8px 12px;font-size:12px;font-family:monospace;margin-bottom:6px;">
          Classic: <b>run abstract</b> | <b>run title</b> | <b>yes</b> | <b>satisfied</b> | <b>confirm</b>
          &nbsp;&nbsp;|&nbsp;&nbsp;
          SPECTER2: <b>run specter</b> | <b>run v2</b> | <b>yes</b>
        </div>
        """)
        chatbot = gr.Chatbot(label="Agent", height=500)
        with gr.Row():
            # Both pressing Enter in this box and the Send button call
            # run_agent (wired further down).
            chat_input = gr.Textbox(
                label="Message",
                placeholder="e.g.  run abstract  or  run specter",
                lines=2, scale=5,
            )
            send_btn = gr.Button("Send ➤", variant="primary", scale=1)

    # ── Section 3 ─────────────────────────────────────────────────────────────
    # Results panel with four tabs: Classic review table, SPECTER2 cluster
    # table, charts and downloads.
    with gr.Accordion("📊 Section 3 — Results", open=True):
        with gr.Tabs():

            with gr.Tab("📋 Review Table (Classic)"):
                gr.HTML("""
                <div style="background:#eff6ff;border:1px solid #bfdbfe;border-radius:8px;
                            padding:8px 12px;font-family:sans-serif;font-size:13px;">
                  After Phase 2 (Classic): Refresh → tick Approve → fill Rename To → Submit Review
                </div>
                """)
                # Status line updated by both "Check Status" buttons.
                table_status = gr.HTML(
                    "<div style='padding:8px;color:#6b7280;font-size:13px;'>"
                    "Complete Phase 2 (Classic) then Refresh.</div>"
                )
                with gr.Row():
                    refresh_btn      = gr.Button("🔄 Refresh Table", variant="secondary", scale=2)
                    check_status_btn = gr.Button("📊 Check Status",  variant="secondary", scale=1)
                # Editable table; starts empty with the REVIEW_COLUMNS headers.
                # NOTE(review): datatype lists 8 entries — assumes REVIEW_COLUMNS
                # has 8 columns in the same order; confirm against its definition.
                review_table = gr.Dataframe(
                    value=pd.DataFrame(columns=REVIEW_COLUMNS),
                    headers=REVIEW_COLUMNS,
                    datatype=["number","str","str","number","number","bool","str","str"],
                    interactive=True, wrap=True,
                    label="Topic Review Table (Classic Mode)",
                )
                submit_review_btn = gr.Button("✅ Submit Review", variant="primary", size="lg")

            with gr.Tab("🧬 Cluster View (SPECTER2)"):
                gr.HTML("""
                <div style="background:#ecfeff;border:1px solid #a5f3fc;border-radius:8px;
                            padding:8px 12px;font-family:sans-serif;font-size:13px;">
                  Clusters appear after Phase 3 (Council Labeling) completes. Read-only — no manual review needed.
                  Download the <b>cluster_audit.csv</b> for full LLM voting details.
                </div>
                """)
                with gr.Row():
                    refresh_v2_btn = gr.Button("🔄 Refresh Clusters", variant="secondary", scale=2)
                    check_v2_btn   = gr.Button("📊 Check V2 Status",  variant="secondary", scale=1)
                # Read-only table; the same eight column names are given both
                # as the empty DataFrame's columns and as the headers.
                v2_cluster_table = gr.Dataframe(
                    value=pd.DataFrame(columns=["#", "Cluster Label", "Papers",
                                                "Vote Agreement", "LLM1 Vote",
                                                "LLM2 Vote", "LLM3 Vote", "Top 3 Titles"]),
                    headers=["#", "Cluster Label", "Papers", "Vote Agreement",
                             "LLM1 Vote", "LLM2 Vote", "LLM3 Vote", "Top 3 Titles"],
                    datatype=["number","str","number","str","str","str","str","str"],
                    interactive=False, wrap=True,
                    label="SPECTER2 Cluster Table (Read-only)",
                )

            with gr.Tab("📈 Charts"):
                # Starts with the Classic chart list; switch_mode() replaces the
                # choices when the mode changes.
                chart_selector = gr.Dropdown(
                    choices=CHART_OPTIONS, value="bar",
                    label="Select Chart", interactive=True,
                )
                chart_display = gr.HTML(
                    "<p style='color:#9ca3af;padding:20px;'>Charts appear after clustering.</p>"
                )
                # Re-render the chart whenever the dropdown value changes.
                chart_selector.change(
                    fn=refresh_chart,
                    inputs=[chart_selector, current_run_state, current_mode_state],
                    outputs=chart_display,
                )

            with gr.Tab("⬇️ Download"):
                gr.Markdown(
                    "**Classic outputs** appear after both abstract+title runs complete.\n\n"
                    "**SPECTER2 outputs** appear after v2 run completes:\n"
                    "- `comparison_v2.csv` — one row per paper with cluster + PAJAIS\n"
                    "- `cluster_audit.csv` — full LLM voting record, per paper\n"
                    "- `narrative_v2.txt` — 500-word Section 7 discussion\n"
                    "> 💡 **Cache:** `data/v2/llm_cache/` stores LLM responses — "
                    "delete this folder to force fresh labels on re-run.\n"
                )
                # Output-only file list (not interactive).
                download_files = gr.File(
                    label="Output Files", file_count="multiple", interactive=False,
                )

    # ── wire up — combined outputs ─────────────────────────────────────────────
    # Components updated by the agent-driven handlers (send, submit, review).
    agent_outputs = [
        chatbot,
        chat_input,
        progress_bar,
        run_status,
        review_table,
        v2_cluster_table,
        chart_display,
        download_files,
        abstract_thread_state,
        title_thread_state,
        v2_thread_state,
        current_run_state,
    ]

    # Reset updates everything an agent turn does, plus the two status widgets.
    reset_outputs = [*agent_outputs, table_status, file_status]

    # Same order as the tuple returned by switch_mode().
    mode_switch_outputs = [
        progress_bar,
        run_status,
        review_table,
        v2_cluster_table,
        chart_display,
        download_files,
        mode_description,
        chart_selector,
        current_mode_state,
    ]

    # Send button and Enter in the message box both run one agent turn with
    # the same inputs and outputs.
    send_btn.click(
        fn=run_agent,
        inputs=[chat_input, chatbot,
                abstract_thread_state, title_thread_state, v2_thread_state,
                current_run_state, current_mode_state],
        outputs=agent_outputs,
    )
    chat_input.submit(
        fn=run_agent,
        inputs=[chat_input, chatbot,
                abstract_thread_state, title_thread_state, v2_thread_state,
                current_run_state, current_mode_state],
        outputs=agent_outputs,
    )
    # Submitting the review passes the edited table in place of a chat message.
    submit_review_btn.click(
        fn=handle_submit_review,
        inputs=[review_table, chatbot,
                abstract_thread_state, title_thread_state, v2_thread_state,
                current_run_state, current_mode_state],
        outputs=agent_outputs,
    )
    # Reset takes no inputs and also updates the table and upload status boxes.
    reset_btn.click(
        fn=reset_all_data,
        inputs=[],
        outputs=reset_outputs,
    )

    # Mode buttons: each lambda fixes the target mode and forwards the five
    # state values to switch_mode().
    btn_v1.click(
        fn=lambda m, at, tt, vt, cr: switch_mode("v1", m, at, tt, vt, cr),
        inputs=[current_mode_state, abstract_thread_state, title_thread_state,
                v2_thread_state, current_run_state],
        outputs=mode_switch_outputs,
    )
    btn_v2.click(
        fn=lambda m, at, tt, vt, cr: switch_mode("v2", m, at, tt, vt, cr),
        inputs=[current_mode_state, abstract_thread_state, title_thread_state,
                v2_thread_state, current_run_state],
        outputs=mode_switch_outputs,
    )

    # Both refresh buttons call the same handler, which rebuilds the Classic
    # and the SPECTER2 table together.
    refresh_btn.click(
        fn=manual_refresh_table,
        inputs=[current_run_state, current_mode_state],
        outputs=[review_table, v2_cluster_table],
    )
    refresh_v2_btn.click(
        fn=manual_refresh_table,
        inputs=[current_run_state, current_mode_state],
        outputs=[review_table, v2_cluster_table],
    )
    # Status for whichever mode is currently active, shown in table_status.
    check_status_btn.click(
        fn=check_status,
        inputs=[current_mode_state],
        outputs=[table_status],
    )
    # Always report SPECTER2 status, regardless of the active mode. The handler
    # takes no arguments, so no inputs are wired: passing a component here
    # would make Gradio call the zero-argument lambda with one value and fail.
    # NOTE(review): table_status is declared inside the Classic tab, so the
    # result is not visible from the SPECTER2 tab — consider a dedicated
    # status component there.
    check_v2_btn.click(
        fn=lambda: check_status("v2"),
        inputs=[],
        outputs=[table_status],
    )

# Startup progress marker; printed whenever the module body finishes building
# the UI, including on import.
print("Step 5: UI built OK, launching...")

if __name__ == "__main__":
    import subprocess
    import sys

    # (major, minor) of the installed Gradio, used to gate a launch option.
    _v = tuple(map(int, gr.__version__.split(".")[:2]))
    print("Gradio version:", gr.__version__)

    _kwargs = dict(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        inbrowser=False,
    )
    # ssr_mode is only passed on Gradio 5.0 and later.
    if _v >= (5, 0):
        _kwargs.update(ssr_mode=False)

    print("Running at http://0.0.0.0:7860")
    # Start check_keys.py with the same interpreter; it is not waited on.
    subprocess.Popen([sys.executable, "check_keys.py"])
    demo.launch(**_kwargs)