# Thematic Analysis Agent

"""
app.py — Gradio UI for BERTopic Agentic AI Application (~370 lines)
Sections: ① Data Input  ② Agent Conversation  ③ Results (Table | Charts | Download)
Rules: ZERO business logic here. All decisions made by agent.py.
"""

import os
import json
import glob
import gradio as gr
from agent import invoke_agent

# Directory holding the checkpoint files this UI reads (paths are built by ckpt()).
CHECKPOINT_DIR = "checkpoints"
# Create the directory at import time; exist_ok=True makes this a no-op when it already exists.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Path of "uploaded.csv" inside the checkpoint directory.
# NOTE(review): not referenced anywhere else in the visible code — presumably
# consumed by another module; confirm before removing.
CSV_PATH = os.path.join(CHECKPOINT_DIR, "uploaded.csv")

# ── Checkpoint file paths ──────────────────────────────────────────────────────
def ckpt(name):
    """Return the path of ``name`` (a file name or glob pattern) inside CHECKPOINT_DIR."""
    checkpoint_path = os.path.join(CHECKPOINT_DIR, name)
    return checkpoint_path


# ── Phase progress HTML ────────────────────────────────────────────────────────
def build_phase_bar():
    """Render the phase progress bar as an HTML string.

    Every phase is drawn as a pill. A pill is green with a check mark when the
    checkpoint file associated with the phase exists in the checkpoint
    directory, and grey with an empty box otherwise.

    Returns:
        str: A single ``<div>`` containing the "B&C PHASES:" caption followed
        by one pill per phase.
    """
    # (display label, checkpoint file whose presence marks the phase as done)
    # NOTE(review): ③, ④ and ⑤ all key off abstract_themes.json, so those three
    # pills always switch together — confirm this is intended.
    phases = [
        ("① Load", "stats.json"),
        ("② Codes", "abstract_labels.json"),
        ("③ Themes", "abstract_themes.json"),
        ("④ Saturation", "abstract_themes.json"),
        ("⑤ Names", "abstract_themes.json"),
        ("⑤½ PAJAIS", "abstract_taxonomy_map.json"),
        ("⑥ Report", "comparison.csv"),
    ]

    def render_pill(label, filename):
        # Query the filesystem once per phase so the background, text colour
        # and icon of a pill can never disagree with each other.
        done = os.path.exists(ckpt(filename))
        background = "#22c55e" if done else "#374151"
        text_colour = "#fff" if done else "#9ca3af"
        icon = "✅" if done else "⬜"
        return (
            '<div style="display:inline-flex;align-items:center;gap:6px;'
            'padding:6px 14px;border-radius:20px;font-size:13px;font-weight:600;'
            f'background:{background};'
            f'color:{text_colour};">'
            f'{icon} {label}</div>'
        )

    items = [render_pill(label, filename) for label, filename in phases]
    bar = (
        '<div style="background:#111827;padding:12px 16px;border-radius:12px;'
        'border:1px solid #1f2937;display:flex;flex-wrap:wrap;gap:8px;align-items:center;">'
        '<span style="color:#6b7280;font-size:12px;font-weight:700;margin-right:4px;">B&amp;C PHASES:</span>'
        + "".join(items)
        + "</div>"
    )
    return bar


# ── Review table loading ───────────────────────────────────────────────────────
def load_review_table():
    """Build the review table from the most advanced usable checkpoint.

    Checkpoints are tried in priority order:
    taxonomy_map → themes → labels → summaries.
    The first file that can be opened and parsed as JSON is formatted with
    ``_format_table``. A file that is missing, unreadable, or does not contain
    valid JSON (for example because it is still being written) is skipped and
    the next one in the list is tried instead of raising into the UI.

    Returns:
        The table produced by ``_format_table`` for the first usable
        checkpoint, or the placeholder from ``_empty_table`` when none is
        usable.
    """
    priority = [
        ("abstract_taxonomy_map.json", "taxonomy"),
        ("abstract_themes.json", "themes"),
        ("abstract_labels.json", "labels"),
        ("abstract_summaries.json", "summaries"),
    ]
    for filename, mode in priority:
        path = ckpt(filename)
        try:
            # Open directly instead of exists()+open() so a file removed in
            # between cannot crash the handler; JSON text is read as UTF-8.
            with open(path, encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError):
            # OSError: missing/unreadable file.
            # ValueError: json.JSONDecodeError or UnicodeDecodeError.
            continue
        return _format_table(data, mode)
    return _empty_table()


def _empty_table():
    """Return the placeholder review table.

    The table has the eight review columns and a single row whose text cells
    are blank, whose "Sentences" cell is 0 and whose "Approve" cell is "yes".
    """
    import pandas as pd

    header = [
        "#",
        "Topic Label",
        "Top Evidence",
        "Sentences",
        "Papers",
        "Approve",
        "Rename To",
        "Reasoning",
    ]
    placeholder_row = ["", "", "", 0, "", "yes", "", ""]
    return pd.DataFrame([placeholder_row], columns=header)


def _format_table(data, mode):
    """Turn loaded checkpoint records into the review-table DataFrame.

    Each element of ``data`` is converted with ``_format_row(element, mode)``.
    When that yields no rows, the placeholder from ``_empty_table`` is
    returned instead.
    """
    import pandas as pd

    header = [
        "#",
        "Topic Label",
        "Top Evidence",
        "Sentences",
        "Papers",
        "Approve",
        "Rename To",
        "Reasoning",
    ]
    table_rows = [_format_row(entry, mode) for entry in data]
    if len(table_rows) == 0:
        return _empty_table()
    return pd.DataFrame(table_rows, columns=header)


def _format_row(item, mode):
    """Convert one checkpoint record into a review-table row.

    Args:
        item: Mapping describing a single topic/theme, as loaded from a
            checkpoint JSON file.
        mode: Checkpoint kind. ``"taxonomy"`` builds the evidence cell from
            the ``pajais_match`` / ``match_confidence`` / ``reasoning``
            fields; any other mode uses the first entry of ``top_sentences``.

    Returns:
        list: ``[#, Topic Label, Top Evidence, Sentences, Papers, Approve,
        Rename To, Reasoning]``. Missing fields fall back to blank text,
        ``"yes"`` for Approve, and the label for Rename To.
    """
    idx = item.get("topic_id", item.get("name", ""))
    label = item.get("label", item.get("name", ""))
    # `or []` also covers an explicit null stored under "top_sentences".
    top_sentences = item.get("top_sentences") or []

    if mode == "taxonomy":
        # The confidence may be null or a numeric string in the JSON; fall
        # back to 0 (the same value used when the key is absent) rather than
        # letting the ":.2f" format raise.
        try:
            confidence = float(item.get("match_confidence", 0))
        except (TypeError, ValueError):
            confidence = 0.0
        evidence = (
            f"→ {item.get('pajais_match', 'NOVEL')} "
            f"| conf: {confidence:.2f} "
            f"| {item.get('reasoning', '')}"
        )
    else:
        evidence = top_sentences[0] if top_sentences else ""

    sentences_count = item.get("sentence_count", len(top_sentences))
    papers = item.get("paper_count", "")
    approve = item.get("approve", "yes")
    rename = item.get("rename_to", label)
    reasoning = item.get("reasoning", "")

    return [idx, label, evidence, sentences_count, papers, approve, rename, reasoning]


# ── Chart list ────────────────────────────────────────────────────────────────
def get_chart_choices():
    """List display labels for the chart files in the checkpoint directory.

    A label is the file's base name with underscores turned into spaces, the
    ".html" text removed, and title-casing applied. When no file matches
    ``*_chart_*.html`` the single entry "No charts yet" is returned.
    """
    labels = []
    for chart_path in glob.glob(ckpt("*_chart_*.html")):
        base_name = os.path.basename(chart_path)
        labels.append(base_name.replace("_", " ").replace(".html", "").title())
    if not labels:
        return ["No charts yet"]
    return labels


def load_chart_html(choice):
    """Return HTML that embeds the chart selected in the dropdown.

    Args:
        choice: Display label as produced by ``get_chart_choices``, or a
            falsy value / ``"No charts yet"`` when nothing is selectable.

    Returns:
        str: An ``<iframe>`` carrying the chart document in its ``srcdoc``
        attribute, or a short ``<p>`` message when there is no chart to show
        or the file cannot be read.
    """
    import html

    if not choice or choice == "No charts yet":
        return "<p style='color:#6b7280;padding:20px;'>Charts appear after Phase 2 analysis.</p>"

    def display_label(chart_path):
        # Must mirror how get_chart_choices derives a label from a file name.
        return os.path.basename(chart_path).replace("_", " ").replace(".html", "").title()

    # The label is lossy (title-casing discards the original letter case), so
    # find the real file whose label matches instead of rebuilding its name.
    # Fall back to the lower-cased reconstruction if nothing matches.
    matches = sorted(
        chart_path
        for chart_path in glob.glob(ckpt("*_chart_*.html"))
        if display_label(chart_path) == choice
    )
    path = matches[0] if matches else ckpt(choice.lower().replace(" ", "_") + ".html")

    try:
        with open(path, encoding="utf-8") as f:
            content = f.read()
    except OSError:
        return "<p style='color:#ef4444;'>Chart file not found.</p>"

    # Escape "&" as well as quotes: the browser decodes character references
    # in the attribute value, so leaving "&" raw would corrupt any entity that
    # is already present in the chart document.
    escaped = html.escape(content, quote=True)
    return f'<iframe srcdoc="{escaped}" width="100%" height="600px" frameborder="0"></iframe>'


# ── Download file list ─────────────────────────────────────────────────────────
def get_download_files():
    """Collect downloadable files from the checkpoint directory.

    Gathers every ``.csv``, ``.json``, ``.txt`` and ``.npy`` file, ordered by
    modification time with the most recent first. Returns ``None`` when
    nothing matches.
    """
    patterns = ("*.csv", "*.json", "*.txt", "*.npy")
    found = [
        path
        for pattern in patterns
        for path in glob.glob(ckpt(pattern))
    ]
    found.sort(key=os.path.getmtime, reverse=True)
    if found:
        return found
    return None


# ── Table-to-theme-map parser ──────────────────────────────────────────────────
def _cell_text(value):
    """Return a table cell as stripped text, treating None and NaN as blank."""
    if value is None:
        return ""
    if isinstance(value, float) and value != value:  # NaN is the only float unequal to itself
        return ""
    return str(value).strip()


def _coerce_topic_id(value):
    """Return a topic id as an int when it is integral (negatives included), else as text."""
    text = _cell_text(value)
    try:
        return int(text)
    except ValueError:
        pass
    try:
        number = float(text)
    except ValueError:
        return str(value)
    # "3.0" (a numeric column round-tripped through floats) is still topic 3.
    return int(number) if number.is_integer() else str(value)


def parse_table_to_message(table_data):
    """Convert review table edits into a structured message for the agent.

    Handles both a pandas DataFrame (from gr.Dataframe) and a list of lists.
    Columns are read by position: 0 = "#", 1 = "Topic Label", 5 = "Approve",
    6 = "Rename To", 7 = "Reasoning".

    Rows with fewer than six cells, and rows whose "#" and "Topic Label" are
    both blank (placeholder or padding rows), are ignored. Approved rows are
    grouped by their "Rename To" value, falling back to the topic label when
    that cell is blank.

    Args:
        table_data: DataFrame, list of row lists, or None.

    Returns:
        str: The message text. A short "Submit Review: ..." notice is returned
        when there is no table data at all.
    """
    import pandas as pd

    # Normalise to list of lists regardless of input type
    if table_data is None:
        return "Submit Review: No table data provided."
    if isinstance(table_data, pd.DataFrame):
        if table_data.empty:
            return "Submit Review: Table is empty, nothing to review."
        rows = table_data.values.tolist()
    else:
        rows = list(table_data) if table_data else []

    if not rows:
        return "Submit Review: No table data provided."

    approve_words = ("yes", "y", "1", "true")
    reject_words = ("no", "n", "0", "false")

    approved = []
    rejected = []
    for row in rows:
        if len(row) < 6:
            continue
        # The placeholder table pre-fills Approve with "yes"; without this
        # check it would be reported as an approved topic with a blank id.
        if not _cell_text(row[0]) and not _cell_text(row[1]):
            continue
        decision = _cell_text(row[5]).lower()
        if decision in approve_words:
            approved.append(row)
        elif decision in reject_words:
            rejected.append(row)

    theme_groups = {}
    reasoning_lines = []
    for row in approved:
        # A blank, None or NaN "Rename To" cell means "keep the current label".
        rename = _cell_text(row[6]) if len(row) > 6 and row[6] else ""
        theme_name = rename or str(row[1])
        theme_groups.setdefault(theme_name, []).append(_coerce_topic_id(row[0]))

        reasoning = _cell_text(row[7]) if len(row) > 7 else ""
        if reasoning:
            reasoning_lines.append(f"  - Topic {row[0]} ({row[1]}): {row[7]}")

    theme_map_str = json.dumps(theme_groups)

    msg = (
        f"Submit Review received.\n\n"
        f"Approved topics: {len(approved)}\n"
        f"Rejected topics: {len(rejected)}\n\n"
        f"Theme groupings (RENAME TO → [topic_ids]):\n{theme_map_str}\n\n"
        f"Researcher reasoning:\n"
        + ("\n".join(reasoning_lines) if reasoning_lines else "  (no reasoning provided)")
        + "\n\nPlease proceed to the next phase based on these decisions."
    )
    return msg


# ── Main Gradio App ────────────────────────────────────────────────────────────
def build_app():
    """Build the Gradio Blocks UI and wire up its event handlers.

    Layout, top to bottom: header banner, phase progress bar, ① data input
    (CSV upload), ② agent conversation (chatbot + text box), ③ results tabs
    (review table, charts, downloads). Every user action is forwarded to
    ``invoke_agent``; afterwards the phase bar and review table are rebuilt
    from the checkpoint files.

    Returns:
        The configured ``gr.Blocks`` instance (not yet launched).
    """
    with gr.Blocks(title="BERTopic Thematic Analysis Agent") as app:

        # ── Header ──────────────────────────────────────────────────────────
        gr.HTML("""
        <div style="text-align:center;padding:32px 0 16px;background:linear-gradient(180deg,#0f172a 0%,#0a0f1a 100%);">
            <div style="font-family:'IBM Plex Mono',monospace;font-size:11px;letter-spacing:0.3em;
                        color:#10b981;text-transform:uppercase;margin-bottom:8px;">
                Braun &amp; Clarke (2006) · BERTopic · PAJAIS Taxonomy
            </div>
            <h1 style="font-family:'IBM Plex Mono',monospace;font-size:28px;font-weight:700;
                       color:#f1f5f9;margin:0 0 8px;">
                Thematic Analysis Agent
            </h1>
            <p style="color:#475569;font-size:14px;margin:0;">
                Agentic AI · LangGraph · Mistral LLM · AgglomerativeClustering (cosine, 384d)
            </p>
        </div>
        """)

        # Phase progress bar (re-rendered by every handler that calls the agent)
        phase_bar = gr.HTML(value=build_phase_bar(), label="Phase Progress")

        # ── SECTION 1: Data Input ────────────────────────────────────────────
        gr.HTML('<div class="section-header">① DATA INPUT</div>')
        with gr.Row():
            csv_upload = gr.File(
                label="Upload Scopus CSV Export",
                file_types=[".csv"],
                scale=2,
            )
            with gr.Column(scale=1):
                gr.HTML("""
                <div style="background:#1e293b;border-radius:12px;padding:16px;font-size:13px;color:#94a3b8;">
                    <b style="color:#f1f5f9;">Required CSV Columns:</b><br>
                    Authors · Title · Abstract<br>
                    Author Keywords · Cited by<br>
                    Source title · Year
                </div>
                """)

        # ── SECTION 2: Agent Conversation ───────────────────────────────────
        gr.HTML('<div class="section-header">② AGENT CONVERSATION</div>')
        chatbot = gr.Chatbot(
            label="Thematic Analysis Agent",
            height=500,
            avatar_images=(None, "https://www.anthropic.com/favicon.ico"),
        )
        with gr.Row():
            user_input = gr.Textbox(
                placeholder="Type 'run abstract', 'run title', or any instruction...",
                label="",
                scale=5,
                lines=1,
                container=False,
            )
            send_btn = gr.Button("Send ▶", variant="primary", scale=1)

        # ── SECTION 3: Results ───────────────────────────────────────────────
        gr.HTML('<div class="section-header">③ RESULTS</div>')
        with gr.Tabs():

            # Tab 1: Review Table
            with gr.TabItem("📋 Review Table"):
                gr.HTML("""
                <p style="color:#94a3b8;font-size:13px;margin-bottom:8px;">
                    Edit <b>Approve</b> (yes/no), <b>Rename To</b>, and <b>Reasoning</b> columns.
                    Then click <b>Submit Review</b> to send decisions to the agent.
                </p>
                """)
                # Column order here must stay in sync with the positional
                # indexes used by parse_table_to_message.
                review_table = gr.Dataframe(
                    headers=["#", "Topic Label", "Top Evidence", "Sentences", "Papers", "Approve", "Rename To", "Reasoning"],
                    datatype=["str", "str", "str", "number", "str", "str", "str", "str"],
                    row_count=10,
                    column_count=8,
                    interactive=True,
                    wrap=True,
                    label="",
                )
                submit_review_btn = gr.Button("📤 Submit Review →", variant="primary")

            # Tab 2: Charts
            with gr.TabItem("📊 Charts"):
                # Choices are computed once at build time; the refresh button
                # below re-reads them from the checkpoint directory.
                chart_dropdown = gr.Dropdown(
                    choices=get_chart_choices(),
                    label="Select Chart",
                    interactive=True,
                )
                refresh_charts_btn = gr.Button("🔄 Refresh Chart List", variant="secondary", size="sm")
                chart_display = gr.HTML(
                    value="<p style='color:#6b7280;padding:20px;'>Charts appear after Phase 2 BERTopic analysis.</p>"
                )

            # Tab 3: Downloads
            with gr.TabItem("📥 Download Files"):
                gr.HTML("""
                <p style="color:#94a3b8;font-size:13px;margin-bottom:8px;">
                    All checkpoint files are listed below. Download for your conference paper.
                </p>
                """)
                download_files = gr.File(
                    label="Output Files",
                    file_count="multiple",
                    interactive=False,
                )
                refresh_downloads_btn = gr.Button("🔄 Refresh Files", variant="secondary", size="sm")

        # ── State ─────────────────────────────────────────────────────────────
        # Thread id handed to invoke_agent with every message.
        # NOTE(review): the initial value is the same literal "default" for
        # every session — confirm whether the agent is meant to keep separate
        # conversations per user.
        thread_state = gr.State("default")

        # ── Event: CSV Upload ─────────────────────────────────────────────────
        def on_csv_upload(file, history, thread_id):
            """Forward an uploaded CSV to the agent and refresh the UI.

            Appends a user turn naming the file and the agent's reply to the
            chat history. Returns (history, phase bar HTML, review table).
            With no file, the history is returned unchanged.
            """
            if file is None:
                return history, build_phase_bar(), load_review_table()
            # In Gradio 6, uploaded file is a filepath string
            filepath = file if isinstance(file, str) else file.name
            history = history or []
            history.append({"role": "user", "content": f"CSV uploaded: {os.path.basename(filepath)}"})
            # The upload is expressed as a plain-text instruction to the agent.
            response = invoke_agent(f"load_scopus_csv filepath={filepath}", thread_id)
            history.append({"role": "assistant", "content": response})
            return history, build_phase_bar(), load_review_table()

        csv_upload.upload(
            on_csv_upload,
            inputs=[csv_upload, chatbot, thread_state],
            outputs=[chatbot, phase_bar, review_table],
        )

        # ── Event: Send message ───────────────────────────────────────────────
        def on_send(message, history, thread_id):
            """Send the typed message to the agent and refresh the UI.

            Returns (history, "", phase bar HTML, review table); the empty
            string is written back to the text box to clear it. A
            whitespace-only message is not sent to the agent.
            """
            # NOTE(review): assumes message is a str; None would raise here.
            if not message.strip():
                return history, "", build_phase_bar(), load_review_table()
            history = history or []
            history.append({"role": "user", "content": message})
            response = invoke_agent(message, thread_id)
            history.append({"role": "assistant", "content": response})
            return history, "", build_phase_bar(), load_review_table()

        # Button click and pressing Enter in the text box share one handler.
        send_btn.click(
            on_send,
            inputs=[user_input, chatbot, thread_state],
            outputs=[chatbot, user_input, phase_bar, review_table],
        )
        user_input.submit(
            on_send,
            inputs=[user_input, chatbot, thread_state],
            outputs=[chatbot, user_input, phase_bar, review_table],
        )

        # ── Event: Submit Review ──────────────────────────────────────────────
        def on_submit_review(table_data, history, thread_id):
            """Send the review-table decisions to the agent and refresh the UI.

            The table is serialised with parse_table_to_message; the chat
            shows a short fixed user turn rather than the full message.
            Returns (history, phase bar HTML, review table).
            """
            msg = parse_table_to_message(table_data)
            history = history or []
            history.append({"role": "user", "content": "📤 Submit Review (table decisions sent to agent)"})
            response = invoke_agent(msg, thread_id)
            history.append({"role": "assistant", "content": response})
            return history, build_phase_bar(), load_review_table()

        submit_review_btn.click(
            on_submit_review,
            inputs=[review_table, chatbot, thread_state],
            outputs=[chatbot, phase_bar, review_table],
        )

        # ── Event: Chart selection ────────────────────────────────────────────
        chart_dropdown.change(
            load_chart_html,
            inputs=[chart_dropdown],
            outputs=[chart_display],
        )

        def refresh_charts():
            """Re-read the chart list and select its first entry."""
            choices = get_chart_choices()
            return gr.update(choices=choices, value=choices[0] if choices else None)

        refresh_charts_btn.click(
            refresh_charts,
            outputs=[chart_dropdown],
        )

        # ── Event: Download refresh ───────────────────────────────────────────
        def refresh_downloads():
            """Re-read the downloadable files (None when there are none)."""
            files = get_download_files()
            return gr.update(value=files)

        refresh_downloads_btn.click(
            refresh_downloads,
            outputs=[download_files],
        )

        # ── Initial load ──────────────────────────────────────────────────────
        # Populate phase bar, review table and download list from whatever
        # checkpoints already exist when the page is opened.
        app.load(
            lambda: (build_phase_bar(), load_review_table(), get_download_files()),
            outputs=[phase_bar, review_table, download_files],
        )

    return app


# ── Launch ─────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on all interfaces at
    # port 7860 with the custom theme and page-level CSS below.
    demo = build_app()

    app_theme = gr.themes.Base(
        primary_hue="emerald",
        secondary_hue="slate",
        neutral_hue="slate",
        font=[gr.themes.GoogleFont("IBM Plex Mono"), "monospace"],
    )

    # Styles for the page background, container width, the "section-header"
    # dividers emitted by build_app, and hiding the footer.
    page_css = """
        body { background: #0a0f1a !important; }
        .gradio-container { max-width: 1400px !important; background: #0a0f1a !important; }
        .section-header {
            font-size: 13px;
            font-weight: 700;
            color: #64748b;
            letter-spacing: 0.12em;
            text-transform: uppercase;
            margin-bottom: 12px;
            padding-bottom: 8px;
            border-bottom: 1px solid #1e293b;
        }
        footer { display: none !important; }
        """

    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "ssr_mode": False,
        "share": False,
        "theme": app_theme,
        "css": page_css,
    }
    demo.launch(**launch_options)