"""
app.py — Gradio Blocks UI for the BERTopic Thematic Analysis Agent.
Sections: (1) Data Input, (2) Agent Conversation, (3) Results
"""
from __future__ import annotations
import json
import uuid
from pathlib import Path
import os
import gradio as gr
import pandas as pd
import plotly.io as pio
from agent import agent
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Generated once at import time; every call made with AGENT_CONFIG reuses it.
# NOTE(review): presumably this means all browser sessions share one agent
# thread for the lifetime of the process — confirm that is intended.
THREAD_ID = str(uuid.uuid4())

# Passed as ``config=`` to ``agent.invoke`` in ``_call_agent``.
AGENT_CONFIG = {
    "configurable": {"thread_id": THREAD_ID},
    "recursion_limit": 100,
}

# Column headers of the review table. ``_refresh_review_table`` builds rows
# with exactly these keys and ``_submit_review`` reads "#", "Approve",
# "Rename To" and "Topic Label" by name.
REVIEW_COLUMNS = [
    "#",
    "Topic Label",
    "Top Evidence",
    "Sentences",
    "Papers",
    "Approve",
    "Rename To",
    "Reasoning",
]

# (code, name) pairs rendered by ``_phase_bar_html``. The list position is the
# step index that ``_detect_phase`` maps phase mentions onto.
PHASE_LABELS = [
    ("Phase 1", "Familiarisation"),
    ("Phase 2", "Initial Codes"),
    ("Phase 3", "Themes"),
    ("Phase 4", "Saturation"),
    ("Phase 5", "Naming"),
    ("Phase 5.5", "PAJAIS"),
    ("Phase 6", "Report"),
]

# Labels offered in the chart dropdown.
CHART_OPTIONS = [
    "Bar — Top 20 Topics",
    "Treemap — Topic Distribution",
    "Scatter — Cluster PCA",
    "Heatmap — Topic Similarity",
]

# Keys looked up in charts.json; paired with CHART_OPTIONS by position
# (see the ``zip`` in ``_get_chart_plot``), so both lists must stay aligned.
_CHART_KEYS = ["bar_top20", "treemap", "scatter_pca", "heatmap"]
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _phase_bar_html(active_index: int) -> str:
    """Render the phase progress stepper as an HTML string.

    Steps before ``active_index`` are styled as done (green), the step at
    ``active_index`` as active (indigo) and later steps as pending (grey).
    Any index below zero (e.g. ``-1``) therefore renders every step as
    pending.

    NOTE(review): the original tags and inline styles were lost from this
    file (the string literals were left empty/unterminated and the returned
    wrapper never included the steps). The markup below is a reconstruction
    around the surviving colours, step numbers and labels — adjust the
    styling as needed.

    Args:
        active_index: Position in ``PHASE_LABELS`` of the current phase.

    Returns:
        A single ``<div>`` containing one badge per phase, with a connector
        line between consecutive phases.
    """
    fragments = []
    last_index = len(PHASE_LABELS) - 1
    for i, (code, name) in enumerate(PHASE_LABELS):
        if i < active_index:
            state, bg, fg = "done", "#10b981", "#ffffff"
        elif i == active_index:
            state, bg, fg = "active", "#6366f1", "#ffffff"
        else:
            state, bg, fg = "pending", "#e5e7eb", "#6b7280"
        fragments.append(
            f'<div class="phase-step phase-{state}" '
            f'style="display:flex;flex-direction:column;align-items:center;min-width:72px;">'
            f'<div style="width:32px;height:32px;border-radius:50%;'
            f'background:{bg};color:{fg};display:flex;align-items:center;'
            f'justify-content:center;font-weight:700;font-size:13px;">'
            f'{i+1}'
            f'</div>'
            f'<div style="margin-top:4px;text-align:center;font-size:11px;'
            f'line-height:1.3;color:#374151;">'
            f'<strong>{code}</strong><br>{name}'
            f'</div>'
            f'</div>'
        )
        if i < last_index:
            # A connector is green only once the step on its left is done.
            line_bg = "#10b981" if i < active_index else "#e5e7eb"
            fragments.append(
                f'<div style="flex:1;height:2px;background:{line_bg};'
                f'margin:0 4px 28px 4px;"></div>'
            )
    return (
        '<div style="display:flex;align-items:center;'
        'justify-content:space-between;padding:12px 8px;">'
        + "".join(fragments)
        + '</div>'
    )
def _empty_review_df() -> pd.DataFrame:
    """Return a review table that has the REVIEW_COLUMNS headers and no rows."""
    blank_table = pd.DataFrame(columns=REVIEW_COLUMNS)
    return blank_table
def _load_charts() -> dict:
p = Path("charts.json")
return json.loads(p.read_text()) if p.exists() else {}
def _call_agent(message: str, history: list):
    """Send one user message to the agent and append the exchange to history.

    Args:
        message: Text sent as a single ``user`` message.
        history: Existing chat history as a list of role/content dicts; it is
            not modified in place.

    Returns:
        ``(updated_history, "")`` — the history extended with the user turn
        and the agent's last message, plus an empty string.
    """
    request = {"messages": [{"role": "user", "content": message}]}
    response = agent.invoke(request, config=AGENT_CONFIG)
    # Only the final message of the returned conversation is shown.
    reply_text = response["messages"][-1].content
    new_turns = [
        {"role": "user", "content": message},
        {"role": "assistant", "content": reply_text},
    ]
    return history + new_turns, ""
def _cell_text(value) -> str:
    """Return a table cell as stripped text, treating None/NaN/NA as empty."""
    if value is None:
        return ""
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value).strip()


def _coerce_topic_id(raw) -> int | None:
    """Return ``raw`` as an integer topic id, or None if it is not integral.

    Accepts ints, integral floats such as ``3.0``, numeric strings and
    negative ids such as ``-1``.
    """
    try:
        number = float(raw)
    except (TypeError, ValueError):
        return None
    # NaN fails the self-equality test; inf/NaN are not integral either.
    if number != number or not number.is_integer():
        return None
    return int(number)


def _submit_review(
    review_df: pd.DataFrame,
    history: list,
) -> tuple[list, str, pd.DataFrame]:
    """Read table edits, serialise the approved groups to JSON, send to agent.

    Rows whose "Approve" cell is "yes" (case-insensitive, surrounding
    whitespace ignored) are grouped by theme name. The theme name is the
    "Rename To" cell if non-blank, otherwise "Topic Label", otherwise
    ``Theme_<id>``. Rows whose "#" cell cannot be read as an integer are
    left out of the groups instead of being filed under topic 0.

    Args:
        review_df: Current contents of the review table.
        history: Chat history forwarded to ``_call_agent``.

    Returns:
        ``(updated_history, "", review_df)`` — the table is returned
        unchanged.
    """
    if review_df.empty:
        approved = review_df
    else:
        approve_flags = review_df["Approve"].map(_cell_text).str.lower()
        approved = review_df[approve_flags == "yes"]

    groups: dict[str, list[int]] = {}
    for _, row in approved.iterrows():
        topic_id = _coerce_topic_id(row["#"])
        if topic_id is None:
            continue
        theme_name = (
            _cell_text(row.get("Rename To"))
            or _cell_text(row.get("Topic Label"))
            or f"Theme_{topic_id}"
        )
        groups.setdefault(theme_name, []).append(topic_id)

    groups_list = [
        {"theme_name": theme, "topic_ids": ids}
        for theme, ids in groups.items()
    ]
    summary = (
        f"Review submitted. Approved topics: {len(approved)}.\n"
        f"Groups formed: {len(groups_list)}.\n\n"
        f"{json.dumps(groups_list, indent=2)}\n\n"
        f"Please consolidate these groups into themes."
    )
    updated_history, _ = _call_agent(summary, history)
    return updated_history, "", review_df
def _upload_csv(file_obj):
if file_obj is None:
return "", "No file uploaded."
# 🔥 CLEAR OLD FILES
files_to_clear = [
"labelled_topics.json",
"summaries.json",
"taxonomy_mapping.json",
"comparison.csv",
"report.txt"
]
list(map(lambda f: os.remove(f) if os.path.exists(f) else None, files_to_clear))
path = file_obj.name
return path, f"✅ File ready: `{path}`"
def _start_analysis(csv_path: str, history: list) -> tuple[list, str, str]:
if not csv_path:
return history, "", "⚠️ Please upload a CSV first."
msg = (
f"I have uploaded a Scopus CSV at: {csv_path}\n"
f"Please begin Phase 1 — Familiarisation. Load the CSV, report statistics, "
f"and STOP after Phase 1."
)
updated_history, _ = _call_agent(msg, history)
phase_html = _phase_bar_html(0)
return updated_history, "", phase_html
def _send_message(user_msg: str, history: list, phase_html: str) -> tuple[list, str, str]:
if not user_msg.strip():
return history, "", phase_html
updated_history, _ = _call_agent(user_msg, history)
last_ai = updated_history[-1]["content"] if updated_history else ""
new_phase = _detect_phase(last_ai, phase_html)
return updated_history, "", new_phase
def _detect_phase(ai_text: str, current_html: str) -> str:
phase_map = {
"phase 1": 0, "phase 2": 1, "phase 3": 2,
"phase 4": 3, "phase 5.5": 5, "phase 5": 4, "phase 6": 6,
}
lower = ai_text.lower()
detected = current_html
for key, idx in sorted(phase_map.items(), key=lambda x: -len(x[0])):
if f"{key} complete" in lower or f"beginning {key}" in lower or f"starting {key}" in lower:
detected = _phase_bar_html(idx)
break
return detected
def _get_chart_plot(chart_name: str):
    """Return the figure stored in charts.json for a dropdown label.

    Returns ``None`` when the label is unknown, when no payload is stored for
    it, or when the stored payload starts with ``<`` after leading whitespace
    is stripped. Otherwise the payload is parsed with ``pio.from_json``.
    """
    stored = _load_charts()
    label_to_key = {label: key for label, key in zip(CHART_OPTIONS, _CHART_KEYS)}
    payload = stored.get(label_to_key.get(chart_name, ""), "")
    if not payload:
        return None
    if str(payload).lstrip().startswith("<"):
        return None
    return pio.from_json(payload)
def _get_download_files() -> list[str]:
candidates = [
"comparison_abstract_vs_title.csv",
"narrative.md",
"topics.json",
"labelled_topics.json",
"themes.json",
"taxonomy_mapping.json",
"summaries.json",
]
return list(filter(lambda p: Path(p).exists(), candidates))
def _refresh_review_table() -> pd.DataFrame:
    """Build the review table from ``labelled_topics.json``.

    At most the first 100 topics are shown. Every row starts with "Approve"
    set to "Yes" and blank "Papers" / "Rename To" cells.

    Returns:
        A DataFrame with the REVIEW_COLUMNS headers. It has no rows when the
        file is missing or lists no topics, so the table keeps its headers
        in both cases.
    """
    p = Path("labelled_topics.json")
    if not p.exists():
        return _empty_review_df()
    topics = json.loads(p.read_text())
    rows = [
        {
            "#": t["topic_id"],
            "Topic Label": t.get("label", f"Topic {t['topic_id']}"),
            # ``or []`` covers an explicit null; str() covers non-string items.
            "Top Evidence": " | ".join(
                str(sentence) for sentence in (t.get("top_sentences") or [])[:2]
            ),
            "Sentences": t.get("sentence_count", 0),
            "Papers": "",
            "Approve": "Yes",
            "Rename To": "",
            "Reasoning": t.get("reasoning", ""),
        }
        for t in topics[:100]
    ]
    # Passing the columns explicitly keeps the headers when ``rows`` is empty.
    return pd.DataFrame(rows, columns=REVIEW_COLUMNS)
def _refresh_downloads() -> list[str] | None:
    """Return the existing output files, or ``None`` when there are none.

    ``None`` rather than an empty list is returned so the caller receives a
    cleared value when nothing is available.
    """
    return _get_download_files() or None
# ---------------------------------------------------------------------------
# Build UI
# ---------------------------------------------------------------------------
with gr.Blocks(
    title="BERTopic Thematic Analysis Agent",
) as demo:
    # NOTE(review): indentation and the header markup were lost from this
    # file. The container nesting and the header tags below are a
    # reconstruction; component arguments and event wiring are unchanged.

    # ---- State ----
    # Path of the uploaded CSV: written by _upload_csv, read by _start_analysis.
    csv_path_state = gr.State("")

    # ---- Header ----
    gr.HTML(
        '<div style="text-align:center;padding:12px 0 4px 0;">'
        '<h1 style="margin:0;font-size:1.6rem;">'
        '📚 BERTopic Thematic Analysis Agent'
        '</h1>'
        '<p style="margin:4px 0 0 0;color:#6b7280;font-size:0.95rem;">'
        'Braun &amp; Clarke (2006) · Six-Phase Pipeline · PAJAIS Taxonomy'
        '</p>'
        '</div>'
    )

    # ---- Phase Progress Bar ----
    # Index -1 matches no step, so every phase starts out as pending.
    phase_bar = gr.HTML(value=_phase_bar_html(-1), label="Phase Progress")

    # ════════════════════════════════════════════════════════
    # SECTION 1 — Data Input
    # ════════════════════════════════════════════════════════
    with gr.Group():
        gr.Markdown("## 1 · Data Input")
        with gr.Row():
            with gr.Column(scale=2):
                file_upload = gr.File(
                    label="Upload Scopus CSV",
                    file_types=[".csv"],
                    type="filepath",
                )
                file_status = gr.Markdown("_No file uploaded._")
            with gr.Column(scale=1):
                # NOTE(review): run_config is not an input of any handler
                # wired below, so the selection has no effect in this file —
                # confirm whether it should be forwarded to _start_analysis.
                run_config = gr.Radio(
                    choices=["abstract", "title"],
                    value="abstract",
                    label="Run Config (field to cluster)",
                )
                start_btn = gr.Button("▶ Start Analysis", variant="primary", size="lg")

    # ════════════════════════════════════════════════════════
    # SECTION 2 — Agent Conversation
    # ════════════════════════════════════════════════════════
    with gr.Group():
        gr.Markdown("## 2 · Agent Conversation")
        chatbot = gr.Chatbot(
            label="Thematic Analysis Agent"
        )
        with gr.Row():
            chat_input = gr.Textbox(
                placeholder="Type a message or instruction… (e.g. 'proceed to Phase 2')",
                label="",
                scale=5,
                show_label=False,
                lines=1,
            )
            send_btn = gr.Button("Send", variant="primary", scale=1)

    # ════════════════════════════════════════════════════════
    # SECTION 3 — Results
    # ════════════════════════════════════════════════════════
    with gr.Group():
        gr.Markdown("## 3 · Results")
        with gr.Tabs():
            # --- Tab 1: Review Table ---
            with gr.TabItem("📋 Review Table"):
                with gr.Row():
                    refresh_table_btn = gr.Button("🔄 Refresh Table", size="sm")
                review_table = gr.Dataframe(
                    value=_empty_review_df(),
                    headers=REVIEW_COLUMNS,
                    # One datatype per REVIEW_COLUMNS entry, in the same order.
                    datatype=[
                        "number", "str", "str", "number",
                        "str", "str", "str", "str",
                    ],
                    column_count=(8, "fixed"),
                    interactive=True,
                    wrap=True,
                    label="Topic Review Table (edit Approve / Rename To / Reasoning)"
                )
                submit_review_btn = gr.Button(
                    "✅ Submit Review", variant="primary", size="lg"
                )
            # --- Tab 2: Charts ---
            with gr.TabItem("📊 Charts"):
                chart_dropdown = gr.Dropdown(
                    choices=CHART_OPTIONS,
                    value=CHART_OPTIONS[0],
                    label="Select Chart",
                    interactive=True,
                )
                chart_display = gr.Plot(label="Chart")
            # --- Tab 3: Download ---
            with gr.TabItem("⬇ Download"):
                refresh_dl_btn = gr.Button("🔄 Refresh Files", size="sm")
                download_files = gr.File(
                    label="Download Analysis Outputs",
                    file_count="multiple",
                    interactive=False,
                    value=None,
                )

    # ════════════════════════════════════════════════════════
    # Event wiring
    # ════════════════════════════════════════════════════════
    # Upload CSV → store path
    file_upload.change(
        fn=_upload_csv,
        inputs=[file_upload],
        outputs=[csv_path_state, file_status],
    )
    # Start analysis button
    start_btn.click(
        fn=_start_analysis,
        inputs=[csv_path_state, chatbot],
        outputs=[chatbot, chat_input, phase_bar],
    )
    # Send message (button)
    send_btn.click(
        fn=_send_message,
        inputs=[chat_input, chatbot, phase_bar],
        outputs=[chatbot, chat_input, phase_bar],
    )
    # Send message (Enter key)
    chat_input.submit(
        fn=_send_message,
        inputs=[chat_input, chatbot, phase_bar],
        outputs=[chatbot, chat_input, phase_bar],
    )
    # Submit review table
    submit_review_btn.click(
        fn=_submit_review,
        inputs=[review_table, chatbot],
        outputs=[chatbot, chat_input, review_table],
    )
    # Refresh review table
    refresh_table_btn.click(
        fn=_refresh_review_table,
        inputs=[],
        outputs=[review_table],
    )
    # Chart dropdown
    chart_dropdown.change(
        fn=_get_chart_plot,
        inputs=[chart_dropdown],
        outputs=[chart_display],
    )
    # Refresh downloads
    refresh_dl_btn.click(
        fn=_refresh_downloads,
        inputs=[],
        outputs=[download_files],
    )
# ---------------------------------------------------------------------------
# Launch
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Start the app only when this file is executed as a script.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
        "theme": gr.themes.Soft(primary_hue="indigo"),
    }
    demo.launch(**launch_options)