topic_modelling / app.py
Dash10107's picture
Update app.py
b6a5e1c verified
"""
app.py — Topic Modelling Agentic AI | Gradio UI
═══════════════════════════════════════════════════
Version: 3.1.0 | April 2026
Stack: Gradio 5.x + LangGraph + Mistral + BERTopic
Deploy: HuggingFace Spaces (sdk: gradio)
Rules: Zero gr.HTML(). All UI via native Gradio components.
See GRADIO_UI_GUIDELINES_v2.docx for full standards.
ARCHITECTURE — 20 Blocks in 5 Sections
─────────────────────────────────────────
Section 1: Setup (B1–B3) Imports, agent, theme
Section 2: Helpers (B4–B10) Pure Python functions, no UI
Section 3: UI Layout (B11–B17) gr.Blocks with native components
Section 4: Event Wiring (B18–B19) Connect UI to functions
Section 5: Launch (B20) Start server
BLOCK COMMUNICATION MAP
─────────────────────────
B6 (respond) ←→ B2 (agent) : invokes agent for chat
B6 (respond) → B4 (output) : scans for download files
B7 (chart) → B17a (display) : loads Plotly JSON → gr.Plot
B8 (table) → B16 (review) : builds rows → gr.Dataframe
B9 (papers) ← B16 (review) : triggered by row click
B10 (submit) → B2 (agent) : sends review edits to agent
B18 (wiring) → B5,B7,B8 : refreshes progress, charts, table
"""
import os
import glob
import json
import plotly.io as pio
import gradio as gr
from langchain_mistralai import ChatMistralAI
from langgraph.prebuilt import create_react_agent
from langgraph.checkpoint.memory import MemorySaver
from agent import SYSTEM_PROMPT, get_local_tools
print(">>> app.py: imports complete")

# Chat model that drives the agent. temperature=0 asks for the least random
# output; the timeout value is passed straight to ChatMistralAI.
llm = ChatMistralAI(
    model="mistral-small-latest",
    temperature=0,
    timeout=300,
)

# Tool list comes from the project-local `agent` module.
tools = get_local_tools()

# ReAct-style agent; MemorySaver keeps conversation state in process memory,
# keyed by the thread_id supplied at invoke time.
agent = create_react_agent(
    model=llm,
    tools=tools,
    prompt=SYSTEM_PROMPT,
    checkpointer=MemorySaver(),
)
print(f">>> app.py: agent ready ({len(tools)} tools)")

# Module-level mutable state. Both objects live for the whole process, so
# they are shared by every browser session talking to this server.
_msg_count = 0  # Global message counter (shared across users)
_uploaded = {"path": ""}  # Last uploaded CSV path (shared session)
# ── end B2: Agent setup ────────────────────────────────────────
# ── B3: Theme ───────────────────────────────────────────────────
# PURPOSE: Define the visual identity of the entire application.
# Uses teal/indigo on zinc — purposeful scientific feel.
# Plus Jakarta Sans: geometric-humanist, modern but not generic.
# Fira Code for monospace elements (phase progress, etc).
# USED BY: B20 (demo.launch) — theme applied at launch time.
# ────────────────────────────────────────────────────────────────
# Step 1: base theme — palette, fonts and global sizing.
theme = gr.themes.Default(
    primary_hue="teal",
    secondary_hue="indigo",
    neutral_hue="zinc",
    font=gr.themes.GoogleFont("Plus Jakarta Sans"),
    font_mono=gr.themes.GoogleFont("Fira Code"),
    radius_size="sm",
    spacing_size="md",
)

# Step 2: per-token overrides layered on top of the base theme.
# Values starting with "*" refer to other theme variables.
theme = theme.set(
    button_primary_background_fill="*primary_600",
    button_primary_background_fill_hover="*primary_500",
    button_primary_text_color="white",
    block_label_text_size="sm",
    block_title_text_weight="600",
)
# ── end B3: Theme ──────────────────────────────────────────────
def _latest_output():
"""Scan /tmp for ALL rq4_* files, sorted by phase order.
Returns list of filepaths for gr.File download component."""
phase_order = {
"summaries": 1, "labels": 2, "themes": 3, "taxonomy": 4,
"emb": 0, "intertopic": 5, "bars": 6, "hierarchy": 7,
"heatmap": 8, "comparison": 9, "narrative": 10,
}
files = (
glob.glob("/tmp/rq4_*.csv")
+ glob.glob("/tmp/rq4_*.json")
+ glob.glob("/tmp/checkpoints/rq4_*.json")
)
scored = list(map(
lambda f: (sum(v * (k in f) for k, v in phase_order.items()), f),
files,
))
scored.sort(key=lambda x: x[0])
return list(map(lambda x: x[1], scored)) or None
# ── end B4: _latest_output ─────────────────────────────────────
def _build_progress():
"""Return emoji progress pipeline. NO HTML β€” just text + emoji.
Displayed in gr.Markdown component (B14)."""
checks = [
("Load", bool(glob.glob("/tmp/checkpoints/rq4_*_summaries.json")
or glob.glob("/tmp/checkpoints/rq4_*_emb.npy"))),
("Codes", bool(glob.glob("/tmp/checkpoints/rq4_*_labels.json"))),
("Themes", bool(glob.glob("/tmp/checkpoints/rq4_*_themes.json"))),
("Review", bool(glob.glob("/tmp/checkpoints/rq4_*_themes.json"))),
("Names", bool(glob.glob("/tmp/checkpoints/rq4_*_themes.json"))),
("PAJAIS", bool(glob.glob("/tmp/checkpoints/rq4_*_taxonomy_map.json"))),
("Report", bool(glob.glob("/tmp/rq4_comparison.csv")
or glob.glob("/tmp/rq4_narrative.txt"))),
]
return " β†’ ".join(f"{'βœ…' if done else '⬜'} {name}" for name, done in checks)
# ── end B5: _build_progress ────────────────────────────────────
def respond(message, chat_history, uploaded_file):
    """Handle one chat turn with the LangGraph agent.

    This is a generator that yields exactly twice so the UI can update
    while the agent is still running:

    1. immediately, with the user's message plus a "working" bubble;
    2. after ``agent.invoke`` returns, with the bubble replaced by the
       agent's final message.

    Args:
        message: Text typed by the user. Empty/``None`` falls back to the
            default prompt ``"Analyze my Scopus CSV"``.
        chat_history: Current chatbot value — a list of
            ``{"role", "content"}`` dicts (``None`` is treated as empty).
        uploaded_file: Value of the upload component; stored as the CSV
            path when truthy, otherwise the previously stored path is kept.

    Yields:
        ``(chat_history, "", files)`` where ``""`` clears the input box and
        ``files`` is the result of ``_latest_output()``.
    """
    global _msg_count
    _msg_count += 1

    # Remember the newest upload; keep the previous one if nothing new came in.
    if uploaded_file:
        _uploaded["path"] = uploaded_file
    csv_path = _uploaded.setdefault("path", "")

    # Tell agent where the CSV is (prevents hallucinated filepaths)
    if csv_path:
        file_note = f"\n[CSV file at: {csv_path}]"
    else:
        file_note = "\n[No CSV uploaded yet — ask user to upload a file first]"

    # Tell agent what phase we're in based on existing checkpoint files
    if glob.glob("/tmp/checkpoints/rq4_*_labels.json"):
        phase_context = "\n[Phase context: labels exist]"
    elif glob.glob("/tmp/checkpoints/rq4_*_emb.npy"):
        phase_context = "\n[Phase context: embeddings exist]"
    else:
        phase_context = "\n[Phase context: fresh start]"

    # The chat shows the prompt that is actually sent, so an empty
    # submission no longer leaves a blank user bubble in the history.
    user_text = (message or "").strip() or "Analyze my Scopus CSV"
    text = user_text + file_note + phase_context
    print(f"\n{'='*60}\n>>> MSG #{_msg_count}: '{text[:120]}'\n{'='*60}")

    # YIELD 1: Show "thinking" bubble immediately
    chat_history = (chat_history or []) + [
        {"role": "user", "content": user_text},
        {"role": "assistant", "content": "🔬 **Working...** _Agent is thinking..._"},
    ]
    yield chat_history, "", _latest_output()

    # Invoke agent — the fixed thread_id means every call shares one
    # conversation memory inside the checkpointer.
    result = agent.invoke(
        {"messages": [("human", text)]},
        config={"configurable": {"thread_id": "session"}},
    )
    response = result["messages"][-1].content
    print(f">>> Response ({len(response)} chars)")

    # YIELD 2: Replace thinking bubble with actual response
    chat_history[-1] = {"role": "assistant", "content": response}
    gr.Info(f"Agent responded ({len(response)} chars)")
    yield chat_history, "", _latest_output()
# ── end B6: respond ────────────────────────────────────────────
def _load_chart(chart_name):
"""Load Plotly chart from JSON file. Returns figure for gr.Plot.
No HTML, no iframe β€” just a native Plotly figure object."""
path = f"/tmp/{chart_name}"
(not os.path.exists(path)) and (not None) # guard
return pio.from_json(open(path).read()) * bool(os.path.exists(path)) or None
def _get_chart_choices():
"""Find all rq4_*.json chart files in /tmp."""
files = sorted(glob.glob("/tmp/rq4_*.json"))
return list(map(os.path.basename, files))
# ── end B7: _load_chart ───────────────────────────────────────
def _load_review_table():
"""Build review table from latest checkpoint JSON.
Approve column is bool (renders as checkbox in gr.Dataframe).
Priority: taxonomy_map > themes > labels > summaries."""
taxonomy_files = sorted(glob.glob("/tmp/checkpoints/rq4_*_taxonomy_map.json"))
theme_files = sorted(glob.glob("/tmp/checkpoints/rq4_*_themes.json"))
label_files = sorted(glob.glob("/tmp/checkpoints/rq4_*_labels.json"))
summary_files = sorted(glob.glob("/tmp/checkpoints/rq4_*_summaries.json"))
# Pick most advanced checkpoint available
path = (
(taxonomy_files and taxonomy_files[-1])
or (theme_files and theme_files[-1])
or (label_files and label_files[-1])
or (summary_files and summary_files[-1])
or ""
)
is_taxonomy = bool(taxonomy_files and taxonomy_files[-1] == path)
data = (os.path.exists(path) and json.load(open(path))) or []
# For taxonomy: merge with themes to get sentence/paper counts
theme_lookup = {}
(is_taxonomy and theme_files) and theme_lookup.update(
{t.get("label", ""): t for t in json.load(open(theme_files[-1]))}
)
rows = list(map(
lambda pair: [
pair[0], # #
pair[1].get("label", pair[1].get("top_words", ""))[:60], # Label
# Evidence: PAJAIS mapping for taxonomy, nearest sentence otherwise
(
is_taxonomy
and f"β†’ {pair[1].get('pajais_match', '?')} | {pair[1].get('reasoning', '')}"[:120]
) or (
(pair[1].get("nearest", [{}])[0].get("sentence", "")[:120] + "...")
* bool(pair[1].get("nearest"))
),
# Sentence/paper counts
theme_lookup.get(pair[1].get("label", ""), pair[1]).get(
"sentence_count", pair[1].get("sentence_count", 0)),
theme_lookup.get(pair[1].get("label", ""), pair[1]).get(
"paper_count", pair[1].get("paper_count", 0)),
True, # Approve (bool β†’ checkbox)
"", # Rename To
"", # Reasoning
],
enumerate(data),
))
return rows or [[0, "No data yet", "", 0, 0, False, "", ""]]
# ── end B8: _load_review_table ─────────────────────────────────
def _show_papers_by_select(table_data, evt: gr.SelectData):
    """Describe the topic whose row was clicked in the review table.

    Triggered by ``review_table.select()``. The topic is identified by the
    value in column 0 of the clicked row (not by the row position) and
    looked up by ``topic_id`` in the labels checkpoints, or in the
    summaries checkpoints when no labels file exists.

    Args:
        table_data: Current table value — a pandas DataFrame or a list of
            rows.
        evt: Gradio selection event; ``evt.index[0]`` is the clicked row.

    Returns:
        One text section per matching checkpoint entry (header, counts,
        up to five nearest sentences, all paper titles), or a short
        message when the topic cannot be resolved.
    """
    row_idx = evt.index[0]
    if hasattr(table_data, "iloc"):
        cell = table_data.iloc[row_idx, 0]
    else:
        cell = table_data[row_idx][0]
    try:
        topic_id = int(cell)
    except (TypeError, ValueError):
        # Column 0 is editable, so it can be blank or non-numeric.
        return "Selected row has no numeric topic #."

    # Load paper data from checkpoint files
    label_files = sorted(glob.glob("/tmp/checkpoints/rq4_*_labels.json"))
    summary_files = sorted(glob.glob("/tmp/checkpoints/rq4_*_summaries.json"))

    sections = []
    for path in label_files or summary_files:
        # File names look like rq4_<source>_<kind>.json
        source = os.path.basename(path).split("_")[1]
        with open(path, encoding="utf-8") as fh:
            topics = json.load(fh)

        for topic in topics:
            if topic.get("topic_id") != topic_id:
                continue
            heading = topic.get("label", topic.get("top_words", "")[:50])
            evidence = "\n".join(
                f" {rank}. \"{hit.get('sentence', '')[:200]}\"\n"
                f" Paper: {hit.get('title', '')[:100]}"
                for rank, hit in enumerate((topic.get("nearest") or [])[:5], start=1)
            )
            titles = "\n".join(
                f" {rank}. {title}"
                for rank, title in enumerate(topic.get("paper_titles", []), start=1)
            )
            sections.append(
                f"═══ {source.upper()} — Topic {topic_id}: {heading} ═══\n"
                f"{topic.get('sentence_count', 0)} sentences from "
                f"{topic.get('paper_count', 0)} papers\n"
                f"AI Reasoning: {topic.get('reasoning', 'not yet labeled')}\n\n"
                f"── 5 NEAREST CENTROID SENTENCES (evidence) ──\n"
                + evidence
                + "\n\n── ALL PAPER TITLES ──\n"
                + titles
            )

    return "\n\n".join(sections) or f"Topic {topic_id} not found."
# ── end B9: _show_papers_by_select ─────────────────────────────
def _submit_review(table_data, chat_history):
    """Turn the edited review table into one agent message and send it.

    Each usable row becomes ``"Topic N: <decisions>"`` where the decisions
    are ``RENAME to '<text>'`` (when "Rename To" is filled), otherwise
    ``APPROVE '<label>'`` (when the Approve box is ticked), plus ``REJECT``
    whenever Approve is unticked; an optional reason is appended. Rows
    without a numeric topic number and the "No data yet" placeholder row
    are skipped.

    This is a generator producing values for
    ``(chatbot, download, chart_selector, chart_display, review_table,
    phase_progress)``. It yields once with a "processing" bubble and once
    more after the agent has answered; when there is nothing to submit it
    yields a single no-op update and does not call the agent.

    Args:
        table_data: Table value as a pandas DataFrame (``.values`` is used).
        chat_history: Current chatbot value (``None`` is treated as empty).
    """
    def cell_text(value):
        # Empty cells may arrive as None or NaN; NaN is the only value
        # that is not equal to itself.
        if value is None or value != value:
            return ""
        return str(value).strip()

    def is_checked(value):
        # Approve is expected to be a bool; textual "true"/"false" is
        # tolerated so that "false" is not read as truthy.
        if isinstance(value, str):
            return value.strip().lower() in ("true", "1", "yes")
        return value == value and bool(value)

    lines = []
    for row in table_data.values.tolist():
        if len(row) < 8 or cell_text(row[1]) == "No data yet":
            continue
        try:
            topic_no = int(row[0])
        except (TypeError, ValueError):
            continue

        rename = cell_text(row[6])
        reason = cell_text(row[7])
        approved = is_checked(row[5])

        decisions = []
        if rename:
            decisions.append(f"RENAME to '{rename}'")
        elif approved:
            decisions.append(f"APPROVE '{row[1]}'")
        if not approved:
            decisions.append("REJECT")

        line = f"Topic {topic_no}: " + ", ".join(decisions)
        if reason:
            line += f" — reason: {reason}"
        lines.append(line)

    if not lines:
        gr.Info("Nothing to review yet")
        yield (gr.update(), _latest_output(), gr.update(),
               gr.update(), gr.update(), _build_progress())
        return

    review_msg = "Review decisions:\n" + "\n".join(lines)
    print(f">>> Review submitted: {review_msg[:200]}")

    # YIELD 1: Show processing bubble
    chat_history = (chat_history or []) + [
        {"role": "user", "content": review_msg},
        {"role": "assistant", "content": "🔬 **Processing review decisions...**"},
    ]
    gr.Info("Review submitted to agent")
    yield (chat_history, _latest_output(), gr.update(),
           gr.update(), gr.update(), _build_progress())

    # Invoke agent with review decisions (same thread as the chat)
    result = agent.invoke(
        {"messages": [("human", review_msg)]},
        config={"configurable": {"thread_id": "session"}},
    )
    response = result["messages"][-1].content

    # YIELD 2: Final response + refreshed table/charts
    chat_history[-1] = {"role": "assistant", "content": response}
    gr.Info("Review processed — table updated")
    yield (
        chat_history,
        _latest_output(),
        gr.update(choices=_get_chart_choices()),
        gr.update(),
        gr.update(value=_load_review_table()),
        _build_progress(),
    )
print(">>> Building UI...")

# ── B11–B19: UI layout and event wiring ─────────────────────────
# Everything below runs inside one gr.Blocks context: components are
# created in display order, then events are attached. The helper
# functions defined in here only adapt the module-level generators
# (respond, _submit_review) to the list of output components.
with gr.Blocks(
    title="Topic Modelling — Agentic AI",
    fill_width=True,
    css="""
    /* Accent bar at very top of page */
    .gradio-container::before {
        content: "";
        display: block;
        height: 3px;
        background: linear-gradient(90deg, #0d9488, #6366f1);
        margin-bottom: 4px;
    }
    /* Tabs: tighter padding, bolder active state */
    .tab-nav button {
        font-size: 13px !important;
        font-weight: 500 !important;
        letter-spacing: 0.01em;
        padding: 6px 16px !important;
    }
    .tab-nav button.selected {
        font-weight: 700 !important;
        border-bottom: 2px solid #0d9488 !important;
    }
    /* Dataframe: subtle zebra rows */
    .table-wrap tr:nth-child(even) td {
        background-color: rgba(13, 148, 136, 0.04);
    }
    /* Chat: teal left-border on assistant bubbles */
    .message.bot {
        border-left: 3px solid #0d9488 !important;
    }
    /* Phase progress: monospace, slightly muted */
    .phase-bar p {
        font-family: "Fira Code", monospace;
        font-size: 12px;
        letter-spacing: 0.03em;
        opacity: 0.80;
    }
    /* Upload area: cleaner dashed border */
    .upload-container {
        border-style: dashed !important;
        border-width: 1px !important;
    }
    """,
) as demo:
    # ── B12: Header ────────────────────────────────────────────
    # PURPOSE: Application title and subtitle.
    gr.Markdown(
        "# 🔬 Topic Modelling · Agentic AI\n"
        "<sub>Mistral · Cosine Clustering · 384d Embeddings · Braun & Clarke Thematic Analysis</sub>"
    )
    # ── end B12: Header ────────────────────────────────────────

    # ── B13: Data input ────────────────────────────────────────
    # PURPOSE: CSV file upload area with inline instructions.
    # COMPONENTS: gr.File (upload) + gr.Markdown (instructions)
    # EVENTS: upload.change → B19 (_auto_load_csv)
    gr.Markdown("**① Upload**")
    with gr.Row():
        upload = gr.File(label="📂 Scopus CSV", file_types=[".csv"])
        gr.Markdown(
            "Upload your Scopus CSV export, then type `run abstract only` in the chat below "
            "to begin the analysis pipeline."
        )
    # ── end B13: Data input ────────────────────────────────────

    # ── B14: Progress pipeline ─────────────────────────────────
    # PURPOSE: Shows which analysis phases have checkpoint files.
    # elem_classes="phase-bar" targets the monospace CSS rule above.
    # UPDATED BY: B18 (after chat), B10 (after review), B19 (after upload)
    phase_progress = gr.Markdown(value=_build_progress(), elem_classes=["phase-bar"])
    # ── end B14: Progress pipeline ─────────────────────────────

    # ── B15: Chatbot + input ───────────────────────────────────
    # PURPOSE: Main conversation interface with the LangGraph agent.
    # COMPONENTS: gr.Chatbot (display), gr.Textbox (input), gr.Button (send)
    # EVENTS: msg.submit → B18, send.click → B18
    gr.Markdown("**② Conversation** — follow the guided workflow")
    with gr.Group():
        chatbot = gr.Chatbot(
            height=320,
            show_label=False,
            avatar_images=(
                None,
                "https://api.dicebear.com/7.x/bottts-neutral/svg?seed=bertopic",
            ),
            placeholder=(
                "**Ready.** Upload a Scopus CSV above, then type:\n\n"
                "`run abstract only` · `approve all` · `show topic 4 papers` · `done`"
            ),
        )
        with gr.Row():
            msg = gr.Textbox(
                placeholder="run · approve · show topic 4 papers · group 0 1 5 · done",
                show_label=False, scale=9, lines=1, max_lines=1, container=False,
            )
            send = gr.Button("⏎ Send", variant="primary", scale=1, min_width=80)
    # ── end B15: Chatbot + input ───────────────────────────────

    # ── B16: Review table tab ──────────────────────────────────
    # PURPOSE: Interactive topic review table where the researcher
    #          approves, renames, or annotates discovered topics.
    # COMPONENTS: gr.Dataframe, gr.Button (submit), gr.Textbox (papers)
    # EVENTS: review_table.select → B9, submit_review.click → B10
    gr.Markdown("**③ Review & Export**")
    with gr.Tabs():
        with gr.Tab("📋 Topics"):
            gr.Markdown(
                "*Toggle **Approve**, fill in **Rename To** or **Reasoning**, "
                "then click Submit. Click any row to inspect its source papers below.*"
            )
            review_table = gr.Dataframe(
                headers=[
                    "#", "Topic Label", "Top Evidence Sentence",
                    "Sentences", "Papers", "Approve", "Rename To", "Your Reasoning",
                ],
                datatype=[
                    "number", "str", "str", "number", "number",
                    "bool", "str", "str",
                ],
                interactive=True,
                column_count=8,
                # NOTE: These features need Gradio >=5.23. Uncomment when available:
                # static_columns=[0, 1, 2, 3, 4],
                # pinned_columns=2,
                # show_search="filter",
                # show_row_numbers=True,
                # show_fullscreen_button=True,
                # show_copy_button=True,
                # column_widths=["60px","200px","250px","80px","70px","70px","150px","200px"],
            )
            submit_review = gr.Button("✅ Submit Review to Agent", variant="primary")
            gr.Markdown("---")
            gr.Markdown("**📄 Papers in selected topic** *(click any row above)*")
            paper_list = gr.Textbox(
                label="Papers in selected topic",
                lines=8, interactive=False,
            )
        # ── end B16: Review table tab ──────────────────────────

        # ── B17a: Charts tab ───────────────────────────────────
        # PURPOSE: Display charts rendered natively in gr.Plot from
        #          Plotly JSON files.
        # EVENTS: chart_selector.change → B7 (_load_chart)
        with gr.Tab("📊 Visualise"):
            chart_selector = gr.Dropdown(
                choices=[], label="Select chart", interactive=True,
            )
            chart_display = gr.Plot(label="BERTopic Visualization")
        # ── end B17a: Charts tab ───────────────────────────────

        # ── B17b: Download tab ─────────────────────────────────
        # PURPOSE: Multi-file download for all pipeline outputs.
        # UPDATED BY: B18, B10, B19 — refreshed after each action
        with gr.Tab("⬇ Export"):
            gr.Markdown(
                "**Files by Phase (per run: abstract / title):**\n\n"
                "**Phase 2 — Discovery:** `summaries.json` · `emb.npy`\n\n"
                "**Phase 2 — Labeling:** `labels.json`\n\n"
                "**Phase 2 — Charts:** `intertopic.json` · `bars.json` · "
                "`hierarchy.json` · `heatmap.json`\n\n"
                "**Phase 3 — Themes:** `themes.json`\n\n"
                "**Phase 5.5 — Taxonomy:** `taxonomy_map.json`\n\n"
                "**Phase 6 — Report:** `comparison.csv` · `narrative.txt`"
            )
            download = gr.File(label="All output files", file_count="multiple")
        # ── end B17b: Download tab ─────────────────────────────

    # ── B18: Event wiring ──────────────────────────────────────
    chart_selector.change(_load_chart, [chart_selector], [chart_display])
    review_table.select(
        _show_papers_by_select, [review_table], [paper_list],
    )
    submit_review.click(
        _submit_review, [review_table, chatbot],
        [chatbot, download, chart_selector, chart_display,
         review_table, phase_progress],
    )

    def _post_turn_updates():
        """Return fresh values for (chart_selector, chart_display,
        review_table, phase_progress) once an agent turn has finished.

        The last chart in the choice list is selected and loaded; when
        there is no chart (or it cannot be loaded) the plot is left as is.
        """
        choices = _get_chart_choices()
        latest = choices[-1] if choices else None
        figure = _load_chart(latest) if latest else None
        return (
            gr.update(choices=choices, value=latest),
            figure if figure else gr.update(),
            gr.update(value=_load_review_table()),
            _build_progress(),
        )

    def respond_with_viz(message, chat_history, uploaded_file):
        """Wrap respond() and update charts + table + progress after each turn."""
        turn = respond(message, chat_history, uploaded_file)
        # First yield (progress bubble): only the chart list is refreshed.
        hist, txt, dl = next(turn)
        yield (hist, txt, dl, gr.update(choices=_get_chart_choices()),
               gr.update(), gr.update(), _build_progress())
        # Second yield (final response + populate table + charts)
        hist, txt, dl = next(turn)
        yield (hist, txt, dl, *_post_turn_updates())

    chat_inputs = [msg, chatbot, upload]
    chat_outputs = [chatbot, msg, download, chart_selector, chart_display,
                    review_table, phase_progress]
    msg.submit(respond_with_viz, chat_inputs, chat_outputs)
    send.click(respond_with_viz, chat_inputs, chat_outputs)
    # ── end B18: respond_with_viz + event bindings ─────────────

    # ── B19: _auto_load_csv() ──────────────────────────────────
    # PURPOSE: Automatically triggers analysis when a CSV file is
    #          uploaded. Sends "Analyze my Scopus CSV" as the
    #          initial message so no manual typing is needed.
    # TRIGGERED BY: upload.change event
    # CALLS: B6 (respond) with auto-message
    # OUTPUTS: chatbot, download, chart_selector, chart_display,
    #          review_table, phase_progress
    def _auto_load_csv(uploaded_file, chat_history):
        """Auto-trigger analysis when CSV is uploaded — no typing needed."""
        # change also fires when the file is cleared; without this guard
        # the agent would be re-run against the previously stored path.
        if not uploaded_file:
            yield (gr.update(), gr.update(), gr.update(), gr.update(),
                   gr.update(), _build_progress())
            return

        turn = respond("Analyze my Scopus CSV", chat_history, uploaded_file)
        # First yield (progress); the cleared-textbox value is not wired here.
        hist, _txt, dl = next(turn)
        yield (hist, dl, gr.update(), gr.update(),
               gr.update(), _build_progress())
        # Second yield (final + populate everything)
        hist, _txt, dl = next(turn)
        yield (hist, dl, *_post_turn_updates())

    upload.change(
        _auto_load_csv, [upload, chatbot],
        [chatbot, download, chart_selector, chart_display,
         review_table, phase_progress],
    )
    # ── end B19: _auto_load_csv ────────────────────────────────
# ── end B19: _auto_load_csv ────────────────────────────────
# Start the web server. Host and port are fixed; the remaining options
# are app-level settings handed to launch().
print(">>> Launching...")
demo.launch(
    theme=theme,  # Gradio 6: moved from gr.Blocks()
    footer_links=[],  # Gradio 6: hides footer, replaces show_api
    ssr_mode=False,
    server_port=7860,
    server_name="0.0.0.0",
)
# ── end B20: Launch ────────────────────────────────────────────