# topic_modelling / app.py
# (The following is HuggingFace Spaces upload-page chrome, not Python source,
#  kept here as a comment: milindkamat0507's picture β€” Upload app.py β€” 3454e5c verified)
"""
app.py β€” Topic Modelling Agentic AI | Gradio UI
═══════════════════════════════════════════════════
Version: 3.0.0 | April 2026
Stack: Gradio 5.x/6 + LangGraph + Mistral + BERTopic
       (B3/B20 comments assume Gradio 6 launch() kwargs β€” keep the pinned version consistent)
Deploy: HuggingFace Spaces (sdk: gradio)
Rules: Zero gr.HTML(). All UI via native Gradio components.
See GRADIO_UI_GUIDELINES_v2.docx for full standards.
ARCHITECTURE β€” 20 Blocks in 5 Sections
─────────────────────────────────────────
Section 1: Setup (B1–B3) Imports, agent, theme
Section 2: Helpers (B4–B10) Pure Python functions, no UI
Section 3: UI Layout (B11–B17) gr.Blocks with native components
Section 4: Event Wiring (B18–B19) Connect UI to functions
Section 5: Launch (B20) Start server
BLOCK COMMUNICATION MAP
─────────────────────────
B6 (respond) ←→ B2 (agent) : invokes agent for chat
B6 (respond) β†’ B4 (output) : scans for download files
B7 (chart) β†’ B17a (display) : loads Plotly JSON β†’ gr.Plot
B8 (table) β†’ B16 (review) : builds rows β†’ gr.Dataframe
B9 (papers) ← B16 (review) : triggered by row click
B10 (submit) β†’ B2 (agent) : sends review edits to agent
B18 (wiring) β†’ B5,B7,B8 : refreshes progress, charts, table
"""
import os
import glob
import json
import plotly.io as pio
import gradio as gr
from langchain_mistralai import ChatMistralAI
from langgraph.prebuilt import create_react_agent
from langgraph.checkpoint.memory import MemorySaver
from agent import SYSTEM_PROMPT, get_local_tools
print(">>> app.py: imports complete")
# ╔═══════════════════════════════════════════════════════════════╗
# β•‘ SECTION 1 β€” SETUP β•‘
# β•‘ One-time initialization: agent creation and visual theme. β•‘
# β•‘ Nothing here renders UI β€” it prepares the backend brain β•‘
# β•‘ and the visual identity for the entire application. β•‘
# β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
# ── B2: Agent setup ─────────────────────────────────────────────
# PURPOSE: Create the LangGraph ReAct agent that powers all chat.
# Connects Mistral LLM to BERTopic tools with memory so
# the agent remembers context across conversation turns.
# PRODUCES: `agent` β€” used by B6 (respond) and B10 (_submit_review)
# IMPORTS: SYSTEM_PROMPT, get_local_tools from agent.py
# NOTE: MemorySaver keeps conversation in RAM (resets on restart).
# For persistent memory, swap to SQLite checkpointer.
# ────────────────────────────────────────────────────────────────
# Mistral chat model; temperature=0 to minimize sampling randomness for
# reproducible tool-calling runs; timeout=300s for long BERTopic phases.
llm = ChatMistralAI(model="mistral-small-latest", temperature=0, timeout=300)
tools = get_local_tools()
agent = create_react_agent(
model=llm, tools=tools, prompt=SYSTEM_PROMPT, checkpointer=MemorySaver()
)
print(f">>> app.py: agent ready ({len(tools)} tools)")
# NOTE(review): both globals below are process-wide, not per-browser-session β€”
# concurrent users of the Space share them. Acceptable for a single-user demo;
# confirm before multi-user deployment.
_msg_count = 0 # Global message counter (shared across users)
_uploaded = {"path": ""} # Last uploaded CSV path (shared session)
# ── end B2: Agent setup ────────────────────────────────────────
# ── B3: Theme ───────────────────────────────────────────────────
# PURPOSE: Define the visual identity of the entire application.
# Replaces ALL custom CSS that was previously in HEADER_HTML:
# - DM Sans font (was @import url in <style> block)
# - Slate color palette (was hardcoded hex in inline styles)
# - Soft rounded corners and spacing
# USED BY: B20 (demo.launch) β€” Gradio 6 moved theme from gr.Blocks
# to launch(). The theme object is created here but applied
# in B20 via demo.launch(theme=theme).
# REPLACES: Old HEADER_HTML lines 33-38 (<style> block with CSS)
# ────────────────────────────────────────────────────────────────
# Theme object is only built here; it is applied at demo.launch(theme=theme)
# in B20 (per the Gradio 6 note in the B3 header above).
theme = gr.themes.Soft(
primary_hue="slate",
font=gr.themes.GoogleFont("DM Sans"),
font_mono=gr.themes.GoogleFont("JetBrains Mono"),
)
# ── end B3: Theme ──────────────────────────────────────────────
# ╔═══════════════════════════════════════════════════════════════╗
# β•‘ SECTION 2 β€” HELPER FUNCTIONS β•‘
# β•‘ Pure Python functions that process data and return clean β•‘
# β•‘ values (strings, lists, figures). NONE of these functions β•‘
# β•‘ return HTML strings. They feed data to UI components in β•‘
# β•‘ Section 3 via event handlers in Section 4. β•‘
# β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
# ── B4: _latest_output() ───────────────────────────────────────
# PURPOSE: Scan /tmp for all rq4_* output files generated by the
# BERTopic agent pipeline (CSVs, JSONs, chart files).
# Sorts them by pipeline phase order so the download
# component shows files in logical sequence.
# RETURNS: List[str] of filepaths sorted by phase, or None
# USED BY: B6 (respond) β€” attaches to download component after
# each agent response
# B10 (_submit_review) β€” refreshes downloads after review
# B19 (_auto_load_csv) β€” refreshes after initial upload
# ────────────────────────────────────────────────────────────────
def _latest_output():
    """Scan /tmp for ALL rq4_* files, sorted by phase order.
    Returns list of filepaths for gr.File download component, or None."""
    # Weight per phase keyword; a filename's score is the sum of the weights
    # of every keyword it contains, so files sort in pipeline order.
    phase_order = {
        "summaries": 1, "labels": 2, "themes": 3, "taxonomy": 4,
        "emb": 0, "intertopic": 5, "bars": 6, "hierarchy": 7,
        "heatmap": 8, "comparison": 9, "narrative": 10,
    }
    found = []
    for pattern in (
        "/tmp/rq4_*.csv",
        "/tmp/rq4_*.json",
        "/tmp/checkpoints/rq4_*.json",
    ):
        found.extend(glob.glob(pattern))

    def _phase_score(name):
        return sum(weight for key, weight in phase_order.items() if key in name)

    # sorted() is stable, so ties keep glob order (same as the old sort).
    ranked = sorted(found, key=_phase_score)
    return ranked or None
# ── end B4: _latest_output ─────────────────────────────────────
# ── B5: _build_progress() ──────────────────────────────────────
# PURPOSE: Check which Braun & Clarke phases are complete by
# scanning for checkpoint files on disk. Returns a
# human-readable emoji string showing pipeline status.
# RETURNS: str like "βœ… Load β†’ βœ… Codes β†’ ⏳ Themes β†’ ⬜ Report"
# USED BY: B14 (phase_progress initial value)
# B18 (respond_with_viz) β€” refreshes after each agent turn
# B10 (_submit_review) β€” refreshes after review submission
# B19 (_auto_load_csv) β€” refreshes after CSV upload
# REPLACES: Old _build_progress() which returned 24 lines of HTML
# with inline-styled <span> elements and color codes.
# Now returns pure text with emoji β€” gr.Markdown renders it.
# ────────────────────────────────────────────────────────────────
def _build_progress():
    """Return emoji progress pipeline. NO HTML β€” just text + emoji.
    Displayed in gr.Markdown component (B14)."""

    def _exists(*patterns):
        # True if any pattern matches at least one file on disk.
        return any(glob.glob(p) for p in patterns)

    # Themes/Review/Names all hinge on the same themes checkpoint.
    themes_done = _exists("/tmp/checkpoints/rq4_*_themes.json")
    stages = [
        ("Load", _exists("/tmp/checkpoints/rq4_*_summaries.json",
                         "/tmp/checkpoints/rq4_*_emb.npy")),
        ("Codes", _exists("/tmp/checkpoints/rq4_*_labels.json")),
        ("Themes", themes_done),
        ("Review", themes_done),
        ("Names", themes_done),
        ("PAJAIS", _exists("/tmp/checkpoints/rq4_*_taxonomy_map.json")),
        ("Report", _exists("/tmp/rq4_comparison.csv", "/tmp/rq4_narrative.txt")),
    ]
    parts = []
    for name, done in stages:
        icon = "βœ…" if done else "⬜"
        parts.append(f"{icon} {name}")
    return " β†’ ".join(parts)
# ── end B5: _build_progress ────────────────────────────────────
# ── B6: respond() ──────────────────────────────────────────────
# PURPOSE: Core chat handler. This is the brain of the app.
# 1. Stores uploaded CSV file path (if new upload)
# 2. Appends file location + phase context to user message
# so the agent knows what data is available
# 3. Yields a "thinking..." bubble immediately (user sees
# instant feedback while agent processes)
# 4. Invokes the LangGraph agent (Mistral decides which
# BERTopic tools to call)
# 5. Replaces thinking bubble with actual agent response
# 6. Attaches latest output files to download component
# INPUTS: message (str), chat_history (list[dict]), uploaded_file (str|None)
# YIELDS: Tuple of (chat_history, empty_string, download_files)
# β€” yields TWICE: first with progress bubble, then with final response
# TALKS TO: B2 (agent.invoke) β€” sends message, gets response
# B4 (_latest_output) β€” gets download file list
# USED BY: B18 (respond_with_viz wraps this)
# B19 (_auto_load_csv wraps this)
# NOTE: Uses single thread_id="session" so agent remembers
# previous turns (loaded CSV path, current phase, etc.)
# ────────────────────────────────────────────────────────────────
def respond(message, chat_history, uploaded_file):
    """Handle one chat turn with the LangGraph agent.
    Yields twice: progress bubble β†’ final response."""
    global _msg_count
    _msg_count += 1
    # Remember the upload path across turns; keep the previous one when no
    # new file arrives (and normalize a missing key to "").
    if uploaded_file:
        _uploaded["path"] = uploaded_file
    else:
        _uploaded["path"] = _uploaded.get("path", "")
    # Tell agent where the CSV is (prevents hallucinated filepaths).
    if _uploaded["path"]:
        file_note = f"\n[CSV file at: {_uploaded['path']}]"
    else:
        file_note = "\n[No CSV uploaded yet β€” ask user to upload a file first]"
    # Tell agent what phase we're in based on existing checkpoint files.
    if glob.glob("/tmp/checkpoints/rq4_*_labels.json"):
        phase_context = "\n[Phase context: labels exist]"
    elif glob.glob("/tmp/checkpoints/rq4_*_emb.npy"):
        phase_context = "\n[Phase context: embeddings exist]"
    else:
        phase_context = "\n[Phase context: fresh start]"
    user_text = (message or "").strip()
    text = (user_text or "Analyze my Scopus CSV") + file_note + phase_context
    print(f"\n{'='*60}\n>>> MSG #{_msg_count}: '{text[:120]}'\n{'='*60}")
    # YIELD 1: show a "thinking" bubble immediately for instant feedback.
    chat_history = chat_history + [
        {"role": "user", "content": user_text},
        {"role": "assistant", "content": "πŸ”¬ **Working...** _Agent is thinking..._"},
    ]
    yield chat_history, "", _latest_output()
    # Invoke agent β€” Mistral brain decides which tools to call.
    result = agent.invoke(
        {"messages": [("human", text)]},
        config={"configurable": {"thread_id": "session"}},
    )
    response = result["messages"][-1].content
    print(f">>> Response ({len(response)} chars)")
    # YIELD 2: replace the thinking bubble with the actual response.
    chat_history[-1] = {"role": "assistant", "content": response}
    gr.Info(f"Agent responded ({len(response)} chars)")
    yield chat_history, "", _latest_output()
# ── end B6: respond ────────────────────────────────────────────
# ── B7: _load_chart() ──────────────────────────────────────────
# PURPOSE: Load a BERTopic visualization chart from a saved Plotly
# JSON file on disk and return the figure object.
# The gr.Plot component in B17a renders this directly β€”
# no iframe, no HTML escaping, no srcdoc hack.
# INPUT: chart_name (str) β€” filename like "rq4_intertopic.json"
# RETURNS: plotly.graph_objects.Figure or None
# USED BY: B17a (chart_selector.change event)
# B18 (respond_with_viz) β€” auto-shows latest chart
# REPLACES: Old _load_chart() which used html.escape() + iframe
# srcdoc to embed HTML files. That was 8 lines of hack.
# REQUIRES: BERTopic tools in tools.py must save charts as Plotly
# JSON via pio.to_json(fig) instead of fig.write_html().
# ────────────────────────────────────────────────────────────────
def _load_chart(chart_name):
    """Load a Plotly chart from /tmp/<chart_name> for gr.Plot.

    Args:
        chart_name: filename like "rq4_intertopic.json" (may be None/"" when
            the dropdown is cleared or not yet populated).

    Returns:
        plotly Figure, or None when the name is empty or the file is missing.

    FIX: the previous expression crashed in every path β€” a missing file hit
    FileNotFoundError before the existence check could matter (the "guard"
    line was a no-op expression), and an existing file raised TypeError from
    `Figure * bool`, since plotly figures do not support multiplication.
    """
    if not chart_name:
        return None
    path = f"/tmp/{chart_name}"
    if not os.path.exists(path):
        return None
    with open(path) as fh:  # closes the handle (old version leaked it)
        return pio.from_json(fh.read())
def _get_chart_choices():
    """Return basenames of all rq4_*.json chart files in /tmp, sorted."""
    return [os.path.basename(p) for p in sorted(glob.glob("/tmp/rq4_*.json"))]
# ── end B7: _load_chart ───────────────────────────────────────
# ── B8: _load_review_table() ───────────────────────────────────
# PURPOSE: Load the latest BERTopic phase data (taxonomy, themes,
# labels, or summaries β€” whichever is most recent) and
# build a review table for the researcher to approve,
# rename, or annotate topics.
# RETURNS: List[List] with 8 columns matching the Dataframe schema:
# [#, Label, Evidence, Sentences, Papers, Approve, Rename, Reasoning]
# - Column 5 (Approve) is bool (True/False) β†’ renders as checkbox
# - Columns 0-4 are read-only (enforced by static_columns in B16)
# - Columns 5-7 are editable by the researcher
# USED BY: B16 (initial table value)
# B10 (_submit_review) β€” reloads after agent processes review
# B18 (respond_with_viz) β€” refreshes after each agent turn
# REPLACES: Old version which returned "yes"/"no" strings for Approve.
# Now returns True/False so gr.Dataframe renders checkboxes.
# ────────────────────────────────────────────────────────────────
def _load_review_table():
    """Build review table from latest checkpoint JSON.
    Approve column is bool (renders as checkbox in gr.Dataframe).
    Priority: taxonomy_map > themes > labels > summaries."""
    grouped = [
        sorted(glob.glob(pattern))
        for pattern in (
            "/tmp/checkpoints/rq4_*_taxonomy_map.json",
            "/tmp/checkpoints/rq4_*_themes.json",
            "/tmp/checkpoints/rq4_*_labels.json",
            "/tmp/checkpoints/rq4_*_summaries.json",
        )
    ]
    taxonomy_files, theme_files = grouped[0], grouped[1]
    # Pick the newest file of the most advanced checkpoint group available.
    path = ""
    for group in grouped:
        if group:
            path = group[-1]
            break
    is_taxonomy = bool(taxonomy_files) and taxonomy_files[-1] == path
    data = []
    if os.path.exists(path):
        data = json.load(open(path)) or []
    # For taxonomy: merge with themes to get sentence/paper counts.
    theme_lookup = {}
    if is_taxonomy and theme_files:
        for theme in json.load(open(theme_files[-1])):
            theme_lookup[theme.get("label", "")] = theme
    rows = []
    for idx, entry in enumerate(data):
        # Evidence: PAJAIS mapping for taxonomy, nearest sentence otherwise.
        if is_taxonomy:
            evidence = (
                f"β†’ {entry.get('pajais_match', '?')} | {entry.get('reasoning', '')}"[:120]
            )
        elif entry.get("nearest"):
            evidence = entry.get("nearest", [{}])[0].get("sentence", "")[:120] + "..."
        else:
            evidence = ""
        # Sentence/paper counts come from the matching theme when available.
        merged = theme_lookup.get(entry.get("label", ""), entry)
        rows.append([
            idx,                                                     # #
            entry.get("label", entry.get("top_words", ""))[:60],     # Label
            evidence,
            merged.get("sentence_count", entry.get("sentence_count", 0)),
            merged.get("paper_count", entry.get("paper_count", 0)),
            True,   # Approve (bool β†’ checkbox)
            "",     # Rename To
            "",     # Reasoning
        ])
    return rows or [[0, "No data yet", "", 0, 0, False, "", ""]]
# ── end B8: _load_review_table ─────────────────────────────────
# ── B9: _show_papers_by_select() ───────────────────────────────
# PURPOSE: When the researcher clicks any row in the review table,
# this function fires and shows the papers belonging to
# that topic. Eliminates the old workflow of typing a
# Topic # into a separate input and clicking "Show Papers".
# INPUT: gr.SelectData event β€” contains .index (row, col) and .value
# RETURNS: str β€” formatted paper list for gr.Textbox (paper_list)
# TRIGGERED BY: review_table.select() event in B16
# REPLACES: Old _show_papers(topic_id) + topic_num (gr.Number) +
# view_papers_btn (gr.Button) β€” all three components removed.
# NOTE: Uses column 0 value (the # column) as topic_id, NOT the
# row index, because filtering/sorting may reorder rows.
# ────────────────────────────────────────────────────────────────
def _show_papers_by_select(table_data, evt: gr.SelectData):
    """Show papers for clicked row. Uses column 0 as topic_id.
    Triggered by review_table.select() β€” no separate Topic # input needed."""
    clicked_row = evt.index[0]
    # Read the topic id from column 0 of the clicked row β€” NOT the row
    # index, since filtering/sorting may reorder the table.
    if hasattr(table_data, "iloc"):
        topic_id = int(table_data.iloc[clicked_row, 0])
    else:
        topic_id = int(table_data[clicked_row][0])
    # Prefer labeled checkpoints; fall back to raw summaries.
    label_files = sorted(glob.glob("/tmp/checkpoints/rq4_*_labels.json"))
    summary_files = sorted(glob.glob("/tmp/checkpoints/rq4_*_summaries.json"))
    sections = []
    for fpath in (label_files or summary_files):
        # e.g. "rq4_abstract_labels.json" β†’ source "abstract"
        source = os.path.basename(fpath).split("_")[1]
        for topic in json.load(open(fpath)):
            if topic.get("topic_id") != topic_id:
                continue
            nearest = topic.get('nearest', [])
            evidence = "\n".join(
                f" {i+1}. \"{topic['nearest'][i]['sentence'][:200]}\"\n"
                f" Paper: {topic['nearest'][i].get('title', '')[:100]}"
                for i in range(min(5, len(nearest)))
            )
            titles = "\n".join(
                f" {i+1}. {title}"
                for i, title in enumerate(topic.get('paper_titles', []))
            )
            sections.append(
                f"═══ {source.upper()} β€” Topic {topic_id}: "
                f"{topic.get('label', topic.get('top_words', '')[:50])} ═══\n"
                f"{topic.get('sentence_count', 0)} sentences from {topic.get('paper_count', 0)} papers\n"
                f"AI Reasoning: {topic.get('reasoning', 'not yet labeled')}\n\n"
                f"── 5 NEAREST CENTROID SENTENCES (evidence) ──\n"
                + evidence
                + "\n\n── ALL PAPER TITLES ──\n"
                + titles
            )
    return "\n\n".join(sections) or f"Topic {topic_id} not found."
# ── end B9: _show_papers_by_select ─────────────────────────────
# ── B10: _submit_review() ──────────────────────────────────────
# PURPOSE: When the researcher finishes editing the review table
# (checking Approve boxes, typing Rename values, adding
# Reasoning notes) and clicks "Submit Review", this
# function converts those edits into a natural language
# message and sends it to the agent for processing.
# INPUTS: table_data (DataFrame from gr.Dataframe), chat_history (list)
# YIELDS: Tuple of (chat, download, chart_choices, chart_fig,
# review_rows, progress_str) β€” yields twice (progress β†’ final)
# TALKS TO: B2 (agent.invoke) β€” sends review decisions
# B4 (_latest_output) β€” refreshes downloads
# B5 (_build_progress) β€” refreshes pipeline status
# B7 (_get_chart_choices) β€” refreshes chart dropdown
# B8 (_load_review_table) β€” reloads table with updated data
# NOTE: Column 5 (Approve) is now bool. True = approve, False = reject.
# ────────────────────────────────────────────────────────────────
def _submit_review(table_data, chat_history):
    """Convert review-table edits into a natural-language agent message.

    Each row now yields exactly ONE decision. FIX: the old string-multiply
    chain glued conflicting fragments with no separator β€” a rename typed
    while Approve was unchecked produced "RENAME to 'X'REJECT". Priority:
      1. non-empty "Rename To" (col 6)  β†’ RENAME
      2. Approve checked (col 5, bool)  β†’ APPROVE
      3. otherwise                      β†’ REJECT
    A " β€” reason: ..." suffix is appended when Reasoning (col 7) is filled.

    INPUTS: table_data β€” DataFrame from gr.Dataframe (8 columns, see B16)
            chat_history β€” list[dict] chat messages
    YIELDS twice (progress bubble β†’ final): (chat, download, chart_choices,
            chart_fig, review_rows, progress_str)
    """
    lines = []
    for r in table_data.values.tolist():
        rename_to = str(r[6]).strip()
        if rename_to:
            decision = f"RENAME to '{r[6]}'"
        elif r[5]:  # Approve checkbox (bool)
            decision = f"APPROVE '{r[1]}'"
        else:
            decision = "REJECT"
        if str(r[7]).strip():
            decision += f" β€” reason: {r[7]}"
        lines.append(f"Topic {int(r[0])}: {decision}")
    review_msg = "Review decisions:\n" + "\n".join(lines)
    print(f">>> Review submitted: {review_msg[:200]}")
    # YIELD 1: show processing bubble immediately.
    chat_history = chat_history + [
        {"role": "user", "content": review_msg},
        {"role": "assistant", "content": "πŸ”¬ **Processing review decisions...**"},
    ]
    gr.Info("Review submitted to agent")
    yield (chat_history, _latest_output(), gr.update(),
           gr.update(), gr.update(), _build_progress())
    # Invoke agent with review decisions (same persistent chat thread).
    result = agent.invoke(
        {"messages": [("human", review_msg)]},
        config={"configurable": {"thread_id": "session"}},
    )
    response = result["messages"][-1].content
    # YIELD 2: final response + refreshed table/charts/downloads/progress.
    chat_history[-1] = {"role": "assistant", "content": response}
    gr.Info("Review processed β€” table updated")
    yield (
        chat_history,
        _latest_output(),
        gr.update(choices=_get_chart_choices()),
        gr.update(),
        gr.update(value=_load_review_table()),
        _build_progress(),
    )
# ── end B10: _submit_review ────────────────────────────────────
# ╔═══════════════════════════════════════════════════════════════╗
# β•‘ SECTION 3 β€” UI LAYOUT β•‘
# β•‘ All visual components defined here using ONLY native Gradio β•‘
# β•‘ widgets. Zero gr.HTML() calls. Theming via B3. β•‘
# β•‘ Layout: Header β†’ Upload β†’ Progress β†’ Chat β†’ Results tabs β•‘
# β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
print(">>> Building UI...")
# ── B11: gr.Blocks container ───────────────────────────────────
# PURPOSE: Root container for the entire application UI.
# Enables full browser width via fill_width.
# CONTAINS: All UI blocks B12 through B17b
# CONFIG: title β€” browser tab title (stays on Blocks in Gradio 6)
# fill_width β€” removes side padding, uses full browser width
# NOTE: In Gradio 6.0, theme/css/footer_links moved from
# gr.Blocks() to demo.launch(). See B20 for those params.
# ────────────────────────────────────────────────────────────────
with gr.Blocks(
title="Topic Modelling β€” Agentic AI",
fill_width=True,
) as demo:
# NOTE(review): the source indentation of this whole `with` suite appears
# flattened to column 0 (likely a copy/paste artifact) β€” restore 4-space
# nesting before running; as-is this is an IndentationError.
# ── B12: Header ────────────────────────────────────────────
# PURPOSE: Application title and subtitle. Single gr.Markdown
# call replaces 15 lines of HEADER_HTML that included
# a gradient background div, font imports, and inline CSS.
# REPLACES: Old HEADER_HTML constant (lines 32-47 of old app.py)
# ───────────────────────────────────────────────────────────
gr.Markdown(
"# πŸ”¬ Topic Modelling β€” Agentic AI\n"
"*Mistral Β· Cosine Clustering Β· 384d Β· B&C Thematic Analysis*"
)
# ── end B12: Header ────────────────────────────────────────
# ── B13: Data input ────────────────────────────────────────
# PURPOSE: CSV file upload area with inline instructions.
# Researcher uploads their Scopus CSV export here.
# On upload, B19 auto-triggers the first analysis.
# COMPONENTS: gr.File (upload) + gr.Markdown (instructions)
# EVENTS: upload.change β†’ B19 (_auto_load_csv)
# NOTE(review): gr.File's change event presumably also fires when the file
# is cleared (value None), which would run _auto_load_csv with no CSV β€”
# confirm against the pinned Gradio version.
# ───────────────────────────────────────────────────────────
gr.Markdown("**β‘  Data input**")
with gr.Row():
upload = gr.File(label="πŸ“‚ Upload Scopus CSV", file_types=[".csv"])
gr.Markdown("**Upload your CSV** then type `run abstract only` in chat below")
# ── end B13: Data input ────────────────────────────────────
# ── B14: Progress pipeline ─────────────────────────────────
# PURPOSE: Visual indicator of which Braun & Clarke analysis
# phases are complete. Updated after every agent action.
# Now uses gr.Markdown with emoji text (was gr.HTML
# with inline-styled colored <span> elements).
# COMPONENT: gr.Markdown β€” displays emoji string from B5
# UPDATED BY: B18 (after chat), B10 (after review), B19 (after upload)
# REPLACES: Old gr.HTML(value=_build_progress()) with 24 lines of HTML
# ───────────────────────────────────────────────────────────
phase_progress = gr.Markdown(value=_build_progress())
# ── end B14: Progress pipeline ─────────────────────────────
# ── B15: Chatbot + input ───────────────────────────────────
# PURPOSE: Main conversation interface between researcher and
# the LangGraph agent. The chatbot displays message
# history with markdown rendering. The textbox + button
# below it capture user input.
# COMPONENTS: gr.Chatbot (display), gr.Textbox (input), gr.Button (send)
# EVENTS: msg.submit β†’ B18, send.click β†’ B18
# NOTE: placeholder text guides the researcher on available commands.
# height=300 keeps chat visible while showing results below.
# NOTE(review): the handlers (B6/B10) emit openai-style
# {"role": ..., "content": ...} dicts β€” confirm the installed Gradio
# defaults Chatbot to type="messages"; otherwise pass it explicitly here.
# ───────────────────────────────────────────────────────────
gr.Markdown("**β‘‘ Agent conversation** β€” follow the prompts below")
with gr.Group():
chatbot = gr.Chatbot(
height=300,
show_label=False,
placeholder="Upload your Scopus CSV above, then type: run abstract only",
)
with gr.Row():
msg = gr.Textbox(
placeholder="run Β· approve Β· show topic 4 papers Β· group 0 1 5 Β· done",
show_label=False, scale=9, lines=1, max_lines=1, container=False,
)
send = gr.Button("Send", variant="primary", scale=1, min_width=70)
# ── end B15: Chatbot + input ───────────────────────────────
# ── B16: Review table tab ──────────────────────────────────
# PURPOSE: Interactive topic review table where the researcher
# approves, renames, or annotates BERTopic-discovered
# topics. This is the core human-in-the-loop interface.
#
# KEY FEATURES (all native Gradio, no HTML):
# - static_columns=[0,1,2,3,4] β€” first 5 columns (#, Label,
# Evidence, Sentences, Papers) are READ-ONLY. Prevents
# accidental edits to agent-generated data.
# - datatype "bool" on column 5 β€” Approve renders as a native
# CHECKBOX. Researcher clicks to toggle, no typing needed.
# - pinned_columns=2 β€” # and Label columns stay visible when
# scrolling horizontally through wider columns.
# - show_search="filter" β€” built-in column filtering. Researcher
# can filter by paper count, sentence count, etc.
# - .select() event β€” clicking any row auto-loads that topic's
# papers in the textbox below. REPLACES the old workflow of
# Topic # input + Show Papers button (both removed).
#
# COMPONENTS: gr.Dataframe, gr.Button (submit), gr.Textbox (papers)
# EVENTS: review_table.select β†’ B9 (_show_papers_by_select)
# submit_review.click β†’ B10 (_submit_review)
# DATA: Loaded by B8 (_load_review_table)
# REPLACES: Old gr.Dataframe (no static_columns, string Approve,
# no search) + topic_num + view_papers_btn
# ───────────────────────────────────────────────────────────
gr.Markdown("**β‘’ Results** β€” review table, charts, downloads")
with gr.Tabs():
with gr.Tab("πŸ“‹ Review Table"):
gr.Markdown(
"*Edit Approve / Rename To / Reasoning β†’ click Submit. "
"Click any row to see its papers below.*"
)
# NOTE(review): column_count=8 must stay in sync with the headers/
# datatype lists below and with the 8-element rows built in B8/B10.
review_table = gr.Dataframe(
headers=[
"#", "Topic Label", "Top Evidence Sentence",
"Sentences", "Papers", "Approve", "Rename To", "Your Reasoning",
],
datatype=[
"number", "str", "str", "number", "number",
"bool", "str", "str",
],
interactive=True,
column_count=8,
# NOTE: These features need Gradio >=5.23. Uncomment when available:
# static_columns=[0, 1, 2, 3, 4],
# pinned_columns=2,
# show_search="filter",
# show_row_numbers=True,
# show_fullscreen_button=True,
# show_copy_button=True,
# column_widths=["60px","200px","250px","80px","70px","70px","150px","200px"],
)
submit_review = gr.Button("βœ… Submit Review to Agent", variant="primary")
# Paper viewer β€” triggered by clicking any row (replaces Topic # + button)
gr.Markdown("---")
gr.Markdown("**πŸ“„ Papers in selected topic** *(click any row above)*")
paper_list = gr.Textbox(
label="Papers in selected topic",
lines=8, interactive=False,
)
# ── end B16: Review table tab ──────────────────────────────
# ── B17a: Charts tab ───────────────────────────────────
# PURPOSE: Display BERTopic visualization charts (intertopic
# distance map, bar chart, hierarchy, heatmap).
# Charts are loaded as Plotly figure objects from
# JSON files and rendered natively in gr.Plot.
# COMPONENTS: gr.Dropdown (selector), gr.Plot (display)
# EVENTS: chart_selector.change β†’ B7 (_load_chart)
# REPLACES: Old iframe + srcdoc hack that used html.escape()
# to embed HTML files. Now uses gr.Plot directly.
# NOTE(review): the dropdown lists every /tmp/rq4_*.json (B7) β€” if any
# non-chart JSON is ever written to /tmp (rather than /tmp/checkpoints/),
# pio.from_json will fail on it; confirm the tools' output paths.
# ───────────────────────────────────────────────────────
with gr.Tab("πŸ“Š Charts"):
chart_selector = gr.Dropdown(
choices=[], label="Select Chart", interactive=True,
)
chart_display = gr.Plot(label="BERTopic Visualization")
# ── end B17a: Charts tab ───────────────────────────────
# ── B17b: Download tab ─────────────────────────────────
# PURPOSE: Multi-file download for all pipeline outputs.
# Shows file descriptions by phase and a gr.File
# component with all generated files.
# COMPONENTS: gr.Markdown (descriptions), gr.File (download)
# UPDATED BY: B18, B10, B19 β€” refreshed after each action
# ───────────────────────────────────────────────────────
with gr.Tab("πŸ“₯ Download"):
gr.Markdown(
"**Files by Phase (per run: abstract / title):**\n\n"
"**Phase 2 β€” Discovery:** `summaries.json` Β· `emb.npy`\n\n"
"**Phase 2 β€” Labeling:** `labels.json`\n\n"
"**Phase 2 β€” Charts:** `intertopic.json` Β· `bars.json` Β· "
"`hierarchy.json` Β· `heatmap.json`\n\n"
"**Phase 3 β€” Themes:** `themes.json`\n\n"
"**Phase 5.5 β€” Taxonomy:** `taxonomy_map.json`\n\n"
"**Phase 6 β€” Report:** `comparison.csv` Β· `narrative.txt`"
)
download = gr.File(label="All output files", file_count="multiple")
# ── end B17b: Download tab ─────────────────────────────
# ╔═══════════════════════════════════════════════════════════╗
# β•‘ SECTION 4 β€” EVENT WIRING β•‘
# β•‘ Connect UI components to helper functions. This is β•‘
# β•‘ where data flows are defined: which function runs when β•‘
# β•‘ a button is clicked, a file is uploaded, or a row is β•‘
# β•‘ selected. No HTML, no CSS β€” just Python event binding. β•‘
# β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
# ── B18: respond_with_viz() + event bindings ───────────────
# PURPOSE: Wrapper around B6 (respond) that also refreshes
# the chart dropdown, chart display, review table,
# and progress pipeline after each agent response.
# This is the main "after every chat turn, update
# everything" orchestrator.
# CALLS: B6 (respond), B5 (_build_progress), B7 (_load_chart,
# _get_chart_choices), B8 (_load_review_table)
# BINDINGS: msg.submit β†’ this function
# send.click β†’ this function
# OUTPUTS: chatbot, msg, download, chart_selector, chart_display,
# review_table, phase_progress (7 components updated)
# ───────────────────────────────────────────────────────────
# NOTE(review): chart_selector.change can fire with value None (e.g. the
# selection cleared) β€” _load_chart must tolerate a None/"" chart name.
chart_selector.change(_load_chart, [chart_selector], [chart_display])
# review_table.select: Gradio injects the SelectData event argument;
# only the table itself is passed as an explicit input.
review_table.select(
_show_papers_by_select, [review_table], [paper_list],
)
submit_review.click(
_submit_review, [review_table, chatbot],
[chatbot, download, chart_selector, chart_display,
review_table, phase_progress],
)
def respond_with_viz(message, chat_history, uploaded_file):
    """Wrap respond() and update charts + table + progress after each turn."""
    turns = respond(message, chat_history, uploaded_file)
    # Turn 1: progress bubble β€” refresh dropdown choices, leave the
    # chart, table, and their values untouched.
    history, box_text, files = next(turns)
    yield (
        history, box_text, files,
        gr.update(choices=_get_chart_choices()),
        gr.update(),
        gr.update(),
        _build_progress(),
    )
    # Turn 2: final response β€” also select/show the newest chart and
    # reload the review table.
    history, box_text, files = next(turns)
    chart_names = _get_chart_choices()
    if chart_names:
        newest = chart_names[-1]
        figure = _load_chart(newest) or gr.update()
    else:
        newest = None
        figure = gr.update()
    yield (
        history, box_text, files,
        gr.update(choices=chart_names, value=newest),
        figure,
        gr.update(value=_load_review_table()),
        _build_progress(),
    )
# Both the Enter key and the Send button run the same orchestrator;
# respond() yields "" for the textbox output, clearing it after send.
msg.submit(
respond_with_viz, [msg, chatbot, upload],
[chatbot, msg, download, chart_selector, chart_display,
review_table, phase_progress],
)
send.click(
respond_with_viz, [msg, chatbot, upload],
[chatbot, msg, download, chart_selector, chart_display,
review_table, phase_progress],
)
# ── end B18: respond_with_viz + event bindings ─────────────
# ── B19: _auto_load_csv() ──────────────────────────────────
# PURPOSE: Automatically triggers analysis when a CSV file is
# uploaded. The researcher doesn't need to type anything β€”
# just uploading the file starts the pipeline.
# Sends "Analyze my Scopus CSV" as the initial message.
# TRIGGERED BY: upload.change event
# CALLS: B6 (respond) with auto-message
# OUTPUTS: chatbot, download, chart_selector, chart_display,
# review_table, phase_progress
# ───────────────────────────────────────────────────────────
def _auto_load_csv(uploaded_file, chat_history):
    """Auto-trigger analysis when CSV is uploaded β€” no typing needed."""
    turns = respond("Analyze my Scopus CSV", chat_history, uploaded_file)
    # Turn 1: progress bubble only (textbox output is unused here).
    history, _, files = next(turns)
    yield (history, files, gr.update(), gr.update(),
           gr.update(), _build_progress())
    # Turn 2: final response β€” populate charts, table, and progress.
    history, _, files = next(turns)
    chart_names = _get_chart_choices()
    if chart_names:
        newest = chart_names[-1]
        figure = _load_chart(newest) or gr.update()
    else:
        newest = None
        figure = gr.update()
    yield (
        history, files,
        gr.update(choices=chart_names, value=newest),
        figure,
        gr.update(value=_load_review_table()),
        _build_progress(),
    )
# NOTE(review): change also fires when the file is cleared (None) β€”
# respond() then falls back to the previously stored path, if any.
upload.change(
_auto_load_csv, [upload, chatbot],
[chatbot, download, chart_selector, chart_display,
review_table, phase_progress],
)
# ── end B19: _auto_load_csv ────────────────────────────────
# ╔═══════════════════════════════════════════════════════════════╗
# β•‘ SECTION 5 β€” LAUNCH β•‘
# β•‘ Start the Gradio server. On HuggingFace Spaces this runs β•‘
# β•‘ automatically. Locally, access at http://localhost:7860 β•‘
# β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
# ── B20: Launch ────────────────────────────────────────────────
# PURPOSE: Start the web server. In Gradio 6.0, theme/css/footer
# params moved here from gr.Blocks().
# CONFIG: theme β€” from B3 (Soft + DM Sans + slate)
# footer_links=[] β€” hides footer natively (no CSS hack)
# ssr_mode=False β€” for HuggingFace Spaces free tier compat
# server_name="0.0.0.0" β€” accessible on network
# NOTE: On Spaces, port 7860 is auto-exposed to the internet.
# Locally, open http://localhost:7860 in your browser.
# ────────────────────────────────────────────────────────────────
print(">>> Launching...")
demo.launch(
server_name="0.0.0.0",
server_port=7860,
ssr_mode=False,
# NOTE(review): theme=/footer_links= as launch() kwargs are described in
# the comments as Gradio 6 behavior, yet the file header pins "Gradio 5.x"
# β€” on Gradio 5 these kwargs belong on gr.Blocks()/launch(show_api=...)
# and unknown launch kwargs raise TypeError. Confirm the pinned version.
theme=theme, # Gradio 6: moved from gr.Blocks()
footer_links=[], # Gradio 6: hides footer, replaces show_api
)
# ── end B20: Launch ────────────────────────────────────────────