"""
app.py — Topic Modelling Agentic AI | Gradio UI
═══════════════════════════════════════════════════
Version: 3.1.0 | April 2026
Stack:   Gradio 5.x + LangGraph + Mistral + BERTopic
Deploy:  HuggingFace Spaces (sdk: gradio)
Rules:   Zero gr.HTML(). All UI via native Gradio components.
         See GRADIO_UI_GUIDELINES_v2.docx for full standards.

ARCHITECTURE — 20 Blocks in 5 Sections
─────────────────────────────────────────
Section 1: Setup        (B1–B3)    Imports, agent, theme
Section 2: Helpers      (B4–B10)   Pure Python functions, no UI
Section 3: UI Layout    (B11–B17)  gr.Blocks with native components
Section 4: Event Wiring (B18–B19)  Connect UI to functions
Section 5: Launch       (B20)      Start server

BLOCK COMMUNICATION MAP
─────────────────────────
B6  (respond) ←→ B2   (agent)    : invokes agent for chat
B6  (respond) →  B4   (output)   : scans for download files
B7  (chart)   →  B17a (display)  : loads Plotly JSON → gr.Plot
B8  (table)   →  B16  (review)   : builds rows → gr.Dataframe
B9  (papers)  ←  B16  (review)   : triggered by row click
B10 (submit)  →  B2   (agent)    : sends review edits to agent
B18 (wiring)  →  B5,B7,B8        : refreshes progress, charts, table
"""

# ── B1: Imports ─────────────────────────────────────────────────
import glob
import json
import os

import gradio as gr
import plotly.io as pio
from langchain_mistralai import ChatMistralAI
from langgraph.checkpoint.memory import MemorySaver
from langgraph.prebuilt import create_react_agent

from agent import SYSTEM_PROMPT, get_local_tools

print(">>> app.py: imports complete")
# ── end B1: Imports ────────────────────────────────────────────

# ── B2: Agent setup ─────────────────────────────────────────────
llm = ChatMistralAI(model="mistral-small-latest", temperature=0, timeout=300)
tools = get_local_tools()
agent = create_react_agent(
    model=llm,
    tools=tools,
    prompt=SYSTEM_PROMPT,
    checkpointer=MemorySaver(),
)
print(f">>> app.py: agent ready ({len(tools)} tools)")

_msg_count = 0            # Global message counter (shared across users)
_uploaded = {"path": ""}  # Last uploaded CSV path (shared session)

# Locations read by the helpers below (the agent's tools are assumed to
# write their outputs here — the globs in this file are the only evidence).
OUTPUT_DIR = "/tmp"
CHECKPOINT_DIR = "/tmp/checkpoints"

# Prompt used when the user submits an empty message or uploads a CSV.
DEFAULT_PROMPT = "Analyze my Scopus CSV"

# Every request uses the same LangGraph thread, i.e. one shared conversation.
_AGENT_CONFIG = {"configurable": {"thread_id": "session"}}


def _checkpoints(suffix):
    """Return the sorted checkpoint paths matching ``rq4_*_<suffix>``."""
    return sorted(glob.glob(os.path.join(CHECKPOINT_DIR, f"rq4_*_{suffix}")))


def _load_entries(path):
    """Return the JSON content of *path*, or ``[]`` when it cannot be read.

    The file handle is always closed. Unreadable or malformed files are
    logged and treated as empty so a bad checkpoint cannot crash a UI
    callback.
    """
    try:
        with open(path, encoding="utf-8") as handle:
            return json.load(handle) or []
    except (OSError, ValueError) as exc:  # ValueError covers JSONDecodeError
        print(f">>> Could not read {path}: {exc}")
        return []


def _ask_agent(text):
    """Send *text* to the agent on the shared thread.

    Returns:
        ``(reply, ok)`` — the content of the last message and ``True`` on
        success, or a user-facing error string and ``False`` when the
        invocation raised.
    """
    try:
        result = agent.invoke({"messages": [("human", text)]}, config=_AGENT_CONFIG)
    except Exception as exc:  # UI boundary: report instead of killing the callback
        print(f">>> Agent error: {exc!r}")
        return f"⚠️ **Agent error:** {exc}", False
    return result["messages"][-1].content, True
# ── end B2: Agent setup ────────────────────────────────────────


# ── B3: Theme ───────────────────────────────────────────────────
# PURPOSE: Define the visual identity of the entire application.
#          Uses teal/indigo on zinc — purposeful scientific feel.
#          Plus Jakarta Sans: geometric-humanist, modern but not generic.
#          Fira Code for monospace elements (phase progress, etc).
# USED BY: B20 (demo.launch) — theme applied at launch time.
# ────────────────────────────────────────────────────────────────
theme = gr.themes.Default(
    primary_hue="teal",
    secondary_hue="indigo",
    neutral_hue="zinc",
    font=gr.themes.GoogleFont("Plus Jakarta Sans"),
    font_mono=gr.themes.GoogleFont("Fira Code"),
    radius_size="sm",
    spacing_size="md",
).set(
    button_primary_background_fill="*primary_600",
    button_primary_background_fill_hover="*primary_500",
    button_primary_text_color="white",
    block_label_text_size="sm",
    block_title_text_weight="600",
)
# ── end B3: Theme ──────────────────────────────────────────────


# ── B4: _latest_output() ────────────────────────────────────────
# Sort rank per file-name fragment; a file's score is the sum of the ranks
# of every fragment its name contains.
_PHASE_ORDER = {
    "summaries": 1,
    "labels": 2,
    "themes": 3,
    "taxonomy": 4,
    "emb": 0,
    "intertopic": 5,
    "bars": 6,
    "hierarchy": 7,
    "heatmap": 8,
    "comparison": 9,
    "narrative": 10,
}


def _latest_output():
    """Collect all rq4_* output files, sorted by phase order.

    Includes the ``.txt`` report and ``.npy`` embeddings in addition to the
    CSV/JSON files, since ``_PHASE_ORDER`` ranks them ("narrative", "emb")
    and the Export tab lists them.

    Returns:
        List of file paths for the gr.File download component, or ``None``
        when no output exists yet.
    """
    patterns = (
        os.path.join(OUTPUT_DIR, "rq4_*.csv"),
        os.path.join(OUTPUT_DIR, "rq4_*.json"),
        os.path.join(OUTPUT_DIR, "rq4_*.txt"),
        os.path.join(CHECKPOINT_DIR, "rq4_*.json"),
        os.path.join(CHECKPOINT_DIR, "rq4_*.npy"),
    )
    files = [path for pattern in patterns for path in glob.glob(pattern)]

    def phase_score(path):
        # Score on the file name only so directory names never contribute.
        name = os.path.basename(path)
        return sum(rank for key, rank in _PHASE_ORDER.items() if key in name)

    return sorted(files, key=phase_score) or None
# ── end B4: _latest_output ─────────────────────────────────────


# ── B5: _build_progress() ───────────────────────────────────────
def _build_progress():
    """Return the emoji progress pipeline shown in the gr.Markdown of B14.

    NO HTML — just text + emoji. A phase is marked done when at least one
    file matching its pattern(s) exists.
    """

    def exists(*patterns):
        return any(glob.glob(pattern) for pattern in patterns)

    def ckpt(suffix):
        return os.path.join(CHECKPOINT_DIR, f"rq4_*_{suffix}")

    # Themes, Review and Names are all keyed off the same themes checkpoint.
    themes_done = exists(ckpt("themes.json"))
    checks = [
        ("Load", exists(ckpt("summaries.json"), ckpt("emb.npy"))),
        ("Codes", exists(ckpt("labels.json"))),
        ("Themes", themes_done),
        ("Review", themes_done),
        ("Names", themes_done),
        ("PAJAIS", exists(ckpt("taxonomy_map.json"))),
        ("Report", exists(
            os.path.join(OUTPUT_DIR, "rq4_comparison.csv"),
            os.path.join(OUTPUT_DIR, "rq4_narrative.txt"),
        )),
    ]
    return " → ".join(f"{'✅' if done else '⬜'} {name}" for name, done in checks)
# ── end B5: _build_progress ────────────────────────────────────


# ── B6: respond() ───────────────────────────────────────────────
def respond(message, chat_history, uploaded_file):
    """Handle one chat turn with the LangGraph agent.

    Args:
        message: Text typed by the user; empty/None falls back to
            ``DEFAULT_PROMPT``.
        chat_history: Current chatbot messages (list of role/content dicts).
        uploaded_file: Value of the upload component; when falsy the last
            remembered path in ``_uploaded`` is reused.

    Yields:
        Exactly two ``(chat_history, "", download_files)`` tuples: first with
        a progress bubble, then with the agent's reply (or an error notice
        if the agent call failed).
    """
    global _msg_count
    _msg_count += 1

    # Remember the most recent upload so later turns still know the path.
    if uploaded_file:
        _uploaded["path"] = uploaded_file
    csv_path = _uploaded.get("path", "")

    # Tell agent where the CSV is (prevents hallucinated filepaths)
    if csv_path:
        file_note = f"\n[CSV file at: {csv_path}]"
    else:
        file_note = "\n[No CSV uploaded yet — ask user to upload a file first]"

    # Tell agent what phase we're in based on existing checkpoint files
    if _checkpoints("labels.json"):
        phase_context = "\n[Phase context: labels exist]"
    elif _checkpoints("emb.npy"):
        phase_context = "\n[Phase context: embeddings exist]"
    else:
        phase_context = "\n[Phase context: fresh start]"

    # Show the effective prompt in the chat so the user bubble is never empty.
    user_text = (message or "").strip() or DEFAULT_PROMPT
    text = user_text + file_note + phase_context
    banner = "=" * 60
    print(f"\n{banner}\n>>> MSG #{_msg_count}: '{text[:120]}'\n{banner}")

    # YIELD 1: Show "thinking" bubble immediately
    chat_history = list(chat_history or []) + [
        {"role": "user", "content": user_text},
        {"role": "assistant", "content": "🔬 **Working...** _Agent is thinking..._"},
    ]
    yield chat_history, "", _latest_output()

    # Invoke agent — Mistral brain decides which tools to call
    response, ok = _ask_agent(text)
    print(f">>> Response ({len(response)} chars)")

    # YIELD 2: Replace thinking bubble with actual response
    chat_history[-1] = {"role": "assistant", "content": response}
    if ok:
        gr.Info(f"Agent responded ({len(response)} chars)")
    else:
        gr.Warning("Agent call failed — see chat for details")
    yield chat_history, "", _latest_output()
# ── end B6: respond ────────────────────────────────────────────


# ── B7: _load_chart() / _get_chart_choices() ────────────────────
def _load_chart(chart_name):
    """Load a Plotly chart from its JSON file for gr.Plot.

    No HTML, no iframe — just a native Plotly figure object.

    Args:
        chart_name: File name of the chart inside ``OUTPUT_DIR``; may be
            ``None``/empty when the dropdown has no selection.

    Returns:
        The Plotly figure, or ``None`` when nothing is selected or the file
        is missing or not a valid figure.
    """
    if not chart_name:
        return None
    # basename() keeps the lookup inside OUTPUT_DIR.
    path = os.path.join(OUTPUT_DIR, os.path.basename(chart_name))
    try:
        with open(path, encoding="utf-8") as handle:
            return pio.from_json(handle.read())
    except (OSError, ValueError) as exc:
        print(f">>> Could not load chart {path}: {exc}")
        return None


def _get_chart_choices():
    """Return the sorted base names of all rq4_*.json files in OUTPUT_DIR."""
    files = sorted(glob.glob(os.path.join(OUTPUT_DIR, "rq4_*.json")))
    return [os.path.basename(path) for path in files]
# ── end B7: _load_chart ───────────────────────────────────────


# ── B8: _load_review_table() ────────────────────────────────────
_EMPTY_REVIEW_ROWS = [[0, "No data yet", "", 0, 0, False, "", ""]]


def _review_row(index, item, is_taxonomy, theme_lookup):
    """Build one 8-column review row from a checkpoint entry."""
    # Column 0 is read back as a topic id by _show_papers_by_select, so use
    # the entry's own id when it has one and fall back to its position.
    topic_id = item.get("topic_id")
    if topic_id is None:
        topic_id = index

    label = item.get("label", item.get("top_words", ""))

    # Evidence: PAJAIS mapping for taxonomy, nearest sentence otherwise
    if is_taxonomy:
        evidence = (
            f"→ {item.get('pajais_match', '?')} | {item.get('reasoning', '')}"[:120]
        )
    else:
        nearest = item.get("nearest") or []
        evidence = nearest[0].get("sentence", "")[:120] + "..." if nearest else ""

    # Sentence/paper counts come from the matching theme when one exists.
    counts = theme_lookup.get(item.get("label", ""), item)
    return [
        topic_id,                                                       # #
        label[:60],                                                     # Label
        evidence,                                                       # Evidence
        counts.get("sentence_count", item.get("sentence_count", 0)),    # Sentences
        counts.get("paper_count", item.get("paper_count", 0)),          # Papers
        True,                                                           # Approve (bool → checkbox)
        "",                                                             # Rename To
        "",                                                             # Reasoning
    ]


def _load_review_table():
    """Build review table rows from the most advanced checkpoint JSON.

    Priority: taxonomy_map > themes > labels > summaries. The Approve column
    is a bool so it renders as a checkbox in gr.Dataframe.
    NOTE(review): assumes each checkpoint holds a list of dicts — confirm
    against the tools that write them.

    Returns:
        List of 8-column rows; a single placeholder row when no data exists.
    """
    taxonomy_files = _checkpoints("taxonomy_map.json")
    theme_files = _checkpoints("themes.json")
    candidates = (
        taxonomy_files,
        theme_files,
        _checkpoints("labels.json"),
        _checkpoints("summaries.json"),
    )

    # Pick most advanced checkpoint available
    path = next((files[-1] for files in candidates if files), "")
    if not path:
        return [list(row) for row in _EMPTY_REVIEW_ROWS]

    is_taxonomy = bool(taxonomy_files)

    # For taxonomy: merge with themes to get sentence/paper counts
    theme_lookup = {}
    if is_taxonomy and theme_files:
        theme_lookup = {
            entry.get("label", ""): entry for entry in _load_entries(theme_files[-1])
        }

    rows = [
        _review_row(index, item, is_taxonomy, theme_lookup)
        for index, item in enumerate(_load_entries(path))
    ]
    return rows or [list(row) for row in _EMPTY_REVIEW_ROWS]
# ── end B8: _load_review_table ─────────────────────────────────


# ── B9: _show_papers_by_select() ────────────────────────────────
def _format_topic(source, topic_id, topic):
    """Render one topic entry as the plain-text block shown in B16."""
    evidence = "\n".join(
        f" {rank}. \"{hit.get('sentence', '')[:200]}\"\n"
        f" Paper: {hit.get('title', '')[:100]}"
        for rank, hit in enumerate(topic.get("nearest", [])[:5], start=1)
    )
    titles = "\n".join(
        f" {rank}. {title}"
        for rank, title in enumerate(topic.get("paper_titles", []), start=1)
    )
    return (
        f"═══ {source.upper()} — Topic {topic_id}: "
        f"{topic.get('label', topic.get('top_words', '')[:50])} ═══\n"
        f"{topic.get('sentence_count', 0)} sentences from "
        f"{topic.get('paper_count', 0)} papers\n"
        f"AI Reasoning: {topic.get('reasoning', 'not yet labeled')}\n\n"
        "── 5 NEAREST CENTROID SENTENCES (evidence) ──\n"
        + evidence
        + "\n\n── ALL PAPER TITLES ──\n"
        + titles
    )


def _show_papers_by_select(table_data, evt: gr.SelectData):
    """Show papers for the clicked row, using column 0 as the topic id.

    Triggered by review_table.select() — no separate Topic # input needed.

    Args:
        table_data: Current table value (DataFrame or list of rows).
        evt: Selection event; ``evt.index[0]`` is the clicked row.

    Returns:
        Text describing every matching topic found in the labels checkpoints
        (or the summaries checkpoints when no labels exist), or a
        "not found" message.
    """
    row_idx = evt.index[0]
    # Get topic_id from column 0 of the clicked row (not row index)
    if hasattr(table_data, "iloc"):
        cell = table_data.iloc[row_idx, 0]
    else:
        cell = table_data[row_idx][0]
    topic_id = int(cell)

    sections = []
    for path in _checkpoints("labels.json") or _checkpoints("summaries.json"):
        # Second "_"-separated token of the file name labels the section.
        source = os.path.basename(path).split("_")[1]
        for topic in _load_entries(path):
            if topic.get("topic_id") == topic_id:
                sections.append(_format_topic(source, topic_id, topic))

    return "\n\n".join(sections) or f"Topic {topic_id} not found."
# ── end B9: _show_papers_by_select ─────────────────────────────


# ── B10: _submit_review() ───────────────────────────────────────
def _cell_text(value):
    """Return a table cell as stripped text; None and NaN count as empty."""
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value).strip()


def _review_line(row):
    """Turn one review row into a single decision line for the agent.

    Exactly one decision is emitted per row: a rename wins over the Approve
    checkbox, otherwise the checkbox selects APPROVE or REJECT.
    """
    rename = _cell_text(row[6])
    reason = _cell_text(row[7])
    if rename:
        decision = f"RENAME to '{rename}'"
    elif row[5]:
        decision = f"APPROVE '{row[1]}'"
    else:
        decision = "REJECT"
    suffix = f" — reason: {reason}" if reason else ""
    return f"Topic {int(row[0])}: {decision}{suffix}"


def _submit_review(table_data, chat_history):
    """Convert review table edits into an agent message and send it.

    Args:
        table_data: Review table value (DataFrame or list of rows); the
            Approve column is bool (checkbox), not string.
        chat_history: Current chatbot messages.

    Yields:
        Two 6-tuples for (chatbot, download, chart_selector, chart_display,
        review_table, phase_progress): a processing bubble first, then the
        agent reply with refreshed chart choices and table.
    """
    if hasattr(table_data, "iloc"):
        rows = table_data.values.tolist()
    else:
        rows = list(table_data or [])

    review_msg = "Review decisions:\n" + "\n".join(_review_line(row) for row in rows)
    print(f">>> Review submitted: {review_msg[:200]}")

    # YIELD 1: Show processing bubble
    chat_history = list(chat_history or []) + [
        {"role": "user", "content": review_msg},
        {"role": "assistant", "content": "🔬 **Processing review decisions...**"},
    ]
    gr.Info("Review submitted to agent")
    yield (
        chat_history,
        _latest_output(),
        gr.update(),
        gr.update(),
        gr.update(),
        _build_progress(),
    )

    # Invoke agent with review decisions
    response, ok = _ask_agent(review_msg)

    # YIELD 2: Final response + refreshed table/charts
    chat_history[-1] = {"role": "assistant", "content": response}
    if ok:
        gr.Info("Review processed — table updated")
    else:
        gr.Warning("Agent call failed — see chat for details")
    yield (
        chat_history,
        _latest_output(),
        gr.update(choices=_get_chart_choices()),
        gr.update(),
        gr.update(value=_load_review_table()),
        _build_progress(),
    )
# ── end B10: _submit_review ────────────────────────────────────


def _refresh_views():
    """Return updates for (chart_selector, chart_display, review_table, phase_progress).

    Selects the last chart in the sorted choices; the plot is left untouched
    when there is no chart or it cannot be loaded.
    """
    choices = _get_chart_choices()
    latest = choices[-1] if choices else None
    figure = _load_chart(latest)
    return (
        gr.update(choices=choices, value=latest),
        figure if figure is not None else gr.update(),
        gr.update(value=_load_review_table()),
        _build_progress(),
    )


# ── B11: Layout shell + CSS ─────────────────────────────────────
CUSTOM_CSS = """
/* Accent bar at very top of page */
.gradio-container::before {
    content: "";
    display: block;
    height: 3px;
    background: linear-gradient(90deg, #0d9488, #6366f1);
    margin-bottom: 4px;
}
/* Tabs: tighter padding, bolder active state */
.tab-nav button {
    font-size: 13px !important;
    font-weight: 500 !important;
    letter-spacing: 0.01em;
    padding: 6px 16px !important;
}
.tab-nav button.selected {
    font-weight: 700 !important;
    border-bottom: 2px solid #0d9488 !important;
}
/* Dataframe: subtle zebra rows */
.table-wrap tr:nth-child(even) td {
    background-color: rgba(13, 148, 136, 0.04);
}
/* Chat: teal left-border on assistant bubbles */
.message.bot {
    border-left: 3px solid #0d9488 !important;
}
/* Phase progress: monospace, slightly muted */
.phase-bar p {
    font-family: "Fira Code", monospace;
    font-size: 12px;
    letter-spacing: 0.03em;
    opacity: 0.80;
}
/* Upload area: cleaner dashed border */
.upload-container {
    border-style: dashed !important;
    border-width: 1px !important;
}
"""

print(">>> Building UI...")

# NOTE(review): theme= is passed to launch() with a "Gradio 6" comment while
# css= stays here; confirm against the pinned Gradio version whether css
# should move to launch() as well.
with gr.Blocks(
    title="Topic Modelling — Agentic AI",
    fill_width=True,
    css=CUSTOM_CSS,
) as demo:

    # ── B12: Header ────────────────────────────────────────────
    # PURPOSE: Application title and subtitle.
    # ───────────────────────────────────────────────────────────
    gr.Markdown(
        "# 🔬 Topic Modelling · Agentic AI\n"
        "Mistral · Cosine Clustering · 384d Embeddings · Braun & Clarke Thematic Analysis"
    )
    # ── end B12: Header ────────────────────────────────────────

    # ── B13: Data input ────────────────────────────────────────
    # PURPOSE: CSV file upload area with inline instructions.
    #          Researcher uploads their Scopus CSV export here.
    #          On upload, B19 auto-triggers the first analysis.
    # COMPONENTS: gr.File (upload) + gr.Markdown (instructions)
    # EVENTS: upload.change → B19 (_auto_load_csv)
    # ───────────────────────────────────────────────────────────
    gr.Markdown("**① Upload**")
    with gr.Row():
        upload = gr.File(label="📂 Scopus CSV", file_types=[".csv"])
        gr.Markdown(
            "Upload your Scopus CSV export, then type `run abstract only` in the chat below "
            "to begin the analysis pipeline."
        )
    # ── end B13: Data input ────────────────────────────────────

    # ── B14: Progress pipeline ─────────────────────────────────
    # PURPOSE: Visual indicator of which Braun & Clarke analysis
    #          phases are complete. Updated after every agent action.
    #          elem_classes="phase-bar" targets the monospace CSS rule in B11.
    # COMPONENT: gr.Markdown — displays emoji string from B5
    # UPDATED BY: B18 (after chat), B10 (after review), B19 (after upload)
    # ───────────────────────────────────────────────────────────
    phase_progress = gr.Markdown(value=_build_progress(), elem_classes=["phase-bar"])
    # ── end B14: Progress pipeline ─────────────────────────────

    # ── B15: Chatbot + input ───────────────────────────────────
    # PURPOSE: Main conversation interface between researcher and
    #          the LangGraph agent.
    # COMPONENTS: gr.Chatbot (display), gr.Textbox (input), gr.Button (send)
    # EVENTS: msg.submit → B18, send.click → B18
    # ───────────────────────────────────────────────────────────
    gr.Markdown("**② Conversation** — follow the guided workflow")
    with gr.Group():
        chatbot = gr.Chatbot(
            height=320,
            show_label=False,
            avatar_images=(
                None,
                "https://api.dicebear.com/7.x/bottts-neutral/svg?seed=bertopic",
            ),
            placeholder=(
                "**Ready.** Upload a Scopus CSV above, then type:\n\n"
                "`run abstract only` · `approve all` · `show topic 4 papers` · `done`"
            ),
        )
        with gr.Row():
            msg = gr.Textbox(
                placeholder="run · approve · show topic 4 papers · group 0 1 5 · done",
                show_label=False,
                scale=9,
                lines=1,
                max_lines=1,
                container=False,
            )
            send = gr.Button("⏎ Send", variant="primary", scale=1, min_width=80)
    # ── end B15: Chatbot + input ───────────────────────────────

    # ── B16: Review table tab ──────────────────────────────────
    # PURPOSE: Interactive topic review table where the researcher
    #          approves, renames, or annotates BERTopic-discovered
    #          topics. This is the core human-in-the-loop interface.
    #
    # KEY FEATURES (all native Gradio, no HTML):
    #   - static_columns=[0,1,2,3,4] — first 5 columns read-only
    #   - datatype "bool" on column 5 — Approve renders as checkbox
    #   - pinned_columns=2 — # and Label stay visible when scrolling
    #   - show_search="filter" — built-in column filtering
    #   - .select() event — clicking any row auto-loads that topic's papers
    #
    # COMPONENTS: gr.Dataframe, gr.Button (submit), gr.Textbox (papers)
    # EVENTS: review_table.select → B9, submit_review.click → B10
    # ───────────────────────────────────────────────────────────
    gr.Markdown("**③ Review & Export**")
    with gr.Tabs():
        with gr.Tab("📋 Topics"):
            gr.Markdown(
                "*Toggle **Approve**, fill in **Rename To** or **Reasoning**, "
                "then click Submit. Click any row to inspect its source papers below.*"
            )
            review_table = gr.Dataframe(
                headers=[
                    "#",
                    "Topic Label",
                    "Top Evidence Sentence",
                    "Sentences",
                    "Papers",
                    "Approve",
                    "Rename To",
                    "Your Reasoning",
                ],
                datatype=[
                    "number",
                    "str",
                    "str",
                    "number",
                    "number",
                    "bool",
                    "str",
                    "str",
                ],
                interactive=True,
                column_count=8,
                # NOTE: These features need Gradio >=5.23. Uncomment when available:
                # static_columns=[0, 1, 2, 3, 4],
                # pinned_columns=2,
                # show_search="filter",
                # show_row_numbers=True,
                # show_fullscreen_button=True,
                # show_copy_button=True,
                # column_widths=["60px","200px","250px","80px","70px","70px","150px","200px"],
            )
            submit_review = gr.Button("✅ Submit Review to Agent", variant="primary")
            gr.Markdown("---")
            gr.Markdown("**📄 Papers in selected topic** *(click any row above)*")
            paper_list = gr.Textbox(
                label="Papers in selected topic",
                lines=8,
                interactive=False,
            )
        # ── end B16: Review table tab ──────────────────────────

        # ── B17a: Charts tab ───────────────────────────────────
        # PURPOSE: Display BERTopic visualization charts rendered
        #          natively in gr.Plot from Plotly JSON files.
        # COMPONENTS: gr.Dropdown (selector), gr.Plot (display)
        # EVENTS: chart_selector.change → B7 (_load_chart)
        # ───────────────────────────────────────────────────────
        with gr.Tab("📊 Visualise"):
            chart_selector = gr.Dropdown(
                choices=[],
                label="Select chart",
                interactive=True,
            )
            chart_display = gr.Plot(label="BERTopic Visualization")
        # ── end B17a: Charts tab ───────────────────────────────

        # ── B17b: Download tab ─────────────────────────────────
        # PURPOSE: Multi-file download for all pipeline outputs.
        # COMPONENTS: gr.Markdown (descriptions), gr.File (download)
        # UPDATED BY: B18, B10, B19 — refreshed after each action
        # ───────────────────────────────────────────────────────
        with gr.Tab("⬇ Export"):
            gr.Markdown(
                "**Files by Phase (per run: abstract / title):**\n\n"
                "**Phase 2 — Discovery:** `summaries.json` · `emb.npy`\n\n"
                "**Phase 2 — Labeling:** `labels.json`\n\n"
                "**Phase 2 — Charts:** `intertopic.json` · `bars.json` · "
                "`hierarchy.json` · `heatmap.json`\n\n"
                "**Phase 3 — Themes:** `themes.json`\n\n"
                "**Phase 5.5 — Taxonomy:** `taxonomy_map.json`\n\n"
                "**Phase 6 — Report:** `comparison.csv` · `narrative.txt`"
            )
            download = gr.File(label="All output files", file_count="multiple")
        # ── end B17b: Download tab ─────────────────────────────

    # ── B18: respond_with_viz + event bindings ─────────────────
    chart_selector.change(_load_chart, [chart_selector], [chart_display])

    review_table.select(
        _show_papers_by_select,
        [review_table],
        [paper_list],
    )

    submit_review.click(
        _submit_review,
        [review_table, chatbot],
        [chatbot, download, chart_selector, chart_display, review_table, phase_progress],
    )

    def respond_with_viz(message, chat_history, uploaded_file):
        """Wrap respond() and update charts + table + progress after each turn.

        Yields two 7-tuples for (chatbot, msg, download, chart_selector,
        chart_display, review_table, phase_progress).
        """
        turns = respond(message, chat_history, uploaded_file)

        # First yield (progress bubble)
        hist, txt, dl = next(turns)
        yield (
            hist,
            txt,
            dl,
            gr.update(choices=_get_chart_choices()),
            gr.update(),
            gr.update(),
            _build_progress(),
        )

        # Second yield (final response + populate table + charts)
        hist, txt, dl = next(turns)
        yield (hist, txt, dl, *_refresh_views())

    chat_inputs = [msg, chatbot, upload]
    chat_outputs = [
        chatbot,
        msg,
        download,
        chart_selector,
        chart_display,
        review_table,
        phase_progress,
    ]
    msg.submit(respond_with_viz, chat_inputs, chat_outputs)
    send.click(respond_with_viz, chat_inputs, chat_outputs)
    # ── end B18: respond_with_viz + event bindings ─────────────

    # ── B19: _auto_load_csv() ──────────────────────────────────
    # PURPOSE: Automatically triggers analysis when a CSV file is
    #          uploaded. Sends "Analyze my Scopus CSV" as the
    #          initial message so no manual typing is needed.
    # TRIGGERED BY: upload.change event
    # CALLS: B6 (respond) with auto-message
    # OUTPUTS: chatbot, download, chart_selector, chart_display,
    #          review_table, phase_progress
    # ───────────────────────────────────────────────────────────
    def _auto_load_csv(uploaded_file, chat_history):
        """Auto-trigger analysis when CSV is uploaded — no typing needed.

        upload.change also fires when the file is cleared; in that case
        nothing is sent to the agent and only the progress bar is refreshed.
        """
        if not uploaded_file:
            yield (
                gr.update(),
                gr.update(),
                gr.update(),
                gr.update(),
                gr.update(),
                _build_progress(),
            )
            return

        turns = respond(DEFAULT_PROMPT, chat_history, uploaded_file)

        # First yield (progress)
        hist, _, dl = next(turns)
        yield (hist, dl, gr.update(), gr.update(), gr.update(), _build_progress())

        # Second yield (final + populate everything)
        hist, _, dl = next(turns)
        yield (hist, dl, *_refresh_views())

    upload.change(
        _auto_load_csv,
        [upload, chatbot],
        [chatbot, download, chart_selector, chart_display, review_table, phase_progress],
    )
    # ── end B19: _auto_load_csv ────────────────────────────────

# ── B20: Launch ─────────────────────────────────────────────────
# NOTE(review): the comments below describe Gradio 6 launch() parameters
# while the module header says "Gradio 5.x" — confirm the pinned version.
print(">>> Launching...")
demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
    ssr_mode=False,
    theme=theme,        # Gradio 6: moved from gr.Blocks()
    footer_links=[],    # Gradio 6: hides footer, replaces show_api
)
# ── end B20: Launch ────────────────────────────────────────────