"""
app.py — BERTopic Agentic Thematic Analysis — Gradio UI
THEMIS: Thematic Engine for Mining and Identifying Scholarly Topics
Implements Braun & Clarke (2006) 6-Phase Framework
"""

import json
import os

import gradio as gr
import pandas as pd

from agent import invoke_agent, reset_agent

CHECKPOINT_DIR = "checkpoints"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Checkpoint files whose presence marks a finished pipeline phase.
CHECKPOINT_FILES = (
    "summaries.json",
    "labels.json",
    "themes.json",
    "taxonomy_map.json",
    "comparison.csv",
    "narrative.txt",
)

# Column order of the review table (also used for the empty table).
REVIEW_COLUMNS = [
    "ID",
    "Label / Theme",
    "Category",
    "Sentences",
    "Confidence",
    "Top Evidence",
    "Approve",
    "Rename To",
    "Reasoning",
]

# NOTE(review): the markup inside the HTML strings of this module was
# reconstructed from the class names defined in CUSTOM_CSS. All visible text
# is unchanged; confirm tag structure against the intended design.


# ── Phase Checkpoint Detection ─────────────────────────────────────────────────
def get_phase_status() -> dict:
    """Return a mapping of checkpoint filename -> True if it exists on disk."""
    return {
        fname: os.path.exists(os.path.join(CHECKPOINT_DIR, fname))
        for fname in CHECKPOINT_FILES
    }


def render_phase_bar() -> str:
    """Render the phase progress bar as HTML from the checkpoint files."""
    status = get_phase_status()
    phases = [
        ("①", "Load", status["summaries.json"]),
        ("②", "Codes", status["labels.json"]),
        ("③", "Themes", status["themes.json"]),
        ("④⑤", "Review & Name", status["themes.json"]),
        ("⑤½", "PAJAIS Map", status["taxonomy_map.json"]),
        ("⑥", "Report", status["comparison.csv"] and status["narrative.txt"]),
    ]
    items = "".join(
        '<div class="phase-item {cls}">'
        '<span class="phase-num">{num}</span>'
        '<span class="phase-name">{icon} {name}</span>'
        "</div>".format(
            cls="phase-done" if done else "phase-pending",
            num=num,
            icon="✅" if done else "⬜",
            name=name,
        )
        for num, name, done in phases
    )
    return f'<div class="phase-bar">{items}</div>'


# ── Review Table Loader ────────────────────────────────────────────────────────
def _review_row(item: dict) -> dict:
    """Convert one checkpoint item (topic or theme) into a review-table row."""
    sentences = item.get("top_sentences") or [""]
    top_evidence = sentences[0]
    if len(top_evidence) > 120:
        top_evidence = top_evidence[:120] + "..."
    # Fall back to 0 when both confidence fields are missing or null, so that
    # float() never receives None.
    confidence = item.get("confidence") or item.get("match_confidence") or 0
    return {
        "ID": item.get("topic_id", item.get("theme_name", "")),
        "Label / Theme": item.get("label") or item.get("theme_name", ""),
        "Category": item.get("category") or item.get("pajais_match", ""),
        "Sentences": item.get("count") or item.get("sentence_count", 0),
        "Confidence": round(float(confidence), 2),
        "Top Evidence": top_evidence,
        "Approve": item.get("approve", ""),
        "Rename To": item.get("rename_to", ""),
        "Reasoning": item.get("user_reasoning", ""),
    }


def load_review_table() -> pd.DataFrame:
    """Load the highest-priority checkpoint into the review table.

    Checkpoints are tried from the latest phase to the earliest; the first one
    that exists, parses, and holds a non-empty item list wins. Returns an
    empty frame with the review columns when none qualifies.
    """
    checkpoint_priority = [
        ("taxonomy_map.json", "themes"),
        ("themes.json", "themes"),
        ("labels.json", "topics"),
        ("summaries.json", "topics"),
    ]
    for fname, key in checkpoint_priority:
        fpath = os.path.join(CHECKPOINT_DIR, fname)
        if not os.path.exists(fpath):
            continue
        try:
            with open(fpath, encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError):
            # Unreadable or corrupt checkpoint: fall back to an earlier phase
            # instead of crashing the UI (this runs while the app is built).
            continue
        items = data.get(key, []) if isinstance(data, dict) else []
        if not items:
            continue
        return pd.DataFrame(
            [_review_row(item) for item in items], columns=REVIEW_COLUMNS
        )
    return pd.DataFrame(columns=REVIEW_COLUMNS)


# ── Charts Loader ──────────────────────────────────────────────────────────────
def get_chart_options() -> list:
    """List the chart choices for every run that has a charts checkpoint."""
    options = []
    for run_key in ["abstract", "title"]:
        fpath = os.path.join(CHECKPOINT_DIR, f"{run_key}_charts.json")
        if os.path.exists(fpath):
            options.extend([
                f"{run_key.title()} — Intertopic Map",
                f"{run_key.title()} — Topic Sizes",
                f"{run_key.title()} — Similarity Heatmap",
                f"{run_key.title()} — Size Distribution",
            ])
    return options if options else ["No charts yet — run analysis first"]


def load_chart(selection: str) -> str:
    """Return the stored chart HTML for a dropdown selection.

    ``selection`` has the form "<Run> — <Chart name>". Unknown chart names
    fall back to the intertopic map; a placeholder is returned when nothing
    is selected or the run has no charts file yet.
    """
    if not selection or "No charts" in selection:
        return "<div><p>Run analysis to generate charts</p></div>"

    parts = selection.lower().split(" — ")
    if len(parts) < 2:
        return ""

    run_key = parts[0].strip()
    chart_key_map = {
        "intertopic map": "intertopic",
        "topic sizes": "bars",
        "similarity heatmap": "heatmap",
        "size distribution": "distribution",
    }
    chart_key = chart_key_map.get(parts[1].strip(), "intertopic")

    fpath = os.path.join(CHECKPOINT_DIR, f"{run_key}_charts.json")
    if not os.path.exists(fpath):
        return f"<div><p>No charts for {run_key} run yet.</p></div>"

    with open(fpath, encoding="utf-8") as f:
        charts = json.load(f)
    chart_html = charts.get(chart_key, "<div><p>Chart not found</p></div>")
    return f'<div class="chart-wrap">{chart_html}</div>'


# ── Download Links ─────────────────────────────────────────────────────────────
def render_download_links() -> str:
    """Render one download card per checkpoint file (available or pending)."""
    downloads = [
        ("summaries.json", "① Load Summary", "phase-1"),
        ("labels.json", "② Topic Labels", "phase-2"),
        ("themes.json", "③ Consolidated Themes", "phase-3"),
        ("taxonomy_map.json", "⑤½ PAJAIS Taxonomy Map", "phase-5"),
        ("comparison.csv", "⑥ Abstract vs Title Comparison", "phase-6"),
        ("narrative.txt", "⑥ Section 7 Narrative", "phase-6"),
    ]
    cards = []
    for fname, label, phase_cls in downloads:
        fpath = os.path.join(CHECKPOINT_DIR, fname)
        if os.path.exists(fpath):
            size = os.path.getsize(fpath)
            size_str = f"{size/1024:.1f} KB" if size >= 1024 else f"{size} B"
            # NOTE(review): the link target assumes Gradio's "/file=" route
            # serves files under the working directory — confirm for the
            # installed Gradio version.
            cards.append(f'''
<div class="download-card available {phase_cls}">
  <div class="dl-icon">📄</div>
  <div class="dl-info">
    <div class="dl-label">{label}</div>
    <div class="dl-meta">{fname} · {size_str}</div>
  </div>
  <a class="dl-btn" href="/file={CHECKPOINT_DIR}/{fname}" download="{fname}">↓ Download</a>
</div>
''')
        else:
            cards.append(f'''
<div class="download-card pending {phase_cls}">
  <div class="dl-info">
    <div class="dl-label">{label}</div>
    <div class="dl-meta">{fname} · not yet generated</div>
  </div>
  <div class="dl-btn-disabled">Pending</div>
</div>
''')
    return '<div class="download-grid">' + "".join(cards) + "</div>"


# ── Submit Review Handler ──────────────────────────────────────────────────────
def _cell_text(value) -> str:
    """Return a table cell as stripped text; None and NaN become ''."""
    if value is None:
        return ""
    try:
        if pd.isna(value):
            return ""
    except (TypeError, ValueError):
        # Non-scalar cell content: pd.isna is not a simple bool here, so fall
        # through and stringify the value as-is.
        pass
    return str(value).strip()


def submit_review(table_data: pd.DataFrame, chat_history: list):
    """Convert table decisions to an agent message and send it.

    Only rows with a non-blank "Approve" cell are reported. Returns the
    updated chat history twice (the two values are identical).
    """
    chat_history = chat_history or []
    if table_data is None or len(table_data) == 0:
        return chat_history, chat_history

    # Serialize table decisions. Blank cells may arrive as None/NaN, which
    # must not be mistaken for an "Approve" value.
    decisions = []
    for _, row in table_data.iterrows():
        approve = _cell_text(row.get("Approve", "")).upper()
        if not approve:
            continue
        label = row.get("Label / Theme", row.get("ID", ""))
        decisions.append(
            f"- {label}: "
            f"Approve={approve}, Rename To={_cell_text(row.get('Rename To', ''))}, "
            f"Reasoning={_cell_text(row.get('Reasoning', ''))}"
        )

    if not decisions:
        msg = "Researcher submitted review table with no changes — all topics accepted as-is."
    else:
        msg = "Researcher submitted review table:\n" + "\n".join(decisions)

    # Send to agent
    response = invoke_agent(msg, chat_history)
    chat_history = chat_history + [["📋 [Review Table Submitted]", response]]
    return chat_history, chat_history


# ── CSS ────────────────────────────────────────────────────────────────────────
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Syne:wght@400;500;600;700;800&family=JetBrains+Mono:wght@300;400;500&family=Inter:wght@300;400;500&display=swap');

:root {
  --bg-deep: #09090f;
  --bg-panel: #0f0f1a;
  --bg-card: #141422;
  --bg-hover: #1a1a2e;
  --border: rgba(108, 82, 255, 0.18);
  --border-glow: rgba(108, 82, 255, 0.45);
  --accent: #6c52ff;
  --accent-2: #c084fc;
  --accent-3: #38bdf8;
  --accent-ok: #4ade80;
  --accent-warn: #facc15;
  --text-primary: #f0eeff;
  --text-secondary: #9b8fd4;
  --text-muted: #5a5380;
  --font-display: 'Syne', sans-serif;
  --font-body: 'Inter', sans-serif;
  --font-mono: 'JetBrains Mono', monospace;
  --radius: 12px;
  --glow: 0 0 40px rgba(108, 82, 255, 0.15);
  --glow-lg: 0 0 80px rgba(108, 82, 255, 0.2);
}

/* ── Base ── */
*, *::before, *::after { box-sizing: border-box; }

body, .gradio-container {
  background: var(--bg-deep) !important;
  color: var(--text-primary) !important;
  font-family: var(--font-body) !important;
}

.gradio-container {
  max-width: 1400px !important;
  margin: 0 auto !important;
  padding: 0 !important;
}

/* ── App Header ── */
.app-header {
  background: linear-gradient(135deg, #0d0d1f 0%, #110f2a 50%, #0d0d1f 100%);
  border-bottom: 1px solid var(--border);
  padding: 28px 40px 22px;
  position: relative;
  overflow: hidden;
}

.app-header::before {
  content: '';
  position: absolute;
  top: -60px;
  right: -60px;
  width: 300px;
  height: 300px;
  background: radial-gradient(circle, rgba(108,82,255,0.12) 0%, transparent 70%);
  pointer-events: none;
}

.app-header::after {
  content: '';
  position: absolute;
  bottom: -40px;
  left: 10%;
  width: 200px;
  height: 200px;
  background: radial-gradient(circle, rgba(192,132,252,0.08) 0%, transparent 70%);
  pointer-events: none;
}

.header-logo {
  font-family: var(--font-display);
  font-size: 32px;
  font-weight: 800;
  letter-spacing: -1px;
  background: linear-gradient(135deg, #a78bfa, #6c52ff, #38bdf8);
  -webkit-background-clip: text;
  -webkit-text-fill-color: transparent;
  background-clip: text;
  line-height: 1;
  margin-bottom: 4px;
}

.header-subtitle {
  font-family: var(--font-mono);
  font-size: 11px;
  color: var(--text-muted);
  letter-spacing: 2px;
  text-transform: uppercase;
}

.header-badge {
  display: inline-flex;
  align-items: center;
  gap: 6px;
  background: rgba(108,82,255,0.12);
  border: 1px solid var(--border);
  border-radius: 20px;
  padding: 4px 12px;
  font-family: var(--font-mono);
  font-size: 10px;
  color: var(--accent-2);
  margin-top: 8px;
}

/* ── Phase Progress Bar ── */
.phase-bar {
  display: flex;
  gap: 0;
  background: var(--bg-panel);
  border-bottom: 1px solid var(--border);
  padding: 0;
  overflow-x: auto;
}

.phase-item {
  flex: 1;
  min-width: 100px;
  display: flex;
  flex-direction: column;
  align-items: center;
  padding: 10px 8px;
  border-right: 1px solid var(--border);
  transition: background 0.2s;
  position: relative;
}

.phase-item:last-child { border-right: none; }

.phase-num {
  font-family: var(--font-display);
  font-size: 18px;
  font-weight: 700;
  color: var(--text-muted);
  line-height: 1;
}

.phase-name {
  font-family: var(--font-mono);
  font-size: 10px;
  color: var(--text-muted);
  margin-top: 3px;
  text-align: center;
  letter-spacing: 0.5px;
}

.phase-done { background: rgba(108,82,255,0.08); }
.phase-done .phase-num { color: var(--accent-2); }
.phase-done .phase-name { color: var(--accent-ok); }

.phase-done::after {
  content: '';
  position: absolute;
  bottom: 0;
  left: 0;
  right: 0;
  height: 2px;
  background: linear-gradient(90deg, var(--accent), var(--accent-2));
}

/* ── Section containers ── */
.section-block {
  background: var(--bg-panel);
  border: 1px solid var(--border);
  border-radius: var(--radius);
  margin: 16px;
  overflow: hidden;
  box-shadow: var(--glow);
}

.section-header {
  display: flex;
  align-items: center;
  gap: 12px;
  padding: 14px 20px;
  background: linear-gradient(90deg, rgba(108,82,255,0.08), transparent);
  border-bottom: 1px solid var(--border);
}

.section-num {
  width: 28px;
  height: 28px;
  background: linear-gradient(135deg, var(--accent), var(--accent-2));
  border-radius: 8px;
  display: flex;
  align-items: center;
  justify-content: center;
  font-family: var(--font-display);
  font-size: 13px;
  font-weight: 700;
  color: white;
  flex-shrink: 0;
}

.section-title {
  font-family: var(--font-display);
  font-size: 15px;
  font-weight: 600;
  color: var(--text-primary);
  letter-spacing: 0.3px;
}

.section-desc {
  font-family: var(--font-body);
  font-size: 12px;
  color: var(--text-muted);
  margin-left: auto;
}

/* ── File upload ── */
.upload-zone {
  border: 2px dashed var(--border) !important;
  border-radius: var(--radius) !important;
  background: rgba(108,82,255,0.03) !important;
  transition: all 0.3s !important;
  margin: 16px !important;
}

.upload-zone:hover {
  border-color: var(--border-glow) !important;
  background: rgba(108,82,255,0.07) !important;
}

/* ── Chat ── */
.chat-wrap { padding: 0 16px 16px; }

.gradio-chatbot {
  background: var(--bg-card) !important;
  border: 1px solid var(--border) !important;
  border-radius: var(--radius) !important;
  font-family: var(--font-body) !important;
  font-size: 14px !important;
  min-height: 380px !important;
}

.gradio-chatbot .message.user {
  background: linear-gradient(135deg, rgba(108,82,255,0.25), rgba(192,132,252,0.15)) !important;
  border: 1px solid rgba(108,82,255,0.3) !important;
  border-radius: 12px 12px 2px 12px !important;
  color: var(--text-primary) !important;
  font-family: var(--font-body) !important;
}

.gradio-chatbot .message.bot {
  background: var(--bg-hover) !important;
  border: 1px solid var(--border) !important;
  border-radius: 12px 12px 12px 2px !important;
  color: var(--text-primary) !important;
  font-family: var(--font-body) !important;
}

/* Textbox */
.msg-input textarea {
  background: var(--bg-card) !important;
  border: 1px solid var(--border) !important;
  border-radius: 10px !important;
  color: var(--text-primary) !important;
  font-family: var(--font-body) !important;
  font-size: 14px !important;
  padding: 12px 16px !important;
  transition: border-color 0.2s !important;
}

.msg-input textarea:focus {
  border-color: var(--border-glow) !important;
  box-shadow: 0 0 0 3px rgba(108,82,255,0.1) !important;
  outline: none !important;
}

.msg-input textarea::placeholder { color: var(--text-muted) !important; }

/* ── Buttons ── */
.btn-send, .btn-review, .btn-reset {
  border-radius: 10px !important;
  font-family: var(--font-display) !important;
  font-weight: 600 !important;
  font-size: 14px !important;
  letter-spacing: 0.3px !important;
  transition: all 0.2s !important;
  border: none !important;
  cursor: pointer !important;
}

.btn-send {
  background: linear-gradient(135deg, var(--accent), var(--accent-2)) !important;
  color: white !important;
  padding: 12px 24px !important;
  box-shadow: 0 4px 20px rgba(108,82,255,0.35) !important;
}

.btn-send:hover {
  transform: translateY(-1px) !important;
  box-shadow: 0 6px 28px rgba(108,82,255,0.5) !important;
}

.btn-review {
  background: linear-gradient(135deg, rgba(74,222,128,0.15), rgba(74,222,128,0.08)) !important;
  color: var(--accent-ok) !important;
  border: 1px solid rgba(74,222,128,0.3) !important;
  padding: 12px 24px !important;
}

.btn-review:hover {
  background: rgba(74,222,128,0.2) !important;
  border-color: rgba(74,222,128,0.5) !important;
  transform: translateY(-1px) !important;
}

.btn-reset {
  background: transparent !important;
  color: var(--text-muted) !important;
  border: 1px solid var(--border) !important;
  padding: 10px 18px !important;
  font-size: 12px !important;
}

.btn-reset:hover {
  color: var(--text-secondary) !important;
  border-color: var(--border-glow) !important;
}

/* ── Tabs ── */
.gradio-tabs .tab-nav {
  background: var(--bg-panel) !important;
  border-bottom: 1px solid var(--border) !important;
  padding: 0 8px !important;
  gap: 4px !important;
}

.gradio-tabs .tab-nav button {
  font-family: var(--font-display) !important;
  font-size: 13px !important;
  font-weight: 600 !important;
  color: var(--text-muted) !important;
  background: transparent !important;
  border: none !important;
  border-bottom: 2px solid transparent !important;
  padding: 12px 20px !important;
  border-radius: 0 !important;
  transition: all 0.2s !important;
}

.gradio-tabs .tab-nav button.selected {
  color: var(--accent-2) !important;
  border-bottom-color: var(--accent) !important;
}

.gradio-tabs .tab-nav button:hover:not(.selected) {
  color: var(--text-secondary) !important;
}

/* ── Dataframe / Table ── */
.gradio-dataframe {
  background: var(--bg-card) !important;
  border: 1px solid var(--border) !important;
  border-radius: var(--radius) !important;
  overflow: hidden !important;
}

.gradio-dataframe table {
  font-family: var(--font-mono) !important;
  font-size: 12px !important;
}

.gradio-dataframe thead th {
  background: linear-gradient(90deg, rgba(108,82,255,0.15), rgba(108,82,255,0.05)) !important;
  color: var(--accent-2) !important;
  font-family: var(--font-display) !important;
  font-size: 11px !important;
  font-weight: 700 !important;
  letter-spacing: 0.8px !important;
  text-transform: uppercase !important;
  padding: 10px 14px !important;
  border-bottom: 1px solid var(--border) !important;
}

.gradio-dataframe tbody tr {
  border-bottom: 1px solid rgba(108,82,255,0.07) !important;
  transition: background 0.15s !important;
}

.gradio-dataframe tbody tr:hover { background: var(--bg-hover) !important; }

.gradio-dataframe tbody td {
  color: var(--text-primary) !important;
  padding: 9px 14px !important;
}

/* ── Dropdown ── */
.gradio-dropdown {
  background: var(--bg-card) !important;
  border: 1px solid var(--border) !important;
  border-radius: 10px !important;
}

.gradio-dropdown select, .gradio-dropdown input {
  background: var(--bg-card) !important;
  color: var(--text-primary) !important;
  font-family: var(--font-body) !important;
  border: none !important;
}

/* ── HTML outputs ── */
.phase-bar-wrap { background: transparent; }

/* ── Download cards ── */
.download-grid {
  display: grid;
  grid-template-columns: 1fr 1fr;
  gap: 12px;
  padding: 16px;
}

.download-card {
  display: flex;
  align-items: center;
  gap: 12px;
  padding: 14px 16px;
  border-radius: 10px;
  border: 1px solid var(--border);
  background: var(--bg-card);
  transition: all 0.2s;
}

.download-card.available {
  border-color: rgba(74,222,128,0.2);
  background: rgba(74,222,128,0.04);
}

.download-card.available:hover {
  border-color: rgba(74,222,128,0.4);
  background: rgba(74,222,128,0.08);
  transform: translateY(-1px);
}

.download-card.pending { opacity: 0.5; }

.dl-icon { font-size: 20px; flex-shrink: 0; }

.dl-info { flex: 1; min-width: 0; }

.dl-label {
  font-family: var(--font-display);
  font-size: 13px;
  font-weight: 600;
  color: var(--text-primary);
  white-space: nowrap;
  overflow: hidden;
  text-overflow: ellipsis;
}

.dl-meta {
  font-family: var(--font-mono);
  font-size: 10px;
  color: var(--text-muted);
  margin-top: 2px;
}

.dl-btn {
  background: linear-gradient(135deg, rgba(74,222,128,0.2), rgba(74,222,128,0.1));
  color: var(--accent-ok);
  border: 1px solid rgba(74,222,128,0.3);
  border-radius: 7px;
  padding: 6px 14px;
  font-family: var(--font-display);
  font-size: 12px;
  font-weight: 600;
  text-decoration: none;
  white-space: nowrap;
  transition: all 0.2s;
  flex-shrink: 0;
}

.dl-btn:hover {
  background: rgba(74,222,128,0.3);
  border-color: rgba(74,222,128,0.5);
}

.dl-btn-disabled {
  color: var(--text-muted);
  font-family: var(--font-mono);
  font-size: 11px;
  flex-shrink: 0;
  padding: 6px 14px;
  border: 1px solid var(--border);
  border-radius: 7px;
}

/* ── Status indicators ── */
.status-dot {
  width: 7px;
  height: 7px;
  border-radius: 50%;
  background: var(--accent-ok);
  display: inline-block;
  box-shadow: 0 0 8px var(--accent-ok);
  animation: pulse-dot 2s ease-in-out infinite;
  margin-right: 6px;
}

@keyframes pulse-dot {
  0%, 100% { opacity: 1; transform: scale(1); }
  50% { opacity: 0.6; transform: scale(0.85); }
}

/* ── Scrollbars ── */
::-webkit-scrollbar { width: 5px; height: 5px; }
::-webkit-scrollbar-track { background: var(--bg-deep); }
::-webkit-scrollbar-thumb { background: var(--border); border-radius: 10px; }
::-webkit-scrollbar-thumb:hover { background: var(--border-glow); }

/* ── Responsive ── */
@media (max-width: 768px) {
  .download-grid { grid-template-columns: 1fr; }
  .app-header { padding: 20px; }
  .section-block { margin: 10px; }
}
"""

# ── Header HTML ────────────────────────────────────────────────────────────────
# NOTE(review): CUSTOM_CSS also styles a `.header-logo` element; no text for it
# is present here, so none is rendered — confirm whether a logo line belongs.
HEADER_HTML = """
<div class="app-header">
  <div class="header-subtitle">Thematic Engine for Mining & Identifying Scholarly Topics</div>
  <div class="header-badge">Braun & Clarke (2006) · BERTopic · PAJAIS Taxonomy · Mistral LLM</div>
  <div class="header-badge">EMBEDDING · all-MiniLM-L6-v2 · 384d</div>
  <div class="header-badge">CLUSTERING · AgglomerativeClustering · cosine</div>
  <div class="header-badge">THRESHOLD · 0.7 → ~100 topics</div>
  <div class="header-badge">TAXONOMY · PAJAIS · 25 categories</div>
</div>
"""


# ── Section header helper ──────────────────────────────────────────────────────
def section_header(num: str, title: str, desc: str = "") -> str:
    """Return the HTML header strip of a section.

    Args:
        num: Badge text shown before the title (e.g. "①").
        title: Section title.
        desc: Optional description; its element is omitted when empty.
    """
    desc_html = f'<div class="section-desc">{desc}</div>' if desc else ""
    return f"""
<div class="section-header">
  <div class="section-num">{num}</div>
  <div class="section-title">{title}</div>
  {desc_html}
</div>
"""


# ── Build Gradio app ───────────────────────────────────────────────────────────
with gr.Blocks(
    css=CUSTOM_CSS,
    title="THEMIS — BERTopic Thematic Analysis",
    theme=gr.themes.Base(
        primary_hue="violet",
        neutral_hue="slate",
        font=[gr.themes.GoogleFont("Inter"), "sans-serif"],
    ),
) as demo:

    # ── App Header ──────────────────────────────────────────────────────────────
    gr.HTML(HEADER_HTML)

    # ── Phase Progress Bar ──────────────────────────────────────────────────────
    with gr.Row():
        phase_bar = gr.HTML(render_phase_bar(), elem_classes=["phase-bar-wrap"])

    # ── Section ①: Data Input ──────────────────────────────────────────────────
    with gr.Group(elem_classes=["section-block"]):
        gr.HTML(section_header("①", "Data Input", "Upload your Scopus CSV export"))
        with gr.Row():
            with gr.Column(scale=3):
                file_input = gr.File(
                    label="Scopus CSV Export",
                    file_types=[".csv"],
                    elem_classes=["upload-zone"],
                )
            with gr.Column(scale=2):
                gr.HTML("""
<div>
  <div>Required CSV columns:</div>
  <div>· Authors</div>
  <div>· Title</div>
  <div>· Abstract</div>
  <div>· Author Keywords</div>
  <div>· Cited by</div>
  <div>· Source title</div>
  <div>· Year</div>
</div>
""")

    # ── Section ②: Agent Chat ──────────────────────────────────────────────────
    with gr.Group(elem_classes=["section-block"]):
        gr.HTML(section_header("②", "THEMIS Agent", "6-phase Braun & Clarke analysis pipeline"))
        with gr.Column(elem_classes=["chat-wrap"]):
            chatbot = gr.Chatbot(
                label="",
                height=420,
                show_label=False,
                elem_classes=["gradio-chatbot"],
            )
            with gr.Row():
                msg_input = gr.Textbox(
                    placeholder="Type a message... (e.g. 'run abstract', 'run title', or ask a question)",
                    show_label=False,
                    scale=5,
                    elem_classes=["msg-input"],
                    container=False,
                )
                send_btn = gr.Button("Send ↗", scale=1, elem_classes=["btn-send"])
            with gr.Row():
                reset_btn = gr.Button("⟳ Reset Session", elem_classes=["btn-reset"], scale=1)
                # NOTE(review): text-less HTML element beside the button;
                # presumably a layout spacer — confirm.
                gr.HTML('<div></div>')

    # ── Section ③: Results ─────────────────────────────────────────────────────
    with gr.Group(elem_classes=["section-block"]):
        gr.HTML(section_header("③", "Results", "Review table · Charts · Downloads"))
        with gr.Tabs():

            # Tab A: Review Table
            with gr.Tab("📋 Review Table"):
                review_table = gr.Dataframe(
                    value=load_review_table(),
                    interactive=True,
                    wrap=False,
                    height=400,
                    elem_classes=["gradio-dataframe"],
                    column_widths=["60px", "160px", "120px", "80px", "80px", "200px", "80px", "120px", "160px"],
                )
                with gr.Row():
                    refresh_table_btn = gr.Button("↻ Refresh Table", elem_classes=["btn-reset"], scale=1)
                    submit_review_btn = gr.Button("✓ Submit Review →", elem_classes=["btn-review"], scale=2)
                    # NOTE(review): text-less HTML element; presumably a
                    # layout spacer — confirm.
                    gr.HTML('<div></div>')

            # Tab B: Charts
            with gr.Tab("📊 Charts"):
                with gr.Row():
                    chart_dropdown = gr.Dropdown(
                        choices=get_chart_options(),
                        label="Select Chart",
                        value=None,
                        scale=2,
                    )
                    refresh_charts_btn = gr.Button("↻ Refresh", elem_classes=["btn-reset"], scale=1)
                chart_output = gr.HTML(
                    '<div>'
                    'Run analysis to generate interactive charts</div>'
                )

            # Tab C: Downloads
            with gr.Tab("⬇ Downloads"):
                download_html = gr.HTML(render_download_links())
                refresh_dl_btn = gr.Button("↻ Refresh Downloads", elem_classes=["btn-reset"])

    # ── Event Handlers ──────────────────────────────────────────────────────────

    # Send message
    def handle_send(message: str, history: list):
        """Forward a chat message to the agent and refresh table + phase bar."""
        history = history or []
        if not message or not message.strip():
            # Nothing to send: clear the input and just refresh the views.
            return history, "", load_review_table(), render_phase_bar()
        response = invoke_agent(message, history)
        history = history + [[message, response]]
        return history, "", load_review_table(), render_phase_bar()

    send_btn.click(
        fn=handle_send,
        inputs=[msg_input, chatbot],
        outputs=[chatbot, msg_input, review_table, phase_bar],
    )
    msg_input.submit(
        fn=handle_send,
        inputs=[msg_input, chatbot],
        outputs=[chatbot, msg_input, review_table, phase_bar],
    )

    # CSV upload — auto-trigger agent
    def handle_upload(file, history: list):
        """Ask the agent to analyze an uploaded CSV, then refresh the views."""
        history = history or []
        if file is None:
            return history, load_review_table(), render_phase_bar()
        response = invoke_agent(f"Analyze my Scopus CSV: {file.name}", history)
        history = history + [[f"📂 Uploaded CSV: {os.path.basename(file.name)}", response]]
        return history, load_review_table(), render_phase_bar()

    file_input.change(
        fn=handle_upload,
        inputs=[file_input, chatbot],
        outputs=[chatbot, review_table, phase_bar],
    )

    # Submit review
    def handle_submit_review(table_data, history):
        """Send the review table to the agent and reload table + phase bar."""
        new_history, _ = submit_review(table_data, history)
        return new_history, load_review_table(), render_phase_bar()

    submit_review_btn.click(
        fn=handle_submit_review,
        inputs=[review_table, chatbot],
        outputs=[chatbot, review_table, phase_bar],
    )

    # Refresh table
    refresh_table_btn.click(
        fn=lambda: (load_review_table(), render_phase_bar()),
        outputs=[review_table, phase_bar],
    )

    # Chart dropdown
    chart_dropdown.change(
        fn=load_chart,
        inputs=[chart_dropdown],
        outputs=[chart_output],
    )

    # Refresh charts
    def refresh_charts():
        """Re-scan the chart checkpoints and select the first choice."""
        opts = get_chart_options()
        return gr.update(choices=opts, value=opts[0] if opts else None)

    refresh_charts_btn.click(fn=refresh_charts, outputs=[chart_dropdown])

    # Downloads refresh
    refresh_dl_btn.click(fn=render_download_links, outputs=[download_html])

    # Reset
    def handle_reset():
        """Reset the agent session, clear the chat and redraw the phase bar."""
        reset_agent()
        return [], render_phase_bar()

    reset_btn.click(
        fn=handle_reset,
        outputs=[chatbot, phase_bar],
    )


# ── Launch ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )