"""
app.py - Gradio 6.x BERTopic / SPECTER2 Thematic Analysis Agent.
TWO MODES:
Classic (v1): BERTopic + Mistral-small, abstract run then title run separately.
SPECTER2 (v2): SPECTER2 embeddings + UMAP + HDBSCAN + council-of-3-LLMs,
one combined run on Title+Abstract per paper.
KEY DESIGN:
- Abstract run and title run use SEPARATE thread IDs in v1.
- v2 uses its own separate thread ID.
- Mode switch keeps existing data intact; user can switch freely.
"""
from __future__ import annotations
print("Step 1: imports starting...")
import json
import shutil
import uuid
from pathlib import Path
import gradio as gr
import pandas as pd
print("Step 2: gradio imported, version =", gr.__version__)
# ── v1 agent ──────────────────────────────────────────────────────────────────
# Load the Classic (v1) agent. Any failure is reported on stdout and replaced
# by inert stand-ins so the UI can still start and show an error in chat.
try:
    from agent import agent, clean_thread_history
    AGENT_V1_OK = True
    print("Step 3a: v1 agent imported OK")
except Exception as import_error:
    print("Step 3a FAILED:", import_error)
    agent = None
    AGENT_V1_OK = False

    def clean_thread_history(tid):
        """No-op stand-in used when the v1 agent module is unavailable."""
        return None
# ── v2 agent ──────────────────────────────────────────────────────────────────
# Load the SPECTER2 (v2) agent. Any failure is reported on stdout and replaced
# by inert stand-ins so the UI can still start and show an error in chat.
try:
    from agent_v2 import agent_v2, clean_thread_history_v2, reset_thread_v2
    AGENT_V2_OK = True
    print("Step 3b: v2 agent imported OK")
except Exception as import_error:
    print("Step 3b FAILED:", import_error)
    agent_v2 = None
    AGENT_V2_OK = False

    def clean_thread_history_v2(tid):
        """No-op stand-in used when the v2 agent module is unavailable."""
        return None

    def reset_thread_v2(tid):
        """No-op stand-in used when the v2 agent module is unavailable."""
        return None
# ── constants ──────────────────────────────────────────────────────────────────
# Root folder for every artefact the agents read or write; created on import.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

# Column headers of the editable Classic review table, in display order.
REVIEW_COLUMNS = [
    "#",
    "Topic Label",
    "Top Evidence",
    "Sentences",
    "Papers",
    "Approve",
    "Rename To",
    "Reasoning",
]

# Chart keys offered in the chart dropdown for Classic mode.
CHART_OPTIONS = [
    "bar",
    "histogram",
    "scatter",
    "treemap",
]

# Progress-bar step captions for Classic (v1) mode; list index = phase index.
PHASE_LABELS_V1 = [
    "Phase 1 — Familiarisation",
    "Phase 2 — Initial Codes",
    "Phase 3 — Themes",
    "Phase 4 — Saturation",
    "Phase 5 — Naming",
    "Phase 5.5 — PAJAIS",
    "Phase 6 — Report",
]

# Progress-bar step captions for SPECTER2 (v2) mode; list index = phase index.
PHASE_LABELS_V2 = [
    "Phase 1 — Load & Embed",
    "Phase 2 — UMAP+HDBSCAN",
    "Phase 3 — Council Labeling",
    "Phase 4 — PAJAIS Mapping",
    "Phase 5 — Final Outputs",
]
def new_thread_id() -> str:
    """Return a freshly generated random UUID (version 4) as a string."""
    identifier = uuid.uuid4()
    return str(identifier)
# ── helpers ────────────────────────────────────────────────────────────────────
def make_progress_html(current_phase: int, run_label: str = "", mode: str = "v1") -> str:
    """Render the phase progress widget as an HTML fragment.

    Args:
        current_phase: Zero-based index of the phase reached. Every step whose
            index is less than or equal to this value is drawn highlighted.
        run_label: Optional run name rendered as a badge after the heading.
        mode: "v1" selects the Classic labels and colour; any other value
            selects the SPECTER2 labels and colour.

    Returns:
        An HTML string with a heading, a percentage bar and one chip per phase.
    """
    # NOTE(review): the markup in the pasted source was stripped, leaving
    # unterminated string literals. The tags and inline styles below are a
    # reconstruction that uses every original format argument — confirm
    # against the intended design.
    labels = PHASE_LABELS_V1 if mode == "v1" else PHASE_LABELS_V2
    total = len(labels)
    # Clamp so an out-of-range phase index cannot yield an invalid CSS width.
    pct = max(0, min(100, int((current_phase / total) * 100)))
    color = "#4f46e5" if mode == "v1" else "#0891b2"

    step_template = (
        '<div style="flex:1;text-align:center;padding:4px 2px;margin:0 1px;'
        'border-radius:4px;font-size:11px;background:{bg};color:{fg};">{lbl}</div>'
    )
    steps = "".join(
        step_template.format(
            bg=color if i <= current_phase else "#e5e7eb",
            fg="#fff" if i <= current_phase else "#6b7280",
            lbl=label,
        )
        for i, label in enumerate(labels)
    )
    badge = (
        ' <span style="padding:2px 8px;border-radius:10px;font-size:11px;'
        'background:#111827;color:#fff;">{}</span>'.format(run_label)
        if run_label else ""
    )
    return (
        '<div style="padding:10px;border:1px solid #e5e7eb;border-radius:8px;">'
        '<div style="font-weight:600;margin-bottom:6px;">'
        'Progress{badge}'
        '</div>'
        '<div style="height:8px;border-radius:4px;background:#e5e7eb;margin-bottom:8px;">'
        '<div style="height:8px;border-radius:4px;width:{pct}%;background:{color};"></div>'
        '</div>'
        '<div style="display:flex;">'
        '{steps}'
        '</div>'
        '</div>'
    ).format(badge=badge, pct=pct, steps=steps, color=color)
def _run_status_html(mode: str = "v1") -> str:
    """Render completion badges for the runs that belong to ``mode``.

    A run counts as done when its ``taxonomy.json`` exists under ``DATA_DIR``.

    Args:
        mode: "v1" shows the Abstract / Title / V1 Outputs badges; "v2" shows
            the SPECTER2 badge. Any other value yields an empty container.

    Returns:
        An HTML string containing the badges.
    """
    # NOTE(review): the markup in the pasted source was stripped, leaving
    # unterminated string literals. The tags and styles below are a
    # reconstruction — confirm against the intended design.
    abs_done = (DATA_DIR / "abstract" / "taxonomy.json").exists()
    title_done = (DATA_DIR / "title" / "taxonomy.json").exists()
    both_done = abs_done and title_done
    v2_done = (DATA_DIR / "v2" / "taxonomy.json").exists()

    def badge(label, done, color_done="#22c55e"):
        # Grey while pending, ``color_done`` once the run has finished.
        return (
            '<span style="display:inline-block;margin:2px 6px 2px 0;padding:3px 10px;'
            'border-radius:12px;font-size:12px;color:#fff;background:{};">{} {}</span>'
        ).format(
            color_done if done else "#9ca3af",
            "✅" if done else "⏳",
            label,
        )

    v1_badges = (
        badge("Abstract Run", abs_done)
        + badge("Title Run", title_done)
        + badge("V1 Outputs", both_done)
    )
    v2_badges = badge("SPECTER2 Run", v2_done, "#0891b2")

    sections = '<div style="padding:4px 0;">'
    if mode == "v1":
        sections += '<div>' + v1_badges + '</div>'
    if mode == "v2":
        sections += '<div>' + v2_badges + '</div>'
    return sections + '</div>'
def _safe_read_csv(path):
    """Read a CSV as UTF-8, re-reading it as Latin-1 if decoding fails."""
    try:
        frame = pd.read_csv(path, encoding="utf-8")
    except UnicodeDecodeError:
        # Second attempt with Latin-1, which accepts every byte value.
        frame = pd.read_csv(path, encoding="latin-1")
    return frame
def _summaries_path(run_config: str) -> Path:
    """Return the location of ``summaries.json`` for the given run folder."""
    return DATA_DIR.joinpath(run_config, "summaries.json")
def _charts_path(run_config: str) -> Path:
    """Return the location of ``charts.json`` for the given run folder."""
    return DATA_DIR.joinpath(run_config, "charts.json")
def _papers_path(run_config: str) -> Path:
    """Return the location of ``papers.csv`` for the given run folder."""
    return DATA_DIR.joinpath(run_config, "papers.csv")
def _v2_summaries_path() -> Path:
    """Return the location of the v2 ``summaries.json`` file."""
    return DATA_DIR.joinpath("v2", "summaries.json")
def _v2_charts_path() -> Path:
    """Return the location of the v2 ``charts.json`` file."""
    return DATA_DIR.joinpath("v2", "charts.json")
def _active_run_for_table() -> str:
    """Pick which Classic run ("abstract" or "title") the review table shows.

    A run is "in review" when it has ``summaries.json`` but no ``themes.json``.
    Priority: title in review, abstract in review, title with summaries, and
    finally "abstract" as the default.
    """
    abstract_summaries = _summaries_path("abstract").exists()
    title_summaries = _summaries_path("title").exists()
    abstract_themes = (DATA_DIR / "abstract" / "themes.json").exists()
    title_themes = (DATA_DIR / "title" / "themes.json").exists()

    if title_summaries and not title_themes:
        return "title"
    if abstract_summaries and not abstract_themes:
        return "abstract"
    if title_summaries:
        return "title"
    return "abstract"
def _count_papers_per_topic(run_config: str) -> dict:
    """Estimate how many distinct papers contribute to each topic of a run.

    Each topic's sentences are matched back to rows of the run's
    ``papers.csv`` using the first 80 characters of every "."-separated
    fragment of the paper text.

    Args:
        run_config: Run folder name under ``DATA_DIR`` (e.g. "abstract").

    Returns:
        Mapping of ``topic_id`` to paper count. Empty when no summaries exist;
        ``max(size // 4, 1)`` per topic when ``papers.csv`` is missing; 0 per
        topic when no text column can be found; otherwise at least 1.
    """
    sp = _summaries_path(run_config)
    pp = _papers_path(run_config)
    if not sp.exists():
        return {}
    summaries = json.loads(sp.read_text())
    if not pp.exists():
        return {s["topic_id"]: max(s.get("size", 0) // 4, 1) for s in summaries}
    papers_df = _safe_read_csv(pp)

    def first_column_containing(needles):
        # str() guards against non-string column labels.
        return next(
            (c for c in papers_df.columns
             if any(n in str(c).lower() for n in needles)),
            None,
        )

    # Prefer the column named after the run so an abstract run is not matched
    # against titles just because the title column comes first in the CSV;
    # fall back to the first abstract/title column as before.
    text_col = first_column_containing((str(run_config).lower(),))
    if text_col is None:
        text_col = first_column_containing(("abstract", "title"))
    if text_col is None:
        return {s["topic_id"]: 0 for s in summaries}

    sent_to_paper = {}
    for idx, text in enumerate(papers_df[text_col].fillna("")):
        for sent in str(text).split("."):
            key = sent.strip()[:80]
            if key:
                # Later rows overwrite earlier ones on a key collision.
                sent_to_paper[key] = idx

    def count_papers(s):
        keys = (sent.strip()[:80] for sent in s.get("sentences", []))
        ids = {sent_to_paper[k] for k in keys if k in sent_to_paper}
        # Never report zero papers for an existing topic.
        return max(len(ids), 1)

    return {s["topic_id"]: count_papers(s) for s in summaries}
def _build_review_table(run_config: str = "abstract") -> pd.DataFrame:
    """Build the editable Classic review table for one run.

    Returns an empty frame with ``REVIEW_COLUMNS`` when the run has no
    ``summaries.json`` or the file holds no topics; otherwise one row per
    topic with "Approve" unticked and "Rename To" blank.
    """
    summaries_file = _summaries_path(run_config)
    if not summaries_file.exists():
        return pd.DataFrame(columns=REVIEW_COLUMNS)
    topics = json.loads(summaries_file.read_text())
    if not topics:
        return pd.DataFrame(columns=REVIEW_COLUMNS)

    counts = _count_papers_per_topic(run_config)
    records = []
    for topic in topics:
        default_label = "Topic {}".format(topic.get("topic_id", ""))
        records.append([
            int(topic.get("topic_id", 0)),
            str(topic.get("label", default_label)),
            str(" | ".join(topic.get("top_evidence", [])[:2])),
            int(len(topic.get("sentences", []))),
            int(counts.get(int(topic.get("topic_id", 0)), 0)),
            False,
            "",
            str(topic.get("reasoning", "")),
        ])
    return pd.DataFrame(records, columns=REVIEW_COLUMNS)
def _build_v2_cluster_table() -> pd.DataFrame:
    """Build a read-only display table for v2 clusters.

    Returns:
        One row per entry in the v2 ``summaries.json``, or an empty frame with
        the same columns when that file does not exist.
    """
    cols = ["#", "Cluster Label", "Papers", "Vote Agreement",
            "LLM1 Vote", "LLM2 Vote", "LLM3 Vote", "Top 3 Titles"]
    sp = _v2_summaries_path()
    if not sp.exists():
        return pd.DataFrame(columns=cols)
    summaries = json.loads(sp.read_text())
    rows = [
        [
            int(s.get("cluster_id", 0)),
            str(s.get("label", "Cluster {}".format(s.get("cluster_id", "")))),
            int(s.get("paper_count", 0)),
            str(s.get("vote_agreement", "")),
            str(s.get("llm_vote_1_MISTRAL", "")),
            str(s.get("llm_vote_2_GEMINI", "")),
            str(s.get("llm_vote_3_GROQ", "")),
            # The column is "Top 3 Titles": show three, not two.
            str(" | ".join(s.get("top3_titles", [])[:3])),
        ]
        for s in summaries
    ]
    return pd.DataFrame(rows, columns=cols)
def _load_chart(chart_type: str, run_config: str, mode: str = "v1") -> str:
    """Return the stored HTML for one chart of the selected run.

    Args:
        chart_type: Requested chart key (e.g. "bar").
        run_config: Classic run folder; ignored when ``mode`` is "v2".
        mode: "v2" reads the v2 charts file, anything else the Classic one.

    Returns:
        The chart HTML; the first available chart when ``chart_type`` is not
        stored; or a short placeholder when nothing is available.
    """
    # NOTE(review): the placeholder markup was stripped in the pasted source,
    # leaving unterminated literals; the <p> wrappers are a reconstruction.
    cp = _v2_charts_path() if mode == "v2" else _charts_path(run_config)
    if not cp.exists():
        return '<p style="color:#6b7280;padding:12px;">Charts appear after clustering completes.</p>'
    charts = json.loads(cp.read_text())
    available = list(charts.keys())
    # v2 charts: scatter, bar; v1: bar, histogram, scatter, treemap
    key = chart_type if chart_type in charts else (available[0] if available else "bar")
    return charts.get(key, '<p style="color:#6b7280;padding:12px;">Chart not found.</p>')
def _get_download_files(mode: str = "v1"):
    """List the output files of ``mode`` that exist on disk.

    Returns a list of path strings, or ``None`` when none of them exist.
    """
    if mode == "v2":
        wanted = [
            DATA_DIR / "comparison_v2.csv",
            DATA_DIR / "v2" / "cluster_audit.csv",
            DATA_DIR / "v2" / "narrative_v2.txt",
            DATA_DIR / "v2" / "summaries.json",
            DATA_DIR / "v2" / "taxonomy.json",
        ]
    else:
        wanted = [
            DATA_DIR / "comparison.csv",
            DATA_DIR / "narrative.txt",
            DATA_DIR / "abstract" / "summaries.json",
            DATA_DIR / "abstract" / "themes.json",
            DATA_DIR / "abstract" / "taxonomy.json",
            DATA_DIR / "title" / "summaries.json",
            DATA_DIR / "title" / "themes.json",
            DATA_DIR / "title" / "taxonomy.json",
        ]
    found = [str(path) for path in wanted if path.exists()]
    if not found:
        return None
    return found
def handle_file_upload(file_path) -> str:
    """Copy an uploaded CSV to ``DATA_DIR/uploaded.csv`` and describe it.

    Args:
        file_path: Path of the uploaded file; falsy when the upload is cleared.

    Returns:
        "" when there is no file; otherwise a status line with the row count
        and up to eight column names, or a plain "saved" line if the copy
        cannot be parsed as CSV.
    """
    if not file_path:
        return ""
    dest = DATA_DIR / "uploaded.csv"
    # The data folder can be missing if it was removed while the app is running.
    dest.parent.mkdir(parents=True, exist_ok=True)
    src = Path(file_path).resolve()
    dst = dest.resolve()
    # Skip the copy when the upload already is the destination file.
    if src != dst:
        shutil.copy(str(src), str(dst))
    try:
        df = _safe_read_csv(dest)
        msg = "✅ CSV saved — {} rows, columns: {}. ".format(
            len(df), ", ".join(map(str, df.columns[:8]))
        )
    except Exception:
        # Best-effort preview only: the file is saved even if parsing fails.
        msg = "✅ CSV saved to {}. ".format(dest)
    return msg + "Select a mode below and type the run command."
def reset_all_data() -> tuple:
    """Delete all stored run data and return reset values for the UI.

    Side effects: calls ``reset_thread_v2("default")``, removes ``DATA_DIR``
    with everything in it, then recreates it empty.

    Returns:
        A 14-tuple (chat history, chat input, progress HTML, run-status HTML,
        v1 review table, v2 cluster table, chart HTML, downloads, abstract
        thread id, title thread id, v2 thread id, current run, table status
        HTML, file status), with three fresh thread ids.
    """
    # NOTE(review): the placeholder/status markup was stripped in the pasted
    # source, leaving unterminated literals; the wrappers are a reconstruction.
    try:
        reset_thread_v2("default")
    except Exception as exc:
        # Best effort: a failed thread reset must not block clearing the data.
        print("reset_thread_v2 failed:", exc)
    if DATA_DIR.exists():
        shutil.rmtree(str(DATA_DIR))
    # Recreated unconditionally so later writes always have a target folder.
    DATA_DIR.mkdir(exist_ok=True)
    empty_v1 = pd.DataFrame(columns=REVIEW_COLUMNS)
    empty_v2 = pd.DataFrame(columns=["#", "Cluster Label", "Papers",
                                     "Vote Agreement", "LLM1 Vote",
                                     "LLM2 Vote", "LLM3 Vote", "Top 3 Titles"])
    empty_chart = '<p style="color:#6b7280;padding:12px;">Charts appear after clustering.</p>'
    status_msg = (
        '<div style="padding:8px 12px;border-radius:6px;font-size:13px;'
        'background:#eff6ff;color:#1e40af;">'
        "🔄 All data cleared. Upload a new CSV and begin."
        "</div>"
    )
    return (
        [],                      # chatbot
        "",                      # chat input
        make_progress_html(0),   # progress
        _run_status_html("v1"),  # run status
        empty_v1,                # v1 review table
        empty_v2,                # v2 cluster table
        empty_chart,             # chart
        None,                    # downloads
        new_thread_id(),         # abstract thread
        new_thread_id(),         # title thread
        new_thread_id(),         # v2 thread
        "abstract",              # current_run (v1)
        status_msg,              # table_status
        "",                      # file_status
    )
def _detect_phase(text: str, mode: str = "v1") -> int:
    """Infer the zero-based phase index mentioned in an agent reply.

    Markers are tested in priority order and the first one found, ignoring
    case, decides the result.

    Args:
        text: Agent reply to scan.
        mode: "v1" uses the Classic markers; any other value the SPECTER2 ones.

    Returns:
        Index into the phase label list of the mode, or 0 when nothing matches.
    """
    # "phase 6" is tested before "phase 5.5" so a reply that mentions both
    # reports the later phase, as the descending order of the other numeric
    # markers already does. "phase 5.5" stays ahead of "phase 5", which is a
    # substring of it.
    markers_v1 = (
        ("phase 6", 6), ("phase 5.5", 5), ("phase 5", 4),
        ("phase 4", 3), ("phase 3", 2), ("phase 2", 1), ("phase 1", 0),
    )
    markers_v2 = (
        ("phase 5", 4), ("phase 4", 3), ("phase 3", 2), ("phase 2", 1), ("phase 1", 0),
        # Keyword fallbacks, consulted only when no "phase N" marker is present.
        ("specter2 run complete", 4),
        ("final outputs", 4),
        ("pajais mapping", 3),
        ("council", 2),
        ("hdbscan", 1),
    )
    lower = text.lower()
    markers = markers_v1 if mode == "v1" else markers_v2
    return next((phase for key, phase in markers if key in lower), 0)
def _detect_run_label(text: str) -> str:
    """Return the run badge implied by an agent reply, or "" if none applies.

    Title markers are checked first, then abstract, then SPECTER2/v2.
    """
    haystack = text.lower()
    if "title run" in haystack or "title phase" in haystack:
        return "TITLE RUN"
    if "abstract run" in haystack or "abstract phase" in haystack:
        return "ABSTRACT RUN"
    if "specter" in haystack or "v2" in haystack:
        return "SPECTER2 RUN"
    return ""
def _stream_agent(user_message: str, thread_id: str, mode: str = "v1") -> str:
    """Send one user message to the selected agent and return its final reply.

    Args:
        user_message: Text sent as the user turn.
        thread_id: Conversation thread passed in the agent's ``configurable``.
        mode: "v1" uses the Classic agent; any other value the SPECTER2 agent.

    Returns:
        The last non-empty message content streamed by the agent,
        "(no response)" when nothing was produced, or an "ERROR: ..." line
        when the agent for the mode failed to import.
    """
    import time

    agent_obj = agent if mode == "v1" else agent_v2
    clean_fn = clean_thread_history if mode == "v1" else clean_thread_history_v2
    agent_ok = AGENT_V1_OK if mode == "v1" else AGENT_V2_OK
    if not agent_ok:
        return "ERROR: {} agent not loaded. Check terminal.".format(
            "Classic" if mode == "v1" else "SPECTER2"
        )

    def _do_stream() -> str:
        # The thread history is cleaned before every attempt.
        clean_fn(thread_id)
        config = {"configurable": {"thread_id": thread_id}}
        full_reply = ""
        for chunk in agent_obj.stream(
            {"messages": [{"role": "user", "content": user_message}]},
            config=config,
            stream_mode="values",
        ):
            last_msg = chunk["messages"][-1]
            content = getattr(last_msg, "content", "")
            if isinstance(content, list):
                # Multi-part content: join the text of every part.
                content = " ".join(
                    c.get("text", "") if isinstance(c, dict) else str(c)
                    for c in content
                )
            if content:
                full_reply = content
        return full_reply or "(no response)"

    result = _do_stream()
    lowered = result.lower()
    is_rate_limited = (
        "429" in result
        or "rate limit" in lowered
        or "rate_limited" in lowered
    )
    if not is_rate_limited:
        return result
    # Pause briefly before the single retry; retrying at once would most
    # likely hit the same limit again.
    # NOTE(review): a bare "429" anywhere in the reply also triggers this retry.
    time.sleep(2)
    return _do_stream()
def _generate_final_v1_directly(history: list) -> str:
    """Generate the Classic final outputs without going through the agent.

    Invokes the ``generate_comparison_csv`` and ``export_narrative`` tools and
    summarises what they report.

    Args:
        history: Chat history; accepted for interface compatibility, not used.

    Returns:
        A status sentence with the row count, up to five column names and the
        narrative word count ("?" where a tool did not report a value).
    """
    from tools import generate_comparison_csv, export_narrative

    def _parse_info(raw) -> dict:
        # Tool results are used only when they are a JSON object; anything
        # else, including malformed JSON, yields an empty dict.
        text = raw.strip() if isinstance(raw, str) else ""
        if not text.startswith("{"):
            return {}
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            return {}
        return parsed if isinstance(parsed, dict) else {}

    csv_info = _parse_info(generate_comparison_csv.invoke({}))
    narr_info = _parse_info(export_narrative.invoke({}))
    rows = csv_info.get("rows", "?")
    columns = list(csv_info.get("columns", []))
    # Add the ellipsis only when columns were actually left out.
    col_names = ", ".join(map(str, columns[:5])) + ("..." if len(columns) > 5 else "")
    wc = narr_info.get("word_count", "?")
    return (
        "Both runs complete. Final outputs generated. "
        "comparison.csv has {} rows with columns: {}. "
        "narrative.txt has {} words. "
        "Both files are in the Download tab."
    ).format(rows, col_names, wc)
def run_agent(
    user_message: str,
    history: list,
    abstract_thread: str,
    title_thread: str,
    v2_thread: str,
    current_run: str,
    current_mode: str,
) -> tuple:
    """Handle one chat turn and return refreshed values for every output.

    A blank message only refreshes the widgets. Otherwise the message goes to
    the agent of the current mode, on the thread of the active run, except
    that in v1 mode, once both taxonomies exist and ``comparison.csv`` does
    not, a message asking for final outputs is answered by generating them
    directly.

    Returns:
        12-tuple: history, cleared input, progress HTML, run-status HTML,
        review table, v2 cluster table, chart HTML, downloads, the three
        thread ids unchanged, and the active run.
    """
    mode = current_mode or "v1"

    # Blank input: refresh the UI without contacting any agent.
    if not user_message or not user_message.strip():
        table_run = _active_run_for_table()
        return (
            history or [], "",
            make_progress_html(0, mode=mode),
            _run_status_html(mode),
            _build_review_table(table_run),
            _build_v2_cluster_table(),
            _load_chart("bar", table_run, mode),
            _get_download_files(mode),
            abstract_thread, title_thread, v2_thread, current_run,
        )

    normalized = user_message.lower().strip()

    # An explicit "run title" / "run abstract" command switches the active run.
    if "run title" in normalized:
        active_run = "title"
    elif "run abstract" in normalized:
        active_run = "abstract"
    else:
        active_run = current_run

    # v1 shortcut for final outputs
    abstract_finished = (DATA_DIR / "abstract" / "taxonomy.json").exists()
    title_finished = (DATA_DIR / "title" / "taxonomy.json").exists()
    both_finished = abstract_finished and title_finished
    trigger_words = ("yes", "generate", "final", "comparison", "narrative", "output")
    wants_final = any(word in normalized for word in trigger_words)

    history = list(history or [])
    history.append({"role": "user", "content": user_message})

    use_shortcut = (
        mode == "v1" and both_finished and wants_final
        and not (DATA_DIR / "comparison.csv").exists()
    )
    if use_shortcut:
        reply = _generate_final_v1_directly(history)
    else:
        if mode == "v2":
            target_thread = v2_thread
        elif active_run == "title":
            target_thread = title_thread
        else:
            target_thread = abstract_thread
        reply = _stream_agent(user_message, target_thread, mode=mode)

    history.append({"role": "assistant", "content": reply})

    # Chosen after the reply, because the agent may have written new files.
    table_run = _active_run_for_table()
    return (
        history, "",
        make_progress_html(_detect_phase(reply, mode), _detect_run_label(reply), mode),
        _run_status_html(mode),
        _build_review_table(table_run),
        _build_v2_cluster_table(),
        _load_chart("bar", table_run, mode),
        _get_download_files(mode),
        abstract_thread, title_thread, v2_thread, active_run,
    )
def handle_submit_review(
    review_data,
    history: list,
    abstract_thread: str,
    title_thread: str,
    v2_thread: str,
    current_run: str,
    current_mode: str,
) -> tuple:
    """Turn the approved rows of the review table into an agent instruction.

    Approved rows are grouped by theme name ("Rename To" when filled, else the
    topic label) and sent to the agent as JSON with a request to call
    ``consolidate_into_themes``.

    Args:
        review_data: Table content as a dict with "data"/"headers", a
            DataFrame, or a list of rows.
        history: Current chat history.
        abstract_thread, title_thread, v2_thread: Thread ids, passed through.
        current_run: Classic run under review ("abstract" or "title").
        current_mode: Active UI mode.

    Returns:
        The same 12-tuple as ``run_agent``.
    """
    def _send(message: str, mode: str) -> tuple:
        return run_agent(
            message, history, abstract_thread, title_thread, v2_thread,
            current_run, mode,
        )

    if review_data is None:
        return _send("Review table empty — waiting for Phase 2.", current_mode)

    # Normalise the three payload shapes handled here into a DataFrame.
    if isinstance(review_data, dict):
        df = pd.DataFrame(
            review_data.get("data", []),
            columns=review_data.get("headers", REVIEW_COLUMNS),
        )
    elif isinstance(review_data, pd.DataFrame):
        df = review_data.copy()
    else:
        df = pd.DataFrame(review_data, columns=REVIEW_COLUMNS)

    if df.empty:
        return _send("Review table empty — waiting for Phase 2.", current_mode)

    df.columns = pd.Index([str(c) for c in df.columns])
    approve_col = next((c for c in df.columns if "approve" in c.lower()), None)
    id_col = next((c for c in df.columns if c.strip() == "#"), df.columns[0])
    # Fall back to the second column, or the first for a one-column table.
    fallback_label = df.columns[1] if len(df.columns) > 1 else df.columns[0]
    label_col = next((c for c in df.columns if "label" in c.lower()), fallback_label)
    rename_col = next((c for c in df.columns if "rename" in c.lower()), None)
    if approve_col is None:
        return _send("Cannot find Approve column in table.", current_mode)

    def to_bool(v):
        # Checkbox values may arrive as real booleans or as text.
        return v is True or str(v).strip().lower() in ("true", "1", "yes", "x", "on", "✓")

    approved_mask = pd.Series([to_bool(v) for v in df[approve_col]], index=df.index)
    approved_df = df[approved_mask]

    if len(approved_df) == 0:
        guide = (
            "⚠️ **No topics approved yet.**\n\n"
            "**To approve topics:**\n"
            "1. Click **🔄 Refresh Table** to load latest topics\n"
            "2. Click the checkbox ☐ in **Approve** column\n"
            "3. Fill **Rename To** with a theme name\n"
            "4. Click **✅ Submit Review** again"
        )
        updated_history = list(history or [])
        updated_history.append({"role": "assistant", "content": guide})
        cfg = _active_run_for_table()
        return (
            updated_history, "",
            make_progress_html(1),
            _run_status_html("v1"),
            _build_review_table(cfg),
            _build_v2_cluster_table(),
            _load_chart("bar", cfg, "v1"),
            _get_download_files("v1"),
            abstract_thread, title_thread, v2_thread, current_run,
        )

    theme_map: dict = {}
    for idx in range(len(approved_df)):
        row = approved_df.iloc[idx]
        rename_val = str(row[rename_col]).strip() if rename_col else ""
        theme = (
            rename_val
            if rename_val and rename_val.lower() not in ("", "nan", "none")
            else str(row[label_col])
        )
        try:
            # Ids may come back from the table as floats or numeric strings.
            tid = int(float(str(row[id_col])))
        except (ValueError, TypeError):
            tid = idx
        theme_map.setdefault(theme, []).append(tid)

    groups = [{"theme_name": k, "topic_ids": v} for k, v in theme_map.items()]
    msg = (
        "Researcher submitted the Review Table for the {} run.\n"
        "{} topics approved, {} themes:\n\n"
        "```json\n{}\n```\n\n"
        "Call consolidate_into_themes with run_config='{}' "
        "and the approved_groups JSON above. Then proceed to Phase 3."
    ).format(
        current_run, len(approved_df), len(groups),
        json.dumps(groups, indent=2), current_run,
    )
    # The review table belongs to the Classic workflow, so the submission is
    # always routed to the v1 agent (on the abstract/title thread chosen by
    # ``current_run``), even while the UI is in SPECTER2 mode.
    return _send(msg, "v1")
def switch_mode(new_mode: str, current_mode: str, abstract_thread: str, title_thread: str, v2_thread: str, current_run: str) -> tuple:
    """Switch between Classic and SPECTER2 modes, refreshing UI accordingly.

    Only ``new_mode`` is used; the other parameters are accepted to match the
    click-handler wiring. No stored data is modified.

    Returns:
        9-tuple: progress HTML, run-status HTML, review table, v2 cluster
        table, chart HTML, downloads, mode description markdown, chart
        dropdown update, and the new mode.
    """
    cfg = _active_run_for_table()
    mode_label_text = (
        "### 🔬 Classic Mode (BERTopic)\n"
        "Run abstract analysis, then title analysis. 6 Braun & Clarke phases each.\n"
        "Commands: **run abstract** → review → **run title** → review → download"
        if new_mode == "v1" else
        "### 🧬 SPECTER2 Mode (Advanced)\n"
        "One combined run per paper (Title+Abstract). UMAP+HDBSCAN clustering. "
        "Council-of-3 LLM labeling with audit trail.\n"
        "Command: **run specter** or **run v2**"
    )
    chart_opts = CHART_OPTIONS if new_mode == "v1" else ["scatter", "bar"]
    # Load the chart the dropdown is being set to, so the selector and the
    # display agree ("bar" in Classic mode, "scatter" in SPECTER2 mode).
    selected_chart = chart_opts[0]
    return (
        make_progress_html(0, mode=new_mode),
        _run_status_html(new_mode),
        _build_review_table(cfg),
        _build_v2_cluster_table(),
        _load_chart(selected_chart, cfg, new_mode),
        _get_download_files(new_mode),
        mode_label_text,
        gr.update(choices=chart_opts, value=selected_chart),
        new_mode,
    )
def manual_refresh_table(current_run: str, current_mode: str) -> tuple:
    """Rebuild both result tables from disk.

    Neither argument is used; they are accepted to match the button wiring.
    Returns ``(review_table, v2_cluster_table)``.
    """
    active_run = _active_run_for_table()
    review_frame = _build_review_table(active_run)
    cluster_frame = _build_v2_cluster_table()
    return review_frame, cluster_frame
def refresh_chart(chart_type: str, current_run: str, current_mode: str) -> str:
    """Return chart HTML for the selected chart type in the current mode.

    ``current_run`` is not used; the run is derived from the files on disk.
    An empty ``current_mode`` is treated as "v1".
    """
    active_run = _active_run_for_table()
    effective_mode = current_mode if current_mode else "v1"
    return _load_chart(chart_type, active_run, effective_mode)
def check_status(current_mode: str) -> str:
    """Summarise the topics or clusters currently stored on disk as HTML.

    Args:
        current_mode: "v2" reports on the v2 ``summaries.json``; any other
            value (or empty) reports on the active Classic run.

    Returns:
        An HTML status box: a hint when no summaries exist yet, otherwise the
        number of entries and how many of them carry a label.
    """
    # NOTE(review): the wrapper markup was stripped in the pasted source,
    # leaving unterminated literals; the <div>/<b> tags are a reconstruction.
    def _box(body: str, background: str, foreground: str) -> str:
        return (
            '<div style="padding:8px 12px;border-radius:6px;font-size:13px;'
            'background:' + background + ';color:' + foreground + ';">'
            + body + "</div>"
        )

    def _label_of(entry) -> str:
        # A missing or null label counts as unlabelled.
        return str(entry.get("label") or "").strip()

    mode = current_mode or "v1"
    if mode == "v2":
        sp = _v2_summaries_path()
        if not sp.exists():
            return _box(
                "⏳ No v2 clusters yet. Type <b>run specter</b> to begin.",
                "#fef3c7", "#92400e",
            )
        summaries = json.loads(sp.read_text())
        labeled = sum(1 for s in summaries if _label_of(s))
        return _box(
            "✅ {} clusters in data/v2/ ({} labeled). "
            "Click 🔄 Refresh to display.".format(len(summaries), labeled),
            "#ecfdf5", "#065f46",
        )

    cfg = _active_run_for_table()
    sp = _summaries_path(cfg)
    if not sp.exists():
        return _box(
            "⏳ No topics yet. Upload CSV then type <b>run abstract</b>.",
            "#fef3c7", "#92400e",
        )
    summaries = json.loads(sp.read_text())
    # Labels still in the default "Topic N" form do not count as LLM-labelled.
    labeled = sum(
        1 for s in summaries
        if _label_of(s) and not str(s.get("label") or "").startswith("Topic ")
    )
    return _box(
        "✅ {} topics from data/{}/ ({} LLM-labelled). "
        "Click 🔄 Refresh Table.".format(len(summaries), cfg, labeled),
        "#ecfdf5", "#065f46",
    )
print("Step 4: building UI...")
# ── UI ─────────────────────────────────────────────────────────────────────────
# NOTE(review): indentation and HTML tags were lost in the pasted source. The
# component nesting below follows the only arrangement that parses sensibly,
# and the three broken HTML literals (startup banner, table status, chart
# placeholder) are reconstructed — confirm layout and markup against the design.
with gr.Blocks(
    title="BERTopic / SPECTER2 Thematic Analysis Agent",
    css="""
.mode-btn-active { border: 2px solid #4f46e5 !important; background: #eef2ff !important; }
.mode-btn-v2-active { border: 2px solid #0891b2 !important; background: #ecfeff !important; }
"""
) as demo:
    # Per-session state: one conversation thread per run type, plus the
    # active Classic run and the active mode.
    abstract_thread_state = gr.State(new_thread_id())
    title_thread_state = gr.State(new_thread_id())
    v2_thread_state = gr.State(new_thread_id())
    current_run_state = gr.State("abstract")
    current_mode_state = gr.State("v1")
    gr.Markdown(
        "# 🔬 Thematic Analysis Agent\n"
        "**Braun & Clarke (2006)** · SPECTER2 · PAJAIS Taxonomy · Systematic Literature Review"
    )
    progress_bar = gr.HTML(make_progress_html(0))
    run_status = gr.HTML(_run_status_html("v1"))
    # ── MODE SELECTOR ──────────────────────────────────────────────────────────
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 🔀 Analysis Mode")
            with gr.Row():
                btn_v1 = gr.Button(
                    "📊 Classic (BERTopic)\nAbstract + Title runs",
                    variant="primary", size="sm",
                )
                btn_v2 = gr.Button(
                    "🧬 SPECTER2 (Advanced)\nCombined T+A · HDBSCAN · Council-3-LLMs",
                    variant="secondary", size="sm",
                )
            mode_description = gr.Markdown(
                "### 📊 Classic Mode (BERTopic)\n"
                "Run abstract analysis, then title analysis. 6 Braun & Clarke phases each.\n"
                "Commands: **run abstract** → review → **run title** → review → download"
            )
    gr.HTML("""
Classic:
1️⃣ Upload CSV → 2️⃣ run abstract →
3️⃣ Review Table → 4️⃣ run title → 5️⃣ Download
|
SPECTER2:
1️⃣ Upload CSV → 2️⃣ run specter → 3️⃣ Download
""")
    # ── Section 1 ─────────────────────────────────────────────────────────────
    with gr.Accordion("📂 Section 1 — Data Input", open=True):
        def _startup_msg():
            """Return a banner saying whether data from an earlier session exists."""
            abs_done = (DATA_DIR / "abstract" / "taxonomy.json").exists()
            title_done = (DATA_DIR / "title" / "taxonomy.json").exists()
            v2_done = (DATA_DIR / "v2" / "taxonomy.json").exists()
            csv_exists = (DATA_DIR / "uploaded.csv").exists()
            has_data = csv_exists or abs_done or title_done or v2_done
            return (
                '<div style="padding:8px 12px;border-radius:6px;font-size:13px;'
                'background:#fef3c7;color:#92400e;">'
                "⚠️ Previous session data detected. "
                "Abstract: {abs} | Title: {title} | "
                "SPECTER2: {v2} | CSV: {csv}<br>"
                "Click 🗑️ Reset to clear or continue from where you left off."
                "</div>"
                if has_data else
                '<div style="padding:8px 12px;border-radius:6px;font-size:13px;'
                'background:#ecfdf5;color:#065f46;">'
                "✅ Fresh session — upload your CSV to begin."
                "</div>"
            ).format(
                abs="✅" if abs_done else "⏳",
                title="✅" if title_done else "⏳",
                v2="✅" if v2_done else "⏳",
                csv="✅" if csv_exists else "❌",
            )
        startup_banner = gr.HTML(_startup_msg())
        with gr.Row():
            file_input = gr.File(
                label="Upload Scopus CSV", file_types=[".csv"],
                type="filepath", scale=4,
            )
            reset_btn = gr.Button(
                "🗑️ Reset & Start Fresh",
                variant="stop", scale=1, size="sm",
            )
        file_status = gr.Textbox(label="Upload status", interactive=False, lines=2)
        file_input.change(fn=handle_file_upload, inputs=file_input, outputs=file_status)
    # ── Section 2 ─────────────────────────────────────────────────────────────
    with gr.Accordion("💬 Section 2 — Agent Conversation", open=True):
        gr.HTML("""
Classic: run abstract | run title | yes | satisfied | confirm
|
SPECTER2: run specter | run v2 | yes
""")
        chatbot = gr.Chatbot(label="Agent", height=500)
        with gr.Row():
            chat_input = gr.Textbox(
                label="Message",
                placeholder="e.g. run abstract or run specter",
                lines=2, scale=5,
            )
            send_btn = gr.Button("Send ➤", variant="primary", scale=1)
    # ── Section 3 ─────────────────────────────────────────────────────────────
    with gr.Accordion("📊 Section 3 — Results", open=True):
        with gr.Tabs():
            with gr.Tab("📋 Review Table (Classic)"):
                gr.HTML("""
After Phase 2 (Classic): Refresh → tick Approve → fill Rename To → Submit Review
""")
                table_status = gr.HTML(
                    '<div style="padding:8px 12px;border-radius:6px;font-size:13px;'
                    'background:#f3f4f6;color:#374151;">'
                    "Complete Phase 2 (Classic) then Refresh."
                    "</div>"
                )
                with gr.Row():
                    refresh_btn = gr.Button("🔄 Refresh Table", variant="secondary", scale=2)
                    check_status_btn = gr.Button("📊 Check Status", variant="secondary", scale=1)
                review_table = gr.Dataframe(
                    value=pd.DataFrame(columns=REVIEW_COLUMNS),
                    headers=REVIEW_COLUMNS,
                    datatype=["number", "str", "str", "number", "number", "bool", "str", "str"],
                    interactive=True, wrap=True,
                    label="Topic Review Table (Classic Mode)",
                )
                submit_review_btn = gr.Button("✅ Submit Review", variant="primary", size="lg")
            with gr.Tab("🧬 Cluster View (SPECTER2)"):
                gr.HTML("""
Clusters appear after Phase 3 (Council Labeling) completes. Read-only — no manual review needed.
Download the cluster_audit.csv for full LLM voting details.
""")
                with gr.Row():
                    refresh_v2_btn = gr.Button("🔄 Refresh Clusters", variant="secondary", scale=2)
                    check_v2_btn = gr.Button("📊 Check V2 Status", variant="secondary", scale=1)
                v2_cluster_table = gr.Dataframe(
                    value=pd.DataFrame(columns=["#", "Cluster Label", "Papers",
                                                "Vote Agreement", "LLM1 Vote",
                                                "LLM2 Vote", "LLM3 Vote", "Top 3 Titles"]),
                    headers=["#", "Cluster Label", "Papers", "Vote Agreement",
                             "LLM1 Vote", "LLM2 Vote", "LLM3 Vote", "Top 3 Titles"],
                    datatype=["number", "str", "number", "str", "str", "str", "str", "str"],
                    interactive=False, wrap=True,
                    label="SPECTER2 Cluster Table (Read-only)",
                )
            with gr.Tab("📈 Charts"):
                chart_selector = gr.Dropdown(
                    choices=CHART_OPTIONS, value="bar",
                    label="Select Chart", interactive=True,
                )
                chart_display = gr.HTML(
                    '<p style="color:#6b7280;padding:12px;">Charts appear after clustering.</p>'
                )
                chart_selector.change(
                    fn=refresh_chart,
                    inputs=[chart_selector, current_run_state, current_mode_state],
                    outputs=chart_display,
                )
            with gr.Tab("⬇️ Download"):
                gr.Markdown(
                    "**Classic outputs** appear after both abstract+title runs complete.\n\n"
                    "**SPECTER2 outputs** appear after v2 run completes:\n"
                    "- `comparison_v2.csv` — one row per paper with cluster + PAJAIS\n"
                    "- `cluster_audit.csv` — full LLM voting record, per paper\n"
                    "- `narrative_v2.txt` — 500-word Section 7 discussion\n"
                    "> 💡 **Cache:** `data/v2/llm_cache/` stores LLM responses — "
                    "delete this folder to force fresh labels on re-run.\n"
                )
                download_files = gr.File(
                    label="Output Files", file_count="multiple", interactive=False,
                )
    # ── wire up — combined outputs ─────────────────────────────────────────────
    # Each list must match, in order, the tuple returned by its handler.
    agent_outputs = [
        chatbot, chat_input, progress_bar, run_status,
        review_table, v2_cluster_table, chart_display, download_files,
        abstract_thread_state, title_thread_state, v2_thread_state, current_run_state,
    ]
    reset_outputs = [
        chatbot, chat_input, progress_bar, run_status,
        review_table, v2_cluster_table, chart_display, download_files,
        abstract_thread_state, title_thread_state, v2_thread_state, current_run_state,
        table_status, file_status,
    ]
    mode_switch_outputs = [
        progress_bar, run_status,
        review_table, v2_cluster_table,
        chart_display, download_files,
        mode_description, chart_selector,
        current_mode_state,
    ]
    send_btn.click(
        fn=run_agent,
        inputs=[chat_input, chatbot,
                abstract_thread_state, title_thread_state, v2_thread_state,
                current_run_state, current_mode_state],
        outputs=agent_outputs,
    )
    chat_input.submit(
        fn=run_agent,
        inputs=[chat_input, chatbot,
                abstract_thread_state, title_thread_state, v2_thread_state,
                current_run_state, current_mode_state],
        outputs=agent_outputs,
    )
    submit_review_btn.click(
        fn=handle_submit_review,
        inputs=[review_table, chatbot,
                abstract_thread_state, title_thread_state, v2_thread_state,
                current_run_state, current_mode_state],
        outputs=agent_outputs,
    )
    reset_btn.click(
        fn=reset_all_data,
        inputs=[],
        outputs=reset_outputs,
    )
    btn_v1.click(
        fn=lambda m, at, tt, vt, cr: switch_mode("v1", m, at, tt, vt, cr),
        inputs=[current_mode_state, abstract_thread_state, title_thread_state,
                v2_thread_state, current_run_state],
        outputs=mode_switch_outputs,
    )
    btn_v2.click(
        fn=lambda m, at, tt, vt, cr: switch_mode("v2", m, at, tt, vt, cr),
        inputs=[current_mode_state, abstract_thread_state, title_thread_state,
                v2_thread_state, current_run_state],
        outputs=mode_switch_outputs,
    )
    refresh_btn.click(
        fn=manual_refresh_table,
        inputs=[current_run_state, current_mode_state],
        outputs=[review_table, v2_cluster_table],
    )
    refresh_v2_btn.click(
        fn=manual_refresh_table,
        inputs=[current_run_state, current_mode_state],
        outputs=[review_table, v2_cluster_table],
    )
    check_status_btn.click(
        fn=check_status,
        inputs=[current_mode_state],
        outputs=[table_status],
    )
    # This handler takes no arguments, so it must be wired with no inputs;
    # passing the mode state made the click fail with a TypeError.
    check_v2_btn.click(
        fn=lambda: check_status("v2"),
        inputs=[],
        outputs=[table_status],
    )
print("Step 5: UI built OK, launching...")
if __name__ == "__main__":
    # Major/minor Gradio version, used to decide whether ``ssr_mode`` is passed.
    _v = tuple(map(int, gr.__version__.split(".")[:2]))
    print("Gradio version:", gr.__version__)
    _kwargs = dict(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        inbrowser=False,
    )
    if _v >= (5, 0):
        _kwargs.update(ssr_mode=False)
    print("Running at http://0.0.0.0:7860")
    import subprocess
    import sys
    # Start check_keys.py as a separate process; its result is not awaited.
    subprocess.Popen([sys.executable, "check_keys.py"])
    demo.launch(**_kwargs)