Spaces:

aadisawant2912
/

topic_modelling

Sleeping

App Files Files Community

topic_modelling / app.py

aadisawant2912

Update app.py

8c80ed1 verified 20 days ago

raw

history blame contribute delete

39.6 kB

	"""
	app.py - Gradio 6.x BERTopic / SPECTER2 Thematic Analysis Agent.

	TWO MODES:
	Classic (v1): BERTopic + Mistral-small, abstract run then title run separately.
	SPECTER2 (v2): SPECTER2 embeddings + UMAP + HDBSCAN + council-of-3-LLMs,
	one combined run on Title+Abstract per paper.

	KEY DESIGN:
	- Abstract run and title run use SEPARATE thread IDs in v1.
	- v2 uses its own separate thread ID.
	- Mode switch keeps existing data intact; user can switch freely.
	"""

	from __future__ import annotations

	print("Step 1: imports starting...")

	import json
	import shutil
	import uuid
	from pathlib import Path

	import gradio as gr
	import pandas as pd

	print("Step 2: gradio imported, version =", gr.__version__)

	# ── v1 agent ──────────────────────────────────────────────────────────────────
	try:
	    from agent import agent, clean_thread_history
	except Exception as e:
	    # The UI must still start when the Classic agent cannot be imported;
	    # the flag below lets callers report the failure instead of crashing.
	    print("Step 3a FAILED:", e)
	    agent = None
	    AGENT_V1_OK = False

	    def clean_thread_history(tid):
	        """Do nothing; placeholder used while the v1 agent is unavailable."""
	        return None
	else:
	    AGENT_V1_OK = True
	    print("Step 3a: v1 agent imported OK")

	# ── v2 agent ──────────────────────────────────────────────────────────────────
	try:
	    from agent_v2 import agent_v2, clean_thread_history_v2, reset_thread_v2
	except Exception as e:
	    # Same fallback as the v1 guard: keep the module importable and expose
	    # no-op helpers so callers do not need to special-case the failure.
	    print("Step 3b FAILED:", e)
	    agent_v2 = None
	    AGENT_V2_OK = False

	    def clean_thread_history_v2(tid):
	        """Do nothing; placeholder used while the v2 agent is unavailable."""
	        return None

	    def reset_thread_v2(tid):
	        """Do nothing; placeholder used while the v2 agent is unavailable."""
	        return None
	else:
	    AGENT_V2_OK = True
	    print("Step 3b: v2 agent imported OK")

	# ── constants ──────────────────────────────────────────────────────────────────
	# Root directory (relative to the working directory) for everything this
	# module reads or writes; it is created as a side effect of importing.
	DATA_DIR = Path("data")
	DATA_DIR.mkdir(exist_ok=True)

	# Column order of the Classic-mode review table. The table builder, the
	# Gradio Dataframe headers and the submit handler all rely on this order.
	REVIEW_COLUMNS = [
	    "#", "Topic Label", "Top Evidence",
	    "Sentences", "Papers", "Approve", "Rename To", "Reasoning",
	]
	# Chart names offered in the chart dropdown for Classic mode.
	CHART_OPTIONS = ["bar", "histogram", "scatter", "treemap"]
	# Progress-chip labels for Classic mode; make_progress_html highlights the
	# entries whose list index is <= the current phase index.
	PHASE_LABELS_V1 = [
	    "Phase 1 — Familiarisation", "Phase 2 — Initial Codes",
	    "Phase 3 — Themes", "Phase 4 — Saturation",
	    "Phase 5 — Naming", "Phase 5.5 — PAJAIS",
	    "Phase 6 — Report",
	]
	# Progress-chip labels for SPECTER2 mode, used the same way as the v1 list.
	PHASE_LABELS_V2 = [
	    "Phase 1 — Load & Embed",
	    "Phase 2 — UMAP+HDBSCAN",
	    "Phase 3 — Council Labeling",
	    "Phase 4 — PAJAIS Mapping",
	    "Phase 5 — Final Outputs",
	]


	def new_thread_id() -> str:
	    """Return a fresh random UUID4, rendered as its canonical string form."""
	    fresh_uuid = uuid.uuid4()
	    return str(fresh_uuid)


	# ── helpers ────────────────────────────────────────────────────────────────────
	def make_progress_html(current_phase: int, run_label: str = "", mode: str = "v1") -> str:
	    """Render the progress header (title, bar and one chip per phase) as HTML.

	    Args:
	        current_phase: Zero-based index of the phase reached; chips whose
	            index is at or below it are drawn in the accent colour.
	        run_label: Optional text shown as an amber badge next to the title.
	        mode: "v1" selects the Classic labels and colour; any other value
	            selects the SPECTER2 ones.

	    Returns:
	        The HTML fragment as a string.
	    """
	    if mode == "v1":
	        phase_names, accent = PHASE_LABELS_V1, "#4f46e5"
	    else:
	        phase_names, accent = PHASE_LABELS_V2, "#0891b2"
	    percent = int((current_phase / len(phase_names)) * 100)

	    chip_template = (
	        '<span style="padding:3px 8px;margin:2px;border-radius:10px;font-size:11px;'
	        'background:{bg};color:{fg};">{lbl}</span>'
	    )
	    chips = []
	    for position, name in enumerate(phase_names):
	        reached = position <= current_phase
	        chips.append(
	            chip_template.format(
	                bg=accent if reached else "#e5e7eb",
	                fg="#fff" if reached else "#6b7280",
	                lbl=name,
	            )
	        )

	    if run_label:
	        badge_html = (
	            ' <span style="background:#f59e0b;color:#fff;padding:2px 10px;'
	            'border-radius:10px;font-size:12px;">{}</span>'
	        ).format(run_label)
	    else:
	        badge_html = ""

	    page_template = (
	        '<div style="font-family:sans-serif;padding:8px 0;">'
	        '<div style="font-weight:600;color:#374151;margin-bottom:5px;">'
	        'Progress{badge}</div>'
	        '<div style="background:#e5e7eb;border-radius:6px;height:8px;margin-bottom:6px;">'
	        '<div style="background:{color};width:{pct}%;height:100%;border-radius:6px;"></div>'
	        '</div>'
	        '<div style="display:flex;flex-wrap:wrap;gap:2px;">{steps}</div>'
	        '</div>'
	    )
	    return page_template.format(
	        badge=badge_html, pct=percent, steps="".join(chips), color=accent
	    )


	def _run_status_html(mode: str = "v1") -> str:
	    """Build the row of done/pending badges for the given mode.

	    Completion is read from disk: a run counts as done when its
	    ``taxonomy.json`` exists under ``DATA_DIR``. Only "v1" and "v2" produce
	    badges; any other mode yields just the empty wrapper div.
	    """
	    def _pill(text, finished, done_colour="#22c55e"):
	        # Grey with an hourglass while pending, coloured with a tick once done.
	        template = (
	            '<span style="background:{};color:#fff;padding:3px 12px;'
	            'border-radius:10px;font-size:12px;margin:2px;">{} {}</span>'
	        )
	        return template.format(
	            done_colour if finished else "#9ca3af",
	            "✅" if finished else "⏳",
	            text,
	        )

	    abstract_ready = (DATA_DIR / "abstract" / "taxonomy.json").exists()
	    title_ready = (DATA_DIR / "title" / "taxonomy.json").exists()
	    v2_ready = (DATA_DIR / "v2" / "taxonomy.json").exists()

	    sections = []
	    if mode == "v1":
	        classic_pills = (
	            _pill("Abstract Run", abstract_ready)
	            + _pill("Title Run", title_ready)
	            + _pill("V1 Outputs", abstract_ready and title_ready)
	        )
	        sections.append(
	            '<div style="display:flex;flex-wrap:wrap;gap:4px;margin-bottom:4px;">'
	            + classic_pills + '</div>'
	        )
	    if mode == "v2":
	        sections.append(
	            '<div style="display:flex;flex-wrap:wrap;gap:4px;">'
	            + _pill("SPECTER2 Run", v2_ready, "#0891b2") + '</div>'
	        )
	    return '<div style="padding:6px 0;">' + "".join(sections) + '</div>'


	def _safe_read_csv(path):
	    """Read a CSV as UTF-8, retrying as Latin-1 when decoding fails."""
	    try:
	        frame = pd.read_csv(path, encoding="utf-8")
	    except UnicodeDecodeError:
	        # Latin-1 maps every byte to a character, so this retry cannot hit
	        # another decode error.
	        frame = pd.read_csv(path, encoding="latin-1")
	    return frame


	def _summaries_path(run_config: str) -> Path:
	    """Return the ``summaries.json`` path for the given Classic run."""
	    return DATA_DIR.joinpath(run_config, "summaries.json")

	def _charts_path(run_config: str) -> Path:
	    """Return the ``charts.json`` path for the given Classic run."""
	    return DATA_DIR.joinpath(run_config, "charts.json")

	def _papers_path(run_config: str) -> Path:
	    """Return the ``papers.csv`` path for the given Classic run."""
	    return DATA_DIR.joinpath(run_config, "papers.csv")

	def _v2_summaries_path() -> Path:
	    """Return the path of the SPECTER2 run's ``summaries.json``."""
	    return DATA_DIR.joinpath("v2", "summaries.json")

	def _v2_charts_path() -> Path:
	    """Return the path of the SPECTER2 run's ``charts.json``."""
	    return DATA_DIR.joinpath("v2", "charts.json")


	def _active_run_for_table() -> str:
	    """Pick which Classic run ("abstract" or "title") the review table shows.

	    A run is "in review" when it has ``summaries.json`` but no ``themes.json``
	    yet. Preference order: title in review, abstract in review, title with
	    summaries, and finally "abstract" as the default.
	    """
	    def _in_review(run_name):
	        has_summaries = _summaries_path(run_name).exists()
	        has_themes = (DATA_DIR / run_name / "themes.json").exists()
	        return has_summaries and not has_themes

	    if _in_review("title"):
	        return "title"
	    if _in_review("abstract"):
	        return "abstract"
	    if _summaries_path("title").exists():
	        return "title"
	    return "abstract"


	def _pick_text_column(columns, run_config: str):
	    """Choose the papers.csv column whose text the topic sentences came from.

	    Preference order: a column named exactly like ``run_config``
	    (case-insensitive), then one containing it, then the first column that
	    mentions "abstract" or "title". Returns None when nothing matches.
	    """
	    wanted = str(run_config).strip().lower()
	    candidates = list(columns)
	    lowered = [str(col).lower() for col in candidates]
	    for col, low in zip(candidates, lowered):
	        if wanted and low.strip() == wanted:
	            return col
	    for col, low in zip(candidates, lowered):
	        if wanted and wanted in low:
	            return col
	    for col, low in zip(candidates, lowered):
	        if "abstract" in low or "title" in low:
	            return col
	    return None


	def _count_papers_per_topic(run_config: str) -> dict:
	    """Estimate how many distinct papers contribute to each topic of a run.

	    Args:
	        run_config: Name of the run directory under ``DATA_DIR``
	            (the callers in this file pass "abstract" or "title").

	    Returns:
	        ``{topic_id: paper_count}``. Empty when the run has no
	        ``summaries.json``. Without ``papers.csv`` the count is approximated
	        as ``size // 4`` (minimum 1). When no usable text column exists every
	        topic maps to 0. Otherwise each topic maps to the number of distinct
	        rows matched by its sentences, never less than 1.
	    """
	    sp = _summaries_path(run_config)
	    if not sp.exists():
	        return {}
	    summaries = json.loads(sp.read_text())

	    pp = _papers_path(run_config)
	    if not pp.exists():
	        return {s["topic_id"]: max(s.get("size", 0) // 4, 1) for s in summaries}

	    papers_df = _safe_read_csv(pp)
	    # Prefer the column matching the run. Taking the first column that
	    # mentions "abstract" OR "title" would match abstract-run sentences
	    # against titles whenever the Title column comes first.
	    text_col = _pick_text_column(papers_df.columns, run_config)
	    if text_col is None:
	        return {s["topic_id"]: 0 for s in summaries}

	    # Index every sentence by its first 80 stripped characters; the same
	    # truncated key is used below to map a topic sentence back to a row.
	    sent_to_paper = {}
	    for row_idx, text in enumerate(papers_df[text_col].fillna("")):
	        for sent in str(text).split("."):
	            key = sent.strip()[:80]
	            if key:
	                sent_to_paper[key] = row_idx

	    def count_papers(summary):
	        keys = (sent.strip()[:80] for sent in summary.get("sentences", []))
	        matched_rows = {sent_to_paper[k] for k in keys if k in sent_to_paper}
	        return max(len(matched_rows), 1)

	    return {s["topic_id"]: count_papers(s) for s in summaries}


	def _build_review_table(run_config: str = "abstract") -> pd.DataFrame:
	    """Build the editable Classic-mode review table for one run.

	    Args:
	        run_config: Run directory name under ``DATA_DIR``.

	    Returns:
	        A DataFrame with ``REVIEW_COLUMNS``; it has no rows when the run has
	        no ``summaries.json`` or that file holds an empty list. "Approve"
	        starts unticked and "Rename To" starts blank.
	    """
	    sp = _summaries_path(run_config)
	    if not sp.exists():
	        return pd.DataFrame(columns=REVIEW_COLUMNS)
	    summaries = json.loads(sp.read_text())
	    if not summaries:
	        return pd.DataFrame(columns=REVIEW_COLUMNS)

	    paper_counts = _count_papers_per_topic(run_config)
	    rows = []
	    for summary in summaries:
	        topic_id = int(summary.get("topic_id", 0))
	        default_label = "Topic {}".format(summary.get("topic_id", ""))
	        rows.append([
	            topic_id,
	            str(summary.get("label", default_label)),
	            # Plain " | " separator: the former " \| " was an invalid escape
	            # sequence and put a literal backslash into the cell.
	            " | ".join(summary.get("top_evidence", [])[:2]),
	            len(summary.get("sentences", [])),
	            int(paper_counts.get(topic_id, 0)),
	            False,
	            "",
	            str(summary.get("reasoning", "")),
	        ])
	    return pd.DataFrame(rows, columns=REVIEW_COLUMNS)


	def _build_v2_cluster_table() -> pd.DataFrame:
	    """Build the read-only display table for the SPECTER2 (v2) clusters.

	    Returns:
	        One row per entry of the v2 ``summaries.json``: id, label, paper
	        count, vote agreement, the three LLM votes and up to three
	        representative titles. Has no rows when the file does not exist.
	    """
	    cols = ["#", "Cluster Label", "Papers", "Vote Agreement",
	            "LLM1 Vote", "LLM2 Vote", "LLM3 Vote", "Top 3 Titles"]
	    sp = _v2_summaries_path()
	    if not sp.exists():
	        return pd.DataFrame(columns=cols)

	    summaries = json.loads(sp.read_text())
	    rows = []
	    for cluster in summaries:
	        default_label = "Cluster {}".format(cluster.get("cluster_id", ""))
	        rows.append([
	            int(cluster.get("cluster_id", 0)),
	            str(cluster.get("label", default_label)),
	            int(cluster.get("paper_count", 0)),
	            str(cluster.get("vote_agreement", "")),
	            str(cluster.get("llm_vote_1_MISTRAL", "")),
	            str(cluster.get("llm_vote_2_GEMINI", "")),
	            str(cluster.get("llm_vote_3_GROQ", "")),
	            # Show all three titles the column header promises, joined with
	            # a plain " | " (the former " \| " emitted a stray backslash).
	            " | ".join(cluster.get("top3_titles", [])[:3]),
	        ])
	    return pd.DataFrame(rows, columns=cols)


	def _load_chart(chart_type: str, run_config: str, mode: str = "v1") -> str:
	    """Return the stored HTML for a chart, or a placeholder paragraph.

	    In "v2" mode the SPECTER2 ``charts.json`` is read and ``run_config`` is
	    ignored; otherwise the file of the given Classic run is used. When the
	    requested chart is absent the first stored chart is returned instead.
	    """
	    if mode == "v2":
	        charts_file = _v2_charts_path()
	    else:
	        charts_file = _charts_path(run_config)
	    if not charts_file.exists():
	        return "<p style='color:#9ca3af;padding:20px;'>Charts appear after clustering completes.</p>"

	    charts = json.loads(charts_file.read_text())
	    stored_names = list(charts.keys())
	    # v2 stores scatter and bar; v1 stores bar, histogram, scatter, treemap.
	    if chart_type in charts:
	        chosen = chart_type
	    elif stored_names:
	        chosen = stored_names[0]
	    else:
	        chosen = "bar"
	    return charts.get(chosen, "<p>Chart not found.</p>")


	def _get_download_files(mode: str = "v1"):
	    """List the output files that currently exist for the given mode.

	    Returns:
	        A list of path strings in a fixed order, or ``None`` when none of
	        the candidate files exist.
	    """
	    if mode == "v2":
	        wanted = [
	            DATA_DIR / "comparison_v2.csv",
	            DATA_DIR / "v2" / "cluster_audit.csv",
	            DATA_DIR / "v2" / "narrative_v2.txt",
	            DATA_DIR / "v2" / "summaries.json",
	            DATA_DIR / "v2" / "taxonomy.json",
	        ]
	    else:
	        wanted = [DATA_DIR / "comparison.csv", DATA_DIR / "narrative.txt"]
	        for run_name in ("abstract", "title"):
	            for file_name in ("summaries.json", "themes.json", "taxonomy.json"):
	                wanted.append(DATA_DIR / run_name / file_name)

	    present = [str(candidate) for candidate in wanted if candidate.exists()]
	    if not present:
	        return None
	    return present


	def handle_file_upload(file_path) -> str:
	    """Copy an uploaded CSV to ``DATA_DIR/uploaded.csv`` and describe it.

	    Returns:
	        An empty string when nothing was uploaded; otherwise a status line
	        with the row count and the first eight column names, or a shorter
	        message when the saved file cannot be parsed.
	    """
	    if not file_path:
	        return ""

	    dest = DATA_DIR / "uploaded.csv"
	    source_abs = Path(file_path).resolve()
	    target_abs = dest.resolve()
	    # Skip the copy when the upload already is the destination file.
	    if source_abs != target_abs:
	        shutil.copy(str(source_abs), str(target_abs))

	    try:
	        frame = _safe_read_csv(dest)
	        summary = "✅ CSV saved — {} rows, columns: {}. ".format(
	            len(frame), ", ".join(list(frame.columns[:8]))
	        )
	    except Exception:
	        # Best effort: the file is saved even if it cannot be previewed.
	        summary = "✅ CSV saved to {}. ".format(dest)
	    return summary + "Select a mode below and type the run command."


	def reset_all_data() -> tuple:
	    """Delete everything under ``DATA_DIR`` and return a pristine UI state.

	    Returns:
	        A 14-tuple, in order: chat history, chat input, progress HTML,
	        run-status HTML, empty v1 review table, empty v2 cluster table,
	        chart placeholder, downloads, three fresh thread ids
	        (abstract, title, v2), current run name, table status banner and
	        file status text.
	    """
	    try:
	        reset_thread_v2("default")
	    except Exception as exc:
	        # Resetting the agent thread is best effort, but report the failure
	        # rather than discarding it silently.
	        print("reset_thread_v2 failed during reset:", exc)

	    # The module-level shutil import is used; no local re-import is needed.
	    if DATA_DIR.exists():
	        shutil.rmtree(str(DATA_DIR))
	    DATA_DIR.mkdir(exist_ok=True)

	    empty_v1 = pd.DataFrame(columns=REVIEW_COLUMNS)
	    empty_v2 = pd.DataFrame(columns=["#", "Cluster Label", "Papers",
	                                     "Vote Agreement", "LLM1 Vote",
	                                     "LLM2 Vote", "LLM3 Vote", "Top 3 Titles"])
	    empty_chart = "<p style='color:#9ca3af;padding:20px;'>Charts appear after clustering.</p>"
	    status_msg = (
	        "<div style='padding:10px;background:#fef3c7;border-radius:6px;"
	        "font-family:sans-serif;font-size:13px;'>"
	        "🔄 <b>All data cleared.</b> Upload a new CSV and begin."
	        "</div>"
	    )
	    return (
	        [],                       # chatbot
	        "",                       # chat input
	        make_progress_html(0),    # progress
	        _run_status_html("v1"),   # run status
	        empty_v1,                 # v1 review table
	        empty_v2,                 # v2 cluster table
	        empty_chart,              # chart
	        None,                     # downloads
	        new_thread_id(),          # abstract thread
	        new_thread_id(),          # title thread
	        new_thread_id(),          # v2 thread
	        "abstract",               # current_run (v1)
	        status_msg,               # table_status
	        "",                       # file_status
	    )


	def _detect_phase(text: str, mode: str = "v1") -> int:
	    """Infer the zero-based progress index from an agent reply.

	    The reply is lower-cased and scanned for the markers below in order;
	    the first marker found decides the result.

	    Args:
	        text: Agent reply to inspect.
	        mode: "v1" uses the Classic markers, anything else the SPECTER2 ones.

	    Returns:
	        The index into the mode's phase labels, or 0 when no marker matches.
	    """
	    # Highest phase first, so a reply that names several phases reports the
	    # furthest one. "phase 6" must precede "phase 5.5" (otherwise a reply
	    # mentioning both reports 5), and "phase 5.5" must precede "phase 5"
	    # because the latter is a substring of the former.
	    markers_v1 = (
	        ("phase 6", 6), ("phase 5.5", 5), ("phase 5", 4),
	        ("phase 4", 3), ("phase 3", 2), ("phase 2", 1), ("phase 1", 0),
	    )
	    # Explicit "phase N" mentions win; keyword markers are only fallbacks.
	    markers_v2 = (
	        ("phase 5", 4), ("phase 4", 3), ("phase 3", 2), ("phase 2", 1), ("phase 1", 0),
	        ("specter2 run complete", 4),
	        ("final outputs", 4),
	        ("pajais mapping", 3),
	        ("council", 2),
	        ("hdbscan", 1),
	    )
	    lower = text.lower()
	    markers = markers_v1 if mode == "v1" else markers_v2
	    for needle, phase_index in markers:
	        if needle in lower:
	            return phase_index
	    return 0


	def _detect_run_label(text: str) -> str:
	    """Return the run badge text implied by an agent reply, or "" if none.

	    Title markers are checked first, then abstract markers, then the
	    SPECTER2 markers ("specter" or "v2"); matching is case-insensitive.
	    """
	    haystack = text.lower()
	    if "title run" in haystack or "title phase" in haystack:
	        return "TITLE RUN"
	    if "abstract run" in haystack or "abstract phase" in haystack:
	        return "ABSTRACT RUN"
	    if "specter" in haystack or "v2" in haystack:
	        return "SPECTER2 RUN"
	    return ""


	def _stream_agent(user_message: str, thread_id: str, mode: str = "v1",
	                  retry_delay: float = 2.0) -> str:
	    """Send one user message to the selected agent and return its final reply.

	    Args:
	        user_message: Text forwarded to the agent as a user message.
	        thread_id: Conversation thread passed in the agent config.
	        mode: "v1" targets the Classic agent, anything else the SPECTER2 one.
	        retry_delay: Seconds to wait before the single retry that follows a
	            rate-limited reply. Values <= 0 retry immediately.

	    Returns:
	        The content of the last non-empty message streamed by the agent,
	        "(no response)" when nothing was produced, or an "ERROR: ..." string
	        when the selected agent failed to import.
	    """
	    import time

	    if mode == "v1":
	        agent_obj, clean_fn, agent_ok = agent, clean_thread_history, AGENT_V1_OK
	        agent_name = "Classic"
	    else:
	        agent_obj, clean_fn, agent_ok = agent_v2, clean_thread_history_v2, AGENT_V2_OK
	        agent_name = "SPECTER2"

	    if not agent_ok:
	        return "ERROR: {} agent not loaded. Check terminal.".format(agent_name)

	    def _do_stream() -> str:
	        clean_fn(thread_id)
	        config = {"configurable": {"thread_id": thread_id}}
	        full_reply = ""
	        for chunk in agent_obj.stream(
	            {"messages": [{"role": "user", "content": user_message}]},
	            config=config,
	            stream_mode="values",
	        ):
	            last_msg = chunk["messages"][-1]
	            content = getattr(last_msg, "content", "")
	            if isinstance(content, list):
	                # Multi-part content: join the text of each part.
	                content = " ".join(
	                    part.get("text", "") if isinstance(part, dict) else str(part)
	                    for part in content
	                )
	            if content:
	                full_reply = content
	        return full_reply or "(no response)"

	    result = _do_stream()
	    lowered = result.lower()
	    # NOTE(review): this is a text heuristic; a legitimate reply containing
	    # "429" (for example a row count) also triggers the retry.
	    is_rate_limited = (
	        "429" in result
	        or "rate limit" in lowered
	        or "rate_limited" in lowered
	    )
	    if not is_rate_limited:
	        return result
	    # Back off briefly before the one retry; retrying a rate-limited call
	    # immediately is likely to be rejected again.
	    if retry_delay > 0:
	        time.sleep(retry_delay)
	    return _do_stream()


	def _generate_final_v1_directly(history: list) -> str:
	    """Invoke the two export tools directly and summarise what they produced.

	    ``history`` is accepted but not used. Each tool result is parsed as JSON
	    only when it starts with "{"; otherwise its fields are reported as "?".
	    """
	    from tools import generate_comparison_csv, export_narrative

	    def _as_info(raw):
	        if raw.strip().startswith("{"):
	            return json.loads(raw)
	        return {}

	    csv_raw = generate_comparison_csv.invoke({})
	    narrative_raw = export_narrative.invoke({})
	    csv_info = _as_info(csv_raw)
	    narrative_info = _as_info(narrative_raw)

	    row_count = csv_info.get("rows", "?")
	    leading_columns = ", ".join(csv_info.get("columns", [])[:5]) + "..."
	    word_count = narrative_info.get("word_count", "?")
	    template = (
	        "Both runs complete. Final outputs generated. "
	        "comparison.csv has {} rows with columns: {}. "
	        "narrative.txt has {} words. "
	        "Both files are in the Download tab."
	    )
	    return template.format(row_count, leading_columns, word_count)


	def run_agent(
	    user_message: str,
	    history: list,
	    abstract_thread: str,
	    title_thread: str,
	    v2_thread: str,
	    current_run: str,
	    current_mode: str,
	) -> tuple:
	    """Handle one chat message and return the refreshed UI state.

	    A blank message only refreshes the widgets. Otherwise the message is
	    appended to the history and answered either by generating the final
	    Classic outputs directly (Classic mode, both taxonomies present, the
	    message asks for final output, and ``comparison.csv`` is still missing)
	    or by streaming it to the agent on the thread of the active run.

	    Returns:
	        A 12-tuple: history, cleared input, progress HTML, status HTML,
	        v1 review table, v2 cluster table, chart HTML, download files, the
	        three thread ids unchanged, and the active run name.
	    """
	    mode = current_mode or "v1"

	    def _ui_state(chat, progress_html, run_name):
	        # Everything except the chat, the progress bar and the run name is
	        # re-read from disk on every call.
	        table_run = _active_run_for_table()
	        return (
	            chat, "",
	            progress_html,
	            _run_status_html(mode),
	            _build_review_table(table_run),
	            _build_v2_cluster_table(),
	            _load_chart("bar", table_run, mode),
	            _get_download_files(mode),
	            abstract_thread, title_thread, v2_thread, run_name,
	        )

	    if not (user_message and user_message.strip()):
	        return _ui_state(history or [], make_progress_html(0, mode=mode), current_run)

	    lower = user_message.lower().strip()

	    # Detect run switches
	    if "run title" in lower:
	        active_run = "title"
	    elif "run abstract" in lower:
	        active_run = "abstract"
	    else:
	        active_run = current_run

	    # v1 shortcut for final outputs
	    abstract_done = (DATA_DIR / "abstract" / "taxonomy.json").exists()
	    title_done = (DATA_DIR / "title" / "taxonomy.json").exists()
	    final_keywords = ("yes", "generate", "final", "comparison", "narrative", "output")
	    wants_final = any(word in lower for word in final_keywords)

	    history = list(history or [])
	    history.append({"role": "user", "content": user_message})

	    take_shortcut = (
	        mode == "v1"
	        and abstract_done and title_done
	        and wants_final
	        and not (DATA_DIR / "comparison.csv").exists()
	    )
	    if take_shortcut:
	        reply = _generate_final_v1_directly(history)
	    else:
	        if mode == "v2":
	            target_thread = v2_thread
	        elif active_run == "title":
	            target_thread = title_thread
	        else:
	            target_thread = abstract_thread
	        reply = _stream_agent(user_message, target_thread, mode=mode)

	    history.append({"role": "assistant", "content": reply})
	    progress_html = make_progress_html(
	        _detect_phase(reply, mode), _detect_run_label(reply), mode
	    )
	    return _ui_state(history, progress_html, active_run)


	def handle_submit_review(
	    review_data,
	    history: list,
	    abstract_thread: str,
	    title_thread: str,
	    v2_thread: str,
	    current_run: str,
	    current_mode: str,
	) -> tuple:
	    """Turn the edited review table into a theme-consolidation request.

	    Approved rows are grouped by theme name ("Rename To" when filled,
	    otherwise the topic label) and forwarded through ``run_agent``.

	    Args:
	        review_data: Table contents as a DataFrame, a dict with "data" and
	            "headers", or a plain list of rows; may be None.
	        history: Current chat history.
	        abstract_thread, title_thread, v2_thread: Agent thread ids.
	        current_run: Classic run the review belongs to.
	        current_mode: Accepted for interface compatibility; the review table
	            is a Classic-mode feature, so requests are always routed as "v1".

	    Returns:
	        The same 12-tuple of UI outputs that ``run_agent`` returns.
	    """
	    # Always route as Classic: the no-approval branch below already renders
	    # the v1 state, and forwarding with a SPECTER2 mode would send the
	    # consolidation request to the wrong agent and thread.
	    review_mode = "v1"

	    def _forward(text):
	        return run_agent(
	            text, history, abstract_thread, title_thread, v2_thread,
	            current_run, review_mode,
	        )

	    if review_data is None:
	        return _forward("Review table empty — waiting for Phase 2.")

	    if isinstance(review_data, dict):
	        df = pd.DataFrame(
	            review_data.get("data", []),
	            columns=review_data.get("headers", REVIEW_COLUMNS),
	        )
	    elif isinstance(review_data, pd.DataFrame):
	        df = review_data.copy()
	    else:
	        df = pd.DataFrame(review_data, columns=REVIEW_COLUMNS)

	    if df.empty:
	        return _forward("Review table empty — waiting for Phase 2.")

	    df.columns = pd.Index([str(col) for col in df.columns])
	    columns = list(df.columns)
	    approve_col = next((c for c in columns if "approve" in c.lower()), None)
	    if approve_col is None:
	        return _forward("Cannot find Approve column in table.")
	    id_col = next((c for c in columns if c.strip() == "#"), columns[0])
	    # Fall back to the second column, or to the only column of a one-column
	    # table, instead of raising IndexError.
	    label_col = next(
	        (c for c in columns if "label" in c.lower()),
	        columns[min(1, len(columns) - 1)],
	    )
	    rename_col = next((c for c in columns if "rename" in c.lower()), None)

	    def to_bool(v):
	        return v is True or str(v).strip().lower() in ("true", "1", "yes", "x", "on", "✓")

	    approved_mask = [to_bool(value) for value in df[approve_col]]
	    approved_df = df.loc[approved_mask]

	    if len(approved_df) == 0:
	        guide = (
	            "⚠️ No topics approved yet.\n\n"
	            "To approve topics:\n"
	            "1. Click 🔄 Refresh Table to load latest topics\n"
	            "2. Click the checkbox ☐ in Approve column\n"
	            "3. Fill Rename To with a theme name\n"
	            "4. Click ✅ Submit Review again"
	        )
	        history = list(history or [])
	        history.append({"role": "assistant", "content": guide})
	        cfg = _active_run_for_table()
	        return (
	            history, "",
	            make_progress_html(1),
	            _run_status_html("v1"),
	            _build_review_table(cfg),
	            _build_v2_cluster_table(),
	            _load_chart("bar", cfg, "v1"),
	            _get_download_files("v1"),
	            abstract_thread, title_thread, v2_thread, current_run,
	        )

	    theme_map: dict = {}
	    for position, (_, row) in enumerate(approved_df.iterrows()):
	        rename_val = str(row[rename_col]).strip() if rename_col else ""
	        if rename_val and rename_val.lower() not in ("nan", "none"):
	            theme = rename_val
	        else:
	            theme = str(row[label_col])
	        try:
	            # Ids may arrive as "3", 3 or 3.0 depending on the table source.
	            tid = int(float(str(row[id_col])))
	        except (ValueError, TypeError):
	            tid = position
	        theme_map.setdefault(theme, []).append(tid)

	    groups = [{"theme_name": name, "topic_ids": ids} for name, ids in theme_map.items()]

	    msg = (
	        "Researcher submitted the Review Table for the {} run.\n"
	        "{} topics approved, {} themes:\n\n"
	        "```json\n{}\n```\n\n"
	        "Call consolidate_into_themes with run_config='{}' "
	        "and the approved_groups JSON above. Then proceed to Phase 3."
	    ).format(
	        current_run, len(approved_df), len(groups),
	        json.dumps(groups, indent=2), current_run,
	    )
	    return _forward(msg)


	def switch_mode(new_mode: str, current_mode: str, abstract_thread: str, title_thread: str, v2_thread: str, current_run: str) -> tuple:
	    """Switch between Classic and SPECTER2 modes, refreshing UI accordingly.

	    Only ``new_mode`` influences the result; the remaining parameters are
	    accepted because the click handlers pass them.

	    Returns:
	        A 9-tuple: progress HTML, status HTML, v1 review table, v2 cluster
	        table, chart HTML, download files, mode description markdown, a
	        dropdown update with the mode's chart choices, and the new mode.
	    """
	    cfg = _active_run_for_table()
	    if new_mode == "v1":
	        mode_label_text = (
	            "### 🔬 Classic Mode (BERTopic)\n"
	            "Run abstract analysis, then title analysis. 6 Braun & Clarke phases each.\n"
	            "Commands: run abstract → review → run title → review → download"
	        )
	        chart_opts = CHART_OPTIONS
	    else:
	        mode_label_text = (
	            "### 🧬 SPECTER2 Mode (Advanced)\n"
	            "One combined run per paper (Title+Abstract). UMAP+HDBSCAN clustering. "
	            "Council-of-3 LLM labeling with audit trail.\n"
	            "Command: run specter or run v2"
	        )
	        chart_opts = ["scatter", "bar"]

	    # Load the chart the dropdown is being reset to, so the selector and the
	    # displayed chart agree (previously "bar" was always loaded, even when
	    # the dropdown was set to "scatter" for SPECTER2).
	    selected_chart = chart_opts[0]
	    return (
	        make_progress_html(0, mode=new_mode),
	        _run_status_html(new_mode),
	        _build_review_table(cfg),
	        _build_v2_cluster_table(),
	        _load_chart(selected_chart, cfg, new_mode),
	        _get_download_files(new_mode),
	        mode_label_text,
	        gr.update(choices=chart_opts, value=selected_chart),
	        new_mode,
	    )


	def manual_refresh_table(current_run: str, current_mode: str) -> tuple:
	    """Rebuild both result tables from disk; the arguments are not used."""
	    classic_table = _build_review_table(_active_run_for_table())
	    cluster_table = _build_v2_cluster_table()
	    return classic_table, cluster_table


	def refresh_chart(chart_type: str, current_run: str, current_mode: str) -> str:
	    """Return the HTML of the selected chart for the current mode.

	    ``current_run`` is ignored: the run is re-derived from the files on disk.
	    A falsy ``current_mode`` is treated as "v1".
	    """
	    table_run = _active_run_for_table()
	    return _load_chart(chart_type, table_run, current_mode or "v1")


	def check_status(current_mode: str) -> str:
	    """Return an HTML banner saying how many topics or clusters exist on disk.

	    A falsy mode is treated as "v1". In "v2" mode the SPECTER2 summaries are
	    inspected; otherwise those of the active Classic run. A Classic topic
	    counts as LLM-labelled when its label is non-blank and does not start
	    with "Topic ".
	    """
	    pending_open = (
	        "<div style='padding:10px;background:#fef3c7;border-radius:6px;"
	        "font-family:sans-serif;font-size:13px;'>"
	    )
	    ready_open = (
	        "<div style='padding:10px;background:#dcfce7;border-radius:6px;"
	        "font-family:sans-serif;font-size:13px;'>"
	    )
	    mode = current_mode or "v1"

	    if mode == "v2":
	        v2_file = _v2_summaries_path()
	        if not v2_file.exists():
	            return (
	                pending_open
	                + "⏳ No v2 clusters yet. Type <b>run specter</b> to begin."
	                + "</div>"
	            )
	        clusters = json.loads(v2_file.read_text())
	        labelled = len([c for c in clusters if c.get("label", "").strip()])
	        banner = (
	            ready_open
	            + "✅ <b>{} clusters</b> in <code>data/v2/</code> ({} labeled). "
	            + "Click 🔄 Refresh to display."
	            + "</div>"
	        )
	        return banner.format(len(clusters), labelled)

	    run_name = _active_run_for_table()
	    run_file = _summaries_path(run_name)
	    if not run_file.exists():
	        return (
	            pending_open
	            + "⏳ No topics yet. Upload CSV then type <b>run abstract</b>."
	            + "</div>"
	        )
	    topics = json.loads(run_file.read_text())
	    labelled = 0
	    for topic in topics:
	        if topic.get("label", "").strip() and not topic.get("label", "").startswith("Topic "):
	            labelled += 1
	    banner = (
	        ready_open
	        + "✅ <b>{} topics</b> from <code>data/{}/</code> ({} LLM-labelled). "
	        + "Click 🔄 Refresh Table."
	        + "</div>"
	    )
	    return banner.format(len(topics), run_name, labelled)


	print("Step 4: building UI...")

	# ── UI ─────────────────────────────────────────────────────────────────────────
	# Builds the whole Gradio interface as the module-level `demo` object:
	# per-session state, mode selector, data input, chat, results tabs, and the
	# event wiring at the end. Output lists must match the tuple order returned
	# by the corresponding handler functions.
	with gr.Blocks(
	    title="BERTopic / SPECTER2 Thematic Analysis Agent",
	    css="""
	    .mode-btn-active { border: 2px solid #4f46e5 !important; background: #eef2ff !important; }
	    .mode-btn-v2-active { border: 2px solid #0891b2 !important; background: #ecfeff !important; }
	    """
	) as demo:

	    # Per-session state: one agent thread per run type, plus the active
	    # Classic run and the active analysis mode.
	    abstract_thread_state = gr.State(new_thread_id())
	    title_thread_state = gr.State(new_thread_id())
	    v2_thread_state = gr.State(new_thread_id())
	    current_run_state = gr.State("abstract")
	    current_mode_state = gr.State("v1")

	    gr.Markdown(
	        "# 🔬 Thematic Analysis Agent\n"
	        "Braun & Clarke (2006) · SPECTER2 · PAJAIS Taxonomy · Systematic Literature Review"
	    )

	    progress_bar = gr.HTML(make_progress_html(0))
	    run_status = gr.HTML(_run_status_html("v1"))

	    # ── MODE SELECTOR ──────────────────────────────────────────────────────────
	    with gr.Row():
	        with gr.Column(scale=1):
	            gr.Markdown("### 🔀 Analysis Mode")
	            with gr.Row():
	                btn_v1 = gr.Button(
	                    "📊 Classic (BERTopic)\nAbstract + Title runs",
	                    variant="primary", size="sm",
	                )
	                btn_v2 = gr.Button(
	                    "🧬 SPECTER2 (Advanced)\nCombined T+A · HDBSCAN · Council-3-LLMs",
	                    variant="secondary", size="sm",
	                )

	    mode_description = gr.Markdown(
	        "### 📊 Classic Mode (BERTopic)\n"
	        "Run abstract analysis, then title analysis. 6 Braun & Clarke phases each.\n"
	        "Commands: run abstract → review → run title → review → download"
	    )

	    # Plain "|" separators below: the former "\|" was an invalid escape
	    # sequence and rendered a stray backslash in the page.
	    gr.HTML("""
	    <div style="background:#f0fdf4;border:1px solid #86efac;border-radius:8px;
	    padding:10px 14px;font-family:sans-serif;font-size:13px;margin:4px 0;">
	    <b>Classic:</b>
	    1️⃣ Upload CSV  →  2️⃣ <code>run abstract</code>  →
	    3️⃣ Review Table  →  4️⃣ <code>run title</code>  →  5️⃣ Download
	    |
	    <b>SPECTER2:</b>
	    1️⃣ Upload CSV  →  2️⃣ <code>run specter</code>  →  3️⃣ Download
	    </div>
	    """)

	    # ── Section 1 ─────────────────────────────────────────────────────────────
	    with gr.Accordion("📂 Section 1 — Data Input", open=True):
	        def _startup_msg():
	            """Return the banner describing data left over from a previous session."""
	            abs_done = (DATA_DIR / "abstract" / "taxonomy.json").exists()
	            title_done = (DATA_DIR / "title" / "taxonomy.json").exists()
	            v2_done = (DATA_DIR / "v2" / "taxonomy.json").exists()
	            csv_exists = (DATA_DIR / "uploaded.csv").exists()
	            has_data = csv_exists or abs_done or title_done or v2_done
	            return (
	                "<div style='padding:10px;background:#fef3c7;border:1px solid #fcd34d;"
	                "border-radius:8px;font-family:sans-serif;font-size:13px;'>"
	                "⚠️ <b>Previous session data detected.</b> "
	                "Abstract: {abs}  |  Title: {title}  |  "
	                "SPECTER2: {v2}  |  CSV: {csv}<br>"
	                "Click <b>🗑️ Reset</b> to clear or continue from where you left off."
	                "</div>"
	                if has_data else
	                "<div style='padding:10px;background:#f0fdf4;border:1px solid #86efac;"
	                "border-radius:8px;font-family:sans-serif;font-size:13px;'>"
	                "✅ Fresh session — upload your CSV to begin."
	                "</div>"
	            ).format(
	                abs="✅" if abs_done else "⏳",
	                title="✅" if title_done else "⏳",
	                v2="✅" if v2_done else "⏳",
	                csv="✅" if csv_exists else "❌",
	            )

	        startup_banner = gr.HTML(_startup_msg())
	        with gr.Row():
	            file_input = gr.File(
	                label="Upload Scopus CSV", file_types=[".csv"],
	                type="filepath", scale=4,
	            )
	            reset_btn = gr.Button(
	                "🗑️ Reset & Start Fresh",
	                variant="stop", scale=1, size="sm",
	            )
	        file_status = gr.Textbox(label="Upload status", interactive=False, lines=2)
	        file_input.change(fn=handle_file_upload, inputs=file_input, outputs=file_status)

	    # ── Section 2 ─────────────────────────────────────────────────────────────
	    with gr.Accordion("💬 Section 2 — Agent Conversation", open=True):
	        gr.HTML("""
	        <div style="background:#fafafa;border:1px solid #e5e7eb;border-radius:6px;
	        padding:8px 12px;font-size:12px;font-family:monospace;margin-bottom:6px;">
	        Classic: <b>run abstract</b> | <b>run title</b> | <b>yes</b> | <b>satisfied</b> | <b>confirm</b>
	        |
	        SPECTER2: <b>run specter</b> | <b>run v2</b> | <b>yes</b>
	        </div>
	        """)
	        chatbot = gr.Chatbot(label="Agent", height=500)
	        with gr.Row():
	            chat_input = gr.Textbox(
	                label="Message",
	                placeholder="e.g. run abstract or run specter",
	                lines=2, scale=5,
	            )
	            send_btn = gr.Button("Send ➤", variant="primary", scale=1)

	    # ── Section 3 ─────────────────────────────────────────────────────────────
	    with gr.Accordion("📊 Section 3 — Results", open=True):
	        with gr.Tabs():

	            with gr.Tab("📋 Review Table (Classic)"):
	                gr.HTML("""
	                <div style="background:#eff6ff;border:1px solid #bfdbfe;border-radius:8px;
	                padding:8px 12px;font-family:sans-serif;font-size:13px;">
	                After Phase 2 (Classic): Refresh → tick Approve → fill Rename To → Submit Review
	                </div>
	                """)
	                table_status = gr.HTML(
	                    "<div style='padding:8px;color:#6b7280;font-size:13px;'>"
	                    "Complete Phase 2 (Classic) then Refresh.</div>"
	                )
	                with gr.Row():
	                    refresh_btn = gr.Button("🔄 Refresh Table", variant="secondary", scale=2)
	                    check_status_btn = gr.Button("📊 Check Status", variant="secondary", scale=1)
	                review_table = gr.Dataframe(
	                    value=pd.DataFrame(columns=REVIEW_COLUMNS),
	                    headers=REVIEW_COLUMNS,
	                    datatype=["number", "str", "str", "number", "number", "bool", "str", "str"],
	                    interactive=True, wrap=True,
	                    label="Topic Review Table (Classic Mode)",
	                )
	                submit_review_btn = gr.Button("✅ Submit Review", variant="primary", size="lg")

	            with gr.Tab("🧬 Cluster View (SPECTER2)"):
	                gr.HTML("""
	                <div style="background:#ecfeff;border:1px solid #a5f3fc;border-radius:8px;
	                padding:8px 12px;font-family:sans-serif;font-size:13px;">
	                Clusters appear after Phase 3 (Council Labeling) completes. Read-only — no manual review needed.
	                Download the <b>cluster_audit.csv</b> for full LLM voting details.
	                </div>
	                """)
	                with gr.Row():
	                    refresh_v2_btn = gr.Button("🔄 Refresh Clusters", variant="secondary", scale=2)
	                    check_v2_btn = gr.Button("📊 Check V2 Status", variant="secondary", scale=1)
	                v2_cluster_table = gr.Dataframe(
	                    value=pd.DataFrame(columns=["#", "Cluster Label", "Papers",
	                                                "Vote Agreement", "LLM1 Vote",
	                                                "LLM2 Vote", "LLM3 Vote", "Top 3 Titles"]),
	                    headers=["#", "Cluster Label", "Papers", "Vote Agreement",
	                             "LLM1 Vote", "LLM2 Vote", "LLM3 Vote", "Top 3 Titles"],
	                    datatype=["number", "str", "number", "str", "str", "str", "str", "str"],
	                    interactive=False, wrap=True,
	                    label="SPECTER2 Cluster Table (Read-only)",
	                )

	            with gr.Tab("📈 Charts"):
	                chart_selector = gr.Dropdown(
	                    choices=CHART_OPTIONS, value="bar",
	                    label="Select Chart", interactive=True,
	                )
	                chart_display = gr.HTML(
	                    "<p style='color:#9ca3af;padding:20px;'>Charts appear after clustering.</p>"
	                )
	                chart_selector.change(
	                    fn=refresh_chart,
	                    inputs=[chart_selector, current_run_state, current_mode_state],
	                    outputs=chart_display,
	                )

	            with gr.Tab("⬇️ Download"):
	                gr.Markdown(
	                    "Classic outputs appear after both abstract+title runs complete.\n\n"
	                    "SPECTER2 outputs appear after v2 run completes:\n"
	                    "- `comparison_v2.csv` — one row per paper with cluster + PAJAIS\n"
	                    "- `cluster_audit.csv` — full LLM voting record, per paper\n"
	                    "- `narrative_v2.txt` — 500-word Section 7 discussion\n"
	                    "> 💡 Cache: `data/v2/llm_cache/` stores LLM responses — "
	                    "delete this folder to force fresh labels on re-run.\n"
	                )
	                download_files = gr.File(
	                    label="Output Files", file_count="multiple", interactive=False,
	                )

	    # ── wire up — combined outputs ─────────────────────────────────────────────
	    # Order matches the 12-tuple returned by run_agent / handle_submit_review.
	    agent_outputs = [
	        chatbot, chat_input, progress_bar, run_status,
	        review_table, v2_cluster_table, chart_display, download_files,
	        abstract_thread_state, title_thread_state, v2_thread_state, current_run_state,
	    ]

	    # Order matches the 14-tuple returned by reset_all_data.
	    reset_outputs = [
	        chatbot, chat_input, progress_bar, run_status,
	        review_table, v2_cluster_table, chart_display, download_files,
	        abstract_thread_state, title_thread_state, v2_thread_state, current_run_state,
	        table_status, file_status,
	    ]

	    # Order matches the 9-tuple returned by switch_mode.
	    mode_switch_outputs = [
	        progress_bar, run_status,
	        review_table, v2_cluster_table,
	        chart_display, download_files,
	        mode_description, chart_selector,
	        current_mode_state,
	    ]

	    send_btn.click(
	        fn=run_agent,
	        inputs=[chat_input, chatbot,
	                abstract_thread_state, title_thread_state, v2_thread_state,
	                current_run_state, current_mode_state],
	        outputs=agent_outputs,
	    )
	    chat_input.submit(
	        fn=run_agent,
	        inputs=[chat_input, chatbot,
	                abstract_thread_state, title_thread_state, v2_thread_state,
	                current_run_state, current_mode_state],
	        outputs=agent_outputs,
	    )
	    submit_review_btn.click(
	        fn=handle_submit_review,
	        inputs=[review_table, chatbot,
	                abstract_thread_state, title_thread_state, v2_thread_state,
	                current_run_state, current_mode_state],
	        outputs=agent_outputs,
	    )
	    reset_btn.click(
	        fn=reset_all_data,
	        inputs=[],
	        outputs=reset_outputs,
	    )

	    btn_v1.click(
	        fn=lambda m, at, tt, vt, cr: switch_mode("v1", m, at, tt, vt, cr),
	        inputs=[current_mode_state, abstract_thread_state, title_thread_state,
	                v2_thread_state, current_run_state],
	        outputs=mode_switch_outputs,
	    )
	    btn_v2.click(
	        fn=lambda m, at, tt, vt, cr: switch_mode("v2", m, at, tt, vt, cr),
	        inputs=[current_mode_state, abstract_thread_state, title_thread_state,
	                v2_thread_state, current_run_state],
	        outputs=mode_switch_outputs,
	    )

	    refresh_btn.click(
	        fn=manual_refresh_table,
	        inputs=[current_run_state, current_mode_state],
	        outputs=[review_table, v2_cluster_table],
	    )
	    refresh_v2_btn.click(
	        fn=manual_refresh_table,
	        inputs=[current_run_state, current_mode_state],
	        outputs=[review_table, v2_cluster_table],
	    )
	    check_status_btn.click(
	        fn=check_status,
	        inputs=[current_mode_state],
	        outputs=[table_status],
	    )
	    # The handler takes no arguments, so no inputs are wired: passing the
	    # mode state to a zero-argument lambda raised a TypeError on click.
	    check_v2_btn.click(
	        fn=lambda: check_status("v2"),
	        inputs=[],
	        outputs=[table_status],
	    )

	print("Step 5: UI built OK, launching...")

	if __name__ == "__main__":
	    import subprocess
	    import sys

	    # Major/minor only, so suffixes in later version components are ignored.
	    _v = tuple(map(int, gr.__version__.split(".")[:2]))
	    print("Gradio version:", gr.__version__)
	    _kwargs = dict(
	        server_name="0.0.0.0",
	        server_port=7860,
	        share=False,
	        inbrowser=False,
	    )
	    # ssr_mode is only passed to Gradio 5.0 and later.
	    if _v >= (5, 0):
	        _kwargs["ssr_mode"] = False
	    print("Running at http://0.0.0.0:7860")
	    # Start check_keys.py as a separate process without waiting for it.
	    subprocess.Popen([sys.executable, "check_keys.py"])
	    demo.launch(**_kwargs)