Spaces:
Sleeping
Sleeping
| """ | |
| app.py - Gradio 6.x BERTopic / SPECTER2 Thematic Analysis Agent. | |
| TWO MODES: | |
| Classic (v1): BERTopic + Mistral-small, abstract run then title run separately. | |
| SPECTER2 (v2): SPECTER2 embeddings + UMAP + HDBSCAN + council-of-3-LLMs, | |
| one combined run on Title+Abstract per paper. | |
| KEY DESIGN: | |
| - Abstract run and title run use SEPARATE thread IDs in v1. | |
| - v2 uses its own separate thread ID. | |
| - Mode switch keeps existing data intact; user can switch freely. | |
| """ | |
| from __future__ import annotations | |
| print("Step 1: imports starting...") | |
| import json | |
| import shutil | |
| import uuid | |
| from pathlib import Path | |
| import gradio as gr | |
| import pandas as pd | |
| print("Step 2: gradio imported, version =", gr.__version__) | |
# -- v1 agent ----------------------------------------------------------------
try:
    from agent import agent, clean_thread_history
except Exception as exc:
    # The import failed: report why and install inert stand-ins so the rest
    # of the module can still be loaded and the UI can still be built.
    print("Step 3a FAILED:", exc)
    agent = None
    AGENT_V1_OK = False

    def clean_thread_history(tid):
        """No-op stand-in used when the v1 agent module cannot be imported."""
        return None
else:
    AGENT_V1_OK = True
    print("Step 3a: v1 agent imported OK")
# -- v2 agent ----------------------------------------------------------------
try:
    from agent_v2 import agent_v2, clean_thread_history_v2, reset_thread_v2
except Exception as exc:
    # The import failed: report why and install inert stand-ins so the rest
    # of the module can still be loaded and the UI can still be built.
    print("Step 3b FAILED:", exc)
    agent_v2 = None
    AGENT_V2_OK = False

    def clean_thread_history_v2(tid):
        """No-op stand-in used when the v2 agent module cannot be imported."""
        return None

    def reset_thread_v2(tid):
        """No-op stand-in used when the v2 agent module cannot be imported."""
        return None
else:
    AGENT_V2_OK = True
    print("Step 3b: v2 agent imported OK")
# -- constants ---------------------------------------------------------------
# Root folder for every artefact the app reads or writes; created on import.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

# Column headers of the editable Classic review table, in display order.
REVIEW_COLUMNS = [
    "#",
    "Topic Label",
    "Top Evidence",
    "Sentences",
    "Papers",
    "Approve",
    "Rename To",
    "Reasoning",
]

# Chart keys offered by the chart selector in Classic mode.
CHART_OPTIONS = [
    "bar",
    "histogram",
    "scatter",
    "treemap",
]

# Progress-bar step captions for Classic (v1) mode, in phase order.
PHASE_LABELS_V1 = [
    "Phase 1 β Familiarisation",
    "Phase 2 β Initial Codes",
    "Phase 3 β Themes",
    "Phase 4 β Saturation",
    "Phase 5 β Naming",
    "Phase 5.5 β PAJAIS",
    "Phase 6 β Report",
]

# Progress-bar step captions for SPECTER2 (v2) mode, in phase order.
PHASE_LABELS_V2 = [
    "Phase 1 β Load & Embed",
    "Phase 2 β UMAP+HDBSCAN",
    "Phase 3 β Council Labeling",
    "Phase 4 β PAJAIS Mapping",
    "Phase 5 β Final Outputs",
]
def new_thread_id() -> str:
    """Return a freshly generated random UUID (version 4) as a string."""
    fresh = uuid.uuid4()
    return str(fresh)
# -- helpers -----------------------------------------------------------------
def make_progress_html(current_phase: int, run_label: str = "", mode: str = "v1") -> str:
    """Render the progress widget (bar plus one chip per phase) as HTML.

    Args:
        current_phase: Zero-based index of the current phase. Chips whose
            index is less than or equal to it are drawn in the accent colour.
        run_label: Optional text shown as a badge next to "Progress".
        mode: "v1" uses the Classic labels and colour; any other value uses
            the SPECTER2 labels and colour.

    Returns:
        An HTML fragment as a string.
    """
    if mode == "v1":
        phase_names, accent = PHASE_LABELS_V1, "#4f46e5"
    else:
        phase_names, accent = PHASE_LABELS_V2, "#0891b2"

    # Bar width is the phase index over the number of phases, truncated.
    percent = int((current_phase / len(phase_names)) * 100)

    chips = []
    for position, name in enumerate(phase_names):
        reached = position <= current_phase
        chips.append(
            '<span style="padding:3px 8px;margin:2px;border-radius:10px;font-size:11px;'
            'background:{bg};color:{fg};">{lbl}</span>'.format(
                bg=accent if reached else "#e5e7eb",
                fg="#fff" if reached else "#6b7280",
                lbl=name,
            )
        )

    run_badge = ""
    if run_label:
        run_badge = (
            ' <span style="background:#f59e0b;color:#fff;padding:2px 10px;'
            'border-radius:10px;font-size:12px;">{}</span>'.format(run_label)
        )

    template = (
        '<div style="font-family:sans-serif;padding:8px 0;">'
        '<div style="font-weight:600;color:#374151;margin-bottom:5px;">'
        'Progress{badge}</div>'
        '<div style="background:#e5e7eb;border-radius:6px;height:8px;margin-bottom:6px;">'
        '<div style="background:{color};width:{pct}%;height:100%;border-radius:6px;"></div>'
        '</div>'
        '<div style="display:flex;flex-wrap:wrap;gap:2px;">{steps}</div>'
        '</div>'
    )
    return template.format(
        badge=run_badge,
        pct=percent,
        steps="".join(chips),
        color=accent,
    )
def _run_status_html(mode: str = "v1") -> str:
    """Render the run-completion badges for the given mode as HTML.

    A run counts as done when its ``taxonomy.json`` exists under DATA_DIR.
    Mode "v1" shows the abstract, title and combined badges; mode "v2" shows
    the SPECTER2 badge; any other mode yields an empty wrapper div.
    """

    def _pill(text, finished, done_color="#22c55e"):
        # Coloured when finished, grey otherwise; the icon switches likewise.
        background = done_color if finished else "#9ca3af"
        icon = "β " if finished else "β³"
        return (
            '<span style="background:{};color:#fff;padding:3px 12px;'
            'border-radius:10px;font-size:12px;margin:2px;">{} {}</span>'
        ).format(background, icon, text)

    abstract_ready = DATA_DIR.joinpath("abstract", "taxonomy.json").exists()
    title_ready = DATA_DIR.joinpath("title", "taxonomy.json").exists()
    v2_ready = DATA_DIR.joinpath("v2", "taxonomy.json").exists()

    pieces = ['<div style="padding:6px 0;">']
    if mode == "v1":
        pieces.append(
            '<div style="display:flex;flex-wrap:wrap;gap:4px;margin-bottom:4px;">'
        )
        pieces.append(_pill("Abstract Run", abstract_ready))
        pieces.append(_pill("Title Run", title_ready))
        pieces.append(_pill("V1 Outputs", abstract_ready and title_ready))
        pieces.append('</div>')
    if mode == "v2":
        pieces.append('<div style="display:flex;flex-wrap:wrap;gap:4px;">')
        pieces.append(_pill("SPECTER2 Run", v2_ready, "#0891b2"))
        pieces.append('</div>')
    pieces.append('</div>')
    return "".join(pieces)
| def _safe_read_csv(path): | |
| try: | |
| return pd.read_csv(path, encoding="utf-8") | |
| except UnicodeDecodeError: | |
| return pd.read_csv(path, encoding="latin-1") | |
def _summaries_path(run_config: str) -> Path:
    """Return the path of ``summaries.json`` inside the run's data folder."""
    return DATA_DIR.joinpath(run_config, "summaries.json")
def _charts_path(run_config: str) -> Path:
    """Return the path of ``charts.json`` inside the run's data folder."""
    return DATA_DIR.joinpath(run_config, "charts.json")
def _papers_path(run_config: str) -> Path:
    """Return the path of ``papers.csv`` inside the run's data folder."""
    return DATA_DIR.joinpath(run_config, "papers.csv")
def _v2_summaries_path() -> Path:
    """Return the path of ``summaries.json`` inside the ``v2`` data folder."""
    return DATA_DIR.joinpath("v2", "summaries.json")
def _v2_charts_path() -> Path:
    """Return the path of ``charts.json`` inside the ``v2`` data folder."""
    return DATA_DIR.joinpath("v2", "charts.json")
def _active_run_for_table() -> str:
    """Choose which Classic run ("title" or "abstract") the review table shows.

    A run is "in review" when its ``summaries.json`` exists but its
    ``themes.json`` does not. Priority: title in review, then abstract in
    review, then title if it has summaries at all, otherwise abstract.
    """
    title_summaries = _summaries_path("title").exists()
    abstract_summaries = _summaries_path("abstract").exists()
    title_themes = DATA_DIR.joinpath("title", "themes.json").exists()
    abstract_themes = DATA_DIR.joinpath("abstract", "themes.json").exists()

    if title_summaries and not title_themes:
        return "title"
    if abstract_summaries and not abstract_themes:
        return "abstract"
    if title_summaries:
        return "title"
    return "abstract"
def _count_papers_per_topic(run_config: str) -> dict:
    """Estimate how many distinct papers contribute to each topic of a run.

    Sentences are matched back to papers by the first 80 characters of each
    stripped, period-delimited fragment of the paper text column.

    Args:
        run_config: Name of the run folder under DATA_DIR.

    Returns:
        A dict mapping each summary's ``topic_id`` to a paper count.
        Empty when the run has no ``summaries.json``. When ``papers.csv`` is
        missing the count is ``size // 4`` (at least 1). When no usable text
        column exists every count is 0. Otherwise each count is at least 1.
    """
    sp = _summaries_path(run_config)
    pp = _papers_path(run_config)
    if not sp.exists():
        return {}
    summaries = json.loads(sp.read_text())
    if not pp.exists():
        return {s["topic_id"]: max(s.get("size", 0) // 4, 1) for s in summaries}

    papers_df = _safe_read_csv(pp)
    candidates = [
        c for c in papers_df.columns
        if "abstract" in str(c).lower() or "title" in str(c).lower()
    ]
    # Prefer the column named after the run, so an abstract run is not matched
    # against a title column that merely appears first in the CSV. Fall back
    # to the first candidate, which was the previous behaviour.
    wanted = str(run_config).lower()
    text_col = next(
        (c for c in candidates if wanted in str(c).lower()),
        candidates[0] if candidates else None,
    )
    if text_col is None:
        return {s["topic_id"]: 0 for s in summaries}

    # Map each sentence key to the row index of the paper it came from;
    # a later paper overwrites an earlier one that shares the same key.
    sent_to_paper = {}
    for idx, text in enumerate(papers_df[text_col].fillna("")):
        for sent in str(text).split("."):
            key = sent.strip()[:80]
            if key:
                sent_to_paper[key] = idx

    counts = {}
    for s in summaries:
        keys = (sent.strip()[:80] for sent in s.get("sentences", []))
        paper_ids = {sent_to_paper[k] for k in keys if k in sent_to_paper}
        # A topic is never reported with zero papers once papers are loaded.
        counts[s["topic_id"]] = max(len(paper_ids), 1)
    return counts
def _build_review_table(run_config: str = "abstract") -> pd.DataFrame:
    """Build the editable Classic review table for one run.

    Args:
        run_config: Name of the run folder under DATA_DIR.

    Returns:
        A DataFrame with REVIEW_COLUMNS, one row per entry of the run's
        ``summaries.json``; empty when the file is missing or holds nothing.
        "Approve" starts as False and "Rename To" as an empty string.
    """
    summaries_file = _summaries_path(run_config)
    if not summaries_file.exists():
        return pd.DataFrame(columns=REVIEW_COLUMNS)

    topics = json.loads(summaries_file.read_text())
    if not topics:
        return pd.DataFrame(columns=REVIEW_COLUMNS)

    papers_by_topic = _count_papers_per_topic(run_config)
    table_rows = []
    for topic in topics:
        topic_number = int(topic.get("topic_id", 0))
        default_label = "Topic {}".format(topic.get("topic_id", ""))
        evidence = " | ".join(topic.get("top_evidence", [])[:2])
        table_rows.append([
            topic_number,
            str(topic.get("label", default_label)),
            str(evidence),
            int(len(topic.get("sentences", []))),
            int(papers_by_topic.get(topic_number, 0)),
            False,
            "",
            str(topic.get("reasoning", "")),
        ])
    return pd.DataFrame(table_rows, columns=REVIEW_COLUMNS)
def _build_v2_cluster_table() -> pd.DataFrame:
    """Build a read-only display table for v2 clusters.

    Returns:
        A DataFrame with one row per entry of the v2 ``summaries.json``, or
        an empty DataFrame with the same columns when that file is missing.
    """
    cols = ["#", "Cluster Label", "Papers", "Vote Agreement",
            "LLM1 Vote", "LLM2 Vote", "LLM3 Vote", "Top 3 Titles"]
    sp = _v2_summaries_path()
    if not sp.exists():
        return pd.DataFrame(columns=cols)

    summaries = json.loads(sp.read_text())
    rows = [
        [
            int(s.get("cluster_id", 0)),
            str(s.get("label", "Cluster {}".format(s.get("cluster_id", "")))),
            int(s.get("paper_count", 0)),
            str(s.get("vote_agreement", "")),
            str(s.get("llm_vote_1_MISTRAL", "")),
            str(s.get("llm_vote_2_GEMINI", "")),
            str(s.get("llm_vote_3_GROQ", "")),
            # The column is headed "Top 3 Titles", so show up to three; the
            # previous slice of two dropped the third title.
            str(" | ".join(s.get("top3_titles", [])[:3])),
        ]
        for s in summaries
    ]
    return pd.DataFrame(rows, columns=cols)
def _load_chart(chart_type: str, run_config: str, mode: str = "v1") -> str:
    """Return the stored HTML of one chart, or a placeholder paragraph.

    Args:
        chart_type: Requested chart key.
        run_config: Classic run folder to read from when mode is not "v2".
        mode: "v2" reads the v2 charts file; anything else reads the run's.

    Returns:
        The chart HTML. If the requested key is absent, the first stored
        chart is used instead; placeholders cover a missing file or key.
    """
    if mode == "v2":
        charts_file = _v2_charts_path()
    else:
        charts_file = _charts_path(run_config)

    if not charts_file.exists():
        return "<p style='color:#9ca3af;padding:20px;'>Charts appear after clustering completes.</p>"

    stored = json.loads(charts_file.read_text())
    stored_keys = list(stored.keys())
    # v2 charts: scatter, bar; v1: bar, histogram, scatter, treemap
    if chart_type in stored:
        chosen = chart_type
    elif stored_keys:
        chosen = stored_keys[0]
    else:
        chosen = "bar"
    return stored.get(chosen, "<p>Chart not found.</p>")
def _get_download_files(mode: str = "v1"):
    """List the output files that currently exist for the given mode.

    Args:
        mode: "v2" selects the SPECTER2 outputs; anything else the Classic ones.

    Returns:
        A list of path strings in a fixed order, or None when none exist.
    """
    if mode == "v2":
        wanted = [
            DATA_DIR / "comparison_v2.csv",
            DATA_DIR / "v2" / "cluster_audit.csv",
            DATA_DIR / "v2" / "narrative_v2.txt",
            DATA_DIR / "v2" / "summaries.json",
            DATA_DIR / "v2" / "taxonomy.json",
        ]
    else:
        wanted = [
            DATA_DIR / "comparison.csv",
            DATA_DIR / "narrative.txt",
            DATA_DIR / "abstract" / "summaries.json",
            DATA_DIR / "abstract" / "themes.json",
            DATA_DIR / "abstract" / "taxonomy.json",
            DATA_DIR / "title" / "summaries.json",
            DATA_DIR / "title" / "themes.json",
            DATA_DIR / "title" / "taxonomy.json",
        ]

    present = [str(candidate) for candidate in wanted if candidate.exists()]
    if not present:
        return None
    return present
def handle_file_upload(file_path) -> str:
    """Copy an uploaded CSV to ``DATA_DIR/uploaded.csv`` and describe it.

    Args:
        file_path: Path of the uploaded file; a falsy value means no upload.

    Returns:
        An empty string when nothing was uploaded, otherwise a status line
        with the row count and up to eight column names (or just the
        destination when the file cannot be summarised).
    """
    if not file_path:
        return ""

    dest = DATA_DIR / "uploaded.csv"
    source_abs = Path(file_path).resolve()
    target_abs = dest.resolve()
    # Skip the copy when the upload already is the destination file.
    if source_abs != target_abs:
        shutil.copy(str(source_abs), str(target_abs))

    try:
        frame = _safe_read_csv(dest)
        summary = "β CSV saved β {} rows, columns: {}. ".format(
            len(frame), ", ".join(list(frame.columns[:8]))
        )
    except Exception:
        summary = "β CSV saved to {}. ".format(dest)
    return summary + "Select a mode below and type the run command."
def reset_all_data() -> tuple:
    """Delete everything under DATA_DIR and return fresh values for the UI.

    Asks the v2 agent to reset its "default" thread, removes and recreates
    DATA_DIR, then builds the empty/default values for the reset outputs.

    Returns:
        A 14-tuple: chat history, chat input, progress HTML, run-status HTML,
        empty v1 table, empty v2 table, chart placeholder, downloads (None),
        three new thread ids (abstract, title, v2), the current run
        ("abstract"), the table-status banner and the file-status text.
    """
    try:
        reset_thread_v2("default")
    except Exception as exc:
        # Best effort: a failed thread reset must not block clearing the
        # data, but it is reported instead of being silently discarded.
        print("reset_thread_v2 failed during reset:", exc)

    if DATA_DIR.exists():
        shutil.rmtree(str(DATA_DIR))
    DATA_DIR.mkdir(exist_ok=True)

    empty_v1 = pd.DataFrame(columns=REVIEW_COLUMNS)
    empty_v2 = pd.DataFrame(columns=["#", "Cluster Label", "Papers",
                                     "Vote Agreement", "LLM1 Vote",
                                     "LLM2 Vote", "LLM3 Vote", "Top 3 Titles"])
    empty_chart = "<p style='color:#9ca3af;padding:20px;'>Charts appear after clustering.</p>"
    status_msg = (
        "<div style='padding:10px;background:#fef3c7;border-radius:6px;"
        "font-family:sans-serif;font-size:13px;'>"
        "π <b>All data cleared.</b> Upload a new CSV and begin."
        "</div>"
    )
    return (
        [],                      # chatbot
        "",                      # chat input
        make_progress_html(0),   # progress
        _run_status_html("v1"),  # run status
        empty_v1,                # v1 review table
        empty_v2,                # v2 cluster table
        empty_chart,             # chart
        None,                    # downloads
        new_thread_id(),         # abstract thread
        new_thread_id(),         # title thread
        new_thread_id(),         # v2 thread
        "abstract",              # current_run (v1)
        status_msg,              # table_status
        "",                      # file_status
    )
| def _detect_phase(text: str, mode: str = "v1") -> int: | |
| phase_map_v1 = { | |
| "phase 5.5": 5, "phase 6": 6, "phase 5": 4, | |
| "phase 4": 3, "phase 3": 2, "phase 2": 1, "phase 1": 0, | |
| } | |
| phase_map_v2 = { | |
| "phase 5": 4, "phase 4": 3, "phase 3": 2, "phase 2": 1, "phase 1": 0, | |
| "specter2 run complete": 4, | |
| "final outputs": 4, | |
| "pajais mapping": 3, | |
| "council": 2, | |
| "hdbscan": 1, | |
| } | |
| lower = text.lower() | |
| phase_map = phase_map_v1 if mode == "v1" else phase_map_v2 | |
| for key, val in phase_map.items(): | |
| if key in lower: | |
| return val | |
| return 0 | |
| def _detect_run_label(text: str) -> str: | |
| lower = text.lower() | |
| return ( | |
| "TITLE RUN" if "title run" in lower or "title phase" in lower else | |
| "ABSTRACT RUN" if "abstract run" in lower or "abstract phase" in lower else | |
| "SPECTER2 RUN" if "specter" in lower or "v2" in lower else | |
| "" | |
| ) | |
def _stream_agent(user_message: str, thread_id: str, mode: str = "v1") -> str:
    """Send one user message to the selected agent and return its final reply.

    Args:
        user_message: Text to send as the user turn.
        thread_id: Conversation thread id, passed to the agent as
            ``configurable.thread_id`` and to the thread-cleaning function.
        mode: "v1" selects the Classic agent; anything else the SPECTER2 one.

    Returns:
        The content of the last non-empty message seen while streaming,
        "(no response)" if there was none, or an error string when the
        selected agent failed to import. If the reply text looks like a rate
        limit error, the request is retried once after a short pause.
    """
    import time

    agent_obj = agent if mode == "v1" else agent_v2
    clean_fn = clean_thread_history if mode == "v1" else clean_thread_history_v2
    agent_ok = AGENT_V1_OK if mode == "v1" else AGENT_V2_OK
    if not agent_ok:
        return "ERROR: {} agent not loaded. Check terminal.".format(
            "Classic" if mode == "v1" else "SPECTER2"
        )

    def _do_stream() -> str:
        clean_fn(thread_id)
        config = {"configurable": {"thread_id": thread_id}}
        full_reply = ""
        for chunk in agent_obj.stream(
            {"messages": [{"role": "user", "content": user_message}]},
            config=config,
            stream_mode="values",
        ):
            last_msg = chunk["messages"][-1]
            content = getattr(last_msg, "content", "")
            # Content may be a list of parts; flatten it to one string.
            if isinstance(content, list):
                content = " ".join(
                    c.get("text", "") if isinstance(c, dict) else str(c)
                    for c in content
                )
            # Keep the latest non-empty content as the reply.
            if content:
                full_reply = content
        return full_reply or "(no response)"

    result = _do_stream()
    lowered = result.lower()
    is_rate_limited = (
        "429" in result
        or "rate limit" in lowered
        or "rate_limited" in lowered
    )
    if not is_rate_limited:
        return result

    # Pause before the single retry: resending immediately after a rate
    # limit response is likely to be rejected again.
    time.sleep(2.0)
    return _do_stream()
def _generate_final_v1_directly(history: list) -> str:
    """Invoke the comparison-CSV and narrative tools and summarise the result.

    Args:
        history: Chat history; accepted but not used here.

    Returns:
        A one-paragraph summary naming the row count, the first five column
        names and the narrative word count ("?" where a value is missing).
    """
    from tools import generate_comparison_csv, export_narrative

    def _as_dict(raw):
        # Only parse output that looks like a JSON object.
        if raw.strip().startswith("{"):
            return json.loads(raw)
        return {}

    csv_raw = generate_comparison_csv.invoke({})
    narrative_raw = export_narrative.invoke({})
    csv_meta = _as_dict(csv_raw)
    narrative_meta = _as_dict(narrative_raw)

    row_count = csv_meta.get("rows", "?")
    leading_columns = ", ".join(csv_meta.get("columns", [])[:5]) + "..."
    word_count = narrative_meta.get("word_count", "?")

    template = (
        "Both runs complete. Final outputs generated. "
        "comparison.csv has {} rows with columns: {}. "
        "narrative.txt has {} words. "
        "Both files are in the Download tab."
    )
    return template.format(row_count, leading_columns, word_count)
def run_agent(
    user_message: str,
    history: list,
    abstract_thread: str,
    title_thread: str,
    v2_thread: str,
    current_run: str,
    current_mode: str,
) -> tuple:
    """Handle one chat turn and return fresh values for every agent output.

    A blank message only refreshes the panels. Otherwise the message is
    appended to the history, answered either by the direct final-output
    shortcut (Classic mode, both taxonomies present, a trigger word in the
    message, no comparison.csv yet) or by the streaming agent, and the reply
    is appended as the assistant turn.

    Returns:
        A 12-tuple: history, cleared input, progress HTML, run-status HTML,
        v1 review table, v2 cluster table, chart HTML, download files, the
        three thread ids unchanged, and the active Classic run.
    """
    mode = current_mode or "v1"

    def _panels(chat, progress_html, run_name):
        # Read the file-backed widgets only now, after any agent work.
        table_cfg = _active_run_for_table()
        return (
            chat, "",
            progress_html,
            _run_status_html(mode),
            _build_review_table(table_cfg),
            _build_v2_cluster_table(),
            _load_chart("bar", table_cfg, mode),
            _get_download_files(mode),
            abstract_thread, title_thread, v2_thread, run_name,
        )

    if not user_message or not user_message.strip():
        return _panels(history or [], make_progress_html(0, mode=mode), current_run)

    lowered = user_message.lower().strip()

    # Detect run switches
    if "run title" in lowered:
        active_run = "title"
    elif "run abstract" in lowered:
        active_run = "abstract"
    else:
        active_run = current_run

    # v1 shortcut for final outputs
    abstract_finished = (DATA_DIR / "abstract" / "taxonomy.json").exists()
    title_finished = (DATA_DIR / "title" / "taxonomy.json").exists()
    trigger_words = ("yes", "generate", "final", "comparison", "narrative", "output")
    wants_final = any(word in lowered for word in trigger_words)

    history = list(history or [])
    history.append({"role": "user", "content": user_message})

    use_shortcut = (
        mode == "v1"
        and abstract_finished
        and title_finished
        and wants_final
        and not (DATA_DIR / "comparison.csv").exists()
    )
    if use_shortcut:
        reply = _generate_final_v1_directly(history)
    else:
        if mode == "v2":
            target_thread = v2_thread
        elif active_run == "title":
            target_thread = title_thread
        else:
            target_thread = abstract_thread
        reply = _stream_agent(user_message, target_thread, mode=mode)

    history.append({"role": "assistant", "content": reply})
    progress_html = make_progress_html(
        _detect_phase(reply, mode), _detect_run_label(reply), mode
    )
    return _panels(history, progress_html, active_run)
def handle_submit_review(
    review_data,
    history: list,
    abstract_thread: str,
    title_thread: str,
    v2_thread: str,
    current_run: str,
    current_mode: str,
) -> tuple:
    """Turn the edited review table into a theme-consolidation request.

    Approved rows are grouped by theme name ("Rename To" when filled,
    otherwise the topic label) and the groups are sent to the agent through
    ``run_agent``. A missing or empty table, or a table without an Approve
    column, is also reported through ``run_agent``. When no row is approved,
    a short how-to message is added to the chat and no agent call is made.

    Args:
        review_data: Table value as a dict with "data"/"headers", a
            DataFrame, or row data accepted by ``pandas.DataFrame``.

    Returns:
        The same 12-tuple shape that ``run_agent`` returns.
    """

    def _forward(message):
        return run_agent(
            message,
            history, abstract_thread, title_thread, v2_thread, current_run, current_mode
        )

    if review_data is None:
        return _forward("Review table empty β waiting for Phase 2.")

    if isinstance(review_data, dict):
        df = pd.DataFrame(
            review_data.get("data", []),
            columns=review_data.get("headers", REVIEW_COLUMNS)
        )
    elif isinstance(review_data, pd.DataFrame):
        df = review_data.copy()
    else:
        df = pd.DataFrame(review_data, columns=REVIEW_COLUMNS)

    if df.empty:
        return _forward("Review table empty β waiting for Phase 2.")

    # Work with string column names so the lookups below can use str methods.
    df.columns = pd.Index([str(name) for name in df.columns])
    approve_col = next((c for c in df.columns if "approve" in c.lower()), None)
    id_col = next((c for c in df.columns if c.strip() == "#"), df.columns[0])
    label_col = next((c for c in df.columns if "label" in c.lower()), df.columns[1])
    rename_col = next((c for c in df.columns if "rename" in c.lower()), None)
    if approve_col is None:
        return _forward("Cannot find Approve column in table.")

    truthy_words = ("true","1","yes","x","on","β")

    def to_bool(v):
        return v is True or str(v).strip().lower() in truthy_words

    approval_mask = pd.Series([to_bool(v) for v in df[approve_col]], index=df.index)
    approved_df = df[approval_mask]

    if len(approved_df) == 0:
        guide = (
            "β οΈ **No topics approved yet.**\n\n"
            "**To approve topics:**\n"
            "1. Click **π Refresh Table** to load latest topics\n"
            "2. Click the checkbox β in **Approve** column\n"
            "3. Fill **Rename To** with a theme name\n"
            "4. Click **β Submit Review** again"
        )
        history = list(history or [])
        history.append({"role": "assistant", "content": guide})
        cfg = _active_run_for_table()
        return (
            history, "",
            make_progress_html(1),
            _run_status_html("v1"),
            _build_review_table(cfg),
            _build_v2_cluster_table(),
            _load_chart("bar", cfg, "v1"),
            _get_download_files("v1"),
            abstract_thread, title_thread, v2_thread, current_run,
        )

    theme_map: dict = {}
    for position, (_, row) in enumerate(approved_df.iterrows()):
        rename_val = str(row[rename_col]).strip() if rename_col else ""
        if rename_val and rename_val.lower() not in ("", "nan", "none"):
            theme = rename_val
        else:
            theme = str(row[label_col])
        try:
            tid = int(float(str(row[id_col])))
        except (ValueError, TypeError):
            # Unparseable id: fall back to the row's position among approvals.
            tid = position
        theme_map.setdefault(theme, []).append(tid)

    groups = [
        {"theme_name": name, "topic_ids": ids}
        for name, ids in theme_map.items()
    ]
    msg = (
        "Researcher submitted the Review Table for the {} run.\n"
        "{} topics approved, {} themes:\n\n"
        "```json\n{}\n```\n\n"
        "Call consolidate_into_themes with run_config='{}' "
        "and the approved_groups JSON above. Then proceed to Phase 3."
    ).format(
        current_run, len(approved_df), len(groups),
        json.dumps(groups, indent=2), current_run,
    )
    return _forward(msg)
def switch_mode(new_mode: str, current_mode: str, abstract_thread: str, title_thread: str, v2_thread: str, current_run: str) -> tuple:
    """Switch between Classic and SPECTER2 modes, refreshing UI accordingly.

    Only ``new_mode`` affects the result; the other parameters are accepted
    so both mode buttons can share one input list.

    Returns:
        A 9-tuple: progress HTML, run-status HTML, v1 review table, v2
        cluster table, chart HTML, download files, mode description
        markdown, an update for the chart selector, and the new mode.
    """
    cfg = _active_run_for_table()
    mode_label_text = (
        "### π¬ Classic Mode (BERTopic)\n"
        "Run abstract analysis, then title analysis. 6 Braun & Clarke phases each.\n"
        "Commands: **run abstract** β review β **run title** β review β download"
        if new_mode == "v1" else
        "### 𧬠SPECTER2 Mode (Advanced)\n"
        "One combined run per paper (Title+Abstract). UMAP+HDBSCAN clustering. "
        "Council-of-3 LLM labeling with audit trail.\n"
        "Command: **run specter** or **run v2**"
    )
    chart_opts = CHART_OPTIONS if new_mode == "v1" else ["scatter", "bar"]
    # The selector is reset to the first option, so load that same chart;
    # previously "bar" was always loaded, which did not match the selector
    # in SPECTER2 mode where the first option is "scatter".
    selected_chart = chart_opts[0]
    return (
        make_progress_html(0, mode=new_mode),
        _run_status_html(new_mode),
        _build_review_table(cfg),
        _build_v2_cluster_table(),
        _load_chart(selected_chart, cfg, new_mode),
        _get_download_files(new_mode),
        mode_label_text,
        gr.update(choices=chart_opts, value=selected_chart),
        new_mode,
    )
def manual_refresh_table(current_run: str, current_mode: str) -> tuple:
    """Rebuild both result tables from disk.

    Both parameters are accepted but not used; the Classic run to show is
    chosen by ``_active_run_for_table``.

    Returns:
        A pair ``(review_table, v2_cluster_table)``.
    """
    review_frame = _build_review_table(_active_run_for_table())
    cluster_frame = _build_v2_cluster_table()
    return review_frame, cluster_frame
def refresh_chart(chart_type: str, current_run: str, current_mode: str) -> str:
    """Return the HTML of the selected chart for the current mode.

    ``current_run`` is accepted but not used; the Classic run is chosen by
    ``_active_run_for_table``. An empty mode is treated as "v1".
    """
    table_cfg = _active_run_for_table()
    effective_mode = current_mode if current_mode else "v1"
    return _load_chart(chart_type, table_cfg, effective_mode)
def check_status(current_mode: str) -> str:
    """Return an HTML banner describing what the summaries file holds.

    Args:
        current_mode: "v2" inspects the v2 summaries; anything else (empty
            included) inspects the active Classic run.

    Returns:
        A yellow banner when the summaries file is missing, otherwise a
        green banner with the entry count and how many entries are labelled.
    """

    def _banner(background, inner):
        # Shared wrapper for both the yellow and the green banner.
        return (
            "<div style='padding:10px;background:" + background + ";border-radius:6px;"
            "font-family:sans-serif;font-size:13px;'>"
            + inner
            + "</div>"
        )

    mode = current_mode or "v1"

    if mode == "v2":
        summaries_file = _v2_summaries_path()
        if not summaries_file.exists():
            return _banner(
                "#fef3c7",
                "β³ No v2 clusters yet. Type <b>run specter</b> to begin.",
            )
        clusters = json.loads(summaries_file.read_text())
        labeled = 0
        for entry in clusters:
            if entry.get("label","").strip():
                labeled += 1
        return _banner(
            "#dcfce7",
            (
                "β <b>{} clusters</b> in <code>data/v2/</code> ({} labeled). "
                "Click π Refresh to display."
            ).format(len(clusters), labeled),
        )

    cfg = _active_run_for_table()
    summaries_file = _summaries_path(cfg)
    if not summaries_file.exists():
        return _banner(
            "#fef3c7",
            "β³ No topics yet. Upload CSV then type <b>run abstract</b>.",
        )
    topics = json.loads(summaries_file.read_text())
    labeled = 0
    for entry in topics:
        # Labels that still start with "Topic " are not counted.
        if entry.get("label","").strip() and not entry.get("label","").startswith("Topic "):
            labeled += 1
    return _banner(
        "#dcfce7",
        (
            "β <b>{} topics</b> from <code>data/{}/</code> ({} LLM-labelled). "
            "Click π Refresh Table."
        ).format(len(topics), cfg, labeled),
    )
print("Step 4: building UI...")

# -- UI ----------------------------------------------------------------------
# NOTE(review): the source this was recovered from had lost its indentation;
# the container nesting below is reconstructed and should be confirmed.
with gr.Blocks(
    title="BERTopic / SPECTER2 Thematic Analysis Agent",
    css="""
    .mode-btn-active { border: 2px solid #4f46e5 !important; background: #eef2ff !important; }
    .mode-btn-v2-active { border: 2px solid #0891b2 !important; background: #ecfeff !important; }
    """
) as demo:
    # Per-session state: one thread id per run, plus the active run and mode.
    abstract_thread_state = gr.State(new_thread_id())
    title_thread_state = gr.State(new_thread_id())
    v2_thread_state = gr.State(new_thread_id())
    current_run_state = gr.State("abstract")
    current_mode_state = gr.State("v1")

    gr.Markdown(
        "# π¬ Thematic Analysis Agent\n"
        "**Braun & Clarke (2006)** Β· SPECTER2 Β· PAJAIS Taxonomy Β· Systematic Literature Review"
    )
    progress_bar = gr.HTML(make_progress_html(0))
    run_status = gr.HTML(_run_status_html("v1"))

    # -- MODE SELECTOR -------------------------------------------------------
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### π Analysis Mode")
            with gr.Row():
                btn_v1 = gr.Button(
                    "π Classic (BERTopic)\nAbstract + Title runs",
                    variant="primary", size="sm",
                )
                btn_v2 = gr.Button(
                    "𧬠SPECTER2 (Advanced)\nCombined T+A · HDBSCAN · Council-3-LLMs",
                    variant="secondary", size="sm",
                )
    mode_description = gr.Markdown(
        "### π Classic Mode (BERTopic)\n"
        "Run abstract analysis, then title analysis. 6 Braun & Clarke phases each.\n"
        "Commands: **run abstract** β review β **run title** β review β download"
    )
    gr.HTML("""
    <div style="background:#f0fdf4;border:1px solid #86efac;border-radius:8px;
    padding:10px 14px;font-family:sans-serif;font-size:13px;margin:4px 0;">
    <b>Classic:</b>
    1οΈβ£ Upload CSV β 2οΈβ£ <code>run abstract</code> β
    3οΈβ£ Review Table β 4οΈβ£ <code>run title</code> β 5οΈβ£ Download
    |
    <b>SPECTER2:</b>
    1οΈβ£ Upload CSV β 2οΈβ£ <code>run specter</code> β 3οΈβ£ Download
    </div>
    """)

    # -- Section 1 -----------------------------------------------------------
    with gr.Accordion("π Section 1 β Data Input", open=True):
        def _startup_msg():
            """Return the banner shown at start-up.

            Reports leftover data when an uploaded CSV or any taxonomy file
            already exists under DATA_DIR; otherwise a fresh-session notice.
            """
            abs_done = (DATA_DIR / "abstract" / "taxonomy.json").exists()
            title_done = (DATA_DIR / "title" / "taxonomy.json").exists()
            v2_done = (DATA_DIR / "v2" / "taxonomy.json").exists()
            csv_exists = (DATA_DIR / "uploaded.csv").exists()
            has_data = csv_exists or abs_done or title_done or v2_done
            return (
                "<div style='padding:10px;background:#fef3c7;border:1px solid #fcd34d;"
                "border-radius:8px;font-family:sans-serif;font-size:13px;'>"
                "β οΈ <b>Previous session data detected.</b> "
                "Abstract: {abs} | Title: {title} | "
                "SPECTER2: {v2} | CSV: {csv}<br>"
                "Click <b>ποΈ Reset</b> to clear or continue from where you left off."
                "</div>"
                if has_data else
                "<div style='padding:10px;background:#f0fdf4;border:1px solid #86efac;"
                "border-radius:8px;font-family:sans-serif;font-size:13px;'>"
                "β Fresh session β upload your CSV to begin."
                "</div>"
            ).format(
                abs="β " if abs_done else "β³",
                title="β " if title_done else "β³",
                v2="β " if v2_done else "β³",
                csv="β " if csv_exists else "β",
            )

        startup_banner = gr.HTML(_startup_msg())
        with gr.Row():
            file_input = gr.File(
                label="Upload Scopus CSV", file_types=[".csv"],
                type="filepath", scale=4,
            )
            reset_btn = gr.Button(
                "ποΈ Reset & Start Fresh",
                variant="stop", scale=1, size="sm",
            )
        file_status = gr.Textbox(label="Upload status", interactive=False, lines=2)
        file_input.change(fn=handle_file_upload, inputs=file_input, outputs=file_status)

    # -- Section 2 -----------------------------------------------------------
    with gr.Accordion("π¬ Section 2 β Agent Conversation", open=True):
        gr.HTML("""
        <div style="background:#fafafa;border:1px solid #e5e7eb;border-radius:6px;
        padding:8px 12px;font-size:12px;font-family:monospace;margin-bottom:6px;">
        Classic: <b>run abstract</b> | <b>run title</b> | <b>yes</b> | <b>satisfied</b> | <b>confirm</b>
        |
        SPECTER2: <b>run specter</b> | <b>run v2</b> | <b>yes</b>
        </div>
        """)
        chatbot = gr.Chatbot(label="Agent", height=500)
        with gr.Row():
            chat_input = gr.Textbox(
                label="Message",
                placeholder="e.g. run abstract or run specter",
                lines=2, scale=5,
            )
            send_btn = gr.Button("Send β€", variant="primary", scale=1)

    # -- Section 3 -----------------------------------------------------------
    with gr.Accordion("π Section 3 β Results", open=True):
        with gr.Tabs():
            with gr.Tab("π Review Table (Classic)"):
                gr.HTML("""
                <div style="background:#eff6ff;border:1px solid #bfdbfe;border-radius:8px;
                padding:8px 12px;font-family:sans-serif;font-size:13px;">
                After Phase 2 (Classic): Refresh β tick Approve β fill Rename To β Submit Review
                </div>
                """)
                table_status = gr.HTML(
                    "<div style='padding:8px;color:#6b7280;font-size:13px;'>"
                    "Complete Phase 2 (Classic) then Refresh.</div>"
                )
                with gr.Row():
                    refresh_btn = gr.Button("π Refresh Table", variant="secondary", scale=2)
                    check_status_btn = gr.Button("π Check Status", variant="secondary", scale=1)
                review_table = gr.Dataframe(
                    value=pd.DataFrame(columns=REVIEW_COLUMNS),
                    headers=REVIEW_COLUMNS,
                    datatype=["number","str","str","number","number","bool","str","str"],
                    interactive=True, wrap=True,
                    label="Topic Review Table (Classic Mode)",
                )
                submit_review_btn = gr.Button("β Submit Review", variant="primary", size="lg")

            with gr.Tab("𧬠Cluster View (SPECTER2)"):
                gr.HTML("""
                <div style="background:#ecfeff;border:1px solid #a5f3fc;border-radius:8px;
                padding:8px 12px;font-family:sans-serif;font-size:13px;">
                Clusters appear after Phase 3 (Council Labeling) completes. Read-only β no manual review needed.
                Download the <b>cluster_audit.csv</b> for full LLM voting details.
                </div>
                """)
                with gr.Row():
                    refresh_v2_btn = gr.Button("π Refresh Clusters", variant="secondary", scale=2)
                    check_v2_btn = gr.Button("π Check V2 Status", variant="secondary", scale=1)
                v2_cluster_table = gr.Dataframe(
                    value=pd.DataFrame(columns=["#", "Cluster Label", "Papers",
                                                "Vote Agreement", "LLM1 Vote",
                                                "LLM2 Vote", "LLM3 Vote", "Top 3 Titles"]),
                    headers=["#", "Cluster Label", "Papers", "Vote Agreement",
                             "LLM1 Vote", "LLM2 Vote", "LLM3 Vote", "Top 3 Titles"],
                    datatype=["number","str","number","str","str","str","str","str"],
                    interactive=False, wrap=True,
                    label="SPECTER2 Cluster Table (Read-only)",
                )

            with gr.Tab("π Charts"):
                chart_selector = gr.Dropdown(
                    choices=CHART_OPTIONS, value="bar",
                    label="Select Chart", interactive=True,
                )
                chart_display = gr.HTML(
                    "<p style='color:#9ca3af;padding:20px;'>Charts appear after clustering.</p>"
                )
                chart_selector.change(
                    fn=refresh_chart,
                    inputs=[chart_selector, current_run_state, current_mode_state],
                    outputs=chart_display,
                )

            with gr.Tab("β¬οΈ Download"):
                gr.Markdown(
                    "**Classic outputs** appear after both abstract+title runs complete.\n\n"
                    "**SPECTER2 outputs** appear after v2 run completes:\n"
                    "- `comparison_v2.csv` β one row per paper with cluster + PAJAIS\n"
                    "- `cluster_audit.csv` β full LLM voting record, per paper\n"
                    "- `narrative_v2.txt` β 500-word Section 7 discussion\n"
                    "> π‘ **Cache:** `data/v2/llm_cache/` stores LLM responses β "
                    "delete this folder to force fresh labels on re-run.\n"
                )
                download_files = gr.File(
                    label="Output Files", file_count="multiple", interactive=False,
                )

    # -- wire up: combined outputs -------------------------------------------
    # Each list must match, in order, the tuple returned by its handler.
    agent_outputs = [
        chatbot, chat_input, progress_bar, run_status,
        review_table, v2_cluster_table, chart_display, download_files,
        abstract_thread_state, title_thread_state, v2_thread_state, current_run_state,
    ]
    reset_outputs = [
        chatbot, chat_input, progress_bar, run_status,
        review_table, v2_cluster_table, chart_display, download_files,
        abstract_thread_state, title_thread_state, v2_thread_state, current_run_state,
        table_status, file_status,
    ]
    mode_switch_outputs = [
        progress_bar, run_status,
        review_table, v2_cluster_table,
        chart_display, download_files,
        mode_description, chart_selector,
        current_mode_state,
    ]

    send_btn.click(
        fn=run_agent,
        inputs=[chat_input, chatbot,
                abstract_thread_state, title_thread_state, v2_thread_state,
                current_run_state, current_mode_state],
        outputs=agent_outputs,
    )
    chat_input.submit(
        fn=run_agent,
        inputs=[chat_input, chatbot,
                abstract_thread_state, title_thread_state, v2_thread_state,
                current_run_state, current_mode_state],
        outputs=agent_outputs,
    )
    submit_review_btn.click(
        fn=handle_submit_review,
        inputs=[review_table, chatbot,
                abstract_thread_state, title_thread_state, v2_thread_state,
                current_run_state, current_mode_state],
        outputs=agent_outputs,
    )
    reset_btn.click(
        fn=reset_all_data,
        inputs=[],
        outputs=reset_outputs,
    )
    btn_v1.click(
        fn=lambda m, at, tt, vt, cr: switch_mode("v1", m, at, tt, vt, cr),
        inputs=[current_mode_state, abstract_thread_state, title_thread_state,
                v2_thread_state, current_run_state],
        outputs=mode_switch_outputs,
    )
    btn_v2.click(
        fn=lambda m, at, tt, vt, cr: switch_mode("v2", m, at, tt, vt, cr),
        inputs=[current_mode_state, abstract_thread_state, title_thread_state,
                v2_thread_state, current_run_state],
        outputs=mode_switch_outputs,
    )
    refresh_btn.click(
        fn=manual_refresh_table,
        inputs=[current_run_state, current_mode_state],
        outputs=[review_table, v2_cluster_table],
    )
    refresh_v2_btn.click(
        fn=manual_refresh_table,
        inputs=[current_run_state, current_mode_state],
        outputs=[review_table, v2_cluster_table],
    )
    check_status_btn.click(
        fn=check_status,
        inputs=[current_mode_state],
        outputs=[table_status],
    )
    # This handler takes no arguments, so it must be wired with no inputs:
    # passing current_mode_state made the click fail with a TypeError.
    check_v2_btn.click(
        fn=lambda: check_status("v2"),
        inputs=[],
        outputs=[table_status],
    )
print("Step 5: UI built OK, launching...")

if __name__ == "__main__":
    # (major, minor) of the installed Gradio, used to gate launch options.
    _v = tuple(int(part) for part in gr.__version__.split(".")[:2])
    print("Gradio version:", gr.__version__)

    _kwargs = dict(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        inbrowser=False,
    )
    # ssr_mode is only passed to Gradio 5.0 and newer.
    if _v >= (5, 0):
        _kwargs["ssr_mode"] = False
    print("Running at http://0.0.0.0:7860")

    import subprocess
    import sys

    # Start check_keys.py as a separate process; it is not waited on.
    subprocess.Popen([sys.executable, "check_keys.py"])
    demo.launch(**_kwargs)