""" app.py — Gradio UI entry point. ORIGINAL structure and all tabs preserved. NEW: second file upload for methodology CSV, technique sheets 1-4, journal cross-tabulation chart + table, technique optimisation log. """ import os, json import re import pandas as pd, numpy as np import gradio as gr import plotly.express as px import plotly.graph_objects as go from agent import run_pipeline, METHODOLOGY_PATTERNS, TECHNIQUE_PATTERNS # ── CSV preview ────────────────────────────────────────────────────────────── def _preview(file): if not file: return "Upload a Scopus CSV to begin." df = pd.read_csv(file.name) df.columns = df.columns.str.lower() has_t = "title" in df.columns has_a = "abstract" in df.columns n = len(df) blanks_t = int(df["title"].isna().sum()) if has_t else n blanks_a = int(df["abstract"].isna().sum()) if has_a else n ok = "✅" if has_t and has_a and blanks_t < n and blanks_a < n else "❌" return (f"## {ok} CSV loaded — {n} entries\n\n" f"| Column | Present | Blank rows |\n|---|---|---|\n" f"| title | {'✅' if has_t else '❌'} | {blanks_t} |\n" f"| abstract | {'✅' if has_a else '❌'} | {blanks_a} |\n\n" f"**Usable papers:** {n - max(blanks_t, blanks_a)} / {n}") def _preview_methodology(file): if not file: return "Upload methodology CSV (title, doi, methodology) to enable technique analysis." 
df = pd.read_csv(file.name) df.columns = df.columns.str.lower() has_t = "title" in df.columns has_m = "methodology" in df.columns has_d = "doi" in df.columns n = len(df) ok = "✅" if has_t and has_m else "❌" return (f"## {ok} Methodology CSV — {n} papers\n\n" f"| Column | Present |\n|---|---|\n" f"| title | {'✅' if has_t else '❌'} |\n" f"| doi | {'✅' if has_d else '⚠ optional'} |\n" f"| methodology | {'✅' if has_m else '❌'} |\n\n" f"Journals will be auto-detected from DOI + title.") # ── Original helper builders ───────────────────────────────────────────────── def _top_papers_df(top_papers: dict) -> pd.DataFrame: rows = [] for cid in sorted(top_papers.keys()): for p in top_papers[cid]: rows.append({"Cluster": cid, "Label": p["cluster_label"], "Rank": p["rank"], "Title": p["title"], "Abstract Snippet": p["abstract_snippet"]}) return pd.DataFrame(rows) def _methodology_summary_df(methodology_data: dict, interps: dict) -> pd.DataFrame: rows = [] for cid in sorted(methodology_data.keys()): md = methodology_data[cid] label = interps.get(cid, {}).get("label", f"Cluster {cid}") rows.append({ "Cluster": cid, "Label": label, "Dominant Method": md.get("dominant_method", "—"), "Dominant Technique": md.get("dominant_technique", "—"), "Empirical %": md.get("empirical_pct", 0), "Theoretical %": md.get("theoretical_pct", 0), "Mixed %": md.get("mixed_pct", 0), "Methods (≥2 LLMs)": ", ".join( f"{m['name']} ({m['pct']}%, {m['agreement']})" for m in md.get("methodologies", [])), "Techniques (≥2 LLMs)": ", ".join( f"{t['name']} ({t['pct']}%, {t['agreement']})" for t in md.get("techniques", [])), "Regex Confirmed": ", ".join(md.get("regex_confirmed_consensus", [])) or "—", "Regex Rejected": ", ".join(md.get("regex_rejected_consensus", [])) or "—", }) return pd.DataFrame(rows) def _extraction_pipeline_df(methodology_data: dict, interps: dict) -> pd.DataFrame: rows = [] for cid in sorted(methodology_data.keys()): md = methodology_data[cid] label = interps.get(cid, {}).get("label", 
f"Cluster {cid}") scan = md.get("regex_scan", {}) for item in md.get("methodologies", []) + md.get("techniques", []): name = item["name"] regex_hits= scan.get("methods",{}).get(name,[]) or scan.get("techniques",{}).get(name,[]) matched = ", ".join(dict.fromkeys(h["match"] for h in regex_hits))[:80] if regex_hits else "—" rows.append({"Cluster": cid, "Label": label, "Item": name, "Type": "Method" if item in md.get("methodologies",[]) else "Technique", "Regex Match":matched, "Regex Fired": "✅" if regex_hits else "❌", "LLM Votes": item["llm_votes"], "Agreement": item["agreement"], "Avg Pct (%)":item["pct"], "Evidence": item.get("evidence","—"), "Gate Passed":"✅ ACCEPTED"}) for item in md.get("rejected_methods",[]) + md.get("rejected_techniques",[]): name = item["name"] regex_hits= scan.get("methods",{}).get(name,[]) or scan.get("techniques",{}).get(name,[]) matched = ", ".join(dict.fromkeys(h["match"] for h in regex_hits))[:80] if regex_hits else "—" rows.append({"Cluster": cid, "Label": label, "Item": name, "Type": "Method" if item in md.get("rejected_methods",[]) else "Technique", "Regex Match":matched, "Regex Fired": "✅" if regex_hits else "❌", "LLM Votes": item["llm_votes"], "Agreement": item["agreement"], "Avg Pct (%)":item["pct"], "Evidence": item.get("evidence","—"), "Gate Passed":"❌ REJECTED (single LLM)"}) return pd.DataFrame(rows) if rows else pd.DataFrame() def _per_llm_methodology_df(methodology_data: dict, interps: dict) -> pd.DataFrame: rows = [] for cid in sorted(methodology_data.keys()): md = methodology_data[cid] label = interps.get(cid,{}).get("label", f"Cluster {cid}") raw = md.get("llm_raw",{}) def _fmt(r, key): return " | ".join(f"{i['name']} ({i.get('pct',0)}%)" for i in r.get(key,[])) or "—" rows.append({"Cluster": cid, "Label": label, "Groq Methods": _fmt(raw.get("groq",{}), "methodologies"), "Mistral Methods": _fmt(raw.get("mistral",{}), "methodologies"), "Gemini Methods": _fmt(raw.get("gemini",{}), "methodologies"), "Groq Techniques": 
_fmt(raw.get("groq",{}), "techniques"), "Mistral Techniques": _fmt(raw.get("mistral",{}), "techniques"), "Gemini Techniques": _fmt(raw.get("gemini",{}), "techniques"), "Groq E/T/M": f"{raw.get('groq',{}).get('empirical_pct',0)}/" f"{raw.get('groq',{}).get('theoretical_pct',0)}/" f"{raw.get('groq',{}).get('mixed_pct',0)}", "Mistral E/T/M": f"{raw.get('mistral',{}).get('empirical_pct',0)}/" f"{raw.get('mistral',{}).get('theoretical_pct',0)}/" f"{raw.get('mistral',{}).get('mixed_pct',0)}", "Gemini E/T/M": f"{raw.get('gemini',{}).get('empirical_pct',0)}/" f"{raw.get('gemini',{}).get('theoretical_pct',0)}/" f"{raw.get('gemini',{}).get('mixed_pct',0)}", }) return pd.DataFrame(rows) def _regex_hits_df(methodology_data: dict, interps: dict) -> pd.DataFrame: rows = [] for cid in sorted(methodology_data.keys()): md = methodology_data[cid] label = interps.get(cid,{}).get("label", f"Cluster {cid}") scan = md.get("regex_scan",{}) for category, hits in scan.get("methods",{}).items(): for h in hits: rows.append({"Cluster": cid, "Label": label, "Bank": "Methodology", "Pattern Category": category, "Matched Text": h["match"], "Paper #": h["doc"], "Char Span": f"{h['span'][0]}–{h['span'][1]}"}) for category, hits in scan.get("techniques",{}).items(): for h in hits: rows.append({"Cluster": cid, "Label": label, "Bank": "Technique", "Pattern Category": category, "Matched Text": h["match"], "Paper #": h["doc"], "Char Span": f"{h['span'][0]}–{h['span'][1]}"}) return pd.DataFrame(rows) if rows else pd.DataFrame() def _methodology_bar_chart(methodology_data: dict, interps: dict) -> go.Figure: labels_list, empirical, theoretical, mixed = [], [], [], [] for cid in sorted(methodology_data.keys()): md = methodology_data[cid] labels_list.append(interps.get(cid,{}).get("label", f"C{cid}")[:30]) empirical.append(md.get("empirical_pct", 0)) theoretical.append(md.get("theoretical_pct", 0)) mixed.append(md.get("mixed_pct", 0)) fig = go.Figure() fig.add_trace(go.Bar(name="Empirical %", x=labels_list, 
y=empirical, marker_color="#3dba7a")) fig.add_trace(go.Bar(name="Theoretical %", x=labels_list, y=theoretical, marker_color="#5b9cf6")) fig.add_trace(go.Bar(name="Mixed %", x=labels_list, y=mixed, marker_color="#f5a623")) fig.update_layout(barmode="stack", template="plotly_dark", height=420, paper_bgcolor="#0d1117", plot_bgcolor="#161b22", title="Research Orientation per Cluster — Averaged across Groq + Mistral + Gemini", xaxis_title="Cluster", yaxis_title="Percentage (%)", font=dict(size=11), legend=dict(orientation="h", y=1.12), xaxis_tickangle=-35) return fig def _refinement_df(rl: list) -> pd.DataFrame: if not rl: return pd.DataFrame(columns=["Cluster","Iteration","Old Label","New Label", "Issues","Improvement","Hallucination Detected"]) return pd.DataFrame([{ "Cluster": r["cluster"], "Iteration": r["iteration"], "Old Label": r["old_label"], "New Label": r["new_label"], "Issues": "; ".join(r.get("issues",[])), "Improvement": r["improvement_score"], "Hallucination Detected": r["hallucination_detected"], } for r in rl]) def _regex_pattern_info() -> str: m_list = "\n".join(f"- **{k}**: `{v.pattern}`" for k,v in METHODOLOGY_PATTERNS.items()) t_list = "\n".join(f"- **{k}**: `{v.pattern}`" for k,v in TECHNIQUE_PATTERNS.items()) return ( "### How Cluster Methodology Extraction Works\n\n" "**Step 1 — Regex Pre-Scan:** Two compiled pattern banks run against representative " "abstracts. Every match recorded with exact character span, matched text, paper number.\n\n" "**Step 2 — 3-LLM Council:** Groq, Mistral, Gemini each receive regex evidence + abstracts. " "Each LLM confirms/rejects regex hits and adds any missed methods/techniques.\n\n" "**Step 3 — ≥2-LLM Gate:** Only items named by ≥2 LLMs survive. 
Percentages averaged.\n\n" "**Step 4 — Orientation:** Empirical/Theoretical/Mixed averaged across 3 LLMs.\n\n" "---\n\n#### Methodology Bank\n" + m_list + "\n\n#### Technique Bank\n" + t_list) # ── NEW helpers for methodology-CSV pipeline ───────────────────────────────── def _tech_sheet_df(sheet_rows: list) -> pd.DataFrame: return pd.DataFrame(sheet_rows) if sheet_rows else pd.DataFrame() def _tech_llm_pct_chart(comp_sheets: dict) -> go.Figure: """ Grouped bar: for each technique, show the % of papers it was found in by each of the 3 LLMs (Groq, Mistral, Gemini) + Consolidated. """ s1 = comp_sheets.get(1, []) s2 = comp_sheets.get(2, []) s3 = comp_sheets.get(3, []) s4 = comp_sheets.get(4, []) def _freq(rows): counts = {} n = len(rows) or 1 for row in rows: for t in (row.get("techniques","") or "").split(", "): t = t.strip().title() if t and t != "—": counts[t] = counts.get(t,0) + 1 return {k: round(v/n*100) for k,v in counts.items()} f1 = _freq(s1); f2 = _freq(s2); f3 = _freq(s3); f4 = _freq(s4) all_techs = sorted(set(f1)|set(f2)|set(f3)|set(f4)) fig = go.Figure() fig.add_trace(go.Bar(name="Groq", x=all_techs, y=[f1.get(t,0) for t in all_techs], marker_color="#5b9cf6")) fig.add_trace(go.Bar(name="Mistral", x=all_techs, y=[f2.get(t,0) for t in all_techs], marker_color="#f5a623")) fig.add_trace(go.Bar(name="Gemini", x=all_techs, y=[f3.get(t,0) for t in all_techs], marker_color="#a855f7")) fig.add_trace(go.Bar(name="Consolidated", x=all_techs, y=[f4.get(t,0) for t in all_techs], marker_color="#3dba7a")) fig.update_layout(barmode="group", template="plotly_dark", height=480, paper_bgcolor="#0d1117", plot_bgcolor="#161b22", title="Computational Technique Frequency — % of Papers per LLM (Groq / Mistral / Gemini / Consolidated)", xaxis_title="Technique", yaxis_title="% of papers", font=dict(size=10), legend=dict(orientation="h", y=1.12), xaxis_tickangle=-40) return fig def _journal_crosstab_chart(journal_crosstab: dict) -> go.Figure: """ Grouped bar: for each technique, 
show % usage per journal. Journals on x-axis, techniques as bar groups. """ ct = journal_crosstab.get("consolidated", {}) journals = journal_crosstab.get("journals", []) techniques= journal_crosstab.get("techniques", []) if not journals or not techniques: fig = go.Figure() fig.update_layout(template="plotly_dark", title="No journal data available", paper_bgcolor="#0d1117") return fig COLORS = ["#5b9cf6","#3dba7a","#f5a623","#e04d4d","#a855f7","#06b6d4", "#f97316","#84cc16","#ec4899","#14b8a6","#8b5cf6","#ef4444"] fig = go.Figure() for i, tech in enumerate(techniques[:15]): # cap at 15 techniques for readability pcts = [ct.get(j,{}).get(tech, 0) for j in journals] fig.add_trace(go.Bar(name=tech, x=journals, y=pcts, marker_color=COLORS[i % len(COLORS)])) fig.update_layout(barmode="group", template="plotly_dark", height=500, paper_bgcolor="#0d1117", plot_bgcolor="#161b22", title="Computational Technique Usage — Cross-Tabulation by Journal (%)", xaxis_title="Journal", yaxis_title="% of papers using technique", font=dict(size=10), legend=dict(orientation="h", y=1.15), xaxis_tickangle=-20) return fig def _journal_crosstab_df(journal_crosstab: dict) -> pd.DataFrame: ct = journal_crosstab.get("consolidated", {}) journals = journal_crosstab.get("journals", []) techniques= journal_crosstab.get("techniques", []) paper_counts = journal_crosstab.get("journal_paper_counts", {}) rows = [] for j in journals: row = {"Journal": j, "N Papers": paper_counts.get(j,0)} for t in techniques: row[t] = f"{ct.get(j,{}).get(t,0)}%" rows.append(row) return pd.DataFrame(rows) def _tech_opt_df(opt_log: list) -> pd.DataFrame: if not opt_log: return pd.DataFrame(columns=["Technique","Refined Name","Hallucination", "High Variance","Groq %","Mistral %","Gemini %", "Suggestion","Split Into","Merge With"]) return pd.DataFrame([{ "Technique": r["technique"], "Refined Name": r["refined_name"], "Hallucination": r["is_hallucination"], "High Variance": r["high_variance"], "Groq %": r["pct_groq"], "Mistral 
%": r["pct_mistral"], "Gemini %": r["pct_gemini"], "Suggestion": r["suggestion"], "Split Into": r["split_into"], "Merge With": r["merge_with"], } for r in opt_log]) def _per_llm_freq_df(journal_crosstab: dict) -> pd.DataFrame: """Per-LLM technique frequency across all papers in methodology CSV.""" per_llm = journal_crosstab.get("per_llm_freq", {}) techniques = sorted(set(t for d in per_llm.values() for t in d.keys())) rows = [] for t in techniques: rows.append({ "Technique": t, "Groq %": per_llm.get("Groq",{}).get(t, 0), "Mistral %": per_llm.get("Mistral",{}).get(t, 0), "Gemini %": per_llm.get("Gemini",{}).get(t, 0), "Variance": round(max( per_llm.get("Groq",{}).get(t,0), per_llm.get("Mistral",{}).get(t,0), per_llm.get("Gemini",{}).get(t,0), ) - min( per_llm.get("Groq",{}).get(t,0), per_llm.get("Mistral",{}).get(t,0), per_llm.get("Gemini",{}).get(t,0), )), }) return pd.DataFrame(rows).sort_values("Groq %", ascending=False) # ── NEW: Cluster Sizes bar chart (what supervisor pointed to) ──────────────── def _cluster_sizes_chart(interps: dict, disc: dict) -> go.Figure: """ Bar chart: Papers per Cluster — coloured by discipline rule status. Green = passes both constraints (mass ≤ 25%, size ≥ 5). Yellow = exceeds 25% mass cap (dominant cluster warning). Red = below min-size of 5 (too small). Number label shown on top of each bar, exactly like supervisor's image. 
""" cluster_sizes = disc.get("cluster_sizes", {}) n_docs = sum(cluster_sizes.values()) or 1 max_allowed = int(0.25 * n_docs) labels, sizes, colors, texts = [], [], [], [] for cid in sorted(interps.keys()): label = interps[cid]["label"] size = cluster_sizes.get(cid, interps[cid].get("strong",0) + interps[cid].get("weak",0)) mass_pct = size / n_docs color = "#3dba7a" # green — PASS if mass_pct > 0.25: color = "#f5c518" # yellow — mass violation (like supervisor image) elif size < 5: color = "#e04d4d" # red — too small labels.append(label) sizes.append(size) colors.append(color) texts.append(str(size)) fig = go.Figure(go.Bar( x=labels, y=sizes, marker_color=colors, text=texts, textposition="outside", textfont=dict(size=11, color="#c9d1d9"), )) fig.add_hline(y=max_allowed, line_dash="dash", line_color="#f5a623", annotation_text=f"25% cap ({max_allowed} papers)", annotation_font_color="#f5a623") fig.update_layout( template="plotly_dark", height=520, paper_bgcolor="#0d1117", plot_bgcolor="#161b22", title="Cluster Sizes (Papers per Cluster) — Green=PASS · Yellow=Mass>25% · Red=Size<5", xaxis_title="Cluster", yaxis_title="Number of Papers", font=dict(size=10), xaxis_tickangle=-40, showlegend=False, margin=dict(t=80, b=200), ) return fig # ── NEW: Reproducibility panel ──────────────────────────────────────────────── def _reproducibility_df(td: dict, interps: dict) -> pd.DataFrame: """ Shows what the supervisor means by 'run again and again, topic list is same'. Pulls the stability ARI (already computed across 3 seeds in tools.py) and shows per-cluster persistence as a proxy for how stable each cluster is. High persistence = cluster survives across seeds = reproducible. Low persistence = cluster may disappear or merge on re-run. 
""" cluster_persistence = td.get("cluster_persistence", {}) overall_stability = td["metrics"].get("stability", 0.0) rows = [] for cid in sorted(interps.keys()): pers = cluster_persistence.get(cid, 0.0) label = interps[cid]["label"] size = interps[cid].get("strong",0) + interps[cid].get("weak",0) stable_verdict = "✅ Stable" if pers >= 0.7 else \ "⚠ Borderline" if pers >= 0.4 else \ "❌ Fragile" rows.append({ "Cluster": cid, "Label": label, "Cluster Persistence": round(pers, 4), "Strong Members": interps[cid].get("strong", 0), "Weak Members": interps[cid].get("weak", 0), "Total Papers": size, "Stability Verdict": stable_verdict, "Note": ("Likely same label on re-run" if pers >= 0.7 else "Label may shift slightly" if pers >= 0.4 else "May merge/split on re-run — consider merging with adjacent cluster"), }) df = pd.DataFrame(rows).sort_values("Cluster Persistence", ascending=False) # Prepend overall ARI row overall_row = pd.DataFrame([{ "Cluster": "ALL", "Label": f"Overall ARI Stability across 3 seeds = {round(overall_stability,4)}", "Cluster Persistence": overall_stability, "Strong Members": "—", "Weak Members": "—", "Total Papers": "—", "Stability Verdict": "✅ Stable" if overall_stability >= 0.8 else "⚠ Borderline" if overall_stability >= 0.5 else "❌ Unstable", "Note": "ARI close to 1.0 → running the pipeline again will produce the same clusters", }]) return pd.concat([overall_row, df], ignore_index=True) def _reproducibility_chart(td: dict, interps: dict) -> go.Figure: """Horizontal bar of cluster persistence — shows which clusters are stable.""" cluster_persistence = td.get("cluster_persistence", {}) labels, persis, colors = [], [], [] for cid in sorted(interps.keys(), key=lambda c: cluster_persistence.get(c,0)): p = cluster_persistence.get(cid, 0.0) labels.append(interps[cid]["label"][:35]) persis.append(round(p, 4)) colors.append("#3dba7a" if p >= 0.7 else "#f5a623" if p >= 0.4 else "#e04d4d") fig = go.Figure(go.Bar( x=persis, y=labels, orientation="h", 
marker_color=colors, text=[str(v) for v in persis], textposition="outside", )) fig.add_vline(x=0.7, line_dash="dot", line_color="#3dba7a", annotation_text="Stable threshold (0.7)") fig.add_vline(x=0.4, line_dash="dot", line_color="#f5a623", annotation_text="Borderline (0.4)") fig.update_layout( template="plotly_dark", height=max(400, len(labels)*28), paper_bgcolor="#0d1117", plot_bgcolor="#161b22", title="Cluster Persistence — Proxy for Reproducibility\n" "Green ≥ 0.7 (stable) · Orange 0.4–0.7 (borderline) · Red < 0.4 (fragile)", xaxis_title="Persistence Score", yaxis_title="", font=dict(size=10), margin=dict(l=260), ) return fig # ── NEW: Human interpretability check ──────────────────────────────────────── def _interpretability_df(interps: dict) -> pd.DataFrame: """ Flags what supervisor called 'human interpretable topic list'. Checks two things: 1. Label overlap — pairs of cluster labels that share ≥2 significant words (e.g. 'Cybersecurity and Privacy' vs 'Cyber-Risk Management and Online Security'). 2. Vagueness — labels containing generic terms like 'systems', 'digital', 'data' as the ONLY meaningful content. Output is a table the supervisor can review to confirm distinctiveness. 
""" import itertools NOISE = {"the","and","for","with","using","based","from","that","are","this", "in","of","a","to","an","on","at","by","or","as","is","its","via", "systems","digital","information","management","based","driven"} VAGUE_SINGLES = {"systems","digital","data","information","analysis","research", "study","approach","framework","model","methods","technology"} def _sig_words(label: str) -> set: words = set(re.findall(r"\b[a-z]{4,}\b", label.lower())) return words - NOISE rows = [] cids = sorted(interps.keys()) labels_map = {cid: interps[cid]["label"] for cid in cids} # Check every pair seen_pairs = set() for cid_a, cid_b in itertools.combinations(cids, 2): la, lb = labels_map[cid_a], labels_map[cid_b] wa, wb = _sig_words(la), _sig_words(lb) overlap = wa & wb if len(overlap) >= 2: pair_key = tuple(sorted([cid_a, cid_b])) if pair_key not in seen_pairs: seen_pairs.add(pair_key) rows.append({ "Issue": "⚠ Label Overlap", "Cluster A": cid_a, "Label A": la, "Cluster B": cid_b, "Label B": lb, "Shared Words": ", ".join(sorted(overlap)), "Severity": "HIGH — consider merging" if len(overlap) >= 3 else "MEDIUM — review distinctiveness", "Action": "Check if these two clusters cover the same research theme. 
" "If yes, increase min_cluster_size to force a merge.", }) # Check each label for vagueness for cid in cids: label = labels_map[cid] sig = _sig_words(label) vague = sig & VAGUE_SINGLES specific = sig - VAGUE_SINGLES if len(specific) == 0: rows.append({ "Issue": "❌ Too Vague", "Cluster A": cid, "Label A": label, "Cluster B": "—", "Label B": "—", "Shared Words": ", ".join(vague), "Severity": "HIGH — label is not human interpretable", "Action": "Run optimization pass to refine the label, " "or manually inspect keyphrases for more specific terms.", }) if not rows: rows.append({ "Issue": "✅ All Clear", "Cluster A": "—", "Label A": "All labels are distinct and specific", "Cluster B": "—", "Label B": "—", "Shared Words": "—", "Severity": "NONE", "Action": "Topic list is human interpretable and non-overlapping.", }) return pd.DataFrame(rows) # ── Pipeline runner ────────────────────────────────────────────────────────── def _run(corpus_file, method_file, gk, mk, gek, n_trials, n_optimize, progress=gr.Progress(track_tqdm=True)): if not corpus_file: raise gr.Error("Upload a Scopus corpus CSV first.") gk = gk.strip() or os.getenv("GROQ_API_KEY","") mk = mk.strip() or os.getenv("MISTRAL_API_KEY","") gek = gek.strip() or os.getenv("GEMINI_API_KEY","") if not all([gk,mk,gek]): raise gr.Error("All 3 API keys required.") method_path = method_file.name if method_file else None progress(0.05, desc="📥 Loading CSV…") progress(0.10, desc="🔬 Embedding corpus with SPECTER-2…") r = run_pipeline(corpus_file.name, gk, mk, gek, int(n_trials), int(n_optimize), method_path) if r.get("error"): raise gr.Error(r["error"]) progress(0.85, desc="📊 Building outputs…") td, interps = r["topic_data"], r.get("interpretations",{}) disc, met = td["discipline"], td["metrics"] ar = r.get("agreement_rates",{}) rl = r.get("refinement_log", []) def _s(ok): return "✅ PASS" if ok else "❌ FAIL" summary = ( f"## Pipeline Complete — {disc['n_clusters']} clusters discovered\n\n" f"| Criterion | Value | Status 
|\n|---|---|---|\n" f"| Max cluster mass | {round(disc['max_mass_pct']*100,1)}% | {_s(disc['max_mass_ok'])} |\n" f"| Min cluster size | {disc['min_size']} | {_s(disc['min_size_ok'])} |\n" f"| Persistence (mean) | {round(met['persistence'],4)} | — |\n" f"| DBCV | {round(met['dbcv'],4)} | — |\n" f"| Stability (3 seeds) | {round(met['stability'],4)} | — |\n\n" f"**Trials:** {td['n_trials_run']} (best #{td['best_trial']}) · " f"**Agreement:** Triple {ar.get('triple',0)}% · Two+ {ar.get('two_or_more',0)}% · " f"**Optimization passes:** {n_optimize} · **Labels refined:** {len(rl)}" ) # UMAP scatter u2d = np.array(td["umap_2d"]) sdf = pd.DataFrame({"UMAP-1":u2d[:,0],"UMAP-2":u2d[:,1], "Cluster":[str(l) for l in td["labels"]], "Doc":[d[:60] for d in td["documents"]]}) fig = px.scatter(sdf, x="UMAP-1", y="UMAP-2", color="Cluster", hover_data=["Doc"], opacity=0.75, title="2-D UMAP visualisation of SPECTER-2 embeddings") fig.update_layout(template="plotly_dark", height=500, paper_bgcolor="#0d1117", plot_bgcolor="#161b22", font=dict(size=11)) # Trial log + Pareto tl = pd.DataFrame(td["trial_log"]) tl_cols = [c for c in ["trial","discipline_pass","n_clusters","persistence", "dbcv","max_mass_pct","min_size","n_noise"] if c in tl.columns] tl_show = tl[tl_cols] if not tl.empty else pd.DataFrame() pfig = go.Figure() if not tl.empty: for passed, color, name in [(True,"#3dba7a","PASS"),(False,"#e04d4d","FAIL")]: sub = tl[tl["discipline_pass"]==passed] if not sub.empty: pfig.add_trace(go.Scatter(x=sub["max_mass_pct"],y=sub["persistence"], mode="markers",marker=dict(size=8,color=color),name=name, text=sub["trial"],hovertemplate="Trial %{text}
Mass: %{x:.0%}
Pers: %{y:.3f}")) pfig.add_vline(x=0.25,line_dash="dash",line_color="#5a6480",annotation_text="25% rule") pfig.update_layout(template="plotly_dark",height=400, paper_bgcolor="#0d1117",plot_bgcolor="#161b22", title="Pareto front — Persistence vs Max cluster mass", xaxis_title="Max cluster mass",yaxis_title="Persistence",font=dict(size=11)) cdf_rows = [] for cid in sorted(interps.keys()): v = interps[cid] cdf_rows.append({"Cluster":cid,"Label":v["label"],"Agreement":v["agreement"], "Strong":v["strong"],"Weak":v["weak"], "Persistence":round(v.get("persistence",0),4), "Keyphrases":", ".join(v.get("keyphrases",[]))}) cdf = pd.DataFrame(cdf_rows) sheets = r.get("sheets",{}) s1 = pd.DataFrame(sheets.get(1,[])); s2 = pd.DataFrame(sheets.get(2,[])) s3 = pd.DataFrame(sheets.get(3,[])); s4 = pd.DataFrame(sheets.get(4,[])) sp = r.get("sheet_paths",{}) mdf = pd.DataFrame(r.get("mismatch_table",[])) md_data = r.get("methodology_data",{}) top_papers_df = _top_papers_df(r.get("top_papers",{})) method_sum_df = _methodology_summary_df(md_data, interps) method_chart = _methodology_bar_chart(md_data, interps) extraction_df = _extraction_pipeline_df(md_data, interps) per_llm_meth_df = _per_llm_methodology_df(md_data, interps) regex_hits_df = _regex_hits_df(md_data, interps) pattern_info = _regex_pattern_info() refine_df = _refinement_df(rl) # ── NEW: methodology-CSV outputs ───────────────────────────────────────── comp_sheets = r.get("comp_technique_sheets", {1:[], 2:[], 3:[], 4:[]}) jct = r.get("journal_crosstab", {}) tech_opt_log = r.get("technique_opt_log", []) tech_s1 = _tech_sheet_df(comp_sheets.get(1,[])) tech_s2 = _tech_sheet_df(comp_sheets.get(2,[])) tech_s3 = _tech_sheet_df(comp_sheets.get(3,[])) tech_s4 = _tech_sheet_df(comp_sheets.get(4,[])) tech_llm_chart = _tech_llm_pct_chart(comp_sheets) jct_chart = _journal_crosstab_chart(jct) jct_df = _journal_crosstab_df(jct) per_llm_freq_df = _per_llm_freq_df(jct) tech_opt_df = _tech_opt_df(tech_opt_log) # ── NEW: cluster sizes, 
reproducibility, interpretability ───────────────── cluster_sizes_fig = _cluster_sizes_chart(interps, disc) repro_chart = _reproducibility_chart(td, interps) repro_df = _reproducibility_df(td, interps) interpretability_df = _interpretability_df(interps) progress(1.0, desc="✅ Done!") dl_files = [f for f in [sp.get(1),sp.get(2),sp.get(3),sp.get(4),r.get("json_path")] if f] return ( # ── original outputs (order preserved) ─────────────────────────────── summary, fig, pfig, tl_show, cdf, top_papers_df, method_chart, method_sum_df, extraction_df, per_llm_meth_df, regex_hits_df, pattern_info, refine_df, s1, s2, s3, s4, dl_files if dl_files else None, mdf, # ── new outputs ─────────────────────────────────────────────────────── tech_llm_chart, tech_s1, tech_s2, tech_s3, tech_s4, per_llm_freq_df, jct_chart, jct_df, tech_opt_df, # ── supervisor additions ────────────────────────────────────────────── cluster_sizes_fig, repro_chart, repro_df, interpretability_df, ) # ── UI ──────────────────────────────────────────────────────────────────────── css = ".gradio-container{background:#0d1117!important;color:#c9d1d9!important}" \ "footer{display:none!important}" with gr.Blocks(theme=gr.themes.Base(primary_hue="blue", neutral_hue="slate"), css=css, title="SPECTER-2 Topic Analyzer") as demo: gr.Markdown("# 📐 SPECTER-2 Topic Analyzer") with gr.Row(): # ── Left sidebar ───────────────────────────────────────────────────── with gr.Column(scale=1): gr.Markdown("### 📄 Corpus CSV") file_in = gr.File(label="Upload Scopus CSV (title + abstract)", file_types=[".csv"]) preview_out = gr.Markdown("Upload a CSV to see stats.") gr.Markdown("### 🔬 Methodology CSV *(optional)*") method_file_in = gr.File(label="Upload Methodology CSV (title, doi, methodology)", file_types=[".csv"]) method_preview = gr.Markdown("Upload methodology CSV to enable technique analysis.") gr.Markdown("### 🔑 API Keys") groq_in = gr.Textbox(label="Groq API Key", type="password", placeholder="or set GROQ_API_KEY env var") 
mistral_in = gr.Textbox(label="Mistral API Key", type="password", placeholder="or set MISTRAL_API_KEY env var") gemini_in = gr.Textbox(label="Gemini API Key", type="password", placeholder="or set GEMINI_API_KEY env var") gr.Markdown("### ⚙ Parameters") trials_in = gr.Slider(10, 100, 50, step=5, label="Optuna Trials") optimize_in = gr.Slider(1, 5, 1, step=1, label="🔁 Optimization Passes", info="Pass 1 = no refinement. 2–5 = LLM critic audits topic labels " "AND technique labels for hallucinations + improvements.") run_btn = gr.Button("▶ Run Full Pipeline", variant="primary", size="lg") # ── Main panel ──────────────────────────────────────────────────────── with gr.Column(scale=3): with gr.Tabs(): # ── original tabs (order / content unchanged) ───────────────── with gr.Tab("Summary"): summary_out = gr.Markdown() with gr.Tab("2-D UMAP"): scatter_out = gr.Plot() with gr.Tab("Pareto Front"): pareto_out = gr.Plot() with gr.Tab("Trial Log"): trial_out = gr.Dataframe() with gr.Tab("Clusters"): cluster_out = gr.Dataframe() with gr.Tab("🗞 Top 3 Papers"): gr.Markdown("### Top 3 Representative Papers per Cluster\n" "Ranked by cosine similarity to cluster centroid " "in SPECTER-2 embedding space.") top_papers_out = gr.Dataframe( headers=["Cluster","Label","Rank","Title","Abstract Snippet"], wrap=True) with gr.Tab("🔬 Cluster Methodology"): gr.Markdown("### Cluster-Level Methodology — 3-LLM Council\n" "Derived from representative abstracts per cluster. 
" "≥2-LLM gate applied.") method_chart_out = gr.Plot() method_summary_out = gr.Dataframe(wrap=True) with gr.Tab("⚙ Cluster Extraction Pipeline"): gr.Markdown("### Full Regex + LLM Extraction Trace (per cluster)") extraction_out = gr.Dataframe(wrap=True) with gr.Tab("🤖 Cluster Per-LLM Votes"): gr.Markdown("### Raw Per-LLM Methodology Votes (per cluster)") per_llm_out = gr.Dataframe(wrap=True) with gr.Tab("🔍 Cluster Regex Hits"): gr.Markdown("### Regex Pattern Matches (per cluster)\n" "Every match with exact character span and paper number.") regex_hits_out = gr.Dataframe(wrap=True) regex_info_out = gr.Markdown() with gr.Tab("🔁 Refinement Log"): gr.Markdown("### Topic Label Optimization Log\n" "Changes made by LLM critic per optimization pass.") refine_out = gr.Dataframe(wrap=True) with gr.Tab("Sheet 1 — Groq"): s1_out = gr.Dataframe() with gr.Tab("Sheet 2 — Mistral"): s2_out = gr.Dataframe() with gr.Tab("Sheet 3 — Gemini"): s3_out = gr.Dataframe() with gr.Tab("Sheet 4 — Consolidated"): s4_out = gr.Dataframe() with gr.Tab("RQ Mismatch"): mismatch_out = gr.Dataframe() with gr.Tab("Downloads"): dl_out = gr.File(label="All sheet CSVs + topics.json", file_count="multiple") # ── NEW tabs: methodology CSV pipeline ──────────────────────── with gr.Tab("💻 Comp. Techniques — LLM % Chart"): gr.Markdown("### Computational Technique Frequency — Methodology CSV\n" "For each technique, shows the % of papers it was extracted " "from by each of the 3 LLMs independently + the consolidated " "result (≥2-LLM gate). 
Bars grouped by technique.") tech_llm_chart_out = gr.Plot() with gr.Tab("💻 Tech Sheet 1 — Groq"): gr.Markdown("### Groq raw technique extraction — one row per paper") tech_s1_out = gr.Dataframe(wrap=True) with gr.Tab("💻 Tech Sheet 2 — Mistral"): gr.Markdown("### Mistral raw technique extraction — one row per paper") tech_s2_out = gr.Dataframe(wrap=True) with gr.Tab("💻 Tech Sheet 3 — Gemini"): gr.Markdown("### Gemini raw technique extraction — one row per paper") tech_s3_out = gr.Dataframe(wrap=True) with gr.Tab("💻 Tech Sheet 4 — Consolidated"): gr.Markdown("### Consolidated techniques — ≥2-LLM agreement, one row per paper") tech_s4_out = gr.Dataframe(wrap=True) with gr.Tab("📊 Tech Frequency by LLM"): gr.Markdown("### Per-LLM Technique Frequency Table\n" "% of all papers where each LLM extracted each technique. " "High variance = LLMs disagree → optimization flag.") per_llm_freq_out = gr.Dataframe(wrap=True) with gr.Tab("🗂 Journal Cross-Tabulation"): gr.Markdown("### Technique × Journal Cross-Tabulation\n" "Rows = journals auto-detected from DOI/title. " "Columns = consolidated techniques. " "Values = % of papers in that journal using the technique.\n\n" "**Journals detected:** MISQ, JAIS, ISR, JMIS, PAJAIS, " "ECIS, ICIS, Other.") jct_chart_out = gr.Plot() jct_df_out = gr.Dataframe(wrap=True) with gr.Tab("🔧 Technique Optimization"): gr.Markdown("### Technique Label Improvement Suggestions\n" "Groq critic flags: hallucination, high inter-LLM variance " "(>15% gap), split/merge recommendations.\n" "Only runs when Optimization Passes ≥ 2.") tech_opt_out = gr.Dataframe(wrap=True) # ── Supervisor-requested additions ──────────────────────────── with gr.Tab("📊 Cluster Sizes"): gr.Markdown( "### Cluster Sizes (Papers per Cluster)\n" "Exact chart your supervisor highlighted. " "**Green** = passes both discipline rules (mass ≤ 25%, size ≥ 5). " "**Yellow** = cluster exceeds 25% mass cap — dominant cluster warning. 
" "**Red** = cluster has fewer than 5 papers — too small.\n\n" "The orange dashed line marks the 25% cap. Any bar above it " "will fail the discipline check and the pipeline will re-optimise." ) cluster_sizes_out = gr.Plot() with gr.Tab("🔄 Reproducibility"): gr.Markdown( "### Reproducibility — 'Run Again and Again, Topic List is the Same'\n\n" "Your supervisor wants proof that running the pipeline multiple times " "produces the **same clusters**. This tab shows two measures:\n\n" "**Overall ARI Stability** (top row) — Adjusted Rand Index averaged " "across 3 random seeds. ARI = 1.0 means identical clusters every run. " "ARI ≥ 0.8 is considered stable for publication.\n\n" "**Cluster Persistence** (per row) — how strongly each cluster's " "structure is preserved in the condensed HDBSCAN tree. " "High persistence → cluster survives parameter variation → " "same label will appear on re-run. " "Low persistence → cluster may split or merge → label may change.\n\n" "🟢 ≥ 0.7 Stable · 🟡 0.4–0.7 Borderline · 🔴 < 0.4 Fragile" ) repro_chart_out = gr.Plot() repro_df_out = gr.Dataframe(wrap=True) with gr.Tab("🧠 Interpretability Check"): gr.Markdown( "### Human Interpretability Check — 'Topic List Must Be Distinct'\n\n" "Your supervisor flagged that labels like " "*'Cybersecurity and Privacy'* and *'Cyber-Risk Management and Online Security'* " "look like the same topic. This tab automatically detects:\n\n" "**⚠ Label Overlap** — pairs of cluster labels sharing ≥ 2 significant " "words (noise words like 'and', 'for', 'in' excluded). " "Overlapping labels suggest the two clusters may cover the same theme " "and should be reviewed for merging.\n\n" "**❌ Too Vague** — labels where all meaningful words are generic " "('systems', 'digital', 'data') with no domain-specific content. " "These need the optimization pass to refine them.\n\n" "**Action column** tells you exactly what to do for each flag." 
)  # closes the gr.Markdown(...) call opened on the previous line
            # NOTE(review): indentation below assumes this statement sits inside a tab
            # body and the wiring sits one level inside the enclosing UI block; the
            # opening statements are outside this chunk — adjust depth to match them.
            # Table shown in the "🧠 Interpretability Check" tab; populated by the
            # run callback (it is the last entry of `outputs` below).
            interpretability_out = gr.Dataframe(wrap=True)

    # ── Wire callbacks ────────────────────────────────────────────────────────
    # Refresh the Scopus-CSV preview whenever the uploaded file changes.
    file_in.change(_preview, inputs=[file_in], outputs=[preview_out])
    # Same for the optional methodology CSV (title / doi / methodology columns).
    method_file_in.change(_preview_methodology, inputs=[method_file_in], outputs=[method_preview])
    # Main pipeline trigger. Gradio passes the `inputs` values to `_run` in the
    # listed order and assigns its return values to `outputs` positionally, so
    # both lists must stay in step with `_run`'s signature and return order.
    # NOTE(review): `_run` is defined outside this chunk — verify the order
    # against it before adding, removing or reordering any component here.
    run_btn.click(
        _run,
        inputs=[file_in, method_file_in, groq_in, mistral_in, gemini_in, trials_in, optimize_in],
        outputs=[
            # original tabs
            # NOTE(review): `dl_out` comes before `mismatch_out` here, the
            # reverse of their tab order — presumably this mirrors `_run`'s
            # return order; confirm.
            summary_out, scatter_out, pareto_out, trial_out, cluster_out, top_papers_out,
            method_chart_out, method_summary_out, extraction_out, per_llm_out,
            regex_hits_out, regex_info_out, refine_out,
            s1_out, s2_out, s3_out, s4_out, dl_out, mismatch_out,
            # new: methodology-CSV technique pipeline tabs
            tech_llm_chart_out, tech_s1_out, tech_s2_out, tech_s3_out, tech_s4_out,
            per_llm_freq_out, jct_chart_out, jct_df_out, tech_opt_out,
            # supervisor additions: cluster sizes, reproducibility, interpretability
            cluster_sizes_out, repro_chart_out, repro_df_out, interpretability_out,
        ],
    )

# Script entry point: start the Gradio server only when run directly, not on
# import. "0.0.0.0" listens on all network interfaces (needed inside
# containers / hosted environments); the port is fixed at 7860.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)