# BERTopic_AG_final
#
# Running
#
# File size: 47,416 Bytes

"""
app.py — Gradio UI entry point.
ORIGINAL structure and all tabs preserved.
NEW: second file upload for methodology CSV, technique sheets 1-4,
     journal cross-tabulation chart + table, technique optimisation log.
"""
import os, json
import re
import pandas as pd, numpy as np
import gradio as gr
import plotly.express as px
import plotly.graph_objects as go
from agent import run_pipeline, METHODOLOGY_PATTERNS, TECHNIQUE_PATTERNS

# ── CSV preview ──────────────────────────────────────────────────────────────
def _preview(file):
    """Summarise an uploaded corpus CSV as a Markdown status card.

    Column names are lower-cased before checking, so ``Title`` / ``Abstract``
    headers are accepted.

    Args:
        file: Upload handle exposing the CSV path as ``file.name``. A falsy
            value means nothing has been uploaded yet.

    Returns:
        Markdown text: a prompt when no file is given, otherwise a table with
        the presence and blank-row count of the ``title`` and ``abstract``
        columns, followed by the number of usable papers.
    """
    if not file:
        return "Upload a Scopus CSV to begin."
    df = pd.read_csv(file.name)
    df.columns = df.columns.str.lower()
    has_t = "title" in df.columns
    has_a = "abstract" in df.columns
    n = len(df)
    # A missing column counts as blank on every row.
    blanks_t = int(df["title"].isna().sum()) if has_t else n
    blanks_a = int(df["abstract"].isna().sum()) if has_a else n
    # A paper is usable only when BOTH fields are filled in. Blank titles and
    # blank abstracts can sit on different rows, so "n - max(blanks)" would
    # over-report; count the complete rows directly instead.
    if has_t and has_a:
        usable = int((df["title"].notna() & df["abstract"].notna()).sum())
    else:
        usable = 0
    # Usable rows can only exist when both columns are present and non-empty.
    ok = "✅" if usable > 0 else "❌"
    return (f"## {ok} CSV loaded — {n} entries\n\n"
        f"| Column | Present | Blank rows |\n|---|---|---|\n"
        f"| title  | {'✅' if has_t else '❌'} | {blanks_t} |\n"
        f"| abstract | {'✅' if has_a else '❌'} | {blanks_a} |\n\n"
        f"**Usable papers:** {usable} / {n}")


def _preview_methodology(file):
    """Summarise an uploaded methodology CSV as a Markdown status card.

    Column names are lower-cased before checking. ``title`` and
    ``methodology`` are required for the green tick; ``doi`` is optional.

    Args:
        file: Upload handle exposing the CSV path as ``file.name``. A falsy
            value means nothing has been uploaded yet.

    Returns:
        Markdown text: a prompt when no file is given, otherwise a
        column-presence table.
    """
    if not file:
        return "Upload methodology CSV (title, doi, methodology) to enable technique analysis."

    frame = pd.read_csv(file.name)
    frame.columns = frame.columns.str.lower()
    found = {col: col in frame.columns for col in ("title", "methodology", "doi")}

    verdict = "✅" if found["title"] and found["methodology"] else "❌"
    title_mark = "✅" if found["title"] else "❌"
    doi_mark = "✅" if found["doi"] else "⚠ optional"
    method_mark = "✅" if found["methodology"] else "❌"

    pieces = [
        f"## {verdict} Methodology CSV — {len(frame)} papers\n\n",
        "| Column | Present |\n|---|---|\n",
        f"| title | {title_mark} |\n",
        f"| doi | {doi_mark} |\n",
        f"| methodology | {method_mark} |\n\n",
        "Journals will be auto-detected from DOI + title.",
    ]
    return "".join(pieces)


# ── Original helper builders ─────────────────────────────────────────────────
def _top_papers_df(top_papers: dict) -> pd.DataFrame:
    """Flatten the per-cluster paper lists into one table.

    Args:
        top_papers: Maps a cluster id to a list of paper dicts carrying
            ``cluster_label``, ``rank``, ``title`` and ``abstract_snippet``.

    Returns:
        One row per paper, clusters in ascending id order and papers in the
        order they appear in each list.
    """
    records = [
        {
            "Cluster": cluster_id,
            "Label": paper["cluster_label"],
            "Rank": paper["rank"],
            "Title": paper["title"],
            "Abstract Snippet": paper["abstract_snippet"],
        }
        for cluster_id in sorted(top_papers)
        for paper in top_papers[cluster_id]
    ]
    return pd.DataFrame(records)


def _methodology_summary_df(methodology_data: dict, interps: dict) -> pd.DataFrame:
    """Build the one-row-per-cluster methodology overview table.

    Args:
        methodology_data: Maps a cluster id to its methodology dict. Every
            field is optional and falls back to "—", 0 or an empty list.
        interps: Maps a cluster id to a dict that may hold a ``label``;
            clusters without one are shown as ``Cluster <id>``.

    Returns:
        DataFrame with clusters in ascending id order.
    """
    def _describe(items):
        # "name (pct%, agreement)" entries, comma separated.
        return ", ".join(
            f"{entry['name']} ({entry['pct']}%, {entry['agreement']})"
            for entry in items)

    records = []
    for cluster_id in sorted(methodology_data):
        info = methodology_data[cluster_id]
        confirmed = ", ".join(info.get("regex_confirmed_consensus", []))
        rejected = ", ".join(info.get("regex_rejected_consensus", []))
        record = {
            "Cluster": cluster_id,
            "Label": interps.get(cluster_id, {}).get("label", f"Cluster {cluster_id}"),
            "Dominant Method": info.get("dominant_method", "—"),
            "Dominant Technique": info.get("dominant_technique", "—"),
            "Empirical %": info.get("empirical_pct", 0),
            "Theoretical %": info.get("theoretical_pct", 0),
            "Mixed %": info.get("mixed_pct", 0),
            "Methods (≥2 LLMs)": _describe(info.get("methodologies", [])),
            "Techniques (≥2 LLMs)": _describe(info.get("techniques", [])),
            # Empty consensus lists are shown as a dash.
            "Regex Confirmed": confirmed or "—",
            "Regex Rejected": rejected or "—",
        }
        records.append(record)
    return pd.DataFrame(records)


def _extraction_pipeline_df(methodology_data: dict, interps: dict) -> pd.DataFrame:
    """List every extracted method/technique with its regex and LLM evidence.

    For each cluster the accepted methodologies and techniques come first,
    followed by the rejected methods and rejected techniques.

    Args:
        methodology_data: Maps a cluster id to a dict that may hold
            ``methodologies``, ``techniques``, ``rejected_methods``,
            ``rejected_techniques`` (lists of item dicts with ``name``,
            ``llm_votes``, ``agreement``, ``pct`` and optional ``evidence``)
            and ``regex_scan`` (``methods`` / ``techniques`` hit lists keyed
            by item name).
        interps: Maps a cluster id to a dict that may hold a ``label``.

    Returns:
        One row per item, or an empty DataFrame when there are no items.
    """
    # (source list, value of the "Type" column, value of "Gate Passed").
    # The type comes from the list an item lives in, not from comparing item
    # dicts, so a technique that equals a method entry keeps its own type.
    groups = (
        ("methodologies",       "Method",    "✅ ACCEPTED"),
        ("techniques",          "Technique", "✅ ACCEPTED"),
        ("rejected_methods",    "Method",    "❌ REJECTED (single LLM)"),
        ("rejected_techniques", "Technique", "❌ REJECTED (single LLM)"),
    )

    def _row(cid, label, scan, item, kind, gate):
        name = item["name"]
        # The methodology bank is consulted first, then the technique bank.
        regex_hits = (scan.get("methods", {}).get(name, [])
                      or scan.get("techniques", {}).get(name, []))
        # De-duplicate matched snippets in first-seen order; cap at 80 chars.
        matched = (", ".join(dict.fromkeys(h["match"] for h in regex_hits))[:80]
                   if regex_hits else "—")
        return {"Cluster": cid, "Label": label, "Item": name,
                "Type": kind,
                "Regex Match": matched, "Regex Fired": "✅" if regex_hits else "❌",
                "LLM Votes": item["llm_votes"], "Agreement": item["agreement"],
                "Avg Pct (%)": item["pct"], "Evidence": item.get("evidence", "—"),
                "Gate Passed": gate}

    rows = []
    for cid in sorted(methodology_data.keys()):
        md    = methodology_data[cid]
        label = interps.get(cid, {}).get("label", f"Cluster {cid}")
        scan  = md.get("regex_scan", {})
        for key, kind, gate in groups:
            rows.extend(_row(cid, label, scan, item, kind, gate)
                        for item in md.get(key, []))
    return pd.DataFrame(rows) if rows else pd.DataFrame()


def _per_llm_methodology_df(methodology_data: dict, interps: dict) -> pd.DataFrame:
    """Show each LLM's raw methodology answer side by side, one row per cluster.

    Args:
        methodology_data: Maps a cluster id to a dict whose optional
            ``llm_raw`` entry holds the ``groq`` / ``mistral`` / ``gemini``
            answers (``methodologies``, ``techniques`` and the three
            ``*_pct`` orientation values).
        interps: Maps a cluster id to a dict that may hold a ``label``.

    Returns:
        DataFrame with the method columns, then the technique columns, then
        the "empirical/theoretical/mixed" columns for the three LLMs.
    """
    providers = (("Groq", "groq"), ("Mistral", "mistral"), ("Gemini", "gemini"))
    orientation_keys = ("empirical_pct", "theoretical_pct", "mixed_pct")

    def _listing(answer, field):
        text = " | ".join(
            f"{entry['name']} ({entry.get('pct',0)}%)" for entry in answer.get(field, []))
        return text or "—"

    def _orientation(answer):
        return "/".join(f"{answer.get(key, 0)}" for key in orientation_keys)

    records = []
    for cluster_id in sorted(methodology_data):
        answers = methodology_data[cluster_id].get("llm_raw", {})
        record = {
            "Cluster": cluster_id,
            "Label": interps.get(cluster_id, {}).get("label", f"Cluster {cluster_id}"),
        }
        # Insertion order fixes the column order: methods, techniques, E/T/M.
        for title, key in providers:
            record[f"{title} Methods"] = _listing(answers.get(key, {}), "methodologies")
        for title, key in providers:
            record[f"{title} Techniques"] = _listing(answers.get(key, {}), "techniques")
        for title, key in providers:
            record[f"{title} E/T/M"] = _orientation(answers.get(key, {}))
        records.append(record)
    return pd.DataFrame(records)


def _regex_hits_df(methodology_data: dict, interps: dict) -> pd.DataFrame:
    """List every individual regex hit recorded for each cluster.

    Args:
        methodology_data: Maps a cluster id to a dict whose optional
            ``regex_scan`` holds ``methods`` and ``techniques`` mappings of
            pattern category -> list of hits (``match``, ``doc``, ``span``).
        interps: Maps a cluster id to a dict that may hold a ``label``.

    Returns:
        One row per hit (methodology bank before technique bank within a
        cluster), or an empty DataFrame when nothing matched.
    """
    banks = (("methods", "Methodology"), ("techniques", "Technique"))

    records = []
    for cluster_id in sorted(methodology_data):
        scan = methodology_data[cluster_id].get("regex_scan", {})
        title = interps.get(cluster_id, {}).get("label", f"Cluster {cluster_id}")
        for scan_key, bank_name in banks:
            for category, hits in scan.get(scan_key, {}).items():
                for hit in hits:
                    start, end = hit["span"][0], hit["span"][1]
                    records.append({
                        "Cluster": cluster_id,
                        "Label": title,
                        "Bank": bank_name,
                        "Pattern Category": category,
                        "Matched Text": hit["match"],
                        "Paper #": hit["doc"],
                        "Char Span": f"{start}–{end}",
                    })
    if not records:
        return pd.DataFrame()
    return pd.DataFrame(records)


def _methodology_bar_chart(methodology_data: dict, interps: dict) -> go.Figure:
    """Stacked bar chart of empirical / theoretical / mixed share per cluster.

    Args:
        methodology_data: Maps a cluster id to a dict that may hold
            ``empirical_pct``, ``theoretical_pct`` and ``mixed_pct``
            (missing values are plotted as 0).
        interps: Maps a cluster id to a dict that may hold a ``label``;
            labels are cut to 30 characters, default ``C<id>``.

    Returns:
        Plotly figure with one stacked bar per cluster, in ascending id order.
    """
    cluster_ids = sorted(methodology_data)
    tick_labels = [
        interps.get(cluster_id, {}).get("label", f"C{cluster_id}")[:30]
        for cluster_id in cluster_ids
    ]
    # (trace name, source field, bar colour)
    series = (
        ("Empirical %",   "empirical_pct",   "#3dba7a"),
        ("Theoretical %", "theoretical_pct", "#5b9cf6"),
        ("Mixed %",       "mixed_pct",       "#f5a623"),
    )

    chart = go.Figure()
    for trace_name, field, colour in series:
        heights = [methodology_data[cluster_id].get(field, 0) for cluster_id in cluster_ids]
        chart.add_trace(go.Bar(name=trace_name, x=tick_labels, y=heights, marker_color=colour))

    chart.update_layout(
        barmode="stack",
        template="plotly_dark",
        height=420,
        paper_bgcolor="#0d1117",
        plot_bgcolor="#161b22",
        title="Research Orientation per Cluster — Averaged across Groq + Mistral + Gemini",
        xaxis_title="Cluster",
        yaxis_title="Percentage (%)",
        font=dict(size=11),
        legend=dict(orientation="h", y=1.12),
        xaxis_tickangle=-35,
    )
    return chart


def _refinement_df(rl: list) -> pd.DataFrame:
    """Turn the label-refinement log into a table.

    Args:
        rl: List of log entries with ``cluster``, ``iteration``,
            ``old_label``, ``new_label``, ``improvement_score``,
            ``hallucination_detected`` and an optional ``issues`` list.

    Returns:
        One row per entry; an empty DataFrame with the expected columns when
        the log is empty.
    """
    if rl:
        records = []
        for entry in rl:
            records.append({
                "Cluster": entry["cluster"],
                "Iteration": entry["iteration"],
                "Old Label": entry["old_label"],
                "New Label": entry["new_label"],
                "Issues": "; ".join(entry.get("issues", [])),
                "Improvement": entry["improvement_score"],
                "Hallucination Detected": entry["hallucination_detected"],
            })
        return pd.DataFrame(records)

    header = ["Cluster", "Iteration", "Old Label", "New Label",
              "Issues", "Improvement", "Hallucination Detected"]
    return pd.DataFrame(columns=header)


def _regex_pattern_info() -> str:
    """Return the Markdown help text for the methodology-extraction tab.

    The text describes the four extraction steps and then lists every entry
    of ``METHODOLOGY_PATTERNS`` and ``TECHNIQUE_PATTERNS`` as
    ``- **name**: `pattern` `` bullets.
    """
    def _bullets(bank):
        # One Markdown bullet per pattern: key in bold, regex source in code.
        return "\n".join(f"- **{name}**: `{compiled.pattern}`" for name, compiled in bank.items())

    intro = (
        "### How Cluster Methodology Extraction Works\n\n"
        "**Step 1 — Regex Pre-Scan:** Two compiled pattern banks run against representative "
        "abstracts. Every match recorded with exact character span, matched text, paper number.\n\n"
        "**Step 2 — 3-LLM Council:** Groq, Mistral, Gemini each receive regex evidence + abstracts. "
        "Each LLM confirms/rejects regex hits and adds any missed methods/techniques.\n\n"
        "**Step 3 — ≥2-LLM Gate:** Only items named by ≥2 LLMs survive. Percentages averaged.\n\n"
        "**Step 4 — Orientation:** Empirical/Theoretical/Mixed averaged across 3 LLMs.\n\n"
    )
    method_section = "---\n\n#### Methodology Bank\n" + _bullets(METHODOLOGY_PATTERNS)
    technique_section = "\n\n#### Technique Bank\n" + _bullets(TECHNIQUE_PATTERNS)
    return intro + method_section + technique_section


# ── NEW helpers for methodology-CSV pipeline ─────────────────────────────────
def _tech_sheet_df(sheet_rows: list) -> pd.DataFrame:
    """Wrap one technique sheet (a list of row dicts) in a DataFrame.

    A missing or empty sheet yields an empty DataFrame.
    """
    if not sheet_rows:
        return pd.DataFrame()
    return pd.DataFrame(sheet_rows)


def _tech_llm_pct_chart(comp_sheets: dict) -> go.Figure:
    """Grouped bar chart of technique frequency per LLM.

    For every technique the chart shows the percentage of rows (papers) of
    each sheet that mention it. Sheets 1-4 are plotted as Groq, Mistral,
    Gemini and Consolidated respectively.

    Args:
        comp_sheets: Maps the sheet numbers 1-4 to lists of row dicts whose
            ``techniques`` value is a ", "-separated string. Missing sheets
            are treated as empty.

    Returns:
        Plotly figure with one bar group per technique, techniques sorted
        alphabetically.
    """
    def _freq(rows):
        """Return {technique: rounded % of rows that mention it}."""
        total = len(rows) or 1  # an empty sheet must not divide by zero
        counts = {}
        for row in rows:
            cell = row.get("techniques", "") or ""
            # Normalise case, then de-duplicate within the row: a paper that
            # lists the same technique twice ("LDA, lda") must count once,
            # otherwise the "% of papers" is inflated and can exceed 100.
            names = dict.fromkeys(part.strip().title() for part in cell.split(", "))
            for name in names:
                if name and name != "—":  # skip blanks and the "none" dash
                    counts[name] = counts.get(name, 0) + 1
        return {name: round(hits / total * 100) for name, hits in counts.items()}

    # (trace name, sheet number, bar colour)
    series = (
        ("Groq",         1, "#5b9cf6"),
        ("Mistral",      2, "#f5a623"),
        ("Gemini",       3, "#a855f7"),
        ("Consolidated", 4, "#3dba7a"),
    )
    freqs = {name: _freq(comp_sheets.get(sheet_no, [])) for name, sheet_no, _ in series}
    all_techs = sorted(set().union(*freqs.values()))

    fig = go.Figure()
    for name, _, colour in series:
        fig.add_trace(go.Bar(name=name, x=all_techs,
                             y=[freqs[name].get(t, 0) for t in all_techs],
                             marker_color=colour))
    fig.update_layout(barmode="group", template="plotly_dark", height=480,
        paper_bgcolor="#0d1117", plot_bgcolor="#161b22",
        title="Computational Technique Frequency — % of Papers per LLM (Groq / Mistral / Gemini / Consolidated)",
        xaxis_title="Technique", yaxis_title="% of papers",
        font=dict(size=10), legend=dict(orientation="h", y=1.12), xaxis_tickangle=-40)
    return fig


def _journal_crosstab_chart(journal_crosstab: dict) -> go.Figure:
    """Grouped bar chart of technique usage (%) per journal.

    Journals sit on the x-axis and each technique is one bar series. Only
    the first 15 techniques are drawn; the 12-colour palette is reused
    cyclically beyond that.

    Args:
        journal_crosstab: Dict that may hold ``consolidated``
            (journal -> technique -> value), ``journals`` and ``techniques``.

    Returns:
        The chart, or an empty figure titled "No journal data available"
        when either the journal or the technique list is empty.
    """
    table = journal_crosstab.get("consolidated", {})
    journal_names = journal_crosstab.get("journals", [])
    technique_names = journal_crosstab.get("techniques", [])

    chart = go.Figure()
    if not (journal_names and technique_names):
        chart.update_layout(template="plotly_dark", title="No journal data available",
                            paper_bgcolor="#0d1117")
        return chart

    palette = ("#5b9cf6", "#3dba7a", "#f5a623", "#e04d4d", "#a855f7", "#06b6d4",
               "#f97316", "#84cc16", "#ec4899", "#14b8a6", "#8b5cf6", "#ef4444")

    # Cap the number of series so the grouped bars stay readable.
    for slot, technique in enumerate(technique_names[:15]):
        shares = [table.get(journal, {}).get(technique, 0) for journal in journal_names]
        colour = palette[slot % len(palette)]
        chart.add_trace(go.Bar(name=technique, x=journal_names, y=shares,
                               marker_color=colour))

    chart.update_layout(
        barmode="group",
        template="plotly_dark",
        height=500,
        paper_bgcolor="#0d1117",
        plot_bgcolor="#161b22",
        title="Computational Technique Usage — Cross-Tabulation by Journal (%)",
        xaxis_title="Journal",
        yaxis_title="% of papers using technique",
        font=dict(size=10),
        legend=dict(orientation="h", y=1.15),
        xaxis_tickangle=-20,
    )
    return chart


def _journal_crosstab_df(journal_crosstab: dict) -> pd.DataFrame:
    """Render the journal x technique cross-tabulation as a table.

    Args:
        journal_crosstab: Dict that may hold ``consolidated``
            (journal -> technique -> value), ``journals``, ``techniques``
            and ``journal_paper_counts``.

    Returns:
        One row per journal with its paper count and one "<value>%" column
        per technique (0 where the pair is absent).
    """
    table = journal_crosstab.get("consolidated", {})
    technique_names = journal_crosstab.get("techniques", [])
    counts = journal_crosstab.get("journal_paper_counts", {})

    records = [
        {
            "Journal": journal,
            "N Papers": counts.get(journal, 0),
            **{tech: f"{table.get(journal, {}).get(tech, 0)}%" for tech in technique_names},
        }
        for journal in journal_crosstab.get("journals", [])
    ]
    return pd.DataFrame(records)


def _tech_opt_df(opt_log: list) -> pd.DataFrame:
    """Turn the technique-optimisation log into a table.

    Args:
        opt_log: List of entries with ``technique``, ``refined_name``,
            ``is_hallucination``, ``high_variance``, ``pct_groq``,
            ``pct_mistral``, ``pct_gemini``, ``suggestion``, ``split_into``
            and ``merge_with``.

    Returns:
        One row per entry; an empty DataFrame with the expected columns when
        the log is empty.
    """
    # Display column -> key in each log entry (order defines column order).
    field_for = {
        "Technique": "technique",
        "Refined Name": "refined_name",
        "Hallucination": "is_hallucination",
        "High Variance": "high_variance",
        "Groq %": "pct_groq",
        "Mistral %": "pct_mistral",
        "Gemini %": "pct_gemini",
        "Suggestion": "suggestion",
        "Split Into": "split_into",
        "Merge With": "merge_with",
    }
    if not opt_log:
        return pd.DataFrame(columns=list(field_for))
    records = [
        {column: entry[key] for column, key in field_for.items()}
        for entry in opt_log
    ]
    return pd.DataFrame(records)


def _per_llm_freq_df(journal_crosstab: dict) -> pd.DataFrame:
    """Per-LLM technique frequency across all papers in the methodology CSV.

    Args:
        journal_crosstab: Dict whose optional ``per_llm_freq`` entry maps an
            LLM name ("Groq", "Mistral", "Gemini") to
            ``{technique: percentage}``.

    Returns:
        One row per technique, sorted by "Groq %" descending. The "Variance"
        column is the rounded spread (max - min) of the three percentages,
        not a statistical variance. When no frequencies are available, an
        empty DataFrame with the same columns is returned.
    """
    columns = ["Technique", "Groq %", "Mistral %", "Gemini %", "Variance"]
    per_llm = journal_crosstab.get("per_llm_freq", {})
    techniques = sorted({t for freqs in per_llm.values() for t in freqs})
    if not techniques:
        # Without rows the frame would have no "Groq %" column and
        # sort_values would raise KeyError; return a typed-empty table.
        return pd.DataFrame(columns=columns)

    groq_freq    = per_llm.get("Groq", {})
    mistral_freq = per_llm.get("Mistral", {})
    gemini_freq  = per_llm.get("Gemini", {})
    rows = []
    for t in techniques:
        shares = (groq_freq.get(t, 0), mistral_freq.get(t, 0), gemini_freq.get(t, 0))
        rows.append({
            "Technique":  t,
            "Groq %":     shares[0],
            "Mistral %":  shares[1],
            "Gemini %":   shares[2],
            "Variance":   round(max(shares) - min(shares)),
        })
    return pd.DataFrame(rows).sort_values("Groq %", ascending=False)


# ── NEW: Cluster Sizes bar chart (what supervisor pointed to) ────────────────
def _cluster_sizes_chart(interps: dict, disc: dict) -> go.Figure:
    """Bar chart of papers per cluster, coloured by discipline-rule status.

    Colours:
        Green  = within both limits (share of papers <= 25%, size >= 5).
        Yellow = share of papers exceeds the 25% mass cap.
        Red    = fewer than 5 papers.
    The paper count is printed above each bar and a dashed line marks the
    25% cap.

    Args:
        interps: Maps a cluster id to a dict with a ``label`` and optional
            ``strong`` / ``weak`` member counts.
        disc: Dict whose optional ``cluster_sizes`` maps cluster id -> size.
            Clusters missing from it fall back to ``strong + weak``.

    Returns:
        Plotly figure with one bar per cluster in ascending id order.
    """
    cluster_sizes = disc.get("cluster_sizes", {})

    cluster_ids = sorted(interps.keys())
    sizes = [
        cluster_sizes.get(cid, interps[cid].get("strong", 0) + interps[cid].get("weak", 0))
        for cid in cluster_ids
    ]
    # Total used for the mass share. When ``cluster_sizes`` is absent the bars
    # come from the strong+weak fallback, so the total must too; otherwise
    # n_docs would collapse to 1 and every cluster would be flagged as
    # exceeding the 25% cap.
    n_docs      = sum(cluster_sizes.values()) or sum(sizes) or 1
    max_allowed = int(0.25 * n_docs)

    labels, colors, texts = [], [], []
    for cid, size in zip(cluster_ids, sizes):
        mass_pct = size / n_docs

        color = "#3dba7a"                        # green — PASS
        if mass_pct > 0.25:
            color = "#f5c518"                    # yellow — mass violation
        elif size < 5:
            color = "#e04d4d"                    # red — too small

        labels.append(interps[cid]["label"])
        colors.append(color)
        texts.append(str(size))

    fig = go.Figure(go.Bar(
        x=labels, y=sizes,
        marker_color=colors,
        text=texts,
        textposition="outside",
        textfont=dict(size=11, color="#c9d1d9"),
    ))
    fig.add_hline(y=max_allowed, line_dash="dash", line_color="#f5a623",
                  annotation_text=f"25% cap ({max_allowed} papers)",
                  annotation_font_color="#f5a623")
    fig.update_layout(
        template="plotly_dark", height=520,
        paper_bgcolor="#0d1117", plot_bgcolor="#161b22",
        title="Cluster Sizes (Papers per Cluster) — Green=PASS · Yellow=Mass>25% · Red=Size<5",
        xaxis_title="Cluster", yaxis_title="Number of Papers",
        font=dict(size=10), xaxis_tickangle=-40,
        showlegend=False,
        margin=dict(t=80, b=200),  # room for the rotated cluster labels
    )
    return fig


# ── NEW: Reproducibility panel ────────────────────────────────────────────────
def _reproducibility_df(td: dict, interps: dict) -> pd.DataFrame:
    """Build the reproducibility table: overall stability plus per-cluster persistence.

    Per-cluster persistence is used as a proxy for how likely a cluster is to
    reappear on a re-run: >= 0.7 is reported as stable, >= 0.4 as borderline,
    anything lower as fragile. The overall stability score uses the
    thresholds 0.8 / 0.5.

    Args:
        td: Topic data. ``td["metrics"]`` must exist (its ``stability``
            defaults to 0.0); ``cluster_persistence`` is optional and maps
            cluster id -> score.
        interps: Maps a cluster id to a dict with a ``label`` and optional
            ``strong`` / ``weak`` member counts.

    Returns:
        DataFrame whose first row ("ALL") carries the overall stability,
        followed by one row per cluster sorted by persistence descending.
        With no clusters only the "ALL" row is returned.
    """
    cluster_persistence = td.get("cluster_persistence", {})
    overall_stability   = td["metrics"].get("stability", 0.0)
    rows = []
    for cid in sorted(interps.keys()):
        pers  = cluster_persistence.get(cid, 0.0)
        label = interps[cid]["label"]
        size  = interps[cid].get("strong",0) + interps[cid].get("weak",0)
        stable_verdict = "✅ Stable"     if pers >= 0.7 else \
                         "⚠ Borderline" if pers >= 0.4 else \
                         "❌ Fragile"
        rows.append({
            "Cluster":           cid,
            "Label":             label,
            "Cluster Persistence": round(pers, 4),
            "Strong Members":    interps[cid].get("strong", 0),
            "Weak Members":      interps[cid].get("weak",   0),
            "Total Papers":      size,
            "Stability Verdict": stable_verdict,
            "Note": ("Likely same label on re-run" if pers >= 0.7 else
                     "Label may shift slightly"    if pers >= 0.4 else
                     "May merge/split on re-run — consider merging with adjacent cluster"),
        })
    # Overall row goes first so the headline number is visible at the top.
    overall_row = pd.DataFrame([{
        "Cluster": "ALL",
        "Label": f"Overall ARI Stability across 3 seeds = {round(overall_stability,4)}",
        "Cluster Persistence": overall_stability,
        "Strong Members": "—", "Weak Members": "—", "Total Papers": "—",
        "Stability Verdict": "✅ Stable" if overall_stability >= 0.8 else
                             "⚠ Borderline" if overall_stability >= 0.5 else "❌ Unstable",
        "Note": "ARI close to 1.0 → running the pipeline again will produce the same clusters",
    }])
    if not rows:
        # No clusters: a column-less frame cannot be sorted by
        # "Cluster Persistence" (KeyError), so return the summary row alone.
        return overall_row
    df = pd.DataFrame(rows).sort_values("Cluster Persistence", ascending=False)
    return pd.concat([overall_row, df], ignore_index=True)


def _reproducibility_chart(td: dict, interps: dict) -> go.Figure:
    """Horizontal bar chart of cluster persistence.

    Bars are ordered by ascending persistence and coloured green (>= 0.7),
    orange (>= 0.4) or red (below 0.4); dotted lines mark both thresholds.

    Args:
        td: Topic data; its optional ``cluster_persistence`` maps
            cluster id -> score (missing clusters count as 0).
        interps: Maps a cluster id to a dict with a ``label`` (cut to 35
            characters on the axis).

    Returns:
        Plotly figure whose height grows with the number of clusters.
    """
    cluster_persistence = td.get("cluster_persistence", {})
    labels, persis, colors = [], [], []
    for cid in sorted(interps.keys(), key=lambda c: cluster_persistence.get(c,0)):
        p = cluster_persistence.get(cid, 0.0)
        labels.append(interps[cid]["label"][:35])
        persis.append(round(p, 4))
        colors.append("#3dba7a" if p >= 0.7 else "#f5a623" if p >= 0.4 else "#e04d4d")

    fig = go.Figure(go.Bar(
        x=persis, y=labels, orientation="h",
        marker_color=colors,
        text=[str(v) for v in persis],
        textposition="outside",
    ))
    fig.add_vline(x=0.7, line_dash="dot", line_color="#3dba7a",
                  annotation_text="Stable threshold (0.7)")
    fig.add_vline(x=0.4, line_dash="dot", line_color="#f5a623",
                  annotation_text="Borderline (0.4)")
    fig.update_layout(
        template="plotly_dark", height=max(400, len(labels)*28),
        paper_bgcolor="#0d1117", plot_bgcolor="#161b22",
        # Plotly text does not render "\n" as a line break; "<br>" is
        # required to put the colour legend on a second title line.
        title="Cluster Persistence — Proxy for Reproducibility<br>"
              "Green ≥ 0.7 (stable) · Orange 0.4–0.7 (borderline) · Red < 0.4 (fragile)",
        xaxis_title="Persistence Score", yaxis_title="",
        font=dict(size=10), margin=dict(l=260),  # wide left margin for labels
    )
    return fig


# ── NEW: Human interpretability check ────────────────────────────────────────
def _interpretability_df(interps: dict) -> pd.DataFrame:
    """Flag cluster labels that overlap with each other or are too generic.

    Two checks are run on the labels:
      1. Label overlap — every pair of labels sharing >= 2 significant words
         (>= 3 shared words is reported as HIGH severity, otherwise MEDIUM).
      2. Vagueness — labels with no significant word left once the generic
         terms (e.g. 'systems', 'digital', 'data') are removed.

    Only lower-case alphabetic words of four or more letters are considered,
    so a label made solely of shorter tokens (e.g. acronyms) has no
    significant words and is reported as vague.

    Args:
        interps: Maps a cluster id to a dict with a ``label``.

    Returns:
        One row per finding (overlaps first, then vague labels), or a single
        "All Clear" row when nothing is flagged.
    """
    import itertools
    NOISE = {"the","and","for","with","using","based","from","that","are","this",
             "in","of","a","to","an","on","at","by","or","as","is","its","via",
             "systems","digital","information","management","driven"}
    VAGUE_SINGLES = {"systems","digital","data","information","analysis","research",
                     "study","approach","framework","model","methods","technology"}

    def _words(label: str) -> set:
        return set(re.findall(r"\b[a-z]{4,}\b", label.lower()))

    def _sig_words(label: str) -> set:
        return _words(label) - NOISE

    rows = []
    cids  = sorted(interps.keys())
    labels_map = {cid: interps[cid]["label"] for cid in cids}

    # Check every unordered pair once (combinations never repeats a pair).
    for cid_a, cid_b in itertools.combinations(cids, 2):
        la, lb   = labels_map[cid_a], labels_map[cid_b]
        overlap  = _sig_words(la) & _sig_words(lb)
        if len(overlap) >= 2:
            rows.append({
                "Issue":        "⚠ Label Overlap",
                "Cluster A":    cid_a,
                "Label A":      la,
                "Cluster B":    cid_b,
                "Label B":      lb,
                "Shared Words": ", ".join(sorted(overlap)),
                "Severity":     "HIGH — consider merging" if len(overlap) >= 3
                                else "MEDIUM — review distinctiveness",
                "Action":       "Check if these two clusters cover the same research theme. "
                                "If yes, increase min_cluster_size to force a merge.",
            })

    # Check each label for vagueness
    for cid in cids:
        label = labels_map[cid]
        if _sig_words(label) - VAGUE_SINGLES:
            continue  # at least one specific word: label is fine
        # Report the generic words found in the raw label. They are taken
        # before NOISE filtering (which would hide 'systems', 'digital', ...)
        # and sorted so the output is identical from run to run.
        vague = sorted(_words(label) & VAGUE_SINGLES)
        rows.append({
            "Issue":        "❌ Too Vague",
            "Cluster A":    cid,
            "Label A":      label,
            "Cluster B":    "—",
            "Label B":      "—",
            "Shared Words": ", ".join(vague),
            "Severity":     "HIGH — label is not human interpretable",
            "Action":       "Run optimization pass to refine the label, "
                            "or manually inspect keyphrases for more specific terms.",
        })

    if not rows:
        rows.append({
            "Issue": "✅ All Clear",
            "Cluster A": "—", "Label A": "All labels are distinct and specific",
            "Cluster B": "—", "Label B": "—",
            "Shared Words": "—", "Severity": "NONE",
            "Action": "Topic list is human interpretable and non-overlapping.",
        })

    return pd.DataFrame(rows)


# ── Pipeline runner ──────────────────────────────────────────────────────────
def _run(corpus_file, method_file, gk, mk, gek, n_trials, n_optimize,
         progress=gr.Progress(track_tqdm=True)):
    """Run the full pipeline and build every value shown in the UI tabs.

    Validates the upload and the three API keys, calls ``run_pipeline``,
    then converts its result dict into the summary markdown, Plotly figures
    and DataFrames that are wired to the ``run_btn.click`` outputs.

    Args:
        corpus_file: Uploaded corpus CSV; its ``.name`` is passed on as path.
        method_file: Optional uploaded methodology CSV; falsy means "none".
        gk: Groq API key; blank/None falls back to ``GROQ_API_KEY``.
        mk: Mistral API key; blank/None falls back to ``MISTRAL_API_KEY``.
        gek: Gemini API key; blank/None falls back to ``GEMINI_API_KEY``.
        n_trials: Number of trials; cast to ``int`` before use.
        n_optimize: Number of optimization passes; cast to ``int`` before use.
        progress: Gradio progress tracker used for status updates.

    Returns:
        A 32-item tuple. Its order must match the ``outputs`` list given to
        ``run_btn.click``.

    Raises:
        gr.Error: If no corpus file was uploaded, if any API key is missing,
            or if the pipeline result carries a truthy ``"error"`` entry.
    """
    if not corpus_file: raise gr.Error("Upload a Scopus corpus CSV first.")
    # A textbox value may arrive as None rather than ""; normalise before
    # stripping so a missing key reaches the friendly error below.
    gk  = (gk or "").strip()  or os.getenv("GROQ_API_KEY","")
    mk  = (mk or "").strip()  or os.getenv("MISTRAL_API_KEY","")
    gek = (gek or "").strip() or os.getenv("GEMINI_API_KEY","")
    if not all([gk,mk,gek]): raise gr.Error("All 3 API keys required.")

    method_path = method_file.name if method_file else None

    progress(0.05, desc="📥 Loading CSV…")
    progress(0.10, desc="🔬 Embedding corpus with SPECTER-2…")
    # Cast once so the pipeline call and the summary report the same numbers.
    trials_count = int(n_trials)
    passes_count = int(n_optimize)
    r = run_pipeline(corpus_file.name, gk, mk, gek,
                     trials_count, passes_count, method_path)
    if r.get("error"): raise gr.Error(r["error"])
    progress(0.85, desc="📊 Building outputs…")

    td, interps = r["topic_data"], r.get("interpretations",{})
    disc, met   = td["discipline"], td["metrics"]
    ar          = r.get("agreement_rates",{})
    rl          = r.get("refinement_log", [])

    def _s(ok): return "✅ PASS" if ok else "❌ FAIL"
    summary = (
        f"## Pipeline Complete — {disc['n_clusters']} clusters discovered\n\n"
        f"| Criterion | Value | Status |\n|---|---|---|\n"
        f"| Max cluster mass | {round(disc['max_mass_pct']*100,1)}% | {_s(disc['max_mass_ok'])} |\n"
        f"| Min cluster size | {disc['min_size']} | {_s(disc['min_size_ok'])} |\n"
        f"| Persistence (mean) | {round(met['persistence'],4)} | — |\n"
        f"| DBCV | {round(met['dbcv'],4)} | — |\n"
        f"| Stability (3 seeds) | {round(met['stability'],4)} | — |\n\n"
        f"**Trials:** {td['n_trials_run']} (best #{td['best_trial']}) · "
        f"**Agreement:** Triple {ar.get('triple',0)}% · Two+ {ar.get('two_or_more',0)}% · "
        f"**Optimization passes:** {passes_count} · **Labels refined:** {len(rl)}"
    )

    # UMAP scatter
    u2d = np.array(td["umap_2d"])
    sdf = pd.DataFrame({"UMAP-1":u2d[:,0],"UMAP-2":u2d[:,1],
        "Cluster":[str(l) for l in td["labels"]],
        "Doc":[d[:60] for d in td["documents"]]})
    fig = px.scatter(sdf, x="UMAP-1", y="UMAP-2", color="Cluster",
        hover_data=["Doc"], opacity=0.75,
        title="2-D UMAP visualisation of SPECTER-2 embeddings")
    fig.update_layout(template="plotly_dark", height=500,
        paper_bgcolor="#0d1117", plot_bgcolor="#161b22", font=dict(size=11))

    # Trial log + Pareto
    tl = pd.DataFrame(td["trial_log"])
    tl_cols = [c for c in ["trial","discipline_pass","n_clusters","persistence",
        "dbcv","max_mass_pct","min_size","n_noise"] if c in tl.columns]
    tl_show = tl[tl_cols] if not tl.empty else pd.DataFrame()

    # The table above treats every trial-log column as optional; apply the
    # same tolerance here so a missing column yields an empty Pareto plot
    # instead of a KeyError that aborts the whole run.
    pareto_cols = {"trial", "discipline_pass", "max_mass_pct", "persistence"}
    pfig = go.Figure()
    if not tl.empty and pareto_cols.issubset(tl.columns):
        for passed, color, name in [(True,"#3dba7a","PASS"),(False,"#e04d4d","FAIL")]:
            sub = tl[tl["discipline_pass"]==passed]
            if not sub.empty:
                pfig.add_trace(go.Scatter(x=sub["max_mass_pct"],y=sub["persistence"],
                    mode="markers",marker=dict(size=8,color=color),name=name,
                    text=sub["trial"],hovertemplate="Trial %{text}<br>Mass: %{x:.0%}<br>Pers: %{y:.3f}"))
        pfig.add_vline(x=0.25,line_dash="dash",line_color="#5a6480",annotation_text="25% rule")
    pfig.update_layout(template="plotly_dark",height=400,
        paper_bgcolor="#0d1117",plot_bgcolor="#161b22",
        title="Pareto front — Persistence vs Max cluster mass",
        xaxis_title="Max cluster mass",yaxis_title="Persistence",font=dict(size=11))

    # One table row per interpreted cluster, ordered by cluster id.
    cdf_rows = []
    for cid in sorted(interps.keys()):
        v = interps[cid]
        cdf_rows.append({"Cluster":cid,"Label":v["label"],"Agreement":v["agreement"],
            "Strong":v["strong"],"Weak":v["weak"],
            "Persistence":round(v.get("persistence",0),4),
            "Keyphrases":", ".join(v.get("keyphrases",[]))})
    cdf = pd.DataFrame(cdf_rows)

    sheets = r.get("sheets",{})
    s1 = pd.DataFrame(sheets.get(1,[])); s2 = pd.DataFrame(sheets.get(2,[]))
    s3 = pd.DataFrame(sheets.get(3,[])); s4 = pd.DataFrame(sheets.get(4,[]))
    sp = r.get("sheet_paths",{})
    mdf = pd.DataFrame(r.get("mismatch_table",[]))

    md_data  = r.get("methodology_data",{})
    top_papers_df    = _top_papers_df(r.get("top_papers",{}))
    method_sum_df    = _methodology_summary_df(md_data, interps)
    method_chart     = _methodology_bar_chart(md_data, interps)
    extraction_df    = _extraction_pipeline_df(md_data, interps)
    per_llm_meth_df  = _per_llm_methodology_df(md_data, interps)
    regex_hits_df    = _regex_hits_df(md_data, interps)
    pattern_info     = _regex_pattern_info()
    refine_df        = _refinement_df(rl)

    # ── NEW: methodology-CSV outputs ─────────────────────────────────────────
    comp_sheets  = r.get("comp_technique_sheets", {1:[], 2:[], 3:[], 4:[]})
    jct          = r.get("journal_crosstab", {})
    tech_opt_log = r.get("technique_opt_log", [])

    tech_s1 = _tech_sheet_df(comp_sheets.get(1,[]))
    tech_s2 = _tech_sheet_df(comp_sheets.get(2,[]))
    tech_s3 = _tech_sheet_df(comp_sheets.get(3,[]))
    tech_s4 = _tech_sheet_df(comp_sheets.get(4,[]))

    tech_llm_chart    = _tech_llm_pct_chart(comp_sheets)
    jct_chart         = _journal_crosstab_chart(jct)
    jct_df            = _journal_crosstab_df(jct)
    per_llm_freq_df   = _per_llm_freq_df(jct)
    tech_opt_df       = _tech_opt_df(tech_opt_log)

    # ── NEW: cluster sizes, reproducibility, interpretability ─────────────────
    cluster_sizes_fig   = _cluster_sizes_chart(interps, disc)
    repro_chart         = _reproducibility_chart(td, interps)
    repro_df            = _reproducibility_df(td, interps)
    interpretability_df = _interpretability_df(interps)

    progress(1.0, desc="✅ Done!")
    # Offer only the files the pipeline actually reported a path for.
    dl_files = [f for f in [sp.get(1),sp.get(2),sp.get(3),sp.get(4),r.get("json_path")] if f]

    return (
        # ── original outputs (order preserved) ───────────────────────────────
        summary, fig, pfig, tl_show, cdf,
        top_papers_df,
        method_chart, method_sum_df, extraction_df, per_llm_meth_df,
        regex_hits_df, pattern_info,
        refine_df,
        s1, s2, s3, s4,
        dl_files if dl_files else None,
        mdf,
        # ── new outputs ───────────────────────────────────────────────────────
        tech_llm_chart,
        tech_s1, tech_s2, tech_s3, tech_s4,
        per_llm_freq_df,
        jct_chart,
        jct_df,
        tech_opt_df,
        # ── supervisor additions ──────────────────────────────────────────────
        cluster_sizes_fig,
        repro_chart,
        repro_df,
        interpretability_df,
    )


# ── UI ────────────────────────────────────────────────────────────────────────
# Custom stylesheet handed to gr.Blocks: container colours plus a hidden footer.
css = "".join((
    ".gradio-container{background:#0d1117!important;color:#c9d1d9!important}",
    "footer{display:none!important}",
))

# Build the whole Gradio layout: a sidebar column (uploads, API keys, run
# parameters) beside a tabbed results column, then wire the callbacks.
# Component creation order defines the on-screen order, so statements here
# must not be reordered.
with gr.Blocks(theme=gr.themes.Base(primary_hue="blue", neutral_hue="slate"),
               css=css, title="SPECTER-2 Topic Analyzer") as demo:
    gr.Markdown("# 📐 SPECTER-2 Topic Analyzer")

    with gr.Row():
        # ── Left sidebar ─────────────────────────────────────────────────────
        with gr.Column(scale=1):
            gr.Markdown("### 📄 Corpus CSV")
            file_in    = gr.File(label="Upload Scopus CSV (title + abstract)",
                                 file_types=[".csv"])
            preview_out = gr.Markdown("Upload a CSV to see stats.")

            gr.Markdown("### 🔬 Methodology CSV *(optional)*")
            method_file_in   = gr.File(label="Upload Methodology CSV (title, doi, methodology)",
                                       file_types=[".csv"])
            method_preview   = gr.Markdown("Upload methodology CSV to enable technique analysis.")

            # Keys typed here take precedence; `_run` falls back to the
            # matching environment variable when a box is left blank.
            gr.Markdown("### 🔑 API Keys")
            groq_in    = gr.Textbox(label="Groq API Key", type="password",
                            placeholder="or set GROQ_API_KEY env var")
            mistral_in = gr.Textbox(label="Mistral API Key", type="password",
                            placeholder="or set MISTRAL_API_KEY env var")
            gemini_in  = gr.Textbox(label="Gemini API Key", type="password",
                            placeholder="or set GEMINI_API_KEY env var")

            gr.Markdown("### ⚙ Parameters")
            trials_in   = gr.Slider(10, 100, 50, step=5, label="Optuna Trials")
            optimize_in = gr.Slider(1, 5, 1, step=1,
                            label="🔁 Optimization Passes",
                            info="Pass 1 = no refinement. 2–5 = LLM critic audits topic labels "
                                 "AND technique labels for hallucinations + improvements.")
            run_btn = gr.Button("▶ Run Full Pipeline", variant="primary", size="lg")

        # ── Main panel ────────────────────────────────────────────────────────
        with gr.Column(scale=3):
            with gr.Tabs():

                # ── original tabs (order / content unchanged) ─────────────────
                with gr.Tab("Summary"):
                    summary_out = gr.Markdown()

                with gr.Tab("2-D UMAP"):
                    scatter_out = gr.Plot()

                with gr.Tab("Pareto Front"):
                    pareto_out = gr.Plot()

                with gr.Tab("Trial Log"):
                    trial_out = gr.Dataframe()

                with gr.Tab("Clusters"):
                    cluster_out = gr.Dataframe()

                with gr.Tab("🗞 Top 3 Papers"):
                    gr.Markdown("### Top 3 Representative Papers per Cluster\n"
                                "Ranked by cosine similarity to cluster centroid "
                                "in SPECTER-2 embedding space.")
                    top_papers_out = gr.Dataframe(
                        headers=["Cluster","Label","Rank","Title","Abstract Snippet"],
                        wrap=True)

                with gr.Tab("🔬 Cluster Methodology"):
                    gr.Markdown("### Cluster-Level Methodology — 3-LLM Council\n"
                                "Derived from representative abstracts per cluster. "
                                "≥2-LLM gate applied.")
                    method_chart_out   = gr.Plot()
                    method_summary_out = gr.Dataframe(wrap=True)

                with gr.Tab("⚙ Cluster Extraction Pipeline"):
                    gr.Markdown("### Full Regex + LLM Extraction Trace (per cluster)")
                    extraction_out = gr.Dataframe(wrap=True)

                with gr.Tab("🤖 Cluster Per-LLM Votes"):
                    gr.Markdown("### Raw Per-LLM Methodology Votes (per cluster)")
                    per_llm_out = gr.Dataframe(wrap=True)

                with gr.Tab("🔍 Cluster Regex Hits"):
                    gr.Markdown("### Regex Pattern Matches (per cluster)\n"
                                "Every match with exact character span and paper number.")
                    regex_hits_out = gr.Dataframe(wrap=True)
                    regex_info_out = gr.Markdown()

                with gr.Tab("🔁 Refinement Log"):
                    gr.Markdown("### Topic Label Optimization Log\n"
                                "Changes made by LLM critic per optimization pass.")
                    refine_out = gr.Dataframe(wrap=True)

                with gr.Tab("Sheet 1 — Groq"):    s1_out = gr.Dataframe()
                with gr.Tab("Sheet 2 — Mistral"): s2_out = gr.Dataframe()
                with gr.Tab("Sheet 3 — Gemini"):  s3_out = gr.Dataframe()
                with gr.Tab("Sheet 4 — Consolidated"): s4_out = gr.Dataframe()
                with gr.Tab("RQ Mismatch"):        mismatch_out = gr.Dataframe()
                with gr.Tab("Downloads"):
                    dl_out = gr.File(label="All sheet CSVs + topics.json",
                                     file_count="multiple")

                # ── NEW tabs: methodology CSV pipeline ────────────────────────
                with gr.Tab("💻 Comp. Techniques — LLM % Chart"):
                    gr.Markdown("### Computational Technique Frequency — Methodology CSV\n"
                                "For each technique, shows the % of papers it was extracted "
                                "from by each of the 3 LLMs independently + the consolidated "
                                "result (≥2-LLM gate). Bars grouped by technique.")
                    tech_llm_chart_out = gr.Plot()

                with gr.Tab("💻 Tech Sheet 1 — Groq"):
                    gr.Markdown("### Groq raw technique extraction — one row per paper")
                    tech_s1_out = gr.Dataframe(wrap=True)

                with gr.Tab("💻 Tech Sheet 2 — Mistral"):
                    gr.Markdown("### Mistral raw technique extraction — one row per paper")
                    tech_s2_out = gr.Dataframe(wrap=True)

                with gr.Tab("💻 Tech Sheet 3 — Gemini"):
                    gr.Markdown("### Gemini raw technique extraction — one row per paper")
                    tech_s3_out = gr.Dataframe(wrap=True)

                with gr.Tab("💻 Tech Sheet 4 — Consolidated"):
                    gr.Markdown("### Consolidated techniques — ≥2-LLM agreement, one row per paper")
                    tech_s4_out = gr.Dataframe(wrap=True)

                with gr.Tab("📊 Tech Frequency by LLM"):
                    gr.Markdown("### Per-LLM Technique Frequency Table\n"
                                "% of all papers where each LLM extracted each technique. "
                                "High variance = LLMs disagree → optimization flag.")
                    per_llm_freq_out = gr.Dataframe(wrap=True)

                with gr.Tab("🗂 Journal Cross-Tabulation"):
                    gr.Markdown("### Technique × Journal Cross-Tabulation\n"
                                "Rows = journals auto-detected from DOI/title. "
                                "Columns = consolidated techniques. "
                                "Values = % of papers in that journal using the technique.\n\n"
                                "**Journals detected:** MISQ, JAIS, ISR, JMIS, PAJAIS, "
                                "ECIS, ICIS, Other.")
                    jct_chart_out = gr.Plot()
                    jct_df_out    = gr.Dataframe(wrap=True)

                with gr.Tab("🔧 Technique Optimization"):
                    gr.Markdown("### Technique Label Improvement Suggestions\n"
                                "Groq critic flags: hallucination, high inter-LLM variance "
                                "(>15% gap), split/merge recommendations.\n"
                                "Only runs when Optimization Passes ≥ 2.")
                    tech_opt_out = gr.Dataframe(wrap=True)

                # ── Supervisor-requested additions ────────────────────────────
                with gr.Tab("📊 Cluster Sizes"):
                    gr.Markdown(
                        "### Cluster Sizes (Papers per Cluster)\n"
                        "Exact chart your supervisor highlighted. "
                        "**Green** = passes both discipline rules (mass ≤ 25%, size ≥ 5). "
                        "**Yellow** = cluster exceeds 25% mass cap — dominant cluster warning. "
                        "**Red** = cluster has fewer than 5 papers — too small.\n\n"
                        "The orange dashed line marks the 25% cap. Any bar above it "
                        "will fail the discipline check and the pipeline will re-optimise."
                    )
                    cluster_sizes_out = gr.Plot()

                with gr.Tab("🔄 Reproducibility"):
                    gr.Markdown(
                        "### Reproducibility — 'Run Again and Again, Topic List is the Same'\n\n"
                        "Your supervisor wants proof that running the pipeline multiple times "
                        "produces the **same clusters**. This tab shows two measures:\n\n"
                        "**Overall ARI Stability** (top row) — Adjusted Rand Index averaged "
                        "across 3 random seeds. ARI = 1.0 means identical clusters every run. "
                        "ARI ≥ 0.8 is considered stable for publication.\n\n"
                        "**Cluster Persistence** (per row) — how strongly each cluster's "
                        "structure is preserved in the condensed HDBSCAN tree. "
                        "High persistence → cluster survives parameter variation → "
                        "same label will appear on re-run. "
                        "Low persistence → cluster may split or merge → label may change.\n\n"
                        "🟢 ≥ 0.7 Stable · 🟡 0.4–0.7 Borderline · 🔴 < 0.4 Fragile"
                    )
                    repro_chart_out = gr.Plot()
                    repro_df_out    = gr.Dataframe(wrap=True)

                with gr.Tab("🧠 Interpretability Check"):
                    gr.Markdown(
                        "### Human Interpretability Check — 'Topic List Must Be Distinct'\n\n"
                        "Your supervisor flagged that labels like "
                        "*'Cybersecurity and Privacy'* and *'Cyber-Risk Management and Online Security'* "
                        "look like the same topic. This tab automatically detects:\n\n"
                        "**⚠ Label Overlap** — pairs of cluster labels sharing ≥ 2 significant "
                        "words (noise words like 'and', 'for', 'in' excluded). "
                        "Overlapping labels suggest the two clusters may cover the same theme "
                        "and should be reviewed for merging.\n\n"
                        "**❌ Too Vague** — labels where all meaningful words are generic "
                        "('systems', 'digital', 'data') with no domain-specific content. "
                        "These need the optimization pass to refine them.\n\n"
                        "**Action column** tells you exactly what to do for each flag."
                    )
                    interpretability_out = gr.Dataframe(wrap=True)

    # ── Wire callbacks ────────────────────────────────────────────────────────
    # Each upload component re-renders its own preview panel on change.
    file_in.change(_preview,            inputs=[file_in],        outputs=[preview_out])
    method_file_in.change(_preview_methodology, inputs=[method_file_in], outputs=[method_preview])

    # NOTE: `inputs` must follow `_run`'s parameter order, and `outputs` must
    # follow the order of the tuple `_run` returns (32 components) — both are
    # matched positionally.
    run_btn.click(
        _run,
        inputs=[file_in, method_file_in, groq_in, mistral_in, gemini_in,
                trials_in, optimize_in],
        outputs=[
            # original
            summary_out, scatter_out, pareto_out, trial_out, cluster_out,
            top_papers_out,
            method_chart_out, method_summary_out, extraction_out, per_llm_out,
            regex_hits_out, regex_info_out,
            refine_out,
            s1_out, s2_out, s3_out, s4_out,
            dl_out, mismatch_out,
            # new
            tech_llm_chart_out,
            tech_s1_out, tech_s2_out, tech_s3_out, tech_s4_out,
            per_llm_freq_out,
            jct_chart_out,
            jct_df_out,
            tech_opt_out,
            # supervisor additions
            cluster_sizes_out,
            repro_chart_out,
            repro_df_out,
            interpretability_out,
        ],
    )

# Script entry point: start the Gradio server on 0.0.0.0:7860.
if __name__ == "__main__":
    demo.launch(
        server_port=7860,
        server_name="0.0.0.0",
    )