BHAVIKBANKER's picture
Update app.py
e161996 verified
"""
app.py — Gradio UI entry point.
ORIGINAL structure and all tabs preserved.
NEW: second file upload for methodology CSV, technique sheets 1-4,
journal cross-tabulation chart + table, technique optimisation log.
"""
import os, json
import re
import pandas as pd, numpy as np
import gradio as gr
import plotly.express as px
import plotly.graph_objects as go
from agent import run_pipeline, METHODOLOGY_PATTERNS, TECHNIQUE_PATTERNS
# ── CSV preview ──────────────────────────────────────────────────────────────
def _preview(file):
    """Build a Markdown health report for an uploaded corpus CSV.

    Column names are lower-cased before the check, so ``Title`` / ``Abstract``
    headers are accepted.

    Args:
        file: Upload handle whose ``.name`` attribute is the CSV path; a falsy
            value means nothing has been uploaded yet.

    Returns:
        Markdown text with the entry count, a per-column presence / blank-row
        table, and the number of rows where both title and abstract are set.
    """
    if not file:
        return "Upload a Scopus CSV to begin."

    df = pd.read_csv(file.name)
    df.columns = df.columns.str.lower()
    has_t = "title" in df.columns
    has_a = "abstract" in df.columns
    n = len(df)
    blanks_t = int(df["title"].isna().sum()) if has_t else n
    blanks_a = int(df["abstract"].isna().sum()) if has_a else n

    # A paper is usable only when BOTH fields are filled on the same row.
    # Subtracting max(blanks_t, blanks_a) overcounts whenever the blank titles
    # and blank abstracts sit on different rows.
    if has_t and has_a:
        usable = int(df[["title", "abstract"]].notna().all(axis=1).sum())
    else:
        usable = 0

    ok = "✅" if usable > 0 else "❌"
    return (f"## {ok} CSV loaded — {n} entries\n\n"
            f"| Column | Present | Blank rows |\n|---|---|---|\n"
            f"| title | {'✅' if has_t else '❌'} | {blanks_t} |\n"
            f"| abstract | {'✅' if has_a else '❌'} | {blanks_a} |\n\n"
            f"**Usable papers:** {usable} / {n}")
def _preview_methodology(file):
    """Build a Markdown column check for the optional methodology CSV.

    Column names are lower-cased before the check. ``title`` and
    ``methodology`` drive the overall status icon; ``doi`` is reported as
    optional when missing.

    Args:
        file: Upload handle whose ``.name`` attribute is the CSV path; a falsy
            value means nothing has been uploaded yet.

    Returns:
        Markdown text with the paper count and a column-presence table.
    """
    if not file:
        return "Upload methodology CSV (title, doi, methodology) to enable technique analysis."

    df = pd.read_csv(file.name)
    df.columns = df.columns.str.lower()
    has_t = "title" in df.columns
    has_m = "methodology" in df.columns
    has_d = "doi" in df.columns
    n = len(df)
    ok = "✅" if has_t and has_m else "❌"
    # The status glyphs were mis-encoded UTF-8 and rendered as garbage in the UI.
    return (f"## {ok} Methodology CSV — {n} papers\n\n"
            f"| Column | Present |\n|---|---|\n"
            f"| title | {'✅' if has_t else '❌'} |\n"
            f"| doi | {'✅' if has_d else '⚠ optional'} |\n"
            f"| methodology | {'✅' if has_m else '❌'} |\n\n"
            f"Journals will be auto-detected from DOI + title.")
# ── Original helper builders ─────────────────────────────────────────────────
def _top_papers_df(top_papers: dict) -> pd.DataFrame:
    """Flatten the per-cluster representative papers into one table.

    Args:
        top_papers: Mapping of cluster id to a list of paper dicts carrying
            ``cluster_label``, ``rank``, ``title`` and ``abstract_snippet``.

    Returns:
        One row per paper, clusters visited in ascending id order.
    """
    records = [
        {
            "Cluster": cluster_id,
            "Label": paper["cluster_label"],
            "Rank": paper["rank"],
            "Title": paper["title"],
            "Abstract Snippet": paper["abstract_snippet"],
        }
        for cluster_id in sorted(top_papers)
        for paper in top_papers[cluster_id]
    ]
    return pd.DataFrame(records)
def _methodology_summary_df(methodology_data: dict, interps: dict) -> pd.DataFrame:
    """Summarise the consolidated methodology findings, one row per cluster.

    Args:
        methodology_data: Mapping of cluster id to that cluster's methodology
            dict (dominant method/technique, orientation percentages, accepted
            ``methodologies`` / ``techniques`` lists, regex consensus lists).
        interps: Mapping of cluster id to interpretation dict; only ``label``
            is read, with ``"Cluster <id>"`` as the fallback.

    Returns:
        DataFrame with one row per cluster in ascending id order. Missing
        values fall back to ``"—"`` (text fields) or ``0`` (percentages).
    """
    def _describe(items):
        # "name (pct%, agreement)" for every accepted item, comma separated.
        return ", ".join(
            f"{it['name']} ({it['pct']}%, {it['agreement']})" for it in items)

    rows = []
    for cid in sorted(methodology_data):
        md = methodology_data[cid]
        rows.append({
            "Cluster": cid,
            "Label": interps.get(cid, {}).get("label", f"Cluster {cid}"),
            # Placeholders and headers were mis-encoded UTF-8; restored here.
            "Dominant Method": md.get("dominant_method", "—"),
            "Dominant Technique": md.get("dominant_technique", "—"),
            "Empirical %": md.get("empirical_pct", 0),
            "Theoretical %": md.get("theoretical_pct", 0),
            "Mixed %": md.get("mixed_pct", 0),
            "Methods (≥2 LLMs)": _describe(md.get("methodologies", [])),
            "Techniques (≥2 LLMs)": _describe(md.get("techniques", [])),
            "Regex Confirmed": ", ".join(md.get("regex_confirmed_consensus", [])) or "—",
            "Regex Rejected": ", ".join(md.get("regex_rejected_consensus", [])) or "—",
        })
    return pd.DataFrame(rows)
def _extraction_pipeline_df(methodology_data: dict, interps: dict) -> pd.DataFrame:
    """Trace every extracted method/technique through regex scan and LLM gate.

    For each cluster the accepted methods, accepted techniques, rejected
    methods and rejected techniques are listed in that order.

    Args:
        methodology_data: Mapping of cluster id to methodology dict. Reads
            ``regex_scan`` (``methods`` / ``techniques`` hit lists keyed by
            name) and the four item lists ``methodologies``, ``techniques``,
            ``rejected_methods``, ``rejected_techniques``.
        interps: Mapping of cluster id to interpretation dict; only ``label``
            is read.

    Returns:
        One row per item, or an empty DataFrame when there are no items.
    """
    # (source list, row type, gate verdict). The type comes from the list the
    # item was read from; testing `item in methodologies` compares dicts by
    # value and mislabels a technique that equals a method entry.
    groups = (
        ("methodologies", "Method", "✅ ACCEPTED"),
        ("techniques", "Technique", "✅ ACCEPTED"),
        ("rejected_methods", "Method", "❌ REJECTED (single LLM)"),
        ("rejected_techniques", "Technique", "❌ REJECTED (single LLM)"),
    )

    rows = []
    for cid in sorted(methodology_data):
        md = methodology_data[cid]
        label = interps.get(cid, {}).get("label", f"Cluster {cid}")
        scan = md.get("regex_scan", {})
        method_hits = scan.get("methods", {})
        technique_hits = scan.get("techniques", {})

        for list_key, item_type, gate in groups:
            for item in md.get(list_key, []):
                name = item["name"]
                # A name may have fired in either pattern bank.
                regex_hits = method_hits.get(name, []) or technique_hits.get(name, [])
                if regex_hits:
                    # De-duplicate matched text in first-seen order, cap at 80 chars.
                    matched = ", ".join(dict.fromkeys(h["match"] for h in regex_hits))[:80]
                else:
                    matched = "—"
                rows.append({
                    "Cluster": cid,
                    "Label": label,
                    "Item": name,
                    "Type": item_type,
                    "Regex Match": matched,
                    "Regex Fired": "✅" if regex_hits else "❌",
                    "LLM Votes": item["llm_votes"],
                    "Agreement": item["agreement"],
                    "Avg Pct (%)": item["pct"],
                    "Evidence": item.get("evidence", "—"),
                    "Gate Passed": gate,
                })
    return pd.DataFrame(rows) if rows else pd.DataFrame()
def _per_llm_methodology_df(methodology_data: dict, interps: dict) -> pd.DataFrame:
    """Show each LLM's raw methodology vote, one row per cluster.

    Args:
        methodology_data: Mapping of cluster id to methodology dict; reads
            ``llm_raw`` keyed by ``groq`` / ``mistral`` / ``gemini``.
        interps: Mapping of cluster id to interpretation dict; only ``label``
            is read.

    Returns:
        DataFrame with per-LLM method lists, technique lists and
        empirical/theoretical/mixed triples. Missing data renders as ``"—"``
        for lists and ``0`` inside the triples.
    """
    llms = (("Groq", "groq"), ("Mistral", "mistral"), ("Gemini", "gemini"))

    def _fmt(raw_llm, key):
        # "name (pct%)" joined with " | "; em dash when the LLM listed nothing.
        return " | ".join(
            f"{i['name']} ({i.get('pct', 0)}%)" for i in raw_llm.get(key, [])) or "—"

    def _etm(raw_llm):
        return (f"{raw_llm.get('empirical_pct', 0)}/"
                f"{raw_llm.get('theoretical_pct', 0)}/"
                f"{raw_llm.get('mixed_pct', 0)}")

    rows = []
    for cid in sorted(methodology_data):
        raw = methodology_data[cid].get("llm_raw", {})
        row = {"Cluster": cid,
               "Label": interps.get(cid, {}).get("label", f"Cluster {cid}")}
        # Three passes keep the column order: methods, techniques, then E/T/M.
        for title, key in llms:
            row[f"{title} Methods"] = _fmt(raw.get(key, {}), "methodologies")
        for title, key in llms:
            row[f"{title} Techniques"] = _fmt(raw.get(key, {}), "techniques")
        for title, key in llms:
            row[f"{title} E/T/M"] = _etm(raw.get(key, {}))
        rows.append(row)
    return pd.DataFrame(rows)
def _regex_hits_df(methodology_data: dict, interps: dict) -> pd.DataFrame:
    """List every recorded regex hit, one row per match.

    Args:
        methodology_data: Mapping of cluster id to methodology dict; reads
            ``regex_scan["methods"]`` and ``regex_scan["techniques"]``, each a
            mapping of pattern category to hit dicts with ``match``, ``doc``
            and a two-element ``span``.
        interps: Mapping of cluster id to interpretation dict; only ``label``
            is read.

    Returns:
        One row per hit (methodology bank first, then technique bank, per
        cluster), or an empty DataFrame when nothing matched.
    """
    banks = (("methods", "Methodology"), ("techniques", "Technique"))
    rows = []
    for cid in sorted(methodology_data):
        scan = methodology_data[cid].get("regex_scan", {})
        label = interps.get(cid, {}).get("label", f"Cluster {cid}")
        for scan_key, bank in banks:
            for category, hits in scan.get(scan_key, {}).items():
                for h in hits:
                    rows.append({
                        "Cluster": cid,
                        "Label": label,
                        "Bank": bank,
                        "Pattern Category": category,
                        "Matched Text": h["match"],
                        "Paper #": h["doc"],
                        # Span separator is an en dash (was mis-encoded).
                        "Char Span": f"{h['span'][0]}–{h['span'][1]}",
                    })
    return pd.DataFrame(rows) if rows else pd.DataFrame()
def _methodology_bar_chart(methodology_data: dict, interps: dict) -> go.Figure:
    """Stacked bar chart of research orientation per cluster.

    Args:
        methodology_data: Mapping of cluster id to methodology dict; reads
            ``empirical_pct``, ``theoretical_pct`` and ``mixed_pct``
            (each defaults to 0).
        interps: Mapping of cluster id to interpretation dict; ``label`` is
            used for the x-axis tick, truncated to 30 characters, with
            ``"C<id>"`` as the fallback.

    Returns:
        Plotly figure with one stacked bar per cluster in ascending id order.
    """
    series = (
        ("Empirical %", "empirical_pct", "#3dba7a"),
        ("Theoretical %", "theoretical_pct", "#5b9cf6"),
        ("Mixed %", "mixed_pct", "#f5a623"),
    )
    cids = sorted(methodology_data)
    tick_labels = [interps.get(cid, {}).get("label", f"C{cid}")[:30] for cid in cids]

    fig = go.Figure()
    for trace_name, key, color in series:
        values = [methodology_data[cid].get(key, 0) for cid in cids]
        fig.add_trace(go.Bar(name=trace_name, x=tick_labels, y=values, marker_color=color))

    fig.update_layout(
        barmode="stack", template="plotly_dark", height=420,
        paper_bgcolor="#0d1117", plot_bgcolor="#161b22",
        # The em dash in the title was mis-encoded and rendered as garbage.
        title="Research Orientation per Cluster — Averaged across Groq + Mistral + Gemini",
        xaxis_title="Cluster", yaxis_title="Percentage (%)",
        font=dict(size=11), legend=dict(orientation="h", y=1.12), xaxis_tickangle=-35)
    return fig
def _refinement_df(rl: list) -> pd.DataFrame:
    """Tabulate the label-refinement log.

    Args:
        rl: List of refinement records (``cluster``, ``iteration``,
            ``old_label``, ``new_label``, optional ``issues`` list,
            ``improvement_score``, ``hallucination_detected``). May be empty.

    Returns:
        One row per record; an empty frame with the same headers when the log
        is empty.
    """
    headers = ["Cluster", "Iteration", "Old Label", "New Label",
               "Issues", "Improvement", "Hallucination Detected"]
    if not rl:
        return pd.DataFrame(columns=headers)

    records = []
    for entry in rl:
        issues_text = "; ".join(entry.get("issues", []))
        records.append({
            "Cluster": entry["cluster"],
            "Iteration": entry["iteration"],
            "Old Label": entry["old_label"],
            "New Label": entry["new_label"],
            "Issues": issues_text,
            "Improvement": entry["improvement_score"],
            "Hallucination Detected": entry["hallucination_detected"],
        })
    return pd.DataFrame(records)
def _regex_pattern_info() -> str:
    """Return Markdown explaining the extraction steps and listing both pattern banks.

    The bank listings are generated from ``METHODOLOGY_PATTERNS`` and
    ``TECHNIQUE_PATTERNS``: one bullet per entry showing the key and the
    ``.pattern`` attribute of its value.
    """
    def _bullets(bank):
        return "\n".join(f"- **{k}**: `{v.pattern}`" for k, v in bank.items())

    m_list = _bullets(METHODOLOGY_PATTERNS)
    t_list = _bullets(TECHNIQUE_PATTERNS)
    # The em dashes and >= signs below were mis-encoded and rendered as garbage.
    return (
        "### How Cluster Methodology Extraction Works\n\n"
        "**Step 1 — Regex Pre-Scan:** Two compiled pattern banks run against representative "
        "abstracts. Every match recorded with exact character span, matched text, paper number.\n\n"
        "**Step 2 — 3-LLM Council:** Groq, Mistral, Gemini each receive regex evidence + abstracts. "
        "Each LLM confirms/rejects regex hits and adds any missed methods/techniques.\n\n"
        "**Step 3 — ≥2-LLM Gate:** Only items named by ≥2 LLMs survive. Percentages averaged.\n\n"
        "**Step 4 — Orientation:** Empirical/Theoretical/Mixed averaged across 3 LLMs.\n\n"
        "---\n\n#### Methodology Bank\n" + m_list +
        "\n\n#### Technique Bank\n" + t_list)
# ── NEW helpers for methodology-CSV pipeline ─────────────────────────────────
def _tech_sheet_df(sheet_rows: list) -> pd.DataFrame:
    """Turn a list of technique-sheet row dicts into a DataFrame (empty frame for no rows)."""
    if not sheet_rows:
        return pd.DataFrame()
    return pd.DataFrame(sheet_rows)
def _tech_llm_pct_chart(comp_sheets: dict) -> go.Figure:
    """Grouped bar chart of technique frequency per extraction sheet.

    For each technique, shows the percentage of rows (papers) listing it in
    sheets 1-4, plotted as Groq, Mistral, Gemini and Consolidated.

    Args:
        comp_sheets: Mapping of sheet number (1-4) to a list of row dicts,
            each with a comma-separated ``techniques`` string.

    Returns:
        Plotly figure with one bar group per technique, sorted by name.
    """
    def _freq(rows):
        # Percentage of rows in which each technique appears.
        counts = {}
        n = len(rows) or 1
        for row in rows:
            raw = row.get("techniques", "")
            if not isinstance(raw, str):
                continue  # blank / NaN cell
            # A set per row: a technique listed twice for the same paper must
            # not push the "% of papers" figure above 100.
            found = {part.strip().title() for part in raw.split(",")}
            # "—" is the empty-cell placeholder; the comparison used to be
            # against a mis-encoded string and never matched.
            found -= {"", "—"}
            for tech in found:
                counts[tech] = counts.get(tech, 0) + 1
        return {k: round(v / n * 100) for k, v in counts.items()}

    series = (
        ("Groq", 1, "#5b9cf6"),
        ("Mistral", 2, "#f5a623"),
        ("Gemini", 3, "#a855f7"),
        ("Consolidated", 4, "#3dba7a"),
    )
    freqs = {sheet: _freq(comp_sheets.get(sheet, [])) for _, sheet, _ in series}
    all_techs = sorted(set().union(*freqs.values()))

    fig = go.Figure()
    for trace_name, sheet, color in series:
        fig.add_trace(go.Bar(name=trace_name, x=all_techs,
                             y=[freqs[sheet].get(t, 0) for t in all_techs],
                             marker_color=color))
    fig.update_layout(
        barmode="group", template="plotly_dark", height=480,
        paper_bgcolor="#0d1117", plot_bgcolor="#161b22",
        title="Computational Technique Frequency — % of Papers per LLM (Groq / Mistral / Gemini / Consolidated)",
        xaxis_title="Technique", yaxis_title="% of papers",
        font=dict(size=10), legend=dict(orientation="h", y=1.12), xaxis_tickangle=-40)
    return fig
def _journal_crosstab_chart(journal_crosstab: dict, max_techniques: int = 15) -> go.Figure:
    """Grouped bar chart of technique usage per journal.

    Journals are on the x-axis; each technique is one bar series.

    Args:
        journal_crosstab: Dict with ``journals`` (list), ``techniques`` (list)
            and ``consolidated`` (journal -> technique -> percentage).
        max_techniques: Number of techniques plotted, taken from the front of
            the ``techniques`` list. Defaults to 15 to keep the legend readable.

    Returns:
        Plotly figure; an empty placeholder titled "No journal data available"
        when journals or techniques are missing.
    """
    ct = journal_crosstab.get("consolidated", {})
    journals = journal_crosstab.get("journals", [])
    techniques = journal_crosstab.get("techniques", [])

    fig = go.Figure()
    if not journals or not techniques:
        fig.update_layout(template="plotly_dark", title="No journal data available",
                          paper_bgcolor="#0d1117")
        return fig

    palette = ["#5b9cf6", "#3dba7a", "#f5a623", "#e04d4d", "#a855f7", "#06b6d4",
               "#f97316", "#84cc16", "#ec4899", "#14b8a6", "#8b5cf6", "#ef4444"]
    for i, tech in enumerate(techniques[:max_techniques]):
        pcts = [ct.get(j, {}).get(tech, 0) for j in journals]
        # Colours repeat once there are more series than palette entries.
        fig.add_trace(go.Bar(name=tech, x=journals, y=pcts,
                             marker_color=palette[i % len(palette)]))
    fig.update_layout(
        barmode="group", template="plotly_dark", height=500,
        paper_bgcolor="#0d1117", plot_bgcolor="#161b22",
        # The em dash in the title was mis-encoded and rendered as garbage.
        title="Computational Technique Usage — Cross-Tabulation by Journal (%)",
        xaxis_title="Journal", yaxis_title="% of papers using technique",
        font=dict(size=10), legend=dict(orientation="h", y=1.15), xaxis_tickangle=-20)
    return fig
def _journal_crosstab_df(journal_crosstab: dict) -> pd.DataFrame:
    """Tabulate technique usage per journal as percentage strings.

    Args:
        journal_crosstab: Dict with ``journals``, ``techniques``,
            ``consolidated`` (journal -> technique -> percentage) and
            ``journal_paper_counts`` (journal -> count).

    Returns:
        One row per journal: name, paper count, and one ``"<pct>%"`` column
        per technique (missing cells read ``"0%"``).
    """
    usage = journal_crosstab.get("consolidated", {})
    journal_names = journal_crosstab.get("journals", [])
    technique_names = journal_crosstab.get("techniques", [])
    counts = journal_crosstab.get("journal_paper_counts", {})

    def _record(journal):
        per_journal = usage.get(journal, {})
        record = {"Journal": journal, "N Papers": counts.get(journal, 0)}
        record.update({tech: f"{per_journal.get(tech, 0)}%" for tech in technique_names})
        return record

    return pd.DataFrame([_record(journal) for journal in journal_names])
def _tech_opt_df(opt_log: list) -> pd.DataFrame:
    """Tabulate the technique-optimisation log.

    Args:
        opt_log: List of record dicts; every record must carry all the source
            keys listed in the mapping below. May be empty.

    Returns:
        One row per record with display headers; an empty frame with the same
        headers when the log is empty.
    """
    # (display header, record key) in output column order.
    field_map = (
        ("Technique", "technique"),
        ("Refined Name", "refined_name"),
        ("Hallucination", "is_hallucination"),
        ("High Variance", "high_variance"),
        ("Groq %", "pct_groq"),
        ("Mistral %", "pct_mistral"),
        ("Gemini %", "pct_gemini"),
        ("Suggestion", "suggestion"),
        ("Split Into", "split_into"),
        ("Merge With", "merge_with"),
    )
    if not opt_log:
        return pd.DataFrame(columns=[header for header, _ in field_map])
    return pd.DataFrame(
        [{header: entry[key] for header, key in field_map} for entry in opt_log])
def _per_llm_freq_df(journal_crosstab: dict) -> pd.DataFrame:
    """Per-LLM technique frequency table.

    Args:
        journal_crosstab: Dict whose ``per_llm_freq`` entry maps an LLM name
            (``Groq`` / ``Mistral`` / ``Gemini``) to technique -> percentage.

    Returns:
        One row per technique, sorted by ``Groq %`` descending. ``Variance``
        is the rounded spread (max minus min) of the three percentages. An
        empty frame with the same columns is returned when there is no data.
    """
    columns = ["Technique", "Groq %", "Mistral %", "Gemini %", "Variance"]
    per_llm = journal_crosstab.get("per_llm_freq", {})
    techniques = sorted({t for freqs in per_llm.values() for t in freqs})

    # With no rows the frame has no "Groq %" column and sort_values raises
    # KeyError; that is the normal case when no methodology CSV was uploaded.
    if not techniques:
        return pd.DataFrame(columns=columns)

    rows = []
    for t in techniques:
        groq = per_llm.get("Groq", {}).get(t, 0)
        mistral = per_llm.get("Mistral", {}).get(t, 0)
        gemini = per_llm.get("Gemini", {}).get(t, 0)
        rows.append({
            "Technique": t,
            "Groq %": groq,
            "Mistral %": mistral,
            "Gemini %": gemini,
            "Variance": round(max(groq, mistral, gemini) - min(groq, mistral, gemini)),
        })
    return pd.DataFrame(rows, columns=columns).sort_values("Groq %", ascending=False)
# ── NEW: Cluster Sizes bar chart (what supervisor pointed to) ────────────────
def _cluster_sizes_chart(interps: dict, disc: dict) -> go.Figure:
    """Bar chart of papers per cluster, coloured by discipline-rule status.

    Green  = within both constraints (mass ≤ 25%, size ≥ 5).
    Yellow = cluster holds more than 25% of the papers.
    Red    = cluster has fewer than 5 papers.
    The size is printed above each bar and a dashed line marks the 25% cap.

    Args:
        interps: Mapping of cluster id to interpretation dict; reads ``label``
            and, as a size fallback, ``strong`` + ``weak``.
        disc: Discipline dict; reads ``cluster_sizes`` (cluster id -> size).

    Returns:
        Plotly figure with one bar per cluster in ascending id order.
    """
    mass_cap = 0.25
    min_size = 5
    cluster_sizes = disc.get("cluster_sizes", {})

    labels, sizes = [], []
    for cid in sorted(interps):
        info = interps[cid]
        labels.append(info["label"])
        sizes.append(cluster_sizes.get(cid, info.get("strong", 0) + info.get("weak", 0)))

    # Prefer the pipeline's own totals. When they are absent, fall back to the
    # sizes being plotted; otherwise n_docs collapses to 1 and every cluster
    # is flagged as a mass violation.
    n_docs = sum(cluster_sizes.values()) or sum(sizes) or 1
    max_allowed = int(mass_cap * n_docs)

    colors = []
    for size in sizes:
        if size / n_docs > mass_cap:
            colors.append("#f5c518")   # yellow: mass violation
        elif size < min_size:
            colors.append("#e04d4d")   # red: too small
        else:
            colors.append("#3dba7a")   # green: pass

    fig = go.Figure(go.Bar(
        x=labels, y=sizes,
        marker_color=colors,
        text=[str(size) for size in sizes],
        textposition="outside",
        textfont=dict(size=11, color="#c9d1d9"),
    ))
    fig.add_hline(y=max_allowed, line_dash="dash", line_color="#f5a623",
                  annotation_text=f"25% cap ({max_allowed} papers)",
                  annotation_font_color="#f5a623")
    fig.update_layout(
        template="plotly_dark", height=520,
        paper_bgcolor="#0d1117", plot_bgcolor="#161b22",
        # Dash and middle dots in the title were mis-encoded.
        title="Cluster Sizes (Papers per Cluster) — Green=PASS · Yellow=Mass>25% · Red=Size<5",
        xaxis_title="Cluster", yaxis_title="Number of Papers",
        font=dict(size=10), xaxis_tickangle=-40,
        showlegend=False,
        margin=dict(t=80, b=200),
    )
    return fig
# ── NEW: Reproducibility panel ────────────────────────────────────────────────
def _reproducibility_df(td: dict, interps: dict) -> pd.DataFrame:
    """Tabulate cluster persistence as a proxy for run-to-run reproducibility.

    The first row reports the overall stability score read from
    ``td["metrics"]["stability"]``; the remaining rows list each cluster
    sorted by persistence, highest first.

    Verdict thresholds: per-cluster >= 0.7 stable, >= 0.4 borderline, else
    fragile; overall >= 0.8 stable, >= 0.5 borderline, else unstable.

    Args:
        td: Topic-data dict; reads ``cluster_persistence`` (cluster id ->
            score) and ``metrics["stability"]``, both defaulting to 0.
        interps: Mapping of cluster id to interpretation dict; reads
            ``label``, ``strong`` and ``weak``.

    Returns:
        DataFrame whose first row is the overall summary ("ALL"), followed by
        one row per cluster (none when ``interps`` is empty).
    """
    cluster_persistence = td.get("cluster_persistence", {})
    overall_stability = td.get("metrics", {}).get("stability", 0.0)

    def _verdict(pers):
        # Returns (verdict, note) for one cluster's persistence score.
        if pers >= 0.7:
            return "✅ Stable", "Likely same label on re-run"
        if pers >= 0.4:
            return "⚠ Borderline", "Label may shift slightly"
        return ("❌ Fragile",
                "May merge/split on re-run — consider merging with adjacent cluster")

    rows = []
    for cid in sorted(interps):
        info = interps[cid]
        pers = cluster_persistence.get(cid, 0.0)
        strong, weak = info.get("strong", 0), info.get("weak", 0)
        verdict, note = _verdict(pers)
        rows.append({
            "Cluster": cid,
            "Label": info["label"],
            "Cluster Persistence": round(pers, 4),
            "Strong Members": strong,
            "Weak Members": weak,
            "Total Papers": strong + weak,
            "Stability Verdict": verdict,
            "Note": note,
        })

    if overall_stability >= 0.8:
        overall_verdict = "✅ Stable"
    elif overall_stability >= 0.5:
        overall_verdict = "⚠ Borderline"
    else:
        overall_verdict = "❌ Unstable"

    overall_row = pd.DataFrame([{
        "Cluster": "ALL",
        "Label": f"Overall ARI Stability across 3 seeds = {round(overall_stability, 4)}",
        "Cluster Persistence": overall_stability,
        "Strong Members": "—", "Weak Members": "—", "Total Papers": "—",
        "Stability Verdict": overall_verdict,
        "Note": "ARI close to 1.0 → running the pipeline again will produce the same clusters",
    }])

    # With no clusters the frame has no columns and sort_values would raise
    # KeyError, so return just the overall row.
    if not rows:
        return overall_row
    df = pd.DataFrame(rows).sort_values("Cluster Persistence", ascending=False)
    return pd.concat([overall_row, df], ignore_index=True)
def _reproducibility_chart(td: dict, interps: dict) -> go.Figure:
    """Horizontal bar chart of cluster persistence, lowest score first.

    Bars are green at >= 0.7, orange at >= 0.4 and red below; dotted vertical
    lines mark both thresholds.

    Args:
        td: Topic-data dict; reads ``cluster_persistence`` (cluster id ->
            score, default 0).
        interps: Mapping of cluster id to interpretation dict; ``label`` is
            used for the bar name, truncated to 35 characters.

    Returns:
        Plotly figure whose height grows with the number of clusters
        (minimum 400 px).
    """
    cluster_persistence = td.get("cluster_persistence", {})
    ordered = sorted(interps, key=lambda c: cluster_persistence.get(c, 0))

    labels, persis, colors = [], [], []
    for cid in ordered:
        p = cluster_persistence.get(cid, 0.0)
        labels.append(interps[cid]["label"][:35])
        persis.append(round(p, 4))
        colors.append("#3dba7a" if p >= 0.7 else "#f5a623" if p >= 0.4 else "#e04d4d")

    fig = go.Figure(go.Bar(
        x=persis, y=labels, orientation="h",
        marker_color=colors,
        text=[str(v) for v in persis],
        textposition="outside",
    ))
    fig.add_vline(x=0.7, line_dash="dot", line_color="#3dba7a",
                  annotation_text="Stable threshold (0.7)")
    fig.add_vline(x=0.4, line_dash="dot", line_color="#f5a623",
                  annotation_text="Borderline (0.4)")
    fig.update_layout(
        template="plotly_dark", height=max(400, len(labels) * 28),
        paper_bgcolor="#0d1117", plot_bgcolor="#161b22",
        # Plotly text breaks lines on <br>, not on "\n"; the legend line of the
        # title would otherwise run on from the heading.
        title="Cluster Persistence — Proxy for Reproducibility<br>"
              "Green ≥ 0.7 (stable) · Orange 0.4–0.7 (borderline) · Red < 0.4 (fragile)",
        xaxis_title="Persistence Score", yaxis_title="",
        font=dict(size=10), margin=dict(l=260),
    )
    return fig
# ── NEW: Human interpretability check ────────────────────────────────────────
def _interpretability_df(interps: dict) -> pd.DataFrame:
    """Flag cluster labels that overlap with each other or are too generic.

    Two checks are run on the lower-cased words of four or more letters in
    each label:

    1. Label overlap: every pair of labels sharing at least two significant
       words (words not in the noise list). Three or more shared words is
       rated HIGH, two is MEDIUM.
    2. Vagueness: labels with no significant word left once the generic
       vocabulary is removed.

    Args:
        interps: Mapping of cluster id to interpretation dict; only ``label``
            is read.

    Returns:
        One row per finding, or a single "All Clear" row when nothing is
        flagged.
    """
    import itertools

    noise = frozenset({
        "the", "and", "for", "with", "using", "based", "from", "that", "are", "this",
        "in", "of", "a", "to", "an", "on", "at", "by", "or", "as", "is", "its", "via",
        "systems", "digital", "information", "management", "driven"})
    vague_singles = frozenset({
        "systems", "digital", "data", "information", "analysis", "research",
        "study", "approach", "framework", "model", "methods", "technology"})

    def _words(label: str) -> set:
        return set(re.findall(r"\b[a-z]{4,}\b", label.lower()))

    cids = sorted(interps)
    labels_map = {cid: interps[cid]["label"] for cid in cids}
    raw_words = {cid: _words(labels_map[cid]) for cid in cids}
    sig_words = {cid: raw_words[cid] - noise for cid in cids}

    rows = []
    # combinations() already yields each unordered pair exactly once.
    for cid_a, cid_b in itertools.combinations(cids, 2):
        overlap = sig_words[cid_a] & sig_words[cid_b]
        if len(overlap) < 2:
            continue
        rows.append({
            "Issue": "⚠ Label Overlap",
            "Cluster A": cid_a,
            "Label A": labels_map[cid_a],
            "Cluster B": cid_b,
            "Label B": labels_map[cid_b],
            "Shared Words": ", ".join(sorted(overlap)),
            "Severity": "HIGH — consider merging" if len(overlap) >= 3
                        else "MEDIUM — review distinctiveness",
            "Action": "Check if these two clusters cover the same research theme. "
                      "If yes, increase min_cluster_size to force a merge.",
        })

    for cid in cids:
        if sig_words[cid] - vague_singles:
            continue  # at least one specific word
        # List the generic words from the raw label: some of them are also
        # noise words and would otherwise never be shown. Sorted so the cell
        # text is deterministic.
        vague = sorted(raw_words[cid] & vague_singles)
        rows.append({
            "Issue": "❌ Too Vague",
            "Cluster A": cid,
            "Label A": labels_map[cid],
            "Cluster B": "—",
            "Label B": "—",
            "Shared Words": ", ".join(vague) or "—",
            "Severity": "HIGH — label is not human interpretable",
            "Action": "Run optimization pass to refine the label, "
                      "or manually inspect keyphrases for more specific terms.",
        })

    if not rows:
        rows.append({
            "Issue": "✅ All Clear",
            "Cluster A": "—", "Label A": "All labels are distinct and specific",
            "Cluster B": "—", "Label B": "—",
            "Shared Words": "—", "Severity": "NONE",
            "Action": "Topic list is human interpretable and non-overlapping.",
        })
    return pd.DataFrame(rows)
# ── Pipeline runner ──────────────────────────────────────────────────────────
def _build_summary_md(disc, met, td, ar, n_optimize, n_refined):
    """Render the headline Markdown summary of a finished pipeline run."""
    def _s(ok):
        return "✅ PASS" if ok else "❌ FAIL"

    return (
        f"## Pipeline Complete — {disc['n_clusters']} clusters discovered\n\n"
        f"| Criterion | Value | Status |\n|---|---|---|\n"
        f"| Max cluster mass | {round(disc['max_mass_pct']*100,1)}% | {_s(disc['max_mass_ok'])} |\n"
        f"| Min cluster size | {disc['min_size']} | {_s(disc['min_size_ok'])} |\n"
        f"| Persistence (mean) | {round(met['persistence'],4)} | — |\n"
        f"| DBCV | {round(met['dbcv'],4)} | — |\n"
        f"| Stability (3 seeds) | {round(met['stability'],4)} | — |\n\n"
        f"**Trials:** {td['n_trials_run']} (best #{td['best_trial']}) · "
        f"**Agreement:** Triple {ar.get('triple',0)}% · Two+ {ar.get('two_or_more',0)}% · "
        f"**Optimization passes:** {n_optimize} · **Labels refined:** {n_refined}"
    )


def _build_umap_fig(td):
    """Scatter the 2-D UMAP coordinates, coloured by cluster label."""
    u2d = np.array(td["umap_2d"])
    sdf = pd.DataFrame({"UMAP-1": u2d[:, 0], "UMAP-2": u2d[:, 1],
                        "Cluster": [str(l) for l in td["labels"]],
                        "Doc": [d[:60] for d in td["documents"]]})
    fig = px.scatter(sdf, x="UMAP-1", y="UMAP-2", color="Cluster",
                     hover_data=["Doc"], opacity=0.75,
                     title="2-D UMAP visualisation of SPECTER-2 embeddings")
    fig.update_layout(template="plotly_dark", height=500,
                      paper_bgcolor="#0d1117", plot_bgcolor="#161b22", font=dict(size=11))
    return fig


def _build_pareto_fig(tl):
    """Plot persistence against max cluster mass for every trial in the log."""
    pfig = go.Figure()
    needed = {"discipline_pass", "max_mass_pct", "persistence", "trial"}
    # Only plot when the trial log carries every column the traces read.
    if not tl.empty and needed <= set(tl.columns):
        for passed, color, name in [(True, "#3dba7a", "PASS"), (False, "#e04d4d", "FAIL")]:
            sub = tl[tl["discipline_pass"] == passed]
            if not sub.empty:
                pfig.add_trace(go.Scatter(
                    x=sub["max_mass_pct"], y=sub["persistence"],
                    mode="markers", marker=dict(size=8, color=color), name=name,
                    text=sub["trial"],
                    hovertemplate="Trial %{text}<br>Mass: %{x:.0%}<br>Pers: %{y:.3f}"))
        pfig.add_vline(x=0.25, line_dash="dash", line_color="#5a6480",
                       annotation_text="25% rule")
    pfig.update_layout(template="plotly_dark", height=400,
                       paper_bgcolor="#0d1117", plot_bgcolor="#161b22",
                       title="Pareto front — Persistence vs Max cluster mass",
                       xaxis_title="Max cluster mass", yaxis_title="Persistence",
                       font=dict(size=11))
    return pfig


def _build_clusters_df(interps):
    """One row per cluster: label, agreement, member counts, persistence, keyphrases."""
    rows = []
    for cid in sorted(interps):
        v = interps[cid]
        rows.append({"Cluster": cid, "Label": v["label"], "Agreement": v["agreement"],
                     "Strong": v["strong"], "Weak": v["weak"],
                     "Persistence": round(v.get("persistence", 0), 4),
                     "Keyphrases": ", ".join(v.get("keyphrases", []))})
    return pd.DataFrame(rows)


def _run(corpus_file, method_file, gk, mk, gek, n_trials, n_optimize,
         progress=gr.Progress(track_tqdm=True)):
    """Run the full pipeline and build every UI output.

    Args:
        corpus_file: Uploaded corpus CSV handle (``.name`` is the path). Required.
        method_file: Optional uploaded methodology CSV handle.
        gk, mk, gek: Groq / Mistral / Gemini API keys; a blank value falls
            back to ``GROQ_API_KEY`` / ``MISTRAL_API_KEY`` / ``GEMINI_API_KEY``.
        n_trials: Number of trials, passed to ``run_pipeline`` as int.
        n_optimize: Number of optimisation passes, passed to ``run_pipeline`` as int.
        progress: Gradio progress tracker.

    Returns:
        A 32-element tuple whose order must match the output components wired
        to this callback: the original outputs first, then the
        methodology-CSV outputs, then the cluster-size, reproducibility and
        interpretability outputs.

    Raises:
        gr.Error: No corpus file, a missing API key, or an ``error`` entry in
            the pipeline result.
    """
    if not corpus_file:
        raise gr.Error("Upload a Scopus corpus CSV first.")
    # `or ""` guards against a None textbox value before .strip().
    gk = (gk or "").strip() or os.getenv("GROQ_API_KEY", "")
    mk = (mk or "").strip() or os.getenv("MISTRAL_API_KEY", "")
    gek = (gek or "").strip() or os.getenv("GEMINI_API_KEY", "")
    if not all([gk, mk, gek]):
        raise gr.Error("All 3 API keys required.")
    method_path = method_file.name if method_file else None
    # Normalise once so a float slider value cannot show up as "2.0" in the summary.
    n_trials, n_optimize = int(n_trials), int(n_optimize)

    progress(0.05, desc="📥 Loading CSV…")
    progress(0.10, desc="🔬 Embedding corpus with SPECTER-2…")
    r = run_pipeline(corpus_file.name, gk, mk, gek, n_trials, n_optimize, method_path)
    if r.get("error"):
        raise gr.Error(r["error"])

    progress(0.85, desc="📊 Building outputs…")
    td, interps = r["topic_data"], r.get("interpretations", {})
    disc, met = td["discipline"], td["metrics"]
    ar = r.get("agreement_rates", {})
    rl = r.get("refinement_log", [])

    summary = _build_summary_md(disc, met, td, ar, n_optimize, len(rl))
    fig = _build_umap_fig(td)

    # Trial log + Pareto
    tl = pd.DataFrame(td["trial_log"])
    tl_cols = [c for c in ["trial", "discipline_pass", "n_clusters", "persistence",
                           "dbcv", "max_mass_pct", "min_size", "n_noise"] if c in tl.columns]
    tl_show = tl[tl_cols] if not tl.empty else pd.DataFrame()
    pfig = _build_pareto_fig(tl)

    cdf = _build_clusters_df(interps)

    sheets = r.get("sheets", {})
    s1, s2, s3, s4 = (pd.DataFrame(sheets.get(i, [])) for i in (1, 2, 3, 4))
    sp = r.get("sheet_paths", {})
    mdf = pd.DataFrame(r.get("mismatch_table", []))

    md_data = r.get("methodology_data", {})
    top_papers_df = _top_papers_df(r.get("top_papers", {}))
    method_sum_df = _methodology_summary_df(md_data, interps)
    method_chart = _methodology_bar_chart(md_data, interps)
    extraction_df = _extraction_pipeline_df(md_data, interps)
    per_llm_meth_df = _per_llm_methodology_df(md_data, interps)
    regex_hits_df = _regex_hits_df(md_data, interps)
    pattern_info = _regex_pattern_info()
    refine_df = _refinement_df(rl)

    # Methodology-CSV outputs
    comp_sheets = r.get("comp_technique_sheets", {1: [], 2: [], 3: [], 4: []})
    jct = r.get("journal_crosstab", {})
    tech_opt_log = r.get("technique_opt_log", [])
    tech_s1, tech_s2, tech_s3, tech_s4 = (
        _tech_sheet_df(comp_sheets.get(i, [])) for i in (1, 2, 3, 4))
    tech_llm_chart = _tech_llm_pct_chart(comp_sheets)
    jct_chart = _journal_crosstab_chart(jct)
    jct_df = _journal_crosstab_df(jct)
    per_llm_freq_df = _per_llm_freq_df(jct)
    tech_opt_df = _tech_opt_df(tech_opt_log)

    # Cluster sizes, reproducibility, interpretability
    cluster_sizes_fig = _cluster_sizes_chart(interps, disc)
    repro_chart = _reproducibility_chart(td, interps)
    repro_df = _reproducibility_df(td, interps)
    interpretability_df = _interpretability_df(interps)

    progress(1.0, desc="✅ Done!")
    dl_files = [f for f in [sp.get(1), sp.get(2), sp.get(3), sp.get(4), r.get("json_path")] if f]
    return (
        # original outputs (order preserved)
        summary, fig, pfig, tl_show, cdf,
        top_papers_df,
        method_chart, method_sum_df, extraction_df, per_llm_meth_df,
        regex_hits_df, pattern_info,
        refine_df,
        s1, s2, s3, s4,
        dl_files if dl_files else None,
        mdf,
        # methodology-CSV outputs
        tech_llm_chart,
        tech_s1, tech_s2, tech_s3, tech_s4,
        per_llm_freq_df,
        jct_chart,
        jct_df,
        tech_opt_df,
        # cluster sizes, reproducibility, interpretability
        cluster_sizes_fig,
        repro_chart,
        repro_df,
        interpretability_df,
    )
# ── UI ────────────────────────────────────────────────────────────────────────
# Page-level CSS overrides: container background/text colour, and hide the footer.
css = "".join((
    ".gradio-container{background:#0d1117!important;color:#c9d1d9!important}",
    "footer{display:none!important}",
))
with gr.Blocks(theme=gr.themes.Base(primary_hue="blue", neutral_hue="slate"),
css=css, title="SPECTER-2 Topic Analyzer") as demo:
gr.Markdown("# πŸ“ SPECTER-2 Topic Analyzer")
with gr.Row():
# ── Left sidebar ─────────────────────────────────────────────────────
with gr.Column(scale=1):
gr.Markdown("### πŸ“„ Corpus CSV")
file_in = gr.File(label="Upload Scopus CSV (title + abstract)",
file_types=[".csv"])
preview_out = gr.Markdown("Upload a CSV to see stats.")
gr.Markdown("### πŸ”¬ Methodology CSV *(optional)*")
method_file_in = gr.File(label="Upload Methodology CSV (title, doi, methodology)",
file_types=[".csv"])
method_preview = gr.Markdown("Upload methodology CSV to enable technique analysis.")
gr.Markdown("### πŸ”‘ API Keys")
groq_in = gr.Textbox(label="Groq API Key", type="password",
placeholder="or set GROQ_API_KEY env var")
mistral_in = gr.Textbox(label="Mistral API Key", type="password",
placeholder="or set MISTRAL_API_KEY env var")
gemini_in = gr.Textbox(label="Gemini API Key", type="password",
placeholder="or set GEMINI_API_KEY env var")
gr.Markdown("### βš™ Parameters")
trials_in = gr.Slider(10, 100, 50, step=5, label="Optuna Trials")
optimize_in = gr.Slider(1, 5, 1, step=1,
label="πŸ” Optimization Passes",
info="Pass 1 = no refinement. 2–5 = LLM critic audits topic labels "
"AND technique labels for hallucinations + improvements.")
run_btn = gr.Button("β–Ά Run Full Pipeline", variant="primary", size="lg")
# ── Main panel ────────────────────────────────────────────────────────
with gr.Column(scale=3):
with gr.Tabs():
# ── original tabs (order / content unchanged) ─────────────────
with gr.Tab("Summary"):
summary_out = gr.Markdown()
with gr.Tab("2-D UMAP"):
scatter_out = gr.Plot()
with gr.Tab("Pareto Front"):
pareto_out = gr.Plot()
with gr.Tab("Trial Log"):
trial_out = gr.Dataframe()
with gr.Tab("Clusters"):
cluster_out = gr.Dataframe()
with gr.Tab("πŸ—ž Top 3 Papers"):
gr.Markdown("### Top 3 Representative Papers per Cluster\n"
"Ranked by cosine similarity to cluster centroid "
"in SPECTER-2 embedding space.")
top_papers_out = gr.Dataframe(
headers=["Cluster","Label","Rank","Title","Abstract Snippet"],
wrap=True)
with gr.Tab("πŸ”¬ Cluster Methodology"):
gr.Markdown("### Cluster-Level Methodology β€” 3-LLM Council\n"
"Derived from representative abstracts per cluster. "
"β‰₯2-LLM gate applied.")
method_chart_out = gr.Plot()
method_summary_out = gr.Dataframe(wrap=True)
with gr.Tab("βš™ Cluster Extraction Pipeline"):
gr.Markdown("### Full Regex + LLM Extraction Trace (per cluster)")
extraction_out = gr.Dataframe(wrap=True)
with gr.Tab("πŸ€– Cluster Per-LLM Votes"):
gr.Markdown("### Raw Per-LLM Methodology Votes (per cluster)")
per_llm_out = gr.Dataframe(wrap=True)
with gr.Tab("πŸ” Cluster Regex Hits"):
gr.Markdown("### Regex Pattern Matches (per cluster)\n"
"Every match with exact character span and paper number.")
regex_hits_out = gr.Dataframe(wrap=True)
regex_info_out = gr.Markdown()
with gr.Tab("πŸ” Refinement Log"):
gr.Markdown("### Topic Label Optimization Log\n"
"Changes made by LLM critic per optimization pass.")
refine_out = gr.Dataframe(wrap=True)
with gr.Tab("Sheet 1 β€” Groq"): s1_out = gr.Dataframe()
with gr.Tab("Sheet 2 β€” Mistral"): s2_out = gr.Dataframe()
with gr.Tab("Sheet 3 β€” Gemini"): s3_out = gr.Dataframe()
with gr.Tab("Sheet 4 β€” Consolidated"): s4_out = gr.Dataframe()
with gr.Tab("RQ Mismatch"): mismatch_out = gr.Dataframe()
with gr.Tab("Downloads"):
dl_out = gr.File(label="All sheet CSVs + topics.json",
file_count="multiple")
# ── NEW tabs: methodology CSV pipeline ────────────────────────
with gr.Tab("πŸ’» Comp. Techniques β€” LLM % Chart"):
gr.Markdown("### Computational Technique Frequency β€” Methodology CSV\n"
"For each technique, shows the % of papers it was extracted "
"from by each of the 3 LLMs independently + the consolidated "
"result (β‰₯2-LLM gate). Bars grouped by technique.")
tech_llm_chart_out = gr.Plot()
with gr.Tab("πŸ’» Tech Sheet 1 β€” Groq"):
gr.Markdown("### Groq raw technique extraction β€” one row per paper")
tech_s1_out = gr.Dataframe(wrap=True)
with gr.Tab("πŸ’» Tech Sheet 2 β€” Mistral"):
gr.Markdown("### Mistral raw technique extraction β€” one row per paper")
tech_s2_out = gr.Dataframe(wrap=True)
with gr.Tab("πŸ’» Tech Sheet 3 β€” Gemini"):
gr.Markdown("### Gemini raw technique extraction β€” one row per paper")
tech_s3_out = gr.Dataframe(wrap=True)
# Tab: consolidated techniques (agreement of at least two LLMs), one row per
# paper. `tech_s4_out` is listed in the run_btn.click outputs.
# Mis-encoded laptop emoji, em dashes and ">=" sign are restored.
with gr.Tab("💻 Tech Sheet 4 — Consolidated"):
    gr.Markdown(
        "### Consolidated techniques — ≥2-LLM agreement, one row per paper"
    )
    tech_s4_out = gr.Dataframe(wrap=True)
# Tab: table of per-LLM technique frequencies.
# `per_llm_freq_out` is listed in the run_btn.click outputs.
# Mis-encoded bar-chart emoji and right arrow are restored.
with gr.Tab("📊 Tech Frequency by LLM"):
    gr.Markdown(
        "### Per-LLM Technique Frequency Table\n"
        "% of all papers where each LLM extracted each technique. "
        "High variance = LLMs disagree → optimization flag."
    )
    per_llm_freq_out = gr.Dataframe(wrap=True)
# Tab: technique-by-journal cross-tabulation, shown as a chart and as a table.
# `jct_chart_out` and `jct_df_out` are both listed in the run_btn.click
# outputs.
# Mis-encoded card-index emoji and multiplication sign are restored.
with gr.Tab("🗂 Journal Cross-Tabulation"):
    gr.Markdown(
        "### Technique × Journal Cross-Tabulation\n"
        "Rows = journals auto-detected from DOI/title. "
        "Columns = consolidated techniques. "
        "Values = % of papers in that journal using the technique.\n\n"
        "**Journals detected:** MISQ, JAIS, ISR, JMIS, PAJAIS, "
        "ECIS, ICIS, Other."
    )
    jct_chart_out = gr.Plot()
    jct_df_out = gr.Dataframe(wrap=True)
# Tab: technique-label improvement suggestions from the critic pass.
# `tech_opt_out` is listed in the run_btn.click outputs.
# Mis-encoded wrench emoji and ">=" sign are restored.
with gr.Tab("🔧 Technique Optimization"):
    gr.Markdown(
        "### Technique Label Improvement Suggestions\n"
        "Groq critic flags: hallucination, high inter-LLM variance "
        "(>15% gap), split/merge recommendations.\n"
        "Only runs when Optimization Passes ≥ 2."
    )
    tech_opt_out = gr.Dataframe(wrap=True)
# ── Supervisor-requested additions ────────────────────────────
# Tab: bar chart of papers per cluster with the colour legend explained in the
# markdown. `cluster_sizes_out` is listed in the run_btn.click outputs.
# Mis-encoded characters (bar-chart emoji, "<=" / ">=" signs, em dashes) are
# restored so the legend reads correctly.
with gr.Tab("📊 Cluster Sizes"):
    gr.Markdown(
        "### Cluster Sizes (Papers per Cluster)\n"
        "Exact chart your supervisor highlighted. "
        "**Green** = passes both discipline rules (mass ≤ 25%, size ≥ 5). "
        "**Yellow** = cluster exceeds 25% mass cap — dominant cluster warning. "
        "**Red** = cluster has fewer than 5 papers — too small.\n\n"
        "The orange dashed line marks the 25% cap. Any bar above it "
        "will fail the discipline check and the pipeline will re-optimise."
    )
    cluster_sizes_out = gr.Plot()
# Tab: reproducibility evidence, shown as a chart and as a table.
# `repro_chart_out` and `repro_df_out` are both listed in the run_btn.click
# outputs.
# Mis-encoded characters (cycle emoji, em dashes, ">=" signs, arrows, middle
# dots and the green/yellow/red circle emoji) are restored.
with gr.Tab("🔄 Reproducibility"):
    gr.Markdown(
        "### Reproducibility — 'Run Again and Again, Topic List is the Same'\n\n"
        "Your supervisor wants proof that running the pipeline multiple times "
        "produces the **same clusters**. This tab shows two measures:\n\n"
        "**Overall ARI Stability** (top row) — Adjusted Rand Index averaged "
        "across 3 random seeds. ARI = 1.0 means identical clusters every run. "
        "ARI ≥ 0.8 is considered stable for publication.\n\n"
        "**Cluster Persistence** (per row) — how strongly each cluster's "
        "structure is preserved in the condensed HDBSCAN tree. "
        "High persistence → cluster survives parameter variation → "
        "same label will appear on re-run. "
        "Low persistence → cluster may split or merge → label may change.\n\n"
        "🟢 ≥ 0.7 Stable · 🟡 0.4–0.7 Borderline · 🔴 < 0.4 Fragile"
    )
    repro_chart_out = gr.Plot()
    repro_df_out = gr.Dataframe(wrap=True)
# Tab: automatic label-quality flags (overlapping or vague cluster labels).
# `interpretability_out` is listed in the run_btn.click outputs.
# Mis-encoded em dashes and ">=" sign in the explanatory text are restored.
with gr.Tab("🧠 Interpretability Check"):
    gr.Markdown(
        "### Human Interpretability Check — 'Topic List Must Be Distinct'\n\n"
        "Your supervisor flagged that labels like "
        "*'Cybersecurity and Privacy'* and *'Cyber-Risk Management and Online Security'* "
        "look like the same topic. This tab automatically detects:\n\n"
        "**⚠ Label Overlap** — pairs of cluster labels sharing ≥ 2 significant "
        "words (noise words like 'and', 'for', 'in' excluded). "
        "Overlapping labels suggest the two clusters may cover the same theme "
        "and should be reviewed for merging.\n\n"
        "**❌ Too Vague** — labels where all meaningful words are generic "
        "('systems', 'digital', 'data') with no domain-specific content. "
        "These need the optimization pass to refine them.\n\n"
        "**Action column** tells you exactly what to do for each flag."
    )
    interpretability_out = gr.Dataframe(wrap=True)
# ── Wire callbacks ────────────────────────────────────────────────────────
# Re-render each CSV preview as soon as its upload widget changes.
file_in.change(fn=_preview, inputs=[file_in], outputs=[preview_out])
method_file_in.change(
    fn=_preview_methodology,
    inputs=[method_file_in],
    outputs=[method_preview],
)

# Run the full pipeline on button click.
# NOTE(review): Gradio maps the callback's return values onto `outputs` by
# position, so this list presumably mirrors the order `_run` returns them --
# verify against `_run` before reordering anything here.
run_btn.click(
    fn=_run,
    inputs=[
        file_in,
        method_file_in,
        groq_in,
        mistral_in,
        gemini_in,
        trials_in,
        optimize_in,
    ],
    outputs=[
        # original
        summary_out,
        scatter_out,
        pareto_out,
        trial_out,
        cluster_out,
        top_papers_out,
        method_chart_out,
        method_summary_out,
        extraction_out,
        per_llm_out,
        regex_hits_out,
        regex_info_out,
        refine_out,
        s1_out,
        s2_out,
        s3_out,
        s4_out,
        dl_out,
        mismatch_out,
        # new
        tech_llm_chart_out,
        tech_s1_out,
        tech_s2_out,
        tech_s3_out,
        tech_s4_out,
        per_llm_freq_out,
        jct_chart_out,
        jct_df_out,
        tech_opt_out,
        # supervisor additions
        cluster_sizes_out,
        repro_chart_out,
        repro_df_out,
        interpretability_out,
    ],
)
if __name__ == "__main__":
    # Only start the server when executed as a script: listen on all
    # interfaces (0.0.0.0) at port 7860.
    launch_kwargs = dict(server_name="0.0.0.0", server_port=7860)
    demo.launch(**launch_kwargs)