BERTopic_AG_final

Running

App Files Files Community

BERTopic_AG_final / app.py

BHAVIKBANKER

Update app.py

e161996 verified 3 days ago

raw

history blame contribute delete

47.4 kB

	"""
	app.py — Gradio UI entry point.
	ORIGINAL structure and all tabs preserved.
	NEW: second file upload for methodology CSV, technique sheets 1-4,
	journal cross-tabulation chart + table, technique optimisation log.
	"""
	import os, json
	import re
	import pandas as pd, numpy as np
	import gradio as gr
	import plotly.express as px
	import plotly.graph_objects as go
	from agent import run_pipeline, METHODOLOGY_PATTERNS, TECHNIQUE_PATTERNS

	# ── CSV preview ──────────────────────────────────────────────────────────────
	def _preview(file):
	    """Return a Markdown summary of the uploaded corpus CSV.

	    Args:
	        file: Upload handle from the file widget. Either an object exposing
	            the path as ``.name`` or a plain path string is accepted. A
	            falsy value means nothing has been uploaded yet.

	    Returns:
	        Markdown text: a prompt when nothing is uploaded, an error note when
	        the CSV cannot be read, otherwise a table with column presence and
	        blank counts followed by the number of usable papers.
	    """
	    if not file:
	        return "Upload a Scopus CSV to begin."

	    # Accept both upload styles: an object carrying ``.name`` or a bare path.
	    path = getattr(file, "name", file)
	    try:
	        df = pd.read_csv(path)
	    except (pd.errors.EmptyDataError, pd.errors.ParserError,
	            UnicodeDecodeError, OSError) as exc:
	        return f"## ❌ Could not read CSV\n\n`{exc}`"

	    df.columns = df.columns.str.lower()
	    has_t = "title" in df.columns
	    has_a = "abstract" in df.columns
	    n = len(df)
	    # A missing column is reported as "every row blank".
	    blanks_t = int(df["title"].isna().sum()) if has_t else n
	    blanks_a = int(df["abstract"].isna().sum()) if has_a else n

	    # A paper is usable only when BOTH fields are present on the same row.
	    # Subtracting max(blanks) over-counts when the blanks sit on different rows.
	    if has_t and has_a:
	        usable = int(df[["title", "abstract"]].notna().all(axis=1).sum())
	    else:
	        usable = 0

	    ok = "✅" if has_t and has_a and blanks_t < n and blanks_a < n else "❌"
	    return (
	        f"## {ok} CSV loaded — {n} entries\n\n"
	        f"| Column | Present | Blank rows |\n|---|---|---|\n"
	        f"| title | {'✅' if has_t else '❌'} | {blanks_t} |\n"
	        f"| abstract | {'✅' if has_a else '❌'} | {blanks_a} |\n\n"
	        f"Usable papers: {usable} / {n}"
	    )


	def _preview_methodology(file):
	    """Return a Markdown summary of the uploaded methodology CSV.

	    Args:
	        file: Upload handle from the file widget. Either an object exposing
	            the path as ``.name`` or a plain path string is accepted. A
	            falsy value means nothing has been uploaded yet.

	    Returns:
	        Markdown text: a prompt when nothing is uploaded, an error note when
	        the CSV cannot be read, otherwise a column-presence table. ``title``
	        and ``methodology`` are required for the ✅ status; ``doi`` is shown
	        as optional.
	    """
	    if not file:
	        return "Upload methodology CSV (title, doi, methodology) to enable technique analysis."

	    # Accept both upload styles: an object carrying ``.name`` or a bare path.
	    path = getattr(file, "name", file)
	    try:
	        df = pd.read_csv(path)
	    except (pd.errors.EmptyDataError, pd.errors.ParserError,
	            UnicodeDecodeError, OSError) as exc:
	        return f"## ❌ Could not read methodology CSV\n\n`{exc}`"

	    df.columns = df.columns.str.lower()
	    has_t = "title" in df.columns
	    has_m = "methodology" in df.columns
	    has_d = "doi" in df.columns
	    n = len(df)
	    ok = "✅" if has_t and has_m else "❌"
	    return (
	        f"## {ok} Methodology CSV — {n} papers\n\n"
	        f"| Column | Present |\n|---|---|\n"
	        f"| title | {'✅' if has_t else '❌'} |\n"
	        f"| doi | {'✅' if has_d else '⚠ optional'} |\n"
	        f"| methodology | {'✅' if has_m else '❌'} |\n\n"
	        f"Journals will be auto-detected from DOI + title."
	    )


	# ── Original helper builders ─────────────────────────────────────────────────
	def _top_papers_df(top_papers: dict) -> pd.DataFrame:
	    """Flatten the per-cluster paper lists into one table.

	    Args:
	        top_papers: Mapping of cluster id to a list of paper dicts, each
	            providing ``cluster_label``, ``rank``, ``title`` and
	            ``abstract_snippet``.

	    Returns:
	        One row per paper, clusters visited in ascending id order.
	    """
	    records = [
	        {
	            "Cluster": cluster_id,
	            "Label": paper["cluster_label"],
	            "Rank": paper["rank"],
	            "Title": paper["title"],
	            "Abstract Snippet": paper["abstract_snippet"],
	        }
	        for cluster_id in sorted(top_papers)
	        for paper in top_papers[cluster_id]
	    ]
	    return pd.DataFrame(records)


	def _methodology_summary_df(methodology_data: dict, interps: dict) -> pd.DataFrame:
	    """Build the one-row-per-cluster methodology summary table.

	    Args:
	        methodology_data: Mapping of cluster id to that cluster's
	            methodology dict (dominant method/technique, orientation
	            percentages, accepted item lists, regex consensus lists).
	        interps: Mapping of cluster id to a dict that may carry ``label``.

	    Returns:
	        DataFrame with one row per cluster in ascending id order. Missing
	        values fall back to ``"—"`` or ``0`` as in the column definitions
	        below.
	    """
	    def _describe(items):
	        # "name (pct%, agreement)" for every accepted item, comma separated.
	        return ", ".join(
	            f"{entry['name']} ({entry['pct']}%, {entry['agreement']})"
	            for entry in items
	        )

	    def _joined_or_dash(values):
	        return ", ".join(values) or "—"

	    records = []
	    for cluster_id in sorted(methodology_data):
	        info = methodology_data[cluster_id]
	        cluster_label = interps.get(cluster_id, {}).get("label", f"Cluster {cluster_id}")
	        records.append({
	            "Cluster": cluster_id,
	            "Label": cluster_label,
	            "Dominant Method": info.get("dominant_method", "—"),
	            "Dominant Technique": info.get("dominant_technique", "—"),
	            "Empirical %": info.get("empirical_pct", 0),
	            "Theoretical %": info.get("theoretical_pct", 0),
	            "Mixed %": info.get("mixed_pct", 0),
	            "Methods (≥2 LLMs)": _describe(info.get("methodologies", [])),
	            "Techniques (≥2 LLMs)": _describe(info.get("techniques", [])),
	            "Regex Confirmed": _joined_or_dash(info.get("regex_confirmed_consensus", [])),
	            "Regex Rejected": _joined_or_dash(info.get("regex_rejected_consensus", [])),
	        })
	    return pd.DataFrame(records)


	def _extraction_pipeline_df(methodology_data: dict, interps: dict) -> pd.DataFrame:
	    """Build the per-item regex + LLM extraction trace table.

	    For each cluster (ascending id) rows are emitted in this order:
	    accepted methods, accepted techniques, rejected methods, rejected
	    techniques.

	    Args:
	        methodology_data: Mapping of cluster id to a dict with optional
	            ``methodologies``, ``techniques``, ``rejected_methods``,
	            ``rejected_techniques`` item lists and a ``regex_scan`` dict.
	        interps: Mapping of cluster id to a dict that may carry ``label``.

	    Returns:
	        One row per item; an empty DataFrame when there are no items.
	    """
	    accepted = "✅ ACCEPTED"
	    rejected = "❌ REJECTED (single LLM)"

	    def _trace_row(cid, label, scan, item, kind, verdict):
	        # Shared row builder for accepted and rejected items.
	        name = item["name"]
	        regex_hits = (scan.get("methods", {}).get(name, [])
	                      or scan.get("techniques", {}).get(name, []))
	        # dict.fromkeys de-duplicates matched snippets while keeping order.
	        matched = (", ".join(dict.fromkeys(h["match"] for h in regex_hits))[:80]
	                   if regex_hits else "—")
	        return {
	            "Cluster": cid, "Label": label, "Item": name,
	            "Type": kind,
	            "Regex Match": matched, "Regex Fired": "✅" if regex_hits else "❌",
	            "LLM Votes": item["llm_votes"], "Agreement": item["agreement"],
	            "Avg Pct (%)": item["pct"], "Evidence": item.get("evidence", "—"),
	            "Gate Passed": verdict,
	        }

	    rows = []
	    for cid in sorted(methodology_data):
	        md = methodology_data[cid]
	        label = interps.get(cid, {}).get("label", f"Cluster {cid}")
	        scan = md.get("regex_scan", {})
	        # The type comes from the list an item belongs to, not from dict
	        # equality: a technique equal to a method dict stays a "Technique".
	        groups = (
	            (md.get("methodologies", []), "Method", accepted),
	            (md.get("techniques", []), "Technique", accepted),
	            (md.get("rejected_methods", []), "Method", rejected),
	            (md.get("rejected_techniques", []), "Technique", rejected),
	        )
	        for items, kind, verdict in groups:
	            rows.extend(_trace_row(cid, label, scan, item, kind, verdict)
	                        for item in items)
	    return pd.DataFrame(rows) if rows else pd.DataFrame()


	def _per_llm_methodology_df(methodology_data: dict, interps: dict) -> pd.DataFrame:
	    """Tabulate the raw methodology votes of each LLM per cluster.

	    Args:
	        methodology_data: Mapping of cluster id to a dict whose optional
	            ``llm_raw`` entry maps ``groq`` / ``mistral`` / ``gemini`` to
	            that model's answer (``methodologies``, ``techniques`` and the
	            three ``*_pct`` orientation values).
	        interps: Mapping of cluster id to a dict that may carry ``label``.

	    Returns:
	        One row per cluster (ascending id) with, per model, its methods,
	        its techniques and an ``empirical/theoretical/mixed`` triple.
	    """
	    providers = (("Groq", "groq"), ("Mistral", "mistral"), ("Gemini", "gemini"))
	    item_kinds = (("Methods", "methodologies"), ("Techniques", "techniques"))

	    def _fmt(answer, key):
	        # Plain "|" separator: the escaped form "\|" is an invalid string
	        # escape and would put a literal backslash into every cell.
	        return " | ".join(
	            f"{i['name']} ({i.get('pct', 0)}%)" for i in answer.get(key, [])
	        ) or "—"

	    rows = []
	    for cid in sorted(methodology_data):
	        raw = methodology_data[cid].get("llm_raw", {})
	        row = {
	            "Cluster": cid,
	            "Label": interps.get(cid, {}).get("label", f"Cluster {cid}"),
	        }
	        # Column order: all "Methods" columns, then "Techniques", then E/T/M.
	        for heading, key in item_kinds:
	            for title, llm in providers:
	                row[f"{title} {heading}"] = _fmt(raw.get(llm, {}), key)
	        for title, llm in providers:
	            answer = raw.get(llm, {})
	            row[f"{title} E/T/M"] = (
	                f"{answer.get('empirical_pct', 0)}/"
	                f"{answer.get('theoretical_pct', 0)}/"
	                f"{answer.get('mixed_pct', 0)}"
	            )
	        rows.append(row)
	    return pd.DataFrame(rows)


	def _regex_hits_df(methodology_data: dict, interps: dict) -> pd.DataFrame:
	    """List every recorded regex hit, one row per match.

	    Args:
	        methodology_data: Mapping of cluster id to a dict whose optional
	            ``regex_scan`` holds ``methods`` and ``techniques`` banks, each
	            mapping a pattern category to a list of hit dicts with
	            ``match``, ``doc`` and ``span``.
	        interps: Mapping of cluster id to a dict that may carry ``label``.

	    Returns:
	        Rows per cluster (ascending id), methodology-bank hits before
	        technique-bank hits; an empty DataFrame when nothing matched.
	    """
	    banks = (("methods", "Methodology"), ("techniques", "Technique"))
	    records = []
	    for cluster_id in sorted(methodology_data):
	        scan = methodology_data[cluster_id].get("regex_scan", {})
	        cluster_label = interps.get(cluster_id, {}).get("label", f"Cluster {cluster_id}")
	        for scan_key, bank_name in banks:
	            for category, hits in scan.get(scan_key, {}).items():
	                for hit in hits:
	                    start, end = hit["span"][0], hit["span"][1]
	                    records.append({
	                        "Cluster": cluster_id,
	                        "Label": cluster_label,
	                        "Bank": bank_name,
	                        "Pattern Category": category,
	                        "Matched Text": hit["match"],
	                        "Paper #": hit["doc"],
	                        "Char Span": f"{start}–{end}",
	                    })
	    if not records:
	        return pd.DataFrame()
	    return pd.DataFrame(records)


	def _methodology_bar_chart(methodology_data: dict, interps: dict) -> go.Figure:
	    """Draw the stacked empirical / theoretical / mixed bar chart.

	    Args:
	        methodology_data: Mapping of cluster id to a dict that may carry
	            ``empirical_pct``, ``theoretical_pct`` and ``mixed_pct``
	            (each defaults to 0).
	        interps: Mapping of cluster id to a dict that may carry ``label``;
	            the label is cut to 30 characters for the x axis.

	    Returns:
	        A figure with one stacked bar per cluster, in ascending id order.
	    """
	    cluster_ids = sorted(methodology_data)
	    tick_labels = [
	        interps.get(cid, {}).get("label", f"C{cid}")[:30] for cid in cluster_ids
	    ]
	    # (legend name, source key, bar colour) for each stacked segment.
	    segments = (
	        ("Empirical %", "empirical_pct", "#3dba7a"),
	        ("Theoretical %", "theoretical_pct", "#5b9cf6"),
	        ("Mixed %", "mixed_pct", "#f5a623"),
	    )

	    chart = go.Figure()
	    for legend_name, key, colour in segments:
	        heights = [methodology_data[cid].get(key, 0) for cid in cluster_ids]
	        chart.add_trace(go.Bar(name=legend_name, x=tick_labels, y=heights,
	                               marker_color=colour))
	    chart.update_layout(
	        barmode="stack", template="plotly_dark", height=420,
	        paper_bgcolor="#0d1117", plot_bgcolor="#161b22",
	        title="Research Orientation per Cluster — Averaged across Groq + Mistral + Gemini",
	        xaxis_title="Cluster", yaxis_title="Percentage (%)",
	        font=dict(size=11), legend=dict(orientation="h", y=1.12),
	        xaxis_tickangle=-35,
	    )
	    return chart


	def _refinement_df(rl: list) -> pd.DataFrame:
	    """Turn the label refinement log into a table.

	    Args:
	        rl: List of log entries, each with ``cluster``, ``iteration``,
	            ``old_label``, ``new_label``, ``improvement_score``,
	            ``hallucination_detected`` and optionally ``issues``.

	    Returns:
	        One row per entry, or an empty frame that still carries the column
	        headers when the log is empty.
	    """
	    if rl:
	        records = []
	        for entry in rl:
	            records.append({
	                "Cluster": entry["cluster"],
	                "Iteration": entry["iteration"],
	                "Old Label": entry["old_label"],
	                "New Label": entry["new_label"],
	                "Issues": "; ".join(entry.get("issues", [])),
	                "Improvement": entry["improvement_score"],
	                "Hallucination Detected": entry["hallucination_detected"],
	            })
	        return pd.DataFrame(records)

	    headers = ["Cluster", "Iteration", "Old Label", "New Label",
	               "Issues", "Improvement", "Hallucination Detected"]
	    return pd.DataFrame(columns=headers)


	def _regex_pattern_info() -> str:
	    """Return the Markdown explainer shown under the regex hits table.

	    The text describes the four extraction steps and then lists every
	    entry of the methodology and technique pattern banks as
	    ``- name: `pattern```.
	    """
	    def _bullets(bank):
	        # One Markdown bullet per pattern, showing its regex source.
	        return "\n".join(
	            f"- {name}: `{compiled.pattern}`" for name, compiled in bank.items()
	        )

	    parts = [
	        "### How Cluster Methodology Extraction Works\n\n",
	        "Step 1 — Regex Pre-Scan: Two compiled pattern banks run against representative ",
	        "abstracts. Every match recorded with exact character span, matched text, paper number.\n\n",
	        "Step 2 — 3-LLM Council: Groq, Mistral, Gemini each receive regex evidence + abstracts. ",
	        "Each LLM confirms/rejects regex hits and adds any missed methods/techniques.\n\n",
	        "Step 3 — ≥2-LLM Gate: Only items named by ≥2 LLMs survive. Percentages averaged.\n\n",
	        "Step 4 — Orientation: Empirical/Theoretical/Mixed averaged across 3 LLMs.\n\n",
	        "---\n\n#### Methodology Bank\n",
	        _bullets(METHODOLOGY_PATTERNS),
	        "\n\n#### Technique Bank\n",
	        _bullets(TECHNIQUE_PATTERNS),
	    ]
	    return "".join(parts)


	# ── NEW helpers for methodology-CSV pipeline ─────────────────────────────────
	def _tech_sheet_df(sheet_rows: list) -> pd.DataFrame:
	    """Wrap a technique sheet (list of row dicts) in a DataFrame.

	    An empty or missing sheet yields an empty DataFrame.
	    """
	    if not sheet_rows:
	        return pd.DataFrame()
	    return pd.DataFrame(sheet_rows)


	def _tech_llm_pct_chart(comp_sheets: dict) -> go.Figure:
	    """Draw the grouped bar chart of technique frequency per LLM.

	    For each technique the chart shows the percentage of papers in which
	    it was listed by Groq (sheet 1), Mistral (sheet 2), Gemini (sheet 3)
	    and the consolidated sheet (sheet 4).

	    Args:
	        comp_sheets: Mapping of sheet number (1-4) to a list of row dicts.
	            Each row's ``techniques`` value is a comma-separated string;
	            ``None``, ``""`` and ``"—"`` mean "no technique".

	    Returns:
	        A figure with one bar group per technique, techniques sorted by
	        name.
	    """
	    def _freq(rows):
	        # Percentage of rows (papers) that mention each technique.
	        n = len(rows) or 1
	        counts = {}
	        for row in rows:
	            # A set counts each technique once per paper, so a repeated
	            # name cannot push a value past 100%. Splitting on "," and
	            # stripping also handles lists written without spaces.
	            found = {
	                part.strip().title()
	                for part in (row.get("techniques", "") or "").split(",")
	            }
	            for tech in found:
	                if tech and tech != "—":
	                    counts[tech] = counts.get(tech, 0) + 1
	        return {k: round(v / n * 100) for k, v in counts.items()}

	    # (legend name, sheet number, bar colour)
	    series = (
	        ("Groq", 1, "#5b9cf6"),
	        ("Mistral", 2, "#f5a623"),
	        ("Gemini", 3, "#a855f7"),
	        ("Consolidated", 4, "#3dba7a"),
	    )
	    freqs = {name: _freq(comp_sheets.get(sheet, [])) for name, sheet, _ in series}
	    all_techs = sorted(set().union(*freqs.values()))

	    fig = go.Figure()
	    for name, _, colour in series:
	        fig.add_trace(go.Bar(name=name, x=all_techs,
	                             y=[freqs[name].get(t, 0) for t in all_techs],
	                             marker_color=colour))
	    fig.update_layout(
	        barmode="group", template="plotly_dark", height=480,
	        paper_bgcolor="#0d1117", plot_bgcolor="#161b22",
	        title="Computational Technique Frequency — % of Papers per LLM (Groq / Mistral / Gemini / Consolidated)",
	        xaxis_title="Technique", yaxis_title="% of papers",
	        font=dict(size=10), legend=dict(orientation="h", y=1.12),
	        xaxis_tickangle=-40,
	    )
	    return fig


	def _journal_crosstab_chart(journal_crosstab: dict) -> go.Figure:
	    """Draw technique usage per journal as a grouped bar chart.

	    Journals sit on the x axis; each technique contributes one bar series.
	    Only the first 15 techniques are drawn to keep the chart readable.

	    Args:
	        journal_crosstab: Dict with optional ``consolidated``
	            (journal -> technique -> value), ``journals`` and
	            ``techniques`` entries.

	    Returns:
	        The grouped bar figure, or a titled empty figure when there are no
	        journals or no techniques.
	    """
	    table = journal_crosstab.get("consolidated", {})
	    journals = journal_crosstab.get("journals", [])
	    techniques = journal_crosstab.get("techniques", [])

	    chart = go.Figure()
	    if not (journals and techniques):
	        chart.update_layout(template="plotly_dark", title="No journal data available",
	                            paper_bgcolor="#0d1117")
	        return chart

	    palette = ("#5b9cf6", "#3dba7a", "#f5a623", "#e04d4d", "#a855f7", "#06b6d4",
	               "#f97316", "#84cc16", "#ec4899", "#14b8a6", "#8b5cf6", "#ef4444")

	    for position, technique in enumerate(techniques[:15]):
	        heights = [table.get(journal, {}).get(technique, 0) for journal in journals]
	        # Colours repeat once the palette is exhausted.
	        colour = palette[position % len(palette)]
	        chart.add_trace(go.Bar(name=technique, x=journals, y=heights,
	                               marker_color=colour))

	    chart.update_layout(
	        barmode="group", template="plotly_dark", height=500,
	        paper_bgcolor="#0d1117", plot_bgcolor="#161b22",
	        title="Computational Technique Usage — Cross-Tabulation by Journal (%)",
	        xaxis_title="Journal", yaxis_title="% of papers using technique",
	        font=dict(size=10), legend=dict(orientation="h", y=1.15),
	        xaxis_tickangle=-20,
	    )
	    return chart


	def _journal_crosstab_df(journal_crosstab: dict) -> pd.DataFrame:
	    """Render the journal x technique cross-tabulation as a table.

	    Args:
	        journal_crosstab: Dict with optional ``consolidated``
	            (journal -> technique -> value), ``journals``, ``techniques``
	            and ``journal_paper_counts`` entries.

	    Returns:
	        One row per journal with its paper count and one ``"<value>%"``
	        cell per technique (``"0%"`` when the pair is absent).
	    """
	    table = journal_crosstab.get("consolidated", {})
	    journals = journal_crosstab.get("journals", [])
	    techniques = journal_crosstab.get("techniques", [])
	    counts = journal_crosstab.get("journal_paper_counts", {})

	    def _record(journal):
	        per_journal = table.get(journal, {})
	        record = {"Journal": journal, "N Papers": counts.get(journal, 0)}
	        record.update((tech, f"{per_journal.get(tech, 0)}%") for tech in techniques)
	        return record

	    return pd.DataFrame([_record(journal) for journal in journals])


	def _tech_opt_df(opt_log: list) -> pd.DataFrame:
	    """Turn the technique optimisation log into a table.

	    Args:
	        opt_log: List of entries, each providing every key named in the
	            mapping below.

	    Returns:
	        One row per entry, or an empty frame that still carries the column
	        headers when the log is empty.
	    """
	    # Display column -> key in each log entry (order defines column order).
	    column_keys = {
	        "Technique": "technique",
	        "Refined Name": "refined_name",
	        "Hallucination": "is_hallucination",
	        "High Variance": "high_variance",
	        "Groq %": "pct_groq",
	        "Mistral %": "pct_mistral",
	        "Gemini %": "pct_gemini",
	        "Suggestion": "suggestion",
	        "Split Into": "split_into",
	        "Merge With": "merge_with",
	    }
	    if not opt_log:
	        return pd.DataFrame(columns=list(column_keys))
	    records = [
	        {column: entry[key] for column, key in column_keys.items()}
	        for entry in opt_log
	    ]
	    return pd.DataFrame(records)


	def _per_llm_freq_df(journal_crosstab: dict) -> pd.DataFrame:
	    """Tabulate per-LLM technique frequency with the spread between models.

	    Args:
	        journal_crosstab: Dict whose optional ``per_llm_freq`` entry maps
	            ``"Groq"`` / ``"Mistral"`` / ``"Gemini"`` to a
	            technique -> percentage dict.

	    Returns:
	        One row per technique, sorted by ``"Groq %"`` descending. The
	        ``"Variance"`` column holds the rounded gap between the highest
	        and lowest of the three percentages. When there is no frequency
	        data an empty frame with the same columns is returned.
	    """
	    columns = ["Technique", "Groq %", "Mistral %", "Gemini %", "Variance"]
	    per_llm = journal_crosstab.get("per_llm_freq", {})
	    techniques = sorted({t for freqs in per_llm.values() for t in freqs})

	    rows = []
	    for t in techniques:
	        groq = per_llm.get("Groq", {}).get(t, 0)
	        mistral = per_llm.get("Mistral", {}).get(t, 0)
	        gemini = per_llm.get("Gemini", {}).get(t, 0)
	        shares = (groq, mistral, gemini)
	        rows.append({
	            "Technique": t,
	            "Groq %": groq,
	            "Mistral %": mistral,
	            "Gemini %": gemini,
	            "Variance": round(max(shares) - min(shares)),
	        })

	    # Sorting a column-less frame raises KeyError, which happens whenever
	    # the crosstab is empty (e.g. no methodology CSV was supplied).
	    if not rows:
	        return pd.DataFrame(columns=columns)
	    return pd.DataFrame(rows).sort_values("Groq %", ascending=False)


	# ── NEW: Cluster Sizes bar chart (what supervisor pointed to) ────────────────
	def _cluster_sizes_chart(interps: dict, disc: dict) -> go.Figure:
	    """Draw papers-per-cluster bars coloured by the two size rules.

	    Colours:
	        Green  - share of papers is at most 25% and size is at least 5.
	        Yellow - share of papers exceeds 25%.
	        Red    - fewer than 5 papers (only checked when the share is fine).
	    The paper count is printed above each bar and a dashed line marks the
	    25% cap.

	    Args:
	        interps: Mapping of cluster id to a dict with ``label`` and
	            optional ``strong`` / ``weak`` member counts.
	        disc: Dict whose optional ``cluster_sizes`` maps cluster id to its
	            paper count.

	    Returns:
	        The bar figure, clusters in ascending id order.
	    """
	    cluster_sizes = disc.get("cluster_sizes", {})
	    cluster_ids = sorted(interps)

	    # A cluster missing from ``cluster_sizes`` falls back to strong + weak.
	    sizes = [
	        cluster_sizes.get(
	            cid, interps[cid].get("strong", 0) + interps[cid].get("weak", 0))
	        for cid in cluster_ids
	    ]
	    # Use the same fallback for the denominator. Otherwise a missing
	    # ``cluster_sizes`` gives n_docs == 1 and every bar is flagged as
	    # exceeding the 25% cap.
	    n_docs = sum(cluster_sizes.values()) or sum(sizes) or 1
	    max_allowed = int(0.25 * n_docs)

	    labels, colors, texts = [], [], []
	    for cid, size in zip(cluster_ids, sizes):
	        mass_pct = size / n_docs
	        if mass_pct > 0.25:
	            color = "#f5c518"  # yellow — mass violation
	        elif size < 5:
	            color = "#e04d4d"  # red — too small
	        else:
	            color = "#3dba7a"  # green — PASS
	        labels.append(interps[cid]["label"])
	        colors.append(color)
	        texts.append(str(size))

	    fig = go.Figure(go.Bar(
	        x=labels, y=sizes,
	        marker_color=colors,
	        text=texts,
	        textposition="outside",
	        textfont=dict(size=11, color="#c9d1d9"),
	    ))
	    fig.add_hline(y=max_allowed, line_dash="dash", line_color="#f5a623",
	                  annotation_text=f"25% cap ({max_allowed} papers)",
	                  annotation_font_color="#f5a623")
	    fig.update_layout(
	        template="plotly_dark", height=520,
	        paper_bgcolor="#0d1117", plot_bgcolor="#161b22",
	        title="Cluster Sizes (Papers per Cluster) — Green=PASS · Yellow=Mass>25% · Red=Size<5",
	        xaxis_title="Cluster", yaxis_title="Number of Papers",
	        font=dict(size=10), xaxis_tickangle=-40,
	        showlegend=False,
	        margin=dict(t=80, b=200),
	    )
	    return fig


	# ── NEW: Reproducibility panel ────────────────────────────────────────────────
	def _reproducibility_df(td: dict, interps: dict) -> pd.DataFrame:
	    """Build the reproducibility table: overall stability plus per-cluster rows.

	    The first row (``Cluster == "ALL"``) reports ``td["metrics"]["stability"]``.
	    The remaining rows list each cluster's persistence, member counts and a
	    verdict, sorted by persistence descending.

	    Verdict thresholds:
	        per cluster - Stable at >= 0.7, Borderline at >= 0.4, else Fragile.
	        overall     - Stable at >= 0.8, Borderline at >= 0.5, else Unstable.

	    Args:
	        td: Topic data dict with a ``metrics`` dict (optional ``stability``)
	            and an optional ``cluster_persistence`` mapping.
	        interps: Mapping of cluster id to a dict with ``label`` and
	            optional ``strong`` / ``weak`` counts.

	    Returns:
	        The combined table. With no clusters only the overall row is
	        returned.

	    Raises:
	        KeyError: If ``td`` has no ``metrics`` entry.
	    """
	    cluster_persistence = td.get("cluster_persistence", {})
	    overall_stability = td["metrics"].get("stability", 0.0)

	    rows = []
	    for cid in sorted(interps):
	        info = interps[cid]
	        pers = cluster_persistence.get(cid, 0.0)
	        strong = info.get("strong", 0)
	        weak = info.get("weak", 0)
	        if pers >= 0.7:
	            verdict, note = "✅ Stable", "Likely same label on re-run"
	        elif pers >= 0.4:
	            verdict, note = "⚠ Borderline", "Label may shift slightly"
	        else:
	            verdict = "❌ Fragile"
	            note = "May merge/split on re-run — consider merging with adjacent cluster"
	        rows.append({
	            "Cluster": cid,
	            "Label": info["label"],
	            "Cluster Persistence": round(pers, 4),
	            "Strong Members": strong,
	            "Weak Members": weak,
	            "Total Papers": strong + weak,
	            "Stability Verdict": verdict,
	            "Note": note,
	        })

	    if overall_stability >= 0.8:
	        overall_verdict = "✅ Stable"
	    elif overall_stability >= 0.5:
	        overall_verdict = "⚠ Borderline"
	    else:
	        overall_verdict = "❌ Unstable"
	    overall_row = pd.DataFrame([{
	        "Cluster": "ALL",
	        "Label": f"Overall ARI Stability across 3 seeds = {round(overall_stability,4)}",
	        "Cluster Persistence": overall_stability,
	        "Strong Members": "—", "Weak Members": "—", "Total Papers": "—",
	        "Stability Verdict": overall_verdict,
	        "Note": "ARI close to 1.0 → running the pipeline again will produce the same clusters",
	    }])

	    # Sorting a column-less frame raises KeyError, so skip it when the
	    # pipeline produced no interpreted clusters.
	    if not rows:
	        return overall_row
	    df = pd.DataFrame(rows).sort_values("Cluster Persistence", ascending=False)
	    return pd.concat([overall_row, df], ignore_index=True)


	def _reproducibility_chart(td: dict, interps: dict) -> go.Figure:
	    """Draw a horizontal bar chart of per-cluster persistence.

	    Bars are ordered by persistence ascending and coloured green at
	    >= 0.7, orange at >= 0.4 and red below. Dotted vertical lines mark
	    both thresholds.

	    Args:
	        td: Topic data dict with an optional ``cluster_persistence``
	            mapping of cluster id to score (missing ids count as 0).
	        interps: Mapping of cluster id to a dict with ``label``; the label
	            is cut to 35 characters for the axis.

	    Returns:
	        The bar figure; its height grows with the number of clusters.
	    """
	    cluster_persistence = td.get("cluster_persistence", {})
	    labels, persis, colors = [], [], []
	    for cid in sorted(interps, key=lambda c: cluster_persistence.get(c, 0)):
	        p = cluster_persistence.get(cid, 0.0)
	        labels.append(interps[cid]["label"][:35])
	        persis.append(round(p, 4))
	        colors.append("#3dba7a" if p >= 0.7 else "#f5a623" if p >= 0.4 else "#e04d4d")

	    fig = go.Figure(go.Bar(
	        x=persis, y=labels, orientation="h",
	        marker_color=colors,
	        text=[str(v) for v in persis],
	        textposition="outside",
	    ))
	    fig.add_vline(x=0.7, line_dash="dot", line_color="#3dba7a",
	                  annotation_text="Stable threshold (0.7)")
	    fig.add_vline(x=0.4, line_dash="dot", line_color="#f5a623",
	                  annotation_text="Borderline (0.4)")
	    fig.update_layout(
	        template="plotly_dark", height=max(400, len(labels)*28),
	        paper_bgcolor="#0d1117", plot_bgcolor="#161b22",
	        # Plotly renders line breaks in text from "<br>", not from "\n".
	        title="Cluster Persistence — Proxy for Reproducibility<br>"
	              "Green ≥ 0.7 (stable) · Orange 0.4–0.7 (borderline) · Red < 0.4 (fragile)",
	        xaxis_title="Persistence Score", yaxis_title="",
	        font=dict(size=10), margin=dict(l=260),
	    )
	    return fig


	# ── NEW: Human interpretability check ────────────────────────────────────────
	def _interpretability_df(interps: dict) -> pd.DataFrame:
	    """Flag cluster labels that overlap with each other or are too vague.

	    Two checks run over the labels:
	      1. Overlap — every pair of labels sharing at least 2 significant
	         words is reported (3 or more shared words is HIGH severity).
	      2. Vagueness — a label is reported when, after removing noise words,
	         nothing but generic terms (or nothing at all) is left.
	    Significant words are lower-cased alphabetic runs of 4+ letters that
	    are not in the noise list.

	    Args:
	        interps: Mapping of cluster id to a dict with a ``label`` string.

	    Returns:
	        One row per finding (overlaps first, then vague labels), or a
	        single "All Clear" row when nothing was flagged.
	    """
	    import itertools

	    noise = frozenset({
	        "the", "and", "for", "with", "using", "based", "from", "that", "are", "this",
	        "in", "of", "a", "to", "an", "on", "at", "by", "or", "as", "is", "its", "via",
	        "systems", "digital", "information", "management", "driven",
	    })
	    vague_singles = frozenset({
	        "systems", "digital", "data", "information", "analysis", "research",
	        "study", "approach", "framework", "model", "methods", "technology",
	    })

	    def _sig_words(label: str) -> set:
	        return set(re.findall(r"\b[a-z]{4,}\b", label.lower())) - noise

	    cids = sorted(interps)
	    labels_map = {cid: interps[cid]["label"] for cid in cids}
	    words_map = {cid: _sig_words(labels_map[cid]) for cid in cids}

	    rows = []

	    # combinations() already yields each unordered pair exactly once.
	    for cid_a, cid_b in itertools.combinations(cids, 2):
	        overlap = words_map[cid_a] & words_map[cid_b]
	        if len(overlap) < 2:
	            continue
	        rows.append({
	            "Issue": "⚠ Label Overlap",
	            "Cluster A": cid_a,
	            "Label A": labels_map[cid_a],
	            "Cluster B": cid_b,
	            "Label B": labels_map[cid_b],
	            "Shared Words": ", ".join(sorted(overlap)),
	            "Severity": "HIGH — consider merging" if len(overlap) >= 3
	                        else "MEDIUM — review distinctiveness",
	            "Action": "Check if these two clusters cover the same research theme. "
	                      "If yes, increase min_cluster_size to force a merge.",
	        })

	    for cid in cids:
	        sig = words_map[cid]
	        if sig - vague_singles:
	            continue  # at least one specific word: label is fine
	        rows.append({
	            "Issue": "❌ Too Vague",
	            "Cluster A": cid,
	            "Label A": labels_map[cid],
	            "Cluster B": "—",
	            "Label B": "—",
	            # Sorted so the cell text is identical on every run; joining
	            # the raw set gives an arbitrary word order.
	            "Shared Words": ", ".join(sorted(sig & vague_singles)),
	            "Severity": "HIGH — label is not human interpretable",
	            "Action": "Run optimization pass to refine the label, "
	                      "or manually inspect keyphrases for more specific terms.",
	        })

	    if not rows:
	        rows.append({
	            "Issue": "✅ All Clear",
	            "Cluster A": "—", "Label A": "All labels are distinct and specific",
	            "Cluster B": "—", "Label B": "—",
	            "Shared Words": "—", "Severity": "NONE",
	            "Action": "Topic list is human interpretable and non-overlapping.",
	        })

	    return pd.DataFrame(rows)


	# ── Pipeline runner ──────────────────────────────────────────────────────────
	def _run(corpus_file, method_file, gk, mk, gek, n_trials, n_optimize,
	         progress=gr.Progress(track_tqdm=True)):
	    """Run the pipeline on the uploaded files and build every UI output.

	    Args:
	        corpus_file: Uploaded corpus CSV; its path is read from ``.name``.
	        method_file: Optional methodology CSV; ``None`` is passed to the
	            pipeline when it is missing.
	        gk, mk, gek: Groq / Mistral / Gemini API keys. A blank value falls
	            back to ``GROQ_API_KEY`` / ``MISTRAL_API_KEY`` /
	            ``GEMINI_API_KEY`` from the environment.
	        n_trials, n_optimize: Cast to ``int`` before being handed to
	            ``run_pipeline``.
	        progress: Gradio progress tracker.

	    Returns:
	        A 32-item tuple in the order given by the ``return`` statement at
	        the bottom: summary Markdown, figures, DataFrames and the list of
	        download paths (``None`` when there are none).
	        NOTE(review): the order presumably has to match the ``outputs``
	        list of the click handler, which is outside this view — confirm
	        before reordering.

	    Raises:
	        gr.Error: No corpus file, a missing API key, or an ``error`` entry
	            in the pipeline result.
	    """
	    if not corpus_file: raise gr.Error("Upload a Scopus corpus CSV first.")
	    # A key typed into the textbox wins; otherwise use the environment.
	    gk = gk.strip() or os.getenv("GROQ_API_KEY","")
	    mk = mk.strip() or os.getenv("MISTRAL_API_KEY","")
	    gek = gek.strip() or os.getenv("GEMINI_API_KEY","")
	    if not all([gk,mk,gek]): raise gr.Error("All 3 API keys required.")

	    method_path = method_file.name if method_file else None

	    # Both progress messages are posted before run_pipeline starts.
	    progress(0.05, desc="📥 Loading CSV…")
	    progress(0.10, desc="🔬 Embedding corpus with SPECTER-2…")
	    r = run_pipeline(corpus_file.name, gk, mk, gek,
	                     int(n_trials), int(n_optimize), method_path)
	    if r.get("error"): raise gr.Error(r["error"])
	    progress(0.85, desc="📊 Building outputs…")

	    td, interps = r["topic_data"], r.get("interpretations",{})
	    disc, met = td["discipline"], td["metrics"]
	    ar = r.get("agreement_rates",{})
	    rl = r.get("refinement_log", [])

	    # NOTE(review): the "\|" sequences in the f-strings below look like
	    # Markdown-escape residue — confirm whether a plain "|" was intended.
	    def _s(ok): return "✅ PASS" if ok else "❌ FAIL"
	    summary = (
	        f"## Pipeline Complete — {disc['n_clusters']} clusters discovered\n\n"
	        f"\| Criterion \| Value \| Status \|\n\|---\|---\|---\|\n"
	        f"\| Max cluster mass \| {round(disc['max_mass_pct']*100,1)}% \| {_s(disc['max_mass_ok'])} \|\n"
	        f"\| Min cluster size \| {disc['min_size']} \| {_s(disc['min_size_ok'])} \|\n"
	        f"\| Persistence (mean) \| {round(met['persistence'],4)} \| — \|\n"
	        f"\| DBCV \| {round(met['dbcv'],4)} \| — \|\n"
	        f"\| Stability (3 seeds) \| {round(met['stability'],4)} \| — \|\n\n"
	        f"Trials: {td['n_trials_run']} (best #{td['best_trial']}) · "
	        f"Agreement: Triple {ar.get('triple',0)}% · Two+ {ar.get('two_or_more',0)}% · "
	        f"Optimization passes: {n_optimize} · Labels refined: {len(rl)}"
	    )

	    # UMAP scatter
	    u2d = np.array(td["umap_2d"])
	    # Hover text shows only the first 60 characters of each document.
	    sdf = pd.DataFrame({"UMAP-1":u2d[:,0],"UMAP-2":u2d[:,1],
	                        "Cluster":[str(l) for l in td["labels"]],
	                        "Doc":[d[:60] for d in td["documents"]]})
	    fig = px.scatter(sdf, x="UMAP-1", y="UMAP-2", color="Cluster",
	                     hover_data=["Doc"], opacity=0.75,
	                     title="2-D UMAP visualisation of SPECTER-2 embeddings")
	    fig.update_layout(template="plotly_dark", height=500,
	                      paper_bgcolor="#0d1117", plot_bgcolor="#161b22", font=dict(size=11))

	    # Trial log + Pareto
	    tl = pd.DataFrame(td["trial_log"])
	    # Show only the listed columns that the trial log actually contains.
	    tl_cols = [c for c in ["trial","discipline_pass","n_clusters","persistence",
	                           "dbcv","max_mass_pct","min_size","n_noise"] if c in tl.columns]
	    tl_show = tl[tl_cols] if not tl.empty else pd.DataFrame()

	    pfig = go.Figure()
	    if not tl.empty:
	        # One scatter trace for passing trials and one for failing trials.
	        for passed, color, name in [(True,"#3dba7a","PASS"),(False,"#e04d4d","FAIL")]:
	            sub = tl[tl["discipline_pass"]==passed]
	            if not sub.empty:
	                pfig.add_trace(go.Scatter(x=sub["max_mass_pct"],y=sub["persistence"],
	                                          mode="markers",marker=dict(size=8,color=color),name=name,
	                                          text=sub["trial"],hovertemplate="Trial %{text}<br>Mass: %{x:.0%}<br>Pers: %{y:.3f}"))
	    pfig.add_vline(x=0.25,line_dash="dash",line_color="#5a6480",annotation_text="25% rule")
	    pfig.update_layout(template="plotly_dark",height=400,
	                       paper_bgcolor="#0d1117",plot_bgcolor="#161b22",
	                       title="Pareto front — Persistence vs Max cluster mass",
	                       xaxis_title="Max cluster mass",yaxis_title="Persistence",font=dict(size=11))

	    # Cluster overview table: one row per interpreted cluster.
	    cdf_rows = []
	    for cid in sorted(interps.keys()):
	        v = interps[cid]
	        cdf_rows.append({"Cluster":cid,"Label":v["label"],"Agreement":v["agreement"],
	                         "Strong":v["strong"],"Weak":v["weak"],
	                         "Persistence":round(v.get("persistence",0),4),
	                         "Keyphrases":", ".join(v.get("keyphrases",[]))})
	    cdf = pd.DataFrame(cdf_rows)

	    # Sheets 1-4 of the pipeline result, plus their file paths for download.
	    sheets = r.get("sheets",{})
	    s1 = pd.DataFrame(sheets.get(1,[])); s2 = pd.DataFrame(sheets.get(2,[]))
	    s3 = pd.DataFrame(sheets.get(3,[])); s4 = pd.DataFrame(sheets.get(4,[]))
	    sp = r.get("sheet_paths",{})
	    mdf = pd.DataFrame(r.get("mismatch_table",[]))

	    md_data = r.get("methodology_data",{})
	    top_papers_df = _top_papers_df(r.get("top_papers",{}))
	    method_sum_df = _methodology_summary_df(md_data, interps)
	    method_chart = _methodology_bar_chart(md_data, interps)
	    extraction_df = _extraction_pipeline_df(md_data, interps)
	    per_llm_meth_df = _per_llm_methodology_df(md_data, interps)
	    regex_hits_df = _regex_hits_df(md_data, interps)
	    pattern_info = _regex_pattern_info()
	    refine_df = _refinement_df(rl)

	    # ── NEW: methodology-CSV outputs ─────────────────────────────────────────
	    # Each lookup defaults to an empty container when the result lacks the key.
	    comp_sheets = r.get("comp_technique_sheets", {1:[], 2:[], 3:[], 4:[]})
	    jct = r.get("journal_crosstab", {})
	    tech_opt_log = r.get("technique_opt_log", [])

	    tech_s1 = _tech_sheet_df(comp_sheets.get(1,[]))
	    tech_s2 = _tech_sheet_df(comp_sheets.get(2,[]))
	    tech_s3 = _tech_sheet_df(comp_sheets.get(3,[]))
	    tech_s4 = _tech_sheet_df(comp_sheets.get(4,[]))

	    tech_llm_chart = _tech_llm_pct_chart(comp_sheets)
	    jct_chart = _journal_crosstab_chart(jct)
	    jct_df = _journal_crosstab_df(jct)
	    per_llm_freq_df = _per_llm_freq_df(jct)
	    tech_opt_df = _tech_opt_df(tech_opt_log)

	    # ── NEW: cluster sizes, reproducibility, interpretability ─────────────────
	    cluster_sizes_fig = _cluster_sizes_chart(interps, disc)
	    repro_chart = _reproducibility_chart(td, interps)
	    repro_df = _reproducibility_df(td, interps)
	    interpretability_df = _interpretability_df(interps)

	    progress(1.0, desc="✅ Done!")
	    # Keep only the download paths that are actually set.
	    dl_files = [f for f in [sp.get(1),sp.get(2),sp.get(3),sp.get(4),r.get("json_path")] if f]

	    return (
	        # ── original outputs (order preserved) ───────────────────────────────
	        summary, fig, pfig, tl_show, cdf,
	        top_papers_df,
	        method_chart, method_sum_df, extraction_df, per_llm_meth_df,
	        regex_hits_df, pattern_info,
	        refine_df,
	        s1, s2, s3, s4,
	        dl_files if dl_files else None,
	        mdf,
	        # ── new outputs ───────────────────────────────────────────────────────
	        tech_llm_chart,
	        tech_s1, tech_s2, tech_s3, tech_s4,
	        per_llm_freq_df,
	        jct_chart,
	        jct_df,
	        tech_opt_df,
	        # ── supervisor additions ──────────────────────────────────────────────
	        cluster_sizes_fig,
	        repro_chart,
	        repro_df,
	        interpretability_df,
	    )


	# ── UI ────────────────────────────────────────────────────────────────────────
	# Page-level CSS: dark container colours and a hidden footer.
	css = (
	    ".gradio-container{background:#0d1117!important;color:#c9d1d9!important}"
	    "footer{display:none!important}"
	)

	with gr.Blocks(theme=gr.themes.Base(primary_hue="blue", neutral_hue="slate"),
	css=css, title="SPECTER-2 Topic Analyzer") as demo:
	gr.Markdown("# 📐 SPECTER-2 Topic Analyzer")

	with gr.Row():
	# ── Left sidebar ─────────────────────────────────────────────────────
	with gr.Column(scale=1):
	gr.Markdown("### 📄 Corpus CSV")
	file_in = gr.File(label="Upload Scopus CSV (title + abstract)",
	file_types=[".csv"])
	preview_out = gr.Markdown("Upload a CSV to see stats.")

	gr.Markdown("### 🔬 Methodology CSV (optional)")
	method_file_in = gr.File(label="Upload Methodology CSV (title, doi, methodology)",
	file_types=[".csv"])
	method_preview = gr.Markdown("Upload methodology CSV to enable technique analysis.")

	gr.Markdown("### 🔑 API Keys")
	groq_in = gr.Textbox(label="Groq API Key", type="password",
	placeholder="or set GROQ_API_KEY env var")
	mistral_in = gr.Textbox(label="Mistral API Key", type="password",
	placeholder="or set MISTRAL_API_KEY env var")
	gemini_in = gr.Textbox(label="Gemini API Key", type="password",
	placeholder="or set GEMINI_API_KEY env var")

	gr.Markdown("### ⚙ Parameters")
	trials_in = gr.Slider(10, 100, 50, step=5, label="Optuna Trials")
	optimize_in = gr.Slider(1, 5, 1, step=1,
	label="🔁 Optimization Passes",
	info="Pass 1 = no refinement. 2–5 = LLM critic audits topic labels "
	"AND technique labels for hallucinations + improvements.")
	run_btn = gr.Button("▶ Run Full Pipeline", variant="primary", size="lg")

	# ── Main panel ────────────────────────────────────────────────────────
	with gr.Column(scale=3):
	with gr.Tabs():

	# ── original tabs (order / content unchanged) ─────────────────
	with gr.Tab("Summary"):
	summary_out = gr.Markdown()

	with gr.Tab("2-D UMAP"):
	scatter_out = gr.Plot()

	with gr.Tab("Pareto Front"):
	pareto_out = gr.Plot()

	with gr.Tab("Trial Log"):
	trial_out = gr.Dataframe()

	with gr.Tab("Clusters"):
	cluster_out = gr.Dataframe()

	with gr.Tab("🗞 Top 3 Papers"):
	gr.Markdown("### Top 3 Representative Papers per Cluster\n"
	"Ranked by cosine similarity to cluster centroid "
	"in SPECTER-2 embedding space.")
	top_papers_out = gr.Dataframe(
	headers=["Cluster","Label","Rank","Title","Abstract Snippet"],
	wrap=True)

	with gr.Tab("🔬 Cluster Methodology"):
	gr.Markdown("### Cluster-Level Methodology — 3-LLM Council\n"
	"Derived from representative abstracts per cluster. "
	"≥2-LLM gate applied.")
	method_chart_out = gr.Plot()
	method_summary_out = gr.Dataframe(wrap=True)

	with gr.Tab("⚙ Cluster Extraction Pipeline"):
	gr.Markdown("### Full Regex + LLM Extraction Trace (per cluster)")
	extraction_out = gr.Dataframe(wrap=True)

	with gr.Tab("🤖 Cluster Per-LLM Votes"):
	gr.Markdown("### Raw Per-LLM Methodology Votes (per cluster)")
	per_llm_out = gr.Dataframe(wrap=True)

	with gr.Tab("🔍 Cluster Regex Hits"):
	gr.Markdown("### Regex Pattern Matches (per cluster)\n"
	"Every match with exact character span and paper number.")
	regex_hits_out = gr.Dataframe(wrap=True)
	regex_info_out = gr.Markdown()

	with gr.Tab("🔁 Refinement Log"):
	gr.Markdown("### Topic Label Optimization Log\n"
	"Changes made by LLM critic per optimization pass.")
	refine_out = gr.Dataframe(wrap=True)

	with gr.Tab("Sheet 1 — Groq"): s1_out = gr.Dataframe()
	with gr.Tab("Sheet 2 — Mistral"): s2_out = gr.Dataframe()
	with gr.Tab("Sheet 3 — Gemini"): s3_out = gr.Dataframe()
	with gr.Tab("Sheet 4 — Consolidated"): s4_out = gr.Dataframe()
	with gr.Tab("RQ Mismatch"): mismatch_out = gr.Dataframe()
	with gr.Tab("Downloads"):
	dl_out = gr.File(label="All sheet CSVs + topics.json",
	file_count="multiple")

	# ── NEW tabs: methodology CSV pipeline ────────────────────────
	# Tabs for the methodology-CSV technique analysis. As with the tabs above,
	# only empty output components are declared here; all of them are listed in
	# the `outputs=[...]` of the `run_btn.click(...)` call later in this section.
	with gr.Tab("💻 Comp. Techniques — LLM % Chart"):
	gr.Markdown("### Computational Technique Frequency — Methodology CSV\n"
	"For each technique, shows the % of papers it was extracted "
	"from by each of the 3 LLMs independently + the consolidated "
	"result (≥2-LLM gate). Bars grouped by technique.")
	tech_llm_chart_out = gr.Plot()

	# Technique sheets 1-3: one table per LLM named in the tab title.
	with gr.Tab("💻 Tech Sheet 1 — Groq"):
	gr.Markdown("### Groq raw technique extraction — one row per paper")
	tech_s1_out = gr.Dataframe(wrap=True)

	with gr.Tab("💻 Tech Sheet 2 — Mistral"):
	gr.Markdown("### Mistral raw technique extraction — one row per paper")
	tech_s2_out = gr.Dataframe(wrap=True)

	with gr.Tab("💻 Tech Sheet 3 — Gemini"):
	gr.Markdown("### Gemini raw technique extraction — one row per paper")
	tech_s3_out = gr.Dataframe(wrap=True)

	# Technique sheet 4: the consolidated table.
	with gr.Tab("💻 Tech Sheet 4 — Consolidated"):
	gr.Markdown("### Consolidated techniques — ≥2-LLM agreement, one row per paper")
	tech_s4_out = gr.Dataframe(wrap=True)

	with gr.Tab("📊 Tech Frequency by LLM"):
	gr.Markdown("### Per-LLM Technique Frequency Table\n"
	"% of all papers where each LLM extracted each technique. "
	"High variance = LLMs disagree → optimization flag.")
	per_llm_freq_out = gr.Dataframe(wrap=True)

	with gr.Tab("🗂 Journal Cross-Tabulation"):
	# NOTE(review): the journal list in this text is descriptive only;
	# confirm it matches what the detection code (outside this section)
	# actually recognises.
	gr.Markdown("### Technique × Journal Cross-Tabulation\n"
	"Rows = journals auto-detected from DOI/title. "
	"Columns = consolidated techniques. "
	"Values = % of papers in that journal using the technique.\n\n"
	"Journals detected: MISQ, JAIS, ISR, JMIS, PAJAIS, "
	"ECIS, ICIS, Other.")
	# Chart and table for this tab.
	jct_chart_out = gr.Plot()
	jct_df_out = gr.Dataframe(wrap=True)

	with gr.Tab("🔧 Technique Optimization"):
	gr.Markdown("### Technique Label Improvement Suggestions\n"
	"Groq critic flags: hallucination, high inter-LLM variance "
	"(>15% gap), split/merge recommendations.\n"
	"Only runs when Optimization Passes ≥ 2.")
	tech_opt_out = gr.Dataframe(wrap=True)

	# ── Supervisor-requested additions ────────────────────────────
	# Three diagnostic tabs: cluster sizes, reproducibility, interpretability.
	# Each holds explanatory Markdown followed by empty output components that
	# are listed in the `outputs=[...]` of the `run_btn.click(...)` call below.
	# NOTE(review): the thresholds quoted in the Markdown text (25% mass cap,
	# minimum size 5, ARI ≥ 0.8, persistence 0.4 / 0.7, "≥ 2 significant words")
	# are descriptive only here — confirm they match the values used by the
	# pipeline code, which is outside this section.
	with gr.Tab("📊 Cluster Sizes"):
	gr.Markdown(
	"### Cluster Sizes (Papers per Cluster)\n"
	"Exact chart your supervisor highlighted. "
	"Green = passes both discipline rules (mass ≤ 25%, size ≥ 5). "
	"Yellow = cluster exceeds 25% mass cap — dominant cluster warning. "
	"Red = cluster has fewer than 5 papers — too small.\n\n"
	"The orange dashed line marks the 25% cap. Any bar above it "
	"will fail the discipline check and the pipeline will re-optimise."
	)
	cluster_sizes_out = gr.Plot()

	with gr.Tab("🔄 Reproducibility"):
	gr.Markdown(
	"### Reproducibility — 'Run Again and Again, Topic List is the Same'\n\n"
	"Your supervisor wants proof that running the pipeline multiple times "
	"produces the same clusters. This tab shows two measures:\n\n"
	"Overall ARI Stability (top row) — Adjusted Rand Index averaged "
	"across 3 random seeds. ARI = 1.0 means identical clusters every run. "
	"ARI ≥ 0.8 is considered stable for publication.\n\n"
	"Cluster Persistence (per row) — how strongly each cluster's "
	"structure is preserved in the condensed HDBSCAN tree. "
	"High persistence → cluster survives parameter variation → "
	"same label will appear on re-run. "
	"Low persistence → cluster may split or merge → label may change.\n\n"
	"🟢 ≥ 0.7 Stable · 🟡 0.4–0.7 Borderline · 🔴 < 0.4 Fragile"
	)
	# Chart and table for this tab.
	repro_chart_out = gr.Plot()
	repro_df_out = gr.Dataframe(wrap=True)

	with gr.Tab("🧠 Interpretability Check"):
	gr.Markdown(
	"### Human Interpretability Check — 'Topic List Must Be Distinct'\n\n"
	"Your supervisor flagged that labels like "
	"'Cybersecurity and Privacy' and 'Cyber-Risk Management and Online Security' "
	"look like the same topic. This tab automatically detects:\n\n"
	"⚠ Label Overlap — pairs of cluster labels sharing ≥ 2 significant "
	"words (noise words like 'and', 'for', 'in' excluded). "
	"Overlapping labels suggest the two clusters may cover the same theme "
	"and should be reviewed for merging.\n\n"
	"❌ Too Vague — labels where all meaningful words are generic "
	"('systems', 'digital', 'data') with no domain-specific content. "
	"These need the optimization pass to refine them.\n\n"
	"Action column tells you exactly what to do for each flag."
	)
	interpretability_out = gr.Dataframe(wrap=True)

	# ── Wire callbacks ────────────────────────────────────────────────────────
	# Refresh each CSV preview panel whenever its upload widget changes.
	file_in.change(
	    fn=_preview,
	    inputs=[file_in],
	    outputs=[preview_out],
	)
	method_file_in.change(
	    fn=_preview_methodology,
	    inputs=[method_file_in],
	    outputs=[method_preview],
	)

	# Run the pipeline on button click. Gradio maps the callback's return
	# values onto `outputs` positionally, so the order below must mirror the
	# order in which _run (defined outside this section) returns them.
	run_btn.click(
	    fn=_run,
	    inputs=[
	        file_in,
	        method_file_in,
	        groq_in,
	        mistral_in,
	        gemini_in,
	        trials_in,
	        optimize_in,
	    ],
	    outputs=[
	        # -- original pipeline tabs --
	        summary_out,
	        scatter_out,
	        pareto_out,
	        trial_out,
	        cluster_out,
	        top_papers_out,
	        method_chart_out,
	        method_summary_out,
	        extraction_out,
	        per_llm_out,
	        regex_hits_out,
	        regex_info_out,
	        refine_out,
	        s1_out,
	        s2_out,
	        s3_out,
	        s4_out,
	        # NOTE(review): dl_out precedes mismatch_out here although the
	        # "RQ Mismatch" tab is declared before "Downloads" — confirm
	        # against _run's return order.
	        dl_out,
	        mismatch_out,
	        # -- methodology-CSV tabs --
	        tech_llm_chart_out,
	        tech_s1_out,
	        tech_s2_out,
	        tech_s3_out,
	        tech_s4_out,
	        per_llm_freq_out,
	        jct_chart_out,
	        jct_df_out,
	        tech_opt_out,
	        # -- supervisor additions --
	        cluster_sizes_out,
	        repro_chart_out,
	        repro_df_out,
	        interpretability_out,
	    ],
	)

	# Start the web server only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    # 0.0.0.0 listens on all network interfaces; 7860 is Gradio's
	    # default port.
	    demo.launch(
	        server_port=7860,
	        server_name="0.0.0.0",
	    )