Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,456 +1,124 @@
|
|
| 1 |
-
"""
|
| 2 |
-
app.py
|
| 3 |
-
------
|
| 4 |
-
Streamlit UI — SPECTER-2 + UMAP + HDBSCAN Bayesian Pipeline
|
| 5 |
-
with 2-D UMAP scatter, Pareto front, strong/weak members,
|
| 6 |
-
trial log, and LLM Council Sheets 1-4.
|
| 7 |
-
"""
|
| 8 |
-
|
| 9 |
import os, json, tempfile
|
| 10 |
-
import pandas as pd
|
| 11 |
-
import
|
| 12 |
-
import streamlit as st
|
| 13 |
import plotly.express as px
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
# ── CSS ──────────────────────────────────────────────────────────────────────
|
| 24 |
-
st.markdown("""
|
| 25 |
-
<style>
|
| 26 |
-
@import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;600&family=IBM+Plex+Sans:wght@300;400;500;600&display=swap');
|
| 27 |
-
html, body, [class*="css"] { font-family:'IBM Plex Sans',sans-serif; }
|
| 28 |
-
.stApp { background:#0d0f14; color:#e8eaf0; }
|
| 29 |
-
[data-testid="stSidebar"] { background:#13161e; border-right:1px solid #1f2333; }
|
| 30 |
-
[data-testid="stSidebar"] * { color:#b0b8cc !important; }
|
| 31 |
-
[data-testid="stSidebar"] h1,[data-testid="stSidebar"] h2,[data-testid="stSidebar"] h3 {
|
| 32 |
-
color:#e8eaf0!important; font-family:'IBM Plex Mono',monospace!important;
|
| 33 |
-
font-size:.8rem!important; letter-spacing:.12em!important; text-transform:uppercase!important; }
|
| 34 |
-
.site-header { padding:2.5rem 0 1.5rem; border-bottom:1px solid #1f2333; margin-bottom:2rem; }
|
| 35 |
-
.site-header h1 { font-family:'IBM Plex Mono',monospace; font-size:1.6rem; font-weight:600;
|
| 36 |
-
color:#e8eaf0; letter-spacing:-.01em; margin:0 0 .3rem; }
|
| 37 |
-
.site-header p { font-size:.82rem; color:#5a6480; font-family:'IBM Plex Mono',monospace; margin:0; }
|
| 38 |
-
.pill { display:inline-block; font-family:'IBM Plex Mono',monospace; font-size:.68rem;
|
| 39 |
-
font-weight:600; letter-spacing:.08em; text-transform:uppercase; padding:3px 10px;
|
| 40 |
-
border-radius:2px; margin-right:6px; }
|
| 41 |
-
.pill-blue { background:#0f2a4a; color:#4d9de0; border:1px solid #1a4070; }
|
| 42 |
-
.pill-green { background:#0a2a1a; color:#3dba7a; border:1px solid #1a4a2a; }
|
| 43 |
-
.pill-amber { background:#2a1f00; color:#e8a020; border:1px solid #4a3500; }
|
| 44 |
-
.pill-red { background:#2a0f0f; color:#e04d4d; border:1px solid #4a1a1a; }
|
| 45 |
-
.pill-gray { background:#1a1e2a; color:#7a8090; border:1px solid #2a2e3a; }
|
| 46 |
-
.stat-grid { display:grid; grid-template-columns:repeat(5,1fr); gap:1px;
|
| 47 |
-
background:#1f2333; border:1px solid #1f2333; border-radius:6px; overflow:hidden; margin-bottom:2rem; }
|
| 48 |
-
.stat-card { background:#13161e; padding:1.25rem 1.5rem; text-align:center; }
|
| 49 |
-
.stat-val { font-family:'IBM Plex Mono',monospace; font-size:1.9rem; font-weight:600;
|
| 50 |
-
color:#e8eaf0; line-height:1; margin-bottom:.3rem; }
|
| 51 |
-
.stat-label { font-size:.7rem; color:#5a6480; text-transform:uppercase; letter-spacing:.1em;
|
| 52 |
-
font-family:'IBM Plex Mono',monospace; }
|
| 53 |
-
.section-title { font-family:'IBM Plex Mono',monospace; font-size:.7rem; font-weight:600;
|
| 54 |
-
letter-spacing:.15em; text-transform:uppercase; color:#5a6480;
|
| 55 |
-
padding-bottom:.6rem; border-bottom:1px solid #1f2333; margin-bottom:1.2rem; }
|
| 56 |
-
.topic-card { background:#13161e; border:1px solid #1f2333; border-left:3px solid #4d9de0;
|
| 57 |
-
border-radius:4px; padding:1rem 1.25rem; margin-bottom:.6rem; transition:border-color .15s; }
|
| 58 |
-
.topic-card:hover { border-left-color:#3dba7a; }
|
| 59 |
-
.topic-label { font-size:.92rem; font-weight:500; color:#e8eaf0; margin-bottom:.35rem; }
|
| 60 |
-
.topic-meta { font-family:'IBM Plex Mono',monospace; font-size:.7rem; color:#5a6480; }
|
| 61 |
-
.topic-kw { font-family:'IBM Plex Mono',monospace; font-size:.68rem; color:#3d6480;
|
| 62 |
-
margin-top:.4rem; line-height:1.5; }
|
| 63 |
-
.val-box { background:#0a2a1a; border:1px solid #1a4a2a; border-radius:6px;
|
| 64 |
-
padding:1.25rem 1.5rem; margin-bottom:1.5rem; }
|
| 65 |
-
.val-box h4 { font-family:'IBM Plex Mono',monospace; font-size:.72rem; font-weight:600;
|
| 66 |
-
letter-spacing:.1em; text-transform:uppercase; color:#3dba7a; margin:0 0 .75rem; }
|
| 67 |
-
.val-row { display:flex; justify-content:space-between; align-items:center;
|
| 68 |
-
padding:.4rem 0; border-bottom:1px solid #1a3a2a; font-size:.8rem; color:#a0b8a8; }
|
| 69 |
-
.val-row:last-child { border-bottom:none; }
|
| 70 |
-
.val-key { color:#5a7a6a; } .val-num { font-family:'IBM Plex Mono',monospace; color:#3dba7a; font-weight:600; }
|
| 71 |
-
.stButton > button { background:#4d9de0!important; color:#0d0f14!important; border:none!important;
|
| 72 |
-
border-radius:3px!important; font-family:'IBM Plex Mono',monospace!important;
|
| 73 |
-
font-size:.78rem!important; font-weight:600!important; letter-spacing:.08em!important;
|
| 74 |
-
text-transform:uppercase!important; padding:.6rem 2rem!important; }
|
| 75 |
-
.stButton > button:hover { background:#3d8ed0!important; }
|
| 76 |
-
.stDownloadButton > button { background:transparent!important; color:#4d9de0!important;
|
| 77 |
-
border:1px solid #1a4070!important; border-radius:3px!important;
|
| 78 |
-
font-family:'IBM Plex Mono',monospace!important; font-size:.72rem!important; }
|
| 79 |
-
</style>
|
| 80 |
-
""", unsafe_allow_html=True)
|
| 81 |
-
|
| 82 |
-
# ── Header ───────────────────────────────────────────────────────────────────
|
| 83 |
-
st.markdown("""
|
| 84 |
-
<div class="site-header">
|
| 85 |
-
<h1>SPECTER-2 Topic Analyzer</h1>
|
| 86 |
-
<p>SPECTER-2 embeddings · Bayesian UMAP+HDBSCAN ·
|
| 87 |
-
3-LLM Council (Groq + Mistral + Gemini)</p>
|
| 88 |
-
</div>
|
| 89 |
-
""", unsafe_allow_html=True)
|
| 90 |
-
|
| 91 |
-
# ── Sidebar ──────────────────────────────────────────────────────────────────
|
| 92 |
-
with st.sidebar:
|
| 93 |
-
st.markdown("### API Keys")
|
| 94 |
-
groq_key_in = st.text_input("Groq API Key", type="password")
|
| 95 |
-
mistral_key_in = st.text_input("Mistral API Key", type="password")
|
| 96 |
-
gemini_key_in = st.text_input("Gemini API Key", type="password")
|
| 97 |
-
st.caption("Keys are never stored. Leave blank to use env vars.")
|
| 98 |
-
|
| 99 |
-
st.markdown("---")
|
| 100 |
-
st.markdown("### Bayesian Optimisation")
|
| 101 |
-
n_trials = st.slider("Optuna trials", 20, 100, 50,
|
| 102 |
-
help="§3.4: 50–100 trials recommended")
|
| 103 |
-
st.markdown(
|
| 104 |
-
"<span class='pill pill-blue'>Max mass ≤ 25%</span>"
|
| 105 |
-
"<span class='pill pill-blue'>Min size ≥ 5</span>",
|
| 106 |
-
unsafe_allow_html=True)
|
| 107 |
-
|
| 108 |
-
st.markdown("---")
|
| 109 |
-
st.markdown("### LLM Council")
|
| 110 |
-
st.markdown("""
|
| 111 |
-
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:1rem;">
|
| 112 |
-
<span class="pill pill-blue">Groq / LLaMA-3.1</span>
|
| 113 |
-
<span class="pill pill-green">Mistral Small</span>
|
| 114 |
-
<span class="pill pill-amber">Gemini 2.5 Flash</span>
|
| 115 |
-
</div>
|
| 116 |
-
<p style="font-size:.72rem;color:#5a6480;font-family:'IBM Plex Mono',monospace;">
|
| 117 |
-
Sheet 1–3 per LLM · Sheet 4 consolidation<br>
|
| 118 |
-
Triple / Two / Single agreement tags<br>
|
| 119 |
-
Defence prompt for disagreement clusters
|
| 120 |
-
</p>
|
| 121 |
-
""", unsafe_allow_html=True)
|
| 122 |
-
|
| 123 |
-
st.markdown("---")
|
| 124 |
-
if st.button("Reset Results", use_container_width=True):
|
| 125 |
-
for k in ["results", "agent_out", "topic_data"]:
|
| 126 |
-
st.session_state.pop(k, None)
|
| 127 |
-
st.rerun()
|
| 128 |
-
|
| 129 |
-
groq_key = groq_key_in.strip() or os.getenv("GROQ_API_KEY")
|
| 130 |
-
mistral_key = mistral_key_in.strip() or os.getenv("MISTRAL_API_KEY")
|
| 131 |
-
gemini_key = gemini_key_in.strip() or os.getenv("GEMINI_API_KEY")
|
| 132 |
-
|
| 133 |
-
# ── Upload ───────────────────────────────────────────────────────────────────
|
| 134 |
-
st.markdown("<div class='section-title'>Dataset</div>", unsafe_allow_html=True)
|
| 135 |
-
col_up, col_s = st.columns([3, 1])
|
| 136 |
-
with col_up:
|
| 137 |
-
uploaded = st.file_uploader(
|
| 138 |
-
"Upload Scopus CSV (must have 'title' + 'abstract')", type=["csv"])
|
| 139 |
-
with col_s:
|
| 140 |
-
st.markdown("<br>", unsafe_allow_html=True)
|
| 141 |
-
use_sample = st.checkbox("Use sample dataset (50 papers)")
|
| 142 |
-
|
| 143 |
-
if uploaded and not use_sample:
|
| 144 |
-
dfp = pd.read_csv(uploaded); uploaded.seek(0)
|
| 145 |
-
c1, c2, c3 = st.columns(3)
|
| 146 |
-
c1.metric("Papers", len(dfp))
|
| 147 |
-
c2.metric("Columns", len(dfp.columns))
|
| 148 |
-
ok = {"title","abstract"}.issubset(set(dfp.columns.str.lower()))
|
| 149 |
-
c3.metric("Title+Abstract", "✓" if ok else "✗")
|
| 150 |
-
if not ok:
|
| 151 |
-
st.error("CSV must have 'title' and 'abstract' columns.")
|
| 152 |
-
|
| 153 |
-
# ── Run ──────────────────────────────────────────────────────────────────────
|
| 154 |
-
st.markdown("<br>", unsafe_allow_html=True)
|
| 155 |
-
run_btn = st.button("▶ Run Full Pipeline", type="primary")
|
| 156 |
-
|
| 157 |
-
if run_btn:
|
| 158 |
-
missing = []
|
| 159 |
-
if not groq_key: missing.append("Groq")
|
| 160 |
-
if not mistral_key: missing.append("Mistral")
|
| 161 |
-
if not gemini_key: missing.append("Gemini")
|
| 162 |
-
if missing:
|
| 163 |
-
st.error(f"Missing key(s): {', '.join(missing)}")
|
| 164 |
-
st.stop()
|
| 165 |
-
if not use_sample and not uploaded:
|
| 166 |
-
st.error("Upload a CSV or enable sample dataset.")
|
| 167 |
-
st.stop()
|
| 168 |
-
|
| 169 |
-
# Prepare CSV
|
| 170 |
-
if use_sample:
|
| 171 |
-
rng = np.random.default_rng(42)
|
| 172 |
-
pool = [
|
| 173 |
-
("Deep Learning for Healthcare Prediction",
|
| 174 |
-
"We apply LSTM networks to predict patient readmission from EHR data."),
|
| 175 |
-
("Process Mining in Enterprise Systems",
|
| 176 |
-
"Event log analysis using Petri nets for conformance checking in ERP workflows."),
|
| 177 |
-
("Recommender Systems Collaborative Filtering",
|
| 178 |
-
"Matrix factorization techniques applied to e-commerce product recommendation."),
|
| 179 |
-
("LLM Applications in Information Systems",
|
| 180 |
-
"GPT-4 used for automated requirements extraction from stakeholder documents."),
|
| 181 |
-
("Blockchain Smart Contract Security",
|
| 182 |
-
"Formal verification of Solidity smart contracts for financial transaction safety."),
|
| 183 |
-
("Federated Learning Privacy Preservation",
|
| 184 |
-
"Differential privacy mechanisms for distributed model training across hospitals."),
|
| 185 |
-
("Cybersecurity Intrusion Detection",
|
| 186 |
-
"Random forest classifiers for network anomaly detection in enterprise environments."),
|
| 187 |
-
("NLP Sentiment Analysis",
|
| 188 |
-
"BERT fine-tuning for aspect-level sentiment analysis in product reviews."),
|
| 189 |
-
("Knowledge Graph Embedding",
|
| 190 |
-
"TransE and RotatE models for biomedical entity relation prediction."),
|
| 191 |
-
("Computer Vision Medical Imaging",
|
| 192 |
-
"CNN architectures for diabetic retinopathy grading from fundus photographs."),
|
| 193 |
-
]
|
| 194 |
-
rows = [{"title": t, "abstract": a + f" Study {i+1}.",
|
| 195 |
-
"doi": f"10.1145/sample.{i+1}"}
|
| 196 |
-
for i, (t, a) in enumerate(pool * 5)]
|
| 197 |
-
dfs = pd.DataFrame(rows)
|
| 198 |
-
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
|
| 199 |
-
dfs.to_csv(tmp.name, index=False); csv_path = tmp.name
|
| 200 |
-
else:
|
| 201 |
-
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
|
| 202 |
-
tmp.write(uploaded.read()); tmp.flush(); csv_path = tmp.name
|
| 203 |
-
|
| 204 |
-
# Step 1 — Topic modelling + Bayesian optimisation
|
| 205 |
-
pbar = st.progress(0, text="Step 1/2 — SPECTER-2 embed + Bayesian UMAP/HDBSCAN…")
|
| 206 |
-
def _progress(cur, total, entry):
|
| 207 |
-
pct = int(cur / total * 45)
|
| 208 |
-
txt = (f"Trial {cur}/{total} — "
|
| 209 |
-
f"{'PASS' if entry['discipline_pass'] else 'FAIL'} — "
|
| 210 |
-
f"{entry['n_clusters']} clusters")
|
| 211 |
-
pbar.progress(min(pct, 49), text=txt)
|
| 212 |
try:
|
| 213 |
-
|
| 214 |
-
progress_callback=_progress)
|
| 215 |
-
nc = topic_data["discipline"]["n_clusters"]
|
| 216 |
-
pbar.progress(50, text=f"Step 1 done — {nc} clusters, "
|
| 217 |
-
f"{topic_data['n_trials_run']} trials.")
|
| 218 |
except Exception as e:
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
<div class="stat-label">Clusters</div></div>
|
| 257 |
-
<div class="stat-card"><div class="stat-val">{total_papers}</div>
|
| 258 |
-
<div class="stat-label">Papers assigned</div></div>
|
| 259 |
-
<div class="stat-card"><div class="stat-val">{strong_pct}%</div>
|
| 260 |
-
<div class="stat-label">Strong members</div></div>
|
| 261 |
-
<div class="stat-card"><div class="stat-val">{round(met['persistence'],3)}</div>
|
| 262 |
-
<div class="stat-label">Persistence</div></div>
|
| 263 |
-
<div class="stat-card"><div class="stat-val">{round(met['dbcv'],3)}</div>
|
| 264 |
-
<div class="stat-label">DBCV</div></div>
|
| 265 |
-
</div>
|
| 266 |
-
""", unsafe_allow_html=True)
|
| 267 |
-
|
| 268 |
-
# ── Discipline + metrics panel ───────────────────────────────────────
|
| 269 |
-
st.markdown("<div class='section-title'>Discipline & Quality</div>",
|
| 270 |
-
unsafe_allow_html=True)
|
| 271 |
-
st.markdown(f"""
|
| 272 |
-
<div class="val-box">
|
| 273 |
-
<h4>§3.2 Hard Constraints + §3.4 Quality Criteria</h4>
|
| 274 |
-
<div class="val-row"><span class="val-key">Max cluster mass ≤ 25%</span>
|
| 275 |
-
<span class="val-num">{'✅ PASS' if disc['max_mass_ok'] else '❌ FAIL'}
|
| 276 |
-
({round(disc['max_mass_pct']*100,1)}%)</span></div>
|
| 277 |
-
<div class="val-row"><span class="val-key">Min cluster size ≥ 5</span>
|
| 278 |
-
<span class="val-num">{'✅ PASS' if disc['min_size_ok'] else '❌ FAIL'}
|
| 279 |
-
(min={disc['min_size']})</span></div>
|
| 280 |
-
<div class="val-row"><span class="val-key">HDBSCAN Persistence</span>
|
| 281 |
-
<span class="val-num">{round(met['persistence'],4)}</span></div>
|
| 282 |
-
<div class="val-row"><span class="val-key">DBCV</span>
|
| 283 |
-
<span class="val-num">{round(met['dbcv'],4)}</span></div>
|
| 284 |
-
<div class="val-row"><span class="val-key">Stability (ARI, 5 seeds)</span>
|
| 285 |
-
<span class="val-num">{round(met['stability'],4)}</span></div>
|
| 286 |
-
<div class="val-row"><span class="val-key">Bayesian trials run</span>
|
| 287 |
-
<span class="val-num">{td['n_trials_run']} (best = #{td['best_trial']})</span></div>
|
| 288 |
-
<div class="val-row"><span class="val-key">Noise papers (−1)</span>
|
| 289 |
-
<span class="val-num">{disc['n_noise']}</span></div>
|
| 290 |
-
</div>
|
| 291 |
-
""", unsafe_allow_html=True)
|
| 292 |
-
|
| 293 |
-
# ── Best params ──────────────────────────────────────────────────────
|
| 294 |
-
with st.expander("Winning UMAP + HDBSCAN parameters", expanded=False):
|
| 295 |
-
bp = td["best_params"]
|
| 296 |
-
pdf = pd.DataFrame([
|
| 297 |
-
{"Parameter": "UMAP.n_neighbors", "Value": bp["n_neighbors"]},
|
| 298 |
-
{"Parameter": "UMAP.n_components", "Value": bp["n_components"]},
|
| 299 |
-
{"Parameter": "UMAP.min_dist", "Value": 0.0},
|
| 300 |
-
{"Parameter": "UMAP.metric", "Value": "cosine"},
|
| 301 |
-
{"Parameter": "HDBSCAN.min_cluster_size",
|
| 302 |
-
"Value": bp["min_cluster_size"]},
|
| 303 |
-
{"Parameter": "HDBSCAN.min_samples", "Value": bp["min_samples"]},
|
| 304 |
-
{"Parameter": "HDBSCAN.cluster_selection_method",
|
| 305 |
-
"Value": bp["csm"]},
|
| 306 |
-
{"Parameter": "HDBSCAN.cluster_selection_epsilon",
|
| 307 |
-
"Value": bp["cse"]},
|
| 308 |
-
])
|
| 309 |
-
st.dataframe(pdf, use_container_width=True, hide_index=True)
|
| 310 |
-
|
| 311 |
-
# ── 2-D UMAP scatter ────────────────────────────────────────────────
|
| 312 |
-
st.markdown("<div class='section-title'>2-D UMAP Visualisation</div>",
|
| 313 |
-
unsafe_allow_html=True)
|
| 314 |
-
umap2d = np.array(td["umap_2d"])
|
| 315 |
-
labels_arr = np.array(td["labels"])
|
| 316 |
-
scatter_df = pd.DataFrame({
|
| 317 |
-
"UMAP-1": umap2d[:, 0], "UMAP-2": umap2d[:, 1],
|
| 318 |
-
"Cluster": [str(l) for l in labels_arr],
|
| 319 |
-
"Doc": [d[:80]+"…" for d in td["documents"]],
|
| 320 |
-
})
|
| 321 |
-
fig = px.scatter(scatter_df, x="UMAP-1", y="UMAP-2", color="Cluster",
|
| 322 |
-
hover_data=["Doc"], opacity=0.75,
|
| 323 |
-
title="SPECTER-2 embeddings (2-D UMAP, min_dist=0.1)")
|
| 324 |
-
fig.update_layout(
|
| 325 |
-
template="plotly_dark",
|
| 326 |
-
paper_bgcolor="#0d0f14", plot_bgcolor="#13161e",
|
| 327 |
-
font=dict(family="IBM Plex Mono", size=11),
|
| 328 |
-
height=520,
|
| 329 |
-
)
|
| 330 |
-
st.plotly_chart(fig, use_container_width=True)
|
| 331 |
-
|
| 332 |
-
# ── Pareto front ─────────────────────────────────────────────────────
|
| 333 |
-
with st.expander("Bayesian trial log & Pareto front", expanded=False):
|
| 334 |
-
tl = td["trial_log"]
|
| 335 |
-
tl_df = pd.DataFrame(tl)
|
| 336 |
-
if not tl_df.empty:
|
| 337 |
-
tl_df["colour"] = tl_df["discipline_pass"].map(
|
| 338 |
-
{True: "PASS", False: "FAIL"})
|
| 339 |
-
fig2 = px.scatter(
|
| 340 |
-
tl_df, x="persistence", y="dbcv", color="colour",
|
| 341 |
-
hover_data=["trial", "n_clusters", "max_mass_pct"],
|
| 342 |
-
color_discrete_map={"PASS": "#3dba7a", "FAIL": "#e04d4d"},
|
| 343 |
-
title="Pareto front — Persistence vs DBCV",
|
| 344 |
-
)
|
| 345 |
-
fig2.add_vline(x=0, line_dash="dash", line_color="#5a6480")
|
| 346 |
-
fig2.update_layout(
|
| 347 |
-
template="plotly_dark",
|
| 348 |
-
paper_bgcolor="#0d0f14", plot_bgcolor="#13161e",
|
| 349 |
-
font=dict(family="IBM Plex Mono", size=11), height=400)
|
| 350 |
-
st.plotly_chart(fig2, use_container_width=True)
|
| 351 |
-
st.dataframe(tl_df[["trial", "discipline_pass", "n_clusters",
|
| 352 |
-
"persistence", "dbcv", "max_mass_pct",
|
| 353 |
-
"min_size", "n_noise"]],
|
| 354 |
-
use_container_width=True, height=300)
|
| 355 |
-
|
| 356 |
-
# ── Cluster table (strong / weak) ────────────────────────────────────
|
| 357 |
-
st.markdown("<div class='section-title'>Cluster Results</div>",
|
| 358 |
-
unsafe_allow_html=True)
|
| 359 |
rows = []
|
| 360 |
for cid in sorted(interps.keys()):
|
| 361 |
-
|
| 362 |
-
rows.append({
|
| 363 |
-
"
|
| 364 |
-
"
|
| 365 |
-
"
|
| 366 |
-
"
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
#
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
<span class="pill pill-gray">{row['PAJAIS']}</span>
|
| 389 |
-
<span class="pill pill-blue">{row['Strong']}S / {row['Weak']}W</span>
|
| 390 |
-
<span class="pill pill-gray">Ground: {row['Grounding']}</span>
|
| 391 |
-
</div>
|
| 392 |
-
<div class="topic-kw">{row['Keyphrases']}</div>
|
| 393 |
-
</div>""", unsafe_allow_html=True)
|
| 394 |
-
|
| 395 |
-
# ── LLM Council Sheets ───────────────────────────────────────────────
|
| 396 |
-
with st.expander("LLM Council — Sheets 1-4", expanded=False):
|
| 397 |
-
sheet_rows = []
|
| 398 |
-
for cid in sorted(interps.keys()):
|
| 399 |
-
i = interps[cid]
|
| 400 |
-
for sn, sheet in [("Sheet 1 (Groq)", i.sheet1),
|
| 401 |
-
("Sheet 2 (Mistral)", i.sheet2),
|
| 402 |
-
("Sheet 3 (Gemini)", i.sheet3)]:
|
| 403 |
-
sheet_rows.append({
|
| 404 |
-
"Cluster": cid, "Sheet": sn,
|
| 405 |
-
"Label": sheet.get("label", "—"),
|
| 406 |
-
"PAJAIS": sheet.get("pacis_match", "—"),
|
| 407 |
-
"Conf": sheet.get("confidence", "—"),
|
| 408 |
-
})
|
| 409 |
-
sheet_rows.append({
|
| 410 |
-
"Cluster": cid, "Sheet": "Sheet 4 (Final)",
|
| 411 |
-
"Label": i.final_label,
|
| 412 |
-
"PAJAIS": i.final_pacis_match,
|
| 413 |
-
"Conf": i.final_confidence,
|
| 414 |
-
})
|
| 415 |
-
st.dataframe(pd.DataFrame(sheet_rows), use_container_width=True,
|
| 416 |
-
height=400)
|
| 417 |
-
|
| 418 |
-
# ── Downloads ────────────────────────────────────────────────────────
|
| 419 |
-
st.markdown("<div class='section-title'>Downloads</div>",
|
| 420 |
-
unsafe_allow_html=True)
|
| 421 |
-
c1, c2, c3, c4 = st.columns(4)
|
| 422 |
-
with c1:
|
| 423 |
-
try:
|
| 424 |
-
with open(ao["json_path"]) as f:
|
| 425 |
-
st.download_button("⬇ topics.json", f.read(),
|
| 426 |
-
"topics.json", "application/json",
|
| 427 |
-
use_container_width=True)
|
| 428 |
-
except Exception:
|
| 429 |
-
st.warning("JSON not found.")
|
| 430 |
-
with c2:
|
| 431 |
-
st.download_button("⬇ results.csv",
|
| 432 |
-
df_res.to_csv(index=False),
|
| 433 |
-
"results.csv", "text/csv",
|
| 434 |
-
use_container_width=True)
|
| 435 |
-
with c3:
|
| 436 |
-
tl_csv = pd.DataFrame(td["trial_log"]).to_csv(index=False)
|
| 437 |
-
st.download_button("⬇ trial_log.csv", tl_csv,
|
| 438 |
-
"trial_log.csv", "text/csv",
|
| 439 |
-
use_container_width=True)
|
| 440 |
-
with c4:
|
| 441 |
-
bp_json = json.dumps(td["best_params"], indent=2)
|
| 442 |
-
st.download_button("⬇ best_params.json", bp_json,
|
| 443 |
-
"best_params.json", "application/json",
|
| 444 |
-
use_container_width=True)
|
| 445 |
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""app.py — Gradio UI entry point (<200 lines, §11)."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import os, json, tempfile
|
| 3 |
+
import pandas as pd, numpy as np
|
| 4 |
+
import gradio as gr
|
|
|
|
| 5 |
import plotly.express as px
|
| 6 |
+
from agent import run_pipeline
|
| 7 |
+
|
| 8 |
+
def _run(file, groq_key, mistral_key, gemini_key, n_trials):
|
| 9 |
+
if not file: return ("Upload a CSV first.",)+(None,)*7
|
| 10 |
+
gk = groq_key.strip() or os.getenv("GROQ_API_KEY","")
|
| 11 |
+
mk = mistral_key.strip() or os.getenv("MISTRAL_API_KEY","")
|
| 12 |
+
gek = gemini_key.strip() or os.getenv("GEMINI_API_KEY","")
|
| 13 |
+
if not all([gk,mk,gek]):
|
| 14 |
+
return ("All 3 API keys required.",)+(None,)*7
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
try:
|
| 16 |
+
r = run_pipeline(file.name, gk, mk, gek, int(n_trials))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
except Exception as e:
|
| 18 |
+
return (f"Pipeline error: {e}",)+(None,)*7
|
| 19 |
+
if r.get("error"):
|
| 20 |
+
return (f"Error: {r['error']}",)+(None,)*7
|
| 21 |
+
td, interps = r["topic_data"], r.get("interpretations",{})
|
| 22 |
+
disc, met = td["discipline"], td["metrics"]
|
| 23 |
+
sw_total = sum(v["strong"] for v in interps.values())
|
| 24 |
+
wk_total = sum(v["weak"] for v in interps.values())
|
| 25 |
+
ar = r.get("agreement_rates",{})
|
| 26 |
+
summary = (f"**Clusters:** {disc['n_clusters']} | "
|
| 27 |
+
f"**Strong:** {sw_total} ({round(sw_total/max(sw_total+wk_total,1)*100)}%) | "
|
| 28 |
+
f"**Weak:** {wk_total} | **Noise:** {disc['n_noise']}\n\n"
|
| 29 |
+
f"**Max mass:** {round(disc['max_mass_pct']*100,1)}% "
|
| 30 |
+
f"({'✅' if disc['max_mass_ok'] else '❌'}) | "
|
| 31 |
+
f"**Min size:** {disc['min_size']} "
|
| 32 |
+
f"({'✅' if disc['min_size_ok'] else '❌'})\n\n"
|
| 33 |
+
f"**Persistence:** {round(met['persistence'],4)} | "
|
| 34 |
+
f"**DBCV:** {round(met['dbcv'],4)} | "
|
| 35 |
+
f"**Stability:** {round(met['stability'],4)}\n\n"
|
| 36 |
+
f"**Trials:** {td['n_trials_run']} (best #{td['best_trial']})\n\n"
|
| 37 |
+
f"**Agreement:** Triple={ar.get('triple',0)}% "
|
| 38 |
+
f"Two+={ar.get('two_or_more',0)}% "
|
| 39 |
+
f"Single={ar.get('single',0)}%")
|
| 40 |
+
# UMAP scatter
|
| 41 |
+
u2d = np.array(td["umap_2d"])
|
| 42 |
+
sdf = pd.DataFrame({"UMAP-1":u2d[:,0],"UMAP-2":u2d[:,1],
|
| 43 |
+
"Cluster":[str(l) for l in td["labels"]],
|
| 44 |
+
"Doc":[d[:60] for d in td["documents"]]})
|
| 45 |
+
fig = px.scatter(sdf, x="UMAP-1", y="UMAP-2", color="Cluster",
|
| 46 |
+
hover_data=["Doc"], opacity=0.7, title="2-D UMAP (SPECTER-2)")
|
| 47 |
+
fig.update_layout(template="plotly_dark", height=480,
|
| 48 |
+
paper_bgcolor="#0d0f14", plot_bgcolor="#13161e")
|
| 49 |
+
# Trial log
|
| 50 |
+
tl = pd.DataFrame(td["trial_log"])
|
| 51 |
+
tl_cols = ["trial","discipline_pass","n_clusters","persistence",
|
| 52 |
+
"dbcv","max_mass_pct","min_size","n_noise"]
|
| 53 |
+
tl_show = tl[[c for c in tl_cols if c in tl.columns]] if not tl.empty else pd.DataFrame()
|
| 54 |
+
# Cluster table with strong/weak/persistence
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
rows = []
|
| 56 |
for cid in sorted(interps.keys()):
|
| 57 |
+
v = interps[cid]
|
| 58 |
+
rows.append({"Cluster":cid,"Label":v["label"],"Agreement":v["agreement"],
|
| 59 |
+
"Description":v.get("description",""),
|
| 60 |
+
"PAJAIS":v.get("pacis_match",""),"Strong":v["strong"],"Weak":v["weak"],
|
| 61 |
+
"Persistence":round(v.get("persistence",0),4),
|
| 62 |
+
"Keyphrases":", ".join(v.get("keyphrases",[]))})
|
| 63 |
+
cdf = pd.DataFrame(rows)
|
| 64 |
+
# TCCM sheets
|
| 65 |
+
sheets = r.get("sheets",{})
|
| 66 |
+
s_rows = []
|
| 67 |
+
for sn, label in [(1,"Groq"),(2,"Mistral"),(3,"Gemini"),(4,"Consolidated")]:
|
| 68 |
+
for row in sheets.get(sn,[]):
|
| 69 |
+
s_rows.append({"Sheet":f"{sn}-{label}", **row})
|
| 70 |
+
sdf2 = pd.DataFrame(s_rows) if s_rows else pd.DataFrame()
|
| 71 |
+
# Mismatch
|
| 72 |
+
mt = r.get("mismatch_table",[])
|
| 73 |
+
mdf = pd.DataFrame(mt) if mt else pd.DataFrame()
|
| 74 |
+
# Downloads
|
| 75 |
+
jp = r.get("json_path","topics.json")
|
| 76 |
+
cp = r.get("csv_path","topics.csv")
|
| 77 |
+
return summary, fig, tl_show, cdf, sdf2, mdf, jp, cp
|
| 78 |
+
|
| 79 |
+
# Page-level CSS: dark background, blue primary button, hidden footer.
css = (
    "\n"
    ".gradio-container{background:#0d0f14!important;color:#e8eaf0!important}\n"
    ".gr-button-primary{background:#4d9de0!important}\n"
    "footer{display:none!important}\n"
)

_theme = gr.themes.Base(primary_hue="blue", neutral_hue="slate")

with gr.Blocks(theme=_theme, css=css, title="SPECTER-2 Topic Analyzer") as demo:
    gr.Markdown("# 📐 SPECTER-2 Topic Analyzer\n"
                "SPECTER-2 → Bayesian UMAP+HDBSCAN → 3-LLM Council")
    with gr.Row():
        # Narrow left column: upload, API keys, trial count, run button.
        with gr.Column(scale=1):
            file_in = gr.File(label="Upload Scopus CSV", file_types=[".csv"])
            groq_in = gr.Textbox(label="Groq API Key", type="password")
            mistral_in = gr.Textbox(label="Mistral API Key", type="password")
            gemini_in = gr.Textbox(label="Gemini API Key", type="password")
            trials_in = gr.Slider(minimum=10, maximum=100, value=50, step=5,
                                  label="Optuna Trials (§3.4)")
            run_btn = gr.Button("▶ Run Full Pipeline", variant="primary")
            gr.Markdown("**Hard rules:** max mass ≤25%, min size ≥5\n\n"
                        "**LLM Council:** Groq · Mistral · Gemini\n\n"
                        "**4 Sheets:** 3 independent LLMs + 1 consolidated")
        # Wide right column: one tab per result produced by _run.
        with gr.Column(scale=3):
            with gr.Tabs():
                with gr.Tab("Summary"):
                    summary_out = gr.Markdown()
                with gr.Tab("2-D UMAP"):
                    scatter_out = gr.Plot()
                with gr.Tab("Trial Log"):
                    trial_out = gr.Dataframe(label="Bayesian Trials (≥50)")
                with gr.Tab("Clusters"):
                    cluster_out = gr.Dataframe(
                        label="Strong/Weak + Persistence per cluster")
                with gr.Tab("LLM Sheets 1-4"):
                    sheet_out = gr.Dataframe(label="4 Sheets: 3 LLMs + Consolidated")
                with gr.Tab("RQ Mismatch"):
                    mismatch_out = gr.Dataframe(label="RQ2/RQ3 Mismatch Table")
                with gr.Tab("Downloads"):
                    json_out = gr.File(label="topics.json")
                    csv_out = gr.File(label="topics.csv")

    # Input order follows _run's parameters; output order follows its return tuple.
    _inputs = [file_in, groq_in, mistral_in, gemini_in, trials_in]
    _outputs = [summary_out, scatter_out, trial_out, cluster_out,
                sheet_out, mismatch_out, json_out, csv_out]
    run_btn.click(fn=_run, inputs=_inputs, outputs=_outputs)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
|