anujjuna committed on
Commit
ee50027
·
verified ·
1 Parent(s): 0f5b08b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +117 -449
app.py CHANGED
@@ -1,456 +1,124 @@
1
- """
2
- app.py
3
- ------
4
- Streamlit UI — SPECTER-2 + UMAP + HDBSCAN Bayesian Pipeline
5
- with 2-D UMAP scatter, Pareto front, strong/weak members,
6
- trial log, and LLM Council Sheets 1-4.
7
- """
8
-
9
  import os, json, tempfile
10
- import pandas as pd
11
- import numpy as np
12
- import streamlit as st
13
  import plotly.express as px
14
- import plotly.graph_objects as go
15
-
16
- from tools import run_topic_modeling
17
- from agent import run_agent
18
-
19
- # ── Page ─────────────────────────────────────────────────────────────────────
20
- st.set_page_config(page_title="SPECTER-2 Topic Analyzer", page_icon="📐",
21
- layout="wide", initial_sidebar_state="expanded")
22
-
23
- # ── CSS ──────────────────────────────────────────────────────────────────────
24
- st.markdown("""
25
- <style>
26
- @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;600&family=IBM+Plex+Sans:wght@300;400;500;600&display=swap');
27
- html, body, [class*="css"] { font-family:'IBM Plex Sans',sans-serif; }
28
- .stApp { background:#0d0f14; color:#e8eaf0; }
29
- [data-testid="stSidebar"] { background:#13161e; border-right:1px solid #1f2333; }
30
- [data-testid="stSidebar"] * { color:#b0b8cc !important; }
31
- [data-testid="stSidebar"] h1,[data-testid="stSidebar"] h2,[data-testid="stSidebar"] h3 {
32
- color:#e8eaf0!important; font-family:'IBM Plex Mono',monospace!important;
33
- font-size:.8rem!important; letter-spacing:.12em!important; text-transform:uppercase!important; }
34
- .site-header { padding:2.5rem 0 1.5rem; border-bottom:1px solid #1f2333; margin-bottom:2rem; }
35
- .site-header h1 { font-family:'IBM Plex Mono',monospace; font-size:1.6rem; font-weight:600;
36
- color:#e8eaf0; letter-spacing:-.01em; margin:0 0 .3rem; }
37
- .site-header p { font-size:.82rem; color:#5a6480; font-family:'IBM Plex Mono',monospace; margin:0; }
38
- .pill { display:inline-block; font-family:'IBM Plex Mono',monospace; font-size:.68rem;
39
- font-weight:600; letter-spacing:.08em; text-transform:uppercase; padding:3px 10px;
40
- border-radius:2px; margin-right:6px; }
41
- .pill-blue { background:#0f2a4a; color:#4d9de0; border:1px solid #1a4070; }
42
- .pill-green { background:#0a2a1a; color:#3dba7a; border:1px solid #1a4a2a; }
43
- .pill-amber { background:#2a1f00; color:#e8a020; border:1px solid #4a3500; }
44
- .pill-red { background:#2a0f0f; color:#e04d4d; border:1px solid #4a1a1a; }
45
- .pill-gray { background:#1a1e2a; color:#7a8090; border:1px solid #2a2e3a; }
46
- .stat-grid { display:grid; grid-template-columns:repeat(5,1fr); gap:1px;
47
- background:#1f2333; border:1px solid #1f2333; border-radius:6px; overflow:hidden; margin-bottom:2rem; }
48
- .stat-card { background:#13161e; padding:1.25rem 1.5rem; text-align:center; }
49
- .stat-val { font-family:'IBM Plex Mono',monospace; font-size:1.9rem; font-weight:600;
50
- color:#e8eaf0; line-height:1; margin-bottom:.3rem; }
51
- .stat-label { font-size:.7rem; color:#5a6480; text-transform:uppercase; letter-spacing:.1em;
52
- font-family:'IBM Plex Mono',monospace; }
53
- .section-title { font-family:'IBM Plex Mono',monospace; font-size:.7rem; font-weight:600;
54
- letter-spacing:.15em; text-transform:uppercase; color:#5a6480;
55
- padding-bottom:.6rem; border-bottom:1px solid #1f2333; margin-bottom:1.2rem; }
56
- .topic-card { background:#13161e; border:1px solid #1f2333; border-left:3px solid #4d9de0;
57
- border-radius:4px; padding:1rem 1.25rem; margin-bottom:.6rem; transition:border-color .15s; }
58
- .topic-card:hover { border-left-color:#3dba7a; }
59
- .topic-label { font-size:.92rem; font-weight:500; color:#e8eaf0; margin-bottom:.35rem; }
60
- .topic-meta { font-family:'IBM Plex Mono',monospace; font-size:.7rem; color:#5a6480; }
61
- .topic-kw { font-family:'IBM Plex Mono',monospace; font-size:.68rem; color:#3d6480;
62
- margin-top:.4rem; line-height:1.5; }
63
- .val-box { background:#0a2a1a; border:1px solid #1a4a2a; border-radius:6px;
64
- padding:1.25rem 1.5rem; margin-bottom:1.5rem; }
65
- .val-box h4 { font-family:'IBM Plex Mono',monospace; font-size:.72rem; font-weight:600;
66
- letter-spacing:.1em; text-transform:uppercase; color:#3dba7a; margin:0 0 .75rem; }
67
- .val-row { display:flex; justify-content:space-between; align-items:center;
68
- padding:.4rem 0; border-bottom:1px solid #1a3a2a; font-size:.8rem; color:#a0b8a8; }
69
- .val-row:last-child { border-bottom:none; }
70
- .val-key { color:#5a7a6a; } .val-num { font-family:'IBM Plex Mono',monospace; color:#3dba7a; font-weight:600; }
71
- .stButton > button { background:#4d9de0!important; color:#0d0f14!important; border:none!important;
72
- border-radius:3px!important; font-family:'IBM Plex Mono',monospace!important;
73
- font-size:.78rem!important; font-weight:600!important; letter-spacing:.08em!important;
74
- text-transform:uppercase!important; padding:.6rem 2rem!important; }
75
- .stButton > button:hover { background:#3d8ed0!important; }
76
- .stDownloadButton > button { background:transparent!important; color:#4d9de0!important;
77
- border:1px solid #1a4070!important; border-radius:3px!important;
78
- font-family:'IBM Plex Mono',monospace!important; font-size:.72rem!important; }
79
- </style>
80
- """, unsafe_allow_html=True)
81
-
82
- # ── Header ───────────────────────────────────────────────────────────────────
83
- st.markdown("""
84
- <div class="site-header">
85
- <h1>SPECTER-2 Topic Analyzer</h1>
86
- <p>SPECTER-2 embeddings &nbsp;·&nbsp; Bayesian UMAP+HDBSCAN &nbsp;·&nbsp;
87
- 3-LLM Council (Groq + Mistral + Gemini)</p>
88
- </div>
89
- """, unsafe_allow_html=True)
90
-
91
- # ── Sidebar ──────────────────────────────────────────────────────────────────
92
- with st.sidebar:
93
- st.markdown("### API Keys")
94
- groq_key_in = st.text_input("Groq API Key", type="password")
95
- mistral_key_in = st.text_input("Mistral API Key", type="password")
96
- gemini_key_in = st.text_input("Gemini API Key", type="password")
97
- st.caption("Keys are never stored. Leave blank to use env vars.")
98
-
99
- st.markdown("---")
100
- st.markdown("### Bayesian Optimisation")
101
- n_trials = st.slider("Optuna trials", 20, 100, 50,
102
- help="§3.4: 50–100 trials recommended")
103
- st.markdown(
104
- "<span class='pill pill-blue'>Max mass ≤ 25%</span>"
105
- "<span class='pill pill-blue'>Min size ≥ 5</span>",
106
- unsafe_allow_html=True)
107
-
108
- st.markdown("---")
109
- st.markdown("### LLM Council")
110
- st.markdown("""
111
- <div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:1rem;">
112
- <span class="pill pill-blue">Groq / LLaMA-3.1</span>
113
- <span class="pill pill-green">Mistral Small</span>
114
- <span class="pill pill-amber">Gemini 2.5 Flash</span>
115
- </div>
116
- <p style="font-size:.72rem;color:#5a6480;font-family:'IBM Plex Mono',monospace;">
117
- Sheet 1–3 per LLM · Sheet 4 consolidation<br>
118
- Triple / Two / Single agreement tags<br>
119
- Defence prompt for disagreement clusters
120
- </p>
121
- """, unsafe_allow_html=True)
122
-
123
- st.markdown("---")
124
- if st.button("Reset Results", use_container_width=True):
125
- for k in ["results", "agent_out", "topic_data"]:
126
- st.session_state.pop(k, None)
127
- st.rerun()
128
-
129
- groq_key = groq_key_in.strip() or os.getenv("GROQ_API_KEY")
130
- mistral_key = mistral_key_in.strip() or os.getenv("MISTRAL_API_KEY")
131
- gemini_key = gemini_key_in.strip() or os.getenv("GEMINI_API_KEY")
132
-
133
- # ── Upload ───────────────────────────────────────────────────────────────────
134
- st.markdown("<div class='section-title'>Dataset</div>", unsafe_allow_html=True)
135
- col_up, col_s = st.columns([3, 1])
136
- with col_up:
137
- uploaded = st.file_uploader(
138
- "Upload Scopus CSV (must have 'title' + 'abstract')", type=["csv"])
139
- with col_s:
140
- st.markdown("<br>", unsafe_allow_html=True)
141
- use_sample = st.checkbox("Use sample dataset (50 papers)")
142
-
143
- if uploaded and not use_sample:
144
- dfp = pd.read_csv(uploaded); uploaded.seek(0)
145
- c1, c2, c3 = st.columns(3)
146
- c1.metric("Papers", len(dfp))
147
- c2.metric("Columns", len(dfp.columns))
148
- ok = {"title","abstract"}.issubset(set(dfp.columns.str.lower()))
149
- c3.metric("Title+Abstract", "✓" if ok else "✗")
150
- if not ok:
151
- st.error("CSV must have 'title' and 'abstract' columns.")
152
-
153
- # ── Run ──────────────────────────────────────────────────────────────────────
154
- st.markdown("<br>", unsafe_allow_html=True)
155
- run_btn = st.button("▶ Run Full Pipeline", type="primary")
156
-
157
- if run_btn:
158
- missing = []
159
- if not groq_key: missing.append("Groq")
160
- if not mistral_key: missing.append("Mistral")
161
- if not gemini_key: missing.append("Gemini")
162
- if missing:
163
- st.error(f"Missing key(s): {', '.join(missing)}")
164
- st.stop()
165
- if not use_sample and not uploaded:
166
- st.error("Upload a CSV or enable sample dataset.")
167
- st.stop()
168
-
169
- # Prepare CSV
170
- if use_sample:
171
- rng = np.random.default_rng(42)
172
- pool = [
173
- ("Deep Learning for Healthcare Prediction",
174
- "We apply LSTM networks to predict patient readmission from EHR data."),
175
- ("Process Mining in Enterprise Systems",
176
- "Event log analysis using Petri nets for conformance checking in ERP workflows."),
177
- ("Recommender Systems Collaborative Filtering",
178
- "Matrix factorization techniques applied to e-commerce product recommendation."),
179
- ("LLM Applications in Information Systems",
180
- "GPT-4 used for automated requirements extraction from stakeholder documents."),
181
- ("Blockchain Smart Contract Security",
182
- "Formal verification of Solidity smart contracts for financial transaction safety."),
183
- ("Federated Learning Privacy Preservation",
184
- "Differential privacy mechanisms for distributed model training across hospitals."),
185
- ("Cybersecurity Intrusion Detection",
186
- "Random forest classifiers for network anomaly detection in enterprise environments."),
187
- ("NLP Sentiment Analysis",
188
- "BERT fine-tuning for aspect-level sentiment analysis in product reviews."),
189
- ("Knowledge Graph Embedding",
190
- "TransE and RotatE models for biomedical entity relation prediction."),
191
- ("Computer Vision Medical Imaging",
192
- "CNN architectures for diabetic retinopathy grading from fundus photographs."),
193
- ]
194
- rows = [{"title": t, "abstract": a + f" Study {i+1}.",
195
- "doi": f"10.1145/sample.{i+1}"}
196
- for i, (t, a) in enumerate(pool * 5)]
197
- dfs = pd.DataFrame(rows)
198
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
199
- dfs.to_csv(tmp.name, index=False); csv_path = tmp.name
200
- else:
201
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
202
- tmp.write(uploaded.read()); tmp.flush(); csv_path = tmp.name
203
-
204
- # Step 1 — Topic modelling + Bayesian optimisation
205
- pbar = st.progress(0, text="Step 1/2 — SPECTER-2 embed + Bayesian UMAP/HDBSCAN…")
206
- def _progress(cur, total, entry):
207
- pct = int(cur / total * 45)
208
- txt = (f"Trial {cur}/{total} — "
209
- f"{'PASS' if entry['discipline_pass'] else 'FAIL'} — "
210
- f"{entry['n_clusters']} clusters")
211
- pbar.progress(min(pct, 49), text=txt)
212
  try:
213
- topic_data = run_topic_modeling(csv_path, n_trials=n_trials,
214
- progress_callback=_progress)
215
- nc = topic_data["discipline"]["n_clusters"]
216
- pbar.progress(50, text=f"Step 1 done — {nc} clusters, "
217
- f"{topic_data['n_trials_run']} trials.")
218
  except Exception as e:
219
- st.error(f"Topic modelling failed: {e}")
220
- st.stop()
221
-
222
- # Step 2 LLM Council
223
- pbar.progress(55, text="Step 2/2 — 3-LLM Council labelling…")
224
- try:
225
- agent_out = run_agent(topic_data, groq_key, mistral_key, gemini_key)
226
- pbar.progress(100, text="Pipeline complete.")
227
- st.session_state["topic_data"] = topic_data
228
- st.session_state["agent_out"] = agent_out
229
- st.success(f"Done {len(agent_out['interpretations'])} clusters labelled.")
230
- except Exception as e:
231
- st.error(f"LLM Council failed: {e}")
232
- st.stop()
233
-
234
-
235
- # ── Results ──────────────────────────────────────────────────────────────────
236
- td = st.session_state.get("topic_data")
237
- ao = st.session_state.get("agent_out")
238
-
239
- if td and ao:
240
- interps = ao["interpretations"]
241
- disc = td["discipline"]
242
- met = td["metrics"]
243
-
244
- # ── Summary stats ────────────────────────────────────────────────────
245
- st.markdown("<div class='section-title'>Pipeline Summary</div>",
246
- unsafe_allow_html=True)
247
- n_topics = disc["n_clusters"]
248
- strong_total = sum(i.strong_count for i in interps.values())
249
- weak_total = sum(i.weak_count for i in interps.values())
250
- total_papers = strong_total + weak_total
251
- strong_pct = round(strong_total / max(total_papers, 1) * 100)
252
-
253
- st.markdown(f"""
254
- <div class="stat-grid">
255
- <div class="stat-card"><div class="stat-val">{n_topics}</div>
256
- <div class="stat-label">Clusters</div></div>
257
- <div class="stat-card"><div class="stat-val">{total_papers}</div>
258
- <div class="stat-label">Papers assigned</div></div>
259
- <div class="stat-card"><div class="stat-val">{strong_pct}%</div>
260
- <div class="stat-label">Strong members</div></div>
261
- <div class="stat-card"><div class="stat-val">{round(met['persistence'],3)}</div>
262
- <div class="stat-label">Persistence</div></div>
263
- <div class="stat-card"><div class="stat-val">{round(met['dbcv'],3)}</div>
264
- <div class="stat-label">DBCV</div></div>
265
- </div>
266
- """, unsafe_allow_html=True)
267
-
268
- # ── Discipline + metrics panel ───────────────────────────────────────
269
- st.markdown("<div class='section-title'>Discipline & Quality</div>",
270
- unsafe_allow_html=True)
271
- st.markdown(f"""
272
- <div class="val-box">
273
- <h4>§3.2 Hard Constraints + §3.4 Quality Criteria</h4>
274
- <div class="val-row"><span class="val-key">Max cluster mass ≤ 25%</span>
275
- <span class="val-num">{'✅ PASS' if disc['max_mass_ok'] else '❌ FAIL'}
276
- ({round(disc['max_mass_pct']*100,1)}%)</span></div>
277
- <div class="val-row"><span class="val-key">Min cluster size ≥ 5</span>
278
- <span class="val-num">{'✅ PASS' if disc['min_size_ok'] else '❌ FAIL'}
279
- (min={disc['min_size']})</span></div>
280
- <div class="val-row"><span class="val-key">HDBSCAN Persistence</span>
281
- <span class="val-num">{round(met['persistence'],4)}</span></div>
282
- <div class="val-row"><span class="val-key">DBCV</span>
283
- <span class="val-num">{round(met['dbcv'],4)}</span></div>
284
- <div class="val-row"><span class="val-key">Stability (ARI, 5 seeds)</span>
285
- <span class="val-num">{round(met['stability'],4)}</span></div>
286
- <div class="val-row"><span class="val-key">Bayesian trials run</span>
287
- <span class="val-num">{td['n_trials_run']} (best = #{td['best_trial']})</span></div>
288
- <div class="val-row"><span class="val-key">Noise papers (−1)</span>
289
- <span class="val-num">{disc['n_noise']}</span></div>
290
- </div>
291
- """, unsafe_allow_html=True)
292
-
293
- # ── Best params ──────────────────────────────────────────────────────
294
- with st.expander("Winning UMAP + HDBSCAN parameters", expanded=False):
295
- bp = td["best_params"]
296
- pdf = pd.DataFrame([
297
- {"Parameter": "UMAP.n_neighbors", "Value": bp["n_neighbors"]},
298
- {"Parameter": "UMAP.n_components", "Value": bp["n_components"]},
299
- {"Parameter": "UMAP.min_dist", "Value": 0.0},
300
- {"Parameter": "UMAP.metric", "Value": "cosine"},
301
- {"Parameter": "HDBSCAN.min_cluster_size",
302
- "Value": bp["min_cluster_size"]},
303
- {"Parameter": "HDBSCAN.min_samples", "Value": bp["min_samples"]},
304
- {"Parameter": "HDBSCAN.cluster_selection_method",
305
- "Value": bp["csm"]},
306
- {"Parameter": "HDBSCAN.cluster_selection_epsilon",
307
- "Value": bp["cse"]},
308
- ])
309
- st.dataframe(pdf, use_container_width=True, hide_index=True)
310
-
311
- # ── 2-D UMAP scatter ────────────────────────────────────────────────
312
- st.markdown("<div class='section-title'>2-D UMAP Visualisation</div>",
313
- unsafe_allow_html=True)
314
- umap2d = np.array(td["umap_2d"])
315
- labels_arr = np.array(td["labels"])
316
- scatter_df = pd.DataFrame({
317
- "UMAP-1": umap2d[:, 0], "UMAP-2": umap2d[:, 1],
318
- "Cluster": [str(l) for l in labels_arr],
319
- "Doc": [d[:80]+"…" for d in td["documents"]],
320
- })
321
- fig = px.scatter(scatter_df, x="UMAP-1", y="UMAP-2", color="Cluster",
322
- hover_data=["Doc"], opacity=0.75,
323
- title="SPECTER-2 embeddings (2-D UMAP, min_dist=0.1)")
324
- fig.update_layout(
325
- template="plotly_dark",
326
- paper_bgcolor="#0d0f14", plot_bgcolor="#13161e",
327
- font=dict(family="IBM Plex Mono", size=11),
328
- height=520,
329
- )
330
- st.plotly_chart(fig, use_container_width=True)
331
-
332
- # ── Pareto front ─────────────────────────────────────────────────────
333
- with st.expander("Bayesian trial log & Pareto front", expanded=False):
334
- tl = td["trial_log"]
335
- tl_df = pd.DataFrame(tl)
336
- if not tl_df.empty:
337
- tl_df["colour"] = tl_df["discipline_pass"].map(
338
- {True: "PASS", False: "FAIL"})
339
- fig2 = px.scatter(
340
- tl_df, x="persistence", y="dbcv", color="colour",
341
- hover_data=["trial", "n_clusters", "max_mass_pct"],
342
- color_discrete_map={"PASS": "#3dba7a", "FAIL": "#e04d4d"},
343
- title="Pareto front — Persistence vs DBCV",
344
- )
345
- fig2.add_vline(x=0, line_dash="dash", line_color="#5a6480")
346
- fig2.update_layout(
347
- template="plotly_dark",
348
- paper_bgcolor="#0d0f14", plot_bgcolor="#13161e",
349
- font=dict(family="IBM Plex Mono", size=11), height=400)
350
- st.plotly_chart(fig2, use_container_width=True)
351
- st.dataframe(tl_df[["trial", "discipline_pass", "n_clusters",
352
- "persistence", "dbcv", "max_mass_pct",
353
- "min_size", "n_noise"]],
354
- use_container_width=True, height=300)
355
-
356
- # ── Cluster table (strong / weak) ────────────────────────────────────
357
- st.markdown("<div class='section-title'>Cluster Results</div>",
358
- unsafe_allow_html=True)
359
  rows = []
360
  for cid in sorted(interps.keys()):
361
- i = interps[cid]
362
- rows.append({
363
- "Cluster": cid,
364
- "Label": i.final_label,
365
- "Agreement": i.agreement,
366
- "PAJAIS": i.final_pacis_match,
367
- "Strong": i.strong_count,
368
- "Weak": i.weak_count,
369
- "Total": i.paper_count,
370
- "Confidence": round(i.final_confidence, 2),
371
- "Grounding": i.grounding_check.get("verdict", "?"),
372
- "Keyphrases": ", ".join(i.keyphrases[:5]),
373
- })
374
- df_res = pd.DataFrame(rows).sort_values("Total", ascending=False
375
- ).reset_index(drop=True)
376
- st.dataframe(df_res, use_container_width=True, height=420)
377
-
378
- # ── Topic cards ──────────────────────────────────────────────────────
379
- with st.expander("Topic cards (detailed)", expanded=False):
380
- for _, row in df_res.iterrows():
381
- ag_pill = {"Triple": "pill-green", "Two": "pill-blue",
382
- "Single": "pill-amber"}.get(row["Agreement"], "pill-gray")
383
- st.markdown(f"""
384
- <div class="topic-card">
385
- <div class="topic-label">{row['Label']}</div>
386
- <div class="topic-meta">
387
- <span class="pill {ag_pill}">{row['Agreement']}</span>
388
- <span class="pill pill-gray">{row['PAJAIS']}</span>
389
- <span class="pill pill-blue">{row['Strong']}S / {row['Weak']}W</span>
390
- <span class="pill pill-gray">Ground: {row['Grounding']}</span>
391
- </div>
392
- <div class="topic-kw">{row['Keyphrases']}</div>
393
- </div>""", unsafe_allow_html=True)
394
-
395
- # ── LLM Council Sheets ───────────────────────────────────────────────
396
- with st.expander("LLM Council — Sheets 1-4", expanded=False):
397
- sheet_rows = []
398
- for cid in sorted(interps.keys()):
399
- i = interps[cid]
400
- for sn, sheet in [("Sheet 1 (Groq)", i.sheet1),
401
- ("Sheet 2 (Mistral)", i.sheet2),
402
- ("Sheet 3 (Gemini)", i.sheet3)]:
403
- sheet_rows.append({
404
- "Cluster": cid, "Sheet": sn,
405
- "Label": sheet.get("label", "—"),
406
- "PAJAIS": sheet.get("pacis_match", "—"),
407
- "Conf": sheet.get("confidence", "—"),
408
- })
409
- sheet_rows.append({
410
- "Cluster": cid, "Sheet": "Sheet 4 (Final)",
411
- "Label": i.final_label,
412
- "PAJAIS": i.final_pacis_match,
413
- "Conf": i.final_confidence,
414
- })
415
- st.dataframe(pd.DataFrame(sheet_rows), use_container_width=True,
416
- height=400)
417
-
418
- # ── Downloads ────────────────────────────────────────────────────────
419
- st.markdown("<div class='section-title'>Downloads</div>",
420
- unsafe_allow_html=True)
421
- c1, c2, c3, c4 = st.columns(4)
422
- with c1:
423
- try:
424
- with open(ao["json_path"]) as f:
425
- st.download_button("⬇ topics.json", f.read(),
426
- "topics.json", "application/json",
427
- use_container_width=True)
428
- except Exception:
429
- st.warning("JSON not found.")
430
- with c2:
431
- st.download_button("⬇ results.csv",
432
- df_res.to_csv(index=False),
433
- "results.csv", "text/csv",
434
- use_container_width=True)
435
- with c3:
436
- tl_csv = pd.DataFrame(td["trial_log"]).to_csv(index=False)
437
- st.download_button("⬇ trial_log.csv", tl_csv,
438
- "trial_log.csv", "text/csv",
439
- use_container_width=True)
440
- with c4:
441
- bp_json = json.dumps(td["best_params"], indent=2)
442
- st.download_button("⬇ best_params.json", bp_json,
443
- "best_params.json", "application/json",
444
- use_container_width=True)
445
 
446
- elif not td:
447
- st.markdown("""
448
- <div style="text-align:center;padding:4rem 2rem;border:1px dashed #1f2333;border-radius:6px;margin-top:2rem;">
449
- <p style="font-family:'IBM Plex Mono',monospace;font-size:.8rem;color:#3a4060;letter-spacing:.1em;">
450
- UPLOAD CSV → ENTER API KEYS → RUN PIPELINE
451
- </p>
452
- <p style="font-size:.75rem;color:#2a3050;margin-top:.5rem;">
453
- SPECTER-2 Bayesian UMAP+HDBSCAN (50–100 trials) 3-LLM Council
454
- </p>
455
- </div>
456
- """, unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """app.py — Gradio UI entry point (<200 lines, §11)."""
 
 
 
 
 
 
 
2
  import os, json, tempfile
3
+ import pandas as pd, numpy as np
4
+ import gradio as gr
 
5
  import plotly.express as px
6
+ from agent import run_pipeline
7
+
8
+ def _run(file, groq_key, mistral_key, gemini_key, n_trials):
9
+ if not file: return ("Upload a CSV first.",)+(None,)*7
10
+ gk = groq_key.strip() or os.getenv("GROQ_API_KEY","")
11
+ mk = mistral_key.strip() or os.getenv("MISTRAL_API_KEY","")
12
+ gek = gemini_key.strip() or os.getenv("GEMINI_API_KEY","")
13
+ if not all([gk,mk,gek]):
14
+ return ("All 3 API keys required.",)+(None,)*7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  try:
16
+ r = run_pipeline(file.name, gk, mk, gek, int(n_trials))
 
 
 
 
17
  except Exception as e:
18
+ return (f"Pipeline error: {e}",)+(None,)*7
19
+ if r.get("error"):
20
+ return (f"Error: {r['error']}",)+(None,)*7
21
+ td, interps = r["topic_data"], r.get("interpretations",{})
22
+ disc, met = td["discipline"], td["metrics"]
23
+ sw_total = sum(v["strong"] for v in interps.values())
24
+ wk_total = sum(v["weak"] for v in interps.values())
25
+ ar = r.get("agreement_rates",{})
26
+ summary = (f"**Clusters:** {disc['n_clusters']} | "
27
+ f"**Strong:** {sw_total} ({round(sw_total/max(sw_total+wk_total,1)*100)}%) | "
28
+ f"**Weak:** {wk_total} | **Noise:** {disc['n_noise']}\n\n"
29
+ f"**Max mass:** {round(disc['max_mass_pct']*100,1)}% "
30
+ f"({'✅' if disc['max_mass_ok'] else '❌'}) | "
31
+ f"**Min size:** {disc['min_size']} "
32
+ f"({'✅' if disc['min_size_ok'] else '❌'})\n\n"
33
+ f"**Persistence:** {round(met['persistence'],4)} | "
34
+ f"**DBCV:** {round(met['dbcv'],4)} | "
35
+ f"**Stability:** {round(met['stability'],4)}\n\n"
36
+ f"**Trials:** {td['n_trials_run']} (best #{td['best_trial']})\n\n"
37
+ f"**Agreement:** Triple={ar.get('triple',0)}% "
38
+ f"Two+={ar.get('two_or_more',0)}% "
39
+ f"Single={ar.get('single',0)}%")
40
+ # UMAP scatter
41
+ u2d = np.array(td["umap_2d"])
42
+ sdf = pd.DataFrame({"UMAP-1":u2d[:,0],"UMAP-2":u2d[:,1],
43
+ "Cluster":[str(l) for l in td["labels"]],
44
+ "Doc":[d[:60] for d in td["documents"]]})
45
+ fig = px.scatter(sdf, x="UMAP-1", y="UMAP-2", color="Cluster",
46
+ hover_data=["Doc"], opacity=0.7, title="2-D UMAP (SPECTER-2)")
47
+ fig.update_layout(template="plotly_dark", height=480,
48
+ paper_bgcolor="#0d0f14", plot_bgcolor="#13161e")
49
+ # Trial log
50
+ tl = pd.DataFrame(td["trial_log"])
51
+ tl_cols = ["trial","discipline_pass","n_clusters","persistence",
52
+ "dbcv","max_mass_pct","min_size","n_noise"]
53
+ tl_show = tl[[c for c in tl_cols if c in tl.columns]] if not tl.empty else pd.DataFrame()
54
+ # Cluster table with strong/weak/persistence
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  rows = []
56
  for cid in sorted(interps.keys()):
57
+ v = interps[cid]
58
+ rows.append({"Cluster":cid,"Label":v["label"],"Agreement":v["agreement"],
59
+ "Description":v.get("description",""),
60
+ "PAJAIS":v.get("pacis_match",""),"Strong":v["strong"],"Weak":v["weak"],
61
+ "Persistence":round(v.get("persistence",0),4),
62
+ "Keyphrases":", ".join(v.get("keyphrases",[]))})
63
+ cdf = pd.DataFrame(rows)
64
+ # TCCM sheets
65
+ sheets = r.get("sheets",{})
66
+ s_rows = []
67
+ for sn, label in [(1,"Groq"),(2,"Mistral"),(3,"Gemini"),(4,"Consolidated")]:
68
+ for row in sheets.get(sn,[]):
69
+ s_rows.append({"Sheet":f"{sn}-{label}", **row})
70
+ sdf2 = pd.DataFrame(s_rows) if s_rows else pd.DataFrame()
71
+ # Mismatch
72
+ mt = r.get("mismatch_table",[])
73
+ mdf = pd.DataFrame(mt) if mt else pd.DataFrame()
74
+ # Downloads
75
+ jp = r.get("json_path","topics.json")
76
+ cp = r.get("csv_path","topics.csv")
77
+ return summary, fig, tl_show, cdf, sdf2, mdf, jp, cp
78
+
79
+ css = """
80
+ .gradio-container{background:#0d0f14!important;color:#e8eaf0!important}
81
+ .gr-button-primary{background:#4d9de0!important}
82
+ footer{display:none!important}
83
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
+ with gr.Blocks(theme=gr.themes.Base(primary_hue="blue",
86
+ neutral_hue="slate"), css=css, title="SPECTER-2 Topic Analyzer") as demo:
87
+ gr.Markdown("# 📐 SPECTER-2 Topic Analyzer\n"
88
+ "SPECTER-2 Bayesian UMAP+HDBSCAN → 3-LLM Council")
89
+ with gr.Row():
90
+ with gr.Column(scale=1):
91
+ file_in = gr.File(label="Upload Scopus CSV", file_types=[".csv"])
92
+ groq_in = gr.Textbox(label="Groq API Key", type="password")
93
+ mistral_in = gr.Textbox(label="Mistral API Key", type="password")
94
+ gemini_in = gr.Textbox(label="Gemini API Key", type="password")
95
+ trials_in = gr.Slider(10,100,50,step=5,label="Optuna Trials (§3.4)")
96
+ run_btn = gr.Button("▶ Run Full Pipeline", variant="primary")
97
+ gr.Markdown("**Hard rules:** max mass ≤25%, min size ≥5\n\n"
98
+ "**LLM Council:** Groq · Mistral · Gemini\n\n"
99
+ "**4 Sheets:** 3 independent LLMs + 1 consolidated")
100
+ with gr.Column(scale=3):
101
+ with gr.Tabs():
102
+ with gr.Tab("Summary"):
103
+ summary_out = gr.Markdown()
104
+ with gr.Tab("2-D UMAP"):
105
+ scatter_out = gr.Plot()
106
+ with gr.Tab("Trial Log"):
107
+ trial_out = gr.Dataframe(label="Bayesian Trials (≥50)")
108
+ with gr.Tab("Clusters"):
109
+ cluster_out = gr.Dataframe(
110
+ label="Strong/Weak + Persistence per cluster")
111
+ with gr.Tab("LLM Sheets 1-4"):
112
+ sheet_out = gr.Dataframe(label="4 Sheets: 3 LLMs + Consolidated")
113
+ with gr.Tab("RQ Mismatch"):
114
+ mismatch_out = gr.Dataframe(label="RQ2/RQ3 Mismatch Table")
115
+ with gr.Tab("Downloads"):
116
+ json_out = gr.File(label="topics.json")
117
+ csv_out = gr.File(label="topics.csv")
118
+ run_btn.click(_run,
119
+ inputs=[file_in, groq_in, mistral_in, gemini_in, trials_in],
120
+ outputs=[summary_out, scatter_out, trial_out, cluster_out,
121
+ sheet_out, mismatch_out, json_out, csv_out])
122
+
123
+ if __name__ == "__main__":
124
+ demo.launch(server_name="0.0.0.0", server_port=7860)