anujjuna committed on
Commit
05df72c
·
verified ·
1 Parent(s): 9c754e7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +360 -563
app.py CHANGED
@@ -1,659 +1,456 @@
1
  """
2
  app.py
3
  ------
4
- Streamlit UI — SPECTER2 + BERTopic + 3-LLM Council
5
- Research Topic Analyzer for SPJIMR × SPIT Group 14
 
6
  """
7
 
8
- import os
9
- import json
10
- import tempfile
11
  import pandas as pd
 
12
  import streamlit as st
 
 
13
 
14
  from tools import run_topic_modeling
15
  from agent import run_agent
16
 
17
- # ── Page setup ──────────────────────────────────────────────────────────────
18
- st.set_page_config(
19
- page_title="TMIS Topic Analyzer",
20
- page_icon="πŸ“",
21
- layout="wide",
22
- initial_sidebar_state="expanded",
23
- )
24
 
25
- # ── Custom CSS ───────────────────────────────────────────────────────────────
26
  st.markdown("""
27
  <style>
28
  @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;600&family=IBM+Plex+Sans:wght@300;400;500;600&display=swap');
29
-
30
- html, body, [class*="css"] {
31
- font-family: 'IBM Plex Sans', sans-serif;
32
- }
33
-
34
- /* App background */
35
- .stApp {
36
- background: #0d0f14;
37
- color: #e8eaf0;
38
- }
39
-
40
- /* Sidebar */
41
- [data-testid="stSidebar"] {
42
- background: #13161e;
43
- border-right: 1px solid #1f2333;
44
- }
45
- [data-testid="stSidebar"] * {
46
- color: #b0b8cc !important;
47
- }
48
- [data-testid="stSidebar"] h1,
49
- [data-testid="stSidebar"] h2,
50
- [data-testid="stSidebar"] h3 {
51
- color: #e8eaf0 !important;
52
- font-family: 'IBM Plex Mono', monospace !important;
53
- font-size: 0.8rem !important;
54
- letter-spacing: 0.12em !important;
55
- text-transform: uppercase !important;
56
- }
57
-
58
- /* Header */
59
- .site-header {
60
- padding: 2.5rem 0 1.5rem 0;
61
- border-bottom: 1px solid #1f2333;
62
- margin-bottom: 2rem;
63
- }
64
- .site-header h1 {
65
- font-family: 'IBM Plex Mono', monospace;
66
- font-size: 1.6rem;
67
- font-weight: 600;
68
- color: #e8eaf0;
69
- letter-spacing: -0.01em;
70
- margin: 0 0 0.3rem 0;
71
- }
72
- .site-header p {
73
- font-size: 0.82rem;
74
- color: #5a6480;
75
- font-family: 'IBM Plex Mono', monospace;
76
- margin: 0;
77
- letter-spacing: 0.04em;
78
- }
79
-
80
- /* Pills / badges */
81
- .pill {
82
- display: inline-block;
83
- font-family: 'IBM Plex Mono', monospace;
84
- font-size: 0.68rem;
85
- font-weight: 600;
86
- letter-spacing: 0.08em;
87
- text-transform: uppercase;
88
- padding: 3px 10px;
89
- border-radius: 2px;
90
- margin-right: 6px;
91
- }
92
- .pill-blue { background: #0f2a4a; color: #4d9de0; border: 1px solid #1a4070; }
93
- .pill-green { background: #0a2a1a; color: #3dba7a; border: 1px solid #1a4a2a; }
94
- .pill-amber { background: #2a1f00; color: #e8a020; border: 1px solid #4a3500; }
95
- .pill-red { background: #2a0f0f; color: #e04d4d; border: 1px solid #4a1a1a; }
96
- .pill-gray { background: #1a1e2a; color: #7a8090; border: 1px solid #2a2e3a; }
97
-
98
- /* Stats row */
99
- .stat-grid {
100
- display: grid;
101
- grid-template-columns: repeat(4, 1fr);
102
- gap: 1px;
103
- background: #1f2333;
104
- border: 1px solid #1f2333;
105
- border-radius: 6px;
106
- overflow: hidden;
107
- margin-bottom: 2rem;
108
- }
109
- .stat-card {
110
- background: #13161e;
111
- padding: 1.25rem 1.5rem;
112
- text-align: center;
113
- }
114
- .stat-val {
115
- font-family: 'IBM Plex Mono', monospace;
116
- font-size: 1.9rem;
117
- font-weight: 600;
118
- color: #e8eaf0;
119
- line-height: 1;
120
- margin-bottom: 0.3rem;
121
- }
122
- .stat-label {
123
- font-size: 0.7rem;
124
- color: #5a6480;
125
- text-transform: uppercase;
126
- letter-spacing: 0.1em;
127
- font-family: 'IBM Plex Mono', monospace;
128
- }
129
-
130
- /* Section titles */
131
- .section-title {
132
- font-family: 'IBM Plex Mono', monospace;
133
- font-size: 0.7rem;
134
- font-weight: 600;
135
- letter-spacing: 0.15em;
136
- text-transform: uppercase;
137
- color: #5a6480;
138
- padding-bottom: 0.6rem;
139
- border-bottom: 1px solid #1f2333;
140
- margin-bottom: 1.2rem;
141
- }
142
-
143
- /* Topic cards */
144
- .topic-card {
145
- background: #13161e;
146
- border: 1px solid #1f2333;
147
- border-left: 3px solid #4d9de0;
148
- border-radius: 4px;
149
- padding: 1rem 1.25rem;
150
- margin-bottom: 0.6rem;
151
- transition: border-color 0.15s;
152
- }
153
- .topic-card:hover { border-left-color: #3dba7a; }
154
- .topic-card.novel { border-left-color: #e8a020; }
155
- .topic-label {
156
- font-size: 0.92rem;
157
- font-weight: 500;
158
- color: #e8eaf0;
159
- margin-bottom: 0.35rem;
160
- }
161
- .topic-meta {
162
- font-family: 'IBM Plex Mono', monospace;
163
- font-size: 0.7rem;
164
- color: #5a6480;
165
- }
166
- .topic-kw {
167
- font-family: 'IBM Plex Mono', monospace;
168
- font-size: 0.68rem;
169
- color: #3d6480;
170
- margin-top: 0.4rem;
171
- line-height: 1.5;
172
- }
173
-
174
- /* Validation panel */
175
- .val-box {
176
- background: #0a2a1a;
177
- border: 1px solid #1a4a2a;
178
- border-radius: 6px;
179
- padding: 1.25rem 1.5rem;
180
- margin-bottom: 1.5rem;
181
- }
182
- .val-box h4 {
183
- font-family: 'IBM Plex Mono', monospace;
184
- font-size: 0.72rem;
185
- font-weight: 600;
186
- letter-spacing: 0.1em;
187
- text-transform: uppercase;
188
- color: #3dba7a;
189
- margin: 0 0 0.75rem 0;
190
- }
191
- .val-row {
192
- display: flex;
193
- justify-content: space-between;
194
- align-items: center;
195
- padding: 0.4rem 0;
196
- border-bottom: 1px solid #1a3a2a;
197
- font-size: 0.8rem;
198
- color: #a0b8a8;
199
- }
200
- .val-row:last-child { border-bottom: none; }
201
- .val-key { color: #5a7a6a; }
202
- .val-num { font-family: 'IBM Plex Mono', monospace; color: #3dba7a; font-weight: 600; }
203
-
204
- /* LLM council badge row */
205
- .council-row {
206
- display: flex;
207
- gap: 8px;
208
- margin-bottom: 1rem;
209
- flex-wrap: wrap;
210
- }
211
-
212
- /* Run button */
213
- .stButton > button {
214
- background: #4d9de0 !important;
215
- color: #0d0f14 !important;
216
- border: none !important;
217
- border-radius: 3px !important;
218
- font-family: 'IBM Plex Mono', monospace !important;
219
- font-size: 0.78rem !important;
220
- font-weight: 600 !important;
221
- letter-spacing: 0.08em !important;
222
- text-transform: uppercase !important;
223
- padding: 0.6rem 2rem !important;
224
- transition: background 0.15s !important;
225
- }
226
- .stButton > button:hover {
227
- background: #3d8ed0 !important;
228
- }
229
-
230
- /* Input overrides */
231
- .stTextInput input, .stSelectbox select {
232
- background: #13161e !important;
233
- border: 1px solid #1f2333 !important;
234
- color: #e8eaf0 !important;
235
- font-family: 'IBM Plex Mono', monospace !important;
236
- font-size: 0.82rem !important;
237
- border-radius: 3px !important;
238
- }
239
-
240
- /* Dataframe */
241
- .stDataFrame {
242
- background: #13161e;
243
- border: 1px solid #1f2333;
244
- border-radius: 4px;
245
- }
246
-
247
- /* Download buttons */
248
- .stDownloadButton > button {
249
- background: transparent !important;
250
- color: #4d9de0 !important;
251
- border: 1px solid #1a4070 !important;
252
- border-radius: 3px !important;
253
- font-family: 'IBM Plex Mono', monospace !important;
254
- font-size: 0.72rem !important;
255
- letter-spacing: 0.08em !important;
256
- }
257
-
258
- /* Expander */
259
- .streamlit-expanderHeader {
260
- background: #13161e !important;
261
- border: 1px solid #1f2333 !important;
262
- font-family: 'IBM Plex Mono', monospace !important;
263
- font-size: 0.78rem !important;
264
- color: #a0a8c0 !important;
265
- }
266
-
267
- /* Progress / spinner */
268
- .stSpinner > div { border-top-color: #4d9de0 !important; }
269
-
270
- /* Divider */
271
- hr { border-color: #1f2333 !important; }
272
-
273
- /* Alerts */
274
- .stAlert { border-radius: 4px !important; }
275
  </style>
276
  """, unsafe_allow_html=True)
277
 
278
  # ── Header ───────────────────────────────────────────────────────────────────
279
  st.markdown("""
280
  <div class="site-header">
281
- <h1>Research Topic Analyzer</h1>
282
- <p>SPECTER2 embeddings &nbsp;Β·&nbsp; HDBSCAN/UMAP clustering &nbsp;Β·&nbsp; 3-LLM Council (Groq + Mistral + Gemini) &nbsp;Β·&nbsp; PAJAIS validation</p>
 
283
  </div>
284
  """, unsafe_allow_html=True)
285
 
286
  # ── Sidebar ──────────────────────────────────────────────────────────────────
287
  with st.sidebar:
288
  st.markdown("### API Keys")
289
- groq_key_input = st.text_input("Groq API Key", type="password", placeholder="GROQ_API_KEY env var")
290
- mistral_key_input = st.text_input("Mistral API Key", type="password", placeholder="MISTRAL_API_KEY env var")
291
- gemini_key_input = st.text_input("Gemini API Key", type="password", placeholder="GEMINI_API_KEY env var")
292
  st.caption("Keys are never stored. Leave blank to use env vars.")
293
 
294
  st.markdown("---")
295
- st.markdown("### Clustering Parameters")
296
- min_topic_size = st.slider("Min papers per cluster", min_value=3, max_value=20, value=5,
297
- help="Prof. Kamat spec: min=5")
298
- st.markdown(
299
- "<span class='pill pill-blue'>Min clusters: 15</span>"
300
- "<span class='pill pill-blue'>Max clusters: 30</span>",
301
- unsafe_allow_html=True
302
- )
303
  st.markdown(
304
- "<span class='pill pill-gray'>Cosine sim: 0.50–0.55</span>",
305
- unsafe_allow_html=True
306
- )
307
 
308
  st.markdown("---")
309
  st.markdown("### LLM Council")
310
  st.markdown("""
311
- <div class="council-row">
312
  <span class="pill pill-blue">Groq / LLaMA-3.1</span>
313
  <span class="pill pill-green">Mistral Small</span>
314
  <span class="pill pill-amber">Gemini 2.5 Flash</span>
315
  </div>
316
- <p style="font-size:0.72rem;color:#5a6480;font-family:'IBM Plex Mono',monospace;">
317
- Majority vote β†’ best label selected.<br>
318
- Keyword-overlap fallback if no consensus.
 
319
  </p>
320
  """, unsafe_allow_html=True)
321
 
322
  st.markdown("---")
323
  if st.button("Reset Results", use_container_width=True):
324
- for key in ["agent_results", "topic_stats"]:
325
- st.session_state.pop(key, None)
326
  st.rerun()
327
 
328
- groq_api_key = groq_key_input.strip() or os.getenv("GROQ_API_KEY")
329
- mistral_api_key = mistral_key_input.strip() or os.getenv("MISTRAL_API_KEY")
330
- gemini_api_key = gemini_key_input.strip() or os.getenv("GEMINI_API_KEY")
331
 
332
- # ── Dataset upload ────────────────────────────────────────────────────────────
333
  st.markdown("<div class='section-title'>Dataset</div>", unsafe_allow_html=True)
334
-
335
- col_up, col_sample = st.columns([3, 1])
336
  with col_up:
337
- uploaded_file = st.file_uploader(
338
- "Upload Scopus CSV β€” must contain 'title' and 'abstract' columns",
339
- type=["csv"],
340
- help="Export your corpus from Scopus as CSV. The tool will combine Title + Abstract into one SPECTER2 vector per paper."
341
- )
342
- with col_sample:
343
  st.markdown("<br>", unsafe_allow_html=True)
344
- use_sample = st.checkbox("Use sample dataset (50 papers)", value=False)
345
-
346
- if uploaded_file and not use_sample:
347
- try:
348
- df_preview = pd.read_csv(uploaded_file)
349
- uploaded_file.seek(0)
350
- col_a, col_b, col_c = st.columns(3)
351
- col_a.metric("Papers detected", len(df_preview))
352
- col_b.metric("Columns", len(df_preview.columns))
353
- has_both = {"title", "abstract"}.issubset(set(df_preview.columns.str.lower()))
354
- col_c.metric("Title + Abstract", "βœ“ present" if has_both else "βœ— missing")
355
- if not has_both:
356
- st.error("CSV must have both 'title' and 'abstract' columns.")
357
- except Exception as e:
358
- st.error(f"Could not preview CSV: {e}")
359
-
360
- # ── Run Pipeline ─────────────────────────────────────────────────────────────
361
  st.markdown("<br>", unsafe_allow_html=True)
362
  run_btn = st.button("β–Ά Run Full Pipeline", type="primary")
363
 
364
  if run_btn:
365
- # Validation
366
- missing_keys = []
367
- if not groq_api_key: missing_keys.append("Groq")
368
- if not mistral_api_key: missing_keys.append("Mistral")
369
- if not gemini_api_key: missing_keys.append("Gemini")
370
- if missing_keys:
371
- st.error(f"Missing API key(s): {', '.join(missing_keys)}. All three are required for the LLM council.")
372
  st.stop()
373
-
374
- if not use_sample and uploaded_file is None:
375
- st.error("Please upload a CSV file or enable the sample dataset.")
376
  st.stop()
377
 
378
- # Prepare CSV path
379
  if use_sample:
380
- import numpy as np
381
  rng = np.random.default_rng(42)
382
- topics_pool = [
383
- ("Deep Learning for Healthcare Prediction", "We apply LSTM networks to predict patient readmission from EHR data."),
384
- ("Process Mining in Enterprise Systems", "Event log analysis using Petri nets for conformance checking in ERP workflows."),
385
- ("Recommender Systems Collaborative Filtering", "Matrix factorization techniques applied to e-commerce product recommendation."),
386
- ("LLM Applications in Information Systems", "GPT-4 used for automated requirements extraction from stakeholder documents."),
387
- ("Blockchain Smart Contract Security", "Formal verification of Solidity smart contracts for financial transaction safety."),
388
- ("Federated Learning Privacy Preservation", "Differential privacy mechanisms for distributed model training across hospitals."),
389
- ("Cybersecurity Intrusion Detection", "Random forest classifiers for network anomaly detection in enterprise environments."),
390
- ("Natural Language Processing Sentiment", "BERT fine-tuning for aspect-level sentiment analysis in product reviews."),
391
- ("Knowledge Graph Embedding", "TransE and RotatE models for biomedical entity relation prediction."),
392
- ("Computer Vision Medical Imaging", "CNN architectures for diabetic retinopathy grading from fundus photographs."),
 
 
 
 
 
 
 
 
 
 
393
  ]
394
- rows = []
395
- for i in range(50):
396
- t, a = topics_pool[i % len(topics_pool)]
397
- rows.append({"title": t, "abstract": a + f" Study {i+1}.", "doi": f"10.1145/sample.{i+1}"})
398
- df_s = pd.DataFrame(rows)
399
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
400
- df_s.to_csv(tmp.name, index=False)
401
- csv_path = tmp.name
402
  else:
403
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
404
- tmp.write(uploaded_file.read())
405
- tmp.flush()
406
- csv_path = tmp.name
407
-
408
- # Step 1: Topic modeling
409
- progress_bar = st.progress(0, text="Step 1/2 β€” SPECTER2 embeddings + HDBSCAN clustering (15–30 clusters)…")
 
 
 
 
410
  try:
411
- topic_results = run_topic_modeling(csv_path, min_topic_size=min_topic_size)
412
- n_clusters = len(topic_results["documents"]["topic_keywords"])
413
- progress_bar.progress(50, text=f"Step 1/2 β€” Done. {n_clusters} clusters found.")
414
- except Exception as exc:
415
- st.error(f"Topic modeling failed: {exc}")
 
 
416
  st.stop()
417
 
418
- # Step 2: LLM Council
419
- progress_bar.progress(55, text="Step 2/2 β€” 3-LLM Council labelling (Groq + Mistral + Gemini)…")
420
  try:
421
- agent_results = run_agent(
422
- topic_results=topic_results,
423
- groq_key=groq_api_key,
424
- mistral_key=mistral_api_key,
425
- gemini_key=gemini_api_key,
426
- )
427
- progress_bar.progress(100, text="Pipeline complete.")
428
- st.session_state["agent_results"] = agent_results
429
-
430
- # Compute summary stats
431
- interps = agent_results.get("interpretations", {})
432
- novel_count = sum(1 for i in interps.values() if i.classification == "NOVEL")
433
- mapped_count = sum(1 for i in interps.values() if i.classification == "MAPPED")
434
- total_papers = sum(i.paper_count for i in interps.values())
435
- st.session_state["topic_stats"] = {
436
- "n_topics": len(interps),
437
- "novel": novel_count,
438
- "mapped": mapped_count,
439
- "total_papers": total_papers,
440
- }
441
- st.success(f"Pipeline complete β€” {len(interps)} topics labelled by 3-LLM council.")
442
- except Exception as exc:
443
- st.error(f"LLM council failed: {exc}")
444
  st.stop()
445
 
446
- # ── Results Display ────────────────────────────────────────────────────────────
447
- results = st.session_state.get("agent_results")
448
- stats = st.session_state.get("topic_stats")
449
 
450
- if results and stats:
451
- interps = results.get("interpretations", {})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
452
 
453
- # ── Summary stats ─────────────────────────────────────────────────────────
454
- st.markdown("<div class='section-title'>Pipeline Summary</div>", unsafe_allow_html=True)
455
  st.markdown(f"""
456
  <div class="stat-grid">
457
- <div class="stat-card">
458
- <div class="stat-val">{stats['n_topics']}</div>
459
- <div class="stat-label">Topics Found</div>
460
- </div>
461
- <div class="stat-card">
462
- <div class="stat-val">{stats['total_papers']}</div>
463
- <div class="stat-label">Papers Assigned</div>
464
- </div>
465
- <div class="stat-card">
466
- <div class="stat-val">{stats['novel']}</div>
467
- <div class="stat-label">NOVEL (no PAJAIS home)</div>
468
- </div>
469
- <div class="stat-card">
470
- <div class="stat-val">{stats['mapped']}</div>
471
- <div class="stat-label">MAPPED to PAJAIS</div>
472
- </div>
473
  </div>
474
  """, unsafe_allow_html=True)
475
 
476
- # ── Validation panel ──────────────────────────────────────────────────────
477
- st.markdown("<div class='section-title'>LLM Council Validation</div>", unsafe_allow_html=True)
478
- novel_pct = round(stats['novel'] / stats['n_topics'] * 100) if stats['n_topics'] else 0
479
- mapped_pct = round(stats['mapped'] / stats['n_topics'] * 100) if stats['n_topics'] else 0
480
  st.markdown(f"""
481
  <div class="val-box">
482
- <h4>Instructor Spec Compliance</h4>
483
- <div class="val-row"><span class="val-key">Embedding model</span><span class="val-num">SPECTER2 (allenai/specter2_base)</span></div>
484
- <div class="val-row"><span class="val-key">Input column</span><span class="val-num">Title + Abstract (combined)</span></div>
485
- <div class="val-row"><span class="val-key">Clustering</span><span class="val-num">UMAP β†’ HDBSCAN (min=5, max=100 per cluster)</span></div>
486
- <div class="val-row"><span class="val-key">Cosine similarity range</span><span class="val-num">0.50 – 0.55 (merge / outlier reassign)</span></div>
487
- <div class="val-row"><span class="val-key">Total clusters</span><span class="val-num">{stats['n_topics']} (target: 15–30)</span></div>
488
- <div class="val-row"><span class="val-key">LLM council</span><span class="val-num">Groq (LLaMA-3.1) + Mistral Small + Gemini 2.5 Flash</span></div>
489
- <div class="val-row"><span class="val-key">Label selection</span><span class="val-num">Majority vote β†’ keyword-overlap fallback</span></div>
490
- <div class="val-row"><span class="val-key">Rep. docs per topic</span><span class="val-num">Top-3 by cosine similarity to centroid</span></div>
491
- <div class="val-row"><span class="val-key">NOVEL themes (no PAJAIS home)</span><span class="val-num">{novel_pct}% ({stats['novel']} topics)</span></div>
492
- <div class="val-row"><span class="val-key">MAPPED to PAJAIS taxonomy</span><span class="val-num">{mapped_pct}% ({stats['mapped']} topics)</span></div>
 
 
 
 
 
 
493
  </div>
494
  """, unsafe_allow_html=True)
495
 
496
- # ── Filters ───────────────────────────────────────────────────────────────
497
- st.markdown("<div class='section-title'>Topic Results</div>", unsafe_allow_html=True)
498
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
499
  rows = []
500
- for tid, interp in sorted(interps.items()):
 
501
  rows.append({
502
- "Topic ID": tid,
503
- "Label": interp.label,
504
- "Classification": interp.classification,
505
- "Category": interp.category,
506
- "Papers": interp.paper_count,
507
- "Keywords": ", ".join(interp.keywords[:8]),
 
 
 
 
508
  })
509
- df_res = pd.DataFrame(rows).sort_values("Papers", ascending=False).reset_index(drop=True)
510
-
511
- col_f1, col_f2, col_f3 = st.columns([2, 2, 1])
512
- with col_f1:
513
- cats = ["All"] + sorted(df_res["Category"].unique().tolist())
514
- sel_cat = st.selectbox("Filter by category", cats)
515
- with col_f2:
516
- clsf = ["All", "NOVEL", "MAPPED"]
517
- sel_cls = st.selectbox("Filter by classification", clsf)
518
- with col_f3:
519
- sort_by = st.selectbox("Sort by", ["Papers ↓", "Papers ↑", "Label A–Z"])
520
-
521
- df_f = df_res.copy()
522
- if sel_cat != "All":
523
- df_f = df_f[df_f["Category"] == sel_cat]
524
- if sel_cls != "All":
525
- df_f = df_f[df_f["Classification"] == sel_cls]
526
- if sort_by == "Papers ↓":
527
- df_f = df_f.sort_values("Papers", ascending=False)
528
- elif sort_by == "Papers ↑":
529
- df_f = df_f.sort_values("Papers", ascending=True)
530
- else:
531
- df_f = df_f.sort_values("Label")
532
- df_f = df_f.reset_index(drop=True)
533
-
534
- st.caption(f"Showing {len(df_f)} of {len(df_res)} topics")
535
-
536
- # ── Topic cards ───────────────────────────────────────────────────────────
537
- view_mode = st.radio("View as", ["Table", "Cards"], horizontal=True)
538
-
539
- if view_mode == "Table":
540
- st.dataframe(df_f, use_container_width=True, height=420)
541
- else:
542
- for _, row in df_f.iterrows():
543
- cls_pill = (
544
- "<span class='pill pill-amber'>NOVEL</span>"
545
- if row["Classification"] == "NOVEL"
546
- else "<span class='pill pill-green'>MAPPED</span>"
547
- )
548
- card_cls = "topic-card novel" if row["Classification"] == "NOVEL" else "topic-card"
549
  st.markdown(f"""
550
- <div class="{card_cls}">
551
  <div class="topic-label">{row['Label']}</div>
552
  <div class="topic-meta">
553
- {cls_pill}
554
- <span class="pill pill-gray">{row['Category']}</span>
555
- <span class="pill pill-blue">{row['Papers']} papers</span>
 
556
  </div>
557
- <div class="topic-kw">{row['Keywords']}</div>
558
- </div>
559
- """, unsafe_allow_html=True)
560
-
561
- # ── Bar chart ─────────────────────────────────────────────────────────────
562
- st.markdown("<br>", unsafe_allow_html=True)
563
- with st.expander("Topic frequency chart", expanded=False):
564
- chart_df = df_f[["Label", "Papers"]].copy()
565
- chart_df["Label"] = chart_df["Label"].apply(lambda x: x[:35] + "…" if len(x) > 35 else x)
566
- chart_df = chart_df.set_index("Label")
567
- st.bar_chart(chart_df, height=380)
568
-
569
- # ── NOVEL / PAJAIS breakdown ───────────────────────────────────────────────
570
- with st.expander("NOVEL vs PAJAIS breakdown β€” for paper Β§4.6", expanded=False):
571
- col_n, col_m = st.columns(2)
572
- with col_n:
573
- st.markdown("**NOVEL topics (no PAJAIS home)**")
574
- novel_df = df_f[df_f["Classification"] == "NOVEL"][["Label", "Papers", "Category"]].reset_index(drop=True)
575
- st.dataframe(novel_df, use_container_width=True)
576
- with col_m:
577
- st.markdown("**MAPPED topics (PAJAIS match)**")
578
- mapped_df = df_f[df_f["Classification"] == "MAPPED"][["Label", "Papers", "Category"]].reset_index(drop=True)
579
- st.dataframe(mapped_df, use_container_width=True)
580
-
581
- # ── Representative documents ──────────────────────────────────────────────
582
- with st.expander("Representative papers per topic (top-3 by centroid proximity)", expanded=False):
583
- rep_docs = results.get("rep_docs_raw", {})
584
- # Pull from topic_results stored in session if available
585
- for tid, interp in sorted(interps.items()):
586
- st.markdown(f"**Topic {tid} β€” {interp.label}**")
587
- docs = interp.keywords # fallback; actual rep_docs wired below
588
- st.caption("See topics.json for full representative document titles.")
589
- st.info("Download topics.json below to see the 3 representative paper titles per cluster used for LLM labelling.")
590
-
591
- # ── Downloads ─────────────────────────────────────────────────────────────
592
- st.markdown("<div class='section-title'>Downloads</div>", unsafe_allow_html=True)
593
- col_d1, col_d2, col_d3 = st.columns(3)
594
- with col_d1:
595
- try:
596
- with open(results["json_path"], "r") as f:
597
- st.download_button(
598
- "⬇ topics.json",
599
- f.read(),
600
- file_name="tmis_topics.json",
601
- mime="application/json",
602
- use_container_width=True,
603
- )
604
- except Exception:
605
- st.warning("JSON file not found.")
606
- with col_d2:
607
  try:
608
- df_dl = pd.read_csv(results["csv_path"])
609
- st.download_button(
610
- "⬇ topics.csv",
611
- df_dl.to_csv(index=False),
612
- file_name="tmis_topics.csv",
613
- mime="text/csv",
614
- use_container_width=True,
615
- )
616
  except Exception:
617
- st.warning("CSV file not found.")
618
- with col_d3:
619
- st.download_button(
620
- "⬇ results table",
621
- df_res.to_csv(index=False),
622
- file_name="tmis_topic_results.csv",
623
- mime="text/csv",
624
- use_container_width=True,
625
- )
626
-
627
- # ── Method note for paper ─────────────────────────────────────────────────
628
- st.markdown("<br>", unsafe_allow_html=True)
629
- with st.expander("Β§3.4 methodology note β€” paste into paper", expanded=False):
630
- st.code(f"""Pipeline A (Unsupervised Discovery): SPECTER2 (allenai/specter2_base) generates one
631
- 768-dimensional document embedding per paper from a combined Title + Abstract column.
632
- UMAP (n_neighbors=15, n_components=5, metric=cosine) reduces dimensionality; HDBSCAN
633
- (min_cluster_size={min_topic_size}, metric=euclidean, cluster_selection=eom) clusters embeddings.
634
- Cosine similarity threshold 0.50–0.55 governs cluster merging and outlier reassignment.
635
- Total clusters constrained to 15–30 via iterative split/merge.
636
-
637
- Pipeline B (LLM Council Validation): For each cluster, the 3 papers nearest the centroid
638
- (by cosine similarity) are passed as representative titles to 3 independent LLMs:
639
- Groq/LLaMA-3.1-8b, Mistral-Small-Latest, and Gemini-2.5-Flash. Each LLM returns a
640
- structured JSON with label, taxonomy_category, and classification (MAPPED/NOVEL).
641
- Majority vote selects the final label; keyword-overlap fallback applies when no consensus.
642
- This is the 3-LLM Council approach validating AI output without using the same model
643
- for self-validation (per Carlsen & Ralund, 2022 CALM principle).
644
-
645
- Results: {stats['n_topics']} clusters discovered. {novel_pct}% classified as NOVEL
646
- (no PAJAIS 2019 home). {mapped_pct}% MAPPED to existing PAJAIS categories.""", language="text")
647
-
648
- # ── Empty state ───────────────────────────────────────────────────────────────
649
- elif not results:
650
  st.markdown("""
651
  <div style="text-align:center;padding:4rem 2rem;border:1px dashed #1f2333;border-radius:6px;margin-top:2rem;">
652
- <p style="font-family:'IBM Plex Mono',monospace;font-size:0.8rem;color:#3a4060;letter-spacing:0.1em;">
653
  UPLOAD CSV β†’ ENTER API KEYS β†’ RUN PIPELINE
654
  </p>
655
- <p style="font-size:0.75rem;color:#2a3050;margin-top:0.5rem;">
656
- SPECTER2 embeddings Β· HDBSCAN Β· 3-LLM council Β· PAJAIS validation
657
  </p>
658
  </div>
659
- """, unsafe_allow_html=True)
 
1
  """
2
  app.py
3
  ------
4
+ Streamlit UI — SPECTER-2 + UMAP + HDBSCAN Bayesian Pipeline
5
+ with 2-D UMAP scatter, Pareto front, strong/weak members,
6
+ trial log, and LLM Council Sheets 1-4.
7
  """
8
 
9
+ import os, json, tempfile
 
 
10
  import pandas as pd
11
+ import numpy as np
12
  import streamlit as st
13
+ import plotly.express as px
14
+ import plotly.graph_objects as go
15
 
16
  from tools import run_topic_modeling
17
  from agent import run_agent
18
 
19
+ # ── Page ─────────────────────────────────────────────────────────────────────
20
+ st.set_page_config(page_title="SPECTER-2 Topic Analyzer", page_icon="πŸ“",
21
+ layout="wide", initial_sidebar_state="expanded")
 
 
 
 
22
 
23
+ # ── CSS ──────────────────────────────────────────────────────────────────────
24
  st.markdown("""
25
  <style>
26
  @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;600&family=IBM+Plex+Sans:wght@300;400;500;600&display=swap');
27
+ html, body, [class*="css"] { font-family:'IBM Plex Sans',sans-serif; }
28
+ .stApp { background:#0d0f14; color:#e8eaf0; }
29
+ [data-testid="stSidebar"] { background:#13161e; border-right:1px solid #1f2333; }
30
+ [data-testid="stSidebar"] * { color:#b0b8cc !important; }
31
+ [data-testid="stSidebar"] h1,[data-testid="stSidebar"] h2,[data-testid="stSidebar"] h3 {
32
+ color:#e8eaf0!important; font-family:'IBM Plex Mono',monospace!important;
33
+ font-size:.8rem!important; letter-spacing:.12em!important; text-transform:uppercase!important; }
34
+ .site-header { padding:2.5rem 0 1.5rem; border-bottom:1px solid #1f2333; margin-bottom:2rem; }
35
+ .site-header h1 { font-family:'IBM Plex Mono',monospace; font-size:1.6rem; font-weight:600;
36
+ color:#e8eaf0; letter-spacing:-.01em; margin:0 0 .3rem; }
37
+ .site-header p { font-size:.82rem; color:#5a6480; font-family:'IBM Plex Mono',monospace; margin:0; }
38
+ .pill { display:inline-block; font-family:'IBM Plex Mono',monospace; font-size:.68rem;
39
+ font-weight:600; letter-spacing:.08em; text-transform:uppercase; padding:3px 10px;
40
+ border-radius:2px; margin-right:6px; }
41
+ .pill-blue { background:#0f2a4a; color:#4d9de0; border:1px solid #1a4070; }
42
+ .pill-green { background:#0a2a1a; color:#3dba7a; border:1px solid #1a4a2a; }
43
+ .pill-amber { background:#2a1f00; color:#e8a020; border:1px solid #4a3500; }
44
+ .pill-red { background:#2a0f0f; color:#e04d4d; border:1px solid #4a1a1a; }
45
+ .pill-gray { background:#1a1e2a; color:#7a8090; border:1px solid #2a2e3a; }
46
+ .stat-grid { display:grid; grid-template-columns:repeat(5,1fr); gap:1px;
47
+ background:#1f2333; border:1px solid #1f2333; border-radius:6px; overflow:hidden; margin-bottom:2rem; }
48
+ .stat-card { background:#13161e; padding:1.25rem 1.5rem; text-align:center; }
49
+ .stat-val { font-family:'IBM Plex Mono',monospace; font-size:1.9rem; font-weight:600;
50
+ color:#e8eaf0; line-height:1; margin-bottom:.3rem; }
51
+ .stat-label { font-size:.7rem; color:#5a6480; text-transform:uppercase; letter-spacing:.1em;
52
+ font-family:'IBM Plex Mono',monospace; }
53
+ .section-title { font-family:'IBM Plex Mono',monospace; font-size:.7rem; font-weight:600;
54
+ letter-spacing:.15em; text-transform:uppercase; color:#5a6480;
55
+ padding-bottom:.6rem; border-bottom:1px solid #1f2333; margin-bottom:1.2rem; }
56
+ .topic-card { background:#13161e; border:1px solid #1f2333; border-left:3px solid #4d9de0;
57
+ border-radius:4px; padding:1rem 1.25rem; margin-bottom:.6rem; transition:border-color .15s; }
58
+ .topic-card:hover { border-left-color:#3dba7a; }
59
+ .topic-label { font-size:.92rem; font-weight:500; color:#e8eaf0; margin-bottom:.35rem; }
60
+ .topic-meta { font-family:'IBM Plex Mono',monospace; font-size:.7rem; color:#5a6480; }
61
+ .topic-kw { font-family:'IBM Plex Mono',monospace; font-size:.68rem; color:#3d6480;
62
+ margin-top:.4rem; line-height:1.5; }
63
+ .val-box { background:#0a2a1a; border:1px solid #1a4a2a; border-radius:6px;
64
+ padding:1.25rem 1.5rem; margin-bottom:1.5rem; }
65
+ .val-box h4 { font-family:'IBM Plex Mono',monospace; font-size:.72rem; font-weight:600;
66
+ letter-spacing:.1em; text-transform:uppercase; color:#3dba7a; margin:0 0 .75rem; }
67
+ .val-row { display:flex; justify-content:space-between; align-items:center;
68
+ padding:.4rem 0; border-bottom:1px solid #1a3a2a; font-size:.8rem; color:#a0b8a8; }
69
+ .val-row:last-child { border-bottom:none; }
70
+ .val-key { color:#5a7a6a; } .val-num { font-family:'IBM Plex Mono',monospace; color:#3dba7a; font-weight:600; }
71
+ .stButton > button { background:#4d9de0!important; color:#0d0f14!important; border:none!important;
72
+ border-radius:3px!important; font-family:'IBM Plex Mono',monospace!important;
73
+ font-size:.78rem!important; font-weight:600!important; letter-spacing:.08em!important;
74
+ text-transform:uppercase!important; padding:.6rem 2rem!important; }
75
+ .stButton > button:hover { background:#3d8ed0!important; }
76
+ .stDownloadButton > button { background:transparent!important; color:#4d9de0!important;
77
+ border:1px solid #1a4070!important; border-radius:3px!important;
78
+ font-family:'IBM Plex Mono',monospace!important; font-size:.72rem!important; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  </style>
80
  """, unsafe_allow_html=True)
81
 
82
  # ── Header ───────────────────────────────────────────────────────────────────
83
  st.markdown("""
84
  <div class="site-header">
85
+ <h1>SPECTER-2 Topic Analyzer</h1>
86
+ <p>SPECTER-2 embeddings &nbsp;·&nbsp; Bayesian UMAP+HDBSCAN &nbsp;·&nbsp;
87
+ 3-LLM Council (Groq + Mistral + Gemini)</p>
88
  </div>
89
  """, unsafe_allow_html=True)
90
 
91
  # ── Sidebar ──────────────────────────────────────────────────────────────────
92
  with st.sidebar:
93
  st.markdown("### API Keys")
94
+ groq_key_in = st.text_input("Groq API Key", type="password")
95
+ mistral_key_in = st.text_input("Mistral API Key", type="password")
96
+ gemini_key_in = st.text_input("Gemini API Key", type="password")
97
  st.caption("Keys are never stored. Leave blank to use env vars.")
98
 
99
  st.markdown("---")
100
+ st.markdown("### Bayesian Optimisation")
101
+ n_trials = st.slider("Optuna trials", 20, 100, 50,
102
+ help="§3.4: 50–100 trials recommended")
 
 
 
 
 
103
  st.markdown(
104
+ "<span class='pill pill-blue'>Max mass ≤ 25%</span>"
105
+ "<span class='pill pill-blue'>Min size ≥ 5</span>",
106
+ unsafe_allow_html=True)
107
 
108
  st.markdown("---")
109
  st.markdown("### LLM Council")
110
  st.markdown("""
111
+ <div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:1rem;">
112
  <span class="pill pill-blue">Groq / LLaMA-3.1</span>
113
  <span class="pill pill-green">Mistral Small</span>
114
  <span class="pill pill-amber">Gemini 2.5 Flash</span>
115
  </div>
116
+ <p style="font-size:.72rem;color:#5a6480;font-family:'IBM Plex Mono',monospace;">
117
+ Sheet 1–3 per LLM · Sheet 4 consolidation<br>
118
+ Triple / Two / Single agreement tags<br>
119
+ Defence prompt for disagreement clusters
120
  </p>
121
  """, unsafe_allow_html=True)
122
 
123
  st.markdown("---")
124
  if st.button("Reset Results", use_container_width=True):
125
+ for k in ["results", "agent_out", "topic_data"]:
126
+ st.session_state.pop(k, None)
127
  st.rerun()
128
 
129
+ groq_key = groq_key_in.strip() or os.getenv("GROQ_API_KEY")
130
+ mistral_key = mistral_key_in.strip() or os.getenv("MISTRAL_API_KEY")
131
+ gemini_key = gemini_key_in.strip() or os.getenv("GEMINI_API_KEY")
132
 
133
+ # ── Upload ───────────────────────────────────────────────────────────────────
134
  st.markdown("<div class='section-title'>Dataset</div>", unsafe_allow_html=True)
135
+ col_up, col_s = st.columns([3, 1])
 
136
  with col_up:
137
+ uploaded = st.file_uploader(
138
+ "Upload Scopus CSV (must have 'title' + 'abstract')", type=["csv"])
139
+ with col_s:
 
 
 
140
  st.markdown("<br>", unsafe_allow_html=True)
141
+ use_sample = st.checkbox("Use sample dataset (50 papers)")
142
+
143
+ if uploaded and not use_sample:
144
+ dfp = pd.read_csv(uploaded); uploaded.seek(0)
145
+ c1, c2, c3 = st.columns(3)
146
+ c1.metric("Papers", len(dfp))
147
+ c2.metric("Columns", len(dfp.columns))
148
+ ok = {"title","abstract"}.issubset(set(dfp.columns.str.lower()))
149
+ c3.metric("Title+Abstract", "✓" if ok else "✗")
150
+ if not ok:
151
+ st.error("CSV must have 'title' and 'abstract' columns.")
152
+
153
+ # ── Run ──────────────────────────────────────────────────────────────────────
 
 
 
 
154
  st.markdown("<br>", unsafe_allow_html=True)
155
  run_btn = st.button("▶ Run Full Pipeline", type="primary")
156
 
157
  if run_btn:
158
+ missing = []
159
+ if not groq_key: missing.append("Groq")
160
+ if not mistral_key: missing.append("Mistral")
161
+ if not gemini_key: missing.append("Gemini")
162
+ if missing:
163
+ st.error(f"Missing key(s): {', '.join(missing)}")
 
164
  st.stop()
165
+ if not use_sample and not uploaded:
166
+ st.error("Upload a CSV or enable sample dataset.")
 
167
  st.stop()
168
 
169
+ # Prepare CSV
170
  if use_sample:
 
171
  rng = np.random.default_rng(42)
172
+ pool = [
173
+ ("Deep Learning for Healthcare Prediction",
174
+ "We apply LSTM networks to predict patient readmission from EHR data."),
175
+ ("Process Mining in Enterprise Systems",
176
+ "Event log analysis using Petri nets for conformance checking in ERP workflows."),
177
+ ("Recommender Systems Collaborative Filtering",
178
+ "Matrix factorization techniques applied to e-commerce product recommendation."),
179
+ ("LLM Applications in Information Systems",
180
+ "GPT-4 used for automated requirements extraction from stakeholder documents."),
181
+ ("Blockchain Smart Contract Security",
182
+ "Formal verification of Solidity smart contracts for financial transaction safety."),
183
+ ("Federated Learning Privacy Preservation",
184
+ "Differential privacy mechanisms for distributed model training across hospitals."),
185
+ ("Cybersecurity Intrusion Detection",
186
+ "Random forest classifiers for network anomaly detection in enterprise environments."),
187
+ ("NLP Sentiment Analysis",
188
+ "BERT fine-tuning for aspect-level sentiment analysis in product reviews."),
189
+ ("Knowledge Graph Embedding",
190
+ "TransE and RotatE models for biomedical entity relation prediction."),
191
+ ("Computer Vision Medical Imaging",
192
+ "CNN architectures for diabetic retinopathy grading from fundus photographs."),
193
  ]
194
+ rows = [{"title": t, "abstract": a + f" Study {i+1}.",
195
+ "doi": f"10.1145/sample.{i+1}"}
196
+ for i, (t, a) in enumerate(pool * 5)]
197
+ dfs = pd.DataFrame(rows)
 
198
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
199
+ dfs.to_csv(tmp.name, index=False); csv_path = tmp.name
 
200
  else:
201
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
202
+ tmp.write(uploaded.read()); tmp.flush(); csv_path = tmp.name
203
+
204
+ # Step 1 — Topic modelling + Bayesian optimisation
205
+ pbar = st.progress(0, text="Step 1/2 — SPECTER-2 embed + Bayesian UMAP/HDBSCAN…")
206
+ def _progress(cur, total, entry):
207
+ pct = int(cur / total * 45)
208
+ txt = (f"Trial {cur}/{total} — "
209
+ f"{'PASS' if entry['discipline_pass'] else 'FAIL'} — "
210
+ f"{entry['n_clusters']} clusters")
211
+ pbar.progress(min(pct, 49), text=txt)
212
  try:
213
+ topic_data = run_topic_modeling(csv_path, n_trials=n_trials,
214
+ progress_callback=_progress)
215
+ nc = topic_data["discipline"]["n_clusters"]
216
+ pbar.progress(50, text=f"Step 1 done — {nc} clusters, "
217
+ f"{topic_data['n_trials_run']} trials.")
218
+ except Exception as e:
219
+ st.error(f"Topic modelling failed: {e}")
220
  st.stop()
221
 
222
+ # Step 2 — LLM Council
223
+ pbar.progress(55, text="Step 2/2 — 3-LLM Council labelling…")
224
  try:
225
+ agent_out = run_agent(topic_data, groq_key, mistral_key, gemini_key)
226
+ pbar.progress(100, text="Pipeline complete.")
227
+ st.session_state["topic_data"] = topic_data
228
+ st.session_state["agent_out"] = agent_out
229
+ st.success(f"Done — {len(agent_out['interpretations'])} clusters labelled.")
230
+ except Exception as e:
231
+ st.error(f"LLM Council failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  st.stop()
233
 
 
 
 
234
 
235
+ # ── Results ──────────────────────────────────────────────────────────────────
236
+ td = st.session_state.get("topic_data")
237
+ ao = st.session_state.get("agent_out")
238
+
239
+ if td and ao:
240
+ interps = ao["interpretations"]
241
+ disc = td["discipline"]
242
+ met = td["metrics"]
243
+
244
+ # ── Summary stats ────────────────────────────────────────────────────
245
+ st.markdown("<div class='section-title'>Pipeline Summary</div>",
246
+ unsafe_allow_html=True)
247
+ n_topics = disc["n_clusters"]
248
+ strong_total = sum(i.strong_count for i in interps.values())
249
+ weak_total = sum(i.weak_count for i in interps.values())
250
+ total_papers = strong_total + weak_total
251
+ strong_pct = round(strong_total / max(total_papers, 1) * 100)
252
 
 
 
253
  st.markdown(f"""
254
  <div class="stat-grid">
255
+ <div class="stat-card"><div class="stat-val">{n_topics}</div>
256
+ <div class="stat-label">Clusters</div></div>
257
+ <div class="stat-card"><div class="stat-val">{total_papers}</div>
258
+ <div class="stat-label">Papers assigned</div></div>
259
+ <div class="stat-card"><div class="stat-val">{strong_pct}%</div>
260
+ <div class="stat-label">Strong members</div></div>
261
+ <div class="stat-card"><div class="stat-val">{round(met['persistence'],3)}</div>
262
+ <div class="stat-label">Persistence</div></div>
263
+ <div class="stat-card"><div class="stat-val">{round(met['dbcv'],3)}</div>
264
+ <div class="stat-label">DBCV</div></div>
 
 
 
 
 
 
265
  </div>
266
  """, unsafe_allow_html=True)
267
 
268
+ # ── Discipline + metrics panel ───────────────────────────────────────
269
+ st.markdown("<div class='section-title'>Discipline & Quality</div>",
270
+ unsafe_allow_html=True)
 
271
  st.markdown(f"""
272
  <div class="val-box">
273
+ <h4>§3.2 Hard Constraints + §3.4 Quality Criteria</h4>
274
+ <div class="val-row"><span class="val-key">Max cluster mass ≤ 25%</span>
275
+ <span class="val-num">{'✅ PASS' if disc['max_mass_ok'] else '❌ FAIL'}
276
+ ({round(disc['max_mass_pct']*100,1)}%)</span></div>
277
+ <div class="val-row"><span class="val-key">Min cluster size ≥ 5</span>
278
+ <span class="val-num">{'✅ PASS' if disc['min_size_ok'] else '❌ FAIL'}
279
+ (min={disc['min_size']})</span></div>
280
+ <div class="val-row"><span class="val-key">HDBSCAN Persistence</span>
281
+ <span class="val-num">{round(met['persistence'],4)}</span></div>
282
+ <div class="val-row"><span class="val-key">DBCV</span>
283
+ <span class="val-num">{round(met['dbcv'],4)}</span></div>
284
+ <div class="val-row"><span class="val-key">Stability (ARI, 5 seeds)</span>
285
+ <span class="val-num">{round(met['stability'],4)}</span></div>
286
+ <div class="val-row"><span class="val-key">Bayesian trials run</span>
287
+ <span class="val-num">{td['n_trials_run']} (best = #{td['best_trial']})</span></div>
288
+ <div class="val-row"><span class="val-key">Noise papers (−1)</span>
289
+ <span class="val-num">{disc['n_noise']}</span></div>
290
  </div>
291
  """, unsafe_allow_html=True)
292
 
293
+ # ── Best params ──────────────────────────────────────────────────────
294
+ with st.expander("Winning UMAP + HDBSCAN parameters", expanded=False):
295
+ bp = td["best_params"]
296
+ pdf = pd.DataFrame([
297
+ {"Parameter": "UMAP.n_neighbors", "Value": bp["n_neighbors"]},
298
+ {"Parameter": "UMAP.n_components", "Value": bp["n_components"]},
299
+ {"Parameter": "UMAP.min_dist", "Value": 0.0},
300
+ {"Parameter": "UMAP.metric", "Value": "cosine"},
301
+ {"Parameter": "HDBSCAN.min_cluster_size",
302
+ "Value": bp["min_cluster_size"]},
303
+ {"Parameter": "HDBSCAN.min_samples", "Value": bp["min_samples"]},
304
+ {"Parameter": "HDBSCAN.cluster_selection_method",
305
+ "Value": bp["csm"]},
306
+ {"Parameter": "HDBSCAN.cluster_selection_epsilon",
307
+ "Value": bp["cse"]},
308
+ ])
309
+ st.dataframe(pdf, use_container_width=True, hide_index=True)
310
+
311
+ # ── 2-D UMAP scatter ────────────────────────────────────────────────
312
+ st.markdown("<div class='section-title'>2-D UMAP Visualisation</div>",
313
+ unsafe_allow_html=True)
314
+ umap2d = np.array(td["umap_2d"])
315
+ labels_arr = np.array(td["labels"])
316
+ scatter_df = pd.DataFrame({
317
+ "UMAP-1": umap2d[:, 0], "UMAP-2": umap2d[:, 1],
318
+ "Cluster": [str(l) for l in labels_arr],
319
+ "Doc": [d[:80]+"…" for d in td["documents"]],
320
+ })
321
+ fig = px.scatter(scatter_df, x="UMAP-1", y="UMAP-2", color="Cluster",
322
+ hover_data=["Doc"], opacity=0.75,
323
+ title="SPECTER-2 embeddings (2-D UMAP, min_dist=0.1)")
324
+ fig.update_layout(
325
+ template="plotly_dark",
326
+ paper_bgcolor="#0d0f14", plot_bgcolor="#13161e",
327
+ font=dict(family="IBM Plex Mono", size=11),
328
+ height=520,
329
+ )
330
+ st.plotly_chart(fig, use_container_width=True)
331
+
332
+ # ── Pareto front ─────────────────────────────────────────────────────
333
+ with st.expander("Bayesian trial log & Pareto front", expanded=False):
334
+ tl = td["trial_log"]
335
+ tl_df = pd.DataFrame(tl)
336
+ if not tl_df.empty:
337
+ tl_df["colour"] = tl_df["discipline_pass"].map(
338
+ {True: "PASS", False: "FAIL"})
339
+ fig2 = px.scatter(
340
+ tl_df, x="persistence", y="dbcv", color="colour",
341
+ hover_data=["trial", "n_clusters", "max_mass_pct"],
342
+ color_discrete_map={"PASS": "#3dba7a", "FAIL": "#e04d4d"},
343
+ title="Pareto front — Persistence vs DBCV",
344
+ )
345
+ fig2.add_vline(x=0, line_dash="dash", line_color="#5a6480")
346
+ fig2.update_layout(
347
+ template="plotly_dark",
348
+ paper_bgcolor="#0d0f14", plot_bgcolor="#13161e",
349
+ font=dict(family="IBM Plex Mono", size=11), height=400)
350
+ st.plotly_chart(fig2, use_container_width=True)
351
+ st.dataframe(tl_df[["trial", "discipline_pass", "n_clusters",
352
+ "persistence", "dbcv", "max_mass_pct",
353
+ "min_size", "n_noise"]],
354
+ use_container_width=True, height=300)
355
+
356
+ # ── Cluster table (strong / weak) ────────────────────────────────────
357
+ st.markdown("<div class='section-title'>Cluster Results</div>",
358
+ unsafe_allow_html=True)
359
  rows = []
360
+ for cid in sorted(interps.keys()):
361
+ i = interps[cid]
362
  rows.append({
363
+ "Cluster": cid,
364
+ "Label": i.final_label,
365
+ "Agreement": i.agreement,
366
+ "PAJAIS": i.final_pacis_match,
367
+ "Strong": i.strong_count,
368
+ "Weak": i.weak_count,
369
+ "Total": i.paper_count,
370
+ "Confidence": round(i.final_confidence, 2),
371
+ "Grounding": i.grounding_check.get("verdict", "?"),
372
+ "Keyphrases": ", ".join(i.keyphrases[:5]),
373
  })
374
+ df_res = pd.DataFrame(rows).sort_values("Total", ascending=False
375
+ ).reset_index(drop=True)
376
+ st.dataframe(df_res, use_container_width=True, height=420)
377
+
378
+ # ── Topic cards ──────────────────────────────────────────────────────
379
+ with st.expander("Topic cards (detailed)", expanded=False):
380
+ for _, row in df_res.iterrows():
381
+ ag_pill = {"Triple": "pill-green", "Two": "pill-blue",
382
+ "Single": "pill-amber"}.get(row["Agreement"], "pill-gray")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
383
  st.markdown(f"""
384
+ <div class="topic-card">
385
  <div class="topic-label">{row['Label']}</div>
386
  <div class="topic-meta">
387
+ <span class="pill {ag_pill}">{row['Agreement']}</span>
388
+ <span class="pill pill-gray">{row['PAJAIS']}</span>
389
+ <span class="pill pill-blue">{row['Strong']}S / {row['Weak']}W</span>
390
+ <span class="pill pill-gray">Ground: {row['Grounding']}</span>
391
  </div>
392
+ <div class="topic-kw">{row['Keyphrases']}</div>
393
+ </div>""", unsafe_allow_html=True)
394
+
395
+ # ── LLM Council Sheets ───────────────────────────────────────────────
396
+ with st.expander("LLM Council — Sheets 1-4", expanded=False):
397
+ sheet_rows = []
398
+ for cid in sorted(interps.keys()):
399
+ i = interps[cid]
400
+ for sn, sheet in [("Sheet 1 (Groq)", i.sheet1),
401
+ ("Sheet 2 (Mistral)", i.sheet2),
402
+ ("Sheet 3 (Gemini)", i.sheet3)]:
403
+ sheet_rows.append({
404
+ "Cluster": cid, "Sheet": sn,
405
+ "Label": sheet.get("label", "—"),
406
+ "PAJAIS": sheet.get("pacis_match", "—"),
407
+ "Conf": sheet.get("confidence", "—"),
408
+ })
409
+ sheet_rows.append({
410
+ "Cluster": cid, "Sheet": "Sheet 4 (Final)",
411
+ "Label": i.final_label,
412
+ "PAJAIS": i.final_pacis_match,
413
+ "Conf": i.final_confidence,
414
+ })
415
+ st.dataframe(pd.DataFrame(sheet_rows), use_container_width=True,
416
+ height=400)
417
+
418
+ # ── Downloads ────────────────────────────────────────────────────────
419
+ st.markdown("<div class='section-title'>Downloads</div>",
420
+ unsafe_allow_html=True)
421
+ c1, c2, c3, c4 = st.columns(4)
422
+ with c1:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
423
  try:
424
+ with open(ao["json_path"]) as f:
425
+ st.download_button("⬇ topics.json", f.read(),
426
+ "topics.json", "application/json",
427
+ use_container_width=True)
 
 
 
 
428
  except Exception:
429
+ st.warning("JSON not found.")
430
+ with c2:
431
+ st.download_button("⬇ results.csv",
432
+ df_res.to_csv(index=False),
433
+ "results.csv", "text/csv",
434
+ use_container_width=True)
435
+ with c3:
436
+ tl_csv = pd.DataFrame(td["trial_log"]).to_csv(index=False)
437
+ st.download_button("⬇ trial_log.csv", tl_csv,
438
+ "trial_log.csv", "text/csv",
439
+ use_container_width=True)
440
+ with c4:
441
+ bp_json = json.dumps(td["best_params"], indent=2)
442
+ st.download_button("⬇ best_params.json", bp_json,
443
+ "best_params.json", "application/json",
444
+ use_container_width=True)
445
+
446
+ elif not td:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
447
  st.markdown("""
448
  <div style="text-align:center;padding:4rem 2rem;border:1px dashed #1f2333;border-radius:6px;margin-top:2rem;">
449
+ <p style="font-family:'IBM Plex Mono',monospace;font-size:.8rem;color:#3a4060;letter-spacing:.1em;">
450
  UPLOAD CSV → ENTER API KEYS → RUN PIPELINE
451
  </p>
452
+ <p style="font-size:.75rem;color:#2a3050;margin-top:.5rem;">
453
+ SPECTER-2 → Bayesian UMAP+HDBSCAN (50–100 trials) → 3-LLM Council
454
  </p>
455
  </div>
456
+ """, unsafe_allow_html=True)