anujjuna committed on
Commit
0a624b3
·
verified ·
1 Parent(s): 099d241

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +530 -485
app.py CHANGED
@@ -1,436 +1,403 @@
1
  """
2
  app.py
3
  ------
4
- Streamlit UI for the BERTopic + Dual LLM (Groq + Mistral) research paper analysis pipeline.
5
- Redesigned with a clean, dark editorial aesthetic.
6
  """
7
 
8
  import os
9
-
10
- # Must be set before streamlit imports so HF Spaces proxy can reach the app
11
- os.environ["STREAMLIT_SERVER_PORT"] = "7860"
12
- os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"
13
- os.environ["STREAMLIT_SERVER_ENABLE_CORS"] = "false"
14
- os.environ["STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION"] = "false"
15
-
16
  import json
17
  import tempfile
18
-
19
  import pandas as pd
20
  import streamlit as st
21
 
22
  from tools import run_topic_modeling
23
  from agent import run_agent
24
 
25
- # ---------------------------------------------------------------------------
26
- # Page Config & Global Styles
27
- # ---------------------------------------------------------------------------
28
  st.set_page_config(
29
- page_title="Arxiv Lens · Topic Analyzer",
30
- page_icon="🔬",
31
  layout="wide",
32
  initial_sidebar_state="expanded",
33
  )
34
 
 
35
  st.markdown("""
36
  <style>
37
- @import url('https://fonts.googleapis.com/css2?family=DM+Serif+Display:ital@0;1&family=DM+Mono:wght@400;500&family=DM+Sans:wght@300;400;500&display=swap');
38
 
39
- /* ── Global Reset ─────────────────────────────────────────── */
40
  html, body, [class*="css"] {
41
- font-family: 'DM Sans', sans-serif;
42
  }
43
 
 
44
  .stApp {
45
- background-color: #0d0f14;
46
- color: #e8e4dc;
47
  }
48
 
49
- /* ── Sidebar ──────────────────────────────────────────────── */
50
  [data-testid="stSidebar"] {
51
- background-color: #111318 !important;
52
- border-right: 1px solid #1e2028;
53
  }
54
-
55
  [data-testid="stSidebar"] * {
56
- color: #c8c4bc !important;
57
  }
58
-
59
- .sidebar-logo {
60
- font-family: 'DM Serif Display', serif;
61
- font-size: 1.5rem;
62
- color: #f0ebe0 !important;
63
- letter-spacing: -0.02em;
64
- margin-bottom: 0.2rem;
65
- }
66
-
67
- .sidebar-tagline {
68
- font-size: 0.72rem;
69
- color: #5a5f6e !important;
70
- text-transform: uppercase;
71
- letter-spacing: 0.12em;
72
- margin-bottom: 1.5rem;
73
  }
74
 
75
- /* ── Header ───────────────────────────────────────────────── */
76
- .hero {
77
  padding: 2.5rem 0 1.5rem 0;
78
- border-bottom: 1px solid #1e2028;
79
  margin-bottom: 2rem;
80
  }
81
-
82
- .hero-title {
83
- font-family: 'DM Serif Display', serif;
84
- font-size: 2.8rem;
85
- color: #f0ebe0;
86
- letter-spacing: -0.03em;
87
- line-height: 1.1;
 
 
 
 
 
88
  margin: 0;
 
89
  }
90
 
91
- .hero-title em {
92
- font-style: italic;
93
- color: #c8a97e;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  }
95
-
96
- .hero-sub {
97
- font-size: 0.88rem;
98
- color: #5a5f6e;
99
- margin-top: 0.5rem;
 
 
 
 
 
 
 
 
 
 
 
100
  text-transform: uppercase;
101
  letter-spacing: 0.1em;
 
102
  }
103
 
104
- /* ── Key Pill ─────────────────────────────────────────────── */
105
- .key-required {
106
- display: inline-block;
107
- background: #1a1d25;
108
- border: 1px solid #2e3240;
109
- border-radius: 4px;
110
- padding: 0.15rem 0.5rem;
111
- font-family: 'DM Mono', monospace;
112
- font-size: 0.72rem;
113
- color: #c8a97e;
114
- margin-bottom: 0.4rem;
115
  }
116
 
117
- .key-optional {
118
- display: inline-block;
119
- background: #1a1d25;
120
- border: 1px solid #2e3240;
 
121
  border-radius: 4px;
122
- padding: 0.15rem 0.5rem;
123
- font-family: 'DM Mono', monospace;
124
- font-size: 0.72rem;
125
- color: #5a8a6e;
126
- margin-bottom: 0.4rem;
 
 
 
 
 
 
127
  }
128
-
129
- /* ── Section Headers ──────────────────────────────────────── */
130
- .section-label {
131
- font-family: 'DM Mono', monospace;
132
- font-size: 0.68rem;
133
- color: #5a5f6e;
134
- text-transform: uppercase;
135
- letter-spacing: 0.14em;
136
- margin-bottom: 0.75rem;
137
- padding-bottom: 0.4rem;
138
- border-bottom: 1px solid #1e2028;
139
  }
140
-
141
- /* ── Stat Cards ───────────────────────────────────────────── */
142
- .stat-card {
143
- background: #111318;
144
- border: 1px solid #1e2028;
145
- border-radius: 8px;
146
- padding: 1.2rem 1.4rem;
147
- margin-bottom: 0.75rem;
148
  }
149
 
150
- .stat-number {
151
- font-family: 'DM Serif Display', serif;
152
- font-size: 2.4rem;
153
- color: #c8a97e;
154
- line-height: 1;
 
 
155
  }
156
-
157
- .stat-label {
158
- font-size: 0.75rem;
159
- color: #5a5f6e;
160
- text-transform: uppercase;
161
  letter-spacing: 0.1em;
162
- margin-top: 0.3rem;
 
 
163
  }
164
-
165
- /* ── Pipeline Step Badges ─────────────────────────────────── */
166
- .step-row {
167
  display: flex;
 
168
  align-items: center;
169
- gap: 1rem;
170
- margin-bottom: 0.5rem;
 
 
171
  }
 
 
 
172
 
173
- .step-num {
174
- font-family: 'DM Mono', monospace;
175
- font-size: 0.7rem;
176
- color: #0d0f14;
177
- background: #c8a97e;
178
- border-radius: 50%;
179
- width: 1.4rem;
180
- height: 1.4rem;
181
  display: flex;
182
- align-items: center;
183
- justify-content: center;
184
- flex-shrink: 0;
185
- font-weight: 500;
186
- }
187
-
188
- .step-text {
189
- font-size: 0.82rem;
190
- color: #8a8f9e;
191
  }
192
 
193
- /* ── Buttons ──────────────────────────────────────────────── */
194
  .stButton > button {
195
- background: #c8a97e !important;
196
  color: #0d0f14 !important;
197
  border: none !important;
198
- border-radius: 6px !important;
199
- font-family: 'DM Mono', monospace !important;
200
- font-size: 0.8rem !important;
201
- font-weight: 500 !important;
202
  letter-spacing: 0.08em !important;
203
  text-transform: uppercase !important;
204
- padding: 0.6rem 1.4rem !important;
205
- transition: all 0.15s ease !important;
206
  }
207
-
208
  .stButton > button:hover {
209
- background: #debb94 !important;
210
- transform: translateY(-1px) !important;
211
- }
212
-
213
- /* ── Inputs ───────────────────────────────────────────────── */
214
- .stTextInput > div > div > input,
215
- .stSelectbox > div > div,
216
- .stSlider {
217
- background-color: #111318 !important;
218
- border-color: #2e3240 !important;
219
- color: #e8e4dc !important;
220
  }
221
 
222
- /* ── Dataframe ────────────────────────────────────────────── */
223
- [data-testid="stDataFrame"] {
224
- border: 1px solid #1e2028 !important;
225
- border-radius: 8px !important;
226
- overflow: hidden;
227
- }
228
-
229
- /* ── Upload zone ──────────────────────────────────────────── */
230
- [data-testid="stFileUploader"] {
231
- background: #111318;
232
- border: 1px dashed #2e3240 !important;
233
- border-radius: 8px;
234
- }
235
-
236
- /* ── Expanders ────────────────────────────────────────────── */
237
- .streamlit-expanderHeader {
238
- background-color: #111318 !important;
239
- border: 1px solid #1e2028 !important;
240
- border-radius: 6px !important;
241
- color: #c8c4bc !important;
242
  font-size: 0.82rem !important;
 
243
  }
244
 
245
- /* ── Tabs ─────────────────────────────────────────────────── */
246
- .stTabs [data-baseweb="tab-list"] {
247
- gap: 0;
248
- border-bottom: 1px solid #1e2028;
249
- background: transparent;
250
- }
251
-
252
- .stTabs [data-baseweb="tab"] {
253
- font-family: 'DM Mono', monospace;
254
- font-size: 0.75rem;
255
- text-transform: uppercase;
256
- letter-spacing: 0.1em;
257
- color: #5a5f6e !important;
258
- background: transparent !important;
259
- border: none !important;
260
- padding: 0.6rem 1.2rem;
261
- }
262
-
263
- .stTabs [aria-selected="true"] {
264
- color: #c8a97e !important;
265
- border-bottom: 2px solid #c8a97e !important;
266
- }
267
-
268
- /* ── Success / Error ──────────────────────────────────────── */
269
- .stSuccess {
270
- background: #0d1f16 !important;
271
- border-left: 3px solid #4caf7d !important;
272
- border-radius: 4px !important;
273
- }
274
-
275
- .stError {
276
- background: #1f0d0d !important;
277
- border-left: 3px solid #cf4f4f !important;
278
- border-radius: 4px !important;
279
  }
280
 
281
- /* ── Download buttons ─────────────────────────────────────── */
282
  .stDownloadButton > button {
283
  background: transparent !important;
284
- color: #c8a97e !important;
285
- border: 1px solid #c8a97e !important;
286
- border-radius: 6px !important;
287
- font-family: 'DM Mono', monospace !important;
288
- font-size: 0.75rem !important;
289
  letter-spacing: 0.08em !important;
290
  }
291
 
292
- .stDownloadButton > button:hover {
293
- background: #c8a97e22 !important;
 
 
 
 
 
294
  }
295
 
296
- /* ── Divider ──────────────────────────────────────────────── */
297
- hr {
298
- border-color: #1e2028 !important;
299
- margin: 1.5rem 0 !important;
300
- }
 
 
 
301
  </style>
302
  """, unsafe_allow_html=True)
303
 
304
- # ---------------------------------------------------------------------------
305
- # Sidebar
306
- # ---------------------------------------------------------------------------
307
- with st.sidebar:
308
- st.markdown('<div class="sidebar-logo">Arxiv Lens</div>', unsafe_allow_html=True)
309
- st.markdown('<div class="sidebar-tagline">Research Topic Analyzer</div>', unsafe_allow_html=True)
310
-
311
- st.markdown('<div class="section-label">API Keys</div>', unsafe_allow_html=True)
312
-
313
- st.markdown('<span class="key-required">REQUIRED · GROQ</span>', unsafe_allow_html=True)
314
- groq_key_input = st.text_input(
315
- "Groq API Key",
316
- value="",
317
- type="password",
318
- placeholder="gsk_...",
319
- label_visibility="collapsed",
320
- )
321
-
322
- st.markdown('<span class="key-optional">OPTIONAL · MISTRAL</span>', unsafe_allow_html=True)
323
- mistral_key_input = st.text_input(
324
- "Mistral API Key",
325
- value="",
326
- type="password",
327
- placeholder="For dual-LLM validation",
328
- label_visibility="collapsed",
329
- )
330
 
331
- st.caption("Keys are never stored. Falls back to env vars if blank.")
 
 
 
 
 
 
332
 
333
  st.markdown("---")
334
- st.markdown('<div class="section-label">Model Settings</div>', unsafe_allow_html=True)
335
- min_topic_size = st.slider("Min Topic Size", min_value=3, max_value=30, value=5)
 
 
 
 
 
 
 
 
 
 
336
 
337
  st.markdown("---")
338
- st.markdown('<div class="section-label">Pipeline</div>', unsafe_allow_html=True)
339
- for i, step in enumerate([
340
- "BERTopic clusters abstracts + titles",
341
- "Groq LLM labels each cluster",
342
- "Mistral validates Groq's labels",
343
- "Cross-source diff report generated",
344
- ], 1):
345
- st.markdown(f"""
346
- <div class="step-row">
347
- <div class="step-num">{i}</div>
348
- <div class="step-text">{step}</div>
349
- </div>
350
- """, unsafe_allow_html=True)
351
 
352
  st.markdown("---")
353
- if st.button("↺ Reset Results", use_container_width=True):
354
- if "agent_results" in st.session_state:
355
- del st.session_state["agent_results"]
356
  st.rerun()
357
 
358
- groq_api_key = groq_key_input.strip() or os.getenv("GROQ_API_KEY")
359
  mistral_api_key = mistral_key_input.strip() or os.getenv("MISTRAL_API_KEY")
 
360
 
361
- # ---------------------------------------------------------------------------
362
- # Hero
363
- # ---------------------------------------------------------------------------
364
- st.markdown("""
365
- <div class="hero">
366
- <h1 class="hero-title">Research<br><em>Topic Intelligence</em></h1>
367
- <p class="hero-sub">BERTopic · Groq llama-3.1 · Mistral Validation</p>
368
- </div>
369
- """, unsafe_allow_html=True)
370
-
371
- # ---------------------------------------------------------------------------
372
- # Dataset Input
373
- # ---------------------------------------------------------------------------
374
- st.markdown('<div class="section-label">Dataset</div>', unsafe_allow_html=True)
375
 
376
- col_a, col_b = st.columns([3, 1])
377
- with col_a:
378
  uploaded_file = st.file_uploader(
379
- "Upload a CSV with **title** and **abstract** columns",
380
  type=["csv"],
381
- help="Must have at minimum 'title' and 'abstract' columns. More rows = richer topics.",
382
  )
383
- with col_b:
384
  st.markdown("<br>", unsafe_allow_html=True)
385
- use_sample = st.checkbox("Use built-in sample dataset", value=False)
386
-
387
- st.markdown("---")
388
-
389
- # ---------------------------------------------------------------------------
390
- # Run Pipeline
391
- # ---------------------------------------------------------------------------
392
- run_btn = st.button("▶ Run Analysis Pipeline", use_container_width=False)
 
 
 
 
 
 
 
 
 
 
 
393
 
394
  if run_btn:
395
- if not groq_api_key:
396
- st.error("**Groq API key required.** Enter it in the sidebar or set `GROQ_API_KEY` in your environment.")
 
 
 
 
 
397
  st.stop()
398
 
399
  if not use_sample and uploaded_file is None:
400
- st.error("**No dataset.** Upload a CSV or enable the sample dataset.")
401
  st.stop()
402
 
403
- # Resolve CSV path
404
  if use_sample:
405
- sample_data = {
406
- "title": [
407
- "Deep Learning for Image Classification",
408
- "Neural Networks in Healthcare",
409
- "Transformer Models for NLP",
410
- "BERT in Question Answering",
411
- "Blockchain and Distributed Ledger Technology",
412
- "Smart Contracts in Finance",
413
- "Federated Learning for Privacy",
414
- "Differential Privacy in ML",
415
- "Graph Neural Networks",
416
- "Knowledge Graph Embeddings",
417
- ],
418
- "abstract": [
419
- "We propose a deep learning model achieving state-of-the-art accuracy on image benchmarks.",
420
- "A convolutional network trained for medical image classification tasks.",
421
- "We introduce a transformer-based approach for text understanding.",
422
- "Fine-tuning BERT achieves strong results on reading comprehension datasets.",
423
- "This paper surveys blockchain consensus mechanisms and distributed ledger architectures.",
424
- "We implement smart contracts for automated financial transactions on a public blockchain.",
425
- "Federated learning enables collaborative model training without sharing raw data.",
426
- "Differential privacy provides formal privacy guarantees for machine learning models.",
427
- "Graph neural networks learn from relational data structures effectively.",
428
- "Knowledge graph embeddings enable link prediction and entity classification.",
429
- ],
430
- }
431
- df_sample = pd.DataFrame(sample_data)
432
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
433
- df_sample.to_csv(tmp.name, index=False)
434
  csv_path = tmp.name
435
  else:
436
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
@@ -438,177 +405,255 @@ if run_btn:
438
  tmp.flush()
439
  csv_path = tmp.name
440
 
441
- # Step 1 — BERTopic
442
- with st.spinner("🔬 Running BERTopic clustering…"):
443
- try:
444
- topic_results = run_topic_modeling(csv_path, min_topic_size=min_topic_size)
445
- except Exception as exc:
446
- st.error(f"**Topic modeling failed:** {exc}")
447
- st.stop()
 
 
448
 
449
- abstract_res = topic_results["abstracts"]
450
- title_res = topic_results["titles"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
451
 
452
- df = pd.read_csv(csv_path)
453
- df.columns = df.columns.str.lower()
454
- raw_titles = df["title"].fillna("").tolist()
455
- raw_abstracts = df["abstract"].fillna("").tolist()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
456
 
457
- # Step 2 — Agent
458
- with st.spinner("🤖 LLM interpretation + Mistral validation…"):
459
- try:
460
- st.session_state["agent_results"] = run_agent(
461
- title_topic_keywords=title_res["topic_keywords"],
462
- abstract_topic_keywords=abstract_res["topic_keywords"],
463
- title_topic_assignments=title_res["topics"],
464
- abstract_topic_assignments=abstract_res["topics"],
465
- raw_titles=raw_titles,
466
- raw_abstracts=raw_abstracts,
467
- api_key=groq_api_key,
468
- mistral_api_key=mistral_api_key,
469
- )
470
- st.success("Pipeline complete.")
471
- except Exception as exc:
472
- st.error(f"**Agent pipeline failed:** {exc}")
473
- st.stop()
474
-
475
- # ---------------------------------------------------------------------------
476
- # Results
477
- # ---------------------------------------------------------------------------
478
- agent_results = st.session_state.get("agent_results")
479
-
480
- if agent_results:
481
- title_interps = agent_results.get("title_interpretations", {})
482
- abstract_interps = agent_results.get("abstract_interpretations", {})
483
- comparison_rows = agent_results.get("comparison_rows", [])
484
- taxonomy_map = agent_results.get("taxonomy_map", {})
485
-
486
- # ── Stats Row ──────────────────────────────────────────────────────────
487
- c1, c2, c3, c4 = st.columns(4)
488
- with c1:
489
- st.markdown(f"""
490
- <div class="stat-card">
491
- <div class="stat-number">{len(title_interps)}</div>
492
- <div class="stat-label">Title Topics</div>
493
- </div>
494
- """, unsafe_allow_html=True)
495
- with c2:
496
- st.markdown(f"""
497
- <div class="stat-card">
498
- <div class="stat-number">{len(abstract_interps)}</div>
499
- <div class="stat-label">Abstract Topics</div>
500
- </div>
501
- """, unsafe_allow_html=True)
502
- with c3:
503
- agreed = sum(
504
- 1 for i in list(title_interps.values()) + list(abstract_interps.values())
505
- if i.validation_status == "AGREED"
506
- )
507
- st.markdown(f"""
508
- <div class="stat-card">
509
- <div class="stat-number">{agreed}</div>
510
- <div class="stat-label">LLM Agreements</div>
511
- </div>
512
- """, unsafe_allow_html=True)
513
- with c4:
514
- novel = sum(
515
- 1 for i in list(title_interps.values()) + list(abstract_interps.values())
516
- if i.classification == "NOVEL"
517
- )
518
- st.markdown(f"""
519
- <div class="stat-card">
520
- <div class="stat-number">{novel}</div>
521
- <div class="stat-label">Novel Topics</div>
522
- </div>
523
- """, unsafe_allow_html=True)
524
 
525
- st.markdown("---")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
526
 
527
- # ── Main Tabs ──────────────────────────────────────────────────────────
528
- tab1, tab2, tab3, tab4 = st.tabs([
529
- "Title Topics",
530
- "Abstract Topics",
531
- "Taxonomy Map",
532
- "Comparison",
533
- ])
534
-
535
- def _interp_rows(interps):
536
- return [
537
- {
538
- "ID": tid,
539
- "Label": i.label,
540
- "Category": i.taxonomy_category,
541
- "Class": i.classification,
542
- "Validation": i.validation_status,
543
- "Confidence": i.confidence,
544
- "Keywords": ", ".join(i.keywords[:8]),
545
- "Reasoning": i.reasoning,
546
- }
547
- for tid, i in sorted(interps.items())
548
- ]
549
 
550
- with tab1:
551
- st.markdown('<div class="section-label">Topics derived from paper titles</div>', unsafe_allow_html=True)
552
- if title_interps:
553
- st.dataframe(pd.DataFrame(_interp_rows(title_interps)), use_container_width=True, hide_index=True)
554
- else:
555
- st.info("No title topics found.")
556
-
557
- with tab2:
558
- st.markdown('<div class="section-label">Topics derived from paper abstracts</div>', unsafe_allow_html=True)
559
- if abstract_interps:
560
- st.dataframe(pd.DataFrame(_interp_rows(abstract_interps)), use_container_width=True, hide_index=True)
561
- else:
562
- st.info("No abstract topics found.")
563
-
564
- with tab3:
565
- st.markdown('<div class="section-label">Full taxonomy classification</div>', unsafe_allow_html=True)
566
- inner_tabs = st.tabs(["Titles", "Abstracts"])
567
- for itab, section in zip(inner_tabs, ["titles", "abstracts"]):
568
- with itab:
569
- entries = taxonomy_map.get(section, [])
570
- if entries:
571
- st.dataframe(
572
- pd.DataFrame(entries)[[
573
- "topic_id", "label", "taxonomy_category",
574
- "classification", "validation_status", "confidence", "reasoning"
575
- ]],
576
- use_container_width=True,
577
- hide_index=True,
578
- )
579
- else:
580
- st.info(f"No {section} entries.")
581
-
582
- with tab4:
583
- st.markdown('<div class="section-label">Side-by-side title vs abstract topic comparison</div>', unsafe_allow_html=True)
584
- if comparison_rows:
585
- from dataclasses import asdict
586
- st.dataframe(pd.DataFrame([asdict(r) for r in comparison_rows]), use_container_width=True, hide_index=True)
587
- else:
588
- st.info("No overlapping topic IDs between title and abstract sources.")
589
 
590
- st.markdown("---")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
591
 
592
- # ── Downloads ──────────────────────────────────────────────────────────
593
- st.markdown('<div class="section-label">Export Results</div>', unsafe_allow_html=True)
594
- dl1, dl2 = st.columns(2)
595
- with dl1:
596
- st.download_button(
597
- "⬇ taxonomy_map.json",
598
- json.dumps(agent_results["taxonomy_map"], indent=2),
599
- file_name="taxonomy_map.json",
600
- mime="application/json",
601
- key="dl_json",
602
- use_container_width=True,
603
- )
604
- with dl2:
605
- from dataclasses import asdict
606
- comp_df = pd.DataFrame([asdict(r) for r in agent_results["comparison_rows"]])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
607
  st.download_button(
608
- "⬇ comparison.csv",
609
- comp_df.to_csv(index=False),
610
- file_name="comparison.csv",
611
  mime="text/csv",
612
- key="dl_csv",
613
  use_container_width=True,
614
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
  app.py
3
  ------
4
+ Streamlit UI — SPECTER2 + BERTopic + 3-LLM Council
5
+ Research Topic Analyzer for SPJIMR × SPIT Group 14
6
  """
7
 
8
  import os
 
 
 
 
 
 
 
9
  import json
10
  import tempfile
 
11
  import pandas as pd
12
  import streamlit as st
13
 
14
  from tools import run_topic_modeling
15
  from agent import run_agent
16
 
17
+ # ── Page setup ──────────────────────────────────────────────────────────────
 
 
18
  st.set_page_config(
19
+ page_title="TMIS Topic Analyzer",
20
+ page_icon="πŸ“",
21
  layout="wide",
22
  initial_sidebar_state="expanded",
23
  )
24
 
25
+ # ── Custom CSS ───────────────────────────────────────────────────────────────
26
  st.markdown("""
27
  <style>
28
+ @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;600&family=IBM+Plex+Sans:wght@300;400;500;600&display=swap');
29
 
 
30
  html, body, [class*="css"] {
31
+ font-family: 'IBM Plex Sans', sans-serif;
32
  }
33
 
34
+ /* App background */
35
  .stApp {
36
+ background: #0d0f14;
37
+ color: #e8eaf0;
38
  }
39
 
40
+ /* Sidebar */
41
  [data-testid="stSidebar"] {
42
+ background: #13161e;
43
+ border-right: 1px solid #1f2333;
44
  }
 
45
  [data-testid="stSidebar"] * {
46
+ color: #b0b8cc !important;
47
  }
48
+ [data-testid="stSidebar"] h1,
49
+ [data-testid="stSidebar"] h2,
50
+ [data-testid="stSidebar"] h3 {
51
+ color: #e8eaf0 !important;
52
+ font-family: 'IBM Plex Mono', monospace !important;
53
+ font-size: 0.8rem !important;
54
+ letter-spacing: 0.12em !important;
55
+ text-transform: uppercase !important;
 
 
 
 
 
 
 
56
  }
57
 
58
+ /* Header */
59
+ .site-header {
60
  padding: 2.5rem 0 1.5rem 0;
61
+ border-bottom: 1px solid #1f2333;
62
  margin-bottom: 2rem;
63
  }
64
+ .site-header h1 {
65
+ font-family: 'IBM Plex Mono', monospace;
66
+ font-size: 1.6rem;
67
+ font-weight: 600;
68
+ color: #e8eaf0;
69
+ letter-spacing: -0.01em;
70
+ margin: 0 0 0.3rem 0;
71
+ }
72
+ .site-header p {
73
+ font-size: 0.82rem;
74
+ color: #5a6480;
75
+ font-family: 'IBM Plex Mono', monospace;
76
  margin: 0;
77
+ letter-spacing: 0.04em;
78
  }
79
 
80
+ /* Pills / badges */
81
+ .pill {
82
+ display: inline-block;
83
+ font-family: 'IBM Plex Mono', monospace;
84
+ font-size: 0.68rem;
85
+ font-weight: 600;
86
+ letter-spacing: 0.08em;
87
+ text-transform: uppercase;
88
+ padding: 3px 10px;
89
+ border-radius: 2px;
90
+ margin-right: 6px;
91
+ }
92
+ .pill-blue { background: #0f2a4a; color: #4d9de0; border: 1px solid #1a4070; }
93
+ .pill-green { background: #0a2a1a; color: #3dba7a; border: 1px solid #1a4a2a; }
94
+ .pill-amber { background: #2a1f00; color: #e8a020; border: 1px solid #4a3500; }
95
+ .pill-red { background: #2a0f0f; color: #e04d4d; border: 1px solid #4a1a1a; }
96
+ .pill-gray { background: #1a1e2a; color: #7a8090; border: 1px solid #2a2e3a; }
97
+
98
+ /* Stats row */
99
+ .stat-grid {
100
+ display: grid;
101
+ grid-template-columns: repeat(4, 1fr);
102
+ gap: 1px;
103
+ background: #1f2333;
104
+ border: 1px solid #1f2333;
105
+ border-radius: 6px;
106
+ overflow: hidden;
107
+ margin-bottom: 2rem;
108
  }
109
+ .stat-card {
110
+ background: #13161e;
111
+ padding: 1.25rem 1.5rem;
112
+ text-align: center;
113
+ }
114
+ .stat-val {
115
+ font-family: 'IBM Plex Mono', monospace;
116
+ font-size: 1.9rem;
117
+ font-weight: 600;
118
+ color: #e8eaf0;
119
+ line-height: 1;
120
+ margin-bottom: 0.3rem;
121
+ }
122
+ .stat-label {
123
+ font-size: 0.7rem;
124
+ color: #5a6480;
125
  text-transform: uppercase;
126
  letter-spacing: 0.1em;
127
+ font-family: 'IBM Plex Mono', monospace;
128
  }
129
 
130
+ /* Section titles */
131
+ .section-title {
132
+ font-family: 'IBM Plex Mono', monospace;
133
+ font-size: 0.7rem;
134
+ font-weight: 600;
135
+ letter-spacing: 0.15em;
136
+ text-transform: uppercase;
137
+ color: #5a6480;
138
+ padding-bottom: 0.6rem;
139
+ border-bottom: 1px solid #1f2333;
140
+ margin-bottom: 1.2rem;
141
  }
142
 
143
+ /* Topic cards */
144
+ .topic-card {
145
+ background: #13161e;
146
+ border: 1px solid #1f2333;
147
+ border-left: 3px solid #4d9de0;
148
  border-radius: 4px;
149
+ padding: 1rem 1.25rem;
150
+ margin-bottom: 0.6rem;
151
+ transition: border-color 0.15s;
152
+ }
153
+ .topic-card:hover { border-left-color: #3dba7a; }
154
+ .topic-card.novel { border-left-color: #e8a020; }
155
+ .topic-label {
156
+ font-size: 0.92rem;
157
+ font-weight: 500;
158
+ color: #e8eaf0;
159
+ margin-bottom: 0.35rem;
160
  }
161
+ .topic-meta {
162
+ font-family: 'IBM Plex Mono', monospace;
163
+ font-size: 0.7rem;
164
+ color: #5a6480;
 
 
 
 
 
 
 
165
  }
166
+ .topic-kw {
167
+ font-family: 'IBM Plex Mono', monospace;
168
+ font-size: 0.68rem;
169
+ color: #3d6480;
170
+ margin-top: 0.4rem;
171
+ line-height: 1.5;
 
 
172
  }
173
 
174
+ /* Validation panel */
175
+ .val-box {
176
+ background: #0a2a1a;
177
+ border: 1px solid #1a4a2a;
178
+ border-radius: 6px;
179
+ padding: 1.25rem 1.5rem;
180
+ margin-bottom: 1.5rem;
181
  }
182
+ .val-box h4 {
183
+ font-family: 'IBM Plex Mono', monospace;
184
+ font-size: 0.72rem;
185
+ font-weight: 600;
 
186
  letter-spacing: 0.1em;
187
+ text-transform: uppercase;
188
+ color: #3dba7a;
189
+ margin: 0 0 0.75rem 0;
190
  }
191
+ .val-row {
 
 
192
  display: flex;
193
+ justify-content: space-between;
194
  align-items: center;
195
+ padding: 0.4rem 0;
196
+ border-bottom: 1px solid #1a3a2a;
197
+ font-size: 0.8rem;
198
+ color: #a0b8a8;
199
  }
200
+ .val-row:last-child { border-bottom: none; }
201
+ .val-key { color: #5a7a6a; }
202
+ .val-num { font-family: 'IBM Plex Mono', monospace; color: #3dba7a; font-weight: 600; }
203
 
204
+ /* LLM council badge row */
205
+ .council-row {
 
 
 
 
 
 
206
  display: flex;
207
+ gap: 8px;
208
+ margin-bottom: 1rem;
209
+ flex-wrap: wrap;
 
 
 
 
 
 
210
  }
211
 
212
+ /* Run button */
213
  .stButton > button {
214
+ background: #4d9de0 !important;
215
  color: #0d0f14 !important;
216
  border: none !important;
217
+ border-radius: 3px !important;
218
+ font-family: 'IBM Plex Mono', monospace !important;
219
+ font-size: 0.78rem !important;
220
+ font-weight: 600 !important;
221
  letter-spacing: 0.08em !important;
222
  text-transform: uppercase !important;
223
+ padding: 0.6rem 2rem !important;
224
+ transition: background 0.15s !important;
225
  }
 
226
  .stButton > button:hover {
227
+ background: #3d8ed0 !important;
 
 
 
 
 
 
 
 
 
 
228
  }
229
 
230
+ /* Input overrides */
231
+ .stTextInput input, .stSelectbox select {
232
+ background: #13161e !important;
233
+ border: 1px solid #1f2333 !important;
234
+ color: #e8eaf0 !important;
235
+ font-family: 'IBM Plex Mono', monospace !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
  font-size: 0.82rem !important;
237
+ border-radius: 3px !important;
238
  }
239
 
240
+ /* Dataframe */
241
+ .stDataFrame {
242
+ background: #13161e;
243
+ border: 1px solid #1f2333;
244
+ border-radius: 4px;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  }
246
 
247
+ /* Download buttons */
248
  .stDownloadButton > button {
249
  background: transparent !important;
250
+ color: #4d9de0 !important;
251
+ border: 1px solid #1a4070 !important;
252
+ border-radius: 3px !important;
253
+ font-family: 'IBM Plex Mono', monospace !important;
254
+ font-size: 0.72rem !important;
255
  letter-spacing: 0.08em !important;
256
  }
257
 
258
+ /* Expander */
259
+ .streamlit-expanderHeader {
260
+ background: #13161e !important;
261
+ border: 1px solid #1f2333 !important;
262
+ font-family: 'IBM Plex Mono', monospace !important;
263
+ font-size: 0.78rem !important;
264
+ color: #a0a8c0 !important;
265
  }
266
 
267
+ /* Progress / spinner */
268
+ .stSpinner > div { border-top-color: #4d9de0 !important; }
269
+
270
+ /* Divider */
271
+ hr { border-color: #1f2333 !important; }
272
+
273
+ /* Alerts */
274
+ .stAlert { border-radius: 4px !important; }
275
  </style>
276
  """, unsafe_allow_html=True)
277
 
278
+ # ── Header ───────────────────────────────────────────────────────────────────
279
+ st.markdown("""
280
+ <div class="site-header">
281
+ <h1>Research Topic Analyzer</h1>
282
+ <p>SPECTER2 embeddings &nbsp;·&nbsp; HDBSCAN/UMAP clustering &nbsp;·&nbsp; 3-LLM Council (Groq + Mistral + Gemini) &nbsp;·&nbsp; PAJAIS validation</p>
283
+ </div>
284
+ """, unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
 
286
+ # ── Sidebar ──────────────────────────────────────────────────────────────────
287
+ with st.sidebar:
288
+ st.markdown("### API Keys")
289
+ groq_key_input = st.text_input("Groq API Key", type="password", placeholder="GROQ_API_KEY env var")
290
+ mistral_key_input = st.text_input("Mistral API Key", type="password", placeholder="MISTRAL_API_KEY env var")
291
+ gemini_key_input = st.text_input("Gemini API Key", type="password", placeholder="GEMINI_API_KEY env var")
292
+ st.caption("Keys are never stored. Leave blank to use env vars.")
293
 
294
  st.markdown("---")
295
+ st.markdown("### Clustering Parameters")
296
+ min_topic_size = st.slider("Min papers per cluster", min_value=3, max_value=20, value=5,
297
+ help="Prof. Kamat spec: min=5")
298
+ st.markdown(
299
+ "<span class='pill pill-blue'>Min clusters: 15</span>"
300
+ "<span class='pill pill-blue'>Max clusters: 30</span>",
301
+ unsafe_allow_html=True
302
+ )
303
+ st.markdown(
304
+ "<span class='pill pill-gray'>Cosine sim: 0.50–0.55</span>",
305
+ unsafe_allow_html=True
306
+ )
307
 
308
  st.markdown("---")
309
+ st.markdown("### LLM Council")
310
+ st.markdown("""
311
+ <div class="council-row">
312
+ <span class="pill pill-blue">Groq / LLaMA-3.1</span>
313
+ <span class="pill pill-green">Mistral Small</span>
314
+ <span class="pill pill-amber">Gemini 2.5 Flash</span>
315
+ </div>
316
+ <p style="font-size:0.72rem;color:#5a6480;font-family:'IBM Plex Mono',monospace;">
317
+ Majority vote → best label selected.<br>
318
+ Keyword-overlap fallback if no consensus.
319
+ </p>
320
+ """, unsafe_allow_html=True)
 
321
 
322
  st.markdown("---")
323
+ if st.button("Reset Results", use_container_width=True):
324
+ for key in ["agent_results", "topic_stats"]:
325
+ st.session_state.pop(key, None)
326
  st.rerun()
327
 
328
+ groq_api_key = groq_key_input.strip() or os.getenv("GROQ_API_KEY")
329
  mistral_api_key = mistral_key_input.strip() or os.getenv("MISTRAL_API_KEY")
330
+ gemini_api_key = gemini_key_input.strip() or os.getenv("GEMINI_API_KEY")
331
 
332
+ # ── Dataset upload ────────────────────────────────────────────────────────────
333
# Dataset section: file uploader, sample-data toggle, and a quick preview of
# the uploaded CSV so the user can confirm the required columns exist.
st.markdown("<div class='section-title'>Dataset</div>", unsafe_allow_html=True)

col_up, col_sample = st.columns([3, 1])
with col_up:
    uploaded_file = st.file_uploader(
        "Upload Scopus CSV — must contain 'title' and 'abstract' columns",
        type=["csv"],
        help="Export your corpus from Scopus as CSV. The tool will combine Title + Abstract into one SPECTER2 vector per paper.",
    )
with col_sample:
    st.markdown("<br>", unsafe_allow_html=True)
    use_sample = st.checkbox("Use sample dataset (50 papers)", value=False)

if uploaded_file and not use_sample:
    try:
        df_preview = pd.read_csv(uploaded_file)
        col_a, col_b, col_c = st.columns(3)
        col_a.metric("Papers detected", len(df_preview))
        col_b.metric("Columns", len(df_preview.columns))
        # Column check is case-insensitive.
        has_both = {"title", "abstract"}.issubset(set(df_preview.columns.str.lower()))
        col_c.metric("Title + Abstract", "✓ present" if has_both else "✗ missing")
        if not has_both:
            st.error("CSV must have both 'title' and 'abstract' columns.")
    except Exception as e:  # UI boundary: report any parse/preview failure to the user
        st.error(f"Could not preview CSV: {e}")
    finally:
        # Always rewind, even when parsing failed part-way, so a later read of
        # the upload starts from the first byte.
        uploaded_file.seek(0)
359
+
360
+ # ── Run Pipeline ─────────────────────────────────────────────────────────────
361
st.markdown("<br>", unsafe_allow_html=True)
run_btn = st.button("▶ Run Full Pipeline", type="primary")


def _sample_corpus(n_papers: int = 50) -> pd.DataFrame:
    """Build the synthetic demo corpus by cycling through ten fixed seed papers."""
    topics_pool = [
        ("Deep Learning for Healthcare Prediction", "We apply LSTM networks to predict patient readmission from EHR data."),
        ("Process Mining in Enterprise Systems", "Event log analysis using Petri nets for conformance checking in ERP workflows."),
        ("Recommender Systems Collaborative Filtering", "Matrix factorization techniques applied to e-commerce product recommendation."),
        ("LLM Applications in Information Systems", "GPT-4 used for automated requirements extraction from stakeholder documents."),
        ("Blockchain Smart Contract Security", "Formal verification of Solidity smart contracts for financial transaction safety."),
        ("Federated Learning Privacy Preservation", "Differential privacy mechanisms for distributed model training across hospitals."),
        ("Cybersecurity Intrusion Detection", "Random forest classifiers for network anomaly detection in enterprise environments."),
        ("Natural Language Processing Sentiment", "BERT fine-tuning for aspect-level sentiment analysis in product reviews."),
        ("Knowledge Graph Embedding", "TransE and RotatE models for biomedical entity relation prediction."),
        ("Computer Vision Medical Imaging", "CNN architectures for diabetic retinopathy grading from fundus photographs."),
    ]
    rows = []
    for i in range(n_papers):
        title, abstract = topics_pool[i % len(topics_pool)]
        rows.append({
            "title": title,
            "abstract": abstract + f" Study {i+1}.",
            "doi": f"10.1145/sample.{i+1}",
        })
    return pd.DataFrame(rows)


def _write_temp_csv(payload: bytes) -> str:
    """Write *payload* to a persistent temporary .csv file and return its path."""
    # The context manager closes the handle before the path is handed on, so
    # no descriptor is leaked; delete=False keeps the file on disk afterwards.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
        tmp.write(payload)
    return tmp.name


if run_btn:
    # Validation: all three provider keys are required for the council.
    providers = (
        ("Groq", groq_api_key),
        ("Mistral", mistral_api_key),
        ("Gemini", gemini_api_key),
    )
    missing_keys = [name for name, key in providers if not key]
    if missing_keys:
        st.error(f"Missing API key(s): {', '.join(missing_keys)}. All three are required for the LLM council.")
        st.stop()

    if not use_sample and uploaded_file is None:
        st.error("Please upload a CSV file or enable the sample dataset.")
        st.stop()

    # Prepare CSV path.
    if use_sample:
        csv_path = _write_temp_csv(_sample_corpus().to_csv(index=False).encode("utf-8"))
    else:
        # NOTE(review): the statement that wrote the upload into the temp file
        # is not visible in the reviewed excerpt; getvalue() is used here
        # because it returns the whole buffer regardless of read position.
        csv_path = _write_temp_csv(uploaded_file.getvalue())

    # Step 1: topic modeling.
    progress_bar = st.progress(0, text="Step 1/2 — SPECTER2 embeddings + HDBSCAN clustering (15–30 clusters)…")
    try:
        topic_results = run_topic_modeling(csv_path, min_topic_size=min_topic_size)
        n_clusters = len(topic_results["documents"]["topic_keywords"])
        progress_bar.progress(50, text=f"Step 1/2 — Done. {n_clusters} clusters found.")
    except Exception as exc:  # UI boundary: surface any pipeline failure
        st.error(f"Topic modeling failed: {exc}")
        st.stop()

    # Step 2: LLM council.
    progress_bar.progress(55, text="Step 2/2 — 3-LLM Council labelling (Groq + Mistral + Gemini)…")
    try:
        agent_results = run_agent(
            topic_results=topic_results,
            groq_key=groq_api_key,
            mistral_key=mistral_api_key,
            gemini_key=gemini_api_key,
        )
        progress_bar.progress(100, text="Pipeline complete.")

        # Compute summary stats before touching session state, so a failure
        # here cannot leave results stored without their matching stats.
        interps = agent_results.get("interpretations", {})
        topic_stats = {
            "n_topics": len(interps),
            "novel": sum(1 for i in interps.values() if i.classification == "NOVEL"),
            "mapped": sum(1 for i in interps.values() if i.classification == "MAPPED"),
            "total_papers": sum(i.paper_count for i in interps.values()),
            # Remember the setting this run used; the sidebar slider may be
            # moved later without re-running the pipeline.
            "min_topic_size": min_topic_size,
        }
        st.session_state["agent_results"] = agent_results
        st.session_state["topic_stats"] = topic_stats
        st.success(f"Pipeline complete — {len(interps)} topics labelled by 3-LLM council.")
    except Exception as exc:  # UI boundary: surface any council failure
        st.error(f"LLM council failed: {exc}")
        st.stop()
445
 
446
+ # ── Results Display ────────────────────────────────────────────────────────────
447
# Results display: renders the stored pipeline output (summary, validation
# panel, filterable topic table/cards, charts, downloads, methodology note),
# or an empty-state hint when no run has completed yet.
#
# Local import: html.escape neutralises markup in LLM-generated labels before
# they are interpolated into unsafe_allow_html blocks.
import html

results = st.session_state.get("agent_results")
stats = st.session_state.get("topic_stats")

if results and stats:
    interps = results.get("interpretations", {})
    n_topics = stats["n_topics"]
    novel_pct = round(stats["novel"] / n_topics * 100) if n_topics else 0
    mapped_pct = round(stats["mapped"] / n_topics * 100) if n_topics else 0
    # Min cluster size of the stored run; fall back to the live slider when the
    # stats dict does not carry the key.
    run_min_size = stats.get("min_topic_size", min_topic_size)

    # ── Summary stats ──
    # HTML is assembled without leading whitespace so the markdown renderer
    # cannot mistake indented lines for a code block.
    st.markdown("<div class='section-title'>Pipeline Summary</div>", unsafe_allow_html=True)
    summary_cards = [
        (stats["n_topics"], "Topics Found"),
        (stats["total_papers"], "Papers Assigned"),
        (stats["novel"], "NOVEL (no PAJAIS home)"),
        (stats["mapped"], "MAPPED to PAJAIS"),
    ]
    cards_html = "".join(
        f'<div class="stat-card"><div class="stat-val">{value}</div>'
        f'<div class="stat-label">{caption}</div></div>'
        for value, caption in summary_cards
    )
    st.markdown(f'<div class="stat-grid">{cards_html}</div>', unsafe_allow_html=True)

    # ── Validation panel ──
    st.markdown("<div class='section-title'>LLM Council Validation</div>", unsafe_allow_html=True)
    spec_rows = [
        ("Embedding model", "SPECTER2 (allenai/specter2_base)"),
        ("Input column", "Title + Abstract (combined)"),
        ("Clustering", f"UMAP → HDBSCAN (min={run_min_size}, max=100 per cluster)"),
        ("Cosine similarity range", "0.50 – 0.55 (merge / outlier reassign)"),
        ("Total clusters", f"{n_topics} (target: 15–30)"),
        ("LLM council", "Groq (LLaMA-3.1) + Mistral Small + Gemini 2.5 Flash"),
        ("Label selection", "Majority vote → keyword-overlap fallback"),
        ("Rep. docs per topic", "Top-3 by cosine similarity to centroid"),
        ("NOVEL themes (no PAJAIS home)", f"{novel_pct}% ({stats['novel']} topics)"),
        ("MAPPED to PAJAIS taxonomy", f"{mapped_pct}% ({stats['mapped']} topics)"),
    ]
    spec_html = "".join(
        f'<div class="val-row"><span class="val-key">{key}</span>'
        f'<span class="val-num">{value}</span></div>'
        for key, value in spec_rows
    )
    st.markdown(
        f'<div class="val-box"><h4>Instructor Spec Compliance</h4>{spec_html}</div>',
        unsafe_allow_html=True,
    )

    # ── Topic table ──
    st.markdown("<div class='section-title'>Topic Results</div>", unsafe_allow_html=True)

    result_columns = ["Topic ID", "Label", "Classification", "Category", "Papers", "Keywords"]
    rows = [
        {
            "Topic ID": tid,
            "Label": interp.label,
            "Classification": interp.classification,
            "Category": interp.category,
            "Papers": interp.paper_count,
            "Keywords": ", ".join(interp.keywords[:8]),
        }
        for tid, interp in sorted(interps.items())
    ]
    # Explicit columns keep sort/filter working when there are zero topics.
    df_res = (
        pd.DataFrame(rows, columns=result_columns)
        .sort_values("Papers", ascending=False)
        .reset_index(drop=True)
    )

    # ── Filters ──
    col_f1, col_f2, col_f3 = st.columns([2, 2, 1])
    with col_f1:
        cats = ["All"] + sorted(df_res["Category"].unique().tolist())
        sel_cat = st.selectbox("Filter by category", cats)
    with col_f2:
        clsf = ["All", "NOVEL", "MAPPED"]
        sel_cls = st.selectbox("Filter by classification", clsf)
    with col_f3:
        sort_by = st.selectbox("Sort by", ["Papers ↓", "Papers ↑", "Label A–Z"])

    df_f = df_res.copy()
    if sel_cat != "All":
        df_f = df_f[df_f["Category"] == sel_cat]
    if sel_cls != "All":
        df_f = df_f[df_f["Classification"] == sel_cls]
    if sort_by == "Papers ↓":
        df_f = df_f.sort_values("Papers", ascending=False)
    elif sort_by == "Papers ↑":
        df_f = df_f.sort_values("Papers", ascending=True)
    else:
        df_f = df_f.sort_values("Label")
    df_f = df_f.reset_index(drop=True)

    st.caption(f"Showing {len(df_f)} of {len(df_res)} topics")

    # ── Table / card view ──
    view_mode = st.radio("View as", ["Table", "Cards"], horizontal=True)

    if view_mode == "Table":
        st.dataframe(df_f, use_container_width=True, height=420)
    else:
        for _, row in df_f.iterrows():
            is_novel = row["Classification"] == "NOVEL"
            cls_pill = (
                "<span class='pill pill-amber'>NOVEL</span>"
                if is_novel
                else "<span class='pill pill-green'>MAPPED</span>"
            )
            card_cls = "topic-card novel" if is_novel else "topic-card"
            # Label, category and keywords originate from LLM output or the
            # uploaded corpus, so escape them before embedding in raw HTML.
            label_html = html.escape(str(row["Label"]))
            category_html = html.escape(str(row["Category"]))
            keywords_html = html.escape(str(row["Keywords"]))
            st.markdown(
                f'<div class="{card_cls}">'
                f'<div class="topic-label">{label_html}</div>'
                f'<div class="topic-meta">{cls_pill}'
                f'<span class="pill pill-gray">{category_html}</span>'
                f'<span class="pill pill-blue">{row["Papers"]} papers</span>'
                f'</div>'
                f'<div class="topic-kw">{keywords_html}</div>'
                f'</div>',
                unsafe_allow_html=True,
            )

    # ── Bar chart ──
    st.markdown("<br>", unsafe_allow_html=True)
    with st.expander("Topic frequency chart", expanded=False):
        chart_df = df_f[["Label", "Papers"]].copy()
        # Shorten long labels so the axis stays readable.
        chart_df["Label"] = chart_df["Label"].apply(lambda x: x[:35] + "…" if len(x) > 35 else x)
        chart_df = chart_df.set_index("Label")
        st.bar_chart(chart_df, height=380)

    # ── NOVEL / PAJAIS breakdown ──
    with st.expander("NOVEL vs PAJAIS breakdown — for paper §4.6", expanded=False):
        col_n, col_m = st.columns(2)
        with col_n:
            st.markdown("**NOVEL topics (no PAJAIS home)**")
            novel_df = df_f[df_f["Classification"] == "NOVEL"][["Label", "Papers", "Category"]].reset_index(drop=True)
            st.dataframe(novel_df, use_container_width=True)
        with col_m:
            st.markdown("**MAPPED topics (PAJAIS match)**")
            mapped_df = df_f[df_f["Classification"] == "MAPPED"][["Label", "Papers", "Category"]].reset_index(drop=True)
            st.dataframe(mapped_df, use_container_width=True)

    # ── Representative documents ──
    # Titles are not rendered here; the user is pointed at topics.json.
    with st.expander("Representative papers per topic (top-3 by centroid proximity)", expanded=False):
        for tid, interp in sorted(interps.items()):
            st.markdown(f"**Topic {tid} — {interp.label}**")
            st.caption("See topics.json for full representative document titles.")
        st.info("Download topics.json below to see the 3 representative paper titles per cluster used for LLM labelling.")

    # ── Downloads ──
    st.markdown("<div class='section-title'>Downloads</div>", unsafe_allow_html=True)
    col_d1, col_d2, col_d3 = st.columns(3)
    with col_d1:
        try:
            with open(results["json_path"], "r", encoding="utf-8") as f:
                json_payload = f.read()
        except (KeyError, OSError, UnicodeError):
            st.warning("JSON file not found.")
        else:
            st.download_button(
                "⬇ topics.json",
                json_payload,
                file_name="tmis_topics.json",
                mime="application/json",
                use_container_width=True,
            )
    with col_d2:
        try:
            df_dl = pd.read_csv(results["csv_path"])
        except Exception:  # missing path/key or unparsable file: degrade to a warning
            st.warning("CSV file not found.")
        else:
            st.download_button(
                "⬇ topics.csv",
                df_dl.to_csv(index=False),
                file_name="tmis_topics.csv",
                mime="text/csv",
                use_container_width=True,
            )
    with col_d3:
        st.download_button(
            "⬇ results table",
            df_res.to_csv(index=False),
            file_name="tmis_topic_results.csv",
            mime="text/csv",
            use_container_width=True,
        )

    # ── Method note for paper ──
    st.markdown("<br>", unsafe_allow_html=True)
    with st.expander("§3.4 methodology note — paste into paper", expanded=False):
        method_lines = [
            "Pipeline A (Unsupervised Discovery): SPECTER2 (allenai/specter2_base) generates one",
            "768-dimensional document embedding per paper from a combined Title + Abstract column.",
            "UMAP (n_neighbors=15, n_components=5, metric=cosine) reduces dimensionality; HDBSCAN",
            f"(min_cluster_size={run_min_size}, metric=euclidean, cluster_selection=eom) clusters embeddings.",
            "Cosine similarity threshold 0.50–0.55 governs cluster merging and outlier reassignment.",
            "Total clusters constrained to 15–30 via iterative split/merge.",
            "",
            "Pipeline B (LLM Council Validation): For each cluster, the 3 papers nearest the centroid",
            "(by cosine similarity) are passed as representative titles to 3 independent LLMs:",
            "Groq/LLaMA-3.1-8b, Mistral-Small-Latest, and Gemini-2.5-Flash. Each LLM returns a",
            "structured JSON with label, taxonomy_category, and classification (MAPPED/NOVEL).",
            "Majority vote selects the final label; keyword-overlap fallback applies when no consensus.",
            "This is the 3-LLM Council approach validating AI output without using the same model",
            "for self-validation (per Carlsen & Ralund, 2022 CALM principle).",
            "",
            f"Results: {n_topics} clusters discovered. {novel_pct}% classified as NOVEL",
            f"(no PAJAIS 2019 home). {mapped_pct}% MAPPED to existing PAJAIS categories.",
        ]
        st.code("\n".join(method_lines), language="text")

# ── Empty state ──
elif not results:
    st.markdown(
        '<div style="text-align:center;padding:4rem 2rem;border:1px dashed #1f2333;border-radius:6px;margin-top:2rem;">'
        "<p style=\"font-family:'IBM Plex Mono',monospace;font-size:0.8rem;color:#3a4060;letter-spacing:0.1em;\">"
        "UPLOAD CSV → ENTER API KEYS → RUN PIPELINE"
        "</p>"
        '<p style="font-size:0.75rem;color:#2a3050;margin-top:0.5rem;">'
        "SPECTER2 embeddings · HDBSCAN · 3-LLM council · PAJAIS validation"
        "</p>"
        "</div>",
        unsafe_allow_html=True,
    )