anujjuna committed on
Commit
8c6e466
·
verified ·
1 Parent(s): 5a7ae5f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +261 -607
app.py CHANGED
@@ -1,607 +1,261 @@
1
- """
2
- app.py
3
- ------
4
- Streamlit UI for the BERTopic + Dual LLM (Groq + Mistral) research paper analysis pipeline.
5
- Redesigned with a clean, dark editorial aesthetic.
6
- """
7
-
8
- import os
9
- import json
10
- import tempfile
11
-
12
- import pandas as pd
13
- import streamlit as st
14
-
15
- from tools import run_topic_modeling
16
- from agent import run_agent
17
-
18
- # ---------------------------------------------------------------------------
19
- # Page Config & Global Styles
20
- # ---------------------------------------------------------------------------
21
- st.set_page_config(
22
- page_title="Arxiv Lens · Topic Analyzer",
23
- page_icon="🔬",
24
- layout="wide",
25
- initial_sidebar_state="expanded",
26
- )
27
-
28
- st.markdown("""
29
- <style>
30
- @import url('https://fonts.googleapis.com/css2?family=DM+Serif+Display:ital@0;1&family=DM+Mono:wght@400;500&family=DM+Sans:wght@300;400;500&display=swap');
31
-
32
- /* ── Global Reset ─────────────────────────────────────────── */
33
- html, body, [class*="css"] {
34
- font-family: 'DM Sans', sans-serif;
35
- }
36
-
37
- .stApp {
38
- background-color: #0d0f14;
39
- color: #e8e4dc;
40
- }
41
-
42
- /* ── Sidebar ──────────────────────────────────────────────── */
43
- [data-testid="stSidebar"] {
44
- background-color: #111318 !important;
45
- border-right: 1px solid #1e2028;
46
- }
47
-
48
- [data-testid="stSidebar"] * {
49
- color: #c8c4bc !important;
50
- }
51
-
52
- .sidebar-logo {
53
- font-family: 'DM Serif Display', serif;
54
- font-size: 1.5rem;
55
- color: #f0ebe0 !important;
56
- letter-spacing: -0.02em;
57
- margin-bottom: 0.2rem;
58
- }
59
-
60
- .sidebar-tagline {
61
- font-size: 0.72rem;
62
- color: #5a5f6e !important;
63
- text-transform: uppercase;
64
- letter-spacing: 0.12em;
65
- margin-bottom: 1.5rem;
66
- }
67
-
68
- /* ── Header ───────────────────────────────────────────────── */
69
- .hero {
70
- padding: 2.5rem 0 1.5rem 0;
71
- border-bottom: 1px solid #1e2028;
72
- margin-bottom: 2rem;
73
- }
74
-
75
- .hero-title {
76
- font-family: 'DM Serif Display', serif;
77
- font-size: 2.8rem;
78
- color: #f0ebe0;
79
- letter-spacing: -0.03em;
80
- line-height: 1.1;
81
- margin: 0;
82
- }
83
-
84
- .hero-title em {
85
- font-style: italic;
86
- color: #c8a97e;
87
- }
88
-
89
- .hero-sub {
90
- font-size: 0.88rem;
91
- color: #5a5f6e;
92
- margin-top: 0.5rem;
93
- text-transform: uppercase;
94
- letter-spacing: 0.1em;
95
- }
96
-
97
- /* ── Key Pill ─────────────────────────────────────────────── */
98
- .key-required {
99
- display: inline-block;
100
- background: #1a1d25;
101
- border: 1px solid #2e3240;
102
- border-radius: 4px;
103
- padding: 0.15rem 0.5rem;
104
- font-family: 'DM Mono', monospace;
105
- font-size: 0.72rem;
106
- color: #c8a97e;
107
- margin-bottom: 0.4rem;
108
- }
109
-
110
- .key-optional {
111
- display: inline-block;
112
- background: #1a1d25;
113
- border: 1px solid #2e3240;
114
- border-radius: 4px;
115
- padding: 0.15rem 0.5rem;
116
- font-family: 'DM Mono', monospace;
117
- font-size: 0.72rem;
118
- color: #5a8a6e;
119
- margin-bottom: 0.4rem;
120
- }
121
-
122
- /* ── Section Headers ──────────────────────────────────────── */
123
- .section-label {
124
- font-family: 'DM Mono', monospace;
125
- font-size: 0.68rem;
126
- color: #5a5f6e;
127
- text-transform: uppercase;
128
- letter-spacing: 0.14em;
129
- margin-bottom: 0.75rem;
130
- padding-bottom: 0.4rem;
131
- border-bottom: 1px solid #1e2028;
132
- }
133
-
134
- /* ── Stat Cards ───────────────────────────────────────────── */
135
- .stat-card {
136
- background: #111318;
137
- border: 1px solid #1e2028;
138
- border-radius: 8px;
139
- padding: 1.2rem 1.4rem;
140
- margin-bottom: 0.75rem;
141
- }
142
-
143
- .stat-number {
144
- font-family: 'DM Serif Display', serif;
145
- font-size: 2.4rem;
146
- color: #c8a97e;
147
- line-height: 1;
148
- }
149
-
150
- .stat-label {
151
- font-size: 0.75rem;
152
- color: #5a5f6e;
153
- text-transform: uppercase;
154
- letter-spacing: 0.1em;
155
- margin-top: 0.3rem;
156
- }
157
-
158
- /* ── Pipeline Step Badges ─────────────────────────────────── */
159
- .step-row {
160
- display: flex;
161
- align-items: center;
162
- gap: 1rem;
163
- margin-bottom: 0.5rem;
164
- }
165
-
166
- .step-num {
167
- font-family: 'DM Mono', monospace;
168
- font-size: 0.7rem;
169
- color: #0d0f14;
170
- background: #c8a97e;
171
- border-radius: 50%;
172
- width: 1.4rem;
173
- height: 1.4rem;
174
- display: flex;
175
- align-items: center;
176
- justify-content: center;
177
- flex-shrink: 0;
178
- font-weight: 500;
179
- }
180
-
181
- .step-text {
182
- font-size: 0.82rem;
183
- color: #8a8f9e;
184
- }
185
-
186
- /* ── Buttons ──────────────────────────────────────────────── */
187
- .stButton > button {
188
- background: #c8a97e !important;
189
- color: #0d0f14 !important;
190
- border: none !important;
191
- border-radius: 6px !important;
192
- font-family: 'DM Mono', monospace !important;
193
- font-size: 0.8rem !important;
194
- font-weight: 500 !important;
195
- letter-spacing: 0.08em !important;
196
- text-transform: uppercase !important;
197
- padding: 0.6rem 1.4rem !important;
198
- transition: all 0.15s ease !important;
199
- }
200
-
201
- .stButton > button:hover {
202
- background: #debb94 !important;
203
- transform: translateY(-1px) !important;
204
- }
205
-
206
- /* ── Inputs ───────────────────────────────────────────────── */
207
- .stTextInput > div > div > input,
208
- .stSelectbox > div > div,
209
- .stSlider {
210
- background-color: #111318 !important;
211
- border-color: #2e3240 !important;
212
- color: #e8e4dc !important;
213
- }
214
-
215
- /* ── Dataframe ────────────────────────────────────────────── */
216
- [data-testid="stDataFrame"] {
217
- border: 1px solid #1e2028 !important;
218
- border-radius: 8px !important;
219
- overflow: hidden;
220
- }
221
-
222
- /* ── Upload zone ──────────────────────────────────────────── */
223
- [data-testid="stFileUploader"] {
224
- background: #111318;
225
- border: 1px dashed #2e3240 !important;
226
- border-radius: 8px;
227
- }
228
-
229
- /* ── Expanders ────────────────────────────────────────────── */
230
- .streamlit-expanderHeader {
231
- background-color: #111318 !important;
232
- border: 1px solid #1e2028 !important;
233
- border-radius: 6px !important;
234
- color: #c8c4bc !important;
235
- font-size: 0.82rem !important;
236
- }
237
-
238
- /* ── Tabs ─────────────────────────────────────────────────── */
239
- .stTabs [data-baseweb="tab-list"] {
240
- gap: 0;
241
- border-bottom: 1px solid #1e2028;
242
- background: transparent;
243
- }
244
-
245
- .stTabs [data-baseweb="tab"] {
246
- font-family: 'DM Mono', monospace;
247
- font-size: 0.75rem;
248
- text-transform: uppercase;
249
- letter-spacing: 0.1em;
250
- color: #5a5f6e !important;
251
- background: transparent !important;
252
- border: none !important;
253
- padding: 0.6rem 1.2rem;
254
- }
255
-
256
- .stTabs [aria-selected="true"] {
257
- color: #c8a97e !important;
258
- border-bottom: 2px solid #c8a97e !important;
259
- }
260
-
261
- /* ── Success / Error ──────────────────────────────────────── */
262
- .stSuccess {
263
- background: #0d1f16 !important;
264
- border-left: 3px solid #4caf7d !important;
265
- border-radius: 4px !important;
266
- }
267
-
268
- .stError {
269
- background: #1f0d0d !important;
270
- border-left: 3px solid #cf4f4f !important;
271
- border-radius: 4px !important;
272
- }
273
-
274
- /* ── Download buttons ─────────────────────────────────────── */
275
- .stDownloadButton > button {
276
- background: transparent !important;
277
- color: #c8a97e !important;
278
- border: 1px solid #c8a97e !important;
279
- border-radius: 6px !important;
280
- font-family: 'DM Mono', monospace !important;
281
- font-size: 0.75rem !important;
282
- letter-spacing: 0.08em !important;
283
- }
284
-
285
- .stDownloadButton > button:hover {
286
- background: #c8a97e22 !important;
287
- }
288
-
289
- /* ── Divider ──────────────────────────────────────────────── */
290
- hr {
291
- border-color: #1e2028 !important;
292
- margin: 1.5rem 0 !important;
293
- }
294
- </style>
295
- """, unsafe_allow_html=True)
296
-
297
- # ---------------------------------------------------------------------------
298
- # Sidebar
299
- # ---------------------------------------------------------------------------
300
- with st.sidebar:
301
- st.markdown('<div class="sidebar-logo">Arxiv Lens</div>', unsafe_allow_html=True)
302
- st.markdown('<div class="sidebar-tagline">Research Topic Analyzer</div>', unsafe_allow_html=True)
303
-
304
- st.markdown('<div class="section-label">API Keys</div>', unsafe_allow_html=True)
305
-
306
- st.markdown('<span class="key-required">REQUIRED · GROQ</span>', unsafe_allow_html=True)
307
- groq_key_input = st.text_input(
308
- "Groq API Key",
309
- value="",
310
- type="password",
311
- placeholder="gsk_...",
312
- label_visibility="collapsed",
313
- )
314
-
315
- st.markdown('<span class="key-optional">OPTIONAL · MISTRAL</span>', unsafe_allow_html=True)
316
- mistral_key_input = st.text_input(
317
- "Mistral API Key",
318
- value="",
319
- type="password",
320
- placeholder="For dual-LLM validation",
321
- label_visibility="collapsed",
322
- )
323
-
324
- st.caption("Keys are never stored. Falls back to env vars if blank.")
325
-
326
- st.markdown("---")
327
- st.markdown('<div class="section-label">Model Settings</div>', unsafe_allow_html=True)
328
- min_topic_size = st.slider("Min Topic Size", min_value=3, max_value=30, value=5)
329
-
330
- st.markdown("---")
331
- st.markdown('<div class="section-label">Pipeline</div>', unsafe_allow_html=True)
332
- for i, step in enumerate([
333
- "BERTopic clusters abstracts + titles",
334
- "Groq LLM labels each cluster",
335
- "Mistral validates Groq's labels",
336
- "Cross-source diff report generated",
337
- ], 1):
338
- st.markdown(f"""
339
- <div class="step-row">
340
- <div class="step-num">{i}</div>
341
- <div class="step-text">{step}</div>
342
- </div>
343
- """, unsafe_allow_html=True)
344
-
345
- st.markdown("---")
346
- if st.button("↺ Reset Results", use_container_width=True):
347
- if "agent_results" in st.session_state:
348
- del st.session_state["agent_results"]
349
- st.rerun()
350
-
351
- groq_api_key = groq_key_input.strip() or os.getenv("GROQ_API_KEY")
352
- mistral_api_key = mistral_key_input.strip() or os.getenv("MISTRAL_API_KEY")
353
-
354
- # ---------------------------------------------------------------------------
355
- # Hero
356
- # ---------------------------------------------------------------------------
357
- st.markdown("""
358
- <div class="hero">
359
- <h1 class="hero-title">Research<br><em>Topic Intelligence</em></h1>
360
- <p class="hero-sub">BERTopic · Groq llama-3.1 · Mistral Validation</p>
361
- </div>
362
- """, unsafe_allow_html=True)
363
-
364
- # ---------------------------------------------------------------------------
365
- # Dataset Input
366
- # ---------------------------------------------------------------------------
367
- st.markdown('<div class="section-label">Dataset</div>', unsafe_allow_html=True)
368
-
369
- col_a, col_b = st.columns([3, 1])
370
- with col_a:
371
- uploaded_file = st.file_uploader(
372
- "Upload a CSV with **title** and **abstract** columns",
373
- type=["csv"],
374
- help="Must have at minimum 'title' and 'abstract' columns. More rows = richer topics.",
375
- )
376
- with col_b:
377
- st.markdown("<br>", unsafe_allow_html=True)
378
- use_sample = st.checkbox("Use built-in sample dataset", value=False)
379
-
380
- st.markdown("---")
381
-
382
- # ---------------------------------------------------------------------------
383
- # Run Pipeline
384
- # ---------------------------------------------------------------------------
385
- run_btn = st.button("▶ Run Analysis Pipeline", use_container_width=False)
386
-
387
- if run_btn:
388
- if not groq_api_key:
389
- st.error("**Groq API key required.** Enter it in the sidebar or set `GROQ_API_KEY` in your environment.")
390
- st.stop()
391
-
392
- if not use_sample and uploaded_file is None:
393
- st.error("**No dataset.** Upload a CSV or enable the sample dataset.")
394
- st.stop()
395
-
396
- # Resolve CSV path
397
- if use_sample:
398
- sample_data = {
399
- "title": [
400
- "Deep Learning for Image Classification",
401
- "Neural Networks in Healthcare",
402
- "Transformer Models for NLP",
403
- "BERT in Question Answering",
404
- "Blockchain and Distributed Ledger Technology",
405
- "Smart Contracts in Finance",
406
- "Federated Learning for Privacy",
407
- "Differential Privacy in ML",
408
- "Graph Neural Networks",
409
- "Knowledge Graph Embeddings",
410
- ],
411
- "abstract": [
412
- "We propose a deep learning model achieving state-of-the-art accuracy on image benchmarks.",
413
- "A convolutional network trained for medical image classification tasks.",
414
- "We introduce a transformer-based approach for text understanding.",
415
- "Fine-tuning BERT achieves strong results on reading comprehension datasets.",
416
- "This paper surveys blockchain consensus mechanisms and distributed ledger architectures.",
417
- "We implement smart contracts for automated financial transactions on a public blockchain.",
418
- "Federated learning enables collaborative model training without sharing raw data.",
419
- "Differential privacy provides formal privacy guarantees for machine learning models.",
420
- "Graph neural networks learn from relational data structures effectively.",
421
- "Knowledge graph embeddings enable link prediction and entity classification.",
422
- ],
423
- }
424
- df_sample = pd.DataFrame(sample_data)
425
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
426
- df_sample.to_csv(tmp.name, index=False)
427
- csv_path = tmp.name
428
- else:
429
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
430
- tmp.write(uploaded_file.read())
431
- tmp.flush()
432
- csv_path = tmp.name
433
-
434
- # Step 1 — BERTopic
435
- with st.spinner("🔬 Running BERTopic clustering…"):
436
- try:
437
- topic_results = run_topic_modeling(csv_path, min_topic_size=min_topic_size)
438
- except Exception as exc:
439
- st.error(f"**Topic modeling failed:** {exc}")
440
- st.stop()
441
-
442
- abstract_res = topic_results["abstracts"]
443
- title_res = topic_results["titles"]
444
-
445
- df = pd.read_csv(csv_path)
446
- df.columns = df.columns.str.lower()
447
- raw_titles = df["title"].fillna("").tolist()
448
- raw_abstracts = df["abstract"].fillna("").tolist()
449
-
450
- # Step 2 — Agent
451
- with st.spinner("🤖 LLM interpretation + Mistral validation…"):
452
- try:
453
- st.session_state["agent_results"] = run_agent(
454
- title_topic_keywords=title_res["topic_keywords"],
455
- abstract_topic_keywords=abstract_res["topic_keywords"],
456
- title_topic_assignments=title_res["topics"],
457
- abstract_topic_assignments=abstract_res["topics"],
458
- raw_titles=raw_titles,
459
- raw_abstracts=raw_abstracts,
460
- api_key=groq_api_key,
461
- mistral_api_key=mistral_api_key,
462
- )
463
- st.success("Pipeline complete.")
464
- except Exception as exc:
465
- st.error(f"**Agent pipeline failed:** {exc}")
466
- st.stop()
467
-
468
- # ---------------------------------------------------------------------------
469
- # Results
470
- # ---------------------------------------------------------------------------
471
- agent_results = st.session_state.get("agent_results")
472
-
473
- if agent_results:
474
- title_interps = agent_results.get("title_interpretations", {})
475
- abstract_interps = agent_results.get("abstract_interpretations", {})
476
- comparison_rows = agent_results.get("comparison_rows", [])
477
- taxonomy_map = agent_results.get("taxonomy_map", {})
478
-
479
- # ── Stats Row ──────────────────────────────────────────────────────────
480
- c1, c2, c3, c4 = st.columns(4)
481
- with c1:
482
- st.markdown(f"""
483
- <div class="stat-card">
484
- <div class="stat-number">{len(title_interps)}</div>
485
- <div class="stat-label">Title Topics</div>
486
- </div>
487
- """, unsafe_allow_html=True)
488
- with c2:
489
- st.markdown(f"""
490
- <div class="stat-card">
491
- <div class="stat-number">{len(abstract_interps)}</div>
492
- <div class="stat-label">Abstract Topics</div>
493
- </div>
494
- """, unsafe_allow_html=True)
495
- with c3:
496
- agreed = sum(
497
- 1 for i in list(title_interps.values()) + list(abstract_interps.values())
498
- if i.validation_status == "AGREED"
499
- )
500
- st.markdown(f"""
501
- <div class="stat-card">
502
- <div class="stat-number">{agreed}</div>
503
- <div class="stat-label">LLM Agreements</div>
504
- </div>
505
- """, unsafe_allow_html=True)
506
- with c4:
507
- novel = sum(
508
- 1 for i in list(title_interps.values()) + list(abstract_interps.values())
509
- if i.classification == "NOVEL"
510
- )
511
- st.markdown(f"""
512
- <div class="stat-card">
513
- <div class="stat-number">{novel}</div>
514
- <div class="stat-label">Novel Topics</div>
515
- </div>
516
- """, unsafe_allow_html=True)
517
-
518
- st.markdown("---")
519
-
520
- # ── Main Tabs ──────────────────────────────────────────────────────────
521
- tab1, tab2, tab3, tab4 = st.tabs([
522
- "Title Topics",
523
- "Abstract Topics",
524
- "Taxonomy Map",
525
- "Comparison",
526
- ])
527
-
528
- def _interp_rows(interps):
529
- return [
530
- {
531
- "ID": tid,
532
- "Label": i.label,
533
- "Category": i.taxonomy_category,
534
- "Class": i.classification,
535
- "Validation": i.validation_status,
536
- "Confidence": i.confidence,
537
- "Keywords": ", ".join(i.keywords[:8]),
538
- "Reasoning": i.reasoning,
539
- }
540
- for tid, i in sorted(interps.items())
541
- ]
542
-
543
- with tab1:
544
- st.markdown('<div class="section-label">Topics derived from paper titles</div>', unsafe_allow_html=True)
545
- if title_interps:
546
- st.dataframe(pd.DataFrame(_interp_rows(title_interps)), use_container_width=True, hide_index=True)
547
- else:
548
- st.info("No title topics found.")
549
-
550
- with tab2:
551
- st.markdown('<div class="section-label">Topics derived from paper abstracts</div>', unsafe_allow_html=True)
552
- if abstract_interps:
553
- st.dataframe(pd.DataFrame(_interp_rows(abstract_interps)), use_container_width=True, hide_index=True)
554
- else:
555
- st.info("No abstract topics found.")
556
-
557
- with tab3:
558
- st.markdown('<div class="section-label">Full taxonomy classification</div>', unsafe_allow_html=True)
559
- inner_tabs = st.tabs(["Titles", "Abstracts"])
560
- for itab, section in zip(inner_tabs, ["titles", "abstracts"]):
561
- with itab:
562
- entries = taxonomy_map.get(section, [])
563
- if entries:
564
- st.dataframe(
565
- pd.DataFrame(entries)[[
566
- "topic_id", "label", "taxonomy_category",
567
- "classification", "validation_status", "confidence", "reasoning"
568
- ]],
569
- use_container_width=True,
570
- hide_index=True,
571
- )
572
- else:
573
- st.info(f"No {section} entries.")
574
-
575
- with tab4:
576
- st.markdown('<div class="section-label">Side-by-side title vs abstract topic comparison</div>', unsafe_allow_html=True)
577
- if comparison_rows:
578
- from dataclasses import asdict
579
- st.dataframe(pd.DataFrame([asdict(r) for r in comparison_rows]), use_container_width=True, hide_index=True)
580
- else:
581
- st.info("No overlapping topic IDs between title and abstract sources.")
582
-
583
- st.markdown("---")
584
-
585
- # ── Downloads ──────────────────────────────────────────────────────────
586
- st.markdown('<div class="section-label">Export Results</div>', unsafe_allow_html=True)
587
- dl1, dl2 = st.columns(2)
588
- with dl1:
589
- st.download_button(
590
- "⬇ taxonomy_map.json",
591
- json.dumps(agent_results["taxonomy_map"], indent=2),
592
- file_name="taxonomy_map.json",
593
- mime="application/json",
594
- key="dl_json",
595
- use_container_width=True,
596
- )
597
- with dl2:
598
- from dataclasses import asdict
599
- comp_df = pd.DataFrame([asdict(r) for r in agent_results["comparison_rows"]])
600
- st.download_button(
601
- "⬇ comparison.csv",
602
- comp_df.to_csv(index=False),
603
- file_name="comparison.csv",
604
- mime="text/csv",
605
- key="dl_csv",
606
- use_container_width=True,
607
- )
 
"""
app.py
------
Streamlit UI for the BERTopic + Dual LLM (Groq + Mistral) research paper analysis pipeline.

Flow: collect API keys and a dataset in the UI, run BERTopic via
``run_topic_modeling``, pass the resulting topics to ``run_agent``, cache the
agent output in ``st.session_state`` and render it as tables and downloads.
"""

import contextlib
import json
import os
import tempfile
from dataclasses import asdict

import pandas as pd
import streamlit as st

from tools import run_topic_modeling
from agent import run_agent

# Columns the pipeline reads from the (lower-cased) CSV header.
REQUIRED_COLUMNS = ("title", "abstract")

# Small built-in dataset used when "Use sample dataset" is ticked.
SAMPLE_DATA = {
    "title": [
        "Deep Learning for Image Classification",
        "Neural Networks in Healthcare",
        "Transformer Models for NLP",
        "BERT in Question Answering",
        "Blockchain and Distributed Ledger Technology",
        "Smart Contracts in Finance",
        "Federated Learning for Privacy",
        "Differential Privacy in ML",
        "Graph Neural Networks",
        "Knowledge Graph Embeddings",
    ],
    "abstract": [
        "We propose a deep learning model achieving state-of-the-art accuracy on image benchmarks.",
        "A convolutional network trained for medical image classification tasks.",
        "We introduce a transformer-based approach for text understanding.",
        "Fine-tuning BERT achieves strong results on reading comprehension datasets.",
        "This paper surveys blockchain consensus mechanisms and distributed ledger architectures.",
        "We implement smart contracts for automated financial transactions on a public blockchain.",
        "Federated learning enables collaborative model training without sharing raw data.",
        "Differential privacy provides formal privacy guarantees for machine learning models.",
        "Graph neural networks learn from relational data structures effectively.",
        "Knowledge graph embeddings enable link prediction and entity classification.",
    ],
}


def _write_temp_csv(payload):
    """Write *payload* (bytes) to a new temporary ``.csv`` file and return its path.

    The handle is closed before returning so the file can be re-opened by
    path on every platform. The caller is responsible for deleting the file.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
        tmp.write(payload)
    return tmp.name


def _interpretation_rows(interps):
    """Flatten a ``{topic_id: interpretation}`` mapping into table rows."""
    return [
        {
            "Topic ID": tid,
            "Label": interp.label,
            "Category": interp.taxonomy_category,
            "Classification": interp.classification,
            "Validation Status": interp.validation_status,
            "Confidence": interp.confidence,
            # Only the first eight keywords are shown to keep the column readable.
            "Keywords": ", ".join(interp.keywords[:8]),
        }
        for tid, interp in sorted(interps.items())
    ]


def _show_interpretations(heading, interps, empty_message):
    """Render one interpretations table, or *empty_message* when there are none."""
    st.subheader(heading)
    if interps:
        st.dataframe(pd.DataFrame(_interpretation_rows(interps)), use_container_width=True)
    else:
        st.info(empty_message)


# ---------------------------------------------------------------------------
# Page Config
# ---------------------------------------------------------------------------
st.set_page_config(page_title="Research Topic Analyzer", layout="wide")
st.title("Research Topic Analyzer")
st.caption("BERTopic + Groq + Mistral dual-validation pipeline")

# ---------------------------------------------------------------------------
# API Key Handling (sidebar input first, environment variable as fallback)
# ---------------------------------------------------------------------------
with st.sidebar:
    st.header("API Keys")
    groq_key_input = st.text_input(
        "Groq API Key",
        value="",
        type="password",
        placeholder="Uses GROQ_API_KEY env var if blank",
    )
    mistral_key_input = st.text_input(
        "Mistral API Key (optional)",
        value="",
        type="password",
        placeholder="Uses MISTRAL_API_KEY env var if blank",
    )
    st.caption("Keys are never stored. Leave blank to use environment variables.")

    st.divider()
    min_topic_size = st.slider("Min Topic Size", min_value=3, max_value=30, value=5)
    if st.button("Reset Results"):
        if "agent_results" in st.session_state:
            del st.session_state["agent_results"]
        st.rerun()

groq_api_key = groq_key_input.strip() or os.getenv("GROQ_API_KEY")
mistral_api_key = mistral_key_input.strip() or os.getenv("MISTRAL_API_KEY")

# ---------------------------------------------------------------------------
# Dataset Loading
# ---------------------------------------------------------------------------
st.subheader("Dataset")
use_sample = st.checkbox("Use sample dataset", value=False)

uploaded_file = None
if not use_sample:
    uploaded_file = st.file_uploader(
        "Upload CSV with 'title' and 'abstract' columns",
        type=["csv"],
    )

# ---------------------------------------------------------------------------
# Run Pipeline
# ---------------------------------------------------------------------------
run_btn = st.button("Run Pipeline", type="primary")

if run_btn:
    # --- Validate inputs ---
    if not groq_api_key:
        st.error("Groq API key is required. Provide it in the sidebar or set GROQ_API_KEY.")
        st.stop()

    if not use_sample and uploaded_file is None:
        st.error("Please upload a CSV file or enable the sample dataset.")
        st.stop()

    # --- Resolve CSV path ---
    # run_topic_modeling takes a file path, so the data is staged in a temp file.
    if use_sample:
        csv_bytes = pd.DataFrame(SAMPLE_DATA).to_csv(index=False).encode("utf-8")
    else:
        # getvalue() returns the whole upload regardless of the stream position.
        csv_bytes = uploaded_file.getvalue()
    csv_path = _write_temp_csv(csv_bytes)

    try:
        # -------------------------------------------------------------------
        # Step 1: Topic Modeling
        # -------------------------------------------------------------------
        with st.spinner("Running BERTopic (this may take a minute)…"):
            try:
                topic_results = run_topic_modeling(csv_path, min_topic_size=min_topic_size)
            except Exception as exc:
                st.error(f"Topic modeling failed: {exc}")
                st.stop()

        abstract_res = topic_results["abstracts"]
        title_res = topic_results["titles"]

        # Reload df for raw texts
        df = pd.read_csv(csv_path)
        df.columns = df.columns.str.lower()
        missing = [col for col in REQUIRED_COLUMNS if col not in df.columns]
        if missing:
            st.error(f"CSV is missing required column(s): {', '.join(missing)}")
            st.stop()
        raw_titles = df["title"].fillna("").tolist()
        raw_abstracts = df["abstract"].fillna("").tolist()
    finally:
        # Best-effort cleanup; also runs when st.stop() aborts the script.
        with contextlib.suppress(OSError):
            os.unlink(csv_path)

    # -----------------------------------------------------------------------
    # Step 2: Agent (LLM interpretation + dual validation)
    # -----------------------------------------------------------------------
    with st.spinner("Running LLM interpretation and Mistral validation…"):
        try:
            st.session_state["agent_results"] = run_agent(
                title_topic_keywords=title_res["topic_keywords"],
                abstract_topic_keywords=abstract_res["topic_keywords"],
                title_topic_assignments=title_res["topics"],
                abstract_topic_assignments=abstract_res["topics"],
                raw_titles=raw_titles,
                raw_abstracts=raw_abstracts,
                api_key=groq_api_key,
                mistral_api_key=mistral_api_key,
            )
            st.success("Pipeline complete!")
        except Exception as exc:
            st.error(f"Agent pipeline failed: {exc}")
            st.stop()

# ---------------------------------------------------------------------------
# Display Logic (Outside if run_btn to persist during interactions)
# ---------------------------------------------------------------------------
agent_results = st.session_state.get("agent_results")

if agent_results:
    taxonomy_map = agent_results.get("taxonomy_map", {})
    comparison_rows = agent_results.get("comparison_rows", [])
    # Built once and shared by the comparison table and the CSV download.
    comp_df = pd.DataFrame([asdict(row) for row in comparison_rows])

    # -----------------------------------------------------------------------
    # Display: Title / Abstract Topics
    # -----------------------------------------------------------------------
    _show_interpretations(
        "Title Topics",
        agent_results.get("title_interpretations", {}),
        "No title topics found.",
    )
    _show_interpretations(
        "Abstract Topics",
        agent_results.get("abstract_interpretations", {}),
        "No abstract topics found.",
    )

    # -----------------------------------------------------------------------
    # Display: Taxonomy Map
    # -----------------------------------------------------------------------
    st.subheader("Taxonomy Map")
    tabs = st.tabs(["Titles", "Abstracts"])
    for tab, section in zip(tabs, ["titles", "abstracts"]):
        with tab:
            entries = taxonomy_map.get(section, [])
            if entries:
                st.dataframe(
                    pd.DataFrame(entries)[[
                        "topic_id", "label", "taxonomy_category",
                        "classification", "validation_status", "confidence", "reasoning"
                    ]],
                    use_container_width=True,
                )
            else:
                st.info(f"No {section} taxonomy entries.")

    # -----------------------------------------------------------------------
    # Display: Comparison Table
    # -----------------------------------------------------------------------
    st.subheader("Title vs Abstract Comparison")
    if comparison_rows:
        st.dataframe(comp_df, use_container_width=True)
    else:
        st.info("No overlapping topics to compare.")

    # -----------------------------------------------------------------------
    # Downloads
    # -----------------------------------------------------------------------
    st.subheader("Downloads")
    col1, col2 = st.columns(2)
    with col1:
        st.download_button(
            "Download taxonomy_map.json",
            json.dumps(taxonomy_map, indent=2),
            file_name="taxonomy_map.json",
            mime="application/json",
            key="dl_json",
        )
    with col2:
        st.download_button(
            "Download comparison.csv",
            comp_df.to_csv(index=False),
            file_name="comparison.csv",
            mime="text/csv",
            key="dl_csv",
        )