"""
app.py
------
Streamlit UI — SPECTER2 + BERTopic + 3-LLM Council
Research Topic Analyzer for SPJIMR × SPIT Group 14
"""
|
|
import html
import json
import os
import tempfile

import pandas as pd
import streamlit as st

from agent import run_agent
from tools import run_topic_modeling
|
|
| |
# Browser-tab title, icon and layout for the app.
# NOTE(review): the original icon glyph was lost to a bad decode; this emoji
# is a stand-in — swap in the intended one if it differs.
st.set_page_config(
    page_title="TMIS Topic Analyzer",
    page_icon="📊",
    layout="wide",
    initial_sidebar_state="expanded",
)
|
|
| |
# Dark-theme stylesheet for the whole app, kept in one named constant so the
# injection call below stays a single readable line.
_APP_STYLESHEET = """
<style>
@import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;600&family=IBM+Plex+Sans:wght@300;400;500;600&display=swap');

html, body, [class*="css"] {
font-family: 'IBM Plex Sans', sans-serif;
}

/* App background */
.stApp {
background: #0d0f14;
color: #e8eaf0;
}

/* Sidebar */
[data-testid="stSidebar"] {
background: #13161e;
border-right: 1px solid #1f2333;
}
[data-testid="stSidebar"] * {
color: #b0b8cc !important;
}
[data-testid="stSidebar"] h1,
[data-testid="stSidebar"] h2,
[data-testid="stSidebar"] h3 {
color: #e8eaf0 !important;
font-family: 'IBM Plex Mono', monospace !important;
font-size: 0.8rem !important;
letter-spacing: 0.12em !important;
text-transform: uppercase !important;
}

/* Header */
.site-header {
padding: 2.5rem 0 1.5rem 0;
border-bottom: 1px solid #1f2333;
margin-bottom: 2rem;
}
.site-header h1 {
font-family: 'IBM Plex Mono', monospace;
font-size: 1.6rem;
font-weight: 600;
color: #e8eaf0;
letter-spacing: -0.01em;
margin: 0 0 0.3rem 0;
}
.site-header p {
font-size: 0.82rem;
color: #5a6480;
font-family: 'IBM Plex Mono', monospace;
margin: 0;
letter-spacing: 0.04em;
}

/* Pills / badges */
.pill {
display: inline-block;
font-family: 'IBM Plex Mono', monospace;
font-size: 0.68rem;
font-weight: 600;
letter-spacing: 0.08em;
text-transform: uppercase;
padding: 3px 10px;
border-radius: 2px;
margin-right: 6px;
}
.pill-blue { background: #0f2a4a; color: #4d9de0; border: 1px solid #1a4070; }
.pill-green { background: #0a2a1a; color: #3dba7a; border: 1px solid #1a4a2a; }
.pill-amber { background: #2a1f00; color: #e8a020; border: 1px solid #4a3500; }
.pill-red { background: #2a0f0f; color: #e04d4d; border: 1px solid #4a1a1a; }
.pill-gray { background: #1a1e2a; color: #7a8090; border: 1px solid #2a2e3a; }

/* Stats row */
.stat-grid {
display: grid;
grid-template-columns: repeat(4, 1fr);
gap: 1px;
background: #1f2333;
border: 1px solid #1f2333;
border-radius: 6px;
overflow: hidden;
margin-bottom: 2rem;
}
.stat-card {
background: #13161e;
padding: 1.25rem 1.5rem;
text-align: center;
}
.stat-val {
font-family: 'IBM Plex Mono', monospace;
font-size: 1.9rem;
font-weight: 600;
color: #e8eaf0;
line-height: 1;
margin-bottom: 0.3rem;
}
.stat-label {
font-size: 0.7rem;
color: #5a6480;
text-transform: uppercase;
letter-spacing: 0.1em;
font-family: 'IBM Plex Mono', monospace;
}

/* Section titles */
.section-title {
font-family: 'IBM Plex Mono', monospace;
font-size: 0.7rem;
font-weight: 600;
letter-spacing: 0.15em;
text-transform: uppercase;
color: #5a6480;
padding-bottom: 0.6rem;
border-bottom: 1px solid #1f2333;
margin-bottom: 1.2rem;
}

/* Topic cards */
.topic-card {
background: #13161e;
border: 1px solid #1f2333;
border-left: 3px solid #4d9de0;
border-radius: 4px;
padding: 1rem 1.25rem;
margin-bottom: 0.6rem;
transition: border-color 0.15s;
}
.topic-card:hover { border-left-color: #3dba7a; }
.topic-card.novel { border-left-color: #e8a020; }
.topic-label {
font-size: 0.92rem;
font-weight: 500;
color: #e8eaf0;
margin-bottom: 0.35rem;
}
.topic-meta {
font-family: 'IBM Plex Mono', monospace;
font-size: 0.7rem;
color: #5a6480;
}
.topic-kw {
font-family: 'IBM Plex Mono', monospace;
font-size: 0.68rem;
color: #3d6480;
margin-top: 0.4rem;
line-height: 1.5;
}

/* Validation panel */
.val-box {
background: #0a2a1a;
border: 1px solid #1a4a2a;
border-radius: 6px;
padding: 1.25rem 1.5rem;
margin-bottom: 1.5rem;
}
.val-box h4 {
font-family: 'IBM Plex Mono', monospace;
font-size: 0.72rem;
font-weight: 600;
letter-spacing: 0.1em;
text-transform: uppercase;
color: #3dba7a;
margin: 0 0 0.75rem 0;
}
.val-row {
display: flex;
justify-content: space-between;
align-items: center;
padding: 0.4rem 0;
border-bottom: 1px solid #1a3a2a;
font-size: 0.8rem;
color: #a0b8a8;
}
.val-row:last-child { border-bottom: none; }
.val-key { color: #5a7a6a; }
.val-num { font-family: 'IBM Plex Mono', monospace; color: #3dba7a; font-weight: 600; }

/* LLM council badge row */
.council-row {
display: flex;
gap: 8px;
margin-bottom: 1rem;
flex-wrap: wrap;
}

/* Run button */
.stButton > button {
background: #4d9de0 !important;
color: #0d0f14 !important;
border: none !important;
border-radius: 3px !important;
font-family: 'IBM Plex Mono', monospace !important;
font-size: 0.78rem !important;
font-weight: 600 !important;
letter-spacing: 0.08em !important;
text-transform: uppercase !important;
padding: 0.6rem 2rem !important;
transition: background 0.15s !important;
}
.stButton > button:hover {
background: #3d8ed0 !important;
}

/* Input overrides */
.stTextInput input, .stSelectbox select {
background: #13161e !important;
border: 1px solid #1f2333 !important;
color: #e8eaf0 !important;
font-family: 'IBM Plex Mono', monospace !important;
font-size: 0.82rem !important;
border-radius: 3px !important;
}

/* Dataframe */
.stDataFrame {
background: #13161e;
border: 1px solid #1f2333;
border-radius: 4px;
}

/* Download buttons */
.stDownloadButton > button {
background: transparent !important;
color: #4d9de0 !important;
border: 1px solid #1a4070 !important;
border-radius: 3px !important;
font-family: 'IBM Plex Mono', monospace !important;
font-size: 0.72rem !important;
letter-spacing: 0.08em !important;
}

/* Expander */
.streamlit-expanderHeader {
background: #13161e !important;
border: 1px solid #1f2333 !important;
font-family: 'IBM Plex Mono', monospace !important;
font-size: 0.78rem !important;
color: #a0a8c0 !important;
}

/* Progress / spinner */
.stSpinner > div { border-top-color: #4d9de0 !important; }

/* Divider */
hr { border-color: #1f2333 !important; }

/* Alerts */
.stAlert { border-radius: 4px !important; }
</style>
"""

# unsafe_allow_html is needed so the <style> tag is emitted as markup
# instead of being shown as text.
st.markdown(_APP_STYLESHEET, unsafe_allow_html=True)
|
|
| |
# Page banner: app title plus a one-line summary of the pipeline stages.
# Separator dots restored from a mis-decoded middle-dot character.
st.markdown("""
<div class="site-header">
<h1>Research Topic Analyzer</h1>
<p>SPECTER2 embeddings · HDBSCAN/UMAP clustering · 3-LLM Council (Groq + Mistral + Gemini) · PAJAIS validation</p>
</div>
""", unsafe_allow_html=True)
|
|
| |
# Sidebar: API-key inputs, the one tunable clustering parameter, a static
# description of the LLM council, and a button that clears stored results.
with st.sidebar:
    st.markdown("### API Keys")
    groq_key_input = st.text_input("Groq API Key", type="password", placeholder="GROQ_API_KEY env var")
    mistral_key_input = st.text_input("Mistral API Key", type="password", placeholder="MISTRAL_API_KEY env var")
    gemini_key_input = st.text_input("Gemini API Key", type="password", placeholder="GEMINI_API_KEY env var")
    st.caption("Keys are never stored. Leave blank to use env vars.")

    st.markdown("---")
    st.markdown("### Clustering Parameters")
    min_topic_size = st.slider(
        "Min papers per cluster",
        min_value=3,
        max_value=20,
        value=5,
        help="Prof. Kamat spec: min=5",
    )
    # The pills below are informational only; they are not wired to any input.
    st.markdown(
        "<span class='pill pill-blue'>Min clusters: 15</span>"
        "<span class='pill pill-blue'>Max clusters: 30</span>",
        unsafe_allow_html=True,
    )
    st.markdown(
        "<span class='pill pill-gray'>Cosine sim: 0.50–0.55</span>",
        unsafe_allow_html=True,
    )

    st.markdown("---")
    st.markdown("### LLM Council")
    st.markdown("""
<div class="council-row">
<span class="pill pill-blue">Groq / LLaMA-3.1</span>
<span class="pill pill-green">Mistral Small</span>
<span class="pill pill-amber">Gemini 2.5 Flash</span>
</div>
<p style="font-size:0.72rem;color:#5a6480;font-family:'IBM Plex Mono',monospace;">
Majority vote → best label selected.<br>
Keyword-overlap fallback if no consensus.
</p>
""", unsafe_allow_html=True)

    st.markdown("---")
    if st.button("Reset Results", use_container_width=True):
        # Drop both stored artefacts, then rerun so the empty state renders.
        for key in ["agent_results", "topic_stats"]:
            st.session_state.pop(key, None)
        st.rerun()
|
|
# A key typed into the sidebar wins; a blank (or whitespace-only) field falls
# back to the matching environment variable, which may itself be unset (None).
groq_api_key, mistral_api_key, gemini_api_key = (
    typed_value.strip() or os.getenv(env_name)
    for typed_value, env_name in (
        (groq_key_input, "GROQ_API_KEY"),
        (mistral_key_input, "MISTRAL_API_KEY"),
        (gemini_key_input, "GEMINI_API_KEY"),
    )
)
|
|
| |
# Dataset section: CSV upload on the left, sample-data toggle on the right.
st.markdown("<div class='section-title'>Dataset</div>", unsafe_allow_html=True)

col_up, col_sample = st.columns([3, 1])
with col_up:
    uploaded_file = st.file_uploader(
        "Upload Scopus CSV — must contain 'title' and 'abstract' columns",
        type=["csv"],
        help="Export your corpus from Scopus as CSV. The tool will combine Title + Abstract into one SPECTER2 vector per paper.",
    )
with col_sample:
    # Spacer so the checkbox sits lower in its column.
    st.markdown("<br>", unsafe_allow_html=True)
    use_sample = st.checkbox("Use sample dataset (50 papers)", value=False)
|
|
# Quick sanity preview of the uploaded CSV: row count, column count, and
# whether the two required columns are present (case-insensitive).
if uploaded_file and not use_sample:
    try:
        df_preview = pd.read_csv(uploaded_file)
        col_a, col_b, col_c = st.columns(3)
        col_a.metric("Papers detected", len(df_preview))
        col_b.metric("Columns", len(df_preview.columns))
        has_both = {"title", "abstract"}.issubset(set(df_preview.columns.str.lower()))
        col_c.metric("Title + Abstract", "✓ present" if has_both else "✗ missing")
        if not has_both:
            st.error("CSV must have both 'title' and 'abstract' columns.")
    except Exception as e:
        # UI boundary: show the problem instead of crashing the page.
        st.error(f"Could not preview CSV: {e}")
    finally:
        # Always rewind, even when parsing failed part-way, so a later read
        # of the upload starts from the first byte.
        uploaded_file.seek(0)
|
|
| |
# Spacer, then the button that triggers the pipeline run.
st.markdown("<br>", unsafe_allow_html=True)
run_btn = st.button("▶ Run Full Pipeline", type="primary")
|
|
# Pipeline run: validate inputs, write the dataset to a temp CSV, run topic
# modelling, run the LLM council, and store results + summary stats in
# session state for the rendering code further down.
if run_btn:
    # All three providers are required; list every missing one at once.
    missing_keys = [
        provider
        for provider, key in (
            ("Groq", groq_api_key),
            ("Mistral", mistral_api_key),
            ("Gemini", gemini_api_key),
        )
        if not key
    ]
    if missing_keys:
        st.error(f"Missing API key(s): {', '.join(missing_keys)}. All three are required for the LLM council.")
        st.stop()

    if not use_sample and uploaded_file is None:
        st.error("Please upload a CSV file or enable the sample dataset.")
        st.stop()

    # Build the CSV payload in memory first so the temp file is written once
    # and its handle is closed before anything else opens it by name.
    if use_sample:
        topics_pool = [
            ("Deep Learning for Healthcare Prediction", "We apply LSTM networks to predict patient readmission from EHR data."),
            ("Process Mining in Enterprise Systems", "Event log analysis using Petri nets for conformance checking in ERP workflows."),
            ("Recommender Systems Collaborative Filtering", "Matrix factorization techniques applied to e-commerce product recommendation."),
            ("LLM Applications in Information Systems", "GPT-4 used for automated requirements extraction from stakeholder documents."),
            ("Blockchain Smart Contract Security", "Formal verification of Solidity smart contracts for financial transaction safety."),
            ("Federated Learning Privacy Preservation", "Differential privacy mechanisms for distributed model training across hospitals."),
            ("Cybersecurity Intrusion Detection", "Random forest classifiers for network anomaly detection in enterprise environments."),
            ("Natural Language Processing Sentiment", "BERT fine-tuning for aspect-level sentiment analysis in product reviews."),
            ("Knowledge Graph Embedding", "TransE and RotatE models for biomedical entity relation prediction."),
            ("Computer Vision Medical Imaging", "CNN architectures for diabetic retinopathy grading from fundus photographs."),
        ]
        # 50 synthetic papers cycling through the 10 templates.
        rows = []
        for i in range(50):
            t, a = topics_pool[i % len(topics_pool)]
            rows.append({"title": t, "abstract": a + f" Study {i+1}.", "doi": f"10.1145/sample.{i+1}"})
        csv_bytes = pd.DataFrame(rows).to_csv(index=False).encode("utf-8")
    else:
        # getvalue() returns the whole upload regardless of where an earlier
        # read left the stream position.
        csv_bytes = uploaded_file.getvalue()

    # delete=False because the file is reopened by path after this handle is
    # closed; it is removed explicitly in the finally below.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
        tmp.write(csv_bytes)
        csv_path = tmp.name

    try:
        # Step 1: embeddings + clustering.
        progress_bar = st.progress(0, text="Step 1/2 — SPECTER2 embeddings + HDBSCAN clustering (15–30 clusters)…")
        try:
            topic_results = run_topic_modeling(csv_path, min_topic_size=min_topic_size)
            n_clusters = len(topic_results["documents"]["topic_keywords"])
            progress_bar.progress(50, text=f"Step 1/2 — Done. {n_clusters} clusters found.")
        except Exception as exc:
            st.error(f"Topic modeling failed: {exc}")
            st.stop()

        # Step 2: label each cluster with the three-model council.
        progress_bar.progress(55, text="Step 2/2 — 3-LLM Council labelling (Groq + Mistral + Gemini)…")
        try:
            agent_results = run_agent(
                topic_results=topic_results,
                groq_key=groq_api_key,
                mistral_key=mistral_api_key,
                gemini_key=gemini_api_key,
            )
            progress_bar.progress(100, text="Pipeline complete.")
            st.session_state["agent_results"] = agent_results

            # Summary figures shown in the stats grid and validation panel.
            interps = agent_results.get("interpretations", {})
            novel_count = sum(1 for i in interps.values() if i.classification == "NOVEL")
            mapped_count = sum(1 for i in interps.values() if i.classification == "MAPPED")
            total_papers = sum(i.paper_count for i in interps.values())
            st.session_state["topic_stats"] = {
                "n_topics": len(interps),
                "novel": novel_count,
                "mapped": mapped_count,
                "total_papers": total_papers,
                # Record the setting actually used for this run, since the
                # slider may be moved afterwards.
                "min_topic_size": min_topic_size,
            }
            st.success(f"Pipeline complete — {len(interps)} topics labelled by 3-LLM council.")
        except Exception as exc:
            st.error(f"LLM council failed: {exc}")
            st.stop()
    finally:
        # Remove the temp input copy whether the run succeeded or aborted.
        # NOTE(review): assumes nothing reads this input path after run_agent
        # returns — confirm against tools.py / agent.py.
        try:
            os.unlink(csv_path)
        except OSError:
            pass  # best-effort cleanup; a leftover temp file is harmless
|
|
| |
# Results rendering. Reads what the pipeline run stored in session state and
# shows: summary stats, the spec-compliance panel, a filterable topic
# table/cards view, charts, downloads, and a methodology note. When nothing
# has been stored yet, shows a placeholder instead.
results = st.session_state.get("agent_results")
stats = st.session_state.get("topic_stats")

if results and stats:
    interps = results.get("interpretations", {})

    # Share of topics in each class; guarded against an empty topic set.
    novel_pct = round(stats['novel'] / stats['n_topics'] * 100) if stats['n_topics'] else 0
    mapped_pct = round(stats['mapped'] / stats['n_topics'] * 100) if stats['n_topics'] else 0
    # Prefer the cluster-size setting recorded with the run; fall back to the
    # current slider value when the stored stats do not carry it.
    used_min_size = stats.get("min_topic_size", min_topic_size)

    # ---- Summary grid -----------------------------------------------------
    st.markdown("<div class='section-title'>Pipeline Summary</div>", unsafe_allow_html=True)
    st.markdown(f"""
<div class="stat-grid">
<div class="stat-card">
<div class="stat-val">{stats['n_topics']}</div>
<div class="stat-label">Topics Found</div>
</div>
<div class="stat-card">
<div class="stat-val">{stats['total_papers']}</div>
<div class="stat-label">Papers Assigned</div>
</div>
<div class="stat-card">
<div class="stat-val">{stats['novel']}</div>
<div class="stat-label">NOVEL (no PAJAIS home)</div>
</div>
<div class="stat-card">
<div class="stat-val">{stats['mapped']}</div>
<div class="stat-label">MAPPED to PAJAIS</div>
</div>
</div>
""", unsafe_allow_html=True)

    # ---- Spec-compliance panel -------------------------------------------
    st.markdown("<div class='section-title'>LLM Council Validation</div>", unsafe_allow_html=True)
    st.markdown(f"""
<div class="val-box">
<h4>Instructor Spec Compliance</h4>
<div class="val-row"><span class="val-key">Embedding model</span><span class="val-num">SPECTER2 (allenai/specter2_base)</span></div>
<div class="val-row"><span class="val-key">Input column</span><span class="val-num">Title + Abstract (combined)</span></div>
<div class="val-row"><span class="val-key">Clustering</span><span class="val-num">UMAP → HDBSCAN (min=5, max=100 per cluster)</span></div>
<div class="val-row"><span class="val-key">Cosine similarity range</span><span class="val-num">0.50 – 0.55 (merge / outlier reassign)</span></div>
<div class="val-row"><span class="val-key">Total clusters</span><span class="val-num">{stats['n_topics']} (target: 15–30)</span></div>
<div class="val-row"><span class="val-key">LLM council</span><span class="val-num">Groq (LLaMA-3.1) + Mistral Small + Gemini 2.5 Flash</span></div>
<div class="val-row"><span class="val-key">Label selection</span><span class="val-num">Majority vote → keyword-overlap fallback</span></div>
<div class="val-row"><span class="val-key">Rep. docs per topic</span><span class="val-num">Top-3 by cosine similarity to centroid</span></div>
<div class="val-row"><span class="val-key">NOVEL themes (no PAJAIS home)</span><span class="val-num">{novel_pct}% ({stats['novel']} topics)</span></div>
<div class="val-row"><span class="val-key">MAPPED to PAJAIS taxonomy</span><span class="val-num">{mapped_pct}% ({stats['mapped']} topics)</span></div>
</div>
""", unsafe_allow_html=True)

    # ---- Topic table ------------------------------------------------------
    st.markdown("<div class='section-title'>Topic Results</div>", unsafe_allow_html=True)

    rows = [
        {
            "Topic ID": tid,
            "Label": interp.label,
            "Classification": interp.classification,
            "Category": interp.category,
            "Papers": interp.paper_count,
            "Keywords": ", ".join(interp.keywords[:8]),
        }
        for tid, interp in sorted(interps.items())
    ]
    df_res = pd.DataFrame(rows).sort_values("Papers", ascending=False).reset_index(drop=True)

    # The option labels must be distinct: the selectbox returns the chosen
    # label and the sort branch below is picked by comparing against it.
    sort_desc, sort_asc, sort_label = "Papers ↓", "Papers ↑", "Label A→Z"

    col_f1, col_f2, col_f3 = st.columns([2, 2, 1])
    with col_f1:
        cats = ["All"] + sorted(df_res["Category"].unique().tolist())
        sel_cat = st.selectbox("Filter by category", cats)
    with col_f2:
        clsf = ["All", "NOVEL", "MAPPED"]
        sel_cls = st.selectbox("Filter by classification", clsf)
    with col_f3:
        sort_by = st.selectbox("Sort by", [sort_desc, sort_asc, sort_label])

    df_f = df_res.copy()
    if sel_cat != "All":
        df_f = df_f[df_f["Category"] == sel_cat]
    if sel_cls != "All":
        df_f = df_f[df_f["Classification"] == sel_cls]
    if sort_by == sort_desc:
        df_f = df_f.sort_values("Papers", ascending=False)
    elif sort_by == sort_asc:
        df_f = df_f.sort_values("Papers", ascending=True)
    else:
        df_f = df_f.sort_values("Label")
    df_f = df_f.reset_index(drop=True)

    st.caption(f"Showing {len(df_f)} of {len(df_res)} topics")

    view_mode = st.radio("View as", ["Table", "Cards"], horizontal=True)

    if view_mode == "Table":
        st.dataframe(df_f, use_container_width=True, height=420)
    else:
        for _, row in df_f.iterrows():
            is_novel = row["Classification"] == "NOVEL"
            cls_pill = (
                "<span class='pill pill-amber'>NOVEL</span>"
                if is_novel
                else "<span class='pill pill-green'>MAPPED</span>"
            )
            card_cls = "topic-card novel" if is_novel else "topic-card"
            # Label, category and keywords come from LLM output and the
            # uploaded CSV; escape them before placing them in raw HTML.
            safe_label = html.escape(str(row['Label']))
            safe_category = html.escape(str(row['Category']))
            safe_keywords = html.escape(str(row['Keywords']))
            st.markdown(f"""
<div class="{card_cls}">
<div class="topic-label">{safe_label}</div>
<div class="topic-meta">
{cls_pill}
<span class="pill pill-gray">{safe_category}</span>
<span class="pill pill-blue">{row['Papers']} papers</span>
</div>
<div class="topic-kw">{safe_keywords}</div>
</div>
""", unsafe_allow_html=True)

    # ---- Frequency chart (follows the active filters) ---------------------
    st.markdown("<br>", unsafe_allow_html=True)
    with st.expander("Topic frequency chart", expanded=False):
        chart_df = df_f[["Label", "Papers"]].copy()
        # Shorten long labels so the axis stays readable.
        chart_df["Label"] = chart_df["Label"].apply(lambda x: x[:35] + "…" if len(x) > 35 else x)
        chart_df = chart_df.set_index("Label")
        st.bar_chart(chart_df, height=380)

    # ---- NOVEL vs MAPPED side by side -------------------------------------
    with st.expander("NOVEL vs PAJAIS breakdown — for paper §4.6", expanded=False):
        col_n, col_m = st.columns(2)
        with col_n:
            st.markdown("**NOVEL topics (no PAJAIS home)**")
            novel_df = df_f[df_f["Classification"] == "NOVEL"][["Label", "Papers", "Category"]].reset_index(drop=True)
            st.dataframe(novel_df, use_container_width=True)
        with col_m:
            st.markdown("**MAPPED topics (PAJAIS match)**")
            mapped_df = df_f[df_f["Classification"] == "MAPPED"][["Label", "Papers", "Category"]].reset_index(drop=True)
            st.dataframe(mapped_df, use_container_width=True)

    # ---- Representative papers (headings only; titles live in topics.json) -
    with st.expander("Representative papers per topic (top-3 by centroid proximity)", expanded=False):
        for tid, interp in sorted(interps.items()):
            st.markdown(f"**Topic {tid} — {interp.label}**")
            st.caption("See topics.json for full representative document titles.")
        st.info("Download topics.json below to see the 3 representative paper titles per cluster used for LLM labelling.")

    # ---- Downloads --------------------------------------------------------
    st.markdown("<div class='section-title'>Downloads</div>", unsafe_allow_html=True)
    col_d1, col_d2, col_d3 = st.columns(3)
    with col_d1:
        try:
            # Read as bytes so the file is offered exactly as written,
            # independent of the platform's default text encoding.
            with open(results["json_path"], "rb") as f:
                st.download_button(
                    "⬇ topics.json",
                    f.read(),
                    file_name="tmis_topics.json",
                    mime="application/json",
                    use_container_width=True,
                )
        except Exception:
            st.warning("JSON file not found.")
    with col_d2:
        try:
            df_dl = pd.read_csv(results["csv_path"])
            st.download_button(
                "⬇ topics.csv",
                df_dl.to_csv(index=False),
                file_name="tmis_topics.csv",
                mime="text/csv",
                use_container_width=True,
            )
        except Exception:
            st.warning("CSV file not found.")
    with col_d3:
        # Unfiltered results table, as built above.
        st.download_button(
            "⬇ results table",
            df_res.to_csv(index=False),
            file_name="tmis_topic_results.csv",
            mime="text/csv",
            use_container_width=True,
        )

    # ---- Methodology note -------------------------------------------------
    st.markdown("<br>", unsafe_allow_html=True)
    with st.expander("§3.4 methodology note — paste into paper", expanded=False):
        methodology_note = f"""Pipeline A (Unsupervised Discovery): SPECTER2 (allenai/specter2_base) generates one
768-dimensional document embedding per paper from a combined Title + Abstract column.
UMAP (n_neighbors=15, n_components=5, metric=cosine) reduces dimensionality; HDBSCAN
(min_cluster_size={used_min_size}, metric=euclidean, cluster_selection=eom) clusters embeddings.
Cosine similarity threshold 0.50–0.55 governs cluster merging and outlier reassignment.
Total clusters constrained to 15–30 via iterative split/merge.

Pipeline B (LLM Council Validation): For each cluster, the 3 papers nearest the centroid
(by cosine similarity) are passed as representative titles to 3 independent LLMs:
Groq/LLaMA-3.1-8b, Mistral-Small-Latest, and Gemini-2.5-Flash. Each LLM returns a
structured JSON with label, taxonomy_category, and classification (MAPPED/NOVEL).
Majority vote selects the final label; keyword-overlap fallback applies when no consensus.
This is the 3-LLM Council approach validating AI output without using the same model
for self-validation (per Carlsen & Ralund, 2022 CALM principle).

Results: {stats['n_topics']} clusters discovered. {novel_pct}% classified as NOVEL
(no PAJAIS 2019 home). {mapped_pct}% MAPPED to existing PAJAIS categories."""
        st.code(methodology_note, language="text")

elif not results:
    # Nothing has been run yet: show the three-step hint.
    st.markdown("""
<div style="text-align:center;padding:4rem 2rem;border:1px dashed #1f2333;border-radius:6px;margin-top:2rem;">
<p style="font-family:'IBM Plex Mono',monospace;font-size:0.8rem;color:#3a4060;letter-spacing:0.1em;">
UPLOAD CSV → ENTER API KEYS → RUN PIPELINE
</p>
<p style="font-size:0.75rem;color:#2a3050;margin-top:0.5rem;">
SPECTER2 embeddings · HDBSCAN · 3-LLM council · PAJAIS validation
</p>
</div>
""", unsafe_allow_html=True)