BERTopic_AGENTIC_AI__GROUP_1

Sleeping

App Files Files Community

anujjuna committed on May 2

Commit

0a624b3

verified ·

1 Parent(s): 099d241

Update app.py

Browse files

Files changed (1) hide show

app.py +530 -485

app.py CHANGED Viewed

@@ -1,436 +1,403 @@
 """
 app.py
 ------
-Streamlit UI for the BERTopic + Dual LLM (Groq + Mistral) research paper analysis pipeline.
-Redesigned with a clean, dark editorial aesthetic.
 """
 import os
-# Must be set before streamlit imports so HF Spaces proxy can reach the app
-os.environ["STREAMLIT_SERVER_PORT"] = "7860"
-os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"
-os.environ["STREAMLIT_SERVER_ENABLE_CORS"] = "false"
-os.environ["STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION"] = "false"
 import json
 import tempfile
 import pandas as pd
 import streamlit as st
 from tools import run_topic_modeling
 from agent import run_agent
-# ---------------------------------------------------------------------------
-# Page Config & Global Styles
-# ---------------------------------------------------------------------------
 st.set_page_config(
-    page_title="Arxiv Lens · Topic Analyzer",
-    page_icon="🔬",
     layout="wide",
     initial_sidebar_state="expanded",
 )
 st.markdown("""
 <style>
-@import url('https://fonts.googleapis.com/css2?family=DM+Serif+Display:ital@0;1&family=DM+Mono:wght@400;500&family=DM+Sans:wght@300;400;500&display=swap');
-/* ── Global Reset ─────────────────────────────────────────── */
 html, body, [class*="css"] {
-    font-family: 'DM Sans', sans-serif;
 }
 .stApp {
-    background-color: #0d0f14;
-    color: #e8e4dc;
 }
-/* ── Sidebar ──────────────────────────────────────────────── */
 [data-testid="stSidebar"] {
-    background-color: #111318 !important;
-    border-right: 1px solid #1e2028;
 }
 [data-testid="stSidebar"] * {
-    color: #c8c4bc !important;
 }
-.sidebar-logo {
-    font-family: 'DM Serif Display', serif;
-    font-size: 1.5rem;
-    color: #f0ebe0 !important;
-    letter-spacing: -0.02em;
-    margin-bottom: 0.2rem;
-}
-.sidebar-tagline {
-    font-size: 0.72rem;
-    color: #5a5f6e !important;
-    text-transform: uppercase;
-    letter-spacing: 0.12em;
-    margin-bottom: 1.5rem;
 }
-/* ── Header ───────────────────────────────────────────────── */
-.hero {
     padding: 2.5rem 0 1.5rem 0;
-    border-bottom: 1px solid #1e2028;
     margin-bottom: 2rem;
 }
-.hero-title {
-    font-family: 'DM Serif Display', serif;
-    font-size: 2.8rem;
-    color: #f0ebe0;
-    letter-spacing: -0.03em;
-    line-height: 1.1;
     margin: 0;
 }
-.hero-title em {
-    font-style: italic;
-    color: #c8a97e;
 }
-.hero-sub {
-    font-size: 0.88rem;
-    color: #5a5f6e;
-    margin-top: 0.5rem;
     text-transform: uppercase;
     letter-spacing: 0.1em;
 }
-/* ── Key Pill ─────────────────────────────────────────────── */
-.key-required {
-    display: inline-block;
-    background: #1a1d25;
-    border: 1px solid #2e3240;
-    border-radius: 4px;
-    padding: 0.15rem 0.5rem;
-    font-family: 'DM Mono', monospace;
-    font-size: 0.72rem;
-    color: #c8a97e;
-    margin-bottom: 0.4rem;
 }
-.key-optional {
-    display: inline-block;
-    background: #1a1d25;
-    border: 1px solid #2e3240;
     border-radius: 4px;
-    padding: 0.15rem 0.5rem;
-    font-family: 'DM Mono', monospace;
-    font-size: 0.72rem;
-    color: #5a8a6e;
-    margin-bottom: 0.4rem;
 }
-/* ── Section Headers ──────────────────────────────────────── */
-.section-label {
-    font-family: 'DM Mono', monospace;
-    font-size: 0.68rem;
-    color: #5a5f6e;
-    text-transform: uppercase;
-    letter-spacing: 0.14em;
-    margin-bottom: 0.75rem;
-    padding-bottom: 0.4rem;
-    border-bottom: 1px solid #1e2028;
 }
-/* ── Stat Cards ───────────────────────────────────────────── */
-.stat-card {
-    background: #111318;
-    border: 1px solid #1e2028;
-    border-radius: 8px;
-    padding: 1.2rem 1.4rem;
-    margin-bottom: 0.75rem;
 }
-.stat-number {
-    font-family: 'DM Serif Display', serif;
-    font-size: 2.4rem;
-    color: #c8a97e;
-    line-height: 1;
 }
-.stat-label {
-    font-size: 0.75rem;
-    color: #5a5f6e;
-    text-transform: uppercase;
     letter-spacing: 0.1em;
-    margin-top: 0.3rem;
 }
-/* ── Pipeline Step Badges ─────────────────────────────────── */
-.step-row {
     display: flex;
     align-items: center;
-    gap: 1rem;
-    margin-bottom: 0.5rem;
 }
-.step-num {
-    font-family: 'DM Mono', monospace;
-    font-size: 0.7rem;
-    color: #0d0f14;
-    background: #c8a97e;
-    border-radius: 50%;
-    width: 1.4rem;
-    height: 1.4rem;
     display: flex;
-    align-items: center;
-    justify-content: center;
-    flex-shrink: 0;
-    font-weight: 500;
-}
-.step-text {
-    font-size: 0.82rem;
-    color: #8a8f9e;
 }
-/* ── Buttons ──────────────────────────────────────────────── */
 .stButton > button {
-    background: #c8a97e !important;
     color: #0d0f14 !important;
     border: none !important;
-    border-radius: 6px !important;
-    font-family: 'DM Mono', monospace !important;
-    font-size: 0.8rem !important;
-    font-weight: 500 !important;
     letter-spacing: 0.08em !important;
     text-transform: uppercase !important;
-    padding: 0.6rem 1.4rem !important;
-    transition: all 0.15s ease !important;
 }
 .stButton > button:hover {
-    background: #debb94 !important;
-    transform: translateY(-1px) !important;
-}
-/* ── Inputs ───────────────────────────────────────────────── */
-.stTextInput > div > div > input,
-.stSelectbox > div > div,
-.stSlider {
-    background-color: #111318 !important;
-    border-color: #2e3240 !important;
-    color: #e8e4dc !important;
 }
-/* ── Dataframe ────────────────────────────────────────────── */
-[data-testid="stDataFrame"] {
-    border: 1px solid #1e2028 !important;
-    border-radius: 8px !important;
-    overflow: hidden;
-}
-/* ── Upload zone ──────────────────────────────────────────── */
-[data-testid="stFileUploader"] {
-    background: #111318;
-    border: 1px dashed #2e3240 !important;
-    border-radius: 8px;
-}
-/* ── Expanders ────────────────────────────────────────────── */
-.streamlit-expanderHeader {
-    background-color: #111318 !important;
-    border: 1px solid #1e2028 !important;
-    border-radius: 6px !important;
-    color: #c8c4bc !important;
     font-size: 0.82rem !important;
 }
-/* ── Tabs ─────────────────────────────────────────────────── */
-.stTabs [data-baseweb="tab-list"] {
-    gap: 0;
-    border-bottom: 1px solid #1e2028;
-    background: transparent;
-}
-.stTabs [data-baseweb="tab"] {
-    font-family: 'DM Mono', monospace;
-    font-size: 0.75rem;
-    text-transform: uppercase;
-    letter-spacing: 0.1em;
-    color: #5a5f6e !important;
-    background: transparent !important;
-    border: none !important;
-    padding: 0.6rem 1.2rem;
-}
-.stTabs [aria-selected="true"] {
-    color: #c8a97e !important;
-    border-bottom: 2px solid #c8a97e !important;
-}
-/* ── Success / Error ──────────────────────────────────────── */
-.stSuccess {
-    background: #0d1f16 !important;
-    border-left: 3px solid #4caf7d !important;
-    border-radius: 4px !important;
-}
-.stError {
-    background: #1f0d0d !important;
-    border-left: 3px solid #cf4f4f !important;
-    border-radius: 4px !important;
 }
-/* ── Download buttons ─────────────────────────────────────── */
 .stDownloadButton > button {
     background: transparent !important;
-    color: #c8a97e !important;
-    border: 1px solid #c8a97e !important;
-    border-radius: 6px !important;
-    font-family: 'DM Mono', monospace !important;
-    font-size: 0.75rem !important;
     letter-spacing: 0.08em !important;
 }
-.stDownloadButton > button:hover {
-    background: #c8a97e22 !important;
 }
-/* ── Divider ──────────────────────────────────────────────── */
-hr {
-    border-color: #1e2028 !important;
-    margin: 1.5rem 0 !important;
-}
 </style>
 """, unsafe_allow_html=True)
-# ---------------------------------------------------------------------------
-# Sidebar
-# ---------------------------------------------------------------------------
-with st.sidebar:
-    st.markdown('<div class="sidebar-logo">Arxiv Lens</div>', unsafe_allow_html=True)
-    st.markdown('<div class="sidebar-tagline">Research Topic Analyzer</div>', unsafe_allow_html=True)
-    st.markdown('<div class="section-label">API Keys</div>', unsafe_allow_html=True)
-    st.markdown('<span class="key-required">REQUIRED · GROQ</span>', unsafe_allow_html=True)
-    groq_key_input = st.text_input(
-        "Groq API Key",
-        value="",
-        type="password",
-        placeholder="gsk_...",
-        label_visibility="collapsed",
-    )
-    st.markdown('<span class="key-optional">OPTIONAL · MISTRAL</span>', unsafe_allow_html=True)
-    mistral_key_input = st.text_input(
-        "Mistral API Key",
-        value="",
-        type="password",
-        placeholder="For dual-LLM validation",
-        label_visibility="collapsed",
-    )
-    st.caption("Keys are never stored. Falls back to env vars if blank.")
     st.markdown("---")
-    st.markdown('<div class="section-label">Model Settings</div>', unsafe_allow_html=True)
-    min_topic_size = st.slider("Min Topic Size", min_value=3, max_value=30, value=5)
     st.markdown("---")
-    st.markdown('<div class="section-label">Pipeline</div>', unsafe_allow_html=True)
-    for i, step in enumerate([
-        "BERTopic clusters abstracts + titles",
-        "Groq LLM labels each cluster",
-        "Mistral validates Groq's labels",
-        "Cross-source diff report generated",
-    ], 1):
-        st.markdown(f"""
-        <div class="step-row">
-            <div class="step-num">{i}</div>
-            <div class="step-text">{step}</div>
-        </div>
-        """, unsafe_allow_html=True)
     st.markdown("---")
-    if st.button("↺ Reset Results", use_container_width=True):
-        if "agent_results" in st.session_state:
-            del st.session_state["agent_results"]
         st.rerun()
-groq_api_key = groq_key_input.strip() or os.getenv("GROQ_API_KEY")
 mistral_api_key = mistral_key_input.strip() or os.getenv("MISTRAL_API_KEY")
-# ---------------------------------------------------------------------------
-# Hero
-# ---------------------------------------------------------------------------
-st.markdown("""
-<div class="hero">
-    <h1 class="hero-title">Research<br><em>Topic Intelligence</em></h1>
-    <p class="hero-sub">BERTopic · Groq llama-3.1 · Mistral Validation</p>
-</div>
-""", unsafe_allow_html=True)
-# ---------------------------------------------------------------------------
-# Dataset Input
-# ---------------------------------------------------------------------------
-st.markdown('<div class="section-label">Dataset</div>', unsafe_allow_html=True)
-col_a, col_b = st.columns([3, 1])
-with col_a:
     uploaded_file = st.file_uploader(
-        "Upload a CSV with **title** and **abstract** columns",
         type=["csv"],
-        help="Must have at minimum 'title' and 'abstract' columns. More rows = richer topics.",
     )
-with col_b:
     st.markdown("<br>", unsafe_allow_html=True)
-    use_sample = st.checkbox("Use built-in sample dataset", value=False)
-st.markdown("---")
-# ---------------------------------------------------------------------------
-# Run Pipeline
-# ---------------------------------------------------------------------------
-run_btn = st.button("▶  Run Analysis Pipeline", use_container_width=False)
 if run_btn:
-    if not groq_api_key:
-        st.error("**Groq API key required.** Enter it in the sidebar or set `GROQ_API_KEY` in your environment.")
         st.stop()
     if not use_sample and uploaded_file is None:
-        st.error("**No dataset.** Upload a CSV or enable the sample dataset.")
         st.stop()
-    # Resolve CSV path
     if use_sample:
-        sample_data = {
-            "title": [
-                "Deep Learning for Image Classification",
-                "Neural Networks in Healthcare",
-                "Transformer Models for NLP",
-                "BERT in Question Answering",
-                "Blockchain and Distributed Ledger Technology",
-                "Smart Contracts in Finance",
-                "Federated Learning for Privacy",
-                "Differential Privacy in ML",
-                "Graph Neural Networks",
-                "Knowledge Graph Embeddings",
-            ],
-            "abstract": [
-                "We propose a deep learning model achieving state-of-the-art accuracy on image benchmarks.",
-                "A convolutional network trained for medical image classification tasks.",
-                "We introduce a transformer-based approach for text understanding.",
-                "Fine-tuning BERT achieves strong results on reading comprehension datasets.",
-                "This paper surveys blockchain consensus mechanisms and distributed ledger architectures.",
-                "We implement smart contracts for automated financial transactions on a public blockchain.",
-                "Federated learning enables collaborative model training without sharing raw data.",
-                "Differential privacy provides formal privacy guarantees for machine learning models.",
-                "Graph neural networks learn from relational data structures effectively.",
-                "Knowledge graph embeddings enable link prediction and entity classification.",
-            ],
-        }
-        df_sample = pd.DataFrame(sample_data)
         tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
-        df_sample.to_csv(tmp.name, index=False)
         csv_path = tmp.name
     else:
         tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
@@ -438,177 +405,255 @@ if run_btn:
         tmp.flush()
         csv_path = tmp.name
-    # Step 1 — BERTopic
-    with st.spinner("🔬 Running BERTopic clustering…"):
-        try:
-            topic_results = run_topic_modeling(csv_path, min_topic_size=min_topic_size)
-        except Exception as exc:
-            st.error(f"**Topic modeling failed:** {exc}")
-            st.stop()
-    abstract_res = topic_results["abstracts"]
-    title_res = topic_results["titles"]
-    df = pd.read_csv(csv_path)
-    df.columns = df.columns.str.lower()
-    raw_titles = df["title"].fillna("").tolist()
-    raw_abstracts = df["abstract"].fillna("").tolist()
-    # Step 2 — Agent
-    with st.spinner("🤖 LLM interpretation + Mistral validation…"):
-        try:
-            st.session_state["agent_results"] = run_agent(
-                title_topic_keywords=title_res["topic_keywords"],
-                abstract_topic_keywords=abstract_res["topic_keywords"],
-                title_topic_assignments=title_res["topics"],
-                abstract_topic_assignments=abstract_res["topics"],
-                raw_titles=raw_titles,
-                raw_abstracts=raw_abstracts,
-                api_key=groq_api_key,
-                mistral_api_key=mistral_api_key,
-            )
-            st.success("Pipeline complete.")
-        except Exception as exc:
-            st.error(f"**Agent pipeline failed:** {exc}")
-            st.stop()
-# ---------------------------------------------------------------------------
-# Results
-# ---------------------------------------------------------------------------
-agent_results = st.session_state.get("agent_results")
-if agent_results:
-    title_interps = agent_results.get("title_interpretations", {})
-    abstract_interps = agent_results.get("abstract_interpretations", {})
-    comparison_rows = agent_results.get("comparison_rows", [])
-    taxonomy_map = agent_results.get("taxonomy_map", {})
-    # ── Stats Row ──────────────────────────────────────────────────────────
-    c1, c2, c3, c4 = st.columns(4)
-    with c1:
-        st.markdown(f"""
-        <div class="stat-card">
-            <div class="stat-number">{len(title_interps)}</div>
-            <div class="stat-label">Title Topics</div>
-        </div>
-        """, unsafe_allow_html=True)
-    with c2:
-        st.markdown(f"""
-        <div class="stat-card">
-            <div class="stat-number">{len(abstract_interps)}</div>
-            <div class="stat-label">Abstract Topics</div>
-        </div>
-        """, unsafe_allow_html=True)
-    with c3:
-        agreed = sum(
-            1 for i in list(title_interps.values()) + list(abstract_interps.values())
-            if i.validation_status == "AGREED"
-        )
-        st.markdown(f"""
-        <div class="stat-card">
-            <div class="stat-number">{agreed}</div>
-            <div class="stat-label">LLM Agreements</div>
-        </div>
-        """, unsafe_allow_html=True)
-    with c4:
-        novel = sum(
-            1 for i in list(title_interps.values()) + list(abstract_interps.values())
-            if i.classification == "NOVEL"
-        )
-        st.markdown(f"""
-        <div class="stat-card">
-            <div class="stat-number">{novel}</div>
-            <div class="stat-label">Novel Topics</div>
-        </div>
-        """, unsafe_allow_html=True)
-    st.markdown("---")
-    # ── Main Tabs ──────────────────────────────────────────────────────────
-    tab1, tab2, tab3, tab4 = st.tabs([
-        "Title Topics",
-        "Abstract Topics",
-        "Taxonomy Map",
-        "Comparison",
-    ])
-    def _interp_rows(interps):
-        return [
-            {
-                "ID": tid,
-                "Label": i.label,
-                "Category": i.taxonomy_category,
-                "Class": i.classification,
-                "Validation": i.validation_status,
-                "Confidence": i.confidence,
-                "Keywords": ", ".join(i.keywords[:8]),
-                "Reasoning": i.reasoning,
-            }
-            for tid, i in sorted(interps.items())
-        ]
-    with tab1:
-        st.markdown('<div class="section-label">Topics derived from paper titles</div>', unsafe_allow_html=True)
-        if title_interps:
-            st.dataframe(pd.DataFrame(_interp_rows(title_interps)), use_container_width=True, hide_index=True)
-        else:
-            st.info("No title topics found.")
-    with tab2:
-        st.markdown('<div class="section-label">Topics derived from paper abstracts</div>', unsafe_allow_html=True)
-        if abstract_interps:
-            st.dataframe(pd.DataFrame(_interp_rows(abstract_interps)), use_container_width=True, hide_index=True)
-        else:
-            st.info("No abstract topics found.")
-    with tab3:
-        st.markdown('<div class="section-label">Full taxonomy classification</div>', unsafe_allow_html=True)
-        inner_tabs = st.tabs(["Titles", "Abstracts"])
-        for itab, section in zip(inner_tabs, ["titles", "abstracts"]):
-            with itab:
-                entries = taxonomy_map.get(section, [])
-                if entries:
-                    st.dataframe(
-                        pd.DataFrame(entries)[[
-                            "topic_id", "label", "taxonomy_category",
-                            "classification", "validation_status", "confidence", "reasoning"
-                        ]],
-                        use_container_width=True,
-                        hide_index=True,
-                    )
-                else:
-                    st.info(f"No {section} entries.")
-    with tab4:
-        st.markdown('<div class="section-label">Side-by-side title vs abstract topic comparison</div>', unsafe_allow_html=True)
-        if comparison_rows:
-            from dataclasses import asdict
-            st.dataframe(pd.DataFrame([asdict(r) for r in comparison_rows]), use_container_width=True, hide_index=True)
-        else:
-            st.info("No overlapping topic IDs between title and abstract sources.")
-    st.markdown("---")
-    # ── Downloads ──────────────────────────────────────────────────────────
-    st.markdown('<div class="section-label">Export Results</div>', unsafe_allow_html=True)
-    dl1, dl2 = st.columns(2)
-    with dl1:
-        st.download_button(
-            "⬇ taxonomy_map.json",
-            json.dumps(agent_results["taxonomy_map"], indent=2),
-            file_name="taxonomy_map.json",
-            mime="application/json",
-            key="dl_json",
-            use_container_width=True,
-        )
-    with dl2:
-        from dataclasses import asdict
-        comp_df = pd.DataFrame([asdict(r) for r in agent_results["comparison_rows"]])
         st.download_button(
-            "⬇ comparison.csv",
-            comp_df.to_csv(index=False),
-            file_name="comparison.csv",
             mime="text/csv",
-            key="dl_csv",
             use_container_width=True,
-        )

 """
 app.py
 ------
+Streamlit UI — SPECTER2 + BERTopic + 3-LLM Council
+Research Topic Analyzer for SPJIMR × SPIT Group 14
 """
 import os
 import json
 import tempfile
 import pandas as pd
 import streamlit as st
 from tools import run_topic_modeling
 from agent import run_agent
+# ── Page setup ──────────────────────────────────────────────────────────────
 st.set_page_config(
+    page_title="TMIS Topic Analyzer",
+    page_icon="📐",
     layout="wide",
     initial_sidebar_state="expanded",
 )
+# ── Custom CSS ───────────────────────────────────────────────────────────────
 st.markdown("""
 <style>
+@import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;600&family=IBM+Plex+Sans:wght@300;400;500;600&display=swap');
 html, body, [class*="css"] {
+    font-family: 'IBM Plex Sans', sans-serif;
 }
+/* App background */
 .stApp {
+    background: #0d0f14;
+    color: #e8eaf0;
 }
+/* Sidebar */
 [data-testid="stSidebar"] {
+    background: #13161e;
+    border-right: 1px solid #1f2333;
 }
 [data-testid="stSidebar"] * {
+    color: #b0b8cc !important;
 }
+[data-testid="stSidebar"] h1,
+[data-testid="stSidebar"] h2,
+[data-testid="stSidebar"] h3 {
+    color: #e8eaf0 !important;
+    font-family: 'IBM Plex Mono', monospace !important;
+    font-size: 0.8rem !important;
+    letter-spacing: 0.12em !important;
+    text-transform: uppercase !important;
 }
+/* Header */
+.site-header {
     padding: 2.5rem 0 1.5rem 0;
+    border-bottom: 1px solid #1f2333;
     margin-bottom: 2rem;
 }
+.site-header h1 {
+    font-family: 'IBM Plex Mono', monospace;
+    font-size: 1.6rem;
+    font-weight: 600;
+    color: #e8eaf0;
+    letter-spacing: -0.01em;
+    margin: 0 0 0.3rem 0;
+}
+.site-header p {
+    font-size: 0.82rem;
+    color: #5a6480;
+    font-family: 'IBM Plex Mono', monospace;
     margin: 0;
+    letter-spacing: 0.04em;
 }
+/* Pills / badges */
+.pill {
+    display: inline-block;
+    font-family: 'IBM Plex Mono', monospace;
+    font-size: 0.68rem;
+    font-weight: 600;
+    letter-spacing: 0.08em;
+    text-transform: uppercase;
+    padding: 3px 10px;
+    border-radius: 2px;
+    margin-right: 6px;
+}
+.pill-blue  { background: #0f2a4a; color: #4d9de0; border: 1px solid #1a4070; }
+.pill-green { background: #0a2a1a; color: #3dba7a; border: 1px solid #1a4a2a; }
+.pill-amber { background: #2a1f00; color: #e8a020; border: 1px solid #4a3500; }
+.pill-red   { background: #2a0f0f; color: #e04d4d; border: 1px solid #4a1a1a; }
+.pill-gray  { background: #1a1e2a; color: #7a8090; border: 1px solid #2a2e3a; }
+/* Stats row */
+.stat-grid {
+    display: grid;
+    grid-template-columns: repeat(4, 1fr);
+    gap: 1px;
+    background: #1f2333;
+    border: 1px solid #1f2333;
+    border-radius: 6px;
+    overflow: hidden;
+    margin-bottom: 2rem;
 }
+.stat-card {
+    background: #13161e;
+    padding: 1.25rem 1.5rem;
+    text-align: center;
+}
+.stat-val {
+    font-family: 'IBM Plex Mono', monospace;
+    font-size: 1.9rem;
+    font-weight: 600;
+    color: #e8eaf0;
+    line-height: 1;
+    margin-bottom: 0.3rem;
+}
+.stat-label {
+    font-size: 0.7rem;
+    color: #5a6480;
     text-transform: uppercase;
     letter-spacing: 0.1em;
+    font-family: 'IBM Plex Mono', monospace;
 }
+/* Section titles */
+.section-title {
+    font-family: 'IBM Plex Mono', monospace;
+    font-size: 0.7rem;
+    font-weight: 600;
+    letter-spacing: 0.15em;
+    text-transform: uppercase;
+    color: #5a6480;
+    padding-bottom: 0.6rem;
+    border-bottom: 1px solid #1f2333;
+    margin-bottom: 1.2rem;
 }
+/* Topic cards */
+.topic-card {
+    background: #13161e;
+    border: 1px solid #1f2333;
+    border-left: 3px solid #4d9de0;
     border-radius: 4px;
+    padding: 1rem 1.25rem;
+    margin-bottom: 0.6rem;
+    transition: border-color 0.15s;
+}
+.topic-card:hover { border-left-color: #3dba7a; }
+.topic-card.novel { border-left-color: #e8a020; }
+.topic-label {
+    font-size: 0.92rem;
+    font-weight: 500;
+    color: #e8eaf0;
+    margin-bottom: 0.35rem;
 }
+.topic-meta {
+    font-family: 'IBM Plex Mono', monospace;
+    font-size: 0.7rem;
+    color: #5a6480;
 }
+.topic-kw {
+    font-family: 'IBM Plex Mono', monospace;
+    font-size: 0.68rem;
+    color: #3d6480;
+    margin-top: 0.4rem;
+    line-height: 1.5;
 }
+/* Validation panel */
+.val-box {
+    background: #0a2a1a;
+    border: 1px solid #1a4a2a;
+    border-radius: 6px;
+    padding: 1.25rem 1.5rem;
+    margin-bottom: 1.5rem;
 }
+.val-box h4 {
+    font-family: 'IBM Plex Mono', monospace;
+    font-size: 0.72rem;
+    font-weight: 600;
     letter-spacing: 0.1em;
+    text-transform: uppercase;
+    color: #3dba7a;
+    margin: 0 0 0.75rem 0;
 }
+.val-row {
     display: flex;
+    justify-content: space-between;
     align-items: center;
+    padding: 0.4rem 0;
+    border-bottom: 1px solid #1a3a2a;
+    font-size: 0.8rem;
+    color: #a0b8a8;
 }
+.val-row:last-child { border-bottom: none; }
+.val-key { color: #5a7a6a; }
+.val-num { font-family: 'IBM Plex Mono', monospace; color: #3dba7a; font-weight: 600; }
+/* LLM council badge row */
+.council-row {
     display: flex;
+    gap: 8px;
+    margin-bottom: 1rem;
+    flex-wrap: wrap;
 }
+/* Run button */
 .stButton > button {
+    background: #4d9de0 !important;
     color: #0d0f14 !important;
     border: none !important;
+    border-radius: 3px !important;
+    font-family: 'IBM Plex Mono', monospace !important;
+    font-size: 0.78rem !important;
+    font-weight: 600 !important;
     letter-spacing: 0.08em !important;
     text-transform: uppercase !important;
+    padding: 0.6rem 2rem !important;
+    transition: background 0.15s !important;
 }
 .stButton > button:hover {
+    background: #3d8ed0 !important;
 }
+/* Input overrides */
+.stTextInput input, .stSelectbox select {
+    background: #13161e !important;
+    border: 1px solid #1f2333 !important;
+    color: #e8eaf0 !important;
+    font-family: 'IBM Plex Mono', monospace !important;
     font-size: 0.82rem !important;
+    border-radius: 3px !important;
 }
+/* Dataframe */
+.stDataFrame {
+    background: #13161e;
+    border: 1px solid #1f2333;
+    border-radius: 4px;
 }
+/* Download buttons */
 .stDownloadButton > button {
     background: transparent !important;
+    color: #4d9de0 !important;
+    border: 1px solid #1a4070 !important;
+    border-radius: 3px !important;
+    font-family: 'IBM Plex Mono', monospace !important;
+    font-size: 0.72rem !important;
     letter-spacing: 0.08em !important;
 }
+/* Expander */
+.streamlit-expanderHeader {
+    background: #13161e !important;
+    border: 1px solid #1f2333 !important;
+    font-family: 'IBM Plex Mono', monospace !important;
+    font-size: 0.78rem !important;
+    color: #a0a8c0 !important;
 }
+/* Progress / spinner */
+.stSpinner > div { border-top-color: #4d9de0 !important; }
+/* Divider */
+hr { border-color: #1f2333 !important; }
+/* Alerts */
+.stAlert { border-radius: 4px !important; }
 </style>
 """, unsafe_allow_html=True)
+# ── Header ───────────────────────────────────────────────────────────────────
+st.markdown("""
+<div class="site-header">
+  <h1>Research Topic Analyzer</h1>
+  <p>SPECTER2 embeddings &nbsp;·&nbsp; HDBSCAN/UMAP clustering &nbsp;·&nbsp; 3-LLM Council (Groq + Mistral + Gemini) &nbsp;·&nbsp; PAJAIS validation</p>
+</div>
+""", unsafe_allow_html=True)
+# ── Sidebar ──────────────────────────────────────────────────────────────────
+with st.sidebar:
+    st.markdown("### API Keys")
+    groq_key_input    = st.text_input("Groq API Key",    type="password", placeholder="GROQ_API_KEY env var")
+    mistral_key_input = st.text_input("Mistral API Key", type="password", placeholder="MISTRAL_API_KEY env var")
+    gemini_key_input  = st.text_input("Gemini API Key",  type="password", placeholder="GEMINI_API_KEY env var")
+    st.caption("Keys are never stored. Leave blank to use env vars.")
     st.markdown("---")
+    st.markdown("### Clustering Parameters")
+    min_topic_size = st.slider("Min papers per cluster", min_value=3, max_value=20, value=5,
+                               help="Prof. Kamat spec: min=5")
+    st.markdown(
+        "<span class='pill pill-blue'>Min clusters: 15</span>"
+        "<span class='pill pill-blue'>Max clusters: 30</span>",
+        unsafe_allow_html=True
+    )
+    st.markdown(
+        "<span class='pill pill-gray'>Cosine sim: 0.50–0.55</span>",
+        unsafe_allow_html=True
+    )
     st.markdown("---")
+    st.markdown("### LLM Council")
+    st.markdown("""
+<div class="council-row">
+  <span class="pill pill-blue">Groq / LLaMA-3.1</span>
+  <span class="pill pill-green">Mistral Small</span>
+  <span class="pill pill-amber">Gemini 2.5 Flash</span>
+</div>
+<p style="font-size:0.72rem;color:#5a6480;font-family:'IBM Plex Mono',monospace;">
+Majority vote → best label selected.<br>
+Keyword-overlap fallback if no consensus.
+</p>
+""", unsafe_allow_html=True)
     st.markdown("---")
+    if st.button("Reset Results", use_container_width=True):
+        for key in ["agent_results", "topic_stats"]:
+            st.session_state.pop(key, None)
         st.rerun()
+groq_api_key    = groq_key_input.strip()    or os.getenv("GROQ_API_KEY")
 mistral_api_key = mistral_key_input.strip() or os.getenv("MISTRAL_API_KEY")
+gemini_api_key  = gemini_key_input.strip()  or os.getenv("GEMINI_API_KEY")
+# ── Dataset upload ────────────────────────────────────────────────────────────
+st.markdown("<div class='section-title'>Dataset</div>", unsafe_allow_html=True)
+col_up, col_sample = st.columns([3, 1])
+with col_up:
     uploaded_file = st.file_uploader(
+        "Upload Scopus CSV — must contain 'title' and 'abstract' columns",
         type=["csv"],
+        help="Export your corpus from Scopus as CSV. The tool will combine Title + Abstract into one SPECTER2 vector per paper."
     )
+with col_sample:
     st.markdown("<br>", unsafe_allow_html=True)
+    use_sample = st.checkbox("Use sample dataset (50 papers)", value=False)
+if uploaded_file and not use_sample:
+    try:
+        df_preview = pd.read_csv(uploaded_file)
+        uploaded_file.seek(0)
+        col_a, col_b, col_c = st.columns(3)
+        col_a.metric("Papers detected", len(df_preview))
+        col_b.metric("Columns", len(df_preview.columns))
+        has_both = {"title", "abstract"}.issubset(set(df_preview.columns.str.lower()))
+        col_c.metric("Title + Abstract", "✓ present" if has_both else "✗ missing")
+        if not has_both:
+            st.error("CSV must have both 'title' and 'abstract' columns.")
+    except Exception as e:
+        st.error(f"Could not preview CSV: {e}")
+# ── Run Pipeline ─────────────────────────────────────────────────────────────
+st.markdown("<br>", unsafe_allow_html=True)
+run_btn = st.button("▶  Run Full Pipeline", type="primary")
 if run_btn:
+    # Validation
+    missing_keys = []
+    if not groq_api_key:    missing_keys.append("Groq")
+    if not mistral_api_key: missing_keys.append("Mistral")
+    if not gemini_api_key:  missing_keys.append("Gemini")
+    if missing_keys:
+        st.error(f"Missing API key(s): {', '.join(missing_keys)}. All three are required for the LLM council.")
         st.stop()
     if not use_sample and uploaded_file is None:
+        st.error("Please upload a CSV file or enable the sample dataset.")
         st.stop()
+    # Prepare CSV path
     if use_sample:
+        import numpy as np
+        rng = np.random.default_rng(42)
+        topics_pool = [
+            ("Deep Learning for Healthcare Prediction", "We apply LSTM networks to predict patient readmission from EHR data."),
+            ("Process Mining in Enterprise Systems", "Event log analysis using Petri nets for conformance checking in ERP workflows."),
+            ("Recommender Systems Collaborative Filtering", "Matrix factorization techniques applied to e-commerce product recommendation."),
+            ("LLM Applications in Information Systems", "GPT-4 used for automated requirements extraction from stakeholder documents."),
+            ("Blockchain Smart Contract Security", "Formal verification of Solidity smart contracts for financial transaction safety."),
+            ("Federated Learning Privacy Preservation", "Differential privacy mechanisms for distributed model training across hospitals."),
+            ("Cybersecurity Intrusion Detection", "Random forest classifiers for network anomaly detection in enterprise environments."),
+            ("Natural Language Processing Sentiment", "BERT fine-tuning for aspect-level sentiment analysis in product reviews."),
+            ("Knowledge Graph Embedding", "TransE and RotatE models for biomedical entity relation prediction."),
+            ("Computer Vision Medical Imaging", "CNN architectures for diabetic retinopathy grading from fundus photographs."),
+        ]
+        rows = []
+        for i in range(50):
+            t, a = topics_pool[i % len(topics_pool)]
+            rows.append({"title": t, "abstract": a + f" Study {i+1}.", "doi": f"10.1145/sample.{i+1}"})
+        df_s = pd.DataFrame(rows)
         tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
+        df_s.to_csv(tmp.name, index=False)
         csv_path = tmp.name
     else:
         tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
         tmp.flush()
         csv_path = tmp.name
+    # Step 1: Topic modeling
+    progress_bar = st.progress(0, text="Step 1/2 — SPECTER2 embeddings + HDBSCAN clustering (15–30 clusters)…")
+    try:
+        topic_results = run_topic_modeling(csv_path, min_topic_size=min_topic_size)
+        n_clusters = len(topic_results["documents"]["topic_keywords"])
+        progress_bar.progress(50, text=f"Step 1/2 — Done. {n_clusters} clusters found.")
+    except Exception as exc:
+        st.error(f"Topic modeling failed: {exc}")
+        st.stop()
+    # Step 2: LLM Council
+    progress_bar.progress(55, text="Step 2/2 — 3-LLM Council labelling (Groq + Mistral + Gemini)…")
+    try:
+        agent_results = run_agent(
+            topic_results=topic_results,
+            groq_key=groq_api_key,
+            mistral_key=mistral_api_key,
+            gemini_key=gemini_api_key,
+        )
+        progress_bar.progress(100, text="Pipeline complete.")
+        st.session_state["agent_results"] = agent_results
+        # Compute summary stats
+        interps = agent_results.get("interpretations", {})
+        novel_count   = sum(1 for i in interps.values() if i.classification == "NOVEL")
+        mapped_count  = sum(1 for i in interps.values() if i.classification == "MAPPED")
+        total_papers  = sum(i.paper_count for i in interps.values())
+        st.session_state["topic_stats"] = {
+            "n_topics": len(interps),
+            "novel": novel_count,
+            "mapped": mapped_count,
+            "total_papers": total_papers,
+        }
+        st.success(f"Pipeline complete — {len(interps)} topics labelled by 3-LLM council.")
+    except Exception as exc:
+        st.error(f"LLM council failed: {exc}")
+        st.stop()
+# ── Results Display ── sits outside the run-button branch, so it renders whenever results are cached in st.session_state ──
+results = st.session_state.get("agent_results")  # stored after a successful run_agent() call; None before the first run
+stats   = st.session_state.get("topic_stats")  # summary counters stored alongside the agent results
+if results and stats:
+    interps = results.get("interpretations", {})  # topic id → interpretation object; empty dict if the key is absent
+    # ── Summary stats ── four headline cards filled from the stats dict ──────
+    st.markdown("<div class='section-title'>Pipeline Summary</div>", unsafe_allow_html=True)
+    st.markdown(f"""
+<div class="stat-grid">
+  <div class="stat-card">
+    <div class="stat-val">{stats['n_topics']}</div>
+    <div class="stat-label">Topics Found</div>
+  </div>
+  <div class="stat-card">
+    <div class="stat-val">{stats['total_papers']}</div>
+    <div class="stat-label">Papers Assigned</div>
+  </div>
+  <div class="stat-card">
+    <div class="stat-val">{stats['novel']}</div>
+    <div class="stat-label">NOVEL (no PAJAIS home)</div>
+  </div>
+  <div class="stat-card">
+    <div class="stat-val">{stats['mapped']}</div>
+    <div class="stat-label">MAPPED to PAJAIS</div>
+  </div>
+</div>
+""", unsafe_allow_html=True)
+    # ── Validation panel ── fixed description of the pipeline settings plus the live topic count and NOVEL/MAPPED shares ──
+    st.markdown("<div class='section-title'>LLM Council Validation</div>", unsafe_allow_html=True)  # NOTE(review): panel text hard-codes HDBSCAN "min=5"; min_topic_size is configurable — confirm
+    novel_pct  = round(stats['novel']  / stats['n_topics'] * 100) if stats['n_topics'] else 0  # whole-number share; 0 when there are no topics (avoids ZeroDivisionError)
+    mapped_pct = round(stats['mapped'] / stats['n_topics'] * 100) if stats['n_topics'] else 0  # same zero-topic guard as novel_pct
+    st.markdown(f"""
+<div class="val-box">
+  <h4>Instructor Spec Compliance</h4>
+  <div class="val-row"><span class="val-key">Embedding model</span><span class="val-num">SPECTER2 (allenai/specter2_base)</span></div>
+  <div class="val-row"><span class="val-key">Input column</span><span class="val-num">Title + Abstract (combined)</span></div>
+  <div class="val-row"><span class="val-key">Clustering</span><span class="val-num">UMAP → HDBSCAN (min=5, max=100 per cluster)</span></div>
+  <div class="val-row"><span class="val-key">Cosine similarity range</span><span class="val-num">0.50 – 0.55 (merge / outlier reassign)</span></div>
+  <div class="val-row"><span class="val-key">Total clusters</span><span class="val-num">{stats['n_topics']} (target: 15–30)</span></div>
+  <div class="val-row"><span class="val-key">LLM council</span><span class="val-num">Groq (LLaMA-3.1) + Mistral Small + Gemini 2.5 Flash</span></div>
+  <div class="val-row"><span class="val-key">Label selection</span><span class="val-num">Majority vote → keyword-overlap fallback</span></div>
+  <div class="val-row"><span class="val-key">Rep. docs per topic</span><span class="val-num">Top-3 by cosine similarity to centroid</span></div>
+  <div class="val-row"><span class="val-key">NOVEL themes (no PAJAIS home)</span><span class="val-num">{novel_pct}% ({stats['novel']} topics)</span></div>
+  <div class="val-row"><span class="val-key">MAPPED to PAJAIS taxonomy</span><span class="val-num">{mapped_pct}% ({stats['mapped']} topics)</span></div>
+</div>
+""", unsafe_allow_html=True)
+    # ── Filters ───────────────────────────────────────────────────────────────
+    st.markdown("<div class='section-title'>Topic Results</div>", unsafe_allow_html=True)
+    rows = []
+    for tid, interp in sorted(interps.items()):
+        rows.append({
+            "Topic ID":       tid,
+            "Label":          interp.label,
+            "Classification": interp.classification,
+            "Category":       interp.category,
+            "Papers":         interp.paper_count,
+            "Keywords":       ", ".join(interp.keywords[:8]),
+        })
+    df_res = pd.DataFrame(rows).sort_values("Papers", ascending=False).reset_index(drop=True)
+    col_f1, col_f2, col_f3 = st.columns([2, 2, 1])
+    with col_f1:
+        cats = ["All"] + sorted(df_res["Category"].unique().tolist())
+        sel_cat = st.selectbox("Filter by category", cats)
+    with col_f2:
+        clsf = ["All", "NOVEL", "MAPPED"]
+        sel_cls = st.selectbox("Filter by classification", clsf)
+    with col_f3:
+        sort_by = st.selectbox("Sort by", ["Papers ↓", "Papers ↑", "Label A–Z"])
+    df_f = df_res.copy()
+    if sel_cat != "All":
+        df_f = df_f[df_f["Category"] == sel_cat]
+    if sel_cls != "All":
+        df_f = df_f[df_f["Classification"] == sel_cls]
+    if sort_by == "Papers ↓":
+        df_f = df_f.sort_values("Papers", ascending=False)
+    elif sort_by == "Papers ↑":
+        df_f = df_f.sort_values("Papers", ascending=True)
+    else:
+        df_f = df_f.sort_values("Label")
+    df_f = df_f.reset_index(drop=True)
+    st.caption(f"Showing {len(df_f)} of {len(df_res)} topics")
+    # ── Topic cards ───────────────────────────────────────────────────────────
+    view_mode = st.radio("View as", ["Table", "Cards"], horizontal=True)
+    if view_mode == "Table":
+        st.dataframe(df_f, use_container_width=True, height=420)
+    else:
+        for _, row in df_f.iterrows():
+            cls_pill = (
+                "<span class='pill pill-amber'>NOVEL</span>"
+                if row["Classification"] == "NOVEL"
+                else "<span class='pill pill-green'>MAPPED</span>"
+            )
+            card_cls = "topic-card novel" if row["Classification"] == "NOVEL" else "topic-card"
+            st.markdown(f"""
+<div class="{card_cls}">
+  <div class="topic-label">{row['Label']}</div>
+  <div class="topic-meta">
+    {cls_pill}
+    <span class="pill pill-gray">{row['Category']}</span>
+    <span class="pill pill-blue">{row['Papers']} papers</span>
+  </div>
+  <div class="topic-kw">{row['Keywords']}</div>
+</div>
+""", unsafe_allow_html=True)
+    # ── Bar chart ─────────────────────────────────────────────────────────────
+    st.markdown("<br>", unsafe_allow_html=True)
+    with st.expander("Topic frequency chart", expanded=False):
+        chart_df = df_f[["Label", "Papers"]].copy()
+        chart_df["Label"] = chart_df["Label"].apply(lambda x: x[:35] + "…" if len(x) > 35 else x)
+        chart_df = chart_df.set_index("Label")
+        st.bar_chart(chart_df, height=380)
+    # ── NOVEL / PAJAIS breakdown ───────────────────────────────────────────────
+    with st.expander("NOVEL vs PAJAIS breakdown — for paper §4.6", expanded=False):
+        col_n, col_m = st.columns(2)
+        with col_n:
+            st.markdown("**NOVEL topics (no PAJAIS home)**")
+            novel_df = df_f[df_f["Classification"] == "NOVEL"][["Label", "Papers", "Category"]].reset_index(drop=True)
+            st.dataframe(novel_df, use_container_width=True)
+        with col_m:
+            st.markdown("**MAPPED topics (PAJAIS match)**")
+            mapped_df = df_f[df_f["Classification"] == "MAPPED"][["Label", "Papers", "Category"]].reset_index(drop=True)
+            st.dataframe(mapped_df, use_container_width=True)
+    # ── Representative documents ──────────────────────────────────────────────
+    with st.expander("Representative papers per topic (top-3 by centroid proximity)", expanded=False):
+        rep_docs = results.get("rep_docs_raw", {})
+        # Pull from topic_results stored in session if available
+        for tid, interp in sorted(interps.items()):
+            st.markdown(f"**Topic {tid} — {interp.label}**")
+            docs = interp.keywords  # fallback; actual rep_docs wired below
+            st.caption("See topics.json for full representative document titles.")
+        st.info("Download topics.json below to see the 3 representative paper titles per cluster used for LLM labelling.")
+    # ── Downloads ─────────────────────────────────────────────────────────────
+    st.markdown("<div class='section-title'>Downloads</div>", unsafe_allow_html=True)
+    col_d1, col_d2, col_d3 = st.columns(3)
+    with col_d1:
+        try:
+            with open(results["json_path"], "r") as f:
+                st.download_button(
+                    "⬇  topics.json",
+                    f.read(),
+                    file_name="tmis_topics.json",
+                    mime="application/json",
+                    use_container_width=True,
+                )
+        except Exception:
+            st.warning("JSON file not found.")
+    with col_d2:
+        try:
+            df_dl = pd.read_csv(results["csv_path"])
+            st.download_button(
+                "⬇  topics.csv",
+                df_dl.to_csv(index=False),
+                file_name="tmis_topics.csv",
+                mime="text/csv",
+                use_container_width=True,
+            )
+        except Exception:
+            st.warning("CSV file not found.")
+    with col_d3:
         st.download_button(
+            "⬇  results table",
+            df_res.to_csv(index=False),
+            file_name="tmis_topic_results.csv",
             mime="text/csv",
             use_container_width=True,
+        )
+    # ── Method note for paper ── ready-to-paste methods text; interpolates min_topic_size, the topic count and the NOVEL/MAPPED percentages from the validation panel ──
+    st.markdown("<br>", unsafe_allow_html=True)  # vertical spacer
+    with st.expander("§3.4 methodology note — paste into paper", expanded=False):  # collapsed by default
+        st.code(f"""Pipeline A (Unsupervised Discovery): SPECTER2 (allenai/specter2_base) generates one
+768-dimensional document embedding per paper from a combined Title + Abstract column.
+UMAP (n_neighbors=15, n_components=5, metric=cosine) reduces dimensionality; HDBSCAN
+(min_cluster_size={min_topic_size}, metric=euclidean, cluster_selection=eom) clusters embeddings.
+Cosine similarity threshold 0.50–0.55 governs cluster merging and outlier reassignment.
+Total clusters constrained to 15–30 via iterative split/merge.
+Pipeline B (LLM Council Validation): For each cluster, the 3 papers nearest the centroid
+(by cosine similarity) are passed as representative titles to 3 independent LLMs:
+Groq/LLaMA-3.1-8b, Mistral-Small-Latest, and Gemini-2.5-Flash. Each LLM returns a
+structured JSON with label, taxonomy_category, and classification (MAPPED/NOVEL).
+Majority vote selects the final label; keyword-overlap fallback applies when no consensus.
+This is the 3-LLM Council approach validating AI output without using the same model
+for self-validation (per Carlsen & Ralund, 2022 CALM principle).
+Results: {stats['n_topics']} clusters discovered. {novel_pct}% classified as NOVEL
+(no PAJAIS 2019 home). {mapped_pct}% MAPPED to existing PAJAIS categories.""", language="text")
+# ── Empty state ── placeholder shown while no agent results are stored in st.session_state ──
+elif not results:  # pairs with "if results and stats" above; nothing is rendered when results exist but stats do not
+    st.markdown("""
+<div style="text-align:center;padding:4rem 2rem;border:1px dashed #1f2333;border-radius:6px;margin-top:2rem;">
+  <p style="font-family:'IBM Plex Mono',monospace;font-size:0.8rem;color:#3a4060;letter-spacing:0.1em;">
+    UPLOAD CSV → ENTER API KEYS → RUN PIPELINE
+  </p>
+  <p style="font-size:0.75rem;color:#2a3050;margin-top:0.5rem;">
+    SPECTER2 embeddings · HDBSCAN · 3-LLM council · PAJAIS validation
+  </p>
+</div>
+""", unsafe_allow_html=True)