# Source: Hugging Face Space "Akilashamnaka12/NewsAI_web_app" (status: Sleeping; file size: 30,489 bytes)

import os
import re
import nltk
import pandas as pd
import streamlit as st
import matplotlib.pyplot as plt
from collections import Counter
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# ─── HF token (set as a Secret in Space settings for private/gated models) ────
# Optional Hugging Face access token; None when the variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")

# ─── Page Config ──────────────────────────────────────────────────────────────
# Browser-tab title/icon plus a wide layout with the sidebar collapsed.
_PAGE_SETTINGS = {
    "page_title": "NewsLens · Sri Lanka",
    "page_icon": "🔎",
    "layout": "wide",
    "initial_sidebar_state": "collapsed",
}
st.set_page_config(**_PAGE_SETTINGS)

# ─── NLTK – write to /tmp so HF Spaces (read-only FS) can cache data ──────────
# The directory is created if missing and placed first on NLTK's search path
# (only when it is not already listed there).
NLTK_DATA_DIR = "/tmp/nltk_data"
os.makedirs(NLTK_DATA_DIR, exist_ok=True)
if nltk.data.path.count(NLTK_DATA_DIR) == 0:
    nltk.data.path.insert(0, NLTK_DATA_DIR)

@st.cache_resource
def download_nltk():
    """Download the NLTK packages this app uses into ``NLTK_DATA_DIR``.

    Best-effort: an exception raised while fetching one package is ignored so
    the remaining packages are still attempted. Returns ``None``.
    """
    required_packages = ("stopwords", "punkt", "punkt_tab")
    for package_name in required_packages:
        try:
            nltk.download(package_name, download_dir=NLTK_DATA_DIR, quiet=True)
        except Exception:
            # Deliberately swallowed: preprocess_text() falls back to
            # regex-only cleaning when the NLTK data is unavailable.
            continue


download_nltk()

# ─── CSS ──────────────────────────────────────────────────────────────────────
# Global stylesheet for the dark theme: hero banner, tabs, cards, badges,
# the Q&A answer box, inputs, buttons and scrollbars. It is injected once as
# raw HTML, which is why unsafe_allow_html is required.
_APP_CSS = """
<style>
@import url('https://fonts.googleapis.com/css2?family=Syne:wght@400;600;700;800&family=DM+Sans:ital,wght@0,300;0,400;0,500;1,300&display=swap');

*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }

html, body, [data-testid="stAppViewContainer"] {
    background: #07090f !important;
    color: #e8eaf0 !important;
    font-family: 'DM Sans', sans-serif !important;
}
[data-testid="stAppViewContainer"] { padding: 0 !important; }
[data-testid="stHeader"] { background: transparent !important; }
section.main > div { padding-top: 0 !important; }
.block-container { padding: 0 2rem 4rem 2rem !important; max-width: 1280px !important; }

/* Hero */
.hero {
    background: linear-gradient(135deg, #0b1120 0%, #0d1f3c 55%, #062a3a 100%);
    border-bottom: 1px solid #1a2a44;
    padding: 3.5rem 3rem 2.8rem;
    position: relative; overflow: hidden;
}
.hero::before {
    content:''; position:absolute; inset:0;
    background: radial-gradient(ellipse 70% 60% at 80% 30%, rgba(0,200,180,.09) 0%, transparent 70%);
    pointer-events: none;
}
.hero-eyebrow { font-size:.75rem; font-weight:500; letter-spacing:.18em; color:#00c8b4; text-transform:uppercase; margin-bottom:.9rem; }
.hero-title { font-family:'Syne',sans-serif; font-size:clamp(2.2rem,5vw,3.6rem); font-weight:800; line-height:1.08; color:#fff; margin-bottom:1rem; }
.hero-title span { color:#00c8b4; }
.hero-sub { font-size:1.05rem; font-weight:300; line-height:1.65; color:#94a3b8; max-width:560px; }

/* Tabs */
[data-testid="stTabs"] > div:first-child { background:#0b111f; border-bottom:1px solid #1a2a44; padding:0 2rem; gap:0 !important; }
[data-testid="stTabs"] button { font-family:'Syne',sans-serif !important; font-size:.88rem !important; font-weight:600 !important; color:#64748b !important; padding:1rem 1.5rem !important; border-radius:0 !important; border-bottom:2px solid transparent !important; transition:color .2s,border-color .2s !important; }
[data-testid="stTabs"] button:hover { color:#cbd5e1 !important; }
[data-testid="stTabs"] button[aria-selected="true"] { color:#00c8b4 !important; border-bottom-color:#00c8b4 !important; background:transparent !important; }

/* Cards */
.card { background:#0f172a; border:1px solid #1e2d45; border-radius:14px; padding:1.8rem 1.8rem 1.6rem; margin-bottom:1.4rem; transition:border-color .2s,box-shadow .2s; }
.card:hover { border-color:#00c8b4; box-shadow:0 0 28px rgba(0,200,180,.08); }
.card-title { font-family:'Syne',sans-serif; font-size:1rem; font-weight:700; color:#e2e8f0; margin-bottom:.35rem; }
.card-sub { font-size:.82rem; color:#64748b; font-weight:300; margin-bottom:1.1rem; }

/* Labels / chips / badges */
.section-label { font-family:'Syne',sans-serif; font-size:.72rem; font-weight:700; letter-spacing:.14em; text-transform:uppercase; color:#00c8b4; margin-bottom:.6rem; }
.stat-row { display:flex; gap:1rem; flex-wrap:wrap; margin:1rem 0; }
.stat-chip { background:#1e2d45; border-radius:8px; padding:.55rem 1.1rem; font-family:'Syne',sans-serif; font-size:.85rem; font-weight:600; color:#e2e8f0; }
.stat-chip span { color:#00c8b4; font-size:1.15rem; display:block; }
.badge { display:inline-block; padding:.25rem .7rem; border-radius:999px; font-size:.72rem; font-weight:600; letter-spacing:.05em; text-transform:uppercase; }
.badge-teal   { background:rgba(0,200,180,.15);  color:#00c8b4; border:1px solid rgba(0,200,180,.3); }
.badge-blue   { background:rgba(59,130,246,.15);  color:#60a5fa; border:1px solid rgba(59,130,246,.3); }
.badge-amber  { background:rgba(245,158,11,.12);  color:#fbbf24; border:1px solid rgba(245,158,11,.3); }
.badge-rose   { background:rgba(244,63,94,.12);   color:#fb7185; border:1px solid rgba(244,63,94,.3); }
.badge-violet { background:rgba(139,92,246,.12);  color:#a78bfa; border:1px solid rgba(139,92,246,.3); }

/* Answer box */
.answer-box { background:linear-gradient(135deg,#0b2034,#091c2e); border:1px solid #00c8b4; border-radius:12px; padding:1.4rem 1.6rem; margin-top:1.2rem; }
.answer-label { font-family:'Syne',sans-serif; font-size:.68rem; font-weight:700; letter-spacing:.14em; text-transform:uppercase; color:#00c8b4; margin-bottom:.5rem; }
.answer-text { font-size:1.05rem; color:#e2e8f0; line-height:1.7; }
.score-bar-wrap { margin-top:.8rem; }
.score-bar-label { font-size:.75rem; color:#64748b; margin-bottom:.25rem; }
.score-bar-outer { background:#1e2d45; border-radius:999px; height:6px; }
.score-bar-inner { background:linear-gradient(90deg,#00c8b4,#0ea5e9); border-radius:999px; height:6px; }

/* Inputs */
[data-testid="stFileUploader"] { background:#0f172a !important; border:1.5px dashed #1e3a5f !important; border-radius:12px !important; padding:1.5rem !important; }
[data-testid="stFileUploader"]:hover { border-color:#00c8b4 !important; }
textarea { background:#0f172a !important; border:1px solid #1e2d45 !important; border-radius:10px !important; color:#e2e8f0 !important; font-family:'DM Sans',sans-serif !important; font-size:.95rem !important; }
textarea:focus { border-color:#00c8b4 !important; box-shadow:0 0 0 2px rgba(0,200,180,.18) !important; }

/* Buttons */
.stButton > button { background:linear-gradient(135deg,#00c8b4,#0ea5e9) !important; color:#07090f !important; border:none !important; border-radius:8px !important; font-family:'Syne',sans-serif !important; font-weight:700 !important; font-size:.88rem !important; letter-spacing:.04em !important; padding:.6rem 1.6rem !important; cursor:pointer !important; transition:opacity .2s,box-shadow .2s !important; }
.stButton > button:hover { opacity:.88 !important; box-shadow:0 4px 20px rgba(0,200,180,.35) !important; }
[data-testid="stDownloadButton"] button { background:transparent !important; border:1.5px solid #00c8b4 !important; color:#00c8b4 !important; font-family:'Syne',sans-serif !important; font-weight:700 !important; font-size:.85rem !important; border-radius:8px !important; padding:.55rem 1.4rem !important; transition:background .2s !important; }
[data-testid="stDownloadButton"] button:hover { background:rgba(0,200,180,.12) !important; }

/* Misc */
hr { border-color:#1e2d45 !important; margin:1.8rem 0 !important; }
[data-testid="stSelectbox"] > div > div { background:#0f172a !important; border-color:#1e2d45 !important; color:#e2e8f0 !important; border-radius:8px !important; }
::-webkit-scrollbar { width:6px; }
::-webkit-scrollbar-track { background:#0b111f; }
::-webkit-scrollbar-thumb { background:#1e2d45; border-radius:3px; }
::-webkit-scrollbar-thumb:hover { background:#00c8b4; }
[data-testid="stTabsContent"] { padding:2rem 0 !important; }
</style>
"""
st.markdown(_APP_CSS, unsafe_allow_html=True)

# ─── Constants ────────────────────────────────────────────────────────────────
# The five class names shown in the UI and written to the output CSV.
CATEGORIES = [
    "Business",
    "Opinion",
    "Political_gossip",
    "Sports",
    "World_news",
]

# CSS badge class used when rendering each category.
CAT_BADGE = {
    "Business": "badge-teal",
    "Opinion": "badge-blue",
    "Political_gossip": "badge-amber",
    "Sports": "badge-rose",
    "World_news": "badge-violet",
}

# Chart colour used for each category.
CAT_COLOR = {
    "Business": "#00c8b4",
    "Opinion": "#60a5fa",
    "Political_gossip": "#fbbf24",
    "Sports": "#fb7185",
    "World_news": "#a78bfa",
}

# Map whatever the model returns → one of the 5 assignment class names.
# Keys are lower-case because normalise_label() lower-cases before lookup.
LABEL_MAP = {
    # Category names, with underscore or space separators
    "business": "Business",
    "opinion": "Opinion",
    "political_gossip": "Political_gossip",
    "political gossip": "Political_gossip",
    "sports": "Sports",
    "world_news": "World_news",
    "world news": "World_news",
    "world": "World_news",
    # Generic "label_N" ids, mapped in CATEGORIES order
    # NOTE(review): assumes the model's label ids follow that order — confirm.
    "label_0": "Business",
    "label_1": "Opinion",
    "label_2": "Political_gossip",
    "label_3": "Sports",
    "label_4": "World_news",
    # Longer descriptive names
    "business and finance": "Business",
    "opinions and editorials": "Opinion",
    "politics": "Political_gossip",
}

def normalise_label(raw: str) -> str:
    """Translate a raw model label into one of the ``CATEGORIES`` names.

    An exact category name is returned untouched. Any other value is looked
    up in ``LABEL_MAP`` after trimming whitespace and lower-casing; if there
    is no entry, the original ``raw`` string is returned unchanged.
    """
    if raw not in CATEGORIES:
        lookup_key = raw.strip().lower()
        return LABEL_MAP.get(lookup_key, raw)
    return raw

# ─── Text preprocessor ────────────────────────────────────────────────────────
# Patterns compiled once; preprocess_text() runs once per CSV row.
_URL_RE = re.compile(r"http\S+|www\.\S+")
_NON_ALPHA_RE = re.compile(r"[^a-z\s]")
_WHITESPACE_RE = re.compile(r"\s+")

# Holds the English stop-word set after the first successful load, so the
# corpus is not re-read for every row during the current script run.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the English stop-word set, loading it from NLTK only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def preprocess_text(text: str) -> str:
    """Clean a news excerpt for classification and word-frequency analysis.

    Steps: lower-case, replace URLs and every character other than ``a-z``
    or whitespace with a space, collapse whitespace runs, then (best-effort)
    tokenize and drop English stop words and tokens of two characters or less.

    Args:
        text: Raw article text. Any non-string value yields ``""``.

    Returns:
        The cleaned text. If stop-word removal or tokenization fails for any
        reason, the regex-cleaned text is returned without token filtering.
    """
    if not isinstance(text, str):
        return ""

    cleaned = _URL_RE.sub(" ", text.lower())
    cleaned = _NON_ALPHA_RE.sub(" ", cleaned)
    cleaned = _WHITESPACE_RE.sub(" ", cleaned).strip()

    try:
        stop_words = _english_stopwords()
        tokens = word_tokenize(cleaned)
        cleaned = " ".join(t for t in tokens if t not in stop_words and len(t) > 2)
    except Exception:
        # Deliberate best-effort: keep the regex-cleaned text when the NLTK
        # resources are unavailable rather than failing the whole request.
        pass
    return cleaned

# ─── Model loaders ────────────────────────────────────────────────────────────
@st.cache_resource(show_spinner=False)
def _build_classifier():
    """Create the text-classification pipeline (cached on success).

    Raises instead of returning an error value, so that a failed load is not
    stored by ``st.cache_resource`` and the next call tries again.
    """
    from transformers import pipeline as hf_pipeline

    MODEL_ID = "Akilashamnaka12/news_classifier_model"   # ← swap after Task 4

    kwargs = {"task": "text-classification", "model": MODEL_ID,
              "truncation": True, "max_length": 512}
    if HF_TOKEN:
        # Only pass a token when one is configured.
        kwargs["token"] = HF_TOKEN
    return hf_pipeline(**kwargs)


def load_classifier():
    """
    Load the news classifier and report failures as a value.

    Replace MODEL_ID (in ``_build_classifier``) with your fine-tuned model
    pushed to HF Hub in Task 4.
    e.g.  "Akilashamnaka12/news_classifier_model"
    If your Space or model is private, add HF_TOKEN as a Secret in Space settings.

    Returns:
        ``(pipeline, None)`` on success, or ``(None, error_message)`` when the
        import or model load raised. Only successful loads are cached.
    """
    try:
        return _build_classifier(), None
    except Exception as e:
        return None, str(e)


@st.cache_resource(show_spinner=False)
def _build_qa_components():
    """Load the extractive Q&A tokenizer and model (cached on success).

    Raises instead of returning an error value, so that a failed load is not
    stored by ``st.cache_resource`` and the next call tries again.
    """
    QA_MODEL = "deepset/roberta-base-squad2"

    from transformers import AutoTokenizer, AutoModelForQuestionAnswering
    # Not used directly here; imported so a missing torch install surfaces as
    # a load error instead of failing later during inference.
    import torch  # noqa: F401

    tok = AutoTokenizer.from_pretrained(QA_MODEL)
    model = AutoModelForQuestionAnswering.from_pretrained(QA_MODEL)
    return tok, model


def load_qa():
    """Load the Q&A tokenizer/model pair and report failures as a value.

    Returns:
        ``((tokenizer, model), None)`` on success, or ``(None, error_message)``
        when the import or model load raised. Only successful loads are cached.
    """
    try:
        return _build_qa_components(), None
    except Exception as e:
        return None, str(e)

# ══════════════════════════════════════════════════════════════════════════════
# HERO
# ══════════════════════════════════════════════════════════════════════════════
# Static hero banner rendered above the tabs.
_HERO_HTML = """
<div class="hero">
    <div class="hero-eyebrow"><h5>🔎 &nbsp;Text Analytics · DA3111 - Group 6</h5></div>
    <div class="hero-title">News Lens</div>
    <div class="hero-sub">
        Classify News articles, interrogate content with Q&amp;A,
        and surface editorial insights — all in one unified workspace.
    </div>
</div>
"""
st.markdown(_HERO_HTML, unsafe_allow_html=True)

# The three workspaces of the app, in display order.
_TAB_TITLES = [
    "  📂  Text Classification  ",
    "  💬  Q & A Pipeline  ",
    "  📊  Insights  ",
]
tab1, tab2, tab3 = st.tabs(_TAB_TITLES)

# ══════════════════════════════════════════════════════════════════════════════
# TAB 1 – TEXT CLASSIFICATION
# ══════════════════════════════════════════════════════════════════════════════
with tab1:
    # Left column: upload + run controls. Right column: results of the last run.
    left, right = st.columns([1.1, 1], gap="large")

    with left:
        st.markdown('<div class="section-label">Upload</div>', unsafe_allow_html=True)
        st.markdown("""
        <div class="card">
            <div class="card-title">Upload your CSV file</div>
            <div class="card-sub"><code style="background: #00c8b4; color:#000000">Must contain a content column with news excerpts.</code></div>
        """, unsafe_allow_html=True)
        # Widgets get a real label (hidden through label_visibility) instead of
        # an empty string, which Streamlit discourages for accessibility.
        uploaded = st.file_uploader("Upload CSV file", type=["csv"],
                                    label_visibility="collapsed")
        st.markdown("</div>", unsafe_allow_html=True)

        if uploaded:
            try:
                uploaded.seek(0)                        # reset buffer – important on HF Spaces
                df_raw = pd.read_csv(uploaded)
            except Exception as e:
                st.error(f"Could not parse CSV: {e}")
                st.stop()

            if "content" not in df_raw.columns:
                st.error("❌  The uploaded file must have a `content` column.")
            else:
                st.markdown(f"""
                <div class="stat-row">
                    <div class="stat-chip"><span>{len(df_raw)}</span>Records</div>
                    <div class="stat-chip"><span>{df_raw.shape[1]}</span>Columns</div>
                </div>""", unsafe_allow_html=True)

                st.markdown('<div class="section-label" style="margin-top:1rem">Preview</div>',
                            unsafe_allow_html=True)
                st.dataframe(df_raw.head(5), use_container_width=True, hide_index=True)

                run_btn = st.button("⚡  Run Classification", use_container_width=True)

                if run_btn:
                    with st.spinner("Loading classifier… (first run ~30 s on HF Spaces)"):
                        clf, err = load_classifier()
                    if err:
                        st.error(f"Model load error: {err}")
                    else:
                        df_out      = df_raw.copy()
                        pred_labels = []
                        prog        = st.progress(0, text="Classifying…")
                        # Coerce every cell to str: a numeric or otherwise
                        # non-string cell would make the txt[:512] fallback
                        # below raise outside the try block.
                        texts       = df_out["content"].fillna("").astype(str).tolist()
                        n_texts     = len(texts)

                        for i, txt in enumerate(texts, start=1):
                            # Fall back to the raw text when cleaning removes everything.
                            clean = preprocess_text(txt) or txt[:512]
                            try:
                                raw   = clf(clean[:512])[0]["label"]
                                label = normalise_label(raw)
                            except Exception:
                                # Best-effort: one failing row must not abort the batch.
                                label = "Unknown"
                            pred_labels.append(label)
                            prog.progress(i / n_texts,
                                          text=f"Classifying {i}/{n_texts}…")

                        prog.empty()
                        df_out["class"] = pred_labels
                        # Persist the results so all three tabs can read them.
                        st.session_state["df_classified"]   = df_out
                        st.session_state["classification_done"] = True
                        st.rerun()

    with right:
        st.markdown('<div class="section-label">Results</div>', unsafe_allow_html=True)

        if st.session_state.get("classification_done"):
            df_out = st.session_state["df_classified"]
            counts = df_out["class"].value_counts()

            # One chip per predicted class with its record count.
            chip_parts = ['<div class="stat-row">']
            for cat, cnt in counts.items():
                badge = CAT_BADGE.get(cat, "badge-teal")
                chip_parts.append(f'<div class="stat-chip"><span>{cnt}</span>'
                                  f'<span class="badge {badge}">{cat.replace("_"," ")}</span></div>')
            chip_parts.append("</div>")
            st.markdown("".join(chip_parts), unsafe_allow_html=True)

            cols = [c for c in ["content", "class"] if c in df_out.columns]
            st.markdown('<div class="card" style="margin-top:.8rem">', unsafe_allow_html=True)
            st.markdown('<div class="card-title">Classified Records</div>', unsafe_allow_html=True)
            st.dataframe(df_out[cols].head(20), use_container_width=True, hide_index=True,
                         column_config={"content": st.column_config.TextColumn("Content", width="large")})
            st.markdown("</div>", unsafe_allow_html=True)

            # The download contains every row and column, not just the preview.
            st.download_button(
                "⬇  Download output.csv",
                data=df_out.to_csv(index=False).encode("utf-8"),
                file_name="output.csv", mime="text/csv",
                use_container_width=True,
            )
        else:
            st.markdown("""
            <div class="card" style="text-align:center;padding:3.5rem 2rem;">
                <div style="font-size:3rem;margin-bottom:1rem">📂</div>
                <div style="font-family:'Syne',sans-serif;font-size:1rem;font-weight:700;color:#334155;">
                    Upload a CSV to see results</div>
                <div style="font-size:.82rem;color:#475569;margin-top:.4rem;">
                    Predictions appear here after classification runs.</div>
            </div>""", unsafe_allow_html=True)

# ══════════════════════════════════════════════════════════════════════════════
# TAB 2 – Q&A PIPELINE
# ══════════════════════════════════════════════════════════════════════════════
with tab2:
    # Left column: context passage. Right column: question and answer.
    l2, r2 = st.columns([1, 1], gap="large")

    with l2:
        st.markdown('<div class="section-label">Context</div>', unsafe_allow_html=True)
        st.markdown('<div class="card">', unsafe_allow_html=True)
        st.markdown('<div class="card-title">Paste a news excerpt</div>', unsafe_allow_html=True)
        st.markdown('<div class="card-sub">The Q&A model will read this as its context.</div>',
                    unsafe_allow_html=True)

        # Pre-fill the context with the first classified article, if any.
        default_ctx = ""
        if st.session_state.get("classification_done"):
            df_c = st.session_state["df_classified"]
            if len(df_c):
                default_ctx = str(df_c["content"].iloc[0])

        # Widgets get a real label (hidden through label_visibility) instead of
        # an empty string, which Streamlit discourages for accessibility.
        context_text = st.text_area("News excerpt (context)", value=default_ctx, height=260,
                                    placeholder="Paste any news article content here…",
                                    label_visibility="collapsed", key="qa_context")
        st.markdown("</div>", unsafe_allow_html=True)

    with r2:
        st.markdown('<div class="section-label">Question</div>', unsafe_allow_html=True)
        st.markdown('<div class="card">', unsafe_allow_html=True)
        st.markdown('<div class="card-title">Ask anything about the article</div>', unsafe_allow_html=True)
        st.markdown('<div class="card-sub">The model extracts an answer from the context on the left.</div>',
                    unsafe_allow_html=True)

        question_text = st.text_area("Question", height=120,
                                     placeholder="e.g. Who is mentioned in this article?",
                                     label_visibility="collapsed", key="qa_question")
        ask_btn = st.button("🔍  Get Answer", use_container_width=True)
        st.markdown("</div>", unsafe_allow_html=True)

        if ask_btn:
            if not context_text.strip():
                st.warning("Please paste a news excerpt in the Context panel on the left.")
            elif not question_text.strip():
                st.warning("Please type a question.")
            else:
                with st.spinner("Loading Q&A model (first run ~30 s)"):
                    qa, err = load_qa()
                if err:
                    st.error(f"Q&A model failed to load: {err}")
                else:
                    with st.spinner("Finding the answer..."):
                        try:
                            import html
                            import torch
                            tok, model = qa
                            q   = question_text.strip()
                            ctx = context_text.strip()[:3000]

                            inputs = tok(q, ctx, return_tensors="pt",
                                         truncation=True, max_length=512)
                            with torch.no_grad():
                                outputs = model(**inputs)

                            # Most likely start token and (exclusive) end token.
                            start = int(outputs.start_logits.argmax())
                            end   = int(outputs.end_logits.argmax()) + 1

                            # Dropping special tokens keeps markers such as
                            # "<s>" out of the answer; an inverted span gives
                            # an empty slice and therefore an empty string.
                            answer = tok.decode(inputs["input_ids"][0][start:end],
                                                skip_special_tokens=True).strip()
                            if not answer:
                                answer = "No answer found in the provided context."

                            # Confidence = mean of the top start and top end probabilities.
                            start_prob = outputs.start_logits.softmax(dim=-1).max().item()
                            end_prob   = outputs.end_logits.softmax(dim=-1).max().item()
                            score_pct  = int(((start_prob + end_prob) / 2) * 100)

                            # The answer comes from user-pasted text, so it is
                            # escaped before going into raw HTML.
                            st.markdown(f"""
                            <div class="answer-box">
                                <div class="answer-label">Answer</div>
                                <div class="answer-text">{html.escape(answer)}</div>
                                <div class="score-bar-wrap">
                                    <div class="score-bar-label">Confidence : {score_pct}%</div>
                                    <div class="score-bar-outer">
                                        <div class="score-bar-inner" style="width:{score_pct}%"></div>
                                    </div>
                                </div>
                            </div>""", unsafe_allow_html=True)
                        except Exception as e:
                            st.error(f"Inference error: {e}")

    # Static example questions, shown once a classification run exists.
    if st.session_state.get("classification_done"):
        st.markdown("---")
        st.markdown('<div class="section-label">Suggested Questions</div>', unsafe_allow_html=True)
        c1, c2, c3, c4 = st.columns(4)
        for col, q in zip([c1, c2, c3, c4],
                          ["Who is this article about?", "What event is described?",
                           "Where did this take place?", "What was the outcome?"]):
            col.markdown(f"""
            <div class="card" style="padding:1rem 1.2rem;text-align:center;">
                <div style="font-size:.85rem;color:#94a3b8;">{q}</div>
            </div>""", unsafe_allow_html=True)

# ══════════════════════════════════════════════════════════════════════════════
# TAB 3 – INSIGHTS
# ══════════════════════════════════════════════════════════════════════════════
with tab3:
    if not st.session_state.get("classification_done"):
        # Placeholder shown until a classification run has stored results.
        st.markdown("""
        <div class="card" style="text-align:center;padding:4rem 2rem;">
            <div style="font-size:3.5rem;margin-bottom:1rem">📊</div>
            <div style="font-family:'Syne',sans-serif;font-size:1.1rem;font-weight:700;color:#334155;">
                Insights unlock after classification</div>
            <div style="font-size:.88rem;color:#475569;margin-top:.5rem;">
                Go to <strong style="color:#00c8b4">Text Classification</strong>,
                upload a CSV, and run the model first.</div>
        </div>""", unsafe_allow_html=True)
    else:
        df_ins = st.session_state["df_classified"]
        counts = df_ins["class"].value_counts()
        total  = len(df_ins)

        # KPI row: one card per entry in CATEGORIES (count and share of total).
        kpi_cols = st.columns(5)
        for col, cat in zip(kpi_cols, CATEGORIES):
            cnt  = int(counts.get(cat, 0))
            pct  = round(cnt / total * 100, 1) if total else 0
            badge = CAT_BADGE.get(cat, "badge-teal")
            col.markdown(f"""
            <div class="card" style="text-align:center;padding:1.4rem 1rem;">
                <div class="badge {badge}" style="margin-bottom:.7rem">{cat.replace('_',' ')}</div>
                <div style="font-family:'Syne',sans-serif;font-size:1.9rem;font-weight:800;color:#e2e8f0">{cnt}</div>
                <div style="font-size:.78rem;color:#64748b;margin-top:.2rem">{pct}% of total</div>
            </div>""", unsafe_allow_html=True)

        st.markdown("---")
        ch1, ch2 = st.columns(2, gap="large")

        # Display names and colours shared by both charts, in value_counts order.
        category_names  = [name.replace("_", " ") for name in counts.index]
        category_colors = [CAT_COLOR.get(name, "#00c8b4") for name in counts.index]

        with ch1:
            st.markdown('<div class="section-label">Category Distribution</div>', unsafe_allow_html=True)
            fig, ax = plt.subplots(figsize=(5, 4.2), facecolor="#0f172a")
            # A wedge width below 1 turns the pie into a donut.
            wedges, _, autotexts = ax.pie(
                counts.values, labels=None, autopct="%1.1f%%", colors=category_colors,
                startangle=120, wedgeprops=dict(width=0.55, edgecolor="#07090f", linewidth=2),
                pctdistance=0.78)
            for at in autotexts:
                at.set_color("#e2e8f0")
                at.set_fontsize(8.5)
                at.set_fontweight("bold")
            ax.legend(wedges, category_names, loc="lower center", bbox_to_anchor=(0.5, -0.12),
                      ncol=3, frameon=False, labelcolor="#94a3b8", fontsize=8)
            ax.set_facecolor("#0f172a")
            fig.patch.set_facecolor("#0f172a")
            st.pyplot(fig, use_container_width=True)
            plt.close(fig)

        with ch2:
            st.markdown('<div class="section-label">Article Counts by Category</div>', unsafe_allow_html=True)
            fig2, ax2 = plt.subplots(figsize=(5, 4.2), facecolor="#0f172a")
            bars = ax2.barh(category_names, counts.values,
                            color=category_colors,
                            height=0.55, edgecolor="none")
            ax2.set_facecolor("#0f172a")
            for sp in ["top", "right"]:
                ax2.spines[sp].set_visible(False)
            for sp in ["left", "bottom"]:
                ax2.spines[sp].set_color("#1e2d45")
            ax2.tick_params(colors="#64748b", labelsize=8.5)
            # Print each bar's count just past its right edge.
            for bar in bars:
                ax2.text(bar.get_width() + 0.4, bar.get_y() + bar.get_height() / 2,
                         str(int(bar.get_width())), va="center", ha="left",
                         color="#e2e8f0", fontsize=8.5, fontweight="bold")
            fig2.patch.set_facecolor("#0f172a")
            st.pyplot(fig2, use_container_width=True)
            plt.close(fig2)

        st.markdown("---")
        st.markdown('<div class="section-label">Word Cloud by Category</div>', unsafe_allow_html=True)
        # The widget gets a real label (hidden through label_visibility) instead
        # of an empty string, which Streamlit discourages for accessibility.
        selected_cat = st.selectbox("Category", options=CATEGORIES,
                                    format_func=lambda c: c.replace("_", " "),
                                    label_visibility="collapsed")

        # Only the first 200 articles of the selected class feed the word
        # cloud and the unigram chart.
        cat_texts = df_ins[df_ins["class"] == selected_cat]["content"].fillna("").tolist()
        combined  = " ".join(preprocess_text(t) for t in cat_texts[:200])

        if combined.strip():
            wc = WordCloud(width=900, height=340, background_color="#0f172a",
                           colormap="cool", max_words=120, collocations=False).generate(combined)
            fig3, ax3 = plt.subplots(figsize=(9, 3.5), facecolor="#0f172a")
            ax3.imshow(wc, interpolation="bilinear")
            ax3.axis("off")
            fig3.patch.set_facecolor("#0f172a")
            st.pyplot(fig3, use_container_width=True)
            plt.close(fig3)
        else:
            st.info(f"No content found for: {selected_cat.replace('_',' ')}")

        st.markdown("---")
        st.markdown(f'<div class="section-label">Top Unigrams · {selected_cat.replace("_"," ")}</div>',
                    unsafe_allow_html=True)
        # 15 most frequent tokens of the cleaned text; skipped when it is empty.
        top_words = Counter(combined.split()).most_common(15)
        if top_words:
            words, freqs = zip(*top_words)
            fig4, ax4 = plt.subplots(figsize=(9, 3), facecolor="#0f172a")
            ax4.bar(words, freqs, color=CAT_COLOR.get(selected_cat, "#00c8b4"), edgecolor="none", width=0.6)
            ax4.set_facecolor("#0f172a")
            for sp in ["top", "right"]:
                ax4.spines[sp].set_visible(False)
            for sp in ["left", "bottom"]:
                ax4.spines[sp].set_color("#1e2d45")
            ax4.tick_params(axis="x", colors="#64748b", labelsize=8, rotation=30)
            ax4.tick_params(axis="y", colors="#64748b", labelsize=8)
            fig4.patch.set_facecolor("#0f172a")
            st.pyplot(fig4, use_container_width=True)
            plt.close(fig4)

# ─── Footer ───────────────────────────────────────────────────────────────────
# Static attribution footer rendered below all tabs.
_FOOTER_HTML = """
<div style="text-align:center;padding:2.5rem 0 1rem;color:#2a3a55;
            font-size:.78rem;border-top:1px solid #1a2a44;margin-top:3rem;">
    Built for <strong style="color:#00c8b4">IN23-S5-DA3111 · Text Analytics Group Project</strong>
    &nbsp;·&nbsp; Powered by Hugging Face &amp; Streamlit
</div>
"""
st.markdown(_FOOTER_HTML, unsafe_allow_html=True)