Spaces:
Sleeping
Sleeping
| import os | |
| import re | |
| import nltk | |
| import pandas as pd | |
| import streamlit as st | |
| import matplotlib.pyplot as plt | |
| from collections import Counter | |
| from wordcloud import WordCloud | |
| from nltk.corpus import stopwords | |
| from nltk.tokenize import word_tokenize | |
# ─── HF token (set as a Secret in Space settings for private/gated models) ────
# None when the environment variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")
# ─── Page config ──────────────────────────────────────────────────────────────
# Browser-tab title/icon and overall layout for the app.
st.set_page_config(
    page_title="NewsLens · Sri Lanka",
    page_icon="📰",
    layout="wide",
    initial_sidebar_state="collapsed",
)
# ─── NLTK: write to /tmp so HF Spaces (read-only FS) can cache data ───────────
NLTK_DATA_DIR = "/tmp/nltk_data"
os.makedirs(NLTK_DATA_DIR, exist_ok=True)

# Put our writable directory at the front of NLTK's search path (once).
_nltk_search_path = nltk.data.path
if NLTK_DATA_DIR not in _nltk_search_path:
    _nltk_search_path.insert(0, NLTK_DATA_DIR)
# NLTK package id -> resource path used to check whether it is already on disk.
_NLTK_RESOURCES = {
    "stopwords": "corpora/stopwords",
    "punkt": "tokenizers/punkt",
    "punkt_tab": "tokenizers/punkt_tab",
}


def download_nltk():
    """Make sure the NLTK resources used by the app are available locally.

    Streamlit re-executes the whole script on every interaction, so each
    resource is looked up on disk first and only downloaded when missing.
    Download problems never raise: the app keeps running and text
    preprocessing degrades gracefully.

    Returns:
        list[str]: package ids that could not be found or downloaded
        (empty when everything is available).
    """
    missing = []
    for pkg, resource_path in _NLTK_RESOURCES.items():
        try:
            nltk.data.find(resource_path)
            continue  # already present, no network access needed
        except LookupError:
            pass
        try:
            ok = nltk.download(pkg, download_dir=NLTK_DATA_DIR, quiet=True)
        except Exception:
            # Best effort: offline or a package unknown to this NLTK version.
            ok = False
        if not ok:
            missing.append(pkg)
    return missing


download_nltk()
# ─── CSS ──────────────────────────────────────────────────────────────────────
# Global stylesheet for the dark theme; injected once per script run below.
_APP_CSS = """
<style>
@import url('https://fonts.googleapis.com/css2?family=Syne:wght@400;600;700;800&family=DM+Sans:ital,wght@0,300;0,400;0,500;1,300&display=swap');
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
html, body, [data-testid="stAppViewContainer"] {
background: #07090f !important;
color: #e8eaf0 !important;
font-family: 'DM Sans', sans-serif !important;
}
[data-testid="stAppViewContainer"] { padding: 0 !important; }
[data-testid="stHeader"] { background: transparent !important; }
section.main > div { padding-top: 0 !important; }
.block-container { padding: 0 2rem 4rem 2rem !important; max-width: 1280px !important; }
/* Hero */
.hero {
background: linear-gradient(135deg, #0b1120 0%, #0d1f3c 55%, #062a3a 100%);
border-bottom: 1px solid #1a2a44;
padding: 3.5rem 3rem 2.8rem;
position: relative; overflow: hidden;
}
.hero::before {
content:''; position:absolute; inset:0;
background: radial-gradient(ellipse 70% 60% at 80% 30%, rgba(0,200,180,.09) 0%, transparent 70%);
pointer-events: none;
}
.hero-eyebrow { font-size:.75rem; font-weight:500; letter-spacing:.18em; color:#00c8b4; text-transform:uppercase; margin-bottom:.9rem; }
.hero-title { font-family:'Syne',sans-serif; font-size:clamp(2.2rem,5vw,3.6rem); font-weight:800; line-height:1.08; color:#fff; margin-bottom:1rem; }
.hero-title span { color:#00c8b4; }
.hero-sub { font-size:1.05rem; font-weight:300; line-height:1.65; color:#94a3b8; max-width:560px; }
/* Tabs */
[data-testid="stTabs"] > div:first-child { background:#0b111f; border-bottom:1px solid #1a2a44; padding:0 2rem; gap:0 !important; }
[data-testid="stTabs"] button { font-family:'Syne',sans-serif !important; font-size:.88rem !important; font-weight:600 !important; color:#64748b !important; padding:1rem 1.5rem !important; border-radius:0 !important; border-bottom:2px solid transparent !important; transition:color .2s,border-color .2s !important; }
[data-testid="stTabs"] button:hover { color:#cbd5e1 !important; }
[data-testid="stTabs"] button[aria-selected="true"] { color:#00c8b4 !important; border-bottom-color:#00c8b4 !important; background:transparent !important; }
/* Cards */
.card { background:#0f172a; border:1px solid #1e2d45; border-radius:14px; padding:1.8rem 1.8rem 1.6rem; margin-bottom:1.4rem; transition:border-color .2s,box-shadow .2s; }
.card:hover { border-color:#00c8b4; box-shadow:0 0 28px rgba(0,200,180,.08); }
.card-title { font-family:'Syne',sans-serif; font-size:1rem; font-weight:700; color:#e2e8f0; margin-bottom:.35rem; }
.card-sub { font-size:.82rem; color:#64748b; font-weight:300; margin-bottom:1.1rem; }
/* Labels / chips / badges */
.section-label { font-family:'Syne',sans-serif; font-size:.72rem; font-weight:700; letter-spacing:.14em; text-transform:uppercase; color:#00c8b4; margin-bottom:.6rem; }
.stat-row { display:flex; gap:1rem; flex-wrap:wrap; margin:1rem 0; }
.stat-chip { background:#1e2d45; border-radius:8px; padding:.55rem 1.1rem; font-family:'Syne',sans-serif; font-size:.85rem; font-weight:600; color:#e2e8f0; }
.stat-chip span { color:#00c8b4; font-size:1.15rem; display:block; }
.badge { display:inline-block; padding:.25rem .7rem; border-radius:999px; font-size:.72rem; font-weight:600; letter-spacing:.05em; text-transform:uppercase; }
.badge-teal { background:rgba(0,200,180,.15); color:#00c8b4; border:1px solid rgba(0,200,180,.3); }
.badge-blue { background:rgba(59,130,246,.15); color:#60a5fa; border:1px solid rgba(59,130,246,.3); }
.badge-amber { background:rgba(245,158,11,.12); color:#fbbf24; border:1px solid rgba(245,158,11,.3); }
.badge-rose { background:rgba(244,63,94,.12); color:#fb7185; border:1px solid rgba(244,63,94,.3); }
.badge-violet { background:rgba(139,92,246,.12); color:#a78bfa; border:1px solid rgba(139,92,246,.3); }
/* Answer box */
.answer-box { background:linear-gradient(135deg,#0b2034,#091c2e); border:1px solid #00c8b4; border-radius:12px; padding:1.4rem 1.6rem; margin-top:1.2rem; }
.answer-label { font-family:'Syne',sans-serif; font-size:.68rem; font-weight:700; letter-spacing:.14em; text-transform:uppercase; color:#00c8b4; margin-bottom:.5rem; }
.answer-text { font-size:1.05rem; color:#e2e8f0; line-height:1.7; }
.score-bar-wrap { margin-top:.8rem; }
.score-bar-label { font-size:.75rem; color:#64748b; margin-bottom:.25rem; }
.score-bar-outer { background:#1e2d45; border-radius:999px; height:6px; }
.score-bar-inner { background:linear-gradient(90deg,#00c8b4,#0ea5e9); border-radius:999px; height:6px; }
/* Inputs */
[data-testid="stFileUploader"] { background:#0f172a !important; border:1.5px dashed #1e3a5f !important; border-radius:12px !important; padding:1.5rem !important; }
[data-testid="stFileUploader"]:hover { border-color:#00c8b4 !important; }
textarea { background:#0f172a !important; border:1px solid #1e2d45 !important; border-radius:10px !important; color:#e2e8f0 !important; font-family:'DM Sans',sans-serif !important; font-size:.95rem !important; }
textarea:focus { border-color:#00c8b4 !important; box-shadow:0 0 0 2px rgba(0,200,180,.18) !important; }
/* Buttons */
.stButton > button { background:linear-gradient(135deg,#00c8b4,#0ea5e9) !important; color:#07090f !important; border:none !important; border-radius:8px !important; font-family:'Syne',sans-serif !important; font-weight:700 !important; font-size:.88rem !important; letter-spacing:.04em !important; padding:.6rem 1.6rem !important; cursor:pointer !important; transition:opacity .2s,box-shadow .2s !important; }
.stButton > button:hover { opacity:.88 !important; box-shadow:0 4px 20px rgba(0,200,180,.35) !important; }
[data-testid="stDownloadButton"] button { background:transparent !important; border:1.5px solid #00c8b4 !important; color:#00c8b4 !important; font-family:'Syne',sans-serif !important; font-weight:700 !important; font-size:.85rem !important; border-radius:8px !important; padding:.55rem 1.4rem !important; transition:background .2s !important; }
[data-testid="stDownloadButton"] button:hover { background:rgba(0,200,180,.12) !important; }
/* Misc */
hr { border-color:#1e2d45 !important; margin:1.8rem 0 !important; }
[data-testid="stSelectbox"] > div > div { background:#0f172a !important; border-color:#1e2d45 !important; color:#e2e8f0 !important; border-radius:8px !important; }
::-webkit-scrollbar { width:6px; }
::-webkit-scrollbar-track { background:#0b111f; }
::-webkit-scrollbar-thumb { background:#1e2d45; border-radius:3px; }
::-webkit-scrollbar-thumb:hover { background:#00c8b4; }
[data-testid="stTabsContent"] { padding:2rem 0 !important; }
</style>
"""

st.markdown(_APP_CSS, unsafe_allow_html=True)
# ─── Constants ────────────────────────────────────────────────────────────────
# The five target classes, in the order used for KPI cards and the selector.
CATEGORIES = ["Business", "Opinion", "Political_gossip", "Sports", "World_news"]

# CSS badge class used for each category.
CAT_BADGE = {
    "Business": "badge-teal",
    "Opinion": "badge-blue",
    "Political_gossip": "badge-amber",
    "Sports": "badge-rose",
    "World_news": "badge-violet",
}

# Chart colour used for each category.
CAT_COLOR = {
    "Business": "#00c8b4",
    "Opinion": "#60a5fa",
    "Political_gossip": "#fbbf24",
    "Sports": "#fb7185",
    "World_news": "#a78bfa",
}

# Map whatever the model returns → one of the 5 assignment class names
LABEL_MAP = {
    "business": "Business",
    "opinion": "Opinion",
    "political_gossip": "Political_gossip",
    "political gossip": "Political_gossip",
    "sports": "Sports",
    "world_news": "World_news",
    "world news": "World_news",
    "world": "World_news",
    "label_0": "Business",
    "label_1": "Opinion",
    "label_2": "Political_gossip",
    "label_3": "Sports",
    "label_4": "World_news",
    "business and finance": "Business",
    "opinions and editorials": "Opinion",
    "politics": "Political_gossip",
}

_LABEL_SEPARATORS = re.compile(r"[\s_\-]+")


def _label_key(label: str) -> str:
    """Fold case and collapse runs of spaces/underscores/hyphens to one ``_``."""
    return _LABEL_SEPARATORS.sub("_", label.strip().lower())


# LABEL_MAP re-keyed by canonical form so separator variants share one entry.
_LABEL_LOOKUP = {_label_key(alias): target for alias, target in LABEL_MAP.items()}


def normalise_label(raw: str) -> str:
    """Translate a raw model label into one of ``CATEGORIES``.

    Matching ignores case, surrounding whitespace and the separator used
    between words, so "World-News", "world  news" and "WORLD_NEWS" all
    resolve to "World_news".

    Args:
        raw: Label string as returned by the classifier.

    Returns:
        The matching category name, or ``raw`` unchanged when no alias matches.
    """
    if raw in CATEGORIES:
        return raw
    return _LABEL_LOOKUP.get(_label_key(raw), raw)
# ─── Text preprocessor ────────────────────────────────────────────────────────
# Compiled once: preprocess_text runs for every row of the uploaded CSV.
_URL_RE = re.compile(r"http\S+|www\.\S+")
_NON_ALPHA_RE = re.compile(r"[^a-z\s]")
_WHITESPACE_RE = re.compile(r"\s+")

_STOPWORDS_CACHE = None


def _english_stopwords():
    """Return the English stop-word set, loading it from NLTK at most once."""
    global _STOPWORDS_CACHE
    if _STOPWORDS_CACHE is None:
        try:
            _STOPWORDS_CACHE = frozenset(stopwords.words("english"))
        except Exception:
            # Deliberate best effort: the corpus may be missing on a fresh
            # Space. The failure is not cached, so a later call can retry.
            return frozenset()
    return _STOPWORDS_CACHE


def preprocess_text(text: str) -> str:
    """Normalise a news excerpt for classification and word statistics.

    Lower-cases the text, removes URLs, replaces every character that is not
    a-z or whitespace with a space, collapses whitespace, then drops English
    stop words and tokens of two characters or fewer.

    Args:
        text: Raw article text. Any non-string value yields "".

    Returns:
        The cleaned tokens joined by single spaces (possibly "").
    """
    if not isinstance(text, str):
        return ""

    text = _URL_RE.sub(" ", text.lower())
    text = _NON_ALPHA_RE.sub(" ", text)
    text = _WHITESPACE_RE.sub(" ", text).strip()

    try:
        tokens = word_tokenize(text)
    except Exception:
        # Tokenizer data unavailable: the text is already letters and single
        # spaces, so a whitespace split is a sound fallback.
        tokens = text.split()

    stop_words = _english_stopwords()
    return " ".join(t for t in tokens if t not in stop_words and len(t) > 2)
# ─── Model loaders ────────────────────────────────────────────────────────────
@st.cache_resource(show_spinner=False)
def _build_classifier(model_id, token):
    """Build the Hugging Face text-classification pipeline for ``model_id``.

    Cached by Streamlit so the model is not rebuilt on every button click.
    Errors propagate to the caller instead of being returned, so a failed
    load is not stored in the cache and the next attempt retries.
    """
    # Imported lazily so the app can start even when transformers is slow
    # to import or missing.
    from transformers import pipeline as hf_pipeline

    kwargs = {
        "task": "text-classification",
        "model": model_id,
        "truncation": True,
        "max_length": 512,
    }
    if token:
        kwargs["token"] = token
    return hf_pipeline(**kwargs)


def load_classifier():
    """Load the news classifier.

    Replace MODEL_ID with your fine-tuned model pushed to HF Hub in Task 4.
    e.g. "Akilashamnaka12/news_classifier_model"
    If your Space or model is private, add HF_TOKEN as a Secret in Space settings.

    Returns:
        tuple: ``(pipeline, None)`` on success, ``(None, error_message)`` when
        the model could not be loaded.
    """
    MODEL_ID = "Akilashamnaka12/news_classifier_model"  # ← swap after Task 4
    try:
        return _build_classifier(MODEL_ID, HF_TOKEN), None
    except Exception as e:
        return None, str(e)
@st.cache_resource(show_spinner=False)
def _build_qa(model_name):
    """Load tokenizer and extractive-QA model for ``model_name``.

    Cached by Streamlit so the weights are not reloaded on every question.
    Errors propagate so that a failed load is not cached.
    """
    from transformers import AutoTokenizer, AutoModelForQuestionAnswering
    import torch  # noqa: F401  (inference needs it; fail here, at load time)

    tok = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    return tok, model


def load_qa():
    """Load the question-answering model.

    Returns:
        tuple: ``((tokenizer, model), None)`` on success,
        ``(None, error_message)`` when loading failed.
    """
    QA_MODEL = "deepset/roberta-base-squad2"
    try:
        return _build_qa(QA_MODEL), None
    except Exception as e:
        return None, str(e)
# ──────────────────────────────────────────────────────────────────────────────
# HERO
# ──────────────────────────────────────────────────────────────────────────────
# The HTML is built without blank or indented lines so Markdown passes it
# through as a single raw-HTML block.
st.markdown(
    '<div class="hero">'
    '<div class="hero-eyebrow"><h5>📰 Text Analytics · DA3111 - Group 6</h5></div>'
    '<div class="hero-title">News Lens</div>'
    '<div class="hero-sub">'
    "Classify News articles, interrogate content with Q&amp;A, "
    "and surface editorial insights — all in one unified workspace."
    "</div>"
    "</div>",
    unsafe_allow_html=True,
)

# The three workspaces of the app; each is filled in by its own section below.
tab1, tab2, tab3 = st.tabs([
    " 📂 Text Classification ",
    " 💬 Q & A Pipeline ",
    " 📊 Insights ",
])
# ──────────────────────────────────────────────────────────────────────────────
# TAB 1 — TEXT CLASSIFICATION
# ──────────────────────────────────────────────────────────────────────────────
import html  # escapes model-provided text before it is placed into raw HTML


def _classify_texts(clf, texts, progress=None):
    """Predict one category per text.

    Args:
        clf: Callable classifier; ``clf(text)[0]["label"]`` is the raw label.
        texts: List of strings to classify.
        progress: Optional Streamlit progress bar updated after each row.

    Returns:
        list[str]: one label per input; "Unknown" for empty rows and for rows
        where inference raised.
    """
    total = len(texts)
    labels = []
    for done, txt in enumerate(texts, start=1):
        # Fall back to the raw text when preprocessing strips everything.
        clean = preprocess_text(txt) or txt[:512]
        if not clean.strip():
            label = "Unknown"  # nothing to classify
        else:
            try:
                label = normalise_label(clf(clean[:512])[0]["label"])
            except Exception:
                # One bad row must not abort the whole batch.
                label = "Unknown"
        labels.append(label)
        if progress is not None:
            progress.progress(done / total, text=f"Classifying {done}/{total}…")
    return labels


with tab1:
    left, right = st.columns([1.1, 1], gap="large")

    # ── Left column: upload, preview, run ────────────────────────────────────
    with left:
        st.markdown('<div class="section-label">Upload</div>', unsafe_allow_html=True)
        # Each st.markdown call renders as its own element, so the card is
        # opened and closed in the same call.
        st.markdown(
            '<div class="card">'
            '<div class="card-title">Upload your CSV file</div>'
            '<div class="card-sub"><code style="background: #00c8b4; color:#000000">'
            "Must contain a content column with news excerpts.</code></div>"
            "</div>",
            unsafe_allow_html=True,
        )
        uploaded = st.file_uploader("Upload CSV", type=["csv"],
                                    label_visibility="collapsed")

        df_raw = None
        if uploaded:
            try:
                uploaded.seek(0)  # reset buffer — important on HF Spaces
                df_raw = pd.read_csv(uploaded)
            except Exception as e:
                # Report and carry on so the other tabs still render.
                st.error(f"Could not parse CSV: {e}")

        if df_raw is not None and "content" not in df_raw.columns:
            st.error("❌ The uploaded file must have a `content` column.")
        elif df_raw is not None:
            st.markdown(
                '<div class="stat-row">'
                f'<div class="stat-chip"><span>{len(df_raw)}</span>Records</div>'
                f'<div class="stat-chip"><span>{df_raw.shape[1]}</span>Columns</div>'
                "</div>",
                unsafe_allow_html=True,
            )
            st.markdown('<div class="section-label" style="margin-top:1rem">Preview</div>',
                        unsafe_allow_html=True)
            st.dataframe(df_raw.head(5), use_container_width=True, hide_index=True)

            if st.button("⚡ Run Classification", use_container_width=True):
                with st.spinner("Loading classifier… (first run ~30 s on HF Spaces)"):
                    clf, err = load_classifier()
                if err:
                    st.error(f"Model load error: {err}")
                else:
                    df_out = df_raw.copy()
                    # astype(str): numeric cells must not break slicing.
                    texts = df_out["content"].fillna("").astype(str).tolist()
                    prog = st.progress(0, text="Classifying…")
                    df_out["class"] = _classify_texts(clf, texts, prog)
                    prog.empty()
                    st.session_state["df_classified"] = df_out
                    st.session_state["classification_done"] = True
                    st.rerun()  # redraw so every tab sees the new results

    # ── Right column: results ────────────────────────────────────────────────
    with right:
        st.markdown('<div class="section-label">Results</div>', unsafe_allow_html=True)
        if st.session_state.get("classification_done"):
            df_out = st.session_state["df_classified"]
            counts = df_out["class"].value_counts()

            chips = []
            for cat, cnt in counts.items():
                badge = CAT_BADGE.get(cat, "badge-teal")
                cat_label = html.escape(str(cat).replace("_", " "))
                chips.append(f'<div class="stat-chip"><span>{cnt}</span>'
                             f'<span class="badge {badge}">{cat_label}</span></div>')
            st.markdown('<div class="stat-row">' + "".join(chips) + "</div>",
                        unsafe_allow_html=True)

            cols = [c for c in ["content", "class"] if c in df_out.columns]
            st.markdown(
                '<div class="card" style="margin-top:.8rem">'
                '<div class="card-title">Classified Records</div>'
                "</div>",
                unsafe_allow_html=True,
            )
            st.dataframe(
                df_out[cols].head(20), use_container_width=True, hide_index=True,
                column_config={"content": st.column_config.TextColumn("Content", width="large")},
            )
            st.download_button(
                "⬇ Download output.csv",
                data=df_out.to_csv(index=False).encode("utf-8"),
                file_name="output.csv", mime="text/csv",
                use_container_width=True,
            )
        else:
            st.markdown(
                '<div class="card" style="text-align:center;padding:3.5rem 2rem;">'
                '<div style="font-size:3rem;margin-bottom:1rem">📂</div>'
                "<div style=\"font-family:'Syne',sans-serif;font-size:1rem;font-weight:700;color:#334155;\">"
                "Upload a CSV to see results</div>"
                '<div style="font-size:.82rem;color:#475569;margin-top:.4rem;">'
                "Predictions appear here after classification runs.</div>"
                "</div>",
                unsafe_allow_html=True,
            )
# ──────────────────────────────────────────────────────────────────────────────
# TAB 2 — Q&A PIPELINE
# ──────────────────────────────────────────────────────────────────────────────
import html  # escapes the extracted answer before it is placed into raw HTML


def _extract_answer(tok, model, question, context, max_answer_tokens=64):
    """Run extractive QA and return ``(answer, confidence)``.

    The best span is chosen jointly over start and end, restricted to context
    tokens with ``start <= end`` and at most ``max_answer_tokens`` tokens.
    Picking the two argmaxes independently can yield an inverted (empty) span
    or a span inside the question.

    Args:
        tok: Fast tokenizer (``sequence_ids`` is needed to locate the context).
        model: Question-answering model matching ``tok``.
        question: Question text.
        context: Passage to extract the answer from.
        max_answer_tokens: Upper bound on the answer length in tokens.

    Returns:
        tuple[str, float]: the answer ("" when the no-answer score beats every
        valid span) and a confidence in [0, 1], the mean of the start and end
        probabilities of the selected positions.
    """
    import torch

    # "only_second": when the pair is too long, trim the context, never the
    # question.
    inputs = tok(question, context, return_tensors="pt",
                 truncation="only_second", max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)

    start_p = outputs.start_logits[0].softmax(dim=-1)
    end_p = outputs.end_logits[0].softmax(dim=-1)
    n = start_p.size(0)

    # sequence id 1 marks context tokens (0 = question, None = special tokens).
    in_ctx = torch.tensor([sid == 1 for sid in inputs.sequence_ids(0)],
                          dtype=torch.bool)
    # Band matrix: 0 <= end - start < max_answer_tokens.
    band = torch.ones(n, n).triu().tril(max_answer_tokens - 1).bool()
    valid = band & in_ctx[:, None] & in_ctx[None, :]

    # Position 0 is the sequence-start token; its score stands for "no answer".
    null_conf = float((start_p[0] + end_p[0]) / 2)
    if not bool(valid.any()):
        return "", null_conf

    span_scores = (start_p[:, None] * end_p[None, :]).masked_fill(~valid, -1.0)
    start, end = divmod(int(span_scores.argmax()), n)
    if float(start_p[0] * end_p[0]) > float(span_scores[start, end]):
        return "", null_conf

    answer = tok.decode(inputs["input_ids"][0][start:end + 1],
                        skip_special_tokens=True).strip()
    return answer, float((start_p[start] + end_p[end]) / 2)


with tab2:
    l2, r2 = st.columns([1, 1], gap="large")

    # ── Left column: context ─────────────────────────────────────────────────
    with l2:
        st.markdown('<div class="section-label">Context</div>', unsafe_allow_html=True)
        st.markdown(
            '<div class="card">'
            '<div class="card-title">Paste a news excerpt</div>'
            '<div class="card-sub">The Q&amp;A model will read this as its context.</div>'
            "</div>",
            unsafe_allow_html=True,
        )

        # Suggest the first classified article as the context.
        default_ctx = ""
        if st.session_state.get("classification_done"):
            df_c = st.session_state["df_classified"]
            if len(df_c):
                first_article = df_c["content"].iloc[0]
                default_ctx = "" if pd.isna(first_article) else str(first_article)

        # Seed the widget through session state, and only when the suggestion
        # itself changes, so a new classification refreshes the box without
        # overwriting what the user has typed in between.
        if st.session_state.get("_qa_context_seed") != default_ctx:
            st.session_state["_qa_context_seed"] = default_ctx
            st.session_state["qa_context"] = default_ctx

        context_text = st.text_area("News excerpt", height=260,
                                    placeholder="Paste any news article content here…",
                                    label_visibility="collapsed", key="qa_context")

    # ── Right column: question and answer ────────────────────────────────────
    with r2:
        st.markdown('<div class="section-label">Question</div>', unsafe_allow_html=True)
        st.markdown(
            '<div class="card">'
            '<div class="card-title">Ask anything about the article</div>'
            '<div class="card-sub">The model extracts an answer from the context on the left.</div>'
            "</div>",
            unsafe_allow_html=True,
        )
        question_text = st.text_area("Question", height=120,
                                     placeholder="e.g. Who is mentioned in this article?",
                                     label_visibility="collapsed", key="qa_question")
        ask_btn = st.button("🔍 Get Answer", use_container_width=True)

        if ask_btn:
            if not context_text.strip():
                st.warning("Please paste a news excerpt in the Context panel on the left.")
            elif not question_text.strip():
                st.warning("Please type a question.")
            else:
                with st.spinner("Loading Q&A model (first run ~30 s)"):
                    qa, err = load_qa()
                if err:
                    st.error(f"Q&A model failed to load: {err}")
                else:
                    with st.spinner("Finding the answer..."):
                        try:
                            tok, model = qa
                            answer, confidence = _extract_answer(
                                tok, model,
                                question_text.strip(),
                                context_text.strip()[:3000],
                            )
                        except Exception as e:
                            st.error(f"Inference error: {e}")
                        else:
                            score_pct = int(confidence * 100)
                            # The answer is user-supplied text: escape it.
                            shown = (html.escape(answer) if answer
                                     else "No answer found in the provided context.")
                            st.markdown(
                                '<div class="answer-box">'
                                '<div class="answer-label">Answer</div>'
                                f'<div class="answer-text">{shown}</div>'
                                '<div class="score-bar-wrap">'
                                f'<div class="score-bar-label">Confidence : {score_pct}%</div>'
                                '<div class="score-bar-outer">'
                                f'<div class="score-bar-inner" style="width:{score_pct}%"></div>'
                                "</div>"
                                "</div>"
                                "</div>",
                                unsafe_allow_html=True,
                            )

    # ── Suggested questions (shown once a dataset has been classified) ───────
    if st.session_state.get("classification_done"):
        st.markdown("---")
        st.markdown('<div class="section-label">Suggested Questions</div>',
                    unsafe_allow_html=True)
        suggestions = ["Who is this article about?", "What event is described?",
                       "Where did this take place?", "What was the outcome?"]
        for col, suggestion in zip(st.columns(len(suggestions)), suggestions):
            col.markdown(
                '<div class="card" style="padding:1rem 1.2rem;text-align:center;">'
                f'<div style="font-size:.85rem;color:#94a3b8;">{suggestion}</div>'
                "</div>",
                unsafe_allow_html=True,
            )
# ──────────────────────────────────────────────────────────────────────────────
# TAB 3 — INSIGHTS
# ──────────────────────────────────────────────────────────────────────────────
_CHART_BG = "#0f172a"    # chart background, same colour as the cards
_CHART_AXIS = "#1e2d45"  # colour of the visible axis spines


def _style_bar_axes(fig, ax):
    """Apply the dark theme to a bar chart: background plus left/bottom spines."""
    ax.set_facecolor(_CHART_BG)
    fig.patch.set_facecolor(_CHART_BG)
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    for side in ("left", "bottom"):
        ax.spines[side].set_color(_CHART_AXIS)


with tab3:
    if not st.session_state.get("classification_done"):
        st.markdown(
            '<div class="card" style="text-align:center;padding:4rem 2rem;">'
            '<div style="font-size:3.5rem;margin-bottom:1rem">🔒</div>'
            "<div style=\"font-family:'Syne',sans-serif;font-size:1.1rem;font-weight:700;color:#334155;\">"
            "Insights unlock after classification</div>"
            '<div style="font-size:.88rem;color:#475569;margin-top:.5rem;">'
            'Go to <strong style="color:#00c8b4">Text Classification</strong>, '
            "upload a CSV, and run the model first.</div>"
            "</div>",
            unsafe_allow_html=True,
        )
    else:
        df_ins = st.session_state["df_classified"]
        counts = df_ins["class"].value_counts()
        total = len(df_ins)

        # ── KPI row: one card per known category ─────────────────────────────
        for col, cat in zip(st.columns(len(CATEGORIES)), CATEGORIES):
            cnt = int(counts.get(cat, 0))
            pct = round(cnt / total * 100, 1) if total else 0
            badge = CAT_BADGE.get(cat, "badge-teal")
            col.markdown(
                '<div class="card" style="text-align:center;padding:1.4rem 1rem;">'
                f'<div class="badge {badge}" style="margin-bottom:.7rem">{cat.replace("_", " ")}</div>'
                "<div style=\"font-family:'Syne',sans-serif;font-size:1.9rem;"
                f'font-weight:800;color:#e2e8f0">{cnt}</div>'
                f'<div style="font-size:.78rem;color:#64748b;margin-top:.2rem">{pct}% of total</div>'
                "</div>",
                unsafe_allow_html=True,
            )

        st.markdown("---")
        ch1, ch2 = st.columns(2, gap="large")

        # Labels and colours for every class present in the data, including
        # any label outside CATEGORIES, which gets the default teal.
        class_names = [str(c).replace("_", " ") for c in counts.index]
        class_colors = [CAT_COLOR.get(c, "#00c8b4") for c in counts.index]

        # ── Donut chart ──────────────────────────────────────────────────────
        with ch1:
            st.markdown('<div class="section-label">Category Distribution</div>',
                        unsafe_allow_html=True)
            fig, ax = plt.subplots(figsize=(5, 4.2), facecolor=_CHART_BG)
            wedges, _, autotexts = ax.pie(
                counts.values, labels=None, autopct="%1.1f%%", colors=class_colors,
                startangle=120,
                wedgeprops=dict(width=0.55, edgecolor="#07090f", linewidth=2),
                pctdistance=0.78,
            )
            for pct_text in autotexts:
                pct_text.set_color("#e2e8f0")
                pct_text.set_fontsize(8.5)
                pct_text.set_fontweight("bold")
            ax.legend(wedges, class_names, loc="lower center",
                      bbox_to_anchor=(0.5, -0.12), ncol=3, frameon=False,
                      labelcolor="#94a3b8", fontsize=8)
            ax.set_facecolor(_CHART_BG)
            fig.patch.set_facecolor(_CHART_BG)
            st.pyplot(fig, use_container_width=True)
            plt.close(fig)  # release the figure; the script re-runs often

        # ── Horizontal bar chart ─────────────────────────────────────────────
        with ch2:
            st.markdown('<div class="section-label">Article Counts by Category</div>',
                        unsafe_allow_html=True)
            fig2, ax2 = plt.subplots(figsize=(5, 4.2), facecolor=_CHART_BG)
            bars = ax2.barh(class_names, counts.values, color=class_colors,
                            height=0.55, edgecolor="none")
            _style_bar_axes(fig2, ax2)
            ax2.tick_params(colors="#64748b", labelsize=8.5)
            for bar in bars:
                # Print the count just past the end of each bar.
                ax2.text(bar.get_width() + 0.4, bar.get_y() + bar.get_height() / 2,
                         str(int(bar.get_width())), va="center", ha="left",
                         color="#e2e8f0", fontsize=8.5, fontweight="bold")
            st.pyplot(fig2, use_container_width=True)
            plt.close(fig2)

        # ── Word cloud for one category ──────────────────────────────────────
        st.markdown("---")
        st.markdown('<div class="section-label">Word Cloud by Category</div>',
                    unsafe_allow_html=True)
        selected_cat = st.selectbox("Category", options=CATEGORIES,
                                    format_func=lambda c: c.replace("_", " "),
                                    label_visibility="collapsed")
        selected_name = selected_cat.replace("_", " ")

        cat_texts = df_ins[df_ins["class"] == selected_cat]["content"].fillna("").tolist()
        # Only the first 200 articles are used, to keep reruns responsive.
        combined = " ".join(preprocess_text(t) for t in cat_texts[:200])

        if combined.strip():
            wc = WordCloud(width=900, height=340, background_color=_CHART_BG,
                           colormap="cool", max_words=120,
                           collocations=False).generate(combined)
            fig3, ax3 = plt.subplots(figsize=(9, 3.5), facecolor=_CHART_BG)
            ax3.imshow(wc, interpolation="bilinear")
            ax3.axis("off")
            fig3.patch.set_facecolor(_CHART_BG)
            st.pyplot(fig3, use_container_width=True)
            plt.close(fig3)
        else:
            st.info(f"No content found for: {selected_name}")

        # ── Most frequent words for the same category ────────────────────────
        st.markdown("---")
        st.markdown(f'<div class="section-label">Top Unigrams · {selected_name}</div>',
                    unsafe_allow_html=True)
        top_words = Counter(combined.split()).most_common(15)
        if top_words:
            words, freqs = zip(*top_words)
            fig4, ax4 = plt.subplots(figsize=(9, 3), facecolor=_CHART_BG)
            ax4.bar(words, freqs, color=CAT_COLOR.get(selected_cat, "#00c8b4"),
                    edgecolor="none", width=0.6)
            _style_bar_axes(fig4, ax4)
            ax4.tick_params(axis="x", colors="#64748b", labelsize=8, rotation=30)
            ax4.tick_params(axis="y", colors="#64748b", labelsize=8)
            st.pyplot(fig4, use_container_width=True)
            plt.close(fig4)
# ─── Footer ───────────────────────────────────────────────────────────────────
# Static credit line rendered below all tabs.
st.markdown(
    '<div style="text-align:center;padding:2.5rem 0 1rem;color:#2a3a55;'
    'font-size:.78rem;border-top:1px solid #1a2a44;margin-top:3rem;">'
    'Built for <strong style="color:#00c8b4">'
    "IN23-S5-DA3111 · Text Analytics Group Project</strong> "
    "· Powered by Hugging Face &amp; Streamlit"
    "</div>",
    unsafe_allow_html=True,
)