import functools
import os
import re
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import streamlit as st
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
# ─── HF token ─────────────────────────────────────────────────────────────────
# Read from the environment; set as a Secret in Space settings for
# private/gated models. Absent variable yields None (public models only).
HF_TOKEN = os.environ.get("HF_TOKEN")
# ─── Page Config ──────────────────────────────────────────────────────────────
# Must run before any other Streamlit call in the script.
st.set_page_config(
    page_title="NewsLens · Sri Lanka",
    page_icon="🔎",
    layout="wide",
    initial_sidebar_state="collapsed",
)
# ─── NLTK – write to /tmp so HF Spaces (read-only FS) can cache data ──────────
NLTK_DATA_DIR = "/tmp/nltk_data"
os.makedirs(NLTK_DATA_DIR, exist_ok=True)
# Prepend so the writable directory is searched before NLTK's defaults.
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.insert(0, NLTK_DATA_DIR)
@st.cache_resource
def download_nltk():
    """Fetch the NLTK corpora/models the app needs into the /tmp cache.

    Each package is attempted independently so one failed download does not
    block the others. Errors are deliberately swallowed: the text pipeline
    degrades gracefully when NLTK data is unavailable.
    """
    packages = ("stopwords", "punkt", "punkt_tab")
    for package in packages:
        try:
            nltk.download(package, download_dir=NLTK_DATA_DIR, quiet=True)
        except Exception:
            # Best effort — preprocessing falls back to regex-only cleaning.
            pass
# Kick off corpus downloads once per process (memoised by st.cache_resource).
download_nltk()
# ─── CSS ──────────────────────────────────────────────────────────────────────
# NOTE(review): the style block below appears empty — confirm the CSS payload
# survived; as written this renders nothing.
st.markdown("""
""", unsafe_allow_html=True)
# ─── Constants ────────────────────────────────────────────────────────────────
# The five assignment categories, their badge CSS classes, and chart colours.
CATEGORIES = ["Business", "Opinion", "Political_gossip", "Sports", "World_news"]
CAT_BADGE = {
    "Business": "badge-teal",
    "Opinion": "badge-blue",
    "Political_gossip": "badge-amber",
    "Sports": "badge-rose",
    "World_news": "badge-violet",
}
CAT_COLOR = {
    "Business": "#00c8b4",
    "Opinion": "#60a5fa",
    "Political_gossip": "#fbbf24",
    "Sports": "#fb7185",
    "World_news": "#a78bfa",
}
# Map whatever the model returns → one of the 5 assignment class names
LABEL_MAP = {
    "business": "Business",
    "opinion": "Opinion",
    "political_gossip": "Political_gossip",
    "political gossip": "Political_gossip",
    "sports": "Sports",
    "world_news": "World_news",
    "world news": "World_news",
    "world": "World_news",
    "label_0": "Business",
    "label_1": "Opinion",
    "label_2": "Political_gossip",
    "label_3": "Sports",
    "label_4": "World_news",
    "business and finance": "Business",
    "opinions and editorials": "Opinion",
    "politics": "Political_gossip",
}


def normalise_label(raw: str) -> str:
    """Coerce a raw classifier label into one of the five canonical names.

    Canonical labels pass through untouched; anything else is looked up
    case-insensitively (after stripping whitespace) in LABEL_MAP. Unknown
    labels are returned as-is so callers can surface them rather than
    silently mislabel.
    """
    if raw not in CATEGORIES:
        return LABEL_MAP.get(raw.strip().lower(), raw)
    return raw
# ─── Text preprocessor ────────────────────────────────────────────────────────
def preprocess_text(text: str) -> str:
    """Normalise a news excerpt for classification and word-cloud building.

    Lower-cases, strips URLs and non-alphabetic characters, collapses
    whitespace, then (best effort) drops English stopwords and tokens of
    two characters or fewer. When NLTK data/models are unavailable the
    regex-cleaned text is returned unchanged instead of failing.

    Args:
        text: Raw article content; non-string values yield "".

    Returns:
        A lower-case, space-separated string of content words.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www\.\S+", " ", text)  # drop URLs before punctuation pass
    text = re.sub(r"[^a-z\s]", " ", text)          # keep letters/whitespace only
    text = re.sub(r"\s+", " ", text).strip()
    try:
        # Cached helper: the original rebuilt the stopword set on every call,
        # which is O(corpus) per row during bulk classification.
        sw = _english_stopwords()
        tokens = word_tokenize(text)
        text = " ".join(t for t in tokens if t not in sw and len(t) > 2)
    except Exception:
        # NLTK corpora may be missing on a fresh Space; degrade gracefully.
        pass
    return text


@functools.lru_cache(maxsize=1)
def _english_stopwords() -> frozenset:
    """Build the English stopword set once; cached for all later calls."""
    return frozenset(stopwords.words("english"))
# ─── Model loaders ────────────────────────────────────────────────────────────
@st.cache_resource(show_spinner=False)
def load_classifier():
    """
    Load the fine-tuned news classifier pipeline from the HF Hub.

    Replace MODEL_ID with your fine-tuned model pushed to HF Hub in Task 4.
    e.g. "Akilashamnaka12/news_classifier_model"
    If your Space or model is private, add HF_TOKEN as a Secret in Space settings.

    Returns:
        (pipeline, None) on success, or (None, error_message) on failure.
    """
    MODEL_ID = "Akilashamnaka12/news_classifier_model"  # ← swap after Task 4
    try:
        from transformers import pipeline as hf_pipeline
        pipeline_args = dict(
            task="text-classification",
            model=MODEL_ID,
            truncation=True,
            max_length=512,
        )
        if HF_TOKEN:
            pipeline_args["token"] = HF_TOKEN
        return hf_pipeline(**pipeline_args), None
    except Exception as e:
        return None, str(e)
@st.cache_resource(show_spinner=False)  # was applied twice — one cache layer suffices
def load_qa():
    """Load the extractive Q&A tokenizer + model from the HF Hub.

    Returns:
        ((tokenizer, model), None) on success, or (None, error_message)
        when transformers/torch are unavailable or the download fails.
    """
    QA_MODEL = "deepset/roberta-base-squad2"
    try:
        from transformers import AutoTokenizer, AutoModelForQuestionAnswering
        import torch  # noqa: F401 — imported so a missing torch surfaces as a load error
        tok = AutoTokenizer.from_pretrained(QA_MODEL)
        model = AutoModelForQuestionAnswering.from_pretrained(QA_MODEL)
        return (tok, model), None
    except Exception as e:
        return None, str(e)
# ══════════════════════════════════════════════════════════════════════════════
# HERO
# ══════════════════════════════════════════════════════════════════════════════
# Landing banner; rendered as HTML (markup appears stripped from this copy).
st.markdown("""
🔎 Text Analytics · DA3111 - Group 6
News Lens
Classify News articles, interrogate content with Q&A,
and surface editorial insights — all in one unified workspace.
""", unsafe_allow_html=True)
# Three top-level views: classification, Q&A, and insights dashboards.
tab1, tab2, tab3 = st.tabs([
    " 📂 Text Classification ",
    " 💬 Q & A Pipeline ",
    " 📊 Insights ",
])
# ══════════════════════════════════════════════════════════════════════════════
# TAB 1 – TEXT CLASSIFICATION
# ══════════════════════════════════════════════════════════════════════════════
with tab1:
    # Two-pane layout: upload/controls on the left, results on the right.
    left, right = st.columns([1.1, 1], gap="large")
    with left:
        # Section header card (HTML markup appears stripped from this copy).
        st.markdown('Upload
', unsafe_allow_html=True)
        st.markdown("""
Upload your CSV file
Must contain a content column with news excerpts.
""", unsafe_allow_html=True)
        uploaded = st.file_uploader("", type=["csv"], label_visibility="collapsed")
        st.markdown("
", unsafe_allow_html=True)
        if uploaded:
            try:
                uploaded.seek(0)  # reset buffer – important on HF Spaces
                df_raw = pd.read_csv(uploaded)
            except Exception as e:
                st.error(f"Could not parse CSV: {e}")
                st.stop()
            # The classifier reads the `content` column; bail out early if absent.
            if "content" not in df_raw.columns:
                st.error("❌ The uploaded file must have a `content` column.")
            else:
                # Quick shape summary of the uploaded frame.
                st.markdown(f"""
{len(df_raw)}Records
{df_raw.shape[1]}Columns
""", unsafe_allow_html=True)
                st.markdown('Preview
',
                            unsafe_allow_html=True)
                st.dataframe(df_raw.head(5), use_container_width=True, hide_index=True)
                run_btn = st.button("⚡ Run Classification", use_container_width=True)
                if run_btn:
                    with st.spinner("Loading classifier… (first run ~30 s on HF Spaces)"):
                        clf, err = load_classifier()
                    if err:
                        st.error(f"Model load error: {err}")
                    else:
                        df_out = df_raw.copy()
                        pred_labels = []
                        prog = st.progress(0, text="Classifying…")
                        texts = df_out["content"].fillna("").tolist()
                        # Classify row by row so the progress bar stays live.
                        for i, txt in enumerate(texts):
                            # Fall back to raw text when preprocessing empties it.
                            clean = preprocess_text(txt) or txt[:512]
                            try:
                                raw = clf(clean[:512])[0]["label"]
                                label = normalise_label(raw)
                            except Exception:
                                # Per-row inference failures are recorded, not fatal.
                                label = "Unknown"
                            pred_labels.append(label)
                            prog.progress((i + 1) / len(texts),
                                          text=f"Classifying {i+1}/{len(texts)}…")
                        prog.empty()
                        df_out["class"] = pred_labels
                        # Persist so the other tabs (and reruns) can reuse results.
                        st.session_state["df_classified"] = df_out
                        st.session_state["classification_done"] = True
                        st.rerun()
    with right:
        st.markdown('Results
', unsafe_allow_html=True)
        if st.session_state.get("classification_done"):
            df_out = st.session_state["df_classified"]
            counts = df_out["class"].value_counts()
            # One badge "chip" per predicted category, showing its count.
            # `badge` is presumably interpolated in the (stripped) HTML — verify.
            chip_html = ''
            for cat, cnt in counts.items():
                badge = CAT_BADGE.get(cat, "badge-teal")
                chip_html += (f'
{cnt}'
                              f'{cat.replace("_"," ")}
')
            chip_html += "
"
            st.markdown(chip_html, unsafe_allow_html=True)
            # Show only the columns we know exist.
            cols = [c for c in ["content", "class"] if c in df_out.columns]
            st.markdown('', unsafe_allow_html=True)
            st.markdown('
Classified Records
', unsafe_allow_html=True)
            st.dataframe(df_out[cols].head(20), use_container_width=True, hide_index=True,
                         column_config={"content": st.column_config.TextColumn("Content", width="large")})
            st.markdown("
", unsafe_allow_html=True)
            # Full (not truncated) predictions are exported here.
            st.download_button(
                "⬇ Download output.csv",
                data=df_out.to_csv(index=False).encode("utf-8"),
                file_name="output.csv", mime="text/csv",
                use_container_width=True,
            )
        else:
            # Placeholder shown until a classification run completes.
            st.markdown("""
📂
Upload a CSV to see results
Predictions appear here after classification runs.
""", unsafe_allow_html=True)
# ══════════════════════════════════════════════════════════════════════════════
# TAB 2 – Q&A PIPELINE
# ══════════════════════════════════════════════════════════════════════════════
with tab2:
    # Left pane: context passage; right pane: question input + trigger button.
    l2, r2 = st.columns([1, 1], gap="large")
    with l2:
        # Panel headers (HTML markup appears stripped from this copy).
        st.markdown('Context
', unsafe_allow_html=True)
        st.markdown('', unsafe_allow_html=True)
        st.markdown('
Paste a news excerpt
', unsafe_allow_html=True)
        st.markdown('
The Q&A model will read this as its context.
',
                    unsafe_allow_html=True)
        # Pre-fill the context with the first classified article, if available.
        default_ctx = ""
        if st.session_state.get("classification_done"):
            df_c = st.session_state["df_classified"]
            if len(df_c):
                default_ctx = str(df_c["content"].iloc[0])
        context_text = st.text_area("", value=default_ctx, height=260,
                                    placeholder="Paste any news article content here…",
                                    label_visibility="collapsed", key="qa_context")
        st.markdown("
", unsafe_allow_html=True)
    with r2:
        st.markdown('Question
', unsafe_allow_html=True)
        st.markdown('', unsafe_allow_html=True)
        st.markdown('
Ask anything about the article
', unsafe_allow_html=True)
        st.markdown('
The model extracts an answer from the context on the left.
',
                    unsafe_allow_html=True)
        question_text = st.text_area("", height=120,
                                     placeholder="e.g. Who is mentioned in this article?",
                                     label_visibility="collapsed", key="qa_question")
        ask_btn = st.button("🔍 Get Answer", use_container_width=True)
        st.markdown("
", unsafe_allow_html=True)
    if ask_btn:
        # Validate both inputs before loading the (heavy) model.
        if not context_text.strip():
            st.warning("Please paste a news excerpt in the Context panel on the left.")
        elif not question_text.strip():
            st.warning("Please type a question.")
        else:
            with st.spinner("Loading Q&A model (first run ~30 s)"):
                qa, err = load_qa()
            if err:
                st.error(f"Q&A model failed to load: {err}")
            else:
                with st.spinner("Finding the answer..."):
                    try:
                        import torch
                        tok, model = qa
                        q = question_text.strip()
                        ctx = context_text.strip()[:3000]  # cap context size
                        inputs = tok(q, ctx, return_tensors="pt",
                                     truncation=True, max_length=512)
                        with torch.no_grad():
                            outputs = model(**inputs)
                        # Extractive QA: answer span between best start/end logits.
                        start = outputs.start_logits.argmax()
                        end = outputs.end_logits.argmax() + 1
                        answer = tok.convert_tokens_to_string(
                            tok.convert_ids_to_tokens(
                                inputs["input_ids"][0][start:end]
                            )
                        )
                        # Add this line to clean RoBERTa special characters
                        answer = answer.replace("Ġ", " ").strip()
                        # Confidence ≈ mean of the best start/end token probabilities.
                        start_prob = outputs.start_logits.softmax(dim=-1).max().item()
                        end_prob = outputs.end_logits.softmax(dim=-1).max().item()
                        score_pct = int(((start_prob + end_prob) / 2) * 100)
                        st.markdown(f"""
Answer
{answer}
Confidence : {score_pct}%
""", unsafe_allow_html=True)
                    except Exception as e:
                        st.error(f"Inference error: {e}")
    # Suggested-question chips, shown once a dataset has been classified.
    if st.session_state.get("classification_done"):
        st.markdown("---")
        st.markdown('Suggested Questions
', unsafe_allow_html=True)
        c1, c2, c3, c4 = st.columns(4)
        for col, q in zip([c1, c2, c3, c4],
                          ["Who is this article about?", "What event is described?",
                           "Where did this take place?", "What was the outcome?"]):
            # `q` is presumably interpolated in the (stripped) HTML — verify.
            col.markdown(f"""
""", unsafe_allow_html=True)
# ══════════════════════════════════════════════════════════════════════════════
# TAB 3 – INSIGHTS
# ══════════════════════════════════════════════════════════════════════════════
with tab3:
    if not st.session_state.get("classification_done"):
        # Gate: insights need the predictions produced in the first tab.
        st.markdown("""
📊
Insights unlock after classification
Go to Text Classification,
upload a CSV, and run the model first.
""", unsafe_allow_html=True)
    else:
        df_ins = st.session_state["df_classified"]
        counts = df_ins["class"].value_counts()
        total = len(df_ins)
        # KPI row
        kpi_cols = st.columns(5)
        for col, cat in zip(kpi_cols, CATEGORIES):
            cnt = int(counts.get(cat, 0))
            pct = round(cnt / total * 100, 1) if total else 0  # guard empty frame
            # `badge`/`pct` are presumably interpolated in the (stripped) HTML.
            badge = CAT_BADGE.get(cat, "badge-teal")
            col.markdown(f"""
{cat.replace('_',' ')}
{cnt}
{pct}% of total
""", unsafe_allow_html=True)
        st.markdown("---")
        ch1, ch2 = st.columns(2, gap="large")
        with ch1:
            # Donut chart of the category share, themed to the dark UI.
            st.markdown('Category Distribution
', unsafe_allow_html=True)
            fig, ax = plt.subplots(figsize=(5, 4.2), facecolor="#0f172a")
            labels = [c.replace("_", " ") for c in counts.index]
            colors = [CAT_COLOR.get(c, "#00c8b4") for c in counts.index]
            wedges, _, autotexts = ax.pie(
                counts.values, labels=None, autopct="%1.1f%%", colors=colors,
                startangle=120, wedgeprops=dict(width=0.55, edgecolor="#07090f", linewidth=2),
                pctdistance=0.78)
            for at in autotexts:
                at.set_color("#e2e8f0"); at.set_fontsize(8.5); at.set_fontweight("bold")
            ax.legend(wedges, labels, loc="lower center", bbox_to_anchor=(0.5, -0.12),
                      ncol=3, frameon=False, labelcolor="#94a3b8", fontsize=8)
            ax.set_facecolor("#0f172a"); fig.patch.set_facecolor("#0f172a")
            st.pyplot(fig, use_container_width=True); plt.close(fig)
        with ch2:
            # Horizontal bar chart of absolute article counts per category.
            st.markdown('Article Counts by Category
', unsafe_allow_html=True)
            fig2, ax2 = plt.subplots(figsize=(5, 4.2), facecolor="#0f172a")
            bars = ax2.barh([l.replace("_", " ") for l in counts.index], counts.values,
                            color=[CAT_COLOR.get(c, "#00c8b4") for c in counts.index],
                            height=0.55, edgecolor="none")
            ax2.set_facecolor("#0f172a")
            for sp in ["top", "right"]: ax2.spines[sp].set_visible(False)
            for sp in ["left", "bottom"]: ax2.spines[sp].set_color("#1e2d45")
            ax2.tick_params(colors="#64748b", labelsize=8.5)
            # Annotate each bar with its count just past the bar end.
            for bar in bars:
                ax2.text(bar.get_width() + 0.4, bar.get_y() + bar.get_height() / 2,
                         str(int(bar.get_width())), va="center", ha="left",
                         color="#e2e8f0", fontsize=8.5, fontweight="bold")
            fig2.patch.set_facecolor("#0f172a")
            st.pyplot(fig2, use_container_width=True); plt.close(fig2)
        st.markdown("---")
        st.markdown('Word Cloud by Category
', unsafe_allow_html=True)
        selected_cat = st.selectbox("", options=CATEGORIES,
                                    format_func=lambda c: c.replace("_", " "),
                                    label_visibility="collapsed")
        # Word cloud built from (at most) the first 200 articles of the category.
        cat_texts = df_ins[df_ins["class"] == selected_cat]["content"].fillna("").tolist()
        combined = " ".join(preprocess_text(t) for t in cat_texts[:200])
        if combined.strip():
            wc = WordCloud(width=900, height=340, background_color="#0f172a",
                           colormap="cool", max_words=120, collocations=False).generate(combined)
            fig3, ax3 = plt.subplots(figsize=(9, 3.5), facecolor="#0f172a")
            ax3.imshow(wc, interpolation="bilinear"); ax3.axis("off")
            fig3.patch.set_facecolor("#0f172a")
            st.pyplot(fig3, use_container_width=True); plt.close(fig3)
        else:
            st.info(f"No content found for: {selected_cat.replace('_',' ')}")
        st.markdown("---")
        st.markdown(f'Top Unigrams · {selected_cat.replace("_"," ")}
',
                    unsafe_allow_html=True)
        # Frequency bar chart of the 15 most common words in `combined`.
        top_words = Counter(combined.split()).most_common(15)
        if top_words:
            words, freqs = zip(*top_words)
            fig4, ax4 = plt.subplots(figsize=(9, 3), facecolor="#0f172a")
            ax4.bar(words, freqs, color=CAT_COLOR.get(selected_cat, "#00c8b4"), edgecolor="none", width=0.6)
            ax4.set_facecolor("#0f172a")
            for sp in ["top", "right"]: ax4.spines[sp].set_visible(False)
            for sp in ["left", "bottom"]: ax4.spines[sp].set_color("#1e2d45")
            ax4.tick_params(axis="x", colors="#64748b", labelsize=8, rotation=30)
            ax4.tick_params(axis="y", colors="#64748b", labelsize=8)
            fig4.patch.set_facecolor("#0f172a")
            st.pyplot(fig4, use_container_width=True); plt.close(fig4)
# ─── Footer ───────────────────────────────────────────────────────────────────
# Static attribution banner (HTML markup appears stripped from this copy).
st.markdown("""
Built for IN23-S5-DA3111 · Text Analytics Group Project
· Powered by Hugging Face & Streamlit
""", unsafe_allow_html=True)