import functools
import os
import re
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import streamlit as st
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
# ─── HF token ─────────────────────────────────────────────────────────────────
# Read from the environment; set as a Secret in Space settings for
# private/gated models. Absent variable yields None (public models only).
HF_TOKEN = os.environ.get("HF_TOKEN")
# ─── Page Config ──────────────────────────────────────────────────────────────
# Must run before any other Streamlit call in the script.
st.set_page_config(
    page_title="NewsLens · Sri Lanka",
    page_icon="🔎",
    layout="wide",
    initial_sidebar_state="collapsed",
)
# ─── NLTK – write to /tmp so HF Spaces (read-only FS) can cache data ──────────
NLTK_DATA_DIR = "/tmp/nltk_data"
os.makedirs(NLTK_DATA_DIR, exist_ok=True)
# Prepend so the writable directory is searched before NLTK's defaults.
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.insert(0, NLTK_DATA_DIR)
@st.cache_resource
def download_nltk():
    """Fetch the NLTK corpora/models the app needs into the /tmp cache.

    Each package is attempted independently so one failed download does not
    block the others. Errors are deliberately swallowed: the text pipeline
    degrades gracefully when NLTK data is unavailable.
    """
    packages = ("stopwords", "punkt", "punkt_tab")
    for package in packages:
        try:
            nltk.download(package, download_dir=NLTK_DATA_DIR, quiet=True)
        except Exception:
            # Best effort — preprocessing falls back to regex-only cleaning.
            pass
# Kick off corpus downloads once per process (memoised by st.cache_resource).
download_nltk()
# ─── CSS ──────────────────────────────────────────────────────────────────────
# NOTE(review): the style block below appears empty — confirm the CSS payload
# survived; as written this renders nothing.
st.markdown("""
""", unsafe_allow_html=True)
# ─── Constants ────────────────────────────────────────────────────────────────
# The five assignment categories, their badge CSS classes, and chart colours.
CATEGORIES = ["Business", "Opinion", "Political_gossip", "Sports", "World_news"]
CAT_BADGE = {
    "Business": "badge-teal",
    "Opinion": "badge-blue",
    "Political_gossip": "badge-amber",
    "Sports": "badge-rose",
    "World_news": "badge-violet",
}
CAT_COLOR = {
    "Business": "#00c8b4",
    "Opinion": "#60a5fa",
    "Political_gossip": "#fbbf24",
    "Sports": "#fb7185",
    "World_news": "#a78bfa",
}
# Map whatever the model returns → one of the 5 assignment class names
LABEL_MAP = {
    "business": "Business",
    "opinion": "Opinion",
    "political_gossip": "Political_gossip",
    "political gossip": "Political_gossip",
    "sports": "Sports",
    "world_news": "World_news",
    "world news": "World_news",
    "world": "World_news",
    "label_0": "Business",
    "label_1": "Opinion",
    "label_2": "Political_gossip",
    "label_3": "Sports",
    "label_4": "World_news",
    "business and finance": "Business",
    "opinions and editorials": "Opinion",
    "politics": "Political_gossip",
}


def normalise_label(raw: str) -> str:
    """Coerce a raw classifier label into one of the five canonical names.

    Canonical labels pass through untouched; anything else is looked up
    case-insensitively (after stripping whitespace) in LABEL_MAP. Unknown
    labels are returned as-is so callers can surface them rather than
    silently mislabel.
    """
    if raw not in CATEGORIES:
        return LABEL_MAP.get(raw.strip().lower(), raw)
    return raw
# ─── Text preprocessor ────────────────────────────────────────────────────────
def preprocess_text(text: str) -> str:
    """Normalise a news excerpt for classification and word-cloud building.

    Lower-cases, strips URLs and non-alphabetic characters, collapses
    whitespace, then (best effort) drops English stopwords and tokens of
    two characters or fewer. When NLTK data/models are unavailable the
    regex-cleaned text is returned unchanged instead of failing.

    Args:
        text: Raw article content; non-string values yield "".

    Returns:
        A lower-case, space-separated string of content words.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www\.\S+", " ", text)  # drop URLs before punctuation pass
    text = re.sub(r"[^a-z\s]", " ", text)          # keep letters/whitespace only
    text = re.sub(r"\s+", " ", text).strip()
    try:
        # Cached helper: the original rebuilt the stopword set on every call,
        # which is O(corpus) per row during bulk classification.
        sw = _english_stopwords()
        tokens = word_tokenize(text)
        text = " ".join(t for t in tokens if t not in sw and len(t) > 2)
    except Exception:
        # NLTK corpora may be missing on a fresh Space; degrade gracefully.
        pass
    return text


@functools.lru_cache(maxsize=1)
def _english_stopwords() -> frozenset:
    """Build the English stopword set once; cached for all later calls."""
    return frozenset(stopwords.words("english"))
# ─── Model loaders ────────────────────────────────────────────────────────────
@st.cache_resource(show_spinner=False)
def load_classifier():
    """
    Load the fine-tuned news classifier pipeline from the HF Hub.

    Replace MODEL_ID with your fine-tuned model pushed to HF Hub in Task 4.
    e.g. "Akilashamnaka12/news_classifier_model"
    If your Space or model is private, add HF_TOKEN as a Secret in Space settings.

    Returns:
        (pipeline, None) on success, or (None, error_message) on failure.
    """
    MODEL_ID = "Akilashamnaka12/news_classifier_model"  # ← swap after Task 4
    try:
        from transformers import pipeline as hf_pipeline
        pipeline_args = dict(
            task="text-classification",
            model=MODEL_ID,
            truncation=True,
            max_length=512,
        )
        if HF_TOKEN:
            pipeline_args["token"] = HF_TOKEN
        return hf_pipeline(**pipeline_args), None
    except Exception as e:
        return None, str(e)
@st.cache_resource(show_spinner=False)  # was applied twice — one cache layer suffices
def load_qa():
    """Load the extractive Q&A tokenizer + model from the HF Hub.

    Returns:
        ((tokenizer, model), None) on success, or (None, error_message)
        when transformers/torch are unavailable or the download fails.
    """
    QA_MODEL = "deepset/roberta-base-squad2"
    try:
        from transformers import AutoTokenizer, AutoModelForQuestionAnswering
        import torch  # noqa: F401 — imported so a missing torch surfaces as a load error
        tok = AutoTokenizer.from_pretrained(QA_MODEL)
        model = AutoModelForQuestionAnswering.from_pretrained(QA_MODEL)
        return (tok, model), None
    except Exception as e:
        return None, str(e)
# ══════════════════════════════════════════════════════════════════════════════
# HERO
# ══════════════════════════════════════════════════════════════════════════════
# Landing banner; rendered as HTML (markup appears stripped from this copy).
st.markdown("""
🔎 Text Analytics · DA3111 - Group 6
News Lens
Classify News articles, interrogate content with Q&A,
and surface editorial insights — all in one unified workspace.
""", unsafe_allow_html=True)
# Three top-level views: classification, Q&A, and insights dashboards.
tab1, tab2, tab3 = st.tabs([
    " 📂 Text Classification ",
    " 💬 Q & A Pipeline ",
    " 📊 Insights ",
])
# ══════════════════════════════════════════════════════════════════════════════
# TAB 1 – TEXT CLASSIFICATION
# ══════════════════════════════════════════════════════════════════════════════
with tab1:
    # Two-pane layout: upload/controls on the left, results on the right.
    left, right = st.columns([1.1, 1], gap="large")
    with left:
        # Section header card (HTML markup appears stripped from this copy).
        st.markdown('Upload
', unsafe_allow_html=True)
        st.markdown("""
Upload your CSV file
Must contain a content column with news excerpts.
""", unsafe_allow_html=True)
        uploaded = st.file_uploader("", type=["csv"], label_visibility="collapsed")
        st.markdown("
", unsafe_allow_html=True)
        if uploaded:
            try:
                uploaded.seek(0)  # reset buffer – important on HF Spaces
                df_raw = pd.read_csv(uploaded)
            except Exception as e:
                st.error(f"Could not parse CSV: {e}")
                st.stop()
            # The classifier reads the `content` column; bail out early if absent.
            if "content" not in df_raw.columns:
                st.error("❌ The uploaded file must have a `content` column.")
            else:
                # Quick shape summary of the uploaded frame.
                st.markdown(f"""
{len(df_raw)}Records
{df_raw.shape[1]}Columns
""", unsafe_allow_html=True)
                st.markdown('Preview
',
                            unsafe_allow_html=True)
                st.dataframe(df_raw.head(5), use_container_width=True, hide_index=True)
                run_btn = st.button("⚡ Run Classification", use_container_width=True)
                if run_btn:
                    with st.spinner("Loading classifier… (first run ~30 s on HF Spaces)"):
                        clf, err = load_classifier()
                    if err:
                        st.error(f"Model load error: {err}")
                    else:
                        df_out = df_raw.copy()
                        pred_labels = []
                        prog = st.progress(0, text="Classifying…")
                        texts = df_out["content"].fillna("").tolist()
                        # Classify row by row so the progress bar stays live.
                        for i, txt in enumerate(texts):
                            # Fall back to raw text when preprocessing empties it.
                            clean = preprocess_text(txt) or txt[:512]
                            try:
                                raw = clf(clean[:512])[0]["label"]
                                label = normalise_label(raw)
                            except Exception:
                                # Per-row inference failures are recorded, not fatal.
                                label = "Unknown"
                            pred_labels.append(label)
                            prog.progress((i + 1) / len(texts),
                                          text=f"Classifying {i+1}/{len(texts)}…")
                        prog.empty()
                        df_out["class"] = pred_labels
                        # Persist so the other tabs (and reruns) can reuse results.
                        st.session_state["df_classified"] = df_out
                        st.session_state["classification_done"] = True
                        st.rerun()
    with right:
        st.markdown('Results
', unsafe_allow_html=True)
        if st.session_state.get("classification_done"):
            df_out = st.session_state["df_classified"]
            counts = df_out["class"].value_counts()
            # One badge "chip" per predicted category, showing its count.
            # `badge` is presumably interpolated in the (stripped) HTML — verify.
            chip_html = ''
            for cat, cnt in counts.items():
                badge = CAT_BADGE.get(cat, "badge-teal")
                chip_html += (f'
{cnt}'
                              f'{cat.replace("_"," ")}
')
            chip_html += "
"
            st.markdown(chip_html, unsafe_allow_html=True)
            # Show only the columns we know exist.
            cols = [c for c in ["content", "class"] if c in df_out.columns]
            st.markdown('', unsafe_allow_html=True)
            st.markdown('
Classified Records
', unsafe_allow_html=True)
            st.dataframe(df_out[cols].head(20), use_container_width=True, hide_index=True,
                         column_config={"content": st.column_config.TextColumn("Content", width="large")})
            st.markdown("
", unsafe_allow_html=True)
            # Full (not truncated) predictions are exported here.
            st.download_button(
                "⬇ Download output.csv",
                data=df_out.to_csv(index=False).encode("utf-8"),
                file_name="output.csv", mime="text/csv",
                use_container_width=True,
            )
        else:
            # Placeholder shown until a classification run completes.
            st.markdown("""
📂
Upload a CSV to see results
Predictions appear here after classification runs.
""", unsafe_allow_html=True)
# ══════════════════════════════════════════════════════════════════════════════
# TAB 2 – Q&A PIPELINE
# ══════════════════════════════════════════════════════════════════════════════
with tab2:
    # Left pane: context passage; right pane: question input + trigger button.
    l2, r2 = st.columns([1, 1], gap="large")
    with l2:
        # Panel headers (HTML markup appears stripped from this copy).
        st.markdown('Context
', unsafe_allow_html=True)
        st.markdown('', unsafe_allow_html=True)
        st.markdown('
Paste a news excerpt
', unsafe_allow_html=True)
        st.markdown('
The Q&A model will read this as its context.
',
                    unsafe_allow_html=True)
        # Pre-fill the context with the first classified article, if available.
        default_ctx = ""
        if st.session_state.get("classification_done"):
            df_c = st.session_state["df_classified"]
            if len(df_c):
                default_ctx = str(df_c["content"].iloc[0])
        context_text = st.text_area("", value=default_ctx, height=260,
                                    placeholder="Paste any news article content here…",
                                    label_visibility="collapsed", key="qa_context")
        st.markdown("
", unsafe_allow_html=True)
    with r2:
        st.markdown('Question
', unsafe_allow_html=True)
        st.markdown('', unsafe_allow_html=True)
        st.markdown('
Ask anything about the article
', unsafe_allow_html=True)
        st.markdown('
The model extracts an answer from the context on the left.
',
                    unsafe_allow_html=True)
        question_text = st.text_area("", height=120,
                                     placeholder="e.g. Who is mentioned in this article?",
                                     label_visibility="collapsed", key="qa_question")
        ask_btn = st.button("🔍 Get Answer", use_container_width=True)
        st.markdown("
", unsafe_allow_html=True)
    if ask_btn:
        # Validate both inputs before loading the (heavy) model.
        if not context_text.strip():
            st.warning("Please paste a news excerpt in the Context panel on the left.")
        elif not question_text.strip():
            st.warning("Please type a question.")
        else:
            with st.spinner("Loading Q&A model (first run ~30 s)"):
                qa, err = load_qa()
            if err:
                st.error(f"Q&A model failed to load: {err}")
            else:
                with st.spinner("Finding the answer..."):
                    try:
                        import torch
                        tok, model = qa
                        q = question_text.strip()
                        ctx = context_text.strip()[:3000]  # cap context size
                        inputs = tok(q, ctx, return_tensors="pt",
                                     truncation=True, max_length=512)
                        with torch.no_grad():
                            outputs = model(**inputs)
                        # Extractive QA: answer span between best start/end logits.
                        start = outputs.start_logits.argmax()
                        end = outputs.end_logits.argmax() + 1
                        answer = tok.convert_tokens_to_string(
                            tok.convert_ids_to_tokens(
                                inputs["input_ids"][0][start:end]
                            )
                        )
                        # Add this line to clean RoBERTa special characters
                        answer = answer.replace("Ġ", " ").strip()
                        # Confidence ≈ mean of the best start/end token probabilities.
                        start_prob = outputs.start_logits.softmax(dim=-1).max().item()
                        end_prob = outputs.end_logits.softmax(dim=-1).max().item()
                        score_pct = int(((start_prob + end_prob) / 2) * 100)
                        st.markdown(f"""
Answer
{answer}
Confidence : {score_pct}%
""", unsafe_allow_html=True)
                    except Exception as e:
                        st.error(f"Inference error: {e}")
    # Suggested-question chips, shown once a dataset has been classified.
    if st.session_state.get("classification_done"):
        st.markdown("---")
        st.markdown('Suggested Questions
', unsafe_allow_html=True)
        c1, c2, c3, c4 = st.columns(4)
        for col, q in zip([c1, c2, c3, c4],
                          ["Who is this article about?", "What event is described?",
                           "Where did this take place?", "What was the outcome?"]):
            # `q` is presumably interpolated in the (stripped) HTML — verify.
            col.markdown(f"""
""", unsafe_allow_html=True)
# ══════════════════════════════════════════════════════════════════════════════
# TAB 3 – INSIGHTS
# ══════════════════════════════════════════════════════════════════════════════
with tab3:
    if not st.session_state.get("classification_done"):
        # Gate: insights need the predictions produced in the first tab.
        st.markdown("""
📊
Insights unlock after classification
Go to Text Classification,
upload a CSV, and run the model first.
""", unsafe_allow_html=True)
    else:
        df_ins = st.session_state["df_classified"]
        counts = df_ins["class"].value_counts()
        total = len(df_ins)
        # KPI row
        kpi_cols = st.columns(5)
        for col, cat in zip(kpi_cols, CATEGORIES):
            cnt = int(counts.get(cat, 0))
            pct = round(cnt / total * 100, 1) if total else 0  # guard empty frame
            # `badge`/`pct` are presumably interpolated in the (stripped) HTML.
            badge = CAT_BADGE.get(cat, "badge-teal")
            col.markdown(f"""
{cat.replace('_',' ')}
{cnt}
{pct}% of total
""", unsafe_allow_html=True)
        st.markdown("---")
        ch1, ch2 = st.columns(2, gap="large")
        with ch1:
            # Donut chart of the category share, themed to the dark UI.
            st.markdown('Category Distribution
', unsafe_allow_html=True)
            fig, ax = plt.subplots(figsize=(5, 4.2), facecolor="#0f172a")
            labels = [c.replace("_", " ") for c in counts.index]
            colors = [CAT_COLOR.get(c, "#00c8b4") for c in counts.index]
            wedges, _, autotexts = ax.pie(
                counts.values, labels=None, autopct="%1.1f%%", colors=colors,
                startangle=120, wedgeprops=dict(width=0.55, edgecolor="#07090f", linewidth=2),
                pctdistance=0.78)
            for at in autotexts:
                at.set_color("#e2e8f0"); at.set_fontsize(8.5); at.set_fontweight("bold")
            ax.legend(wedges, labels, loc="lower center", bbox_to_anchor=(0.5, -0.12),
                      ncol=3, frameon=False, labelcolor="#94a3b8", fontsize=8)
            ax.set_facecolor("#0f172a"); fig.patch.set_facecolor("#0f172a")
            st.pyplot(fig, use_container_width=True); plt.close(fig)
        with ch2:
            # Horizontal bar chart of absolute article counts per category.
            st.markdown('Article Counts by Category
', unsafe_allow_html=True)
            fig2, ax2 = plt.subplots(figsize=(5, 4.2), facecolor="#0f172a")
            bars = ax2.barh([l.replace("_", " ") for l in counts.index], counts.values,
                            color=[CAT_COLOR.get(c, "#00c8b4") for c in counts.index],
                            height=0.55, edgecolor="none")
            ax2.set_facecolor("#0f172a")
            for sp in ["top", "right"]: ax2.spines[sp].set_visible(False)
            for sp in ["left", "bottom"]: ax2.spines[sp].set_color("#1e2d45")
            ax2.tick_params(colors="#64748b", labelsize=8.5)
            # Annotate each bar with its count just past the bar end.
            for bar in bars:
                ax2.text(bar.get_width() + 0.4, bar.get_y() + bar.get_height() / 2,
                         str(int(bar.get_width())), va="center", ha="left",
                         color="#e2e8f0", fontsize=8.5, fontweight="bold")
            fig2.patch.set_facecolor("#0f172a")
            st.pyplot(fig2, use_container_width=True); plt.close(fig2)
        st.markdown("---")
        st.markdown('Word Cloud by Category
', unsafe_allow_html=True)
        selected_cat = st.selectbox("", options=CATEGORIES,
                                    format_func=lambda c: c.replace("_", " "),
                                    label_visibility="collapsed")
        # Word cloud built from (at most) the first 200 articles of the category.
        cat_texts = df_ins[df_ins["class"] == selected_cat]["content"].fillna("").tolist()
        combined = " ".join(preprocess_text(t) for t in cat_texts[:200])
        if combined.strip():
            wc = WordCloud(width=900, height=340, background_color="#0f172a",
                           colormap="cool", max_words=120, collocations=False).generate(combined)
            fig3, ax3 = plt.subplots(figsize=(9, 3.5), facecolor="#0f172a")
            ax3.imshow(wc, interpolation="bilinear"); ax3.axis("off")
            fig3.patch.set_facecolor("#0f172a")
            st.pyplot(fig3, use_container_width=True); plt.close(fig3)
        else:
            st.info(f"No content found for: {selected_cat.replace('_',' ')}")
        st.markdown("---")
        st.markdown(f'Top Unigrams · {selected_cat.replace("_"," ")}
',
                    unsafe_allow_html=True)
        # Frequency bar chart of the 15 most common words in `combined`.
        top_words = Counter(combined.split()).most_common(15)
        if top_words:
            words, freqs = zip(*top_words)
            fig4, ax4 = plt.subplots(figsize=(9, 3), facecolor="#0f172a")
            ax4.bar(words, freqs, color=CAT_COLOR.get(selected_cat, "#00c8b4"), edgecolor="none", width=0.6)
            ax4.set_facecolor("#0f172a")
            for sp in ["top", "right"]: ax4.spines[sp].set_visible(False)
            for sp in ["left", "bottom"]: ax4.spines[sp].set_color("#1e2d45")
            ax4.tick_params(axis="x", colors="#64748b", labelsize=8, rotation=30)
            ax4.tick_params(axis="y", colors="#64748b", labelsize=8)
            fig4.patch.set_facecolor("#0f172a")
            st.pyplot(fig4, use_container_width=True); plt.close(fig4)
# ─── Footer ───────────────────────────────────────────────────────────────────
# Static attribution banner (HTML markup appears stripped from this copy).
st.markdown("""
Built for IN23-S5-DA3111 · Text Analytics Group Project
· Powered by Hugging Face & Streamlit
""", unsafe_allow_html=True)