import streamlit as st import pandas as pd import numpy as np import string, time, re, random from collections import Counter # ───────────────────────────────────────────────────────────────────────────── # PAGE CONFIG # ───────────────────────────────────────────────────────────────────────────── st.set_page_config( page_title="NewsLens AI — Daily Mirror Intelligence", page_icon="◉", layout="wide", initial_sidebar_state="collapsed", ) # ───────────────────────────────────────────────────────────────────────────── # NLTK # ───────────────────────────────────────────────────────────────────────────── import nltk @st.cache_resource(show_spinner=False) def _nltk(): for p in ["punkt","punkt_tab","stopwords","wordnet"]: nltk.download(p, quiet=True) _nltk() from nltk.tokenize import word_tokenize from nltk.corpus import stopwords from nltk.stem import WordNetLemmatizer # ───────────────────────────────────────────────────────────────────────────── # MODELS # ───────────────────────────────────────────────────────────────────────────── @st.cache_resource(show_spinner=False) def load_clf(): from transformers import pipeline return pipeline("text-classification", model="Akilashamnaka12/news_classifier_model", truncation=True, max_length=512) @st.cache_resource(show_spinner=False) def load_qa(): from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline n = "deepset/roberta-base-squad2" return pipeline("question-answering", model=AutoModelForQuestionAnswering.from_pretrained(n), tokenizer=AutoTokenizer.from_pretrained(n)) # ───────────────────────────────────────────────────────────────────────────── # CONSTANTS # ───────────────────────────────────────────────────────────────────────────── LABEL_MAP = {"LABEL_0":"Business","LABEL_1":"Opinion", "LABEL_2":"Political_gossip","LABEL_3":"Sports","LABEL_4":"World_news"} CATS = { "Business": {"icon":"💼","color":"#0071e3","bg":"#f0f7ff","desc":"Finance & Economy"}, "Opinion": 
{"icon":"💬","color":"#34c759","bg":"#f0fdf4","desc":"Views & Editorials"}, "Political_gossip": {"icon":"🏛️", "color":"#ff3b30","bg":"#fff1f2","desc":"Politics & Governance"}, "Sports": {"icon":"⚽","color":"#ff9f0a","bg":"#fff7ed","desc":"Matches & Athletics"}, "World_news": {"icon":"🌍","color":"#5e5ce6","bg":"#f5f3ff","desc":"International Affairs"}, } _sw = set(stopwords.words("english")) _lem = WordNetLemmatizer() def preprocess(t): if not isinstance(t,str) or not t.strip(): return "" t = t.lower().translate(str.maketrans("","",string.punctuation)) tokens = [_lem.lemmatize(w) for w in word_tokenize(t) if w not in _sw and w.isalpha()] return " ".join(tokens) def resolve(r): return LABEL_MAP.get(r, r) def word_cloud_html(text, n=65): words = re.findall(r'\b[a-zA-Z]{4,}\b', text.lower()) stops = {"this","that","with","have","will","from","they","been","were", "their","there","also","which","when","into","than","then","about", "more","over","some","such","just","very","even","only","most","said"} freq = Counter(w for w in words if w not in stops) top = freq.most_common(n) if not top: return "

Not enough text.

" mx = top[0][1] pal = ["#0071e3","#34c759","#ff3b30","#ff9f0a","#5e5ce6","#00c7be","#ff6b9d"] out = "" for word,cnt in top: sz = 0.76 + (cnt/mx)*1.85 col = random.choice(pal) op = 0.45 + (cnt/mx)*0.55 fw = 300 + int((cnt/mx)*500) rot = random.choice([-3,-1,0,0,0,1,3]) out += (f'' f'{word}') return f'
{out}
' # ───────────────────────────────────────────────────────────────────────────── # ═══════════════════════ MASTER CSS ═══════════════════════════════════════════ # ───────────────────────────────────────────────────────────────────────────── st.markdown(""" """, unsafe_allow_html=True) # ───────────────────────────────────────────────────────────────────────────── # STATE # ───────────────────────────────────────────────────────────────────────────── if "page" not in st.session_state: st.session_state["page"] = "classify" # ───────────────────────────────────────────────────────────────────────────── # ══════════════ NAVIGATION ═══════════════════════════════════════════════════ # ───────────────────────────────────────────────────────────────────────────── pg = st.session_state["page"] st.markdown(f""" """, unsafe_allow_html=True) # Nav button row (functional, visually hidden by CSS) c1,c2,c3,_ = st.columns([1,1,1,6]) with c1: if st.button("Classify", key="nb1", use_container_width=True): st.session_state["page"] = "classify"; st.rerun() with c2: if st.button("Q & A", key="nb2", use_container_width=True): st.session_state["page"] = "qa"; st.rerun() with c3: if st.button("Insights", key="nb3", use_container_width=True): st.session_state["page"] = "insights"; st.rerun() # ───────────────────────────────────────────────────────────────────────────── # ══════════════ HERO ═════════════════════════════════════════════════════════ # ───────────────────────────────────────────────────────────────────────────── st.markdown("""
Daily Mirror · AI Intelligence · Assignment 01

News that
understands itself.

Classify articles, extract answers, and surface visual insights from Daily Mirror news — powered by fine-tuned Hugging Face Transformers.

""", unsafe_allow_html=True) # Feature bar st.markdown("""
🧠
DistilBERT Classifier
Fine-tuned on 5 news categories
💬
RoBERTa Q&A
Extractive answers with highlights
📊
Visual Insights
Charts, word clouds, distributions
⚙️
NLP Preprocessing
7-step NLTK pipeline built in

""", unsafe_allow_html=True) # ───────────────────────────────────────────────────────────────────────────── # ══════════════ PAGE: CLASSIFY ═══════════════════════════════════════════════ # ───────────────────────────────────────────────────────────────────────────── if pg == "classify": # ── Section header ────────────────────────────────────────────────────── st.markdown("""
Component 01 · Text Classification

Every article,
perfectly categorised.

Upload your CSV and a fine-tuned DistilBERT model instantly sorts each article into one of five categories — Business, Opinion, Political Gossip, Sports, or World News.

""", unsafe_allow_html=True) st.markdown('
', unsafe_allow_html=True) # Image banner st.markdown("""
Upload · Preprocess · Classify · Download
News Classification at Scale
7-step preprocessing pipeline · Batch inference · CSV output
""", unsafe_allow_html=True) col_L, col_R = st.columns([3, 2], gap="large") # ── LEFT COLUMN ───────────────────────────────────────────────────────── with col_L: # Upload card st.markdown('
', unsafe_allow_html=True) st.markdown('Step 01 — Upload', unsafe_allow_html=True) st.markdown('
Select your CSV file
', unsafe_allow_html=True) st.markdown( '
Requires a ' 'content' ' column. Compatible with the evaluation.csv provided with this assignment.
', unsafe_allow_html=True) uploaded = st.file_uploader("", type=["csv"], key="cls_upload", label_visibility="collapsed") if uploaded: df = pd.read_csv(uploaded) st.success(f"✓   {len(df):,} records loaded  ·  {len(df.columns)} columns") if "content" not in df.columns: st.error(f"Column `content` not found. " f"Found: **{', '.join(df.columns.tolist())}**") st.stop() with st.expander("Preview — first 5 rows"): st.dataframe(df.head(), use_container_width=True) st.markdown("
", unsafe_allow_html=True) if st.button("Run Classification Pipeline", key="run_cls"): with st.status("⚙️ Preprocessing text (7 steps)…", expanded=False) as s: cleaned = df["content"].fillna("").apply(preprocess).tolist() s.update(label="✅ Preprocessing complete", state="complete") with st.spinner("Loading model — first run takes ~30s…"): clf = load_clf() prog = st.progress(0, text="Classifying articles…") preds, confs = [], [] for i in range(0, len(cleaned), 16): batch = [t if t.strip() else " " for t in cleaned[i:i+16]] results = clf(batch, truncation=True, max_length=512) for r in results: preds.append(resolve(r["label"])) confs.append(round(r["score"], 4)) pct = min(int((i+16)/len(cleaned)*100), 100) prog.progress(pct, text=f"Classifying… {pct}%") time.sleep(0.01) prog.empty() out = df.copy() out["class"] = preds out["confidence"] = confs st.session_state["out_df"] = out st.success("✅ Classification complete — results ready below.") st.markdown("
", unsafe_allow_html=True) # Results if "out_df" in st.session_state: out = st.session_state["out_df"] counts = out["class"].value_counts() # Stat tiles st.markdown('
', unsafe_allow_html=True) for label, meta in CATS.items(): n = counts.get(label, 0) st.markdown(f"""
{meta['icon']}
{n}
{label.replace('_',' ')}
""", unsafe_allow_html=True) st.markdown("
", unsafe_allow_html=True) # Tabbed results st.markdown('
', unsafe_allow_html=True) st.markdown('
', unsafe_allow_html=True) st.markdown('Results', unsafe_allow_html=True) st.markdown('
' 'Classified Articles
', unsafe_allow_html=True) all_t, *cat_ts = st.tabs( ["All Articles"] + [f"{CATS[l]['icon']} {l.replace('_',' ')}" for l in CATS] ) with all_t: st.dataframe(out[["content","class","confidence"]], use_container_width=True, height=320) for i, label in enumerate(CATS): with cat_ts[i]: sub = out[out["class"]==label][["content","confidence"]] if sub.empty: st.info(f"No articles classified as **{label.replace('_',' ')}**.") else: st.dataframe(sub, use_container_width=True, height=280) st.markdown("
", unsafe_allow_html=True) avg_c = out["confidence"].mean() if "confidence" in out.columns else 0 hi = (out["confidence"]>=0.9).sum() if "confidence" in out.columns else 0 st.markdown( f'

' f'Average confidence  {avg_c:.1%}' f'  ·  ' f'High confidence ≥ 90%  {hi}' f'

', unsafe_allow_html=True) st.download_button( "⬇ Download output.csv", data=out.to_csv(index=False).encode("utf-8"), file_name="output.csv", mime="text/csv", ) st.markdown("
", unsafe_allow_html=True) else: st.markdown("""
No file selected yet
Upload your evaluation.csv above to begin
""", unsafe_allow_html=True) # ── RIGHT COLUMN ──────────────────────────────────────────────────────── with col_R: st.markdown('
', unsafe_allow_html=True) st.markdown('
', unsafe_allow_html=True) st.markdown('Reference', unsafe_allow_html=True) st.markdown('
' 'Five News Categories
', unsafe_allow_html=True) for label, meta in CATS.items(): st.markdown(f"""
{meta['icon']}
{label.replace('_',' ')}
{meta['desc']}
""", unsafe_allow_html=True) if "out_df" in st.session_state: st.markdown('
', unsafe_allow_html=True) st.markdown('' 'Distribution Chart', unsafe_allow_html=True) st.bar_chart( st.session_state["out_df"]["class"].value_counts(), use_container_width=True, height=190, ) st.markdown("
", unsafe_allow_html=True) st.markdown("
", unsafe_allow_html=True) # /section-inner /section-alt # ───────────────────────────────────────────────────────────────────────────── # ══════════════ PAGE: Q&A ════════════════════════════════════════════════════ # ───────────────────────────────────────────────────────────────────────────── elif pg == "qa": st.markdown("""
Component 02 · Question-Answering

Ask anything.
Get precise answers.

Paste any news article and ask a natural language question. The AI reads the passage and extracts an exact, source-referenced answer — powered by deepset/roberta-base-squad2 (SQuAD 2.0).

""", unsafe_allow_html=True) st.markdown('
', unsafe_allow_html=True) # Image banner st.markdown("""
Extractive QA · RoBERTa · SQuAD 2.0
Intelligence That Reads Closely
Ask in plain language · Get source-highlighted answers
""", unsafe_allow_html=True) col_qa, col_side = st.columns([3, 2], gap="large") with col_qa: st.markdown('
', unsafe_allow_html=True) st.markdown('' 'Input', unsafe_allow_html=True) st.markdown('
' 'Paste article & ask
', unsafe_allow_html=True)
        # The passage can be pasted by hand or picked from classified results.
        src = st.radio("Text Source",
                       ["Paste article text", "Pick from classified results"],
                       horizontal=True, key="qa_src")
        # Stays "" unless one of the branches below supplies a passage.
        context = ""
        if src == "Paste article text":
            context = st.text_area(
                "News Article", height=210,
                placeholder="Paste any Daily Mirror news article here…",
                key="qa_ctx",
            )
        else:
            if "out_df" not in st.session_state:
                st.info("ℹ️ Run the **Classify** pipeline first to use this option.")
            else:
                out_df = st.session_state["out_df"]
                sel_cat = st.selectbox(
                    "Filter Category",
                    ["All"] + [l.replace("_"," ") for l in CATS],
                    key="qa_cat",
                )
                # The selectbox shows labels with spaces while the "class"
                # column stores underscores, so both spellings are matched.
                pool = (out_df if sel_cat == "All"
                        else out_df[out_df["class"].isin(
                            [sel_cat, sel_cat.replace(" ","_")])])
                if not pool.empty:
                    # Options are the DataFrame index labels; each is shown
                    # with the first 72 characters of its article.
                    idx = st.selectbox(
                        "Select Article", pool.index.tolist(),
                        format_func=lambda i: f"#{i} — {str(pool.loc[i,'content'])[:72]}…",
                        key="qa_idx",
                    )
                    row = pool.loc[idx]
                    context = str(row["content"])
                    lbl = row.get("class","")
                    # Neutral fallback styling for a label that is not in CATS.
                    meta = CATS.get(lbl, {"icon":"◉","color":"#1d1d1f","bg":"#f5f5f7"})
                    conf_v = row.get("confidence", None)
                    st.markdown(f"""
{meta['icon']}  {lbl.replace('_',' ')} {f"  ·  {conf_v:.1%}" if conf_v else ""}
{context}
""", unsafe_allow_html=True) st.markdown("
", unsafe_allow_html=True) question = st.text_input( "Your Question", placeholder="e.g. Who announced the new policy?", key="qa_q", ) st.markdown("
", unsafe_allow_html=True) if st.button("Extract Answer", key="run_qa"): if not context.strip(): st.warning("⚠️ Please provide article text.") elif not question.strip(): st.warning("⚠️ Please enter a question.") else: with st.spinner("Reading the passage…"): qa_pipe = load_qa() result = qa_pipe(question=question, context=context) ans = result["answer"] score = result["score"] s, e = result["start"], result["end"] highlighted = ( context[:s] + f'' f'{context[s:e]}' + context[e:] ) st.markdown(f"""
Answer
{ans}
Confidence  {score:.1%}  ·  deepset/roberta-base-squad2
""", unsafe_allow_html=True) with st.expander("View highlighted source context"): st.markdown( f'
{highlighted}
', unsafe_allow_html=True) st.markdown("
", unsafe_allow_html=True) with col_side: st.markdown('
', unsafe_allow_html=True) st.markdown('' 'Tips', unsafe_allow_html=True) st.markdown('
' 'Better questions,
better answers
', unsafe_allow_html=True) for i, (t, d) in enumerate([ ("Who · What · When · Where", "Factual questions extract the sharpest answers"), ("Provide full context", "Longer passages give the model more evidence to work from"), ("Stay specific", "Narrow, focused questions outperform vague ones every time"), ("Full sentence questions", "Questions ending with '?' consistently perform best"), ("Avoid yes / no", "Open-ended questions return richer, more informative answers"), ]): st.markdown(f"""
{i+1:02}
{t}
{d}
""", unsafe_allow_html=True) st.markdown("
", unsafe_allow_html=True) st.markdown('
', unsafe_allow_html=True) st.markdown('
', unsafe_allow_html=True) st.markdown('Model', unsafe_allow_html=True) for k, v in [ ("Architecture", "RoBERTa Base"), ("Training Data", "SQuAD 2.0"), ("Task Type", "Extractive Q&A"), ("Provider", "deepset · Hugging Face"), ]: st.markdown(f"""
{k} {v}
""", unsafe_allow_html=True) st.markdown("
", unsafe_allow_html=True) st.markdown("
", unsafe_allow_html=True) # ───────────────────────────────────────────────────────────────────────────── # ══════════════ PAGE: INSIGHTS ═══════════════════════════════════════════════ # ───────────────────────────────────────────────────────────────────────────── elif pg == "insights": # Dark hero section st.markdown("""
Component 03 · Visual Insights

Clarity from
every angle.

Distribution breakdowns, word clouds, confidence analysis, and article spotlights — everything you need to understand your classified corpus at a glance.


""", unsafe_allow_html=True) if "out_df" not in st.session_state: st.markdown("""
No classified data yet
Run the Classify pipeline first, then return here for visual insights.
""", unsafe_allow_html=True) st.stop() out_df = st.session_state["out_df"] total = len(out_df) counts = out_df["class"].value_counts() # ── Section A: Distribution ────────────────────────────────────────── st.markdown("""
01 · Distribution

How your corpus breaks down.

""", unsafe_allow_html=True) st.markdown('
', unsafe_allow_html=True) col_da, col_db = st.columns([2, 3], gap="large") with col_da: st.markdown('
', unsafe_allow_html=True) st.markdown('Breakdown', unsafe_allow_html=True) for label, meta in CATS.items(): n = counts.get(label, 0) pct = n / total if total > 0 else 0 st.markdown(f"""
{meta['icon']}
{label.replace('_',' ')} {n} · {pct:.0%}
""", unsafe_allow_html=True) st.markdown("
", unsafe_allow_html=True)
    # Right-hand column: bar chart of articles per category.
    with col_db:
        try:
            # Plotly is optional; without it the except branch draws a plain chart.
            import plotly.express as px
            cdf = counts.reset_index()
            cdf.columns = ["Category","Count"]
            # Human-readable axis labels (underscores -> spaces).
            cdf["Label"] = cdf["Category"].str.replace("_"," ")
            # Colour each bar with its category's accent colour from CATS.
            cmap = {k: CATS[k]["color"] for k in CATS}
            fig = px.bar(cdf, x="Label", y="Count", color="Category",
                         color_discrete_map=cmap, text="Count",
                         labels={"Label":"","Count":""})
            fig.update_layout(
                plot_bgcolor="white",paper_bgcolor="white",
                font=dict(family="-apple-system,BlinkMacSystemFont,'SF Pro Text',sans-serif",
                          size=12,color="#1d1d1f"),
                showlegend=False,margin=dict(l=0,r=0,t=10,b=0),
                xaxis=dict(showgrid=False,color="#86868b",
                           tickfont=dict(size=11,color="#6e6e73")),
                yaxis=dict(gridcolor="#f5f5f7",color="#86868b"),
            )
            # NOTE(review): marker_corner_radius presumably needs a recent
            # Plotly release — confirm the minimum supported version.
            fig.update_traces(textposition="outside",
                              textfont=dict(size=12,color="#1d1d1f"),
                              marker_line_width=0, marker_corner_radius=6)
            st.plotly_chart(fig, use_container_width=True)
        except ImportError:
            st.bar_chart(counts, use_container_width=True, height=270)
    st.markdown("
", unsafe_allow_html=True) # ── Section B: Word Cloud ──────────────────────────────────────────── st.markdown("""
02 · Word Cloud

The language of the news.

""", unsafe_allow_html=True) st.markdown('
', unsafe_allow_html=True) col_wl, col_wr = st.columns([2, 3], gap="large") with col_wl: st.markdown('
', unsafe_allow_html=True) st.markdown('Configure', unsafe_allow_html=True) st.markdown('
' 'Build word cloud
', unsafe_allow_html=True) wc_sel = st.selectbox("Category Filter", ["All"]+[l.replace("_"," ") for l in CATS], key="wc_cat") wc_n = st.slider("Number of Words", 20, 120, 70, key="wc_n") st.markdown("
", unsafe_allow_html=True) if st.button("Generate Word Cloud", key="run_wc"): lbl = wc_sel.replace(" ","_") if wc_sel != "All" else "All" corpus = (" ".join(out_df["content"].fillna("").tolist()) if lbl == "All" else " ".join( out_df[out_df["class"].isin([lbl,wc_sel])]["content"] .fillna("").tolist())) try: from wordcloud import WordCloud import matplotlib.pyplot as plt import matplotlib.colors as mcolors accent = CATS.get(lbl,{}).get("color","#0071e3") processed = preprocess(corpus) def _cf(*a,**k): r,g,b = mcolors.to_rgb(accent) f = random.uniform(.45,1.) return f"rgb({int(r*f*255)},{int(g*f*255)},{int(b*f*255)})" wc = WordCloud(width=900,height=360, background_color="white", color_func=_cf,max_words=wc_n, prefer_horizontal=.82).generate(processed) fig_wc,ax = plt.subplots(figsize=(12,4)) ax.imshow(wc,interpolation="bilinear"); ax.axis("off") fig_wc.patch.set_facecolor("white"); plt.tight_layout(pad=0) st.session_state["wc_fig"] = fig_wc st.session_state["wc_html"] = None except ImportError: st.session_state["wc_html"] = word_cloud_html(preprocess(corpus), wc_n) st.session_state["wc_fig"] = None st.markdown("
", unsafe_allow_html=True) with col_wr: st.markdown('
', unsafe_allow_html=True) st.markdown('' 'Word Frequency Canvas', unsafe_allow_html=True) if st.session_state.get("wc_fig"): import matplotlib.pyplot as plt st.pyplot(st.session_state["wc_fig"]) elif st.session_state.get("wc_html"): st.markdown(st.session_state["wc_html"], unsafe_allow_html=True) else: st.markdown("""
Configure and generate your word cloud
""", unsafe_allow_html=True) st.markdown("
", unsafe_allow_html=True) st.markdown("
", unsafe_allow_html=True) # ── Section C: Confidence ──────────────────────────────────────────── if "confidence" in out_df.columns: st.markdown("""
03 · Confidence Analysis

How certain is the model?

""", unsafe_allow_html=True) st.markdown('
', unsafe_allow_html=True) c1,c2,c3 = st.columns(3, gap="large") for col,(val,lbl,color) in zip([c1,c2,c3],[ (f"{out_df['confidence'].mean():.1%}","Average Confidence","#0071e3"), (str((out_df["confidence"]>=.9).sum()),"High Confidence ≥ 90%","#34c759"), (str((out_df["confidence"]<.7).sum()), "Low Confidence < 70%", "#ff3b30"), ]): with col: st.markdown(f"""
{val}
{lbl}
""", unsafe_allow_html=True) st.markdown("
", unsafe_allow_html=True)
        # Histogram of confidence scores, stacked/coloured by predicted class.
        try:
            # Plotly is optional; the except branch shows a summary table instead.
            import plotly.express as px
            cmap = {k: CATS[k]["color"] for k in CATS}
            fig2 = px.histogram(out_df,x="confidence",color="class",
                                nbins=25,color_discrete_map=cmap,
                                labels={"confidence":"Confidence Score","class":""})
            fig2.update_layout(
                plot_bgcolor="white",paper_bgcolor="white",
                font=dict(family="-apple-system,BlinkMacSystemFont,'SF Pro Text',sans-serif",
                          size=11,color="#1d1d1f"),
                margin=dict(l=0,r=0,t=10,b=0),bargap=.06,
                xaxis=dict(showgrid=False,color="#86868b"),
                yaxis=dict(gridcolor="#f5f5f7",color="#86868b"),
                legend=dict(bgcolor="white",bordercolor="#e2e2e7",borderwidth=1,
                            font=dict(size=11)),
            )
            st.plotly_chart(fig2, use_container_width=True)
        except ImportError:
            # Per-class descriptive statistics of the confidence column.
            st.dataframe(out_df.groupby("class")["confidence"].describe().round(3),
                         use_container_width=True)
        st.markdown("
", unsafe_allow_html=True) # ── Section D: Article Length ──────────────────────────────────────── st.markdown("""
04 · Article Length

Word count by category.

""", unsafe_allow_html=True) st.markdown('
', unsafe_allow_html=True) out_df["word_count"] = out_df["content"].fillna("").apply(lambda x: len(x.split())) try: import plotly.express as px cmap = {k: CATS[k]["color"] for k in CATS} fig3 = px.box(out_df,x="class",y="word_count",color="class", color_discrete_map=cmap,points="outliers", labels={"class":"","word_count":"Word Count"}) fig3.update_layout( plot_bgcolor="white",paper_bgcolor="white", font=dict(family="-apple-system,BlinkMacSystemFont,'SF Pro Text',sans-serif", size=11,color="#1d1d1f"), showlegend=False,margin=dict(l=0,r=0,t=10,b=0), xaxis=dict(showgrid=False,color="#86868b", tickfont=dict(size=11,color="#6e6e73")), yaxis=dict(gridcolor="#f5f5f7",color="#86868b"), ) st.plotly_chart(fig3, use_container_width=True) except ImportError: st.dataframe(out_df.groupby("class")["word_count"].describe().round(1), use_container_width=True) st.markdown("
", unsafe_allow_html=True) # ── Section E: Spotlight ───────────────────────────────────────────── st.markdown("""
05 · Article Spotlight

Discover a random article.

""", unsafe_allow_html=True) st.markdown('
', unsafe_allow_html=True)
    # Each click draws one random classified article and renders it as a card.
    if st.button("Shuffle Article", key="spot"):
        row = out_df.sample(1).iloc[0]
        label = row.get("class","")
        # Neutral fallback styling for a label that is not in CATS.
        meta = CATS.get(label, {"icon":"◉","color":"#1d1d1f","bg":"#f5f5f7"})
        # None when the row carries no "confidence" value.
        conf_v = row.get("confidence", None)
        text = str(row["content"])
        # Whitespace-delimited word count shown on the card.
        wc_c = len(text.split())
        st.markdown(f"""
{meta['icon']}  {label.replace('_',' ')} {f'{conf_v:.1%} confidence' if conf_v else ""} {wc_c} words
{text[:640]}{"…" if len(text)>640 else ""}
""", unsafe_allow_html=True) st.markdown("
", unsafe_allow_html=True) # ───────────────────────────────────────────────────────────────────────────── # ══════════════ FOOTER ═══════════════════════════════════════════════════════ # ───────────────────────────────────────────────────────────────────────────── st.markdown(""" """, unsafe_allow_html=True)