| import re |
|
|
| import pandas as pd |
| import streamlit as st |
| from transformers import pipeline |
| from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering |
| import torch |
|
|
# Page-level settings: tab title/icon, wide layout, sidebar collapsed on load.
st.set_page_config(
    page_title="News Intelligence Studio",
    page_icon="📰",
    layout="wide",
    initial_sidebar_state="collapsed",
)


# Hugging Face model id used for the text-classification pipeline.
MODEL_NAME = "Akilashamnaka12/news-classifier-model"
# Model id loaded as tokenizer + question-answering model for the QA panel.
QA_MODEL = "distilbert-base-cased-distilled-squad"
# Upper bound on how many rows are tried as QA contexts for one question.
MAX_CONTEXT_ROWS = 8
|
|
|
|
def inject_styles() -> None:
    """Inject the app's custom stylesheet into the page.

    Emits a single ``<style>`` element through ``st.markdown`` with
    ``unsafe_allow_html=True``. The rules define the colour variables, hide
    the sidebar, style the file-uploader dropzone, and provide the classes
    used by the HTML snippets rendered elsewhere in this script (``nav``,
    ``hero``, ``chip``, ``panel``, ``soft-panel``, ``section-head``,
    ``metric-grid``/``metric-card``, ``story-card``, ``cta``, ``foot``),
    plus responsive overrides at 900px and 640px.
    """
    st.markdown(
        """
        <style>
            :root {
                --bg: #f5f4ef;
                --paper: #fbfaf6;
                --ink: #1a1a18;
                --muted: #6d6a63;
                --line: rgba(26,26,24,0.12);
                --soft: #ece8db;
                --accent: #121212;
                --gradient-a: rgba(127, 177, 183, 0.65);
                --gradient-b: rgba(30, 44, 58, 0.35);
                --gradient-c: rgba(220, 191, 151, 0.35);
            }

            .stApp {
                background: var(--bg);
                color: var(--ink);
            }

            .block-container {
                padding-top: 1.2rem;
                padding-bottom: 4rem;
                max-width: 1240px;
            }

            header[data-testid="stHeader"] {
                background: transparent;
            }

            [data-testid="stSidebar"] {
                display: none;
            }

            div[data-testid="stFileUploaderDropzone"] {
                background: rgba(255,255,255,0.55);
                border: 1px dashed rgba(26,26,24,0.18);
                border-radius: 22px;
                min-height: 220px;
            }

            .nav {
                display: flex;
                justify-content: space-between;
                align-items: center;
                padding: 0.4rem 0 1rem 0;
                border-bottom: 1px solid var(--line);
                margin-bottom: 1rem;
                font-size: 0.92rem;
                color: var(--muted);
            }

            .nav-links {
                display: flex;
                gap: 1.4rem;
                flex-wrap: wrap;
            }

            .brand {
                font-weight: 700;
                letter-spacing: 0.02em;
                color: var(--ink);
            }

            .hero {
                position: relative;
                overflow: hidden;
                border-radius: 30px;
                min-height: 460px;
                padding: 3.5rem 3rem;
                margin: 1rem 0 2rem 0;
                background:
                    radial-gradient(circle at 20% 20%, rgba(255,255,255,0.62), transparent 32%),
                    radial-gradient(circle at 75% 28%, rgba(11, 27, 42, 0.35), transparent 18%),
                    radial-gradient(circle at 82% 82%, rgba(232, 196, 154, 0.5), transparent 20%),
                    linear-gradient(135deg, var(--gradient-a), var(--gradient-b) 48%, var(--gradient-c));
                box-shadow: 0 10px 30px rgba(0,0,0,0.05);
            }

            .eyebrow {
                text-transform: uppercase;
                font-size: 0.78rem;
                letter-spacing: 0.18em;
                color: rgba(255,255,255,0.82);
                margin-bottom: 1rem;
            }

            .hero h1,
            .section-title {
                font-family: Georgia, 'Times New Roman', serif;
                font-weight: 400;
                letter-spacing: -0.02em;
            }

            .hero h1 {
                font-size: clamp(3rem, 7vw, 5.4rem);
                line-height: 0.92;
                color: #fffdf8;
                max-width: 700px;
                margin: 0;
            }

            .hero p {
                max-width: 520px;
                color: rgba(255,255,255,0.84);
                font-size: 1.03rem;
                line-height: 1.7;
                margin-top: 1rem;
            }

            .hero-chip-row {
                display: flex;
                gap: 0.7rem;
                flex-wrap: wrap;
                margin-top: 1.6rem;
            }

            .chip {
                border: 1px solid rgba(255,255,255,0.24);
                background: rgba(255,255,255,0.12);
                color: white;
                padding: 0.62rem 0.9rem;
                border-radius: 999px;
                font-size: 0.84rem;
                backdrop-filter: blur(6px);
            }

            .panel,
            .soft-panel,
            .metric-card,
            .story-card {
                border-radius: 26px;
                overflow: hidden;
            }

            .panel {
                background: rgba(255,255,255,0.5);
                border: 1px solid rgba(26,26,24,0.08);
                padding: 1.25rem;
            }

            .soft-panel {
                background: #e7e1cf;
                border: 1px solid rgba(26,26,24,0.06);
                padding: 1.5rem;
            }

            .section-head {
                display: flex;
                justify-content: space-between;
                gap: 1rem;
                align-items: end;
                margin: 2.5rem 0 1.2rem 0;
            }

            .section-title {
                font-size: clamp(1.8rem, 3vw, 3rem);
                line-height: 1;
                margin: 0;
            }

            .section-copy {
                max-width: 520px;
                color: var(--muted);
                font-size: 0.96rem;
                line-height: 1.7;
            }

            .metric-grid {
                display: grid;
                grid-template-columns: repeat(4, minmax(0,1fr));
                gap: 1rem;
                margin-top: 1rem;
            }

            .metric-card {
                background: #f7f5ee;
                border: 1px solid rgba(26,26,24,0.08);
                padding: 1rem 1.1rem;
                min-height: 130px;
            }

            .metric-label {
                color: var(--muted);
                font-size: 0.84rem;
                margin-bottom: 1rem;
            }

            .metric-value {
                font-size: 2rem;
                line-height: 1;
                margin-bottom: 0.35rem;
                font-weight: 600;
            }

            .story-card {
                position: relative;
                min-height: 170px;
                padding: 1.2rem;
                color: #fffaf2;
                background:
                    linear-gradient(180deg, rgba(0,0,0,0.05), rgba(0,0,0,0.55)),
                    linear-gradient(135deg, rgba(48,93,112,0.8), rgba(24,24,24,0.75), rgba(176,103,77,0.65));
                border: 1px solid rgba(255,255,255,0.08);
            }

            .story-card h4 {
                margin: 0;
                font-size: 1.2rem;
                line-height: 1.2;
                font-family: Georgia, 'Times New Roman', serif;
                font-weight: 400;
            }

            .story-card p {
                font-size: 0.9rem;
                color: rgba(255,255,255,0.82);
                line-height: 1.6;
                margin-top: 0.8rem;
            }

            .cta {
                text-align: center;
                padding: 3rem 2rem;
                margin-top: 2rem;
                border-radius: 28px;
                background: #dfe7d7;
                border: 1px solid rgba(26,26,24,0.08);
            }

            .cta h2 {
                font-family: Georgia, 'Times New Roman', serif;
                font-size: clamp(2rem, 4vw, 3.2rem);
                font-weight: 400;
                margin: 0 0 0.7rem 0;
            }

            .foot {
                border-top: 1px solid var(--line);
                margin-top: 2.5rem;
                padding-top: 1.2rem;
                color: var(--muted);
                font-size: 0.88rem;
            }

            label, .stRadio label, .stCaption, [data-testid="stCaptionContainer"] {
                color: #4f4b45 !important;
                opacity: 1 !important;
            }

            [data-testid="stMarkdownContainer"] p {
                color: #4f4b45;
            }

            div[role="radiogroup"] label {
                color: #2b2925 !important;
                font-weight: 500;
            }

            @media (max-width: 900px) {
                .hero {
                    min-height: 360px;
                    padding: 2rem 1.4rem;
                }
                .metric-grid {
                    grid-template-columns: repeat(2, minmax(0,1fr));
                }
            }

            @media (max-width: 640px) {
                .metric-grid {
                    grid-template-columns: 1fr;
                }
                .section-head {
                    flex-direction: column;
                    align-items: start;
                }
            }
        </style>
        """,
        unsafe_allow_html=True,
    )
|
|
@st.cache_resource(show_spinner=False)
def load_pipelines():
    """Load the news classifier and build the extractive QA callable.

    Decorated with ``st.cache_resource`` so the loaded models are reused
    instead of being reloaded every time the script runs.

    Returns:
        A ``(classifier, qa_fn)`` tuple. ``classifier`` is a Hugging Face
        ``text-classification`` pipeline for ``MODEL_NAME``. ``qa_fn`` is a
        callable ``qa_fn(question, context)`` returning a dict with the keys
        ``"answer"`` (str) and ``"score"`` (float between 0 and 1).
    """
    classifier = pipeline(
        "text-classification",
        model=MODEL_NAME,
        tokenizer=MODEL_NAME,
        truncation=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(QA_MODEL)
    model = AutoModelForQuestionAnswering.from_pretrained(QA_MODEL)

    # Longest answer span (in tokens) the search below will consider.
    max_answer_tokens = 30

    def qa_fn(question, context):
        """Return the most probable answer span for *question* in *context*.

        The span is chosen jointly over start and end positions, restricted
        to context tokens with ``start <= end``. Picking the two positions
        independently can yield an end before the start (empty answer) or a
        span inside the question.
        """
        inputs = tokenizer(
            question,
            context,
            return_tensors="pt",
            truncation=True,
            max_length=512,
        )
        with torch.no_grad():
            outputs = model(**inputs)

        # Sequence id 1 marks context tokens; the question (id 0) and
        # special tokens (None) must never be part of the answer.
        is_context = torch.tensor([sid == 1 for sid in inputs.sequence_ids(0)])
        if not bool(is_context.any()):
            return {"answer": "", "score": 0.0}

        neg_inf = float("-inf")
        start_probs = outputs.start_logits[0].masked_fill(~is_context, neg_inf).softmax(dim=-1)
        end_probs = outputs.end_logits[0].masked_fill(~is_context, neg_inf).softmax(dim=-1)

        # Joint probability of each (start, end) pair, keeping only pairs
        # with start <= end < start + max_answer_tokens.
        span_probs = torch.outer(start_probs, end_probs)
        span_probs = torch.tril(torch.triu(span_probs), diagonal=max_answer_tokens - 1)
        start, end = divmod(int(span_probs.argmax()), span_probs.size(1))

        answer_ids = inputs["input_ids"][0][start : end + 1].tolist()
        answer = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(answer_ids)
        )
        # The score reflects both ends of the span so that answers from
        # different context rows can be compared.
        return {"answer": answer, "score": float(span_probs[start, end])}

    return classifier, qa_fn
|
|
|
|
def preprocess_text(text: str) -> str:
    """Normalise one article excerpt before it is sent to the model.

    The value is coerced to ``str``, URL-like tokens (``http...`` and
    ``www....``) are replaced by a space, and every run of whitespace is
    collapsed to a single space with the ends trimmed.
    """
    without_links = re.sub(r"http\S+|www\.\S+", " ", str(text))
    collapsed = re.sub(r"\s+", " ", without_links)
    return collapsed.strip()
|
|
|
|
def get_text_column(df: pd.DataFrame) -> str:
    """Return the label of the column that holds the article text.

    The match against ``content`` ignores case and surrounding whitespace,
    and column labels that are not strings (for example integer labels) are
    tolerated rather than causing an ``AttributeError``.

    Args:
        df: The uploaded data.

    Returns:
        The original column label, exactly as it appears in ``df.columns``.

    Raises:
        ValueError: If no column is named ``content``.
    """
    # Map the normalised name back to the original label so callers can
    # index the frame with it unchanged.
    normalised = {str(label).strip().lower(): label for label in df.columns}
    if "content" in normalised:
        return normalised["content"]
    raise ValueError("CSV must contain a 'content' column.")
|
|
|
|
def predict_classes(df: pd.DataFrame, text_col: str, classifier):
    """Classify every row of ``df[text_col]``.

    Missing values become empty strings and each value goes through
    ``preprocess_text`` before the batch is handed to ``classifier``.

    Returns:
        A ``(texts, labels, scores)`` tuple of equal-length lists: the
        cleaned texts, the predicted label per row (``"Unknown"`` when the
        output has no label) and the score rounded to four decimals.
    """
    cleaned = [preprocess_text(value) for value in df[text_col].fillna("").astype(str)]
    predictions = classifier(cleaned, batch_size=16)

    labels = []
    scores = []
    for prediction in predictions:
        labels.append(prediction.get("label", "Unknown"))
        scores.append(round(float(prediction.get("score", 0.0)), 4))
    return cleaned, labels, scores
|
|
|
|
def dataframe_to_csv_bytes(df: pd.DataFrame) -> bytes:
    """Serialise *df* to CSV without the index and return UTF-8 bytes."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode("utf-8")
|
|
|
|
# Emit the custom stylesheet before any page content is rendered.
inject_styles()


# Load both models up front; the spinner is visible while load_pipelines() runs.
# Note: qa_pipeline is the plain callable returned by load_pipelines(), invoked
# below as qa_pipeline(question=..., context=...).
with st.spinner("Loading models..."):
    classifier, qa_pipeline = load_pipelines()
|
|
# Top navigation bar (static labels only; the spans are not links).
st.markdown(
    """
    <div class="nav">
        <div class="brand">News Intelligence Studio</div>
        <div class="nav-links">
            <span>Classification</span>
            <span>Question Answering</span>
            <span>Insights</span>
            <span>Local Streamlit</span>
        </div>
    </div>
    """,
    unsafe_allow_html=True,
)


# Hero banner with the headline, a short description and three info chips.
st.markdown(
    """
    <section class="hero">
        <div class="eyebrow">Powered by Hugging Face</div>
        <h1>Intelligence that reads your news operations</h1>
        <p>
            Upload a CSV, classify every news excerpt with your fine-tuned model,
            explore the predicted distribution, and ask grounded questions from the
            article content in one polished Streamlit workspace.
        </p>
        <div class="hero-chip-row">
            <div class="chip">Model: Akilashamnaka12/news-classifier-model</div>
            <div class="chip">CSV in → output.csv out</div>
            <div class="chip">Local-first Streamlit experience</div>
        </div>
    </section>
    """,
    unsafe_allow_html=True,
)
|
|
# Two-column workspace: upload on the left, question answering on the right.
left, right = st.columns([1.15, 0.85], gap="large")


# Defaults for the values the widgets below assign.
uploaded_file = None
question = ""
context_mode = "Use first few records"
# NOTE(review): answer_box is not referenced anywhere else in the visible
# script; the placeholder it reserves in the right column stays empty.
answer_box = right.empty()


with left:
    st.markdown('<div class="panel">', unsafe_allow_html=True)
    uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])
    st.caption("Expected column: content")
    st.markdown("</div>", unsafe_allow_html=True)
|
|
# State shared with the QA panel and the results section further down.
result_df = None
filtered_df = None
selected_class = "All"
text_col = None


@st.cache_data(show_spinner=False)
def _classify_upload(file_bytes: bytes):
    """Parse an uploaded CSV and classify every row.

    Cached on the raw file bytes: Streamlit re-executes the script on every
    widget interaction, and without the cache each keystroke or filter
    change would re-run model inference over the whole file.

    Args:
        file_bytes: Raw content of the uploaded CSV.

    Returns:
        ``(classified_df, text_column)``. The frame holds the original
        columns with the text column replaced by its cleaned text, plus
        ``class`` and ``confidence`` columns.

    Raises:
        ValueError: If the CSV has no ``content`` column.
    """
    import io

    raw_df = pd.read_csv(io.BytesIO(file_bytes))
    column = get_text_column(raw_df)
    texts, labels, scores = predict_classes(raw_df, column, classifier)

    classified = raw_df.copy()
    classified[column] = texts
    classified["class"] = labels
    classified["confidence"] = scores
    return classified, column


if uploaded_file is not None:
    try:
        # getvalue() returns the full content regardless of the current
        # read position of the upload object.
        result_df, text_col = _classify_upload(uploaded_file.getvalue())

        classes = sorted(result_df["class"].dropna().unique().tolist())
        selected_class = left.selectbox("Filter predictions", ["All"] + classes, index=0)

        filtered_df = (
            result_df
            if selected_class == "All"
            else result_df[result_df["class"] == selected_class]
        )
    except Exception as exc:
        # Top-level UI boundary: report the failure in the page instead of
        # crashing the script run.
        st.error(f"Could not process the file: {exc}")
|
|
# Question-answering panel: each candidate row is tried as the context and
# the highest-scoring non-empty answer is shown.
with right:
    st.markdown('<div class="panel">', unsafe_allow_html=True)
    st.subheader("Ask questions from the uploaded news")
    question = st.text_input("Type your question")
    st.caption("Ask things like: What happened in sports? What caused flooding in Colombo?")
    context_mode = st.radio(
        "Context source",
        ["Use first few records", "Use selected class only"],
        horizontal=True,
    )

    if uploaded_file is not None and result_df is not None and question:
        try:
            qa_source_df = result_df.copy()

            # Narrow the contexts to the class chosen in the left-hand
            # filter, unless that filter is still on "All".
            if context_mode == "Use selected class only" and selected_class not in (None, "All"):
                qa_source_df = qa_source_df[qa_source_df["class"] == selected_class]

            candidate_rows = qa_source_df[text_col].fillna("").astype(str).head(MAX_CONTEXT_ROWS).tolist()
            candidate_rows = [row for row in candidate_rows if row.strip()]

            if not candidate_rows:
                st.warning("No usable context found.")
            else:
                best_answer = None
                best_score = -1.0
                best_context = ""

                for row_text in candidate_rows:
                    result = qa_pipeline(
                        question=question,
                        context=row_text
                    )
                    answer = str(result.get("answer") or "")
                    # A blank span is not an answer; do not let it beat a
                    # real answer from another row.
                    if not answer.strip():
                        continue

                    score = float(result.get("score", 0.0))
                    if score > best_score:
                        best_score = score
                        best_answer = answer
                        best_context = row_text

                st.markdown("---")
                st.markdown("### Answer")
                if best_answer is None:
                    st.warning("No answer found.")
                else:
                    st.success(best_answer)
                    st.caption(f"Confidence: {best_score:.4f}")

                    with st.expander("Show context used"):
                        st.write(best_context)

        except Exception as e:
            # Top-level UI boundary: show the failure in the panel.
            st.error(f"Error generating answer: {e}")

    st.markdown("</div>", unsafe_allow_html=True)
|
|
# Results area: shown once a file has been classified; otherwise a
# call-to-action banner is rendered instead.
if result_df is not None:
    st.markdown(
        """
        <div class="section-head">
            <div>
                <div class="section-title">Continuously test and explore output</div>
            </div>
            <div class="section-copy">
                Once a file is uploaded, the app predicts a class for each row,
                adds a confidence score, and prepares an exportable output.csv.
            </div>
        </div>
        """,
        unsafe_allow_html=True,
    )

    # Summary values for the metric cards; placeholders are used when the
    # frame has no rows.
    top_class = result_df["class"].mode().iat[0] if not result_df.empty else "N/A"
    avg_conf = f"{result_df['confidence'].mean():.2%}" if not result_df.empty else "0%"

    # NOTE(review): top_class comes from the model's labels and is inserted
    # into HTML unescaped — confirm the labels are plain text.
    st.markdown(
        f"""
        <div class="metric-grid">
            <div class="metric-card">
                <div class="metric-label">Uploaded records</div>
                <div class="metric-value">{len(result_df)}</div>
                <div>Rows processed from your CSV</div>
            </div>
            <div class="metric-card">
                <div class="metric-label">Detected classes</div>
                <div class="metric-value">{result_df['class'].nunique()}</div>
                <div>Unique labels predicted by the model</div>
            </div>
            <div class="metric-card">
                <div class="metric-label">Top predicted class</div>
                <div class="metric-value">{top_class}</div>
                <div>Most frequent label in the batch</div>
            </div>
            <div class="metric-card">
                <div class="metric-label">Average confidence</div>
                <div class="metric-value">{avg_conf}</div>
                <div>Mean prediction confidence score</div>
            </div>
        </div>
        """,
        unsafe_allow_html=True,
    )

    col_a, col_b = st.columns([1.05, 0.95], gap="large")

    # Left: bar chart of how many rows fall into each predicted class.
    with col_a:
        st.markdown('<div class="soft-panel">', unsafe_allow_html=True)
        st.subheader("Predicted class distribution")
        st.bar_chart(result_df["class"].value_counts())
        st.markdown("</div>", unsafe_allow_html=True)

    # Right: download of the full (unfiltered) classified frame.
    with col_b:
        st.markdown('<div class="soft-panel">', unsafe_allow_html=True)
        st.subheader("Download ready")
        st.write(
            "Your exported file includes the original columns, the predicted class, and the confidence score."
        )
        st.download_button(
            label="Download output.csv",
            data=dataframe_to_csv_bytes(result_df),
            file_name="output.csv",
            mime="text/csv",
            use_container_width=True,
        )
        st.markdown("</div>", unsafe_allow_html=True)

    st.markdown(
        """
        <div class="section-head">
            <div>
                <div class="section-title">Built for the real world</div>
            </div>
            <div class="section-copy">
                Below are presentation-friendly feature cards. They help your app
                feel more like a polished product during the live demo.
            </div>
        </div>
        """,
        unsafe_allow_html=True,
    )

    # Four static feature cards, one per column: (title, description).
    story_cols = st.columns(4, gap="small")
    stories = [
        (
            "Scalable batch classification",
            "Upload larger CSV files and label each record in a single flow.",
        ),
        (
            "Grounded question answering",
            "Ask focused questions using article content as context.",
        ),
        (
            "Confidence-aware review",
            "Inspect how certain the model is before exporting the final sheet.",
        ),
        (
            "Presentation-ready interface",
            "A clean editorial design that feels stronger than a default dashboard.",
        ),
    ]

    for col, (title, copy) in zip(story_cols, stories):
        with col:
            st.markdown(
                f'<div class="story-card"><h4>{title}</h4><p>{copy}</p></div>',
                unsafe_allow_html=True,
            )

    st.markdown(
        """
        <div class="section-head">
            <div>
                <div class="section-title">Records</div>
            </div>
            <div class="section-copy">
                Review the classified rows before downloading the final output.
            </div>
        </div>
        """,
        unsafe_allow_html=True,
    )
    # The table honours the class filter chosen in the left column.
    st.dataframe(filtered_df, use_container_width=True, height=360)
else:
    st.markdown(
        """
        <div class="cta">
            <h2>Intelligence that runs your news workflow</h2>
            <p>Upload a CSV to activate classification, analytics, downloadable results, and grounded Q&A.</p>
        </div>
        """,
        unsafe_allow_html=True,
    )
|
|
# Footer with the local run command and a reminder about the input format.
st.markdown(
    """
    <div class="foot">
        Local run command: <code>python -m streamlit run app.py</code><br>
        Make sure your CSV contains a <code>content</code> column, and keep the preprocessing function aligned with Section 01.
    </div>
    """,
    unsafe_allow_html=True,
)