# Hugging Face Space: Akilashamnaka12/news_classification_UI (status: Sleeping)
# File size: 19,323 Bytes

import re

import pandas as pd
import streamlit as st
from transformers import pipeline
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Streamlit page settings: wide layout with the sidebar collapsed on load
# (the stylesheet injected by inject_styles() also hides the sidebar).
st.set_page_config(
    page_title="News Intelligence Studio",
    page_icon="📰",
    layout="wide",
    initial_sidebar_state="collapsed",
)

# Model identifier passed to the text-classification pipeline.
MODEL_NAME = "Akilashamnaka12/news-classifier-model"
# Checkpoint name used for the question-answering tokenizer and model.
QA_MODEL = "distilbert-base-cased-distilled-squad"
# Maximum number of rows tried as QA context for a single question.
MAX_CONTEXT_ROWS = 8


def inject_styles() -> None:
    """Emit the app-wide stylesheet into the page.

    The CSS is sent through ``st.markdown`` with ``unsafe_allow_html=True`` so
    that the ``<style>`` element is passed to the page as raw HTML.
    """
    stylesheet = """
        <style>
        :root {
            --bg: #f5f4ef;
            --paper: #fbfaf6;
            --ink: #1a1a18;
            --muted: #6d6a63;
            --line: rgba(26,26,24,0.12);
            --soft: #ece8db;
            --accent: #121212;
            --gradient-a: rgba(127, 177, 183, 0.65);
            --gradient-b: rgba(30, 44, 58, 0.35);
            --gradient-c: rgba(220, 191, 151, 0.35);
        }

        .stApp {
            background: var(--bg);
            color: var(--ink);
        }

        .block-container {
            padding-top: 1.2rem;
            padding-bottom: 4rem;
            max-width: 1240px;
        }

        header[data-testid="stHeader"] {
            background: transparent;
        }

        [data-testid="stSidebar"] {
            display: none;
        }

        div[data-testid="stFileUploaderDropzone"] {
            background: rgba(255,255,255,0.55);
            border: 1px dashed rgba(26,26,24,0.18);
            border-radius: 22px;
            min-height: 220px;
        }

        .nav {
            display: flex;
            justify-content: space-between;
            align-items: center;
            padding: 0.4rem 0 1rem 0;
            border-bottom: 1px solid var(--line);
            margin-bottom: 1rem;
            font-size: 0.92rem;
            color: var(--muted);
        }

        .nav-links {
            display: flex;
            gap: 1.4rem;
            flex-wrap: wrap;
        }

        .brand {
            font-weight: 700;
            letter-spacing: 0.02em;
            color: var(--ink);
        }

        .hero {
            position: relative;
            overflow: hidden;
            border-radius: 30px;
            min-height: 460px;
            padding: 3.5rem 3rem;
            margin: 1rem 0 2rem 0;
            background:
                radial-gradient(circle at 20% 20%, rgba(255,255,255,0.62), transparent 32%),
                radial-gradient(circle at 75% 28%, rgba(11, 27, 42, 0.35), transparent 18%),
                radial-gradient(circle at 82% 82%, rgba(232, 196, 154, 0.5), transparent 20%),
                linear-gradient(135deg, var(--gradient-a), var(--gradient-b) 48%, var(--gradient-c));
            box-shadow: 0 10px 30px rgba(0,0,0,0.05);
        }

        .eyebrow {
            text-transform: uppercase;
            font-size: 0.78rem;
            letter-spacing: 0.18em;
            color: rgba(255,255,255,0.82);
            margin-bottom: 1rem;
        }

        .hero h1,
        .section-title {
            font-family: Georgia, 'Times New Roman', serif;
            font-weight: 400;
            letter-spacing: -0.02em;
        }

        .hero h1 {
            font-size: clamp(3rem, 7vw, 5.4rem);
            line-height: 0.92;
            color: #fffdf8;
            max-width: 700px;
            margin: 0;
        }

        .hero p {
            max-width: 520px;
            color: rgba(255,255,255,0.84);
            font-size: 1.03rem;
            line-height: 1.7;
            margin-top: 1rem;
        }

        .hero-chip-row {
            display: flex;
            gap: 0.7rem;
            flex-wrap: wrap;
            margin-top: 1.6rem;
        }

        .chip {
            border: 1px solid rgba(255,255,255,0.24);
            background: rgba(255,255,255,0.12);
            color: white;
            padding: 0.62rem 0.9rem;
            border-radius: 999px;
            font-size: 0.84rem;
            backdrop-filter: blur(6px);
        }

        .panel,
        .soft-panel,
        .metric-card,
        .story-card {
            border-radius: 26px;
            overflow: hidden;
        }

        .panel {
            background: rgba(255,255,255,0.5);
            border: 1px solid rgba(26,26,24,0.08);
            padding: 1.25rem;
        }

        .soft-panel {
            background: #e7e1cf;
            border: 1px solid rgba(26,26,24,0.06);
            padding: 1.5rem;
        }

        .section-head {
            display: flex;
            justify-content: space-between;
            gap: 1rem;
            align-items: end;
            margin: 2.5rem 0 1.2rem 0;
        }

        .section-title {
            font-size: clamp(1.8rem, 3vw, 3rem);
            line-height: 1;
            margin: 0;
        }

        .section-copy {
            max-width: 520px;
            color: var(--muted);
            font-size: 0.96rem;
            line-height: 1.7;
        }

        .metric-grid {
            display: grid;
            grid-template-columns: repeat(4, minmax(0,1fr));
            gap: 1rem;
            margin-top: 1rem;
        }

        .metric-card {
            background: #f7f5ee;
            border: 1px solid rgba(26,26,24,0.08);
            padding: 1rem 1.1rem;
            min-height: 130px;
        }

        .metric-label {
            color: var(--muted);
            font-size: 0.84rem;
            margin-bottom: 1rem;
        }

        .metric-value {
            font-size: 2rem;
            line-height: 1;
            margin-bottom: 0.35rem;
            font-weight: 600;
        }

        .story-card {
            position: relative;
            min-height: 170px;
            padding: 1.2rem;
            color: #fffaf2;
            background:
                linear-gradient(180deg, rgba(0,0,0,0.05), rgba(0,0,0,0.55)),
                linear-gradient(135deg, rgba(48,93,112,0.8), rgba(24,24,24,0.75), rgba(176,103,77,0.65));
            border: 1px solid rgba(255,255,255,0.08);
        }

        .story-card h4 {
            margin: 0;
            font-size: 1.2rem;
            line-height: 1.2;
            font-family: Georgia, 'Times New Roman', serif;
            font-weight: 400;
        }

        .story-card p {
            font-size: 0.9rem;
            color: rgba(255,255,255,0.82);
            line-height: 1.6;
            margin-top: 0.8rem;
        }

        .cta {
            text-align: center;
            padding: 3rem 2rem;
            margin-top: 2rem;
            border-radius: 28px;
            background: #dfe7d7;
            border: 1px solid rgba(26,26,24,0.08);
        }

        .cta h2 {
            font-family: Georgia, 'Times New Roman', serif;
            font-size: clamp(2rem, 4vw, 3.2rem);
            font-weight: 400;
            margin: 0 0 0.7rem 0;
        }

        .foot {
            border-top: 1px solid var(--line);
            margin-top: 2.5rem;
            padding-top: 1.2rem;
            color: var(--muted);
            font-size: 0.88rem;
        }

        label, .stRadio label, .stCaption, [data-testid="stCaptionContainer"] {
            color: #4f4b45 !important;
            opacity: 1 !important;
        }

        [data-testid="stMarkdownContainer"] p {
            color: #4f4b45;
        }

        div[role="radiogroup"] label {
            color: #2b2925 !important;
            font-weight: 500;
        }

        @media (max-width: 900px) {
            .hero {
                min-height: 360px;
                padding: 2rem 1.4rem;
            }
            .metric-grid {
                grid-template-columns: repeat(2, minmax(0,1fr));
            }
        }

        @media (max-width: 640px) {
            .metric-grid {
                grid-template-columns: 1fr;
            }
            .section-head {
                flex-direction: column;
                align-items: start;
            }
        }
        </style>
        """
    st.markdown(stylesheet, unsafe_allow_html=True)

@st.cache_resource(show_spinner=False)
def load_pipelines():
    """Load the news classifier and build the question-answering callable.

    Decorated with ``st.cache_resource`` so the models are created once and
    reused across script reruns instead of being reloaded each time.

    Returns:
        A ``(classifier, qa_fn)`` tuple. ``classifier`` is the
        ``text-classification`` pipeline for ``MODEL_NAME``.
        ``qa_fn(question, context)`` returns a dict
        ``{"answer": str, "score": float}``; ``answer`` is an empty string
        and ``score`` is ``0.0`` when the encoded input has no context tokens.
    """
    classifier = pipeline(
        "text-classification",
        model=MODEL_NAME,
        tokenizer=MODEL_NAME,
        truncation=True,
    )

    # Load QA manually instead of using pipeline("question-answering")
    tokenizer = AutoTokenizer.from_pretrained(QA_MODEL)
    model = AutoModelForQuestionAnswering.from_pretrained(QA_MODEL)

    def qa_fn(question, context):
        """Extract the most likely answer span for *question* from *context*."""
        inputs = tokenizer(question, context, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)

        # Only tokens of the second sequence (the context) may be part of the
        # answer; question and special tokens are masked out. sequence_ids()
        # is provided by fast tokenizers, which AutoTokenizer is expected to
        # return by default for this checkpoint.
        context_mask = torch.tensor(
            [seq_id == 1 for seq_id in inputs.sequence_ids(0)],
            dtype=torch.bool,
        )
        if not bool(context_mask.any()):
            return {"answer": "", "score": 0.0}

        neg_inf = float("-inf")
        start_probs = outputs.start_logits[0].masked_fill(~context_mask, neg_inf).softmax(dim=-1)
        end_probs = outputs.end_logits[0].masked_fill(~context_mask, neg_inf).softmax(dim=-1)

        # Pick the best start, then the best end at or after it, so the span
        # can never be empty or reversed.
        start = int(start_probs.argmax())
        end = start + int(end_probs[start:].argmax())

        input_ids = inputs["input_ids"][0]
        answer = tokenizer.decode(input_ids[start : end + 1], skip_special_tokens=True).strip()
        # Joint probability of the chosen start and end positions.
        score = float(start_probs[start] * end_probs[end])
        return {"answer": answer, "score": score}

    return classifier, qa_fn


def preprocess_text(text: str) -> str:
    """Return *text* as a string with links removed and whitespace collapsed.

    Anything starting with ``http`` or ``www.`` up to the next whitespace is
    replaced by a space; runs of whitespace are then squeezed to one space and
    the result is stripped at both ends.
    """
    without_links = re.sub(r"http\S+|www\.\S+", " ", str(text))
    collapsed = re.sub(r"\s+", " ", without_links)
    return collapsed.strip()


def get_text_column(df: pd.DataFrame) -> str:
    """Return the label of the column holding the article text.

    The match against ``"content"`` ignores case and surrounding whitespace,
    and tolerates non-string column labels (they are compared via ``str``).
    If several columns match, the last one in column order is returned.

    Args:
        df: The uploaded data.

    Returns:
        The original column label as it appears in ``df.columns``.

    Raises:
        ValueError: If no column matches ``"content"``.
    """
    # str() keeps integer or other non-string labels from raising
    # AttributeError; strip() accepts headers such as " content".
    normalized = {str(column).strip().lower(): column for column in df.columns}
    try:
        return normalized["content"]
    except KeyError:
        raise ValueError("CSV must contain a 'content' column.") from None


def predict_classes(df: pd.DataFrame, text_col: str, classifier):
    """Classify every row of ``df[text_col]``.

    Missing values become empty strings and each value is run through
    ``preprocess_text`` before being sent to ``classifier`` in batches of 16.

    Returns:
        A ``(texts, labels, scores)`` tuple of equal-length lists: the
        preprocessed texts, the predicted labels (``"Unknown"`` when a
        prediction has no label) and the scores rounded to four decimals.
    """
    column = df[text_col].fillna("").astype(str)
    texts = [preprocess_text(value) for value in column]

    labels = []
    scores = []
    for prediction in classifier(texts, batch_size=16):
        labels.append(prediction.get("label", "Unknown"))
        scores.append(round(float(prediction.get("score", 0.0)), 4))

    return texts, labels, scores


def dataframe_to_csv_bytes(df: pd.DataFrame) -> bytes:
    """Serialize *df* to CSV (without the index) and return it as UTF-8 bytes."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode("utf-8")


# Emit the global stylesheet before any visible content is rendered.
inject_styles()

# Build (or fetch from the resource cache) the classifier and the QA callable.
with st.spinner("Loading models..."):
    classifier, qa_pipeline = load_pipelines()

# Top navigation bar (static markup; the entries are plain <span>s, not links).
st.markdown(
    """
    <div class="nav">
        <div class="brand">News Intelligence Studio</div>
        <div class="nav-links">
            <span>Classification</span>
            <span>Question Answering</span>
            <span>Insights</span>
            <span>Local Streamlit</span>
        </div>
    </div>
    """,
    unsafe_allow_html=True,
)

# Hero banner with the headline, a short description and feature chips.
st.markdown(
    """
    <section class="hero">
        <div class="eyebrow">Powered by Hugging Face</div>
        <h1>Intelligence that reads your news operations</h1>
        <p>
            Upload a CSV, classify every news excerpt with your fine-tuned model,
            explore the predicted distribution, and ask grounded questions from the
            article content in one polished Streamlit workspace.
        </p>
        <div class="hero-chip-row">
            <div class="chip">Model: Akilashamnaka12/news-classifier-model</div>
            <div class="chip">CSV in → output.csv out</div>
            <div class="chip">Local-first Streamlit experience</div>
        </div>
    </section>
    """,
    unsafe_allow_html=True,
)

# Two-column workspace: the uploader goes in the wider left column and the
# question-answering panel in the right one.
left, right = st.columns([1.15, 0.85], gap="large")

# Defaults; each of these names is reassigned by a widget further down.
uploaded_file = None
question = ""
context_mode = "Use first few records"
# NOTE(review): answer_box is not referenced again in the visible file.
answer_box = right.empty()

with left:
    st.markdown('<div class="panel">', unsafe_allow_html=True)
    # Only .csv files are accepted by the uploader.
    uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])
    st.caption("Expected column: content")
    st.markdown("</div>", unsafe_allow_html=True)

@st.cache_data(show_spinner=False, max_entries=8)
def _classify_upload(file_bytes: bytes, _classifier):
    """Parse an uploaded CSV and classify every row, cached by file content.

    Streamlit reruns the whole script on each widget interaction; caching on
    the raw bytes avoids re-parsing and re-classifying the same file whenever
    the user types a question or changes the filter.

    Args:
        file_bytes: Raw content of the uploaded CSV (this is the cache key).
        _classifier: Text-classification callable. The leading underscore
            tells ``st.cache_data`` not to hash this argument.

    Returns:
        ``(classified_df, text_column)``: a copy of the parsed data with the
        preprocessed text plus ``class`` and ``confidence`` columns, and the
        label of the text column.

    Raises:
        ValueError: If the CSV has no ``content`` column.
    """
    import io  # local import so this block does not depend on file-level imports

    raw_df = pd.read_csv(io.BytesIO(file_bytes))
    column = get_text_column(raw_df)
    texts, labels, scores = predict_classes(raw_df, column, _classifier)

    classified = raw_df.copy()
    classified[column] = texts
    classified["class"] = labels
    classified["confidence"] = scores
    return classified, column


# State shared with the Q&A panel and the results section below.
result_df = None
filtered_df = None
selected_class = "All"
text_col = None

if uploaded_file is not None:
    try:
        result_df, text_col = _classify_upload(uploaded_file.getvalue(), classifier)
        # Default to the unfiltered view so filtered_df is never None once
        # result_df is available.
        filtered_df = result_df

        classes = sorted(result_df["class"].dropna().unique().tolist())
        selected_class = left.selectbox("Filter predictions", ["All"] + classes, index=0)

        if selected_class != "All":
            filtered_df = result_df[result_df["class"] == selected_class]

    except Exception as exc:
        # Top-level UI boundary: report any parsing/inference failure to the user.
        st.error(f"Could not process the file: {exc}")

# Question-answering panel: runs the QA callable over up to MAX_CONTEXT_ROWS
# rows and shows the highest-scoring non-empty answer.
with right:
    st.markdown('<div class="panel">', unsafe_allow_html=True)
    st.subheader("Ask questions from the uploaded news")
    question = st.text_input("Type your question")
    st.caption("Ask things like: What happened in sports? What caused flooding in Colombo?")
    context_mode = st.radio(
        "Context source",
        ["Use first few records", "Use selected class only"],
        horizontal=True,
    )

    # A whitespace-only question is treated as "no question".
    if uploaded_file is not None and result_df is not None and question.strip():
        try:
            qa_source_df = result_df

            if context_mode == "Use selected class only" and selected_class not in (None, "All"):
                qa_source_df = qa_source_df[qa_source_df["class"] == selected_class]

            candidate_rows = [
                row
                for row in qa_source_df[text_col].fillna("").astype(str).head(MAX_CONTEXT_ROWS)
                if row.strip()
            ]

            best_answer = ""
            best_score = -1.0
            best_context = ""

            for row_text in candidate_rows:
                result = qa_pipeline(
                    question=question,
                    context=row_text
                )
                answer = str(result.get("answer") or "").strip()
                score = float(result.get("score", 0.0))

                # Blank spans are not answers; never let them win on score.
                if answer and score > best_score:
                    best_score = score
                    best_answer = answer
                    best_context = row_text

            if not candidate_rows:
                st.warning("No usable context found.")
            elif not best_answer:
                st.warning("No answer found.")
            else:
                st.markdown("---")
                st.markdown("### Answer")
                st.success(best_answer)
                st.caption(f"Confidence: {best_score:.4f}")

                with st.expander("Show context used"):
                    st.write(best_context)

        except Exception as e:
            # Top-level UI boundary: surface inference errors instead of crashing.
            st.error(f"Error generating answer: {e}")

    st.markdown("</div>", unsafe_allow_html=True)

# Results area: shown once a file has been classified; otherwise a call-to-action
# banner is rendered instead.
if result_df is not None:
    st.markdown(
        """
        <div class="section-head">
            <div>
                <div class="section-title">Continuously test and explore output</div>
            </div>
            <div class="section-copy">
                Once a file is uploaded, the app predicts a class for each row,
                adds a confidence score, and prepares an exportable output.csv.
            </div>
        </div>
        """,
        unsafe_allow_html=True,
    )

    # Summary values; both fall back to placeholders for an empty frame.
    top_class = result_df["class"].mode().iat[0] if not result_df.empty else "N/A"
    avg_conf = f"{result_df['confidence'].mean():.2%}" if not result_df.empty else "0%"

    # Four metric cards: row count, distinct labels, most frequent label, mean confidence.
    st.markdown(
        f"""
        <div class="metric-grid">
            <div class="metric-card">
                <div class="metric-label">Uploaded records</div>
                <div class="metric-value">{len(result_df)}</div>
                <div>Rows processed from your CSV</div>
            </div>
            <div class="metric-card">
                <div class="metric-label">Detected classes</div>
                <div class="metric-value">{result_df['class'].nunique()}</div>
                <div>Unique labels predicted by the model</div>
            </div>
            <div class="metric-card">
                <div class="metric-label">Top predicted class</div>
                <div class="metric-value">{top_class}</div>
                <div>Most frequent label in the batch</div>
            </div>
            <div class="metric-card">
                <div class="metric-label">Average confidence</div>
                <div class="metric-value">{avg_conf}</div>
                <div>Mean prediction confidence score</div>
            </div>
        </div>
        """,
        unsafe_allow_html=True,
    )

    col_a, col_b = st.columns([1.05, 0.95], gap="large")

    # Left: bar chart of how many rows fell into each predicted class.
    with col_a:
        st.markdown('<div class="soft-panel">', unsafe_allow_html=True)
        st.subheader("Predicted class distribution")
        st.bar_chart(result_df["class"].value_counts())
        st.markdown("</div>", unsafe_allow_html=True)

    # Right: download of the full (unfiltered) classified frame as output.csv.
    with col_b:
        st.markdown('<div class="soft-panel">', unsafe_allow_html=True)
        st.subheader("Download ready")
        st.write(
            "Your exported file includes the original columns, the predicted class, and the confidence score."
        )
        st.download_button(
            label="Download output.csv",
            data=dataframe_to_csv_bytes(result_df),
            file_name="output.csv",
            mime="text/csv",
            use_container_width=True,
        )
        st.markdown("</div>", unsafe_allow_html=True)

    st.markdown(
        """
        <div class="section-head">
            <div>
                <div class="section-title">Built for the real world</div>
            </div>
            <div class="section-copy">
                Below are presentation-friendly feature cards. They help your app
                feel more like a polished product during the live demo.
            </div>
        </div>
        """,
        unsafe_allow_html=True,
    )

    # Static feature cards as (title, description) pairs, one per column.
    story_cols = st.columns(4, gap="small")
    stories = [
        (
            "Scalable batch classification",
            "Upload larger CSV files and label each record in a single flow.",
        ),
        (
            "Grounded question answering",
            "Ask focused questions using article content as context.",
        ),
        (
            "Confidence-aware review",
            "Inspect how certain the model is before exporting the final sheet.",
        ),
        (
            "Presentation-ready interface",
            "A clean editorial design that feels stronger than a default dashboard.",
        ),
    ]

    for col, (title, copy) in zip(story_cols, stories):
        with col:
            st.markdown(
                f'<div class="story-card"><h4>{title}</h4><p>{copy}</p></div>',
                unsafe_allow_html=True,
            )

    st.markdown(
        """
        <div class="section-head">
            <div>
                <div class="section-title">Records</div>
            </div>
            <div class="section-copy">
                Review the classified rows before downloading the final output.
            </div>
        </div>
        """,
        unsafe_allow_html=True,
    )
    # The table honours the class filter; the download above does not.
    st.dataframe(filtered_df, use_container_width=True, height=360)
else:
    # No classified data yet: prompt the user to upload a CSV.
    st.markdown(
        """
        <div class="cta">
            <h2>Intelligence that runs your news workflow</h2>
            <p>Upload a CSV to activate classification, analytics, downloadable results, and grounded Q&A.</p>
        </div>
        """,
        unsafe_allow_html=True,
    )

# Footer: always rendered, with the local run command and the CSV requirement.
st.markdown(
    """
    <div class="foot">
        Local run command: <code>python -m streamlit run app.py</code><br>
        Make sure your CSV contains a <code>content</code> column, and keep the preprocessing function aligned with Section 01.
    </div>
    """,
    unsafe_allow_html=True,
)