"""News Intelligence Studio.

Streamlit app that classifies news excerpts from an uploaded CSV with a
fine-tuned Hugging Face model, shows the predicted class distribution, and
answers questions grounded in the uploaded article content.
"""

import re

import pandas as pd
import streamlit as st
import torch
from transformers import (  # fixed: original imported `pipeline` twice
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    pipeline,
)

st.set_page_config(
    page_title="News Intelligence Studio",
    page_icon="📰",
    layout="wide",
    initial_sidebar_state="collapsed",
)

# Model identifiers and limits.
MODEL_NAME = "Akilashamnaka12/news-classifier-model"
QA_MODEL = "distilbert-base-cased-distilled-squad"
MAX_CONTEXT_ROWS = 8  # max candidate rows used as QA context per question


def inject_styles() -> None:
    """Inject custom CSS into the page.

    NOTE(review): the original stylesheet content was lost in extraction;
    the empty markdown call preserves the hook point — restore the CSS here.
    """
    st.markdown(
        """
        """,
        unsafe_allow_html=True,
    )


@st.cache_resource(show_spinner=False)
def load_pipelines():
    """Load and cache the classifier pipeline and a manual QA function.

    Returns:
        tuple: ``(classifier, qa_fn)`` where ``classifier`` is a Hugging Face
        text-classification pipeline and ``qa_fn(question, context)`` returns
        ``{"answer": str, "score": float}``.
    """
    classifier = pipeline(
        "text-classification",
        model=MODEL_NAME,
        tokenizer=MODEL_NAME,
        truncation=True,
    )

    # Load QA manually instead of using pipeline("question-answering").
    tokenizer = AutoTokenizer.from_pretrained(QA_MODEL)
    model = AutoModelForQuestionAnswering.from_pretrained(QA_MODEL)

    def qa_fn(question, context):
        """Run extractive QA over a single context string."""
        inputs = tokenizer(
            question,
            context,
            return_tensors="pt",
            truncation=True,
            max_length=512,
        )
        with torch.no_grad():
            outputs = model(**inputs)
        start = int(outputs.start_logits.argmax())
        end = int(outputs.end_logits.argmax()) + 1
        # Fix: independent argmaxes can place the end before the start,
        # which produced an empty answer span; clamp to a 1-token span.
        if end <= start:
            end = start + 1
        answer = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][start:end])
        )
        # Confidence proxy: peak softmax probability of the start logits.
        score = float(outputs.start_logits.softmax(dim=-1).max())
        return {"answer": answer, "score": score}

    return classifier, qa_fn


def preprocess_text(text: str) -> str:
    """Normalize a raw excerpt: drop URLs and collapse whitespace."""
    text = str(text)
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    return re.sub(r"\s+", " ", text).strip()


def get_text_column(df: pd.DataFrame) -> str:
    """Return the name of the 'content' column, matched case-insensitively.

    Raises:
        ValueError: if the DataFrame has no 'content' column.
    """
    lowered = {c.lower(): c for c in df.columns}
    if "content" in lowered:
        return lowered["content"]
    raise ValueError("CSV must contain a 'content' column.")


def predict_classes(df: pd.DataFrame, text_col: str, classifier):
    """Classify every row of *text_col* with the cached classifier.

    Returns:
        tuple: ``(texts, labels, scores)`` — preprocessed texts, predicted
        labels, and confidence scores rounded to 4 decimal places.
    """
    texts = df[text_col].fillna("").astype(str).apply(preprocess_text).tolist()
    outputs = classifier(texts, batch_size=16)
    labels = [o.get("label", "Unknown") for o in outputs]
    scores = [round(float(o.get("score", 0.0)), 4) for o in outputs]
    return texts, labels, scores


def dataframe_to_csv_bytes(df: pd.DataFrame) -> bytes:
    """Serialize *df* to UTF-8 CSV bytes without the index column."""
    return df.to_csv(index=False).encode("utf-8")


# ---------------------------------------------------------------------------
# Page body
# ---------------------------------------------------------------------------
inject_styles()

with st.spinner("Loading models..."):
    classifier, qa_pipeline = load_pipelines()

# NOTE(review): the HTML markup originally embedded in these markdown blocks
# was lost in extraction; the visible text is preserved verbatim below —
# restore the original tags/classes when the source markup is recovered.
st.markdown(
    """
    Powered by Hugging Face

    Intelligence that reads your news operations

    Upload a CSV, classify every news excerpt with your fine-tuned model,
    explore the predicted distribution, and ask grounded questions from the
    article content in one polished Streamlit workspace.

    Model: Akilashamnaka12/news-classifier-model
    CSV in → output.csv out
    Local-first Streamlit experience
    """,
    unsafe_allow_html=True,
)

left, right = st.columns([1.15, 0.85], gap="large")

uploaded_file = None
question = ""
context_mode = "Use first few records"
answer_box = right.empty()

with left:
    uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])
    st.caption("Expected column: content")

result_df = None
filtered_df = None
selected_class = "All"
text_col = None

if uploaded_file is not None:
    try:
        raw_df = pd.read_csv(uploaded_file)
        text_col = get_text_column(raw_df)
        texts, labels, scores = predict_classes(raw_df.copy(), text_col, classifier)
        result_df = raw_df.copy()
        result_df[text_col] = texts
        result_df["class"] = labels
        result_df["confidence"] = scores
        classes = sorted(result_df["class"].dropna().unique().tolist())
        selected_class = left.selectbox(
            "Filter predictions", ["All"] + classes, index=0
        )
        filtered_df = (
            result_df
            if selected_class == "All"
            else result_df[result_df["class"] == selected_class]
        )
    except Exception as exc:
        st.error(f"Could not process the file: {exc}")

with right:
    st.subheader("Ask questions from the uploaded news")
    question = st.text_input("Type your question")
    st.caption(
        "Ask things like: What happened in sports? What caused flooding in Colombo?"
    )
    context_mode = st.radio(
        "Context source",
        ["Use first few records", "Use selected class only"],
        horizontal=True,
    )
    if uploaded_file is not None and result_df is not None and question:
        try:
            qa_source_df = result_df.copy()
            if context_mode == "Use selected class only" and selected_class not in (
                None,
                "All",
            ):
                qa_source_df = qa_source_df[qa_source_df["class"] == selected_class]
            candidate_rows = (
                qa_source_df[text_col]
                .fillna("")
                .astype(str)
                .head(MAX_CONTEXT_ROWS)
                .tolist()
            )
            candidate_rows = [row for row in candidate_rows if row.strip()]
            if candidate_rows:
                # Score each candidate row independently; keep the best span.
                best_answer = None
                best_score = -1.0
                best_context = ""
                for row_text in candidate_rows:
                    result = qa_pipeline(question=question, context=row_text)
                    score = float(result.get("score", 0.0))
                    if score > best_score:
                        best_score = score
                        best_answer = result.get("answer", "No answer found.")
                        best_context = row_text
                st.markdown("---")
                st.markdown("### Answer")
                st.success(best_answer)
                st.caption(f"Confidence: {best_score:.4f}")
                with st.expander("Show context used"):
                    st.write(best_context)
            else:
                st.warning("No usable context found.")
        except Exception as e:
            st.error(f"Error generating answer: {e}")

if result_df is not None:
    st.markdown(
        """
        Continuously test and explore output

        Once a file is uploaded, the app predicts a class for each row, adds a
        confidence score, and prepares an exportable output.csv.
        """,
        unsafe_allow_html=True,
    )
    top_class = result_df["class"].mode().iat[0] if not result_df.empty else "N/A"
    avg_conf = f"{result_df['confidence'].mean():.2%}" if not result_df.empty else "0%"
    st.markdown(
        f"""
        Uploaded records: {len(result_df)} — rows processed from your CSV

        Detected classes: {result_df['class'].nunique()} — unique labels predicted by the model

        Top predicted class: {top_class} — most frequent label in the batch

        Average confidence: {avg_conf} — mean prediction confidence score
        """,
        unsafe_allow_html=True,
    )

    col_a, col_b = st.columns([1.05, 0.95], gap="large")
    with col_a:
        st.subheader("Predicted class distribution")
        st.bar_chart(result_df["class"].value_counts())
    with col_b:
        st.subheader("Download ready")
        st.write(
            "Your exported file includes the original columns, the predicted "
            "class, and the confidence score."
        )
        st.download_button(
            label="Download output.csv",
            data=dataframe_to_csv_bytes(result_df),
            file_name="output.csv",
            mime="text/csv",
            use_container_width=True,
        )

    st.markdown(
        """
        Built for the real world

        Below are presentation-friendly feature cards. They help your app feel
        more like a polished product during the live demo.
        """,
        unsafe_allow_html=True,
    )
    story_cols = st.columns(4, gap="small")
    stories = [
        (
            "Scalable batch classification",
            "Upload larger CSV files and label each record in a single flow.",
        ),
        (
            "Grounded question answering",
            "Ask focused questions using article content as context.",
        ),
        (
            "Confidence-aware review",
            "Inspect how certain the model is before exporting the final sheet.",
        ),
        (
            "Presentation-ready interface",
            "A clean editorial design that feels stronger than a default dashboard.",
        ),
    ]
    for col, (title, copy) in zip(story_cols, stories):
        with col:
            st.markdown(
                f"{title}\n\n{copy}",
                unsafe_allow_html=True,
            )

    st.markdown(
        """
        Records

        Review the classified rows before downloading the final output.
        """,
        unsafe_allow_html=True,
    )
    st.dataframe(filtered_df, use_container_width=True, height=360)
else:
    st.markdown(
        """
        Intelligence that runs your news workflow

        Upload a CSV to activate classification, analytics, downloadable
        results, and grounded Q&A.
        """,
        unsafe_allow_html=True,
    )

st.markdown(
    """
    Local run command: python -m streamlit run app.py

    Make sure your CSV contains a content column, and keep the preprocessing
    function aligned with Section 01.
    """,
    unsafe_allow_html=True,
)