import re
import pandas as pd
import streamlit as st
from transformers import pipeline
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
import torch
# ---- Page configuration and model constants --------------------------------
st.set_page_config(
    page_title="News Intelligence Studio",
    page_icon="📰",
    layout="wide",
    initial_sidebar_state="collapsed",
)

# Fine-tuned news classifier hosted on the Hugging Face Hub.
MODEL_NAME = "Akilashamnaka12/news-classifier-model"
# Extractive question-answering model used for grounded Q&A.
QA_MODEL = "distilbert-base-cased-distilled-squad"
# Maximum number of article rows offered to the QA model as context.
MAX_CONTEXT_ROWS = 8
def inject_styles() -> None:
    """Inject the app's custom markup/CSS into the page.

    NOTE(review): the markdown payload here is just a newline — the original
    style content appears to have been stripped from this source; confirm
    against the upstream file.
    """
    st.markdown("\n", unsafe_allow_html=True)
@st.cache_resource(show_spinner=False)
def load_pipelines():
    """Load and cache the classification pipeline and a manual QA callable.

    Returns:
        tuple: ``(classifier, qa_fn)`` where ``classifier`` is a Hugging Face
        text-classification pipeline and ``qa_fn(question, context)`` returns
        ``{"answer": str, "score": float}``.
    """
    classifier = pipeline(
        "text-classification",
        model=MODEL_NAME,
        tokenizer=MODEL_NAME,
        truncation=True,
    )

    # Load QA manually instead of using pipeline("question-answering") so we
    # control truncation and span scoring ourselves.
    tokenizer = AutoTokenizer.from_pretrained(QA_MODEL)
    model = AutoModelForQuestionAnswering.from_pretrained(QA_MODEL)
    model.eval()  # inference only; disables dropout

    def qa_fn(question, context):
        inputs = tokenizer(
            question, context, return_tensors="pt", truncation=True, max_length=512
        )
        with torch.no_grad():
            outputs = model(**inputs)

        start_probs = outputs.start_logits.softmax(dim=-1)[0]
        end_probs = outputs.end_logits.softmax(dim=-1)[0]
        start = int(start_probs.argmax())
        # Bug fix: the raw argmax of end_logits can land BEFORE the start
        # index, producing an empty/invalid span. Constrain the end search to
        # positions at or after `start`.
        end = start + int(end_probs[start:].argmax())

        answer = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][start : end + 1])
        )
        # Bug fix: score the span with the standard extractive-QA confidence
        # P(start) * P(end), instead of only the start-logit softmax max.
        score = float(start_probs[start] * end_probs[end])
        return {"answer": answer, "score": score}

    return classifier, qa_fn
def preprocess_text(text: str) -> str:
    """Normalize a raw article excerpt: drop URLs and collapse whitespace.

    Non-string input is coerced with ``str`` first, so NaN/ints are safe.
    """
    without_urls = re.sub(r"http\S+|www\.\S+", " ", str(text))
    # split()/join collapses any whitespace run and trims both ends.
    return " ".join(without_urls.split())
def get_text_column(df: pd.DataFrame) -> str:
    """Return the original name of the (case-insensitive) 'content' column.

    Raises:
        ValueError: if no column lowercases to 'content'.
    """
    by_lower = {}
    for col in df.columns:
        by_lower[col.lower()] = col  # later duplicates win, as before
    if "content" not in by_lower:
        raise ValueError("CSV must contain a 'content' column.")
    return by_lower["content"]
def predict_classes(df: pd.DataFrame, text_col: str, classifier):
    """Classify every row of ``df[text_col]`` with the given pipeline.

    Returns:
        tuple: (cleaned texts, predicted labels, confidence scores rounded
        to 4 decimal places), all as plain Python lists.
    """
    series = df[text_col].fillna("").astype(str)
    texts = [preprocess_text(value) for value in series]
    predictions = classifier(texts, batch_size=16)
    labels = []
    scores = []
    for prediction in predictions:
        labels.append(prediction.get("label", "Unknown"))
        scores.append(round(float(prediction.get("score", 0.0)), 4))
    return texts, labels, scores
def dataframe_to_csv_bytes(df: pd.DataFrame) -> bytes:
    """Serialize ``df`` to UTF-8 CSV bytes without the index column."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode("utf-8")
# ---- Page scaffold: styles, model load, hero copy, upload column -----------
inject_styles()

with st.spinner("Loading models..."):
    classifier, qa_pipeline = load_pipelines()

st.markdown(
    """
News Intelligence Studio
Classification
Question Answering
Insights
Local Streamlit
""",
    unsafe_allow_html=True,
)

st.markdown(
    """
Powered by Hugging Face
Intelligence that reads your news operations
Upload a CSV, classify every news excerpt with your fine-tuned model,
explore the predicted distribution, and ask grounded questions from the
article content in one polished Streamlit workspace.
Model: Akilashamnaka12/news-classifier-model
CSV in → output.csv out
Local-first Streamlit experience
""",
    unsafe_allow_html=True,
)

left, right = st.columns([1.15, 0.85], gap="large")

# Defaults so the later stages can reference these before any upload happens.
uploaded_file = None
question = ""
context_mode = "Use first few records"
answer_box = right.empty()  # placeholder; not written to in visible code

with left:
    st.markdown('', unsafe_allow_html=True)
    uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])
    st.caption("Expected column: content")
    # Bug fix: the original call split a plain double-quoted string across two
    # lines (a syntax error). Its markup appears to have been stripped from
    # this source — TODO confirm the intended closing tag against upstream.
    st.markdown("", unsafe_allow_html=True)
# ---- Classification stage ---------------------------------------------------
# Defaults so the results section renders the empty state when nothing has
# been uploaded or processing failed.
result_df = None
filtered_df = None
selected_class = "All"
text_col = None

if uploaded_file is not None:
    try:
        raw_df = pd.read_csv(uploaded_file)
        text_col = get_text_column(raw_df)
        texts, labels, scores = predict_classes(raw_df.copy(), text_col, classifier)

        # Attach cleaned text, predicted label, and confidence to a copy of
        # the uploaded frame.
        result_df = raw_df.copy()
        result_df[text_col] = texts
        result_df["class"] = labels
        result_df["confidence"] = scores

        classes = sorted(result_df["class"].dropna().unique().tolist())
        selected_class = left.selectbox("Filter predictions", ["All"] + classes, index=0)
        if selected_class == "All":
            filtered_df = result_df
        else:
            filtered_df = result_df[result_df["class"] == selected_class]
    except Exception as exc:
        # Top-level UI boundary: surface any parsing/model failure to the user.
        st.error(f"Could not process the file: {exc}")
# ---- Question-answering stage ----------------------------------------------
with right:
    st.markdown('', unsafe_allow_html=True)
    st.subheader("Ask questions from the uploaded news")
    question = st.text_input("Type your question")
    st.caption("Ask things like: What happened in sports? What caused flooding in Colombo?")
    context_mode = st.radio(
        "Context source",
        ["Use first few records", "Use selected class only"],
        horizontal=True,
    )

    if uploaded_file is not None and result_df is not None and question:
        try:
            qa_source_df = result_df.copy()
            # Optionally narrow the context to the class picked in the left column.
            if context_mode == "Use selected class only" and selected_class not in (None, "All"):
                qa_source_df = qa_source_df[qa_source_df["class"] == selected_class]

            candidate_rows = (
                qa_source_df[text_col].fillna("").astype(str).head(MAX_CONTEXT_ROWS).tolist()
            )
            candidate_rows = [row for row in candidate_rows if row.strip()]

            if candidate_rows:
                # Run QA against each candidate row and keep the answer with
                # the highest confidence.
                best_answer = None
                best_score = -1.0
                best_context = ""
                for row_text in candidate_rows:
                    result = qa_pipeline(question=question, context=row_text)
                    score = float(result.get("score", 0.0))
                    if score > best_score:
                        best_score = score
                        best_answer = result.get("answer", "No answer found.")
                        best_context = row_text

                st.markdown("---")
                st.markdown("### Answer")
                st.success(best_answer)
                st.caption(f"Confidence: {best_score:.4f}")
                with st.expander("Show context used"):
                    st.write(best_context)
            else:
                st.warning("No usable context found.")
        except Exception as e:
            # Top-level UI boundary: report QA failures instead of crashing.
            st.error(f"Error generating answer: {e}")

    # Bug fix: the original call split a plain double-quoted string across two
    # lines (a syntax error). Its markup appears to have been stripped from
    # this source — TODO confirm the intended closing tag against upstream.
    st.markdown("", unsafe_allow_html=True)
# ---- Insights, export, and presentation panels ------------------------------
if result_df is not None:
    st.markdown(
        """
Continuously test and explore output
Once a file is uploaded, the app predicts a class for each row,
adds a confidence score, and prepares an exportable output.csv.
""",
        unsafe_allow_html=True,
    )

    # Headline metrics for the processed batch.
    top_class = result_df["class"].mode().iat[0] if not result_df.empty else "N/A"
    avg_conf = f"{result_df['confidence'].mean():.2%}" if not result_df.empty else "0%"

    st.markdown(
        f"""
Uploaded records
{len(result_df)}
Rows processed from your CSV
Detected classes
{result_df['class'].nunique()}
Unique labels predicted by the model
Top predicted class
{top_class}
Most frequent label in the batch
Average confidence
{avg_conf}
Mean prediction confidence score
""",
        unsafe_allow_html=True,
    )

    col_a, col_b = st.columns([1.05, 0.95], gap="large")
    with col_a:
        st.markdown('', unsafe_allow_html=True)
        st.subheader("Predicted class distribution")
        st.bar_chart(result_df["class"].value_counts())
        # Bug fix: the original call split a plain double-quoted string across
        # two lines (a syntax error); its markup appears stripped — TODO
        # confirm the intended closing tag against upstream.
        st.markdown("", unsafe_allow_html=True)
    with col_b:
        st.markdown('', unsafe_allow_html=True)
        st.subheader("Download ready")
        st.write(
            "Your exported file includes the original columns, the predicted class, and the confidence score."
        )
        st.download_button(
            label="Download output.csv",
            data=dataframe_to_csv_bytes(result_df),
            file_name="output.csv",
            mime="text/csv",
            use_container_width=True,
        )
        # Bug fix: same broken-string pattern as above.
        st.markdown("", unsafe_allow_html=True)

    st.markdown(
        """
Below are presentation-friendly feature cards. They help your app
feel more like a polished product during the live demo.
""",
        unsafe_allow_html=True,
    )

    story_cols = st.columns(4, gap="small")
    stories = [
        (
            "Scalable batch classification",
            "Upload larger CSV files and label each record in a single flow.",
        ),
        (
            "Grounded question answering",
            "Ask focused questions using article content as context.",
        ),
        (
            "Confidence-aware review",
            "Inspect how certain the model is before exporting the final sheet.",
        ),
        (
            "Presentation-ready interface",
            "A clean editorial design that feels stronger than a default dashboard.",
        ),
    ]
    for col, (title, copy) in zip(story_cols, stories):
        with col:
            # NOTE(review): the f-string body is empty and never interpolates
            # `title`/`copy` — the card markup appears stripped from this
            # source. Kept byte-for-byte; confirm against upstream.
            st.markdown(
                f'',
                unsafe_allow_html=True,
            )

    st.markdown(
        """
Review the classified rows before downloading the final output.
""",
        unsafe_allow_html=True,
    )
    st.dataframe(filtered_df, use_container_width=True, height=360)
else:
    st.markdown(
        """
Intelligence that runs your news workflow
Upload a CSV to activate classification, analytics, downloadable results, and grounded Q&A.
""",
        unsafe_allow_html=True,
    )
# NOTE(review): closing page markdown — the markup/CSS payload appears to have
# been stripped from this source (the literal is just a newline); confirm
# against the upstream file.
st.markdown(
"""
""",
unsafe_allow_html=True,
)