# Source: Hugging Face Spaces - Ginidu2003 / Distilbert_Base_News_Classifier
# (Space status at capture: Sleeping. File size: 2,929 Bytes.)

import streamlit as st
import pandas as pd
import torch                              # ← This was missing
from transformers import pipeline
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string

# ====================== PREPROCESSING ======================
# Fetch the NLTK data that preprocess_text depends on:
#   stopwords         -> stopwords.words('english')
#   wordnet           -> WordNetLemmatizer
#   punkt / punkt_tab -> nltk.word_tokenize
# NLTK 3.9+ loads the 'punkt_tab' tables instead of the older pickled 'punkt'
# models, so both are requested to keep tokenisation working across versions.
for _resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(_resource, quiet=True)

# Built once at import time: set membership keeps the stop-word filter cheap,
# and a single lemmatizer instance is shared by every preprocess_text call.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text: object) -> str:
    """Normalise raw article text into a space-joined string of lemmas.

    The text is lower-cased, every character other than ``a``-``z`` or
    whitespace is replaced by a space, the result is tokenised with
    ``nltk.word_tokenize``, tokens found in the module-level ``stop_words``
    set are dropped, and the remaining tokens are lemmatised with the
    module-level ``lemmatizer``.

    Args:
        text: Value to clean. Anything that is not a ``str`` (for example a
            missing cell that pandas represents as a float NaN) is treated
            as empty.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when ``text``
        is not a string or no token survives the cleaning.
    """
    if not isinstance(text, str):
        return ""
    # One pass is enough: the negated class already maps punctuation, digits
    # and every other non-letter to a space, so no separate (and
    # escape-sensitive) character class built from string.punctuation is
    # needed.
    letters_only = re.sub(r'[^a-z\s]', ' ', text.lower())
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(letters_only)
        if token not in stop_words
    )

# ====================== LOAD MODEL ======================
@st.cache_resource
def load_model():
    """Create the Hugging Face text-classification pipeline used by the app.

    The pipeline is placed on CUDA device 0 when ``torch`` reports that CUDA
    is available and on the CPU (device ``-1``) otherwise. The function is
    wrapped in ``st.cache_resource``.

    Returns:
        The object returned by ``transformers.pipeline`` for the
        ``"text-classification"`` task.
    """
    # Update this identifier if the model is published under another name.
    checkpoint = "Ginidu2003/Distilbert-Base-News-classifier"
    if torch.cuda.is_available():
        device_index = 0
    else:
        device_index = -1
    return pipeline("text-classification", model=checkpoint, device=device_index)

# Obtain the classification pipeline (load_model is wrapped in
# st.cache_resource).
classifier = load_model()

# ====================== STREAMLIT APP ======================
st.title("📰 Daily Mirror News Classifier")
st.subheader("Classify news into Business, Opinion, Political Gossip, Sports, or World News")

st.markdown("**Upload a CSV file** with a column named `content`")

uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])

if uploaded_file is not None:
    # An empty, malformed or non-UTF-8 upload should produce a readable
    # message in the page rather than an unhandled traceback.
    try:
        df = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        st.error(f"Could not read the uploaded file as CSV: {exc}")
    else:
        st.write("### Preview of uploaded data")
        st.dataframe(df.head())

        if 'content' not in df.columns:
            st.error("Your CSV must have a column named 'content'")
        else:
            with st.spinner("Preprocessing and classifying..."):
                # Keep the cleaned text in a local Series instead of a
                # temporary DataFrame column, so an uploaded column that
                # happens to be called 'clean_content' is neither
                # overwritten nor dropped from the output.
                clean_texts = df['content'].apply(preprocess_text)

                predictions = []
                for text in clean_texts:
                    if not text.strip():
                        # Nothing left to classify after preprocessing.
                        predictions.append("Unknown")
                    else:
                        # truncation=True cuts inputs down to the model's
                        # maximum sequence length; without it the pipeline
                        # raises on articles that tokenise to more tokens
                        # than the model accepts.
                        result = classifier(text, truncation=True)[0]
                        predictions.append(result['label'])

                df['class'] = predictions

            st.success("✅ Classification completed!")
            st.write("### Preview of classified data")
            st.dataframe(df.head())

            # Download button
            csv = df.to_csv(index=False).encode('utf-8')
            st.download_button(
                label="📥 Download output.csv",
                data=csv,
                file_name="output.csv",
                mime="text/csv"
            )

st.caption("Built for Text Analytics Assignment - Section 02")