File size: 2,929 Bytes
59c07da
c368330
6472126
c368330
 
 
 
 
 
 
ab4f49e
191b0d0
 
 
25346d7
191b0d0
 
c368330
191b0d0
 
 
 
 
 
 
 
 
 
 
 
6472126
c368330
6472126
 
 
 
 
 
c368330
 
 
6472126
c368330
 
 
 
 
 
 
 
 
6472126
c368330
 
 
 
 
 
191b0d0
 
c368330
 
191b0d0
6472126
c368330
 
 
 
 
 
191b0d0
c368330
 
6472126
c368330
 
6472126
c368330
 
 
 
 
 
 
59c07da
c368330
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import streamlit as st
import pandas as pd
import torch                              # ← This was missing
from transformers import pipeline
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string

# ====================== PREPROCESSING ======================
# Fetch the NLTK data used by preprocess_text. `nltk.word_tokenize` relies on the
# Punkt sentence models, which NLTK >= 3.8.2 looks up under the id 'punkt_tab'
# and older releases under 'punkt'; both are requested so tokenization works on
# either. nltk.download() signals failure through its return value rather than
# raising by default, so an id the installed version cannot fetch is harmless.
for _resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(_resource, quiet=True)

# English stop words as a set for O(1) membership tests during filtering.
stop_words = set(stopwords.words('english'))
# Shared WordNet lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

# Matches every character that is not a lowercase ASCII letter or whitespace
# (punctuation, digits, accented letters, symbols). Compiled once so it is not
# rebuilt for each row.
_NON_LETTER_RE = re.compile(r'[^a-z\s]')


def preprocess_text(text):
    """Normalize raw text into a space-joined string of lemmatized tokens.

    Steps: lowercase the text, replace every character other than ``a-z`` and
    whitespace with a space, tokenize with ``nltk.word_tokenize``, drop tokens
    found in ``stop_words``, lemmatize the rest with ``lemmatizer`` and join
    them with single spaces.

    Args:
        text: The value to clean. Anything that is not a ``str`` (for example
            a NaN from an empty CSV cell) is treated as empty.

    Returns:
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    # A single pass suffices: punctuation is already "not a-z or whitespace",
    # so a separate punctuation-stripping regex would change nothing.
    text = _NON_LETTER_RE.sub(' ', text.lower())
    tokens = nltk.word_tokenize(text)
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

# ====================== LOAD MODEL ======================
@st.cache_resource
def load_model():
    """Create the text-classification pipeline for the news classifier.

    The function is wrapped in ``st.cache_resource`` so Streamlit can reuse
    the constructed pipeline instead of building it again.

    Returns:
        The object returned by ``transformers.pipeline`` for the
        ``"text-classification"`` task.
    """
    # Hugging Face model id; change it if your model name is different.
    model_name = "Ginidu2003/Distilbert-Base-News-classifier"

    # Device index handed to pipeline(): 0 when CUDA is available, otherwise -1.
    if torch.cuda.is_available():
        device_index = 0
    else:
        device_index = -1

    return pipeline("text-classification", model=model_name, device=device_index)

# Build (or fetch from Streamlit's cache) the classification pipeline once per script run.
classifier = load_model()

# ====================== STREAMLIT APP ======================
st.title("📰 Daily Mirror News Classifier")
st.subheader("Classify news into Business, Opinion, Political Gossip, Sports, or World News")

st.markdown("**Upload a CSV file** with a column named `content`")

uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])

if uploaded_file is not None:
    try:
        df = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        # Show a readable message for an empty or malformed upload instead of
        # letting the exception surface as a raw traceback in the page.
        st.error(f"Could not read the uploaded file as CSV: {exc}")
        df = None

    if df is not None:
        st.write("### Preview of uploaded data")
        st.dataframe(df.head())

        if 'content' not in df.columns:
            st.error("Your CSV must have a column named 'content'")
        else:
            with st.spinner("Preprocessing and classifying..."):
                # Keep the cleaned text in a local Series rather than a temporary
                # DataFrame column, so an uploaded column that happens to be
                # called 'clean_content' is neither overwritten nor dropped.
                cleaned = df['content'].apply(preprocess_text)

                # Rows with nothing left after preprocessing are labelled
                # "Unknown". truncation=True asks the tokenizer to cut inputs
                # down to its configured maximum length, so long articles do
                # not fail inside the model.
                df['class'] = [
                    classifier(text, truncation=True)[0]['label'] if text.strip() else "Unknown"
                    for text in cleaned
                ]

            st.success("✅ Classification completed!")
            st.write("### Preview of classified data")
            st.dataframe(df.head())

            # Download button: serve the classified table as UTF-8 encoded CSV.
            csv = df.to_csv(index=False).encode('utf-8')
            st.download_button(
                label="📥 Download output.csv",
                data=csv,
                file_name="output.csv",
                mime="text/csv"
            )

st.caption("Built for Text Analytics Assignment - Section 02")