import streamlit as st
import pandas as pd
import torch  # ← This was missing
from transformers import pipeline
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string
# ====================== PREPROCESSING ======================
# Fetch the NLTK data used below. `nltk.word_tokenize` loads the Punkt
# sentence models: recent NLTK releases (3.9+) look for "punkt_tab", while
# older releases look for "punkt", so both are requested to work on either.
for _resource in ("stopwords", "wordnet", "punkt", "punkt_tab"):
    nltk.download(_resource, quiet=True)

# English stop words as a set for O(1) membership tests during filtering.
stop_words = set(stopwords.words('english'))
# Shared lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()
# Matches every character that is neither a lowercase ASCII letter nor
# whitespace. Applied after lowercasing, this also covers all of
# `string.punctuation`, so no separate punctuation pass is needed.
_NON_LETTER_RE = re.compile(r'[^a-z\s]')


def preprocess_text(text: object) -> str:
    """Normalize raw text into a space-joined string of lemmatized tokens.

    Steps: lowercase the text, replace every character other than ``a-z``
    or whitespace with a space, tokenize with ``nltk.word_tokenize``, drop
    tokens found in ``stop_words``, and lemmatize the remaining tokens with
    ``lemmatizer``.

    Args:
        text: The value to clean. Anything that is not a ``str`` (for
            example ``None`` or a float NaN) is treated as empty.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when ``text``
        is not a string.
    """
    if not isinstance(text, str):
        return ""

    letters_only = _NON_LETTER_RE.sub(' ', text.lower())
    kept = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(letters_only)
        if token not in stop_words
    ]
    return ' '.join(kept)
# ====================== LOAD MODEL ======================
@st.cache_resource
def load_model():
    """Create the Hugging Face text-classification pipeline for the app.

    The function is decorated with ``st.cache_resource``, Streamlit's cache
    for shared resources, so the pipeline is presumably built once and
    reused across script reruns.

    Returns:
        The object returned by ``transformers.pipeline`` for the
        ``"text-classification"`` task.
    """
    # Change this identifier if your model is published under another name.
    checkpoint = "Ginidu2003/Distilbert-Base-News-classifier"

    # Device index 0 when CUDA is available; otherwise -1, the value the
    # pipeline API uses for CPU.
    if torch.cuda.is_available():
        target_device = 0
    else:
        target_device = -1

    return pipeline("text-classification", model=checkpoint, device=target_device)
# Pipeline used for every prediction below.
classifier = load_model()

# ====================== STREAMLIT APP ======================
st.title("📰 Daily Mirror News Classifier")
st.subheader("Classify news into Business, Opinion, Political Gossip, Sports, or World News")
st.markdown("**Upload a CSV file** with a column named `content`")

uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])

if uploaded_file is not None:
    try:
        df = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        # Report an unreadable upload in the page instead of crashing the script.
        df = None
        st.error(f"Could not read the uploaded CSV file: {exc}")

    if df is not None:
        st.write("### Preview of uploaded data")
        st.dataframe(df.head())

        if 'content' not in df.columns:
            st.error("Your CSV must have a column named 'content'")
        else:
            with st.spinner("Preprocessing and classifying..."):
                # Keep the cleaned text in a local Series rather than a
                # temporary DataFrame column, so an uploaded column that
                # happens to be named `clean_content` is left untouched.
                cleaned = df['content'].apply(preprocess_text)

                # Rows with no usable text are labelled "Unknown" without
                # calling the model. `truncation=True` makes the tokenizer
                # cut inputs that exceed the model's maximum sequence
                # length instead of letting the forward pass fail on them.
                df['class'] = [
                    classifier(text, truncation=True)[0]['label'] if text.strip() else "Unknown"
                    for text in cleaned
                ]

            st.success("✅ Classification completed!")
            st.write("### Preview of classified data")
            st.dataframe(df.head())

            # Download button
            csv = df.to_csv(index=False).encode('utf-8')
            st.download_button(
                label="📥 Download output.csv",
                data=csv,
                file_name="output.csv",
                mime="text/csv",
            )

st.caption("Built for Text Analytics Assignment - Section 02")