Spaces:

Ashendilantha
/

News_Classification

Sleeping

App Files Files Community

News_Classification / app.py

Ashendilantha

Update app.py

47aaa4b verified about 1 year ago

raw

history blame

8.03 kB

	import streamlit as st
	import pandas as pd
	import torch
	import re
	from nltk.corpus import stopwords
	from nltk.tokenize import word_tokenize
	from nltk.stem import WordNetLemmatizer
	from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
	import nltk

	# Page-level Streamlit settings: page title and wide layout
	st.set_page_config(
	    page_title="News Analysis App",
	    layout="wide",
	)

	# Download required NLTK resources
	@st.cache_resource
	def download_nltk_resources():
	    """Download the NLTK data packages used by the preprocessing code.

	    Wrapped in ``st.cache_resource`` so the downloads are not repeated on
	    every Streamlit rerun.
	    """
	    # 'punkt_tab' is the tokenizer data that word_tokenize looks up on recent
	    # NLTK releases, while older releases use 'punkt'; request both so
	    # tokenization works on either.
	    # NOTE(review): nltk.download is expected to report (not raise) when a
	    # package id is unknown to the installed NLTK version - confirm.
	    for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
	        nltk.download(resource)


	download_nltk_resources()

	# Shared preprocessing helpers, built once at import time:
	# a WordNet lemmatizer and the English stopword list as a set
	lemmatizer = WordNetLemmatizer()
	stop_words = set(stopwords.words('english'))

	# Load the fine-tuned model for classification
	@st.cache_resource
	def load_classification_model():
	    """Load the sequence-classification model and its tokenizer.

	    Both are loaded from the same checkpoint name, and the result is cached
	    by ``st.cache_resource``.

	    Returns:
	        A ``(model, tokenizer)`` tuple.
	    """
	    checkpoint = "Oneli/News_Classification"  # Replace with your actual model path
	    news_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
	    news_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
	    return news_model, news_tokenizer

	# Load Q&A pipeline
	@st.cache_resource
	def load_qa_pipeline():
	    """Build the question-answering pipeline.

	    The result is cached by ``st.cache_resource``.

	    Returns:
	        The object returned by ``pipeline("question-answering", ...)``.
	    """
	    return pipeline(
	        "question-answering",
	        model="distilbert-base-cased-distilled-squad",
	    )

	# Text preprocessing function
	def preprocess_text(text):
	    """Normalize raw article text before it is passed to the classifier.

	    Steps, in order: lowercase, strip URLs, strip HTML tags, drop every
	    character that is not a letter or whitespace, tokenize, remove English
	    stopwords, lemmatize, and re-join with single spaces.

	    Args:
	        text: Raw article text. Missing values (``None``/NaN) are accepted,
	            and other non-string scalars are converted with ``str``.

	    Returns:
	        The cleaned text, or ``""`` when ``text`` is missing.
	    """
	    if pd.isna(text):
	        return ""

	    # Convert to lowercase; str() keeps non-string cells (e.g. numbers read
	    # from a CSV) from crashing on .lower()
	    text = str(text).lower()

	    # Remove URLs. The alternatives must be separated by a bare '|': an
	    # escaped '\|' matches a literal pipe character, so nothing was removed.
	    text = re.sub(r'http\S+|www\S+', '', text)

	    # Remove HTML tags
	    text = re.sub(r'<.*?>', '', text)

	    # Remove special characters and numbers
	    text = re.sub(r'[^a-zA-Z\s]', '', text)

	    # Tokenize
	    tokens = word_tokenize(text)

	    # Remove stopwords and lemmatize
	    cleaned_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

	    # Join tokens back into text
	    return ' '.join(cleaned_tokens)

	# Function to classify news articles (bulk processing)
	def classify_news(df, model, tokenizer):
	    """Classify every article in ``df`` and attach the predicted labels.

	    Adds two columns to ``df`` in place: ``'cleaned_content'`` (the
	    preprocessed ``'content'`` column) and ``'class'`` (the predicted label
	    name taken from ``model.config.id2label``).

	    Args:
	        df: DataFrame with a ``'content'`` column.
	        model: Classification model; called as ``model(**inputs)`` and
	            expected to expose ``.logits`` on its output.
	        tokenizer: Tokenizer callable matching ``model``.

	    Returns:
	        The same ``df`` object with the two columns added.
	    """
	    df['cleaned_content'] = df['content'].apply(preprocess_text)
	    cleaned = df['cleaned_content'].tolist()

	    # Run the model over fixed-size slices of the cleaned texts
	    chunk_size = 16
	    label_ids = []
	    for start in range(0, len(cleaned), chunk_size):
	        encoded = tokenizer(
	            cleaned[start:start + chunk_size],
	            padding=True,
	            truncation=True,
	            max_length=512,
	            return_tensors="pt",
	        )

	        # Inference only, so gradient tracking is switched off
	        with torch.no_grad():
	            chunk_logits = model(**encoded).logits

	        label_ids.extend(torch.argmax(chunk_logits, dim=1).tolist())

	    # Translate numeric class ids into label names
	    label_names = model.config.id2label
	    df['class'] = [label_names[label_id] for label_id in label_ids]
	    return df

	# Function for single article classification
	def classify_single_article(text, model, tokenizer):
	    """Classify one article and report the confidence of the prediction.

	    Args:
	        text: Raw article text; it is run through ``preprocess_text`` first.
	        model: Classification model; called as ``model(**inputs)`` and
	            expected to expose ``.logits`` on its output.
	        tokenizer: Tokenizer callable matching ``model``.

	    Returns:
	        A ``(category, confidence)`` tuple: the label name from
	        ``model.config.id2label`` and the highest softmax probability as a
	        percentage rounded to two decimals.
	    """
	    encoded = tokenizer(
	        preprocess_text(text),
	        padding=True,
	        truncation=True,
	        max_length=512,
	        return_tensors="pt",
	    )

	    # Inference only, so gradient tracking is switched off
	    with torch.no_grad():
	        scores = model(**encoded).logits

	    best_index = torch.argmax(scores, dim=1).item()
	    probabilities = torch.nn.functional.softmax(scores, dim=1)
	    confidence_pct = probabilities.max().item() * 100

	    return model.config.id2label[best_index], round(confidence_pct, 2)

	def _render_classification_page():
	    """Render the single-article form and the bulk CSV classification flow."""
	    st.header("📰 Single Article Classification")
	    st.write("Enter a news article or upload a CSV file to classify the content.")

	    # Text input for single article classification
	    text_input = st.text_area("Enter News Text", placeholder="Type or paste news content here...")
	    if st.button("🔍 Classify"):
	        # Whitespace-only input has nothing to classify, so treat it as empty
	        if text_input and text_input.strip():
	            # Load classification model
	            with st.spinner("Loading classification model..."):
	                model, tokenizer = load_classification_model()

	            # Classify the text
	            with st.spinner("Classifying the article..."):
	                category, confidence = classify_single_article(text_input, model, tokenizer)
	            st.write(f"Predicted Category: {category}")
	            st.write(f"Confidence Level: {confidence}%")
	        else:
	            st.warning("Please enter some text to classify.")

	    # File upload for bulk classification
	    st.subheader("📂 Bulk Classification (CSV)")
	    file_input = st.file_uploader("Upload CSV File", type="csv")
	    if not file_input:
	        return

	    # An empty or malformed upload should produce a readable message rather
	    # than an unhandled exception
	    try:
	        df = pd.read_csv(file_input)
	    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
	        st.error(f"Could not read the uploaded CSV file: {exc}")
	        return

	    # Display sample of the data
	    st.subheader("Sample of uploaded data")
	    st.dataframe(df.head())

	    # Check if the required column exists
	    if 'content' not in df.columns:
	        st.error("The CSV file must contain a 'content' column with the news articles text.")
	        return

	    # Load model and tokenizer
	    with st.spinner("Loading classification model..."):
	        model, tokenizer = load_classification_model()

	    if not st.button("Classify Articles"):
	        return

	    with st.spinner("Classifying news articles..."):
	        result_df = classify_news(df, model, tokenizer)

	    # Display results
	    st.subheader("Classification Results")
	    st.dataframe(result_df[['content', 'class']])

	    # Offer the full result table as a CSV download
	    csv = result_df.to_csv(index=False)
	    st.download_button(
	        label="Download output.csv",
	        data=csv,
	        file_name="output.csv",
	        mime="text/csv"
	    )

	    # Show distribution of classes
	    st.subheader("Class Distribution")
	    class_counts = result_df['class'].value_counts()
	    st.bar_chart(class_counts)


	def _render_qa_page():
	    """Render the question-answering form and show the model's answer."""
	    st.header("💬 AI Chat Assistant")
	    st.write("Ask questions about news content and get answers using a Q&A model.")

	    # Text area for news content
	    news_content = st.text_area("Paste news article content here:", height=200)

	    # Question input
	    question = st.text_input("Enter your question about the article:")

	    # Both fields must contain real (non-whitespace) text before the model is
	    # loaded and the button is offered
	    if not (news_content and news_content.strip() and question and question.strip()):
	        return

	    # Load QA pipeline
	    with st.spinner("Loading Q&A model..."):
	        qa_pipeline = load_qa_pipeline()

	    # Get answer
	    if st.button("Get Answer"):
	        with st.spinner("Finding answer..."):
	            result = qa_pipeline(question=question, context=news_content)

	        # Display results
	        st.subheader("Answer")
	        st.write(result["answer"])

	        st.subheader("Confidence")
	        st.progress(float(result["score"]))
	        st.write(f"Confidence Score: {result['score']:.4f}")


	# Main app
	def main():
	    """Draw the title and sidebar, then render the page for the chosen mode."""
	    st.title("News Classifier 📢")

	    # Sidebar for navigation
	    st.sidebar.title("Navigation")
	    app_mode = st.sidebar.radio("Choose the app mode", ["News Classification", "Question Answering"])

	    if app_mode == "News Classification":
	        _render_classification_page()
	    elif app_mode == "Question Answering":
	        _render_qa_page()


	if __name__ == "__main__":
	    main()