# app.py — Daily Mirror News Analyzer (Streamlit app)
# NOTE(review): the original first lines ("Prageeth-1's picture", "Update
# app.py", commit "06c38a0 verified", "raw / history / blame / 14.4 kB") are
# Hugging Face web-page artifacts captured during copy-paste, not source code.
# They are preserved here as a comment so the file remains valid Python.
import io
import os
import re
import string
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import streamlit as st
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from wordcloud import WordCloud
# --- NLTK setup -------------------------------------------------------------
# Download every required NLTK resource into one explicit, writable directory.
# The original code downloaded 'punkt' twice and fetched 'stopwords'/'wordnet'
# into NLTK's default location, which may not be writable on a hosted Space.
nltk_data_path = "/home/user/nltk_data"
os.makedirs(nltk_data_path, exist_ok=True)  # exist_ok avoids a startup race
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)
for _resource in ("punkt", "stopwords", "wordnet"):
    nltk.download(_resource, download_dir=nltk_data_path, quiet=True)

# Lemmatizer instance shared by the preprocessing code below.
lemmatizer = WordNetLemmatizer()
# Model loaders are cached so Streamlit reruns reuse the same pipeline objects.
@st.cache_resource
def load_classification_model():
    """Load and cache the fine-tuned news-classification pipeline."""
    checkpoint = "Prageeth-1/News_classification.2"
    return pipeline(
        "text-classification",
        model=AutoModelForSequenceClassification.from_pretrained(checkpoint),
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
    )
@st.cache_resource
def load_qa_model():
    """Load and cache the extractive question-answering pipeline."""
    qa_checkpoint = "deepset/roberta-base-squad2"
    return pipeline("question-answering", model=qa_checkpoint)
# Preprocessing function (same steps as in Section 01).
def preprocess_text(df=None):
    """Clean and normalise the ``content`` column of a news DataFrame.

    Adds the columns ``cleaned_content``, ``tokenized_content``,
    ``stemmed_content`` and ``preprocessed_content`` in place and returns
    the DataFrame.

    The original version took no parameters and read a module-level ``df``
    that does not exist at definition time, so any call raised NameError;
    the parameter (defaulting to None for signature compatibility) fixes
    that, and the debug ``print`` of rare words has been removed.

    Raises:
        ValueError: if no DataFrame is supplied.
    """
    if df is None:
        raise ValueError("preprocess_text requires a DataFrame with a 'content' column")

    # Patterns and stopword set are built once, outside the per-row lambdas.
    url_pattern = re.compile(r'http[s]?://\S+[^\s.,;:()"\']')
    email_pattern = re.compile(r'\S+@\S+')
    stop_words = set(stopwords.words('english'))

    cleaned = df["content"].str.lower()
    cleaned = cleaned.apply(lambda t: url_pattern.sub('', t).strip())       # URLs
    cleaned = cleaned.apply(lambda t: email_pattern.sub('', t))             # e-mails
    cleaned = cleaned.apply(
        lambda t: "".join(c for c in t if c not in string.punctuation))     # punctuation
    cleaned = cleaned.apply(
        lambda t: " ".join(w for w in str(t).split() if w not in stop_words))
    cleaned = cleaned.apply(lambda t: re.sub(r'[^A-Za-z\s]', '', t))        # non-letters

    # Corpus-level frequencies drive the frequent/rare-word filters:
    # drop the 10 most common and the 20 rarest words.
    word_count = Counter(cleaned.str.split(expand=True).stack())
    common_words = {w for w, _ in word_count.most_common(10)}
    rare_words = {w for w, _ in word_count.most_common()[:-20 - 1:-1]}
    cleaned = cleaned.apply(
        lambda t: " ".join(w for w in str(t).split() if w not in common_words))
    cleaned = cleaned.apply(
        lambda t: " ".join(w for w in str(t).split() if w not in rare_words))

    df["cleaned_content"] = cleaned
    df["tokenized_content"] = cleaned.apply(str.split)
    stemmer = PorterStemmer()
    df["stemmed_content"] = df["tokenized_content"].apply(
        lambda tokens: [stemmer.stem(tok) for tok in tokens])
    df["preprocessed_content"] = df["stemmed_content"].apply(" ".join)
    return df
# Function to generate a word cloud inside the Streamlit app.
def generate_wordcloud(text, title=None):
    """Draw a word cloud of *text* and display it via Streamlit.

    Args:
        text: Raw text to visualise.
        title: Optional figure title.

    Uses an explicit Figure instead of the global pyplot state and closes
    it after rendering — the original leaked one figure per rerun, and
    passing the bare ``plt`` module to ``st.pyplot`` is deprecated.
    """
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    if title:
        ax.set_title(title, fontsize=20)
    st.pyplot(fig)
    plt.close(fig)  # Streamlit has rendered it; release the memory.
# Set page config — must be the first Streamlit call after the loaders.
st.set_page_config(
page_title="News Analysis Dashboard",
page_icon="📰",
layout="wide",
initial_sidebar_state="expanded"
)
# Custom CSS — styles the main area, buttons and text inputs.
# unsafe_allow_html is required for raw <style> injection.
st.markdown("""
<style>
.main {
background-color: #f5f5f5;
}
.stButton>button {
background-color: #4CAF50;
color: white;
}
.stDownloadButton>button {
background-color: #2196F3;
color: white;
}
.stTextInput>div>div>input {
background-color: #ffffff;
}
</style>
""", unsafe_allow_html=True)
# App title and description shown at the top of the main area.
st.title("📰 Daily Mirror News Analyzer")
st.markdown("""
Analyze news excerpts with our powerful AI tools:
- Classify news articles into categories
- Get answers to your questions about the news content
- Visualize key themes
""")
# Create tabs for the three functionalities; each is populated below.
tab1, tab2, tab3 = st.tabs(["📋 News Classification", "❓ Q&A Pipeline", "✨ Advanced Features"])
with tab1:
    st.header("News Classification Pipeline")
    st.write("Upload a CSV file containing news excerpts to classify them into categories.")

    # File uploader; `uploaded_file` and `df` are also read by the Q&A tab.
    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
    if uploaded_file is None:
        st.warning("Please upload a CSV file.")
    else:
        df = pd.read_csv(uploaded_file)

        # Use the cached loader instead of rebuilding the HF pipeline on
        # every Streamlit rerun (the original reloaded the model each time).
        classifier = load_classification_model()

        # --- Preprocessing (mirrors the training-time cleaning) ----------
        url_pattern = re.compile(r'http[s]?://\S+[^\s.,;:()"\']')
        email_pattern = re.compile(r'\S+@\S+')
        stop_words = set(stopwords.words('english'))

        cleaned = df["content"].str.lower()
        cleaned = cleaned.apply(lambda t: url_pattern.sub('', t).strip())     # URLs
        cleaned = cleaned.apply(lambda t: email_pattern.sub('', t))           # e-mails
        cleaned = cleaned.apply(
            lambda t: "".join(c for c in t if c not in string.punctuation))   # punctuation
        cleaned = cleaned.apply(
            lambda t: " ".join(w for w in str(t).split() if w not in stop_words))
        cleaned = cleaned.apply(lambda t: re.sub(r'[^A-Za-z\s]', '', t))      # non-letters

        # Frequency filters: drop the 10 most common and 20 rarest words
        # across the uploaded corpus.
        word_count = Counter(cleaned.str.split(expand=True).stack())
        common_words = {w for w, _ in word_count.most_common(10)}
        rare_words = {w for w, _ in word_count.most_common()[:-20 - 1:-1]}
        cleaned = cleaned.apply(
            lambda t: " ".join(w for w in str(t).split() if w not in common_words))
        cleaned = cleaned.apply(
            lambda t: " ".join(w for w in str(t).split() if w not in rare_words))

        df["cleaned_content"] = cleaned
        df["tokenized_content"] = cleaned.apply(str.split)
        stemmer = PorterStemmer()
        df["stemmed_content"] = df["tokenized_content"].apply(
            lambda tokens: [stemmer.stem(tok) for tok in tokens])
        df["preprocessed_content"] = df["stemmed_content"].apply(" ".join)

        # Classify each article; truncate inputs longer than the model's
        # maximum sequence length instead of letting the pipeline error out.
        df["Class"] = df["preprocessed_content"].apply(
            lambda text: classifier(text, truncation=True)[0]["label"])

        # Keep only the columns shown to the user.
        df = df[['content', 'Class']]

        # Show results
        st.subheader("Classification Results")
        st.write(df)

        # Show per-class counts as a bar chart.
        st.subheader("Class Distribution")
        st.bar_chart(df['Class'].value_counts())

        # Offer the labelled data as a CSV download.
        st.subheader("Download Results")
        csv = df.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="Download output.csv",
            data=csv,
            file_name='output.csv',
            mime='text/csv',
        )
with tab2:
    st.header("Question Answering Pipeline")
    st.write("Ask questions about news content and get answers from our AI model.")

    # Build the QA context from all uploaded articles.  `context` is now
    # always bound, so the button handler below cannot raise NameError
    # when no file has been uploaded (a bug in the original code).
    context = None
    if uploaded_file is not None:
        context = ' '.join(df['content'].tolist())
        st.write(f"Loaded {len(df)} news excerpts")
    else:
        st.warning("Please upload a CSV file.")

    question = st.text_input("Enter your question:")
    if st.button("Get Answer") and context and question:
        with st.spinner("Searching for answers..."):
            qa_pipeline = load_qa_model()  # cached loader
            result = qa_pipeline(question=question, context=context)
            st.subheader("Answer")
            st.success(result['answer'])
            st.subheader("Details")
            st.write(f"Confidence: {result['score']:.2f}")
with tab3:
    st.header("Advanced Features")
    st.write("Explore additional functionalities to enhance your news analysis.")

    # Sentiment Analysis — guard against empty input, which would be fed
    # straight into the pipeline by the original code.
    st.subheader("📊 Sentiment Analysis")
    sentiment_text = st.text_area("Enter text for sentiment analysis:", height=100)
    if st.button("Analyze Sentiment"):
        if not sentiment_text.strip():
            st.warning("Please enter some text first.")
        else:
            with st.spinner("Analyzing sentiment..."):
                sentiment_pipeline = pipeline("sentiment-analysis")
                result = sentiment_pipeline(sentiment_text)[0]
                st.write(f"Label: {result['label']}")
                st.write(f"Confidence: {result['score']:.2f}")
                if result['label'] == 'POSITIVE':
                    st.success("This text appears positive!")
                else:
                    st.warning("This text appears negative.")

    # Named Entity Recognition — grouped_entities merges word pieces into
    # whole entity spans.
    st.subheader("🏷 Named Entity Recognition")
    ner_text = st.text_area("Enter text for entity recognition:", height=100)
    if st.button("Extract Entities"):
        if not ner_text.strip():
            st.warning("Please enter some text first.")
        else:
            with st.spinner("Identifying entities..."):
                ner_pipeline = pipeline("ner", grouped_entities=True)
                entities = [
                    {
                        "Entity": entity['entity_group'],
                        "Word": entity['word'],
                        "Score": entity['score'],
                    }
                    for entity in ner_pipeline(ner_text)
                ]
                st.table(pd.DataFrame(entities))

    # Text Summarization
    st.subheader("✍ Text Summarization")
    summary_text = st.text_area("Enter text to summarize:", height=150)
    if st.button("Generate Summary"):
        if not summary_text.strip():
            st.warning("Please enter some text first.")
        else:
            with st.spinner("Generating summary..."):
                summarizer = pipeline("summarization")
                summary = summarizer(summary_text, max_length=130, min_length=30)
                st.write(summary[0]['summary_text'])
# Sidebar with additional info
with st.sidebar:
    st.image("https://via.placeholder.com/150x50?text=Daily+Mirror", width=150)
    st.title("About")
    st.write("""
    This app helps analyze news content using AI-powered tools:
    - Classify news into categories
    - Answer questions about news content
    - Perform advanced text analysis
    """)
    st.title("Instructions")
    # The classifier reads df["content"], so the instructions must ask for
    # a 'content' column — the original said 'excerpt', which would lead
    # users to upload a CSV the app cannot process.
    st.write("""
    1. Upload a CSV file with 'content' column
    2. Click classify to categorize news
    3. Download results as CSV
    4. Use Q&A tab to ask questions
    """)
    st.title("Model Information")
    st.write("""
    - Classification: Fine-tuned DistilBERT
    - Q&A: RoBERTa-base
    - Sentiment: DistilBERT-base
    """)
    st.markdown("[View model on Hugging Face](https://huggingface.co/your-username/daily-mirror-news-classifier)")
# Footer — rendered after all tabs and the sidebar on every rerun.
st.markdown("---")
st.markdown("© 2023 Daily Mirror News Analyzer | Powered by Hugging Face Transformers")