import streamlit as st
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import io
from collections import Counter
import string
import os
from nltk.stem import PorterStemmer
# Download NLTK resources.
# Streamlit re-executes this script on every interaction, so the setup below
# is written to be idempotent: the data directory is created only if missing,
# the search path is extended at most once, and each corpus is requested a
# single time (the NLTK downloader skips packages that are already up to date).
nltk_data_path = "/home/user/nltk_data"
os.makedirs(nltk_data_path, exist_ok=True)
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource, download_dir=nltk_data_path, quiet=True)
# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()
# Load models (cached so they are not reloaded on every interaction)
@st.cache_resource
def load_classification_model():
    """Build the text-classification pipeline for the fine-tuned news model.

    The tokenizer and the sequence-classification weights are both loaded
    from the same Hugging Face checkpoint and wrapped in a single pipeline.

    Returns:
        A ``transformers`` "text-classification" pipeline.
    """
    checkpoint = "Imasha17/News_classification.4"  # Replace with your model path
    return pipeline(
        "text-classification",
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        model=AutoModelForSequenceClassification.from_pretrained(checkpoint),
    )
@st.cache_resource
def load_qa_model():
    """Build the extractive question-answering pipeline.

    Returns:
        A ``transformers`` "question-answering" pipeline backed by the
        ``deepset/roberta-base-squad2`` checkpoint.
    """
    qa_checkpoint = "deepset/roberta-base-squad2"
    return pipeline("question-answering", model=qa_checkpoint)
# Function to generate word cloud
def generate_wordcloud(text, title=None):
    """Render a word cloud for *text* in the Streamlit app.

    Args:
        text: Raw text whose word frequencies drive the cloud.
        title: Optional heading drawn above the cloud.

    Returns:
        None. The figure is written straight to the page; when *text*
        contains no usable words a warning is shown instead.
    """
    try:
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    except ValueError:
        # WordCloud raises ValueError when there is not a single word to plot.
        st.warning("Not enough text to generate a word cloud.")
        return

    # Use an explicit figure instead of pyplot's global state: passing the
    # ``plt`` module to st.pyplot relies on a shared figure that is not safe
    # when several sessions render at once.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    if title:
        ax.set_title(title, fontsize=20)
    st.pyplot(fig)
    # Streamlit reruns the script often; close the figure so they do not pile up.
    plt.close(fig)
# Set page config with an attractive icon and layout options
# Sets the page title and icon, selects the wide layout, and starts with the
# sidebar expanded.
st.set_page_config(
page_title="News Analysis Dashboard",
page_icon="📰",
layout="wide",
initial_sidebar_state="expanded"
)
# Custom CSS to improve styling
# NOTE(review): this call currently sends only a newline; presumably the
# stylesheet markup was lost from the literal - confirm and restore.
st.markdown("""
""", unsafe_allow_html=True)
# Banner header
# NOTE(review): same as above - the banner markup appears to be missing.
st.markdown("""
""", unsafe_allow_html=True)
# Layout introduction text
# Welcome message listing what the dashboard offers; rendered as markdown
# with raw HTML allowed.
st.markdown("""
Welcome!
This dashboard allows you to:
- Classify news articles into categories
- Ask questions about the news content
- Visualize sentiment, entities, and summaries
Use the tabs below to navigate between different functionalities.
""", unsafe_allow_html=True)
# Create tabs for different functionalities
tab1, tab2, tab3 = st.tabs(["News Classification", "Ask Questions", "Advanced Features"])

# Text-cleaning patterns, compiled once per script run instead of once per row.
_URL_PATTERN = re.compile(r'http[s]?://\S+[^\s.,;:()"\']')
_EMAIL_PATTERN = re.compile(r'\S+@\S+')
_NON_LETTER_PATTERN = re.compile(r'[^A-Za-z\s]')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _clean_text(text, stop_words):
    """Lower-case *text* and strip URLs, e-mails, punctuation, stop words and non-letters."""
    text = text.lower()
    text = _URL_PATTERN.sub('', text).strip()
    text = _EMAIL_PATTERN.sub('', text)
    text = text.translate(_PUNCTUATION_TABLE)
    text = " ".join(word for word in text.split() if word not in stop_words)
    return _NON_LETTER_PATTERN.sub('', text)


def _preprocess_articles(contents, stop_words, n_common=10, n_rare=20):
    """Return one cleaned, frequency-filtered and stemmed string per article.

    Args:
        contents: Iterable of article texts (plain ``str`` values).
        stop_words: Set of words removed during cleaning.
        n_common: How many of the most frequent corpus words to drop.
        n_rare: How many of the least frequent corpus words to drop.

    Returns:
        A list of preprocessed strings, in the same order as *contents*.
    """
    cleaned = [_clean_text(text, stop_words) for text in contents]
    word_count = Counter(word for text in cleaned for word in text.split())
    # Both filters are derived from the same corpus-wide counts, so dropping
    # the union in one pass gives the same result as two sequential passes.
    dropped = {word for word, _ in word_count.most_common(n_common)}
    dropped.update(word for word, _ in word_count.most_common()[:-n_rare - 1:-1])
    stemmer = PorterStemmer()
    return [
        " ".join(stemmer.stem(word) for word in text.split() if word not in dropped)
        for text in cleaned
    ]


with tab1:
    # NOTE(review): this call and the matching one at the end of the tab send
    # no visible markup; presumably an HTML wrapper was lost - confirm.
    st.markdown('', unsafe_allow_html=True)
    st.header("News Classification ")
    st.write("Upload a CSV file containing news excerpts to classify them into categories.")
    # File uploader with a descriptive message
    uploaded_file = st.file_uploader("Choose a CSV file (must contain a 'content' column)", type="csv")
    if uploaded_file is None:
        st.warning("Please upload a CSV file to get started.")
    else:
        df = pd.read_csv(uploaded_file)
        # Preview uploaded data
        st.subheader("Preview Uploaded Data")
        st.dataframe(df.head(5))
        # Everything below needs the 'content' column; halt the run with a
        # readable message instead of an uncaught KeyError.
        if "content" not in df.columns:
            st.error("The uploaded CSV must contain a 'content' column.")
            st.stop()
        # Reuse the cached classifier rather than rebuilding it on every rerun
        classifier = load_classification_model()
        # Missing cells are read as NaN (float); treat them as empty text so
        # the string cleaners do not crash. The original column is untouched.
        contents = df["content"].fillna("").astype(str)
        df["preprocessed_content"] = _preprocess_articles(
            contents, set(stopwords.words('english'))
        )
        # Classify all articles in one call; truncation keeps articles longer
        # than the model's maximum sequence length from raising.
        texts = df["preprocessed_content"].tolist()
        predictions = classifier(texts, truncation=True) if texts else []
        df["Class"] = [prediction["label"] for prediction in predictions]
        # Word Cloud Visualization
        st.subheader("Word Cloud of News Content")
        try:
            cloud = WordCloud(width=800, height=400).generate(' '.join(texts))
        except ValueError:
            # Raised by WordCloud when no words are left to plot.
            st.info("Not enough text left after preprocessing to draw a word cloud.")
        else:
            fig, ax = plt.subplots(figsize=(10, 5))
            ax.imshow(cloud, interpolation='bilinear')
            ax.axis('off')
            st.pyplot(fig)
            # Close the figure so reruns do not accumulate open figures.
            plt.close(fig)
        # Keep only necessary columns
        df = df[['content', 'Class']]
        # Show classification results
        st.subheader("Classification Results")
        st.write(df)
        # Show class distribution
        st.subheader("Class Distribution")
        class_dist = df['Class'].value_counts()
        st.bar_chart(class_dist)
        # Download CSV file
        st.subheader("Download Results")
        csv = df.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="Download output.csv",
            data=csv,
            file_name='output.csv',
            mime='text/csv'
        )
    # The original literal was split across two lines (a syntax error);
    # it is written here as an explicit newline.
    st.markdown('\n', unsafe_allow_html=True)
with tab2:
    # NOTE(review): this call and the matching one at the end of the tab send
    # no visible markup; presumably an HTML wrapper was lost - confirm.
    st.markdown('', unsafe_allow_html=True)
    st.header("Ask Questions Based on Uploaded News Content File")
    st.write("Ask questions about news content and get answers from our AI model.")
    # Build the context from the uploaded file, if any. `context` is always
    # defined so later branches never hit an unbound name.
    context = ""
    if uploaded_file is not None:
        # Empty cells are read as NaN (float) and would break str.join.
        context = ' '.join(df['content'].dropna().astype(str))
        st.write(f"Loaded {len(df)} news excerpts")
    else:
        st.warning("Please upload a CSV file.")
    # Generate the answer based on the uploaded news content file
    question = st.text_input("Enter your question:")
    if st.button("Get Answer"):
        if uploaded_file is None:
            st.error("Please upload a CSV file before asking a question.")
        elif not question.strip():
            st.error("Please enter a question.")
        elif not context.strip():
            st.error("The uploaded file has no news content to search.")
        else:
            with st.spinner("Searching for answers..."):
                # Load the (cached) Q&A pipeline
                qa_pipeline = load_qa_model()
                result = qa_pipeline(question=question, context=context)
            st.subheader("Answer")
            st.success(result['answer'])
            st.subheader("Details")
            st.write(f"Confidence: {result['score']:.2f}")
    # Generate the answer based on news content typed in by the user
    st.markdown("---")
    st.header("Ask Questions Based on Your News Content")
    context_1 = st.text_area("Enter News Content", height=100)
    question_1 = st.text_input("Enter your question:", key="question_input")
    if st.button("Get Answer", key="get_answer_1"):
        # Both fields must hold more than whitespace before the model is called
        if context_1.strip() and question_1.strip():
            with st.spinner("Searching for answers..."):
                qa_pipeline = load_qa_model()
                answer_1 = qa_pipeline(question=question_1, context=context_1)
            st.success(f"Answer: {answer_1['answer']}")
        else:
            st.warning("Provide both context and question.")
    # The original literal was split across two lines (a syntax error);
    # it is written here as an explicit newline.
    st.markdown('\n', unsafe_allow_html=True)
# Cached loaders for the default Hugging Face pipelines used in this tab, so
# the models are built once per process instead of on every button click.
@st.cache_resource
def _load_ner_pipeline():
    """Return the default NER pipeline with sub-word tokens merged into entities."""
    # aggregation_strategy="simple" is the replacement for the deprecated
    # grouped_entities=True flag.
    return pipeline("ner", aggregation_strategy="simple")


@st.cache_resource
def _load_summarization_pipeline():
    """Return the default summarization pipeline."""
    return pipeline("summarization")


@st.cache_resource
def _load_sentiment_pipeline():
    """Return the default sentiment-analysis pipeline."""
    return pipeline("sentiment-analysis")


with tab3:
    # NOTE(review): this call sends no visible markup; presumably an HTML
    # wrapper was lost - confirm.
    st.markdown('', unsafe_allow_html=True)
    st.header("Advanced Features")
    st.write("Explore additional functionalities to enhance your news analysis.")
    # Named Entity Recognition of news content
    st.subheader("Named Entity Recognition Of News Content")
    ner_text = st.text_area("Enter News Content for entity recognition:", height=100)
    if st.button("Extract Entities"):
        if not ner_text.strip():
            st.warning("Please enter some news content first.")
        else:
            with st.spinner("Identifying entities..."):
                results = _load_ner_pipeline()(ner_text)
            entities = [
                {
                    "Entity": entity['entity_group'],
                    "Word": entity['word'],
                    "Score": entity['score'],
                }
                for entity in results
            ]
            if entities:
                st.table(pd.DataFrame(entities))
            else:
                st.info("No entities found.")
    # Text Summarization
    st.subheader("News Content Summarization")
    summary_text = st.text_area("Enter news content to summarize:", height=150)
    if st.button("Generate Summary"):
        if not summary_text.strip():
            st.warning("Please enter some news content first.")
        else:
            with st.spinner("Generating summary..."):
                # truncation=True keeps inputs longer than the model's
                # maximum length from raising inside the pipeline.
                summary = _load_summarization_pipeline()(
                    summary_text, max_length=130, min_length=30, truncation=True
                )
            st.write(summary[0]['summary_text'])
    # The original literal was split across two lines (a syntax error);
    # it is written here as an explicit newline.
    st.markdown('\n', unsafe_allow_html=True)
    # Sentiment Analysis
    st.subheader("News Tone Detector")
    sentiment_text = st.text_area("Enter text for news content analysis:", height=100)
    if st.button("Analyze Tone"):
        if not sentiment_text.strip():
            st.warning("Please enter some news content first.")
        else:
            with st.spinner("Analyzing sentiment..."):
                result = _load_sentiment_pipeline()(sentiment_text, truncation=True)[0]
            st.write(f"Label: {result['label']}")
            st.write(f"Confidence: {result['score']:.2f}")
            if result['label'] == 'POSITIVE':
                st.success("This text appears positive!")
            else:
                st.warning("This text appears negative.")
# Enhanced Sidebar with branding and instructions
with st.sidebar:
    # Show the logo only when the asset is present, so a missing image file
    # does not take down the whole sidebar.
    logo_path = "news_logo.jpg"
    if os.path.isfile(logo_path):
        st.image(logo_path, width=300)
    st.title("About")
    st.write("""
This app helps analyze news content:
- Classify news into categories
- Answer questions about news content
- Perform advanced text analysis
""")
    st.title("Instructions")
    st.write("""
1. Upload a CSV file with a 'content' column.
2. Click on the appropriate tab to use a feature.
3. Download results as CSV.
4. Use the Q&A tab to ask questions about the news.
""")
    st.markdown("[View model on Hugging Face](https://huggingface.co/Imasha17/News_classification.4)")
# Footer
st.markdown("---")
# The original literal was split across two lines (a syntax error); the line
# break is kept as an explicit "\n".
# NOTE(review): surrounding HTML markup appears to have been lost - confirm.
st.markdown(
    "© 2023 Daily Mirror News Analyzer | Powered by Hugging Face Transformers\n",
    unsafe_allow_html=True,
)