# app.py — Daily Mirror News Analyzer (Streamlit app)
# NOTE(review): the original first lines ("Prageeth-1's picture", "Update
# app.py", commit "06c38a0 verified", "raw / history / blame / 14.4 kB") are
# Hugging Face web-page artifacts captured during copy-paste, not source code.
# They are preserved here as a comment so the file remains valid Python.
import io
import os
import re
import string
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import streamlit as st
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from wordcloud import WordCloud
# --- NLTK setup -------------------------------------------------------------
# Download every required NLTK resource into one explicit, writable directory.
# The original code downloaded 'punkt' twice and fetched 'stopwords'/'wordnet'
# into NLTK's default location, which may not be writable on a hosted Space.
nltk_data_path = "/home/user/nltk_data"
os.makedirs(nltk_data_path, exist_ok=True)  # exist_ok avoids a startup race
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)
for _resource in ("punkt", "stopwords", "wordnet"):
    nltk.download(_resource, download_dir=nltk_data_path, quiet=True)

# Lemmatizer instance shared by the preprocessing code below.
lemmatizer = WordNetLemmatizer()
# Model loaders are cached so Streamlit reruns reuse the same pipeline objects.
@st.cache_resource
def load_classification_model():
    """Load and cache the fine-tuned news-classification pipeline."""
    checkpoint = "Prageeth-1/News_classification.2"
    return pipeline(
        "text-classification",
        model=AutoModelForSequenceClassification.from_pretrained(checkpoint),
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
    )
@st.cache_resource
def load_qa_model():
    """Load and cache the extractive question-answering pipeline."""
    qa_checkpoint = "deepset/roberta-base-squad2"
    return pipeline("question-answering", model=qa_checkpoint)
# Preprocessing function (same steps as in Section 01).
def preprocess_text(df=None):
    """Clean and normalise the ``content`` column of a news DataFrame.

    Adds the columns ``cleaned_content``, ``tokenized_content``,
    ``stemmed_content`` and ``preprocessed_content`` in place and returns
    the DataFrame.

    The original version took no parameters and read a module-level ``df``
    that does not exist at definition time, so any call raised NameError;
    the parameter (defaulting to None for signature compatibility) fixes
    that, and the debug ``print`` of rare words has been removed.

    Raises:
        ValueError: if no DataFrame is supplied.
    """
    if df is None:
        raise ValueError("preprocess_text requires a DataFrame with a 'content' column")

    # Patterns and stopword set are built once, outside the per-row lambdas.
    url_pattern = re.compile(r'http[s]?://\S+[^\s.,;:()"\']')
    email_pattern = re.compile(r'\S+@\S+')
    stop_words = set(stopwords.words('english'))

    cleaned = df["content"].str.lower()
    cleaned = cleaned.apply(lambda t: url_pattern.sub('', t).strip())       # URLs
    cleaned = cleaned.apply(lambda t: email_pattern.sub('', t))             # e-mails
    cleaned = cleaned.apply(
        lambda t: "".join(c for c in t if c not in string.punctuation))     # punctuation
    cleaned = cleaned.apply(
        lambda t: " ".join(w for w in str(t).split() if w not in stop_words))
    cleaned = cleaned.apply(lambda t: re.sub(r'[^A-Za-z\s]', '', t))        # non-letters

    # Corpus-level frequencies drive the frequent/rare-word filters:
    # drop the 10 most common and the 20 rarest words.
    word_count = Counter(cleaned.str.split(expand=True).stack())
    common_words = {w for w, _ in word_count.most_common(10)}
    rare_words = {w for w, _ in word_count.most_common()[:-20 - 1:-1]}
    cleaned = cleaned.apply(
        lambda t: " ".join(w for w in str(t).split() if w not in common_words))
    cleaned = cleaned.apply(
        lambda t: " ".join(w for w in str(t).split() if w not in rare_words))

    df["cleaned_content"] = cleaned
    df["tokenized_content"] = cleaned.apply(str.split)
    stemmer = PorterStemmer()
    df["stemmed_content"] = df["tokenized_content"].apply(
        lambda tokens: [stemmer.stem(tok) for tok in tokens])
    df["preprocessed_content"] = df["stemmed_content"].apply(" ".join)
    return df
# Function to generate a word cloud inside the Streamlit app.
def generate_wordcloud(text, title=None):
    """Draw a word cloud of *text* and display it via Streamlit.

    Args:
        text: Raw text to visualise.
        title: Optional figure title.

    Uses an explicit Figure instead of the global pyplot state and closes
    it after rendering — the original leaked one figure per rerun, and
    passing the bare ``plt`` module to ``st.pyplot`` is deprecated.
    """
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    if title:
        ax.set_title(title, fontsize=20)
    st.pyplot(fig)
    plt.close(fig)  # Streamlit has rendered it; release the memory.
# Set page config — must be the first Streamlit call after the loaders.
st.set_page_config(
page_title="News Analysis Dashboard",
page_icon="📰",
layout="wide",
initial_sidebar_state="expanded"
)
# Custom CSS — styles the main area, buttons and text inputs.
# unsafe_allow_html is required for raw <style> injection.
st.markdown("""
<style>
.main {
background-color: #f5f5f5;
}
.stButton>button {
background-color: #4CAF50;
color: white;
}
.stDownloadButton>button {
background-color: #2196F3;
color: white;
}
.stTextInput>div>div>input {
background-color: #ffffff;
}
</style>
""", unsafe_allow_html=True)
# App title and description shown at the top of the main area.
st.title("📰 Daily Mirror News Analyzer")
st.markdown("""
Analyze news excerpts with our powerful AI tools:
- Classify news articles into categories
- Get answers to your questions about the news content
- Visualize key themes
""")
# Create tabs for the three functionalities; each is populated below.
tab1, tab2, tab3 = st.tabs(["📋 News Classification", "❓ Q&A Pipeline", "✨ Advanced Features"])
with tab1:
    st.header("News Classification Pipeline")
    st.write("Upload a CSV file containing news excerpts to classify them into categories.")

    # File uploader; `uploaded_file` and `df` are also read by the Q&A tab.
    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
    if uploaded_file is None:
        st.warning("Please upload a CSV file.")
    else:
        df = pd.read_csv(uploaded_file)

        # Use the cached loader instead of rebuilding the HF pipeline on
        # every Streamlit rerun (the original reloaded the model each time).
        classifier = load_classification_model()

        # --- Preprocessing (mirrors the training-time cleaning) ----------
        url_pattern = re.compile(r'http[s]?://\S+[^\s.,;:()"\']')
        email_pattern = re.compile(r'\S+@\S+')
        stop_words = set(stopwords.words('english'))

        cleaned = df["content"].str.lower()
        cleaned = cleaned.apply(lambda t: url_pattern.sub('', t).strip())     # URLs
        cleaned = cleaned.apply(lambda t: email_pattern.sub('', t))           # e-mails
        cleaned = cleaned.apply(
            lambda t: "".join(c for c in t if c not in string.punctuation))   # punctuation
        cleaned = cleaned.apply(
            lambda t: " ".join(w for w in str(t).split() if w not in stop_words))
        cleaned = cleaned.apply(lambda t: re.sub(r'[^A-Za-z\s]', '', t))      # non-letters

        # Frequency filters: drop the 10 most common and 20 rarest words
        # across the uploaded corpus.
        word_count = Counter(cleaned.str.split(expand=True).stack())
        common_words = {w for w, _ in word_count.most_common(10)}
        rare_words = {w for w, _ in word_count.most_common()[:-20 - 1:-1]}
        cleaned = cleaned.apply(
            lambda t: " ".join(w for w in str(t).split() if w not in common_words))
        cleaned = cleaned.apply(
            lambda t: " ".join(w for w in str(t).split() if w not in rare_words))

        df["cleaned_content"] = cleaned
        df["tokenized_content"] = cleaned.apply(str.split)
        stemmer = PorterStemmer()
        df["stemmed_content"] = df["tokenized_content"].apply(
            lambda tokens: [stemmer.stem(tok) for tok in tokens])
        df["preprocessed_content"] = df["stemmed_content"].apply(" ".join)

        # Classify each article; truncate inputs longer than the model's
        # maximum sequence length instead of letting the pipeline error out.
        df["Class"] = df["preprocessed_content"].apply(
            lambda text: classifier(text, truncation=True)[0]["label"])

        # Keep only the columns shown to the user.
        df = df[['content', 'Class']]

        # Show results
        st.subheader("Classification Results")
        st.write(df)

        # Show per-class counts as a bar chart.
        st.subheader("Class Distribution")
        st.bar_chart(df['Class'].value_counts())

        # Offer the labelled data as a CSV download.
        st.subheader("Download Results")
        csv = df.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="Download output.csv",
            data=csv,
            file_name='output.csv',
            mime='text/csv',
        )
with tab2:
    st.header("Question Answering Pipeline")
    st.write("Ask questions about news content and get answers from our AI model.")

    # Build the QA context from all uploaded articles.  `context` is now
    # always bound, so the button handler below cannot raise NameError
    # when no file has been uploaded (a bug in the original code).
    context = None
    if uploaded_file is not None:
        context = ' '.join(df['content'].tolist())
        st.write(f"Loaded {len(df)} news excerpts")
    else:
        st.warning("Please upload a CSV file.")

    question = st.text_input("Enter your question:")
    if st.button("Get Answer") and context and question:
        with st.spinner("Searching for answers..."):
            qa_pipeline = load_qa_model()  # cached loader
            result = qa_pipeline(question=question, context=context)
            st.subheader("Answer")
            st.success(result['answer'])
            st.subheader("Details")
            st.write(f"Confidence: {result['score']:.2f}")
with tab3:
    st.header("Advanced Features")
    st.write("Explore additional functionalities to enhance your news analysis.")

    # Sentiment Analysis — guard against empty input, which would be fed
    # straight into the pipeline by the original code.
    st.subheader("📊 Sentiment Analysis")
    sentiment_text = st.text_area("Enter text for sentiment analysis:", height=100)
    if st.button("Analyze Sentiment"):
        if not sentiment_text.strip():
            st.warning("Please enter some text first.")
        else:
            with st.spinner("Analyzing sentiment..."):
                sentiment_pipeline = pipeline("sentiment-analysis")
                result = sentiment_pipeline(sentiment_text)[0]
                st.write(f"Label: {result['label']}")
                st.write(f"Confidence: {result['score']:.2f}")
                if result['label'] == 'POSITIVE':
                    st.success("This text appears positive!")
                else:
                    st.warning("This text appears negative.")

    # Named Entity Recognition — grouped_entities merges word pieces into
    # whole entity spans.
    st.subheader("🏷 Named Entity Recognition")
    ner_text = st.text_area("Enter text for entity recognition:", height=100)
    if st.button("Extract Entities"):
        if not ner_text.strip():
            st.warning("Please enter some text first.")
        else:
            with st.spinner("Identifying entities..."):
                ner_pipeline = pipeline("ner", grouped_entities=True)
                entities = [
                    {
                        "Entity": entity['entity_group'],
                        "Word": entity['word'],
                        "Score": entity['score'],
                    }
                    for entity in ner_pipeline(ner_text)
                ]
                st.table(pd.DataFrame(entities))

    # Text Summarization
    st.subheader("✍ Text Summarization")
    summary_text = st.text_area("Enter text to summarize:", height=150)
    if st.button("Generate Summary"):
        if not summary_text.strip():
            st.warning("Please enter some text first.")
        else:
            with st.spinner("Generating summary..."):
                summarizer = pipeline("summarization")
                summary = summarizer(summary_text, max_length=130, min_length=30)
                st.write(summary[0]['summary_text'])
# Sidebar with additional info
with st.sidebar:
    st.image("https://via.placeholder.com/150x50?text=Daily+Mirror", width=150)
    st.title("About")
    st.write("""
    This app helps analyze news content using AI-powered tools:
    - Classify news into categories
    - Answer questions about news content
    - Perform advanced text analysis
    """)
    st.title("Instructions")
    # The classifier reads df["content"], so the instructions must ask for
    # a 'content' column — the original said 'excerpt', which would lead
    # users to upload a CSV the app cannot process.
    st.write("""
    1. Upload a CSV file with 'content' column
    2. Click classify to categorize news
    3. Download results as CSV
    4. Use Q&A tab to ask questions
    """)
    st.title("Model Information")
    st.write("""
    - Classification: Fine-tuned DistilBERT
    - Q&A: RoBERTa-base
    - Sentiment: DistilBERT-base
    """)
    st.markdown("[View model on Hugging Face](https://huggingface.co/your-username/daily-mirror-news-classifier)")
# Footer — rendered after all tabs and the sidebar on every rerun.
st.markdown("---")
st.markdown("© 2023 Daily Mirror News Analyzer | Powered by Hugging Face Transformers")