import streamlit as st
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import io
from collections import Counter
import string
import os

# --- NLTK setup -------------------------------------------------------------
# Download all required corpora into one writable location. The original
# downloaded 'punkt' twice (once to the default path, once here) and logged
# noisily on every Streamlit rerun; quiet=True and a single loop fix both.
NLTK_DATA_PATH = "/home/user/nltk_data"
os.makedirs(NLTK_DATA_PATH, exist_ok=True)
if NLTK_DATA_PATH not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_PATH)
for _resource in ("punkt", "stopwords", "wordnet"):
    nltk.download(_resource, download_dir=NLTK_DATA_PATH, quiet=True)

# Kept for interface parity with the original module. NOTE(review): nothing in
# this file actually lemmatizes — the tab-1 pipeline stems with PorterStemmer.
lemmatizer = WordNetLemmatizer()


@st.cache_resource
def load_classification_model():
    """Build the fine-tuned news classification pipeline (cached per process)."""
    model_name = "Imasha17/News_classification.4"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    return pipeline("text-classification", model=model, tokenizer=tokenizer)


@st.cache_resource
def load_qa_model():
    """Build the extractive question-answering pipeline (cached per process)."""
    return pipeline("question-answering", model="deepset/roberta-base-squad2")


def generate_wordcloud(text, title=None):
    """Render a word cloud for `text` into the Streamlit app.

    Args:
        text: Whitespace-separated words to visualise.
        title: Optional figure title. The original passed None straight to
            plt.title(); we now only set a title when one is given.
    """
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    fig = plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    if title:
        plt.title(title, fontsize=20)
    st.pyplot(fig)
    plt.close(fig)  # release the figure so repeated reruns don't leak memory


# --- Page chrome ------------------------------------------------------------
st.set_page_config(
    page_title="News Analysis Dashboard",
    page_icon="📰",
    layout="wide",
    initial_sidebar_state="expanded",
)

# NOTE(review): the custom-CSS payload of this call was stripped when the file
# was mangled; restore the real stylesheet here if it is recovered.
st.markdown("""
""", unsafe_allow_html=True)

# Banner header. NOTE(review): original HTML markup was stripped; the visible
# text is preserved and wrapped in minimal centering markup — confirm against
# the original template.
st.markdown("""
<div style="text-align: center;">
<h1>News Content Analyzer</h1>
<p>Analyze, classify, and explore news content with AI</p>
</div>
""", unsafe_allow_html=True)

# Introduction text (same caveat about stripped markup as above).
st.markdown("""
<div>
<h3>Welcome!</h3>
<p>This dashboard allows you to:</p>
<p>Use the tabs below to navigate between different functionalities.</p>
</div>
""", unsafe_allow_html=True)

# Tabs for the three feature areas; tab bodies follow below.
tab1, tab2, tab3 = st.tabs(["News Classification", "Ask Questions", "Advanced Features"])
with tab1:
    st.header("News Classification ")
    st.write("Upload a CSV file containing news excerpts to classify them into categories.")

    # `uploaded_file` and `df` are read again in tab2 — keep the names stable.
    uploaded_file = st.file_uploader("Choose a CSV file (must contain a 'content' column)", type="csv")

    if uploaded_file is None:
        st.warning("Please upload a CSV file to get started.")
    else:
        df = pd.read_csv(uploaded_file)

        st.subheader("Preview Uploaded Data")
        st.dataframe(df.head(5))

        # Fail with a clear message instead of a KeyError traceback.
        if "content" not in df.columns:
            st.error("The uploaded CSV must contain a 'content' column.")
            st.stop()

        # FIX: reuse the cached loader instead of rebuilding the HF pipeline
        # inline on every Streamlit rerun (the original called pipeline(...)
        # here, bypassing the @st.cache_resource loader defined above).
        classifier = load_classification_model()

        # ---- Cleaning helpers ---------------------------------------------
        url_pattern = re.compile(r'http[s]?://\S+[^\s.,;:()"\']')
        stop_words = set(stopwords.words('english'))

        def remove_urls(text):
            # Strip http(s) URLs and surrounding whitespace.
            return url_pattern.sub('', text).strip()

        def remove_emails(text):
            # Strip e-mail-like tokens.
            return re.sub(r'\S+@\S+', '', text)

        def remove_punctuation(text):
            return "".join(ch for ch in text if ch not in string.punctuation)

        def remove_stopwords(text):
            return " ".join(w for w in text.split() if w not in stop_words)

        def remove_special_characters(text):
            # Keep only ASCII letters and whitespace.
            return re.sub(r'[^A-Za-z\s]', '', text)

        # ---- Cleaning pipeline (same order as the original) ---------------
        df["cleaned_content"] = (
            df["content"].str.lower()
            .apply(remove_urls)
            .apply(remove_emails)
            .apply(remove_punctuation)
            .apply(remove_stopwords)
            .apply(remove_special_characters)
        )

        # Drop the 10 most frequent and 20 rarest corpus words. Both sets are
        # computed from the same pre-removal counts, so removing them in one
        # pass is equivalent to the original's two sequential passes.
        word_count = Counter(df["cleaned_content"].str.split(expand=True).stack())
        common_words = {w for w, _ in word_count.most_common(10)}
        rare_words = {w for w, _ in word_count.most_common()[:-20 - 1:-1]}
        noisy_words = common_words | rare_words
        df["cleaned_content"] = df["cleaned_content"].apply(
            lambda text: " ".join(w for w in text.split() if w not in noisy_words)
        )

        # ---- Tokenize + stem ----------------------------------------------
        stemmer = PorterStemmer()
        df["tokenized_content"] = df["cleaned_content"].str.split()
        df["stemmed_content"] = df["tokenized_content"].apply(
            lambda tokens: [stemmer.stem(t) for t in tokens]
        )
        df["preprocessed_content"] = df["stemmed_content"].str.join(" ")

        # Classify each article. truncation=True guards against inputs longer
        # than the model's maximum sequence length, which would otherwise
        # raise at inference time.
        df["Class"] = df["preprocessed_content"].apply(
            lambda text: classifier(text, truncation=True)[0]["label"]
        )

        # ---- Word-cloud visualization -------------------------------------
        st.subheader("Word Cloud of News Content")
        corpus_text = " ".join(df["preprocessed_content"])
        if corpus_text.strip():
            wc = WordCloud(width=800, height=400).generate(corpus_text)
            fig = plt.figure(figsize=(10, 5))
            plt.imshow(wc, interpolation='bilinear')
            plt.axis('off')
            st.pyplot(fig)
            plt.close(fig)  # avoid leaking figures across reruns
        else:
            st.info("No words left after preprocessing to build a word cloud.")

        # Keep only the columns users care about in the results view.
        df = df[['content', 'Class']]

        st.subheader("Classification Results")
        st.write(df)

        st.subheader("Class Distribution")
        st.bar_chart(df['Class'].value_counts())

        st.subheader("Download Results")
        csv = df.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="Download output.csv",
            data=csv,
            file_name='output.csv',
            mime='text/csv',
        )
with tab2:
    st.header("Ask Questions Based on Uploaded News Content File")
    st.write("Ask questions about news content and get answers from our AI model.")

    # Build the QA context from the CSV uploaded in the classification tab.
    # FIX: initialise `context` up front — in the original it was only bound
    # inside the `if uploaded_file is not None:` branch, so the
    # `elif context and question:` test below could raise a NameError.
    context = ""
    if uploaded_file is not None:
        context = ' '.join(df['content'].tolist())
        st.write(f"Loaded {len(df)} news excerpts")
    else:
        st.warning("Please upload a CSV file.")

    # Answer a question against the whole uploaded corpus.
    question = st.text_input("Enter your question:")
    if st.button("Get Answer"):
        if uploaded_file is None:
            st.error("Please upload a CSV file before asking a question.")
        elif context and question:
            with st.spinner("Searching for answers..."):
                qa_pipeline = load_qa_model()  # cached — built once per process
                result = qa_pipeline(question=question, context=context)
            st.subheader("Answer")
            st.success(result['answer'])
            st.subheader("Details")
            st.write(f"Confidence: {result['score']:.2f}")
        else:
            st.error("Please enter a question.")

    # Ad-hoc QA over user-pasted text (independent of the uploaded file).
    st.markdown("---")
    st.header("Ask Questions Based on Your News Content")
    context_1 = st.text_area("Enter News Content", height=100)
    question_1 = st.text_input("Enter your question:", key="question_input")
    if st.button("Get Answer", key="get_answer_1"):
        if context_1 and question_1:
            qa_pipeline = load_qa_model()
            answer_1 = qa_pipeline(question=question_1, context=context_1)
            st.success(f"Answer: {answer_1['answer']}")
        else:
            st.warning("Provide both context and question.")
with tab3:
    st.header("Advanced Features")
    st.write("Explore additional functionalities to enhance your news analysis.")

    # Cached loaders so these heavyweight pipelines are built once per process
    # instead of on every button click, as the original did.
    @st.cache_resource
    def load_ner_pipeline():
        """Default HF NER pipeline with entity grouping."""
        return pipeline("ner", grouped_entities=True)

    @st.cache_resource
    def load_summarizer():
        """Default HF summarization pipeline."""
        return pipeline("summarization")

    @st.cache_resource
    def load_sentiment_pipeline():
        """Default HF sentiment-analysis pipeline (POSITIVE/NEGATIVE labels)."""
        return pipeline("sentiment-analysis")

    # --- Named Entity Recognition ---
    st.subheader("Named Entity Recognition Of News Content")
    ner_text = st.text_area("Enter News Content for entity recognition:", height=100)
    if st.button("Extract Entities"):
        if not ner_text.strip():
            st.warning("Please enter some text first.")  # guard: empty input crashed the pipeline
        else:
            with st.spinner("Identifying entities..."):
                results = load_ner_pipeline()(ner_text)
            entities = [
                {"Entity": e['entity_group'], "Word": e['word'], "Score": e['score']}
                for e in results
            ]
            st.table(pd.DataFrame(entities))

    # --- Text Summarization ---
    st.subheader("News Content Summarization")
    summary_text = st.text_area("Enter news content to summarize:", height=150)
    if st.button("Generate Summary"):
        if not summary_text.strip():
            st.warning("Please enter some text first.")
        else:
            with st.spinner("Generating summary..."):
                summary = load_summarizer()(summary_text, max_length=130, min_length=30)
            st.write(summary[0]['summary_text'])

    # --- Sentiment Analysis ---
    # NOTE(review): original indentation was lost in the mangled source; this
    # section is assumed to live inside tab3 ("Advanced Features") — confirm.
    st.subheader("News Tone Detector")
    sentiment_text = st.text_area("Enter text for news content analysis:", height=100)
    if st.button("Analyze Tone"):
        if not sentiment_text.strip():
            st.warning("Please enter some text first.")
        else:
            with st.spinner("Analyzing sentiment..."):
                result = load_sentiment_pipeline()(sentiment_text)[0]
            st.write(f"Label: {result['label']}")
            st.write(f"Confidence: {result['score']:.2f}")
            if result['label'] == 'POSITIVE':
                st.success("This text appears positive!")
            else:
                st.warning("This text appears negative.")

# --- Sidebar: branding and instructions -------------------------------------
with st.sidebar:
    # Guard against a missing logo file so a bad deploy doesn't crash the app.
    if os.path.exists("news_logo.jpg"):
        st.image("news_logo.jpg", width=300)
    st.title("About")
    st.write("""
    This app helps analyze news content:
    - Classify news into categories
    - Answer questions about news content
    - Perform advanced text analysis
    """)
    st.title("Instructions")
    st.write("""
    1. Upload a CSV file with a 'content' column.
    2. Click on the appropriate tab to use a feature.
    3. Download results as CSV.
    4. Use the Q&A tab to ask questions about the news.
    """)
    st.markdown("[View model on Hugging Face](https://huggingface.co/Imasha17/News_classification.4)")

# --- Footer ------------------------------------------------------------------
st.markdown("---")
# NOTE(review): the footer's HTML wrapper was stripped when the file was
# mangled; visible text preserved, minimal centering markup reconstructed.
st.markdown(
    "<div style='text-align: center;'>© 2023 Daily Mirror News Analyzer | Powered by Hugging Face Transformers</div>",
    unsafe_allow_html=True,
)