Spaces:

Imasha17
/

News_Classification_App

Sleeping

File size: 13,757 Bytes

import streamlit as st
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import io
from collections import Counter
import string
import os

from nltk.stem import PorterStemmer


# Download NLTK resources
# Streamlit re-executes this script on every interaction while the `nltk`
# module (and its search-path list) lives for the whole server process, so the
# setup below is written to be idempotent: the directory is created without an
# exists()/makedirs() race, the path is registered only once, and each
# resource is requested a single time.
nltk_data_path = "/home/user/nltk_data"
os.makedirs(nltk_data_path, exist_ok=True)
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)

for _resource in ('punkt', 'stopwords', 'wordnet'):
    # nltk.download() reports failure by returning False; fall back to the
    # default download location so a non-writable directory is not fatal.
    if not nltk.download(_resource, download_dir=nltk_data_path):
        nltk.download(_resource)

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Load models (cache them to avoid reloading on every interaction)
@st.cache_resource
def load_classification_model():
    """Build the text-classification pipeline for the fine-tuned news model.

    The tokenizer and the sequence-classification model are both loaded from
    the same checkpoint and wrapped in a ``text-classification`` pipeline.

    Returns:
        The pipeline object produced by ``transformers.pipeline``.
    """
    checkpoint = "Imasha17/News_classification.4"  # Replace with your model path
    news_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    news_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    news_pipeline = pipeline(
        "text-classification",
        model=news_model,
        tokenizer=news_tokenizer,
    )
    return news_pipeline

@st.cache_resource
def load_qa_model():
    """Return a question-answering pipeline for deepset/roberta-base-squad2."""
    qa_checkpoint = "deepset/roberta-base-squad2"
    qa_model = pipeline("question-answering", model=qa_checkpoint)
    return qa_model

# Function to generate word cloud
def generate_wordcloud(text, title=None):
    """Render a word cloud built from ``text`` in the Streamlit app.

    Args:
        text: Text the word cloud is generated from.
        title: Optional heading drawn above the cloud; skipped when ``None``.
    """
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

    # Draw on an explicit figure instead of pyplot's global state: the script
    # is re-run on every interaction, and figures that are never closed pile
    # up inside matplotlib for the lifetime of the server process.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    if title is not None:
        ax.set_title(title, fontsize=20)
    st.pyplot(fig)
    plt.close(fig)

# Set page config with an attractive icon and layout options
_page_settings = {
    "page_title": "News Analysis Dashboard",
    "page_icon": "📰",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_page_settings)

# Custom CSS to improve styling
# The `.card` rule previously declared `colour`, which is not a CSS property
# and was therefore ignored by the browser; it is now `color`.
# NOTE(review): `.reportview-container` and `.css-1d391kg` look like selectors
# tied to a specific Streamlit build — confirm they still match the deployed
# version.
st.markdown("""
    <style>

    .reportview-container {
        background: #f0f2f6;
    }
    /* Header styling */
    .header {
        background: linear-gradient(90deg, #1a73e8, #4285f4);
        padding: 20px;
        border-radius: 8px;
        margin-bottom: 20px;
        text-align: center;
        color: white;
    }
    .header h1 {
        font-size: 48px;
        margin: 0;
        font-weight: bold;
    }
    /* Sidebar styling */
    .css-1d391kg {
        background-color: #ffffff;
    }
    /* Button styling */
    .stButton>button {
        background-color: #1a73e8;
        color: white;
        border: none;
        padding: 10px 20px;
        border-radius: 5px;
        font-size: 16px;
    }
    .stButton>button:hover {
        background-color: #0c55b3;
    }
    /* Text input styling */
    .stTextInput>div>div>input {
        background-color: #ffffff;
        color: #333333;
        font-size: 16px;
    }
    /* Card style containers */
    .card {
        background-color: #ffffff;
        padding: 20px;
        border-radius: 8px;
        margin-bottom: 20px;
        box-shadow: 0px 4px 8px rgba(0,0,0,0.05);
        color: #1a73e8;
    }
    </style>
    """, unsafe_allow_html=True)

# Banner header
# Markup is kept in a named string so the render call stays a one-liner.
_banner_html = """
    <div class="header">
        <h1>News Content Analyzer</h1>
        <p style="font-size: 20px; margin-top: 5px;">Analyze, classify, and explore news content with AI</p>
    </div>
"""
st.markdown(_banner_html, unsafe_allow_html=True)

# Layout introduction text
# A <p> element cannot contain a <ul>: HTML parsers close the paragraph as soon
# as the list starts, which left the closing sentence outside any styled
# element. The list and the closing sentence are therefore siblings of the
# first paragraph, each carrying the same colour.
st.markdown("""
    <div class="card">
        <h2 style="color:#1a73e8;">Welcome!</h2>
        <p style="color:#1a73e8;">This dashboard allows you to:</p>
        <ul style="color:#1a73e8;">
            <li>Classify news articles into categories</li>
            <li>Ask questions about the news content</li>
            <li>Visualize sentiment, entities, and summaries</li>
        </ul>
        <p style="color:#1a73e8;">Use the tabs below to navigate between different functionalities.</p>
    </div>
""", unsafe_allow_html=True)

# Create tabs for different functionalities
# One label per tab, in display order; unpacked into tab1..tab3 below.
_tab_titles = ["News Classification", "Ask Questions", "Advanced Features"]
tab1, tab2, tab3 = st.tabs(_tab_titles)

# --- Text clean-up helpers for the classification tab ----------------------
_URL_PATTERN = re.compile(r'http[s]?://\S+[^\s.,;:()"\']')
_EMAIL_PATTERN = re.compile(r'\S+@\S+')
_NON_LETTER_PATTERN = re.compile(r'[^A-Za-z\s]')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_N_COMMON_WORDS = 10  # most frequent corpus words to drop
_N_RARE_WORDS = 20    # least frequent corpus words to drop


def _clean_document(text, stop_words):
    """Apply the per-document clean-up steps to one news excerpt.

    Steps, in order: lowercase, strip URLs, strip e-mail addresses, strip
    punctuation, drop stop words, strip every remaining character that is
    neither an ASCII letter nor whitespace.

    Args:
        text: Raw excerpt.
        stop_words: Set of words removed after punctuation stripping.

    Returns:
        The cleaned excerpt as a single string.
    """
    text = _URL_PATTERN.sub('', text.lower()).strip()
    text = _EMAIL_PATTERN.sub('', text)
    text = text.translate(_PUNCTUATION_TABLE)
    text = " ".join(word for word in text.split() if word not in stop_words)
    return _NON_LETTER_PATTERN.sub('', text)


def _preprocess_corpus(contents):
    """Clean, filter and stem every excerpt in ``contents``.

    After the per-document clean-up, word frequencies are counted over the
    whole corpus; the ``_N_COMMON_WORDS`` most frequent and ``_N_RARE_WORDS``
    least frequent words are removed, and the remaining words are stemmed
    with the Porter stemmer.

    Args:
        contents: Iterable of raw excerpt strings.

    Returns:
        A list with one preprocessed string per input excerpt.
    """
    stop_words = set(stopwords.words('english'))
    cleaned = [_clean_document(text, stop_words) for text in contents]

    word_count = Counter(word for text in cleaned for word in text.split())
    ranked = [word for word, _ in word_count.most_common()]
    rare_start = max(len(ranked) - _N_RARE_WORDS, 0)
    dropped = set(ranked[:_N_COMMON_WORDS]) | set(ranked[rare_start:])

    stemmer = PorterStemmer()
    return [
        " ".join(stemmer.stem(word) for word in text.split() if word not in dropped)
        for text in cleaned
    ]


with tab1:
    st.markdown('<div class="card">', unsafe_allow_html=True)
    st.header("News Classification ")
    st.write("Upload a CSV file containing news excerpts to classify them into categories.")

    # File uploader with a descriptive message
    uploaded_file = st.file_uploader("Choose a CSV file (must contain a 'content' column)", type="csv")

    df = None
    if uploaded_file is None:
        st.warning("Please upload a CSV file to get started.")
    else:
        try:
            df = pd.read_csv(uploaded_file)
        except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
            st.error(f"Could not read the uploaded file as CSV: {exc}")
        else:
            if "content" not in df.columns:
                st.error("The uploaded CSV must contain a 'content' column.")
                df = None
        if df is None:
            # The Q&A tab keys off `uploaded_file`; clear it so an unusable
            # upload is treated there as "nothing uploaded".
            uploaded_file = None

    if df is not None:
        # Preview Uploaded Data
        st.subheader("Preview Uploaded Data")
        st.dataframe(df.head(5))

        # Missing or non-text cells would break the string clean-up below.
        df["content"] = df["content"].fillna("").astype(str)

        # Load the fine-tuned news classifier once and reuse it across reruns
        classifier = load_classification_model()

        # Preprocessing steps
        df["preprocessed_content"] = _preprocess_corpus(df["content"])

        # Classify all articles in one call; truncation keeps over-long
        # excerpts within the model's maximum input length instead of failing.
        texts = df["preprocessed_content"].tolist()
        predictions = classifier(texts, truncation=True) if texts else []
        df["Class"] = [prediction["label"] for prediction in predictions]

        # Word Cloud Visualization
        st.subheader("Word Cloud of News Content")
        try:
            wordcloud = WordCloud(width=800, height=400).generate(' '.join(texts))
        except ValueError:
            # WordCloud raises ValueError when no words are left to plot.
            st.info("Not enough text left after preprocessing to draw a word cloud.")
        else:
            # Explicit figure, closed after rendering, so reruns do not leak figures.
            fig, ax = plt.subplots(figsize=(10, 5))
            ax.imshow(wordcloud, interpolation='bilinear')
            ax.axis('off')
            st.pyplot(fig)
            plt.close(fig)

        # Keep only necessary columns
        df = df[['content', 'Class']]

        # Show classification results
        st.subheader("Classification Results")
        st.write(df)

        # Show class distribution
        st.subheader("Class Distribution")
        class_dist = df['Class'].value_counts()
        st.bar_chart(class_dist)

        # Download CSV file
        st.subheader("Download Results")
        csv = df.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="Download output.csv",
            data=csv,
            file_name='output.csv',
            mime='text/csv'
        )
    st.markdown('</div>', unsafe_allow_html=True)

with tab2:
    st.markdown('<div class="card">', unsafe_allow_html=True)
    st.header("Ask Questions Based on Uploaded News Content File")
    st.write("Ask questions about news content and get answers from our AI model.")

    # `context` is always bound so the button handler below does not depend on
    # the order of its branches to avoid an undefined name.
    context = ""
    if uploaded_file is not None:
        # Skip missing cells and coerce to str so join() cannot fail.
        context = ' '.join(df['content'].dropna().astype(str))
        st.write(f"Loaded {len(df)} news excerpts")
    else:
        st.warning("Please upload a CSV file.")

    # Generate the answer based on the uploaded news content file
    question = st.text_input("Enter your question:")
    if st.button("Get Answer"):
        if uploaded_file is None:
            st.error("Please upload a CSV file before asking a question.")
        elif not question.strip():
            st.error("Please enter a question.")
        elif not context.strip():
            # Distinguish "no content" from "no question" so the message is accurate.
            st.error("The uploaded file contains no news content to search.")
        else:
            with st.spinner("Searching for answers..."):
                # Load the (cached) model for the Q&A pipeline
                qa_pipeline = load_qa_model()
                result = qa_pipeline(question=question, context=context)
                st.subheader("Answer")
                st.success(result['answer'])
                st.subheader("Details")
                st.write(f"Confidence: {result['score']:.2f}")

    # Generate the answer based on news content typed in by the user
    st.markdown("---")
    st.header("Ask Questions Based on Your News Content")
    context_1 = st.text_area("Enter News Content", height=100)

    question_1 = st.text_input("Enter your question:", key="question_input")
    if st.button("Get Answer", key="get_answer_1"):
        # Whitespace-only input counts as missing.
        if context_1.strip() and question_1.strip():
            with st.spinner("Searching for answers..."):
                qa_pipeline = load_qa_model()
                answer_1 = qa_pipeline(question=question_1, context=context_1)
            st.success(f"Answer: {answer_1['answer']}")
        else:
            st.warning("Provide both context and question.")
    st.markdown('</div>', unsafe_allow_html=True)

# Cached loaders for the advanced-features tab: building a pipeline is
# expensive, so each one is created once and reused on later button clicks.
@st.cache_resource
def _load_ner_pipeline():
    """Return the default NER pipeline with word pieces grouped into entities."""
    # aggregation_strategy="simple" is the current spelling of grouped_entities=True.
    return pipeline("ner", aggregation_strategy="simple")


@st.cache_resource
def _load_summarizer():
    """Return the default summarization pipeline."""
    return pipeline("summarization")


@st.cache_resource
def _load_sentiment_pipeline():
    """Return the default sentiment-analysis pipeline."""
    return pipeline("sentiment-analysis")


with tab3:
    st.markdown('<div class="card">', unsafe_allow_html=True)
    st.header("Advanced Features")
    st.write("Explore additional functionalities to enhance your news analysis.")

    # Named Entity Recognition of news content
    st.subheader("Named Entity Recognition Of News Content")
    ner_text = st.text_area("Enter News Content for entity recognition:", height=100)
    if st.button("Extract Entities"):
        if not ner_text.strip():
            st.warning("Please enter some news content first.")
        else:
            with st.spinner("Identifying entities..."):
                results = _load_ner_pipeline()(ner_text)
            entities = [
                {
                    "Entity": entity['entity_group'],
                    "Word": entity['word'],
                    "Score": entity['score'],
                }
                for entity in results
            ]
            if entities:
                st.table(pd.DataFrame(entities))
            else:
                st.info("No named entities were found.")

    # Text Summarization
    st.subheader("News Content Summarization")
    summary_text = st.text_area("Enter news content to summarize:", height=150)
    if st.button("Generate Summary"):
        if not summary_text.strip():
            st.warning("Please enter some news content first.")
        else:
            with st.spinner("Generating summary..."):
                # truncation=True keeps over-long articles within the model's input limit.
                summary = _load_summarizer()(
                    summary_text, max_length=130, min_length=30, truncation=True
                )
            st.write(summary[0]['summary_text'])

    # Sentiment Analysis
    st.subheader("News Tone Detector")
    sentiment_text = st.text_area("Enter text for news content analysis:", height=100)
    if st.button("Analyze Tone"):
        if not sentiment_text.strip():
            st.warning("Please enter some news content first.")
        else:
            with st.spinner("Analyzing sentiment..."):
                result = _load_sentiment_pipeline()(sentiment_text, truncation=True)[0]
            st.write(f"Label: {result['label']}")
            st.write(f"Confidence: {result['score']:.2f}")
            if result['label'] == 'POSITIVE':
                st.success("This text appears positive!")
            else:
                st.warning("This text appears negative.")

    # Close the card after the last section, matching the other tabs.
    st.markdown('</div>', unsafe_allow_html=True)

# Enhanced Sidebar with branding and instructions
with st.sidebar:
    # Static copy is bound to names first; the widgets are rendered below in
    # the same order as before: logo, "About", "Instructions", model link.
    _about_text = """
        This app helps analyze news content:
        - Classify news into categories
        - Answer questions about news content
        - Perform advanced text analysis
    """
    _instructions_text = """
        1. Upload a CSV file with a 'content' column.
        2. Click on the appropriate tab to use a feature.
        3. Download results as CSV.
        4. Use the Q&A tab to ask questions about the news.
    """
    _model_link = "[View model on Hugging Face](https://huggingface.co/Imasha17/News_classification.4)"

    st.image("news_logo.jpg", width=300)

    st.title("About")
    st.write(_about_text)

    st.title("Instructions")
    st.write(_instructions_text)

    st.markdown(_model_link)

# Footer
# A horizontal rule followed by a centred attribution line.
_footer_html = "<div style='text-align: center;'>© 2023 Daily Mirror News Analyzer | Powered by Hugging Face Transformers</div>"
st.markdown("---")
st.markdown(_footer_html, unsafe_allow_html=True)