Spaces:

Imasha17
/

News_Classification_WebApp

Build error

App Files Files Community

Imasha17 committed on Apr 8, 2025

Commit

fe0b7af

verified ·

1 Parent(s): 807487d

Create app.py

Browse files

Files changed (1) hide show

app.py +378 -0

app.py ADDED Viewed

	@@ -0,0 +1,378 @@

+import streamlit as st
+import pandas as pd
+import numpy as np
+import re
+import nltk
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from nltk.tokenize import word_tokenize
+from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
+from wordcloud import WordCloud
+import matplotlib.pyplot as plt
+import io
+from collections import Counter
+import string
+import os
+from nltk.stem import PorterStemmer
+# Download NLTK resources
+nltk.download('punkt')
+nltk.download('stopwords')
+nltk.download('wordnet')
+# Ensure NLTK data is downloaded at runtime
+nltk_data_path = "/home/user/nltk_data"
+if not os.path.exists(nltk_data_path):
+    os.makedirs(nltk_data_path)
+nltk.data.path.append(nltk_data_path)
+nltk.download('punkt', download_dir=nltk_data_path)
+# Initialize lemmatizer
+lemmatizer = WordNetLemmatizer()
+# Load models (cache them to avoid reloading on every interaction)
+@st.cache_resource
+def load_classification_model():
+    model_name = "Imasha17/News_classification.4"  # Replace with your model path
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForSequenceClassification.from_pretrained(model_name)
+    return pipeline("text-classification", model=model, tokenizer=tokenizer)
+@st.cache_resource
+def load_qa_model():
+    return pipeline("question-answering", model="deepset/roberta-base-squad2")
+# Function to generate word cloud
+def generate_wordcloud(text, title=None):
+    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
+    plt.figure(figsize=(10, 5))
+    plt.imshow(wordcloud, interpolation='bilinear')
+    plt.axis("off")
+    plt.title(title, fontsize=20)
+    st.pyplot(plt)
+# Set page config with an attractive icon and layout options
+st.set_page_config(
+    page_title="News Analysis Dashboard",
+    page_icon="📰",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+# Custom CSS to improve styling
+st.markdown("""
+    <style>
+    .reportview-container {
+        background: #f0f2f6;
+    }
+    /* Header styling */
+    .header {
+        background: linear-gradient(90deg, #1a73e8, #4285f4);
+        padding: 20px;
+        border-radius: 8px;
+        margin-bottom: 20px;
+        text-align: center;
+        color: white;
+    }
+    .header h1 {
+        font-size: 48px;
+        margin: 0;
+        font-weight: bold;
+    }
+    /* Sidebar styling */
+    .css-1d391kg {
+        background-color: #ffffff;
+    }
+    /* Button styling */
+    .stButton>button {
+        background-color: #1a73e8;
+        color: white;
+        border: none;
+        padding: 10px 20px;
+        border-radius: 5px;
+        font-size: 16px;
+    }
+    .stButton>button:hover {
+        background-color: #0c55b3;
+    }
+    /* Text input styling */
+    .stTextInput>div>div>input {
+        background-color: #ffffff;
+        color: #333333;
+        font-size: 16px;
+    }
+    /* Card style containers */
+    .card {
+        background-color: #ffffff;
+        padding: 20px;
+        border-radius: 8px;
+        margin-bottom: 20px;
+        box-shadow: 0px 4px 8px rgba(0,0,0,0.05);
+        colour:#1a73e8;
+    }
+    </style>
+    """, unsafe_allow_html=True)
+# Banner header
+st.markdown("""
+    <div class="header">
+        <h1>News Content Analyzer</h1>
+        <p style="font-size: 20px; margin-top: 5px;">Analyze, classify, and explore news content with AI</p>
+    </div>
+""", unsafe_allow_html=True)
+# Layout introduction text
+st.markdown("""
+    <div class="card">
+        <h2 style="color:#1a73e8;">Welcome!</h2>
+        <p style="color:#1a73e8;">This dashboard allows you to:
+            <ul style="color:#1a73e8;">
+                <li>Classify news articles into categories</li>
+                <li>Ask questions about the news content</li>
+                <li>Visualize sentiment, entities, and summaries</li>
+            </ul>
+            Use the tabs below to navigate between different functionalities.
+        </p>
+    </div>
+""", unsafe_allow_html=True)
+# Create tabs for different functionalities
+tab1, tab2, tab3 = st.tabs(["News Classification", "Ask Questions", "Advanced Features"])
+with tab1:
+    st.markdown('<div class="card">', unsafe_allow_html=True)
+    st.header("News Classification ")
+    st.write("Upload a CSV file containing news excerpts to classify them into categories.")
+    # File uploader with a descriptive message
+    uploaded_file = st.file_uploader("Choose a CSV file (must contain a 'content' column)", type="csv")
+    if uploaded_file is None:
+        st.warning("Please upload a CSV file to get started.")
+    else:
+        df = pd.read_csv(uploaded_file)
+        #Preview Uploaded Data
+        st.subheader("Preview Uploaded Data")
+        st.dataframe(df.head(5))
+        # Load the fine-tuned news classifier
+        classifier = pipeline("text-classification", model="Imasha17/News_classification.4")
+        # Preprocessing steps
+        df["cleaned_content"] = df["content"].str.lower()
+        # Remove URLs
+        def remove_urls(text):
+            url_pattern = re.compile(r'http[s]?://\S+[^\s.,;:()"\']')
+            return url_pattern.sub(r'', text).strip()
+        df["cleaned_content"] = df["cleaned_content"].apply(remove_urls)
+        # Remove Emails
+        def remove_emails(text):
+            email_pattern = re.compile(r'\S+@\S+')
+            return email_pattern.sub(r'', text)
+        df["cleaned_content"] = df["cleaned_content"].apply(remove_emails)
+        # Remove punctuation
+        def remove_punctuation(text):
+            return "".join([char for char in text if char not in string.punctuation])
+        df["cleaned_content"] = df["cleaned_content"].apply(remove_punctuation)
+        # Remove stopwords
+        stop_words = set(stopwords.words('english'))
+        def remove_stopwords(text):
+            return " ".join([word for word in text.split() if word not in stop_words])
+        df["cleaned_content"] = df["cleaned_content"].apply(remove_stopwords)
+        # Remove special characters
+        def remove_special_characters(text):
+            return re.sub(r'[^A-Za-z\s]', '', text)
+        df["cleaned_content"] = df["cleaned_content"].apply(remove_special_characters)
+        # Remove frequent words
+        word_count = Counter(df["cleaned_content"].str.split(expand=True).stack())
+        common_words = set([word for (word, count) in word_count.most_common(10)])
+        def remove_common_words(text):
+            return " ".join([word for word in text.split() if word not in common_words])
+        df["cleaned_content"] = df["cleaned_content"].apply(remove_common_words)
+        # Remove rare words
+        rare_words = set([word for (word, count) in word_count.most_common()[:-20-1:-1]])
+        def remove_rare_words(text):
+            return " ".join([word for word in text.split() if word not in rare_words])
+        df["cleaned_content"] = df["cleaned_content"].apply(remove_rare_words)
+        # Tokenize and stem
+        df['tokenized_content'] = df['cleaned_content'].apply(lambda text: text.split())
+        stemmer = PorterStemmer()
+        def stem_tokens(tokens):
+            return [stemmer.stem(token) for token in tokens]
+        df['stemmed_content'] = df['tokenized_content'].apply(stem_tokens)
+        df["preprocessed_content"] = df["stemmed_content"].apply(lambda text: " ".join(text))
+        # Classify each article and store predictions
+        df["Class"] = df["preprocessed_content"].apply(lambda text: classifier(text)[0]["label"])
+        # Word Cloud Visualization
+        def create_wordcloud(text_data):
+            text = ' '.join(text_data)
+            wordcloud = WordCloud(width=800, height=400).generate(text)
+            plt.figure(figsize=(10, 5))
+            plt.imshow(wordcloud, interpolation='bilinear')
+            plt.axis('off')
+            st.pyplot(plt)
+        st.subheader("Word Cloud of News Content")
+        create_wordcloud(df['preprocessed_content'])
+        # Keep only necessary columns
+        df = df[['content','Class']]
+        #show Classification Results
+        st.subheader("Classification Results")
+        st.write(df)
+        #show class distribution
+        st.subheader("Class Distribution")
+        class_dist = df['Class'].value_counts()
+        st.bar_chart(class_dist)
+        #download csv file
+        st.subheader("Download Results")
+        csv = df.to_csv(index=False).encode('utf-8')
+        st.download_button(
+            label="Download output.csv",
+            data=csv,
+            file_name='output.csv',
+            mime='text/csv'
+        )
+    st.markdown('</div>', unsafe_allow_html=True)
+with tab2:
+    st.markdown('<div class="card">', unsafe_allow_html=True)
+    st.header("Ask Questions Based on Uploaded News Content File")
+    st.write("Ask questions about news content and get answers from our AI model.")
+    #check file is uploaded
+    if uploaded_file is not None:
+        context = ' '.join(df['content'].tolist())
+        st.write(f"Loaded {len(df)} news excerpts")
+    else:
+        st.warning("Please upload a CSV file.")
+    #generate the answer based on uloaded news content file using the given model
+    question = st.text_input("Enter your question:")
+    if st.button("Get Answer"):
+        #check for file available
+        if uploaded_file is None:
+            st.error("Please upload a CSV file before asking a question.")
+        elif context and question:
+            with st.spinner("Searching for answers..."):
+                #load the model for Q&A pipline
+                qa_pipeline = load_qa_model()
+                result = qa_pipeline(question=question, context=context)
+                st.subheader("Answer")
+                st.success(result['answer'])
+                st.subheader("Details")
+                st.write(f"Confidence: {result['score']:.2f}")
+        else:
+            st.error("Please enter a question.")
+    #generate the answer based on selected news content using the given model
+    st.markdown("---")
+    st.header("Ask Questions Based on Your News Content")
+    context_1 = st.text_area("Enter News Content", height=100)
+    question_1 = st.text_input("Enter your question:", key="question_input")
+    if st.button("Get Answer", key="get_answer_1"):
+        #check for selected context and question are available
+        if context_1 and question_1:
+            qa_pipeline = load_qa_model()
+            answer_1 = qa_pipeline(question=question_1, context=context_1)
+            st.success(f"Answer: {answer_1['answer']}")
+        else:
+            st.warning("Provide both context and question.")
+    st.markdown('</div>', unsafe_allow_html=True)
+with tab3:
+    st.markdown('<div class="card">', unsafe_allow_html=True)
+    st.header("Advanced Features")
+    st.write("Explore additional functionalities to enhance your news analysis.")
+    # Named Entity Recognition of news content
+    st.subheader("Named Entity Recognition Of News Content")
+    ner_text = st.text_area("Enter News Content for entity recognition:", height=100)
+    if st.button("Extract Entities"):
+        with st.spinner("Identifying entities..."):
+            #load the model
+            ner_pipeline = pipeline("ner", grouped_entities=True)
+            results = ner_pipeline(ner_text)
+            entities = []
+            for entity in results:
+                entities.append({
+                    "Entity": entity['entity_group'],
+                    "Word": entity['word'],
+                    "Score": entity['score']
+                })
+            st.table(pd.DataFrame(entities))
+    # Text Summarization
+    st.subheader("News Content Summarization")
+    summary_text = st.text_area("Enter news content to summarize:", height=150)
+    if st.button("Generate Summary"):
+        with st.spinner("Generating summary..."):
+            #load the summarization model
+            summarizer = pipeline("summarization")
+            summary = summarizer(summary_text, max_length=130, min_length=30)
+            st.write(summary[0]['summary_text'])
+    st.markdown('</div>', unsafe_allow_html=True)
+    # Sentiment Analysis
+    st.subheader("News Tone Detector")
+    sentiment_text = st.text_area("Enter text for news content analysis:", height=100)
+    if st.button("Analyze Tone"):
+        with st.spinner("Analyzing sentiment..."):
+            #load the model
+            sentiment_pipeline = pipeline("sentiment-analysis")
+            result = sentiment_pipeline(sentiment_text)[0]
+            st.write(f"Label: {result['label']}")
+            st.write(f"Confidence: {result['score']:.2f}")
+            if result['label'] == 'POSITIVE':
+                st.success("This text appears positive!")
+            else:
+                st.warning("This text appears negative.")
+# Enhanced Sidebar with branding and instructions
+with st.sidebar:
+    st.image("news_logo.jpg", width=300)
+    st.title("About")
+    st.write("""
+        This app helps analyze news content:
+        - Classify news into categories
+        - Answer questions about news content
+        - Perform advanced text analysis
+    """)
+    st.title("Instructions")
+    st.write("""
+        1. Upload a CSV file with a 'content' column.
+        2. Click on the appropriate tab to use a feature.
+        3. Download results as CSV.
+        4. Use the Q&A tab to ask questions about the news.
+    """)
+    st.markdown("[View model on Hugging Face](https://huggingface.co/Imasha17/News_classification.4)")
+# Footer
+st.markdown("---")
+st.markdown("<div style='text-align: center;'>© 2023 Daily Mirror News Analyzer | Powered by Hugging Face Transformers</div>", unsafe_allow_html=True)