Spaces:

Prageeth-1
/

News_Classification_App

Sleeping

App Files Files Community

Prageeth-1 committed on Mar 29, 2025

Commit

770c070

verified ·

1 Parent(s): 875d133

Update app.py

Browse files

Files changed (1) hide show

app.py +111 -151

app.py CHANGED Viewed

@@ -15,7 +15,6 @@ import string
 import os
 from nltk.stem import PorterStemmer
 # Download NLTK resources
 nltk.download('punkt')
 nltk.download('stopwords')
@@ -25,7 +24,6 @@ nltk.download('wordnet')
 nltk_data_path = "/home/user/nltk_data"
 if not os.path.exists(nltk_data_path):
     os.makedirs(nltk_data_path)
 nltk.data.path.append(nltk_data_path)
 nltk.download('punkt', download_dir=nltk_data_path)
@@ -44,8 +42,6 @@ def load_classification_model():
 def load_qa_model():
     """Build and return the question-answering pipeline.

     Calls `pipeline` with the "question-answering" task and the
     "deepset/roberta-base-squad2" model.
     """
     qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
     return qa_model
 # Function to generate word cloud
 def generate_wordcloud(text, title=None):
     wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
@@ -55,7 +51,7 @@ def generate_wordcloud(text, title=None):
     plt.title(title, fontsize=20)
     st.pyplot(plt)
-# Set page config
 st.set_page_config(
     page_title="News Analysis Dashboard",
     page_icon="📰",
@@ -63,188 +59,166 @@ st.set_page_config(
     initial_sidebar_state="expanded"
 )
-# Custom CSS
 st.markdown("""
     <style>
-    .main {
-        background-color: #f5f5f5;
     }
-    .stButton>button {
-        background-color: #4CAF50;
         color: white;
     }
-    .stDownloadButton>button {
-        background-color: #2196F3;
         color: white;
     }
     .stTextInput>div>div>input {
         background-color: #ffffff;
-        color : #FF6347;
     }
-    .header {
-        display: flex;
-        align-items: center;
         margin-bottom: 20px;
-        background-color: #2196F3;
-    }
-    .header img {
-        height: 50px;
-        margin-right: 10px;
-    }
-    .header h1 {
-        font-size: 40px;
-        color: white;
-        margin: 0;
-        align: center;
     }
     </style>
     """, unsafe_allow_html=True)
 st.markdown("""
     <div class="header">
-        <center><h1>Daily Mirror News Analyzer</h1></center>
     </div>
-""", unsafe_allow_html =True)
-# App title and description
 st.markdown("""
-    Analyze news excerpts with our powerful AI tools:
-    - Classify news articles into categories
-    - Get answers to your questions about the news content
-    - Visualize key themes
-    """)
 # Create tabs for different functionalities
 tab1, tab2, tab3 = st.tabs(["News Classification", "Q&A Pipeline", "Advanced Features"])
 with tab1:
     st.header("News Classification Pipeline")
     st.write("Upload a CSV file containing news excerpts to classify them into categories.")
-    # File uploader
-    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
-    # Check the file
-    if uploaded_file is None:
-        st.warning("Please upload a CSV file.")
     else:
         df = pd.read_csv(uploaded_file)
         # Load the fine-tuned news classifier
         classifier = pipeline("text-classification", model="Imasha17/News_classification.3")
-        # Preprocess
-        # Lowercase
         df["cleaned_content"] = df["content"].str.lower()
         # Remove URLs
         def remove_urls(text):
             url_pattern = re.compile(r'http[s]?://\S+[^\s.,;:()"\']')
-            text = url_pattern.sub(r'', text)
-            return text.strip()
-        # applying the function
-        df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_urls(text))
         # Remove Emails
         def remove_emails(text):
             """Delete e-mail-like tokens from `text`.

             A token is removed when it is a run of non-whitespace characters
             containing an '@' with at least one non-whitespace character on
             each side. Surrounding whitespace is left untouched.
             """
             return re.sub(r'\S+@\S+', '', text)
-        # applying the function
-        df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_emails(text))
-        #Remove punctuations
         def remove_punctuation(text):
             """Return `text` with every character found in string.punctuation dropped."""
             kept_chars = filter(lambda ch: ch not in string.punctuation, text)
             return "".join(kept_chars)
-        # applying the function
-        df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_punctuation(text))
-         # Get the list of stop words
         stop_words = set(stopwords.words('english'))
-        # define the function
         def remove_stopwords(text):
-            return " ".join([word for word in str(text).split() if word not in stop_words])
-        # apply the function
-        df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_stopwords(text))
-        # define the function
         def remove_special_characters(text):
             """Keep only ASCII letters and whitespace in `text`; drop everything else."""
             disallowed = re.compile(r'[^A-Za-z\s]')
             return disallowed.sub('', text)
-        # apply the function
-        df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_special_characters(text))
-        #Remove Frequent words
-        # Get the count of each word in cleaned_text
         word_count = Counter(df["cleaned_content"].str.split(expand=True).stack())
-        # Get a set of common words
-        common_words = set([word for (word,count) in word_count.most_common(10)])
-        # define the function
         def remove_common_words(text):
-            return " ".join([word for word in str(text).split() if word not in common_words])
-        # apply the function
-        df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_common_words(text))
-        #Remove rare words
-        # Get a set of rare words
-        rare_words = set([word for (word,count) in word_count.most_common()[:-20-1:-1]])
-        print(rare_words)
-        # define the function
         def remove_rare_words(text):
-            return " ".join([word for word in str(text).split() if word not in rare_words])
-        df["cleaned_content"] = df["cleaned_content"].apply(lambda text: remove_rare_words(text))
         df['tokenized_content'] = df['cleaned_content'].apply(lambda text: text.split())
-        # initialize stemmer
         stemmer = PorterStemmer()
-        # Defining the function
         def stem_tokens(tokens):
-            stems = [stemmer.stem(token) for token in tokens]
-            return stems
-        # apply the function
-        df['stemmed_content'] = df['tokenized_content'].apply(lambda text: stem_tokens(text))
         df["preprocessed_content"] = df["stemmed_content"].apply(lambda text: " ".join(text))
-        # Classify each article and store the predictions
         df["Class"] = df["preprocessed_content"].apply(lambda text: classifier(text)[0]["label"])
-        #Delete Unnecessary columns
         df = df[['content','Class']]
-        # Show results
         st.subheader("Classification Results")
         st.write(df)
-        # Show distribution
         st.subheader("Class Distribution")
         class_dist = df['Class'].value_counts()
         st.bar_chart(class_dist)
-       # Download button
         st.subheader("Download Results")
         csv = df.to_csv(index=False).encode('utf-8')
         st.download_button(
@@ -253,59 +227,48 @@ with tab1:
             file_name='output.csv',
             mime='text/csv'
         )
 with tab2:
     st.header("Question Answering Pipeline")
     st.write("Ask questions about news content and get answers from our AI model.")
     if uploaded_file is not None:
-        # Load the CSV and prepare context for the model
-        context = ' '.join(df['content'].tolist())  # Use predictions for Q&A
         st.write(f"Loaded {len(df)} news excerpts")
     else:
         st.warning("Please upload a CSV file.")
-    # Input field for the question
     question = st.text_input("Enter your question:")
-    # Handle the "Get Answer" button
     if st.button("Get Answer"):
         if uploaded_file is None:
-            # Display an error message if no file is uploaded
             st.error("Please upload a CSV file before asking a question.")
         elif context and question:
-            # If both a file and a question are provided, answer the question
             with st.spinner("Searching for answers..."):
-                qa_pipeline = load_qa_model()  # Ensure this function is defined elsewhere
                 result = qa_pipeline(question=question, context=context)
-                # Display the answer and details
                 st.subheader("Answer")
                 st.success(result['answer'])
                 st.subheader("Details")
                 st.write(f"Confidence: {result['score']:.2f}")
         else:
             st.error("Please enter a question.")
-        # Question Answering section
     st.header("Ask Questions Based on Your News Content")
     context_1 = st.text_area("Enter the news content (context):")
-    question_1 = st.text_input("Enter your question:" , key="question_input" )
-    if st.button("Get Answer" , key="get_answer_1"):
         if context_1 and question_1:
-            answer_1 = qa_pipeline({'context': context, 'question': question})
             st.success(f"Answer: {answer_1['answer']}")
         else:
-            st.warning("Provide both context and question.")
 with tab3:
     st.header("Advanced Features")
     st.write("Explore additional functionalities to enhance your news analysis.")
@@ -330,7 +293,6 @@ with tab3:
         with st.spinner("Identifying entities..."):
             ner_pipeline = pipeline("ner", grouped_entities=True)
             results = ner_pipeline(ner_text)
             entities = []
             for entity in results:
                 entities.append({
@@ -338,7 +300,6 @@ with tab3:
                     "Word": entity['word'],
                     "Score": entity['score']
                 })
             st.table(pd.DataFrame(entities))
     # Text Summarization
@@ -349,30 +310,29 @@ with tab3:
             summarizer = pipeline("summarization")
             summary = summarizer(summary_text, max_length=130, min_length=30)
             st.write(summary[0]['summary_text'])
-# Sidebar with additional info
 with st.sidebar:
-    st.image("https://via.placeholder.com/150x50?text=Daily+Mirror", width=150)
     st.title("About")
     st.write("""
         This app helps analyze news content:
         - Classify news into categories
         - Answer questions about news content
         - Perform advanced text analysis
-        """)
     st.title("Instructions")
     st.write("""
-        1. Upload a CSV file with 'content' column
-        2. Click classify to categorize news
-        3. Download results as CSV
-        4. Use Q&A tab to ask questions
-        """)
     st.markdown("[View model on Hugging Face](https://huggingface.co/Imasha17/News_classification.3)")
 # Footer
 st.markdown("---")
-st.markdown("© 2023 Daily Mirror News Analyzer | Powered by Hugging Face Transformers")

 import os
 from nltk.stem import PorterStemmer
 # Download NLTK resources
 nltk.download('punkt')
 nltk.download('stopwords')
 nltk_data_path = "/home/user/nltk_data"
 if not os.path.exists(nltk_data_path):
     os.makedirs(nltk_data_path)
 nltk.data.path.append(nltk_data_path)
 nltk.download('punkt', download_dir=nltk_data_path)
 def load_qa_model():
     """Build and return the question-answering pipeline.

     Calls `pipeline` with the "question-answering" task and the
     "deepset/roberta-base-squad2" model.
     """
     qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
     return qa_model
 # Function to generate word cloud
 def generate_wordcloud(text, title=None):
     wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
     plt.title(title, fontsize=20)
     st.pyplot(plt)
+# Set page config with an attractive icon and layout options
 st.set_page_config(
     page_title="News Analysis Dashboard",
     page_icon="📰",
     initial_sidebar_state="expanded"
 )
+# Custom CSS to improve styling
 st.markdown("""
     <style>
+    /* Overall page background */
+    .reportview-container {
+        background: #f0f2f6;
     }
+    /* Header styling */
+    .header {
+        background: linear-gradient(90deg, #1a73e8, #4285f4);
+        padding: 20px;
+        border-radius: 8px;
+        margin-bottom: 20px;
+        text-align: center;
         color: white;
     }
+    .header h1 {
+        font-size: 48px;
+        margin: 0;
+        font-weight: bold;
+    }
+    /* Sidebar styling */
+    .css-1d391kg {
+        background-color: #ffffff;
+    }
+    /* Button styling */
+    .stButton>button {
+        background-color: #1a73e8;
         color: white;
+        border: none;
+        padding: 10px 20px;
+        border-radius: 5px;
+        font-size: 16px;
     }
+    .stButton>button:hover {
+        background-color: #0c55b3;
+    }
+    /* Text input styling */
     .stTextInput>div>div>input {
         background-color: #ffffff;
+        color: #333333;
+        font-size: 16px;
     }
+    /* Card style containers */
+    .card {
+        background-color: #ffffff;
+        padding: 20px;
+        border-radius: 8px;
         margin-bottom: 20px;
+        box-shadow: 0px 4px 8px rgba(0,0,0,0.05);
     }
     </style>
     """, unsafe_allow_html=True)
+# Banner header
 st.markdown("""
     <div class="header">
+        <h1>Daily Mirror News Analyzer</h1>
+        <p style="font-size: 20px; margin-top: 5px;">Analyze, classify, and explore news content with AI</p>
     </div>
+""", unsafe_allow_html=True)
+# Layout introduction text
 st.markdown("""
+    <div class="card">
+        <h2>Welcome!</h2>
+        <p>This dashboard allows you to:
+            <ul>
+                <li>Classify news articles into categories</li>
+                <li>Ask questions about the news content</li>
+                <li>Visualize sentiment, entities, and summaries</li>
+            </ul>
+            Use the tabs below to navigate between different functionalities.
+        </p>
+    </div>
+""", unsafe_allow_html=True)
 # Create tabs for different functionalities
 tab1, tab2, tab3 = st.tabs(["News Classification", "Q&A Pipeline", "Advanced Features"])
 with tab1:
+    st.markdown('<div class="card">', unsafe_allow_html=True)
     st.header("News Classification Pipeline")
     st.write("Upload a CSV file containing news excerpts to classify them into categories.")
+    # File uploader with a descriptive message
+    uploaded_file = st.file_uploader("Choose a CSV file (must contain a 'content' column)", type="csv")
+    if uploaded_file is None:
+        st.warning("Please upload a CSV file to get started.")
     else:
         df = pd.read_csv(uploaded_file)
         # Load the fine-tuned news classifier
         classifier = pipeline("text-classification", model="Imasha17/News_classification.3")
+        # Preprocessing steps
         df["cleaned_content"] = df["content"].str.lower()
         # Remove URLs
         def remove_urls(text):
             url_pattern = re.compile(r'http[s]?://\S+[^\s.,;:()"\']')
+            return url_pattern.sub(r'', text).strip()
+        df["cleaned_content"] = df["cleaned_content"].apply(remove_urls)
         # Remove Emails
         def remove_emails(text):
             """Delete e-mail-like tokens from `text`.

             A token is removed when it is a run of non-whitespace characters
             containing an '@' with at least one non-whitespace character on
             each side. Surrounding whitespace is left untouched.
             """
             return re.sub(r'\S+@\S+', '', text)
+        df["cleaned_content"] = df["cleaned_content"].apply(remove_emails)
+        # Remove punctuation
         def remove_punctuation(text):
             """Return `text` with every character found in string.punctuation dropped."""
             kept_chars = filter(lambda ch: ch not in string.punctuation, text)
             return "".join(kept_chars)
+        df["cleaned_content"] = df["cleaned_content"].apply(remove_punctuation)
+        # Remove stopwords
         stop_words = set(stopwords.words('english'))
         def remove_stopwords(text):
+            return " ".join([word for word in text.split() if word not in stop_words])
+        df["cleaned_content"] = df["cleaned_content"].apply(remove_stopwords)
+        # Remove special characters
         def remove_special_characters(text):
             """Keep only ASCII letters and whitespace in `text`; drop everything else."""
             disallowed = re.compile(r'[^A-Za-z\s]')
             return disallowed.sub('', text)
+        df["cleaned_content"] = df["cleaned_content"].apply(remove_special_characters)
+        # Remove frequent words
         word_count = Counter(df["cleaned_content"].str.split(expand=True).stack())
+        common_words = set([word for (word, count) in word_count.most_common(10)])
         def remove_common_words(text):
+            return " ".join([word for word in text.split() if word not in common_words])
+        df["cleaned_content"] = df["cleaned_content"].apply(remove_common_words)
+        # Remove rare words
+        rare_words = set([word for (word, count) in word_count.most_common()[:-20-1:-1]])
         def remove_rare_words(text):
+            return " ".join([word for word in text.split() if word not in rare_words])
+        df["cleaned_content"] = df["cleaned_content"].apply(remove_rare_words)
+        # Tokenize and stem
         df['tokenized_content'] = df['cleaned_content'].apply(lambda text: text.split())
         stemmer = PorterStemmer()
         def stem_tokens(tokens):
+            return [stemmer.stem(token) for token in tokens]
+        df['stemmed_content'] = df['tokenized_content'].apply(stem_tokens)
         df["preprocessed_content"] = df["stemmed_content"].apply(lambda text: " ".join(text))
+        # Classify each article and store predictions
         df["Class"] = df["preprocessed_content"].apply(lambda text: classifier(text)[0]["label"])
+        # Keep only necessary columns
         df = df[['content','Class']]
         st.subheader("Classification Results")
         st.write(df)
         st.subheader("Class Distribution")
         class_dist = df['Class'].value_counts()
         st.bar_chart(class_dist)
         st.subheader("Download Results")
         csv = df.to_csv(index=False).encode('utf-8')
         st.download_button(
             file_name='output.csv',
             mime='text/csv'
         )
+    st.markdown('</div>', unsafe_allow_html=True)
 with tab2:
+    st.markdown('<div class="card">', unsafe_allow_html=True)
     st.header("Question Answering Pipeline")
     st.write("Ask questions about news content and get answers from our AI model.")
     if uploaded_file is not None:
+        context = ' '.join(df['content'].tolist())
         st.write(f"Loaded {len(df)} news excerpts")
     else:
         st.warning("Please upload a CSV file.")
     question = st.text_input("Enter your question:")
     if st.button("Get Answer"):
         if uploaded_file is None:
             st.error("Please upload a CSV file before asking a question.")
         elif context and question:
             with st.spinner("Searching for answers..."):
+                qa_pipeline = load_qa_model()
                 result = qa_pipeline(question=question, context=context)
                 st.subheader("Answer")
                 st.success(result['answer'])
                 st.subheader("Details")
                 st.write(f"Confidence: {result['score']:.2f}")
         else:
             st.error("Please enter a question.")
+    st.markdown("---")
     # Free-form Q&A: the user supplies both the context and the question,
     # independent of any uploaded CSV.
     st.header("Ask Questions Based on Your News Content")
     context_1 = st.text_area("Enter the news content (context):")
     question_1 = st.text_input("Enter your question:", key="question_input")
     if st.button("Get Answer", key="get_answer_1"):
         if context_1 and question_1:
             # Load the model in this handler instead of reusing `qa_pipeline`
             # from the CSV-based handler: that name is only assigned when the
             # other "Get Answer" button was clicked during the same script
             # run, so depending on it here raises NameError.
             manual_qa = load_qa_model()
             # Keyword call form, matching the CSV-based handler's usage.
             answer_1 = manual_qa(question=question_1, context=context_1)
             st.success(f"Answer: {answer_1['answer']}")
         else:
             st.warning("Provide both context and question.")
+    st.markdown('</div>', unsafe_allow_html=True)
 with tab3:
+    st.markdown('<div class="card">', unsafe_allow_html=True)
     st.header("Advanced Features")
     st.write("Explore additional functionalities to enhance your news analysis.")
         with st.spinner("Identifying entities..."):
             ner_pipeline = pipeline("ner", grouped_entities=True)
             results = ner_pipeline(ner_text)
             entities = []
             for entity in results:
                 entities.append({
                     "Word": entity['word'],
                     "Score": entity['score']
                 })
             st.table(pd.DataFrame(entities))
     # Text Summarization
             summarizer = pipeline("summarization")
             summary = summarizer(summary_text, max_length=130, min_length=30)
             st.write(summary[0]['summary_text'])
+    st.markdown('</div>', unsafe_allow_html=True)
+# Enhanced Sidebar with branding and instructions
 with st.sidebar:
+    st.image("https://via.placeholder.com/300x100?text=Daily+Mirror", width=300)
     st.title("About")
     st.write("""
         This app helps analyze news content:
         - Classify news into categories
         - Answer questions about news content
         - Perform advanced text analysis
+    """)
     st.title("Instructions")
     st.write("""
+        1. Upload a CSV file with a 'content' column.
+        2. Click on the appropriate tab to use a feature.
+        3. Download results as CSV.
+        4. Use the Q&A tab to ask questions about the news.
+    """)
     st.markdown("[View model on Hugging Face](https://huggingface.co/Imasha17/News_classification.3)")
 # Footer
 st.markdown("---")
+st.markdown("<div style='text-align: center;'>© 2023 Daily Mirror News Analyzer | Powered by Hugging Face Transformers</div>", unsafe_allow_html=True)