Spaces:

Ashendilantha
/

News_Classification

Sleeping

App Files Files Community

Ashendilantha committed on Mar 30, 2025

Commit

e8f4b00

verified ·

1 Parent(s): 8dbdc8f

Update app.py

Browse files

Files changed (1) hide show

app.py +108 -127

app.py CHANGED Viewed

@@ -1,156 +1,137 @@
 import streamlit as st
 import pandas as pd
 import re
 import nltk
-from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
-from transformers import pipeline
-from PIL import Image
-# Ensure NLTK resources are downloaded correctly
-nltk.download('stopwords')
-nltk.download('punkt')
-# Load Models
-news_classifier = pipeline("text-classification", model="Oneli/News_Classification")
-# Preprocessing Function
-lemmatizer = WordNetLemmatizer()
 stop_words = set(stopwords.words('english'))
 def preprocess_text(text):
     if pd.isna(text):
         return ""
-    # Convert to lowercase
     text = text.lower()
-    # Remove URLs
     text = re.sub(r'http\S+|www\S+|https\S+', '', text)
-    # Remove HTML tags
     text = re.sub(r'<.*?>', '', text)
-    # Remove special characters and numbers
     text = re.sub(r'[^a-zA-Z\s]', '', text)
-    # Tokenize
     tokens = word_tokenize(text)
-    # Remove stopwords and lemmatize
     cleaned_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
-    # Join tokens back into text
     cleaned_text = ' '.join(cleaned_tokens)
     return cleaned_text
-# Load Cover Image
-cover_image = Image.open("cover.png")  # Ensure this image exists
-# Label Mapping
-label_mapping = {
-    "LABEL_0": "Business",
-    "LABEL_1": "Opinion",
-    "LABEL_2": "Political Gossip",
-    "LABEL_3": "Sports",
-    "LABEL_4": "World News"
-}
-# Store classified article for QA
-context_storage = {"context": "", "bulk_context": "", "num_articles": 0}
-# Function for Single Article Classification
-def classify_text(text):
-    text = preprocess_text(text)  # Preprocess text
-    result = news_classifier(text)[0]
-    category = label_mapping.get(result['label'], "Unknown")
-    confidence = round(result['score'] * 100, 2)
-    # Store context for QA
-    context_storage["context"] = text
-    return category, f"Confidence: {confidence}%"
-# Function for Bulk Classification
-def classify_csv(file_path):
-    try:
-        df = pd.read_csv(file_path, encoding="utf-8")
-        # Automatically detect the column containing text
-        text_column = df.columns[0]  # Assume first column is the text column
-        df["Encoded Prediction"] = df[text_column].apply(lambda x: news_classifier(preprocess_text(str(x)))[0]['label'])
-        df["Decoded Prediction"] = df["Encoded Prediction"].map(label_mapping)
-        df["Confidence"] = df[text_column].apply(lambda x: round(news_classifier(preprocess_text(str(x)))[0]['score'] * 100, 2))
-        # Store all text as a single context for QA
-        context_storage["bulk_context"] = " ".join(df[text_column].dropna().astype(str).tolist())
-        context_storage["num_articles"] = len(df)
-        output_file = "output.csv"
-        df.to_csv(output_file, index=False)
-        return df, output_file
-    except Exception as e:
-        return None, f"Error: {str(e)}"
-# Function to Load Q&A Pipeline
-def load_qa_pipeline():
-    return pipeline("question-answering", model="deepset/roberta-base-squad2")
-# Streamlit App Layout
-st.set_page_config(page_title="News Classifier", page_icon="📰")
-# Load and display the cover image
-st.image(cover_image, caption="News Classifier 📢", use_container_width=True)
-# Section for Single Article Classification
-st.subheader("📰 Single Article Classification")
-text_input = st.text_area("Enter News Text", placeholder="Type or paste news content here...")
-if st.button("🔍 Classify"):
-    if text_input:
-        category, confidence = classify_text(text_input)
-        st.write(f"**Predicted Category:** {category}")
-        st.write(f"**Confidence Level:** {confidence}")
-    else:
-        st.warning("Please enter some text to classify.")
-# Section for Bulk CSV Classification
-st.subheader("📂 Bulk Classification (CSV)")
-file_input = st.file_uploader("Upload CSV File", type="csv")
-if file_input:
-    df, output_file = classify_csv(file_input)
-    if df is not None:
-        st.dataframe(df)
-        st.download_button(
-            label="Download Processed CSV",
-            data=open(output_file, 'rb').read(),
-            file_name=output_file,
-            mime="text/csv"
-        )
-    else:
-        st.error(f"Error processing file: {output_file}")
-# Section for Q&A
-st.subheader("💬 Q&A Model")
-question = st.text_input("Ask a question about the news article:", placeholder="Ask anything related to the news...")
-if question:
-    # Load the QA model and get the answer
-    with st.spinner("Loading Q&A model..."):
-        qa_pipeline = load_qa_pipeline()
-    if st.button("Get Answer"):
-        with st.spinner("Finding answer..."):
-            result = qa_pipeline(question=question, context=context_storage["context"])
-            # Display results
-            st.subheader("Answer")
-            st.write(result["answer"])
-            st.subheader("Confidence")
-            st.progress(float(result["score"]))
-            st.write(f"Confidence Score: {result['score']:.4f}")

 import streamlit as st
 import pandas as pd
+import numpy as np
 import re
 import nltk
 from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
 from nltk.stem import WordNetLemmatizer
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
+# Set page configuration
+st.set_page_config(page_title="News Analysis App", layout="wide")
+# Download required NLTK resources
+@st.cache_resource
+def download_nltk_resources():
+    nltk.download('punkt')
+    nltk.download('stopwords')
+    nltk.download('wordnet')
+# Must run before the stopword list is built below: stopwords.words() reads
+# the 'stopwords' corpus that download_nltk_resources() fetches.
+download_nltk_resources()
+# Initialize preprocessor components
+# Stored as a set because preprocess_text tests every token for membership.
 stop_words = set(stopwords.words('english'))
+# Shared lemmatizer instance used by preprocess_text.
+lemmatizer = WordNetLemmatizer()
+# Load classification model
+@st.cache_resource
+def load_classification_model():
+    model_name = "Oneli/News_Classification"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForSequenceClassification.from_pretrained(model_name)
+    return model, tokenizer
+# Load Q&A pipeline
+@st.cache_resource
+def load_qa_pipeline():
+    qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
+    return qa_pipeline
+# Preprocessing function
 def preprocess_text(text):
     if pd.isna(text):
         return ""
     text = text.lower()
     text = re.sub(r'http\S+|www\S+|https\S+', '', text)
     text = re.sub(r'<.*?>', '', text)
     text = re.sub(r'[^a-zA-Z\s]', '', text)
     tokens = word_tokenize(text)
     cleaned_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
     cleaned_text = ' '.join(cleaned_tokens)
     return cleaned_text
+# Batch classification function
+def classify_news(df, model, tokenizer):
+    df['cleaned_content'] = df['content'].apply(preprocess_text)
+    texts = df['cleaned_content'].tolist()
+    predictions = []
+    batch_size = 16
+    for i in range(0, len(texts), batch_size):
+        batch_texts = texts[i:i+batch_size]
+        inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
+        with torch.no_grad():
+            outputs = model(**inputs)
+            logits = outputs.logits
+            batch_predictions = torch.argmax(logits, dim=1).tolist()
+            predictions.extend(batch_predictions)
+    id2label = model.config.id2label
+    df['class'] = [id2label[pred] for pred in predictions]
+    return df
+# Main app
+def main():
+    st.title("News Analysis Application")
+    st.sidebar.title("Navigation")
+    app_mode = st.sidebar.radio("Choose the app mode", ["News Classification", "Question Answering"])
+    if app_mode == "News Classification":
+        st.header("News Article Classification")
+        uploaded_file = st.file_uploader("Upload a CSV file", type="csv")
+        if uploaded_file is not None:
+            df = pd.read_csv(uploaded_file)
+            st.subheader("Sample of uploaded data")
+            st.dataframe(df.head())
+            if 'content' not in df.columns:
+                st.error("The CSV file must contain a 'content' column.")
+            else:
+                with st.spinner("Loading model..."):
+                    model, tokenizer = load_classification_model()
+                if st.button("Classify Articles"):
+                    with st.spinner("Classifying news articles..."):
+                        result_df = classify_news(df, model, tokenizer)
+                        st.subheader("Classification Results")
+                        st.dataframe(result_df[['content', 'class']])
+                        csv = result_df.to_csv(index=False)
+                        st.download_button("Download output.csv", csv, "output.csv", "text/csv")
+                        st.subheader("Class Distribution")
+                        st.bar_chart(result_df['class'].value_counts())
+    elif app_mode == "Question Answering":
+        st.header("News Article Q&A")
+        uploaded_file = st.file_uploader("Upload CSV for Q&A", type="csv")
+        if uploaded_file is not None:
+            df = pd.read_csv(uploaded_file)
+            if 'content' not in df.columns:
+                st.error("The CSV file must contain a 'content' column.")
+            else:
+                combined_text = " ".join(df['cleaned_content'].dropna().astype(str).tolist())
+                question = st.text_input("Enter your question about the news:")
+                if combined_text and question:
+                    with st.spinner("Loading Q&A model..."):
+                        qa_pipeline = load_qa_pipeline()
+                    if st.button("Get Answer"):
+                        with st.spinner("Finding answer..."):
+                            result = qa_pipeline(question=question, context=combined_text)
+                            st.subheader("Answer")
+                            st.write(result["answer"])
+                            st.subheader("Confidence")
+                            st.progress(float(result["score"]))
+                            st.write(f"Confidence Score: {result['score']:.4f}")
+if __name__ == "__main__":
+    main()