Ashendilantha committed on
Commit
0dd8629
·
verified ·
1 Parent(s): c1691a3

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +186 -0
app.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ import streamlit as st
4
+ import pandas as pd
5
+ import numpy as np
6
+ import re
7
+ import nltk
8
+ from nltk.corpus import stopwords
9
+ from nltk.tokenize import word_tokenize
10
+ from nltk.stem import WordNetLemmatizer
11
+ import torch
12
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
13
+ import requests
14
+ from io import BytesIO
15
+
16
# Set page configuration (must be the first Streamlit call in the script)
st.set_page_config(page_title="News Analysis App", layout="wide")

# Download required NLTK resources once per process; st.cache_resource
# keeps this from re-running on every Streamlit rerun.
@st.cache_resource
def download_nltk_resources():
    """Fetch the NLTK corpora needed for tokenization and lemmatization."""
    # 'punkt_tab' is required by word_tokenize on NLTK >= 3.8.2; fetching it
    # alongside 'punkt' keeps both old and new NLTK versions working.
    # 'omw-1.4' backs WordNetLemmatizer lookups on recent NLTK releases.
    for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
        nltk.download(resource, quiet=True)

download_nltk_resources()

# Preprocessing helpers shared by the classification pipeline.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
31
+
32
# Load the fine-tuned model for classification
@st.cache_resource
def load_classification_model():
    """Load the fine-tuned news classifier and its tokenizer (cached per process)."""
    repo_id = "Oneli/News_Classification"  # fine-tuned checkpoint on the HF Hub
    clf_tokenizer = AutoTokenizer.from_pretrained(repo_id)
    clf_model = AutoModelForSequenceClassification.from_pretrained(repo_id)
    return clf_model, clf_tokenizer
39
+
40
# Load Q&A pipeline
@st.cache_resource
def load_qa_pipeline():
    """Build the extractive question-answering pipeline (cached per process)."""
    return pipeline(
        "question-answering",
        model="distilbert-base-cased-distilled-squad",
    )
45
+
46
# Text preprocessing function
def preprocess_text(text):
    """Normalize one raw article value for classification.

    Lowercases, strips URLs/HTML tags/non-letter characters, tokenizes,
    removes English stopwords, and lemmatizes each remaining token.

    Args:
        text: a cell from the 'content' column; may be NaN/None or non-string.

    Returns:
        The cleaned, space-joined token string; "" for missing values.
    """
    if pd.isna(text):
        return ""

    # CSV cells can hold numeric values; coerce to str so .lower() cannot fail.
    text = str(text).lower()

    # Remove URLs (http/https/www prefixes through the next whitespace)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Keep only letters and whitespace (digits and punctuation dropped)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize, drop stopwords, lemmatize, and re-join into a single string.
    tokens = word_tokenize(text)
    cleaned_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(cleaned_tokens)
73
+
74
# Function to classify news articles
def classify_news(df, model, tokenizer):
    """Predict a category label for every article in *df*.

    Args:
        df: DataFrame with a 'content' column of raw article text.
        model: sequence-classification model; its config maps class ids to labels.
        tokenizer: the tokenizer matching *model*.

    Returns:
        The same DataFrame with 'cleaned_content' and 'class' columns added
        (the input frame is mutated in place and also returned).
    """
    # Preprocess the text
    df['cleaned_content'] = df['content'].apply(preprocess_text)
    texts = df['cleaned_content'].tolist()

    # Disable dropout/batch-norm updates so inference is deterministic.
    model.eval()

    # Predict in small batches to bound memory use.
    predictions = []
    batch_size = 16
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        inputs = tokenizer(batch_texts, padding=True, truncation=True,
                           max_length=512, return_tensors="pt")

        with torch.no_grad():
            outputs = model(**inputs)
            batch_predictions = torch.argmax(outputs.logits, dim=1).tolist()
            predictions.extend(batch_predictions)

    # Map numeric predictions back to class labels
    id2label = model.config.id2label
    df['class'] = [id2label[pred] for pred in predictions]

    return df
101
+
102
# Main app
def _classification_page():
    """Render the CSV-upload news classification page."""
    st.header("News Article Classification")
    st.write("Upload a CSV file containing news articles to classify them into categories.")

    # File upload
    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
    if uploaded_file is None:
        return

    # Load the data and show a preview so the user can sanity-check it.
    df = pd.read_csv(uploaded_file)
    st.subheader("Sample of uploaded data")
    st.dataframe(df.head())

    # The classifier reads article text from this column.
    if 'content' not in df.columns:
        st.error("The CSV file must contain a 'content' column with the news articles text.")
        return

    with st.spinner("Loading classification model..."):
        model, tokenizer = load_classification_model()

    if st.button("Classify Articles"):
        with st.spinner("Classifying news articles..."):
            result_df = classify_news(df, model, tokenizer)

        # Display results
        st.subheader("Classification Results")
        st.dataframe(result_df[['content', 'class']])

        # Offer the labeled data back to the user as a CSV download.
        csv = result_df.to_csv(index=False)
        st.download_button(
            label="Download output.csv",
            data=csv,
            file_name="output.csv",
            mime="text/csv"
        )

        # Show distribution of classes
        st.subheader("Class Distribution")
        class_counts = result_df['class'].value_counts()
        st.bar_chart(class_counts)


def _qa_page():
    """Render the free-text question-answering page."""
    st.header("News Article Q&A")
    st.write("Ask questions about news content and get answers using a Q&A model.")

    # Text area for news content and the question about it
    news_content = st.text_area("Paste news article content here:", height=200)
    question = st.text_input("Enter your question about the article:")

    if news_content and question:
        with st.spinner("Loading Q&A model..."):
            qa_pipeline = load_qa_pipeline()

        if st.button("Get Answer"):
            with st.spinner("Finding answer..."):
                result = qa_pipeline(question=question, context=news_content)

            # Display the extracted answer and the model's confidence.
            st.subheader("Answer")
            st.write(result["answer"])

            st.subheader("Confidence")
            # score is a float in [0, 1], which is what st.progress expects.
            st.progress(float(result["score"]))
            st.write(f"Confidence Score: {result['score']:.4f}")


def main():
    """Entry point: route between the two app modes via the sidebar."""
    st.title("News Analysis Application")

    # Sidebar for navigation
    st.sidebar.title("Navigation")
    app_mode = st.sidebar.radio("Choose the app mode", ["News Classification", "Question Answering"])

    if app_mode == "News Classification":
        _classification_page()
    elif app_mode == "Question Answering":
        _qa_page()


if __name__ == "__main__":
    main()