# Hugging Face Spaces: Ashendilantha / TA_Project (Sleeping)
# File size: 6,424 Bytes
# 79578f9

import streamlit as st
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import requests
from io import BytesIO

# Set page configuration: page title "News Analysis App" and the "wide" layout.
# NOTE(review): Streamlit presumably expects this call before any other
# st.* UI command — confirm before moving it further down the script.
st.set_page_config(page_title="News Analysis App", layout="wide")

# Download required NLTK resources
@st.cache_resource
def download_nltk_resources():
    """Download the NLTK data packages the text preprocessor relies on.

    Fetches the sentence/word tokenizer data, the stop-word lists and
    WordNet through ``nltk.download``.  The function returns nothing; it is
    called for its side effect of making the data available locally.
    """
    # 'punkt_tab' is the tokenizer data that NLTK 3.9+ looks up when
    # word_tokenize() runs; the legacy 'punkt' package is still requested so
    # older NLTK releases keep working.
    for package in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
        nltk.download(package)

# Fetch the NLTK data first: the stop-word list loaded below is read from it.
download_nltk_resources()

# Initialize preprocessor components
# Both objects live at module level and are reused by preprocess_text().
stop_words = set(stopwords.words('english'))  # a set, for the `token not in stop_words` filter
lemmatizer = WordNetLemmatizer()

# Load the fine-tuned model for classification
@st.cache_resource
def load_classification_model():
    """Load the news classifier checkpoint together with its tokenizer.

    Returns:
        tuple: ``(model, tokenizer)`` — the sequence-classification model
        first, then the tokenizer, both created from the same checkpoint
        name via ``from_pretrained``.
    """
    checkpoint = "Oneli/News_Classification"  # Replace with your actual model path
    news_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    news_classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    return news_classifier, news_tokenizer

# Load Q&A pipeline
@st.cache_resource
def load_qa_pipeline():
    """Create the question-answering pipeline used by the Q&A page.

    Returns:
        The object returned by ``transformers.pipeline`` for the
        "question-answering" task with the
        ``distilbert-base-cased-distilled-squad`` checkpoint.
    """
    return pipeline(
        "question-answering",
        model="distilbert-base-cased-distilled-squad",
    )

# Text preprocessing function
def preprocess_text(text):
    """Normalize one article into a cleaned, space-separated token string.

    Steps, in order: lowercase, strip URLs, strip HTML tags, drop every
    character that is not an ASCII letter or whitespace, tokenize, remove
    English stop words and lemmatize the remaining tokens.

    Args:
        text: The article text.  Missing values (``None``/``NaN``) produce an
            empty string.  Any other non-string scalar, such as a CSV cell
            that pandas parsed as a number, is converted with ``str()``
            instead of failing on ``.lower()``.

    Returns:
        str: The cleaned tokens joined by single spaces, or ``""`` when
        nothing survives the cleaning.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        # Numeric or other scalar cells have no .lower(); work on their text form.
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove URLs (the "http" alternative already matches "https" links)
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Only whitespace left: there are no tokens to produce, so skip the
    # tokenizer entirely.
    if not text.strip():
        return ""

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords and lemmatize
    cleaned_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    # Join tokens back into text
    return ' '.join(cleaned_tokens)

# Function to classify news articles
def classify_news(df, model, tokenizer):
    """Predict a class label for every row of ``df``.

    The ``content`` column is cleaned with ``preprocess_text`` and stored in
    a new ``cleaned_content`` column; the cleaned texts are then tokenized
    and scored in batches of 16, and the label of the highest logit is
    written to a new ``class`` column.

    Args:
        df: DataFrame with a ``content`` column.  It is modified in place
            (two columns are added) and also returned.
        model: Classifier called as ``model(**inputs)``; its output must
            expose ``.logits`` and its ``config.id2label`` maps predicted
            ids to label names.
        tokenizer: Callable that turns a list of strings into the model's
            keyword inputs.

    Returns:
        The same ``df`` object with ``cleaned_content`` and ``class`` added.
    """
    batch_size = 16

    # Preprocess the text
    df['cleaned_content'] = df['content'].apply(preprocess_text)
    cleaned = df['cleaned_content'].tolist()

    # Score the texts batch by batch; gradients are not needed for inference
    label_ids = []
    with torch.no_grad():
        for start in range(0, len(cleaned), batch_size):
            chunk = cleaned[start:start + batch_size]
            encoded = tokenizer(
                chunk,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            )
            scores = model(**encoded).logits
            label_ids += scores.argmax(dim=1).tolist()

    # Map numeric predictions back to class labels
    label_names = model.config.id2label
    df['class'] = [label_names[idx] for idx in label_ids]

    return df

# Main app
def _render_classification_page():
    """Render the page that classifies every article in an uploaded CSV."""
    st.header("News Article Classification")
    st.write("Upload a CSV file containing news articles to classify them into categories.")

    # File upload
    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
    if uploaded_file is None:
        return

    # Load the data.  An empty, malformed or mis-encoded upload is reported
    # in the page instead of surfacing as an unhandled traceback.
    try:
        df = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        st.error(f"Could not read the uploaded CSV file: {exc}")
        return

    # Display sample of the data
    st.subheader("Sample of uploaded data")
    st.dataframe(df.head())

    # Check if the required column exists
    if 'content' not in df.columns:
        st.error("The CSV file must contain a 'content' column with the news articles text.")
        return

    # Load model and tokenizer
    with st.spinner("Loading classification model..."):
        model, tokenizer = load_classification_model()

    # Classify button
    # NOTE(review): everything below is rendered only on the run in which the
    # button reports a click; presumably a later rerun (e.g. after using the
    # download button) clears the results — confirm and persist them in
    # st.session_state if that matters.
    if not st.button("Classify Articles"):
        return

    with st.spinner("Classifying news articles..."):
        # Perform classification
        result_df = classify_news(df, model, tokenizer)

        # Display results
        st.subheader("Classification Results")
        st.dataframe(result_df[['content', 'class']])

        # Save to CSV
        csv = result_df.to_csv(index=False)
        st.download_button(
            label="Download output.csv",
            data=csv,
            file_name="output.csv",
            mime="text/csv"
        )

        # Show distribution of classes
        st.subheader("Class Distribution")
        class_counts = result_df['class'].value_counts()
        st.bar_chart(class_counts)


def _render_qa_page():
    """Render the page that answers a question about a pasted article."""
    st.header("News Article Q&A")
    st.write("Ask questions about news content and get answers using a Q&A model.")

    # Text area for news content
    news_content = st.text_area("Paste news article content here:", height=200)

    # Question input
    question = st.text_input("Enter your question about the article:")

    # Whitespace-only input gives the model nothing to work with, so it is
    # treated the same as an empty field.
    if not (news_content and news_content.strip() and question and question.strip()):
        return

    # Load QA pipeline
    with st.spinner("Loading Q&A model..."):
        qa_pipeline = load_qa_pipeline()

    # Get answer
    if not st.button("Get Answer"):
        return

    with st.spinner("Finding answer..."):
        result = qa_pipeline(question=question, context=news_content)

        # Display results
        st.subheader("Answer")
        st.write(result["answer"])

        st.subheader("Confidence")
        st.progress(float(result["score"]))
        st.write(f"Confidence Score: {result['score']:.4f}")


def main():
    """Draw the app title and sidebar, then render the selected page.

    The sidebar radio offers "News Classification" (CSV upload and batch
    labelling) and "Question Answering" (question over pasted text); each
    mode is drawn by its own private helper.
    """
    st.title("News Analysis Application")

    # Sidebar for navigation
    st.sidebar.title("Navigation")
    app_mode = st.sidebar.radio("Choose the app mode", ["News Classification", "Question Answering"])

    if app_mode == "News Classification":
        _render_classification_page()
    elif app_mode == "Question Answering":
        _render_qa_page()

# Entry point: build the UI only when this file is executed as the main
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()