"""Streamlit app that classifies news articles and answers questions about them.

Two modes are offered from the sidebar:

* "News Classification" - upload a CSV with a ``content`` column, run a
  fine-tuned sequence-classification model over it and download the result.
* "Question Answering" - paste an article, ask a question, and get an
  extractive answer from a SQuAD-distilled DistilBERT pipeline.
"""

import re
from io import BytesIO

import nltk
import numpy as np
import pandas as pd
import requests
import streamlit as st
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Set page configuration (kept ahead of every other Streamlit call)
st.set_page_config(page_title="News Analysis App", layout="wide")

# Patterns used by preprocess_text, compiled once instead of on every call.
_URL_RE = re.compile(r'http\S+|www\S+|https\S+')
_HTML_TAG_RE = re.compile(r'<.*?>')
_NON_ALPHA_RE = re.compile(r'[^a-zA-Z\s]')

# st.session_state key under which the latest classification result is kept.
_RESULT_STATE_KEY = "classification_result"


# Download required NLTK resources
@st.cache_resource
def download_nltk_resources():
    """Download the NLTK data used for tokenizing, stopwords and lemmatizing.

    Cached with ``st.cache_resource`` so the downloads are attempted once per
    server process rather than on every script rerun.
    """
    # 'punkt_tab' is what word_tokenize loads on recent NLTK releases;
    # 'punkt' is kept for older releases.
    for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
        nltk.download(resource)


download_nltk_resources()

# Initialize preprocessor components
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


# Load the fine-tuned model for classification
@st.cache_resource
def load_classification_model():
    """Load the classification model and its tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple. The pair is cached by Streamlit, so
        the weights are loaded only once per server process.
    """
    model_name = "Oneli/News_Classification"  # Replace with your actual model path
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    return model, tokenizer


# Load Q&A pipeline
@st.cache_resource
def load_qa_pipeline():
    """Load and cache the extractive question-answering pipeline."""
    return pipeline(
        "question-answering",
        model="distilbert-base-cased-distilled-squad",
    )


# Text preprocessing function
def preprocess_text(text):
    """Normalize one article into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase, strip URLs, strip HTML tags, drop every
    character that is not an ASCII letter or whitespace, tokenize, remove
    English stopwords, lemmatize.

    Args:
        text: The raw article text. Missing values (``None``/``NaN``) yield
            an empty string; other non-string scalars are converted with
            ``str()`` so a numeric cell does not crash the whole batch.

    Returns:
        The cleaned text, or ``""`` when the input is missing.
    """
    if pd.isna(text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = _URL_RE.sub('', text)
    text = _HTML_TAG_RE.sub('', text)
    text = _NON_ALPHA_RE.sub('', text)

    tokens = word_tokenize(text)
    cleaned_tokens = [
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    ]
    return ' '.join(cleaned_tokens)


# Function to classify news articles
def classify_news(df, model, tokenizer, *, batch_size=16):
    """Predict a class label for every row of ``df``.

    Args:
        df: DataFrame with a ``content`` column holding the article text.
        model: Sequence-classification model whose ``config.id2label`` maps
            predicted indices to label names.
        tokenizer: Tokenizer matching ``model``.
        batch_size: Number of articles sent through the model per forward
            pass.

    Returns:
        A copy of ``df`` with two added columns: ``cleaned_content`` (the
        preprocessed text) and ``class`` (the predicted label). The input
        frame is left untouched.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    result = df.copy()
    result['cleaned_content'] = result['content'].apply(preprocess_text)
    texts = result['cleaned_content'].tolist()

    predictions = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        # Inference only: skip gradient tracking.
        with torch.no_grad():
            logits = model(**inputs).logits
        predictions.extend(torch.argmax(logits, dim=1).tolist())

    # Map numeric predictions back to class labels
    id2label = model.config.id2label
    result['class'] = [id2label[pred] for pred in predictions]
    return result


def _render_classification_page():
    """Render the CSV upload / classify / download page."""
    st.header("News Article Classification")
    st.write("Upload a CSV file containing news articles to classify them into categories.")

    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
    if uploaded_file is None:
        return

    # Rewind defensively in case the buffer was already consumed.
    uploaded_file.seek(0)
    try:
        df = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        st.error(f"Could not read the uploaded CSV file: {exc}")
        return

    st.subheader("Sample of uploaded data")
    st.dataframe(df.head())

    if 'content' not in df.columns:
        st.error("The CSV file must contain a 'content' column with the news articles text.")
        return

    with st.spinner("Loading classification model..."):
        model, tokenizer = load_classification_model()

    # Identifies the upload a stored result belongs to, so results from a
    # previous file are not shown for a new one. (Name + size is a heuristic.)
    upload_key = (uploaded_file.name, uploaded_file.size)

    if st.button("Classify Articles"):
        with st.spinner("Classifying news articles..."):
            st.session_state[_RESULT_STATE_KEY] = (
                upload_key,
                classify_news(df, model, tokenizer),
            )

    # st.button is True only for the rerun triggered by the click. Reading the
    # result from session state keeps it on screen after later reruns, e.g.
    # when the download button is pressed.
    stored = st.session_state.get(_RESULT_STATE_KEY)
    if stored is None or stored[0] != upload_key:
        return
    result_df = stored[1]

    st.subheader("Classification Results")
    st.dataframe(result_df[['content', 'class']])

    st.download_button(
        label="Download output.csv",
        data=result_df.to_csv(index=False),
        file_name="output.csv",
        mime="text/csv",
    )

    st.subheader("Class Distribution")
    st.bar_chart(result_df['class'].value_counts())


def _render_qa_page():
    """Render the paste-an-article question-answering page."""
    st.header("News Article Q&A")
    st.write("Ask questions about news content and get answers using a Q&A model.")

    news_content = st.text_area("Paste news article content here:", height=200)
    question = st.text_input("Enter your question about the article:")

    # Whitespace-only input carries nothing to answer from; treat it as empty.
    if not (news_content.strip() and question.strip()):
        return

    with st.spinner("Loading Q&A model..."):
        qa_pipeline = load_qa_pipeline()

    if not st.button("Get Answer"):
        return

    with st.spinner("Finding answer..."):
        result = qa_pipeline(question=question, context=news_content)

    st.subheader("Answer")
    st.write(result["answer"])

    st.subheader("Confidence")
    # st.progress accepts floats only within [0.0, 1.0]; clamp against
    # floating-point drift.
    score = float(result["score"])
    st.progress(min(max(score, 0.0), 1.0))
    st.write(f"Confidence Score: {result['score']:.4f}")


# Main app
def main():
    """Draw the sidebar navigation and dispatch to the selected page."""
    st.title("News Analysis Application")

    st.sidebar.title("Navigation")
    app_mode = st.sidebar.radio(
        "Choose the app mode",
        ["News Classification", "Question Answering"],
    )

    if app_mode == "News Classification":
        _render_classification_page()
    elif app_mode == "Question Answering":
        _render_qa_page()


if __name__ == "__main__":
    main()