# News Analysis App — Streamlit application (classification + Q&A).
# Dependencies: Streamlit UI, pandas/numpy data handling, NLTK text
# preprocessing, and HuggingFace transformers + torch for inference.
import streamlit as st
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import requests
from io import BytesIO

# Set page configuration (must be the first Streamlit command executed)
st.set_page_config(page_title="News Analysis App", layout="wide")
# Download required NLTK resources
@st.cache_resource
def download_nltk_resources():
    """Fetch the NLTK data needed by the preprocessing pipeline.

    Downloads the Punkt tokenizer models, the English stopword list and
    the WordNet lemmatizer data. Decorated with ``st.cache_resource`` so
    the download check runs once per server process instead of on every
    Streamlit rerun; ``quiet=True`` keeps download logs out of the app.
    """
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)


download_nltk_resources()
# Initialize preprocessor components shared by preprocess_text():
# the English stopword set and a WordNet lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
# Load the fine-tuned model for classification
@st.cache_resource
def load_classification_model():
    """Load the fine-tuned news-classification model and its tokenizer.

    Decorated with ``st.cache_resource`` so the (large) HuggingFace model
    is downloaded and instantiated once per server process rather than on
    every Streamlit rerun.

    Returns:
        tuple: ``(model, tokenizer)`` — the sequence-classification model
        and its matching tokenizer.
    """
    model_name = "Oneli/News_Classification"  # Replace with your actual model path
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    return model, tokenizer
# Load Q&A pipeline
@st.cache_resource
def load_qa_pipeline():
    """Load the extractive question-answering pipeline.

    Uses the DistilBERT model fine-tuned on SQuAD. Decorated with
    ``st.cache_resource`` so the model is loaded once per server process
    instead of on every Streamlit rerun.

    Returns:
        transformers.Pipeline: a ``question-answering`` pipeline.
    """
    qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
    return qa_pipeline
# Text preprocessing function
def preprocess_text(text):
    """Normalize raw article text for classification.

    Steps: lowercase, strip URLs and HTML tags, drop non-alphabetic
    characters, tokenize, remove English stopwords and lemmatize.

    Args:
        text: Raw article content. NaN/None yields ``""``; non-string
            values (e.g. numeric CSV cells) are coerced to ``str``.

    Returns:
        str: Space-joined cleaned tokens.
    """
    if pd.isna(text):
        return ""
    # Coerce to str first so numeric cells from the CSV don't crash .lower()
    text = str(text).lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords and lemmatize
    cleaned_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    # Join tokens back into text
    cleaned_text = ' '.join(cleaned_tokens)
    return cleaned_text
# Function to classify news articles
def classify_news(df, model, tokenizer):
    """Classify every article in *df* and attach the predicted label.

    Adds a 'cleaned_content' column (preprocessed text) and a 'class'
    column (human-readable label) to the frame, which is returned.

    Args:
        df: DataFrame with a 'content' column of raw article text.
        model: Sequence-classification model producing logits.
        tokenizer: Tokenizer matching *model*.

    Returns:
        pandas.DataFrame: *df* with the two extra columns.
    """
    # Clean every article once up front.
    df['cleaned_content'] = df['content'].apply(preprocess_text)
    texts = df['cleaned_content'].tolist()

    batch_size = 16
    predictions = []
    # Run inference in fixed-size batches to bound peak memory.
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        encoded = tokenizer(chunk, padding=True, truncation=True, max_length=512, return_tensors="pt")
        with torch.no_grad():
            logits = model(**encoded).logits
        predictions.extend(torch.argmax(logits, dim=1).tolist())

    # Translate numeric class ids into the model's configured label names.
    id2label = model.config.id2label
    df['class'] = [id2label[pred] for pred in predictions]
    return df
# Main app
def main():
    """Streamlit entry point: render the classification page or the Q&A page."""
    st.title("News Analysis Application")

    # Sidebar navigation between the two app modes.
    st.sidebar.title("Navigation")
    app_mode = st.sidebar.radio("Choose the app mode", ["News Classification", "Question Answering"])

    if app_mode == "News Classification":
        st.header("News Article Classification")
        st.write("Upload a CSV file containing news articles to classify them into categories.")

        # CSV upload widget
        upload = st.file_uploader("Choose a CSV file", type="csv")
        if upload is not None:
            frame = pd.read_csv(upload)

            # Preview the first rows so the user can sanity-check the file.
            st.subheader("Sample of uploaded data")
            st.dataframe(frame.head())

            if 'content' not in frame.columns:
                # Classification is impossible without the article text column.
                st.error("The CSV file must contain a 'content' column with the news articles text.")
            else:
                with st.spinner("Loading classification model..."):
                    model, tokenizer = load_classification_model()

                if st.button("Classify Articles"):
                    with st.spinner("Classifying news articles..."):
                        labeled = classify_news(frame, model, tokenizer)

                        st.subheader("Classification Results")
                        st.dataframe(labeled[['content', 'class']])

                        # Offer the labeled data as a downloadable CSV.
                        csv_payload = labeled.to_csv(index=False)
                        st.download_button(
                            label="Download output.csv",
                            data=csv_payload,
                            file_name="output.csv",
                            mime="text/csv"
                        )

                        # Bar chart of how many articles fell into each class.
                        st.subheader("Class Distribution")
                        st.bar_chart(labeled['class'].value_counts())

    elif app_mode == "Question Answering":
        st.header("News Article Q&A")
        st.write("Ask questions about news content and get answers using a Q&A model.")

        # Inputs: the article context and the user's question.
        article_text = st.text_area("Paste news article content here:", height=200)
        user_question = st.text_input("Enter your question about the article:")

        if article_text and user_question:
            with st.spinner("Loading Q&A model..."):
                qa = load_qa_pipeline()

            if st.button("Get Answer"):
                with st.spinner("Finding answer..."):
                    result = qa(question=user_question, context=article_text)

                    st.subheader("Answer")
                    st.write(result["answer"])

                    st.subheader("Confidence")
                    st.progress(float(result["score"]))
                    st.write(f"Confidence Score: {result['score']:.4f}")


if __name__ == "__main__":
    main()