"""Streamlit app for comparing, summarizing and sentiment-scoring PDF files.

The app offers three modes, chosen from a dropdown:

* Check Similarity   -- cosine / BERTScore / ROUGE-L between two PDFs.
* Generate Summary   -- abstractive summary of one PDF using ``t5-base``.
* Sentiment Analysis -- per-sentence VADER sentiment with a pie chart.
"""

import functools
from difflib import SequenceMatcher

import bert_score
import matplotlib.pyplot as plt
import nltk
import PyPDF2 as pdf
import streamlit as st
from nltk.sentiment import SentimentIntensityAnalyzer
from rouge_score import rouge_scorer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Streamlit re-executes this script on every interaction, so only reach for
# the network when the VADER lexicon is genuinely missing.
try:
    nltk.data.find("sentiment/vader_lexicon.zip")
except LookupError:
    nltk.download('vader_lexicon')

st.set_page_config(page_title="Streamlit Sentiment App", page_icon="static/res/favicon.png")


def _load_summarizer():
    """Load the T5 model and tokenizer used by ``generate_summary``."""
    return (
        T5ForConditionalGeneration.from_pretrained("t5-base"),
        T5Tokenizer.from_pretrained("t5-base"),
    )


# Keep the (large) model alive across reruns when the installed Streamlit
# supports resource caching; older versions fall back to loading per run.
if hasattr(st, "cache_resource"):
    _load_summarizer = st.cache_resource(_load_summarizer)

# Initialize the model and tokenizer
model, tokenizer = _load_summarizer()


def extract_text(uploaded_file):
    """Return the concatenated text of every page of an uploaded PDF.

    Args:
        uploaded_file: A file-like object accepted by ``PyPDF2.PdfReader``,
            or a falsy value when nothing was uploaded.

    Returns:
        The extracted text, or an empty string when no file was supplied or
        no page yielded any text.
    """
    if not uploaded_file:
        return ""
    reader = pdf.PdfReader(uploaded_file)
    # ``or ""`` keeps pages without extractable text from breaking the join.
    return "".join(page.extract_text() or "" for page in reader.pages)


def calculate_similarity(text1, text2):
    """Return the cosine similarity of the bag-of-words counts of two texts.

    Raises:
        ValueError: Propagated from ``CountVectorizer`` when neither text
            contains a usable token (empty vocabulary).
    """
    term_counts = CountVectorizer().fit_transform([text1, text2])
    vectors = term_counts.toarray()
    return cosine_similarity(vectors)[0][1]


def bert_similarity(text1, text2):
    """Return the BERTScore F1 between ``text1`` (candidate) and ``text2``."""
    _precision, _recall, f1 = bert_score.score([text1], [text2], lang="en", verbose=True)
    return f1.item()


def rouge_similarity(text1, text2):
    """Return the ROUGE-L F-measure between the two texts (with stemming)."""
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = scorer.score(text1, text2)
    return scores['rougeL'].fmeasure


def highlight_similarity(text1, text2):
    """Return the character runs that ``text1`` and ``text2`` have in common.

    For every matching block found by ``difflib.SequenceMatcher`` the result
    contains the slice from ``text1``, a newline, the slice from ``text2``
    and a blank line.

    Returns:
        The concatenated matches, or an empty string when nothing matches.
    """
    matcher = SequenceMatcher(None, text1, text2)
    pieces = []
    for match in matcher.get_matching_blocks():
        # The final block is always a zero-length sentinel; skip it so the
        # output does not end in stray blank lines.
        if match.size == 0:
            continue
        pieces.append(text1[match.a:match.a + match.size] + '\n')
        pieces.append(text2[match.b:match.b + match.size] + '\n\n')
    return "".join(pieces)


def generate_summary(text):
    """Return a T5-generated summary of ``text``.

    The input is truncated to 1000 tokens and decoded with beam search
    (4 beams). Special tokens are stripped from the decoded output.
    """
    # Encode the text
    inputs = tokenizer.encode(
        "summarize: " + text,
        return_tensors="pt",
        max_length=1000,
        truncation=True,
    )
    # Generate the summary
    outputs = model.generate(
        inputs,
        max_length=1000,
        min_length=100,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    # Decode the summary; without skip_special_tokens the "<pad>" / "</s>"
    # markers leak into the text shown to the user.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


@functools.lru_cache(maxsize=1)
def _get_sentiment_analyzer():
    """Return one shared VADER analyzer instead of rebuilding it per call."""
    return SentimentIntensityAnalyzer()


def predict_sentiment(text, threshold_positive, threshold_negative):
    """Classify ``text`` as "Positive", "Negative" or "Neutral".

    Args:
        text: The text to score.
        threshold_positive: Compound score at or above which the text is
            "Positive"; converted with ``float``.
        threshold_negative: Compound score at or below which the text is
            "Negative"; converted with ``float``.

    Returns:
        One of the strings "Positive", "Negative" or "Neutral". The positive
        threshold is checked first.
    """
    compound = _get_sentiment_analyzer().polarity_scores(text).get("compound", 0)
    if compound >= float(threshold_positive):
        return "Positive"
    if compound <= float(threshold_negative):
        return "Negative"
    return "Neutral"


def _render_similarity_page():
    """Draw the "Check Similarity" mode: two uploads, a metric and a result."""
    uploaded_file1 = st.file_uploader("Choose a PDF file 1", type="pdf")
    uploaded_file2 = st.file_uploader("Choose a PDF file 2", type="pdf")

    st.sidebar.title("Similarity Metrics")
    st.sidebar.write("**Cosine Similarity**:")
    st.sidebar.write("Measures how similar the two documents are based on their content.")
    st.sidebar.write("**BERT Score**:")
    st.sidebar.write("Provides a similarity measure based on contextual embeddings of the documents.")
    st.sidebar.write("**ROUGE Score**:")
    st.sidebar.write("Evaluates the overlap in n-grams between the two documents.")

    similarity_metric = st.selectbox(
        "Select Similarity Metric",
        ["Cosine Similarity", "BERT Score", "ROUGE Score"],
    )

    if not (uploaded_file1 and uploaded_file2):
        return
    if not st.button("Check Similarity"):
        return

    text1 = extract_text(uploaded_file1)
    text2 = extract_text(uploaded_file2)
    # Image-only PDFs yield no text; CountVectorizer would raise on an
    # empty vocabulary, so stop with a readable message instead.
    if not text1.strip() or not text2.strip():
        st.error("No text could be extracted from one or both PDF files.")
        return

    if similarity_metric == "Cosine Similarity":
        similarity = calculate_similarity(text1, text2)
        st.write(f"The similarity between the two files is {similarity:.2f}.")
    elif similarity_metric == "BERT Score":
        bert_similarity_score = bert_similarity(text1, text2)
        st.write(f"The BERT similarity score between the two files is {bert_similarity_score:.2f}.")
    elif similarity_metric == "ROUGE Score":
        rouge_similarity_score = rouge_similarity(text1, text2)
        st.write(f"The ROUGE similarity score between the two files is {rouge_similarity_score:.2f}.")

    st.write("Highlighted Similarity:")
    st.write(highlight_similarity(text1, text2))


def _render_summary_page():
    """Draw the "Generate Summary" mode: one upload and its summary."""
    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
    if not uploaded_file:
        return
    if not st.button("Generate Summary"):
        return

    text = extract_text(uploaded_file)
    if not text.strip():
        st.error("No text could be extracted from the PDF file.")
        return

    summary = generate_summary(text)
    st.write("Summary:")
    st.write(summary)


def _count_sentiments(reader, threshold_positive, threshold_negative):
    """Count sentences per sentiment label across all pages of ``reader``.

    Sentences are obtained by splitting each page's text on "."; blank
    fragments are ignored.
    """
    counts = {"Positive": 0, "Negative": 0, "Neutral": 0}
    for page in reader.pages:
        page_text = page.extract_text() or ""
        for fragment in page_text.split("."):
            sentence = fragment.strip()
            if sentence:
                label = predict_sentiment(sentence, threshold_positive, threshold_negative)
                counts[label] += 1
    return counts


def _render_sentiment_page():
    """Draw the "Sentiment Analysis" mode: thresholds, upload, counts, pie."""
    threshold_positive = st.number_input("Threshold for Positive Sentiment:", value=0.05, step=0.01)
    threshold_negative = st.number_input("Threshold for Negative Sentiment:", value=-0.05, step=0.01)

    # Restrict to PDFs like the other uploaders; anything else would fail
    # inside PdfReader.
    uploaded_file = st.file_uploader("Upload PDF Document", type="pdf")
    if not uploaded_file:
        return

    pdf_reader = pdf.PdfReader(uploaded_file)
    counts = _count_sentiments(pdf_reader, threshold_positive, threshold_negative)

    st.write("Positive Sentences:", counts["Positive"])
    st.write("Negative Sentences:", counts["Negative"])
    st.write("Neutral Sentences:", counts["Neutral"])

    labels = ["Positive", "Negative", "Neutral"]
    sizes = [counts[label] for label in labels]
    # A pie chart of all zeros divides by zero inside matplotlib.
    if not any(sizes):
        st.warning("No sentences could be extracted from the document.")
        return

    fig, ax = plt.subplots()
    ax.pie(sizes, labels=labels, autopct="%1.1f%%", startangle=90)
    ax.axis("equal")
    ax.set_title("Sentiment Distribution")
    st.pyplot(fig)
    # pyplot keeps a reference to every figure; release it so figures do
    # not pile up across script reruns.
    plt.close(fig)


def main():
    """Render the app: title, intro text, mode selector and selected mode."""
    st.title("Text Analysis App")
    st.write("This app checks the similarity between two PDF files using different similarity metrics or generates a summary for a single document or does the sentiment analyis.")
    st.write("Upload PDF files, select an option from the dropdown menu, and proceed accordingly.")

    option = st.selectbox("Select Option", ["Check Similarity", "Generate Summary", "Sentiment Analysis"])

    if option == "Check Similarity":
        _render_similarity_page()
    elif option == "Generate Summary":
        _render_summary_page()
    elif option == "Sentiment Analysis":
        _render_sentiment_page()


if __name__ == "__main__":
    main()