Spaces:

manasvikalyan
/

nlp_text_analysis

Runtime error

File size: 7,144 Bytes

25afb5b
 
 
 
 
 
 
 
 
 
b9e94ef
e18fce5
25afb5b

import PyPDF2 as pdf
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import streamlit as st
import bert_score
from rouge_score import rouge_scorer
from transformers import T5ForConditionalGeneration, T5Tokenizer
from difflib import SequenceMatcher
from nltk.sentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import nltk
# Download the 'vader_lexicon' NLTK resource at import time
# (presumably the lexicon SentimentIntensityAnalyzer needs in predict_sentiment).
nltk.download('vader_lexicon')

# Set the browser tab title and icon for the Streamlit page.
st.set_page_config(page_title="Streamlit Sentiment App", page_icon="static/res/favicon.png")


# Load the pretrained "t5-base" model and tokenizer as module-level objects;
# generate_summary() reads both of these globals.
# NOTE(review): Streamlit presumably re-executes this script on each
# interaction, which would repeat this load — consider caching; confirm.
model = T5ForConditionalGeneration.from_pretrained("t5-base")
tokenizer = T5Tokenizer.from_pretrained("t5-base")

def extract_text(uploaded_file):
    """Return the concatenated text of every page in an uploaded PDF.

    Args:
        uploaded_file: A file-like object accepted by ``pdf.PdfReader``,
            or a falsy value when nothing has been uploaded.

    Returns:
        The text of all pages joined in page order, or an empty string
        when ``uploaded_file`` is falsy.
    """
    if not uploaded_file:
        return ""

    reader = pdf.PdfReader(uploaded_file)
    # A page without extractable text may yield None (or ""); treat both as
    # empty so the concatenation can never raise a TypeError.
    return "".join(page.extract_text() or "" for page in reader.pages)

def calculate_similarity(text1, text2):
    """Return the cosine similarity between the token-count vectors of two texts.

    Both texts are vectorised together with a fresh ``CountVectorizer`` so
    they share one vocabulary, then compared with ``cosine_similarity``.
    """
    documents = [text1, text2]
    count_matrix = CountVectorizer().fit_transform(documents)
    pairwise_scores = cosine_similarity(count_matrix.toarray())
    # Row 0, column 1 holds the score of text1 against text2.
    return pairwise_scores[0][1]

def bert_similarity(text1, text2):
    """Return the BERTScore F1 value for ``text1`` scored against ``text2``.

    Each text is wrapped in a one-element list and passed to
    ``bert_score.score`` with ``lang="en"``; only the F1 tensor is used.
    """
    first_batch = [text1]
    second_batch = [text2]
    # score() returns a (precision, recall, F1) triple; only F1 is needed.
    _precision, _recall, f1 = bert_score.score(first_batch, second_batch, lang="en", verbose=True)
    return f1.item()

def rouge_similarity(text1, text2):
    """Return the ROUGE-L F-measure that ``rouge_scorer`` reports for two texts.

    A ``RougeScorer`` restricted to ``rougeL`` (with stemming enabled) scores
    ``text1`` against ``text2``; the F-measure of that result is returned.
    """
    rouge_l_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_l_result = rouge_l_scorer.score(text1, text2)['rougeL']
    return rouge_l_result.fmeasure

def highlight_similarity(text1, text2):
    """Return the character runs shared by ``text1`` and ``text2``.

    The texts are aligned with ``difflib.SequenceMatcher``. For every
    non-empty matching block, the matching slice of ``text1`` and the
    matching slice of ``text2`` are emitted on consecutive lines, followed
    by a blank line.

    Args:
        text1: First text to compare.
        text2: Second text to compare.

    Returns:
        The formatted matching blocks, or an empty string when the texts
        share nothing.
    """
    matcher = SequenceMatcher(None, text1, text2)

    pieces = []
    for match in matcher.get_matching_blocks():
        # get_matching_blocks() always ends with a zero-length sentinel
        # block; skipping it avoids emitting spurious blank lines.
        if match.size == 0:
            continue
        from_first = text1[match.a:match.a + match.size]
        from_second = text2[match.b:match.b + match.size]
        pieces.append(from_first + '\n' + from_second + '\n\n')

    return "".join(pieces)


def generate_summary(text):
    """Summarise ``text`` with the module-level T5 ``model`` and ``tokenizer``.

    Args:
        text: The document text to summarise.

    Returns:
        The decoded summary string, without tokenizer special tokens.
    """
    # T5 selects the task from a text prefix; the input is truncated to at
    # most 1000 tokens.
    input_ids = tokenizer.encode(
        "summarize: " + text,
        return_tensors="pt",
        max_length=1000,
        truncation=True,
    )

    # Beam search (4 beams) producing between 100 and 1000 tokens.
    output_ids = model.generate(
        input_ids,
        max_length=1000,
        min_length=100,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )

    # skip_special_tokens strips markers such as "<pad>" and "</s>" that
    # would otherwise appear verbatim in the summary shown to the user.
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


# Shared analyzer instance, created lazily by _get_sentiment_analyzer().
_sentiment_analyzer = None


def _get_sentiment_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first use."""
    global _sentiment_analyzer
    if _sentiment_analyzer is None:
        _sentiment_analyzer = SentimentIntensityAnalyzer()
    return _sentiment_analyzer


def predict_sentiment(text, threshold_positive, threshold_negative):
    """Classify ``text`` as "Positive", "Negative" or "Neutral".

    The decision is based on the ``compound`` entry of the analyzer's
    polarity scores (treated as 0 when missing).

    Args:
        text: The text to classify.
        threshold_positive: Compound scores at or above this are "Positive".
        threshold_negative: Compound scores at or below this are "Negative".
            Both thresholds are converted with ``float()``.

    Returns:
        One of the strings "Positive", "Negative" or "Neutral". The
        positive check takes precedence when both thresholds match.
    """
    threshold_positive = float(threshold_positive)
    threshold_negative = float(threshold_negative)

    # This function is called once per sentence, so reuse one analyzer
    # instead of constructing a new one on every call.
    compound = _get_sentiment_analyzer().polarity_scores(text).get("compound", 0)

    if compound >= threshold_positive:
        return "Positive"
    if compound <= threshold_negative:
        return "Negative"
    return "Neutral"


def _render_similarity_view():
    """Render the two-PDF upload form and report the selected similarity metric."""
    uploaded_file1 = st.file_uploader("Choose a PDF file 1", type="pdf")
    uploaded_file2 = st.file_uploader("Choose a PDF file 2", type="pdf")

    st.sidebar.title("Similarity Metrics")
    st.sidebar.write("**Cosine Similarity**:")
    st.sidebar.write("Measures how similar the two documents are based on their content.")
    st.sidebar.write("**BERT Score**:")
    st.sidebar.write("Provides a similarity measure based on contextual embeddings of the documents.")
    st.sidebar.write("**ROUGE Score**:")
    st.sidebar.write("Evaluates the overlap in n-grams between the two documents.")

    similarity_metric = st.selectbox("Select Similarity Metric", ["Cosine Similarity", "BERT Score", "ROUGE Score"])

    # The button is only offered once both files are present.
    if not (uploaded_file1 and uploaded_file2):
        return
    if not st.button("Check Similarity"):
        return

    text1 = extract_text(uploaded_file1)
    text2 = extract_text(uploaded_file2)

    if similarity_metric == "Cosine Similarity":
        similarity = calculate_similarity(text1, text2)
        st.write(f"The similarity between the two files is {similarity:.2f}.")
    elif similarity_metric == "BERT Score":
        bert_similarity_score = bert_similarity(text1, text2)
        st.write(f"The BERT similarity score between the two files is {bert_similarity_score:.2f}.")
    elif similarity_metric == "ROUGE Score":
        rouge_similarity_score = rouge_similarity(text1, text2)
        st.write(f"The ROUGE similarity score between the two files is {rouge_similarity_score:.2f}.")

    st.write("Highlighted Similarity:")
    st.write(highlight_similarity(text1, text2))


def _render_summary_view():
    """Render the single-PDF upload form and show a generated summary."""
    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
    if uploaded_file and st.button("Generate Summary"):
        text = extract_text(uploaded_file)
        summary = generate_summary(text)
        st.write("Summary:")
        st.write(summary)


def _render_sentiment_view():
    """Render the sentiment form, count sentence sentiments and chart them."""
    threshold_positive = st.number_input("Threshold for Positive Sentiment:", value=0.05, step=0.01)
    threshold_negative = st.number_input("Threshold for Negative Sentiment:", value=-0.05, step=0.01)
    # Restrict uploads to PDFs, like the other uploaders, because the file
    # is handed straight to pdf.PdfReader.
    uploaded_file = st.file_uploader("Upload PDF Document", type="pdf")

    if not uploaded_file:
        return

    counts = {"Positive": 0, "Negative": 0, "Neutral": 0}
    for page in pdf.PdfReader(uploaded_file).pages:
        # A page without extractable text may yield None; treat it as empty.
        page_text = page.extract_text() or ""
        # Sentences are approximated by splitting each page on ".".
        for sentence in page_text.split("."):
            sentence = sentence.strip()
            if sentence:
                counts[predict_sentiment(sentence, threshold_positive, threshold_negative)] += 1

    st.write("Positive Sentences:", counts["Positive"])
    st.write("Negative Sentences:", counts["Negative"])
    st.write("Neutral Sentences:", counts["Neutral"])

    # With no sentences at all every share is zero and there is nothing
    # meaningful to normalise into a pie chart.
    if not any(counts.values()):
        st.warning("No sentences could be extracted from this PDF, so no chart is shown.")
        return

    fig, ax = plt.subplots()
    ax.pie(list(counts.values()), labels=list(counts), autopct="%1.1f%%", startangle=90)
    ax.axis("equal")
    ax.set_title("Sentiment Distribution")

    st.pyplot(fig)
    # Release the figure so pyplot does not keep accumulating open figures.
    plt.close(fig)


def main():
    """Render the app: a title, an intro, and the view for the selected mode.

    The mode is chosen from a select box and is one of "Check Similarity",
    "Generate Summary" or "Sentiment Analysis".
    """
    st.title("Text Analysis App")
    st.write("This app checks the similarity between two PDF files using different similarity metrics or generates a summary for a single document or does the sentiment analysis.")
    st.write("Upload PDF files, select an option from the dropdown menu, and proceed accordingly.")

    option = st.selectbox("Select Option", ["Check Similarity", "Generate Summary", "Sentiment Analysis"])

    if option == "Check Similarity":
        _render_similarity_view()
    elif option == "Generate Summary":
        _render_summary_view()
    elif option == "Sentiment Analysis":
        _render_sentiment_view()
            
# Run the app only when this file is executed as the main module.
if __name__ == "__main__":
    main()