Spaces:
Runtime error
Runtime error
File size: 7,144 Bytes
25afb5b b9e94ef e18fce5 25afb5b | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 | import PyPDF2 as pdf
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import streamlit as st
import bert_score
from rouge_score import rouge_scorer
from transformers import T5ForConditionalGeneration, T5Tokenizer
from difflib import SequenceMatcher
from nltk.sentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import nltk
# Download the 'vader_lexicon' NLTK resource at import time; the sentiment
# analysis in predict_sentiment() relies on nltk.sentiment.
nltk.download('vader_lexicon')
# Configure the browser tab title and icon for the Streamlit page.
st.set_page_config(page_title="Streamlit Sentiment App", page_icon="static/res/favicon.png")
# Load the pretrained "t5-base" checkpoint and its tokenizer once at module
# level; generate_summary() uses both globals.
model = T5ForConditionalGeneration.from_pretrained("t5-base")
tokenizer = T5Tokenizer.from_pretrained("t5-base")
def extract_text(uploaded_file):
    """Return the concatenated text of every page of an uploaded PDF.

    Args:
        uploaded_file: A file-like object containing PDF data (for example
            the value returned by ``st.file_uploader``), or a falsy value
            when nothing has been uploaded.

    Returns:
        The text of all pages joined in page order, or ``""`` when
        ``uploaded_file`` is falsy.
    """
    if not uploaded_file:
        return ""
    reader = pdf.PdfReader(uploaded_file)
    # A page without an extractable text layer may produce None or "" --
    # presumably depending on the PyPDF2 version; fall back to "" so one such
    # page cannot abort the whole extraction with a TypeError.
    return "".join(page.extract_text() or "" for page in reader.pages)
def calculate_similarity(text1, text2):
    """Return the cosine similarity of the two texts' token-count vectors.

    Both texts are vectorized together with a default ``CountVectorizer`` so
    they share one vocabulary; the result is the off-diagonal entry of the
    2x2 pairwise cosine-similarity matrix.
    """
    count_matrix = CountVectorizer().fit_transform([text1, text2])
    pairwise = cosine_similarity(count_matrix.toarray())
    # Row 0 is text1 and column 1 is text2.
    return pairwise[0][1]
def bert_similarity(text1, text2):
    """Return the BERTScore F1 value for ``text1`` scored against ``text2``.

    Each text is passed as a single-item list, so the F1 result holds one
    value, which is returned as a plain Python number.
    """
    _precision, _recall, f_measure = bert_score.score(
        [text1],
        [text2],
        lang="en",
        verbose=True,
    )
    return f_measure.item()
def rouge_similarity(text1, text2):
    """Return the ROUGE-L F-measure computed by ``rouge_scorer`` for two texts.

    Stemming is enabled on the scorer; ``text1`` and ``text2`` are passed to
    ``score`` in that order.
    """
    rouge_l = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True).score(text1, text2)['rougeL']
    return rouge_l.fmeasure
def highlight_similarity(text1, text2):
    """Return the character runs that ``text1`` and ``text2`` have in common.

    The texts are compared with ``difflib.SequenceMatcher``. For every
    non-empty matching block the result contains the slice taken from
    ``text1`` followed by a newline, then the slice taken from ``text2``
    followed by a blank line.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A single string with all matching runs, or ``""`` when the texts
        share nothing.
    """
    matcher = SequenceMatcher(None, text1, text2)
    fragments = []
    for block in matcher.get_matching_blocks():
        # get_matching_blocks() always terminates with a zero-length sentinel
        # block; emitting it would only append stray blank lines.
        if block.size == 0:
            continue
        fragments.append(text1[block.a:block.a + block.size] + '\n')
        fragments.append(text2[block.b:block.b + block.size] + '\n\n')
    return "".join(fragments)
def generate_summary(text):
    """Summarize ``text`` with the module-level T5 model and tokenizer.

    Args:
        text: The document text to summarize.

    Returns:
        The generated summary as plain text, or ``""`` when ``text`` is
        empty or contains only whitespace.
    """
    # Nothing to summarize: skip the expensive generation call instead of
    # asking the model to summarize an empty prompt.
    if not text or not text.strip():
        return ""
    # Encode the text with the task prefix; input beyond max_length is truncated.
    # NOTE(review): 1000 tokens may exceed the input length t5-base was
    # trained with -- confirm this limit is intended.
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=1000, truncation=True)
    # Generate the summary with beam search.
    outputs = model.generate(inputs, max_length=1000, min_length=100, length_penalty=2.0, num_beams=4, early_stopping=True)
    # Decode the best sequence; skip_special_tokens drops markers such as the
    # padding and end-of-sequence tokens so they are not shown to the user.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
# Shared analyzer instance, created lazily by _get_sentiment_analyzer().
_SENTIMENT_ANALYZER = None


def _get_sentiment_analyzer():
    """Return one shared SentimentIntensityAnalyzer, creating it on first use."""
    global _SENTIMENT_ANALYZER
    if _SENTIMENT_ANALYZER is None:
        _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
    return _SENTIMENT_ANALYZER


def predict_sentiment(text, threshold_positive, threshold_negative):
    """Classify ``text`` as "Positive", "Negative" or "Neutral".

    The decision is based on the "compound" entry of the analyzer's polarity
    scores (treated as 0 when absent).

    Args:
        text: The text to classify.
        threshold_positive: Compound scores greater than or equal to this
            value are "Positive". Converted with ``float()``.
        threshold_negative: Compound scores less than or equal to this value
            are "Negative". Converted with ``float()``.

    Returns:
        "Positive", "Negative" or "Neutral". The positive threshold is
        checked first.

    Raises:
        ValueError or TypeError: If a threshold cannot be converted to float.
    """
    threshold_positive = float(threshold_positive)
    threshold_negative = float(threshold_negative)
    # This function is called once per sentence, so reuse one analyzer rather
    # than constructing a new one on every call.
    compound = _get_sentiment_analyzer().polarity_scores(text).get("compound", 0)
    if compound >= threshold_positive:
        return "Positive"
    if compound <= threshold_negative:
        return "Negative"
    return "Neutral"
def _render_similarity():
    """Render the workflow that compares two uploaded PDF files."""
    uploaded_file1 = st.file_uploader("Choose a PDF file 1", type="pdf")
    uploaded_file2 = st.file_uploader("Choose a PDF file 2", type="pdf")
    st.sidebar.title("Similarity Metrics")
    st.sidebar.write("**Cosine Similarity**:")
    st.sidebar.write("Measures how similar the two documents are based on their content.")
    st.sidebar.write("**BERT Score**:")
    st.sidebar.write("Provides a similarity measure based on contextual embeddings of the documents.")
    st.sidebar.write("**ROUGE Score**:")
    st.sidebar.write("Evaluates the overlap in n-grams between the two documents.")
    similarity_metric = st.selectbox("Select Similarity Metric", ["Cosine Similarity", "BERT Score", "ROUGE Score"])
    # The button is only shown once both files are available.
    if not (uploaded_file1 and uploaded_file2):
        return
    if not st.button("Check Similarity"):
        return
    text1 = extract_text(uploaded_file1)
    text2 = extract_text(uploaded_file2)
    if similarity_metric == "Cosine Similarity":
        similarity = calculate_similarity(text1, text2)
        st.write(f"The similarity between the two files is {similarity:.2f}.")
    elif similarity_metric == "BERT Score":
        bert_similarity_score = bert_similarity(text1, text2)
        st.write(f"The BERT similarity score between the two files is {bert_similarity_score:.2f}.")
    elif similarity_metric == "ROUGE Score":
        rouge_similarity_score = rouge_similarity(text1, text2)
        st.write(f"The ROUGE similarity score between the two files is {rouge_similarity_score:.2f}.")
    st.write("Highlighted Similarity:")
    st.write(highlight_similarity(text1, text2))


def _render_summary():
    """Render the workflow that summarizes a single uploaded PDF file."""
    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
    if uploaded_file and st.button("Generate Summary"):
        text = extract_text(uploaded_file)
        summary = generate_summary(text)
        st.write("Summary:")
        st.write(summary)


def _render_sentiment():
    """Render the per-sentence sentiment workflow for one uploaded PDF file."""
    threshold_positive = st.number_input("Threshold for Positive Sentiment:", value=0.05, step=0.01)
    threshold_negative = st.number_input("Threshold for Negative Sentiment:", value=-0.05, step=0.01)
    # Restrict uploads to PDF like the other modes; the file is handed
    # straight to PdfReader below.
    uploaded_file = st.file_uploader("Upload PDF Document", type="pdf")
    if not uploaded_file:
        return
    pdf_reader = pdf.PdfReader(uploaded_file)
    counts = {"Positive": 0, "Negative": 0, "Neutral": 0}
    for page in pdf_reader.pages:
        # Pages without extractable text may yield None; treat them as empty.
        page_text = page.extract_text() or ""
        # Sentences are approximated by splitting on full stops.
        for sentence in page_text.split("."):
            sentence = sentence.strip()
            if sentence:
                counts[predict_sentiment(sentence, threshold_positive, threshold_negative)] += 1
    st.write("Positive Sentences:", counts["Positive"])
    st.write("Negative Sentences:", counts["Negative"])
    st.write("Neutral Sentences:", counts["Neutral"])
    # A pie chart cannot be normalized when every slice is zero.
    if not any(counts.values()):
        st.write("No sentences were found in the document.")
        return
    fig, ax = plt.subplots()
    ax.pie(list(counts.values()), labels=list(counts), autopct="%1.1f%%", startangle=90)
    ax.axis("equal")
    ax.set_title("Sentiment Distribution")
    st.pyplot(fig)
    # Streamlit re-runs the script on every interaction; close the figure so
    # pyplot does not keep accumulating open figures.
    plt.close(fig)


def main():
    """Render the app: a title, an intro, a mode selector and the chosen mode.

    The selectable modes are "Check Similarity", "Generate Summary" and
    "Sentiment Analysis"; each one is drawn by its own private helper.
    """
    st.title("Text Analysis App")
    st.write("This app checks the similarity between two PDF files using different similarity metrics or generates a summary for a single document or does the sentiment analyis.")
    st.write("Upload PDF files, select an option from the dropdown menu, and proceed accordingly.")
    option = st.selectbox("Select Option", ["Check Similarity", "Generate Summary", "Sentiment Analysis"])
    if option == "Check Similarity":
        _render_similarity()
    elif option == "Generate Summary":
        _render_summary()
    elif option == "Sentiment Analysis":
        _render_sentiment()


if __name__ == "__main__":
    main()
|