lively06 committed on
Commit
25afb5b
·
1 Parent(s): b7b7c2d
Files changed (3) hide show
  1. .streamlit/config.toml +6 -0
  2. main.py +174 -0
  3. requirements.txt +10 -0
.streamlit/config.toml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ [theme]
2
+ primaryColor="#aa4bff"
3
+ backgroundColor="#1e5630"
4
+ secondaryBackgroundColor="#8e8947"
5
+ textColor="#ffffff"
6
+ font="serif"
main.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2 as pdf
2
+ from sklearn.feature_extraction.text import CountVectorizer
3
+ from sklearn.metrics.pairwise import cosine_similarity
4
+ import streamlit as st
5
+ import bert_score
6
+ from rouge_score import rouge_scorer
7
+ from transformers import T5ForConditionalGeneration, T5Tokenizer
8
+ from difflib import SequenceMatcher
9
+ from nltk.sentiment import SentimentIntensityAnalyzer
10
+ import matplotlib.pyplot as plt
11
+
12
# Page metadata; kept ahead of every other Streamlit call in this script.
st.set_page_config(page_title="Streamlit Sentiment App", page_icon="static/res/favicon.png")


@st.cache_resource
def _load_t5(checkpoint="t5-base"):
    """Load the T5 model and tokenizer for ``checkpoint`` and return them as a pair.

    Wrapped in ``st.cache_resource`` so the weights are loaded once and reused,
    instead of being re-created each time Streamlit re-executes this script.
    """
    return (
        T5ForConditionalGeneration.from_pretrained(checkpoint),
        T5Tokenizer.from_pretrained(checkpoint),
    )


# Initialize the model and tokenizer
model, tokenizer = _load_t5()
18
+
19
def extract_text(uploaded_file):
    """Return the text of every page of an uploaded PDF, joined in page order.

    Args:
        uploaded_file: Object handed to ``pdf.PdfReader``. A falsy value
            (for example ``None``) means nothing was uploaded.

    Returns:
        The concatenated page texts, or ``""`` when no file was given.
    """
    if not uploaded_file:
        return ""

    reader = pdf.PdfReader(uploaded_file)
    # Defensive: a page that yields no text (None or "") contributes nothing
    # instead of breaking the concatenation.
    return "".join(page.extract_text() or "" for page in reader.pages)
26
+
27
def calculate_similarity(text1, text2):
    """Return the cosine similarity between the word-count vectors of two texts.

    Args:
        text1: First document as a string.
        text2: Second document as a string.

    Returns:
        The similarity as a float. ``0.0`` is returned when either text is
        blank or when no vocabulary can be built from the two texts.
    """
    # A blank document shares no tokens with anything; answer directly rather
    # than letting the vectorizer fail on an empty vocabulary.
    if not text1.strip() or not text2.strip():
        return 0.0

    try:
        counts = CountVectorizer().fit_transform([text1, text2])
    except ValueError:
        # Raised when the vectorizer ends up with no vocabulary at all
        # (e.g. the texts contain nothing it accepts as a token).
        return 0.0

    # Row 0 vs. row 1 of the pairwise matrix is the text1/text2 similarity.
    return float(cosine_similarity(counts)[0][1])
31
+
32
def bert_similarity(text1, text2):
    """Return the F1 component of ``bert_score.score`` for ``text1`` vs. ``text2``.

    Each text is wrapped in a one-element list and scored with ``lang="en"``
    and ``verbose=True``; the single F1 entry is unwrapped with ``.item()``.
    """
    first_batch = [text1]
    second_batch = [text2]
    _precision, _recall, f1 = bert_score.score(
        first_batch, second_batch, lang="en", verbose=True
    )
    return f1.item()
35
+
36
def rouge_similarity(text1, text2):
    """Return the ROUGE-L F-measure that ``RougeScorer`` reports for the two texts.

    The scorer is built with stemming enabled and asked for ``rougeL`` only.
    """
    rouge_l = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True).score(text1, text2)['rougeL']
    return rouge_l.fmeasure
40
+
41
def highlight_similarity(text1, text2):
    """List the character runs that ``text1`` and ``text2`` have in common.

    Matching blocks are found with ``difflib.SequenceMatcher``. For every
    non-empty block the matching slice of ``text1`` and the matching slice of
    ``text2`` are emitted on consecutive lines, followed by a blank line.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The formatted matches as one string; ``""`` when nothing matches.
    """
    blocks = SequenceMatcher(None, text1, text2).get_matching_blocks()

    pieces = []
    for block in blocks:
        # The block list always ends with a zero-length sentinel; skipping
        # empty blocks keeps it from producing a stray blank entry.
        if not block.size:
            continue
        from_first = text1[block.a:block.a + block.size]
        from_second = text2[block.b:block.b + block.size]
        pieces.append(f"{from_first}\n{from_second}\n\n")

    return "".join(pieces)
56
+
57
+
58
def generate_summary(text):
    """Summarize ``text`` with the module-level T5 ``model`` and ``tokenizer``.

    Args:
        text: Document text to summarize.

    Returns:
        The decoded summary with special tokens removed, or ``""`` when
        ``text`` is empty or whitespace-only.
    """
    # Nothing to summarize (e.g. a PDF with no extractable text): skip the
    # model call instead of generating from the bare prompt.
    if not text or not text.strip():
        return ""

    # Encode the prompt, truncating the input to at most 1000 tokens.
    inputs = tokenizer.encode(
        "summarize: " + text,
        return_tensors="pt",
        max_length=1000,
        truncation=True,
    )

    # Beam search (4 beams) for a summary of 100 to 1000 tokens.
    outputs = model.generate(
        inputs,
        max_length=1000,
        min_length=100,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )

    # Drop special tokens so markers such as padding / end-of-sequence do not
    # show up in the text presented to the user.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
69
+
70
+
71
# Shared analyzer instance, created on first use by _get_sentiment_analyzer().
_sentiment_analyzer = None


def _get_sentiment_analyzer():
    """Return the shared ``SentimentIntensityAnalyzer``, creating it on first use."""
    global _sentiment_analyzer
    if _sentiment_analyzer is None:
        try:
            _sentiment_analyzer = SentimentIntensityAnalyzer()
        except LookupError:
            # The VADER lexicon resource is not installed yet: fetch it once
            # and retry.
            import nltk

            nltk.download("vader_lexicon", quiet=True)
            _sentiment_analyzer = SentimentIntensityAnalyzer()
    return _sentiment_analyzer


def predict_sentiment(text, threshold_positive, threshold_negative):
    """Classify ``text`` as "Positive", "Negative" or "Neutral".

    The label is chosen from the ``compound`` entry of the analyzer's
    polarity scores (treated as 0 when absent).

    Args:
        text: Text to classify.
        threshold_positive: Compound scores at or above this are "Positive".
        threshold_negative: Compound scores at or below this are "Negative"
            (checked only after the positive test fails).

    Returns:
        One of ``"Positive"``, ``"Negative"`` or ``"Neutral"``.

    Raises:
        ValueError: If a threshold cannot be converted to ``float``.
    """
    threshold_positive = float(threshold_positive)
    threshold_negative = float(threshold_negative)

    # Reuse one analyzer instead of constructing a new one for every call.
    compound = _get_sentiment_analyzer().polarity_scores(text).get("compound", 0)

    if compound >= threshold_positive:
        return "Positive"
    if compound <= threshold_negative:
        return "Negative"
    return "Neutral"
84
+
85
+
86
def _show_similarity():
    """Render the "Check Similarity" page: two uploads, a metric picker, the result."""
    uploaded_file1 = st.file_uploader("Choose a PDF file 1", type="pdf")
    uploaded_file2 = st.file_uploader("Choose a PDF file 2", type="pdf")

    st.sidebar.title("Similarity Metrics")
    st.sidebar.write("**Cosine Similarity**:")
    st.sidebar.write("Measures how similar the two documents are based on their content.")
    st.sidebar.write("**BERT Score**:")
    st.sidebar.write("Provides a similarity measure based on contextual embeddings of the documents.")
    st.sidebar.write("**ROUGE Score**:")
    st.sidebar.write("Evaluates the overlap in n-grams between the two documents.")

    similarity_metric = st.selectbox("Select Similarity Metric", ["Cosine Similarity", "BERT Score", "ROUGE Score"])

    # The button is only offered once both files are present.
    if not (uploaded_file1 and uploaded_file2):
        return
    if not st.button("Check Similarity"):
        return

    text1 = extract_text(uploaded_file1)
    text2 = extract_text(uploaded_file2)

    # Without text on both sides there is nothing to compare.
    if not text1.strip() or not text2.strip():
        st.warning("No extractable text was found in at least one of the PDF files.")
        return

    if similarity_metric == "Cosine Similarity":
        similarity = calculate_similarity(text1, text2)
        st.write(f"The similarity between the two files is {similarity:.2f}.")
    elif similarity_metric == "BERT Score":
        bert_similarity_score = bert_similarity(text1, text2)
        st.write(f"The BERT similarity score between the two files is {bert_similarity_score:.2f}.")
    elif similarity_metric == "ROUGE Score":
        rouge_similarity_score = rouge_similarity(text1, text2)
        st.write(f"The ROUGE similarity score between the two files is {rouge_similarity_score:.2f}.")

    st.write("Highlighted Similarity:")
    st.write(highlight_similarity(text1, text2))


def _show_summary():
    """Render the "Generate Summary" page: one upload and its generated summary."""
    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
    if not uploaded_file:
        return
    if not st.button("Generate Summary"):
        return

    text = extract_text(uploaded_file)
    if not text.strip():
        st.warning("No extractable text was found in the PDF file.")
        return

    st.write("Summary:")
    st.write(generate_summary(text))


def _show_sentiment():
    """Render the "Sentiment Analysis" page: per-sentence counts and a pie chart."""
    threshold_positive = st.number_input("Threshold for Positive Sentiment:", value=0.05, step=0.01)
    threshold_negative = st.number_input("Threshold for Negative Sentiment:", value=-0.05, step=0.01)
    # Restricted to PDFs like the other uploaders, since the file goes
    # straight into pdf.PdfReader.
    uploaded_file = st.file_uploader("Upload PDF Document", type="pdf")
    if not uploaded_file:
        return

    # Insertion order doubles as the label order of the chart.
    counts = {"Positive": 0, "Negative": 0, "Neutral": 0}
    for page in pdf.PdfReader(uploaded_file).pages:
        # A page that yields no text contributes no sentences.
        page_text = page.extract_text() or ""
        for sentence in page_text.split("."):
            sentence = sentence.strip()
            if sentence:
                counts[predict_sentiment(sentence, threshold_positive, threshold_negative)] += 1

    st.write("Positive Sentences:", counts["Positive"])
    st.write("Negative Sentences:", counts["Negative"])
    st.write("Neutral Sentences:", counts["Neutral"])

    # With every count at zero the slice fractions would be 0/0, so there is
    # no chart to draw.
    if not any(counts.values()):
        st.warning("No sentences could be extracted from the PDF file.")
        return

    fig, ax = plt.subplots()
    try:
        ax.pie(list(counts.values()), labels=list(counts), autopct="%1.1f%%", startangle=90)
        ax.axis("equal")
        ax.set_title("Sentiment Distribution")
        st.pyplot(fig)
    finally:
        # pyplot keeps a reference to every figure it creates; release this
        # one so figures do not accumulate across reruns.
        plt.close(fig)


def main():
    """Render the app: intro text, a mode selector, and the page for the chosen mode."""
    st.title("Text Analysis App")
    st.write("This app checks the similarity between two PDF files using different similarity metrics or generates a summary for a single document or does the sentiment analyis.")
    st.write("Upload PDF files, select an option from the dropdown menu, and proceed accordingly.")

    # Option label -> function that renders that page; the key order is the
    # order shown in the dropdown.
    pages = {
        "Check Similarity": _show_similarity,
        "Generate Summary": _show_summary,
        "Sentiment Analysis": _show_sentiment,
    }
    option = st.selectbox("Select Option", list(pages))
    pages[option]()


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ transformers
3
+ flask
4
+ scikit-learn
5
+ PyPDF2
6
+ bert_score
7
+ rouge_score
8
+ nltk
9
+ matplotlib
10
+ sentencepiece