Spaces:

manasvikalyan
/

nlp_text_analysis

Runtime error

lively06

commit4

b9e94ef almost 2 years ago

7.14 kB

	import PyPDF2 as pdf
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	import streamlit as st
	import bert_score
	from rouge_score import rouge_scorer
	from transformers import T5ForConditionalGeneration, T5Tokenizer
	from difflib import SequenceMatcher
	from nltk.sentiment import SentimentIntensityAnalyzer
	import matplotlib.pyplot as plt
	import nltk
	# Download the 'vader_lexicon' NLTK resource when the module is executed.
	# NOTE(review): presumably this is the lexicon SentimentIntensityAnalyzer
	# (used in predict_sentiment) needs at runtime — confirm against NLTK docs.
	nltk.download('vader_lexicon')

	# Set the Streamlit page title and icon. This is the first Streamlit call in
	# the module; keep it ahead of any other `st.` call.
	st.set_page_config(page_title="Streamlit Sentiment App", page_icon="static/res/favicon.png")


	# Load the pretrained "t5-base" model and tokenizer at module level;
	# generate_summary() reads both as globals.
	# NOTE(review): this load runs every time the module is executed — consider
	# caching it if start-up time becomes a problem.
	model = T5ForConditionalGeneration.from_pretrained("t5-base")
	tokenizer = T5Tokenizer.from_pretrained("t5-base")

	def extract_text(uploaded_file):
	    """Return the concatenated text of every page in an uploaded PDF.

	    Args:
	        uploaded_file: A file-like object that ``pdf.PdfReader`` can read
	            (for example the value returned by ``st.file_uploader``), or a
	            falsy value when nothing has been uploaded.

	    Returns:
	        The text of all pages joined in page order with no separator, or an
	        empty string when ``uploaded_file`` is falsy.
	    """
	    if not uploaded_file:
	        return ""

	    reader = pdf.PdfReader(uploaded_file)
	    # ``or ""`` is defensive: a page whose extraction yields nothing
	    # contributes an empty string instead of breaking the concatenation.
	    return "".join(page.extract_text() or "" for page in reader.pages)

	def calculate_similarity(text1, text2):
	    """Return the cosine similarity between the word-count vectors of two texts.

	    Both texts are vectorized together with a default ``CountVectorizer`` and
	    the similarity of the two resulting count vectors is returned.

	    Args:
	        text1: First document as a string.
	        text2: Second document as a string.

	    Returns:
	        The similarity as a ``float``. ``0.0`` is returned when either text
	        is blank or when the vectorizer finds no usable tokens at all, since
	        there is nothing to compare in those cases.
	    """
	    # A blank document shares no terms with anything; answer directly rather
	    # than letting the vectorizer fail on an empty vocabulary.
	    if not text1.strip() or not text2.strip():
	        return 0.0

	    try:
	        term_counts = CountVectorizer().fit_transform([text1, text2])
	    except ValueError:
	        # CountVectorizer raises ValueError when no token survives
	        # tokenization (e.g. the texts contain only punctuation).
	        return 0.0

	    vectors = term_counts.toarray()
	    return float(cosine_similarity(vectors)[0][1])

	def bert_similarity(text1, text2):
	    """Return the F1 component of the BERTScore computed for two texts.

	    ``text1`` and ``text2`` are each wrapped in a single-element list and
	    passed, in that order, to ``bert_score.score`` with ``lang="en"`` and
	    verbose output enabled.

	    Args:
	        text1: First document as a string.
	        text2: Second document as a string.

	    Returns:
	        The F1 score for the single pair, converted with ``.item()``.
	    """
	    first_batch = [text1]
	    second_batch = [text2]
	    _precision, _recall, f1_scores = bert_score.score(
	        first_batch, second_batch, lang="en", verbose=True
	    )
	    return f1_scores.item()

	def rouge_similarity(text1, text2):
	    """Return the ROUGE-L F-measure computed for two texts.

	    A ``RougeScorer`` restricted to ``'rougeL'`` with stemming enabled scores
	    ``text1`` against ``text2`` (passed in that order).

	    Args:
	        text1: First document as a string.
	        text2: Second document as a string.

	    Returns:
	        The ``fmeasure`` attribute of the ``'rougeL'`` score.
	    """
	    rouge_l = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True).score(text1, text2)['rougeL']
	    return rouge_l.fmeasure

	def highlight_similarity(text1, text2):
	    """Return the character runs that ``text1`` and ``text2`` have in common.

	    The two strings are compared with ``difflib.SequenceMatcher``. For every
	    non-empty matching block the matched slice of ``text1`` is emitted,
	    followed by a newline, then the matched slice of ``text2`` followed by a
	    blank line.

	    Args:
	        text1: First document as a string.
	        text2: Second document as a string.

	    Returns:
	        A string listing the matching runs in order, or an empty string when
	        the texts share nothing.
	    """
	    matcher = SequenceMatcher(None, text1, text2)

	    pieces = []
	    for match in matcher.get_matching_blocks():
	        # get_matching_blocks() always ends with a zero-size sentinel block;
	        # skip it so it does not add a run of blank lines to the output.
	        if match.size == 0:
	            continue
	        pieces.append(text1[match.a:match.a + match.size] + '\n')
	        pieces.append(text2[match.b:match.b + match.size] + '\n\n')

	    return "".join(pieces)


	def generate_summary(text):
	    """Generate a summary of ``text`` with the module-level T5 model.

	    The text is prefixed with ``"summarize: "``, encoded (truncated to at
	    most 1000 tokens) and passed to ``model.generate`` using beam search
	    with 4 beams and an output length between 100 and 1000 tokens.

	    Args:
	        text: The document to summarize, as a string.

	    Returns:
	        The decoded summary string, without special tokens.
	    """
	    # Encode the prompt; inputs longer than max_length are truncated.
	    input_ids = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=1000, truncation=True)

	    output_ids = model.generate(input_ids, max_length=1000, min_length=100, length_penalty=2.0, num_beams=4, early_stopping=True)

	    # Drop special tokens (padding / end-of-sequence markers) so they do not
	    # show up in the text presented to the user.
	    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


	# Lazily created analyzer shared by all predict_sentiment() calls.
	_SENTIMENT_ANALYZER = None


	def _get_sentiment_analyzer():
	    """Return the shared SentimentIntensityAnalyzer, creating it on first use."""
	    global _SENTIMENT_ANALYZER
	    if _SENTIMENT_ANALYZER is None:
	        _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
	    return _SENTIMENT_ANALYZER


	def predict_sentiment(text, threshold_positive, threshold_negative):
	    """Classify ``text`` as "Positive", "Negative" or "Neutral".

	    The ``"compound"`` value of the analyzer's polarity scores (0 if the key
	    is absent) is compared against the two thresholds.

	    Args:
	        text: The text to classify.
	        threshold_positive: Compound scores greater than or equal to this
	            value are "Positive". Converted with ``float()``.
	        threshold_negative: Compound scores less than or equal to this value
	            (and below ``threshold_positive``) are "Negative". Converted
	            with ``float()``.

	    Returns:
	        One of the strings ``"Positive"``, ``"Negative"`` or ``"Neutral"``.

	    Raises:
	        ValueError: If a threshold cannot be converted to ``float``.
	    """
	    # Reuse one analyzer: callers invoke this once per sentence, and building
	    # a fresh analyzer for every call is wasted work.
	    compound = _get_sentiment_analyzer().polarity_scores(text).get("compound", 0)

	    threshold_positive = float(threshold_positive)
	    threshold_negative = float(threshold_negative)

	    if compound >= threshold_positive:
	        return "Positive"
	    if compound <= threshold_negative:
	        return "Negative"
	    return "Neutral"


	def _render_similarity_option():
	    """Render the "Check Similarity" view: two uploads, a metric picker and the result."""
	    uploaded_file1 = st.file_uploader("Choose a PDF file 1", type="pdf")
	    uploaded_file2 = st.file_uploader("Choose a PDF file 2", type="pdf")

	    st.sidebar.title("Similarity Metrics")
	    st.sidebar.write("Cosine Similarity:")
	    st.sidebar.write("Measures how similar the two documents are based on their content.")
	    st.sidebar.write("BERT Score:")
	    st.sidebar.write("Provides a similarity measure based on contextual embeddings of the documents.")
	    st.sidebar.write("ROUGE Score:")
	    st.sidebar.write("Evaluates the overlap in n-grams between the two documents.")

	    similarity_metric = st.selectbox("Select Similarity Metric", ["Cosine Similarity", "BERT Score", "ROUGE Score"])

	    # The button is only shown once both files are present.
	    if not (uploaded_file1 and uploaded_file2):
	        return
	    if not st.button("Check Similarity"):
	        return

	    text1 = extract_text(uploaded_file1)
	    text2 = extract_text(uploaded_file2)

	    if similarity_metric == "Cosine Similarity":
	        similarity = calculate_similarity(text1, text2)
	        st.write(f"The similarity between the two files is {similarity:.2f}.")
	    elif similarity_metric == "BERT Score":
	        bert_similarity_score = bert_similarity(text1, text2)
	        st.write(f"The BERT similarity score between the two files is {bert_similarity_score:.2f}.")
	    elif similarity_metric == "ROUGE Score":
	        rouge_similarity_score = rouge_similarity(text1, text2)
	        st.write(f"The ROUGE similarity score between the two files is {rouge_similarity_score:.2f}.")

	    st.write("Highlighted Similarity:")
	    st.write(highlight_similarity(text1, text2))


	def _render_summary_option():
	    """Render the "Generate Summary" view: one upload and the generated summary."""
	    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
	    # Short-circuit keeps the button hidden until a file has been uploaded.
	    if uploaded_file and st.button("Generate Summary"):
	        text = extract_text(uploaded_file)
	        summary = generate_summary(text)
	        st.write("Summary:")
	        st.write(summary)


	def _render_sentiment_option():
	    """Render the "Sentiment Analysis" view: thresholds, one upload, counts and a pie chart."""
	    threshold_positive = st.number_input("Threshold for Positive Sentiment:", value=0.05, step=0.01)
	    threshold_negative = st.number_input("Threshold for Negative Sentiment:", value=-0.05, step=0.01)
	    # Restrict the uploader to PDFs, like the other uploaders in this app,
	    # because the file is handed straight to the PDF reader.
	    uploaded_file = st.file_uploader("Upload PDF Document", type="pdf")

	    if not uploaded_file:
	        return

	    counts = {"Positive": 0, "Negative": 0, "Neutral": 0}
	    for page in pdf.PdfReader(uploaded_file).pages:
	        # Defensive: treat a page with no extractable text as empty.
	        page_text = page.extract_text() or ""
	        # Sentences are approximated by splitting on "."; blanks are skipped.
	        for sentence in page_text.split("."):
	            sentence = sentence.strip()
	            if not sentence:
	                continue
	            sentiment = predict_sentiment(sentence, threshold_positive, threshold_negative)
	            # Anything that is not Positive/Negative is counted as Neutral.
	            counts[sentiment if sentiment in counts else "Neutral"] += 1

	    st.write("Positive Sentences:", counts["Positive"])
	    st.write("Negative Sentences:", counts["Negative"])
	    st.write("Neutral Sentences:", counts["Neutral"])

	    # With no sentences every slice is zero and the pie fractions are
	    # undefined (0 / 0), so report that instead of plotting.
	    if not any(counts.values()):
	        st.warning("No sentences with extractable text were found, so there is no sentiment distribution to plot.")
	        return

	    labels = list(counts)
	    sizes = list(counts.values())

	    fig, ax = plt.subplots()
	    ax.pie(sizes, labels=labels, autopct="%1.1f%%", startangle=90)
	    ax.axis("equal")
	    ax.set_title("Sentiment Distribution")

	    st.pyplot(fig)
	    # pyplot keeps a reference to every figure it creates; close this one so
	    # figures do not accumulate across script runs.
	    plt.close(fig)


	def main():
	    """Render the app: title, intro text, an option picker and the selected view.

	    The selectbox offers "Check Similarity", "Generate Summary" and
	    "Sentiment Analysis"; each choice is rendered by its own helper.
	    """
	    st.title("Text Analysis App")
	    st.write("This app checks the similarity between two PDF files using different similarity metrics or generates a summary for a single document or does the sentiment analyis.")
	    st.write("Upload PDF files, select an option from the dropdown menu, and proceed accordingly.")

	    option = st.selectbox("Select Option", ["Check Similarity", "Generate Summary", "Sentiment Analysis"])

	    if option == "Check Similarity":
	        _render_similarity_option()
	    elif option == "Generate Summary":
	        _render_summary_option()
	    elif option == "Sentiment Analysis":
	        _render_sentiment_option()


	# Run the app only when this file is executed as the main module.
	if __name__ == "__main__":
	    main()