Spaces:
Runtime error
Runtime error
| import streamlit as st | |
| import os.path | |
| import pathlib | |
| import pandas as pd | |
| import numpy as np | |
| import PyPDF2 | |
| from PyPDF2 import PdfReader | |
| from os import walk | |
| import nltk | |
| import glob | |
| import plotly.express as px | |
| from wordcloud import WordCloud | |
| import plotly.io as pio | |
| from plotly.subplots import make_subplots | |
| import plotly.graph_objs as go | |
| import pandas as pd | |
| import plotly.offline as pyo | |
| import io | |
| from io import StringIO | |
| #@st.cache_resource() | |
def get_nl():
    """Make sure the NLTK "punkt" sentence-tokenizer data is available.

    This runs at module level, i.e. on every execution of the script, so the
    local NLTK data directories are checked first and a download is attempted
    only when the resource cannot be found there.

    Returns:
        True when the resource is already installed; otherwise whatever
        ``nltk.download('punkt')`` returns.
    """
    # NOTE(review): newer NLTK releases may need the "punkt_tab" resource for
    # sent_tokenize instead -- confirm against the pinned NLTK version.
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        return nltk.download('punkt')
    return True


get_nl()
| from nltk.tokenize import sent_tokenize | |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
| from transformers import pipeline | |
| # if os.path.exists("report.html"): | |
| # os.remove("report.html") | |
| #@st.cache_resource() | |
def get_sentiment_model():
    """Load the tokenizer and classification model for sentiment analysis.

    Returns:
        A ``(tokenizer, model)`` tuple, both created with ``from_pretrained``
        from the "ProsusAI/finbert" checkpoint.
    """
    checkpoint = "ProsusAI/finbert"
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModelForSequenceClassification.from_pretrained(checkpoint),
    )


# Loaded once per script execution and shared by the code further down.
tokenizer_sentiment, model_sentiment = get_sentiment_model()
def get_emotion_model():
    """Load the tokenizer and classification model for emotion analysis.

    Returns:
        A ``(tokenizer, model)`` tuple, both created with ``from_pretrained``
        from the "j-hartmann/emotion-english-distilroberta-base" checkpoint.
    """
    checkpoint = "j-hartmann/emotion-english-distilroberta-base"
    emotion_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    emotion_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    return emotion_tokenizer, emotion_model


# Loaded once per script execution and shared by the code further down.
tokenizer_emotion, model_emotion = get_emotion_model()
def extract_text_from_pdf(path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        path: Passed straight to ``PdfReader`` as its first argument.

    Returns:
        A single string made of each page's ``extract_text()`` result, with
        no separator inserted between pages.
    """
    reader = PdfReader(path)
    pages = reader.pages
    # Page count is printed to stdout, as before.
    print(len(pages))
    # join() avoids re-copying the accumulated text for every page; the
    # ``or ""`` guards against a page whose extraction yields nothing usable.
    return "".join(page.extract_text() or "" for page in pages)
| # Create a button to download the HTML file | |
def download_html():
    """Offer the saved ``report.html`` as a download, then stop the script run.

    Reads ``report.html`` (relative path) while a spinner is shown, renders a
    "Download Report" button carrying that content, and finally calls
    ``st.stop()``.

    Raises:
        FileNotFoundError: If ``report.html`` does not exist (from ``open``).
    """
    # NOTE(review): the source's indentation was lost; only the file read is
    # placed inside the spinner here -- confirm this matches the intent.
    file_name = "report.html"
    mime_type = "text/html"
    with st.spinner('Downloading HTML file...'):
        # Explicit UTF-8 so the read does not depend on the platform default.
        # NOTE(review): assumes the report was written as UTF-8 -- confirm
        # against whatever produces report.html.
        with open(file_name, "r", encoding="utf-8") as f:
            html = f.read()  # the with-block closes the file on exit
    print('download button')
    st.download_button(label="Download Report", data=html, file_name=file_name, mime=mime_type)
    st.stop()
# Name of the file whose report was produced most recently ('' when none).
if 'filename_key' not in st.session_state:
    st.session_state.filename_key = ''

# Page heading, rendered as Markdown.
st.write("\n# Sentiment Analysis Tool\n")

# Multiple files are accepted by the widget so that the script can show its
# own error message when more than one PDF is uploaded.
uploaded_file = st.file_uploader(
    "Choose a PDF file",
    accept_multiple_files=True,
    type=['pdf'],
)
| #if uploaded_file is not None: | |
import re
import textwrap

# Largest page count accepted for analysis.
# NOTE(review): the error message below says "less than 20" while the check
# admits exactly 20 pages -- confirm which one is intended.
MAX_PAGES = 20

# Only sentences whose character length lies in this inclusive range are
# passed to the classifiers.
MIN_SENTENCE_CHARS = 50
MAX_SENTENCE_CHARS = 510

# Words left out of the word cloud.  The original list also held fragments
# such as "quetion" and "cutomer", which only existed because the letter "s"
# was being deleted from the whole text; the other fragments ("wa", "firt",
# "lat", "jut", "thi") are presumably "was", "first", "last", "just", "this".
WORDS_TO_REMOVE = (
    "quarter", "thank", "million", "question", "was", "rate", "first",
    "customer", "business", "last year", "year", "last", "well", "just",
    "this", "will", "think", "higher", "going",
)


def _strip_words(text, words):
    """Delete whole-word, case-insensitive occurrences of *words* from *text*.

    An optional trailing "s" is matched too, so simple plurals are removed
    as well.  Longer entries are tried first so that a phrase such as
    "last year" takes precedence over its single-word prefix.
    """
    ordered = sorted(words, key=len, reverse=True)
    pattern = re.compile(
        r"\b(?:" + "|".join(re.escape(word) for word in ordered) + r")s?\b",
        re.IGNORECASE,
    )
    return pattern.sub("", text)


def _predictions_to_frame(predictions, sentences):
    """Build a DataFrame with ``label``, ``score`` and ``Sentence`` columns.

    Each element of *predictions* may be a single ``{'label', 'score'}`` dict
    or a list of such dicts (what a text-classification pipeline returns when
    ``top_k`` is set); in the latter case the first entry is used.
    """
    rows = [item[0] if isinstance(item, list) else item for item in predictions]
    frame = pd.DataFrame(rows)
    frame['Sentence'] = list(sentences)
    return frame


def _label_table(frame, label, sentence_header):
    """Return the rows of *frame* carrying *label* as a two-column table.

    The table holds ``score`` (rounded to 4 decimals, sorted descending) and
    the sentence column renamed to *sentence_header*.
    """
    table = frame.loc[frame['label'] == label, ['score', 'Sentence']]
    table = table.sort_values('score', ascending=False)
    table['score'] = table['score'].round(4)
    return table.rename(columns={'Sentence': sentence_header})


def _table_trace(table):
    """Return a Plotly table trace showing every column of *table*."""
    return go.Table(
        header=dict(values=list(table.columns), fill_color='lightgray', align='left'),
        cells=dict(values=[table[name] for name in table.columns], fill_color='white', align='left'),
        columnwidth=[1, 4],
    )


def _build_report_figure(df, pos_df, neg_df, neu_df, image, title):
    """Assemble the report figure.

    Layout (26 x 6 grid): a label-count pie, a sentence-count indicator and
    an average-score gauge starting at row 6; the word-cloud image starting
    at row 12 on the left; and the positive / negative / neutral sentence
    tables stacked on the right at rows 12, 17 and 22.

    Args:
        df: Frame with ``label`` and ``score`` columns, one row per sentence.
        pos_df, neg_df, neu_df: Per-label tables to display.
        image: Word-cloud image passed to ``go.Image``.
        title: Text used as the first part of the figure title.
    """
    # Count per label in the same fixed order as `labels`; value_counts()
    # alone is ordered by frequency and would not line up with the labels.
    labels = ['neutral', 'positive', 'negative']
    values = df['label'].value_counts().reindex(labels, fill_value=0).tolist()

    # Signed average: negative-sentence scores count as negative numbers.
    pos_scores = df.loc[df['label'] == 'positive', 'score']
    neg_scores = df.loc[df['label'] == 'negative', 'score']
    signed_scores = pd.concat([-neg_scores, pos_scores])
    average_score = 0.0 if signed_scores.empty else float(signed_scores.mean())

    if average_score < -0.29:
        gauge_title = "Cummulative Sentiment Negative"
    elif average_score < 0.29:
        gauge_title = "Cummulative Sentiment Neutral"
    else:
        gauge_title = "Cummulative Sentiment Positive"

    rows, cols = 26, 6
    specs = [[None] * cols for _ in range(rows)]
    specs[5][0] = {"type": "pie", "rowspan": 6, "colspan": 2}
    specs[5][2] = {"type": "indicator", "rowspan": 6, "colspan": 2}
    specs[5][4] = {"type": "indicator", "rowspan": 6, "colspan": 2}
    specs[11][0] = {"type": "image", "rowspan": 15, "colspan": 3}
    for table_row in (11, 16, 21):
        specs[table_row][3] = {"type": "table", "rowspan": 5, "colspan": 3}
    fig = make_subplots(rows=rows, cols=cols, specs=specs)

    colors = px.colors.diverging.Portland
    fig.add_trace(
        go.Pie(labels=labels, values=values, hole=0.5,
               title='Count by label',
               marker=dict(colors=colors, line=dict(width=2, color='white'))),
        row=6, col=1)
    fig.add_trace(
        go.Indicator(mode="number", value=len(df), title={"text": "Count of Sentence"}),
        row=6, col=3)
    fig.add_trace(
        go.Indicator(
            mode="gauge+number",
            value=average_score,
            domain={'x': [0, 1], 'y': [0, 1]},
            title={'text': gauge_title, 'font': {'size': 16}},
            gauge={
                'axis': {'range': [-1, 1], 'tickwidth': 1, 'tickcolor': "darkblue"},
                'bar': {'color': "darkblue"},
                'steps': [
                    {'range': [-0.29, 0.29], 'color': 'white'},
                    {'range': [0.3, 1], 'color': 'green'},
                    {'range': [-1, -0.3], 'color': 'red'},
                ],
                'threshold': {
                    'line': {'color': "black", 'width': 4},
                    'thickness': 0.75,
                    # NaN when either label has no sentences.
                    'value': abs(pos_scores.mean() - neg_scores.mean()),
                },
            },
        ),
        row=6, col=5)

    fig.add_trace(go.Image(z=image), row=12, col=1)
    fig.update_xaxes(visible=False, row=12, col=1)
    fig.update_yaxes(visible=False, row=12, col=1)

    fig.add_trace(_table_trace(pos_df), row=12, col=4)
    fig.add_trace(_table_trace(neg_df), row=17, col=4)
    fig.add_trace(_table_trace(neu_df), row=22, col=4)

    # Wrap the title at 50 characters, using <br> as the line break.
    wrapped_title = "<br>".join(textwrap.wrap(title, width=50))
    fig.update_layout(
        height=1500,
        showlegend=False,
        title={'text': f"<b>{wrapped_title} - Sentiment Analysis Report</b>",
               'x': 0.5, 'xanchor': 'center', 'font': {'size': 32}},
    )
    return fig


if not uploaded_file:
    # Nothing uploaded: forget which file was last reported on.
    st.session_state.filename_key = ''
else:
    first_file = uploaded_file[0]
    # The PDF is parsed once and the reader reused for text extraction.
    pdf_reader = PyPDF2.PdfReader(first_file)
    num_pages = len(pdf_reader.pages)
    file_name = first_file.name

    if num_pages > MAX_PAGES:
        st.error("Pages in PDF file should be less than 20.")
    elif len(uploaded_file) > 1:
        st.error("Please upload only one PDF file at a time.")
    elif st.session_state.filename_key == file_name:
        # A report for this file name was already produced in this session,
        # so the analysis is not repeated on this run.
        st.write("Report downloaded successfully")
    elif first_file.type != 'application/pdf':
        st.error("Please upload a PDF file.")
    else:
        ############################ 1. Extract text from PDF ############################
        st.write(f"Number of pages in PDF file: {num_pages}")
        text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

        ############################ 2. Sentiment Analysis ############################
        text = text.replace("\n", " ")
        sentences = sent_tokenize(text)
        useful_sentence = [
            sentence for sentence in sentences
            if MIN_SENTENCE_CHARS <= len(sentence) <= MAX_SENTENCE_CHARS
        ]

        if not useful_sentence:
            # E.g. a PDF with no extractable text: there is no title sentence
            # and nothing to classify.
            st.error("No sentences suitable for analysis could be extracted from the PDF file.")
        else:
            title = sentences[0]

            # The tokenizers and models loaded at start-up are handed to the
            # pipelines instead of loading the checkpoints again by name.
            with st.spinner('Performing Sentiment Analysis...'):
                classifier = pipeline("text-classification",
                                      model=model_sentiment,
                                      tokenizer=tokenizer_sentiment)
                output = classifier(useful_sentence)
            with st.spinner('Performing Emotion Analysis...'):
                classifier = pipeline("text-classification",
                                      model=model_emotion,
                                      tokenizer=tokenizer_emotion,
                                      top_k=1)
                output_emotion = classifier(useful_sentence)

            df = _predictions_to_frame(output, useful_sentence)

            ############################ 3. Processing ############################
            ############################ 3.1. Sentiment Analysis ############################
            wordcloud = WordCloud(background_color='white', width=800, height=400).generate(
                _strip_words(text, WORDS_TO_REMOVE))
            image = wordcloud.to_image()

            pos_df = _label_table(df, 'positive', 'Positive Sentences')
            neg_df = _label_table(df, 'negative', 'Negative Sentences')
            neu_df = _label_table(df, 'neutral', 'Neutral Sentences')

            ############################ 3.2. Emotion Analysis ############################
            # These tables are computed but not placed in the figure below.
            df_emotion = _predictions_to_frame(output_emotion, useful_sentence)
            df_joy = _label_table(df_emotion, 'joy', 'Happy Sentences')
            df_sadness = _label_table(df_emotion, 'sadness', 'Sad Sentences')
            df_anger = _label_table(df_emotion, 'anger', 'Angry Sentences')
            df_surprise = _label_table(df_emotion, 'surprise', 'Surprised Sentences')

            ############################ 4. Plotting ############################
            fig = _build_report_figure(df, pos_df, neg_df, neu_df, image, title)

            ############################## 5. Download Report ##############################
            buffer = io.StringIO()
            fig.write_html(buffer, include_plotlyjs='cdn')
            html_bytes = buffer.getvalue().encode()
            st.download_button(
                label='Download Report',
                data=html_bytes,
                file_name='report.html',
                mime='text/html'
            )
            # Remember the file so a later run with the same upload skips the
            # analysis (see the filename_key branch above).
            st.session_state.filename_key = file_name