Spaces:
Runtime error
Runtime error
Upload 2 files
Browse files
- app.py +146 -0
- requirements.txt +6 -0
app.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
import torch
|
| 6 |
+
import io
|
| 7 |
+
import base64
|
| 8 |
+
from stqdm import stqdm
|
| 9 |
+
|
| 10 |
+
# Page configuration has to be the first Streamlit command of the script, so
# it runs before the cached loader below can render anything.
st.set_page_config(layout="wide")

# Define the model and tokenizer
model_name = 'nlptown/bert-base-multilingual-uncased-sentiment'


@st.cache_resource
def _load_model_and_tokenizer(name):
    """Load the sequence-classification model and its tokenizer for *name*.

    Wrapped in ``st.cache_resource`` so the loaded objects are reused across
    script reruns instead of being loaded again on every widget interaction.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    loaded_model = AutoModelForSequenceClassification.from_pretrained(name)
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    return loaded_model, loaded_tokenizer


# Module-level names kept as-is: classify_reviews reads them as globals.
model, tokenizer = _load_model_and_tokenizer(model_name)
|
| 15 |
+
|
| 16 |
+
#defs
|
| 17 |
+
def classify_reviews(reviews):
    """Return per-class softmax probabilities for a batch of review texts.

    Args:
        reviews: List of review strings, classified in a single forward pass
            through the module-level ``model`` / ``tokenizer``.

    Returns:
        A list with one entry per review; each entry is the list of class
        probabilities obtained from a softmax over the model's logits.
        An empty ``reviews`` yields an empty list.
    """
    if not reviews:
        # Nothing to tokenize; skip the model call entirely.
        return []

    inputs = tokenizer(reviews, return_tensors='pt', truncation=True, padding=True, max_length=512)
    # Inference only: disable autograd so no graph is retained for the batch.
    with torch.no_grad():
        outputs = model(**inputs)
    return F.softmax(outputs.logits, dim=1).tolist()
|
| 22 |
+
|
| 23 |
+
def top_rating(scores):
    """Return the 1-based position of the largest value in *scores*.

    Ties resolve to the earliest position, because ``list.index`` returns the
    first match. An empty *scores* raises ``ValueError`` (from ``max``).
    """
    return scores.index(max(scores)) + 1
|
| 25 |
+
|
| 26 |
+
def top_prob(scores):
    """Return the largest value in *scores*.

    An empty *scores* raises ``ValueError`` (from ``max``).
    """
    return max(scores)
|
| 28 |
+
|
| 29 |
+
def get_table_download_link(df):
    """Return an HTML anchor that embeds *df* as a downloadable CSV.

    The frame is serialized without its index, base64-encoded, and placed in
    a ``data:`` URI so the link works without any server-side file.

    Args:
        df: DataFrame to serialize.

    Returns:
        The ``<a>`` element as a string.
    """
    csv_text = df.to_csv(index=False)
    b64 = base64.b64encode(csv_text.encode()).decode()
    return f'<a href="data:file/csv;base64,{b64}" download="data.csv">Download csv file</a>'
|
| 33 |
+
|
| 34 |
+
def main():
    """Render the app: upload an Excel file, pick a column, run the analysis.

    The selected column is cast to ``str`` before classification. Any failure
    while reading the upload or preparing the column is reported on the page
    (message plus exception details) and ends this run of the script.
    """
    st.title('Sentiment Analysis')
    st.markdown('Upload an Excel file and select a column to get sentiment analysis.')

    uploaded_file = st.file_uploader("Upload an excel file", type=['xlsx'])
    review_column = None
    df = None

    if uploaded_file is not None:
        try:
            df = pd.read_excel(uploaded_file)
            review_column = st.selectbox('Select the reviews column', df.columns)
            df[review_column] = df[review_column].astype(str)
        except Exception as e:
            # Top-level UI boundary: report the failure instead of discarding
            # it, so a bad upload can actually be diagnosed.
            st.error("An error occurred while reading the uploaded file. Please make sure it's a valid Excel file.")
            st.exception(e)
            return

    start_button = st.button('Start Analysis')

    if start_button and df is not None:
        if review_column in df.columns:
            with st.spinner('Performing sentiment analysis...'):
                df, df_display = process_reviews(df, review_column)

            display_ratings(df)
            display_dataframe(df, df_display)
        else:
            st.write(f'No column named "{review_column}" found in the uploaded file.')
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def process_reviews(df, review_column, batch_size=50):
    """Classify every review in *df* and return numeric and display frames.

    Reviews are sent to ``classify_reviews`` in batches while a progress bar
    is updated. The input frame is not modified; results go into a copy.

    Args:
        df: Source DataFrame.
        review_column: Name of the column holding the review texts.
        batch_size: Number of reviews per ``classify_reviews`` call.

    Returns:
        A ``(df_new, df_display)`` tuple. Both have the review column first,
        then the generated score columns, then the remaining original
        columns. ``df_display`` is the output of ``scores_to_percent``.
    """
    # Single source of truth for the generated columns and their order.
    score_columns = ['Weighted Rating', 'Rating', 'Probability',
                     '1 Star', '2 Star', '3 Star', '4 Star', '5 Star']

    reviews = df[review_column].tolist()
    total_reviews = len(reviews)
    raw_scores = []

    with st.spinner('Classifying reviews...'):
        progress_bar = st.progress(0)
        for start in range(0, total_reviews, batch_size):
            raw_scores.extend(classify_reviews(reviews[start:start + batch_size]))
            # The last batch may be short, so cap the processed count.
            processed = min(start + batch_size, total_reviews)
            progress_bar.progress(processed / total_reviews)

    df_new = df.copy()
    df_new['raw_scores'] = raw_scores
    scores_to_df(df_new)  # adds the score columns in place, drops raw_scores
    df_display = scores_to_percent(df_new.copy())

    # Original columns that are neither the review column nor generated ones.
    excluded = [review_column, 'raw_scores', *score_columns]
    remaining_columns = [col for col in df.columns if col not in excluded]

    # Selected column first, created columns next, then the remaining columns.
    ordered_columns = [review_column, *score_columns, *remaining_columns]
    return df_new[ordered_columns], df_display[ordered_columns]
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def scores_to_df(df):
    """Expand the ``raw_scores`` column of *df* into rating columns, in place.

    Adds ``'1 Star'`` .. ``'5 Star'`` (per-class score, rounded to 2 places),
    ``'Rating'`` (via ``top_rating``), ``'Probability'`` (via ``top_prob``,
    rounded to 2 places) and ``'Weighted Rating'``, then drops ``raw_scores``.

    Args:
        df: DataFrame whose ``raw_scores`` column holds, per row, a sequence
            of at least five scores. Mutated in place; nothing is returned.
    """
    raw = df['raw_scores']

    for star in range(1, 6):
        # Bind the index as a default so the lambda never depends on the
        # loop variable's later value.
        df[f'{star} Star'] = raw.apply(lambda scores, idx=star - 1: scores[idx]).round(2)

    df['Rating'] = raw.apply(top_rating)
    df['Probability'] = raw.apply(top_prob).round(2)

    # Compute the Weighted Rating from the unrounded scores: summing the
    # already-rounded star columns would compound their rounding error.
    df['Weighted Rating'] = raw.apply(
        lambda scores: sum(p * star for star, p in enumerate(scores[:5], start=1))
    ).round(2)

    df.drop(columns=['raw_scores'], inplace=True)
|
| 107 |
+
|
| 108 |
+
def scores_to_percent(df):
    """Format the star and probability columns of *df* as percent strings.

    Each value ``x`` in ``'1 Star'`` .. ``'5 Star'`` and ``'Probability'`` is
    replaced with ``f'{x*100:.0f}%'``. Other columns are left untouched.

    Args:
        df: DataFrame containing those six columns. Mutated in place.

    Returns:
        The same DataFrame object, for convenient chaining.
    """
    def _as_percent(x):
        return f'{x*100:.0f}%'

    percent_columns = [f'{star} Star' for star in range(1, 6)] + ['Probability']
    for column in percent_columns:
        df[column] = df[column].apply(_as_percent)

    return df
|
| 115 |
+
|
| 116 |
+
def convert_df_to_csv(df):
    """Serialize *df* to CSV (without the index) and return UTF-8 bytes."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')
|
| 118 |
+
|
| 119 |
+
def display_dataframe(df, df_display):
    """Show a CSV download button for *df* and render *df_display* as a table.

    Args:
        df: DataFrame serialized (via ``convert_df_to_csv``) for the download.
        df_display: DataFrame rendered on the page.
    """
    csv_bytes = convert_df_to_csv(df)

    # Nine equal-width columns are created but only the first is used --
    # presumably to keep the download button narrow on the wide layout.
    button_column = st.columns(9)[0]

    with button_column:
        st.download_button(
            "Download CSV",
            csv_bytes,
            "data.csv",
            "text/csv",
            key='download-csv',
        )

    st.dataframe(df_display)
|
| 134 |
+
|
| 135 |
+
def display_ratings(df):
    """Show, in five side-by-side columns, how many rows got each rating.

    For each rating 1..5 the count of rows whose ``'Rating'`` equals it is
    rendered as a heading, followed by that many star emoji.

    Args:
        df: DataFrame with a ``'Rating'`` column.
    """
    cols = st.columns(5)

    for rating, column in enumerate(cols, start=1):
        rating_counts = df[df['Rating'] == rating].shape[0]
        column.markdown(f"### {rating_counts}")
        column.markdown(f"{'⭐' * rating}")
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
# Run the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
pandas
|
| 3 |
+
transformers
|
| 4 |
+
torch
|
| 5 |
+
stqdm
|
| 6 |
+
openpyxl
|