D3MI4N-KDS committed on
Commit
493aaf9
·
verified ·
1 Parent(s): b8ed3bd

Upload streamlit_app.py

Browse files

This app uses Balearia review data from three different sources (Google Reviews, Trustpilot, and VIS).
It shows a topic analysis of all sources over time.
The topic analysis consists mainly of:
- Evolution of review ratings (stars) by topic.
- Overall distribution of review ratings (stars) by topic.
- Word clouds of the terms used in positive and negative reviews.

Files changed (1) hide show
  1. streamlit_app.py +255 -0
streamlit_app.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+ import seaborn as sns
5
+ import numpy as np
6
+ from scipy.stats import linregress
7
+ from datetime import datetime
8
+ from wordcloud import WordCloud
9
+ from nltk.corpus import stopwords
10
+ from nltk.tokenize import word_tokenize
11
+ from io import BytesIO
12
+ from PIL import Image
13
+
14
# Set page configuration for wider layout: layout="wide" lets the page
# content use the full browser width instead of a centered column.
st.set_page_config(layout="wide")
16
+
17
+ # Load data using st.cache_data
18
@st.cache_data
def load_data():
    """Load the categorized review CSV and normalize its columns.

    Reads ``Balearia/outputs/balearia_categorized_agg_wdates.csv``, converts
    the ``date`` column (format ``%m/%d/%y``) to ``datetime.date`` objects,
    drops rows whose ``gpt_topics`` is missing, and parses every string in
    ``gpt_topics`` into the Python literal it spells (presumably a list of
    topic names; verify against the CSV).

    Returns:
        pandas.DataFrame: the cleaned review data.

    Raises:
        ValueError, SyntaxError: if a ``gpt_topics`` cell is not a valid
            Python literal.
    """
    # Imported locally so this function does not depend on the module-level
    # import list being updated.
    import ast

    df = pd.read_csv("Balearia/outputs/balearia_categorized_agg_wdates.csv")

    # Convert string to datetime with explicit format, keeping only the date
    # part so values compare directly with the date slider bounds.
    df['date'] = pd.to_datetime(df['date'], format='%m/%d/%y').dt.date

    # Rows without topics cannot contribute to any per-topic metric.
    df = df.dropna(subset=['gpt_topics'])

    # literal_eval only accepts Python literals, so a crafted CSV cell cannot
    # execute arbitrary code the way eval() would.
    df['gpt_topics'] = df['gpt_topics'].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )

    return df
32
+
33
+ # Function to explode list columns and retain original index
34
def explode_and_retain_index(df, col_to_explode):
    """Return ``df`` with one row per element of a list-like column.

    Thin wrapper around ``DataFrame.explode``: every element of
    ``col_to_explode`` gets its own row, and each new row keeps the index
    label of the row it came from. The input frame is not modified.

    Args:
        df: DataFrame to expand.
        col_to_explode: name of the list-like column to expand.

    Returns:
        pandas.DataFrame: the exploded frame.
    """
    return df.explode(col_to_explode)
37
+
38
+ # Function to calculate metrics
39
@st.cache_data
def calculate_metrics(df):
    """Compute per-topic review counts and per-day average ratings.

    Args:
        df: DataFrame with ``gpt_topics`` (list-like), ``date`` and
            ``review`` columns.

    Returns:
        tuple: ``(topic_counts, avg_reviews)`` where ``topic_counts`` has the
        columns ``Topic`` and ``count`` (ordered as ``value_counts`` returns
        them), and ``avg_reviews`` has ``date``, ``gpt_topics`` and the mean
        ``review`` for each (date, topic) pair.
    """
    # One row per (review, topic) pair.
    per_topic = explode_and_retain_index(df, 'gpt_topics')

    # Number of rows mentioning each topic.
    topic_counts = (
        per_topic['gpt_topics']
        .value_counts()
        .rename_axis('Topic')
        .reset_index(name='count')
    )

    # Mean rating for every (date, topic) combination.
    grouped = per_topic.groupby(['date', 'gpt_topics'])['review']
    avg_reviews = grouped.mean().reset_index()

    return topic_counts, avg_reviews
52
+
53
+ # Function to plot line chart
54
def plot_line_chart(data, ax):
    """Draw the rating-over-time line chart, with a trend line, on ``ax``.

    Args:
        data: DataFrame with ``date`` and ``review`` columns. NOTE: the
            ``review`` column is rounded to whole stars IN PLACE; this side
            effect is kept for backward compatibility because callers may
            reuse the same frame afterwards.
        ax: Matplotlib axes to draw on.

    When ``data`` is empty a placeholder message is shown instead of a chart.
    The trend line is fitted against the row position (0, 1, 2, ...), not the
    actual spacing between dates, and is only drawn when there are at least
    two points.
    """
    # Round average reviews to the nearest whole number (mutates ``data``).
    data['review'] = data['review'].round().astype(int)

    if data.empty:
        # Axes coordinates keep the message centered whatever the data limits.
        ax.text(0.5, 0.5, 'No data available for the selected date range',
                horizontalalignment='center', verticalalignment='center',
                fontsize=12, color='gray', transform=ax.transAxes)
        ax.axis('off')  # Hide the axes if no data is available
        return

    sns.lineplot(data=data, x='date', y='review', marker='o', ax=ax)

    # No y-axis label; larger tick labels for readability.
    ax.set_ylabel('')
    ax.tick_params(axis='y', labelsize=14)

    # Horizontal dotted guide line for each star rating.
    for star in range(1, 6):
        ax.axhline(y=star, color='gray', linestyle=':', linewidth=0.5)

    # A regression through a single point is undefined (NaN slope plus
    # runtime warnings), so only fit a trend line with two or more points.
    if len(data) >= 2:
        positions = np.arange(len(data))
        fit = linregress(positions, data['review'])
        trendline = fit.intercept + fit.slope * positions
        ax.plot(data['date'], trendline, color='orange', linestyle='--', linewidth=1)

    # Fixed integer star scale on the y-axis.
    ax.set_yticks(range(1, 6))

    # Remove x-axis label and ticks for cleaner look.
    ax.set_xlabel('')
    ax.set_xticks([])
90
+
91
+ # Function to create filled stars based on average review
92
def filled_stars(avg_review):
    """Return a five-character star string for an average rating.

    The rating is rounded with the built-in ``round`` and clamped to the
    range 0-5 so the result always has exactly five characters.

    Args:
        avg_review: numeric average rating. NaN or infinite values are
            treated as "no rating" (zero filled stars).

    Returns:
        str: filled stars followed by empty stars, e.g. ``"★★★★☆"``.
    """
    try:
        filled = int(round(avg_review))
    except (ValueError, OverflowError):
        # round() cannot convert NaN (ValueError) or infinity (OverflowError).
        filled = 0
    # Clamp so out-of-range ratings never produce more or fewer than 5 stars.
    filled = max(0, min(5, filled))
    return "★" * filled + "☆" * (5 - filled)
96
+
97
+ # Function to plot horizontal bar chart for star ratings distribution
98
def plot_star_distribution(data, ax):
    """Draw a horizontal bar chart of how many rows fall on each star rating.

    Args:
        data: DataFrame with a ``review`` column. Values are rounded to the
            nearest whole star here, so the chart does not depend on the
            caller having rounded them beforehand; missing values are
            ignored.
        ax: Matplotlib axes to draw on.

    When no row maps to a rating from 1 to 5, a placeholder message is shown
    instead of a chart.
    """
    # Round locally: fractional averages would otherwise match none of the
    # 1-5 buckets and be dropped silently by reindex().
    ratings = data['review'].dropna().round().astype(int)

    # Count per star rating, always listing 1..5 in ascending order.
    star_counts = ratings.value_counts().reindex(range(1, 6), fill_value=0)

    if star_counts.sum() == 0:
        # Axes coordinates keep the message centered whatever the data limits.
        ax.text(0.5, 0.5, 'No data available for the selected date range',
                horizontalalignment='center', verticalalignment='center',
                fontsize=12, color='gray', transform=ax.transAxes)
        ax.axis('off')  # Hide the axes if no data is available
        return

    # One color per star rating.
    colors = sns.color_palette('viridis', len(star_counts))
    bars = ax.barh(star_counts.index, star_counts.values, color=colors, height=0.6)

    # Print the count in the middle of every non-empty bar.
    for bar in bars:
        width = bar.get_width()
        count = int(width)
        if count > 0:
            ax.text(width / 2, bar.get_y() + bar.get_height() / 2, str(count),
                    va='center', ha='center', fontsize=12, color='white')

    # Y-axis: one tick per star rating, 1 at the bottom and 5 at the top.
    ax.set_yticks(range(1, 6))
    ax.set_yticklabels(range(1, 6), fontsize=14)
    ax.set_ylim(0.5, 5.5)

    # Remove x-axis ticks and label for cleaner look.
    ax.set_xticks([])
    ax.set_xlabel('')
132
+
133
+ # Function to generate Wordcloud based on reviews
134
def generate_wordcloud(text, title):
    """Render a word cloud of ``text`` and return it as a PIL image.

    The text is tokenized, non-alphanumeric tokens and stopwords (NLTK's
    Spanish list plus a few extra words) are removed, and the remaining words
    are drawn as a word cloud under ``title``.

    Args:
        text: raw text to summarize.
        title: title drawn above the word cloud.

    Returns:
        PIL.Image.Image: the rendered figure as a PNG-backed image. If no
        word is left to draw, the image contains a placeholder message
        instead of a word cloud.
    """
    # Spanish stopwords plus additional words seen in the data.
    stop_words = set(stopwords.words('spanish'))
    stop_words.update([
        'ma', 'us', 'may', 'hora', 'horas', 'barco', 'bien', 'buena', 'mala',
        'balearia', 'mal', 'bueno', 'malo', 'habia', 'mas', 'pasar',
        'falta', 'ningun',
    ])

    # Keep alphanumeric tokens (drops punctuation) that are not stopwords.
    filtered_tokens = [
        word for word in word_tokenize(text)
        if word.isalnum() and word.lower() not in stop_words
    ]
    filtered_text = ' '.join(filtered_tokens)

    try:
        wordcloud = WordCloud(width=600, height=300, background_color='white').generate(filtered_text)
    except ValueError:
        # WordCloud raises ValueError when there is no word left to plot.
        wordcloud = None

    fig, ax = plt.subplots(figsize=(8, 4))
    try:
        if wordcloud is not None:
            ax.imshow(wordcloud, interpolation='bilinear')
        else:
            ax.text(0.5, 0.5, 'No words to display',
                    horizontalalignment='center', verticalalignment='center',
                    fontsize=12, color='gray', transform=ax.transAxes)
        ax.axis('off')
        ax.set_title(title)

        # Convert Matplotlib figure to PNG bytes.
        buf = BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # pyplot keeps every figure alive until it is closed; close it so
        # figures do not accumulate across Streamlit reruns.
        plt.close(fig)

    buf.seek(0)
    return Image.open(buf)  # Return the PIL image object
174
+
175
+ # Main function
176
def main():
    """Render the Streamlit topic-analysis dashboard.

    Loads the review data, shows a date-range slider, then for every topic
    renders (five topics per row) a summary card, a rating-over-time line
    chart and a star-distribution bar chart. Finally it shows word clouds for
    positive (review >= 3) and negative (review < 3) comments within the
    selected date range.
    """
    # Imported locally so this function does not depend on the module-level
    # import list being updated.
    import html

    df = load_data()
    topic_counts, avg_reviews = calculate_metrics(df)

    # Main title
    st.markdown("<h1 style='text-align: center;'>Topic Analysis</h1>", unsafe_allow_html=True)

    # Date slider for interactive filtering
    min_date = df['date'].min()
    max_date = df['date'].max()
    start_date, end_date = st.slider(
        "Select date range:",
        min_value=min_date,
        max_value=max_date,
        value=(min_date, max_date),
        format="MM/DD/YY"
    )

    # Per-day topic averages restricted to the selected date range.
    in_range = (avg_reviews['date'] >= start_date) & (avg_reviews['date'] <= end_date)
    filtered_avg_reviews = avg_reviews[in_range]

    # Plain Python containers avoid label-based Series indexing in the loop.
    topics = topic_counts['Topic'].tolist()
    reviews_per_topic = dict(zip(topic_counts['Topic'], topic_counts['count']))

    num_columns = 5  # Number of topics per row

    for row_start in range(0, len(topics), num_columns):
        cols = st.columns(num_columns)
        for col, topic in zip(cols, topics[row_start:row_start + num_columns]):
            with col:
                # Copy so the plotting helpers work on an independent frame
                # rather than on a slice of filtered_avg_reviews.
                avg_reviews_topic = filtered_avg_reviews[
                    filtered_avg_reviews['gpt_topics'] == topic
                ].copy()

                # First box: Topic name, number of reviews, filled stars
                avg_review = avg_reviews_topic['review'].mean()
                avg_review_rounded = round(avg_review) if not np.isnan(avg_review) else 0
                stars_html = filled_stars(avg_review_rounded)
                # Topic text comes from the data file; escape it before
                # embedding it in raw HTML.
                topic_html = html.escape(str(topic))
                st.markdown(f"<div style='border: 1px solid #ddd; padding: 10px; "
                            f"border-radius: 5px; text-align: center;'>"
                            f"<h3 style='font-size:18px; margin: 0 auto;'>{topic_html}</h3>"
                            f"<p style='font-size:16px;'>{reviews_per_topic[topic]} reviews</p>"
                            f"<p style='font-size:20px;'>{stars_html}</p>"
                            f"</div>", unsafe_allow_html=True)

                # Second box: Line chart
                fig_line, ax_line = plt.subplots()
                plot_line_chart(avg_reviews_topic, ax_line)
                st.pyplot(fig_line, use_container_width=True)
                # Close the figure so pyplot does not keep it alive across reruns.
                plt.close(fig_line)

                # Third box: Star rating distribution
                fig_bar, ax_bar = plt.subplots(figsize=(6, 4))
                plot_star_distribution(avg_reviews_topic, ax_bar)
                st.pyplot(fig_bar, use_container_width=True)
                plt.close(fig_bar)

    # Wordclouds for positive and negative reviews
    st.markdown("<h2 style='text-align: center;'>Wordclouds</h2>", unsafe_allow_html=True)

    # Restrict the comments to the selected date range, then split them into
    # positive (>= 3 stars) and negative (< 3 stars) reviews.
    df_in_range = df[(df['date'] >= start_date) & (df['date'] <= end_date)]
    positive_df = df_in_range[df_in_range['review'] >= 3]
    negative_df = df_in_range[df_in_range['review'] < 3]

    if positive_df.empty:
        st.info("No positive reviews in the selected date range")
    else:
        positive_comments = ' '.join(positive_df['comment'].astype(str))
        fig_pos_wordcloud = generate_wordcloud(positive_comments, "Positive Reviews Wordcloud")
        st.image(fig_pos_wordcloud, use_column_width=True)

    if negative_df.empty:
        st.info("No negative reviews in the selected date range")
    else:
        negative_comments = ' '.join(negative_df['comment'].astype(str))
        fig_neg_wordcloud = generate_wordcloud(negative_comments, "Negative Reviews Wordcloud")
        st.image(fig_neg_wordcloud, use_column_width=True)
253
+
254
# Build the dashboard only when this file is executed as the main script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()