import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import linregress
from datetime import datetime
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from io import BytesIO
from PIL import Image
# Download the NLTK corpora used below: 'stopwords' provides the Spanish
# stopword list and 'punkt' backs word_tokenize. nltk.download() is a
# no-op when the data is already present locally.
nltk.download('stopwords')
nltk.download('punkt')
# Set page configuration for wider layout.
# NOTE(review): st.set_page_config must be the first Streamlit call in the
# script — keep it ahead of any st.* usage.
st.set_page_config(layout="wide")
# Load data using st.cache_data
@st.cache_data
def load_data():
    """Load the categorized reviews CSV and normalize its columns.

    Returns:
        DataFrame where:
          - 'date' is parsed from MM/DD/YY strings into datetime.date objects,
          - rows with missing 'gpt_topics' are dropped,
          - 'gpt_topics' holds a Python list of topic strings per row.
    """
    import ast  # local import: only needed here for safe literal parsing

    df = pd.read_csv("balearia_categorized_agg_wdates.csv")
    # Convert string to datetime with an explicit format, keep only the date part
    df['date'] = pd.to_datetime(df['date'], format='%m/%d/%y').dt.date
    # Drop rows where gpt_topics is NaN
    df = df.dropna(subset=['gpt_topics'])
    # BUG FIX: parse the stringified list with ast.literal_eval instead of
    # eval(); literal_eval only accepts Python literals, so malicious or
    # malformed CSV content cannot execute arbitrary code. For legitimate
    # list-of-strings cells the result is identical.
    df['gpt_topics'] = df['gpt_topics'].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x
    )
    return df
# Function to explode list columns and retain original index
def explode_and_retain_index(df, col_to_explode):
    """Expand a list-valued column to one row per element.

    The original index labels are preserved (duplicated across the
    exploded rows), letting callers trace each row back to its source.
    """
    return df.explode(col_to_explode)
# Function to calculate metrics
@st.cache_data
def calculate_metrics(df):
    """Compute per-topic counts and per-day average ratings.

    Returns:
        topic_counts: DataFrame ['Topic', 'count'] — one row per distinct
            topic, ordered by frequency.
        avg_reviews: DataFrame ['date', 'gpt_topics', 'review'] — mean star
            rating for every (date, topic) pair.
    """
    # One row per (review, topic) pair
    exploded = explode_and_retain_index(df, 'gpt_topics')

    # How often each topic occurs across all reviews
    topic_counts = exploded['gpt_topics'].value_counts().reset_index()
    topic_counts.columns = ['Topic', 'count']

    # Mean star rating per topic per day
    avg_reviews = (
        exploded.groupby(['date', 'gpt_topics'])['review']
        .mean()
        .reset_index()
    )
    return topic_counts, avg_reviews
# Function to plot line chart
def plot_line_chart(data, ax):
    """Draw the per-day average-rating line (with trendline) on *ax*.

    NOTE: this rounds data['review'] to integers **in place**; main()
    passes the same frame to plot_star_distribution afterwards, which
    counts those integer star values — the mutation is intentional.
    """
    # Round average reviews to the nearest whole number (in place, see note)
    data['review'] = data['review'].round().astype(int)

    if data.empty:
        # Nothing in the selected range: show a placeholder instead of axes
        ax.text(0.5, 0.5, 'No data available for the selected date range',
                horizontalalignment='center', verticalalignment='center', fontsize=12, color='gray')
        ax.axis('off')
        return

    # Plot the line chart
    sns.lineplot(data=data, x='date', y='review', marker='o', ax=ax)
    ax.set_ylabel('')
    ax.tick_params(axis='y', labelsize=14)

    # Horizontal dotted guide line at each star level
    for tick in range(1, 6):
        ax.axhline(y=tick, color='gray', linestyle=':', linewidth=0.5)

    # Trendline (orange dashed). BUG FIX: the original computed
    # `intercept + slope * range(len(data))`, which raises TypeError
    # (a float cannot multiply a range object); use np.arange so the
    # arithmetic broadcasts element-wise.
    x = np.arange(len(data))
    slope, intercept, r_value, p_value, std_err = linregress(x, data['review'])
    ax.plot(data['date'], intercept + slope * x, color='orange', linestyle='--', linewidth=1)

    # Integer star ticks 1..5; hide the x axis for a compact card look
    ax.set_yticks(range(1, 6))
    ax.set_xlabel('')
    ax.set_xticks([])
# Function to create filled stars based on average review
def filled_stars(avg_review):
    """Render *avg_review* (0-5) as a five-character star string, e.g. 3 -> '★★★☆☆'."""
    n_filled = int(round(avg_review))
    return "★" * n_filled + "☆" * (5 - n_filled)
# Function to plot horizontal bar chart for star ratings distribution
def plot_star_distribution(data, ax):
    """Draw a horizontal bar chart of how many entries fall on each star
    level (1-5) onto *ax*.

    ROBUSTNESS FIX: the original counted raw values, which only landed in
    the integer 1-5 bins because plot_line_chart had already rounded the
    shared frame in place. Round (and drop NaN) locally so this function
    gives correct counts regardless of call order; for already-integer
    input the result is unchanged.
    """
    ratings = data['review'].dropna().round().astype(int)
    # Count reviews per star and force a full, sorted 1..5 index
    star_counts = ratings.value_counts().reindex(range(1, 6), fill_value=0).sort_index()

    if star_counts.sum() == 0:
        # No data in range: show a message and hide the axes
        ax.text(0.5, 0.5, 'No data available for the selected date range',
                horizontalalignment='center', verticalalignment='center', fontsize=12, color='gray')
        ax.axis('off')
        return

    # One viridis color per star level
    colors = sns.color_palette('viridis', len(star_counts))
    bars = ax.barh(star_counts.index, star_counts.values, color=colors, height=0.6)

    # Label each non-empty bar with its count, centered inside the bar
    for bar in bars:
        width = bar.get_width()
        count = int(width)
        if count > 0:
            ax.text(width / 2, bar.get_y() + bar.get_height() / 2, str(count),
                    va='center', ha='center', fontsize=12, color='white')

    # Star levels ascending on the y axis; no x axis decoration
    ax.set_yticks(range(1, 6))
    ax.set_yticklabels(range(1, 6), fontsize=14)
    ax.set_xticks([])
    ax.set_xlabel('')
    ax.set_ylim(0.5, 5.5)
# Function to generate Wordcloud based on reviews
def generate_wordcloud(text, title):
    """Build a word-cloud image (as a PIL Image) from Spanish review text.

    Tokenizes *text*, drops punctuation, Spanish stopwords and a list of
    domain-specific filler words, renders the word cloud with matplotlib,
    and returns the rendered PNG as a PIL Image.
    """
    # Set stopwords for Spanish
    stop_words = set(stopwords.words('spanish'))
    # Domain-specific filler words seen in these reviews that add no signal
    additional_stopwords = ['ma', 'us', 'may', 'hora', 'horas', 'barco', 'bien', 'buena', 'mala', 'balearia', 'mal', 'bueno', 'malo', 'habia', 'mas', 'pasar',
                            'falta', 'ningun']
    stop_words.update(additional_stopwords)

    # Tokenize, keep alphanumeric tokens only (drops punctuation), then
    # remove stopwords case-insensitively
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalnum()]
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    filtered_text = ' '.join(filtered_tokens)

    # Generate wordcloud and render it on a matplotlib figure
    wordcloud = WordCloud(width=600, height=300, background_color='white').generate(filtered_text)
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title)

    # Convert the figure to a PNG held in memory
    buf = BytesIO()
    fig.savefig(buf, format='png')
    buf.seek(0)
    # BUG FIX: close the figure; otherwise every Streamlit rerun leaks a
    # live matplotlib figure and memory grows without bound.
    plt.close(fig)

    # Hand back a PIL image so st.image can display it
    img = Image.open(buf)
    return img
# Main function
def main():
    """Render the Streamlit dashboard: per-topic rating cards, per-topic
    charts, and positive/negative word clouds for the Balearia reviews."""
    # Load data and derived metrics (both cached across reruns)
    df = load_data()
    topic_counts, avg_reviews = calculate_metrics(df)

    # Header: logo + centered title
    logo_path = "balearia_logo.png"  # path to the logo asset
    st.image(logo_path, width=200)
    st.markdown("<h1 style='text-align: center;'>Topic Analysis</h1>", unsafe_allow_html=True)

    # Date slider for interactive filtering
    min_date = df['date'].min()
    max_date = df['date'].max()
    start_date, end_date = st.slider(
        "Select date range:",
        min_value=min_date,
        max_value=max_date,
        value=(min_date, max_date),
        format="MM/DD/YY"
    )

    # Restrict the per-(date, topic) averages to the selected window
    filtered_avg_reviews = avg_reviews[(avg_reviews['date'] >= start_date) & (avg_reviews['date'] <= end_date)]

    # Lay the topics out in a grid, num_columns cards per row
    topics = topic_counts['Topic']
    num_columns = 5  # topics per row
    num_rows = (len(topics) + num_columns - 1) // num_columns  # ceil division
    for row in range(num_rows):
        cols = st.columns(num_columns)
        for col in range(num_columns):
            idx = row * num_columns + col
            if idx < len(topics):
                topic = topics[idx]
                with cols[col]:
                    # Card 1: topic name, number of reviews, filled stars
                    avg_review = filtered_avg_reviews[filtered_avg_reviews['gpt_topics'] == topic]['review'].mean()
                    avg_review_rounded = round(avg_review) if not np.isnan(avg_review) else 0
                    stars_html = filled_stars(avg_review_rounded)
                    st.markdown(f"<div style='border: 1px solid #ddd; padding: 10px; "
                                f"border-radius: 5px; text-align: center;'>"
                                f"<h3 style='font-size:18px; margin: 0 auto;'>{topic}</h3>"
                                f"<p style='font-size:16px;'>{topic_counts[topic_counts['Topic'] == topic]['count'].values[0]} reviews</p>"
                                f"<p style='font-size:20px;'>{stars_html}</p>"
                                f"</div>", unsafe_allow_html=True)
                    # Card 2: line chart of average rating over time.
                    # NOTE: plot_line_chart rounds avg_reviews_topic['review']
                    # in place, which card 3 below relies on.
                    avg_reviews_topic = filtered_avg_reviews[filtered_avg_reviews['gpt_topics'] == topic]
                    fig_line, ax_line = plt.subplots()
                    plot_line_chart(avg_reviews_topic, ax_line)
                    st.pyplot(fig_line, use_container_width=True)
                    # Card 3: star rating distribution
                    fig_bar, ax_bar = plt.subplots(figsize=(6, 4))
                    plot_star_distribution(avg_reviews_topic, ax_bar)
                    st.pyplot(fig_bar, use_container_width=True)

    # Word clouds for positive and negative reviews
    st.markdown("<h2 style='text-align: center;'>Wordclouds</h2>", unsafe_allow_html=True)
    # BUG FIX: the original comment promised filtering "based on the date
    # range" but used the full dataset; honor the slider selection here too.
    df_in_range = df[(df['date'] >= start_date) & (df['date'] <= end_date)]
    positive_df = df_in_range[df_in_range['review'] >= 3]
    negative_df = df_in_range[df_in_range['review'] < 3]
    # Concatenate all comments into one string per sentiment bucket
    positive_comments = ' '.join(positive_df['comment'].astype(str))
    negative_comments = ' '.join(negative_df['comment'].astype(str))
    # Generate and display both word clouds
    fig_pos_wordcloud = generate_wordcloud(positive_comments, "Positive Reviews Wordcloud")
    st.image(fig_pos_wordcloud, use_column_width=True)
    fig_neg_wordcloud = generate_wordcloud(negative_comments, "Negative Reviews Wordcloud")
    st.image(fig_neg_wordcloud, use_column_width=True)

if __name__ == '__main__':
    main()
|