"""Streamlit dashboard for topic analysis of Balearia customer reviews.

The app reads ``balearia_categorized_agg_wdates.csv``, shows per-topic average
ratings, a rating trend line and a star distribution for a user-selected date
range, and renders word clouds for positive and negative comments.
"""

import ast
from datetime import datetime
from io import BytesIO

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from PIL import Image
from scipy.stats import linregress
from wordcloud import WordCloud

# Message drawn on an axes when there is nothing to plot.
NO_DATA_MESSAGE = 'No data available for the selected date range'

# Valid star ratings, lowest to highest.
STAR_RATINGS = range(1, 6)

# Extra words (on top of NLTK's Spanish stopwords) excluded from word clouds.
ADDITIONAL_STOPWORDS = [
    'ma', 'us', 'may', 'hora', 'horas', 'barco', 'bien', 'buena', 'mala',
    'balearia', 'mal', 'bueno', 'malo', 'habia', 'mas', 'pasar', 'falta',
    'ningun',
]


def _ensure_nltk_resource(resource_path, package):
    """Download NLTK *package* only if *resource_path* is not installed yet.

    Streamlit re-executes the script on every interaction, so an
    unconditional ``nltk.download`` would be repeated on each rerun.
    """
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package)


# Download NLTK data (only when missing)
_ensure_nltk_resource('corpora/stopwords', 'stopwords')
_ensure_nltk_resource('tokenizers/punkt', 'punkt')

# Set page configuration for wider layout
st.set_page_config(layout="wide")


def _parse_topics(value):
    """Turn the string form of a topic list into a real Python list.

    NOTE(review): the original code used ``eval`` on the CSV cell contents,
    which executes arbitrary code. ``ast.literal_eval`` only accepts Python
    literals; confirm the CSV never relied on non-literal expressions.
    """
    if isinstance(value, str):
        return ast.literal_eval(value)
    return value


@st.cache_data
def load_data():
    """Load the review CSV and normalise the ``date`` and ``gpt_topics`` columns.

    Returns:
        DataFrame with ``date`` converted to ``datetime.date`` objects, rows
        without ``gpt_topics`` removed, and ``gpt_topics`` parsed from its
        string representation.
    """
    df = pd.read_csv("balearia_categorized_agg_wdates.csv")
    # Convert string to datetime with explicit format
    df['date'] = pd.to_datetime(df['date'], format='%m/%d/%y').dt.date
    # Drop rows where gpt_topics is NaN
    df = df.dropna(subset=['gpt_topics'])
    # Ensure gpt_topics is a list of strings
    df['gpt_topics'] = df['gpt_topics'].apply(_parse_topics)
    return df


def explode_and_retain_index(df, col_to_explode):
    """Return *df* with the list-valued column expanded to one element per row.

    ``DataFrame.explode`` repeats the original index label for every element
    that came from the same source row.
    """
    return df.explode(col_to_explode)


@st.cache_data
def calculate_metrics(df):
    """Compute per-topic counts and per-date/topic average ratings.

    Returns:
        A tuple ``(topic_counts, avg_reviews)`` where ``topic_counts`` has the
        columns ``Topic`` and ``count`` and ``avg_reviews`` has the columns
        ``date``, ``gpt_topics`` and ``review`` (mean rating of the group).
    """
    # Explode gpt_topics to have one topic per row
    df_exploded = explode_and_retain_index(df, 'gpt_topics')

    # Calculate topic counts
    topic_counts = df_exploded['gpt_topics'].value_counts().reset_index()
    topic_counts.columns = ['Topic', 'count']

    # Calculate average reviews per topic and date
    avg_reviews = (
        df_exploded.groupby(['date', 'gpt_topics'])['review']
        .mean()
        .reset_index()
    )
    return topic_counts, avg_reviews


def _show_no_data(ax):
    """Replace the axes content with a centred "no data" message."""
    ax.text(0.5, 0.5, NO_DATA_MESSAGE,
            horizontalalignment='center', verticalalignment='center',
            fontsize=12, color='gray', transform=ax.transAxes)
    ax.axis('off')  # Hide the axes if no data is available


def plot_line_chart(data, ax):
    """Draw the rating-over-time line chart with a linear trendline on *ax*.

    Args:
        data: DataFrame with ``date`` and ``review`` columns.
        ax: Matplotlib axes to draw on.

    The input frame is not modified; rounding happens on a private copy.
    """
    # Work on a copy so the caller's frame is left untouched, and drop
    # missing ratings, which cannot be cast to int.
    plot_data = data.dropna(subset=['review']).copy()
    if plot_data.empty:
        _show_no_data(ax)
        return

    # Round average reviews to the nearest whole number
    plot_data['review'] = plot_data['review'].round().astype(int)

    sns.lineplot(data=plot_data, x='date', y='review', marker='o', ax=ax)
    ax.set_ylabel('')
    ax.tick_params(axis='y', labelsize=14)

    # Horizontal dotted guide line for each star rating
    for tick in STAR_RATINGS:
        ax.axhline(y=tick, color='gray', linestyle=':', linewidth=0.5)

    # A regression needs at least two distinct x values; linregress raises
    # ValueError for a single point, so the trendline is skipped then.
    if len(plot_data) >= 2:
        positions = np.arange(len(plot_data))
        fit = linregress(positions, plot_data['review'])
        trendline = fit.intercept + fit.slope * positions
        ax.plot(plot_data['date'], trendline,
                color='orange', linestyle='--', linewidth=1)

    # Set y-axis ticks to integers from 1 to 5
    ax.set_yticks(list(STAR_RATINGS))

    # Remove x-axis label and ticks for cleaner look
    ax.set_xlabel('')
    ax.set_xticks([])


def filled_stars(avg_review):
    """Return a five-character star string for *avg_review*.

    The rating is rounded to the nearest integer and clamped to 0..5 so the
    result always contains exactly five stars.
    """
    filled = min(max(int(round(avg_review)), 0), 5)
    return "★" * filled + "☆" * (5 - filled)


def plot_star_distribution(data, ax):
    """Draw a horizontal bar chart of how many ratings fall on each star.

    Args:
        data: DataFrame with a ``review`` column; values are rounded to whole
            stars here, so the function does not depend on the caller having
            rounded them beforehand.
        ax: Matplotlib axes to draw on.
    """
    ratings = data['review'].dropna().round().astype(int)
    # Count ratings per star and make sure every star 1..5 is present
    star_counts = (
        ratings.value_counts()
        .reindex(STAR_RATINGS, fill_value=0)
        .sort_index()
    )

    if star_counts.sum() == 0:
        _show_no_data(ax)
        return

    colors = sns.color_palette('viridis', len(star_counts))
    bars = ax.barh(star_counts.index, star_counts.values,
                   color=colors, height=0.6)

    # Display the count value in the middle of each non-empty bar
    for bar in bars:
        width = bar.get_width()
        count = int(width)
        if count > 0:
            ax.text(width / 2, bar.get_y() + bar.get_height() / 2, str(count),
                    va='center', ha='center', fontsize=12, color='white')

    # Set y-axis ticks and labels in ascending order (1 to 5 stars)
    ax.set_yticks(list(STAR_RATINGS))
    ax.set_yticklabels(list(STAR_RATINGS), fontsize=14)

    # Remove x-axis ticks and label for cleaner look
    ax.set_xticks([])
    ax.set_xlabel('')

    ax.set_ylim(0.5, 5.5)


def generate_wordcloud(text, title):
    """Render a word cloud of *text* and return it as a PIL image.

    Args:
        text: Raw comment text (Spanish).
        title: Title drawn above the word cloud.

    Returns:
        ``PIL.Image`` holding the rendered figure. When no words survive the
        stopword filtering, the image shows a "no data" message instead of
        raising.
    """
    stop_words = set(stopwords.words('spanish'))
    stop_words.update(ADDITIONAL_STOPWORDS)

    # The comments are Spanish, so use the Spanish tokenizer model.
    tokens = word_tokenize(text, language='spanish')
    # Keep alphanumeric tokens (drops punctuation) that are not stopwords
    filtered_tokens = [
        word for word in tokens
        if word.isalnum() and word.lower() not in stop_words
    ]
    filtered_text = ' '.join(filtered_tokens)

    fig, ax = plt.subplots(figsize=(8, 4))
    try:
        try:
            # WordCloud.generate raises ValueError when it has no words to
            # place (empty text, or everything filtered out internally).
            wordcloud = WordCloud(
                width=600, height=300, background_color='white'
            ).generate(filtered_text)
        except ValueError:
            ax.text(0.5, 0.5, NO_DATA_MESSAGE,
                    horizontalalignment='center', verticalalignment='center',
                    fontsize=12, color='gray', transform=ax.transAxes)
        else:
            ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        ax.set_title(title)

        # Convert Matplotlib figure to PNG bytes
        buf = BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # Figures are not garbage-collected automatically by pyplot; close
        # them so they do not pile up across Streamlit reruns.
        plt.close(fig)

    buf.seek(0)
    return Image.open(buf)


def _render_topic(topic, review_count, topic_reviews):
    """Render header, trend chart and star distribution for a single topic."""
    # First box: Topic name, number of reviews, filled stars
    avg_review = topic_reviews['review'].mean()
    avg_review_rounded = round(avg_review) if not np.isnan(avg_review) else 0
    stars_html = filled_stars(avg_review_rounded)
    # NOTE(review): the HTML markup that originally wrapped these values
    # appears to be missing from the source; only the visible text is kept.
    st.markdown(
        f"\n"
        f"\n\n{topic}\n\n"
        f"\n\n{review_count} reviews\n\n"
        f"\n\n{stars_html}\n\n"
        f"\n",
        unsafe_allow_html=True,
    )

    # Second box: Line chart
    fig_line, ax_line = plt.subplots()
    plot_line_chart(topic_reviews, ax_line)
    st.pyplot(fig_line, use_container_width=True)
    plt.close(fig_line)

    # Third box: Star rating distribution
    fig_bar, ax_bar = plt.subplots(figsize=(6, 4))
    plot_star_distribution(topic_reviews, ax_bar)
    st.pyplot(fig_bar, use_container_width=True)
    plt.close(fig_bar)


def main():
    """Build the dashboard: header, date filter, topic grid and word clouds."""
    df = load_data()
    topic_counts, avg_reviews = calculate_metrics(df)

    # Display Balearia logo and main title
    logo_path = "balearia_logo.png"  # Replace with the actual path to your logo file
    st.image(logo_path, width=200)  # Adjust width as needed
    # NOTE(review): heading markup appears to be missing from the source.
    st.markdown("\n\nTopic Analysis\n\n", unsafe_allow_html=True)

    # Date slider for interactive filtering
    min_date = df['date'].min()
    max_date = df['date'].max()
    start_date, end_date = st.slider(
        "Select date range:",
        min_value=min_date,
        max_value=max_date,
        value=(min_date, max_date),
        format="MM/DD/YY",
    )

    # Filter data based on selected date range
    filtered_avg_reviews = avg_reviews[
        (avg_reviews['date'] >= start_date) & (avg_reviews['date'] <= end_date)
    ]

    # Display topics in rows of `num_columns` columns
    num_columns = 5  # Number of topics per row
    topic_rows = list(topic_counts.itertuples(index=False, name=None))
    for row_start in range(0, len(topic_rows), num_columns):
        cols = st.columns(num_columns)
        row_topics = topic_rows[row_start:row_start + num_columns]
        for col, (topic, review_count) in zip(cols, row_topics):
            topic_reviews = filtered_avg_reviews[
                filtered_avg_reviews['gpt_topics'] == topic
            ]
            with col:
                _render_topic(topic, review_count, topic_reviews)

    # Wordclouds for positive and negative reviews
    st.markdown("\n\nWordclouds\n\n", unsafe_allow_html=True)

    # Restrict the comments to the selected date range before splitting them
    # into positive and negative reviews.
    df_in_range = df[(df['date'] >= start_date) & (df['date'] <= end_date)]
    positive_df = df_in_range[df_in_range['review'] >= 3]
    negative_df = df_in_range[df_in_range['review'] < 3]

    # Concatenate all comments into a single string per sentiment
    positive_comments = ' '.join(positive_df['comment'].astype(str))
    negative_comments = ' '.join(negative_df['comment'].astype(str))

    # Generate and display positive reviews Wordcloud
    fig_pos_wordcloud = generate_wordcloud(positive_comments, "Positive Reviews Wordcloud")
    st.image(fig_pos_wordcloud, use_column_width=True)

    # Generate and display negative reviews Wordcloud
    fig_neg_wordcloud = generate_wordcloud(negative_comments, "Negative Reviews Wordcloud")
    st.image(fig_neg_wordcloud, use_column_width=True)


if __name__ == '__main__':
    main()