Spaces:
Build error
Build error
| import streamlit as st | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import numpy as np | |
| from scipy.stats import linregress | |
| from datetime import datetime | |
| from wordcloud import WordCloud | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from nltk.tokenize import word_tokenize | |
| from io import BytesIO | |
| from PIL import Image | |
# Download the NLTK data used by generate_wordcloud():
#   - 'stopwords' backs stopwords.words('spanish')
#   - 'punkt' holds the tokenizer models used by word_tokenize on older NLTK
#   - 'punkt_tab' is the replacement package that NLTK >= 3.8.2 loads instead;
#     without it word_tokenize raises LookupError on current releases.
# nltk.download() reports a failure by returning False rather than raising,
# so requesting all three is harmless on any NLTK version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Set page configuration for wider layout.  This runs before any other
# Streamlit command in the script.
st.set_page_config(layout="wide")
# Load the review data.
# NOTE(review): this function is not wrapped in st.cache_data, so the CSV is
# re-read and re-parsed on every Streamlit rerun.
def load_data():
    """Read the review CSV and normalise its 'date' and 'gpt_topics' columns.

    Returns:
        pandas.DataFrame: the rows of ``balearia_categorized_agg_wdates.csv``
        that have a non-null 'gpt_topics' value, with 'date' converted to
        ``datetime.date`` objects and 'gpt_topics' parsed from its string
        form into a Python literal (expected to be a list of strings).

    Raises:
        ValueError, SyntaxError: if a 'gpt_topics' cell is a string that is
            not a valid Python literal.
    """
    # Local import so the block does not depend on the file-level imports.
    import ast

    df = pd.read_csv("balearia_categorized_agg_wdates.csv")

    # Convert string to datetime with explicit format, keeping only the date
    df['date'] = pd.to_datetime(df['date'], format='%m/%d/%y').dt.date

    # Drop rows where gpt_topics is NaN; copy so the column assignment below
    # writes to an independent frame.
    df = df.dropna(subset=['gpt_topics']).copy()

    # Parse the stringified lists.  literal_eval only accepts Python literals,
    # so a crafted cell cannot execute code the way eval() would.
    df['gpt_topics'] = df['gpt_topics'].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x
    )
    return df
# Expand a list-valued column into one row per list element
def explode_and_retain_index(df, col_to_explode):
    """Return a copy of ``df`` with ``col_to_explode`` exploded.

    Every element of a list in ``col_to_explode`` becomes its own row; the
    other columns and the index label of the source row are repeated for
    each element.  ``df`` itself is not modified.
    """
    return df.explode(col_to_explode)
# Derive the per-topic review counts and the per-day average ratings
def calculate_metrics(df):
    """Compute topic frequencies and daily mean ratings per topic.

    Args:
        df: DataFrame with 'date', 'review' and a list-valued 'gpt_topics'
            column.

    Returns:
        tuple: ``(topic_counts, avg_reviews)`` where ``topic_counts`` has the
        columns 'Topic' and 'count' (one row per topic, as produced by
        ``value_counts``) and ``avg_reviews`` has the columns 'date',
        'gpt_topics' and 'review' (mean rating per date and topic).
    """
    # One row per (review, topic) pair
    per_topic = explode_and_retain_index(df, 'gpt_topics')

    # Mean rating for every date/topic combination
    avg_reviews = (
        per_topic
        .groupby(['date', 'gpt_topics'])['review']
        .mean()
        .reset_index()
    )

    # How many reviews mention each topic
    topic_counts = per_topic['gpt_topics'].value_counts().reset_index()
    topic_counts.columns = ['Topic', 'count']

    return topic_counts, avg_reviews
# Function to plot line chart
def plot_line_chart(data, ax):
    """Draw the rating-over-time line chart for one topic on ``ax``.

    Args:
        data: DataFrame with a 'date' column and a numeric 'review' column.
        ax: Matplotlib axes to draw on.

    Side effects:
        The 'review' column of ``data`` is rounded to whole numbers and cast
        to int *in place*.  main() passes the same frame on to
        plot_star_distribution() afterwards, so this mutation is kept
        deliberately.
    """
    # Round average reviews to the nearest whole number (in place, see above)
    data['review'] = data['review'].round().astype(int)

    if data.empty:
        # Nothing to plot: show a message centred on the axes.  transAxes
        # makes (0.5, 0.5) the centre regardless of the data limits.
        ax.text(0.5, 0.5, 'No data available for the selected date range',
                horizontalalignment='center', verticalalignment='center',
                fontsize=12, color='gray', transform=ax.transAxes)
        ax.axis('off')  # Hide the axes if no data is available
        return

    # Plot the line chart
    sns.lineplot(data=data, x='date', y='review', marker='o', ax=ax)

    # Remove y-axis label and enlarge the y tick labels
    ax.set_ylabel('')
    ax.tick_params(axis='y', labelsize=14)

    # Add horizontal dotted lines for each star rating
    for tick in range(1, 6):
        ax.axhline(y=tick, color='gray', linestyle=':', linewidth=0.5)

    # Trendline (orange dashed), fitted against the row position rather than
    # the calendar date.  A regression needs at least two points; with one
    # point the fit is undefined, so the trendline is skipped.
    if len(data) >= 2:
        positions = np.arange(len(data))
        fit = linregress(positions, data['review'])
        ax.plot(data['date'], fit.intercept + fit.slope * positions,
                color='orange', linestyle='--', linewidth=1)

    # Set y-axis ticks to integers from 1 to 5
    ax.set_yticks(range(1, 6))

    # Remove x-axis label and ticks for cleaner look
    ax.set_xlabel('')
    ax.set_xticks([])
# Function to create filled stars based on average review
def filled_stars(avg_review):
    """Return a five-glyph star string for ``avg_review``.

    The rating is rounded with the built-in ``round`` and clamped to the
    range 0-5, so the result always contains exactly five characters:
    filled stars followed by empty stars.

    Args:
        avg_review: numeric rating; ``None`` and NaN are treated as 0 stars.

    Returns:
        str: e.g. ``"★★★☆☆"`` for a rating of 3.
    """
    import math  # local import keeps the block self-contained

    if avg_review is None or math.isnan(avg_review):
        filled = 0
    else:
        # Clamp so out-of-range ratings cannot yield more or fewer than
        # five glyphs in total.
        filled = min(5, max(0, int(round(avg_review))))
    return "★" * filled + "☆" * (5 - filled)
# Function to plot horizontal bar chart for star ratings distribution
def plot_star_distribution(data, ax):
    """Draw a horizontal bar chart of how often each star rating occurs.

    Args:
        data: DataFrame with a numeric 'review' column.  Values are rounded
            to whole stars before counting, so fractional ratings are
            assigned to the nearest star instead of being dropped.  ``data``
            is not modified.
        ax: Matplotlib axes to draw on.
    """
    # Count reviews per star rating; reindex guarantees rows 1..5 in order
    stars = data['review'].dropna().round().astype(int)
    star_counts = stars.value_counts().reindex(range(1, 6), fill_value=0)

    if star_counts.sum() == 0:
        # Nothing to plot: show a message centred on the axes.  transAxes
        # makes (0.5, 0.5) the centre regardless of the data limits.
        ax.text(0.5, 0.5, 'No data available for the selected date range',
                horizontalalignment='center', verticalalignment='center',
                fontsize=12, color='gray', transform=ax.transAxes)
        ax.axis('off')  # Hide the axes if no data is available
        return

    # One colour per star rating
    colors = sns.color_palette('viridis', len(star_counts))
    bars = ax.barh(star_counts.index, star_counts.values, color=colors, height=0.6)

    # Display the count value in the middle of each non-empty bar
    for bar in bars:
        width = bar.get_width()
        count = int(width)
        if count > 0:
            ax.text(width / 2, bar.get_y() + bar.get_height() / 2, str(count),
                    va='center', ha='center', fontsize=12, color='white')

    # Set y-axis ticks and labels in ascending order (1 to 5 stars)
    ax.set_yticks(range(1, 6))
    ax.set_yticklabels(range(1, 6), fontsize=14)

    # Remove x-axis ticks and label for cleaner look
    ax.set_xticks([])
    ax.set_xlabel('')

    # Keep all five rows visible even when some counts are zero
    ax.set_ylim(0.5, 5.5)
# Function to generate Wordcloud based on reviews
def generate_wordcloud(text, title):
    """Render a word cloud of ``text`` and return it as a PIL image.

    The text is tokenized, non-alphanumeric tokens and stopwords (NLTK's
    Spanish list plus a few extra words) are removed, and the
    remaining words are drawn as a word cloud under ``title``.  If no words
    remain, an image containing a short notice is returned instead of
    raising.

    Args:
        text: the review text to summarise, as one string.
        title: title drawn above the word cloud.

    Returns:
        PIL.Image.Image: the rendered PNG.
    """
    # Spanish stopwords plus additional words seen in the data
    stop_words = set(stopwords.words('spanish'))
    stop_words.update([
        'ma', 'us', 'may', 'hora', 'horas', 'barco', 'bien', 'buena', 'mala',
        'balearia', 'mal', 'bueno', 'malo', 'habia', 'mas', 'pasar',
        'falta', 'ningun',
    ])

    # Tokenize with the Spanish sentence model to match the Spanish stopword
    # list.  NOTE(review): assumes the comments are written in Spanish.
    tokens = word_tokenize(text, language='spanish')

    # Drop punctuation and stopwords (stopword match is case-insensitive)
    filtered_tokens = [
        word for word in tokens
        if word.isalnum() and word.lower() not in stop_words
    ]

    wordcloud = None
    if filtered_tokens:
        try:
            wordcloud = WordCloud(
                width=600, height=300, background_color='white'
            ).generate(' '.join(filtered_tokens))
        except ValueError:
            # WordCloud raises ValueError when its own filtering leaves no
            # word to draw; fall back to the notice below.
            wordcloud = None

    fig, ax = plt.subplots(figsize=(8, 4))
    try:
        if wordcloud is not None:
            ax.imshow(wordcloud, interpolation='bilinear')
        else:
            ax.text(0.5, 0.5, 'No words available for this selection',
                    horizontalalignment='center', verticalalignment='center',
                    fontsize=12, color='gray', transform=ax.transAxes)
        ax.axis('off')
        ax.set_title(title)

        # Convert Matplotlib figure to PNG bytes
        buf = BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # pyplot keeps every figure alive until it is closed; close it so
        # repeated calls do not accumulate figures.
        plt.close(fig)

    buf.seek(0)
    return Image.open(buf)  # Return the PIL image object
# Main function
def main():
    """Render the dashboard: header, date slider, topic cards and word clouds.

    For every topic a column shows a summary box (name, review count, star
    string), a rating-over-time line chart and a star distribution chart,
    all restricted to the date range chosen with the slider.  Two word
    clouds (reviews rated >= 3 and < 3) follow, restricted to the same range.
    """
    from html import escape  # local import keeps the block self-contained

    # Load data and derive the per-topic metrics
    df = load_data()
    topic_counts, avg_reviews = calculate_metrics(df)

    # Display Balearia logo and main title
    logo_path = "balearia_logo.png"  # Replace with the actual path to your logo file
    st.image(logo_path, width=200)  # Adjust width as needed
    st.markdown("<h1 style='text-align: center;'>Topic Analysis</h1>", unsafe_allow_html=True)

    # Date slider for interactive filtering
    min_date = df['date'].min()
    max_date = df['date'].max()
    start_date, end_date = st.slider(
        "Select date range:",
        min_value=min_date,
        max_value=max_date,
        value=(min_date, max_date),
        format="MM/DD/YY"
    )

    # Filter the daily averages to the selected date range
    filtered_avg_reviews = avg_reviews[
        (avg_reviews['date'] >= start_date) & (avg_reviews['date'] <= end_date)
    ]

    # Pair each topic with its count once instead of re-scanning
    # topic_counts for every card.
    topic_rows = list(zip(topic_counts['Topic'], topic_counts['count']))
    num_columns = 5  # Number of topics per row

    for row_start in range(0, len(topic_rows), num_columns):
        cols = st.columns(num_columns)
        row_topics = topic_rows[row_start:row_start + num_columns]
        for col, (topic, review_count) in zip(cols, row_topics):
            with col:
                # Independent copy: plot_line_chart rounds 'review' in place
                # and the rounded values are reused by the bar chart below.
                avg_reviews_topic = filtered_avg_reviews[
                    filtered_avg_reviews['gpt_topics'] == topic
                ].copy()

                # First box: Topic name, number of reviews, filled stars
                avg_review = avg_reviews_topic['review'].mean()
                avg_review_rounded = round(avg_review) if not np.isnan(avg_review) else 0
                stars_html = filled_stars(avg_review_rounded)
                # Topic text comes from the data file, so escape it before
                # embedding it in raw HTML.
                st.markdown(f"<div style='border: 1px solid #ddd; padding: 10px; "
                            f"border-radius: 5px; text-align: center;'>"
                            f"<h3 style='font-size:18px; margin: 0 auto;'>{escape(str(topic))}</h3>"
                            f"<p style='font-size:16px;'>{review_count} reviews</p>"
                            f"<p style='font-size:20px;'>{stars_html}</p>"
                            f"</div>", unsafe_allow_html=True)

                # Second box: Line chart
                fig_line, ax_line = plt.subplots()
                plot_line_chart(avg_reviews_topic, ax_line)
                st.pyplot(fig_line, use_container_width=True)
                plt.close(fig_line)  # release the figure once rendered

                # Third box: Star rating distribution
                fig_bar, ax_bar = plt.subplots(figsize=(6, 4))  # Adjust size
                plot_star_distribution(avg_reviews_topic, ax_bar)
                st.pyplot(fig_bar, use_container_width=True)
                plt.close(fig_bar)  # release the figure once rendered

    # Wordclouds for positive and negative reviews
    st.markdown("<h2 style='text-align: center;'>Wordclouds</h2>", unsafe_allow_html=True)

    # Restrict the reviews to the selected date range, then split by rating
    in_range = df[(df['date'] >= start_date) & (df['date'] <= end_date)]
    wordcloud_inputs = (
        (in_range[in_range['review'] >= 3], "Positive Reviews Wordcloud"),
        (in_range[in_range['review'] < 3], "Negative Reviews Wordcloud"),
    )
    for subset, title in wordcloud_inputs:
        # Concatenate all comments into a single string
        comments = ' '.join(subset['comment'].astype(str))
        if comments.strip():
            st.image(generate_wordcloud(comments, title), use_column_width=True)
        else:
            # No comments in this range: skip the word cloud entirely
            st.info(f"{title}: no reviews in the selected date range.")


if __name__ == '__main__':
    main()