Spaces:

KairosData
/

Balearia

Build error

App Files Files Community

D3MI4N-KDS committed on Jun 20, 2024

Commit

493aaf9

verified ·

1 Parent(s): b8ed3bd

Upload streamlit_app.py

Browse files

This app uses Balearia data from 3 different sources (Google reviews, TrustPilot & VIS).
It shows topic analysis from all sources across time.
Topic analysis consists mainly of:
- Evolution of review ratings (stars) by topic.
- Overall distribution of review ratings by topic.
- Tags from positive and negative reviews (word clouds).

Files changed (1) hide show

streamlit_app.py +255 -0

streamlit_app.py ADDED Viewed

	@@ -0,0 +1,255 @@

+import streamlit as st
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+from scipy.stats import linregress
+from datetime import datetime
+from wordcloud import WordCloud
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from io import BytesIO
+from PIL import Image
+# Use Streamlit's wide layout so the page content can span the full browser
+# width (main() lays topics out five columns per row).
+st.set_page_config(layout="wide")
+# Load data using st.cache_data
+@st.cache_data
+def load_data():
+    df = pd.read_csv("Balearia/outputs/balearia_categorized_agg_wdates.csv")
+    # Convert string to datetime with explicit format
+    df['date'] = pd.to_datetime(df['date'], format='%m/%d/%y').dt.date
+    # Drop rows where gpt_topics is NaN (if necessary)
+    df = df.dropna(subset=['gpt_topics'])
+    # Ensure gpt_topics is a list of strings
+    df['gpt_topics'] = df['gpt_topics'].apply(lambda x: eval(x) if isinstance(x, str) else x)
+    return df
+# Function to explode list columns and retain original index
+def explode_and_retain_index(df, col_to_explode):
+    exploded = df.explode(col_to_explode)
+    return exploded
+# Function to calculate metrics
+@st.cache_data
+def calculate_metrics(df):
+    # Explode gpt_topics to have one topic per row
+    df_exploded = explode_and_retain_index(df, 'gpt_topics')
+    # Calculate topic counts
+    topic_counts = df_exploded['gpt_topics'].value_counts().reset_index()
+    topic_counts.columns = ['Topic', 'count']
+    # Calculate average reviews per topic and date
+    avg_reviews = df_exploded.groupby(['date', 'gpt_topics'])['review'].mean().reset_index()
+    return topic_counts, avg_reviews
+# Function to plot line chart
+def plot_line_chart(data, ax):
+    # Round average reviews to the nearest whole number
+    data['review'] = data['review'].round().astype(int)
+    # Check if data is empty
+    if not data.empty:
+        # Plot the line chart
+        sns.lineplot(data=data, x='date', y='review', marker='o', ax=ax)
+        # Remove y-axis label
+        ax.set_ylabel('')
+        # Increase font size of y-axis labels
+        ax.tick_params(axis='y', labelsize=14)  # Adjust font size
+        # Add horizontal dotted lines for each star rating
+        stars_ticks = [1, 2, 3, 4, 5]
+        for tick in stars_ticks:
+            ax.axhline(y=tick, color='gray', linestyle=':', linewidth=0.5)
+        # Calculate and plot trendline (orange dotted)
+        slope, intercept, r_value, p_value, std_err = linregress(range(len(data)), data['review'])
+        trendline = intercept + slope * range(len(data))
+        ax.plot(data['date'], trendline, color='orange', linestyle='--', linewidth=1)
+        # Set y-axis ticks to integers from 1 to 5
+        ax.set_yticks(range(1, 6))
+        # Remove x-axis label and ticks for cleaner look
+        ax.set_xlabel('')
+        ax.set_xticks([])
+    else:
+        # If data is empty, just show a message
+        ax.text(0.5, 0.5, 'No data available for the selected date range',
+                horizontalalignment='center', verticalalignment='center', fontsize=12, color='gray')
+        ax.axis('off')  # Hide the axes if no data is available
+# Function to create filled stars based on average review
+def filled_stars(avg_review):
+    filled = int(round(avg_review))
+    empty = 5 - filled
+    return "★" * filled + "☆" * empty
+# Function to plot horizontal bar chart for star ratings distribution
+def plot_star_distribution(data, ax):
+    # Count number of reviews for each star rating and ensure the index is sorted from 1 to 5
+    star_counts = data['review'].value_counts().reindex(range(1, 6), fill_value=0).sort_index()
+    # Check if star_counts is empty (all values are zero)
+    if star_counts.sum() == 0:
+        # Display a message if there is no data available
+        ax.text(0.5, 0.5, 'No data available for the selected date range',
+                horizontalalignment='center', verticalalignment='center', fontsize=12, color='gray')
+        ax.axis('off')  # Hide the axes if no data is available
+    else:
+        # Plot horizontal bar chart with different colors for each star rating
+        colors = sns.color_palette('viridis', len(star_counts))
+        # Plot bars for each star rating
+        bars = ax.barh(star_counts.index, star_counts.values, color=colors, height=0.6)
+        # Display the count value on each bar
+        for bar in bars:
+            width = bar.get_width()
+            count = int(width)
+            if count > 0:
+                ax.text(width / 2, bar.get_y() + bar.get_height() / 2, str(count), va='center', ha='center', fontsize=12, color='white')
+        # Set y-axis ticks and labels in ascending order (1 to 5 stars)
+        ax.set_yticks(range(1, 6))
+        ax.set_yticklabels(range(1, 6), fontsize=14)
+        # Remove x-axis ticks and label for cleaner look
+        ax.set_xticks([])
+        ax.set_xlabel('')
+        # Set y-axis to ascending order
+        ax.set_ylim(0.5, 5.5)
+# Function to generate Wordcloud based on reviews
+def generate_wordcloud(text, title):
+    # Set stopwords for Spanish
+    stop_words = set(stopwords.words('spanish'))
+    # List of additional seen stopwords
+    additional_stopwords = ['ma', 'us', 'may', 'hora', 'horas', 'barco', 'bien', 'buena', 'mala', 'balearia', 'mal', 'bueno', 'malo', 'habia', 'mas', 'pasar',
+                           'falta', 'ningun']
+    # Update the stop_words set with the additional stopwords
+    stop_words.update(additional_stopwords)
+    # Tokenize the text into words
+    tokens = word_tokenize(text)
+    # Remove punctuation
+    tokens = [word for word in tokens if word.isalnum()]
+    # Remove stopwords
+    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
+    # Join filtered tokens back into a single string
+    filtered_text = ' '.join(filtered_tokens)
+    # Generate wordcloud
+    wordcloud = WordCloud(width=600, height=300, background_color='white').generate(filtered_text)
+    # Create Matplotlib figure and axes
+    fig, ax = plt.subplots(figsize=(8, 4))
+    ax.imshow(wordcloud, interpolation='bilinear')
+    ax.axis('off')
+    ax.set_title(title)
+    # Convert Matplotlib figure to PNG image
+    buf = BytesIO()
+    fig.savefig(buf, format='png')
+    buf.seek(0)
+    # Convert PNG image to PIL image
+    img = Image.open(buf)
+    return img  # Return the PIL image object
+# Main function
+def main():
+    # Load data
+    df = load_data()
+    # Calculate metrics
+    topic_counts, avg_reviews = calculate_metrics(df)
+    # Display Balearia logo and main title
+    st.markdown("<h1 style='text-align: center;'>Topic Analysis</h1>", unsafe_allow_html=True)
+    # Date slider for interactive filtering
+    min_date = df['date'].min()
+    max_date = df['date'].max()
+    start_date, end_date = st.slider(
+        "Select date range:",
+        min_value=min_date,
+        max_value=max_date,
+        value=(min_date, max_date),
+        format="MM/DD/YY"
+    )
+    # Filter data based on selected date range
+    filtered_avg_reviews = avg_reviews[(avg_reviews['date'] >= start_date) & (avg_reviews['date'] <= end_date)]
+    # Display topics in dynamic columns
+    topics = topic_counts['Topic']
+    num_columns = 5  # Number of topics per row
+    num_rows = (len(topics) + num_columns - 1) // num_columns  # Calculate the number of rows needed
+    for row in range(num_rows):
+        cols = st.columns(num_columns)
+        for col in range(num_columns):
+            idx = row * num_columns + col
+            if idx < len(topics):
+                topic = topics[idx]
+                with cols[col]:
+                    # First box: Topic name, number of reviews, filled stars
+                    avg_review = filtered_avg_reviews[filtered_avg_reviews['gpt_topics'] == topic]['review'].mean()
+                    avg_review_rounded = round(avg_review) if not np.isnan(avg_review) else 0
+                    stars_html = filled_stars(avg_review_rounded)
+                    st.markdown(f"<div style='border: 1px solid #ddd; padding: 10px; "
+                                f"border-radius: 5px; text-align: center;'>"
+                                f"<h3 style='font-size:18px; margin: 0 auto;'>{topic}</h3>"
+                                f"<p style='font-size:16px;'>{topic_counts[topic_counts['Topic'] == topic]['count'].values[0]} reviews</p>"
+                                f"<p style='font-size:20px;'>{stars_html}</p>"
+                                f"</div>", unsafe_allow_html=True)
+                    # Second box: Line chart
+                    avg_reviews_topic = filtered_avg_reviews[filtered_avg_reviews['gpt_topics'] == topic]
+                    fig_line, ax_line = plt.subplots()
+                    plot_line_chart(avg_reviews_topic, ax_line)
+                    st.pyplot(fig_line, use_container_width=True)
+                    # Third box: Star rating distribution
+                    fig_bar, ax_bar = plt.subplots(figsize=(6, 4))  # Adjust size
+                    plot_star_distribution(avg_reviews_topic, ax_bar)
+                    st.pyplot(fig_bar, use_container_width=True)
+    # Wordclouds for positive and negative reviews
+    st.markdown("<h2 style='text-align: center;'>Wordclouds</h2>", unsafe_allow_html=True)
+    # Filter data for positive and negative reviews based on the date range
+    positive_df = df[df['review'] >= 3]
+    negative_df = df[df['review'] < 3]
+    # Concatenate all comments into a single string for positive and negative reviews
+    positive_comments = ' '.join(positive_df['comment'].astype(str))
+    negative_comments = ' '.join(negative_df['comment'].astype(str))
+    # Generate and display positive reviews Wordcloud
+    fig_pos_wordcloud = generate_wordcloud(positive_comments, "Positive Reviews Wordcloud")
+    st.image(fig_pos_wordcloud, use_column_width=True)
+    # Generate and display negative reviews Wordcloud
+    fig_neg_wordcloud = generate_wordcloud(negative_comments, "Negative Reviews Wordcloud")
+    st.image(fig_neg_wordcloud, use_column_width=True)
+# Build the dashboard only when this file is executed as a script, not when
+# it is imported as a module.
+if __name__ == '__main__':
+    main()