# Balearia / app.py — Streamlit topic-analysis dashboard for customer reviews.
# Author: D3MI4N-KDS
# Commit eebe248 (verified): include nltk direct import.
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import linregress
from datetime import datetime
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from io import BytesIO
from PIL import Image
# Download NLTK data (no-op when already present). quiet=True keeps the log
# clean: Streamlit re-executes this whole script on every user interaction,
# so these calls run far more often than once.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

# Set page configuration for wider layout
st.set_page_config(layout="wide")
# Load data using st.cache_data (cached across Streamlit reruns)
@st.cache_data
def load_data():
    """Load and clean the categorized-reviews dataset.

    Returns:
        pd.DataFrame with a plain-``date`` 'date' column, no NaN in
        'gpt_topics', and 'gpt_topics' parsed into Python lists.
    """
    import ast  # local import: safe parsing of the stringified topic lists

    df = pd.read_csv("balearia_categorized_agg_wdates.csv")
    # Convert string to datetime with explicit format; keep plain `date`
    # objects so the Streamlit slider can compare them directly.
    df['date'] = pd.to_datetime(df['date'], format='%m/%d/%y').dt.date
    # Rows without topics cannot be shown in any topic panel; drop them.
    df = df.dropna(subset=['gpt_topics'])
    # 'gpt_topics' is stored as a stringified Python list in the CSV.
    # SECURITY FIX: use ast.literal_eval instead of eval() — literal_eval only
    # accepts Python literals, so a corrupted or malicious CSV cell cannot
    # execute arbitrary code.
    df['gpt_topics'] = df['gpt_topics'].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x
    )
    return df
# Expand a list-valued column into one row per element
def explode_and_retain_index(df, col_to_explode):
    """Return *df* with one row per element of ``col_to_explode``.

    pandas' ``explode`` repeats the original index for every produced row,
    so each exploded row can still be traced back to its source review.
    """
    return df.explode(col_to_explode)
# Derive the per-topic aggregates that drive the dashboard
@st.cache_data
def calculate_metrics(df):
    """Compute topic frequencies and per-day average ratings.

    Args:
        df: frame with list-valued 'gpt_topics', a 'date' column and a
            numeric 'review' rating per row.

    Returns:
        tuple: (topic_counts, avg_reviews) — topic_counts has columns
        ['Topic', 'count']; avg_reviews holds the mean 'review' for each
        (date, gpt_topics) pair.
    """
    # One row per (review, topic) pair.
    exploded = explode_and_retain_index(df, 'gpt_topics')

    # How many reviews mention each topic.
    topic_counts = (
        exploded['gpt_topics']
        .value_counts()
        .rename_axis('Topic')
        .reset_index(name='count')
    )

    # Mean star rating per topic per day (feeds the line charts).
    avg_reviews = exploded.groupby(
        ['date', 'gpt_topics'], as_index=False
    )['review'].mean()

    return topic_counts, avg_reviews
# Function to plot line chart
def plot_line_chart(data, ax):
    """Plot the per-day average rating for one topic as a line chart.

    Args:
        data: frame with 'date' and numeric 'review' columns for one topic.
        ax: matplotlib Axes to draw on.

    Side effect: rounds data['review'] to int IN PLACE. This is deliberate:
    plot_star_distribution() is called later in main() with the SAME frame
    and its reindex(range(1, 6)) only matches integer ratings.
    """
    # Round average reviews to the nearest whole number (in place, see above).
    data['review'] = data['review'].round().astype(int)

    if data.empty:
        # Nothing in the selected range: show a placeholder instead of axes.
        ax.text(0.5, 0.5, 'No data available for the selected date range',
                horizontalalignment='center', verticalalignment='center',
                fontsize=12, color='gray')
        ax.axis('off')
        return

    sns.lineplot(data=data, x='date', y='review', marker='o', ax=ax)
    ax.set_ylabel('')
    ax.tick_params(axis='y', labelsize=14)

    # Dotted reference line at each star level.
    for tick in range(1, 6):
        ax.axhline(y=tick, color='gray', linestyle=':', linewidth=0.5)

    # Trendline (orange dashed). BUG FIX: the original computed
    # `slope * range(len(data))`, which raises TypeError (float * range is
    # not defined) — use a numpy index vector instead. linregress also needs
    # at least two points, so guard the single-observation case.
    if len(data) > 1:
        xs = np.arange(len(data))
        slope, intercept, _r, _p, _stderr = linregress(xs, data['review'])
        ax.plot(data['date'], intercept + slope * xs,
                color='orange', linestyle='--', linewidth=1)

    # Integer star ticks; hide x labels/ticks for a compact card layout.
    ax.set_yticks(range(1, 6))
    ax.set_xlabel('')
    ax.set_xticks([])
# Render a rating as a 5-character star string
def filled_stars(avg_review):
    """Return a 5-star string for *avg_review*, e.g. 4 -> '★★★★☆'."""
    n_filled = int(round(avg_review))
    return n_filled * "★" + (5 - n_filled) * "☆"
# Horizontal bar chart: how many daily-average ratings landed on each star
def plot_star_distribution(data, ax):
    """Draw the 1–5 star count distribution of data['review'] onto *ax*.

    NOTE(review): assumes 'review' already holds integers — in main(),
    plot_line_chart() rounds the column in place before this runs. Float
    ratings would all reindex to zero counts; confirm if the call order
    ever changes.
    """
    # Count occurrences of each star value; stars with no reviews become 0.
    counts = (data['review']
              .value_counts()
              .reindex(range(1, 6), fill_value=0)
              .sort_index())

    # All-zero counts means nothing to plot for this range.
    if counts.sum() == 0:
        ax.text(0.5, 0.5, 'No data available for the selected date range',
                horizontalalignment='center', verticalalignment='center',
                fontsize=12, color='gray')
        ax.axis('off')
        return

    # One viridis color per star level.
    palette = sns.color_palette('viridis', len(counts))
    bars = ax.barh(counts.index, counts.values, color=palette, height=0.6)

    # Label each non-empty bar with its count, centered inside the bar.
    for rect in bars:
        n = int(rect.get_width())
        if n > 0:
            ax.text(rect.get_width() / 2,
                    rect.get_y() + rect.get_height() / 2,
                    str(n), va='center', ha='center',
                    fontsize=12, color='white')

    # Star labels 1..5 bottom-to-top; no x axis for a compact card look.
    ax.set_yticks(range(1, 6))
    ax.set_yticklabels(range(1, 6), fontsize=14)
    ax.set_xticks([])
    ax.set_xlabel('')
    ax.set_ylim(0.5, 5.5)
# Function to generate Wordcloud based on reviews
def generate_wordcloud(text, title):
    """Build a word-cloud image (as a PIL.Image) from Spanish review text.

    Args:
        text: all review comments concatenated into one string.
        title: title drawn above the cloud.

    Returns:
        PIL.Image.Image holding the rendered PNG.

    Raises:
        ValueError: from WordCloud when no words survive filtering.
    """
    # Spanish stopwords plus domain-specific noise words seen in the reviews.
    stop_words = set(stopwords.words('spanish'))
    stop_words.update(['ma', 'us', 'may', 'hora', 'horas', 'barco', 'bien',
                       'buena', 'mala', 'balearia', 'mal', 'bueno', 'malo',
                       'habia', 'mas', 'pasar', 'falta', 'ningun'])

    # Tokenize, drop punctuation-only tokens, drop stopwords.
    tokens = word_tokenize(text)
    filtered_tokens = [w for w in tokens
                       if w.isalnum() and w.lower() not in stop_words]
    filtered_text = ' '.join(filtered_tokens)

    wordcloud = WordCloud(width=600, height=300,
                          background_color='white').generate(filtered_text)

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title)

    # Render the figure to an in-memory PNG and hand it back as a PIL image.
    buf = BytesIO()
    fig.savefig(buf, format='png')
    # LEAK FIX: close the figure. Streamlit re-executes the script on every
    # interaction, and unclosed matplotlib figures accumulate in memory.
    plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
# Main function
def main():
    """Streamlit entry point: per-topic rating cards plus sentiment word clouds."""
    df = load_data()
    topic_counts, avg_reviews = calculate_metrics(df)

    # Header: logo + centered title.
    logo_path = "balearia_logo.png"  # Replace with the actual path to your logo file
    st.image(logo_path, width=200)
    st.markdown("<h1 style='text-align: center;'>Topic Analysis</h1>",
                unsafe_allow_html=True)

    # Interactive date-range filter.
    min_date = df['date'].min()
    max_date = df['date'].max()
    start_date, end_date = st.slider(
        "Select date range:",
        min_value=min_date,
        max_value=max_date,
        value=(min_date, max_date),
        format="MM/DD/YY"
    )
    in_range = (avg_reviews['date'] >= start_date) & (avg_reviews['date'] <= end_date)
    filtered_avg_reviews = avg_reviews[in_range]

    # One card per topic, laid out in rows of `num_columns`.
    topics = topic_counts['Topic']
    num_columns = 5  # topics per row
    num_rows = (len(topics) + num_columns - 1) // num_columns  # ceil division
    for row in range(num_rows):
        cols = st.columns(num_columns)
        for col in range(num_columns):
            idx = row * num_columns + col
            if idx >= len(topics):
                break  # last (partial) row exhausted
            topic = topics[idx]
            with cols[col]:
                # Card header: topic name, review count, rounded star rating.
                avg_review = filtered_avg_reviews[
                    filtered_avg_reviews['gpt_topics'] == topic]['review'].mean()
                avg_review_rounded = round(avg_review) if not np.isnan(avg_review) else 0
                stars_html = filled_stars(avg_review_rounded)
                n_reviews = topic_counts[topic_counts['Topic'] == topic]['count'].values[0]
                st.markdown(f"<div style='border: 1px solid #ddd; padding: 10px; "
                            f"border-radius: 5px; text-align: center;'>"
                            f"<h3 style='font-size:18px; margin: 0 auto;'>{topic}</h3>"
                            f"<p style='font-size:16px;'>{n_reviews} reviews</p>"
                            f"<p style='font-size:20px;'>{stars_html}</p>"
                            f"</div>", unsafe_allow_html=True)

                avg_reviews_topic = filtered_avg_reviews[
                    filtered_avg_reviews['gpt_topics'] == topic]

                # Daily-average line chart. NOTE: plot_line_chart rounds the
                # 'review' column in place; plot_star_distribution below
                # relies on those integer values.
                fig_line, ax_line = plt.subplots()
                plot_line_chart(avg_reviews_topic, ax_line)
                st.pyplot(fig_line, use_container_width=True)
                # LEAK FIX: close per-card figures — Streamlit reruns this
                # script constantly and unclosed figures accumulate.
                plt.close(fig_line)

                # Star-rating distribution for the same topic.
                fig_bar, ax_bar = plt.subplots(figsize=(6, 4))
                plot_star_distribution(avg_reviews_topic, ax_bar)
                st.pyplot(fig_bar, use_container_width=True)
                plt.close(fig_bar)

    # Word clouds for positive (>= 3 stars) and negative (< 3 stars) reviews.
    # BUG FIX: the original comment promised filtering "based on the date
    # range" but the code ignored the slider — apply the range here too.
    st.markdown("<h2 style='text-align: center;'>Wordclouds</h2>",
                unsafe_allow_html=True)
    date_mask = (df['date'] >= start_date) & (df['date'] <= end_date)
    df_in_range = df[date_mask]
    positive_comments = ' '.join(
        df_in_range.loc[df_in_range['review'] >= 3, 'comment'].astype(str))
    negative_comments = ' '.join(
        df_in_range.loc[df_in_range['review'] < 3, 'comment'].astype(str))

    fig_pos_wordcloud = generate_wordcloud(positive_comments, "Positive Reviews Wordcloud")
    st.image(fig_pos_wordcloud, use_column_width=True)
    fig_neg_wordcloud = generate_wordcloud(negative_comments, "Negative Reviews Wordcloud")
    st.image(fig_neg_wordcloud, use_column_width=True)


if __name__ == '__main__':
    main()