# Balearia / app.py — Streamlit topic-analysis dashboard for customer reviews.
# Author: D3MI4N-KDS
# Commit eebe248 (verified): include nltk direct import.
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import linregress
from datetime import datetime
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from io import BytesIO
from PIL import Image
# Download NLTK data (no-op when already present). quiet=True keeps the log
# clean: Streamlit re-executes this whole script on every user interaction,
# so these calls run far more often than once.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

# Set page configuration for wider layout
st.set_page_config(layout="wide")
# Load data using st.cache_data (cached across Streamlit reruns)
@st.cache_data
def load_data():
    """Load and clean the categorized-reviews dataset.

    Returns:
        pd.DataFrame with a plain-``date`` 'date' column, no NaN in
        'gpt_topics', and 'gpt_topics' parsed into Python lists.
    """
    import ast  # local import: safe parsing of the stringified topic lists

    df = pd.read_csv("balearia_categorized_agg_wdates.csv")
    # Convert string to datetime with explicit format; keep plain `date`
    # objects so the Streamlit slider can compare them directly.
    df['date'] = pd.to_datetime(df['date'], format='%m/%d/%y').dt.date
    # Rows without topics cannot be shown in any topic panel; drop them.
    df = df.dropna(subset=['gpt_topics'])
    # 'gpt_topics' is stored as a stringified Python list in the CSV.
    # SECURITY FIX: use ast.literal_eval instead of eval() — literal_eval only
    # accepts Python literals, so a corrupted or malicious CSV cell cannot
    # execute arbitrary code.
    df['gpt_topics'] = df['gpt_topics'].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x
    )
    return df
# Expand a list-valued column into one row per element
def explode_and_retain_index(df, col_to_explode):
    """Return *df* with one row per element of ``col_to_explode``.

    pandas' ``explode`` repeats the original index for every produced row,
    so each exploded row can still be traced back to its source review.
    """
    return df.explode(col_to_explode)
# Derive the per-topic aggregates that drive the dashboard
@st.cache_data
def calculate_metrics(df):
    """Compute topic frequencies and per-day average ratings.

    Args:
        df: frame with list-valued 'gpt_topics', a 'date' column and a
            numeric 'review' rating per row.

    Returns:
        tuple: (topic_counts, avg_reviews) — topic_counts has columns
        ['Topic', 'count']; avg_reviews holds the mean 'review' for each
        (date, gpt_topics) pair.
    """
    # One row per (review, topic) pair.
    exploded = explode_and_retain_index(df, 'gpt_topics')

    # How many reviews mention each topic.
    topic_counts = (
        exploded['gpt_topics']
        .value_counts()
        .rename_axis('Topic')
        .reset_index(name='count')
    )

    # Mean star rating per topic per day (feeds the line charts).
    avg_reviews = exploded.groupby(
        ['date', 'gpt_topics'], as_index=False
    )['review'].mean()

    return topic_counts, avg_reviews
# Function to plot line chart
def plot_line_chart(data, ax):
    """Plot the per-day average rating for one topic as a line chart.

    Args:
        data: frame with 'date' and numeric 'review' columns for one topic.
        ax: matplotlib Axes to draw on.

    Side effect: rounds data['review'] to int IN PLACE. This is deliberate:
    plot_star_distribution() is called later in main() with the SAME frame
    and its reindex(range(1, 6)) only matches integer ratings.
    """
    # Round average reviews to the nearest whole number (in place, see above).
    data['review'] = data['review'].round().astype(int)

    if data.empty:
        # Nothing in the selected range: show a placeholder instead of axes.
        ax.text(0.5, 0.5, 'No data available for the selected date range',
                horizontalalignment='center', verticalalignment='center',
                fontsize=12, color='gray')
        ax.axis('off')
        return

    sns.lineplot(data=data, x='date', y='review', marker='o', ax=ax)
    ax.set_ylabel('')
    ax.tick_params(axis='y', labelsize=14)

    # Dotted reference line at each star level.
    for tick in range(1, 6):
        ax.axhline(y=tick, color='gray', linestyle=':', linewidth=0.5)

    # Trendline (orange dashed). BUG FIX: the original computed
    # `slope * range(len(data))`, which raises TypeError (float * range is
    # not defined) — use a numpy index vector instead. linregress also needs
    # at least two points, so guard the single-observation case.
    if len(data) > 1:
        xs = np.arange(len(data))
        slope, intercept, _r, _p, _stderr = linregress(xs, data['review'])
        ax.plot(data['date'], intercept + slope * xs,
                color='orange', linestyle='--', linewidth=1)

    # Integer star ticks; hide x labels/ticks for a compact card layout.
    ax.set_yticks(range(1, 6))
    ax.set_xlabel('')
    ax.set_xticks([])
# Render a rating as a 5-character star string
def filled_stars(avg_review):
    """Return a 5-star string for *avg_review*, e.g. 4 -> '★★★★☆'."""
    n_filled = int(round(avg_review))
    return n_filled * "★" + (5 - n_filled) * "☆"
# Horizontal bar chart: how many daily-average ratings landed on each star
def plot_star_distribution(data, ax):
    """Draw the 1–5 star count distribution of data['review'] onto *ax*.

    NOTE(review): assumes 'review' already holds integers — in main(),
    plot_line_chart() rounds the column in place before this runs. Float
    ratings would all reindex to zero counts; confirm if the call order
    ever changes.
    """
    # Count occurrences of each star value; stars with no reviews become 0.
    counts = (data['review']
              .value_counts()
              .reindex(range(1, 6), fill_value=0)
              .sort_index())

    # All-zero counts means nothing to plot for this range.
    if counts.sum() == 0:
        ax.text(0.5, 0.5, 'No data available for the selected date range',
                horizontalalignment='center', verticalalignment='center',
                fontsize=12, color='gray')
        ax.axis('off')
        return

    # One viridis color per star level.
    palette = sns.color_palette('viridis', len(counts))
    bars = ax.barh(counts.index, counts.values, color=palette, height=0.6)

    # Label each non-empty bar with its count, centered inside the bar.
    for rect in bars:
        n = int(rect.get_width())
        if n > 0:
            ax.text(rect.get_width() / 2,
                    rect.get_y() + rect.get_height() / 2,
                    str(n), va='center', ha='center',
                    fontsize=12, color='white')

    # Star labels 1..5 bottom-to-top; no x axis for a compact card look.
    ax.set_yticks(range(1, 6))
    ax.set_yticklabels(range(1, 6), fontsize=14)
    ax.set_xticks([])
    ax.set_xlabel('')
    ax.set_ylim(0.5, 5.5)
# Function to generate Wordcloud based on reviews
def generate_wordcloud(text, title):
    """Build a word-cloud image (as a PIL.Image) from Spanish review text.

    Args:
        text: all review comments concatenated into one string.
        title: title drawn above the cloud.

    Returns:
        PIL.Image.Image holding the rendered PNG.

    Raises:
        ValueError: from WordCloud when no words survive filtering.
    """
    # Spanish stopwords plus domain-specific noise words seen in the reviews.
    stop_words = set(stopwords.words('spanish'))
    stop_words.update(['ma', 'us', 'may', 'hora', 'horas', 'barco', 'bien',
                       'buena', 'mala', 'balearia', 'mal', 'bueno', 'malo',
                       'habia', 'mas', 'pasar', 'falta', 'ningun'])

    # Tokenize, drop punctuation-only tokens, drop stopwords.
    tokens = word_tokenize(text)
    filtered_tokens = [w for w in tokens
                       if w.isalnum() and w.lower() not in stop_words]
    filtered_text = ' '.join(filtered_tokens)

    wordcloud = WordCloud(width=600, height=300,
                          background_color='white').generate(filtered_text)

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title)

    # Render the figure to an in-memory PNG and hand it back as a PIL image.
    buf = BytesIO()
    fig.savefig(buf, format='png')
    # LEAK FIX: close the figure. Streamlit re-executes the script on every
    # interaction, and unclosed matplotlib figures accumulate in memory.
    plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
# Main function
def main():
    """Streamlit entry point: per-topic rating cards plus sentiment word clouds."""
    df = load_data()
    topic_counts, avg_reviews = calculate_metrics(df)

    # Header: logo + centered title.
    logo_path = "balearia_logo.png"  # Replace with the actual path to your logo file
    st.image(logo_path, width=200)
    st.markdown("<h1 style='text-align: center;'>Topic Analysis</h1>",
                unsafe_allow_html=True)

    # Interactive date-range filter.
    min_date = df['date'].min()
    max_date = df['date'].max()
    start_date, end_date = st.slider(
        "Select date range:",
        min_value=min_date,
        max_value=max_date,
        value=(min_date, max_date),
        format="MM/DD/YY"
    )
    in_range = (avg_reviews['date'] >= start_date) & (avg_reviews['date'] <= end_date)
    filtered_avg_reviews = avg_reviews[in_range]

    # One card per topic, laid out in rows of `num_columns`.
    topics = topic_counts['Topic']
    num_columns = 5  # topics per row
    num_rows = (len(topics) + num_columns - 1) // num_columns  # ceil division
    for row in range(num_rows):
        cols = st.columns(num_columns)
        for col in range(num_columns):
            idx = row * num_columns + col
            if idx >= len(topics):
                break  # last (partial) row exhausted
            topic = topics[idx]
            with cols[col]:
                # Card header: topic name, review count, rounded star rating.
                avg_review = filtered_avg_reviews[
                    filtered_avg_reviews['gpt_topics'] == topic]['review'].mean()
                avg_review_rounded = round(avg_review) if not np.isnan(avg_review) else 0
                stars_html = filled_stars(avg_review_rounded)
                n_reviews = topic_counts[topic_counts['Topic'] == topic]['count'].values[0]
                st.markdown(f"<div style='border: 1px solid #ddd; padding: 10px; "
                            f"border-radius: 5px; text-align: center;'>"
                            f"<h3 style='font-size:18px; margin: 0 auto;'>{topic}</h3>"
                            f"<p style='font-size:16px;'>{n_reviews} reviews</p>"
                            f"<p style='font-size:20px;'>{stars_html}</p>"
                            f"</div>", unsafe_allow_html=True)

                avg_reviews_topic = filtered_avg_reviews[
                    filtered_avg_reviews['gpt_topics'] == topic]

                # Daily-average line chart. NOTE: plot_line_chart rounds the
                # 'review' column in place; plot_star_distribution below
                # relies on those integer values.
                fig_line, ax_line = plt.subplots()
                plot_line_chart(avg_reviews_topic, ax_line)
                st.pyplot(fig_line, use_container_width=True)
                # LEAK FIX: close per-card figures — Streamlit reruns this
                # script constantly and unclosed figures accumulate.
                plt.close(fig_line)

                # Star-rating distribution for the same topic.
                fig_bar, ax_bar = plt.subplots(figsize=(6, 4))
                plot_star_distribution(avg_reviews_topic, ax_bar)
                st.pyplot(fig_bar, use_container_width=True)
                plt.close(fig_bar)

    # Word clouds for positive (>= 3 stars) and negative (< 3 stars) reviews.
    # BUG FIX: the original comment promised filtering "based on the date
    # range" but the code ignored the slider — apply the range here too.
    st.markdown("<h2 style='text-align: center;'>Wordclouds</h2>",
                unsafe_allow_html=True)
    date_mask = (df['date'] >= start_date) & (df['date'] <= end_date)
    df_in_range = df[date_mask]
    positive_comments = ' '.join(
        df_in_range.loc[df_in_range['review'] >= 3, 'comment'].astype(str))
    negative_comments = ' '.join(
        df_in_range.loc[df_in_range['review'] < 3, 'comment'].astype(str))

    fig_pos_wordcloud = generate_wordcloud(positive_comments, "Positive Reviews Wordcloud")
    st.image(fig_pos_wordcloud, use_column_width=True)
    fig_neg_wordcloud = generate_wordcloud(negative_comments, "Negative Reviews Wordcloud")
    st.image(fig_neg_wordcloud, use_column_width=True)


if __name__ == '__main__':
    main()