In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
# Set visualization style
sns.set_theme(style="whitegrid")
In [6]:
print("Loading datasets...")
# Ensure the file paths match your project structure
df_reviews = pd.read_csv('data/DisneylandReviews.csv', encoding='latin-1')
# Read the new traffic dataset. Skip the first 4 rows so the 5th row becomes the header.
df_traffic = pd.read_csv('data/Table 650-80001_en.csv', skiprows=4)
print("Cleaning review data...")
# Filter out missing dates
df_reviews = df_reviews[df_reviews['Year_Month'] != 'missing']
# Convert Year_Month to standard datetime period (YYYY-MM)
df_reviews['Year_Month'] = pd.to_datetime(df_reviews['Year_Month'], format='%Y-%m').dt.to_period('M')
print("Review counts by branch:")
print(df_reviews['Branch'].value_counts())
Loading datasets... Cleaning review data... Review counts by branch: Branch Disneyland_California 18202 Disneyland_Paris 12694 Disneyland_HongKong 9147 Name: count, dtype: int64
In [8]:
print("Processing traffic data...")
# Rename the first two columns to 'Year' and 'Month'
df_traffic.rename(columns={df_traffic.columns[0]: 'Year', df_traffic.columns[1]: 'Month'}, inplace=True)
# Drop the first row (which contains the strings "Year" and "Month" from the original CSV structure)
df_traffic = df_traffic.iloc[1:].copy()
# Clean Year and Month columns (convert to string and strip spaces)
df_traffic['Year'] = df_traffic['Year'].astype(str).str.replace(r'\.0$', '', regex=True).str.strip()
df_traffic['Month'] = df_traffic['Month'].astype(str).str.strip()
# Drop rows where Month is empty or 'nan' (these are annual total rows like 2004, 2005 without months)
df_traffic = df_traffic[(df_traffic['Month'] != '') & (df_traffic['Month'].str.lower() != 'nan')]
# Create Year_Month string (e.g., "2004-Jan")
df_traffic['Year_Month_str'] = df_traffic['Year'] + '-' + df_traffic['Month']
# Parse datetime using %Y-%b (since months are 'Jan', 'Feb', etc.)
df_traffic['Year_Month'] = pd.to_datetime(df_traffic['Year_Month_str'], format='%Y-%b').dt.to_period('M')
# Define the columns that represent "Other Visitors" (Foreigners excluding Greater China)
foreign_regions = [
'Africa', 'The Americas', 'Australia, New Zealand and South Pacific',
'Europe', 'Middle East', 'North Asia (1)', 'South and Southeast Asia (2)'
]
# Clean commas and convert these columns to numeric
for col in foreign_regions:
if col in df_traffic.columns:
df_traffic[col] = pd.to_numeric(df_traffic[col].astype(str).str.replace(',', ''), errors='coerce').fillna(0)
# Calculate total Monthly Other Visitor Arrivals by summing the foreign regions
df_traffic['Monthly_Other_Visitor_Arrivals'] = df_traffic[foreign_regions].sum(axis=1)
# Keep only necessary columns for the final traffic dataframe
monthly_traffic = df_traffic[['Year_Month', 'Monthly_Other_Visitor_Arrivals']].copy()
# Calculate expected staying days (Assuming 7 days per person)
monthly_traffic['Expected_Staying_Days_Other_Visitors'] = monthly_traffic['Monthly_Other_Visitor_Arrivals'] * 7
print("Traffic data preview:")
print(monthly_traffic.head())
Processing traffic data...
Traffic data preview:
Year_Month Monthly_Other_Visitor_Arrivals \
24 2004-02 463776.0
25 2004-03 564897.0
26 2004-04 629207.0
27 2004-05 555245.0
28 2004-06 587220.0
Expected_Staying_Days_Other_Visitors
24 3246432.0
25 3954279.0
26 4404449.0
27 3886715.0
28 4110540.0
In [9]:
print("Generating visualizations...")
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
# Plot 1: Rating Distribution (Pre-aggregated for speed)
rating_counts = df_reviews.groupby(['Branch', 'Rating']).size().unstack()
rating_counts.T.plot(kind='bar', ax=axes[0, 0], colormap='Set2')
axes[0, 0].set_title('Rating Distribution by Disneyland Branch', fontsize=14)
axes[0, 0].set_xlabel('Rating')
axes[0, 0].set_ylabel('Number of Reviews')
axes[0, 0].tick_params(axis='x', rotation=0)
# Plot 2: HK Reviews Trend (Pre-aggregated and converted to string for fast plotting)
hk_reviews = df_reviews[df_reviews['Branch'] == 'Disneyland_HongKong']
hk_trend = hk_reviews.groupby('Year_Month').size()
hk_trend.index = hk_trend.index.astype(str)
axes[0, 1].plot(hk_trend.index, hk_trend.values, color='coral', linewidth=2)
axes[0, 1].set_title('Hong Kong Disneyland Monthly Review Trend', fontsize=14)
axes[0, 1].set_xlabel('Year-Month')
axes[0, 1].set_ylabel('Number of Reviews')
axes[0, 1].tick_params(axis='x', rotation=45)
# Show only 1 tick per year to avoid x-axis crowding
for i, label in enumerate(axes[0, 1].xaxis.get_ticklabels()):
if i % 12 != 0: label.set_visible(False)
# Plot 3: HK Monthly Foreign Visitors Trend
monthly_traffic['Year_Month_str'] = monthly_traffic['Year_Month'].astype(str)
axes[1, 0].plot(monthly_traffic['Year_Month_str'], monthly_traffic['Monthly_Other_Visitor_Arrivals'], marker='o', color='teal')
axes[1, 0].set_title('HK Monthly "Other Visitors" Arrivals Trend', fontsize=14)
axes[1, 0].set_xlabel('Year-Month')
axes[1, 0].set_ylabel('Total Arrivals')
axes[1, 0].tick_params(axis='x', rotation=45)
# Show 1 tick every 12 months for cleaner x-axis
for i, label in enumerate(axes[1, 0].xaxis.get_ticklabels()):
if i % 12 != 0: label.set_visible(False)
# Hide the empty 4th subplot
axes[1, 1].axis('off')
plt.tight_layout()
plt.show()
Generating visualizations...
In [10]:
print("Splitting and saving datasets...")
branches = ['Disneyland_Paris', 'Disneyland_California', 'Disneyland_HongKong']
# Use tqdm to display a progress bar during the export process
for branch in tqdm(branches, desc="Processing and Exporting CSVs"):
# Filter data for the specific branch
df_branch = df_reviews[df_reviews['Branch'] == branch].copy()
# If it's Hong Kong, merge the traffic data
if branch == 'Disneyland_HongKong':
df_branch = pd.merge(
df_branch,
monthly_traffic[['Year_Month', 'Monthly_Other_Visitor_Arrivals', 'Expected_Staying_Days_Other_Visitors']],
on='Year_Month',
how='left'
)
# Fill missing traffic data with 0 to prevent empty trailing commas (,,) in CSV
df_branch['Monthly_Other_Visitor_Arrivals'].fillna(0, inplace=True)
df_branch['Expected_Staying_Days_Other_Visitors'].fillna(0, inplace=True)
# Convert Year_Month back to string for clean CSV saving
df_branch['Year_Month'] = df_branch['Year_Month'].astype(str)
# Save to CSV
filename = f"data/{branch}_Processed.csv"
df_branch.to_csv(filename, index=False)
print("\nData processing complete! 3 CSV files have been successfully exported to the 'data' folder.")
Splitting and saving datasets...
Processing and Exporting CSVs: 100%|██████████| 3/3 [00:00<00:00, 18.86it/s]
Data processing complete! 3 CSV files have been successfully exported to the 'data' folder.
In [ ]: