import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings

warnings.filterwarnings('ignore')

# Set visualization style
sns.set_theme(style="whitegrid")

print("Loading datasets...")
# Ensure the file paths match your project structure
df_reviews = pd.read_csv('data/DisneylandReviews.csv', encoding='latin-1')

# Read the new traffic dataset. Skip the first 4 rows so the 5th row becomes the header.
df_traffic = pd.read_csv('data/Table 650-80001_en.csv', skiprows=4)

print("Cleaning review data...")
# Filter out missing dates
df_reviews = df_reviews[df_reviews['Year_Month'] != 'missing']

# Convert Year_Month to standard datetime period (YYYY-MM)
df_reviews['Year_Month'] = pd.to_datetime(df_reviews['Year_Month'], format='%Y-%m').dt.to_period('M')

print("Review counts by branch:")
print(df_reviews['Branch'].value_counts())

Loading datasets...
Cleaning review data...
Review counts by branch:
Branch
Disneyland_California    18202
Disneyland_Paris         12694
Disneyland_HongKong       9147
Name: count, dtype: int64

print("Processing traffic data...")

# Rename the first two columns to 'Year' and 'Month'
df_traffic.rename(columns={df_traffic.columns[0]: 'Year', df_traffic.columns[1]: 'Month'}, inplace=True)

# Drop the first row (which contains the strings "Year" and "Month" from the original CSV structure)
df_traffic = df_traffic.iloc[1:].copy()

# Clean Year and Month columns (convert to string and strip spaces)
df_traffic['Year'] = df_traffic['Year'].astype(str).str.replace(r'\.0$', '', regex=True).str.strip()
df_traffic['Month'] = df_traffic['Month'].astype(str).str.strip()

# Drop rows where Month is empty or 'nan' (these are annual total rows like 2004, 2005 without months)
df_traffic = df_traffic[(df_traffic['Month'] != '') & (df_traffic['Month'].str.lower() != 'nan')]

# Create Year_Month string (e.g., "2004-Jan")
df_traffic['Year_Month_str'] = df_traffic['Year'] + '-' + df_traffic['Month']

# Parse datetime using %Y-%b (since months are 'Jan', 'Feb', etc.)
df_traffic['Year_Month'] = pd.to_datetime(df_traffic['Year_Month_str'], format='%Y-%b').dt.to_period('M')

# Define the columns that represent "Other Visitors" (Foreigners excluding Greater China)
foreign_regions = [
    'Africa', 'The Americas', 'Australia, New Zealand and South Pacific',
    'Europe', 'Middle East', 'North Asia (1)', 'South and Southeast Asia (2)'
]

# Clean commas and convert these columns to numeric
for col in foreign_regions:
    if col in df_traffic.columns:
        df_traffic[col] = pd.to_numeric(df_traffic[col].astype(str).str.replace(',', ''), errors='coerce').fillna(0)

# Calculate total Monthly Other Visitor Arrivals by summing the foreign regions
df_traffic['Monthly_Other_Visitor_Arrivals'] = df_traffic[foreign_regions].sum(axis=1)

# Keep only necessary columns for the final traffic dataframe
monthly_traffic = df_traffic[['Year_Month', 'Monthly_Other_Visitor_Arrivals']].copy()

# Calculate expected staying days (Assuming 7 days per person)
monthly_traffic['Expected_Staying_Days_Other_Visitors'] = monthly_traffic['Monthly_Other_Visitor_Arrivals'] * 7

print("Traffic data preview:")
print(monthly_traffic.head())

Processing traffic data...
Traffic data preview:
   Year_Month  Monthly_Other_Visitor_Arrivals  \
24    2004-02                        463776.0   
25    2004-03                        564897.0   
26    2004-04                        629207.0   
27    2004-05                        555245.0   
28    2004-06                        587220.0   

    Expected_Staying_Days_Other_Visitors  
24                             3246432.0  
25                             3954279.0  
26                             4404449.0  
27                             3886715.0  
28                             4110540.0

print("Generating visualizations...")
fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# Plot 1: Rating Distribution (Pre-aggregated for speed)
rating_counts = df_reviews.groupby(['Branch', 'Rating']).size().unstack()
rating_counts.T.plot(kind='bar', ax=axes[0, 0], colormap='Set2')
axes[0, 0].set_title('Rating Distribution by Disneyland Branch', fontsize=14)
axes[0, 0].set_xlabel('Rating')
axes[0, 0].set_ylabel('Number of Reviews')
axes[0, 0].tick_params(axis='x', rotation=0)

# Plot 2: HK Reviews Trend (Pre-aggregated and converted to string for fast plotting)
hk_reviews = df_reviews[df_reviews['Branch'] == 'Disneyland_HongKong']
hk_trend = hk_reviews.groupby('Year_Month').size()
hk_trend.index = hk_trend.index.astype(str)
axes[0, 1].plot(hk_trend.index, hk_trend.values, color='coral', linewidth=2)
axes[0, 1].set_title('Hong Kong Disneyland Monthly Review Trend', fontsize=14)
axes[0, 1].set_xlabel('Year-Month')
axes[0, 1].set_ylabel('Number of Reviews')
axes[0, 1].tick_params(axis='x', rotation=45)
# Show only 1 tick per year to avoid x-axis crowding
for i, label in enumerate(axes[0, 1].xaxis.get_ticklabels()):
    if i % 12 != 0: label.set_visible(False)

# Plot 3: HK Monthly Foreign Visitors Trend
monthly_traffic['Year_Month_str'] = monthly_traffic['Year_Month'].astype(str)
axes[1, 0].plot(monthly_traffic['Year_Month_str'], monthly_traffic['Monthly_Other_Visitor_Arrivals'], marker='o', color='teal')
axes[1, 0].set_title('HK Monthly "Other Visitors" Arrivals Trend', fontsize=14)
axes[1, 0].set_xlabel('Year-Month')
axes[1, 0].set_ylabel('Total Arrivals')
axes[1, 0].tick_params(axis='x', rotation=45)
# Show 1 tick every 12 months for cleaner x-axis
for i, label in enumerate(axes[1, 0].xaxis.get_ticklabels()):
    if i % 12 != 0: label.set_visible(False)

# Hide the empty 4th subplot
axes[1, 1].axis('off')

plt.tight_layout()
plt.show()

Generating visualizations...

print("Splitting and saving datasets...")

branches = ['Disneyland_Paris', 'Disneyland_California', 'Disneyland_HongKong']

# Use tqdm to display a progress bar during the export process
for branch in tqdm(branches, desc="Processing and Exporting CSVs"):
    # Filter data for the specific branch
    df_branch = df_reviews[df_reviews['Branch'] == branch].copy()

    # If it's Hong Kong, merge the traffic data
    if branch == 'Disneyland_HongKong':
        df_branch = pd.merge(
            df_branch,
            monthly_traffic[['Year_Month', 'Monthly_Other_Visitor_Arrivals', 'Expected_Staying_Days_Other_Visitors']],
            on='Year_Month',
            how='left'
        )
        # Fill missing traffic data with 0 to prevent empty trailing commas (,,) in CSV
        df_branch['Monthly_Other_Visitor_Arrivals'].fillna(0, inplace=True)
        df_branch['Expected_Staying_Days_Other_Visitors'].fillna(0, inplace=True)

    # Convert Year_Month back to string for clean CSV saving
    df_branch['Year_Month'] = df_branch['Year_Month'].astype(str)

    # Save to CSV
    filename = f"data/{branch}_Processed.csv"
    df_branch.to_csv(filename, index=False)

print("\nData processing complete! 3 CSV files have been successfully exported to the 'data' folder.")

Splitting and saving datasets...

Processing and Exporting CSVs: 100%|██████████| 3/3 [00:00<00:00, 18.86it/s]

Data processing complete! 3 CSV files have been successfully exported to the 'data' folder.