import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import pipeline
from tqdm import tqdm
# Register tqdm's progress-bar integration with pandas (provides `progress_apply`).
tqdm.pandas(desc="正在进行情感分析推理")

# Download the NLTK resources used below: the VADER lexicon and the Punkt
# sentence-tokenizer data.
for nltk_package in ('vader_lexicon', 'punkt', 'punkt_tab'):
    nltk.download(nltk_package)

# Apply seaborn's "whitegrid" theme to every plot.
sns.set_theme(style="whitegrid")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\17164/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\17164/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\17164/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!

# Location of the pre-processed Hong Kong Disneyland review dataset.
file_path = 'data/Disneyland_HongKong_Processed.csv'

# Load the CSV into a DataFrame, then preview its first rows.
df_hk = pd.read_csv(file_path)
hk_preview = df_hk.head()
display(hk_preview)

# Core review aspects, each mapped to the lowercase English keywords whose
# presence in a sentence marks that sentence as discussing the aspect.
aspect_dict = {
    'Attractions_and_Shows': [
        'ride', 'attraction', 'parade', 'show', 'firework',
        'castle', 'mountain', 'space', 'iron man',
    ],
    'Food_and_Dining': [
        'food', 'restaurant', 'eat', 'meal', 'snack',
        'drink', 'lunch', 'dinner', 'pricey food',
    ],
    'Staff_and_Service': [
        'staff', 'cast member', 'service', 'friendly',
        'helpful', 'rude', 'attitude',
    ],
    'Crowd_and_WaitTime': [
        'crowd', 'wait', 'line', 'queue',
        'busy', 'packed', 'fastpass', 'hour',
    ],
    'Value_for_Money': [
        'ticket', 'price', 'expensive', 'worth',
        'value', 'money', 'cost',
    ],
}

# Build the lexicon-based VADER sentiment analyzer.
print("Initializing VADER...")
sia = SentimentIntensityAnalyzer()

# Build the RoBERTa sentiment pipeline. device=-1 runs on the CPU (switch to 0
# to use a GPU when one is available).
print("Loading RoBERTa model... This might take a while.")
roberta_pipeline_options = {
    "model": "cardiffnlp/twitter-roberta-base-sentiment-latest",
    "top_k": None,
    "device": -1,
}
roberta_analyzer = pipeline("text-classification", **roberta_pipeline_options)
print("Both models loaded successfully!")

Initializing VADER...
Loading RoBERTa model... This might take a while.

Loading weights: 100%|██████████| 201/201 [00:00<00:00, 59731.83it/s]
RobertaForSequenceClassification LOAD REPORT from: cardiffnlp/twitter-roberta-base-sentiment-latest
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.embeddings.position_ids | UNEXPECTED |  | 
roberta.pooler.dense.bias       | UNEXPECTED |  | 
roberta.pooler.dense.weight     | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.

Both models loaded successfully!

def _compile_aspect_patterns(aspects):
    """Build one whole-word regex per aspect from its keyword list.

    Keywords are escaped so they always match literally. Aspects with an
    empty keyword list are left out, so they can never match a sentence.
    """
    return {
        aspect: re.compile(
            r'\b(?:' + '|'.join(re.escape(kw) for kw in keywords) + r')\b'
        )
        for aspect, keywords in aspects.items()
        if keywords
    }


def _roberta_polarity(sentence):
    """Return P(positive) - P(negative) for *sentence* from the RoBERTa pipeline."""
    # Truncate to 512 tokens so over-long sentences cannot overflow the model
    # input (the original note says this prevents CUDA errors).
    label_scores = roberta_analyzer(sentence, truncation=True, max_length=512)[0]
    probabilities = {item['label'].lower(): item['score'] for item in label_scores}
    try:
        return probabilities['positive'] - probabilities['negative']
    except KeyError as err:
        raise ValueError(
            "RoBERTa output is missing the 'positive'/'negative' labels; "
            f"got labels: {sorted(probabilities)}"
        ) from err


def analyze_aspect_sentiment_dual(text):
    """Score one review per aspect with both VADER and RoBERTa.

    The review is split into sentences. Each sentence that contains at least
    one keyword of an aspect (whole-word, case-insensitive match against
    ``aspect_dict``) is scored once by each model, and that score is credited
    to every aspect the sentence mentions. Per aspect, the sentence scores are
    then averaged.

    Args:
        text: Raw review text. Any non-string value (e.g. NaN) produces an
            all-NaN result.

    Returns:
        A ``pd.Series`` indexed by ``"<aspect>_VADER"`` and
        ``"<aspect>_RoBERTa"`` for every key of ``aspect_dict``. VADER values
        are compound scores; RoBERTa values are P(positive) - P(negative).
        Entries are NaN for aspects no sentence mentions.

    Raises:
        ValueError: If the RoBERTa pipeline output lacks a 'positive' or
            'negative' label.
    """
    result_dict = {}
    for aspect in aspect_dict:
        result_dict[f"{aspect}_VADER"] = np.nan
        result_dict[f"{aspect}_RoBERTa"] = np.nan

    if not isinstance(text, str):
        return pd.Series(result_dict)

    # Compile the keyword patterns once per review instead of rebuilding a
    # pattern string for every keyword of every sentence.
    # NOTE(review): whole-word matching means inflected forms such as "rides"
    # or "crowded" do not match "ride" / "crowd" — confirm this is intended.
    aspect_patterns = _compile_aspect_patterns(aspect_dict)

    vader_scores = {aspect: [] for aspect in aspect_dict}
    roberta_scores = {aspect: [] for aspect in aspect_dict}

    # Tokenize and score the original-case text: VADER treats capitalization
    # as an emphasis cue and Punkt uses it to find sentence boundaries. Only
    # the keyword matching is done on a lowercased copy.
    for sentence in nltk.sent_tokenize(text):
        lowered = sentence.lower()
        matched_aspects = [
            aspect
            for aspect, pattern in aspect_patterns.items()
            if pattern.search(lowered)
        ]
        if not matched_aspects:
            continue

        # Score the sentence once per model and share it across all matches.
        v_score = sia.polarity_scores(sentence)['compound']
        r_score = _roberta_polarity(sentence)

        for aspect in matched_aspects:
            vader_scores[aspect].append(v_score)
            roberta_scores[aspect].append(r_score)

    for aspect in aspect_dict:
        # Both lists are filled together, so checking one is sufficient.
        if vader_scores[aspect]:
            result_dict[f"{aspect}_VADER"] = np.mean(vader_scores[aspect])
            result_dict[f"{aspect}_RoBERTa"] = np.mean(roberta_scores[aspect])

    return pd.Series(result_dict)

print("Performing Dual Aspect-Based Sentiment Analysis...")
print("Processing with both VADER and RoBERTa. Please wait...")

# Run the dual-model scorer over every review (with a progress bar); each
# review yields one row of per-aspect score columns.
review_texts = df_hk['Review_Text']
aspect_sentiments = review_texts.progress_apply(analyze_aspect_sentiment_dual)

# Place the new score columns side by side with the original dataset columns.
df_hk_augmented = pd.concat((df_hk, aspect_sentiments), axis=1)

print("Data augmentation complete!")

Performing Dual Aspect-Based Sentiment Analysis...
Processing with both VADER and RoBERTa. Please wait...

正在进行情感分析推理: 100%|██████████| 9147/9147 [07:07<00:00, 21.40it/s]

Data augmentation complete!

# Collect the names of every sentiment-score column.
score_columns = [
    col
    for col in df_hk_augmented.columns
    if any(tag in col for tag in ('_VADER', '_RoBERTa'))
]

# Reshape to long format (one row per review/aspect/model score) for seaborn's
# grouped boxplot, discarding the scores that are missing.
df_melted = pd.melt(
    df_hk_augmented,
    id_vars=['Review_ID'],
    value_vars=score_columns,
    var_name='Aspect_Model',
    value_name='Sentiment_Score',
).dropna(subset=['Sentiment_Score'])

# Split "Aspect_Model" at its last underscore into the aspect and the model name.
aspect_and_model = df_melted['Aspect_Model'].str.rsplit('_', n=1, expand=True)
df_melted[['Aspect', 'Model']] = aspect_and_model

# Draw the grouped boxplot comparing the two models per aspect.
fig, ax = plt.subplots(figsize=(14, 8))
sns.boxplot(
    data=df_melted,
    x='Sentiment_Score',
    y='Aspect',
    hue='Model',
    palette=['#4C72B0', '#DD8452'],
    ax=ax,
)

# Dashed reference line at the neutral score.
ax.axvline(0, color='red', linestyle='--', alpha=0.5)

ax.set_title('Comparison of Sentiment Scores: VADER vs RoBERTa (Hong Kong Disneyland)', fontsize=16)
ax.set_xlabel('Sentiment Score (-1: Negative, 1: Positive)', fontsize=12)
ax.set_ylabel('Aspects', fontsize=12)
ax.legend(title='Model')

plt.show()

# Parse Year_Month into datetimes so the x-axis is ordered chronologically.
# NOTE: this overwrites the column in place, so any later export of
# df_hk_augmented writes the parsed datetime values.
df_hk_augmented['Year_Month'] = pd.to_datetime(df_hk_augmented['Year_Month'])

# Monthly mean Crowd_and_WaitTime score for each model; months where a mean is
# missing are dropped.
crowd_columns = ['Crowd_and_WaitTime_VADER', 'Crowd_and_WaitTime_RoBERTa']
monthly_trend = (
    df_hk_augmented.groupby('Year_Month')[crowd_columns]
    .mean()
    .reset_index()
    .dropna()
)

fig, ax = plt.subplots(figsize=(14, 6))

# One trend line per model: (score column, marker, legend label, color).
trend_lines = (
    ('Crowd_and_WaitTime_VADER', 'o', 'VADER', '#4C72B0'),
    ('Crowd_and_WaitTime_RoBERTa', 's', 'RoBERTa', '#DD8452'),
)
for score_column, line_marker, model_label, line_color in trend_lines:
    sns.lineplot(
        data=monthly_trend,
        x='Year_Month',
        y=score_column,
        marker=line_marker,
        label=model_label,
        color=line_color,
        ax=ax,
    )

# Dashed reference line at the neutral score.
ax.axhline(0, color='grey', linestyle='--')

ax.set_title('Trend Comparison: Crowd & Wait Time Sentiment (VADER vs RoBERTa)', fontsize=15)
ax.set_xlabel('Date (Year-Month)', fontsize=12)
ax.set_ylabel('Average Sentiment Score', fontsize=12)
plt.xticks(rotation=45)
ax.legend(title='Model')
plt.tight_layout()

plt.show()

# Destination of the augmented dataset, inside the data folder.
output_path = 'data/Disneyland_HongKong_Augmented.csv'

# Write the DataFrame to CSV.
# index=False leaves out the row index (0, 1, 2...).
# encoding='utf-8-sig' keeps Chinese/special characters from appearing garbled
# when the file is opened in Excel.
csv_options = {'index': False, 'encoding': 'utf-8-sig'}
df_hk_augmented.to_csv(output_path, **csv_options)

print(f"增强后的数据已成功保存至: {output_path}")

增强后的数据已成功保存至: data/Disneyland_HongKong_Augmented.csv

	Review_ID	Rating	Year_Month	Reviewer_Location	Review_Text	Branch	Monthly_Other_Visitor_Arrivals	Expected_Staying_Days_Other_Visitors
0	670772142	4	2019-04	Australia	If you've ever been to Disneyland anywhere you...	Disneyland_HongKong	1042157.0	7295099.0
1	670682799	4	2019-05	Philippines	Its been a while since d last time we visit HK...	Disneyland_HongKong	950880.0	6656160.0
2	670623270	4	2019-04	United Arab Emirates	Thanks God it wasn t too hot or too humid wh...	Disneyland_HongKong	1042157.0	7295099.0
3	670607911	4	2019-04	Australia	HK Disneyland is a great compact park. Unfortu...	Disneyland_HongKong	1042157.0	7295099.0
4	670607296	4	2019-04	United Kingdom	the location is not in the city, took around 1...	Disneyland_HongKong	1042157.0	7295099.0