import pandas as pd

# Load both source corpora as DataFrames from their hf:// CSV paths.
# The generator is consumed left to right by the tuple unpacking, so df1 is
# read first and df2 second.
#   df1 <- Simplified_Chinese_Multi-Emotion_Dialogue_Dataset.csv
#   df2 <- FINALDATA.csv (short-text multi-labeled emotion classification)
# NOTE(review): resolving hf:// paths presumably needs an fsspec backend for
# the Hugging Face Hub to be installed -- confirm in the runtime environment.
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "hf://datasets/zzhdbw/Simplified_Chinese_Multi-Emotion_Dialogue_Dataset/Simplified_Chinese_Multi-Emotion_Dialogue_Dataset.csv",
        "hf://datasets/jakeazcona/short-text-multi-labeled-emotion-classification/FINALDATA.csv",
    )
)

# df1: Chinese emotion name -> unified label id.
# The id of each emotion is its position in the tuple below. Any other
# emotion value that df1 may contain is absent from this map, so it maps to
# NA and is discarded later in the script.
DF1_LABEL_MAP = {
    emotion: label_id
    for label_id, emotion in enumerate(
        (
            "伤心",  # 0: sad
            "生气",  # 1: angry
            "关心",  # 2: caring
            "惊讶",  # 3: surprised
            "开心",  # 4: happy
            "平静",  # 5: calm
            "厌恶",  # 6: disgusted
        )
    )
}

# df2: numeric emotion code -> unified label id.
# Raw codes 0..5 are paired positionally with the unified ids below, i.e.
# 0 -> 1, 1 -> 7, 2 -> 2, 3 -> 3, 4 -> 4, 5 -> 5. Unified id 7 has no
# counterpart in DF1_LABEL_MAP.
# NOTE(review): the meaning of df2's raw codes is not visible here -- confirm
# the pairing against the dataset's documentation.
DF2_EMOTION_MAP = dict(zip(range(6), (1, 7, 2, 3, 4, 5)))

# Column unification: df1's "text" and df2's "sample" both end up in the
# "text" field of the merged table. Before doing any work, fail fast with a
# KeyError if either frame lacks the columns that step relies on; the message
# lists the columns that are actually present.
if not {"label", "text"}.issubset(df1.columns):
    raise KeyError(f"df1 缺少必要列: label 或 text，现有列: {list(df1.columns)}")
if not {"emotion", "sample"}.issubset(df2.columns):
    raise KeyError(f"df2 缺少必要列: emotion 或 sample，现有列: {list(df2.columns)}")

# df1 keeps its "text" column name; only its label values are translated to
# unified ids. assign() works on a copy, so df1 itself is left untouched.
df1_std = df1.assign(label=df1["label"].map(DF1_LABEL_MAP))

# df2: coerce "emotion" to numeric first (unparseable values become NaN) so
# it can be looked up in the integer-keyed map, derive "label" from the
# coerced column, then rename "sample" to "text" to match df1.
df2_std = (
    df2.assign(emotion=pd.to_numeric(df2["emotion"], errors="coerce"))
    .assign(label=lambda frame: frame["emotion"].map(DF2_EMOTION_MAP))
    .rename(columns={"sample": "text"})
)

# Only the two shared columns take part in the merge.
final_cols = ["text", "label"]

# Stack df1's rows above df2's with a fresh 0..n-1 index, then discard every
# sample whose label could not be mapped (label is NA).
merged = (
    pd.concat(
        [source[final_cols] for source in (df1_std, df2_std)],
        ignore_index=True,
    )
    .dropna(subset=["label"])
    .copy()
)
# With the NA rows gone, the label column can be cast to plain integers.
merged["label"] = merged["label"].astype(int)

# Write the merged table without the index column. The "utf-8-sig" codec
# puts a UTF-8 byte-order mark at the start of the file.
merged.to_csv(
    "emotion-classification-train.csv",
    encoding="utf-8-sig",
    index=False,
)
# Report the size of what was saved.
row_count, col_count = merged.shape
print(f"merged saved: emotion-classification-train.csv, rows={row_count}, cols={col_count}")