import pandas as pd

# Merge two Hugging Face emotion datasets into a single training CSV with
# two columns: `text` and a unified integer `label`.

df1 = pd.read_csv(
    "hf://datasets/zzhdbw/Simplified_Chinese_Multi-Emotion_Dialogue_Dataset/Simplified_Chinese_Multi-Emotion_Dialogue_Dataset.csv"
)
df2 = pd.read_csv(
    "hf://datasets/jakeazcona/short-text-multi-labeled-emotion-classification/FINALDATA.csv"
)

# df1: Chinese emotion name -> unified label.
# sad: 0, angry: 1, caring: 2, surprised: 3, happy: 4, calm: 5, disgusted: 6
# (Any other emotion present in df1 maps to NA and is dropped further down.)
DF1_LABEL_MAP = {
    "伤心": 0,
    "生气": 1,
    "关心": 2,
    "惊讶": 3,
    "开心": 4,
    "平静": 5,
    "厌恶": 6,
}

# df2: numeric `emotion` code -> unified label.
# 0 -> 1, 1 -> 7, 2 -> 2, 3 -> 3, 4 -> 4, 5 -> 5
DF2_EMOTION_MAP = {
    0: 1,
    1: 7,
    2: 2,
    3: 3,
    4: 4,
    5: 5,
}

# Fail fast if either source lacks the columns the rest of the script relies on.
# df1's `text` and df2's `sample` both end up in the output's `text` column.
if not all(required in df1.columns for required in ("label", "text")):
    raise KeyError(f"df1 缺少必要列: label 或 text,现有列: {list(df1.columns)}")
if not all(required in df2.columns for required in ("emotion", "sample")):
    raise KeyError(f"df2 缺少必要列: emotion 或 sample,现有列: {list(df2.columns)}")

# df1 already uses `text`; only its label column needs translating.
# `assign` returns a new frame, so the raw `df1` is left untouched.
df1_std = df1.assign(label=df1["label"].map(DF1_LABEL_MAP))

# Coerce `emotion` to numbers first so it can be looked up in the integer-keyed
# map; values that cannot be parsed become NaN and therefore yield an NA label.
emotion_numeric = pd.to_numeric(df2["emotion"], errors="coerce")
df2_std = df2.assign(
    emotion=emotion_numeric,
    label=emotion_numeric.map(DF2_EMOTION_MAP),
).rename(columns={"sample": "text"})  # align df2's text column name with df1

# Only `text` and `label` take part in the merge.
final_cols = ["text", "label"]
merged = pd.concat(
    [frame[final_cols] for frame in (df1_std, df2_std)],
    ignore_index=True,
)

# Discard samples whose label could not be mapped, then store labels as ints
# (the column holds floats while NA values are still present).
merged = merged.dropna(subset=["label"]).astype({"label": int})

# Write the result; "utf-8-sig" prefixes the file with a UTF-8 byte-order mark.
merged.to_csv("emotion-classification-train.csv", index=False, encoding="utf-8-sig")
print(f"merged saved: emotion-classification-train.csv, rows={len(merged)}, cols={len(merged.columns)}")