"""Merge two emotion-classification datasets into a single training CSV.

Both sources are converted to a two-column ``text`` / ``label`` layout with
one shared integer label scheme, concatenated, stripped of rows whose label
could not be mapped, and written to ``emotion-classification-train.csv``.

The transformation helpers are pure functions of their input frames; the
network reads and the file write only happen when the file is executed as a
script, so importing it has no side effects.
"""

import pandas as pd

DF1_URL = (
    "hf://datasets/zzhdbw/Simplified_Chinese_Multi-Emotion_Dialogue_Dataset/"
    "Simplified_Chinese_Multi-Emotion_Dialogue_Dataset.csv"
)
DF2_URL = (
    "hf://datasets/jakeazcona/short-text-multi-labeled-emotion-classification/"
    "FINALDATA.csv"
)
OUTPUT_CSV = "emotion-classification-train.csv"

# df1: Chinese emotion name -> unified label.
# Any other emotion present in df1 maps to NA and is dropped after merging.
DF1_LABEL_MAP = {
    "伤心": 0,  # sad
    "生气": 1,  # angry
    "关心": 2,  # caring
    "惊讶": 3,  # surprised
    "开心": 4,  # happy
    "平静": 5,  # calm
    "厌恶": 6,  # disgusted
}

# df2: numeric ``emotion`` code -> unified label.
# Unified label 7 is produced only here (df1 never yields it), and df2 never
# yields unified labels 0 or 6.
# NOTE(review): the meaning of df2's raw codes is not visible in this file;
# confirm against the dataset card that e.g. code 0 matches "angry" (1).
DF2_EMOTION_MAP = {
    0: 1,
    1: 7,
    2: 2,
    3: 3,
    4: 4,
    5: 5,
}

# Columns kept from each standardized frame when merging.
final_cols = ["text", "label"]


def _require_columns(df: pd.DataFrame, name: str, required: tuple) -> None:
    """Raise ``KeyError`` if *df* lacks any column listed in *required*."""
    if any(column not in df.columns for column in required):
        raise KeyError(
            f"{name} 缺少必要列: {' 或 '.join(required)},现有列: {list(df.columns)}"
        )


def standardize_df1(df1: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df1* whose ``label`` column holds unified labels.

    The ``text`` column is kept under its existing name. Labels missing from
    ``DF1_LABEL_MAP`` become NA. The input frame is not modified.

    Raises:
        KeyError: if *df1* has no ``label`` or no ``text`` column.
    """
    _require_columns(df1, "df1", ("label", "text"))
    df1_std = df1.copy()
    df1_std["label"] = df1_std["label"].map(DF1_LABEL_MAP)
    return df1_std


def standardize_df2(df2: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df2* with a unified ``label`` and a ``text`` column.

    ``emotion`` is coerced to numeric first so that values read as strings
    can still be looked up in the integer-keyed ``DF2_EMOTION_MAP``;
    unparseable or unknown codes become NA. ``sample`` is renamed to
    ``text``. The input frame is not modified.

    Raises:
        KeyError: if *df2* has no ``emotion`` or no ``sample`` column.
    """
    _require_columns(df2, "df2", ("emotion", "sample"))
    df2_std = df2.copy()
    df2_std["emotion"] = pd.to_numeric(df2_std["emotion"], errors="coerce")
    df2_std["label"] = df2_std["emotion"].map(DF2_EMOTION_MAP)
    return df2_std.rename(columns={"sample": "text"})


def merge_standardized(
    df1_std: pd.DataFrame, df2_std: pd.DataFrame
) -> pd.DataFrame:
    """Stack the ``text``/``label`` columns of both frames into one frame.

    Rows whose label could not be mapped (NA) are dropped, after which the
    label column is cast to ``int``. The index is not renumbered after the
    drop.
    """
    merged = pd.concat(
        [df1_std[final_cols], df2_std[final_cols]],
        ignore_index=True,
    )
    # Drop samples whose label could not be mapped.
    merged = merged.dropna(subset=["label"]).copy()
    # Safe only after the NA rows are gone: NA cannot be cast to int.
    merged["label"] = merged["label"].astype(int)
    return merged


# Flat statements (not a main() function) so that df1, df2, merged, ... stay
# module-level names when the file is run as a script or in a notebook.
if __name__ == "__main__":
    df1 = pd.read_csv(DF1_URL)
    df2 = pd.read_csv(DF2_URL)

    df1_std = standardize_df1(df1)
    df2_std = standardize_df2(df2)
    merged = merge_standardized(df1_std, df2_std)

    # "utf-8-sig" prepends a byte-order mark to the output file.
    merged.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")
    print(
        f"merged saved: {OUTPUT_CSV}, "
        f"rows={len(merged)}, cols={len(merged.columns)}"
    )