# File size: 2,093 Bytes (revision 97a5393)
import pandas as pd
# Both source tables are CSV files addressed by `hf://datasets/...` URLs.
_DF1_CSV = "hf://datasets/zzhdbw/Simplified_Chinese_Multi-Emotion_Dialogue_Dataset/Simplified_Chinese_Multi-Emotion_Dialogue_Dataset.csv"
_DF2_CSV = "hf://datasets/jakeazcona/short-text-multi-labeled-emotion-classification/FINALDATA.csv"

# df1: Chinese multi-emotion dialogue data; df2: short-text emotion data.
df1 = pd.read_csv(_DF1_CSV)
df2 = pd.read_csv(_DF2_CSV)
# df1: Chinese emotion name -> unified integer label.
# The tuple is ordered by label id, so position == label:
#   0 sad, 1 angry, 2 caring, 3 surprised, 4 happy, 5 calm, 6 disgusted
# Any other emotion found in df1 is absent from this map; mapping such a
# value yields NA, and those rows are expected to be dropped afterwards.
_DF1_EMOTIONS_BY_LABEL = ("伤心", "生气", "关心", "惊讶", "开心", "平静", "厌恶")
DF1_LABEL_MAP = {
    emotion: label for label, emotion in enumerate(_DF1_EMOTIONS_BY_LABEL)
}
# df2: numeric `emotion` code -> unified integer label.
# The tuple position is the df2 code (0..5) and the value is the unified
# label, i.e. 0->1, 1->7, 2->2, 3->3, 4->4, 5->5.
# Unified label 7 is produced only by this map.
DF2_EMOTION_MAP = dict(enumerate((1, 7, 2, 3, 4, 5)))
# Unify column names: df1's `text` and df2's `sample` both end up in the
# `text` field of the combined table.
# Fail fast with a KeyError listing the columns actually present when either
# frame lacks a column the later steps read.
if "label" not in df1.columns or "text" not in df1.columns:
    raise KeyError(f"df1 缺少必要列: label 或 text,现有列: {list(df1.columns)}")
if "emotion" not in df2.columns or "sample" not in df2.columns:
    raise KeyError(f"df2 缺少必要列: emotion 或 sample,现有列: {list(df2.columns)}")
# Only these two columns survive into the combined table.
final_cols = ["text", "label"]

# df1: work on a copy whose `label` holds the unified integer label
# (emotions missing from DF1_LABEL_MAP become NaN). `text` keeps its name.
df1_std = df1.assign(label=df1["label"].map(DF1_LABEL_MAP))

# df2: coerce `emotion` to numeric so it can be looked up against the integer
# keys of DF2_EMOTION_MAP; values that cannot be parsed become NaN.
emotion_codes = pd.to_numeric(df2["emotion"], errors="coerce")
df2_std = df2.assign(
    emotion=emotion_codes,
    label=emotion_codes.map(DF2_EMOTION_MAP),
).rename(columns={"sample": "text"})  # df2's `sample` becomes `text`

# Stack df1 rows first, then df2 rows, under a fresh 0..n-1 index.
frames = [df1_std[final_cols], df2_std[final_cols]]
merged = pd.concat(frames, ignore_index=True)

# Discard samples whose label could not be mapped, then store labels as int.
merged = merged.loc[merged["label"].notna()].copy()
merged["label"] = merged["label"].astype(int)
# Output: write the combined table as CSV (no index column, UTF-8 with BOM)
# and print a one-line summary of its size.
n_rows, n_cols = merged.shape
merged.to_csv("emotion-classification-train.csv", encoding="utf-8-sig", index=False)
print(f"merged saved: emotion-classification-train.csv, rows={n_rows}, cols={n_cols}")
|