File size: 2,093 Bytes
97a5393
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import pandas as pd

# Load both source CSVs, df1 first and then df2. The hf:// paths address
# files inside Hugging Face dataset repositories.
# NOTE(review): presumably needs the huggingface_hub package installed so
# that the hf:// protocol can be resolved -- confirm in the environment.
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "hf://datasets/zzhdbw/Simplified_Chinese_Multi-Emotion_Dialogue_Dataset/Simplified_Chinese_Multi-Emotion_Dialogue_Dataset.csv",
        "hf://datasets/jakeazcona/short-text-multi-labeled-emotion-classification/FINALDATA.csv",
    )
)

# df1: Chinese emotion name -> unified integer label.
# The position of each name in the tuple below is its label id:
#   伤心 (sad) = 0, 生气 (angry) = 1, 关心 (caring) = 2, 惊讶 (surprised) = 3,
#   开心 (happy) = 4, 平静 (calm) = 5, 厌恶 (disgusted) = 6
# Any other emotion found in df1 has no entry here, so mapping it yields NA
# and the row is discarded later in the script.
DF1_LABEL_MAP = {
    emotion_name: label_id
    for label_id, emotion_name in enumerate(
        ("伤心", "生气", "关心", "惊讶", "开心", "平静", "厌恶")
    )
}

# df2: numeric `emotion` code -> unified integer label.
# The position in the tuple is the df2 code, the value is the unified label:
#   0 -> 1, 1 -> 7, 2 -> 2, 3 -> 3, 4 -> 4, 5 -> 5
# Label 7 has no counterpart in DF1_LABEL_MAP (which stops at 6).
DF2_EMOTION_MAP = dict(enumerate((1, 7, 2, 3, 4, 5)))

# Standardise both datasets to a common (text, label) schema, merge them and
# write the training CSV.
#
# Fail early with the actual column list if either input is missing a
# required column.
if "label" not in df1.columns or "text" not in df1.columns:
    raise KeyError(f"df1 缺少必要列: label 或 text,现有列: {list(df1.columns)}")
if "emotion" not in df2.columns or "sample" not in df2.columns:
    raise KeyError(f"df2 缺少必要列: emotion 或 sample,现有列: {list(df2.columns)}")

# Only these two columns are kept in the merged output.
final_cols = ["text", "label"]

# df1 already stores its text under "text"; only the Chinese label names need
# translating to unified ids. Names absent from DF1_LABEL_MAP become NaN.
# Building a fresh two-column frame avoids copying unused columns.
df1_std = pd.DataFrame({
    "text": df1["text"],
    "label": df1["label"].map(DF1_LABEL_MAP),
})

# df2 stores its text under "sample" and its class under "emotion".
# Coerce "emotion" to numeric first so that non-numeric values turn into NaN
# instead of raising; codes absent from DF2_EMOTION_MAP also become NaN.
# Selecting "sample" directly (rather than renaming it inside a full copy)
# cannot create a duplicate "text" column if df2 ever carries one of its own.
df2_std = pd.DataFrame({
    "text": df2["sample"],
    "label": pd.to_numeric(df2["emotion"], errors="coerce").map(DF2_EMOTION_MAP),
})

# Stack both sources, then discard unusable rows:
#   * label is NaN -> the emotion could not be mapped;
#   * text is NaN  -> there is nothing to train on. Filtering on label alone
#     would let such rows through as empty-text samples with a valid label.
# After the NaN rows are gone the label column can be cast to plain int.
merged = (
    pd.concat([df1_std[final_cols], df2_std[final_cols]], ignore_index=True)
    .dropna(subset=final_cols)
    .astype({"label": int})
)

# Output. "utf-8-sig" prepends a BOM to the written file.
merged.to_csv("emotion-classification-train.csv", index=False, encoding="utf-8-sig")
print(f"merged saved: emotion-classification-train.csv, rows={len(merged)}, cols={len(merged.columns)}")