text-emotion-classification / train-data-preload.py

Billy Lin

text-emotion-classification

97a5393 22 days ago

2.09 kB

	import pandas as pd

	# Source datasets, read directly from the Hugging Face Hub through hf:// URLs.
	DF1_URL = "hf://datasets/zzhdbw/Simplified_Chinese_Multi-Emotion_Dialogue_Dataset/Simplified_Chinese_Multi-Emotion_Dialogue_Dataset.csv"
	DF2_URL = "hf://datasets/jakeazcona/short-text-multi-labeled-emotion-classification/FINALDATA.csv"

	# Destination of the merged training table.
	OUTPUT_PATH = "emotion-classification-train.csv"

	# df1: Chinese emotion name -> unified integer label.
	# Any df1 emotion missing from this table maps to NA and is dropped later.
	DF1_LABEL_MAP = {
	    "伤心": 0,  # sad
	    "生气": 1,  # angry
	    "关心": 2,  # caring
	    "惊讶": 3,  # surprised
	    "开心": 4,  # happy
	    "平静": 5,  # calm
	    "厌恶": 6,  # disgusted
	}

	# df2: numeric emotion code -> unified integer label.
	# Code 1 becomes label 7, a label that DF1_LABEL_MAP never produces.
	# NOTE(review): the meaning of df2's numeric codes is not visible in this
	# file; confirm against the dataset card that each code really corresponds
	# to the unified label it is mapped to.
	DF2_EMOTION_MAP = {
	    0: 1,
	    1: 7,
	    2: 2,
	    3: 3,
	    4: 4,
	    5: 5,
	}


	def build_training_frame(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
	    """Merge both raw datasets into one ``text`` / ``label`` table.

	    Neither input frame is modified; the work is done on copies.

	    Args:
	        df1: Frame with a ``text`` column and a ``label`` column holding the
	            Chinese emotion names used as keys of ``DF1_LABEL_MAP``.
	        df2: Frame with a ``sample`` column (the text) and an ``emotion``
	            column holding the codes used as keys of ``DF2_EMOTION_MAP``.

	    Returns:
	        A frame with exactly the columns ``text`` and ``label``: df1 rows
	        first, then df2 rows, with every row whose label could not be mapped
	        removed and ``label`` cast to ``int``.

	    Raises:
	        KeyError: If either frame lacks one of its required columns.
	    """
	    # Fail early with the actual column list so a schema change upstream is
	    # easy to diagnose.
	    if "label" not in df1.columns or "text" not in df1.columns:
	        raise KeyError(f"df1 缺少必要列: label 或 text，现有列: {list(df1.columns)}")
	    if "emotion" not in df2.columns or "sample" not in df2.columns:
	        raise KeyError(f"df2 缺少必要列: emotion 或 sample，现有列: {list(df2.columns)}")

	    # df1 already names its text column "text"; only the label is remapped.
	    df1_std = df1.copy()
	    df1_std["label"] = df1_std["label"].map(DF1_LABEL_MAP)

	    df2_std = df2.copy()
	    # Coerce to numbers first so codes stored as strings still hit the
	    # integer keys of the map; non-numeric values become NaN.
	    df2_std["emotion"] = pd.to_numeric(df2_std["emotion"], errors="coerce")
	    df2_std["label"] = df2_std["emotion"].map(DF2_EMOTION_MAP)
	    # Align df2's text column name with df1's.
	    df2_std = df2_std.rename(columns={"sample": "text"})

	    # Only these two columns take part in the merge.
	    final_cols = ["text", "label"]
	    merged = pd.concat(
	        [df1_std[final_cols], df2_std[final_cols]],
	        ignore_index=True,
	    )

	    # Unmapped labels are NaN at this point; drop them before the int cast,
	    # which would otherwise fail.
	    merged = merged.dropna(subset=["label"]).copy()
	    merged["label"] = merged["label"].astype(int)
	    return merged


	def main() -> None:
	    """Download both datasets, merge them, and write the training CSV."""
	    df1 = pd.read_csv(DF1_URL)
	    df2 = pd.read_csv(DF2_URL)

	    merged = build_training_frame(df1, df2)

	    # utf-8-sig writes a BOM at the start of the file.
	    merged.to_csv(OUTPUT_PATH, index=False, encoding="utf-8-sig")
	    print(f"merged saved: {OUTPUT_PATH}, rows={len(merged)}, cols={len(merged.columns)}")


	if __name__ == "__main__":
	    main()