text-emotion-classification / text-emotion-classification.py

Billy Lin

text-emotion-classification

97a5393 22 days ago

2.15 kB

	import numpy as np
	import torch
	from datasets import load_dataset
	from transformers import (
	    AutoModelForSequenceClassification,
	    AutoTokenizer,
	    Trainer,
	    TrainingArguments,
	)

	# Step 1: load the Chinese RoBERTa checkpoint (classifier and tokenizer).
	model_name = "hfl/chinese-roberta-wwm-ext"

	# The classification head is created with 8 outputs, e.g. 8 emotion classes.
	# NOTE(review): presumably the CSV labels are integers in [0, 8) - confirm.
	model = AutoModelForSequenceClassification.from_pretrained(
	    model_name,
	    num_labels=8,
	)
	tokenizer = AutoTokenizer.from_pretrained(model_name)

	# Step 2: load the emotion dataset.
	# The CSV must contain at least two columns: `text` (the input string) and
	# `label` (an integer class id).
	# This script references no separate evaluation file, so a slice of the
	# training CSV is held out as the "test" split. Evaluating on the same rows
	# the model was trained on would make the per-epoch accuracy, and the
	# checkpoint selected by it, uninformative. The fixed seed keeps the split
	# reproducible across runs.
	dataset = load_dataset(
	    "csv",
	    data_files="emotion-classification-train.csv",
	    split="train",
	).train_test_split(test_size=0.2, seed=42)

	def preprocess(examples):
	    """Tokenize a batch of rows for the classifier.

	    Args:
	        examples: A batch mapping, as supplied by a batched ``map`` call;
	            only its ``"text"`` entry is read.

	    Returns:
	        The tokenizer output for the batch, with truncation enabled and
	        padding set to the fixed maximum length of 128.
	    """
	    return tokenizer(
	        examples["text"],
	        truncation=True,
	        padding="max_length",
	        max_length=128,
	    )

	# Tokenize every split in batches.
	dataset = dataset.map(preprocess, batched=True)

	# Normalize the label column name to `label`: if the data only provides a
	# `labels` column, rename it.
	train_columns = dataset["train"].column_names
	if "labels" in train_columns and "label" not in train_columns:
	    dataset = dataset.rename_column("labels", "label")

	def compute_metrics(eval_pred):
	    """Compute classification accuracy from an evaluation result.

	    Args:
	        eval_pred: A ``(logits, labels)`` pair. ``logits`` is an array whose
	            last axis indexes the classes (or a tuple whose first element is
	            such an array); ``labels`` holds the integer class ids.

	    Returns:
	        ``{"accuracy": value}`` where ``value`` is a built-in ``float``: the
	        fraction of rows whose arg-max class equals the label.
	    """
	    logits, labels = eval_pred
	    # Predictions may arrive as a tuple when extra outputs accompany the
	    # logits; keep only the logits in that case.
	    if isinstance(logits, tuple):
	        logits = logits[0]
	    preds = np.argmax(logits, axis=-1)
	    # Compute the mean once and convert the NumPy scalar to a built-in float.
	    accuracy = float(np.mean(preds == np.asarray(labels)))
	    return {"accuracy": accuracy}

	# Step 3: configure training. Evaluation and checkpointing both happen once
	# per epoch so that the checkpoint with the highest accuracy can be reloaded
	# when training ends.
	training_args = TrainingArguments(
	    output_dir="./sentiment_roberta",
	    eval_strategy="epoch",
	    save_strategy="epoch",
	    learning_rate=2e-5,
	    per_device_train_batch_size=8,
	    per_device_eval_batch_size=8,
	    num_train_epochs=3,
	    load_best_model_at_end=True,
	    metric_for_best_model="accuracy",
	    greater_is_better=True,
	    save_total_limit=2,
	    # Mixed-precision (fp16) training needs a CUDA GPU; fall back to full
	    # precision elsewhere instead of failing on CPU-only machines.
	    fp16=torch.cuda.is_available(),
	)

	# Wire the model, training arguments, data splits and metric function into
	# a Trainer.
	trainer_kwargs = {
	    "model": model,
	    "args": training_args,
	    "train_dataset": dataset["train"],
	    "eval_dataset": dataset["test"],
	    "compute_metrics": compute_metrics,
	}
	trainer = Trainer(**trainer_kwargs)

	# Step 4: run training.
	trainer.train()

	# Step 5: explicitly save the model and the tokenizer into output_dir.
	# load_best_model_at_end=True is set in the training arguments, so the model
	# saved here is intended to be the best checkpoint.
	final_dir = training_args.output_dir
	trainer.save_model(final_dir)
	tokenizer.save_pretrained(final_dir)