# text-emotion-classification / text-emotion-classification.py
# Billy Lin
# text-emotion-classification
# 97a5393
import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
# 1) Load the Chinese RoBERTa tokenizer and sequence-classification model.
model_name = "hfl/chinese-roberta-wwm-ext"
# Number of target classes, e.g. 8 emotion categories.
NUM_LABELS = 8
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=NUM_LABELS,
)
# 2) Load the custom emotion dataset.
# The CSV needs at least two columns: text (the input string) and label
# (an integer class id).
#
# Only one labelled file is referenced by this script, so part of it is held
# out for evaluation instead of evaluating on the training rows themselves
# (which would inflate accuracy and make best-checkpoint selection
# meaningless). The fixed seed keeps the split reproducible across runs.
# The result still exposes the "train" and "test" splits used below.
dataset = load_dataset(
    "csv",
    data_files="emotion-classification-train.csv",
    split="train",
).train_test_split(test_size=0.1, seed=42)
def preprocess(examples):
    """Tokenize the ``text`` field of a batch of examples.

    The tokenizer is called with truncation enabled and padding set to
    ``max_length`` with a maximum length of 128, and its output is returned
    unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    return encoded
# Tokenize every split in batches.
dataset = dataset.map(preprocess, batched=True)

# Normalize the label column name: if the data only provides "labels",
# rename it to "label", the name this script standardizes on for training.
train_columns = dataset["train"].column_names
if "label" not in train_columns and "labels" in train_columns:
    dataset = dataset.rename_column("labels", "label")
def compute_metrics(eval_pred):
    """Compute classification accuracy from evaluation predictions.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` holds per-class
            scores with the classes on the last axis; when it is a tuple,
            its first element is used as the logits. ``labels`` holds the
            integer class ids to compare against.

    Returns:
        dict: ``{"accuracy": value}``, where ``value`` is a plain Python
        float giving the fraction of examples whose arg-max class equals
        the label.
    """
    logits, labels = eval_pred
    # A tuple of outputs cannot be fed to argmax directly; take the first
    # entry as the class scores.
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=-1)
    # Compute the mean once; float() converts the NumPy scalar into a plain
    # Python float so the metric serializes cleanly.
    accuracy = float(np.mean(preds == np.asarray(labels)))
    return {"accuracy": accuracy}
# 3) Configure training. Evaluation and checkpointing both run once per
# epoch so the checkpoint with the best accuracy can be reloaded at the end.
training_args = TrainingArguments(
    output_dir="./sentiment_roberta",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    save_total_limit=2,
    # Mixed precision needs a CUDA device; requesting it unconditionally
    # makes the configuration fail on CPU-only machines, so enable it only
    # when a GPU is actually available.
    fp16=torch.cuda.is_available(),
)
# Wire the model, training configuration, tokenized splits and the metric
# callback into a Trainer.
train_split = dataset["train"]
eval_split = dataset["test"]
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=eval_split,
    train_dataset=train_split,
    args=training_args,
    model=model,
)
# 4) Run training.
trainer.train()

# 5) Explicitly save the best model and the tokenizer to output_dir.
final_dir = training_args.output_dir
trainer.save_model(final_dir)
tokenizer.save_pretrained(final_dir)