from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
import numpy as np

# 1️⃣ Load the Chinese RoBERTa tokenizer and model
model_name = "hfl/chinese-roberta-wwm-ext"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Specify the number of labels, e.g. 8 emotion classes
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=8)

# 2️⃣ Load your own emotion dataset
# The CSV needs at least two columns: text (the input) and label (an integer label)
dataset = load_dataset(
    "csv",
    data_files={
        "train": "emotion-classification-train.csv",
        "test": "emotion-classification-test.csv",  # evaluate on a held-out split, not the training file
    },
)

def preprocess(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

dataset = dataset.map(preprocess, batched=True)

# The Transformers Trainer expects the label column to be named "label"
if "labels" in dataset["train"].column_names and "label" not in dataset["train"].column_names:
    dataset = dataset.rename_column("labels", "label")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    # np.mean returns a NumPy scalar, so a plain float() cast suffices
    return {"accuracy": float((preds == labels).mean())}

# 3️⃣ Configure training (keep the best checkpoint)
training_args = TrainingArguments(
    output_dir="./sentiment_roberta",
    eval_strategy="epoch",  # named evaluation_strategy before transformers v4.41
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    save_total_limit=2,
    fp16=True,  # requires a CUDA GPU; set to False on CPU/MPS
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
)

# 4️⃣ Start training
trainer.train()

# 5️⃣ Explicitly save the best model and tokenizer to output_dir
trainer.save_model(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)
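
# 6️⃣ Optional inference sketch (not part of the original script): a minimal
# example assuming the fine-tuned checkpoint saved above in ./sentiment_roberta.
# The sample sentence is arbitrary, and the returned ids (LABEL_0 ... LABEL_7)
# follow whatever integer scheme your CSV's label column used.
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model=training_args.output_dir,
    tokenizer=training_args.output_dir,
)

# Returns a list of dicts such as [{"label": "LABEL_3", "score": ...}]
print(classifier("今天的天气让人心情很好。"))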