from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
import numpy as np
import torch

# Chinese RoBERTa with whole-word masking, a common backbone for Chinese text classification.
model_name = "hfl/chinese-roberta-wwm-ext"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sequence-classification head with 8 output classes (one per emotion label).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=8)

# Expect CSV files with at least "text" and "label" columns.
dataset = load_dataset(
    "csv",
    data_files={
        "train": "emotion-classification-train.csv",
        "test": "emotion-classification-test.csv",
    },
)

# Tokenize with fixed-length padding so every batch shares the same shape.
def preprocess(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

dataset = dataset.map(preprocess, batched=True)

# Normalize the label column name: Trainer accepts "label" and renames it
# to "labels" internally before the forward pass.
if "labels" in dataset["train"].column_names and "label" not in dataset["train"].column_names:
    dataset = dataset.rename_column("labels", "label")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    # np.mean returns a NumPy scalar; float() makes it JSON-serializable in logs.
    return {"accuracy": float((preds == labels).mean())}
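
# Optional sketch (assumes scikit-learn is installed, which the script above
# does not require): for 8-way emotion data with class imbalance, macro-F1 is
# often more informative than accuracy. Swap this in for compute_metrics above
# to report both numbers.
from sklearn.metrics import f1_score

def compute_metrics_with_f1(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": float((preds == labels).mean()),
        "macro_f1": float(f1_score(labels, preds, average="macro")),
    }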

training_args = TrainingArguments(
    output_dir="./sentiment_roberta",
    eval_strategy="epoch",          # evaluate at the end of every epoch
    save_strategy="epoch",          # must match eval_strategy for load_best_model_at_end
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    save_total_limit=2,             # keep at most two checkpoints on disk
    fp16=torch.cuda.is_available(), # mixed precision only when a GPU is present
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
)

trainer.train()

# Persist the fine-tuned weights and tokenizer together for later reuse.
trainer.save_model(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)
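
# A minimal inference sketch, assuming the directory saved above. The sample
# sentence is illustrative; without an id2label mapping the pipeline reports
# generic LABEL_0..LABEL_7 names.
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model=training_args.output_dir,
    tokenizer=training_args.output_dir,
)
print(classifier("今天心情特别好!"))  # e.g. [{'label': 'LABEL_3', 'score': 0.97}]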