# Fine-tune ckiplab/albert-tiny-chinese on the Luigi/dinercall-intent dataset,
# using class-weighted cross-entropy to handle label imbalance.
from collections import Counter

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, Trainer, TrainingArguments

# Load the DinerCall intent-classification dataset from the Hugging Face Hub.
dataset = load_dataset("Luigi/dinercall-intent")
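
# Optional sanity check (a sketch; it assumes the "train" split and a ClassLabel
# "label" column, which the rest of this script relies on anyway).
print(dataset)                                   # splits and sizes
print(dataset["train"].features["label"].names)  # human-readable intent names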

# Checkpoint and tokenizer: a compact Chinese ALBERT model from CKIP Lab.
model_checkpoint = "ckiplab/albert-tiny-chinese"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Pad/truncate every utterance to a fixed length of 64 tokens.
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=64)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
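
# Optional: confirm what the tokenizer produced for one example
# (a quick check, not part of training; column names as used above).
sample = tokenized_datasets["train"][0]
print(tokenizer.convert_ids_to_tokens(sample["input_ids"][:10]))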

# Number of intent classes, read from the ClassLabel feature.
num_labels = tokenized_datasets["train"].features["label"].num_classes

# Count how often each label occurs in the training split.
label_list = tokenized_datasets["train"]["label"]
label_freq = Counter(label_list)
total = len(label_list)

# Inverse-frequency class weights: rare intents get proportionally larger
# weight in the loss, so the model is not dominated by the majority class.
class_weights = [total / label_freq[i] for i in range(num_labels)]
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

print("Class weights:", class_weights)
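
# Worked example with hypothetical counts (not this dataset's real distribution):
# 1000 training examples split 800/200 over two intents gives
# class_weights = [1000/800, 1000/200] = [1.25, 5.0],
# i.e. each minority-class example counts four times as much in the loss.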

from torch import nn
from transformers import AlbertModel, AlbertPreTrainedModel

# AutoModelForSequenceClassification is a loader/factory and cannot be subclassed,
# and ckiplab/albert-tiny-chinese is an ALBERT checkpoint, so the custom head is
# built on AlbertPreTrainedModel. Naming the backbone attribute `albert` lets
# from_pretrained() match the checkpoint's weight names.
class CustomModel(AlbertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.albert = AlbertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Class-weighted loss; the weight tensor is a registered buffer of the
        # loss module, so it moves to the GPU together with the model.
        self.loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)
        # Initialize the newly added classifier head.
        self.post_init()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                labels=None, **kwargs):
        outputs = self.albert(input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids)
        pooled_output = outputs[1]  # pooled [CLS] representation
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)

        # Trainer picks the loss out of a dict keyed by "loss".
        return {"loss": loss, "logits": logits}

# Load pretrained ALBERT weights; the classification head is freshly initialized.
model = CustomModel.from_pretrained(model_checkpoint, num_labels=num_labels)
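
# Alternative (a sketch, not what this script uses): keep the stock
# AutoModelForSequenceClassification and apply the class weights by
# overriding Trainer.compute_loss instead of subclassing the model.
#
# from transformers import AutoModelForSequenceClassification
#
# class WeightedLossTrainer(Trainer):
#     def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
#         labels = inputs.pop("labels")
#         outputs = model(**inputs)
#         loss_fct = nn.CrossEntropyLoss(
#             weight=class_weights_tensor.to(outputs.logits.device))
#         loss = loss_fct(outputs.logits, labels)
#         return (loss, outputs) if return_outputs else loss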

# Evaluation metrics via the evaluate library; macro averaging treats every
# intent class equally regardless of its frequency.
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
        "precision": precision.compute(predictions=predictions, references=labels, average="macro")["precision"],
        "recall": recall.compute(predictions=predictions, references=labels, average="macro")["recall"],
        "f1": f1.compute(predictions=predictions, references=labels, average="macro")["f1"],
    }

# Training configuration. push_to_hub=True requires `huggingface-cli login`
# (or HF_TOKEN) beforehand; only the two most recent checkpoints are kept.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=100,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    push_to_hub=True,
    hub_model_id="Luigi/albert-tiny-chinese-dinercall-intent",
    hub_private_repo=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run fine-tuning (evaluates and checkpoints once per epoch).
trainer.train()

# Save the final model and tokenizer locally.
model.save_pretrained("./final_model")
tokenizer.save_pretrained("./final_model")
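
# Quick inference check (a sketch: the sample utterance is made up; the saved
# weights follow the standard ALBERT layout, so a text-classification pipeline
# can load the directory directly).
from transformers import pipeline

classifier = pipeline("text-classification", model="./final_model", tokenizer="./final_model")
print(classifier("你好,我想訂今晚七點兩位"))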