#!/usr/bin/env python
# fine_tune_taiwan_restaurant_intents.py
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments)
import evaluate
import numpy as np
import torch
from collections import Counter

# Load the dataset from Hugging Face Hub
dataset = load_dataset("Luigi/dinercall-intent")

# Choose a pretrained model checkpoint.
model_checkpoint = "ckiplab/albert-tiny-chinese"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=64)

# Tokenize
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Number of intent labels
num_labels = tokenized_datasets["train"].features["label"].num_classes

# ----------- 🛡️ Compute class weights for imbalanced training set -----------
label_list = tokenized_datasets["train"]["label"]
label_freq = Counter(label_list)
total = len(label_list)

# Inverse frequency as weight: rare classes get proportionally larger weights
class_weights = [total / label_freq[i] for i in range(num_labels)]
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)
print("Class weights:", class_weights)
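# ----------- 🔍 Illustration (not from the original script) -----------
# A toy check of the inverse-frequency scheme above, using a hypothetical
# split of 80 examples of label 0 and 20 of label 1 (total = 100): the
# weights come out as [100/80, 100/20] = [1.25, 5.0], so each example of
# the rarer class contributes four times more to the loss.
assert [100 / f for f in (80, 20)] == [1.25, 5.0]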
# ----------- 🧠 Load model and apply a weighted loss -----------
# Note: the original version subclassed AutoModelForSequenceClassification,
# which cannot be instantiated or subclassed directly (Auto* classes only
# dispatch via from_pretrained), and it wired in BertModel even though the
# checkpoint is ALBERT. Overriding Trainer.compute_loss is the supported
# way to inject a class-weighted CrossEntropyLoss without touching the model.
from torch import nn

class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Move the weights to the same device as the logits before computing the loss
        loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor.to(logits.device))
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Initialize model
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

# ----------- 📊 Metrics (precision, recall, F1, accuracy) -----------
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
        "precision": precision.compute(predictions=predictions, references=labels, average="macro")["precision"],
        "recall": recall.compute(predictions=predictions, references=labels, average="macro")["recall"],
        "f1": f1.compute(predictions=predictions, references=labels, average="macro")["f1"],
    }

# ----------- ⚙️ Training arguments -----------
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=100,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    push_to_hub=True,  # 👈 pushes the model after training (requires a logged-in Hub account)
    hub_model_id="Luigi/albert-tiny-chinese-dinercall-intent",  # optional, use if you want a custom name
    hub_private_repo=False,  # optional, set True if you want a private repo
)

# ----------- 🧪 Trainer -----------
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train!
trainer.train()

# Save final model
model.save_pretrained("./final_model")
tokenizer.save_pretrained("./final_model")
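# ----------- 💡 Usage sketch (not part of the original script) -----------
# A minimal, hedged example of running the saved model for inference,
# assuming the fine-tuned weights were written to ./final_model above.
# The sample utterance is made up for illustration
# ("Hi, I'd like to book a table for two at 7 tonight").
from transformers import pipeline

classifier = pipeline("text-classification", model="./final_model", tokenizer="./final_model")
print(classifier("你好，我想訂位今晚七點兩位"))  # hypothetical caller utterance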