# BERT-training / traindata.py
# picard.tseng
# First commit:
# 050259a
# ==============================
# Fine-tune an ALBERT (ckiplab/albert-tiny-chinese) sequence classifier
# ==============================
import inspect
import os

import numpy as np
from datasets import load_dataset
from huggingface_hub import HfApi, HfFolder
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
# Log in to Hugging Face.
# The access token is read from HF_TOKEN first and falls back to
# TOGETHER_API_KEY, the variable this script originally read, so existing
# environments keep working.
hf_token = os.environ.get("HF_TOKEN") or os.environ.get("TOGETHER_API_KEY")
if not hf_token:
    # Fail with an actionable message instead of a bare KeyError.
    raise SystemExit(
        "Error: set HF_TOKEN (or TOGETHER_API_KEY) to a Hugging Face access token."
    )
HfFolder.save_token(hf_token)

# Hub repository that receives the fine-tuned model.
# Previous target: "picard47at/tuned-albert-tiny"
push_to_hub_model_id = "picard47at/tunned_albert_model2"
# 1. Load the dataset
# Previous dataset: "picard47at/dataset2"
dataset_name = "Luigi/dinercall-intent"
try:
    dataset = load_dataset(dataset_name)
except Exception as e:
    print(f"Error loading dataset '{dataset_name}': {e}")
    # Exit with a non-zero status so shells and CI can detect the failure;
    # the bare `exit()` helper reported success (status 0).
    raise SystemExit(1) from e
print(f"Dataset '{dataset_name}' loaded successfully.")
print(dataset)
# Ensure the dataset has 'train' and optionally 'validation' splits
if 'train' not in dataset:
    print("Error: The dataset must contain a 'train' split.")
    # Non-zero status: a missing train split is a failure, not a clean exit.
    raise SystemExit(1)

# If a validation split doesn't exist, carve 10% out of the training data.
# Only the resulting 'train' and 'validation' splits are kept afterwards.
if 'validation' not in dataset:
    print("Warning: The dataset does not have a 'validation' split. Creating one from the training data.")
    dataset = dataset['train'].train_test_split(test_size=0.1)
    # train_test_split names the held-out part 'test'; expose it as 'validation'.
    dataset['validation'] = dataset['test']
    del dataset['test']
    print(dataset)
# Column names used below: the input text and the target label.
# Adjust both if the dataset uses different column names.
text_column, label_column = "text", "label"
# 2. Load the tokenizer and model
checkpoint = "ckiplab/albert-tiny-chinese"
try:
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    # The classification head gets one output per class name declared by the
    # label feature (the label column must expose `.names`).
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=len(dataset['train'].features[label_column].names),
    )
except Exception as e:
    print(f"Error loading tokenizer or model '{checkpoint}': {e}")
    # Exit with a non-zero status so the failure is visible to the caller;
    # the bare `exit()` helper reported success (status 0).
    raise SystemExit(1) from e
print(f"Tokenizer and model '{checkpoint}' loaded successfully.")
# 3. Preprocess the dataset
def tokenize_function(examples):
    """Tokenize the text column of a batch with truncation enabled."""
    batch_texts = examples[text_column]
    return tokenizer(batch_texts, truncation=True)


# Apply the tokenizer to every split, one batch of rows at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# 4. Define training arguments
output_dir = "./albert-tiny-chinese-finetuned2"
batch_size = 16
num_epochs = 100

# Interval (in optimizer steps) between log entries: about five per epoch
# on a single device without gradient accumulation.
# Clamp to at least 1: with fewer than 5 * batch_size training rows the
# integer division yields 0, which is not a usable step interval.
logging_steps = max(1, len(tokenized_datasets["train"]) // (5 * batch_size))

# Save and evaluate on the same schedule as logging so that checkpoints and
# evaluations line up (alternative tried earlier: logging_steps * 2).
save_steps = logging_steps
eval_steps = logging_steps
# Earlier TrainingArguments draft, kept for reference inside an inert string
# literal (it is never executed). It pairs evaluation_strategy="epoch" with
# save_strategy="steps".
'''
training_args = TrainingArguments(
output_dir=output_dir,
learning_rate=2e-5,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
num_train_epochs=num_epochs,
weight_decay=0.01,
evaluation_strategy="epoch",
save_strategy="steps",
logging_steps=logging_steps,
save_steps=save_steps,
load_best_model_at_end=True,
metric_for_best_model="eval_loss", # Can also use "eval_f1" if you adjust compute_metrics
push_to_hub=False,
)'''
"""
Why the earlier configuration was replaced: load_best_model_at_end requires evaluation_strategy and save_strategy to have the same value, but the original configuration set evaluation_strategy to "epoch" and save_strategy to "steps", which raised an error.
The fix consists of the following changes:
- evaluation_strategy was changed from "epoch" to "steps".
- save_steps was set to logging_steps so that a checkpoint is saved at the same frequency as evaluation.
- eval_steps was added and set to logging_steps to control the evaluation frequency explicitly.
"""
# Second TrainingArguments draft, kept for reference inside an inert string
# literal (it is never executed). It uses "steps" for both evaluation and
# saving, with push_to_hub disabled.
'''
training_args = TrainingArguments(
output_dir=output_dir,
learning_rate=2e-5,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
num_train_epochs=num_epochs,
weight_decay=0.01,
evaluation_strategy="steps", # Change to "steps" to match save_strategy
save_strategy="steps",
logging_steps=logging_steps,
save_steps=save_steps,
eval_steps=eval_steps,
load_best_model_at_end=True,
metric_for_best_model="eval_loss", # Can also use "eval_f1" if you adjust compute_metrics
push_to_hub=False,
)
'''
# Newer transformers releases name the evaluation-schedule argument
# `eval_strategy` and no longer accept the older `evaluation_strategy`
# spelling. Use whichever name the installed TrainingArguments declares so
# the script runs on both old and new versions.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    # Evaluation and saving share the "steps" strategy and the same interval,
    # as required when load_best_model_at_end is enabled.
    save_strategy="steps",
    logging_steps=logging_steps,
    save_steps=save_steps,
    eval_steps=eval_steps,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    push_to_hub=True,
    hub_model_id=push_to_hub_model_id,
    save_total_limit=1,  # cap the number of checkpoints kept on disk
    **{_eval_strategy_key: "steps"},
)
# 5. Define a function to compute metrics
def compute_metrics(eval_pred):
    """Return accuracy and macro-averaged precision, recall and F1.

    ``eval_pred`` must expose ``predictions`` (per-class scores, reduced to a
    class index with argmax over the last axis) and ``label_ids`` (the
    reference labels).
    """
    predicted = np.argmax(eval_pred.predictions, axis=-1)
    gold = eval_pred.label_ids
    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        gold, predicted, average='macro'
    )
    return dict(
        accuracy=accuracy_score(gold, predicted),
        precision=macro_precision,
        recall=macro_recall,
        f1=macro_f1,
    )
# 6. Create the Trainer
# NOTE(review): recent transformers releases appear to deprecate `tokenizer=`
# in favour of `processing_class=` - confirm against the installed version.
_trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_datasets["train"],
    "eval_dataset": tokenized_datasets["validation"],
    "tokenizer": tokenizer,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**_trainer_kwargs)
# 7. Train the model
# Runs the full fine-tuning loop configured by training_args; progress
# messages are printed before and after.
print("Starting training...")
trainer.train()
print("Training finished!")
# 8. Evaluate the model
# Evaluates with the Trainer's configured eval dataset (the 'validation'
# split) and prints the returned results.
print("Evaluating the model...")
evaluation_results = trainer.evaluate()
print(evaluation_results)
# 9. Save the fine-tuned model
# Writes the model and the tokenizer into the same local directory.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Fine-tuned model and tokenizer saved to '{output_dir}'.")
# 10. Push to Hub
# Uploads to the repository configured through hub_model_id in training_args.
trainer.push_to_hub()
print(f"Model pushed to Hugging Face Hub: {push_to_hub_model_id}")