Spaces:
No application file
No application file
File size: 5,991 Bytes
050259a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
# ==============================
# Fine-tuning script (header said "train BART", but the checkpoint below
# is ALBERT-tiny-Chinese for sequence classification)
# ==============================
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from huggingface_hub import HfApi, HfFolder

# Log in to the Hugging Face Hub.
# NOTE(review): the token is read from TOGETHER_API_KEY, which by its name is a
# Together-AI credential, not a Hugging Face token — confirm the env var name.
hf_token = os.environ.get("TOGETHER_API_KEY")
if hf_token is None:
    # Fail early with a clear message instead of an opaque KeyError.
    raise SystemExit("Environment variable 'TOGETHER_API_KEY' (used as HF token) is not set.")
HfFolder.save_token(hf_token)

# Target repository on the Hub for the fine-tuned model.
push_to_hub_model_id = "picard47at/tunned_albert_model2"
# 1. Load the dataset
dataset_name = "Luigi/dinercall-intent"
try:
    dataset = load_dataset(dataset_name)
    print(f"Dataset '{dataset_name}' loaded successfully.")
    print(dataset)
except Exception as e:
    print(f"Error loading dataset '{dataset_name}': {e}")
    exit()

# A 'train' split is mandatory; bail out when it is absent.
if 'train' not in dataset:
    print("Error: The dataset must contain a 'train' split.")
    exit()

# Carve a 10% validation split out of 'train' when none is provided,
# renaming the auto-generated 'test' split to 'validation'.
if 'validation' not in dataset:
    print("Warning: The dataset does not have a 'validation' split. Creating one from the training data.")
    dataset = dataset['train'].train_test_split(test_size=0.1)
    dataset['validation'] = dataset['test']
    del dataset['test']
    print(dataset)

# Column names expected in the dataset (adjust if yours differ).
text_column = "text"
label_column = "label"
# 2. Load the tokenizer and model.
# The classification head size is derived from the dataset's label feature.
checkpoint = "ckiplab/albert-tiny-chinese"
try:
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    label_count = len(dataset['train'].features[label_column].names)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=label_count)
    print(f"Tokenizer and model '{checkpoint}' loaded successfully.")
except Exception as e:
    print(f"Error loading tokenizer or model '{checkpoint}': {e}")
    exit()
# 3. Preprocess the dataset.
def tokenize_function(examples):
    """Tokenize a batch of examples, truncating to the model's max length.

    No padding is applied here; only truncation.
    """
    return tokenizer(examples[text_column], truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)
# 4. Define training arguments
output_dir = "./albert-tiny-chinese-finetuned2"
batch_size = 16
num_epochs = 100

# Log/evaluate/save 5 times per epoch (the original comment said
# "Log every 5 steps", which is not what this formula computes).
# max(1, ...) guards against a value of 0 on very small datasets,
# which TrainingArguments would reject.
logging_steps = max(1, len(tokenized_datasets["train"]) // (5 * batch_size))
save_steps = logging_steps  # save at every logging step
eval_steps = logging_steps  # evaluation must share the save cadence (load_best_model_at_end)
# NOTE: two earlier TrainingArguments drafts (kept here as no-op string
# literals) were removed. The first failed because load_best_model_at_end
# requires evaluation_strategy and save_strategy to have the same value
# ("epoch" vs "steps"); the fix was to use "steps" for both and tie
# save_steps/eval_steps to logging_steps, as the active configuration
# below does.
# The active training configuration. evaluation_strategy and save_strategy
# must match (both "steps") because load_best_model_at_end=True requires
# evaluation and checkpoint saving to happen on the same cadence.
# NOTE(review): recent transformers versions renamed `evaluation_strategy`
# to `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_steps=logging_steps,
    save_steps=save_steps,
    eval_steps=eval_steps,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",  # could also use "eval_f1" from compute_metrics
    push_to_hub=True,
    hub_model_id=push_to_hub_model_id,  # repo id defined near the top of the file
    save_total_limit=1,  # keep only the newest checkpoint on disk
)
# 5. Metrics reported at each evaluation step.
def compute_metrics(eval_pred):
    """Return accuracy plus macro-averaged precision, recall and F1."""
    preds = np.argmax(eval_pred.predictions, axis=-1)
    refs = eval_pred.label_ids
    prec, rec, f1, _ = precision_recall_fscore_support(refs, preds, average='macro')
    return {
        'accuracy': accuracy_score(refs, preds),
        'precision': prec,
        'recall': rec,
        'f1': f1,
    }
# 6. Create the Trainer, wiring together model, config, data and metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # NOTE(review): newer transformers deprecates `tokenizer=` in favor of
    # `processing_class=` — confirm against the installed version.
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
# 7. Train the model (long run: num_epochs is set to 100 above, with
# best-checkpoint tracking via load_best_model_at_end).
print("Starting training...")
trainer.train()
print("Training finished!")
# 8. Evaluate the model on the validation split using compute_metrics.
print("Evaluating the model...")
evaluation_results = trainer.evaluate()
print(evaluation_results)
# 9. Save the fine-tuned model and tokenizer locally.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Fine-tuned model and tokenizer saved to '{output_dir}'.")
# 10. Push the final model to the Hub repo configured in training_args.
trainer.push_to_hub()
print(f"Model pushed to Hugging Face Hub: {push_to_hub_model_id}")
|