# ==============================
# Fine-tune ALBERT (ckiplab/albert-tiny-chinese) for sequence classification
# ==============================
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from huggingface_hub import HfApi, HfFolder
# Log in to Hugging Face
hf_token = os.environ["HF_TOKEN"]  # Hugging Face access token from the environment
HfFolder.save_token(hf_token)
# push_to_hub_model_id = "picard47at/tuned-albert-tiny"  # alternative repo id

push_to_hub_model_id = "picard47at/tunned_albert_model2"
# 1. Load the dataset
#dataset_name = "picard47at/dataset2"
dataset_name = "Luigi/dinercall-intent"
try:
    dataset = load_dataset(dataset_name)
    print(f"Dataset '{dataset_name}' loaded successfully.")
    print(dataset)
except Exception as e:
    print(f"Error loading dataset '{dataset_name}': {e}")
    exit()

# Ensure the dataset has 'train' and optionally 'validation' splits
if 'train' not in dataset:
    print("Error: The dataset must contain a 'train' split.")
    exit()

# If a validation split doesn't exist, create one
if 'validation' not in dataset:
    print("Warning: The dataset does not have a 'validation' split. Creating one from the training data.")
    dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)  # fixed seed for a reproducible split
    dataset['validation'] = dataset['test']
    del dataset['test']
    print(dataset)

# Assuming your dataset has a 'text' column for the input and a 'label' column for the target
text_column = "text"  # Adjust if your text column has a different name
label_column = "label" # Adjust if your label column has a different name
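
# Optional sanity check (sketch): the label column is assumed to be a
# ClassLabel feature (as relied on below for num_labels), so its string
# names can be printed directly.
label_names = dataset['train'].features[label_column].names
print(f"Labels ({len(label_names)}): {label_names}")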

# 2. Load the tokenizer and model
checkpoint = "ckiplab/albert-tiny-chinese"
try:
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(dataset['train'].features[label_column].names))
    print(f"Tokenizer and model '{checkpoint}' loaded successfully.")
except Exception as e:
    print(f"Error loading tokenizer or model '{checkpoint}': {e}")
    exit()

# 3. Preprocess the dataset
def tokenize_function(examples):
    # Truncate to the model's max length; padding is applied per batch by the
    # Trainer's default DataCollatorWithPadding (used when a tokenizer is passed).
    return tokenizer(examples[text_column], truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
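
# Optional sanity check (sketch): inspect the first tokenized example to
# confirm input_ids and attention_mask were added alongside the original columns.
sample = tokenized_datasets["train"][0]
print({k: sample[k] for k in ("input_ids", "attention_mask")})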

# 4. Define training arguments
output_dir = "./albert-tiny-chinese-finetuned2"
batch_size = 16
num_epochs = 100
logging_steps = max(1, len(tokenized_datasets["train"]) // (5 * batch_size))  # log roughly 5 times per epoch; floor of 1 guards small datasets
#save_steps = logging_steps * 2

save_steps = logging_steps # Save at every logging step
eval_steps = logging_steps
'''
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="steps",
    logging_steps=logging_steps,
    save_steps=save_steps,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss", # Can also use "eval_f1" if you adjust compute_metrics
    push_to_hub=False,
)'''

"""
The error message indicates that load_best_model_at_end requires the evaluation_strategy and save_strategy to have the same value. In the original code, evaluation_strategy was set to "epoch" while save_strategy was set to "steps".

To fix this, I've made the following changes in the Canvas:

Changed evaluation_strategy from "epoch" to "steps".
Set save_steps to logging_steps to ensure a save happens at the same frequency as evaluation.
Added eval_steps and set it to logging_steps to explicitly control the evaluation frequency.
"""
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_steps=logging_steps,
    save_steps=save_steps,
    eval_steps=eval_steps,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    push_to_hub=True,
    hub_model_id=push_to_hub_model_id,
    save_total_limit=1,  # keep at most one checkpoint on disk (the best one is also retained with load_best_model_at_end=True)
)
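
# Optional (an assumption, not in the original script): with num_epochs=100 and
# load_best_model_at_end=True, an EarlyStoppingCallback can stop training once
# eval_loss stops improving instead of running all 100 epochs. Minimal sketch:
from transformers import EarlyStoppingCallback
early_stopping = EarlyStoppingCallback(early_stopping_patience=5)  # stop after 5 evaluations without improvement
# To enable it, pass callbacks=[early_stopping] when constructing the Trainer below.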
# 5. Define a function to compute metrics
def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=-1)
    labels = eval_pred.label_ids
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='macro')
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }
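
# Optional self-test (sketch, not part of the training flow): run dummy logits
# through compute_metrics to confirm the returned metric names and values.
class _DummyEvalPred:  # hypothetical stand-in for transformers.EvalPrediction
    predictions = np.array([[2.0, -1.0], [0.5, 1.5], [1.0, 0.0]])
    label_ids = np.array([0, 1, 1])
print(compute_metrics(_DummyEvalPred()))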

# 6. Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# 7. Train the model
print("Starting training...")
trainer.train()
print("Training finished!")

# 8. Evaluate the model
print("Evaluating the model...")
evaluation_results = trainer.evaluate()
print(evaluation_results)

# 9. Save the fine-tuned model
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Fine-tuned model and tokenizer saved to '{output_dir}'.")

# 10. Push to Hub

trainer.push_to_hub()
print(f"Model pushed to Hugging Face Hub: {push_to_hub_model_id}")