# ==============================
# Fine-tune ALBERT (ckiplab/albert-tiny-chinese) for sequence classification
# ==============================
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from huggingface_hub import HfApi, HfFolder
# Log in to Hugging Face
hf_token = os.environ["HF_TOKEN"]  # Hugging Face access token from the environment
HfFolder.save_token(hf_token)
# push_to_hub_model_id = "picard47at/tuned-albert-tiny"  # alternative repo id

push_to_hub_model_id = "picard47at/tunned_albert_model2"
# 1. Load the dataset
#dataset_name = "picard47at/dataset2"
dataset_name = "Luigi/dinercall-intent"
try:
    dataset = load_dataset(dataset_name)
    print(f"Dataset '{dataset_name}' loaded successfully.")
    print(dataset)
except Exception as e:
    print(f"Error loading dataset '{dataset_name}': {e}")
    exit()

# Ensure the dataset has 'train' and optionally 'validation' splits
if 'train' not in dataset:
    print("Error: The dataset must contain a 'train' split.")
    exit()

# If a validation split doesn't exist, create one
if 'validation' not in dataset:
    print("Warning: The dataset does not have a 'validation' split. Creating one from the training data.")
    dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)  # fixed seed for a reproducible split
    dataset['validation'] = dataset['test']
    del dataset['test']
    print(dataset)

# Assuming your dataset has a 'text' column for the input and a 'label' column for the target
text_column = "text"  # Adjust if your text column has a different name
label_column = "label" # Adjust if your label column has a different name
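
# Optional sanity check (sketch): the label column is assumed to be a
# ClassLabel feature (as relied on below for num_labels), so its string
# names can be printed directly.
label_names = dataset['train'].features[label_column].names
print(f"Labels ({len(label_names)}): {label_names}")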

# 2. Load the tokenizer and model
checkpoint = "ckiplab/albert-tiny-chinese"
try:
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(dataset['train'].features[label_column].names))
    print(f"Tokenizer and model '{checkpoint}' loaded successfully.")
except Exception as e:
    print(f"Error loading tokenizer or model '{checkpoint}': {e}")
    exit()

# 3. Preprocess the dataset
def tokenize_function(examples):
    # Truncate to the model's max length; padding is applied per batch by the
    # Trainer's default DataCollatorWithPadding (used when a tokenizer is passed).
    return tokenizer(examples[text_column], truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
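
# Optional sanity check (sketch): inspect the first tokenized example to
# confirm input_ids and attention_mask were added alongside the original columns.
sample = tokenized_datasets["train"][0]
print({k: sample[k] for k in ("input_ids", "attention_mask")})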

# 4. Define training arguments
output_dir = "./albert-tiny-chinese-finetuned2"
batch_size = 16
num_epochs = 100
logging_steps = max(1, len(tokenized_datasets["train"]) // (5 * batch_size))  # log roughly 5 times per epoch; floor of 1 guards small datasets
#save_steps = logging_steps * 2

save_steps = logging_steps # Save at every logging step
eval_steps = logging_steps
'''
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="steps",
    logging_steps=logging_steps,
    save_steps=save_steps,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss", # Can also use "eval_f1" if you adjust compute_metrics
    push_to_hub=False,
)'''

"""
The error message indicates that load_best_model_at_end requires the evaluation_strategy and save_strategy to have the same value. In the original code, evaluation_strategy was set to "epoch" while save_strategy was set to "steps".

To fix this, I've made the following changes in the Canvas:

Changed evaluation_strategy from "epoch" to "steps".
Set save_steps to logging_steps to ensure a save happens at the same frequency as evaluation.
Added eval_steps and set it to logging_steps to explicitly control the evaluation frequency.
"""
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_steps=logging_steps,
    save_steps=save_steps,
    eval_steps=eval_steps,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    push_to_hub=True,
    hub_model_id=push_to_hub_model_id,
    save_total_limit=1,  # keep at most one checkpoint on disk (the best one is also retained with load_best_model_at_end=True)
)
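
# Optional (an assumption, not in the original script): with num_epochs=100 and
# load_best_model_at_end=True, an EarlyStoppingCallback can stop training once
# eval_loss stops improving instead of running all 100 epochs. Minimal sketch:
from transformers import EarlyStoppingCallback
early_stopping = EarlyStoppingCallback(early_stopping_patience=5)  # stop after 5 evaluations without improvement
# To enable it, pass callbacks=[early_stopping] when constructing the Trainer below.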
# 5. Define a function to compute metrics
def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=-1)
    labels = eval_pred.label_ids
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='macro')
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }
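
# Optional self-test (sketch, not part of the training flow): run dummy logits
# through compute_metrics to confirm the returned metric names and values.
class _DummyEvalPred:  # hypothetical stand-in for transformers.EvalPrediction
    predictions = np.array([[2.0, -1.0], [0.5, 1.5], [1.0, 0.0]])
    label_ids = np.array([0, 1, 1])
print(compute_metrics(_DummyEvalPred()))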

# 6. Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# 7. Train the model
print("Starting training...")
trainer.train()
print("Training finished!")

# 8. Evaluate the model
print("Evaluating the model...")
evaluation_results = trainer.evaluate()
print(evaluation_results)

# 9. Save the fine-tuned model
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Fine-tuned model and tokenizer saved to '{output_dir}'.")

# 10. Push to Hub

trainer.push_to_hub()
print(f"Model pushed to Hugging Face Hub: {push_to_hub_model_id}")