Spaces:
No application file
No application file
| # ============================== | |
| # Fine-tune ALBERT (ckiplab/albert-tiny-chinese) for sequence classification | |
| # ============================== | |
import inspect
import os

import numpy as np
from datasets import load_dataset
from huggingface_hub import HfApi, HfFolder
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
# Log in to the Hugging Face Hub.
# This script reads the token from TOGETHER_API_KEY; the conventional
# HF_TOKEN variable is accepted as a fallback so the script also runs in
# environments that only define the standard name.
hf_token = os.environ.get("TOGETHER_API_KEY") or os.environ.get("HF_TOKEN")
if not hf_token:
    # Stop with an actionable message and a non-zero exit status instead of
    # a bare KeyError traceback.
    raise SystemExit(
        "Error: no Hugging Face token found. Set TOGETHER_API_KEY or HF_TOKEN."
    )
HfFolder.save_token(hf_token)

# Hub repository that receives the fine-tuned model.
push_to_hub_model_id = "picard47at/tunned_albert_model2"
# 1. Load the dataset
dataset_name = "Luigi/dinercall-intent"
try:
    # Only the call that can actually fail lives inside the try block.
    dataset = load_dataset(dataset_name)
except Exception as e:
    # Top-level boundary of the script: report the failure and stop with a
    # non-zero status so a wrapper/CI job can tell that the run failed.
    print(f"Error loading dataset '{dataset_name}': {e}")
    raise SystemExit(1)
print(f"Dataset '{dataset_name}' loaded successfully.")
print(dataset)

# A 'train' split is mandatory.
if 'train' not in dataset:
    print("Error: The dataset must contain a 'train' split.")
    raise SystemExit(1)

# If a validation split doesn't exist, hold out 10% of the training data.
if 'validation' not in dataset:
    print("Warning: The dataset does not have a 'validation' split. Creating one from the training data.")
    # A fixed seed keeps the held-out rows identical across runs, so
    # eval_loss values from different runs are comparable.
    dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
    # Rename the generated 'test' split to 'validation'.
    dataset['validation'] = dataset.pop('test')
    print(dataset)

# Column names used by the rest of the script.
text_column = "text"  # Adjust if your text column has a different name
label_column = "label"  # Adjust if your label column has a different name
# 2. Load the tokenizer and model
checkpoint = "ckiplab/albert-tiny-chinese"

# Resolve the class names before touching the model so that a dataset-schema
# problem is reported as such, not as a tokenizer/model loading error.
label_feature = dataset['train'].features.get(label_column)
label_names = getattr(label_feature, "names", None)
if not label_names:
    print(f"Error: column '{label_column}' must be a ClassLabel feature with named classes.")
    raise SystemExit(1)

# NOTE(review): the ckiplab model card is believed to recommend
# BertTokenizerFast over AutoTokenizer -- confirm that AutoTokenizer resolves
# the intended vocabulary for this checkpoint.
try:
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=len(label_names),
        # Store readable class names in the model config so the pushed model
        # reports them instead of generic LABEL_0, LABEL_1, ...
        id2label=dict(enumerate(label_names)),
        label2id={name: idx for idx, name in enumerate(label_names)},
    )
except Exception as e:
    # Top-level boundary: report and stop with a non-zero exit status.
    print(f"Error loading tokenizer or model '{checkpoint}': {e}")
    raise SystemExit(1)
print(f"Tokenizer and model '{checkpoint}' loaded successfully.")
| # 3. Preprocess the dataset | |
def tokenize_function(examples):
    """Tokenize the text column of a batch of examples.

    Args:
        examples: A batch from ``dataset.map(..., batched=True)``; the values
            under ``text_column`` are handed to the tokenizer.

    Returns:
        The tokenizer output for the batch, truncated (``truncation=True``).
    """
    batch_texts = examples[text_column]
    encoded = tokenizer(batch_texts, truncation=True)
    return encoded
# Tokenize every split in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 4. Define training arguments
output_dir = "./albert-tiny-chinese-finetuned2"
batch_size = 16
num_epochs = 100

# Log / evaluate / save about five times per epoch (one epoch is roughly
# len(train) / batch_size optimizer steps on a single device).
# Clamp to at least 1: for a train split smaller than 5 * batch_size the
# integer division yields 0, which TrainingArguments rejects for the
# "steps" strategies.
logging_steps = max(1, len(tokenized_datasets["train"]) // (5 * batch_size))
# Saving and evaluation share the logging interval so that checkpoints and
# evaluations line up.
save_steps = logging_steps
eval_steps = logging_steps
# NOTE: `load_best_model_at_end=True` requires the evaluation strategy and the
# save strategy to have the same value. An earlier revision of this
# configuration mixed evaluation "epoch" with save "steps" and was rejected;
# both are therefore "steps", and `eval_steps` / `save_steps` use the same
# interval so every evaluation has a matching checkpoint.
training_kwargs = dict(
    output_dir=output_dir,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    save_strategy="steps",
    logging_steps=logging_steps,
    save_steps=save_steps,
    eval_steps=eval_steps,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",  # "eval_f1" also works with compute_metrics' 'f1' key
    push_to_hub=True,
    hub_model_id=push_to_hub_model_id,
    save_total_limit=1,  # cap the number of checkpoints kept on disk
)

# Newer transformers releases renamed `evaluation_strategy` to
# `eval_strategy`. Pick whichever keyword the installed version accepts so
# the script runs on both sides of the rename.
_training_arg_names = inspect.signature(TrainingArguments).parameters
if "eval_strategy" in _training_arg_names:
    training_kwargs["eval_strategy"] = "steps"
else:
    training_kwargs["evaluation_strategy"] = "steps"

training_args = TrainingArguments(**training_kwargs)
| # 5. Define a function to compute metrics | |
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision / recall / F1.

    Args:
        eval_pred: Evaluation output exposing ``predictions`` (per-class
            scores; the arg-max over the last axis is the predicted class)
            and ``label_ids`` (the reference class ids).

    Returns:
        dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1'``.
    """
    predictions = np.argmax(eval_pred.predictions, axis=-1)
    labels = eval_pred.label_ids
    accuracy = accuracy_score(labels, predictions)
    # zero_division=0 yields the same values as the default ("warn" also
    # scores an undefined metric as 0) but does not emit an
    # UndefinedMetricWarning at every evaluation when a class is never
    # predicted, which is common early in training.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='macro', zero_division=0
    )
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }
# 6. Create the Trainer
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

# Newer transformers releases deprecate the `tokenizer` keyword of Trainer in
# favour of `processing_class`. Use whichever name the installed version
# accepts so the script works before and after that change.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
if "processing_class" in _trainer_arg_names:
    trainer_kwargs["processing_class"] = tokenizer
else:
    trainer_kwargs["tokenizer"] = tokenizer

trainer = Trainer(**trainer_kwargs)
# 7. Train the model
# The steps below are order-dependent: train, then evaluate, then save
# locally, then push to the Hub.
print("Starting training...")
trainer.train()
print("Training finished!")
# 8. Evaluate the model
# Runs one more evaluation with the trainer and prints the returned metrics.
print("Evaluating the model...")
evaluation_results = trainer.evaluate()
print(evaluation_results)
# 9. Save the fine-tuned model
# Model and tokenizer are both written to `output_dir`.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Fine-tuned model and tokenizer saved to '{output_dir}'.")
# 10. Push to Hub
# The target repository is the `hub_model_id` given to TrainingArguments
# (`push_to_hub_model_id`).
trainer.push_to_hub()
print(f"Model pushed to Hugging Face Hub: {push_to_hub_model_id}")