Spaces:

picard47at
/

BERT-training

No application file

BERT-training / traindata.py

picard.tseng

First commit:

050259a 8 months ago

5.99 kB

	# ==============================
	# Fine-tune ALBERT (ckiplab/albert-tiny-chinese) for text classification
	# ==============================
	import os
	from datasets import load_dataset
	from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
	import numpy as np
	from sklearn.metrics import precision_recall_fscore_support, accuracy_score
	from huggingface_hub import HfApi, HfFolder
	# Log in to Hugging Face
	# The token is read from the environment; a missing variable raises KeyError here.
	# NOTE(review): the variable is named TOGETHER_API_KEY, yet its value is stored as the
	# Hugging Face token - confirm this secret really holds a Hub token with write access.
	hf_token = os.environ["TOGETHER_API_KEY"]
	HfFolder.save_token(hf_token)
	#push_to_hub_model_id = "picard47at/tuned-albert-tiny" # Add this line

	# Hub repository id, passed to TrainingArguments as hub_model_id further down.
	push_to_hub_model_id = "picard47at/tunned_albert_model2"
	# 1. Load the dataset
	# dataset_name = "picard47at/dataset2"  # earlier dataset, kept for reference
	dataset_name = "Luigi/dinercall-intent"
	try:
	    dataset = load_dataset(dataset_name)
	except Exception as e:
	    # Top-level boundary of the script: report the failure and stop with a
	    # non-zero exit status so the caller can tell that the run failed.
	    print(f"Error loading dataset '{dataset_name}': {e}")
	    raise SystemExit(1)
	else:
	    print(f"Dataset '{dataset_name}' loaded successfully.")
	    print(dataset)

	# Everything below fine-tunes on dataset['train'], so it must exist.
	if "train" not in dataset:
	    print("Error: The dataset must contain a 'train' split.")
	    raise SystemExit(1)

	# If a validation split doesn't exist, hold out 10% of the training data.
	if "validation" not in dataset:
	    print("Warning: The dataset does not have a 'validation' split. Creating one from the training data.")
	    # The split result exposes 'train' and 'test'; rename 'test' so later code
	    # can always read dataset['validation']. Rebinding `dataset` here also
	    # drops any other split the original dataset had.
	    dataset = dataset["train"].train_test_split(test_size=0.1)
	    dataset["validation"] = dataset.pop("test")
	    print(dataset)

	# Column names used below; adjust them if the dataset names its columns differently.
	text_column = "text"  # column holding the input text
	label_column = "label"  # column holding the target class

	# 2. Load the tokenizer and model
	checkpoint = "ckiplab/albert-tiny-chinese"

	# The classification head is sized from the label feature's class names.
	# Check this up front so a missing or unsuitable label column is reported as
	# a dataset problem instead of being mislabelled as a model-loading error.
	label_feature = dataset["train"].features.get(label_column)
	label_names = getattr(label_feature, "names", None)
	if label_names is None:
	    print(f"Error: column '{label_column}' is missing or has no class names; cannot determine the number of labels.")
	    raise SystemExit(1)

	try:
	    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
	    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(label_names))
	except Exception as e:
	    # Top-level boundary of the script: report and stop with a non-zero status.
	    print(f"Error loading tokenizer or model '{checkpoint}': {e}")
	    raise SystemExit(1)
	else:
	    print(f"Tokenizer and model '{checkpoint}' loaded successfully.")

	# 3. Preprocess the dataset
	def tokenize_function(examples):
	    """Tokenize the text column of ``examples`` with truncation enabled.

	    Args:
	        examples: A batch from the dataset, indexable by column name.

	    Returns:
	        Whatever the tokenizer returns for the batch's text column.
	    """
	    batch_texts = examples[text_column]
	    return tokenizer(batch_texts, truncation=True)


	# Run the tokenizer over the dataset in batches.
	tokenized_datasets = dataset.map(tokenize_function, batched=True)

	# 4. Define training arguments
	output_dir = "./albert-tiny-chinese-finetuned2"
	batch_size = 16
	num_epochs = 100

	# Log about five times per pass over the training split (this assumes one
	# optimizer step per `batch_size` examples - TODO confirm for multi-device runs).
	# Clamp to at least 1: with fewer than 5 * batch_size training examples the
	# integer division yields 0, which is not a usable step interval.
	logging_steps = max(1, len(tokenized_datasets["train"]) // (5 * batch_size))

	# Save and evaluate on the same schedule as logging, so checkpoints and
	# evaluations always line up.
	save_steps = logging_steps
	eval_steps = logging_steps
	# load_best_model_at_end requires the evaluation strategy and the save strategy
	# to have the same value. An earlier configuration evaluated per "epoch" while
	# saving per "steps" and was rejected for that reason, so both are now "steps"
	# and eval_steps / save_steps share one interval.
	# NOTE(review): newer transformers releases may expect `eval_strategy` instead of
	# `evaluation_strategy` - confirm against the installed version.
	training_args = TrainingArguments(
	    output_dir=output_dir,
	    learning_rate=2e-5,
	    per_device_train_batch_size=batch_size,
	    per_device_eval_batch_size=batch_size,
	    num_train_epochs=num_epochs,
	    weight_decay=0.01,
	    evaluation_strategy="steps",  # must match save_strategy (see above)
	    save_strategy="steps",
	    logging_steps=logging_steps,
	    save_steps=save_steps,
	    eval_steps=eval_steps,
	    load_best_model_at_end=True,
	    metric_for_best_model="eval_loss",  # could be "eval_f1", a key compute_metrics also reports
	    push_to_hub=True,
	    hub_model_id=push_to_hub_model_id,
	    save_total_limit=1,  # cap how many checkpoints are kept in output_dir
	)
	# 5. Define a function to compute metrics
	def compute_metrics(eval_pred):
	    """Compute accuracy and macro-averaged precision, recall and F1.

	    Args:
	        eval_pred: Evaluation output exposing ``predictions`` (per-class
	            scores with the class axis last) and ``label_ids``.

	    Returns:
	        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
	    """
	    # The predicted class is the index of the highest score on the last axis.
	    predictions = np.argmax(eval_pred.predictions, axis=-1)
	    labels = eval_pred.label_ids
	    accuracy = accuracy_score(labels, predictions)
	    # zero_division=0 keeps the scores of the default behaviour (undefined
	    # precision/recall counts as 0) but silences the warning that would
	    # otherwise be emitted on every evaluation where a class is never predicted.
	    precision, recall, f1, _ = precision_recall_fscore_support(
	        labels, predictions, average="macro", zero_division=0
	    )
	    return {
	        "accuracy": accuracy,
	        "precision": precision,
	        "recall": recall,
	        "f1": f1,
	    }

	# 6. Create the Trainer
	# Wire the model, its tokenizer, the tokenized train/validation splits, the
	# training arguments and the metric function into a single Trainer.
	trainer = Trainer(
	    args=training_args,
	    model=model,
	    tokenizer=tokenizer,
	    compute_metrics=compute_metrics,
	    train_dataset=tokenized_datasets["train"],
	    eval_dataset=tokenized_datasets["validation"],
	)

	# 7. Train the model
	# Runs fine-tuning with the settings held in training_args.
	print("Starting training...")
	trainer.train()
	print("Training finished!")

	# 8. Evaluate the model
	# No dataset is passed here; presumably the eval_dataset given to the Trainer
	# (the validation split) is used - TODO confirm.
	print("Evaluating the model...")
	evaluation_results = trainer.evaluate()
	print(evaluation_results)

	# 9. Save the fine-tuned model
	# Model and tokenizer are written to the same local directory.
	trainer.save_model(output_dir)
	tokenizer.save_pretrained(output_dir)
	print(f"Fine-tuned model and tokenizer saved to '{output_dir}'.")

	# 10. Push to Hub
	# NOTE(review): training_args already sets push_to_hub=True, so uploads may
	# also happen during training/saving - confirm this explicit push is needed.

	trainer.push_to_hub()
	print(f"Model pushed to Hugging Face Hub: {push_to_hub_model_id}")