# Adapters_board/med/train.py
from datasets import load_dataset
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer
import torch
from peft import LoraConfig, get_peft_model
import transformers
from datetime import datetime
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # GPU 0 = RTX 3090, GPU 1 = RTX 2080
def apply_chat_template(example):
# Define the messages for the system, user, and assistant
messages = [
{
"role": "system",
"content": "You are a chess grandmaster specializing in finding checkmate moves in any chess position."
},
{
"role": "user",
"content": f"Given the following chessboard, identify the move that delivers checkmate:\n\n{example['board']}\n\n"
},
{
"role": "assistant",
"content": f"The move to achieve checkmate is: {example['mate']}"
}
]
    # Concatenate the message contents into a single training string
    # (no chat-template special tokens are inserted; spacing is handled manually)
formatted_text = ""
for msg in messages:
formatted_text += f"{msg['content']} "
example["text"] = formatted_text.strip() # Remove trailing spaces
return example
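# Example of the resulting "text" field (hypothetical board and mate values, for illustration only):
#   apply_chat_template({"board": "<FEN or ASCII board>", "mate": "Qh5#"})["text"]
#   -> "You are a chess grandmaster ... identify the move that delivers checkmate:
#       <FEN or ASCII board> ... The move to achieve checkmate is: Qh5#"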
def main():
# Define the local paths to your CSV files
data_files = {
'train': '/home/luciano/Documents/Tesis Ezequiel/Tesis/data_boards/high_train.csv',
'test': '/home/luciano/Documents/Tesis Ezequiel/Tesis/data_boards/high_test.csv',
}
# Load the dataset from local CSV files
dataset = load_dataset(
'csv',
data_files=data_files,
delimiter=',', # Specify the delimiter for CSV
usecols=['board', 'mate'], # Load only the required columns
on_bad_lines='skip', # Skip bad lines that cause parsing errors
)
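    # Note: the 'csv' builder forwards extra keyword arguments such as usecols and
    # on_bad_lines to pandas.read_csv under the hood.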
    # Use the full train and test splits
    # (call .select(range(n)) on either split here for a quick demonstration run)
train_dataset = dataset['train']
eval_dataset = dataset['test']
print('Train Dataset:', train_dataset, '\nTest Dataset:', eval_dataset)
# Apply the chat template
train_dataset = train_dataset.map(
apply_chat_template,
num_proc=2,
#remove_columns=['board', 'mate']
)
eval_dataset = eval_dataset.map(
apply_chat_template,
num_proc=2,
#remove_columns=['board', 'mate'],
desc="Applying chat template"
)
# Inspect the first example after applying the chat template
print("\nFirst Training Example Text:\n", train_dataset[0]['text'])
# Configure quantization
quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
)
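    # Base weights are stored in 4-bit NF4 and dequantized to bfloat16 for each forward pass,
    # the usual QLoRA-style setup for fitting a 7B model on a single consumer GPU.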
model_id = 'mistralai/Mistral-7B-Instruct-v0.3'
# Load the model
model = AutoModelForCausalLM.from_pretrained(
model_id,
attn_implementation='eager',
trust_remote_code=True,
quantization_config=quantization_config,
device_map="auto"
)
print("Model is loaded on device:", next(model.parameters()).device) # Should return cuda:0 if loaded onto GPU
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
model_id,
padding_side="right", # Changed to 'right' to align with our padding strategy
        use_fast=False,  # use the slow (SentencePiece) tokenizer for this checkpoint
)
tokenizer.pad_token = tokenizer.eos_token
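    # Note: pad and EOS now share the same token id, so any collator that masks pad tokens
    # would also mask genuine EOS tokens; labels are therefore built explicitly below.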
# Verify tokenizer special tokens
print("\nTokenizer Special Tokens:")
print("EOS Token:", tokenizer.eos_token)
print("BOS Token:", tokenizer.bos_token)
print("PAD Token:", tokenizer.pad_token)
def generate_and_tokenize_prompt(data_point):
# Define the prompt and the expected response
prompt = (
"You are a chess grandmaster specializing in finding checkmate moves in any chess position. "
"Given the following chessboard, identify the move that delivers checkmate:\n\n"
f"{data_point['board']}\n\n"
)
response = f"The move to achieve checkmate is: {data_point['mate']}"
# Tokenize prompt and response together
tokenized = tokenizer(
prompt + response,
padding='max_length',
truncation=True,
max_length=200,
return_tensors='pt',
)
input_ids = tokenized['input_ids'][0].tolist()
attention_mask = tokenized['attention_mask'][0].tolist()
        # The prompt/response boundary is located at the token level below
        # Tokenize the prompt alone to find how many tokens it occupies
        prompt_tokenized = tokenizer(
            prompt,
            add_special_tokens=True,  # include the BOS token, matching the full-sequence tokenization above
            return_tensors='pt'
        )
        prompt_length = prompt_tokenized['input_ids'].shape[1]
        # Create labels: mask the prompt tokens (including BOS) with -100 so that the loss
        # is computed only on the response
        labels = [-100] * prompt_length + input_ids[prompt_length:]
        # Also mask the padding positions (attention_mask == 0) so they do not contribute to the loss
        labels = [lab if mask == 1 else -100 for lab, mask in zip(labels, attention_mask)]
# If the total length is less than max_length, pad the remaining labels with -100
if len(labels) < 200:
labels += [-100] * (200 - len(labels))
else:
labels = labels[:200]
# Ensure input_ids and labels are exactly 200 tokens
input_ids = input_ids[:200]
attention_mask = attention_mask[:200]
labels = labels[:200]
""" # Debug prints to verify correctness
print("\n--- Tokenization Debug ---")
print("Prompt Text:\n", prompt)
print("Response Text:\n", response)
print("Prompt Token IDs:", prompt_tokenized['input_ids'][0].tolist())
print("Response Token IDs:", input_ids[prompt_length:])
print("Combined Input IDs:", input_ids)
print("Combined Attention Mask:", attention_mask)
print("Combined Labels:", labels)
print("Decoded Input IDs:\n", tokenizer.decode(input_ids, skip_special_tokens=False))
print("--- End of Debug ---\n")"""
return {
'input_ids': input_ids,
'attention_mask': attention_mask,
'labels': labels
}
    # Thin wrapper so the datasets.map calls below stay readable
def generate_and_tokenize_prompt_wrapper(x):
return generate_and_tokenize_prompt(x)
# Tokenize the datasets
tokenized_train_dataset = train_dataset.map(
generate_and_tokenize_prompt_wrapper,
remove_columns=['text'],
batched=False,
)
tokenized_val_dataset = eval_dataset.map(
generate_and_tokenize_prompt_wrapper,
remove_columns=['text'],
batched=False,
)
# Inspect a sample from the tokenized training dataset
sample = tokenized_train_dataset[0]
print("\n--- Tokenized Sample ---")
print("Input IDs:", sample['input_ids'])
print("Attention Mask:", sample['attention_mask'])
print("Labels:", sample['labels'])
print("Decoded Input IDs:\n", tokenizer.decode(sample['input_ids'], skip_special_tokens=False))
print("--- End of Sample ---\n")
# Set up LoRA
lora_config = LoraConfig(
r=64,
lora_alpha=16,
lora_dropout=0.1,
bias="none",
task_type="CAUSAL_LM",
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
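    # Wrap the base model so that only the LoRA adapters (rank-64 updates on the attention
    # projections, scaled by alpha/r = 16/64) are trainable; the 4-bit base weights stay frozen.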
model = get_peft_model(model, lora_config)
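    # Optional: report how many parameters the adapters add relative to the frozen base model
    model.print_trainable_parameters()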
project = "tesis"
base_model_name = "med"
run_name = f"{base_model_name}-{project}"
output_dir = f"./{run_name}"
# Define TrainingArguments
training_args = transformers.TrainingArguments(
output_dir=output_dir,
max_grad_norm=1.0, # Clip gradients to prevent exploding gradients
warmup_steps=100,
num_train_epochs=1, # Adjust as needed
        per_device_train_batch_size=11,  # batch size used on the RTX 3090 (24 GB)
        per_device_eval_batch_size=10,   # eval batch size used on the RTX 3090
gradient_accumulation_steps=4, # To simulate a larger batch size
        evaluation_strategy="epoch",
        eval_steps=50,  # only used when evaluation_strategy="steps"
save_steps=1000, # Adjust based on dataset size
logging_steps=10, # More frequent logging for debugging
learning_rate=1e-5,
fp16=True,
logging_dir=r"/home/luciano/Documents/Tesis Ezequiel/Tesis/med/logs_med",
        report_to="tensorboard",  # log metrics to TensorBoard; switch to "wandb" if preferred
run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",
)
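    # Effective batch size per optimizer step: 11 (per device) x 4 (gradient accumulation) = 44.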
# Initialize the Trainer
trainer = transformers.Trainer(
model=model,
train_dataset=tokenized_train_dataset,
eval_dataset=tokenized_val_dataset,
args=training_args,
        # The examples already carry fixed-length, prompt-masked labels, so use the default
        # collator; DataCollatorForLanguageModeling(mlm=False) would rebuild the labels from
        # input_ids and discard the prompt masking.
        data_collator=transformers.default_data_collator,
)
    # Disable the KV cache during training (not needed and it triggers warnings)
    model.config.use_cache = False
    # Start training, resuming from the saved checkpoint
    # (call trainer.train() with no arguments to train from scratch instead)
    trainer.train(resume_from_checkpoint=r'/home/luciano/Documents/Tesis Ezequiel/Tesis/med/med_checkpoint')
    # Save the LoRA adapter and tokenizer
    trainer.save_model("./fine-tuned-model_high")
    tokenizer.save_pretrained("./fine-tuned-model_high")
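    # To reuse the adapter later (sketch, assuming the same base model and quantization config):
    #   from peft import PeftModel
    #   base = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config, device_map="auto")
    #   model = PeftModel.from_pretrained(base, "./fine-tuned-model_high")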
if __name__ == "__main__":
main()