# FIN_LLM / training.py
# Author: Robert Castagna
# Last change: "update hf page" (commit da0e5e8)
import json,math,datetime
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoTokenizer, AutoModelForCausalLM, get_linear_schedule_with_warmup
from torch.optim import AdamW
class QuizletDataset(Dataset):
    """Prompt/answer pairs loaded from a JSON file.

    The file must hold a list of records, each with a 'prompt' key and a
    'messages' list whose second entry carries the answer under 'content'.
    """

    def __init__(self, json_file):
        with open(json_file, 'r') as source:
            self.data = json.load(source)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Samplers may hand over tensor indices; normalize them first.
        if torch.is_tensor(idx):
            idx = idx.tolist()
        record = self.data[idx]
        return {
            'prompt': record['prompt'],
            # Answer is assumed to be the second message of the record.
            'answer': record['messages'][1]['content'],
        }
def evaluate_model(model, val_dataloader, device):
    """Compute the average cross-entropy loss and perplexity on a dataloader.

    Uses the module-level `tokenizer` to encode each batch's prompts.
    Returns (avg_loss, perplexity); perplexity = exp(avg_loss).
    """
    model.eval()
    total_eval_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            inputs = tokenizer(batch['prompt'], return_tensors="pt", padding=True, truncation=True, max_length=512)
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            # HF causal-LM models shift labels internally when `labels` is
            # passed, so labels must simply mirror input_ids. The previous
            # manual shift (labels[:, :-1] = input_ids[:, 1:]) combined with
            # the model's internal shift scored the wrong (doubly-shifted)
            # targets. Padding positions are excluded from the loss via -100.
            labels = input_ids.clone()
            labels[attention_mask == 0] = -100
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_eval_loss += outputs.loss.item()
    avg_eval_loss = total_eval_loss / len(val_dataloader)
    perplexity = math.exp(avg_eval_loss)
    return avg_eval_loss, perplexity
def evaluate_training(model, train_loader, device):
    """Compute the average cross-entropy loss over the training dataloader.

    Uses the module-level `tokenizer` to encode each batch's prompts.
    Returns the mean per-batch loss as a float.
    """
    model.eval()
    total_train_loss = 0
    with torch.no_grad():
        for batch in train_loader:
            inputs = tokenizer(batch['prompt'], return_tensors="pt", padding=True, truncation=True, max_length=512)
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            labels = input_ids.clone()
            # Mask padding positions with -100 so they are ignored by the
            # loss; previously pads contributed to (and inflated) the loss.
            labels[attention_mask == 0] = -100
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_train_loss += outputs.loss.item()
    avg_train_loss = total_train_loss / len(train_loader)
    return avg_train_loss
# --- Data ---
# Assuming the JSON file 'output.json' is in the same directory as the script
full_dataset = QuizletDataset(json_file='training_data_output.json')

# Calculate the sizes of the splits for 80/20 train/test
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size

# Split the dataset into training and test sets
train_dataset, test_dataset = random_split(full_dataset, [train_size, test_size])

# Create DataLoader instances for the training and test sets
print("Loading data into PyTorch Tensors...")
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# --- Model ---
# Load the tokenizer and model
print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
# NOTE(review): LLaMA-family tokenizers often ship without a pad token; the
# tokenizer(..., padding=True) calls elsewhere in this file assume one is
# configured — verify, or set tokenizer.pad_token = tokenizer.eos_token.
model = AutoModelForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
#print(model) # view the layers of the model to be frozen

# Freeze all layers
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the last n layers (here: the last 4 transformer blocks)
for layer in model.model.layers[-4:]:
    for param in layer.parameters():
        param.requires_grad = True

# # Unfreeze the embedding layer: only want to do if you are adding new tokens to the model
# for param in model.model.embed_tokens.parameters():
#     param.requires_grad = True

# Unfreeze the output layer
for param in model.lm_head.parameters():
    param.requires_grad = True

# --- Optimizer / schedule ---
# Define the optimizer and scheduler. All parameters are handed to AdamW;
# frozen ones (requires_grad=False) receive no updates.
epochs = 3
optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs
)

# Set up the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Record initial pre-trained model's performance on the held-out split,
# so fine-tuning gains can be compared against this baseline.
initial_loss, initial_perplexity = evaluate_model(model, test_loader, device)
performance_log = {
    "pretrained": {
        "loss": initial_loss,
        "perplexity": initial_perplexity,
        "timestamp": datetime.datetime.now().isoformat()
    },
    "finetuned": []
}
# --- Training loop ---
print("Starting Training...")
for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        # Concatenate 'prompt' and 'answer' with the EOS token in between
        combined = [p + tokenizer.eos_token + a for p, a in zip(batch['prompt'], batch['answer'])]
        # Tokenize the combined text
        inputs = tokenizer(combined, return_tensors="pt", padding=True, truncation=True, max_length=512)
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        # HF causal-LM models shift labels internally when `labels` is
        # supplied, so labels must mirror input_ids. The previous manual
        # shift (labels = input_ids[:, 1:]; input_ids = input_ids[:, :-1])
        # combined with the internal shift trained the model to predict
        # token t+2 from token t. Padding is masked out of the loss (-100).
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        if loss is not None:
            loss.backward()
            optimizer.step()
            scheduler.step()
        else:
            print(f"No loss to backpropagate for batch {batch}")
    # Evaluate after each epoch and compare with the pre-trained model
    train_loss = evaluate_training(model, train_loader, device)
    finetuned_loss, finetuned_perplexity = evaluate_model(model, test_loader, device)
    epoch_performance = {
        "epoch": epoch + 1,
        "train_loss": train_loss,
        "val_loss": finetuned_loss,
        "perplexity": finetuned_perplexity,
        "timestamp": datetime.datetime.now().isoformat()
    }
    performance_log["finetuned"].append(epoch_performance)
    # Optionally, save the model checkpoint
    # model.save_pretrained(f"model_checkpoint_epoch_{epoch}.bin")
    print(f"Epoch {epoch + 1} / {epochs}. Performance: {epoch_performance}")

# Save performance log to a timestamped JSON file
print("Saving performance log...")
training_datetime = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M')
with open(f"performance_log_{training_datetime}.json", "w") as file:
    json.dump(performance_log, file, indent=4)
model.save_pretrained("trained_models/")
tokenizer.save_pretrained("trained_models/")
print("Done!")