# FIN_LLM / training.py
# Author: Robert Castagna
# Last change: "update hf page" (commit da0e5e8)
import json,math,datetime
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoTokenizer, AutoModelForCausalLM, get_linear_schedule_with_warmup
from torch.optim import AdamW
class QuizletDataset(Dataset):
    """Prompt/answer pairs loaded from a JSON file.

    The file must hold a list of records, each with a 'prompt' key and a
    'messages' list whose second entry carries the answer under 'content'.
    """

    def __init__(self, json_file):
        with open(json_file, 'r') as source:
            self.data = json.load(source)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Samplers may hand over tensor indices; normalize them first.
        if torch.is_tensor(idx):
            idx = idx.tolist()
        record = self.data[idx]
        return {
            'prompt': record['prompt'],
            # Answer is assumed to be the second message of the record.
            'answer': record['messages'][1]['content'],
        }
def evaluate_model(model, val_dataloader, device):
    """Compute the average cross-entropy loss and perplexity on a dataloader.

    Uses the module-level `tokenizer` to encode each batch's prompts.
    Returns (avg_loss, perplexity); perplexity = exp(avg_loss).
    """
    model.eval()
    total_eval_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            inputs = tokenizer(batch['prompt'], return_tensors="pt", padding=True, truncation=True, max_length=512)
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            # HF causal-LM models shift labels internally when `labels` is
            # passed, so labels must simply mirror input_ids. The previous
            # manual shift (labels[:, :-1] = input_ids[:, 1:]) combined with
            # the model's internal shift scored the wrong (doubly-shifted)
            # targets. Padding positions are excluded from the loss via -100.
            labels = input_ids.clone()
            labels[attention_mask == 0] = -100
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_eval_loss += outputs.loss.item()
    avg_eval_loss = total_eval_loss / len(val_dataloader)
    perplexity = math.exp(avg_eval_loss)
    return avg_eval_loss, perplexity
def evaluate_training(model, train_loader, device):
    """Compute the average cross-entropy loss over the training dataloader.

    Uses the module-level `tokenizer` to encode each batch's prompts.
    Returns the mean per-batch loss as a float.
    """
    model.eval()
    total_train_loss = 0
    with torch.no_grad():
        for batch in train_loader:
            inputs = tokenizer(batch['prompt'], return_tensors="pt", padding=True, truncation=True, max_length=512)
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            labels = input_ids.clone()
            # Mask padding positions with -100 so they are ignored by the
            # loss; previously pads contributed to (and inflated) the loss.
            labels[attention_mask == 0] = -100
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_train_loss += outputs.loss.item()
    avg_train_loss = total_train_loss / len(train_loader)
    return avg_train_loss
# --- Data ---
# Assuming the JSON file 'output.json' is in the same directory as the script
full_dataset = QuizletDataset(json_file='training_data_output.json')

# Calculate the sizes of the splits for 80/20 train/test
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size

# Split the dataset into training and test sets
train_dataset, test_dataset = random_split(full_dataset, [train_size, test_size])

# Create DataLoader instances for the training and test sets
print("Loading data into PyTorch Tensors...")
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# --- Model ---
# Load the tokenizer and model
print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
# NOTE(review): LLaMA-family tokenizers often ship without a pad token; the
# tokenizer(..., padding=True) calls elsewhere in this file assume one is
# configured — verify, or set tokenizer.pad_token = tokenizer.eos_token.
model = AutoModelForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
#print(model) # view the layers of the model to be frozen

# Freeze all layers
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the last n layers (here: the last 4 transformer blocks)
for layer in model.model.layers[-4:]:
    for param in layer.parameters():
        param.requires_grad = True

# # Unfreeze the embedding layer: only want to do if you are adding new tokens to the model
# for param in model.model.embed_tokens.parameters():
#     param.requires_grad = True

# Unfreeze the output layer
for param in model.lm_head.parameters():
    param.requires_grad = True

# --- Optimizer / schedule ---
# Define the optimizer and scheduler. All parameters are handed to AdamW;
# frozen ones (requires_grad=False) receive no updates.
epochs = 3
optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs
)

# Set up the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Record initial pre-trained model's performance on the held-out split,
# so fine-tuning gains can be compared against this baseline.
initial_loss, initial_perplexity = evaluate_model(model, test_loader, device)
performance_log = {
    "pretrained": {
        "loss": initial_loss,
        "perplexity": initial_perplexity,
        "timestamp": datetime.datetime.now().isoformat()
    },
    "finetuned": []
}
# --- Training loop ---
print("Starting Training...")
for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        # Concatenate 'prompt' and 'answer' with the EOS token in between
        combined = [p + tokenizer.eos_token + a for p, a in zip(batch['prompt'], batch['answer'])]
        # Tokenize the combined text
        inputs = tokenizer(combined, return_tensors="pt", padding=True, truncation=True, max_length=512)
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        # HF causal-LM models shift labels internally when `labels` is
        # supplied, so labels must mirror input_ids. The previous manual
        # shift (labels = input_ids[:, 1:]; input_ids = input_ids[:, :-1])
        # combined with the internal shift trained the model to predict
        # token t+2 from token t. Padding is masked out of the loss (-100).
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        if loss is not None:
            loss.backward()
            optimizer.step()
            scheduler.step()
        else:
            print(f"No loss to backpropagate for batch {batch}")
    # Evaluate after each epoch and compare with the pre-trained model
    train_loss = evaluate_training(model, train_loader, device)
    finetuned_loss, finetuned_perplexity = evaluate_model(model, test_loader, device)
    epoch_performance = {
        "epoch": epoch + 1,
        "train_loss": train_loss,
        "val_loss": finetuned_loss,
        "perplexity": finetuned_perplexity,
        "timestamp": datetime.datetime.now().isoformat()
    }
    performance_log["finetuned"].append(epoch_performance)
    # Optionally, save the model checkpoint
    # model.save_pretrained(f"model_checkpoint_epoch_{epoch}.bin")
    print(f"Epoch {epoch + 1} / {epochs}. Performance: {epoch_performance}")

# Save performance log to a timestamped JSON file
print("Saving performance log...")
training_datetime = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M')
with open(f"performance_log_{training_datetime}.json", "w") as file:
    json.dump(performance_log, file, indent=4)
model.save_pretrained("trained_models/")
tokenizer.save_pretrained("trained_models/")
print("Done!")