import datetime
import json
import math

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
)


class QuizletDataset(Dataset):
    """(prompt, answer) pairs loaded from a JSON export.

    Each record must carry a 'prompt' key and a 'messages' list; the answer
    is taken from the second message's 'content' field.
    NOTE(review): confirm that layout against the data-generation script.
    """

    def __init__(self, json_file):
        with open(json_file, "r") as f:
            self.data = json.load(f)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # DataLoader samplers may hand us a tensor index.
        if torch.is_tensor(idx):
            idx = idx.tolist()
        sample = self.data[idx]
        return {
            "prompt": sample["prompt"],
            # Assuming the answer is the second message.
            "answer": sample["messages"][1]["content"],
        }


def _masked_labels(input_ids, attention_mask):
    """Return causal-LM labels: a copy of input_ids with padding set to -100.

    Hugging Face causal-LM heads shift labels internally, so NO manual
    shifting is done here. (The original code shifted labels by hand *and*
    let the model shift again, mis-aligning every prediction by one token;
    it also let padding tokens contribute to the loss.)
    """
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100  # -100 is ignored by the CE loss
    return labels


def evaluate_model(model, val_dataloader, device, tokenizer=None):
    """Return (average loss, perplexity) of `model` over `val_dataloader`.

    `tokenizer` defaults to the module-level one so the original
    three-argument call sites keep working.
    """
    if tokenizer is None:
        tokenizer = globals()["tokenizer"]
    model.eval()
    total_eval_loss = 0.0
    with torch.no_grad():  # cover tokenization + forward for every batch
        for batch in val_dataloader:
            inputs = tokenizer(
                batch["prompt"],
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            )
            input_ids = inputs["input_ids"].to(device)
            attention_mask = inputs["attention_mask"].to(device)
            labels = _masked_labels(input_ids, attention_mask)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_eval_loss += outputs.loss.item()
    # Guard against an empty dataloader (e.g. tiny dataset after the split).
    avg_eval_loss = total_eval_loss / max(len(val_dataloader), 1)
    return avg_eval_loss, math.exp(avg_eval_loss)


def evaluate_training(model, train_loader, device, tokenizer=None):
    """Return the average loss of `model` over `train_loader`.

    Delegates to evaluate_model so both splits are scored with identical
    label handling (the original duplicated the loop with unmasked labels).
    """
    avg_train_loss, _ = evaluate_model(model, train_loader, device, tokenizer)
    return avg_train_loss


def main():
    """Fine-tune TinyLlama on the Quizlet data and log performance."""
    # ---- Data -----------------------------------------------------------
    # Assuming the JSON file is in the same directory as the script.
    full_dataset = QuizletDataset(json_file="training_data_output.json")

    # 80/20 train/test split.
    train_size = int(0.8 * len(full_dataset))
    test_size = len(full_dataset) - train_size
    train_dataset, test_dataset = random_split(full_dataset, [train_size, test_size])

    print("Loading data into PyTorch Tensors...")
    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

    # ---- Model ----------------------------------------------------------
    print("Loading tokenizer and model...")
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Llama-family tokenizers ship without a pad token; without this,
    # tokenizer(..., padding=True) raises at the first batch.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_name)
    # print(model)  # view the layers of the model to be frozen

    # Freeze everything, then unfreeze only the last 4 transformer layers
    # and the output head.
    for param in model.parameters():
        param.requires_grad = False
    for layer in model.model.layers[-4:]:
        for param in layer.parameters():
            param.requires_grad = True
    # The embedding layer stays frozen — only unfreeze it when adding new
    # tokens to the vocabulary:
    # for param in model.model.embed_tokens.parameters():
    #     param.requires_grad = True
    for param in model.lm_head.parameters():
        param.requires_grad = True

    # ---- Optimizer / schedule -------------------------------------------
    epochs = 3
    optimizer = AdamW(model.parameters(), lr=1e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=len(train_loader) * epochs,
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Record the pre-trained model's performance as a baseline.
    initial_loss, initial_perplexity = evaluate_model(
        model, test_loader, device, tokenizer
    )
    performance_log = {
        "pretrained": {
            "loss": initial_loss,
            "perplexity": initial_perplexity,
            "timestamp": datetime.datetime.now().isoformat(),
        },
        "finetuned": [],
    }

    # ---- Training loop --------------------------------------------------
    print("Starting Training...")
    for epoch in range(epochs):
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            # Concatenate prompt and answer with EOS in between so the
            # model sees where the prompt ends.
            combined = [
                p + tokenizer.eos_token + a
                for p, a in zip(batch["prompt"], batch["answer"])
            ]
            inputs = tokenizer(
                combined,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            )
            input_ids = inputs["input_ids"].to(device)
            attention_mask = inputs["attention_mask"].to(device)
            # The model shifts labels internally; we only mask the padding.
            labels = _masked_labels(input_ids, attention_mask)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            if loss is not None:
                loss.backward()
                optimizer.step()
                scheduler.step()
            else:
                print(f"No loss to backpropagate for batch {batch}")

        # Evaluate after each epoch and compare with the pre-trained model.
        train_loss = evaluate_training(model, train_loader, device, tokenizer)
        finetuned_loss, finetuned_perplexity = evaluate_model(
            model, test_loader, device, tokenizer
        )
        epoch_performance = {
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "val_loss": finetuned_loss,
            "perplexity": finetuned_perplexity,
            "timestamp": datetime.datetime.now().isoformat(),
        }
        performance_log["finetuned"].append(epoch_performance)
        # Optionally, save a checkpoint directory per epoch:
        # model.save_pretrained(f"model_checkpoint_epoch_{epoch}")
        print(f"Epoch {epoch + 1} / {epochs}. Performance: {epoch_performance}")

    # ---- Persist results -------------------------------------------------
    print("Saving performance log...")
    training_datetime = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")
    with open(f"performance_log_{training_datetime}.json", "w") as file:
        json.dump(performance_log, file, indent=4)

    model.save_pretrained("trained_models/")
    tokenizer.save_pretrained("trained_models/")
    print("Done!")


if __name__ == "__main__":
    main()