Spaces:
Sleeping
Sleeping
| import json,math,datetime | |
| import torch | |
| from torch.utils.data import Dataset, DataLoader, random_split | |
| from transformers import AutoTokenizer, AutoModelForCausalLM, get_linear_schedule_with_warmup | |
| from torch.optim import AdamW | |
class QuizletDataset(Dataset):
    """Prompt/answer pairs loaded from a JSON file.

    Each record is expected to carry a 'prompt' string and a 'messages'
    list whose second entry holds the answer text.
    """

    def __init__(self, json_file):
        # Read the entire JSON payload into memory up front.
        with open(json_file, 'r') as f:
            self.data = json.load(f)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Samplers may hand over a tensor index; normalise to a plain int.
        if torch.is_tensor(idx):
            idx = idx.tolist()
        record = self.data[idx]
        # NOTE: assumes the answer lives in the second chat message.
        return {
            'prompt': record['prompt'],
            'answer': record['messages'][1]['content'],
        }
def evaluate_model(model, val_dataloader, device):
    """Compute average validation loss and perplexity over a dataloader.

    Relies on the module-level ``tokenizer``. HF causal-LM models shift
    labels internally, so labels must mirror ``input_ids`` directly.

    Args:
        model: a HuggingFace causal-LM model.
        val_dataloader: yields batches with a 'prompt' list of strings.
        device: torch device to run on.

    Returns:
        (avg_eval_loss, perplexity) as floats.
    """
    model.eval()
    total_eval_loss = 0.0
    with torch.no_grad():
        for batch in val_dataloader:
            inputs = tokenizer(batch['prompt'], return_tensors="pt",
                               padding=True, truncation=True, max_length=512)
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            # BUGFIX: do NOT shift labels manually -- HF causal-LM models
            # shift internally, so the previous manual shift double-shifted
            # (scoring prediction of token t+2 from token t).
            labels = input_ids.clone()
            # Exclude padding positions from the loss.
            pad_id = tokenizer.pad_token_id
            if pad_id is not None:
                labels[labels == pad_id] = -100
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_eval_loss += outputs.loss.item()
    avg_eval_loss = total_eval_loss / len(val_dataloader)
    perplexity = math.exp(avg_eval_loss)
    return avg_eval_loss, perplexity
def evaluate_training(model, train_loader, device):
    """Compute the average causal-LM loss over the training loader.

    Relies on the module-level ``tokenizer``. Labels mirror the input
    ids (HF models shift internally) with padding masked to -100.

    Args:
        model: a HuggingFace causal-LM model.
        train_loader: yields batches with a 'prompt' list of strings.
        device: torch device to run on.

    Returns:
        The average loss across all batches as a float.
    """
    model.eval()
    total_train_loss = 0.0
    with torch.no_grad():
        for batch in train_loader:
            inputs = tokenizer(batch['prompt'], return_tensors="pt",
                               padding=True, truncation=True, max_length=512)
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            labels = input_ids.clone()
            # BUGFIX: mask padding so pad tokens do not inflate the loss.
            pad_id = tokenizer.pad_token_id
            if pad_id is not None:
                labels[labels == pad_id] = -100
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_train_loss += outputs.loss.item()
    return total_train_loss / len(train_loader)
# Load the prompt/answer dataset (JSON produced by the data-prep step).
full_dataset = QuizletDataset(json_file='training_data_output.json')

# 80/20 train/test split.
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size
train_dataset, test_dataset = random_split(full_dataset, [train_size, test_size])

# Create DataLoader instances for the training and test sets.
print("Loading data into PyTorch Tensors...")
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Load the tokenizer and model.
print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
model = AutoModelForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
# BUGFIX: LLaMA-family tokenizers ship without a pad token, so any call
# with padding=True would raise at tokenization time. Reuse EOS as pad.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

# print(model)  # view the layers of the model to be frozen
# Freeze everything, then unfreeze only the last 4 transformer layers
# and the LM head for lightweight fine-tuning.
for param in model.parameters():
    param.requires_grad = False
for layer in model.model.layers[-4:]:
    for param in layer.parameters():
        param.requires_grad = True
# # Unfreeze the embedding layer: only want to do if you are adding new tokens to the model
# for param in model.model.embed_tokens.parameters():
#     param.requires_grad = True
# Unfreeze the output layer.
for param in model.lm_head.parameters():
    param.requires_grad = True

# Optimizer and linear warmup/decay schedule over all training steps.
epochs = 3
optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs
)

# Device placement.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Record the pre-trained model's performance as a baseline.
initial_loss, initial_perplexity = evaluate_model(model, test_loader, device)
performance_log = {
    "pretrained": {
        "loss": initial_loss,
        "perplexity": initial_perplexity,
        "timestamp": datetime.datetime.now().isoformat()
    },
    "finetuned": []
}
# Training loop: fine-tune on prompt + answer, evaluating after each epoch.
print("Starting Training...")
for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        # Concatenate 'prompt' and 'answer' with the EOS token in between.
        combined = [p + tokenizer.eos_token + a
                    for p, a in zip(batch['prompt'], batch['answer'])]
        inputs = tokenizer(combined, return_tensors="pt",
                           padding=True, truncation=True, max_length=512)
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        # BUGFIX: HF causal-LM models shift labels internally, so labels
        # must simply mirror input_ids. The previous manual shift (labels
        # = input_ids[:, 1:] while also passing labels= to the model)
        # double-shifted, training the model to predict token t+2 from t.
        labels = input_ids.clone()
        pad_id = tokenizer.pad_token_id
        if pad_id is not None:
            labels[labels == pad_id] = -100  # exclude padding from the loss
        # Forward + backward pass.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        if loss is not None:
            loss.backward()
            optimizer.step()
            scheduler.step()
        else:
            print(f"No loss to backpropagate for batch {batch}")
    # Evaluate after each epoch and compare with the pre-trained baseline.
    train_loss = evaluate_training(model, train_loader, device)
    finetuned_loss, finetuned_perplexity = evaluate_model(model, test_loader, device)
    epoch_performance = {
        "epoch": epoch + 1,
        "train_loss": train_loss,
        "val_loss": finetuned_loss,
        "perplexity": finetuned_perplexity,
        "timestamp": datetime.datetime.now().isoformat()
    }
    performance_log["finetuned"].append(epoch_performance)
    # Optionally, save the model checkpoint
    # model.save_pretrained(f"model_checkpoint_epoch_{epoch}.bin")
    print(f"Epoch {epoch + 1} / {epochs}. Performance: {epoch_performance}")
# Persist the performance log under a timestamped filename, then save
# the fine-tuned model and its tokenizer together for later loading.
print("Saving performance log...")
training_datetime = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M')
with open(f"performance_log_{training_datetime}.json", "w") as file:
    json.dump(performance_log, file, indent=4)
model.save_pretrained("trained_models/")  # plain string: f-string had no placeholders
tokenizer.save_pretrained("trained_models/")
print("Done!")