# NOTE(review): removed web-scrape artifacts (page chrome, commit hashes,
# line-number gutter) that made this file invalid Python.
import json,math,datetime
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoTokenizer, AutoModelForCausalLM, get_linear_schedule_with_warmup
from torch.optim import AdamW
class QuizletDataset(Dataset):
    """Prompt/answer pairs loaded from a JSON file.

    Each record in the JSON array is expected to carry a 'prompt' string
    and a 'messages' list whose second entry holds the answer text.
    """

    def __init__(self, json_file):
        """Read the entire JSON file into memory as a list of records."""
        with open(json_file, 'r') as handle:
            self.data = json.load(handle)

    def __len__(self):
        """Number of prompt/answer records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return {'prompt': ..., 'answer': ...} for the given index."""
        if torch.is_tensor(idx):
            idx = idx.tolist()
        record = self.data[idx]
        # NOTE(review): the answer is assumed to live at messages[1] —
        # confirm this matches the structure the data pipeline writes.
        return {
            'prompt': record['prompt'],
            'answer': record['messages'][1]['content'],
        }
def evaluate_model(model, val_dataloader, device):
    """Compute the average validation loss and perplexity of `model`.

    Relies on the module-level `tokenizer`. Only the 'prompt' field of each
    batch is scored (answers are not included), mirroring evaluate_training.

    Args:
        model: causal-LM with a HF-style forward(labels=...) returning .loss.
        val_dataloader: yields batches with a 'prompt' list of strings.
        device: torch device to run on.

    Returns:
        (avg_eval_loss, perplexity) as floats.
    """
    model.eval()
    total_eval_loss = 0.0
    with torch.no_grad():
        for batch in val_dataloader:
            inputs = tokenizer(batch['prompt'], return_tensors="pt",
                               padding=True, truncation=True, max_length=512)
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            # BUG FIX: labels were previously shifted by hand
            # (labels[:, :-1] = input_ids[:, 1:]), but HF causal-LM models
            # shift labels internally, so each token was scored against the
            # token TWO positions ahead. Pass unshifted ids instead, and
            # mask padding so it does not contribute to the loss.
            labels = input_ids.clone()
            labels[attention_mask == 0] = -100
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_eval_loss += outputs.loss.item()
    avg_eval_loss = total_eval_loss / len(val_dataloader)
    perplexity = math.exp(avg_eval_loss)
    return avg_eval_loss, perplexity
def evaluate_training(model, train_loader, device):
    """Average no-grad loss of `model` over the training prompts.

    Relies on the module-level `tokenizer`. NOTE(review): like
    evaluate_model, this scores only the 'prompt' text, while the training
    loop optimizes prompt+answer — the two losses are not directly
    comparable; confirm this is intentional.

    Returns:
        The mean per-batch loss as a float.
    """
    model.eval()
    total_train_loss = 0.0
    with torch.no_grad():
        for batch in train_loader:
            inputs = tokenizer(batch['prompt'], return_tensors="pt",
                               padding=True, truncation=True, max_length=512)
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            labels = input_ids.clone()
            # FIX: pad positions must not contribute to the loss.
            labels[attention_mask == 0] = -100
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_train_loss += outputs.loss.item()
    return total_train_loss / len(train_loader)
# Load the full prompt/answer dataset from 'training_data_output.json'
# (expected in the script's working directory).
full_dataset = QuizletDataset(json_file='training_data_output.json')
# 80/20 train/test split; test gets the remainder so the sizes sum exactly.
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size
# Randomly split the dataset into training and test sets
train_dataset, test_dataset = random_split(full_dataset, [train_size, test_size])
# DataLoaders: batch size 4; shuffle the training data only.
print("Loading data into PyTorch Tensors...")
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)
# Load the tokenizer and model
print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
model = AutoModelForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
#print(model) # view the layers of the model to be frozen
# Freeze all layers, then selectively unfreeze what we fine-tune.
for param in model.parameters():
    param.requires_grad = False
# Unfreeze the last 4 transformer layers
for layer in model.model.layers[-4:]:
    for param in layer.parameters():
        param.requires_grad = True
# # Unfreeze the embedding layer: only want to do if you are adding new tokens to the model
# for param in model.model.embed_tokens.parameters():
#     param.requires_grad = True
# Unfreeze the output layer
for param in model.lm_head.parameters():
    param.requires_grad = True
# Define the optimizer and scheduler
epochs = 3
# Hand only the trainable parameters to the optimizer: frozen params never
# receive gradients, so passing them in just adds needless bookkeeping.
optimizer = AdamW([p for p in model.parameters() if p.requires_grad], lr=1e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs
)
# Set up the device (GPU when available, otherwise CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
# Baseline: score the pre-trained model before any fine-tuning so the log
# can show per-epoch improvement against it.
initial_loss, initial_perplexity = evaluate_model(model, test_loader, device)
# One "pretrained" entry plus one "finetuned" entry per epoch; the whole
# structure is serialized to JSON at the end of the script.
performance_log = {
    "pretrained": {
        "loss": initial_loss,
        "perplexity": initial_perplexity,
        "timestamp": datetime.datetime.now().isoformat()
    },
    "finetuned": []
}
# Training loop
print("Starting Training...")
for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        # Concatenate 'prompt' and 'answer' with the EOS token in between
        combined = [p + tokenizer.eos_token + a for p, a in zip(batch['prompt'], batch['answer'])]
        # Tokenize the combined text
        inputs = tokenizer(combined, return_tensors="pt", padding=True, truncation=True, max_length=512)
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        # BUG FIX: the old code shifted labels/inputs by hand
        # (labels = input_ids[:, 1:]; input_ids = input_ids[:, :-1]), but HF
        # causal-LM models already shift labels internally — the net effect
        # was training the model to predict the token TWO positions ahead.
        # Pass unshifted ids and mask padding out of the loss instead.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        if loss is not None:
            loss.backward()
            optimizer.step()
            scheduler.step()
        else:
            print(f"No loss to backpropagate for batch {batch}")
    # Evaluate after each epoch and compare with the pre-trained model
    train_loss = evaluate_training(model, train_loader, device)
    finetuned_loss, finetuned_perplexity = evaluate_model(model, test_loader, device)
    epoch_performance = {
        "epoch": epoch + 1,
        "train_loss": train_loss,
        "val_loss": finetuned_loss,
        "perplexity": finetuned_perplexity,
        "timestamp": datetime.datetime.now().isoformat()
    }
    performance_log["finetuned"].append(epoch_performance)
    # Optionally, save the model checkpoint
    # model.save_pretrained(f"model_checkpoint_epoch_{epoch}.bin")
    print(f"Epoch {epoch + 1} / {epochs}. Performance: {epoch_performance}")
# Persist the performance log, fine-tuned weights, and tokenizer.
print("Saving performance log...")
training_datetime = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M')
log_path = f"performance_log_{training_datetime}.json"
with open(log_path, "w") as log_file:
    json.dump(performance_log, log_file, indent=4)
model.save_pretrained("trained_models/")
tokenizer.save_pretrained("trained_models/")
print("Done!")