File size: 6,768 Bytes
c00d132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12d3e6f
c00d132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
da0e5e8
 
 
 
 
 
 
 
 
 
 
c00d132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12d3e6f
 
c00d132
 
12d3e6f
 
c00d132
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import json,math,datetime
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoTokenizer, AutoModelForCausalLM, get_linear_schedule_with_warmup
from torch.optim import AdamW

class QuizletDataset(Dataset):
    """Prompt/answer pairs loaded from a JSON file.

    The file must contain a list of records, each carrying a 'prompt'
    key and a 'messages' list whose second entry holds the answer text
    under 'content'.
    """

    def __init__(self, json_file):
        # Read the whole JSON payload into memory up front.
        with open(json_file, 'r') as f:
            self.data = json.load(f)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Samplers may hand us a tensor index; normalize it first.
        if torch.is_tensor(idx):
            idx = idx.tolist()
        record = self.data[idx]
        # NOTE(review): assumes the answer lives at messages[1]['content'] —
        # confirm against the data-generation script.
        return {
            'prompt': record['prompt'],
            'answer': record['messages'][1]['content'],
        }

def evaluate_model(model, val_dataloader, device):
    """Compute average cross-entropy loss and perplexity on a dataloader.

    Uses the module-level `tokenizer`. Returns (avg_loss, perplexity).

    NOTE(review): only the 'prompt' field is scored here, while training
    scores prompt + answer — confirm this asymmetry is intended.
    """
    model.eval()
    total_eval_loss = 0.0
    for batch in val_dataloader:
        inputs = tokenizer(batch['prompt'], return_tensors="pt",
                           padding=True, truncation=True, max_length=512)
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        # BUG FIX: HF causal-LM models shift labels internally, so the
        # previous manual left-shift made the loss compare each token
        # against the token TWO positions ahead. Pass the unshifted ids
        # and mask padding with -100 so pads are ignored by the loss.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_eval_loss += outputs.loss.item()

    avg_eval_loss = total_eval_loss / len(val_dataloader)
    perplexity = math.exp(avg_eval_loss)
    return avg_eval_loss, perplexity

def evaluate_training(model, train_loader, device):
    """Compute the average loss over `train_loader` with the model in eval mode.

    Uses the module-level `tokenizer`. Returns the mean per-batch loss.

    NOTE(review): only the 'prompt' field is scored here, while the
    training loop scores prompt + answer — confirm this is intended.
    """
    model.eval()
    total_train_loss = 0.0

    with torch.no_grad():
        for batch in train_loader:
            inputs = tokenizer(batch['prompt'], return_tensors="pt",
                               padding=True, truncation=True, max_length=512)
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)

            # FIX: exclude padding tokens from the loss (previously pads
            # were scored as real targets). -100 is the HF ignore index.
            labels = input_ids.clone()
            labels[attention_mask == 0] = -100

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_train_loss += outputs.loss.item()

    avg_train_loss = total_train_loss / len(train_loader)
    return avg_train_loss


# ---------------------------------------------------------------------------
# Script entry: load data, freeze most of TinyLlama, fine-tune the last
# layers on prompt+answer pairs, and log loss/perplexity per epoch.
# ---------------------------------------------------------------------------

# Assuming the JSON file is in the same directory as the script.
full_dataset = QuizletDataset(json_file='training_data_output.json')

# 80/20 train/test split.
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size
train_dataset, test_dataset = random_split(full_dataset, [train_size, test_size])

print("Loading data into PyTorch Tensors...")
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
model = AutoModelForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
#print(model) # view the layers of the model to be frozen

# Freeze all layers, then selectively unfreeze what we fine-tune.
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the last n transformer layers.
for layer in model.model.layers[-4:]:
    for param in layer.parameters():
        param.requires_grad = True

# # Unfreeze the embedding layer: only want to do if you are adding new tokens to the model
# for param in model.model.embed_tokens.parameters():
#     param.requires_grad = True

# Unfreeze the output (LM head) layer.
for param in model.lm_head.parameters():
    param.requires_grad = True

# Optimizer and linear-decay schedule over all training steps.
epochs = 3
optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Record the pre-trained model's baseline performance for comparison.
initial_loss, initial_perplexity = evaluate_model(model, test_loader, device)
performance_log = {
    "pretrained": {
        "loss": initial_loss,
        "perplexity": initial_perplexity,
        "timestamp": datetime.datetime.now().isoformat()
    },
    "finetuned": []
}

print("Starting Training...")
for epoch in range(epochs):
    model.train()

    for batch in train_loader:
        optimizer.zero_grad()

        # Concatenate 'prompt' and 'answer' with the EOS token in between.
        combined = [p + tokenizer.eos_token + a for p, a in zip(batch['prompt'], batch['answer'])]

        inputs = tokenizer(combined, return_tensors="pt", padding=True, truncation=True, max_length=512)
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        # BUG FIX: HF causal-LM models shift labels internally; the old
        # manual left-shift (labels = input_ids[:, 1:]) therefore trained
        # each position against the token TWO steps ahead. Pass unshifted
        # ids and mask padding with -100 (the HF ignore index) instead.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        if loss is not None:
            loss.backward()
            optimizer.step()
            scheduler.step()
        else:
            print(f"No loss to backpropagate for batch {batch}")

    # Evaluate after each epoch and compare with the pre-trained baseline.
    train_loss = evaluate_training(model, train_loader, device)
    finetuned_loss, finetuned_perplexity = evaluate_model(model, test_loader, device)
    epoch_performance = {
        "epoch": epoch + 1,
        "train_loss": train_loss,
        "val_loss": finetuned_loss,
        "perplexity": finetuned_perplexity,
        "timestamp": datetime.datetime.now().isoformat()
    }
    performance_log["finetuned"].append(epoch_performance)

    # Optionally, save the model checkpoint
    # model.save_pretrained(f"model_checkpoint_epoch_{epoch}.bin")
    print(f"Epoch {epoch + 1} / {epochs}. Performance: {epoch_performance}")


# Persist the performance log, timestamped so runs never overwrite each other.
print("Saving performance log...")
training_datetime = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M')
with open(f"performance_log_{training_datetime}.json", "w") as file:
    json.dump(performance_log, file, indent=4)

# Save the fine-tuned weights and the tokenizer together so the directory
# can be reloaded with from_pretrained().
model.save_pretrained("trained_models/")
tokenizer.save_pretrained("trained_models/")
print("Done!")