shamashel committed
Commit 070fe06 · 1 Parent(s): e3fc82f

refactoring

Files changed (4):
  1. bigram.py → bad_gpt.py +99 -69
  2. dataset.py +25 -1
  3. main.py +19 -64
  4. self_attention.py +32 -0
bigram.py → bad_gpt.py RENAMED
@@ -1,70 +1,19 @@
+import os
 from typing import Literal
 import torch
 import torch.nn as nn
 from torch.nn import functional as F
 import numpy as np
-from datasets import load_dataset
+import logging
 
-from encoder import encode, decode, tokens
-from self_attention import Head, MultiHead
+from encoder import encode, decode
+from self_attention import Block
+from dataset import Batcher
 
-
-class Batcher():
-    def __init__(self, device: Literal['cuda', 'cpu'], batch_size: int, block_size: int):
-        self.device = device
-        self.batch_size = batch_size
-        self.block_size = block_size
-        from dataset import make_dataset
-        train_data = make_dataset('train')
-        val_data = make_dataset('validation')
-        self.train_data = torch.tensor(encode(train_data), dtype=torch.long)
-        self.val_data = torch.tensor(encode(val_data), dtype=torch.long)
-        self.vocab = tokens
-
-    def get_batch(self, split: str = 'val'):
-        data = self.train_data if split == 'train' else self.val_data
-        random_indexes = torch.randint(
-            len(data) - self.block_size, (self.batch_size,)).to(self.device)
-        context_stack = torch.stack(
-            [data[i:i+self.block_size] for i in random_indexes]).to(self.device)
-        answer_stack = torch.stack(
-            [data[i+1:i+self.block_size+1] for i in random_indexes])
-        return context_stack, answer_stack
-
-
-class FeedForward(nn.Module):
-    def __init__(self, n_embd: int, dropout: float):
-        super().__init__()
-        self.net = nn.Sequential(
-            # Scale out data before applying ReLU so we get more variance
-            nn.Linear(n_embd, n_embd * 4),
-            nn.ReLU(),
-            # Scale back down before returning, effectively averaging the variance from earlier
-            nn.Linear(n_embd * 4, n_embd),
-            nn.Dropout(dropout)
-        )
-
-    def forward(self, x: torch.Tensor):
-        return self.net(x)
-
-
-class Block(nn.Module):
-    def __init__(self, n_embd: int, block_size: int, n_head: int, dropout: float):
-        super().__init__()
-        head_size = n_embd // n_head
-        self.sa_head = MultiHead(
-            n_head, block_size, n_embd, head_size, dropout)
-        self.ffwd = FeedForward(n_embd, dropout)
-        self.norm1 = nn.LayerNorm(n_embd)
-        self.norm2 = nn.LayerNorm(n_embd)
-
-    def forward(self, x: torch.Tensor):
-        x = x + self.sa_head(self.norm1(x))
-        x = x + self.ffwd(self.norm2(x))
-        return x
+logger = logging.getLogger('bad_gpt').getChild(__name__)
 
 
-class BigramLanguageModel(nn.Module):
+class BadGPTModel(nn.Module):
     def __init__(
         self,
         device: Literal['cuda', 'cpu'],
@@ -102,24 +51,16 @@ class BigramLanguageModel(nn.Module):
         x: torch.Tensor = tok_emb + pos_emb
         x = self.blocks(x)
         logits: torch.Tensor = self.lm_head(x)
-        if targets is None:
-            loss = 0
-        else:
-            batch, block, vocab = logits.shape
-            # Reformat logits and targets so each entry can be compared
-            logits = logits.view(batch * block, vocab)
-            targets = targets.view(batch * block)
-            # Compare predicted tokens to actual
-            loss = F.cross_entropy(logits, targets)
-        return logits, loss
+        return logits
 
     # Given a 2d matrix of dimensions token and sentence
     # generate new tokens in the next sentence
    def generate(self, idx: torch.Tensor, max_new_tokens: int):
         for _ in range(max_new_tokens):
+            print(f'Iteration {_} of {max_new_tokens}')
             # Crop out the last block_size tokens
             cropped_idx = idx[:, -self.block_size:]
-            logits, _ = self(cropped_idx)
+            logits = self(cropped_idx)
             # Logits has dimensions token, sentence, token_list
             # We want to make a new sentence, so only look at the last sentence
             logits = logits[:, -1, :]
@@ -144,3 +85,92 @@ def estimate_loss(model: nn.Module, batcher: Batcher, eval_interval: int, device
         out[split] = losses.mean()
     model.train()  # set back to training phase
     return out
+
+
+class BadGPTTrainer():
+    def __init__(self, model: BadGPTModel, batcher: Batcher, eval_interval: int, iterations: int, learning_rate: float):
+        self.model = model
+        self.batcher = batcher
+        self.eval_interval = eval_interval
+        self.iterations = iterations
+        self.learning_rate = learning_rate
+        self.device = self.model.device
+        self.optimizer = torch.optim.AdamW(
+            self.model.parameters(), lr=self.learning_rate)
+
+    def train(self):
+        if os.path.exists('model.pth'):
+            logger.debug("Loading model from file...")
+            checkpoint = torch.load('model.pth', map_location=self.device)
+            self.model.load_state_dict(checkpoint['model_state_dict'])
+            logger.debug("Model loaded!")
+        else:
+            logger.debug("Training model...")
+            self._train()
+            torch.save({
+                'model_state_dict': self.model.state_dict(),
+                'optimizer_state_dict': self.optimizer.state_dict()
+            }, 'model.pth')
+            logger.debug("Training complete!")
+
+    def _train(self):
+        for i in range(self.iterations):
+            if i % self.eval_interval == 0:
+                losses = estimate_loss(
+                    self.model, self.batcher, self.eval_interval, self.device)
+                logger.debug(
+                    f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
+            context_stack, answer_stack = self.batcher.get_batch(split='train')
+            logits = self.model(context_stack.to(self.device))  # forward() now returns only logits
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), answer_stack.to(self.device).view(-1))
+            self.optimizer.zero_grad(set_to_none=True)
+            loss.backward()
+            self.optimizer.step()
+
+
+class BadGPT():
+    def __init__(
+        self,
+        device: Literal['cuda', 'cpu'],
+        block_size: int,
+        batch_size: int,
+        n_embd: int,
+        n_head: int,
+        n_layers: int,
+        dropout: float,
+        eval_interval: int,
+        iterations: int,
+        lr: float
+    ):
+        self.device = device
+        self._batcher = Batcher(
+            device=device,
+            batch_size=batch_size,
+            block_size=block_size
+        )
+        self._model = BadGPTModel(
+            device=device,
+            block_size=block_size,
+            vocab_size=len(self._batcher.vocab),
+            n_embd=n_embd,
+            n_head=n_head,
+            n_layers=n_layers,
+            dropout=dropout
+        ).to(device)
+        self._trainer = BadGPTTrainer(
+            model=self._model,
+            batcher=self._batcher,
+            eval_interval=eval_interval,
+            iterations=iterations,
+            learning_rate=lr
+        )
+        self._trainer.train()
+
+    def generate(self, prompt: str, response_size: int):
+        start_ids = encode(prompt)
+        context = torch.tensor(start_ids, dtype=torch.long, device=self.device)
+        # add batch dimension. it's just 1 batch, but we still need it cuz tensors
+        context = context[None, ...]
+        encoded = self._model.generate(
+            idx=context, max_new_tokens=response_size)[0]
+        return decode(encoded.tolist())
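
Since forward() now returns only logits, the cross-entropy bookkeeping lives in the trainer, which must flatten the scores before calling F.cross_entropy (it expects (N, C) predictions against N integer targets). A minimal sketch of that reshape, using toy sizes rather than the repo's real hyperparameters:

import torch
from torch.nn import functional as F

batch, block, vocab = 4, 8, 65                   # toy sizes, not the repo's settings
logits = torch.randn(batch, block, vocab)        # shape BadGPTModel.forward returns
targets = torch.randint(vocab, (batch, block))   # next-token ids, as from Batcher.get_batch

# flatten to (batch*block, vocab) scores vs. (batch*block,) class ids
loss = F.cross_entropy(logits.view(-1, vocab), targets.view(-1))
print(loss.item())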
dataset.py CHANGED
@@ -1,5 +1,7 @@
 from typing import Literal, Union
-from datasets import load_dataset, DatasetDict
+from datasets import load_dataset
+import torch
+from encoder import encode, tokens
 
 DatasetType = Union[None, str]
 
@@ -19,3 +21,25 @@ def make_dataset(split: Literal['train', 'validation', 'test'] = 'train'):
     out = str(list(ds)[0]['text'])
     _datasets[split] = out
     return str(_datasets[split])
+
+
+class Batcher():
+    def __init__(self, device: Literal['cuda', 'cpu'], batch_size: int, block_size: int):
+        self.device = device
+        self.batch_size = batch_size
+        self.block_size = block_size
+        train_data = make_dataset('train')
+        val_data = make_dataset('validation')
+        self.train_data = torch.tensor(encode(train_data), dtype=torch.long)
+        self.val_data = torch.tensor(encode(val_data), dtype=torch.long)
+        self.vocab = tokens
+
+    def get_batch(self, split: str = 'val'):
+        data = self.train_data if split == 'train' else self.val_data
+        random_indexes = torch.randint(
+            len(data) - self.block_size, (self.batch_size,)).to(self.device)
+        context_stack = torch.stack(
+            [data[i:i+self.block_size] for i in random_indexes]).to(self.device)
+        answer_stack = torch.stack(
+            [data[i+1:i+self.block_size+1] for i in random_indexes])
+        return context_stack, answer_stack
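
Batcher.get_batch pairs each randomly sampled context window with the same window shifted one position right, so the target at index t is the token that follows context position t. A self-contained illustration of that offset, with a stand-in tensor instead of the encoded dataset:

import torch

data = torch.arange(100)          # stand-in for the encoded training text
block_size, batch_size = 8, 4
idx = torch.randint(len(data) - block_size, (batch_size,))
context_stack = torch.stack([data[i:i + block_size] for i in idx])
answer_stack = torch.stack([data[i + 1:i + block_size + 1] for i in idx])
# every answer row is its context row shifted one token to the right
assert torch.equal(context_stack[:, 1:], answer_stack[:, :-1])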
main.py CHANGED
@@ -1,12 +1,6 @@
-from typing import Literal
 import torch
-import torch.nn as nn
-from torch.nn import functional as F
-import numpy as np
-import os
 
-from encoder import encode, decode
-from bigram import BigramLanguageModel, Batcher, estimate_loss
+from bad_gpt import BadGPT
 
 # HYPERPARAMETERS #
 ### Impacts performance ###
@@ -24,60 +18,21 @@ DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
 # --------------- #
 
 
-def train_model(model: nn.Module, batcher: Batcher, iterations=MAX_ITERS, lr=LEARNING_RATE):
-    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
-    for i in range(iterations):
-        if i % EVAL_INTERVAL == 0:
-            losses = estimate_loss(model, batcher, EVAL_INTERVAL, DEVICE)
-            print(
-                f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
-        context_stack, answer_stack = batcher.get_batch(split='train')
-        _, loss = model(context_stack.to(DEVICE), answer_stack.to(DEVICE))
-        optimizer.zero_grad(set_to_none=True)
-        loss.backward()
-        optimizer.step()
-    return optimizer
-
-
-b = Batcher(
-    device=DEVICE,
-    batch_size=BATCH_SIZE,
-    block_size=BLOCK_SIZE
-)
-m = BigramLanguageModel(
-    device=DEVICE,
-    block_size=BLOCK_SIZE,
-    vocab_size=len(b.vocab),
-    n_embd=NUM_EMBEDDING_DIMENSIONS,
-    n_head=NUM_HEADS,
-    n_layers=NUM_LAYERS,
-    dropout=DROPOUT_RATE
-).to(DEVICE)
-
-
-def run_model(model: nn.Module, response_size: int = BLOCK_SIZE, query: str = ''):
-    start_ids = encode(query)
-    context = torch.tensor(start_ids, dtype=torch.long, device=DEVICE)
-    # add batch dimension. it's just 1 batch, but we still need it cuz tensors
-    context = context[None, ...]
-    encoded = model.generate(
-        idx=context, max_new_tokens=response_size)[0]
-    return decode(encoded.tolist())
-
-
-if os.path.exists('model.pth'):
-    print("Loading model from file...")
-    checkpoint = torch.load('model.pth', map_location=DEVICE)
-    m.load_state_dict(checkpoint['model_state_dict'])
-    print("Model loaded!")
-else:
-    print("Training model...")
-    optimizer = train_model(m, b)
-    torch.save({
-        'model_state_dict': m.state_dict(),
-        'optimizer_state_dict': optimizer.state_dict()
-    }, 'model.pth')
-    print("Training complete!")
-print("Generating response...\n")
-resp = run_model(m, 256, 'JULIET:\nRomeo, Romeo, wherefore art thou Romeo?')
-print("Response:\n" + resp)
+if __name__ == '__main__':
+    bad_gpt = BadGPT(
+        device=DEVICE,
+        batch_size=BATCH_SIZE,
+        block_size=BLOCK_SIZE,
+        n_embd=NUM_EMBEDDING_DIMENSIONS,
+        n_head=NUM_HEADS,
+        n_layers=NUM_LAYERS,
+        dropout=DROPOUT_RATE,
+        eval_interval=EVAL_INTERVAL,
+        iterations=MAX_ITERS,
+        lr=LEARNING_RATE
+    )
+
+    print("Generating response...\n")
+    resp = bad_gpt.generate(
+        'JULIET:\nRomeo, Romeo, wherefore art thou Romeo?', 256)
+    print("Response:\n" + resp)
self_attention.py CHANGED
@@ -2,6 +2,38 @@ import torch
 from torch import nn
 
 
+class FeedForward(nn.Module):
+    def __init__(self, n_embd: int, dropout: float):
+        super().__init__()
+        self.net = nn.Sequential(
+            # Scale out data before applying ReLU so we get more variance
+            nn.Linear(n_embd, n_embd * 4),
+            nn.ReLU(),
+            # Scale back down before returning, effectively averaging the variance from earlier
+            nn.Linear(n_embd * 4, n_embd),
+            nn.Dropout(dropout)
+        )
+
+    def forward(self, x: torch.Tensor):
+        return self.net(x)
+
+
+class Block(nn.Module):
+    def __init__(self, n_embd: int, block_size: int, n_head: int, dropout: float):
+        super().__init__()
+        head_size = n_embd // n_head
+        self.sa_head = MultiHead(
+            n_head, block_size, n_embd, head_size, dropout)
+        self.ffwd = FeedForward(n_embd, dropout)
+        self.norm1 = nn.LayerNorm(n_embd)
+        self.norm2 = nn.LayerNorm(n_embd)
+
+    def forward(self, x: torch.Tensor):
+        x = x + self.sa_head(self.norm1(x))
+        x = x + self.ffwd(self.norm2(x))
+        return x
+
+
 class MultiHead(nn.Module):
     def __init__(self, num_heads: int, block_size: int, n_embd: int, head_size: int, dropout: float):
         super().__init__()
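
FeedForward and Block now live next to the attention heads they wrap. Block uses a pre-norm residual layout: each sub-layer reads a LayerNorm'd copy of the stream and adds its output back, so the (batch, tokens, n_embd) shape is preserved end to end. A quick shape check, assuming MultiHead behaves as its signature above suggests:

import torch
from self_attention import Block

n_embd, block_size, n_head, dropout = 64, 32, 4, 0.1
block = Block(n_embd, block_size, n_head, dropout)
x = torch.randn(2, block_size, n_embd)    # (batch, tokens, embedding)
assert block(x).shape == x.shape          # the residual stream keeps its shape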