shamashel committed on
Commit 3c8279d · 1 Parent(s): af1a5ca

Clone from github

Files changed (8)
  1. .gitignore +5 -0
  2. LICENSE +21 -0
  3. README.md +12 -0
  4. bigram.py +146 -0
  5. encoder.py +14 -0
  6. input.txt +0 -0
  7. main.py +80 -0
  8. pyproject.toml +17 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
+ .vscode
+ __pycache__
+ .git
+ .wolf*
+ model.pth
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Michael Gabriel
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,3 +1,15 @@
  ---
  license: mit
  ---
+
+ # Bad GPT
+
+ Based on the [Let's build GPT](https://www.youtube.com/watch?v=kCc8FmEb1nY) video from Andrej Karpathy.
+
+ This is just an attempt to recreate the transformer Andrej built in his video, with the goal of learning more about torch, transformers, and neural networks in general.
+
+ To run, make sure `python` 3.10 and `poetry` are installed. You can then run `poetry install` to get the dependencies (just torch and numpy).
+
+ Finally, you can run the code with `poetry run python ./main.py`.
+
+ Note that the first run will train the model and then save the trained weights to `model.pth`. Subsequent runs will load these weights instead of retraining.
bigram.py ADDED
@@ -0,0 +1,146 @@
+ from typing import Literal
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+ import numpy as np
+
+ from encoder import encode, decode
+ from self_attention import Head, MultiHead
+
+
+ class Batcher():
+     def __init__(self, device: Literal['cuda', 'cpu'], batch_size: int, block_size: int):
+         self.device = device
+         self.batch_size = batch_size
+         self.block_size = block_size
+         with open('input.txt', 'r', encoding='utf-8') as f:
+             text = f.read()
+         my_tensors = torch.tensor(encode(text), dtype=torch.long)
+         n = int(0.9*len(my_tensors))
+         self.train_data = my_tensors[:n]
+         self.val_data = my_tensors[n:]
+         self.vocab = set(text)
+
+     def get_batch(self, split: str = 'val'):
+         data = self.train_data if split == 'train' else self.val_data
+         random_indexes = torch.randint(
+             len(data) - self.block_size, (self.batch_size,)).to(self.device)
+         context_stack = torch.stack(
+             [data[i:i+self.block_size] for i in random_indexes]).to(self.device)
+         answer_stack = torch.stack(
+             [data[i+1:i+self.block_size+1] for i in random_indexes])
+         return context_stack, answer_stack
+
+
+ class FeedForward(nn.Module):
+     def __init__(self, n_embd: int, dropout: float):
+         super().__init__()
+         self.net = nn.Sequential(
+             # Scale the data up before applying ReLU so we get more variance
+             nn.Linear(n_embd, n_embd * 4),
+             nn.ReLU(),
+             # Scale back down before returning, effectively averaging the variance from earlier
+             nn.Linear(n_embd * 4, n_embd),
+             nn.Dropout(dropout)
+         )
+
+     def forward(self, x: torch.Tensor):
+         return self.net(x)
+
+
+ class Block(nn.Module):
+     def __init__(self, n_embd: int, block_size: int, n_head: int, dropout: float):
+         super().__init__()
+         head_size = n_embd // n_head
+         self.sa_head = MultiHead(
+             n_head, block_size, n_embd, head_size, dropout)
+         self.ffwd = FeedForward(n_embd, dropout)
+         self.norm1 = nn.LayerNorm(n_embd)
+         self.norm2 = nn.LayerNorm(n_embd)
+
+     def forward(self, x: torch.Tensor):
+         x = x + self.sa_head(self.norm1(x))
+         x = x + self.ffwd(self.norm2(x))
+         return x
+
+
+ class BigramLanguageModel(nn.Module):
+     def __init__(
+         self,
+         device: Literal['cuda', 'cpu'],
+         block_size: int,
+         vocab_size: int,
+         n_embd: int,
+         n_head: int = 4,
+         n_layers: int = 3,
+         dropout: float = 0.2
+     ):
+         super().__init__()
+         self.block_size = block_size
+         self.vocab_size = vocab_size
+         self.n_embd = n_embd
+         self.device = device
+         # Create tables to embed both token identity and position
+         self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
+         self.position_embedding_table = nn.Embedding(block_size, n_embd)
+         self.lm_head = nn.Linear(n_embd, vocab_size)
+         self.expected_loss: np.float64 = np.log(1/vocab_size) * -1
+         self.blocks = nn.Sequential(
+             *[
+                 Block(n_embd, block_size, n_head, dropout)
+                 for _ in range(n_layers)
+             ],
+             nn.LayerNorm(n_embd)
+         )
+
+     def forward(self, idx: torch.Tensor, targets: torch.Tensor = None):
+         # Predict next tokens
+         B, T = idx.shape
+         tok_emb: torch.Tensor = self.token_embedding_table(idx)
+         pos_emb = self.position_embedding_table(
+             torch.arange(T, device=self.device))
+         x: torch.Tensor = tok_emb + pos_emb
+         x = self.blocks(x)
+         logits: torch.Tensor = self.lm_head(x)
+         if targets is None:
+             loss = 0
+         else:
+             batch, block, vocab = logits.shape
+             # Reshape logits and targets so each position can be compared
+             logits = logits.view(batch * block, vocab)
+             targets = targets.view(batch * block)
+             # Compare predicted tokens to actual
+             loss = F.cross_entropy(logits, targets)
+         return logits, loss
+
+     # Given a 2d tensor of shape (batch, tokens),
+     # append max_new_tokens generated tokens to each row
+     def generate(self, idx: torch.Tensor, max_new_tokens: int):
+         for _ in range(max_new_tokens):
+             # Crop out the last block_size tokens
+             cropped_idx = idx[:, -self.block_size:]
+             logits, _ = self(cropped_idx)
+             # Logits has shape (batch, tokens, vocab);
+             # we only need the prediction for the last position
+             logits = logits[:, -1, :]
+             # Get possible next tokens and select one
+             probabilities = F.softmax(logits, dim=-1)
+             idx_next = torch.multinomial(probabilities, num_samples=1)
+             # Add the new token to the end of the tensor
+             idx = torch.cat((idx, idx_next), dim=1)
+         return idx
+
+
+ @torch.no_grad()
+ def estimate_loss(model: nn.Module, batcher: Batcher, eval_interval: int, device: Literal['cuda', 'cpu'] = 'cuda'):
+     out = {}
+     model.eval()  # set to eval phase
+     for split in ['train', 'val']:
+         losses = torch.zeros(eval_interval)
+         for k in range(eval_interval):
+             x, y = batcher.get_batch(split=split)
+             logits, loss = model(x.to(device), y.to(device))
+             losses[k] = loss.item()
+         out[split] = losses.mean()
+     model.train()  # set back to training phase
+     return out
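For context (not part of the commit): the two less obvious pieces of `bigram.py` are the `expected_loss` figure and the reshape in `forward`. The sketch below illustrates both on their own; the 65-character vocabulary is an assumption, matching the Tiny Shakespeare corpus used in Karpathy's video.

```python
# Standalone sketch (not part of the commit). Assumes a 65-character
# vocabulary, as in the Tiny Shakespeare corpus from Karpathy's video.
import numpy as np
import torch
from torch.nn import functional as F

vocab_size = 65

# An untrained model is roughly uniform over the vocabulary, so its starting
# cross-entropy should sit near -ln(1/65), i.e. about 4.17.
expected_loss = np.log(1 / vocab_size) * -1
print(f"expected initial loss ~ {expected_loss:.2f}")

# forward() flattens (batch, block, vocab) logits and (batch, block) targets
# so F.cross_entropy can score every position independently.
batch, block = 4, 8
logits = torch.zeros(batch, block, vocab_size)      # uniform predictions
targets = torch.randint(vocab_size, (batch, block))
loss = F.cross_entropy(logits.view(batch * block, vocab_size),
                       targets.view(batch * block))
print(loss.item())  # matches the expected figure for uniform logits
```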
encoder.py ADDED
@@ -0,0 +1,14 @@
+ with open('input.txt', 'r', encoding='utf-8') as f:
+     text = f.read()
+
+ chars = sorted(list(set(text)))
+ stoi = {ch: i for i, ch in enumerate(chars)}
+ itos = {i: ch for i, ch in enumerate(chars)}
+
+
+ def encode(s: str):
+     return [stoi[c] for c in s]
+
+
+ def decode(l: list[int]):
+     return ''.join([itos[i] for i in l])
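A quick usage note (not part of the commit): `encoder.py` builds its character vocabulary from `input.txt` at import time, so the round-trip sketch below assumes that file sits in the working directory.

```python
# Minimal round-trip sketch, assuming input.txt is present so encoder.py can
# build stoi/itos when it is imported.
from encoder import encode, decode

ids = encode("hello")   # one integer id per character of the corpus vocabulary
text = decode(ids)      # map the ids back to characters
assert text == "hello"  # encoding and decoding are lossless inverses
print(ids, text)
```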
input.txt ADDED
The diff for this file is too large to render. See raw diff
 
main.py ADDED
@@ -0,0 +1,80 @@
+ from typing import Literal
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+ import numpy as np
+ import os
+
+ from encoder import encode, decode
+ from bigram import BigramLanguageModel, Batcher, estimate_loss
+
+ # HYPERPARAMETERS #
+ ### Impacts performance ###
+ BATCH_SIZE = 64  # how many sequences of tokens we process in parallel
+ BLOCK_SIZE = 256  # how long a single token sequence is (context length)
+ LEARNING_RATE = 1e-4
+ NUM_EMBEDDING_DIMENSIONS = 384
+ NUM_HEADS = 6
+ NUM_LAYERS = 6
+ MAX_ITERS = 5000
+ ### Others ###
+ EVAL_INTERVAL = 500
+ DROPOUT_RATE = 0.2
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+ # --------------- #
+
+
+ def train_model(model: nn.Module, batcher: Batcher, iterations=MAX_ITERS, lr=LEARNING_RATE):
+     optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
+     for i in range(iterations):
+         if i % EVAL_INTERVAL == 0:
+             losses = estimate_loss(model, batcher, EVAL_INTERVAL, DEVICE)
+             print(
+                 f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
+         context_stack, answer_stack = batcher.get_batch(split='train')
+         _, loss = model(context_stack.to(DEVICE), answer_stack.to(DEVICE))
+         optimizer.zero_grad(set_to_none=True)
+         loss.backward()
+         optimizer.step()
+     return optimizer
+
+
+ b = Batcher(
+     device=DEVICE,
+     batch_size=BATCH_SIZE,
+     block_size=BLOCK_SIZE
+ )
+ m = BigramLanguageModel(
+     device=DEVICE,
+     block_size=BLOCK_SIZE,
+     vocab_size=len(b.vocab),
+     n_embd=NUM_EMBEDDING_DIMENSIONS,
+     n_head=NUM_HEADS,
+     n_layers=NUM_LAYERS,
+     dropout=DROPOUT_RATE
+ ).to(DEVICE)
+
+
+ def run_model(model: nn.Module, response_size: int = BLOCK_SIZE):
+     context = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)
+     encoded = model.generate(
+         idx=context, max_new_tokens=response_size)[0]
+     return decode(encoded.tolist())
+
+
+ if os.path.exists('model.pth'):
+     print("Loading model from file...")
+     checkpoint = torch.load('model.pth')
+     m.load_state_dict(checkpoint['model_state_dict'])
+     print("Model loaded!")
+ else:
+     print("Training model...")
+     optimizer = train_model(m, b)
+     torch.save({
+         'model_state_dict': m.state_dict(),
+         'optimizer_state_dict': optimizer.state_dict()
+     }, 'model.pth')
+     print("Training complete!")
+ print("Generating response...\n")
+ resp = run_model(m, 256)
+ print("Response:", resp)
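One possible extension (not in this commit): `run_model` always starts generation from a single zero token. The sketch below shows how a saved checkpoint could be sampled from a text prompt instead. It assumes `model.pth` was produced by a prior run of `main.py` with the same hyperparameters, and that `input.txt` and the `self_attention` module (not included in this commit) are available; the `"ROMEO:"` prompt is purely illustrative.

```python
# Hedged sketch: prompt-conditioned sampling from a previously trained
# checkpoint. File names, hyperparameters, and the prompt mirror main.py
# and are assumptions, not part of the committed code.
import torch

from encoder import encode, decode
from bigram import BigramLanguageModel, Batcher

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

b = Batcher(device=DEVICE, batch_size=64, block_size=256)
m = BigramLanguageModel(
    device=DEVICE,
    block_size=256,
    vocab_size=len(b.vocab),
    n_embd=384,
    n_head=6,
    n_layers=6,
    dropout=0.2
).to(DEVICE)

# Load only the model weights; map_location lets a GPU-trained checkpoint
# load on a CPU-only machine.
checkpoint = torch.load('model.pth', map_location=DEVICE)
m.load_state_dict(checkpoint['model_state_dict'])
m.eval()

# Encode the prompt as a (1, T) tensor of token ids and let generate() extend it.
prompt = torch.tensor([encode("ROMEO:")], dtype=torch.long, device=DEVICE)
output = m.generate(idx=prompt, max_new_tokens=200)[0]
print(decode(output.tolist()))
```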
pyproject.toml ADDED
@@ -0,0 +1,17 @@
+ [tool.poetry]
+ name = "bad-gpt"
+ version = "0.1.0"
+ description = ""
+ authors = []
+ readme = "README.md"
+ package-mode = false
+
+ [tool.poetry.dependencies]
+ python = "^3.10"
+ torch = "^2.3.0"
+ numpy = "^1.26.4"
+
+
+ [build-system]
+ requires = ["poetry-core"]
+ build-backend = "poetry.core.masonry.api"