Clone from github
Browse files- .gitignore +5 -0
- LICENSE +21 -0
- README.md +12 -0
- bigram.py +146 -0
- encoder.py +14 -0
- input.txt +0 -0
- main.py +80 -0
- pyproject.toml +17 -0
.gitignore
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.vscode
|
| 2 |
+
__pycache__
|
| 3 |
+
.git
|
| 4 |
+
.wolf*
|
| 5 |
+
model.pth
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2024 Michael Gabriel
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
CHANGED
|
@@ -1,3 +1,15 @@
|
|
| 1 |
---
|
| 2 |
license: mit
|
| 3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
license: mit
|
| 3 |
---
|
| 4 |
+
|
| 5 |
+
# Bad GPT
|
| 6 |
+
|
| 7 |
+
Based on the [Let's build GPT](https://www.youtube.com/watch?v=kCc8FmEb1nY) video from Andrej Karpathy.
|
| 8 |
+
|
| 9 |
+
This is just an attempt to recreate the transformer Andrej made in his video with the goal of learning more about torch, transformers, and neural networks in general.
|
| 10 |
+
|
| 11 |
+
To run, make sure Python 3.10 and `poetry` are installed. Then run `poetry install` to fetch the dependencies (just `torch` and `numpy`).
|
| 12 |
+
|
| 13 |
+
Finally, you can run the code with `poetry run python ./main.py`
|
| 14 |
+
|
| 15 |
+
Note that the first run will train the model and then save the trained weights to `model.pth`. Subsequent runs will load these weights.
|
bigram.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Literal
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
from torch.nn import functional as F
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
from encoder import encode, decode
|
| 8 |
+
from self_attention import Head, MultiHead
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class Batcher():
    """Loads the training corpus and serves random (context, target) batches.

    Reads ``input.txt``, encodes it to token ids, and splits it 90/10 into
    train and validation tensors. ``vocab`` is the set of distinct characters
    seen in the corpus.
    """

    def __init__(self, device: Literal['cuda', 'cpu'], batch_size: int, block_size: int):
        self.device = device
        self.batch_size = batch_size
        self.block_size = block_size
        with open('input.txt', 'r', encoding='utf-8') as f:
            text = f.read()
        my_tensors = torch.tensor(encode(text), dtype=torch.long)
        # 90/10 train/validation split.
        n = int(0.9*len(my_tensors))
        self.train_data = my_tensors[:n]
        self.val_data = my_tensors[n:]
        self.vocab = set(text)

    def get_batch(self, split: str = 'val'):
        """Return one random batch of contexts and next-token targets.

        Both returned tensors have shape (batch_size, block_size) and live on
        ``self.device``; targets are the contexts shifted right by one token.
        """
        data = self.train_data if split == 'train' else self.val_data
        # Keep the random offsets on the CPU: they index CPU-resident data,
        # so moving them to the device first only forces needless transfers.
        random_indexes = torch.randint(
            len(data) - self.block_size, (self.batch_size,))
        context_stack = torch.stack(
            [data[i:i+self.block_size] for i in random_indexes])
        answer_stack = torch.stack(
            [data[i+1:i+self.block_size+1] for i in random_indexes])
        # BUG FIX: previously only context_stack was moved to self.device and
        # the targets were left on the CPU; move both so they always match.
        return context_stack.to(self.device), answer_stack.to(self.device)
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class FeedForward(nn.Module):
    """Position-wise feed-forward sub-layer: expand, ReLU, project back."""

    def __init__(self, n_embd: int, dropout: float):
        super().__init__()
        # Widen to 4x the embedding size so ReLU sees more variance, then
        # project back down to the embedding width before dropout.
        hidden = n_embd * 4
        self.net = nn.Sequential(
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout)
        )

    def forward(self, x: torch.Tensor):
        """Apply the feed-forward stack independently at every position."""
        return self.net(x)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class Block(nn.Module):
    """One transformer block: pre-norm multi-head self-attention followed by a
    pre-norm feed-forward layer, each wrapped in a residual connection."""

    def __init__(self, n_embd: int, block_size: int, n_head: int, dropout: float):
        super().__init__()
        # Split the embedding width evenly across the attention heads.
        head_size = n_embd // n_head
        self.sa_head = MultiHead(n_head, block_size, n_embd, head_size, dropout)
        self.ffwd = FeedForward(n_embd, dropout)
        self.norm1 = nn.LayerNorm(n_embd)
        self.norm2 = nn.LayerNorm(n_embd)

    def forward(self, x: torch.Tensor):
        """Run attention then feed-forward, adding each result onto its input."""
        attended = self.sa_head(self.norm1(x))
        x = x + attended
        fed_forward = self.ffwd(self.norm2(x))
        return x + fed_forward
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class BigramLanguageModel(nn.Module):
    """Character-level transformer language model (grown out of a bigram model).

    Embeds tokens and positions, runs the sum through a stack of transformer
    blocks, and projects to per-token vocabulary logits.
    """

    def __init__(
        self,
        device: Literal['cuda', 'cpu'],
        block_size: int,
        vocab_size: int,
        n_embd: int,
        n_head: int = 4,
        n_layers: int = 3,
        dropout: float = 0.2
    ):
        super().__init__()
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.device = device
        # Create a table to embed both token and position
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        # Cross-entropy of a uniform random predictor: -ln(1/vocab_size).
        # Useful as a sanity baseline for early training loss.
        self.expected_loss: np.float64 = np.log(1/vocab_size) * -1
        self.blocks = nn.Sequential(
            *[
                Block(n_embd, block_size, n_head, dropout)
                for _ in range(n_layers)
            ],
            nn.LayerNorm(n_embd)
        )

    def forward(self, idx: torch.Tensor, targets: torch.Tensor | None = None):
        """Predict next-token logits for ``idx``.

        Args:
            idx: (batch, time) tensor of token ids; time <= block_size.
            targets: optional (batch, time) tensor of ground-truth next tokens.

        Returns:
            (logits, loss) where loss is cross-entropy against ``targets``,
            or None when no targets were supplied.
        """
        _, T = idx.shape
        tok_emb: torch.Tensor = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(
            torch.arange(T, device=self.device))
        # Broadcast position embeddings across the batch dimension.
        x: torch.Tensor = tok_emb + pos_emb
        x = self.blocks(x)
        logits: torch.Tensor = self.lm_head(x)
        if targets is None:
            # BUG FIX: previously returned 0 here, which reads as a "perfect"
            # loss; None makes "no loss computed" explicit. In-file callers
            # either pass targets or discard the loss, so this is safe.
            loss = None
        else:
            batch, block, vocab = logits.shape
            # Reformat logits and targets so each entry can be compared
            logits = logits.view(batch * block, vocab)
            targets = targets.view(batch * block)
            # Compare predicted tokens to actual
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    # Given a 2d matrix of dimensions token and sentence
    # generate new tokens in the next sentence
    def generate(self, idx: torch.Tensor, max_new_tokens: int):
        """Autoregressively append ``max_new_tokens`` sampled tokens to ``idx``."""
        for _ in range(max_new_tokens):
            # Crop out the last block_size tokens
            cropped_idx = idx[:, -self.block_size:]
            logits, _ = self(cropped_idx)
            # Logits has dimensions token, sentence, token_list
            # We want to make a new sentence, so only look at the last sentence
            logits = logits[:, -1, :]
            # Get possible next tokens and select one
            probabilities = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probabilities, num_samples=1)
            # Add the new token to the end of the tensor
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
@torch.no_grad()
def estimate_loss(model: nn.Module, batcher: "Batcher", eval_interval: int, device: Literal['cuda', 'cpu'] = 'cuda'):
    """Estimate the mean train/val loss over ``eval_interval`` random batches.

    Runs under ``torch.no_grad`` with the model temporarily switched to eval
    mode (disabling dropout) and restored to train mode before returning.

    Returns:
        dict mapping 'train' and 'val' to the mean loss tensor for that split.
    """
    out = {}
    model.eval()  # set to eval phase
    for split in ['train', 'val']:
        losses = torch.zeros(eval_interval)
        for k in range(eval_interval):
            x, y = batcher.get_batch(split=split)
            # Only the scalar loss matters here; discard the logits.
            _, loss = model(x.to(device), y.to(device))
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()  # set back to training phase
    return out
|
encoder.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Build a character-level codec from the training corpus: every distinct
# character in input.txt gets a stable integer id. Sorting makes the mapping
# reproducible across runs for the same corpus.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

chars = sorted(list(set(text)))
# Forward (char -> id) and reverse (id -> char) lookup tables.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def encode(s: str):
    """Translate a string into the list of integer ids of its characters."""
    return [stoi[character] for character in s]
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def decode(l: list[int]):
    """Join the characters for a sequence of token ids back into a string."""
    return ''.join(itos[token_id] for token_id in l)
|
input.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
main.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Literal
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
from torch.nn import functional as F
|
| 5 |
+
import numpy as np
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
from encoder import encode, decode
|
| 9 |
+
from bigram import BigramLanguageModel, Batcher, estimate_loss
|
| 10 |
+
|
| 11 |
+
# HYPERPARAMETERS #
### Impacts performance ###
BATCH_SIZE = 64  # how many sequences of tokens will we process in parallel
BLOCK_SIZE = 256  # how long is a single token sequence (context length)
LEARNING_RATE = 1e-4  # AdamW step size
NUM_EMBEDDING_DIMENSIONS = 384  # embedding width passed to the model as n_embd
NUM_HEADS = 6  # attention heads per transformer block
NUM_LAYERS = 6  # number of stacked transformer blocks
MAX_ITERS = 5000  # total training steps
### Others ###
EVAL_INTERVAL = 500  # evaluate (and print) loss every this many steps
DROPOUT_RATE = 0.2
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'  # prefer GPU when available
# --------------- #
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def train_model(model: nn.Module, batcher: Batcher, iterations=MAX_ITERS, lr=LEARNING_RATE):
    """Train *model* for *iterations* steps with AdamW on random batches.

    Prints estimated train/val losses every EVAL_INTERVAL steps. Returns the
    optimizer so its state can be checkpointed by the caller.
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    for i in range(iterations):
        # Periodically report smoothed losses for both splits.
        if i % EVAL_INTERVAL == 0:
            losses = estimate_loss(model, batcher, EVAL_INTERVAL, DEVICE)
            print(
                f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        # One optimization step on a fresh random training batch.
        contexts, answers = batcher.get_batch(split='train')
        _, loss = model(contexts.to(DEVICE), answers.to(DEVICE))
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
    return optimizer
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# Build the data pipeline and the model from the hyperparameters above.
b = Batcher(
    device=DEVICE,
    batch_size=BATCH_SIZE,
    block_size=BLOCK_SIZE
)
m = BigramLanguageModel(
    device=DEVICE,
    block_size=BLOCK_SIZE,
    vocab_size=len(b.vocab),  # vocabulary size is derived from the corpus
    n_embd=NUM_EMBEDDING_DIMENSIONS,
    n_head=NUM_HEADS,
    n_layers=NUM_LAYERS,
    dropout=DROPOUT_RATE
).to(DEVICE)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def run_model(model: nn.Module, response_size: int = BLOCK_SIZE):
    """Sample *response_size* new tokens from *model* and decode them to text.

    Generation is seeded with a single zero token (a batch of one).
    """
    seed = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)
    generated = model.generate(idx=seed, max_new_tokens=response_size)
    return decode(generated[0].tolist())
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# Load a previously trained checkpoint when one exists; otherwise train from
# scratch and persist the weights for future runs.
if os.path.exists('model.pth'):
    print("Loading model from file...")
    checkpoint = torch.load('model.pth')
    m.load_state_dict(checkpoint['model_state_dict'])
    print("Model loaded!")
else:
    print("Training model...")
    optimizer = train_model(m, b)
    # Save the optimizer state too so training could be resumed later.
    torch.save({
        'model_state_dict': m.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, 'model.pth')
    print("Training complete!")
print("Generating response...\n")
resp = run_model(m, 256)
print("Response:", resp)
|
pyproject.toml
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[tool.poetry]
|
| 2 |
+
name = "bad-gpt"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = ""
|
| 5 |
+
authors = []
|
| 6 |
+
readme = "README.md"
|
| 7 |
+
package-mode = false
|
| 8 |
+
|
| 9 |
+
[tool.poetry.dependencies]
|
| 10 |
+
python = "^3.10"
|
| 11 |
+
torch = "^2.3.0"
|
| 12 |
+
numpy = "^1.26.4"
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
[build-system]
|
| 16 |
+
requires = ["poetry-core"]
|
| 17 |
+
build-backend = "poetry.core.masonry.api"
|