Clone from github
Browse files- .gitignore +5 -0
- LICENSE +21 -0
- README.md +12 -0
- bigram.py +146 -0
- encoder.py +14 -0
- input.txt +0 -0
- main.py +80 -0
- pyproject.toml +17 -0
.gitignore
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.vscode
|
| 2 |
+
__pycache__
|
| 3 |
+
.git
|
| 4 |
+
.wolf*
|
| 5 |
+
model.pth
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2024 Michael Gabriel
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
CHANGED
|
@@ -1,3 +1,15 @@
|
|
| 1 |
---
|
| 2 |
license: mit
|
| 3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
license: mit
|
| 3 |
---
|
| 4 |
+
|
| 5 |
+
# Bad GPT
|
| 6 |
+
|
| 7 |
+
Based on the [Let's build GPT](https://www.youtube.com/watch?v=kCc8FmEb1nY) video from Andrej Karpathy.
|
| 8 |
+
|
| 9 |
+
This is just an attempt to recreate the transformer Andrej made in his video with the goal of learning more about torch, transformers, and neural networks in general.
|
| 10 |
+
|
| 11 |
+
To run, make sure Python 3.10 and `poetry` are installed. Then run `poetry install` to fetch the dependencies (just `torch` and `numpy`).
|
| 12 |
+
|
| 13 |
+
Finally, you can run the code with `poetry run python ./main.py`
|
| 14 |
+
|
| 15 |
+
Note that the first run will train the model and then save the trained weights to `model.pth`. Subsequent runs will load these weights.
|
bigram.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Literal
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
from torch.nn import functional as F
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
from encoder import encode, decode
|
| 8 |
+
from self_attention import Head, MultiHead
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class Batcher():
    """Loads the training corpus and serves random (context, target) batches.

    Reads ``input.txt``, encodes it to token ids, and splits it 90/10 into
    train and validation tensors. ``vocab`` is the set of distinct characters
    seen in the corpus.
    """

    def __init__(self, device: Literal['cuda', 'cpu'], batch_size: int, block_size: int):
        self.device = device
        self.batch_size = batch_size
        self.block_size = block_size
        with open('input.txt', 'r', encoding='utf-8') as f:
            text = f.read()
        my_tensors = torch.tensor(encode(text), dtype=torch.long)
        # 90/10 train/validation split.
        n = int(0.9*len(my_tensors))
        self.train_data = my_tensors[:n]
        self.val_data = my_tensors[n:]
        self.vocab = set(text)

    def get_batch(self, split: str = 'val'):
        """Return one random batch of contexts and next-token targets.

        Both returned tensors have shape (batch_size, block_size) and live on
        ``self.device``; targets are the contexts shifted right by one token.
        """
        data = self.train_data if split == 'train' else self.val_data
        # Keep the random offsets on the CPU: they index CPU-resident data,
        # so moving them to the device first only forces needless transfers.
        random_indexes = torch.randint(
            len(data) - self.block_size, (self.batch_size,))
        context_stack = torch.stack(
            [data[i:i+self.block_size] for i in random_indexes])
        answer_stack = torch.stack(
            [data[i+1:i+self.block_size+1] for i in random_indexes])
        # BUG FIX: previously only context_stack was moved to self.device and
        # the targets were left on the CPU; move both so they always match.
        return context_stack.to(self.device), answer_stack.to(self.device)
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class FeedForward(nn.Module):
    """Position-wise feed-forward sub-layer: expand, ReLU, project back."""

    def __init__(self, n_embd: int, dropout: float):
        super().__init__()
        # Widen to 4x the embedding size so ReLU sees more variance, then
        # project back down to the embedding width before dropout.
        hidden = n_embd * 4
        self.net = nn.Sequential(
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout)
        )

    def forward(self, x: torch.Tensor):
        """Apply the feed-forward stack independently at every position."""
        return self.net(x)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class Block(nn.Module):
    """One transformer block: pre-norm multi-head self-attention followed by a
    pre-norm feed-forward layer, each wrapped in a residual connection."""

    def __init__(self, n_embd: int, block_size: int, n_head: int, dropout: float):
        super().__init__()
        # Split the embedding width evenly across the attention heads.
        head_size = n_embd // n_head
        self.sa_head = MultiHead(n_head, block_size, n_embd, head_size, dropout)
        self.ffwd = FeedForward(n_embd, dropout)
        self.norm1 = nn.LayerNorm(n_embd)
        self.norm2 = nn.LayerNorm(n_embd)

    def forward(self, x: torch.Tensor):
        """Run attention then feed-forward, adding each result onto its input."""
        attended = self.sa_head(self.norm1(x))
        x = x + attended
        fed_forward = self.ffwd(self.norm2(x))
        return x + fed_forward
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class BigramLanguageModel(nn.Module):
    """Character-level transformer language model (grown out of a bigram model).

    Embeds tokens and positions, runs the sum through a stack of transformer
    blocks, and projects to per-token vocabulary logits.
    """

    def __init__(
        self,
        device: Literal['cuda', 'cpu'],
        block_size: int,
        vocab_size: int,
        n_embd: int,
        n_head: int = 4,
        n_layers: int = 3,
        dropout: float = 0.2
    ):
        super().__init__()
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.device = device
        # Create a table to embed both token and position
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        # Cross-entropy of a uniform random predictor: -ln(1/vocab_size).
        # Useful as a sanity baseline for early training loss.
        self.expected_loss: np.float64 = np.log(1/vocab_size) * -1
        self.blocks = nn.Sequential(
            *[
                Block(n_embd, block_size, n_head, dropout)
                for _ in range(n_layers)
            ],
            nn.LayerNorm(n_embd)
        )

    def forward(self, idx: torch.Tensor, targets: torch.Tensor | None = None):
        """Predict next-token logits for ``idx``.

        Args:
            idx: (batch, time) tensor of token ids; time <= block_size.
            targets: optional (batch, time) tensor of ground-truth next tokens.

        Returns:
            (logits, loss) where loss is cross-entropy against ``targets``,
            or None when no targets were supplied.
        """
        _, T = idx.shape
        tok_emb: torch.Tensor = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(
            torch.arange(T, device=self.device))
        # Broadcast position embeddings across the batch dimension.
        x: torch.Tensor = tok_emb + pos_emb
        x = self.blocks(x)
        logits: torch.Tensor = self.lm_head(x)
        if targets is None:
            # BUG FIX: previously returned 0 here, which reads as a "perfect"
            # loss; None makes "no loss computed" explicit. In-file callers
            # either pass targets or discard the loss, so this is safe.
            loss = None
        else:
            batch, block, vocab = logits.shape
            # Reformat logits and targets so each entry can be compared
            logits = logits.view(batch * block, vocab)
            targets = targets.view(batch * block)
            # Compare predicted tokens to actual
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    # Given a 2d matrix of dimensions token and sentence
    # generate new tokens in the next sentence
    def generate(self, idx: torch.Tensor, max_new_tokens: int):
        """Autoregressively append ``max_new_tokens`` sampled tokens to ``idx``."""
        for _ in range(max_new_tokens):
            # Crop out the last block_size tokens
            cropped_idx = idx[:, -self.block_size:]
            logits, _ = self(cropped_idx)
            # Logits has dimensions token, sentence, token_list
            # We want to make a new sentence, so only look at the last sentence
            logits = logits[:, -1, :]
            # Get possible next tokens and select one
            probabilities = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probabilities, num_samples=1)
            # Add the new token to the end of the tensor
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
@torch.no_grad()
def estimate_loss(model: nn.Module, batcher: "Batcher", eval_interval: int, device: Literal['cuda', 'cpu'] = 'cuda'):
    """Estimate the mean train/val loss over ``eval_interval`` random batches.

    Runs under ``torch.no_grad`` with the model temporarily switched to eval
    mode (disabling dropout) and restored to train mode before returning.

    Returns:
        dict mapping 'train' and 'val' to the mean loss tensor for that split.
    """
    out = {}
    model.eval()  # set to eval phase
    for split in ['train', 'val']:
        losses = torch.zeros(eval_interval)
        for k in range(eval_interval):
            x, y = batcher.get_batch(split=split)
            # Only the scalar loss matters here; discard the logits.
            _, loss = model(x.to(device), y.to(device))
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()  # set back to training phase
    return out
|
encoder.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Build a character-level codec from the training corpus: every distinct
# character in input.txt gets a stable integer id. Sorting makes the mapping
# reproducible across runs for the same corpus.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

chars = sorted(list(set(text)))
# Forward (char -> id) and reverse (id -> char) lookup tables.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def encode(s: str):
    """Translate a string into the list of integer ids of its characters."""
    return [stoi[character] for character in s]
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def decode(l: list[int]):
    """Join the characters for a sequence of token ids back into a string."""
    return ''.join(itos[token_id] for token_id in l)
|
input.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
main.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Literal
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
from torch.nn import functional as F
|
| 5 |
+
import numpy as np
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
from encoder import encode, decode
|
| 9 |
+
from bigram import BigramLanguageModel, Batcher, estimate_loss
|
| 10 |
+
|
| 11 |
+
# HYPERPARAMETERS #
### Impacts performance ###
BATCH_SIZE = 64  # how many sequences of tokens will we process in parallel
BLOCK_SIZE = 256  # how long is a single token sequence (context length)
LEARNING_RATE = 1e-4  # AdamW step size
NUM_EMBEDDING_DIMENSIONS = 384  # embedding width passed to the model as n_embd
NUM_HEADS = 6  # attention heads per transformer block
NUM_LAYERS = 6  # number of stacked transformer blocks
MAX_ITERS = 5000  # total training steps
### Others ###
EVAL_INTERVAL = 500  # evaluate (and print) loss every this many steps
DROPOUT_RATE = 0.2
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'  # prefer GPU when available
# --------------- #
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def train_model(model: nn.Module, batcher: Batcher, iterations=MAX_ITERS, lr=LEARNING_RATE):
    """Train *model* for *iterations* steps with AdamW on random batches.

    Prints estimated train/val losses every EVAL_INTERVAL steps. Returns the
    optimizer so its state can be checkpointed by the caller.
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    for i in range(iterations):
        # Periodically report smoothed losses for both splits.
        if i % EVAL_INTERVAL == 0:
            losses = estimate_loss(model, batcher, EVAL_INTERVAL, DEVICE)
            print(
                f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        # One optimization step on a fresh random training batch.
        contexts, answers = batcher.get_batch(split='train')
        _, loss = model(contexts.to(DEVICE), answers.to(DEVICE))
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
    return optimizer
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# Build the data pipeline and the model from the hyperparameters above.
b = Batcher(
    device=DEVICE,
    batch_size=BATCH_SIZE,
    block_size=BLOCK_SIZE
)
m = BigramLanguageModel(
    device=DEVICE,
    block_size=BLOCK_SIZE,
    vocab_size=len(b.vocab),  # vocabulary size is derived from the corpus
    n_embd=NUM_EMBEDDING_DIMENSIONS,
    n_head=NUM_HEADS,
    n_layers=NUM_LAYERS,
    dropout=DROPOUT_RATE
).to(DEVICE)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def run_model(model: nn.Module, response_size: int = BLOCK_SIZE):
    """Sample *response_size* new tokens from *model* and decode them to text.

    Generation is seeded with a single zero token (a batch of one).
    """
    seed = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)
    generated = model.generate(idx=seed, max_new_tokens=response_size)
    return decode(generated[0].tolist())
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# Load a previously trained checkpoint when one exists; otherwise train from
# scratch and persist the weights for future runs.
if os.path.exists('model.pth'):
    print("Loading model from file...")
    checkpoint = torch.load('model.pth')
    m.load_state_dict(checkpoint['model_state_dict'])
    print("Model loaded!")
else:
    print("Training model...")
    optimizer = train_model(m, b)
    # Save the optimizer state too so training could be resumed later.
    torch.save({
        'model_state_dict': m.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, 'model.pth')
    print("Training complete!")
print("Generating response...\n")
resp = run_model(m, 256)
print("Response:", resp)
|
pyproject.toml
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[tool.poetry]
|
| 2 |
+
name = "bad-gpt"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = ""
|
| 5 |
+
authors = []
|
| 6 |
+
readme = "README.md"
|
| 7 |
+
package-mode = false
|
| 8 |
+
|
| 9 |
+
[tool.poetry.dependencies]
|
| 10 |
+
python = "^3.10"
|
| 11 |
+
torch = "^2.3.0"
|
| 12 |
+
numpy = "^1.26.4"
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
[build-system]
|
| 16 |
+
requires = ["poetry-core"]
|
| 17 |
+
build-backend = "poetry.core.masonry.api"
|