Commit 5649c37 · 1 parent: 6b094a2
autoformat

Files changed (the hunks below are shown as they read after the autoformat pass):
- GPTLanguageModelClass.py  +51 -39
- app.py  +34 -14
- bigram/bigram_testing.sync.py  +24 -17
- extract.py  +7 -3
- simple_gpt/gpt_shakespeare.sync.py  +67 -48
- train.py  +37 -22
GPTLanguageModelClass.py
CHANGED
@@ -2,6 +2,7 @@ import torch
import torch.nn as nn
from torch.nn import functional as F


class hyperparams:
    block_size = 128
    batch_size = 32

@@ -14,6 +15,7 @@ class hyperparams:
    dropout = 0.2
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


block_size = hyperparams.block_size
batch_size = hyperparams.batch_size
max_iters = hyperparams.max_iters

@@ -25,36 +27,40 @@ n_layer = hyperparams.n_layer
dropout = hyperparams.dropout
device = hyperparams.device


class Head(nn.Module):
    """one head of self-attention"""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B, T, C = x.shape
        k = self.key(x)  # (B,T,hs)
        q = self.query(x)  # (B,T,hs)
        # compute attention scores ("affinities")
        wei = (
            q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
        )  # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf"))  # (B, T, T)
        wei = F.softmax(wei, dim=-1)  # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x)  # (B,T,hs)
        out = wei @ v  # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out


class MultiHeadAttention(nn.Module):
    """multiple heads of self-attention in parallel"""

    def __init__(self, num_heads, head_size):
        super().__init__()

@@ -63,12 +69,15 @@ class MultiHeadAttention(nn.Module):
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat(
            [h(x) for h in self.heads], dim=-1
        )  # (B, T, F) -> (B, T, [h1, h1, h1, h1, h2, h2, h2, h2, h3, h3, h3, h3])
        out = self.dropout(self.proj(out))
        return out


class FeedFoward(nn.Module):
    """a simple linear layer followed by a non-linearity"""

    def __init__(self, n_embd):
        super().__init__()

@@ -81,9 +90,10 @@ class FeedFoward(nn.Module):

    def forward(self, x):
        return self.net(x)


class Block(nn.Module):
    """Transformer block: communication followed by computation"""

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like

@@ -100,17 +110,19 @@ class Block(nn.Module):
        y = self.ffwd(x)
        x = self.ln2(x + y)
        return x


class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(
            *[Block(n_embd, n_head=n_head) for _ in range(n_layer)]
        )
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        self.apply(self._init_weights)

    def _init_weights(self, module):

@@ -123,25 +135,26 @@ class GPTLanguageModel(nn.Module):

    def forward(self, index, targets=None):
        B, T = index.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(index)  # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T,C)
        x = tok_emb + pos_emb  # (B,T,C)
        x = self.blocks(x)  # (B,T,C)
        x = self.ln_f(x)  # (B,T,C)
        logits = self.lm_head(x)  # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(
                B * T, C
            )  # reshape to what torch.cross_entropy expects
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):

@@ -150,12 +163,11 @@ class GPTLanguageModel(nn.Module):
            # get the predictions
            logits, loss = self.forward(index_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1)  # (B, T+1)
        return index
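For reference, the attention hunk above only re-wraps the masked scaled dot-product computation in Head.forward; the math is unchanged. A minimal standalone sketch of that same computation follows, with toy sizes (B=2, T=4, hs=8) chosen here purely to make the shape comments concrete; they are not the repo's hyperparameters.

import torch
import torch.nn.functional as F

B, T, hs = 2, 4, 8                                   # toy batch, time, head sizes
q = torch.randn(B, T, hs)                            # queries  (B, T, hs)
k = torch.randn(B, T, hs)                            # keys     (B, T, hs)
v = torch.randn(B, T, hs)                            # values   (B, T, hs)

tril = torch.tril(torch.ones(T, T))                  # causal mask, like the "tril" buffer

wei = q @ k.transpose(-2, -1) * hs ** -0.5           # (B, T, T) scaled affinities
wei = wei.masked_fill(tril == 0, float("-inf"))      # block attention to future positions
wei = F.softmax(wei, dim=-1)                         # each row now sums to 1
out = wei @ v                                        # (B, T, hs) weighted sum of values

print(out.shape)  # torch.Size([2, 4, 8])

The triangular mask is what keeps each position from attending to later tokens, which is what allows the model to be trained as a next-token predictor.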
app.py
CHANGED
@@ -1,7 +1,7 @@
import streamlit as st
import torch
import os
from GPTLanguageModelClass import hyperparams

block_size = hyperparams.block_size
batch_size = hyperparams.batch_size

@@ -14,36 +14,56 @@ n_layer = hyperparams.n_layer
dropout = hyperparams.dropout
device = hyperparams.device

st.title("LLM from scratch Demo")

st.write(f"Using device: {device}")

if not os.path.exists("./vocab.txt"):
    raise Exception("Please run extract.py first")
chars = ""
with open("./vocab.txt", "r", encoding="utf-8") as f:
    text = f.read()
    chars = sorted(list(set(text)))

st.write(f"Vocab size: {len(chars)}")
st.write(f"Block size: {block_size}")
st.write(f"Batch size: {batch_size}")
st.write(f"Max iters: {max_iters}")
st.write(f"Learning rate: {learning_rate}")
st.write(f"Eval every: {eval_every}")
st.write(f"n_embd: {n_embd}")
st.write(f"n_head: {n_head}")
st.write(f"n_layer: {n_layer}")
st.write(f"dropout: {dropout}")

string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    return [string_to_int[ch] for ch in s]


def decode(x):
    return "".join([int_to_string[i] for i in x])


model_pickle_path = "./model.pt"

st.write("loading model parameters...")
with open(model_pickle_path, "rb") as f:
    model = torch.load(f, map_location=device, weights_only=False)
st.write("model loaded successfully!")

prompt = ""
prompt = st.text_area(
    "Prompt:", value=prompt, height=100, max_chars=block_size - 1, key="prompt"
)
if len(prompt) != 0:
    context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
    max_new_tokens = block_size - len(prompt)
    generated_chars = decode(
        model.generate(context.unsqueeze(0), max_new_tokens=max_new_tokens)[0].tolist()
    )
    st.write("Generated text:")
    st.write(generated_chars)
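As a quick illustration of the character-level encode/decode pair that app.py builds from vocab.txt, here is a self-contained round-trip sketch; the toy vocabulary below is invented for the example and is not the repo's actual vocab file.

chars = sorted(list(set("hello world")))              # stand-in for the chars read from vocab.txt
string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = {i: ch for i, ch in enumerate(chars)}

def encode(s):
    return [string_to_int[ch] for ch in s]

def decode(x):
    return "".join([int_to_string[i] for i in x])

ids = encode("hello")
print(ids)          # [3, 2, 4, 4, 5] for this toy vocab
print(decode(ids))  # "hello" -> the round trip is lossless for in-vocab characters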
bigram/bigram_testing.sync.py
CHANGED
@@ -14,6 +14,7 @@
import torch
import torch.nn as nn
from torch.nn import functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
block_size = 8

@@ -40,7 +41,7 @@ string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = {i: ch for i, ch in enumerate(chars)}

encode = lambda s: [string_to_int[ch] for ch in s]
decode = lambda x: "".join([int_to_string[i] for i in x])

data = torch.tensor(encode(text), dtype=torch.long, device=device)

@@ -50,20 +51,23 @@ n = int(0.8 * len(data))
train_data = data[:n]
val_data = data[n:]


# %%
def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i : i + block_size] for i in ix])
    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y


# %%
x, y = get_batch("train")

# %%


class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()

@@ -75,34 +79,38 @@ class BigramLanguageModel(nn.Module):
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(
                B * T, C
            )  # reshape to what torch.cross_entropy expects
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self.forward(index)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1)  # (B, T+1)
        return index


# %%
model = BigramLanguageModel(vocab_size).to(device)

context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_chars = decode(model.generate(context, max_new_tokens=100)[0].tolist())
print(generated_chars)

# %% [markdown]
#
# ### Some common optimizers
# 1. **Mean Squared Error (MSE)**: MSE is a common loss function used in regression problems, where the goal is to predict a continuous output. It measures the average squared difference between the predicted and actual values, and is often used to train neural networks for regression tasks.
# 2. **Gradient Descent (GD):** is an optimization algorithm used to minimize the loss function of a machine learning model. The loss function measures how well the model is able to predict the target variable based on the input features. The idea of GD is to iteratively adjust the model parameters in the direction of the steepest descent of the loss function

@@ -118,7 +126,7 @@ optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    # sample a batch
    xb, yb = get_batch("train")

    # evaluate the loss
    logits, loss = model.forward(xb, yb)

@@ -133,7 +141,6 @@ print(loss.item())

# %%

context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_chars = decode(model.generate(context, max_new_tokens=100)[0].tolist())
print(generated_chars)
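The markdown cell in this notebook mentions mean squared error and gradient descent. A minimal sketch of both on a one-parameter linear fit follows; the toy data is invented for the example and is not from the repo.

import torch

# toy data: y = 3x, with a single learnable weight w
x = torch.tensor([1.0, 2.0, 3.0, 4.0])
y = 3.0 * x
w = torch.tensor(0.0, requires_grad=True)

lr = 0.05                                 # learning rate (step size)
for step in range(100):
    pred = w * x
    loss = ((pred - y) ** 2).mean()       # mean squared error
    loss.backward()                       # compute d(loss)/dw
    with torch.no_grad():
        w -= lr * w.grad                  # one gradient-descent step
        w.grad.zero_()

print(round(w.item(), 3))                 # converges close to 3.0

AdamW, which the training loops in this repo use, follows the same loop structure but adapts the step size per parameter and decouples weight decay from the gradient update.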
extract.py
CHANGED
@@ -2,13 +2,17 @@ import os
import lzma
from tqdm import tqdm


def xz_files_in_dir(directory):
    files = []
    for filename in os.listdir(directory):
        if filename.endswith(".xz") and os.path.isfile(
            os.path.join(directory, filename)
        ):
            files.append(filename)
    return files


tarxz_path = "./openwebtext.tar.xz"
folder_path = "./openwebtext"
output_file_train = "./openwebtext/train_split.txt"

@@ -29,7 +33,7 @@ files = xz_files_in_dir(folder_path)
total_files = len(files)

# Calculate the split indices
split_index = int(total_files * 0.9)  # 90% for training
files_train = files[:split_index]
files_val = files[split_index:]

@@ -62,4 +66,4 @@ if not os.path.exists(output_file_val):
if not os.path.exists(vocab_file):
    with open(vocab_file, "w", encoding="utf-8") as vfile:
        for char in vocab:
            vfile.write(char + "\n")
simple_gpt/gpt_shakespeare.sync.py
CHANGED
@@ -14,6 +14,7 @@
import torch
import torch.nn as nn
from torch.nn import functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
block_size = 128

@@ -42,7 +43,7 @@ string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = {i: ch for i, ch in enumerate(chars)}

encode = lambda s: [string_to_int[ch] for ch in s]
decode = lambda x: "".join([int_to_string[i] for i in x])

data = torch.tensor(encode(text), dtype=torch.long, device=device)

@@ -52,21 +53,23 @@ n = int(0.8 * len(data))
train_data = data[:n]
val_data = data[n:]


# %%
def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i : i + block_size] for i in ix])
    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y


# %%
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ["train", "val"]:
        losses = torch.zeros(eval_every)
        for k in range(eval_every):
            X, Y = get_batch(split)

@@ -76,41 +79,46 @@ def estimate_loss():
    model.train()
    return out


# %%


class Head(nn.Module):
    """one head of self-attention"""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B, T, C = x.shape
        k = self.key(x)  # (B,T,hs)
        q = self.query(x)  # (B,T,hs)
        # compute attention scores ("affinities")
        wei = (
            q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
        )  # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf"))  # (B, T, T)
        wei = F.softmax(wei, dim=-1)  # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x)  # (B,T,hs)
        out = wei @ v  # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out


# [1, 0, 0]
# [1, 0.6, 0]
# [1, 0.6, 0.4]
class MultiHeadAttention(nn.Module):
    """multiple heads of self-attention in parallel"""

    def __init__(self, num_heads, head_size):
        super().__init__()

@@ -119,13 +127,15 @@ class MultiHeadAttention(nn.Module):
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat(
            [h(x) for h in self.heads], dim=-1
        )  # (B, T, F) -> (B, T, [h1, h1, h1, h1, h2, h2, h2, h2, h3, h3, h3, h3])
        out = self.dropout(self.proj(out))
        return out


class FeedFoward(nn.Module):
    """a simple linear layer followed by a non-linearity"""

    def __init__(self, n_embd):
        super().__init__()

@@ -138,9 +148,10 @@ class FeedFoward(nn.Module):

    def forward(self, x):
        return self.net(x)


class Block(nn.Module):
    """Transformer block: communication followed by computation"""

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like

@@ -157,17 +168,19 @@ class Block(nn.Module):
        y = self.ffwd(x)
        x = self.ln2(x + y)
        return x


class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(
            *[Block(n_embd, n_head=n_head) for _ in range(n_layer)]
        )
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        self.apply(self._init_weights)

    def _init_weights(self, module):

@@ -180,25 +193,26 @@ class GPTLanguageModel(nn.Module):

    def forward(self, index, targets=None):
        B, T = index.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(index)  # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T,C)
        x = tok_emb + pos_emb  # (B,T,C)
        x = self.blocks(x)  # (B,T,C)
        x = self.ln_f(x)  # (B,T,C)
        logits = self.lm_head(x)  # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(
                B * T, C
            )  # reshape to what torch.cross_entropy expects
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):

@@ -207,15 +221,16 @@ class GPTLanguageModel(nn.Module):
            # get the predictions
            logits, loss = self.forward(index_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1)  # (B, T+1)
        return index


model = GPTLanguageModel(vocab_size).to(device)

# create a PyTorch optimizer

@@ -224,10 +239,12 @@ optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
for iter in range(max_iters):
    if iter % eval_every == 0:
        losses = estimate_loss()
        print(
            f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}"
        )

    # sample a batch of data
    xb, yb = get_batch("train")

    # evaluate the loss
    logits, loss = model.forward(xb, yb)

@@ -238,14 +255,16 @@ print(loss.item())

# %%

context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_chars = decode(model.generate(context, max_new_tokens=100)[0].tolist())
print(generated_chars)


# %%

prompt = "To be or not to be,"
context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
generated_chars = decode(
    model.generate(context.unsqueeze(0), max_new_tokens=100)[0].tolist()
)
print(generated_chars)
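Both GPT scripts flatten the (B, T, C) logits before calling F.cross_entropy. A small sketch with toy shapes (not the repo's real batch or vocab sizes) shows the reshape that logits.view(B * T, C) performs:

import torch
import torch.nn.functional as F

B, T, C = 2, 5, 10                        # toy batch, time, vocab sizes
logits = torch.randn(B, T, C)             # what lm_head produces
targets = torch.randint(0, C, (B, T))     # next-token ids

# The simplest form F.cross_entropy accepts is (N, C) scores with (N,) class
# indices, so every (batch, time) position becomes one row of the score matrix.
loss = F.cross_entropy(logits.view(B * T, C), targets.view(B * T))
print(loss.item())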
train.py
CHANGED
@@ -17,54 +17,66 @@ device = hyperparams.device

print(device)

if (
    not os.path.exists("./vocab.txt")
    or not os.path.exists("./openwebtext/train_split.txt")
    or not os.path.exists("./openwebtext/val_split.txt")
):
    raise Exception("Please run extract.py first")
chars = ""
with open("./vocab.txt", "r", encoding="utf-8") as f:
    text = f.read()
    chars = sorted(list(set(text)))

vocab_size = len(chars)

string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = {i: ch for i, ch in enumerate(chars)}

encode = lambda s: [string_to_int[ch] for ch in s]
decode = lambda x: "".join([int_to_string[i] for i in x])


# memory map for using small snippets of text from a single file of any size
def get_random_chunk(split):
    filename = (
        "./openwebtext/train_split.txt"
        if split == "train"
        else "./openwebtext/val_split.txt"
    )
    with open(filename, "rb") as f:
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            # Determine the file size and a random position to start reading
            file_size = len(mm)
            start_pos = random.randint(0, (file_size) - block_size * batch_size)

            # Seek to the random position and read the block of text
            mm.seek(start_pos)
            block = mm.read(block_size * batch_size - 1)

            # Decode the block to a string, ignoring any invalid byte sequences
            decoded_block = block.decode("utf-8", errors="ignore").replace("\r", "")

            # Train and test splits
            data = torch.tensor(encode(decoded_block), dtype=torch.long)

    return data


def get_batch(split):
    data = get_random_chunk(split)
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i : i + block_size] for i in ix])
    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y


@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ["train", "val"]:
        losses = torch.zeros(eval_every)
        for k in range(eval_every):
            X, Y = get_batch(split)

@@ -74,24 +86,27 @@ def estimate_loss():
    model.train()
    return out


model = GPTLanguageModel(vocab_size).to(device)

model_pickle_path = "./model.pt"
if os.path.exists(model_pickle_path):
    print("loading model parameters...")
    with open(model_pickle_path, "rb") as f:
        model = torch.load(f, map_location=device)
    print("loaded successfully!")
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_every == 0:
        losses = estimate_loss()
        print(
            f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}"
        )

    # sample a batch of data
    xb, yb = get_batch("train")

    # evaluate the loss
    logits, loss = model.forward(xb, yb)

@@ -100,6 +115,6 @@ for iter in range(max_iters):
    optimizer.step()
    print(loss.item())

with open(model_pickle_path, "wb") as f:
    torch.save(model, f)
print("model saved")
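train.py samples random chunks through a memory map rather than loading a whole split into RAM. A self-contained sketch of that pattern on a throwaway temporary file follows (the real script maps ./openwebtext/train_split.txt and val_split.txt, which are far larger):

import mmap
import os
import random
import tempfile

# make a throwaway file so the sketch is self-contained
with tempfile.NamedTemporaryFile("w", delete=False, suffix=".txt") as tmp:
    tmp.write("hello world " * 1000)
    path = tmp.name

chunk_size = 64
with open(path, "rb") as f:
    with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
        start = random.randint(0, len(mm) - chunk_size)  # random offset into the file
        mm.seek(start)
        chunk = mm.read(chunk_size)                       # only this slice is read from disk
print(chunk.decode("utf-8", errors="ignore"))

os.remove(path)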