Commit 5649c37 · 1 parent: 6b094a2
autoformat

Files changed (the hunks below are shown as they read after the autoformat pass):
- GPTLanguageModelClass.py  +51 -39
- app.py  +34 -14
- bigram/bigram_testing.sync.py  +24 -17
- extract.py  +7 -3
- simple_gpt/gpt_shakespeare.sync.py  +67 -48
- train.py  +37 -22
GPTLanguageModelClass.py
CHANGED
@@ -2,6 +2,7 @@ import torch
import torch.nn as nn
from torch.nn import functional as F


class hyperparams:
    block_size = 128
    batch_size = 32

@@ -14,6 +15,7 @@ class hyperparams:
    dropout = 0.2
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


block_size = hyperparams.block_size
batch_size = hyperparams.batch_size
max_iters = hyperparams.max_iters

@@ -25,36 +27,40 @@ n_layer = hyperparams.n_layer
dropout = hyperparams.dropout
device = hyperparams.device


class Head(nn.Module):
    """one head of self-attention"""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B, T, C = x.shape
        k = self.key(x)  # (B,T,hs)
        q = self.query(x)  # (B,T,hs)
        # compute attention scores ("affinities")
        wei = (
            q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
        )  # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf"))  # (B, T, T)
        wei = F.softmax(wei, dim=-1)  # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x)  # (B,T,hs)
        out = wei @ v  # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out


class MultiHeadAttention(nn.Module):
    """multiple heads of self-attention in parallel"""

    def __init__(self, num_heads, head_size):
        super().__init__()

@@ -63,12 +69,15 @@ class MultiHeadAttention(nn.Module):
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat(
            [h(x) for h in self.heads], dim=-1
        )  # (B, T, F) -> (B, T, [h1, h1, h1, h1, h2, h2, h2, h2, h3, h3, h3, h3])
        out = self.dropout(self.proj(out))
        return out


class FeedFoward(nn.Module):
    """a simple linear layer followed by a non-linearity"""

    def __init__(self, n_embd):
        super().__init__()

@@ -81,9 +90,10 @@ class FeedFoward(nn.Module):

    def forward(self, x):
        return self.net(x)


class Block(nn.Module):
    """Transformer block: communication followed by computation"""

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like

@@ -100,17 +110,19 @@ class Block(nn.Module):
        y = self.ffwd(x)
        x = self.ln2(x + y)
        return x


class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(
            *[Block(n_embd, n_head=n_head) for _ in range(n_layer)]
        )
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        self.apply(self._init_weights)

    def _init_weights(self, module):

@@ -123,25 +135,26 @@ class GPTLanguageModel(nn.Module):

    def forward(self, index, targets=None):
        B, T = index.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(index)  # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T,C)
        x = tok_emb + pos_emb  # (B,T,C)
        x = self.blocks(x)  # (B,T,C)
        x = self.ln_f(x)  # (B,T,C)
        logits = self.lm_head(x)  # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(
                B * T, C
            )  # reshape to what torch.cross_entropy expects
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):

@@ -150,12 +163,11 @@ class GPTLanguageModel(nn.Module):
            # get the predictions
            logits, loss = self.forward(index_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1)  # (B, T+1)
        return index
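For reference, the attention hunk above only re-wraps the masked scaled dot-product computation in Head.forward; the math is unchanged. A minimal standalone sketch of that same computation follows, with toy sizes (B=2, T=4, hs=8) chosen here purely to make the shape comments concrete; they are not the repo's hyperparameters.

import torch
import torch.nn.functional as F

B, T, hs = 2, 4, 8                                   # toy batch, time, head sizes
q = torch.randn(B, T, hs)                            # queries  (B, T, hs)
k = torch.randn(B, T, hs)                            # keys     (B, T, hs)
v = torch.randn(B, T, hs)                            # values   (B, T, hs)

tril = torch.tril(torch.ones(T, T))                  # causal mask, like the "tril" buffer

wei = q @ k.transpose(-2, -1) * hs ** -0.5           # (B, T, T) scaled affinities
wei = wei.masked_fill(tril == 0, float("-inf"))      # block attention to future positions
wei = F.softmax(wei, dim=-1)                         # each row now sums to 1
out = wei @ v                                        # (B, T, hs) weighted sum of values

print(out.shape)  # torch.Size([2, 4, 8])

The triangular mask is what keeps each position from attending to later tokens, which is what allows the model to be trained as a next-token predictor.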
app.py
CHANGED
@@ -1,7 +1,7 @@
import streamlit as st
import torch
import os
from GPTLanguageModelClass import hyperparams

block_size = hyperparams.block_size
batch_size = hyperparams.batch_size

@@ -14,36 +14,56 @@ n_layer = hyperparams.n_layer
dropout = hyperparams.dropout
device = hyperparams.device

st.title("LLM from scratch Demo")

st.write(f"Using device: {device}")

if not os.path.exists("./vocab.txt"):
    raise Exception("Please run extract.py first")
chars = ""
with open("./vocab.txt", "r", encoding="utf-8") as f:
    text = f.read()
    chars = sorted(list(set(text)))

st.write(f"Vocab size: {len(chars)}")
st.write(f"Block size: {block_size}")
st.write(f"Batch size: {batch_size}")
st.write(f"Max iters: {max_iters}")
st.write(f"Learning rate: {learning_rate}")
st.write(f"Eval every: {eval_every}")
st.write(f"n_embd: {n_embd}")
st.write(f"n_head: {n_head}")
st.write(f"n_layer: {n_layer}")
st.write(f"dropout: {dropout}")

string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    return [string_to_int[ch] for ch in s]


def decode(x):
    return "".join([int_to_string[i] for i in x])


model_pickle_path = "./model.pt"

st.write("loading model parameters...")
with open(model_pickle_path, "rb") as f:
    model = torch.load(f, map_location=device, weights_only=False)
st.write("model loaded successfully!")

prompt = ""
prompt = st.text_area(
    "Prompt:", value=prompt, height=100, max_chars=block_size - 1, key="prompt"
)
if len(prompt) != 0:
    context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
    max_new_tokens = block_size - len(prompt)
    generated_chars = decode(
        model.generate(context.unsqueeze(0), max_new_tokens=max_new_tokens)[0].tolist()
    )
    st.write("Generated text:")
    st.write(generated_chars)
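As a quick illustration of the character-level encode/decode pair that app.py builds from vocab.txt, here is a self-contained round-trip sketch; the toy vocabulary below is invented for the example and is not the repo's actual vocab file.

chars = sorted(list(set("hello world")))              # stand-in for the chars read from vocab.txt
string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = {i: ch for i, ch in enumerate(chars)}

def encode(s):
    return [string_to_int[ch] for ch in s]

def decode(x):
    return "".join([int_to_string[i] for i in x])

ids = encode("hello")
print(ids)          # [3, 2, 4, 4, 5] for this toy vocab
print(decode(ids))  # "hello" -> the round trip is lossless for in-vocab characters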
bigram/bigram_testing.sync.py
CHANGED
@@ -14,6 +14,7 @@
import torch
import torch.nn as nn
from torch.nn import functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
block_size = 8

@@ -40,7 +41,7 @@ string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = {i: ch for i, ch in enumerate(chars)}

encode = lambda s: [string_to_int[ch] for ch in s]
decode = lambda x: "".join([int_to_string[i] for i in x])

data = torch.tensor(encode(text), dtype=torch.long, device=device)

@@ -50,20 +51,23 @@ n = int(0.8 * len(data))
train_data = data[:n]
val_data = data[n:]


# %%
def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i : i + block_size] for i in ix])
    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y


# %%
x, y = get_batch("train")

# %%


class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()

@@ -75,34 +79,38 @@ class BigramLanguageModel(nn.Module):
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(
                B * T, C
            )  # reshape to what torch.cross_entropy expects
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self.forward(index)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1)  # (B, T+1)
        return index


# %%
model = BigramLanguageModel(vocab_size).to(device)

context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_chars = decode(model.generate(context, max_new_tokens=100)[0].tolist())
print(generated_chars)

# %% [markdown]
#
# ### Some common optimizers
# 1. **Mean Squared Error (MSE)**: MSE is a common loss function used in regression problems, where the goal is to predict a continuous output. It measures the average squared difference between the predicted and actual values, and is often used to train neural networks for regression tasks.
# 2. **Gradient Descent (GD):** is an optimization algorithm used to minimize the loss function of a machine learning model. The loss function measures how well the model is able to predict the target variable based on the input features. The idea of GD is to iteratively adjust the model parameters in the direction of the steepest descent of the loss function

@@ -118,7 +126,7 @@ optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    # sample a batch
    xb, yb = get_batch("train")

    # evaluate the loss
    logits, loss = model.forward(xb, yb)

@@ -133,7 +141,6 @@ print(loss.item())

# %%

context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_chars = decode(model.generate(context, max_new_tokens=100)[0].tolist())
print(generated_chars)
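The markdown cell in this notebook mentions mean squared error and gradient descent. A minimal sketch of both on a one-parameter linear fit follows; the toy data is invented for the example and is not from the repo.

import torch

# toy data: y = 3x, with a single learnable weight w
x = torch.tensor([1.0, 2.0, 3.0, 4.0])
y = 3.0 * x
w = torch.tensor(0.0, requires_grad=True)

lr = 0.05                                 # learning rate (step size)
for step in range(100):
    pred = w * x
    loss = ((pred - y) ** 2).mean()       # mean squared error
    loss.backward()                       # compute d(loss)/dw
    with torch.no_grad():
        w -= lr * w.grad                  # one gradient-descent step
        w.grad.zero_()

print(round(w.item(), 3))                 # converges close to 3.0

AdamW, which the training loops in this repo use, follows the same loop structure but adapts the step size per parameter and decouples weight decay from the gradient update.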
extract.py
CHANGED
@@ -2,13 +2,17 @@ import os
import lzma
from tqdm import tqdm


def xz_files_in_dir(directory):
    files = []
    for filename in os.listdir(directory):
        if filename.endswith(".xz") and os.path.isfile(
            os.path.join(directory, filename)
        ):
            files.append(filename)
    return files


tarxz_path = "./openwebtext.tar.xz"
folder_path = "./openwebtext"
output_file_train = "./openwebtext/train_split.txt"

@@ -29,7 +33,7 @@ files = xz_files_in_dir(folder_path)
total_files = len(files)

# Calculate the split indices
split_index = int(total_files * 0.9)  # 90% for training
files_train = files[:split_index]
files_val = files[split_index:]

@@ -62,4 +66,4 @@ if not os.path.exists(output_file_val):
if not os.path.exists(vocab_file):
    with open(vocab_file, "w", encoding="utf-8") as vfile:
        for char in vocab:
            vfile.write(char + "\n")
simple_gpt/gpt_shakespeare.sync.py
CHANGED
@@ -14,6 +14,7 @@
import torch
import torch.nn as nn
from torch.nn import functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
block_size = 128

@@ -42,7 +43,7 @@ string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = {i: ch for i, ch in enumerate(chars)}

encode = lambda s: [string_to_int[ch] for ch in s]
decode = lambda x: "".join([int_to_string[i] for i in x])

data = torch.tensor(encode(text), dtype=torch.long, device=device)

@@ -52,21 +53,23 @@ n = int(0.8 * len(data))
train_data = data[:n]
val_data = data[n:]


# %%
def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i : i + block_size] for i in ix])
    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y


# %%
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ["train", "val"]:
        losses = torch.zeros(eval_every)
        for k in range(eval_every):
            X, Y = get_batch(split)

@@ -76,41 +79,46 @@ def estimate_loss():
    model.train()
    return out


# %%


class Head(nn.Module):
    """one head of self-attention"""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B, T, C = x.shape
        k = self.key(x)  # (B,T,hs)
        q = self.query(x)  # (B,T,hs)
        # compute attention scores ("affinities")
        wei = (
            q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
        )  # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf"))  # (B, T, T)
        wei = F.softmax(wei, dim=-1)  # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x)  # (B,T,hs)
        out = wei @ v  # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out


# [1, 0, 0]
# [1, 0.6, 0]
# [1, 0.6, 0.4]
class MultiHeadAttention(nn.Module):
    """multiple heads of self-attention in parallel"""

    def __init__(self, num_heads, head_size):
        super().__init__()

@@ -119,13 +127,15 @@ class MultiHeadAttention(nn.Module):
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat(
            [h(x) for h in self.heads], dim=-1
        )  # (B, T, F) -> (B, T, [h1, h1, h1, h1, h2, h2, h2, h2, h3, h3, h3, h3])
        out = self.dropout(self.proj(out))
        return out


class FeedFoward(nn.Module):
    """a simple linear layer followed by a non-linearity"""

    def __init__(self, n_embd):
        super().__init__()

@@ -138,9 +148,10 @@ class FeedFoward(nn.Module):

    def forward(self, x):
        return self.net(x)


class Block(nn.Module):
    """Transformer block: communication followed by computation"""

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like

@@ -157,17 +168,19 @@ class Block(nn.Module):
        y = self.ffwd(x)
        x = self.ln2(x + y)
        return x


class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(
            *[Block(n_embd, n_head=n_head) for _ in range(n_layer)]
        )
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        self.apply(self._init_weights)

    def _init_weights(self, module):

@@ -180,25 +193,26 @@ class GPTLanguageModel(nn.Module):

    def forward(self, index, targets=None):
        B, T = index.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(index)  # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T,C)
        x = tok_emb + pos_emb  # (B,T,C)
        x = self.blocks(x)  # (B,T,C)
        x = self.ln_f(x)  # (B,T,C)
        logits = self.lm_head(x)  # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(
                B * T, C
            )  # reshape to what torch.cross_entropy expects
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):

@@ -207,15 +221,16 @@ class GPTLanguageModel(nn.Module):
            # get the predictions
            logits, loss = self.forward(index_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1)  # (B, T+1)
        return index


model = GPTLanguageModel(vocab_size).to(device)

# create a PyTorch optimizer

@@ -224,10 +239,12 @@ optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
for iter in range(max_iters):
    if iter % eval_every == 0:
        losses = estimate_loss()
        print(
            f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}"
        )

    # sample a batch of data
    xb, yb = get_batch("train")

    # evaluate the loss
    logits, loss = model.forward(xb, yb)

@@ -238,14 +255,16 @@ print(loss.item())

# %%

context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_chars = decode(model.generate(context, max_new_tokens=100)[0].tolist())
print(generated_chars)


# %%

prompt = "To be or not to be,"
context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
generated_chars = decode(
    model.generate(context.unsqueeze(0), max_new_tokens=100)[0].tolist()
)
print(generated_chars)
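Both GPT scripts flatten the (B, T, C) logits before calling F.cross_entropy. A small sketch with toy shapes (not the repo's real batch or vocab sizes) shows the reshape that logits.view(B * T, C) performs:

import torch
import torch.nn.functional as F

B, T, C = 2, 5, 10                        # toy batch, time, vocab sizes
logits = torch.randn(B, T, C)             # what lm_head produces
targets = torch.randint(0, C, (B, T))     # next-token ids

# The simplest form F.cross_entropy accepts is (N, C) scores with (N,) class
# indices, so every (batch, time) position becomes one row of the score matrix.
loss = F.cross_entropy(logits.view(B * T, C), targets.view(B * T))
print(loss.item())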
train.py
CHANGED
@@ -17,54 +17,66 @@ device = hyperparams.device

print(device)

if (
    not os.path.exists("./vocab.txt")
    or not os.path.exists("./openwebtext/train_split.txt")
    or not os.path.exists("./openwebtext/val_split.txt")
):
    raise Exception("Please run extract.py first")
chars = ""
with open("./vocab.txt", "r", encoding="utf-8") as f:
    text = f.read()
    chars = sorted(list(set(text)))

vocab_size = len(chars)

string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = {i: ch for i, ch in enumerate(chars)}

encode = lambda s: [string_to_int[ch] for ch in s]
decode = lambda x: "".join([int_to_string[i] for i in x])


# memory map for using small snippets of text from a single file of any size
def get_random_chunk(split):
    filename = (
        "./openwebtext/train_split.txt"
        if split == "train"
        else "./openwebtext/val_split.txt"
    )
    with open(filename, "rb") as f:
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            # Determine the file size and a random position to start reading
            file_size = len(mm)
            start_pos = random.randint(0, (file_size) - block_size * batch_size)

            # Seek to the random position and read the block of text
            mm.seek(start_pos)
            block = mm.read(block_size * batch_size - 1)

            # Decode the block to a string, ignoring any invalid byte sequences
            decoded_block = block.decode("utf-8", errors="ignore").replace("\r", "")

            # Train and test splits
            data = torch.tensor(encode(decoded_block), dtype=torch.long)

    return data


def get_batch(split):
    data = get_random_chunk(split)
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i : i + block_size] for i in ix])
    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y


@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ["train", "val"]:
        losses = torch.zeros(eval_every)
        for k in range(eval_every):
            X, Y = get_batch(split)

@@ -74,24 +86,27 @@ def estimate_loss():
    model.train()
    return out


model = GPTLanguageModel(vocab_size).to(device)

model_pickle_path = "./model.pt"
if os.path.exists(model_pickle_path):
    print("loading model parameters...")
    with open(model_pickle_path, "rb") as f:
        model = torch.load(f, map_location=device)
    print("loaded successfully!")
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_every == 0:
        losses = estimate_loss()
        print(
            f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}"
        )

    # sample a batch of data
    xb, yb = get_batch("train")

    # evaluate the loss
    logits, loss = model.forward(xb, yb)

@@ -100,6 +115,6 @@ for iter in range(max_iters):
    optimizer.step()
    print(loss.item())

with open(model_pickle_path, "wb") as f:
    torch.save(model, f)
print("model saved")
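train.py samples random chunks through a memory map rather than loading a whole split into RAM. A self-contained sketch of that pattern on a throwaway temporary file follows (the real script maps ./openwebtext/train_split.txt and val_split.txt, which are far larger):

import mmap
import os
import random
import tempfile

# make a throwaway file so the sketch is self-contained
with tempfile.NamedTemporaryFile("w", delete=False, suffix=".txt") as tmp:
    tmp.write("hello world " * 1000)
    path = tmp.name

chunk_size = 64
with open(path, "rb") as f:
    with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
        start = random.randint(0, len(mm) - chunk_size)  # random offset into the file
        mm.seek(start)
        chunk = mm.read(chunk_size)                       # only this slice is read from disk
print(chunk.decode("utf-8", errors="ignore"))

os.remove(path)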