ibrahimmkhalid committed
Commit 5649c37 · 1 Parent(s): 6b094a2

autoformat

GPTLanguageModelClass.py CHANGED
@@ -2,6 +2,7 @@ import torch
 import torch.nn as nn
 from torch.nn import functional as F
 
+
 class hyperparams:
     block_size = 128
     batch_size = 32
@@ -14,6 +15,7 @@ class hyperparams:
     dropout = 0.2
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+
 block_size = hyperparams.block_size
 batch_size = hyperparams.batch_size
 max_iters = hyperparams.max_iters
@@ -25,36 +27,40 @@ n_layer = hyperparams.n_layer
 dropout = hyperparams.dropout
 device = hyperparams.device
 
+
 class Head(nn.Module):
-    """ one head of self-attention """
+    """one head of self-attention"""
 
     def __init__(self, head_size):
         super().__init__()
         self.key = nn.Linear(n_embd, head_size, bias=False)
         self.query = nn.Linear(n_embd, head_size, bias=False)
         self.value = nn.Linear(n_embd, head_size, bias=False)
-        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
+        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))
 
         self.dropout = nn.Dropout(dropout)
 
     def forward(self, x):
         # input of size (batch, time-step, channels)
         # output of size (batch, time-step, head size)
-        B,T,C = x.shape
-        k = self.key(x) # (B,T,hs)
-        q = self.query(x) # (B,T,hs)
+        B, T, C = x.shape
+        k = self.key(x)  # (B,T,hs)
+        q = self.query(x)  # (B,T,hs)
         # compute attention scores ("affinities")
-        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
-        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
-        wei = F.softmax(wei, dim=-1) # (B, T, T)
+        wei = (
+            q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
+        )  # (B, T, hs) @ (B, hs, T) -> (B, T, T)
+        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf"))  # (B, T, T)
+        wei = F.softmax(wei, dim=-1)  # (B, T, T)
         wei = self.dropout(wei)
         # perform the weighted aggregation of the values
-        v = self.value(x) # (B,T,hs)
-        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
+        v = self.value(x)  # (B,T,hs)
+        out = wei @ v  # (B, T, T) @ (B, T, hs) -> (B, T, hs)
         return out
 
+
 class MultiHeadAttention(nn.Module):
-    """ multiple heads of self-attention in parallel """
+    """multiple heads of self-attention in parallel"""
 
     def __init__(self, num_heads, head_size):
         super().__init__()
@@ -63,12 +69,15 @@ class MultiHeadAttention(nn.Module):
         self.dropout = nn.Dropout(dropout)
 
     def forward(self, x):
-        out = torch.cat([h(x) for h in self.heads], dim=-1) # (B, T, F) -> (B, T, [h1, h1, h1, h1, h2, h2, h2, h2, h3, h3, h3, h3])
+        out = torch.cat(
+            [h(x) for h in self.heads], dim=-1
+        )  # (B, T, F) -> (B, T, [h1, h1, h1, h1, h2, h2, h2, h2, h3, h3, h3, h3])
         out = self.dropout(self.proj(out))
         return out
-
+
+
 class FeedFoward(nn.Module):
-    """ a simple linear layer followed by a non-linearity """
+    """a simple linear layer followed by a non-linearity"""
 
     def __init__(self, n_embd):
         super().__init__()
@@ -81,9 +90,10 @@ class FeedFoward(nn.Module):
 
     def forward(self, x):
         return self.net(x)
-
+
+
 class Block(nn.Module):
-    """ Transformer block: communication followed by computation """
+    """Transformer block: communication followed by computation"""
 
     def __init__(self, n_embd, n_head):
         # n_embd: embedding dimension, n_head: the number of heads we'd like
@@ -100,17 +110,19 @@ class Block(nn.Module):
         y = self.ffwd(x)
         x = self.ln2(x + y)
         return x
-
+
+
 class GPTLanguageModel(nn.Module):
     def __init__(self, vocab_size):
         super().__init__()
         self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
         self.position_embedding_table = nn.Embedding(block_size, n_embd)
-        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
-        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
+        self.blocks = nn.Sequential(
+            *[Block(n_embd, n_head=n_head) for _ in range(n_layer)]
+        )
+        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
         self.lm_head = nn.Linear(n_embd, vocab_size)
-
-
+
         self.apply(self._init_weights)
 
     def _init_weights(self, module):
@@ -123,25 +135,26 @@ class GPTLanguageModel(nn.Module):
 
     def forward(self, index, targets=None):
         B, T = index.shape
-
-
+
         # idx and targets are both (B,T) tensor of integers
-        tok_emb = self.token_embedding_table(index) # (B,T,C)
-        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
-        x = tok_emb + pos_emb # (B,T,C)
-        x = self.blocks(x) # (B,T,C)
-        x = self.ln_f(x) # (B,T,C)
-        logits = self.lm_head(x) # (B,T,vocab_size)
-
+        tok_emb = self.token_embedding_table(index)  # (B,T,C)
+        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T,C)
+        x = tok_emb + pos_emb  # (B,T,C)
+        x = self.blocks(x)  # (B,T,C)
+        x = self.ln_f(x)  # (B,T,C)
+        logits = self.lm_head(x)  # (B,T,vocab_size)
+
         if targets is None:
             loss = None
         else:
             B, T, C = logits.shape
-            logits = logits.view(B*T, C) # reshape to what torch.cross_entropy expects
-            targets = targets.view(B*T)
-            loss = F.cross_entropy(logits, targets)
+            logits = logits.view(
+                B * T, C
+            )  # reshape to what torch.cross_entropy expects
+            targets = targets.view(B * T)
+            loss = F.cross_entropy(logits, targets)
         return logits, loss
-
+
     def generate(self, index, max_new_tokens):
         # index is (B, T) array of indices in the current context
         for _ in range(max_new_tokens):
@@ -150,12 +163,11 @@ class GPTLanguageModel(nn.Module):
             # get the predictions
             logits, loss = self.forward(index_cond)
             # focus only on the last time step
-            logits = logits[:, -1, :] # becomes (B, C)
+            logits = logits[:, -1, :]  # becomes (B, C)
             # apply softmax to get probabilities
-            probs = F.softmax(logits, dim=-1) # (B, C)
+            probs = F.softmax(logits, dim=-1)  # (B, C)
             # sample from the distribution
-            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
+            index_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
             # append sampled index to the running sequence
-            index = torch.cat((index, index_next), dim=1) # (B, T+1)
+            index = torch.cat((index, index_next), dim=1)  # (B, T+1)
         return index
-
 
app.py CHANGED
@@ -1,7 +1,7 @@
 import streamlit as st
 import torch
 import os
-from GPTLanguageModelClass import *
+from GPTLanguageModelClass import hyperparams
 
 block_size = hyperparams.block_size
 batch_size = hyperparams.batch_size
@@ -14,36 +14,56 @@ n_layer = hyperparams.n_layer
 dropout = hyperparams.dropout
 device = hyperparams.device
 
-st.title('LLM from scratch Demo')
+st.title("LLM from scratch Demo")
 
 st.write(f"Using device: {device}")
 
 if not os.path.exists("./vocab.txt"):
     raise Exception("Please run extract.py first")
 chars = ""
-with open("./vocab.txt", 'r', encoding='utf-8') as f:
+with open("./vocab.txt", "r", encoding="utf-8") as f:
     text = f.read()
     chars = sorted(list(set(text)))
 
+st.write(f"Vocab size: {len(chars)}")
+st.write(f"Block size: {block_size}")
+st.write(f"Batch size: {batch_size}")
+st.write(f"Max iters: {max_iters}")
+st.write(f"Learning rate: {learning_rate}")
+st.write(f"Eval every: {eval_every}")
+st.write(f"n_embd: {n_embd}")
+st.write(f"n_head: {n_head}")
+st.write(f"n_layer: {n_layer}")
+st.write(f"dropout: {dropout}")
+
 string_to_int = {ch: i for i, ch in enumerate(chars)}
 int_to_string = {i: ch for i, ch in enumerate(chars)}
 
-encode = lambda s: [string_to_int[ch] for ch in s]
-decode = lambda x: ''.join([int_to_string[i] for i in x])
+
+def encode(s):
+    return [string_to_int[ch] for ch in s]
+
+
+def decode(x):
+    return "".join([int_to_string[i] for i in x])
 
 
-model_pickle_path = './model.pt'
+model_pickle_path = "./model.pt"
 
-st.write('loading model parameters...')
-with open(model_pickle_path, 'rb') as f:
-    model = torch.load(f, map_location=device)
-st.write('model loaded successfully!')
+st.write("loading model parameters...")
+with open(model_pickle_path, "rb") as f:
+    model = torch.load(f, map_location=device, weights_only=False)
+st.write("model loaded successfully!")
 
-prompt = ''
-prompt = st.text_area('Prompt:', value=prompt, height=100, max_chars=block_size - 1, key='prompt')
+prompt = ""
+prompt = st.text_area(
    "Prompt:", value=prompt, height=100, max_chars=block_size - 1, key="prompt"
+)
 if len(prompt) != 0:
     context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
     max_new_tokens = block_size - len(prompt)
-    generated_chars = decode(model.generate(context.unsqueeze(0), max_new_tokens=max_new_tokens)[0].tolist())
-    st.write('Generated text:')
+    generated_chars = decode(
+        model.generate(context.unsqueeze(0), max_new_tokens=max_new_tokens)[0].tolist()
+    )
+    st.write("Generated text:")
     st.write(generated_chars)
bigram/bigram_testing.sync.py CHANGED
@@ -14,6 +14,7 @@
 import torch
 import torch.nn as nn
 from torch.nn import functional as F
+
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(device)
 block_size = 8
@@ -40,7 +41,7 @@ string_to_int = {ch: i for i, ch in enumerate(chars)}
 int_to_string = {i: ch for i, ch in enumerate(chars)}
 
 encode = lambda s: [string_to_int[ch] for ch in s]
-decode = lambda x: ''.join([int_to_string[i] for i in x])
+decode = lambda x: "".join([int_to_string[i] for i in x])
 
 data = torch.tensor(encode(text), dtype=torch.long, device=device)
 
@@ -50,20 +51,23 @@ n = int(0.8 * len(data))
 train_data = data[:n]
 val_data = data[n:]
 
+
 # %%
 def get_batch(split):
-    data = train_data if split == 'train' else val_data
+    data = train_data if split == "train" else val_data
     ix = torch.randint(len(data) - block_size, (batch_size,))
-    x = torch.stack([data[i:i+block_size] for i in ix])
-    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
+    x = torch.stack([data[i : i + block_size] for i in ix])
+    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
     x, y = x.to(device), y.to(device)
     return x, y
 
+
 # %%
-x, y = get_batch('train')
+x, y = get_batch("train")
 
 # %%
 
+
 class BigramLanguageModel(nn.Module):
     def __init__(self, vocab_size):
         super().__init__()
@@ -75,34 +79,38 @@ class BigramLanguageModel(nn.Module):
             loss = None
         else:
             B, T, C = logits.shape
-            logits = logits.view(B*T, C) # reshape to what torch.cross_entropy expects
-            targets = targets.view(B*T)
-            loss = F.cross_entropy(logits, targets)
+            logits = logits.view(
+                B * T, C
+            )  # reshape to what torch.cross_entropy expects
+            targets = targets.view(B * T)
+            loss = F.cross_entropy(logits, targets)
         return logits, loss
+
     def generate(self, index, max_new_tokens):
         # index is (B, T) array of indices in the current context
         for _ in range(max_new_tokens):
             # get the predictions
             logits, loss = self.forward(index)
             # focus only on the last time step
-            logits = logits[:, -1, :] # becomes (B, C)
+            logits = logits[:, -1, :]  # becomes (B, C)
             # apply softmax to get probabilities
-            probs = F.softmax(logits, dim=-1) # (B, C)
+            probs = F.softmax(logits, dim=-1)  # (B, C)
             # sample from the distribution
-            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
+            index_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
             # append sampled index to the running sequence
-            index = torch.cat((index, index_next), dim=1) # (B, T+1)
+            index = torch.cat((index, index_next), dim=1)  # (B, T+1)
         return index
 
+
 # %%
 model = BigramLanguageModel(vocab_size).to(device)
 
-context = torch.zeros((1,1), dtype=torch.long, device=device)
+context = torch.zeros((1, 1), dtype=torch.long, device=device)
 generated_chars = decode(model.generate(context, max_new_tokens=100)[0].tolist())
 print(generated_chars)
 
 # %% [markdown]
-#
+#
 # ### Some common optimizers
 # 1. **Mean Squared Error (MSE)**: MSE is a common loss function used in regression problems, where the goal is to predict a continuous output. It measures the average squared difference between the predicted and actual values, and is often used to train neural networks for regression tasks.
 # 2. **Gradient Descent (GD):** is an optimization algorithm used to minimize the loss function of a machine learning model. The loss function measures how well the model is able to predict the target variable based on the input features. The idea of GD is to iteratively adjust the model parameters in the direction of the steepest descent of the loss function
@@ -118,7 +126,7 @@ optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
 
 for iter in range(max_iters):
     # sample a batch
-    xb, yb = get_batch('train')
+    xb, yb = get_batch("train")
 
     # evaluate the loss
     logits, loss = model.forward(xb, yb)
@@ -133,7 +141,6 @@ print(loss.item())
 
 # %%
 
-context = torch.zeros((1,1), dtype=torch.long, device=device)
+context = torch.zeros((1, 1), dtype=torch.long, device=device)
 generated_chars = decode(model.generate(context, max_new_tokens=100)[0].tolist())
 print(generated_chars)
-
extract.py CHANGED
@@ -2,13 +2,17 @@ import os
 import lzma
 from tqdm import tqdm
 
+
 def xz_files_in_dir(directory):
     files = []
     for filename in os.listdir(directory):
-        if filename.endswith(".xz") and os.path.isfile(os.path.join(directory, filename)):
+        if filename.endswith(".xz") and os.path.isfile(
+            os.path.join(directory, filename)
+        ):
             files.append(filename)
     return files
 
+
 tarxz_path = "./openwebtext.tar.xz"
 folder_path = "./openwebtext"
 output_file_train = "./openwebtext/train_split.txt"
@@ -29,7 +33,7 @@ files = xz_files_in_dir(folder_path)
 total_files = len(files)
 
 # Calculate the split indices
-split_index = int(total_files * 0.9) # 90% for training
+split_index = int(total_files * 0.9)  # 90% for training
 files_train = files[:split_index]
 files_val = files[split_index:]
 
@@ -62,4 +66,4 @@ if not os.path.exists(output_file_val):
 if not os.path.exists(vocab_file):
     with open(vocab_file, "w", encoding="utf-8") as vfile:
         for char in vocab:
-            vfile.write(char + '\n')
+            vfile.write(char + "\n")
simple_gpt/gpt_shakespeare.sync.py CHANGED
@@ -14,6 +14,7 @@
 import torch
 import torch.nn as nn
 from torch.nn import functional as F
+
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(device)
 block_size = 128
@@ -42,7 +43,7 @@ string_to_int = {ch: i for i, ch in enumerate(chars)}
 int_to_string = {i: ch for i, ch in enumerate(chars)}
 
 encode = lambda s: [string_to_int[ch] for ch in s]
-decode = lambda x: ''.join([int_to_string[i] for i in x])
+decode = lambda x: "".join([int_to_string[i] for i in x])
 
 data = torch.tensor(encode(text), dtype=torch.long, device=device)
 
@@ -52,21 +53,23 @@ n = int(0.8 * len(data))
 train_data = data[:n]
 val_data = data[n:]
 
+
 # %%
 def get_batch(split):
-    data = train_data if split == 'train' else val_data
+    data = train_data if split == "train" else val_data
     ix = torch.randint(len(data) - block_size, (batch_size,))
-    x = torch.stack([data[i:i+block_size] for i in ix])
-    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
+    x = torch.stack([data[i : i + block_size] for i in ix])
+    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
     x, y = x.to(device), y.to(device)
     return x, y
 
+
 # %%
 @torch.no_grad()
 def estimate_loss():
     out = {}
     model.eval()
-    for split in ['train', 'val']:
+    for split in ["train", "val"]:
         losses = torch.zeros(eval_every)
         for k in range(eval_every):
             X, Y = get_batch(split)
@@ -76,41 +79,46 @@ def estimate_loss():
     model.train()
     return out
 
+
 # %%
 
+
 class Head(nn.Module):
-    """ one head of self-attention """
+    """one head of self-attention"""
 
     def __init__(self, head_size):
         super().__init__()
         self.key = nn.Linear(n_embd, head_size, bias=False)
         self.query = nn.Linear(n_embd, head_size, bias=False)
         self.value = nn.Linear(n_embd, head_size, bias=False)
-        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
+        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))
 
         self.dropout = nn.Dropout(dropout)
 
     def forward(self, x):
         # input of size (batch, time-step, channels)
         # output of size (batch, time-step, head size)
-        B,T,C = x.shape
-        k = self.key(x) # (B,T,hs)
-        q = self.query(x) # (B,T,hs)
+        B, T, C = x.shape
+        k = self.key(x)  # (B,T,hs)
+        q = self.query(x)  # (B,T,hs)
         # compute attention scores ("affinities")
-        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
-        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
-        wei = F.softmax(wei, dim=-1) # (B, T, T)
+        wei = (
+            q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
+        )  # (B, T, hs) @ (B, hs, T) -> (B, T, T)
+        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf"))  # (B, T, T)
+        wei = F.softmax(wei, dim=-1)  # (B, T, T)
         wei = self.dropout(wei)
         # perform the weighted aggregation of the values
-        v = self.value(x) # (B,T,hs)
-        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
+        v = self.value(x)  # (B,T,hs)
+        out = wei @ v  # (B, T, T) @ (B, T, hs) -> (B, T, hs)
         return out
 
+
 # [1, 0, 0]
 # [1, 0.6, 0]
 # [1, 0.6, 0.4]
 class MultiHeadAttention(nn.Module):
-    """ multiple heads of self-attention in parallel """
+    """multiple heads of self-attention in parallel"""
 
     def __init__(self, num_heads, head_size):
         super().__init__()
@@ -119,13 +127,15 @@ class MultiHeadAttention(nn.Module):
         self.dropout = nn.Dropout(dropout)
 
     def forward(self, x):
-        out = torch.cat([h(x) for h in self.heads], dim=-1) # (B, T, F) -> (B, T, [h1, h1, h1, h1, h2, h2, h2, h2, h3, h3, h3, h3])
+        out = torch.cat(
+            [h(x) for h in self.heads], dim=-1
+        )  # (B, T, F) -> (B, T, [h1, h1, h1, h1, h2, h2, h2, h2, h3, h3, h3, h3])
         out = self.dropout(self.proj(out))
         return out
-
+
 
 class FeedFoward(nn.Module):
-    """ a simple linear layer followed by a non-linearity """
+    """a simple linear layer followed by a non-linearity"""
 
     def __init__(self, n_embd):
         super().__init__()
@@ -138,9 +148,10 @@ class FeedFoward(nn.Module):
 
     def forward(self, x):
         return self.net(x)
-
+
+
 class Block(nn.Module):
-    """ Transformer block: communication followed by computation """
+    """Transformer block: communication followed by computation"""
 
     def __init__(self, n_embd, n_head):
         # n_embd: embedding dimension, n_head: the number of heads we'd like
@@ -157,17 +168,19 @@ class Block(nn.Module):
         y = self.ffwd(x)
         x = self.ln2(x + y)
         return x
-
+
+
 class GPTLanguageModel(nn.Module):
     def __init__(self, vocab_size):
         super().__init__()
         self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
         self.position_embedding_table = nn.Embedding(block_size, n_embd)
-        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
-        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
+        self.blocks = nn.Sequential(
+            *[Block(n_embd, n_head=n_head) for _ in range(n_layer)]
+        )
+        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
         self.lm_head = nn.Linear(n_embd, vocab_size)
-
-
+
         self.apply(self._init_weights)
 
     def _init_weights(self, module):
@@ -180,25 +193,26 @@ class GPTLanguageModel(nn.Module):
 
     def forward(self, index, targets=None):
         B, T = index.shape
-
-
+
         # idx and targets are both (B,T) tensor of integers
-        tok_emb = self.token_embedding_table(index) # (B,T,C)
-        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
-        x = tok_emb + pos_emb # (B,T,C)
-        x = self.blocks(x) # (B,T,C)
-        x = self.ln_f(x) # (B,T,C)
-        logits = self.lm_head(x) # (B,T,vocab_size)
-
+        tok_emb = self.token_embedding_table(index)  # (B,T,C)
+        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T,C)
+        x = tok_emb + pos_emb  # (B,T,C)
+        x = self.blocks(x)  # (B,T,C)
+        x = self.ln_f(x)  # (B,T,C)
+        logits = self.lm_head(x)  # (B,T,vocab_size)
+
         if targets is None:
             loss = None
         else:
             B, T, C = logits.shape
-            logits = logits.view(B*T, C) # reshape to what torch.cross_entropy expects
-            targets = targets.view(B*T)
-            loss = F.cross_entropy(logits, targets)
+            logits = logits.view(
+                B * T, C
+            )  # reshape to what torch.cross_entropy expects
+            targets = targets.view(B * T)
+            loss = F.cross_entropy(logits, targets)
         return logits, loss
-
+
     def generate(self, index, max_new_tokens):
         # index is (B, T) array of indices in the current context
         for _ in range(max_new_tokens):
@@ -207,15 +221,16 @@ class GPTLanguageModel(nn.Module):
             # get the predictions
             logits, loss = self.forward(index_cond)
             # focus only on the last time step
-            logits = logits[:, -1, :] # becomes (B, C)
+            logits = logits[:, -1, :]  # becomes (B, C)
             # apply softmax to get probabilities
-            probs = F.softmax(logits, dim=-1) # (B, C)
+            probs = F.softmax(logits, dim=-1)  # (B, C)
             # sample from the distribution
-            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
+            index_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
             # append sampled index to the running sequence
-            index = torch.cat((index, index_next), dim=1) # (B, T+1)
+            index = torch.cat((index, index_next), dim=1)  # (B, T+1)
         return index
 
+
 model = GPTLanguageModel(vocab_size).to(device)
 
 # create a PyTorch optimizer
@@ -224,10 +239,12 @@ optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
 for iter in range(max_iters):
     if iter % eval_every == 0:
         losses = estimate_loss()
-        print(f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")
+        print(
+            f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}"
+        )
 
     # sample a batch of data
-    xb, yb = get_batch('train')
+    xb, yb = get_batch("train")
 
     # evaluate the loss
     logits, loss = model.forward(xb, yb)
@@ -238,14 +255,16 @@ print(loss.item())
 
 # %%
 
-context = torch.zeros((1,1), dtype=torch.long, device=device)
+context = torch.zeros((1, 1), dtype=torch.long, device=device)
 generated_chars = decode(model.generate(context, max_new_tokens=100)[0].tolist())
 print(generated_chars)
 
 
 # %%
 
-prompt = 'To be or not to be,'
+prompt = "To be or not to be,"
 context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
-generated_chars = decode(model.generate(context.unsqueeze(0), max_new_tokens=100)[0].tolist())
+generated_chars = decode(
+    model.generate(context.unsqueeze(0), max_new_tokens=100)[0].tolist()
+)
 print(generated_chars)
train.py CHANGED
@@ -17,54 +17,66 @@ device = hyperparams.device
 
 print(device)
 
-if not os.path.exists("./vocab.txt") or not os.path.exists("./openwebtext/train_split.txt") or not os.path.exists("./openwebtext/val_split.txt"):
+if (
+    not os.path.exists("./vocab.txt")
+    or not os.path.exists("./openwebtext/train_split.txt")
+    or not os.path.exists("./openwebtext/val_split.txt")
+):
     raise Exception("Please run extract.py first")
 chars = ""
-with open("./vocab.txt", 'r', encoding='utf-8') as f:
+with open("./vocab.txt", "r", encoding="utf-8") as f:
     text = f.read()
     chars = sorted(list(set(text)))
-
+
 vocab_size = len(chars)
 
 string_to_int = {ch: i for i, ch in enumerate(chars)}
 int_to_string = {i: ch for i, ch in enumerate(chars)}
 
 encode = lambda s: [string_to_int[ch] for ch in s]
-decode = lambda x: ''.join([int_to_string[i] for i in x])
+decode = lambda x: "".join([int_to_string[i] for i in x])
+
+
 # memory map for using small snippets of text from a single file of any size
 def get_random_chunk(split):
-    filename = "./openwebtext/train_split.txt" if split == 'train' else "./openwebtext/val_split.txt"
-    with open(filename, 'rb') as f:
+    filename = (
+        "./openwebtext/train_split.txt"
+        if split == "train"
+        else "./openwebtext/val_split.txt"
+    )
+    with open(filename, "rb") as f:
         with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
             # Determine the file size and a random position to start reading
             file_size = len(mm)
-            start_pos = random.randint(0, (file_size) - block_size*batch_size)
+            start_pos = random.randint(0, (file_size) - block_size * batch_size)
 
             # Seek to the random position and read the block of text
             mm.seek(start_pos)
-            block = mm.read(block_size*batch_size-1)
+            block = mm.read(block_size * batch_size - 1)
 
             # Decode the block to a string, ignoring any invalid byte sequences
-            decoded_block = block.decode('utf-8', errors='ignore').replace('\r', '')
-
+            decoded_block = block.decode("utf-8", errors="ignore").replace("\r", "")
+
             # Train and test splits
             data = torch.tensor(encode(decoded_block), dtype=torch.long)
-
+
             return data
 
+
 def get_batch(split):
     data = get_random_chunk(split)
     ix = torch.randint(len(data) - block_size, (batch_size,))
-    x = torch.stack([data[i:i+block_size] for i in ix])
-    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
+    x = torch.stack([data[i : i + block_size] for i in ix])
+    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
     x, y = x.to(device), y.to(device)
     return x, y
 
+
 @torch.no_grad()
 def estimate_loss():
     out = {}
     model.eval()
-    for split in ['train', 'val']:
+    for split in ["train", "val"]:
         losses = torch.zeros(eval_every)
         for k in range(eval_every):
             X, Y = get_batch(split)
@@ -74,24 +86,27 @@ def estimate_loss():
     model.train()
     return out
 
+
 model = GPTLanguageModel(vocab_size).to(device)
 
-model_pickle_path = './model.pt'
+model_pickle_path = "./model.pt"
 if os.path.exists(model_pickle_path):
-    print('loading model parameters...')
-    with open(model_pickle_path, 'rb') as f:
+    print("loading model parameters...")
+    with open(model_pickle_path, "rb") as f:
         model = torch.load(f, map_location=device)
-    print('loaded successfully!')
+    print("loaded successfully!")
 # create a PyTorch optimizer
 optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
 
 for iter in range(max_iters):
     if iter % eval_every == 0:
         losses = estimate_loss()
-        print(f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")
+        print(
+            f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}"
+        )
 
     # sample a batch of data
-    xb, yb = get_batch('train')
+    xb, yb = get_batch("train")
 
     # evaluate the loss
     logits, loss = model.forward(xb, yb)
@@ -100,6 +115,6 @@ for iter in range(max_iters):
     optimizer.step()
 print(loss.item())
 
-with open(model_pickle_path, 'wb') as f:
+with open(model_pickle_path, "wb") as f:
     torch.save(model, f)
-print('model saved')
+print("model saved")