ibrahimmkhalid committed
Commit e0646b5 · 1 Parent(s): 5e3f56c

clean up and retrain

Files changed (5)
  1. .gitignore +1 -0
  2. GPTLanguageModelClass.py +161 -0
  3. app.py +12 -148
  4. model.pt +2 -2
  5. train_gpt_openwebtext.py +12 -153
.gitignore CHANGED
@@ -3,5 +3,6 @@
 
  venv/
  .ipynb_checkpoints/
+ __pycache__/
 
  openwebtext/
GPTLanguageModelClass.py ADDED
@@ -0,0 +1,161 @@
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+
+ class hyperparams:
+     block_size = 128
+     batch_size = 32
+     max_iters = 12000
+     learning_rate = 3e-4
+     eval_every = 100
+     n_embd = 384
+     n_head = 8
+     n_layer = 8
+     dropout = 0.2
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ block_size = hyperparams.block_size
+ batch_size = hyperparams.batch_size
+ max_iters = hyperparams.max_iters
+ learning_rate = hyperparams.learning_rate
+ eval_every = hyperparams.eval_every
+ n_embd = hyperparams.n_embd
+ n_head = hyperparams.n_head
+ n_layer = hyperparams.n_layer
+ dropout = hyperparams.dropout
+ device = hyperparams.device
+
+ class Head(nn.Module):
+     """ one head of self-attention """
+
+     def __init__(self, head_size):
+         super().__init__()
+         self.key = nn.Linear(n_embd, head_size, bias=False)
+         self.query = nn.Linear(n_embd, head_size, bias=False)
+         self.value = nn.Linear(n_embd, head_size, bias=False)
+         self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
+
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, x):
+         # input of size (batch, time-step, channels)
+         # output of size (batch, time-step, head size)
+         B,T,C = x.shape
+         k = self.key(x) # (B,T,hs)
+         q = self.query(x) # (B,T,hs)
+         # compute attention scores ("affinities")
+         wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
+         wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
+         wei = F.softmax(wei, dim=-1) # (B, T, T)
+         wei = self.dropout(wei)
+         # perform the weighted aggregation of the values
+         v = self.value(x) # (B,T,hs)
+         out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
+         return out
+
+ class MultiHeadAttention(nn.Module):
+     """ multiple heads of self-attention in parallel """
+
+     def __init__(self, num_heads, head_size):
+         super().__init__()
+         self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
+         self.proj = nn.Linear(head_size * num_heads, n_embd)
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, x):
+         out = torch.cat([h(x) for h in self.heads], dim=-1) # (B, T, F) -> (B, T, [h1, h1, h1, h1, h2, h2, h2, h2, h3, h3, h3, h3])
+         out = self.dropout(self.proj(out))
+         return out
+
+ class FeedFoward(nn.Module):
+     """ a simple linear layer followed by a non-linearity """
+
+     def __init__(self, n_embd):
+         super().__init__()
+         self.net = nn.Sequential(
+             nn.Linear(n_embd, 4 * n_embd),
+             nn.ReLU(),
+             nn.Linear(4 * n_embd, n_embd),
+             nn.Dropout(dropout),
+         )
+
+     def forward(self, x):
+         return self.net(x)
+
+ class Block(nn.Module):
+     """ Transformer block: communication followed by computation """
+
+     def __init__(self, n_embd, n_head):
+         # n_embd: embedding dimension, n_head: the number of heads we'd like
+         super().__init__()
+         head_size = n_embd // n_head
+         self.sa = MultiHeadAttention(n_head, head_size)
+         self.ffwd = FeedFoward(n_embd)
+         self.ln1 = nn.LayerNorm(n_embd)
+         self.ln2 = nn.LayerNorm(n_embd)
+
+     def forward(self, x):
+         y = self.sa(x)
+         x = self.ln1(x + y)
+         y = self.ffwd(x)
+         x = self.ln2(x + y)
+         return x
+
+ class GPTLanguageModel(nn.Module):
+     def __init__(self, vocab_size):
+         super().__init__()
+         self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
+         self.position_embedding_table = nn.Embedding(block_size, n_embd)
+         self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
+         self.ln_f = nn.LayerNorm(n_embd) # final layer norm
+         self.lm_head = nn.Linear(n_embd, vocab_size)
+
+
+         self.apply(self._init_weights)
+
+     def _init_weights(self, module):
+         if isinstance(module, nn.Linear):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+             if module.bias is not None:
+                 torch.nn.init.zeros_(module.bias)
+         elif isinstance(module, nn.Embedding):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+
+     def forward(self, index, targets=None):
+         B, T = index.shape
+
+
+         # idx and targets are both (B,T) tensor of integers
+         tok_emb = self.token_embedding_table(index) # (B,T,C)
+         pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
+         x = tok_emb + pos_emb # (B,T,C)
+         x = self.blocks(x) # (B,T,C)
+         x = self.ln_f(x) # (B,T,C)
+         logits = self.lm_head(x) # (B,T,vocab_size)
+
+         if targets is None:
+             loss = None
+         else:
+             B, T, C = logits.shape
+             logits = logits.view(B*T, C) # reshape to what torch.cross_entropy expects
+             targets = targets.view(B*T)
+             loss = F.cross_entropy(logits, targets)
+         return logits, loss
+
+     def generate(self, index, max_new_tokens):
+         # index is (B, T) array of indices in the current context
+         for _ in range(max_new_tokens):
+             # crop idx to the last block_size tokens
+             index_cond = index[:, -block_size:]
+             # get the predictions
+             logits, loss = self.forward(index_cond)
+             # focus only on the last time step
+             logits = logits[:, -1, :] # becomes (B, C)
+             # apply softmax to get probabilities
+             probs = F.softmax(logits, dim=-1) # (B, C)
+             # sample from the distribution
+             index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
+             # append sampled index to the running sequence
+             index = torch.cat((index, index_next), dim=1) # (B, T+1)
+         return index
+
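The model definition and its hyperparameters now live in this single module, which app.py and train_gpt_openwebtext.py both import below. A minimal usage sketch of the new module (the vocab_size value is a placeholder; in this repo it comes from vocab.txt produced by extract.py, which is not part of this commit):

import torch
from GPTLanguageModelClass import GPTLanguageModel, hyperparams

vocab_size = 32172  # placeholder value; the real one is derived from vocab.txt
model = GPTLanguageModel(vocab_size).to(hyperparams.device)

# generate() extends a (B, T) tensor of token indices by max_new_tokens sampled tokens
context = torch.zeros((1, 1), dtype=torch.long, device=hyperparams.device)
out = model.generate(context, max_new_tokens=100)  # (1, 101) tensor of token indices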
app.py CHANGED
@@ -1,158 +1,22 @@
  import streamlit as st
  import torch
- import torch.nn as nn
- from torch.nn import functional as F
  import os
+ from GPTLanguageModelClass import *
+
+ block_size = hyperparams.block_size
+ batch_size = hyperparams.batch_size
+ max_iters = hyperparams.max_iters
+ learning_rate = hyperparams.learning_rate
+ eval_every = hyperparams.eval_every
+ n_embd = hyperparams.n_embd
+ n_head = hyperparams.n_head
+ n_layer = hyperparams.n_layer
+ dropout = hyperparams.dropout
+ device = hyperparams.device
 
  st.title('LLM from scratch Demo')
- st.subheader('Maintenance mode: please come back later')
 
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  st.write(f"Using device: {device}")
- block_size = 128
- batch_size = 32
- max_iters = 4000
- learning_rate = 3e-4
- eval_every = 500
- n_embd = 384
- n_head = 8
- n_layer = 8
- dropout = 0.2
-
-
- class Head(nn.Module):
-     """ one head of self-attention """
-
-     def __init__(self, head_size):
-         super().__init__()
-         self.key = nn.Linear(n_embd, head_size, bias=False)
-         self.query = nn.Linear(n_embd, head_size, bias=False)
-         self.value = nn.Linear(n_embd, head_size, bias=False)
-         self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
-
-         self.dropout = nn.Dropout(dropout)
-
-     def forward(self, x):
-         # input of size (batch, time-step, channels)
-         # output of size (batch, time-step, head size)
-         B,T,C = x.shape
-         k = self.key(x) # (B,T,hs)
-         q = self.query(x) # (B,T,hs)
-         # compute attention scores ("affinities")
-         wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
-         wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
-         wei = F.softmax(wei, dim=-1) # (B, T, T)
-         wei = self.dropout(wei)
-         # perform the weighted aggregation of the values
-         v = self.value(x) # (B,T,hs)
-         out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
-         return out
-
- class MultiHeadAttention(nn.Module):
-     """ multiple heads of self-attention in parallel """
-
-     def __init__(self, num_heads, head_size):
-         super().__init__()
-         self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
-         self.proj = nn.Linear(head_size * num_heads, n_embd)
-         self.dropout = nn.Dropout(dropout)
-
-     def forward(self, x):
-         out = torch.cat([h(x) for h in self.heads], dim=-1) # (B, T, F) -> (B, T, [h1, h1, h1, h1, h2, h2, h2, h2, h3, h3, h3, h3])
-         out = self.dropout(self.proj(out))
-         return out
-
- class FeedFoward(nn.Module):
-     """ a simple linear layer followed by a non-linearity """
-
-     def __init__(self, n_embd):
-         super().__init__()
-         self.net = nn.Sequential(
-             nn.Linear(n_embd, 4 * n_embd),
-             nn.ReLU(),
-             nn.Linear(4 * n_embd, n_embd),
-             nn.Dropout(dropout),
-         )
-
-     def forward(self, x):
-         return self.net(x)
-
- class Block(nn.Module):
-     """ Transformer block: communication followed by computation """
-
-     def __init__(self, n_embd, n_head):
-         # n_embd: embedding dimension, n_head: the number of heads we'd like
-         super().__init__()
-         head_size = n_embd // n_head
-         self.sa = MultiHeadAttention(n_head, head_size)
-         self.ffwd = FeedFoward(n_embd)
-         self.ln1 = nn.LayerNorm(n_embd)
-         self.ln2 = nn.LayerNorm(n_embd)
-
-     def forward(self, x):
-         y = self.sa(x)
-         x = self.ln1(x + y)
-         y = self.ffwd(x)
-         x = self.ln2(x + y)
-         return x
-
- class GPTLanguageModel(nn.Module):
-     def __init__(self, vocab_size):
-         super().__init__()
-         self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
-         self.position_embedding_table = nn.Embedding(block_size, n_embd)
-         self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
-         self.ln_f = nn.LayerNorm(n_embd) # final layer norm
-         self.lm_head = nn.Linear(n_embd, vocab_size)
-
-
-         self.apply(self._init_weights)
-
-     def _init_weights(self, module):
-         if isinstance(module, nn.Linear):
-             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
-             if module.bias is not None:
-                 torch.nn.init.zeros_(module.bias)
-         elif isinstance(module, nn.Embedding):
-             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
-
-     def forward(self, index, targets=None):
-         B, T = index.shape
-
-
-         # idx and targets are both (B,T) tensor of integers
-         tok_emb = self.token_embedding_table(index) # (B,T,C)
-         pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
-         x = tok_emb + pos_emb # (B,T,C)
-         x = self.blocks(x) # (B,T,C)
-         x = self.ln_f(x) # (B,T,C)
-         logits = self.lm_head(x) # (B,T,vocab_size)
-
-         if targets is None:
-             loss = None
-         else:
-             B, T, C = logits.shape
-             logits = logits.view(B*T, C) # reshape to what torch.cross_entropy expects
-             targets = targets.view(B*T)
-             loss = F.cross_entropy(logits, targets)
-         return logits, loss
-
-     def generate(self, index, max_new_tokens):
-         # index is (B, T) array of indices in the current context
-         for _ in range(max_new_tokens):
-             # crop idx to the last block_size tokens
-             index_cond = index[:, -block_size:]
-             # get the predictions
-             logits, loss = self.forward(index_cond)
-             # focus only on the last time step
-             logits = logits[:, -1, :] # becomes (B, C)
-             # apply softmax to get probabilities
-             probs = F.softmax(logits, dim=-1) # (B, C)
-             # sample from the distribution
-             index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
-             # append sampled index to the running sequence
-             index = torch.cat((index, index_next), dim=1) # (B, T+1)
-         return index
 
  if not os.path.exists("./vocab.txt"):
      raise Exception("Please run extract.py first")
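The app now keeps only the imports, the hyperparameter aliases, and the Streamlit UI; the code below this hunk (loading model.pt and generating text) is unchanged and not shown. A hedged sketch of that loading step, assuming the checkpoint holds the fully pickled model rather than a state_dict:

import torch
from GPTLanguageModelClass import GPTLanguageModel, hyperparams

# assumption: the checkpoint was written with torch.save(model, './model.pt'),
# so unpickling it needs the GPTLanguageModel class to be importable
model = torch.load("./model.pt", map_location=hyperparams.device)
model.eval()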
model.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:04e95f8e46dd7b7b894d288f3c2b75bb0a535fb266960803587a9f552e6b5a73
- size 160274578
+ oid sha256:c91a6742cac446d27f433efefdd50501c421e443a3927cf31c37454f9b23247c
+ size 160301382
train_gpt_openwebtext.py CHANGED
@@ -1,22 +1,21 @@
  import torch
- import torch.nn as nn
- from torch.nn import functional as F
  import mmap
  import random
  import os
+ from GPTLanguageModelClass import *
 
+ block_size = hyperparams.block_size
+ batch_size = hyperparams.batch_size
+ max_iters = hyperparams.max_iters
+ learning_rate = hyperparams.learning_rate
+ eval_every = hyperparams.eval_every
+ n_embd = hyperparams.n_embd
+ n_head = hyperparams.n_head
+ n_layer = hyperparams.n_layer
+ dropout = hyperparams.dropout
+ device = hyperparams.device
+
-
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  print(device)
- block_size = 128
- batch_size = 32
- max_iters = 4000
- learning_rate = 3e-4
- eval_every = 500
- n_embd = 384
- n_head = 8
- n_layer = 8
- dropout = 0.2
 
  if not os.path.exists("./vocab.txt") or not os.path.exists("./openwebtext/train_split.txt") or not os.path.exists("./openwebtext/val_split.txt"):
      raise Exception("Please run extract.py first")
@@ -53,7 +52,6 @@ def get_random_chunk(split):
 
      return data
 
-
  def get_batch(split):
      data = get_random_chunk(split)
      ix = torch.randint(len(data) - block_size, (batch_size,))
@@ -76,145 +74,6 @@ def estimate_loss():
      model.train()
      return out
 
-
- class Head(nn.Module):
-     """ one head of self-attention """
-
-     def __init__(self, head_size):
-         super().__init__()
-         self.key = nn.Linear(n_embd, head_size, bias=False)
-         self.query = nn.Linear(n_embd, head_size, bias=False)
-         self.value = nn.Linear(n_embd, head_size, bias=False)
-         self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
-
-         self.dropout = nn.Dropout(dropout)
-
-     def forward(self, x):
-         # input of size (batch, time-step, channels)
-         # output of size (batch, time-step, head size)
-         B,T,C = x.shape
-         k = self.key(x) # (B,T,hs)
-         q = self.query(x) # (B,T,hs)
-         # compute attention scores ("affinities")
-         wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
-         wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
-         wei = F.softmax(wei, dim=-1) # (B, T, T)
-         wei = self.dropout(wei)
-         # perform the weighted aggregation of the values
-         v = self.value(x) # (B,T,hs)
-         out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
-         return out
-
- # [1, 0, 0]
- # [1, 0.6, 0]
- # [1, 0.6, 0.4]
- class MultiHeadAttention(nn.Module):
-     """ multiple heads of self-attention in parallel """
-
-     def __init__(self, num_heads, head_size):
-         super().__init__()
-         self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
-         self.proj = nn.Linear(head_size * num_heads, n_embd)
-         self.dropout = nn.Dropout(dropout)
-
-     def forward(self, x):
-         out = torch.cat([h(x) for h in self.heads], dim=-1) # (B, T, F) -> (B, T, [h1, h1, h1, h1, h2, h2, h2, h2, h3, h3, h3, h3])
-         out = self.dropout(self.proj(out))
-         return out
-
-
- class FeedFoward(nn.Module):
-     """ a simple linear layer followed by a non-linearity """
-
-     def __init__(self, n_embd):
-         super().__init__()
-         self.net = nn.Sequential(
-             nn.Linear(n_embd, 4 * n_embd),
-             nn.ReLU(),
-             nn.Linear(4 * n_embd, n_embd),
-             nn.Dropout(dropout),
-         )
-
-     def forward(self, x):
-         return self.net(x)
-
- class Block(nn.Module):
-     """ Transformer block: communication followed by computation """
-
-     def __init__(self, n_embd, n_head):
-         # n_embd: embedding dimension, n_head: the number of heads we'd like
-         super().__init__()
-         head_size = n_embd // n_head
-         self.sa = MultiHeadAttention(n_head, head_size)
-         self.ffwd = FeedFoward(n_embd)
-         self.ln1 = nn.LayerNorm(n_embd)
-         self.ln2 = nn.LayerNorm(n_embd)
-
-     def forward(self, x):
-         y = self.sa(x)
-         x = self.ln1(x + y)
-         y = self.ffwd(x)
-         x = self.ln2(x + y)
-         return x
-
- class GPTLanguageModel(nn.Module):
-     def __init__(self, vocab_size):
-         super().__init__()
-         self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
-         self.position_embedding_table = nn.Embedding(block_size, n_embd)
-         self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
-         self.ln_f = nn.LayerNorm(n_embd) # final layer norm
-         self.lm_head = nn.Linear(n_embd, vocab_size)
-
-
-         self.apply(self._init_weights)
-
-     def _init_weights(self, module):
-         if isinstance(module, nn.Linear):
-             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
-             if module.bias is not None:
-                 torch.nn.init.zeros_(module.bias)
-         elif isinstance(module, nn.Embedding):
-             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
-
-     def forward(self, index, targets=None):
-         B, T = index.shape
-
-
-         # idx and targets are both (B,T) tensor of integers
-         tok_emb = self.token_embedding_table(index) # (B,T,C)
-         pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
-         x = tok_emb + pos_emb # (B,T,C)
-         x = self.blocks(x) # (B,T,C)
-         x = self.ln_f(x) # (B,T,C)
-         logits = self.lm_head(x) # (B,T,vocab_size)
-
-         if targets is None:
-             loss = None
-         else:
-             B, T, C = logits.shape
-             logits = logits.view(B*T, C) # reshape to what torch.cross_entropy expects
-             targets = targets.view(B*T)
-             loss = F.cross_entropy(logits, targets)
-         return logits, loss
-
-     def generate(self, index, max_new_tokens):
-         # index is (B, T) array of indices in the current context
-         for _ in range(max_new_tokens):
-             # crop idx to the last block_size tokens
-             index_cond = index[:, -block_size:]
-             # get the predictions
-             logits, loss = self.forward(index_cond)
-             # focus only on the last time step
-             logits = logits[:, -1, :] # becomes (B, C)
-             # apply softmax to get probabilities
-             probs = F.softmax(logits, dim=-1) # (B, C)
-             # sample from the distribution
-             index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
-             # append sampled index to the running sequence
-             index = torch.cat((index, index_next), dim=1) # (B, T+1)
-         return index
-
  model = GPTLanguageModel(vocab_size).to(device)
 
  model_pickle_path = './model.pt'
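The training loop itself falls outside the hunks shown above. A hedged sketch of what that loop typically looks like with these hyperparameters, reusing get_batch, estimate_loss, model, and model_pickle_path defined earlier in the file; the AdamW optimizer and the dict returned by estimate_loss() are assumptions, not visible in this diff:

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for step in range(max_iters):
    if step % eval_every == 0:
        losses = estimate_loss()  # assumed to return {'train': ..., 'val': ...}
        print(f"step {step}: train loss {losses['train']:.3f}, val loss {losses['val']:.3f}")

    xb, yb = get_batch("train")            # (batch_size, block_size) input/target index tensors
    logits, loss = model(xb, yb)           # forward pass returns logits and cross-entropy loss
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

torch.save(model, model_pickle_path)       # the retrained weights become the new model.pt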