Raymond committed on
Commit
ea507ec
·
1 Parent(s): 556f06a
Files changed (6) hide show
  1. .gitattributes +1 -0
  2. app.py +47 -0
  3. improved-v5.bin +3 -0
  4. model.py +185 -0
  5. requirements.txt +2 -0
  6. tokenizer.py +14 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ improved-v5.bin filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import numpy as np
import time

from tokenizer import encode, decode, vocab_size
from model import *

# Build the model and move it to the compute device ('device' is provided
# by the star-import from model).
model = TokenBasedLanguageModel()
m = model.to(device)

print("Loading checkpoint from file")
# map_location keeps loading working on CPU-only hosts even when the
# checkpoint was saved from a CUDA run (torch.load would otherwise try to
# restore tensors onto an unavailable device).
# NOTE(review): torch.load unpickles arbitrary objects — only load trusted
# checkpoint files.
checkpoint = torch.load("improved-v5.bin", map_location=device)
model.load_state_dict(checkpoint["model_state_dict"])
print("State restored")
15
+
16
def generate_llm(prompt, max_tokens = 512, analyze_probs = False):
    """Stream text generated from `prompt`.

    Yields [output_so_far, stats_string, probability_text] triples so the
    Gradio UI can refresh live after every token.

    Args:
        prompt: seed text fed to the model.
        max_tokens: maximum number of new tokens to sample.
        analyze_probs: when True, also report the 25 most likely tokens at
            each step (much slower, as advertised in the UI).
    """
    prompt_encoded = encode(prompt) # trigger book 2 intro
    #encode("[1]{.ePub-B}\n") # trigger first chapter
    context = torch.tensor(prompt_encoded, dtype = torch.long, device = device).view(1, len(prompt_encoded))
    output = prompt[:]
    start_time = time.time()
    token_count = 0
    probtext = ""
    for encoded_token_pair in model.generate(context, max_new_tokens=max_tokens, stream = True, stream_probs = analyze_probs):
        encoded_token = encoded_token_pair
        if analyze_probs:
            encoded_token, probs = encoded_token_pair
            # Rank every vocabulary id by probability, keep the top 25.
            top_ids = sorted(range(vocab_size), key=lambda tid: probs[tid], reverse=True)[:25]
            probtext = "".join(f'"{decode([tid])}": {probs[tid]}\n' for tid in top_ids)
        else:
            probtext = "Feature disabled."
        # NOTE(review): encoded_token is a tensor produced by model.generate;
        # decode() receives [tensor] rather than [int] — confirm the tokenizer
        # accepts that, otherwise use encoded_token.item().
        part = decode([encoded_token])
        output += part
        token_count += 1
        # Guard against a zero elapsed time on the very first token (coarse
        # clocks can report 0.0), which previously risked ZeroDivisionError.
        elapsed = max(time.time() - start_time, 1e-9)
        yield [output, str(token_count / elapsed) + "tok/s " + str(token_count) + " tokens generated.", probtext]
    elapsed = max(time.time() - start_time, 1e-9)
    return [output, str(token_count / elapsed) + "tok/s " + str(token_count) + " tokens generated.", probtext]
42
+
43
# Wire the generator up to a simple three-output Gradio UI and expose a
# public share link.
prompt_input = gr.TextArea(placeholder = "In the midst of chaos.")
max_tokens_input = gr.Number(value = 512, maximum = 2048, minimum = 1, step = 1, label = "Max tokens")
probs_input = gr.Checkbox(label = "Show probs, 10x slower")
demo = gr.Interface(
    generate_llm,
    inputs=[prompt_input, max_tokens_input, probs_input],
    outputs=[
        gr.TextArea(label = "Output"),
        gr.Text(placeholder = "tok/s and other stats", label = "Stats"),
        gr.TextArea(label = "Probability stats"),
    ],
)

demo.launch(share = True)
improved-v5.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1386b24f3429dcfd1c5caa12d97496989ba099da464953dbf9cf9d76e515a5c8
3
+ size 399120992
model.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import collections
4
+ from torch.nn import functional as F
5
+ from torch.nn import RMSNorm
6
+
7
+ from tokenizer import vocab_size, encode, decode, tiktoken_encoding
8
+
9
# hyperparameters
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 128 # what is the maximum context length for predictions?
max_iters = 45 * 1000 # total optimizer steps during training
eval_interval = 500 # evaluate every this many steps
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu' # prefer GPU when available
eval_iters = 500 # batches averaged per loss estimate
n_embd = 128 # embedding (channel) dimension
n_head = 4 # attention heads per transformer block
n_layer = 10 # number of transformer blocks
dropout = 0.02
TRAIN = True # NOTE(review): appears unused in this file — presumably read by a training script
PRETRAIN_PERCENTAGE = 0.6 # NOTE(review): also unused here — confirm before removing
REP_PENALTY_DECAY = 0.95 # decay factor for the (currently disabled) repetition penalty
# ------------
25
+
26
class Head(nn.Module):
    """A single causal self-attention head."""

    def __init__(self, head_size):
        super().__init__()
        # Project the embedding into this head's key/query/value spaces.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular mask: positions may only attend to the past.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        batch, seq_len, channels = x.shape
        k = self.key(x)
        q = self.query(x)
        # Scaled attention scores: (B,T,hs) @ (B,hs,T) -> (B,T,T).
        # NOTE(review): the scale uses x's channel count (n_embd), not
        # head_size; the shipped checkpoint was trained with this scale, so
        # it must stay as-is.
        scores = q @ k.transpose(-2, -1) * channels ** -0.5
        scores = scores.masked_fill(self.tril[:seq_len, :seq_len] == 0, float('-inf'))
        weights = F.softmax(scores, dim=-1)
        weights = self.dropout(weights)
        # Probability-weighted sum of values: (B,T,T) @ (B,T,hs) -> (B,T,hs).
        return weights @ self.value(x)
51
+
52
class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, outputs concatenated and mixed."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)  # mixes the concatenated head outputs
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        per_head = [head(x) for head in self.heads]
        combined = torch.cat(per_head, dim=-1)
        return self.dropout(self.proj(combined))
65
+
66
class FeedFoward(nn.Module):
    """Position-wise MLP: expand 4x, gate with SwiGLU, project back down."""

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        self.net = nn.Sequential(
            nn.Linear(n_embd, hidden),
            SwiGLU(hidden, hidden),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)
80
+
81
# NOTE: adapted from
# https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/activations.py
class SwiGLU(nn.Module):
    r"""Gated linear unit with a SiLU/Swish gate.

    A [variant](https://arxiv.org/abs/2002.05202) of the gated linear unit:
    one linear layer produces both the value and the gate halves, and the
    output is ``value * SiLU(gate)``. Similar to `GEGLU` but with SiLU in
    place of GeLU.

    Parameters:
        dim_in (`int`): The number of channels in the input.
        dim_out (`int`): The number of channels in the output.
        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
    """

    def __init__(self, dim_in: int, dim_out: int, bias: bool = True):
        super().__init__()
        # A single projection computes value and gate together (2 * dim_out).
        self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias)
        self.activation = nn.SiLU()

    def forward(self, hidden_states):
        projected = self.proj(hidden_states)
        value, gate = projected.chunk(2, dim=-1)
        return value * self.activation(gate)
103
+
104
class Block(nn.Module):
    """Transformer block: self-attention (communication) then MLP (computation)."""

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: number of attention heads
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        # Pre-norm residual layout; RMSNorm here replaces the original LayerNorm.
        self.ln1 = nn.RMSNorm(n_embd)
        self.ln2 = nn.RMSNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        return x + self.ffwd(self.ln2(x))
120
+
121
# super simple bigram model (grown into a small decoder-only transformer)
class TokenBasedLanguageModel(nn.Module):
    """Decoder-only transformer language model over the tiktoken vocabulary."""

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.RMSNorm(n_embd)  # final norm (orig a LayerNorm)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Return (logits, loss) for integer context idx (B, T).

        loss is None when targets is None; otherwise it is the mean
        cross-entropy over all B*T positions.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)  # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T,C)
        x = tok_emb + pos_emb  # (B,T,C), position embedding broadcast over batch
        x = self.blocks(x)  # (B,T,C)
        x = self.ln_f(x)  # (B,T,C)
        logits = self.lm_head(x)  # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def _next_probs(self, idx):
        """Softmax distribution (B, vocab) over the next token for context idx."""
        idx_cond = idx[:, -block_size:]  # crop to the last block_size tokens
        logits, _ = self(idx_cond)
        return F.softmax(logits[:, -1, :], dim=-1)  # last time step only

    @torch.no_grad()
    def _generate_stream(self, idx, max_new_tokens, stream_probs):
        """Generator backing generate(stream=True): yields tokens as sampled."""
        # Repetition-penalty bookkeeping. FIX: the default factory must take
        # no arguments — the previous `lambda x: 1` would have raised
        # TypeError on any missing-key read once the penalty was re-enabled.
        token_modifiers = collections.defaultdict(lambda: 1.0)
        for _ in range(max_new_tokens):
            probs = self._next_probs(idx)
            # Disabled repetition penalty, kept for reference:
            # for token in token_modifiers:
            #     token_modifiers[token] *= REP_PENALTY_DECAY
            #     for batch in range(probs.shape[0]):
            #         probs[batch][token] *= (1 - REP_PENALTY_DECAY)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            if stream_probs:
                yield [idx_next, probs[0].tolist()]
            else:
                yield idx_next
            # NOTE(review): keyed by the tensor object, not the int token id;
            # use idx_next.item() if the penalty is ever re-enabled.
            token_modifiers[idx_next] = REP_PENALTY_DECAY
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, stream=False, stream_probs=False):
        """Autoregressively sample after context idx (B, T).

        stream=True returns a generator yielding each sampled (B, 1) token
        tensor (or [token, probs_list] when stream_probs=True). stream=False
        returns the full (B, T + max_new_tokens) index tensor.

        FIX: the previous version contained `yield`, making the whole method
        a generator — the stream=False call returned a generator object and
        its `return idx` only set the StopIteration value. Sampling now lives
        in _generate_stream, so the non-stream path really returns the tensor.
        """
        if stream:
            return self._generate_stream(idx, max_new_tokens, stream_probs)
        for _ in range(max_new_tokens):
            probs = self._next_probs(idx)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ torch
2
+ tiktoken
tokenizer.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import tiktoken

# Shared BPE tokenizer; model.py and app.py import these names at module load.
tiktoken_encoding = tiktoken.get_encoding("cl100k_base") # this is used in gpt-4 and 3.5-turbo
# old:
#.get_encoding("o200k_base") # this is used for gpt-4o apparently
vocab_size = tiktoken_encoding.n_vocab  # sizes the model's embedding and output layers
print("vocab_size updated to",vocab_size)

def encode(text: str) -> list[int]:
    """Return the BPE token ids for `text`."""
    return tiktoken_encoding.encode(text)

def decode(tokens: list[int]) -> str:
    """Return the text for a list of BPE token ids."""
    return tiktoken_encoding.decode(tokens)
+