crumb committed
Commit 4c250c1 · Parent: f8213b5

Upload 3 files

Files changed (3)
  1. CGPT-124m.pt +3 -0
  2. TestLossEvaluation.ipynb +258 -0
  3. modeling_cgpt.py +236 -0
CGPT-124m.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7af9f83a8bc3866c87362238a416a010ec77fbd6834c239992bfde699efda098
+ size 347777437
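Note: CGPT-124m.pt is committed as a Git LFS pointer, so the hunk above only carries the pointer metadata; the actual ~348 MB checkpoint lives in LFS storage. A minimal sketch of fetching and loading it without cloning the repo (the repo id "crumb/CGPT-124m" is hypothetical, it is not stated in this commit):

# Hypothetical repo id, shown for illustration only.
from huggingface_hub import hf_hub_download
import torch

ckpt_path = hf_hub_download(repo_id="crumb/CGPT-124m", filename="CGPT-124m.pt")
state_dict = torch.load(ckpt_path, map_location="cpu")  # plain state_dict saved with torch.save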
TestLossEvaluation.ipynb ADDED
@@ -0,0 +1,258 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "e80a19f9-2837-4418-8edb-f841d280f270",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loaded tokenizer with vocab size 50257\n",
+ "number of parameters: 123542016\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "<All keys matched successfully>"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from modeling_cgpt import GPTConfig, GPT, sample\n",
+ "import torch\n",
+ "import tiktoken\n",
+ "tokenizer = tiktoken.get_encoding(\"r50k_base\") # r50k_base\n",
+ "vocab_size = tokenizer.n_vocab\n",
+ "print(\"Loaded tokenizer with vocab size\", vocab_size)\n",
+ "\n",
+ "config = GPTConfig(\n",
+ "    block_size = 2048,\n",
+ "    vocab_size = 50257,\n",
+ "    n_layer = 12,\n",
+ "    n_head = 12,\n",
+ "    n_embd = 768,\n",
+ "    bias = False,\n",
+ ")\n",
+ "gpt = GPT(config).cuda()\n",
+ "gpt.load_state_dict(torch.load('CGPT-124m.pt'))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "c2bb4711-2845-4405-908b-aa660ebdd39b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "No config specified, defaulting to: the_pile/all\n"
+ ]
+ }
+ ],
+ "source": [
+ "from datasets import load_dataset\n",
+ "dataset = load_dataset('EleutherAI/the_pile', streaming=True, split='test')\n",
+ "\n",
+ "def truncate_or_pad(ids, max_len, eot_token=tokenizer.eot_token):\n",
+ "    if len(ids) < max_len:\n",
+ "        ids = ids+[eot_token]*(max_len-len(ids))\n",
+ "    elif len(ids) > max_len:\n",
+ "        ids = ids[:max_len]\n",
+ "    return ids\n",
+ "\n",
+ "def create_example(text, context_length, eot_token):\n",
+ "    ex = truncate_or_pad(tokenizer.encode(text, allowed_special={'<|endoftext|>'}), context_length, eot_token)\n",
+ "    return torch.tensor(ex)\n",
+ "\n",
+ "class CustomDataloader:\n",
+ "    def __init__(self, dataset):\n",
+ "        self.dataset = iter(dataset)\n",
+ "    def get_next_batch(self, size):\n",
+ "        return [create_example(next(self.dataset)['text'] + ' ' + tokenizer.decode([tokenizer.eot_token]) + ' ' + next(self.dataset)['text'], context_length, tokenizer.eot_token).unsqueeze(0) for i in range(size)]\n",
+ "        # return torch.tensor(tokenizer.encode(next(self.dataset)['text'])[:2048]).unsqueeze(0)\n",
+ "    def iter(self, batch_size, total):\n",
+ "        for i in range(total):\n",
+ "            yield torch.cat(self.get_next_batch(batch_size), 0)\n",
+ "        # yield self.get_next_batch(1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "8f1fdf45-3176-43ed-af8e-528488b210e2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataloader = CustomDataloader(dataset)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "24e360b1-71e9-4ebe-b473-ba7dac8ad5cb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for param in gpt.parameters():\n",
+ "    param.requires_grad = False"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "87d724a8-a9aa-4df4-8e56-2377ec54ae86",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Evaluating on 512 samples from the test set.\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "b144fd8064ff4564ad013304f555d26e",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/512 [00:00<?, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from tqdm.auto import tqdm\n",
+ "import torch.nn.functional as F\n",
+ "context_length=2048\n",
+ "\n",
+ "# approximate it, i dont want this to take hours and hours\n",
+ "bs = 1\n",
+ "steps = 512\n",
+ "print(f\"Evaluating on {steps*bs} samples from the test set.\")\n",
+ "loss_accumulator = 0\n",
+ "for i, X in enumerate(tqdm(dataloader.iter(bs,steps), total=steps)):\n",
+ "    labels = X.cuda()\n",
+ "    logits = gpt(labels)\n",
+ "    shift_logits = logits[..., :-1, :].contiguous()\n",
+ "    shift_labels = labels[..., 1:].contiguous()\n",
+ "    loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))\n",
+ "    loss_accumulator += loss / steps"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "849de6f7-1d91-48be-8d4e-0e8e507843ba",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tensor(1.8915, device='cuda:0')"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "loss_accumulator"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "31221616-e00c-4968-8802-8388b9c524cd",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Text: i hate this movie\n",
+ "Sentiment: negative\n",
+ "\n",
+ "Text: That was Great!\n",
+ "Sentiment: positive\n",
+ "\n",
+ "Text: smells like flowers in here\n",
+ "Sentiment: positive\n",
+ "\n",
+ "Text: oo :3\n",
+ "Sentiment: positive\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "temperature = 0.1\n",
+ "top_k=2\n",
+ "top_p=0.95\n",
+ "max_new_tokens=2\n",
+ "prompt = \"\"\"\n",
+ "Text: i hate this movie\n",
+ "Sentiment: negative\n",
+ "\n",
+ "Text: That was Great!\n",
+ "Sentiment: positive\n",
+ "\n",
+ "Text: smells like flowers in here\n",
+ "Sentiment: positive\n",
+ "\n",
+ "Text: oo :3\n",
+ "Sentiment:\n",
+ "\"\"\".strip()\n",
+ "\n",
+ "input_ids = torch.tensor(tokenizer.encode(prompt)).cuda()\n",
+ "outputs = sample(gpt, input_ids, temperature=temperature, top_k=top_k, top_p=top_p, max_new_tokens=max_new_tokens).flatten().tolist()\n",
+ "output_string = tokenizer.decode(outputs)\n",
+ "print(output_string)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "56d4b992-1212-4d35-833d-fb458f3cd367",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
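For reference, the averaged test loss reported above, roughly 1.8915 nats per token over 512 streamed Pile test samples at a 2048-token context, corresponds to a perplexity of about exp(1.8915) ≈ 6.6. A one-line check:

import math
print(math.exp(1.8915))  # ~6.63, the perplexity implied by the averaged cross-entropy above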
modeling_cgpt.py ADDED
@@ -0,0 +1,236 @@
+ from tqdm.auto import tqdm
+ import tiktoken
+ import math
+ from dataclasses import dataclass
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+ from einops import rearrange
+
+ # rotary positional embedding w/ xpos
+ # https://arxiv.org/abs/2104.09864
+ # https://arxiv.org/abs/2212.10554v1
+
+ def exists(val):
+     return val is not None
+
+ class RotaryEmbedding(nn.Module):
+     def __init__(
+         self,
+         dim,
+         scale_base = 512,
+         use_xpos = True
+     ):
+         super().__init__()
+         inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
+         self.register_buffer("inv_freq", inv_freq)
+
+         self.use_xpos = use_xpos
+         self.scale_base = scale_base
+         scale = (torch.arange(0, dim, 2) + 0.4 * dim) / (1.4 * dim)
+         self.register_buffer('scale', scale)
+
+     @property
+     def device(self):
+         return next(self.buffers()).device
+
+     def forward(self, seq_len):
+         device = self.device
+         t = torch.arange(seq_len, device = device).type_as(self.inv_freq)
+         freqs = torch.einsum('i , j -> i j', t, self.inv_freq)
+         freqs = torch.cat((freqs, freqs), dim = -1)
+
+         if not self.use_xpos:
+             return freqs, torch.ones(1, device = device)
+
+         power = (t - (seq_len // 2)) / self.scale_base
+         scale = self.scale ** rearrange(power, 'n -> n 1')
+         scale = torch.cat((scale, scale), dim = -1)
+
+         return freqs, scale
+
+ def rotate_half(x):
+     x1, x2 = x.chunk(2, dim=-1)
+     return torch.cat((-x2, x1), dim=-1)
+
+ def apply_rotary_pos_emb(pos, t, scale = 1.):
+     return (t * pos.cos() * scale) + (rotate_half(t) * pos.sin() * scale)
+
+
+ #@title minimal GPT implementation in PyTorch (karpathy)
+ """ super minimal decoder-only gpt """
+
+ torch.manual_seed(1337)
+
+ class RMSNorm(nn.Module):
+     def __init__(self, dim):
+         super().__init__()
+         self.scale = dim ** 0.5
+         self.gamma = nn.Parameter(torch.ones(dim))
+
+     def forward(self, x):
+         return F.normalize(x, dim = -1) * self.scale * self.gamma
+
+ class CausalSelfAttention(nn.Module):
+
+     def __init__(self, config):
+         super().__init__()
+         assert config.n_embd % config.n_head == 0
+         # key, query, value projections for all heads, but in a batch
+         self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
+         # output projection
+         self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
+         # regularization
+         self.n_head = config.n_head
+         self.n_embd = config.n_embd
+         self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
+                              .view(1, 1, config.block_size, config.block_size))
+
+     def forward(self, x, rotary_emb=None):
+         B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
+
+         # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+         q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
+         k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+         q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+         v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+
+
+         if exists(rotary_emb):
+             freqs, scale = rotary_emb
+             q = apply_rotary_pos_emb(freqs, q, scale)
+             k = apply_rotary_pos_emb(freqs, k, scale ** -1)
+
+         # manual implementation of attention
+         att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+
+         # apply causal mask
+         att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
+
+         att = F.softmax(att, dim=-1)
+         y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+         y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
+
+         # output projection
+         y = self.c_proj(y)
+         return y
+
+ class MLP(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
+         self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
+         self.nonlin = nn.GELU()
+     def forward(self, x):
+         x = self.c_fc(x)
+         x = self.nonlin(x)
+         x = self.c_proj(x)
+         return x
+
+ class Block(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.ln = RMSNorm(config.n_embd)
+         self.attn = CausalSelfAttention(config)
+         self.mlp = MLP(config)
+     def forward(self, x, rotary_emb=None):
+         lnx = self.ln(x)
+         x = x + self.attn(lnx, rotary_emb) + self.mlp(lnx)
+         return x
+
+
+ @dataclass
+ class GPTConfig:
+     block_size: int = 1024
+     vocab_size: int = 50257
+     n_layer: int = 6
+     n_head: int = 8
+     n_embd: int = 512
+     bias: bool = False
+
+ class GPT(nn.Module):
+
+     def __init__(self, config):
+         super().__init__()
+         assert config.vocab_size is not None
+         assert config.block_size is not None
+         self.config = config
+
+         self.transformer = nn.ModuleDict(dict(
+             wte = nn.Embedding(config.vocab_size, config.n_embd),
+             wpe = RotaryEmbedding(config.n_embd//config.n_head),
+             h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+             ln_f = RMSNorm(config.n_embd),
+         ))
+         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+         self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying
+
+         # init all weights
+         self.apply(self._init_weights)
+         # apply special scaled init to the residual projections, per GPT-2 paper
+         for pn, p in self.named_parameters():
+             if pn.endswith('c_proj.weight'):
+                 torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))
+
+         # report number of parameters
+         print("number of parameters: %d" % (sum(p.nelement() for p in self.parameters()),))
+
+     def _init_weights(self, module):
+         if isinstance(module, nn.Linear):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+             if module.bias is not None:
+                 torch.nn.init.zeros_(module.bias)
+         elif isinstance(module, nn.Embedding):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+
+     def forward(self, idx):
+         device = idx.device
+         b, t = idx.size()
+         assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
+         # pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)
+         pos_emb = self.transformer.wpe(t)
+
+         # forward the GPT model itself
+         tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
+         # pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)
+         x = tok_emb
+         for block in self.transformer.h:
+             x = block(x, rotary_emb=pos_emb)
+         x = self.transformer.ln_f(x)
+         logits = self.lm_head(x)
+         return logits
+
+
+ # probably also from karpathy or maybe max woolf, idk, I've been copy/pasting it between my projects
+ def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
+     assert logits.dim() == 1
+     top_k = min(top_k, logits.size(-1)) # Safety check
+     if top_k > 0:
+         indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
+         logits[indices_to_remove] = filter_value
+
+     if top_p > 0.0:
+         sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+         cumulative_probs = torch.cumsum(F.softmax(sorted_logits.float(), dim=-1), dim=-1)
+         sorted_indices_to_remove = cumulative_probs > top_p
+         sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+         sorted_indices_to_remove[..., 0] = 0
+         indices_to_remove = sorted_indices[sorted_indices_to_remove]
+         logits[indices_to_remove] = filter_value
+     return logits
+
+ def next_token(logits, temperature=1., top_k=0, top_p=0.9):
+     logits = logits / temperature
+     filtered_logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
+     probabilities = F.softmax(filtered_logits.float(), dim=-1)
+     next_token = torch.multinomial(probabilities, 1)
+     return next_token
+
+ def sample(gpt, input_ids, temperature=0.7, top_k=0, top_p=0, max_new_tokens=16):
+     for i in range(max_new_tokens):
+         logits = gpt(input_ids.unsqueeze(0).cuda())[:,-1,:][0] / temperature
+         filtered_logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
+         probabilities = F.softmax(filtered_logits.float(), dim=-1)
+         next_token = torch.multinomial(probabilities, 1)
+         input_ids = torch.cat([input_ids, next_token], -1)
+     return input_ids
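A minimal smoke test of the module above (a sketch, assuming modeling_cgpt.py is importable; it uses the small default GPTConfig on CPU rather than the 124M checkpoint):

import torch
from modeling_cgpt import GPTConfig, GPT

config = GPTConfig()  # defaults: 6 layers, 8 heads, 512-dim embeddings
model = GPT(config)   # prints the parameter count on construction
idx = torch.randint(0, config.vocab_size, (1, 16))  # (batch, sequence) of random token ids
logits = model(idx)
print(logits.shape)   # expected: torch.Size([1, 16, 50257])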