BeardedMonster committed on
Commit
0eea688
·
verified ·
1 Parent(s): 90a4df4

Upload GPTJXForCausalLM

Browse files
Files changed (4) hide show
  1. config.json +6 -2
  2. configuration.py +44 -0
  3. model.safetensors +1 -1
  4. modeling.py +240 -0
config.json CHANGED
@@ -2,17 +2,21 @@
2
  "architectures": [
3
  "GPTJXForCausalLM"
4
  ],
 
 
 
 
5
  "bias": false,
6
  "block_size": 32768,
7
  "dropout": 0.0,
8
  "kv_cache_dtype": "float32",
9
  "max_batch_size": 1,
10
- "model_type": "SabiYarn",
11
  "n_embd": 768,
12
  "n_heads": 12,
13
  "n_layer": 12,
14
  "torch_dtype": "float32",
15
  "transformers_version": "4.41.2",
16
- "use_kv_cache": false,
17
  "vocab_size": 52050
18
  }
 
2
  "architectures": [
3
  "GPTJXForCausalLM"
4
  ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration.GPTJXConfig",
7
+ "AutoModelForCausalLM": "modeling.GPTJXForCausalLM"
8
+ },
9
  "bias": false,
10
  "block_size": 32768,
11
  "dropout": 0.0,
12
  "kv_cache_dtype": "float32",
13
  "max_batch_size": 1,
14
+ "model_type": "sabiyarn",
15
  "n_embd": 768,
16
  "n_heads": 12,
17
  "n_layer": 12,
18
  "torch_dtype": "float32",
19
  "transformers_version": "4.41.2",
20
+ "use_kv_cache": true,
21
  "vocab_size": 52050
22
  }
configuration.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from transformers import PretrainedConfig, PreTrainedModel, AutoConfig, AutoModelForCausalLM
3
+ from transformers.modeling_outputs import CausalLMOutputWithPast
4
+ from typing import List, Optional, Tuple
5
+ from torch import nn
6
+ import torch
7
+ import torch.nn.functional as F
8
+ import math
9
+
10
+ repo_name = "BeardedMonster/SabiYarn-125M"
11
+
12
+
13
class GPTJXConfig(PretrainedConfig):
    """Hugging Face configuration for the SabiYarn (GPT-JX) architecture.

    Holds the hyperparameters needed to instantiate ``GPTJXForCausalLM``.
    """

    model_type = "sabiyarn"

    def __init__(
        self,
        block_size: int = 32768,
        vocab_size: int = 52050,  # GPT-2's 50257, padded up toward a multiple of 64 for efficiency
        n_layer: int = 12,
        n_heads: int = 12,
        n_embd: int = 768,
        dropout: float = 0.0,
        max_batch_size: int = 1,
        use_kv_cache: bool = True,
        bias: bool = False,  # True: biases in Linears/LayerNorms (GPT-2 style); False: a bit better and faster
        kv_cache_dtype: str = "float32",  # "float16" trades precision for cache-memory savings
        **kwargs
    ):
        # Model geometry.
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.n_layer = n_layer
        self.n_heads = n_heads
        self.n_embd = n_embd
        # Regularisation / layer options.
        self.dropout = dropout
        self.bias = bias
        # Inference-time KV-cache settings.
        self.use_kv_cache = use_kv_cache
        self.max_batch_size = max_batch_size
        self.kv_cache_dtype = kv_cache_dtype

        super().__init__(**kwargs)
44
+
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:98e01174c01f2066c677f4f42d3a3a400716fd7392544f6dc025fafd8db557f8
3
  size 600460048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5213d5d6257a3d04c633136ad54312be99c2584c7cd3b1522069bb4da9a9277
3
  size 600460048
modeling.py ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SabiYarn Model Implementation - Optimized Version
3
+ Memory-efficient with performance optimizations for generation.
4
+ Matches original implementation exactly but with memory optimizations.
5
+ """
6
+
7
+ from transformers import PreTrainedModel, AutoConfig, AutoModel, AutoModelForCausalLM
8
+ from transformers.modeling_outputs import CausalLMOutputWithPast
9
+ # use package-relative import to avoid colliding with unrelated `model` packages
10
+ from .configuration import GPTJXConfig
11
+ from typing import Optional
12
+ from torch import nn
13
+ import torch
14
+ import torch.nn.functional as F
15
+ import math
16
+
17
+
18
+ from transformers import AutoConfig, PreTrainedModel, AutoModelForCausalLM
19
+ from typing import List, Optional
20
+ from torch import nn
21
+ # from model import LayerNorm, BlockJ
22
+ from transformers.modeling_outputs import CausalLMOutputWithPast
23
+ import torch
24
+ import math
25
+ from torch.nn import functional as F
26
+ from transformers import AutoConfig, AutoModel
27
+
28
+
29
+
30
+
31
class LayerNorm(nn.Module):
    """Layer normalisation with an optional bias.

    Exists because ``nn.LayerNorm`` offers no ``bias=False`` switch.
    """

    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        if bias:
            # Learnable, zero-initialised shift.
            self.bias = nn.Parameter(torch.zeros(ndim))
        else:
            self.bias = None

    def forward(self, input):
        # Normalise over the last `ndim` dimensions with a fixed eps of 1e-5.
        normalized_shape = self.weight.shape
        return F.layer_norm(input, normalized_shape, self.weight, self.bias, 1e-5)
41
+
42
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with a fused QKV projection.

    Uses PyTorch's fused ``scaled_dot_product_attention`` (Flash/efficient
    kernels) when available (PyTorch >= 2.0) and falls back to a manual
    implementation otherwise.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_heads == 0
        # key, query, value projections for all heads, computed in one batched linear
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        # regularization
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_heads = config.n_heads
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        # flash attention makes GPU go brrrrr but support is only in PyTorch >= 2.0
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')

    def forward(self, x, attn_mask=None):
        """Attend over `x` (B, T, C); `attn_mask` (broadcastable to B, nh, T, T),
        when given, marks *allowed* positions (truthy = attend)."""
        B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_heads, C // self.n_heads).transpose(1, 2)  # (B, nh, T, hs)
        q = q.view(B, T, self.n_heads, C // self.n_heads).transpose(1, 2)  # (B, nh, T, hs)
        v = v.view(B, T, self.n_heads, C // self.n_heads).transpose(1, 2)  # (B, nh, T, hs)

        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        if self.flash:
            if attn_mask is not None:
                # efficient attention using fused CUDA kernels; explicit mask overrides causality
                attn_mask = attn_mask.to(torch.bool)
                y = torch.nn.functional.scaled_dot_product_attention(
                    q, k, v, attn_mask=attn_mask,
                    dropout_p=self.dropout if self.training else 0)
            else:
                y = torch.nn.functional.scaled_dot_product_attention(
                    q, k, v, attn_mask=None,
                    dropout_p=self.dropout if self.training else 0, is_causal=True)
        else:
            # Manual fallback. FIX: the previous version indexed a `self.bias`
            # mask buffer that was never registered (AttributeError on
            # PyTorch < 2.0) and ignored `attn_mask`. Build the mask on the
            # fly instead, mirroring the fused path's semantics.
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            if attn_mask is not None:
                att = att.masked_fill(~attn_mask.to(torch.bool), float('-inf'))
            else:
                causal = torch.tril(torch.ones(T, T, dtype=torch.bool, device=x.device))
                att = att.masked_fill(~causal, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble all head outputs side by side

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y
92
+
93
class MLP(nn.Module):
    """Position-wise feed-forward block: Linear -> GELU -> Linear -> Dropout,
    with the conventional 4x hidden expansion."""

    def __init__(self, config):
        super().__init__()
        hidden_dim = 4 * config.n_embd
        self.c_fc = nn.Linear(config.n_embd, hidden_dim, bias=config.bias)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(hidden_dim, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        # Expand, apply the nonlinearity, project back down, then regularise.
        return self.dropout(self.c_proj(self.gelu(self.c_fc(x))))
108
+
109
class BlockJ(nn.Module):
    """Pre-norm transformer block with an extra per-block 'j' LayerNorm branch
    added into the attention residual."""

    def __init__(self, config):
        super().__init__()
        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
        # NOTE(review): the second argument here is config.n_embd (a truthy int)
        # rather than config.bias, so this LayerNorm always carries a bias term
        # regardless of config.bias. Kept exactly as-is for checkpoint
        # compatibility — confirm whether config.bias was intended.
        self.j = LayerNorm(config.n_embd, config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
        self.mlp = MLP(config)

    def forward(self, x, attn_mask=None):
        residual = x
        normed = self.ln_1(x)
        # Attention and the 'j' branch both consume the same pre-norm activations.
        x = residual + self.attn(normed, attn_mask) + self.j(normed)
        return x + self.mlp(self.ln_2(x))
125
+
126
+
127
class GPTJXForCausalLM(PreTrainedModel):
    """SabiYarn (GPT-JX) causal language model.

    A GPT-2 style decoder-only transformer: learned token + position
    embeddings, a stack of ``BlockJ`` layers, a final LayerNorm, and a
    weight-tied LM head.
    """

    config_class = GPTJXConfig
    base_model_prefix = "transformer"
    is_parallelizable = True
    supports_gradient_checkpointing = True
    _no_split_modules = ["BlockJ"]
    # _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),   # token embeddings
            wpe = nn.Embedding(config.block_size, config.n_embd),   # learned position embeddings
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([BlockJ(config) for _ in range(config.n_layer)]),
            ln_f = LayerNorm(config.n_embd, bias=config.bias),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # Weight tying: the input embedding and output projection share one matrix.
        self.transformer.wte.weight = self.lm_head.weight

        self.apply(self._init_weights)

        # GPT-2 style scaled init for residual projections (per nanoGPT).
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))

    def _init_weights(self, module):
        """Base init: N(0, 0.02) for Linear/Embedding weights, zeros for biases.

        FIX: the c_proj re-init in ``__init__`` assumes this base init, but
        the class previously inherited PreTrainedModel's no-op default, so
        freshly constructed (non-pretrained) weights kept PyTorch defaults.
        Loaded checkpoints are unaffected.
        """
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def get_num_params(self, non_embedding=True):
        """
        Return the number of parameters in the model.
        For non-embedding count (default), the position embeddings get subtracted.
        The token embeddings would too, except due to the parameter sharing these
        params are actually used as weights in the final layer, so we include them.
        """
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.wpe.weight.numel()
        return n_params

    def get_input_embeddings(self):
        # FIX: embeddings live at self.transformer.wte; the previous
        # `self.wte` attribute does not exist and raised AttributeError.
        return self.transformer.wte

    def set_input_embeddings(self, new_embeddings):
        # FIX: write to the real module, and keep the lm_head tied to it.
        self.transformer.wte = new_embeddings
        self.lm_head.weight = new_embeddings.weight

    def forward(self, idx, targets=None, attn_mask=None, output_hidden_states: Optional[bool] = None, **kwargs):
        """Run the model on token ids `idx` (b, t).

        When `targets` is given, computes full-sequence logits and a
        cross-entropy loss (ignore_index=-100); otherwise only the last
        position is projected through the LM head.
        """
        device = idx.device
        b, t = idx.size()

        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device)  # shape (t,)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx)  # (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos)  # (t, n_embd), broadcast over batch
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x, attn_mask=attn_mask)
        x = self.transformer.ln_f(x)

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-100)
        else:
            # inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :])  # note: using list [-1] to preserve the time dim
            loss = None

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            hidden_states=x if output_hidden_states else None,
            attentions=None,
        )

    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **kwargs):
        """Map HF generate() kwargs onto this model's forward signature."""
        model_inputs = {"idx": input_ids}
        if attention_mask is not None:
            model_inputs["attn_mask"] = attention_mask
        return model_inputs

    def crop_block_size(self, block_size):
        """Shrink the supported context length by truncating position embeddings."""
        assert block_size <= self.config.block_size
        self.config.block_size = block_size
        self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size])
        for block in self.transformer.h:
            # Legacy path: only relevant if an attention mask buffer exists.
            if hasattr(block.attn, 'bias'):
                block.attn.bias = block.attn.bias[:, :, :block_size, :block_size]
231
+
232
+
233
+ AutoConfig.register("sabiyarn", GPTJXConfig)
234
+ AutoModel.register(GPTJXConfig,GPTJXForCausalLM)
235
+ AutoModelForCausalLM.register(GPTJXConfig, GPTJXForCausalLM)
236
+
237
+
238
+
239
+
240
+