Fill-Mask · Transformers · Safetensors · English · avey-b · custom_code
Devang Acharya committed · Commit 4125545 · verified · 1 Parent(s): f5b038c

Upload folder using huggingface_hub

added_tokens.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "<|endoftext|>": 50280,
+   "[MASK]": 50281
+ }
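
These two entries extend the base vocabulary with the end-of-text and mask special tokens. As a quick sanity check one could verify the ids after loading the tokenizer; the local path below is a placeholder for a checkout of this repo:

from transformers import AutoTokenizer

# "./avey-b" is a hypothetical local checkout of this repo
tok = AutoTokenizer.from_pretrained("./avey-b")
assert tok.convert_tokens_to_ids("<|endoftext|>") == 50280
assert tok.convert_tokens_to_ids("[MASK]") == 50281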
config.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "architectures": ["AveyForMaskedLM"],
+   "auto_map": {
+     "AutoConfig": "configuration_avey.AveyConfig",
+     "AutoModel": "modeling_avey.AveyModel",
+     "AutoModelForMaskedLM": "modeling_avey.AveyForMaskedLM",
+     "AutoModelForSequenceClassification": "modeling_avey.AveyForSequenceClassification",
+     "AutoModelForTokenClassification": "modeling_avey.AveyForTokenClassification"
+   },
+   "chunk_size": 256,
+   "context_proportion": 0.5,
+   "d_embed": 768,
+   "dtype": "float32",
+   "eps": 1e-12,
+   "expansion_factor": 4,
+   "hidden_size": 768,
+   "k": 3,
+   "max_position_embeddings": 4294967296,
+   "model_type": "avey-b",
+   "n_layers": 30,
+   "transformers_version": "4.57.1",
+   "vocab_size": 50368
+ }
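
Because auto_map points every Auto* class at code shipped inside the repo, loading this checkpoint requires trust_remote_code=True. A minimal loading sketch; the repo id is a placeholder for wherever the checkpoint is actually hosted:

from transformers import AutoModelForMaskedLM, AutoTokenizer

repo = "user/avey-b"  # hypothetical repo id
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModelForMaskedLM.from_pretrained(repo, trust_remote_code=True)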
configuration_avey.py ADDED
@@ -0,0 +1,28 @@
+ from transformers import PretrainedConfig
+
+
+ class AveyConfig(PretrainedConfig):
+     model_type = "avey-b"
+
+     def __init__(
+         self,
+         vocab_size: int = 50304,
+         context_len: int = 512,
+         d_embed: int = 768,
+         n_layers: int = 26,
+         chunk_size: int = 128,
+         k: int = 3,
+         eps: float = 1e-12,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.d_embed = d_embed
+         self.n_layers = n_layers
+         self.chunk_size = chunk_size
+         self.k = k
+         self.eps = eps
+
+         # for compatibility with the eval lib
+         self.max_position_embeddings = context_len
+         self.hidden_size = d_embed
+         super().__init__(**kwargs)
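
Note that the shipped config.json overrides several of these defaults (vocab_size 50368 vs. 50304, n_layers 30 vs. 26, chunk_size 256 vs. 128), and extra keys such as context_proportion and expansion_factor are absorbed through **kwargs by PretrainedConfig. A sketch of direct instantiation with the checkpoint's values, assuming configuration_avey.py is on the import path:

from configuration_avey import AveyConfig

# values taken from the uploaded config.json
config = AveyConfig(vocab_size=50368, n_layers=30, chunk_size=256)
print(config.hidden_size)              # 768, aliased from d_embed
print(config.max_position_embeddings)  # 512, the context_len default since we didn't pass it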
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e153bbf63e4b2ffb4548e01f5aaf3bb9273bf9523e1c42f505d427dc4d0c07af
+ size 655734680
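
The LFS pointer records a 655,734,680-byte weight file; at 4 bytes per float32 parameter that is roughly 164M parameters. After fetching the real blob (e.g. via huggingface_hub), the recorded oid can be checked locally; a minimal sketch assuming the file sits in the working directory:

import hashlib

# stream the file so the ~625 MB blob isn't held in memory at once
h = hashlib.sha256()
with open("model.safetensors", "rb") as f:
    for block in iter(lambda: f.read(1 << 20), b""):
        h.update(block)
assert h.hexdigest() == "e153bbf63e4b2ffb4548e01f5aaf3bb9273bf9523e1c42f505d427dc4d0c07af"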
modeling_avey.py ADDED
@@ -0,0 +1,294 @@
+ from .configuration_avey import AveyConfig
+ import torch
+ from torch import nn
+ import torch.nn.functional as F
+ from transformers import PreTrainedModel
+ from transformers.modeling_outputs import (
+     BaseModelOutput,
+     MaskedLMOutput,
+     SequenceClassifierOutput,
+     TokenClassifierOutput,
+ )
+ from torch.nn import (
+     BCEWithLogitsLoss,
+     CrossEntropyLoss,
+     MSELoss,
+ )
+
+
+ class StaticLayer(nn.Module):
+     """Token mixing through a learned position-wise (input-independent) projection."""
+
+     def __init__(self, config: AveyConfig):
+         super().__init__()
+         self.norm = nn.RMSNorm(config.d_embed, eps=config.eps)
+         self.enricher = nn.Linear(config.d_embed, config.d_embed * 4)
+
+         proj_size = config.chunk_size
+         self.spatial_proj = nn.Parameter(torch.empty(proj_size, proj_size))
+         nn.init.xavier_normal_(self.spatial_proj)
+
+         self.fuser = nn.Linear(int(config.d_embed * 3), config.d_embed)
+         self.alpha = nn.Parameter(torch.tensor(1.0))
+
+     @torch.compile
+     def forward(self, x):
+         _, T, _ = x.shape
+         res = x
+         x = self.norm(x)
+         x = self.enricher(x)
+         x = F.gelu(x)
+         # split the 4E expansion into a 2E mixing branch and a 2E bypass branch
+         x, bypass = x.chunk(2, dim=-1)
+
+         x, gate = x.chunk(2, dim=-1)
+         x = self.spatial_proj[:T, :T] @ x
+         x = gate * x
+
+         x = torch.cat([x, bypass], dim=-1)
+         x = self.fuser(x)
+         return x + (self.alpha * res)
+
+
+ class DynamicLayer(nn.Module):
+     """Token mixing through data-dependent cosine-similarity weights."""
+
+     def __init__(self, config: AveyConfig):
+         super().__init__()
+         self.norm = nn.RMSNorm(config.d_embed, eps=config.eps)
+         self.enricher = nn.Linear(config.d_embed, config.d_embed * 4)
+         self.fuser = nn.Linear(config.d_embed * 3, config.d_embed)
+         self.alpha = nn.Parameter(torch.tensor(1.0))
+
+     @torch.compile
+     def forward(self, x):
+         _, T, _ = x.shape
+         res = x
+         x = self.norm(x)
+         x = self.enricher(x)
+         x = F.gelu(x)
+         x, bypass = x.chunk(2, dim=-1)
+
+         x, gate = x.chunk(2, dim=-1)
+         # pairwise cosine similarities, L1-normalized into mixing weights
+         x_norm = F.normalize(x, p=2, dim=-1)
+         sim_scores = x_norm @ x_norm.mT
+         x = F.normalize(sim_scores, p=1, dim=-1) @ x
+         x = gate * x
+
+         x = torch.cat([x, bypass], dim=-1)
+         x = self.fuser(x)
+         return x + (self.alpha * res)
+
+
+ class Ranker(nn.Module):
+     """Splits the sequence into chunks and extends each chunk with its top-k most similar predecessors."""
+
+     def __init__(self, config):
+         super().__init__()
+         self.chunk_size = config.chunk_size
+         self.k = config.k + 1
+         self.extended_len = self.k * config.chunk_size
+         self.eps = config.eps
+         self.down_proj = nn.Parameter(torch.empty(self.chunk_size, self.extended_len))
+         nn.init.xavier_normal_(self.down_proj)
+
+     def preprocess(self, x):
+         B, T, E = x.shape
+         cs, L = self.chunk_size, self.extended_len
+
+         padded = False
+         orig_T = T
+         if T % cs != 0:
+             pad_len = cs - (T % cs)
+             pad = torch.zeros(B, pad_len, E, device=x.device, dtype=x.dtype)
+             x = torch.cat([x, pad], dim=1)
+             T += pad_len
+             padded = True
+
+         N = T // cs
+         x_chunks = x.view(B, N, cs, E)
+
+         extended = []
+         for i in range(0, N):
+             cur = x_chunks[:, i]
+             others = x_chunks[:, :i]
+             cat = self._extend(others, cur)  # (B, ≤k⋅cs+cs, E)
+
+             # pad or truncate to length L
+             cur_len = cat.size(1)
+             if cur_len < L:
+                 pad2 = torch.zeros(B, L - cur_len, E, device=x.device, dtype=x.dtype)
+                 cat = torch.cat([pad2, cat], dim=1)
+             else:
+                 cat = cat[:, -L:]
+
+             extended.append(cat)
+
+         ext = torch.stack(extended, dim=1)  # (B, N, L, E)
+         ext = (self.down_proj @ ext) + x_chunks
+         h = ext.view(B * N, cs, E)
+
+         state = {
+             "B": B,
+             "N": N,
+             "orig_T": orig_T,
+             "padded": padded,
+         }
+         return h, state
+
+     def contract(self, h, st):
+         B, cs = st["B"], self.chunk_size
+         N = st["N"]
+         padded = st["padded"]
+         orig_T = st["orig_T"]
+
+         E = h.size(-1)
+         final_chunks = h.view(B, N, cs, E)
+
+         out = final_chunks.reshape(B, N * cs, E)
+
+         if padded:
+             out = out[:, :orig_T, :]
+
+         return out
+
+     def _extend(self, other_chunks, cur_chunk):
+         B, cs, E = cur_chunk.shape
+         if other_chunks is None or other_chunks.size(1) == 0:
+             return cur_chunk
+
+         i = other_chunks.size(1)
+         num_sel = min(i, self.k - 1)
+         if num_sel <= 0:
+             return cur_chunk
+
+         # l2 normalize
+         cn = other_chunks / (other_chunks.norm(dim=-1, keepdim=True) + self.eps)
+         cm = cur_chunk / (cur_chunk.norm(dim=-1, keepdim=True) + self.eps)
+
+         # cosine sim
+         cm_e = cm.unsqueeze(1)         # (B, 1, cs, E)
+         ct = cn.transpose(-1, -2)      # (B, i, E, cs)
+         sims = torch.matmul(cm_e, ct)  # (B, i, cs, cs)
+         mx, _ = sims.max(dim=-1)       # (B, i, cs)
+         scores = mx.sum(dim=-1)        # (B, i)
+
+         # topk
+         topk_vals, topk_idx = scores.topk(num_sel, dim=1)
+
+         # normalize weights
+         v_min = topk_vals.min(dim=-1, keepdim=True)[0]  # (B, 1)
+         w = topk_vals / (v_min + self.eps)              # (B, num_sel)
+         w = w.unsqueeze(-1).unsqueeze(-1)               # (B, num_sel, 1, 1)
+
+         # gather
+         idx_e = topk_idx.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, cs, E)
+         sel = other_chunks.gather(1, idx_e)  # (B, num_sel, cs, E)
+
+         # weight & flatten
+         wt = (sel * w).reshape(B, num_sel * cs, E)
+
+         return torch.cat([wt, cur_chunk], dim=1)  # (B, ≤k⋅cs+cs, E)
+
+
+ class AveyPreTrainedModel(PreTrainedModel):
+     config_class = AveyConfig
+
+     def _init_weights(self, module):
+         if isinstance(module, nn.Linear):
+             nn.init.xavier_normal_(module.weight)
+             if module.bias is not None:
+                 module.bias.data.zero_()
+         elif isinstance(module, nn.Embedding):
+             nn.init.xavier_normal_(module.weight)
+             if module.padding_idx is not None:
+                 module.weight.data[module.padding_idx].zero_()
+
+
+ class AveyModel(AveyPreTrainedModel):
+     def __init__(self, config):
+         super().__init__(config)
+         self.chunk_size = config.chunk_size
+         self.embed = nn.Embedding(config.vocab_size, config.d_embed)
+         # alternate static (odd-indexed) and dynamic (even-indexed) mixing layers
+         self.layers = nn.ModuleList([
+             DynamicLayer(config) if (i + 1) % 2 == 0 else StaticLayer(config)
+             for i in range(config.n_layers)
+         ])
+         self.ranker = Ranker(config)
+         self.apply(self._init_weights)
+
+     def _get_hidden(self, input_ids):
+         x = self.embed(input_ids)
+         x, state = self.ranker.preprocess(x)
+         for layer in self.layers:
+             x = layer(x)
+         x = self.ranker.contract(x, state)
+         return x
+
+     def forward(self, input_ids, **kwargs):
+         x = self._get_hidden(input_ids)
+         return BaseModelOutput(last_hidden_state=x)
+
+
+ class AveyForMaskedLM(AveyModel):
+     def __init__(self, config):
+         super().__init__(config)
+         self.apply(self._init_weights)
+
+     def forward(self, input_ids, labels=None, **kwargs):
+         x = self._get_hidden(input_ids)
+         # weight-tied LM head: reuse the input embedding matrix
+         logits = F.linear(x, self.embed.weight)
+
+         loss = None
+         if labels is not None:
+             loss = F.cross_entropy(
+                 logits.view(-1, logits.size(-1)), labels.view(-1), ignore_index=-100
+             )
+
+         return MaskedLMOutput(logits=logits, loss=loss)
+
+
+ class AveyForSequenceClassification(AveyModel):
+     def __init__(self, config):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+         self.dense = nn.Sequential(
+             nn.Linear(self.config.d_embed, self.config.d_embed * 2),
+             nn.GELU(),
+             nn.Linear(self.config.d_embed * 2, self.config.d_embed * 2),
+             nn.GELU(),
+             nn.Linear(self.config.d_embed * 2, self.config.d_embed),
+         )
+         self.classifier = nn.Linear(config.d_embed, config.num_labels)
+         self.apply(self._init_weights)
+
+     def forward(self, input_ids, labels=None, **kwargs):
+         x = self._get_hidden(input_ids)
+         x = x.mean(dim=1)  # mean-pool over tokens
+         x = self.dense(x)
+         logits = self.classifier(x)
+
+         loss = None
+         if labels is not None:
+             if self.num_labels == 1:
+                 loss = MSELoss()(logits.squeeze(), labels.squeeze())
+             elif labels.dtype in (torch.long, torch.int):
+                 loss = CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))
+             else:
+                 loss = BCEWithLogitsLoss()(logits, labels)
+
+         return SequenceClassifierOutput(logits=logits, loss=loss)
+
+
+ class AveyForTokenClassification(AveyModel):
+     def __init__(self, config):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+         self.dense = nn.Sequential(
+             nn.Linear(config.d_embed, config.d_embed),
+             nn.Tanh(),
+         )
+         self.classifier = nn.Linear(config.d_embed, config.num_labels)
+         self.apply(self._init_weights)
+
+     def forward(self, input_ids, labels=None, **kwargs):
+         x = self._get_hidden(input_ids)
+         x = self.dense(x)
+         logits = self.classifier(x)
+
+         loss = None
+         if labels is not None:
+             loss = CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))
+
+         return TokenClassifierOutput(logits=logits, loss=loss)
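
End-to-end fill-mask usage, matching the task tag at the top of the page. The repo id is again a placeholder, and note that forward only consumes input_ids (an attention_mask would be silently ignored via **kwargs):

import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

repo = "user/avey-b"  # hypothetical repo id
tok = AutoTokenizer.from_pretrained(repo)
model = AutoModelForMaskedLM.from_pretrained(repo, trust_remote_code=True).eval()

text = "Paris is the capital of [MASK]."
ids = tok(text, return_tensors="pt")["input_ids"]
with torch.no_grad():
    logits = model(ids).logits

# locate the mask position and read off the top-5 predictions
mask_pos = (ids[0] == tok.mask_token_id).nonzero(as_tuple=True)[0]
top = logits[0, mask_pos].topk(5, dim=-1).indices[0]
print([tok.decode(t) for t in top])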
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "50280": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50281": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<|endoftext|>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|endoftext|>",
+   "extra_special_tokens": {},
+   "mask_token": "[MASK]",
+   "model_max_length": 4294967296,
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": "<|endoftext|>"
+ }
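
The tokenizer is a standard GPT-2 BPE (tokenizer_class GPT2Tokenizer) plus the two added specials, and model_max_length mirrors the config's max_position_embeddings of 4294967296, i.e. 2**32, effectively no length cap. Loading through the concrete class works the same as through AutoTokenizer; the local path is a placeholder:

from transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("./avey-b")  # hypothetical local checkout
print(tok.mask_token, tok.mask_token_id)  # [MASK] 50281
print(tok.model_max_length)               # 4294967296 == 2**32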
vocab.json ADDED
The diff for this file is too large to render. See raw diff