trixyL committed
Commit 3b0f576 · 0 Parent(s)

add: model lfs
Files changed (6)
  1. .gitattributes +35 -0
  2. merges.txt +0 -0
  3. model.safetensors +3 -0
  4. modelling_llada.py +457 -0
  5. special_tokens.json +271 -0
  6. vocab.json +0 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
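
Note: the patterns above are what route large binary artifacts (including model.safetensors below) through Git LFS. As a rough illustration only (not part of this commit), a filename can be checked against a subset of these patterns from Python; the pattern list and helper below are assumptions for the example, and real gitattributes matching is richer than fnmatch.

# Illustration only: approximate check of whether a filename matches the
# LFS patterns declared in the .gitattributes above (subset of patterns).
from fnmatch import fnmatch

LFS_PATTERNS = ["*.bin", "*.safetensors", "*.pt", "*.pth", "*.onnx", "*tfevents*"]

def is_lfs_tracked(name: str) -> bool:
    # Real .gitattributes matching also handles paths like "saved_model/**/*";
    # this helper only approximates the flat-filename cases.
    return any(fnmatch(name, pattern) for pattern in LFS_PATTERNS)

print(is_lfs_tracked("model.safetensors"))  # True
print(is_lfs_tracked("vocab.json"))         # False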
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e9eef7010bd80b2d736ce95af88b32b2af738406247c7fdc9f6f2b84922c61fc
+ size 16031197344
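
These three lines are only the Git LFS pointer (spec version, sha256 oid, and byte size, roughly 16 GB); the actual weights live in LFS storage. Assuming the real file has been fetched locally (for example via git lfs pull), a minimal sketch for inspecting its contents with the safetensors library:

# Minimal sketch, assuming the LFS pointer has been replaced by the real
# weight file and the `safetensors` package is installed.
from safetensors import safe_open

with safe_open("model.safetensors", framework="pt", device="cpu") as f:
    for name in f.keys():
        tensor = f.get_tensor(name)
        print(name, tuple(tensor.shape), tensor.dtype)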
modelling_llada.py ADDED
@@ -0,0 +1,457 @@
+ # adapted from https://huggingface.co/GSAI-ML/LLaDA-8B-Base/blob/main/modeling_llada.py
+
+ from typing import Optional, Tuple
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch import einsum
+
+ class RMSLayerNorm(nn.Module):
+     """
+     RMS layer norm, a simplified :class:`LayerNorm` implementation
+     """
+
+     def __init__(
+         self,
+         d_model: int,
+         eps: float = 1e-5,
+         device: torch.device = "cuda",
+     ):
+         super().__init__()
+         self.eps = eps
+         self.weight = nn.Parameter(torch.ones(d_model, device=device))
+
+         nn.init.ones_(self.weight)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         og_dtype = x.dtype
+         x = x.to(torch.float32)
+         variance = x.pow(2).mean(-1, keepdim=True)
+         x = x * torch.rsqrt(variance + self.eps)
+         x = x.to(og_dtype)
+
+         return self.weight * x
+
+ class RotaryEmbedding(nn.Module):
+     def __init__(
+         self,
+         rope_theta: float,
+         d_model: int,
+         n_heads: int,
+         max_sequence_length: int,
+         device: torch.device
+     ):
+         super().__init__()
+         self.rope_theta = rope_theta
+         self.d_model = d_model
+         self.n_heads = n_heads
+         self.get_rotary_embedding(max_sequence_length, device)
+
+     def get_rotary_embedding(self, seq_len: int, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]:
+
+         dim = self.d_model // self.n_heads
+         inv_freq = 1.0 / (self.rope_theta ** (torch.arange(0, dim, 2, device=device, dtype=torch.float) / dim))
+         seq = torch.arange(seq_len, device=device, dtype=torch.float)
+         freqs = einsum("i , j -> i j", seq, inv_freq)
+         positions = torch.cat((freqs, freqs), dim=-1)
+         pos_sin, pos_cos = positions.sin()[None, None, :, :], positions.cos()[None, None, :, :]
+
+         return pos_sin, pos_cos
+
+     def rotate_half(self, x: torch.Tensor) -> torch.Tensor:
+         B, nh, T, hs = x.size()
+         x = x.view(B, nh, T, 2, hs // 2)
+         x1, x2 = x.unbind(dim=-2)
+
+         return torch.cat((-x2, x1), dim=-1)
+
+     def apply_rotary_pos_emb(self, pos_sin: torch.Tensor, pos_cos: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
+         return ((t * pos_cos) + (self.rotate_half(t) * pos_sin)).to(t.dtype)
+
+     def forward(self, q: torch.Tensor, k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+         query_len, key_len = q.shape[-2], k.shape[-2]  # could be different if layer_past not None
+         pos_sin, pos_cos = self.get_rotary_embedding(key_len, q.device)
+         pos_sin = pos_sin.type_as(q)
+         pos_cos = pos_cos.type_as(q)
+         q = self.apply_rotary_pos_emb(
+             pos_sin[:, :, key_len - query_len : key_len, :],
+             pos_cos[:, :, key_len - query_len : key_len, :],
+             q,
+         )
+         k = self.apply_rotary_pos_emb(pos_sin, pos_cos, k)
+
+         return q.type_as(q), k.type_as(k)
+
+ class SwiGLU(nn.Module):
+     def __init__(self):
+         super().__init__()
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x, gate = x.chunk(2, dim=-1)
+         return F.silu(gate) * x
+
+     @property
+     def output_multiplier(self) -> float:
+         return 0.5
+
+
+ class SiLU(nn.SiLU):
+     @property
+     def output_multiplier(self) -> float:
+         return 1.0
+
+
+ class LLaDALlamaBlock(nn.Module):
+     """
+     This is a transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))``
+     (plus another skip connection). This block is similar to `LLaDASequentialBlock`
+     but some operations have slightly different implementations to imitate the
+     behavior of Llama.
+     """
+
+     def __init__(
+         self,
+         layer_id: int,
+         mlp_ratio: int,
+         d_model: int,
+         n_heads: int,
+         rope_theta: float,
+         max_sequence_length: int,
+         mlp_hidden_size: Optional[int],
+         device: torch.device,
+     ):
+         super().__init__()
+         self.layer_id = layer_id
+         self.hidden_size = (
+             mlp_hidden_size if mlp_hidden_size is not None else mlp_ratio * d_model
+         )
+         assert d_model % n_heads == 0
+
+         self.n_heads = n_heads
+
+         # Activation function.
+         self.act = SiLU()
+         assert (self.act.output_multiplier * self.hidden_size) % 1 == 0
+
+         # Attention output projection.
+         self.attn_out = nn.Linear(
+             d_model, d_model, bias=False, device=device
+         )
+
+         # Feed-forward output projection.
+         self.ff_out = nn.Linear(
+             int(self.act.output_multiplier * self.hidden_size),
+             d_model,
+             bias=False,
+             device=device,
+         )
+         self.ff_out._is_residual = True
+
+         # Rotary embeddings.
+         self.rotary_emb = RotaryEmbedding(rope_theta=rope_theta, d_model=d_model, n_heads=n_heads, max_sequence_length=max_sequence_length, device=device)
+
+         # Layer norms.
+         self.attn_norm = RMSLayerNorm(d_model=d_model, device=device)
+         self.ff_norm = RMSLayerNorm(d_model=d_model, device=device)
+
+         # Attention input projection. Projects x -> (q, k, v)
+         q_proj_out_dim = d_model
+         k_proj_out_dim = d_model
+         v_proj_out_dim = d_model
+         self.q_proj = nn.Linear(
+             d_model, q_proj_out_dim, bias=False, device=device,
+         )
+         self.k_proj = nn.Linear(
+             d_model, k_proj_out_dim, bias=False, device=device,
+         )
+         self.v_proj = nn.Linear(
+             d_model, v_proj_out_dim, bias=False, device=device,
+         )
+
+         # Feed-forward input projection.
+         self.ff_proj = nn.Linear(
+             d_model, self.hidden_size, bias=False, device=device
+         )
+         # new add
+         self.up_proj = nn.Linear(
+             d_model, self.hidden_size, bias=False, device=device,
+         )
+
+     def attention(
+         self,
+         q: torch.Tensor,
+         k: torch.Tensor,
+         v: torch.Tensor,
+     ) -> torch.Tensor:
+         B, T, C = q.size()  # batch size, sequence length, d_model
+
+         # Move head forward to be next to the batch dim.
+         # shape: (B, nh, T, hs)
+         q = q.view(B, T, self.n_heads, C // self.n_heads).transpose(1, 2)
+         # shape: (B, n_kv_h, T, hs)
+         k = k.view(B, T, self.n_heads, C // self.n_heads).transpose(1, 2)
+         # shape: (B, n_kv_h, T, hs)
+         v = v.view(B, T, self.n_heads, C // self.n_heads).transpose(1, 2)
+
+         q, k = self.rotary_emb(q, k)
+
+         # Get the attention scores.
+         # shape: (B, nh, T, hs)
+         att = F.scaled_dot_product_attention(
+             q,
+             k,
+             v,
+             attn_mask=None,
+             dropout_p=0.0,
+             is_causal=False,
+         )
+
+         # Re-assemble all head outputs side-by-side.
+         att = att.transpose(1, 2).contiguous().view(B, T, C)
+
+         # Apply output projection.
+         return self.attn_out(att)
+
+     def forward(
+         self,
+         x: torch.Tensor,
+     ) -> torch.Tensor:
+         x_normed = self.attn_norm(x)
+         q = self.q_proj(x_normed)
+         k = self.k_proj(x_normed)
+         v = self.v_proj(x_normed)
+
+         att = self.attention(q, k, v)
+
+         # Add attention scores.
+         # shape: (B, T, C)
+         x = x + att
+
+         # Add feed-forward projection.
+         # shape: (batch_size, seq_len, d_model)
+         og_x = x
+         x = self.ff_norm(x)
+         x, x_up = self.ff_proj(x), self.up_proj(x)  # new add
+
+         x = self.act(x)
+         x = x * x_up  # new add
+         x = self.ff_out(x)
+         x = og_x + x
+
+         return x
+
+
+ class LLaDASequentialBlock(nn.Module):
+     """
+     This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))``
+     (plus another skip connection).
+     """
+
+     def __init__(
+         self,
+         layer_id: int,
+         mlp_ratio: int,
+         d_model: int,
+         n_heads: int,
+         rope_theta: float,
+         max_sequence_length: int,
+         mlp_hidden_size: Optional[int],
+         device: torch.device,
+     ):
+         super().__init__()
+         self.layer_id = layer_id
+         self.hidden_size = (
+             mlp_hidden_size if mlp_hidden_size is not None else mlp_ratio * d_model
+         )
+         assert d_model % n_heads == 0
+
+         self.n_heads = n_heads
+
+         # Activation function.
+         self.act = SwiGLU()
+         assert (self.act.output_multiplier * self.hidden_size) % 1 == 0
+
+         # Attention output projection.
+         self.attn_out = nn.Linear(
+             d_model, d_model, bias=False, device=device
+         )
+
+         # Feed-forward output projection.
+         self.ff_out = nn.Linear(
+             int(self.act.output_multiplier * self.hidden_size),
+             d_model,
+             bias=False,
+             device=device,
+         )
+         self.ff_out._is_residual = True
+
+         # Rotary embeddings.
+         self.rotary_emb = RotaryEmbedding(rope_theta=rope_theta, d_model=d_model, n_heads=n_heads, max_sequence_length=max_sequence_length, device=device)
+
+         # Layer norms.
+         self.attn_norm = RMSLayerNorm(d_model=d_model, device=device)
+         self.ff_norm = RMSLayerNorm(d_model=d_model, device=device)
+
+         # Attention input projection. Projects x -> (q, k, v)
+         self.fused_dims = (
+             d_model,
+             d_model,
+             d_model,
+         )
+         self.att_proj = nn.Linear(
+             d_model, sum(self.fused_dims), bias=False, device=device
+         )
+
+         # Feed-forward input projection.
+         self.ff_proj = nn.Linear(
+             d_model, self.hidden_size, bias=False, device=device
+         )
+
+     def attention(
+         self,
+         q: torch.Tensor,
+         k: torch.Tensor,
+         v: torch.Tensor,
+     ) -> torch.Tensor:
+         B, T, C = q.size()  # batch size, sequence length, d_model
+
+         # Move head forward to be next to the batch dim.
+         # shape: (B, nh, T, hs)
+         q = q.view(B, T, self.n_heads, C // self.n_heads).transpose(1, 2)
+         # shape: (B, n_kv_h, T, hs)
+         k = k.view(B, T, self.n_heads, C // self.n_heads).transpose(1, 2)
+         # shape: (B, n_kv_h, T, hs)
+         v = v.view(B, T, self.n_heads, C // self.n_heads).transpose(1, 2)
+
+         q, k = self.rotary_emb(q, k)
+
+         # Get the attention scores.
+         # shape: (B, nh, T, hs)
+         att = F.scaled_dot_product_attention(
+             q,
+             k,
+             v,
+             attn_mask=None,
+             dropout_p=0.0,
+             is_causal=False,
+         )
+
+         # Re-assemble all head outputs side-by-side.
+         att = att.transpose(1, 2).contiguous().view(B, T, C)
+
+         # Apply output projection.
+         return self.attn_out(att)
+
+     def forward(
+         self,
+         x: torch.Tensor,
+     ) -> torch.Tensor:
+         q, k, v = self.att_proj(self.attn_norm(x)).split(self.fused_dims, dim=-1)
+
+         att = self.attention(q, k, v)
+
+         # Add attention scores.
+         # shape: (B, T, C)
+         x = x + att
+
+         # Add feed-forward projection.
+         # shape: (batch_size, seq_len, d_model)
+         og_x = x
+         x = self.ff_norm(x)
+         x = self.ff_proj(x)
+
+         x = self.act(x)
+         x = self.ff_out(x)
+         x = og_x + x
+
+         return x
+
+ class LLaDAModel(nn.Module):
+     def __init__(
+         self,
+         mlp_ratio: int,
+         d_model: int,
+         n_heads: int,
+         rope_theta: float,
+         max_sequence_length: int,
+         vocab_size: int,
+         n_layers: int,
+         mlp_hidden_size: Optional[int],
+         device: torch.device,
+     ):
+         super().__init__()
+         self.transformer = nn.ModuleDict(
+             dict(
+                 wte=nn.Embedding(
+                     vocab_size, d_model, device=device
+                 ),
+                 ln_f=RMSLayerNorm(d_model=d_model, device=device),
+             )
+         )
+
+         blocks = [
+             LLaDALlamaBlock(
+                 layer_id=i,
+                 mlp_ratio=mlp_ratio,
+                 d_model=d_model,
+                 n_heads=n_heads,
+                 rope_theta=rope_theta,
+                 max_sequence_length=max_sequence_length,
+                 mlp_hidden_size=mlp_hidden_size,
+                 device=device,
+             )
+             for i in range(n_layers)
+         ]
+         self.transformer.update({"blocks": nn.ModuleList(blocks)})
+
+         self.transformer.update(
+             {
+                 "ff_out": nn.Linear(
+                     d_model,
+                     vocab_size,
+                     bias=False,
+                     device=device,
+                 )
+             }
+         )
+
+     @property
+     def device(self) -> torch.device:
+         device: torch.device = self.transformer.wte.weight.device  # type: ignore
+         return device
+
+     def forward(
+         self,
+         input_ids: torch.LongTensor,
+         last_logits_only: bool = False,
+     ) -> torch.Tensor:
+         """
+         :param input_ids: A tensor of shape `(batch_size, seq_len)`.
+         :param last_logits_only: If `True`, only compute the logits for the last token of each sequence.
+             This can speed up decoding when you only care about the next token.
+         """
+
+         # Get embeddings of input.
+         # shape: (batch_size, seq_len, d_model)
+         x = self.transformer.wte(input_ids)
+
+         for block_idx, block in enumerate(self.transformer.blocks):
+             x = block(x)
+
+         if last_logits_only:
+             # shape: (batch_size, 1, d_model)
+             x = x[:, -1, :].unsqueeze(1)
+
+         # Apply final layer norm.
+         # shape: (batch_size, seq_len or 1, d_model)
+         x = self.transformer.ln_f(x)  # type: ignore
+
+         logits = self.transformer.ff_out(x)  # type: ignore
+
+         return logits
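
modelling_llada.py only defines the network; no configuration file is part of this commit, so the concrete hyperparameters are unknown here. A minimal smoke-test sketch for the LLaDAModel class above, where every numeric value is an assumed placeholder rather than this repository's actual configuration:

# Smoke-test sketch for LLaDAModel; all hyperparameters below are
# illustrative placeholders, not values taken from this repository.
import torch

from modelling_llada import LLaDAModel

model = LLaDAModel(
    mlp_ratio=4,               # placeholder
    d_model=1024,              # placeholder (small, for a quick test)
    n_heads=16,                # placeholder
    rope_theta=500000.0,       # placeholder
    max_sequence_length=4096,  # placeholder
    vocab_size=126464,         # placeholder; large enough to cover the special-token ids
    n_layers=2,                # placeholder (small, for a quick test)
    mlp_hidden_size=None,      # falls back to mlp_ratio * d_model
    device=torch.device("cpu"),
)

input_ids = torch.randint(0, 1000, (1, 16))  # (batch_size, seq_len)
with torch.no_grad():
    logits = model(input_ids)                # (1, 16, vocab_size)
print(logits.shape)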
special_tokens.json ADDED
@@ -0,0 +1,271 @@
+ {
+   "<|startoftext|>": 126080,
+   "<|endoftext|>": 126081,
+   "[CLS]": 126082,
+   "[gMASK]": 126083,
+   "<|reserved_token_0|>": 126084,
+   "<|reserved_token_1|>": 126085,
+   "<|reserved_token_2|>": 126086,
+   "<|reserved_token_3|>": 126087,
+   "<|reserved_token_4|>": 126088,
+   "<|reserved_token_5|>": 126089,
+   "<|reserved_token_6|>": 126090,
+   "<|reserved_token_7|>": 126091,
+   "<|reserved_token_8|>": 126092,
+   "<|reserved_token_9|>": 126093,
+   "<|reserved_token_10|>": 126094,
+   "<|reserved_token_11|>": 126095,
+   "<|reserved_token_12|>": 126096,
+   "<|reserved_token_13|>": 126097,
+   "<|reserved_token_14|>": 126098,
+   "<|reserved_token_15|>": 126099,
+   "<|reserved_token_16|>": 126100,
+   "<|reserved_token_17|>": 126101,
+   "<|reserved_token_18|>": 126102,
+   "<|reserved_token_19|>": 126103,
+   "<|reserved_token_20|>": 126104,
+   "<|reserved_token_21|>": 126105,
+   "<|reserved_token_22|>": 126106,
+   "<|reserved_token_23|>": 126107,
+   "<|reserved_token_24|>": 126108,
+   "<|reserved_token_25|>": 126109,
+   "<|reserved_token_26|>": 126110,
+   "<|reserved_token_27|>": 126111,
+   "<|reserved_token_28|>": 126112,
+   "<|reserved_token_29|>": 126113,
+   "<|reserved_token_30|>": 126114,
+   "<|reserved_token_31|>": 126115,
+   "<|reserved_token_32|>": 126116,
+   "<|reserved_token_33|>": 126117,
+   "<|reserved_token_34|>": 126118,
+   "<|reserved_token_35|>": 126119,
+   "<|reserved_token_36|>": 126120,
+   "<|reserved_token_37|>": 126121,
+   "<|reserved_token_38|>": 126122,
+   "<|reserved_token_39|>": 126123,
+   "<|reserved_token_40|>": 126124,
+   "<|reserved_token_41|>": 126125,
+   "<|reserved_token_42|>": 126126,
+   "<|reserved_token_43|>": 126127,
+   "<|reserved_token_44|>": 126128,
+   "<|reserved_token_45|>": 126129,
+   "<|reserved_token_46|>": 126130,
+   "<|reserved_token_47|>": 126131,
+   "<|reserved_token_48|>": 126132,
+   "<|reserved_token_49|>": 126133,
+   "<|reserved_token_50|>": 126134,
+   "<|reserved_token_51|>": 126135,
+   "<|reserved_token_52|>": 126136,
+   "<|reserved_token_53|>": 126137,
+   "<|reserved_token_54|>": 126138,
+   "<|reserved_token_55|>": 126139,
+   "<|reserved_token_56|>": 126140,
+   "<|reserved_token_57|>": 126141,
+   "<|reserved_token_58|>": 126142,
+   "<|reserved_token_59|>": 126143,
+   "<|reserved_token_60|>": 126144,
+   "<|reserved_token_61|>": 126145,
+   "<|reserved_token_62|>": 126146,
+   "<|reserved_token_63|>": 126147,
+   "<|reserved_token_64|>": 126148,
+   "<|reserved_token_65|>": 126149,
+   "<|reserved_token_66|>": 126150,
+   "<|reserved_token_67|>": 126151,
+   "<|reserved_token_68|>": 126152,
+   "<|reserved_token_69|>": 126153,
+   "<|reserved_token_70|>": 126154,
+   "<|reserved_token_71|>": 126155,
+   "<|reserved_token_72|>": 126156,
+   "<|reserved_token_73|>": 126157,
+   "<|reserved_token_74|>": 126158,
+   "<|reserved_token_75|>": 126159,
+   "<|reserved_token_76|>": 126160,
+   "<|reserved_token_77|>": 126161,
+   "<|reserved_token_78|>": 126162,
+   "<|reserved_token_79|>": 126163,
+   "<|reserved_token_80|>": 126164,
+   "<|reserved_token_81|>": 126165,
+   "<|reserved_token_82|>": 126166,
+   "<|reserved_token_83|>": 126167,
+   "<|reserved_token_84|>": 126168,
+   "<|reserved_token_85|>": 126169,
+   "<|reserved_token_86|>": 126170,
+   "<|reserved_token_87|>": 126171,
+   "<|reserved_token_88|>": 126172,
+   "<|reserved_token_89|>": 126173,
+   "<|reserved_token_90|>": 126174,
+   "<|reserved_token_91|>": 126175,
+   "<|reserved_token_92|>": 126176,
+   "<|reserved_token_93|>": 126177,
+   "<|reserved_token_94|>": 126178,
+   "<|reserved_token_95|>": 126179,
+   "<|reserved_token_96|>": 126180,
+   "<|reserved_token_97|>": 126181,
+   "<|reserved_token_98|>": 126182,
+   "<|reserved_token_99|>": 126183,
+   "<|reserved_token_100|>": 126184,
+   "<|reserved_token_101|>": 126185,
+   "<|reserved_token_102|>": 126186,
+   "<|reserved_token_103|>": 126187,
+   "<|reserved_token_104|>": 126188,
+   "<|reserved_token_105|>": 126189,
+   "<|reserved_token_106|>": 126190,
+   "<|reserved_token_107|>": 126191,
+   "<|reserved_token_108|>": 126192,
+   "<|reserved_token_109|>": 126193,
+   "<|reserved_token_110|>": 126194,
+   "<|reserved_token_111|>": 126195,
+   "<|reserved_token_112|>": 126196,
+   "<|reserved_token_113|>": 126197,
+   "<|reserved_token_114|>": 126198,
+   "<|reserved_token_115|>": 126199,
+   "<|reserved_token_116|>": 126200,
+   "<|reserved_token_117|>": 126201,
+   "<|reserved_token_118|>": 126202,
+   "<|reserved_token_119|>": 126203,
+   "<|reserved_token_120|>": 126204,
+   "<|reserved_token_121|>": 126205,
+   "<|reserved_token_122|>": 126206,
+   "<|reserved_token_123|>": 126207,
+   "<|reserved_token_124|>": 126208,
+   "<|reserved_token_125|>": 126209,
+   "<|reserved_token_126|>": 126210,
+   "<|reserved_token_127|>": 126211,
+   "<|reserved_token_128|>": 126212,
+   "<|reserved_token_129|>": 126213,
+   "<|reserved_token_130|>": 126214,
+   "<|reserved_token_131|>": 126215,
+   "<|reserved_token_132|>": 126216,
+   "<|reserved_token_133|>": 126217,
+   "<|reserved_token_134|>": 126218,
+   "<|reserved_token_135|>": 126219,
+   "<|reserved_token_136|>": 126220,
+   "<|reserved_token_137|>": 126221,
+   "<|reserved_token_138|>": 126222,
+   "<|reserved_token_139|>": 126223,
+   "<|reserved_token_140|>": 126224,
+   "<|reserved_token_141|>": 126225,
+   "<|reserved_token_142|>": 126226,
+   "<|reserved_token_143|>": 126227,
+   "<|reserved_token_144|>": 126228,
+   "<|reserved_token_145|>": 126229,
+   "<|reserved_token_146|>": 126230,
+   "<|reserved_token_147|>": 126231,
+   "<|reserved_token_148|>": 126232,
+   "<|reserved_token_149|>": 126233,
+   "<|reserved_token_150|>": 126234,
+   "<|reserved_token_151|>": 126235,
+   "<|reserved_token_152|>": 126236,
+   "<|reserved_token_153|>": 126237,
+   "<|reserved_token_154|>": 126238,
+   "<|reserved_token_155|>": 126239,
+   "<|reserved_token_156|>": 126240,
+   "<|reserved_token_157|>": 126241,
+   "<|reserved_token_158|>": 126242,
+   "<|reserved_token_159|>": 126243,
+   "<|reserved_token_160|>": 126244,
+   "<|reserved_token_161|>": 126245,
+   "<|reserved_token_162|>": 126246,
+   "<|reserved_token_163|>": 126247,
+   "<|reserved_token_164|>": 126248,
+   "<|reserved_token_165|>": 126249,
+   "<|reserved_token_166|>": 126250,
+   "<|reserved_token_167|>": 126251,
+   "<|reserved_token_168|>": 126252,
+   "<|reserved_token_169|>": 126253,
+   "<|reserved_token_170|>": 126254,
+   "<|reserved_token_171|>": 126255,
+   "<|reserved_token_172|>": 126256,
+   "<|reserved_token_173|>": 126257,
+   "<|reserved_token_174|>": 126258,
+   "<|reserved_token_175|>": 126259,
+   "<|reserved_token_176|>": 126260,
+   "<|reserved_token_177|>": 126261,
+   "<|reserved_token_178|>": 126262,
+   "<|reserved_token_179|>": 126263,
+   "<|reserved_token_180|>": 126264,
+   "<|reserved_token_181|>": 126265,
+   "<|reserved_token_182|>": 126266,
+   "<|reserved_token_183|>": 126267,
+   "<|reserved_token_184|>": 126268,
+   "<|reserved_token_185|>": 126269,
+   "<|reserved_token_186|>": 126270,
+   "<|reserved_token_187|>": 126271,
+   "<|reserved_token_188|>": 126272,
+   "<|reserved_token_189|>": 126273,
+   "<|reserved_token_190|>": 126274,
+   "<|reserved_token_191|>": 126275,
+   "<|reserved_token_192|>": 126276,
+   "<|reserved_token_193|>": 126277,
+   "<|reserved_token_194|>": 126278,
+   "<|reserved_token_195|>": 126279,
+   "<|reserved_token_196|>": 126280,
+   "<|reserved_token_197|>": 126281,
+   "<|reserved_token_198|>": 126282,
+   "<|reserved_token_199|>": 126283,
+   "<|reserved_token_200|>": 126284,
+   "<|reserved_token_201|>": 126285,
+   "<|reserved_token_202|>": 126286,
+   "<|reserved_token_203|>": 126287,
+   "<|reserved_token_204|>": 126288,
+   "<|reserved_token_205|>": 126289,
+   "<|reserved_token_206|>": 126290,
+   "<|reserved_token_207|>": 126291,
+   "<|reserved_token_208|>": 126292,
+   "<|reserved_token_209|>": 126293,
+   "<|reserved_token_210|>": 126294,
+   "<|reserved_token_211|>": 126295,
+   "<|reserved_token_212|>": 126296,
+   "<|reserved_token_213|>": 126297,
+   "<|reserved_token_214|>": 126298,
+   "<|reserved_token_215|>": 126299,
+   "<|reserved_token_216|>": 126300,
+   "<|reserved_token_217|>": 126301,
+   "<|reserved_token_218|>": 126302,
+   "<|reserved_token_219|>": 126303,
+   "<|reserved_token_220|>": 126304,
+   "<|reserved_token_221|>": 126305,
+   "<|reserved_token_222|>": 126306,
+   "<|reserved_token_223|>": 126307,
+   "<|reserved_token_224|>": 126308,
+   "<|reserved_token_225|>": 126309,
+   "<|reserved_token_226|>": 126310,
+   "<|reserved_token_227|>": 126311,
+   "<|reserved_token_228|>": 126312,
+   "<|reserved_token_229|>": 126313,
+   "<|reserved_token_230|>": 126314,
+   "<|reserved_token_231|>": 126315,
+   "<|reserved_token_232|>": 126316,
+   "<|reserved_token_233|>": 126317,
+   "<|reserved_token_234|>": 126318,
+   "<|reserved_token_235|>": 126319,
+   "<|reserved_token_236|>": 126320,
+   "<|reserved_token_237|>": 126321,
+   "<|reserved_token_238|>": 126322,
+   "<|reserved_token_239|>": 126323,
+   "<|reserved_token_240|>": 126324,
+   "<|reserved_token_241|>": 126325,
+   "<|reserved_token_242|>": 126326,
+   "<|reserved_token_243|>": 126327,
+   "<|reserved_token_244|>": 126328,
+   "<|reserved_token_245|>": 126329,
+   "<|reserved_token_246|>": 126330,
+   "<|reserved_token_247|>": 126331,
+   "<|reserved_token_248|>": 126332,
+   "<|reserved_token_249|>": 126333,
+   "<|reserved_token_250|>": 126334,
+   "<|reserved_token_251|>": 126335,
+   "<|mdm_mask|>": 126336,
+   "<|reserved_token_253|>": 126337,
+   "<|reserved_token_254|>": 126338,
+   "<|reserved_token_255|>": 126339,
+   "<role>": 126340,
+   "</role>": 126341,
+   "<|arithmetic_start|>": 126342,
+   "<|arithmetic_end|>": 126343,
+   "<|number_start|>": 126344,
+   "<|number_end|>": 126345,
+   "<|start_header_id|>": 126346,
+   "<|end_header_id|>": 126347,
+   "<|eot_id|>": 126348
+ }
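
special_tokens.json maps each added token string to its id; the entry <|mdm_mask|> at id 126336 is presumably the mask token used for masked-diffusion decoding in LLaDA-style models. A minimal sketch for reading the table added in this commit:

# Minimal sketch: look up special-token ids from the file added above.
import json

with open("special_tokens.json") as f:
    special_tokens = json.load(f)

mask_id = special_tokens["<|mdm_mask|>"]  # 126336
eot_id = special_tokens["<|eot_id|>"]     # 126348
print(mask_id, eot_id)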
vocab.json ADDED
The diff for this file is too large to render. See raw diff