k050506koch committed
Commit 9d9edb6 · verified · 1 Parent(s): 096ed6f

upload weights

added_tokens.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "<|call|>": 50262,
+   "<|channel|>": 50260,
+   "<|constrain|>": 50263,
+   "<|end|>": 50258,
+   "<|message|>": 50259,
+   "<|return|>": 50261,
+   "<|start|>": 50257
+ }
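
For reference, a minimal check (not part of this commit) that the added token IDs line up once the tokenizer is loaded; the local path is hypothetical:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("./gpt4dev")  # hypothetical local checkout of this repo
    # IDs should match added_tokens.json above
    assert tok.convert_tokens_to_ids("<|start|>") == 50257
    assert tok.convert_tokens_to_ids("<|end|>") == 50258
    assert tok.convert_tokens_to_ids("<|call|>") == 50262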
chat_template.jinja ADDED
@@ -0,0 +1 @@
+ {% for m in messages %}{% if m['role'] == 'assistant' %}<|start|>assistant<|channel|>final<|message|>{{ m['content'] }}<|end|>{% elif m['role'] == 'developer' %}<|start|>developer<|message|>{{ m['content'] }}<|end|>{% else %}<|start|>{{ m['role'] }}<|message|>{{ m['content'] }}<|end|>{% endif %}{% endfor %}{% if add_generation_prompt %}<|start|>assistant<|channel|>final<|message|>{% endif %}
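
A minimal rendering sketch (not part of the commit), assuming the tokenizer is loaded from a hypothetical local checkout; it shows the string the template above produces for a developer plus user turn:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("./gpt4dev")  # hypothetical local path
    messages = [
        {"role": "developer", "content": "Be concise."},
        {"role": "user", "content": "Hi"},
    ]
    text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Per the template above, `text` is:
    # <|start|>developer<|message|>Be concise.<|end|><|start|>user<|message|>Hi<|end|><|start|>assistant<|channel|>final<|message|>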
config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "architectures": [
+     "GPT4DevForCausalLM"
+   ],
+   "auto_map": {
+     "AutoConfig": "modeling_gpt4dev.GPT4DevConfig",
+     "AutoModel": "modeling_gpt4dev.GPT4DevForCausalLM",
+     "AutoModelForCausalLM": "modeling_gpt4dev.GPT4DevForCausalLM"
+   },
+   "compat_prefill_tokens": 0,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_epsilon": 1e-05,
+   "max_position_embeddings": 1024,
+   "model_type": "gpt4dev",
+   "multi_query": true,
+   "num_attention_heads": 16,
+   "num_hidden_layers": 12,
+   "qkv_bias": true,
+   "rope_theta": 10000.0,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.52.4",
+   "vocab_size": 50264
+ }
generation_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "_from_model_config": true,
+   "transformers_version": "4.52.4"
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:afcba36e53071eaef2acdf78b948f73b1ba4cacf5989dce6f475a13b4cd9cf6f
+ size 709224088
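
As a sanity check (not part of the commit), the LFS size above is consistent with a float32 checkpoint of the configuration in config.json; the arithmetic below only assumes the layer shapes defined in modeling_gpt4dev.py further down:

    # Parameter count implied by config.json (vocab 50264, hidden 768, 12 layers,
    # 16 heads -> head_dim 48, multi-query so a single 48-dim K/V head, SwiGLU MLP)
    V, H, L, I = 50264, 768, 12, 3072
    d = H // 16                                # per-head dimension
    qkv = H * (H + 2 * d) + (H + 2 * d)        # fused Q + single K/V projection, with bias
    out = H * H                                # attention output projection, no bias
    mlp = (H * 2 * I + 2 * I) + I * H          # SwiGLU w1 (gate+value, bias) and w2 (no bias)
    norms = 4 * H                              # two LayerNorms per block
    block = qkv + out + mlp + norms
    total = 2 * V * H + L * block + 2 * H      # embeddings + untied lm head + blocks + final norm
    print(total, total * 4)                    # ~177.3M params, ~709.2 MB in float32,
                                               # within ~11 KB of the 709,224,088-byte file (safetensors header)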
modeling_gpt4dev.py ADDED
@@ -0,0 +1,314 @@
+ import math, torch, torch.nn as nn, torch.nn.functional as F
+ from transformers import PretrainedConfig, PreTrainedModel, GenerationMixin
+ from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
+ from typing import Optional, Tuple, List
+
+
+ class GPT4DevConfig(PretrainedConfig):
+     model_type = "gpt4dev"
+     def __init__(
+         self,
+         vocab_size=50257,
+         hidden_size=768,
+         num_hidden_layers=12,
+         num_attention_heads=12,
+         intermediate_size=3072,
+         max_position_embeddings=1024,
+         rope_theta=10000.0,
+         qkv_bias=True,
+         layer_norm_epsilon=1e-5,
+         initializer_range=0.02,
+         multi_query=True,
+         architectures=None,
+         tie_word_embeddings=False,
+         compat_prefill_tokens: int = 0,
+         **kwargs,
+     ):
+         super().__init__(
+             vocab_size=vocab_size,
+             hidden_size=hidden_size,
+             num_hidden_layers=num_hidden_layers,
+             num_attention_heads=num_attention_heads,
+             intermediate_size=intermediate_size,
+             max_position_embeddings=max_position_embeddings,
+             rope_theta=rope_theta,
+             qkv_bias=qkv_bias,
+             layer_norm_epsilon=layer_norm_epsilon,
+             initializer_range=initializer_range,
+             multi_query=multi_query,
+             architectures=architectures,
+             tie_word_embeddings=tie_word_embeddings,
+             compat_prefill_tokens=compat_prefill_tokens,
+             **kwargs,
+         )
+
+
+ def rope_cache(seq_len, dim, theta, device, dtype=torch.float32):
+     # Note: kept float32 to match training-time math used in early checkpoints
+     inv = 1.0 / (theta ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim))
+     t = torch.arange(seq_len, device=device, dtype=torch.float32)
+     freqs = torch.outer(t, inv)
+     return torch.polar(torch.ones_like(freqs), freqs).to(dtype)
+
+
+ def apply_rope(x, rope):
+     # x: (..., D) with D even; rope: (T, D/2). In legacy math this can be float (cos-only)
+     xc = torch.view_as_complex(x.to(torch.float32).reshape(*x.shape[:-1], -1, 2))
+     yc = xc * rope.to(xc.dtype)
+     y = torch.view_as_real(yc).reshape(*x.shape[:-1], -1)
+     return y.to(x.dtype)
+
+
+ class MQA(nn.Module):
+     def __init__(self, config: GPT4DevConfig):
+         super().__init__()
+         h, d = config.num_attention_heads, config.hidden_size // config.num_attention_heads
+         self.h, self.d = h, d
+         self.qkv = nn.Linear(config.hidden_size, h * d + 2 * d, bias=config.qkv_bias)
+         self.out = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
+
+     def forward(
+         self,
+         x: torch.Tensor,
+         rope: torch.Tensor,
+         past_kv: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+     ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+         B, T, _ = x.shape
+         qkv = self.qkv(x)
+         q, kv = qkv.split(self.h * self.d, dim=-1)
+         k_new, v_new = kv.split(self.d, dim=-1)  # (B, T, d)
+
+         # queries to head dim; apply RoPE
+         q = q.view(B, T, self.h, self.d).transpose(1, 2)  # (B, h, T, d)
+         q = apply_rope(q, rope)
+
+         # rotate new k
+         k_new = apply_rope(k_new.unsqueeze(1), rope).squeeze(1)  # (B, T, d)
+
+         # concat cache
+         if past_kv is not None and past_kv[0] is not None:
+             k_cat = torch.cat([past_kv[0], k_new], dim=1)
+             v_cat = torch.cat([past_kv[1], v_new], dim=1)
+         else:
+             k_cat, v_cat = k_new, v_new
+
+         # expand KV
+         k_exp = k_cat.unsqueeze(1).expand(-1, self.h, -1, -1)  # (B, h, S, d)
+         v_exp = v_cat.unsqueeze(1).expand(-1, self.h, -1, -1)  # (B, h, S, d)
+
+         B, h, T, d = q.shape
+         S = k_exp.size(2)
+         past_len = S - T
+         attn = torch.matmul(q, k_exp.transpose(-2, -1)) / math.sqrt(d)
+
+         # Offset-aware causal mask
+         idx_t = torch.arange(T, device=q.device)[:, None]
+         idx_s = torch.arange(S, device=q.device)[None, :]
+         mask = idx_s > idx_t + past_len
+         attn = attn.masked_fill(mask.unsqueeze(0).unsqueeze(0), float('-inf'))
+
+         attn = F.softmax(attn, dim=-1)
+         y = torch.matmul(attn, v_exp)
+         y = y.transpose(1, 2).reshape(B, T, -1)
+         return self.out(y), (k_cat, v_cat)
+
+     def forward_compat(self, x: torch.Tensor, rope: torch.Tensor) -> torch.Tensor:
+         B, T, _ = x.shape
+         qkv = self.qkv(x)
+         q, kv = qkv.split(self.h * self.d, dim=-1)
+         k, v = kv.split(self.d, dim=-1)
+         q = q.view(B, T, self.h, self.d).transpose(1, 2)  # (B,h,T,d)
+         k = k.unsqueeze(1).expand(-1, self.h, -1, -1)  # (B,h,T,d)
+         v = v.unsqueeze(1).expand(-1, self.h, -1, -1)  # (B,h,T,d)
+         q = apply_rope(q, rope)
+         k = apply_rope(k, rope)
+         y = F.scaled_dot_product_attention(q, k, v, is_causal=True)
+         return self.out(y.transpose(1, 2).reshape(B, T, -1))
+
+
+ class SwiGLU(nn.Module):
+     def __init__(self, hidden_dim, intermediate_dim):
+         super().__init__()
+         self.w1 = nn.Linear(hidden_dim, intermediate_dim * 2, bias=True)
+         self.w2 = nn.Linear(intermediate_dim, hidden_dim, bias=False)
+     def forward(self, x):
+         x_g, x_v = self.w1(x).chunk(2, dim=-1)
+         return self.w2(F.silu(x_g) * x_v)
+
+
+ class Block(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.ln1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+         self.attn = MQA(config) if config.multi_query else nn.MultiheadAttention(
+             config.hidden_size, config.num_attention_heads, bias=config.qkv_bias, batch_first=True)
+         self.ln2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+         self.mlp = SwiGLU(config.hidden_size, config.intermediate_size)
+         self.gradient_checkpointing = False
+
+     def forward(
+         self,
+         x: torch.Tensor,
+         rope: torch.Tensor,
+         past_kv: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+         use_checkpoint: bool = False,
+     ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+         def custom_forward(x_, rope_):
+             a, new_kv = self.attn(self.ln1(x_), rope_, past_kv)
+             x_ = x_ + a
+             x_ = x_ + self.mlp(self.ln2(x_))
+             return x_, new_kv
+         if use_checkpoint and self.training:
+             y, new_kv = torch.utils.checkpoint.checkpoint(custom_forward, x, rope, use_reentrant=False)
+             return y, new_kv
+         else:
+             return custom_forward(x, rope)
+
+     def forward_compat(self, x: torch.Tensor, rope: torch.Tensor, use_checkpoint: bool = False) -> torch.Tensor:
+         def custom_forward(x_, rope_):
+             a = self.attn.forward_compat(self.ln1(x_), rope_)
+             x_ = x_ + a
+             x_ = x_ + self.mlp(self.ln2(x_))
+             return x_
+         if use_checkpoint and self.training:
+             return torch.utils.checkpoint.checkpoint(custom_forward, x, rope, use_reentrant=False)
+         else:
+             return custom_forward(x, rope)
+
+
+ class GPT4DevPreTrained(PreTrainedModel):
+     config_class = GPT4DevConfig
+     base_model_prefix = "transformer"
+     supports_gradient_checkpointing = True
+     _no_split_modules = ["Block"]
+
+     def _init_weights(self, module):
+         if isinstance(module, (nn.Linear, nn.Embedding)):
+             nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
+         if isinstance(module, nn.Linear) and module.bias is not None:
+             nn.init.zeros_(module.bias)
+
+
+ class GPT4DevForCausalLM(GPT4DevPreTrained, GenerationMixin):
+     def __init__(self, config):
+         super().__init__(config)
+         self.embed = nn.Embedding(config.vocab_size, config.hidden_size)
+         self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_hidden_layers)])
+         self.ln_f = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+         self.head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+         self.rope_cache = None
+         self.post_init()
+
+     # embeddings tie helpers
+     def get_input_embeddings(self):
+         return self.embed
+     def set_input_embeddings(self, new_embeddings):
+         self.embed = new_embeddings
+         if getattr(self.config, "tie_word_embeddings", True) and self.get_output_embeddings() is not None:
+             with torch.no_grad():
+                 self.get_output_embeddings().weight = self.embed.weight
+     def get_output_embeddings(self):
+         return self.head
+     def set_output_embeddings(self, new_lm_head):
+         self.head = new_lm_head
+     def tie_weights(self):
+         if getattr(self.config, "tie_word_embeddings", True):
+             self.head.weight = self.embed.weight
+
+     # generation helpers (legacy tuple KV-cache)
+     def prepare_inputs_for_generation(self, input_ids, attention_mask=None, past_key_values=None, **kwargs):
+         # Until compat_prefill_tokens, avoid slicing and ignore cache to mirror legacy behavior
+         cutoff = int(getattr(self.config, "compat_prefill_tokens", 0) or 0)
+         if past_key_values is not None and input_ids is not None and input_ids.size(1) < cutoff:
+             past_key_values = None  # drop cache, process full prefix
+         elif past_key_values is not None:
+             # normal cached decode path
+             input_ids = input_ids[:, -1:]
+         if attention_mask is not None and attention_mask.dim() == 2 and torch.all(attention_mask == 1):
+             attention_mask = None
+         return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values, "use_cache": True}
+
+     def _reorder_cache(self, past_key_values, beam_idx):
+         if isinstance(past_key_values, (tuple, list)):
+             reordered = []
+             for k, v in past_key_values:
+                 if k is None or v is None:
+                     reordered.append((k, v))
+                 else:
+                     reordered.append((k.index_select(0, beam_idx), v.index_select(0, beam_idx)))
+             return tuple(reordered)
+         return past_key_values
+
+     # RoPE utilities (kept float32 behavior to mirror training)
+     def _rope_slice(self, past_len: int, T: int, device, dtype):
+         if self.rope_cache is None or self.rope_cache.device != device:
+             self.rope_cache = rope_cache(
+                 self.config.max_position_embeddings,
+                 self.config.hidden_size // self.config.num_attention_heads,
+                 self.config.rope_theta, device, dtype=torch.float32
+             )
+         need = past_len + T
+         if need > self.rope_cache.size(0):
+             self.rope_cache = rope_cache(
+                 self.config.max_position_embeddings,
+                 self.config.hidden_size // self.config.num_attention_heads,
+                 self.config.rope_theta, device, dtype=torch.float32
+             )
+         return self.rope_cache[past_len: past_len + T]
+
+     def _set_gradient_checkpointing(self, module, value=False):
+         if isinstance(module, Block):
+             module.gradient_checkpointing = value
+
+     def forward(
+         self,
+         input_ids,
+         labels=None,
+         attention_mask=None,
+         past_key_values=None,
+         use_cache=None,
+         **kwargs,
+     ):
+         B, T = input_ids.shape
+         x = self.embed(input_ids)
+
+         past = past_key_values
+         use_cache = True if (use_cache is None) else use_cache
+         new_past: List[Tuple[torch.Tensor, torch.Tensor]] = [] if use_cache else None
+
+         past_len = 0
+         if past is not None and isinstance(past, (tuple, list)) and past and past[0] is not None:
+             past_len = past[0][0].size(1)
+
+         rope = self._rope_slice(past_len, T, x.device, x.dtype)
+         for i, blk in enumerate(self.blocks):
+             pkv = None if past is None else (past[i] if i < len(past) else None)
+             x, new_kv = blk(x, rope, past_kv=pkv, use_checkpoint=(self.is_gradient_checkpointing and self.training))
+             if use_cache and new_past is not None:
+                 new_past.append(new_kv)
+
+         logits = self.head(self.ln_f(x))
+
+         loss = None
+         if labels is not None:
+             shift_logits = logits[..., :-1, :].contiguous()
+             shift_labels = labels[..., 1:].contiguous()
+             loss = F.cross_entropy(
+                 shift_logits.view(-1, shift_logits.size(-1)),
+                 shift_labels.view(-1),
+                 ignore_index=-100,
+             )
+
+         return CausalLMOutputWithCrossAttentions(
+             loss=loss,
+             logits=logits,
+             past_key_values=tuple(new_past) if use_cache else None,
+         )
+
+
+ GPT4DevConfig.auto_map = {
+     "AutoConfig": "modeling_gpt4dev.GPT4DevConfig",
+     "AutoModel": "modeling_gpt4dev.GPT4DevForCausalLM",
+     "AutoModelForCausalLM": "modeling_gpt4dev.GPT4DevForCausalLM",
+ }
+
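
A minimal usage sketch (not part of the commit): because the architecture is registered through the auto_map above, loading it needs trust_remote_code; the local path and generation settings are placeholders:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    path = "./gpt4dev"  # hypothetical local checkout of this repo
    tok = AutoTokenizer.from_pretrained(path)
    model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True, torch_dtype=torch.float32)
    model.eval()

    # Build a prompt with the chat template, then decode only the new tokens
    input_ids = tok.apply_chat_template(
        [{"role": "user", "content": "Hello"}],
        add_generation_prompt=True,
        return_tensors="pt",
    )
    with torch.no_grad():
        out = model.generate(input_ids, max_new_tokens=32, eos_token_id=tok.eos_token_id)
    print(tok.decode(out[0][input_ids.shape[-1]:], skip_special_tokens=True))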
special_tokens_map.json ADDED
@@ -0,0 +1,75 @@
+ {
+   "additional_special_tokens": [
+     {
+       "content": "<|start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<|end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<|message|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<|channel|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<|return|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<|call|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<|constrain|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     }
+   ],
+   "bos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "<|end|>",
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,86 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "50256": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50257": {
+       "content": "<|start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50258": {
+       "content": "<|end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50259": {
+       "content": "<|message|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50260": {
+       "content": "<|channel|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50261": {
+       "content": "<|return|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50262": {
+       "content": "<|call|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50263": {
+       "content": "<|constrain|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|start|>",
+     "<|end|>",
+     "<|message|>",
+     "<|channel|>",
+     "<|return|>",
+     "<|call|>",
+     "<|constrain|>"
+   ],
+   "bos_token": "<|endoftext|>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|end|>",
+   "extra_special_tokens": {},
+   "model_max_length": 8192,
+   "pad_token": "<|end|>",
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": "<|endoftext|>"
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff