if001 committed on
Commit
64fc9e2
·
verified ·
1 Parent(s): 8ee4730

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ResidualNetForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_residualnet.ResidualNetConfig",
7
+ "AutoModel": "modeling_residualnet.ResidualNetModel",
8
+ "AutoModelForCausalLM": "modeling_residualnet.ResidualNetForCausalLM"
9
+ },
10
+ "attention_dropout": 0.0,
11
+ "bos_token_id": null,
12
+ "embd_pdrop": 0.0,
13
+ "eos_token_id": 151645,
14
+ "hidden_act": "silu",
15
+ "hidden_size": 128,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 64,
18
+ "max_position_embeddings": 1024,
19
+ "model_type": "ResidualNetConfig",
20
+ "name": "residual-tiny",
21
+ "num_attention_heads": 4,
22
+ "num_hidden_layers": 4,
23
+ "num_key_value_heads": 4,
24
+ "original_max_position_embeddings": 1024,
25
+ "pad_token_id": 151645,
26
+ "resid_pdrop": 0.0,
27
+ "rms_norm_eps": 1e-05,
28
+ "rope_scaling": null,
29
+ "rope_theta": 10000.0,
30
+ "sliding_window": null,
31
+ "tie_word_embeddings": false,
32
+ "torch_dtype": "float32",
33
+ "transformers_version": "4.48.2",
34
+ "use_cache": true,
35
+ "vocab_size": 151669
36
+ }
configuration_residualnet.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+
2
+ from transformers.models.phi3.configuration_phi3 import Phi3Config
3
class ResidualNetConfig(Phi3Config):
    """Configuration class for the ResidualNet models.

    Inherits every field from ``Phi3Config`` without adding any of its own;
    the only difference is the ``model_type`` identifier.
    """

    # Identifier stored in config.json under "model_type".
    model_type = "ResidualNetConfig"

    def __init__(self, **kwargs):
        """Forward all keyword arguments unchanged to ``Phi3Config``."""
        super().__init__(**kwargs)
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6dca59cfa6bb47533b74fe5b471785c6dd689e339e48a9259955bec389e52fc
3
+ size 79368760
modeling_residualnet.py ADDED
@@ -0,0 +1,395 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 「差分→Attn→Dense を前半層で繰り返し、後半層は“Denseでのアップスケール(=長さ+1)”→Attn→Dense を繰り返して最終的に元の seq_len に戻す」アーキテクチャ
3
+ """
4
+
5
+ from typing import Optional, Tuple, List
6
+ import torch
7
+ from torch import nn
8
+
9
+ from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
10
+ from transformers.modeling_outputs import CausalLMOutputWithPast
11
+ from transformers.generation.utils import GenerationMixin
12
+
13
+ # from transformers.models.phi3.configuration_phi3 import Phi3Config
14
+ # from transformers.models.phi3.modeling_phi3 import (
15
+ # Phi3PreTrainedModel,
16
+ # Phi3RotaryEmbedding,
17
+ # Phi3RMSNorm,
18
+ # Phi3Attention,
19
+ # # Phi3SdpaAttention, # 既定の SDPA 注意
20
+ # Phi3MLP,
21
+ # )
22
+ from models.phi3_config import Phi3Config
23
+ from models.phi3 import (
24
+ Phi3PreTrainedModel,
25
+ Phi3RMSNorm,
26
+ Phi3MLP,
27
+ # Phi3SdpaAttention,
28
+ Phi3Attention,
29
+ Phi3RotaryEmbedding,
30
+ )
31
+
32
class ResidualNetConfig(Phi3Config):
    """Configuration for ResidualNet: a ``Phi3Config`` with its own ``model_type``.

    No additional fields are declared here; all keyword arguments are handled
    by the parent class.
    """

    # Identifier stored in config.json under "model_type".
    model_type = "ResidualNetConfig"

    def __init__(self, **kwargs):
        """Pass every keyword argument straight through to ``Phi3Config``."""
        super().__init__(**kwargs)
36
+
37
+ # ---------- 長さ変換用の前処理 ----------
38
+
39
class DiffPreprocessor(nn.Module):
    """First-order difference along the sequence axis: (B, L, H) -> (B, L-1, H).

    The optional 2D mask is shortened to match: an output position is marked
    valid only when both neighbouring input positions were valid (logical AND).
    """

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask_2d: Optional[torch.Tensor],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Return ``(diff, mask)``; ``mask`` is ``None`` when no mask was given."""
        # delta[:, t] = hidden_states[:, t + 1] - hidden_states[:, t]
        delta = hidden_states[:, 1:, :] - hidden_states[:, :-1, :]

        if attention_mask_2d is None:
            return delta, None

        later_valid = attention_mask_2d[:, 1:].bool()
        earlier_valid = attention_mask_2d[:, :-1].bool()
        # Cast back so the reduced mask keeps the dtype of the mask passed in.
        reduced_mask = torch.logical_and(later_valid, earlier_valid).to(attention_mask_2d.dtype)
        return delta, reduced_mask
56
+
57
+
58
class IntegratePreprocessor(nn.Module):
    """Learnable "integration" that lengthens a sequence by one: (B, m, H) -> (B, m+1, H).

    1. A seed vector ``y0`` is computed as ``MLP(mean_pool(z))``; the mean only
       counts positions whose mask entry is non-zero when a mask is supplied.
    2. The output is ``cumsum([y0, z], dim=1)``.
    """

    def __init__(self, hidden_size: int):
        super().__init__()
        # Linear -> SiLU -> Linear, kept in an nn.Sequential named `seed_mlp`.
        seed_layers = [
            nn.Linear(hidden_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        ]
        self.seed_mlp = nn.Sequential(*seed_layers)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask_2d: Optional[torch.Tensor],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Return ``(integrated, mask)``; ``mask`` is ``None`` when no mask was given."""
        has_mask = attention_mask_2d is not None

        if has_mask:
            # Masked mean over the sequence axis; the clamp avoids dividing by
            # zero for rows whose mask is entirely zero.
            valid_counts = attention_mask_2d.sum(dim=1, keepdim=True)
            masked_total = (hidden_states * attention_mask_2d.unsqueeze(-1)).sum(dim=1)
            pooled = masked_total / valid_counts.clamp_min(1)  # (B, H)
        else:
            pooled = hidden_states.mean(dim=1)

        seed = self.seed_mlp(pooled).unsqueeze(1)  # (B, 1, H)
        integrated = torch.cat([seed, hidden_states], dim=1).cumsum(dim=1)  # (B, m+1, H)

        if not has_mask:
            return integrated, None

        # The prepended seed position is valid iff the row has any valid token.
        seed_valid = (attention_mask_2d.sum(dim=1) > 0).to(attention_mask_2d.dtype)  # (B,)
        extended_mask = torch.cat([seed_valid.unsqueeze(1), attention_mask_2d], dim=1)
        return integrated, extended_mask
95
+
96
+
97
+ # ---------- レイヤーブロック(Phi3 部品で構成) ----------
98
+
99
class ResidualDiffLayer(nn.Module):
    """Down-sampling block: RMSNorm -> first-order difference (L -> L-1) -> attention -> MLP.

    - RoPE is applied inside the attention module, as in Phi-3.
    - Position ids are regenerated as ``0..len-1`` in every layer because the
      sequence length changes from layer to layer.
    """

    def __init__(self, config: ResidualNetConfig, layer_idx: int, rotary_emb: Phi3RotaryEmbedding):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.input_norm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.pre = DiffPreprocessor()
        self.attn = Phi3Attention(config, layer_idx=layer_idx)
        self.dropout_attn = nn.Dropout(config.resid_pdrop)
        self.post_norm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.mlp = Phi3MLP(config)
        self.dropout_mlp = nn.Dropout(config.resid_pdrop)
        self.rotary_emb = rotary_emb  # RoPE instance shared with the other layers

    def _to_4d_mask(
        self, mask2d: Optional[torch.Tensor], bsz: int, seqlen: int, hidden_states: torch.Tensor
    ) -> Optional[torch.Tensor]:
        """Build the 4D causal attention mask for a ``(bsz, seqlen)`` input.

        The mask is built even when ``mask2d`` is ``None``. Returning ``None``
        in that case (the previous behaviour) handed the attention module no
        mask at all, so whether attention was causal depended on whether the
        caller happened to pass a padding mask and on the attention backend.
        ``_prepare_4d_causal_attention_mask`` accepts ``None`` and then
        produces a purely causal mask.
        """
        return _prepare_4d_causal_attention_mask(
            mask2d,
            (bsz, seqlen),
            hidden_states,
            past_key_values_length=0,
            sliding_window=self.config.sliding_window,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,  # (B, L, H)
        attention_mask_2d: Optional[torch.Tensor],  # (B, L)
        position_ids: Optional[torch.LongTensor],  # (B, L); accepted but not used
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
        """Run the block and return ``(hidden_states, mask2d, attn_weights)``.

        The returned hidden states and 2D mask are one position shorter than
        the inputs. ``attn_weights`` is ``None`` unless ``output_attentions``
        is true.
        """
        x = self.input_norm(hidden_states)
        # L -> L-1
        x, mask2d = self.pre(x, attention_mask_2d)
        bsz, seqlen, _ = x.shape

        # Regenerate position ids (0..seqlen-1) for the shortened sequence.
        pos_ids = torch.arange(seqlen, device=x.device).unsqueeze(0).expand(bsz, -1)
        position_embeddings = self.rotary_emb(hidden_states, pos_ids)

        attn_mask_4d = self._to_4d_mask(mask2d, bsz, seqlen, x)

        attn_out, attn_weights = self.attn(
            hidden_states=x,
            attention_mask=attn_mask_4d,
            position_ids=pos_ids,
            position_embeddings=position_embeddings,
            past_key_value=None,
            output_attentions=output_attentions,
            use_cache=False,
        )
        # Residual connections are taken around the differenced sequence, not
        # around the block input (the two have different lengths).
        x = x + self.dropout_attn(attn_out)
        h = self.post_norm(x)
        h = self.mlp(h)
        x = x + self.dropout_mlp(h)
        return x, mask2d, attn_weights if output_attentions else None
161
+
162
+
163
class IntegrateUpscaleLayer(nn.Module):
    """Up-sampling block: RMSNorm -> learnable integration (L -> L+1) -> attention -> MLP.

    Position ids are regenerated as ``0..len-1`` in every layer because the
    sequence length changes from layer to layer.
    """

    def __init__(self, config: ResidualNetConfig, layer_idx: int, rotary_emb: Phi3RotaryEmbedding):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.input_norm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.pre = IntegratePreprocessor(config.hidden_size)
        self.attn = Phi3Attention(config, layer_idx=layer_idx)
        self.dropout_attn = nn.Dropout(config.resid_pdrop)
        self.post_norm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.mlp = Phi3MLP(config)
        self.dropout_mlp = nn.Dropout(config.resid_pdrop)
        self.rotary_emb = rotary_emb  # RoPE instance shared with the other layers

    def _to_4d_mask(
        self, mask2d: Optional[torch.Tensor], bsz: int, seqlen: int, hidden_states: torch.Tensor
    ) -> Optional[torch.Tensor]:
        """Build the 4D causal attention mask for a ``(bsz, seqlen)`` input.

        The mask is built even when ``mask2d`` is ``None``. Returning ``None``
        in that case (the previous behaviour) handed the attention module no
        mask at all, so whether attention was causal depended on whether the
        caller happened to pass a padding mask and on the attention backend.
        ``_prepare_4d_causal_attention_mask`` accepts ``None`` and then
        produces a purely causal mask.
        """
        return _prepare_4d_causal_attention_mask(
            mask2d,
            (bsz, seqlen),
            hidden_states,
            past_key_values_length=0,
            sliding_window=self.config.sliding_window,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,  # (B, L, H)
        attention_mask_2d: Optional[torch.Tensor],  # (B, L)
        position_ids: Optional[torch.LongTensor],  # (B, L); accepted but not used
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
        """Run the block and return ``(hidden_states, mask2d, attn_weights)``.

        The returned hidden states and 2D mask are one position longer than
        the inputs. ``attn_weights`` is ``None`` unless ``output_attentions``
        is true.
        """
        x = self.input_norm(hidden_states)
        # L -> L+1
        x, mask2d = self.pre(x, attention_mask_2d)
        bsz, seqlen, _ = x.shape

        # Regenerate position ids (0..seqlen-1) for the lengthened sequence.
        pos_ids = torch.arange(seqlen, device=x.device).unsqueeze(0).expand(bsz, -1)
        position_embeddings = self.rotary_emb(hidden_states, pos_ids)

        attn_mask_4d = self._to_4d_mask(mask2d, bsz, seqlen, x)

        attn_out, attn_weights = self.attn(
            hidden_states=x,
            attention_mask=attn_mask_4d,
            position_ids=pos_ids,
            position_embeddings=position_embeddings,
            past_key_value=None,
            output_attentions=output_attentions,
            use_cache=False,
        )
        # Residual connections are taken around the integrated sequence, not
        # around the block input (the two have different lengths).
        x = x + self.dropout_attn(attn_out)
        h = self.post_norm(x)
        h = self.mlp(h)
        x = x + self.dropout_mlp(h)
        return x, mask2d, attn_weights if output_attentions else None
223
+
224
+
225
+ # ---------- モデル本体(Phi3PreTrainedModel を継承) ----------
226
+
227
class ResidualNetModel(Phi3PreTrainedModel):
    """Backbone that first shortens and then restores the sequence length.

    First half:  ``ResidualDiffLayer`` x (N/2), each reducing the length by one.
    Second half: ``IntegrateUpscaleLayer`` x (N/2), each increasing it by one.
    """

    def __init__(self, config: ResidualNetConfig):
        super().__init__(config)
        # An explicit exception instead of `assert`, so the check is not
        # stripped when Python runs with -O. The message says that
        # num_hidden_layers must be even.
        if config.num_hidden_layers % 2 != 0:
            raise ValueError("num_hidden_layers は偶数にしてください。")

        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.norm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = Phi3RotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        half = config.num_hidden_layers // 2
        # First half (down): layer indices 0 .. half-1.
        self.down_layers = nn.ModuleList(
            [ResidualDiffLayer(config, layer_idx=i, rotary_emb=self.rotary_emb) for i in range(half)]
        )
        # Second half (up): layer indices half .. 2*half-1.
        self.up_layers = nn.ModuleList(
            [IntegrateUpscaleLayer(config, layer_idx=half + i, rotary_emb=self.rotary_emb) for i in range(half)]
        )

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,  # (B, L) in {0,1}
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        use_cache: Optional[bool] = None,  # not supported; ignored
    ):
        """Encode the input and return the final normalised hidden states.

        Args:
            input_ids: Token ids; used only when ``inputs_embeds`` is ``None``.
            attention_mask: Optional 2D padding mask, threaded through every layer.
            position_ids: Forwarded to the layers, which regenerate their own.
            inputs_embeds: Pre-computed embeddings; take precedence over ``input_ids``.
            output_attentions: Collect per-layer attention weights (default ``False``).
            output_hidden_states: Collect the input of every layer (default ``False``).
            return_dict: Return a dict (default) instead of a tuple.
            use_cache: Accepted for API compatibility; ignored.

        Returns:
            A dict with keys ``last_hidden_state``, ``hidden_states`` and
            ``attentions`` (the latter two are ``None`` unless requested), or
            a tuple of the non-``None`` entries when ``return_dict`` is false.
            ``hidden_states`` holds the input to each layer; the output of the
            last layer is not appended.

        Raises:
            ValueError: If neither ``input_ids`` nor ``inputs_embeds`` is given.
            RuntimeError: If the sequence length after all layers differs from
                the input length.
        """
        output_attentions = output_attentions if output_attentions is not None else False
        output_hidden_states = output_hidden_states if output_hidden_states is not None else False
        return_dict = True if return_dict is None else return_dict

        if input_ids is None and inputs_embeds is None:
            raise ValueError("You must specify either input_ids or inputs_embeds.")

        if inputs_embeds is None:
            hidden_states = self.embed_tokens(input_ids)  # (B, L, H)
        else:
            hidden_states = inputs_embeds

        mask2d = attention_mask
        _, orig_len, _ = hidden_states.shape

        all_hidden_states: Optional[List[torch.Tensor]] = [] if output_hidden_states else None
        all_attns: Optional[List[torch.Tensor]] = [] if output_attentions else None

        # ---- First half: shrink the sequence by differencing ----
        for layer in self.down_layers:
            if output_hidden_states:
                all_hidden_states.append(hidden_states)
            hidden_states, mask2d, attn = layer(
                hidden_states, mask2d, position_ids, output_attentions=output_attentions
            )
            if output_attentions:
                all_attns.append(attn)

        # ---- Second half: restore the length by integration ----
        for layer in self.up_layers:
            if output_hidden_states:
                all_hidden_states.append(hidden_states)
            hidden_states, mask2d, attn = layer(
                hidden_states, mask2d, position_ids, output_attentions=output_attentions
            )
            if output_attentions:
                all_attns.append(attn)

        # Sanity check: the final length must equal the input length.
        if hidden_states.size(1) != orig_len:
            raise RuntimeError(f"seq_len が復元されていません: got {hidden_states.size(1)} vs {orig_len}")

        hidden_states = self.norm(hidden_states)

        if not return_dict:
            out = (hidden_states,)
            if output_hidden_states:
                out = out + (all_hidden_states,)
            if output_attentions:
                out = out + (all_attns,)
            return out

        return {
            "last_hidden_state": hidden_states,
            "hidden_states": all_hidden_states,
            "attentions": all_attns,
        }
325
+
326
+
327
+ # ---------- CausalLM ヘッド(Phi3PreTrainedModel + GenerationMixin) ----------
328
+
329
class ResidualNetForCausalLM(Phi3PreTrainedModel, GenerationMixin):
    """``ResidualNetModel`` backbone with a linear language-modelling head.

    The head's weight is the same parameter object as the input embedding
    matrix. KV caching is not supported: ``use_cache`` and ``past_key_values``
    are accepted but ignored.
    """

    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    def __init__(self, config: ResidualNetConfig):
        super().__init__(config)
        self.model = ResidualNetModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Weight tying: the head shares the embedding parameter.
        self.lm_head.weight = self.model.embed_tokens.weight

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        use_cache: Optional[bool] = None,  # not supported
        past_key_values: Optional[List[torch.Tensor]] = None,  # not supported
    ) -> CausalLMOutputWithPast:
        """Compute logits and, when ``labels`` is given, the next-token loss.

        The loss is cross-entropy between ``logits[:, :-1]`` and
        ``labels[:, 1:]``. With ``return_dict`` false the result is the tuple
        ``(logits, loss)``; otherwise a ``CausalLMOutputWithPast`` whose
        ``past_key_values`` is always ``None``.
        """
        if return_dict is None:
            return_dict = True

        # The backbone is always queried in dict form with caching disabled.
        backbone_out = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            use_cache=False,
        )
        final_states = backbone_out["last_hidden_state"]  # (B, L, H)
        logits = self.lm_head(final_states).float()

        loss = None
        if labels is not None:
            # Causal LM loss: position t predicts token t + 1.
            predictions = logits[:, :-1, :].contiguous()
            targets = labels[:, 1:].contiguous()
            criterion = nn.CrossEntropyLoss()
            loss = criterion(predictions.view(-1, self.vocab_size), targets.view(-1))

        if not return_dict:
            # NOTE(review): this order is logits first, then loss (possibly
            # None). Transformers models usually put the loss first — confirm
            # no caller relies on that convention before changing it.
            return (logits, loss)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=None,  # not supported
            hidden_states=backbone_out["hidden_states"],
            attentions=backbone_out["attentions"],
        )

    @property
    def base_model(self):
        """Return the wrapped ``ResidualNetModel`` backbone."""
        return self.model
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "clean_up_tokenization_spaces": false,
231
+ "eos_token": "<|im_end|>",
232
+ "errors": "replace",
233
+ "extra_special_tokens": {},
234
+ "model_max_length": 131072,
235
+ "pad_token": "<|endoftext|>",
236
+ "split_special_tokens": false,
237
+ "tokenizer_class": "Qwen2Tokenizer",
238
+ "unk_token": null
239
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff