team6013 commited on
Commit
a6166d0
·
verified ·
1 Parent(s): d29c6d0

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ForwardVisualTokensArchForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "modeling_forward_visual_tokens_llava_arch.ForwardVisualTokensArchConfig",
7
+ "AutoModel": "modeling_forward_visual_tokens_llava_arch.ForwardVisualTokensArchModel",
8
+ "AutoModelForCausalLM": "modeling_forward_visual_tokens_llava_arch.ForwardVisualTokensArchForCausalLM"
9
+ },
10
+ "attention_dropout": 0.0,
11
+ "dtype": "float32",
12
+ "eos_token_id": [
13
+ 151645,
14
+ 151643
15
+ ],
16
+ "hidden_act": "silu",
17
+ "hidden_size": 4096,
18
+ "image_token_id": 151655,
19
+ "initializer_range": 0.02,
20
+ "intermediate_size": 22016,
21
+ "layer_types": [
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention",
51
+ "full_attention",
52
+ "full_attention",
53
+ "full_attention"
54
+ ],
55
+ "max_position_embeddings": 32768,
56
+ "max_window_layers": 28,
57
+ "model_type": "forward_visual_tokens_llava_arch",
58
+ "num_attention_heads": 32,
59
+ "num_hidden_layers": 32,
60
+ "num_key_value_heads": 32,
61
+ "p_processor_name_or_path": "Qwen/Qwen3-4B",
62
+ "pad_token_id": 151643,
63
+ "perceiver_name_or_path": "team6013/DPA-LLaVA-0.6B",
64
+ "rms_norm_eps": 1e-06,
65
+ "rope_scaling": null,
66
+ "rope_theta": 10000.0,
67
+ "sliding_window": null,
68
+ "t_tokenizer_name_or_path": "Qwen/Qwen2.5-VL-3B-Instruct",
69
+ "thinker_name_or_path": "Qwen/Qwen3-4B",
70
+ "tie_word_embeddings": false,
71
+ "transformers_version": "4.57.1",
72
+ "use_cache": true,
73
+ "use_sliding_window": false,
74
+ "visual_bandwidth": 1,
75
+ "vocab_size": 151936
76
+ }
generation_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "transformers_version": "4.57.1"
4
+ }
modeling_forward_visual_tokens_llava_arch.py ADDED
@@ -0,0 +1,655 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from PIL import Image
3
+ from typing import Optional, Union
4
+ import json
5
+ import os
6
+ from datetime import datetime
7
+ from transformers import (
8
+ Qwen2_5_VLForConditionalGeneration,
9
+ AutoTokenizer,
10
+ AutoProcessor,
11
+ Qwen3ForCausalLM,
12
+ Qwen3Config
13
+ )
14
+ from transformers import Qwen2PreTrainedModel
15
+ from transformers.generation import GenerationMixin
16
+ from transformers.processing_utils import Unpack
17
+ from transformers.utils import is_torchdynamo_compiling, ModelOutput
18
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
19
+ Qwen2_5_VLModelOutputWithPast,
20
+ )
21
+ from .modeling_llava_baseline import LLaVABaselineModelForConditionalGeneration, LLaVABaselineConfig
22
+ # Compatibility fix: KwargsForCausalLM doesn't exist in newer transformers versions
23
+ # try:
24
+ # from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import KwargsForCausalLM
25
+ # except ImportError:
26
+ # from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import TransformersKwargs as KwargsForCausalLM
27
+ from transformers.modeling_outputs import CausalLMOutputWithPast
28
+
29
+ from dataclasses import dataclass
30
+
31
+ from transformers.utils import auto_docstring
32
+ from transformers import Qwen2Config
33
+
34
+ IMG_START_ID = 151652
35
+ IMG_PAD_ID = 151655
36
+ IMG_END_ID = 151653
37
+
38
+ IMG_THINKER_PAD_ID = 151655
39
+ IMG_THINKER_START_ID = 151652
40
+ IMG_THINKER_END_ID = 151653
41
+
42
+
43
+
44
class ForwardVisualTokensArchConfig(Qwen2Config):
    """Configuration for the ForwardVisualTokens architecture.

    Extends ``Qwen2Config`` with the checkpoint paths of the two sub-models
    (a LLaVA-style "perceiver" and a Qwen3 "thinker"), the paths of their
    processor/tokenizer, and ``visual_bandwidth`` — the number of
    ``<im_msg-i>`` visual message tokens passed between them.
    """

    model_type = "forward_visual_tokens_llava_arch"
    # Standard transformers convention: the KV cache is never serialized.
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        use_cache=True,
        perceiver_name_or_path="../LLaVA-baseline-checkpoint-6000",
        thinker_name_or_path="Qwen/Qwen3-4B",
        t_tokenizer_name_or_path="../dpa_qwen3_tokenizer",
        p_tokenizer_name_or_path="../dpa_qwen25_processor",
        visual_bandwidth=1,
        **kwargs,
    ):
        self.use_cache = use_cache
        self.perceiver_name_or_path = perceiver_name_or_path
        self.thinker_name_or_path = thinker_name_or_path
        self.t_tokenizer_name_or_path = t_tokenizer_name_or_path
        # NOTE(review): the parameter is named ``p_tokenizer_name_or_path`` but
        # is stored as ``p_processor_name_or_path`` (the attribute the model
        # reads). Keep the asymmetry in mind when constructing configs.
        self.p_processor_name_or_path = p_tokenizer_name_or_path

        self.visual_bandwidth = visual_bandwidth
        # Image placeholder token id (151655 — presumably <|image_pad|> in the
        # Qwen2.5-VL vocabulary; see the replacement logic in the model's chat()).
        self.image_token_id = IMG_PAD_ID

        super().__init__(**kwargs)
68
+
69
+
70
class ForwardVisualTokensArchPreTrainedModel(Qwen2PreTrainedModel):
    """Shared base class: ties Qwen2's weight-init / checkpoint-loading
    machinery to :class:`ForwardVisualTokensArchConfig`."""

    config_class = ForwardVisualTokensArchConfig
72
+
73
+
74
def add_special_tokens(tkz):
    """Register 128 ``<im_msg-i>`` placeholder tokens on a tokenizer.

    Args:
        tkz: a Hugging Face tokenizer (anything exposing ``add_special_tokens``
            and ``convert_tokens_to_ids``).

    Returns:
        tuple: ``(tkz, mapping)`` where ``mapping`` maps each added token
        string to its id in the (now extended) vocabulary.
    """
    additional_special_tokens = [f"<im_msg-{i}>" for i in range(128)]
    tkz.add_special_tokens({"additional_special_tokens": additional_special_tokens})
    # Use the public lookup API rather than the private
    # `_convert_token_to_id_with_added_voc` helper; for tokens that were just
    # added, both resolve through the added-tokens vocabulary.
    ids = tkz.convert_tokens_to_ids(additional_special_tokens)
    mapping = dict(zip(additional_special_tokens, ids))
    return tkz, mapping
82
+
83
+
84
@dataclass
@auto_docstring(
    custom_intro="""
    Base class for Llava outputs, with hidden states and attentions.
    """
)
class ForwardVisualTokensArchOutputWithPast(ModelOutput):
    r"""
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Per-layer hidden states of the thinker model, returned when `output_hidden_states=True`.
    attentions (`tuple(torch.FloatTensor)`, *optional*):
        Attention weights of the thinker model, returned when `output_attentions=True`.
    logits (`torch.FloatTensor`, *optional*):
        Language-modeling logits produced by the thinker.
    """

    past_key_values: Optional[list[torch.FloatTensor]] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    logits: Optional[tuple[torch.FloatTensor]] = None
106
+
107
+
108
class ForwardVisualTokensArchModel(ForwardVisualTokensArchPreTrainedModel, GenerationMixin):
    """Two-stage vision-language model.

    A LLaVA-style "perceiver" encodes the image together with the question;
    its last-layer hidden states at the image-token positions are projected
    through ``linear_align_dim`` into the embedding space of a Qwen3
    "thinker", which then generates the textual answer.
    """

    def __init__(self, config: ForwardVisualTokensArchConfig):
        super().__init__(config)

        # All four sub-model / processor paths must be configured.
        assert self.config.perceiver_name_or_path is not None
        assert self.config.thinker_name_or_path is not None

        assert self.config.p_processor_name_or_path is not None
        assert self.config.t_tokenizer_name_or_path is not None

        # Perceiver: only its *config* is loaded here — the module is built
        # from config, so weights presumably come from the outer checkpoint.
        perceiver_config = LLaVABaselineConfig.from_pretrained(
            self.config.perceiver_name_or_path,
        )
        self.perceiver = LLaVABaselineModelForConditionalGeneration(perceiver_config)
        # self.perceiver.gradient_checkpointing_enable()

        self.p_processor = AutoProcessor.from_pretrained(
            self.config.p_processor_name_or_path
        )
        # Left padding so batched sequences are aligned at the right edge.
        self.p_processor.tokenizer.padding_side = "left"

        # Thinker: a plain Qwen3 causal LM, likewise constructed from config only.
        thinker_config = Qwen3Config.from_pretrained(self.config.thinker_name_or_path)
        self.thinker = Qwen3ForCausalLM(thinker_config)
        # self.thinker.gradient_checkpointing_enable()

        self.t_tokenizer = AutoTokenizer.from_pretrained(
            self.config.t_tokenizer_name_or_path, padding_side="left"
        )

        # Projects perceiver hidden states into the thinker's embedding space.
        # Because the perceiver is a grafted model, we must reach into its inner
        # language model's config to get the real hidden dimension.
        self.linear_align_dim = torch.nn.Sequential(
            torch.nn.Linear(
                self.perceiver.model.vlm.language_model.config.hidden_size, self.perceiver.model.vlm.language_model.config.hidden_size
            ),
            torch.nn.ReLU(),
            torch.nn.Linear(
                self.perceiver.model.vlm.language_model.config.hidden_size, self.thinker.config.hidden_size
            ),
        )

        # Annotation only: narrows self.config's type for type checkers.
        self.config: ForwardVisualTokensArchConfig

    def get_visual_message_tokens(self) -> list:
        """Return the ``visual_bandwidth`` placeholder tokens ``<im_msg-0>``.."""
        size = self.config.visual_bandwidth
        tokens = [f"<im_msg-{i}>" for i in range(size)]
        return tokens

    def get_visual_message_token_ids(self, model):
        """Resolve the visual message tokens to ids.

        Args:
            model: ``"p"`` for the perceiver's tokenizer, ``"t"`` for the
                thinker's tokenizer.

        Raises:
            NotImplementedError: for any other value of ``model``.
        """
        tokens = self.get_visual_message_tokens()
        if model == "p":
            ids = self.p_processor.tokenizer.convert_tokens_to_ids(tokens)
        elif model == "t":
            ids = self.t_tokenizer.convert_tokens_to_ids(tokens)
        else:
            raise NotImplementedError
        return ids

    def get_visual_message(self) -> str:
        """Return the visual message tokens concatenated into one string."""
        message = "".join(self.get_visual_message_tokens())
        return message

    def chat(self, images, msgs, *args, **kwargs):
        """Batched single-turn inference entry point.

        Args:
            images: list of PIL images or file paths, one per conversation.
            msgs: list of message lists; each must contain exactly one
                ``{"role": "user", "content": <question str>}`` dict.
            kwargs: may contain ``thinker_generation_params`` (dict of
                generation kwargs; ``max_new_tokens`` defaults to 32768).

        Returns:
            list[str]: one decoded answer per input (thinking content, i.e.
            everything before the last ``</think>``, is stripped).
        """
        assert len(images) == len(msgs)
        assert args == ()
        assert "max_new_tokens" not in kwargs

        # p_prompt_template = 'Encode the image into {num_feat} tokens, including information related to the question. Here is the question: {question}'
        p_prompt_template = "{question}"
        questions = []
        p_images = []
        p_texts = []

        # Build one perceiver chat prompt (image + question) per input.
        for i in range(len(images)):
            image = images[i]
            msg_list = msgs[i]

            if not (len(msg_list) == 1 and msg_list[0]["role"] == "user"):
                raise ValueError(
                    f"Each message list must contain a single user dictionary. Error at index {i}."
                )

            # Accept either a path or an already-open PIL image.
            pil_image = (
                Image.open(image).convert("RGB") if isinstance(image, str) else image
            )
            p_images.append(pil_image)

            question = msg_list[0]["content"]
            questions.append(question)

            p_message = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image},
                        {
                            "type": "text",
                            "text": p_prompt_template.format(question=question),
                        },
                        # {'type': 'text', 'text': p_prompt_template.format(num_feat=self.config.visual_bandwidth,
                        #                                                   question=question)}
                    ],
                }
                # {'role': 'assisstant', 'content': [
                #     {'type': 'text', 'text': self.get_visual_message()}
                # ]}
            ]
            p_texts.append(
                self.p_processor.apply_chat_template(
                    p_message, tokenize=False, add_generation_prompt=False
                )
            )

        # Tokenize + preprocess images for the perceiver (left-padded batch).
        perceiver_inputs = self.p_processor(
            text=p_texts,
            images=p_images,
            padding=True,
            return_tensors="pt",
        ).to(self.device)

        # Build the thinker prompts: mirror each perceiver image span with an
        # equally long <|image_pad|> run so token counts match in forward().
        # t_prompt_template = '{question} Image: ' + self.get_visual_message()
        t_prompt_template = "<image>{question}"
        t_texts = []
        for i in range(len(questions)):
            prompt = t_prompt_template.format(question=questions[i])

            p_input_ids = perceiver_inputs["input_ids"][i].tolist()
            img_start_idx = p_input_ids.index(IMG_START_ID)
            img_end_idx = p_input_ids.index(IMG_END_ID)

            assert img_start_idx < img_end_idx

            prompt = prompt.replace(
                "<image>",
                "<|vision_start|>"
                + "<|image_pad|>" * (img_end_idx - img_start_idx - 1)
                + "<|vision_end|>",
            )
            message = [
                {"role": "user", "content": prompt},
                # {"role": "assistant", "content": "<think>\n\n</think>\n\n"}
            ]
            # "<think>\n\n" is appended so the thinker starts inside a think block.
            t_texts.append(
                self.t_tokenizer.apply_chat_template(
                    message,
                    tokenize=False,
                    add_generation_prompt=True,
                    enable_thinking=True,
                    # message, tokenize=False, add_generation_prompt=True, enable_thinking=False
                    # ))
                )
                + "<think>\n\n"
            )

        model_inputs_t = self.t_tokenizer(
            t_texts, return_tensors="pt", padding=True
        ).to(self.thinker.device)

        # Attach the perceiver-side tensors; forward() consumes them.
        model_inputs_t["input_ids_of_perceiver"] = perceiver_inputs["input_ids"]
        model_inputs_t["attention_mask_of_perceiver"] = perceiver_inputs[
            "attention_mask"
        ]
        model_inputs_t["pixel_values"] = perceiver_inputs["pixel_values"]
        model_inputs_t["image_grid_thw"] = perceiver_inputs["image_grid_thw"]

        thinker_generation_params = kwargs.get("thinker_generation_params", {})
        thinker_generation_params["max_new_tokens"] = thinker_generation_params.get(
            "max_new_tokens", 32768
        )

        assert model_inputs_t["pixel_values"] is not None

        with torch.inference_mode():
            generated_ids_t = self.generate(
                **model_inputs_t,
                **thinker_generation_params,
                eos_token_id=self.t_tokenizer.eos_token_id,
            )

        # Decode: split each continuation at the last </think> token and keep
        # only the answer part.
        final_responses = []
        for i in range(len(msgs)):
            output_ids = generated_ids_t[i][len(model_inputs_t.input_ids[i]) :].tolist()
            try:
                # Locate the last </think> token (id 151668).
                index = len(output_ids) - output_ids[::-1].index(151668)
                print(
                    f"len output_ids: {len(output_ids)}, subtract {output_ids[::-1].index(151668)}"
                )
            except ValueError:
                # No </think> found: treat the whole continuation as the answer.
                index = 0

            thinking_content = self.t_tokenizer.decode(
                output_ids[:index], skip_special_tokens=True
            ).strip("\n")

            print(f"content ids: {output_ids[index:]}")

            content = self.t_tokenizer.decode(
                output_ids[index:], skip_special_tokens=True
            ).strip("\n")
            final_responses.append(content)

        # return [x[0] for x in self.generate([image], [msgs], *args, **kwargs)]
        return final_responses

    # NOTE: All inputs should be considered as inputs to thinker.
    # The thinker consumes multimodal data by calling perceiver.
    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        input_ids_of_perceiver=None,
        attention_mask_of_perceiver=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        pixel_values=None,
        pixel_values_videos=None,
        image_grid_thw=None,
        video_grid_thw=None,
        second_per_grid_ts=None,
        **kwargs,
    ):
        """Standard GenerationMixin hook; drops pixel_values after the prefill
        step so the perceiver is only run once per generation."""
        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
        assert pixel_values is not None
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            attention_mask=attention_mask,
            input_ids_of_perceiver=input_ids_of_perceiver,
            attention_mask_of_perceiver=attention_mask_of_perceiver,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            position_ids=position_ids,
            pixel_values=pixel_values,
            pixel_values_videos=pixel_values_videos,
            image_grid_thw=image_grid_thw,
            video_grid_thw=video_grid_thw,
            second_per_grid_ts=second_per_grid_ts,
            use_cache=use_cache,
            **kwargs,
        )

        # # Qwen2-5-VL position_ids are prepareed with rope_deltas in forward
        # model_inputs["position_ids"] = None

        assert model_inputs["pixel_values"] is not None
        if cache_position[0] != 0:
            # Cache hit (decode step): skip pixel-value encoding entirely.
            model_inputs["pixel_values"] = None
            # model_inputs["pixel_values_videos"] = None

        return model_inputs

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        input_ids_of_perceiver: torch.LongTensor = None,
        attention_mask_of_perceiver: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        pixel_values: Optional[torch.Tensor] = None,
        pixel_values_videos: Optional[torch.FloatTensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        rope_deltas: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        second_per_grid_ts: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[tuple, ForwardVisualTokensArchOutputWithPast]:
        # `input_ids` are the thinker's ids; rename to make that explicit.
        t_input_ids = input_ids
        del input_ids

        if inputs_embeds is None:
            inputs_embeds = self.thinker.get_input_embeddings()(t_input_ids)

        # Only on the prefill step (pixel_values present): run the perceiver
        # and splice its projected image features into the thinker embeddings.
        if pixel_values is not None:
            p_msg_st_id = IMG_START_ID
            p_msg_ed_id = IMG_END_ID
            p_msg_st_list = []
            p_msg_ed_list = []

            # Iterate over batch: each element may contain multiple images (in packing mode)
            for batch_idx, perceiver_sample_input_ids in enumerate(input_ids_of_perceiver):
                # Find ALL image start/end tokens in this (potentially packed) sequence
                st_indices = (perceiver_sample_input_ids == p_msg_st_id).nonzero(
                    as_tuple=True
                )[0]
                ed_indices = (perceiver_sample_input_ids == p_msg_ed_id).nonzero(
                    as_tuple=True
                )[0]
                # NOTE(review): `samples` is unused — presumably 151644 is the
                # <|im_start|> marker; candidate for removal. Verify before deleting.
                samples = (perceiver_sample_input_ids == 151644).nonzero(
                    as_tuple=True
                )[0]

                # In packing mode: multiple images per sequence (len(st_indices) = pack_size)
                # In non-packing mode: one image per sequence (len(st_indices) = 1)
                assert len(st_indices) >= 1, f"No start token found in perceiver input {batch_idx}"
                assert len(ed_indices) >= 1, f"No end token found in perceiver input {batch_idx}"
                assert len(st_indices) == len(ed_indices), f"Mismatched start/end tokens in batch {batch_idx}"

                # Collect start/end positions for all images in this batch element
                for st, ed in zip(st_indices, ed_indices):
                    p_msg_st_list.append(st)
                    p_msg_ed_list.append(ed)

            # Prepare perceiver inputs
            perceiver_kwargs = {
                'input_ids': input_ids_of_perceiver,
                'pixel_values': pixel_values,
                'attention_mask': attention_mask_of_perceiver,
                'image_grid_thw': image_grid_thw,
                'output_hidden_states': True,
            }

            # TEMPORARY: Disable position_ids for perceiver to debug hang issue
            # Add position_ids if available (for packing support)
            position_ids_of_perceiver = kwargs.get('position_ids_of_perceiver')
            if position_ids_of_perceiver is not None:
                perceiver_kwargs['position_ids'] = position_ids_of_perceiver

            out = self.perceiver(**perceiver_kwargs)

            # Only keep last-layer hidden states; release the other layers.
            last_layer_hiddens = out.hidden_states[-1]

            # Free intermediates we no longer need (gradients are preserved).
            if hasattr(out, "hidden_states"):
                del out.hidden_states  # free the other layers' hidden states
            if hasattr(out, "attentions"):
                del out.attentions  # free attention weights

            # Extract visual features from all images.
            # p_msg_st_list / p_msg_ed_list hold positions for all images in
            # order; track which batch element each position belongs to.
            batch_msg = []
            img_idx = 0  # Track which image we're processing
            for batch_idx, perceiver_sample_input_ids in enumerate(input_ids_of_perceiver):
                # Find how many images are in this batch element
                st_indices = (perceiver_sample_input_ids == p_msg_st_id).nonzero(as_tuple=True)[0]
                num_images_in_batch = len(st_indices)

                # Extract features for each image in this batch element
                for _ in range(num_images_in_batch):
                    st = p_msg_st_list[img_idx]
                    ed = p_msg_ed_list[img_idx]
                    # Slice spans [st, ed] inclusive — start and end tokens included.
                    msg_feat = last_layer_hiddens[batch_idx, st : ed + 1, :]
                    batch_msg.append(msg_feat)
                    img_idx += 1

            image_features = torch.cat(batch_msg, dim=0)
            image_features = self.linear_align_dim(image_features)

            # The thinker prompt must contain exactly one placeholder token per
            # extracted feature (start + pads + end mirror the perceiver span).
            n_msg_features = image_features.shape[0]
            msg_mask = (
                (t_input_ids == IMG_THINKER_START_ID)
                | (t_input_ids == IMG_THINKER_END_ID)
                | (t_input_ids == IMG_THINKER_PAD_ID)
            )
            n_msg_tokens = msg_mask.sum()

            if not is_torchdynamo_compiling() and n_msg_tokens != n_msg_features:
                raise ValueError(
                    f"Image features and image tokens do not match: tokens: {n_msg_tokens}, features {n_msg_features}"
                )

            mask_unsqueezed = msg_mask.unsqueeze(-1)
            mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)

            image_mask = mask_expanded.to(inputs_embeds.device)
            image_features = image_features.to(
                inputs_embeds.device, inputs_embeds.dtype
            )

            # Overwrite placeholder embeddings with the projected image features.
            inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_features)

            del last_layer_hiddens, batch_msg, mask_expanded, mask_unsqueezed

        outputs = self.thinker(
            input_ids=None,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **kwargs,
        )

        output = ForwardVisualTokensArchOutputWithPast(
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            logits=outputs.logits,
        )

        return output if return_dict else output.to_tuple()
538
+
539
+
540
class ForwardVisualTokensArchForCausalLM(ForwardVisualTokensArchPreTrainedModel, GenerationMixin):
    """Causal-LM wrapper around :class:`ForwardVisualTokensArchModel`.

    Shares the thinker's LM head, propagates eos/pad token ids into the
    config, and adds loss computation over the thinker logits.
    """

    def __init__(self, config: ForwardVisualTokensArchConfig):
        super().__init__(config)
        self.model = ForwardVisualTokensArchModel(config)
        self.vocab_size = config.vocab_size

        # Shared reference (not a copy) to the thinker's own lm_head.
        self.lm_head = self.model.thinker.lm_head
        # del self.model.thinker.lm_head

        # Propagate special-token ids from the thinker / its tokenizer so that
        # generate() stops correctly.
        self.config.eos_token_id = self.model.thinker.generation_config.eos_token_id
        if self.model.t_tokenizer.pad_token_id is None:
            self.model.t_tokenizer.pad_token = self.model.t_tokenizer.eos_token

        self.config.pad_token_id = self.model.t_tokenizer.pad_token_id
        print(
            f"Config eos_token_id: {self.config.eos_token_id}, pad_token_id: {self.config.pad_token_id}"
        )

        self.post_init()

    def get_input_embeddings(self):
        # Input embeddings live on the thinker; delegate.
        return self.model.thinker.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.thinker.set_input_embeddings(value)

    def _register_perceiver_embedding_gradient_hook(self):
        """Mask embedding gradients so that only the ``<im_msg-*>`` rows of the
        perceiver's embedding matrix receive updates during training.

        Best-effort: any failure is logged and swallowed so training can proceed.
        """
        try:
            embedding_layer = self.model.perceiver.get_input_embeddings()
            print(
                f"Successfully located Perceiver's embedding layer: {embedding_layer}"
            )

            trainable_token_ids = self.model.get_visual_message_token_ids("p")
            if not trainable_token_ids:
                print(
                    "WARNING: No trainable token IDs found for Perceiver. Hook will not be effective."
                )
                return

            print(f"Target trainable token IDs for Perceiver: {trainable_token_ids}")

            # NOTE(review): `vocab_size` is unpacked but unused below.
            vocab_size, _ = embedding_layer.weight.shape
            # 1.0 on trainable rows, 0.0 everywhere else.
            mask = torch.zeros_like(embedding_layer.weight)

            for token_id in trainable_token_ids:
                mask[token_id, :] = 1.0

            def grad_mask_hook(grad):
                # NOTE(review): mutates the incoming gradient in place via
                # mul_ — relies on the hook contract tolerating in-place edits.
                return grad.mul_(mask)

            embedding_layer.weight.register_hook(grad_mask_hook)

            print("=" * 70)
            print("SUCCESS: PERCEIVER embedding gradient hook has been registered.")
            print(
                f"Only embeddings for the following Perceiver token IDs will be updated: {trainable_token_ids}"
            )
            print("This message should only appear ONCE at the beginning of training.")
            print("=" * 70)

        except Exception as e:
            print(
                f"ERROR: Failed to register Perceiver embedding gradient hook. Reason: {e}"
            )

    # def get_output_embeddings(self):
    #     return self.model.thinker.get_output_embeddings()

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        labels: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[tuple, CausalLMOutputWithPast]:
        """Run the inner model; compute the LM loss when `labels` is given."""
        # For lora training
        kwargs['return_dict'] = True
        # NOTE(review): return_dict is forced True just above, so the tuple
        # branch further down is currently unreachable.
        return_dict = kwargs.get("return_dict", True)

        outputs = self.model(
            input_ids=input_ids,
            # index=index,
            # return_dict=True,
            **kwargs,
        )

        logits = outputs.logits
        loss = None

        if labels is not None:
            loss = self.loss_function(
                logits=logits,
                labels=labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        # Delegate to the inner model (handles prefill-only pixel encoding).
        return self.model.prepare_inputs_for_generation(input_ids, **kwargs)

    def chat(self, images, msgs, *args, **kwargs):
        """Convenience inference entry point; see the inner model's ``chat``."""
        return self.model.chat(images, msgs, *args, **kwargs)
modeling_llava_baseline.py ADDED
@@ -0,0 +1,711 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable, List, Optional, Tuple, Union
2
+
3
+ import torch
4
+ import torch.distributed as dist
5
+ import torch.nn as nn
6
+ import transformers.models.qwen2_5_vl.modeling_qwen2_5_vl as qwen25
7
+ import transformers.models.qwen3.modeling_qwen3 as qwen3
8
+ from transformers import (Qwen2_5_VLModel, Qwen2Config,
9
+ Qwen2PreTrainedModel, AutoConfig)
10
+ from transformers.cache_utils import Cache, DynamicCache
11
+ from transformers.configuration_utils import PretrainedConfig
12
+ from transformers.generation import GenerationMixin
13
+ from transformers.masking_utils import (ALL_MASK_ATTENTION_FUNCTIONS,
14
+ BlockMask,
15
+ _is_torch_greater_or_equal_than_2_6,
16
+ and_masks,
17
+ causal_mask_function,
18
+ or_masks,
19
+ packed_sequence_mask_function)
20
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
21
+ from transformers.modeling_outputs import BaseModelOutputWithPast
22
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
23
+ # from transformers.models.qwen3.modeling_qwen3 import Qwen3Attention, Qwen3Model, eager_attention_forward
24
+ # from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLCausalLMOutputWithPast, Qwen2_5_VLRotaryEmbedding, apply_multimodal_rotary_pos_emb
25
+ from transformers.processing_utils import Unpack
26
+ from transformers.utils import auto_docstring
27
+ from transformers.utils.deprecation import deprecate_kwarg
28
+ try:
29
+ from transformers.masking_utils import _is_torch_xpu_available
30
+ except:
31
+ _is_torch_xpu_available = False
32
+ from transformers.masking_utils import sliding_window_causal_mask_function
33
+
34
+
35
+ def find_packed_sequence_indices(position_ids: torch.Tensor) -> torch.Tensor:
36
+ """
37
+ Find the indices of the sequence to which each new query token in the sequence belongs when using packed
38
+ tensor format (i.e. several sequences packed in the same batch dimension).
39
+
40
+ Args:
41
+ position_ids (`torch.Tensor`)
42
+ A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences.
43
+
44
+ Returns:
45
+ A 2D tensor where each similar integer indicates that the tokens belong to the same sequence. For example, if we
46
+ pack 3 sequences of 2, 3 and 1 tokens respectively along a single batch dim, this will return [[0, 0, 1, 1, 1, 2]].
47
+ """
48
+ # What separate different sequences is when 2 consecutive positions_ids are separated by more than 1. So
49
+ # taking the diff (by prepending the first value - 1 to keep correct indexing) and applying cumsum to the result
50
+ # gives exactly the sequence indices
51
+ # Note that we assume that a single sequence cannot span several batch dimensions, i.e. 1 single sequence
52
+ # cannot be part of the end of the first batch dim and the start of the 2nd one for example
53
+ first_dummy_value = position_ids[:, :1] - 1 # We just need the diff on this first value to be 1
54
+ position_diff = torch.diff(position_ids, prepend=first_dummy_value, dim=-1)
55
+ packed_sequence_mask = (position_diff < 0).cumsum(-1)
56
+
57
+ # Here it would be nice to return None if we did not detect packed sequence format, i.e. if `packed_sequence_mask[:, -1] == 0`
58
+ # but it causes issues with export
59
+ return packed_sequence_mask
60
+
61
+
62
def _preprocess_mask_arguments(
    config: PretrainedConfig,
    input_embeds: torch.Tensor,
    attention_mask: Optional[Union[torch.Tensor, BlockMask]],
    cache_position: torch.Tensor,
    past_key_values: Optional[Cache],
    position_ids: Optional[torch.Tensor],
    layer_idx: Optional[int],
) -> tuple[bool, Optional[Union[torch.Tensor, BlockMask]], Optional[torch.Tensor], int, int]:
    """
    Perform some common pre-processing of the mask arguments we get from the modeling code. Mostly determine the
    key-value length and offsets, and if we should early exit or not.

    Args:
        config (`PretrainedConfig`):
            The model config.
        input_embeds (`torch.Tensor`):
            The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the
            batch size, query length and dtype.
        attention_mask (`torch.Tensor`, optional):
            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length).
            It can also be an already prepared 4D mask, in which case it is returned as-is.
        cache_position (`torch.Tensor`):
            A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
        past_key_values (`Cache`, optional):
            The past key values, if we use a cache.
        position_ids (`torch.Tensor`, optional)
            A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences.
        layer_idx (`int`, optional):
            If `past_key_values` is not None, this is the layer index of the cache from which to get the key-value
            length and offset. Indeed, for hybrid caches, different layers may return different lengths.

    Returns:
        early_exit (`bool`):
            Whether we should early exit mask creation, and return the mask as-is.
        attention_mask (`torch.Tensor` or `BlockMask` or `None`):
            The attention mask to either return immediately, or to use in downstream mask creation.
        packed_sequence_mask (`torch.Tensor`, optional):
            In case we detected packed sequence format, this is a tensor where each similar integer indicates that
            the tokens belong to the same sequence.
        kv_length (`int`):
            The size that the key and value states will have during the attention computation.
        kv_offset (`int`):
            An offset to indicate at which first position the key and values states will refer to.
    """
    # If the mask is already 4D, simply return as-is (it was already prepared, or it is custom)
    if isinstance(attention_mask, (torch.Tensor, BlockMask)) and len(attention_mask.shape) == 4:
        return True, attention_mask, None, None, None

    # For TGI/vLLM backends, or other custom attention without equivalent mask creation: we don't need a mask!
    # Note: it's not ideal to check the `_global_mapping` attribute instead of the object itself, however otherwise
    # full graph dynamo tracing (i.e. torch.export or compile with `fullgraph=True`) will fail on Python<3.11
    # with `torch._dynamo.exc.Unsupported: 'inline in skipfiles:Mapping.__contains__ | __contains__, skipped
    # according trace_rules.lookup SKIP_DIRS'` -- can be removed when we require Python>=3.11
    if config._attn_implementation not in ALL_MASK_ATTENTION_FUNCTIONS._global_mapping:
        return True, None, None, None, None

    # Move the mask to correct device, and potentially switch dtype for efficiency
    # (bool masks are cheaper to combine than float masks at this stage).
    if attention_mask is not None and attention_mask.ndim == 2:
        attention_mask = attention_mask.to(device=cache_position.device, dtype=torch.bool)

    # If using a cache, it can give all information about mask sizes based on seen tokens
    if past_key_values is not None:
        kv_length, kv_offset = past_key_values.get_mask_sizes(cache_position, layer_idx)
    # Otherwise, the sizes are simply the input sizes
    else:
        kv_length, kv_offset = input_embeds.shape[1], 0

    # We check the position_ids for potential packed sequence format (only if the 2D attention mask is explicitly None,
    # and we don't have past_key_values, i.e. generally a training setup)
    packed_sequence_mask = None
    if position_ids is not None and attention_mask is None and past_key_values is None:
        batch_size = input_embeds.shape[0]
        # The position ids are sometimes just unsqueezed, without being expanded
        if batch_size != position_ids.shape[0]:
            position_ids = position_ids.expand(batch_size, -1)
        packed_sequence_mask = find_packed_sequence_indices(position_ids)

    return False, attention_mask, packed_sequence_mask, kv_length, kv_offset
141
+
142
+
143
def create_causal_mask(
    config: PretrainedConfig,
    input_embeds: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    cache_position: torch.Tensor,
    past_key_values: Optional[Cache],
    position_ids: Optional[torch.Tensor] = None,
    or_mask_function: Optional[Callable] = None,
    and_mask_function: Optional[Callable] = None,
) -> Optional[Union[torch.Tensor, BlockMask]]:
    """
    Create a standard causal mask based on the attention implementation used (stored in the config). If `past_key_values`
    has an hybrid cache structure, this function will return the mask corresponding to one of the "full_attention" layers (to align
    to what is needed in the `modeling_xxx.py` files).

    Args:
        config (`PretrainedConfig`):
            The model config.
        input_embeds (`torch.Tensor`):
            The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the
            batch size, query length and dtype.
        attention_mask (`torch.Tensor`, optional):
            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length).
            It can also be an already prepared 4D mask, in which case it is returned as-is.
        cache_position (`torch.Tensor`):
            A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
        past_key_values (`Cache`, optional):
            The past key values, if we use a cache.
        position_ids (`torch.Tensor`, optional)
            A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences.
        or_mask_function (`Callable`, optional):
            An optional mask function to combine with the causal mask function (by doing the union of both). This is
            useful to easily overlay another mask on top of the causal one, for example for image tokens handling.
        and_mask_function (`Callable`, optional):
            An optional mask function to combine with the causal mask function (by doing the intersection of both). This is
            useful to easily overlay another mask on top of the causal one, for example for image tokens handling.
    """
    # If we have an hybrid cache structure, here we want to create the mask for the full layers
    if hasattr(past_key_values, "is_sliding") and False in past_key_values.is_sliding:
        layer_idx = past_key_values.is_sliding.index(False)
    else:
        layer_idx = 0

    early_exit, attention_mask, packed_sequence_mask, kv_length, kv_offset = _preprocess_mask_arguments(
        config, input_embeds, attention_mask, cache_position, past_key_values, position_ids, layer_idx
    )
    if early_exit:
        return attention_mask

    batch_size, dtype = input_embeds.shape[0], input_embeds.dtype
    mask_factory_function = causal_mask_function
    mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]

    # Do not allow skip if we are compiling (this is to match BC)
    # TODO: cyril -> probably revisit and remove this, but a lot of tests rely on it
    # NOTE(review): on XPU the `is_causal` fast path is allowed unconditionally, even with a
    # compileable cache — confirm this deviation from the non-XPU branch is intentional.
    if _is_torch_xpu_available:
        allow_is_causal_skip = True
    else:
        allow_is_causal_skip = not getattr(past_key_values, "is_compileable", False)

    # Allow slight deviations from causal mask
    # Note that it is very important to apply this before any other deviations of the mask (such as packed sequence mask,
    # padding mask, etc) as the resulting mask may otherwise not be correct!
    if or_mask_function is not None:
        if not _is_torch_greater_or_equal_than_2_6:
            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
        mask_factory_function = or_masks(mask_factory_function, or_mask_function)
        allow_is_causal_skip = False
    if and_mask_function is not None:
        if not _is_torch_greater_or_equal_than_2_6:
            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
        mask_factory_function = and_masks(mask_factory_function, and_mask_function)
        allow_is_causal_skip = False

    # If we detected packing format
    if packed_sequence_mask is not None and _is_torch_greater_or_equal_than_2_6:
        mask_factory_function = and_masks(mask_factory_function, packed_sequence_mask_function(packed_sequence_mask))
        allow_is_causal_skip = False

    # We now create the mask
    causal_mask = mask_interface(
        batch_size=batch_size,
        cache_position=cache_position,
        kv_length=kv_length,
        kv_offset=kv_offset,
        mask_function=mask_factory_function,
        attention_mask=attention_mask,
        allow_is_causal_skip=allow_is_causal_skip,  # additional kwarg for sdpa
        dtype=dtype,  # Additional kwarg for eager
        config=config,  # Pass the config as well, in case someone wants to easily have their own mask_interface
    )
    return causal_mask
235
+
236
+
237
def create_sliding_window_causal_mask(
    config: PretrainedConfig,
    input_embeds: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    cache_position: torch.Tensor,
    past_key_values: Optional[Cache],
    position_ids: Optional[torch.Tensor] = None,
    or_mask_function: Optional[Callable] = None,
    and_mask_function: Optional[Callable] = None,
) -> Optional[Union[torch.Tensor, BlockMask]]:
    """
    Create a sliding window causal mask based on the attention implementation used (stored in the config). This type
    of attention pattern was mostly democratized by Mistral. If `past_key_values` has an hybrid cache structure, this
    function will return the mask corresponding to one of the "sliding_attention" layers (to align to what is needed in the
    `modeling_xxx.py` files).

    Args:
        config (`PretrainedConfig`):
            The model config. Must carry a non-None `sliding_window` attribute, otherwise a `ValueError` is raised.
        input_embeds (`torch.Tensor`):
            The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the
            batch size, query length and dtype.
        attention_mask (`torch.Tensor`, optional):
            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length).
            It can also be an already prepared 4D mask, in which case it is returned as-is.
        cache_position (`torch.Tensor`):
            A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
        past_key_values (`Cache`, optional):
            The past key values, if we use a cache.
        position_ids (`torch.Tensor`, optional)
            A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences.
        or_mask_function (`Callable`, optional):
            An optional mask function to combine with the sliding causal mask function (by doing the union of both). This is
            useful to easily overlay another mask on top of the sliding causal one, for example for image tokens handling.
        and_mask_function (`Callable`, optional):
            An optional mask function to combine with the sliding causal mask function (by doing the intersection of both). This is
            useful to easily overlay another mask on top of the sliding causal one, for example for image tokens handling.
    """
    # If we have an hybrid cache structure, here we want to create the mask for the sliding layers
    if hasattr(past_key_values, "is_sliding") and True in past_key_values.is_sliding:
        layer_idx = past_key_values.is_sliding.index(True)
    else:
        layer_idx = 0

    early_exit, attention_mask, packed_sequence_mask, kv_length, kv_offset = _preprocess_mask_arguments(
        config, input_embeds, attention_mask, cache_position, past_key_values, position_ids, layer_idx
    )
    if early_exit:
        return attention_mask

    sliding_window = getattr(config, "sliding_window", None)
    if sliding_window is None:
        raise ValueError("Could not find a `sliding_window` argument in the config, or it is not set")

    batch_size, dtype = input_embeds.shape[0], input_embeds.dtype
    mask_factory_function = sliding_window_causal_mask_function(sliding_window)
    mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]

    # Do not allow skip if we are compiling (this is to match BC)
    # TODO: cyril -> probably revisit and remove this, but a lot of tests rely on it
    # NOTE(review): unlike `create_causal_mask`, there is no XPU special-case here — confirm
    # the asymmetry between the two functions is intentional.
    allow_is_causal_skip = not getattr(past_key_values, "is_compileable", False)

    # Allow slight deviations from causal mask
    # Note that it is very important to apply this before any other deviations of the mask (such as packed sequence mask,
    # padding mask, etc) as the resulting mask may otherwise not be correct!
    if or_mask_function is not None:
        if not _is_torch_greater_or_equal_than_2_6:
            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
        mask_factory_function = or_masks(mask_factory_function, or_mask_function)
        allow_is_causal_skip = False
    if and_mask_function is not None:
        if not _is_torch_greater_or_equal_than_2_6:
            raise ValueError("Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6")
        mask_factory_function = and_masks(mask_factory_function, and_mask_function)
        allow_is_causal_skip = False

    # If we detected packing format
    if packed_sequence_mask is not None and _is_torch_greater_or_equal_than_2_6:
        mask_factory_function = and_masks(mask_factory_function, packed_sequence_mask_function(packed_sequence_mask))
        allow_is_causal_skip = False

    # We now create the mask
    causal_mask = mask_interface(
        batch_size=batch_size,
        cache_position=cache_position,
        kv_length=kv_length,
        kv_offset=kv_offset,
        mask_function=mask_factory_function,
        attention_mask=attention_mask,
        allow_is_causal_skip=allow_is_causal_skip,  # additional kwarg for sdpa
        local_size=sliding_window,  # Additional kwarg for sdpa
        dtype=dtype,  # Additional kwarg for eager
        config=config,  # Pass the config as well, in case someone wants to easily have their own mask_interface
    )
    return causal_mask
332
+
333
class Qwen3Attention(qwen3.Qwen3Attention):
    """Qwen3 attention patched to apply Qwen2.5-VL's multimodal (3-axis) rotary embeddings.

    The only deviations from the stock Qwen3 attention are:
    * `position_embeddings` is expected to carry the 3D cos/sin produced by
      `Qwen2_5_VLRotaryEmbedding`, applied via `apply_multimodal_rotary_pos_emb`
      using `self.rope_scaling["mrope_section"]` (set externally by the wrapper model);
    * sdpa + `output_attentions=True` falls back to eager instead of crashing.
    """

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Run one multimodal-RoPE attention step and return (attn_output, attn_weights)."""
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        # Project to (batch, num_heads, seq, head_dim); q/k get per-head RMS norm (Qwen3 style).
        query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        # 3D cos/sin (temporal/height/width axes) for multimodal RoPE.
        cos, sin = position_embeddings
        mrope_section = self.rope_scaling["mrope_section"]
        query_states, key_states = qwen25.apply_multimodal_rotary_pos_emb(
            query_states, key_states, cos, sin, mrope_section
        )

        if past_key_values is not None:
            # sin/cos are forwarded so cache variants that need them (e.g. for re-rotation) can use them.
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        attention_interface: Callable = qwen3.eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                # SDPA cannot return attention weights. Previously this branch was
                # `assert False` while its message claimed to fall back — actually
                # fall back to eager attention, as upstream transformers does.
                import warnings

                warnings.warn(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support "
                    "`output_attentions=True`. Falling back to eager attention. This warning can be "
                    'removed using the argument `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            sliding_window=self.sliding_window,
            **kwargs,
        )

        # Merge heads back to (batch, seq, hidden) and apply the output projection.
        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights
395
+
396
+
397
class Qwen3DecoderLayer(qwen3.Qwen3DecoderLayer):
    # Thin subclass whose only change is swapping in the multimodal-RoPE
    # attention defined above; MLP, norms and `forward` come from the parent.
    def __init__(self, config: qwen3.Qwen3Config, layer_idx: int):
        super().__init__(config, layer_idx)
        # The parent constructor already built a vanilla attention module;
        # replace it with the multimodal-RoPE variant.
        self.self_attn = Qwen3Attention(config=config, layer_idx=layer_idx)
401
+
402
+
403
class Qwen3Model(qwen3.Qwen3PreTrainedModel):
    """Qwen3 decoder stack rebuilt with multimodal-RoPE decoder layers.

    Mirrors `transformers` Qwen3Model but uses the local `Qwen3DecoderLayer`
    (multimodal attention) and a forward that expands `position_ids` to the
    3-axis layout expected by Qwen2.5-VL rotary embeddings.
    """

    def __init__(self, config: qwen3.Qwen3Config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(
            config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [Qwen3DecoderLayer(config, layer_idx)
             for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = qwen3.Qwen3RMSNorm(
            config.hidden_size, eps=config.rms_norm_eps)
        # NOTE: the wrapper model replaces this with a Qwen2_5_VLRotaryEmbedding after construction.
        self.rotary_emb = qwen3.Qwen3RotaryEmbedding(config=config)
        self.gradient_checkpointing = False
        # True when the config declares any "sliding_attention" layer type.
        self.has_sliding_layers = "sliding_attention" in self.config.layer_types

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """
        For transformers library version compatability.
        """
        return self.embed_tokens

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack,
    ):
        # Exactly one of input_ids / inputs_embeds must be provided.
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length(
            ) if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        # Bring position_ids to the 3-axis multimodal layout (3, batch, seq):
        # one row each for temporal/height/width positions.
        if position_ids is None:
            position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1)
        elif position_ids.ndim == 2:
            position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)

        # NOTE(review): a leading dim of 4 presumably means an extra text-position row was
        # prepended upstream; it is dropped here and row 1 is used for mask creation —
        # confirm this matches the caller's convention.
        if position_ids.ndim == 3 and position_ids.shape[0] == 4:
            position_ids = position_ids[1:]
            t_position_ids = position_ids[1]
        else:
            t_position_ids = position_ids[0]


        # If the caller did not pass a pre-built per-layer-type mask dict, build one.
        if not isinstance(causal_mask_mapping := attention_mask, dict):
            mask_kwargs = {
                "config": self.config,
                "input_embeds": inputs_embeds,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                # Packed-sequence detection uses the (2D) temporal positions only.
                "position_ids": t_position_ids,
            }
            causal_mask_mapping = {
                "full_attention": create_causal_mask(**mask_kwargs),
            }
            if self.has_sliding_layers:
                causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(
                    **mask_kwargs)

        hidden_states = inputs_embeds
        all_hidden_states = ()

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            # Hidden states are collected *before* each layer (plus once after the
            # final norm below) for transformers-version compatibility.
            all_hidden_states += (hidden_states,)
            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
                position_ids=position_ids,
                past_key_values=past_key_values,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )
            # Some transformers versions return a tuple from decoder layers.
            if isinstance(hidden_states, tuple):
                hidden_states = hidden_states[0]

        hidden_states = self.norm(hidden_states)
        all_hidden_states += (hidden_states,)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,  # for transformers library version compatability
            past_key_values=past_key_values if use_cache else None,
        )
515
+
516
+
517
+
518
class LLaVABaselineConfig(Qwen2Config):
    """Configuration for the LLaVA-style baseline that pairs a Qwen2.5-VL vision
    tower (`vit_path`) with a Qwen3 language model (`llm_path`)."""

    model_type = "llava_baseline"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(self,
                 vit_path='Qwen/Qwen2.5-VL-3B-Instruct',
                 llm_path='Qwen/Qwen3-4B',
                 **kwargs):
        # Checkpoint locations used by the model classes below to build the
        # vision tower and the language model.
        self.vit_path = vit_path
        self.llm_path = llm_path
        super().__init__(**kwargs)

        # Deserialized checkpoints may leave `text_config` / `vision_config`
        # behind as plain dicts; drop them so GenerationConfig never tries to
        # call `.to_dict()` on a dict.
        for stale_attr in ('text_config', 'vision_config'):
            if isinstance(getattr(self, stale_attr, None), dict):
                delattr(self, stale_attr)
536
+
537
+
538
class LLaVABaselinePreTrainedModel(Qwen2PreTrainedModel):
    # Base class binding the custom config so `from_pretrained` / auto classes
    # resolve `LLaVABaselineConfig` for every model subclass below.
    config_class = LLaVABaselineConfig
540
+
541
+
542
class LLaVABaselineModel(LLaVABaselinePreTrainedModel):
    """Backbone: a Qwen2.5-VL model whose language model is replaced by the
    multimodal-RoPE Qwen3 stack, with the vision patch merger resized to the
    LLM's hidden size."""

    def __init__(self, config: LLaVABaselineConfig):
        super().__init__(config)
        # Weights are materialized from the configs only (random init); the
        # commented lines show the original `from_pretrained` loading path.
        # self.vlm = Qwen2_5_VLModel.from_pretrained(
        #     config.vit_path, low_cpu_mem_usage=True)
        # self.vlm.language_model = Qwen3Model.from_pretrained(config.llm_path)
        vlm_config = AutoConfig.from_pretrained(config.vit_path)
        language_config = AutoConfig.from_pretrained(config.llm_path)
        self.vlm = Qwen2_5_VLModel(vlm_config)
        self.vlm.language_model = Qwen3Model(language_config)
        # Swap in the multimodal rotary embedding expected by the patched attention.
        self.vlm.language_model.rotary_emb = qwen25.Qwen2_5_VLRotaryEmbedding(
            config=config)

        # Set rope_scaling for each attention layer
        # (the patched Qwen3Attention reads `rope_scaling["mrope_section"]` from it).
        for layer in self.vlm.language_model.layers:
            layer.self_attn.rope_scaling = self.vlm.config.rope_scaling

        # Adapt patch merger MLP output dimension to match LLM hidden size
        # NOTE(review): assumes `merger.mlp` is a 3-element Sequential
        # (Linear, GELU, Linear) as in Qwen2.5-VL — confirm if transformers changes it.
        llm_hidden_size = self.vlm.language_model.config.hidden_size
        patch_merger = self.vlm.visual.merger
        mlp_input_dim = patch_merger.hidden_size
        original_output_dim = patch_merger.mlp[2].out_features
        if original_output_dim != llm_hidden_size:
            new_mlp = nn.Sequential(
                nn.Linear(mlp_input_dim, mlp_input_dim),
                nn.GELU(),
                nn.Linear(mlp_input_dim, llm_hidden_size)
            )
            patch_merger.mlp = new_mlp

        # Type hint only; assigned by the parent constructor.
        self.config: LLaVABaselineConfig

    def forward(self, *args, **kwargs):
        # Pure delegation to the wrapped Qwen2.5-VL model.
        return self.vlm.forward(*args, **kwargs)
576
+
577
+
578
class LLaVABaselineModelForConditionalGeneration(LLaVABaselinePreTrainedModel, GenerationMixin):
    """Causal-LM head on top of `LLaVABaselineModel` (Qwen2.5-VL vision tower +
    Qwen3 language model), usable with `generate()` via `GenerationMixin`."""

    def __init__(self, config: LLaVABaselineConfig):
        super().__init__(config)
        self.model = LLaVABaselineModel(config)
        # LM head sized from the *language model's* config, which may differ
        # from the top-level config's vocab/hidden sizes.
        self.lm_head = nn.Linear(self.model.vlm.language_model.config.hidden_size,
                                 self.model.vlm.language_model.config.vocab_size, bias=False)

        self.post_init()

    def tie_weights(self):
        """
        Tie the weights between the input embeddings and the output embeddings.
        """
        # Tying is governed by the inner language model's text config, not the wrapper config.
        if getattr(self.model.vlm.language_model.config.get_text_config(decoder=True), "tie_word_embeddings", True):
            output_embeddings = self.get_output_embeddings()
            if output_embeddings is not None:
                self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())

    def get_input_embeddings(self):
        # Token embeddings live inside the wrapped VLM.
        return self.model.vlm.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.vlm.set_input_embeddings(value)

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @property
    def language_model(self):
        # Convenience accessor mirroring transformers' multimodal model API.
        return self.model.vlm.language_model

    @property
    def visual(self):
        # Convenience accessor for the vision tower.
        return self.model.vlm.visual

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        pixel_values: Optional[torch.Tensor] = None,
        pixel_values_videos: Optional[torch.FloatTensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        rope_deltas: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        second_per_grid_ts: Optional[torch.Tensor] = None,
        **kwargs
    ) -> Union[Tuple, qwen25.Qwen2_5_VLCausalLMOutputWithPast]:
        """Run the VLM backbone, project to logits, and optionally compute the LM loss.

        Returns a `Qwen2_5_VLCausalLMOutputWithPast` (or a plain tuple when
        `return_dict=False`); `loss` is populated only when `labels` is given.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            pixel_values_videos=pixel_values_videos,
            image_grid_thw=image_grid_thw,
            video_grid_thw=video_grid_thw,
            second_per_grid_ts=second_per_grid_ts,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # NOTE(review): uses the wrapper config's vocab_size, while lm_head is sized
            # from the language model's config — confirm the two agree.
            loss = self.loss_function(
                logits=logits, labels=labels, vocab_size=self.config.vocab_size)

        # Fix: the previous debug lines (`rank`, `num_items`, `loss_sum`) ran
        # unconditionally, so `(labels != -100).sum()` raised AttributeError on every
        # generation step (labels is None), and `loss.item()` forced a device sync.
        # They were unused; removed.

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return qwen25.Qwen2_5_VLCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            rope_deltas=outputs.rope_deltas,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        pixel_values=None,
        pixel_values_videos=None,
        image_grid_thw=None,
        video_grid_thw=None,
        second_per_grid_ts=None,
        **kwargs,
    ):
        # NOTE(review): the named arguments captured above (past_key_values,
        # attention_mask, pixel_values, ...) are *not* forwarded — only `input_ids`
        # and the residual **kwargs reach the inner VLM. Confirm this is intentional.
        return self.model.vlm.prepare_inputs_for_generation(input_ids, **kwargs)
709
+
710
+
711
# Public symbols re-exported when this module is loaded as remote code.
__all__ = ["LLaVABaselineModelForConditionalGeneration", "LLaVABaselineConfig"]
preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "min_pixels": 3136,
3
+ "max_pixels": 12845056,
4
+ "patch_size": 14,
5
+ "temporal_patch_size": 2,
6
+ "merge_size": 2,
7
+ "image_mean": [
8
+ 0.48145466,
9
+ 0.4578275,
10
+ 0.40821073
11
+ ],
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "image_processor_type": "Qwen2VLImageProcessor",
18
+ "processor_class": "Qwen2_5_VLProcessor"
19
+ }
pytorch_model-00001-of-00005.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2cb4b5a69b8c7ab2a446595d62dfd92b6c7142dad8c0f000b39f3f03fb3e0ede
3
+ size 4992065246
pytorch_model-00002-of-00005.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ae793f0f200ffa51af4d2bf12a38fe6ebbd55d967e7d0cfe86b0dc9404e4cf8
3
+ size 4936657797
pytorch_model-00003-of-00005.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f889c92122d11edb6663c5ed2fab30cacca751c85ec03e20484801cd8badf08
3
+ size 4944341612
pytorch_model-00004-of-00005.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e7f0197f41c99a829818cb095760bfe58a8ea4ad4d0e2daec3c9e8725eb495e
3
+ size 4944341564
pytorch_model-00005-of-00005.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e09c1732837392a24dedba2ec50efa725f4a2a1be71ce8f7ed16aab38d3a4bfe
3
+ size 1325524675
pytorch_model.bin.index.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78dd0349b2245dd29033a8de5342142774df4631a4255d552ccf58b52c2da943
3
+ size 11446736
tokenizer_config.json ADDED
@@ -0,0 +1,1379 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "151669": {
214
+ "content": "<im_msg-0>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "151670": {
222
+ "content": "<im_msg-1>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "151671": {
230
+ "content": "<im_msg-2>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "151672": {
238
+ "content": "<im_msg-3>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "151673": {
246
+ "content": "<im_msg-4>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "151674": {
254
+ "content": "<im_msg-5>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "151675": {
262
+ "content": "<im_msg-6>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": true
268
+ },
269
+ "151676": {
270
+ "content": "<im_msg-7>",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": true
276
+ },
277
+ "151677": {
278
+ "content": "<im_msg-8>",
279
+ "lstrip": false,
280
+ "normalized": false,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": true
284
+ },
285
+ "151678": {
286
+ "content": "<im_msg-9>",
287
+ "lstrip": false,
288
+ "normalized": false,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": true
292
+ },
293
+ "151679": {
294
+ "content": "<im_msg-10>",
295
+ "lstrip": false,
296
+ "normalized": false,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": true
300
+ },
301
+ "151680": {
302
+ "content": "<im_msg-11>",
303
+ "lstrip": false,
304
+ "normalized": false,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": true
308
+ },
309
+ "151681": {
310
+ "content": "<im_msg-12>",
311
+ "lstrip": false,
312
+ "normalized": false,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": true
316
+ },
317
+ "151682": {
318
+ "content": "<im_msg-13>",
319
+ "lstrip": false,
320
+ "normalized": false,
321
+ "rstrip": false,
322
+ "single_word": false,
323
+ "special": true
324
+ },
325
+ "151683": {
326
+ "content": "<im_msg-14>",
327
+ "lstrip": false,
328
+ "normalized": false,
329
+ "rstrip": false,
330
+ "single_word": false,
331
+ "special": true
332
+ },
333
+ "151684": {
334
+ "content": "<im_msg-15>",
335
+ "lstrip": false,
336
+ "normalized": false,
337
+ "rstrip": false,
338
+ "single_word": false,
339
+ "special": true
340
+ },
341
+ "151685": {
342
+ "content": "<im_msg-16>",
343
+ "lstrip": false,
344
+ "normalized": false,
345
+ "rstrip": false,
346
+ "single_word": false,
347
+ "special": true
348
+ },
349
+ "151686": {
350
+ "content": "<im_msg-17>",
351
+ "lstrip": false,
352
+ "normalized": false,
353
+ "rstrip": false,
354
+ "single_word": false,
355
+ "special": true
356
+ },
357
+ "151687": {
358
+ "content": "<im_msg-18>",
359
+ "lstrip": false,
360
+ "normalized": false,
361
+ "rstrip": false,
362
+ "single_word": false,
363
+ "special": true
364
+ },
365
+ "151688": {
366
+ "content": "<im_msg-19>",
367
+ "lstrip": false,
368
+ "normalized": false,
369
+ "rstrip": false,
370
+ "single_word": false,
371
+ "special": true
372
+ },
373
+ "151689": {
374
+ "content": "<im_msg-20>",
375
+ "lstrip": false,
376
+ "normalized": false,
377
+ "rstrip": false,
378
+ "single_word": false,
379
+ "special": true
380
+ },
381
+ "151690": {
382
+ "content": "<im_msg-21>",
383
+ "lstrip": false,
384
+ "normalized": false,
385
+ "rstrip": false,
386
+ "single_word": false,
387
+ "special": true
388
+ },
389
+ "151691": {
390
+ "content": "<im_msg-22>",
391
+ "lstrip": false,
392
+ "normalized": false,
393
+ "rstrip": false,
394
+ "single_word": false,
395
+ "special": true
396
+ },
397
+ "151692": {
398
+ "content": "<im_msg-23>",
399
+ "lstrip": false,
400
+ "normalized": false,
401
+ "rstrip": false,
402
+ "single_word": false,
403
+ "special": true
404
+ },
405
+ "151693": {
406
+ "content": "<im_msg-24>",
407
+ "lstrip": false,
408
+ "normalized": false,
409
+ "rstrip": false,
410
+ "single_word": false,
411
+ "special": true
412
+ },
413
+ "151694": {
414
+ "content": "<im_msg-25>",
415
+ "lstrip": false,
416
+ "normalized": false,
417
+ "rstrip": false,
418
+ "single_word": false,
419
+ "special": true
420
+ },
421
+ "151695": {
422
+ "content": "<im_msg-26>",
423
+ "lstrip": false,
424
+ "normalized": false,
425
+ "rstrip": false,
426
+ "single_word": false,
427
+ "special": true
428
+ },
429
+ "151696": {
430
+ "content": "<im_msg-27>",
431
+ "lstrip": false,
432
+ "normalized": false,
433
+ "rstrip": false,
434
+ "single_word": false,
435
+ "special": true
436
+ },
437
+ "151697": {
438
+ "content": "<im_msg-28>",
439
+ "lstrip": false,
440
+ "normalized": false,
441
+ "rstrip": false,
442
+ "single_word": false,
443
+ "special": true
444
+ },
445
+ "151698": {
446
+ "content": "<im_msg-29>",
447
+ "lstrip": false,
448
+ "normalized": false,
449
+ "rstrip": false,
450
+ "single_word": false,
451
+ "special": true
452
+ },
453
+ "151699": {
454
+ "content": "<im_msg-30>",
455
+ "lstrip": false,
456
+ "normalized": false,
457
+ "rstrip": false,
458
+ "single_word": false,
459
+ "special": true
460
+ },
461
+ "151700": {
462
+ "content": "<im_msg-31>",
463
+ "lstrip": false,
464
+ "normalized": false,
465
+ "rstrip": false,
466
+ "single_word": false,
467
+ "special": true
468
+ },
469
+ "151701": {
470
+ "content": "<im_msg-32>",
471
+ "lstrip": false,
472
+ "normalized": false,
473
+ "rstrip": false,
474
+ "single_word": false,
475
+ "special": true
476
+ },
477
+ "151702": {
478
+ "content": "<im_msg-33>",
479
+ "lstrip": false,
480
+ "normalized": false,
481
+ "rstrip": false,
482
+ "single_word": false,
483
+ "special": true
484
+ },
485
+ "151703": {
486
+ "content": "<im_msg-34>",
487
+ "lstrip": false,
488
+ "normalized": false,
489
+ "rstrip": false,
490
+ "single_word": false,
491
+ "special": true
492
+ },
493
+ "151704": {
494
+ "content": "<im_msg-35>",
495
+ "lstrip": false,
496
+ "normalized": false,
497
+ "rstrip": false,
498
+ "single_word": false,
499
+ "special": true
500
+ },
501
+ "151705": {
502
+ "content": "<im_msg-36>",
503
+ "lstrip": false,
504
+ "normalized": false,
505
+ "rstrip": false,
506
+ "single_word": false,
507
+ "special": true
508
+ },
509
+ "151706": {
510
+ "content": "<im_msg-37>",
511
+ "lstrip": false,
512
+ "normalized": false,
513
+ "rstrip": false,
514
+ "single_word": false,
515
+ "special": true
516
+ },
517
+ "151707": {
518
+ "content": "<im_msg-38>",
519
+ "lstrip": false,
520
+ "normalized": false,
521
+ "rstrip": false,
522
+ "single_word": false,
523
+ "special": true
524
+ },
525
+ "151708": {
526
+ "content": "<im_msg-39>",
527
+ "lstrip": false,
528
+ "normalized": false,
529
+ "rstrip": false,
530
+ "single_word": false,
531
+ "special": true
532
+ },
533
+ "151709": {
534
+ "content": "<im_msg-40>",
535
+ "lstrip": false,
536
+ "normalized": false,
537
+ "rstrip": false,
538
+ "single_word": false,
539
+ "special": true
540
+ },
541
+ "151710": {
542
+ "content": "<im_msg-41>",
543
+ "lstrip": false,
544
+ "normalized": false,
545
+ "rstrip": false,
546
+ "single_word": false,
547
+ "special": true
548
+ },
549
+ "151711": {
550
+ "content": "<im_msg-42>",
551
+ "lstrip": false,
552
+ "normalized": false,
553
+ "rstrip": false,
554
+ "single_word": false,
555
+ "special": true
556
+ },
557
+ "151712": {
558
+ "content": "<im_msg-43>",
559
+ "lstrip": false,
560
+ "normalized": false,
561
+ "rstrip": false,
562
+ "single_word": false,
563
+ "special": true
564
+ },
565
+ "151713": {
566
+ "content": "<im_msg-44>",
567
+ "lstrip": false,
568
+ "normalized": false,
569
+ "rstrip": false,
570
+ "single_word": false,
571
+ "special": true
572
+ },
573
+ "151714": {
574
+ "content": "<im_msg-45>",
575
+ "lstrip": false,
576
+ "normalized": false,
577
+ "rstrip": false,
578
+ "single_word": false,
579
+ "special": true
580
+ },
581
+ "151715": {
582
+ "content": "<im_msg-46>",
583
+ "lstrip": false,
584
+ "normalized": false,
585
+ "rstrip": false,
586
+ "single_word": false,
587
+ "special": true
588
+ },
589
+ "151716": {
590
+ "content": "<im_msg-47>",
591
+ "lstrip": false,
592
+ "normalized": false,
593
+ "rstrip": false,
594
+ "single_word": false,
595
+ "special": true
596
+ },
597
+ "151717": {
598
+ "content": "<im_msg-48>",
599
+ "lstrip": false,
600
+ "normalized": false,
601
+ "rstrip": false,
602
+ "single_word": false,
603
+ "special": true
604
+ },
605
+ "151718": {
606
+ "content": "<im_msg-49>",
607
+ "lstrip": false,
608
+ "normalized": false,
609
+ "rstrip": false,
610
+ "single_word": false,
611
+ "special": true
612
+ },
613
+ "151719": {
614
+ "content": "<im_msg-50>",
615
+ "lstrip": false,
616
+ "normalized": false,
617
+ "rstrip": false,
618
+ "single_word": false,
619
+ "special": true
620
+ },
621
+ "151720": {
622
+ "content": "<im_msg-51>",
623
+ "lstrip": false,
624
+ "normalized": false,
625
+ "rstrip": false,
626
+ "single_word": false,
627
+ "special": true
628
+ },
629
+ "151721": {
630
+ "content": "<im_msg-52>",
631
+ "lstrip": false,
632
+ "normalized": false,
633
+ "rstrip": false,
634
+ "single_word": false,
635
+ "special": true
636
+ },
637
+ "151722": {
638
+ "content": "<im_msg-53>",
639
+ "lstrip": false,
640
+ "normalized": false,
641
+ "rstrip": false,
642
+ "single_word": false,
643
+ "special": true
644
+ },
645
+ "151723": {
646
+ "content": "<im_msg-54>",
647
+ "lstrip": false,
648
+ "normalized": false,
649
+ "rstrip": false,
650
+ "single_word": false,
651
+ "special": true
652
+ },
653
+ "151724": {
654
+ "content": "<im_msg-55>",
655
+ "lstrip": false,
656
+ "normalized": false,
657
+ "rstrip": false,
658
+ "single_word": false,
659
+ "special": true
660
+ },
661
+ "151725": {
662
+ "content": "<im_msg-56>",
663
+ "lstrip": false,
664
+ "normalized": false,
665
+ "rstrip": false,
666
+ "single_word": false,
667
+ "special": true
668
+ },
669
+ "151726": {
670
+ "content": "<im_msg-57>",
671
+ "lstrip": false,
672
+ "normalized": false,
673
+ "rstrip": false,
674
+ "single_word": false,
675
+ "special": true
676
+ },
677
+ "151727": {
678
+ "content": "<im_msg-58>",
679
+ "lstrip": false,
680
+ "normalized": false,
681
+ "rstrip": false,
682
+ "single_word": false,
683
+ "special": true
684
+ },
685
+ "151728": {
686
+ "content": "<im_msg-59>",
687
+ "lstrip": false,
688
+ "normalized": false,
689
+ "rstrip": false,
690
+ "single_word": false,
691
+ "special": true
692
+ },
693
+ "151729": {
694
+ "content": "<im_msg-60>",
695
+ "lstrip": false,
696
+ "normalized": false,
697
+ "rstrip": false,
698
+ "single_word": false,
699
+ "special": true
700
+ },
701
+ "151730": {
702
+ "content": "<im_msg-61>",
703
+ "lstrip": false,
704
+ "normalized": false,
705
+ "rstrip": false,
706
+ "single_word": false,
707
+ "special": true
708
+ },
709
+ "151731": {
710
+ "content": "<im_msg-62>",
711
+ "lstrip": false,
712
+ "normalized": false,
713
+ "rstrip": false,
714
+ "single_word": false,
715
+ "special": true
716
+ },
717
+ "151732": {
718
+ "content": "<im_msg-63>",
719
+ "lstrip": false,
720
+ "normalized": false,
721
+ "rstrip": false,
722
+ "single_word": false,
723
+ "special": true
724
+ },
725
+ "151733": {
726
+ "content": "<im_msg-64>",
727
+ "lstrip": false,
728
+ "normalized": false,
729
+ "rstrip": false,
730
+ "single_word": false,
731
+ "special": true
732
+ },
733
+ "151734": {
734
+ "content": "<im_msg-65>",
735
+ "lstrip": false,
736
+ "normalized": false,
737
+ "rstrip": false,
738
+ "single_word": false,
739
+ "special": true
740
+ },
741
+ "151735": {
742
+ "content": "<im_msg-66>",
743
+ "lstrip": false,
744
+ "normalized": false,
745
+ "rstrip": false,
746
+ "single_word": false,
747
+ "special": true
748
+ },
749
+ "151736": {
750
+ "content": "<im_msg-67>",
751
+ "lstrip": false,
752
+ "normalized": false,
753
+ "rstrip": false,
754
+ "single_word": false,
755
+ "special": true
756
+ },
757
+ "151737": {
758
+ "content": "<im_msg-68>",
759
+ "lstrip": false,
760
+ "normalized": false,
761
+ "rstrip": false,
762
+ "single_word": false,
763
+ "special": true
764
+ },
765
+ "151738": {
766
+ "content": "<im_msg-69>",
767
+ "lstrip": false,
768
+ "normalized": false,
769
+ "rstrip": false,
770
+ "single_word": false,
771
+ "special": true
772
+ },
773
+ "151739": {
774
+ "content": "<im_msg-70>",
775
+ "lstrip": false,
776
+ "normalized": false,
777
+ "rstrip": false,
778
+ "single_word": false,
779
+ "special": true
780
+ },
781
+ "151740": {
782
+ "content": "<im_msg-71>",
783
+ "lstrip": false,
784
+ "normalized": false,
785
+ "rstrip": false,
786
+ "single_word": false,
787
+ "special": true
788
+ },
789
+ "151741": {
790
+ "content": "<im_msg-72>",
791
+ "lstrip": false,
792
+ "normalized": false,
793
+ "rstrip": false,
794
+ "single_word": false,
795
+ "special": true
796
+ },
797
+ "151742": {
798
+ "content": "<im_msg-73>",
799
+ "lstrip": false,
800
+ "normalized": false,
801
+ "rstrip": false,
802
+ "single_word": false,
803
+ "special": true
804
+ },
805
+ "151743": {
806
+ "content": "<im_msg-74>",
807
+ "lstrip": false,
808
+ "normalized": false,
809
+ "rstrip": false,
810
+ "single_word": false,
811
+ "special": true
812
+ },
813
+ "151744": {
814
+ "content": "<im_msg-75>",
815
+ "lstrip": false,
816
+ "normalized": false,
817
+ "rstrip": false,
818
+ "single_word": false,
819
+ "special": true
820
+ },
821
+ "151745": {
822
+ "content": "<im_msg-76>",
823
+ "lstrip": false,
824
+ "normalized": false,
825
+ "rstrip": false,
826
+ "single_word": false,
827
+ "special": true
828
+ },
829
+ "151746": {
830
+ "content": "<im_msg-77>",
831
+ "lstrip": false,
832
+ "normalized": false,
833
+ "rstrip": false,
834
+ "single_word": false,
835
+ "special": true
836
+ },
837
+ "151747": {
838
+ "content": "<im_msg-78>",
839
+ "lstrip": false,
840
+ "normalized": false,
841
+ "rstrip": false,
842
+ "single_word": false,
843
+ "special": true
844
+ },
845
+ "151748": {
846
+ "content": "<im_msg-79>",
847
+ "lstrip": false,
848
+ "normalized": false,
849
+ "rstrip": false,
850
+ "single_word": false,
851
+ "special": true
852
+ },
853
+ "151749": {
854
+ "content": "<im_msg-80>",
855
+ "lstrip": false,
856
+ "normalized": false,
857
+ "rstrip": false,
858
+ "single_word": false,
859
+ "special": true
860
+ },
861
+ "151750": {
862
+ "content": "<im_msg-81>",
863
+ "lstrip": false,
864
+ "normalized": false,
865
+ "rstrip": false,
866
+ "single_word": false,
867
+ "special": true
868
+ },
869
+ "151751": {
870
+ "content": "<im_msg-82>",
871
+ "lstrip": false,
872
+ "normalized": false,
873
+ "rstrip": false,
874
+ "single_word": false,
875
+ "special": true
876
+ },
877
+ "151752": {
878
+ "content": "<im_msg-83>",
879
+ "lstrip": false,
880
+ "normalized": false,
881
+ "rstrip": false,
882
+ "single_word": false,
883
+ "special": true
884
+ },
885
+ "151753": {
886
+ "content": "<im_msg-84>",
887
+ "lstrip": false,
888
+ "normalized": false,
889
+ "rstrip": false,
890
+ "single_word": false,
891
+ "special": true
892
+ },
893
+ "151754": {
894
+ "content": "<im_msg-85>",
895
+ "lstrip": false,
896
+ "normalized": false,
897
+ "rstrip": false,
898
+ "single_word": false,
899
+ "special": true
900
+ },
901
+ "151755": {
902
+ "content": "<im_msg-86>",
903
+ "lstrip": false,
904
+ "normalized": false,
905
+ "rstrip": false,
906
+ "single_word": false,
907
+ "special": true
908
+ },
909
+ "151756": {
910
+ "content": "<im_msg-87>",
911
+ "lstrip": false,
912
+ "normalized": false,
913
+ "rstrip": false,
914
+ "single_word": false,
915
+ "special": true
916
+ },
917
+ "151757": {
918
+ "content": "<im_msg-88>",
919
+ "lstrip": false,
920
+ "normalized": false,
921
+ "rstrip": false,
922
+ "single_word": false,
923
+ "special": true
924
+ },
925
+ "151758": {
926
+ "content": "<im_msg-89>",
927
+ "lstrip": false,
928
+ "normalized": false,
929
+ "rstrip": false,
930
+ "single_word": false,
931
+ "special": true
932
+ },
933
+ "151759": {
934
+ "content": "<im_msg-90>",
935
+ "lstrip": false,
936
+ "normalized": false,
937
+ "rstrip": false,
938
+ "single_word": false,
939
+ "special": true
940
+ },
941
+ "151760": {
942
+ "content": "<im_msg-91>",
943
+ "lstrip": false,
944
+ "normalized": false,
945
+ "rstrip": false,
946
+ "single_word": false,
947
+ "special": true
948
+ },
949
+ "151761": {
950
+ "content": "<im_msg-92>",
951
+ "lstrip": false,
952
+ "normalized": false,
953
+ "rstrip": false,
954
+ "single_word": false,
955
+ "special": true
956
+ },
957
+ "151762": {
958
+ "content": "<im_msg-93>",
959
+ "lstrip": false,
960
+ "normalized": false,
961
+ "rstrip": false,
962
+ "single_word": false,
963
+ "special": true
964
+ },
965
+ "151763": {
966
+ "content": "<im_msg-94>",
967
+ "lstrip": false,
968
+ "normalized": false,
969
+ "rstrip": false,
970
+ "single_word": false,
971
+ "special": true
972
+ },
973
+ "151764": {
974
+ "content": "<im_msg-95>",
975
+ "lstrip": false,
976
+ "normalized": false,
977
+ "rstrip": false,
978
+ "single_word": false,
979
+ "special": true
980
+ },
981
+ "151765": {
982
+ "content": "<im_msg-96>",
983
+ "lstrip": false,
984
+ "normalized": false,
985
+ "rstrip": false,
986
+ "single_word": false,
987
+ "special": true
988
+ },
989
+ "151766": {
990
+ "content": "<im_msg-97>",
991
+ "lstrip": false,
992
+ "normalized": false,
993
+ "rstrip": false,
994
+ "single_word": false,
995
+ "special": true
996
+ },
997
+ "151767": {
998
+ "content": "<im_msg-98>",
999
+ "lstrip": false,
1000
+ "normalized": false,
1001
+ "rstrip": false,
1002
+ "single_word": false,
1003
+ "special": true
1004
+ },
1005
+ "151768": {
1006
+ "content": "<im_msg-99>",
1007
+ "lstrip": false,
1008
+ "normalized": false,
1009
+ "rstrip": false,
1010
+ "single_word": false,
1011
+ "special": true
1012
+ },
1013
+ "151769": {
1014
+ "content": "<im_msg-100>",
1015
+ "lstrip": false,
1016
+ "normalized": false,
1017
+ "rstrip": false,
1018
+ "single_word": false,
1019
+ "special": true
1020
+ },
1021
+ "151770": {
1022
+ "content": "<im_msg-101>",
1023
+ "lstrip": false,
1024
+ "normalized": false,
1025
+ "rstrip": false,
1026
+ "single_word": false,
1027
+ "special": true
1028
+ },
1029
+ "151771": {
1030
+ "content": "<im_msg-102>",
1031
+ "lstrip": false,
1032
+ "normalized": false,
1033
+ "rstrip": false,
1034
+ "single_word": false,
1035
+ "special": true
1036
+ },
1037
+ "151772": {
1038
+ "content": "<im_msg-103>",
1039
+ "lstrip": false,
1040
+ "normalized": false,
1041
+ "rstrip": false,
1042
+ "single_word": false,
1043
+ "special": true
1044
+ },
1045
+ "151773": {
1046
+ "content": "<im_msg-104>",
1047
+ "lstrip": false,
1048
+ "normalized": false,
1049
+ "rstrip": false,
1050
+ "single_word": false,
1051
+ "special": true
1052
+ },
1053
+ "151774": {
1054
+ "content": "<im_msg-105>",
1055
+ "lstrip": false,
1056
+ "normalized": false,
1057
+ "rstrip": false,
1058
+ "single_word": false,
1059
+ "special": true
1060
+ },
1061
+ "151775": {
1062
+ "content": "<im_msg-106>",
1063
+ "lstrip": false,
1064
+ "normalized": false,
1065
+ "rstrip": false,
1066
+ "single_word": false,
1067
+ "special": true
1068
+ },
1069
+ "151776": {
1070
+ "content": "<im_msg-107>",
1071
+ "lstrip": false,
1072
+ "normalized": false,
1073
+ "rstrip": false,
1074
+ "single_word": false,
1075
+ "special": true
1076
+ },
1077
+ "151777": {
1078
+ "content": "<im_msg-108>",
1079
+ "lstrip": false,
1080
+ "normalized": false,
1081
+ "rstrip": false,
1082
+ "single_word": false,
1083
+ "special": true
1084
+ },
1085
+ "151778": {
1086
+ "content": "<im_msg-109>",
1087
+ "lstrip": false,
1088
+ "normalized": false,
1089
+ "rstrip": false,
1090
+ "single_word": false,
1091
+ "special": true
1092
+ },
1093
+ "151779": {
1094
+ "content": "<im_msg-110>",
1095
+ "lstrip": false,
1096
+ "normalized": false,
1097
+ "rstrip": false,
1098
+ "single_word": false,
1099
+ "special": true
1100
+ },
1101
+ "151780": {
1102
+ "content": "<im_msg-111>",
1103
+ "lstrip": false,
1104
+ "normalized": false,
1105
+ "rstrip": false,
1106
+ "single_word": false,
1107
+ "special": true
1108
+ },
1109
+ "151781": {
1110
+ "content": "<im_msg-112>",
1111
+ "lstrip": false,
1112
+ "normalized": false,
1113
+ "rstrip": false,
1114
+ "single_word": false,
1115
+ "special": true
1116
+ },
1117
+ "151782": {
1118
+ "content": "<im_msg-113>",
1119
+ "lstrip": false,
1120
+ "normalized": false,
1121
+ "rstrip": false,
1122
+ "single_word": false,
1123
+ "special": true
1124
+ },
1125
+ "151783": {
1126
+ "content": "<im_msg-114>",
1127
+ "lstrip": false,
1128
+ "normalized": false,
1129
+ "rstrip": false,
1130
+ "single_word": false,
1131
+ "special": true
1132
+ },
1133
+ "151784": {
1134
+ "content": "<im_msg-115>",
1135
+ "lstrip": false,
1136
+ "normalized": false,
1137
+ "rstrip": false,
1138
+ "single_word": false,
1139
+ "special": true
1140
+ },
1141
+ "151785": {
1142
+ "content": "<im_msg-116>",
1143
+ "lstrip": false,
1144
+ "normalized": false,
1145
+ "rstrip": false,
1146
+ "single_word": false,
1147
+ "special": true
1148
+ },
1149
+ "151786": {
1150
+ "content": "<im_msg-117>",
1151
+ "lstrip": false,
1152
+ "normalized": false,
1153
+ "rstrip": false,
1154
+ "single_word": false,
1155
+ "special": true
1156
+ },
1157
+ "151787": {
1158
+ "content": "<im_msg-118>",
1159
+ "lstrip": false,
1160
+ "normalized": false,
1161
+ "rstrip": false,
1162
+ "single_word": false,
1163
+ "special": true
1164
+ },
1165
+ "151788": {
1166
+ "content": "<im_msg-119>",
1167
+ "lstrip": false,
1168
+ "normalized": false,
1169
+ "rstrip": false,
1170
+ "single_word": false,
1171
+ "special": true
1172
+ },
1173
+ "151789": {
1174
+ "content": "<im_msg-120>",
1175
+ "lstrip": false,
1176
+ "normalized": false,
1177
+ "rstrip": false,
1178
+ "single_word": false,
1179
+ "special": true
1180
+ },
1181
+ "151790": {
1182
+ "content": "<im_msg-121>",
1183
+ "lstrip": false,
1184
+ "normalized": false,
1185
+ "rstrip": false,
1186
+ "single_word": false,
1187
+ "special": true
1188
+ },
1189
+ "151791": {
1190
+ "content": "<im_msg-122>",
1191
+ "lstrip": false,
1192
+ "normalized": false,
1193
+ "rstrip": false,
1194
+ "single_word": false,
1195
+ "special": true
1196
+ },
1197
+ "151792": {
1198
+ "content": "<im_msg-123>",
1199
+ "lstrip": false,
1200
+ "normalized": false,
1201
+ "rstrip": false,
1202
+ "single_word": false,
1203
+ "special": true
1204
+ },
1205
+ "151793": {
1206
+ "content": "<im_msg-124>",
1207
+ "lstrip": false,
1208
+ "normalized": false,
1209
+ "rstrip": false,
1210
+ "single_word": false,
1211
+ "special": true
1212
+ },
1213
+ "151794": {
1214
+ "content": "<im_msg-125>",
1215
+ "lstrip": false,
1216
+ "normalized": false,
1217
+ "rstrip": false,
1218
+ "single_word": false,
1219
+ "special": true
1220
+ },
1221
+ "151795": {
1222
+ "content": "<im_msg-126>",
1223
+ "lstrip": false,
1224
+ "normalized": false,
1225
+ "rstrip": false,
1226
+ "single_word": false,
1227
+ "special": true
1228
+ },
1229
+ "151796": {
1230
+ "content": "<im_msg-127>",
1231
+ "lstrip": false,
1232
+ "normalized": false,
1233
+ "rstrip": false,
1234
+ "single_word": false,
1235
+ "special": true
1236
+ }
1237
+ },
1238
+ "additional_special_tokens": [
1239
+ "<im_msg-0>",
1240
+ "<im_msg-1>",
1241
+ "<im_msg-2>",
1242
+ "<im_msg-3>",
1243
+ "<im_msg-4>",
1244
+ "<im_msg-5>",
1245
+ "<im_msg-6>",
1246
+ "<im_msg-7>",
1247
+ "<im_msg-8>",
1248
+ "<im_msg-9>",
1249
+ "<im_msg-10>",
1250
+ "<im_msg-11>",
1251
+ "<im_msg-12>",
1252
+ "<im_msg-13>",
1253
+ "<im_msg-14>",
1254
+ "<im_msg-15>",
1255
+ "<im_msg-16>",
1256
+ "<im_msg-17>",
1257
+ "<im_msg-18>",
1258
+ "<im_msg-19>",
1259
+ "<im_msg-20>",
1260
+ "<im_msg-21>",
1261
+ "<im_msg-22>",
1262
+ "<im_msg-23>",
1263
+ "<im_msg-24>",
1264
+ "<im_msg-25>",
1265
+ "<im_msg-26>",
1266
+ "<im_msg-27>",
1267
+ "<im_msg-28>",
1268
+ "<im_msg-29>",
1269
+ "<im_msg-30>",
1270
+ "<im_msg-31>",
1271
+ "<im_msg-32>",
1272
+ "<im_msg-33>",
1273
+ "<im_msg-34>",
1274
+ "<im_msg-35>",
1275
+ "<im_msg-36>",
1276
+ "<im_msg-37>",
1277
+ "<im_msg-38>",
1278
+ "<im_msg-39>",
1279
+ "<im_msg-40>",
1280
+ "<im_msg-41>",
1281
+ "<im_msg-42>",
1282
+ "<im_msg-43>",
1283
+ "<im_msg-44>",
1284
+ "<im_msg-45>",
1285
+ "<im_msg-46>",
1286
+ "<im_msg-47>",
1287
+ "<im_msg-48>",
1288
+ "<im_msg-49>",
1289
+ "<im_msg-50>",
1290
+ "<im_msg-51>",
1291
+ "<im_msg-52>",
1292
+ "<im_msg-53>",
1293
+ "<im_msg-54>",
1294
+ "<im_msg-55>",
1295
+ "<im_msg-56>",
1296
+ "<im_msg-57>",
1297
+ "<im_msg-58>",
1298
+ "<im_msg-59>",
1299
+ "<im_msg-60>",
1300
+ "<im_msg-61>",
1301
+ "<im_msg-62>",
1302
+ "<im_msg-63>",
1303
+ "<im_msg-64>",
1304
+ "<im_msg-65>",
1305
+ "<im_msg-66>",
1306
+ "<im_msg-67>",
1307
+ "<im_msg-68>",
1308
+ "<im_msg-69>",
1309
+ "<im_msg-70>",
1310
+ "<im_msg-71>",
1311
+ "<im_msg-72>",
1312
+ "<im_msg-73>",
1313
+ "<im_msg-74>",
1314
+ "<im_msg-75>",
1315
+ "<im_msg-76>",
1316
+ "<im_msg-77>",
1317
+ "<im_msg-78>",
1318
+ "<im_msg-79>",
1319
+ "<im_msg-80>",
1320
+ "<im_msg-81>",
1321
+ "<im_msg-82>",
1322
+ "<im_msg-83>",
1323
+ "<im_msg-84>",
1324
+ "<im_msg-85>",
1325
+ "<im_msg-86>",
1326
+ "<im_msg-87>",
1327
+ "<im_msg-88>",
1328
+ "<im_msg-89>",
1329
+ "<im_msg-90>",
1330
+ "<im_msg-91>",
1331
+ "<im_msg-92>",
1332
+ "<im_msg-93>",
1333
+ "<im_msg-94>",
1334
+ "<im_msg-95>",
1335
+ "<im_msg-96>",
1336
+ "<im_msg-97>",
1337
+ "<im_msg-98>",
1338
+ "<im_msg-99>",
1339
+ "<im_msg-100>",
1340
+ "<im_msg-101>",
1341
+ "<im_msg-102>",
1342
+ "<im_msg-103>",
1343
+ "<im_msg-104>",
1344
+ "<im_msg-105>",
1345
+ "<im_msg-106>",
1346
+ "<im_msg-107>",
1347
+ "<im_msg-108>",
1348
+ "<im_msg-109>",
1349
+ "<im_msg-110>",
1350
+ "<im_msg-111>",
1351
+ "<im_msg-112>",
1352
+ "<im_msg-113>",
1353
+ "<im_msg-114>",
1354
+ "<im_msg-115>",
1355
+ "<im_msg-116>",
1356
+ "<im_msg-117>",
1357
+ "<im_msg-118>",
1358
+ "<im_msg-119>",
1359
+ "<im_msg-120>",
1360
+ "<im_msg-121>",
1361
+ "<im_msg-122>",
1362
+ "<im_msg-123>",
1363
+ "<im_msg-124>",
1364
+ "<im_msg-125>",
1365
+ "<im_msg-126>",
1366
+ "<im_msg-127>"
1367
+ ],
1368
+ "bos_token": null,
1369
+ "clean_up_tokenization_spaces": false,
1370
+ "eos_token": "<|im_end|>",
1371
+ "errors": "replace",
1372
+ "extra_special_tokens": {},
1373
+ "model_max_length": 131072,
1374
+ "pad_token": "<|endoftext|>",
1375
+ "padding_side": "left",
1376
+ "split_special_tokens": false,
1377
+ "tokenizer_class": "Qwen2Tokenizer",
1378
+ "unk_token": null
1379
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff