p1k0 committed on
Commit 63fb832 · verified · 1 Parent(s): 3d37146

Add files using upload-large-folder tool

Files changed (40)
  1. grpo/qwen2.5vl-7b-thinking_v2_full_comet_grpo/v13-20250907-200700/checkpoint-750/model-00002-of-00004.safetensors +3 -0
  2. internvl3-8b-instruct-lora_epoch10_5e-6/added_tokens.json +33 -0
  3. internvl3-8b-instruct-lora_epoch10_5e-6/args.json +371 -0
  4. internvl3-8b-instruct-lora_epoch10_5e-6/config.json +226 -0
  5. internvl3-8b-instruct-lora_epoch10_5e-6/configuration_intern_vit.py +120 -0
  6. internvl3-8b-instruct-lora_epoch10_5e-6/configuration_internvl_chat.py +97 -0
  7. internvl3-8b-instruct-lora_epoch10_5e-6/conversation.py +391 -0
  8. internvl3-8b-instruct-lora_epoch10_5e-6/generation_config.json +4 -0
  9. internvl3-8b-instruct-lora_epoch10_5e-6/merges.txt +0 -0
  10. internvl3-8b-instruct-lora_epoch10_5e-6/model.safetensors.index.json +692 -0
  11. internvl3-8b-instruct-lora_epoch10_5e-6/modeling_intern_vit.py +431 -0
  12. internvl3-8b-instruct-lora_epoch10_5e-6/modeling_internvl_chat_cd.py +1198 -0
  13. internvl3-8b-instruct-lora_epoch10_5e-6/modeling_qwen2_cd.py +1950 -0
  14. internvl3-8b-instruct-lora_epoch10_5e-6/preprocessor_config.json +19 -0
  15. internvl3-8b-instruct-lora_epoch10_5e-6/special_tokens_map.json +31 -0
  16. internvl3-8b-instruct-lora_epoch10_5e-6/tokenizer_config.json +281 -0
  17. internvl3-8b-instruct-lora_epoch10_5e-6/vocab.json +0 -0
  18. llava-ov-lora/preprocessor_config.json +171 -0
  19. llava-ov-lora/processor_config.json +7 -0
  20. llava-ov-lora/special_tokens_map.json +20 -0
  21. llava-ov-lora/tokenizer_config.json +65 -0
  22. llava-ov-lora/video_processor/preprocessor_config.json +25 -0
  23. llava-ov-lora/vocab.json +0 -0
  24. qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/args.json +375 -0
  25. qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/added_tokens.json +24 -0
  26. qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/args.json +375 -0
  27. qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/chat_template.json +3 -0
  28. qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/config.json +66 -0
  29. qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/generation_config.json +12 -0
  30. qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/latest +1 -0
  31. qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/merges.txt +0 -0
  32. qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/model.safetensors.index.json +736 -0
  33. qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/preprocessor_config.json +19 -0
  34. qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/special_tokens_map.json +31 -0
  35. qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/tokenizer_config.json +209 -0
  36. qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/trainer_state.json +658 -0
  37. qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/vocab.json +0 -0
  38. qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/zero_to_fp32.py +760 -0
  39. qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/logging.jsonl +65 -0
  40. qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/val_dataset.jsonl +0 -0
grpo/qwen2.5vl-7b-thinking_v2_full_comet_grpo/v13-20250907-200700/checkpoint-750/model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5a25cbce87391e4fb9c0e580eb58eea1b4a8ff611b6463d37ff87c8c5ad7e260
+ size 4991495816
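For reference, the file above is stored as a Git LFS pointer: only the spec version, sha256 oid, and byte size live in the repo. A minimal sketch (not part of the upload; file paths are placeholders) of checking a downloaded blob against such a pointer:

```python
import hashlib

def verify_lfs_pointer(pointer_path: str, blob_path: str) -> bool:
    """Check a local file against a git-lfs pointer (version / oid / size lines)."""
    fields = dict(
        line.split(' ', 1)
        for line in open(pointer_path, encoding='utf-8').read().splitlines()
        if line.strip()
    )
    expected_oid = fields['oid'].split(':', 1)[1].strip()  # "sha256:<hex>" -> "<hex>"
    expected_size = int(fields['size'])

    sha, size = hashlib.sha256(), 0
    with open(blob_path, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            sha.update(chunk)
            size += len(chunk)
    return sha.hexdigest() == expected_oid and size == expected_size
```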
internvl3-8b-instruct-lora_epoch10_5e-6/added_tokens.json ADDED
@@ -0,0 +1,33 @@
+ {
+ "</box>": 151673,
+ "</img>": 151666,
+ "</quad>": 151669,
+ "</ref>": 151671,
+ "</tool_call>": 151658,
+ "<IMG_CONTEXT>": 151667,
+ "<box>": 151672,
+ "<img>": 151665,
+ "<quad>": 151668,
+ "<ref>": 151670,
+ "<tool_call>": 151657,
+ "<|box_end|>": 151649,
+ "<|box_start|>": 151648,
+ "<|endoftext|>": 151643,
+ "<|file_sep|>": 151664,
+ "<|fim_middle|>": 151660,
+ "<|fim_pad|>": 151662,
+ "<|fim_prefix|>": 151659,
+ "<|fim_suffix|>": 151661,
+ "<|im_end|>": 151645,
+ "<|im_start|>": 151644,
+ "<|image_pad|>": 151655,
+ "<|object_ref_end|>": 151647,
+ "<|object_ref_start|>": 151646,
+ "<|quad_end|>": 151651,
+ "<|quad_start|>": 151650,
+ "<|repo_name|>": 151663,
+ "<|video_pad|>": 151656,
+ "<|vision_end|>": 151653,
+ "<|vision_pad|>": 151654,
+ "<|vision_start|>": 151652
+ }
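A short sketch (not part of the upload; assumes the folder above has been downloaded locally) showing that these extra token ids are picked up when the tokenizer is loaded:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    'internvl3-8b-instruct-lora_epoch10_5e-6',  # local path to the uploaded folder
    trust_remote_code=True,
)
# The id should match added_tokens.json above, e.g. the image placeholder token:
print(tokenizer.convert_tokens_to_ids('<IMG_CONTEXT>'))  # expected: 151667
```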
internvl3-8b-instruct-lora_epoch10_5e-6/args.json ADDED
@@ -0,0 +1,371 @@
1
+ {
2
+ "model": "/mnt/data/users/liamding/data/models/InternVL3-8B-Instruct",
3
+ "model_type": "internvl3",
4
+ "model_revision": null,
5
+ "task_type": "causal_lm",
6
+ "torch_dtype": "bfloat16",
7
+ "attn_impl": null,
8
+ "num_labels": null,
9
+ "problem_type": null,
10
+ "rope_scaling": null,
11
+ "device_map": null,
12
+ "max_memory": {},
13
+ "local_repo_path": null,
14
+ "template": "internvl2_5",
15
+ "system": null,
16
+ "max_length": 32768,
17
+ "truncation_strategy": "delete",
18
+ "max_pixels": null,
19
+ "agent_template": null,
20
+ "norm_bbox": null,
21
+ "response_prefix": null,
22
+ "padding_side": "right",
23
+ "loss_scale": "default",
24
+ "sequence_parallel_size": 1,
25
+ "use_chat_template": true,
26
+ "template_backend": "swift",
27
+ "dataset": [
28
+ "/mnt/data/users/liamding/data/3AM_Plus/final/training/ambi_normal_772.json",
29
+ "/mnt/data/users/liamding/data/3AM_Plus/final/training/mma_train_126.json",
30
+ "/mnt/data/users/liamding/data/3AM_Plus/final/training/sp_train_102.json"
31
+ ],
32
+ "val_dataset": [],
33
+ "split_dataset_ratio": 0.01,
34
+ "data_seed": 42,
35
+ "dataset_num_proc": 1,
36
+ "dataset_shuffle": true,
37
+ "val_dataset_shuffle": false,
38
+ "streaming": false,
39
+ "interleave_prob": null,
40
+ "stopping_strategy": "first_exhausted",
41
+ "shuffle_buffer_size": 1000,
42
+ "enable_cache": false,
43
+ "download_mode": "reuse_dataset_if_exists",
44
+ "columns": {},
45
+ "strict": false,
46
+ "remove_unused_columns": true,
47
+ "model_name": [
48
+ null,
49
+ null
50
+ ],
51
+ "model_author": [
52
+ null,
53
+ null
54
+ ],
55
+ "custom_dataset_info": [],
56
+ "quant_method": null,
57
+ "quant_bits": null,
58
+ "hqq_axis": null,
59
+ "bnb_4bit_compute_dtype": "bfloat16",
60
+ "bnb_4bit_quant_type": "nf4",
61
+ "bnb_4bit_use_double_quant": true,
62
+ "bnb_4bit_quant_storage": null,
63
+ "max_new_tokens": 64,
64
+ "temperature": 0.0,
65
+ "top_k": null,
66
+ "top_p": null,
67
+ "repetition_penalty": null,
68
+ "num_beams": 1,
69
+ "stream": false,
70
+ "stop_words": [],
71
+ "logprobs": false,
72
+ "top_logprobs": null,
73
+ "ckpt_dir": null,
74
+ "load_dataset_config": null,
75
+ "lora_modules": [],
76
+ "tuner_backend": "peft",
77
+ "train_type": "lora",
78
+ "adapters": [],
79
+ "external_plugins": [],
80
+ "seed": 42,
81
+ "model_kwargs": {},
82
+ "load_args": false,
83
+ "load_data_args": false,
84
+ "use_hf": false,
85
+ "hub_token": null,
86
+ "custom_register_path": [],
87
+ "ignore_args_error": false,
88
+ "use_swift_lora": false,
89
+ "output_dir": "/mnt/data/users/liamding/data/MMMT/lora/internvl3-8b-instruct-lora/v11-20250528-193547",
90
+ "overwrite_output_dir": false,
91
+ "do_train": false,
92
+ "do_eval": false,
93
+ "do_predict": false,
94
+ "eval_strategy": "epoch",
95
+ "prediction_loss_only": false,
96
+ "per_device_train_batch_size": 2,
97
+ "per_device_eval_batch_size": 2,
98
+ "per_gpu_train_batch_size": null,
99
+ "per_gpu_eval_batch_size": null,
100
+ "gradient_accumulation_steps": 2,
101
+ "eval_accumulation_steps": null,
102
+ "eval_delay": 0,
103
+ "torch_empty_cache_steps": null,
104
+ "learning_rate": 5e-06,
105
+ "weight_decay": 0.1,
106
+ "adam_beta1": 0.9,
107
+ "adam_beta2": 0.95,
108
+ "adam_epsilon": 1e-08,
109
+ "max_grad_norm": 1.0,
110
+ "num_train_epochs": 10.0,
111
+ "max_steps": -1,
112
+ "lr_scheduler_type": "cosine",
113
+ "lr_scheduler_kwargs": null,
114
+ "warmup_ratio": 0.05,
115
+ "warmup_steps": 0,
116
+ "log_level": "passive",
117
+ "log_level_replica": "warning",
118
+ "log_on_each_node": true,
119
+ "logging_dir": "/mnt/data/users/liamding/data/MMMT/lora/internvl3-8b-instruct-lora/v11-20250528-193547/runs",
120
+ "logging_strategy": "steps",
121
+ "logging_first_step": true,
122
+ "logging_steps": 1,
123
+ "logging_nan_inf_filter": true,
124
+ "save_strategy": "steps",
125
+ "save_steps": 500,
126
+ "save_total_limit": 5,
127
+ "save_safetensors": true,
128
+ "save_on_each_node": false,
129
+ "save_only_model": false,
130
+ "restore_callback_states_from_checkpoint": false,
131
+ "no_cuda": false,
132
+ "use_cpu": false,
133
+ "use_mps_device": false,
134
+ "jit_mode_eval": false,
135
+ "use_ipex": false,
136
+ "bf16": true,
137
+ "fp16": false,
138
+ "fp16_opt_level": "O1",
139
+ "half_precision_backend": "auto",
140
+ "bf16_full_eval": false,
141
+ "fp16_full_eval": false,
142
+ "tf32": null,
143
+ "local_rank": 0,
144
+ "ddp_backend": null,
145
+ "tpu_num_cores": null,
146
+ "tpu_metrics_debug": false,
147
+ "debug": null,
148
+ "dataloader_drop_last": false,
149
+ "eval_steps": null,
150
+ "dataloader_num_workers": 4,
151
+ "dataloader_prefetch_factor": null,
152
+ "past_index": -1,
153
+ "run_name": null,
154
+ "disable_tqdm": null,
155
+ "label_names": null,
156
+ "load_best_model_at_end": false,
157
+ "metric_for_best_model": "loss",
158
+ "greater_is_better": false,
159
+ "ignore_data_skip": false,
160
+ "fsdp": "",
161
+ "fsdp_min_num_params": 0,
162
+ "fsdp_config": null,
163
+ "tp_size": 0,
164
+ "fsdp_transformer_layer_cls_to_wrap": null,
165
+ "accelerator_config": {
166
+ "dispatch_batches": false
167
+ },
168
+ "deepspeed": {
169
+ "fp16": {
170
+ "enabled": "auto",
171
+ "loss_scale": 0,
172
+ "loss_scale_window": 1000,
173
+ "initial_scale_power": 16,
174
+ "hysteresis": 2,
175
+ "min_loss_scale": 1
176
+ },
177
+ "bf16": {
178
+ "enabled": "auto"
179
+ },
180
+ "zero_optimization": {
181
+ "stage": 3,
182
+ "offload_optimizer": {
183
+ "device": "none",
184
+ "pin_memory": true
185
+ },
186
+ "offload_param": {
187
+ "device": "none",
188
+ "pin_memory": true
189
+ },
190
+ "overlap_comm": false,
191
+ "contiguous_gradients": true,
192
+ "sub_group_size": 1000000000.0,
193
+ "reduce_bucket_size": "auto",
194
+ "zero_quantized_weights": false,
195
+ "zero_quantized_gradients": false,
196
+ "stage3_prefetch_bucket_size": "auto",
197
+ "stage3_param_persistence_threshold": "auto",
198
+ "stage3_max_live_parameters": 1000000000.0,
199
+ "stage3_max_reuse_distance": 1000000000.0,
200
+ "stage3_gather_16bit_weights_on_model_save": true
201
+ },
202
+ "gradient_accumulation_steps": "auto",
203
+ "gradient_clipping": "auto",
204
+ "steps_per_print": 2000,
205
+ "train_batch_size": "auto",
206
+ "train_micro_batch_size_per_gpu": "auto",
207
+ "wall_clock_breakdown": false
208
+ },
209
+ "label_smoothing_factor": 0.0,
210
+ "optim": "adamw_torch",
211
+ "optim_args": null,
212
+ "adafactor": false,
213
+ "group_by_length": false,
214
+ "length_column_name": "length",
215
+ "report_to": [
216
+ "wandb"
217
+ ],
218
+ "ddp_find_unused_parameters": null,
219
+ "ddp_bucket_cap_mb": null,
220
+ "ddp_broadcast_buffers": null,
221
+ "dataloader_pin_memory": true,
222
+ "dataloader_persistent_workers": false,
223
+ "skip_memory_metrics": true,
224
+ "use_legacy_prediction_loop": false,
225
+ "push_to_hub": false,
226
+ "resume_from_checkpoint": null,
227
+ "hub_model_id": null,
228
+ "hub_strategy": "every_save",
229
+ "hub_private_repo": null,
230
+ "hub_always_push": false,
231
+ "gradient_checkpointing": true,
232
+ "gradient_checkpointing_kwargs": null,
233
+ "include_inputs_for_metrics": false,
234
+ "include_for_metrics": [],
235
+ "eval_do_concat_batches": true,
236
+ "fp16_backend": "auto",
237
+ "push_to_hub_model_id": null,
238
+ "push_to_hub_organization": null,
239
+ "push_to_hub_token": null,
240
+ "mp_parameters": "",
241
+ "auto_find_batch_size": false,
242
+ "full_determinism": false,
243
+ "torchdynamo": null,
244
+ "ray_scope": "last",
245
+ "ddp_timeout": 1800,
246
+ "torch_compile": false,
247
+ "torch_compile_backend": null,
248
+ "torch_compile_mode": null,
249
+ "include_tokens_per_second": false,
250
+ "include_num_input_tokens_seen": false,
251
+ "neftune_noise_alpha": null,
252
+ "optim_target_modules": null,
253
+ "batch_eval_metrics": false,
254
+ "eval_on_start": false,
255
+ "use_liger_kernel": false,
256
+ "eval_use_gather_object": false,
257
+ "average_tokens_across_devices": false,
258
+ "sortish_sampler": false,
259
+ "predict_with_generate": false,
260
+ "generation_max_length": null,
261
+ "generation_num_beams": null,
262
+ "generation_config": null,
263
+ "check_model": true,
264
+ "acc_strategy": "token",
265
+ "train_dataloader_shuffle": true,
266
+ "metric_warmup_step": 0,
267
+ "fsdp_num": 1,
268
+ "acc_steps": 1,
269
+ "eval_use_evalscope": false,
270
+ "eval_datasets": [],
271
+ "eval_limit": null,
272
+ "eval_datasets_args": null,
273
+ "eval_generation_config": null,
274
+ "freeze_parameters": [
275
+ "vision_model",
276
+ "mlp1"
277
+ ],
278
+ "freeze_parameters_ratio": 0.0,
279
+ "trainable_parameters": [],
280
+ "freeze_llm": false,
281
+ "freeze_vit": true,
282
+ "freeze_aligner": true,
283
+ "target_modules": [
284
+ "all-linear"
285
+ ],
286
+ "target_regex": null,
287
+ "modules_to_save": [],
288
+ "lora_rank": 8,
289
+ "lora_alpha": 16,
290
+ "lora_dropout": 0.1,
291
+ "lora_bias": "none",
292
+ "lora_dtype": null,
293
+ "lorap_lr_ratio": null,
294
+ "use_rslora": false,
295
+ "use_dora": false,
296
+ "lora_ga_batch_size": 2,
297
+ "lora_ga_iters": 2,
298
+ "lora_ga_max_length": 1024,
299
+ "lora_ga_direction": "ArB2r",
300
+ "lora_ga_scale": "stable",
301
+ "lora_ga_stable_gamma": 16,
302
+ "init_weights": true,
303
+ "fourier_n_frequency": 2000,
304
+ "fourier_scaling": 300.0,
305
+ "boft_block_size": 4,
306
+ "boft_block_num": 0,
307
+ "boft_n_butterfly_factor": 1,
308
+ "boft_dropout": 0.0,
309
+ "vera_rank": 256,
310
+ "vera_projection_prng_key": 0,
311
+ "vera_dropout": 0.0,
312
+ "vera_d_initial": 0.1,
313
+ "adapter_act": "gelu",
314
+ "adapter_length": 128,
315
+ "use_galore": false,
316
+ "galore_target_modules": null,
317
+ "galore_rank": 128,
318
+ "galore_update_proj_gap": 50,
319
+ "galore_scale": 1.0,
320
+ "galore_proj_type": "std",
321
+ "galore_optim_per_parameter": false,
322
+ "galore_with_embedding": false,
323
+ "galore_quantization": false,
324
+ "galore_proj_quant": false,
325
+ "galore_proj_bits": 4,
326
+ "galore_proj_group_size": 256,
327
+ "galore_cos_threshold": 0.4,
328
+ "galore_gamma_proj": 2,
329
+ "galore_queue_size": 5,
330
+ "adalora_target_r": 8,
331
+ "adalora_init_r": 12,
332
+ "adalora_tinit": 0,
333
+ "adalora_tfinal": 0,
334
+ "adalora_deltaT": 1,
335
+ "adalora_beta1": 0.85,
336
+ "adalora_beta2": 0.85,
337
+ "adalora_orth_reg_weight": 0.5,
338
+ "llamapro_num_new_blocks": 4,
339
+ "llamapro_num_groups": null,
340
+ "lisa_activated_layers": 0,
341
+ "lisa_step_interval": 20,
342
+ "reft_layer_key": null,
343
+ "reft_layers": null,
344
+ "reft_rank": 4,
345
+ "reft_intervention_type": "LoreftIntervention",
346
+ "reft_args": null,
347
+ "swanlab_token": null,
348
+ "swanlab_project": null,
349
+ "swanlab_workspace": null,
350
+ "swanlab_exp_name": null,
351
+ "swanlab_mode": "cloud",
352
+ "add_version": true,
353
+ "resume_only_model": false,
354
+ "create_checkpoint_symlink": false,
355
+ "packing": false,
356
+ "lazy_tokenize": true,
357
+ "loss_type": null,
358
+ "optimizer": null,
359
+ "metric": null,
360
+ "zero_hpz_partition_size": null,
361
+ "rank": 0,
362
+ "global_world_size": 3,
363
+ "local_world_size": 3,
364
+ "model_suffix": "InternVL3-8B-Instruct",
365
+ "model_info": "ModelInfo(model_type='internvl3', model_dir='/mnt/data/users/liamding/data/models/InternVL3-8B-Instruct', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling={'factor': 2.0, 'rope_type': 'dynamic', 'type': 'dynamic'}, config=None, task_type='causal_lm', num_labels=None)",
366
+ "model_meta": "ModelMeta(model_type='internvl3', model_groups=[ModelGroup(models=[Model(ms_model_id='OpenGVLab/InternVL3-1B', hf_model_id='OpenGVLab/InternVL3-1B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-2B', hf_model_id='OpenGVLab/InternVL3-2B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-8B', hf_model_id='OpenGVLab/InternVL3-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-9B', hf_model_id='OpenGVLab/InternVL3-9B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-14B', hf_model_id='OpenGVLab/InternVL3-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-38B', hf_model_id='OpenGVLab/InternVL3-38B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-78B', hf_model_id='OpenGVLab/InternVL3-78B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='internvl2_5', get_function=<function get_model_tokenizer_internvl at 0x7fbd7ccc1ea0>, model_arch='internvl', architectures=['InternVLChatModel'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.37.2', 'timm'], tags=['vision', 'video'])",
367
+ "model_dir": "/mnt/data/users/liamding/data/models/InternVL3-8B-Instruct",
368
+ "hub": "<class 'swift.hub.hub.MSHub'>",
369
+ "evaluation_strategy": "epoch",
370
+ "training_args": "Seq2SeqTrainingArguments(output_dir='/mnt/data/users/liamding/data/MMMT/lora/internvl3-8b-instruct-lora/v11-20250528-193547', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=2, per_device_eval_batch_size=2, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=5e-06, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=10.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/mnt/data/users/liamding/data/MMMT/lora/internvl3-8b-instruct-lora/v11-20250528-193547/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=500, save_total_limit=5, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=None, dataloader_num_workers=4, dataloader_prefetch_factor=10, past_index=-1, run_name='/mnt/data/users/liamding/data/MMMT/lora/internvl3-8b-instruct-lora/v11-20250528-193547', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, tp_size=0, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['wandb'], 
ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, metric_warmup_step=0, fsdp_num=1, acc_steps=1, eval_use_evalscope=False, eval_datasets=[], eval_limit=None, eval_datasets_args=None, eval_generation_config=None, train_type='lora', optimizer=None, local_repo_path=None, galore_config=None)"
371
+ }
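args.json records the full ms-swift training configuration for this LoRA run. A small sketch (not part of the upload) for pulling back out the headline hyperparameters, which match the values above (LoRA rank 8, alpha 16, learning rate 5e-6, 10 epochs, frozen ViT and aligner):

```python
import json

with open('internvl3-8b-instruct-lora_epoch10_5e-6/args.json', encoding='utf-8') as f:
    args = json.load(f)

for key in ('train_type', 'lora_rank', 'lora_alpha', 'lora_dropout',
            'learning_rate', 'num_train_epochs', 'freeze_vit', 'freeze_aligner'):
    print(key, '=', args[key])
```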
internvl3-8b-instruct-lora_epoch10_5e-6/config.json ADDED
@@ -0,0 +1,226 @@
1
+ {
2
+ "_commit_hash": null,
3
+ "architectures": [
4
+ "InternVLChatModel"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_internvl_chat.InternVLChatConfig",
8
+ "AutoModel": "modeling_internvl_chat_cd.InternVLChatModel",
9
+ "AutoModelForCausalLM": "modeling_internvl_chat_cd.InternVLChatModel"
10
+ },
11
+ "downsample_ratio": 0.5,
12
+ "dynamic_image_size": true,
13
+ "force_image_size": 448,
14
+ "hidden_size": 3584,
15
+ "image_fold": null,
16
+ "keys_to_ignore_at_inference": [
17
+ "past_key_values"
18
+ ],
19
+ "llm_config": {
20
+ "_attn_implementation_autoset": true,
21
+ "_name_or_path": "./pretrained/Qwen2.5-32B-Instruct",
22
+ "add_cross_attention": false,
23
+ "architectures": [
24
+ "Qwen2ForCausalLM"
25
+ ],
26
+ "attention_dropout": 0.0,
27
+ "bad_words_ids": null,
28
+ "begin_suppress_tokens": null,
29
+ "bos_token_id": 151643,
30
+ "chunk_size_feed_forward": 0,
31
+ "cross_attention_hidden_size": null,
32
+ "decoder_start_token_id": null,
33
+ "diversity_penalty": 0.0,
34
+ "do_sample": false,
35
+ "early_stopping": false,
36
+ "encoder_no_repeat_ngram_size": 0,
37
+ "eos_token_id": 151643,
38
+ "exponential_decay_length_penalty": null,
39
+ "finetuning_task": null,
40
+ "forced_bos_token_id": null,
41
+ "forced_eos_token_id": null,
42
+ "hidden_act": "silu",
43
+ "hidden_size": 3584,
44
+ "id2label": {
45
+ "0": "LABEL_0",
46
+ "1": "LABEL_1"
47
+ },
48
+ "initializer_range": 0.02,
49
+ "intermediate_size": 18944,
50
+ "is_decoder": false,
51
+ "is_encoder_decoder": false,
52
+ "label2id": {
53
+ "LABEL_0": 0,
54
+ "LABEL_1": 1
55
+ },
56
+ "length_penalty": 1.0,
57
+ "max_length": 20,
58
+ "max_position_embeddings": 32768,
59
+ "max_window_layers": 70,
60
+ "min_length": 0,
61
+ "model_type": "qwen2",
62
+ "moe_config": null,
63
+ "no_repeat_ngram_size": 0,
64
+ "num_attention_heads": 28,
65
+ "num_beam_groups": 1,
66
+ "num_beams": 1,
67
+ "num_hidden_layers": 28,
68
+ "num_key_value_heads": 4,
69
+ "num_return_sequences": 1,
70
+ "output_attentions": false,
71
+ "output_hidden_states": false,
72
+ "output_scores": false,
73
+ "pad_token_id": 151643,
74
+ "prefix": null,
75
+ "problem_type": null,
76
+ "pruned_heads": {},
77
+ "remove_invalid_values": false,
78
+ "repetition_penalty": 1.0,
79
+ "return_dict": true,
80
+ "return_dict_in_generate": false,
81
+ "rms_norm_eps": 1e-06,
82
+ "rope_scaling": {
83
+ "factor": 2.0,
84
+ "rope_type": "dynamic",
85
+ "type": "dynamic"
86
+ },
87
+ "rope_theta": 1000000.0,
88
+ "sep_token_id": null,
89
+ "sliding_window": null,
90
+ "suppress_tokens": null,
91
+ "task_specific_params": null,
92
+ "temperature": 1.0,
93
+ "tf_legacy_loss": false,
94
+ "tie_encoder_decoder": false,
95
+ "tie_word_embeddings": false,
96
+ "tokenizer_class": null,
97
+ "top_k": 50,
98
+ "top_p": 1.0,
99
+ "torch_dtype": "bfloat16",
100
+ "torchscript": false,
101
+ "transformers_version": "4.51.3",
102
+ "typical_p": 1.0,
103
+ "use_bfloat16": true,
104
+ "use_cache": true,
105
+ "use_sliding_window": false,
106
+ "vocab_size": 151674
107
+ },
108
+ "max_dynamic_patch": 12,
109
+ "min_dynamic_patch": 1,
110
+ "model_type": "internvl_chat",
111
+ "pad2square": false,
112
+ "pad_token_id": 151643,
113
+ "ps_version": "v2",
114
+ "select_layer": -1,
115
+ "template": "internvl2_5",
116
+ "tie_word_embeddings": false,
117
+ "torch_dtype": "bfloat16",
118
+ "transformers_version": null,
119
+ "use_backbone_lora": 0,
120
+ "use_llm_lora": 0,
121
+ "use_thumbnail": true,
122
+ "vision_config": {
123
+ "_attn_implementation_autoset": true,
124
+ "_name_or_path": "OpenGVLab/InternViT-6B-448px-V1-5",
125
+ "add_cross_attention": false,
126
+ "architectures": [
127
+ "InternVisionModel"
128
+ ],
129
+ "attention_dropout": 0.0,
130
+ "auto_map": {
131
+ "AutoConfig": "configuration_intern_vit.InternVisionConfig",
132
+ "AutoModel": "modeling_intern_vit.InternVisionModel"
133
+ },
134
+ "bad_words_ids": null,
135
+ "begin_suppress_tokens": null,
136
+ "bos_token_id": null,
137
+ "capacity_factor": 1.2,
138
+ "chunk_size_feed_forward": 0,
139
+ "cross_attention_hidden_size": null,
140
+ "decoder_start_token_id": null,
141
+ "diversity_penalty": 0.0,
142
+ "do_sample": false,
143
+ "drop_path_rate": 0.0,
144
+ "dropout": 0.0,
145
+ "early_stopping": false,
146
+ "encoder_no_repeat_ngram_size": 0,
147
+ "eos_token_id": null,
148
+ "eval_capacity_factor": 1.4,
149
+ "exponential_decay_length_penalty": null,
150
+ "finetuning_task": null,
151
+ "forced_bos_token_id": null,
152
+ "forced_eos_token_id": null,
153
+ "hidden_act": "gelu",
154
+ "hidden_size": 1024,
155
+ "id2label": {
156
+ "0": "LABEL_0",
157
+ "1": "LABEL_1"
158
+ },
159
+ "image_size": 448,
160
+ "initializer_factor": 0.1,
161
+ "initializer_range": 1e-10,
162
+ "intermediate_size": 4096,
163
+ "is_decoder": false,
164
+ "is_encoder_decoder": false,
165
+ "label2id": {
166
+ "LABEL_0": 0,
167
+ "LABEL_1": 1
168
+ },
169
+ "laux_allreduce": "all_nodes",
170
+ "layer_norm_eps": 1e-06,
171
+ "length_penalty": 1.0,
172
+ "max_length": 20,
173
+ "min_length": 0,
174
+ "model_type": "intern_vit_6b",
175
+ "moe_coeff_ratio": 0.5,
176
+ "moe_intermediate_size": 768,
177
+ "moe_output_scale": 4.0,
178
+ "no_repeat_ngram_size": 0,
179
+ "noisy_gate_policy": "RSample_before",
180
+ "norm_type": "layer_norm",
181
+ "num_attention_heads": 16,
182
+ "num_beam_groups": 1,
183
+ "num_beams": 1,
184
+ "num_channels": 3,
185
+ "num_experts": 8,
186
+ "num_hidden_layers": 24,
187
+ "num_return_sequences": 1,
188
+ "num_routed_experts": 4,
189
+ "num_shared_experts": 4,
190
+ "output_attentions": false,
191
+ "output_hidden_states": false,
192
+ "output_scores": false,
193
+ "pad_token_id": 151643,
194
+ "patch_size": 14,
195
+ "prefix": null,
196
+ "problem_type": null,
197
+ "pruned_heads": {},
198
+ "qk_normalization": false,
199
+ "qkv_bias": true,
200
+ "remove_invalid_values": false,
201
+ "repetition_penalty": 1.0,
202
+ "return_dict": true,
203
+ "return_dict_in_generate": false,
204
+ "sep_token_id": null,
205
+ "shared_expert_intermediate_size": 3072,
206
+ "suppress_tokens": null,
207
+ "task_specific_params": null,
208
+ "temperature": 1.0,
209
+ "tf_legacy_loss": false,
210
+ "tie_encoder_decoder": false,
211
+ "tie_word_embeddings": true,
212
+ "tokenizer_class": null,
213
+ "top_k": 50,
214
+ "top_p": 1.0,
215
+ "torch_dtype": "bfloat16",
216
+ "torchscript": false,
217
+ "transformers_version": "4.51.3",
218
+ "typical_p": 1.0,
219
+ "use_bfloat16": true,
220
+ "use_flash_attn": true,
221
+ "use_moe": false,
222
+ "use_residual": true,
223
+ "use_rts": false,
224
+ "use_weighted_residual": false
225
+ }
226
+ }
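Because config.json maps AutoConfig / AutoModel onto the bundled configuration_internvl_chat.py and modeling_internvl_chat_cd.py files, the checkpoint is meant to be loaded with trust_remote_code=True. A minimal loading sketch (not part of the upload; local path assumed):

```python
import torch
from transformers import AutoConfig, AutoModel

path = 'internvl3-8b-instruct-lora_epoch10_5e-6'
config = AutoConfig.from_pretrained(path, trust_remote_code=True)
model = AutoModel.from_pretrained(
    path,
    config=config,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" above
    trust_remote_code=True,
).eval()
print(type(config.llm_config).__name__)  # Qwen2Config, per the "architectures" field
```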
internvl3-8b-instruct-lora_epoch10_5e-6/configuration_intern_vit.py ADDED
@@ -0,0 +1,120 @@
+ # --------------------------------------------------------
+ # InternVL
+ # Copyright (c) 2024 OpenGVLab
+ # Licensed under The MIT License [see LICENSE for details]
+ # --------------------------------------------------------
+
+ import os
+ from typing import Union
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+
+ class InternVisionConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
+     instantiate a vision encoder according to the specified arguments, defining the model architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         num_channels (`int`, *optional*, defaults to 3):
+             Number of color channels in the input images (e.g., 3 for RGB).
+         patch_size (`int`, *optional*, defaults to 14):
+             The size (resolution) of each patch.
+         image_size (`int`, *optional*, defaults to 224):
+             The size (resolution) of each image.
+         qkv_bias (`bool`, *optional*, defaults to `False`):
+             Whether to add a bias to the queries and values in the self-attention layers.
+         hidden_size (`int`, *optional*, defaults to 3200):
+             Dimensionality of the encoder layers and the pooler layer.
+         num_attention_heads (`int`, *optional*, defaults to 25):
+             Number of attention heads for each attention layer in the Transformer encoder.
+         intermediate_size (`int`, *optional*, defaults to 12800):
+             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+         qk_normalization (`bool`, *optional*, defaults to `True`):
+             Whether to normalize the queries and keys in the self-attention layers.
+         num_hidden_layers (`int`, *optional*, defaults to 48):
+             Number of hidden layers in the Transformer encoder.
+         use_flash_attn (`bool`, *optional*, defaults to `True`):
+             Whether to use flash attention mechanism.
+         hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+             `"relu"`, `"selu"` and `"gelu_new"` are supported.
+         layer_norm_eps (`float`, *optional*, defaults to 1e-6):
+             The epsilon used by the layer normalization layers.
+         dropout (`float`, *optional*, defaults to 0.0):
+             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+         drop_path_rate (`float`, *optional*, defaults to 0.0):
+             Dropout rate for stochastic depth.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         initializer_factor (`float`, *optional*, defaults to 0.1):
+             A factor for layer scale.
+     """
+
+     model_type = 'intern_vit_6b'
+
+     def __init__(
+             self,
+             num_channels=3,
+             patch_size=14,
+             image_size=224,
+             qkv_bias=False,
+             hidden_size=3200,
+             num_attention_heads=25,
+             intermediate_size=12800,
+             qk_normalization=True,
+             num_hidden_layers=48,
+             use_flash_attn=True,
+             hidden_act='gelu',
+             norm_type='rms_norm',
+             layer_norm_eps=1e-6,
+             dropout=0.0,
+             drop_path_rate=0.0,
+             attention_dropout=0.0,
+             initializer_range=0.02,
+             initializer_factor=0.1,
+             **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.dropout = dropout
+         self.drop_path_rate = drop_path_rate
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.num_channels = num_channels
+         self.patch_size = patch_size
+         self.image_size = image_size
+         self.initializer_range = initializer_range
+         self.initializer_factor = initializer_factor
+         self.attention_dropout = attention_dropout
+         self.layer_norm_eps = layer_norm_eps
+         self.hidden_act = hidden_act
+         self.norm_type = norm_type
+         self.qkv_bias = qkv_bias
+         self.qk_normalization = qk_normalization
+         self.use_flash_attn = use_flash_attn
+
+     @classmethod
+     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
+         config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+         if 'vision_config' in config_dict:
+             config_dict = config_dict['vision_config']
+
+         if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
+             logger.warning(
+                 f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                 f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
+             )
+
+         return cls.from_dict(config_dict, **kwargs)
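A usage sketch (not part of the upload): instantiating InternVisionConfig with the vision-tower values this checkpoint's config.json uses, rather than the class defaults:

```python
from configuration_intern_vit import InternVisionConfig  # file added above

vision_config = InternVisionConfig(
    image_size=448,
    patch_size=14,
    hidden_size=1024,
    intermediate_size=4096,
    num_hidden_layers=24,
    num_attention_heads=16,
    qkv_bias=True,
    qk_normalization=False,
    norm_type='layer_norm',
)
print(vision_config.model_type)  # intern_vit_6b
```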
internvl3-8b-instruct-lora_epoch10_5e-6/configuration_internvl_chat.py ADDED
@@ -0,0 +1,97 @@
+ # --------------------------------------------------------
+ # InternVL
+ # Copyright (c) 2024 OpenGVLab
+ # Licensed under The MIT License [see LICENSE for details]
+ # --------------------------------------------------------
+
+ import copy
+
+ from transformers import AutoConfig, LlamaConfig, Qwen2Config
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+ from .configuration_intern_vit import InternVisionConfig
+
+ logger = logging.get_logger(__name__)
+
+
+ class InternVLChatConfig(PretrainedConfig):
+     model_type = 'internvl_chat'
+     is_composition = True
+
+     def __init__(
+             self,
+             vision_config=None,
+             llm_config=None,
+             use_backbone_lora=0,
+             use_llm_lora=0,
+             select_layer=-1,
+             force_image_size=None,
+             downsample_ratio=0.5,
+             template=None,
+             dynamic_image_size=False,
+             use_thumbnail=False,
+             ps_version='v1',
+             min_dynamic_patch=1,
+             max_dynamic_patch=6,
+             **kwargs):
+         super().__init__(**kwargs)
+
+         if vision_config is None:
+             vision_config = {'architectures': ['InternVisionModel']}
+             logger.info('vision_config is None. Initializing the InternVisionConfig with default values.')
+
+         if llm_config is None:
+             llm_config = {'architectures': ['Qwen2ForCausalLM']}
+             logger.info('llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`).')
+
+         self.vision_config = InternVisionConfig(**vision_config)
+         if llm_config.get('architectures')[0] == 'LlamaForCausalLM':
+             self.llm_config = LlamaConfig(**llm_config)
+         elif llm_config.get('architectures')[0] == 'Qwen2ForCausalLM':
+             self.llm_config = Qwen2Config(**llm_config)
+         else:
+             raise ValueError('Unsupported architecture: {}'.format(llm_config.get('architectures')[0]))
+         self.use_backbone_lora = use_backbone_lora
+         self.use_llm_lora = use_llm_lora
+         self.select_layer = select_layer
+         self.force_image_size = force_image_size
+         self.downsample_ratio = downsample_ratio
+         self.template = template
+         self.dynamic_image_size = dynamic_image_size
+         self.use_thumbnail = use_thumbnail
+         self.ps_version = ps_version  # pixel shuffle version
+         self.min_dynamic_patch = min_dynamic_patch
+         self.max_dynamic_patch = max_dynamic_patch
+         # By default, we use tie_word_embeddings=False for models of all sizes.
+         self.tie_word_embeddings = self.llm_config.tie_word_embeddings
+
+         logger.info(f'vision_select_layer: {self.select_layer}')
+         logger.info(f'ps_version: {self.ps_version}')
+         logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}')
+         logger.info(f'max_dynamic_patch: {self.max_dynamic_patch}')
+
+     def to_dict(self):
+         """
+         Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+
+         Returns:
+             `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+         """
+         output = copy.deepcopy(self.__dict__)
+         output['vision_config'] = self.vision_config.to_dict()
+         output['llm_config'] = self.llm_config.to_dict()
+         output['model_type'] = self.__class__.model_type
+         output['use_backbone_lora'] = self.use_backbone_lora
+         output['use_llm_lora'] = self.use_llm_lora
+         output['select_layer'] = self.select_layer
+         output['force_image_size'] = self.force_image_size
+         output['downsample_ratio'] = self.downsample_ratio
+         output['template'] = self.template
+         output['dynamic_image_size'] = self.dynamic_image_size
+         output['use_thumbnail'] = self.use_thumbnail
+         output['ps_version'] = self.ps_version
+         output['min_dynamic_patch'] = self.min_dynamic_patch
+         output['max_dynamic_patch'] = self.max_dynamic_patch
+
+         return output
internvl3-8b-instruct-lora_epoch10_5e-6/conversation.py ADDED
@@ -0,0 +1,391 @@
1
+ """
2
+ Conversation prompt templates.
3
+
4
+ We kindly request that you import fastchat instead of copying this file if you wish to use it.
5
+ If you have changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates.
6
+
7
+ Modified from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
8
+ """
9
+
10
+ import dataclasses
11
+ from enum import IntEnum, auto
12
+ from typing import Dict, List, Tuple, Union
13
+
14
+
15
+ class SeparatorStyle(IntEnum):
16
+ """Separator styles."""
17
+
18
+ ADD_COLON_SINGLE = auto()
19
+ ADD_COLON_TWO = auto()
20
+ ADD_COLON_SPACE_SINGLE = auto()
21
+ NO_COLON_SINGLE = auto()
22
+ NO_COLON_TWO = auto()
23
+ ADD_NEW_LINE_SINGLE = auto()
24
+ LLAMA2 = auto()
25
+ CHATGLM = auto()
26
+ CHATML = auto()
27
+ CHATINTERN = auto()
28
+ DOLLY = auto()
29
+ RWKV = auto()
30
+ PHOENIX = auto()
31
+ ROBIN = auto()
32
+ FALCON_CHAT = auto()
33
+ CHATGLM3 = auto()
34
+ INTERNVL_ZH = auto()
35
+ MPT = auto()
36
+
37
+
38
+ @dataclasses.dataclass
39
+ class Conversation:
40
+ """A class that manages prompt templates and keeps all conversation history."""
41
+
42
+ # The name of this template
43
+ name: str
44
+ # The template of the system prompt
45
+ system_template: str = '{system_message}'
46
+ # The system message
47
+ system_message: str = ''
48
+ # The names of two roles
49
+ roles: Tuple[str] = ('USER', 'ASSISTANT')
50
+ # All messages. Each item is (role, message).
51
+ messages: List[List[str]] = ()
52
+ # The number of few shot examples
53
+ offset: int = 0
54
+ # The separator style and configurations
55
+ sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE
56
+ sep: str = '\n'
57
+ sep2: str = None
58
+ # Stop criteria (the default one is EOS token)
59
+ stop_str: Union[str, List[str]] = None
60
+ # Stops generation if meeting any token in this list
61
+ stop_token_ids: List[int] = None
62
+
63
+ def get_prompt(self) -> str:
64
+ """Get the prompt for generation."""
65
+ system_prompt = self.system_template.format(system_message=self.system_message)
66
+ if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
67
+ ret = system_prompt + self.sep
68
+ for role, message in self.messages:
69
+ if message:
70
+ ret += role + ': ' + message + self.sep
71
+ else:
72
+ ret += role + ':'
73
+ return ret
74
+ elif self.sep_style == SeparatorStyle.ADD_COLON_TWO:
75
+ seps = [self.sep, self.sep2]
76
+ ret = system_prompt + seps[0]
77
+ for i, (role, message) in enumerate(self.messages):
78
+ if message:
79
+ ret += role + ': ' + message + seps[i % 2]
80
+ else:
81
+ ret += role + ':'
82
+ return ret
83
+ elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
84
+ ret = system_prompt + self.sep
85
+ for role, message in self.messages:
86
+ if message:
87
+ ret += role + ': ' + message + self.sep
88
+ else:
89
+ ret += role + ': ' # must be end with a space
90
+ return ret
91
+ elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
92
+ ret = '' if system_prompt == '' else system_prompt + self.sep
93
+ for role, message in self.messages:
94
+ if message:
95
+ ret += role + '\n' + message + self.sep
96
+ else:
97
+ ret += role + '\n'
98
+ return ret
99
+ elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
100
+ ret = system_prompt
101
+ for role, message in self.messages:
102
+ if message:
103
+ ret += role + message + self.sep
104
+ else:
105
+ ret += role
106
+ return ret
107
+ elif self.sep_style == SeparatorStyle.NO_COLON_TWO:
108
+ seps = [self.sep, self.sep2]
109
+ ret = system_prompt
110
+ for i, (role, message) in enumerate(self.messages):
111
+ if message:
112
+ ret += role + message + seps[i % 2]
113
+ else:
114
+ ret += role
115
+ return ret
116
+ elif self.sep_style == SeparatorStyle.RWKV:
117
+ ret = system_prompt
118
+ for i, (role, message) in enumerate(self.messages):
119
+ if message:
120
+ ret += (
121
+ role
122
+ + ': '
123
+ + message.replace('\r\n', '\n').replace('\n\n', '\n')
124
+ )
125
+ ret += '\n\n'
126
+ else:
127
+ ret += role + ':'
128
+ return ret
129
+ elif self.sep_style == SeparatorStyle.LLAMA2:
130
+ seps = [self.sep, self.sep2]
131
+ if self.system_message:
132
+ ret = system_prompt
133
+ else:
134
+ ret = '[INST] '
135
+ for i, (role, message) in enumerate(self.messages):
136
+ tag = self.roles[i % 2]
137
+ if message:
138
+ if i == 0:
139
+ ret += message + ' '
140
+ else:
141
+ ret += tag + ' ' + message + seps[i % 2]
142
+ else:
143
+ ret += tag
144
+ return ret
145
+ elif self.sep_style == SeparatorStyle.CHATGLM:
146
+ # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308
147
+ # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926
148
+ round_add_n = 1 if self.name == 'chatglm2' else 0
149
+ if system_prompt:
150
+ ret = system_prompt + self.sep
151
+ else:
152
+ ret = ''
153
+
154
+ for i, (role, message) in enumerate(self.messages):
155
+ if i % 2 == 0:
156
+ ret += f'[Round {i//2 + round_add_n}]{self.sep}'
157
+
158
+ if message:
159
+ ret += f'{role}:{message}{self.sep}'
160
+ else:
161
+ ret += f'{role}:'
162
+ return ret
163
+ elif self.sep_style == SeparatorStyle.CHATML:
164
+ ret = '' if system_prompt == '' else system_prompt + self.sep + '\n'
165
+ for role, message in self.messages:
166
+ if message:
167
+ ret += role + '\n' + message + self.sep + '\n'
168
+ else:
169
+ ret += role + '\n'
170
+ return ret
171
+ elif self.sep_style == SeparatorStyle.CHATGLM3:
172
+ ret = ''
173
+ if self.system_message:
174
+ ret += system_prompt
175
+ for role, message in self.messages:
176
+ if message:
177
+ ret += role + '\n' + ' ' + message
178
+ else:
179
+ ret += role
180
+ return ret
181
+ elif self.sep_style == SeparatorStyle.CHATINTERN:
182
+ # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771
183
+ seps = [self.sep, self.sep2]
184
+ ret = system_prompt
185
+ for i, (role, message) in enumerate(self.messages):
186
+ # if i % 2 == 0:
187
+ # ret += "<s>"
188
+ if message:
189
+ ret += role + ':' + message + seps[i % 2] + '\n'
190
+ else:
191
+ ret += role + ':'
192
+ return ret
193
+ elif self.sep_style == SeparatorStyle.DOLLY:
194
+ seps = [self.sep, self.sep2]
195
+ ret = system_prompt
196
+ for i, (role, message) in enumerate(self.messages):
197
+ if message:
198
+ ret += role + ':\n' + message + seps[i % 2]
199
+ if i % 2 == 1:
200
+ ret += '\n\n'
201
+ else:
202
+ ret += role + ':\n'
203
+ return ret
204
+ elif self.sep_style == SeparatorStyle.PHOENIX:
205
+ ret = system_prompt
206
+ for role, message in self.messages:
207
+ if message:
208
+ ret += role + ': ' + '<s>' + message + '</s>'
209
+ else:
210
+ ret += role + ': ' + '<s>'
211
+ return ret
212
+ elif self.sep_style == SeparatorStyle.ROBIN:
213
+ ret = system_prompt + self.sep
214
+ for role, message in self.messages:
215
+ if message:
216
+ ret += role + ':\n' + message + self.sep
217
+ else:
218
+ ret += role + ':\n'
219
+ return ret
220
+ elif self.sep_style == SeparatorStyle.FALCON_CHAT:
221
+ ret = ''
222
+ if self.system_message:
223
+ ret += system_prompt + self.sep
224
+ for role, message in self.messages:
225
+ if message:
226
+ ret += role + ': ' + message + self.sep
227
+ else:
228
+ ret += role + ':'
229
+
230
+ return ret
231
+ elif self.sep_style == SeparatorStyle.INTERNVL_ZH:
232
+ seps = [self.sep, self.sep2]
233
+ ret = self.system_message + seps[0]
234
+ for i, (role, message) in enumerate(self.messages):
235
+ if message:
236
+ ret += role + ': ' + message + seps[i % 2]
237
+ else:
238
+ ret += role + ':'
239
+ return ret
240
+ elif self.sep_style == SeparatorStyle.MPT:
241
+ ret = system_prompt + self.sep
242
+ for role, message in self.messages:
243
+ if message:
244
+ if type(message) is tuple:
245
+ message, _, _ = message
246
+ ret += role + message + self.sep
247
+ else:
248
+ ret += role
249
+ return ret
250
+ else:
251
+ raise ValueError(f'Invalid style: {self.sep_style}')
252
+
253
+ def set_system_message(self, system_message: str):
254
+ """Set the system message."""
255
+ self.system_message = system_message
256
+
257
+ def append_message(self, role: str, message: str):
258
+ """Append a new message."""
259
+ self.messages.append([role, message])
260
+
261
+ def update_last_message(self, message: str):
262
+ """Update the last output.
263
+
264
+ The last message is typically set to be None when constructing the prompt,
265
+ so we need to update it in-place after getting the response from a model.
266
+ """
267
+ self.messages[-1][1] = message
268
+
269
+ def to_gradio_chatbot(self):
270
+ """Convert the conversation to gradio chatbot format."""
271
+ ret = []
272
+ for i, (role, msg) in enumerate(self.messages[self.offset :]):
273
+ if i % 2 == 0:
274
+ ret.append([msg, None])
275
+ else:
276
+ ret[-1][-1] = msg
277
+ return ret
278
+
279
+ def to_openai_api_messages(self):
280
+ """Convert the conversation to OpenAI chat completion format."""
281
+ ret = [{'role': 'system', 'content': self.system_message}]
282
+
283
+ for i, (_, msg) in enumerate(self.messages[self.offset :]):
284
+ if i % 2 == 0:
285
+ ret.append({'role': 'user', 'content': msg})
286
+ else:
287
+ if msg is not None:
288
+ ret.append({'role': 'assistant', 'content': msg})
289
+ return ret
290
+
291
+ def copy(self):
292
+ return Conversation(
293
+ name=self.name,
294
+ system_template=self.system_template,
295
+ system_message=self.system_message,
296
+ roles=self.roles,
297
+ messages=[[x, y] for x, y in self.messages],
298
+ offset=self.offset,
299
+ sep_style=self.sep_style,
300
+ sep=self.sep,
301
+ sep2=self.sep2,
302
+ stop_str=self.stop_str,
303
+ stop_token_ids=self.stop_token_ids,
304
+ )
305
+
306
+ def dict(self):
307
+ return {
308
+ 'template_name': self.name,
309
+ 'system_message': self.system_message,
310
+ 'roles': self.roles,
311
+ 'messages': self.messages,
312
+ 'offset': self.offset,
313
+ }
314
+
315
+
316
+ # A global registry for all conversation templates
317
+ conv_templates: Dict[str, Conversation] = {}
318
+
319
+
320
+ def register_conv_template(template: Conversation, override: bool = False):
321
+ """Register a new conversation template."""
322
+ if not override:
323
+ assert (
324
+ template.name not in conv_templates
325
+ ), f'{template.name} has been registered.'
326
+
327
+ conv_templates[template.name] = template
328
+
329
+
330
+ def get_conv_template(name: str) -> Conversation:
331
+ """Get a conversation template."""
332
+ return conv_templates[name].copy()
333
+
334
+
335
+ # Both Hermes-2 and internlm2-chat are chatml-format conversation templates. The difference
336
+ # is that during training, the preprocessing function for the Hermes-2 template doesn't add
337
+ # <s> at the beginning of the tokenized sequence, while the internlm2-chat template does.
338
+ # Therefore, they are completely equivalent during inference.
339
+ register_conv_template(
340
+ Conversation(
341
+ name='Hermes-2',
342
+ system_template='<|im_start|>system\n{system_message}',
343
+ # note: The new system prompt was not used here to avoid changes in benchmark performance.
344
+ # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
345
+ system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
346
+ roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
347
+ sep_style=SeparatorStyle.MPT,
348
+ sep='<|im_end|>',
349
+ stop_str='<|endoftext|>',
350
+ )
351
+ )
352
+
353
+
354
+ register_conv_template(
355
+ Conversation(
356
+ name='internlm2-chat',
357
+ system_template='<|im_start|>system\n{system_message}',
358
+ # note: The new system prompt was not used here to avoid changes in benchmark performance.
359
+ # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
360
+ system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
361
+ roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
362
+ sep_style=SeparatorStyle.MPT,
363
+ sep='<|im_end|>',
364
+ )
365
+ )
366
+
367
+
368
+ register_conv_template(
369
+ Conversation(
370
+ name='phi3-chat',
371
+ system_template='<|system|>\n{system_message}',
372
+ # note: The new system prompt was not used here to avoid changes in benchmark performance.
373
+ # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
374
+ system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
375
+ roles=('<|user|>\n', '<|assistant|>\n'),
376
+ sep_style=SeparatorStyle.MPT,
377
+ sep='<|end|>',
378
+ )
379
+ )
380
+
381
+
382
+ register_conv_template(
383
+ Conversation(
384
+ name='internvl2_5',
385
+ system_template='<|im_start|>system\n{system_message}',
386
+ system_message='你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
387
+ roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
388
+ sep_style=SeparatorStyle.MPT,
389
+ sep='<|im_end|>\n',
390
+ )
391
+ )
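A usage sketch (not part of the upload): building a prompt with the 'internvl2_5' template registered above, which is the template named in this checkpoint's config.json and args.json:

```python
from conversation import get_conv_template  # file added above

conv = get_conv_template('internvl2_5')
conv.append_message(conv.roles[0], 'Describe the image in detail.')
conv.append_message(conv.roles[1], None)
print(conv.get_prompt())
# -> "<|im_start|>system\n...<|im_end|>\n<|im_start|>user\nDescribe the image in detail.<|im_end|>\n<|im_start|>assistant\n"
```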
internvl3-8b-instruct-lora_epoch10_5e-6/generation_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "_from_model_config": true,
+ "transformers_version": "4.51.3"
+ }
internvl3-8b-instruct-lora_epoch10_5e-6/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
internvl3-8b-instruct-lora_epoch10_5e-6/model.safetensors.index.json ADDED
@@ -0,0 +1,692 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 15888747520
4
+ },
5
+ "weight_map": {
6
+ "language_model.lm_head.weight": "model-00004-of-00004.safetensors",
7
+ "language_model.model.embed_tokens.weight": "model-00001-of-00004.safetensors",
8
+ "language_model.model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
9
+ "language_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
10
+ "language_model.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
11
+ "language_model.model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
12
+ "language_model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
13
+ "language_model.model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
14
+ "language_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
15
+ "language_model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
16
+ "language_model.model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
17
+ "language_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
18
+ "language_model.model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
19
+ "language_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
20
+ "language_model.model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
21
+ "language_model.model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
22
+ "language_model.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
23
+ "language_model.model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
24
+ "language_model.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
25
+ "language_model.model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
26
+ "language_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
27
+ "language_model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
28
+ "language_model.model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
29
+ "language_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
30
+ "language_model.model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
31
+ "language_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
32
+ "language_model.model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
33
+ "language_model.model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
34
+ "language_model.model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
35
+ "language_model.model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
36
+ "language_model.model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
37
+ "language_model.model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
38
+ "language_model.model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
39
+ "language_model.model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
40
+ "language_model.model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
41
+ "language_model.model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
42
+ "language_model.model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
43
+ "language_model.model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
44
+ "language_model.model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
45
+ "language_model.model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
46
+ "language_model.model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
47
+ "language_model.model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
48
+ "language_model.model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
49
+ "language_model.model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
50
+ "language_model.model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
51
+ "language_model.model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
52
+ "language_model.model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
53
+ "language_model.model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
54
+ "language_model.model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
55
+ "language_model.model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
56
+ "language_model.model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
57
+ "language_model.model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
58
+ "language_model.model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
59
+ "language_model.model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
60
+ "language_model.model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
61
+ "language_model.model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
62
+ "language_model.model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
63
+ "language_model.model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
64
+ "language_model.model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
65
+ "language_model.model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
66
+ "language_model.model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
67
+ "language_model.model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
68
+ "language_model.model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
69
+ "language_model.model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
70
+ "language_model.model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
71
+ "language_model.model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
72
+ "language_model.model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
73
+ "language_model.model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
74
+ "language_model.model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
75
+ "language_model.model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
76
+ "language_model.model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
77
+ "language_model.model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
78
+ "language_model.model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
79
+ "language_model.model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
80
+ "language_model.model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
81
+ "language_model.model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
82
+ "language_model.model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
83
+ "language_model.model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
84
+ "language_model.model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
85
+ "language_model.model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
86
+ "language_model.model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
87
+ "language_model.model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
88
+ "language_model.model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
89
+ "language_model.model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
90
+ "language_model.model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
91
+ "language_model.model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
92
+ "language_model.model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
93
+ "language_model.model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
94
+ "language_model.model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
95
+ "language_model.model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
96
+ "language_model.model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
97
+ "language_model.model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
98
+ "language_model.model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
99
+ "language_model.model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
100
+ "language_model.model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
101
+ "language_model.model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
102
+ "language_model.model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
103
+ "language_model.model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
104
+ "language_model.model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
105
+ "language_model.model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
106
+ "language_model.model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
107
+ "language_model.model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
108
+ "language_model.model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
109
+ "language_model.model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
110
+ "language_model.model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
111
+ "language_model.model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
112
+ "language_model.model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
113
+ "language_model.model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
114
+ "language_model.model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
115
+ "language_model.model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
116
+ "language_model.model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors",
117
+ "language_model.model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
118
+ "language_model.model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
119
+ "language_model.model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
120
+ "language_model.model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
121
+ "language_model.model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
122
+ "language_model.model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
123
+ "language_model.model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
124
+ "language_model.model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
125
+ "language_model.model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
126
+ "language_model.model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
127
+ "language_model.model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
128
+ "language_model.model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
129
+ "language_model.model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
130
+ "language_model.model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
131
+ "language_model.model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
132
+ "language_model.model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
133
+ "language_model.model.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
134
+ "language_model.model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
135
+ "language_model.model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
136
+ "language_model.model.layers.18.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
137
+ "language_model.model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
138
+ "language_model.model.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
139
+ "language_model.model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
140
+ "language_model.model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
141
+ "language_model.model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
142
+ "language_model.model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
143
+ "language_model.model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
144
+ "language_model.model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
145
+ "language_model.model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
146
+ "language_model.model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
147
+ "language_model.model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
148
+ "language_model.model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
149
+ "language_model.model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
150
+ "language_model.model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
151
+ "language_model.model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
152
+ "language_model.model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
153
+ "language_model.model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
154
+ "language_model.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
155
+ "language_model.model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
156
+ "language_model.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
157
+ "language_model.model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
158
+ "language_model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
159
+ "language_model.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
160
+ "language_model.model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
161
+ "language_model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
162
+ "language_model.model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
163
+ "language_model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
164
+ "language_model.model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
165
+ "language_model.model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
166
+ "language_model.model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
167
+ "language_model.model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
168
+ "language_model.model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
169
+ "language_model.model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
170
+ "language_model.model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
171
+ "language_model.model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
172
+ "language_model.model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
173
+ "language_model.model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
174
+ "language_model.model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
175
+ "language_model.model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
176
+ "language_model.model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
177
+ "language_model.model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
178
+ "language_model.model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
179
+ "language_model.model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
180
+ "language_model.model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
181
+ "language_model.model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
182
+ "language_model.model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
183
+ "language_model.model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
184
+ "language_model.model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
185
+ "language_model.model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
186
+ "language_model.model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
187
+ "language_model.model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
188
+ "language_model.model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
189
+ "language_model.model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
190
+ "language_model.model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
191
+ "language_model.model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
192
+ "language_model.model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
193
+ "language_model.model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
194
+ "language_model.model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
195
+ "language_model.model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
196
+ "language_model.model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
197
+ "language_model.model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
198
+ "language_model.model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
199
+ "language_model.model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
200
+ "language_model.model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
201
+ "language_model.model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
202
+ "language_model.model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
203
+ "language_model.model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
204
+ "language_model.model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
205
+ "language_model.model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
206
+ "language_model.model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
207
+ "language_model.model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
208
+ "language_model.model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
209
+ "language_model.model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
210
+ "language_model.model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
211
+ "language_model.model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
212
+ "language_model.model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
213
+ "language_model.model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
214
+ "language_model.model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
215
+ "language_model.model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
216
+ "language_model.model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
217
+ "language_model.model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
218
+ "language_model.model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
219
+ "language_model.model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
220
+ "language_model.model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
221
+ "language_model.model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
222
+ "language_model.model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
223
+ "language_model.model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
224
+ "language_model.model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
225
+ "language_model.model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
226
+ "language_model.model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
227
+ "language_model.model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
228
+ "language_model.model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
229
+ "language_model.model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
230
+ "language_model.model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
231
+ "language_model.model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
232
+ "language_model.model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
233
+ "language_model.model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
234
+ "language_model.model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
235
+ "language_model.model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
236
+ "language_model.model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
237
+ "language_model.model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
238
+ "language_model.model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
239
+ "language_model.model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
240
+ "language_model.model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
241
+ "language_model.model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
242
+ "language_model.model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
243
+ "language_model.model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
244
+ "language_model.model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
245
+ "language_model.model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
246
+ "language_model.model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
247
+ "language_model.model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
248
+ "language_model.model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
249
+ "language_model.model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
250
+ "language_model.model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
251
+ "language_model.model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
252
+ "language_model.model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
253
+ "language_model.model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
254
+ "language_model.model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
255
+ "language_model.model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
256
+ "language_model.model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
257
+ "language_model.model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
258
+ "language_model.model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
259
+ "language_model.model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
260
+ "language_model.model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
261
+ "language_model.model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
262
+ "language_model.model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
263
+ "language_model.model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
264
+ "language_model.model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
265
+ "language_model.model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
266
+ "language_model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
267
+ "language_model.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
268
+ "language_model.model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
269
+ "language_model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
270
+ "language_model.model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
271
+ "language_model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
272
+ "language_model.model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
273
+ "language_model.model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
274
+ "language_model.model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
275
+ "language_model.model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
276
+ "language_model.model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
277
+ "language_model.model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
278
+ "language_model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
279
+ "language_model.model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
280
+ "language_model.model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
281
+ "language_model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
282
+ "language_model.model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
283
+ "language_model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
284
+ "language_model.model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
285
+ "language_model.model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
286
+ "language_model.model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
287
+ "language_model.model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
288
+ "language_model.model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
289
+ "language_model.model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
290
+ "language_model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
291
+ "language_model.model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
292
+ "language_model.model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
293
+ "language_model.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
294
+ "language_model.model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
295
+ "language_model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
296
+ "language_model.model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
297
+ "language_model.model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
298
+ "language_model.model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
299
+ "language_model.model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
300
+ "language_model.model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
301
+ "language_model.model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
302
+ "language_model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
303
+ "language_model.model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
304
+ "language_model.model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
305
+ "language_model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
306
+ "language_model.model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
307
+ "language_model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
308
+ "language_model.model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
309
+ "language_model.model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
310
+ "language_model.model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
311
+ "language_model.model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
312
+ "language_model.model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
313
+ "language_model.model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
314
+ "language_model.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
315
+ "language_model.model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
316
+ "language_model.model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
317
+ "language_model.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
318
+ "language_model.model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
319
+ "language_model.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
320
+ "language_model.model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
321
+ "language_model.model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
322
+ "language_model.model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
323
+ "language_model.model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
324
+ "language_model.model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
325
+ "language_model.model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
326
+ "language_model.model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
327
+ "language_model.model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
328
+ "language_model.model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
329
+ "language_model.model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
330
+ "language_model.model.layers.8.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
331
+ "language_model.model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
332
+ "language_model.model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
333
+ "language_model.model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
334
+ "language_model.model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
335
+ "language_model.model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
336
+ "language_model.model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
337
+ "language_model.model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
338
+ "language_model.model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
339
+ "language_model.model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
340
+ "language_model.model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
341
+ "language_model.model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
342
+ "language_model.model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
343
+ "language_model.model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
344
+ "language_model.model.norm.weight": "model-00003-of-00004.safetensors",
345
+ "mlp1.0.bias": "model-00004-of-00004.safetensors",
346
+ "mlp1.0.weight": "model-00004-of-00004.safetensors",
347
+ "mlp1.1.bias": "model-00004-of-00004.safetensors",
348
+ "mlp1.1.weight": "model-00004-of-00004.safetensors",
349
+ "mlp1.3.bias": "model-00004-of-00004.safetensors",
350
+ "mlp1.3.weight": "model-00004-of-00004.safetensors",
351
+ "vision_model.embeddings.class_embedding": "model-00001-of-00004.safetensors",
352
+ "vision_model.embeddings.patch_embedding.bias": "model-00001-of-00004.safetensors",
353
+ "vision_model.embeddings.patch_embedding.weight": "model-00001-of-00004.safetensors",
354
+ "vision_model.embeddings.position_embedding": "model-00001-of-00004.safetensors",
355
+ "vision_model.encoder.layers.0.attn.proj.bias": "model-00001-of-00004.safetensors",
356
+ "vision_model.encoder.layers.0.attn.proj.weight": "model-00001-of-00004.safetensors",
357
+ "vision_model.encoder.layers.0.attn.qkv.bias": "model-00001-of-00004.safetensors",
358
+ "vision_model.encoder.layers.0.attn.qkv.weight": "model-00001-of-00004.safetensors",
359
+ "vision_model.encoder.layers.0.ls1": "model-00001-of-00004.safetensors",
360
+ "vision_model.encoder.layers.0.ls2": "model-00001-of-00004.safetensors",
361
+ "vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00004.safetensors",
362
+ "vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00004.safetensors",
363
+ "vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00004.safetensors",
364
+ "vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00004.safetensors",
365
+ "vision_model.encoder.layers.0.norm1.bias": "model-00001-of-00004.safetensors",
366
+ "vision_model.encoder.layers.0.norm1.weight": "model-00001-of-00004.safetensors",
367
+ "vision_model.encoder.layers.0.norm2.bias": "model-00001-of-00004.safetensors",
368
+ "vision_model.encoder.layers.0.norm2.weight": "model-00001-of-00004.safetensors",
369
+ "vision_model.encoder.layers.1.attn.proj.bias": "model-00001-of-00004.safetensors",
370
+ "vision_model.encoder.layers.1.attn.proj.weight": "model-00001-of-00004.safetensors",
371
+ "vision_model.encoder.layers.1.attn.qkv.bias": "model-00001-of-00004.safetensors",
372
+ "vision_model.encoder.layers.1.attn.qkv.weight": "model-00001-of-00004.safetensors",
373
+ "vision_model.encoder.layers.1.ls1": "model-00001-of-00004.safetensors",
374
+ "vision_model.encoder.layers.1.ls2": "model-00001-of-00004.safetensors",
375
+ "vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00004.safetensors",
376
+ "vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00004.safetensors",
377
+ "vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00004.safetensors",
378
+ "vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00004.safetensors",
379
+ "vision_model.encoder.layers.1.norm1.bias": "model-00001-of-00004.safetensors",
380
+ "vision_model.encoder.layers.1.norm1.weight": "model-00001-of-00004.safetensors",
381
+ "vision_model.encoder.layers.1.norm2.bias": "model-00001-of-00004.safetensors",
382
+ "vision_model.encoder.layers.1.norm2.weight": "model-00001-of-00004.safetensors",
383
+ "vision_model.encoder.layers.10.attn.proj.bias": "model-00001-of-00004.safetensors",
384
+ "vision_model.encoder.layers.10.attn.proj.weight": "model-00001-of-00004.safetensors",
385
+ "vision_model.encoder.layers.10.attn.qkv.bias": "model-00001-of-00004.safetensors",
386
+ "vision_model.encoder.layers.10.attn.qkv.weight": "model-00001-of-00004.safetensors",
387
+ "vision_model.encoder.layers.10.ls1": "model-00001-of-00004.safetensors",
388
+ "vision_model.encoder.layers.10.ls2": "model-00001-of-00004.safetensors",
389
+ "vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00004.safetensors",
390
+ "vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00004.safetensors",
391
+ "vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00004.safetensors",
392
+ "vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00004.safetensors",
393
+ "vision_model.encoder.layers.10.norm1.bias": "model-00001-of-00004.safetensors",
394
+ "vision_model.encoder.layers.10.norm1.weight": "model-00001-of-00004.safetensors",
395
+ "vision_model.encoder.layers.10.norm2.bias": "model-00001-of-00004.safetensors",
396
+ "vision_model.encoder.layers.10.norm2.weight": "model-00001-of-00004.safetensors",
397
+ "vision_model.encoder.layers.11.attn.proj.bias": "model-00001-of-00004.safetensors",
398
+ "vision_model.encoder.layers.11.attn.proj.weight": "model-00001-of-00004.safetensors",
399
+ "vision_model.encoder.layers.11.attn.qkv.bias": "model-00001-of-00004.safetensors",
400
+ "vision_model.encoder.layers.11.attn.qkv.weight": "model-00001-of-00004.safetensors",
401
+ "vision_model.encoder.layers.11.ls1": "model-00001-of-00004.safetensors",
402
+ "vision_model.encoder.layers.11.ls2": "model-00001-of-00004.safetensors",
403
+ "vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00004.safetensors",
404
+ "vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00004.safetensors",
405
+ "vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00004.safetensors",
406
+ "vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00004.safetensors",
407
+ "vision_model.encoder.layers.11.norm1.bias": "model-00001-of-00004.safetensors",
408
+ "vision_model.encoder.layers.11.norm1.weight": "model-00001-of-00004.safetensors",
409
+ "vision_model.encoder.layers.11.norm2.bias": "model-00001-of-00004.safetensors",
410
+ "vision_model.encoder.layers.11.norm2.weight": "model-00001-of-00004.safetensors",
411
+ "vision_model.encoder.layers.12.attn.proj.bias": "model-00001-of-00004.safetensors",
412
+ "vision_model.encoder.layers.12.attn.proj.weight": "model-00001-of-00004.safetensors",
413
+ "vision_model.encoder.layers.12.attn.qkv.bias": "model-00001-of-00004.safetensors",
414
+ "vision_model.encoder.layers.12.attn.qkv.weight": "model-00001-of-00004.safetensors",
415
+ "vision_model.encoder.layers.12.ls1": "model-00001-of-00004.safetensors",
416
+ "vision_model.encoder.layers.12.ls2": "model-00001-of-00004.safetensors",
417
+ "vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00004.safetensors",
418
+ "vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00004.safetensors",
419
+ "vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00004.safetensors",
420
+ "vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00004.safetensors",
421
+ "vision_model.encoder.layers.12.norm1.bias": "model-00001-of-00004.safetensors",
422
+ "vision_model.encoder.layers.12.norm1.weight": "model-00001-of-00004.safetensors",
423
+ "vision_model.encoder.layers.12.norm2.bias": "model-00001-of-00004.safetensors",
424
+ "vision_model.encoder.layers.12.norm2.weight": "model-00001-of-00004.safetensors",
425
+ "vision_model.encoder.layers.13.attn.proj.bias": "model-00001-of-00004.safetensors",
426
+ "vision_model.encoder.layers.13.attn.proj.weight": "model-00001-of-00004.safetensors",
427
+ "vision_model.encoder.layers.13.attn.qkv.bias": "model-00001-of-00004.safetensors",
428
+ "vision_model.encoder.layers.13.attn.qkv.weight": "model-00001-of-00004.safetensors",
429
+ "vision_model.encoder.layers.13.ls1": "model-00001-of-00004.safetensors",
430
+ "vision_model.encoder.layers.13.ls2": "model-00001-of-00004.safetensors",
431
+ "vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00004.safetensors",
432
+ "vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00004.safetensors",
433
+ "vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00004.safetensors",
434
+ "vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00004.safetensors",
435
+ "vision_model.encoder.layers.13.norm1.bias": "model-00001-of-00004.safetensors",
436
+ "vision_model.encoder.layers.13.norm1.weight": "model-00001-of-00004.safetensors",
437
+ "vision_model.encoder.layers.13.norm2.bias": "model-00001-of-00004.safetensors",
438
+ "vision_model.encoder.layers.13.norm2.weight": "model-00001-of-00004.safetensors",
439
+ "vision_model.encoder.layers.14.attn.proj.bias": "model-00001-of-00004.safetensors",
440
+ "vision_model.encoder.layers.14.attn.proj.weight": "model-00001-of-00004.safetensors",
441
+ "vision_model.encoder.layers.14.attn.qkv.bias": "model-00001-of-00004.safetensors",
442
+ "vision_model.encoder.layers.14.attn.qkv.weight": "model-00001-of-00004.safetensors",
443
+ "vision_model.encoder.layers.14.ls1": "model-00001-of-00004.safetensors",
444
+ "vision_model.encoder.layers.14.ls2": "model-00001-of-00004.safetensors",
445
+ "vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00004.safetensors",
446
+ "vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00004.safetensors",
447
+ "vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00004.safetensors",
448
+ "vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00004.safetensors",
449
+ "vision_model.encoder.layers.14.norm1.bias": "model-00001-of-00004.safetensors",
450
+ "vision_model.encoder.layers.14.norm1.weight": "model-00001-of-00004.safetensors",
451
+ "vision_model.encoder.layers.14.norm2.bias": "model-00001-of-00004.safetensors",
452
+ "vision_model.encoder.layers.14.norm2.weight": "model-00001-of-00004.safetensors",
453
+ "vision_model.encoder.layers.15.attn.proj.bias": "model-00001-of-00004.safetensors",
454
+ "vision_model.encoder.layers.15.attn.proj.weight": "model-00001-of-00004.safetensors",
455
+ "vision_model.encoder.layers.15.attn.qkv.bias": "model-00001-of-00004.safetensors",
456
+ "vision_model.encoder.layers.15.attn.qkv.weight": "model-00001-of-00004.safetensors",
457
+ "vision_model.encoder.layers.15.ls1": "model-00001-of-00004.safetensors",
458
+ "vision_model.encoder.layers.15.ls2": "model-00001-of-00004.safetensors",
459
+ "vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00004.safetensors",
460
+ "vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00004.safetensors",
461
+ "vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00004.safetensors",
462
+ "vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00004.safetensors",
463
+ "vision_model.encoder.layers.15.norm1.bias": "model-00001-of-00004.safetensors",
464
+ "vision_model.encoder.layers.15.norm1.weight": "model-00001-of-00004.safetensors",
465
+ "vision_model.encoder.layers.15.norm2.bias": "model-00001-of-00004.safetensors",
466
+ "vision_model.encoder.layers.15.norm2.weight": "model-00001-of-00004.safetensors",
467
+ "vision_model.encoder.layers.16.attn.proj.bias": "model-00001-of-00004.safetensors",
468
+ "vision_model.encoder.layers.16.attn.proj.weight": "model-00001-of-00004.safetensors",
469
+ "vision_model.encoder.layers.16.attn.qkv.bias": "model-00001-of-00004.safetensors",
470
+ "vision_model.encoder.layers.16.attn.qkv.weight": "model-00001-of-00004.safetensors",
471
+ "vision_model.encoder.layers.16.ls1": "model-00001-of-00004.safetensors",
472
+ "vision_model.encoder.layers.16.ls2": "model-00001-of-00004.safetensors",
473
+ "vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00004.safetensors",
474
+ "vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00004.safetensors",
475
+ "vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00004.safetensors",
476
+ "vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00004.safetensors",
477
+ "vision_model.encoder.layers.16.norm1.bias": "model-00001-of-00004.safetensors",
478
+ "vision_model.encoder.layers.16.norm1.weight": "model-00001-of-00004.safetensors",
479
+ "vision_model.encoder.layers.16.norm2.bias": "model-00001-of-00004.safetensors",
480
+ "vision_model.encoder.layers.16.norm2.weight": "model-00001-of-00004.safetensors",
481
+ "vision_model.encoder.layers.17.attn.proj.bias": "model-00001-of-00004.safetensors",
482
+ "vision_model.encoder.layers.17.attn.proj.weight": "model-00001-of-00004.safetensors",
483
+ "vision_model.encoder.layers.17.attn.qkv.bias": "model-00001-of-00004.safetensors",
484
+ "vision_model.encoder.layers.17.attn.qkv.weight": "model-00001-of-00004.safetensors",
485
+ "vision_model.encoder.layers.17.ls1": "model-00001-of-00004.safetensors",
486
+ "vision_model.encoder.layers.17.ls2": "model-00001-of-00004.safetensors",
487
+ "vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00004.safetensors",
488
+ "vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00004.safetensors",
489
+ "vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00004.safetensors",
490
+ "vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00004.safetensors",
491
+ "vision_model.encoder.layers.17.norm1.bias": "model-00001-of-00004.safetensors",
492
+ "vision_model.encoder.layers.17.norm1.weight": "model-00001-of-00004.safetensors",
493
+ "vision_model.encoder.layers.17.norm2.bias": "model-00001-of-00004.safetensors",
494
+ "vision_model.encoder.layers.17.norm2.weight": "model-00001-of-00004.safetensors",
495
+ "vision_model.encoder.layers.18.attn.proj.bias": "model-00001-of-00004.safetensors",
496
+ "vision_model.encoder.layers.18.attn.proj.weight": "model-00001-of-00004.safetensors",
497
+ "vision_model.encoder.layers.18.attn.qkv.bias": "model-00001-of-00004.safetensors",
498
+ "vision_model.encoder.layers.18.attn.qkv.weight": "model-00001-of-00004.safetensors",
499
+ "vision_model.encoder.layers.18.ls1": "model-00001-of-00004.safetensors",
500
+ "vision_model.encoder.layers.18.ls2": "model-00001-of-00004.safetensors",
501
+ "vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00004.safetensors",
502
+ "vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00004.safetensors",
503
+ "vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00004.safetensors",
504
+ "vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00004.safetensors",
505
+ "vision_model.encoder.layers.18.norm1.bias": "model-00001-of-00004.safetensors",
506
+ "vision_model.encoder.layers.18.norm1.weight": "model-00001-of-00004.safetensors",
507
+ "vision_model.encoder.layers.18.norm2.bias": "model-00001-of-00004.safetensors",
508
+ "vision_model.encoder.layers.18.norm2.weight": "model-00001-of-00004.safetensors",
509
+ "vision_model.encoder.layers.19.attn.proj.bias": "model-00001-of-00004.safetensors",
510
+ "vision_model.encoder.layers.19.attn.proj.weight": "model-00001-of-00004.safetensors",
511
+ "vision_model.encoder.layers.19.attn.qkv.bias": "model-00001-of-00004.safetensors",
512
+ "vision_model.encoder.layers.19.attn.qkv.weight": "model-00001-of-00004.safetensors",
513
+ "vision_model.encoder.layers.19.ls1": "model-00001-of-00004.safetensors",
514
+ "vision_model.encoder.layers.19.ls2": "model-00001-of-00004.safetensors",
515
+ "vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00004.safetensors",
516
+ "vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00004.safetensors",
517
+ "vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00004.safetensors",
518
+ "vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00004.safetensors",
519
+ "vision_model.encoder.layers.19.norm1.bias": "model-00001-of-00004.safetensors",
520
+ "vision_model.encoder.layers.19.norm1.weight": "model-00001-of-00004.safetensors",
521
+ "vision_model.encoder.layers.19.norm2.bias": "model-00001-of-00004.safetensors",
522
+ "vision_model.encoder.layers.19.norm2.weight": "model-00001-of-00004.safetensors",
523
+ "vision_model.encoder.layers.2.attn.proj.bias": "model-00001-of-00004.safetensors",
524
+ "vision_model.encoder.layers.2.attn.proj.weight": "model-00001-of-00004.safetensors",
525
+ "vision_model.encoder.layers.2.attn.qkv.bias": "model-00001-of-00004.safetensors",
526
+ "vision_model.encoder.layers.2.attn.qkv.weight": "model-00001-of-00004.safetensors",
527
+ "vision_model.encoder.layers.2.ls1": "model-00001-of-00004.safetensors",
528
+ "vision_model.encoder.layers.2.ls2": "model-00001-of-00004.safetensors",
529
+ "vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00004.safetensors",
530
+ "vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00004.safetensors",
531
+ "vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00004.safetensors",
532
+ "vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00004.safetensors",
533
+ "vision_model.encoder.layers.2.norm1.bias": "model-00001-of-00004.safetensors",
534
+ "vision_model.encoder.layers.2.norm1.weight": "model-00001-of-00004.safetensors",
535
+ "vision_model.encoder.layers.2.norm2.bias": "model-00001-of-00004.safetensors",
536
+ "vision_model.encoder.layers.2.norm2.weight": "model-00001-of-00004.safetensors",
537
+ "vision_model.encoder.layers.20.attn.proj.bias": "model-00001-of-00004.safetensors",
538
+ "vision_model.encoder.layers.20.attn.proj.weight": "model-00001-of-00004.safetensors",
539
+ "vision_model.encoder.layers.20.attn.qkv.bias": "model-00001-of-00004.safetensors",
540
+ "vision_model.encoder.layers.20.attn.qkv.weight": "model-00001-of-00004.safetensors",
541
+ "vision_model.encoder.layers.20.ls1": "model-00001-of-00004.safetensors",
542
+ "vision_model.encoder.layers.20.ls2": "model-00001-of-00004.safetensors",
543
+ "vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00004.safetensors",
544
+ "vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00004.safetensors",
545
+ "vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00004.safetensors",
546
+ "vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00004.safetensors",
547
+ "vision_model.encoder.layers.20.norm1.bias": "model-00001-of-00004.safetensors",
548
+ "vision_model.encoder.layers.20.norm1.weight": "model-00001-of-00004.safetensors",
549
+ "vision_model.encoder.layers.20.norm2.bias": "model-00001-of-00004.safetensors",
550
+ "vision_model.encoder.layers.20.norm2.weight": "model-00001-of-00004.safetensors",
551
+ "vision_model.encoder.layers.21.attn.proj.bias": "model-00001-of-00004.safetensors",
552
+ "vision_model.encoder.layers.21.attn.proj.weight": "model-00001-of-00004.safetensors",
553
+ "vision_model.encoder.layers.21.attn.qkv.bias": "model-00001-of-00004.safetensors",
554
+ "vision_model.encoder.layers.21.attn.qkv.weight": "model-00001-of-00004.safetensors",
555
+ "vision_model.encoder.layers.21.ls1": "model-00001-of-00004.safetensors",
556
+ "vision_model.encoder.layers.21.ls2": "model-00001-of-00004.safetensors",
557
+ "vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00004.safetensors",
558
+ "vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00004.safetensors",
559
+ "vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00004.safetensors",
560
+ "vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00004.safetensors",
561
+ "vision_model.encoder.layers.21.norm1.bias": "model-00001-of-00004.safetensors",
562
+ "vision_model.encoder.layers.21.norm1.weight": "model-00001-of-00004.safetensors",
563
+ "vision_model.encoder.layers.21.norm2.bias": "model-00001-of-00004.safetensors",
564
+ "vision_model.encoder.layers.21.norm2.weight": "model-00001-of-00004.safetensors",
565
+ "vision_model.encoder.layers.22.attn.proj.bias": "model-00001-of-00004.safetensors",
566
+ "vision_model.encoder.layers.22.attn.proj.weight": "model-00001-of-00004.safetensors",
567
+ "vision_model.encoder.layers.22.attn.qkv.bias": "model-00001-of-00004.safetensors",
568
+ "vision_model.encoder.layers.22.attn.qkv.weight": "model-00001-of-00004.safetensors",
569
+ "vision_model.encoder.layers.22.ls1": "model-00001-of-00004.safetensors",
570
+ "vision_model.encoder.layers.22.ls2": "model-00001-of-00004.safetensors",
571
+ "vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00004.safetensors",
572
+ "vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00004.safetensors",
573
+ "vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00004.safetensors",
574
+ "vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00004.safetensors",
575
+ "vision_model.encoder.layers.22.norm1.bias": "model-00001-of-00004.safetensors",
576
+ "vision_model.encoder.layers.22.norm1.weight": "model-00001-of-00004.safetensors",
577
+ "vision_model.encoder.layers.22.norm2.bias": "model-00001-of-00004.safetensors",
578
+ "vision_model.encoder.layers.22.norm2.weight": "model-00001-of-00004.safetensors",
579
+ "vision_model.encoder.layers.23.attn.proj.bias": "model-00001-of-00004.safetensors",
580
+ "vision_model.encoder.layers.23.attn.proj.weight": "model-00001-of-00004.safetensors",
581
+ "vision_model.encoder.layers.23.attn.qkv.bias": "model-00001-of-00004.safetensors",
582
+ "vision_model.encoder.layers.23.attn.qkv.weight": "model-00001-of-00004.safetensors",
583
+ "vision_model.encoder.layers.23.ls1": "model-00001-of-00004.safetensors",
584
+ "vision_model.encoder.layers.23.ls2": "model-00001-of-00004.safetensors",
585
+ "vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00004.safetensors",
586
+ "vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00004.safetensors",
587
+ "vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00004.safetensors",
588
+ "vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00004.safetensors",
589
+ "vision_model.encoder.layers.23.norm1.bias": "model-00001-of-00004.safetensors",
590
+ "vision_model.encoder.layers.23.norm1.weight": "model-00001-of-00004.safetensors",
591
+ "vision_model.encoder.layers.23.norm2.bias": "model-00001-of-00004.safetensors",
592
+ "vision_model.encoder.layers.23.norm2.weight": "model-00001-of-00004.safetensors",
593
+ "vision_model.encoder.layers.3.attn.proj.bias": "model-00001-of-00004.safetensors",
594
+ "vision_model.encoder.layers.3.attn.proj.weight": "model-00001-of-00004.safetensors",
595
+ "vision_model.encoder.layers.3.attn.qkv.bias": "model-00001-of-00004.safetensors",
596
+ "vision_model.encoder.layers.3.attn.qkv.weight": "model-00001-of-00004.safetensors",
597
+ "vision_model.encoder.layers.3.ls1": "model-00001-of-00004.safetensors",
598
+ "vision_model.encoder.layers.3.ls2": "model-00001-of-00004.safetensors",
599
+ "vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00004.safetensors",
600
+ "vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00004.safetensors",
601
+ "vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00004.safetensors",
602
+ "vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00004.safetensors",
603
+ "vision_model.encoder.layers.3.norm1.bias": "model-00001-of-00004.safetensors",
604
+ "vision_model.encoder.layers.3.norm1.weight": "model-00001-of-00004.safetensors",
605
+ "vision_model.encoder.layers.3.norm2.bias": "model-00001-of-00004.safetensors",
606
+ "vision_model.encoder.layers.3.norm2.weight": "model-00001-of-00004.safetensors",
607
+ "vision_model.encoder.layers.4.attn.proj.bias": "model-00001-of-00004.safetensors",
608
+ "vision_model.encoder.layers.4.attn.proj.weight": "model-00001-of-00004.safetensors",
609
+ "vision_model.encoder.layers.4.attn.qkv.bias": "model-00001-of-00004.safetensors",
610
+ "vision_model.encoder.layers.4.attn.qkv.weight": "model-00001-of-00004.safetensors",
611
+ "vision_model.encoder.layers.4.ls1": "model-00001-of-00004.safetensors",
612
+ "vision_model.encoder.layers.4.ls2": "model-00001-of-00004.safetensors",
613
+ "vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00004.safetensors",
614
+ "vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00004.safetensors",
615
+ "vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00004.safetensors",
616
+ "vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00004.safetensors",
617
+ "vision_model.encoder.layers.4.norm1.bias": "model-00001-of-00004.safetensors",
618
+ "vision_model.encoder.layers.4.norm1.weight": "model-00001-of-00004.safetensors",
619
+ "vision_model.encoder.layers.4.norm2.bias": "model-00001-of-00004.safetensors",
620
+ "vision_model.encoder.layers.4.norm2.weight": "model-00001-of-00004.safetensors",
621
+ "vision_model.encoder.layers.5.attn.proj.bias": "model-00001-of-00004.safetensors",
622
+ "vision_model.encoder.layers.5.attn.proj.weight": "model-00001-of-00004.safetensors",
623
+ "vision_model.encoder.layers.5.attn.qkv.bias": "model-00001-of-00004.safetensors",
624
+ "vision_model.encoder.layers.5.attn.qkv.weight": "model-00001-of-00004.safetensors",
625
+ "vision_model.encoder.layers.5.ls1": "model-00001-of-00004.safetensors",
626
+ "vision_model.encoder.layers.5.ls2": "model-00001-of-00004.safetensors",
627
+ "vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00004.safetensors",
628
+ "vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00004.safetensors",
629
+ "vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00004.safetensors",
630
+ "vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00004.safetensors",
631
+ "vision_model.encoder.layers.5.norm1.bias": "model-00001-of-00004.safetensors",
632
+ "vision_model.encoder.layers.5.norm1.weight": "model-00001-of-00004.safetensors",
633
+ "vision_model.encoder.layers.5.norm2.bias": "model-00001-of-00004.safetensors",
634
+ "vision_model.encoder.layers.5.norm2.weight": "model-00001-of-00004.safetensors",
635
+ "vision_model.encoder.layers.6.attn.proj.bias": "model-00001-of-00004.safetensors",
636
+ "vision_model.encoder.layers.6.attn.proj.weight": "model-00001-of-00004.safetensors",
637
+ "vision_model.encoder.layers.6.attn.qkv.bias": "model-00001-of-00004.safetensors",
638
+ "vision_model.encoder.layers.6.attn.qkv.weight": "model-00001-of-00004.safetensors",
639
+ "vision_model.encoder.layers.6.ls1": "model-00001-of-00004.safetensors",
640
+ "vision_model.encoder.layers.6.ls2": "model-00001-of-00004.safetensors",
641
+ "vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00004.safetensors",
642
+ "vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00004.safetensors",
643
+ "vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00004.safetensors",
644
+ "vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00004.safetensors",
645
+ "vision_model.encoder.layers.6.norm1.bias": "model-00001-of-00004.safetensors",
646
+ "vision_model.encoder.layers.6.norm1.weight": "model-00001-of-00004.safetensors",
647
+ "vision_model.encoder.layers.6.norm2.bias": "model-00001-of-00004.safetensors",
648
+ "vision_model.encoder.layers.6.norm2.weight": "model-00001-of-00004.safetensors",
649
+ "vision_model.encoder.layers.7.attn.proj.bias": "model-00001-of-00004.safetensors",
650
+ "vision_model.encoder.layers.7.attn.proj.weight": "model-00001-of-00004.safetensors",
651
+ "vision_model.encoder.layers.7.attn.qkv.bias": "model-00001-of-00004.safetensors",
652
+ "vision_model.encoder.layers.7.attn.qkv.weight": "model-00001-of-00004.safetensors",
653
+ "vision_model.encoder.layers.7.ls1": "model-00001-of-00004.safetensors",
654
+ "vision_model.encoder.layers.7.ls2": "model-00001-of-00004.safetensors",
655
+ "vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00004.safetensors",
656
+ "vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00004.safetensors",
657
+ "vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00004.safetensors",
658
+ "vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00004.safetensors",
659
+ "vision_model.encoder.layers.7.norm1.bias": "model-00001-of-00004.safetensors",
660
+ "vision_model.encoder.layers.7.norm1.weight": "model-00001-of-00004.safetensors",
661
+ "vision_model.encoder.layers.7.norm2.bias": "model-00001-of-00004.safetensors",
662
+ "vision_model.encoder.layers.7.norm2.weight": "model-00001-of-00004.safetensors",
663
+ "vision_model.encoder.layers.8.attn.proj.bias": "model-00001-of-00004.safetensors",
664
+ "vision_model.encoder.layers.8.attn.proj.weight": "model-00001-of-00004.safetensors",
665
+ "vision_model.encoder.layers.8.attn.qkv.bias": "model-00001-of-00004.safetensors",
666
+ "vision_model.encoder.layers.8.attn.qkv.weight": "model-00001-of-00004.safetensors",
667
+ "vision_model.encoder.layers.8.ls1": "model-00001-of-00004.safetensors",
668
+ "vision_model.encoder.layers.8.ls2": "model-00001-of-00004.safetensors",
669
+ "vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00004.safetensors",
670
+ "vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00004.safetensors",
671
+ "vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00004.safetensors",
672
+ "vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00004.safetensors",
673
+ "vision_model.encoder.layers.8.norm1.bias": "model-00001-of-00004.safetensors",
674
+ "vision_model.encoder.layers.8.norm1.weight": "model-00001-of-00004.safetensors",
675
+ "vision_model.encoder.layers.8.norm2.bias": "model-00001-of-00004.safetensors",
676
+ "vision_model.encoder.layers.8.norm2.weight": "model-00001-of-00004.safetensors",
677
+ "vision_model.encoder.layers.9.attn.proj.bias": "model-00001-of-00004.safetensors",
678
+ "vision_model.encoder.layers.9.attn.proj.weight": "model-00001-of-00004.safetensors",
679
+ "vision_model.encoder.layers.9.attn.qkv.bias": "model-00001-of-00004.safetensors",
680
+ "vision_model.encoder.layers.9.attn.qkv.weight": "model-00001-of-00004.safetensors",
681
+ "vision_model.encoder.layers.9.ls1": "model-00001-of-00004.safetensors",
682
+ "vision_model.encoder.layers.9.ls2": "model-00001-of-00004.safetensors",
683
+ "vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00004.safetensors",
684
+ "vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00004.safetensors",
685
+ "vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00004.safetensors",
686
+ "vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00004.safetensors",
687
+ "vision_model.encoder.layers.9.norm1.bias": "model-00001-of-00004.safetensors",
688
+ "vision_model.encoder.layers.9.norm1.weight": "model-00001-of-00004.safetensors",
689
+ "vision_model.encoder.layers.9.norm2.bias": "model-00001-of-00004.safetensors",
690
+ "vision_model.encoder.layers.9.norm2.weight": "model-00001-of-00004.safetensors"
691
+ }
692
+ }
internvl3-8b-instruct-lora_epoch10_5e-6/modeling_intern_vit.py ADDED
@@ -0,0 +1,431 @@
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2024 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ from typing import Optional, Tuple, Union
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ import torch.utils.checkpoint
12
+ from einops import rearrange
13
+ from timm.layers import DropPath
14
+ from torch import nn
15
+ from transformers.activations import ACT2FN
16
+ from transformers.modeling_outputs import (BaseModelOutput,
17
+ BaseModelOutputWithPooling)
18
+ from transformers.modeling_utils import PreTrainedModel
19
+ from transformers.utils import logging
20
+
21
+ from .configuration_intern_vit import InternVisionConfig
22
+
23
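+ # FlashAttention is optional; if the import fails, fall back to the naive attention implementation.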
+ try:
24
+ from flash_attn.bert_padding import pad_input, unpad_input
25
+ from flash_attn.flash_attn_interface import \
26
+ flash_attn_varlen_qkvpacked_func
27
+ has_flash_attn = True
28
+ except:
29
+ print('FlashAttention2 is not installed.')
30
+ has_flash_attn = False
31
+
32
+ logger = logging.get_logger(__name__)
33
+
34
+
35
+ class FlashAttention(nn.Module):
36
+ """Implement the scaled dot product attention with softmax.
37
+ Arguments
38
+ ---------
39
+ softmax_scale: The temperature to use for the softmax attention.
40
+ (default: 1/sqrt(d_keys) where d_keys is computed at
41
+ runtime)
42
+ attention_dropout: The dropout rate to apply to the attention
43
+ (default: 0.0)
44
+ """
45
+
46
+ def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None):
47
+ super().__init__()
48
+ self.softmax_scale = softmax_scale
49
+ self.dropout_p = attention_dropout
50
+
51
+ def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None,
52
+ max_s=None, need_weights=False):
53
+ """Implements the multihead softmax attention.
54
+ Arguments
55
+ ---------
56
+ qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
57
+ if unpadded: (nnz, 3, h, d)
58
+ key_padding_mask: a bool tensor of shape (B, S)
59
+ """
60
+ assert not need_weights
61
+ assert qkv.dtype in [torch.float16, torch.bfloat16]
62
+ assert qkv.is_cuda
63
+
64
+ if cu_seqlens is None:
65
+ batch_size = qkv.shape[0]
66
+ seqlen = qkv.shape[1]
67
+ if key_padding_mask is None:
68
+ qkv = rearrange(qkv, 'b s ... -> (b s) ...')
69
+ max_s = seqlen
70
+ cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
71
+ device=qkv.device)
72
+ output = flash_attn_varlen_qkvpacked_func(
73
+ qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
74
+ softmax_scale=self.softmax_scale, causal=causal
75
+ )
76
+ output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
77
+ else:
78
+ nheads = qkv.shape[-2]
79
+ x = rearrange(qkv, 'b s three h d -> b s (three h d)')
80
+ x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
81
+ x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
82
+ output_unpad = flash_attn_varlen_qkvpacked_func(
83
+ x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
84
+ softmax_scale=self.softmax_scale, causal=causal
85
+ )
86
+ output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
87
+ indices, batch_size, seqlen),
88
+ 'b s (h d) -> b s h d', h=nheads)
89
+ else:
90
+ assert max_s is not None
91
+ output = flash_attn_varlen_qkvpacked_func(
92
+ qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
93
+ softmax_scale=self.softmax_scale, causal=causal
94
+ )
95
+
96
+ return output, None
97
+
98
+
99
+ class InternRMSNorm(nn.Module):
100
+ def __init__(self, hidden_size, eps=1e-6):
101
+ super().__init__()
102
+ self.weight = nn.Parameter(torch.ones(hidden_size))
103
+ self.variance_epsilon = eps
104
+
105
+ def forward(self, hidden_states):
106
+ input_dtype = hidden_states.dtype
107
+ hidden_states = hidden_states.to(torch.float32)
108
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
109
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
110
+ return self.weight * hidden_states.to(input_dtype)
111
+
112
+
113
+ try:
114
+ from apex.normalization import FusedRMSNorm
115
+
116
+ InternRMSNorm = FusedRMSNorm # noqa
117
+
118
+ logger.info('Discovered apex.normalization.FusedRMSNorm - will use it instead of InternRMSNorm')
119
+ except ImportError:
120
+ # using the normal InternRMSNorm
121
+ pass
122
+ except Exception:
123
+ logger.warning('discovered apex but it failed to load, falling back to InternRMSNorm')
124
+ pass
125
+
126
+
127
+ NORM2FN = {
128
+ 'rms_norm': InternRMSNorm,
129
+ 'layer_norm': nn.LayerNorm,
130
+ }
131
+
132
+
133
+ class InternVisionEmbeddings(nn.Module):
134
+ def __init__(self, config: InternVisionConfig):
135
+ super().__init__()
136
+ self.config = config
137
+ self.embed_dim = config.hidden_size
138
+ self.image_size = config.image_size
139
+ self.patch_size = config.patch_size
140
+
141
+ self.class_embedding = nn.Parameter(
142
+ torch.randn(1, 1, self.embed_dim),
143
+ )
144
+
145
+ self.patch_embedding = nn.Conv2d(
146
+ in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
147
+ )
148
+
149
+ self.num_patches = (self.image_size // self.patch_size) ** 2
150
+ self.num_positions = self.num_patches + 1
151
+
152
+ self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
153
+
154
+ def _get_pos_embed(self, pos_embed, H, W):
155
+ target_dtype = pos_embed.dtype
156
+ pos_embed = pos_embed.float().reshape(
157
+ 1, self.image_size // self.patch_size, self.image_size // self.patch_size, -1).permute(0, 3, 1, 2)
158
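+ # bicubically resize the learned position grid to the current H x W patch grid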
+ pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False). \
159
+ reshape(1, -1, H * W).permute(0, 2, 1).to(target_dtype)
160
+ return pos_embed
161
+
162
+ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
163
+ target_dtype = self.patch_embedding.weight.dtype
164
+ patch_embeds = self.patch_embedding(pixel_values) # shape = [*, channel, width, height]
165
+ batch_size, _, height, width = patch_embeds.shape
166
+ patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
167
+ class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
168
+ embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
169
+ position_embedding = torch.cat([
170
+ self.position_embedding[:, :1, :],
171
+ self._get_pos_embed(self.position_embedding[:, 1:, :], height, width)
172
+ ], dim=1)
173
+ embeddings = embeddings + position_embedding.to(target_dtype)
174
+ return embeddings
175
+
176
+
177
+ class InternAttention(nn.Module):
178
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
179
+
180
+ def __init__(self, config: InternVisionConfig):
181
+ super().__init__()
182
+ self.config = config
183
+ self.embed_dim = config.hidden_size
184
+ self.num_heads = config.num_attention_heads
185
+ self.use_flash_attn = config.use_flash_attn and has_flash_attn
186
+ if config.use_flash_attn and not has_flash_attn:
187
+ print('Warning: Flash Attention is not available, use_flash_attn is set to False.')
188
+ self.head_dim = self.embed_dim // self.num_heads
189
+ if self.head_dim * self.num_heads != self.embed_dim:
190
+ raise ValueError(
191
+ f'embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:'
192
+ f' {self.num_heads}).'
193
+ )
194
+
195
+ self.scale = self.head_dim ** -0.5
196
+ self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=config.qkv_bias)
197
+ self.attn_drop = nn.Dropout(config.attention_dropout)
198
+ self.proj_drop = nn.Dropout(config.dropout)
199
+
200
+ self.qk_normalization = config.qk_normalization
201
+
202
+ if self.qk_normalization:
203
+ self.q_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
204
+ self.k_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
205
+
206
+ if self.use_flash_attn:
207
+ self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout)
208
+ self.proj = nn.Linear(self.embed_dim, self.embed_dim)
209
+
210
+ def _naive_attn(self, x):
211
+ B, N, C = x.shape
212
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
213
+ q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
214
+
215
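+ # when qk_normalization is enabled, RMS-normalize q and k over the flattened (heads x head_dim) dimension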
+ if self.qk_normalization:
216
+ B_, H_, N_, D_ = q.shape
217
+ q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
218
+ k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
219
+
220
+ attn = ((q * self.scale) @ k.transpose(-2, -1))
221
+ attn = attn.softmax(dim=-1)
222
+ attn = self.attn_drop(attn)
223
+
224
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
225
+ x = self.proj(x)
226
+ x = self.proj_drop(x)
227
+ return x
228
+
229
+ def _flash_attn(self, x, key_padding_mask=None, need_weights=False):
230
+ qkv = self.qkv(x)
231
+ qkv = rearrange(qkv, 'b s (three h d) -> b s three h d', three=3, h=self.num_heads)
232
+
233
+ if self.qk_normalization:
234
+ q, k, v = qkv.unbind(2)
235
+ q = self.q_norm(q.flatten(-2, -1)).view(q.shape)
236
+ k = self.k_norm(k.flatten(-2, -1)).view(k.shape)
237
+ qkv = torch.stack([q, k, v], dim=2)
238
+
239
+ context, _ = self.inner_attn(
240
+ qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False
241
+ )
242
+ outs = self.proj(rearrange(context, 'b s h d -> b s (h d)'))
243
+ outs = self.proj_drop(outs)
244
+ return outs
245
+
246
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
247
+ x = self._naive_attn(hidden_states) if not self.use_flash_attn else self._flash_attn(hidden_states)
248
+ return x
249
+
250
+
251
+ class InternMLP(nn.Module):
252
+ def __init__(self, config: InternVisionConfig):
253
+ super().__init__()
254
+ self.config = config
255
+ self.act = ACT2FN[config.hidden_act]
256
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
257
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
258
+
259
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
260
+ hidden_states = self.fc1(hidden_states)
261
+ hidden_states = self.act(hidden_states)
262
+ hidden_states = self.fc2(hidden_states)
263
+ return hidden_states
264
+
265
+
266
+ class InternVisionEncoderLayer(nn.Module):
267
+ def __init__(self, config: InternVisionConfig, drop_path_rate: float):
268
+ super().__init__()
269
+ self.embed_dim = config.hidden_size
270
+ self.intermediate_size = config.intermediate_size
271
+ self.norm_type = config.norm_type
272
+
273
+ self.attn = InternAttention(config)
274
+ self.mlp = InternMLP(config)
275
+ self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
276
+ self.norm2 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
277
+
278
+ self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
279
+ self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
280
+ self.drop_path1 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
281
+ self.drop_path2 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
282
+
283
+ def forward(
284
+ self,
285
+ hidden_states: torch.Tensor,
286
+ ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[Tuple[torch.FloatTensor]]]:
287
+ """
288
+ Args:
289
+ hidden_states (`Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]`): input to the layer of shape `(batch, seq_len, embed_dim)`
290
+ """
291
+ hidden_states = hidden_states + self.drop_path1(self.attn(self.norm1(hidden_states).to(hidden_states.dtype)) * self.ls1)
292
+
293
+ hidden_states = hidden_states + self.drop_path2(self.mlp(self.norm2(hidden_states).to(hidden_states.dtype)) * self.ls2)
294
+
295
+ return hidden_states
296
+
297
+
298
+ class InternVisionEncoder(nn.Module):
299
+ """
300
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
301
+ [`InternEncoderLayer`].
302
+
303
+ Args:
304
+ config (`InternConfig`):
305
+ The corresponding vision configuration for the `InternEncoder`.
306
+ """
307
+
308
+ def __init__(self, config: InternVisionConfig):
309
+ super().__init__()
310
+ self.config = config
311
+ # stochastic depth decay rule
312
+ dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
313
+ self.layers = nn.ModuleList([
314
+ InternVisionEncoderLayer(config, dpr[idx]) for idx in range(config.num_hidden_layers)])
315
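+ # activation (gradient) checkpointing is enabled by default and applied per encoder layer during training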
+ self.gradient_checkpointing = True
316
+
317
+ def forward(
318
+ self,
319
+ inputs_embeds,
320
+ output_hidden_states: Optional[bool] = None,
321
+ return_dict: Optional[bool] = None,
322
+ ) -> Union[Tuple, BaseModelOutput]:
323
+ r"""
324
+ Args:
325
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
326
+ Embedded representation of the inputs. Should be float, not int tokens.
327
+ output_hidden_states (`bool`, *optional*):
328
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
329
+ for more detail.
330
+ return_dict (`bool`, *optional*):
331
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
332
+ """
333
+ output_hidden_states = (
334
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
335
+ )
336
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
337
+
338
+ encoder_states = () if output_hidden_states else None
339
+ hidden_states = inputs_embeds
340
+
341
+ for idx, encoder_layer in enumerate(self.layers):
342
+ if output_hidden_states:
343
+ encoder_states = encoder_states + (hidden_states,)
344
+ if self.gradient_checkpointing and self.training:
345
+ layer_outputs = torch.utils.checkpoint.checkpoint(
346
+ encoder_layer,
347
+ hidden_states)
348
+ else:
349
+ layer_outputs = encoder_layer(
350
+ hidden_states,
351
+ )
352
+ hidden_states = layer_outputs
353
+
354
+ if output_hidden_states:
355
+ encoder_states = encoder_states + (hidden_states,)
356
+
357
+ if not return_dict:
358
+ return tuple(v for v in [hidden_states, encoder_states] if v is not None)
359
+ return BaseModelOutput(
360
+ last_hidden_state=hidden_states, hidden_states=encoder_states
361
+ )
362
+
363
+
364
+ class InternVisionModel(PreTrainedModel):
365
+ main_input_name = 'pixel_values'
366
+ _supports_flash_attn_2 = True
367
+ supports_gradient_checkpointing = True
368
+ config_class = InternVisionConfig
369
+ _no_split_modules = ['InternVisionEncoderLayer']
370
+
371
+ def __init__(self, config: InternVisionConfig):
372
+ super().__init__(config)
373
+ self.config = config
374
+
375
+ self.embeddings = InternVisionEmbeddings(config)
376
+ self.encoder = InternVisionEncoder(config)
377
+
378
+ def resize_pos_embeddings(self, old_size, new_size, patch_size):
379
+ pos_emb = self.embeddings.position_embedding
380
+ _, num_positions, embed_dim = pos_emb.shape
381
+ cls_emb = pos_emb[:, :1, :]
382
+ pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, old_size // patch_size, -1).permute(0, 3, 1, 2)
383
+ pos_emb = F.interpolate(pos_emb.float(), size=new_size // patch_size, mode='bicubic', align_corners=False)
384
+ pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, -1).permute(0, 2, 1)
385
+ pos_emb = torch.cat([cls_emb, pos_emb], dim=1)
386
+ self.embeddings.position_embedding = nn.Parameter(pos_emb)
387
+ self.embeddings.image_size = new_size
388
+ logger.info('Resized position embeddings from {} to {}'.format(old_size, new_size))
389
+
390
+ def get_input_embeddings(self):
391
+ return self.embeddings
392
+
393
+ def forward(
394
+ self,
395
+ pixel_values: Optional[torch.FloatTensor] = None,
396
+ output_hidden_states: Optional[bool] = None,
397
+ return_dict: Optional[bool] = None,
398
+ pixel_embeds: Optional[torch.FloatTensor] = None,
399
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
400
+ output_hidden_states = (
401
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
402
+ )
403
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
404
+
405
+ if pixel_values is None and pixel_embeds is None:
406
+ raise ValueError('You have to specify pixel_values or pixel_embeds')
407
+
408
+ if pixel_embeds is not None:
409
+ hidden_states = pixel_embeds
410
+ else:
411
+ if len(pixel_values.shape) == 4:
412
+ hidden_states = self.embeddings(pixel_values)
413
+ else:
414
+ raise ValueError(f'wrong pixel_values size: {pixel_values.shape}')
415
+ encoder_outputs = self.encoder(
416
+ inputs_embeds=hidden_states,
417
+ output_hidden_states=output_hidden_states,
418
+ return_dict=return_dict,
419
+ )
420
+ last_hidden_state = encoder_outputs.last_hidden_state
421
+ pooled_output = last_hidden_state[:, 0, :]
422
+
423
+ if not return_dict:
424
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
425
+
426
+ return BaseModelOutputWithPooling(
427
+ last_hidden_state=last_hidden_state,
428
+ pooler_output=pooled_output,
429
+ hidden_states=encoder_outputs.hidden_states,
430
+ attentions=encoder_outputs.attentions,
431
+ )
internvl3-8b-instruct-lora_epoch10_5e-6/modeling_internvl_chat_cd.py ADDED
@@ -0,0 +1,1198 @@
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2024 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ import warnings
8
+ from typing import List, Optional, Tuple, Union
9
+
10
+ import torch
11
+ import torch.utils.checkpoint
12
+ import transformers
13
+ from torch import nn
14
+ from torch.nn import CrossEntropyLoss
15
+ from transformers import (AutoModel, GenerationConfig, LlamaForCausalLM)
16
+ from .modeling_qwen2_cd import Qwen2ForCausalLM
17
+ # from transformers import (AutoModel, GenerationConfig, LlamaForCausalLM,
18
+ # Qwen2ForCausalLM)
19
+ from transformers.modeling_outputs import CausalLMOutputWithPast
20
+ from transformers.modeling_utils import PreTrainedModel
21
+ from transformers.utils import ModelOutput, logging
22
+
23
+ from .configuration_internvl_chat import InternVLChatConfig
24
+ from .conversation import get_conv_template
25
+ from .modeling_intern_vit import InternVisionModel, has_flash_attn
26
+ import re
27
+ import copy
28
+
29
+
30
+ logger = logging.get_logger(__name__)
31
+
32
+
33
+ def version_cmp(v1, v2, op='eq'):
34
+ import operator
35
+
36
+ from packaging import version
37
+ op_func = getattr(operator, op)
38
+ return op_func(version.parse(v1), version.parse(v2))
39
+
40
+
41
+ class InternVLChatModel(PreTrainedModel):
42
+ config_class = InternVLChatConfig
43
+ main_input_name = 'pixel_values'
44
+ base_model_prefix = 'language_model'
45
+ _supports_flash_attn_2 = True
46
+ supports_gradient_checkpointing = True
47
+ _no_split_modules = ['InternVisionModel', 'LlamaDecoderLayer', 'Qwen2DecoderLayer']
48
+
49
+ def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None, use_flash_attn=True):
50
+ super().__init__(config)
51
+
52
+ assert version_cmp(transformers.__version__, '4.37.0', 'ge')
53
+ image_size = config.force_image_size or config.vision_config.image_size
54
+ patch_size = config.vision_config.patch_size
55
+ self.patch_size = patch_size
56
+ self.select_layer = config.select_layer
57
+ self.template = config.template
58
+ self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2))
59
+ self.downsample_ratio = config.downsample_ratio
60
+ self.ps_version = config.ps_version
61
+ use_flash_attn = use_flash_attn if has_flash_attn else False
62
+ config.vision_config.use_flash_attn = True if use_flash_attn else False
63
+ config.llm_config._attn_implementation = 'flash_attention_2' if use_flash_attn else 'eager'
64
+
65
+ logger.info(f'num_image_token: {self.num_image_token}')
66
+ logger.info(f'ps_version: {self.ps_version}')
67
+ if vision_model is not None:
68
+ self.vision_model = vision_model
69
+ else:
70
+ self.vision_model = InternVisionModel(config.vision_config)
71
+ if language_model is not None:
72
+ self.language_model = language_model
73
+ else:
74
+ if config.llm_config.architectures[0] == 'LlamaForCausalLM':
75
+ self.language_model = LlamaForCausalLM(config.llm_config)
76
+ elif config.llm_config.architectures[0] == 'Qwen2ForCausalLM':
77
+ self.language_model = Qwen2ForCausalLM(config.llm_config)
78
+ else:
79
+ raise NotImplementedError(f'{config.llm_config.architectures[0]} is not implemented.')
80
+
81
+ vit_hidden_size = config.vision_config.hidden_size
82
+ llm_hidden_size = config.llm_config.hidden_size
83
+
84
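+ # two-layer MLP projector that maps pixel-shuffled ViT features into the LLM embedding space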
+ self.mlp1 = nn.Sequential(
85
+ nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
86
+ nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size),
87
+ nn.GELU(),
88
+ nn.Linear(llm_hidden_size, llm_hidden_size)
89
+ )
90
+
91
+ self.img_context_token_id = None
92
+ self.conv_template = get_conv_template(self.template)
93
+ self.system_message = self.conv_template.system_message
94
+
95
+ def forward(
96
+ self,
97
+ pixel_values: torch.FloatTensor,
98
+ input_ids: torch.LongTensor = None,
99
+ attention_mask: Optional[torch.Tensor] = None,
100
+ position_ids: Optional[torch.LongTensor] = None,
101
+ image_flags: Optional[torch.LongTensor] = None,
102
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
103
+ labels: Optional[torch.LongTensor] = None,
104
+ use_cache: Optional[bool] = None,
105
+ output_attentions: Optional[bool] = None,
106
+ output_hidden_states: Optional[bool] = None,
107
+ return_dict: Optional[bool] = None,
108
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
109
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
110
+
111
+ image_flags = image_flags.squeeze(-1)
112
+ input_embeds = self.language_model.get_input_embeddings()(input_ids).clone()
113
+
114
+ vit_embeds = self.extract_feature(pixel_values)
115
+ vit_embeds = vit_embeds[image_flags == 1]
116
+ vit_batch_size = pixel_values.shape[0]
117
+
118
+ B, N, C = input_embeds.shape
119
+ input_embeds = input_embeds.reshape(B * N, C)
120
+
121
+ if torch.distributed.is_initialized() and torch.distributed.get_rank() == 0:
122
+ print(f'dynamic ViT batch size: {vit_batch_size}, images per sample: {vit_batch_size / B}, dynamic token length: {N}')
123
+
124
+ input_ids = input_ids.reshape(B * N)
125
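+ # overwrite the <IMG_CONTEXT> placeholder embeddings with the projected ViT features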
+ selected = (input_ids == self.img_context_token_id)
126
+ try:
127
+ input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds.reshape(-1, C)
128
+ except Exception as e:
129
+ vit_embeds = vit_embeds.reshape(-1, C)
130
+ print(f'warning: {e}, input_embeds[selected].shape={input_embeds[selected].shape}, '
131
+ f'vit_embeds.shape={vit_embeds.shape}')
132
+ n_token = min(selected.sum(), vit_embeds.size(0))
133
+ input_embeds[selected][:n_token] = input_embeds[selected][:n_token] * 0.0 + vit_embeds[:n_token]
134
+
135
+ input_embeds = input_embeds.reshape(B, N, C)
136
+
137
+ outputs = self.language_model(
138
+ inputs_embeds=input_embeds,
139
+ attention_mask=attention_mask,
140
+ position_ids=position_ids,
141
+ past_key_values=past_key_values,
142
+ use_cache=use_cache,
143
+ output_attentions=output_attentions,
144
+ output_hidden_states=output_hidden_states,
145
+ return_dict=return_dict,
146
+ )
147
+ logits = outputs.logits
148
+
149
+ loss = None
150
+ if labels is not None:
151
+ # Shift so that tokens < n predict n
152
+ shift_logits = logits[..., :-1, :].contiguous()
153
+ shift_labels = labels[..., 1:].contiguous()
154
+ # Flatten the tokens
155
+ loss_fct = CrossEntropyLoss()
156
+ shift_logits = shift_logits.view(-1, self.language_model.config.vocab_size)
157
+ shift_labels = shift_labels.view(-1)
158
+ # Enable model parallelism
159
+ shift_labels = shift_labels.to(shift_logits.device)
160
+ loss = loss_fct(shift_logits, shift_labels)
161
+
162
+ if not return_dict:
163
+ output = (logits,) + outputs[1:]
164
+ return (loss,) + output if loss is not None else output
165
+
166
+ return CausalLMOutputWithPast(
167
+ loss=loss,
168
+ logits=logits,
169
+ past_key_values=outputs.past_key_values,
170
+ hidden_states=outputs.hidden_states,
171
+ attentions=outputs.attentions,
172
+ )
173
+
174
+ def pixel_shuffle(self, x, scale_factor=0.5):
175
+ n, w, h, c = x.size()
176
+ # N, W, H, C --> N, W, H * scale, C // scale
177
+ x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
178
+ # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
179
+ x = x.permute(0, 2, 1, 3).contiguous()
180
+ # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
181
+ x = x.view(n, int(h * scale_factor), int(w * scale_factor),
182
+ int(c / (scale_factor * scale_factor)))
183
+ if self.ps_version == 'v1':
184
+ warnings.warn("In ps_version 'v1', the height and width have not been swapped back, "
185
+ 'which results in a transposed image.')
186
+ else:
187
+ x = x.permute(0, 2, 1, 3).contiguous()
188
+ return x
189
+
190
+ def extract_feature(self, pixel_values):
191
+ if self.select_layer == -1:
192
+ vit_embeds = self.vision_model(
193
+ pixel_values=pixel_values,
194
+ output_hidden_states=False,
195
+ return_dict=True).last_hidden_state
196
+ else:
197
+ vit_embeds = self.vision_model(
198
+ pixel_values=pixel_values,
199
+ output_hidden_states=True,
200
+ return_dict=True).hidden_states[self.select_layer]
201
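+ # drop the CLS token and keep only the patch tokens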
+ vit_embeds = vit_embeds[:, 1:, :]
202
+
203
+ h = w = int(vit_embeds.shape[1] ** 0.5)
204
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
205
+ vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
206
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
207
+ vit_embeds = self.mlp1(vit_embeds)
208
+ return vit_embeds
209
+
210
+ def get_mask_img(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False,
211
+ num_patches_list=None, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>', IMG_CONTEXT_TOKEN='<IMG_CONTEXT>',
212
+ verbose=False,test_mcd=False):
213
+
214
+ if history is None and pixel_values is not None and '<image>' not in question:
215
+ question = '<image>\n' + question
216
+
217
+ if num_patches_list is None:
218
+ num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else []
219
+ assert pixel_values is None or len(pixel_values) == sum(num_patches_list)
220
+
221
+ img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
222
+ self.img_context_token_id = img_context_token_id
223
+
224
+ template = get_conv_template(self.template)
225
+ template.system_message = self.system_message
226
+ eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
227
+
228
+ history = [] if history is None else history
229
+ for (old_question, old_answer) in history:
230
+ template.append_message(template.roles[0], old_question)
231
+ template.append_message(template.roles[1], old_answer)
232
+ template.append_message(template.roles[0], question)
233
+ template.append_message(template.roles[1], None)
234
+ query = template.get_prompt()
235
+
236
+ if verbose and pixel_values is not None:
237
+ image_bs = pixel_values.shape[0]
238
+ print(f'dynamic ViT batch size: {image_bs}')
239
+
240
+ for num_patches in num_patches_list:
241
+ image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
242
+ query = query.replace('<image>', image_tokens, 1)
243
+
244
+ model_inputs = tokenizer(
245
+ query,
246
+ return_tensors='pt',
247
+ padding=True,
248
+ )
249
+ input_ids = model_inputs['input_ids'].to(self.device)
250
+ attention_mask = model_inputs['attention_mask'].to(self.device)
251
+
252
+ if test_mcd:
253
+ vit_embeds = self.extract_feature(pixel_values)
254
+ B_img = vit_embeds.shape[0]  # vit_embeds: (B_img, 256, 3584), e.g. B_img = 13
255
+ test_text = self.get_text_only_embeds(input_ids)
256
+ only_text_expand = test_text.expand(B_img, -1, -1)  # broadcast to [B_img, N_text, C]
257
+
258
+ importance_scores_all = []
259
+
260
+ for i in range(B_img):
261
+ vit_embeds_i = vit_embeds[i] # (N_patches, C)
262
+ text_i = only_text_expand[i]  # (N_text, C); the same text embedding for every sub-image
263
+
264
+ # L2-normalize patch and text embeddings
265
+ vit_embeds_i = vit_embeds_i / vit_embeds_i.norm(dim=-1, keepdim=True)
266
+ text_i = text_i / text_i.norm(dim=-1, keepdim=True)
267
+
268
+ # cosine similarity between image patches and text tokens
269
+ similarity = vit_embeds_i @ text_i.T # (N_patches, N_text)
270
+
271
+ """Original approach"""
272
+ # importance score per patch: mean similarity over all text tokens
273
+ importance_scores = similarity.mean(dim=1) # (N_patches,)
274
+ importance_scores_all.append(importance_scores)
275
+
276
+
277
+ topk_indices_all = []
278
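+ # keep the top self.pr fraction (highest scores) plus the bottom (self.overall_pr - self.pr) fraction (lowest scores);
+ # self.pr and self.overall_pr are assumed to be set on the model elsewhere.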
+ for importance_scores in importance_scores_all:
279
+ topk_r = int(importance_scores.shape[0] * self.pr)
280
+ topk_ir = int(importance_scores.shape[0] * (self.overall_pr - self.pr))
281
+
282
+ topk_indices_r = importance_scores.topk(topk_r, largest=True).indices
283
+ topk_indices_ir = importance_scores.topk(topk_ir, largest=False).indices
284
+
285
+ topk_indices = torch.cat((topk_indices_r, topk_indices_ir), dim=0)
286
+
287
+ topk_indices_all.append(topk_indices)
288
+
289
+ return topk_indices_all
290
+
291
+
292
+ def batch_chat(self, tokenizer, pixel_values, questions, generation_config, num_patches_list=None,
293
+ history=None, return_history=False, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>',
294
+ IMG_CONTEXT_TOKEN='<IMG_CONTEXT>', verbose=False, image_counts=None):
295
+ if history is not None or return_history:
296
+ print('Now multi-turn chat is not supported in batch_chat.')
297
+ raise NotImplementedError
298
+
299
+ if image_counts is not None:
300
+ num_patches_list = image_counts
301
+ print('Warning: `image_counts` is deprecated. Please use `num_patches_list` instead.')
302
+
303
+ img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
304
+ self.img_context_token_id = img_context_token_id
305
+
306
+ if verbose and pixel_values is not None:
307
+ image_bs = pixel_values.shape[0]
308
+ print(f'dynamic ViT batch size: {image_bs}')
309
+
310
+ queries = []
311
+ for idx, num_patches in enumerate(num_patches_list):
312
+ question = questions[idx]
313
+ if pixel_values is not None and '<image>' not in question:
314
+ question = '<image>\n' + question
315
+ template = get_conv_template(self.template)
316
+ template.system_message = self.system_message
317
+ template.append_message(template.roles[0], question)
318
+ template.append_message(template.roles[1], None)
319
+ query = template.get_prompt()
320
+
321
+ image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
322
+ query = query.replace('<image>', image_tokens, 1)
323
+ queries.append(query)
324
+
325
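+ # left-pad the batch so every prompt ends at the same position and generation starts right after it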
+ tokenizer.padding_side = 'left'
326
+ model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
327
+ input_ids = model_inputs['input_ids'].to(self.device)
328
+ attention_mask = model_inputs['attention_mask'].to(self.device)
329
+ eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
330
+ generation_config['eos_token_id'] = eos_token_id
331
+ generation_output = self.generate(
332
+ pixel_values=pixel_values,
333
+ input_ids=input_ids,
334
+ attention_mask=attention_mask,
335
+ **generation_config
336
+ )
337
+ responses = tokenizer.batch_decode(generation_output, skip_special_tokens=True)
338
+ responses = [response.split(template.sep.strip())[0].strip() for response in responses]
339
+ return responses
340
+
341
+ def chat_noV(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False,
342
+ num_patches_list=None, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>', IMG_CONTEXT_TOKEN='<IMG_CONTEXT>',
343
+ verbose=False,test_mcd=False):
344
+
345
+
346
+ template = get_conv_template(self.template)
347
+ template.system_message = self.system_message
348
+ eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
349
+
350
+ history = [] if history is None else history
351
+ for (old_question, old_answer) in history:
352
+ template.append_message(template.roles[0], old_question)
353
+ template.append_message(template.roles[1], old_answer)
354
+ template.append_message(template.roles[0], question)
355
+ template.append_message(template.roles[1], None)
356
+ query = template.get_prompt()
357
+ query_mcd = query[:]
358
+
359
+
360
+ model_inputs = tokenizer(
361
+ query,
362
+ return_tensors='pt',
363
+ padding=True,
364
+ )
365
+ input_ids = model_inputs['input_ids'].to(self.device)
366
+ attention_mask = model_inputs['attention_mask'].to(self.device)
367
+
368
+ img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
369
+ self.img_context_token_id = img_context_token_id
370
+
371
+
372
+ generation_config['eos_token_id'] = eos_token_id
373
+ generation_output = self.generate(
374
+ input_ids=input_ids,
375
+ attention_mask=attention_mask,
376
+ **generation_config
377
+ )
378
+ if isinstance(generation_output, torch.Tensor):
379
+ generation_output = generation_output
380
+ response = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
381
+ else:
382
+ response = tokenizer.batch_decode(generation_output.sequences, skip_special_tokens=True)[0]
383
+ response = response.split(template.sep.strip())[0].strip()
384
+ history.append((question, response))
385
+ if return_history:
386
+ return response, history
387
+ else:
388
+ query_to_print = query.replace(IMG_CONTEXT_TOKEN, '')
389
+ query_to_print = query_to_print.replace(f'{IMG_START_TOKEN}{IMG_END_TOKEN}', '<image>')
390
+ if verbose:
391
+ print(query_to_print, response)
392
+ return response
393
+
394
+ def original_chat(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False,
395
+ num_patches_list=None, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>', IMG_CONTEXT_TOKEN='<IMG_CONTEXT>',
396
+ verbose=False,test_mcd=False):
397
+
398
+ if history is None and pixel_values is not None and '<image>' not in question:
399
+ question = '<image>\n' + question
400
+
401
+ if num_patches_list is None:
402
+ num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else []
403
+ assert pixel_values is None or len(pixel_values) == sum(num_patches_list)
404
+
405
+ img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
406
+ self.img_context_token_id = img_context_token_id
407
+
408
+ template = get_conv_template(self.template)
409
+ template.system_message = self.system_message
410
+ eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
411
+
412
+ history = [] if history is None else history
413
+ for (old_question, old_answer) in history:
414
+ template.append_message(template.roles[0], old_question)
415
+ template.append_message(template.roles[1], old_answer)
416
+ template.append_message(template.roles[0], question)
417
+ template.append_message(template.roles[1], None)
418
+ query = template.get_prompt()
419
+ query_mcd = query[:]
420
+
421
+ if verbose and pixel_values is not None:
422
+ image_bs = pixel_values.shape[0]
423
+ print(f'dynamic ViT batch size: {image_bs}')
424
+
425
+ for num_patches in num_patches_list:
426
+ image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
427
+ query = query.replace('<image>', image_tokens, 1)
428
+
429
+ model_inputs = tokenizer(
430
+ query,
431
+ return_tensors='pt',
432
+ padding=True,
433
+ )
434
+ input_ids = model_inputs['input_ids'].to(self.device)
435
+ attention_mask = model_inputs['attention_mask'].to(self.device)
436
+
437
+ generation_config['eos_token_id'] = eos_token_id
438
+ generation_output = self.generate(
439
+ pixel_values=pixel_values,
440
+ input_ids=input_ids,
441
+ attention_mask=attention_mask,
442
+ test_mcd=test_mcd,
443
+ **generation_config
444
+ )
445
+ if isinstance(generation_output, torch.Tensor):
446
+ generation_output = generation_output
447
+ response = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
448
+ else:
449
+ response = tokenizer.batch_decode(generation_output.sequences, skip_special_tokens=True)[0]
450
+ response = response.split(template.sep.strip())[0].strip()
451
+ history.append((question, response))
452
+ if return_history:
453
+ return response, history
454
+ else:
455
+ query_to_print = query.replace(IMG_CONTEXT_TOKEN, '')
456
+ query_to_print = query_to_print.replace(f'{IMG_START_TOKEN}{IMG_END_TOKEN}', '<image>')
457
+ if verbose:
458
+ print(query_to_print, response)
459
+ return response
460
+
461
+ def chat(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False,
462
+ num_patches_list=None, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>', IMG_CONTEXT_TOKEN='<IMG_CONTEXT>',
463
+ verbose=False,
464
+ icd_sp_temp=None, lcd_qs=None, scd_qs=None, pixel_values_vcd=None, mcd=False, sid=False, pure_text=None, pixel_values_notn=None, only_l=False, original_sp=None, one_attn=False):
465
+
466
+
467
+ if history is None and pixel_values is not None and '<image>' not in question:
468
+ question = '<image>\n' + question
469
+
470
+ if num_patches_list is None:
471
+ num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else []
472
+ assert pixel_values is None or len(pixel_values) == sum(num_patches_list)
473
+
474
+ img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
475
+ self.img_context_token_id = img_context_token_id
476
+
477
+ template = get_conv_template(self.template)
478
+ template.system_message = self.system_message
479
+ eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
480
+
481
+ history = [] if history is None else history
482
+ for (old_question, old_answer) in history:
483
+ template.append_message(template.roles[0], old_question)
484
+ template.append_message(template.roles[1], old_answer)
485
+ template.append_message(template.roles[0], question)
486
+ template.append_message(template.roles[1], None)
487
+ query = template.get_prompt()
488
+ query_mcd = query[:]
489
+
490
+ if verbose and pixel_values is not None:
491
+ image_bs = pixel_values.shape[0]
492
+ print(f'dynamic ViT batch size: {image_bs}')
493
+
494
+ for num_patches in num_patches_list:
495
+ image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
496
+ query = query.replace('<image>', image_tokens, 1)
497
+
498
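+ # The branches below prepare paired inputs (original query vs. a perturbed variant) for the
+ # contrastive-decoding modes handled in generate(); the exact semantics of icd/lcd/scd/mcd are
+ # assumed from the argument names.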
+ """icd"""
499
+ if icd_sp_temp is not None:
500
+ # query_icd = re.sub(r'(<\|im_start\|>system\n).*?(<\|im_end\|>)', fr'\1{icd_sp_temp}\2', query, flags=re.DOTALL)
501
+ # query_icd = re.sub(r'(<\|im_start\|>system\n).*?(<\|im_end\|>)', fr'\1{"ignore"}\2', query, flags=re.DOTALL)
502
+ query_icd = re.sub(r'(</img>\n).*?(<\|im_end\|>)', fr'\1{icd_sp_temp}\2', query, flags=re.DOTALL)
503
+
504
+ queries = [query, query_icd]
505
+ model_inputs = tokenizer(
506
+ queries,
507
+ padding="longest", # pad dynamically to the longer of the two queries
508
+ return_tensors='pt',
509
+ )
510
+
511
+ # split back into the original and the ICD query
512
+ input_ids = model_inputs['input_ids'][0].unsqueeze(0).to(self.device)
513
+ attention_mask = model_inputs['attention_mask'][0].unsqueeze(0).to(self.device)
514
+
515
+ input_ids_icd = model_inputs['input_ids'][1].unsqueeze(0).to(self.device)
516
+ attention_mask_icd = model_inputs['attention_mask'][1].unsqueeze(0).to(self.device)
517
+ if lcd_qs is not None:
518
+ l_qs = lcd_qs[0]
519
+ l_sp_temp = lcd_qs[1]
520
+ query_lcd = re.sub(r'(<\|im_start\|>system\n).*?(<\|im_end\|>)', fr'\1{l_sp_temp}\2', query, flags=re.DOTALL)
521
+ query_lcd = re.sub(r'(</img>\n).*?(<\|im_end\|>)', fr'\1{l_qs}\2', query_lcd, flags=re.DOTALL)
522
+
523
+ queries = [query, query_lcd]
524
+ model_inputs = tokenizer(
525
+ queries,
526
+ padding="longest", # pad dynamically to the longer of the two queries
527
+ return_tensors='pt',
528
+ )
529
+
530
+ # split back into the original and the LCD query
531
+ input_ids = model_inputs['input_ids'][0].unsqueeze(0).to(self.device)
532
+ attention_mask = model_inputs['attention_mask'][0].unsqueeze(0).to(self.device)
533
+
534
+ input_ids_lcd = model_inputs['input_ids'][1].unsqueeze(0).to(self.device)
535
+ attention_mask_lcd = model_inputs['attention_mask'][1].unsqueeze(0).to(self.device)
536
+ if scd_qs is not None:
537
+
538
+ query_scd = re.sub(r'(</img>\n).*?(<\|im_end\|>)', fr'\1{scd_qs}\2', query, flags=re.DOTALL)
539
+
540
+ queries = [query, query_scd]
541
+ model_inputs = tokenizer(
542
+ queries,
543
+ padding="longest", # pad dynamically to the longer of the two queries
544
+ return_tensors='pt',
545
+ )
546
+
547
+ # split back into the original and the SCD query
548
+ input_ids = model_inputs['input_ids'][0].unsqueeze(0).to(self.device)
549
+ attention_mask = model_inputs['attention_mask'][0].unsqueeze(0).to(self.device)
550
+
551
+ input_ids_scd = model_inputs['input_ids'][1].unsqueeze(0).to(self.device)
552
+ attention_mask_scd = model_inputs['attention_mask'][1].unsqueeze(0).to(self.device)
553
+
554
+ if mcd:
555
+ # only_img = self.extract_feature(pixel_values) # 13, 256, 3584
556
+ # print(only_img.shape)
557
+ # pure_inputs = tokenizer(pure_text, return_tensors='pt')
558
+ # pure_input_ids = pure_inputs['input_ids'].to(self.device)
559
+ # only_text = self.language_model.get_input_embeddings()(pure_input_ids)
560
+
561
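+ # pixel_values_notn is assumed to be an alternative/perturbed image; its query is tokenized
+ # together with the original so both share the same padded length.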
+ num_patches_list_notn = [pixel_values_notn.shape[0]] if pixel_values_notn is not None else []
562
+ assert pixel_values_notn is None or len(pixel_values_notn) == sum(num_patches_list_notn)
563
+ for num_patches in num_patches_list_notn:
564
+ image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
565
+ query_mcd = query_mcd.replace('<image>', image_tokens, 1)
566
+ queries = [query, query_mcd]
567
+ model_inputs = tokenizer(
568
+ queries,
569
+ padding="longest", # pad dynamically to the longer of the two queries
570
+ return_tensors='pt',
571
+ )
572
+ input_ids = model_inputs['input_ids'][0].unsqueeze(0).to(self.device)
573
+ attention_mask = model_inputs['attention_mask'][0].unsqueeze(0).to(self.device)
574
+
575
+ input_ids_mcd = model_inputs['input_ids'][1].unsqueeze(0).to(self.device)
576
+ attention_mask_mcd = model_inputs['attention_mask'][1].unsqueeze(0).to(self.device)
577
+ only_text = self.get_text_only_embeds(input_ids_mcd)
578
+
579
+
580
+ if only_l:
581
+ template = get_conv_template(self.template)
582
+ template.system_message = original_sp
583
+ eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
584
+
585
+ history = [] if history is None else history
586
+ for (old_question, old_answer) in history:
587
+ template.append_message(template.roles[0], old_question)
588
+ template.append_message(template.roles[1], old_answer)
589
+ template.append_message(template.roles[0], question)
590
+ template.append_message(template.roles[1], None)
591
+ query_text = template.get_prompt()
592
+
593
+ queries = [query, query_text]
594
+ model_inputs = tokenizer(
595
+ queries,
596
+ padding="longest", # pad dynamically to the longer of the two queries
597
+ return_tensors='pt',
598
+ )
599
+ input_ids = model_inputs['input_ids'][0].unsqueeze(0).to(self.device)
600
+ attention_mask = model_inputs['attention_mask'][0].unsqueeze(0).to(self.device)
601
+
602
+ input_ids_text = model_inputs['input_ids'][1].unsqueeze(0).to(self.device)
603
+ attention_mask_text = model_inputs['attention_mask'][1].unsqueeze(0).to(self.device)
604
+
605
+ if one_attn:
606
+ oa_query = "<image>\n" + pure_text
607
+ for num_patches in num_patches_list:
608
+ image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
609
+ oa_query = oa_query.replace('<image>', image_tokens, 1)
610
+
611
+ queries = [query, oa_query]
612
+ model_inputs = tokenizer(
613
+ queries,
614
+ padding="longest", # pad dynamically to the longer of the two queries
615
+ return_tensors='pt',
616
+ )
617
+ input_ids = model_inputs['input_ids'][0].unsqueeze(0).to(self.device)
618
+ attention_mask = model_inputs['attention_mask'][0].unsqueeze(0).to(self.device)
619
+
620
+ input_ids_oa = model_inputs['input_ids'][1].unsqueeze(0).to(self.device)
621
+ attention_mask_oa = model_inputs['attention_mask'][1].unsqueeze(0).to(self.device)
622
+
623
+ # text_ids = tokenizer(pure_text, return_tensors='pt')
624
+ # input_ids_text = text_ids['input_ids'].to(self.device)
625
+ # input_ids_oa = input_ids_oa.squeeze(0).tolist() # [N]
626
+ # input_ids_text = input_ids_text.squeeze(0).tolist() # [M]
627
+
628
+ # for i in range(len(input_ids_oa) - len(input_ids_text) + 1):
629
+ # if input_ids_oa[i:i+len(input_ids_text)] == input_ids_text:
630
+ # print(i, i + len(input_ids_text))  # end index exclusive
631
+
632
+
633
+ else:
634
+ model_inputs = tokenizer(
635
+ query,
636
+ return_tensors='pt',
637
+ padding=True,
638
+ )
639
+ input_ids = model_inputs['input_ids'].to(self.device)
640
+ attention_mask = model_inputs['attention_mask'].to(self.device)
641
+
642
+
643
+
644
+
645
+ generation_config['eos_token_id'] = eos_token_id
646
+ generation_output = self.generate(
647
+ pixel_values=pixel_values,
648
+ input_ids=input_ids,
649
+ attention_mask=attention_mask,
650
+ input_ids_icd=input_ids_icd if icd_sp_temp is not None else None,
651
+ attention_mask_icd=attention_mask_icd if icd_sp_temp is not None else None,
652
+ input_ids_lcd=input_ids_lcd if lcd_qs is not None else None,
653
+ attention_mask_lcd=attention_mask_lcd if lcd_qs is not None else None,
654
+ input_ids_scd=input_ids_scd if scd_qs is not None else None,
655
+ attention_mask_scd=attention_mask_scd if scd_qs is not None else None,
656
+ pixel_values_vcd=pixel_values_vcd,
657
+ only_text=only_text if mcd else None,
658
+ pixel_values_notn=pixel_values_notn,
659
+ input_ids_mcd = input_ids_mcd if mcd else None,
660
+ attention_mask_mcd = attention_mask_mcd if mcd else None,
661
+ sid = sid,
662
+ input_ids_text = input_ids_text if only_l else None,
663
+ attention_mask_text = attention_mask_text if only_l else None,
664
+ input_ids_oa = input_ids_oa if one_attn else None,
665
+ attention_mask_oa = attention_mask_oa if one_attn else None,
666
+ **generation_config
667
+ )
668
+ if isinstance(generation_output, torch.Tensor):
669
+ generation_output = generation_output
670
+ response = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
671
+ else:
672
+ response = tokenizer.batch_decode(generation_output.sequences, skip_special_tokens=True)[0]
673
+ response = response.split(template.sep.strip())[0].strip()
674
+ history.append((question, response))
675
+ if return_history:
676
+ return response, history
677
+ else:
678
+ query_to_print = query.replace(IMG_CONTEXT_TOKEN, '')
679
+ query_to_print = query_to_print.replace(f'{IMG_START_TOKEN}{IMG_END_TOKEN}', '<image>')
680
+ if verbose:
681
+ print(query_to_print, response)
682
+ return response
683
+
684
+ def process_input_embeds(self, input_ids, vit_embeds):
685
+ input_embeds = self.language_model.get_input_embeddings()(input_ids)
686
+ B, N, C = input_embeds.shape
687
+ input_embeds = input_embeds.reshape(B * N, C)
688
+
689
+ input_ids = input_ids.reshape(B * N)
690
+ selected = (input_ids == self.img_context_token_id)
691
+ assert selected.sum() != 0
692
+ input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
693
+
694
+ input_embeds = input_embeds.reshape(B, N, C)
695
+ return input_embeds
696
+
697
+ def get_image_position(self, input_ids, img_context_token_id):
698
+ """
699
+ Return the start and end positions of the image tokens (end is exclusive, suitable for slicing).
700
+ input_ids: [B, N]
701
+ """
702
+ B, N = input_ids.shape
703
+ input_ids_flat = input_ids.reshape(B * N)
704
+ selected = (input_ids_flat == img_context_token_id)
705
+ assert selected.sum() > 0, "No image context tokens found"
706
+
707
+ image_positions = torch.nonzero(selected, as_tuple=False).squeeze(-1)
708
+ image_start = image_positions[0].item()
709
+ image_end = image_positions[-1].item() + 1 # +1 makes the end exclusive (suitable for slicing)
710
+
711
+ return (image_start, image_end)
712
+
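A minimal sketch of the span logic in get_image_position above, on toy token ids (the image-context id 9 is made up for illustration); it relies on the image tokens being contiguous:

import torch

img_context_token_id = 9  # hypothetical id, for illustration only
input_ids = torch.tensor([[1, 2, 9, 9, 9, 4, 5]])  # [B=1, N=7]

flat = input_ids.reshape(-1)
positions = torch.nonzero(flat == img_context_token_id, as_tuple=False).squeeze(-1)
start, end = positions[0].item(), positions[-1].item() + 1  # half-open span [2, 5)
assert flat[start:end].eq(img_context_token_id).all()
print(start, end)  # 2 5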
713
+ def draw(self, importance_scores_all):
714
+ import matplotlib.pyplot as plt
715
+ import numpy as np
716
+
717
+ for i, scores in enumerate(importance_scores_all):
718
+ scores_np = scores.float().cpu().numpy()
719
+
720
+ plt.figure(figsize=(8, 4))
721
+ plt.hist(scores_np, bins=50, color='skyblue', edgecolor='black')
722
+ plt.title(f"Distribution of Patch Importance Scores - Image {i+1}")
723
+ plt.xlabel("Importance Score")
724
+ plt.ylabel("Frequency")
725
+ plt.grid(True)
726
+
727
+ # Save the figure
728
+ save_path = f"importance_scores_distribution_img_{i+1}.png"
729
+ plt.savefig(save_path, dpi=300)
730
+ plt.close()
731
+
732
+ plt.figure(figsize=(10, 5))
733
+
734
+ colors = ['skyblue', 'salmon', 'lightgreen', 'orange', 'purple']
735
+ bins = 50
736
+
737
+ for i, scores in enumerate(importance_scores_all):
738
+ scores_np = scores.float().detach().cpu().numpy()
739
+ plt.hist(scores_np, bins=bins, alpha=0.5, color=colors[i % len(colors)], label=f'Image {i+1}')
740
+
741
+ # Compute the 90th-percentile threshold for this image
742
+ threshold = np.percentile(scores_np, 90)
743
+ plt.axvline(x=threshold, linestyle='--', color=colors[i % len(colors)], label=f'Img {i+1} 90%: {threshold:.2f}')
744
+
745
+ plt.title("Importance Score Distributions Across Images")
746
+ plt.xlabel("Importance Score")
747
+ plt.ylabel("Frequency")
748
+ plt.legend()
749
+ plt.grid(True)
750
+ plt.tight_layout()
751
+ # plt.show()
752
+ plt.savefig("importance_scores_distribution_all.png", dpi=300)
753
+
754
+ def mask_topk_subimages_by_score(
755
+ self,
756
+ importance_scores_all, # List[Tensor], importance scores for each sub-image: [N_patches]
757
+ img_token_indices_per_img, # Tensor, shape: [B_img, N_patches], positions of the image tokens of each sub-image
758
+ attention_mask, # Tensor, shape: [B, N]
759
+ top_ratio: float = 0.2, # fraction of sub-images to select, e.g. 0.2 selects the top 20%
760
+ agg: str = 'max' # aggregation method: 'max' or 'min'
761
+ ):
762
+ """
763
+ Compute a summary score for each sub-image and mask the top_ratio fraction of sub-images it selects.
764
+
765
+ Returns:
766
+ attention_mask_mcd: the updated attention_mask
767
+ selected_indices: positions of the masked tokens (1D indices)
768
+ top_subimage_indices: indices of the selected sub-images
769
+ """
770
+
771
+ B_img = len(importance_scores_all)
772
+ device = attention_mask.device
773
+
774
+ num_top_subimages = max(1, int(B_img * top_ratio))
775
+ # Step 1: Aggregate a score for each sub-image and pick the top sub-images
776
+ if agg == 'max':
777
+ subimage_scores = torch.tensor(
778
+ [scores.max().item() for scores in importance_scores_all],
779
+ device=device
780
+ )
781
+ top_subimage_indices = subimage_scores.topk(num_top_subimages, largest=True).indices
782
+ elif agg == 'min':
783
+ subimage_scores = torch.tensor(
784
+ [scores.min().item() for scores in importance_scores_all],
785
+ device=device
786
+ )
787
+ top_subimage_indices = subimage_scores.topk(num_top_subimages, largest=False).indices
788
+ else:
789
+ raise ValueError(f"Unsupported agg method: {agg}. Use 'max' or 'min'.")
790
+
791
+ # Step 2: Collect the indices of all tokens to be masked
792
+ selected_indices = []
793
+ for idx in top_subimage_indices:
794
+ patch_token_indices = img_token_indices_per_img[idx] # [N_patches]
795
+ selected_indices.append(patch_token_indices)
796
+ selected_indices = torch.cat(selected_indices, dim=0) # [Total_masked_tokens]
797
+
798
+ # Step 3: Zero out the corresponding tokens in the attention mask
799
+ attn_flat = attention_mask.reshape(-1).clone()
800
+ attn_flat[selected_indices] = 0
801
+ attention_mask_mcd = attn_flat.reshape_as(attention_mask)
802
+
803
+ return attention_mask_mcd, selected_indices, top_subimage_indices
804
+
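A small sanity check of the sub-image selection above, with made-up scores for two sub-images of three patches each; it mirrors the 'max' aggregation path and the attention-mask write-back:

import torch

scores_all = [torch.tensor([0.1, 0.9, 0.2]), torch.tensor([0.3, 0.2, 0.1])]  # toy scores
img_token_indices = torch.tensor([[2, 3, 4], [5, 6, 7]])  # toy positions in the flattened sequence
attention_mask = torch.ones(1, 10, dtype=torch.long)

num_top = max(1, int(len(scores_all) * 0.5))              # top_ratio = 0.5 -> 1 sub-image
sub_scores = torch.stack([s.max() for s in scores_all])   # agg = 'max'
top_idx = sub_scores.topk(num_top, largest=True).indices  # -> sub-image 0
masked = torch.cat([img_token_indices[i] for i in top_idx])
flat = attention_mask.reshape(-1).clone()
flat[masked] = 0
print(flat.reshape_as(attention_mask))  # positions 2..4 are zeroed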
805
+ def mask_patch_below_mean(
806
+ self,
807
+ importance_scores_all, # List[Tensor], patch importance scores for each sub-image
808
+ img_token_indices_per_img, # Tensor, shape: [B_img, N_patches]
809
+ attention_mask, # Tensor, shape: [B, N]
810
+ direction # 'mask_r' masks patches above the per-image mean; any other value masks those below
811
+ ):
812
+ """
813
+ For each sub-image, zero the attention mask of patch vision tokens whose score is above ('mask_r') or below (otherwise) that sub-image's mean.
815
+
816
+ Returns:
817
+ attention_mask_mcd: the updated attention_mask
818
+ selected_indices: indices of the masked vision tokens (1D indices)
818
+ """
819
+ device = attention_mask.device
820
+ selected_indices = []
821
+
822
+ for img_idx, scores in enumerate(importance_scores_all):
823
+ patch_indices = img_token_indices_per_img[img_idx] # shape: [N_patches]
824
+ scores = scores.to(device)
825
+
826
+ threshold = scores.mean()
827
+ if direction == 'mask_r':
828
+ mask = scores > threshold # shape: [N_patches], bool
829
+ else:
830
+ mask = scores < threshold # shape: [N_patches], bool
831
+
832
+ selected = patch_indices[mask] # token indices to be masked
833
+ selected_indices.append(selected)
834
+
835
+ selected_indices = torch.cat(selected_indices, dim=0)
836
+
837
+ # Update the attention mask
838
+ attn_flat = attention_mask.reshape(-1).clone()
839
+ attn_flat[selected_indices] = 0
840
+ attention_mask_mcd = attn_flat.reshape_as(attention_mask)
841
+
842
+ return attention_mask_mcd, selected_indices
843
+
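A toy illustration of the mean-threshold rule above: with scores [0.1, 0.5, 0.9] the mean is 0.5, so 'mask_r' hides the above-mean patch while the other direction hides the below-mean one (positions are hypothetical):

import torch

scores = torch.tensor([0.1, 0.5, 0.9])
patch_indices = torch.tensor([4, 5, 6])  # hypothetical token positions

threshold = scores.mean()                     # 0.5
masked_r = patch_indices[scores > threshold]  # direction == 'mask_r' -> position 6
masked_i = patch_indices[scores < threshold]  # otherwise             -> position 4
print(masked_r.tolist(), masked_i.tolist())   # [6] [4]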
844
+ def mask_patch_by_topk_and_bottomk(
845
+ self,
846
+ importance_scores_all, # List[Tensor], patch importance scores for each sub-image
847
+ img_token_indices_per_img, # Tensor, shape: [B_img, patch_num], positions of the image tokens
848
+ attention_mask, # Tensor, shape: [B, N]
849
+ topk_ratio: float = 0.1,
850
+ overall_ratio: float = 0.2
851
+ ):
852
+ """
853
+ For each sub-image, select the topk_ratio highest-scoring and the bottom-scoring patches and mask them out.
854
+
855
+ Returns:
856
+ attention_mask_mcd: the updated attention_mask
857
+ selected_indices: positions of the masked vision tokens (1D)
858
+ topk_indices_all: per-sub-image indices of the selected patches (original patch positions)
859
+ """
860
+ device = attention_mask.device
861
+ B_img = len(importance_scores_all)
862
+ selected_indices = []
863
+ topk_indices_all = []
864
+
865
+ for img_idx in range(B_img):
866
+ scores = importance_scores_all[img_idx]
867
+ patch_token_indices = img_token_indices_per_img[img_idx] # shape: [N_patches]
868
+
869
+ topk = int(scores.shape[0] * topk_ratio)
870
+ bottomk = int(scores.shape[0] * (overall_ratio - topk_ratio))
871
+
872
+ topk_indices = scores.topk(topk, largest=True).indices
873
+ bottomk_indices = scores.topk(bottomk, largest=False).indices
874
+
875
+ combined = torch.cat([topk_indices, bottomk_indices], dim=0) # [K+K']
876
+ topk_indices_all.append(combined)
877
+
878
+ selected = patch_token_indices[combined]
879
+ selected_indices.append(selected)
880
+
881
+ selected_indices = torch.cat(selected_indices, dim=0)
882
+
883
+ # Update the attention mask
884
+ attn_flat = attention_mask.reshape(-1).clone()
885
+ attn_flat[selected_indices] = 0
886
+ attention_mask_mcd = attn_flat.reshape_as(attention_mask)
887
+
888
+ return attention_mask_mcd, selected_indices, topk_indices_all
889
+
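A quick walk-through of the top-k / bottom-k split above, with made-up numbers: for 10 patches, topk_ratio=0.1 and overall_ratio=0.2 mask one highest- and one lowest-scoring patch.

import torch

scores = torch.arange(10, dtype=torch.float)  # toy importance scores 0..9
topk_ratio, overall_ratio = 0.1, 0.2

topk = int(scores.shape[0] * topk_ratio)                       # 1
bottomk = int(scores.shape[0] * (overall_ratio - topk_ratio))  # 1
combined = torch.cat([scores.topk(topk, largest=True).indices,
                      scores.topk(bottomk, largest=False).indices])
print(combined.tolist())  # [9, 0] -> both extremes are masked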
890
+ @torch.no_grad()
891
+ def generate(
892
+ self,
893
+ pixel_values: Optional[torch.FloatTensor] = None,
894
+ input_ids: Optional[torch.FloatTensor] = None,
895
+ attention_mask: Optional[torch.LongTensor] = None,
896
+ visual_features: Optional[torch.FloatTensor] = None,
897
+ generation_config: Optional[GenerationConfig] = None,
898
+ output_hidden_states: Optional[bool] = None,
899
+ input_ids_icd: Optional[torch.FloatTensor] = None,
900
+ attention_mask_icd: Optional[torch.LongTensor] = None,
901
+ input_ids_lcd: Optional[torch.FloatTensor] = None,
902
+ attention_mask_lcd: Optional[torch.LongTensor] = None,
903
+ input_ids_scd: Optional[torch.FloatTensor] = None,
904
+ attention_mask_scd: Optional[torch.LongTensor] = None,
905
+ pixel_values_vcd: Optional[torch.FloatTensor] = None,
906
+ only_text: Optional[torch.FloatTensor] = None,
907
+ sid: Optional[bool] = None,
908
+ pixel_values_notn: Optional[torch.FloatTensor] = None,
909
+ input_ids_mcd: Optional[torch.FloatTensor] = None,
910
+ attention_mask_mcd: Optional[torch.LongTensor] = None,
911
+ test_mcd: Optional[bool] = None,
912
+ input_ids_text: Optional[torch.FloatTensor] = None,
913
+ attention_mask_text: Optional[torch.LongTensor] = None,
914
+ input_ids_oa: Optional[torch.FloatTensor] = None,
915
+ attention_mask_oa: Optional[torch.LongTensor] = None,
916
+ **generate_kwargs,
917
+ ) -> torch.LongTensor:
918
+
919
+ assert self.img_context_token_id is not None
920
+ if pixel_values is not None:
921
+ if visual_features is not None:
922
+ vit_embeds = visual_features
923
+ else:
924
+ vit_embeds = self.extract_feature(pixel_values)
925
+
926
+ input_embeds = self.process_input_embeds(input_ids, vit_embeds)
927
+ # print(pixel_values.shape)
928
+ # print(vit_embeds.shape)
929
+ image_position = self.get_image_position(input_ids, self.img_context_token_id)
930
+
931
+ """vcd"""
932
+ if pixel_values_vcd is not None:
933
+ if visual_features is not None:
934
+ vit_embeds_vcd = visual_features
935
+ else:
936
+ vit_embeds_vcd = self.extract_feature(pixel_values_vcd)
937
+ input_embeds_vcd = self.process_input_embeds(input_ids, vit_embeds_vcd)
938
+
939
+ """icd"""
940
+ if input_ids_icd is not None:
941
+ input_embeds_icd = self.process_input_embeds(input_ids_icd, vit_embeds)
942
+
943
+ """lcd"""
944
+ if input_ids_lcd is not None:
945
+ input_embeds_lcd = self.process_input_embeds(input_ids_lcd, vit_embeds)
946
+ """scd"""
947
+ if input_ids_scd is not None:
948
+ input_embeds_scd = self.process_input_embeds(input_ids_scd, vit_embeds)
949
+ # """mcd one"""
950
+ # if only_text is not None:
951
+ # B, N, C = only_text.shape
952
+ # only_text = only_text.reshape(B*N, C)
953
+ # vit_embeds_mcd = vit_embeds.clone()
954
+ # vit_embeds_mcd = vit_embeds_mcd.reshape(-1, C)
955
+ # only_text = only_text / only_text.norm(dim=-1, keepdim=True)
956
+ # vit_embeds_mcd = vit_embeds_mcd / vit_embeds_mcd.norm(dim=-1, keepdim=True)
957
+
958
+ # similarities = (vit_embeds_mcd @ only_text.T) # (B, B) 或 (batch_text, batch_image)
959
+ # importance_scores = similarities.mean(dim=1)
960
+
961
+ # overall_pr = self.overall_pr
962
+ # pr = self.pr
963
+ # topk_r = int(vit_embeds_mcd.shape[0]*pr)
964
+ # topk_ir = int(vit_embeds_mcd.shape[0]*(overall_pr-pr))
965
+ # topk_indices_r = importance_scores.topk(topk_r, largest=True).indices
966
+ # topk_indices_ir = importance_scores.topk(topk_ir, largest=False).indices
967
+ # topk_indices = torch.cat((topk_indices_r, topk_indices_ir), dim=-1)
968
+
969
+ # input_ids_flat = input_ids.reshape(-1) # [B*N]
970
+ # img_token_mask = (input_ids_flat == self.img_context_token_id) # [B*N], True/False
971
+
972
+ # img_token_indices = img_token_mask.nonzero(as_tuple=True)[0] # 拿到是图像token的位置(1D索引)
973
+ # assert (input_ids_flat[img_token_indices] == self.img_context_token_id).all(), "Error: img_token_indices 不全是图像token!"
974
+ # selected_indices = img_token_indices[topk_indices] # 在input_embeds展平后的位置
975
+
976
+ # """软置零"""
977
+ # # input_embeds_flat = input_embeds.reshape(-1, input_embeds.shape[-1]) # [B*N, C]
978
+
979
+ # # # expected_embeds = vit_embeds.reshape(-1, C)[topk_indices] # shape: [topk, C]
980
+ # # # # 计算差异
981
+ # # # selected_embeds = input_embeds_flat[selected_indices] # shape: [topk, C]
982
+ # # # diff = (selected_embeds - expected_embeds).abs().max()
983
+
984
+ # # # print(f"最大绝对差异: {diff.item()}")
985
+
986
+ # # # 将这些位置置零
987
+ # # input_embeds_flat[selected_indices] = 0
988
+
989
+ # # # 恢复回 [B, N, C]
990
+ # # input_embeds_mcd = input_embeds_flat.reshape(input_embeds.shape)
991
+
992
+ # """直接不关注"""
993
+ # input_embeds_mcd = input_embeds.clone()
994
+ # attn_flat = attention_mask.reshape(-1).clone() # clone() 防止原地写破坏梯度
995
+ # attn_flat[selected_indices] = 0
996
+ # attention_mask_mcd = attn_flat.reshape_as(attention_mask)
997
+
998
+ """mcd sep"""
999
+ if only_text is not None:
1000
+ B, N, C = only_text.shape
1001
+ # vit_embeds_mcd = vit_embeds.clone()
1002
+ vit_embeds_mcd = self.extract_feature(pixel_values_notn)
1003
+ # print("vit_embeds_mcd.shape", vit_embeds_mcd.shape)
1004
+ # print("vit_embeds.shape", vit_embeds.shape)
1005
+ B_img = vit_embeds_mcd.shape[0] # B_img = 13, 256, 3584
1006
+ only_text_expand = only_text.expand(B_img, -1, -1) # broadcast to [B_img, N_text, C]
1007
+
1008
+ importance_scores_all = []
1009
+
1010
+ for i in range(B_img):
1011
+ vit_embeds_i = vit_embeds_mcd[i] # (N_patches, C)
1012
+ text_i = only_text_expand[i] # (N_text, C), same copy for every sub-image
1013
+
1014
+ # L2-normalize
1015
+ vit_embeds_i = vit_embeds_i / vit_embeds_i.norm(dim=-1, keepdim=True)
1016
+ text_i = text_i / text_i.norm(dim=-1, keepdim=True)
1017
+
1018
+ # Cosine similarity
1019
+ similarity = vit_embeds_i @ text_i.T # (N_patches, N_text)
1020
+
1021
+ """原始做法"""
1022
+ # importance打分
1023
+ importance_scores = similarity.mean(dim=1) # (N_patches,)
1024
+ importance_scores_all.append(importance_scores)
1025
+
1026
+ """先选token,把token对图片相似性作为重要性分数"""
1027
+ # token_importance = similarity.mean(dim=0) # shape: (N_text,)
1028
+
1029
+ # most_token_idx = token_importance.argmax()
1030
+ # # most_token_idx = token_importance.argmin()
1031
+
1032
+ # importance_scores = similarity[:, most_token_idx] # shape: (N_patches,)
1033
+
1034
+ # importance_scores_all.append(importance_scores)
1035
+
1036
+ input_ids_flat = input_ids_mcd.reshape(-1) # [B*N]
1037
+ img_token_mask = (input_ids_flat == self.img_context_token_id) # [B*N], True/False
1038
+
1039
+ img_token_indices = img_token_mask.nonzero(as_tuple=True)[0] # positions of image tokens (1D indices)
1040
+ assert (input_ids_flat[img_token_indices] == self.img_context_token_id).all(), "Error: img_token_indices contains non-image tokens!"
1041
+ img_token_indices_per_img = img_token_indices.reshape(B_img, -1) # [13, patch_per_img]
1042
+ input_embeds_mcd = self.process_input_embeds(input_ids_mcd, vit_embeds_mcd)
1043
+
1044
+ # img_token_indices splits into B_img chunks, one per augmented sub-image
1045
+ """Sub-image-level masking (disabled)"""
1046
+ # attention_mask_mcd, selected_indices, top_subimage_indices = self.mask_topk_subimages_by_score(
1047
+ # importance_scores_all=importance_scores_all,
1048
+ # img_token_indices_per_img=img_token_indices_per_img,
1049
+ # attention_mask=attention_mask_mcd,
1050
+ # top_ratio=self.overall_pr - self.pr, # 前 20%
1051
+ # agg='min' # 也可以用 'mean'
1052
+ # )
1053
+
1054
+
1055
+ """topk"""
1056
+ attention_mask_mcd, selected_indices, topk_indices_all = self.mask_patch_by_topk_and_bottomk(
1057
+ importance_scores_all=importance_scores_all,
1058
+ img_token_indices_per_img=img_token_indices_per_img,
1059
+ attention_mask=attention_mask_mcd,
1060
+ topk_ratio=self.pr,
1061
+ overall_ratio=self.overall_pr
1062
+ )
1063
+
1064
+ """mask均值"""
1065
+ # attention_mask_mcd, selected_indices = self.mask_patch_below_mean(
1066
+ # importance_scores_all=importance_scores_all,
1067
+ # img_token_indices_per_img=img_token_indices_per_img,
1068
+ # attention_mask=attention_mask_mcd,
1069
+ # direction= "mask_r" if self.pr > 0 else "mask_i" # mask_r: mask掉大于均值的, mask_i: mask掉小于均值的
1070
+ # )
1071
+
1072
+
1073
+ if test_mcd:
1074
+ B_img = vit_embeds.shape[0] # B_img = 13, 256, 3584
1075
+ test_text = self.get_text_only_embeds(input_ids)
1076
+ only_text_expand = test_text.expand(B_img, -1, -1) # broadcast to [B_img, N_text, C]
1077
+
1078
+ importance_scores_all = []
1079
+
1080
+ for i in range(B_img):
1081
+ vit_embeds_i = vit_embeds[i] # (N_patches, C)
1082
+ text_i = only_text_expand[i] # (N_text, C), same copy for every sub-image
1083
+
1084
+ # L2-normalize
1085
+ vit_embeds_i = vit_embeds_i / vit_embeds_i.norm(dim=-1, keepdim=True)
1086
+ text_i = text_i / text_i.norm(dim=-1, keepdim=True)
1087
+
1088
+ # Cosine similarity
1089
+ similarity = vit_embeds_i @ text_i.T # (N_patches, N_text)
1090
+
1091
+ """原始做法"""
1092
+ # importance打分
1093
+ importance_scores = similarity.mean(dim=1) # (N_patches,)
1094
+ importance_scores_all.append(importance_scores)
1095
+
1096
+
1097
+ topk_indices_all = []
1098
+ for importance_scores in importance_scores_all:
1099
+ topk_r = int(importance_scores.shape[0] * self.pr)
1100
+ topk_ir = int(importance_scores.shape[0] * (self.overall_pr - self.pr))
1101
+
1102
+ topk_indices_r = importance_scores.topk(topk_r, largest=True).indices
1103
+ topk_indices_ir = importance_scores.topk(topk_ir, largest=False).indices
1104
+
1105
+ topk_indices = torch.cat((topk_indices_r, topk_indices_ir), dim=0)
1106
+
1107
+ topk_indices_all.append(topk_indices)
1108
+
1109
+ input_ids_flat = input_ids.reshape(-1) # [B*N]
1110
+ img_token_mask = (input_ids_flat == self.img_context_token_id) # [B*N], True/False
1111
+
1112
+ img_token_indices = img_token_mask.nonzero(as_tuple=True)[0] # positions of image tokens (1D indices)
1113
+ assert (input_ids_flat[img_token_indices] == self.img_context_token_id).all(), "Error: img_token_indices contains non-image tokens!"
1114
+
1115
+ # img_token_indices splits into B_img chunks, one per augmented sub-image
1116
+ img_token_indices_per_img = img_token_indices.reshape(B_img, -1) # [13, patch_per_img]
1117
+
1118
+
1119
+ selected_indices = []
1120
+
1121
+ for img_idx in range(B_img):
1122
+ img_selected = img_token_indices_per_img[img_idx][topk_indices_all[img_idx]] # positions within input_embeds
1123
+ selected_indices.append(img_selected)
1124
+
1125
+
1126
+ selected_indices = torch.cat(selected_indices, dim=0) # [topk_total]
1127
+
1128
+ """直接不关注"""
1129
+ attn_flat = attention_mask.reshape(-1).clone() # clone() 防止原地写破坏梯度
1130
+ attn_flat[selected_indices] = 0
1131
+ attention_mask = attn_flat.reshape_as(attention_mask)
1132
+
1133
+ """only_l"""
1134
+ if input_ids_text is not None:
1135
+ input_embeds_text = self.language_model.get_input_embeddings()(input_ids_text)
1136
+
1137
+ """attn mcd"""
1138
+ if input_ids_oa is not None:
1139
+ input_embeds_oa = self.process_input_embeds(input_ids_oa, vit_embeds)
1140
+ image_position = self.get_image_position(input_ids_oa, self.img_context_token_id)
1141
+
1142
+ else:
1143
+ input_embeds = self.process_input_embeds(input_ids, vit_embeds)
1144
+
1145
+ # print("inputs_embeds_oa", input_embeds_oa.shape)
1146
+ # print("inputs_embeds", input_embeds.shape)
1147
+
1148
+ outputs = self.language_model.generate(
1149
+ inputs_embeds=input_embeds,
1150
+ attention_mask=attention_mask,
1151
+ generation_config=generation_config,
1152
+ output_hidden_states=output_hidden_states,
1153
+ use_cache=True,
1154
+ inputs_embeds_icd=input_embeds_icd if input_ids_icd is not None else None,
1155
+ attention_mask_icd=attention_mask_icd if input_ids_icd is not None else None,
1156
+ inputs_embeds_lcd=input_embeds_lcd if input_ids_lcd is not None else None,
1157
+ attention_mask_lcd=attention_mask_lcd if input_ids_lcd is not None else None,
1158
+ inputs_embeds_scd=input_embeds_scd if input_ids_scd is not None else None,
1159
+ attention_mask_scd=attention_mask_scd if input_ids_scd is not None else None,
1160
+ inputs_embeds_vcd = input_embeds_vcd if pixel_values_vcd is not None else None,
1161
+ attention_mask_vcd = attention_mask if pixel_values_vcd is not None else None,
1162
+ inputs_embeds_mcd = input_embeds_mcd if only_text is not None else None,
1163
+ attention_mask_mcd = attention_mask_mcd if only_text is not None else None,
1164
+ image_position = image_position if sid or input_ids_oa is not None else None,
1165
+ inputs_embeds_text = input_embeds_text if input_ids_text is not None else None,
1166
+ attention_mask_text = attention_mask_text if input_ids_text is not None else None,
1167
+ inputs_embeds_oa = input_embeds_oa if input_ids_oa is not None else None,
1168
+ attention_mask_oa = attention_mask_oa if input_ids_oa is not None else None,
1169
+ **generate_kwargs,
1170
+ )
1171
+
1172
+ return outputs
1173
+
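The mcd branch of generate() above scores each vision patch by its mean cosine similarity to the text-token embeddings before masking. A minimal sketch of that scoring on random tensors (all shapes are illustrative only):

import torch

torch.manual_seed(0)
vit_embeds_i = torch.randn(256, 64)  # (N_patches, C), toy sizes
text_i = torch.randn(12, 64)         # (N_text, C)

vit_n = vit_embeds_i / vit_embeds_i.norm(dim=-1, keepdim=True)
text_n = text_i / text_i.norm(dim=-1, keepdim=True)
similarity = vit_n @ text_n.T               # (N_patches, N_text) cosine similarities
importance_scores = similarity.mean(dim=1)  # one score per patch
print(importance_scores.shape)              # torch.Size([256])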
1174
+ @property
1175
+ def lm_head(self):
1176
+ return self.language_model.get_output_embeddings()
1177
+
1178
+ def get_input_embeddings(self):
1179
+ return self.language_model.get_input_embeddings()
1180
+
1181
+ def get_output_embeddings(self):
1182
+ return self.language_model.get_output_embeddings()
1183
+
1184
+ def get_text_only_embeds(self, input_ids):
1185
+ # First get the embeddings for all input_ids
1186
+ input_embeds = self.language_model.get_input_embeddings()(input_ids) # (B, N, C)
1187
+
1188
+ # Positions that are not the img_context_token_id
1189
+ mask = (input_ids != self.img_context_token_id)
1190
+
1191
+ # Keep only the non-image tokens
1192
+ text_only_embeds = []
1193
+ for embeds, mask_row in zip(input_embeds, mask):
1194
+ text_only_embeds.append(embeds[mask_row]) # keep the positions in this row that are not image-context tokens
1195
+
1196
+ # text_only_embeds is a List[Tensor(num_valid_tokens, C)]
1197
+ return torch.stack(text_only_embeds) # (B, num_valid_tokens, C); assumes every row keeps the same number of text tokens
1198
+
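get_text_only_embeds above keeps only non-image positions and stacks them, which implicitly assumes every row retains the same number of text tokens. A toy illustration of that masking step (9 stands in for the image-context id):

import torch

img_context_token_id = 9  # hypothetical id
input_ids = torch.tensor([[1, 9, 9, 3], [4, 9, 9, 6]])
embeds = torch.arange(8, dtype=torch.float).reshape(2, 4, 1)  # (B, N, C=1)

mask = input_ids != img_context_token_id
text_only = torch.stack([e[m] for e, m in zip(embeds, mask)])  # both rows keep 2 tokens
print(text_only.squeeze(-1))  # tensor([[0., 3.], [4., 7.]])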
internvl3-8b-instruct-lora_epoch10_5e-6/modeling_qwen2_cd.py ADDED
@@ -0,0 +1,1950 @@
1
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
2
+ # This file was automatically generated from src/transformers/models/qwen2/modular_qwen2.py.
3
+ # Do NOT edit this file manually as any edits will be overwritten by the generation of
4
+ # the file from the modular. If any change should be done, please apply the change to the
5
+ # modular_qwen2.py file directly. One of our CI enforces this.
6
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
7
+ from typing import Callable, List, Optional, Tuple, Union
8
+
9
+ import torch
10
+ from torch import nn
11
+ import copy
12
+ import sys
13
+
14
+ from transformers.activations import ACT2FN
15
+ from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
16
+ from transformers.generation import GenerationMixin
17
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
18
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
19
+ from transformers.modeling_outputs import (
20
+ BaseModelOutputWithPast,
21
+ CausalLMOutputWithPast,
22
+ QuestionAnsweringModelOutput,
23
+ SequenceClassifierOutputWithPast,
24
+ TokenClassifierOutput,
25
+ )
26
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
27
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
28
+ from transformers.processing_utils import Unpack
29
+ from transformers.utils import (
30
+ LossKwargs,
31
+ add_code_sample_docstrings,
32
+ add_start_docstrings,
33
+ add_start_docstrings_to_model_forward,
34
+ logging,
35
+ replace_return_docstrings,
36
+ )
37
+ from transformers.utils.deprecation import deprecate_kwarg
38
+ from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
39
+ from transformers.cache_utils import Cache
40
+ from transformers.generation.logits_process import (
41
+ LogitsProcessorList,
42
+ )
43
+ from transformers.generation.stopping_criteria import (
44
+ StoppingCriteriaList,
45
+ )
46
+ from transformers.generation.streamers import BaseStreamer
47
+ from dataclasses import dataclass
48
+ import os
49
+ from transformers import GenerationConfig
50
+ from transformers.utils import ModelOutput
51
+ @dataclass
52
+ class GenerateDecoderOnlyOutput(ModelOutput):
53
+ """
54
+ Outputs of decoder-only generation models, when using non-beam methods.
55
+
56
+ Args:
57
+ sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
58
+ The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
59
+ if all batches finished early due to the `eos_token_id`.
60
+ scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`):
61
+ Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
62
+ at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
63
+ each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
64
+ logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
65
+ Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
66
+ at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
67
+ each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
68
+ attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
69
+ Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
70
+ `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
71
+ hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
72
+ Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
73
+ `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
74
+ past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True`):
75
+ Returns the model cache, used to speed up decoding. Different models have a different cache format, check
76
+ the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
77
+ """
78
+
79
+ sequences: torch.LongTensor = None
80
+ scores: Optional[Tuple[torch.FloatTensor]] = None
81
+ logits: Optional[Tuple[torch.FloatTensor]] = None
82
+ attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
83
+ hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
84
+ past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None
85
+
86
+
87
+ @dataclass
88
+ class GenerateEncoderDecoderOutput(ModelOutput):
89
+ """
90
+ Outputs of encoder-decoder generation models, when using non-beam methods.
91
+
92
+ Args:
93
+ sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
94
+ The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
95
+ if all batches finished early due to the `eos_token_id`.
96
+ scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`):
97
+ Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
98
+ at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
99
+ each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
100
+ logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
101
+ Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
102
+ at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
103
+ each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
104
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`):
105
+ Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads,
106
+ sequence_length, sequence_length)`.
107
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
108
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
109
+ shape `(batch_size, sequence_length, hidden_size)`.
110
+ decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
111
+ Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
112
+ `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
113
+ cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
114
+ Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
115
+ `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
116
+ decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
117
+ Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
118
+ `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
119
+ past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
120
+ Returns the model cache, used to speed up decoding. Different models have a different cache format, check
121
+ the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
122
+ """
123
+
124
+ sequences: torch.LongTensor = None
125
+ scores: Optional[Tuple[torch.FloatTensor]] = None
126
+ logits: Optional[Tuple[torch.FloatTensor]] = None
127
+ encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
128
+ encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
129
+ decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
130
+ cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
131
+ decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
132
+ past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None
133
+
134
+ GenerateNonBeamOutput = Union[GenerateDecoderOnlyOutput, GenerateEncoderDecoderOutput]
135
+
136
+
137
+
138
+ logger = logging.get_logger(__name__)
139
+
140
+ _CHECKPOINT_FOR_DOC = "meta-qwen2/Qwen2-2-7b-hf"
141
+ _CONFIG_FOR_DOC = "Qwen2Config"
142
+
143
+
144
+ class Qwen2MLP(nn.Module):
145
+ def __init__(self, config):
146
+ super().__init__()
147
+ self.config = config
148
+ self.hidden_size = config.hidden_size
149
+ self.intermediate_size = config.intermediate_size
150
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
151
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
152
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
153
+ self.act_fn = ACT2FN[config.hidden_act]
154
+
155
+ def forward(self, x):
156
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
157
+ return down_proj
158
+
159
+
160
+ def rotate_half(x):
161
+ """Rotates half the hidden dims of the input."""
162
+ x1 = x[..., : x.shape[-1] // 2]
163
+ x2 = x[..., x.shape[-1] // 2 :]
164
+ return torch.cat((-x2, x1), dim=-1)
165
+
166
+
167
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
168
+ """Applies Rotary Position Embedding to the query and key tensors.
169
+
170
+ Args:
171
+ q (`torch.Tensor`): The query tensor.
172
+ k (`torch.Tensor`): The key tensor.
173
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
174
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
175
+ position_ids (`torch.Tensor`, *optional*):
176
+ Deprecated and unused.
177
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
178
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
179
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
180
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
181
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
182
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
183
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
184
+ Returns:
185
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
186
+ """
187
+ cos = cos.unsqueeze(unsqueeze_dim)
188
+ sin = sin.unsqueeze(unsqueeze_dim)
189
+ q_embed = (q * cos) + (rotate_half(q) * sin)
190
+ k_embed = (k * cos) + (rotate_half(k) * sin)
191
+ return q_embed, k_embed
192
+
193
+
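A tiny numeric check of the rotary formula above (head_dim=4, a single position; the base 10000 is chosen here purely for illustration); cos/sin follow the usual convention of concatenating the frequency table with itself:

import torch

def rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

head_dim, pos = 4, 3
inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim))
freqs = pos * inv_freq
emb = torch.cat((freqs, freqs), dim=-1)
q = torch.ones(head_dim)
q_rot = q * emb.cos() + rotate_half(q) * emb.sin()  # rotated query for position 3
print(q_rot)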
194
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
195
+ """
196
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
197
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
198
+ """
199
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
200
+ if n_rep == 1:
201
+ return hidden_states
202
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
203
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
204
+
205
+
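repeat_kv above implements the KV-head expansion used by grouped-query attention. A quick shape check with made-up sizes:

import torch

batch, kv_heads, seq, head_dim, n_rep = 2, 4, 5, 8, 3
kv = torch.randn(batch, kv_heads, seq, head_dim)
expanded = kv[:, :, None, :, :].expand(batch, kv_heads, n_rep, seq, head_dim)
expanded = expanded.reshape(batch, kv_heads * n_rep, seq, head_dim)
print(expanded.shape)  # torch.Size([2, 12, 5, 8]): 4 KV heads now serve 12 query heads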
206
+ def eager_attention_forward(
207
+ module: nn.Module,
208
+ query: torch.Tensor,
209
+ key: torch.Tensor,
210
+ value: torch.Tensor,
211
+ attention_mask: Optional[torch.Tensor],
212
+ scaling: float,
213
+ dropout: float = 0.0,
214
+ **kwargs,
215
+ ):
216
+ key_states = repeat_kv(key, module.num_key_value_groups)
217
+ value_states = repeat_kv(value, module.num_key_value_groups)
218
+
219
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
220
+ if attention_mask is not None:
221
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
222
+ attn_weights = attn_weights + causal_mask
223
+
224
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
225
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
226
+ attn_output = torch.matmul(attn_weights, value_states)
227
+ attn_output = attn_output.transpose(1, 2).contiguous()
228
+
229
+ return attn_output, attn_weights
230
+
231
+
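A minimal standalone version of the eager attention math above (single head, no KV repetition, additive mask), just to make the scaling and masking explicit:

import torch

q = torch.randn(1, 1, 4, 8)     # (batch, heads, q_len, head_dim)
k = torch.randn(1, 1, 6, 8)     # (batch, heads, kv_len, head_dim)
v = torch.randn(1, 1, 6, 8)
mask = torch.zeros(1, 1, 4, 6)  # 0 keeps a position; large negative values hide it

scores = q @ k.transpose(2, 3) * (8 ** -0.5) + mask
weights = scores.softmax(dim=-1)
out = (weights @ v).transpose(1, 2)  # (batch, q_len, heads, head_dim)
print(out.shape)  # torch.Size([1, 4, 1, 8])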
232
+ class Qwen2Attention(nn.Module):
233
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
234
+
235
+ def __init__(self, config: Qwen2Config, layer_idx: int):
236
+ super().__init__()
237
+ self.config = config
238
+ self.layer_idx = layer_idx
239
+ self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
240
+ self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
241
+ self.scaling = self.head_dim**-0.5
242
+ self.attention_dropout = config.attention_dropout
243
+ self.is_causal = True
244
+ self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=True)
245
+ self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
246
+ self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
247
+ self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
248
+
249
+ def forward(
250
+ self,
251
+ hidden_states: torch.Tensor,
252
+ position_embeddings: Tuple[torch.Tensor, torch.Tensor],
253
+ attention_mask: Optional[torch.Tensor],
254
+ past_key_value: Optional[Cache] = None,
255
+ cache_position: Optional[torch.LongTensor] = None,
256
+ **kwargs: Unpack[FlashAttentionKwargs],
257
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
258
+ input_shape = hidden_states.shape[:-1]
259
+ hidden_shape = (*input_shape, -1, self.head_dim)
260
+
261
+ query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
262
+ key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
263
+ value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
264
+
265
+ cos, sin = position_embeddings
266
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
267
+
268
+ if past_key_value is not None:
269
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
270
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
271
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
272
+
273
+ sliding_window = None
274
+ if (
275
+ self.config.use_sliding_window
276
+ and getattr(self.config, "sliding_window", None) is not None
277
+ and self.layer_idx >= self.config.max_window_layers
278
+ ):
279
+ sliding_window = self.config.sliding_window
280
+
281
+ attention_interface: Callable = eager_attention_forward
282
+ if self.config._attn_implementation != "eager":
283
+ if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
284
+ logger.warning_once(
285
+ "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
286
+ 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
287
+ )
288
+ else:
289
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
290
+
291
+ attn_output, attn_weights = attention_interface(
292
+ self,
293
+ query_states,
294
+ key_states,
295
+ value_states,
296
+ attention_mask,
297
+ dropout=0.0 if not self.training else self.attention_dropout,
298
+ scaling=self.scaling,
299
+ sliding_window=sliding_window, # main diff with Llama
300
+ **kwargs,
301
+ )
302
+
303
+ attn_output = attn_output.reshape(*input_shape, -1).contiguous()
304
+ attn_output = self.o_proj(attn_output)
305
+ return attn_output, attn_weights
306
+
307
+
308
+ class Qwen2RMSNorm(nn.Module):
309
+ def __init__(self, hidden_size, eps=1e-6):
310
+ """
311
+ Qwen2RMSNorm is equivalent to T5LayerNorm
312
+ """
313
+ super().__init__()
314
+ self.weight = nn.Parameter(torch.ones(hidden_size))
315
+ self.variance_epsilon = eps
316
+
317
+ def forward(self, hidden_states):
318
+ input_dtype = hidden_states.dtype
319
+ hidden_states = hidden_states.to(torch.float32)
320
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
321
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
322
+ return self.weight * hidden_states.to(input_dtype)
323
+
324
+ def extra_repr(self):
325
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
326
+
327
+
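Qwen2RMSNorm above divides by the root-mean-square of the features rather than subtracting a mean. A two-line numeric check of the formula (weight taken as 1):

import torch

x = torch.tensor([3.0, 4.0])
rms_inv = torch.rsqrt(x.pow(2).mean() + 1e-6)  # 1 / sqrt((9 + 16) / 2) ≈ 0.2828
print(x * rms_inv)                             # tensor([0.8485, 1.1314])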
328
+ class Qwen2DecoderLayer(nn.Module):
329
+ def __init__(self, config: Qwen2Config, layer_idx: int):
330
+ super().__init__()
331
+ self.hidden_size = config.hidden_size
332
+ self.self_attn = Qwen2Attention(config=config, layer_idx=layer_idx)
333
+ self.mlp = Qwen2MLP(config)
334
+ self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
335
+ self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
336
+ if config.sliding_window and config._attn_implementation != "flash_attention_2":
337
+ logger.warning_once(
338
+ f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
339
+ "unexpected results may be encountered."
340
+ )
341
+
342
+ def forward(
343
+ self,
344
+ hidden_states: torch.Tensor,
345
+ attention_mask: Optional[torch.Tensor] = None,
346
+ position_ids: Optional[torch.LongTensor] = None,
347
+ past_key_value: Optional[Cache] = None,
348
+ output_attentions: Optional[bool] = False,
349
+ use_cache: Optional[bool] = False,
350
+ cache_position: Optional[torch.LongTensor] = None,
351
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
352
+ **kwargs: Unpack[FlashAttentionKwargs],
353
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
354
+ residual = hidden_states
355
+
356
+ hidden_states = self.input_layernorm(hidden_states)
357
+
358
+ # Self Attention
359
+ hidden_states, self_attn_weights = self.self_attn(
360
+ hidden_states=hidden_states,
361
+ attention_mask=attention_mask,
362
+ position_ids=position_ids,
363
+ past_key_value=past_key_value,
364
+ output_attentions=output_attentions,
365
+ use_cache=use_cache,
366
+ cache_position=cache_position,
367
+ position_embeddings=position_embeddings,
368
+ **kwargs,
369
+ )
370
+ hidden_states = residual + hidden_states
371
+
372
+ # Fully Connected
373
+ residual = hidden_states
374
+ hidden_states = self.post_attention_layernorm(hidden_states)
375
+ hidden_states = self.mlp(hidden_states)
376
+ hidden_states = residual + hidden_states
377
+
378
+ outputs = (hidden_states,)
379
+ if output_attentions:
380
+ outputs += (self_attn_weights,)
381
+
382
+ return outputs
383
+
384
+
385
+ class Qwen2RotaryEmbedding(nn.Module):
386
+ def __init__(self, config: Qwen2Config, device=None):
387
+ super().__init__()
388
+ # BC: "rope_type" was originally "type"
389
+ if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
390
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
391
+ else:
392
+ self.rope_type = "default"
393
+ self.max_seq_len_cached = config.max_position_embeddings
394
+ self.original_max_seq_len = config.max_position_embeddings
395
+
396
+ self.config = config
397
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
398
+
399
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
400
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
401
+ self.original_inv_freq = self.inv_freq
402
+
403
+ def _dynamic_frequency_update(self, position_ids, device):
404
+ """
405
+ dynamic RoPE layers should recompute `inv_freq` in the following situations:
406
+ 1 - growing beyond the cached sequence length (allow scaling)
407
+ 2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
408
+ """
409
+ seq_len = torch.max(position_ids) + 1
410
+ if seq_len > self.max_seq_len_cached: # growth
411
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
412
+ self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
413
+ self.max_seq_len_cached = seq_len
414
+
415
+ if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
416
+ # This .to() is needed if the model has been moved to a device after being initialized (because
417
+ # the buffer is automatically moved, but not the original copy)
418
+ self.original_inv_freq = self.original_inv_freq.to(device)
419
+ self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
420
+ self.max_seq_len_cached = self.original_max_seq_len
421
+
422
+ @torch.no_grad()
423
+ def forward(self, x, position_ids):
424
+ if "dynamic" in self.rope_type:
425
+ self._dynamic_frequency_update(position_ids, device=x.device)
426
+
427
+ # Core RoPE block
428
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
429
+ position_ids_expanded = position_ids[:, None, :].float()
430
+ # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
431
+ device_type = x.device.type
432
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
433
+ with torch.autocast(device_type=device_type, enabled=False):
434
+ freqs = (inv_freq_expanded.float().to(x.device) @ position_ids_expanded.float()).transpose(1, 2)
435
+ emb = torch.cat((freqs, freqs), dim=-1)
436
+ cos = emb.cos()
437
+ sin = emb.sin()
438
+
439
+ # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
440
+ cos = cos * self.attention_scaling
441
+ sin = sin * self.attention_scaling
442
+
443
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
444
+
445
+
446
+ QWEN2_START_DOCSTRING = r"""
447
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
448
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
449
+ etc.)
450
+
451
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
452
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
453
+ and behavior.
454
+
455
+ Parameters:
456
+ config ([`Qwen2Config`]):
457
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
458
+ load the weights associated with the model, only the configuration. Check out the
459
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
460
+ """
461
+
462
+
463
+ @add_start_docstrings(
464
+ "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.",
465
+ QWEN2_START_DOCSTRING,
466
+ )
467
+ class Qwen2PreTrainedModel(PreTrainedModel):
468
+ config_class = Qwen2Config
469
+ base_model_prefix = "model"
470
+ supports_gradient_checkpointing = True
471
+ _no_split_modules = ["Qwen2DecoderLayer"]
472
+ _skip_keys_device_placement = ["past_key_values"]
473
+ _supports_flash_attn_2 = True
474
+ _supports_sdpa = True
475
+ _supports_flex_attn = True
476
+ _supports_cache_class = True
477
+ _supports_quantized_cache = True
478
+ _supports_static_cache = True
479
+ _supports_attention_backend = True
480
+
481
+ def _init_weights(self, module):
482
+ std = self.config.initializer_range
483
+ if isinstance(module, nn.Linear):
484
+ module.weight.data.normal_(mean=0.0, std=std)
485
+ if module.bias is not None:
486
+ module.bias.data.zero_()
487
+ elif isinstance(module, nn.Embedding):
488
+ module.weight.data.normal_(mean=0.0, std=std)
489
+ if module.padding_idx is not None:
490
+ module.weight.data[module.padding_idx].zero_()
491
+
492
+
493
+ QWEN2_INPUTS_DOCSTRING = r"""
494
+ Args:
495
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
496
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
497
+ it.
498
+
499
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
500
+ [`PreTrainedTokenizer.__call__`] for details.
501
+
502
+ [What are input IDs?](../glossary#input-ids)
503
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
504
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
505
+
506
+ - 1 for tokens that are **not masked**,
507
+ - 0 for tokens that are **masked**.
508
+
509
+ [What are attention masks?](../glossary#attention-mask)
510
+
511
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
512
+ [`PreTrainedTokenizer.__call__`] for details.
513
+
514
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
515
+ `past_key_values`).
516
+
517
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
518
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
519
+ information on the default strategy.
520
+
521
+ - 1 indicates the head is **not masked**,
522
+ - 0 indicates the head is **masked**.
523
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
524
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
525
+ config.n_positions - 1]`.
526
+
527
+ [What are position IDs?](../glossary#position-ids)
528
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
529
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
530
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
531
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
532
+
533
+ Two formats are allowed:
534
+ - a [`~cache_utils.Cache`] instance, see our
535
+ [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
536
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
537
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
538
+ cache format.
539
+
540
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
541
+ legacy cache format will be returned.
542
+
543
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
544
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
545
+ of shape `(batch_size, sequence_length)`.
546
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
547
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
548
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
549
+ model's internal embedding lookup matrix.
550
+ use_cache (`bool`, *optional*):
551
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
552
+ `past_key_values`).
553
+ output_attentions (`bool`, *optional*):
554
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
555
+ tensors for more detail.
556
+ output_hidden_states (`bool`, *optional*):
557
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
558
+ more detail.
559
+ return_dict (`bool`, *optional*):
560
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
561
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
562
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
563
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
564
+ the complete sequence length.
565
+ """
566
+
567
+
568
+ @add_start_docstrings(
569
+ "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.",
570
+ QWEN2_START_DOCSTRING,
571
+ )
572
+ class Qwen2Model(Qwen2PreTrainedModel):
573
+ """
574
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`]
575
+
576
+ Args:
577
+ config: Qwen2Config
578
+ """
579
+
580
+ def __init__(self, config: Qwen2Config):
581
+ super().__init__(config)
582
+ self.padding_idx = config.pad_token_id
583
+ self.vocab_size = config.vocab_size
584
+
585
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
586
+ self.layers = nn.ModuleList(
587
+ [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
588
+ )
589
+ self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
590
+ self.rotary_emb = Qwen2RotaryEmbedding(config=config)
591
+ self.gradient_checkpointing = False
592
+
593
+ # Initialize weights and apply final processing
594
+ self.post_init()
595
+
596
+ def get_input_embeddings(self):
597
+ return self.embed_tokens
598
+
599
+ def set_input_embeddings(self, value):
600
+ self.embed_tokens = value
601
+
602
+ @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
603
+ def forward(
604
+ self,
605
+ input_ids: torch.LongTensor = None,
606
+ attention_mask: Optional[torch.Tensor] = None,
607
+ position_ids: Optional[torch.LongTensor] = None,
608
+ past_key_values: Optional[Cache] = None,
609
+ inputs_embeds: Optional[torch.FloatTensor] = None,
610
+ use_cache: Optional[bool] = None,
611
+ output_attentions: Optional[bool] = None,
612
+ output_hidden_states: Optional[bool] = None,
613
+ return_dict: Optional[bool] = None,
614
+ cache_position: Optional[torch.LongTensor] = None,
615
+ image_position = None,
616
+ **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
617
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
618
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
619
+ output_hidden_states = (
620
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
621
+ )
622
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
623
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
624
+
625
+ if (input_ids is None) ^ (inputs_embeds is not None):
626
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
627
+ elif input_ids is not None:
628
+ batch_size, seq_length = input_ids.shape
629
+ elif inputs_embeds is not None:
630
+ batch_size, seq_length, _ = inputs_embeds.shape
631
+ seq_length_with_past = seq_length
632
+
633
+ if self.gradient_checkpointing and self.training and use_cache:
634
+ logger.warning_once(
635
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
636
+ )
637
+ use_cache = False
638
+
639
+ if inputs_embeds is None:
640
+ inputs_embeds = self.embed_tokens(input_ids)
641
+
642
+ if use_cache and past_key_values is None:
643
+ past_key_values = DynamicCache()
644
+
645
+ if cache_position is None:
646
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
647
+ cache_position = torch.arange(
648
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
649
+ )
650
+
651
+ if position_ids is None:
652
+ position_ids = cache_position.unsqueeze(0)
653
+
654
+ if output_attentions:
655
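+ # Per-layer attention weights are only materialized by the eager attention path
+ # (SDPA / FlashAttention kernels do not return them), so switch to it when attentions are requested.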
+ self.config._attn_implementation = "eager"
656
+ causal_mask = self._update_causal_mask(
657
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
658
+ )
659
+
660
+ hidden_states = inputs_embeds
661
+
662
+ # create position embeddings to be shared across the decoder layers
663
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
664
+
665
+ # decoder layers
666
+ all_hidden_states = () if output_hidden_states else None
667
+ all_self_attns = () if output_attentions else None
668
+
669
+ for idx, decoder_layer in enumerate(self.layers[: self.config.num_hidden_layers]):
670
+ if output_hidden_states:
671
+ all_hidden_states += (hidden_states,)
672
+
673
+ if self.gradient_checkpointing and self.training:
674
+ layer_outputs = self._gradient_checkpointing_func(
675
+ decoder_layer.__call__,
676
+ hidden_states,
677
+ causal_mask,
678
+ position_ids,
679
+ past_key_values,
680
+ output_attentions,
681
+ use_cache,
682
+ cache_position,
683
+ position_embeddings,
684
+ )
685
+ else:
686
+ if image_position is not None:
687
+ AGG_LAYER = 2
688
+ SYS_LENGTH = image_position[0]
689
+ IMAGE_TOKEN_LENGTH = image_position[1] - image_position[0]
690
+ ATTENTION_RANK = 0.1
691
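+ # Attention-guided image-token pruning (hyper-parameters hard-coded above): layers before
+ # AGG_LAYER see the ordinary causal mask; at AGG_LAYER the previous layer's attention from the
+ # last token to the image span [SYS_LENGTH, SYS_LENGTH + IMAGE_TOKEN_LENGTH) is averaged over
+ # heads, an ATTENTION_RANK fraction of image tokens is selected, and all other image tokens
+ # are masked out via a regenerated causal mask.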
+ if idx<AGG_LAYER:
692
+ new_attention_mask = causal_mask
693
+
694
+ elif idx==AGG_LAYER:
695
+ if idx!=0:
696
+ last_layer_attention = layer_outputs[1]
697
+ # print("attn imple", self.config._attn_implementation)
698
+ # print("last_layer_attention:", last_layer_attention.shape)
699
+ # compute average attention over different head
700
+ last_layer_attention_avg = torch.mean(last_layer_attention, dim=1)[0]
701
+ # build a new attention mask from the averaged attention: keep visible only the ATTENTION_RANK fraction of image tokens selected by topk(largest=False) below, i.e. those with the lowest attention
702
+ last_layer_attention_avg_last_tok = last_layer_attention_avg[-1]
703
+ # get the attention in image token
704
+ last_layer_attention_avg_last_tok_image = last_layer_attention_avg_last_tok[SYS_LENGTH:SYS_LENGTH+IMAGE_TOKEN_LENGTH]
705
+ # print("last_layer_attention_avg_last_tok_image:", last_layer_attention_avg_last_tok_image.shape)
706
+ # get the indices (shifted by SYS_LENGTH) of the selected image tokens
707
+ top_attention_rank_index = last_layer_attention_avg_last_tok_image.topk(int(IMAGE_TOKEN_LENGTH * ATTENTION_RANK), largest=False).indices + SYS_LENGTH
708
+ # print("top_attention_rank_index:", top_attention_rank_index.shape)
709
+
710
+ key_len = last_layer_attention.size(-1)
711
+ gen_attention_mask = torch.ones((batch_size, key_len), dtype=torch.bool, device=inputs_embeds.device)
712
+ gen_attention_mask[:,SYS_LENGTH:SYS_LENGTH+IMAGE_TOKEN_LENGTH] = False
713
+ # print("gegn_attention_mask:", gen_attention_mask)
714
+ gen_attention_mask[:,top_attention_rank_index] = True
715
+ # print("gen_attention_mask true:", gen_attention_mask)
716
+ gen_attention_mask = self._update_causal_mask(
717
+ gen_attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
718
+ )
719
+ # print("gen_attention_mask:", gen_attention_mask.shape)
720
+ # if gen_attention_mask is not None and gen_attention_mask.dtype != torch.bool:
721
+ # print("Masked positions (float):", (gen_attention_mask == torch.finfo(gen_attention_mask.dtype).min).sum().item())
722
+ # sys.exit()
723
+
724
+ new_attention_mask = gen_attention_mask
725
+ del last_layer_attention
726
+ del last_layer_attention_avg
727
+ del last_layer_attention_avg_last_tok
728
+ del last_layer_attention_avg_last_tok_image
729
+ del top_attention_rank_index
730
+ torch.cuda.empty_cache()
731
+ else:
732
+ new_attention_mask = gen_attention_mask
733
+
734
+ else:
735
+ new_attention_mask = causal_mask
736
+
737
+
738
+ layer_outputs = decoder_layer(
739
+ hidden_states,
740
+ attention_mask=new_attention_mask,
741
+ position_ids=position_ids,
742
+ past_key_value=past_key_values,
743
+ output_attentions=output_attentions,
744
+ use_cache=use_cache,
745
+ cache_position=cache_position,
746
+ position_embeddings=position_embeddings,
747
+ **flash_attn_kwargs,
748
+ )
749
+ # layer_outputs = decoder_layer(
750
+ # hidden_states,
751
+ # attention_mask=causal_mask,
752
+ # position_ids=position_ids,
753
+ # past_key_value=past_key_values,
754
+ # output_attentions=output_attentions,
755
+ # use_cache=use_cache,
756
+ # cache_position=cache_position,
757
+ # position_embeddings=position_embeddings,
758
+ # **flash_attn_kwargs,
759
+ # )
760
+
761
+ hidden_states = layer_outputs[0]
762
+
763
+ if output_attentions:
764
+ all_self_attns += (layer_outputs[1],)
765
+
766
+ hidden_states = self.norm(hidden_states)
767
+
768
+ # add hidden states from the last decoder layer
769
+ if output_hidden_states:
770
+ all_hidden_states += (hidden_states,)
771
+
772
+ output = BaseModelOutputWithPast(
773
+ last_hidden_state=hidden_states,
774
+ past_key_values=past_key_values if use_cache else None,
775
+ hidden_states=all_hidden_states,
776
+ attentions=all_self_attns,
777
+ )
778
+ return output if return_dict else output.to_tuple()
779
+
780
+ def _update_causal_mask(
781
+ self,
782
+ attention_mask: torch.Tensor,
783
+ input_tensor: torch.Tensor,
784
+ cache_position: torch.Tensor,
785
+ past_key_values: Cache,
786
+ output_attentions: bool = False,
787
+ ):
788
+ if self.config._attn_implementation == "flash_attention_2":
789
+ if attention_mask is not None and past_key_values is not None:
790
+ is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
791
+ if is_padding_right:
792
+ raise ValueError(
793
+ "You are attempting to perform batched generation with padding_side='right'"
794
+ " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to "
795
+ " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
796
+ )
797
+ if attention_mask is not None and 0.0 in attention_mask:
798
+ return attention_mask
799
+ return None
800
+
801
+ # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
802
+ # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
803
+ # to infer the attention mask.
804
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
805
+ using_static_cache = isinstance(past_key_values, StaticCache)
806
+ using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache)
807
+
808
+ # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
809
+ if (
810
+ self.config._attn_implementation == "sdpa"
811
+ and not (using_static_cache or using_sliding_window_cache)
812
+ and not output_attentions
813
+ ):
814
+ if AttentionMaskConverter._ignore_causal_mask_sdpa(
815
+ attention_mask,
816
+ inputs_embeds=input_tensor,
817
+ past_key_values_length=past_seen_tokens,
818
+ sliding_window=self.config.sliding_window,
819
+ is_training=self.training,
820
+ ):
821
+ return None
822
+
823
+ dtype, device = input_tensor.dtype, input_tensor.device
824
+ min_dtype = torch.finfo(dtype).min
825
+ sequence_length = input_tensor.shape[1]
826
+ # SlidingWindowCache or StaticCache
827
+ if using_sliding_window_cache or using_static_cache:
828
+ target_length = past_key_values.get_max_cache_shape()
829
+ # DynamicCache or no cache
830
+ else:
831
+ target_length = (
832
+ attention_mask.shape[-1]
833
+ if isinstance(attention_mask, torch.Tensor)
834
+ else past_seen_tokens + sequence_length + 1
835
+ )
836
+
837
+ # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
838
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
839
+ attention_mask,
840
+ sequence_length=sequence_length,
841
+ target_length=target_length,
842
+ dtype=dtype,
843
+ device=device,
844
+ cache_position=cache_position,
845
+ batch_size=input_tensor.shape[0],
846
+ config=self.config,
847
+ past_key_values=past_key_values,
848
+ )
849
+
850
+ if (
851
+ self.config._attn_implementation == "sdpa"
852
+ and attention_mask is not None
853
+ and attention_mask.device.type in ["cuda", "xpu"]
854
+ and not output_attentions
855
+ ):
856
+ # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
857
+ # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
858
+ # Details: https://github.com/pytorch/pytorch/issues/110213
859
+ causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
860
+
861
+ return causal_mask
862
+
863
+ @staticmethod
864
+ def _prepare_4d_causal_attention_mask_with_cache_position(
865
+ attention_mask: torch.Tensor,
866
+ sequence_length: int,
867
+ target_length: int,
868
+ dtype: torch.dtype,
869
+ device: torch.device,
870
+ cache_position: torch.Tensor,
871
+ batch_size: int,
872
+ config: Qwen2Config,
873
+ past_key_values: Cache,
874
+ ):
875
+ """
876
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
877
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
878
+
879
+ Args:
880
+ attention_mask (`torch.Tensor`):
881
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
882
+ sequence_length (`int`):
883
+ The sequence length being processed.
884
+ target_length (`int`):
885
+ The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
886
+ dtype (`torch.dtype`):
887
+ The dtype to use for the 4D attention mask.
888
+ device (`torch.device`):
889
+ The device to place the 4D attention mask on.
890
+ cache_position (`torch.Tensor`):
891
+ Indices depicting the position of the input sequence tokens in the sequence.
892
+ batch_size (`torch.Tensor`):
893
+ Batch size.
894
+ config (`Qwen2Config`):
895
+ The model's configuration class
896
+ past_key_values (`Cache`):
897
+ The cache class that is being used currently to generate
898
+ """
899
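+ # Mask convention: positions filled with the dtype minimum are blocked, positions equal to 0 may be attended to.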
+ if attention_mask is not None and attention_mask.dim() == 4:
900
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
901
+ causal_mask = attention_mask
902
+ else:
903
+ min_dtype = torch.finfo(dtype).min
904
+ causal_mask = torch.full(
905
+ (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
906
+ )
907
+ diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
908
+ if config.sliding_window is not None:
909
+ # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
910
+ # the check is needed to verify is current checkpoint was trained with sliding window or not
911
+ if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
912
+ sliding_attend_mask = torch.arange(target_length, device=device) <= (
913
+ cache_position.reshape(-1, 1) - config.sliding_window
914
+ )
915
+ diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
916
+ causal_mask *= diagonal_attend_mask
917
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
918
+ if attention_mask is not None:
919
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
920
+ if attention_mask.shape[-1] > target_length:
921
+ attention_mask = attention_mask[:, :target_length]
922
+ mask_length = attention_mask.shape[-1]
923
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
924
+ causal_mask.device
925
+ )
926
+ padding_mask = padding_mask == 0
927
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
928
+ padding_mask, min_dtype
929
+ )
930
+ return causal_mask
931
+
932
+
933
+ class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
934
+
935
+
936
+ class Qwen2ForCausalLM(Qwen2PreTrainedModel, GenerationMixin):
937
+ _tied_weights_keys = ["lm_head.weight"]
938
+ _tp_plan = {"lm_head": "colwise_rep"}
939
+ _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
940
+
941
+ def __init__(self, config):
942
+ super().__init__(config)
943
+ self.model = Qwen2Model(config)
944
+ self.vocab_size = config.vocab_size
945
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
946
+
947
+ # Initialize weights and apply final processing
948
+ self.post_init()
949
+
950
+ def get_input_embeddings(self):
951
+ return self.model.embed_tokens
952
+
953
+ def set_input_embeddings(self, value):
954
+ self.model.embed_tokens = value
955
+
956
+ def get_output_embeddings(self):
957
+ return self.lm_head
958
+
959
+ def set_output_embeddings(self, new_embeddings):
960
+ self.lm_head = new_embeddings
961
+
962
+ def set_decoder(self, decoder):
963
+ self.model = decoder
964
+
965
+ def get_decoder(self):
966
+ return self.model
967
+
968
+ @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
969
+ @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
970
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
971
+ def forward(
972
+ self,
973
+ input_ids: torch.LongTensor = None,
974
+ attention_mask: Optional[torch.Tensor] = None,
975
+ position_ids: Optional[torch.LongTensor] = None,
976
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
977
+ inputs_embeds: Optional[torch.FloatTensor] = None,
978
+ labels: Optional[torch.LongTensor] = None,
979
+ use_cache: Optional[bool] = None,
980
+ output_attentions: Optional[bool] = None,
981
+ output_hidden_states: Optional[bool] = None,
982
+ return_dict: Optional[bool] = None,
983
+ cache_position: Optional[torch.LongTensor] = None,
984
+ logits_to_keep: Union[int, torch.Tensor] = 0,
985
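+ # Contrast-branch inputs (paired inputs_embeds_* / attention_mask_*): accepted here only so
+ # generation can pass them through; they are consumed in `_sample`, not by this forward pass.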
+ inputs_embeds_icd=None,
986
+ attention_mask_icd=None,
987
+ inputs_embeds_lcd=None,
988
+ attention_mask_lcd=None,
989
+ inputs_embeds_scd=None,
990
+ attention_mask_scd=None,
991
+ inputs_embeds_vcd=None,
992
+ attention_mask_vcd=None,
993
+ inputs_embeds_mcd = None,
994
+ attention_mask_mcd = None,
995
+ inputs_embeds_text = None,
996
+ attention_mask_text = None,
997
+ image_position=None,
998
+ inputs_embeds_oa = None,
999
+ attention_mask_oa = None,
1000
+ **kwargs: Unpack[KwargsForCausalLM],
1001
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1002
+ r"""
1003
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1004
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1005
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1006
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1007
+
1008
+ logits_to_keep (`int` or `torch.Tensor`, *optional*):
1009
+ If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
1010
+ `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
1011
+ token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
1012
+ If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
1013
+ This is useful when using packed tensor format (single dimension for batch and sequence length).
1014
+
1015
+ Returns:
1016
+
1017
+ Example:
1018
+
1019
+ ```python
1020
+ >>> from transformers import AutoTokenizer, Qwen2ForCausalLM
1021
+
1022
+ >>> model = Qwen2ForCausalLM.from_pretrained("Qwen/Qwen2-7B-Instruct")
1023
+ >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")
1024
+
1025
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
1026
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
1027
+
1028
+ >>> # Generate
1029
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1030
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1031
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
1032
+ ```"""
1033
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1034
+ output_hidden_states = (
1035
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1036
+ )
1037
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1038
+
1039
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1040
+ outputs = self.model(
1041
+ input_ids=input_ids,
1042
+ attention_mask=attention_mask,
1043
+ position_ids=position_ids,
1044
+ past_key_values=past_key_values,
1045
+ inputs_embeds=inputs_embeds,
1046
+ use_cache=use_cache,
1047
+ output_attentions=output_attentions,
1048
+ output_hidden_states=output_hidden_states,
1049
+ return_dict=return_dict,
1050
+ cache_position=cache_position,
1051
+ image_position=image_position,
1052
+ **kwargs,
1053
+ )
1054
+
1055
+ hidden_states = outputs[0]
1056
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
1057
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
1058
+ logits = self.lm_head(hidden_states[:, slice_indices, :])
1059
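+ # With logits_to_keep == 0 the slice covers every position; during generation it is typically 1,
+ # so only the final position is projected through the LM head.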
+
1060
+ loss = None
1061
+ if labels is not None:
1062
+ loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
1063
+
1064
+ if not return_dict:
1065
+ output = (logits,) + outputs[1:]
1066
+ return (loss,) + output if loss is not None else output
1067
+
1068
+ return CausalLMOutputWithPast(
1069
+ loss=loss,
1070
+ logits=logits,
1071
+ past_key_values=outputs.past_key_values,
1072
+ hidden_states=outputs.hidden_states,
1073
+ attentions=outputs.attentions,
1074
+ )
1075
+
1076
+
1077
+ def _sample(
1078
+ self,
1079
+ input_ids: torch.LongTensor,
1080
+ logits_processor: LogitsProcessorList,
1081
+ stopping_criteria: StoppingCriteriaList,
1082
+ generation_config: GenerationConfig,
1083
+ synced_gpus: bool,
1084
+ streamer: Optional["BaseStreamer"],
1085
+ **model_kwargs,
1086
+ ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
1087
+ r"""
1088
+ Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and
1089
+ can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
1090
+
1091
+ Parameters:
1092
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1093
+ The sequence used as a prompt for the generation.
1094
+ logits_processor (`LogitsProcessorList`):
1095
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
1096
+ used to modify the prediction scores of the language modeling head applied at each generation step.
1097
+ stopping_criteria (`StoppingCriteriaList`):
1098
+ An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
1099
+ used to tell if the generation loop should stop.
1100
+ generation_config ([`~generation.GenerationConfig`]):
1101
+ The generation configuration to be used as parametrization of the decoding method.
1102
+ synced_gpus (`bool`):
1103
+ Whether to continue running the while loop until max_length (needed to avoid deadlocking with
1104
+ `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
1105
+ streamer (`BaseStreamer`, *optional*):
1106
+ Streamer object that will be used to stream the generated sequences. Generated tokens are passed
1107
+ through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
1108
+ model_kwargs:
1109
+ Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
1110
+ an encoder-decoder model the kwargs should include `encoder_outputs`.
1111
+
1112
+ Return:
1113
+ [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`:
1114
+ A `torch.LongTensor` containing the generated tokens (default behaviour) or a
1115
+ [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
1116
+ `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
1117
+ `model.config.is_encoder_decoder=True`.
1118
+ """
1119
+ # init values
1120
+ pad_token_id = generation_config._pad_token_tensor
1121
+ output_attentions = generation_config.output_attentions
1122
+ output_hidden_states = generation_config.output_hidden_states
1123
+ output_scores = generation_config.output_scores
1124
+ output_logits = generation_config.output_logits
1125
+ return_dict_in_generate = generation_config.return_dict_in_generate
1126
+ has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria)
1127
+ do_sample = generation_config.do_sample
1128
+
1129
+ # init attention / hidden states / scores tuples
1130
+ scores = () if (return_dict_in_generate and output_scores) else None
1131
+ raw_logits = () if (return_dict_in_generate and output_logits) else None
1132
+ decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
1133
+ cross_attentions = () if (return_dict_in_generate and output_attentions) else None
1134
+ decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
1135
+
1136
+ # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
1137
+ if return_dict_in_generate and self.config.is_encoder_decoder:
1138
+ encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
1139
+ encoder_hidden_states = (
1140
+ model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
1141
+ )
1142
+
1143
+ # keep track of which sequences are already finished
1144
+ batch_size, cur_len = input_ids.shape
1145
+ this_peer_finished = False
1146
+ unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
1147
+ model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
1148
+
1149
+ model_forward = self.__call__
1150
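+ # After prefill, decoding steps may go through a torch.compile-d forward when the cache and
+ # generation config allow it; the contrast branches below reuse the same callable.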
+ if isinstance(model_kwargs.get("past_key_values"), Cache):
1151
+ is_compileable = model_kwargs["past_key_values"].is_compileable and self._supports_static_cache
1152
+ if getattr(self, "hf_quantizer", None) is not None:
1153
+ is_compileable &= self.hf_quantizer.is_compileable
1154
+ is_compileable = is_compileable and not generation_config.disable_compile
1155
+ if is_compileable and (
1156
+ self.device.type == "cuda" or generation_config.compile_config._compile_all_devices
1157
+ ):
1158
+ os.environ["TOKENIZERS_PARALLELISM"] = "0"
1159
+ model_forward = self.get_compiled_call(generation_config.compile_config)
1160
+
1161
+
1162
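+ # Each optional (inputs_embeds_*, attention_mask_*) pair below defines an auxiliary contrast
+ # branch. A separate copy of model_kwargs is kept per branch so it can be decoded in lockstep
+ # with the main branch and its next-token logits contrasted against the main logits later in the loop.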
+ """icd"""
1163
+ if model_kwargs.get("inputs_embeds_icd") is not None:
1164
+ input_embeds_icd = model_kwargs["inputs_embeds_icd"]
1165
+ model_kwargs_icd = copy.deepcopy(model_kwargs)
1166
+ model_kwargs_icd["inputs_embeds"] = input_embeds_icd
1167
+ model_kwargs_icd["attention_mask"] = model_kwargs["attention_mask_icd"]
1168
+ model_kwargs_icd.pop("inputs_embeds_icd")
1169
+ model_kwargs_icd.pop("attention_mask_icd")
1170
+ # input_ids_icd = input_ids.clone()
1171
+ else:
1172
+ model_kwargs_icd = None
1173
+
1174
+ """lcd"""
1175
+ if model_kwargs.get("inputs_embeds_lcd") is not None:
1176
+ input_embeds_lcd = model_kwargs["inputs_embeds_lcd"]
1177
+ model_kwargs_lcd = copy.deepcopy(model_kwargs)
1178
+ model_kwargs_lcd["inputs_embeds"] = input_embeds_lcd
1179
+ model_kwargs_lcd["attention_mask"] = model_kwargs["attention_mask_lcd"]
1180
+ model_kwargs_lcd.pop("inputs_embeds_lcd")
1181
+ model_kwargs_lcd.pop("attention_mask_lcd")
1182
+ else:
1183
+ model_kwargs_lcd = None
1184
+
1185
+ """scd"""
1186
+ if model_kwargs.get("inputs_embeds_scd") is not None:
1187
+ input_embeds_scd = model_kwargs["inputs_embeds_scd"]
1188
+ model_kwargs_scd = copy.deepcopy(model_kwargs)
1189
+ model_kwargs_scd["inputs_embeds"] = input_embeds_scd
1190
+ model_kwargs_scd["attention_mask"] = model_kwargs["attention_mask_scd"]
1191
+ model_kwargs_scd.pop("inputs_embeds_scd")
1192
+ model_kwargs_scd.pop("attention_mask_scd")
1193
+ else:
1194
+ model_kwargs_scd = None
1195
+
1196
+
1197
+ """vcd"""
1198
+ if model_kwargs.get("inputs_embeds_vcd") is not None:
1199
+ input_embeds_vcd = model_kwargs["inputs_embeds_vcd"]
1200
+ model_kwargs_vcd = copy.deepcopy(model_kwargs)
1201
+ model_kwargs_vcd["inputs_embeds"] = input_embeds_vcd
1202
+ model_kwargs_vcd["attention_mask"] = model_kwargs["attention_mask_vcd"]
1203
+ model_kwargs_vcd.pop("inputs_embeds_vcd")
1204
+ model_kwargs_vcd.pop("attention_mask_vcd")
1205
+ else:
1206
+ model_kwargs_vcd = None
1207
+
1208
+
1209
+ """mcd"""
1210
+ if model_kwargs.get("inputs_embeds_mcd") is not None:
1211
+ input_embeds_mcd = model_kwargs["inputs_embeds_mcd"]
1212
+ model_kwargs_mcd = copy.deepcopy(model_kwargs)
1213
+ model_kwargs_mcd["inputs_embeds"] = input_embeds_mcd
1214
+ model_kwargs_mcd["attention_mask"] = model_kwargs["attention_mask_mcd"]
1215
+ model_kwargs_mcd.pop("inputs_embeds_mcd")
1216
+ model_kwargs_mcd.pop("attention_mask_mcd")
1217
+ else:
1218
+ model_kwargs_mcd = None
1219
+
1220
+ """no vision"""
1221
+ if model_kwargs.get("inputs_embeds_text") is not None:
1222
+ input_embeds_text = model_kwargs["inputs_embeds_text"]
1223
+ model_kwargs_text = copy.deepcopy(model_kwargs)
1224
+ model_kwargs_text["inputs_embeds"] = input_embeds_text
1225
+ model_kwargs_text["attention_mask"] = model_kwargs["attention_mask_text"]
1226
+ model_kwargs_text.pop("inputs_embeds_text")
1227
+ model_kwargs_text.pop("attention_mask_text")
1228
+ else:
1229
+ model_kwargs_text = None
1230
+
1231
+ """attn mcd"""
1232
+ if model_kwargs.get("inputs_embeds_oa") is not None:
1233
+ input_embeds_oa = model_kwargs["inputs_embeds_oa"]
1234
+ model_kwargs_oa = copy.deepcopy(model_kwargs)
1235
+ model_kwargs_oa["inputs_embeds_oa"] = input_embeds_oa
1236
+ model_kwargs_oa["attention_mask"] = model_kwargs["attention_mask_oa"]
1237
+ model_kwargs_oa.pop("inputs_embeds_oa")
1238
+ model_kwargs_oa.pop("attention_mask_oa")
1239
+ else:
1240
+ model_kwargs_oa = None
1241
+
1242
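+ # image_position routing: with an "oa" branch its image_position is extracted for the
+ # prefill-time attention masking below and the mcd branch is rebuilt from the main kwargs
+ # without it; otherwise a "sid" branch keeps image_position (so the in-layer image-token
+ # masking in Qwen2Model.forward applies to that branch) while the main kwargs drop it.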
+ """sid"""
1243
+ if model_kwargs.get("image_position") is not None:
1244
+ if model_kwargs_oa is not None:
1245
+ image_position = model_kwargs_oa.pop("image_position")
1246
+ model_kwargs_mcd = copy.deepcopy(model_kwargs)
1247
+ model_kwargs_mcd.pop("image_position")
1248
+ model_kwargs_sid = None
1249
+ else:
1250
+ model_kwargs_sid = copy.deepcopy(model_kwargs)
1251
+ model_kwargs.pop("image_position")
1252
+ else:
1253
+ model_kwargs_sid = None
1254
+
1255
+
1256
+ is_prefill = True
1257
+ while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
1258
+ # prepare model inputs
1259
+ model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
1260
+
1261
+ # prepare variable output controls (note: some models won't accept all output controls)
1262
+ model_inputs.update({"output_attentions": output_attentions} if output_attentions else {})
1263
+ model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {})
1264
+
1265
+ """icd"""
1266
+ if model_kwargs_icd is not None:
1267
+
1268
+ model_inputs_icd = self.prepare_inputs_for_generation(input_ids, **model_kwargs_icd)
1269
+
1270
+ model_inputs_icd.update({"output_attentions": output_attentions} if output_attentions else {})
1271
+ model_inputs_icd.update(
1272
+ {"output_hidden_states": output_hidden_states} if output_hidden_states else {}
1273
+ )
1274
+ """lcd"""
1275
+ if model_kwargs_lcd is not None:
1276
+ model_inputs_lcd = self.prepare_inputs_for_generation(input_ids, **model_kwargs_lcd)
1277
+
1278
+ model_inputs_lcd.update({"output_attentions": output_attentions} if output_attentions else {})
1279
+ model_inputs_lcd.update(
1280
+ {"output_hidden_states": output_hidden_states} if output_hidden_states else {}
1281
+ )
1282
+ """scd"""
1283
+ if model_kwargs_scd is not None:
1284
+ model_inputs_scd = self.prepare_inputs_for_generation(input_ids, **model_kwargs_scd)
1285
+
1286
+ model_inputs_scd.update({"output_attentions": output_attentions} if output_attentions else {})
1287
+ model_inputs_scd.update(
1288
+ {"output_hidden_states": output_hidden_states} if output_hidden_states else {}
1289
+ )
1290
+ """vcd"""
1291
+ if model_kwargs_vcd is not None:
1292
+ model_inputs_vcd = self.prepare_inputs_for_generation(input_ids, **model_kwargs_vcd)
1293
+
1294
+ model_inputs_vcd.update({"output_attentions": output_attentions} if output_attentions else {})
1295
+ model_inputs_vcd.update(
1296
+ {"output_hidden_states": output_hidden_states} if output_hidden_states else {}
1297
+ )
1298
+
1299
+ """mcd"""
1300
+ if model_kwargs_mcd is not None:
1301
+ model_inputs_mcd = self.prepare_inputs_for_generation(input_ids, **model_kwargs_mcd)
1302
+
1303
+ model_inputs_mcd.update({"output_attentions": output_attentions} if output_attentions else {})
1304
+ model_inputs_mcd.update(
1305
+ {"output_hidden_states": output_hidden_states} if output_hidden_states else {}
1306
+ )
1307
+
1308
+ """no vision"""
1309
+ if model_kwargs_text is not None:
1310
+ model_inputs_text = self.prepare_inputs_for_generation(input_ids, **model_kwargs_text)
1311
+
1312
+ model_inputs_text.update({"output_attentions": output_attentions} if output_attentions else {})
1313
+ model_inputs_text.update(
1314
+ {"output_hidden_states": output_hidden_states} if output_hidden_states else {}
1315
+ )
1316
+
1317
+ """sid"""
1318
+ if model_kwargs_sid is not None:
1319
+ print("sid is not none")
1320
+ model_inputs_sid = self.prepare_inputs_for_generation(input_ids, **model_kwargs_sid)
1321
+
1322
+ model_inputs_sid.update({"output_attentions": output_attentions} if output_attentions else {})
1323
+ model_inputs_sid.update(
1324
+ {"output_hidden_states": output_hidden_states} if output_hidden_states else {}
1325
+ )
1326
+
1327
+ """attn mcd"""
1328
+ if model_kwargs_oa is not None:
1329
+ model_inputs_oa = self.prepare_inputs_for_generation(input_ids, **model_kwargs_oa)
1330
+
1331
+ model_inputs_oa.update({"output_attentions": output_attentions} if output_attentions else {})
1332
+ model_inputs_oa.update(
1333
+ {"output_hidden_states": output_hidden_states} if output_hidden_states else {}
1334
+ )
1335
+
1336
+
1337
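+ # Prefill-time attention-guided masking for the "oa" branch: run one forward pass with
+ # output_attentions=True, average the text-token -> image-token attention over heads and
+ # layers, then mask the ~10% most-attended image tokens out of the mcd branch's attention
+ # mask before its own prefill.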
+ if is_prefill:
1338
+ if model_kwargs_oa is not None:
1339
+ model_inputs_oa.update({"output_attentions": True})
1340
+ assert model_inputs_oa.get("image_position", None) is None, "image_position is not None"
1341
+ outputs_oa = self(**model_inputs_oa, return_dict=True)
1342
+ attentions = outputs_oa.attentions
1343
+
1344
+ img_token_range = list(range(image_position[0], image_position[1]))
1345
+ text_token_range = list(range(image_position[1]+2, model_inputs_oa["inputs_embeds"].shape[1]))
1346
+ layer_attentions = []
1347
+
1348
+ for layer_att in attentions: # shape: [B, H, T, T]
1349
+ B, H, T, _ = layer_att.shape
1350
+
1351
+ # attention from text tokens → to image tokens
1352
+ text_indices = text_token_range
1353
+
1354
+ att = layer_att[:, :, text_indices, :][:, :, :, img_token_range] # shape: [B, H, T_text, T_img]
1355
+ att = att.mean(dim=2) # → [B, H, T_img]
1356
+
1357
+ # average over attention heads
1358
+ att = att.mean(dim=1) # [B, T_img]
1359
+ layer_attentions.append(att)
1360
+
1361
+ # average over layers
1362
+ img_attn_score = torch.stack(layer_attentions, dim=0).mean(dim=0)
1363
+ mask_num = int(len(img_token_range)*0.1)
1364
+ topk = torch.topk(img_attn_score, mask_num, largest=True)[1]
1365
+
1366
+ img_token_range_tensor = torch.tensor(img_token_range, device=img_attn_score.device)
1367
+ topk_input_idx = img_token_range_tensor[topk[0]] # B=1, so take the first (and only) row
1368
+
1369
+ # update the attention_mask of the mcd branch
1370
+ attention_mask = model_inputs_mcd["attention_mask"] # shape: [B, T]
1371
+ attention_mask[:, topk_input_idx] = 0
1372
+ model_inputs_mcd["attention_mask"] = attention_mask
1373
+
1374
+
1375
+ if is_prefill:
1376
+ model_inputs.update({"output_attentions": {}})
1377
+ # model_inputs.update({"output_attentions": True})
1378
+ outputs = self(**model_inputs, return_dict=True)
1379
+ if model_kwargs_icd is not None:
1380
+ outputs_icd = self(**model_inputs_icd, return_dict=True)
1381
+ if model_kwargs_lcd is not None:
1382
+ outputs_lcd = self(**model_inputs_lcd, return_dict=True)
1383
+ if model_kwargs_scd is not None:
1384
+ outputs_scd = self(**model_inputs_scd, return_dict=True)
1385
+ if model_kwargs_vcd is not None:
1386
+ outputs_vcd = self(**model_inputs_vcd, return_dict=True)
1387
+ if model_kwargs_mcd is not None:
1388
+ outputs_mcd = self(**model_inputs_mcd, return_dict=True)
1389
+ if model_kwargs_text is not None:
1390
+ outputs_text = self(**model_inputs_text, return_dict=True)
1391
+ if model_kwargs_sid is not None:
1392
+ outputs_sid = self(**model_inputs_sid, return_dict=True)
1393
+ is_prefill = False
1394
+ else:
1395
+ outputs = model_forward(**model_inputs, return_dict=True)
1396
+ if model_kwargs_icd is not None:
1397
+ outputs_icd = model_forward(**model_inputs_icd, return_dict=True)
1398
+ if model_kwargs_lcd is not None:
1399
+ outputs_lcd = model_forward(**model_inputs_lcd, return_dict=True)
1400
+ if model_kwargs_scd is not None:
1401
+ outputs_scd = model_forward(**model_inputs_scd, return_dict=True)
1402
+ if model_kwargs_vcd is not None:
1403
+ outputs_vcd = model_forward(**model_inputs_vcd, return_dict=True)
1404
+ if model_kwargs_mcd is not None:
1405
+ outputs_mcd = model_forward(**model_inputs_mcd, return_dict=True)
1406
+ if model_kwargs_text is not None:
1407
+ outputs_text = model_forward(**model_inputs_text, return_dict=True)
1408
+ if model_kwargs_sid is not None:
1409
+ outputs_sid = model_forward(**model_inputs_sid, return_dict=True)
1410
+
1411
+ # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping
1412
+ model_kwargs = self._update_model_kwargs_for_generation(
1413
+ outputs,
1414
+ model_kwargs,
1415
+ is_encoder_decoder=self.config.is_encoder_decoder,
1416
+ )
1417
+ if model_kwargs_icd is not None:
1418
+ model_kwargs_icd = self._update_model_kwargs_for_generation(
1419
+ outputs_icd,
1420
+ model_kwargs_icd,
1421
+ is_encoder_decoder=self.config.is_encoder_decoder,
1422
+ )
1423
+ if model_kwargs_lcd is not None:
1424
+ model_kwargs_lcd = self._update_model_kwargs_for_generation(
1425
+ outputs_lcd,
1426
+ model_kwargs_lcd,
1427
+ is_encoder_decoder=self.config.is_encoder_decoder,
1428
+ )
1429
+ if model_kwargs_scd is not None:
1430
+ model_kwargs_scd = self._update_model_kwargs_for_generation(
1431
+ outputs_scd,
1432
+ model_kwargs_scd,
1433
+ is_encoder_decoder=self.config.is_encoder_decoder,
1434
+ )
1435
+ if model_kwargs_vcd is not None:
1436
+ model_kwargs_vcd = self._update_model_kwargs_for_generation(
1437
+ outputs_vcd,
1438
+ model_kwargs_vcd,
1439
+ is_encoder_decoder=self.config.is_encoder_decoder,
1440
+ )
1441
+
1442
+ if model_kwargs_mcd is not None:
1443
+ model_kwargs_mcd = self._update_model_kwargs_for_generation(
1444
+ outputs_mcd,
1445
+ model_kwargs_mcd,
1446
+ is_encoder_decoder=self.config.is_encoder_decoder,
1447
+ )
1448
+
1449
+ if model_kwargs_text is not None:
1450
+ model_kwargs_text = self._update_model_kwargs_for_generation(
1451
+ outputs_text,
1452
+ model_kwargs_text,
1453
+ is_encoder_decoder=self.config.is_encoder_decoder,
1454
+ )
1455
+
1456
+ if model_kwargs_sid is not None:
1457
+ model_kwargs_sid = self._update_model_kwargs_for_generation(
1458
+ outputs_sid,
1459
+ model_kwargs_sid,
1460
+ is_encoder_decoder=self.config.is_encoder_decoder,
1461
+ )
1462
+
1463
+ if synced_gpus and this_peer_finished:
1464
+ continue
1465
+
1466
+ # Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration
1467
+ # (the clone itself is always small)
1468
+ next_token_logits = outputs.logits[:, -1, :].clone().float()
1469
+ next_token_logits = next_token_logits.to(input_ids.device)
1470
+
1471
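+ # Contrastive decoding against each active contrast branch:
+ #   logits_cd = (1 + cd_alpha) * logits_main - cd_alpha * logits_contrast
+ # with an adaptive plausibility constraint: tokens whose main-branch logit is below
+ # log(cd_beta) + max(logits_main) are suppressed by setting their contrasted logit to -inf.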
+ """icd"""
1472
+ if model_kwargs_icd is not None:
1473
+ cd_alpha = 1
1474
+ cd_beta = 0.1
1475
+
1476
+ next_token_logits_icd = outputs_icd.logits[:, -1, :].clone().float()
1477
+ next_token_logits_icd = next_token_logits_icd.to(input_ids.device)
1478
+ cutoff = torch.log(torch.tensor(cd_beta)) + next_token_logits.max(dim=-1, keepdim=True).values
1479
+
1480
+ diffs = (1+cd_alpha)*next_token_logits - cd_alpha*next_token_logits_icd
1481
+ icd_logits = diffs.masked_fill(next_token_logits < cutoff, -float("inf"))
1482
+ """lcd"""
1483
+ if model_kwargs_lcd is not None:
1484
+ cd_alpha = 1
1485
+ cd_beta = 0.5
1486
+
1487
+ next_token_logits_lcd = outputs_lcd.logits[:, -1, :].clone().float()
1488
+ next_token_logits_lcd = next_token_logits_lcd.to(input_ids.device)
1489
+ cutoff = torch.log(torch.tensor(cd_beta)) + next_token_logits.max(dim=-1, keepdim=True).values
1490
+
1491
+ diffs = (1+cd_alpha)*next_token_logits - cd_alpha*next_token_logits_lcd
1492
+ lcd_logits = diffs.masked_fill(next_token_logits < cutoff, -float("inf"))
1493
+ """scd"""
1494
+ if model_kwargs_scd is not None:
1495
+ cd_alpha = 1
1496
+ cd_beta = 0.5
1497
+
1498
+ next_token_logits_scd = outputs_scd.logits[:, -1, :].clone().float()
1499
+ next_token_logits_scd = next_token_logits_scd.to(input_ids.device)
1500
+ cutoff = torch.log(torch.tensor(cd_beta)) + next_token_logits.max(dim=-1, keepdim=True).values
1501
+
1502
+ diffs = (1+cd_alpha)*next_token_logits - cd_alpha*next_token_logits_scd
1503
+ scd_logits = diffs.masked_fill(next_token_logits < cutoff, -float("inf"))
1504
+
1505
+ """vcd"""
1506
+ if model_kwargs_vcd is not None:
1507
+ cd_alpha = 1
1508
+ cd_beta = 0.5
1509
+
1510
+ next_token_logits_vcd = outputs_vcd.logits[:, -1, :].clone().float()
1511
+ next_token_logits_vcd = next_token_logits_vcd.to(input_ids.device)
1512
+ cutoff = torch.log(torch.tensor(cd_beta)) + next_token_logits.max(dim=-1, keepdim=True).values
1513
+
1514
+ diffs = (1+cd_alpha)*next_token_logits - cd_alpha*next_token_logits_vcd
1515
+ vcd_logits = diffs.masked_fill(next_token_logits < cutoff, -float("inf"))
1516
+
1517
+ if model_kwargs_mcd is not None and model_kwargs_text is not None:
1518
+ cd_alpha = 1
1519
+ cd_beta = 0.5
1520
+
1521
+ next_token_logits_text = outputs_text.logits[:, -1, :].clone().float()
1522
+ next_token_logits_text = next_token_logits_text.to(input_ids.device)
1523
+ next_token_logits_mcd = outputs_mcd.logits[:, -1, :].clone().float()
1524
+ next_token_logits_mcd = next_token_logits_mcd.to(input_ids.device)
1525
+ cutoff = torch.log(torch.tensor(cd_beta)) + next_token_logits.max(dim=-1, keepdim=True).values
1526
+
1527
+ diffs = (1+cd_alpha)*next_token_logits - cd_alpha*next_token_logits_text
1528
+ diffs = diffs + 0.5*next_token_logits_mcd
1529
+ combine_logits = diffs.masked_fill(next_token_logits < cutoff, -float("inf"))
1530
+ else:
1531
+ """mcd"""
1532
+ if model_kwargs_mcd is not None:
1533
+ cd_alpha = 1
1534
+ cd_beta = 0.5
1535
+
1536
+ next_token_logits_mcd = outputs_mcd.logits[:, -1, :].clone().float()
1537
+ next_token_logits_mcd = next_token_logits_mcd.to(input_ids.device)
1538
+ cutoff = torch.log(torch.tensor(cd_beta)) + next_token_logits.max(dim=-1, keepdim=True).values
1539
+
1540
+ diffs = (1+cd_alpha)*next_token_logits - cd_alpha*next_token_logits_mcd
1541
+ # diffs = next_token_logits + 8.0*next_token_logits_mcd
1542
+ mcd_logits = diffs.masked_fill(next_token_logits < cutoff, -float("inf"))
1543
+
1544
+ """no vision"""
1545
+ if model_kwargs_text is not None:
1546
+ cd_alpha = 1
1547
+ cd_beta = 0.5
1548
+
1549
+ next_token_logits_text = outputs_text.logits[:, -1, :].clone().float()
1550
+ next_token_logits_text = next_token_logits_text.to(input_ids.device)
1551
+ cutoff = torch.log(torch.tensor(cd_beta)) + next_token_logits.max(dim=-1, keepdim=True).values
1552
+
1553
+ diffs = (1+cd_alpha)*next_token_logits - cd_alpha*next_token_logits_text
1554
+ text_logits = diffs.masked_fill(next_token_logits < cutoff, -float("inf"))
1555
+
1556
+ if model_kwargs_sid is not None:
1557
+ cd_alpha = 1
1558
+ cd_beta = 0.5
1559
+
1560
+ next_token_logits_sid = outputs_sid.logits[:, -1, :].clone().float()
1561
+ next_token_logits_sid = next_token_logits_sid.to(input_ids.device)
1562
+ cutoff = torch.log(torch.tensor(cd_beta)) + next_token_logits.max(dim=-1, keepdim=True).values
1563
+
1564
+ diffs = (1+cd_alpha)*next_token_logits - cd_alpha*next_token_logits_sid
1565
+ sid_logits = diffs.masked_fill(next_token_logits < cutoff, -float("inf"))
1566
+
1567
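+ # Collect the adjusted logits of whichever contrast strategy is active; exactly one is expected
+ # per run (hence the assert below). With no active branch, fall back to the plain logits.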
+ logits_list = []
1568
+ if model_kwargs_icd is not None:
1569
+ logits_list.append(icd_logits)
1570
+ if model_kwargs_lcd is not None:
1571
+ logits_list.append(lcd_logits)
1572
+ if model_kwargs_scd is not None:
1573
+ logits_list.append(scd_logits)
1574
+ if model_kwargs_vcd is not None:
1575
+ logits_list.append(vcd_logits)
1576
+ if model_kwargs_mcd is not None and model_kwargs_text is not None:
1577
+ logits_list.append(combine_logits)
1578
+ else:
1579
+ if model_kwargs_mcd is not None:
1580
+ logits_list.append(mcd_logits)
1581
+ if model_kwargs_text is not None:
1582
+ logits_list.append(text_logits)
1583
+ if model_kwargs_sid is not None:
1584
+ logits_list.append(sid_logits)
1585
+
1586
+ if len(logits_list)>0:
1587
+ assert len(logits_list) == 1
1588
+ cd_logits = sum(logits_list, torch.zeros_like(logits_list[0]))
1589
+ else:
1590
+ cd_logits = next_token_logits
1591
+
1592
+ # pre-process distribution
1593
+ # next_token_scores = logits_processor(input_ids, next_token_logits)
1594
+ next_token_scores = logits_processor(input_ids, cd_logits)
1595
+
1596
+ # Store scores, attentions and hidden_states when required
1597
+ if return_dict_in_generate:
1598
+ if output_scores:
1599
+ scores += (next_token_scores,)
1600
+ if output_logits:
1601
+ raw_logits += (next_token_logits,)
1602
+ if output_attentions:
1603
+ decoder_attentions += (
1604
+ (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
1605
+ )
1606
+ if self.config.is_encoder_decoder:
1607
+ cross_attentions += (outputs.cross_attentions,)
1608
+
1609
+ if output_hidden_states:
1610
+ decoder_hidden_states += (
1611
+ (outputs.decoder_hidden_states,)
1612
+ if self.config.is_encoder_decoder
1613
+ else (outputs.hidden_states,)
1614
+ )
1615
+
1616
+ # token selection
1617
+ if do_sample:
1618
+ probs = nn.functional.softmax(next_token_scores, dim=-1)
1619
+ # TODO (joao): this OP throws "skipping cudagraphs due to ['incompatible ops']", find solution
1620
+ next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
1621
+ else:
1622
+ next_tokens = torch.argmax(next_token_scores, dim=-1)
1623
+
1624
+ # finished sentences should have their next token be a padding token
1625
+ if has_eos_stopping_criteria:
1626
+ next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
1627
+
1628
+ # update generated ids, model inputs, and length for next step
1629
+ input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
1630
+ if streamer is not None:
1631
+ streamer.put(next_tokens.cpu())
1632
+
1633
+ unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores)
1634
+ this_peer_finished = unfinished_sequences.max() == 0
1635
+ cur_len += 1
1636
+
1637
+ # This is needed to properly delete outputs.logits which may be very large for first iteration
1638
+ # Otherwise a reference to outputs is kept which keeps the logits alive in the next iteration
1639
+ del outputs
1640
+
1641
+ if streamer is not None:
1642
+ streamer.end()
1643
+
1644
+ if return_dict_in_generate:
1645
+ if self.config.is_encoder_decoder:
1646
+ return GenerateEncoderDecoderOutput(
1647
+ sequences=input_ids,
1648
+ scores=scores,
1649
+ logits=raw_logits,
1650
+ encoder_attentions=encoder_attentions,
1651
+ encoder_hidden_states=encoder_hidden_states,
1652
+ decoder_attentions=decoder_attentions,
1653
+ cross_attentions=cross_attentions,
1654
+ decoder_hidden_states=decoder_hidden_states,
1655
+ past_key_values=model_kwargs.get("past_key_values"),
1656
+ )
1657
+ else:
1658
+ return GenerateDecoderOnlyOutput(
1659
+ sequences=input_ids,
1660
+ scores=scores,
1661
+ logits=raw_logits,
1662
+ attentions=decoder_attentions,
1663
+ hidden_states=decoder_hidden_states,
1664
+ past_key_values=model_kwargs.get("past_key_values"),
1665
+ )
1666
+ else:
1667
+ return input_ids
1668
+
1669
+
1670
+ @add_start_docstrings(
1671
+ """
1672
+ The Qwen2 Model transformer with a sequence classification head on top (linear layer).
1673
+
1674
+ [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1675
+ (e.g. GPT-2) do.
1676
+
1677
+ Since it does classification on the last token, it requires to know the position of the last token. If a
1678
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1679
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1680
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1681
+ each row of the batch).
1682
+ """,
1683
+ QWEN2_START_DOCSTRING,
1684
+ )
1685
+ class Qwen2ForSequenceClassification(Qwen2PreTrainedModel):
1686
+ def __init__(self, config):
1687
+ super().__init__(config)
1688
+ self.num_labels = config.num_labels
1689
+ self.model = Qwen2Model(config)
1690
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1691
+
1692
+ # Initialize weights and apply final processing
1693
+ self.post_init()
1694
+
1695
+ def get_input_embeddings(self):
1696
+ return self.model.embed_tokens
1697
+
1698
+ def set_input_embeddings(self, value):
1699
+ self.model.embed_tokens = value
1700
+
1701
+ @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
1702
+ def forward(
1703
+ self,
1704
+ input_ids: Optional[torch.LongTensor] = None,
1705
+ attention_mask: Optional[torch.Tensor] = None,
1706
+ position_ids: Optional[torch.LongTensor] = None,
1707
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
1708
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1709
+ labels: Optional[torch.LongTensor] = None,
1710
+ use_cache: Optional[bool] = None,
1711
+ output_attentions: Optional[bool] = None,
1712
+ output_hidden_states: Optional[bool] = None,
1713
+ return_dict: Optional[bool] = None,
1714
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1715
+ r"""
1716
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1717
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1718
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1719
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1720
+ """
1721
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1722
+
1723
+ transformer_outputs = self.model(
1724
+ input_ids,
1725
+ attention_mask=attention_mask,
1726
+ position_ids=position_ids,
1727
+ past_key_values=past_key_values,
1728
+ inputs_embeds=inputs_embeds,
1729
+ use_cache=use_cache,
1730
+ output_attentions=output_attentions,
1731
+ output_hidden_states=output_hidden_states,
1732
+ return_dict=return_dict,
1733
+ )
1734
+ hidden_states = transformer_outputs[0]
1735
+ logits = self.score(hidden_states)
1736
+
1737
+ if input_ids is not None:
1738
+ batch_size = input_ids.shape[0]
1739
+ else:
1740
+ batch_size = inputs_embeds.shape[0]
1741
+
1742
+ if self.config.pad_token_id is None and batch_size != 1:
1743
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
1744
+ if self.config.pad_token_id is None:
1745
+ last_non_pad_token = -1
1746
+ elif input_ids is not None:
1747
+ # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
1748
+ non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
1749
+ token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
1750
+ last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
1751
+ else:
1752
+ last_non_pad_token = -1
1753
+ logger.warning_once(
1754
+ f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
1755
+ "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
1756
+ )
1757
+
1758
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]
1759
+
1760
+ loss = None
1761
+ if labels is not None:
1762
+ loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)
1763
+
1764
+ if not return_dict:
1765
+ output = (pooled_logits,) + transformer_outputs[1:]
1766
+ return ((loss,) + output) if loss is not None else output
1767
+
1768
+ return SequenceClassifierOutputWithPast(
1769
+ loss=loss,
1770
+ logits=pooled_logits,
1771
+ past_key_values=transformer_outputs.past_key_values,
1772
+ hidden_states=transformer_outputs.hidden_states,
1773
+ attentions=transformer_outputs.attentions,
1774
+ )
1775
+
1776
+
1777
+ @add_start_docstrings(
1778
+ """
1779
+ The Qwen2 Model transformer with a token classification head on top (a linear layer on top of the hidden-states
1780
+ output) e.g. for Named-Entity-Recognition (NER) tasks.
1781
+ """,
1782
+ QWEN2_START_DOCSTRING,
1783
+ )
1784
+ class Qwen2ForTokenClassification(Qwen2PreTrainedModel):
1785
+ def __init__(self, config):
1786
+ super().__init__(config)
1787
+ self.num_labels = config.num_labels
1788
+ self.model = Qwen2Model(config)
1789
+ if getattr(config, "classifier_dropout", None) is not None:
1790
+ classifier_dropout = config.classifier_dropout
1791
+ elif getattr(config, "hidden_dropout", None) is not None:
1792
+ classifier_dropout = config.hidden_dropout
1793
+ else:
1794
+ classifier_dropout = 0.1
1795
+ self.dropout = nn.Dropout(classifier_dropout)
1796
+ self.score = nn.Linear(config.hidden_size, config.num_labels)
1797
+
1798
+ # Initialize weights and apply final processing
1799
+ self.post_init()
1800
+
1801
+ def get_input_embeddings(self):
1802
+ return self.model.embed_tokens
1803
+
1804
+ def set_input_embeddings(self, value):
1805
+ self.model.embed_tokens = value
1806
+
1807
+ @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
1808
+ @add_code_sample_docstrings(
1809
+ checkpoint=_CHECKPOINT_FOR_DOC,
1810
+ output_type=TokenClassifierOutput,
1811
+ config_class=_CONFIG_FOR_DOC,
1812
+ )
1813
+ def forward(
1814
+ self,
1815
+ input_ids: Optional[torch.LongTensor] = None,
1816
+ attention_mask: Optional[torch.Tensor] = None,
1817
+ position_ids: Optional[torch.LongTensor] = None,
1818
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1819
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1820
+ labels: Optional[torch.LongTensor] = None,
1821
+ use_cache: Optional[bool] = None,
1822
+ output_attentions: Optional[bool] = None,
1823
+ output_hidden_states: Optional[bool] = None,
1824
+ return_dict: Optional[bool] = None,
1825
+ ) -> Union[Tuple, TokenClassifierOutput]:
1826
+ r"""
1827
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1828
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1829
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1830
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1831
+ """
1832
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1833
+
1834
+ outputs = self.model(
1835
+ input_ids,
1836
+ attention_mask=attention_mask,
1837
+ position_ids=position_ids,
1838
+ past_key_values=past_key_values,
1839
+ inputs_embeds=inputs_embeds,
1840
+ use_cache=use_cache,
1841
+ output_attentions=output_attentions,
1842
+ output_hidden_states=output_hidden_states,
1843
+ return_dict=return_dict,
1844
+ )
1845
+ sequence_output = outputs[0]
1846
+ sequence_output = self.dropout(sequence_output)
1847
+ logits = self.score(sequence_output)
1848
+
1849
+ loss = None
1850
+ if labels is not None:
1851
+ loss = self.loss_function(logits, labels, self.config)
1852
+
1853
+ if not return_dict:
1854
+ output = (logits,) + outputs[2:]
1855
+ return ((loss,) + output) if loss is not None else output
1856
+
1857
+ return TokenClassifierOutput(
1858
+ loss=loss,
1859
+ logits=logits,
1860
+ hidden_states=outputs.hidden_states,
1861
+ attentions=outputs.attentions,
1862
+ )
1863
+
1864
+
1865
+ @add_start_docstrings(
1866
+ """
1867
+ The Qwen2 Model transformer with a span classification head on top for extractive question-answering tasks like
1868
+ SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
1869
+ """,
1870
+ QWEN2_START_DOCSTRING,
1871
+ )
1872
+ class Qwen2ForQuestionAnswering(Qwen2PreTrainedModel):
1873
+ base_model_prefix = "transformer"
1874
+
1875
+ def __init__(self, config):
1876
+ super().__init__(config)
1877
+ self.transformer = Qwen2Model(config)
1878
+ self.qa_outputs = nn.Linear(config.hidden_size, 2)
1879
+
1880
+ # Initialize weights and apply final processing
1881
+ self.post_init()
1882
+
1883
+ def get_input_embeddings(self):
1884
+ return self.transformer.embed_tokens
1885
+
1886
+ def set_input_embeddings(self, value):
1887
+ self.transformer.embed_tokens = value
1888
+
1889
+ @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
1890
+ def forward(
1891
+ self,
1892
+ input_ids: Optional[torch.LongTensor] = None,
1893
+ attention_mask: Optional[torch.FloatTensor] = None,
1894
+ position_ids: Optional[torch.LongTensor] = None,
1895
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
1896
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1897
+ start_positions: Optional[torch.LongTensor] = None,
1898
+ end_positions: Optional[torch.LongTensor] = None,
1899
+ output_attentions: Optional[bool] = None,
1900
+ output_hidden_states: Optional[bool] = None,
1901
+ return_dict: Optional[bool] = None,
1902
+ **kwargs,
1903
+ ) -> Union[Tuple, QuestionAnsweringModelOutput]:
1904
+ r"""
1905
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1906
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
1907
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
1908
+ are not taken into account for computing the loss.
1909
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1910
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
1911
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
1912
+ are not taken into account for computing the loss.
1913
+ """
1914
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1915
+
1916
+ outputs = self.transformer(
1917
+ input_ids,
1918
+ attention_mask=attention_mask,
1919
+ position_ids=position_ids,
1920
+ past_key_values=past_key_values,
1921
+ inputs_embeds=inputs_embeds,
1922
+ output_attentions=output_attentions,
1923
+ output_hidden_states=output_hidden_states,
1924
+ return_dict=return_dict,
1925
+ )
1926
+
1927
+ sequence_output = outputs[0]
1928
+
1929
+ logits = self.qa_outputs(sequence_output)
1930
+ start_logits, end_logits = logits.split(1, dim=-1)
1931
+ start_logits = start_logits.squeeze(-1).contiguous()
1932
+ end_logits = end_logits.squeeze(-1).contiguous()
1933
+
1934
+ loss = None
1935
+ if start_positions is not None and end_positions is not None:
1936
+ loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs)
1937
+
1938
+ if not return_dict:
1939
+ output = (start_logits, end_logits) + outputs[2:]
1940
+ return ((loss,) + output) if loss is not None else output
1941
+
1942
+ return QuestionAnsweringModelOutput(
1943
+ loss=loss,
1944
+ start_logits=start_logits,
1945
+ end_logits=end_logits,
1946
+ hidden_states=outputs.hidden_states,
1947
+ attentions=outputs.attentions,
1948
+ )
1949
+
1950
+
internvl3-8b-instruct-lora_epoch10_5e-6/preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
1
+ {
2
+ "crop_size": 448,
3
+ "do_center_crop": true,
4
+ "do_normalize": true,
5
+ "do_resize": true,
6
+ "feature_extractor_type": "CLIPFeatureExtractor",
7
+ "image_mean": [
8
+ 0.485,
9
+ 0.456,
10
+ 0.406
11
+ ],
12
+ "image_std": [
13
+ 0.229,
14
+ 0.224,
15
+ 0.225
16
+ ],
17
+ "resample": 3,
18
+ "size": 448
19
+ }
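For reference, these values describe a plain 448x448 CLIP-style pipeline with ImageNet statistics (`resample: 3` is bicubic). A minimal torchvision sketch that reproduces the listed resize / center-crop / normalize settings; the authoritative pipeline is whatever image processor the InternVL code builds from this file:

```python
from torchvision import transforms

preprocess = transforms.Compose([
    transforms.Resize(448, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(448),
    transforms.ToTensor(),                      # uint8 HWC -> float CHW in [0, 1]
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])
```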
internvl3-8b-instruct-lora_epoch10_5e-6/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>", "<|im_end|>", "<|object_ref_start|>", "<|object_ref_end|>",
+     "<|box_start|>", "<|box_end|>", "<|quad_start|>", "<|quad_end|>",
+     "<|vision_start|>", "<|vision_end|>", "<|vision_pad|>", "<|image_pad|>", "<|video_pad|>"
+   ],
+   "eos_token": {"content": "<|im_end|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false},
+   "pad_token": {"content": "<|endoftext|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false}
+ }
internvl3-8b-instruct-lora_epoch10_5e-6/tokenizer_config.json ADDED
@@ -0,0 +1,281 @@
+ {
+   "add_bos_token": false,
+   "add_eos_token": false,
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "151643": {"content": "<|endoftext|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "151644": {"content": "<|im_start|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "151645": {"content": "<|im_end|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "151646": {"content": "<|object_ref_start|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "151647": {"content": "<|object_ref_end|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "151648": {"content": "<|box_start|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "151649": {"content": "<|box_end|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "151650": {"content": "<|quad_start|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "151651": {"content": "<|quad_end|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "151652": {"content": "<|vision_start|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "151653": {"content": "<|vision_end|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "151654": {"content": "<|vision_pad|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "151655": {"content": "<|image_pad|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "151656": {"content": "<|video_pad|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "151657": {"content": "<tool_call>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
+     "151658": {"content": "</tool_call>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
+     "151659": {"content": "<|fim_prefix|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
+     "151660": {"content": "<|fim_middle|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
+     "151661": {"content": "<|fim_suffix|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
+     "151662": {"content": "<|fim_pad|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
+     "151663": {"content": "<|repo_name|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
+     "151664": {"content": "<|file_sep|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": false},
+     "151665": {"content": "<img>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "151666": {"content": "</img>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "151667": {"content": "<IMG_CONTEXT>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "151668": {"content": "<quad>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "151669": {"content": "</quad>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "151670": {"content": "<ref>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "151671": {"content": "</ref>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "151672": {"content": "<box>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "151673": {"content": "</box>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}
+   },
+   "additional_special_tokens": [
+     "<|im_start|>", "<|im_end|>", "<|object_ref_start|>", "<|object_ref_end|>",
+     "<|box_start|>", "<|box_end|>", "<|quad_start|>", "<|quad_end|>",
+     "<|vision_start|>", "<|vision_end|>", "<|vision_pad|>", "<|image_pad|>", "<|video_pad|>"
+   ],
+   "bos_token": null,
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "extra_special_tokens": {},
+   "model_max_length": 1000000,
+   "pad_token": "<|endoftext|>",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null
+ }
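This is a standard `Qwen2Tokenizer` configuration with the InternVL image tokens (`<img>`, `</img>`, `<IMG_CONTEXT>`, ...) registered on top and a ChatML-style chat template. A small usage sketch, assuming the folder above is available locally under the same name:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("internvl3-8b-instruct-lora_epoch10_5e-6")
messages = [{"role": "user", "content": "Describe the image."}]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)                                       # <|im_start|>system ... <|im_start|>assistant
print(tok.convert_tokens_to_ids("<IMG_CONTEXT>"))   # 151667, per added_tokens_decoder above
```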
internvl3-8b-instruct-lora_epoch10_5e-6/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
llava-ov-lora/preprocessor_config.json ADDED
@@ -0,0 +1,171 @@
+ {
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_pad": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_grid_pinpoints": [
+     [384, 384], [384, 768], [384, 1152], [384, 1536], [384, 1920], [384, 2304],
+     [768, 384], [768, 768], [768, 1152], [768, 1536], [768, 1920], [768, 2304],
+     [1152, 384], [1152, 768], [1152, 1152], [1152, 1536], [1152, 1920], [1152, 2304],
+     [1536, 384], [1536, 768], [1536, 1152], [1536, 1536], [1536, 1920], [1536, 2304],
+     [1920, 384], [1920, 768], [1920, 1152], [1920, 1536], [1920, 1920], [1920, 2304],
+     [2304, 384], [2304, 768], [2304, 1152], [2304, 1536], [2304, 1920], [2304, 2304]
+   ],
+   "image_mean": [0.5, 0.5, 0.5],
+   "image_processor_type": "LlavaOnevisionImageProcessor",
+   "image_std": [0.5, 0.5, 0.5],
+   "processor_class": "LlavaOnevisionProcessor",
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "size": {"height": 384, "width": 384}
+ }
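The `image_grid_pinpoints` above are the candidate any-resolution grids (all multiples of the 384x384 base tile). A hedged sketch of how such a grid is typically selected: pick the candidate that preserves the most effective resolution while wasting the least padding. The authoritative logic lives in `LlavaOnevisionImageProcessor`; this only illustrates the idea.

```python
def select_best_resolution(original_size, possible_resolutions):
    """Pick the (height, width) pinpoint that best fits an image of original_size."""
    orig_h, orig_w = original_size
    best_fit, max_effective, min_wasted = None, 0, float("inf")
    for h, w in possible_resolutions:
        scale = min(w / orig_w, h / orig_h)                  # fit without cropping
        effective = min(int(orig_w * scale) * int(orig_h * scale), orig_w * orig_h)
        wasted = h * w - effective                           # padded area
        if effective > max_effective or (effective == max_effective and wasted < min_wasted):
            best_fit, max_effective, min_wasted = (h, w), effective, wasted
    return best_fit

print(select_best_resolution((1080, 1920), [[384, 384], [768, 1536], [1152, 2304]]))  # (1152, 2304)
```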
llava-ov-lora/processor_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "image_token": "<image>",
+   "num_image_tokens": 729,
+   "processor_class": "LlavaOnevisionProcessor",
+   "video_token": "<video>",
+   "vision_feature_select_strategy": "full"
+ }
llava-ov-lora/special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "additional_special_tokens": ["<|im_start|>", "<|im_end|>"],
+   "eos_token": {"content": "<|im_end|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false},
+   "pad_token": {"content": "<|endoftext|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false}
+ }
llava-ov-lora/tokenizer_config.json ADDED
@@ -0,0 +1,65 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "151643": {"content": "<|endoftext|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "151644": {"content": "<|im_start|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "151645": {"content": "<|im_end|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "151646": {"content": "<image>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+     "151647": {"content": "<video>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}
+   },
+   "additional_special_tokens": ["<|im_start|>", "<|im_end|>"],
+   "bos_token": null,
+ "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "extra_special_tokens": {},
+   "max_length": null,
+   "model_max_length": 131072,
+   "pad_to_multiple_of": null,
+   "pad_token": "<|endoftext|>",
+   "pad_token_type_id": 0,
+   "padding_side": "right",
+   "processor_class": "LlavaOnevisionProcessor",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null
+ }
llava-ov-lora/video_processor/preprocessor_config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_pad": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_mean": [0.5, 0.5, 0.5],
+   "image_processor_type": "LlavaOnevisionVideoProcessor",
+   "image_std": [0.5, 0.5, 0.5],
+   "processor_class": "LlavaOnevisionProcessor",
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "size": {"height": 384, "width": 384}
+ }
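Same normalization family as the image processor above. A quick check of what these numbers do to a pixel value (assumption: inputs are uint8 frames):

```python
rescale_factor = 0.00392156862745098
assert abs(rescale_factor - 1 / 255) < 1e-12      # rescale is just a divide-by-255
normalize = lambda x: (x * rescale_factor - 0.5) / 0.5
print(normalize(0), normalize(255))               # -1.0, 1.0 -> frames end up in [-1, 1]
```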
llava-ov-lora/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/args.json ADDED
@@ -0,0 +1,375 @@
1
+ {
2
+ "output_dir": "/mnt/data/users/liamding/data/MMMT/lora/qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422",
3
+ "overwrite_output_dir": false,
4
+ "do_train": false,
5
+ "do_eval": false,
6
+ "do_predict": false,
7
+ "eval_strategy": "epoch",
8
+ "prediction_loss_only": false,
9
+ "per_device_train_batch_size": 2,
10
+ "per_device_eval_batch_size": 2,
11
+ "per_gpu_train_batch_size": null,
12
+ "per_gpu_eval_batch_size": null,
13
+ "gradient_accumulation_steps": 2,
14
+ "eval_accumulation_steps": null,
15
+ "eval_delay": 0,
16
+ "torch_empty_cache_steps": null,
17
+ "learning_rate": 2e-06,
18
+ "weight_decay": 0.0001,
19
+ "adam_beta1": 0.9,
20
+ "adam_beta2": 0.95,
21
+ "adam_epsilon": 1e-08,
22
+ "max_grad_norm": 1.0,
23
+ "num_train_epochs": 5.0,
24
+ "max_steps": -1,
25
+ "lr_scheduler_type": "cosine",
26
+ "lr_scheduler_kwargs": null,
27
+ "warmup_ratio": 0.1,
28
+ "warmup_steps": 0,
29
+ "log_level": "passive",
30
+ "log_level_replica": "warning",
31
+ "log_on_each_node": true,
32
+ "logging_dir": "/mnt/data/users/liamding/data/MMMT/lora/qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/runs",
33
+ "logging_strategy": "steps",
34
+ "logging_first_step": true,
35
+ "logging_steps": 5,
36
+ "logging_nan_inf_filter": true,
37
+ "save_strategy": "epoch",
38
+ "save_steps": 500,
39
+ "save_total_limit": 5,
40
+ "save_safetensors": true,
41
+ "save_on_each_node": false,
42
+ "save_only_model": false,
43
+ "restore_callback_states_from_checkpoint": false,
44
+ "no_cuda": false,
45
+ "use_cpu": false,
46
+ "use_mps_device": false,
47
+ "seed": 42,
48
+ "data_seed": 42,
49
+ "jit_mode_eval": false,
50
+ "use_ipex": false,
51
+ "bf16": true,
52
+ "fp16": false,
53
+ "fp16_opt_level": "O1",
54
+ "half_precision_backend": "auto",
55
+ "bf16_full_eval": false,
56
+ "fp16_full_eval": false,
57
+ "tf32": null,
58
+ "local_rank": 0,
59
+ "ddp_backend": null,
60
+ "tpu_num_cores": null,
61
+ "tpu_metrics_debug": false,
62
+ "debug": null,
63
+ "dataloader_drop_last": false,
64
+ "eval_steps": null,
65
+ "dataloader_num_workers": 4,
66
+ "dataloader_prefetch_factor": null,
67
+ "past_index": -1,
68
+ "run_name": "/mnt/data/users/liamding/data/MMMT/lora/qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422",
69
+ "disable_tqdm": null,
70
+ "remove_unused_columns": true,
71
+ "label_names": null,
72
+ "load_best_model_at_end": true,
73
+ "metric_for_best_model": "eval_loss",
74
+ "greater_is_better": false,
75
+ "ignore_data_skip": false,
76
+ "fsdp": "",
77
+ "fsdp_min_num_params": 0,
78
+ "fsdp_config": null,
79
+ "tp_size": 0,
80
+ "fsdp_transformer_layer_cls_to_wrap": null,
81
+ "accelerator_config": {
82
+ "dispatch_batches": false
83
+ },
84
+ "deepspeed": {
85
+ "fp16": {
86
+ "enabled": "auto",
87
+ "loss_scale": 0,
88
+ "loss_scale_window": 1000,
89
+ "initial_scale_power": 16,
90
+ "hysteresis": 2,
91
+ "min_loss_scale": 1
92
+ },
93
+ "bf16": {
94
+ "enabled": "auto"
95
+ },
96
+ "zero_optimization": {
97
+ "stage": 3,
98
+ "offload_optimizer": {
99
+ "device": "none",
100
+ "pin_memory": true
101
+ },
102
+ "offload_param": {
103
+ "device": "none",
104
+ "pin_memory": true
105
+ },
106
+ "overlap_comm": false,
107
+ "contiguous_gradients": true,
108
+ "sub_group_size": 1000000000.0,
109
+ "reduce_bucket_size": "auto",
110
+ "zero_quantized_weights": false,
111
+ "zero_quantized_gradients": false,
112
+ "stage3_prefetch_bucket_size": "auto",
113
+ "stage3_param_persistence_threshold": "auto",
114
+ "stage3_max_live_parameters": 1000000000.0,
115
+ "stage3_max_reuse_distance": 1000000000.0,
116
+ "stage3_gather_16bit_weights_on_model_save": true
117
+ },
118
+ "gradient_accumulation_steps": "auto",
119
+ "gradient_clipping": "auto",
120
+ "steps_per_print": 2000,
121
+ "train_batch_size": "auto",
122
+ "train_micro_batch_size_per_gpu": "auto",
123
+ "wall_clock_breakdown": false
124
+ },
125
+ "label_smoothing_factor": 0.0,
126
+ "optim": "adamw_torch",
127
+ "optim_args": null,
128
+ "adafactor": false,
129
+ "group_by_length": false,
130
+ "length_column_name": "length",
131
+ "report_to": [
132
+ "swanlab"
133
+ ],
134
+ "ddp_find_unused_parameters": null,
135
+ "ddp_bucket_cap_mb": null,
136
+ "ddp_broadcast_buffers": null,
137
+ "dataloader_pin_memory": true,
138
+ "dataloader_persistent_workers": false,
139
+ "skip_memory_metrics": true,
140
+ "use_legacy_prediction_loop": false,
141
+ "push_to_hub": false,
142
+ "resume_from_checkpoint": null,
143
+ "hub_model_id": null,
144
+ "hub_strategy": "every_save",
145
+ "hub_token": null,
146
+ "hub_private_repo": null,
147
+ "hub_always_push": false,
148
+ "gradient_checkpointing": true,
149
+ "gradient_checkpointing_kwargs": null,
150
+ "include_inputs_for_metrics": false,
151
+ "include_for_metrics": [],
152
+ "eval_do_concat_batches": true,
153
+ "fp16_backend": "auto",
154
+ "push_to_hub_model_id": null,
155
+ "push_to_hub_organization": null,
156
+ "push_to_hub_token": null,
157
+ "mp_parameters": "",
158
+ "auto_find_batch_size": false,
159
+ "full_determinism": false,
160
+ "torchdynamo": null,
161
+ "ray_scope": "last",
162
+ "ddp_timeout": 18000000,
163
+ "torch_compile": false,
164
+ "torch_compile_backend": null,
165
+ "torch_compile_mode": null,
166
+ "include_tokens_per_second": false,
167
+ "include_num_input_tokens_seen": false,
168
+ "neftune_noise_alpha": null,
169
+ "optim_target_modules": null,
170
+ "batch_eval_metrics": false,
171
+ "eval_on_start": false,
172
+ "use_liger_kernel": false,
173
+ "eval_use_gather_object": false,
174
+ "average_tokens_across_devices": false,
175
+ "sortish_sampler": false,
176
+ "predict_with_generate": false,
177
+ "generation_max_length": null,
178
+ "generation_num_beams": null,
179
+ "generation_config": null,
180
+ "vit_gradient_checkpointing": null,
181
+ "check_model": true,
182
+ "acc_strategy": "token",
183
+ "train_dataloader_shuffle": true,
184
+ "max_epochs": null,
185
+ "aligner_lr": null,
186
+ "vit_lr": null,
187
+ "optimizer": null,
188
+ "use_logits_to_keep": null,
189
+ "channels": null,
190
+ "metric_warmup_step": 0,
191
+ "fsdp_num": 1,
192
+ "acc_steps": 1,
193
+ "eval_use_evalscope": false,
194
+ "eval_datasets": [],
195
+ "eval_limit": null,
196
+ "eval_datasets_args": null,
197
+ "eval_generation_config": null,
198
+ "model": "/mnt/data/users/liamding/data/models/Qwen2.5-VL-7B-Instruct",
199
+ "model_type": "qwen2_5_vl",
200
+ "model_revision": null,
201
+ "task_type": "causal_lm",
202
+ "torch_dtype": "bfloat16",
203
+ "attn_impl": null,
204
+ "num_labels": null,
205
+ "problem_type": null,
206
+ "rope_scaling": null,
207
+ "device_map": null,
208
+ "max_memory": {},
209
+ "local_repo_path": null,
210
+ "init_strategy": null,
211
+ "template": "qwen2_5_vl",
212
+ "system": null,
213
+ "max_length": 32768,
214
+ "truncation_strategy": "delete",
215
+ "max_pixels": null,
216
+ "agent_template": null,
217
+ "norm_bbox": null,
218
+ "use_chat_template": true,
219
+ "padding_free": false,
220
+ "padding_side": "right",
221
+ "loss_scale": "default",
222
+ "sequence_parallel_size": 1,
223
+ "response_prefix": null,
224
+ "template_backend": "swift",
225
+ "dataset": [
226
+ "/mnt/data/users/liamding/data/3AM_Plus/final/training/qvq-thinking_answer/ambi_normal_train_thinking_772.json",
227
+ "/mnt/data/users/liamding/data/3AM_Plus/final/training/qvq-thinking_answer/mma_train_thinking_126.json",
228
+ "/mnt/data/users/liamding/data/3AM_Plus/final/training/qvq-thinking_answer/sp_train_thinking_102.json"
229
+ ],
230
+ "val_dataset": [],
231
+ "split_dataset_ratio": 0.1,
232
+ "dataset_num_proc": 1,
233
+ "load_from_cache_file": true,
234
+ "dataset_shuffle": true,
235
+ "val_dataset_shuffle": false,
236
+ "streaming": false,
237
+ "interleave_prob": null,
238
+ "stopping_strategy": "first_exhausted",
239
+ "shuffle_buffer_size": 1000,
240
+ "download_mode": "reuse_dataset_if_exists",
241
+ "columns": {},
242
+ "strict": false,
243
+ "model_name": null,
244
+ "model_author": null,
245
+ "custom_dataset_info": [],
246
+ "quant_method": null,
247
+ "quant_bits": null,
248
+ "hqq_axis": null,
249
+ "bnb_4bit_compute_dtype": "bfloat16",
250
+ "bnb_4bit_quant_type": "nf4",
251
+ "bnb_4bit_use_double_quant": true,
252
+ "bnb_4bit_quant_storage": null,
253
+ "max_new_tokens": 64,
254
+ "temperature": 0.0,
255
+ "top_k": null,
256
+ "top_p": null,
257
+ "repetition_penalty": null,
258
+ "num_beams": 1,
259
+ "stream": false,
260
+ "stop_words": [],
261
+ "logprobs": false,
262
+ "top_logprobs": null,
263
+ "ckpt_dir": null,
264
+ "lora_modules": [],
265
+ "tuner_backend": "peft",
266
+ "train_type": "full",
267
+ "adapters": [],
268
+ "external_plugins": [],
269
+ "model_kwargs": {},
270
+ "load_args": false,
271
+ "load_data_args": false,
272
+ "packing": false,
273
+ "packing_cache": null,
274
+ "custom_register_path": [],
275
+ "use_hf": false,
276
+ "ignore_args_error": false,
277
+ "use_swift_lora": false,
278
+ "freeze_parameters": [
279
+ "visual",
280
+ "visual.merger"
281
+ ],
282
+ "freeze_parameters_regex": null,
283
+ "freeze_parameters_ratio": 0.0,
284
+ "trainable_parameters": [],
285
+ "trainable_parameters_regex": null,
286
+ "freeze_llm": false,
287
+ "freeze_vit": true,
288
+ "freeze_aligner": true,
289
+ "target_modules": [
290
+ "all-linear"
291
+ ],
292
+ "target_regex": null,
293
+ "modules_to_save": [],
294
+ "lora_rank": 8,
295
+ "lora_alpha": 32,
296
+ "lora_dropout": 0.05,
297
+ "lora_bias": "none",
298
+ "lora_dtype": null,
299
+ "lorap_lr_ratio": null,
300
+ "use_rslora": false,
301
+ "use_dora": false,
302
+ "lora_ga_batch_size": 2,
303
+ "lora_ga_iters": 2,
304
+ "lora_ga_max_length": 1024,
305
+ "lora_ga_direction": "ArB2r",
306
+ "lora_ga_scale": "stable",
307
+ "lora_ga_stable_gamma": 16,
308
+ "init_weights": true,
309
+ "fourier_n_frequency": 2000,
310
+ "fourier_scaling": 300.0,
311
+ "boft_block_size": 4,
312
+ "boft_block_num": 0,
313
+ "boft_n_butterfly_factor": 1,
314
+ "boft_dropout": 0.0,
315
+ "vera_rank": 256,
316
+ "vera_projection_prng_key": 0,
317
+ "vera_dropout": 0.0,
318
+ "vera_d_initial": 0.1,
319
+ "adapter_act": "gelu",
320
+ "adapter_length": 128,
321
+ "use_galore": false,
322
+ "galore_target_modules": null,
323
+ "galore_rank": 128,
324
+ "galore_update_proj_gap": 50,
325
+ "galore_scale": 1.0,
326
+ "galore_proj_type": "std",
327
+ "galore_optim_per_parameter": false,
328
+ "galore_with_embedding": false,
329
+ "galore_quantization": false,
330
+ "galore_proj_quant": false,
331
+ "galore_proj_bits": 4,
332
+ "galore_proj_group_size": 256,
333
+ "galore_cos_threshold": 0.4,
334
+ "galore_gamma_proj": 2,
335
+ "galore_queue_size": 5,
336
+ "adalora_target_r": 8,
337
+ "adalora_init_r": 12,
338
+ "adalora_tinit": 0,
339
+ "adalora_tfinal": 0,
340
+ "adalora_deltaT": 1,
341
+ "adalora_beta1": 0.85,
342
+ "adalora_beta2": 0.85,
343
+ "adalora_orth_reg_weight": 0.5,
344
+ "llamapro_num_new_blocks": 4,
345
+ "llamapro_num_groups": null,
346
+ "lisa_activated_layers": 0,
347
+ "lisa_step_interval": 20,
348
+ "reft_layer_key": null,
349
+ "reft_layers": null,
350
+ "reft_rank": 4,
351
+ "reft_intervention_type": "LoreftIntervention",
352
+ "reft_args": null,
353
+ "swanlab_token": null,
354
+ "swanlab_project": null,
355
+ "swanlab_workspace": null,
356
+ "swanlab_exp_name": "/mnt/data/users/liamding/data/MMMT/lora/qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422",
357
+ "swanlab_mode": "cloud",
358
+ "add_version": true,
359
+ "resume_only_model": false,
360
+ "create_checkpoint_symlink": false,
361
+ "lazy_tokenize": true,
362
+ "loss_type": null,
363
+ "metric": null,
364
+ "zero_hpz_partition_size": null,
365
+ "rank": 0,
366
+ "global_world_size": 4,
367
+ "local_world_size": 4,
368
+ "model_suffix": "Qwen2.5-VL-7B-Instruct",
369
+ "model_info": "ModelInfo(model_type='qwen2_5_vl', model_dir='/mnt/data/users/liamding/data/models/Qwen2.5-VL-7B-Instruct', torch_dtype=torch.bfloat16, max_model_len=128000, quant_method=None, quant_bits=None, rope_scaling={'type': 'default', 'mrope_section': [16, 24, 24], 'rope_type': 'default'}, config=None, task_type='causal_lm', num_labels=None)",
370
+ "model_meta": "ModelMeta(model_type='qwen2_5_vl', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5_vl', get_function=<function get_model_tokenizer_qwen2_5_vl at 0x7f93b21f0550>, model_arch='qwen2_vl', architectures=['Qwen2_5_VLForConditionalGeneration'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.49', 'qwen_vl_utils>=0.0.6', 'decord'], tags=[])",
371
+ "model_dir": "/mnt/data/users/liamding/data/models/Qwen2.5-VL-7B-Instruct",
372
+ "hub": "<class 'swift.hub.hub.MSHub'>",
373
+ "evaluation_strategy": "epoch",
374
+ "training_args": "Seq2SeqTrainingArguments(output_dir='/mnt/data/users/liamding/data/MMMT/lora/qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=2, per_device_eval_batch_size=2, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=2e-06, weight_decay=0.0001, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.1, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/mnt/data/users/liamding/data/MMMT/lora/qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=5, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=None, dataloader_num_workers=4, dataloader_prefetch_factor=10, past_index=-1, run_name='/mnt/data/users/liamding/data/MMMT/lora/qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=True, metric_for_best_model='eval_loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, tp_size=0, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['swanlab'], 
ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, vit_gradient_checkpointing=True, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, optimizer=None, use_logits_to_keep=None, channels=None, metric_warmup_step=0, fsdp_num=1, acc_steps=1, eval_use_evalscope=False, eval_datasets=[], eval_limit=None, eval_datasets_args=None, eval_generation_config=None, train_type='full', local_repo_path=None, galore_config=None)"
375
+ }
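A rough consistency check of the run length these arguments imply. Assumptions: the trailing numbers in the dataset filenames (772 / 126 / 102) are sample counts, and `split_dataset_ratio: 0.1` carves the validation split out of that pool.

```python
import math

per_device_bs, grad_accum, world_size = 2, 2, 4
effective_bs = per_device_bs * grad_accum * world_size      # 16 samples per optimizer step
train_samples = round((772 + 126 + 102) * (1 - 0.1))        # ~900 after the 10% val split
steps_per_epoch = math.ceil(train_samples / effective_bs)   # ~57
total_steps = steps_per_epoch * 5                           # ~285 for num_train_epochs=5
warmup_steps = int(0.1 * total_steps)                       # ~28, from warmup_ratio=0.1
print(effective_bs, steps_per_epoch, total_steps, warmup_steps)
```

That lands in the same ballpark as the `checkpoint-280` folder saved below.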
qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/added_tokens.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "</tool_call>": 151658,
+   "<tool_call>": 151657,
+   "<|box_end|>": 151649,
+   "<|box_start|>": 151648,
+   "<|endoftext|>": 151643,
+   "<|file_sep|>": 151664,
+   "<|fim_middle|>": 151660,
+   "<|fim_pad|>": 151662,
+   "<|fim_prefix|>": 151659,
+   "<|fim_suffix|>": 151661,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|image_pad|>": 151655,
+   "<|object_ref_end|>": 151647,
+   "<|object_ref_start|>": 151646,
+   "<|quad_end|>": 151651,
+   "<|quad_start|>": 151650,
+   "<|repo_name|>": 151663,
+   "<|video_pad|>": 151656,
+   "<|vision_end|>": 151653,
+   "<|vision_pad|>": 151654,
+   "<|vision_start|>": 151652
+ }
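Small sanity check on the added-token table (path assumed relative to this repository layout): the 22 extra tokens occupy the contiguous id range 151643-151664 on top of the base Qwen2 vocabulary.

```python
import json

path = "qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/added_tokens.json"
with open(path) as f:
    added = json.load(f)
ids = sorted(added.values())
assert len(ids) == 22 and ids == list(range(151643, 151665))
print(min(added, key=added.get), max(added, key=added.get))  # "<|endoftext|>", "<|file_sep|>"
```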
qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/args.json ADDED
@@ -0,0 +1,375 @@
1
+ {
2
+ "output_dir": "/mnt/data/users/liamding/data/MMMT/lora/qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422",
3
+ "overwrite_output_dir": false,
4
+ "do_train": false,
5
+ "do_eval": false,
6
+ "do_predict": false,
7
+ "eval_strategy": "epoch",
8
+ "prediction_loss_only": false,
9
+ "per_device_train_batch_size": 2,
10
+ "per_device_eval_batch_size": 2,
11
+ "per_gpu_train_batch_size": null,
12
+ "per_gpu_eval_batch_size": null,
13
+ "gradient_accumulation_steps": 2,
14
+ "eval_accumulation_steps": null,
15
+ "eval_delay": 0,
16
+ "torch_empty_cache_steps": null,
17
+ "learning_rate": 2e-06,
18
+ "weight_decay": 0.0001,
19
+ "adam_beta1": 0.9,
20
+ "adam_beta2": 0.95,
21
+ "adam_epsilon": 1e-08,
22
+ "max_grad_norm": 1.0,
23
+ "num_train_epochs": 5.0,
24
+ "max_steps": -1,
25
+ "lr_scheduler_type": "cosine",
26
+ "lr_scheduler_kwargs": null,
27
+ "warmup_ratio": 0.1,
28
+ "warmup_steps": 0,
29
+ "log_level": "passive",
30
+ "log_level_replica": "warning",
31
+ "log_on_each_node": true,
32
+ "logging_dir": "/mnt/data/users/liamding/data/MMMT/lora/qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/runs",
33
+ "logging_strategy": "steps",
34
+ "logging_first_step": true,
35
+ "logging_steps": 5,
36
+ "logging_nan_inf_filter": true,
37
+ "save_strategy": "epoch",
38
+ "save_steps": 500,
39
+ "save_total_limit": 5,
40
+ "save_safetensors": true,
41
+ "save_on_each_node": false,
42
+ "save_only_model": false,
43
+ "restore_callback_states_from_checkpoint": false,
44
+ "no_cuda": false,
45
+ "use_cpu": false,
46
+ "use_mps_device": false,
47
+ "seed": 42,
48
+ "data_seed": 42,
49
+ "jit_mode_eval": false,
50
+ "use_ipex": false,
51
+ "bf16": true,
52
+ "fp16": false,
53
+ "fp16_opt_level": "O1",
54
+ "half_precision_backend": "auto",
55
+ "bf16_full_eval": false,
56
+ "fp16_full_eval": false,
57
+ "tf32": null,
58
+ "local_rank": 0,
59
+ "ddp_backend": null,
60
+ "tpu_num_cores": null,
61
+ "tpu_metrics_debug": false,
62
+ "debug": null,
63
+ "dataloader_drop_last": false,
64
+ "eval_steps": null,
65
+ "dataloader_num_workers": 4,
66
+ "dataloader_prefetch_factor": null,
67
+ "past_index": -1,
68
+ "run_name": "/mnt/data/users/liamding/data/MMMT/lora/qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422",
69
+ "disable_tqdm": null,
70
+ "remove_unused_columns": true,
71
+ "label_names": null,
72
+ "load_best_model_at_end": true,
73
+ "metric_for_best_model": "eval_loss",
74
+ "greater_is_better": false,
75
+ "ignore_data_skip": false,
76
+ "fsdp": "",
77
+ "fsdp_min_num_params": 0,
78
+ "fsdp_config": null,
79
+ "tp_size": 0,
80
+ "fsdp_transformer_layer_cls_to_wrap": null,
81
+ "accelerator_config": {
82
+ "dispatch_batches": false
83
+ },
84
+ "deepspeed": {
85
+ "fp16": {
86
+ "enabled": "auto",
87
+ "loss_scale": 0,
88
+ "loss_scale_window": 1000,
89
+ "initial_scale_power": 16,
90
+ "hysteresis": 2,
91
+ "min_loss_scale": 1
92
+ },
93
+ "bf16": {
94
+ "enabled": "auto"
95
+ },
96
+ "zero_optimization": {
97
+ "stage": 3,
98
+ "offload_optimizer": {
99
+ "device": "none",
100
+ "pin_memory": true
101
+ },
102
+ "offload_param": {
103
+ "device": "none",
104
+ "pin_memory": true
105
+ },
106
+ "overlap_comm": false,
107
+ "contiguous_gradients": true,
108
+ "sub_group_size": 1000000000.0,
109
+ "reduce_bucket_size": "auto",
110
+ "zero_quantized_weights": false,
111
+ "zero_quantized_gradients": false,
112
+ "stage3_prefetch_bucket_size": "auto",
113
+ "stage3_param_persistence_threshold": "auto",
114
+ "stage3_max_live_parameters": 1000000000.0,
115
+ "stage3_max_reuse_distance": 1000000000.0,
116
+ "stage3_gather_16bit_weights_on_model_save": true
117
+ },
118
+ "gradient_accumulation_steps": "auto",
119
+ "gradient_clipping": "auto",
120
+ "steps_per_print": 2000,
121
+ "train_batch_size": "auto",
122
+ "train_micro_batch_size_per_gpu": "auto",
123
+ "wall_clock_breakdown": false
124
+ },
125
+ "label_smoothing_factor": 0.0,
126
+ "optim": "adamw_torch",
127
+ "optim_args": null,
128
+ "adafactor": false,
129
+ "group_by_length": false,
130
+ "length_column_name": "length",
131
+ "report_to": [
132
+ "swanlab"
133
+ ],
134
+ "ddp_find_unused_parameters": null,
135
+ "ddp_bucket_cap_mb": null,
136
+ "ddp_broadcast_buffers": null,
137
+ "dataloader_pin_memory": true,
138
+ "dataloader_persistent_workers": false,
139
+ "skip_memory_metrics": true,
140
+ "use_legacy_prediction_loop": false,
141
+ "push_to_hub": false,
142
+ "resume_from_checkpoint": null,
143
+ "hub_model_id": null,
144
+ "hub_strategy": "every_save",
145
+ "hub_token": null,
146
+ "hub_private_repo": null,
147
+ "hub_always_push": false,
148
+ "gradient_checkpointing": true,
149
+ "gradient_checkpointing_kwargs": null,
150
+ "include_inputs_for_metrics": false,
151
+ "include_for_metrics": [],
152
+ "eval_do_concat_batches": true,
153
+ "fp16_backend": "auto",
154
+ "push_to_hub_model_id": null,
155
+ "push_to_hub_organization": null,
156
+ "push_to_hub_token": null,
157
+ "mp_parameters": "",
158
+ "auto_find_batch_size": false,
159
+ "full_determinism": false,
160
+ "torchdynamo": null,
161
+ "ray_scope": "last",
162
+ "ddp_timeout": 18000000,
163
+ "torch_compile": false,
164
+ "torch_compile_backend": null,
165
+ "torch_compile_mode": null,
166
+ "include_tokens_per_second": false,
167
+ "include_num_input_tokens_seen": false,
168
+ "neftune_noise_alpha": null,
169
+ "optim_target_modules": null,
170
+ "batch_eval_metrics": false,
171
+ "eval_on_start": false,
172
+ "use_liger_kernel": false,
173
+ "eval_use_gather_object": false,
174
+ "average_tokens_across_devices": false,
175
+ "sortish_sampler": false,
176
+ "predict_with_generate": false,
177
+ "generation_max_length": null,
178
+ "generation_num_beams": null,
179
+ "generation_config": null,
180
+ "vit_gradient_checkpointing": null,
181
+ "check_model": true,
182
+ "acc_strategy": "token",
183
+ "train_dataloader_shuffle": true,
184
+ "max_epochs": null,
185
+ "aligner_lr": null,
186
+ "vit_lr": null,
187
+ "optimizer": null,
188
+ "use_logits_to_keep": null,
189
+ "channels": null,
190
+ "metric_warmup_step": 0,
191
+ "fsdp_num": 1,
192
+ "acc_steps": 1,
193
+ "eval_use_evalscope": false,
194
+ "eval_datasets": [],
195
+ "eval_limit": null,
196
+ "eval_datasets_args": null,
197
+ "eval_generation_config": null,
198
+ "model": "/mnt/data/users/liamding/data/models/Qwen2.5-VL-7B-Instruct",
199
+ "model_type": "qwen2_5_vl",
200
+ "model_revision": null,
201
+ "task_type": "causal_lm",
202
+ "torch_dtype": "bfloat16",
203
+ "attn_impl": null,
204
+ "num_labels": null,
205
+ "problem_type": null,
206
+ "rope_scaling": null,
207
+ "device_map": null,
208
+ "max_memory": {},
209
+ "local_repo_path": null,
210
+ "init_strategy": null,
211
+ "template": "qwen2_5_vl",
212
+ "system": null,
213
+ "max_length": 32768,
214
+ "truncation_strategy": "delete",
215
+ "max_pixels": null,
216
+ "agent_template": null,
217
+ "norm_bbox": null,
218
+ "use_chat_template": true,
219
+ "padding_free": false,
220
+ "padding_side": "right",
221
+ "loss_scale": "default",
222
+ "sequence_parallel_size": 1,
223
+ "response_prefix": null,
224
+ "template_backend": "swift",
225
+ "dataset": [
226
+ "/mnt/data/users/liamding/data/3AM_Plus/final/training/qvq-thinking_answer/ambi_normal_train_thinking_772.json",
227
+ "/mnt/data/users/liamding/data/3AM_Plus/final/training/qvq-thinking_answer/mma_train_thinking_126.json",
228
+ "/mnt/data/users/liamding/data/3AM_Plus/final/training/qvq-thinking_answer/sp_train_thinking_102.json"
229
+ ],
230
+ "val_dataset": [],
231
+ "split_dataset_ratio": 0.1,
232
+ "dataset_num_proc": 1,
233
+ "load_from_cache_file": true,
234
+ "dataset_shuffle": true,
235
+ "val_dataset_shuffle": false,
236
+ "streaming": false,
237
+ "interleave_prob": null,
238
+ "stopping_strategy": "first_exhausted",
239
+ "shuffle_buffer_size": 1000,
240
+ "download_mode": "reuse_dataset_if_exists",
241
+ "columns": {},
242
+ "strict": false,
243
+ "model_name": null,
244
+ "model_author": null,
245
+ "custom_dataset_info": [],
246
+ "quant_method": null,
247
+ "quant_bits": null,
248
+ "hqq_axis": null,
249
+ "bnb_4bit_compute_dtype": "bfloat16",
250
+ "bnb_4bit_quant_type": "nf4",
251
+ "bnb_4bit_use_double_quant": true,
252
+ "bnb_4bit_quant_storage": null,
253
+ "max_new_tokens": 64,
254
+ "temperature": 0.0,
255
+ "top_k": null,
256
+ "top_p": null,
257
+ "repetition_penalty": null,
258
+ "num_beams": 1,
259
+ "stream": false,
260
+ "stop_words": [],
261
+ "logprobs": false,
262
+ "top_logprobs": null,
263
+ "ckpt_dir": null,
264
+ "lora_modules": [],
265
+ "tuner_backend": "peft",
266
+ "train_type": "full",
267
+ "adapters": [],
268
+ "external_plugins": [],
269
+ "model_kwargs": {},
270
+ "load_args": false,
271
+ "load_data_args": false,
272
+ "packing": false,
273
+ "packing_cache": null,
274
+ "custom_register_path": [],
275
+ "use_hf": false,
276
+ "ignore_args_error": false,
277
+ "use_swift_lora": false,
278
+ "freeze_parameters": [
279
+ "visual",
280
+ "visual.merger"
281
+ ],
282
+ "freeze_parameters_regex": null,
283
+ "freeze_parameters_ratio": 0.0,
284
+ "trainable_parameters": [],
285
+ "trainable_parameters_regex": null,
286
+ "freeze_llm": false,
287
+ "freeze_vit": true,
288
+ "freeze_aligner": true,
289
+ "target_modules": [
290
+ "all-linear"
291
+ ],
292
+ "target_regex": null,
293
+ "modules_to_save": [],
294
+ "lora_rank": 8,
295
+ "lora_alpha": 32,
296
+ "lora_dropout": 0.05,
297
+ "lora_bias": "none",
298
+ "lora_dtype": null,
299
+ "lorap_lr_ratio": null,
300
+ "use_rslora": false,
301
+ "use_dora": false,
302
+ "lora_ga_batch_size": 2,
303
+ "lora_ga_iters": 2,
304
+ "lora_ga_max_length": 1024,
305
+ "lora_ga_direction": "ArB2r",
306
+ "lora_ga_scale": "stable",
307
+ "lora_ga_stable_gamma": 16,
308
+ "init_weights": true,
309
+ "fourier_n_frequency": 2000,
310
+ "fourier_scaling": 300.0,
311
+ "boft_block_size": 4,
312
+ "boft_block_num": 0,
313
+ "boft_n_butterfly_factor": 1,
314
+ "boft_dropout": 0.0,
315
+ "vera_rank": 256,
316
+ "vera_projection_prng_key": 0,
317
+ "vera_dropout": 0.0,
318
+ "vera_d_initial": 0.1,
319
+ "adapter_act": "gelu",
320
+ "adapter_length": 128,
321
+ "use_galore": false,
322
+ "galore_target_modules": null,
323
+ "galore_rank": 128,
324
+ "galore_update_proj_gap": 50,
325
+ "galore_scale": 1.0,
326
+ "galore_proj_type": "std",
327
+ "galore_optim_per_parameter": false,
328
+ "galore_with_embedding": false,
329
+ "galore_quantization": false,
330
+ "galore_proj_quant": false,
331
+ "galore_proj_bits": 4,
332
+ "galore_proj_group_size": 256,
333
+ "galore_cos_threshold": 0.4,
334
+ "galore_gamma_proj": 2,
335
+ "galore_queue_size": 5,
336
+ "adalora_target_r": 8,
337
+ "adalora_init_r": 12,
338
+ "adalora_tinit": 0,
339
+ "adalora_tfinal": 0,
340
+ "adalora_deltaT": 1,
341
+ "adalora_beta1": 0.85,
342
+ "adalora_beta2": 0.85,
343
+ "adalora_orth_reg_weight": 0.5,
344
+ "llamapro_num_new_blocks": 4,
345
+ "llamapro_num_groups": null,
346
+ "lisa_activated_layers": 0,
347
+ "lisa_step_interval": 20,
348
+ "reft_layer_key": null,
349
+ "reft_layers": null,
350
+ "reft_rank": 4,
351
+ "reft_intervention_type": "LoreftIntervention",
352
+ "reft_args": null,
353
+ "swanlab_token": null,
354
+ "swanlab_project": null,
355
+ "swanlab_workspace": null,
356
+ "swanlab_exp_name": "/mnt/data/users/liamding/data/MMMT/lora/qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422",
357
+ "swanlab_mode": "cloud",
358
+ "add_version": true,
359
+ "resume_only_model": false,
360
+ "create_checkpoint_symlink": false,
361
+ "lazy_tokenize": true,
362
+ "loss_type": null,
363
+ "metric": null,
364
+ "zero_hpz_partition_size": null,
365
+ "rank": 0,
366
+ "global_world_size": 4,
367
+ "local_world_size": 4,
368
+ "model_suffix": "Qwen2.5-VL-7B-Instruct",
369
+ "model_info": "ModelInfo(model_type='qwen2_5_vl', model_dir='/mnt/data/users/liamding/data/models/Qwen2.5-VL-7B-Instruct', torch_dtype=torch.bfloat16, max_model_len=128000, quant_method=None, quant_bits=None, rope_scaling={'type': 'default', 'mrope_section': [16, 24, 24], 'rope_type': 'default'}, config=None, task_type='causal_lm', num_labels=None)",
370
+ "model_meta": "ModelMeta(model_type='qwen2_5_vl', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5_vl', get_function=<function get_model_tokenizer_qwen2_5_vl at 0x7f93b21f0550>, model_arch='qwen2_vl', architectures=['Qwen2_5_VLForConditionalGeneration'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.49', 'qwen_vl_utils>=0.0.6', 'decord'], tags=[])",
371
+ "model_dir": "/mnt/data/users/liamding/data/models/Qwen2.5-VL-7B-Instruct",
372
+ "hub": "<class 'swift.hub.hub.MSHub'>",
373
+ "evaluation_strategy": "epoch",
374
+ "training_args": "Seq2SeqTrainingArguments(output_dir='/mnt/data/users/liamding/data/MMMT/lora/qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=2, per_device_eval_batch_size=2, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=2e-06, weight_decay=0.0001, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.1, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/mnt/data/users/liamding/data/MMMT/lora/qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=5, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=None, dataloader_num_workers=4, dataloader_prefetch_factor=10, past_index=-1, run_name='/mnt/data/users/liamding/data/MMMT/lora/qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=True, metric_for_best_model='eval_loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, tp_size=0, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['swanlab'], 
ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, vit_gradient_checkpointing=True, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, optimizer=None, use_logits_to_keep=None, channels=None, metric_warmup_step=0, fsdp_num=1, acc_steps=1, eval_use_evalscope=False, eval_datasets=[], eval_limit=None, eval_datasets_args=None, eval_generation_config=None, train_type='full', local_repo_path=None, galore_config=None)"
375
+ }
qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/chat_template.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
3
+ }
qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/config.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2_5_VLForConditionalGeneration"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 151643,
7
+ "eos_token_id": 151645,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 3584,
10
+ "image_token_id": 151655,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 18944,
13
+ "max_position_embeddings": 128000,
14
+ "max_window_layers": 28,
15
+ "model_type": "qwen2_5_vl",
16
+ "num_attention_heads": 28,
17
+ "num_hidden_layers": 28,
18
+ "num_key_value_heads": 4,
19
+ "pad_token_id": 151643,
20
+ "rms_norm_eps": 1e-06,
21
+ "rope_scaling": {
22
+ "mrope_section": [
23
+ 16,
24
+ 24,
25
+ 24
26
+ ],
27
+ "rope_type": "default",
28
+ "type": "default"
29
+ },
30
+ "rope_theta": 1000000.0,
31
+ "sliding_window": 32768,
32
+ "tie_word_embeddings": false,
33
+ "torch_dtype": "bfloat16",
34
+ "transformers_version": "4.51.3",
35
+ "use_cache": false,
36
+ "use_sliding_window": false,
37
+ "video_token_id": 151656,
38
+ "vision_config": {
39
+ "depth": 32,
40
+ "fullatt_block_indexes": [
41
+ 7,
42
+ 15,
43
+ 23,
44
+ 31
45
+ ],
46
+ "hidden_act": "silu",
47
+ "hidden_size": 1280,
48
+ "in_channels": 3,
49
+ "in_chans": 3,
50
+ "intermediate_size": 3420,
51
+ "model_type": "qwen2_5_vl",
52
+ "num_heads": 16,
53
+ "out_hidden_size": 3584,
54
+ "patch_size": 14,
55
+ "spatial_merge_size": 2,
56
+ "spatial_patch_size": 14,
57
+ "temporal_patch_size": 2,
58
+ "tokens_per_second": 2,
59
+ "torch_dtype": "bfloat16",
60
+ "window_size": 112
61
+ },
62
+ "vision_end_token_id": 151653,
63
+ "vision_start_token_id": 151652,
64
+ "vision_token_id": 151654,
65
+ "vocab_size": 152064
66
+ }
qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.05,
10
+ "temperature": 1e-06,
11
+ "transformers_version": "4.51.3"
12
+ }
qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ global_step278
qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/model.safetensors.index.json ADDED
@@ -0,0 +1,736 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 16584333312
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00004-of-00004.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
19
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
20
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
28
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
29
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
30
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
31
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
32
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
33
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
34
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
38
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
39
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
41
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
42
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
43
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
44
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
50
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
51
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
53
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
54
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
55
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
56
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
62
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
63
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
65
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
66
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
67
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
68
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
74
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
75
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
77
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
78
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
79
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
80
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
86
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
87
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
89
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
90
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
91
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
92
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
93
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
96
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
98
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
99
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
101
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
102
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
103
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
104
+ "model.layers.16.input_layernorm.weight": "model-00003-of-00004.safetensors",
105
+ "model.layers.16.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
106
+ "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
107
+ "model.layers.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
108
+ "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
109
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
110
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
111
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
112
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
113
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
114
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
115
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
116
+ "model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors",
117
+ "model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
118
+ "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
119
+ "model.layers.17.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
120
+ "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
121
+ "model.layers.17.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
122
+ "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
123
+ "model.layers.17.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
124
+ "model.layers.17.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
125
+ "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
126
+ "model.layers.17.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
127
+ "model.layers.17.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
128
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
129
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
130
+ "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
131
+ "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
132
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
133
+ "model.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
134
+ "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
135
+ "model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
136
+ "model.layers.18.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
137
+ "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
138
+ "model.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
139
+ "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
140
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
141
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
142
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
143
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
144
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
145
+ "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
146
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
147
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
148
+ "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
149
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
150
+ "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
151
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
152
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
153
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
154
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
155
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
156
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
157
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
158
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
159
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
160
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
161
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
162
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
163
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
164
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
165
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
166
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
167
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
168
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
169
+ "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
170
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
171
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
172
+ "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
173
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
174
+ "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
175
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
176
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
180
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
181
+ "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
182
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
183
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
184
+ "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
185
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
186
+ "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
187
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
188
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
194
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
195
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
197
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
198
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
199
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
200
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
201
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
202
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
204
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
205
+ "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
206
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
207
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
208
+ "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
209
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
210
+ "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
211
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
212
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
213
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
216
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
217
+ "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
218
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
219
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
220
+ "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
221
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
222
+ "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
223
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
224
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
225
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
226
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
227
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
228
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
229
+ "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
230
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
231
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
233
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
234
+ "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
235
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
236
+ "model.layers.26.input_layernorm.weight": "model-00004-of-00004.safetensors",
237
+ "model.layers.26.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
238
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
239
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
240
+ "model.layers.26.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
241
+ "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
242
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
243
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
244
+ "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
245
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
246
+ "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
247
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
248
+ "model.layers.27.input_layernorm.weight": "model-00004-of-00004.safetensors",
249
+ "model.layers.27.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
250
+ "model.layers.27.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
251
+ "model.layers.27.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
252
+ "model.layers.27.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
253
+ "model.layers.27.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
254
+ "model.layers.27.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
255
+ "model.layers.27.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
256
+ "model.layers.27.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
257
+ "model.layers.27.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
258
+ "model.layers.27.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
259
+ "model.layers.27.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
260
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
261
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
262
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
263
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
264
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
265
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
266
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
267
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
268
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
269
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
270
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
271
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
272
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
273
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
274
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
275
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
276
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
277
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
278
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
279
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
280
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
281
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
282
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
283
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
284
+ "model.layers.5.input_layernorm.weight": "model-00002-of-00004.safetensors",
285
+ "model.layers.5.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
286
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
287
+ "model.layers.5.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
288
+ "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
289
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
290
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
291
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
292
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
293
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
294
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
295
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
296
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
297
+ "model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
298
+ "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
299
+ "model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
300
+ "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
301
+ "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
302
+ "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
303
+ "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
304
+ "model.layers.6.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
305
+ "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
306
+ "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
307
+ "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
308
+ "model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
309
+ "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
310
+ "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
311
+ "model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
312
+ "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
313
+ "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
314
+ "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
315
+ "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
316
+ "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
317
+ "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
318
+ "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
319
+ "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
320
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
321
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
322
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
323
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
324
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
325
+ "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
326
+ "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
327
+ "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
328
+ "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
329
+ "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
330
+ "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
331
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
332
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
333
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
334
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
335
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
336
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
337
+ "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
338
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
339
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
340
+ "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
341
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
342
+ "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
343
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
344
+ "model.norm.weight": "model-00004-of-00004.safetensors",
345
+ "visual.blocks.0.attn.proj.bias": "model-00001-of-00004.safetensors",
346
+ "visual.blocks.0.attn.proj.weight": "model-00001-of-00004.safetensors",
347
+ "visual.blocks.0.attn.qkv.bias": "model-00001-of-00004.safetensors",
348
+ "visual.blocks.0.attn.qkv.weight": "model-00001-of-00004.safetensors",
349
+ "visual.blocks.0.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
350
+ "visual.blocks.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
351
+ "visual.blocks.0.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
352
+ "visual.blocks.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
353
+ "visual.blocks.0.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
354
+ "visual.blocks.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
355
+ "visual.blocks.0.norm1.weight": "model-00001-of-00004.safetensors",
356
+ "visual.blocks.0.norm2.weight": "model-00001-of-00004.safetensors",
357
+ "visual.blocks.1.attn.proj.bias": "model-00001-of-00004.safetensors",
358
+ "visual.blocks.1.attn.proj.weight": "model-00001-of-00004.safetensors",
359
+ "visual.blocks.1.attn.qkv.bias": "model-00001-of-00004.safetensors",
360
+ "visual.blocks.1.attn.qkv.weight": "model-00001-of-00004.safetensors",
361
+ "visual.blocks.1.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
362
+ "visual.blocks.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
363
+ "visual.blocks.1.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
364
+ "visual.blocks.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
365
+ "visual.blocks.1.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
366
+ "visual.blocks.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
367
+ "visual.blocks.1.norm1.weight": "model-00001-of-00004.safetensors",
368
+ "visual.blocks.1.norm2.weight": "model-00001-of-00004.safetensors",
369
+ "visual.blocks.10.attn.proj.bias": "model-00001-of-00004.safetensors",
370
+ "visual.blocks.10.attn.proj.weight": "model-00001-of-00004.safetensors",
371
+ "visual.blocks.10.attn.qkv.bias": "model-00001-of-00004.safetensors",
372
+ "visual.blocks.10.attn.qkv.weight": "model-00001-of-00004.safetensors",
373
+ "visual.blocks.10.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
374
+ "visual.blocks.10.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
375
+ "visual.blocks.10.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
376
+ "visual.blocks.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
377
+ "visual.blocks.10.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
378
+ "visual.blocks.10.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
379
+ "visual.blocks.10.norm1.weight": "model-00001-of-00004.safetensors",
380
+ "visual.blocks.10.norm2.weight": "model-00001-of-00004.safetensors",
381
+ "visual.blocks.11.attn.proj.bias": "model-00001-of-00004.safetensors",
382
+ "visual.blocks.11.attn.proj.weight": "model-00001-of-00004.safetensors",
383
+ "visual.blocks.11.attn.qkv.bias": "model-00001-of-00004.safetensors",
384
+ "visual.blocks.11.attn.qkv.weight": "model-00001-of-00004.safetensors",
385
+ "visual.blocks.11.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
386
+ "visual.blocks.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
387
+ "visual.blocks.11.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
388
+ "visual.blocks.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
389
+ "visual.blocks.11.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
390
+ "visual.blocks.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
391
+ "visual.blocks.11.norm1.weight": "model-00001-of-00004.safetensors",
392
+ "visual.blocks.11.norm2.weight": "model-00001-of-00004.safetensors",
393
+ "visual.blocks.12.attn.proj.bias": "model-00001-of-00004.safetensors",
394
+ "visual.blocks.12.attn.proj.weight": "model-00001-of-00004.safetensors",
395
+ "visual.blocks.12.attn.qkv.bias": "model-00001-of-00004.safetensors",
396
+ "visual.blocks.12.attn.qkv.weight": "model-00001-of-00004.safetensors",
397
+ "visual.blocks.12.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
398
+ "visual.blocks.12.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
399
+ "visual.blocks.12.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
400
+ "visual.blocks.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
401
+ "visual.blocks.12.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
402
+ "visual.blocks.12.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
403
+ "visual.blocks.12.norm1.weight": "model-00001-of-00004.safetensors",
404
+ "visual.blocks.12.norm2.weight": "model-00001-of-00004.safetensors",
405
+ "visual.blocks.13.attn.proj.bias": "model-00001-of-00004.safetensors",
406
+ "visual.blocks.13.attn.proj.weight": "model-00001-of-00004.safetensors",
407
+ "visual.blocks.13.attn.qkv.bias": "model-00001-of-00004.safetensors",
408
+ "visual.blocks.13.attn.qkv.weight": "model-00001-of-00004.safetensors",
409
+ "visual.blocks.13.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
410
+ "visual.blocks.13.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
411
+ "visual.blocks.13.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
412
+ "visual.blocks.13.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
413
+ "visual.blocks.13.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
414
+ "visual.blocks.13.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
415
+ "visual.blocks.13.norm1.weight": "model-00001-of-00004.safetensors",
416
+ "visual.blocks.13.norm2.weight": "model-00001-of-00004.safetensors",
417
+ "visual.blocks.14.attn.proj.bias": "model-00001-of-00004.safetensors",
418
+ "visual.blocks.14.attn.proj.weight": "model-00001-of-00004.safetensors",
419
+ "visual.blocks.14.attn.qkv.bias": "model-00001-of-00004.safetensors",
420
+ "visual.blocks.14.attn.qkv.weight": "model-00001-of-00004.safetensors",
421
+ "visual.blocks.14.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
422
+ "visual.blocks.14.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
423
+ "visual.blocks.14.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
424
+ "visual.blocks.14.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
425
+ "visual.blocks.14.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
426
+ "visual.blocks.14.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
427
+ "visual.blocks.14.norm1.weight": "model-00001-of-00004.safetensors",
428
+ "visual.blocks.14.norm2.weight": "model-00001-of-00004.safetensors",
429
+ "visual.blocks.15.attn.proj.bias": "model-00001-of-00004.safetensors",
430
+ "visual.blocks.15.attn.proj.weight": "model-00001-of-00004.safetensors",
431
+ "visual.blocks.15.attn.qkv.bias": "model-00001-of-00004.safetensors",
432
+ "visual.blocks.15.attn.qkv.weight": "model-00001-of-00004.safetensors",
433
+ "visual.blocks.15.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
434
+ "visual.blocks.15.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
435
+ "visual.blocks.15.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
436
+ "visual.blocks.15.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
437
+ "visual.blocks.15.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
438
+ "visual.blocks.15.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
439
+ "visual.blocks.15.norm1.weight": "model-00001-of-00004.safetensors",
440
+ "visual.blocks.15.norm2.weight": "model-00001-of-00004.safetensors",
441
+ "visual.blocks.16.attn.proj.bias": "model-00001-of-00004.safetensors",
442
+ "visual.blocks.16.attn.proj.weight": "model-00001-of-00004.safetensors",
443
+ "visual.blocks.16.attn.qkv.bias": "model-00001-of-00004.safetensors",
444
+ "visual.blocks.16.attn.qkv.weight": "model-00001-of-00004.safetensors",
445
+ "visual.blocks.16.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
446
+ "visual.blocks.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
447
+ "visual.blocks.16.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
448
+ "visual.blocks.16.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
449
+ "visual.blocks.16.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
450
+ "visual.blocks.16.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
451
+ "visual.blocks.16.norm1.weight": "model-00001-of-00004.safetensors",
452
+ "visual.blocks.16.norm2.weight": "model-00001-of-00004.safetensors",
453
+ "visual.blocks.17.attn.proj.bias": "model-00001-of-00004.safetensors",
454
+ "visual.blocks.17.attn.proj.weight": "model-00001-of-00004.safetensors",
455
+ "visual.blocks.17.attn.qkv.bias": "model-00001-of-00004.safetensors",
456
+ "visual.blocks.17.attn.qkv.weight": "model-00001-of-00004.safetensors",
457
+ "visual.blocks.17.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
458
+ "visual.blocks.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
459
+ "visual.blocks.17.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
460
+ "visual.blocks.17.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
461
+ "visual.blocks.17.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
462
+ "visual.blocks.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
463
+ "visual.blocks.17.norm1.weight": "model-00001-of-00004.safetensors",
464
+ "visual.blocks.17.norm2.weight": "model-00001-of-00004.safetensors",
465
+ "visual.blocks.18.attn.proj.bias": "model-00001-of-00004.safetensors",
466
+ "visual.blocks.18.attn.proj.weight": "model-00001-of-00004.safetensors",
467
+ "visual.blocks.18.attn.qkv.bias": "model-00001-of-00004.safetensors",
468
+ "visual.blocks.18.attn.qkv.weight": "model-00001-of-00004.safetensors",
469
+ "visual.blocks.18.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
470
+ "visual.blocks.18.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
471
+ "visual.blocks.18.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
472
+ "visual.blocks.18.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
473
+ "visual.blocks.18.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
474
+ "visual.blocks.18.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
475
+ "visual.blocks.18.norm1.weight": "model-00001-of-00004.safetensors",
476
+ "visual.blocks.18.norm2.weight": "model-00001-of-00004.safetensors",
477
+ "visual.blocks.19.attn.proj.bias": "model-00001-of-00004.safetensors",
478
+ "visual.blocks.19.attn.proj.weight": "model-00001-of-00004.safetensors",
479
+ "visual.blocks.19.attn.qkv.bias": "model-00001-of-00004.safetensors",
480
+ "visual.blocks.19.attn.qkv.weight": "model-00001-of-00004.safetensors",
481
+ "visual.blocks.19.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
482
+ "visual.blocks.19.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
483
+ "visual.blocks.19.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
484
+ "visual.blocks.19.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
485
+ "visual.blocks.19.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
486
+ "visual.blocks.19.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
487
+ "visual.blocks.19.norm1.weight": "model-00001-of-00004.safetensors",
488
+ "visual.blocks.19.norm2.weight": "model-00001-of-00004.safetensors",
489
+ "visual.blocks.2.attn.proj.bias": "model-00001-of-00004.safetensors",
490
+ "visual.blocks.2.attn.proj.weight": "model-00001-of-00004.safetensors",
491
+ "visual.blocks.2.attn.qkv.bias": "model-00001-of-00004.safetensors",
492
+ "visual.blocks.2.attn.qkv.weight": "model-00001-of-00004.safetensors",
493
+ "visual.blocks.2.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
494
+ "visual.blocks.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
495
+ "visual.blocks.2.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
496
+ "visual.blocks.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
497
+ "visual.blocks.2.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
498
+ "visual.blocks.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
499
+ "visual.blocks.2.norm1.weight": "model-00001-of-00004.safetensors",
500
+ "visual.blocks.2.norm2.weight": "model-00001-of-00004.safetensors",
501
+ "visual.blocks.20.attn.proj.bias": "model-00001-of-00004.safetensors",
502
+ "visual.blocks.20.attn.proj.weight": "model-00001-of-00004.safetensors",
503
+ "visual.blocks.20.attn.qkv.bias": "model-00001-of-00004.safetensors",
504
+ "visual.blocks.20.attn.qkv.weight": "model-00001-of-00004.safetensors",
505
+ "visual.blocks.20.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
506
+ "visual.blocks.20.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
507
+ "visual.blocks.20.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
508
+ "visual.blocks.20.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
509
+ "visual.blocks.20.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
510
+ "visual.blocks.20.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
511
+ "visual.blocks.20.norm1.weight": "model-00001-of-00004.safetensors",
512
+ "visual.blocks.20.norm2.weight": "model-00001-of-00004.safetensors",
513
+ "visual.blocks.21.attn.proj.bias": "model-00001-of-00004.safetensors",
514
+ "visual.blocks.21.attn.proj.weight": "model-00001-of-00004.safetensors",
515
+ "visual.blocks.21.attn.qkv.bias": "model-00001-of-00004.safetensors",
516
+ "visual.blocks.21.attn.qkv.weight": "model-00001-of-00004.safetensors",
517
+ "visual.blocks.21.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
518
+ "visual.blocks.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
519
+ "visual.blocks.21.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
520
+ "visual.blocks.21.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
521
+ "visual.blocks.21.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
522
+ "visual.blocks.21.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
523
+ "visual.blocks.21.norm1.weight": "model-00001-of-00004.safetensors",
524
+ "visual.blocks.21.norm2.weight": "model-00001-of-00004.safetensors",
525
+ "visual.blocks.22.attn.proj.bias": "model-00001-of-00004.safetensors",
526
+ "visual.blocks.22.attn.proj.weight": "model-00001-of-00004.safetensors",
527
+ "visual.blocks.22.attn.qkv.bias": "model-00001-of-00004.safetensors",
528
+ "visual.blocks.22.attn.qkv.weight": "model-00001-of-00004.safetensors",
529
+ "visual.blocks.22.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
530
+ "visual.blocks.22.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
531
+ "visual.blocks.22.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
532
+ "visual.blocks.22.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
533
+ "visual.blocks.22.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
534
+ "visual.blocks.22.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
535
+ "visual.blocks.22.norm1.weight": "model-00001-of-00004.safetensors",
536
+ "visual.blocks.22.norm2.weight": "model-00001-of-00004.safetensors",
537
+ "visual.blocks.23.attn.proj.bias": "model-00001-of-00004.safetensors",
538
+ "visual.blocks.23.attn.proj.weight": "model-00001-of-00004.safetensors",
539
+ "visual.blocks.23.attn.qkv.bias": "model-00001-of-00004.safetensors",
540
+ "visual.blocks.23.attn.qkv.weight": "model-00001-of-00004.safetensors",
541
+ "visual.blocks.23.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
542
+ "visual.blocks.23.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
543
+ "visual.blocks.23.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
544
+ "visual.blocks.23.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
545
+ "visual.blocks.23.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
546
+ "visual.blocks.23.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
547
+ "visual.blocks.23.norm1.weight": "model-00001-of-00004.safetensors",
548
+ "visual.blocks.23.norm2.weight": "model-00001-of-00004.safetensors",
549
+ "visual.blocks.24.attn.proj.bias": "model-00001-of-00004.safetensors",
550
+ "visual.blocks.24.attn.proj.weight": "model-00001-of-00004.safetensors",
551
+ "visual.blocks.24.attn.qkv.bias": "model-00001-of-00004.safetensors",
552
+ "visual.blocks.24.attn.qkv.weight": "model-00001-of-00004.safetensors",
553
+ "visual.blocks.24.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
554
+ "visual.blocks.24.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
555
+ "visual.blocks.24.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
556
+ "visual.blocks.24.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
557
+ "visual.blocks.24.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
558
+ "visual.blocks.24.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
559
+ "visual.blocks.24.norm1.weight": "model-00001-of-00004.safetensors",
560
+ "visual.blocks.24.norm2.weight": "model-00001-of-00004.safetensors",
561
+ "visual.blocks.25.attn.proj.bias": "model-00001-of-00004.safetensors",
562
+ "visual.blocks.25.attn.proj.weight": "model-00001-of-00004.safetensors",
563
+ "visual.blocks.25.attn.qkv.bias": "model-00001-of-00004.safetensors",
564
+ "visual.blocks.25.attn.qkv.weight": "model-00001-of-00004.safetensors",
565
+ "visual.blocks.25.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
566
+ "visual.blocks.25.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
567
+ "visual.blocks.25.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
568
+ "visual.blocks.25.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
569
+ "visual.blocks.25.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
570
+ "visual.blocks.25.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
571
+ "visual.blocks.25.norm1.weight": "model-00001-of-00004.safetensors",
572
+ "visual.blocks.25.norm2.weight": "model-00001-of-00004.safetensors",
573
+ "visual.blocks.26.attn.proj.bias": "model-00001-of-00004.safetensors",
574
+ "visual.blocks.26.attn.proj.weight": "model-00001-of-00004.safetensors",
575
+ "visual.blocks.26.attn.qkv.bias": "model-00001-of-00004.safetensors",
576
+ "visual.blocks.26.attn.qkv.weight": "model-00001-of-00004.safetensors",
577
+ "visual.blocks.26.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
578
+ "visual.blocks.26.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
579
+ "visual.blocks.26.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
580
+ "visual.blocks.26.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
581
+ "visual.blocks.26.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
582
+ "visual.blocks.26.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
583
+ "visual.blocks.26.norm1.weight": "model-00001-of-00004.safetensors",
584
+ "visual.blocks.26.norm2.weight": "model-00001-of-00004.safetensors",
585
+ "visual.blocks.27.attn.proj.bias": "model-00001-of-00004.safetensors",
586
+ "visual.blocks.27.attn.proj.weight": "model-00001-of-00004.safetensors",
587
+ "visual.blocks.27.attn.qkv.bias": "model-00001-of-00004.safetensors",
588
+ "visual.blocks.27.attn.qkv.weight": "model-00001-of-00004.safetensors",
589
+ "visual.blocks.27.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
590
+ "visual.blocks.27.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
591
+ "visual.blocks.27.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
592
+ "visual.blocks.27.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
593
+ "visual.blocks.27.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
594
+ "visual.blocks.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
595
+ "visual.blocks.27.norm1.weight": "model-00001-of-00004.safetensors",
596
+ "visual.blocks.27.norm2.weight": "model-00001-of-00004.safetensors",
597
+ "visual.blocks.28.attn.proj.bias": "model-00001-of-00004.safetensors",
598
+ "visual.blocks.28.attn.proj.weight": "model-00001-of-00004.safetensors",
599
+ "visual.blocks.28.attn.qkv.bias": "model-00001-of-00004.safetensors",
600
+ "visual.blocks.28.attn.qkv.weight": "model-00001-of-00004.safetensors",
601
+ "visual.blocks.28.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
602
+ "visual.blocks.28.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
603
+ "visual.blocks.28.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
604
+ "visual.blocks.28.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
605
+ "visual.blocks.28.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
606
+ "visual.blocks.28.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
607
+ "visual.blocks.28.norm1.weight": "model-00001-of-00004.safetensors",
608
+ "visual.blocks.28.norm2.weight": "model-00001-of-00004.safetensors",
609
+ "visual.blocks.29.attn.proj.bias": "model-00001-of-00004.safetensors",
610
+ "visual.blocks.29.attn.proj.weight": "model-00001-of-00004.safetensors",
611
+ "visual.blocks.29.attn.qkv.bias": "model-00001-of-00004.safetensors",
612
+ "visual.blocks.29.attn.qkv.weight": "model-00001-of-00004.safetensors",
613
+ "visual.blocks.29.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
614
+ "visual.blocks.29.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
615
+ "visual.blocks.29.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
616
+ "visual.blocks.29.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
617
+ "visual.blocks.29.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
618
+ "visual.blocks.29.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
619
+ "visual.blocks.29.norm1.weight": "model-00001-of-00004.safetensors",
620
+ "visual.blocks.29.norm2.weight": "model-00001-of-00004.safetensors",
621
+ "visual.blocks.3.attn.proj.bias": "model-00001-of-00004.safetensors",
622
+ "visual.blocks.3.attn.proj.weight": "model-00001-of-00004.safetensors",
623
+ "visual.blocks.3.attn.qkv.bias": "model-00001-of-00004.safetensors",
624
+ "visual.blocks.3.attn.qkv.weight": "model-00001-of-00004.safetensors",
625
+ "visual.blocks.3.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
626
+ "visual.blocks.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
627
+ "visual.blocks.3.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
628
+ "visual.blocks.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
629
+ "visual.blocks.3.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
630
+ "visual.blocks.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
631
+ "visual.blocks.3.norm1.weight": "model-00001-of-00004.safetensors",
632
+ "visual.blocks.3.norm2.weight": "model-00001-of-00004.safetensors",
633
+ "visual.blocks.30.attn.proj.bias": "model-00001-of-00004.safetensors",
634
+ "visual.blocks.30.attn.proj.weight": "model-00001-of-00004.safetensors",
635
+ "visual.blocks.30.attn.qkv.bias": "model-00001-of-00004.safetensors",
636
+ "visual.blocks.30.attn.qkv.weight": "model-00001-of-00004.safetensors",
637
+ "visual.blocks.30.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
638
+ "visual.blocks.30.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
639
+ "visual.blocks.30.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
640
+ "visual.blocks.30.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
641
+ "visual.blocks.30.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
642
+ "visual.blocks.30.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
643
+ "visual.blocks.30.norm1.weight": "model-00001-of-00004.safetensors",
644
+ "visual.blocks.30.norm2.weight": "model-00001-of-00004.safetensors",
645
+ "visual.blocks.31.attn.proj.bias": "model-00001-of-00004.safetensors",
646
+ "visual.blocks.31.attn.proj.weight": "model-00001-of-00004.safetensors",
647
+ "visual.blocks.31.attn.qkv.bias": "model-00001-of-00004.safetensors",
648
+ "visual.blocks.31.attn.qkv.weight": "model-00001-of-00004.safetensors",
649
+ "visual.blocks.31.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
650
+ "visual.blocks.31.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
651
+ "visual.blocks.31.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
652
+ "visual.blocks.31.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
653
+ "visual.blocks.31.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
654
+ "visual.blocks.31.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
655
+ "visual.blocks.31.norm1.weight": "model-00001-of-00004.safetensors",
656
+ "visual.blocks.31.norm2.weight": "model-00001-of-00004.safetensors",
657
+ "visual.blocks.4.attn.proj.bias": "model-00001-of-00004.safetensors",
658
+ "visual.blocks.4.attn.proj.weight": "model-00001-of-00004.safetensors",
659
+ "visual.blocks.4.attn.qkv.bias": "model-00001-of-00004.safetensors",
660
+ "visual.blocks.4.attn.qkv.weight": "model-00001-of-00004.safetensors",
661
+ "visual.blocks.4.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
662
+ "visual.blocks.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
663
+ "visual.blocks.4.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
664
+ "visual.blocks.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
665
+ "visual.blocks.4.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
666
+ "visual.blocks.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
667
+ "visual.blocks.4.norm1.weight": "model-00001-of-00004.safetensors",
668
+ "visual.blocks.4.norm2.weight": "model-00001-of-00004.safetensors",
669
+ "visual.blocks.5.attn.proj.bias": "model-00001-of-00004.safetensors",
670
+ "visual.blocks.5.attn.proj.weight": "model-00001-of-00004.safetensors",
671
+ "visual.blocks.5.attn.qkv.bias": "model-00001-of-00004.safetensors",
672
+ "visual.blocks.5.attn.qkv.weight": "model-00001-of-00004.safetensors",
673
+ "visual.blocks.5.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
674
+ "visual.blocks.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
675
+ "visual.blocks.5.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
676
+ "visual.blocks.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
677
+ "visual.blocks.5.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
678
+ "visual.blocks.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
679
+ "visual.blocks.5.norm1.weight": "model-00001-of-00004.safetensors",
680
+ "visual.blocks.5.norm2.weight": "model-00001-of-00004.safetensors",
681
+ "visual.blocks.6.attn.proj.bias": "model-00001-of-00004.safetensors",
682
+ "visual.blocks.6.attn.proj.weight": "model-00001-of-00004.safetensors",
683
+ "visual.blocks.6.attn.qkv.bias": "model-00001-of-00004.safetensors",
684
+ "visual.blocks.6.attn.qkv.weight": "model-00001-of-00004.safetensors",
685
+ "visual.blocks.6.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
686
+ "visual.blocks.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
687
+ "visual.blocks.6.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
688
+ "visual.blocks.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
689
+ "visual.blocks.6.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
690
+ "visual.blocks.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
691
+ "visual.blocks.6.norm1.weight": "model-00001-of-00004.safetensors",
692
+ "visual.blocks.6.norm2.weight": "model-00001-of-00004.safetensors",
693
+ "visual.blocks.7.attn.proj.bias": "model-00001-of-00004.safetensors",
694
+ "visual.blocks.7.attn.proj.weight": "model-00001-of-00004.safetensors",
695
+ "visual.blocks.7.attn.qkv.bias": "model-00001-of-00004.safetensors",
696
+ "visual.blocks.7.attn.qkv.weight": "model-00001-of-00004.safetensors",
697
+ "visual.blocks.7.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
698
+ "visual.blocks.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
699
+ "visual.blocks.7.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
700
+ "visual.blocks.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
701
+ "visual.blocks.7.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
702
+ "visual.blocks.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
703
+ "visual.blocks.7.norm1.weight": "model-00001-of-00004.safetensors",
704
+ "visual.blocks.7.norm2.weight": "model-00001-of-00004.safetensors",
705
+ "visual.blocks.8.attn.proj.bias": "model-00001-of-00004.safetensors",
706
+ "visual.blocks.8.attn.proj.weight": "model-00001-of-00004.safetensors",
707
+ "visual.blocks.8.attn.qkv.bias": "model-00001-of-00004.safetensors",
708
+ "visual.blocks.8.attn.qkv.weight": "model-00001-of-00004.safetensors",
709
+ "visual.blocks.8.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
710
+ "visual.blocks.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
711
+ "visual.blocks.8.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
712
+ "visual.blocks.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
713
+ "visual.blocks.8.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
714
+ "visual.blocks.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
715
+ "visual.blocks.8.norm1.weight": "model-00001-of-00004.safetensors",
716
+ "visual.blocks.8.norm2.weight": "model-00001-of-00004.safetensors",
717
+ "visual.blocks.9.attn.proj.bias": "model-00001-of-00004.safetensors",
718
+ "visual.blocks.9.attn.proj.weight": "model-00001-of-00004.safetensors",
719
+ "visual.blocks.9.attn.qkv.bias": "model-00001-of-00004.safetensors",
720
+ "visual.blocks.9.attn.qkv.weight": "model-00001-of-00004.safetensors",
721
+ "visual.blocks.9.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
722
+ "visual.blocks.9.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
723
+ "visual.blocks.9.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
724
+ "visual.blocks.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
725
+ "visual.blocks.9.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
726
+ "visual.blocks.9.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
727
+ "visual.blocks.9.norm1.weight": "model-00001-of-00004.safetensors",
728
+ "visual.blocks.9.norm2.weight": "model-00001-of-00004.safetensors",
729
+ "visual.merger.ln_q.weight": "model-00001-of-00004.safetensors",
730
+ "visual.merger.mlp.0.bias": "model-00001-of-00004.safetensors",
731
+ "visual.merger.mlp.0.weight": "model-00001-of-00004.safetensors",
732
+ "visual.merger.mlp.2.bias": "model-00001-of-00004.safetensors",
733
+ "visual.merger.mlp.2.weight": "model-00001-of-00004.safetensors",
734
+ "visual.patch_embed.proj.weight": "model-00001-of-00004.safetensors"
735
+ }
736
+ }
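The index above only maps parameter names to shard files; the tensors themselves live in the four model-0000x-of-00004.safetensors shards. Below is a minimal sketch, not part of the checkpoint, of reassembling a full state dict from that weight map (the local checkpoint path is an assumption for illustration):

```python
import json
import os

from safetensors.torch import load_file

# Hypothetical local path to the uploaded checkpoint directory.
ckpt_dir = "qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280"

with open(os.path.join(ckpt_dir, "model.safetensors.index.json")) as f:
    index = json.load(f)

# weight_map: {"visual.blocks.27.norm1.weight": "model-00001-of-00004.safetensors", ...}
shards = sorted(set(index["weight_map"].values()))

state_dict = {}
for shard in shards:
    # each shard holds a disjoint subset of the tensors listed in the index
    state_dict.update(load_file(os.path.join(ckpt_dir, shard)))

print(f"loaded {len(state_dict)} tensors from {len(shards)} shards")
```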
qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "min_pixels": 3136,
+ "max_pixels": 12845056,
+ "patch_size": 14,
+ "temporal_patch_size": 2,
+ "merge_size": 2,
+ "image_mean": [
+ 0.48145466,
+ 0.4578275,
+ 0.40821073
+ ],
+ "image_std": [
+ 0.26862954,
+ 0.26130258,
+ 0.27577711
+ ],
+ "image_processor_type": "Qwen2VLImageProcessor",
+ "processor_class": "Qwen2_5_VLProcessor"
+ }
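With patch_size=14 and merge_size=2, one merged visual token covers a 28x28 pixel area, so min_pixels=3136 and max_pixels=12845056 bound each image to roughly 4 to 16384 visual tokens. A minimal sketch of reusing those bounds when loading the processor (the base-model id is an assumption; passing the checkpoint directory itself should also work):

```python
from transformers import AutoProcessor

# min_pixels / max_pixels mirror preprocessor_config.json:
# 3136 = 4 * 28 * 28  (at least 4 merged visual tokens per image)
# 12845056 = 16384 * 28 * 28  (at most 16384 merged visual tokens per image)
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct",  # assumed base model for this fine-tune
    min_pixels=3136,
    max_pixels=12845056,
)
```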
qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "eos_token": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
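A quick check, again only a sketch with an illustrative local path, that the tokenizer in this checkpoint resolves the mapping above, in particular the distinct EOS (<|im_end|>) and PAD (<|endoftext|>) tokens:

```python
from transformers import AutoTokenizer

# Illustrative local path; any directory containing this tokenizer works.
tok = AutoTokenizer.from_pretrained(
    "qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280"
)

assert tok.eos_token == "<|im_end|>"
assert tok.pad_token == "<|endoftext|>"
print(tok.additional_special_tokens)
```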
qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/tokenizer_config.json ADDED
@@ -0,0 +1,209 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|im_end|>",
201
+ "errors": "replace",
202
+ "extra_special_tokens": {},
203
+ "model_max_length": 131072,
204
+ "pad_token": "<|endoftext|>",
205
+ "processor_class": "Qwen2_5_VLProcessor",
206
+ "split_special_tokens": false,
207
+ "tokenizer_class": "Qwen2Tokenizer",
208
+ "unk_token": null
209
+ }
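The chat_template above is the standard Qwen2.5 ChatML template with tool-calling support. A minimal sketch of rendering a prompt with it (the path and message contents are placeholders):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(
    "qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280"  # illustrative local path
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Describe the image."},
]

# Produces "<|im_start|>system\n...<|im_end|>\n<|im_start|>user\n...<|im_end|>\n<|im_start|>assistant\n"
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
```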
qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/trainer_state.json ADDED
@@ -0,0 +1,658 @@
1
+ {
2
+ "best_global_step": 171,
3
+ "best_metric": 0.6724056,
4
+ "best_model_checkpoint": "/mnt/data/users/liamding/data/MMMT/lora/qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-171",
5
+ "epoch": 4.920353982300885,
6
+ "eval_steps": 500,
7
+ "global_step": 280,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.017699115044247787,
14
+ "grad_norm": 7.40257722540107,
15
+ "learning_rate": 7.142857142857142e-08,
16
+ "loss": 1.5665555000305176,
17
+ "memory(GiB)": 49.46,
18
+ "step": 1,
19
+ "token_acc": 0.6045094454600853,
20
+ "train_speed(iter/s)": 0.040747
21
+ },
22
+ {
23
+ "epoch": 0.08849557522123894,
24
+ "grad_norm": 7.0226207081313206,
25
+ "learning_rate": 3.5714285714285716e-07,
26
+ "loss": 1.4509937763214111,
27
+ "memory(GiB)": 53.23,
28
+ "step": 5,
29
+ "token_acc": 0.6307320997586484,
30
+ "train_speed(iter/s)": 0.117152
31
+ },
32
+ {
33
+ "epoch": 0.17699115044247787,
34
+ "grad_norm": 6.593265202788787,
35
+ "learning_rate": 7.142857142857143e-07,
36
+ "loss": 1.526433563232422,
37
+ "memory(GiB)": 53.23,
38
+ "step": 10,
39
+ "token_acc": 0.5970879879502949,
40
+ "train_speed(iter/s)": 0.157327
41
+ },
42
+ {
43
+ "epoch": 0.26548672566371684,
44
+ "grad_norm": 6.007351809416256,
45
+ "learning_rate": 1.0714285714285714e-06,
46
+ "loss": 1.4530372619628906,
47
+ "memory(GiB)": 53.23,
48
+ "step": 15,
49
+ "token_acc": 0.6144578313253012,
50
+ "train_speed(iter/s)": 0.176339
51
+ },
52
+ {
53
+ "epoch": 0.35398230088495575,
54
+ "grad_norm": 6.044840522609318,
55
+ "learning_rate": 1.4285714285714286e-06,
56
+ "loss": 1.295381736755371,
57
+ "memory(GiB)": 53.23,
58
+ "step": 20,
59
+ "token_acc": 0.6550639547074858,
60
+ "train_speed(iter/s)": 0.188068
61
+ },
62
+ {
63
+ "epoch": 0.4424778761061947,
64
+ "grad_norm": 5.083934465976827,
65
+ "learning_rate": 1.7857142857142857e-06,
66
+ "loss": 1.1845590591430664,
67
+ "memory(GiB)": 53.23,
68
+ "step": 25,
69
+ "token_acc": 0.6735167769650529,
70
+ "train_speed(iter/s)": 0.194724
71
+ },
72
+ {
73
+ "epoch": 0.5309734513274337,
74
+ "grad_norm": 4.335314034421227,
75
+ "learning_rate": 1.999689182000816e-06,
76
+ "loss": 1.0762714385986327,
77
+ "memory(GiB)": 53.23,
78
+ "step": 30,
79
+ "token_acc": 0.6943215780035864,
80
+ "train_speed(iter/s)": 0.199848
81
+ },
82
+ {
83
+ "epoch": 0.6194690265486725,
84
+ "grad_norm": 3.8086299642108323,
85
+ "learning_rate": 1.9961946980917456e-06,
86
+ "loss": 0.9585922241210938,
87
+ "memory(GiB)": 53.23,
88
+ "step": 35,
89
+ "token_acc": 0.7230769230769231,
90
+ "train_speed(iter/s)": 0.203225
91
+ },
92
+ {
93
+ "epoch": 0.7079646017699115,
94
+ "grad_norm": 3.1353229453238276,
95
+ "learning_rate": 1.9888308262251284e-06,
96
+ "loss": 0.9166532516479492,
97
+ "memory(GiB)": 53.23,
98
+ "step": 40,
99
+ "token_acc": 0.7251565638662413,
100
+ "train_speed(iter/s)": 0.206022
101
+ },
102
+ {
103
+ "epoch": 0.7964601769911505,
104
+ "grad_norm": 3.132744443004781,
105
+ "learning_rate": 1.9776261689193047e-06,
106
+ "loss": 0.8232107162475586,
107
+ "memory(GiB)": 53.23,
108
+ "step": 45,
109
+ "token_acc": 0.7509369311489453,
110
+ "train_speed(iter/s)": 0.207906
111
+ },
112
+ {
113
+ "epoch": 0.8849557522123894,
114
+ "grad_norm": 2.9299303644020624,
115
+ "learning_rate": 1.962624246950012e-06,
116
+ "loss": 0.8237380981445312,
117
+ "memory(GiB)": 53.23,
118
+ "step": 50,
119
+ "token_acc": 0.7437298721577047,
120
+ "train_speed(iter/s)": 0.208941
121
+ },
122
+ {
123
+ "epoch": 0.9734513274336283,
124
+ "grad_norm": 2.965955486896327,
125
+ "learning_rate": 1.9438833303083674e-06,
126
+ "loss": 0.80694580078125,
127
+ "memory(GiB)": 53.23,
128
+ "step": 55,
129
+ "token_acc": 0.7513912412291314,
130
+ "train_speed(iter/s)": 0.210269
131
+ },
132
+ {
133
+ "epoch": 1.0,
134
+ "eval_loss": 0.7611977458000183,
135
+ "eval_runtime": 8.8811,
136
+ "eval_samples_per_second": 11.147,
137
+ "eval_steps_per_second": 1.464,
138
+ "eval_token_acc": 0.7686340023342564,
139
+ "step": 57
140
+ },
141
+ {
142
+ "epoch": 1.0530973451327434,
143
+ "grad_norm": 2.7587396910701876,
144
+ "learning_rate": 1.9214762118704076e-06,
145
+ "loss": 0.7415528297424316,
146
+ "memory(GiB)": 53.23,
147
+ "step": 60,
148
+ "token_acc": 0.7617431441759436,
149
+ "train_speed(iter/s)": 0.162271
150
+ },
151
+ {
152
+ "epoch": 1.1415929203539823,
153
+ "grad_norm": 2.8364280800148047,
154
+ "learning_rate": 1.895489924657301e-06,
155
+ "loss": 0.7000388145446778,
156
+ "memory(GiB)": 53.23,
157
+ "step": 65,
158
+ "token_acc": 0.772904598652323,
159
+ "train_speed(iter/s)": 0.16612
160
+ },
161
+ {
162
+ "epoch": 1.2300884955752212,
163
+ "grad_norm": 2.8409488097023816,
164
+ "learning_rate": 1.8660254037844386e-06,
165
+ "loss": 0.7180853843688965,
166
+ "memory(GiB)": 53.23,
167
+ "step": 70,
168
+ "token_acc": 0.7827201783723523,
169
+ "train_speed(iter/s)": 0.169126
170
+ },
171
+ {
172
+ "epoch": 1.3185840707964602,
173
+ "grad_norm": 2.9020651768320924,
174
+ "learning_rate": 1.8331970944124488e-06,
175
+ "loss": 0.6793193817138672,
176
+ "memory(GiB)": 53.23,
177
+ "step": 75,
178
+ "token_acc": 0.7840733246042138,
179
+ "train_speed(iter/s)": 0.172404
180
+ },
181
+ {
182
+ "epoch": 1.407079646017699,
183
+ "grad_norm": 2.609705143690802,
184
+ "learning_rate": 1.7971325072229223e-06,
185
+ "loss": 0.6879084587097168,
186
+ "memory(GiB)": 53.23,
187
+ "step": 80,
188
+ "token_acc": 0.7856836230950848,
189
+ "train_speed(iter/s)": 0.175184
190
+ },
191
+ {
192
+ "epoch": 1.495575221238938,
193
+ "grad_norm": 2.7485198999731923,
194
+ "learning_rate": 1.7579717231454529e-06,
195
+ "loss": 0.6852674961090088,
196
+ "memory(GiB)": 53.23,
197
+ "step": 85,
198
+ "token_acc": 0.7910067560084173,
199
+ "train_speed(iter/s)": 0.177327
200
+ },
201
+ {
202
+ "epoch": 1.584070796460177,
203
+ "grad_norm": 2.7450935504692753,
204
+ "learning_rate": 1.7158668492597184e-06,
205
+ "loss": 0.667497968673706,
206
+ "memory(GiB)": 53.23,
207
+ "step": 90,
208
+ "token_acc": 0.7859543400162243,
209
+ "train_speed(iter/s)": 0.179638
210
+ },
211
+ {
212
+ "epoch": 1.672566371681416,
213
+ "grad_norm": 2.910988919694692,
214
+ "learning_rate": 1.67098142798597e-06,
215
+ "loss": 0.6796401023864747,
216
+ "memory(GiB)": 53.23,
217
+ "step": 95,
218
+ "token_acc": 0.7799544419134397,
219
+ "train_speed(iter/s)": 0.181886
220
+ },
221
+ {
222
+ "epoch": 1.7610619469026547,
223
+ "grad_norm": 2.7983018227375127,
224
+ "learning_rate": 1.6234898018587336e-06,
225
+ "loss": 0.6755572319030761,
226
+ "memory(GiB)": 53.23,
227
+ "step": 100,
228
+ "token_acc": 0.7828237410071942,
229
+ "train_speed(iter/s)": 0.183764
230
+ },
231
+ {
232
+ "epoch": 1.8495575221238938,
233
+ "grad_norm": 2.7366377994394595,
234
+ "learning_rate": 1.573576436351046e-06,
235
+ "loss": 0.670599365234375,
236
+ "memory(GiB)": 53.23,
237
+ "step": 105,
238
+ "token_acc": 0.7895497498610339,
239
+ "train_speed(iter/s)": 0.18519
240
+ },
241
+ {
242
+ "epoch": 1.9380530973451329,
243
+ "grad_norm": 3.1226221302102064,
244
+ "learning_rate": 1.521435203379498e-06,
245
+ "loss": 0.6486417293548584,
246
+ "memory(GiB)": 53.23,
247
+ "step": 110,
248
+ "token_acc": 0.8053818010149888,
249
+ "train_speed(iter/s)": 0.187041
250
+ },
251
+ {
252
+ "epoch": 2.0,
253
+ "eval_loss": 0.6818546652793884,
254
+ "eval_runtime": 8.9028,
255
+ "eval_samples_per_second": 11.12,
256
+ "eval_steps_per_second": 1.46,
257
+ "eval_token_acc": 0.7845847541021123,
258
+ "step": 114
259
+ },
260
+ {
261
+ "epoch": 2.017699115044248,
262
+ "grad_norm": 2.8154612668678487,
263
+ "learning_rate": 1.467268628273062e-06,
264
+ "loss": 0.6256484031677246,
265
+ "memory(GiB)": 53.23,
266
+ "step": 115,
267
+ "token_acc": 0.8097877358490566,
268
+ "train_speed(iter/s)": 0.164747
269
+ },
270
+ {
271
+ "epoch": 2.106194690265487,
272
+ "grad_norm": 2.886828127030963,
273
+ "learning_rate": 1.4112871031306117e-06,
274
+ "loss": 0.5737779140472412,
275
+ "memory(GiB)": 53.23,
276
+ "step": 120,
277
+ "token_acc": 0.8136874361593462,
278
+ "train_speed(iter/s)": 0.166357
279
+ },
280
+ {
281
+ "epoch": 2.1946902654867255,
282
+ "grad_norm": 2.8002391821263237,
283
+ "learning_rate": 1.3537080696225813e-06,
284
+ "loss": 0.5658342361450195,
285
+ "memory(GiB)": 53.23,
286
+ "step": 125,
287
+ "token_acc": 0.8227344032822493,
288
+ "train_speed(iter/s)": 0.168259
289
+ },
290
+ {
291
+ "epoch": 2.2831858407079646,
292
+ "grad_norm": 2.6874528113525042,
293
+ "learning_rate": 1.2947551744109043e-06,
294
+ "loss": 0.5682441711425781,
295
+ "memory(GiB)": 53.59,
296
+ "step": 130,
297
+ "token_acc": 0.8231121067861477,
298
+ "train_speed(iter/s)": 0.169724
299
+ },
300
+ {
301
+ "epoch": 2.3716814159292037,
302
+ "grad_norm": 2.9472452463373333,
303
+ "learning_rate": 1.2346574004677154e-06,
304
+ "loss": 0.5679256916046143,
305
+ "memory(GiB)": 53.59,
306
+ "step": 135,
307
+ "token_acc": 0.8153380423814329,
308
+ "train_speed(iter/s)": 0.171306
309
+ },
310
+ {
311
+ "epoch": 2.4601769911504423,
312
+ "grad_norm": 2.684226386673578,
313
+ "learning_rate": 1.1736481776669305e-06,
314
+ "loss": 0.5530566215515137,
315
+ "memory(GiB)": 53.59,
316
+ "step": 140,
317
+ "token_acc": 0.8241896689955331,
318
+ "train_speed(iter/s)": 0.172815
319
+ },
320
+ {
321
+ "epoch": 2.5486725663716814,
322
+ "grad_norm": 2.8960034438006352,
323
+ "learning_rate": 1.1119644761033077e-06,
324
+ "loss": 0.5574289798736572,
325
+ "memory(GiB)": 53.59,
326
+ "step": 145,
327
+ "token_acc": 0.8239186374218845,
328
+ "train_speed(iter/s)": 0.174493
329
+ },
330
+ {
331
+ "epoch": 2.6371681415929205,
332
+ "grad_norm": 2.767187699405252,
333
+ "learning_rate": 1.0498458856606971e-06,
334
+ "loss": 0.5396804332733154,
335
+ "memory(GiB)": 53.59,
336
+ "step": 150,
337
+ "token_acc": 0.8241233910341766,
338
+ "train_speed(iter/s)": 0.175595
339
+ },
340
+ {
341
+ "epoch": 2.725663716814159,
342
+ "grad_norm": 2.827755019807615,
343
+ "learning_rate": 9.875336854045848e-07,
344
+ "loss": 0.552346134185791,
345
+ "memory(GiB)": 53.59,
346
+ "step": 155,
347
+ "token_acc": 0.8196902654867256,
348
+ "train_speed(iter/s)": 0.177022
349
+ },
350
+ {
351
+ "epoch": 2.814159292035398,
352
+ "grad_norm": 2.730589378155677,
353
+ "learning_rate": 9.252699064135758e-07,
354
+ "loss": 0.5428918838500977,
355
+ "memory(GiB)": 53.59,
356
+ "step": 160,
357
+ "token_acc": 0.8150500592098181,
358
+ "train_speed(iter/s)": 0.178401
359
+ },
360
+ {
361
+ "epoch": 2.9026548672566372,
362
+ "grad_norm": 2.9169675295805533,
363
+ "learning_rate": 8.632963916899268e-07,
364
+ "loss": 0.5644888877868652,
365
+ "memory(GiB)": 53.59,
366
+ "step": 165,
367
+ "token_acc": 0.8185314265112367,
368
+ "train_speed(iter/s)": 0.179906
369
+ },
370
+ {
371
+ "epoch": 2.991150442477876,
372
+ "grad_norm": 2.6339235662104494,
373
+ "learning_rate": 8.018538568006025e-07,
374
+ "loss": 0.5508373260498047,
375
+ "memory(GiB)": 53.59,
376
+ "step": 170,
377
+ "token_acc": 0.8208436167825892,
378
+ "train_speed(iter/s)": 0.180981
379
+ },
380
+ {
381
+ "epoch": 3.0,
382
+ "eval_loss": 0.6724056005477905,
383
+ "eval_runtime": 9.403,
384
+ "eval_samples_per_second": 10.529,
385
+ "eval_steps_per_second": 1.383,
386
+ "eval_token_acc": 0.7888642240886102,
387
+ "step": 171
388
+ },
389
+ {
390
+ "epoch": 3.0707964601769913,
391
+ "grad_norm": 2.6704718240156735,
392
+ "learning_rate": 7.411809548974791e-07,
393
+ "loss": 0.5087705612182617,
394
+ "memory(GiB)": 53.59,
395
+ "step": 175,
396
+ "token_acc": 0.8381913834778899,
397
+ "train_speed(iter/s)": 0.166424
398
+ },
399
+ {
400
+ "epoch": 3.15929203539823,
401
+ "grad_norm": 2.7278153978843647,
402
+ "learning_rate": 6.815133497483157e-07,
403
+ "loss": 0.4890453815460205,
404
+ "memory(GiB)": 53.59,
405
+ "step": 180,
406
+ "token_acc": 0.8365711510441962,
407
+ "train_speed(iter/s)": 0.167637
408
+ },
409
+ {
410
+ "epoch": 3.247787610619469,
411
+ "grad_norm": 2.8589387437951475,
412
+ "learning_rate": 6.230828003789947e-07,
413
+ "loss": 0.4827299118041992,
414
+ "memory(GiB)": 53.59,
415
+ "step": 185,
416
+ "token_acc": 0.8502861146794348,
417
+ "train_speed(iter/s)": 0.168775
418
+ },
419
+ {
420
+ "epoch": 3.336283185840708,
421
+ "grad_norm": 2.807835530135265,
422
+ "learning_rate": 5.661162608824419e-07,
423
+ "loss": 0.49242558479309084,
424
+ "memory(GiB)": 53.59,
425
+ "step": 190,
426
+ "token_acc": 0.847648975791434,
427
+ "train_speed(iter/s)": 0.169897
428
+ },
429
+ {
430
+ "epoch": 3.4247787610619467,
431
+ "grad_norm": 2.9025477586689195,
432
+ "learning_rate": 5.10834998890711e-07,
433
+ "loss": 0.48547563552856443,
434
+ "memory(GiB)": 53.59,
435
+ "step": 195,
436
+ "token_acc": 0.8414092364913731,
437
+ "train_speed(iter/s)": 0.170905
438
+ },
439
+ {
440
+ "epoch": 3.5132743362831858,
441
+ "grad_norm": 2.686659851342714,
442
+ "learning_rate": 4.5745373613424065e-07,
443
+ "loss": 0.500794506072998,
444
+ "memory(GiB)": 53.59,
445
+ "step": 200,
446
+ "token_acc": 0.8371804976917013,
447
+ "train_speed(iter/s)": 0.17211
448
+ },
449
+ {
450
+ "epoch": 3.601769911504425,
451
+ "grad_norm": 2.7730270923239906,
452
+ "learning_rate": 4.061798144264985e-07,
453
+ "loss": 0.46956601142883303,
454
+ "memory(GiB)": 53.59,
455
+ "step": 205,
456
+ "token_acc": 0.8479307025986526,
457
+ "train_speed(iter/s)": 0.173254
458
+ },
459
+ {
460
+ "epoch": 3.6902654867256635,
461
+ "grad_norm": 2.74603910813651,
462
+ "learning_rate": 3.5721239031346063e-07,
463
+ "loss": 0.4935324668884277,
464
+ "memory(GiB)": 53.59,
465
+ "step": 210,
466
+ "token_acc": 0.8333333333333334,
467
+ "train_speed(iter/s)": 0.174048
468
+ },
469
+ {
470
+ "epoch": 3.7787610619469025,
471
+ "grad_norm": 2.747199095702964,
472
+ "learning_rate": 3.1074166151605295e-07,
473
+ "loss": 0.46960010528564455,
474
+ "memory(GiB)": 53.59,
475
+ "step": 215,
476
+ "token_acc": 0.8389138188931078,
477
+ "train_speed(iter/s)": 0.17502
478
+ },
479
+ {
480
+ "epoch": 3.8672566371681416,
481
+ "grad_norm": 2.7297571871062196,
482
+ "learning_rate": 2.6694812817017387e-07,
483
+ "loss": 0.4748669624328613,
484
+ "memory(GiB)": 53.59,
485
+ "step": 220,
486
+ "token_acc": 0.8440873713281446,
487
+ "train_speed(iter/s)": 0.176046
488
+ },
489
+ {
490
+ "epoch": 3.9557522123893807,
491
+ "grad_norm": 2.6601635969723216,
492
+ "learning_rate": 2.260018917337726e-07,
493
+ "loss": 0.48950982093811035,
494
+ "memory(GiB)": 54.81,
495
+ "step": 225,
496
+ "token_acc": 0.8337030646619012,
497
+ "train_speed(iter/s)": 0.17663
498
+ },
499
+ {
500
+ "epoch": 4.0,
501
+ "eval_loss": 0.6796729564666748,
502
+ "eval_runtime": 9.3271,
503
+ "eval_samples_per_second": 10.614,
504
+ "eval_steps_per_second": 1.394,
505
+ "eval_token_acc": 0.7896423095407007,
506
+ "step": 228
507
+ },
508
+ {
509
+ "epoch": 4.035398230088496,
510
+ "grad_norm": 2.768023310981967,
511
+ "learning_rate": 1.880619942841435e-07,
512
+ "loss": 0.46919097900390627,
513
+ "memory(GiB)": 54.81,
514
+ "step": 230,
515
+ "token_acc": 0.8445914029286726,
516
+ "train_speed(iter/s)": 0.166105
517
+ },
518
+ {
519
+ "epoch": 4.123893805309734,
520
+ "grad_norm": 2.593427632271344,
521
+ "learning_rate": 1.5327580077171588e-07,
522
+ "loss": 0.4409231662750244,
523
+ "memory(GiB)": 54.81,
524
+ "step": 235,
525
+ "token_acc": 0.853274191665694,
526
+ "train_speed(iter/s)": 0.167131
527
+ },
528
+ {
529
+ "epoch": 4.212389380530974,
530
+ "grad_norm": 2.7122350384023686,
531
+ "learning_rate": 1.2177842662977133e-07,
532
+ "loss": 0.45473241806030273,
533
+ "memory(GiB)": 54.81,
534
+ "step": 240,
535
+ "token_acc": 0.8460348162475823,
536
+ "train_speed(iter/s)": 0.167961
537
+ },
538
+ {
539
+ "epoch": 4.300884955752212,
540
+ "grad_norm": 2.679197284138057,
541
+ "learning_rate": 9.369221296335006e-08,
542
+ "loss": 0.45319194793701173,
543
+ "memory(GiB)": 54.81,
544
+ "step": 245,
545
+ "token_acc": 0.8497835017208838,
546
+ "train_speed(iter/s)": 0.168899
547
+ },
548
+ {
549
+ "epoch": 4.389380530973451,
550
+ "grad_norm": 2.482195048589529,
551
+ "learning_rate": 6.912625135579586e-08,
552
+ "loss": 0.4435576438903809,
553
+ "memory(GiB)": 54.81,
554
+ "step": 250,
555
+ "token_acc": 0.8499620019541853,
556
+ "train_speed(iter/s)": 0.16972
557
+ },
558
+ {
559
+ "epoch": 4.477876106194691,
560
+ "grad_norm": 2.5377178768662003,
561
+ "learning_rate": 4.817596013867764e-08,
562
+ "loss": 0.4691779136657715,
563
+ "memory(GiB)": 54.81,
564
+ "step": 255,
565
+ "token_acc": 0.8424553623526666,
566
+ "train_speed(iter/s)": 0.170482
567
+ },
568
+ {
569
+ "epoch": 4.566371681415929,
570
+ "grad_norm": 2.7385619077524135,
571
+ "learning_rate": 3.092271377092215e-08,
572
+ "loss": 0.4618979454040527,
573
+ "memory(GiB)": 54.81,
574
+ "step": 260,
575
+ "token_acc": 0.8402349193708939,
576
+ "train_speed(iter/s)": 0.171244
577
+ },
578
+ {
579
+ "epoch": 4.654867256637168,
580
+ "grad_norm": 2.739774896659754,
581
+ "learning_rate": 1.7433526766711725e-08,
582
+ "loss": 0.4363240242004395,
583
+ "memory(GiB)": 54.81,
584
+ "step": 265,
585
+ "token_acc": 0.856353591160221,
586
+ "train_speed(iter/s)": 0.172005
587
+ },
588
+ {
589
+ "epoch": 4.743362831858407,
590
+ "grad_norm": 2.913400883331699,
591
+ "learning_rate": 7.760793399827936e-09,
592
+ "loss": 0.46266565322875974,
593
+ "memory(GiB)": 54.81,
594
+ "step": 270,
595
+ "token_acc": 0.8471272229822161,
596
+ "train_speed(iter/s)": 0.172782
597
+ },
598
+ {
599
+ "epoch": 4.831858407079646,
600
+ "grad_norm": 2.9063339515799447,
601
+ "learning_rate": 1.942084195468152e-09,
602
+ "loss": 0.4591742992401123,
603
+ "memory(GiB)": 54.81,
604
+ "step": 275,
605
+ "token_acc": 0.8490499232857311,
606
+ "train_speed(iter/s)": 0.17353
607
+ },
608
+ {
609
+ "epoch": 4.920353982300885,
610
+ "grad_norm": 2.655185884569906,
611
+ "learning_rate": 0.0,
612
+ "loss": 0.46257796287536623,
613
+ "memory(GiB)": 54.81,
614
+ "step": 280,
615
+ "token_acc": 0.8457238680827278,
616
+ "train_speed(iter/s)": 0.17427
617
+ },
618
+ {
619
+ "epoch": 4.920353982300885,
620
+ "eval_loss": 0.685617208480835,
621
+ "eval_runtime": 9.7242,
622
+ "eval_samples_per_second": 10.181,
623
+ "eval_steps_per_second": 1.337,
624
+ "eval_token_acc": 0.7898940430693182,
625
+ "step": 280
626
+ },
627
+ {
628
+ "epoch": 4.920353982300885,
629
+ "eval_loss": 0.685617208480835,
630
+ "eval_runtime": 10.0022,
631
+ "eval_samples_per_second": 9.898,
632
+ "eval_steps_per_second": 1.3,
633
+ "eval_token_acc": 0.7898940430693182,
634
+ "step": 280
635
+ }
636
+ ],
637
+ "logging_steps": 5,
638
+ "max_steps": 280,
639
+ "num_input_tokens_seen": 0,
640
+ "num_train_epochs": 5,
641
+ "save_steps": 500,
642
+ "stateful_callbacks": {
643
+ "TrainerControl": {
644
+ "args": {
645
+ "should_epoch_stop": false,
646
+ "should_evaluate": false,
647
+ "should_log": false,
648
+ "should_save": true,
649
+ "should_training_stop": true
650
+ },
651
+ "attributes": {}
652
+ }
653
+ },
654
+ "total_flos": 50117350760448.0,
655
+ "train_batch_size": 2,
656
+ "trial_name": null,
657
+ "trial_params": null
658
+ }
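trainer_state.json keeps the full log_history, including per-step loss and token_acc plus the per-epoch eval results; the best checkpoint by eval_loss is step 171 (0.6724). A minimal sketch of extracting the curves from it (path is illustrative):

```python
import json

# Illustrative path to the trainer state saved alongside this checkpoint.
with open(
    "qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/trainer_state.json"
) as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss".
train_curve = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_curve = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print("best:", state["best_model_checkpoint"], "eval_loss =", state["best_metric"])
print("last train point:", train_curve[-1])
print("eval points:", eval_curve)
```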
qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/zero_to_fp32.py ADDED
@@ -0,0 +1,760 @@
1
+ #!/usr/bin/env python
2
+
3
+ # Copyright (c) Microsoft Corporation.
4
+ # SPDX-License-Identifier: Apache-2.0
5
+
6
+ # DeepSpeed Team
7
+
8
+ # This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
9
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
10
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
+ # application.
12
+ #
13
+ # example:
14
+ # python zero_to_fp32.py . output_dir/
15
+ # or
16
+ # python zero_to_fp32.py . output_dir/ --safe_serialization
17
+
18
+ import argparse
19
+ import torch
20
+ import glob
21
+ import math
22
+ import os
23
+ import re
24
+ import gc
25
+ import json
26
+ import numpy as np
27
+ from tqdm import tqdm
28
+ from collections import OrderedDict
29
+ from dataclasses import dataclass
30
+
31
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
32
+ # DeepSpeed data structures it has to be available in the current python environment.
33
+ from deepspeed.utils import logger
34
+ from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
35
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
36
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
37
+
38
+
39
+ @dataclass
40
+ class zero_model_state:
41
+ buffers: dict()
42
+ param_shapes: dict()
43
+ shared_params: list
44
+ ds_version: int
45
+ frozen_param_shapes: dict()
46
+ frozen_param_fragments: dict()
47
+
48
+
49
+ debug = 0
50
+
51
+ # load to cpu
52
+ device = torch.device('cpu')
53
+
54
+
55
+ def atoi(text):
56
+ return int(text) if text.isdigit() else text
57
+
58
+
59
+ def natural_keys(text):
60
+ '''
61
+ alist.sort(key=natural_keys) sorts in human order
62
+ http://nedbatchelder.com/blog/200712/human_sorting.html
63
+ (See Toothy's implementation in the comments)
64
+ '''
65
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
66
+
67
+
68
+ def get_model_state_file(checkpoint_dir, zero_stage):
69
+ if not os.path.isdir(checkpoint_dir):
70
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
71
+
72
+ # there should be only one file
73
+ if zero_stage <= 2:
74
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
75
+ elif zero_stage == 3:
76
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
77
+
78
+ if not os.path.exists(file):
79
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
80
+
81
+ return file
82
+
83
+
84
+ def get_checkpoint_files(checkpoint_dir, glob_pattern):
85
+ # XXX: need to test that this simple glob rule works for multi-node setup too
86
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
87
+
88
+ if len(ckpt_files) == 0:
89
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
90
+
91
+ return ckpt_files
92
+
93
+
94
+ def get_optim_files(checkpoint_dir):
95
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
96
+
97
+
98
+ def get_model_state_files(checkpoint_dir):
99
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
100
+
101
+
102
+ def parse_model_states(files):
103
+ zero_model_states = []
104
+ for file in files:
105
+ state_dict = torch.load(file, map_location=device, weights_only=False)
106
+
107
+ if BUFFER_NAMES not in state_dict:
108
+ raise ValueError(f"{file} is not a model state checkpoint")
109
+ buffer_names = state_dict[BUFFER_NAMES]
110
+ if debug:
111
+ print("Found buffers:", buffer_names)
112
+
113
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
114
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
115
+ param_shapes = state_dict[PARAM_SHAPES]
116
+
117
+ # collect parameters that are included in param_shapes
118
+ param_names = []
119
+ for s in param_shapes:
120
+ for name in s.keys():
121
+ param_names.append(name)
122
+
123
+ # update with frozen parameters
124
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
125
+ if frozen_param_shapes is not None:
126
+ if debug:
127
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
128
+ param_names += list(frozen_param_shapes.keys())
129
+
130
+ # handle shared params
131
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
132
+
133
+ ds_version = state_dict.get(DS_VERSION, None)
134
+
135
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
136
+
137
+ z_model_state = zero_model_state(buffers=buffers,
138
+ param_shapes=param_shapes,
139
+ shared_params=shared_params,
140
+ ds_version=ds_version,
141
+ frozen_param_shapes=frozen_param_shapes,
142
+ frozen_param_fragments=frozen_param_fragments)
143
+ zero_model_states.append(z_model_state)
144
+
145
+ return zero_model_states
146
+
147
+
148
+ def parse_optim_states(files, ds_checkpoint_dir):
149
+ total_files = len(files)
150
+ state_dicts = []
151
+ for f in tqdm(files, desc='Loading checkpoint shards'):
152
+ state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
153
+ # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
154
+ # and also handle the case where it was already removed by another helper script
155
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
156
+ state_dicts.append(state_dict)
157
+
158
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
159
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
160
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
161
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
162
+
163
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
164
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
165
+ # use the max of the partition_count to get the dp world_size.
166
+
167
+ if type(world_size) is list:
168
+ world_size = max(world_size)
169
+
170
+ if world_size != total_files:
171
+ raise ValueError(
172
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
173
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
174
+ )
175
+
176
+ # the groups are named differently in each stage
177
+ if zero_stage <= 2:
178
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
179
+ elif zero_stage == 3:
180
+ fp32_groups_key = FP32_FLAT_GROUPS
181
+ else:
182
+ raise ValueError(f"unknown zero stage {zero_stage}")
183
+
184
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
185
+ return zero_stage, world_size, fp32_flat_groups
186
+
187
+
188
+ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
189
+ """
190
+ Returns fp32 state_dict reconstructed from ds checkpoint
191
+
192
+ Args:
193
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
194
+
195
+ """
196
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
197
+
198
+ optim_files = get_optim_files(ds_checkpoint_dir)
199
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
200
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
201
+
202
+ model_files = get_model_state_files(ds_checkpoint_dir)
203
+
204
+ zero_model_states = parse_model_states(model_files)
205
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
206
+
207
+ if zero_stage <= 2:
208
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
209
+ exclude_frozen_parameters)
210
+ elif zero_stage == 3:
211
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
212
+ exclude_frozen_parameters)
213
+
214
+
215
+ def _zero2_merge_frozen_params(state_dict, zero_model_states):
216
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
217
+ return
218
+
219
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
220
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
221
+
222
+ if debug:
223
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
224
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
225
+
226
+ wanted_params = len(frozen_param_shapes)
227
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
228
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
229
+ print(f'Frozen params: Have {avail_numel} numels to process.')
230
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
231
+
232
+ total_params = 0
233
+ total_numel = 0
234
+ for name, shape in frozen_param_shapes.items():
235
+ total_params += 1
236
+ unpartitioned_numel = shape.numel()
237
+ total_numel += unpartitioned_numel
238
+
239
+ state_dict[name] = frozen_param_fragments[name]
240
+
241
+ if debug:
242
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
243
+
244
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
245
+
246
+
247
+ def _has_callable(obj, fn):
248
+ attr = getattr(obj, fn, None)
249
+ return callable(attr)
250
+
251
+
252
+ def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
253
+ param_shapes = zero_model_states[0].param_shapes
254
+
255
+ # Reconstruction protocol:
256
+ #
257
+ # XXX: document this
258
+
259
+ if debug:
260
+ for i in range(world_size):
261
+ for j in range(len(fp32_flat_groups[0])):
262
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
263
+
264
+ # XXX: memory usage doubles here (zero2)
265
+ num_param_groups = len(fp32_flat_groups[0])
266
+ merged_single_partition_of_fp32_groups = []
267
+ for i in range(num_param_groups):
268
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
269
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
270
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
271
+ avail_numel = sum(
272
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
273
+
274
+ if debug:
275
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
276
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
277
+ # not asserting if there is a mismatch due to possible padding
278
+ print(f"Have {avail_numel} numels to process.")
279
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
280
+
281
+ # params
282
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
283
+ # out-of-core computing solution
284
+ total_numel = 0
285
+ total_params = 0
286
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
287
+ offset = 0
288
+ avail_numel = full_single_fp32_vector.numel()
289
+ for name, shape in shapes.items():
290
+
291
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
292
+ total_numel += unpartitioned_numel
293
+ total_params += 1
294
+
295
+ if debug:
296
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
297
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
298
+ offset += unpartitioned_numel
299
+
300
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
301
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
302
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
303
+ # live optimizer object, so we are checking that the numbers are within the right range
304
+ align_to = 2 * world_size
305
+
306
+ def zero2_align(x):
307
+ return align_to * math.ceil(x / align_to)
308
+
309
+ if debug:
310
+ print(f"original offset={offset}, avail_numel={avail_numel}")
311
+
312
+ offset = zero2_align(offset)
313
+ avail_numel = zero2_align(avail_numel)
314
+
315
+ if debug:
316
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
317
+
318
+ # Sanity check
319
+ if offset != avail_numel:
320
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
321
+
322
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
323
+
324
+
325
+ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
326
+ exclude_frozen_parameters):
327
+ state_dict = OrderedDict()
328
+
329
+ # buffers
330
+ buffers = zero_model_states[0].buffers
331
+ state_dict.update(buffers)
332
+ if debug:
333
+ print(f"added {len(buffers)} buffers")
334
+
335
+ if not exclude_frozen_parameters:
336
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
337
+
338
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
339
+
340
+ # recover shared parameters
341
+ for pair in zero_model_states[0].shared_params:
342
+ if pair[1] in state_dict:
343
+ state_dict[pair[0]] = state_dict[pair[1]]
344
+
345
+ return state_dict
346
+
347
+
348
+ def zero3_partitioned_param_info(unpartitioned_numel, world_size):
349
+ remainder = unpartitioned_numel % world_size
350
+ padding_numel = (world_size - remainder) if remainder else 0
351
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
352
+ return partitioned_numel, padding_numel
353
+
354
+
355
+ def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
356
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
357
+ return
358
+
359
+ if debug:
360
+ for i in range(world_size):
361
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
362
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
363
+
364
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
365
+ wanted_params = len(frozen_param_shapes)
366
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
367
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
368
+ print(f'Frozen params: Have {avail_numel} numels to process.')
369
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
370
+
371
+ total_params = 0
372
+ total_numel = 0
373
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
374
+ total_params += 1
375
+ unpartitioned_numel = shape.numel()
376
+ total_numel += unpartitioned_numel
377
+
378
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
379
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
380
+
381
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
382
+
383
+ if debug:
384
+ print(
385
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
386
+ )
387
+
388
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
389
+
390
+
391
+ class GatheredTensor:
392
+ """
393
+ A pseudo tensor that collects partitioned weights.
394
+ It is more memory efficient when there are multiple groups.
395
+ """
396
+
397
+ def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
398
+ self.flat_groups = flat_groups
399
+ self.flat_groups_offset = flat_groups_offset
400
+ self.offset = offset
401
+ self.partitioned_numel = partitioned_numel
402
+ self.shape = shape
403
+ self.dtype = self.flat_groups[0][0].dtype
404
+
405
+ def contiguous(self):
406
+ """
407
+ Merge partitioned weights from flat_groups into a single tensor.
408
+ """
409
+ end_idx = self.offset + self.partitioned_numel
410
+ world_size = len(self.flat_groups)
411
+ pad_flat_param_chunks = []
412
+
413
+ for rank_i in range(world_size):
414
+ # for each rank, we need to collect weights from related group/groups
415
+ flat_groups_at_rank_i = self.flat_groups[rank_i]
416
+ start_group_id = None
417
+ end_group_id = None
418
+ for group_id in range(len(self.flat_groups_offset)):
419
+ if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
420
+ start_group_id = group_id
421
+ if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
422
+ end_group_id = group_id
423
+ break
424
+ # collect weights from related group/groups
425
+ for group_id in range(start_group_id, end_group_id + 1):
426
+ flat_tensor = flat_groups_at_rank_i[group_id]
427
+ start_offset = self.offset - self.flat_groups_offset[group_id]
428
+ end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
429
+ pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
430
+
431
+ # collect weights from all ranks
432
+ pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
433
+ param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
434
+ return param
435
+
436
+
437
+ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
438
+ param_shapes = zero_model_states[0].param_shapes
439
+ avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
440
+
441
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
442
+ # param, re-consolidating each param, while dealing with padding if any
443
+
444
+ # merge list of dicts, preserving order
445
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
446
+
447
+ if debug:
448
+ for i in range(world_size):
449
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
450
+
451
+ wanted_params = len(param_shapes)
452
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
453
+ # not asserting if there is a mismatch due to possible padding
454
+ avail_numel = fp32_flat_groups[0].numel() * world_size
455
+ print(f"Trainable params: Have {avail_numel} numels to process.")
456
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
457
+
458
+ # params
459
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
460
+ # out-of-core computing solution
461
+ offset = 0
462
+ total_numel = 0
463
+ total_params = 0
464
+ flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
465
+ for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
466
+ unpartitioned_numel = shape.numel()
467
+ total_numel += unpartitioned_numel
468
+ total_params += 1
469
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
470
+
471
+ if debug:
472
+ print(
473
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
474
+ )
475
+
476
+ # memory efficient tensor
477
+ tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
478
+ state_dict[name] = tensor
479
+ offset += partitioned_numel
480
+
481
+ offset *= world_size
482
+
483
+ # Sanity check
484
+ if offset != avail_numel:
485
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
486
+
487
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
488
+
489
+
490
+ def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
491
+ exclude_frozen_parameters):
492
+ state_dict = OrderedDict()
493
+
494
+ # buffers
495
+ buffers = zero_model_states[0].buffers
496
+ state_dict.update(buffers)
497
+ if debug:
498
+ print(f"added {len(buffers)} buffers")
499
+
500
+ if not exclude_frozen_parameters:
501
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
502
+
503
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
504
+
505
+ # recover shared parameters
506
+ for pair in zero_model_states[0].shared_params:
507
+ if pair[1] in state_dict:
508
+ state_dict[pair[0]] = state_dict[pair[1]]
509
+
510
+ return state_dict
511
+
512
+
513
+ def to_torch_tensor(state_dict, return_empty_tensor=False):
514
+ """
515
+ Convert state_dict of GatheredTensor to torch tensor
516
+ """
517
+ torch_state_dict = {}
518
+ converted_tensors = {}
519
+ for name, tensor in state_dict.items():
520
+ tensor_id = id(tensor)
521
+ if tensor_id in converted_tensors: # shared tensors
522
+ shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
523
+ torch_state_dict[name] = shared_tensor
524
+ else:
525
+ converted_tensors[tensor_id] = name
526
+ if return_empty_tensor:
527
+ torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
528
+ else:
529
+ torch_state_dict[name] = tensor.contiguous()
530
+ return torch_state_dict
531
+
532
+
533
+ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
534
+ tag=None,
535
+ exclude_frozen_parameters=False,
536
+ lazy_mode=False):
537
+ """
538
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
539
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
540
+ via a model hub.
541
+
542
+ Args:
543
+ - ``checkpoint_dir``: path to the desired checkpoint folder
544
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
545
+ - ``exclude_frozen_parameters``: exclude frozen parameters
546
+ - ``lazy_mode``: get the state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
547
+ Convert a pseudo tensor to a torch tensor with ``.contiguous()``
548
+
549
+ Returns:
550
+ - pytorch ``state_dict``
551
+
552
+ A typical usage might be ::
553
+
554
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
555
+ # do the training and checkpoint saving
556
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
557
+ model = model.cpu() # move to cpu
558
+ model.load_state_dict(state_dict)
559
+ # submit to model hub or save the model to share with others
560
+
561
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
562
+ application. i.e. you will need to re-initialize the deepspeed engine, since
563
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
564
+
565
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
566
+
567
+ Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
568
+ You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
569
+ the checkpoint. Or you can load state_dict in lazy mode ::
570
+
571
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
572
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
573
+ for name, lazy_tensor in state_dict.items():
574
+ tensor = lazy_tensor.contiguous() # to cpu
575
+ print(name, tensor)
576
+ # del tensor to release memory if it is no longer in use
577
+ """
578
+ if tag is None:
579
+ latest_path = os.path.join(checkpoint_dir, 'latest')
580
+ if os.path.isfile(latest_path):
581
+ with open(latest_path, 'r') as fd:
582
+ tag = fd.read().strip()
583
+ else:
584
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
585
+
586
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
587
+
588
+ if not os.path.isdir(ds_checkpoint_dir):
589
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
590
+
591
+ state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
592
+ if lazy_mode:
593
+ return state_dict
594
+ else:
595
+ return to_torch_tensor(state_dict)
596
+
597
+
598
+ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
599
+ output_dir,
600
+ max_shard_size="5GB",
601
+ safe_serialization=False,
602
+ tag=None,
603
+ exclude_frozen_parameters=False):
604
+ """
605
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
606
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
607
+
608
+ Args:
609
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
610
+ - ``output_dir``: directory to the pytorch fp32 state_dict output files
611
+ - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
612
+ - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
613
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
614
+ - ``exclude_frozen_parameters``: exclude frozen parameters
615
+ """
616
+
617
+ # Dependency pre-check
618
+ if safe_serialization:
619
+ try:
620
+ from safetensors.torch import save_file
621
+ except ImportError:
622
+ print('If you want to use `safe_serialization`, please `pip install safetensors`')
623
+ raise
624
+ if max_shard_size is not None:
625
+ try:
626
+ from huggingface_hub import split_torch_state_dict_into_shards
627
+ except ImportError:
628
+ print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
629
+ raise
630
+
631
+ # Convert zero checkpoint to state_dict
632
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
633
+ tag,
634
+ exclude_frozen_parameters,
635
+ lazy_mode=True)
636
+
637
+ # Shard the model if it is too big.
638
+ weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
639
+ if max_shard_size is not None:
640
+ filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
641
+ # a memory-efficient approach to sharding
642
+ empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
643
+ state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
644
+ filename_pattern=filename_pattern,
645
+ max_shard_size=max_shard_size)
646
+ else:
647
+ from collections import namedtuple
648
+ StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
649
+ state_dict_split = StateDictSplit(is_sharded=False,
650
+ filename_to_tensors={weights_name: list(state_dict.keys())})
651
+
652
+ # Save the model by shard
653
+ os.makedirs(output_dir, exist_ok=True)
654
+ filename_to_tensors = state_dict_split.filename_to_tensors.items()
655
+ for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
656
+ shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
657
+ shard_state_dict = to_torch_tensor(shard_state_dict)
658
+ output_path = os.path.join(output_dir, shard_file)
659
+ if safe_serialization:
660
+ save_file(shard_state_dict, output_path, metadata={"format": "pt"})
661
+ else:
662
+ torch.save(shard_state_dict, output_path)
663
+ # release the memory of current shard
664
+ for tensor_name in list(shard_state_dict.keys()):
665
+ del state_dict[tensor_name]
666
+ del shard_state_dict[tensor_name]
667
+ del shard_state_dict
668
+ gc.collect()
669
+
670
+ # Save index if sharded
671
+ if state_dict_split.is_sharded:
672
+ index = {
673
+ "metadata": state_dict_split.metadata,
674
+ "weight_map": state_dict_split.tensor_to_filename,
675
+ }
676
+ save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
677
+ save_index_file = os.path.join(output_dir, save_index_file)
678
+ with open(save_index_file, "w", encoding="utf-8") as f:
679
+ content = json.dumps(index, indent=2, sort_keys=True) + "\n"
680
+ f.write(content)
681
+
682
+
683
+ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
684
+ """
685
+ 1. Put the provided model on the CPU
686
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
687
+ 3. Load it into the provided model
688
+
689
+ Args:
690
+ - ``model``: the model object to update
691
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
692
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
693
+
694
+ Returns:
695
+ - ``model``: the modified model
696
+
697
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
698
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
699
+ conveniently placed for you in the checkpoint folder.
700
+
701
+ A typical usage might be ::
702
+
703
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
704
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
705
+ # submit to model hub or save the model to share with others
706
+
707
+ Note that once this has been run, the ``model`` will no longer be usable in the deepspeed context
708
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
709
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
710
+
711
+ """
712
+ logger.info(f"Extracting fp32 weights")
713
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
714
+
715
+ logger.info(f"Overwriting model with fp32 weights")
716
+ model = model.cpu()
717
+ model.load_state_dict(state_dict, strict=False)
718
+
719
+ return model
720
+
721
+
722
+ if __name__ == "__main__":
723
+ parser = argparse.ArgumentParser()
724
+ parser.add_argument("checkpoint_dir",
725
+ type=str,
726
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
727
+ parser.add_argument("output_dir",
728
+ type=str,
729
+ help="directory to the pytorch fp32 state_dict output files"
730
+ "(e.g. path/checkpoint-12-output/)")
731
+ parser.add_argument(
732
+ "--max_shard_size",
733
+ type=str,
734
+ default="5GB",
735
+ help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
736
+ "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
737
+ "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
738
+ "without CPU OOM issues.")
739
+ parser.add_argument(
740
+ "--safe_serialization",
741
+ default=False,
742
+ action='store_true',
743
+ help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
744
+ parser.add_argument("-t",
745
+ "--tag",
746
+ type=str,
747
+ default=None,
748
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
749
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
750
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
751
+ args = parser.parse_args()
752
+
753
+ debug = args.debug
754
+
755
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
756
+ args.output_dir,
757
+ max_shard_size=args.max_shard_size,
758
+ safe_serialization=args.safe_serialization,
759
+ tag=args.tag,
760
+ exclude_frozen_parameters=args.exclude_frozen_parameters)
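For readers of this checkpoint folder, the script above is DeepSpeed's utility for consolidating a ZeRO-2/3 checkpoint into plain fp32 weights. A minimal usage sketch follows; the output directory name is illustrative, and the import assumes zero_to_fp32.py is on the Python path (the equivalent CLI is `python zero_to_fp32.py <checkpoint_dir> <output_dir> --safe_serialization`):

# Minimal sketch: consolidate the DeepSpeed checkpoint in this repo into fp32 safetensors shards.
# The output_dir name is hypothetical; checkpoint_dir must contain the 'latest' tag file.
from zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict

convert_zero_checkpoint_to_fp32_state_dict(
    checkpoint_dir="qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280",
    output_dir="checkpoint-280-fp32",  # hypothetical output folder
    max_shard_size="5GB",
    safe_serialization=True,           # requires `pip install safetensors huggingface_hub`
)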
qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/logging.jsonl ADDED
@@ -0,0 +1,65 @@
 
1
+ {"loss": 1.5665555, "token_acc": 0.60450945, "grad_norm": 7.40257723, "learning_rate": 7e-08, "memory(GiB)": 49.46, "train_speed(iter/s)": 0.040747, "epoch": 0.01769912, "global_step/max_steps": "1/280", "percentage": "0.36%", "elapsed_time": "11s", "remaining_time": "55m 43s"}
2
+ {"loss": 1.45099378, "token_acc": 0.6307321, "grad_norm": 7.02262071, "learning_rate": 3.6e-07, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.117152, "epoch": 0.08849558, "global_step/max_steps": "5/280", "percentage": "1.79%", "elapsed_time": "30s", "remaining_time": "27m 36s"}
3
+ {"loss": 1.52643356, "token_acc": 0.59708799, "grad_norm": 6.5932652, "learning_rate": 7.1e-07, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.157327, "epoch": 0.17699115, "global_step/max_steps": "10/280", "percentage": "3.57%", "elapsed_time": "51s", "remaining_time": "22m 57s"}
4
+ {"loss": 1.45303726, "token_acc": 0.61445783, "grad_norm": 6.00735181, "learning_rate": 1.07e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.176339, "epoch": 0.26548673, "global_step/max_steps": "15/280", "percentage": "5.36%", "elapsed_time": "1m 12s", "remaining_time": "21m 20s"}
5
+ {"loss": 1.29538174, "token_acc": 0.65506395, "grad_norm": 6.04484052, "learning_rate": 1.43e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.188068, "epoch": 0.3539823, "global_step/max_steps": "20/280", "percentage": "7.14%", "elapsed_time": "1m 33s", "remaining_time": "20m 19s"}
6
+ {"loss": 1.18455906, "token_acc": 0.67351678, "grad_norm": 5.08393447, "learning_rate": 1.79e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.194724, "epoch": 0.44247788, "global_step/max_steps": "25/280", "percentage": "8.93%", "elapsed_time": "1m 55s", "remaining_time": "19m 41s"}
7
+ {"loss": 1.07627144, "token_acc": 0.69432158, "grad_norm": 4.33531403, "learning_rate": 2e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.199848, "epoch": 0.53097345, "global_step/max_steps": "30/280", "percentage": "10.71%", "elapsed_time": "2m 17s", "remaining_time": "19m 6s"}
8
+ {"loss": 0.95859222, "token_acc": 0.72307692, "grad_norm": 3.80862996, "learning_rate": 2e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.203225, "epoch": 0.61946903, "global_step/max_steps": "35/280", "percentage": "12.50%", "elapsed_time": "2m 39s", "remaining_time": "18m 37s"}
9
+ {"loss": 0.91665325, "token_acc": 0.72515656, "grad_norm": 3.13532295, "learning_rate": 1.99e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.206022, "epoch": 0.7079646, "global_step/max_steps": "40/280", "percentage": "14.29%", "elapsed_time": "3m 1s", "remaining_time": "18m 9s"}
10
+ {"loss": 0.82321072, "token_acc": 0.75093693, "grad_norm": 3.13274444, "learning_rate": 1.98e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.207906, "epoch": 0.79646018, "global_step/max_steps": "45/280", "percentage": "16.07%", "elapsed_time": "3m 23s", "remaining_time": "17m 44s"}
11
+ {"loss": 0.8237381, "token_acc": 0.74372987, "grad_norm": 2.92993036, "learning_rate": 1.96e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.208941, "epoch": 0.88495575, "global_step/max_steps": "50/280", "percentage": "17.86%", "elapsed_time": "3m 46s", "remaining_time": "17m 23s"}
12
+ {"loss": 0.8069458, "token_acc": 0.75139124, "grad_norm": 2.96595549, "learning_rate": 1.94e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.210269, "epoch": 0.97345133, "global_step/max_steps": "55/280", "percentage": "19.64%", "elapsed_time": "4m 9s", "remaining_time": "16m 58s"}
13
+ {"eval_loss": 0.76119775, "eval_token_acc": 0.768634, "eval_runtime": 8.8811, "eval_samples_per_second": 11.147, "eval_steps_per_second": 1.464, "epoch": 1.0, "global_step/max_steps": "57/280", "percentage": "20.36%", "elapsed_time": "4m 24s", "remaining_time": "17m 13s"}
14
+ {"loss": 0.74155283, "token_acc": 0.76174314, "grad_norm": 2.75873969, "learning_rate": 1.92e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.162271, "epoch": 1.05309735, "global_step/max_steps": "60/280", "percentage": "21.43%", "elapsed_time": "5m 57s", "remaining_time": "21m 49s"}
15
+ {"loss": 0.70003881, "token_acc": 0.7729046, "grad_norm": 2.83642808, "learning_rate": 1.9e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.16612, "epoch": 1.14159292, "global_step/max_steps": "65/280", "percentage": "23.21%", "elapsed_time": "6m 18s", "remaining_time": "20m 52s"}
16
+ {"loss": 0.71808538, "token_acc": 0.78272018, "grad_norm": 2.84094881, "learning_rate": 1.87e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.169126, "epoch": 1.2300885, "global_step/max_steps": "70/280", "percentage": "25.00%", "elapsed_time": "6m 41s", "remaining_time": "20m 4s"}
17
+ {"loss": 0.67931938, "token_acc": 0.78407332, "grad_norm": 2.90206518, "learning_rate": 1.83e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.172404, "epoch": 1.31858407, "global_step/max_steps": "75/280", "percentage": "26.79%", "elapsed_time": "7m 2s", "remaining_time": "19m 14s"}
18
+ {"loss": 0.68790846, "token_acc": 0.78568362, "grad_norm": 2.60970514, "learning_rate": 1.8e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.175184, "epoch": 1.40707965, "global_step/max_steps": "80/280", "percentage": "28.57%", "elapsed_time": "7m 24s", "remaining_time": "18m 30s"}
19
+ {"loss": 0.6852675, "token_acc": 0.79100676, "grad_norm": 2.7485199, "learning_rate": 1.76e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.177327, "epoch": 1.49557522, "global_step/max_steps": "85/280", "percentage": "30.36%", "elapsed_time": "7m 46s", "remaining_time": "17m 50s"}
20
+ {"loss": 0.66749797, "token_acc": 0.78595434, "grad_norm": 2.74509355, "learning_rate": 1.72e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.179638, "epoch": 1.5840708, "global_step/max_steps": "90/280", "percentage": "32.14%", "elapsed_time": "8m 8s", "remaining_time": "17m 11s"}
21
+ {"loss": 0.6796401, "token_acc": 0.77995444, "grad_norm": 2.91098892, "learning_rate": 1.67e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.181886, "epoch": 1.67256637, "global_step/max_steps": "95/280", "percentage": "33.93%", "elapsed_time": "8m 29s", "remaining_time": "16m 32s"}
22
+ {"loss": 0.67555723, "token_acc": 0.78282374, "grad_norm": 2.79830182, "learning_rate": 1.62e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.183764, "epoch": 1.76106195, "global_step/max_steps": "100/280", "percentage": "35.71%", "elapsed_time": "8m 51s", "remaining_time": "15m 56s"}
23
+ {"loss": 0.67059937, "token_acc": 0.78954975, "grad_norm": 2.7366378, "learning_rate": 1.57e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.18519, "epoch": 1.84955752, "global_step/max_steps": "105/280", "percentage": "37.50%", "elapsed_time": "9m 14s", "remaining_time": "15m 24s"}
24
+ {"loss": 0.64864173, "token_acc": 0.8053818, "grad_norm": 3.12262213, "learning_rate": 1.52e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.187041, "epoch": 1.9380531, "global_step/max_steps": "110/280", "percentage": "39.29%", "elapsed_time": "9m 35s", "remaining_time": "14m 49s"}
25
+ {"eval_loss": 0.68185467, "eval_token_acc": 0.78458475, "eval_runtime": 8.9028, "eval_samples_per_second": 11.12, "eval_steps_per_second": 1.46, "epoch": 2.0, "global_step/max_steps": "114/280", "percentage": "40.71%", "elapsed_time": "10m 0s", "remaining_time": "14m 33s"}
26
+ {"loss": 0.6256484, "token_acc": 0.80978774, "grad_norm": 2.81546127, "learning_rate": 1.47e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.164747, "epoch": 2.01769912, "global_step/max_steps": "115/280", "percentage": "41.07%", "elapsed_time": "11m 25s", "remaining_time": "16m 23s"}
27
+ {"loss": 0.57377791, "token_acc": 0.81368744, "grad_norm": 2.88682813, "learning_rate": 1.41e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.166357, "epoch": 2.10619469, "global_step/max_steps": "120/280", "percentage": "42.86%", "elapsed_time": "11m 48s", "remaining_time": "15m 45s"}
28
+ {"loss": 0.56583424, "token_acc": 0.8227344, "grad_norm": 2.80023918, "learning_rate": 1.35e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.168259, "epoch": 2.19469027, "global_step/max_steps": "125/280", "percentage": "44.64%", "elapsed_time": "12m 10s", "remaining_time": "15m 5s"}
29
+ {"loss": 0.56824417, "token_acc": 0.82311211, "grad_norm": 2.68745281, "learning_rate": 1.29e-06, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.169724, "epoch": 2.28318584, "global_step/max_steps": "130/280", "percentage": "46.43%", "elapsed_time": "12m 33s", "remaining_time": "14m 29s"}
30
+ {"loss": 0.56792569, "token_acc": 0.81533804, "grad_norm": 2.94724525, "learning_rate": 1.23e-06, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.171306, "epoch": 2.37168142, "global_step/max_steps": "135/280", "percentage": "48.21%", "elapsed_time": "12m 55s", "remaining_time": "13m 52s"}
31
+ {"loss": 0.55305662, "token_acc": 0.82418967, "grad_norm": 2.68422639, "learning_rate": 1.17e-06, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.172815, "epoch": 2.46017699, "global_step/max_steps": "140/280", "percentage": "50.00%", "elapsed_time": "13m 17s", "remaining_time": "13m 17s"}
32
+ {"loss": 0.55742898, "token_acc": 0.82391864, "grad_norm": 2.89600344, "learning_rate": 1.11e-06, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.174493, "epoch": 2.54867257, "global_step/max_steps": "145/280", "percentage": "51.79%", "elapsed_time": "13m 38s", "remaining_time": "12m 41s"}
33
+ {"loss": 0.53968043, "token_acc": 0.82412339, "grad_norm": 2.7671877, "learning_rate": 1.05e-06, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.175595, "epoch": 2.63716814, "global_step/max_steps": "150/280", "percentage": "53.57%", "elapsed_time": "14m 1s", "remaining_time": "12m 9s"}
34
+ {"loss": 0.55234613, "token_acc": 0.81969027, "grad_norm": 2.82775502, "learning_rate": 9.9e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.177022, "epoch": 2.72566372, "global_step/max_steps": "155/280", "percentage": "55.36%", "elapsed_time": "14m 23s", "remaining_time": "11m 35s"}
35
+ {"loss": 0.54289188, "token_acc": 0.81505006, "grad_norm": 2.73058938, "learning_rate": 9.3e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.178401, "epoch": 2.81415929, "global_step/max_steps": "160/280", "percentage": "57.14%", "elapsed_time": "14m 44s", "remaining_time": "11m 3s"}
36
+ {"loss": 0.56448889, "token_acc": 0.81853143, "grad_norm": 2.91696753, "learning_rate": 8.6e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.179906, "epoch": 2.90265487, "global_step/max_steps": "165/280", "percentage": "58.93%", "elapsed_time": "15m 4s", "remaining_time": "10m 30s"}
37
+ {"loss": 0.55083733, "token_acc": 0.82084362, "grad_norm": 2.63392357, "learning_rate": 8e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.180981, "epoch": 2.99115044, "global_step/max_steps": "170/280", "percentage": "60.71%", "elapsed_time": "15m 26s", "remaining_time": "9m 59s"}
38
+ {"eval_loss": 0.6724056, "eval_token_acc": 0.78886422, "eval_runtime": 9.403, "eval_samples_per_second": 10.529, "eval_steps_per_second": 1.383, "epoch": 3.0, "global_step/max_steps": "171/280", "percentage": "61.07%", "elapsed_time": "15m 37s", "remaining_time": "9m 57s"}
39
+ {"loss": 0.50877056, "token_acc": 0.83819138, "grad_norm": 2.67047182, "learning_rate": 7.4e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.166424, "epoch": 3.07079646, "global_step/max_steps": "175/280", "percentage": "62.50%", "elapsed_time": "17m 18s", "remaining_time": "10m 23s"}
40
+ {"loss": 0.48904538, "token_acc": 0.83657115, "grad_norm": 2.7278154, "learning_rate": 6.8e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.167637, "epoch": 3.15929204, "global_step/max_steps": "180/280", "percentage": "64.29%", "elapsed_time": "17m 41s", "remaining_time": "9m 49s"}
41
+ {"loss": 0.48272991, "token_acc": 0.85028611, "grad_norm": 2.85893874, "learning_rate": 6.2e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.168775, "epoch": 3.24778761, "global_step/max_steps": "185/280", "percentage": "66.07%", "elapsed_time": "18m 3s", "remaining_time": "9m 16s"}
42
+ {"loss": 0.49242558, "token_acc": 0.84764898, "grad_norm": 2.80783553, "learning_rate": 5.7e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.169897, "epoch": 3.33628319, "global_step/max_steps": "190/280", "percentage": "67.86%", "elapsed_time": "18m 25s", "remaining_time": "8m 43s"}
43
+ {"loss": 0.48547564, "token_acc": 0.84140924, "grad_norm": 2.90254776, "learning_rate": 5.1e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.170905, "epoch": 3.42477876, "global_step/max_steps": "195/280", "percentage": "69.64%", "elapsed_time": "18m 48s", "remaining_time": "8m 11s"}
44
+ {"loss": 0.50079451, "token_acc": 0.8371805, "grad_norm": 2.68665985, "learning_rate": 4.6e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.17211, "epoch": 3.51327434, "global_step/max_steps": "200/280", "percentage": "71.43%", "elapsed_time": "19m 9s", "remaining_time": "7m 39s"}
45
+ {"loss": 0.46956601, "token_acc": 0.8479307, "grad_norm": 2.77302709, "learning_rate": 4.1e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.173254, "epoch": 3.60176991, "global_step/max_steps": "205/280", "percentage": "73.21%", "elapsed_time": "19m 30s", "remaining_time": "7m 8s"}
46
+ {"loss": 0.49353247, "token_acc": 0.83333333, "grad_norm": 2.74603911, "learning_rate": 3.6e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.174048, "epoch": 3.69026549, "global_step/max_steps": "210/280", "percentage": "75.00%", "elapsed_time": "19m 53s", "remaining_time": "6m 37s"}
47
+ {"loss": 0.46960011, "token_acc": 0.83891382, "grad_norm": 2.7471991, "learning_rate": 3.1e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.17502, "epoch": 3.77876106, "global_step/max_steps": "215/280", "percentage": "76.79%", "elapsed_time": "20m 15s", "remaining_time": "6m 7s"}
48
+ {"loss": 0.47486696, "token_acc": 0.84408737, "grad_norm": 2.72975719, "learning_rate": 2.7e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.176046, "epoch": 3.86725664, "global_step/max_steps": "220/280", "percentage": "78.57%", "elapsed_time": "20m 37s", "remaining_time": "5m 37s"}
49
+ {"loss": 0.48950982, "token_acc": 0.83370306, "grad_norm": 2.6601636, "learning_rate": 2.3e-07, "memory(GiB)": 54.81, "train_speed(iter/s)": 0.17663, "epoch": 3.95575221, "global_step/max_steps": "225/280", "percentage": "80.36%", "elapsed_time": "21m 1s", "remaining_time": "5m 8s"}
50
+ {"eval_loss": 0.67967296, "eval_token_acc": 0.78964231, "eval_runtime": 9.3271, "eval_samples_per_second": 10.614, "eval_steps_per_second": 1.394, "epoch": 4.0, "global_step/max_steps": "228/280", "percentage": "81.43%", "elapsed_time": "21m 20s", "remaining_time": "4m 51s"}
51
+ {"loss": 0.46919098, "token_acc": 0.8445914, "grad_norm": 2.76802331, "learning_rate": 1.9e-07, "memory(GiB)": 54.81, "train_speed(iter/s)": 0.166105, "epoch": 4.03539823, "global_step/max_steps": "230/280", "percentage": "82.14%", "elapsed_time": "22m 52s", "remaining_time": "4m 58s"}
52
+ {"loss": 0.44092317, "token_acc": 0.85327419, "grad_norm": 2.59342763, "learning_rate": 1.5e-07, "memory(GiB)": 54.81, "train_speed(iter/s)": 0.167131, "epoch": 4.12389381, "global_step/max_steps": "235/280", "percentage": "83.93%", "elapsed_time": "23m 13s", "remaining_time": "4m 26s"}
53
+ {"loss": 0.45473242, "token_acc": 0.84603482, "grad_norm": 2.71223504, "learning_rate": 1.2e-07, "memory(GiB)": 54.81, "train_speed(iter/s)": 0.167961, "epoch": 4.21238938, "global_step/max_steps": "240/280", "percentage": "85.71%", "elapsed_time": "23m 36s", "remaining_time": "3m 56s"}
54
+ {"loss": 0.45319195, "token_acc": 0.8497835, "grad_norm": 2.67919728, "learning_rate": 9e-08, "memory(GiB)": 54.81, "train_speed(iter/s)": 0.168899, "epoch": 4.30088496, "global_step/max_steps": "245/280", "percentage": "87.50%", "elapsed_time": "23m 58s", "remaining_time": "3m 25s"}
55
+ {"loss": 0.44355764, "token_acc": 0.849962, "grad_norm": 2.48219505, "learning_rate": 7e-08, "memory(GiB)": 54.81, "train_speed(iter/s)": 0.16972, "epoch": 4.38938053, "global_step/max_steps": "250/280", "percentage": "89.29%", "elapsed_time": "24m 20s", "remaining_time": "2m 55s"}
56
+ {"loss": 0.46917791, "token_acc": 0.84245536, "grad_norm": 2.53771788, "learning_rate": 5e-08, "memory(GiB)": 54.81, "train_speed(iter/s)": 0.170482, "epoch": 4.47787611, "global_step/max_steps": "255/280", "percentage": "91.07%", "elapsed_time": "24m 43s", "remaining_time": "2m 25s"}
57
+ {"loss": 0.46189795, "token_acc": 0.84023492, "grad_norm": 2.73856191, "learning_rate": 3e-08, "memory(GiB)": 54.81, "train_speed(iter/s)": 0.171244, "epoch": 4.56637168, "global_step/max_steps": "260/280", "percentage": "92.86%", "elapsed_time": "25m 5s", "remaining_time": "1m 55s"}
58
+ {"loss": 0.43632402, "token_acc": 0.85635359, "grad_norm": 2.7397749, "learning_rate": 2e-08, "memory(GiB)": 54.81, "train_speed(iter/s)": 0.172005, "epoch": 4.65486726, "global_step/max_steps": "265/280", "percentage": "94.64%", "elapsed_time": "25m 28s", "remaining_time": "1m 26s"}
59
+ {"loss": 0.46266565, "token_acc": 0.84712722, "grad_norm": 2.91340088, "learning_rate": 1e-08, "memory(GiB)": 54.81, "train_speed(iter/s)": 0.172782, "epoch": 4.74336283, "global_step/max_steps": "270/280", "percentage": "96.43%", "elapsed_time": "25m 50s", "remaining_time": "57s"}
60
+ {"loss": 0.4591743, "token_acc": 0.84904992, "grad_norm": 2.90633395, "learning_rate": 0.0, "memory(GiB)": 54.81, "train_speed(iter/s)": 0.17353, "epoch": 4.83185841, "global_step/max_steps": "275/280", "percentage": "98.21%", "elapsed_time": "26m 12s", "remaining_time": "28s"}
61
+ {"loss": 0.46257796, "token_acc": 0.84572387, "grad_norm": 2.65518588, "learning_rate": 0.0, "memory(GiB)": 54.81, "train_speed(iter/s)": 0.17427, "epoch": 4.92035398, "global_step/max_steps": "280/280", "percentage": "100.00%", "elapsed_time": "26m 34s", "remaining_time": "0s"}
62
+ {"eval_loss": 0.68561721, "eval_token_acc": 0.78989404, "eval_runtime": 9.7242, "eval_samples_per_second": 10.181, "eval_steps_per_second": 1.337, "epoch": 4.92035398, "global_step/max_steps": "280/280", "percentage": "100.00%", "elapsed_time": "26m 43s", "remaining_time": "0s"}
63
+ {"eval_loss": 0.68561721, "eval_token_acc": 0.78989404, "eval_runtime": 10.0022, "eval_samples_per_second": 9.898, "eval_steps_per_second": 1.3, "epoch": 4.92035398, "global_step/max_steps": "280/280", "percentage": "100.00%", "elapsed_time": "28m 18s", "remaining_time": "0s"}
64
+ {"train_runtime": 1863.6255, "train_samples_per_second": 2.417, "train_steps_per_second": 0.15, "total_flos": 50117350760448.0, "train_loss": 0.66115946, "epoch": 4.92035398, "global_step/max_steps": "280/280", "percentage": "100.00%", "elapsed_time": "30m 55s", "remaining_time": "0s"}
65
+ {"model_parameter_info": "Qwen2_5_VLForConditionalGeneration: 8292.1667M Params (7615.6165M Trainable [91.8411%]), 0.0019M Buffers.", "last_model_checkpoint": "/mnt/data/users/liamding/data/MMMT/lora/qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280", "best_model_checkpoint": "/mnt/data/users/liamding/data/MMMT/lora/qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-171", "best_metric": 0.6724056, "global_step": 280, "log_history": [{"loss": 1.5665555000305176, "token_acc": 0.6045094454600853, "grad_norm": 7.40257722540107, "learning_rate": 7.142857142857142e-08, "memory(GiB)": 49.46, "train_speed(iter/s)": 0.040747, "epoch": 0.017699115044247787, "step": 1}, {"loss": 1.4509937763214111, "token_acc": 0.6307320997586484, "grad_norm": 7.0226207081313206, "learning_rate": 3.5714285714285716e-07, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.117152, "epoch": 0.08849557522123894, "step": 5}, {"loss": 1.526433563232422, "token_acc": 0.5970879879502949, "grad_norm": 6.593265202788787, "learning_rate": 7.142857142857143e-07, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.157327, "epoch": 0.17699115044247787, "step": 10}, {"loss": 1.4530372619628906, "token_acc": 0.6144578313253012, "grad_norm": 6.007351809416256, "learning_rate": 1.0714285714285714e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.176339, "epoch": 0.26548672566371684, "step": 15}, {"loss": 1.295381736755371, "token_acc": 0.6550639547074858, "grad_norm": 6.044840522609318, "learning_rate": 1.4285714285714286e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.188068, "epoch": 0.35398230088495575, "step": 20}, {"loss": 1.1845590591430664, "token_acc": 0.6735167769650529, "grad_norm": 5.083934465976827, "learning_rate": 1.7857142857142857e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.194724, "epoch": 0.4424778761061947, "step": 25}, {"loss": 1.0762714385986327, "token_acc": 0.6943215780035864, "grad_norm": 4.335314034421227, "learning_rate": 1.999689182000816e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.199848, "epoch": 0.5309734513274337, "step": 30}, {"loss": 0.9585922241210938, "token_acc": 0.7230769230769231, "grad_norm": 3.8086299642108323, "learning_rate": 1.9961946980917456e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.203225, "epoch": 0.6194690265486725, "step": 35}, {"loss": 0.9166532516479492, "token_acc": 0.7251565638662413, "grad_norm": 3.1353229453238276, "learning_rate": 1.9888308262251284e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.206022, "epoch": 0.7079646017699115, "step": 40}, {"loss": 0.8232107162475586, "token_acc": 0.7509369311489453, "grad_norm": 3.132744443004781, "learning_rate": 1.9776261689193047e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.207906, "epoch": 0.7964601769911505, "step": 45}, {"loss": 0.8237380981445312, "token_acc": 0.7437298721577047, "grad_norm": 2.9299303644020624, "learning_rate": 1.962624246950012e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.208941, "epoch": 0.8849557522123894, "step": 50}, {"loss": 0.80694580078125, "token_acc": 0.7513912412291314, "grad_norm": 2.965955486896327, "learning_rate": 1.9438833303083674e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.210269, "epoch": 0.9734513274336283, "step": 55}, {"eval_loss": 0.7611977458000183, "eval_token_acc": 0.7686340023342564, "eval_runtime": 8.8811, "eval_samples_per_second": 11.147, "eval_steps_per_second": 1.464, "epoch": 1.0, "step": 57}, {"loss": 0.7415528297424316, "token_acc": 0.7617431441759436, "grad_norm": 2.7587396910701876, "learning_rate": 
1.9214762118704076e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.162271, "epoch": 1.0530973451327434, "step": 60}, {"loss": 0.7000388145446778, "token_acc": 0.772904598652323, "grad_norm": 2.8364280800148047, "learning_rate": 1.895489924657301e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.16612, "epoch": 1.1415929203539823, "step": 65}, {"loss": 0.7180853843688965, "token_acc": 0.7827201783723523, "grad_norm": 2.8409488097023816, "learning_rate": 1.8660254037844386e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.169126, "epoch": 1.2300884955752212, "step": 70}, {"loss": 0.6793193817138672, "token_acc": 0.7840733246042138, "grad_norm": 2.9020651768320924, "learning_rate": 1.8331970944124488e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.172404, "epoch": 1.3185840707964602, "step": 75}, {"loss": 0.6879084587097168, "token_acc": 0.7856836230950848, "grad_norm": 2.609705143690802, "learning_rate": 1.7971325072229223e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.175184, "epoch": 1.407079646017699, "step": 80}, {"loss": 0.6852674961090088, "token_acc": 0.7910067560084173, "grad_norm": 2.7485198999731923, "learning_rate": 1.7579717231454529e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.177327, "epoch": 1.495575221238938, "step": 85}, {"loss": 0.667497968673706, "token_acc": 0.7859543400162243, "grad_norm": 2.7450935504692753, "learning_rate": 1.7158668492597184e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.179638, "epoch": 1.584070796460177, "step": 90}, {"loss": 0.6796401023864747, "token_acc": 0.7799544419134397, "grad_norm": 2.910988919694692, "learning_rate": 1.67098142798597e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.181886, "epoch": 1.672566371681416, "step": 95}, {"loss": 0.6755572319030761, "token_acc": 0.7828237410071942, "grad_norm": 2.7983018227375127, "learning_rate": 1.6234898018587336e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.183764, "epoch": 1.7610619469026547, "step": 100}, {"loss": 0.670599365234375, "token_acc": 0.7895497498610339, "grad_norm": 2.7366377994394595, "learning_rate": 1.573576436351046e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.18519, "epoch": 1.8495575221238938, "step": 105}, {"loss": 0.6486417293548584, "token_acc": 0.8053818010149888, "grad_norm": 3.1226221302102064, "learning_rate": 1.521435203379498e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.187041, "epoch": 1.9380530973451329, "step": 110}, {"eval_loss": 0.6818546652793884, "eval_token_acc": 0.7845847541021123, "eval_runtime": 8.9028, "eval_samples_per_second": 11.12, "eval_steps_per_second": 1.46, "epoch": 2.0, "step": 114}, {"loss": 0.6256484031677246, "token_acc": 0.8097877358490566, "grad_norm": 2.8154612668678487, "learning_rate": 1.467268628273062e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.164747, "epoch": 2.017699115044248, "step": 115}, {"loss": 0.5737779140472412, "token_acc": 0.8136874361593462, "grad_norm": 2.886828127030963, "learning_rate": 1.4112871031306117e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.166357, "epoch": 2.106194690265487, "step": 120}, {"loss": 0.5658342361450195, "token_acc": 0.8227344032822493, "grad_norm": 2.8002391821263237, "learning_rate": 1.3537080696225813e-06, "memory(GiB)": 53.23, "train_speed(iter/s)": 0.168259, "epoch": 2.1946902654867255, "step": 125}, {"loss": 0.5682441711425781, "token_acc": 0.8231121067861477, "grad_norm": 2.6874528113525042, "learning_rate": 1.2947551744109043e-06, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.169724, "epoch": 2.2831858407079646, "step": 
130}, {"loss": 0.5679256916046143, "token_acc": 0.8153380423814329, "grad_norm": 2.9472452463373333, "learning_rate": 1.2346574004677154e-06, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.171306, "epoch": 2.3716814159292037, "step": 135}, {"loss": 0.5530566215515137, "token_acc": 0.8241896689955331, "grad_norm": 2.684226386673578, "learning_rate": 1.1736481776669305e-06, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.172815, "epoch": 2.4601769911504423, "step": 140}, {"loss": 0.5574289798736572, "token_acc": 0.8239186374218845, "grad_norm": 2.8960034438006352, "learning_rate": 1.1119644761033077e-06, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.174493, "epoch": 2.5486725663716814, "step": 145}, {"loss": 0.5396804332733154, "token_acc": 0.8241233910341766, "grad_norm": 2.767187699405252, "learning_rate": 1.0498458856606971e-06, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.175595, "epoch": 2.6371681415929205, "step": 150}, {"loss": 0.552346134185791, "token_acc": 0.8196902654867256, "grad_norm": 2.827755019807615, "learning_rate": 9.875336854045848e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.177022, "epoch": 2.725663716814159, "step": 155}, {"loss": 0.5428918838500977, "token_acc": 0.8150500592098181, "grad_norm": 2.730589378155677, "learning_rate": 9.252699064135758e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.178401, "epoch": 2.814159292035398, "step": 160}, {"loss": 0.5644888877868652, "token_acc": 0.8185314265112367, "grad_norm": 2.9169675295805533, "learning_rate": 8.632963916899268e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.179906, "epoch": 2.9026548672566372, "step": 165}, {"loss": 0.5508373260498047, "token_acc": 0.8208436167825892, "grad_norm": 2.6339235662104494, "learning_rate": 8.018538568006025e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.180981, "epoch": 2.991150442477876, "step": 170}, {"eval_loss": 0.6724056005477905, "eval_token_acc": 0.7888642240886102, "eval_runtime": 9.403, "eval_samples_per_second": 10.529, "eval_steps_per_second": 1.383, "epoch": 3.0, "step": 171}, {"loss": 0.5087705612182617, "token_acc": 0.8381913834778899, "grad_norm": 2.6704718240156735, "learning_rate": 7.411809548974791e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.166424, "epoch": 3.0707964601769913, "step": 175}, {"loss": 0.4890453815460205, "token_acc": 0.8365711510441962, "grad_norm": 2.7278153978843647, "learning_rate": 6.815133497483157e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.167637, "epoch": 3.15929203539823, "step": 180}, {"loss": 0.4827299118041992, "token_acc": 0.8502861146794348, "grad_norm": 2.8589387437951475, "learning_rate": 6.230828003789947e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.168775, "epoch": 3.247787610619469, "step": 185}, {"loss": 0.49242558479309084, "token_acc": 0.847648975791434, "grad_norm": 2.807835530135265, "learning_rate": 5.661162608824419e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.169897, "epoch": 3.336283185840708, "step": 190}, {"loss": 0.48547563552856443, "token_acc": 0.8414092364913731, "grad_norm": 2.9025477586689195, "learning_rate": 5.10834998890711e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.170905, "epoch": 3.4247787610619467, "step": 195}, {"loss": 0.500794506072998, "token_acc": 0.8371804976917013, "grad_norm": 2.686659851342714, "learning_rate": 4.5745373613424065e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.17211, "epoch": 3.5132743362831858, "step": 200}, {"loss": 0.46956601142883303, "token_acc": 0.8479307025986526, "grad_norm": 2.7730270923239906, 
"learning_rate": 4.061798144264985e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.173254, "epoch": 3.601769911504425, "step": 205}, {"loss": 0.4935324668884277, "token_acc": 0.8333333333333334, "grad_norm": 2.74603910813651, "learning_rate": 3.5721239031346063e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.174048, "epoch": 3.6902654867256635, "step": 210}, {"loss": 0.46960010528564455, "token_acc": 0.8389138188931078, "grad_norm": 2.747199095702964, "learning_rate": 3.1074166151605295e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.17502, "epoch": 3.7787610619469025, "step": 215}, {"loss": 0.4748669624328613, "token_acc": 0.8440873713281446, "grad_norm": 2.7297571871062196, "learning_rate": 2.6694812817017387e-07, "memory(GiB)": 53.59, "train_speed(iter/s)": 0.176046, "epoch": 3.8672566371681416, "step": 220}, {"loss": 0.48950982093811035, "token_acc": 0.8337030646619012, "grad_norm": 2.6601635969723216, "learning_rate": 2.260018917337726e-07, "memory(GiB)": 54.81, "train_speed(iter/s)": 0.17663, "epoch": 3.9557522123893807, "step": 225}, {"eval_loss": 0.6796729564666748, "eval_token_acc": 0.7896423095407007, "eval_runtime": 9.3271, "eval_samples_per_second": 10.614, "eval_steps_per_second": 1.394, "epoch": 4.0, "step": 228}, {"loss": 0.46919097900390627, "token_acc": 0.8445914029286726, "grad_norm": 2.768023310981967, "learning_rate": 1.880619942841435e-07, "memory(GiB)": 54.81, "train_speed(iter/s)": 0.166105, "epoch": 4.035398230088496, "step": 230}, {"loss": 0.4409231662750244, "token_acc": 0.853274191665694, "grad_norm": 2.593427632271344, "learning_rate": 1.5327580077171588e-07, "memory(GiB)": 54.81, "train_speed(iter/s)": 0.167131, "epoch": 4.123893805309734, "step": 235}, {"loss": 0.45473241806030273, "token_acc": 0.8460348162475823, "grad_norm": 2.7122350384023686, "learning_rate": 1.2177842662977133e-07, "memory(GiB)": 54.81, "train_speed(iter/s)": 0.167961, "epoch": 4.212389380530974, "step": 240}, {"loss": 0.45319194793701173, "token_acc": 0.8497835017208838, "grad_norm": 2.679197284138057, "learning_rate": 9.369221296335006e-08, "memory(GiB)": 54.81, "train_speed(iter/s)": 0.168899, "epoch": 4.300884955752212, "step": 245}, {"loss": 0.4435576438903809, "token_acc": 0.8499620019541853, "grad_norm": 2.482195048589529, "learning_rate": 6.912625135579586e-08, "memory(GiB)": 54.81, "train_speed(iter/s)": 0.16972, "epoch": 4.389380530973451, "step": 250}, {"loss": 0.4691779136657715, "token_acc": 0.8424553623526666, "grad_norm": 2.5377178768662003, "learning_rate": 4.817596013867764e-08, "memory(GiB)": 54.81, "train_speed(iter/s)": 0.170482, "epoch": 4.477876106194691, "step": 255}, {"loss": 0.4618979454040527, "token_acc": 0.8402349193708939, "grad_norm": 2.7385619077524135, "learning_rate": 3.092271377092215e-08, "memory(GiB)": 54.81, "train_speed(iter/s)": 0.171244, "epoch": 4.566371681415929, "step": 260}, {"loss": 0.4363240242004395, "token_acc": 0.856353591160221, "grad_norm": 2.739774896659754, "learning_rate": 1.7433526766711725e-08, "memory(GiB)": 54.81, "train_speed(iter/s)": 0.172005, "epoch": 4.654867256637168, "step": 265}, {"loss": 0.46266565322875974, "token_acc": 0.8471272229822161, "grad_norm": 2.913400883331699, "learning_rate": 7.760793399827936e-09, "memory(GiB)": 54.81, "train_speed(iter/s)": 0.172782, "epoch": 4.743362831858407, "step": 270}, {"loss": 0.4591742992401123, "token_acc": 0.8490499232857311, "grad_norm": 2.9063339515799447, "learning_rate": 1.942084195468152e-09, "memory(GiB)": 54.81, "train_speed(iter/s)": 0.17353, "epoch": 
4.831858407079646, "step": 275}, {"loss": 0.46257796287536623, "token_acc": 0.8457238680827278, "grad_norm": 2.655185884569906, "learning_rate": 0.0, "memory(GiB)": 54.81, "train_speed(iter/s)": 0.17427, "epoch": 4.920353982300885, "step": 280}, {"eval_loss": 0.685617208480835, "eval_token_acc": 0.7898940430693182, "eval_runtime": 9.7242, "eval_samples_per_second": 10.181, "eval_steps_per_second": 1.337, "epoch": 4.920353982300885, "step": 280}, {"eval_loss": 0.685617208480835, "eval_token_acc": 0.7898940430693182, "eval_runtime": 10.0022, "eval_samples_per_second": 9.898, "eval_steps_per_second": 1.3, "epoch": 4.920353982300885, "step": 280}, {"train_runtime": 1863.6255, "train_samples_per_second": 2.417, "train_steps_per_second": 0.15, "total_flos": 50117350760448.0, "train_loss": 0.6611594574792045, "epoch": 4.920353982300885, "step": 280}], "memory": 54.8125}
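Each record in logging.jsonl above is one JSON object per logging step: training entries carry loss, token_acc, grad_norm and learning_rate, while evaluation entries carry eval_loss and eval_token_acc; the final records are run-level summaries. A minimal sketch for extracting the loss curves (path relative to the repo root; any JSON-capable tooling works):

import json

train_rows, eval_rows = [], []
with open("qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/logging.jsonl") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        rec = json.loads(line)
        if "eval_loss" in rec:
            eval_rows.append(rec)    # per-epoch evaluation entries
        elif "loss" in rec:
            train_rows.append(rec)   # per-step training entries

print("final train loss:", train_rows[-1]["loss"])
print("best eval loss:", min(r["eval_loss"] for r in eval_rows))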
qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/val_dataset.jsonl ADDED
The diff for this file is too large to render. See raw diff