p1k0 commited on
Commit
be7e0e3
·
verified ·
1 Parent(s): fc6f21b

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/args.json +384 -0
  3. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/added_tokens.json +33 -0
  4. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/args.json +384 -0
  5. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/chat_template.jinja +54 -0
  6. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/config.json +144 -0
  7. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/configuration_intern_vit.py +120 -0
  8. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/configuration_internvl_chat.py +97 -0
  9. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/conversation.py +391 -0
  10. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/generation_config.json +5 -0
  11. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/latest +1 -0
  12. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/merges.txt +0 -0
  13. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/model.safetensors.index.json +693 -0
  14. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/modeling_intern_vit.py +431 -0
  15. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/modeling_internvl_chat.py +359 -0
  16. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/preprocessor_config.json +19 -0
  17. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/special_tokens_map.json +31 -0
  18. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/tokenizer_config.json +280 -0
  19. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/trainer_state.json +429 -0
  20. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/vocab.json +0 -0
  21. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/logging.jsonl +85 -0
  22. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/val_dataset.jsonl +135 -0
  23. ood/ivl-8b-instruct-thinking_full_v3_ood_wd001_e10/v3-20250919-091625/checkpoint-228/rng_state_1.pth +2 -2
  24. ood/ivl-8b-instruct-thinking_full_v3_ood_wd001_e10/v3-20250919-091625/checkpoint-228/rng_state_3.pth +2 -2
  25. ood/ivl-8b-instruct-thinking_full_v3_ood_wd001_e10/v3-20250919-091625/checkpoint-228/scheduler.pt +2 -2
  26. ood/ivl-8b-instruct-thinking_full_v3_ood_wd001_e10/v3-20250919-091625/checkpoint-228/tokenizer.json +2 -2
  27. ood/ivl-8b-instruct-thinking_full_v3_ood_wd001_e10/v3-20250919-091625/checkpoint-228/training_args.bin +2 -2
  28. ood/qwen2.5vl-7b-lora-ood-merged/model-00001-of-00004.safetensors +2 -2
  29. ood/qwen2.5vl-7b-lora-ood-merged/model-00002-of-00004.safetensors +2 -2
  30. ood/qwen2.5vl-7b-lora-ood-merged/model-00003-of-00004.safetensors +2 -2
  31. ood/qwen2.5vl-7b-lora-ood-merged/model-00004-of-00004.safetensors +2 -2
  32. ood/qwen2.5vl-7b-lora-ood-merged/tokenizer.json +2 -2
  33. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/model-00001-of-00004.safetensors +3 -0
  34. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/model-00002-of-00004.safetensors +3 -0
  35. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/model-00003-of-00004.safetensors +3 -0
  36. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/model-00004-of-00004.safetensors +3 -0
  37. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/model.safetensors.index.json +737 -0
  38. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/preprocessor_config.json +19 -0
  39. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/rng_state_0.pth +3 -0
  40. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/rng_state_1.pth +3 -0
  41. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/rng_state_2.pth +3 -0
  42. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/rng_state_3.pth +3 -0
  43. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/scheduler.pt +3 -0
  44. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/special_tokens_map.json +31 -0
  45. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/tokenizer.json +3 -0
  46. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/tokenizer_config.json +208 -0
  47. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/trainer_state.json +429 -0
  48. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/training_args.bin +3 -0
  49. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/video_preprocessor_config.json +43 -0
  50. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/vocab.json +0 -0
.gitattributes CHANGED
@@ -60,3 +60,4 @@ qwen2.5vl-7b-lora_epoch10_2e-5/tokenizer.json filter=lfs diff=lfs merge=lfs -tex
60
  llava-ov-lora/tokenizer.json filter=lfs diff=lfs merge=lfs -text
61
  internvl3-8b-instruct-lora_epoch10_5e-6/tokenizer.json filter=lfs diff=lfs merge=lfs -text
62
  qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
60
  llava-ov-lora/tokenizer.json filter=lfs diff=lfs merge=lfs -text
61
  internvl3-8b-instruct-lora_epoch10_5e-6/tokenizer.json filter=lfs diff=lfs merge=lfs -text
62
  qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/tokenizer.json filter=lfs diff=lfs merge=lfs -text
63
+ ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/tokenizer.json filter=lfs diff=lfs merge=lfs -text
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/args.json ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_dir": "/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240",
3
+ "overwrite_output_dir": false,
4
+ "do_train": false,
5
+ "do_eval": false,
6
+ "do_predict": false,
7
+ "eval_strategy": "epoch",
8
+ "prediction_loss_only": false,
9
+ "per_device_train_batch_size": 2,
10
+ "per_device_eval_batch_size": 2,
11
+ "per_gpu_train_batch_size": null,
12
+ "per_gpu_eval_batch_size": null,
13
+ "gradient_accumulation_steps": 2,
14
+ "eval_accumulation_steps": null,
15
+ "eval_delay": 0,
16
+ "torch_empty_cache_steps": null,
17
+ "learning_rate": 5e-07,
18
+ "weight_decay": 0.1,
19
+ "adam_beta1": 0.9,
20
+ "adam_beta2": 0.95,
21
+ "adam_epsilon": 1e-08,
22
+ "max_grad_norm": 1.0,
23
+ "num_train_epochs": 5.0,
24
+ "max_steps": -1,
25
+ "lr_scheduler_type": "cosine",
26
+ "lr_scheduler_kwargs": null,
27
+ "warmup_ratio": 0.1,
28
+ "warmup_steps": 0,
29
+ "log_level": "passive",
30
+ "log_level_replica": "warning",
31
+ "log_on_each_node": true,
32
+ "logging_dir": "/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/runs",
33
+ "logging_strategy": "steps",
34
+ "logging_first_step": true,
35
+ "logging_steps": 5,
36
+ "logging_nan_inf_filter": true,
37
+ "save_strategy": "epoch",
38
+ "save_steps": 500,
39
+ "save_total_limit": 10,
40
+ "save_safetensors": true,
41
+ "save_on_each_node": false,
42
+ "save_only_model": false,
43
+ "restore_callback_states_from_checkpoint": false,
44
+ "no_cuda": false,
45
+ "use_cpu": false,
46
+ "use_mps_device": false,
47
+ "seed": 42,
48
+ "data_seed": 42,
49
+ "jit_mode_eval": false,
50
+ "use_ipex": false,
51
+ "bf16": true,
52
+ "fp16": false,
53
+ "fp16_opt_level": "O1",
54
+ "half_precision_backend": "auto",
55
+ "bf16_full_eval": false,
56
+ "fp16_full_eval": false,
57
+ "tf32": null,
58
+ "local_rank": 0,
59
+ "ddp_backend": null,
60
+ "tpu_num_cores": null,
61
+ "tpu_metrics_debug": false,
62
+ "debug": null,
63
+ "dataloader_drop_last": false,
64
+ "eval_steps": null,
65
+ "dataloader_num_workers": 4,
66
+ "dataloader_prefetch_factor": null,
67
+ "past_index": -1,
68
+ "run_name": "/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240",
69
+ "disable_tqdm": null,
70
+ "remove_unused_columns": true,
71
+ "label_names": null,
72
+ "load_best_model_at_end": true,
73
+ "metric_for_best_model": "eval_loss",
74
+ "greater_is_better": false,
75
+ "ignore_data_skip": false,
76
+ "fsdp": "",
77
+ "fsdp_min_num_params": 0,
78
+ "fsdp_config": null,
79
+ "fsdp_transformer_layer_cls_to_wrap": null,
80
+ "accelerator_config": {
81
+ "dispatch_batches": false
82
+ },
83
+ "deepspeed": {
84
+ "fp16": {
85
+ "enabled": "auto",
86
+ "loss_scale": 0,
87
+ "loss_scale_window": 1000,
88
+ "initial_scale_power": 16,
89
+ "hysteresis": 2,
90
+ "min_loss_scale": 1
91
+ },
92
+ "bf16": {
93
+ "enabled": "auto"
94
+ },
95
+ "zero_optimization": {
96
+ "stage": 3,
97
+ "offload_optimizer": {
98
+ "device": "none",
99
+ "pin_memory": true
100
+ },
101
+ "offload_param": {
102
+ "device": "none",
103
+ "pin_memory": true
104
+ },
105
+ "overlap_comm": false,
106
+ "contiguous_gradients": true,
107
+ "sub_group_size": 1000000000.0,
108
+ "reduce_bucket_size": "auto",
109
+ "zero_quantized_weights": false,
110
+ "zero_quantized_gradients": false,
111
+ "stage3_prefetch_bucket_size": "auto",
112
+ "stage3_param_persistence_threshold": "auto",
113
+ "stage3_max_live_parameters": 1000000000.0,
114
+ "stage3_max_reuse_distance": 1000000000.0,
115
+ "stage3_gather_16bit_weights_on_model_save": true
116
+ },
117
+ "gradient_accumulation_steps": "auto",
118
+ "gradient_clipping": "auto",
119
+ "steps_per_print": 2000,
120
+ "train_batch_size": "auto",
121
+ "train_micro_batch_size_per_gpu": "auto",
122
+ "wall_clock_breakdown": false
123
+ },
124
+ "label_smoothing_factor": 0.0,
125
+ "optim": "adamw_torch",
126
+ "optim_args": null,
127
+ "adafactor": false,
128
+ "group_by_length": false,
129
+ "length_column_name": "length",
130
+ "report_to": [
131
+ "swanlab"
132
+ ],
133
+ "ddp_find_unused_parameters": null,
134
+ "ddp_bucket_cap_mb": null,
135
+ "ddp_broadcast_buffers": null,
136
+ "dataloader_pin_memory": true,
137
+ "dataloader_persistent_workers": false,
138
+ "skip_memory_metrics": true,
139
+ "use_legacy_prediction_loop": false,
140
+ "push_to_hub": false,
141
+ "resume_from_checkpoint": null,
142
+ "hub_model_id": null,
143
+ "hub_strategy": "every_save",
144
+ "hub_token": null,
145
+ "hub_private_repo": null,
146
+ "hub_always_push": false,
147
+ "hub_revision": null,
148
+ "gradient_checkpointing": true,
149
+ "gradient_checkpointing_kwargs": null,
150
+ "include_inputs_for_metrics": false,
151
+ "include_for_metrics": [],
152
+ "eval_do_concat_batches": true,
153
+ "fp16_backend": "auto",
154
+ "push_to_hub_model_id": null,
155
+ "push_to_hub_organization": null,
156
+ "push_to_hub_token": null,
157
+ "mp_parameters": "",
158
+ "auto_find_batch_size": false,
159
+ "full_determinism": false,
160
+ "torchdynamo": null,
161
+ "ray_scope": "last",
162
+ "ddp_timeout": 18000000,
163
+ "torch_compile": false,
164
+ "torch_compile_backend": null,
165
+ "torch_compile_mode": null,
166
+ "include_tokens_per_second": false,
167
+ "include_num_input_tokens_seen": false,
168
+ "neftune_noise_alpha": null,
169
+ "optim_target_modules": null,
170
+ "batch_eval_metrics": false,
171
+ "eval_on_start": false,
172
+ "use_liger_kernel": false,
173
+ "liger_kernel_config": null,
174
+ "eval_use_gather_object": false,
175
+ "average_tokens_across_devices": true,
176
+ "sortish_sampler": false,
177
+ "predict_with_generate": false,
178
+ "generation_max_length": null,
179
+ "generation_num_beams": null,
180
+ "generation_config": null,
181
+ "tuner_backend": "peft",
182
+ "vit_gradient_checkpointing": null,
183
+ "router_aux_loss_coef": 0.0,
184
+ "enable_dft_loss": false,
185
+ "enable_channel_loss": false,
186
+ "check_model": true,
187
+ "acc_strategy": "token",
188
+ "train_dataloader_shuffle": true,
189
+ "max_epochs": null,
190
+ "aligner_lr": null,
191
+ "vit_lr": null,
192
+ "use_logits_to_keep": null,
193
+ "ds3_gather_for_generation": true,
194
+ "resume_only_model": false,
195
+ "optimizer": null,
196
+ "loss_type": null,
197
+ "metric": null,
198
+ "eval_use_evalscope": false,
199
+ "eval_dataset": [],
200
+ "eval_dataset_args": null,
201
+ "eval_limit": null,
202
+ "eval_generation_config": null,
203
+ "extra_eval_args": null,
204
+ "use_flash_ckpt": false,
205
+ "model": "/mnt/data/users/liamding/data/models/InternVL3-8B-Instruct",
206
+ "model_type": "internvl3",
207
+ "model_revision": null,
208
+ "task_type": "causal_lm",
209
+ "torch_dtype": "bfloat16",
210
+ "attn_impl": null,
211
+ "new_special_tokens": [],
212
+ "num_labels": null,
213
+ "problem_type": null,
214
+ "rope_scaling": null,
215
+ "device_map": null,
216
+ "max_memory": {},
217
+ "max_model_len": null,
218
+ "local_repo_path": null,
219
+ "init_strategy": null,
220
+ "template": "internvl2_5",
221
+ "system": null,
222
+ "max_length": 32768,
223
+ "truncation_strategy": "delete",
224
+ "max_pixels": null,
225
+ "agent_template": null,
226
+ "norm_bbox": null,
227
+ "use_chat_template": true,
228
+ "padding_free": false,
229
+ "padding_side": "right",
230
+ "loss_scale": "default",
231
+ "sequence_parallel_size": 1,
232
+ "response_prefix": null,
233
+ "template_backend": "swift",
234
+ "dataset": [
235
+ "/mnt/data/users/liamding/data/3AM_Plus/final/ood_split/ambi_normal_train_aug_messages.json"
236
+ ],
237
+ "val_dataset": [],
238
+ "split_dataset_ratio": 0.1,
239
+ "dataset_num_proc": 1,
240
+ "load_from_cache_file": true,
241
+ "dataset_shuffle": true,
242
+ "val_dataset_shuffle": false,
243
+ "streaming": false,
244
+ "interleave_prob": null,
245
+ "stopping_strategy": "first_exhausted",
246
+ "shuffle_buffer_size": 1000,
247
+ "download_mode": "reuse_dataset_if_exists",
248
+ "columns": {},
249
+ "strict": false,
250
+ "model_name": null,
251
+ "model_author": null,
252
+ "custom_dataset_info": [],
253
+ "quant_method": null,
254
+ "quant_bits": null,
255
+ "hqq_axis": null,
256
+ "bnb_4bit_compute_dtype": "bfloat16",
257
+ "bnb_4bit_quant_type": "nf4",
258
+ "bnb_4bit_use_double_quant": true,
259
+ "bnb_4bit_quant_storage": null,
260
+ "max_new_tokens": 64,
261
+ "temperature": 0.0,
262
+ "top_k": null,
263
+ "top_p": null,
264
+ "repetition_penalty": null,
265
+ "num_beams": 1,
266
+ "stream": false,
267
+ "stop_words": [],
268
+ "logprobs": false,
269
+ "top_logprobs": null,
270
+ "ckpt_dir": null,
271
+ "lora_modules": [],
272
+ "train_type": "full",
273
+ "adapters": [],
274
+ "external_plugins": [],
275
+ "model_kwargs": {},
276
+ "load_args": false,
277
+ "load_data_args": false,
278
+ "packing": false,
279
+ "packing_length": null,
280
+ "lazy_tokenize": true,
281
+ "cached_dataset": [],
282
+ "custom_register_path": [],
283
+ "use_hf": false,
284
+ "ignore_args_error": false,
285
+ "use_swift_lora": false,
286
+ "freeze_parameters": [
287
+ "vision_model",
288
+ "mlp1"
289
+ ],
290
+ "freeze_parameters_regex": null,
291
+ "freeze_parameters_ratio": 0.0,
292
+ "trainable_parameters": [],
293
+ "trainable_parameters_regex": null,
294
+ "freeze_llm": false,
295
+ "freeze_vit": true,
296
+ "freeze_aligner": true,
297
+ "target_modules": [
298
+ "all-linear"
299
+ ],
300
+ "target_regex": null,
301
+ "target_parameters": null,
302
+ "modules_to_save": [],
303
+ "lora_rank": 8,
304
+ "lora_alpha": 32,
305
+ "lora_dropout": 0.05,
306
+ "lora_bias": "none",
307
+ "lora_dtype": null,
308
+ "lorap_lr_ratio": null,
309
+ "use_rslora": false,
310
+ "use_dora": false,
311
+ "lora_ga_batch_size": 2,
312
+ "lora_ga_iters": 2,
313
+ "lora_ga_max_length": 1024,
314
+ "lora_ga_direction": "ArB2r",
315
+ "lora_ga_scale": "stable",
316
+ "lora_ga_stable_gamma": 16,
317
+ "init_weights": true,
318
+ "fourier_n_frequency": 2000,
319
+ "fourier_scaling": 300.0,
320
+ "boft_block_size": 4,
321
+ "boft_block_num": 0,
322
+ "boft_n_butterfly_factor": 1,
323
+ "boft_dropout": 0.0,
324
+ "vera_rank": 256,
325
+ "vera_projection_prng_key": 0,
326
+ "vera_dropout": 0.0,
327
+ "vera_d_initial": 0.1,
328
+ "adapter_act": "gelu",
329
+ "adapter_length": 128,
330
+ "use_galore": false,
331
+ "galore_target_modules": null,
332
+ "galore_rank": 128,
333
+ "galore_update_proj_gap": 50,
334
+ "galore_scale": 1.0,
335
+ "galore_proj_type": "std",
336
+ "galore_optim_per_parameter": false,
337
+ "galore_with_embedding": false,
338
+ "galore_quantization": false,
339
+ "galore_proj_quant": false,
340
+ "galore_proj_bits": 4,
341
+ "galore_proj_group_size": 256,
342
+ "galore_cos_threshold": 0.4,
343
+ "galore_gamma_proj": 2,
344
+ "galore_queue_size": 5,
345
+ "adalora_target_r": 8,
346
+ "adalora_init_r": 12,
347
+ "adalora_tinit": 0,
348
+ "adalora_tfinal": 0,
349
+ "adalora_deltaT": 1,
350
+ "adalora_beta1": 0.85,
351
+ "adalora_beta2": 0.85,
352
+ "adalora_orth_reg_weight": 0.5,
353
+ "llamapro_num_new_blocks": 4,
354
+ "llamapro_num_groups": null,
355
+ "lisa_activated_layers": 0,
356
+ "lisa_step_interval": 20,
357
+ "reft_layer_key": null,
358
+ "reft_layers": null,
359
+ "reft_rank": 4,
360
+ "reft_intervention_type": "LoreftIntervention",
361
+ "reft_args": null,
362
+ "swanlab_token": null,
363
+ "swanlab_project": null,
364
+ "swanlab_workspace": null,
365
+ "swanlab_exp_name": "/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240",
366
+ "swanlab_lark_webhook_url": null,
367
+ "swanlab_lark_secret": null,
368
+ "swanlab_mode": "cloud",
369
+ "add_version": true,
370
+ "create_checkpoint_symlink": false,
371
+ "zero_hpz_partition_size": null,
372
+ "deepspeed_autotp_size": null,
373
+ "early_stop_interval": 200,
374
+ "rank": 0,
375
+ "global_world_size": 4,
376
+ "local_world_size": 4,
377
+ "model_suffix": "InternVL3-8B-Instruct",
378
+ "model_info": "ModelInfo(model_type='internvl3', model_dir='/mnt/data/users/liamding/data/models/InternVL3-8B-Instruct', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling={'factor': 2.0, 'rope_type': 'dynamic', 'type': 'dynamic'}, is_moe_model=False, config=None, task_type='causal_lm', num_labels=None)",
379
+ "model_meta": "ModelMeta(model_type='internvl3', model_groups=[ModelGroup(models=[Model(ms_model_id='OpenGVLab/InternVL3-1B-Pretrained', hf_model_id='OpenGVLab/InternVL3-1B-Pretrained', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-2B-Pretrained', hf_model_id='OpenGVLab/InternVL3-2B-Pretrained', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-8B-Pretrained', hf_model_id='OpenGVLab/InternVL3-8B-Pretrained', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-9B-Pretrained', hf_model_id='OpenGVLab/InternVL3-9B-Pretrained', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-14B-Pretrained', hf_model_id='OpenGVLab/InternVL3-14B-Pretrained', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-38B-Pretrained', hf_model_id='OpenGVLab/InternVL3-38B-Pretrained', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-78B-Pretrained', hf_model_id='OpenGVLab/InternVL3-78B-Pretrained', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='OpenGVLab/InternVL3-1B-Instruct', hf_model_id='OpenGVLab/InternVL3-1B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-2B-Instruct', hf_model_id='OpenGVLab/InternVL3-2B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-8B-Instruct', hf_model_id='OpenGVLab/InternVL3-8B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-9B-Instruct', hf_model_id='OpenGVLab/InternVL3-9B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-14B-Instruct', hf_model_id='OpenGVLab/InternVL3-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-38B-Instruct', hf_model_id='OpenGVLab/InternVL3-38B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-78B-Instruct', hf_model_id='OpenGVLab/InternVL3-78B-Instruct', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='OpenGVLab/InternVL3-1B', hf_model_id='OpenGVLab/InternVL3-1B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-2B', hf_model_id='OpenGVLab/InternVL3-2B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-8B', hf_model_id='OpenGVLab/InternVL3-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-9B', hf_model_id='OpenGVLab/InternVL3-9B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-14B', hf_model_id='OpenGVLab/InternVL3-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-38B', hf_model_id='OpenGVLab/InternVL3-38B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-78B', hf_model_id='OpenGVLab/InternVL3-78B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='OpenGVLab/InternVL3-1B-AWQ', hf_model_id='OpenGVLab/InternVL3-1B-AWQ', model_path=None, 
ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-2B-AWQ', hf_model_id='OpenGVLab/InternVL3-2B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-8B-AWQ', hf_model_id='OpenGVLab/InternVL3-8B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-9B-AWQ', hf_model_id='OpenGVLab/InternVL3-9B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-14B-AWQ', hf_model_id='OpenGVLab/InternVL3-14B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-38B-AWQ', hf_model_id='OpenGVLab/InternVL3-38B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-78B-AWQ', hf_model_id='OpenGVLab/InternVL3-78B-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='internvl2_5', get_function=<function get_model_tokenizer_internvl at 0x7f81e9d2ea70>, model_arch=MultiModelKeys(arch_name='internvl', embedding=None, module_list=None, lm_head=None, q_proj=None, k_proj=None, v_proj=None, o_proj=None, attention=None, mlp=None, down_proj=None, qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None, language_model=['language_model'], aligner=['mlp1'], vision_tower=['vision_model'], generator=[]), architectures=['InternVLChatModel'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.37.2', 'timm'], tags=['vision', 'video'])",
380
+ "model_dir": "/mnt/data/users/liamding/data/models/InternVL3-8B-Instruct",
381
+ "hub": "<class 'swift.hub.hub.MSHub'>",
382
+ "evaluation_strategy": "epoch",
383
+ "training_args": "Seq2SeqTrainingArguments(output_dir='/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=2, per_device_eval_batch_size=2, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=5e-07, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.1, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=10, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=None, dataloader_num_workers=4, dataloader_prefetch_factor=10, past_index=-1, run_name='/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=True, metric_for_best_model='eval_loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['swanlab'], ddp_find_unused_parameters=None, 
ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, enable_channel_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, train_type='full', local_repo_path=None, galore_config=None)"
384
+ }
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/added_tokens.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</box>": 151673,
3
+ "</img>": 151666,
4
+ "</quad>": 151669,
5
+ "</ref>": 151671,
6
+ "</tool_call>": 151658,
7
+ "<IMG_CONTEXT>": 151667,
8
+ "<box>": 151672,
9
+ "<img>": 151665,
10
+ "<quad>": 151668,
11
+ "<ref>": 151670,
12
+ "<tool_call>": 151657,
13
+ "<|box_end|>": 151649,
14
+ "<|box_start|>": 151648,
15
+ "<|endoftext|>": 151643,
16
+ "<|file_sep|>": 151664,
17
+ "<|fim_middle|>": 151660,
18
+ "<|fim_pad|>": 151662,
19
+ "<|fim_prefix|>": 151659,
20
+ "<|fim_suffix|>": 151661,
21
+ "<|im_end|>": 151645,
22
+ "<|im_start|>": 151644,
23
+ "<|image_pad|>": 151655,
24
+ "<|object_ref_end|>": 151647,
25
+ "<|object_ref_start|>": 151646,
26
+ "<|quad_end|>": 151651,
27
+ "<|quad_start|>": 151650,
28
+ "<|repo_name|>": 151663,
29
+ "<|video_pad|>": 151656,
30
+ "<|vision_end|>": 151653,
31
+ "<|vision_pad|>": 151654,
32
+ "<|vision_start|>": 151652
33
+ }
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/args.json ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_dir": "/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240",
3
+ "overwrite_output_dir": false,
4
+ "do_train": false,
5
+ "do_eval": false,
6
+ "do_predict": false,
7
+ "eval_strategy": "epoch",
8
+ "prediction_loss_only": false,
9
+ "per_device_train_batch_size": 2,
10
+ "per_device_eval_batch_size": 2,
11
+ "per_gpu_train_batch_size": null,
12
+ "per_gpu_eval_batch_size": null,
13
+ "gradient_accumulation_steps": 2,
14
+ "eval_accumulation_steps": null,
15
+ "eval_delay": 0,
16
+ "torch_empty_cache_steps": null,
17
+ "learning_rate": 5e-07,
18
+ "weight_decay": 0.1,
19
+ "adam_beta1": 0.9,
20
+ "adam_beta2": 0.95,
21
+ "adam_epsilon": 1e-08,
22
+ "max_grad_norm": 1.0,
23
+ "num_train_epochs": 5.0,
24
+ "max_steps": -1,
25
+ "lr_scheduler_type": "cosine",
26
+ "lr_scheduler_kwargs": null,
27
+ "warmup_ratio": 0.1,
28
+ "warmup_steps": 0,
29
+ "log_level": "passive",
30
+ "log_level_replica": "warning",
31
+ "log_on_each_node": true,
32
+ "logging_dir": "/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/runs",
33
+ "logging_strategy": "steps",
34
+ "logging_first_step": true,
35
+ "logging_steps": 5,
36
+ "logging_nan_inf_filter": true,
37
+ "save_strategy": "epoch",
38
+ "save_steps": 500,
39
+ "save_total_limit": 10,
40
+ "save_safetensors": true,
41
+ "save_on_each_node": false,
42
+ "save_only_model": false,
43
+ "restore_callback_states_from_checkpoint": false,
44
+ "no_cuda": false,
45
+ "use_cpu": false,
46
+ "use_mps_device": false,
47
+ "seed": 42,
48
+ "data_seed": 42,
49
+ "jit_mode_eval": false,
50
+ "use_ipex": false,
51
+ "bf16": true,
52
+ "fp16": false,
53
+ "fp16_opt_level": "O1",
54
+ "half_precision_backend": "auto",
55
+ "bf16_full_eval": false,
56
+ "fp16_full_eval": false,
57
+ "tf32": null,
58
+ "local_rank": 0,
59
+ "ddp_backend": null,
60
+ "tpu_num_cores": null,
61
+ "tpu_metrics_debug": false,
62
+ "debug": null,
63
+ "dataloader_drop_last": false,
64
+ "eval_steps": null,
65
+ "dataloader_num_workers": 4,
66
+ "dataloader_prefetch_factor": null,
67
+ "past_index": -1,
68
+ "run_name": "/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240",
69
+ "disable_tqdm": null,
70
+ "remove_unused_columns": true,
71
+ "label_names": null,
72
+ "load_best_model_at_end": true,
73
+ "metric_for_best_model": "eval_loss",
74
+ "greater_is_better": false,
75
+ "ignore_data_skip": false,
76
+ "fsdp": "",
77
+ "fsdp_min_num_params": 0,
78
+ "fsdp_config": null,
79
+ "fsdp_transformer_layer_cls_to_wrap": null,
80
+ "accelerator_config": {
81
+ "dispatch_batches": false
82
+ },
83
+ "deepspeed": {
84
+ "fp16": {
85
+ "enabled": "auto",
86
+ "loss_scale": 0,
87
+ "loss_scale_window": 1000,
88
+ "initial_scale_power": 16,
89
+ "hysteresis": 2,
90
+ "min_loss_scale": 1
91
+ },
92
+ "bf16": {
93
+ "enabled": "auto"
94
+ },
95
+ "zero_optimization": {
96
+ "stage": 3,
97
+ "offload_optimizer": {
98
+ "device": "none",
99
+ "pin_memory": true
100
+ },
101
+ "offload_param": {
102
+ "device": "none",
103
+ "pin_memory": true
104
+ },
105
+ "overlap_comm": false,
106
+ "contiguous_gradients": true,
107
+ "sub_group_size": 1000000000.0,
108
+ "reduce_bucket_size": "auto",
109
+ "zero_quantized_weights": false,
110
+ "zero_quantized_gradients": false,
111
+ "stage3_prefetch_bucket_size": "auto",
112
+ "stage3_param_persistence_threshold": "auto",
113
+ "stage3_max_live_parameters": 1000000000.0,
114
+ "stage3_max_reuse_distance": 1000000000.0,
115
+ "stage3_gather_16bit_weights_on_model_save": true
116
+ },
117
+ "gradient_accumulation_steps": "auto",
118
+ "gradient_clipping": "auto",
119
+ "steps_per_print": 2000,
120
+ "train_batch_size": "auto",
121
+ "train_micro_batch_size_per_gpu": "auto",
122
+ "wall_clock_breakdown": false
123
+ },
124
+ "label_smoothing_factor": 0.0,
125
+ "optim": "adamw_torch",
126
+ "optim_args": null,
127
+ "adafactor": false,
128
+ "group_by_length": false,
129
+ "length_column_name": "length",
130
+ "report_to": [
131
+ "swanlab"
132
+ ],
133
+ "ddp_find_unused_parameters": null,
134
+ "ddp_bucket_cap_mb": null,
135
+ "ddp_broadcast_buffers": null,
136
+ "dataloader_pin_memory": true,
137
+ "dataloader_persistent_workers": false,
138
+ "skip_memory_metrics": true,
139
+ "use_legacy_prediction_loop": false,
140
+ "push_to_hub": false,
141
+ "resume_from_checkpoint": null,
142
+ "hub_model_id": null,
143
+ "hub_strategy": "every_save",
144
+ "hub_token": null,
145
+ "hub_private_repo": null,
146
+ "hub_always_push": false,
147
+ "hub_revision": null,
148
+ "gradient_checkpointing": true,
149
+ "gradient_checkpointing_kwargs": null,
150
+ "include_inputs_for_metrics": false,
151
+ "include_for_metrics": [],
152
+ "eval_do_concat_batches": true,
153
+ "fp16_backend": "auto",
154
+ "push_to_hub_model_id": null,
155
+ "push_to_hub_organization": null,
156
+ "push_to_hub_token": null,
157
+ "mp_parameters": "",
158
+ "auto_find_batch_size": false,
159
+ "full_determinism": false,
160
+ "torchdynamo": null,
161
+ "ray_scope": "last",
162
+ "ddp_timeout": 18000000,
163
+ "torch_compile": false,
164
+ "torch_compile_backend": null,
165
+ "torch_compile_mode": null,
166
+ "include_tokens_per_second": false,
167
+ "include_num_input_tokens_seen": false,
168
+ "neftune_noise_alpha": null,
169
+ "optim_target_modules": null,
170
+ "batch_eval_metrics": false,
171
+ "eval_on_start": false,
172
+ "use_liger_kernel": false,
173
+ "liger_kernel_config": null,
174
+ "eval_use_gather_object": false,
175
+ "average_tokens_across_devices": true,
176
+ "sortish_sampler": false,
177
+ "predict_with_generate": false,
178
+ "generation_max_length": null,
179
+ "generation_num_beams": null,
180
+ "generation_config": null,
181
+ "tuner_backend": "peft",
182
+ "vit_gradient_checkpointing": null,
183
+ "router_aux_loss_coef": 0.0,
184
+ "enable_dft_loss": false,
185
+ "enable_channel_loss": false,
186
+ "check_model": true,
187
+ "acc_strategy": "token",
188
+ "train_dataloader_shuffle": true,
189
+ "max_epochs": null,
190
+ "aligner_lr": null,
191
+ "vit_lr": null,
192
+ "use_logits_to_keep": null,
193
+ "ds3_gather_for_generation": true,
194
+ "resume_only_model": false,
195
+ "optimizer": null,
196
+ "loss_type": null,
197
+ "metric": null,
198
+ "eval_use_evalscope": false,
199
+ "eval_dataset": [],
200
+ "eval_dataset_args": null,
201
+ "eval_limit": null,
202
+ "eval_generation_config": null,
203
+ "extra_eval_args": null,
204
+ "use_flash_ckpt": false,
205
+ "model": "/mnt/data/users/liamding/data/models/InternVL3-8B-Instruct",
206
+ "model_type": "internvl3",
207
+ "model_revision": null,
208
+ "task_type": "causal_lm",
209
+ "torch_dtype": "bfloat16",
210
+ "attn_impl": null,
211
+ "new_special_tokens": [],
212
+ "num_labels": null,
213
+ "problem_type": null,
214
+ "rope_scaling": null,
215
+ "device_map": null,
216
+ "max_memory": {},
217
+ "max_model_len": null,
218
+ "local_repo_path": null,
219
+ "init_strategy": null,
220
+ "template": "internvl2_5",
221
+ "system": null,
222
+ "max_length": 32768,
223
+ "truncation_strategy": "delete",
224
+ "max_pixels": null,
225
+ "agent_template": null,
226
+ "norm_bbox": null,
227
+ "use_chat_template": true,
228
+ "padding_free": false,
229
+ "padding_side": "right",
230
+ "loss_scale": "default",
231
+ "sequence_parallel_size": 1,
232
+ "response_prefix": null,
233
+ "template_backend": "swift",
234
+ "dataset": [
235
+ "/mnt/data/users/liamding/data/3AM_Plus/final/ood_split/ambi_normal_train_aug_messages.json"
236
+ ],
237
+ "val_dataset": [],
238
+ "split_dataset_ratio": 0.1,
239
+ "dataset_num_proc": 1,
240
+ "load_from_cache_file": true,
241
+ "dataset_shuffle": true,
242
+ "val_dataset_shuffle": false,
243
+ "streaming": false,
244
+ "interleave_prob": null,
245
+ "stopping_strategy": "first_exhausted",
246
+ "shuffle_buffer_size": 1000,
247
+ "download_mode": "reuse_dataset_if_exists",
248
+ "columns": {},
249
+ "strict": false,
250
+ "model_name": null,
251
+ "model_author": null,
252
+ "custom_dataset_info": [],
253
+ "quant_method": null,
254
+ "quant_bits": null,
255
+ "hqq_axis": null,
256
+ "bnb_4bit_compute_dtype": "bfloat16",
257
+ "bnb_4bit_quant_type": "nf4",
258
+ "bnb_4bit_use_double_quant": true,
259
+ "bnb_4bit_quant_storage": null,
260
+ "max_new_tokens": 64,
261
+ "temperature": 0.0,
262
+ "top_k": null,
263
+ "top_p": null,
264
+ "repetition_penalty": null,
265
+ "num_beams": 1,
266
+ "stream": false,
267
+ "stop_words": [],
268
+ "logprobs": false,
269
+ "top_logprobs": null,
270
+ "ckpt_dir": null,
271
+ "lora_modules": [],
272
+ "train_type": "full",
273
+ "adapters": [],
274
+ "external_plugins": [],
275
+ "model_kwargs": {},
276
+ "load_args": false,
277
+ "load_data_args": false,
278
+ "packing": false,
279
+ "packing_length": null,
280
+ "lazy_tokenize": true,
281
+ "cached_dataset": [],
282
+ "custom_register_path": [],
283
+ "use_hf": false,
284
+ "ignore_args_error": false,
285
+ "use_swift_lora": false,
286
+ "freeze_parameters": [
287
+ "vision_model",
288
+ "mlp1"
289
+ ],
290
+ "freeze_parameters_regex": null,
291
+ "freeze_parameters_ratio": 0.0,
292
+ "trainable_parameters": [],
293
+ "trainable_parameters_regex": null,
294
+ "freeze_llm": false,
295
+ "freeze_vit": true,
296
+ "freeze_aligner": true,
297
+ "target_modules": [
298
+ "all-linear"
299
+ ],
300
+ "target_regex": null,
301
+ "target_parameters": null,
302
+ "modules_to_save": [],
303
+ "lora_rank": 8,
304
+ "lora_alpha": 32,
305
+ "lora_dropout": 0.05,
306
+ "lora_bias": "none",
307
+ "lora_dtype": null,
308
+ "lorap_lr_ratio": null,
309
+ "use_rslora": false,
310
+ "use_dora": false,
311
+ "lora_ga_batch_size": 2,
312
+ "lora_ga_iters": 2,
313
+ "lora_ga_max_length": 1024,
314
+ "lora_ga_direction": "ArB2r",
315
+ "lora_ga_scale": "stable",
316
+ "lora_ga_stable_gamma": 16,
317
+ "init_weights": true,
318
+ "fourier_n_frequency": 2000,
319
+ "fourier_scaling": 300.0,
320
+ "boft_block_size": 4,
321
+ "boft_block_num": 0,
322
+ "boft_n_butterfly_factor": 1,
323
+ "boft_dropout": 0.0,
324
+ "vera_rank": 256,
325
+ "vera_projection_prng_key": 0,
326
+ "vera_dropout": 0.0,
327
+ "vera_d_initial": 0.1,
328
+ "adapter_act": "gelu",
329
+ "adapter_length": 128,
330
+ "use_galore": false,
331
+ "galore_target_modules": null,
332
+ "galore_rank": 128,
333
+ "galore_update_proj_gap": 50,
334
+ "galore_scale": 1.0,
335
+ "galore_proj_type": "std",
336
+ "galore_optim_per_parameter": false,
337
+ "galore_with_embedding": false,
338
+ "galore_quantization": false,
339
+ "galore_proj_quant": false,
340
+ "galore_proj_bits": 4,
341
+ "galore_proj_group_size": 256,
342
+ "galore_cos_threshold": 0.4,
343
+ "galore_gamma_proj": 2,
344
+ "galore_queue_size": 5,
345
+ "adalora_target_r": 8,
346
+ "adalora_init_r": 12,
347
+ "adalora_tinit": 0,
348
+ "adalora_tfinal": 0,
349
+ "adalora_deltaT": 1,
350
+ "adalora_beta1": 0.85,
351
+ "adalora_beta2": 0.85,
352
+ "adalora_orth_reg_weight": 0.5,
353
+ "llamapro_num_new_blocks": 4,
354
+ "llamapro_num_groups": null,
355
+ "lisa_activated_layers": 0,
356
+ "lisa_step_interval": 20,
357
+ "reft_layer_key": null,
358
+ "reft_layers": null,
359
+ "reft_rank": 4,
360
+ "reft_intervention_type": "LoreftIntervention",
361
+ "reft_args": null,
362
+ "swanlab_token": null,
363
+ "swanlab_project": null,
364
+ "swanlab_workspace": null,
365
+ "swanlab_exp_name": "/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240",
366
+ "swanlab_lark_webhook_url": null,
367
+ "swanlab_lark_secret": null,
368
+ "swanlab_mode": "cloud",
369
+ "add_version": true,
370
+ "create_checkpoint_symlink": false,
371
+ "zero_hpz_partition_size": null,
372
+ "deepspeed_autotp_size": null,
373
+ "early_stop_interval": 200,
374
+ "rank": 0,
375
+ "global_world_size": 4,
376
+ "local_world_size": 4,
377
+ "model_suffix": "InternVL3-8B-Instruct",
378
+ "model_info": "ModelInfo(model_type='internvl3', model_dir='/mnt/data/users/liamding/data/models/InternVL3-8B-Instruct', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling={'factor': 2.0, 'rope_type': 'dynamic', 'type': 'dynamic'}, is_moe_model=False, config=None, task_type='causal_lm', num_labels=None)",
379
+ "model_meta": "ModelMeta(model_type='internvl3', model_groups=[ModelGroup(models=[Model(ms_model_id='OpenGVLab/InternVL3-1B-Pretrained', hf_model_id='OpenGVLab/InternVL3-1B-Pretrained', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-2B-Pretrained', hf_model_id='OpenGVLab/InternVL3-2B-Pretrained', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-8B-Pretrained', hf_model_id='OpenGVLab/InternVL3-8B-Pretrained', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-9B-Pretrained', hf_model_id='OpenGVLab/InternVL3-9B-Pretrained', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-14B-Pretrained', hf_model_id='OpenGVLab/InternVL3-14B-Pretrained', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-38B-Pretrained', hf_model_id='OpenGVLab/InternVL3-38B-Pretrained', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-78B-Pretrained', hf_model_id='OpenGVLab/InternVL3-78B-Pretrained', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='OpenGVLab/InternVL3-1B-Instruct', hf_model_id='OpenGVLab/InternVL3-1B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-2B-Instruct', hf_model_id='OpenGVLab/InternVL3-2B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-8B-Instruct', hf_model_id='OpenGVLab/InternVL3-8B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-9B-Instruct', hf_model_id='OpenGVLab/InternVL3-9B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-14B-Instruct', hf_model_id='OpenGVLab/InternVL3-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-38B-Instruct', hf_model_id='OpenGVLab/InternVL3-38B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-78B-Instruct', hf_model_id='OpenGVLab/InternVL3-78B-Instruct', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='OpenGVLab/InternVL3-1B', hf_model_id='OpenGVLab/InternVL3-1B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-2B', hf_model_id='OpenGVLab/InternVL3-2B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-8B', hf_model_id='OpenGVLab/InternVL3-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-9B', hf_model_id='OpenGVLab/InternVL3-9B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-14B', hf_model_id='OpenGVLab/InternVL3-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-38B', hf_model_id='OpenGVLab/InternVL3-38B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-78B', hf_model_id='OpenGVLab/InternVL3-78B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='OpenGVLab/InternVL3-1B-AWQ', hf_model_id='OpenGVLab/InternVL3-1B-AWQ', model_path=None, 
ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-2B-AWQ', hf_model_id='OpenGVLab/InternVL3-2B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-8B-AWQ', hf_model_id='OpenGVLab/InternVL3-8B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-9B-AWQ', hf_model_id='OpenGVLab/InternVL3-9B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-14B-AWQ', hf_model_id='OpenGVLab/InternVL3-14B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-38B-AWQ', hf_model_id='OpenGVLab/InternVL3-38B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-78B-AWQ', hf_model_id='OpenGVLab/InternVL3-78B-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='internvl2_5', get_function=<function get_model_tokenizer_internvl at 0x7f81e9d2ea70>, model_arch=MultiModelKeys(arch_name='internvl', embedding=None, module_list=None, lm_head=None, q_proj=None, k_proj=None, v_proj=None, o_proj=None, attention=None, mlp=None, down_proj=None, qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None, language_model=['language_model'], aligner=['mlp1'], vision_tower=['vision_model'], generator=[]), architectures=['InternVLChatModel'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.37.2', 'timm'], tags=['vision', 'video'])",
380
+ "model_dir": "/mnt/data/users/liamding/data/models/InternVL3-8B-Instruct",
381
+ "hub": "<class 'swift.hub.hub.MSHub'>",
382
+ "evaluation_strategy": "epoch",
383
+ "training_args": "Seq2SeqTrainingArguments(output_dir='/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=2, per_device_eval_batch_size=2, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=5e-07, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.1, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=10, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=None, dataloader_num_workers=4, dataloader_prefetch_factor=10, past_index=-1, run_name='/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=True, metric_for_best_model='eval_loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['swanlab'], ddp_find_unused_parameters=None, 
ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, enable_channel_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, train_type='full', local_repo_path=None, galore_config=None)"
384
+ }
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
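Taken together, the template above emits standard ChatML turns. A minimal sketch of exercising it through the tokenizer shipped alongside this checkpoint (the path is assumed to be a local copy of this directory):

# Minimal sketch (not part of the uploaded files): render the ChatML-style template above.
from transformers import AutoTokenizer

ckpt = "ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228"  # local path assumed
tokenizer = AutoTokenizer.from_pretrained(ckpt, trust_remote_code=True)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Describe the image briefly."},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)  # ends with '<|im_start|>assistant\n' because add_generation_prompt=True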
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/config.json ADDED
@@ -0,0 +1,144 @@
1
+ {
2
+ "architectures": [
3
+ "InternVLChatModel"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_internvl_chat.InternVLChatConfig",
7
+ "AutoModel": "modeling_internvl_chat.InternVLChatModel",
8
+ "AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel"
9
+ },
10
+ "downsample_ratio": 0.5,
11
+ "dynamic_image_size": true,
12
+ "force_image_size": 448,
13
+ "hidden_size": 3584,
14
+ "image_fold": null,
15
+ "keys_to_ignore_at_inference": [
16
+ "past_key_values"
17
+ ],
18
+ "llm_config": {
19
+ "_name_or_path": "./pretrained/Qwen2.5-32B-Instruct",
20
+ "architectures": [
21
+ "Qwen2ForCausalLM"
22
+ ],
23
+ "attention_dropout": 0.0,
24
+ "bos_token_id": 151643,
25
+ "eos_token_id": 151643,
26
+ "hidden_act": "silu",
27
+ "hidden_size": 3584,
28
+ "initializer_range": 0.02,
29
+ "intermediate_size": 18944,
30
+ "layer_types": [
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention",
51
+ "full_attention",
52
+ "full_attention",
53
+ "full_attention",
54
+ "full_attention",
55
+ "full_attention",
56
+ "full_attention",
57
+ "full_attention",
58
+ "full_attention"
59
+ ],
60
+ "max_position_embeddings": 32768,
61
+ "max_window_layers": 70,
62
+ "model_type": "qwen2",
63
+ "moe_config": null,
64
+ "num_attention_heads": 28,
65
+ "num_hidden_layers": 28,
66
+ "num_key_value_heads": 4,
67
+ "pad_token_id": 151643,
68
+ "rms_norm_eps": 1e-06,
69
+ "rope_scaling": {
70
+ "factor": 2.0,
71
+ "rope_type": "dynamic",
72
+ "type": "dynamic"
73
+ },
74
+ "rope_theta": 1000000.0,
75
+ "sliding_window": null,
76
+ "torch_dtype": "bfloat16",
77
+ "use_bfloat16": true,
78
+ "use_cache": false,
79
+ "use_sliding_window": false,
80
+ "vocab_size": 151674
81
+ },
82
+ "max_dynamic_patch": 12,
83
+ "min_dynamic_patch": 1,
84
+ "model_type": "internvl_chat",
85
+ "output_attentions": false,
86
+ "pad2square": false,
87
+ "pad_token_id": 151643,
88
+ "ps_version": "v2",
89
+ "select_layer": -1,
90
+ "template": "internvl2_5",
91
+ "tie_word_embeddings": false,
92
+ "torch_dtype": "bfloat16",
93
+ "transformers_version": null,
94
+ "use_backbone_lora": 0,
95
+ "use_llm_lora": 0,
96
+ "use_thumbnail": true,
97
+ "vision_config": {
98
+ "_name_or_path": "OpenGVLab/InternViT-6B-448px-V1-5",
99
+ "architectures": [
100
+ "InternVisionModel"
101
+ ],
102
+ "attention_dropout": 0.0,
103
+ "auto_map": {
104
+ "AutoConfig": "configuration_intern_vit.InternVisionConfig",
105
+ "AutoModel": "modeling_intern_vit.InternVisionModel"
106
+ },
107
+ "capacity_factor": 1.2,
108
+ "drop_path_rate": 0.0,
109
+ "dropout": 0.0,
110
+ "eval_capacity_factor": 1.4,
111
+ "hidden_act": "gelu",
112
+ "hidden_size": 1024,
113
+ "image_size": 448,
114
+ "initializer_factor": 0.1,
115
+ "initializer_range": 1e-10,
116
+ "intermediate_size": 4096,
117
+ "laux_allreduce": "all_nodes",
118
+ "layer_norm_eps": 1e-06,
119
+ "model_type": "intern_vit_6b",
120
+ "moe_coeff_ratio": 0.5,
121
+ "moe_intermediate_size": 768,
122
+ "moe_output_scale": 4.0,
123
+ "noisy_gate_policy": "RSample_before",
124
+ "norm_type": "layer_norm",
125
+ "num_attention_heads": 16,
126
+ "num_channels": 3,
127
+ "num_experts": 8,
128
+ "num_hidden_layers": 24,
129
+ "num_routed_experts": 4,
130
+ "num_shared_experts": 4,
131
+ "pad_token_id": 151643,
132
+ "patch_size": 14,
133
+ "qk_normalization": false,
134
+ "qkv_bias": true,
135
+ "shared_expert_intermediate_size": 3072,
136
+ "torch_dtype": "bfloat16",
137
+ "use_bfloat16": true,
138
+ "use_flash_attn": true,
139
+ "use_moe": false,
140
+ "use_residual": true,
141
+ "use_rts": false,
142
+ "use_weighted_residual": false
143
+ }
144
+ }
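The auto_map block above routes the transformers Auto classes to the custom configuration and modeling files bundled with this checkpoint. A minimal sketch of loading it, assuming a local copy of this directory and that trust_remote_code is acceptable:

# Sketch only: load this checkpoint through the auto_map classes declared above.
import torch
from transformers import AutoConfig, AutoModel

ckpt = "ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228"  # local path assumed
config = AutoConfig.from_pretrained(ckpt, trust_remote_code=True)
print(config.model_type, config.llm_config.vocab_size)  # internvl_chat 151674

model = AutoModel.from_pretrained(
    ckpt,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" in this config
    trust_remote_code=True,
).eval()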
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/configuration_intern_vit.py ADDED
@@ -0,0 +1,120 @@
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2024 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ import os
8
+ from typing import Union
9
+
10
+ from transformers.configuration_utils import PretrainedConfig
11
+ from transformers.utils import logging
12
+
13
+ logger = logging.get_logger(__name__)
14
+
15
+
16
+ class InternVisionConfig(PretrainedConfig):
17
+ r"""
18
+ This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
19
+ instantiate a vision encoder according to the specified arguments, defining the model architecture.
20
+
21
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
22
+ documentation from [`PretrainedConfig`] for more information.
23
+
24
+ Args:
25
+ num_channels (`int`, *optional*, defaults to 3):
26
+ Number of color channels in the input images (e.g., 3 for RGB).
27
+ patch_size (`int`, *optional*, defaults to 14):
28
+ The size (resolution) of each patch.
29
+ image_size (`int`, *optional*, defaults to 224):
30
+ The size (resolution) of each image.
31
+ qkv_bias (`bool`, *optional*, defaults to `False`):
32
+ Whether to add a bias to the queries and values in the self-attention layers.
33
+ hidden_size (`int`, *optional*, defaults to 3200):
34
+ Dimensionality of the encoder layers and the pooler layer.
35
+ num_attention_heads (`int`, *optional*, defaults to 25):
36
+ Number of attention heads for each attention layer in the Transformer encoder.
37
+ intermediate_size (`int`, *optional*, defaults to 12800):
38
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
39
+ qk_normalization (`bool`, *optional*, defaults to `True`):
40
+ Whether to normalize the queries and keys in the self-attention layers.
41
+ num_hidden_layers (`int`, *optional*, defaults to 48):
42
+ Number of hidden layers in the Transformer encoder.
43
+ use_flash_attn (`bool`, *optional*, defaults to `True`):
44
+ Whether to use flash attention mechanism.
45
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
46
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
47
+ `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
48
+ layer_norm_eps (`float`, *optional*, defaults to 1e-6):
49
+ The epsilon used by the layer normalization layers.
50
+ dropout (`float`, *optional*, defaults to 0.0):
51
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
52
+ drop_path_rate (`float`, *optional*, defaults to 0.0):
53
+ Dropout rate for stochastic depth.
54
+ attention_dropout (`float`, *optional*, defaults to 0.0):
55
+ The dropout ratio for the attention probabilities.
56
+ initializer_range (`float`, *optional*, defaults to 0.02):
57
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
58
+ initializer_factor (`float`, *optional*, defaults to 0.1):
59
+ A factor for layer scale.
60
+ """
61
+
62
+ model_type = 'intern_vit_6b'
63
+
64
+ def __init__(
65
+ self,
66
+ num_channels=3,
67
+ patch_size=14,
68
+ image_size=224,
69
+ qkv_bias=False,
70
+ hidden_size=3200,
71
+ num_attention_heads=25,
72
+ intermediate_size=12800,
73
+ qk_normalization=True,
74
+ num_hidden_layers=48,
75
+ use_flash_attn=True,
76
+ hidden_act='gelu',
77
+ norm_type='rms_norm',
78
+ layer_norm_eps=1e-6,
79
+ dropout=0.0,
80
+ drop_path_rate=0.0,
81
+ attention_dropout=0.0,
82
+ initializer_range=0.02,
83
+ initializer_factor=0.1,
84
+ **kwargs,
85
+ ):
86
+ super().__init__(**kwargs)
87
+
88
+ self.hidden_size = hidden_size
89
+ self.intermediate_size = intermediate_size
90
+ self.dropout = dropout
91
+ self.drop_path_rate = drop_path_rate
92
+ self.num_hidden_layers = num_hidden_layers
93
+ self.num_attention_heads = num_attention_heads
94
+ self.num_channels = num_channels
95
+ self.patch_size = patch_size
96
+ self.image_size = image_size
97
+ self.initializer_range = initializer_range
98
+ self.initializer_factor = initializer_factor
99
+ self.attention_dropout = attention_dropout
100
+ self.layer_norm_eps = layer_norm_eps
101
+ self.hidden_act = hidden_act
102
+ self.norm_type = norm_type
103
+ self.qkv_bias = qkv_bias
104
+ self.qk_normalization = qk_normalization
105
+ self.use_flash_attn = use_flash_attn
106
+
107
+ @classmethod
108
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
109
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
110
+
111
+ if 'vision_config' in config_dict:
112
+ config_dict = config_dict['vision_config']
113
+
114
+ if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
115
+ logger.warning(
116
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
117
+ f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
118
+ )
119
+
120
+ return cls.from_dict(config_dict, **kwargs)
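For orientation, a small sketch of how the vision_config block in config.json maps onto this class; the listed values come from that block and everything else falls back to the defaults in __init__:

# Sketch: instantiate the vision config with the values used by this checkpoint.
from configuration_intern_vit import InternVisionConfig  # assumes this file is on the import path

vision_cfg = InternVisionConfig(
    hidden_size=1024,
    intermediate_size=4096,
    num_attention_heads=16,
    num_hidden_layers=24,
    image_size=448,
    patch_size=14,
    qkv_bias=True,
    qk_normalization=False,
    norm_type="layer_norm",
)
print(vision_cfg.image_size // vision_cfg.patch_size)  # 32 patches per side at 448px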
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/configuration_internvl_chat.py ADDED
@@ -0,0 +1,97 @@
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2024 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ import copy
8
+
9
+ from transformers import AutoConfig, LlamaConfig, Qwen2Config
10
+ from transformers.configuration_utils import PretrainedConfig
11
+ from transformers.utils import logging
12
+
13
+ from .configuration_intern_vit import InternVisionConfig
14
+
15
+ logger = logging.get_logger(__name__)
16
+
17
+
18
+ class InternVLChatConfig(PretrainedConfig):
19
+ model_type = 'internvl_chat'
20
+ is_composition = True
21
+
22
+ def __init__(
23
+ self,
24
+ vision_config=None,
25
+ llm_config=None,
26
+ use_backbone_lora=0,
27
+ use_llm_lora=0,
28
+ select_layer=-1,
29
+ force_image_size=None,
30
+ downsample_ratio=0.5,
31
+ template=None,
32
+ dynamic_image_size=False,
33
+ use_thumbnail=False,
34
+ ps_version='v1',
35
+ min_dynamic_patch=1,
36
+ max_dynamic_patch=6,
37
+ **kwargs):
38
+ super().__init__(**kwargs)
39
+
40
+ if vision_config is None:
41
+ vision_config = {'architectures': ['InternVisionModel']}
42
+ logger.info('vision_config is None. Initializing the InternVisionConfig with default values.')
43
+
44
+ if llm_config is None:
45
+ llm_config = {'architectures': ['Qwen2ForCausalLM']}
46
+ logger.info('llm_config is None. Initializing the llm config with default values (`Qwen2Config`).')
47
+
48
+ self.vision_config = InternVisionConfig(**vision_config)
49
+ if llm_config.get('architectures')[0] == 'LlamaForCausalLM':
50
+ self.llm_config = LlamaConfig(**llm_config)
51
+ elif llm_config.get('architectures')[0] == 'Qwen2ForCausalLM':
52
+ self.llm_config = Qwen2Config(**llm_config)
53
+ else:
54
+ raise ValueError('Unsupported architecture: {}'.format(llm_config.get('architectures')[0]))
55
+ self.use_backbone_lora = use_backbone_lora
56
+ self.use_llm_lora = use_llm_lora
57
+ self.select_layer = select_layer
58
+ self.force_image_size = force_image_size
59
+ self.downsample_ratio = downsample_ratio
60
+ self.template = template
61
+ self.dynamic_image_size = dynamic_image_size
62
+ self.use_thumbnail = use_thumbnail
63
+ self.ps_version = ps_version # pixel shuffle version
64
+ self.min_dynamic_patch = min_dynamic_patch
65
+ self.max_dynamic_patch = max_dynamic_patch
66
+ # By default, we use tie_word_embeddings=False for models of all sizes.
67
+ self.tie_word_embeddings = self.llm_config.tie_word_embeddings
68
+
69
+ logger.info(f'vision_select_layer: {self.select_layer}')
70
+ logger.info(f'ps_version: {self.ps_version}')
71
+ logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}')
72
+ logger.info(f'max_dynamic_patch: {self.max_dynamic_patch}')
73
+
74
+ def to_dict(self):
75
+ """
76
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
77
+
78
+ Returns:
79
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
80
+ """
81
+ output = copy.deepcopy(self.__dict__)
82
+ output['vision_config'] = self.vision_config.to_dict()
83
+ output['llm_config'] = self.llm_config.to_dict()
84
+ output['model_type'] = self.__class__.model_type
85
+ output['use_backbone_lora'] = self.use_backbone_lora
86
+ output['use_llm_lora'] = self.use_llm_lora
87
+ output['select_layer'] = self.select_layer
88
+ output['force_image_size'] = self.force_image_size
89
+ output['downsample_ratio'] = self.downsample_ratio
90
+ output['template'] = self.template
91
+ output['dynamic_image_size'] = self.dynamic_image_size
92
+ output['use_thumbnail'] = self.use_thumbnail
93
+ output['ps_version'] = self.ps_version
94
+ output['min_dynamic_patch'] = self.min_dynamic_patch
95
+ output['max_dynamic_patch'] = self.max_dynamic_patch
96
+
97
+ return output
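A short sketch of composing this config from plain dicts, mirroring the nested vision_config/llm_config blocks in config.json; only a few representative fields are shown and the rest keep their defaults:

# Sketch: compose the chat config from nested dicts, as from_pretrained would.
from configuration_internvl_chat import InternVLChatConfig  # assumes this file is on the import path

cfg = InternVLChatConfig(
    vision_config={"architectures": ["InternVisionModel"], "hidden_size": 1024, "image_size": 448},
    llm_config={"architectures": ["Qwen2ForCausalLM"], "hidden_size": 3584, "vocab_size": 151674},
    template="internvl2_5",
    dynamic_image_size=True,
    use_thumbnail=True,
    max_dynamic_patch=12,
)
as_dict = cfg.to_dict()
print(as_dict["template"], as_dict["llm_config"]["vocab_size"])  # internvl2_5 151674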
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/conversation.py ADDED
@@ -0,0 +1,391 @@
1
+ """
2
+ Conversation prompt templates.
3
+
4
+ We kindly request that you import fastchat instead of copying this file if you wish to use it.
5
+ If you have changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates.
6
+
7
+ Modified from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
8
+ """
9
+
10
+ import dataclasses
11
+ from enum import IntEnum, auto
12
+ from typing import Dict, List, Tuple, Union
13
+
14
+
15
+ class SeparatorStyle(IntEnum):
16
+ """Separator styles."""
17
+
18
+ ADD_COLON_SINGLE = auto()
19
+ ADD_COLON_TWO = auto()
20
+ ADD_COLON_SPACE_SINGLE = auto()
21
+ NO_COLON_SINGLE = auto()
22
+ NO_COLON_TWO = auto()
23
+ ADD_NEW_LINE_SINGLE = auto()
24
+ LLAMA2 = auto()
25
+ CHATGLM = auto()
26
+ CHATML = auto()
27
+ CHATINTERN = auto()
28
+ DOLLY = auto()
29
+ RWKV = auto()
30
+ PHOENIX = auto()
31
+ ROBIN = auto()
32
+ FALCON_CHAT = auto()
33
+ CHATGLM3 = auto()
34
+ INTERNVL_ZH = auto()
35
+ MPT = auto()
36
+
37
+
38
+ @dataclasses.dataclass
39
+ class Conversation:
40
+ """A class that manages prompt templates and keeps all conversation history."""
41
+
42
+ # The name of this template
43
+ name: str
44
+ # The template of the system prompt
45
+ system_template: str = '{system_message}'
46
+ # The system message
47
+ system_message: str = ''
48
+ # The names of two roles
49
+ roles: Tuple[str] = ('USER', 'ASSISTANT')
50
+ # All messages. Each item is (role, message).
51
+ messages: List[List[str]] = ()
52
+ # The number of few shot examples
53
+ offset: int = 0
54
+ # The separator style and configurations
55
+ sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE
56
+ sep: str = '\n'
57
+ sep2: str = None
58
+ # Stop criteria (the default one is EOS token)
59
+ stop_str: Union[str, List[str]] = None
60
+ # Stops generation if meeting any token in this list
61
+ stop_token_ids: List[int] = None
62
+
63
+ def get_prompt(self) -> str:
64
+ """Get the prompt for generation."""
65
+ system_prompt = self.system_template.format(system_message=self.system_message)
66
+ if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
67
+ ret = system_prompt + self.sep
68
+ for role, message in self.messages:
69
+ if message:
70
+ ret += role + ': ' + message + self.sep
71
+ else:
72
+ ret += role + ':'
73
+ return ret
74
+ elif self.sep_style == SeparatorStyle.ADD_COLON_TWO:
75
+ seps = [self.sep, self.sep2]
76
+ ret = system_prompt + seps[0]
77
+ for i, (role, message) in enumerate(self.messages):
78
+ if message:
79
+ ret += role + ': ' + message + seps[i % 2]
80
+ else:
81
+ ret += role + ':'
82
+ return ret
83
+ elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
84
+ ret = system_prompt + self.sep
85
+ for role, message in self.messages:
86
+ if message:
87
+ ret += role + ': ' + message + self.sep
88
+ else:
89
+ ret += role + ': ' # must end with a space
90
+ return ret
91
+ elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
92
+ ret = '' if system_prompt == '' else system_prompt + self.sep
93
+ for role, message in self.messages:
94
+ if message:
95
+ ret += role + '\n' + message + self.sep
96
+ else:
97
+ ret += role + '\n'
98
+ return ret
99
+ elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
100
+ ret = system_prompt
101
+ for role, message in self.messages:
102
+ if message:
103
+ ret += role + message + self.sep
104
+ else:
105
+ ret += role
106
+ return ret
107
+ elif self.sep_style == SeparatorStyle.NO_COLON_TWO:
108
+ seps = [self.sep, self.sep2]
109
+ ret = system_prompt
110
+ for i, (role, message) in enumerate(self.messages):
111
+ if message:
112
+ ret += role + message + seps[i % 2]
113
+ else:
114
+ ret += role
115
+ return ret
116
+ elif self.sep_style == SeparatorStyle.RWKV:
117
+ ret = system_prompt
118
+ for i, (role, message) in enumerate(self.messages):
119
+ if message:
120
+ ret += (
121
+ role
122
+ + ': '
123
+ + message.replace('\r\n', '\n').replace('\n\n', '\n')
124
+ )
125
+ ret += '\n\n'
126
+ else:
127
+ ret += role + ':'
128
+ return ret
129
+ elif self.sep_style == SeparatorStyle.LLAMA2:
130
+ seps = [self.sep, self.sep2]
131
+ if self.system_message:
132
+ ret = system_prompt
133
+ else:
134
+ ret = '[INST] '
135
+ for i, (role, message) in enumerate(self.messages):
136
+ tag = self.roles[i % 2]
137
+ if message:
138
+ if i == 0:
139
+ ret += message + ' '
140
+ else:
141
+ ret += tag + ' ' + message + seps[i % 2]
142
+ else:
143
+ ret += tag
144
+ return ret
145
+ elif self.sep_style == SeparatorStyle.CHATGLM:
146
+ # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308
147
+ # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926
148
+ round_add_n = 1 if self.name == 'chatglm2' else 0
149
+ if system_prompt:
150
+ ret = system_prompt + self.sep
151
+ else:
152
+ ret = ''
153
+
154
+ for i, (role, message) in enumerate(self.messages):
155
+ if i % 2 == 0:
156
+ ret += f'[Round {i//2 + round_add_n}]{self.sep}'
157
+
158
+ if message:
159
+ ret += f'{role}:{message}{self.sep}'
160
+ else:
161
+ ret += f'{role}:'
162
+ return ret
163
+ elif self.sep_style == SeparatorStyle.CHATML:
164
+ ret = '' if system_prompt == '' else system_prompt + self.sep + '\n'
165
+ for role, message in self.messages:
166
+ if message:
167
+ ret += role + '\n' + message + self.sep + '\n'
168
+ else:
169
+ ret += role + '\n'
170
+ return ret
171
+ elif self.sep_style == SeparatorStyle.CHATGLM3:
172
+ ret = ''
173
+ if self.system_message:
174
+ ret += system_prompt
175
+ for role, message in self.messages:
176
+ if message:
177
+ ret += role + '\n' + ' ' + message
178
+ else:
179
+ ret += role
180
+ return ret
181
+ elif self.sep_style == SeparatorStyle.CHATINTERN:
182
+ # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771
183
+ seps = [self.sep, self.sep2]
184
+ ret = system_prompt
185
+ for i, (role, message) in enumerate(self.messages):
186
+ # if i % 2 == 0:
187
+ # ret += "<s>"
188
+ if message:
189
+ ret += role + ':' + message + seps[i % 2] + '\n'
190
+ else:
191
+ ret += role + ':'
192
+ return ret
193
+ elif self.sep_style == SeparatorStyle.DOLLY:
194
+ seps = [self.sep, self.sep2]
195
+ ret = system_prompt
196
+ for i, (role, message) in enumerate(self.messages):
197
+ if message:
198
+ ret += role + ':\n' + message + seps[i % 2]
199
+ if i % 2 == 1:
200
+ ret += '\n\n'
201
+ else:
202
+ ret += role + ':\n'
203
+ return ret
204
+ elif self.sep_style == SeparatorStyle.PHOENIX:
205
+ ret = system_prompt
206
+ for role, message in self.messages:
207
+ if message:
208
+ ret += role + ': ' + '<s>' + message + '</s>'
209
+ else:
210
+ ret += role + ': ' + '<s>'
211
+ return ret
212
+ elif self.sep_style == SeparatorStyle.ROBIN:
213
+ ret = system_prompt + self.sep
214
+ for role, message in self.messages:
215
+ if message:
216
+ ret += role + ':\n' + message + self.sep
217
+ else:
218
+ ret += role + ':\n'
219
+ return ret
220
+ elif self.sep_style == SeparatorStyle.FALCON_CHAT:
221
+ ret = ''
222
+ if self.system_message:
223
+ ret += system_prompt + self.sep
224
+ for role, message in self.messages:
225
+ if message:
226
+ ret += role + ': ' + message + self.sep
227
+ else:
228
+ ret += role + ':'
229
+
230
+ return ret
231
+ elif self.sep_style == SeparatorStyle.INTERNVL_ZH:
232
+ seps = [self.sep, self.sep2]
233
+ ret = self.system_message + seps[0]
234
+ for i, (role, message) in enumerate(self.messages):
235
+ if message:
236
+ ret += role + ': ' + message + seps[i % 2]
237
+ else:
238
+ ret += role + ':'
239
+ return ret
240
+ elif self.sep_style == SeparatorStyle.MPT:
241
+ ret = system_prompt + self.sep
242
+ for role, message in self.messages:
243
+ if message:
244
+ if type(message) is tuple:
245
+ message, _, _ = message
246
+ ret += role + message + self.sep
247
+ else:
248
+ ret += role
249
+ return ret
250
+ else:
251
+ raise ValueError(f'Invalid style: {self.sep_style}')
252
+
253
+ def set_system_message(self, system_message: str):
254
+ """Set the system message."""
255
+ self.system_message = system_message
256
+
257
+ def append_message(self, role: str, message: str):
258
+ """Append a new message."""
259
+ self.messages.append([role, message])
260
+
261
+ def update_last_message(self, message: str):
262
+ """Update the last output.
263
+
264
+ The last message is typically set to be None when constructing the prompt,
265
+ so we need to update it in-place after getting the response from a model.
266
+ """
267
+ self.messages[-1][1] = message
268
+
269
+ def to_gradio_chatbot(self):
270
+ """Convert the conversation to gradio chatbot format."""
271
+ ret = []
272
+ for i, (role, msg) in enumerate(self.messages[self.offset :]):
273
+ if i % 2 == 0:
274
+ ret.append([msg, None])
275
+ else:
276
+ ret[-1][-1] = msg
277
+ return ret
278
+
279
+ def to_openai_api_messages(self):
280
+ """Convert the conversation to OpenAI chat completion format."""
281
+ ret = [{'role': 'system', 'content': self.system_message}]
282
+
283
+ for i, (_, msg) in enumerate(self.messages[self.offset :]):
284
+ if i % 2 == 0:
285
+ ret.append({'role': 'user', 'content': msg})
286
+ else:
287
+ if msg is not None:
288
+ ret.append({'role': 'assistant', 'content': msg})
289
+ return ret
290
+
291
+ def copy(self):
292
+ return Conversation(
293
+ name=self.name,
294
+ system_template=self.system_template,
295
+ system_message=self.system_message,
296
+ roles=self.roles,
297
+ messages=[[x, y] for x, y in self.messages],
298
+ offset=self.offset,
299
+ sep_style=self.sep_style,
300
+ sep=self.sep,
301
+ sep2=self.sep2,
302
+ stop_str=self.stop_str,
303
+ stop_token_ids=self.stop_token_ids,
304
+ )
305
+
306
+ def dict(self):
307
+ return {
308
+ 'template_name': self.name,
309
+ 'system_message': self.system_message,
310
+ 'roles': self.roles,
311
+ 'messages': self.messages,
312
+ 'offset': self.offset,
313
+ }
314
+
315
+
316
+ # A global registry for all conversation templates
317
+ conv_templates: Dict[str, Conversation] = {}
318
+
319
+
320
+ def register_conv_template(template: Conversation, override: bool = False):
321
+ """Register a new conversation template."""
322
+ if not override:
323
+ assert (
324
+ template.name not in conv_templates
325
+ ), f'{template.name} has been registered.'
326
+
327
+ conv_templates[template.name] = template
328
+
329
+
330
+ def get_conv_template(name: str) -> Conversation:
331
+ """Get a conversation template."""
332
+ return conv_templates[name].copy()
333
+
334
+
335
+ # Both Hermes-2 and internlm2-chat are chatml-format conversation templates. The difference
336
+ # is that during training, the preprocessing function for the Hermes-2 template doesn't add
337
+ # <s> at the beginning of the tokenized sequence, while the internlm2-chat template does.
338
+ # Therefore, they are completely equivalent during inference.
339
+ register_conv_template(
340
+ Conversation(
341
+ name='Hermes-2',
342
+ system_template='<|im_start|>system\n{system_message}',
343
+ # note: The new system prompt was not used here to avoid changes in benchmark performance.
344
+ # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
345
+ system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
346
+ roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
347
+ sep_style=SeparatorStyle.MPT,
348
+ sep='<|im_end|>',
349
+ stop_str='<|endoftext|>',
350
+ )
351
+ )
352
+
353
+
354
+ register_conv_template(
355
+ Conversation(
356
+ name='internlm2-chat',
357
+ system_template='<|im_start|>system\n{system_message}',
358
+ # note: The new system prompt was not used here to avoid changes in benchmark performance.
359
+ # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
360
+ system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
361
+ roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
362
+ sep_style=SeparatorStyle.MPT,
363
+ sep='<|im_end|>',
364
+ )
365
+ )
366
+
367
+
368
+ register_conv_template(
369
+ Conversation(
370
+ name='phi3-chat',
371
+ system_template='<|system|>\n{system_message}',
372
+ # note: The new system prompt was not used here to avoid changes in benchmark performance.
373
+ # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
374
+ system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
375
+ roles=('<|user|>\n', '<|assistant|>\n'),
376
+ sep_style=SeparatorStyle.MPT,
377
+ sep='<|end|>',
378
+ )
379
+ )
380
+
381
+
382
+ register_conv_template(
383
+ Conversation(
384
+ name='internvl2_5',
385
+ system_template='<|im_start|>system\n{system_message}',
386
+ system_message='你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
387
+ roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
388
+ sep_style=SeparatorStyle.MPT,
389
+ sep='<|im_end|>\n',
390
+ )
391
+ )
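A minimal sketch of the intended usage of these templates: copy one, append a user turn plus an empty assistant slot, then render the prompt string:

# Sketch: build a single-turn prompt with the 'internvl2_5' template registered above.
from conversation import get_conv_template  # assumes this file is on the import path

conv = get_conv_template("internvl2_5")
conv.append_message(conv.roles[0], "<image>\nWhat is shown in this picture?")
conv.append_message(conv.roles[1], None)  # leave the assistant slot open for generation
print(conv.get_prompt())
# -> system block, then '<|im_start|>user\n...<|im_end|>\n<|im_start|>assistant\n'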
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/generation_config.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": 151645,
4
+ "transformers_version": "4.55.4"
5
+ }
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/latest ADDED
@@ -0,0 +1 @@
1
+ global_step228
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/model.safetensors.index.json ADDED
@@ -0,0 +1,693 @@
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 719360,
4
+ "total_size": 15888747520
5
+ },
6
+ "weight_map": {
7
+ "language_model.lm_head.weight": "model-00004-of-00004.safetensors",
8
+ "language_model.model.embed_tokens.weight": "model-00001-of-00004.safetensors",
9
+ "language_model.model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
10
+ "language_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
11
+ "language_model.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
12
+ "language_model.model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
13
+ "language_model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
14
+ "language_model.model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
15
+ "language_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
16
+ "language_model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
17
+ "language_model.model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
18
+ "language_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
19
+ "language_model.model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
20
+ "language_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
21
+ "language_model.model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
22
+ "language_model.model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
23
+ "language_model.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
24
+ "language_model.model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
25
+ "language_model.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
26
+ "language_model.model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
27
+ "language_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
28
+ "language_model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
29
+ "language_model.model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
30
+ "language_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
31
+ "language_model.model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
32
+ "language_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
33
+ "language_model.model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
34
+ "language_model.model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
35
+ "language_model.model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
36
+ "language_model.model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
37
+ "language_model.model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
38
+ "language_model.model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
39
+ "language_model.model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
40
+ "language_model.model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
41
+ "language_model.model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
42
+ "language_model.model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
43
+ "language_model.model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
44
+ "language_model.model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
45
+ "language_model.model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
46
+ "language_model.model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
47
+ "language_model.model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
48
+ "language_model.model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
49
+ "language_model.model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
50
+ "language_model.model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
51
+ "language_model.model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
52
+ "language_model.model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
53
+ "language_model.model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
54
+ "language_model.model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
55
+ "language_model.model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
56
+ "language_model.model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
57
+ "language_model.model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
58
+ "language_model.model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
59
+ "language_model.model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
60
+ "language_model.model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
61
+ "language_model.model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
62
+ "language_model.model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
63
+ "language_model.model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
64
+ "language_model.model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
65
+ "language_model.model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
66
+ "language_model.model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
67
+ "language_model.model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
68
+ "language_model.model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
69
+ "language_model.model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
70
+ "language_model.model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
71
+ "language_model.model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
72
+ "language_model.model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
73
+ "language_model.model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
74
+ "language_model.model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
75
+ "language_model.model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
76
+ "language_model.model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
77
+ "language_model.model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
78
+ "language_model.model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
79
+ "language_model.model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
80
+ "language_model.model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
81
+ "language_model.model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
82
+ "language_model.model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
83
+ "language_model.model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
84
+ "language_model.model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
85
+ "language_model.model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
86
+ "language_model.model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
87
+ "language_model.model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
88
+ "language_model.model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
89
+ "language_model.model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
90
+ "language_model.model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
91
+ "language_model.model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
92
+ "language_model.model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
93
+ "language_model.model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
94
+ "language_model.model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
95
+ "language_model.model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
96
+ "language_model.model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
97
+ "language_model.model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
98
+ "language_model.model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
99
+ "language_model.model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
100
+ "language_model.model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
101
+ "language_model.model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
102
+ "language_model.model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
103
+ "language_model.model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
104
+ "language_model.model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
105
+ "language_model.model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
106
+ "language_model.model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
107
+ "language_model.model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
108
+ "language_model.model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
109
+ "language_model.model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
110
+ "language_model.model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
111
+ "language_model.model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
112
+ "language_model.model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
113
+ "language_model.model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
114
+ "language_model.model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
115
+ "language_model.model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
116
+ "language_model.model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
117
+ "language_model.model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors",
118
+ "language_model.model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
119
+ "language_model.model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
120
+ "language_model.model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
121
+ "language_model.model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
122
+ "language_model.model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
123
+ "language_model.model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
124
+ "language_model.model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
125
+ "language_model.model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
126
+ "language_model.model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
127
+ "language_model.model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
128
+ "language_model.model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
129
+ "language_model.model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
130
+ "language_model.model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
131
+ "language_model.model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
132
+ "language_model.model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
133
+ "language_model.model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
134
+ "language_model.model.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
135
+ "language_model.model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
136
+ "language_model.model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
137
+ "language_model.model.layers.18.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
138
+ "language_model.model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
139
+ "language_model.model.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
140
+ "language_model.model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
141
+ "language_model.model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
142
+ "language_model.model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
143
+ "language_model.model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
144
+ "language_model.model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
145
+ "language_model.model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
146
+ "language_model.model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
147
+ "language_model.model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
148
+ "language_model.model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
149
+ "language_model.model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
150
+ "language_model.model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
151
+ "language_model.model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
152
+ "language_model.model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
153
+ "language_model.model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
154
+ "language_model.model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
155
+ "language_model.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
156
+ "language_model.model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
157
+ "language_model.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
158
+ "language_model.model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
159
+ "language_model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
160
+ "language_model.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
161
+ "language_model.model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
162
+ "language_model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
163
+ "language_model.model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
164
+ "language_model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
165
+ "language_model.model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
166
+ "language_model.model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
167
+ "language_model.model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
168
+ "language_model.model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
169
+ "language_model.model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
170
+ "language_model.model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
171
+ "language_model.model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
172
+ "language_model.model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
173
+ "language_model.model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
174
+ "language_model.model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
175
+ "language_model.model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
176
+ "language_model.model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
177
+ "language_model.model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
178
+ "language_model.model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
179
+ "language_model.model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
180
+ "language_model.model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
181
+ "language_model.model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
182
+ "language_model.model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
183
+ "language_model.model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
184
+ "language_model.model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
185
+ "language_model.model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
186
+ "language_model.model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
187
+ "language_model.model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
188
+ "language_model.model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
189
+ "language_model.model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
190
+ "language_model.model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
191
+ "language_model.model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
192
+ "language_model.model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
193
+ "language_model.model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
194
+ "language_model.model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
195
+ "language_model.model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
196
+ "language_model.model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
197
+ "language_model.model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
198
+ "language_model.model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
199
+ "language_model.model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
200
+ "language_model.model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
201
+ "language_model.model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
202
+ "language_model.model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
203
+ "language_model.model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
204
+ "language_model.model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
205
+ "language_model.model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
206
+ "language_model.model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
207
+ "language_model.model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
208
+ "language_model.model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
209
+ "language_model.model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
210
+ "language_model.model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
211
+ "language_model.model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
212
+ "language_model.model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
213
+ "language_model.model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
214
+ "language_model.model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
215
+ "language_model.model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
216
+ "language_model.model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
217
+ "language_model.model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
218
+ "language_model.model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
219
+ "language_model.model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
220
+ "language_model.model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
221
+ "language_model.model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
222
+ "language_model.model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
223
+ "language_model.model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
224
+ "language_model.model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
225
+ "language_model.model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
226
+ "language_model.model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
227
+ "language_model.model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
228
+ "language_model.model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
229
+ "language_model.model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
230
+ "language_model.model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
231
+ "language_model.model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
232
+ "language_model.model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
233
+ "language_model.model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
234
+ "language_model.model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
235
+ "language_model.model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
236
+ "language_model.model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
237
+ "language_model.model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
238
+ "language_model.model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
239
+ "language_model.model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
240
+ "language_model.model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
241
+ "language_model.model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
242
+ "language_model.model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
243
+ "language_model.model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
244
+ "language_model.model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
245
+ "language_model.model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
246
+ "language_model.model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
247
+ "language_model.model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
248
+ "language_model.model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
249
+ "language_model.model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
250
+ "language_model.model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
251
+ "language_model.model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
252
+ "language_model.model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
253
+ "language_model.model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
254
+ "language_model.model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
255
+ "language_model.model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
256
+ "language_model.model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
257
+ "language_model.model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
258
+ "language_model.model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
259
+ "language_model.model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
260
+ "language_model.model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
261
+ "language_model.model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
262
+ "language_model.model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
263
+ "language_model.model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
264
+ "language_model.model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
265
+ "language_model.model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
266
+ "language_model.model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
267
+ "language_model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
268
+ "language_model.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
269
+ "language_model.model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
270
+ "language_model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
271
+ "language_model.model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
272
+ "language_model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
273
+ "language_model.model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
274
+ "language_model.model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
275
+ "language_model.model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
276
+ "language_model.model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
277
+ "language_model.model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
278
+ "language_model.model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
279
+ "language_model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
280
+ "language_model.model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
281
+ "language_model.model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
282
+ "language_model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
283
+ "language_model.model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
284
+ "language_model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
285
+ "language_model.model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
286
+ "language_model.model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
287
+ "language_model.model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
288
+ "language_model.model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
289
+ "language_model.model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
290
+ "language_model.model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
291
+ "language_model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
292
+ "language_model.model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
293
+ "language_model.model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
294
+ "language_model.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
295
+ "language_model.model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
296
+ "language_model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
297
+ "language_model.model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
298
+ "language_model.model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
299
+ "language_model.model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
300
+ "language_model.model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
301
+ "language_model.model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
302
+ "language_model.model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
303
+ "language_model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
304
+ "language_model.model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
305
+ "language_model.model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
306
+ "language_model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
307
+ "language_model.model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
308
+ "language_model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
309
+ "language_model.model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
310
+ "language_model.model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
311
+ "language_model.model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
312
+ "language_model.model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
313
+ "language_model.model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
314
+ "language_model.model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
315
+ "language_model.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
316
+ "language_model.model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
317
+ "language_model.model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
318
+ "language_model.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
319
+ "language_model.model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
320
+ "language_model.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
321
+ "language_model.model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
322
+ "language_model.model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
323
+ "language_model.model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
324
+ "language_model.model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
325
+ "language_model.model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
326
+ "language_model.model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
327
+ "language_model.model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
328
+ "language_model.model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
329
+ "language_model.model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
330
+ "language_model.model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
331
+ "language_model.model.layers.8.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
332
+ "language_model.model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
333
+ "language_model.model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
334
+ "language_model.model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
335
+ "language_model.model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
336
+ "language_model.model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
337
+ "language_model.model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
338
+ "language_model.model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
339
+ "language_model.model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
340
+ "language_model.model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
341
+ "language_model.model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
342
+ "language_model.model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
343
+ "language_model.model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
344
+ "language_model.model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
345
+ "language_model.model.norm.weight": "model-00003-of-00004.safetensors",
346
+ "mlp1.0.bias": "model-00004-of-00004.safetensors",
347
+ "mlp1.0.weight": "model-00004-of-00004.safetensors",
348
+ "mlp1.1.bias": "model-00004-of-00004.safetensors",
349
+ "mlp1.1.weight": "model-00004-of-00004.safetensors",
350
+ "mlp1.3.bias": "model-00004-of-00004.safetensors",
351
+ "mlp1.3.weight": "model-00004-of-00004.safetensors",
352
+ "vision_model.embeddings.class_embedding": "model-00001-of-00004.safetensors",
353
+ "vision_model.embeddings.patch_embedding.bias": "model-00001-of-00004.safetensors",
354
+ "vision_model.embeddings.patch_embedding.weight": "model-00001-of-00004.safetensors",
355
+ "vision_model.embeddings.position_embedding": "model-00001-of-00004.safetensors",
356
+ "vision_model.encoder.layers.0.attn.proj.bias": "model-00001-of-00004.safetensors",
357
+ "vision_model.encoder.layers.0.attn.proj.weight": "model-00001-of-00004.safetensors",
358
+ "vision_model.encoder.layers.0.attn.qkv.bias": "model-00001-of-00004.safetensors",
359
+ "vision_model.encoder.layers.0.attn.qkv.weight": "model-00001-of-00004.safetensors",
360
+ "vision_model.encoder.layers.0.ls1": "model-00001-of-00004.safetensors",
361
+ "vision_model.encoder.layers.0.ls2": "model-00001-of-00004.safetensors",
362
+ "vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00004.safetensors",
363
+ "vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00004.safetensors",
364
+ "vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00004.safetensors",
365
+ "vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00004.safetensors",
366
+ "vision_model.encoder.layers.0.norm1.bias": "model-00001-of-00004.safetensors",
367
+ "vision_model.encoder.layers.0.norm1.weight": "model-00001-of-00004.safetensors",
368
+ "vision_model.encoder.layers.0.norm2.bias": "model-00001-of-00004.safetensors",
369
+ "vision_model.encoder.layers.0.norm2.weight": "model-00001-of-00004.safetensors",
370
+ "vision_model.encoder.layers.1.attn.proj.bias": "model-00001-of-00004.safetensors",
371
+ "vision_model.encoder.layers.1.attn.proj.weight": "model-00001-of-00004.safetensors",
372
+ "vision_model.encoder.layers.1.attn.qkv.bias": "model-00001-of-00004.safetensors",
373
+ "vision_model.encoder.layers.1.attn.qkv.weight": "model-00001-of-00004.safetensors",
374
+ "vision_model.encoder.layers.1.ls1": "model-00001-of-00004.safetensors",
375
+ "vision_model.encoder.layers.1.ls2": "model-00001-of-00004.safetensors",
376
+ "vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00004.safetensors",
377
+ "vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00004.safetensors",
378
+ "vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00004.safetensors",
379
+ "vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00004.safetensors",
380
+ "vision_model.encoder.layers.1.norm1.bias": "model-00001-of-00004.safetensors",
381
+ "vision_model.encoder.layers.1.norm1.weight": "model-00001-of-00004.safetensors",
382
+ "vision_model.encoder.layers.1.norm2.bias": "model-00001-of-00004.safetensors",
383
+ "vision_model.encoder.layers.1.norm2.weight": "model-00001-of-00004.safetensors",
384
+ "vision_model.encoder.layers.10.attn.proj.bias": "model-00001-of-00004.safetensors",
385
+ "vision_model.encoder.layers.10.attn.proj.weight": "model-00001-of-00004.safetensors",
386
+ "vision_model.encoder.layers.10.attn.qkv.bias": "model-00001-of-00004.safetensors",
387
+ "vision_model.encoder.layers.10.attn.qkv.weight": "model-00001-of-00004.safetensors",
388
+ "vision_model.encoder.layers.10.ls1": "model-00001-of-00004.safetensors",
389
+ "vision_model.encoder.layers.10.ls2": "model-00001-of-00004.safetensors",
390
+ "vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00004.safetensors",
391
+ "vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00004.safetensors",
392
+ "vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00004.safetensors",
393
+ "vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00004.safetensors",
394
+ "vision_model.encoder.layers.10.norm1.bias": "model-00001-of-00004.safetensors",
395
+ "vision_model.encoder.layers.10.norm1.weight": "model-00001-of-00004.safetensors",
396
+ "vision_model.encoder.layers.10.norm2.bias": "model-00001-of-00004.safetensors",
397
+ "vision_model.encoder.layers.10.norm2.weight": "model-00001-of-00004.safetensors",
398
+ "vision_model.encoder.layers.11.attn.proj.bias": "model-00001-of-00004.safetensors",
399
+ "vision_model.encoder.layers.11.attn.proj.weight": "model-00001-of-00004.safetensors",
400
+ "vision_model.encoder.layers.11.attn.qkv.bias": "model-00001-of-00004.safetensors",
401
+ "vision_model.encoder.layers.11.attn.qkv.weight": "model-00001-of-00004.safetensors",
402
+ "vision_model.encoder.layers.11.ls1": "model-00001-of-00004.safetensors",
403
+ "vision_model.encoder.layers.11.ls2": "model-00001-of-00004.safetensors",
404
+ "vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00004.safetensors",
405
+ "vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00004.safetensors",
406
+ "vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00004.safetensors",
407
+ "vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00004.safetensors",
408
+ "vision_model.encoder.layers.11.norm1.bias": "model-00001-of-00004.safetensors",
409
+ "vision_model.encoder.layers.11.norm1.weight": "model-00001-of-00004.safetensors",
410
+ "vision_model.encoder.layers.11.norm2.bias": "model-00001-of-00004.safetensors",
411
+ "vision_model.encoder.layers.11.norm2.weight": "model-00001-of-00004.safetensors",
412
+ "vision_model.encoder.layers.12.attn.proj.bias": "model-00001-of-00004.safetensors",
413
+ "vision_model.encoder.layers.12.attn.proj.weight": "model-00001-of-00004.safetensors",
414
+ "vision_model.encoder.layers.12.attn.qkv.bias": "model-00001-of-00004.safetensors",
415
+ "vision_model.encoder.layers.12.attn.qkv.weight": "model-00001-of-00004.safetensors",
416
+ "vision_model.encoder.layers.12.ls1": "model-00001-of-00004.safetensors",
417
+ "vision_model.encoder.layers.12.ls2": "model-00001-of-00004.safetensors",
418
+ "vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00004.safetensors",
419
+ "vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00004.safetensors",
420
+ "vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00004.safetensors",
421
+ "vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00004.safetensors",
422
+ "vision_model.encoder.layers.12.norm1.bias": "model-00001-of-00004.safetensors",
423
+ "vision_model.encoder.layers.12.norm1.weight": "model-00001-of-00004.safetensors",
424
+ "vision_model.encoder.layers.12.norm2.bias": "model-00001-of-00004.safetensors",
425
+ "vision_model.encoder.layers.12.norm2.weight": "model-00001-of-00004.safetensors",
426
+ "vision_model.encoder.layers.13.attn.proj.bias": "model-00001-of-00004.safetensors",
427
+ "vision_model.encoder.layers.13.attn.proj.weight": "model-00001-of-00004.safetensors",
428
+ "vision_model.encoder.layers.13.attn.qkv.bias": "model-00001-of-00004.safetensors",
429
+ "vision_model.encoder.layers.13.attn.qkv.weight": "model-00001-of-00004.safetensors",
430
+ "vision_model.encoder.layers.13.ls1": "model-00001-of-00004.safetensors",
431
+ "vision_model.encoder.layers.13.ls2": "model-00001-of-00004.safetensors",
432
+ "vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00004.safetensors",
433
+ "vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00004.safetensors",
434
+ "vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00004.safetensors",
435
+ "vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00004.safetensors",
436
+ "vision_model.encoder.layers.13.norm1.bias": "model-00001-of-00004.safetensors",
437
+ "vision_model.encoder.layers.13.norm1.weight": "model-00001-of-00004.safetensors",
438
+ "vision_model.encoder.layers.13.norm2.bias": "model-00001-of-00004.safetensors",
439
+ "vision_model.encoder.layers.13.norm2.weight": "model-00001-of-00004.safetensors",
440
+ "vision_model.encoder.layers.14.attn.proj.bias": "model-00001-of-00004.safetensors",
441
+ "vision_model.encoder.layers.14.attn.proj.weight": "model-00001-of-00004.safetensors",
442
+ "vision_model.encoder.layers.14.attn.qkv.bias": "model-00001-of-00004.safetensors",
443
+ "vision_model.encoder.layers.14.attn.qkv.weight": "model-00001-of-00004.safetensors",
444
+ "vision_model.encoder.layers.14.ls1": "model-00001-of-00004.safetensors",
445
+ "vision_model.encoder.layers.14.ls2": "model-00001-of-00004.safetensors",
446
+ "vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00004.safetensors",
447
+ "vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00004.safetensors",
448
+ "vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00004.safetensors",
449
+ "vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00004.safetensors",
450
+ "vision_model.encoder.layers.14.norm1.bias": "model-00001-of-00004.safetensors",
451
+ "vision_model.encoder.layers.14.norm1.weight": "model-00001-of-00004.safetensors",
452
+ "vision_model.encoder.layers.14.norm2.bias": "model-00001-of-00004.safetensors",
453
+ "vision_model.encoder.layers.14.norm2.weight": "model-00001-of-00004.safetensors",
454
+ "vision_model.encoder.layers.15.attn.proj.bias": "model-00001-of-00004.safetensors",
455
+ "vision_model.encoder.layers.15.attn.proj.weight": "model-00001-of-00004.safetensors",
456
+ "vision_model.encoder.layers.15.attn.qkv.bias": "model-00001-of-00004.safetensors",
457
+ "vision_model.encoder.layers.15.attn.qkv.weight": "model-00001-of-00004.safetensors",
458
+ "vision_model.encoder.layers.15.ls1": "model-00001-of-00004.safetensors",
459
+ "vision_model.encoder.layers.15.ls2": "model-00001-of-00004.safetensors",
460
+ "vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00004.safetensors",
461
+ "vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00004.safetensors",
462
+ "vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00004.safetensors",
463
+ "vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00004.safetensors",
464
+ "vision_model.encoder.layers.15.norm1.bias": "model-00001-of-00004.safetensors",
465
+ "vision_model.encoder.layers.15.norm1.weight": "model-00001-of-00004.safetensors",
466
+ "vision_model.encoder.layers.15.norm2.bias": "model-00001-of-00004.safetensors",
467
+ "vision_model.encoder.layers.15.norm2.weight": "model-00001-of-00004.safetensors",
468
+ "vision_model.encoder.layers.16.attn.proj.bias": "model-00001-of-00004.safetensors",
469
+ "vision_model.encoder.layers.16.attn.proj.weight": "model-00001-of-00004.safetensors",
470
+ "vision_model.encoder.layers.16.attn.qkv.bias": "model-00001-of-00004.safetensors",
471
+ "vision_model.encoder.layers.16.attn.qkv.weight": "model-00001-of-00004.safetensors",
472
+ "vision_model.encoder.layers.16.ls1": "model-00001-of-00004.safetensors",
473
+ "vision_model.encoder.layers.16.ls2": "model-00001-of-00004.safetensors",
474
+ "vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00004.safetensors",
475
+ "vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00004.safetensors",
476
+ "vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00004.safetensors",
477
+ "vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00004.safetensors",
478
+ "vision_model.encoder.layers.16.norm1.bias": "model-00001-of-00004.safetensors",
479
+ "vision_model.encoder.layers.16.norm1.weight": "model-00001-of-00004.safetensors",
480
+ "vision_model.encoder.layers.16.norm2.bias": "model-00001-of-00004.safetensors",
481
+ "vision_model.encoder.layers.16.norm2.weight": "model-00001-of-00004.safetensors",
482
+ "vision_model.encoder.layers.17.attn.proj.bias": "model-00001-of-00004.safetensors",
483
+ "vision_model.encoder.layers.17.attn.proj.weight": "model-00001-of-00004.safetensors",
484
+ "vision_model.encoder.layers.17.attn.qkv.bias": "model-00001-of-00004.safetensors",
485
+ "vision_model.encoder.layers.17.attn.qkv.weight": "model-00001-of-00004.safetensors",
486
+ "vision_model.encoder.layers.17.ls1": "model-00001-of-00004.safetensors",
487
+ "vision_model.encoder.layers.17.ls2": "model-00001-of-00004.safetensors",
488
+ "vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00004.safetensors",
489
+ "vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00004.safetensors",
490
+ "vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00004.safetensors",
491
+ "vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00004.safetensors",
492
+ "vision_model.encoder.layers.17.norm1.bias": "model-00001-of-00004.safetensors",
493
+ "vision_model.encoder.layers.17.norm1.weight": "model-00001-of-00004.safetensors",
494
+ "vision_model.encoder.layers.17.norm2.bias": "model-00001-of-00004.safetensors",
495
+ "vision_model.encoder.layers.17.norm2.weight": "model-00001-of-00004.safetensors",
496
+ "vision_model.encoder.layers.18.attn.proj.bias": "model-00001-of-00004.safetensors",
497
+ "vision_model.encoder.layers.18.attn.proj.weight": "model-00001-of-00004.safetensors",
498
+ "vision_model.encoder.layers.18.attn.qkv.bias": "model-00001-of-00004.safetensors",
499
+ "vision_model.encoder.layers.18.attn.qkv.weight": "model-00001-of-00004.safetensors",
500
+ "vision_model.encoder.layers.18.ls1": "model-00001-of-00004.safetensors",
501
+ "vision_model.encoder.layers.18.ls2": "model-00001-of-00004.safetensors",
502
+ "vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00004.safetensors",
503
+ "vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00004.safetensors",
504
+ "vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00004.safetensors",
505
+ "vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00004.safetensors",
506
+ "vision_model.encoder.layers.18.norm1.bias": "model-00001-of-00004.safetensors",
507
+ "vision_model.encoder.layers.18.norm1.weight": "model-00001-of-00004.safetensors",
508
+ "vision_model.encoder.layers.18.norm2.bias": "model-00001-of-00004.safetensors",
509
+ "vision_model.encoder.layers.18.norm2.weight": "model-00001-of-00004.safetensors",
510
+ "vision_model.encoder.layers.19.attn.proj.bias": "model-00001-of-00004.safetensors",
511
+ "vision_model.encoder.layers.19.attn.proj.weight": "model-00001-of-00004.safetensors",
512
+ "vision_model.encoder.layers.19.attn.qkv.bias": "model-00001-of-00004.safetensors",
513
+ "vision_model.encoder.layers.19.attn.qkv.weight": "model-00001-of-00004.safetensors",
514
+ "vision_model.encoder.layers.19.ls1": "model-00001-of-00004.safetensors",
515
+ "vision_model.encoder.layers.19.ls2": "model-00001-of-00004.safetensors",
516
+ "vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00004.safetensors",
517
+ "vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00004.safetensors",
518
+ "vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00004.safetensors",
519
+ "vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00004.safetensors",
520
+ "vision_model.encoder.layers.19.norm1.bias": "model-00001-of-00004.safetensors",
521
+ "vision_model.encoder.layers.19.norm1.weight": "model-00001-of-00004.safetensors",
522
+ "vision_model.encoder.layers.19.norm2.bias": "model-00001-of-00004.safetensors",
523
+ "vision_model.encoder.layers.19.norm2.weight": "model-00001-of-00004.safetensors",
524
+ "vision_model.encoder.layers.2.attn.proj.bias": "model-00001-of-00004.safetensors",
525
+ "vision_model.encoder.layers.2.attn.proj.weight": "model-00001-of-00004.safetensors",
526
+ "vision_model.encoder.layers.2.attn.qkv.bias": "model-00001-of-00004.safetensors",
527
+ "vision_model.encoder.layers.2.attn.qkv.weight": "model-00001-of-00004.safetensors",
528
+ "vision_model.encoder.layers.2.ls1": "model-00001-of-00004.safetensors",
529
+ "vision_model.encoder.layers.2.ls2": "model-00001-of-00004.safetensors",
530
+ "vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00004.safetensors",
531
+ "vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00004.safetensors",
532
+ "vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00004.safetensors",
533
+ "vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00004.safetensors",
534
+ "vision_model.encoder.layers.2.norm1.bias": "model-00001-of-00004.safetensors",
535
+ "vision_model.encoder.layers.2.norm1.weight": "model-00001-of-00004.safetensors",
536
+ "vision_model.encoder.layers.2.norm2.bias": "model-00001-of-00004.safetensors",
537
+ "vision_model.encoder.layers.2.norm2.weight": "model-00001-of-00004.safetensors",
538
+ "vision_model.encoder.layers.20.attn.proj.bias": "model-00001-of-00004.safetensors",
539
+ "vision_model.encoder.layers.20.attn.proj.weight": "model-00001-of-00004.safetensors",
540
+ "vision_model.encoder.layers.20.attn.qkv.bias": "model-00001-of-00004.safetensors",
541
+ "vision_model.encoder.layers.20.attn.qkv.weight": "model-00001-of-00004.safetensors",
542
+ "vision_model.encoder.layers.20.ls1": "model-00001-of-00004.safetensors",
543
+ "vision_model.encoder.layers.20.ls2": "model-00001-of-00004.safetensors",
544
+ "vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00004.safetensors",
545
+ "vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00004.safetensors",
546
+ "vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00004.safetensors",
547
+ "vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00004.safetensors",
548
+ "vision_model.encoder.layers.20.norm1.bias": "model-00001-of-00004.safetensors",
549
+ "vision_model.encoder.layers.20.norm1.weight": "model-00001-of-00004.safetensors",
550
+ "vision_model.encoder.layers.20.norm2.bias": "model-00001-of-00004.safetensors",
551
+ "vision_model.encoder.layers.20.norm2.weight": "model-00001-of-00004.safetensors",
552
+ "vision_model.encoder.layers.21.attn.proj.bias": "model-00001-of-00004.safetensors",
553
+ "vision_model.encoder.layers.21.attn.proj.weight": "model-00001-of-00004.safetensors",
554
+ "vision_model.encoder.layers.21.attn.qkv.bias": "model-00001-of-00004.safetensors",
555
+ "vision_model.encoder.layers.21.attn.qkv.weight": "model-00001-of-00004.safetensors",
556
+ "vision_model.encoder.layers.21.ls1": "model-00001-of-00004.safetensors",
557
+ "vision_model.encoder.layers.21.ls2": "model-00001-of-00004.safetensors",
558
+ "vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00004.safetensors",
559
+ "vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00004.safetensors",
560
+ "vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00004.safetensors",
561
+ "vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00004.safetensors",
562
+ "vision_model.encoder.layers.21.norm1.bias": "model-00001-of-00004.safetensors",
563
+ "vision_model.encoder.layers.21.norm1.weight": "model-00001-of-00004.safetensors",
564
+ "vision_model.encoder.layers.21.norm2.bias": "model-00001-of-00004.safetensors",
565
+ "vision_model.encoder.layers.21.norm2.weight": "model-00001-of-00004.safetensors",
566
+ "vision_model.encoder.layers.22.attn.proj.bias": "model-00001-of-00004.safetensors",
567
+ "vision_model.encoder.layers.22.attn.proj.weight": "model-00001-of-00004.safetensors",
568
+ "vision_model.encoder.layers.22.attn.qkv.bias": "model-00001-of-00004.safetensors",
569
+ "vision_model.encoder.layers.22.attn.qkv.weight": "model-00001-of-00004.safetensors",
570
+ "vision_model.encoder.layers.22.ls1": "model-00001-of-00004.safetensors",
571
+ "vision_model.encoder.layers.22.ls2": "model-00001-of-00004.safetensors",
572
+ "vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00004.safetensors",
573
+ "vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00004.safetensors",
574
+ "vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00004.safetensors",
575
+ "vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00004.safetensors",
576
+ "vision_model.encoder.layers.22.norm1.bias": "model-00001-of-00004.safetensors",
577
+ "vision_model.encoder.layers.22.norm1.weight": "model-00001-of-00004.safetensors",
578
+ "vision_model.encoder.layers.22.norm2.bias": "model-00001-of-00004.safetensors",
579
+ "vision_model.encoder.layers.22.norm2.weight": "model-00001-of-00004.safetensors",
580
+ "vision_model.encoder.layers.23.attn.proj.bias": "model-00001-of-00004.safetensors",
581
+ "vision_model.encoder.layers.23.attn.proj.weight": "model-00001-of-00004.safetensors",
582
+ "vision_model.encoder.layers.23.attn.qkv.bias": "model-00001-of-00004.safetensors",
583
+ "vision_model.encoder.layers.23.attn.qkv.weight": "model-00001-of-00004.safetensors",
584
+ "vision_model.encoder.layers.23.ls1": "model-00001-of-00004.safetensors",
585
+ "vision_model.encoder.layers.23.ls2": "model-00001-of-00004.safetensors",
586
+ "vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00004.safetensors",
587
+ "vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00004.safetensors",
588
+ "vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00004.safetensors",
589
+ "vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00004.safetensors",
590
+ "vision_model.encoder.layers.23.norm1.bias": "model-00001-of-00004.safetensors",
591
+ "vision_model.encoder.layers.23.norm1.weight": "model-00001-of-00004.safetensors",
592
+ "vision_model.encoder.layers.23.norm2.bias": "model-00001-of-00004.safetensors",
593
+ "vision_model.encoder.layers.23.norm2.weight": "model-00001-of-00004.safetensors",
594
+ "vision_model.encoder.layers.3.attn.proj.bias": "model-00001-of-00004.safetensors",
595
+ "vision_model.encoder.layers.3.attn.proj.weight": "model-00001-of-00004.safetensors",
596
+ "vision_model.encoder.layers.3.attn.qkv.bias": "model-00001-of-00004.safetensors",
597
+ "vision_model.encoder.layers.3.attn.qkv.weight": "model-00001-of-00004.safetensors",
598
+ "vision_model.encoder.layers.3.ls1": "model-00001-of-00004.safetensors",
599
+ "vision_model.encoder.layers.3.ls2": "model-00001-of-00004.safetensors",
600
+ "vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00004.safetensors",
601
+ "vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00004.safetensors",
602
+ "vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00004.safetensors",
603
+ "vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00004.safetensors",
604
+ "vision_model.encoder.layers.3.norm1.bias": "model-00001-of-00004.safetensors",
605
+ "vision_model.encoder.layers.3.norm1.weight": "model-00001-of-00004.safetensors",
606
+ "vision_model.encoder.layers.3.norm2.bias": "model-00001-of-00004.safetensors",
607
+ "vision_model.encoder.layers.3.norm2.weight": "model-00001-of-00004.safetensors",
608
+ "vision_model.encoder.layers.4.attn.proj.bias": "model-00001-of-00004.safetensors",
609
+ "vision_model.encoder.layers.4.attn.proj.weight": "model-00001-of-00004.safetensors",
610
+ "vision_model.encoder.layers.4.attn.qkv.bias": "model-00001-of-00004.safetensors",
611
+ "vision_model.encoder.layers.4.attn.qkv.weight": "model-00001-of-00004.safetensors",
612
+ "vision_model.encoder.layers.4.ls1": "model-00001-of-00004.safetensors",
613
+ "vision_model.encoder.layers.4.ls2": "model-00001-of-00004.safetensors",
614
+ "vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00004.safetensors",
615
+ "vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00004.safetensors",
616
+ "vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00004.safetensors",
617
+ "vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00004.safetensors",
618
+ "vision_model.encoder.layers.4.norm1.bias": "model-00001-of-00004.safetensors",
619
+ "vision_model.encoder.layers.4.norm1.weight": "model-00001-of-00004.safetensors",
620
+ "vision_model.encoder.layers.4.norm2.bias": "model-00001-of-00004.safetensors",
621
+ "vision_model.encoder.layers.4.norm2.weight": "model-00001-of-00004.safetensors",
622
+ "vision_model.encoder.layers.5.attn.proj.bias": "model-00001-of-00004.safetensors",
623
+ "vision_model.encoder.layers.5.attn.proj.weight": "model-00001-of-00004.safetensors",
624
+ "vision_model.encoder.layers.5.attn.qkv.bias": "model-00001-of-00004.safetensors",
625
+ "vision_model.encoder.layers.5.attn.qkv.weight": "model-00001-of-00004.safetensors",
626
+ "vision_model.encoder.layers.5.ls1": "model-00001-of-00004.safetensors",
627
+ "vision_model.encoder.layers.5.ls2": "model-00001-of-00004.safetensors",
628
+ "vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00004.safetensors",
629
+ "vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00004.safetensors",
630
+ "vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00004.safetensors",
631
+ "vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00004.safetensors",
632
+ "vision_model.encoder.layers.5.norm1.bias": "model-00001-of-00004.safetensors",
633
+ "vision_model.encoder.layers.5.norm1.weight": "model-00001-of-00004.safetensors",
634
+ "vision_model.encoder.layers.5.norm2.bias": "model-00001-of-00004.safetensors",
635
+ "vision_model.encoder.layers.5.norm2.weight": "model-00001-of-00004.safetensors",
636
+ "vision_model.encoder.layers.6.attn.proj.bias": "model-00001-of-00004.safetensors",
637
+ "vision_model.encoder.layers.6.attn.proj.weight": "model-00001-of-00004.safetensors",
638
+ "vision_model.encoder.layers.6.attn.qkv.bias": "model-00001-of-00004.safetensors",
639
+ "vision_model.encoder.layers.6.attn.qkv.weight": "model-00001-of-00004.safetensors",
640
+ "vision_model.encoder.layers.6.ls1": "model-00001-of-00004.safetensors",
641
+ "vision_model.encoder.layers.6.ls2": "model-00001-of-00004.safetensors",
642
+ "vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00004.safetensors",
643
+ "vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00004.safetensors",
644
+ "vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00004.safetensors",
645
+ "vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00004.safetensors",
646
+ "vision_model.encoder.layers.6.norm1.bias": "model-00001-of-00004.safetensors",
647
+ "vision_model.encoder.layers.6.norm1.weight": "model-00001-of-00004.safetensors",
648
+ "vision_model.encoder.layers.6.norm2.bias": "model-00001-of-00004.safetensors",
649
+ "vision_model.encoder.layers.6.norm2.weight": "model-00001-of-00004.safetensors",
650
+ "vision_model.encoder.layers.7.attn.proj.bias": "model-00001-of-00004.safetensors",
651
+ "vision_model.encoder.layers.7.attn.proj.weight": "model-00001-of-00004.safetensors",
652
+ "vision_model.encoder.layers.7.attn.qkv.bias": "model-00001-of-00004.safetensors",
653
+ "vision_model.encoder.layers.7.attn.qkv.weight": "model-00001-of-00004.safetensors",
654
+ "vision_model.encoder.layers.7.ls1": "model-00001-of-00004.safetensors",
655
+ "vision_model.encoder.layers.7.ls2": "model-00001-of-00004.safetensors",
656
+ "vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00004.safetensors",
657
+ "vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00004.safetensors",
658
+ "vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00004.safetensors",
659
+ "vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00004.safetensors",
660
+ "vision_model.encoder.layers.7.norm1.bias": "model-00001-of-00004.safetensors",
661
+ "vision_model.encoder.layers.7.norm1.weight": "model-00001-of-00004.safetensors",
662
+ "vision_model.encoder.layers.7.norm2.bias": "model-00001-of-00004.safetensors",
663
+ "vision_model.encoder.layers.7.norm2.weight": "model-00001-of-00004.safetensors",
664
+ "vision_model.encoder.layers.8.attn.proj.bias": "model-00001-of-00004.safetensors",
665
+ "vision_model.encoder.layers.8.attn.proj.weight": "model-00001-of-00004.safetensors",
666
+ "vision_model.encoder.layers.8.attn.qkv.bias": "model-00001-of-00004.safetensors",
667
+ "vision_model.encoder.layers.8.attn.qkv.weight": "model-00001-of-00004.safetensors",
668
+ "vision_model.encoder.layers.8.ls1": "model-00001-of-00004.safetensors",
669
+ "vision_model.encoder.layers.8.ls2": "model-00001-of-00004.safetensors",
670
+ "vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00004.safetensors",
671
+ "vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00004.safetensors",
672
+ "vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00004.safetensors",
673
+ "vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00004.safetensors",
674
+ "vision_model.encoder.layers.8.norm1.bias": "model-00001-of-00004.safetensors",
675
+ "vision_model.encoder.layers.8.norm1.weight": "model-00001-of-00004.safetensors",
676
+ "vision_model.encoder.layers.8.norm2.bias": "model-00001-of-00004.safetensors",
677
+ "vision_model.encoder.layers.8.norm2.weight": "model-00001-of-00004.safetensors",
678
+ "vision_model.encoder.layers.9.attn.proj.bias": "model-00001-of-00004.safetensors",
679
+ "vision_model.encoder.layers.9.attn.proj.weight": "model-00001-of-00004.safetensors",
680
+ "vision_model.encoder.layers.9.attn.qkv.bias": "model-00001-of-00004.safetensors",
681
+ "vision_model.encoder.layers.9.attn.qkv.weight": "model-00001-of-00004.safetensors",
682
+ "vision_model.encoder.layers.9.ls1": "model-00001-of-00004.safetensors",
683
+ "vision_model.encoder.layers.9.ls2": "model-00001-of-00004.safetensors",
684
+ "vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00004.safetensors",
685
+ "vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00004.safetensors",
686
+ "vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00004.safetensors",
687
+ "vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00004.safetensors",
688
+ "vision_model.encoder.layers.9.norm1.bias": "model-00001-of-00004.safetensors",
689
+ "vision_model.encoder.layers.9.norm1.weight": "model-00001-of-00004.safetensors",
690
+ "vision_model.encoder.layers.9.norm2.bias": "model-00001-of-00004.safetensors",
691
+ "vision_model.encoder.layers.9.norm2.weight": "model-00001-of-00004.safetensors"
692
+ }
693
+ }
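For orientation, the weight_map above pins every parameter name to one of the four safetensors shards, so a loader only needs to open the shards the index references. Below is a minimal sketch of consuming such an index with the standard json and safetensors libraries; the relative file path is an assumption, and in practice loading goes through transformers' from_pretrained rather than this manual route.

import json
from safetensors.torch import load_file

# Read the index that maps parameter names to shard files (path assumed relative to the checkpoint dir).
with open("model.safetensors.index.json") as f:
    index = json.load(f)

state_dict = {}
for shard in sorted(set(index["weight_map"].values())):
    # Each shard holds the subset of tensors that weight_map assigns to it.
    state_dict.update(load_file(shard))

# Sanity check: every mapped parameter should now be present.
missing = [k for k in index["weight_map"] if k not in state_dict]
assert not missing, f"missing tensors: {missing[:5]}"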
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/modeling_intern_vit.py ADDED
@@ -0,0 +1,431 @@
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2024 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ from typing import Optional, Tuple, Union
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ import torch.utils.checkpoint
12
+ from einops import rearrange
13
+ from timm.layers import DropPath
14
+ from torch import nn
15
+ from transformers.activations import ACT2FN
16
+ from transformers.modeling_outputs import (BaseModelOutput,
17
+ BaseModelOutputWithPooling)
18
+ from transformers.modeling_utils import PreTrainedModel
19
+ from transformers.utils import logging
20
+
21
+ from .configuration_intern_vit import InternVisionConfig
22
+
23
+ try:
24
+ from flash_attn.bert_padding import pad_input, unpad_input
25
+ from flash_attn.flash_attn_interface import \
26
+ flash_attn_varlen_qkvpacked_func
27
+ has_flash_attn = True
28
+ except Exception:
29
+ print('FlashAttention2 is not installed.')
30
+ has_flash_attn = False
31
+
32
+ logger = logging.get_logger(__name__)
33
+
34
+
35
+ class FlashAttention(nn.Module):
36
+ """Implement the scaled dot product attention with softmax.
37
+ Arguments
38
+ ---------
39
+ softmax_scale: The temperature to use for the softmax attention.
40
+ (default: 1/sqrt(d_keys) where d_keys is computed at
41
+ runtime)
42
+ attention_dropout: The dropout rate to apply to the attention
43
+ (default: 0.0)
44
+ """
45
+
46
+ def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None):
47
+ super().__init__()
48
+ self.softmax_scale = softmax_scale
49
+ self.dropout_p = attention_dropout
50
+
51
+ def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None,
52
+ max_s=None, need_weights=False):
53
+ """Implements the multihead softmax attention.
54
+ Arguments
55
+ ---------
56
+ qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
57
+ if unpadded: (nnz, 3, h, d)
58
+ key_padding_mask: a bool tensor of shape (B, S)
59
+ """
60
+ assert not need_weights
61
+ assert qkv.dtype in [torch.float16, torch.bfloat16]
62
+ assert qkv.is_cuda
63
+
64
+ if cu_seqlens is None:
65
+ batch_size = qkv.shape[0]
66
+ seqlen = qkv.shape[1]
67
+ if key_padding_mask is None:
68
+ qkv = rearrange(qkv, 'b s ... -> (b s) ...')
69
+ max_s = seqlen
70
+ cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
71
+ device=qkv.device)
72
+ output = flash_attn_varlen_qkvpacked_func(
73
+ qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
74
+ softmax_scale=self.softmax_scale, causal=causal
75
+ )
76
+ output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
77
+ else:
78
+ nheads = qkv.shape[-2]
79
+ x = rearrange(qkv, 'b s three h d -> b s (three h d)')
80
+ x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
81
+ x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
82
+ output_unpad = flash_attn_varlen_qkvpacked_func(
83
+ x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
84
+ softmax_scale=self.softmax_scale, causal=causal
85
+ )
86
+ output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
87
+ indices, batch_size, seqlen),
88
+ 'b s (h d) -> b s h d', h=nheads)
89
+ else:
90
+ assert max_s is not None
91
+ output = flash_attn_varlen_qkvpacked_func(
92
+ qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
93
+ softmax_scale=self.softmax_scale, causal=causal
94
+ )
95
+
96
+ return output, None
97
+
98
+
99
+ class InternRMSNorm(nn.Module):
100
+ def __init__(self, hidden_size, eps=1e-6):
101
+ super().__init__()
102
+ self.weight = nn.Parameter(torch.ones(hidden_size))
103
+ self.variance_epsilon = eps
104
+
105
+ def forward(self, hidden_states):
106
+ input_dtype = hidden_states.dtype
107
+ hidden_states = hidden_states.to(torch.float32)
108
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
109
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
110
+ return self.weight * hidden_states.to(input_dtype)
111
+
112
+
113
+ try:
114
+ from apex.normalization import FusedRMSNorm
115
+
116
+ InternRMSNorm = FusedRMSNorm # noqa
117
+
118
+ logger.info('Discovered apex.normalization.FusedRMSNorm - will use it instead of InternRMSNorm')
119
+ except ImportError:
120
+ # using the normal InternRMSNorm
121
+ pass
122
+ except Exception:
123
+ logger.warning('discovered apex but it failed to load, falling back to InternRMSNorm')
124
+ pass
125
+
126
+
127
+ NORM2FN = {
128
+ 'rms_norm': InternRMSNorm,
129
+ 'layer_norm': nn.LayerNorm,
130
+ }
131
+
132
+
133
+ class InternVisionEmbeddings(nn.Module):
134
+ def __init__(self, config: InternVisionConfig):
135
+ super().__init__()
136
+ self.config = config
137
+ self.embed_dim = config.hidden_size
138
+ self.image_size = config.image_size
139
+ self.patch_size = config.patch_size
140
+
141
+ self.class_embedding = nn.Parameter(
142
+ torch.randn(1, 1, self.embed_dim),
143
+ )
144
+
145
+ self.patch_embedding = nn.Conv2d(
146
+ in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
147
+ )
148
+
149
+ self.num_patches = (self.image_size // self.patch_size) ** 2
150
+ self.num_positions = self.num_patches + 1
151
+
152
+ self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
153
+
154
+ def _get_pos_embed(self, pos_embed, H, W):
155
+ target_dtype = pos_embed.dtype
156
+ pos_embed = pos_embed.float().reshape(
157
+ 1, self.image_size // self.patch_size, self.image_size // self.patch_size, -1).permute(0, 3, 1, 2)
158
+ pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False). \
159
+ reshape(1, -1, H * W).permute(0, 2, 1).to(target_dtype)
160
+ return pos_embed
161
+
162
+ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
163
+ target_dtype = self.patch_embedding.weight.dtype
164
+ patch_embeds = self.patch_embedding(pixel_values) # shape = [*, channel, width, height]
165
+ batch_size, _, height, width = patch_embeds.shape
166
+ patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
167
+ class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
168
+ embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
169
+ position_embedding = torch.cat([
170
+ self.position_embedding[:, :1, :],
171
+ self._get_pos_embed(self.position_embedding[:, 1:, :], height, width)
172
+ ], dim=1)
173
+ embeddings = embeddings + position_embedding.to(target_dtype)
174
+ return embeddings
175
+
176
+
177
+ class InternAttention(nn.Module):
178
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
179
+
180
+ def __init__(self, config: InternVisionConfig):
181
+ super().__init__()
182
+ self.config = config
183
+ self.embed_dim = config.hidden_size
184
+ self.num_heads = config.num_attention_heads
185
+ self.use_flash_attn = config.use_flash_attn and has_flash_attn
186
+ if config.use_flash_attn and not has_flash_attn:
187
+ print('Warning: Flash Attention is not available, use_flash_attn is set to False.')
188
+ self.head_dim = self.embed_dim // self.num_heads
189
+ if self.head_dim * self.num_heads != self.embed_dim:
190
+ raise ValueError(
191
+ f'embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:'
192
+ f' {self.num_heads}).'
193
+ )
194
+
195
+ self.scale = self.head_dim ** -0.5
196
+ self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=config.qkv_bias)
197
+ self.attn_drop = nn.Dropout(config.attention_dropout)
198
+ self.proj_drop = nn.Dropout(config.dropout)
199
+
200
+ self.qk_normalization = config.qk_normalization
201
+
202
+ if self.qk_normalization:
203
+ self.q_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
204
+ self.k_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
205
+
206
+ if self.use_flash_attn:
207
+ self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout)
208
+ self.proj = nn.Linear(self.embed_dim, self.embed_dim)
209
+
210
+ def _naive_attn(self, x):
211
+ B, N, C = x.shape
212
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
213
+ q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
214
+
215
+ if self.qk_normalization:
216
+ B_, H_, N_, D_ = q.shape
217
+ q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
218
+ k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
219
+
220
+ attn = ((q * self.scale) @ k.transpose(-2, -1))
221
+ attn = attn.softmax(dim=-1)
222
+ attn = self.attn_drop(attn)
223
+
224
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
225
+ x = self.proj(x)
226
+ x = self.proj_drop(x)
227
+ return x
228
+
229
+ def _flash_attn(self, x, key_padding_mask=None, need_weights=False):
230
+ qkv = self.qkv(x)
231
+ qkv = rearrange(qkv, 'b s (three h d) -> b s three h d', three=3, h=self.num_heads)
232
+
233
+ if self.qk_normalization:
234
+ q, k, v = qkv.unbind(2)
235
+ q = self.q_norm(q.flatten(-2, -1)).view(q.shape)
236
+ k = self.k_norm(k.flatten(-2, -1)).view(k.shape)
237
+ qkv = torch.stack([q, k, v], dim=2)
238
+
239
+ context, _ = self.inner_attn(
240
+ qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False
241
+ )
242
+ outs = self.proj(rearrange(context, 'b s h d -> b s (h d)'))
243
+ outs = self.proj_drop(outs)
244
+ return outs
245
+
246
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
247
+ x = self._naive_attn(hidden_states) if not self.use_flash_attn else self._flash_attn(hidden_states)
248
+ return x
249
+
250
+
251
+ class InternMLP(nn.Module):
252
+ def __init__(self, config: InternVisionConfig):
253
+ super().__init__()
254
+ self.config = config
255
+ self.act = ACT2FN[config.hidden_act]
256
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
257
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
258
+
259
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
260
+ hidden_states = self.fc1(hidden_states)
261
+ hidden_states = self.act(hidden_states)
262
+ hidden_states = self.fc2(hidden_states)
263
+ return hidden_states
264
+
265
+
266
+ class InternVisionEncoderLayer(nn.Module):
267
+ def __init__(self, config: InternVisionConfig, drop_path_rate: float):
268
+ super().__init__()
269
+ self.embed_dim = config.hidden_size
270
+ self.intermediate_size = config.intermediate_size
271
+ self.norm_type = config.norm_type
272
+
273
+ self.attn = InternAttention(config)
274
+ self.mlp = InternMLP(config)
275
+ self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
276
+ self.norm2 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
277
+
278
+ self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
279
+ self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
280
+ self.drop_path1 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
281
+ self.drop_path2 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
282
+
283
+ def forward(
284
+ self,
285
+ hidden_states: torch.Tensor,
286
+     ) -> torch.Tensor:
287
+ """
288
+ Args:
289
+             hidden_states (`torch.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
290
+ """
291
+ hidden_states = hidden_states + self.drop_path1(self.attn(self.norm1(hidden_states).to(hidden_states.dtype)) * self.ls1)
292
+
293
+ hidden_states = hidden_states + self.drop_path2(self.mlp(self.norm2(hidden_states).to(hidden_states.dtype)) * self.ls2)
294
+
295
+ return hidden_states
296
+
297
+
298
+ class InternVisionEncoder(nn.Module):
299
+ """
300
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
301
+ [`InternEncoderLayer`].
302
+
303
+ Args:
304
+ config (`InternConfig`):
305
+ The corresponding vision configuration for the `InternEncoder`.
306
+ """
307
+
308
+ def __init__(self, config: InternVisionConfig):
309
+ super().__init__()
310
+ self.config = config
311
+ # stochastic depth decay rule
312
+ dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
313
+ self.layers = nn.ModuleList([
314
+ InternVisionEncoderLayer(config, dpr[idx]) for idx in range(config.num_hidden_layers)])
315
+ self.gradient_checkpointing = True
316
+
317
+ def forward(
318
+ self,
319
+ inputs_embeds,
320
+ output_hidden_states: Optional[bool] = None,
321
+ return_dict: Optional[bool] = None,
322
+ ) -> Union[Tuple, BaseModelOutput]:
323
+ r"""
324
+ Args:
325
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
326
+ Embedded representation of the inputs. Should be float, not int tokens.
327
+ output_hidden_states (`bool`, *optional*):
328
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
329
+ for more detail.
330
+ return_dict (`bool`, *optional*):
331
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
332
+ """
333
+ output_hidden_states = (
334
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
335
+ )
336
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
337
+
338
+ encoder_states = () if output_hidden_states else None
339
+ hidden_states = inputs_embeds
340
+
341
+ for idx, encoder_layer in enumerate(self.layers):
342
+ if output_hidden_states:
343
+ encoder_states = encoder_states + (hidden_states,)
344
+ if self.gradient_checkpointing and self.training:
345
+ layer_outputs = torch.utils.checkpoint.checkpoint(
346
+ encoder_layer,
347
+ hidden_states)
348
+ else:
349
+ layer_outputs = encoder_layer(
350
+ hidden_states,
351
+ )
352
+ hidden_states = layer_outputs
353
+
354
+ if output_hidden_states:
355
+ encoder_states = encoder_states + (hidden_states,)
356
+
357
+ if not return_dict:
358
+ return tuple(v for v in [hidden_states, encoder_states] if v is not None)
359
+ return BaseModelOutput(
360
+ last_hidden_state=hidden_states, hidden_states=encoder_states
361
+ )
362
+
363
+
364
+ class InternVisionModel(PreTrainedModel):
365
+ main_input_name = 'pixel_values'
366
+ _supports_flash_attn_2 = True
367
+ supports_gradient_checkpointing = True
368
+ config_class = InternVisionConfig
369
+ _no_split_modules = ['InternVisionEncoderLayer']
370
+
371
+ def __init__(self, config: InternVisionConfig):
372
+ super().__init__(config)
373
+ self.config = config
374
+
375
+ self.embeddings = InternVisionEmbeddings(config)
376
+ self.encoder = InternVisionEncoder(config)
377
+
378
+ def resize_pos_embeddings(self, old_size, new_size, patch_size):
379
+ pos_emb = self.embeddings.position_embedding
380
+ _, num_positions, embed_dim = pos_emb.shape
381
+ cls_emb = pos_emb[:, :1, :]
382
+ pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, old_size // patch_size, -1).permute(0, 3, 1, 2)
383
+ pos_emb = F.interpolate(pos_emb.float(), size=new_size // patch_size, mode='bicubic', align_corners=False)
384
+ pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, -1).permute(0, 2, 1)
385
+ pos_emb = torch.cat([cls_emb, pos_emb], dim=1)
386
+ self.embeddings.position_embedding = nn.Parameter(pos_emb)
387
+ self.embeddings.image_size = new_size
388
+ logger.info('Resized position embeddings from {} to {}'.format(old_size, new_size))
389
+
390
+ def get_input_embeddings(self):
391
+ return self.embeddings
392
+
393
+ def forward(
394
+ self,
395
+ pixel_values: Optional[torch.FloatTensor] = None,
396
+ output_hidden_states: Optional[bool] = None,
397
+ return_dict: Optional[bool] = None,
398
+ pixel_embeds: Optional[torch.FloatTensor] = None,
399
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
400
+ output_hidden_states = (
401
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
402
+ )
403
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
404
+
405
+ if pixel_values is None and pixel_embeds is None:
406
+ raise ValueError('You have to specify pixel_values or pixel_embeds')
407
+
408
+ if pixel_embeds is not None:
409
+ hidden_states = pixel_embeds
410
+ else:
411
+ if len(pixel_values.shape) == 4:
412
+ hidden_states = self.embeddings(pixel_values)
413
+ else:
414
+ raise ValueError(f'wrong pixel_values size: {pixel_values.shape}')
415
+ encoder_outputs = self.encoder(
416
+ inputs_embeds=hidden_states,
417
+ output_hidden_states=output_hidden_states,
418
+ return_dict=return_dict,
419
+ )
420
+ last_hidden_state = encoder_outputs.last_hidden_state
421
+ pooled_output = last_hidden_state[:, 0, :]
422
+
423
+ if not return_dict:
424
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
425
+
426
+ return BaseModelOutputWithPooling(
427
+ last_hidden_state=last_hidden_state,
428
+ pooler_output=pooled_output,
429
+ hidden_states=encoder_outputs.hidden_states,
430
+ attentions=encoder_outputs.attentions,
431
+ )
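As a quick orientation to the vision tower defined above, here is a minimal sketch of exercising it through the usual trust_remote_code loading path. The checkpoint path and the 448x448 input size are assumptions (the real size comes from force_image_size / vision_config.image_size in config.json), and loading the full 8B model requires a correspondingly large amount of memory.

import torch
from transformers import AutoModel

# Hypothetical local path to this checkpoint directory; adjust to wherever it was downloaded.
ckpt = "ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228"
model = AutoModel.from_pretrained(ckpt, torch_dtype=torch.bfloat16,
                                  use_flash_attn=False, trust_remote_code=True).eval()

# The vision tower defined above is exposed as model.vision_model; 448x448 is an assumed input size.
pixel_values = torch.randn(1, 3, 448, 448, dtype=torch.bfloat16)
with torch.no_grad():
    out = model.vision_model(pixel_values=pixel_values, return_dict=True)

# last_hidden_state: (batch, 1 + num_patches, hidden_size); pooler_output is the CLS-token embedding.
print(out.last_hidden_state.shape, out.pooler_output.shape)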
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/modeling_internvl_chat.py ADDED
@@ -0,0 +1,359 @@
+# --------------------------------------------------------
+# InternVL
+# Copyright (c) 2024 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+
+import warnings
+from typing import List, Optional, Tuple, Union
+
+import torch.utils.checkpoint
+import transformers
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from transformers import (AutoModel, GenerationConfig, LlamaForCausalLM,
+                          Qwen2ForCausalLM)
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import ModelOutput, logging
+
+from .configuration_internvl_chat import InternVLChatConfig
+from .conversation import get_conv_template
+from .modeling_intern_vit import InternVisionModel, has_flash_attn
+
+logger = logging.get_logger(__name__)
+
+
+def version_cmp(v1, v2, op='eq'):
+    import operator
+
+    from packaging import version
+    op_func = getattr(operator, op)
+    return op_func(version.parse(v1), version.parse(v2))
+
+
+class InternVLChatModel(PreTrainedModel):
+    config_class = InternVLChatConfig
+    main_input_name = 'pixel_values'
+    base_model_prefix = 'language_model'
+    _supports_flash_attn_2 = True
+    supports_gradient_checkpointing = True
+    _no_split_modules = ['InternVisionModel', 'LlamaDecoderLayer', 'Qwen2DecoderLayer']
+
+    def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None, use_flash_attn=True):
+        super().__init__(config)
+
+        assert version_cmp(transformers.__version__, '4.37.0', 'ge')
+        image_size = config.force_image_size or config.vision_config.image_size
+        patch_size = config.vision_config.patch_size
+        self.patch_size = patch_size
+        self.select_layer = config.select_layer
+        self.template = config.template
+        self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2))
+        self.downsample_ratio = config.downsample_ratio
+        self.ps_version = config.ps_version
+        use_flash_attn = use_flash_attn if has_flash_attn else False
+        config.vision_config.use_flash_attn = True if use_flash_attn else False
+        config.llm_config._attn_implementation = 'flash_attention_2' if use_flash_attn else 'eager'
+
+        logger.info(f'num_image_token: {self.num_image_token}')
+        logger.info(f'ps_version: {self.ps_version}')
+        if vision_model is not None:
+            self.vision_model = vision_model
+        else:
+            self.vision_model = InternVisionModel(config.vision_config)
+        if language_model is not None:
+            self.language_model = language_model
+        else:
+            if config.llm_config.architectures[0] == 'LlamaForCausalLM':
+                self.language_model = LlamaForCausalLM(config.llm_config)
+            elif config.llm_config.architectures[0] == 'Qwen2ForCausalLM':
+                self.language_model = Qwen2ForCausalLM(config.llm_config)
+            else:
+                raise NotImplementedError(f'{config.llm_config.architectures[0]} is not implemented.')
+
+        vit_hidden_size = config.vision_config.hidden_size
+        llm_hidden_size = config.llm_config.hidden_size
+
+        self.mlp1 = nn.Sequential(
+            nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
+            nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size),
+            nn.GELU(),
+            nn.Linear(llm_hidden_size, llm_hidden_size)
+        )
+
+        self.img_context_token_id = None
+        self.conv_template = get_conv_template(self.template)
+        self.system_message = self.conv_template.system_message
+
+    def forward(
+            self,
+            pixel_values: torch.FloatTensor,
+            input_ids: torch.LongTensor = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            position_ids: Optional[torch.LongTensor] = None,
+            image_flags: Optional[torch.LongTensor] = None,
+            past_key_values: Optional[List[torch.FloatTensor]] = None,
+            labels: Optional[torch.LongTensor] = None,
+            use_cache: Optional[bool] = None,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        image_flags = image_flags.squeeze(-1)
+        input_embeds = self.language_model.get_input_embeddings()(input_ids).clone()
+
+        vit_embeds = self.extract_feature(pixel_values)
+        vit_embeds = vit_embeds[image_flags == 1]
+        vit_batch_size = pixel_values.shape[0]
+
+        B, N, C = input_embeds.shape
+        input_embeds = input_embeds.reshape(B * N, C)
+
+        if torch.distributed.is_initialized() and torch.distributed.get_rank() == 0:
+            print(f'dynamic ViT batch size: {vit_batch_size}, images per sample: {vit_batch_size / B}, dynamic token length: {N}')
+
+        input_ids = input_ids.reshape(B * N)
+        selected = (input_ids == self.img_context_token_id)
+        try:
+            input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds.reshape(-1, C)
+        except Exception as e:
+            vit_embeds = vit_embeds.reshape(-1, C)
+            print(f'warning: {e}, input_embeds[selected].shape={input_embeds[selected].shape}, '
+                  f'vit_embeds.shape={vit_embeds.shape}')
+            n_token = min(selected.sum(), vit_embeds.size(0))
+            input_embeds[selected][:n_token] = input_embeds[selected][:n_token] * 0.0 + vit_embeds[:n_token]
+
+        input_embeds = input_embeds.reshape(B, N, C)
+
+        outputs = self.language_model(
+            inputs_embeds=input_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        logits = outputs.logits
+
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.language_model.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def pixel_shuffle(self, x, scale_factor=0.5):
+        n, w, h, c = x.size()
+        # N, W, H, C --> N, W, H * scale, C // scale
+        x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
+        # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
+        x = x.permute(0, 2, 1, 3).contiguous()
+        # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
+        x = x.view(n, int(h * scale_factor), int(w * scale_factor),
+                   int(c / (scale_factor * scale_factor)))
+        if self.ps_version == 'v1':
+            warnings.warn("In ps_version 'v1', the height and width have not been swapped back, "
+                          'which results in a transposed image.')
+        else:
+            x = x.permute(0, 2, 1, 3).contiguous()
+        return x
+
+    def extract_feature(self, pixel_values):
+        if self.select_layer == -1:
+            vit_embeds = self.vision_model(
+                pixel_values=pixel_values,
+                output_hidden_states=False,
+                return_dict=True).last_hidden_state
+        else:
+            vit_embeds = self.vision_model(
+                pixel_values=pixel_values,
+                output_hidden_states=True,
+                return_dict=True).hidden_states[self.select_layer]
+        vit_embeds = vit_embeds[:, 1:, :]
+
+        h = w = int(vit_embeds.shape[1] ** 0.5)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
+        vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
+        vit_embeds = self.mlp1(vit_embeds)
+        return vit_embeds
+
+    def batch_chat(self, tokenizer, pixel_values, questions, generation_config, num_patches_list=None,
+                   history=None, return_history=False, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>',
+                   IMG_CONTEXT_TOKEN='<IMG_CONTEXT>', verbose=False, image_counts=None):
+        if history is not None or return_history:
+            print('Now multi-turn chat is not supported in batch_chat.')
+            raise NotImplementedError
+
+        if image_counts is not None:
+            num_patches_list = image_counts
+            print('Warning: `image_counts` is deprecated. Please use `num_patches_list` instead.')
+
+        img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
+        self.img_context_token_id = img_context_token_id
+
+        if verbose and pixel_values is not None:
+            image_bs = pixel_values.shape[0]
+            print(f'dynamic ViT batch size: {image_bs}')
+
+        queries = []
+        for idx, num_patches in enumerate(num_patches_list):
+            question = questions[idx]
+            if pixel_values is not None and '<image>' not in question:
+                question = '<image>\n' + question
+            template = get_conv_template(self.template)
+            template.system_message = self.system_message
+            template.append_message(template.roles[0], question)
+            template.append_message(template.roles[1], None)
+            query = template.get_prompt()
+
+            image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
+            query = query.replace('<image>', image_tokens, 1)
+            queries.append(query)
+
+        tokenizer.padding_side = 'left'
+        model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
+        input_ids = model_inputs['input_ids'].to(self.device)
+        attention_mask = model_inputs['attention_mask'].to(self.device)
+        eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
+        generation_config['eos_token_id'] = eos_token_id
+        generation_output = self.generate(
+            pixel_values=pixel_values,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            **generation_config
+        )
+        responses = tokenizer.batch_decode(generation_output, skip_special_tokens=True)
+        responses = [response.split(template.sep.strip())[0].strip() for response in responses]
+        return responses
+
+    def chat(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False,
+             num_patches_list=None, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>', IMG_CONTEXT_TOKEN='<IMG_CONTEXT>',
+             verbose=False):
+
+        if history is None and pixel_values is not None and '<image>' not in question:
+            question = '<image>\n' + question
+
+        if num_patches_list is None:
+            num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else []
+        assert pixel_values is None or len(pixel_values) == sum(num_patches_list)
+
+        img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
+        self.img_context_token_id = img_context_token_id
+
+        template = get_conv_template(self.template)
+        template.system_message = self.system_message
+        eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
+
+        history = [] if history is None else history
+        for (old_question, old_answer) in history:
+            template.append_message(template.roles[0], old_question)
+            template.append_message(template.roles[1], old_answer)
+        template.append_message(template.roles[0], question)
+        template.append_message(template.roles[1], None)
+        query = template.get_prompt()
+
+        if verbose and pixel_values is not None:
+            image_bs = pixel_values.shape[0]
+            print(f'dynamic ViT batch size: {image_bs}')
+
+        for num_patches in num_patches_list:
+            image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
+            query = query.replace('<image>', image_tokens, 1)
+
+        model_inputs = tokenizer(query, return_tensors='pt')
+        input_ids = model_inputs['input_ids'].to(self.device)
+        attention_mask = model_inputs['attention_mask'].to(self.device)
+        generation_config['eos_token_id'] = eos_token_id
+        generation_output = self.generate(
+            pixel_values=pixel_values,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            **generation_config
+        )
+        response = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
+        response = response.split(template.sep.strip())[0].strip()
+        history.append((question, response))
+        if return_history:
+            return response, history
+        else:
+            query_to_print = query.replace(IMG_CONTEXT_TOKEN, '')
+            query_to_print = query_to_print.replace(f'{IMG_START_TOKEN}{IMG_END_TOKEN}', '<image>')
+            if verbose:
+                print(query_to_print, response)
+            return response
+
+    @torch.no_grad()
+    def generate(
+            self,
+            pixel_values: Optional[torch.FloatTensor] = None,
+            input_ids: Optional[torch.FloatTensor] = None,
+            attention_mask: Optional[torch.LongTensor] = None,
+            visual_features: Optional[torch.FloatTensor] = None,
+            generation_config: Optional[GenerationConfig] = None,
+            output_hidden_states: Optional[bool] = None,
+            **generate_kwargs,
+    ) -> torch.LongTensor:
+
+        assert self.img_context_token_id is not None
+        if pixel_values is not None:
+            if visual_features is not None:
+                vit_embeds = visual_features
+            else:
+                vit_embeds = self.extract_feature(pixel_values)
+            input_embeds = self.language_model.get_input_embeddings()(input_ids)
+            B, N, C = input_embeds.shape
+            input_embeds = input_embeds.reshape(B * N, C)
+
+            input_ids = input_ids.reshape(B * N)
+            selected = (input_ids == self.img_context_token_id)
+            assert selected.sum() != 0
+            input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
+
+            input_embeds = input_embeds.reshape(B, N, C)
+        else:
+            input_embeds = self.language_model.get_input_embeddings()(input_ids)
+
+        outputs = self.language_model.generate(
+            inputs_embeds=input_embeds,
+            attention_mask=attention_mask,
+            generation_config=generation_config,
+            output_hidden_states=output_hidden_states,
+            use_cache=True,
+            **generate_kwargs,
+        )
+
+        return outputs
+
+    @property
+    def lm_head(self):
+        return self.language_model.get_output_embeddings()
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    def get_output_embeddings(self):
+        return self.language_model.get_output_embeddings()
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
+{
+  "crop_size": 448,
+  "do_center_crop": true,
+  "do_normalize": true,
+  "do_resize": true,
+  "feature_extractor_type": "CLIPFeatureExtractor",
+  "image_mean": [
+    0.485,
+    0.456,
+    0.406
+  ],
+  "image_std": [
+    0.229,
+    0.224,
+    0.225
+  ],
+  "resample": 3,
+  "size": 448
+}
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/tokenizer_config.json ADDED
@@ -0,0 +1,280 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "151643": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "151644": {
15
+ "content": "<|im_start|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "151645": {
23
+ "content": "<|im_end|>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "151646": {
31
+ "content": "<|object_ref_start|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "151647": {
39
+ "content": "<|object_ref_end|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "151648": {
47
+ "content": "<|box_start|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "151649": {
55
+ "content": "<|box_end|>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "151650": {
63
+ "content": "<|quad_start|>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "151651": {
71
+ "content": "<|quad_end|>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": true
77
+ },
78
+ "151652": {
79
+ "content": "<|vision_start|>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "151653": {
87
+ "content": "<|vision_end|>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": true
93
+ },
94
+ "151654": {
95
+ "content": "<|vision_pad|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": true
101
+ },
102
+ "151655": {
103
+ "content": "<|image_pad|>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": true
109
+ },
110
+ "151656": {
111
+ "content": "<|video_pad|>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": true
117
+ },
118
+ "151657": {
119
+ "content": "<tool_call>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ },
126
+ "151658": {
127
+ "content": "</tool_call>",
128
+ "lstrip": false,
129
+ "normalized": false,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "151659": {
135
+ "content": "<|fim_prefix|>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "151660": {
143
+ "content": "<|fim_middle|>",
144
+ "lstrip": false,
145
+ "normalized": false,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ },
150
+ "151661": {
151
+ "content": "<|fim_suffix|>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": false
157
+ },
158
+ "151662": {
159
+ "content": "<|fim_pad|>",
160
+ "lstrip": false,
161
+ "normalized": false,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": false
165
+ },
166
+ "151663": {
167
+ "content": "<|repo_name|>",
168
+ "lstrip": false,
169
+ "normalized": false,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": false
173
+ },
174
+ "151664": {
175
+ "content": "<|file_sep|>",
176
+ "lstrip": false,
177
+ "normalized": false,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": false
181
+ },
182
+ "151665": {
183
+ "content": "<img>",
184
+ "lstrip": false,
185
+ "normalized": false,
186
+ "rstrip": false,
187
+ "single_word": false,
188
+ "special": true
189
+ },
190
+ "151666": {
191
+ "content": "</img>",
192
+ "lstrip": false,
193
+ "normalized": false,
194
+ "rstrip": false,
195
+ "single_word": false,
196
+ "special": true
197
+ },
198
+ "151667": {
199
+ "content": "<IMG_CONTEXT>",
200
+ "lstrip": false,
201
+ "normalized": false,
202
+ "rstrip": false,
203
+ "single_word": false,
204
+ "special": true
205
+ },
206
+ "151668": {
207
+ "content": "<quad>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false,
212
+ "special": true
213
+ },
214
+ "151669": {
215
+ "content": "</quad>",
216
+ "lstrip": false,
217
+ "normalized": false,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": true
221
+ },
222
+ "151670": {
223
+ "content": "<ref>",
224
+ "lstrip": false,
225
+ "normalized": false,
226
+ "rstrip": false,
227
+ "single_word": false,
228
+ "special": true
229
+ },
230
+ "151671": {
231
+ "content": "</ref>",
232
+ "lstrip": false,
233
+ "normalized": false,
234
+ "rstrip": false,
235
+ "single_word": false,
236
+ "special": true
237
+ },
238
+ "151672": {
239
+ "content": "<box>",
240
+ "lstrip": false,
241
+ "normalized": false,
242
+ "rstrip": false,
243
+ "single_word": false,
244
+ "special": true
245
+ },
246
+ "151673": {
247
+ "content": "</box>",
248
+ "lstrip": false,
249
+ "normalized": false,
250
+ "rstrip": false,
251
+ "single_word": false,
252
+ "special": true
253
+ }
254
+ },
255
+ "additional_special_tokens": [
256
+ "<|im_start|>",
257
+ "<|im_end|>",
258
+ "<|object_ref_start|>",
259
+ "<|object_ref_end|>",
260
+ "<|box_start|>",
261
+ "<|box_end|>",
262
+ "<|quad_start|>",
263
+ "<|quad_end|>",
264
+ "<|vision_start|>",
265
+ "<|vision_end|>",
266
+ "<|vision_pad|>",
267
+ "<|image_pad|>",
268
+ "<|video_pad|>"
269
+ ],
270
+ "bos_token": null,
271
+ "clean_up_tokenization_spaces": false,
272
+ "eos_token": "<|im_end|>",
273
+ "errors": "replace",
274
+ "extra_special_tokens": {},
275
+ "model_max_length": 1000000,
276
+ "pad_token": "<|endoftext|>",
277
+ "split_special_tokens": false,
278
+ "tokenizer_class": "Qwen2Tokenizer",
279
+ "unk_token": null
280
+ }
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/trainer_state.json ADDED
@@ -0,0 +1,429 @@
1
+ {
2
+ "best_global_step": 228,
3
+ "best_metric": 0.72654486,
4
+ "best_model_checkpoint": "/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228",
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 228,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.013157894736842105,
14
+ "grad_norm": 16.149745965683884,
15
+ "learning_rate": 1.3157894736842104e-08,
16
+ "loss": 1.0491504669189453,
17
+ "step": 1,
18
+ "token_acc": 0.7316017316017316
19
+ },
20
+ {
21
+ "epoch": 0.06578947368421052,
22
+ "grad_norm": 19.394274240651384,
23
+ "learning_rate": 6.578947368421052e-08,
24
+ "loss": 0.9333165884017944,
25
+ "step": 5,
26
+ "token_acc": 0.7398773006134969
27
+ },
28
+ {
29
+ "epoch": 0.13157894736842105,
30
+ "grad_norm": 16.751648834817885,
31
+ "learning_rate": 1.3157894736842104e-07,
32
+ "loss": 0.9967510223388671,
33
+ "step": 10,
34
+ "token_acc": 0.7421441774491682
35
+ },
36
+ {
37
+ "epoch": 0.19736842105263158,
38
+ "grad_norm": 17.69556265222528,
39
+ "learning_rate": 1.9736842105263157e-07,
40
+ "loss": 0.8940377235412598,
41
+ "step": 15,
42
+ "token_acc": 0.769434628975265
43
+ },
44
+ {
45
+ "epoch": 0.2631578947368421,
46
+ "grad_norm": 16.580837710216986,
47
+ "learning_rate": 2.631578947368421e-07,
48
+ "loss": 0.94066162109375,
49
+ "step": 20,
50
+ "token_acc": 0.7534121929026387
51
+ },
52
+ {
53
+ "epoch": 0.32894736842105265,
54
+ "grad_norm": 17.18358188029577,
55
+ "learning_rate": 3.2894736842105264e-07,
56
+ "loss": 0.8913036346435547,
57
+ "step": 25,
58
+ "token_acc": 0.7722672064777328
59
+ },
60
+ {
61
+ "epoch": 0.39473684210526316,
62
+ "grad_norm": 12.655760128091604,
63
+ "learning_rate": 3.9473684210526315e-07,
64
+ "loss": 0.8995906829833984,
65
+ "step": 30,
66
+ "token_acc": 0.7529095792300806
67
+ },
68
+ {
69
+ "epoch": 0.4605263157894737,
70
+ "grad_norm": 14.799338872033982,
71
+ "learning_rate": 4.6052631578947365e-07,
72
+ "loss": 0.7829002857208252,
73
+ "step": 35,
74
+ "token_acc": 0.7704918032786885
75
+ },
76
+ {
77
+ "epoch": 0.5263157894736842,
78
+ "grad_norm": 13.6883274124527,
79
+ "learning_rate": 4.999578104083306e-07,
80
+ "loss": 0.7109686374664307,
81
+ "step": 40,
82
+ "token_acc": 0.7868705035971223
83
+ },
84
+ {
85
+ "epoch": 0.5921052631578947,
86
+ "grad_norm": 11.590886715849683,
87
+ "learning_rate": 4.994833410208486e-07,
88
+ "loss": 0.8100458145141601,
89
+ "step": 45,
90
+ "token_acc": 0.7766143106457243
91
+ },
92
+ {
93
+ "epoch": 0.6578947368421053,
94
+ "grad_norm": 18.37250203045574,
95
+ "learning_rate": 4.984826693294873e-07,
96
+ "loss": 0.7702572345733643,
97
+ "step": 50,
98
+ "token_acc": 0.789568345323741
99
+ },
100
+ {
101
+ "epoch": 0.7236842105263158,
102
+ "grad_norm": 14.257346400410995,
103
+ "learning_rate": 4.969579059190548e-07,
104
+ "loss": 0.8138771057128906,
105
+ "step": 55,
106
+ "token_acc": 0.7591763652641003
107
+ },
108
+ {
109
+ "epoch": 0.7894736842105263,
110
+ "grad_norm": 15.091592377911923,
111
+ "learning_rate": 4.949122667718934e-07,
112
+ "loss": 0.7258480548858642,
113
+ "step": 60,
114
+ "token_acc": 0.7591178965224766
115
+ },
116
+ {
117
+ "epoch": 0.8552631578947368,
118
+ "grad_norm": 14.643270287439186,
119
+ "learning_rate": 4.923500664848326e-07,
120
+ "loss": 0.8208457946777343,
121
+ "step": 65,
122
+ "token_acc": 0.7617753623188406
123
+ },
124
+ {
125
+ "epoch": 0.9210526315789473,
126
+ "grad_norm": 12.10237792449664,
127
+ "learning_rate": 4.892767091689785e-07,
128
+ "loss": 0.68003249168396,
129
+ "step": 70,
130
+ "token_acc": 0.8093922651933702
131
+ },
132
+ {
133
+ "epoch": 0.9868421052631579,
134
+ "grad_norm": 12.72703845793204,
135
+ "learning_rate": 4.856986770515357e-07,
136
+ "loss": 0.7365611076354981,
137
+ "step": 75,
138
+ "token_acc": 0.7881679389312977
139
+ },
140
+ {
141
+ "epoch": 1.0,
142
+ "eval_loss": 0.7485233545303345,
143
+ "eval_runtime": 14.0742,
144
+ "eval_samples_per_second": 9.592,
145
+ "eval_steps_per_second": 1.208,
146
+ "eval_token_acc": 0.7790202342917998,
147
+ "step": 76
148
+ },
149
+ {
150
+ "epoch": 1.0526315789473684,
151
+ "grad_norm": 12.921581074218519,
152
+ "learning_rate": 4.816235168037004e-07,
153
+ "loss": 0.7654649257659912,
154
+ "step": 80,
155
+ "token_acc": 0.7855787476280834
156
+ },
157
+ {
158
+ "epoch": 1.118421052631579,
159
+ "grad_norm": 13.068943217853121,
160
+ "learning_rate": 4.770598236234616e-07,
161
+ "loss": 0.7880854606628418,
162
+ "step": 85,
163
+ "token_acc": 0.768064228367529
164
+ },
165
+ {
166
+ "epoch": 1.1842105263157894,
167
+ "grad_norm": 12.89309027582137,
168
+ "learning_rate": 4.720172231068844e-07,
169
+ "loss": 0.7307503700256348,
170
+ "step": 90,
171
+ "token_acc": 0.7837606837606838
172
+ },
173
+ {
174
+ "epoch": 1.25,
175
+ "grad_norm": 12.073433385238404,
176
+ "learning_rate": 4.6650635094610966e-07,
177
+ "loss": 0.6501222610473633,
178
+ "step": 95,
179
+ "token_acc": 0.8068181818181818
180
+ },
181
+ {
182
+ "epoch": 1.3157894736842106,
183
+ "grad_norm": 11.811777608591989,
184
+ "learning_rate": 4.605388304968914e-07,
185
+ "loss": 0.7150445461273194,
186
+ "step": 100,
187
+ "token_acc": 0.7920792079207921
188
+ },
189
+ {
190
+ "epoch": 1.381578947368421,
191
+ "grad_norm": 11.639785635682916,
192
+ "learning_rate": 4.541272482629857e-07,
193
+ "loss": 0.6845383644104004,
194
+ "step": 105,
195
+ "token_acc": 0.7807971014492754
196
+ },
197
+ {
198
+ "epoch": 1.4473684210526316,
199
+ "grad_norm": 13.527665828737845,
200
+ "learning_rate": 4.472851273490984e-07,
201
+ "loss": 0.6341064453125,
202
+ "step": 110,
203
+ "token_acc": 0.7936936936936937
204
+ },
205
+ {
206
+ "epoch": 1.513157894736842,
207
+ "grad_norm": 12.400162052083235,
208
+ "learning_rate": 4.4002689893838405e-07,
209
+ "loss": 0.6944449424743653,
210
+ "step": 115,
211
+ "token_acc": 0.7871986699916874
212
+ },
213
+ {
214
+ "epoch": 1.5789473684210527,
215
+ "grad_norm": 12.084687708268657,
216
+ "learning_rate": 4.323678718546552e-07,
217
+ "loss": 0.6916102409362793,
218
+ "step": 120,
219
+ "token_acc": 0.7841726618705036
220
+ },
221
+ {
222
+ "epoch": 1.6447368421052633,
223
+ "grad_norm": 14.339547961512208,
224
+ "learning_rate": 4.243242002734988e-07,
225
+ "loss": 0.6724244117736816,
226
+ "step": 125,
227
+ "token_acc": 0.8003731343283582
228
+ },
229
+ {
230
+ "epoch": 1.7105263157894737,
231
+ "grad_norm": 12.886115725138072,
232
+ "learning_rate": 4.159128496504053e-07,
233
+ "loss": 0.7068803787231446,
234
+ "step": 130,
235
+ "token_acc": 0.7975970425138632
236
+ },
237
+ {
238
+ "epoch": 1.776315789473684,
239
+ "grad_norm": 13.664304711583553,
240
+ "learning_rate": 4.071515609377705e-07,
241
+ "loss": 0.6755687713623046,
242
+ "step": 135,
243
+ "token_acc": 0.8097982708933718
244
+ },
245
+ {
246
+ "epoch": 1.8421052631578947,
247
+ "grad_norm": 12.648918436490476,
248
+ "learning_rate": 3.9805881316624503e-07,
249
+ "loss": 0.6957115650177002,
250
+ "step": 140,
251
+ "token_acc": 0.7955555555555556
252
+ },
253
+ {
254
+ "epoch": 1.9078947368421053,
255
+ "grad_norm": 11.176293465675545,
256
+ "learning_rate": 3.8865378446935217e-07,
257
+ "loss": 0.6958436012268067,
258
+ "step": 145,
259
+ "token_acc": 0.7841860465116279
260
+ },
261
+ {
262
+ "epoch": 1.973684210526316,
263
+ "grad_norm": 13.253986253776585,
264
+ "learning_rate": 3.78956311633581e-07,
265
+ "loss": 0.7309179306030273,
266
+ "step": 150,
267
+ "token_acc": 0.7876838235294118
268
+ },
269
+ {
270
+ "epoch": 2.0,
271
+ "eval_loss": 0.7281007766723633,
272
+ "eval_runtime": 13.8917,
273
+ "eval_samples_per_second": 9.718,
274
+ "eval_steps_per_second": 1.224,
275
+ "eval_token_acc": 0.784877529286475,
276
+ "step": 152
277
+ },
278
+ {
279
+ "epoch": 2.039473684210526,
280
+ "grad_norm": 11.735958700270654,
281
+ "learning_rate": 3.689868482592684e-07,
282
+ "loss": 0.653682279586792,
283
+ "step": 155,
284
+ "token_acc": 0.7934875749785776
285
+ },
286
+ {
287
+ "epoch": 2.1052631578947367,
288
+ "grad_norm": 11.471307891877995,
289
+ "learning_rate": 3.587664216205183e-07,
290
+ "loss": 0.644793701171875,
291
+ "step": 160,
292
+ "token_acc": 0.8068077276908924
293
+ },
294
+ {
295
+ "epoch": 2.1710526315789473,
296
+ "grad_norm": 11.843571052368723,
297
+ "learning_rate": 3.4831658831514575e-07,
298
+ "loss": 0.5835385322570801,
299
+ "step": 165,
300
+ "token_acc": 0.8140161725067385
301
+ },
302
+ {
303
+ "epoch": 2.236842105263158,
304
+ "grad_norm": 10.76496899877298,
305
+ "learning_rate": 3.376593887981886e-07,
306
+ "loss": 0.5892860412597656,
307
+ "step": 170,
308
+ "token_acc": 0.8260038240917782
309
+ },
310
+ {
311
+ "epoch": 2.3026315789473686,
312
+ "grad_norm": 13.952005550709435,
313
+ "learning_rate": 3.268173008948826e-07,
314
+ "loss": 0.6131174564361572,
315
+ "step": 175,
316
+ "token_acc": 0.8151111111111111
317
+ },
318
+ {
319
+ "epoch": 2.3684210526315788,
320
+ "grad_norm": 11.500981624672228,
321
+ "learning_rate": 3.1581319239114976e-07,
322
+ "loss": 0.6069381713867188,
323
+ "step": 180,
324
+ "token_acc": 0.8110236220472441
325
+ },
326
+ {
327
+ "epoch": 2.4342105263157894,
328
+ "grad_norm": 11.46248454566006,
329
+ "learning_rate": 3.0467027280159394e-07,
330
+ "loss": 0.6257027626037598,
331
+ "step": 185,
332
+ "token_acc": 0.8304742684157417
333
+ },
334
+ {
335
+ "epoch": 2.5,
336
+ "grad_norm": 13.002284165063047,
337
+ "learning_rate": 2.934120444167326e-07,
338
+ "loss": 0.6725513458251953,
339
+ "step": 190,
340
+ "token_acc": 0.7950469684030743
341
+ },
342
+ {
343
+ "epoch": 2.5657894736842106,
344
+ "grad_norm": 13.013905049177902,
345
+ "learning_rate": 2.820622527327158e-07,
346
+ "loss": 0.5892130851745605,
347
+ "step": 195,
348
+ "token_acc": 0.8100706713780919
349
+ },
350
+ {
351
+ "epoch": 2.6315789473684212,
352
+ "grad_norm": 11.480677786146966,
353
+ "learning_rate": 2.706448363680831e-07,
354
+ "loss": 0.5530064582824707,
355
+ "step": 200,
356
+ "token_acc": 0.8359303391384051
357
+ },
358
+ {
359
+ "epoch": 2.6973684210526314,
360
+ "grad_norm": 13.276820229615291,
361
+ "learning_rate": 2.591838765731931e-07,
362
+ "loss": 0.5892533302307129,
363
+ "step": 205,
364
+ "token_acc": 0.8258575197889182
365
+ },
366
+ {
367
+ "epoch": 2.763157894736842,
368
+ "grad_norm": 13.90433344794858,
369
+ "learning_rate": 2.477035464388184e-07,
370
+ "loss": 0.6034513950347901,
371
+ "step": 210,
372
+ "token_acc": 0.8290840415486308
373
+ },
374
+ {
375
+ "epoch": 2.8289473684210527,
376
+ "grad_norm": 12.016974173998076,
377
+ "learning_rate": 2.3622805991103358e-07,
378
+ "loss": 0.6079031467437744,
379
+ "step": 215,
380
+ "token_acc": 0.803770351328192
381
+ },
382
+ {
383
+ "epoch": 2.8947368421052633,
384
+ "grad_norm": 12.883251904886917,
385
+ "learning_rate": 2.2478162071993296e-07,
386
+ "loss": 0.6700970649719238,
387
+ "step": 220,
388
+ "token_acc": 0.801210025929127
389
+ },
390
+ {
391
+ "epoch": 2.9605263157894735,
392
+ "grad_norm": 12.808332568512581,
393
+ "learning_rate": 2.133883713298946e-07,
394
+ "loss": 0.650183916091919,
395
+ "step": 225,
396
+ "token_acc": 0.8039927404718693
397
+ },
398
+ {
399
+ "epoch": 3.0,
400
+ "eval_loss": 0.7265448570251465,
401
+ "eval_runtime": 13.707,
402
+ "eval_samples_per_second": 9.849,
403
+ "eval_steps_per_second": 1.24,
404
+ "eval_token_acc": 0.7859424920127795,
405
+ "step": 228
406
+ }
407
+ ],
408
+ "logging_steps": 5,
409
+ "max_steps": 380,
410
+ "num_input_tokens_seen": 0,
411
+ "num_train_epochs": 5,
412
+ "save_steps": 500,
413
+ "stateful_callbacks": {
414
+ "TrainerControl": {
415
+ "args": {
416
+ "should_epoch_stop": false,
417
+ "should_evaluate": false,
418
+ "should_log": false,
419
+ "should_save": true,
420
+ "should_training_stop": false
421
+ },
422
+ "attributes": {}
423
+ }
424
+ },
425
+ "total_flos": 1.670596435818578e+17,
426
+ "train_batch_size": 2,
427
+ "trial_name": null,
428
+ "trial_params": null
429
+ }
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/logging.jsonl ADDED
@@ -0,0 +1,85 @@
1
+ {"loss": 1.04915047, "grad_norm": 16.14974597, "learning_rate": 1e-08, "token_acc": 0.73160173, "epoch": 0.01315789, "global_step/max_steps": "1/380", "percentage": "0.26%", "elapsed_time": "14s", "remaining_time": "1h 30m 22s", "memory(GiB)": 44.31, "train_speed(iter/s)": 0.069898}
2
+ {"loss": 0.93331659, "grad_norm": 19.39427424, "learning_rate": 7e-08, "token_acc": 0.7398773, "epoch": 0.06578947, "global_step/max_steps": "5/380", "percentage": "1.32%", "elapsed_time": "37s", "remaining_time": "47m 29s", "memory(GiB)": 50.32, "train_speed(iter/s)": 0.131596}
3
+ {"loss": 0.99675102, "grad_norm": 16.75164883, "learning_rate": 1.3e-07, "token_acc": 0.74214418, "epoch": 0.13157895, "global_step/max_steps": "10/380", "percentage": "2.63%", "elapsed_time": "1m 7s", "remaining_time": "41m 53s", "memory(GiB)": 50.32, "train_speed(iter/s)": 0.147217}
4
+ {"loss": 0.89403772, "grad_norm": 17.69556265, "learning_rate": 2e-07, "token_acc": 0.76943463, "epoch": 0.19736842, "global_step/max_steps": "15/380", "percentage": "3.95%", "elapsed_time": "1m 35s", "remaining_time": "38m 47s", "memory(GiB)": 50.32, "train_speed(iter/s)": 0.156854}
5
+ {"loss": 0.94066162, "grad_norm": 16.58083771, "learning_rate": 2.6e-07, "token_acc": 0.75341219, "epoch": 0.26315789, "global_step/max_steps": "20/380", "percentage": "5.26%", "elapsed_time": "2m 4s", "remaining_time": "37m 12s", "memory(GiB)": 62.01, "train_speed(iter/s)": 0.161241}
6
+ {"loss": 0.89130363, "grad_norm": 17.18358188, "learning_rate": 3.3e-07, "token_acc": 0.77226721, "epoch": 0.32894737, "global_step/max_steps": "25/380", "percentage": "6.58%", "elapsed_time": "2m 32s", "remaining_time": "36m 7s", "memory(GiB)": 62.01, "train_speed(iter/s)": 0.163803}
7
+ {"loss": 0.89959068, "grad_norm": 12.65576013, "learning_rate": 3.9e-07, "token_acc": 0.75290958, "epoch": 0.39473684, "global_step/max_steps": "30/380", "percentage": "7.89%", "elapsed_time": "2m 59s", "remaining_time": "34m 55s", "memory(GiB)": 62.01, "train_speed(iter/s)": 0.167039}
8
+ {"loss": 0.78290029, "grad_norm": 14.79933887, "learning_rate": 4.6e-07, "token_acc": 0.7704918, "epoch": 0.46052632, "global_step/max_steps": "35/380", "percentage": "9.21%", "elapsed_time": "3m 26s", "remaining_time": "33m 53s", "memory(GiB)": 62.01, "train_speed(iter/s)": 0.169641}
9
+ {"loss": 0.71096864, "grad_norm": 13.68832741, "learning_rate": 5e-07, "token_acc": 0.7868705, "epoch": 0.52631579, "global_step/max_steps": "40/380", "percentage": "10.53%", "elapsed_time": "3m 54s", "remaining_time": "33m 11s", "memory(GiB)": 73.71, "train_speed(iter/s)": 0.170729}
10
+ {"loss": 0.81004581, "grad_norm": 11.59088672, "learning_rate": 5e-07, "token_acc": 0.77661431, "epoch": 0.59210526, "global_step/max_steps": "45/380", "percentage": "11.84%", "elapsed_time": "4m 21s", "remaining_time": "32m 29s", "memory(GiB)": 73.71, "train_speed(iter/s)": 0.171868}
11
+ {"loss": 0.77025723, "grad_norm": 18.37250203, "learning_rate": 5e-07, "token_acc": 0.78956835, "epoch": 0.65789474, "global_step/max_steps": "50/380", "percentage": "13.16%", "elapsed_time": "4m 49s", "remaining_time": "31m 49s", "memory(GiB)": 73.71, "train_speed(iter/s)": 0.172822}
12
+ {"loss": 0.81387711, "grad_norm": 14.2573464, "learning_rate": 5e-07, "token_acc": 0.75917637, "epoch": 0.72368421, "global_step/max_steps": "55/380", "percentage": "14.47%", "elapsed_time": "5m 16s", "remaining_time": "31m 12s", "memory(GiB)": 73.71, "train_speed(iter/s)": 0.173519}
13
+ {"loss": 0.72584805, "grad_norm": 15.09159238, "learning_rate": 4.9e-07, "token_acc": 0.7591179, "epoch": 0.78947368, "global_step/max_steps": "60/380", "percentage": "15.79%", "elapsed_time": "5m 46s", "remaining_time": "30m 45s", "memory(GiB)": 73.71, "train_speed(iter/s)": 0.173396}
14
+ {"loss": 0.82084579, "grad_norm": 14.64327029, "learning_rate": 4.9e-07, "token_acc": 0.76177536, "epoch": 0.85526316, "global_step/max_steps": "65/380", "percentage": "17.11%", "elapsed_time": "6m 13s", "remaining_time": "30m 9s", "memory(GiB)": 73.71, "train_speed(iter/s)": 0.17406}
15
+ {"loss": 0.68003249, "grad_norm": 12.10237792, "learning_rate": 4.9e-07, "token_acc": 0.80939227, "epoch": 0.92105263, "global_step/max_steps": "70/380", "percentage": "18.42%", "elapsed_time": "6m 40s", "remaining_time": "29m 35s", "memory(GiB)": 73.71, "train_speed(iter/s)": 0.174604}
16
+ {"loss": 0.73656111, "grad_norm": 12.72703846, "learning_rate": 4.9e-07, "token_acc": 0.78816794, "epoch": 0.98684211, "global_step/max_steps": "75/380", "percentage": "19.74%", "elapsed_time": "7m 7s", "remaining_time": "28m 56s", "memory(GiB)": 73.71, "train_speed(iter/s)": 0.175597}
17
+ {"eval_loss": 0.74852335, "eval_runtime": 14.0742, "eval_samples_per_second": 9.592, "eval_steps_per_second": 1.208, "eval_token_acc": 0.77902023, "epoch": 1.0, "global_step/max_steps": "76/380", "percentage": "20.00%", "elapsed_time": "7m 26s", "remaining_time": "29m 47s", "memory(GiB)": 73.71, "train_speed(iter/s)": 0.170093}
18
+ {"loss": 0.76546493, "grad_norm": 12.92158107, "learning_rate": 4.8e-07, "token_acc": 0.78557875, "epoch": 1.05263158, "global_step/max_steps": "80/380", "percentage": "21.05%", "elapsed_time": "9m 26s", "remaining_time": "35m 25s", "memory(GiB)": 73.71, "train_speed(iter/s)": 0.141158}
19
+ {"loss": 0.78808546, "grad_norm": 13.06894322, "learning_rate": 4.8e-07, "token_acc": 0.76806423, "epoch": 1.11842105, "global_step/max_steps": "85/380", "percentage": "22.37%", "elapsed_time": "9m 56s", "remaining_time": "34m 31s", "memory(GiB)": 73.71, "train_speed(iter/s)": 0.142417}
20
+ {"loss": 0.73075037, "grad_norm": 12.89309028, "learning_rate": 4.7e-07, "token_acc": 0.78376068, "epoch": 1.18421053, "global_step/max_steps": "90/380", "percentage": "23.68%", "elapsed_time": "10m 23s", "remaining_time": "33m 30s", "memory(GiB)": 73.71, "train_speed(iter/s)": 0.144254}
21
+ {"loss": 0.65012226, "grad_norm": 12.07343339, "learning_rate": 4.7e-07, "token_acc": 0.80681818, "epoch": 1.25, "global_step/max_steps": "95/380", "percentage": "25.00%", "elapsed_time": "10m 53s", "remaining_time": "32m 39s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.145462}
22
+ {"loss": 0.71504455, "grad_norm": 11.81177761, "learning_rate": 4.6e-07, "token_acc": 0.79207921, "epoch": 1.31578947, "global_step/max_steps": "100/380", "percentage": "26.32%", "elapsed_time": "11m 21s", "remaining_time": "31m 47s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.146813}
23
+ {"loss": 0.68453836, "grad_norm": 11.63978564, "learning_rate": 4.5e-07, "token_acc": 0.7807971, "epoch": 1.38157895, "global_step/max_steps": "105/380", "percentage": "27.63%", "elapsed_time": "11m 49s", "remaining_time": "30m 58s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.147973}
24
+ {"loss": 0.63410645, "grad_norm": 13.52766583, "learning_rate": 4.5e-07, "token_acc": 0.79369369, "epoch": 1.44736842, "global_step/max_steps": "110/380", "percentage": "28.95%", "elapsed_time": "12m 16s", "remaining_time": "30m 8s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.14927}
25
+ {"loss": 0.69444494, "grad_norm": 12.40016205, "learning_rate": 4.4e-07, "token_acc": 0.78719867, "epoch": 1.51315789, "global_step/max_steps": "115/380", "percentage": "30.26%", "elapsed_time": "12m 45s", "remaining_time": "29m 23s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.150274}
26
+ {"loss": 0.69161024, "grad_norm": 12.08468771, "learning_rate": 4.3e-07, "token_acc": 0.78417266, "epoch": 1.57894737, "global_step/max_steps": "120/380", "percentage": "31.58%", "elapsed_time": "13m 11s", "remaining_time": "28m 35s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.151576}
27
+ {"loss": 0.67242441, "grad_norm": 14.33954796, "learning_rate": 4.2e-07, "token_acc": 0.80037313, "epoch": 1.64473684, "global_step/max_steps": "125/380", "percentage": "32.89%", "elapsed_time": "13m 39s", "remaining_time": "27m 51s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.152574}
28
+ {"loss": 0.70688038, "grad_norm": 12.88611573, "learning_rate": 4.2e-07, "token_acc": 0.79759704, "epoch": 1.71052632, "global_step/max_steps": "130/380", "percentage": "34.21%", "elapsed_time": "14m 7s", "remaining_time": "27m 9s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.153412}
29
+ {"loss": 0.67556877, "grad_norm": 13.66430471, "learning_rate": 4.1e-07, "token_acc": 0.80979827, "epoch": 1.77631579, "global_step/max_steps": "135/380", "percentage": "35.53%", "elapsed_time": "14m 34s", "remaining_time": "26m 26s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.154392}
30
+ {"loss": 0.69571157, "grad_norm": 12.64891844, "learning_rate": 4e-07, "token_acc": 0.79555556, "epoch": 1.84210526, "global_step/max_steps": "140/380", "percentage": "36.84%", "elapsed_time": "15m 2s", "remaining_time": "25m 47s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.155104}
31
+ {"loss": 0.6958436, "grad_norm": 11.17629347, "learning_rate": 3.9e-07, "token_acc": 0.78418605, "epoch": 1.90789474, "global_step/max_steps": "145/380", "percentage": "38.16%", "elapsed_time": "15m 30s", "remaining_time": "25m 7s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.155882}
32
+ {"loss": 0.73091793, "grad_norm": 13.25398625, "learning_rate": 3.8e-07, "token_acc": 0.78768382, "epoch": 1.97368421, "global_step/max_steps": "150/380", "percentage": "39.47%", "elapsed_time": "15m 57s", "remaining_time": "24m 28s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.156611}
33
+ {"eval_loss": 0.72810078, "eval_runtime": 13.8917, "eval_samples_per_second": 9.718, "eval_steps_per_second": 1.224, "eval_token_acc": 0.78487753, "epoch": 2.0, "global_step/max_steps": "152/380", "percentage": "40.00%", "elapsed_time": "16m 22s", "remaining_time": "24m 33s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.154714}
34
+ {"loss": 0.65368228, "grad_norm": 11.7359587, "learning_rate": 3.7e-07, "token_acc": 0.79348757, "epoch": 2.03947368, "global_step/max_steps": "155/380", "percentage": "40.79%", "elapsed_time": "18m 27s", "remaining_time": "26m 47s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.139981}
35
+ {"loss": 0.6447937, "grad_norm": 11.47130789, "learning_rate": 3.6e-07, "token_acc": 0.80680773, "epoch": 2.10526316, "global_step/max_steps": "160/380", "percentage": "42.11%", "elapsed_time": "18m 56s", "remaining_time": "26m 2s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.140824}
36
+ {"loss": 0.58353853, "grad_norm": 11.84357105, "learning_rate": 3.5e-07, "token_acc": 0.81401617, "epoch": 2.17105263, "global_step/max_steps": "165/380", "percentage": "43.42%", "elapsed_time": "19m 24s", "remaining_time": "25m 17s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.141657}
37
+ {"loss": 0.58928604, "grad_norm": 10.764969, "learning_rate": 3.4e-07, "token_acc": 0.82600382, "epoch": 2.23684211, "global_step/max_steps": "170/380", "percentage": "44.74%", "elapsed_time": "19m 52s", "remaining_time": "24m 32s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.142607}
38
+ {"loss": 0.61311746, "grad_norm": 13.95200555, "learning_rate": 3.3e-07, "token_acc": 0.81511111, "epoch": 2.30263158, "global_step/max_steps": "175/380", "percentage": "46.05%", "elapsed_time": "20m 19s", "remaining_time": "23m 48s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.143488}
39
+ {"loss": 0.60693817, "grad_norm": 11.50098162, "learning_rate": 3.2e-07, "token_acc": 0.81102362, "epoch": 2.36842105, "global_step/max_steps": "180/380", "percentage": "47.37%", "elapsed_time": "20m 46s", "remaining_time": "23m 5s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.144359}
40
+ {"loss": 0.62570276, "grad_norm": 11.46248455, "learning_rate": 3e-07, "token_acc": 0.83047427, "epoch": 2.43421053, "global_step/max_steps": "185/380", "percentage": "48.68%", "elapsed_time": "21m 14s", "remaining_time": "22m 23s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.14516}
41
+ {"loss": 0.67255135, "grad_norm": 13.00228417, "learning_rate": 2.9e-07, "token_acc": 0.79504697, "epoch": 2.5, "global_step/max_steps": "190/380", "percentage": "50.00%", "elapsed_time": "21m 42s", "remaining_time": "21m 42s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.145912}
42
+ {"loss": 0.58921309, "grad_norm": 13.01390505, "learning_rate": 2.8e-07, "token_acc": 0.81007067, "epoch": 2.56578947, "global_step/max_steps": "195/380", "percentage": "51.32%", "elapsed_time": "22m 9s", "remaining_time": "21m 1s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.14667}
43
+ {"loss": 0.55300646, "grad_norm": 11.48067779, "learning_rate": 2.7e-07, "token_acc": 0.83593034, "epoch": 2.63157895, "global_step/max_steps": "200/380", "percentage": "52.63%", "elapsed_time": "22m 37s", "remaining_time": "20m 21s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.147376}
44
+ {"loss": 0.58925333, "grad_norm": 13.27682023, "learning_rate": 2.6e-07, "token_acc": 0.82585752, "epoch": 2.69736842, "global_step/max_steps": "205/380", "percentage": "53.95%", "elapsed_time": "23m 4s", "remaining_time": "19m 41s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.148115}
45
+ {"loss": 0.6034514, "grad_norm": 13.90433345, "learning_rate": 2.5e-07, "token_acc": 0.82908404, "epoch": 2.76315789, "global_step/max_steps": "210/380", "percentage": "55.26%", "elapsed_time": "23m 32s", "remaining_time": "19m 3s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.148683}
46
+ {"loss": 0.60790315, "grad_norm": 12.01697417, "learning_rate": 2.4e-07, "token_acc": 0.80377035, "epoch": 2.82894737, "global_step/max_steps": "215/380", "percentage": "56.58%", "elapsed_time": "23m 58s", "remaining_time": "18m 24s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.149437}
47
+ {"loss": 0.67009706, "grad_norm": 12.8832519, "learning_rate": 2.2e-07, "token_acc": 0.80121003, "epoch": 2.89473684, "global_step/max_steps": "220/380", "percentage": "57.89%", "elapsed_time": "24m 25s", "remaining_time": "17m 45s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.150125}
48
+ {"loss": 0.65018392, "grad_norm": 12.80833257, "learning_rate": 2.1e-07, "token_acc": 0.80399274, "epoch": 2.96052632, "global_step/max_steps": "225/380", "percentage": "59.21%", "elapsed_time": "24m 52s", "remaining_time": "17m 8s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.15071}
49
+ {"eval_loss": 0.72654486, "eval_runtime": 13.707, "eval_samples_per_second": 9.849, "eval_steps_per_second": 1.24, "eval_token_acc": 0.78594249, "epoch": 3.0, "global_step/max_steps": "228/380", "percentage": "60.00%", "elapsed_time": "25m 23s", "remaining_time": "16m 55s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.1497}
50
+ {"loss": 0.59686203, "grad_norm": 12.5998029, "learning_rate": 2e-07, "token_acc": 0.81576448, "epoch": 3.02631579, "global_step/max_steps": "230/380", "percentage": "60.53%", "elapsed_time": "27m 17s", "remaining_time": "17m 47s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.140464}
51
+ {"loss": 0.52481909, "grad_norm": 10.32396224, "learning_rate": 1.9e-07, "token_acc": 0.84875445, "epoch": 3.09210526, "global_step/max_steps": "235/380", "percentage": "61.84%", "elapsed_time": "27m 45s", "remaining_time": "17m 7s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.141063}
52
+ {"loss": 0.56462321, "grad_norm": 13.344608, "learning_rate": 1.8e-07, "token_acc": 0.82659479, "epoch": 3.15789474, "global_step/max_steps": "240/380", "percentage": "63.16%", "elapsed_time": "28m 15s", "remaining_time": "16m 28s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.141567}
53
+ {"loss": 0.52111754, "grad_norm": 11.17948974, "learning_rate": 1.7e-07, "token_acc": 0.83940972, "epoch": 3.22368421, "global_step/max_steps": "245/380", "percentage": "64.47%", "elapsed_time": "28m 41s", "remaining_time": "15m 48s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.142307}
54
+ {"loss": 0.53249068, "grad_norm": 11.28136572, "learning_rate": 1.6e-07, "token_acc": 0.84306888, "epoch": 3.28947368, "global_step/max_steps": "250/380", "percentage": "65.79%", "elapsed_time": "29m 8s", "remaining_time": "15m 9s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.142961}
55
+ {"loss": 0.55586386, "grad_norm": 11.01640688, "learning_rate": 1.5e-07, "token_acc": 0.83586338, "epoch": 3.35526316, "global_step/max_steps": "255/380", "percentage": "67.11%", "elapsed_time": "29m 36s", "remaining_time": "14m 30s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.143574}
56
+ {"loss": 0.56448097, "grad_norm": 13.14098789, "learning_rate": 1.4e-07, "token_acc": 0.82502444, "epoch": 3.42105263, "global_step/max_steps": "260/380", "percentage": "68.42%", "elapsed_time": "30m 3s", "remaining_time": "13m 52s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.144127}
57
+ {"loss": 0.59146905, "grad_norm": 12.53265712, "learning_rate": 1.3e-07, "token_acc": 0.80371353, "epoch": 3.48684211, "global_step/max_steps": "265/380", "percentage": "69.74%", "elapsed_time": "30m 31s", "remaining_time": "13m 14s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.144685}
58
+ {"loss": 0.58134708, "grad_norm": 12.54092225, "learning_rate": 1.2e-07, "token_acc": 0.81122943, "epoch": 3.55263158, "global_step/max_steps": "270/380", "percentage": "71.05%", "elapsed_time": "30m 57s", "remaining_time": "12m 36s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.145363}
59
+ {"loss": 0.52265396, "grad_norm": 11.3862044, "learning_rate": 1.1e-07, "token_acc": 0.84823091, "epoch": 3.61842105, "global_step/max_steps": "275/380", "percentage": "72.37%", "elapsed_time": "31m 24s", "remaining_time": "11m 59s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.14592}
60
+ {"loss": 0.64316354, "grad_norm": 13.16476262, "learning_rate": 1e-07, "token_acc": 0.82226212, "epoch": 3.68421053, "global_step/max_steps": "280/380", "percentage": "73.68%", "elapsed_time": "31m 52s", "remaining_time": "11m 23s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.146387}
61
+ {"loss": 0.60033703, "grad_norm": 13.70750821, "learning_rate": 9e-08, "token_acc": 0.8210252, "epoch": 3.75, "global_step/max_steps": "285/380", "percentage": "75.00%", "elapsed_time": "32m 19s", "remaining_time": "10m 46s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.146917}
62
+ {"loss": 0.59155693, "grad_norm": 16.09784354, "learning_rate": 8e-08, "token_acc": 0.82407407, "epoch": 3.81578947, "global_step/max_steps": "290/380", "percentage": "76.32%", "elapsed_time": "32m 46s", "remaining_time": "10m 10s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.147469}
63
+ {"loss": 0.46532865, "grad_norm": 11.26346719, "learning_rate": 7e-08, "token_acc": 0.84084637, "epoch": 3.88157895, "global_step/max_steps": "295/380", "percentage": "77.63%", "elapsed_time": "33m 13s", "remaining_time": "9m 34s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.148001}
64
+ {"loss": 0.52924175, "grad_norm": 12.96189546, "learning_rate": 6e-08, "token_acc": 0.84242424, "epoch": 3.94736842, "global_step/max_steps": "300/380", "percentage": "78.95%", "elapsed_time": "33m 40s", "remaining_time": "8m 58s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.148451}
65
+ {"eval_loss": 0.72927332, "eval_runtime": 13.7897, "eval_samples_per_second": 9.79, "eval_steps_per_second": 1.233, "eval_token_acc": 0.78328009, "epoch": 4.0, "global_step/max_steps": "304/380", "percentage": "80.00%", "elapsed_time": "34m 15s", "remaining_time": "8m 33s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.147892}
66
+ {"loss": 0.47034731, "grad_norm": 11.94114344, "learning_rate": 6e-08, "token_acc": 0.86036036, "epoch": 4.01315789, "global_step/max_steps": "305/380", "percentage": "80.26%", "elapsed_time": "36m 11s", "remaining_time": "8m 53s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.14048}
67
+ {"loss": 0.52430968, "grad_norm": 12.36947829, "learning_rate": 5e-08, "token_acc": 0.83421986, "epoch": 4.07894737, "global_step/max_steps": "310/380", "percentage": "81.58%", "elapsed_time": "36m 38s", "remaining_time": "8m 16s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.140994}
68
+ {"loss": 0.51603422, "grad_norm": 12.07287742, "learning_rate": 4e-08, "token_acc": 0.83608147, "epoch": 4.14473684, "global_step/max_steps": "315/380", "percentage": "82.89%", "elapsed_time": "37m 6s", "remaining_time": "7m 39s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.141504}
69
+ {"loss": 0.51714025, "grad_norm": 13.02947867, "learning_rate": 4e-08, "token_acc": 0.84266409, "epoch": 4.21052632, "global_step/max_steps": "320/380", "percentage": "84.21%", "elapsed_time": "37m 33s", "remaining_time": "7m 2s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.142011}
70
+ {"loss": 0.51301398, "grad_norm": 12.08777604, "learning_rate": 3e-08, "token_acc": 0.84384093, "epoch": 4.27631579, "global_step/max_steps": "325/380", "percentage": "85.53%", "elapsed_time": "37m 59s", "remaining_time": "6m 25s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.142584}
71
+ {"loss": 0.54592509, "grad_norm": 12.26082191, "learning_rate": 3e-08, "token_acc": 0.84087237, "epoch": 4.34210526, "global_step/max_steps": "330/380", "percentage": "86.84%", "elapsed_time": "38m 28s", "remaining_time": "5m 49s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.142964}
72
+ {"loss": 0.51755118, "grad_norm": 12.81988134, "learning_rate": 2e-08, "token_acc": 0.84494774, "epoch": 4.40789474, "global_step/max_steps": "335/380", "percentage": "88.16%", "elapsed_time": "38m 56s", "remaining_time": "5m 13s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.143401}
73
+ {"loss": 0.53464289, "grad_norm": 13.09503379, "learning_rate": 2e-08, "token_acc": 0.82877317, "epoch": 4.47368421, "global_step/max_steps": "340/380", "percentage": "89.47%", "elapsed_time": "39m 24s", "remaining_time": "4m 38s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.143775}
74
+ {"loss": 0.47082796, "grad_norm": 10.82230421, "learning_rate": 1e-08, "token_acc": 0.85964912, "epoch": 4.53947368, "global_step/max_steps": "345/380", "percentage": "90.79%", "elapsed_time": "39m 52s", "remaining_time": "4m 2s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.144178}
75
+ {"loss": 0.58271708, "grad_norm": 12.0105016, "learning_rate": 1e-08, "token_acc": 0.81669691, "epoch": 4.60526316, "global_step/max_steps": "350/380", "percentage": "92.11%", "elapsed_time": "40m 20s", "remaining_time": "3m 27s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.144595}
76
+ {"loss": 0.47942114, "grad_norm": 12.94941478, "learning_rate": 1e-08, "token_acc": 0.84791252, "epoch": 4.67105263, "global_step/max_steps": "355/380", "percentage": "93.42%", "elapsed_time": "40m 47s", "remaining_time": "2m 52s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.145038}
77
+ {"loss": 0.554074, "grad_norm": 13.89755619, "learning_rate": 0.0, "token_acc": 0.821, "epoch": 4.73684211, "global_step/max_steps": "360/380", "percentage": "94.74%", "elapsed_time": "41m 14s", "remaining_time": "2m 17s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.145469}
78
+ {"loss": 0.48682218, "grad_norm": 12.71374242, "learning_rate": 0.0, "token_acc": 0.85853659, "epoch": 4.80263158, "global_step/max_steps": "365/380", "percentage": "96.05%", "elapsed_time": "41m 42s", "remaining_time": "1m 42s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.145883}
79
+ {"loss": 0.53818045, "grad_norm": 12.30313614, "learning_rate": 0.0, "token_acc": 0.8416955, "epoch": 4.86842105, "global_step/max_steps": "370/380", "percentage": "97.37%", "elapsed_time": "42m 8s", "remaining_time": "1m 8s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.146317}
80
+ {"loss": 0.55644884, "grad_norm": 10.88836738, "learning_rate": 0.0, "token_acc": 0.83556747, "epoch": 4.93421053, "global_step/max_steps": "375/380", "percentage": "98.68%", "elapsed_time": "42m 36s", "remaining_time": "34s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.146687}
81
+ {"loss": 0.56877451, "grad_norm": 15.15540611, "learning_rate": 0.0, "token_acc": 0.84521739, "epoch": 5.0, "global_step/max_steps": "380/380", "percentage": "100.00%", "elapsed_time": "43m 4s", "remaining_time": "0s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.147038}
82
+ {"eval_loss": 0.73085946, "eval_runtime": 13.7849, "eval_samples_per_second": 9.793, "eval_steps_per_second": 1.233, "eval_token_acc": 0.78434505, "epoch": 5.0, "global_step/max_steps": "380/380", "percentage": "100.00%", "elapsed_time": "43m 18s", "remaining_time": "0s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.146257}
83
+ {"eval_loss": 0.73085946, "eval_runtime": 16.1805, "eval_samples_per_second": 8.343, "eval_steps_per_second": 1.051, "eval_token_acc": 0.78434505, "epoch": 5.0, "global_step/max_steps": "380/380", "percentage": "100.00%", "elapsed_time": "45m 17s", "remaining_time": "0s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.139834}
84
+ {"train_runtime": 2916.558, "train_samples_per_second": 2.086, "train_steps_per_second": 0.13, "total_flos": 2.788227173675172e+17, "train_loss": 0.64442084, "epoch": 5.0, "global_step/max_steps": "380/380", "percentage": "100.00%", "elapsed_time": "48m 31s", "remaining_time": "0s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.130533}
85
+ {"model_parameter_info": "InternVLChatModel: 7944.3738M Params (7612.8210M Trainable [95.8266%]), 0.0001M Buffers.", "last_model_checkpoint": "/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-380", "best_model_checkpoint": "/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228", "best_metric": 0.72654486, "global_step": 380, "log_history": [{"loss": 1.0491504669189453, "grad_norm": 16.149745965683884, "learning_rate": 1.3157894736842104e-08, "token_acc": 0.7316017316017316, "epoch": 0.013157894736842105, "step": 1}, {"loss": 0.9333165884017944, "grad_norm": 19.394274240651384, "learning_rate": 6.578947368421052e-08, "token_acc": 0.7398773006134969, "epoch": 0.06578947368421052, "step": 5}, {"loss": 0.9967510223388671, "grad_norm": 16.751648834817885, "learning_rate": 1.3157894736842104e-07, "token_acc": 0.7421441774491682, "epoch": 0.13157894736842105, "step": 10}, {"loss": 0.8940377235412598, "grad_norm": 17.69556265222528, "learning_rate": 1.9736842105263157e-07, "token_acc": 0.769434628975265, "epoch": 0.19736842105263158, "step": 15}, {"loss": 0.94066162109375, "grad_norm": 16.580837710216986, "learning_rate": 2.631578947368421e-07, "token_acc": 0.7534121929026387, "epoch": 0.2631578947368421, "step": 20}, {"loss": 0.8913036346435547, "grad_norm": 17.18358188029577, "learning_rate": 3.2894736842105264e-07, "token_acc": 0.7722672064777328, "epoch": 0.32894736842105265, "step": 25}, {"loss": 0.8995906829833984, "grad_norm": 12.655760128091604, "learning_rate": 3.9473684210526315e-07, "token_acc": 0.7529095792300806, "epoch": 0.39473684210526316, "step": 30}, {"loss": 0.7829002857208252, "grad_norm": 14.799338872033982, "learning_rate": 4.6052631578947365e-07, "token_acc": 0.7704918032786885, "epoch": 0.4605263157894737, "step": 35}, {"loss": 0.7109686374664307, "grad_norm": 13.6883274124527, "learning_rate": 4.999578104083306e-07, "token_acc": 0.7868705035971223, "epoch": 0.5263157894736842, "step": 40}, {"loss": 0.8100458145141601, "grad_norm": 11.590886715849683, "learning_rate": 4.994833410208486e-07, "token_acc": 0.7766143106457243, "epoch": 0.5921052631578947, "step": 45}, {"loss": 0.7702572345733643, "grad_norm": 18.37250203045574, "learning_rate": 4.984826693294873e-07, "token_acc": 0.789568345323741, "epoch": 0.6578947368421053, "step": 50}, {"loss": 0.8138771057128906, "grad_norm": 14.257346400410995, "learning_rate": 4.969579059190548e-07, "token_acc": 0.7591763652641003, "epoch": 0.7236842105263158, "step": 55}, {"loss": 0.7258480548858642, "grad_norm": 15.091592377911923, "learning_rate": 4.949122667718934e-07, "token_acc": 0.7591178965224766, "epoch": 0.7894736842105263, "step": 60}, {"loss": 0.8208457946777343, "grad_norm": 14.643270287439186, "learning_rate": 4.923500664848326e-07, "token_acc": 0.7617753623188406, "epoch": 0.8552631578947368, "step": 65}, {"loss": 0.68003249168396, "grad_norm": 12.10237792449664, "learning_rate": 4.892767091689785e-07, "token_acc": 0.8093922651933702, "epoch": 0.9210526315789473, "step": 70}, {"loss": 0.7365611076354981, "grad_norm": 12.72703845793204, "learning_rate": 4.856986770515357e-07, "token_acc": 0.7881679389312977, "epoch": 0.9868421052631579, "step": 75}, {"eval_loss": 0.7485233545303345, "eval_runtime": 14.0742, "eval_samples_per_second": 9.592, "eval_steps_per_second": 1.208, "eval_token_acc": 0.7790202342917998, "epoch": 1.0, "step": 76}, {"loss": 0.7654649257659912, "grad_norm": 12.921581074218519, "learning_rate": 
4.816235168037004e-07, "token_acc": 0.7855787476280834, "epoch": 1.0526315789473684, "step": 80}, {"loss": 0.7880854606628418, "grad_norm": 13.068943217853121, "learning_rate": 4.770598236234616e-07, "token_acc": 0.768064228367529, "epoch": 1.118421052631579, "step": 85}, {"loss": 0.7307503700256348, "grad_norm": 12.89309027582137, "learning_rate": 4.720172231068844e-07, "token_acc": 0.7837606837606838, "epoch": 1.1842105263157894, "step": 90}, {"loss": 0.6501222610473633, "grad_norm": 12.073433385238404, "learning_rate": 4.6650635094610966e-07, "token_acc": 0.8068181818181818, "epoch": 1.25, "step": 95}, {"loss": 0.7150445461273194, "grad_norm": 11.811777608591989, "learning_rate": 4.605388304968914e-07, "token_acc": 0.7920792079207921, "epoch": 1.3157894736842106, "step": 100}, {"loss": 0.6845383644104004, "grad_norm": 11.639785635682916, "learning_rate": 4.541272482629857e-07, "token_acc": 0.7807971014492754, "epoch": 1.381578947368421, "step": 105}, {"loss": 0.6341064453125, "grad_norm": 13.527665828737845, "learning_rate": 4.472851273490984e-07, "token_acc": 0.7936936936936937, "epoch": 1.4473684210526316, "step": 110}, {"loss": 0.6944449424743653, "grad_norm": 12.400162052083235, "learning_rate": 4.4002689893838405e-07, "token_acc": 0.7871986699916874, "epoch": 1.513157894736842, "step": 115}, {"loss": 0.6916102409362793, "grad_norm": 12.084687708268657, "learning_rate": 4.323678718546552e-07, "token_acc": 0.7841726618705036, "epoch": 1.5789473684210527, "step": 120}, {"loss": 0.6724244117736816, "grad_norm": 14.339547961512208, "learning_rate": 4.243242002734988e-07, "token_acc": 0.8003731343283582, "epoch": 1.6447368421052633, "step": 125}, {"loss": 0.7068803787231446, "grad_norm": 12.886115725138072, "learning_rate": 4.159128496504053e-07, "token_acc": 0.7975970425138632, "epoch": 1.7105263157894737, "step": 130}, {"loss": 0.6755687713623046, "grad_norm": 13.664304711583553, "learning_rate": 4.071515609377705e-07, "token_acc": 0.8097982708933718, "epoch": 1.776315789473684, "step": 135}, {"loss": 0.6957115650177002, "grad_norm": 12.648918436490476, "learning_rate": 3.9805881316624503e-07, "token_acc": 0.7955555555555556, "epoch": 1.8421052631578947, "step": 140}, {"loss": 0.6958436012268067, "grad_norm": 11.176293465675545, "learning_rate": 3.8865378446935217e-07, "token_acc": 0.7841860465116279, "epoch": 1.9078947368421053, "step": 145}, {"loss": 0.7309179306030273, "grad_norm": 13.253986253776585, "learning_rate": 3.78956311633581e-07, "token_acc": 0.7876838235294118, "epoch": 1.973684210526316, "step": 150}, {"eval_loss": 0.7281007766723633, "eval_runtime": 13.8917, "eval_samples_per_second": 9.718, "eval_steps_per_second": 1.224, "eval_token_acc": 0.784877529286475, "epoch": 2.0, "step": 152}, {"loss": 0.653682279586792, "grad_norm": 11.735958700270654, "learning_rate": 3.689868482592684e-07, "token_acc": 0.7934875749785776, "epoch": 2.039473684210526, "step": 155}, {"loss": 0.644793701171875, "grad_norm": 11.471307891877995, "learning_rate": 3.587664216205183e-07, "token_acc": 0.8068077276908924, "epoch": 2.1052631578947367, "step": 160}, {"loss": 0.5835385322570801, "grad_norm": 11.843571052368723, "learning_rate": 3.4831658831514575e-07, "token_acc": 0.8140161725067385, "epoch": 2.1710526315789473, "step": 165}, {"loss": 0.5892860412597656, "grad_norm": 10.76496899877298, "learning_rate": 3.376593887981886e-07, "token_acc": 0.8260038240917782, "epoch": 2.236842105263158, "step": 170}, {"loss": 0.6131174564361572, "grad_norm": 13.952005550709435, "learning_rate": 
3.268173008948826e-07, "token_acc": 0.8151111111111111, "epoch": 2.3026315789473686, "step": 175}, {"loss": 0.6069381713867188, "grad_norm": 11.500981624672228, "learning_rate": 3.1581319239114976e-07, "token_acc": 0.8110236220472441, "epoch": 2.3684210526315788, "step": 180}, {"loss": 0.6257027626037598, "grad_norm": 11.46248454566006, "learning_rate": 3.0467027280159394e-07, "token_acc": 0.8304742684157417, "epoch": 2.4342105263157894, "step": 185}, {"loss": 0.6725513458251953, "grad_norm": 13.002284165063047, "learning_rate": 2.934120444167326e-07, "token_acc": 0.7950469684030743, "epoch": 2.5, "step": 190}, {"loss": 0.5892130851745605, "grad_norm": 13.013905049177902, "learning_rate": 2.820622527327158e-07, "token_acc": 0.8100706713780919, "epoch": 2.5657894736842106, "step": 195}, {"loss": 0.5530064582824707, "grad_norm": 11.480677786146966, "learning_rate": 2.706448363680831e-07, "token_acc": 0.8359303391384051, "epoch": 2.6315789473684212, "step": 200}, {"loss": 0.5892533302307129, "grad_norm": 13.276820229615291, "learning_rate": 2.591838765731931e-07, "token_acc": 0.8258575197889182, "epoch": 2.6973684210526314, "step": 205}, {"loss": 0.6034513950347901, "grad_norm": 13.90433344794858, "learning_rate": 2.477035464388184e-07, "token_acc": 0.8290840415486308, "epoch": 2.763157894736842, "step": 210}, {"loss": 0.6079031467437744, "grad_norm": 12.016974173998076, "learning_rate": 2.3622805991103358e-07, "token_acc": 0.803770351328192, "epoch": 2.8289473684210527, "step": 215}, {"loss": 0.6700970649719238, "grad_norm": 12.883251904886917, "learning_rate": 2.2478162071993296e-07, "token_acc": 0.801210025929127, "epoch": 2.8947368421052633, "step": 220}, {"loss": 0.650183916091919, "grad_norm": 12.808332568512581, "learning_rate": 2.133883713298946e-07, "token_acc": 0.8039927404718693, "epoch": 2.9605263157894735, "step": 225}, {"eval_loss": 0.7265448570251465, "eval_runtime": 13.707, "eval_samples_per_second": 9.849, "eval_steps_per_second": 1.24, "eval_token_acc": 0.7859424920127795, "epoch": 3.0, "step": 228}, {"loss": 0.5968620300292968, "grad_norm": 12.599802900090912, "learning_rate": 2.0207234201906545e-07, "token_acc": 0.8157644824311491, "epoch": 3.026315789473684, "step": 230}, {"loss": 0.5248190879821777, "grad_norm": 10.323962239910601, "learning_rate": 1.9085740019546436e-07, "token_acc": 0.8487544483985765, "epoch": 3.0921052631578947, "step": 235}, {"loss": 0.564623212814331, "grad_norm": 13.344607997796906, "learning_rate": 1.7976720005660767e-07, "token_acc": 0.8265947888589398, "epoch": 3.1578947368421053, "step": 240}, {"loss": 0.5211175441741943, "grad_norm": 11.17948974251107, "learning_rate": 1.6882513269882913e-07, "token_acc": 0.8394097222222222, "epoch": 3.223684210526316, "step": 245}, {"loss": 0.5324906826019287, "grad_norm": 11.281365715436248, "learning_rate": 1.5805427678152674e-07, "token_acc": 0.8430688753269399, "epoch": 3.2894736842105265, "step": 250}, {"loss": 0.5558638572692871, "grad_norm": 11.016406883829102, "learning_rate": 1.4747734985039e-07, "token_acc": 0.8358633776091081, "epoch": 3.3552631578947367, "step": 255}, {"loss": 0.564480972290039, "grad_norm": 13.140987889542432, "learning_rate": 1.371166604222777e-07, "token_acc": 0.8250244379276638, "epoch": 3.4210526315789473, "step": 260}, {"loss": 0.5914690494537354, "grad_norm": 12.53265712228206, "learning_rate": 1.2699406093280545e-07, "token_acc": 0.8037135278514589, "epoch": 3.486842105263158, "step": 265}, {"loss": 0.5813470840454101, "grad_norm": 12.540922253147693, "learning_rate": 
1.1713090164588606e-07, "token_acc": 0.8112294288480155, "epoch": 3.5526315789473686, "step": 270}, {"loss": 0.5226539611816406, "grad_norm": 11.386204403520878, "learning_rate": 1.0754798562243344e-07, "token_acc": 0.8482309124767226, "epoch": 3.6184210526315788, "step": 275}, {"loss": 0.643163537979126, "grad_norm": 13.164762618482639, "learning_rate": 9.826552484321085e-08, "token_acc": 0.822262118491921, "epoch": 3.6842105263157894, "step": 280}, {"loss": 0.600337028503418, "grad_norm": 13.707508212457464, "learning_rate": 8.930309757836516e-08, "token_acc": 0.8210251954821894, "epoch": 3.75, "step": 285}, {"loss": 0.5915569305419922, "grad_norm": 16.097843538474795, "learning_rate": 8.067960709356478e-08, "token_acc": 0.8240740740740741, "epoch": 3.8157894736842106, "step": 290}, {"loss": 0.46532864570617677, "grad_norm": 11.263467188848725, "learning_rate": 7.241324177983399e-08, "token_acc": 0.8408463661453542, "epoch": 3.8815789473684212, "step": 295}, {"loss": 0.5292417526245117, "grad_norm": 12.961895460114615, "learning_rate": 6.452143679117964e-08, "token_acc": 0.8424242424242424, "epoch": 3.9473684210526314, "step": 300}, {"eval_loss": 0.7292733192443848, "eval_runtime": 13.7897, "eval_samples_per_second": 9.79, "eval_steps_per_second": 1.233, "eval_token_acc": 0.7832800851970181, "epoch": 4.0, "step": 304}, {"loss": 0.47034730911254885, "grad_norm": 11.941143436933976, "learning_rate": 5.702083727091977e-08, "token_acc": 0.8603603603603603, "epoch": 4.0131578947368425, "step": 305}, {"loss": 0.5243096828460694, "grad_norm": 12.369478294282793, "learning_rate": 4.992726324427901e-08, "token_acc": 0.8342198581560284, "epoch": 4.078947368421052, "step": 310}, {"loss": 0.5160342216491699, "grad_norm": 12.072877421688775, "learning_rate": 4.325567625129545e-08, "token_acc": 0.8360814742967992, "epoch": 4.144736842105263, "step": 315}, {"loss": 0.517140245437622, "grad_norm": 13.029478667761595, "learning_rate": 3.702014779041826e-08, "token_acc": 0.8426640926640927, "epoch": 4.2105263157894735, "step": 320}, {"loss": 0.5130139827728272, "grad_norm": 12.08777604241874, "learning_rate": 3.123382963935156e-08, "token_acc": 0.8438409311348206, "epoch": 4.276315789473684, "step": 325}, {"loss": 0.5459250926971435, "grad_norm": 12.260821906980192, "learning_rate": 2.5908926115744994e-08, "token_acc": 0.8408723747980614, "epoch": 4.342105263157895, "step": 330}, {"loss": 0.5175511837005615, "grad_norm": 12.81988134221774, "learning_rate": 2.1056668336235623e-08, "token_acc": 0.8449477351916377, "epoch": 4.407894736842105, "step": 335}, {"loss": 0.5346428871154785, "grad_norm": 13.095033793006467, "learning_rate": 1.6687290528135722e-08, "token_acc": 0.8287731685789939, "epoch": 4.473684210526316, "step": 340}, {"loss": 0.4708279609680176, "grad_norm": 10.822304213631957, "learning_rate": 1.2810008443726456e-08, "token_acc": 0.8596491228070176, "epoch": 4.5394736842105265, "step": 345}, {"loss": 0.5827170848846436, "grad_norm": 12.010501597635164, "learning_rate": 9.432999922687396e-09, "token_acc": 0.8166969147005445, "epoch": 4.605263157894737, "step": 350}, {"loss": 0.47942113876342773, "grad_norm": 12.949414781142634, "learning_rate": 6.563387643658075e-09, "token_acc": 0.8479125248508946, "epoch": 4.671052631578947, "step": 355}, {"loss": 0.5540740013122558, "grad_norm": 13.897556186858308, "learning_rate": 4.207224101311246e-09, "token_acc": 0.821, "epoch": 4.7368421052631575, "step": 360}, {"loss": 0.48682217597961425, "grad_norm": 12.7137424197023, "learning_rate": 
2.3694788406241894e-09, "token_acc": 0.8585365853658536, "epoch": 4.802631578947368, "step": 365}, {"loss": 0.5381804466247558, "grad_norm": 12.303136136456917, "learning_rate": 1.0540279752731252e-09, "token_acc": 0.8416955017301038, "epoch": 4.868421052631579, "step": 370}, {"loss": 0.5564488410949707, "grad_norm": 10.888367382127468, "learning_rate": 2.636460122578399e-10, "token_acc": 0.8355674709562109, "epoch": 4.934210526315789, "step": 375}, {"loss": 0.5687745094299317, "grad_norm": 15.155406114537248, "learning_rate": 0.0, "token_acc": 0.8452173913043478, "epoch": 5.0, "step": 380}, {"eval_loss": 0.7308594584465027, "eval_runtime": 13.7849, "eval_samples_per_second": 9.793, "eval_steps_per_second": 1.233, "eval_token_acc": 0.7843450479233227, "epoch": 5.0, "step": 380}, {"eval_loss": 0.7308594584465027, "eval_runtime": 16.1805, "eval_samples_per_second": 8.343, "eval_steps_per_second": 1.051, "eval_token_acc": 0.7843450479233227, "epoch": 5.0, "step": 380}, {"train_runtime": 2916.558, "train_samples_per_second": 2.086, "train_steps_per_second": 0.13, "total_flos": 2.788227173675172e+17, "train_loss": 0.6444208371011835, "epoch": 5.0, "step": 380}], "memory": 75.64453125}
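
The logging.jsonl entries above are one JSON object per line, mixing per-step training records (`loss`, `token_acc`, `global_step/max_steps`), periodic `eval_loss` records, and a final summary with the full `log_history`. As an illustrative sketch only (the local file path is an assumption; the field names are taken from the records themselves), the two curves can be pulled apart like this:

```python
import json

def load_curves(path="logging.jsonl"):  # hypothetical path, not part of the run
    """Split a swift-style logging.jsonl into (train, eval) point lists."""
    train, evals = [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rec = json.loads(line)
            # "global_step/max_steps" is a string like "270/380" in these records
            step_str = rec.get("global_step/max_steps")
            step = int(step_str.split("/")[0]) if step_str else None
            if "loss" in rec:            # per-step training record
                train.append((step, rec["loss"], rec.get("token_acc")))
            elif "eval_loss" in rec:     # periodic evaluation record
                evals.append((step, rec["eval_loss"], rec.get("eval_token_acc")))
    return train, evals

if __name__ == "__main__":
    train, evals = load_curves()
    print(f"{len(train)} train points, {len(evals)} eval points")
    if evals:
        best_step, best_loss, best_acc = min(evals, key=lambda t: t[1])
        print(f"best eval_loss {best_loss:.5f} at step {best_step} (token_acc {best_acc})")
```

On the run above this should recover the best eval_loss of 0.72654486 at step 228, matching the `best_model_checkpoint` recorded in the summary record.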
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/val_dataset.jsonl ADDED
@@ -0,0 +1,135 @@
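
Each of the 135 records below is one JSON object with a `messages` list (system/user/assistant turns, the user turn carrying an `<image>` placeholder) and an `images` list of file paths. A minimal reading sketch follows; the key names come from the records themselves, while the local path is a hypothetical assumption:

```python
import json

def iter_samples(path="val_dataset.jsonl"):  # hypothetical local path
    """Yield (system, user, assistant, image_path) tuples from records like those below."""
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            rec = json.loads(line)
            # one system / user / assistant turn per record in this dataset
            turns = {m["role"]: m["content"] for m in rec["messages"]}
            image_path = rec["images"][0]["path"] if rec.get("images") else None
            yield turns.get("system"), turns.get("user"), turns.get("assistant"), image_path

if __name__ == "__main__":
    for system, user, target, image_path in iter_samples():
        print(user)
        print("->", target)
        print("image:", image_path)
        break  # just show the first record
```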
1
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA broken-out path in the snow – never take it for granted!", "loss": null}, {"role": "assistant", "content": "雪地里的一条开辟出来的小径——永远不要认为这是理所当然的!", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2938376423.jpg"}]}
2
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA bad day on the slopes beats a good day at work. Snowboarding shower curtain.", "loss": null}, {"role": "assistant", "content": "雪坡上糟糕的一天也比工作中的好日子强。单板滑雪主题浴帘。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1555605271.jpg"}]}
3
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA badass logo for a digital marketing agency.", "loss": null}, {"role": "assistant", "content": "一个超酷的数字营销机构标志", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_518835396.jpg"}]}
4
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA statue of a suit on display.", "loss": null}, {"role": "assistant", "content": "一尊装甲服的雕像正在展出。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_81366672.jpg"}]}
5
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nInside the shed, paneling and junk, a hole in the wall that has been covered—looks like a window.", "loss": null}, {"role": "assistant", "content": "棚屋里有镶板和杂物,墙上有一个被遮盖的洞——看起来像一扇窗户。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_3826601270.jpg"}]}
6
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThree men are together with a sign.", "loss": null}, {"role": "assistant", "content": "三个人和一个标牌在一起。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/4750415524.jpg"}]}
7
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA modest concrete and brick facade renders the apartment sympathetic, despite its height.", "loss": null}, {"role": "assistant", "content": "一个简朴的混凝土和砖砌外墙让这栋公寓显得和谐,尽管它很高。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2238752144.jpg"}]}
8
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA competition and the policemen on a motorbike.", "loss": null}, {"role": "assistant", "content": "一场比赛和骑摩托车的警察。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000170636.jpg"}]}
9
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA Penrith fan proposes at the trial on the weekend.", "loss": null}, {"role": "assistant", "content": "一名彭里斯球迷在周末的体育比赛中求婚。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1162350536.jpg"}]}
10
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA newly operated cat with a funnel on his head.", "loss": null}, {"role": "assistant", "content": "一只刚做完手术、头上戴着伊丽莎白圈的猫。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_1983045687.jpg"}]}
11
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA group of hawks is perched with hunting caps on their heads.", "loss": null}, {"role": "assistant", "content": "一群鹰戴着猎帽栖息着。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000573484.jpg"}]}
12
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nCricket player plays football with another cricket player during a training session.", "loss": null}, {"role": "assistant", "content": "板球运动员在训练期间与另一位板球运动员踢足球。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_2478276218.jpg"}]}
13
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThe person is feeling very festive in one of her new collars.", "loss": null}, {"role": "assistant", "content": "她的宠物戴上了其中一个新项圈,她感到非常喜庆。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_1250524829.jpg"}]}
14
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA group of people standing together with some turkeys.", "loss": null}, {"role": "assistant", "content": "一群人站在一起,旁边有几只火鸡。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000099179.jpg"}]}
15
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nApple melting on the floor.", "loss": null}, {"role": "assistant", "content": "一个苹果在地板上融化。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_3119350094.jpg"}]}
16
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA person is observed making marks.", "loss": null}, {"role": "assistant", "content": "一个人被观察到正在写字。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/5508346028.jpg"}]}
17
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA new vision for East Hanover Street.", "loss": null}, {"role": "assistant", "content": "东汉诺威街的新视觉设计", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2903564087.jpg"}]}
18
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nCows in a field from the train.", "loss": null}, {"role": "assistant", "content": "从火车上看田野里的牛。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_2867233884.jpg"}]}
19
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA SUP or kayak rigged for fly fishing, a snag-free deck, and a cooler to stand on.", "loss": null}, {"role": "assistant", "content": "一个用于飞钓的立式桨板或皮划艇,无钩甲板,还有一个用来站立的冷却器。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_3469033103.jpg"}]}
20
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA tree with the effect of destruction, dispersion, butterfly, moth.", "loss": null}, {"role": "assistant", "content": "一棵带有破坏、分散、蝴蝶和飞蛾效果的树。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2934168610.jpg"}]}
21
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA woman cannot survive on wine alone; she also needs a camper and a dog shirt.", "loss": null}, {"role": "assistant", "content": "一个女人不能只靠葡萄酒生存;她还需要一辆露营车和一只狗。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2678975805.jpg"}]}
22
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA grandfather is someone you can look up to, no matter how tall you grow, photographic print.", "loss": null}, {"role": "assistant", "content": "祖父是一个无论你长多高都值得仰望的人。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2935502303.jpg"}]}
23
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nSubway braking on rails in front of a metropolitan city.", "loss": null}, {"role": "assistant", "content": "地铁在大都市前的铁轨上刹车", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000400152.jpg"}]}
24
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA bit of a mess, but I’m still a magical sticker.", "loss": null}, {"role": "assistant", "content": "有点乱,但我仍然是一个神奇的贴纸。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1763187881.jpg"}]}
25
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThere are people looking at the man that fell.", "loss": null}, {"role": "assistant", "content": "有人在看着那个摔倒的人。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/482882307.jpg"}]}
26
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThis is a girl on a skateboard walking her pit bull.", "loss": null}, {"role": "assistant", "content": "这是一个在滑板上的女孩遛着她的比特犬。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000379520.jpg"}]}
27
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nTwo ancient Roman soldiers with machine guns capped: when you're losing to the computer in Age of Empires and start using cheats.", "loss": null}, {"role": "assistant", "content": "两名古罗马士兵手持机关枪:当你在《帝国时代》中输给电脑并开始使用作弊码时。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2854084108.jpg"}]}
28
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA little bit dramatic graphic T-shirt.", "loss": null}, {"role": "assistant", "content": "印有“A LITTLE BIT DRAMATIC”图案的T恤。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2298487757.jpg"}]}
29
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA match ball on the pitch.", "loss": null}, {"role": "assistant", "content": "比赛用球在球场上。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_1324293325.jpg"}]}
30
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nTraveling concept around the world.", "loss": null}, {"role": "assistant", "content": "环游世界的概念", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2632725473.jpg"}]}
31
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThe man performs grooming activities.", "loss": null}, {"role": "assistant", "content": "该男子正在整理仪容。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/4322110026.jpg"}]}
32
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nPicture of a cyber woman with a fresh lemon.", "loss": null}, {"role": "assistant", "content": "一张赛博女性拿着新鲜柠檬的照片。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2368384225.jpg"}]}
33
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA footpath with flower structures lined the path, and people walked in between it.", "loss": null}, {"role": "assistant", "content": "一条两侧带有园艺花卉结构的人行道,人们在其间穿行。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2923117339.jpg"}]}
34
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThis secluded cabin in the woods has us dreaming of making an escape right now.", "loss": null}, {"role": "assistant", "content": "这间隐秘的林中小屋让我们梦想着现在就逃离一切。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_4000716874.jpg"}]}
35
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA fishing boat trawling off Constitutional Republic with seabirds following.", "loss": null}, {"role": "assistant", "content": "一艘渔船在宪政共和国附近拖网捕鱼,海鸟紧随其后。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_3310020685.jpg"}]}
36
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA restaurant or other place to eat at Bagan Hotel River View.", "loss": null}, {"role": "assistant", "content": "在蒲甘河景酒店的一家餐厅或其他用餐场所。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_834397888.jpg"}]}
37
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nIt’s always a great idea to put something at the foot of the bed!", "loss": null}, {"role": "assistant", "content": "在床尾放点东西总是个好主意!", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_3031991668.jpg"}]}
38
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThe laptop is sitting on the cushion facing the TV.", "loss": null}, {"role": "assistant", "content": "笔记本电脑放在垫子上,屏幕朝向电视。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000064834.jpg"}]}
39
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA few drinks to get started.", "loss": null}, {"role": "assistant", "content": "从先喝几杯开始。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_3230391523.jpg"}]}
40
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA group of green men and women in a circle.", "loss": null}, {"role": "assistant", "content": "一群绿色的男女围成一个圈。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_132331323.jpg"}]}
41
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA flight of stairs on the way.", "loss": null}, {"role": "assistant", "content": "路上的一段楼梯", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_101917360.jpg"}]}
42
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA lone bench sits in front of an area filled with greenery clumps and aligned as if it is the head of a class.", "loss": null}, {"role": "assistant", "content": "一张孤零零的长椅坐落在一片绿植丛前,这些绿植整齐排列,仿佛是班级的领头。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000349734.jpg"}]}
43
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nHandwritten in green ink, impressions of a kiss mouth on the back, which also shine through on the front side.", "loss": null}, {"role": "assistant", "content": "用绿色墨水手写,背面有吻痕的印记,这些印记也透到了正面。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_4203389789.jpg"}]}
44
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA clock with a concrete bird next to it.", "loss": null}, {"role": "assistant", "content": "一个时钟旁边有一只混凝土制成的鸟。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000238455.jpg"}]}
45
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA family prepares to run a rapid on the Jordan River. There are several points on the river where one can float on a raft down the river.", "loss": null}, {"role": "assistant", "content": "一个家庭准备在约旦河上穿越急流。河上有几个地方可以乘坐皮筏顺流而下。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1638667585.jpg"}]}
46
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA person with a tie on a metal rail.", "loss": null}, {"role": "assistant", "content": "一个戴领带的人靠在金属栏杆上。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000579589.jpg"}]}
47
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA girl watches a boy swing.", "loss": null}, {"role": "assistant", "content": "一个女孩看着一个男孩挥棒。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/5769959745.jpg"}]}
48
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nSand runs on the road with water.", "loss": null}, {"role": "assistant", "content": "沙子随着水在道路上流动。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1301356238.jpg"}]}
49
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA five-star bus is going down the road.", "loss": null}, {"role": "assistant", "content": "一辆五星巴士正在路上行驶。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000144298.jpg"}]}
50
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThe \"North\" in front of the B Street sign shows a good example of traveling direction.", "loss": null}, {"role": "assistant", "content": "B街标志上的“北”很好地展示了旅行方向。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_2961209760.jpg"}]}
51
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA vision for this forward-looking policy document sets out a vision for.", "loss": null}, {"role": "assistant", "content": "这份前瞻性的政策文件提出了一个愿景。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_727734346.jpg"}]}
52
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA giraffe drinks from the river near approaching elephants.", "loss": null}, {"role": "assistant", "content": "长颈鹿在河边喝水,附近有正在接近的大象。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000118401.jpg"}]}
53
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nLine of animals over the changing table.", "loss": null}, {"role": "assistant", "content": "换尿布台上方的一排动物。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_1028662478.jpg"}]}
54
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA person displays her tail on the beach.", "loss": null}, {"role": "assistant", "content": "一个人在海滩上展示她的美人鱼尾巴。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_2735079883.jpg"}]}
55
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA long-haired dog is playing with a small, blue bat.", "loss": null}, {"role": "assistant", "content": "一只长毛狗正在玩一个小的蓝色球棒。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/2375924666.jpg"}]}
56
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA group of people in the shape of lock, heart, flash mob.", "loss": null}, {"role": "assistant", "content": "一群人排成锁和心形状,进行快闪活动。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_525499828.jpg"}]}
57
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nIt's the first concrete electricity pole in Japan, or so the sign says.", "loss": null}, {"role": "assistant", "content": "这是日本的第一根混凝土电线杆,至少牌子上是这么说的。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_395581361.jpg"}]}
58
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA door that has a window with a dog behind it.", "loss": null}, {"role": "assistant", "content": "一扇有窗户的门,窗户后面有一只狗。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000539263.jpg"}]}
59
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA little imagination goes a long way.", "loss": null}, {"role": "assistant", "content": "一点想象力大有帮助。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_4169470898.jpg"}]}
60
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA good dose of local in the lobby.", "loss": null}, {"role": "assistant", "content": "大堂里充满了本地特色。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_4116608243.jpg"}]}
61
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThe computer is sitting on a desk with a mouse.", "loss": null}, {"role": "assistant", "content": "电脑放在桌子上,旁边有一个鼠标。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000502877.jpg"}]}
62
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA fan with a painted face with '19' celebrating the titles his club has won.", "loss": null}, {"role": "assistant", "content": "一位脸上涂着‘19’的球迷正在庆祝他所在俱乐部赢得的冠军。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_3548576173.jpg"}]}
63
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA photo of workers working on the clock on a tower.", "loss": null}, {"role": "assistant", "content": "一张工人在塔上的钟表上工作的照片。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000332417.jpg"}]}
64
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA person using a cell phone and some kitty knick-knacks.", "loss": null}, {"role": "assistant", "content": "一个人在使用手机,旁边有一些猫咪装饰品。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000055395.jpg"}]}
65
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nAn improperly dressed person is outside trying to stay out of the elements.", "loss": null}, {"role": "assistant", "content": "一个穿着不当的人在外面试图避开恶劣天气。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/2685139184.jpg"}]}
66
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA hand with a red pen drew a black businessman climbing up the stairs, a concept of success and career.", "loss": null}, {"role": "assistant", "content": "一只手拿着红笔画了一个正在爬楼梯的黑人商人,象征成功和事业。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_1853021108.jpg"}]}
67
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nOne man is making sure the scope is right and the other is getting ready to shoot the gun.", "loss": null}, {"role": "assistant", "content": "一个人在确保瞄准镜正确,另一个人准备开枪。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/7735129098.jpg"}]}
68
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA heart tap dancing with a big cheesy grin.", "loss": null}, {"role": "assistant", "content": "一个心形正在跳踢踏舞,带着夸张的大笑。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_3839401695.jpg"}]}
69
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA pug dressed as a film character from the movie walks along the runway.", "loss": null}, {"role": "assistant", "content": "一只打扮成电影角色的哈巴狗走在跑道上。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_4073626822.jpg"}]}
70
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA woman standing behind a podium, and an airplane figure.", "loss": null}, {"role": "assistant", "content": "一位站在讲台后面的女士和一个飞机模型。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2960077463.jpg"}]}
71
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA large-scale drawing of a \"global city.\"", "loss": null}, {"role": "assistant", "content": "一幅大规模的“全球城市”画作", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_3980987087.jpg"}]}
72
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThis hot dog is loaded with different peppers.", "loss": null}, {"role": "assistant", "content": "这个热狗加了很多种辣椒。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000497440.jpg"}]}
73
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA special theorem about tangent-tangent angles given: and are tangent to circle O.", "loss": null}, {"role": "assistant", "content": "一个关于切线-切线角的特殊定理:PT与PS是圆O的切线。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_4101914416.jpg"}]}
74
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA man with more confidence with his new crowns.", "loss": null}, {"role": "assistant", "content": "一个男人带着新的牙冠更加自信了。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_26854658.jpg"}]}
75
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA close-up of the lager and stout.", "loss": null}, {"role": "assistant", "content": "拉格啤酒和黑啤的特写", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_468095102.jpg"}]}
76
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA pair of stunning quality, check out the soft suede.", "loss": null}, {"role": "assistant", "content": "一双质量极佳的鞋子,看看这柔软的麂皮。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_1908856807.jpg"}]}
77
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA couple of kites are flying above a field.", "loss": null}, {"role": "assistant", "content": "几只风筝在田野上空飞翔。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000339705.jpg"}]}
78
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA flying kite with a curly tail mimics a Blue Angels' jet.", "loss": null}, {"role": "assistant", "content": "一只带着卷曲尾巴的风筝模仿了蓝天使飞行队的喷气式飞机。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000044478.jpg"}]}
79
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nHangin' in the car seat.", "loss": null}, {"role": "assistant", "content": "坐在汽车座椅上休息。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_3104028364.jpg"}]}
80
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA guide to ageing young barn owls.", "loss": null}, {"role": "assistant", "content": "一份关于鉴定年轻仓鸮年龄的指南", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1728337343.jpg"}]}
81
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA woman with a glove on a baseball field with another woman.", "loss": null}, {"role": "assistant", "content": "一个戴着手套的女人和另一个女人在棒球场上。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/566794440.jpg"}]}
82
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA pool of calming water at the National Museum of African American History and Culture all-over print tote bag.", "loss": null}, {"role": "assistant", "content": "国家非裔美国人历史和文化博物馆的一池平静的水印在全印花托特包上。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_3904468300.jpg"}]}
83
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA crystal-clear pool and scalloped rock in your cave, White County, TN.", "loss": null}, {"role": "assistant", "content": "田纳西州怀特县,一个洞穴中有清澈的池水和扇形岩石。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_3707245747.jpg"}]}
84
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nBound by the shadows. I like black and white better (:.", "loss": null}, {"role": "assistant", "content": "被阴影束缚。我更喜欢黑白(:。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_3074023974.jpg"}]}
85
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA big-ass rock in the ocean.", "loss": null}, {"role": "assistant", "content": "海洋中的一块巨石", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_4214355063.jpg"}]}
86
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA cast-iron skillet with eight cheater's chorizo meatballs being cooked in it.", "loss": null}, {"role": "assistant", "content": "一个铸铁煎锅里正在煎八个简易版西班牙辣香肠肉丸。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1348809314.jpg"}]}
87
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA map of the system.", "loss": null}, {"role": "assistant", "content": "一张交通系统的地图", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_3169415834.jpg"}]}
88
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nPrinted pleated blouse - reverse of the article.", "loss": null}, {"role": "assistant", "content": "印花褶皱上衣 - 衣物的背面", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1394401849.jpg"}]}
89
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA man holding a child on top of a skateboard.", "loss": null}, {"role": "assistant", "content": "一个男人扶着站在滑板上的孩子。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000414709.jpg"}]}
90
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nDog - when a song reminds you of a painful memory, but it bumps really hard, so you keep listening to it and try to ignore the flashbacks.", "loss": null}, {"role": "assistant", "content": "狗——当一首歌让你想起痛苦的回忆,但它的节奏感非常强,所以你继续听,并试图忽略那些闪回。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_4153226469.jpg"}]}
91
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nSomebody here is waiting to get somewhere else.", "loss": null}, {"role": "assistant", "content": "这里有人在等待去别的地方。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/4679771579.jpg"}]}
92
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA beautiful woman standing and carrying a baby close to her.", "loss": null}, {"role": "assistant", "content": "一位美丽的女子站着,怀里抱着一个紧贴着她的婴儿。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_425878648.jpg"}]}
93
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA bag of tricks: see page 297 of your text.", "loss": null}, {"role": "assistant", "content": "一套技巧:请参阅课本第297页。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2554611256.jpg"}]}
94
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA child causes himself not to see while next to a piece of furniture.", "loss": null}, {"role": "assistant", "content": "一个孩子站在家具旁边,用手遮住了眼睛。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/6820458661.jpg"}]}
95
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nVandalized stop sign with a sticker reminding people to stop [eating animals].", "loss": null}, {"role": "assistant", "content": "被破坏的停车标志,上面贴着一个提醒人们停止吃动物的贴纸。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000206300.jpg"}]}
96
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA zener is not like a normal diode.", "loss": null}, {"role": "assistant", "content": "齐纳二极管不像普通的二极管。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_4108174908.jpg"}]}
97
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThe water from the sink that you wash your hands in flushes into the toilet tank.", "loss": null}, {"role": "assistant", "content": "洗手池里的水(你洗手时流出的水)直接流入马桶水箱。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_3611995191.jpg"}]}
98
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA light dusting of snow covers some ferns living just outside of rooms to stay in Armidale.", "loss": null}, {"role": "assistant", "content": "一层薄雪覆盖了阿米代尔旅馆房间外的一些蕨类植物。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_98916313.jpg"}]}
99
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA peek inside the powder room.", "loss": null}, {"role": "assistant", "content": "一窥洗手间内部", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1537635414.jpg"}]}
100
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA lot goes into a good suit.", "loss": null}, {"role": "assistant", "content": "一套好西装需要很多讲究。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_2106491181.jpg"}]}
101
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA giraffe and several zebra in Tail Brush.", "loss": null}, {"role": "assistant", "content": "一只长颈鹿和几只斑马在Tail Brush。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000192217.jpg"}]}
102
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nProduce market in India protected by umbrella shades.", "loss": null}, {"role": "assistant", "content": "印度的农产品市场受到伞荫的保护。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/3864394764.jpg"}]}
103
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA giraffe with his head out of sight over a covering.", "loss": null}, {"role": "assistant", "content": "一只长颈鹿的头越过遮盖物看不见了。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000496309.jpg"}]}
104
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA distant artistic conception of zen ink landscape painting. Layers of mountains and mountains of zen ink landscape painting.", "loss": null}, {"role": "assistant", "content": "远处的禅意水墨山水画意境,层层叠叠的山峦和禅意水墨山水画的意象。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2191215683.jpg"}]}
105
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA bed or beds in a room at an English holiday home.", "loss": null}, {"role": "assistant", "content": "英国度假屋一个房间里的床铺。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_913296828.jpg"}]}
106
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA group of red and white striped tulips reminds me of the famous variety.", "loss": null}, {"role": "assistant", "content": "一簇红白条纹的郁金香让我想起了那个著名的品种。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_887742134.jpg"}]}
107
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nYoung Caucasian woman organizing a birthday, keeping a secret, or asking for silence.", "loss": null}, {"role": "assistant", "content": "一位年轻的白人女性正在组织生日派对,或在保守秘密,或要求保持安静。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1683317947.jpg"}]}
108
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nObservers sit watching jugglers with fire.", "loss": null}, {"role": "assistant", "content": "观众坐着观看玩火的杂技演员。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/111069375.jpg"}]}
109
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA collection of trees and branches across the Sucker River from its other bank. I guess I did call this a bridge!", "loss": null}, {"role": "assistant", "content": "从苏克河的另一岸看过去,是一堆树木和树枝。我想我确实把这叫做一座桥了!", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2031663003.jpg"}]}
110
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA bed or beds in a room at Destinations Inn theme rooms.", "loss": null}, {"role": "assistant", "content": "Destinations Inn 主题房间中的床。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1080498727.jpg"}]}
111
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nNo building, under tree school at village Kumb.", "loss": null}, {"role": "assistant", "content": "库姆村的学校没有建筑物,设在树下。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_2212461512.jpg"}]}
112
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThere are two people wearing plastic.", "loss": null}, {"role": "assistant", "content": "有两个人穿着塑料材质的服装。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/3391209042.jpg"}]}
113
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nBeautiful girl/model in couture dress standing by huge privet hedge on estate in Southampton, NY. Photo by Eric Striffler.", "loss": null}, {"role": "assistant", "content": "美丽的模特穿着高级定制礼服站在纽约南安普顿庄园的巨大女贞树篱旁。照片由Eric Striffler拍摄。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_3010430221.jpg"}]}
114
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThe laundry is hanging in the tilted room.", "loss": null}, {"role": "assistant", "content": "洗衣物挂在倾斜的房间里。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000279689.jpg"}]}
115
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA skater is holding out his hand and flashing a symbol.", "loss": null}, {"role": "assistant", "content": "滑板者伸出手,做了一个手势。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000107974.jpg"}]}
116
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA set of banners for the web of sport, fitness, and motivation text for sports equipment.", "loss": null}, {"role": "assistant", "content": "一组用于体育、健身和激励网站的横幅,包含体育设备的文字。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_3203677853.jpg"}]}
117
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA girl in a black tank with cargo shorts to what appears to be dancing, with several people around.", "loss": null}, {"role": "assistant", "content": "一个穿着黑色背心和工装短裤的女孩似乎在跳舞,周围有几个人。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/4879725156.jpg"}]}
118
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA bathroom has dark colored appliances and light wood.", "loss": null}, {"role": "assistant", "content": "浴室里有深色的用具和浅色的木地板。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000400596.jpg"}]}
119
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA view of the shops and passers.", "loss": null}, {"role": "assistant", "content": "商店和路人的景象", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_43578409.jpg"}]}
120
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nAnother cute kids' choir singing in church. The girl on the left did a lot of choir teaching, I think.", "loss": null}, {"role": "assistant", "content": "另一个可爱的儿童合唱团在教堂演唱。我想左边的女孩做了很多合唱教学。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_3142177036.jpg"}]}
121
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA little turkey of construction paper with the family name on it.", "loss": null}, {"role": "assistant", "content": "一只用彩纸做的小火鸡,上面写着家人的名字。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_3508197500.jpg"}]}
122
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA cheeky flip of the tail.", "loss": null}, {"role": "assistant", "content": "调皮地甩了一下尾巴。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_541197507.jpg"}]}
123
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA box in the boot... what more could a kitty ask for...", "loss": null}, {"role": "assistant", "content": "后备箱里的一个盒子……一只小猫还能要求什么呢……", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_3308376282.jpg"}]}
124
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nAn Air France plane is on the fly track.", "loss": null}, {"role": "assistant", "content": "一架法国航空的飞机在跑道上。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000287960.jpg"}]}
125
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA black-and-white cat sleeping on top of a TV with a fish on it.", "loss": null}, {"role": "assistant", "content": "一只黑白相间的猫正睡在有一条鱼的电视上面。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000251358.jpg"}]}
126
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA woman on a park bench hides her face behind her handbag. The bench is marked.", "loss": null}, {"role": "assistant", "content": "一名女子坐在公园长椅上,用手提包遮住了脸。这张长椅上有标记。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_3528349762.jpg"}]}
127
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA pair of birds in the crown of the heart tree.", "loss": null}, {"role": "assistant", "content": "一对鸟儿在心形树的树冠上。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1020101235.jpg"}]}
128
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThis looks like a craft of cutting out greenery from magazines.", "loss": null}, {"role": "assistant", "content": "这看起来像是从杂志上剪下绿色植物的手工艺。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000127167.jpg"}]}
129
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThey don't call it the land of ice and fire for nothing.", "loss": null}, {"role": "assistant", "content": "他们称这里为冰与火之国并非没有道理。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1643510681.jpg"}]}
130
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA fairy figure among the seashells and flowers.", "loss": null}, {"role": "assistant", "content": "贝壳和花朵中的仙女形象", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000577434.jpg"}]}
131
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA man is on his truck.", "loss": null}, {"role": "assistant", "content": "一个男人坐在他的卡车后部。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/4859995088.jpg"}]}
132
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nDog mom, the soul of a witch, the fire of a lioness v-neck t-shirt.", "loss": null}, {"role": "assistant", "content": "狗妈妈,女巫的灵魂,母狮的火焰 V领T恤", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2524820280.jpg"}]}
133
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA creative cold gradient line drawing cartoon fluid injection.", "loss": null}, {"role": "assistant", "content": "一幅创意的冷色调渐变线条卡通流体注射。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1778791047.jpg"}]}
134
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nHorse lamp in front of check-in desk.", "loss": null}, {"role": "assistant", "content": "登记台前的马形灯。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_1044392237.jpg"}]}
135
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA bunch of green vegetables on a table along with some literature.", "loss": null}, {"role": "assistant", "content": "桌上有一堆绿色蔬菜和一些宣传资料。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000577631.jpg"}]}
ood/ivl-8b-instruct-thinking_full_v3_ood_wd001_e10/v3-20250919-091625/checkpoint-228/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6f3949e2b25ad05905da16bc2d58aeadc9a9f1db0b0e97a04ceb63467acff1d
3
- size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46438be1a017b4129e12daa7706d8e089acb52ce21dd221bdf56b7805908f8e7
3
+ size 130
ood/ivl-8b-instruct-thinking_full_v3_ood_wd001_e10/v3-20250919-091625/checkpoint-228/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6eda756b33768639c1e1932ae00c5d43269de5705c09162bc09acdf71d3533c0
3
- size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44b5d9c1719ded08a1ad4d2ab99a52e42b832740b2f617b20fbced3b44af1cb6
3
+ size 130
ood/ivl-8b-instruct-thinking_full_v3_ood_wd001_e10/v3-20250919-091625/checkpoint-228/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:668792bf09fbb49f9c314d93f695654b11742e9f79ae00886d88610037822211
3
- size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2706c505d9b4f951bbed8fdc626de4fd07d6c4cab5c57c69911cad80ca8cd266
3
+ size 129
ood/ivl-8b-instruct-thinking_full_v3_ood_wd001_e10/v3-20250919-091625/checkpoint-228/tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f9ba4b4a6625b5047a1356f6081b641c3e4e6a4a198facbd4bef217747d1685
3
- size 11423548
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be323c4be6beb251cbbb892753b8e3795ed3cc61e6c1cd7b99248f07c0dd77d3
3
+ size 133
ood/ivl-8b-instruct-thinking_full_v3_ood_wd001_e10/v3-20250919-091625/checkpoint-228/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e24637d5664f539efa19070b97cd134b22e84f4e6886afc216787e15a1dabf33
3
- size 9105
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4008f8556423d2a54d5c5da7dff26ae095bd1dbc88171574086541a58bf550e2
3
+ size 129
ood/qwen2.5vl-7b-lora-ood-merged/model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:feda0a72bf79db5c840d893b72eae28a52c8efa4f9dbeb70919bef25be1f4203
3
- size 4968243304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a982e6dee71b0ebae179ee58d2e60d88b96c82a27afeb3b898682918322a4896
3
+ size 135
ood/qwen2.5vl-7b-lora-ood-merged/model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:87f787aa0a6de41b70622019d0945b52fc4f4727e489dbc04a16aa06abbececc
3
- size 4991495816
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f6463345d04663ff4f9afc252b987018d3c3d62aaf19a2e08f1c9af850769a5
3
+ size 135
ood/qwen2.5vl-7b-lora-ood-merged/model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7631840e9191407b17edbbbadd1415be270a24562d698d8cd259c35e0d882b9c
3
- size 4932751040
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96868afd4278fd0b65c13d19d69e75e60dd517d61f4a95923b07a36024b44a1a
3
+ size 135
ood/qwen2.5vl-7b-lora-ood-merged/model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ff7c07eaa9cb4421f500f376e445441b2d0a6ff70b4b899b3231b2ccad16000
3
- size 1691924384
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d026cc75d4103794f79ae83e5228042b6ae32e017d0c1018237f6cc38b9b6e70
3
+ size 135
ood/qwen2.5vl-7b-lora-ood-merged/tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
- size 11421896
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cedf8734fa1adb8c4ce97738255bb9ae9a474dcc57a219a61df59d32cee881c9
3
+ size 133
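The entries in these diffs are Git LFS pointer files rather than the binary payloads themselves: each pointer carries a spec version, the sha256 oid of the tracked blob, and its size in bytes. A minimal sketch of parsing such a pointer (the function name and usage are illustrative, not part of this repository):

```python
def parse_lfs_pointer(text):
    """Parse a Git LFS pointer of the form shown in these diffs.

    Expected lines:
      version https://git-lfs.github.com/spec/v1
      oid sha256:<hex digest>
      size <bytes>
    """
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return {
        "version": fields.get("version"),
        "oid": fields.get("oid", "").removeprefix("sha256:"),
        "size": int(fields.get("size", "0")),
    }

# Illustrative usage with the tokenizer.json pointer above:
pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:cedf8734fa1adb8c4ce97738255bb9ae9a474dcc57a219a61df59d32cee881c9
size 133"""
print(parse_lfs_pointer(pointer))  # {'version': ..., 'oid': 'cedf87...', 'size': 133}
```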
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@

1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abbed0e0d4e1347664a1cc0049e9a6dfc803fcfa7143008ca19b9622aa6c5682
3
+ size 4968243304
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9dcad0749899eaa609764a7a7716c5798980dd4be78a39bb4422fb0728aa52ed
3
+ size 4991495816
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b55402c2db527127c3c2ea2bb2b01836b1566bf17ac0ad434f03e51c2b0ab2a4
3
+ size 4932751040
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55e09e29d1678f8eac1b36f0a75ddcf819e2296f1b26c67f6c4828ca6151fe5d
3
+ size 1691924384
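The four shards just added are referenced by the `model.safetensors.index.json` that follows; its `weight_map` maps each tensor name to the shard that stores it (for example `lm_head.weight` lives in `model-00004-of-00004.safetensors`). A minimal sketch of resolving a tensor name to its shard, assuming the files sit together in one checkpoint directory:

```python
import json
from pathlib import Path

def shard_for_tensor(checkpoint_dir, tensor_name):
    """Look up which safetensors shard stores a given tensor, via weight_map."""
    index_path = Path(checkpoint_dir) / "model.safetensors.index.json"
    index = json.loads(index_path.read_text())
    shard = index["weight_map"][tensor_name]
    return Path(checkpoint_dir) / shard

# Illustrative usage (directory name taken from this commit):
# shard_for_tensor("ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228", "lm_head.weight")
```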
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/model.safetensors.index.json ADDED
@@ -0,0 +1,737 @@
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 848896,
4
+ "total_size": 16584333312
5
+ },
6
+ "weight_map": {
7
+ "lm_head.weight": "model-00004-of-00004.safetensors",
8
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
19
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
20
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
28
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
29
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
30
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
31
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
32
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
33
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
34
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
38
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
39
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
41
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
42
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
43
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
44
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
50
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
51
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
53
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
54
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
55
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
56
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
62
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
63
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
65
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
66
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
67
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
68
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
74
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
75
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
77
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
78
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
79
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
80
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
86
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
87
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
89
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
90
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
91
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
92
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
93
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
96
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
98
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
99
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
101
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
102
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
103
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
104
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
105
+ "model.layers.16.input_layernorm.weight": "model-00003-of-00004.safetensors",
106
+ "model.layers.16.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
107
+ "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
108
+ "model.layers.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
109
+ "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
110
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
111
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
112
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
113
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
114
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
115
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
116
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
117
+ "model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors",
118
+ "model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
119
+ "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
120
+ "model.layers.17.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
121
+ "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
122
+ "model.layers.17.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
123
+ "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
124
+ "model.layers.17.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
125
+ "model.layers.17.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
126
+ "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
127
+ "model.layers.17.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
128
+ "model.layers.17.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
129
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
130
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
131
+ "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
132
+ "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
133
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
134
+ "model.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
135
+ "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
136
+ "model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
137
+ "model.layers.18.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
138
+ "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
139
+ "model.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
140
+ "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
141
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
142
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
143
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
144
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
145
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
146
+ "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
147
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
148
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
149
+ "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
150
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
151
+ "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
152
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
153
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
154
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
155
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
156
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
157
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
158
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
159
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
160
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
161
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
162
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
163
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
164
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
165
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
166
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
167
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
168
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
169
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
170
+ "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
171
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
172
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
173
+ "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
174
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
175
+ "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
176
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
180
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
181
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
182
+ "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
183
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
184
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
185
+ "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
186
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
187
+ "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
188
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
194
+ "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
195
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
197
+ "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
198
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
199
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
200
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
201
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
202
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
204
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
205
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
206
+ "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
207
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
208
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
209
+ "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
210
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
211
+ "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
212
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
213
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
216
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
217
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
218
+ "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
219
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
220
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
221
+ "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
222
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
223
+ "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
224
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
225
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
226
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
227
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
228
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
229
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
230
+ "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
231
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
233
+ "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
234
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
235
+ "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
236
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
237
+ "model.layers.26.input_layernorm.weight": "model-00004-of-00004.safetensors",
238
+ "model.layers.26.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
239
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
240
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
241
+ "model.layers.26.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
242
+ "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
243
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
244
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
245
+ "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
246
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
247
+ "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
248
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
249
+ "model.layers.27.input_layernorm.weight": "model-00004-of-00004.safetensors",
250
+ "model.layers.27.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
251
+ "model.layers.27.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
252
+ "model.layers.27.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
253
+ "model.layers.27.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
254
+ "model.layers.27.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
255
+ "model.layers.27.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
256
+ "model.layers.27.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
257
+ "model.layers.27.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
258
+ "model.layers.27.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
259
+ "model.layers.27.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
260
+ "model.layers.27.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
261
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
262
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
263
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
264
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
265
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
266
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
267
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
268
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
269
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
270
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
271
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
272
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
273
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
274
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
275
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
276
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
277
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
278
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
279
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
280
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
281
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
282
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
283
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
284
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
285
+ "model.layers.5.input_layernorm.weight": "model-00002-of-00004.safetensors",
286
+ "model.layers.5.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
287
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
288
+ "model.layers.5.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
289
+ "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
290
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
291
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
292
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
293
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
294
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
295
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
296
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
297
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
298
+ "model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
299
+ "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
300
+ "model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
301
+ "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
302
+ "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
303
+ "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
304
+ "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
305
+ "model.layers.6.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
306
+ "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
307
+ "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
308
+ "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
309
+ "model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
310
+ "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
311
+ "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
312
+ "model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
313
+ "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
314
+ "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
315
+ "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
316
+ "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
317
+ "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
318
+ "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
319
+ "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
320
+ "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
321
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
322
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
323
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
324
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
325
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
326
+ "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
327
+ "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
328
+ "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
329
+ "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
330
+ "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
331
+ "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
332
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
333
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
334
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
335
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
336
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
337
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
338
+ "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
339
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
340
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
341
+ "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
342
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
343
+ "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
344
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
345
+ "model.norm.weight": "model-00004-of-00004.safetensors",
346
+ "visual.blocks.0.attn.proj.bias": "model-00001-of-00004.safetensors",
347
+ "visual.blocks.0.attn.proj.weight": "model-00001-of-00004.safetensors",
348
+ "visual.blocks.0.attn.qkv.bias": "model-00001-of-00004.safetensors",
349
+ "visual.blocks.0.attn.qkv.weight": "model-00001-of-00004.safetensors",
350
+ "visual.blocks.0.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
351
+ "visual.blocks.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
352
+ "visual.blocks.0.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
353
+ "visual.blocks.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
354
+ "visual.blocks.0.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
355
+ "visual.blocks.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
356
+ "visual.blocks.0.norm1.weight": "model-00001-of-00004.safetensors",
357
+ "visual.blocks.0.norm2.weight": "model-00001-of-00004.safetensors",
358
+ "visual.blocks.1.attn.proj.bias": "model-00001-of-00004.safetensors",
359
+ "visual.blocks.1.attn.proj.weight": "model-00001-of-00004.safetensors",
360
+ "visual.blocks.1.attn.qkv.bias": "model-00001-of-00004.safetensors",
361
+ "visual.blocks.1.attn.qkv.weight": "model-00001-of-00004.safetensors",
362
+ "visual.blocks.1.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
363
+ "visual.blocks.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
364
+ "visual.blocks.1.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
365
+ "visual.blocks.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
366
+ "visual.blocks.1.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
367
+ "visual.blocks.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
368
+ "visual.blocks.1.norm1.weight": "model-00001-of-00004.safetensors",
369
+ "visual.blocks.1.norm2.weight": "model-00001-of-00004.safetensors",
370
+ "visual.blocks.10.attn.proj.bias": "model-00001-of-00004.safetensors",
371
+ "visual.blocks.10.attn.proj.weight": "model-00001-of-00004.safetensors",
372
+ "visual.blocks.10.attn.qkv.bias": "model-00001-of-00004.safetensors",
373
+ "visual.blocks.10.attn.qkv.weight": "model-00001-of-00004.safetensors",
374
+ "visual.blocks.10.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
375
+ "visual.blocks.10.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
376
+ "visual.blocks.10.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
377
+ "visual.blocks.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
378
+ "visual.blocks.10.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
379
+ "visual.blocks.10.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
380
+ "visual.blocks.10.norm1.weight": "model-00001-of-00004.safetensors",
381
+ "visual.blocks.10.norm2.weight": "model-00001-of-00004.safetensors",
382
+ "visual.blocks.11.attn.proj.bias": "model-00001-of-00004.safetensors",
383
+ "visual.blocks.11.attn.proj.weight": "model-00001-of-00004.safetensors",
384
+ "visual.blocks.11.attn.qkv.bias": "model-00001-of-00004.safetensors",
385
+ "visual.blocks.11.attn.qkv.weight": "model-00001-of-00004.safetensors",
386
+ "visual.blocks.11.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
387
+ "visual.blocks.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
388
+ "visual.blocks.11.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
389
+ "visual.blocks.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
390
+ "visual.blocks.11.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
391
+ "visual.blocks.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
392
+ "visual.blocks.11.norm1.weight": "model-00001-of-00004.safetensors",
393
+ "visual.blocks.11.norm2.weight": "model-00001-of-00004.safetensors",
394
+ "visual.blocks.12.attn.proj.bias": "model-00001-of-00004.safetensors",
395
+ "visual.blocks.12.attn.proj.weight": "model-00001-of-00004.safetensors",
396
+ "visual.blocks.12.attn.qkv.bias": "model-00001-of-00004.safetensors",
397
+ "visual.blocks.12.attn.qkv.weight": "model-00001-of-00004.safetensors",
398
+ "visual.blocks.12.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
399
+ "visual.blocks.12.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
400
+ "visual.blocks.12.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
401
+ "visual.blocks.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
402
+ "visual.blocks.12.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
403
+ "visual.blocks.12.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
404
+ "visual.blocks.12.norm1.weight": "model-00001-of-00004.safetensors",
405
+ "visual.blocks.12.norm2.weight": "model-00001-of-00004.safetensors",
406
+ "visual.blocks.13.attn.proj.bias": "model-00001-of-00004.safetensors",
407
+ "visual.blocks.13.attn.proj.weight": "model-00001-of-00004.safetensors",
408
+ "visual.blocks.13.attn.qkv.bias": "model-00001-of-00004.safetensors",
409
+ "visual.blocks.13.attn.qkv.weight": "model-00001-of-00004.safetensors",
410
+ "visual.blocks.13.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
411
+ "visual.blocks.13.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
412
+ "visual.blocks.13.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
413
+ "visual.blocks.13.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
414
+ "visual.blocks.13.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
415
+ "visual.blocks.13.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
416
+ "visual.blocks.13.norm1.weight": "model-00001-of-00004.safetensors",
417
+ "visual.blocks.13.norm2.weight": "model-00001-of-00004.safetensors",
418
+ "visual.blocks.14.attn.proj.bias": "model-00001-of-00004.safetensors",
419
+ "visual.blocks.14.attn.proj.weight": "model-00001-of-00004.safetensors",
420
+ "visual.blocks.14.attn.qkv.bias": "model-00001-of-00004.safetensors",
421
+ "visual.blocks.14.attn.qkv.weight": "model-00001-of-00004.safetensors",
422
+ "visual.blocks.14.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
423
+ "visual.blocks.14.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
424
+ "visual.blocks.14.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
425
+ "visual.blocks.14.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
426
+ "visual.blocks.14.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
427
+ "visual.blocks.14.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
428
+ "visual.blocks.14.norm1.weight": "model-00001-of-00004.safetensors",
429
+ "visual.blocks.14.norm2.weight": "model-00001-of-00004.safetensors",
430
+ "visual.blocks.15.attn.proj.bias": "model-00001-of-00004.safetensors",
431
+ "visual.blocks.15.attn.proj.weight": "model-00001-of-00004.safetensors",
432
+ "visual.blocks.15.attn.qkv.bias": "model-00001-of-00004.safetensors",
433
+ "visual.blocks.15.attn.qkv.weight": "model-00001-of-00004.safetensors",
434
+ "visual.blocks.15.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
435
+ "visual.blocks.15.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
436
+ "visual.blocks.15.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
437
+ "visual.blocks.15.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
438
+ "visual.blocks.15.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
439
+ "visual.blocks.15.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
440
+ "visual.blocks.15.norm1.weight": "model-00001-of-00004.safetensors",
441
+ "visual.blocks.15.norm2.weight": "model-00001-of-00004.safetensors",
442
+ "visual.blocks.16.attn.proj.bias": "model-00001-of-00004.safetensors",
443
+ "visual.blocks.16.attn.proj.weight": "model-00001-of-00004.safetensors",
444
+ "visual.blocks.16.attn.qkv.bias": "model-00001-of-00004.safetensors",
445
+ "visual.blocks.16.attn.qkv.weight": "model-00001-of-00004.safetensors",
446
+ "visual.blocks.16.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
447
+ "visual.blocks.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
448
+ "visual.blocks.16.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
449
+ "visual.blocks.16.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
450
+ "visual.blocks.16.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
451
+ "visual.blocks.16.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
452
+ "visual.blocks.16.norm1.weight": "model-00001-of-00004.safetensors",
453
+ "visual.blocks.16.norm2.weight": "model-00001-of-00004.safetensors",
454
+ "visual.blocks.17.attn.proj.bias": "model-00001-of-00004.safetensors",
455
+ "visual.blocks.17.attn.proj.weight": "model-00001-of-00004.safetensors",
456
+ "visual.blocks.17.attn.qkv.bias": "model-00001-of-00004.safetensors",
457
+ "visual.blocks.17.attn.qkv.weight": "model-00001-of-00004.safetensors",
458
+ "visual.blocks.17.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
459
+ "visual.blocks.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
460
+ "visual.blocks.17.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
461
+ "visual.blocks.17.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
462
+ "visual.blocks.17.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
463
+ "visual.blocks.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
464
+ "visual.blocks.17.norm1.weight": "model-00001-of-00004.safetensors",
465
+ "visual.blocks.17.norm2.weight": "model-00001-of-00004.safetensors",
466
+ "visual.blocks.18.attn.proj.bias": "model-00001-of-00004.safetensors",
467
+ "visual.blocks.18.attn.proj.weight": "model-00001-of-00004.safetensors",
468
+ "visual.blocks.18.attn.qkv.bias": "model-00001-of-00004.safetensors",
469
+ "visual.blocks.18.attn.qkv.weight": "model-00001-of-00004.safetensors",
470
+ "visual.blocks.18.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
471
+ "visual.blocks.18.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
472
+ "visual.blocks.18.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
473
+ "visual.blocks.18.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
474
+ "visual.blocks.18.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
475
+ "visual.blocks.18.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
476
+ "visual.blocks.18.norm1.weight": "model-00001-of-00004.safetensors",
477
+ "visual.blocks.18.norm2.weight": "model-00001-of-00004.safetensors",
478
+ "visual.blocks.19.attn.proj.bias": "model-00001-of-00004.safetensors",
479
+ "visual.blocks.19.attn.proj.weight": "model-00001-of-00004.safetensors",
480
+ "visual.blocks.19.attn.qkv.bias": "model-00001-of-00004.safetensors",
481
+ "visual.blocks.19.attn.qkv.weight": "model-00001-of-00004.safetensors",
482
+ "visual.blocks.19.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
483
+ "visual.blocks.19.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
484
+ "visual.blocks.19.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
485
+ "visual.blocks.19.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
486
+ "visual.blocks.19.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
487
+ "visual.blocks.19.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
488
+ "visual.blocks.19.norm1.weight": "model-00001-of-00004.safetensors",
489
+ "visual.blocks.19.norm2.weight": "model-00001-of-00004.safetensors",
490
+ "visual.blocks.2.attn.proj.bias": "model-00001-of-00004.safetensors",
491
+ "visual.blocks.2.attn.proj.weight": "model-00001-of-00004.safetensors",
492
+ "visual.blocks.2.attn.qkv.bias": "model-00001-of-00004.safetensors",
493
+ "visual.blocks.2.attn.qkv.weight": "model-00001-of-00004.safetensors",
494
+ "visual.blocks.2.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
495
+ "visual.blocks.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
496
+ "visual.blocks.2.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
497
+ "visual.blocks.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
498
+ "visual.blocks.2.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
499
+ "visual.blocks.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
500
+ "visual.blocks.2.norm1.weight": "model-00001-of-00004.safetensors",
501
+ "visual.blocks.2.norm2.weight": "model-00001-of-00004.safetensors",
502
+ "visual.blocks.20.attn.proj.bias": "model-00001-of-00004.safetensors",
503
+ "visual.blocks.20.attn.proj.weight": "model-00001-of-00004.safetensors",
504
+ "visual.blocks.20.attn.qkv.bias": "model-00001-of-00004.safetensors",
505
+ "visual.blocks.20.attn.qkv.weight": "model-00001-of-00004.safetensors",
506
+ "visual.blocks.20.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
507
+ "visual.blocks.20.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
508
+ "visual.blocks.20.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
509
+ "visual.blocks.20.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
510
+ "visual.blocks.20.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
511
+ "visual.blocks.20.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
512
+ "visual.blocks.20.norm1.weight": "model-00001-of-00004.safetensors",
513
+ "visual.blocks.20.norm2.weight": "model-00001-of-00004.safetensors",
514
+ "visual.blocks.21.attn.proj.bias": "model-00001-of-00004.safetensors",
515
+ "visual.blocks.21.attn.proj.weight": "model-00001-of-00004.safetensors",
516
+ "visual.blocks.21.attn.qkv.bias": "model-00001-of-00004.safetensors",
517
+ "visual.blocks.21.attn.qkv.weight": "model-00001-of-00004.safetensors",
518
+ "visual.blocks.21.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
519
+ "visual.blocks.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
520
+ "visual.blocks.21.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
521
+ "visual.blocks.21.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
522
+ "visual.blocks.21.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
523
+ "visual.blocks.21.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
524
+ "visual.blocks.21.norm1.weight": "model-00001-of-00004.safetensors",
525
+ "visual.blocks.21.norm2.weight": "model-00001-of-00004.safetensors",
526
+ "visual.blocks.22.attn.proj.bias": "model-00001-of-00004.safetensors",
527
+ "visual.blocks.22.attn.proj.weight": "model-00001-of-00004.safetensors",
528
+ "visual.blocks.22.attn.qkv.bias": "model-00001-of-00004.safetensors",
529
+ "visual.blocks.22.attn.qkv.weight": "model-00001-of-00004.safetensors",
530
+ "visual.blocks.22.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
531
+ "visual.blocks.22.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
532
+ "visual.blocks.22.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
533
+ "visual.blocks.22.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
534
+ "visual.blocks.22.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
535
+ "visual.blocks.22.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
536
+ "visual.blocks.22.norm1.weight": "model-00001-of-00004.safetensors",
537
+ "visual.blocks.22.norm2.weight": "model-00001-of-00004.safetensors",
538
+ "visual.blocks.23.attn.proj.bias": "model-00001-of-00004.safetensors",
539
+ "visual.blocks.23.attn.proj.weight": "model-00001-of-00004.safetensors",
540
+ "visual.blocks.23.attn.qkv.bias": "model-00001-of-00004.safetensors",
541
+ "visual.blocks.23.attn.qkv.weight": "model-00001-of-00004.safetensors",
542
+ "visual.blocks.23.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
543
+ "visual.blocks.23.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
544
+ "visual.blocks.23.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
545
+ "visual.blocks.23.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
546
+ "visual.blocks.23.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
547
+ "visual.blocks.23.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
548
+ "visual.blocks.23.norm1.weight": "model-00001-of-00004.safetensors",
549
+ "visual.blocks.23.norm2.weight": "model-00001-of-00004.safetensors",
550
+ "visual.blocks.24.attn.proj.bias": "model-00001-of-00004.safetensors",
551
+ "visual.blocks.24.attn.proj.weight": "model-00001-of-00004.safetensors",
552
+ "visual.blocks.24.attn.qkv.bias": "model-00001-of-00004.safetensors",
553
+ "visual.blocks.24.attn.qkv.weight": "model-00001-of-00004.safetensors",
554
+ "visual.blocks.24.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
555
+ "visual.blocks.24.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
556
+ "visual.blocks.24.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
557
+ "visual.blocks.24.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
558
+ "visual.blocks.24.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
559
+ "visual.blocks.24.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
560
+ "visual.blocks.24.norm1.weight": "model-00001-of-00004.safetensors",
561
+ "visual.blocks.24.norm2.weight": "model-00001-of-00004.safetensors",
562
+ "visual.blocks.25.attn.proj.bias": "model-00001-of-00004.safetensors",
563
+ "visual.blocks.25.attn.proj.weight": "model-00001-of-00004.safetensors",
564
+ "visual.blocks.25.attn.qkv.bias": "model-00001-of-00004.safetensors",
565
+ "visual.blocks.25.attn.qkv.weight": "model-00001-of-00004.safetensors",
566
+ "visual.blocks.25.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
567
+ "visual.blocks.25.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
568
+ "visual.blocks.25.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
569
+ "visual.blocks.25.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
570
+ "visual.blocks.25.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
571
+ "visual.blocks.25.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
572
+ "visual.blocks.25.norm1.weight": "model-00001-of-00004.safetensors",
573
+ "visual.blocks.25.norm2.weight": "model-00001-of-00004.safetensors",
574
+ "visual.blocks.26.attn.proj.bias": "model-00001-of-00004.safetensors",
575
+ "visual.blocks.26.attn.proj.weight": "model-00001-of-00004.safetensors",
576
+ "visual.blocks.26.attn.qkv.bias": "model-00001-of-00004.safetensors",
577
+ "visual.blocks.26.attn.qkv.weight": "model-00001-of-00004.safetensors",
578
+ "visual.blocks.26.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
579
+ "visual.blocks.26.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
580
+ "visual.blocks.26.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
581
+ "visual.blocks.26.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
582
+ "visual.blocks.26.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
583
+ "visual.blocks.26.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
584
+ "visual.blocks.26.norm1.weight": "model-00001-of-00004.safetensors",
585
+ "visual.blocks.26.norm2.weight": "model-00001-of-00004.safetensors",
586
+ "visual.blocks.27.attn.proj.bias": "model-00001-of-00004.safetensors",
587
+ "visual.blocks.27.attn.proj.weight": "model-00001-of-00004.safetensors",
588
+ "visual.blocks.27.attn.qkv.bias": "model-00001-of-00004.safetensors",
589
+ "visual.blocks.27.attn.qkv.weight": "model-00001-of-00004.safetensors",
590
+ "visual.blocks.27.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
591
+ "visual.blocks.27.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
592
+ "visual.blocks.27.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
593
+ "visual.blocks.27.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
594
+ "visual.blocks.27.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
595
+ "visual.blocks.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
596
+ "visual.blocks.27.norm1.weight": "model-00001-of-00004.safetensors",
597
+ "visual.blocks.27.norm2.weight": "model-00001-of-00004.safetensors",
598
+ "visual.blocks.28.attn.proj.bias": "model-00001-of-00004.safetensors",
599
+ "visual.blocks.28.attn.proj.weight": "model-00001-of-00004.safetensors",
600
+ "visual.blocks.28.attn.qkv.bias": "model-00001-of-00004.safetensors",
601
+ "visual.blocks.28.attn.qkv.weight": "model-00001-of-00004.safetensors",
602
+ "visual.blocks.28.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
603
+ "visual.blocks.28.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
604
+ "visual.blocks.28.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
605
+ "visual.blocks.28.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
606
+ "visual.blocks.28.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
607
+ "visual.blocks.28.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
608
+ "visual.blocks.28.norm1.weight": "model-00001-of-00004.safetensors",
609
+ "visual.blocks.28.norm2.weight": "model-00001-of-00004.safetensors",
610
+ "visual.blocks.29.attn.proj.bias": "model-00001-of-00004.safetensors",
611
+ "visual.blocks.29.attn.proj.weight": "model-00001-of-00004.safetensors",
612
+ "visual.blocks.29.attn.qkv.bias": "model-00001-of-00004.safetensors",
613
+ "visual.blocks.29.attn.qkv.weight": "model-00001-of-00004.safetensors",
614
+ "visual.blocks.29.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
615
+ "visual.blocks.29.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
616
+ "visual.blocks.29.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
617
+ "visual.blocks.29.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
618
+ "visual.blocks.29.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
619
+ "visual.blocks.29.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
620
+ "visual.blocks.29.norm1.weight": "model-00001-of-00004.safetensors",
621
+ "visual.blocks.29.norm2.weight": "model-00001-of-00004.safetensors",
622
+ "visual.blocks.3.attn.proj.bias": "model-00001-of-00004.safetensors",
623
+ "visual.blocks.3.attn.proj.weight": "model-00001-of-00004.safetensors",
624
+ "visual.blocks.3.attn.qkv.bias": "model-00001-of-00004.safetensors",
625
+ "visual.blocks.3.attn.qkv.weight": "model-00001-of-00004.safetensors",
626
+ "visual.blocks.3.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
627
+ "visual.blocks.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
628
+ "visual.blocks.3.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
629
+ "visual.blocks.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
630
+ "visual.blocks.3.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
631
+ "visual.blocks.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
632
+ "visual.blocks.3.norm1.weight": "model-00001-of-00004.safetensors",
633
+ "visual.blocks.3.norm2.weight": "model-00001-of-00004.safetensors",
634
+ "visual.blocks.30.attn.proj.bias": "model-00001-of-00004.safetensors",
635
+ "visual.blocks.30.attn.proj.weight": "model-00001-of-00004.safetensors",
636
+ "visual.blocks.30.attn.qkv.bias": "model-00001-of-00004.safetensors",
637
+ "visual.blocks.30.attn.qkv.weight": "model-00001-of-00004.safetensors",
638
+ "visual.blocks.30.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
639
+ "visual.blocks.30.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
640
+ "visual.blocks.30.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
641
+ "visual.blocks.30.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
642
+ "visual.blocks.30.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
643
+ "visual.blocks.30.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
644
+ "visual.blocks.30.norm1.weight": "model-00001-of-00004.safetensors",
645
+ "visual.blocks.30.norm2.weight": "model-00001-of-00004.safetensors",
646
+ "visual.blocks.31.attn.proj.bias": "model-00001-of-00004.safetensors",
647
+ "visual.blocks.31.attn.proj.weight": "model-00001-of-00004.safetensors",
648
+ "visual.blocks.31.attn.qkv.bias": "model-00001-of-00004.safetensors",
649
+ "visual.blocks.31.attn.qkv.weight": "model-00001-of-00004.safetensors",
650
+ "visual.blocks.31.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
651
+ "visual.blocks.31.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
652
+ "visual.blocks.31.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
653
+ "visual.blocks.31.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
654
+ "visual.blocks.31.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
655
+ "visual.blocks.31.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
656
+ "visual.blocks.31.norm1.weight": "model-00001-of-00004.safetensors",
657
+ "visual.blocks.31.norm2.weight": "model-00001-of-00004.safetensors",
658
+ "visual.blocks.4.attn.proj.bias": "model-00001-of-00004.safetensors",
659
+ "visual.blocks.4.attn.proj.weight": "model-00001-of-00004.safetensors",
660
+ "visual.blocks.4.attn.qkv.bias": "model-00001-of-00004.safetensors",
661
+ "visual.blocks.4.attn.qkv.weight": "model-00001-of-00004.safetensors",
662
+ "visual.blocks.4.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
663
+ "visual.blocks.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
664
+ "visual.blocks.4.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
665
+ "visual.blocks.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
666
+ "visual.blocks.4.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
667
+ "visual.blocks.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
668
+ "visual.blocks.4.norm1.weight": "model-00001-of-00004.safetensors",
669
+ "visual.blocks.4.norm2.weight": "model-00001-of-00004.safetensors",
670
+ "visual.blocks.5.attn.proj.bias": "model-00001-of-00004.safetensors",
671
+ "visual.blocks.5.attn.proj.weight": "model-00001-of-00004.safetensors",
672
+ "visual.blocks.5.attn.qkv.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.5.attn.qkv.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.5.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.5.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.5.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.5.norm1.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.5.norm2.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.6.attn.proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.6.attn.proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.6.attn.qkv.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.6.attn.qkv.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.6.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.6.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.6.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.6.norm1.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.6.norm2.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.7.attn.proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.7.attn.proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.7.attn.qkv.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.7.attn.qkv.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.7.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.7.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.7.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.7.norm1.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.7.norm2.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.8.attn.proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.8.attn.proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.8.attn.qkv.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.8.attn.qkv.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.8.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.8.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.8.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.8.norm1.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.8.norm2.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.9.attn.proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.9.attn.proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.9.attn.qkv.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.9.attn.qkv.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.9.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.9.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.9.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.9.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.9.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.9.norm1.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.9.norm2.weight": "model-00001-of-00004.safetensors",
+ "visual.merger.ln_q.weight": "model-00001-of-00004.safetensors",
+ "visual.merger.mlp.0.bias": "model-00001-of-00004.safetensors",
+ "visual.merger.mlp.0.weight": "model-00001-of-00004.safetensors",
+ "visual.merger.mlp.2.bias": "model-00001-of-00004.safetensors",
+ "visual.merger.mlp.2.weight": "model-00001-of-00004.safetensors",
+ "visual.patch_embed.proj.weight": "model-00001-of-00004.safetensors"
+ }
+ }
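The weight_map above places every remaining visual-tower tensor of this Qwen2.5-VL checkpoint in the first of the four safetensors shards. A minimal sketch of inspecting that index, assuming the checkpoint directory from this commit is available locally (the path is a placeholder); `from_pretrained` resolves the shards from this same index automatically.

```python
import json
from collections import Counter

# Placeholder local path to the checkpoint directory added in this commit.
CKPT = "ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228"

# model.safetensors.index.json maps each parameter name to the shard that stores it.
with open(f"{CKPT}/model.safetensors.index.json") as f:
    index = json.load(f)

shard_counts = Counter(index["weight_map"].values())
for shard, n_params in sorted(shard_counts.items()):
    print(f"{shard}: {n_params} tensors")

# Transformers reads the same index when loading, so the sharding stays transparent, e.g.:
# from transformers import Qwen2_5_VLForConditionalGeneration
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(CKPT, torch_dtype="auto")
```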
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "min_pixels": 3136,
+ "max_pixels": 12845056,
+ "patch_size": 14,
+ "temporal_patch_size": 2,
+ "merge_size": 2,
+ "image_mean": [
+ 0.48145466,
+ 0.4578275,
+ 0.40821073
+ ],
+ "image_std": [
+ 0.26862954,
+ 0.26130258,
+ 0.27577711
+ ],
+ "image_processor_type": "Qwen2VLImageProcessor",
+ "processor_class": "Qwen2_5_VLProcessor"
+ }
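min_pixels and max_pixels bound the dynamic-resolution resizing of the Qwen2VL image processor: each image is rescaled so its area stays between 3,136 and 12,845,056 pixels while remaining aligned to the 14-pixel patch grid (merged 2x2 before the language model). A sketch of exercising that config, assuming a transformers release with Qwen2.5-VL support; the checkpoint path and image are placeholders.

```python
from PIL import Image
from transformers import AutoProcessor

CKPT = "ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228"  # placeholder local path

# The processor picks up min_pixels/max_pixels from preprocessor_config.json;
# they can also be overridden explicitly, as done here.
processor = AutoProcessor.from_pretrained(CKPT, min_pixels=3136, max_pixels=12845056)

image = Image.open("example.jpg")  # placeholder image
out = processor.image_processor(images=[image], return_tensors="pt")
print(out["pixel_values"].shape)  # flattened 14x14 patches after dynamic resizing
print(out["image_grid_thw"])      # (t, h, w) patch grid per image
```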
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:08c830b56eaefdd9b372fae4488e1ba93148ecf58aa2ed05c8103ab7afe964ee
+ size 15365
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b6f3949e2b25ad05905da16bc2d58aeadc9a9f1db0b0e97a04ceb63467acff1d
+ size 15429
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5c3812c5e7f13c9d32f97d83cd625428061b1c9486e869caffd453d83ac07ceb
+ size 15429
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6eda756b33768639c1e1932ae00c5d43269de5705c09162bc09acdf71d3533c0
+ size 15429
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:668792bf09fbb49f9c314d93f695654b11742e9f79ae00886d88610037822211
+ size 1465
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "eos_token": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
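This map pins <|im_end|> as the EOS token and <|endoftext|> as the PAD token. A quick sanity check, assuming the tokenizer files from this checkpoint are available at the placeholder path used above:

```python
from transformers import AutoTokenizer

CKPT = "ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228"  # placeholder local path
tok = AutoTokenizer.from_pretrained(CKPT)

# special_tokens_map.json pins <|im_end|> as EOS and <|endoftext|> as PAD.
assert tok.eos_token == "<|im_end|>"
assert tok.pad_token == "<|endoftext|>"
print(tok.eos_token_id, tok.pad_token_id)  # expected 151645 and 151643 per tokenizer_config.json
```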
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+ size 11421896
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
+ {
+ "add_bos_token": false,
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "151643": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151644": {
+ "content": "<|im_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151645": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151646": {
+ "content": "<|object_ref_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151647": {
+ "content": "<|object_ref_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151648": {
+ "content": "<|box_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151649": {
+ "content": "<|box_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151650": {
+ "content": "<|quad_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151651": {
+ "content": "<|quad_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151652": {
+ "content": "<|vision_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151653": {
+ "content": "<|vision_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151654": {
+ "content": "<|vision_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151655": {
+ "content": "<|image_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151656": {
+ "content": "<|video_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151657": {
+ "content": "<tool_call>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151658": {
+ "content": "</tool_call>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151659": {
+ "content": "<|fim_prefix|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151660": {
+ "content": "<|fim_middle|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151661": {
+ "content": "<|fim_suffix|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151662": {
+ "content": "<|fim_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151663": {
+ "content": "<|repo_name|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151664": {
+ "content": "<|file_sep|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ }
+ },
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|im_end|>",
+ "errors": "replace",
+ "extra_special_tokens": {},
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "processor_class": "Qwen2_5_VLProcessor",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+ }
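added_tokens_decoder enumerates the Qwen2 control and vision tokens (ids 151643 through 151664); only the chat and vision markers are flagged as special, while the tool-call and FIM tokens remain ordinary vocabulary entries. A sketch, under the same placeholder-path assumption as above, confirming that the vision markers stay atomic during encoding:

```python
from transformers import AutoTokenizer

CKPT = "ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228"  # placeholder local path
tok = AutoTokenizer.from_pretrained(CKPT)

# Added special tokens are never split by the BPE merges, so each marker encodes to one id.
for marker in ["<|vision_start|>", "<|image_pad|>", "<|vision_end|>"]:
    ids = tok.encode(marker, add_special_tokens=False)
    print(marker, ids)  # expected: a single id in the 151652-151655 range

print(tok.model_max_length)  # 131072, as declared in tokenizer_config.json
```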
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/trainer_state.json ADDED
@@ -0,0 +1,429 @@
+ {
+ "best_global_step": 152,
+ "best_metric": 0.75577664,
+ "best_model_checkpoint": "/mnt/data/users/liamding/data/MMMT/lora/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10/v0-20250918-203744/checkpoint-152",
+ "epoch": 3.0,
+ "eval_steps": 500,
+ "global_step": 228,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.013157894736842105,
+ "grad_norm": 12.42494369838121,
+ "learning_rate": 2.6315789473684208e-08,
+ "loss": 1.5432628393173218,
+ "step": 1,
+ "token_acc": 0.6487418831168831
+ },
+ {
+ "epoch": 0.06578947368421052,
+ "grad_norm": 12.47365246922611,
+ "learning_rate": 1.3157894736842104e-07,
+ "loss": 1.5321896076202393,
+ "step": 5,
+ "token_acc": 0.6431663266829195
+ },
+ {
+ "epoch": 0.13157894736842105,
+ "grad_norm": 12.047748548567757,
+ "learning_rate": 2.631578947368421e-07,
+ "loss": 1.506991195678711,
+ "step": 10,
+ "token_acc": 0.6522582993004731
+ },
+ {
+ "epoch": 0.19736842105263158,
+ "grad_norm": 12.014778540391298,
+ "learning_rate": 3.9473684210526315e-07,
+ "loss": 1.4956873893737792,
+ "step": 15,
+ "token_acc": 0.6490658614832987
+ },
+ {
+ "epoch": 0.2631578947368421,
+ "grad_norm": 10.195113538835848,
+ "learning_rate": 5.263157894736842e-07,
+ "loss": 1.4674793243408204,
+ "step": 20,
+ "token_acc": 0.6513630945514669
+ },
+ {
+ "epoch": 0.32894736842105265,
+ "grad_norm": 10.285205984290517,
+ "learning_rate": 6.578947368421053e-07,
+ "loss": 1.3990056037902832,
+ "step": 25,
+ "token_acc": 0.6637606498910243
+ },
+ {
+ "epoch": 0.39473684210526316,
+ "grad_norm": 7.536123061987423,
+ "learning_rate": 7.894736842105263e-07,
+ "loss": 1.2762855529785155,
+ "step": 30,
+ "token_acc": 0.6822537218878683
+ },
+ {
+ "epoch": 0.4605263157894737,
+ "grad_norm": 6.843506846280123,
+ "learning_rate": 9.210526315789473e-07,
+ "loss": 1.1991205215454102,
+ "step": 35,
+ "token_acc": 0.6923714759535655
+ },
+ {
+ "epoch": 0.5263157894736842,
+ "grad_norm": 5.130992585179559,
+ "learning_rate": 1.0526315789473683e-06,
+ "loss": 1.0460807800292968,
+ "step": 40,
+ "token_acc": 0.72837884815945
+ },
+ {
+ "epoch": 0.5921052631578947,
+ "grad_norm": 3.9730546867292427,
+ "learning_rate": 1.1842105263157892e-06,
+ "loss": 0.9720072746276855,
+ "step": 45,
+ "token_acc": 0.741999770053271
+ },
+ {
+ "epoch": 0.6578947368421053,
+ "grad_norm": 4.1751054772178895,
+ "learning_rate": 1.3157894736842106e-06,
+ "loss": 0.8988618850708008,
+ "step": 50,
+ "token_acc": 0.7569205505161088
+ },
+ {
+ "epoch": 0.7236842105263158,
+ "grad_norm": 3.5074503427399657,
+ "learning_rate": 1.4473684210526317e-06,
+ "loss": 0.8683221817016602,
+ "step": 55,
+ "token_acc": 0.7598684210526315
+ },
+ {
+ "epoch": 0.7894736842105263,
+ "grad_norm": 3.402758348361785,
+ "learning_rate": 1.5789473684210526e-06,
+ "loss": 0.839161491394043,
+ "step": 60,
+ "token_acc": 0.7665720617394347
+ },
+ {
+ "epoch": 0.8552631578947368,
+ "grad_norm": 3.3488105431372595,
+ "learning_rate": 1.7105263157894735e-06,
+ "loss": 0.8634527206420899,
+ "step": 65,
+ "token_acc": 0.7606793413411821
+ },
+ {
+ "epoch": 0.9210526315789473,
+ "grad_norm": 3.365459865949832,
+ "learning_rate": 1.8421052631578946e-06,
+ "loss": 0.8623040199279786,
+ "step": 70,
+ "token_acc": 0.7610415546508948
+ },
+ {
+ "epoch": 0.9868421052631579,
+ "grad_norm": 3.228725710683044,
+ "learning_rate": 1.9736842105263157e-06,
+ "loss": 0.8207452774047852,
+ "step": 75,
+ "token_acc": 0.7709683833764168
+ },
+ {
+ "epoch": 1.0,
+ "eval_loss": 0.7950107455253601,
+ "eval_runtime": 11.1516,
+ "eval_samples_per_second": 12.106,
+ "eval_steps_per_second": 1.524,
+ "eval_token_acc": 0.7745666068883065,
+ "step": 76
+ },
+ {
+ "epoch": 1.0526315789473684,
+ "grad_norm": 3.061781454836411,
+ "learning_rate": 1.9998312416333223e-06,
+ "loss": 0.7909780979156494,
+ "step": 80,
+ "token_acc": 0.7769812355545109
+ },
+ {
+ "epoch": 1.118421052631579,
+ "grad_norm": 2.9959739226060877,
+ "learning_rate": 1.999145758387301e-06,
+ "loss": 0.8172050476074219,
+ "step": 85,
+ "token_acc": 0.7715258754719775
+ },
+ {
+ "epoch": 1.1842105263157894,
+ "grad_norm": 3.093664256799247,
+ "learning_rate": 1.9979333640833945e-06,
+ "loss": 0.7659732818603515,
+ "step": 90,
+ "token_acc": 0.7813097981020815
+ },
+ {
+ "epoch": 1.25,
+ "grad_norm": 2.9007288102242987,
+ "learning_rate": 1.9961946980917456e-06,
+ "loss": 0.7576296329498291,
+ "step": 95,
+ "token_acc": 0.7848722591027962
+ },
+ {
+ "epoch": 1.3157894736842106,
+ "grad_norm": 3.08974622770043,
+ "learning_rate": 1.9939306773179494e-06,
+ "loss": 0.7871574878692627,
+ "step": 100,
+ "token_acc": 0.7752208275220828
+ },
+ {
+ "epoch": 1.381578947368421,
+ "grad_norm": 2.9410249033476967,
+ "learning_rate": 1.9911424957195155e-06,
+ "loss": 0.7703839778900147,
+ "step": 105,
+ "token_acc": 0.7800666232721982
+ },
+ {
+ "epoch": 1.4473684210526316,
+ "grad_norm": 3.412089039840561,
+ "learning_rate": 1.9878316236762193e-06,
+ "loss": 0.7863829135894775,
+ "step": 110,
+ "token_acc": 0.7759826005903371
+ },
+ {
+ "epoch": 1.513157894736842,
+ "grad_norm": 3.185001921714578,
+ "learning_rate": 1.98399980721468e-06,
+ "loss": 0.7711540222167969,
+ "step": 115,
+ "token_acc": 0.7787817668414684
+ },
+ {
+ "epoch": 1.5789473684210527,
+ "grad_norm": 3.0212490269279906,
+ "learning_rate": 1.9796490670875738e-06,
+ "loss": 0.7722112655639648,
+ "step": 120,
+ "token_acc": 0.7774650089921026
+ },
+ {
+ "epoch": 1.6447368421052633,
+ "grad_norm": 3.097474614016035,
+ "learning_rate": 1.9747816977079668e-06,
+ "loss": 0.7530896186828613,
+ "step": 125,
+ "token_acc": 0.7835121176003178
+ },
+ {
+ "epoch": 1.7105263157894737,
+ "grad_norm": 2.9781695556647994,
+ "learning_rate": 1.9694002659393305e-06,
+ "loss": 0.7732268333435058,
+ "step": 130,
+ "token_acc": 0.7787148117353605
+ },
+ {
+ "epoch": 1.776315789473684,
+ "grad_norm": 3.06954358369665,
+ "learning_rate": 1.9635076097418733e-06,
+ "loss": 0.7715581893920899,
+ "step": 135,
+ "token_acc": 0.7787210627677553
+ },
+ {
+ "epoch": 1.8421052631578947,
+ "grad_norm": 2.9310526512559254,
+ "learning_rate": 1.957106836675914e-06,
+ "loss": 0.7432127475738526,
+ "step": 140,
+ "token_acc": 0.7850694030147556
+ },
+ {
+ "epoch": 1.9078947368421053,
+ "grad_norm": 2.9957118106295924,
+ "learning_rate": 1.950201322263071e-06,
+ "loss": 0.764984130859375,
+ "step": 145,
+ "token_acc": 0.7813440713084953
+ },
+ {
+ "epoch": 1.973684210526316,
+ "grad_norm": 2.9264049027139363,
+ "learning_rate": 1.942794708206143e-06,
+ "loss": 0.7624619483947754,
+ "step": 150,
+ "token_acc": 0.7813595519200548
+ },
+ {
+ "epoch": 2.0,
+ "eval_loss": 0.7557766437530518,
+ "eval_runtime": 11.4219,
+ "eval_samples_per_second": 11.819,
+ "eval_steps_per_second": 1.488,
+ "eval_token_acc": 0.7825907021658159,
+ "step": 152
+ },
+ {
+ "epoch": 2.039473684210526,
+ "grad_norm": 2.81677121987903,
+ "learning_rate": 1.934890900468615e-06,
+ "loss": 0.7107283115386963,
+ "step": 155,
+ "token_acc": 0.7902097902097902
+ },
+ {
+ "epoch": 2.1052631578947367,
+ "grad_norm": 3.110214332868517,
+ "learning_rate": 1.9264940672148015e-06,
+ "loss": 0.7049328804016113,
+ "step": 160,
+ "token_acc": 0.792085744242942
+ },
+ {
+ "epoch": 2.1710526315789473,
+ "grad_norm": 3.1170800221770736,
+ "learning_rate": 1.917608636611721e-06,
+ "loss": 0.6538917064666748,
+ "step": 165,
+ "token_acc": 0.8053540656434096
+ },
+ {
+ "epoch": 2.236842105263158,
+ "grad_norm": 2.9791586864999764,
+ "learning_rate": 1.9082392944938463e-06,
+ "loss": 0.662127685546875,
+ "step": 170,
+ "token_acc": 0.803340308455398
+ },
+ {
+ "epoch": 2.3026315789473686,
+ "grad_norm": 3.0436404891376267,
+ "learning_rate": 1.8983909818919787e-06,
+ "loss": 0.6792965888977051,
+ "step": 175,
+ "token_acc": 0.7981443134756292
+ },
+ {
+ "epoch": 2.3684210526315788,
+ "grad_norm": 3.0287782620036734,
+ "learning_rate": 1.8880688924275375e-06,
+ "loss": 0.6498573303222657,
+ "step": 180,
+ "token_acc": 0.8074076974276653
+ },
+ {
+ "epoch": 2.4342105263157894,
+ "grad_norm": 3.056401729041573,
+ "learning_rate": 1.8772784695736428e-06,
+ "loss": 0.6739880561828613,
+ "step": 185,
+ "token_acc": 0.7976903170986057
+ },
+ {
+ "epoch": 2.5,
+ "grad_norm": 2.9244106898705375,
+ "learning_rate": 1.8660254037844386e-06,
+ "loss": 0.6584476470947266,
+ "step": 190,
+ "token_acc": 0.8041385948026949
+ },
+ {
+ "epoch": 2.5657894736842106,
+ "grad_norm": 3.0726933163215957,
+ "learning_rate": 1.8543156294941648e-06,
+ "loss": 0.6841012001037597,
+ "step": 195,
+ "token_acc": 0.7971685375920997
+ },
+ {
+ "epoch": 2.6315789473684212,
+ "grad_norm": 2.8543141156332386,
+ "learning_rate": 1.8421553219875656e-06,
+ "loss": 0.6722308158874511,
+ "step": 200,
+ "token_acc": 0.7996123279705369
+ },
+ {
+ "epoch": 2.6973684210526314,
+ "grad_norm": 2.986532459756899,
+ "learning_rate": 1.8295508941432813e-06,
+ "loss": 0.6710346698760986,
+ "step": 205,
+ "token_acc": 0.7982886525101072
+ },
+ {
+ "epoch": 2.763157894736842,
+ "grad_norm": 3.0096419377618724,
+ "learning_rate": 1.8165089930519428e-06,
+ "loss": 0.6560275554656982,
+ "step": 210,
+ "token_acc": 0.8003802281368821
+ },
+ {
+ "epoch": 2.8289473684210527,
+ "grad_norm": 3.0712726957788483,
+ "learning_rate": 1.8030364965107515e-06,
+ "loss": 0.6660303115844727,
+ "step": 215,
+ "token_acc": 0.8012015503875969
+ },
+ {
+ "epoch": 2.8947368421052633,
+ "grad_norm": 3.069154754414468,
+ "learning_rate": 1.7891405093963937e-06,
+ "loss": 0.6556559562683105,
+ "step": 220,
+ "token_acc": 0.8036698692707513
+ },
+ {
+ "epoch": 2.9605263157894735,
+ "grad_norm": 3.055385524570652,
+ "learning_rate": 1.7748283599182013e-06,
+ "loss": 0.6615045547485352,
+ "step": 225,
+ "token_acc": 0.8035366518657007
+ },
+ {
+ "epoch": 3.0,
+ "eval_loss": 0.7560063004493713,
+ "eval_runtime": 11.7729,
+ "eval_samples_per_second": 11.467,
+ "eval_steps_per_second": 1.444,
+ "eval_token_acc": 0.7828206189359452,
+ "step": 228
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 760,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 35470891442176.0,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+ }
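trainer_state.json carries the per-step training log and the end-of-epoch evaluations; the best checkpoint so far is step 152 (eval_loss about 0.756), and epoch 3 shows essentially no further eval improvement. A small sketch for pulling the eval curve out of the log, again assuming the placeholder checkpoint path from the earlier examples:

```python
import json

CKPT = "ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228"  # placeholder local path
with open(f"{CKPT}/trainer_state.json") as f:
    state = json.load(f)

# Evaluation records are the log entries that carry an eval_loss field.
evals = [e for e in state["log_history"] if "eval_loss" in e]
for e in evals:
    print(f"epoch {e['epoch']:.0f}  step {e['step']:>3}  "
          f"eval_loss {e['eval_loss']:.4f}  eval_token_acc {e['eval_token_acc']:.4f}")

print("best checkpoint:", state["best_global_step"], "metric:", state["best_metric"])
```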
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70736c4ef9635bf51275a4c7d8823d8ba312cfa86b8f1c6b11ad543560f279aa
3
+ size 9105
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/video_preprocessor_config.json ADDED
@@ -0,0 +1,43 @@
+ {
+ "crop_size": null,
+ "data_format": "channels_first",
+ "default_to_square": true,
+ "device": null,
+ "do_center_crop": null,
+ "do_convert_rgb": true,
+ "do_normalize": true,
+ "do_pad": null,
+ "do_rescale": true,
+ "do_resize": true,
+ "do_sample_frames": false,
+ "fps": null,
+ "image_mean": [
+ 0.48145466,
+ 0.4578275,
+ 0.40821073
+ ],
+ "image_std": [
+ 0.26862954,
+ 0.26130258,
+ 0.27577711
+ ],
+ "input_data_format": null,
+ "max_frames": 768,
+ "max_pixels": 12845056,
+ "merge_size": 2,
+ "min_frames": 4,
+ "min_pixels": 3136,
+ "num_frames": null,
+ "patch_size": 14,
+ "processor_class": "Qwen2_5_VLProcessor",
+ "resample": 3,
+ "rescale_factor": 0.00392156862745098,
+ "size": {
+ "longest_edge": 12845056,
+ "shortest_edge": 3136
+ },
+ "size_divisor": null,
+ "temporal_patch_size": 2,
+ "video_metadata": null,
+ "video_processor_type": "Qwen2VLVideoProcessor"
+ }
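The video preprocessor shares the normalization statistics and patch geometry with the image processor and additionally bounds frame sampling (4 to 768 frames, temporal patch size 2). A rough sketch of exercising it, assuming a transformers version that ships the Qwen2VLVideoProcessor named in this config and accepts the `videos` argument shown below; the checkpoint path and the synthetic clip are placeholders.

```python
import numpy as np
from transformers import AutoProcessor

CKPT = "ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228"  # placeholder local path
processor = AutoProcessor.from_pretrained(CKPT)

# A synthetic 8-frame RGB clip at 224x224, standing in for real decoded video frames.
video = np.random.randint(0, 255, (8, 224, 224, 3), dtype=np.uint8)

prompt = "<|vision_start|><|video_pad|><|vision_end|>Summarize the clip."
inputs = processor(text=[prompt], videos=[video], return_tensors="pt")
print(inputs["pixel_values_videos"].shape)  # flattened spatio-temporal patches
print(inputs["video_grid_thw"])             # (t, h, w) patch grid per clip
```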
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/vocab.json ADDED
The diff for this file is too large to render. See raw diff