p1k0 commited on
Commit
be7e0e3
·
verified ·
1 Parent(s): fc6f21b

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/args.json +384 -0
  3. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/added_tokens.json +33 -0
  4. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/args.json +384 -0
  5. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/chat_template.jinja +54 -0
  6. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/config.json +144 -0
  7. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/configuration_intern_vit.py +120 -0
  8. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/configuration_internvl_chat.py +97 -0
  9. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/conversation.py +391 -0
  10. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/generation_config.json +5 -0
  11. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/latest +1 -0
  12. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/merges.txt +0 -0
  13. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/model.safetensors.index.json +693 -0
  14. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/modeling_intern_vit.py +431 -0
  15. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/modeling_internvl_chat.py +359 -0
  16. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/preprocessor_config.json +19 -0
  17. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/special_tokens_map.json +31 -0
  18. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/tokenizer_config.json +280 -0
  19. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/trainer_state.json +429 -0
  20. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/vocab.json +0 -0
  21. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/logging.jsonl +85 -0
  22. ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/val_dataset.jsonl +135 -0
  23. ood/ivl-8b-instruct-thinking_full_v3_ood_wd001_e10/v3-20250919-091625/checkpoint-228/rng_state_1.pth +2 -2
  24. ood/ivl-8b-instruct-thinking_full_v3_ood_wd001_e10/v3-20250919-091625/checkpoint-228/rng_state_3.pth +2 -2
  25. ood/ivl-8b-instruct-thinking_full_v3_ood_wd001_e10/v3-20250919-091625/checkpoint-228/scheduler.pt +2 -2
  26. ood/ivl-8b-instruct-thinking_full_v3_ood_wd001_e10/v3-20250919-091625/checkpoint-228/tokenizer.json +2 -2
  27. ood/ivl-8b-instruct-thinking_full_v3_ood_wd001_e10/v3-20250919-091625/checkpoint-228/training_args.bin +2 -2
  28. ood/qwen2.5vl-7b-lora-ood-merged/model-00001-of-00004.safetensors +2 -2
  29. ood/qwen2.5vl-7b-lora-ood-merged/model-00002-of-00004.safetensors +2 -2
  30. ood/qwen2.5vl-7b-lora-ood-merged/model-00003-of-00004.safetensors +2 -2
  31. ood/qwen2.5vl-7b-lora-ood-merged/model-00004-of-00004.safetensors +2 -2
  32. ood/qwen2.5vl-7b-lora-ood-merged/tokenizer.json +2 -2
  33. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/model-00001-of-00004.safetensors +3 -0
  34. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/model-00002-of-00004.safetensors +3 -0
  35. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/model-00003-of-00004.safetensors +3 -0
  36. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/model-00004-of-00004.safetensors +3 -0
  37. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/model.safetensors.index.json +737 -0
  38. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/preprocessor_config.json +19 -0
  39. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/rng_state_0.pth +3 -0
  40. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/rng_state_1.pth +3 -0
  41. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/rng_state_2.pth +3 -0
  42. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/rng_state_3.pth +3 -0
  43. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/scheduler.pt +3 -0
  44. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/special_tokens_map.json +31 -0
  45. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/tokenizer.json +3 -0
  46. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/tokenizer_config.json +208 -0
  47. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/trainer_state.json +429 -0
  48. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/training_args.bin +3 -0
  49. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/video_preprocessor_config.json +43 -0
  50. ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/vocab.json +0 -0
.gitattributes CHANGED
@@ -60,3 +60,4 @@ qwen2.5vl-7b-lora_epoch10_2e-5/tokenizer.json filter=lfs diff=lfs merge=lfs -tex
60
  llava-ov-lora/tokenizer.json filter=lfs diff=lfs merge=lfs -text
61
  internvl3-8b-instruct-lora_epoch10_5e-6/tokenizer.json filter=lfs diff=lfs merge=lfs -text
62
  qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
60
  llava-ov-lora/tokenizer.json filter=lfs diff=lfs merge=lfs -text
61
  internvl3-8b-instruct-lora_epoch10_5e-6/tokenizer.json filter=lfs diff=lfs merge=lfs -text
62
  qwen2.5vl-7b-qvq_thinking_full_v2/v0-20250823-125422/checkpoint-280/tokenizer.json filter=lfs diff=lfs merge=lfs -text
63
+ ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/tokenizer.json filter=lfs diff=lfs merge=lfs -text
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/args.json ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_dir": "/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240",
3
+ "overwrite_output_dir": false,
4
+ "do_train": false,
5
+ "do_eval": false,
6
+ "do_predict": false,
7
+ "eval_strategy": "epoch",
8
+ "prediction_loss_only": false,
9
+ "per_device_train_batch_size": 2,
10
+ "per_device_eval_batch_size": 2,
11
+ "per_gpu_train_batch_size": null,
12
+ "per_gpu_eval_batch_size": null,
13
+ "gradient_accumulation_steps": 2,
14
+ "eval_accumulation_steps": null,
15
+ "eval_delay": 0,
16
+ "torch_empty_cache_steps": null,
17
+ "learning_rate": 5e-07,
18
+ "weight_decay": 0.1,
19
+ "adam_beta1": 0.9,
20
+ "adam_beta2": 0.95,
21
+ "adam_epsilon": 1e-08,
22
+ "max_grad_norm": 1.0,
23
+ "num_train_epochs": 5.0,
24
+ "max_steps": -1,
25
+ "lr_scheduler_type": "cosine",
26
+ "lr_scheduler_kwargs": null,
27
+ "warmup_ratio": 0.1,
28
+ "warmup_steps": 0,
29
+ "log_level": "passive",
30
+ "log_level_replica": "warning",
31
+ "log_on_each_node": true,
32
+ "logging_dir": "/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/runs",
33
+ "logging_strategy": "steps",
34
+ "logging_first_step": true,
35
+ "logging_steps": 5,
36
+ "logging_nan_inf_filter": true,
37
+ "save_strategy": "epoch",
38
+ "save_steps": 500,
39
+ "save_total_limit": 10,
40
+ "save_safetensors": true,
41
+ "save_on_each_node": false,
42
+ "save_only_model": false,
43
+ "restore_callback_states_from_checkpoint": false,
44
+ "no_cuda": false,
45
+ "use_cpu": false,
46
+ "use_mps_device": false,
47
+ "seed": 42,
48
+ "data_seed": 42,
49
+ "jit_mode_eval": false,
50
+ "use_ipex": false,
51
+ "bf16": true,
52
+ "fp16": false,
53
+ "fp16_opt_level": "O1",
54
+ "half_precision_backend": "auto",
55
+ "bf16_full_eval": false,
56
+ "fp16_full_eval": false,
57
+ "tf32": null,
58
+ "local_rank": 0,
59
+ "ddp_backend": null,
60
+ "tpu_num_cores": null,
61
+ "tpu_metrics_debug": false,
62
+ "debug": null,
63
+ "dataloader_drop_last": false,
64
+ "eval_steps": null,
65
+ "dataloader_num_workers": 4,
66
+ "dataloader_prefetch_factor": null,
67
+ "past_index": -1,
68
+ "run_name": "/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240",
69
+ "disable_tqdm": null,
70
+ "remove_unused_columns": true,
71
+ "label_names": null,
72
+ "load_best_model_at_end": true,
73
+ "metric_for_best_model": "eval_loss",
74
+ "greater_is_better": false,
75
+ "ignore_data_skip": false,
76
+ "fsdp": "",
77
+ "fsdp_min_num_params": 0,
78
+ "fsdp_config": null,
79
+ "fsdp_transformer_layer_cls_to_wrap": null,
80
+ "accelerator_config": {
81
+ "dispatch_batches": false
82
+ },
83
+ "deepspeed": {
84
+ "fp16": {
85
+ "enabled": "auto",
86
+ "loss_scale": 0,
87
+ "loss_scale_window": 1000,
88
+ "initial_scale_power": 16,
89
+ "hysteresis": 2,
90
+ "min_loss_scale": 1
91
+ },
92
+ "bf16": {
93
+ "enabled": "auto"
94
+ },
95
+ "zero_optimization": {
96
+ "stage": 3,
97
+ "offload_optimizer": {
98
+ "device": "none",
99
+ "pin_memory": true
100
+ },
101
+ "offload_param": {
102
+ "device": "none",
103
+ "pin_memory": true
104
+ },
105
+ "overlap_comm": false,
106
+ "contiguous_gradients": true,
107
+ "sub_group_size": 1000000000.0,
108
+ "reduce_bucket_size": "auto",
109
+ "zero_quantized_weights": false,
110
+ "zero_quantized_gradients": false,
111
+ "stage3_prefetch_bucket_size": "auto",
112
+ "stage3_param_persistence_threshold": "auto",
113
+ "stage3_max_live_parameters": 1000000000.0,
114
+ "stage3_max_reuse_distance": 1000000000.0,
115
+ "stage3_gather_16bit_weights_on_model_save": true
116
+ },
117
+ "gradient_accumulation_steps": "auto",
118
+ "gradient_clipping": "auto",
119
+ "steps_per_print": 2000,
120
+ "train_batch_size": "auto",
121
+ "train_micro_batch_size_per_gpu": "auto",
122
+ "wall_clock_breakdown": false
123
+ },
124
+ "label_smoothing_factor": 0.0,
125
+ "optim": "adamw_torch",
126
+ "optim_args": null,
127
+ "adafactor": false,
128
+ "group_by_length": false,
129
+ "length_column_name": "length",
130
+ "report_to": [
131
+ "swanlab"
132
+ ],
133
+ "ddp_find_unused_parameters": null,
134
+ "ddp_bucket_cap_mb": null,
135
+ "ddp_broadcast_buffers": null,
136
+ "dataloader_pin_memory": true,
137
+ "dataloader_persistent_workers": false,
138
+ "skip_memory_metrics": true,
139
+ "use_legacy_prediction_loop": false,
140
+ "push_to_hub": false,
141
+ "resume_from_checkpoint": null,
142
+ "hub_model_id": null,
143
+ "hub_strategy": "every_save",
144
+ "hub_token": null,
145
+ "hub_private_repo": null,
146
+ "hub_always_push": false,
147
+ "hub_revision": null,
148
+ "gradient_checkpointing": true,
149
+ "gradient_checkpointing_kwargs": null,
150
+ "include_inputs_for_metrics": false,
151
+ "include_for_metrics": [],
152
+ "eval_do_concat_batches": true,
153
+ "fp16_backend": "auto",
154
+ "push_to_hub_model_id": null,
155
+ "push_to_hub_organization": null,
156
+ "push_to_hub_token": null,
157
+ "mp_parameters": "",
158
+ "auto_find_batch_size": false,
159
+ "full_determinism": false,
160
+ "torchdynamo": null,
161
+ "ray_scope": "last",
162
+ "ddp_timeout": 18000000,
163
+ "torch_compile": false,
164
+ "torch_compile_backend": null,
165
+ "torch_compile_mode": null,
166
+ "include_tokens_per_second": false,
167
+ "include_num_input_tokens_seen": false,
168
+ "neftune_noise_alpha": null,
169
+ "optim_target_modules": null,
170
+ "batch_eval_metrics": false,
171
+ "eval_on_start": false,
172
+ "use_liger_kernel": false,
173
+ "liger_kernel_config": null,
174
+ "eval_use_gather_object": false,
175
+ "average_tokens_across_devices": true,
176
+ "sortish_sampler": false,
177
+ "predict_with_generate": false,
178
+ "generation_max_length": null,
179
+ "generation_num_beams": null,
180
+ "generation_config": null,
181
+ "tuner_backend": "peft",
182
+ "vit_gradient_checkpointing": null,
183
+ "router_aux_loss_coef": 0.0,
184
+ "enable_dft_loss": false,
185
+ "enable_channel_loss": false,
186
+ "check_model": true,
187
+ "acc_strategy": "token",
188
+ "train_dataloader_shuffle": true,
189
+ "max_epochs": null,
190
+ "aligner_lr": null,
191
+ "vit_lr": null,
192
+ "use_logits_to_keep": null,
193
+ "ds3_gather_for_generation": true,
194
+ "resume_only_model": false,
195
+ "optimizer": null,
196
+ "loss_type": null,
197
+ "metric": null,
198
+ "eval_use_evalscope": false,
199
+ "eval_dataset": [],
200
+ "eval_dataset_args": null,
201
+ "eval_limit": null,
202
+ "eval_generation_config": null,
203
+ "extra_eval_args": null,
204
+ "use_flash_ckpt": false,
205
+ "model": "/mnt/data/users/liamding/data/models/InternVL3-8B-Instruct",
206
+ "model_type": "internvl3",
207
+ "model_revision": null,
208
+ "task_type": "causal_lm",
209
+ "torch_dtype": "bfloat16",
210
+ "attn_impl": null,
211
+ "new_special_tokens": [],
212
+ "num_labels": null,
213
+ "problem_type": null,
214
+ "rope_scaling": null,
215
+ "device_map": null,
216
+ "max_memory": {},
217
+ "max_model_len": null,
218
+ "local_repo_path": null,
219
+ "init_strategy": null,
220
+ "template": "internvl2_5",
221
+ "system": null,
222
+ "max_length": 32768,
223
+ "truncation_strategy": "delete",
224
+ "max_pixels": null,
225
+ "agent_template": null,
226
+ "norm_bbox": null,
227
+ "use_chat_template": true,
228
+ "padding_free": false,
229
+ "padding_side": "right",
230
+ "loss_scale": "default",
231
+ "sequence_parallel_size": 1,
232
+ "response_prefix": null,
233
+ "template_backend": "swift",
234
+ "dataset": [
235
+ "/mnt/data/users/liamding/data/3AM_Plus/final/ood_split/ambi_normal_train_aug_messages.json"
236
+ ],
237
+ "val_dataset": [],
238
+ "split_dataset_ratio": 0.1,
239
+ "dataset_num_proc": 1,
240
+ "load_from_cache_file": true,
241
+ "dataset_shuffle": true,
242
+ "val_dataset_shuffle": false,
243
+ "streaming": false,
244
+ "interleave_prob": null,
245
+ "stopping_strategy": "first_exhausted",
246
+ "shuffle_buffer_size": 1000,
247
+ "download_mode": "reuse_dataset_if_exists",
248
+ "columns": {},
249
+ "strict": false,
250
+ "model_name": null,
251
+ "model_author": null,
252
+ "custom_dataset_info": [],
253
+ "quant_method": null,
254
+ "quant_bits": null,
255
+ "hqq_axis": null,
256
+ "bnb_4bit_compute_dtype": "bfloat16",
257
+ "bnb_4bit_quant_type": "nf4",
258
+ "bnb_4bit_use_double_quant": true,
259
+ "bnb_4bit_quant_storage": null,
260
+ "max_new_tokens": 64,
261
+ "temperature": 0.0,
262
+ "top_k": null,
263
+ "top_p": null,
264
+ "repetition_penalty": null,
265
+ "num_beams": 1,
266
+ "stream": false,
267
+ "stop_words": [],
268
+ "logprobs": false,
269
+ "top_logprobs": null,
270
+ "ckpt_dir": null,
271
+ "lora_modules": [],
272
+ "train_type": "full",
273
+ "adapters": [],
274
+ "external_plugins": [],
275
+ "model_kwargs": {},
276
+ "load_args": false,
277
+ "load_data_args": false,
278
+ "packing": false,
279
+ "packing_length": null,
280
+ "lazy_tokenize": true,
281
+ "cached_dataset": [],
282
+ "custom_register_path": [],
283
+ "use_hf": false,
284
+ "ignore_args_error": false,
285
+ "use_swift_lora": false,
286
+ "freeze_parameters": [
287
+ "vision_model",
288
+ "mlp1"
289
+ ],
290
+ "freeze_parameters_regex": null,
291
+ "freeze_parameters_ratio": 0.0,
292
+ "trainable_parameters": [],
293
+ "trainable_parameters_regex": null,
294
+ "freeze_llm": false,
295
+ "freeze_vit": true,
296
+ "freeze_aligner": true,
297
+ "target_modules": [
298
+ "all-linear"
299
+ ],
300
+ "target_regex": null,
301
+ "target_parameters": null,
302
+ "modules_to_save": [],
303
+ "lora_rank": 8,
304
+ "lora_alpha": 32,
305
+ "lora_dropout": 0.05,
306
+ "lora_bias": "none",
307
+ "lora_dtype": null,
308
+ "lorap_lr_ratio": null,
309
+ "use_rslora": false,
310
+ "use_dora": false,
311
+ "lora_ga_batch_size": 2,
312
+ "lora_ga_iters": 2,
313
+ "lora_ga_max_length": 1024,
314
+ "lora_ga_direction": "ArB2r",
315
+ "lora_ga_scale": "stable",
316
+ "lora_ga_stable_gamma": 16,
317
+ "init_weights": true,
318
+ "fourier_n_frequency": 2000,
319
+ "fourier_scaling": 300.0,
320
+ "boft_block_size": 4,
321
+ "boft_block_num": 0,
322
+ "boft_n_butterfly_factor": 1,
323
+ "boft_dropout": 0.0,
324
+ "vera_rank": 256,
325
+ "vera_projection_prng_key": 0,
326
+ "vera_dropout": 0.0,
327
+ "vera_d_initial": 0.1,
328
+ "adapter_act": "gelu",
329
+ "adapter_length": 128,
330
+ "use_galore": false,
331
+ "galore_target_modules": null,
332
+ "galore_rank": 128,
333
+ "galore_update_proj_gap": 50,
334
+ "galore_scale": 1.0,
335
+ "galore_proj_type": "std",
336
+ "galore_optim_per_parameter": false,
337
+ "galore_with_embedding": false,
338
+ "galore_quantization": false,
339
+ "galore_proj_quant": false,
340
+ "galore_proj_bits": 4,
341
+ "galore_proj_group_size": 256,
342
+ "galore_cos_threshold": 0.4,
343
+ "galore_gamma_proj": 2,
344
+ "galore_queue_size": 5,
345
+ "adalora_target_r": 8,
346
+ "adalora_init_r": 12,
347
+ "adalora_tinit": 0,
348
+ "adalora_tfinal": 0,
349
+ "adalora_deltaT": 1,
350
+ "adalora_beta1": 0.85,
351
+ "adalora_beta2": 0.85,
352
+ "adalora_orth_reg_weight": 0.5,
353
+ "llamapro_num_new_blocks": 4,
354
+ "llamapro_num_groups": null,
355
+ "lisa_activated_layers": 0,
356
+ "lisa_step_interval": 20,
357
+ "reft_layer_key": null,
358
+ "reft_layers": null,
359
+ "reft_rank": 4,
360
+ "reft_intervention_type": "LoreftIntervention",
361
+ "reft_args": null,
362
+ "swanlab_token": null,
363
+ "swanlab_project": null,
364
+ "swanlab_workspace": null,
365
+ "swanlab_exp_name": "/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240",
366
+ "swanlab_lark_webhook_url": null,
367
+ "swanlab_lark_secret": null,
368
+ "swanlab_mode": "cloud",
369
+ "add_version": true,
370
+ "create_checkpoint_symlink": false,
371
+ "zero_hpz_partition_size": null,
372
+ "deepspeed_autotp_size": null,
373
+ "early_stop_interval": 200,
374
+ "rank": 0,
375
+ "global_world_size": 4,
376
+ "local_world_size": 4,
377
+ "model_suffix": "InternVL3-8B-Instruct",
378
+ "model_info": "ModelInfo(model_type='internvl3', model_dir='/mnt/data/users/liamding/data/models/InternVL3-8B-Instruct', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling={'factor': 2.0, 'rope_type': 'dynamic', 'type': 'dynamic'}, is_moe_model=False, config=None, task_type='causal_lm', num_labels=None)",
379
+ "model_meta": "ModelMeta(model_type='internvl3', model_groups=[ModelGroup(models=[Model(ms_model_id='OpenGVLab/InternVL3-1B-Pretrained', hf_model_id='OpenGVLab/InternVL3-1B-Pretrained', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-2B-Pretrained', hf_model_id='OpenGVLab/InternVL3-2B-Pretrained', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-8B-Pretrained', hf_model_id='OpenGVLab/InternVL3-8B-Pretrained', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-9B-Pretrained', hf_model_id='OpenGVLab/InternVL3-9B-Pretrained', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-14B-Pretrained', hf_model_id='OpenGVLab/InternVL3-14B-Pretrained', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-38B-Pretrained', hf_model_id='OpenGVLab/InternVL3-38B-Pretrained', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-78B-Pretrained', hf_model_id='OpenGVLab/InternVL3-78B-Pretrained', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='OpenGVLab/InternVL3-1B-Instruct', hf_model_id='OpenGVLab/InternVL3-1B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-2B-Instruct', hf_model_id='OpenGVLab/InternVL3-2B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-8B-Instruct', hf_model_id='OpenGVLab/InternVL3-8B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-9B-Instruct', hf_model_id='OpenGVLab/InternVL3-9B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-14B-Instruct', hf_model_id='OpenGVLab/InternVL3-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-38B-Instruct', hf_model_id='OpenGVLab/InternVL3-38B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-78B-Instruct', hf_model_id='OpenGVLab/InternVL3-78B-Instruct', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='OpenGVLab/InternVL3-1B', hf_model_id='OpenGVLab/InternVL3-1B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-2B', hf_model_id='OpenGVLab/InternVL3-2B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-8B', hf_model_id='OpenGVLab/InternVL3-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-9B', hf_model_id='OpenGVLab/InternVL3-9B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-14B', hf_model_id='OpenGVLab/InternVL3-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-38B', hf_model_id='OpenGVLab/InternVL3-38B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-78B', hf_model_id='OpenGVLab/InternVL3-78B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='OpenGVLab/InternVL3-1B-AWQ', hf_model_id='OpenGVLab/InternVL3-1B-AWQ', model_path=None, 
ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-2B-AWQ', hf_model_id='OpenGVLab/InternVL3-2B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-8B-AWQ', hf_model_id='OpenGVLab/InternVL3-8B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-9B-AWQ', hf_model_id='OpenGVLab/InternVL3-9B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-14B-AWQ', hf_model_id='OpenGVLab/InternVL3-14B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-38B-AWQ', hf_model_id='OpenGVLab/InternVL3-38B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-78B-AWQ', hf_model_id='OpenGVLab/InternVL3-78B-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='internvl2_5', get_function=<function get_model_tokenizer_internvl at 0x7f81e9d2ea70>, model_arch=MultiModelKeys(arch_name='internvl', embedding=None, module_list=None, lm_head=None, q_proj=None, k_proj=None, v_proj=None, o_proj=None, attention=None, mlp=None, down_proj=None, qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None, language_model=['language_model'], aligner=['mlp1'], vision_tower=['vision_model'], generator=[]), architectures=['InternVLChatModel'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.37.2', 'timm'], tags=['vision', 'video'])",
380
+ "model_dir": "/mnt/data/users/liamding/data/models/InternVL3-8B-Instruct",
381
+ "hub": "<class 'swift.hub.hub.MSHub'>",
382
+ "evaluation_strategy": "epoch",
383
+ "training_args": "Seq2SeqTrainingArguments(output_dir='/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=2, per_device_eval_batch_size=2, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=5e-07, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.1, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=10, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=None, dataloader_num_workers=4, dataloader_prefetch_factor=10, past_index=-1, run_name='/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=True, metric_for_best_model='eval_loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['swanlab'], ddp_find_unused_parameters=None, 
ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, enable_channel_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, train_type='full', local_repo_path=None, galore_config=None)"
384
+ }
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/added_tokens.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</box>": 151673,
3
+ "</img>": 151666,
4
+ "</quad>": 151669,
5
+ "</ref>": 151671,
6
+ "</tool_call>": 151658,
7
+ "<IMG_CONTEXT>": 151667,
8
+ "<box>": 151672,
9
+ "<img>": 151665,
10
+ "<quad>": 151668,
11
+ "<ref>": 151670,
12
+ "<tool_call>": 151657,
13
+ "<|box_end|>": 151649,
14
+ "<|box_start|>": 151648,
15
+ "<|endoftext|>": 151643,
16
+ "<|file_sep|>": 151664,
17
+ "<|fim_middle|>": 151660,
18
+ "<|fim_pad|>": 151662,
19
+ "<|fim_prefix|>": 151659,
20
+ "<|fim_suffix|>": 151661,
21
+ "<|im_end|>": 151645,
22
+ "<|im_start|>": 151644,
23
+ "<|image_pad|>": 151655,
24
+ "<|object_ref_end|>": 151647,
25
+ "<|object_ref_start|>": 151646,
26
+ "<|quad_end|>": 151651,
27
+ "<|quad_start|>": 151650,
28
+ "<|repo_name|>": 151663,
29
+ "<|video_pad|>": 151656,
30
+ "<|vision_end|>": 151653,
31
+ "<|vision_pad|>": 151654,
32
+ "<|vision_start|>": 151652
33
+ }
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/args.json ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_dir": "/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240",
3
+ "overwrite_output_dir": false,
4
+ "do_train": false,
5
+ "do_eval": false,
6
+ "do_predict": false,
7
+ "eval_strategy": "epoch",
8
+ "prediction_loss_only": false,
9
+ "per_device_train_batch_size": 2,
10
+ "per_device_eval_batch_size": 2,
11
+ "per_gpu_train_batch_size": null,
12
+ "per_gpu_eval_batch_size": null,
13
+ "gradient_accumulation_steps": 2,
14
+ "eval_accumulation_steps": null,
15
+ "eval_delay": 0,
16
+ "torch_empty_cache_steps": null,
17
+ "learning_rate": 5e-07,
18
+ "weight_decay": 0.1,
19
+ "adam_beta1": 0.9,
20
+ "adam_beta2": 0.95,
21
+ "adam_epsilon": 1e-08,
22
+ "max_grad_norm": 1.0,
23
+ "num_train_epochs": 5.0,
24
+ "max_steps": -1,
25
+ "lr_scheduler_type": "cosine",
26
+ "lr_scheduler_kwargs": null,
27
+ "warmup_ratio": 0.1,
28
+ "warmup_steps": 0,
29
+ "log_level": "passive",
30
+ "log_level_replica": "warning",
31
+ "log_on_each_node": true,
32
+ "logging_dir": "/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/runs",
33
+ "logging_strategy": "steps",
34
+ "logging_first_step": true,
35
+ "logging_steps": 5,
36
+ "logging_nan_inf_filter": true,
37
+ "save_strategy": "epoch",
38
+ "save_steps": 500,
39
+ "save_total_limit": 10,
40
+ "save_safetensors": true,
41
+ "save_on_each_node": false,
42
+ "save_only_model": false,
43
+ "restore_callback_states_from_checkpoint": false,
44
+ "no_cuda": false,
45
+ "use_cpu": false,
46
+ "use_mps_device": false,
47
+ "seed": 42,
48
+ "data_seed": 42,
49
+ "jit_mode_eval": false,
50
+ "use_ipex": false,
51
+ "bf16": true,
52
+ "fp16": false,
53
+ "fp16_opt_level": "O1",
54
+ "half_precision_backend": "auto",
55
+ "bf16_full_eval": false,
56
+ "fp16_full_eval": false,
57
+ "tf32": null,
58
+ "local_rank": 0,
59
+ "ddp_backend": null,
60
+ "tpu_num_cores": null,
61
+ "tpu_metrics_debug": false,
62
+ "debug": null,
63
+ "dataloader_drop_last": false,
64
+ "eval_steps": null,
65
+ "dataloader_num_workers": 4,
66
+ "dataloader_prefetch_factor": null,
67
+ "past_index": -1,
68
+ "run_name": "/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240",
69
+ "disable_tqdm": null,
70
+ "remove_unused_columns": true,
71
+ "label_names": null,
72
+ "load_best_model_at_end": true,
73
+ "metric_for_best_model": "eval_loss",
74
+ "greater_is_better": false,
75
+ "ignore_data_skip": false,
76
+ "fsdp": "",
77
+ "fsdp_min_num_params": 0,
78
+ "fsdp_config": null,
79
+ "fsdp_transformer_layer_cls_to_wrap": null,
80
+ "accelerator_config": {
81
+ "dispatch_batches": false
82
+ },
83
+ "deepspeed": {
84
+ "fp16": {
85
+ "enabled": "auto",
86
+ "loss_scale": 0,
87
+ "loss_scale_window": 1000,
88
+ "initial_scale_power": 16,
89
+ "hysteresis": 2,
90
+ "min_loss_scale": 1
91
+ },
92
+ "bf16": {
93
+ "enabled": "auto"
94
+ },
95
+ "zero_optimization": {
96
+ "stage": 3,
97
+ "offload_optimizer": {
98
+ "device": "none",
99
+ "pin_memory": true
100
+ },
101
+ "offload_param": {
102
+ "device": "none",
103
+ "pin_memory": true
104
+ },
105
+ "overlap_comm": false,
106
+ "contiguous_gradients": true,
107
+ "sub_group_size": 1000000000.0,
108
+ "reduce_bucket_size": "auto",
109
+ "zero_quantized_weights": false,
110
+ "zero_quantized_gradients": false,
111
+ "stage3_prefetch_bucket_size": "auto",
112
+ "stage3_param_persistence_threshold": "auto",
113
+ "stage3_max_live_parameters": 1000000000.0,
114
+ "stage3_max_reuse_distance": 1000000000.0,
115
+ "stage3_gather_16bit_weights_on_model_save": true
116
+ },
117
+ "gradient_accumulation_steps": "auto",
118
+ "gradient_clipping": "auto",
119
+ "steps_per_print": 2000,
120
+ "train_batch_size": "auto",
121
+ "train_micro_batch_size_per_gpu": "auto",
122
+ "wall_clock_breakdown": false
123
+ },
124
+ "label_smoothing_factor": 0.0,
125
+ "optim": "adamw_torch",
126
+ "optim_args": null,
127
+ "adafactor": false,
128
+ "group_by_length": false,
129
+ "length_column_name": "length",
130
+ "report_to": [
131
+ "swanlab"
132
+ ],
133
+ "ddp_find_unused_parameters": null,
134
+ "ddp_bucket_cap_mb": null,
135
+ "ddp_broadcast_buffers": null,
136
+ "dataloader_pin_memory": true,
137
+ "dataloader_persistent_workers": false,
138
+ "skip_memory_metrics": true,
139
+ "use_legacy_prediction_loop": false,
140
+ "push_to_hub": false,
141
+ "resume_from_checkpoint": null,
142
+ "hub_model_id": null,
143
+ "hub_strategy": "every_save",
144
+ "hub_token": null,
145
+ "hub_private_repo": null,
146
+ "hub_always_push": false,
147
+ "hub_revision": null,
148
+ "gradient_checkpointing": true,
149
+ "gradient_checkpointing_kwargs": null,
150
+ "include_inputs_for_metrics": false,
151
+ "include_for_metrics": [],
152
+ "eval_do_concat_batches": true,
153
+ "fp16_backend": "auto",
154
+ "push_to_hub_model_id": null,
155
+ "push_to_hub_organization": null,
156
+ "push_to_hub_token": null,
157
+ "mp_parameters": "",
158
+ "auto_find_batch_size": false,
159
+ "full_determinism": false,
160
+ "torchdynamo": null,
161
+ "ray_scope": "last",
162
+ "ddp_timeout": 18000000,
163
+ "torch_compile": false,
164
+ "torch_compile_backend": null,
165
+ "torch_compile_mode": null,
166
+ "include_tokens_per_second": false,
167
+ "include_num_input_tokens_seen": false,
168
+ "neftune_noise_alpha": null,
169
+ "optim_target_modules": null,
170
+ "batch_eval_metrics": false,
171
+ "eval_on_start": false,
172
+ "use_liger_kernel": false,
173
+ "liger_kernel_config": null,
174
+ "eval_use_gather_object": false,
175
+ "average_tokens_across_devices": true,
176
+ "sortish_sampler": false,
177
+ "predict_with_generate": false,
178
+ "generation_max_length": null,
179
+ "generation_num_beams": null,
180
+ "generation_config": null,
181
+ "tuner_backend": "peft",
182
+ "vit_gradient_checkpointing": null,
183
+ "router_aux_loss_coef": 0.0,
184
+ "enable_dft_loss": false,
185
+ "enable_channel_loss": false,
186
+ "check_model": true,
187
+ "acc_strategy": "token",
188
+ "train_dataloader_shuffle": true,
189
+ "max_epochs": null,
190
+ "aligner_lr": null,
191
+ "vit_lr": null,
192
+ "use_logits_to_keep": null,
193
+ "ds3_gather_for_generation": true,
194
+ "resume_only_model": false,
195
+ "optimizer": null,
196
+ "loss_type": null,
197
+ "metric": null,
198
+ "eval_use_evalscope": false,
199
+ "eval_dataset": [],
200
+ "eval_dataset_args": null,
201
+ "eval_limit": null,
202
+ "eval_generation_config": null,
203
+ "extra_eval_args": null,
204
+ "use_flash_ckpt": false,
205
+ "model": "/mnt/data/users/liamding/data/models/InternVL3-8B-Instruct",
206
+ "model_type": "internvl3",
207
+ "model_revision": null,
208
+ "task_type": "causal_lm",
209
+ "torch_dtype": "bfloat16",
210
+ "attn_impl": null,
211
+ "new_special_tokens": [],
212
+ "num_labels": null,
213
+ "problem_type": null,
214
+ "rope_scaling": null,
215
+ "device_map": null,
216
+ "max_memory": {},
217
+ "max_model_len": null,
218
+ "local_repo_path": null,
219
+ "init_strategy": null,
220
+ "template": "internvl2_5",
221
+ "system": null,
222
+ "max_length": 32768,
223
+ "truncation_strategy": "delete",
224
+ "max_pixels": null,
225
+ "agent_template": null,
226
+ "norm_bbox": null,
227
+ "use_chat_template": true,
228
+ "padding_free": false,
229
+ "padding_side": "right",
230
+ "loss_scale": "default",
231
+ "sequence_parallel_size": 1,
232
+ "response_prefix": null,
233
+ "template_backend": "swift",
234
+ "dataset": [
235
+ "/mnt/data/users/liamding/data/3AM_Plus/final/ood_split/ambi_normal_train_aug_messages.json"
236
+ ],
237
+ "val_dataset": [],
238
+ "split_dataset_ratio": 0.1,
239
+ "dataset_num_proc": 1,
240
+ "load_from_cache_file": true,
241
+ "dataset_shuffle": true,
242
+ "val_dataset_shuffle": false,
243
+ "streaming": false,
244
+ "interleave_prob": null,
245
+ "stopping_strategy": "first_exhausted",
246
+ "shuffle_buffer_size": 1000,
247
+ "download_mode": "reuse_dataset_if_exists",
248
+ "columns": {},
249
+ "strict": false,
250
+ "model_name": null,
251
+ "model_author": null,
252
+ "custom_dataset_info": [],
253
+ "quant_method": null,
254
+ "quant_bits": null,
255
+ "hqq_axis": null,
256
+ "bnb_4bit_compute_dtype": "bfloat16",
257
+ "bnb_4bit_quant_type": "nf4",
258
+ "bnb_4bit_use_double_quant": true,
259
+ "bnb_4bit_quant_storage": null,
260
+ "max_new_tokens": 64,
261
+ "temperature": 0.0,
262
+ "top_k": null,
263
+ "top_p": null,
264
+ "repetition_penalty": null,
265
+ "num_beams": 1,
266
+ "stream": false,
267
+ "stop_words": [],
268
+ "logprobs": false,
269
+ "top_logprobs": null,
270
+ "ckpt_dir": null,
271
+ "lora_modules": [],
272
+ "train_type": "full",
273
+ "adapters": [],
274
+ "external_plugins": [],
275
+ "model_kwargs": {},
276
+ "load_args": false,
277
+ "load_data_args": false,
278
+ "packing": false,
279
+ "packing_length": null,
280
+ "lazy_tokenize": true,
281
+ "cached_dataset": [],
282
+ "custom_register_path": [],
283
+ "use_hf": false,
284
+ "ignore_args_error": false,
285
+ "use_swift_lora": false,
286
+ "freeze_parameters": [
287
+ "vision_model",
288
+ "mlp1"
289
+ ],
290
+ "freeze_parameters_regex": null,
291
+ "freeze_parameters_ratio": 0.0,
292
+ "trainable_parameters": [],
293
+ "trainable_parameters_regex": null,
294
+ "freeze_llm": false,
295
+ "freeze_vit": true,
296
+ "freeze_aligner": true,
297
+ "target_modules": [
298
+ "all-linear"
299
+ ],
300
+ "target_regex": null,
301
+ "target_parameters": null,
302
+ "modules_to_save": [],
303
+ "lora_rank": 8,
304
+ "lora_alpha": 32,
305
+ "lora_dropout": 0.05,
306
+ "lora_bias": "none",
307
+ "lora_dtype": null,
308
+ "lorap_lr_ratio": null,
309
+ "use_rslora": false,
310
+ "use_dora": false,
311
+ "lora_ga_batch_size": 2,
312
+ "lora_ga_iters": 2,
313
+ "lora_ga_max_length": 1024,
314
+ "lora_ga_direction": "ArB2r",
315
+ "lora_ga_scale": "stable",
316
+ "lora_ga_stable_gamma": 16,
317
+ "init_weights": true,
318
+ "fourier_n_frequency": 2000,
319
+ "fourier_scaling": 300.0,
320
+ "boft_block_size": 4,
321
+ "boft_block_num": 0,
322
+ "boft_n_butterfly_factor": 1,
323
+ "boft_dropout": 0.0,
324
+ "vera_rank": 256,
325
+ "vera_projection_prng_key": 0,
326
+ "vera_dropout": 0.0,
327
+ "vera_d_initial": 0.1,
328
+ "adapter_act": "gelu",
329
+ "adapter_length": 128,
330
+ "use_galore": false,
331
+ "galore_target_modules": null,
332
+ "galore_rank": 128,
333
+ "galore_update_proj_gap": 50,
334
+ "galore_scale": 1.0,
335
+ "galore_proj_type": "std",
336
+ "galore_optim_per_parameter": false,
337
+ "galore_with_embedding": false,
338
+ "galore_quantization": false,
339
+ "galore_proj_quant": false,
340
+ "galore_proj_bits": 4,
341
+ "galore_proj_group_size": 256,
342
+ "galore_cos_threshold": 0.4,
343
+ "galore_gamma_proj": 2,
344
+ "galore_queue_size": 5,
345
+ "adalora_target_r": 8,
346
+ "adalora_init_r": 12,
347
+ "adalora_tinit": 0,
348
+ "adalora_tfinal": 0,
349
+ "adalora_deltaT": 1,
350
+ "adalora_beta1": 0.85,
351
+ "adalora_beta2": 0.85,
352
+ "adalora_orth_reg_weight": 0.5,
353
+ "llamapro_num_new_blocks": 4,
354
+ "llamapro_num_groups": null,
355
+ "lisa_activated_layers": 0,
356
+ "lisa_step_interval": 20,
357
+ "reft_layer_key": null,
358
+ "reft_layers": null,
359
+ "reft_rank": 4,
360
+ "reft_intervention_type": "LoreftIntervention",
361
+ "reft_args": null,
362
+ "swanlab_token": null,
363
+ "swanlab_project": null,
364
+ "swanlab_workspace": null,
365
+ "swanlab_exp_name": "/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240",
366
+ "swanlab_lark_webhook_url": null,
367
+ "swanlab_lark_secret": null,
368
+ "swanlab_mode": "cloud",
369
+ "add_version": true,
370
+ "create_checkpoint_symlink": false,
371
+ "zero_hpz_partition_size": null,
372
+ "deepspeed_autotp_size": null,
373
+ "early_stop_interval": 200,
374
+ "rank": 0,
375
+ "global_world_size": 4,
376
+ "local_world_size": 4,
377
+ "model_suffix": "InternVL3-8B-Instruct",
378
+ "model_info": "ModelInfo(model_type='internvl3', model_dir='/mnt/data/users/liamding/data/models/InternVL3-8B-Instruct', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling={'factor': 2.0, 'rope_type': 'dynamic', 'type': 'dynamic'}, is_moe_model=False, config=None, task_type='causal_lm', num_labels=None)",
379
+ "model_meta": "ModelMeta(model_type='internvl3', model_groups=[ModelGroup(models=[Model(ms_model_id='OpenGVLab/InternVL3-1B-Pretrained', hf_model_id='OpenGVLab/InternVL3-1B-Pretrained', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-2B-Pretrained', hf_model_id='OpenGVLab/InternVL3-2B-Pretrained', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-8B-Pretrained', hf_model_id='OpenGVLab/InternVL3-8B-Pretrained', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-9B-Pretrained', hf_model_id='OpenGVLab/InternVL3-9B-Pretrained', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-14B-Pretrained', hf_model_id='OpenGVLab/InternVL3-14B-Pretrained', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-38B-Pretrained', hf_model_id='OpenGVLab/InternVL3-38B-Pretrained', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-78B-Pretrained', hf_model_id='OpenGVLab/InternVL3-78B-Pretrained', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='OpenGVLab/InternVL3-1B-Instruct', hf_model_id='OpenGVLab/InternVL3-1B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-2B-Instruct', hf_model_id='OpenGVLab/InternVL3-2B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-8B-Instruct', hf_model_id='OpenGVLab/InternVL3-8B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-9B-Instruct', hf_model_id='OpenGVLab/InternVL3-9B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-14B-Instruct', hf_model_id='OpenGVLab/InternVL3-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-38B-Instruct', hf_model_id='OpenGVLab/InternVL3-38B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-78B-Instruct', hf_model_id='OpenGVLab/InternVL3-78B-Instruct', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='OpenGVLab/InternVL3-1B', hf_model_id='OpenGVLab/InternVL3-1B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-2B', hf_model_id='OpenGVLab/InternVL3-2B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-8B', hf_model_id='OpenGVLab/InternVL3-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-9B', hf_model_id='OpenGVLab/InternVL3-9B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-14B', hf_model_id='OpenGVLab/InternVL3-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-38B', hf_model_id='OpenGVLab/InternVL3-38B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-78B', hf_model_id='OpenGVLab/InternVL3-78B', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='OpenGVLab/InternVL3-1B-AWQ', hf_model_id='OpenGVLab/InternVL3-1B-AWQ', model_path=None, 
ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-2B-AWQ', hf_model_id='OpenGVLab/InternVL3-2B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-8B-AWQ', hf_model_id='OpenGVLab/InternVL3-8B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-9B-AWQ', hf_model_id='OpenGVLab/InternVL3-9B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-14B-AWQ', hf_model_id='OpenGVLab/InternVL3-14B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-38B-AWQ', hf_model_id='OpenGVLab/InternVL3-38B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='OpenGVLab/InternVL3-78B-AWQ', hf_model_id='OpenGVLab/InternVL3-78B-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='internvl2_5', get_function=<function get_model_tokenizer_internvl at 0x7f81e9d2ea70>, model_arch=MultiModelKeys(arch_name='internvl', embedding=None, module_list=None, lm_head=None, q_proj=None, k_proj=None, v_proj=None, o_proj=None, attention=None, mlp=None, down_proj=None, qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None, language_model=['language_model'], aligner=['mlp1'], vision_tower=['vision_model'], generator=[]), architectures=['InternVLChatModel'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.37.2', 'timm'], tags=['vision', 'video'])",
380
+ "model_dir": "/mnt/data/users/liamding/data/models/InternVL3-8B-Instruct",
381
+ "hub": "<class 'swift.hub.hub.MSHub'>",
382
+ "evaluation_strategy": "epoch",
383
+ "training_args": "Seq2SeqTrainingArguments(output_dir='/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=2, per_device_eval_batch_size=2, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=5e-07, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=5.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.1, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=10, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=None, dataloader_num_workers=4, dataloader_prefetch_factor=10, past_index=-1, run_name='/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=True, metric_for_best_model='eval_loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['swanlab'], ddp_find_unused_parameters=None, 
ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, hub_revision=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=18000000, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, liger_kernel_config=None, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, enable_channel_loss=False, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, metric=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, use_flash_ckpt=False, sft_alpha=0, train_type='full', local_repo_path=None, galore_config=None)"
384
+ }
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
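Taken together, the template above emits standard ChatML turns. A minimal sketch of exercising it through the tokenizer shipped alongside this checkpoint (the path is assumed to be a local copy of this directory):

# Minimal sketch (not part of the uploaded files): render the ChatML-style template above.
from transformers import AutoTokenizer

ckpt = "ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228"  # local path assumed
tokenizer = AutoTokenizer.from_pretrained(ckpt, trust_remote_code=True)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Describe the image briefly."},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)  # ends with '<|im_start|>assistant\n' because add_generation_prompt=True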
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/config.json ADDED
@@ -0,0 +1,144 @@
1
+ {
2
+ "architectures": [
3
+ "InternVLChatModel"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_internvl_chat.InternVLChatConfig",
7
+ "AutoModel": "modeling_internvl_chat.InternVLChatModel",
8
+ "AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel"
9
+ },
10
+ "downsample_ratio": 0.5,
11
+ "dynamic_image_size": true,
12
+ "force_image_size": 448,
13
+ "hidden_size": 3584,
14
+ "image_fold": null,
15
+ "keys_to_ignore_at_inference": [
16
+ "past_key_values"
17
+ ],
18
+ "llm_config": {
19
+ "_name_or_path": "./pretrained/Qwen2.5-32B-Instruct",
20
+ "architectures": [
21
+ "Qwen2ForCausalLM"
22
+ ],
23
+ "attention_dropout": 0.0,
24
+ "bos_token_id": 151643,
25
+ "eos_token_id": 151643,
26
+ "hidden_act": "silu",
27
+ "hidden_size": 3584,
28
+ "initializer_range": 0.02,
29
+ "intermediate_size": 18944,
30
+ "layer_types": [
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention",
51
+ "full_attention",
52
+ "full_attention",
53
+ "full_attention",
54
+ "full_attention",
55
+ "full_attention",
56
+ "full_attention",
57
+ "full_attention",
58
+ "full_attention"
59
+ ],
60
+ "max_position_embeddings": 32768,
61
+ "max_window_layers": 70,
62
+ "model_type": "qwen2",
63
+ "moe_config": null,
64
+ "num_attention_heads": 28,
65
+ "num_hidden_layers": 28,
66
+ "num_key_value_heads": 4,
67
+ "pad_token_id": 151643,
68
+ "rms_norm_eps": 1e-06,
69
+ "rope_scaling": {
70
+ "factor": 2.0,
71
+ "rope_type": "dynamic",
72
+ "type": "dynamic"
73
+ },
74
+ "rope_theta": 1000000.0,
75
+ "sliding_window": null,
76
+ "torch_dtype": "bfloat16",
77
+ "use_bfloat16": true,
78
+ "use_cache": false,
79
+ "use_sliding_window": false,
80
+ "vocab_size": 151674
81
+ },
82
+ "max_dynamic_patch": 12,
83
+ "min_dynamic_patch": 1,
84
+ "model_type": "internvl_chat",
85
+ "output_attentions": false,
86
+ "pad2square": false,
87
+ "pad_token_id": 151643,
88
+ "ps_version": "v2",
89
+ "select_layer": -1,
90
+ "template": "internvl2_5",
91
+ "tie_word_embeddings": false,
92
+ "torch_dtype": "bfloat16",
93
+ "transformers_version": null,
94
+ "use_backbone_lora": 0,
95
+ "use_llm_lora": 0,
96
+ "use_thumbnail": true,
97
+ "vision_config": {
98
+ "_name_or_path": "OpenGVLab/InternViT-6B-448px-V1-5",
99
+ "architectures": [
100
+ "InternVisionModel"
101
+ ],
102
+ "attention_dropout": 0.0,
103
+ "auto_map": {
104
+ "AutoConfig": "configuration_intern_vit.InternVisionConfig",
105
+ "AutoModel": "modeling_intern_vit.InternVisionModel"
106
+ },
107
+ "capacity_factor": 1.2,
108
+ "drop_path_rate": 0.0,
109
+ "dropout": 0.0,
110
+ "eval_capacity_factor": 1.4,
111
+ "hidden_act": "gelu",
112
+ "hidden_size": 1024,
113
+ "image_size": 448,
114
+ "initializer_factor": 0.1,
115
+ "initializer_range": 1e-10,
116
+ "intermediate_size": 4096,
117
+ "laux_allreduce": "all_nodes",
118
+ "layer_norm_eps": 1e-06,
119
+ "model_type": "intern_vit_6b",
120
+ "moe_coeff_ratio": 0.5,
121
+ "moe_intermediate_size": 768,
122
+ "moe_output_scale": 4.0,
123
+ "noisy_gate_policy": "RSample_before",
124
+ "norm_type": "layer_norm",
125
+ "num_attention_heads": 16,
126
+ "num_channels": 3,
127
+ "num_experts": 8,
128
+ "num_hidden_layers": 24,
129
+ "num_routed_experts": 4,
130
+ "num_shared_experts": 4,
131
+ "pad_token_id": 151643,
132
+ "patch_size": 14,
133
+ "qk_normalization": false,
134
+ "qkv_bias": true,
135
+ "shared_expert_intermediate_size": 3072,
136
+ "torch_dtype": "bfloat16",
137
+ "use_bfloat16": true,
138
+ "use_flash_attn": true,
139
+ "use_moe": false,
140
+ "use_residual": true,
141
+ "use_rts": false,
142
+ "use_weighted_residual": false
143
+ }
144
+ }
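The auto_map block above routes the transformers Auto classes to the custom configuration and modeling files bundled with this checkpoint. A minimal sketch of loading it, assuming a local copy of this directory and that trust_remote_code is acceptable:

# Sketch only: load this checkpoint through the auto_map classes declared above.
import torch
from transformers import AutoConfig, AutoModel

ckpt = "ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228"  # local path assumed
config = AutoConfig.from_pretrained(ckpt, trust_remote_code=True)
print(config.model_type, config.llm_config.vocab_size)  # internvl_chat 151674

model = AutoModel.from_pretrained(
    ckpt,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" in this config
    trust_remote_code=True,
).eval()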
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/configuration_intern_vit.py ADDED
@@ -0,0 +1,120 @@
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2024 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ import os
8
+ from typing import Union
9
+
10
+ from transformers.configuration_utils import PretrainedConfig
11
+ from transformers.utils import logging
12
+
13
+ logger = logging.get_logger(__name__)
14
+
15
+
16
+ class InternVisionConfig(PretrainedConfig):
17
+ r"""
18
+ This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
19
+ instantiate a vision encoder according to the specified arguments, defining the model architecture.
20
+
21
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
22
+ documentation from [`PretrainedConfig`] for more information.
23
+
24
+ Args:
25
+ num_channels (`int`, *optional*, defaults to 3):
26
+ Number of color channels in the input images (e.g., 3 for RGB).
27
+ patch_size (`int`, *optional*, defaults to 14):
28
+ The size (resolution) of each patch.
29
+ image_size (`int`, *optional*, defaults to 224):
30
+ The size (resolution) of each image.
31
+ qkv_bias (`bool`, *optional*, defaults to `False`):
32
+ Whether to add a bias to the queries and values in the self-attention layers.
33
+ hidden_size (`int`, *optional*, defaults to 3200):
34
+ Dimensionality of the encoder layers and the pooler layer.
35
+ num_attention_heads (`int`, *optional*, defaults to 25):
36
+ Number of attention heads for each attention layer in the Transformer encoder.
37
+ intermediate_size (`int`, *optional*, defaults to 12800):
38
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
39
+ qk_normalization (`bool`, *optional*, defaults to `True`):
40
+ Whether to normalize the queries and keys in the self-attention layers.
41
+ num_hidden_layers (`int`, *optional*, defaults to 48):
42
+ Number of hidden layers in the Transformer encoder.
43
+ use_flash_attn (`bool`, *optional*, defaults to `True`):
44
+ Whether to use flash attention mechanism.
45
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
46
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
47
+ `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
48
+ layer_norm_eps (`float`, *optional*, defaults to 1e-6):
49
+ The epsilon used by the layer normalization layers.
50
+ dropout (`float`, *optional*, defaults to 0.0):
51
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
52
+ drop_path_rate (`float`, *optional*, defaults to 0.0):
53
+ Dropout rate for stochastic depth.
54
+ attention_dropout (`float`, *optional*, defaults to 0.0):
55
+ The dropout ratio for the attention probabilities.
56
+ initializer_range (`float`, *optional*, defaults to 0.02):
57
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
58
+ initializer_factor (`float`, *optional*, defaults to 0.1):
59
+ A factor for layer scale.
60
+ """
61
+
62
+ model_type = 'intern_vit_6b'
63
+
64
+ def __init__(
65
+ self,
66
+ num_channels=3,
67
+ patch_size=14,
68
+ image_size=224,
69
+ qkv_bias=False,
70
+ hidden_size=3200,
71
+ num_attention_heads=25,
72
+ intermediate_size=12800,
73
+ qk_normalization=True,
74
+ num_hidden_layers=48,
75
+ use_flash_attn=True,
76
+ hidden_act='gelu',
77
+ norm_type='rms_norm',
78
+ layer_norm_eps=1e-6,
79
+ dropout=0.0,
80
+ drop_path_rate=0.0,
81
+ attention_dropout=0.0,
82
+ initializer_range=0.02,
83
+ initializer_factor=0.1,
84
+ **kwargs,
85
+ ):
86
+ super().__init__(**kwargs)
87
+
88
+ self.hidden_size = hidden_size
89
+ self.intermediate_size = intermediate_size
90
+ self.dropout = dropout
91
+ self.drop_path_rate = drop_path_rate
92
+ self.num_hidden_layers = num_hidden_layers
93
+ self.num_attention_heads = num_attention_heads
94
+ self.num_channels = num_channels
95
+ self.patch_size = patch_size
96
+ self.image_size = image_size
97
+ self.initializer_range = initializer_range
98
+ self.initializer_factor = initializer_factor
99
+ self.attention_dropout = attention_dropout
100
+ self.layer_norm_eps = layer_norm_eps
101
+ self.hidden_act = hidden_act
102
+ self.norm_type = norm_type
103
+ self.qkv_bias = qkv_bias
104
+ self.qk_normalization = qk_normalization
105
+ self.use_flash_attn = use_flash_attn
106
+
107
+ @classmethod
108
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
109
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
110
+
111
+ if 'vision_config' in config_dict:
112
+ config_dict = config_dict['vision_config']
113
+
114
+ if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
115
+ logger.warning(
116
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
117
+ f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
118
+ )
119
+
120
+ return cls.from_dict(config_dict, **kwargs)
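For orientation, a small sketch of how the vision_config block in config.json maps onto this class; the listed values come from that block and everything else falls back to the defaults in __init__:

# Sketch: instantiate the vision config with the values used by this checkpoint.
from configuration_intern_vit import InternVisionConfig  # assumes this file is on the import path

vision_cfg = InternVisionConfig(
    hidden_size=1024,
    intermediate_size=4096,
    num_attention_heads=16,
    num_hidden_layers=24,
    image_size=448,
    patch_size=14,
    qkv_bias=True,
    qk_normalization=False,
    norm_type="layer_norm",
)
print(vision_cfg.image_size // vision_cfg.patch_size)  # 32 patches per side at 448px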
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/configuration_internvl_chat.py ADDED
@@ -0,0 +1,97 @@
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2024 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ import copy
8
+
9
+ from transformers import AutoConfig, LlamaConfig, Qwen2Config
10
+ from transformers.configuration_utils import PretrainedConfig
11
+ from transformers.utils import logging
12
+
13
+ from .configuration_intern_vit import InternVisionConfig
14
+
15
+ logger = logging.get_logger(__name__)
16
+
17
+
18
+ class InternVLChatConfig(PretrainedConfig):
19
+ model_type = 'internvl_chat'
20
+ is_composition = True
21
+
22
+ def __init__(
23
+ self,
24
+ vision_config=None,
25
+ llm_config=None,
26
+ use_backbone_lora=0,
27
+ use_llm_lora=0,
28
+ select_layer=-1,
29
+ force_image_size=None,
30
+ downsample_ratio=0.5,
31
+ template=None,
32
+ dynamic_image_size=False,
33
+ use_thumbnail=False,
34
+ ps_version='v1',
35
+ min_dynamic_patch=1,
36
+ max_dynamic_patch=6,
37
+ **kwargs):
38
+ super().__init__(**kwargs)
39
+
40
+ if vision_config is None:
41
+ vision_config = {'architectures': ['InternVisionModel']}
42
+ logger.info('vision_config is None. Initializing the InternVisionConfig with default values.')
43
+
44
+ if llm_config is None:
45
+ llm_config = {'architectures': ['Qwen2ForCausalLM']}
46
+ logger.info('llm_config is None. Initializing the llm config with default values (`Qwen2Config`).')
47
+
48
+ self.vision_config = InternVisionConfig(**vision_config)
49
+ if llm_config.get('architectures')[0] == 'LlamaForCausalLM':
50
+ self.llm_config = LlamaConfig(**llm_config)
51
+ elif llm_config.get('architectures')[0] == 'Qwen2ForCausalLM':
52
+ self.llm_config = Qwen2Config(**llm_config)
53
+ else:
54
+ raise ValueError('Unsupported architecture: {}'.format(llm_config.get('architectures')[0]))
55
+ self.use_backbone_lora = use_backbone_lora
56
+ self.use_llm_lora = use_llm_lora
57
+ self.select_layer = select_layer
58
+ self.force_image_size = force_image_size
59
+ self.downsample_ratio = downsample_ratio
60
+ self.template = template
61
+ self.dynamic_image_size = dynamic_image_size
62
+ self.use_thumbnail = use_thumbnail
63
+ self.ps_version = ps_version # pixel shuffle version
64
+ self.min_dynamic_patch = min_dynamic_patch
65
+ self.max_dynamic_patch = max_dynamic_patch
66
+ # By default, we use tie_word_embeddings=False for models of all sizes.
67
+ self.tie_word_embeddings = self.llm_config.tie_word_embeddings
68
+
69
+ logger.info(f'vision_select_layer: {self.select_layer}')
70
+ logger.info(f'ps_version: {self.ps_version}')
71
+ logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}')
72
+ logger.info(f'max_dynamic_patch: {self.max_dynamic_patch}')
73
+
74
+ def to_dict(self):
75
+ """
76
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
77
+
78
+ Returns:
79
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
80
+ """
81
+ output = copy.deepcopy(self.__dict__)
82
+ output['vision_config'] = self.vision_config.to_dict()
83
+ output['llm_config'] = self.llm_config.to_dict()
84
+ output['model_type'] = self.__class__.model_type
85
+ output['use_backbone_lora'] = self.use_backbone_lora
86
+ output['use_llm_lora'] = self.use_llm_lora
87
+ output['select_layer'] = self.select_layer
88
+ output['force_image_size'] = self.force_image_size
89
+ output['downsample_ratio'] = self.downsample_ratio
90
+ output['template'] = self.template
91
+ output['dynamic_image_size'] = self.dynamic_image_size
92
+ output['use_thumbnail'] = self.use_thumbnail
93
+ output['ps_version'] = self.ps_version
94
+ output['min_dynamic_patch'] = self.min_dynamic_patch
95
+ output['max_dynamic_patch'] = self.max_dynamic_patch
96
+
97
+ return output
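A short sketch of composing this config from plain dicts, mirroring the nested vision_config/llm_config blocks in config.json; only a few representative fields are shown and the rest keep their defaults:

# Sketch: compose the chat config from nested dicts, as from_pretrained would.
from configuration_internvl_chat import InternVLChatConfig  # assumes this file is on the import path

cfg = InternVLChatConfig(
    vision_config={"architectures": ["InternVisionModel"], "hidden_size": 1024, "image_size": 448},
    llm_config={"architectures": ["Qwen2ForCausalLM"], "hidden_size": 3584, "vocab_size": 151674},
    template="internvl2_5",
    dynamic_image_size=True,
    use_thumbnail=True,
    max_dynamic_patch=12,
)
as_dict = cfg.to_dict()
print(as_dict["template"], as_dict["llm_config"]["vocab_size"])  # internvl2_5 151674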
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/conversation.py ADDED
@@ -0,0 +1,391 @@
1
+ """
2
+ Conversation prompt templates.
3
+
4
+ We kindly request that you import fastchat instead of copying this file if you wish to use it.
5
+ If you have changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates.
6
+
7
+ Modified from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
8
+ """
9
+
10
+ import dataclasses
11
+ from enum import IntEnum, auto
12
+ from typing import Dict, List, Tuple, Union
13
+
14
+
15
+ class SeparatorStyle(IntEnum):
16
+ """Separator styles."""
17
+
18
+ ADD_COLON_SINGLE = auto()
19
+ ADD_COLON_TWO = auto()
20
+ ADD_COLON_SPACE_SINGLE = auto()
21
+ NO_COLON_SINGLE = auto()
22
+ NO_COLON_TWO = auto()
23
+ ADD_NEW_LINE_SINGLE = auto()
24
+ LLAMA2 = auto()
25
+ CHATGLM = auto()
26
+ CHATML = auto()
27
+ CHATINTERN = auto()
28
+ DOLLY = auto()
29
+ RWKV = auto()
30
+ PHOENIX = auto()
31
+ ROBIN = auto()
32
+ FALCON_CHAT = auto()
33
+ CHATGLM3 = auto()
34
+ INTERNVL_ZH = auto()
35
+ MPT = auto()
36
+
37
+
38
+ @dataclasses.dataclass
39
+ class Conversation:
40
+ """A class that manages prompt templates and keeps all conversation history."""
41
+
42
+ # The name of this template
43
+ name: str
44
+ # The template of the system prompt
45
+ system_template: str = '{system_message}'
46
+ # The system message
47
+ system_message: str = ''
48
+ # The names of two roles
49
+ roles: Tuple[str] = ('USER', 'ASSISTANT')
50
+ # All messages. Each item is (role, message).
51
+ messages: List[List[str]] = ()
52
+ # The number of few shot examples
53
+ offset: int = 0
54
+ # The separator style and configurations
55
+ sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE
56
+ sep: str = '\n'
57
+ sep2: str = None
58
+ # Stop criteria (the default one is EOS token)
59
+ stop_str: Union[str, List[str]] = None
60
+ # Stops generation if meeting any token in this list
61
+ stop_token_ids: List[int] = None
62
+
63
+ def get_prompt(self) -> str:
64
+ """Get the prompt for generation."""
65
+ system_prompt = self.system_template.format(system_message=self.system_message)
66
+ if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
67
+ ret = system_prompt + self.sep
68
+ for role, message in self.messages:
69
+ if message:
70
+ ret += role + ': ' + message + self.sep
71
+ else:
72
+ ret += role + ':'
73
+ return ret
74
+ elif self.sep_style == SeparatorStyle.ADD_COLON_TWO:
75
+ seps = [self.sep, self.sep2]
76
+ ret = system_prompt + seps[0]
77
+ for i, (role, message) in enumerate(self.messages):
78
+ if message:
79
+ ret += role + ': ' + message + seps[i % 2]
80
+ else:
81
+ ret += role + ':'
82
+ return ret
83
+ elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
84
+ ret = system_prompt + self.sep
85
+ for role, message in self.messages:
86
+ if message:
87
+ ret += role + ': ' + message + self.sep
88
+ else:
89
+ ret += role + ': ' # must end with a space
90
+ return ret
91
+ elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
92
+ ret = '' if system_prompt == '' else system_prompt + self.sep
93
+ for role, message in self.messages:
94
+ if message:
95
+ ret += role + '\n' + message + self.sep
96
+ else:
97
+ ret += role + '\n'
98
+ return ret
99
+ elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
100
+ ret = system_prompt
101
+ for role, message in self.messages:
102
+ if message:
103
+ ret += role + message + self.sep
104
+ else:
105
+ ret += role
106
+ return ret
107
+ elif self.sep_style == SeparatorStyle.NO_COLON_TWO:
108
+ seps = [self.sep, self.sep2]
109
+ ret = system_prompt
110
+ for i, (role, message) in enumerate(self.messages):
111
+ if message:
112
+ ret += role + message + seps[i % 2]
113
+ else:
114
+ ret += role
115
+ return ret
116
+ elif self.sep_style == SeparatorStyle.RWKV:
117
+ ret = system_prompt
118
+ for i, (role, message) in enumerate(self.messages):
119
+ if message:
120
+ ret += (
121
+ role
122
+ + ': '
123
+ + message.replace('\r\n', '\n').replace('\n\n', '\n')
124
+ )
125
+ ret += '\n\n'
126
+ else:
127
+ ret += role + ':'
128
+ return ret
129
+ elif self.sep_style == SeparatorStyle.LLAMA2:
130
+ seps = [self.sep, self.sep2]
131
+ if self.system_message:
132
+ ret = system_prompt
133
+ else:
134
+ ret = '[INST] '
135
+ for i, (role, message) in enumerate(self.messages):
136
+ tag = self.roles[i % 2]
137
+ if message:
138
+ if i == 0:
139
+ ret += message + ' '
140
+ else:
141
+ ret += tag + ' ' + message + seps[i % 2]
142
+ else:
143
+ ret += tag
144
+ return ret
145
+ elif self.sep_style == SeparatorStyle.CHATGLM:
146
+ # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308
147
+ # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926
148
+ round_add_n = 1 if self.name == 'chatglm2' else 0
149
+ if system_prompt:
150
+ ret = system_prompt + self.sep
151
+ else:
152
+ ret = ''
153
+
154
+ for i, (role, message) in enumerate(self.messages):
155
+ if i % 2 == 0:
156
+ ret += f'[Round {i//2 + round_add_n}]{self.sep}'
157
+
158
+ if message:
159
+ ret += f'{role}:{message}{self.sep}'
160
+ else:
161
+ ret += f'{role}:'
162
+ return ret
163
+ elif self.sep_style == SeparatorStyle.CHATML:
164
+ ret = '' if system_prompt == '' else system_prompt + self.sep + '\n'
165
+ for role, message in self.messages:
166
+ if message:
167
+ ret += role + '\n' + message + self.sep + '\n'
168
+ else:
169
+ ret += role + '\n'
170
+ return ret
171
+ elif self.sep_style == SeparatorStyle.CHATGLM3:
172
+ ret = ''
173
+ if self.system_message:
174
+ ret += system_prompt
175
+ for role, message in self.messages:
176
+ if message:
177
+ ret += role + '\n' + ' ' + message
178
+ else:
179
+ ret += role
180
+ return ret
181
+ elif self.sep_style == SeparatorStyle.CHATINTERN:
182
+ # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771
183
+ seps = [self.sep, self.sep2]
184
+ ret = system_prompt
185
+ for i, (role, message) in enumerate(self.messages):
186
+ # if i % 2 == 0:
187
+ # ret += "<s>"
188
+ if message:
189
+ ret += role + ':' + message + seps[i % 2] + '\n'
190
+ else:
191
+ ret += role + ':'
192
+ return ret
193
+ elif self.sep_style == SeparatorStyle.DOLLY:
194
+ seps = [self.sep, self.sep2]
195
+ ret = system_prompt
196
+ for i, (role, message) in enumerate(self.messages):
197
+ if message:
198
+ ret += role + ':\n' + message + seps[i % 2]
199
+ if i % 2 == 1:
200
+ ret += '\n\n'
201
+ else:
202
+ ret += role + ':\n'
203
+ return ret
204
+ elif self.sep_style == SeparatorStyle.PHOENIX:
205
+ ret = system_prompt
206
+ for role, message in self.messages:
207
+ if message:
208
+ ret += role + ': ' + '<s>' + message + '</s>'
209
+ else:
210
+ ret += role + ': ' + '<s>'
211
+ return ret
212
+ elif self.sep_style == SeparatorStyle.ROBIN:
213
+ ret = system_prompt + self.sep
214
+ for role, message in self.messages:
215
+ if message:
216
+ ret += role + ':\n' + message + self.sep
217
+ else:
218
+ ret += role + ':\n'
219
+ return ret
220
+ elif self.sep_style == SeparatorStyle.FALCON_CHAT:
221
+ ret = ''
222
+ if self.system_message:
223
+ ret += system_prompt + self.sep
224
+ for role, message in self.messages:
225
+ if message:
226
+ ret += role + ': ' + message + self.sep
227
+ else:
228
+ ret += role + ':'
229
+
230
+ return ret
231
+ elif self.sep_style == SeparatorStyle.INTERNVL_ZH:
232
+ seps = [self.sep, self.sep2]
233
+ ret = self.system_message + seps[0]
234
+ for i, (role, message) in enumerate(self.messages):
235
+ if message:
236
+ ret += role + ': ' + message + seps[i % 2]
237
+ else:
238
+ ret += role + ':'
239
+ return ret
240
+ elif self.sep_style == SeparatorStyle.MPT:
241
+ ret = system_prompt + self.sep
242
+ for role, message in self.messages:
243
+ if message:
244
+ if type(message) is tuple:
245
+ message, _, _ = message
246
+ ret += role + message + self.sep
247
+ else:
248
+ ret += role
249
+ return ret
250
+ else:
251
+ raise ValueError(f'Invalid style: {self.sep_style}')
252
+
253
+ def set_system_message(self, system_message: str):
254
+ """Set the system message."""
255
+ self.system_message = system_message
256
+
257
+ def append_message(self, role: str, message: str):
258
+ """Append a new message."""
259
+ self.messages.append([role, message])
260
+
261
+ def update_last_message(self, message: str):
262
+ """Update the last output.
263
+
264
+ The last message is typically set to be None when constructing the prompt,
265
+ so we need to update it in-place after getting the response from a model.
266
+ """
267
+ self.messages[-1][1] = message
268
+
269
+ def to_gradio_chatbot(self):
270
+ """Convert the conversation to gradio chatbot format."""
271
+ ret = []
272
+ for i, (role, msg) in enumerate(self.messages[self.offset :]):
273
+ if i % 2 == 0:
274
+ ret.append([msg, None])
275
+ else:
276
+ ret[-1][-1] = msg
277
+ return ret
278
+
279
+ def to_openai_api_messages(self):
280
+ """Convert the conversation to OpenAI chat completion format."""
281
+ ret = [{'role': 'system', 'content': self.system_message}]
282
+
283
+ for i, (_, msg) in enumerate(self.messages[self.offset :]):
284
+ if i % 2 == 0:
285
+ ret.append({'role': 'user', 'content': msg})
286
+ else:
287
+ if msg is not None:
288
+ ret.append({'role': 'assistant', 'content': msg})
289
+ return ret
290
+
291
+ def copy(self):
292
+ return Conversation(
293
+ name=self.name,
294
+ system_template=self.system_template,
295
+ system_message=self.system_message,
296
+ roles=self.roles,
297
+ messages=[[x, y] for x, y in self.messages],
298
+ offset=self.offset,
299
+ sep_style=self.sep_style,
300
+ sep=self.sep,
301
+ sep2=self.sep2,
302
+ stop_str=self.stop_str,
303
+ stop_token_ids=self.stop_token_ids,
304
+ )
305
+
306
+ def dict(self):
307
+ return {
308
+ 'template_name': self.name,
309
+ 'system_message': self.system_message,
310
+ 'roles': self.roles,
311
+ 'messages': self.messages,
312
+ 'offset': self.offset,
313
+ }
314
+
315
+
316
+ # A global registry for all conversation templates
317
+ conv_templates: Dict[str, Conversation] = {}
318
+
319
+
320
+ def register_conv_template(template: Conversation, override: bool = False):
321
+ """Register a new conversation template."""
322
+ if not override:
323
+ assert (
324
+ template.name not in conv_templates
325
+ ), f'{template.name} has been registered.'
326
+
327
+ conv_templates[template.name] = template
328
+
329
+
330
+ def get_conv_template(name: str) -> Conversation:
331
+ """Get a conversation template."""
332
+ return conv_templates[name].copy()
333
+
334
+
335
+ # Both Hermes-2 and internlm2-chat are chatml-format conversation templates. The difference
336
+ # is that during training, the preprocessing function for the Hermes-2 template doesn't add
337
+ # <s> at the beginning of the tokenized sequence, while the internlm2-chat template does.
338
+ # Therefore, they are completely equivalent during inference.
339
+ register_conv_template(
340
+ Conversation(
341
+ name='Hermes-2',
342
+ system_template='<|im_start|>system\n{system_message}',
343
+ # note: The new system prompt was not used here to avoid changes in benchmark performance.
344
+ # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
345
+ system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
346
+ roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
347
+ sep_style=SeparatorStyle.MPT,
348
+ sep='<|im_end|>',
349
+ stop_str='<|endoftext|>',
350
+ )
351
+ )
352
+
353
+
354
+ register_conv_template(
355
+ Conversation(
356
+ name='internlm2-chat',
357
+ system_template='<|im_start|>system\n{system_message}',
358
+ # note: The new system prompt was not used here to avoid changes in benchmark performance.
359
+ # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
360
+ system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
361
+ roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
362
+ sep_style=SeparatorStyle.MPT,
363
+ sep='<|im_end|>',
364
+ )
365
+ )
366
+
367
+
368
+ register_conv_template(
369
+ Conversation(
370
+ name='phi3-chat',
371
+ system_template='<|system|>\n{system_message}',
372
+ # note: The new system prompt was not used here to avoid changes in benchmark performance.
373
+ # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
374
+ system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
375
+ roles=('<|user|>\n', '<|assistant|>\n'),
376
+ sep_style=SeparatorStyle.MPT,
377
+ sep='<|end|>',
378
+ )
379
+ )
380
+
381
+
382
+ register_conv_template(
383
+ Conversation(
384
+ name='internvl2_5',
385
+ system_template='<|im_start|>system\n{system_message}',
386
+ system_message='你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
387
+ roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
388
+ sep_style=SeparatorStyle.MPT,
389
+ sep='<|im_end|>\n',
390
+ )
391
+ )
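A minimal sketch of the intended usage of these templates: copy one, append a user turn plus an empty assistant slot, then render the prompt string:

# Sketch: build a single-turn prompt with the 'internvl2_5' template registered above.
from conversation import get_conv_template  # assumes this file is on the import path

conv = get_conv_template("internvl2_5")
conv.append_message(conv.roles[0], "<image>\nWhat is shown in this picture?")
conv.append_message(conv.roles[1], None)  # leave the assistant slot open for generation
print(conv.get_prompt())
# -> system block, then '<|im_start|>user\n...<|im_end|>\n<|im_start|>assistant\n'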
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/generation_config.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": 151645,
4
+ "transformers_version": "4.55.4"
5
+ }
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/latest ADDED
@@ -0,0 +1 @@
1
+ global_step228
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/model.safetensors.index.json ADDED
@@ -0,0 +1,693 @@
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 719360,
4
+ "total_size": 15888747520
5
+ },
6
+ "weight_map": {
7
+ "language_model.lm_head.weight": "model-00004-of-00004.safetensors",
8
+ "language_model.model.embed_tokens.weight": "model-00001-of-00004.safetensors",
9
+ "language_model.model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
10
+ "language_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
11
+ "language_model.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
12
+ "language_model.model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
13
+ "language_model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
14
+ "language_model.model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
15
+ "language_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
16
+ "language_model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
17
+ "language_model.model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
18
+ "language_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
19
+ "language_model.model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
20
+ "language_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
21
+ "language_model.model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
22
+ "language_model.model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
23
+ "language_model.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
24
+ "language_model.model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
25
+ "language_model.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
26
+ "language_model.model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
27
+ "language_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
28
+ "language_model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
29
+ "language_model.model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
30
+ "language_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
31
+ "language_model.model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
32
+ "language_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
33
+ "language_model.model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
34
+ "language_model.model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
35
+ "language_model.model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
36
+ "language_model.model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
37
+ "language_model.model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
38
+ "language_model.model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
39
+ "language_model.model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
40
+ "language_model.model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
41
+ "language_model.model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
42
+ "language_model.model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
43
+ "language_model.model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
44
+ "language_model.model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
45
+ "language_model.model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
46
+ "language_model.model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
47
+ "language_model.model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
48
+ "language_model.model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
49
+ "language_model.model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
50
+ "language_model.model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
51
+ "language_model.model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
52
+ "language_model.model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
53
+ "language_model.model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
54
+ "language_model.model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
55
+ "language_model.model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
56
+ "language_model.model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
57
+ "language_model.model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
58
+ "language_model.model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
59
+ "language_model.model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
60
+ "language_model.model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
61
+ "language_model.model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
62
+ "language_model.model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
63
+ "language_model.model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
64
+ "language_model.model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
65
+ "language_model.model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
66
+ "language_model.model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
67
+ "language_model.model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
68
+ "language_model.model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
69
+ "language_model.model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
70
+ "language_model.model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
71
+ "language_model.model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
72
+ "language_model.model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
73
+ "language_model.model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
74
+ "language_model.model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
75
+ "language_model.model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
76
+ "language_model.model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
77
+ "language_model.model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
78
+ "language_model.model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
79
+ "language_model.model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
80
+ "language_model.model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
81
+ "language_model.model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
82
+ "language_model.model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
83
+ "language_model.model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
84
+ "language_model.model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
85
+ "language_model.model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
86
+ "language_model.model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
87
+ "language_model.model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
88
+ "language_model.model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
89
+ "language_model.model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
90
+ "language_model.model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
91
+ "language_model.model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
92
+ "language_model.model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
93
+ "language_model.model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
94
+ "language_model.model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
95
+ "language_model.model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
96
+ "language_model.model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
97
+ "language_model.model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
98
+ "language_model.model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
99
+ "language_model.model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
100
+ "language_model.model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
101
+ "language_model.model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
102
+ "language_model.model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
103
+ "language_model.model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
104
+ "language_model.model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
105
+ "language_model.model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
106
+ "language_model.model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
107
+ "language_model.model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
108
+ "language_model.model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
109
+ "language_model.model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
110
+ "language_model.model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
111
+ "language_model.model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
112
+ "language_model.model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
113
+ "language_model.model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
114
+ "language_model.model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
115
+ "language_model.model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
116
+ "language_model.model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
117
+ "language_model.model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors",
118
+ "language_model.model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
119
+ "language_model.model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
120
+ "language_model.model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
121
+ "language_model.model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
122
+ "language_model.model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
123
+ "language_model.model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
124
+ "language_model.model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
125
+ "language_model.model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
126
+ "language_model.model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
127
+ "language_model.model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
128
+ "language_model.model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
129
+ "language_model.model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
130
+ "language_model.model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
131
+ "language_model.model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
132
+ "language_model.model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
133
+ "language_model.model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
134
+ "language_model.model.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
135
+ "language_model.model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
136
+ "language_model.model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
137
+ "language_model.model.layers.18.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
138
+ "language_model.model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
139
+ "language_model.model.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
140
+ "language_model.model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
141
+ "language_model.model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
142
+ "language_model.model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
143
+ "language_model.model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
144
+ "language_model.model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
145
+ "language_model.model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
146
+ "language_model.model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
147
+ "language_model.model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
148
+ "language_model.model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
149
+ "language_model.model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
150
+ "language_model.model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
151
+ "language_model.model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
152
+ "language_model.model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
153
+ "language_model.model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
154
+ "language_model.model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
155
+ "language_model.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
156
+ "language_model.model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
157
+ "language_model.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
158
+ "language_model.model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
159
+ "language_model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
160
+ "language_model.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
161
+ "language_model.model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
162
+ "language_model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
163
+ "language_model.model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
164
+ "language_model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
165
+ "language_model.model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
166
+ "language_model.model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
167
+ "language_model.model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
168
+ "language_model.model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
169
+ "language_model.model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
170
+ "language_model.model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
171
+ "language_model.model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
172
+ "language_model.model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
173
+ "language_model.model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
174
+ "language_model.model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
175
+ "language_model.model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
176
+ "language_model.model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
177
+ "language_model.model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
178
+ "language_model.model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
179
+ "language_model.model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
180
+ "language_model.model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
181
+ "language_model.model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
182
+ "language_model.model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
183
+ "language_model.model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
184
+ "language_model.model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
185
+ "language_model.model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
186
+ "language_model.model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
187
+ "language_model.model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
188
+ "language_model.model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
189
+ "language_model.model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
190
+ "language_model.model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
191
+ "language_model.model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
192
+ "language_model.model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
193
+ "language_model.model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
194
+ "language_model.model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
195
+ "language_model.model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
196
+ "language_model.model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
197
+ "language_model.model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
198
+ "language_model.model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
199
+ "language_model.model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
200
+ "language_model.model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
201
+ "language_model.model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
202
+ "language_model.model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
203
+ "language_model.model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
204
+ "language_model.model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
205
+ "language_model.model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
206
+ "language_model.model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
207
+ "language_model.model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
208
+ "language_model.model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
209
+ "language_model.model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
210
+ "language_model.model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
211
+ "language_model.model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
212
+ "language_model.model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
213
+ "language_model.model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
214
+ "language_model.model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
215
+ "language_model.model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
216
+ "language_model.model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
217
+ "language_model.model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
218
+ "language_model.model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
219
+ "language_model.model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
220
+ "language_model.model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
221
+ "language_model.model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
222
+ "language_model.model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
223
+ "language_model.model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
224
+ "language_model.model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
225
+ "language_model.model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
226
+ "language_model.model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
227
+ "language_model.model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
228
+ "language_model.model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
229
+ "language_model.model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
230
+ "language_model.model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
231
+ "language_model.model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
232
+ "language_model.model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
233
+ "language_model.model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
234
+ "language_model.model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
235
+ "language_model.model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
236
+ "language_model.model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
237
+ "language_model.model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
238
+ "language_model.model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
239
+ "language_model.model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
240
+ "language_model.model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
241
+ "language_model.model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
242
+ "language_model.model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
243
+ "language_model.model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
244
+ "language_model.model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
245
+ "language_model.model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
246
+ "language_model.model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
247
+ "language_model.model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
248
+ "language_model.model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
249
+ "language_model.model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
250
+ "language_model.model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
251
+ "language_model.model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
252
+ "language_model.model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
253
+ "language_model.model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
254
+ "language_model.model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
255
+ "language_model.model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
256
+ "language_model.model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
257
+ "language_model.model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
258
+ "language_model.model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
259
+ "language_model.model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
260
+ "language_model.model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
261
+ "language_model.model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
262
+ "language_model.model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
263
+ "language_model.model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
264
+ "language_model.model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
265
+ "language_model.model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
266
+ "language_model.model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
267
+ "language_model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
268
+ "language_model.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
269
+ "language_model.model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
270
+ "language_model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
271
+ "language_model.model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
272
+ "language_model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
273
+ "language_model.model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
274
+ "language_model.model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
275
+ "language_model.model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
276
+ "language_model.model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
277
+ "language_model.model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
278
+ "language_model.model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
279
+ "language_model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
280
+ "language_model.model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
281
+ "language_model.model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
282
+ "language_model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
283
+ "language_model.model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
284
+ "language_model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
285
+ "language_model.model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
286
+ "language_model.model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
287
+ "language_model.model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
288
+ "language_model.model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
289
+ "language_model.model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
290
+ "language_model.model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
291
+ "language_model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
292
+ "language_model.model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
293
+ "language_model.model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
294
+ "language_model.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
295
+ "language_model.model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
296
+ "language_model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
297
+ "language_model.model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
298
+ "language_model.model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
299
+ "language_model.model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
300
+ "language_model.model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
301
+ "language_model.model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
302
+ "language_model.model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
303
+ "language_model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
304
+ "language_model.model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
305
+ "language_model.model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
306
+ "language_model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
307
+ "language_model.model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
308
+ "language_model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
309
+ "language_model.model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
310
+ "language_model.model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
311
+ "language_model.model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
312
+ "language_model.model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
313
+ "language_model.model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
314
+ "language_model.model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
315
+ "language_model.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
316
+ "language_model.model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
317
+ "language_model.model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
318
+ "language_model.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
319
+ "language_model.model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
320
+ "language_model.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
321
+ "language_model.model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
322
+ "language_model.model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
323
+ "language_model.model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
324
+ "language_model.model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
325
+ "language_model.model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
326
+ "language_model.model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
327
+ "language_model.model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
328
+ "language_model.model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
329
+ "language_model.model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
330
+ "language_model.model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
331
+ "language_model.model.layers.8.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
332
+ "language_model.model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
333
+ "language_model.model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
334
+ "language_model.model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
335
+ "language_model.model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
336
+ "language_model.model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
337
+ "language_model.model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
338
+ "language_model.model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
339
+ "language_model.model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
340
+ "language_model.model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
341
+ "language_model.model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
342
+ "language_model.model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
343
+ "language_model.model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
344
+ "language_model.model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
345
+ "language_model.model.norm.weight": "model-00003-of-00004.safetensors",
346
+ "mlp1.0.bias": "model-00004-of-00004.safetensors",
347
+ "mlp1.0.weight": "model-00004-of-00004.safetensors",
348
+ "mlp1.1.bias": "model-00004-of-00004.safetensors",
349
+ "mlp1.1.weight": "model-00004-of-00004.safetensors",
350
+ "mlp1.3.bias": "model-00004-of-00004.safetensors",
351
+ "mlp1.3.weight": "model-00004-of-00004.safetensors",
352
+ "vision_model.embeddings.class_embedding": "model-00001-of-00004.safetensors",
353
+ "vision_model.embeddings.patch_embedding.bias": "model-00001-of-00004.safetensors",
354
+ "vision_model.embeddings.patch_embedding.weight": "model-00001-of-00004.safetensors",
355
+ "vision_model.embeddings.position_embedding": "model-00001-of-00004.safetensors",
356
+ "vision_model.encoder.layers.0.attn.proj.bias": "model-00001-of-00004.safetensors",
357
+ "vision_model.encoder.layers.0.attn.proj.weight": "model-00001-of-00004.safetensors",
358
+ "vision_model.encoder.layers.0.attn.qkv.bias": "model-00001-of-00004.safetensors",
359
+ "vision_model.encoder.layers.0.attn.qkv.weight": "model-00001-of-00004.safetensors",
360
+ "vision_model.encoder.layers.0.ls1": "model-00001-of-00004.safetensors",
361
+ "vision_model.encoder.layers.0.ls2": "model-00001-of-00004.safetensors",
362
+ "vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00004.safetensors",
363
+ "vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00004.safetensors",
364
+ "vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00004.safetensors",
365
+ "vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00004.safetensors",
366
+ "vision_model.encoder.layers.0.norm1.bias": "model-00001-of-00004.safetensors",
367
+ "vision_model.encoder.layers.0.norm1.weight": "model-00001-of-00004.safetensors",
368
+ "vision_model.encoder.layers.0.norm2.bias": "model-00001-of-00004.safetensors",
369
+ "vision_model.encoder.layers.0.norm2.weight": "model-00001-of-00004.safetensors",
370
+ "vision_model.encoder.layers.1.attn.proj.bias": "model-00001-of-00004.safetensors",
371
+ "vision_model.encoder.layers.1.attn.proj.weight": "model-00001-of-00004.safetensors",
372
+ "vision_model.encoder.layers.1.attn.qkv.bias": "model-00001-of-00004.safetensors",
373
+ "vision_model.encoder.layers.1.attn.qkv.weight": "model-00001-of-00004.safetensors",
374
+ "vision_model.encoder.layers.1.ls1": "model-00001-of-00004.safetensors",
375
+ "vision_model.encoder.layers.1.ls2": "model-00001-of-00004.safetensors",
376
+ "vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00004.safetensors",
377
+ "vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00004.safetensors",
378
+ "vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00004.safetensors",
379
+ "vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00004.safetensors",
380
+ "vision_model.encoder.layers.1.norm1.bias": "model-00001-of-00004.safetensors",
381
+ "vision_model.encoder.layers.1.norm1.weight": "model-00001-of-00004.safetensors",
382
+ "vision_model.encoder.layers.1.norm2.bias": "model-00001-of-00004.safetensors",
383
+ "vision_model.encoder.layers.1.norm2.weight": "model-00001-of-00004.safetensors",
384
+ "vision_model.encoder.layers.10.attn.proj.bias": "model-00001-of-00004.safetensors",
385
+ "vision_model.encoder.layers.10.attn.proj.weight": "model-00001-of-00004.safetensors",
386
+ "vision_model.encoder.layers.10.attn.qkv.bias": "model-00001-of-00004.safetensors",
387
+ "vision_model.encoder.layers.10.attn.qkv.weight": "model-00001-of-00004.safetensors",
388
+ "vision_model.encoder.layers.10.ls1": "model-00001-of-00004.safetensors",
389
+ "vision_model.encoder.layers.10.ls2": "model-00001-of-00004.safetensors",
390
+ "vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00004.safetensors",
391
+ "vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00004.safetensors",
392
+ "vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00004.safetensors",
393
+ "vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00004.safetensors",
394
+ "vision_model.encoder.layers.10.norm1.bias": "model-00001-of-00004.safetensors",
395
+ "vision_model.encoder.layers.10.norm1.weight": "model-00001-of-00004.safetensors",
396
+ "vision_model.encoder.layers.10.norm2.bias": "model-00001-of-00004.safetensors",
397
+ "vision_model.encoder.layers.10.norm2.weight": "model-00001-of-00004.safetensors",
398
+ "vision_model.encoder.layers.11.attn.proj.bias": "model-00001-of-00004.safetensors",
399
+ "vision_model.encoder.layers.11.attn.proj.weight": "model-00001-of-00004.safetensors",
400
+ "vision_model.encoder.layers.11.attn.qkv.bias": "model-00001-of-00004.safetensors",
401
+ "vision_model.encoder.layers.11.attn.qkv.weight": "model-00001-of-00004.safetensors",
402
+ "vision_model.encoder.layers.11.ls1": "model-00001-of-00004.safetensors",
403
+ "vision_model.encoder.layers.11.ls2": "model-00001-of-00004.safetensors",
404
+ "vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00004.safetensors",
405
+ "vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00004.safetensors",
406
+ "vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00004.safetensors",
407
+ "vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00004.safetensors",
408
+ "vision_model.encoder.layers.11.norm1.bias": "model-00001-of-00004.safetensors",
409
+ "vision_model.encoder.layers.11.norm1.weight": "model-00001-of-00004.safetensors",
410
+ "vision_model.encoder.layers.11.norm2.bias": "model-00001-of-00004.safetensors",
411
+ "vision_model.encoder.layers.11.norm2.weight": "model-00001-of-00004.safetensors",
412
+ "vision_model.encoder.layers.12.attn.proj.bias": "model-00001-of-00004.safetensors",
413
+ "vision_model.encoder.layers.12.attn.proj.weight": "model-00001-of-00004.safetensors",
414
+ "vision_model.encoder.layers.12.attn.qkv.bias": "model-00001-of-00004.safetensors",
415
+ "vision_model.encoder.layers.12.attn.qkv.weight": "model-00001-of-00004.safetensors",
416
+ "vision_model.encoder.layers.12.ls1": "model-00001-of-00004.safetensors",
417
+ "vision_model.encoder.layers.12.ls2": "model-00001-of-00004.safetensors",
418
+ "vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00004.safetensors",
419
+ "vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00004.safetensors",
420
+ "vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00004.safetensors",
421
+ "vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00004.safetensors",
422
+ "vision_model.encoder.layers.12.norm1.bias": "model-00001-of-00004.safetensors",
423
+ "vision_model.encoder.layers.12.norm1.weight": "model-00001-of-00004.safetensors",
424
+ "vision_model.encoder.layers.12.norm2.bias": "model-00001-of-00004.safetensors",
425
+ "vision_model.encoder.layers.12.norm2.weight": "model-00001-of-00004.safetensors",
426
+ "vision_model.encoder.layers.13.attn.proj.bias": "model-00001-of-00004.safetensors",
427
+ "vision_model.encoder.layers.13.attn.proj.weight": "model-00001-of-00004.safetensors",
428
+ "vision_model.encoder.layers.13.attn.qkv.bias": "model-00001-of-00004.safetensors",
429
+ "vision_model.encoder.layers.13.attn.qkv.weight": "model-00001-of-00004.safetensors",
430
+ "vision_model.encoder.layers.13.ls1": "model-00001-of-00004.safetensors",
431
+ "vision_model.encoder.layers.13.ls2": "model-00001-of-00004.safetensors",
432
+ "vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00004.safetensors",
433
+ "vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00004.safetensors",
434
+ "vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00004.safetensors",
435
+ "vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00004.safetensors",
436
+ "vision_model.encoder.layers.13.norm1.bias": "model-00001-of-00004.safetensors",
437
+ "vision_model.encoder.layers.13.norm1.weight": "model-00001-of-00004.safetensors",
438
+ "vision_model.encoder.layers.13.norm2.bias": "model-00001-of-00004.safetensors",
439
+ "vision_model.encoder.layers.13.norm2.weight": "model-00001-of-00004.safetensors",
440
+ "vision_model.encoder.layers.14.attn.proj.bias": "model-00001-of-00004.safetensors",
441
+ "vision_model.encoder.layers.14.attn.proj.weight": "model-00001-of-00004.safetensors",
442
+ "vision_model.encoder.layers.14.attn.qkv.bias": "model-00001-of-00004.safetensors",
443
+ "vision_model.encoder.layers.14.attn.qkv.weight": "model-00001-of-00004.safetensors",
444
+ "vision_model.encoder.layers.14.ls1": "model-00001-of-00004.safetensors",
445
+ "vision_model.encoder.layers.14.ls2": "model-00001-of-00004.safetensors",
446
+ "vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00004.safetensors",
447
+ "vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00004.safetensors",
448
+ "vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00004.safetensors",
449
+ "vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00004.safetensors",
450
+ "vision_model.encoder.layers.14.norm1.bias": "model-00001-of-00004.safetensors",
451
+ "vision_model.encoder.layers.14.norm1.weight": "model-00001-of-00004.safetensors",
452
+ "vision_model.encoder.layers.14.norm2.bias": "model-00001-of-00004.safetensors",
453
+ "vision_model.encoder.layers.14.norm2.weight": "model-00001-of-00004.safetensors",
454
+ "vision_model.encoder.layers.15.attn.proj.bias": "model-00001-of-00004.safetensors",
455
+ "vision_model.encoder.layers.15.attn.proj.weight": "model-00001-of-00004.safetensors",
456
+ "vision_model.encoder.layers.15.attn.qkv.bias": "model-00001-of-00004.safetensors",
457
+ "vision_model.encoder.layers.15.attn.qkv.weight": "model-00001-of-00004.safetensors",
458
+ "vision_model.encoder.layers.15.ls1": "model-00001-of-00004.safetensors",
459
+ "vision_model.encoder.layers.15.ls2": "model-00001-of-00004.safetensors",
460
+ "vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00004.safetensors",
461
+ "vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00004.safetensors",
462
+ "vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00004.safetensors",
463
+ "vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00004.safetensors",
464
+ "vision_model.encoder.layers.15.norm1.bias": "model-00001-of-00004.safetensors",
465
+ "vision_model.encoder.layers.15.norm1.weight": "model-00001-of-00004.safetensors",
466
+ "vision_model.encoder.layers.15.norm2.bias": "model-00001-of-00004.safetensors",
467
+ "vision_model.encoder.layers.15.norm2.weight": "model-00001-of-00004.safetensors",
468
+ "vision_model.encoder.layers.16.attn.proj.bias": "model-00001-of-00004.safetensors",
469
+ "vision_model.encoder.layers.16.attn.proj.weight": "model-00001-of-00004.safetensors",
470
+ "vision_model.encoder.layers.16.attn.qkv.bias": "model-00001-of-00004.safetensors",
471
+ "vision_model.encoder.layers.16.attn.qkv.weight": "model-00001-of-00004.safetensors",
472
+ "vision_model.encoder.layers.16.ls1": "model-00001-of-00004.safetensors",
473
+ "vision_model.encoder.layers.16.ls2": "model-00001-of-00004.safetensors",
474
+ "vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00004.safetensors",
475
+ "vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00004.safetensors",
476
+ "vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00004.safetensors",
477
+ "vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00004.safetensors",
478
+ "vision_model.encoder.layers.16.norm1.bias": "model-00001-of-00004.safetensors",
479
+ "vision_model.encoder.layers.16.norm1.weight": "model-00001-of-00004.safetensors",
480
+ "vision_model.encoder.layers.16.norm2.bias": "model-00001-of-00004.safetensors",
481
+ "vision_model.encoder.layers.16.norm2.weight": "model-00001-of-00004.safetensors",
482
+ "vision_model.encoder.layers.17.attn.proj.bias": "model-00001-of-00004.safetensors",
483
+ "vision_model.encoder.layers.17.attn.proj.weight": "model-00001-of-00004.safetensors",
484
+ "vision_model.encoder.layers.17.attn.qkv.bias": "model-00001-of-00004.safetensors",
485
+ "vision_model.encoder.layers.17.attn.qkv.weight": "model-00001-of-00004.safetensors",
486
+ "vision_model.encoder.layers.17.ls1": "model-00001-of-00004.safetensors",
487
+ "vision_model.encoder.layers.17.ls2": "model-00001-of-00004.safetensors",
488
+ "vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00004.safetensors",
489
+ "vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00004.safetensors",
490
+ "vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00004.safetensors",
491
+ "vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00004.safetensors",
492
+ "vision_model.encoder.layers.17.norm1.bias": "model-00001-of-00004.safetensors",
493
+ "vision_model.encoder.layers.17.norm1.weight": "model-00001-of-00004.safetensors",
494
+ "vision_model.encoder.layers.17.norm2.bias": "model-00001-of-00004.safetensors",
495
+ "vision_model.encoder.layers.17.norm2.weight": "model-00001-of-00004.safetensors",
496
+ "vision_model.encoder.layers.18.attn.proj.bias": "model-00001-of-00004.safetensors",
497
+ "vision_model.encoder.layers.18.attn.proj.weight": "model-00001-of-00004.safetensors",
498
+ "vision_model.encoder.layers.18.attn.qkv.bias": "model-00001-of-00004.safetensors",
499
+ "vision_model.encoder.layers.18.attn.qkv.weight": "model-00001-of-00004.safetensors",
500
+ "vision_model.encoder.layers.18.ls1": "model-00001-of-00004.safetensors",
501
+ "vision_model.encoder.layers.18.ls2": "model-00001-of-00004.safetensors",
502
+ "vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00004.safetensors",
503
+ "vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00004.safetensors",
504
+ "vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00004.safetensors",
505
+ "vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00004.safetensors",
506
+ "vision_model.encoder.layers.18.norm1.bias": "model-00001-of-00004.safetensors",
507
+ "vision_model.encoder.layers.18.norm1.weight": "model-00001-of-00004.safetensors",
508
+ "vision_model.encoder.layers.18.norm2.bias": "model-00001-of-00004.safetensors",
509
+ "vision_model.encoder.layers.18.norm2.weight": "model-00001-of-00004.safetensors",
510
+ "vision_model.encoder.layers.19.attn.proj.bias": "model-00001-of-00004.safetensors",
511
+ "vision_model.encoder.layers.19.attn.proj.weight": "model-00001-of-00004.safetensors",
512
+ "vision_model.encoder.layers.19.attn.qkv.bias": "model-00001-of-00004.safetensors",
513
+ "vision_model.encoder.layers.19.attn.qkv.weight": "model-00001-of-00004.safetensors",
514
+ "vision_model.encoder.layers.19.ls1": "model-00001-of-00004.safetensors",
515
+ "vision_model.encoder.layers.19.ls2": "model-00001-of-00004.safetensors",
516
+ "vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00004.safetensors",
517
+ "vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00004.safetensors",
518
+ "vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00004.safetensors",
519
+ "vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00004.safetensors",
520
+ "vision_model.encoder.layers.19.norm1.bias": "model-00001-of-00004.safetensors",
521
+ "vision_model.encoder.layers.19.norm1.weight": "model-00001-of-00004.safetensors",
522
+ "vision_model.encoder.layers.19.norm2.bias": "model-00001-of-00004.safetensors",
523
+ "vision_model.encoder.layers.19.norm2.weight": "model-00001-of-00004.safetensors",
524
+ "vision_model.encoder.layers.2.attn.proj.bias": "model-00001-of-00004.safetensors",
525
+ "vision_model.encoder.layers.2.attn.proj.weight": "model-00001-of-00004.safetensors",
526
+ "vision_model.encoder.layers.2.attn.qkv.bias": "model-00001-of-00004.safetensors",
527
+ "vision_model.encoder.layers.2.attn.qkv.weight": "model-00001-of-00004.safetensors",
528
+ "vision_model.encoder.layers.2.ls1": "model-00001-of-00004.safetensors",
529
+ "vision_model.encoder.layers.2.ls2": "model-00001-of-00004.safetensors",
530
+ "vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00004.safetensors",
531
+ "vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00004.safetensors",
532
+ "vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00004.safetensors",
533
+ "vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00004.safetensors",
534
+ "vision_model.encoder.layers.2.norm1.bias": "model-00001-of-00004.safetensors",
535
+ "vision_model.encoder.layers.2.norm1.weight": "model-00001-of-00004.safetensors",
536
+ "vision_model.encoder.layers.2.norm2.bias": "model-00001-of-00004.safetensors",
537
+ "vision_model.encoder.layers.2.norm2.weight": "model-00001-of-00004.safetensors",
538
+ "vision_model.encoder.layers.20.attn.proj.bias": "model-00001-of-00004.safetensors",
539
+ "vision_model.encoder.layers.20.attn.proj.weight": "model-00001-of-00004.safetensors",
540
+ "vision_model.encoder.layers.20.attn.qkv.bias": "model-00001-of-00004.safetensors",
541
+ "vision_model.encoder.layers.20.attn.qkv.weight": "model-00001-of-00004.safetensors",
542
+ "vision_model.encoder.layers.20.ls1": "model-00001-of-00004.safetensors",
543
+ "vision_model.encoder.layers.20.ls2": "model-00001-of-00004.safetensors",
544
+ "vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00004.safetensors",
545
+ "vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00004.safetensors",
546
+ "vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00004.safetensors",
547
+ "vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00004.safetensors",
548
+ "vision_model.encoder.layers.20.norm1.bias": "model-00001-of-00004.safetensors",
549
+ "vision_model.encoder.layers.20.norm1.weight": "model-00001-of-00004.safetensors",
550
+ "vision_model.encoder.layers.20.norm2.bias": "model-00001-of-00004.safetensors",
551
+ "vision_model.encoder.layers.20.norm2.weight": "model-00001-of-00004.safetensors",
552
+ "vision_model.encoder.layers.21.attn.proj.bias": "model-00001-of-00004.safetensors",
553
+ "vision_model.encoder.layers.21.attn.proj.weight": "model-00001-of-00004.safetensors",
554
+ "vision_model.encoder.layers.21.attn.qkv.bias": "model-00001-of-00004.safetensors",
555
+ "vision_model.encoder.layers.21.attn.qkv.weight": "model-00001-of-00004.safetensors",
556
+ "vision_model.encoder.layers.21.ls1": "model-00001-of-00004.safetensors",
557
+ "vision_model.encoder.layers.21.ls2": "model-00001-of-00004.safetensors",
558
+ "vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00004.safetensors",
559
+ "vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00004.safetensors",
560
+ "vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00004.safetensors",
561
+ "vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00004.safetensors",
562
+ "vision_model.encoder.layers.21.norm1.bias": "model-00001-of-00004.safetensors",
563
+ "vision_model.encoder.layers.21.norm1.weight": "model-00001-of-00004.safetensors",
564
+ "vision_model.encoder.layers.21.norm2.bias": "model-00001-of-00004.safetensors",
565
+ "vision_model.encoder.layers.21.norm2.weight": "model-00001-of-00004.safetensors",
566
+ "vision_model.encoder.layers.22.attn.proj.bias": "model-00001-of-00004.safetensors",
567
+ "vision_model.encoder.layers.22.attn.proj.weight": "model-00001-of-00004.safetensors",
568
+ "vision_model.encoder.layers.22.attn.qkv.bias": "model-00001-of-00004.safetensors",
569
+ "vision_model.encoder.layers.22.attn.qkv.weight": "model-00001-of-00004.safetensors",
570
+ "vision_model.encoder.layers.22.ls1": "model-00001-of-00004.safetensors",
571
+ "vision_model.encoder.layers.22.ls2": "model-00001-of-00004.safetensors",
572
+ "vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00004.safetensors",
573
+ "vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00004.safetensors",
574
+ "vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00004.safetensors",
575
+ "vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00004.safetensors",
576
+ "vision_model.encoder.layers.22.norm1.bias": "model-00001-of-00004.safetensors",
577
+ "vision_model.encoder.layers.22.norm1.weight": "model-00001-of-00004.safetensors",
578
+ "vision_model.encoder.layers.22.norm2.bias": "model-00001-of-00004.safetensors",
579
+ "vision_model.encoder.layers.22.norm2.weight": "model-00001-of-00004.safetensors",
580
+ "vision_model.encoder.layers.23.attn.proj.bias": "model-00001-of-00004.safetensors",
581
+ "vision_model.encoder.layers.23.attn.proj.weight": "model-00001-of-00004.safetensors",
582
+ "vision_model.encoder.layers.23.attn.qkv.bias": "model-00001-of-00004.safetensors",
583
+ "vision_model.encoder.layers.23.attn.qkv.weight": "model-00001-of-00004.safetensors",
584
+ "vision_model.encoder.layers.23.ls1": "model-00001-of-00004.safetensors",
585
+ "vision_model.encoder.layers.23.ls2": "model-00001-of-00004.safetensors",
586
+ "vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00004.safetensors",
587
+ "vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00004.safetensors",
588
+ "vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00004.safetensors",
589
+ "vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00004.safetensors",
590
+ "vision_model.encoder.layers.23.norm1.bias": "model-00001-of-00004.safetensors",
591
+ "vision_model.encoder.layers.23.norm1.weight": "model-00001-of-00004.safetensors",
592
+ "vision_model.encoder.layers.23.norm2.bias": "model-00001-of-00004.safetensors",
593
+ "vision_model.encoder.layers.23.norm2.weight": "model-00001-of-00004.safetensors",
594
+ "vision_model.encoder.layers.3.attn.proj.bias": "model-00001-of-00004.safetensors",
595
+ "vision_model.encoder.layers.3.attn.proj.weight": "model-00001-of-00004.safetensors",
596
+ "vision_model.encoder.layers.3.attn.qkv.bias": "model-00001-of-00004.safetensors",
597
+ "vision_model.encoder.layers.3.attn.qkv.weight": "model-00001-of-00004.safetensors",
598
+ "vision_model.encoder.layers.3.ls1": "model-00001-of-00004.safetensors",
599
+ "vision_model.encoder.layers.3.ls2": "model-00001-of-00004.safetensors",
600
+ "vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00004.safetensors",
601
+ "vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00004.safetensors",
602
+ "vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00004.safetensors",
603
+ "vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00004.safetensors",
604
+ "vision_model.encoder.layers.3.norm1.bias": "model-00001-of-00004.safetensors",
605
+ "vision_model.encoder.layers.3.norm1.weight": "model-00001-of-00004.safetensors",
606
+ "vision_model.encoder.layers.3.norm2.bias": "model-00001-of-00004.safetensors",
607
+ "vision_model.encoder.layers.3.norm2.weight": "model-00001-of-00004.safetensors",
608
+ "vision_model.encoder.layers.4.attn.proj.bias": "model-00001-of-00004.safetensors",
609
+ "vision_model.encoder.layers.4.attn.proj.weight": "model-00001-of-00004.safetensors",
610
+ "vision_model.encoder.layers.4.attn.qkv.bias": "model-00001-of-00004.safetensors",
611
+ "vision_model.encoder.layers.4.attn.qkv.weight": "model-00001-of-00004.safetensors",
612
+ "vision_model.encoder.layers.4.ls1": "model-00001-of-00004.safetensors",
613
+ "vision_model.encoder.layers.4.ls2": "model-00001-of-00004.safetensors",
614
+ "vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00004.safetensors",
615
+ "vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00004.safetensors",
616
+ "vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00004.safetensors",
617
+ "vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00004.safetensors",
618
+ "vision_model.encoder.layers.4.norm1.bias": "model-00001-of-00004.safetensors",
619
+ "vision_model.encoder.layers.4.norm1.weight": "model-00001-of-00004.safetensors",
620
+ "vision_model.encoder.layers.4.norm2.bias": "model-00001-of-00004.safetensors",
621
+ "vision_model.encoder.layers.4.norm2.weight": "model-00001-of-00004.safetensors",
622
+ "vision_model.encoder.layers.5.attn.proj.bias": "model-00001-of-00004.safetensors",
623
+ "vision_model.encoder.layers.5.attn.proj.weight": "model-00001-of-00004.safetensors",
624
+ "vision_model.encoder.layers.5.attn.qkv.bias": "model-00001-of-00004.safetensors",
625
+ "vision_model.encoder.layers.5.attn.qkv.weight": "model-00001-of-00004.safetensors",
626
+ "vision_model.encoder.layers.5.ls1": "model-00001-of-00004.safetensors",
627
+ "vision_model.encoder.layers.5.ls2": "model-00001-of-00004.safetensors",
628
+ "vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00004.safetensors",
629
+ "vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00004.safetensors",
630
+ "vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00004.safetensors",
631
+ "vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00004.safetensors",
632
+ "vision_model.encoder.layers.5.norm1.bias": "model-00001-of-00004.safetensors",
633
+ "vision_model.encoder.layers.5.norm1.weight": "model-00001-of-00004.safetensors",
634
+ "vision_model.encoder.layers.5.norm2.bias": "model-00001-of-00004.safetensors",
635
+ "vision_model.encoder.layers.5.norm2.weight": "model-00001-of-00004.safetensors",
636
+ "vision_model.encoder.layers.6.attn.proj.bias": "model-00001-of-00004.safetensors",
637
+ "vision_model.encoder.layers.6.attn.proj.weight": "model-00001-of-00004.safetensors",
638
+ "vision_model.encoder.layers.6.attn.qkv.bias": "model-00001-of-00004.safetensors",
639
+ "vision_model.encoder.layers.6.attn.qkv.weight": "model-00001-of-00004.safetensors",
640
+ "vision_model.encoder.layers.6.ls1": "model-00001-of-00004.safetensors",
641
+ "vision_model.encoder.layers.6.ls2": "model-00001-of-00004.safetensors",
642
+ "vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00004.safetensors",
643
+ "vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00004.safetensors",
644
+ "vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00004.safetensors",
645
+ "vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00004.safetensors",
646
+ "vision_model.encoder.layers.6.norm1.bias": "model-00001-of-00004.safetensors",
647
+ "vision_model.encoder.layers.6.norm1.weight": "model-00001-of-00004.safetensors",
648
+ "vision_model.encoder.layers.6.norm2.bias": "model-00001-of-00004.safetensors",
649
+ "vision_model.encoder.layers.6.norm2.weight": "model-00001-of-00004.safetensors",
650
+ "vision_model.encoder.layers.7.attn.proj.bias": "model-00001-of-00004.safetensors",
651
+ "vision_model.encoder.layers.7.attn.proj.weight": "model-00001-of-00004.safetensors",
652
+ "vision_model.encoder.layers.7.attn.qkv.bias": "model-00001-of-00004.safetensors",
653
+ "vision_model.encoder.layers.7.attn.qkv.weight": "model-00001-of-00004.safetensors",
654
+ "vision_model.encoder.layers.7.ls1": "model-00001-of-00004.safetensors",
655
+ "vision_model.encoder.layers.7.ls2": "model-00001-of-00004.safetensors",
656
+ "vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00004.safetensors",
657
+ "vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00004.safetensors",
658
+ "vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00004.safetensors",
659
+ "vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00004.safetensors",
660
+ "vision_model.encoder.layers.7.norm1.bias": "model-00001-of-00004.safetensors",
661
+ "vision_model.encoder.layers.7.norm1.weight": "model-00001-of-00004.safetensors",
662
+ "vision_model.encoder.layers.7.norm2.bias": "model-00001-of-00004.safetensors",
663
+ "vision_model.encoder.layers.7.norm2.weight": "model-00001-of-00004.safetensors",
664
+ "vision_model.encoder.layers.8.attn.proj.bias": "model-00001-of-00004.safetensors",
665
+ "vision_model.encoder.layers.8.attn.proj.weight": "model-00001-of-00004.safetensors",
666
+ "vision_model.encoder.layers.8.attn.qkv.bias": "model-00001-of-00004.safetensors",
667
+ "vision_model.encoder.layers.8.attn.qkv.weight": "model-00001-of-00004.safetensors",
668
+ "vision_model.encoder.layers.8.ls1": "model-00001-of-00004.safetensors",
669
+ "vision_model.encoder.layers.8.ls2": "model-00001-of-00004.safetensors",
670
+ "vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00004.safetensors",
671
+ "vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00004.safetensors",
672
+ "vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00004.safetensors",
673
+ "vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00004.safetensors",
674
+ "vision_model.encoder.layers.8.norm1.bias": "model-00001-of-00004.safetensors",
675
+ "vision_model.encoder.layers.8.norm1.weight": "model-00001-of-00004.safetensors",
676
+ "vision_model.encoder.layers.8.norm2.bias": "model-00001-of-00004.safetensors",
677
+ "vision_model.encoder.layers.8.norm2.weight": "model-00001-of-00004.safetensors",
678
+ "vision_model.encoder.layers.9.attn.proj.bias": "model-00001-of-00004.safetensors",
679
+ "vision_model.encoder.layers.9.attn.proj.weight": "model-00001-of-00004.safetensors",
680
+ "vision_model.encoder.layers.9.attn.qkv.bias": "model-00001-of-00004.safetensors",
681
+ "vision_model.encoder.layers.9.attn.qkv.weight": "model-00001-of-00004.safetensors",
682
+ "vision_model.encoder.layers.9.ls1": "model-00001-of-00004.safetensors",
683
+ "vision_model.encoder.layers.9.ls2": "model-00001-of-00004.safetensors",
684
+ "vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00004.safetensors",
685
+ "vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00004.safetensors",
686
+ "vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00004.safetensors",
687
+ "vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00004.safetensors",
688
+ "vision_model.encoder.layers.9.norm1.bias": "model-00001-of-00004.safetensors",
689
+ "vision_model.encoder.layers.9.norm1.weight": "model-00001-of-00004.safetensors",
690
+ "vision_model.encoder.layers.9.norm2.bias": "model-00001-of-00004.safetensors",
691
+ "vision_model.encoder.layers.9.norm2.weight": "model-00001-of-00004.safetensors"
692
+ }
693
+ }
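For orientation, the weight_map above pins every parameter name to one of the four safetensors shards, so a loader only needs to open the shards the index references. Below is a minimal sketch of consuming such an index with the standard json and safetensors libraries; the relative file path is an assumption, and in practice loading goes through transformers' from_pretrained rather than this manual route.

import json
from safetensors.torch import load_file

# Read the index that maps parameter names to shard files (path assumed relative to the checkpoint dir).
with open("model.safetensors.index.json") as f:
    index = json.load(f)

state_dict = {}
for shard in sorted(set(index["weight_map"].values())):
    # Each shard holds the subset of tensors that weight_map assigns to it.
    state_dict.update(load_file(shard))

# Sanity check: every mapped parameter should now be present.
missing = [k for k in index["weight_map"] if k not in state_dict]
assert not missing, f"missing tensors: {missing[:5]}"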
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/modeling_intern_vit.py ADDED
@@ -0,0 +1,431 @@
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2024 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+
7
+ from typing import Optional, Tuple, Union
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ import torch.utils.checkpoint
12
+ from einops import rearrange
13
+ from timm.layers import DropPath
14
+ from torch import nn
15
+ from transformers.activations import ACT2FN
16
+ from transformers.modeling_outputs import (BaseModelOutput,
17
+ BaseModelOutputWithPooling)
18
+ from transformers.modeling_utils import PreTrainedModel
19
+ from transformers.utils import logging
20
+
21
+ from .configuration_intern_vit import InternVisionConfig
22
+
23
+ try:
24
+ from flash_attn.bert_padding import pad_input, unpad_input
25
+ from flash_attn.flash_attn_interface import \
26
+ flash_attn_varlen_qkvpacked_func
27
+ has_flash_attn = True
28
+ except Exception:
29
+ print('FlashAttention2 is not installed.')
30
+ has_flash_attn = False
31
+
32
+ logger = logging.get_logger(__name__)
33
+
34
+
35
+ class FlashAttention(nn.Module):
36
+ """Implement the scaled dot product attention with softmax.
37
+ Arguments
38
+ ---------
39
+ softmax_scale: The temperature to use for the softmax attention.
40
+ (default: 1/sqrt(d_keys) where d_keys is computed at
41
+ runtime)
42
+ attention_dropout: The dropout rate to apply to the attention
43
+ (default: 0.0)
44
+ """
45
+
46
+ def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None):
47
+ super().__init__()
48
+ self.softmax_scale = softmax_scale
49
+ self.dropout_p = attention_dropout
50
+
51
+ def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None,
52
+ max_s=None, need_weights=False):
53
+ """Implements the multihead softmax attention.
54
+ Arguments
55
+ ---------
56
+ qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
57
+ if unpadded: (nnz, 3, h, d)
58
+ key_padding_mask: a bool tensor of shape (B, S)
59
+ """
60
+ assert not need_weights
61
+ assert qkv.dtype in [torch.float16, torch.bfloat16]
62
+ assert qkv.is_cuda
63
+
64
+ if cu_seqlens is None:
65
+ batch_size = qkv.shape[0]
66
+ seqlen = qkv.shape[1]
67
+ if key_padding_mask is None:
68
+ qkv = rearrange(qkv, 'b s ... -> (b s) ...')
69
+ max_s = seqlen
70
+ cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
71
+ device=qkv.device)
72
+ output = flash_attn_varlen_qkvpacked_func(
73
+ qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
74
+ softmax_scale=self.softmax_scale, causal=causal
75
+ )
76
+ output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
77
+ else:
78
+ nheads = qkv.shape[-2]
79
+ x = rearrange(qkv, 'b s three h d -> b s (three h d)')
80
+ x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
81
+ x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
82
+ output_unpad = flash_attn_varlen_qkvpacked_func(
83
+ x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
84
+ softmax_scale=self.softmax_scale, causal=causal
85
+ )
86
+ output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
87
+ indices, batch_size, seqlen),
88
+ 'b s (h d) -> b s h d', h=nheads)
89
+ else:
90
+ assert max_s is not None
91
+ output = flash_attn_varlen_qkvpacked_func(
92
+ qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
93
+ softmax_scale=self.softmax_scale, causal=causal
94
+ )
95
+
96
+ return output, None
97
+
98
+
99
+ class InternRMSNorm(nn.Module):
100
+ def __init__(self, hidden_size, eps=1e-6):
101
+ super().__init__()
102
+ self.weight = nn.Parameter(torch.ones(hidden_size))
103
+ self.variance_epsilon = eps
104
+
105
+ def forward(self, hidden_states):
106
+ input_dtype = hidden_states.dtype
107
+ hidden_states = hidden_states.to(torch.float32)
108
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
109
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
110
+ return self.weight * hidden_states.to(input_dtype)
111
+
112
+
113
+ try:
114
+ from apex.normalization import FusedRMSNorm
115
+
116
+ InternRMSNorm = FusedRMSNorm # noqa
117
+
118
+ logger.info('Discovered apex.normalization.FusedRMSNorm - will use it instead of InternRMSNorm')
119
+ except ImportError:
120
+ # using the normal InternRMSNorm
121
+ pass
122
+ except Exception:
123
+ logger.warning('discovered apex but it failed to load, falling back to InternRMSNorm')
124
+ pass
125
+
126
+
127
+ NORM2FN = {
128
+ 'rms_norm': InternRMSNorm,
129
+ 'layer_norm': nn.LayerNorm,
130
+ }
131
+
132
+
133
+ class InternVisionEmbeddings(nn.Module):
134
+ def __init__(self, config: InternVisionConfig):
135
+ super().__init__()
136
+ self.config = config
137
+ self.embed_dim = config.hidden_size
138
+ self.image_size = config.image_size
139
+ self.patch_size = config.patch_size
140
+
141
+ self.class_embedding = nn.Parameter(
142
+ torch.randn(1, 1, self.embed_dim),
143
+ )
144
+
145
+ self.patch_embedding = nn.Conv2d(
146
+ in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
147
+ )
148
+
149
+ self.num_patches = (self.image_size // self.patch_size) ** 2
150
+ self.num_positions = self.num_patches + 1
151
+
152
+ self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
153
+
154
+ def _get_pos_embed(self, pos_embed, H, W):
155
+ target_dtype = pos_embed.dtype
156
+ pos_embed = pos_embed.float().reshape(
157
+ 1, self.image_size // self.patch_size, self.image_size // self.patch_size, -1).permute(0, 3, 1, 2)
158
+ pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False). \
159
+ reshape(1, -1, H * W).permute(0, 2, 1).to(target_dtype)
160
+ return pos_embed
161
+
162
+ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
163
+ target_dtype = self.patch_embedding.weight.dtype
164
+ patch_embeds = self.patch_embedding(pixel_values) # shape = [*, channel, width, height]
165
+ batch_size, _, height, width = patch_embeds.shape
166
+ patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
167
+ class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
168
+ embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
169
+ position_embedding = torch.cat([
170
+ self.position_embedding[:, :1, :],
171
+ self._get_pos_embed(self.position_embedding[:, 1:, :], height, width)
172
+ ], dim=1)
173
+ embeddings = embeddings + position_embedding.to(target_dtype)
174
+ return embeddings
175
+
176
+
177
+ class InternAttention(nn.Module):
178
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
179
+
180
+ def __init__(self, config: InternVisionConfig):
181
+ super().__init__()
182
+ self.config = config
183
+ self.embed_dim = config.hidden_size
184
+ self.num_heads = config.num_attention_heads
185
+ self.use_flash_attn = config.use_flash_attn and has_flash_attn
186
+ if config.use_flash_attn and not has_flash_attn:
187
+ print('Warning: Flash Attention is not available, use_flash_attn is set to False.')
188
+ self.head_dim = self.embed_dim // self.num_heads
189
+ if self.head_dim * self.num_heads != self.embed_dim:
190
+ raise ValueError(
191
+ f'embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:'
192
+ f' {self.num_heads}).'
193
+ )
194
+
195
+ self.scale = self.head_dim ** -0.5
196
+ self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=config.qkv_bias)
197
+ self.attn_drop = nn.Dropout(config.attention_dropout)
198
+ self.proj_drop = nn.Dropout(config.dropout)
199
+
200
+ self.qk_normalization = config.qk_normalization
201
+
202
+ if self.qk_normalization:
203
+ self.q_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
204
+ self.k_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
205
+
206
+ if self.use_flash_attn:
207
+ self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout)
208
+ self.proj = nn.Linear(self.embed_dim, self.embed_dim)
209
+
210
+ def _naive_attn(self, x):
211
+ B, N, C = x.shape
212
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
213
+ q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
214
+
215
+ if self.qk_normalization:
216
+ B_, H_, N_, D_ = q.shape
217
+ q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
218
+ k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
219
+
220
+ attn = ((q * self.scale) @ k.transpose(-2, -1))
221
+ attn = attn.softmax(dim=-1)
222
+ attn = self.attn_drop(attn)
223
+
224
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
225
+ x = self.proj(x)
226
+ x = self.proj_drop(x)
227
+ return x
228
+
229
+ def _flash_attn(self, x, key_padding_mask=None, need_weights=False):
230
+ qkv = self.qkv(x)
231
+ qkv = rearrange(qkv, 'b s (three h d) -> b s three h d', three=3, h=self.num_heads)
232
+
233
+ if self.qk_normalization:
234
+ q, k, v = qkv.unbind(2)
235
+ q = self.q_norm(q.flatten(-2, -1)).view(q.shape)
236
+ k = self.k_norm(k.flatten(-2, -1)).view(k.shape)
237
+ qkv = torch.stack([q, k, v], dim=2)
238
+
239
+ context, _ = self.inner_attn(
240
+ qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False
241
+ )
242
+ outs = self.proj(rearrange(context, 'b s h d -> b s (h d)'))
243
+ outs = self.proj_drop(outs)
244
+ return outs
245
+
246
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
247
+ x = self._naive_attn(hidden_states) if not self.use_flash_attn else self._flash_attn(hidden_states)
248
+ return x
249
+
250
+
251
+ class InternMLP(nn.Module):
252
+ def __init__(self, config: InternVisionConfig):
253
+ super().__init__()
254
+ self.config = config
255
+ self.act = ACT2FN[config.hidden_act]
256
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
257
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
258
+
259
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
260
+ hidden_states = self.fc1(hidden_states)
261
+ hidden_states = self.act(hidden_states)
262
+ hidden_states = self.fc2(hidden_states)
263
+ return hidden_states
264
+
265
+
266
+ class InternVisionEncoderLayer(nn.Module):
267
+ def __init__(self, config: InternVisionConfig, drop_path_rate: float):
268
+ super().__init__()
269
+ self.embed_dim = config.hidden_size
270
+ self.intermediate_size = config.intermediate_size
271
+ self.norm_type = config.norm_type
272
+
273
+ self.attn = InternAttention(config)
274
+ self.mlp = InternMLP(config)
275
+ self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
276
+ self.norm2 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
277
+
278
+ self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
279
+ self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
280
+ self.drop_path1 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
281
+ self.drop_path2 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
282
+
283
+ def forward(
284
+ self,
285
+ hidden_states: torch.Tensor,
286
+     ) -> torch.Tensor:
287
+ """
288
+ Args:
289
+             hidden_states (`torch.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
290
+ """
291
+ hidden_states = hidden_states + self.drop_path1(self.attn(self.norm1(hidden_states).to(hidden_states.dtype)) * self.ls1)
292
+
293
+ hidden_states = hidden_states + self.drop_path2(self.mlp(self.norm2(hidden_states).to(hidden_states.dtype)) * self.ls2)
294
+
295
+ return hidden_states
296
+
297
+
298
+ class InternVisionEncoder(nn.Module):
299
+ """
300
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
301
+ [`InternEncoderLayer`].
302
+
303
+ Args:
304
+ config (`InternConfig`):
305
+ The corresponding vision configuration for the `InternEncoder`.
306
+ """
307
+
308
+ def __init__(self, config: InternVisionConfig):
309
+ super().__init__()
310
+ self.config = config
311
+ # stochastic depth decay rule
312
+ dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
313
+ self.layers = nn.ModuleList([
314
+ InternVisionEncoderLayer(config, dpr[idx]) for idx in range(config.num_hidden_layers)])
315
+ self.gradient_checkpointing = True
316
+
317
+ def forward(
318
+ self,
319
+ inputs_embeds,
320
+ output_hidden_states: Optional[bool] = None,
321
+ return_dict: Optional[bool] = None,
322
+ ) -> Union[Tuple, BaseModelOutput]:
323
+ r"""
324
+ Args:
325
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
326
+ Embedded representation of the inputs. Should be float, not int tokens.
327
+ output_hidden_states (`bool`, *optional*):
328
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
329
+ for more detail.
330
+ return_dict (`bool`, *optional*):
331
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
332
+ """
333
+ output_hidden_states = (
334
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
335
+ )
336
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
337
+
338
+ encoder_states = () if output_hidden_states else None
339
+ hidden_states = inputs_embeds
340
+
341
+ for idx, encoder_layer in enumerate(self.layers):
342
+ if output_hidden_states:
343
+ encoder_states = encoder_states + (hidden_states,)
344
+ if self.gradient_checkpointing and self.training:
345
+ layer_outputs = torch.utils.checkpoint.checkpoint(
346
+ encoder_layer,
347
+ hidden_states)
348
+ else:
349
+ layer_outputs = encoder_layer(
350
+ hidden_states,
351
+ )
352
+ hidden_states = layer_outputs
353
+
354
+ if output_hidden_states:
355
+ encoder_states = encoder_states + (hidden_states,)
356
+
357
+ if not return_dict:
358
+ return tuple(v for v in [hidden_states, encoder_states] if v is not None)
359
+ return BaseModelOutput(
360
+ last_hidden_state=hidden_states, hidden_states=encoder_states
361
+ )
362
+
363
+
364
+ class InternVisionModel(PreTrainedModel):
365
+ main_input_name = 'pixel_values'
366
+ _supports_flash_attn_2 = True
367
+ supports_gradient_checkpointing = True
368
+ config_class = InternVisionConfig
369
+ _no_split_modules = ['InternVisionEncoderLayer']
370
+
371
+ def __init__(self, config: InternVisionConfig):
372
+ super().__init__(config)
373
+ self.config = config
374
+
375
+ self.embeddings = InternVisionEmbeddings(config)
376
+ self.encoder = InternVisionEncoder(config)
377
+
378
+ def resize_pos_embeddings(self, old_size, new_size, patch_size):
379
+ pos_emb = self.embeddings.position_embedding
380
+ _, num_positions, embed_dim = pos_emb.shape
381
+ cls_emb = pos_emb[:, :1, :]
382
+ pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, old_size // patch_size, -1).permute(0, 3, 1, 2)
383
+ pos_emb = F.interpolate(pos_emb.float(), size=new_size // patch_size, mode='bicubic', align_corners=False)
384
+ pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, -1).permute(0, 2, 1)
385
+ pos_emb = torch.cat([cls_emb, pos_emb], dim=1)
386
+ self.embeddings.position_embedding = nn.Parameter(pos_emb)
387
+ self.embeddings.image_size = new_size
388
+ logger.info('Resized position embeddings from {} to {}'.format(old_size, new_size))
389
+
390
+ def get_input_embeddings(self):
391
+ return self.embeddings
392
+
393
+ def forward(
394
+ self,
395
+ pixel_values: Optional[torch.FloatTensor] = None,
396
+ output_hidden_states: Optional[bool] = None,
397
+ return_dict: Optional[bool] = None,
398
+ pixel_embeds: Optional[torch.FloatTensor] = None,
399
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
400
+ output_hidden_states = (
401
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
402
+ )
403
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
404
+
405
+ if pixel_values is None and pixel_embeds is None:
406
+ raise ValueError('You have to specify pixel_values or pixel_embeds')
407
+
408
+ if pixel_embeds is not None:
409
+ hidden_states = pixel_embeds
410
+ else:
411
+ if len(pixel_values.shape) == 4:
412
+ hidden_states = self.embeddings(pixel_values)
413
+ else:
414
+ raise ValueError(f'wrong pixel_values size: {pixel_values.shape}')
415
+ encoder_outputs = self.encoder(
416
+ inputs_embeds=hidden_states,
417
+ output_hidden_states=output_hidden_states,
418
+ return_dict=return_dict,
419
+ )
420
+ last_hidden_state = encoder_outputs.last_hidden_state
421
+ pooled_output = last_hidden_state[:, 0, :]
422
+
423
+ if not return_dict:
424
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
425
+
426
+ return BaseModelOutputWithPooling(
427
+ last_hidden_state=last_hidden_state,
428
+ pooler_output=pooled_output,
429
+ hidden_states=encoder_outputs.hidden_states,
430
+ attentions=encoder_outputs.attentions,
431
+ )
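As a quick orientation to the vision tower defined above, here is a minimal sketch of exercising it through the usual trust_remote_code loading path. The checkpoint path and the 448x448 input size are assumptions (the real size comes from force_image_size / vision_config.image_size in config.json), and loading the full 8B model requires a correspondingly large amount of memory.

import torch
from transformers import AutoModel

# Hypothetical local path to this checkpoint directory; adjust to wherever it was downloaded.
ckpt = "ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228"
model = AutoModel.from_pretrained(ckpt, torch_dtype=torch.bfloat16,
                                  use_flash_attn=False, trust_remote_code=True).eval()

# The vision tower defined above is exposed as model.vision_model; 448x448 is an assumed input size.
pixel_values = torch.randn(1, 3, 448, 448, dtype=torch.bfloat16)
with torch.no_grad():
    out = model.vision_model(pixel_values=pixel_values, return_dict=True)

# last_hidden_state: (batch, 1 + num_patches, hidden_size); pooler_output is the CLS-token embedding.
print(out.last_hidden_state.shape, out.pooler_output.shape)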
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/modeling_internvl_chat.py ADDED
@@ -0,0 +1,359 @@
+# --------------------------------------------------------
+# InternVL
+# Copyright (c) 2024 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+
+import warnings
+from typing import List, Optional, Tuple, Union
+
+import torch.utils.checkpoint
+import transformers
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from transformers import (AutoModel, GenerationConfig, LlamaForCausalLM,
+                          Qwen2ForCausalLM)
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import ModelOutput, logging
+
+from .configuration_internvl_chat import InternVLChatConfig
+from .conversation import get_conv_template
+from .modeling_intern_vit import InternVisionModel, has_flash_attn
+
+logger = logging.get_logger(__name__)
+
+
+def version_cmp(v1, v2, op='eq'):
+    import operator
+
+    from packaging import version
+    op_func = getattr(operator, op)
+    return op_func(version.parse(v1), version.parse(v2))
+
+
+class InternVLChatModel(PreTrainedModel):
+    config_class = InternVLChatConfig
+    main_input_name = 'pixel_values'
+    base_model_prefix = 'language_model'
+    _supports_flash_attn_2 = True
+    supports_gradient_checkpointing = True
+    _no_split_modules = ['InternVisionModel', 'LlamaDecoderLayer', 'Qwen2DecoderLayer']
+
+    def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None, use_flash_attn=True):
+        super().__init__(config)
+
+        assert version_cmp(transformers.__version__, '4.37.0', 'ge')
+        image_size = config.force_image_size or config.vision_config.image_size
+        patch_size = config.vision_config.patch_size
+        self.patch_size = patch_size
+        self.select_layer = config.select_layer
+        self.template = config.template
+        self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2))
+        self.downsample_ratio = config.downsample_ratio
+        self.ps_version = config.ps_version
+        use_flash_attn = use_flash_attn if has_flash_attn else False
+        config.vision_config.use_flash_attn = True if use_flash_attn else False
+        config.llm_config._attn_implementation = 'flash_attention_2' if use_flash_attn else 'eager'
+
+        logger.info(f'num_image_token: {self.num_image_token}')
+        logger.info(f'ps_version: {self.ps_version}')
+        if vision_model is not None:
+            self.vision_model = vision_model
+        else:
+            self.vision_model = InternVisionModel(config.vision_config)
+        if language_model is not None:
+            self.language_model = language_model
+        else:
+            if config.llm_config.architectures[0] == 'LlamaForCausalLM':
+                self.language_model = LlamaForCausalLM(config.llm_config)
+            elif config.llm_config.architectures[0] == 'Qwen2ForCausalLM':
+                self.language_model = Qwen2ForCausalLM(config.llm_config)
+            else:
+                raise NotImplementedError(f'{config.llm_config.architectures[0]} is not implemented.')
+
+        vit_hidden_size = config.vision_config.hidden_size
+        llm_hidden_size = config.llm_config.hidden_size
+
+        self.mlp1 = nn.Sequential(
+            nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
+            nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size),
+            nn.GELU(),
+            nn.Linear(llm_hidden_size, llm_hidden_size)
+        )
+
+        self.img_context_token_id = None
+        self.conv_template = get_conv_template(self.template)
+        self.system_message = self.conv_template.system_message
+
+    def forward(
+            self,
+            pixel_values: torch.FloatTensor,
+            input_ids: torch.LongTensor = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            position_ids: Optional[torch.LongTensor] = None,
+            image_flags: Optional[torch.LongTensor] = None,
+            past_key_values: Optional[List[torch.FloatTensor]] = None,
+            labels: Optional[torch.LongTensor] = None,
+            use_cache: Optional[bool] = None,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        image_flags = image_flags.squeeze(-1)
+        input_embeds = self.language_model.get_input_embeddings()(input_ids).clone()
+
+        vit_embeds = self.extract_feature(pixel_values)
+        vit_embeds = vit_embeds[image_flags == 1]
+        vit_batch_size = pixel_values.shape[0]
+
+        B, N, C = input_embeds.shape
+        input_embeds = input_embeds.reshape(B * N, C)
+
+        if torch.distributed.is_initialized() and torch.distributed.get_rank() == 0:
+            print(f'dynamic ViT batch size: {vit_batch_size}, images per sample: {vit_batch_size / B}, dynamic token length: {N}')
+
+        input_ids = input_ids.reshape(B * N)
+        selected = (input_ids == self.img_context_token_id)
+        try:
+            input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds.reshape(-1, C)
+        except Exception as e:
+            vit_embeds = vit_embeds.reshape(-1, C)
+            print(f'warning: {e}, input_embeds[selected].shape={input_embeds[selected].shape}, '
+                  f'vit_embeds.shape={vit_embeds.shape}')
+            n_token = min(selected.sum(), vit_embeds.size(0))
+            input_embeds[selected][:n_token] = input_embeds[selected][:n_token] * 0.0 + vit_embeds[:n_token]
+
+        input_embeds = input_embeds.reshape(B, N, C)
+
+        outputs = self.language_model(
+            inputs_embeds=input_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        logits = outputs.logits
+
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.language_model.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def pixel_shuffle(self, x, scale_factor=0.5):
+        n, w, h, c = x.size()
+        # N, W, H, C --> N, W, H * scale, C // scale
+        x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
+        # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
+        x = x.permute(0, 2, 1, 3).contiguous()
+        # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
+        x = x.view(n, int(h * scale_factor), int(w * scale_factor),
+                   int(c / (scale_factor * scale_factor)))
+        if self.ps_version == 'v1':
+            warnings.warn("In ps_version 'v1', the height and width have not been swapped back, "
+                          'which results in a transposed image.')
+        else:
+            x = x.permute(0, 2, 1, 3).contiguous()
+        return x
+
+    def extract_feature(self, pixel_values):
+        if self.select_layer == -1:
+            vit_embeds = self.vision_model(
+                pixel_values=pixel_values,
+                output_hidden_states=False,
+                return_dict=True).last_hidden_state
+        else:
+            vit_embeds = self.vision_model(
+                pixel_values=pixel_values,
+                output_hidden_states=True,
+                return_dict=True).hidden_states[self.select_layer]
+        vit_embeds = vit_embeds[:, 1:, :]
+
+        h = w = int(vit_embeds.shape[1] ** 0.5)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
+        vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
+        vit_embeds = self.mlp1(vit_embeds)
+        return vit_embeds
+
+    def batch_chat(self, tokenizer, pixel_values, questions, generation_config, num_patches_list=None,
+                   history=None, return_history=False, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>',
+                   IMG_CONTEXT_TOKEN='<IMG_CONTEXT>', verbose=False, image_counts=None):
+        if history is not None or return_history:
+            print('Now multi-turn chat is not supported in batch_chat.')
+            raise NotImplementedError
+
+        if image_counts is not None:
+            num_patches_list = image_counts
+            print('Warning: `image_counts` is deprecated. Please use `num_patches_list` instead.')
+
+        img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
+        self.img_context_token_id = img_context_token_id
+
+        if verbose and pixel_values is not None:
+            image_bs = pixel_values.shape[0]
+            print(f'dynamic ViT batch size: {image_bs}')
+
+        queries = []
+        for idx, num_patches in enumerate(num_patches_list):
+            question = questions[idx]
+            if pixel_values is not None and '<image>' not in question:
+                question = '<image>\n' + question
+            template = get_conv_template(self.template)
+            template.system_message = self.system_message
+            template.append_message(template.roles[0], question)
+            template.append_message(template.roles[1], None)
+            query = template.get_prompt()
+
+            image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
+            query = query.replace('<image>', image_tokens, 1)
+            queries.append(query)
+
+        tokenizer.padding_side = 'left'
+        model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
+        input_ids = model_inputs['input_ids'].to(self.device)
+        attention_mask = model_inputs['attention_mask'].to(self.device)
+        eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
+        generation_config['eos_token_id'] = eos_token_id
+        generation_output = self.generate(
+            pixel_values=pixel_values,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            **generation_config
+        )
+        responses = tokenizer.batch_decode(generation_output, skip_special_tokens=True)
+        responses = [response.split(template.sep.strip())[0].strip() for response in responses]
+        return responses
+
+    def chat(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False,
+             num_patches_list=None, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>', IMG_CONTEXT_TOKEN='<IMG_CONTEXT>',
+             verbose=False):
+
+        if history is None and pixel_values is not None and '<image>' not in question:
+            question = '<image>\n' + question
+
+        if num_patches_list is None:
+            num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else []
+        assert pixel_values is None or len(pixel_values) == sum(num_patches_list)
+
+        img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
+        self.img_context_token_id = img_context_token_id
+
+        template = get_conv_template(self.template)
+        template.system_message = self.system_message
+        eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
+
+        history = [] if history is None else history
+        for (old_question, old_answer) in history:
+            template.append_message(template.roles[0], old_question)
+            template.append_message(template.roles[1], old_answer)
+        template.append_message(template.roles[0], question)
+        template.append_message(template.roles[1], None)
+        query = template.get_prompt()
+
+        if verbose and pixel_values is not None:
+            image_bs = pixel_values.shape[0]
+            print(f'dynamic ViT batch size: {image_bs}')
+
+        for num_patches in num_patches_list:
+            image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
+            query = query.replace('<image>', image_tokens, 1)
+
+        model_inputs = tokenizer(query, return_tensors='pt')
+        input_ids = model_inputs['input_ids'].to(self.device)
+        attention_mask = model_inputs['attention_mask'].to(self.device)
+        generation_config['eos_token_id'] = eos_token_id
+        generation_output = self.generate(
+            pixel_values=pixel_values,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            **generation_config
+        )
+        response = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
+        response = response.split(template.sep.strip())[0].strip()
+        history.append((question, response))
+        if return_history:
+            return response, history
+        else:
+            query_to_print = query.replace(IMG_CONTEXT_TOKEN, '')
+            query_to_print = query_to_print.replace(f'{IMG_START_TOKEN}{IMG_END_TOKEN}', '<image>')
+            if verbose:
+                print(query_to_print, response)
+            return response
+
+    @torch.no_grad()
+    def generate(
+            self,
+            pixel_values: Optional[torch.FloatTensor] = None,
+            input_ids: Optional[torch.FloatTensor] = None,
+            attention_mask: Optional[torch.LongTensor] = None,
+            visual_features: Optional[torch.FloatTensor] = None,
+            generation_config: Optional[GenerationConfig] = None,
+            output_hidden_states: Optional[bool] = None,
+            **generate_kwargs,
+    ) -> torch.LongTensor:
+
+        assert self.img_context_token_id is not None
+        if pixel_values is not None:
+            if visual_features is not None:
+                vit_embeds = visual_features
+            else:
+                vit_embeds = self.extract_feature(pixel_values)
+            input_embeds = self.language_model.get_input_embeddings()(input_ids)
+            B, N, C = input_embeds.shape
+            input_embeds = input_embeds.reshape(B * N, C)
+
+            input_ids = input_ids.reshape(B * N)
+            selected = (input_ids == self.img_context_token_id)
+            assert selected.sum() != 0
+            input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
+
+            input_embeds = input_embeds.reshape(B, N, C)
+        else:
+            input_embeds = self.language_model.get_input_embeddings()(input_ids)
+
+        outputs = self.language_model.generate(
+            inputs_embeds=input_embeds,
+            attention_mask=attention_mask,
+            generation_config=generation_config,
+            output_hidden_states=output_hidden_states,
+            use_cache=True,
+            **generate_kwargs,
+        )
+
+        return outputs
+
+    @property
+    def lm_head(self):
+        return self.language_model.get_output_embeddings()
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    def get_output_embeddings(self):
+        return self.language_model.get_output_embeddings()
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
+{
+  "crop_size": 448,
+  "do_center_crop": true,
+  "do_normalize": true,
+  "do_resize": true,
+  "feature_extractor_type": "CLIPFeatureExtractor",
+  "image_mean": [
+    0.485,
+    0.456,
+    0.406
+  ],
+  "image_std": [
+    0.229,
+    0.224,
+    0.225
+  ],
+  "resample": 3,
+  "size": 448
+}
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/tokenizer_config.json ADDED
@@ -0,0 +1,280 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "151643": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "151644": {
15
+ "content": "<|im_start|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "151645": {
23
+ "content": "<|im_end|>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "151646": {
31
+ "content": "<|object_ref_start|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "151647": {
39
+ "content": "<|object_ref_end|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "151648": {
47
+ "content": "<|box_start|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "151649": {
55
+ "content": "<|box_end|>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "151650": {
63
+ "content": "<|quad_start|>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "151651": {
71
+ "content": "<|quad_end|>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": true
77
+ },
78
+ "151652": {
79
+ "content": "<|vision_start|>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "151653": {
87
+ "content": "<|vision_end|>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": true
93
+ },
94
+ "151654": {
95
+ "content": "<|vision_pad|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": true
101
+ },
102
+ "151655": {
103
+ "content": "<|image_pad|>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": true
109
+ },
110
+ "151656": {
111
+ "content": "<|video_pad|>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": true
117
+ },
118
+ "151657": {
119
+ "content": "<tool_call>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ },
126
+ "151658": {
127
+ "content": "</tool_call>",
128
+ "lstrip": false,
129
+ "normalized": false,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "151659": {
135
+ "content": "<|fim_prefix|>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "151660": {
143
+ "content": "<|fim_middle|>",
144
+ "lstrip": false,
145
+ "normalized": false,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ },
150
+ "151661": {
151
+ "content": "<|fim_suffix|>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": false
157
+ },
158
+ "151662": {
159
+ "content": "<|fim_pad|>",
160
+ "lstrip": false,
161
+ "normalized": false,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": false
165
+ },
166
+ "151663": {
167
+ "content": "<|repo_name|>",
168
+ "lstrip": false,
169
+ "normalized": false,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": false
173
+ },
174
+ "151664": {
175
+ "content": "<|file_sep|>",
176
+ "lstrip": false,
177
+ "normalized": false,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": false
181
+ },
182
+ "151665": {
183
+ "content": "<img>",
184
+ "lstrip": false,
185
+ "normalized": false,
186
+ "rstrip": false,
187
+ "single_word": false,
188
+ "special": true
189
+ },
190
+ "151666": {
191
+ "content": "</img>",
192
+ "lstrip": false,
193
+ "normalized": false,
194
+ "rstrip": false,
195
+ "single_word": false,
196
+ "special": true
197
+ },
198
+ "151667": {
199
+ "content": "<IMG_CONTEXT>",
200
+ "lstrip": false,
201
+ "normalized": false,
202
+ "rstrip": false,
203
+ "single_word": false,
204
+ "special": true
205
+ },
206
+ "151668": {
207
+ "content": "<quad>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false,
212
+ "special": true
213
+ },
214
+ "151669": {
215
+ "content": "</quad>",
216
+ "lstrip": false,
217
+ "normalized": false,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": true
221
+ },
222
+ "151670": {
223
+ "content": "<ref>",
224
+ "lstrip": false,
225
+ "normalized": false,
226
+ "rstrip": false,
227
+ "single_word": false,
228
+ "special": true
229
+ },
230
+ "151671": {
231
+ "content": "</ref>",
232
+ "lstrip": false,
233
+ "normalized": false,
234
+ "rstrip": false,
235
+ "single_word": false,
236
+ "special": true
237
+ },
238
+ "151672": {
239
+ "content": "<box>",
240
+ "lstrip": false,
241
+ "normalized": false,
242
+ "rstrip": false,
243
+ "single_word": false,
244
+ "special": true
245
+ },
246
+ "151673": {
247
+ "content": "</box>",
248
+ "lstrip": false,
249
+ "normalized": false,
250
+ "rstrip": false,
251
+ "single_word": false,
252
+ "special": true
253
+ }
254
+ },
255
+ "additional_special_tokens": [
256
+ "<|im_start|>",
257
+ "<|im_end|>",
258
+ "<|object_ref_start|>",
259
+ "<|object_ref_end|>",
260
+ "<|box_start|>",
261
+ "<|box_end|>",
262
+ "<|quad_start|>",
263
+ "<|quad_end|>",
264
+ "<|vision_start|>",
265
+ "<|vision_end|>",
266
+ "<|vision_pad|>",
267
+ "<|image_pad|>",
268
+ "<|video_pad|>"
269
+ ],
270
+ "bos_token": null,
271
+ "clean_up_tokenization_spaces": false,
272
+ "eos_token": "<|im_end|>",
273
+ "errors": "replace",
274
+ "extra_special_tokens": {},
275
+ "model_max_length": 1000000,
276
+ "pad_token": "<|endoftext|>",
277
+ "split_special_tokens": false,
278
+ "tokenizer_class": "Qwen2Tokenizer",
279
+ "unk_token": null
280
+ }
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/trainer_state.json ADDED
@@ -0,0 +1,429 @@
1
+ {
2
+ "best_global_step": 228,
3
+ "best_metric": 0.72654486,
4
+ "best_model_checkpoint": "/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228",
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 228,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.013157894736842105,
14
+ "grad_norm": 16.149745965683884,
15
+ "learning_rate": 1.3157894736842104e-08,
16
+ "loss": 1.0491504669189453,
17
+ "step": 1,
18
+ "token_acc": 0.7316017316017316
19
+ },
20
+ {
21
+ "epoch": 0.06578947368421052,
22
+ "grad_norm": 19.394274240651384,
23
+ "learning_rate": 6.578947368421052e-08,
24
+ "loss": 0.9333165884017944,
25
+ "step": 5,
26
+ "token_acc": 0.7398773006134969
27
+ },
28
+ {
29
+ "epoch": 0.13157894736842105,
30
+ "grad_norm": 16.751648834817885,
31
+ "learning_rate": 1.3157894736842104e-07,
32
+ "loss": 0.9967510223388671,
33
+ "step": 10,
34
+ "token_acc": 0.7421441774491682
35
+ },
36
+ {
37
+ "epoch": 0.19736842105263158,
38
+ "grad_norm": 17.69556265222528,
39
+ "learning_rate": 1.9736842105263157e-07,
40
+ "loss": 0.8940377235412598,
41
+ "step": 15,
42
+ "token_acc": 0.769434628975265
43
+ },
44
+ {
45
+ "epoch": 0.2631578947368421,
46
+ "grad_norm": 16.580837710216986,
47
+ "learning_rate": 2.631578947368421e-07,
48
+ "loss": 0.94066162109375,
49
+ "step": 20,
50
+ "token_acc": 0.7534121929026387
51
+ },
52
+ {
53
+ "epoch": 0.32894736842105265,
54
+ "grad_norm": 17.18358188029577,
55
+ "learning_rate": 3.2894736842105264e-07,
56
+ "loss": 0.8913036346435547,
57
+ "step": 25,
58
+ "token_acc": 0.7722672064777328
59
+ },
60
+ {
61
+ "epoch": 0.39473684210526316,
62
+ "grad_norm": 12.655760128091604,
63
+ "learning_rate": 3.9473684210526315e-07,
64
+ "loss": 0.8995906829833984,
65
+ "step": 30,
66
+ "token_acc": 0.7529095792300806
67
+ },
68
+ {
69
+ "epoch": 0.4605263157894737,
70
+ "grad_norm": 14.799338872033982,
71
+ "learning_rate": 4.6052631578947365e-07,
72
+ "loss": 0.7829002857208252,
73
+ "step": 35,
74
+ "token_acc": 0.7704918032786885
75
+ },
76
+ {
77
+ "epoch": 0.5263157894736842,
78
+ "grad_norm": 13.6883274124527,
79
+ "learning_rate": 4.999578104083306e-07,
80
+ "loss": 0.7109686374664307,
81
+ "step": 40,
82
+ "token_acc": 0.7868705035971223
83
+ },
84
+ {
85
+ "epoch": 0.5921052631578947,
86
+ "grad_norm": 11.590886715849683,
87
+ "learning_rate": 4.994833410208486e-07,
88
+ "loss": 0.8100458145141601,
89
+ "step": 45,
90
+ "token_acc": 0.7766143106457243
91
+ },
92
+ {
93
+ "epoch": 0.6578947368421053,
94
+ "grad_norm": 18.37250203045574,
95
+ "learning_rate": 4.984826693294873e-07,
96
+ "loss": 0.7702572345733643,
97
+ "step": 50,
98
+ "token_acc": 0.789568345323741
99
+ },
100
+ {
101
+ "epoch": 0.7236842105263158,
102
+ "grad_norm": 14.257346400410995,
103
+ "learning_rate": 4.969579059190548e-07,
104
+ "loss": 0.8138771057128906,
105
+ "step": 55,
106
+ "token_acc": 0.7591763652641003
107
+ },
108
+ {
109
+ "epoch": 0.7894736842105263,
110
+ "grad_norm": 15.091592377911923,
111
+ "learning_rate": 4.949122667718934e-07,
112
+ "loss": 0.7258480548858642,
113
+ "step": 60,
114
+ "token_acc": 0.7591178965224766
115
+ },
116
+ {
117
+ "epoch": 0.8552631578947368,
118
+ "grad_norm": 14.643270287439186,
119
+ "learning_rate": 4.923500664848326e-07,
120
+ "loss": 0.8208457946777343,
121
+ "step": 65,
122
+ "token_acc": 0.7617753623188406
123
+ },
124
+ {
125
+ "epoch": 0.9210526315789473,
126
+ "grad_norm": 12.10237792449664,
127
+ "learning_rate": 4.892767091689785e-07,
128
+ "loss": 0.68003249168396,
129
+ "step": 70,
130
+ "token_acc": 0.8093922651933702
131
+ },
132
+ {
133
+ "epoch": 0.9868421052631579,
134
+ "grad_norm": 12.72703845793204,
135
+ "learning_rate": 4.856986770515357e-07,
136
+ "loss": 0.7365611076354981,
137
+ "step": 75,
138
+ "token_acc": 0.7881679389312977
139
+ },
140
+ {
141
+ "epoch": 1.0,
142
+ "eval_loss": 0.7485233545303345,
143
+ "eval_runtime": 14.0742,
144
+ "eval_samples_per_second": 9.592,
145
+ "eval_steps_per_second": 1.208,
146
+ "eval_token_acc": 0.7790202342917998,
147
+ "step": 76
148
+ },
149
+ {
150
+ "epoch": 1.0526315789473684,
151
+ "grad_norm": 12.921581074218519,
152
+ "learning_rate": 4.816235168037004e-07,
153
+ "loss": 0.7654649257659912,
154
+ "step": 80,
155
+ "token_acc": 0.7855787476280834
156
+ },
157
+ {
158
+ "epoch": 1.118421052631579,
159
+ "grad_norm": 13.068943217853121,
160
+ "learning_rate": 4.770598236234616e-07,
161
+ "loss": 0.7880854606628418,
162
+ "step": 85,
163
+ "token_acc": 0.768064228367529
164
+ },
165
+ {
166
+ "epoch": 1.1842105263157894,
167
+ "grad_norm": 12.89309027582137,
168
+ "learning_rate": 4.720172231068844e-07,
169
+ "loss": 0.7307503700256348,
170
+ "step": 90,
171
+ "token_acc": 0.7837606837606838
172
+ },
173
+ {
174
+ "epoch": 1.25,
175
+ "grad_norm": 12.073433385238404,
176
+ "learning_rate": 4.6650635094610966e-07,
177
+ "loss": 0.6501222610473633,
178
+ "step": 95,
179
+ "token_acc": 0.8068181818181818
180
+ },
181
+ {
182
+ "epoch": 1.3157894736842106,
183
+ "grad_norm": 11.811777608591989,
184
+ "learning_rate": 4.605388304968914e-07,
185
+ "loss": 0.7150445461273194,
186
+ "step": 100,
187
+ "token_acc": 0.7920792079207921
188
+ },
189
+ {
190
+ "epoch": 1.381578947368421,
191
+ "grad_norm": 11.639785635682916,
192
+ "learning_rate": 4.541272482629857e-07,
193
+ "loss": 0.6845383644104004,
194
+ "step": 105,
195
+ "token_acc": 0.7807971014492754
196
+ },
197
+ {
198
+ "epoch": 1.4473684210526316,
199
+ "grad_norm": 13.527665828737845,
200
+ "learning_rate": 4.472851273490984e-07,
201
+ "loss": 0.6341064453125,
202
+ "step": 110,
203
+ "token_acc": 0.7936936936936937
204
+ },
205
+ {
206
+ "epoch": 1.513157894736842,
207
+ "grad_norm": 12.400162052083235,
208
+ "learning_rate": 4.4002689893838405e-07,
209
+ "loss": 0.6944449424743653,
210
+ "step": 115,
211
+ "token_acc": 0.7871986699916874
212
+ },
213
+ {
214
+ "epoch": 1.5789473684210527,
215
+ "grad_norm": 12.084687708268657,
216
+ "learning_rate": 4.323678718546552e-07,
217
+ "loss": 0.6916102409362793,
218
+ "step": 120,
219
+ "token_acc": 0.7841726618705036
220
+ },
221
+ {
222
+ "epoch": 1.6447368421052633,
223
+ "grad_norm": 14.339547961512208,
224
+ "learning_rate": 4.243242002734988e-07,
225
+ "loss": 0.6724244117736816,
226
+ "step": 125,
227
+ "token_acc": 0.8003731343283582
228
+ },
229
+ {
230
+ "epoch": 1.7105263157894737,
231
+ "grad_norm": 12.886115725138072,
232
+ "learning_rate": 4.159128496504053e-07,
233
+ "loss": 0.7068803787231446,
234
+ "step": 130,
235
+ "token_acc": 0.7975970425138632
236
+ },
237
+ {
238
+ "epoch": 1.776315789473684,
239
+ "grad_norm": 13.664304711583553,
240
+ "learning_rate": 4.071515609377705e-07,
241
+ "loss": 0.6755687713623046,
242
+ "step": 135,
243
+ "token_acc": 0.8097982708933718
244
+ },
245
+ {
246
+ "epoch": 1.8421052631578947,
247
+ "grad_norm": 12.648918436490476,
248
+ "learning_rate": 3.9805881316624503e-07,
249
+ "loss": 0.6957115650177002,
250
+ "step": 140,
251
+ "token_acc": 0.7955555555555556
252
+ },
253
+ {
254
+ "epoch": 1.9078947368421053,
255
+ "grad_norm": 11.176293465675545,
256
+ "learning_rate": 3.8865378446935217e-07,
257
+ "loss": 0.6958436012268067,
258
+ "step": 145,
259
+ "token_acc": 0.7841860465116279
260
+ },
261
+ {
262
+ "epoch": 1.973684210526316,
263
+ "grad_norm": 13.253986253776585,
264
+ "learning_rate": 3.78956311633581e-07,
265
+ "loss": 0.7309179306030273,
266
+ "step": 150,
267
+ "token_acc": 0.7876838235294118
268
+ },
269
+ {
270
+ "epoch": 2.0,
271
+ "eval_loss": 0.7281007766723633,
272
+ "eval_runtime": 13.8917,
273
+ "eval_samples_per_second": 9.718,
274
+ "eval_steps_per_second": 1.224,
275
+ "eval_token_acc": 0.784877529286475,
276
+ "step": 152
277
+ },
278
+ {
279
+ "epoch": 2.039473684210526,
280
+ "grad_norm": 11.735958700270654,
281
+ "learning_rate": 3.689868482592684e-07,
282
+ "loss": 0.653682279586792,
283
+ "step": 155,
284
+ "token_acc": 0.7934875749785776
285
+ },
286
+ {
287
+ "epoch": 2.1052631578947367,
288
+ "grad_norm": 11.471307891877995,
289
+ "learning_rate": 3.587664216205183e-07,
290
+ "loss": 0.644793701171875,
291
+ "step": 160,
292
+ "token_acc": 0.8068077276908924
293
+ },
294
+ {
295
+ "epoch": 2.1710526315789473,
296
+ "grad_norm": 11.843571052368723,
297
+ "learning_rate": 3.4831658831514575e-07,
298
+ "loss": 0.5835385322570801,
299
+ "step": 165,
300
+ "token_acc": 0.8140161725067385
301
+ },
302
+ {
303
+ "epoch": 2.236842105263158,
304
+ "grad_norm": 10.76496899877298,
305
+ "learning_rate": 3.376593887981886e-07,
306
+ "loss": 0.5892860412597656,
307
+ "step": 170,
308
+ "token_acc": 0.8260038240917782
309
+ },
310
+ {
311
+ "epoch": 2.3026315789473686,
312
+ "grad_norm": 13.952005550709435,
313
+ "learning_rate": 3.268173008948826e-07,
314
+ "loss": 0.6131174564361572,
315
+ "step": 175,
316
+ "token_acc": 0.8151111111111111
317
+ },
318
+ {
319
+ "epoch": 2.3684210526315788,
320
+ "grad_norm": 11.500981624672228,
321
+ "learning_rate": 3.1581319239114976e-07,
322
+ "loss": 0.6069381713867188,
323
+ "step": 180,
324
+ "token_acc": 0.8110236220472441
325
+ },
326
+ {
327
+ "epoch": 2.4342105263157894,
328
+ "grad_norm": 11.46248454566006,
329
+ "learning_rate": 3.0467027280159394e-07,
330
+ "loss": 0.6257027626037598,
331
+ "step": 185,
332
+ "token_acc": 0.8304742684157417
333
+ },
334
+ {
335
+ "epoch": 2.5,
336
+ "grad_norm": 13.002284165063047,
337
+ "learning_rate": 2.934120444167326e-07,
338
+ "loss": 0.6725513458251953,
339
+ "step": 190,
340
+ "token_acc": 0.7950469684030743
341
+ },
342
+ {
343
+ "epoch": 2.5657894736842106,
344
+ "grad_norm": 13.013905049177902,
345
+ "learning_rate": 2.820622527327158e-07,
346
+ "loss": 0.5892130851745605,
347
+ "step": 195,
348
+ "token_acc": 0.8100706713780919
349
+ },
350
+ {
351
+ "epoch": 2.6315789473684212,
352
+ "grad_norm": 11.480677786146966,
353
+ "learning_rate": 2.706448363680831e-07,
354
+ "loss": 0.5530064582824707,
355
+ "step": 200,
356
+ "token_acc": 0.8359303391384051
357
+ },
358
+ {
359
+ "epoch": 2.6973684210526314,
360
+ "grad_norm": 13.276820229615291,
361
+ "learning_rate": 2.591838765731931e-07,
362
+ "loss": 0.5892533302307129,
363
+ "step": 205,
364
+ "token_acc": 0.8258575197889182
365
+ },
366
+ {
367
+ "epoch": 2.763157894736842,
368
+ "grad_norm": 13.90433344794858,
369
+ "learning_rate": 2.477035464388184e-07,
370
+ "loss": 0.6034513950347901,
371
+ "step": 210,
372
+ "token_acc": 0.8290840415486308
373
+ },
374
+ {
375
+ "epoch": 2.8289473684210527,
376
+ "grad_norm": 12.016974173998076,
377
+ "learning_rate": 2.3622805991103358e-07,
378
+ "loss": 0.6079031467437744,
379
+ "step": 215,
380
+ "token_acc": 0.803770351328192
381
+ },
382
+ {
383
+ "epoch": 2.8947368421052633,
384
+ "grad_norm": 12.883251904886917,
385
+ "learning_rate": 2.2478162071993296e-07,
386
+ "loss": 0.6700970649719238,
387
+ "step": 220,
388
+ "token_acc": 0.801210025929127
389
+ },
390
+ {
391
+ "epoch": 2.9605263157894735,
392
+ "grad_norm": 12.808332568512581,
393
+ "learning_rate": 2.133883713298946e-07,
394
+ "loss": 0.650183916091919,
395
+ "step": 225,
396
+ "token_acc": 0.8039927404718693
397
+ },
398
+ {
399
+ "epoch": 3.0,
400
+ "eval_loss": 0.7265448570251465,
401
+ "eval_runtime": 13.707,
402
+ "eval_samples_per_second": 9.849,
403
+ "eval_steps_per_second": 1.24,
404
+ "eval_token_acc": 0.7859424920127795,
405
+ "step": 228
406
+ }
407
+ ],
408
+ "logging_steps": 5,
409
+ "max_steps": 380,
410
+ "num_input_tokens_seen": 0,
411
+ "num_train_epochs": 5,
412
+ "save_steps": 500,
413
+ "stateful_callbacks": {
414
+ "TrainerControl": {
415
+ "args": {
416
+ "should_epoch_stop": false,
417
+ "should_evaluate": false,
418
+ "should_log": false,
419
+ "should_save": true,
420
+ "should_training_stop": false
421
+ },
422
+ "attributes": {}
423
+ }
424
+ },
425
+ "total_flos": 1.670596435818578e+17,
426
+ "train_batch_size": 2,
427
+ "trial_name": null,
428
+ "trial_params": null
429
+ }
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/logging.jsonl ADDED
@@ -0,0 +1,85 @@
1
+ {"loss": 1.04915047, "grad_norm": 16.14974597, "learning_rate": 1e-08, "token_acc": 0.73160173, "epoch": 0.01315789, "global_step/max_steps": "1/380", "percentage": "0.26%", "elapsed_time": "14s", "remaining_time": "1h 30m 22s", "memory(GiB)": 44.31, "train_speed(iter/s)": 0.069898}
2
+ {"loss": 0.93331659, "grad_norm": 19.39427424, "learning_rate": 7e-08, "token_acc": 0.7398773, "epoch": 0.06578947, "global_step/max_steps": "5/380", "percentage": "1.32%", "elapsed_time": "37s", "remaining_time": "47m 29s", "memory(GiB)": 50.32, "train_speed(iter/s)": 0.131596}
3
+ {"loss": 0.99675102, "grad_norm": 16.75164883, "learning_rate": 1.3e-07, "token_acc": 0.74214418, "epoch": 0.13157895, "global_step/max_steps": "10/380", "percentage": "2.63%", "elapsed_time": "1m 7s", "remaining_time": "41m 53s", "memory(GiB)": 50.32, "train_speed(iter/s)": 0.147217}
4
+ {"loss": 0.89403772, "grad_norm": 17.69556265, "learning_rate": 2e-07, "token_acc": 0.76943463, "epoch": 0.19736842, "global_step/max_steps": "15/380", "percentage": "3.95%", "elapsed_time": "1m 35s", "remaining_time": "38m 47s", "memory(GiB)": 50.32, "train_speed(iter/s)": 0.156854}
5
+ {"loss": 0.94066162, "grad_norm": 16.58083771, "learning_rate": 2.6e-07, "token_acc": 0.75341219, "epoch": 0.26315789, "global_step/max_steps": "20/380", "percentage": "5.26%", "elapsed_time": "2m 4s", "remaining_time": "37m 12s", "memory(GiB)": 62.01, "train_speed(iter/s)": 0.161241}
6
+ {"loss": 0.89130363, "grad_norm": 17.18358188, "learning_rate": 3.3e-07, "token_acc": 0.77226721, "epoch": 0.32894737, "global_step/max_steps": "25/380", "percentage": "6.58%", "elapsed_time": "2m 32s", "remaining_time": "36m 7s", "memory(GiB)": 62.01, "train_speed(iter/s)": 0.163803}
7
+ {"loss": 0.89959068, "grad_norm": 12.65576013, "learning_rate": 3.9e-07, "token_acc": 0.75290958, "epoch": 0.39473684, "global_step/max_steps": "30/380", "percentage": "7.89%", "elapsed_time": "2m 59s", "remaining_time": "34m 55s", "memory(GiB)": 62.01, "train_speed(iter/s)": 0.167039}
8
+ {"loss": 0.78290029, "grad_norm": 14.79933887, "learning_rate": 4.6e-07, "token_acc": 0.7704918, "epoch": 0.46052632, "global_step/max_steps": "35/380", "percentage": "9.21%", "elapsed_time": "3m 26s", "remaining_time": "33m 53s", "memory(GiB)": 62.01, "train_speed(iter/s)": 0.169641}
9
+ {"loss": 0.71096864, "grad_norm": 13.68832741, "learning_rate": 5e-07, "token_acc": 0.7868705, "epoch": 0.52631579, "global_step/max_steps": "40/380", "percentage": "10.53%", "elapsed_time": "3m 54s", "remaining_time": "33m 11s", "memory(GiB)": 73.71, "train_speed(iter/s)": 0.170729}
10
+ {"loss": 0.81004581, "grad_norm": 11.59088672, "learning_rate": 5e-07, "token_acc": 0.77661431, "epoch": 0.59210526, "global_step/max_steps": "45/380", "percentage": "11.84%", "elapsed_time": "4m 21s", "remaining_time": "32m 29s", "memory(GiB)": 73.71, "train_speed(iter/s)": 0.171868}
11
+ {"loss": 0.77025723, "grad_norm": 18.37250203, "learning_rate": 5e-07, "token_acc": 0.78956835, "epoch": 0.65789474, "global_step/max_steps": "50/380", "percentage": "13.16%", "elapsed_time": "4m 49s", "remaining_time": "31m 49s", "memory(GiB)": 73.71, "train_speed(iter/s)": 0.172822}
12
+ {"loss": 0.81387711, "grad_norm": 14.2573464, "learning_rate": 5e-07, "token_acc": 0.75917637, "epoch": 0.72368421, "global_step/max_steps": "55/380", "percentage": "14.47%", "elapsed_time": "5m 16s", "remaining_time": "31m 12s", "memory(GiB)": 73.71, "train_speed(iter/s)": 0.173519}
13
+ {"loss": 0.72584805, "grad_norm": 15.09159238, "learning_rate": 4.9e-07, "token_acc": 0.7591179, "epoch": 0.78947368, "global_step/max_steps": "60/380", "percentage": "15.79%", "elapsed_time": "5m 46s", "remaining_time": "30m 45s", "memory(GiB)": 73.71, "train_speed(iter/s)": 0.173396}
14
+ {"loss": 0.82084579, "grad_norm": 14.64327029, "learning_rate": 4.9e-07, "token_acc": 0.76177536, "epoch": 0.85526316, "global_step/max_steps": "65/380", "percentage": "17.11%", "elapsed_time": "6m 13s", "remaining_time": "30m 9s", "memory(GiB)": 73.71, "train_speed(iter/s)": 0.17406}
15
+ {"loss": 0.68003249, "grad_norm": 12.10237792, "learning_rate": 4.9e-07, "token_acc": 0.80939227, "epoch": 0.92105263, "global_step/max_steps": "70/380", "percentage": "18.42%", "elapsed_time": "6m 40s", "remaining_time": "29m 35s", "memory(GiB)": 73.71, "train_speed(iter/s)": 0.174604}
16
+ {"loss": 0.73656111, "grad_norm": 12.72703846, "learning_rate": 4.9e-07, "token_acc": 0.78816794, "epoch": 0.98684211, "global_step/max_steps": "75/380", "percentage": "19.74%", "elapsed_time": "7m 7s", "remaining_time": "28m 56s", "memory(GiB)": 73.71, "train_speed(iter/s)": 0.175597}
17
+ {"eval_loss": 0.74852335, "eval_runtime": 14.0742, "eval_samples_per_second": 9.592, "eval_steps_per_second": 1.208, "eval_token_acc": 0.77902023, "epoch": 1.0, "global_step/max_steps": "76/380", "percentage": "20.00%", "elapsed_time": "7m 26s", "remaining_time": "29m 47s", "memory(GiB)": 73.71, "train_speed(iter/s)": 0.170093}
18
+ {"loss": 0.76546493, "grad_norm": 12.92158107, "learning_rate": 4.8e-07, "token_acc": 0.78557875, "epoch": 1.05263158, "global_step/max_steps": "80/380", "percentage": "21.05%", "elapsed_time": "9m 26s", "remaining_time": "35m 25s", "memory(GiB)": 73.71, "train_speed(iter/s)": 0.141158}
19
+ {"loss": 0.78808546, "grad_norm": 13.06894322, "learning_rate": 4.8e-07, "token_acc": 0.76806423, "epoch": 1.11842105, "global_step/max_steps": "85/380", "percentage": "22.37%", "elapsed_time": "9m 56s", "remaining_time": "34m 31s", "memory(GiB)": 73.71, "train_speed(iter/s)": 0.142417}
20
+ {"loss": 0.73075037, "grad_norm": 12.89309028, "learning_rate": 4.7e-07, "token_acc": 0.78376068, "epoch": 1.18421053, "global_step/max_steps": "90/380", "percentage": "23.68%", "elapsed_time": "10m 23s", "remaining_time": "33m 30s", "memory(GiB)": 73.71, "train_speed(iter/s)": 0.144254}
21
+ {"loss": 0.65012226, "grad_norm": 12.07343339, "learning_rate": 4.7e-07, "token_acc": 0.80681818, "epoch": 1.25, "global_step/max_steps": "95/380", "percentage": "25.00%", "elapsed_time": "10m 53s", "remaining_time": "32m 39s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.145462}
22
+ {"loss": 0.71504455, "grad_norm": 11.81177761, "learning_rate": 4.6e-07, "token_acc": 0.79207921, "epoch": 1.31578947, "global_step/max_steps": "100/380", "percentage": "26.32%", "elapsed_time": "11m 21s", "remaining_time": "31m 47s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.146813}
23
+ {"loss": 0.68453836, "grad_norm": 11.63978564, "learning_rate": 4.5e-07, "token_acc": 0.7807971, "epoch": 1.38157895, "global_step/max_steps": "105/380", "percentage": "27.63%", "elapsed_time": "11m 49s", "remaining_time": "30m 58s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.147973}
24
+ {"loss": 0.63410645, "grad_norm": 13.52766583, "learning_rate": 4.5e-07, "token_acc": 0.79369369, "epoch": 1.44736842, "global_step/max_steps": "110/380", "percentage": "28.95%", "elapsed_time": "12m 16s", "remaining_time": "30m 8s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.14927}
25
+ {"loss": 0.69444494, "grad_norm": 12.40016205, "learning_rate": 4.4e-07, "token_acc": 0.78719867, "epoch": 1.51315789, "global_step/max_steps": "115/380", "percentage": "30.26%", "elapsed_time": "12m 45s", "remaining_time": "29m 23s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.150274}
26
+ {"loss": 0.69161024, "grad_norm": 12.08468771, "learning_rate": 4.3e-07, "token_acc": 0.78417266, "epoch": 1.57894737, "global_step/max_steps": "120/380", "percentage": "31.58%", "elapsed_time": "13m 11s", "remaining_time": "28m 35s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.151576}
27
+ {"loss": 0.67242441, "grad_norm": 14.33954796, "learning_rate": 4.2e-07, "token_acc": 0.80037313, "epoch": 1.64473684, "global_step/max_steps": "125/380", "percentage": "32.89%", "elapsed_time": "13m 39s", "remaining_time": "27m 51s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.152574}
28
+ {"loss": 0.70688038, "grad_norm": 12.88611573, "learning_rate": 4.2e-07, "token_acc": 0.79759704, "epoch": 1.71052632, "global_step/max_steps": "130/380", "percentage": "34.21%", "elapsed_time": "14m 7s", "remaining_time": "27m 9s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.153412}
29
+ {"loss": 0.67556877, "grad_norm": 13.66430471, "learning_rate": 4.1e-07, "token_acc": 0.80979827, "epoch": 1.77631579, "global_step/max_steps": "135/380", "percentage": "35.53%", "elapsed_time": "14m 34s", "remaining_time": "26m 26s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.154392}
30
+ {"loss": 0.69571157, "grad_norm": 12.64891844, "learning_rate": 4e-07, "token_acc": 0.79555556, "epoch": 1.84210526, "global_step/max_steps": "140/380", "percentage": "36.84%", "elapsed_time": "15m 2s", "remaining_time": "25m 47s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.155104}
31
+ {"loss": 0.6958436, "grad_norm": 11.17629347, "learning_rate": 3.9e-07, "token_acc": 0.78418605, "epoch": 1.90789474, "global_step/max_steps": "145/380", "percentage": "38.16%", "elapsed_time": "15m 30s", "remaining_time": "25m 7s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.155882}
32
+ {"loss": 0.73091793, "grad_norm": 13.25398625, "learning_rate": 3.8e-07, "token_acc": 0.78768382, "epoch": 1.97368421, "global_step/max_steps": "150/380", "percentage": "39.47%", "elapsed_time": "15m 57s", "remaining_time": "24m 28s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.156611}
33
+ {"eval_loss": 0.72810078, "eval_runtime": 13.8917, "eval_samples_per_second": 9.718, "eval_steps_per_second": 1.224, "eval_token_acc": 0.78487753, "epoch": 2.0, "global_step/max_steps": "152/380", "percentage": "40.00%", "elapsed_time": "16m 22s", "remaining_time": "24m 33s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.154714}
34
+ {"loss": 0.65368228, "grad_norm": 11.7359587, "learning_rate": 3.7e-07, "token_acc": 0.79348757, "epoch": 2.03947368, "global_step/max_steps": "155/380", "percentage": "40.79%", "elapsed_time": "18m 27s", "remaining_time": "26m 47s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.139981}
35
+ {"loss": 0.6447937, "grad_norm": 11.47130789, "learning_rate": 3.6e-07, "token_acc": 0.80680773, "epoch": 2.10526316, "global_step/max_steps": "160/380", "percentage": "42.11%", "elapsed_time": "18m 56s", "remaining_time": "26m 2s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.140824}
36
+ {"loss": 0.58353853, "grad_norm": 11.84357105, "learning_rate": 3.5e-07, "token_acc": 0.81401617, "epoch": 2.17105263, "global_step/max_steps": "165/380", "percentage": "43.42%", "elapsed_time": "19m 24s", "remaining_time": "25m 17s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.141657}
37
+ {"loss": 0.58928604, "grad_norm": 10.764969, "learning_rate": 3.4e-07, "token_acc": 0.82600382, "epoch": 2.23684211, "global_step/max_steps": "170/380", "percentage": "44.74%", "elapsed_time": "19m 52s", "remaining_time": "24m 32s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.142607}
38
+ {"loss": 0.61311746, "grad_norm": 13.95200555, "learning_rate": 3.3e-07, "token_acc": 0.81511111, "epoch": 2.30263158, "global_step/max_steps": "175/380", "percentage": "46.05%", "elapsed_time": "20m 19s", "remaining_time": "23m 48s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.143488}
39
+ {"loss": 0.60693817, "grad_norm": 11.50098162, "learning_rate": 3.2e-07, "token_acc": 0.81102362, "epoch": 2.36842105, "global_step/max_steps": "180/380", "percentage": "47.37%", "elapsed_time": "20m 46s", "remaining_time": "23m 5s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.144359}
40
+ {"loss": 0.62570276, "grad_norm": 11.46248455, "learning_rate": 3e-07, "token_acc": 0.83047427, "epoch": 2.43421053, "global_step/max_steps": "185/380", "percentage": "48.68%", "elapsed_time": "21m 14s", "remaining_time": "22m 23s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.14516}
41
+ {"loss": 0.67255135, "grad_norm": 13.00228417, "learning_rate": 2.9e-07, "token_acc": 0.79504697, "epoch": 2.5, "global_step/max_steps": "190/380", "percentage": "50.00%", "elapsed_time": "21m 42s", "remaining_time": "21m 42s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.145912}
42
+ {"loss": 0.58921309, "grad_norm": 13.01390505, "learning_rate": 2.8e-07, "token_acc": 0.81007067, "epoch": 2.56578947, "global_step/max_steps": "195/380", "percentage": "51.32%", "elapsed_time": "22m 9s", "remaining_time": "21m 1s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.14667}
43
+ {"loss": 0.55300646, "grad_norm": 11.48067779, "learning_rate": 2.7e-07, "token_acc": 0.83593034, "epoch": 2.63157895, "global_step/max_steps": "200/380", "percentage": "52.63%", "elapsed_time": "22m 37s", "remaining_time": "20m 21s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.147376}
44
+ {"loss": 0.58925333, "grad_norm": 13.27682023, "learning_rate": 2.6e-07, "token_acc": 0.82585752, "epoch": 2.69736842, "global_step/max_steps": "205/380", "percentage": "53.95%", "elapsed_time": "23m 4s", "remaining_time": "19m 41s", "memory(GiB)": 74.17, "train_speed(iter/s)": 0.148115}
45
+ {"loss": 0.6034514, "grad_norm": 13.90433345, "learning_rate": 2.5e-07, "token_acc": 0.82908404, "epoch": 2.76315789, "global_step/max_steps": "210/380", "percentage": "55.26%", "elapsed_time": "23m 32s", "remaining_time": "19m 3s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.148683}
46
+ {"loss": 0.60790315, "grad_norm": 12.01697417, "learning_rate": 2.4e-07, "token_acc": 0.80377035, "epoch": 2.82894737, "global_step/max_steps": "215/380", "percentage": "56.58%", "elapsed_time": "23m 58s", "remaining_time": "18m 24s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.149437}
47
+ {"loss": 0.67009706, "grad_norm": 12.8832519, "learning_rate": 2.2e-07, "token_acc": 0.80121003, "epoch": 2.89473684, "global_step/max_steps": "220/380", "percentage": "57.89%", "elapsed_time": "24m 25s", "remaining_time": "17m 45s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.150125}
48
+ {"loss": 0.65018392, "grad_norm": 12.80833257, "learning_rate": 2.1e-07, "token_acc": 0.80399274, "epoch": 2.96052632, "global_step/max_steps": "225/380", "percentage": "59.21%", "elapsed_time": "24m 52s", "remaining_time": "17m 8s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.15071}
49
+ {"eval_loss": 0.72654486, "eval_runtime": 13.707, "eval_samples_per_second": 9.849, "eval_steps_per_second": 1.24, "eval_token_acc": 0.78594249, "epoch": 3.0, "global_step/max_steps": "228/380", "percentage": "60.00%", "elapsed_time": "25m 23s", "remaining_time": "16m 55s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.1497}
50
+ {"loss": 0.59686203, "grad_norm": 12.5998029, "learning_rate": 2e-07, "token_acc": 0.81576448, "epoch": 3.02631579, "global_step/max_steps": "230/380", "percentage": "60.53%", "elapsed_time": "27m 17s", "remaining_time": "17m 47s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.140464}
51
+ {"loss": 0.52481909, "grad_norm": 10.32396224, "learning_rate": 1.9e-07, "token_acc": 0.84875445, "epoch": 3.09210526, "global_step/max_steps": "235/380", "percentage": "61.84%", "elapsed_time": "27m 45s", "remaining_time": "17m 7s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.141063}
52
+ {"loss": 0.56462321, "grad_norm": 13.344608, "learning_rate": 1.8e-07, "token_acc": 0.82659479, "epoch": 3.15789474, "global_step/max_steps": "240/380", "percentage": "63.16%", "elapsed_time": "28m 15s", "remaining_time": "16m 28s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.141567}
53
+ {"loss": 0.52111754, "grad_norm": 11.17948974, "learning_rate": 1.7e-07, "token_acc": 0.83940972, "epoch": 3.22368421, "global_step/max_steps": "245/380", "percentage": "64.47%", "elapsed_time": "28m 41s", "remaining_time": "15m 48s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.142307}
54
+ {"loss": 0.53249068, "grad_norm": 11.28136572, "learning_rate": 1.6e-07, "token_acc": 0.84306888, "epoch": 3.28947368, "global_step/max_steps": "250/380", "percentage": "65.79%", "elapsed_time": "29m 8s", "remaining_time": "15m 9s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.142961}
55
+ {"loss": 0.55586386, "grad_norm": 11.01640688, "learning_rate": 1.5e-07, "token_acc": 0.83586338, "epoch": 3.35526316, "global_step/max_steps": "255/380", "percentage": "67.11%", "elapsed_time": "29m 36s", "remaining_time": "14m 30s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.143574}
56
+ {"loss": 0.56448097, "grad_norm": 13.14098789, "learning_rate": 1.4e-07, "token_acc": 0.82502444, "epoch": 3.42105263, "global_step/max_steps": "260/380", "percentage": "68.42%", "elapsed_time": "30m 3s", "remaining_time": "13m 52s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.144127}
57
+ {"loss": 0.59146905, "grad_norm": 12.53265712, "learning_rate": 1.3e-07, "token_acc": 0.80371353, "epoch": 3.48684211, "global_step/max_steps": "265/380", "percentage": "69.74%", "elapsed_time": "30m 31s", "remaining_time": "13m 14s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.144685}
58
+ {"loss": 0.58134708, "grad_norm": 12.54092225, "learning_rate": 1.2e-07, "token_acc": 0.81122943, "epoch": 3.55263158, "global_step/max_steps": "270/380", "percentage": "71.05%", "elapsed_time": "30m 57s", "remaining_time": "12m 36s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.145363}
59
+ {"loss": 0.52265396, "grad_norm": 11.3862044, "learning_rate": 1.1e-07, "token_acc": 0.84823091, "epoch": 3.61842105, "global_step/max_steps": "275/380", "percentage": "72.37%", "elapsed_time": "31m 24s", "remaining_time": "11m 59s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.14592}
60
+ {"loss": 0.64316354, "grad_norm": 13.16476262, "learning_rate": 1e-07, "token_acc": 0.82226212, "epoch": 3.68421053, "global_step/max_steps": "280/380", "percentage": "73.68%", "elapsed_time": "31m 52s", "remaining_time": "11m 23s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.146387}
61
+ {"loss": 0.60033703, "grad_norm": 13.70750821, "learning_rate": 9e-08, "token_acc": 0.8210252, "epoch": 3.75, "global_step/max_steps": "285/380", "percentage": "75.00%", "elapsed_time": "32m 19s", "remaining_time": "10m 46s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.146917}
62
+ {"loss": 0.59155693, "grad_norm": 16.09784354, "learning_rate": 8e-08, "token_acc": 0.82407407, "epoch": 3.81578947, "global_step/max_steps": "290/380", "percentage": "76.32%", "elapsed_time": "32m 46s", "remaining_time": "10m 10s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.147469}
63
+ {"loss": 0.46532865, "grad_norm": 11.26346719, "learning_rate": 7e-08, "token_acc": 0.84084637, "epoch": 3.88157895, "global_step/max_steps": "295/380", "percentage": "77.63%", "elapsed_time": "33m 13s", "remaining_time": "9m 34s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.148001}
64
+ {"loss": 0.52924175, "grad_norm": 12.96189546, "learning_rate": 6e-08, "token_acc": 0.84242424, "epoch": 3.94736842, "global_step/max_steps": "300/380", "percentage": "78.95%", "elapsed_time": "33m 40s", "remaining_time": "8m 58s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.148451}
65
+ {"eval_loss": 0.72927332, "eval_runtime": 13.7897, "eval_samples_per_second": 9.79, "eval_steps_per_second": 1.233, "eval_token_acc": 0.78328009, "epoch": 4.0, "global_step/max_steps": "304/380", "percentage": "80.00%", "elapsed_time": "34m 15s", "remaining_time": "8m 33s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.147892}
66
+ {"loss": 0.47034731, "grad_norm": 11.94114344, "learning_rate": 6e-08, "token_acc": 0.86036036, "epoch": 4.01315789, "global_step/max_steps": "305/380", "percentage": "80.26%", "elapsed_time": "36m 11s", "remaining_time": "8m 53s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.14048}
67
+ {"loss": 0.52430968, "grad_norm": 12.36947829, "learning_rate": 5e-08, "token_acc": 0.83421986, "epoch": 4.07894737, "global_step/max_steps": "310/380", "percentage": "81.58%", "elapsed_time": "36m 38s", "remaining_time": "8m 16s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.140994}
68
+ {"loss": 0.51603422, "grad_norm": 12.07287742, "learning_rate": 4e-08, "token_acc": 0.83608147, "epoch": 4.14473684, "global_step/max_steps": "315/380", "percentage": "82.89%", "elapsed_time": "37m 6s", "remaining_time": "7m 39s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.141504}
69
+ {"loss": 0.51714025, "grad_norm": 13.02947867, "learning_rate": 4e-08, "token_acc": 0.84266409, "epoch": 4.21052632, "global_step/max_steps": "320/380", "percentage": "84.21%", "elapsed_time": "37m 33s", "remaining_time": "7m 2s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.142011}
70
+ {"loss": 0.51301398, "grad_norm": 12.08777604, "learning_rate": 3e-08, "token_acc": 0.84384093, "epoch": 4.27631579, "global_step/max_steps": "325/380", "percentage": "85.53%", "elapsed_time": "37m 59s", "remaining_time": "6m 25s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.142584}
71
+ {"loss": 0.54592509, "grad_norm": 12.26082191, "learning_rate": 3e-08, "token_acc": 0.84087237, "epoch": 4.34210526, "global_step/max_steps": "330/380", "percentage": "86.84%", "elapsed_time": "38m 28s", "remaining_time": "5m 49s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.142964}
72
+ {"loss": 0.51755118, "grad_norm": 12.81988134, "learning_rate": 2e-08, "token_acc": 0.84494774, "epoch": 4.40789474, "global_step/max_steps": "335/380", "percentage": "88.16%", "elapsed_time": "38m 56s", "remaining_time": "5m 13s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.143401}
73
+ {"loss": 0.53464289, "grad_norm": 13.09503379, "learning_rate": 2e-08, "token_acc": 0.82877317, "epoch": 4.47368421, "global_step/max_steps": "340/380", "percentage": "89.47%", "elapsed_time": "39m 24s", "remaining_time": "4m 38s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.143775}
74
+ {"loss": 0.47082796, "grad_norm": 10.82230421, "learning_rate": 1e-08, "token_acc": 0.85964912, "epoch": 4.53947368, "global_step/max_steps": "345/380", "percentage": "90.79%", "elapsed_time": "39m 52s", "remaining_time": "4m 2s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.144178}
75
+ {"loss": 0.58271708, "grad_norm": 12.0105016, "learning_rate": 1e-08, "token_acc": 0.81669691, "epoch": 4.60526316, "global_step/max_steps": "350/380", "percentage": "92.11%", "elapsed_time": "40m 20s", "remaining_time": "3m 27s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.144595}
76
+ {"loss": 0.47942114, "grad_norm": 12.94941478, "learning_rate": 1e-08, "token_acc": 0.84791252, "epoch": 4.67105263, "global_step/max_steps": "355/380", "percentage": "93.42%", "elapsed_time": "40m 47s", "remaining_time": "2m 52s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.145038}
77
+ {"loss": 0.554074, "grad_norm": 13.89755619, "learning_rate": 0.0, "token_acc": 0.821, "epoch": 4.73684211, "global_step/max_steps": "360/380", "percentage": "94.74%", "elapsed_time": "41m 14s", "remaining_time": "2m 17s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.145469}
78
+ {"loss": 0.48682218, "grad_norm": 12.71374242, "learning_rate": 0.0, "token_acc": 0.85853659, "epoch": 4.80263158, "global_step/max_steps": "365/380", "percentage": "96.05%", "elapsed_time": "41m 42s", "remaining_time": "1m 42s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.145883}
79
+ {"loss": 0.53818045, "grad_norm": 12.30313614, "learning_rate": 0.0, "token_acc": 0.8416955, "epoch": 4.86842105, "global_step/max_steps": "370/380", "percentage": "97.37%", "elapsed_time": "42m 8s", "remaining_time": "1m 8s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.146317}
80
+ {"loss": 0.55644884, "grad_norm": 10.88836738, "learning_rate": 0.0, "token_acc": 0.83556747, "epoch": 4.93421053, "global_step/max_steps": "375/380", "percentage": "98.68%", "elapsed_time": "42m 36s", "remaining_time": "34s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.146687}
81
+ {"loss": 0.56877451, "grad_norm": 15.15540611, "learning_rate": 0.0, "token_acc": 0.84521739, "epoch": 5.0, "global_step/max_steps": "380/380", "percentage": "100.00%", "elapsed_time": "43m 4s", "remaining_time": "0s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.147038}
82
+ {"eval_loss": 0.73085946, "eval_runtime": 13.7849, "eval_samples_per_second": 9.793, "eval_steps_per_second": 1.233, "eval_token_acc": 0.78434505, "epoch": 5.0, "global_step/max_steps": "380/380", "percentage": "100.00%", "elapsed_time": "43m 18s", "remaining_time": "0s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.146257}
83
+ {"eval_loss": 0.73085946, "eval_runtime": 16.1805, "eval_samples_per_second": 8.343, "eval_steps_per_second": 1.051, "eval_token_acc": 0.78434505, "epoch": 5.0, "global_step/max_steps": "380/380", "percentage": "100.00%", "elapsed_time": "45m 17s", "remaining_time": "0s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.139834}
84
+ {"train_runtime": 2916.558, "train_samples_per_second": 2.086, "train_steps_per_second": 0.13, "total_flos": 2.788227173675172e+17, "train_loss": 0.64442084, "epoch": 5.0, "global_step/max_steps": "380/380", "percentage": "100.00%", "elapsed_time": "48m 31s", "remaining_time": "0s", "memory(GiB)": 75.64, "train_speed(iter/s)": 0.130533}
85
+ {"model_parameter_info": "InternVLChatModel: 7944.3738M Params (7612.8210M Trainable [95.8266%]), 0.0001M Buffers.", "last_model_checkpoint": "/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-380", "best_model_checkpoint": "/mnt/data/users/liamding/data/MMMT/lora/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/checkpoint-228", "best_metric": 0.72654486, "global_step": 380, "log_history": [{"loss": 1.0491504669189453, "grad_norm": 16.149745965683884, "learning_rate": 1.3157894736842104e-08, "token_acc": 0.7316017316017316, "epoch": 0.013157894736842105, "step": 1}, {"loss": 0.9333165884017944, "grad_norm": 19.394274240651384, "learning_rate": 6.578947368421052e-08, "token_acc": 0.7398773006134969, "epoch": 0.06578947368421052, "step": 5}, {"loss": 0.9967510223388671, "grad_norm": 16.751648834817885, "learning_rate": 1.3157894736842104e-07, "token_acc": 0.7421441774491682, "epoch": 0.13157894736842105, "step": 10}, {"loss": 0.8940377235412598, "grad_norm": 17.69556265222528, "learning_rate": 1.9736842105263157e-07, "token_acc": 0.769434628975265, "epoch": 0.19736842105263158, "step": 15}, {"loss": 0.94066162109375, "grad_norm": 16.580837710216986, "learning_rate": 2.631578947368421e-07, "token_acc": 0.7534121929026387, "epoch": 0.2631578947368421, "step": 20}, {"loss": 0.8913036346435547, "grad_norm": 17.18358188029577, "learning_rate": 3.2894736842105264e-07, "token_acc": 0.7722672064777328, "epoch": 0.32894736842105265, "step": 25}, {"loss": 0.8995906829833984, "grad_norm": 12.655760128091604, "learning_rate": 3.9473684210526315e-07, "token_acc": 0.7529095792300806, "epoch": 0.39473684210526316, "step": 30}, {"loss": 0.7829002857208252, "grad_norm": 14.799338872033982, "learning_rate": 4.6052631578947365e-07, "token_acc": 0.7704918032786885, "epoch": 0.4605263157894737, "step": 35}, {"loss": 0.7109686374664307, "grad_norm": 13.6883274124527, "learning_rate": 4.999578104083306e-07, "token_acc": 0.7868705035971223, "epoch": 0.5263157894736842, "step": 40}, {"loss": 0.8100458145141601, "grad_norm": 11.590886715849683, "learning_rate": 4.994833410208486e-07, "token_acc": 0.7766143106457243, "epoch": 0.5921052631578947, "step": 45}, {"loss": 0.7702572345733643, "grad_norm": 18.37250203045574, "learning_rate": 4.984826693294873e-07, "token_acc": 0.789568345323741, "epoch": 0.6578947368421053, "step": 50}, {"loss": 0.8138771057128906, "grad_norm": 14.257346400410995, "learning_rate": 4.969579059190548e-07, "token_acc": 0.7591763652641003, "epoch": 0.7236842105263158, "step": 55}, {"loss": 0.7258480548858642, "grad_norm": 15.091592377911923, "learning_rate": 4.949122667718934e-07, "token_acc": 0.7591178965224766, "epoch": 0.7894736842105263, "step": 60}, {"loss": 0.8208457946777343, "grad_norm": 14.643270287439186, "learning_rate": 4.923500664848326e-07, "token_acc": 0.7617753623188406, "epoch": 0.8552631578947368, "step": 65}, {"loss": 0.68003249168396, "grad_norm": 12.10237792449664, "learning_rate": 4.892767091689785e-07, "token_acc": 0.8093922651933702, "epoch": 0.9210526315789473, "step": 70}, {"loss": 0.7365611076354981, "grad_norm": 12.72703845793204, "learning_rate": 4.856986770515357e-07, "token_acc": 0.7881679389312977, "epoch": 0.9868421052631579, "step": 75}, {"eval_loss": 0.7485233545303345, "eval_runtime": 14.0742, "eval_samples_per_second": 9.592, "eval_steps_per_second": 1.208, "eval_token_acc": 0.7790202342917998, "epoch": 1.0, "step": 76}, {"loss": 0.7654649257659912, "grad_norm": 12.921581074218519, "learning_rate": 
4.816235168037004e-07, "token_acc": 0.7855787476280834, "epoch": 1.0526315789473684, "step": 80}, {"loss": 0.7880854606628418, "grad_norm": 13.068943217853121, "learning_rate": 4.770598236234616e-07, "token_acc": 0.768064228367529, "epoch": 1.118421052631579, "step": 85}, {"loss": 0.7307503700256348, "grad_norm": 12.89309027582137, "learning_rate": 4.720172231068844e-07, "token_acc": 0.7837606837606838, "epoch": 1.1842105263157894, "step": 90}, {"loss": 0.6501222610473633, "grad_norm": 12.073433385238404, "learning_rate": 4.6650635094610966e-07, "token_acc": 0.8068181818181818, "epoch": 1.25, "step": 95}, {"loss": 0.7150445461273194, "grad_norm": 11.811777608591989, "learning_rate": 4.605388304968914e-07, "token_acc": 0.7920792079207921, "epoch": 1.3157894736842106, "step": 100}, {"loss": 0.6845383644104004, "grad_norm": 11.639785635682916, "learning_rate": 4.541272482629857e-07, "token_acc": 0.7807971014492754, "epoch": 1.381578947368421, "step": 105}, {"loss": 0.6341064453125, "grad_norm": 13.527665828737845, "learning_rate": 4.472851273490984e-07, "token_acc": 0.7936936936936937, "epoch": 1.4473684210526316, "step": 110}, {"loss": 0.6944449424743653, "grad_norm": 12.400162052083235, "learning_rate": 4.4002689893838405e-07, "token_acc": 0.7871986699916874, "epoch": 1.513157894736842, "step": 115}, {"loss": 0.6916102409362793, "grad_norm": 12.084687708268657, "learning_rate": 4.323678718546552e-07, "token_acc": 0.7841726618705036, "epoch": 1.5789473684210527, "step": 120}, {"loss": 0.6724244117736816, "grad_norm": 14.339547961512208, "learning_rate": 4.243242002734988e-07, "token_acc": 0.8003731343283582, "epoch": 1.6447368421052633, "step": 125}, {"loss": 0.7068803787231446, "grad_norm": 12.886115725138072, "learning_rate": 4.159128496504053e-07, "token_acc": 0.7975970425138632, "epoch": 1.7105263157894737, "step": 130}, {"loss": 0.6755687713623046, "grad_norm": 13.664304711583553, "learning_rate": 4.071515609377705e-07, "token_acc": 0.8097982708933718, "epoch": 1.776315789473684, "step": 135}, {"loss": 0.6957115650177002, "grad_norm": 12.648918436490476, "learning_rate": 3.9805881316624503e-07, "token_acc": 0.7955555555555556, "epoch": 1.8421052631578947, "step": 140}, {"loss": 0.6958436012268067, "grad_norm": 11.176293465675545, "learning_rate": 3.8865378446935217e-07, "token_acc": 0.7841860465116279, "epoch": 1.9078947368421053, "step": 145}, {"loss": 0.7309179306030273, "grad_norm": 13.253986253776585, "learning_rate": 3.78956311633581e-07, "token_acc": 0.7876838235294118, "epoch": 1.973684210526316, "step": 150}, {"eval_loss": 0.7281007766723633, "eval_runtime": 13.8917, "eval_samples_per_second": 9.718, "eval_steps_per_second": 1.224, "eval_token_acc": 0.784877529286475, "epoch": 2.0, "step": 152}, {"loss": 0.653682279586792, "grad_norm": 11.735958700270654, "learning_rate": 3.689868482592684e-07, "token_acc": 0.7934875749785776, "epoch": 2.039473684210526, "step": 155}, {"loss": 0.644793701171875, "grad_norm": 11.471307891877995, "learning_rate": 3.587664216205183e-07, "token_acc": 0.8068077276908924, "epoch": 2.1052631578947367, "step": 160}, {"loss": 0.5835385322570801, "grad_norm": 11.843571052368723, "learning_rate": 3.4831658831514575e-07, "token_acc": 0.8140161725067385, "epoch": 2.1710526315789473, "step": 165}, {"loss": 0.5892860412597656, "grad_norm": 10.76496899877298, "learning_rate": 3.376593887981886e-07, "token_acc": 0.8260038240917782, "epoch": 2.236842105263158, "step": 170}, {"loss": 0.6131174564361572, "grad_norm": 13.952005550709435, "learning_rate": 
3.268173008948826e-07, "token_acc": 0.8151111111111111, "epoch": 2.3026315789473686, "step": 175}, {"loss": 0.6069381713867188, "grad_norm": 11.500981624672228, "learning_rate": 3.1581319239114976e-07, "token_acc": 0.8110236220472441, "epoch": 2.3684210526315788, "step": 180}, {"loss": 0.6257027626037598, "grad_norm": 11.46248454566006, "learning_rate": 3.0467027280159394e-07, "token_acc": 0.8304742684157417, "epoch": 2.4342105263157894, "step": 185}, {"loss": 0.6725513458251953, "grad_norm": 13.002284165063047, "learning_rate": 2.934120444167326e-07, "token_acc": 0.7950469684030743, "epoch": 2.5, "step": 190}, {"loss": 0.5892130851745605, "grad_norm": 13.013905049177902, "learning_rate": 2.820622527327158e-07, "token_acc": 0.8100706713780919, "epoch": 2.5657894736842106, "step": 195}, {"loss": 0.5530064582824707, "grad_norm": 11.480677786146966, "learning_rate": 2.706448363680831e-07, "token_acc": 0.8359303391384051, "epoch": 2.6315789473684212, "step": 200}, {"loss": 0.5892533302307129, "grad_norm": 13.276820229615291, "learning_rate": 2.591838765731931e-07, "token_acc": 0.8258575197889182, "epoch": 2.6973684210526314, "step": 205}, {"loss": 0.6034513950347901, "grad_norm": 13.90433344794858, "learning_rate": 2.477035464388184e-07, "token_acc": 0.8290840415486308, "epoch": 2.763157894736842, "step": 210}, {"loss": 0.6079031467437744, "grad_norm": 12.016974173998076, "learning_rate": 2.3622805991103358e-07, "token_acc": 0.803770351328192, "epoch": 2.8289473684210527, "step": 215}, {"loss": 0.6700970649719238, "grad_norm": 12.883251904886917, "learning_rate": 2.2478162071993296e-07, "token_acc": 0.801210025929127, "epoch": 2.8947368421052633, "step": 220}, {"loss": 0.650183916091919, "grad_norm": 12.808332568512581, "learning_rate": 2.133883713298946e-07, "token_acc": 0.8039927404718693, "epoch": 2.9605263157894735, "step": 225}, {"eval_loss": 0.7265448570251465, "eval_runtime": 13.707, "eval_samples_per_second": 9.849, "eval_steps_per_second": 1.24, "eval_token_acc": 0.7859424920127795, "epoch": 3.0, "step": 228}, {"loss": 0.5968620300292968, "grad_norm": 12.599802900090912, "learning_rate": 2.0207234201906545e-07, "token_acc": 0.8157644824311491, "epoch": 3.026315789473684, "step": 230}, {"loss": 0.5248190879821777, "grad_norm": 10.323962239910601, "learning_rate": 1.9085740019546436e-07, "token_acc": 0.8487544483985765, "epoch": 3.0921052631578947, "step": 235}, {"loss": 0.564623212814331, "grad_norm": 13.344607997796906, "learning_rate": 1.7976720005660767e-07, "token_acc": 0.8265947888589398, "epoch": 3.1578947368421053, "step": 240}, {"loss": 0.5211175441741943, "grad_norm": 11.17948974251107, "learning_rate": 1.6882513269882913e-07, "token_acc": 0.8394097222222222, "epoch": 3.223684210526316, "step": 245}, {"loss": 0.5324906826019287, "grad_norm": 11.281365715436248, "learning_rate": 1.5805427678152674e-07, "token_acc": 0.8430688753269399, "epoch": 3.2894736842105265, "step": 250}, {"loss": 0.5558638572692871, "grad_norm": 11.016406883829102, "learning_rate": 1.4747734985039e-07, "token_acc": 0.8358633776091081, "epoch": 3.3552631578947367, "step": 255}, {"loss": 0.564480972290039, "grad_norm": 13.140987889542432, "learning_rate": 1.371166604222777e-07, "token_acc": 0.8250244379276638, "epoch": 3.4210526315789473, "step": 260}, {"loss": 0.5914690494537354, "grad_norm": 12.53265712228206, "learning_rate": 1.2699406093280545e-07, "token_acc": 0.8037135278514589, "epoch": 3.486842105263158, "step": 265}, {"loss": 0.5813470840454101, "grad_norm": 12.540922253147693, "learning_rate": 
1.1713090164588606e-07, "token_acc": 0.8112294288480155, "epoch": 3.5526315789473686, "step": 270}, {"loss": 0.5226539611816406, "grad_norm": 11.386204403520878, "learning_rate": 1.0754798562243344e-07, "token_acc": 0.8482309124767226, "epoch": 3.6184210526315788, "step": 275}, {"loss": 0.643163537979126, "grad_norm": 13.164762618482639, "learning_rate": 9.826552484321085e-08, "token_acc": 0.822262118491921, "epoch": 3.6842105263157894, "step": 280}, {"loss": 0.600337028503418, "grad_norm": 13.707508212457464, "learning_rate": 8.930309757836516e-08, "token_acc": 0.8210251954821894, "epoch": 3.75, "step": 285}, {"loss": 0.5915569305419922, "grad_norm": 16.097843538474795, "learning_rate": 8.067960709356478e-08, "token_acc": 0.8240740740740741, "epoch": 3.8157894736842106, "step": 290}, {"loss": 0.46532864570617677, "grad_norm": 11.263467188848725, "learning_rate": 7.241324177983399e-08, "token_acc": 0.8408463661453542, "epoch": 3.8815789473684212, "step": 295}, {"loss": 0.5292417526245117, "grad_norm": 12.961895460114615, "learning_rate": 6.452143679117964e-08, "token_acc": 0.8424242424242424, "epoch": 3.9473684210526314, "step": 300}, {"eval_loss": 0.7292733192443848, "eval_runtime": 13.7897, "eval_samples_per_second": 9.79, "eval_steps_per_second": 1.233, "eval_token_acc": 0.7832800851970181, "epoch": 4.0, "step": 304}, {"loss": 0.47034730911254885, "grad_norm": 11.941143436933976, "learning_rate": 5.702083727091977e-08, "token_acc": 0.8603603603603603, "epoch": 4.0131578947368425, "step": 305}, {"loss": 0.5243096828460694, "grad_norm": 12.369478294282793, "learning_rate": 4.992726324427901e-08, "token_acc": 0.8342198581560284, "epoch": 4.078947368421052, "step": 310}, {"loss": 0.5160342216491699, "grad_norm": 12.072877421688775, "learning_rate": 4.325567625129545e-08, "token_acc": 0.8360814742967992, "epoch": 4.144736842105263, "step": 315}, {"loss": 0.517140245437622, "grad_norm": 13.029478667761595, "learning_rate": 3.702014779041826e-08, "token_acc": 0.8426640926640927, "epoch": 4.2105263157894735, "step": 320}, {"loss": 0.5130139827728272, "grad_norm": 12.08777604241874, "learning_rate": 3.123382963935156e-08, "token_acc": 0.8438409311348206, "epoch": 4.276315789473684, "step": 325}, {"loss": 0.5459250926971435, "grad_norm": 12.260821906980192, "learning_rate": 2.5908926115744994e-08, "token_acc": 0.8408723747980614, "epoch": 4.342105263157895, "step": 330}, {"loss": 0.5175511837005615, "grad_norm": 12.81988134221774, "learning_rate": 2.1056668336235623e-08, "token_acc": 0.8449477351916377, "epoch": 4.407894736842105, "step": 335}, {"loss": 0.5346428871154785, "grad_norm": 13.095033793006467, "learning_rate": 1.6687290528135722e-08, "token_acc": 0.8287731685789939, "epoch": 4.473684210526316, "step": 340}, {"loss": 0.4708279609680176, "grad_norm": 10.822304213631957, "learning_rate": 1.2810008443726456e-08, "token_acc": 0.8596491228070176, "epoch": 4.5394736842105265, "step": 345}, {"loss": 0.5827170848846436, "grad_norm": 12.010501597635164, "learning_rate": 9.432999922687396e-09, "token_acc": 0.8166969147005445, "epoch": 4.605263157894737, "step": 350}, {"loss": 0.47942113876342773, "grad_norm": 12.949414781142634, "learning_rate": 6.563387643658075e-09, "token_acc": 0.8479125248508946, "epoch": 4.671052631578947, "step": 355}, {"loss": 0.5540740013122558, "grad_norm": 13.897556186858308, "learning_rate": 4.207224101311246e-09, "token_acc": 0.821, "epoch": 4.7368421052631575, "step": 360}, {"loss": 0.48682217597961425, "grad_norm": 12.7137424197023, "learning_rate": 
2.3694788406241894e-09, "token_acc": 0.8585365853658536, "epoch": 4.802631578947368, "step": 365}, {"loss": 0.5381804466247558, "grad_norm": 12.303136136456917, "learning_rate": 1.0540279752731252e-09, "token_acc": 0.8416955017301038, "epoch": 4.868421052631579, "step": 370}, {"loss": 0.5564488410949707, "grad_norm": 10.888367382127468, "learning_rate": 2.636460122578399e-10, "token_acc": 0.8355674709562109, "epoch": 4.934210526315789, "step": 375}, {"loss": 0.5687745094299317, "grad_norm": 15.155406114537248, "learning_rate": 0.0, "token_acc": 0.8452173913043478, "epoch": 5.0, "step": 380}, {"eval_loss": 0.7308594584465027, "eval_runtime": 13.7849, "eval_samples_per_second": 9.793, "eval_steps_per_second": 1.233, "eval_token_acc": 0.7843450479233227, "epoch": 5.0, "step": 380}, {"eval_loss": 0.7308594584465027, "eval_runtime": 16.1805, "eval_samples_per_second": 8.343, "eval_steps_per_second": 1.051, "eval_token_acc": 0.7843450479233227, "epoch": 5.0, "step": 380}, {"train_runtime": 2916.558, "train_samples_per_second": 2.086, "train_steps_per_second": 0.13, "total_flos": 2.788227173675172e+17, "train_loss": 0.6444208371011835, "epoch": 5.0, "step": 380}], "memory": 75.64453125}
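
The logging.jsonl entries above are one JSON object per line, mixing per-step training records (`loss`, `token_acc`, `global_step/max_steps`), periodic `eval_loss` records, and a final summary with the full `log_history`. As an illustrative sketch only (the local file path is an assumption; the field names are taken from the records themselves), the two curves can be pulled apart like this:

```python
import json

def load_curves(path="logging.jsonl"):  # hypothetical path, not part of the run
    """Split a swift-style logging.jsonl into (train, eval) point lists."""
    train, evals = [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rec = json.loads(line)
            # "global_step/max_steps" is a string like "270/380" in these records
            step_str = rec.get("global_step/max_steps")
            step = int(step_str.split("/")[0]) if step_str else None
            if "loss" in rec:            # per-step training record
                train.append((step, rec["loss"], rec.get("token_acc")))
            elif "eval_loss" in rec:     # periodic evaluation record
                evals.append((step, rec["eval_loss"], rec.get("eval_token_acc")))
    return train, evals

if __name__ == "__main__":
    train, evals = load_curves()
    print(f"{len(train)} train points, {len(evals)} eval points")
    if evals:
        best_step, best_loss, best_acc = min(evals, key=lambda t: t[1])
        print(f"best eval_loss {best_loss:.5f} at step {best_step} (token_acc {best_acc})")
```

On the run above this should recover the best eval_loss of 0.72654486 at step 228, matching the `best_model_checkpoint` recorded in the summary record.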
ood/ivl-8b-instruct-full_sft_ood/v0-20251004-170240/val_dataset.jsonl ADDED
@@ -0,0 +1,135 @@
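
Each of the 135 records below is one JSON object with a `messages` list (system/user/assistant turns, the user turn carrying an `<image>` placeholder) and an `images` list of file paths. A minimal reading sketch follows; the key names come from the records themselves, while the local path is a hypothetical assumption:

```python
import json

def iter_samples(path="val_dataset.jsonl"):  # hypothetical local path
    """Yield (system, user, assistant, image_path) tuples from records like those below."""
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            rec = json.loads(line)
            # one system / user / assistant turn per record in this dataset
            turns = {m["role"]: m["content"] for m in rec["messages"]}
            image_path = rec["images"][0]["path"] if rec.get("images") else None
            yield turns.get("system"), turns.get("user"), turns.get("assistant"), image_path

if __name__ == "__main__":
    for system, user, target, image_path in iter_samples():
        print(user)
        print("->", target)
        print("image:", image_path)
        break  # just show the first record
```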
1
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA broken-out path in the snow – never take it for granted!", "loss": null}, {"role": "assistant", "content": "雪地里的一条开辟出来的小径——永远不要认为这是理所当然的!", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2938376423.jpg"}]}
2
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA bad day on the slopes beats a good day at work. Snowboarding shower curtain.", "loss": null}, {"role": "assistant", "content": "雪坡上糟糕的一天也比工作中的好日子强。单板滑雪主题浴帘。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1555605271.jpg"}]}
3
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA badass logo for a digital marketing agency.", "loss": null}, {"role": "assistant", "content": "一个超酷的数字营销机构标志", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_518835396.jpg"}]}
4
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA statue of a suit on display.", "loss": null}, {"role": "assistant", "content": "一尊装甲服的雕像正在展出。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_81366672.jpg"}]}
5
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nInside the shed, paneling and junk, a hole in the wall that has been covered—looks like a window.", "loss": null}, {"role": "assistant", "content": "棚屋里有镶板和杂物,墙上有一个被遮盖的洞——看起来像一扇窗户。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_3826601270.jpg"}]}
6
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThree men are together with a sign.", "loss": null}, {"role": "assistant", "content": "三个人和一个标牌在一起。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/4750415524.jpg"}]}
7
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA modest concrete and brick facade renders the apartment sympathetic, despite its height.", "loss": null}, {"role": "assistant", "content": "一个简朴的混凝土和砖砌外墙让这栋公寓显得和谐,尽管它很高。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2238752144.jpg"}]}
8
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA competition and the policemen on a motorbike.", "loss": null}, {"role": "assistant", "content": "一场比赛和骑摩托车的警察。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000170636.jpg"}]}
9
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA Penrith fan proposes at the trial on the weekend.", "loss": null}, {"role": "assistant", "content": "一名彭里斯球迷在周末的体育比赛中求婚。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1162350536.jpg"}]}
10
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA newly operated cat with a funnel on his head.", "loss": null}, {"role": "assistant", "content": "一只刚做完手术、头上戴着伊丽莎白圈的猫。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_1983045687.jpg"}]}
11
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA group of hawks is perched with hunting caps on their heads.", "loss": null}, {"role": "assistant", "content": "一群鹰戴着猎帽栖息着。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000573484.jpg"}]}
12
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nCricket player plays football with another cricket player during a training session.", "loss": null}, {"role": "assistant", "content": "板球运动员在训练期间与另一位板球运动员踢足球。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_2478276218.jpg"}]}
13
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThe person is feeling very festive in one of her new collars.", "loss": null}, {"role": "assistant", "content": "她的宠物戴上了其中一个新项圈,她感到非常喜庆。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_1250524829.jpg"}]}
14
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA group of people standing together with some turkeys.", "loss": null}, {"role": "assistant", "content": "一群人站在一起,旁边有几只火鸡。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000099179.jpg"}]}
15
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nApple melting on the floor.", "loss": null}, {"role": "assistant", "content": "一个苹果在地板上融化。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_3119350094.jpg"}]}
16
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA person is observed making marks.", "loss": null}, {"role": "assistant", "content": "一个人被观察到正在写字。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/5508346028.jpg"}]}
17
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA new vision for East Hanover Street.", "loss": null}, {"role": "assistant", "content": "东汉诺威街的新视觉设计", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2903564087.jpg"}]}
18
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nCows in a field from the train.", "loss": null}, {"role": "assistant", "content": "从火车上看田野里的牛。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_2867233884.jpg"}]}
19
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA SUP or kayak rigged for fly fishing, a snag-free deck, and a cooler to stand on.", "loss": null}, {"role": "assistant", "content": "一个用于飞钓的立式桨板或皮划艇,无钩甲板,还有一个用来站立的冷却器。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_3469033103.jpg"}]}
20
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA tree with the effect of destruction, dispersion, butterfly, moth.", "loss": null}, {"role": "assistant", "content": "一棵带有破坏、分散、蝴蝶和飞蛾效果的树。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2934168610.jpg"}]}
21
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA woman cannot survive on wine alone; she also needs a camper and a dog shirt.", "loss": null}, {"role": "assistant", "content": "一个女人不能只靠葡萄酒生存;她还需要一辆露营车和一只狗。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2678975805.jpg"}]}
22
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA grandfather is someone you can look up to, no matter how tall you grow, photographic print.", "loss": null}, {"role": "assistant", "content": "祖父是一个无论你长多高都值得仰望的人。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2935502303.jpg"}]}
23
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nSubway braking on rails in front of a metropolitan city.", "loss": null}, {"role": "assistant", "content": "地铁在大都市前的铁轨上刹车", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000400152.jpg"}]}
24
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA bit of a mess, but I’m still a magical sticker.", "loss": null}, {"role": "assistant", "content": "有点乱,但我仍然是一个神奇的贴纸。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1763187881.jpg"}]}
25
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThere are people looking at the man that fell.", "loss": null}, {"role": "assistant", "content": "有人在看着那个摔倒的人。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/482882307.jpg"}]}
26
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThis is a girl on a skateboard walking her pit bull.", "loss": null}, {"role": "assistant", "content": "这是一个在滑板上的女孩遛着她的比特犬。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000379520.jpg"}]}
27
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nTwo ancient Roman soldiers with machine guns capped: when you're losing to the computer in Age of Empires and start using cheats.", "loss": null}, {"role": "assistant", "content": "两名古罗马士兵手持机关枪:当你在《帝国时代》中输给电脑并开始使用作弊码时。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2854084108.jpg"}]}
28
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA little bit dramatic graphic T-shirt.", "loss": null}, {"role": "assistant", "content": "印有“A LITTLE BIT DRAMATIC”图案的T恤。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2298487757.jpg"}]}
29
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA match ball on the pitch.", "loss": null}, {"role": "assistant", "content": "比赛用球在球场上。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_1324293325.jpg"}]}
30
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nTraveling concept around the world.", "loss": null}, {"role": "assistant", "content": "环游世界的概念", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2632725473.jpg"}]}
31
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThe man performs grooming activities.", "loss": null}, {"role": "assistant", "content": "该男子正在整理仪容。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/4322110026.jpg"}]}
32
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nPicture of a cyber woman with a fresh lemon.", "loss": null}, {"role": "assistant", "content": "一张赛博女性拿着新鲜柠檬的照片。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2368384225.jpg"}]}
33
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA footpath with flower structures lined the path, and people walked in between it.", "loss": null}, {"role": "assistant", "content": "一条两侧带有园艺花卉结构的人行道,人们在其间穿行。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2923117339.jpg"}]}
34
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThis secluded cabin in the woods has us dreaming of making an escape right now.", "loss": null}, {"role": "assistant", "content": "这间隐秘的林中小屋让我们梦想着现在就逃离一切。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_4000716874.jpg"}]}
35
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA fishing boat trawling off Constitutional Republic with seabirds following.", "loss": null}, {"role": "assistant", "content": "一艘渔船在宪政共和国附近拖网捕鱼,海鸟紧随其后。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_3310020685.jpg"}]}
36
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA restaurant or other place to eat at Bagan Hotel River View.", "loss": null}, {"role": "assistant", "content": "在蒲甘河景酒店的一家餐厅或其他用餐场所。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_834397888.jpg"}]}
37
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nIt’s always a great idea to put something at the foot of the bed!", "loss": null}, {"role": "assistant", "content": "在床尾放点东西总是个好主意!", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_3031991668.jpg"}]}
38
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThe laptop is sitting on the cushion facing the TV.", "loss": null}, {"role": "assistant", "content": "笔记本电脑放在垫子上,屏幕朝向电视。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000064834.jpg"}]}
39
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA few drinks to get started.", "loss": null}, {"role": "assistant", "content": "从先喝几杯开始。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_3230391523.jpg"}]}
40
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA group of green men and women in a circle.", "loss": null}, {"role": "assistant", "content": "一群绿色的男女围成一个圈。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_132331323.jpg"}]}
41
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA flight of stairs on the way.", "loss": null}, {"role": "assistant", "content": "路上的一段楼梯", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_101917360.jpg"}]}
42
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA lone bench sits in front of an area filled with greenery clumps and aligned as if it is the head of a class.", "loss": null}, {"role": "assistant", "content": "一张孤零零的长椅坐落在一片绿植丛前,这些绿植整齐排列,仿佛是班级的领头。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000349734.jpg"}]}
43
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nHandwritten in green ink, impressions of a kiss mouth on the back, which also shine through on the front side.", "loss": null}, {"role": "assistant", "content": "用绿色墨水手写,背面有吻痕的印记,这些印记也透到了正面。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_4203389789.jpg"}]}
44
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA clock with a concrete bird next to it.", "loss": null}, {"role": "assistant", "content": "一个时钟旁边有一只混凝土制成的鸟。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000238455.jpg"}]}
45
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA family prepares to run a rapid on the Jordan River. There are several points on the river where one can float on a raft down the river.", "loss": null}, {"role": "assistant", "content": "一个家庭准备在约旦河上穿越急流。河上有几个地方可以乘坐皮筏顺流而下。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1638667585.jpg"}]}
46
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA person with a tie on a metal rail.", "loss": null}, {"role": "assistant", "content": "一个戴领带的人靠在金属栏杆上。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000579589.jpg"}]}
47
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA girl watches a boy swing.", "loss": null}, {"role": "assistant", "content": "一个女孩看着一个男孩挥棒。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/5769959745.jpg"}]}
48
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nSand runs on the road with water.", "loss": null}, {"role": "assistant", "content": "沙子随着水在道路上流动。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1301356238.jpg"}]}
49
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA five-star bus is going down the road.", "loss": null}, {"role": "assistant", "content": "一辆五星巴士正在路上行驶。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000144298.jpg"}]}
50
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThe \"North\" in front of the B Street sign shows a good example of traveling direction.", "loss": null}, {"role": "assistant", "content": "B街标志上的“北”很好地展示了旅行方向。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_2961209760.jpg"}]}
51
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA vision for this forward-looking policy document sets out a vision for.", "loss": null}, {"role": "assistant", "content": "这份前瞻性的政策文件提出了一个愿景。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_727734346.jpg"}]}
52
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA giraffe drinks from the river near approaching elephants.", "loss": null}, {"role": "assistant", "content": "长颈鹿在河边喝水,附近有正在接近的大象。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000118401.jpg"}]}
53
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nLine of animals over the changing table.", "loss": null}, {"role": "assistant", "content": "换尿布台上方的一排动物。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_1028662478.jpg"}]}
54
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA person displays her tail on the beach.", "loss": null}, {"role": "assistant", "content": "一个人在海滩上展示她的美人鱼尾巴。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_2735079883.jpg"}]}
55
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA long-haired dog is playing with a small, blue bat.", "loss": null}, {"role": "assistant", "content": "一只长毛狗正在玩一个小的蓝色球棒。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/2375924666.jpg"}]}
56
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA group of people in the shape of lock, heart, flash mob.", "loss": null}, {"role": "assistant", "content": "一群人排成锁和心形状,进行快闪活动。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_525499828.jpg"}]}
57
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nIt's the first concrete electricity pole in Japan, or so the sign says.", "loss": null}, {"role": "assistant", "content": "这是日本的第一根混凝土电线杆,至少牌子上是这么说的。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_395581361.jpg"}]}
58
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA door that has a window with a dog behind it.", "loss": null}, {"role": "assistant", "content": "一扇有窗户的门,窗户后面有一只狗。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000539263.jpg"}]}
59
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA little imagination goes a long way.", "loss": null}, {"role": "assistant", "content": "一点想象力大有帮助。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_4169470898.jpg"}]}
60
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA good dose of local in the lobby.", "loss": null}, {"role": "assistant", "content": "大堂里充满了本地特色。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_4116608243.jpg"}]}
61
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThe computer is sitting on a desk with a mouse.", "loss": null}, {"role": "assistant", "content": "电脑放在桌子上,旁边有一个鼠标。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000502877.jpg"}]}
62
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA fan with a painted face with '19' celebrating the titles his club has won.", "loss": null}, {"role": "assistant", "content": "一位脸上涂着‘19’的球迷正在庆祝他所在俱乐部赢得的冠军。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_3548576173.jpg"}]}
63
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA photo of workers working on the clock on a tower.", "loss": null}, {"role": "assistant", "content": "一张工人在塔上的钟表上工作的照片。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000332417.jpg"}]}
64
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA person using a cell phone and some kitty knick-knacks.", "loss": null}, {"role": "assistant", "content": "一个人在使用手机,旁边有一些猫咪装饰品。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000055395.jpg"}]}
65
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nAn improperly dressed person is outside trying to stay out of the elements.", "loss": null}, {"role": "assistant", "content": "一个穿着不当的人在外面试图避开恶劣天气。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/2685139184.jpg"}]}
66
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA hand with a red pen drew a black businessman climbing up the stairs, a concept of success and career.", "loss": null}, {"role": "assistant", "content": "一只手拿着红笔画了一个正在爬楼梯的黑人商人,象征成功和事业。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_1853021108.jpg"}]}
67
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nOne man is making sure the scope is right and the other is getting ready to shoot the gun.", "loss": null}, {"role": "assistant", "content": "一个人在确保瞄准镜正确,另一个人准备开枪。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/7735129098.jpg"}]}
68
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA heart tap dancing with a big cheesy grin.", "loss": null}, {"role": "assistant", "content": "一个心形正在跳踢踏舞,带着夸张的大笑。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_3839401695.jpg"}]}
69
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA pug dressed as a film character from the movie walks along the runway.", "loss": null}, {"role": "assistant", "content": "一只打扮成电影角色的哈巴狗走在跑道上。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_4073626822.jpg"}]}
70
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA woman standing behind a podium, and an airplane figure.", "loss": null}, {"role": "assistant", "content": "一位站在讲台后面的女士和一个飞机模型。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2960077463.jpg"}]}
71
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA large-scale drawing of a \"global city.\"", "loss": null}, {"role": "assistant", "content": "一幅大规模的“全球城市”画作", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_3980987087.jpg"}]}
72
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThis hot dog is loaded with different peppers.", "loss": null}, {"role": "assistant", "content": "这个热狗加了很多种辣椒。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000497440.jpg"}]}
73
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA special theorem about tangent-tangent angles given: and are tangent to circle O.", "loss": null}, {"role": "assistant", "content": "一个关于切线-切线角的特殊定理:PT与PS是圆O的切线。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_4101914416.jpg"}]}
74
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA man with more confidence with his new crowns.", "loss": null}, {"role": "assistant", "content": "一个男人带着新的牙冠更加自信了。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_26854658.jpg"}]}
75
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA close-up of the lager and stout.", "loss": null}, {"role": "assistant", "content": "拉格啤酒和黑啤的特写", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_468095102.jpg"}]}
76
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA pair of stunning quality, check out the soft suede.", "loss": null}, {"role": "assistant", "content": "一双质量极佳的鞋子,看看这柔软的麂皮。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_1908856807.jpg"}]}
77
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA couple of kites are flying above a field.", "loss": null}, {"role": "assistant", "content": "几只风筝在田野上空飞翔。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000339705.jpg"}]}
78
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA flying kite with a curly tail mimics a Blue Angels' jet.", "loss": null}, {"role": "assistant", "content": "一只带着卷曲尾巴的风筝模仿了蓝天使飞行队的喷气式飞机。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000044478.jpg"}]}
79
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nHangin' in the car seat.", "loss": null}, {"role": "assistant", "content": "坐在汽车座椅上休息。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_3104028364.jpg"}]}
80
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA guide to ageing young barn owls.", "loss": null}, {"role": "assistant", "content": "一份关于鉴定年轻仓鸮年龄的指南", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1728337343.jpg"}]}
81
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA woman with a glove on a baseball field with another woman.", "loss": null}, {"role": "assistant", "content": "一个戴着手套的女人和另一个女人在棒球场上。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/566794440.jpg"}]}
82
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA pool of calming water at the National Museum of African American History and Culture all-over print tote bag.", "loss": null}, {"role": "assistant", "content": "国家非裔美国人历史和文化博物馆的一池平静的水印在全印花托特包上。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_3904468300.jpg"}]}
83
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA crystal-clear pool and scalloped rock in your cave, White County, TN.", "loss": null}, {"role": "assistant", "content": "田纳西州怀特县,一个洞穴中有清澈的池水和扇形岩石。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_3707245747.jpg"}]}
84
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nBound by the shadows. I like black and white better (:.", "loss": null}, {"role": "assistant", "content": "被阴影束缚。我更喜欢黑白(:。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_3074023974.jpg"}]}
85
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA big-ass rock in the ocean.", "loss": null}, {"role": "assistant", "content": "海洋中的一块巨石", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_4214355063.jpg"}]}
86
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA cast-iron skillet with eight cheater's chorizo meatballs being cooked in it.", "loss": null}, {"role": "assistant", "content": "一个铸铁煎锅里正在煎八个简易版西班牙辣香肠肉丸。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1348809314.jpg"}]}
87
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA map of the system.", "loss": null}, {"role": "assistant", "content": "一张交通系统的地图", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_3169415834.jpg"}]}
88
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nPrinted pleated blouse - reverse of the article.", "loss": null}, {"role": "assistant", "content": "印花褶皱上衣 - 衣物的背面", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1394401849.jpg"}]}
89
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA man holding a child on top of a skateboard.", "loss": null}, {"role": "assistant", "content": "一个男人扶着站在滑板上的孩子。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000414709.jpg"}]}
90
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nDog - when a song reminds you of a painful memory, but it bumps really hard, so you keep listening to it and try to ignore the flashbacks.", "loss": null}, {"role": "assistant", "content": "狗——当一首歌让你想起痛苦的回忆,但它的节奏感非常强,所以你继续听,并试图忽略那些闪回。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_4153226469.jpg"}]}
91
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nSomebody here is waiting to get somewhere else.", "loss": null}, {"role": "assistant", "content": "这里有人在等待去别的地方。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/4679771579.jpg"}]}
92
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA beautiful woman standing and carrying a baby close to her.", "loss": null}, {"role": "assistant", "content": "一位美丽的女子站着,怀里抱着一个紧贴着她的婴儿。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_425878648.jpg"}]}
93
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA bag of tricks: see page 297 of your text.", "loss": null}, {"role": "assistant", "content": "一套技巧:请参阅课本第297页。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2554611256.jpg"}]}
94
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA child causes himself not to see while next to a piece of furniture.", "loss": null}, {"role": "assistant", "content": "一个孩子站在家具旁边,用手遮住了眼睛。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/6820458661.jpg"}]}
95
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nVandalized stop sign with a sticker reminding people to stop [eating animals].", "loss": null}, {"role": "assistant", "content": "被破坏的停车标志,上面贴着一个提醒人们停止吃动物的贴纸。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000206300.jpg"}]}
96
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA zener is not like a normal diode.", "loss": null}, {"role": "assistant", "content": "齐纳二极管不像普通的二极管。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_4108174908.jpg"}]}
97
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThe water from the sink that you wash your hands in flushes into the toilet tank.", "loss": null}, {"role": "assistant", "content": "洗手池里的水(你洗手时流出的水)直接流入马桶水箱。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_3611995191.jpg"}]}
98
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA light dusting of snow covers some ferns living just outside of rooms to stay in Armidale.", "loss": null}, {"role": "assistant", "content": "一层薄雪覆盖了阿米代尔旅馆房间外的一些蕨类植物。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_98916313.jpg"}]}
99
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA peek inside the powder room.", "loss": null}, {"role": "assistant", "content": "一窥洗手间内部", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1537635414.jpg"}]}
100
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA lot goes into a good suit.", "loss": null}, {"role": "assistant", "content": "一套好西装需要很多讲究。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_2106491181.jpg"}]}
101
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA giraffe and several zebra in Tail Brush.", "loss": null}, {"role": "assistant", "content": "一只长颈鹿和几只斑马在Tail Brush。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000192217.jpg"}]}
102
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nProduce market in India protected by umbrella shades.", "loss": null}, {"role": "assistant", "content": "印度的农产品市场受到伞荫的保护。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/3864394764.jpg"}]}
103
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA giraffe with his head out of sight over a covering.", "loss": null}, {"role": "assistant", "content": "一只长颈鹿的头越过遮盖物看不见了。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000496309.jpg"}]}
104
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA distant artistic conception of zen ink landscape painting. Layers of mountains and mountains of zen ink landscape painting.", "loss": null}, {"role": "assistant", "content": "远处的禅意水墨山水画意境,层层叠叠的山峦和禅意水墨山水画的意象。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2191215683.jpg"}]}
105
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA bed or beds in a room at an English holiday home.", "loss": null}, {"role": "assistant", "content": "英国度假屋一个房间里的床铺。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_913296828.jpg"}]}
106
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA group of red and white striped tulips reminds me of the famous variety.", "loss": null}, {"role": "assistant", "content": "一簇红白条纹的郁金香让我想起了那个著名的品种。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_887742134.jpg"}]}
107
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nYoung Caucasian woman organizing a birthday, keeping a secret, or asking for silence.", "loss": null}, {"role": "assistant", "content": "一位年轻的白人女性正在组织生日派对,或在保守秘密,或要求保持安静。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1683317947.jpg"}]}
108
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nObservers sit watching jugglers with fire.", "loss": null}, {"role": "assistant", "content": "观众坐着观看玩火的杂技演员。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/111069375.jpg"}]}
109
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA collection of trees and branches across the Sucker River from its other bank. I guess I did call this a bridge!", "loss": null}, {"role": "assistant", "content": "从苏克河的另一岸看过去,是一堆树木和树枝。我想我确实把这叫做一座桥了!", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2031663003.jpg"}]}
110
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA bed or beds in a room at Destinations Inn theme rooms.", "loss": null}, {"role": "assistant", "content": "Destinations Inn 主题房间中的床。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1080498727.jpg"}]}
111
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nNo building, under tree school at village Kumb.", "loss": null}, {"role": "assistant", "content": "库姆村的学校没有建筑物,设在树下。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_2212461512.jpg"}]}
112
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThere are two people wearing plastic.", "loss": null}, {"role": "assistant", "content": "有两个人穿着塑料材质的服装。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/3391209042.jpg"}]}
113
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nBeautiful girl/model in couture dress standing by huge privet hedge on estate in Southampton, NY. Photo by Eric Striffler.", "loss": null}, {"role": "assistant", "content": "美丽的模特穿着高级定制礼服站在纽约南安普顿庄园的巨大女贞树篱旁。照片由Eric Striffler拍摄。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_3010430221.jpg"}]}
114
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThe laundry is hanging in the tilted room.", "loss": null}, {"role": "assistant", "content": "洗衣物挂在倾斜的房间里。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000279689.jpg"}]}
115
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA skater is holding out his hand and flashing a symbol.", "loss": null}, {"role": "assistant", "content": "滑板者伸出手,做了一个手势。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000107974.jpg"}]}
116
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA set of banners for the web of sport, fitness, and motivation text for sports equipment.", "loss": null}, {"role": "assistant", "content": "一组用于体育、健身和激励网站的横幅,包含体育设备的文字。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_3203677853.jpg"}]}
117
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA girl in a black tank with cargo shorts to what appears to be dancing, with several people around.", "loss": null}, {"role": "assistant", "content": "一个穿着黑色背心和工装短裤的女孩似乎在跳舞,周围有几个人。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/4879725156.jpg"}]}
118
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA bathroom has dark colored appliances and light wood.", "loss": null}, {"role": "assistant", "content": "浴室里有深色的用具和浅色的木地板。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000400596.jpg"}]}
119
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA view of the shops and passers.", "loss": null}, {"role": "assistant", "content": "商店和路人的景象", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_43578409.jpg"}]}
120
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nAnother cute kids' choir singing in church. The girl on the left did a lot of choir teaching, I think.", "loss": null}, {"role": "assistant", "content": "另一个可爱的儿童合唱团在教堂演唱。我想左边的女孩做了很多合唱教学。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_3142177036.jpg"}]}
121
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA little turkey of construction paper with the family name on it.", "loss": null}, {"role": "assistant", "content": "一只用彩纸做的小火鸡,上面写着家人的名字。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_3508197500.jpg"}]}
122
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA cheeky flip of the tail.", "loss": null}, {"role": "assistant", "content": "调皮地甩了一下尾巴。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_541197507.jpg"}]}
123
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA box in the boot... what more could a kitty ask for...", "loss": null}, {"role": "assistant", "content": "后备箱里的一个盒子……一只小猫还能要求什么呢……", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_3308376282.jpg"}]}
124
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nAn Air France plane is on the fly track.", "loss": null}, {"role": "assistant", "content": "一架法国航空的飞机在跑道上。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000287960.jpg"}]}
125
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA black-and-white cat sleeping on top of a TV with a fish on it.", "loss": null}, {"role": "assistant", "content": "一只黑白相间的猫正睡在有一条鱼的电视上面。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000251358.jpg"}]}
126
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA woman on a park bench hides her face behind her handbag. The bench is marked.", "loss": null}, {"role": "assistant", "content": "一名女子坐在公园长椅上,用手提包遮住了脸。这张长椅上有标记。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc_3528349762.jpg"}]}
127
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA pair of birds in the crown of the heart tree.", "loss": null}, {"role": "assistant", "content": "一对鸟儿在心形树的树冠上。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1020101235.jpg"}]}
128
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThis looks like a craft of cutting out greenery from magazines.", "loss": null}, {"role": "assistant", "content": "这看起来像是从杂志上剪下绿色植物的手工艺。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000127167.jpg"}]}
129
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nThey don't call it the land of ice and fire for nothing.", "loss": null}, {"role": "assistant", "content": "他们称这里为冰与火之国并非没有道理。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1643510681.jpg"}]}
130
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA fairy figure among the seashells and flowers.", "loss": null}, {"role": "assistant", "content": "贝壳和花朵中的仙女形象", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000577434.jpg"}]}
131
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA man is on his truck.", "loss": null}, {"role": "assistant", "content": "一个男人坐在他的卡车后部。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/4859995088.jpg"}]}
132
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nDog mom, the soul of a witch, the fire of a lioness v-neck t-shirt.", "loss": null}, {"role": "assistant", "content": "狗妈妈,女巫的灵魂,母狮的火焰 V领T恤", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_2524820280.jpg"}]}
133
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA creative cold gradient line drawing cartoon fluid injection.", "loss": null}, {"role": "assistant", "content": "一幅创意的冷色调渐变线条卡通流体注射。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/cc12m_1778791047.jpg"}]}
134
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nHorse lamp in front of check-in desk.", "loss": null}, {"role": "assistant", "content": "登记台前的马形灯。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/sbu_1044392237.jpg"}]}
135
+ {"messages": [{"role": "system", "content": "A conversation between User and Assistant. The User provides text with ambiguity along with an image, and the Assistant performs multimodal translation, using both the text and the visual information to resolve ambiguities.", "loss": null}, {"role": "user", "content": "<image>Please translate the following English sentence into Chinese:\nA bunch of green vegetables on a table along with some literature.", "loss": null}, {"role": "assistant", "content": "桌上有一堆绿色蔬菜和一些宣传资料。", "loss": null}], "images": [{"bytes": null, "path": "/mnt/data/users/liamding/data/3AM/3AM/images/000000577631.jpg"}]}
ood/ivl-8b-instruct-thinking_full_v3_ood_wd001_e10/v3-20250919-091625/checkpoint-228/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6f3949e2b25ad05905da16bc2d58aeadc9a9f1db0b0e97a04ceb63467acff1d
3
- size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46438be1a017b4129e12daa7706d8e089acb52ce21dd221bdf56b7805908f8e7
3
+ size 130
ood/ivl-8b-instruct-thinking_full_v3_ood_wd001_e10/v3-20250919-091625/checkpoint-228/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6eda756b33768639c1e1932ae00c5d43269de5705c09162bc09acdf71d3533c0
3
- size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44b5d9c1719ded08a1ad4d2ab99a52e42b832740b2f617b20fbced3b44af1cb6
3
+ size 130
ood/ivl-8b-instruct-thinking_full_v3_ood_wd001_e10/v3-20250919-091625/checkpoint-228/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:668792bf09fbb49f9c314d93f695654b11742e9f79ae00886d88610037822211
3
- size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2706c505d9b4f951bbed8fdc626de4fd07d6c4cab5c57c69911cad80ca8cd266
3
+ size 129
ood/ivl-8b-instruct-thinking_full_v3_ood_wd001_e10/v3-20250919-091625/checkpoint-228/tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f9ba4b4a6625b5047a1356f6081b641c3e4e6a4a198facbd4bef217747d1685
3
- size 11423548
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be323c4be6beb251cbbb892753b8e3795ed3cc61e6c1cd7b99248f07c0dd77d3
3
+ size 133
ood/ivl-8b-instruct-thinking_full_v3_ood_wd001_e10/v3-20250919-091625/checkpoint-228/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e24637d5664f539efa19070b97cd134b22e84f4e6886afc216787e15a1dabf33
3
- size 9105
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4008f8556423d2a54d5c5da7dff26ae095bd1dbc88171574086541a58bf550e2
3
+ size 129
ood/qwen2.5vl-7b-lora-ood-merged/model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:feda0a72bf79db5c840d893b72eae28a52c8efa4f9dbeb70919bef25be1f4203
3
- size 4968243304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a982e6dee71b0ebae179ee58d2e60d88b96c82a27afeb3b898682918322a4896
3
+ size 135
ood/qwen2.5vl-7b-lora-ood-merged/model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:87f787aa0a6de41b70622019d0945b52fc4f4727e489dbc04a16aa06abbececc
3
- size 4991495816
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f6463345d04663ff4f9afc252b987018d3c3d62aaf19a2e08f1c9af850769a5
3
+ size 135
ood/qwen2.5vl-7b-lora-ood-merged/model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7631840e9191407b17edbbbadd1415be270a24562d698d8cd259c35e0d882b9c
3
- size 4932751040
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96868afd4278fd0b65c13d19d69e75e60dd517d61f4a95923b07a36024b44a1a
3
+ size 135
ood/qwen2.5vl-7b-lora-ood-merged/model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ff7c07eaa9cb4421f500f376e445441b2d0a6ff70b4b899b3231b2ccad16000
3
- size 1691924384
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d026cc75d4103794f79ae83e5228042b6ae32e017d0c1018237f6cc38b9b6e70
3
+ size 135
ood/qwen2.5vl-7b-lora-ood-merged/tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
- size 11421896
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cedf8734fa1adb8c4ce97738255bb9ae9a474dcc57a219a61df59d32cee881c9
3
+ size 133
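The entries in these diffs are Git LFS pointer files rather than the binary payloads themselves: each pointer carries a spec version, the sha256 oid of the tracked blob, and its size in bytes. A minimal sketch of parsing such a pointer (the function name and usage are illustrative, not part of this repository):

```python
def parse_lfs_pointer(text):
    """Parse a Git LFS pointer of the form shown in these diffs.

    Expected lines:
      version https://git-lfs.github.com/spec/v1
      oid sha256:<hex digest>
      size <bytes>
    """
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return {
        "version": fields.get("version"),
        "oid": fields.get("oid", "").removeprefix("sha256:"),
        "size": int(fields.get("size", "0")),
    }

# Illustrative usage with the tokenizer.json pointer above:
pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:cedf8734fa1adb8c4ce97738255bb9ae9a474dcc57a219a61df59d32cee881c9
size 133"""
print(parse_lfs_pointer(pointer))  # {'version': ..., 'oid': 'cedf87...', 'size': 133}
```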
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@

1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abbed0e0d4e1347664a1cc0049e9a6dfc803fcfa7143008ca19b9622aa6c5682
3
+ size 4968243304
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9dcad0749899eaa609764a7a7716c5798980dd4be78a39bb4422fb0728aa52ed
3
+ size 4991495816
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b55402c2db527127c3c2ea2bb2b01836b1566bf17ac0ad434f03e51c2b0ab2a4
3
+ size 4932751040
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55e09e29d1678f8eac1b36f0a75ddcf819e2296f1b26c67f6c4828ca6151fe5d
3
+ size 1691924384
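The four shards just added are referenced by the `model.safetensors.index.json` that follows; its `weight_map` maps each tensor name to the shard that stores it (for example `lm_head.weight` lives in `model-00004-of-00004.safetensors`). A minimal sketch of resolving a tensor name to its shard, assuming the files sit together in one checkpoint directory:

```python
import json
from pathlib import Path

def shard_for_tensor(checkpoint_dir, tensor_name):
    """Look up which safetensors shard stores a given tensor, via weight_map."""
    index_path = Path(checkpoint_dir) / "model.safetensors.index.json"
    index = json.loads(index_path.read_text())
    shard = index["weight_map"][tensor_name]
    return Path(checkpoint_dir) / shard

# Illustrative usage (directory name taken from this commit):
# shard_for_tensor("ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228", "lm_head.weight")
```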
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/model.safetensors.index.json ADDED
@@ -0,0 +1,737 @@
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 848896,
4
+ "total_size": 16584333312
5
+ },
6
+ "weight_map": {
7
+ "lm_head.weight": "model-00004-of-00004.safetensors",
8
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
19
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
20
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
28
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
29
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
30
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
31
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
32
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
33
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
34
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
38
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
39
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
41
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
42
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
43
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
44
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
50
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
51
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
53
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
54
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
55
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
56
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
62
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
63
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
65
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
66
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
67
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
68
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
74
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
75
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
77
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
78
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
79
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
80
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
86
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
87
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
89
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
90
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
91
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
92
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
93
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
96
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
98
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
99
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
101
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
102
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
103
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
104
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
105
+ "model.layers.16.input_layernorm.weight": "model-00003-of-00004.safetensors",
106
+ "model.layers.16.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
107
+ "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
108
+ "model.layers.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
109
+ "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
110
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
111
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
112
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
113
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
114
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
115
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
116
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
117
+ "model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors",
118
+ "model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
119
+ "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
120
+ "model.layers.17.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
121
+ "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
122
+ "model.layers.17.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
123
+ "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
124
+ "model.layers.17.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
125
+ "model.layers.17.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
126
+ "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
127
+ "model.layers.17.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
128
+ "model.layers.17.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
129
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
130
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
131
+ "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
132
+ "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
133
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
134
+ "model.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
135
+ "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
136
+ "model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
137
+ "model.layers.18.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
138
+ "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
139
+ "model.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
140
+ "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
141
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
142
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
143
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
144
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
145
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
146
+ "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
147
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
148
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
149
+ "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
150
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
151
+ "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
152
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
153
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
154
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
155
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
156
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
157
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
158
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
159
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
160
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
161
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
162
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
163
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
164
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
165
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
166
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
167
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
168
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
169
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
170
+ "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
171
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
172
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
173
+ "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
174
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
175
+ "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
176
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
180
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
181
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
182
+ "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
183
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
184
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
185
+ "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
186
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
187
+ "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
188
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
194
+ "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
195
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
197
+ "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
198
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
199
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
200
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
201
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
202
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
204
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
205
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
206
+ "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
207
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
208
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
209
+ "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
210
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
211
+ "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
212
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
213
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
216
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
217
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
218
+ "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
219
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
220
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
221
+ "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
222
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
223
+ "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
224
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
225
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
226
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
227
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
228
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
229
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
230
+ "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
231
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
233
+ "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
234
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
235
+ "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
236
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
237
+ "model.layers.26.input_layernorm.weight": "model-00004-of-00004.safetensors",
238
+ "model.layers.26.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
239
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
240
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
241
+ "model.layers.26.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
242
+ "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
243
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
244
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
245
+ "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
246
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
247
+ "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
248
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
249
+ "model.layers.27.input_layernorm.weight": "model-00004-of-00004.safetensors",
250
+ "model.layers.27.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
251
+ "model.layers.27.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
252
+ "model.layers.27.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
253
+ "model.layers.27.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
254
+ "model.layers.27.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
255
+ "model.layers.27.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
256
+ "model.layers.27.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
257
+ "model.layers.27.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
258
+ "model.layers.27.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
259
+ "model.layers.27.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
260
+ "model.layers.27.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
261
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
262
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
263
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
264
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
265
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
266
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
267
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
268
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
269
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
270
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
271
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
272
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
273
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
274
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
275
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
276
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
277
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
278
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
279
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
280
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
281
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
282
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
283
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
284
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
285
+ "model.layers.5.input_layernorm.weight": "model-00002-of-00004.safetensors",
286
+ "model.layers.5.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
287
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
288
+ "model.layers.5.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
289
+ "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
290
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
291
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
292
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
293
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
294
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
295
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
296
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
297
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
298
+ "model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
299
+ "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
300
+ "model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
301
+ "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
302
+ "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
303
+ "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
304
+ "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
305
+ "model.layers.6.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
306
+ "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
307
+ "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
308
+ "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
309
+ "model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
310
+ "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
311
+ "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
312
+ "model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
313
+ "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
314
+ "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
315
+ "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
316
+ "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
317
+ "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
318
+ "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
319
+ "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
320
+ "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
321
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
322
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
323
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
324
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
325
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
326
+ "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
327
+ "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
328
+ "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
329
+ "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
330
+ "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
331
+ "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
332
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
333
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
334
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
335
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
336
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
337
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
338
+ "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
339
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
340
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
341
+ "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
342
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
343
+ "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
344
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
345
+ "model.norm.weight": "model-00004-of-00004.safetensors",
346
+ "visual.blocks.0.attn.proj.bias": "model-00001-of-00004.safetensors",
347
+ "visual.blocks.0.attn.proj.weight": "model-00001-of-00004.safetensors",
348
+ "visual.blocks.0.attn.qkv.bias": "model-00001-of-00004.safetensors",
349
+ "visual.blocks.0.attn.qkv.weight": "model-00001-of-00004.safetensors",
350
+ "visual.blocks.0.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
351
+ "visual.blocks.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
352
+ "visual.blocks.0.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
353
+ "visual.blocks.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
354
+ "visual.blocks.0.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
355
+ "visual.blocks.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
356
+ "visual.blocks.0.norm1.weight": "model-00001-of-00004.safetensors",
357
+ "visual.blocks.0.norm2.weight": "model-00001-of-00004.safetensors",
358
+ "visual.blocks.1.attn.proj.bias": "model-00001-of-00004.safetensors",
359
+ "visual.blocks.1.attn.proj.weight": "model-00001-of-00004.safetensors",
360
+ "visual.blocks.1.attn.qkv.bias": "model-00001-of-00004.safetensors",
361
+ "visual.blocks.1.attn.qkv.weight": "model-00001-of-00004.safetensors",
362
+ "visual.blocks.1.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
363
+ "visual.blocks.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
364
+ "visual.blocks.1.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
365
+ "visual.blocks.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
366
+ "visual.blocks.1.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
367
+ "visual.blocks.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
368
+ "visual.blocks.1.norm1.weight": "model-00001-of-00004.safetensors",
369
+ "visual.blocks.1.norm2.weight": "model-00001-of-00004.safetensors",
370
+ "visual.blocks.10.attn.proj.bias": "model-00001-of-00004.safetensors",
371
+ "visual.blocks.10.attn.proj.weight": "model-00001-of-00004.safetensors",
372
+ "visual.blocks.10.attn.qkv.bias": "model-00001-of-00004.safetensors",
373
+ "visual.blocks.10.attn.qkv.weight": "model-00001-of-00004.safetensors",
374
+ "visual.blocks.10.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
375
+ "visual.blocks.10.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
376
+ "visual.blocks.10.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
377
+ "visual.blocks.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
378
+ "visual.blocks.10.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
379
+ "visual.blocks.10.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
380
+ "visual.blocks.10.norm1.weight": "model-00001-of-00004.safetensors",
381
+ "visual.blocks.10.norm2.weight": "model-00001-of-00004.safetensors",
382
+ "visual.blocks.11.attn.proj.bias": "model-00001-of-00004.safetensors",
383
+ "visual.blocks.11.attn.proj.weight": "model-00001-of-00004.safetensors",
384
+ "visual.blocks.11.attn.qkv.bias": "model-00001-of-00004.safetensors",
385
+ "visual.blocks.11.attn.qkv.weight": "model-00001-of-00004.safetensors",
386
+ "visual.blocks.11.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
387
+ "visual.blocks.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
388
+ "visual.blocks.11.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
389
+ "visual.blocks.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
390
+ "visual.blocks.11.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
391
+ "visual.blocks.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
392
+ "visual.blocks.11.norm1.weight": "model-00001-of-00004.safetensors",
393
+ "visual.blocks.11.norm2.weight": "model-00001-of-00004.safetensors",
394
+ "visual.blocks.12.attn.proj.bias": "model-00001-of-00004.safetensors",
395
+ "visual.blocks.12.attn.proj.weight": "model-00001-of-00004.safetensors",
396
+ "visual.blocks.12.attn.qkv.bias": "model-00001-of-00004.safetensors",
397
+ "visual.blocks.12.attn.qkv.weight": "model-00001-of-00004.safetensors",
398
+ "visual.blocks.12.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
399
+ "visual.blocks.12.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
400
+ "visual.blocks.12.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
401
+ "visual.blocks.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
402
+ "visual.blocks.12.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
403
+ "visual.blocks.12.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
404
+ "visual.blocks.12.norm1.weight": "model-00001-of-00004.safetensors",
405
+ "visual.blocks.12.norm2.weight": "model-00001-of-00004.safetensors",
406
+ "visual.blocks.13.attn.proj.bias": "model-00001-of-00004.safetensors",
407
+ "visual.blocks.13.attn.proj.weight": "model-00001-of-00004.safetensors",
408
+ "visual.blocks.13.attn.qkv.bias": "model-00001-of-00004.safetensors",
409
+ "visual.blocks.13.attn.qkv.weight": "model-00001-of-00004.safetensors",
410
+ "visual.blocks.13.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
411
+ "visual.blocks.13.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
412
+ "visual.blocks.13.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
413
+ "visual.blocks.13.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
414
+ "visual.blocks.13.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
415
+ "visual.blocks.13.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
416
+ "visual.blocks.13.norm1.weight": "model-00001-of-00004.safetensors",
417
+ "visual.blocks.13.norm2.weight": "model-00001-of-00004.safetensors",
418
+ "visual.blocks.14.attn.proj.bias": "model-00001-of-00004.safetensors",
419
+ "visual.blocks.14.attn.proj.weight": "model-00001-of-00004.safetensors",
420
+ "visual.blocks.14.attn.qkv.bias": "model-00001-of-00004.safetensors",
421
+ "visual.blocks.14.attn.qkv.weight": "model-00001-of-00004.safetensors",
422
+ "visual.blocks.14.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
423
+ "visual.blocks.14.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
424
+ "visual.blocks.14.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
425
+ "visual.blocks.14.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
426
+ "visual.blocks.14.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
427
+ "visual.blocks.14.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
428
+ "visual.blocks.14.norm1.weight": "model-00001-of-00004.safetensors",
429
+ "visual.blocks.14.norm2.weight": "model-00001-of-00004.safetensors",
430
+ "visual.blocks.15.attn.proj.bias": "model-00001-of-00004.safetensors",
431
+ "visual.blocks.15.attn.proj.weight": "model-00001-of-00004.safetensors",
432
+ "visual.blocks.15.attn.qkv.bias": "model-00001-of-00004.safetensors",
433
+ "visual.blocks.15.attn.qkv.weight": "model-00001-of-00004.safetensors",
434
+ "visual.blocks.15.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
435
+ "visual.blocks.15.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
436
+ "visual.blocks.15.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
437
+ "visual.blocks.15.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
438
+ "visual.blocks.15.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
439
+ "visual.blocks.15.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
440
+ "visual.blocks.15.norm1.weight": "model-00001-of-00004.safetensors",
441
+ "visual.blocks.15.norm2.weight": "model-00001-of-00004.safetensors",
442
+ "visual.blocks.16.attn.proj.bias": "model-00001-of-00004.safetensors",
443
+ "visual.blocks.16.attn.proj.weight": "model-00001-of-00004.safetensors",
444
+ "visual.blocks.16.attn.qkv.bias": "model-00001-of-00004.safetensors",
445
+ "visual.blocks.16.attn.qkv.weight": "model-00001-of-00004.safetensors",
446
+ "visual.blocks.16.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
447
+ "visual.blocks.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
448
+ "visual.blocks.16.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
449
+ "visual.blocks.16.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
450
+ "visual.blocks.16.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
451
+ "visual.blocks.16.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
452
+ "visual.blocks.16.norm1.weight": "model-00001-of-00004.safetensors",
453
+ "visual.blocks.16.norm2.weight": "model-00001-of-00004.safetensors",
454
+ "visual.blocks.17.attn.proj.bias": "model-00001-of-00004.safetensors",
455
+ "visual.blocks.17.attn.proj.weight": "model-00001-of-00004.safetensors",
456
+ "visual.blocks.17.attn.qkv.bias": "model-00001-of-00004.safetensors",
457
+ "visual.blocks.17.attn.qkv.weight": "model-00001-of-00004.safetensors",
458
+ "visual.blocks.17.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
459
+ "visual.blocks.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
460
+ "visual.blocks.17.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
461
+ "visual.blocks.17.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
462
+ "visual.blocks.17.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
463
+ "visual.blocks.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
464
+ "visual.blocks.17.norm1.weight": "model-00001-of-00004.safetensors",
465
+ "visual.blocks.17.norm2.weight": "model-00001-of-00004.safetensors",
466
+ "visual.blocks.18.attn.proj.bias": "model-00001-of-00004.safetensors",
467
+ "visual.blocks.18.attn.proj.weight": "model-00001-of-00004.safetensors",
468
+ "visual.blocks.18.attn.qkv.bias": "model-00001-of-00004.safetensors",
469
+ "visual.blocks.18.attn.qkv.weight": "model-00001-of-00004.safetensors",
470
+ "visual.blocks.18.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
471
+ "visual.blocks.18.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
472
+ "visual.blocks.18.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
473
+ "visual.blocks.18.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
474
+ "visual.blocks.18.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
475
+ "visual.blocks.18.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
476
+ "visual.blocks.18.norm1.weight": "model-00001-of-00004.safetensors",
477
+ "visual.blocks.18.norm2.weight": "model-00001-of-00004.safetensors",
478
+ "visual.blocks.19.attn.proj.bias": "model-00001-of-00004.safetensors",
479
+ "visual.blocks.19.attn.proj.weight": "model-00001-of-00004.safetensors",
480
+ "visual.blocks.19.attn.qkv.bias": "model-00001-of-00004.safetensors",
481
+ "visual.blocks.19.attn.qkv.weight": "model-00001-of-00004.safetensors",
482
+ "visual.blocks.19.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
483
+ "visual.blocks.19.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
484
+ "visual.blocks.19.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
485
+ "visual.blocks.19.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
486
+ "visual.blocks.19.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
487
+ "visual.blocks.19.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
488
+ "visual.blocks.19.norm1.weight": "model-00001-of-00004.safetensors",
489
+ "visual.blocks.19.norm2.weight": "model-00001-of-00004.safetensors",
490
+ "visual.blocks.2.attn.proj.bias": "model-00001-of-00004.safetensors",
491
+ "visual.blocks.2.attn.proj.weight": "model-00001-of-00004.safetensors",
492
+ "visual.blocks.2.attn.qkv.bias": "model-00001-of-00004.safetensors",
493
+ "visual.blocks.2.attn.qkv.weight": "model-00001-of-00004.safetensors",
494
+ "visual.blocks.2.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
495
+ "visual.blocks.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
496
+ "visual.blocks.2.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
497
+ "visual.blocks.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
498
+ "visual.blocks.2.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
499
+ "visual.blocks.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
500
+ "visual.blocks.2.norm1.weight": "model-00001-of-00004.safetensors",
501
+ "visual.blocks.2.norm2.weight": "model-00001-of-00004.safetensors",
502
+ "visual.blocks.20.attn.proj.bias": "model-00001-of-00004.safetensors",
503
+ "visual.blocks.20.attn.proj.weight": "model-00001-of-00004.safetensors",
504
+ "visual.blocks.20.attn.qkv.bias": "model-00001-of-00004.safetensors",
505
+ "visual.blocks.20.attn.qkv.weight": "model-00001-of-00004.safetensors",
506
+ "visual.blocks.20.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
507
+ "visual.blocks.20.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
508
+ "visual.blocks.20.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
509
+ "visual.blocks.20.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
510
+ "visual.blocks.20.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
511
+ "visual.blocks.20.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
512
+ "visual.blocks.20.norm1.weight": "model-00001-of-00004.safetensors",
513
+ "visual.blocks.20.norm2.weight": "model-00001-of-00004.safetensors",
514
+ "visual.blocks.21.attn.proj.bias": "model-00001-of-00004.safetensors",
515
+ "visual.blocks.21.attn.proj.weight": "model-00001-of-00004.safetensors",
516
+ "visual.blocks.21.attn.qkv.bias": "model-00001-of-00004.safetensors",
517
+ "visual.blocks.21.attn.qkv.weight": "model-00001-of-00004.safetensors",
518
+ "visual.blocks.21.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
519
+ "visual.blocks.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
520
+ "visual.blocks.21.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
521
+ "visual.blocks.21.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
522
+ "visual.blocks.21.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
523
+ "visual.blocks.21.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
524
+ "visual.blocks.21.norm1.weight": "model-00001-of-00004.safetensors",
525
+ "visual.blocks.21.norm2.weight": "model-00001-of-00004.safetensors",
526
+ "visual.blocks.22.attn.proj.bias": "model-00001-of-00004.safetensors",
527
+ "visual.blocks.22.attn.proj.weight": "model-00001-of-00004.safetensors",
528
+ "visual.blocks.22.attn.qkv.bias": "model-00001-of-00004.safetensors",
529
+ "visual.blocks.22.attn.qkv.weight": "model-00001-of-00004.safetensors",
530
+ "visual.blocks.22.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
531
+ "visual.blocks.22.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
532
+ "visual.blocks.22.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
533
+ "visual.blocks.22.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
534
+ "visual.blocks.22.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
535
+ "visual.blocks.22.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
536
+ "visual.blocks.22.norm1.weight": "model-00001-of-00004.safetensors",
537
+ "visual.blocks.22.norm2.weight": "model-00001-of-00004.safetensors",
538
+ "visual.blocks.23.attn.proj.bias": "model-00001-of-00004.safetensors",
539
+ "visual.blocks.23.attn.proj.weight": "model-00001-of-00004.safetensors",
540
+ "visual.blocks.23.attn.qkv.bias": "model-00001-of-00004.safetensors",
541
+ "visual.blocks.23.attn.qkv.weight": "model-00001-of-00004.safetensors",
542
+ "visual.blocks.23.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
543
+ "visual.blocks.23.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
544
+ "visual.blocks.23.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
545
+ "visual.blocks.23.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
546
+ "visual.blocks.23.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
547
+ "visual.blocks.23.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
548
+ "visual.blocks.23.norm1.weight": "model-00001-of-00004.safetensors",
549
+ "visual.blocks.23.norm2.weight": "model-00001-of-00004.safetensors",
550
+ "visual.blocks.24.attn.proj.bias": "model-00001-of-00004.safetensors",
551
+ "visual.blocks.24.attn.proj.weight": "model-00001-of-00004.safetensors",
552
+ "visual.blocks.24.attn.qkv.bias": "model-00001-of-00004.safetensors",
553
+ "visual.blocks.24.attn.qkv.weight": "model-00001-of-00004.safetensors",
554
+ "visual.blocks.24.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
555
+ "visual.blocks.24.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
556
+ "visual.blocks.24.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
557
+ "visual.blocks.24.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
558
+ "visual.blocks.24.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
559
+ "visual.blocks.24.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
560
+ "visual.blocks.24.norm1.weight": "model-00001-of-00004.safetensors",
561
+ "visual.blocks.24.norm2.weight": "model-00001-of-00004.safetensors",
562
+ "visual.blocks.25.attn.proj.bias": "model-00001-of-00004.safetensors",
563
+ "visual.blocks.25.attn.proj.weight": "model-00001-of-00004.safetensors",
564
+ "visual.blocks.25.attn.qkv.bias": "model-00001-of-00004.safetensors",
565
+ "visual.blocks.25.attn.qkv.weight": "model-00001-of-00004.safetensors",
566
+ "visual.blocks.25.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
567
+ "visual.blocks.25.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
568
+ "visual.blocks.25.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
569
+ "visual.blocks.25.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
570
+ "visual.blocks.25.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
571
+ "visual.blocks.25.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
572
+ "visual.blocks.25.norm1.weight": "model-00001-of-00004.safetensors",
573
+ "visual.blocks.25.norm2.weight": "model-00001-of-00004.safetensors",
574
+ "visual.blocks.26.attn.proj.bias": "model-00001-of-00004.safetensors",
575
+ "visual.blocks.26.attn.proj.weight": "model-00001-of-00004.safetensors",
576
+ "visual.blocks.26.attn.qkv.bias": "model-00001-of-00004.safetensors",
577
+ "visual.blocks.26.attn.qkv.weight": "model-00001-of-00004.safetensors",
578
+ "visual.blocks.26.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
579
+ "visual.blocks.26.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
580
+ "visual.blocks.26.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
581
+ "visual.blocks.26.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
582
+ "visual.blocks.26.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
583
+ "visual.blocks.26.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
584
+ "visual.blocks.26.norm1.weight": "model-00001-of-00004.safetensors",
585
+ "visual.blocks.26.norm2.weight": "model-00001-of-00004.safetensors",
586
+ "visual.blocks.27.attn.proj.bias": "model-00001-of-00004.safetensors",
587
+ "visual.blocks.27.attn.proj.weight": "model-00001-of-00004.safetensors",
588
+ "visual.blocks.27.attn.qkv.bias": "model-00001-of-00004.safetensors",
589
+ "visual.blocks.27.attn.qkv.weight": "model-00001-of-00004.safetensors",
590
+ "visual.blocks.27.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
591
+ "visual.blocks.27.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
592
+ "visual.blocks.27.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
593
+ "visual.blocks.27.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
594
+ "visual.blocks.27.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
595
+ "visual.blocks.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
596
+ "visual.blocks.27.norm1.weight": "model-00001-of-00004.safetensors",
597
+ "visual.blocks.27.norm2.weight": "model-00001-of-00004.safetensors",
598
+ "visual.blocks.28.attn.proj.bias": "model-00001-of-00004.safetensors",
599
+ "visual.blocks.28.attn.proj.weight": "model-00001-of-00004.safetensors",
600
+ "visual.blocks.28.attn.qkv.bias": "model-00001-of-00004.safetensors",
601
+ "visual.blocks.28.attn.qkv.weight": "model-00001-of-00004.safetensors",
602
+ "visual.blocks.28.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
603
+ "visual.blocks.28.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
604
+ "visual.blocks.28.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
605
+ "visual.blocks.28.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
606
+ "visual.blocks.28.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
607
+ "visual.blocks.28.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
608
+ "visual.blocks.28.norm1.weight": "model-00001-of-00004.safetensors",
609
+ "visual.blocks.28.norm2.weight": "model-00001-of-00004.safetensors",
610
+ "visual.blocks.29.attn.proj.bias": "model-00001-of-00004.safetensors",
611
+ "visual.blocks.29.attn.proj.weight": "model-00001-of-00004.safetensors",
612
+ "visual.blocks.29.attn.qkv.bias": "model-00001-of-00004.safetensors",
613
+ "visual.blocks.29.attn.qkv.weight": "model-00001-of-00004.safetensors",
614
+ "visual.blocks.29.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
615
+ "visual.blocks.29.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
616
+ "visual.blocks.29.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
617
+ "visual.blocks.29.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
618
+ "visual.blocks.29.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
619
+ "visual.blocks.29.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
620
+ "visual.blocks.29.norm1.weight": "model-00001-of-00004.safetensors",
621
+ "visual.blocks.29.norm2.weight": "model-00001-of-00004.safetensors",
622
+ "visual.blocks.3.attn.proj.bias": "model-00001-of-00004.safetensors",
623
+ "visual.blocks.3.attn.proj.weight": "model-00001-of-00004.safetensors",
624
+ "visual.blocks.3.attn.qkv.bias": "model-00001-of-00004.safetensors",
625
+ "visual.blocks.3.attn.qkv.weight": "model-00001-of-00004.safetensors",
626
+ "visual.blocks.3.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
627
+ "visual.blocks.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
628
+ "visual.blocks.3.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
629
+ "visual.blocks.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
630
+ "visual.blocks.3.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
631
+ "visual.blocks.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
632
+ "visual.blocks.3.norm1.weight": "model-00001-of-00004.safetensors",
633
+ "visual.blocks.3.norm2.weight": "model-00001-of-00004.safetensors",
634
+ "visual.blocks.30.attn.proj.bias": "model-00001-of-00004.safetensors",
635
+ "visual.blocks.30.attn.proj.weight": "model-00001-of-00004.safetensors",
636
+ "visual.blocks.30.attn.qkv.bias": "model-00001-of-00004.safetensors",
637
+ "visual.blocks.30.attn.qkv.weight": "model-00001-of-00004.safetensors",
638
+ "visual.blocks.30.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
639
+ "visual.blocks.30.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
640
+ "visual.blocks.30.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
641
+ "visual.blocks.30.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
642
+ "visual.blocks.30.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
643
+ "visual.blocks.30.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
644
+ "visual.blocks.30.norm1.weight": "model-00001-of-00004.safetensors",
645
+ "visual.blocks.30.norm2.weight": "model-00001-of-00004.safetensors",
646
+ "visual.blocks.31.attn.proj.bias": "model-00001-of-00004.safetensors",
647
+ "visual.blocks.31.attn.proj.weight": "model-00001-of-00004.safetensors",
648
+ "visual.blocks.31.attn.qkv.bias": "model-00001-of-00004.safetensors",
649
+ "visual.blocks.31.attn.qkv.weight": "model-00001-of-00004.safetensors",
650
+ "visual.blocks.31.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
651
+ "visual.blocks.31.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
652
+ "visual.blocks.31.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
653
+ "visual.blocks.31.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
654
+ "visual.blocks.31.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
655
+ "visual.blocks.31.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
656
+ "visual.blocks.31.norm1.weight": "model-00001-of-00004.safetensors",
657
+ "visual.blocks.31.norm2.weight": "model-00001-of-00004.safetensors",
658
+ "visual.blocks.4.attn.proj.bias": "model-00001-of-00004.safetensors",
659
+ "visual.blocks.4.attn.proj.weight": "model-00001-of-00004.safetensors",
660
+ "visual.blocks.4.attn.qkv.bias": "model-00001-of-00004.safetensors",
661
+ "visual.blocks.4.attn.qkv.weight": "model-00001-of-00004.safetensors",
662
+ "visual.blocks.4.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
663
+ "visual.blocks.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
664
+ "visual.blocks.4.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
665
+ "visual.blocks.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
666
+ "visual.blocks.4.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
667
+ "visual.blocks.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
668
+ "visual.blocks.4.norm1.weight": "model-00001-of-00004.safetensors",
669
+ "visual.blocks.4.norm2.weight": "model-00001-of-00004.safetensors",
670
+ "visual.blocks.5.attn.proj.bias": "model-00001-of-00004.safetensors",
671
+ "visual.blocks.5.attn.proj.weight": "model-00001-of-00004.safetensors",
672
+ "visual.blocks.5.attn.qkv.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.5.attn.qkv.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.5.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.5.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.5.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.5.norm1.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.5.norm2.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.6.attn.proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.6.attn.proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.6.attn.qkv.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.6.attn.qkv.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.6.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.6.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.6.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.6.norm1.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.6.norm2.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.7.attn.proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.7.attn.proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.7.attn.qkv.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.7.attn.qkv.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.7.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.7.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.7.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.7.norm1.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.7.norm2.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.8.attn.proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.8.attn.proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.8.attn.qkv.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.8.attn.qkv.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.8.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.8.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.8.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.8.norm1.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.8.norm2.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.9.attn.proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.9.attn.proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.9.attn.qkv.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.9.attn.qkv.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.9.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.9.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.9.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.9.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+ "visual.blocks.9.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.9.norm1.weight": "model-00001-of-00004.safetensors",
+ "visual.blocks.9.norm2.weight": "model-00001-of-00004.safetensors",
+ "visual.merger.ln_q.weight": "model-00001-of-00004.safetensors",
+ "visual.merger.mlp.0.bias": "model-00001-of-00004.safetensors",
+ "visual.merger.mlp.0.weight": "model-00001-of-00004.safetensors",
+ "visual.merger.mlp.2.bias": "model-00001-of-00004.safetensors",
+ "visual.merger.mlp.2.weight": "model-00001-of-00004.safetensors",
+ "visual.patch_embed.proj.weight": "model-00001-of-00004.safetensors"
+ }
+ }
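The weight_map above places every remaining visual-tower tensor of this Qwen2.5-VL checkpoint in the first of the four safetensors shards. A minimal sketch of inspecting that index, assuming the checkpoint directory from this commit is available locally (the path is a placeholder); `from_pretrained` resolves the shards from this same index automatically.

```python
import json
from collections import Counter

# Placeholder local path to the checkpoint directory added in this commit.
CKPT = "ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228"

# model.safetensors.index.json maps each parameter name to the shard that stores it.
with open(f"{CKPT}/model.safetensors.index.json") as f:
    index = json.load(f)

shard_counts = Counter(index["weight_map"].values())
for shard, n_params in sorted(shard_counts.items()):
    print(f"{shard}: {n_params} tensors")

# Transformers reads the same index when loading, so the sharding stays transparent, e.g.:
# from transformers import Qwen2_5_VLForConditionalGeneration
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(CKPT, torch_dtype="auto")
```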
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "min_pixels": 3136,
+ "max_pixels": 12845056,
+ "patch_size": 14,
+ "temporal_patch_size": 2,
+ "merge_size": 2,
+ "image_mean": [
+ 0.48145466,
+ 0.4578275,
+ 0.40821073
+ ],
+ "image_std": [
+ 0.26862954,
+ 0.26130258,
+ 0.27577711
+ ],
+ "image_processor_type": "Qwen2VLImageProcessor",
+ "processor_class": "Qwen2_5_VLProcessor"
+ }
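min_pixels and max_pixels bound the dynamic-resolution resizing of the Qwen2VL image processor: each image is rescaled so its area stays between 3,136 and 12,845,056 pixels while remaining aligned to the 14-pixel patch grid (merged 2x2 before the language model). A sketch of exercising that config, assuming a transformers release with Qwen2.5-VL support; the checkpoint path and image are placeholders.

```python
from PIL import Image
from transformers import AutoProcessor

CKPT = "ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228"  # placeholder local path

# The processor picks up min_pixels/max_pixels from preprocessor_config.json;
# they can also be overridden explicitly, as done here.
processor = AutoProcessor.from_pretrained(CKPT, min_pixels=3136, max_pixels=12845056)

image = Image.open("example.jpg")  # placeholder image
out = processor.image_processor(images=[image], return_tensors="pt")
print(out["pixel_values"].shape)  # flattened 14x14 patches after dynamic resizing
print(out["image_grid_thw"])      # (t, h, w) patch grid per image
```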
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:08c830b56eaefdd9b372fae4488e1ba93148ecf58aa2ed05c8103ab7afe964ee
+ size 15365
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b6f3949e2b25ad05905da16bc2d58aeadc9a9f1db0b0e97a04ceb63467acff1d
+ size 15429
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5c3812c5e7f13c9d32f97d83cd625428061b1c9486e869caffd453d83ac07ceb
+ size 15429
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6eda756b33768639c1e1932ae00c5d43269de5705c09162bc09acdf71d3533c0
+ size 15429
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:668792bf09fbb49f9c314d93f695654b11742e9f79ae00886d88610037822211
+ size 1465
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "eos_token": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
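This map pins <|im_end|> as the EOS token and <|endoftext|> as the PAD token. A quick sanity check, assuming the tokenizer files from this checkpoint are available at the placeholder path used above:

```python
from transformers import AutoTokenizer

CKPT = "ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228"  # placeholder local path
tok = AutoTokenizer.from_pretrained(CKPT)

# special_tokens_map.json pins <|im_end|> as EOS and <|endoftext|> as PAD.
assert tok.eos_token == "<|im_end|>"
assert tok.pad_token == "<|endoftext|>"
print(tok.eos_token_id, tok.pad_token_id)  # expected 151645 and 151643 per tokenizer_config.json
```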
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+ size 11421896
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
+ {
+ "add_bos_token": false,
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "151643": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151644": {
+ "content": "<|im_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151645": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151646": {
+ "content": "<|object_ref_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151647": {
+ "content": "<|object_ref_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151648": {
+ "content": "<|box_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151649": {
+ "content": "<|box_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151650": {
+ "content": "<|quad_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151651": {
+ "content": "<|quad_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151652": {
+ "content": "<|vision_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151653": {
+ "content": "<|vision_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151654": {
+ "content": "<|vision_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151655": {
+ "content": "<|image_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151656": {
+ "content": "<|video_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151657": {
+ "content": "<tool_call>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151658": {
+ "content": "</tool_call>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151659": {
+ "content": "<|fim_prefix|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151660": {
+ "content": "<|fim_middle|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151661": {
+ "content": "<|fim_suffix|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151662": {
+ "content": "<|fim_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151663": {
+ "content": "<|repo_name|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151664": {
+ "content": "<|file_sep|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ }
+ },
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|im_end|>",
+ "errors": "replace",
+ "extra_special_tokens": {},
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "processor_class": "Qwen2_5_VLProcessor",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+ }
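added_tokens_decoder enumerates the Qwen2 control and vision tokens (ids 151643 through 151664); only the chat and vision markers are flagged as special, while the tool-call and FIM tokens remain ordinary vocabulary entries. A sketch, under the same placeholder-path assumption as above, confirming that the vision markers stay atomic during encoding:

```python
from transformers import AutoTokenizer

CKPT = "ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228"  # placeholder local path
tok = AutoTokenizer.from_pretrained(CKPT)

# Added special tokens are never split by the BPE merges, so each marker encodes to one id.
for marker in ["<|vision_start|>", "<|image_pad|>", "<|vision_end|>"]:
    ids = tok.encode(marker, add_special_tokens=False)
    print(marker, ids)  # expected: a single id in the 151652-151655 range

print(tok.model_max_length)  # 131072, as declared in tokenizer_config.json
```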
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/trainer_state.json ADDED
@@ -0,0 +1,429 @@
+ {
+ "best_global_step": 152,
+ "best_metric": 0.75577664,
+ "best_model_checkpoint": "/mnt/data/users/liamding/data/MMMT/lora/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10/v0-20250918-203744/checkpoint-152",
+ "epoch": 3.0,
+ "eval_steps": 500,
+ "global_step": 228,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.013157894736842105,
+ "grad_norm": 12.42494369838121,
+ "learning_rate": 2.6315789473684208e-08,
+ "loss": 1.5432628393173218,
+ "step": 1,
+ "token_acc": 0.6487418831168831
+ },
+ {
+ "epoch": 0.06578947368421052,
+ "grad_norm": 12.47365246922611,
+ "learning_rate": 1.3157894736842104e-07,
+ "loss": 1.5321896076202393,
+ "step": 5,
+ "token_acc": 0.6431663266829195
+ },
+ {
+ "epoch": 0.13157894736842105,
+ "grad_norm": 12.047748548567757,
+ "learning_rate": 2.631578947368421e-07,
+ "loss": 1.506991195678711,
+ "step": 10,
+ "token_acc": 0.6522582993004731
+ },
+ {
+ "epoch": 0.19736842105263158,
+ "grad_norm": 12.014778540391298,
+ "learning_rate": 3.9473684210526315e-07,
+ "loss": 1.4956873893737792,
+ "step": 15,
+ "token_acc": 0.6490658614832987
+ },
+ {
+ "epoch": 0.2631578947368421,
+ "grad_norm": 10.195113538835848,
+ "learning_rate": 5.263157894736842e-07,
+ "loss": 1.4674793243408204,
+ "step": 20,
+ "token_acc": 0.6513630945514669
+ },
+ {
+ "epoch": 0.32894736842105265,
+ "grad_norm": 10.285205984290517,
+ "learning_rate": 6.578947368421053e-07,
+ "loss": 1.3990056037902832,
+ "step": 25,
+ "token_acc": 0.6637606498910243
+ },
+ {
+ "epoch": 0.39473684210526316,
+ "grad_norm": 7.536123061987423,
+ "learning_rate": 7.894736842105263e-07,
+ "loss": 1.2762855529785155,
+ "step": 30,
+ "token_acc": 0.6822537218878683
+ },
+ {
+ "epoch": 0.4605263157894737,
+ "grad_norm": 6.843506846280123,
+ "learning_rate": 9.210526315789473e-07,
+ "loss": 1.1991205215454102,
+ "step": 35,
+ "token_acc": 0.6923714759535655
+ },
+ {
+ "epoch": 0.5263157894736842,
+ "grad_norm": 5.130992585179559,
+ "learning_rate": 1.0526315789473683e-06,
+ "loss": 1.0460807800292968,
+ "step": 40,
+ "token_acc": 0.72837884815945
+ },
+ {
+ "epoch": 0.5921052631578947,
+ "grad_norm": 3.9730546867292427,
+ "learning_rate": 1.1842105263157892e-06,
+ "loss": 0.9720072746276855,
+ "step": 45,
+ "token_acc": 0.741999770053271
+ },
+ {
+ "epoch": 0.6578947368421053,
+ "grad_norm": 4.1751054772178895,
+ "learning_rate": 1.3157894736842106e-06,
+ "loss": 0.8988618850708008,
+ "step": 50,
+ "token_acc": 0.7569205505161088
+ },
+ {
+ "epoch": 0.7236842105263158,
+ "grad_norm": 3.5074503427399657,
+ "learning_rate": 1.4473684210526317e-06,
+ "loss": 0.8683221817016602,
+ "step": 55,
+ "token_acc": 0.7598684210526315
+ },
+ {
+ "epoch": 0.7894736842105263,
+ "grad_norm": 3.402758348361785,
+ "learning_rate": 1.5789473684210526e-06,
+ "loss": 0.839161491394043,
+ "step": 60,
+ "token_acc": 0.7665720617394347
+ },
+ {
+ "epoch": 0.8552631578947368,
+ "grad_norm": 3.3488105431372595,
+ "learning_rate": 1.7105263157894735e-06,
+ "loss": 0.8634527206420899,
+ "step": 65,
+ "token_acc": 0.7606793413411821
+ },
+ {
+ "epoch": 0.9210526315789473,
+ "grad_norm": 3.365459865949832,
+ "learning_rate": 1.8421052631578946e-06,
+ "loss": 0.8623040199279786,
+ "step": 70,
+ "token_acc": 0.7610415546508948
+ },
+ {
+ "epoch": 0.9868421052631579,
+ "grad_norm": 3.228725710683044,
+ "learning_rate": 1.9736842105263157e-06,
+ "loss": 0.8207452774047852,
+ "step": 75,
+ "token_acc": 0.7709683833764168
+ },
+ {
+ "epoch": 1.0,
+ "eval_loss": 0.7950107455253601,
+ "eval_runtime": 11.1516,
+ "eval_samples_per_second": 12.106,
+ "eval_steps_per_second": 1.524,
+ "eval_token_acc": 0.7745666068883065,
+ "step": 76
+ },
+ {
+ "epoch": 1.0526315789473684,
+ "grad_norm": 3.061781454836411,
+ "learning_rate": 1.9998312416333223e-06,
+ "loss": 0.7909780979156494,
+ "step": 80,
+ "token_acc": 0.7769812355545109
+ },
+ {
+ "epoch": 1.118421052631579,
+ "grad_norm": 2.9959739226060877,
+ "learning_rate": 1.999145758387301e-06,
+ "loss": 0.8172050476074219,
+ "step": 85,
+ "token_acc": 0.7715258754719775
+ },
+ {
+ "epoch": 1.1842105263157894,
+ "grad_norm": 3.093664256799247,
+ "learning_rate": 1.9979333640833945e-06,
+ "loss": 0.7659732818603515,
+ "step": 90,
+ "token_acc": 0.7813097981020815
+ },
+ {
+ "epoch": 1.25,
+ "grad_norm": 2.9007288102242987,
+ "learning_rate": 1.9961946980917456e-06,
+ "loss": 0.7576296329498291,
+ "step": 95,
+ "token_acc": 0.7848722591027962
+ },
+ {
+ "epoch": 1.3157894736842106,
+ "grad_norm": 3.08974622770043,
+ "learning_rate": 1.9939306773179494e-06,
+ "loss": 0.7871574878692627,
+ "step": 100,
+ "token_acc": 0.7752208275220828
+ },
+ {
+ "epoch": 1.381578947368421,
+ "grad_norm": 2.9410249033476967,
+ "learning_rate": 1.9911424957195155e-06,
+ "loss": 0.7703839778900147,
+ "step": 105,
+ "token_acc": 0.7800666232721982
+ },
+ {
+ "epoch": 1.4473684210526316,
+ "grad_norm": 3.412089039840561,
+ "learning_rate": 1.9878316236762193e-06,
+ "loss": 0.7863829135894775,
+ "step": 110,
+ "token_acc": 0.7759826005903371
+ },
+ {
+ "epoch": 1.513157894736842,
+ "grad_norm": 3.185001921714578,
+ "learning_rate": 1.98399980721468e-06,
+ "loss": 0.7711540222167969,
+ "step": 115,
+ "token_acc": 0.7787817668414684
+ },
+ {
+ "epoch": 1.5789473684210527,
+ "grad_norm": 3.0212490269279906,
+ "learning_rate": 1.9796490670875738e-06,
+ "loss": 0.7722112655639648,
+ "step": 120,
+ "token_acc": 0.7774650089921026
+ },
+ {
+ "epoch": 1.6447368421052633,
+ "grad_norm": 3.097474614016035,
+ "learning_rate": 1.9747816977079668e-06,
+ "loss": 0.7530896186828613,
+ "step": 125,
+ "token_acc": 0.7835121176003178
+ },
+ {
+ "epoch": 1.7105263157894737,
+ "grad_norm": 2.9781695556647994,
+ "learning_rate": 1.9694002659393305e-06,
+ "loss": 0.7732268333435058,
+ "step": 130,
+ "token_acc": 0.7787148117353605
+ },
+ {
+ "epoch": 1.776315789473684,
+ "grad_norm": 3.06954358369665,
+ "learning_rate": 1.9635076097418733e-06,
+ "loss": 0.7715581893920899,
+ "step": 135,
+ "token_acc": 0.7787210627677553
+ },
+ {
+ "epoch": 1.8421052631578947,
+ "grad_norm": 2.9310526512559254,
+ "learning_rate": 1.957106836675914e-06,
+ "loss": 0.7432127475738526,
+ "step": 140,
+ "token_acc": 0.7850694030147556
+ },
+ {
+ "epoch": 1.9078947368421053,
+ "grad_norm": 2.9957118106295924,
+ "learning_rate": 1.950201322263071e-06,
+ "loss": 0.764984130859375,
+ "step": 145,
+ "token_acc": 0.7813440713084953
+ },
+ {
+ "epoch": 1.973684210526316,
+ "grad_norm": 2.9264049027139363,
+ "learning_rate": 1.942794708206143e-06,
+ "loss": 0.7624619483947754,
+ "step": 150,
+ "token_acc": 0.7813595519200548
+ },
+ {
+ "epoch": 2.0,
+ "eval_loss": 0.7557766437530518,
+ "eval_runtime": 11.4219,
+ "eval_samples_per_second": 11.819,
+ "eval_steps_per_second": 1.488,
+ "eval_token_acc": 0.7825907021658159,
+ "step": 152
+ },
+ {
+ "epoch": 2.039473684210526,
+ "grad_norm": 2.81677121987903,
+ "learning_rate": 1.934890900468615e-06,
+ "loss": 0.7107283115386963,
+ "step": 155,
+ "token_acc": 0.7902097902097902
+ },
+ {
+ "epoch": 2.1052631578947367,
+ "grad_norm": 3.110214332868517,
+ "learning_rate": 1.9264940672148015e-06,
+ "loss": 0.7049328804016113,
+ "step": 160,
+ "token_acc": 0.792085744242942
+ },
+ {
+ "epoch": 2.1710526315789473,
+ "grad_norm": 3.1170800221770736,
+ "learning_rate": 1.917608636611721e-06,
+ "loss": 0.6538917064666748,
+ "step": 165,
+ "token_acc": 0.8053540656434096
+ },
+ {
+ "epoch": 2.236842105263158,
+ "grad_norm": 2.9791586864999764,
+ "learning_rate": 1.9082392944938463e-06,
+ "loss": 0.662127685546875,
+ "step": 170,
+ "token_acc": 0.803340308455398
+ },
+ {
+ "epoch": 2.3026315789473686,
+ "grad_norm": 3.0436404891376267,
+ "learning_rate": 1.8983909818919787e-06,
+ "loss": 0.6792965888977051,
+ "step": 175,
+ "token_acc": 0.7981443134756292
+ },
+ {
+ "epoch": 2.3684210526315788,
+ "grad_norm": 3.0287782620036734,
+ "learning_rate": 1.8880688924275375e-06,
+ "loss": 0.6498573303222657,
+ "step": 180,
+ "token_acc": 0.8074076974276653
+ },
+ {
+ "epoch": 2.4342105263157894,
+ "grad_norm": 3.056401729041573,
+ "learning_rate": 1.8772784695736428e-06,
+ "loss": 0.6739880561828613,
+ "step": 185,
+ "token_acc": 0.7976903170986057
+ },
+ {
+ "epoch": 2.5,
+ "grad_norm": 2.9244106898705375,
+ "learning_rate": 1.8660254037844386e-06,
+ "loss": 0.6584476470947266,
+ "step": 190,
+ "token_acc": 0.8041385948026949
+ },
+ {
+ "epoch": 2.5657894736842106,
+ "grad_norm": 3.0726933163215957,
+ "learning_rate": 1.8543156294941648e-06,
+ "loss": 0.6841012001037597,
+ "step": 195,
+ "token_acc": 0.7971685375920997
+ },
+ {
+ "epoch": 2.6315789473684212,
+ "grad_norm": 2.8543141156332386,
+ "learning_rate": 1.8421553219875656e-06,
+ "loss": 0.6722308158874511,
+ "step": 200,
+ "token_acc": 0.7996123279705369
+ },
+ {
+ "epoch": 2.6973684210526314,
+ "grad_norm": 2.986532459756899,
+ "learning_rate": 1.8295508941432813e-06,
+ "loss": 0.6710346698760986,
+ "step": 205,
+ "token_acc": 0.7982886525101072
+ },
+ {
+ "epoch": 2.763157894736842,
+ "grad_norm": 3.0096419377618724,
+ "learning_rate": 1.8165089930519428e-06,
+ "loss": 0.6560275554656982,
+ "step": 210,
+ "token_acc": 0.8003802281368821
+ },
+ {
+ "epoch": 2.8289473684210527,
+ "grad_norm": 3.0712726957788483,
+ "learning_rate": 1.8030364965107515e-06,
+ "loss": 0.6660303115844727,
+ "step": 215,
+ "token_acc": 0.8012015503875969
+ },
+ {
+ "epoch": 2.8947368421052633,
+ "grad_norm": 3.069154754414468,
+ "learning_rate": 1.7891405093963937e-06,
+ "loss": 0.6556559562683105,
+ "step": 220,
+ "token_acc": 0.8036698692707513
+ },
+ {
+ "epoch": 2.9605263157894735,
+ "grad_norm": 3.055385524570652,
+ "learning_rate": 1.7748283599182013e-06,
+ "loss": 0.6615045547485352,
+ "step": 225,
+ "token_acc": 0.8035366518657007
+ },
+ {
+ "epoch": 3.0,
+ "eval_loss": 0.7560063004493713,
+ "eval_runtime": 11.7729,
+ "eval_samples_per_second": 11.467,
+ "eval_steps_per_second": 1.444,
+ "eval_token_acc": 0.7828206189359452,
+ "step": 228
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 760,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 35470891442176.0,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+ }
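trainer_state.json carries the per-step training log and the end-of-epoch evaluations; the best checkpoint so far is step 152 (eval_loss about 0.756), and epoch 3 shows essentially no further eval improvement. A small sketch for pulling the eval curve out of the log, again assuming the placeholder checkpoint path from the earlier examples:

```python
import json

CKPT = "ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228"  # placeholder local path
with open(f"{CKPT}/trainer_state.json") as f:
    state = json.load(f)

# Evaluation records are the log entries that carry an eval_loss field.
evals = [e for e in state["log_history"] if "eval_loss" in e]
for e in evals:
    print(f"epoch {e['epoch']:.0f}  step {e['step']:>3}  "
          f"eval_loss {e['eval_loss']:.4f}  eval_token_acc {e['eval_token_acc']:.4f}")

print("best checkpoint:", state["best_global_step"], "metric:", state["best_metric"])
```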
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70736c4ef9635bf51275a4c7d8823d8ba312cfa86b8f1c6b11ad543560f279aa
3
+ size 9105
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/video_preprocessor_config.json ADDED
@@ -0,0 +1,43 @@
+ {
+ "crop_size": null,
+ "data_format": "channels_first",
+ "default_to_square": true,
+ "device": null,
+ "do_center_crop": null,
+ "do_convert_rgb": true,
+ "do_normalize": true,
+ "do_pad": null,
+ "do_rescale": true,
+ "do_resize": true,
+ "do_sample_frames": false,
+ "fps": null,
+ "image_mean": [
+ 0.48145466,
+ 0.4578275,
+ 0.40821073
+ ],
+ "image_std": [
+ 0.26862954,
+ 0.26130258,
+ 0.27577711
+ ],
+ "input_data_format": null,
+ "max_frames": 768,
+ "max_pixels": 12845056,
+ "merge_size": 2,
+ "min_frames": 4,
+ "min_pixels": 3136,
+ "num_frames": null,
+ "patch_size": 14,
+ "processor_class": "Qwen2_5_VLProcessor",
+ "resample": 3,
+ "rescale_factor": 0.00392156862745098,
+ "size": {
+ "longest_edge": 12845056,
+ "shortest_edge": 3136
+ },
+ "size_divisor": null,
+ "temporal_patch_size": 2,
+ "video_metadata": null,
+ "video_processor_type": "Qwen2VLVideoProcessor"
+ }
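The video preprocessor shares the normalization statistics and patch geometry with the image processor and additionally bounds frame sampling (4 to 768 frames, temporal patch size 2). A rough sketch of exercising it, assuming a transformers version that ships the Qwen2VLVideoProcessor named in this config and accepts the `videos` argument shown below; the checkpoint path and the synthetic clip are placeholders.

```python
import numpy as np
from transformers import AutoProcessor

CKPT = "ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228"  # placeholder local path
processor = AutoProcessor.from_pretrained(CKPT)

# A synthetic 8-frame RGB clip at 224x224, standing in for real decoded video frames.
video = np.random.randint(0, 255, (8, 224, 224, 3), dtype=np.uint8)

prompt = "<|vision_start|><|video_pad|><|vision_end|>Summarize the clip."
inputs = processor(text=[prompt], videos=[video], return_tensors="pt")
print(inputs["pixel_values_videos"].shape)  # flattened spatio-temporal patches
print(inputs["video_grid_thw"])             # (t, h, w) patch grid per clip
```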
ood/qwen2.5vl-7b-thinking_full_v3_ood_wd001_e10-checkpoint-228/vocab.json ADDED
The diff for this file is too large to render. See raw diff