yosubshin committed on Dec 4, 2025

Commit

cb6da0f

verified ·

1 Parent(s): 37300c7

Upload folder using huggingface_hub

Browse files

Files changed (27) hide show

.gitattributes +1 -0
added_tokens.json +24 -0
chat_template.jinja +1 -0
config.json +132 -0
generation_config.json +11 -0
logs/rank_0000.log +46 -0
merges.txt +0 -0
model-00001-of-00004.safetensors +3 -0
model-00002-of-00004.safetensors +3 -0
model-00003-of-00004.safetensors +3 -0
model-00004-of-00004.safetensors +3 -0
model.safetensors.index.json +737 -0
preprocessor_config.json +39 -0
runs/Dec04_22-09-12_6165b55eed88/events.out.tfevents.1764886209.6165b55eed88.4158.0 +3 -0
special_tokens_map.json +31 -0
telemetry/devices_info.txt +2 -0
telemetry/telemetry_callback_metrics_rank0000.json +10 -0
telemetry/telemetry_callback_rank0000.json +39 -0
telemetry/telemetry_callback_wandb_rank0000.json +5 -0
telemetry/training_config.yaml +245 -0
telemetry/world_size.json +4 -0
tokenizer.json +3 -0
tokenizer_config.json +209 -0
trainer_state.json +1345 -0
training_args.bin +3 -0
video_preprocessor_config.json +43 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1 @@

+ {# Chat template: wraps each message as <|im_start|>ROLE ... <|im_end|>, prepends a default system turn when the first message is not a system message, and replaces image/video content items with vision placeholder tokens. When add_vision_id is set, each placeholder is prefixed with a numbered "Picture N: " / "Video N: " label; the label is built with the ~ operator because the counters are integers and + would fail on str + int. #}{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{%- for message in messages -%}{%- if loop.first and message['role'] != 'system' -%}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{%- endif -%}{{ '<|im_start|>' + message['role'] + '\n' }}{%- if message['content'] is string -%}{{- message['content'] -}}{%- elif message['content'] is iterable -%}{%- for item in message['content'] -%}{%- if item['type'].startswith('image') -%}{%- set image_count.value = image_count.value + 1 -%}{%- if add_vision_id -%}{{ 'Picture ' ~ image_count.value ~ ': ' }}{%- endif -%}{{ '<|vision_start|><|image_pad|><|vision_end|>' }}{%- elif item['type'].startswith('video') -%}{%- set video_count.value = video_count.value + 1 -%}{%- if add_vision_id -%}{{ 'Video ' ~ video_count.value ~ ': ' }}{%- endif -%}{{ '<|vision_start|><|video_pad|><|vision_end|>' }}{%- elif item['type']=='text' -%}{{- item['text'] if 'text' in item else item['content'] -}}{%- endif -%}{%- endfor -%}{%- endif -%}{{ '<|im_end|>\n' }}{%- endfor -%}{%- if add_generation_prompt -%}{{- '<|im_start|>assistant\n' -}}{%- endif -%}

config.json ADDED Viewed

	@@ -0,0 +1,132 @@

+{
+  "architectures": [
+    "Qwen2_5_VLForConditionalGeneration"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "image_token_id": 151655,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "max_position_embeddings": 128000,
+  "max_window_layers": 28,
+  "model_type": "qwen2_5_vl",
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": {
+    "mrope_section": [
+      16,
+      24,
+      24
+    ],
+    "rope_type": "default",
+    "type": "default"
+  },
+  "rope_theta": 1000000.0,
+  "sliding_window": 32768,
+  "text_config": {
+    "_name_or_path": "Qwen/Qwen2.5-VL-7B-Instruct",
+    "architectures": [
+      "Qwen2_5_VLForConditionalGeneration"
+    ],
+    "attention_dropout": 0.0,
+    "dtype": "bfloat16",
+    "eos_token_id": 151645,
+    "hidden_act": "silu",
+    "hidden_size": 3584,
+    "initializer_range": 0.02,
+    "intermediate_size": 18944,
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 128000,
+    "max_window_layers": 28,
+    "model_type": "qwen2_5_vl_text",
+    "num_attention_heads": 28,
+    "num_hidden_layers": 28,
+    "num_key_value_heads": 4,
+    "pad_token_id": 151643,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": {
+      "mrope_section": [
+        16,
+        24,
+        24
+      ],
+      "rope_type": "default",
+      "type": "default"
+    },
+    "rope_theta": 1000000.0,
+    "sliding_window": null,
+    "use_cache": false,
+    "use_sliding_window": false,
+    "vision_token_id": 151654,
+    "vocab_size": 152064
+  },
+  "tie_word_embeddings": false,
+  "transformers_version": "4.57.2",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "video_token_id": 151656,
+  "vision_config": {
+    "depth": 32,
+    "dtype": "bfloat16",
+    "fullatt_block_indexes": [
+      7,
+      15,
+      23,
+      31
+    ],
+    "hidden_act": "silu",
+    "hidden_size": 1280,
+    "in_channels": 3,
+    "in_chans": 3,
+    "initializer_range": 0.02,
+    "intermediate_size": 3420,
+    "model_type": "qwen2_5_vl",
+    "num_heads": 16,
+    "out_hidden_size": 3584,
+    "patch_size": 14,
+    "spatial_merge_size": 2,
+    "spatial_patch_size": 14,
+    "temporal_patch_size": 2,
+    "tokens_per_second": 2,
+    "window_size": 112
+  },
+  "vision_end_token_id": 151653,
+  "vision_start_token_id": 151652,
+  "vision_token_id": 151654,
+  "vocab_size": 152064
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.05,
+  "temperature": 1e-06,
+  "transformers_version": "4.57.2"
+}

logs/rank_0000.log ADDED Viewed

	@@ -0,0 +1,46 @@

+[2025-12-04 22:08:17,625][oumi][rank0][pid:4158][MainThread][INFO]][train.py:318] Training config saved to /contents/qwen2_5_vl_7b_walton_random_500_1/telemetry/training_config.yaml
+[2025-12-04 22:08:17,791][oumi][rank0][pid:4158][MainThread][INFO]][models.py:469] Setting tokenizer to use the 'right' padding side for model 'Qwen/Qwen2.5-VL-7B-Instruct'. The 'right' padding side is configured as the default value for this model type.
+[2025-12-04 22:08:22,444][oumi][rank0][pid:4158][MainThread][INFO]][train.py:348] Set `training.trainer_kwargs.remove_unused_columns=False` for VLM training with TRL_SFT trainer.
+[2025-12-04 22:08:22,560][oumi][rank0][pid:4158][MainThread][WARNING]][data.py:66] Using torchdata preprocessing pipeline. This is currently in beta and may not be stable.
+[2025-12-04 22:08:22,562][oumi][rank0][pid:4158][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: HuggingFaceVisionDataset)... dataset_name: 'yosubshin/WaltonMultimodalColdStart-random-500-1'
+[2025-12-04 22:08:28,897][oumi][rank0][pid:4158][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
+	Split: train
+	Version: 0.0.0
+	Dataset size: 9237866
+	Download size: 7853429
+	Size: 17091295 bytes
+	Rows: 500
+	Columns: ['image', 'problem', 'solution']
+[2025-12-04 22:08:29,449][oumi][rank0][pid:4158][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (500, 3). Columns:
+image       object
+problem     object
+solution    object
+dtype: object
+[2025-12-04 22:08:32,293][oumi][rank0][pid:4158][MainThread][INFO]][train.py:449] Set Accelerate environment variables for FSDP: {'ACCELERATE_DYNAMO_BACKEND': 'NO', 'ACCELERATE_DYNAMO_MODE': 'default', 'ACCELERATE_DYNAMO_USE_FULLGRAPH': 'False', 'ACCELERATE_DYNAMO_USE_DYNAMIC': 'False', 'FSDP_CPU_RAM_EFFICIENT_LOADING': 'true', 'FSDP_USE_ORIG_PARAMS': 'true', 'ACCELERATE_USE_FSDP': 'true', 'FSDP_SHARDING_STRATEGY': 'HYBRID_SHARD', 'FSDP_OFFLOAD_PARAMS': 'false', 'ACCELERATE_MIXED_PRECISION': 'bf16', 'FSDP_BACKWARD_PREFETCH': 'BACKWARD_PRE', 'FSDP_FORWARD_PREFETCH': 'true', 'FSDP_STATE_DICT_TYPE': 'FULL_STATE_DICT', 'FSDP_AUTO_WRAP_POLICY': 'SIZE_BASED_WRAP', 'FSDP_MIN_NUM_PARAMS': '100000', 'FSDP_SYNC_MODULE_STATES': 'true', 'FSDP_ACTIVATION_CHECKPOINTING': 'true'}
+[2025-12-04 22:08:32,296][oumi][rank0][pid:4158][MainThread][INFO]][models.py:251] Accelerate FSDP run detected! Setting device_map to None.
+[2025-12-04 22:08:32,297][oumi][rank0][pid:4158][MainThread][INFO]][models.py:260] Building model using device_map: None (DeviceRankInfo(world_size=1, rank=0, local_world_size=1, local_rank=0))...
+[2025-12-04 22:08:32,412][oumi][rank0][pid:4158][MainThread][INFO]][models.py:336] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForVision2Seq'> to instantiate model.
+[2025-12-04 22:09:12,370][oumi][rank0][pid:4158][MainThread][INFO]][torch_utils.py:288]
+Model Parameters Summary:
+🔢 Total     parameters: 8,292,166,656
+🔗 Embedding parameters: 544,997,376
+🎯 Trainable parameters: 8,292,166,656
+🔒 Frozen    parameters: 0 (0.00%)
+[2025-12-04 22:09:12,951][oumi][rank0][pid:4158][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!
+[2025-12-04 22:09:12,953][oumi][rank0][pid:4158][MainThread][WARNING]][callbacks.py:72] MFU logging requires packed datasets. Skipping MFU callbacks.
+[2025-12-04 22:09:18,713][oumi][rank0][pid:4158][MainThread][INFO]][device_utils.py:343] GPU Metrics Before Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=1, used_memory_mb=17186.0, temperature=30, fan_speed=None, fan_speeds=None, power_usage_watts=61.819, power_limit_watts=400.0, gpu_utilization=0, memory_utilization=0, performance_state=0, clock_speed_graphics=1155, clock_speed_sm=1155, clock_speed_memory=1593).
+[2025-12-04 22:09:18,716][oumi][rank0][pid:4158][MainThread][INFO]][train.py:558] Training init time: 61.151s
+[2025-12-04 22:09:18,717][oumi][rank0][pid:4158][MainThread][INFO]][train.py:559] Starting training... (TrainerType.TRL_SFT, transformers: 4.57.2)
+[2025-12-04 22:14:27,502][oumi][rank0][pid:4158][MainThread][INFO]][device_utils.py:343] On epoch end: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=1, used_memory_mb=69996.0, temperature=56, fan_speed=None, fan_speeds=None, power_usage_watts=382.858, power_limit_watts=400.0, gpu_utilization=89, memory_utilization=46, performance_state=0, clock_speed_graphics=1410, clock_speed_sm=1410, clock_speed_memory=1593).
+[2025-12-04 22:14:27,515][oumi][rank0][pid:4158][MainThread][INFO]][telemetry_callback.py:242] Saving telemetry callback summary to /contents/qwen2_5_vl_7b_walton_random_500_1/telemetry/telemetry_callback_rank0000.json...
+[2025-12-04 22:14:27,583][oumi][rank0][pid:4158][MainThread][INFO]][train.py:566] Training is Complete.
+[2025-12-04 22:14:27,589][oumi][rank0][pid:4158][MainThread][INFO]][device_utils.py:343] GPU Metrics After Training: GPU runtime info: NVidiaGpuRuntimeInfo(device_index=0, device_count=1, used_memory_mb=69996.0, temperature=51, fan_speed=None, fan_speeds=None, power_usage_watts=93.419, power_limit_watts=400.0, gpu_utilization=89, memory_utilization=46, performance_state=0, clock_speed_graphics=1410, clock_speed_sm=1410, clock_speed_memory=1593).
+[2025-12-04 22:14:27,592][oumi][rank0][pid:4158][MainThread][INFO]][torch_utils.py:135] Peak GPU memory usage: 67.98 GB
+[2025-12-04 22:14:27,593][oumi][rank0][pid:4158][MainThread][INFO]][train.py:573] Saving final state...
+[2025-12-04 22:14:27,598][oumi][rank0][pid:4158][MainThread][INFO]][train.py:578] Saving final model...
+[2025-12-04 22:15:23,013][oumi][rank0][pid:4158][MainThread][INFO]][hf_trainer.py:127] Model has been saved at /contents/qwen2_5_vl_7b_walton_random_500_1
+[2025-12-04 22:15:23,859][oumi][rank0][pid:4158][MainThread][INFO]][hf_trainer.py:131] Processor config has been saved at /contents/qwen2_5_vl_7b_walton_random_500_1
+[2025-12-04 22:15:23,862][oumi][rank0][pid:4158][MainThread][INFO]][train.py:230]
+» We're always looking for feedback. What's one thing we can improve? https://oumi.ai/feedback

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model-00001-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8b8d48a125502ada01ede6418f87d27eeb9dfcabbaf0d71db7418faacf34e270
+size 4968243304

model-00002-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dfe6018e0ae4e22a3801e9a4ba290f1d213f9a8671f4092516f1472451878669
+size 4991495816

model-00003-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d06ef5705ac08133834de476673eec34c461e0e7ff77642cae7055e1952d1b9f
+size 4932751040

model-00004-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b01d30f5c729f9ac0e57a905b66d5afac19a52a37978880ee9e6a472ed0862d1
+size 1691924384

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,737 @@

+{
+  "metadata": {
+    "total_parameters": 8292166656,
+    "total_size": 16584333312
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00004-of-00004.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.norm.weight": "model-00004-of-00004.safetensors",
+    "visual.blocks.0.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.merger.ln_q.weight": "model-00001-of-00004.safetensors",
+    "visual.merger.mlp.0.bias": "model-00001-of-00004.safetensors",
+    "visual.merger.mlp.0.weight": "model-00001-of-00004.safetensors",
+    "visual.merger.mlp.2.bias": "model-00001-of-00004.safetensors",
+    "visual.merger.mlp.2.weight": "model-00001-of-00004.safetensors",
+    "visual.patch_embed.proj.weight": "model-00001-of-00004.safetensors"
+  }
+}

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "crop_size": null,
+  "data_format": "channels_first",
+  "default_to_square": true,
+  "device": null,
+  "disable_grouping": null,
+  "do_center_crop": null,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_pad": null,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "Qwen2VLImageProcessorFast",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "input_data_format": null,
+  "max_pixels": 12845056,
+  "merge_size": 2,
+  "min_pixels": 3136,
+  "pad_size": null,
+  "patch_size": 14,
+  "processor_class": "Qwen2_5_VLProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "return_tensors": null,
+  "size": {
+    "longest_edge": 12845056,
+    "shortest_edge": 3136
+  },
+  "temporal_patch_size": 2
+}

runs/Dec04_22-09-12_6165b55eed88/events.out.tfevents.1764886209.6165b55eed88.4158.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:71937f0ce877592f41e1be51b83a5adc2c2ee5688e2e2fab5ffe7c5a1cb21d80
+size 65742

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

telemetry/devices_info.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ CPU cores: 12 CUDA devices: 1
2	+ device(0)='NVIDIA A100-SXM4-80GB' Capability: (8, 0) Memory: [Total: 79.32GiB Free: 78.9GiB Allocated: 0.0GiB Cached: 0.0GiB]

telemetry/telemetry_callback_metrics_rank0000.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "train_runtime": 298.5529,
+  "train_samples_per_second": 1.675,
+  "train_steps_per_second": 1.675,
+  "train_tokens_per_second": 2758.017,
+  "total_flos": 1.972943710728192e+16,
+  "train_loss": 0.3892666745185852,
+  "epoch": 1.0,
+  "num_input_tokens_seen": 424444
+}

telemetry/telemetry_callback_rank0000.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "hostname": "6165b55eed88",
+  "total_time": 314.55871530499996,
+  "timers": {
+    "epochs": {
+      "count": 1.0,
+      "mean": 257.66958669400003,
+      "median": 257.66958669400003,
+      "std_dev": 0,
+      "min": 257.66958669400003,
+      "max": 257.66958669400003,
+      "total": 257.66958669400003,
+      "percentage": 81.91462329828009
+    },
+    "microsteps": {
+      "count": 498.0,
+      "mean": 0.4020090631485908,
+      "median": 0.37922136950004415,
+      "std_dev": 0.07505460587743448,
+      "min": 0.31015944900002523,
+      "max": 0.7942700079998986,
+      "total": 200.20051344799822,
+      "percentage": 63.64487890722765
+    },
+    "steps": {
+      "count": 498.0,
+      "mean": 0.4020103719718869,
+      "median": 0.3792217589999609,
+      "std_dev": 0.07505424005098268,
+      "min": 0.3101598060000015,
+      "max": 0.7942724639999597,
+      "total": 200.20116524199966,
+      "percentage": 63.64508611623816
+    }
+  },
+  "cuda_timers": {},
+  "gpu_memory": [],
+  "gpu_temperature": {}
+}

telemetry/telemetry_callback_wandb_rank0000.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "id": "0eqxl02b",
+  "name": "radiant-fire-125",
+  "url": "https://wandb.ai/video-sync/huggingface/runs/0eqxl02b"
+}

telemetry/training_config.yaml ADDED Viewed

	@@ -0,0 +1,245 @@

+data:
+  train:
+    datasets:
+    - dataset_name: hf_vision
+      dataset_path: null
+      subset: null
+      split: train
+      dataset_kwargs:
+        hf_dataset_path: yosubshin/WaltonMultimodalColdStart-random-500-1
+        image_column: image
+        question_column: problem
+        answer_column: solution
+        return_tensors: true
+        processor_name: Qwen/Qwen2.5-VL-7B-Instruct
+        return_conversations: true
+      sample_count: null
+      mixture_proportion: null
+      shuffle: true
+      seed: 42
+      shuffle_buffer_size: 1000
+      trust_remote_code: true
+      transform_num_workers: auto
+    collator_name: vision_language_sft
+    collator_kwargs:
+      process_individually: true
+    pack: false
+    stream: false
+    target_col: null
+    mixture_strategy: first_exhausted
+    seed: null
+    use_torchdata: true
+  test:
+    datasets: []
+    collator_name: null
+    collator_kwargs: {}
+    pack: false
+    stream: false
+    target_col: null
+    mixture_strategy: first_exhausted
+    seed: null
+    use_torchdata: null
+  validation:
+    datasets: []
+    collator_name: null
+    collator_kwargs: {}
+    pack: false
+    stream: false
+    target_col: null
+    mixture_strategy: first_exhausted
+    seed: null
+    use_torchdata: null
+model:
+  model_name: Qwen/Qwen2.5-VL-7B-Instruct
+  adapter_model: null
+  tokenizer_name: null
+  tokenizer_pad_token: null
+  tokenizer_kwargs: {}
+  processor_kwargs: {}
+  model_max_length: 10000
+  load_pretrained_weights: true
+  trust_remote_code: true
+  torch_dtype_str: bfloat16
+  compile: false
+  chat_template: qwen2-vl-instruct
+  chat_template_kwargs: null
+  attn_implementation: flash_attention_2
+  device_map: auto
+  model_kwargs: {}
+  enable_liger_kernel: false
+  shard_for_eval: false
+  freeze_layers: []
+  model_revision: null
+training:
+  use_peft: false
+  trainer_type: TRL_SFT
+  enable_gradient_checkpointing: true
+  gradient_checkpointing_kwargs:
+    use_reentrant: false
+  output_dir: /contents/qwen2_5_vl_7b_walton_random_500_1
+  per_device_train_batch_size: 1
+  per_device_eval_batch_size: 8
+  gradient_accumulation_steps: 1
+  max_steps: -1
+  num_train_epochs: 1
+  save_epoch: false
+  save_steps: 0
+  save_final_model: true
+  seed: 42
+  data_seed: 42
+  use_deterministic: false
+  full_determinism: false
+  run_name: null
+  metrics_function: null
+  reward_functions: null
+  grpo:
+    model_init_kwargs: {}
+    max_prompt_length: null
+    max_completion_length: null
+    num_generations: null
+    temperature: 0.9
+    remove_unused_columns: false
+    repetition_penalty: 1.0
+    use_vllm: false
+    vllm_mode: null
+    vllm_gpu_memory_utilization: 0.9
+    epsilon: 0.2
+    log_completions: false
+    rollout_function: null
+  gkd:
+    teacher_model_name_or_path: null
+    teacher_model_init_kwargs:
+      dtype: auto
+    temperature: 0.9
+    lmbda: 0.5
+    beta: 0.5
+    max_new_tokens: 128
+    disable_dropout: true
+    seq_kd: false
+  log_level: info
+  dep_log_level: warning
+  log_examples: false
+  enable_wandb: true
+  enable_mlflow: false
+  enable_tensorboard: true
+  logging_strategy: steps
+  logging_dir: null
+  logging_steps: 5
+  logging_first_step: false
+  eval_strategy: 'no'
+  eval_steps: 500
+  learning_rate: 2.0e-05
+  lr_scheduler_type: cosine
+  lr_scheduler_kwargs: {}
+  warmup_ratio: 0.03
+  warmup_steps: null
+  optimizer: adamw_torch_fused
+  weight_decay: 0.01
+  adam_beta1: 0.9
+  adam_beta2: 0.999
+  adam_epsilon: 1.0e-08
+  sgd_momentum: 0.0
+  mixed_precision_dtype: NONE
+  compile: false
+  include_performance_metrics: true
+  include_alternative_mfu_metrics: false
+  log_model_summary: false
+  resume_from_checkpoint: null
+  try_resume_from_last_checkpoint: false
+  dataloader_num_workers: 2
+  dataloader_persistent_workers: false
+  dataloader_prefetch_factor: 8
+  dataloader_main_process_only: false
+  ddp_find_unused_parameters: false
+  max_grad_norm: 1.0
+  trainer_kwargs:
+    max_length: 10000
+    remove_unused_columns: false
+    dataset_kwargs:
+      skip_prepare_dataset: true
+  verl_config_overrides: {}
+  profiler:
+    save_dir: null
+    enable_cpu_profiling: false
+    enable_cuda_profiling: false
+    record_shapes: false
+    profile_memory: false
+    with_stack: false
+    with_flops: false
+    with_modules: false
+    row_limit: 50
+    schedule:
+      enable_schedule: false
+      wait: 0
+      warmup: 1
+      active: 3
+      repeat: 1
+      skip_first: 1
+  telemetry:
+    telemetry_dir: telemetry
+    collect_telemetry_for_all_ranks: false
+    track_gpu_temperature: false
+  empty_device_cache_steps: 1
+  nccl_default_timeout_minutes: null
+  label_ignore_index: null
+peft:
+  lora_r: 8
+  lora_alpha: 8
+  lora_dropout: 0.0
+  lora_target_modules: null
+  lora_target_parameters: null
+  lora_modules_to_save: null
+  lora_bias: none
+  lora_init_weights: DEFAULT
+  lora_task_type: CAUSAL_LM
+  q_lora: false
+  q_lora_bits: 4
+  bnb_4bit_quant_type: fp4
+  llm_int8_skip_modules: null
+  use_bnb_nested_quant: false
+  bnb_4bit_quant_storage: uint8
+  bnb_4bit_compute_dtype: float32
+  peft_save_mode: ADAPTER_ONLY
+fsdp:
+  enable_fsdp: true
+  sharding_strategy: HYBRID_SHARD
+  cpu_offload: false
+  mixed_precision: bf16
+  backward_prefetch: BACKWARD_PRE
+  forward_prefetch: true
+  use_orig_params: null
+  state_dict_type: FULL_STATE_DICT
+  auto_wrap_policy: SIZE_BASED_WRAP
+  min_num_params: 100000
+  transformer_layer_cls: null
+  sync_module_states: true
+deepspeed:
+  enable_deepspeed: false
+  deepspeed_config_path: null
+  zero_stage: ZERO_0
+  offload_optimizer: null
+  offload_param: null
+  precision: null
+  overlap_comm: false
+  contiguous_gradients: true
+  reduce_bucket_size: 500000000
+  allgather_bucket_size: 500000000
+  allgather_partitions: true
+  reduce_scatter: true
+  round_robin_gradients: false
+  stage3_prefetch_bucket_size: 50000000
+  stage3_param_persistence_threshold: 100000
+  stage3_max_live_parameters: 1000000000
+  stage3_max_reuse_distance: 1000000000
+  stage3_gather_16bit_weights_on_model_save: false
+  sub_group_size: 1000000000
+  train_batch_size: auto
+  train_micro_batch_size_per_gpu: auto
+  gradient_accumulation_steps: auto
+  gradient_clipping: auto
+  zero_allow_untested_optimizer: true
+  zero_force_ds_cpu_optimizer: true
+  activation_checkpointing: {}
+  memory_efficient_linear: false
+  steps_per_print: 10
+  wall_clock_breakdown: false

telemetry/world_size.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "LOCAL_WORLD_SIZE": 1,
+  "WORLD_SIZE": 1
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,209 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 10000,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "processor_class": "Qwen2_5_VLProcessor",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1345 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 1.2098822593688965,
+      "epoch": 0.01,
+      "grad_norm": 42.0,
+      "learning_rate": 5.333333333333334e-06,
+      "loss": 1.2171,
+      "mean_token_accuracy": 0.8310450196266175,
+      "num_input_tokens_seen": 4781,
+      "num_tokens": 4781.0,
+      "step": 5,
+      "train_runtime": 45.8999,
+      "train_tokens_per_second": 104.161
+    },
+    {
+      "entropy": 0.6481770157814026,
+      "epoch": 0.02,
+      "grad_norm": 5.65625,
+      "learning_rate": 1.2e-05,
+      "loss": 0.6354,
+      "mean_token_accuracy": 0.8989984631538391,
+      "num_input_tokens_seen": 8956,
+      "num_tokens": 8956.0,
+      "step": 10,
+      "train_runtime": 48.3641,
+      "train_tokens_per_second": 185.179
+    },
+    {
+      "entropy": 1.4211748957633972,
+      "epoch": 0.03,
+      "grad_norm": 16.125,
+      "learning_rate": 1.866666666666667e-05,
+      "loss": 0.5623,
+      "mean_token_accuracy": 0.8859113693237305,
+      "num_input_tokens_seen": 12206,
+      "num_tokens": 12206.0,
+      "step": 15,
+      "train_runtime": 50.75,
+      "train_tokens_per_second": 240.512
+    },
+    {
+      "entropy": 1.6534127473831177,
+      "epoch": 0.04,
+      "grad_norm": 24.125,
+      "learning_rate": 1.9996643536268202e-05,
+      "loss": 0.5616,
+      "mean_token_accuracy": 0.8681243538856507,
+      "num_input_tokens_seen": 15256,
+      "num_tokens": 15256.0,
+      "step": 20,
+      "train_runtime": 53.0284,
+      "train_tokens_per_second": 287.695
+    },
+    {
+      "entropy": 1.156516921520233,
+      "epoch": 0.05,
+      "grad_norm": 10.5625,
+      "learning_rate": 1.9983011763899674e-05,
+      "loss": 0.9336,
+      "mean_token_accuracy": 0.7931149959564209,
+      "num_input_tokens_seen": 19115,
+      "num_tokens": 19115.0,
+      "step": 25,
+      "train_runtime": 55.4933,
+      "train_tokens_per_second": 344.456
+    },
+    {
+      "entropy": 1.7109876096248626,
+      "epoch": 0.06,
+      "grad_norm": 31.375,
+      "learning_rate": 1.995890919075519e-05,
+      "loss": 0.419,
+      "mean_token_accuracy": 0.8940125346183777,
+      "num_input_tokens_seen": 23042,
+      "num_tokens": 23042.0,
+      "step": 30,
+      "train_runtime": 57.958,
+      "train_tokens_per_second": 397.564
+    },
+    {
+      "entropy": 1.2635510742664338,
+      "epoch": 0.07,
+      "grad_norm": 37.75,
+      "learning_rate": 1.9924361097105624e-05,
+      "loss": 0.4631,
+      "mean_token_accuracy": 0.8700049161911011,
+      "num_input_tokens_seen": 27231,
+      "num_tokens": 27231.0,
+      "step": 35,
+      "train_runtime": 60.4425,
+      "train_tokens_per_second": 450.527
+    },
+    {
+      "entropy": 1.7808212399482728,
+      "epoch": 0.08,
+      "grad_norm": 9.125,
+      "learning_rate": 1.987940371913044e-05,
+      "loss": 0.4037,
+      "mean_token_accuracy": 0.8829172015190124,
+      "num_input_tokens_seen": 30038,
+      "num_tokens": 30038.0,
+      "step": 40,
+      "train_runtime": 62.6525,
+      "train_tokens_per_second": 479.439
+    },
+    {
+      "entropy": 0.542309308052063,
+      "epoch": 0.09,
+      "grad_norm": 226.0,
+      "learning_rate": 1.9824084210910924e-05,
+      "loss": 0.2899,
+      "mean_token_accuracy": 0.9064181089401245,
+      "num_input_tokens_seen": 35933,
+      "num_tokens": 35933.0,
+      "step": 45,
+      "train_runtime": 65.4796,
+      "train_tokens_per_second": 548.766
+    },
+    {
+      "entropy": 1.5936108112335206,
+      "epoch": 0.1,
+      "grad_norm": 10.125,
+      "learning_rate": 1.9758460594972068e-05,
+      "loss": 0.3814,
+      "mean_token_accuracy": 0.8922946691513062,
+      "num_input_tokens_seen": 39538,
+      "num_tokens": 39538.0,
+      "step": 50,
+      "train_runtime": 67.8868,
+      "train_tokens_per_second": 582.411
+    },
+    {
+      "entropy": 0.33137865364551544,
+      "epoch": 0.11,
+      "grad_norm": 6.0625,
+      "learning_rate": 1.9682601701424958e-05,
+      "loss": 0.2417,
+      "mean_token_accuracy": 0.9260695338249206,
+      "num_input_tokens_seen": 45125,
+      "num_tokens": 45125.0,
+      "step": 55,
+      "train_runtime": 70.6806,
+      "train_tokens_per_second": 638.435
+    },
+    {
+      "entropy": 2.0804291367530823,
+      "epoch": 0.12,
+      "grad_norm": 6.0,
+      "learning_rate": 1.9596587095773496e-05,
+      "loss": 0.3692,
+      "mean_token_accuracy": 0.888221287727356,
+      "num_input_tokens_seen": 49426,
+      "num_tokens": 49426.0,
+      "step": 60,
+      "train_runtime": 73.2615,
+      "train_tokens_per_second": 674.652
+    },
+    {
+      "entropy": 2.3786060571670533,
+      "epoch": 0.13,
+      "grad_norm": 52.25,
+      "learning_rate": 1.950050699546116e-05,
+      "loss": 0.5333,
+      "mean_token_accuracy": 0.844174611568451,
+      "num_input_tokens_seen": 52776,
+      "num_tokens": 52776.0,
+      "step": 65,
+      "train_runtime": 75.5797,
+      "train_tokens_per_second": 698.283
+    },
+    {
+      "entropy": 3.5936172008514404,
+      "epoch": 0.14,
+      "grad_norm": 10.4375,
+      "learning_rate": 1.9394462175245382e-05,
+      "loss": 0.3433,
+      "mean_token_accuracy": 0.9040402054786683,
+      "num_input_tokens_seen": 58224,
+      "num_tokens": 58224.0,
+      "step": 70,
+      "train_runtime": 78.4088,
+      "train_tokens_per_second": 742.57
+    },
+    {
+      "entropy": 3.5159372806549074,
+      "epoch": 0.15,
+      "grad_norm": 8.5,
+      "learning_rate": 1.9278563861498726e-05,
+      "loss": 0.4922,
+      "mean_token_accuracy": 0.8572691679000854,
+      "num_input_tokens_seen": 61751,
+      "num_tokens": 61751.0,
+      "step": 75,
+      "train_runtime": 80.8265,
+      "train_tokens_per_second": 763.995
+    },
+    {
+      "entropy": 2.106823134422302,
+      "epoch": 0.16,
+      "grad_norm": 9.0,
+      "learning_rate": 1.91529336155478e-05,
+      "loss": 0.3312,
+      "mean_token_accuracy": 0.8902272701263427,
+      "num_input_tokens_seen": 66370,
+      "num_tokens": 66370.0,
+      "step": 80,
+      "train_runtime": 83.4413,
+      "train_tokens_per_second": 795.409
+    },
+    {
+      "entropy": 2.849087345600128,
+      "epoch": 0.17,
+      "grad_norm": 4.34375,
+      "learning_rate": 1.9017703206172187e-05,
+      "loss": 0.3891,
+      "mean_token_accuracy": 0.8769482135772705,
+      "num_input_tokens_seen": 71825,
+      "num_tokens": 71825.0,
+      "step": 85,
+      "train_runtime": 86.2703,
+      "train_tokens_per_second": 832.558
+    },
+    {
+      "entropy": 0.9910945057868957,
+      "epoch": 0.18,
+      "grad_norm": 4.375,
+      "learning_rate": 1.8873014471397225e-05,
+      "loss": 0.2142,
+      "mean_token_accuracy": 0.9370332717895508,
+      "num_input_tokens_seen": 78620,
+      "num_tokens": 78620.0,
+      "step": 90,
+      "train_runtime": 89.3207,
+      "train_tokens_per_second": 880.199
+    },
+    {
+      "entropy": 1.6270696818828583,
+      "epoch": 0.19,
+      "grad_norm": 15.6875,
+      "learning_rate": 1.871901916972547e-05,
+      "loss": 0.2867,
+      "mean_token_accuracy": 0.91047682762146,
+      "num_input_tokens_seen": 81646,
+      "num_tokens": 81646.0,
+      "step": 95,
+      "train_runtime": 91.6169,
+      "train_tokens_per_second": 891.167
+    },
+    {
+      "entropy": 2.038286340236664,
+      "epoch": 0.2,
+      "grad_norm": 5.1875,
+      "learning_rate": 1.8555878820963014e-05,
+      "loss": 0.2827,
+      "mean_token_accuracy": 0.9124228358268738,
+      "num_input_tokens_seen": 85808,
+      "num_tokens": 85808.0,
+      "step": 100,
+      "train_runtime": 94.1424,
+      "train_tokens_per_second": 911.47
+    },
+    {
+      "entropy": 1.9634060263633728,
+      "epoch": 0.21,
+      "grad_norm": 34.0,
+      "learning_rate": 1.8383764536807486e-05,
+      "loss": 0.4848,
+      "mean_token_accuracy": 0.8574860453605652,
+      "num_input_tokens_seen": 89581,
+      "num_tokens": 89581.0,
+      "step": 105,
+      "train_runtime": 96.5271,
+      "train_tokens_per_second": 928.04
+    },
+    {
+      "entropy": 1.775536298751831,
+      "epoch": 0.22,
+      "grad_norm": 5.40625,
+      "learning_rate": 1.8202856841375517e-05,
+      "loss": 0.3573,
+      "mean_token_accuracy": 0.8958765864372253,
+      "num_input_tokens_seen": 92352,
+      "num_tokens": 92352.0,
+      "step": 110,
+      "train_runtime": 98.7649,
+      "train_tokens_per_second": 935.069
+    },
+    {
+      "entropy": 1.5104227304458617,
+      "epoch": 0.23,
+      "grad_norm": 8.6875,
+      "learning_rate": 1.8013345481857903e-05,
+      "loss": 0.3306,
+      "mean_token_accuracy": 0.8874430656433105,
+      "num_input_tokens_seen": 97262,
+      "num_tokens": 97262.0,
+      "step": 115,
+      "train_runtime": 101.4721,
+      "train_tokens_per_second": 958.51
+    },
+    {
+      "entropy": 1.6048516988754273,
+      "epoch": 0.24,
+      "grad_norm": 6.75,
+      "learning_rate": 1.7815429229500946e-05,
+      "loss": 0.2865,
+      "mean_token_accuracy": 0.9133572459220887,
+      "num_input_tokens_seen": 101680,
+      "num_tokens": 101680.0,
+      "step": 120,
+      "train_runtime": 104.0352,
+      "train_tokens_per_second": 977.361
+    },
+    {
+      "entropy": 1.5413621544837952,
+      "epoch": 0.25,
+      "grad_norm": 9.5,
+      "learning_rate": 1.7609315671122912e-05,
+      "loss": 0.4193,
+      "mean_token_accuracy": 0.8804891586303711,
+      "num_input_tokens_seen": 105845,
+      "num_tokens": 105845.0,
+      "step": 125,
+      "train_runtime": 106.5596,
+      "train_tokens_per_second": 993.293
+    },
+    {
+      "entropy": 0.7295441627502441,
+      "epoch": 0.26,
+      "grad_norm": 5.5,
+      "learning_rate": 1.739522099138411e-05,
+      "loss": 0.2694,
+      "mean_token_accuracy": 0.9172501802444458,
+      "num_input_tokens_seen": 110058,
+      "num_tokens": 110058.0,
+      "step": 130,
+      "train_runtime": 109.0787,
+      "train_tokens_per_second": 1008.978
+    },
+    {
+      "entropy": 0.8994132578372955,
+      "epoch": 0.27,
+      "grad_norm": 7.65625,
+      "learning_rate": 1.7173369746039026e-05,
+      "loss": 0.3696,
+      "mean_token_accuracy": 0.8766513228416443,
+      "num_input_tokens_seen": 115609,
+      "num_tokens": 115609.0,
+      "step": 135,
+      "train_runtime": 111.8977,
+      "train_tokens_per_second": 1033.167
+    },
+    {
+      "entropy": 1.00724515914917,
+      "epoch": 0.28,
+      "grad_norm": 9.625,
+      "learning_rate": 1.6943994626408365e-05,
+      "loss": 0.3011,
+      "mean_token_accuracy": 0.9135396718978882,
+      "num_input_tokens_seen": 119642,
+      "num_tokens": 119642.0,
+      "step": 140,
+      "train_runtime": 114.4392,
+      "train_tokens_per_second": 1045.464
+    },
+    {
+      "entropy": 0.5401971220970154,
+      "epoch": 0.29,
+      "grad_norm": 42.75,
+      "learning_rate": 1.6707336215317968e-05,
+      "loss": 0.334,
+      "mean_token_accuracy": 0.8978281736373901,
+      "num_input_tokens_seen": 124260,
+      "num_tokens": 124260.0,
+      "step": 145,
+      "train_runtime": 117.0247,
+      "train_tokens_per_second": 1061.827
+    },
+    {
+      "entropy": 0.5970407903194428,
+      "epoch": 0.3,
+      "grad_norm": 17.875,
+      "learning_rate": 1.646364273476067e-05,
+      "loss": 0.3841,
+      "mean_token_accuracy": 0.8826762557029724,
+      "num_input_tokens_seen": 129260,
+      "num_tokens": 129260.0,
+      "step": 150,
+      "train_runtime": 119.7547,
+      "train_tokens_per_second": 1079.373
+    },
+    {
+      "entropy": 0.6529799103736877,
+      "epoch": 0.31,
+      "grad_norm": 5.8125,
+      "learning_rate": 1.6213169785545688e-05,
+      "loss": 0.4827,
+      "mean_token_accuracy": 0.8586708188056946,
+      "num_input_tokens_seen": 132550,
+      "num_tokens": 132550.0,
+      "step": 155,
+      "train_runtime": 122.083,
+      "train_tokens_per_second": 1085.737
+    },
+    {
+      "entropy": 0.8556781709194183,
+      "epoch": 0.32,
+      "grad_norm": 12.6875,
+      "learning_rate": 1.5956180079208684e-05,
+      "loss": 0.3258,
+      "mean_token_accuracy": 0.899234139919281,
+      "num_input_tokens_seen": 137016,
+      "num_tokens": 137016.0,
+      "step": 160,
+      "train_runtime": 124.7137,
+      "train_tokens_per_second": 1098.644
+    },
+    {
+      "entropy": 1.3605871438980102,
+      "epoch": 0.33,
+      "grad_norm": 14.6875,
+      "learning_rate": 1.5692943162463628e-05,
+      "loss": 0.4556,
+      "mean_token_accuracy": 0.8675174951553345,
+      "num_input_tokens_seen": 140162,
+      "num_tokens": 140162.0,
+      "step": 165,
+      "train_runtime": 127.0736,
+      "train_tokens_per_second": 1102.998
+    },
+    {
+      "entropy": 1.3584325790405274,
+      "epoch": 0.34,
+      "grad_norm": 11.125,
+      "learning_rate": 1.5423735134485537e-05,
+      "loss": 0.3785,
+      "mean_token_accuracy": 0.884908127784729,
+      "num_input_tokens_seen": 144154,
+      "num_tokens": 144154.0,
+      "step": 170,
+      "train_runtime": 129.5953,
+      "train_tokens_per_second": 1112.34
+    },
+    {
+      "entropy": 1.1947744607925415,
+      "epoch": 0.35,
+      "grad_norm": 11.125,
+      "learning_rate": 1.5148838357320537e-05,
+      "loss": 0.5116,
+      "mean_token_accuracy": 0.8327634334564209,
+      "num_input_tokens_seen": 147793,
+      "num_tokens": 147793.0,
+      "step": 175,
+      "train_runtime": 132.0069,
+      "train_tokens_per_second": 1119.586
+    },
+    {
+      "entropy": 0.691273283958435,
+      "epoch": 0.36,
+      "grad_norm": 8.6875,
+      "learning_rate": 1.4868541159727097e-05,
+      "loss": 0.4557,
+      "mean_token_accuracy": 0.869376790523529,
+      "num_input_tokens_seen": 151710,
+      "num_tokens": 151710.0,
+      "step": 180,
+      "train_runtime": 134.5176,
+      "train_tokens_per_second": 1127.808
+    },
+    {
+      "entropy": 0.6039592266082764,
+      "epoch": 0.37,
+      "grad_norm": 6.53125,
+      "learning_rate": 1.4583137534758968e-05,
+      "loss": 0.3602,
+      "mean_token_accuracy": 0.8929054975509644,
+      "num_input_tokens_seen": 156235,
+      "num_tokens": 156235.0,
+      "step": 185,
+      "train_runtime": 137.1479,
+      "train_tokens_per_second": 1139.172
+    },
+    {
+      "entropy": 1.1752222299575805,
+      "epoch": 0.38,
+      "grad_norm": 7.59375,
+      "learning_rate": 1.429292683140706e-05,
+      "loss": 0.3343,
+      "mean_token_accuracy": 0.9040075540542603,
+      "num_input_tokens_seen": 160384,
+      "num_tokens": 160384.0,
+      "step": 190,
+      "train_runtime": 139.7132,
+      "train_tokens_per_second": 1147.952
+    },
+    {
+      "entropy": 0.49049092233181,
+      "epoch": 0.39,
+      "grad_norm": 15.3125,
+      "learning_rate": 1.3998213440623691e-05,
+      "loss": 0.2802,
+      "mean_token_accuracy": 0.9056740045547486,
+      "num_input_tokens_seen": 165232,
+      "num_tokens": 165232.0,
+      "step": 195,
+      "train_runtime": 142.3929,
+      "train_tokens_per_second": 1160.395
+    },
+    {
+      "entropy": 0.45642663538455963,
+      "epoch": 0.4,
+      "grad_norm": 3.84375,
+      "learning_rate": 1.3699306476058523e-05,
+      "loss": 0.243,
+      "mean_token_accuracy": 0.924269187450409,
+      "num_input_tokens_seen": 169212,
+      "num_tokens": 169212.0,
+      "step": 200,
+      "train_runtime": 144.8837,
+      "train_tokens_per_second": 1167.916
+    },
+    {
+      "entropy": 0.6119729101657867,
+      "epoch": 0.41,
+      "grad_norm": 13.75,
+      "learning_rate": 1.3396519449841006e-05,
+      "loss": 0.3785,
+      "mean_token_accuracy": 0.8857461333274841,
+      "num_input_tokens_seen": 173680,
+      "num_tokens": 173680.0,
+      "step": 205,
+      "train_runtime": 147.502,
+      "train_tokens_per_second": 1177.475
+    },
+    {
+      "entropy": 0.3425890028476715,
+      "epoch": 0.42,
+      "grad_norm": 2.71875,
+      "learning_rate": 1.3090169943749475e-05,
+      "loss": 0.3554,
+      "mean_token_accuracy": 0.8928213953971863,
+      "num_input_tokens_seen": 178944,
+      "num_tokens": 178944.0,
+      "step": 210,
+      "train_runtime": 150.2217,
+      "train_tokens_per_second": 1191.2
+    },
+    {
+      "entropy": 0.47310497164726256,
+      "epoch": 0.43,
+      "grad_norm": 6.6875,
+      "learning_rate": 1.2780579276111702e-05,
+      "loss": 0.3294,
+      "mean_token_accuracy": 0.8975471615791321,
+      "num_input_tokens_seen": 184031,
+      "num_tokens": 184031.0,
+      "step": 215,
+      "train_runtime": 152.9698,
+      "train_tokens_per_second": 1203.054
+    },
+    {
+      "entropy": 0.4975893020629883,
+      "epoch": 0.44,
+      "grad_norm": 14.6875,
+      "learning_rate": 1.2468072164786342e-05,
+      "loss": 0.4125,
+      "mean_token_accuracy": 0.8858692884445191,
+      "num_input_tokens_seen": 188268,
+      "num_tokens": 188268.0,
+      "step": 220,
+      "train_runtime": 155.6411,
+      "train_tokens_per_second": 1209.629
+    },
+    {
+      "entropy": 0.9669841468334198,
+      "epoch": 0.45,
+      "grad_norm": 6.96875,
+      "learning_rate": 1.215297638657875e-05,
+      "loss": 0.4774,
+      "mean_token_accuracy": 0.8559552788734436,
+      "num_input_tokens_seen": 193510,
+      "num_tokens": 193510.0,
+      "step": 225,
+      "train_runtime": 158.4801,
+      "train_tokens_per_second": 1221.037
+    },
+    {
+      "entropy": 1.1044464826583862,
+      "epoch": 0.46,
+      "grad_norm": 31.75,
+      "learning_rate": 1.1835622433448361e-05,
+      "loss": 0.4524,
+      "mean_token_accuracy": 0.8517592191696167,
+      "num_input_tokens_seen": 197386,
+      "num_tokens": 197386.0,
+      "step": 230,
+      "train_runtime": 160.9307,
+      "train_tokens_per_second": 1226.528
+    },
+    {
+      "entropy": 0.7312864840030671,
+      "epoch": 0.47,
+      "grad_norm": 6.84375,
+      "learning_rate": 1.151634316586828e-05,
+      "loss": 0.4799,
+      "mean_token_accuracy": 0.8530717253684997,
+      "num_input_tokens_seen": 201250,
+      "num_tokens": 201250.0,
+      "step": 235,
+      "train_runtime": 163.4728,
+      "train_tokens_per_second": 1231.092
+    },
+    {
+      "entropy": 0.9152665376663208,
+      "epoch": 0.48,
+      "grad_norm": 9.75,
+      "learning_rate": 1.119547346370059e-05,
+      "loss": 0.5162,
+      "mean_token_accuracy": 0.8369959235191345,
+      "num_input_tokens_seen": 204456,
+      "num_tokens": 204456.0,
+      "step": 240,
+      "train_runtime": 165.817,
+      "train_tokens_per_second": 1233.022
+    },
+    {
+      "entropy": 0.3133948802947998,
+      "epoch": 0.49,
+      "grad_norm": 3.65625,
+      "learning_rate": 1.087334987495364e-05,
+      "loss": 0.2908,
+      "mean_token_accuracy": 0.9040228486061096,
+      "num_input_tokens_seen": 211253,
+      "num_tokens": 211253.0,
+      "step": 245,
+      "train_runtime": 168.8822,
+      "train_tokens_per_second": 1250.89
+    },
+    {
+      "entropy": 1.0434356153011322,
+      "epoch": 0.5,
+      "grad_norm": 19.625,
+      "learning_rate": 1.055031026278965e-05,
+      "loss": 0.3272,
+      "mean_token_accuracy": 0.887370252609253,
+      "num_input_tokens_seen": 216724,
+      "num_tokens": 216724.0,
+      "step": 250,
+      "train_runtime": 171.7262,
+      "train_tokens_per_second": 1262.033
+    },
+    {
+      "entropy": 0.2794555306434631,
+      "epoch": 0.51,
+      "grad_norm": 4.875,
+      "learning_rate": 1.02266934511529e-05,
+      "loss": 0.2826,
+      "mean_token_accuracy": 0.9088755249977112,
+      "num_input_tokens_seen": 222416,
+      "num_tokens": 222416.0,
+      "step": 255,
+      "train_runtime": 174.5262,
+      "train_tokens_per_second": 1274.399
+    },
+    {
+      "entropy": 0.3180317282676697,
+      "epoch": 0.52,
+      "grad_norm": 4.71875,
+      "learning_rate": 9.90283886939023e-06,
+      "loss": 0.2928,
+      "mean_token_accuracy": 0.9024029850959778,
+      "num_input_tokens_seen": 227244,
+      "num_tokens": 227244.0,
+      "step": 260,
+      "train_runtime": 177.1886,
+      "train_tokens_per_second": 1282.498
+    },
+    {
+      "entropy": 0.6288993656635284,
+      "epoch": 0.53,
+      "grad_norm": 10.125,
+      "learning_rate": 9.579086196236483e-06,
+      "loss": 0.5484,
+      "mean_token_accuracy": 0.8212805986404419,
+      "num_input_tokens_seen": 230180,
+      "num_tokens": 230180.0,
+      "step": 265,
+      "train_runtime": 179.4447,
+      "train_tokens_per_second": 1282.735
+    },
+    {
+      "entropy": 0.6958479404449462,
+      "epoch": 0.54,
+      "grad_norm": 8.3125,
+      "learning_rate": 9.255775003538462e-06,
+      "loss": 0.4021,
+      "mean_token_accuracy": 0.8744520783424378,
+      "num_input_tokens_seen": 233108,
+      "num_tokens": 233108.0,
+      "step": 270,
+      "train_runtime": 181.7241,
+      "train_tokens_per_second": 1282.758
+    },
+    {
+      "entropy": 0.6523207008838654,
+      "epoch": 0.55,
+      "grad_norm": 3.359375,
+      "learning_rate": 8.933244400090937e-06,
+      "loss": 0.2467,
+      "mean_token_accuracy": 0.9232385158538818,
+      "num_input_tokens_seen": 239685,
+      "num_tokens": 239685.0,
+      "step": 275,
+      "train_runtime": 184.7242,
+      "train_tokens_per_second": 1297.529
+    },
+    {
+      "entropy": 0.670277139544487,
+      "epoch": 0.56,
+      "grad_norm": 10.375,
+      "learning_rate": 8.611832675958335e-06,
+      "loss": 0.4696,
+      "mean_token_accuracy": 0.856619131565094,
+      "num_input_tokens_seen": 243110,
+      "num_tokens": 243110.0,
+      "step": 280,
+      "train_runtime": 187.1388,
+      "train_tokens_per_second": 1299.089
+    },
+    {
+      "entropy": 0.5471946001052856,
+      "epoch": 0.57,
+      "grad_norm": 55.0,
+      "learning_rate": 8.291876947655197e-06,
+      "loss": 0.4469,
+      "mean_token_accuracy": 0.8535593390464783,
+      "num_input_tokens_seen": 247528,
+      "num_tokens": 247528.0,
+      "step": 285,
+      "train_runtime": 189.7769,
+      "train_tokens_per_second": 1304.311
+    },
+    {
+      "entropy": 0.3831256806850433,
+      "epoch": 0.58,
+      "grad_norm": 5.125,
+      "learning_rate": 7.9737128045575e-06,
+      "loss": 0.334,
+      "mean_token_accuracy": 0.8906423330307007,
+      "num_input_tokens_seen": 252416,
+      "num_tokens": 252416.0,
+      "step": 290,
+      "train_runtime": 192.4368,
+      "train_tokens_per_second": 1311.683
+    },
+    {
+      "entropy": 0.39651446640491483,
+      "epoch": 0.59,
+      "grad_norm": 12.625,
+      "learning_rate": 7.657673956915735e-06,
+      "loss": 0.332,
+      "mean_token_accuracy": 0.9020482778549195,
+      "num_input_tokens_seen": 257998,
+      "num_tokens": 257998.0,
+      "step": 295,
+      "train_runtime": 195.2753,
+      "train_tokens_per_second": 1321.201
+    },
+    {
+      "entropy": 0.4273271858692169,
+      "epoch": 0.6,
+      "grad_norm": 6.15625,
+      "learning_rate": 7.344091885838949e-06,
+      "loss": 0.3462,
+      "mean_token_accuracy": 0.8952762126922608,
+      "num_input_tokens_seen": 262326,
+      "num_tokens": 262326.0,
+      "step": 300,
+      "train_runtime": 197.861,
+      "train_tokens_per_second": 1325.81
+    },
+    {
+      "entropy": 0.29413761496543883,
+      "epoch": 0.61,
+      "grad_norm": 6.6875,
+      "learning_rate": 7.033295495616834e-06,
+      "loss": 0.2557,
+      "mean_token_accuracy": 0.9186268091201782,
+      "num_input_tokens_seen": 267582,
+      "num_tokens": 267582.0,
+      "step": 305,
+      "train_runtime": 200.6458,
+      "train_tokens_per_second": 1333.604
+    },
+    {
+      "entropy": 0.8326136410236359,
+      "epoch": 0.62,
+      "grad_norm": 4.375,
+      "learning_rate": 6.725610768744535e-06,
+      "loss": 0.4105,
+      "mean_token_accuracy": 0.8623612403869629,
+      "num_input_tokens_seen": 272200,
+      "num_tokens": 272200.0,
+      "step": 310,
+      "train_runtime": 203.284,
+      "train_tokens_per_second": 1339.013
+    },
+    {
+      "entropy": 0.5061671316623688,
+      "epoch": 0.63,
+      "grad_norm": 11.625,
+      "learning_rate": 6.421360424012039e-06,
+      "loss": 0.388,
+      "mean_token_accuracy": 0.8867331266403198,
+      "num_input_tokens_seen": 275904,
+      "num_tokens": 275904.0,
+      "step": 315,
+      "train_runtime": 205.7058,
+      "train_tokens_per_second": 1341.256
+    },
+    {
+      "entropy": 0.46229581236839296,
+      "epoch": 0.64,
+      "grad_norm": 5.09375,
+      "learning_rate": 6.120863578016736e-06,
+      "loss": 0.3408,
+      "mean_token_accuracy": 0.8920576095581054,
+      "num_input_tokens_seen": 279452,
+      "num_tokens": 279452.0,
+      "step": 320,
+      "train_runtime": 208.0951,
+      "train_tokens_per_second": 1342.905
+    },
+    {
+      "entropy": 0.38839916586875917,
+      "epoch": 0.65,
+      "grad_norm": 3.421875,
+      "learning_rate": 5.82443541045415e-06,
+      "loss": 0.3633,
+      "mean_token_accuracy": 0.8919775366783143,
+      "num_input_tokens_seen": 282480,
+      "num_tokens": 282480.0,
+      "step": 325,
+      "train_runtime": 210.4335,
+      "train_tokens_per_second": 1342.372
+    },
+    {
+      "entropy": 0.4262740671634674,
+      "epoch": 0.66,
+      "grad_norm": 6.96875,
+      "learning_rate": 5.5323868335379775e-06,
+      "loss": 0.1888,
+      "mean_token_accuracy": 0.9332632660865784,
+      "num_input_tokens_seen": 286126,
+      "num_tokens": 286126.0,
+      "step": 330,
+      "train_runtime": 212.9061,
+      "train_tokens_per_second": 1343.907
+    },
+    {
+      "entropy": 0.5601199686527252,
+      "epoch": 0.67,
+      "grad_norm": 9.9375,
+      "learning_rate": 5.245024165896126e-06,
+      "loss": 0.3555,
+      "mean_token_accuracy": 0.8886765837669373,
+      "num_input_tokens_seen": 289732,
+      "num_tokens": 289732.0,
+      "step": 335,
+      "train_runtime": 215.3015,
+      "train_tokens_per_second": 1345.704
+    },
+    {
+      "entropy": 0.5852712333202362,
+      "epoch": 0.68,
+      "grad_norm": 3.8125,
+      "learning_rate": 4.9626488112847384e-06,
+      "loss": 0.3912,
+      "mean_token_accuracy": 0.8769573211669922,
+      "num_input_tokens_seen": 295490,
+      "num_tokens": 295490.0,
+      "step": 340,
+      "train_runtime": 218.1472,
+      "train_tokens_per_second": 1354.544
+    },
+    {
+      "entropy": 0.8315090477466583,
+      "epoch": 0.69,
+      "grad_norm": 14.0,
+      "learning_rate": 4.685556942457296e-06,
+      "loss": 0.568,
+      "mean_token_accuracy": 0.8297129511833191,
+      "num_input_tokens_seen": 299015,
+      "num_tokens": 299015.0,
+      "step": 345,
+      "train_runtime": 220.5146,
+      "train_tokens_per_second": 1355.987
+    },
+    {
+      "entropy": 0.7352282881736756,
+      "epoch": 0.7,
+      "grad_norm": 4.875,
+      "learning_rate": 4.414039190520308e-06,
+      "loss": 0.4531,
+      "mean_token_accuracy": 0.8616520643234253,
+      "num_input_tokens_seen": 301596,
+      "num_tokens": 301596.0,
+      "step": 350,
+      "train_runtime": 222.7275,
+      "train_tokens_per_second": 1354.103
+    },
+    {
+      "entropy": 0.8405511379241943,
+      "epoch": 0.71,
+      "grad_norm": 8.5,
+      "learning_rate": 4.14838034010138e-06,
+      "loss": 0.3854,
+      "mean_token_accuracy": 0.8817270874977112,
+      "num_input_tokens_seen": 304601,
+      "num_tokens": 304601.0,
+      "step": 355,
+      "train_runtime": 225.0321,
+      "train_tokens_per_second": 1353.589
+    },
+    {
+      "entropy": 0.5560254514217376,
+      "epoch": 0.72,
+      "grad_norm": 8.375,
+      "learning_rate": 3.888859030649498e-06,
+      "loss": 0.379,
+      "mean_token_accuracy": 0.8813069701194763,
+      "num_input_tokens_seen": 307944,
+      "num_tokens": 307944.0,
+      "step": 360,
+      "train_runtime": 227.4633,
+      "train_tokens_per_second": 1353.818
+    },
+    {
+      "entropy": 0.5684987097978592,
+      "epoch": 0.73,
+      "grad_norm": 5.8125,
+      "learning_rate": 3.63574746418072e-06,
+      "loss": 0.329,
+      "mean_token_accuracy": 0.8944082856178284,
+      "num_input_tokens_seen": 311082,
+      "num_tokens": 311082.0,
+      "step": 365,
+      "train_runtime": 229.7563,
+      "train_tokens_per_second": 1353.965
+    },
+    {
+      "entropy": 0.5046725153923035,
+      "epoch": 0.74,
+      "grad_norm": 15.6875,
+      "learning_rate": 3.3893111197758276e-06,
+      "loss": 0.3795,
+      "mean_token_accuracy": 0.8841215133666992,
+      "num_input_tokens_seen": 314566,
+      "num_tokens": 314566.0,
+      "step": 370,
+      "train_runtime": 232.0872,
+      "train_tokens_per_second": 1355.379
+    },
+    {
+      "entropy": 0.4979218304157257,
+      "epoch": 0.75,
+      "grad_norm": 10.375,
+      "learning_rate": 3.1498084751294523e-06,
+      "loss": 0.3802,
+      "mean_token_accuracy": 0.8899975538253784,
+      "num_input_tokens_seen": 318700,
+      "num_tokens": 318700.0,
+      "step": 375,
+      "train_runtime": 234.621,
+      "train_tokens_per_second": 1358.361
+    },
+    {
+      "entropy": 0.9946581959724426,
+      "epoch": 0.76,
+      "grad_norm": 7.75,
+      "learning_rate": 2.9174907354426696e-06,
+      "loss": 0.4987,
+      "mean_token_accuracy": 0.8541815996170044,
+      "num_input_tokens_seen": 321552,
+      "num_tokens": 321552.0,
+      "step": 380,
+      "train_runtime": 236.909,
+      "train_tokens_per_second": 1357.281
+    },
+    {
+      "entropy": 0.5532813727855682,
+      "epoch": 0.77,
+      "grad_norm": 7.0625,
+      "learning_rate": 2.692601569943407e-06,
+      "loss": 0.3382,
+      "mean_token_accuracy": 0.8951536297798157,
+      "num_input_tokens_seen": 323895,
+      "num_tokens": 323895.0,
+      "step": 385,
+      "train_runtime": 239.1019,
+      "train_tokens_per_second": 1354.632
+    },
+    {
+      "entropy": 0.8350813835859299,
+      "epoch": 0.78,
+      "grad_norm": 13.1875,
+      "learning_rate": 2.475376856311097e-06,
+      "loss": 0.466,
+      "mean_token_accuracy": 0.863529098033905,
+      "num_input_tokens_seen": 331802,
+      "num_tokens": 331802.0,
+      "step": 390,
+      "train_runtime": 242.4637,
+      "train_tokens_per_second": 1368.461
+    },
+    {
+      "entropy": 0.6075281083583832,
+      "epoch": 0.79,
+      "grad_norm": 13.125,
+      "learning_rate": 2.266044433273562e-06,
+      "loss": 0.302,
+      "mean_token_accuracy": 0.9050341606140136,
+      "num_input_tokens_seen": 336078,
+      "num_tokens": 336078.0,
+      "step": 395,
+      "train_runtime": 245.0273,
+      "train_tokens_per_second": 1371.594
+    },
+    {
+      "entropy": 0.6790629208087922,
+      "epoch": 0.8,
+      "grad_norm": 6.90625,
+      "learning_rate": 2.064823861635633e-06,
+      "loss": 0.3397,
+      "mean_token_accuracy": 0.8993703126907349,
+      "num_input_tokens_seen": 342134,
+      "num_tokens": 342134.0,
+      "step": 400,
+      "train_runtime": 248.0414,
+      "train_tokens_per_second": 1379.342
+    },
+    {
+      "entropy": 0.5511207699775695,
+      "epoch": 0.81,
+      "grad_norm": 12.3125,
+      "learning_rate": 1.8719261939902023e-06,
+      "loss": 0.2637,
+      "mean_token_accuracy": 0.9192676782608032,
+      "num_input_tokens_seen": 346820,
+      "num_tokens": 346820.0,
+      "step": 405,
+      "train_runtime": 250.6779,
+      "train_tokens_per_second": 1383.529
+    },
+    {
+      "entropy": 0.5205666542053222,
+      "epoch": 0.82,
+      "grad_norm": 6.65625,
+      "learning_rate": 1.687553753353195e-06,
+      "loss": 0.3645,
+      "mean_token_accuracy": 0.8808952808380127,
+      "num_input_tokens_seen": 351616,
+      "num_tokens": 351616.0,
+      "step": 410,
+      "train_runtime": 253.312,
+      "train_tokens_per_second": 1388.075
+    },
+    {
+      "entropy": 0.6745355784893036,
+      "epoch": 0.83,
+      "grad_norm": 6.5,
+      "learning_rate": 1.511899920954656e-06,
+      "loss": 0.3404,
+      "mean_token_accuracy": 0.8947509765625,
+      "num_input_tokens_seen": 355522,
+      "num_tokens": 355522.0,
+      "step": 415,
+      "train_runtime": 255.8146,
+      "train_tokens_per_second": 1389.764
+    },
+    {
+      "entropy": 0.4722595751285553,
+      "epoch": 0.84,
+      "grad_norm": 14.3125,
+      "learning_rate": 1.3451489334085555e-06,
+      "loss": 0.4371,
+      "mean_token_accuracy": 0.8792779207229614,
+      "num_input_tokens_seen": 359166,
+      "num_tokens": 359166.0,
+      "step": 420,
+      "train_runtime": 258.2177,
+      "train_tokens_per_second": 1390.943
+    },
+    {
+      "entropy": 0.9886871218681336,
+      "epoch": 0.85,
+      "grad_norm": 5.25,
+      "learning_rate": 1.1874756894740137e-06,
+      "loss": 0.4125,
+      "mean_token_accuracy": 0.8851464509963989,
+      "num_input_tokens_seen": 364544,
+      "num_tokens": 364544.0,
+      "step": 425,
+      "train_runtime": 261.0586,
+      "train_tokens_per_second": 1396.407
+    },
+    {
+      "entropy": 0.9293424725532532,
+      "epoch": 0.86,
+      "grad_norm": 5.21875,
+      "learning_rate": 1.0390455666106547e-06,
+      "loss": 0.3954,
+      "mean_token_accuracy": 0.8745028972625732,
+      "num_input_tokens_seen": 370002,
+      "num_tokens": 370002.0,
+      "step": 430,
+      "train_runtime": 263.9087,
+      "train_tokens_per_second": 1402.007
+    },
+    {
+      "entropy": 0.5260325253009797,
+      "epoch": 0.87,
+      "grad_norm": 10.4375,
+      "learning_rate": 9.000142475204965e-07,
+      "loss": 0.2916,
+      "mean_token_accuracy": 0.9067874073982238,
+      "num_input_tokens_seen": 375736,
+      "num_tokens": 375736.0,
+      "step": 435,
+      "train_runtime": 266.7797,
+      "train_tokens_per_second": 1408.413
+    },
+    {
+      "entropy": 0.5771724641323089,
+      "epoch": 0.88,
+      "grad_norm": 4.96875,
+      "learning_rate": 7.705275568582848e-07,
+      "loss": 0.3105,
+      "mean_token_accuracy": 0.908876633644104,
+      "num_input_tokens_seen": 379930,
+      "num_tokens": 379930.0,
+      "step": 440,
+      "train_runtime": 269.3582,
+      "train_tokens_per_second": 1410.501
+    },
+    {
+      "entropy": 0.5990758180618286,
+      "epoch": 0.89,
+      "grad_norm": 12.0,
+      "learning_rate": 6.507213082815745e-07,
+      "loss": 0.3247,
+      "mean_token_accuracy": 0.8999800682067871,
+      "num_input_tokens_seen": 383068,
+      "num_tokens": 383068.0,
+      "step": 445,
+      "train_runtime": 271.6586,
+      "train_tokens_per_second": 1410.108
+    },
+    {
+      "entropy": 0.3203736484050751,
+      "epoch": 0.9,
+      "grad_norm": 4.34375,
+      "learning_rate": 5.407211620009545e-07,
+      "loss": 0.3401,
+      "mean_token_accuracy": 0.8904389262199401,
+      "num_input_tokens_seen": 388016,
+      "num_tokens": 388016.0,
+      "step": 450,
+      "train_runtime": 274.3517,
+      "train_tokens_per_second": 1414.301
+    },
+    {
+      "entropy": 0.36918359696865083,
+      "epoch": 0.91,
+      "grad_norm": 4.375,
+      "learning_rate": 4.406424929798403e-07,
+      "loss": 0.2458,
+      "mean_token_accuracy": 0.9116828203201294,
+      "num_input_tokens_seen": 392392,
+      "num_tokens": 392392.0,
+      "step": 455,
+      "train_runtime": 276.8995,
+      "train_tokens_per_second": 1417.092
+    },
+    {
+      "entropy": 0.6511303842067718,
+      "epoch": 0.92,
+      "grad_norm": 6.5625,
+      "learning_rate": 3.5059026992206645e-07,
+      "loss": 0.4391,
+      "mean_token_accuracy": 0.8760095119476319,
+      "num_input_tokens_seen": 395760,
+      "num_tokens": 395760.0,
+      "step": 460,
+      "train_runtime": 279.2527,
+      "train_tokens_per_second": 1417.211
+    },
+    {
+      "entropy": 0.7154091179370881,
+      "epoch": 0.93,
+      "grad_norm": 10.125,
+      "learning_rate": 2.706589451742181e-07,
+      "loss": 0.3676,
+      "mean_token_accuracy": 0.8919078826904296,
+      "num_input_tokens_seen": 398977,
+      "num_tokens": 398977.0,
+      "step": 465,
+      "train_runtime": 281.5974,
+      "train_tokens_per_second": 1416.835
+    },
+    {
+      "entropy": 0.4087792098522186,
+      "epoch": 0.94,
+      "grad_norm": 4.15625,
+      "learning_rate": 2.009323556581566e-07,
+      "loss": 0.3225,
+      "mean_token_accuracy": 0.8970118045806885,
+      "num_input_tokens_seen": 402758,
+      "num_tokens": 402758.0,
+      "step": 470,
+      "train_runtime": 284.0365,
+      "train_tokens_per_second": 1417.98
+    },
+    {
+      "entropy": 0.8475154399871826,
+      "epoch": 0.95,
+      "grad_norm": 6.15625,
+      "learning_rate": 1.4148363493766803e-07,
+      "loss": 0.2751,
+      "mean_token_accuracy": 0.9305037021636963,
+      "num_input_tokens_seen": 405718,
+      "num_tokens": 405718.0,
+      "step": 475,
+      "train_runtime": 286.3033,
+      "train_tokens_per_second": 1417.092
+    },
+    {
+      "entropy": 0.8792379856109619,
+      "epoch": 0.96,
+      "grad_norm": 7.84375,
+      "learning_rate": 9.237513651145224e-08,
+      "loss": 0.3242,
+      "mean_token_accuracy": 0.886387574672699,
+      "num_input_tokens_seen": 409156,
+      "num_tokens": 409156.0,
+      "step": 480,
+      "train_runtime": 288.6685,
+      "train_tokens_per_second": 1417.39
+    },
+    {
+      "entropy": 0.6999157905578614,
+      "epoch": 0.97,
+      "grad_norm": 10.5,
+      "learning_rate": 5.365836841291439e-08,
+      "loss": 0.4644,
+      "mean_token_accuracy": 0.8499702572822571,
+      "num_input_tokens_seen": 411896,
+      "num_tokens": 411896.0,
+      "step": 485,
+      "train_runtime": 290.8465,
+      "train_tokens_per_second": 1416.197
+    },
+    {
+      "entropy": 0.5511339485645295,
+      "epoch": 0.98,
+      "grad_norm": 4.75,
+      "learning_rate": 2.537393918535358e-08,
+      "loss": 0.3121,
+      "mean_token_accuracy": 0.9038938760757447,
+      "num_input_tokens_seen": 416390,
+      "num_tokens": 416390.0,
+      "step": 490,
+      "train_runtime": 293.4783,
+      "train_tokens_per_second": 1418.81
+    },
+    {
+      "entropy": 0.8974492073059082,
+      "epoch": 0.99,
+      "grad_norm": 23.5,
+      "learning_rate": 7.551515289203615e-09,
+      "loss": 0.2867,
+      "mean_token_accuracy": 0.9067962288856506,
+      "num_input_tokens_seen": 420619,
+      "num_tokens": 420619.0,
+      "step": 495,
+      "train_runtime": 296.0593,
+      "train_tokens_per_second": 1420.725
+    },
+    {
+      "entropy": 0.7672029852867126,
+      "epoch": 1.0,
+      "grad_norm": 29.75,
+      "learning_rate": 2.0978998601206558e-10,
+      "loss": 0.4602,
+      "mean_token_accuracy": 0.8477967858314515,
+      "num_input_tokens_seen": 424444,
+      "num_tokens": 424444.0,
+      "step": 500,
+      "train_runtime": 298.5337,
+      "train_tokens_per_second": 1421.763
+    },
+    {
+      "epoch": 1.0,
+      "num_input_tokens_seen": 424444,
+      "step": 500,
+      "total_flos": 1.972943710728192e+16,
+      "train_loss": 0.3892666745185852,
+      "train_runtime": 298.5529,
+      "train_samples_per_second": 1.675,
+      "train_steps_per_second": 1.675,
+      "train_tokens_per_second": 2758.017
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 500,
+  "num_input_tokens_seen": 424444,
+  "num_train_epochs": 1,
+  "save_steps": 0,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": false,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.972943710728192e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9fecb476f6e32b58071a12f031b6e22b6f84db2cf2477b0ab37ac362e5a69726
+size 6353

video_preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "crop_size": null,
+  "data_format": "channels_first",
+  "default_to_square": true,
+  "device": null,
+  "do_center_crop": null,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "do_sample_frames": false,
+  "fps": null,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "input_data_format": null,
+  "max_frames": 768,
+  "max_pixels": 12845056,
+  "merge_size": 2,
+  "min_frames": 4,
+  "min_pixels": 3136,
+  "num_frames": null,
+  "pad_size": null,
+  "patch_size": 14,
+  "processor_class": "Qwen2_5_VLProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "return_metadata": false,
+  "size": {
+    "longest_edge": 12845056,
+    "shortest_edge": 3136
+  },
+  "temporal_patch_size": 2,
+  "video_metadata": null,
+  "video_processor_type": "Qwen2VLVideoProcessor"
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff