lihy285 committed on Feb 11

Commit

484f036

verified ·

1 Parent(s): 909ce3a

Delete QeoThinker-VGGT-Qwen25VL-7B-Vanilla

Browse files

Files changed (18) hide show

QeoThinker-VGGT-Qwen25VL-7B-Vanilla/added_tokens.json +0 -24
QeoThinker-VGGT-Qwen25VL-7B-Vanilla/chat_template.json +0 -3
QeoThinker-VGGT-Qwen25VL-7B-Vanilla/config.json +0 -87
QeoThinker-VGGT-Qwen25VL-7B-Vanilla/generation_config.json +0 -12
QeoThinker-VGGT-Qwen25VL-7B-Vanilla/merges.txt +0 -0
QeoThinker-VGGT-Qwen25VL-7B-Vanilla/model-00001-of-00005.safetensors +0 -3
QeoThinker-VGGT-Qwen25VL-7B-Vanilla/model-00002-of-00005.safetensors +0 -3
QeoThinker-VGGT-Qwen25VL-7B-Vanilla/model-00003-of-00005.safetensors +0 -3
QeoThinker-VGGT-Qwen25VL-7B-Vanilla/model-00004-of-00005.safetensors +0 -3
QeoThinker-VGGT-Qwen25VL-7B-Vanilla/model-00005-of-00005.safetensors +0 -3
QeoThinker-VGGT-Qwen25VL-7B-Vanilla/model.safetensors.index.json +0 -0
QeoThinker-VGGT-Qwen25VL-7B-Vanilla/preprocessor_config.json +0 -29
QeoThinker-VGGT-Qwen25VL-7B-Vanilla/special_tokens_map.json +0 -31
QeoThinker-VGGT-Qwen25VL-7B-Vanilla/tokenizer_config.json +0 -209
QeoThinker-VGGT-Qwen25VL-7B-Vanilla/train.log +0 -928
QeoThinker-VGGT-Qwen25VL-7B-Vanilla/trainer_state.json +0 -3298
QeoThinker-VGGT-Qwen25VL-7B-Vanilla/training_args.bin +0 -3
QeoThinker-VGGT-Qwen25VL-7B-Vanilla/vocab.json +0 -0

QeoThinker-VGGT-Qwen25VL-7B-Vanilla/added_tokens.json DELETED Viewed

@@ -1,24 +0,0 @@
-{
-  "</tool_call>": 151658,
-  "<tool_call>": 151657,
-  "<|box_end|>": 151649,
-  "<|box_start|>": 151648,
-  "<|endoftext|>": 151643,
-  "<|file_sep|>": 151664,
-  "<|fim_middle|>": 151660,
-  "<|fim_pad|>": 151662,
-  "<|fim_prefix|>": 151659,
-  "<|fim_suffix|>": 151661,
-  "<|im_end|>": 151645,
-  "<|im_start|>": 151644,
-  "<|image_pad|>": 151655,
-  "<|object_ref_end|>": 151647,
-  "<|object_ref_start|>": 151646,
-  "<|quad_end|>": 151651,
-  "<|quad_start|>": 151650,
-  "<|repo_name|>": 151663,
-  "<|video_pad|>": 151656,
-  "<|vision_end|>": 151653,
-  "<|vision_pad|>": 151654,
-  "<|vision_start|>": 151652
-}

QeoThinker-VGGT-Qwen25VL-7B-Vanilla/chat_template.json DELETED Viewed

@@ -1,3 +0,0 @@
-{
-    "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
-}

QeoThinker-VGGT-Qwen25VL-7B-Vanilla/config.json DELETED Viewed

@@ -1,87 +0,0 @@
-{
-  "align_method": "zero",
-  "align_method_weight": 0.1,
-  "architectures": [
-    "Qwen2_5_VLForConditionalGenerationWithVGGT"
-  ],
-  "attention_dropout": 0.0,
-  "bos_token_id": 151643,
-  "cam_merger_type": "zero",
-  "depart_smi_token": false,
-  "eos_token_id": 151645,
-  "feature_fusion_method": "zero",
-  "fusion_num_layers": 1,
-  "geo_cross_attn": true,
-  "geo_importance_gate": true,
-  "geo_inject_version": "v1",
-  "geo_layer_interval": 1,
-  "geo_learn_bias": false,
-  "geo_spatial_bias": false,
-  "geometry_encoder_type": "vggt",
-  "geometry_merger_type": "mlp",
-  "hidden_act": "silu",
-  "hidden_size": 3584,
-  "image_token_id": 151655,
-  "initializer_range": 0.02,
-  "intermediate_size": 18944,
-  "max_position_embeddings": 128000,
-  "max_window_layers": 28,
-  "model_type": "qwen2_5_vl",
-  "num_attention_heads": 28,
-  "num_hidden_layers": 28,
-  "num_key_value_heads": 4,
-  "reference_frame": "first",
-  "rms_norm_eps": 1e-06,
-  "rope_scaling": {
-    "mrope_section": [
-      16,
-      24,
-      24
-    ],
-    "rope_type": "default",
-    "type": "default"
-  },
-  "rope_theta": 1000000.0,
-  "selection_method": "zero",
-  "selection_method_ratio": 0.25,
-  "sliding_window": 32768,
-  "smi_downsample_rate": 2,
-  "smi_image_num": 8,
-  "tie_word_embeddings": false,
-  "torch_dtype": "bfloat16",
-  "training": true,
-  "transformers_version": "4.50.0",
-  "use_cache": true,
-  "use_geometry_encoder": true,
-  "use_qwenvl_loss": false,
-  "use_sliding_window": false,
-  "video_token_id": 151656,
-  "vision_config": {
-    "depth": 32,
-    "fullatt_block_indexes": [
-      7,
-      15,
-      23,
-      31
-    ],
-    "hidden_act": "silu",
-    "hidden_size": 1280,
-    "in_channels": 3,
-    "in_chans": 3,
-    "intermediate_size": 3420,
-    "model_type": "qwen2_5_vl",
-    "num_heads": 16,
-    "out_hidden_size": 3584,
-    "patch_size": 14,
-    "spatial_merge_size": 2,
-    "spatial_patch_size": 14,
-    "temporal_patch_size": 2,
-    "tokens_per_second": 2,
-    "torch_dtype": "bfloat16",
-    "window_size": 112
-  },
-  "vision_end_token_id": 151653,
-  "vision_start_token_id": 151652,
-  "vision_token_id": 151654,
-  "vocab_size": 152064
-}

QeoThinker-VGGT-Qwen25VL-7B-Vanilla/generation_config.json DELETED Viewed

@@ -1,12 +0,0 @@
-{
-  "bos_token_id": 151643,
-  "do_sample": true,
-  "eos_token_id": [
-    151645,
-    151643
-  ],
-  "pad_token_id": 151643,
-  "repetition_penalty": 1.05,
-  "temperature": 1e-06,
-  "transformers_version": "4.50.0"
-}

QeoThinker-VGGT-Qwen25VL-7B-Vanilla/merges.txt DELETED Viewed

The diff for this file is too large to render. See raw diff

QeoThinker-VGGT-Qwen25VL-7B-Vanilla/model-00001-of-00005.safetensors DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:70ce11bb19d95f66f54c8c997ce0f8a3587c7680976b9e9fcc6bbcc359d03a2c
-size 4979504134

QeoThinker-VGGT-Qwen25VL-7B-Vanilla/model-00002-of-00005.safetensors DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3d2dd11e7ed7cb5f8d383cf84975703fdb7563ad84e8542558db5ebca92a95b3
-size 4932982360

QeoThinker-VGGT-Qwen25VL-7B-Vanilla/model-00003-of-00005.safetensors DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a83756d32444e4617234c69385c4d9c5867721afb39dc23dd648197b39720334
-size 4906423326

QeoThinker-VGGT-Qwen25VL-7B-Vanilla/model-00004-of-00005.safetensors DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f81aba14b898a4149058edbe692ad9673a87c386a234ba23b8122ffce6dda477
-size 4874252312

QeoThinker-VGGT-Qwen25VL-7B-Vanilla/model-00005-of-00005.safetensors DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a3ef632569857d037ce42eda45f2192991cc44368c507b15214a3d73db772c9a
-size 1910358204

QeoThinker-VGGT-Qwen25VL-7B-Vanilla/model.safetensors.index.json DELETED Viewed

The diff for this file is too large to render. See raw diff

QeoThinker-VGGT-Qwen25VL-7B-Vanilla/preprocessor_config.json DELETED Viewed

@@ -1,29 +0,0 @@
-{
-  "do_convert_rgb": true,
-  "do_normalize": true,
-  "do_rescale": true,
-  "do_resize": true,
-  "image_mean": [
-    0.48145466,
-    0.4578275,
-    0.40821073
-  ],
-  "image_processor_type": "Qwen2VLImageProcessor",
-  "image_std": [
-    0.26862954,
-    0.26130258,
-    0.27577711
-  ],
-  "max_pixels": 451584,
-  "merge_size": 2,
-  "min_pixels": 12544,
-  "patch_size": 14,
-  "processor_class": "Qwen2_5_VLProcessor",
-  "resample": 3,
-  "rescale_factor": 0.00392156862745098,
-  "size": {
-    "longest_edge": 451584,
-    "shortest_edge": 12544
-  },
-  "temporal_patch_size": 2
-}

QeoThinker-VGGT-Qwen25VL-7B-Vanilla/special_tokens_map.json DELETED Viewed

@@ -1,31 +0,0 @@
-{
-  "additional_special_tokens": [
-    "<|im_start|>",
-    "<|im_end|>",
-    "<|object_ref_start|>",
-    "<|object_ref_end|>",
-    "<|box_start|>",
-    "<|box_end|>",
-    "<|quad_start|>",
-    "<|quad_end|>",
-    "<|vision_start|>",
-    "<|vision_end|>",
-    "<|vision_pad|>",
-    "<|image_pad|>",
-    "<|video_pad|>"
-  ],
-  "eos_token": {
-    "content": "<|im_end|>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "pad_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  }
-}

QeoThinker-VGGT-Qwen25VL-7B-Vanilla/tokenizer_config.json DELETED Viewed

@@ -1,209 +0,0 @@
-{
-  "add_bos_token": false,
-  "add_prefix_space": false,
-  "added_tokens_decoder": {
-    "151643": {
-      "content": "<|endoftext|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151644": {
-      "content": "<|im_start|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151645": {
-      "content": "<|im_end|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151646": {
-      "content": "<|object_ref_start|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151647": {
-      "content": "<|object_ref_end|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151648": {
-      "content": "<|box_start|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151649": {
-      "content": "<|box_end|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151650": {
-      "content": "<|quad_start|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151651": {
-      "content": "<|quad_end|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151652": {
-      "content": "<|vision_start|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151653": {
-      "content": "<|vision_end|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151654": {
-      "content": "<|vision_pad|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151655": {
-      "content": "<|image_pad|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151656": {
-      "content": "<|video_pad|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151657": {
-      "content": "<tool_call>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "151658": {
-      "content": "</tool_call>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "151659": {
-      "content": "<|fim_prefix|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "151660": {
-      "content": "<|fim_middle|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "151661": {
-      "content": "<|fim_suffix|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "151662": {
-      "content": "<|fim_pad|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "151663": {
-      "content": "<|repo_name|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "151664": {
-      "content": "<|file_sep|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    }
-  },
-  "additional_special_tokens": [
-    "<|im_start|>",
-    "<|im_end|>",
-    "<|object_ref_start|>",
-    "<|object_ref_end|>",
-    "<|box_start|>",
-    "<|box_end|>",
-    "<|quad_start|>",
-    "<|quad_end|>",
-    "<|vision_start|>",
-    "<|vision_end|>",
-    "<|vision_pad|>",
-    "<|image_pad|>",
-    "<|video_pad|>"
-  ],
-  "bos_token": null,
-  "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
-  "clean_up_tokenization_spaces": false,
-  "eos_token": "<|im_end|>",
-  "errors": "replace",
-  "extra_special_tokens": {},
-  "model_max_length": 12800,
-  "pad_token": "<|endoftext|>",
-  "padding_side": "right",
-  "split_special_tokens": false,
-  "tokenizer_class": "Qwen2Tokenizer",
-  "unk_token": null
-}

QeoThinker-VGGT-Qwen25VL-7B-Vanilla/train.log DELETED Viewed

@@ -1,928 +0,0 @@
-Trainer._get_train_sampler replaced with custom implementation.
-load moxing failed
-Trainer._get_train_sampler replaced with custom implementation.
-load moxing failed
-[2026-01-14 23:17:48,691] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-Trainer._get_train_sampler replaced with custom implementation.
-load moxing failed
-Trainer._get_train_sampler replaced with custom implementation.
-Trainer._get_train_sampler replaced with custom implementation.
-load moxing failed
-load moxing failed
-Trainer._get_train_sampler replaced with custom implementation.
-Trainer._get_train_sampler replaced with custom implementation.
-load moxing failed
-load moxing failed
-Trainer._get_train_sampler replaced with custom implementation.
-load moxing failed
-[2026-01-14 23:17:49,114] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-[2026-01-14 23:17:49,227] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-[2026-01-14 23:17:49,250] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-[2026-01-14 23:17:49,269] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-[2026-01-14 23:17:49,275] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-[2026-01-14 23:17:49,275] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-[2026-01-14 23:17:49,279] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-[2026-01-14 23:17:49,620] [INFO] [comm.py:658:init_distributed] cdb=None
-Warning: FlashAttention 3 is not available, falling back to PyTorch's scaled_dot_product_attention
-/cache/pretrained/Qwen2.5-VL-7B-Instruct/
-[2026-01-14 23:17:49,980] [INFO] [comm.py:658:init_distributed] cdb=None
-[2026-01-14 23:17:50,125] [INFO] [comm.py:658:init_distributed] cdb=None
-[2026-01-14 23:17:50,163] [INFO] [comm.py:658:init_distributed] cdb=None
-[2026-01-14 23:17:50,350] [INFO] [comm.py:658:init_distributed] cdb=None
-[2026-01-14 23:17:50,357] [INFO] [comm.py:658:init_distributed] cdb=None
-[2026-01-14 23:17:50,431] [INFO] [comm.py:658:init_distributed] cdb=None
-[2026-01-14 23:17:50,469] [INFO] [comm.py:658:init_distributed] cdb=None
-Warning: FlashAttention 3 is not available, falling back to PyTorch's scaled_dot_product_attention
-Warning: FlashAttention 3 is not available, falling back to PyTorch's scaled_dot_product_attention
-Warning: FlashAttention 3 is not available, falling back to PyTorch's scaled_dot_product_attention
-Warning: FlashAttention 3 is not available, falling back to PyTorch's scaled_dot_product_attention
-/cache/pretrained/Qwen2.5-VL-7B-Instruct/
-/cache/pretrained/Qwen2.5-VL-7B-Instruct/
-/cache/pretrained/Qwen2.5-VL-7B-Instruct/
-/cache/pretrained/Qwen2.5-VL-7B-Instruct/
-Warning: FlashAttention 3 is not available, falling back to PyTorch's scaled_dot_product_attention
-Warning: FlashAttention 3 is not available, falling back to PyTorch's scaled_dot_product_attention
-Warning: FlashAttention 3 is not available, falling back to PyTorch's scaled_dot_product_attention
-/cache/pretrained/Qwen2.5-VL-7B-Instruct/
-/cache/pretrained/Qwen2.5-VL-7B-Instruct/
-/cache/pretrained/Qwen2.5-VL-7B-Instruct/
-fusion_config FeatureFusionConfig(fusion_method='zero', hidden_size=3584, num_heads=8, dropout=0.1, num_layers=1, cam_merger_type='zero', align_method='zero', selection_method='zero', selection_method_ratio=0.25, align_method_weight=0.1, training=True)
-fusion_config FeatureFusionConfig(fusion_method='zero', hidden_size=3584, num_heads=8, dropout=0.1, num_layers=1, cam_merger_type='zero', align_method='zero', selection_method='zero', selection_method_ratio=0.25, align_method_weight=0.1, training=True)
-fusion_config FeatureFusionConfig(fusion_method='zero', hidden_size=3584, num_heads=8, dropout=0.1, num_layers=1, cam_merger_type='zero', align_method='zero', selection_method='zero', selection_method_ratio=0.25, align_method_weight=0.1, training=True)
-fusion_config FeatureFusionConfig(fusion_method='zero', hidden_size=3584, num_heads=8, dropout=0.1, num_layers=1, cam_merger_type='zero', align_method='zero', selection_method='zero', selection_method_ratio=0.25, align_method_weight=0.1, training=True)
-fusion_config FeatureFusionConfig(fusion_method='zero', hidden_size=3584, num_heads=8, dropout=0.1, num_layers=1, cam_merger_type='zero', align_method='zero', selection_method='zero', selection_method_ratio=0.25, align_method_weight=0.1, training=True)
-fusion_config FeatureFusionConfig(fusion_method='zero', hidden_size=3584, num_heads=8, dropout=0.1, num_layers=1, cam_merger_type='zero', align_method='zero', selection_method='zero', selection_method_ratio=0.25, align_method_weight=0.1, training=True)
-fusion_config FeatureFusionConfig(fusion_method='zero', hidden_size=3584, num_heads=8, dropout=0.1, num_layers=1, cam_merger_type='zero', align_method='zero', selection_method='zero', selection_method_ratio=0.25, align_method_weight=0.1, training=True)
-fusion_config FeatureFusionConfig(fusion_method='zero', hidden_size=3584, num_heads=8, dropout=0.1, num_layers=1, cam_merger_type='zero', align_method='zero', selection_method='zero', selection_method_ratio=0.25, align_method_weight=0.1, training=True)
-Loading weights from local directory
-Loading weights from local directory
-Loading weights from local directory
-Loading weights from local directory
-Loading weights from local directory
-Loading weights from local directory
-Loading weights from local directory
-Loading weights from local directory
-dataset_names ['llava_hound_64k', 'spar_234k']
-Loading datasets: [{'annotation_path': 'data/train/llava_hound_64k.json', 'data_path': 'data/media', 'tag': '2d', 'sampling_rate': 1.0, 'dataset_name': 'llava_hound_64k'}, {'annotation_path': 'data/train/spar_234k.json', 'data_path': 'data/media', 'tag': '3d', 'sampling_rate': 1.0, 'dataset_name': 'spar_234k'}]
-dataset_names ['llava_hound_64k', 'spar_234k']
-Loading datasets: [{'annotation_path': 'data/train/llava_hound_64k.json', 'data_path': 'data/media', 'tag': '2d', 'sampling_rate': 1.0, 'dataset_name': 'llava_hound_64k'}, {'annotation_path': 'data/train/spar_234k.json', 'data_path': 'data/media', 'tag': '3d', 'sampling_rate': 1.0, 'dataset_name': 'spar_234k'}]
-dataset_names ['llava_hound_64k', 'spar_234k']
-Loading datasets: [{'annotation_path': 'data/train/llava_hound_64k.json', 'data_path': 'data/media', 'tag': '2d', 'sampling_rate': 1.0, 'dataset_name': 'llava_hound_64k'}, {'annotation_path': 'data/train/spar_234k.json', 'data_path': 'data/media', 'tag': '3d', 'sampling_rate': 1.0, 'dataset_name': 'spar_234k'}]
-Qwen2_5_VLConfig {
-  "_attn_implementation_autoset": true,
-  "align_method": "zero",
-  "align_method_weight": 0.1,
-  "architectures": [
-    "Qwen2_5_VLForConditionalGeneration"
-  ],
-  "attention_dropout": 0.0,
-  "bos_token_id": 151643,
-  "cam_merger_type": "zero",
-  "depart_smi_token": false,
-  "eos_token_id": 151645,
-  "feature_fusion_method": "zero",
-  "fusion_num_layers": 1,
-  "geo_cross_attn": true,
-  "geo_importance_gate": true,
-  "geo_inject_version": "v4",
-  "geo_layer_interval": 1,
-  "geo_learn_bias": false,
-  "geo_spatial_bias": false,
-  "geometry_encoder_type": "vggt",
-  "geometry_merger_type": "mlp",
-  "hidden_act": "silu",
-  "hidden_size": 3584,
-  "image_token_id": 151655,
-  "initializer_range": 0.02,
-  "intermediate_size": 18944,
-  "max_position_embeddings": 128000,
-  "max_window_layers": 28,
-  "model_type": "qwen2_5_vl",
-  "num_attention_heads": 28,
-  "num_hidden_layers": 28,
-  "num_key_value_heads": 4,
-  "reference_frame": "first",
-  "rms_norm_eps": 1e-06,
-  "rope_scaling": {
-    "mrope_section": [
-      16,
-      24,
-      24
-    ],
-    "rope_type": "default",
-    "type": "default"
-  },
-  "rope_theta": 1000000.0,
-  "selection_method": "zero",
-  "selection_method_ratio": 0.25,
-  "sliding_window": 32768,
-  "smi_downsample_rate": 2,
-  "smi_image_num": 8,
-  "tie_word_embeddings": false,
-  "torch_dtype": "bfloat16",
-  "training": true,
-  "transformers_version": "4.50.0",
-  "use_cache": false,
-  "use_geometry_encoder": true,
-  "use_qwenvl_loss": false,
-  "use_sliding_window": false,
-  "video_token_id": 151656,
-  "vision_config": {
-    "depth": 32,
-    "fullatt_block_indexes": [
-      7,
-      15,
-      23,
-      31
-    ],
-    "hidden_act": "silu",
-    "hidden_size": 1280,
-    "in_channels": 3,
-    "in_chans": 3,
-    "intermediate_size": 3420,
-    "model_type": "qwen2_5_vl",
-    "num_heads": 16,
-    "out_hidden_size": 3584,
-    "patch_size": 14,
-    "spatial_merge_size": 2,
-    "spatial_patch_size": 14,
-    "temporal_patch_size": 2,
-    "tokens_per_second": 2,
-    "torch_dtype": "bfloat16",
-    "window_size": 112
-  },
-  "vision_end_token_id": 151653,
-  "vision_start_token_id": 151652,
-  "vision_token_id": 151654,
-  "vocab_size": 152064
-}
-====                                          ====
-====  Only training the following parameters  ====
-====                                          ====
-	 geometry_merger.ln_q.weight torch.Size([2048])
-	 geometry_merger.mlp.0.weight torch.Size([4096, 8192])
-	 geometry_merger.mlp.0.bias torch.Size([4096])
-	 geometry_merger.mlp.2.weight torch.Size([3584, 4096])
-	 geometry_merger.mlp.2.bias torch.Size([3584])
-	 geometry_merger.camera_mlp.0.weight torch.Size([4096, 2048])
-	 geometry_merger.camera_mlp.0.bias torch.Size([4096])
-	 geometry_merger.camera_mlp.2.weight torch.Size([3584, 4096])
-	 geometry_merger.camera_mlp.2.bias torch.Size([3584])
-	 model.embed_tokens.weight torch.Size([152064, 3584])
-	 model.layers.0.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.0.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.0.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.0.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.0.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.0.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.0.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.0.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.0.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.0.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.0.input_layernorm.weight torch.Size([3584])
-	 model.layers.0.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.0.cross_attn.gate torch.Size([])
-	 model.layers.0.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.0.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.0.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.0.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.0.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.0.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.0.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.0.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.0.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.0.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.0.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.0.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.0.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.1.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.1.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.1.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.1.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.1.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.1.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.1.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.1.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.1.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.1.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.1.input_layernorm.weight torch.Size([3584])
-	 model.layers.1.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.1.cross_attn.gate torch.Size([])
-	 model.layers.1.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.1.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.1.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.1.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.1.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.1.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.1.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.1.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.1.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.1.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.1.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.1.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.1.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.2.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.2.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.2.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.2.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.2.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.2.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.2.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.2.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.2.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.2.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.2.input_layernorm.weight torch.Size([3584])
-	 model.layers.2.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.2.cross_attn.gate torch.Size([])
-	 model.layers.2.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.2.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.2.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.2.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.2.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.2.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.2.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.2.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.2.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.2.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.2.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.2.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.2.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.3.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.3.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.3.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.3.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.3.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.3.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.3.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.3.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.3.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.3.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.3.input_layernorm.weight torch.Size([3584])
-	 model.layers.3.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.3.cross_attn.gate torch.Size([])
-	 model.layers.3.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.3.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.3.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.3.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.3.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.3.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.3.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.3.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.3.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.3.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.3.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.3.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.3.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.4.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.4.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.4.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.4.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.4.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.4.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.4.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.4.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.4.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.4.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.4.input_layernorm.weight torch.Size([3584])
-	 model.layers.4.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.4.cross_attn.gate torch.Size([])
-	 model.layers.4.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.4.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.4.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.4.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.4.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.4.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.4.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.4.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.4.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.4.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.4.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.4.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.4.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.5.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.5.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.5.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.5.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.5.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.5.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.5.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.5.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.5.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.5.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.5.input_layernorm.weight torch.Size([3584])
-	 model.layers.5.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.5.cross_attn.gate torch.Size([])
-	 model.layers.5.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.5.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.5.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.5.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.5.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.5.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.5.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.5.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.5.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.5.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.5.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.5.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.5.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.6.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.6.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.6.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.6.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.6.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.6.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.6.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.6.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.6.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.6.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.6.input_layernorm.weight torch.Size([3584])
-	 model.layers.6.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.6.cross_attn.gate torch.Size([])
-	 model.layers.6.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.6.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.6.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.6.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.6.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.6.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.6.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.6.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.6.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.6.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.6.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.6.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.6.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.7.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.7.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.7.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.7.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.7.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.7.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.7.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.7.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.7.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.7.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.7.input_layernorm.weight torch.Size([3584])
-	 model.layers.7.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.7.cross_attn.gate torch.Size([])
-	 model.layers.7.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.7.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.7.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.7.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.7.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.7.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.7.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.7.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.7.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.7.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.7.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.7.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.7.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.8.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.8.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.8.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.8.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.8.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.8.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.8.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.8.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.8.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.8.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.8.input_layernorm.weight torch.Size([3584])
-	 model.layers.8.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.8.cross_attn.gate torch.Size([])
-	 model.layers.8.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.8.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.8.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.8.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.8.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.8.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.8.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.8.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.8.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.8.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.8.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.8.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.8.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.9.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.9.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.9.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.9.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.9.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.9.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.9.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.9.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.9.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.9.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.9.input_layernorm.weight torch.Size([3584])
-	 model.layers.9.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.9.cross_attn.gate torch.Size([])
-	 model.layers.9.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.9.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.9.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.9.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.9.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.9.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.9.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.9.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.9.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.9.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.9.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.9.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.9.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.10.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.10.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.10.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.10.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.10.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.10.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.10.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.10.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.10.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.10.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.10.input_layernorm.weight torch.Size([3584])
-	 model.layers.10.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.10.cross_attn.gate torch.Size([])
-	 model.layers.10.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.10.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.10.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.10.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.10.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.10.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.10.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.10.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.10.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.10.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.10.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.10.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.10.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.11.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.11.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.11.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.11.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.11.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.11.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.11.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.11.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.11.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.11.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.11.input_layernorm.weight torch.Size([3584])
-	 model.layers.11.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.11.cross_attn.gate torch.Size([])
-	 model.layers.11.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.11.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.11.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.11.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.11.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.11.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.11.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.11.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.11.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.11.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.11.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.11.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.11.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.12.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.12.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.12.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.12.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.12.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.12.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.12.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.12.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.12.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.12.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.12.input_layernorm.weight torch.Size([3584])
-	 model.layers.12.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.12.cross_attn.gate torch.Size([])
-	 model.layers.12.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.12.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.12.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.12.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.12.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.12.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.12.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.12.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.12.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.12.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.12.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.12.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.12.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.13.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.13.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.13.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.13.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.13.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.13.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.13.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.13.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.13.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.13.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.13.input_layernorm.weight torch.Size([3584])
-	 model.layers.13.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.13.cross_attn.gate torch.Size([])
-	 model.layers.13.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.13.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.13.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.13.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.13.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.13.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.13.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.13.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.13.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.13.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.13.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.13.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.13.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.14.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.14.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.14.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.14.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.14.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.14.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.14.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.14.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.14.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.14.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.14.input_layernorm.weight torch.Size([3584])
-	 model.layers.14.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.14.cross_attn.gate torch.Size([])
-	 model.layers.14.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.14.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.14.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.14.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.14.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.14.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.14.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.14.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.14.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.14.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.14.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.14.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.14.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.15.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.15.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.15.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.15.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.15.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.15.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.15.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.15.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.15.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.15.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.15.input_layernorm.weight torch.Size([3584])
-	 model.layers.15.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.15.cross_attn.gate torch.Size([])
-	 model.layers.15.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.15.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.15.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.15.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.15.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.15.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.15.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.15.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.15.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.15.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.15.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.15.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.15.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.16.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.16.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.16.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.16.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.16.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.16.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.16.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.16.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.16.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.16.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.16.input_layernorm.weight torch.Size([3584])
-	 model.layers.16.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.16.cross_attn.gate torch.Size([])
-	 model.layers.16.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.16.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.16.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.16.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.16.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.16.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.16.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.16.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.16.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.16.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.16.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.16.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.16.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.17.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.17.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.17.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.17.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.17.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.17.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.17.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.17.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.17.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.17.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.17.input_layernorm.weight torch.Size([3584])
-	 model.layers.17.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.17.cross_attn.gate torch.Size([])
-	 model.layers.17.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.17.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.17.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.17.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.17.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.17.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.17.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.17.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.17.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.17.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.17.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.17.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.17.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.18.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.18.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.18.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.18.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.18.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.18.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.18.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.18.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.18.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.18.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.18.input_layernorm.weight torch.Size([3584])
-	 model.layers.18.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.18.cross_attn.gate torch.Size([])
-	 model.layers.18.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.18.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.18.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.18.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.18.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.18.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.18.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.18.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.18.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.18.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.18.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.18.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.18.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.19.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.19.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.19.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.19.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.19.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.19.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.19.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.19.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.19.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.19.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.19.input_layernorm.weight torch.Size([3584])
-	 model.layers.19.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.19.cross_attn.gate torch.Size([])
-	 model.layers.19.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.19.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.19.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.19.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.19.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.19.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.19.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.19.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.19.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.19.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.19.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.19.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.19.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.20.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.20.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.20.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.20.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.20.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.20.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.20.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.20.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.20.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.20.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.20.input_layernorm.weight torch.Size([3584])
-	 model.layers.20.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.20.cross_attn.gate torch.Size([])
-	 model.layers.20.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.20.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.20.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.20.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.20.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.20.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.20.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.20.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.20.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.20.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.20.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.20.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.20.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.21.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.21.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.21.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.21.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.21.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.21.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.21.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.21.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.21.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.21.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.21.input_layernorm.weight torch.Size([3584])
-	 model.layers.21.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.21.cross_attn.gate torch.Size([])
-	 model.layers.21.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.21.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.21.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.21.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.21.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.21.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.21.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.21.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.21.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.21.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.21.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.21.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.21.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.22.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.22.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.22.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.22.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.22.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.22.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.22.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.22.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.22.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.22.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.22.input_layernorm.weight torch.Size([3584])
-	 model.layers.22.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.22.cross_attn.gate torch.Size([])
-	 model.layers.22.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.22.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.22.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.22.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.22.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.22.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.22.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.22.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.22.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.22.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.22.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.22.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.22.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.23.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.23.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.23.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.23.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.23.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.23.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.23.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.23.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.23.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.23.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.23.input_layernorm.weight torch.Size([3584])
-	 model.layers.23.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.23.cross_attn.gate torch.Size([])
-	 model.layers.23.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.23.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.23.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.23.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.23.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.23.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.23.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.23.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.23.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.23.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.23.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.23.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.23.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.24.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.24.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.24.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.24.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.24.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.24.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.24.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.24.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.24.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.24.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.24.input_layernorm.weight torch.Size([3584])
-	 model.layers.24.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.24.cross_attn.gate torch.Size([])
-	 model.layers.24.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.24.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.24.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.24.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.24.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.24.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.24.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.24.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.24.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.24.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.24.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.24.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.24.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.25.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.25.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.25.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.25.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.25.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.25.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.25.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.25.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.25.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.25.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.25.input_layernorm.weight torch.Size([3584])
-	 model.layers.25.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.25.cross_attn.gate torch.Size([])
-	 model.layers.25.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.25.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.25.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.25.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.25.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.25.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.25.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.25.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.25.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.25.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.25.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.25.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.25.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.26.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.26.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.26.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.26.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.26.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.26.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.26.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.26.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.26.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.26.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.26.input_layernorm.weight torch.Size([3584])
-	 model.layers.26.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.26.cross_attn.gate torch.Size([])
-	 model.layers.26.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.26.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.26.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.26.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.26.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.26.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.26.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.26.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.26.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.26.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.26.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.26.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.26.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.layers.27.self_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.27.self_attn.q_proj.bias torch.Size([3584])
-	 model.layers.27.self_attn.k_proj.weight torch.Size([512, 3584])
-	 model.layers.27.self_attn.k_proj.bias torch.Size([512])
-	 model.layers.27.self_attn.v_proj.weight torch.Size([512, 3584])
-	 model.layers.27.self_attn.v_proj.bias torch.Size([512])
-	 model.layers.27.self_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.27.mlp.gate_proj.weight torch.Size([18944, 3584])
-	 model.layers.27.mlp.up_proj.weight torch.Size([18944, 3584])
-	 model.layers.27.mlp.down_proj.weight torch.Size([3584, 18944])
-	 model.layers.27.input_layernorm.weight torch.Size([3584])
-	 model.layers.27.post_attention_layernorm.weight torch.Size([3584])
-	 model.layers.27.cross_attn.gate torch.Size([])
-	 model.layers.27.cross_attn.spatial_scale_param torch.Size([])
-	 model.layers.27.cross_attn.input_layernorm.weight torch.Size([3584])
-	 model.layers.27.cross_attn.input_layernorm.bias torch.Size([3584])
-	 model.layers.27.cross_attn.vggt_layernorm.weight torch.Size([3584])
-	 model.layers.27.cross_attn.vggt_layernorm.bias torch.Size([3584])
-	 model.layers.27.cross_attn.q_proj.weight torch.Size([3584, 3584])
-	 model.layers.27.cross_attn.k_proj.weight torch.Size([3584, 3584])
-	 model.layers.27.cross_attn.v_proj.weight torch.Size([3584, 3584])
-	 model.layers.27.cross_attn.o_proj.weight torch.Size([3584, 3584])
-	 model.layers.27.cross_attn.importance_net.0.weight torch.Size([896, 3584])
-	 model.layers.27.cross_attn.importance_net.0.bias torch.Size([896])
-	 model.layers.27.cross_attn.importance_net.2.weight torch.Size([1, 896])
-	 model.layers.27.cross_attn.importance_net.2.bias torch.Size([1])
-	 model.norm.weight torch.Size([3584])
-	 lm_head.weight torch.Size([152064, 3584])
-dataset_names ['llava_hound_64k', 'spar_234k']
-Loading datasets: [{'annotation_path': 'data/train/llava_hound_64k.json', 'data_path': 'data/media', 'tag': '2d', 'sampling_rate': 1.0, 'dataset_name': 'llava_hound_64k'}, {'annotation_path': 'data/train/spar_234k.json', 'data_path': 'data/media', 'tag': '3d', 'sampling_rate': 1.0, 'dataset_name': 'spar_234k'}]
-dataset_names ['llava_hound_64k', 'spar_234k']
-Loading datasets: [{'annotation_path': 'data/train/llava_hound_64k.json', 'data_path': 'data/media', 'tag': '2d', 'sampling_rate': 1.0, 'dataset_name': 'llava_hound_64k'}, {'annotation_path': 'data/train/spar_234k.json', 'data_path': 'data/media', 'tag': '3d', 'sampling_rate': 1.0, 'dataset_name': 'spar_234k'}]
-dataset_names ['llava_hound_64k', 'spar_234k']
-Loading datasets: [{'annotation_path': 'data/train/llava_hound_64k.json', 'data_path': 'data/media', 'tag': '2d', 'sampling_rate': 1.0, 'dataset_name': 'llava_hound_64k'}, {'annotation_path': 'data/train/spar_234k.json', 'data_path': 'data/media', 'tag': '3d', 'sampling_rate': 1.0, 'dataset_name': 'spar_234k'}]
-dataset_names ['llava_hound_64k', 'spar_234k']
-Loading datasets: [{'annotation_path': 'data/train/llava_hound_64k.json', 'data_path': 'data/media', 'tag': '2d', 'sampling_rate': 1.0, 'dataset_name': 'llava_hound_64k'}, {'annotation_path': 'data/train/spar_234k.json', 'data_path': 'data/media', 'tag': '3d', 'sampling_rate': 1.0, 'dataset_name': 'spar_234k'}]
-dataset_names ['llava_hound_64k', 'spar_234k']
-Loading datasets: [{'annotation_path': 'data/train/llava_hound_64k.json', 'data_path': 'data/media', 'tag': '2d', 'sampling_rate': 1.0, 'dataset_name': 'llava_hound_64k'}, {'annotation_path': 'data/train/spar_234k.json', 'data_path': 'data/media', 'tag': '3d', 'sampling_rate': 1.0, 'dataset_name': 'spar_234k'}]
-Total training samples: 298027
-Formatting inputs...Skip in lazy mode
-Total training samples: 298027
-Total training samples: 298027
-Total training samples: 298027
-Total training samples: 298027
-Total training samples: 298027
-Formatting inputs...Skip in lazy mode
-Formatting inputs...Skip in lazy mode
-Formatting inputs...Skip in lazy mode
-Total training samples: 298027
-Total training samples: 298027
-Formatting inputs...Skip in lazy mode
-Formatting inputs...Skip in lazy mode
-Formatting inputs...Skip in lazy mode
-Formatting inputs...Skip in lazy mode

QeoThinker-VGGT-Qwen25VL-7B-Vanilla/trainer_state.json DELETED Viewed

@@ -1,3298 +0,0 @@
-{
-  "best_global_step": null,
-  "best_metric": null,
-  "best_model_checkpoint": null,
-  "epoch": 0.9998389434691577,
-  "eval_steps": 500,
-  "global_step": 4656,
-  "is_hyper_param_search": false,
-  "is_local_process_zero": true,
-  "is_world_process_zero": true,
-  "log_history": [
-    {
-      "epoch": 0.0021474204112310086,
-      "grad_norm": 114.95087432861328,
-      "learning_rate": 7.142857142857143e-07,
-      "loss": 8.1645,
-      "step": 10
-    },
-    {
-      "epoch": 0.004294840822462017,
-      "grad_norm": 57.820430755615234,
-      "learning_rate": 1.4285714285714286e-06,
-      "loss": 7.1175,
-      "step": 20
-    },
-    {
-      "epoch": 0.006442261233693026,
-      "grad_norm": 38.31983947753906,
-      "learning_rate": 2.1428571428571427e-06,
-      "loss": 6.2552,
-      "step": 30
-    },
-    {
-      "epoch": 0.008589681644924034,
-      "grad_norm": 30.298812866210938,
-      "learning_rate": 2.8571428571428573e-06,
-      "loss": 5.4179,
-      "step": 40
-    },
-    {
-      "epoch": 0.010737102056155044,
-      "grad_norm": 31.72263526916504,
-      "learning_rate": 3.5714285714285718e-06,
-      "loss": 4.9746,
-      "step": 50
-    },
-    {
-      "epoch": 0.012884522467386052,
-      "grad_norm": 25.22931480407715,
-      "learning_rate": 4.2857142857142855e-06,
-      "loss": 4.6509,
-      "step": 60
-    },
-    {
-      "epoch": 0.01503194287861706,
-      "grad_norm": 28.012941360473633,
-      "learning_rate": 5e-06,
-      "loss": 4.2574,
-      "step": 70
-    },
-    {
-      "epoch": 0.01717936328984807,
-      "grad_norm": 27.468263626098633,
-      "learning_rate": 5.7142857142857145e-06,
-      "loss": 3.806,
-      "step": 80
-    },
-    {
-      "epoch": 0.01932678370107908,
-      "grad_norm": 22.046875,
-      "learning_rate": 6.4285714285714295e-06,
-      "loss": 3.8487,
-      "step": 90
-    },
-    {
-      "epoch": 0.02147420411231009,
-      "grad_norm": 20.30846405029297,
-      "learning_rate": 7.1428571428571436e-06,
-      "loss": 2.9113,
-      "step": 100
-    },
-    {
-      "epoch": 0.023621624523541097,
-      "grad_norm": 23.86836051940918,
-      "learning_rate": 7.857142857142858e-06,
-      "loss": 3.472,
-      "step": 110
-    },
-    {
-      "epoch": 0.025769044934772105,
-      "grad_norm": 26.69591522216797,
-      "learning_rate": 8.571428571428571e-06,
-      "loss": 3.6422,
-      "step": 120
-    },
-    {
-      "epoch": 0.027916465346003113,
-      "grad_norm": 20.753564834594727,
-      "learning_rate": 9.285714285714288e-06,
-      "loss": 3.0168,
-      "step": 130
-    },
-    {
-      "epoch": 0.03006388575723412,
-      "grad_norm": 29.848037719726562,
-      "learning_rate": 1e-05,
-      "loss": 3.9783,
-      "step": 140
-    },
-    {
-      "epoch": 0.03221130616846513,
-      "grad_norm": 20.223854064941406,
-      "learning_rate": 9.999879015387978e-06,
-      "loss": 3.51,
-      "step": 150
-    },
-    {
-      "epoch": 0.03435872657969614,
-      "grad_norm": 18.951417922973633,
-      "learning_rate": 9.999516067406818e-06,
-      "loss": 2.9719,
-      "step": 160
-    },
-    {
-      "epoch": 0.03650614699092715,
-      "grad_norm": 18.97931671142578,
-      "learning_rate": 9.998911173620972e-06,
-      "loss": 2.2938,
-      "step": 170
-    },
-    {
-      "epoch": 0.03865356740215816,
-      "grad_norm": 23.515886306762695,
-      "learning_rate": 9.998064363303573e-06,
-      "loss": 2.8837,
-      "step": 180
-    },
-    {
-      "epoch": 0.040800987813389165,
-      "grad_norm": 22.68735694885254,
-      "learning_rate": 9.996975677435031e-06,
-      "loss": 2.8285,
-      "step": 190
-    },
-    {
-      "epoch": 0.04294840822462018,
-      "grad_norm": 20.7940731048584,
-      "learning_rate": 9.995645168701038e-06,
-      "loss": 2.1357,
-      "step": 200
-    },
-    {
-      "epoch": 0.04509582863585118,
-      "grad_norm": 26.744306564331055,
-      "learning_rate": 9.994072901490028e-06,
-      "loss": 2.807,
-      "step": 210
-    },
-    {
-      "epoch": 0.04724324904708219,
-      "grad_norm": 18.151416778564453,
-      "learning_rate": 9.992258951890057e-06,
-      "loss": 2.0274,
-      "step": 220
-    },
-    {
-      "epoch": 0.0493906694583132,
-      "grad_norm": 17.781078338623047,
-      "learning_rate": 9.990203407685122e-06,
-      "loss": 2.6628,
-      "step": 230
-    },
-    {
-      "epoch": 0.05153808986954421,
-      "grad_norm": 23.66128158569336,
-      "learning_rate": 9.987906368350908e-06,
-      "loss": 3.2315,
-      "step": 240
-    },
-    {
-      "epoch": 0.05368551028077522,
-      "grad_norm": 23.81083106994629,
-      "learning_rate": 9.98536794504998e-06,
-      "loss": 2.971,
-      "step": 250
-    },
-    {
-      "epoch": 0.055832930692006226,
-      "grad_norm": 23.036523818969727,
-      "learning_rate": 9.982588260626402e-06,
-      "loss": 2.5181,
-      "step": 260
-    },
-    {
-      "epoch": 0.05798035110323724,
-      "grad_norm": 15.273154258728027,
-      "learning_rate": 9.979567449599793e-06,
-      "loss": 2.542,
-      "step": 270
-    },
-    {
-      "epoch": 0.06012777151446824,
-      "grad_norm": 14.187295913696289,
-      "learning_rate": 9.976305658158806e-06,
-      "loss": 3.2005,
-      "step": 280
-    },
-    {
-      "epoch": 0.062275191925699254,
-      "grad_norm": 25.680112838745117,
-      "learning_rate": 9.972803044154078e-06,
-      "loss": 2.1621,
-      "step": 290
-    },
-    {
-      "epoch": 0.06442261233693027,
-      "grad_norm": 24.12076759338379,
-      "learning_rate": 9.969059777090564e-06,
-      "loss": 3.1866,
-      "step": 300
-    },
-    {
-      "epoch": 0.06657003274816127,
-      "grad_norm": 16.713743209838867,
-      "learning_rate": 9.965076038119347e-06,
-      "loss": 1.8246,
-      "step": 310
-    },
-    {
-      "epoch": 0.06871745315939227,
-      "grad_norm": 23.78823471069336,
-      "learning_rate": 9.960852020028877e-06,
-      "loss": 3.2128,
-      "step": 320
-    },
-    {
-      "epoch": 0.0708648735706233,
-      "grad_norm": 24.15093994140625,
-      "learning_rate": 9.956387927235628e-06,
-      "loss": 2.8004,
-      "step": 330
-    },
-    {
-      "epoch": 0.0730122939818543,
-      "grad_norm": 23.038162231445312,
-      "learning_rate": 9.951683975774213e-06,
-      "loss": 2.3707,
-      "step": 340
-    },
-    {
-      "epoch": 0.0751597143930853,
-      "grad_norm": 24.0445556640625,
-      "learning_rate": 9.946740393286928e-06,
-      "loss": 2.3701,
-      "step": 350
-    },
-    {
-      "epoch": 0.07730713480431632,
-      "grad_norm": 23.56902313232422,
-      "learning_rate": 9.941557419012742e-06,
-      "loss": 2.5382,
-      "step": 360
-    },
-    {
-      "epoch": 0.07945455521554733,
-      "grad_norm": 13.076924324035645,
-      "learning_rate": 9.936135303775702e-06,
-      "loss": 1.6862,
-      "step": 370
-    },
-    {
-      "epoch": 0.08160197562677833,
-      "grad_norm": 20.497676849365234,
-      "learning_rate": 9.930474309972813e-06,
-      "loss": 3.52,
-      "step": 380
-    },
-    {
-      "epoch": 0.08374939603800934,
-      "grad_norm": 14.558286666870117,
-      "learning_rate": 9.92457471156133e-06,
-      "loss": 2.3912,
-      "step": 390
-    },
-    {
-      "epoch": 0.08589681644924035,
-      "grad_norm": 12.070184707641602,
-      "learning_rate": 9.918436794045507e-06,
-      "loss": 1.997,
-      "step": 400
-    },
-    {
-      "epoch": 0.08804423686047136,
-      "grad_norm": 13.852490425109863,
-      "learning_rate": 9.912060854462766e-06,
-      "loss": 3.0433,
-      "step": 410
-    },
-    {
-      "epoch": 0.09019165727170236,
-      "grad_norm": 26.084009170532227,
-      "learning_rate": 9.90544720136934e-06,
-      "loss": 2.4177,
-      "step": 420
-    },
-    {
-      "epoch": 0.09233907768293338,
-      "grad_norm": 12.258209228515625,
-      "learning_rate": 9.898596154825333e-06,
-      "loss": 1.9946,
-      "step": 430
-    },
-    {
-      "epoch": 0.09448649809416439,
-      "grad_norm": 12.761174201965332,
-      "learning_rate": 9.891508046379225e-06,
-      "loss": 2.641,
-      "step": 440
-    },
-    {
-      "epoch": 0.09663391850539539,
-      "grad_norm": 14.85302448272705,
-      "learning_rate": 9.884183219051837e-06,
-      "loss": 2.7384,
-      "step": 450
-    },
-    {
-      "epoch": 0.0987813389166264,
-      "grad_norm": 13.194392204284668,
-      "learning_rate": 9.876622027319726e-06,
-      "loss": 1.9535,
-      "step": 460
-    },
-    {
-      "epoch": 0.10092875932785741,
-      "grad_norm": 15.4808349609375,
-      "learning_rate": 9.868824837098032e-06,
-      "loss": 1.5539,
-      "step": 470
-    },
-    {
-      "epoch": 0.10307617973908842,
-      "grad_norm": 14.890803337097168,
-      "learning_rate": 9.860792025722768e-06,
-      "loss": 2.2916,
-      "step": 480
-    },
-    {
-      "epoch": 0.10522360015031942,
-      "grad_norm": 12.184898376464844,
-      "learning_rate": 9.852523981932558e-06,
-      "loss": 1.9207,
-      "step": 490
-    },
-    {
-      "epoch": 0.10737102056155044,
-      "grad_norm": 10.492812156677246,
-      "learning_rate": 9.844021105849837e-06,
-      "loss": 1.8793,
-      "step": 500
-    },
-    {
-      "epoch": 0.10951844097278145,
-      "grad_norm": 15.009350776672363,
-      "learning_rate": 9.835283808961463e-06,
-      "loss": 2.2671,
-      "step": 510
-    },
-    {
-      "epoch": 0.11166586138401245,
-      "grad_norm": 11.420863151550293,
-      "learning_rate": 9.82631251409883e-06,
-      "loss": 1.9357,
-      "step": 520
-    },
-    {
-      "epoch": 0.11381328179524347,
-      "grad_norm": 23.468830108642578,
-      "learning_rate": 9.81710765541739e-06,
-      "loss": 2.3423,
-      "step": 530
-    },
-    {
-      "epoch": 0.11596070220647448,
-      "grad_norm": 11.901519775390625,
-      "learning_rate": 9.807669678375643e-06,
-      "loss": 2.2662,
-      "step": 540
-    },
-    {
-      "epoch": 0.11810812261770548,
-      "grad_norm": 22.7636775970459,
-      "learning_rate": 9.797999039713586e-06,
-      "loss": 2.6903,
-      "step": 550
-    },
-    {
-      "epoch": 0.12025554302893648,
-      "grad_norm": 10.768227577209473,
-      "learning_rate": 9.788096207430608e-06,
-      "loss": 1.8689,
-      "step": 560
-    },
-    {
-      "epoch": 0.1224029634401675,
-      "grad_norm": 10.858819007873535,
-      "learning_rate": 9.777961660762833e-06,
-      "loss": 2.2812,
-      "step": 570
-    },
-    {
-      "epoch": 0.12455038385139851,
-      "grad_norm": 21.28119468688965,
-      "learning_rate": 9.767595890159944e-06,
-      "loss": 2.6011,
-      "step": 580
-    },
-    {
-      "epoch": 0.12669780426262953,
-      "grad_norm": 11.53591251373291,
-      "learning_rate": 9.756999397261433e-06,
-      "loss": 2.2605,
-      "step": 590
-    },
-    {
-      "epoch": 0.12884522467386053,
-      "grad_norm": 9.207640647888184,
-      "learning_rate": 9.746172694872332e-06,
-      "loss": 1.4534,
-      "step": 600
-    },
-    {
-      "epoch": 0.13099264508509154,
-      "grad_norm": 12.190791130065918,
-      "learning_rate": 9.735116306938397e-06,
-      "loss": 2.1114,
-      "step": 610
-    },
-    {
-      "epoch": 0.13314006549632254,
-      "grad_norm": 11.150038719177246,
-      "learning_rate": 9.72383076852075e-06,
-      "loss": 2.1683,
-      "step": 620
-    },
-    {
-      "epoch": 0.13528748590755355,
-      "grad_norm": 21.529489517211914,
-      "learning_rate": 9.712316625769988e-06,
-      "loss": 3.3042,
-      "step": 630
-    },
-    {
-      "epoch": 0.13743490631878455,
-      "grad_norm": 11.192593574523926,
-      "learning_rate": 9.700574435899745e-06,
-      "loss": 1.8376,
-      "step": 640
-    },
-    {
-      "epoch": 0.13958232673001558,
-      "grad_norm": 12.935983657836914,
-      "learning_rate": 9.688604767159736e-06,
-      "loss": 2.1633,
-      "step": 650
-    },
-    {
-      "epoch": 0.1417297471412466,
-      "grad_norm": 11.339750289916992,
-      "learning_rate": 9.676408198808253e-06,
-      "loss": 3.0413,
-      "step": 660
-    },
-    {
-      "epoch": 0.1438771675524776,
-      "grad_norm": 10.103887557983398,
-      "learning_rate": 9.663985321084132e-06,
-      "loss": 2.6125,
-      "step": 670
-    },
-    {
-      "epoch": 0.1460245879637086,
-      "grad_norm": 20.008167266845703,
-      "learning_rate": 9.651336735178191e-06,
-      "loss": 1.8728,
-      "step": 680
-    },
-    {
-      "epoch": 0.1481720083749396,
-      "grad_norm": 21.304391860961914,
-      "learning_rate": 9.63846305320413e-06,
-      "loss": 2.6488,
-      "step": 690
-    },
-    {
-      "epoch": 0.1503194287861706,
-      "grad_norm": 11.066816329956055,
-      "learning_rate": 9.62536489816892e-06,
-      "loss": 2.2746,
-      "step": 700
-    },
-    {
-      "epoch": 0.1524668491974016,
-      "grad_norm": 13.05954647064209,
-      "learning_rate": 9.61204290394264e-06,
-      "loss": 1.9458,
-      "step": 710
-    },
-    {
-      "epoch": 0.15461426960863264,
-      "grad_norm": 10.509926795959473,
-      "learning_rate": 9.598497715227815e-06,
-      "loss": 1.7751,
-      "step": 720
-    },
-    {
-      "epoch": 0.15676169001986365,
-      "grad_norm": 9.94949722290039,
-      "learning_rate": 9.584729987528202e-06,
-      "loss": 2.5051,
-      "step": 730
-    },
-    {
-      "epoch": 0.15890911043109465,
-      "grad_norm": 10.857903480529785,
-      "learning_rate": 9.570740387117078e-06,
-      "loss": 2.6639,
-      "step": 740
-    },
-    {
-      "epoch": 0.16105653084232566,
-      "grad_norm": 9.37006664276123,
-      "learning_rate": 9.556529591005001e-06,
-      "loss": 2.3239,
-      "step": 750
-    },
-    {
-      "epoch": 0.16320395125355666,
-      "grad_norm": 12.794018745422363,
-      "learning_rate": 9.542098286907024e-06,
-      "loss": 1.7987,
-      "step": 760
-    },
-    {
-      "epoch": 0.16535137166478767,
-      "grad_norm": 10.971644401550293,
-      "learning_rate": 9.527447173209444e-06,
-      "loss": 2.1475,
-      "step": 770
-    },
-    {
-      "epoch": 0.16749879207601867,
-      "grad_norm": 12.863593101501465,
-      "learning_rate": 9.51257695893598e-06,
-      "loss": 2.5493,
-      "step": 780
-    },
-    {
-      "epoch": 0.1696462124872497,
-      "grad_norm": 10.18968391418457,
-      "learning_rate": 9.497488363713477e-06,
-      "loss": 2.5998,
-      "step": 790
-    },
-    {
-      "epoch": 0.1717936328984807,
-      "grad_norm": 10.925987243652344,
-      "learning_rate": 9.482182117737066e-06,
-      "loss": 1.7935,
-      "step": 800
-    },
-    {
-      "epoch": 0.1739410533097117,
-      "grad_norm": 8.317191123962402,
-      "learning_rate": 9.466658961734844e-06,
-      "loss": 1.8083,
-      "step": 810
-    },
-    {
-      "epoch": 0.17608847372094272,
-      "grad_norm": 11.145355224609375,
-      "learning_rate": 9.450919646932013e-06,
-      "loss": 2.2068,
-      "step": 820
-    },
-    {
-      "epoch": 0.17823589413217372,
-      "grad_norm": 9.376663208007812,
-      "learning_rate": 9.434964935014527e-06,
-      "loss": 2.5394,
-      "step": 830
-    },
-    {
-      "epoch": 0.18038331454340473,
-      "grad_norm": 10.962883949279785,
-      "learning_rate": 9.418795598092243e-06,
-      "loss": 2.5965,
-      "step": 840
-    },
-    {
-      "epoch": 0.18253073495463573,
-      "grad_norm": 9.319477081298828,
-      "learning_rate": 9.402412418661541e-06,
-      "loss": 2.9125,
-      "step": 850
-    },
-    {
-      "epoch": 0.18467815536586676,
-      "grad_norm": 10.956426620483398,
-      "learning_rate": 9.385816189567462e-06,
-      "loss": 2.168,
-      "step": 860
-    },
-    {
-      "epoch": 0.18682557577709777,
-      "grad_norm": 11.152973175048828,
-      "learning_rate": 9.369007713965346e-06,
-      "loss": 1.8573,
-      "step": 870
-    },
-    {
-      "epoch": 0.18897299618832877,
-      "grad_norm": 10.0717134475708,
-      "learning_rate": 9.351987805281949e-06,
-      "loss": 2.5868,
-      "step": 880
-    },
-    {
-      "epoch": 0.19112041659955978,
-      "grad_norm": 15.084976196289062,
-      "learning_rate": 9.334757287176089e-06,
-      "loss": 2.2103,
-      "step": 890
-    },
-    {
-      "epoch": 0.19326783701079078,
-      "grad_norm": 10.498251914978027,
-      "learning_rate": 9.317316993498788e-06,
-      "loss": 1.8223,
-      "step": 900
-    },
-    {
-      "epoch": 0.1954152574220218,
-      "grad_norm": 9.38797664642334,
-      "learning_rate": 9.299667768252911e-06,
-      "loss": 1.7798,
-      "step": 910
-    },
-    {
-      "epoch": 0.1975626778332528,
-      "grad_norm": 10.024442672729492,
-      "learning_rate": 9.281810465552327e-06,
-      "loss": 1.4073,
-      "step": 920
-    },
-    {
-      "epoch": 0.19971009824448382,
-      "grad_norm": 9.467859268188477,
-      "learning_rate": 9.263745949580569e-06,
-      "loss": 2.2005,
-      "step": 930
-    },
-    {
-      "epoch": 0.20185751865571483,
-      "grad_norm": 10.421006202697754,
-      "learning_rate": 9.24547509454902e-06,
-      "loss": 1.4132,
-      "step": 940
-    },
-    {
-      "epoch": 0.20400493906694583,
-      "grad_norm": 10.632217407226562,
-      "learning_rate": 9.226998784654606e-06,
-      "loss": 2.5512,
-      "step": 950
-    },
-    {
-      "epoch": 0.20615235947817684,
-      "grad_norm": 10.489507675170898,
-      "learning_rate": 9.208317914036997e-06,
-      "loss": 1.8359,
-      "step": 960
-    },
-    {
-      "epoch": 0.20829977988940784,
-      "grad_norm": 22.003978729248047,
-      "learning_rate": 9.18943338673535e-06,
-      "loss": 2.1296,
-      "step": 970
-    },
-    {
-      "epoch": 0.21044720030063885,
-      "grad_norm": 9.565354347229004,
-      "learning_rate": 9.170346116644545e-06,
-      "loss": 2.1198,
-      "step": 980
-    },
-    {
-      "epoch": 0.21259462071186985,
-      "grad_norm": 10.454195022583008,
-      "learning_rate": 9.15105702747097e-06,
-      "loss": 2.2026,
-      "step": 990
-    },
-    {
-      "epoch": 0.21474204112310089,
-      "grad_norm": 19.8426456451416,
-      "learning_rate": 9.131567052687811e-06,
-      "loss": 2.9161,
-      "step": 1000
-    },
-    {
-      "epoch": 0.2168894615343319,
-      "grad_norm": 19.293813705444336,
-      "learning_rate": 9.111877135489886e-06,
-      "loss": 2.5228,
-      "step": 1010
-    },
-    {
-      "epoch": 0.2190368819455629,
-      "grad_norm": 10.029641151428223,
-      "learning_rate": 9.091988228747992e-06,
-      "loss": 1.869,
-      "step": 1020
-    },
-    {
-      "epoch": 0.2211843023567939,
-      "grad_norm": 9.284717559814453,
-      "learning_rate": 9.071901294962795e-06,
-      "loss": 3.7155,
-      "step": 1030
-    },
-    {
-      "epoch": 0.2233317227680249,
-      "grad_norm": 19.013784408569336,
-      "learning_rate": 9.05161730621825e-06,
-      "loss": 2.1832,
-      "step": 1040
-    },
-    {
-      "epoch": 0.2254791431792559,
-      "grad_norm": 9.73091983795166,
-      "learning_rate": 9.03113724413456e-06,
-      "loss": 2.1185,
-      "step": 1050
-    },
-    {
-      "epoch": 0.22762656359048694,
-      "grad_norm": 10.244536399841309,
-      "learning_rate": 9.010462099820674e-06,
-      "loss": 1.7729,
-      "step": 1060
-    },
-    {
-      "epoch": 0.22977398400171795,
-      "grad_norm": 8.954618453979492,
-      "learning_rate": 8.989592873826316e-06,
-      "loss": 1.7244,
-      "step": 1070
-    },
-    {
-      "epoch": 0.23192140441294895,
-      "grad_norm": 19.035173416137695,
-      "learning_rate": 8.96853057609357e-06,
-      "loss": 2.5808,
-      "step": 1080
-    },
-    {
-      "epoch": 0.23406882482417996,
-      "grad_norm": 9.041604042053223,
-      "learning_rate": 8.947276225908001e-06,
-      "loss": 1.7619,
-      "step": 1090
-    },
-    {
-      "epoch": 0.23621624523541096,
-      "grad_norm": 11.142961502075195,
-      "learning_rate": 8.925830851849338e-06,
-      "loss": 1.7543,
-      "step": 1100
-    },
-    {
-      "epoch": 0.23836366564664196,
-      "grad_norm": 8.855928421020508,
-      "learning_rate": 8.904195491741682e-06,
-      "loss": 1.4371,
-      "step": 1110
-    },
-    {
-      "epoch": 0.24051108605787297,
-      "grad_norm": 10.576139450073242,
-      "learning_rate": 8.882371192603297e-06,
-      "loss": 1.7964,
-      "step": 1120
-    },
-    {
-      "epoch": 0.242658506469104,
-      "grad_norm": 19.504796981811523,
-      "learning_rate": 8.86035901059592e-06,
-      "loss": 2.4225,
-      "step": 1130
-    },
-    {
-      "epoch": 0.244805926880335,
-      "grad_norm": 8.53579330444336,
-      "learning_rate": 8.83816001097368e-06,
-      "loss": 2.5166,
-      "step": 1140
-    },
-    {
-      "epoch": 0.246953347291566,
-      "grad_norm": 10.541763305664062,
-      "learning_rate": 8.815775268031514e-06,
-      "loss": 2.5292,
-      "step": 1150
-    },
-    {
-      "epoch": 0.24910076770279702,
-      "grad_norm": 8.700005531311035,
-      "learning_rate": 8.7932058650532e-06,
-      "loss": 1.7864,
-      "step": 1160
-    },
-    {
-      "epoch": 0.25124818811402805,
-      "grad_norm": 19.96916961669922,
-      "learning_rate": 8.770452894258922e-06,
-      "loss": 2.1556,
-      "step": 1170
-    },
-    {
-      "epoch": 0.25339560852525905,
-      "grad_norm": 10.887153625488281,
-      "learning_rate": 8.747517456752419e-06,
-      "loss": 1.7778,
-      "step": 1180
-    },
-    {
-      "epoch": 0.25554302893649006,
-      "grad_norm": 9.90004825592041,
-      "learning_rate": 8.724400662467694e-06,
-      "loss": 2.4839,
-      "step": 1190
-    },
-    {
-      "epoch": 0.25769044934772106,
-      "grad_norm": 18.490222930908203,
-      "learning_rate": 8.701103630115303e-06,
-      "loss": 2.9508,
-      "step": 1200
-    },
-    {
-      "epoch": 0.25983786975895207,
-      "grad_norm": 10.261496543884277,
-      "learning_rate": 8.677627487128211e-06,
-      "loss": 2.1185,
-      "step": 1210
-    },
-    {
-      "epoch": 0.26198529017018307,
-      "grad_norm": 11.693336486816406,
-      "learning_rate": 8.65397336960724e-06,
-      "loss": 2.5583,
-      "step": 1220
-    },
-    {
-      "epoch": 0.2641327105814141,
-      "grad_norm": 9.892570495605469,
-      "learning_rate": 8.630142422266086e-06,
-      "loss": 2.5668,
-      "step": 1230
-    },
-    {
-      "epoch": 0.2662801309926451,
-      "grad_norm": 11.391189575195312,
-      "learning_rate": 8.60613579837591e-06,
-      "loss": 2.1724,
-      "step": 1240
-    },
-    {
-      "epoch": 0.2684275514038761,
-      "grad_norm": 9.438261032104492,
-      "learning_rate": 8.581954659709549e-06,
-      "loss": 2.51,
-      "step": 1250
-    },
-    {
-      "epoch": 0.2705749718151071,
-      "grad_norm": 12.741863250732422,
-      "learning_rate": 8.55760017648527e-06,
-      "loss": 2.1885,
-      "step": 1260
-    },
-    {
-      "epoch": 0.2727223922263381,
-      "grad_norm": 21.02118492126465,
-      "learning_rate": 8.533073527310157e-06,
-      "loss": 2.5175,
-      "step": 1270
-    },
-    {
-      "epoch": 0.2748698126375691,
-      "grad_norm": 10.651395797729492,
-      "learning_rate": 8.508375899123064e-06,
-      "loss": 2.1997,
-      "step": 1280
-    },
-    {
-      "epoch": 0.2770172330488001,
-      "grad_norm": 10.738212585449219,
-      "learning_rate": 8.483508487137175e-06,
-      "loss": 2.1453,
-      "step": 1290
-    },
-    {
-      "epoch": 0.27916465346003116,
-      "grad_norm": 14.632546424865723,
-      "learning_rate": 8.458472494782169e-06,
-      "loss": 2.1023,
-      "step": 1300
-    },
-    {
-      "epoch": 0.28131207387126217,
-      "grad_norm": 8.816224098205566,
-      "learning_rate": 8.433269133645974e-06,
-      "loss": 1.682,
-      "step": 1310
-    },
-    {
-      "epoch": 0.2834594942824932,
-      "grad_norm": 8.408437728881836,
-      "learning_rate": 8.407899623416136e-06,
-      "loss": 1.7611,
-      "step": 1320
-    },
-    {
-      "epoch": 0.2856069146937242,
-      "grad_norm": 9.93062686920166,
-      "learning_rate": 8.382365191820796e-06,
-      "loss": 2.503,
-      "step": 1330
-    },
-    {
-      "epoch": 0.2877543351049552,
-      "grad_norm": 11.0846586227417,
-      "learning_rate": 8.356667074569274e-06,
-      "loss": 2.0817,
-      "step": 1340
-    },
-    {
-      "epoch": 0.2899017555161862,
-      "grad_norm": 18.90940284729004,
-      "learning_rate": 8.330806515292271e-06,
-      "loss": 2.1409,
-      "step": 1350
-    },
-    {
-      "epoch": 0.2920491759274172,
-      "grad_norm": 7.251021862030029,
-      "learning_rate": 8.304784765481676e-06,
-      "loss": 1.7067,
-      "step": 1360
-    },
-    {
-      "epoch": 0.2941965963386482,
-      "grad_norm": 18.02667808532715,
-      "learning_rate": 8.278603084430012e-06,
-      "loss": 2.5272,
-      "step": 1370
-    },
-    {
-      "epoch": 0.2963440167498792,
-      "grad_norm": 19.2054386138916,
-      "learning_rate": 8.25226273916949e-06,
-      "loss": 2.1962,
-      "step": 1380
-    },
-    {
-      "epoch": 0.2984914371611102,
-      "grad_norm": 8.991003036499023,
-      "learning_rate": 8.225765004410688e-06,
-      "loss": 2.8256,
-      "step": 1390
-    },
-    {
-      "epoch": 0.3006388575723412,
-      "grad_norm": 10.063870429992676,
-      "learning_rate": 8.199111162480871e-06,
-      "loss": 2.5387,
-      "step": 1400
-    },
-    {
-      "epoch": 0.3027862779835722,
-      "grad_norm": 9.316102027893066,
-      "learning_rate": 8.17230250326193e-06,
-      "loss": 1.7767,
-      "step": 1410
-    },
-    {
-      "epoch": 0.3049336983948032,
-      "grad_norm": 18.266010284423828,
-      "learning_rate": 8.145340324127958e-06,
-      "loss": 1.7364,
-      "step": 1420
-    },
-    {
-      "epoch": 0.3070811188060342,
-      "grad_norm": 18.232187271118164,
-      "learning_rate": 8.118225929882468e-06,
-      "loss": 2.1927,
-      "step": 1430
-    },
-    {
-      "epoch": 0.3092285392172653,
-      "grad_norm": 8.176814079284668,
-      "learning_rate": 8.090960632695246e-06,
-      "loss": 2.1411,
-      "step": 1440
-    },
-    {
-      "epoch": 0.3113759596284963,
-      "grad_norm": 9.711817741394043,
-      "learning_rate": 8.063545752038854e-06,
-      "loss": 1.7321,
-      "step": 1450
-    },
-    {
-      "epoch": 0.3135233800397273,
-      "grad_norm": 11.406618118286133,
-      "learning_rate": 8.035982614624774e-06,
-      "loss": 1.3379,
-      "step": 1460
-    },
-    {
-      "epoch": 0.3156708004509583,
-      "grad_norm": 8.103960990905762,
-      "learning_rate": 8.008272554339195e-06,
-      "loss": 2.4391,
-      "step": 1470
-    },
-    {
-      "epoch": 0.3178182208621893,
-      "grad_norm": 10.89107608795166,
-      "learning_rate": 7.980416912178478e-06,
-      "loss": 2.6215,
-      "step": 1480
-    },
-    {
-      "epoch": 0.3199656412734203,
-      "grad_norm": 13.625832557678223,
-      "learning_rate": 7.952417036184248e-06,
-      "loss": 1.3208,
-      "step": 1490
-    },
-    {
-      "epoch": 0.3221130616846513,
-      "grad_norm": 9.747784614562988,
-      "learning_rate": 7.924274281378153e-06,
-      "loss": 1.3278,
-      "step": 1500
-    },
-    {
-      "epoch": 0.3242604820958823,
-      "grad_norm": 23.162227630615234,
-      "learning_rate": 7.895990009696308e-06,
-      "loss": 2.5893,
-      "step": 1510
-    },
-    {
-      "epoch": 0.3264079025071133,
-      "grad_norm": 9.394268989562988,
-      "learning_rate": 7.867565589923364e-06,
-      "loss": 2.1448,
-      "step": 1520
-    },
-    {
-      "epoch": 0.32855532291834433,
-      "grad_norm": 8.258391380310059,
-      "learning_rate": 7.839002397626281e-06,
-      "loss": 2.1836,
-      "step": 1530
-    },
-    {
-      "epoch": 0.33070274332957533,
-      "grad_norm": 10.303350448608398,
-      "learning_rate": 7.810301815087753e-06,
-      "loss": 3.2407,
-      "step": 1540
-    },
-    {
-      "epoch": 0.33285016374080634,
-      "grad_norm": 8.373433113098145,
-      "learning_rate": 7.781465231239318e-06,
-      "loss": 2.5282,
-      "step": 1550
-    },
-    {
-      "epoch": 0.33499758415203734,
-      "grad_norm": 8.955942153930664,
-      "learning_rate": 7.75249404159414e-06,
-      "loss": 2.4664,
-      "step": 1560
-    },
-    {
-      "epoch": 0.33714500456326835,
-      "grad_norm": 8.981405258178711,
-      "learning_rate": 7.723389648179475e-06,
-      "loss": 2.0393,
-      "step": 1570
-    },
-    {
-      "epoch": 0.3392924249744994,
-      "grad_norm": 11.585493087768555,
-      "learning_rate": 7.694153459468822e-06,
-      "loss": 1.7473,
-      "step": 1580
-    },
-    {
-      "epoch": 0.3414398453857304,
-      "grad_norm": 7.84296989440918,
-      "learning_rate": 7.664786890313757e-06,
-      "loss": 2.0628,
-      "step": 1590
-    },
-    {
-      "epoch": 0.3435872657969614,
-      "grad_norm": 7.5332417488098145,
-      "learning_rate": 7.635291361875474e-06,
-      "loss": 2.1288,
-      "step": 1600
-    },
-    {
-      "epoch": 0.3457346862081924,
-      "grad_norm": 10.541316986083984,
-      "learning_rate": 7.6056683015559975e-06,
-      "loss": 1.7313,
-      "step": 1610
-    },
-    {
-      "epoch": 0.3478821066194234,
-      "grad_norm": 8.132133483886719,
-      "learning_rate": 7.57591914292911e-06,
-      "loss": 1.7268,
-      "step": 1620
-    },
-    {
-      "epoch": 0.35002952703065443,
-      "grad_norm": 20.494726181030273,
-      "learning_rate": 7.546045325670979e-06,
-      "loss": 3.3231,
-      "step": 1630
-    },
-    {
-      "epoch": 0.35217694744188544,
-      "grad_norm": 10.624666213989258,
-      "learning_rate": 7.516048295490479e-06,
-      "loss": 2.0748,
-      "step": 1640
-    },
-    {
-      "epoch": 0.35432436785311644,
-      "grad_norm": 9.553028106689453,
-      "learning_rate": 7.485929504059234e-06,
-      "loss": 1.6695,
-      "step": 1650
-    },
-    {
-      "epoch": 0.35647178826434744,
-      "grad_norm": 8.269372940063477,
-      "learning_rate": 7.455690408941363e-06,
-      "loss": 2.1454,
-      "step": 1660
-    },
-    {
-      "epoch": 0.35861920867557845,
-      "grad_norm": 8.983115196228027,
-      "learning_rate": 7.425332473522942e-06,
-      "loss": 2.085,
-      "step": 1670
-    },
-    {
-      "epoch": 0.36076662908680945,
-      "grad_norm": 10.12364387512207,
-      "learning_rate": 7.394857166941187e-06,
-      "loss": 1.7961,
-      "step": 1680
-    },
-    {
-      "epoch": 0.36291404949804046,
-      "grad_norm": 13.731148719787598,
-      "learning_rate": 7.364265964013355e-06,
-      "loss": 1.8293,
-      "step": 1690
-    },
-    {
-      "epoch": 0.36506146990927146,
-      "grad_norm": 7.890660762786865,
-      "learning_rate": 7.333560345165371e-06,
-      "loss": 1.694,
-      "step": 1700
-    },
-    {
-      "epoch": 0.3672088903205025,
-      "grad_norm": 7.874722957611084,
-      "learning_rate": 7.302741796360192e-06,
-      "loss": 2.1216,
-      "step": 1710
-    },
-    {
-      "epoch": 0.36935631073173353,
-      "grad_norm": 9.214128494262695,
-      "learning_rate": 7.271811809025882e-06,
-      "loss": 1.3505,
-      "step": 1720
-    },
-    {
-      "epoch": 0.37150373114296453,
-      "grad_norm": 8.188569068908691,
-      "learning_rate": 7.240771879983451e-06,
-      "loss": 2.1366,
-      "step": 1730
-    },
-    {
-      "epoch": 0.37365115155419554,
-      "grad_norm": 7.846069812774658,
-      "learning_rate": 7.209623511374407e-06,
-      "loss": 1.351,
-      "step": 1740
-    },
-    {
-      "epoch": 0.37579857196542654,
-      "grad_norm": 11.016289710998535,
-      "learning_rate": 7.178368210588067e-06,
-      "loss": 2.1055,
-      "step": 1750
-    },
-    {
-      "epoch": 0.37794599237665755,
-      "grad_norm": 9.798548698425293,
-      "learning_rate": 7.1470074901886065e-06,
-      "loss": 1.6451,
-      "step": 1760
-    },
-    {
-      "epoch": 0.38009341278788855,
-      "grad_norm": 19.033615112304688,
-      "learning_rate": 7.1155428678418615e-06,
-      "loss": 2.1027,
-      "step": 1770
-    },
-    {
-      "epoch": 0.38224083319911956,
-      "grad_norm": 10.220734596252441,
-      "learning_rate": 7.083975866241881e-06,
-      "loss": 2.43,
-      "step": 1780
-    },
-    {
-      "epoch": 0.38438825361035056,
-      "grad_norm": 17.87563705444336,
-      "learning_rate": 7.0523080130372444e-06,
-      "loss": 3.0401,
-      "step": 1790
-    },
-    {
-      "epoch": 0.38653567402158157,
-      "grad_norm": 7.5210161209106445,
-      "learning_rate": 7.020540840757124e-06,
-      "loss": 2.4589,
-      "step": 1800
-    },
-    {
-      "epoch": 0.38868309443281257,
-      "grad_norm": 8.672847747802734,
-      "learning_rate": 6.988675886737125e-06,
-      "loss": 1.6752,
-      "step": 1810
-    },
-    {
-      "epoch": 0.3908305148440436,
-      "grad_norm": 8.692748069763184,
-      "learning_rate": 6.956714693044888e-06,
-      "loss": 1.7287,
-      "step": 1820
-    },
-    {
-      "epoch": 0.3929779352552746,
-      "grad_norm": 7.074733257293701,
-      "learning_rate": 6.9246588064054575e-06,
-      "loss": 2.8803,
-      "step": 1830
-    },
-    {
-      "epoch": 0.3951253556665056,
-      "grad_norm": 18.272645950317383,
-      "learning_rate": 6.892509778126442e-06,
-      "loss": 3.2607,
-      "step": 1840
-    },
-    {
-      "epoch": 0.39727277607773664,
-      "grad_norm": 9.085878372192383,
-      "learning_rate": 6.860269164022921e-06,
-      "loss": 1.7211,
-      "step": 1850
-    },
-    {
-      "epoch": 0.39942019648896765,
-      "grad_norm": 7.339599132537842,
-      "learning_rate": 6.827938524342175e-06,
-      "loss": 1.6545,
-      "step": 1860
-    },
-    {
-      "epoch": 0.40156761690019865,
-      "grad_norm": 18.716262817382812,
-      "learning_rate": 6.7955194236881595e-06,
-      "loss": 2.499,
-      "step": 1870
-    },
-    {
-      "epoch": 0.40371503731142966,
-      "grad_norm": 10.822924613952637,
-      "learning_rate": 6.763013430945803e-06,
-      "loss": 2.7362,
-      "step": 1880
-    },
-    {
-      "epoch": 0.40586245772266066,
-      "grad_norm": 8.549121856689453,
-      "learning_rate": 6.730422119205072e-06,
-      "loss": 2.4742,
-      "step": 1890
-    },
-    {
-      "epoch": 0.40800987813389167,
-      "grad_norm": 8.39169692993164,
-      "learning_rate": 6.697747065684851e-06,
-      "loss": 2.0802,
-      "step": 1900
-    },
-    {
-      "epoch": 0.4101572985451227,
-      "grad_norm": 10.248717308044434,
-      "learning_rate": 6.664989851656606e-06,
-      "loss": 2.1458,
-      "step": 1910
-    },
-    {
-      "epoch": 0.4123047189563537,
-      "grad_norm": 9.720662117004395,
-      "learning_rate": 6.632152062367871e-06,
-      "loss": 1.6682,
-      "step": 1920
-    },
-    {
-      "epoch": 0.4144521393675847,
-      "grad_norm": 9.940651893615723,
-      "learning_rate": 6.599235286965524e-06,
-      "loss": 1.6989,
-      "step": 1930
-    },
-    {
-      "epoch": 0.4165995597788157,
-      "grad_norm": 9.139376640319824,
-      "learning_rate": 6.566241118418888e-06,
-      "loss": 2.0926,
-      "step": 1940
-    },
-    {
-      "epoch": 0.4187469801900467,
-      "grad_norm": 8.232985496520996,
-      "learning_rate": 6.5331711534426326e-06,
-      "loss": 2.4193,
-      "step": 1950
-    },
-    {
-      "epoch": 0.4208944006012777,
-      "grad_norm": 7.869434356689453,
-      "learning_rate": 6.50002699241951e-06,
-      "loss": 2.5119,
-      "step": 1960
-    },
-    {
-      "epoch": 0.4230418210125087,
-      "grad_norm": 9.408770561218262,
-      "learning_rate": 6.466810239322909e-06,
-      "loss": 1.3976,
-      "step": 1970
-    },
-    {
-      "epoch": 0.4251892414237397,
-      "grad_norm": 8.584155082702637,
-      "learning_rate": 6.43352250163922e-06,
-      "loss": 1.6854,
-      "step": 1980
-    },
-    {
-      "epoch": 0.42733666183497077,
-      "grad_norm": 18.368183135986328,
-      "learning_rate": 6.4001653902900564e-06,
-      "loss": 2.9371,
-      "step": 1990
-    },
-    {
-      "epoch": 0.42948408224620177,
-      "grad_norm": 10.570369720458984,
-      "learning_rate": 6.366740519554286e-06,
-      "loss": 1.3262,
-      "step": 2000
-    },
-    {
-      "epoch": 0.4316315026574328,
-      "grad_norm": 10.918615341186523,
-      "learning_rate": 6.33324950698992e-06,
-      "loss": 2.1305,
-      "step": 2010
-    },
-    {
-      "epoch": 0.4337789230686638,
-      "grad_norm": 9.189411163330078,
-      "learning_rate": 6.299693973355821e-06,
-      "loss": 2.079,
-      "step": 2020
-    },
-    {
-      "epoch": 0.4359263434798948,
-      "grad_norm": 19.311662673950195,
-      "learning_rate": 6.266075542533275e-06,
-      "loss": 2.4221,
-      "step": 2030
-    },
-    {
-      "epoch": 0.4380737638911258,
-      "grad_norm": 10.08248233795166,
-      "learning_rate": 6.2323958414474065e-06,
-      "loss": 1.703,
-      "step": 2040
-    },
-    {
-      "epoch": 0.4402211843023568,
-      "grad_norm": 18.993967056274414,
-      "learning_rate": 6.198656499988444e-06,
-      "loss": 1.9942,
-      "step": 2050
-    },
-    {
-      "epoch": 0.4423686047135878,
-      "grad_norm": 18.06959342956543,
-      "learning_rate": 6.164859150932839e-06,
-      "loss": 2.7462,
-      "step": 2060
-    },
-    {
-      "epoch": 0.4445160251248188,
-      "grad_norm": 17.054033279418945,
-      "learning_rate": 6.131005429864262e-06,
-      "loss": 2.7662,
-      "step": 2070
-    },
-    {
-      "epoch": 0.4466634455360498,
-      "grad_norm": 22.749589920043945,
-      "learning_rate": 6.097096975094432e-06,
-      "loss": 2.0372,
-      "step": 2080
-    },
-    {
-      "epoch": 0.4488108659472808,
-      "grad_norm": 8.990077018737793,
-      "learning_rate": 6.063135427583849e-06,
-      "loss": 2.3963,
-      "step": 2090
-    },
-    {
-      "epoch": 0.4509582863585118,
-      "grad_norm": 8.372001647949219,
-      "learning_rate": 6.029122430862373e-06,
-      "loss": 1.7628,
-      "step": 2100
-    },
-    {
-      "epoch": 0.4531057067697428,
-      "grad_norm": 8.995088577270508,
-      "learning_rate": 5.9950596309496864e-06,
-      "loss": 1.6897,
-      "step": 2110
-    },
-    {
-      "epoch": 0.4552531271809739,
-      "grad_norm": 10.031561851501465,
-      "learning_rate": 5.9609486762756465e-06,
-      "loss": 1.7369,
-      "step": 2120
-    },
-    {
-      "epoch": 0.4574005475922049,
-      "grad_norm": 8.10063362121582,
-      "learning_rate": 5.9267912176004925e-06,
-      "loss": 2.051,
-      "step": 2130
-    },
-    {
-      "epoch": 0.4595479680034359,
-      "grad_norm": 9.510966300964355,
-      "learning_rate": 5.892588907934988e-06,
-      "loss": 2.4391,
-      "step": 2140
-    },
-    {
-      "epoch": 0.4616953884146669,
-      "grad_norm": 19.641340255737305,
-      "learning_rate": 5.858343402460391e-06,
-      "loss": 2.0852,
-      "step": 2150
-    },
-    {
-      "epoch": 0.4638428088258979,
-      "grad_norm": 8.368480682373047,
-      "learning_rate": 5.8240563584483855e-06,
-      "loss": 2.0298,
-      "step": 2160
-    },
-    {
-      "epoch": 0.4659902292371289,
-      "grad_norm": 18.213680267333984,
-      "learning_rate": 5.789729435180853e-06,
-      "loss": 2.4393,
-      "step": 2170
-    },
-    {
-      "epoch": 0.4681376496483599,
-      "grad_norm": 17.781763076782227,
-      "learning_rate": 5.7553642938695945e-06,
-      "loss": 1.6106,
-      "step": 2180
-    },
-    {
-      "epoch": 0.4702850700595909,
-      "grad_norm": 8.500686645507812,
-      "learning_rate": 5.720962597575922e-06,
-      "loss": 1.6916,
-      "step": 2190
-    },
-    {
-      "epoch": 0.4724324904708219,
-      "grad_norm": 8.981590270996094,
-      "learning_rate": 5.68652601113019e-06,
-      "loss": 1.2966,
-      "step": 2200
-    },
-    {
-      "epoch": 0.4745799108820529,
-      "grad_norm": 17.556732177734375,
-      "learning_rate": 5.652056201051217e-06,
-      "loss": 2.4725,
-      "step": 2210
-    },
-    {
-      "epoch": 0.47672733129328393,
-      "grad_norm": 8.62265396118164,
-      "learning_rate": 5.617554835465646e-06,
-      "loss": 1.6889,
-      "step": 2220
-    },
-    {
-      "epoch": 0.47887475170451493,
-      "grad_norm": 19.104825973510742,
-      "learning_rate": 5.583023584027204e-06,
-      "loss": 2.4591,
-      "step": 2230
-    },
-    {
-      "epoch": 0.48102217211574594,
-      "grad_norm": 8.052384376525879,
-      "learning_rate": 5.548464117835917e-06,
-      "loss": 2.127,
-      "step": 2240
-    },
-    {
-      "epoch": 0.48316959252697694,
-      "grad_norm": 17.943681716918945,
-      "learning_rate": 5.513878109357228e-06,
-      "loss": 2.1374,
-      "step": 2250
-    },
-    {
-      "epoch": 0.485317012938208,
-      "grad_norm": 8.557195663452148,
-      "learning_rate": 5.479267232341064e-06,
-      "loss": 2.1146,
-      "step": 2260
-    },
-    {
-      "epoch": 0.487464433349439,
-      "grad_norm": 8.944927215576172,
-      "learning_rate": 5.444633161740834e-06,
-      "loss": 3.1203,
-      "step": 2270
-    },
-    {
-      "epoch": 0.48961185376067,
-      "grad_norm": 16.947784423828125,
-      "learning_rate": 5.40997757363238e-06,
-      "loss": 2.7593,
-      "step": 2280
-    },
-    {
-      "epoch": 0.491759274171901,
-      "grad_norm": 7.79644250869751,
-      "learning_rate": 5.3753021451328525e-06,
-      "loss": 2.0236,
-      "step": 2290
-    },
-    {
-      "epoch": 0.493906694583132,
-      "grad_norm": 7.248489856719971,
-      "learning_rate": 5.3406085543195555e-06,
-      "loss": 1.6521,
-      "step": 2300
-    },
-    {
-      "epoch": 0.496054114994363,
-      "grad_norm": 17.251184463500977,
-      "learning_rate": 5.305898480148741e-06,
-      "loss": 1.9537,
-      "step": 2310
-    },
-    {
-      "epoch": 0.49820153540559403,
-      "grad_norm": 8.320523262023926,
-      "learning_rate": 5.27117360237435e-06,
-      "loss": 2.8172,
-      "step": 2320
-    },
-    {
-      "epoch": 0.500348955816825,
-      "grad_norm": 7.883960723876953,
-      "learning_rate": 5.23643560146673e-06,
-      "loss": 2.7325,
-      "step": 2330
-    },
-    {
-      "epoch": 0.5024963762280561,
-      "grad_norm": 17.11644172668457,
-      "learning_rate": 5.201686158531304e-06,
-      "loss": 2.0326,
-      "step": 2340
-    },
-    {
-      "epoch": 0.504643796639287,
-      "grad_norm": 9.46158218383789,
-      "learning_rate": 5.166926955227224e-06,
-      "loss": 2.4668,
-      "step": 2350
-    },
-    {
-      "epoch": 0.5067912170505181,
-      "grad_norm": 6.561529636383057,
-      "learning_rate": 5.132159673685976e-06,
-      "loss": 1.614,
-      "step": 2360
-    },
-    {
-      "epoch": 0.508938637461749,
-      "grad_norm": 17.91114616394043,
-      "learning_rate": 5.097385996429992e-06,
-      "loss": 2.3969,
-      "step": 2370
-    },
-    {
-      "epoch": 0.5110860578729801,
-      "grad_norm": 8.510595321655273,
-      "learning_rate": 5.062607606291208e-06,
-      "loss": 2.0208,
-      "step": 2380
-    },
-    {
-      "epoch": 0.5132334782842111,
-      "grad_norm": 9.593419075012207,
-      "learning_rate": 5.027826186329642e-06,
-      "loss": 2.0224,
-      "step": 2390
-    },
-    {
-      "epoch": 0.5153808986954421,
-      "grad_norm": 9.872200012207031,
-      "learning_rate": 4.993043419751933e-06,
-      "loss": 1.2689,
-      "step": 2400
-    },
-    {
-      "epoch": 0.5175283191066731,
-      "grad_norm": 8.374130249023438,
-      "learning_rate": 4.958260989829889e-06,
-      "loss": 1.2412,
-      "step": 2410
-    },
-    {
-      "epoch": 0.5196757395179041,
-      "grad_norm": 9.26429271697998,
-      "learning_rate": 4.923480579819025e-06,
-      "loss": 1.9938,
-      "step": 2420
-    },
-    {
-      "epoch": 0.5218231599291351,
-      "grad_norm": 9.154571533203125,
-      "learning_rate": 4.888703872877108e-06,
-      "loss": 1.6627,
-      "step": 2430
-    },
-    {
-      "epoch": 0.5239705803403661,
-      "grad_norm": 7.144714832305908,
-      "learning_rate": 4.853932551982692e-06,
-      "loss": 1.675,
-      "step": 2440
-    },
-    {
-      "epoch": 0.5261180007515971,
-      "grad_norm": 9.676398277282715,
-      "learning_rate": 4.8191682998536905e-06,
-      "loss": 2.051,
-      "step": 2450
-    },
-    {
-      "epoch": 0.5282654211628282,
-      "grad_norm": 7.870721817016602,
-      "learning_rate": 4.7844127988659204e-06,
-      "loss": 1.9328,
-      "step": 2460
-    },
-    {
-      "epoch": 0.5304128415740592,
-      "grad_norm": 6.848033428192139,
-      "learning_rate": 4.749667730971704e-06,
-      "loss": 1.7212,
-      "step": 2470
-    },
-    {
-      "epoch": 0.5325602619852902,
-      "grad_norm": 17.782894134521484,
-      "learning_rate": 4.714934777618468e-06,
-      "loss": 2.4386,
-      "step": 2480
-    },
-    {
-      "epoch": 0.5347076823965212,
-      "grad_norm": 17.414901733398438,
-      "learning_rate": 4.680215619667364e-06,
-      "loss": 2.778,
-      "step": 2490
-    },
-    {
-      "epoch": 0.5368551028077522,
-      "grad_norm": 9.445879936218262,
-      "learning_rate": 4.645511937311934e-06,
-      "loss": 2.0429,
-      "step": 2500
-    },
-    {
-      "epoch": 0.5390025232189832,
-      "grad_norm": 10.506054878234863,
-      "learning_rate": 4.610825409996795e-06,
-      "loss": 2.454,
-      "step": 2510
-    },
-    {
-      "epoch": 0.5411499436302142,
-      "grad_norm": 10.331677436828613,
-      "learning_rate": 4.576157716336369e-06,
-      "loss": 2.0336,
-      "step": 2520
-    },
-    {
-      "epoch": 0.5432973640414452,
-      "grad_norm": 16.582256317138672,
-      "learning_rate": 4.541510534033643e-06,
-      "loss": 3.265,
-      "step": 2530
-    },
-    {
-      "epoch": 0.5454447844526762,
-      "grad_norm": 19.730741500854492,
-      "learning_rate": 4.50688553979898e-06,
-      "loss": 2.4532,
-      "step": 2540
-    },
-    {
-      "epoch": 0.5475922048639073,
-      "grad_norm": 19.238706588745117,
-      "learning_rate": 4.472284409268976e-06,
-      "loss": 2.4381,
-      "step": 2550
-    },
-    {
-      "epoch": 0.5497396252751382,
-      "grad_norm": 8.645087242126465,
-      "learning_rate": 4.437708816925374e-06,
-      "loss": 2.0653,
-      "step": 2560
-    },
-    {
-      "epoch": 0.5518870456863693,
-      "grad_norm": 8.598827362060547,
-      "learning_rate": 4.403160436014022e-06,
-      "loss": 2.7631,
-      "step": 2570
-    },
-    {
-      "epoch": 0.5540344660976002,
-      "grad_norm": 17.874841690063477,
-      "learning_rate": 4.368640938463909e-06,
-      "loss": 3.2379,
-      "step": 2580
-    },
-    {
-      "epoch": 0.5561818865088313,
-      "grad_norm": 19.613523483276367,
-      "learning_rate": 4.334151994806236e-06,
-      "loss": 2.3779,
-      "step": 2590
-    },
-    {
-      "epoch": 0.5583293069200623,
-      "grad_norm": 8.068832397460938,
-      "learning_rate": 4.299695274093593e-06,
-      "loss": 1.6249,
-      "step": 2600
-    },
-    {
-      "epoch": 0.5604767273312933,
-      "grad_norm": 9.436213493347168,
-      "learning_rate": 4.265272443819175e-06,
-      "loss": 2.3983,
-      "step": 2610
-    },
-    {
-      "epoch": 0.5626241477425243,
-      "grad_norm": 9.81657600402832,
-      "learning_rate": 4.23088516983609e-06,
-      "loss": 1.9805,
-      "step": 2620
-    },
-    {
-      "epoch": 0.5647715681537553,
-      "grad_norm": 17.40388298034668,
-      "learning_rate": 4.1965351162767344e-06,
-      "loss": 2.883,
-      "step": 2630
-    },
-    {
-      "epoch": 0.5669189885649863,
-      "grad_norm": 8.201501846313477,
-      "learning_rate": 4.162223945472271e-06,
-      "loss": 2.3845,
-      "step": 2640
-    },
-    {
-      "epoch": 0.5690664089762173,
-      "grad_norm": 7.743041038513184,
-      "learning_rate": 4.1279533178721755e-06,
-      "loss": 1.6637,
-      "step": 2650
-    },
-    {
-      "epoch": 0.5712138293874484,
-      "grad_norm": 9.991127967834473,
-      "learning_rate": 4.093724891963882e-06,
-      "loss": 1.6559,
-      "step": 2660
-    },
-    {
-      "epoch": 0.5733612497986793,
-      "grad_norm": 7.750714302062988,
-      "learning_rate": 4.059540324192522e-06,
-      "loss": 2.4365,
-      "step": 2670
-    },
-    {
-      "epoch": 0.5755086702099104,
-      "grad_norm": 12.723278999328613,
-      "learning_rate": 4.025401268880762e-06,
-      "loss": 2.0321,
-      "step": 2680
-    },
-    {
-      "epoch": 0.5776560906211413,
-      "grad_norm": 9.74173641204834,
-      "learning_rate": 3.991309378148746e-06,
-      "loss": 2.1192,
-      "step": 2690
-    },
-    {
-      "epoch": 0.5798035110323724,
-      "grad_norm": 18.213069915771484,
-      "learning_rate": 3.957266301834145e-06,
-      "loss": 2.3723,
-      "step": 2700
-    },
-    {
-      "epoch": 0.5819509314436033,
-      "grad_norm": 8.864288330078125,
-      "learning_rate": 3.923273687412313e-06,
-      "loss": 2.8205,
-      "step": 2710
-    },
-    {
-      "epoch": 0.5840983518548344,
-      "grad_norm": 8.741086959838867,
-      "learning_rate": 3.889333179916552e-06,
-      "loss": 2.0988,
-      "step": 2720
-    },
-    {
-      "epoch": 0.5862457722660653,
-      "grad_norm": 9.561637878417969,
-      "learning_rate": 3.855446421858517e-06,
-      "loss": 1.9501,
-      "step": 2730
-    },
-    {
-      "epoch": 0.5883931926772964,
-      "grad_norm": 8.792778015136719,
-      "learning_rate": 3.821615053148717e-06,
-      "loss": 1.2542,
-      "step": 2740
-    },
-    {
-      "epoch": 0.5905406130885275,
-      "grad_norm": 8.091140747070312,
-      "learning_rate": 3.7878407110171646e-06,
-      "loss": 2.0234,
-      "step": 2750
-    },
-    {
-      "epoch": 0.5926880334997584,
-      "grad_norm": 9.446223258972168,
-      "learning_rate": 3.7541250299341243e-06,
-      "loss": 2.3535,
-      "step": 2760
-    },
-    {
-      "epoch": 0.5948354539109895,
-      "grad_norm": 7.457852363586426,
-      "learning_rate": 3.7204696415310377e-06,
-      "loss": 2.379,
-      "step": 2770
-    },
-    {
-      "epoch": 0.5969828743222204,
-      "grad_norm": 10.4889497756958,
-      "learning_rate": 3.6868761745215474e-06,
-      "loss": 2.455,
-      "step": 2780
-    },
-    {
-      "epoch": 0.5991302947334515,
-      "grad_norm": 7.388014316558838,
-      "learning_rate": 3.653346254622683e-06,
-      "loss": 1.6059,
-      "step": 2790
-    },
-    {
-      "epoch": 0.6012777151446824,
-      "grad_norm": 8.156157493591309,
-      "learning_rate": 3.6198815044761847e-06,
-      "loss": 2.8084,
-      "step": 2800
-    },
-    {
-      "epoch": 0.6034251355559135,
-      "grad_norm": 10.154397010803223,
-      "learning_rate": 3.586483543569977e-06,
-      "loss": 2.0311,
-      "step": 2810
-    },
-    {
-      "epoch": 0.6055725559671444,
-      "grad_norm": 9.225127220153809,
-      "learning_rate": 3.5531539881597967e-06,
-      "loss": 2.4153,
-      "step": 2820
-    },
-    {
-      "epoch": 0.6077199763783755,
-      "grad_norm": 17.420804977416992,
-      "learning_rate": 3.519894451190976e-06,
-      "loss": 2.7263,
-      "step": 2830
-    },
-    {
-      "epoch": 0.6098673967896064,
-      "grad_norm": 7.618463039398193,
-      "learning_rate": 3.4867065422203885e-06,
-      "loss": 1.9863,
-      "step": 2840
-    },
-    {
-      "epoch": 0.6120148172008375,
-      "grad_norm": 7.851476192474365,
-      "learning_rate": 3.4535918673385456e-06,
-      "loss": 1.6099,
-      "step": 2850
-    },
-    {
-      "epoch": 0.6141622376120685,
-      "grad_norm": 17.60761260986328,
-      "learning_rate": 3.420552029091886e-06,
-      "loss": 2.0744,
-      "step": 2860
-    },
-    {
-      "epoch": 0.6163096580232995,
-      "grad_norm": 7.814233779907227,
-      "learning_rate": 3.3875886264052155e-06,
-      "loss": 1.9523,
-      "step": 2870
-    },
-    {
-      "epoch": 0.6184570784345306,
-      "grad_norm": 8.906336784362793,
-      "learning_rate": 3.354703254504328e-06,
-      "loss": 2.4133,
-      "step": 2880
-    },
-    {
-      "epoch": 0.6206044988457615,
-      "grad_norm": 10.231346130371094,
-      "learning_rate": 3.3218975048388067e-06,
-      "loss": 1.644,
-      "step": 2890
-    },
-    {
-      "epoch": 0.6227519192569926,
-      "grad_norm": 8.996302604675293,
-      "learning_rate": 3.2891729650050096e-06,
-      "loss": 2.3329,
-      "step": 2900
-    },
-    {
-      "epoch": 0.6248993396682235,
-      "grad_norm": 8.609686851501465,
-      "learning_rate": 3.2565312186692403e-06,
-      "loss": 1.6096,
-      "step": 2910
-    },
-    {
-      "epoch": 0.6270467600794546,
-      "grad_norm": 7.7054338455200195,
-      "learning_rate": 3.2239738454911057e-06,
-      "loss": 2.4928,
-      "step": 2920
-    },
-    {
-      "epoch": 0.6291941804906855,
-      "grad_norm": 7.8000383377075195,
-      "learning_rate": 3.191502421047068e-06,
-      "loss": 2.0236,
-      "step": 2930
-    },
-    {
-      "epoch": 0.6313416009019166,
-      "grad_norm": 17.71788215637207,
-      "learning_rate": 3.1591185167542047e-06,
-      "loss": 2.0003,
-      "step": 2940
-    },
-    {
-      "epoch": 0.6334890213131475,
-      "grad_norm": 10.549698829650879,
-      "learning_rate": 3.1268236997941535e-06,
-      "loss": 1.2635,
-      "step": 2950
-    },
-    {
-      "epoch": 0.6356364417243786,
-      "grad_norm": 18.817182540893555,
-      "learning_rate": 3.0946195330372754e-06,
-      "loss": 3.523,
-      "step": 2960
-    },
-    {
-      "epoch": 0.6377838621356096,
-      "grad_norm": 7.883931636810303,
-      "learning_rate": 3.0625075749670195e-06,
-      "loss": 2.2924,
-      "step": 2970
-    },
-    {
-      "epoch": 0.6399312825468406,
-      "grad_norm": 8.429082870483398,
-      "learning_rate": 3.0304893796044988e-06,
-      "loss": 1.6421,
-      "step": 2980
-    },
-    {
-      "epoch": 0.6420787029580716,
-      "grad_norm": 8.13762092590332,
-      "learning_rate": 2.9985664964332904e-06,
-      "loss": 1.9695,
-      "step": 2990
-    },
-    {
-      "epoch": 0.6442261233693026,
-      "grad_norm": 16.98133659362793,
-      "learning_rate": 2.966740470324451e-06,
-      "loss": 3.2073,
-      "step": 3000
-    },
-    {
-      "epoch": 0.6463735437805337,
-      "grad_norm": 8.005885124206543,
-      "learning_rate": 2.9350128414617483e-06,
-      "loss": 2.0849,
-      "step": 3010
-    },
-    {
-      "epoch": 0.6485209641917646,
-      "grad_norm": 18.396974563598633,
-      "learning_rate": 2.903385145267129e-06,
-      "loss": 2.4081,
-      "step": 3020
-    },
-    {
-      "epoch": 0.6506683846029957,
-      "grad_norm": 11.773588180541992,
-      "learning_rate": 2.871858912326414e-06,
-      "loss": 1.6188,
-      "step": 3030
-    },
-    {
-      "epoch": 0.6528158050142266,
-      "grad_norm": 12.255626678466797,
-      "learning_rate": 2.8404356683152256e-06,
-      "loss": 2.4209,
-      "step": 3040
-    },
-    {
-      "epoch": 0.6549632254254577,
-      "grad_norm": 7.566231727600098,
-      "learning_rate": 2.8091169339251644e-06,
-      "loss": 1.6243,
-      "step": 3050
-    },
-    {
-      "epoch": 0.6571106458366887,
-      "grad_norm": 8.969308853149414,
-      "learning_rate": 2.777904224790197e-06,
-      "loss": 1.671,
-      "step": 3060
-    },
-    {
-      "epoch": 0.6592580662479197,
-      "grad_norm": 7.9166364669799805,
-      "learning_rate": 2.746799051413325e-06,
-      "loss": 2.461,
-      "step": 3070
-    },
-    {
-      "epoch": 0.6614054866591507,
-      "grad_norm": 16.861021041870117,
-      "learning_rate": 2.715802919093484e-06,
-      "loss": 1.9931,
-      "step": 3080
-    },
-    {
-      "epoch": 0.6635529070703817,
-      "grad_norm": 18.910966873168945,
-      "learning_rate": 2.684917327852691e-06,
-      "loss": 2.483,
-      "step": 3090
-    },
-    {
-      "epoch": 0.6657003274816127,
-      "grad_norm": 7.5983686447143555,
-      "learning_rate": 2.654143772363455e-06,
-      "loss": 1.9728,
-      "step": 3100
-    },
-    {
-      "epoch": 0.6678477478928437,
-      "grad_norm": 8.704245567321777,
-      "learning_rate": 2.623483741876443e-06,
-      "loss": 1.5872,
-      "step": 3110
-    },
-    {
-      "epoch": 0.6699951683040747,
-      "grad_norm": 10.111562728881836,
-      "learning_rate": 2.5929387201484133e-06,
-      "loss": 1.5902,
-      "step": 3120
-    },
-    {
-      "epoch": 0.6721425887153057,
-      "grad_norm": 18.483325958251953,
-      "learning_rate": 2.562510185370407e-06,
-      "loss": 2.751,
-      "step": 3130
-    },
-    {
-      "epoch": 0.6742900091265367,
-      "grad_norm": 7.957283020019531,
-      "learning_rate": 2.5321996100962163e-06,
-      "loss": 2.3621,
-      "step": 3140
-    },
-    {
-      "epoch": 0.6764374295377678,
-      "grad_norm": 7.387941837310791,
-      "learning_rate": 2.502008461171114e-06,
-      "loss": 1.5594,
-      "step": 3150
-    },
-    {
-      "epoch": 0.6785848499489988,
-      "grad_norm": 8.454072952270508,
-      "learning_rate": 2.4719381996608748e-06,
-      "loss": 3.2272,
-      "step": 3160
-    },
-    {
-      "epoch": 0.6807322703602298,
-      "grad_norm": 8.459441184997559,
-      "learning_rate": 2.4419902807810707e-06,
-      "loss": 1.5914,
-      "step": 3170
-    },
-    {
-      "epoch": 0.6828796907714608,
-      "grad_norm": 8.403077125549316,
-      "learning_rate": 2.412166153826639e-06,
-      "loss": 1.6618,
-      "step": 3180
-    },
-    {
-      "epoch": 0.6850271111826918,
-      "grad_norm": 8.397653579711914,
-      "learning_rate": 2.382467262101751e-06,
-      "loss": 1.6576,
-      "step": 3190
-    },
-    {
-      "epoch": 0.6871745315939228,
-      "grad_norm": 8.854833602905273,
-      "learning_rate": 2.352895042849965e-06,
-      "loss": 2.0303,
-      "step": 3200
-    },
-    {
-      "epoch": 0.6893219520051538,
-      "grad_norm": 17.544239044189453,
-      "learning_rate": 2.3234509271846683e-06,
-      "loss": 3.0549,
-      "step": 3210
-    },
-    {
-      "epoch": 0.6914693724163848,
-      "grad_norm": 20.67961883544922,
-      "learning_rate": 2.294136340019826e-06,
-      "loss": 2.3566,
-      "step": 3220
-    },
-    {
-      "epoch": 0.6936167928276158,
-      "grad_norm": 18.9661922454834,
-      "learning_rate": 2.264952700001022e-06,
-      "loss": 2.746,
-      "step": 3230
-    },
-    {
-      "epoch": 0.6957642132388469,
-      "grad_norm": 7.390661716461182,
-      "learning_rate": 2.2359014194367986e-06,
-      "loss": 1.971,
-      "step": 3240
-    },
-    {
-      "epoch": 0.6979116336500778,
-      "grad_norm": 6.957170486450195,
-      "learning_rate": 2.20698390423032e-06,
-      "loss": 1.2324,
-      "step": 3250
-    },
-    {
-      "epoch": 0.7000590540613089,
-      "grad_norm": 17.50217628479004,
-      "learning_rate": 2.17820155381133e-06,
-      "loss": 2.3776,
-      "step": 3260
-    },
-    {
-      "epoch": 0.7022064744725398,
-      "grad_norm": 8.914453506469727,
-      "learning_rate": 2.14955576106843e-06,
-      "loss": 2.3974,
-      "step": 3270
-    },
-    {
-      "epoch": 0.7043538948837709,
-      "grad_norm": 18.855152130126953,
-      "learning_rate": 2.1210479122816646e-06,
-      "loss": 1.6601,
-      "step": 3280
-    },
-    {
-      "epoch": 0.7065013152950019,
-      "grad_norm": 18.482494354248047,
-      "learning_rate": 2.0926793870554457e-06,
-      "loss": 1.984,
-      "step": 3290
-    },
-    {
-      "epoch": 0.7086487357062329,
-      "grad_norm": 8.10519027709961,
-      "learning_rate": 2.0644515582517803e-06,
-      "loss": 1.9964,
-      "step": 3300
-    },
-    {
-      "epoch": 0.7107961561174639,
-      "grad_norm": 19.076732635498047,
-      "learning_rate": 2.0363657919238357e-06,
-      "loss": 2.7817,
-      "step": 3310
-    },
-    {
-      "epoch": 0.7129435765286949,
-      "grad_norm": 18.62787628173828,
-      "learning_rate": 2.0084234472498274e-06,
-      "loss": 2.3052,
-      "step": 3320
-    },
-    {
-      "epoch": 0.715090996939926,
-      "grad_norm": 8.185526847839355,
-      "learning_rate": 1.9806258764672488e-06,
-      "loss": 2.2661,
-      "step": 3330
-    },
-    {
-      "epoch": 0.7172384173511569,
-      "grad_norm": 8.022353172302246,
-      "learning_rate": 1.952974424807425e-06,
-      "loss": 2.0014,
-      "step": 3340
-    },
-    {
-      "epoch": 0.719385837762388,
-      "grad_norm": 10.178295135498047,
-      "learning_rate": 1.9254704304304174e-06,
-      "loss": 1.5894,
-      "step": 3350
-    },
-    {
-      "epoch": 0.7215332581736189,
-      "grad_norm": 8.717124938964844,
-      "learning_rate": 1.898115224360263e-06,
-      "loss": 1.1596,
-      "step": 3360
-    },
-    {
-      "epoch": 0.72368067858485,
-      "grad_norm": 8.687152862548828,
-      "learning_rate": 1.870910130420555e-06,
-      "loss": 2.0217,
-      "step": 3370
-    },
-    {
-      "epoch": 0.7258280989960809,
-      "grad_norm": 8.655681610107422,
-      "learning_rate": 1.84385646517039e-06,
-      "loss": 1.557,
-      "step": 3380
-    },
-    {
-      "epoch": 0.727975519407312,
-      "grad_norm": 18.08987808227539,
-      "learning_rate": 1.8169555378406456e-06,
-      "loss": 2.3207,
-      "step": 3390
-    },
-    {
-      "epoch": 0.7301229398185429,
-      "grad_norm": 8.34890079498291,
-      "learning_rate": 1.7902086502706256e-06,
-      "loss": 1.9965,
-      "step": 3400
-    },
-    {
-      "epoch": 0.732270360229774,
-      "grad_norm": 7.2736921310424805,
-      "learning_rate": 1.7636170968450533e-06,
-      "loss": 1.5618,
-      "step": 3410
-    },
-    {
-      "epoch": 0.734417780641005,
-      "grad_norm": 7.780071258544922,
-      "learning_rate": 1.7371821644314392e-06,
-      "loss": 2.3343,
-      "step": 3420
-    },
-    {
-      "epoch": 0.736565201052236,
-      "grad_norm": 8.583059310913086,
-      "learning_rate": 1.710905132317801e-06,
-      "loss": 2.7489,
-      "step": 3430
-    },
-    {
-      "epoch": 0.7387126214634671,
-      "grad_norm": 8.453703880310059,
-      "learning_rate": 1.6847872721507525e-06,
-      "loss": 2.0472,
-      "step": 3440
-    },
-    {
-      "epoch": 0.740860041874698,
-      "grad_norm": 8.093005180358887,
-      "learning_rate": 1.658829847873965e-06,
-      "loss": 1.6072,
-      "step": 3450
-    },
-    {
-      "epoch": 0.7430074622859291,
-      "grad_norm": 17.80980110168457,
-      "learning_rate": 1.633034115667001e-06,
-      "loss": 1.9875,
-      "step": 3460
-    },
-    {
-      "epoch": 0.74515488269716,
-      "grad_norm": 19.054296493530273,
-      "learning_rate": 1.6074013238845214e-06,
-      "loss": 2.7245,
-      "step": 3470
-    },
-    {
-      "epoch": 0.7473023031083911,
-      "grad_norm": 7.945253372192383,
-      "learning_rate": 1.5819327129958762e-06,
-      "loss": 2.3126,
-      "step": 3480
-    },
-    {
-      "epoch": 0.749449723519622,
-      "grad_norm": 8.40179443359375,
-      "learning_rate": 1.5566295155250644e-06,
-      "loss": 2.3687,
-      "step": 3490
-    },
-    {
-      "epoch": 0.7515971439308531,
-      "grad_norm": 8.832183837890625,
-      "learning_rate": 1.5314929559910985e-06,
-      "loss": 1.6223,
-      "step": 3500
-    },
-    {
-      "epoch": 0.753744564342084,
-      "grad_norm": 9.67546558380127,
-      "learning_rate": 1.506524250848741e-06,
-      "loss": 2.3248,
-      "step": 3510
-    },
-    {
-      "epoch": 0.7558919847533151,
-      "grad_norm": 7.6303558349609375,
-      "learning_rate": 1.4817246084296327e-06,
-      "loss": 1.2378,
-      "step": 3520
-    },
-    {
-      "epoch": 0.758039405164546,
-      "grad_norm": 10.63131046295166,
-      "learning_rate": 1.457095228883822e-06,
-      "loss": 1.1994,
-      "step": 3530
-    },
-    {
-      "epoch": 0.7601868255757771,
-      "grad_norm": 11.350310325622559,
-      "learning_rate": 1.4326373041216774e-06,
-      "loss": 1.5317,
-      "step": 3540
-    },
-    {
-      "epoch": 0.762334245987008,
-      "grad_norm": 9.425748825073242,
-      "learning_rate": 1.4083520177562154e-06,
-      "loss": 1.5042,
-      "step": 3550
-    },
-    {
-      "epoch": 0.7644816663982391,
-      "grad_norm": 18.74260902404785,
-      "learning_rate": 1.3842405450458158e-06,
-      "loss": 2.3408,
-      "step": 3560
-    },
-    {
-      "epoch": 0.7666290868094702,
-      "grad_norm": 8.119706153869629,
-      "learning_rate": 1.3603040528373467e-06,
-      "loss": 2.3104,
-      "step": 3570
-    },
-    {
-      "epoch": 0.7687765072207011,
-      "grad_norm": 10.3068208694458,
-      "learning_rate": 1.336543699509698e-06,
-      "loss": 2.5002,
-      "step": 3580
-    },
-    {
-      "epoch": 0.7709239276319322,
-      "grad_norm": 17.977724075317383,
-      "learning_rate": 1.312960634917721e-06,
-      "loss": 2.6457,
-      "step": 3590
-    },
-    {
-      "epoch": 0.7730713480431631,
-      "grad_norm": 9.48388957977295,
-      "learning_rate": 1.2895560003365837e-06,
-      "loss": 1.9769,
-      "step": 3600
-    },
-    {
-      "epoch": 0.7752187684543942,
-      "grad_norm": 8.272026062011719,
-      "learning_rate": 1.2663309284065407e-06,
-      "loss": 2.9297,
-      "step": 3610
-    },
-    {
-      "epoch": 0.7773661888656251,
-      "grad_norm": 8.88364315032959,
-      "learning_rate": 1.2432865430781166e-06,
-      "loss": 1.6369,
-      "step": 3620
-    },
-    {
-      "epoch": 0.7795136092768562,
-      "grad_norm": 7.110147476196289,
-      "learning_rate": 1.2204239595577195e-06,
-      "loss": 1.5633,
-      "step": 3630
-    },
-    {
-      "epoch": 0.7816610296880872,
-      "grad_norm": 8.931872367858887,
-      "learning_rate": 1.1977442842536685e-06,
-      "loss": 1.5733,
-      "step": 3640
-    },
-    {
-      "epoch": 0.7838084500993182,
-      "grad_norm": 8.303011894226074,
-      "learning_rate": 1.1752486147226505e-06,
-      "loss": 1.9338,
-      "step": 3650
-    },
-    {
-      "epoch": 0.7859558705105492,
-      "grad_norm": 7.458680629730225,
-      "learning_rate": 1.1529380396166074e-06,
-      "loss": 3.1121,
-      "step": 3660
-    },
-    {
-      "epoch": 0.7881032909217802,
-      "grad_norm": 7.7846598625183105,
-      "learning_rate": 1.1308136386300455e-06,
-      "loss": 1.5423,
-      "step": 3670
-    },
-    {
-      "epoch": 0.7902507113330112,
-      "grad_norm": 10.858473777770996,
-      "learning_rate": 1.1088764824477938e-06,
-      "loss": 1.2332,
-      "step": 3680
-    },
-    {
-      "epoch": 0.7923981317442422,
-      "grad_norm": 10.06197738647461,
-      "learning_rate": 1.0871276326931845e-06,
-      "loss": 1.9164,
-      "step": 3690
-    },
-    {
-      "epoch": 0.7945455521554733,
-      "grad_norm": 7.83908224105835,
-      "learning_rate": 1.0655681418766772e-06,
-      "loss": 1.9849,
-      "step": 3700
-    },
-    {
-      "epoch": 0.7966929725667042,
-      "grad_norm": 8.16726303100586,
-      "learning_rate": 1.0441990533449247e-06,
-      "loss": 1.6056,
-      "step": 3710
-    },
-    {
-      "epoch": 0.7988403929779353,
-      "grad_norm": 17.369827270507812,
-      "learning_rate": 1.0230214012302807e-06,
-      "loss": 2.7502,
-      "step": 3720
-    },
-    {
-      "epoch": 0.8009878133891662,
-      "grad_norm": 9.131800651550293,
-      "learning_rate": 1.0020362104007558e-06,
-      "loss": 1.246,
-      "step": 3730
-    },
-    {
-      "epoch": 0.8031352338003973,
-      "grad_norm": 17.537370681762695,
-      "learning_rate": 9.812444964104195e-07,
-      "loss": 1.5823,
-      "step": 3740
-    },
-    {
-      "epoch": 0.8052826542116283,
-      "grad_norm": 8.849536895751953,
-      "learning_rate": 9.60647265450249e-07,
-      "loss": 1.6186,
-      "step": 3750
-    },
-    {
-      "epoch": 0.8074300746228593,
-      "grad_norm": 8.354120254516602,
-      "learning_rate": 9.402455142994443e-07,
-      "loss": 2.3345,
-      "step": 3760
-    },
-    {
-      "epoch": 0.8095774950340903,
-      "grad_norm": 8.479084968566895,
-      "learning_rate": 9.200402302771843e-07,
-      "loss": 2.3736,
-      "step": 3770
-    },
-    {
-      "epoch": 0.8117249154453213,
-      "grad_norm": 8.048047065734863,
-      "learning_rate": 9.000323911948483e-07,
-      "loss": 1.6527,
-      "step": 3780
-    },
-    {
-      "epoch": 0.8138723358565523,
-      "grad_norm": 9.944975852966309,
-      "learning_rate": 8.802229653086975e-07,
-      "loss": 2.3705,
-      "step": 3790
-    },
-    {
-      "epoch": 0.8160197562677833,
-      "grad_norm": 19.067068099975586,
-      "learning_rate": 8.60612911273011e-07,
-      "loss": 2.3448,
-      "step": 3800
-    },
-    {
-      "epoch": 0.8181671766790143,
-      "grad_norm": 11.265616416931152,
-      "learning_rate": 8.412031780937025e-07,
-      "loss": 1.5598,
-      "step": 3810
-    },
-    {
-      "epoch": 0.8203145970902453,
-      "grad_norm": 7.994141101837158,
-      "learning_rate": 8.219947050823862e-07,
-      "loss": 1.5608,
-      "step": 3820
-    },
-    {
-      "epoch": 0.8224620175014764,
-      "grad_norm": 10.012679100036621,
-      "learning_rate": 8.029884218109246e-07,
-      "loss": 1.5939,
-      "step": 3830
-    },
-    {
-      "epoch": 0.8246094379127074,
-      "grad_norm": 7.568084716796875,
-      "learning_rate": 7.841852480664414e-07,
-      "loss": 1.5617,
-      "step": 3840
-    },
-    {
-      "epoch": 0.8267568583239384,
-      "grad_norm": 8.215276718139648,
-      "learning_rate": 7.655860938068071e-07,
-      "loss": 1.5861,
-      "step": 3850
-    },
-    {
-      "epoch": 0.8289042787351694,
-      "grad_norm": 9.24499797821045,
-      "learning_rate": 7.471918591166078e-07,
-      "loss": 2.6894,
-      "step": 3860
-    },
-    {
-      "epoch": 0.8310516991464004,
-      "grad_norm": 17.422555923461914,
-      "learning_rate": 7.290034341635838e-07,
-      "loss": 1.9686,
-      "step": 3870
-    },
-    {
-      "epoch": 0.8331991195576314,
-      "grad_norm": 17.70247459411621,
-      "learning_rate": 7.110216991555457e-07,
-      "loss": 2.3266,
-      "step": 3880
-    },
-    {
-      "epoch": 0.8353465399688624,
-      "grad_norm": 8.571028709411621,
-      "learning_rate": 6.932475242977899e-07,
-      "loss": 1.5555,
-      "step": 3890
-    },
-    {
-      "epoch": 0.8374939603800934,
-      "grad_norm": 8.366564750671387,
-      "learning_rate": 6.756817697509755e-07,
-      "loss": 1.5639,
-      "step": 3900
-    },
-    {
-      "epoch": 0.8396413807913244,
-      "grad_norm": 7.642975807189941,
-      "learning_rate": 6.583252855895012e-07,
-      "loss": 1.1573,
-      "step": 3910
-    },
-    {
-      "epoch": 0.8417888012025554,
-      "grad_norm": 9.669418334960938,
-      "learning_rate": 6.411789117603701e-07,
-      "loss": 2.3294,
-      "step": 3920
-    },
-    {
-      "epoch": 0.8439362216137865,
-      "grad_norm": 8.85915470123291,
-      "learning_rate": 6.242434780425333e-07,
-      "loss": 1.5266,
-      "step": 3930
-    },
-    {
-      "epoch": 0.8460836420250174,
-      "grad_norm": 8.256942749023438,
-      "learning_rate": 6.075198040067432e-07,
-      "loss": 1.5556,
-      "step": 3940
-    },
-    {
-      "epoch": 0.8482310624362485,
-      "grad_norm": 8.303900718688965,
-      "learning_rate": 5.910086989758862e-07,
-      "loss": 1.9915,
-      "step": 3950
-    },
-    {
-      "epoch": 0.8503784828474794,
-      "grad_norm": 8.628076553344727,
-      "learning_rate": 5.747109619858176e-07,
-      "loss": 1.2209,
-      "step": 3960
-    },
-    {
-      "epoch": 0.8525259032587105,
-      "grad_norm": 8.535489082336426,
-      "learning_rate": 5.586273817466891e-07,
-      "loss": 2.7191,
-      "step": 3970
-    },
-    {
-      "epoch": 0.8546733236699415,
-      "grad_norm": 10.173384666442871,
-      "learning_rate": 5.427587366047893e-07,
-      "loss": 2.3468,
-      "step": 3980
-    },
-    {
-      "epoch": 0.8568207440811725,
-      "grad_norm": 8.549752235412598,
-      "learning_rate": 5.27105794504868e-07,
-      "loss": 2.3646,
-      "step": 3990
-    },
-    {
-      "epoch": 0.8589681644924035,
-      "grad_norm": 9.579891204833984,
-      "learning_rate": 5.11669312952977e-07,
-      "loss": 1.54,
-      "step": 4000
-    },
-    {
-      "epoch": 0.8611155849036345,
-      "grad_norm": 8.296952247619629,
-      "learning_rate": 4.964500389798066e-07,
-      "loss": 1.9316,
-      "step": 4010
-    },
-    {
-      "epoch": 0.8632630053148656,
-      "grad_norm": 16.096107482910156,
-      "learning_rate": 4.814487091045405e-07,
-      "loss": 2.6136,
-      "step": 4020
-    },
-    {
-      "epoch": 0.8654104257260965,
-      "grad_norm": 8.543082237243652,
-      "learning_rate": 4.666660492992092e-07,
-      "loss": 1.5991,
-      "step": 4030
-    },
-    {
-      "epoch": 0.8675578461373276,
-      "grad_norm": 7.935061931610107,
-      "learning_rate": 4.521027749535578e-07,
-      "loss": 1.921,
-      "step": 4040
-    },
-    {
-      "epoch": 0.8697052665485585,
-      "grad_norm": 9.394523620605469,
-      "learning_rate": 4.377595908404225e-07,
-      "loss": 1.5151,
-      "step": 4050
-    },
-    {
-      "epoch": 0.8718526869597896,
-      "grad_norm": 8.564217567443848,
-      "learning_rate": 4.2363719108163113e-07,
-      "loss": 1.1554,
-      "step": 4060
-    },
-    {
-      "epoch": 0.8740001073710205,
-      "grad_norm": 10.94825267791748,
-      "learning_rate": 4.097362591144055e-07,
-      "loss": 1.1995,
-      "step": 4070
-    },
-    {
-      "epoch": 0.8761475277822516,
-      "grad_norm": 17.23007583618164,
-      "learning_rate": 3.960574676582901e-07,
-      "loss": 2.391,
-      "step": 4080
-    },
-    {
-      "epoch": 0.8782949481934825,
-      "grad_norm": 9.983611106872559,
-      "learning_rate": 3.8260147868259713e-07,
-      "loss": 3.2049,
-      "step": 4090
-    },
-    {
-      "epoch": 0.8804423686047136,
-      "grad_norm": 7.7599921226501465,
-      "learning_rate": 3.693689433743658e-07,
-      "loss": 2.2062,
-      "step": 4100
-    },
-    {
-      "epoch": 0.8825897890159446,
-      "grad_norm": 8.449335098266602,
-      "learning_rate": 3.563605021068578e-07,
-      "loss": 1.9016,
-      "step": 4110
-    },
-    {
-      "epoch": 0.8847372094271756,
-      "grad_norm": 18.595163345336914,
-      "learning_rate": 3.4357678440856136e-07,
-      "loss": 2.035,
-      "step": 4120
-    },
-    {
-      "epoch": 0.8868846298384067,
-      "grad_norm": 8.655158996582031,
-      "learning_rate": 3.3101840893272786e-07,
-      "loss": 1.6087,
-      "step": 4130
-    },
-    {
-      "epoch": 0.8890320502496376,
-      "grad_norm": 8.49612045288086,
-      "learning_rate": 3.186859834274292e-07,
-      "loss": 1.5791,
-      "step": 4140
-    },
-    {
-      "epoch": 0.8911794706608687,
-      "grad_norm": 8.00724983215332,
-      "learning_rate": 3.065801047061517e-07,
-      "loss": 1.2228,
-      "step": 4150
-    },
-    {
-      "epoch": 0.8933268910720996,
-      "grad_norm": 18.299640655517578,
-      "learning_rate": 2.947013586189124e-07,
-      "loss": 2.3075,
-      "step": 4160
-    },
-    {
-      "epoch": 0.8954743114833307,
-      "grad_norm": 9.55125904083252,
-      "learning_rate": 2.83050320023906e-07,
-      "loss": 2.684,
-      "step": 4170
-    },
-    {
-      "epoch": 0.8976217318945616,
-      "grad_norm": 9.116815567016602,
-      "learning_rate": 2.7162755275968513e-07,
-      "loss": 1.9426,
-      "step": 4180
-    },
-    {
-      "epoch": 0.8997691523057927,
-      "grad_norm": 12.08741283416748,
-      "learning_rate": 2.604336096178767e-07,
-      "loss": 1.9566,
-      "step": 4190
-    },
-    {
-      "epoch": 0.9019165727170236,
-      "grad_norm": 17.83538818359375,
-      "learning_rate": 2.4946903231642727e-07,
-      "loss": 2.3625,
-      "step": 4200
-    },
-    {
-      "epoch": 0.9040639931282547,
-      "grad_norm": 8.699211120605469,
-      "learning_rate": 2.3873435147338975e-07,
-      "loss": 1.6144,
-      "step": 4210
-    },
-    {
-      "epoch": 0.9062114135394856,
-      "grad_norm": 11.926467895507812,
-      "learning_rate": 2.2823008658124425e-07,
-      "loss": 1.6301,
-      "step": 4220
-    },
-    {
-      "epoch": 0.9083588339507167,
-      "grad_norm": 18.642906188964844,
-      "learning_rate": 2.1795674598175277e-07,
-      "loss": 2.007,
-      "step": 4230
-    },
-    {
-      "epoch": 0.9105062543619478,
-      "grad_norm": 8.510333061218262,
-      "learning_rate": 2.0791482684136833e-07,
-      "loss": 2.3849,
-      "step": 4240
-    },
-    {
-      "epoch": 0.9126536747731787,
-      "grad_norm": 8.797530174255371,
-      "learning_rate": 1.9810481512716638e-07,
-      "loss": 2.2797,
-      "step": 4250
-    },
-    {
-      "epoch": 0.9148010951844098,
-      "grad_norm": 9.010210990905762,
-      "learning_rate": 1.885271855833315e-07,
-      "loss": 1.9205,
-      "step": 4260
-    },
-    {
-      "epoch": 0.9169485155956407,
-      "grad_norm": 6.828660488128662,
-      "learning_rate": 1.7918240170818225e-07,
-      "loss": 1.9204,
-      "step": 4270
-    },
-    {
-      "epoch": 0.9190959360068718,
-      "grad_norm": 17.962793350219727,
-      "learning_rate": 1.7007091573173818e-07,
-      "loss": 2.3651,
-      "step": 4280
-    },
-    {
-      "epoch": 0.9212433564181027,
-      "grad_norm": 11.682183265686035,
-      "learning_rate": 1.6119316859383837e-07,
-      "loss": 1.2778,
-      "step": 4290
-    },
-    {
-      "epoch": 0.9233907768293338,
-      "grad_norm": 7.454334259033203,
-      "learning_rate": 1.5254958992280022e-07,
-      "loss": 2.2826,
-      "step": 4300
-    },
-    {
-      "epoch": 0.9255381972405647,
-      "grad_norm": 10.836345672607422,
-      "learning_rate": 1.4414059801462709e-07,
-      "loss": 1.9795,
-      "step": 4310
-    },
-    {
-      "epoch": 0.9276856176517958,
-      "grad_norm": 17.844743728637695,
-      "learning_rate": 1.3596659981277016e-07,
-      "loss": 2.6067,
-      "step": 4320
-    },
-    {
-      "epoch": 0.9298330380630268,
-      "grad_norm": 7.685535907745361,
-      "learning_rate": 1.280279908884291e-07,
-      "loss": 1.5424,
-      "step": 4330
-    },
-    {
-      "epoch": 0.9319804584742578,
-      "grad_norm": 7.696123123168945,
-      "learning_rate": 1.2032515542141188e-07,
-      "loss": 1.5663,
-      "step": 4340
-    },
-    {
-      "epoch": 0.9341278788854888,
-      "grad_norm": 8.067780494689941,
-      "learning_rate": 1.128584661815435e-07,
-      "loss": 1.5158,
-      "step": 4350
-    },
-    {
-      "epoch": 0.9362752992967198,
-      "grad_norm": 7.884578227996826,
-      "learning_rate": 1.0562828451062323e-07,
-      "loss": 2.2029,
-      "step": 4360
-    },
-    {
-      "epoch": 0.9384227197079508,
-      "grad_norm": 9.749954223632812,
-      "learning_rate": 9.86349603049419e-08,
-      "loss": 1.566,
-      "step": 4370
-    },
-    {
-      "epoch": 0.9405701401191818,
-      "grad_norm": 9.595564842224121,
-      "learning_rate": 9.187883199834491e-08,
-      "loss": 1.5778,
-      "step": 4380
-    },
-    {
-      "epoch": 0.9427175605304129,
-      "grad_norm": 9.646600723266602,
-      "learning_rate": 8.536022654585752e-08,
-      "loss": 1.2144,
-      "step": 4390
-    },
-    {
-      "epoch": 0.9448649809416438,
-      "grad_norm": 10.880244255065918,
-      "learning_rate": 7.907945940786033e-08,
-      "loss": 2.6352,
-      "step": 4400
-    },
-    {
-      "epoch": 0.9470124013528749,
-      "grad_norm": 18.716659545898438,
-      "learning_rate": 7.303683453482368e-08,
-      "loss": 3.4305,
-      "step": 4410
-    },
-    {
-      "epoch": 0.9491598217641058,
-      "grad_norm": 9.710079193115234,
-      "learning_rate": 6.723264435259725e-08,
-      "loss": 2.7579,
-      "step": 4420
-    },
-    {
-      "epoch": 0.9513072421753369,
-      "grad_norm": 8.077021598815918,
-      "learning_rate": 6.16671697482607e-08,
-      "loss": 1.9376,
-      "step": 4430
-    },
-    {
-      "epoch": 0.9534546625865679,
-      "grad_norm": 10.034996032714844,
-      "learning_rate": 5.634068005652804e-08,
-      "loss": 2.3081,
-      "step": 4440
-    },
-    {
-      "epoch": 0.9556020829977989,
-      "grad_norm": 10.336363792419434,
-      "learning_rate": 5.125343304671459e-08,
-      "loss": 2.3248,
-      "step": 4450
-    },
-    {
-      "epoch": 0.9577495034090299,
-      "grad_norm": 8.281291007995605,
-      "learning_rate": 4.640567491026316e-08,
-      "loss": 1.596,
-      "step": 4460
-    },
-    {
-      "epoch": 0.9598969238202609,
-      "grad_norm": 9.9288969039917,
-      "learning_rate": 4.179764024882793e-08,
-      "loss": 1.983,
-      "step": 4470
-    },
-    {
-      "epoch": 0.9620443442314919,
-      "grad_norm": 8.1442289352417,
-      "learning_rate": 3.7429552062923644e-08,
-      "loss": 1.4951,
-      "step": 4480
-    },
-    {
-      "epoch": 0.9641917646427229,
-      "grad_norm": 9.167194366455078,
-      "learning_rate": 3.330162174113249e-08,
-      "loss": 1.8853,
-      "step": 4490
-    },
-    {
-      "epoch": 0.9663391850539539,
-      "grad_norm": 8.423112869262695,
-      "learning_rate": 2.9414049049872883e-08,
-      "loss": 1.8813,
-      "step": 4500
-    },
-    {
-      "epoch": 0.968486605465185,
-      "grad_norm": 9.233404159545898,
-      "learning_rate": 2.5767022123734942e-08,
-      "loss": 1.9462,
-      "step": 4510
-    },
-    {
-      "epoch": 0.970634025876416,
-      "grad_norm": 18.79878044128418,
-      "learning_rate": 2.236071745637336e-08,
-      "loss": 2.2862,
-      "step": 4520
-    },
-    {
-      "epoch": 0.972781446287647,
-      "grad_norm": 7.818319797515869,
-      "learning_rate": 1.9195299891968667e-08,
-      "loss": 1.5587,
-      "step": 4530
-    },
-    {
-      "epoch": 0.974928866698878,
-      "grad_norm": 14.589919090270996,
-      "learning_rate": 1.627092261724583e-08,
-      "loss": 2.3639,
-      "step": 4540
-    },
-    {
-      "epoch": 0.977076287110109,
-      "grad_norm": 8.41909408569336,
-      "learning_rate": 1.35877271540652e-08,
-      "loss": 2.2179,
-      "step": 4550
-    },
-    {
-      "epoch": 0.97922370752134,
-      "grad_norm": 18.559860229492188,
-      "learning_rate": 1.1145843352572406e-08,
-      "loss": 2.0184,
-      "step": 4560
-    },
-    {
-      "epoch": 0.981371127932571,
-      "grad_norm": 11.833897590637207,
-      "learning_rate": 8.945389384912851e-09,
-      "loss": 1.9478,
-      "step": 4570
-    },
-    {
-      "epoch": 0.983518548343802,
-      "grad_norm": 19.6647891998291,
-      "learning_rate": 6.986471739513501e-09,
-      "loss": 1.9924,
-      "step": 4580
-    },
-    {
-      "epoch": 0.985665968755033,
-      "grad_norm": 7.491504669189453,
-      "learning_rate": 5.269185215931449e-09,
-      "loss": 1.2046,
-      "step": 4590
-    },
-    {
-      "epoch": 0.987813389166264,
-      "grad_norm": 8.722803115844727,
-      "learning_rate": 3.7936129202648106e-09,
-      "loss": 1.5883,
-      "step": 4600
-    },
-    {
-      "epoch": 0.989960809577495,
-      "grad_norm": 8.63818073272705,
-      "learning_rate": 2.5598262611298275e-09,
-      "loss": 1.2219,
-      "step": 4610
-    },
-    {
-      "epoch": 0.992108229988726,
-      "grad_norm": 9.115127563476562,
-      "learning_rate": 1.5678849462058554e-09,
-      "loss": 2.692,
-      "step": 4620
-    },
-    {
-      "epoch": 0.994255650399957,
-      "grad_norm": 18.569103240966797,
-      "learning_rate": 8.178369793482299e-10,
-      "loss": 2.3245,
-      "step": 4630
-    },
-    {
-      "epoch": 0.9964030708111881,
-      "grad_norm": 17.41649055480957,
-      "learning_rate": 3.097186582606826e-10,
-      "loss": 3.4339,
-      "step": 4640
-    },
-    {
-      "epoch": 0.9985504912224191,
-      "grad_norm": 7.883552551269531,
-      "learning_rate": 4.3554572743409463e-11,
-      "loss": 2.5974,
-      "step": 4650
-    },
-    {
-      "epoch": 0.9998389434691577,
-      "step": 4656,
-      "total_flos": 3.7794240074694525e+19,
-      "train_loss": 2.209910297516695,
-      "train_runtime": 29829.0342,
-      "train_samples_per_second": 9.991,
-      "train_steps_per_second": 0.156
-    }
-  ],
-  "logging_steps": 10,
-  "max_steps": 4656,
-  "num_input_tokens_seen": 0,
-  "num_train_epochs": 1,
-  "save_steps": 100000,
-  "stateful_callbacks": {
-    "TrainerControl": {
-      "args": {
-        "should_epoch_stop": false,
-        "should_evaluate": false,
-        "should_log": false,
-        "should_save": true,
-        "should_training_stop": true
-      },
-      "attributes": {}
-    }
-  },
-  "total_flos": 3.7794240074694525e+19,
-  "train_batch_size": 1,
-  "trial_name": null,
-  "trial_params": null
-}

QeoThinker-VGGT-Qwen25VL-7B-Vanilla/training_args.bin DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:046e63488d2ef3c054c8e281f6d7bdec5d13667b525e5e08db174b565281ce2d
-size 7352

QeoThinker-VGGT-Qwen25VL-7B-Vanilla/vocab.json DELETED Viewed

The diff for this file is too large to render. See raw diff