peppermint20 commited on
Commit
862ed98
·
verified ·
1 Parent(s): f3a2f76

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|regression|>": 151665,
20
+ "<|repo_name|>": 151663,
21
+ "<|video_pad|>": 151656,
22
+ "<|vision_end|>": 151653,
23
+ "<|vision_pad|>": 151654,
24
+ "<|vision_start|>": 151652
25
+ }
chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
config.json ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2_5_VLForConditionalGeneration"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 151643,
7
+ "eos_token_id": 151645,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 2048,
10
+ "image_token_id": 151655,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 11008,
13
+ "max_position_embeddings": 128000,
14
+ "max_window_layers": 70,
15
+ "model_type": "qwen2_5_vl",
16
+ "num_attention_heads": 16,
17
+ "num_hidden_layers": 36,
18
+ "num_key_value_heads": 2,
19
+ "rms_norm_eps": 1e-06,
20
+ "rope_scaling": {
21
+ "mrope_section": [
22
+ 16,
23
+ 24,
24
+ 24
25
+ ],
26
+ "rope_type": "default",
27
+ "type": "default"
28
+ },
29
+ "rope_theta": 1000000.0,
30
+ "sliding_window": 32768,
31
+ "text_config": {
32
+ "architectures": [
33
+ "Qwen2_5_VLForConditionalGeneration"
34
+ ],
35
+ "attention_dropout": 0.0,
36
+ "bos_token_id": 151643,
37
+ "eos_token_id": 151645,
38
+ "hidden_act": "silu",
39
+ "hidden_size": 2048,
40
+ "image_token_id": null,
41
+ "initializer_range": 0.02,
42
+ "intermediate_size": 11008,
43
+ "layer_types": [
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention",
51
+ "full_attention",
52
+ "full_attention",
53
+ "full_attention",
54
+ "full_attention",
55
+ "full_attention",
56
+ "full_attention",
57
+ "full_attention",
58
+ "full_attention",
59
+ "full_attention",
60
+ "full_attention",
61
+ "full_attention",
62
+ "full_attention",
63
+ "full_attention",
64
+ "full_attention",
65
+ "full_attention",
66
+ "full_attention",
67
+ "full_attention",
68
+ "full_attention",
69
+ "full_attention",
70
+ "full_attention",
71
+ "full_attention",
72
+ "full_attention",
73
+ "full_attention",
74
+ "full_attention",
75
+ "full_attention",
76
+ "full_attention",
77
+ "full_attention",
78
+ "full_attention",
79
+ "full_attention"
80
+ ],
81
+ "max_position_embeddings": 128000,
82
+ "max_window_layers": 70,
83
+ "model_type": "qwen2_5_vl_text",
84
+ "num_attention_heads": 16,
85
+ "num_hidden_layers": 36,
86
+ "num_key_value_heads": 2,
87
+ "rms_norm_eps": 1e-06,
88
+ "rope_scaling": {
89
+ "mrope_section": [
90
+ 16,
91
+ 24,
92
+ 24
93
+ ],
94
+ "rope_type": "default",
95
+ "type": "default"
96
+ },
97
+ "rope_theta": 1000000.0,
98
+ "sliding_window": null,
99
+ "tie_word_embeddings": true,
100
+ "torch_dtype": "float32",
101
+ "use_cache": false,
102
+ "use_sliding_window": false,
103
+ "video_token_id": null,
104
+ "vision_end_token_id": 151653,
105
+ "vision_start_token_id": 151652,
106
+ "vision_token_id": 151654,
107
+ "vocab_size": 151936
108
+ },
109
+ "torch_dtype": "bfloat16",
110
+ "transformers_version": "4.55.0",
111
+ "use_cache": false,
112
+ "use_sliding_window": false,
113
+ "video_token_id": 151656,
114
+ "vision_config": {
115
+ "depth": 32,
116
+ "fullatt_block_indexes": [
117
+ 7,
118
+ 15,
119
+ 23,
120
+ 31
121
+ ],
122
+ "hidden_act": "silu",
123
+ "hidden_size": 1280,
124
+ "in_channels": 3,
125
+ "in_chans": 3,
126
+ "initializer_range": 0.02,
127
+ "intermediate_size": 3420,
128
+ "model_type": "qwen2_5_vl",
129
+ "num_heads": 16,
130
+ "out_hidden_size": 2048,
131
+ "patch_size": 14,
132
+ "spatial_merge_size": 2,
133
+ "spatial_patch_size": 14,
134
+ "temporal_patch_size": 2,
135
+ "tokens_per_second": 2,
136
+ "torch_dtype": "float32",
137
+ "window_size": 112
138
+ },
139
+ "vision_end_token_id": 151653,
140
+ "vision_start_token_id": 151652,
141
+ "vision_token_id": 151654,
142
+ "vocab_size": 151936
143
+ }
generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.05,
10
+ "temperature": 1e-06,
11
+ "transformers_version": "4.55.0"
12
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": null,
3
+ "data_format": "channels_first",
4
+ "default_to_square": true,
5
+ "device": null,
6
+ "disable_grouping": null,
7
+ "do_center_crop": null,
8
+ "do_convert_rgb": true,
9
+ "do_normalize": true,
10
+ "do_rescale": true,
11
+ "do_resize": true,
12
+ "image_mean": [
13
+ 0.48145466,
14
+ 0.4578275,
15
+ 0.40821073
16
+ ],
17
+ "image_processor_type": "Qwen2VLImageProcessorFast",
18
+ "image_std": [
19
+ 0.26862954,
20
+ 0.26130258,
21
+ 0.27577711
22
+ ],
23
+ "input_data_format": null,
24
+ "max_pixels": 12845056,
25
+ "merge_size": 2,
26
+ "min_pixels": 3136,
27
+ "patch_size": 14,
28
+ "processor_class": "Qwen2_5_VLProcessor",
29
+ "resample": 3,
30
+ "rescale_factor": 0.00392156862745098,
31
+ "return_tensors": null,
32
+ "size": {
33
+ "longest_edge": 12845056,
34
+ "shortest_edge": 3136
35
+ },
36
+ "temporal_patch_size": 2
37
+ }
pytorch_model-00001-of-00002.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fcfb7f872f7c06c07727a4ec33498c57fb4d37372414d9ceb81a0d3dd2fe85e
3
+ size 4997892186
pytorch_model-00002-of-00002.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:447d31587a47346cb8072cfc30a6e29b3f239425ff4de02bc7cad22e4f743ff1
3
+ size 2511631122
pytorch_model.bin.index.json ADDED
@@ -0,0 +1,833 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 755712,
4
+ "total_size": 7509245952
5
+ },
6
+ "weight_map": {
7
+ "lm_head.weight": "pytorch_model-00001-of-00002.bin",
8
+ "model.embed_tokens.weight": "pytorch_model-00001-of-00002.bin",
9
+ "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
10
+ "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
11
+ "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
12
+ "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
13
+ "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
14
+ "model.layers.0.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
15
+ "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
16
+ "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
17
+ "model.layers.0.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
18
+ "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
19
+ "model.layers.0.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
20
+ "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
21
+ "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
22
+ "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
23
+ "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
24
+ "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
25
+ "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
26
+ "model.layers.1.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
27
+ "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
28
+ "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
29
+ "model.layers.1.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
30
+ "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
31
+ "model.layers.1.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
32
+ "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
33
+ "model.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
34
+ "model.layers.10.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
35
+ "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
36
+ "model.layers.10.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
37
+ "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
38
+ "model.layers.10.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
39
+ "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
40
+ "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
41
+ "model.layers.10.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
42
+ "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
43
+ "model.layers.10.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
44
+ "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
45
+ "model.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
46
+ "model.layers.11.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
47
+ "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
48
+ "model.layers.11.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
49
+ "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
50
+ "model.layers.11.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
51
+ "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
52
+ "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
53
+ "model.layers.11.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
54
+ "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
55
+ "model.layers.11.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
56
+ "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
57
+ "model.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
58
+ "model.layers.12.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
59
+ "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
60
+ "model.layers.12.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
61
+ "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
62
+ "model.layers.12.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
63
+ "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
64
+ "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
65
+ "model.layers.12.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
66
+ "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
67
+ "model.layers.12.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
68
+ "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
69
+ "model.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
70
+ "model.layers.13.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
71
+ "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
72
+ "model.layers.13.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
73
+ "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
74
+ "model.layers.13.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
75
+ "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
76
+ "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
77
+ "model.layers.13.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
78
+ "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
79
+ "model.layers.13.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
80
+ "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
81
+ "model.layers.14.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
82
+ "model.layers.14.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
83
+ "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
84
+ "model.layers.14.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
85
+ "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
86
+ "model.layers.14.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
87
+ "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
88
+ "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
89
+ "model.layers.14.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
90
+ "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
91
+ "model.layers.14.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
92
+ "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
93
+ "model.layers.15.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
94
+ "model.layers.15.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
95
+ "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
96
+ "model.layers.15.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
97
+ "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
98
+ "model.layers.15.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
99
+ "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
100
+ "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
101
+ "model.layers.15.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
102
+ "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
103
+ "model.layers.15.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
104
+ "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
105
+ "model.layers.16.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
106
+ "model.layers.16.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
107
+ "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
108
+ "model.layers.16.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
109
+ "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
110
+ "model.layers.16.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
111
+ "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
112
+ "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
113
+ "model.layers.16.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
114
+ "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
115
+ "model.layers.16.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
116
+ "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
117
+ "model.layers.17.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
118
+ "model.layers.17.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
119
+ "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
120
+ "model.layers.17.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
121
+ "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
122
+ "model.layers.17.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
123
+ "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
124
+ "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
125
+ "model.layers.17.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
126
+ "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
127
+ "model.layers.17.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
128
+ "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
129
+ "model.layers.18.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
130
+ "model.layers.18.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
131
+ "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
132
+ "model.layers.18.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
133
+ "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
134
+ "model.layers.18.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
135
+ "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
136
+ "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
137
+ "model.layers.18.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
138
+ "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
139
+ "model.layers.18.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
140
+ "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
141
+ "model.layers.19.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
142
+ "model.layers.19.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
143
+ "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
144
+ "model.layers.19.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
145
+ "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
146
+ "model.layers.19.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
147
+ "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
148
+ "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
149
+ "model.layers.19.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
150
+ "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
151
+ "model.layers.19.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
152
+ "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
153
+ "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
154
+ "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
155
+ "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
156
+ "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
157
+ "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
158
+ "model.layers.2.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
159
+ "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
160
+ "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
161
+ "model.layers.2.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
162
+ "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
163
+ "model.layers.2.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
164
+ "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
165
+ "model.layers.20.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
166
+ "model.layers.20.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
167
+ "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
168
+ "model.layers.20.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
169
+ "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
170
+ "model.layers.20.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
171
+ "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
172
+ "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
173
+ "model.layers.20.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
174
+ "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
175
+ "model.layers.20.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
176
+ "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
177
+ "model.layers.21.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
178
+ "model.layers.21.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
179
+ "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
180
+ "model.layers.21.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
181
+ "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
182
+ "model.layers.21.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
183
+ "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
184
+ "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
185
+ "model.layers.21.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
186
+ "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
187
+ "model.layers.21.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
188
+ "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
189
+ "model.layers.22.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
190
+ "model.layers.22.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
191
+ "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
192
+ "model.layers.22.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
193
+ "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
194
+ "model.layers.22.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
195
+ "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
196
+ "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
197
+ "model.layers.22.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
198
+ "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
199
+ "model.layers.22.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
200
+ "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
201
+ "model.layers.23.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
202
+ "model.layers.23.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
203
+ "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
204
+ "model.layers.23.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
205
+ "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
206
+ "model.layers.23.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
207
+ "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
208
+ "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
209
+ "model.layers.23.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
210
+ "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
211
+ "model.layers.23.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
212
+ "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
213
+ "model.layers.24.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
214
+ "model.layers.24.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
215
+ "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
216
+ "model.layers.24.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
217
+ "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
218
+ "model.layers.24.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
219
+ "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
220
+ "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
221
+ "model.layers.24.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
222
+ "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
223
+ "model.layers.24.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
224
+ "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
225
+ "model.layers.25.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
226
+ "model.layers.25.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
227
+ "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
228
+ "model.layers.25.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
229
+ "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
230
+ "model.layers.25.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
231
+ "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
232
+ "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
233
+ "model.layers.25.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
234
+ "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
235
+ "model.layers.25.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
236
+ "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
237
+ "model.layers.26.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
238
+ "model.layers.26.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
239
+ "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
240
+ "model.layers.26.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
241
+ "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
242
+ "model.layers.26.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
243
+ "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
244
+ "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
245
+ "model.layers.26.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
246
+ "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
247
+ "model.layers.26.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
248
+ "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
249
+ "model.layers.27.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
250
+ "model.layers.27.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
251
+ "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
252
+ "model.layers.27.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
253
+ "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
254
+ "model.layers.27.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
255
+ "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
256
+ "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
257
+ "model.layers.27.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
258
+ "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
259
+ "model.layers.27.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
260
+ "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
261
+ "model.layers.28.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
262
+ "model.layers.28.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
263
+ "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
264
+ "model.layers.28.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
265
+ "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
266
+ "model.layers.28.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
267
+ "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
268
+ "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
269
+ "model.layers.28.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
270
+ "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
271
+ "model.layers.28.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
272
+ "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
273
+ "model.layers.29.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
274
+ "model.layers.29.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
275
+ "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
276
+ "model.layers.29.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
277
+ "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
278
+ "model.layers.29.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
279
+ "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
280
+ "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
281
+ "model.layers.29.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
282
+ "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
283
+ "model.layers.29.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
284
+ "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
285
+ "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
286
+ "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
287
+ "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
288
+ "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
289
+ "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
290
+ "model.layers.3.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
291
+ "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
292
+ "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
293
+ "model.layers.3.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
294
+ "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
295
+ "model.layers.3.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
296
+ "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
297
+ "model.layers.30.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
298
+ "model.layers.30.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
299
+ "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
300
+ "model.layers.30.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
301
+ "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
302
+ "model.layers.30.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
303
+ "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
304
+ "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
305
+ "model.layers.30.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
306
+ "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
307
+ "model.layers.30.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
308
+ "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
309
+ "model.layers.31.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
310
+ "model.layers.31.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
311
+ "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
312
+ "model.layers.31.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
313
+ "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
314
+ "model.layers.31.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
315
+ "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
316
+ "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
317
+ "model.layers.31.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
318
+ "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
319
+ "model.layers.31.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
320
+ "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
321
+ "model.layers.32.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
322
+ "model.layers.32.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
323
+ "model.layers.32.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
324
+ "model.layers.32.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
325
+ "model.layers.32.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
326
+ "model.layers.32.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
327
+ "model.layers.32.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
328
+ "model.layers.32.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
329
+ "model.layers.32.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
330
+ "model.layers.32.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
331
+ "model.layers.32.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
332
+ "model.layers.32.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
333
+ "model.layers.33.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
334
+ "model.layers.33.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
335
+ "model.layers.33.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
336
+ "model.layers.33.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
337
+ "model.layers.33.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
338
+ "model.layers.33.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
339
+ "model.layers.33.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
340
+ "model.layers.33.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
341
+ "model.layers.33.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
342
+ "model.layers.33.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
343
+ "model.layers.33.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
344
+ "model.layers.33.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
345
+ "model.layers.34.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
346
+ "model.layers.34.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
347
+ "model.layers.34.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
348
+ "model.layers.34.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
349
+ "model.layers.34.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
350
+ "model.layers.34.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
351
+ "model.layers.34.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
352
+ "model.layers.34.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
353
+ "model.layers.34.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
354
+ "model.layers.34.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
355
+ "model.layers.34.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
356
+ "model.layers.34.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
357
+ "model.layers.35.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
358
+ "model.layers.35.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
359
+ "model.layers.35.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
360
+ "model.layers.35.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
361
+ "model.layers.35.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
362
+ "model.layers.35.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
363
+ "model.layers.35.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
364
+ "model.layers.35.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
365
+ "model.layers.35.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
366
+ "model.layers.35.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
367
+ "model.layers.35.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
368
+ "model.layers.35.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
369
+ "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
370
+ "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
371
+ "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
372
+ "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
373
+ "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
374
+ "model.layers.4.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
375
+ "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
376
+ "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
377
+ "model.layers.4.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
378
+ "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
379
+ "model.layers.4.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
380
+ "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
381
+ "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
382
+ "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
383
+ "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
384
+ "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
385
+ "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
386
+ "model.layers.5.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
387
+ "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
388
+ "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
389
+ "model.layers.5.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
390
+ "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
391
+ "model.layers.5.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
392
+ "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
393
+ "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
394
+ "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
395
+ "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
396
+ "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
397
+ "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
398
+ "model.layers.6.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
399
+ "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
400
+ "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
401
+ "model.layers.6.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
402
+ "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
403
+ "model.layers.6.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
404
+ "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
405
+ "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
406
+ "model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
407
+ "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
408
+ "model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
409
+ "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
410
+ "model.layers.7.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
411
+ "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
412
+ "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
413
+ "model.layers.7.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
414
+ "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
415
+ "model.layers.7.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
416
+ "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
417
+ "model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
418
+ "model.layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
419
+ "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
420
+ "model.layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
421
+ "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
422
+ "model.layers.8.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
423
+ "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
424
+ "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
425
+ "model.layers.8.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
426
+ "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
427
+ "model.layers.8.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
428
+ "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
429
+ "model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
430
+ "model.layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
431
+ "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
432
+ "model.layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
433
+ "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
434
+ "model.layers.9.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
435
+ "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
436
+ "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
437
+ "model.layers.9.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
438
+ "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
439
+ "model.layers.9.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
440
+ "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
441
+ "model.norm.weight": "pytorch_model-00002-of-00002.bin",
442
+ "visual.blocks.0.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
443
+ "visual.blocks.0.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
444
+ "visual.blocks.0.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
445
+ "visual.blocks.0.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
446
+ "visual.blocks.0.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
447
+ "visual.blocks.0.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
448
+ "visual.blocks.0.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
449
+ "visual.blocks.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
450
+ "visual.blocks.0.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
451
+ "visual.blocks.0.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
452
+ "visual.blocks.0.norm1.weight": "pytorch_model-00001-of-00002.bin",
453
+ "visual.blocks.0.norm2.weight": "pytorch_model-00001-of-00002.bin",
454
+ "visual.blocks.1.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
455
+ "visual.blocks.1.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
456
+ "visual.blocks.1.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
457
+ "visual.blocks.1.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
458
+ "visual.blocks.1.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
459
+ "visual.blocks.1.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
460
+ "visual.blocks.1.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
461
+ "visual.blocks.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
462
+ "visual.blocks.1.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
463
+ "visual.blocks.1.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
464
+ "visual.blocks.1.norm1.weight": "pytorch_model-00001-of-00002.bin",
465
+ "visual.blocks.1.norm2.weight": "pytorch_model-00001-of-00002.bin",
466
+ "visual.blocks.10.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
467
+ "visual.blocks.10.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
468
+ "visual.blocks.10.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
469
+ "visual.blocks.10.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
470
+ "visual.blocks.10.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
471
+ "visual.blocks.10.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
472
+ "visual.blocks.10.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
473
+ "visual.blocks.10.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
474
+ "visual.blocks.10.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
475
+ "visual.blocks.10.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
476
+ "visual.blocks.10.norm1.weight": "pytorch_model-00001-of-00002.bin",
477
+ "visual.blocks.10.norm2.weight": "pytorch_model-00001-of-00002.bin",
478
+ "visual.blocks.11.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
479
+ "visual.blocks.11.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
480
+ "visual.blocks.11.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
481
+ "visual.blocks.11.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
482
+ "visual.blocks.11.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
483
+ "visual.blocks.11.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
484
+ "visual.blocks.11.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
485
+ "visual.blocks.11.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
486
+ "visual.blocks.11.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
487
+ "visual.blocks.11.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
488
+ "visual.blocks.11.norm1.weight": "pytorch_model-00001-of-00002.bin",
489
+ "visual.blocks.11.norm2.weight": "pytorch_model-00001-of-00002.bin",
490
+ "visual.blocks.12.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
491
+ "visual.blocks.12.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
492
+ "visual.blocks.12.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
493
+ "visual.blocks.12.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
494
+ "visual.blocks.12.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
495
+ "visual.blocks.12.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
496
+ "visual.blocks.12.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
497
+ "visual.blocks.12.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
498
+ "visual.blocks.12.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
499
+ "visual.blocks.12.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
500
+ "visual.blocks.12.norm1.weight": "pytorch_model-00001-of-00002.bin",
501
+ "visual.blocks.12.norm2.weight": "pytorch_model-00001-of-00002.bin",
502
+ "visual.blocks.13.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
503
+ "visual.blocks.13.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
504
+ "visual.blocks.13.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
505
+ "visual.blocks.13.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
506
+ "visual.blocks.13.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
507
+ "visual.blocks.13.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
508
+ "visual.blocks.13.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
509
+ "visual.blocks.13.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
510
+ "visual.blocks.13.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
511
+ "visual.blocks.13.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
512
+ "visual.blocks.13.norm1.weight": "pytorch_model-00001-of-00002.bin",
513
+ "visual.blocks.13.norm2.weight": "pytorch_model-00001-of-00002.bin",
514
+ "visual.blocks.14.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
515
+ "visual.blocks.14.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
516
+ "visual.blocks.14.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
517
+ "visual.blocks.14.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
518
+ "visual.blocks.14.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
519
+ "visual.blocks.14.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
520
+ "visual.blocks.14.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
521
+ "visual.blocks.14.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
522
+ "visual.blocks.14.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
523
+ "visual.blocks.14.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
524
+ "visual.blocks.14.norm1.weight": "pytorch_model-00001-of-00002.bin",
525
+ "visual.blocks.14.norm2.weight": "pytorch_model-00001-of-00002.bin",
526
+ "visual.blocks.15.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
527
+ "visual.blocks.15.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
528
+ "visual.blocks.15.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
529
+ "visual.blocks.15.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
530
+ "visual.blocks.15.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
531
+ "visual.blocks.15.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
532
+ "visual.blocks.15.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
533
+ "visual.blocks.15.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
534
+ "visual.blocks.15.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
535
+ "visual.blocks.15.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
536
+ "visual.blocks.15.norm1.weight": "pytorch_model-00001-of-00002.bin",
537
+ "visual.blocks.15.norm2.weight": "pytorch_model-00001-of-00002.bin",
538
+ "visual.blocks.16.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
539
+ "visual.blocks.16.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
540
+ "visual.blocks.16.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
541
+ "visual.blocks.16.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
542
+ "visual.blocks.16.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
543
+ "visual.blocks.16.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
544
+ "visual.blocks.16.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
545
+ "visual.blocks.16.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
546
+ "visual.blocks.16.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
547
+ "visual.blocks.16.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
548
+ "visual.blocks.16.norm1.weight": "pytorch_model-00001-of-00002.bin",
549
+ "visual.blocks.16.norm2.weight": "pytorch_model-00001-of-00002.bin",
550
+ "visual.blocks.17.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
551
+ "visual.blocks.17.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
552
+ "visual.blocks.17.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
553
+ "visual.blocks.17.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
554
+ "visual.blocks.17.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
555
+ "visual.blocks.17.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
556
+ "visual.blocks.17.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
557
+ "visual.blocks.17.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
558
+ "visual.blocks.17.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
559
+ "visual.blocks.17.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
560
+ "visual.blocks.17.norm1.weight": "pytorch_model-00001-of-00002.bin",
561
+ "visual.blocks.17.norm2.weight": "pytorch_model-00001-of-00002.bin",
562
+ "visual.blocks.18.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
563
+ "visual.blocks.18.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
564
+ "visual.blocks.18.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
565
+ "visual.blocks.18.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
566
+ "visual.blocks.18.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
567
+ "visual.blocks.18.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
568
+ "visual.blocks.18.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
569
+ "visual.blocks.18.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
570
+ "visual.blocks.18.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
571
+ "visual.blocks.18.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
572
+ "visual.blocks.18.norm1.weight": "pytorch_model-00001-of-00002.bin",
573
+ "visual.blocks.18.norm2.weight": "pytorch_model-00001-of-00002.bin",
574
+ "visual.blocks.19.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
575
+ "visual.blocks.19.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
576
+ "visual.blocks.19.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
577
+ "visual.blocks.19.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
578
+ "visual.blocks.19.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
579
+ "visual.blocks.19.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
580
+ "visual.blocks.19.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
581
+ "visual.blocks.19.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
582
+ "visual.blocks.19.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
583
+ "visual.blocks.19.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
584
+ "visual.blocks.19.norm1.weight": "pytorch_model-00001-of-00002.bin",
585
+ "visual.blocks.19.norm2.weight": "pytorch_model-00001-of-00002.bin",
586
+ "visual.blocks.2.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
587
+ "visual.blocks.2.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
588
+ "visual.blocks.2.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
589
+ "visual.blocks.2.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
590
+ "visual.blocks.2.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
591
+ "visual.blocks.2.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
592
+ "visual.blocks.2.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
593
+ "visual.blocks.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
594
+ "visual.blocks.2.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
595
+ "visual.blocks.2.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
596
+ "visual.blocks.2.norm1.weight": "pytorch_model-00001-of-00002.bin",
597
+ "visual.blocks.2.norm2.weight": "pytorch_model-00001-of-00002.bin",
598
+ "visual.blocks.20.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
599
+ "visual.blocks.20.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
600
+ "visual.blocks.20.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
601
+ "visual.blocks.20.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
602
+ "visual.blocks.20.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
603
+ "visual.blocks.20.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
604
+ "visual.blocks.20.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
605
+ "visual.blocks.20.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
606
+ "visual.blocks.20.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
607
+ "visual.blocks.20.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
608
+ "visual.blocks.20.norm1.weight": "pytorch_model-00001-of-00002.bin",
609
+ "visual.blocks.20.norm2.weight": "pytorch_model-00001-of-00002.bin",
610
+ "visual.blocks.21.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
611
+ "visual.blocks.21.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
612
+ "visual.blocks.21.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
613
+ "visual.blocks.21.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
614
+ "visual.blocks.21.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
615
+ "visual.blocks.21.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
616
+ "visual.blocks.21.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
617
+ "visual.blocks.21.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
618
+ "visual.blocks.21.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
619
+ "visual.blocks.21.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
620
+ "visual.blocks.21.norm1.weight": "pytorch_model-00001-of-00002.bin",
621
+ "visual.blocks.21.norm2.weight": "pytorch_model-00001-of-00002.bin",
622
+ "visual.blocks.22.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
623
+ "visual.blocks.22.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
624
+ "visual.blocks.22.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
625
+ "visual.blocks.22.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
626
+ "visual.blocks.22.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
627
+ "visual.blocks.22.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
628
+ "visual.blocks.22.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
629
+ "visual.blocks.22.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
630
+ "visual.blocks.22.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
631
+ "visual.blocks.22.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
632
+ "visual.blocks.22.norm1.weight": "pytorch_model-00001-of-00002.bin",
633
+ "visual.blocks.22.norm2.weight": "pytorch_model-00001-of-00002.bin",
634
+ "visual.blocks.23.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
635
+ "visual.blocks.23.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
636
+ "visual.blocks.23.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
637
+ "visual.blocks.23.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
638
+ "visual.blocks.23.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
639
+ "visual.blocks.23.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
640
+ "visual.blocks.23.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
641
+ "visual.blocks.23.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
642
+ "visual.blocks.23.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
643
+ "visual.blocks.23.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
644
+ "visual.blocks.23.norm1.weight": "pytorch_model-00001-of-00002.bin",
645
+ "visual.blocks.23.norm2.weight": "pytorch_model-00001-of-00002.bin",
646
+ "visual.blocks.24.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
647
+ "visual.blocks.24.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
648
+ "visual.blocks.24.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
649
+ "visual.blocks.24.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
650
+ "visual.blocks.24.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
651
+ "visual.blocks.24.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
652
+ "visual.blocks.24.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
653
+ "visual.blocks.24.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
654
+ "visual.blocks.24.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
655
+ "visual.blocks.24.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
656
+ "visual.blocks.24.norm1.weight": "pytorch_model-00001-of-00002.bin",
657
+ "visual.blocks.24.norm2.weight": "pytorch_model-00001-of-00002.bin",
658
+ "visual.blocks.25.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
659
+ "visual.blocks.25.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
660
+ "visual.blocks.25.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
661
+ "visual.blocks.25.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
662
+ "visual.blocks.25.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
663
+ "visual.blocks.25.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
664
+ "visual.blocks.25.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
665
+ "visual.blocks.25.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
666
+ "visual.blocks.25.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
667
+ "visual.blocks.25.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
668
+ "visual.blocks.25.norm1.weight": "pytorch_model-00001-of-00002.bin",
669
+ "visual.blocks.25.norm2.weight": "pytorch_model-00001-of-00002.bin",
670
+ "visual.blocks.26.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
671
+ "visual.blocks.26.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
672
+ "visual.blocks.26.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
673
+ "visual.blocks.26.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
674
+ "visual.blocks.26.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
675
+ "visual.blocks.26.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
676
+ "visual.blocks.26.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
677
+ "visual.blocks.26.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
678
+ "visual.blocks.26.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
679
+ "visual.blocks.26.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
680
+ "visual.blocks.26.norm1.weight": "pytorch_model-00001-of-00002.bin",
681
+ "visual.blocks.26.norm2.weight": "pytorch_model-00001-of-00002.bin",
682
+ "visual.blocks.27.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
683
+ "visual.blocks.27.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
684
+ "visual.blocks.27.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
685
+ "visual.blocks.27.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
686
+ "visual.blocks.27.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
687
+ "visual.blocks.27.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
688
+ "visual.blocks.27.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
689
+ "visual.blocks.27.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
690
+ "visual.blocks.27.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
691
+ "visual.blocks.27.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
692
+ "visual.blocks.27.norm1.weight": "pytorch_model-00001-of-00002.bin",
693
+ "visual.blocks.27.norm2.weight": "pytorch_model-00001-of-00002.bin",
694
+ "visual.blocks.28.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
695
+ "visual.blocks.28.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
696
+ "visual.blocks.28.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
697
+ "visual.blocks.28.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
698
+ "visual.blocks.28.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
699
+ "visual.blocks.28.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
700
+ "visual.blocks.28.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
701
+ "visual.blocks.28.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
702
+ "visual.blocks.28.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
703
+ "visual.blocks.28.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
704
+ "visual.blocks.28.norm1.weight": "pytorch_model-00001-of-00002.bin",
705
+ "visual.blocks.28.norm2.weight": "pytorch_model-00001-of-00002.bin",
706
+ "visual.blocks.29.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
707
+ "visual.blocks.29.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
708
+ "visual.blocks.29.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
709
+ "visual.blocks.29.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
710
+ "visual.blocks.29.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
711
+ "visual.blocks.29.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
712
+ "visual.blocks.29.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
713
+ "visual.blocks.29.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
714
+ "visual.blocks.29.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
715
+ "visual.blocks.29.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
716
+ "visual.blocks.29.norm1.weight": "pytorch_model-00001-of-00002.bin",
717
+ "visual.blocks.29.norm2.weight": "pytorch_model-00001-of-00002.bin",
718
+ "visual.blocks.3.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
719
+ "visual.blocks.3.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
720
+ "visual.blocks.3.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
721
+ "visual.blocks.3.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
722
+ "visual.blocks.3.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
723
+ "visual.blocks.3.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
724
+ "visual.blocks.3.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
725
+ "visual.blocks.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
726
+ "visual.blocks.3.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
727
+ "visual.blocks.3.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
728
+ "visual.blocks.3.norm1.weight": "pytorch_model-00001-of-00002.bin",
729
+ "visual.blocks.3.norm2.weight": "pytorch_model-00001-of-00002.bin",
730
+ "visual.blocks.30.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
731
+ "visual.blocks.30.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
732
+ "visual.blocks.30.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
733
+ "visual.blocks.30.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
734
+ "visual.blocks.30.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
735
+ "visual.blocks.30.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
736
+ "visual.blocks.30.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
737
+ "visual.blocks.30.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
738
+ "visual.blocks.30.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
739
+ "visual.blocks.30.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
740
+ "visual.blocks.30.norm1.weight": "pytorch_model-00001-of-00002.bin",
741
+ "visual.blocks.30.norm2.weight": "pytorch_model-00001-of-00002.bin",
742
+ "visual.blocks.31.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
743
+ "visual.blocks.31.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
744
+ "visual.blocks.31.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
745
+ "visual.blocks.31.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
746
+ "visual.blocks.31.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
747
+ "visual.blocks.31.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
748
+ "visual.blocks.31.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
749
+ "visual.blocks.31.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
750
+ "visual.blocks.31.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
751
+ "visual.blocks.31.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
752
+ "visual.blocks.31.norm1.weight": "pytorch_model-00001-of-00002.bin",
753
+ "visual.blocks.31.norm2.weight": "pytorch_model-00001-of-00002.bin",
754
+ "visual.blocks.4.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
755
+ "visual.blocks.4.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
756
+ "visual.blocks.4.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
757
+ "visual.blocks.4.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
758
+ "visual.blocks.4.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
759
+ "visual.blocks.4.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
760
+ "visual.blocks.4.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
761
+ "visual.blocks.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
762
+ "visual.blocks.4.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
763
+ "visual.blocks.4.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
764
+ "visual.blocks.4.norm1.weight": "pytorch_model-00001-of-00002.bin",
765
+ "visual.blocks.4.norm2.weight": "pytorch_model-00001-of-00002.bin",
766
+ "visual.blocks.5.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
767
+ "visual.blocks.5.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
768
+ "visual.blocks.5.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
769
+ "visual.blocks.5.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
770
+ "visual.blocks.5.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
771
+ "visual.blocks.5.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
772
+ "visual.blocks.5.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
773
+ "visual.blocks.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
774
+ "visual.blocks.5.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
775
+ "visual.blocks.5.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
776
+ "visual.blocks.5.norm1.weight": "pytorch_model-00001-of-00002.bin",
777
+ "visual.blocks.5.norm2.weight": "pytorch_model-00001-of-00002.bin",
778
+ "visual.blocks.6.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
779
+ "visual.blocks.6.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
780
+ "visual.blocks.6.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
781
+ "visual.blocks.6.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
782
+ "visual.blocks.6.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
783
+ "visual.blocks.6.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
784
+ "visual.blocks.6.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
785
+ "visual.blocks.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
786
+ "visual.blocks.6.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
787
+ "visual.blocks.6.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
788
+ "visual.blocks.6.norm1.weight": "pytorch_model-00001-of-00002.bin",
789
+ "visual.blocks.6.norm2.weight": "pytorch_model-00001-of-00002.bin",
790
+ "visual.blocks.7.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
791
+ "visual.blocks.7.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
792
+ "visual.blocks.7.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
793
+ "visual.blocks.7.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
794
+ "visual.blocks.7.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
795
+ "visual.blocks.7.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
796
+ "visual.blocks.7.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
797
+ "visual.blocks.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
798
+ "visual.blocks.7.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
799
+ "visual.blocks.7.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
800
+ "visual.blocks.7.norm1.weight": "pytorch_model-00001-of-00002.bin",
801
+ "visual.blocks.7.norm2.weight": "pytorch_model-00001-of-00002.bin",
802
+ "visual.blocks.8.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
803
+ "visual.blocks.8.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
804
+ "visual.blocks.8.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
805
+ "visual.blocks.8.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
806
+ "visual.blocks.8.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
807
+ "visual.blocks.8.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
808
+ "visual.blocks.8.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
809
+ "visual.blocks.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
810
+ "visual.blocks.8.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
811
+ "visual.blocks.8.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
812
+ "visual.blocks.8.norm1.weight": "pytorch_model-00001-of-00002.bin",
813
+ "visual.blocks.8.norm2.weight": "pytorch_model-00001-of-00002.bin",
814
+ "visual.blocks.9.attn.proj.bias": "pytorch_model-00001-of-00002.bin",
815
+ "visual.blocks.9.attn.proj.weight": "pytorch_model-00001-of-00002.bin",
816
+ "visual.blocks.9.attn.qkv.bias": "pytorch_model-00001-of-00002.bin",
817
+ "visual.blocks.9.attn.qkv.weight": "pytorch_model-00001-of-00002.bin",
818
+ "visual.blocks.9.mlp.down_proj.bias": "pytorch_model-00001-of-00002.bin",
819
+ "visual.blocks.9.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
820
+ "visual.blocks.9.mlp.gate_proj.bias": "pytorch_model-00001-of-00002.bin",
821
+ "visual.blocks.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
822
+ "visual.blocks.9.mlp.up_proj.bias": "pytorch_model-00001-of-00002.bin",
823
+ "visual.blocks.9.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
824
+ "visual.blocks.9.norm1.weight": "pytorch_model-00001-of-00002.bin",
825
+ "visual.blocks.9.norm2.weight": "pytorch_model-00001-of-00002.bin",
826
+ "visual.merger.ln_q.weight": "pytorch_model-00001-of-00002.bin",
827
+ "visual.merger.mlp.0.bias": "pytorch_model-00001-of-00002.bin",
828
+ "visual.merger.mlp.0.weight": "pytorch_model-00001-of-00002.bin",
829
+ "visual.merger.mlp.2.bias": "pytorch_model-00001-of-00002.bin",
830
+ "visual.merger.mlp.2.weight": "pytorch_model-00001-of-00002.bin",
831
+ "visual.patch_embed.proj.weight": "pytorch_model-00001-of-00002.bin"
832
+ }
833
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<|regression|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ }
10
+ ],
11
+ "eos_token": {
12
+ "content": "<|im_end|>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "pad_token": {
19
+ "content": "<|endoftext|>",
20
+ "lstrip": false,
21
+ "normalized": false,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ }
25
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a44013e9ab1aea45209ffb9adc60df25c543dc8fda8a217eb7dba3f112347328
3
+ size 11422087
tokenizer_config.json ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<|regression|>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ }
189
+ },
190
+ "additional_special_tokens": [
191
+ "<|regression|>"
192
+ ],
193
+ "bos_token": null,
194
+ "clean_up_tokenization_spaces": false,
195
+ "eos_token": "<|im_end|>",
196
+ "errors": "replace",
197
+ "extra_special_tokens": {},
198
+ "model_max_length": 131072,
199
+ "pad_token": "<|endoftext|>",
200
+ "padding_side": "right",
201
+ "processor_class": "Qwen2_5_VLProcessor",
202
+ "split_special_tokens": false,
203
+ "tokenizer_class": "Qwen2Tokenizer",
204
+ "unk_token": null
205
+ }
trainer_state.json ADDED
@@ -0,0 +1,3624 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0965971459934138,
6
+ "eval_steps": 50,
7
+ "global_step": 500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0021953896816684962,
14
+ "grad_norm": 63.43226149633802,
15
+ "learning_rate": 0.0,
16
+ "loss": 0.9349,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.0043907793633369925,
21
+ "grad_norm": 36.34002028523472,
22
+ "learning_rate": 7.2992700729927e-09,
23
+ "loss": 0.84,
24
+ "step": 2
25
+ },
26
+ {
27
+ "epoch": 0.006586169045005488,
28
+ "grad_norm": 35.076123786065985,
29
+ "learning_rate": 1.45985401459854e-08,
30
+ "loss": 0.872,
31
+ "step": 3
32
+ },
33
+ {
34
+ "epoch": 0.008781558726673985,
35
+ "grad_norm": 35.02301842850996,
36
+ "learning_rate": 2.1897810218978102e-08,
37
+ "loss": 0.851,
38
+ "step": 4
39
+ },
40
+ {
41
+ "epoch": 0.010976948408342482,
42
+ "grad_norm": 27.48208643895236,
43
+ "learning_rate": 2.91970802919708e-08,
44
+ "loss": 0.8809,
45
+ "step": 5
46
+ },
47
+ {
48
+ "epoch": 0.013172338090010977,
49
+ "grad_norm": 45.477345508153874,
50
+ "learning_rate": 3.64963503649635e-08,
51
+ "loss": 0.9287,
52
+ "step": 6
53
+ },
54
+ {
55
+ "epoch": 0.015367727771679473,
56
+ "grad_norm": 56.90118203461904,
57
+ "learning_rate": 4.3795620437956203e-08,
58
+ "loss": 0.9007,
59
+ "step": 7
60
+ },
61
+ {
62
+ "epoch": 0.01756311745334797,
63
+ "grad_norm": 46.42558807941359,
64
+ "learning_rate": 5.10948905109489e-08,
65
+ "loss": 0.9807,
66
+ "step": 8
67
+ },
68
+ {
69
+ "epoch": 0.019758507135016465,
70
+ "grad_norm": 42.603618137740504,
71
+ "learning_rate": 5.83941605839416e-08,
72
+ "loss": 0.9002,
73
+ "step": 9
74
+ },
75
+ {
76
+ "epoch": 0.021953896816684963,
77
+ "grad_norm": 31.238590091540356,
78
+ "learning_rate": 6.569343065693431e-08,
79
+ "loss": 0.8321,
80
+ "step": 10
81
+ },
82
+ {
83
+ "epoch": 0.024149286498353458,
84
+ "grad_norm": 74.05095323835384,
85
+ "learning_rate": 7.2992700729927e-08,
86
+ "loss": 0.7992,
87
+ "step": 11
88
+ },
89
+ {
90
+ "epoch": 0.026344676180021953,
91
+ "grad_norm": 37.54325755059656,
92
+ "learning_rate": 8.029197080291971e-08,
93
+ "loss": 0.8845,
94
+ "step": 12
95
+ },
96
+ {
97
+ "epoch": 0.02854006586169045,
98
+ "grad_norm": 42.06630970394279,
99
+ "learning_rate": 8.759124087591241e-08,
100
+ "loss": 0.9717,
101
+ "step": 13
102
+ },
103
+ {
104
+ "epoch": 0.030735455543358946,
105
+ "grad_norm": 46.71572391565178,
106
+ "learning_rate": 9.48905109489051e-08,
107
+ "loss": 0.868,
108
+ "step": 14
109
+ },
110
+ {
111
+ "epoch": 0.03293084522502744,
112
+ "grad_norm": 47.53527688382936,
113
+ "learning_rate": 1.021897810218978e-07,
114
+ "loss": 0.9339,
115
+ "step": 15
116
+ },
117
+ {
118
+ "epoch": 0.03512623490669594,
119
+ "grad_norm": 30.17176017806663,
120
+ "learning_rate": 1.0948905109489052e-07,
121
+ "loss": 0.8985,
122
+ "step": 16
123
+ },
124
+ {
125
+ "epoch": 0.03732162458836443,
126
+ "grad_norm": 53.77317643602209,
127
+ "learning_rate": 1.167883211678832e-07,
128
+ "loss": 0.9014,
129
+ "step": 17
130
+ },
131
+ {
132
+ "epoch": 0.03951701427003293,
133
+ "grad_norm": 37.069232452020195,
134
+ "learning_rate": 1.240875912408759e-07,
135
+ "loss": 0.8917,
136
+ "step": 18
137
+ },
138
+ {
139
+ "epoch": 0.04171240395170143,
140
+ "grad_norm": 57.37762629699942,
141
+ "learning_rate": 1.3138686131386862e-07,
142
+ "loss": 0.8932,
143
+ "step": 19
144
+ },
145
+ {
146
+ "epoch": 0.043907793633369926,
147
+ "grad_norm": 35.926837420049885,
148
+ "learning_rate": 1.386861313868613e-07,
149
+ "loss": 0.903,
150
+ "step": 20
151
+ },
152
+ {
153
+ "epoch": 0.04610318331503842,
154
+ "grad_norm": 32.248278685728465,
155
+ "learning_rate": 1.45985401459854e-07,
156
+ "loss": 0.8963,
157
+ "step": 21
158
+ },
159
+ {
160
+ "epoch": 0.048298572996706916,
161
+ "grad_norm": 29.26799952212303,
162
+ "learning_rate": 1.532846715328467e-07,
163
+ "loss": 0.8644,
164
+ "step": 22
165
+ },
166
+ {
167
+ "epoch": 0.050493962678375415,
168
+ "grad_norm": 30.785195210511173,
169
+ "learning_rate": 1.6058394160583942e-07,
170
+ "loss": 0.8437,
171
+ "step": 23
172
+ },
173
+ {
174
+ "epoch": 0.052689352360043906,
175
+ "grad_norm": 33.71197663612937,
176
+ "learning_rate": 1.678832116788321e-07,
177
+ "loss": 0.8897,
178
+ "step": 24
179
+ },
180
+ {
181
+ "epoch": 0.054884742041712405,
182
+ "grad_norm": 35.96552084220703,
183
+ "learning_rate": 1.7518248175182481e-07,
184
+ "loss": 0.8967,
185
+ "step": 25
186
+ },
187
+ {
188
+ "epoch": 0.0570801317233809,
189
+ "grad_norm": 41.50138484689535,
190
+ "learning_rate": 1.824817518248175e-07,
191
+ "loss": 0.9074,
192
+ "step": 26
193
+ },
194
+ {
195
+ "epoch": 0.059275521405049394,
196
+ "grad_norm": 38.797876016859426,
197
+ "learning_rate": 1.897810218978102e-07,
198
+ "loss": 0.9287,
199
+ "step": 27
200
+ },
201
+ {
202
+ "epoch": 0.06147091108671789,
203
+ "grad_norm": 32.19034408841759,
204
+ "learning_rate": 1.9708029197080292e-07,
205
+ "loss": 0.8716,
206
+ "step": 28
207
+ },
208
+ {
209
+ "epoch": 0.06366630076838639,
210
+ "grad_norm": 51.76045022284941,
211
+ "learning_rate": 2.043795620437956e-07,
212
+ "loss": 0.8812,
213
+ "step": 29
214
+ },
215
+ {
216
+ "epoch": 0.06586169045005488,
217
+ "grad_norm": 31.359275794152296,
218
+ "learning_rate": 2.116788321167883e-07,
219
+ "loss": 0.8557,
220
+ "step": 30
221
+ },
222
+ {
223
+ "epoch": 0.06805708013172337,
224
+ "grad_norm": 30.678686692428276,
225
+ "learning_rate": 2.1897810218978103e-07,
226
+ "loss": 0.8758,
227
+ "step": 31
228
+ },
229
+ {
230
+ "epoch": 0.07025246981339188,
231
+ "grad_norm": 32.41276845704741,
232
+ "learning_rate": 2.2627737226277372e-07,
233
+ "loss": 0.8713,
234
+ "step": 32
235
+ },
236
+ {
237
+ "epoch": 0.07244785949506037,
238
+ "grad_norm": 28.950024669686137,
239
+ "learning_rate": 2.335766423357664e-07,
240
+ "loss": 0.8627,
241
+ "step": 33
242
+ },
243
+ {
244
+ "epoch": 0.07464324917672886,
245
+ "grad_norm": 18.417431019302867,
246
+ "learning_rate": 2.408759124087591e-07,
247
+ "loss": 0.7561,
248
+ "step": 34
249
+ },
250
+ {
251
+ "epoch": 0.07683863885839737,
252
+ "grad_norm": 30.826596106550824,
253
+ "learning_rate": 2.481751824817518e-07,
254
+ "loss": 0.9044,
255
+ "step": 35
256
+ },
257
+ {
258
+ "epoch": 0.07903402854006586,
259
+ "grad_norm": 27.86009502146877,
260
+ "learning_rate": 2.5547445255474454e-07,
261
+ "loss": 0.9158,
262
+ "step": 36
263
+ },
264
+ {
265
+ "epoch": 0.08122941822173436,
266
+ "grad_norm": 33.10171663561771,
267
+ "learning_rate": 2.6277372262773725e-07,
268
+ "loss": 0.8973,
269
+ "step": 37
270
+ },
271
+ {
272
+ "epoch": 0.08342480790340286,
273
+ "grad_norm": 33.54784642550598,
274
+ "learning_rate": 2.700729927007299e-07,
275
+ "loss": 0.8854,
276
+ "step": 38
277
+ },
278
+ {
279
+ "epoch": 0.08562019758507135,
280
+ "grad_norm": 31.94225571174728,
281
+ "learning_rate": 2.773722627737226e-07,
282
+ "loss": 0.8634,
283
+ "step": 39
284
+ },
285
+ {
286
+ "epoch": 0.08781558726673985,
287
+ "grad_norm": 18.667616381853037,
288
+ "learning_rate": 2.846715328467153e-07,
289
+ "loss": 0.8365,
290
+ "step": 40
291
+ },
292
+ {
293
+ "epoch": 0.09001097694840834,
294
+ "grad_norm": 33.72334370651476,
295
+ "learning_rate": 2.91970802919708e-07,
296
+ "loss": 0.8826,
297
+ "step": 41
298
+ },
299
+ {
300
+ "epoch": 0.09220636663007684,
301
+ "grad_norm": 26.43734180083069,
302
+ "learning_rate": 2.9927007299270075e-07,
303
+ "loss": 0.8895,
304
+ "step": 42
305
+ },
306
+ {
307
+ "epoch": 0.09440175631174534,
308
+ "grad_norm": 31.963757242976843,
309
+ "learning_rate": 3.065693430656934e-07,
310
+ "loss": 0.8623,
311
+ "step": 43
312
+ },
313
+ {
314
+ "epoch": 0.09659714599341383,
315
+ "grad_norm": 21.0251515948571,
316
+ "learning_rate": 3.138686131386861e-07,
317
+ "loss": 0.7966,
318
+ "step": 44
319
+ },
320
+ {
321
+ "epoch": 0.09879253567508232,
322
+ "grad_norm": 31.038625878390096,
323
+ "learning_rate": 3.2116788321167883e-07,
324
+ "loss": 0.8834,
325
+ "step": 45
326
+ },
327
+ {
328
+ "epoch": 0.10098792535675083,
329
+ "grad_norm": 20.27262840408739,
330
+ "learning_rate": 3.284671532846715e-07,
331
+ "loss": 0.833,
332
+ "step": 46
333
+ },
334
+ {
335
+ "epoch": 0.10318331503841932,
336
+ "grad_norm": 27.9353455887378,
337
+ "learning_rate": 3.357664233576642e-07,
338
+ "loss": 0.8401,
339
+ "step": 47
340
+ },
341
+ {
342
+ "epoch": 0.10537870472008781,
343
+ "grad_norm": 24.712611262961317,
344
+ "learning_rate": 3.4306569343065697e-07,
345
+ "loss": 0.759,
346
+ "step": 48
347
+ },
348
+ {
349
+ "epoch": 0.10757409440175632,
350
+ "grad_norm": 19.616432703856,
351
+ "learning_rate": 3.5036496350364963e-07,
352
+ "loss": 0.7371,
353
+ "step": 49
354
+ },
355
+ {
356
+ "epoch": 0.10976948408342481,
357
+ "grad_norm": 34.3573339583084,
358
+ "learning_rate": 3.5766423357664234e-07,
359
+ "loss": 0.8114,
360
+ "step": 50
361
+ },
362
+ {
363
+ "epoch": 0.10976948408342481,
364
+ "eval_accuracy": 0.564,
365
+ "eval_loss": 0.6804020404815674,
366
+ "eval_runtime": 51.4851,
367
+ "eval_samples_per_second": 9.712,
368
+ "eval_steps_per_second": 1.224,
369
+ "step": 50
370
+ },
371
+ {
372
+ "epoch": 0.1119648737650933,
373
+ "grad_norm": 52.44727313113604,
374
+ "learning_rate": 3.64963503649635e-07,
375
+ "loss": 0.8058,
376
+ "step": 51
377
+ },
378
+ {
379
+ "epoch": 0.1141602634467618,
380
+ "grad_norm": 29.635805263155994,
381
+ "learning_rate": 3.722627737226277e-07,
382
+ "loss": 0.7798,
383
+ "step": 52
384
+ },
385
+ {
386
+ "epoch": 0.1163556531284303,
387
+ "grad_norm": 16.05086854109441,
388
+ "learning_rate": 3.795620437956204e-07,
389
+ "loss": 0.7593,
390
+ "step": 53
391
+ },
392
+ {
393
+ "epoch": 0.11855104281009879,
394
+ "grad_norm": 29.823000715758084,
395
+ "learning_rate": 3.8686131386861313e-07,
396
+ "loss": 0.7561,
397
+ "step": 54
398
+ },
399
+ {
400
+ "epoch": 0.1207464324917673,
401
+ "grad_norm": 14.127126775727975,
402
+ "learning_rate": 3.9416058394160584e-07,
403
+ "loss": 0.7271,
404
+ "step": 55
405
+ },
406
+ {
407
+ "epoch": 0.12294182217343579,
408
+ "grad_norm": 21.593202970179426,
409
+ "learning_rate": 4.0145985401459856e-07,
410
+ "loss": 0.7764,
411
+ "step": 56
412
+ },
413
+ {
414
+ "epoch": 0.1251372118551043,
415
+ "grad_norm": 14.951837963554619,
416
+ "learning_rate": 4.087591240875912e-07,
417
+ "loss": 0.6718,
418
+ "step": 57
419
+ },
420
+ {
421
+ "epoch": 0.12733260153677278,
422
+ "grad_norm": 15.378678774753151,
423
+ "learning_rate": 4.160583941605839e-07,
424
+ "loss": 0.7698,
425
+ "step": 58
426
+ },
427
+ {
428
+ "epoch": 0.12952799121844127,
429
+ "grad_norm": 24.73055485970318,
430
+ "learning_rate": 4.233576642335766e-07,
431
+ "loss": 0.7673,
432
+ "step": 59
433
+ },
434
+ {
435
+ "epoch": 0.13172338090010977,
436
+ "grad_norm": 17.57321911038973,
437
+ "learning_rate": 4.306569343065693e-07,
438
+ "loss": 0.7157,
439
+ "step": 60
440
+ },
441
+ {
442
+ "epoch": 0.13391877058177826,
443
+ "grad_norm": 16.77467268690982,
444
+ "learning_rate": 4.3795620437956206e-07,
445
+ "loss": 0.7432,
446
+ "step": 61
447
+ },
448
+ {
449
+ "epoch": 0.13611416026344675,
450
+ "grad_norm": 27.713531401351116,
451
+ "learning_rate": 4.452554744525547e-07,
452
+ "loss": 0.8336,
453
+ "step": 62
454
+ },
455
+ {
456
+ "epoch": 0.13830954994511527,
457
+ "grad_norm": 18.28886043174935,
458
+ "learning_rate": 4.5255474452554743e-07,
459
+ "loss": 0.6868,
460
+ "step": 63
461
+ },
462
+ {
463
+ "epoch": 0.14050493962678376,
464
+ "grad_norm": 17.56165424803153,
465
+ "learning_rate": 4.5985401459854014e-07,
466
+ "loss": 0.8134,
467
+ "step": 64
468
+ },
469
+ {
470
+ "epoch": 0.14270032930845225,
471
+ "grad_norm": 16.76906700049957,
472
+ "learning_rate": 4.671532846715328e-07,
473
+ "loss": 0.7071,
474
+ "step": 65
475
+ },
476
+ {
477
+ "epoch": 0.14489571899012074,
478
+ "grad_norm": 22.604350303541462,
479
+ "learning_rate": 4.744525547445255e-07,
480
+ "loss": 0.7717,
481
+ "step": 66
482
+ },
483
+ {
484
+ "epoch": 0.14709110867178923,
485
+ "grad_norm": 15.57570075158043,
486
+ "learning_rate": 4.817518248175182e-07,
487
+ "loss": 0.6858,
488
+ "step": 67
489
+ },
490
+ {
491
+ "epoch": 0.14928649835345773,
492
+ "grad_norm": 23.99766199274266,
493
+ "learning_rate": 4.89051094890511e-07,
494
+ "loss": 0.6925,
495
+ "step": 68
496
+ },
497
+ {
498
+ "epoch": 0.15148188803512624,
499
+ "grad_norm": 14.622811969684838,
500
+ "learning_rate": 4.963503649635036e-07,
501
+ "loss": 0.6772,
502
+ "step": 69
503
+ },
504
+ {
505
+ "epoch": 0.15367727771679474,
506
+ "grad_norm": 11.873629512549707,
507
+ "learning_rate": 5.036496350364964e-07,
508
+ "loss": 0.7201,
509
+ "step": 70
510
+ },
511
+ {
512
+ "epoch": 0.15587266739846323,
513
+ "grad_norm": 14.971650059257875,
514
+ "learning_rate": 5.109489051094891e-07,
515
+ "loss": 0.7144,
516
+ "step": 71
517
+ },
518
+ {
519
+ "epoch": 0.15806805708013172,
520
+ "grad_norm": 14.89771761097792,
521
+ "learning_rate": 5.182481751824817e-07,
522
+ "loss": 0.6823,
523
+ "step": 72
524
+ },
525
+ {
526
+ "epoch": 0.1602634467618002,
527
+ "grad_norm": 26.145706119308308,
528
+ "learning_rate": 5.255474452554745e-07,
529
+ "loss": 0.6815,
530
+ "step": 73
531
+ },
532
+ {
533
+ "epoch": 0.16245883644346873,
534
+ "grad_norm": 25.218475252582063,
535
+ "learning_rate": 5.328467153284672e-07,
536
+ "loss": 0.6683,
537
+ "step": 74
538
+ },
539
+ {
540
+ "epoch": 0.16465422612513722,
541
+ "grad_norm": 24.795553870705557,
542
+ "learning_rate": 5.401459854014598e-07,
543
+ "loss": 0.6869,
544
+ "step": 75
545
+ },
546
+ {
547
+ "epoch": 0.1668496158068057,
548
+ "grad_norm": 20.06359220796973,
549
+ "learning_rate": 5.474452554744526e-07,
550
+ "loss": 0.6776,
551
+ "step": 76
552
+ },
553
+ {
554
+ "epoch": 0.1690450054884742,
555
+ "grad_norm": 14.605622927723077,
556
+ "learning_rate": 5.547445255474452e-07,
557
+ "loss": 0.6169,
558
+ "step": 77
559
+ },
560
+ {
561
+ "epoch": 0.1712403951701427,
562
+ "grad_norm": 13.398369259423399,
563
+ "learning_rate": 5.620437956204379e-07,
564
+ "loss": 0.5911,
565
+ "step": 78
566
+ },
567
+ {
568
+ "epoch": 0.1734357848518112,
569
+ "grad_norm": 22.20812822444505,
570
+ "learning_rate": 5.693430656934306e-07,
571
+ "loss": 0.6641,
572
+ "step": 79
573
+ },
574
+ {
575
+ "epoch": 0.1756311745334797,
576
+ "grad_norm": 11.530488794545967,
577
+ "learning_rate": 5.766423357664233e-07,
578
+ "loss": 0.6546,
579
+ "step": 80
580
+ },
581
+ {
582
+ "epoch": 0.1778265642151482,
583
+ "grad_norm": 19.45472907398047,
584
+ "learning_rate": 5.83941605839416e-07,
585
+ "loss": 0.6346,
586
+ "step": 81
587
+ },
588
+ {
589
+ "epoch": 0.1800219538968167,
590
+ "grad_norm": 14.402272095344948,
591
+ "learning_rate": 5.912408759124087e-07,
592
+ "loss": 0.6157,
593
+ "step": 82
594
+ },
595
+ {
596
+ "epoch": 0.18221734357848518,
597
+ "grad_norm": 20.486016309108074,
598
+ "learning_rate": 5.985401459854015e-07,
599
+ "loss": 0.6353,
600
+ "step": 83
601
+ },
602
+ {
603
+ "epoch": 0.18441273326015367,
604
+ "grad_norm": 21.724039606610585,
605
+ "learning_rate": 6.058394160583942e-07,
606
+ "loss": 0.6669,
607
+ "step": 84
608
+ },
609
+ {
610
+ "epoch": 0.18660812294182216,
611
+ "grad_norm": 23.550204927202188,
612
+ "learning_rate": 6.131386861313868e-07,
613
+ "loss": 0.6573,
614
+ "step": 85
615
+ },
616
+ {
617
+ "epoch": 0.18880351262349068,
618
+ "grad_norm": 25.850348423694676,
619
+ "learning_rate": 6.204379562043796e-07,
620
+ "loss": 0.6096,
621
+ "step": 86
622
+ },
623
+ {
624
+ "epoch": 0.19099890230515917,
625
+ "grad_norm": 15.668802168044579,
626
+ "learning_rate": 6.277372262773722e-07,
627
+ "loss": 0.6074,
628
+ "step": 87
629
+ },
630
+ {
631
+ "epoch": 0.19319429198682767,
632
+ "grad_norm": 11.15207045876449,
633
+ "learning_rate": 6.350364963503649e-07,
634
+ "loss": 0.5861,
635
+ "step": 88
636
+ },
637
+ {
638
+ "epoch": 0.19538968166849616,
639
+ "grad_norm": 30.989418951503534,
640
+ "learning_rate": 6.423357664233577e-07,
641
+ "loss": 0.5803,
642
+ "step": 89
643
+ },
644
+ {
645
+ "epoch": 0.19758507135016465,
646
+ "grad_norm": 37.87369273674589,
647
+ "learning_rate": 6.496350364963503e-07,
648
+ "loss": 0.5607,
649
+ "step": 90
650
+ },
651
+ {
652
+ "epoch": 0.19978046103183314,
653
+ "grad_norm": 16.304752909448357,
654
+ "learning_rate": 6.56934306569343e-07,
655
+ "loss": 0.4855,
656
+ "step": 91
657
+ },
658
+ {
659
+ "epoch": 0.20197585071350166,
660
+ "grad_norm": 46.813965107422554,
661
+ "learning_rate": 6.642335766423358e-07,
662
+ "loss": 0.5277,
663
+ "step": 92
664
+ },
665
+ {
666
+ "epoch": 0.20417124039517015,
667
+ "grad_norm": 16.25834561476597,
668
+ "learning_rate": 6.715328467153284e-07,
669
+ "loss": 0.5671,
670
+ "step": 93
671
+ },
672
+ {
673
+ "epoch": 0.20636663007683864,
674
+ "grad_norm": 24.571054002712096,
675
+ "learning_rate": 6.788321167883211e-07,
676
+ "loss": 0.5485,
677
+ "step": 94
678
+ },
679
+ {
680
+ "epoch": 0.20856201975850713,
681
+ "grad_norm": 17.16346868917258,
682
+ "learning_rate": 6.861313868613139e-07,
683
+ "loss": 0.6168,
684
+ "step": 95
685
+ },
686
+ {
687
+ "epoch": 0.21075740944017562,
688
+ "grad_norm": 24.939402022765716,
689
+ "learning_rate": 6.934306569343066e-07,
690
+ "loss": 0.5097,
691
+ "step": 96
692
+ },
693
+ {
694
+ "epoch": 0.21295279912184412,
695
+ "grad_norm": 35.04290652380563,
696
+ "learning_rate": 7.007299270072993e-07,
697
+ "loss": 0.511,
698
+ "step": 97
699
+ },
700
+ {
701
+ "epoch": 0.21514818880351264,
702
+ "grad_norm": 19.369038528744387,
703
+ "learning_rate": 7.080291970802919e-07,
704
+ "loss": 0.501,
705
+ "step": 98
706
+ },
707
+ {
708
+ "epoch": 0.21734357848518113,
709
+ "grad_norm": 14.504342290526917,
710
+ "learning_rate": 7.153284671532847e-07,
711
+ "loss": 0.5576,
712
+ "step": 99
713
+ },
714
+ {
715
+ "epoch": 0.21953896816684962,
716
+ "grad_norm": 22.831165380560154,
717
+ "learning_rate": 7.226277372262773e-07,
718
+ "loss": 0.6357,
719
+ "step": 100
720
+ },
721
+ {
722
+ "epoch": 0.21953896816684962,
723
+ "eval_accuracy": 0.716,
724
+ "eval_loss": 0.5146071910858154,
725
+ "eval_runtime": 51.4544,
726
+ "eval_samples_per_second": 9.717,
727
+ "eval_steps_per_second": 1.224,
728
+ "step": 100
729
+ },
730
+ {
731
+ "epoch": 0.2217343578485181,
732
+ "grad_norm": 17.214759546075644,
733
+ "learning_rate": 7.2992700729927e-07,
734
+ "loss": 0.5879,
735
+ "step": 101
736
+ },
737
+ {
738
+ "epoch": 0.2239297475301866,
739
+ "grad_norm": 28.14453156116428,
740
+ "learning_rate": 7.372262773722628e-07,
741
+ "loss": 0.6077,
742
+ "step": 102
743
+ },
744
+ {
745
+ "epoch": 0.2261251372118551,
746
+ "grad_norm": 36.81857394774189,
747
+ "learning_rate": 7.445255474452554e-07,
748
+ "loss": 0.5656,
749
+ "step": 103
750
+ },
751
+ {
752
+ "epoch": 0.2283205268935236,
753
+ "grad_norm": 19.720256889573427,
754
+ "learning_rate": 7.518248175182481e-07,
755
+ "loss": 0.5779,
756
+ "step": 104
757
+ },
758
+ {
759
+ "epoch": 0.2305159165751921,
760
+ "grad_norm": 13.404265085713362,
761
+ "learning_rate": 7.591240875912408e-07,
762
+ "loss": 0.6194,
763
+ "step": 105
764
+ },
765
+ {
766
+ "epoch": 0.2327113062568606,
767
+ "grad_norm": 10.223644936756514,
768
+ "learning_rate": 7.664233576642335e-07,
769
+ "loss": 0.5653,
770
+ "step": 106
771
+ },
772
+ {
773
+ "epoch": 0.2349066959385291,
774
+ "grad_norm": 11.802424231339492,
775
+ "learning_rate": 7.737226277372263e-07,
776
+ "loss": 0.5243,
777
+ "step": 107
778
+ },
779
+ {
780
+ "epoch": 0.23710208562019758,
781
+ "grad_norm": 15.21599780255988,
782
+ "learning_rate": 7.81021897810219e-07,
783
+ "loss": 0.5237,
784
+ "step": 108
785
+ },
786
+ {
787
+ "epoch": 0.23929747530186607,
788
+ "grad_norm": 9.817905688792953,
789
+ "learning_rate": 7.883211678832117e-07,
790
+ "loss": 0.5559,
791
+ "step": 109
792
+ },
793
+ {
794
+ "epoch": 0.2414928649835346,
795
+ "grad_norm": 14.019441837913432,
796
+ "learning_rate": 7.956204379562043e-07,
797
+ "loss": 0.5634,
798
+ "step": 110
799
+ },
800
+ {
801
+ "epoch": 0.24368825466520308,
802
+ "grad_norm": 11.287304916292086,
803
+ "learning_rate": 8.029197080291971e-07,
804
+ "loss": 0.5169,
805
+ "step": 111
806
+ },
807
+ {
808
+ "epoch": 0.24588364434687157,
809
+ "grad_norm": 13.731826260486923,
810
+ "learning_rate": 8.102189781021898e-07,
811
+ "loss": 0.4922,
812
+ "step": 112
813
+ },
814
+ {
815
+ "epoch": 0.24807903402854006,
816
+ "grad_norm": 10.127768690583599,
817
+ "learning_rate": 8.175182481751824e-07,
818
+ "loss": 0.4939,
819
+ "step": 113
820
+ },
821
+ {
822
+ "epoch": 0.2502744237102086,
823
+ "grad_norm": 10.643603728793044,
824
+ "learning_rate": 8.248175182481751e-07,
825
+ "loss": 0.5433,
826
+ "step": 114
827
+ },
828
+ {
829
+ "epoch": 0.2524698133918771,
830
+ "grad_norm": 16.423886102329853,
831
+ "learning_rate": 8.321167883211679e-07,
832
+ "loss": 0.4931,
833
+ "step": 115
834
+ },
835
+ {
836
+ "epoch": 0.25466520307354557,
837
+ "grad_norm": 11.683580419342695,
838
+ "learning_rate": 8.394160583941605e-07,
839
+ "loss": 0.6014,
840
+ "step": 116
841
+ },
842
+ {
843
+ "epoch": 0.25686059275521406,
844
+ "grad_norm": 9.964987145456663,
845
+ "learning_rate": 8.467153284671532e-07,
846
+ "loss": 0.5111,
847
+ "step": 117
848
+ },
849
+ {
850
+ "epoch": 0.25905598243688255,
851
+ "grad_norm": 11.994223750763792,
852
+ "learning_rate": 8.540145985401459e-07,
853
+ "loss": 0.5427,
854
+ "step": 118
855
+ },
856
+ {
857
+ "epoch": 0.26125137211855104,
858
+ "grad_norm": 17.08922954256959,
859
+ "learning_rate": 8.613138686131386e-07,
860
+ "loss": 0.5454,
861
+ "step": 119
862
+ },
863
+ {
864
+ "epoch": 0.26344676180021953,
865
+ "grad_norm": 9.233224929973781,
866
+ "learning_rate": 8.686131386861314e-07,
867
+ "loss": 0.4425,
868
+ "step": 120
869
+ },
870
+ {
871
+ "epoch": 0.265642151481888,
872
+ "grad_norm": 11.158793551879755,
873
+ "learning_rate": 8.759124087591241e-07,
874
+ "loss": 0.4592,
875
+ "step": 121
876
+ },
877
+ {
878
+ "epoch": 0.2678375411635565,
879
+ "grad_norm": 9.984032512868838,
880
+ "learning_rate": 8.832116788321168e-07,
881
+ "loss": 0.4619,
882
+ "step": 122
883
+ },
884
+ {
885
+ "epoch": 0.270032930845225,
886
+ "grad_norm": 13.6121955869425,
887
+ "learning_rate": 8.905109489051094e-07,
888
+ "loss": 0.5102,
889
+ "step": 123
890
+ },
891
+ {
892
+ "epoch": 0.2722283205268935,
893
+ "grad_norm": 16.38565463082547,
894
+ "learning_rate": 8.978102189781022e-07,
895
+ "loss": 0.5414,
896
+ "step": 124
897
+ },
898
+ {
899
+ "epoch": 0.27442371020856204,
900
+ "grad_norm": 13.72322184659144,
901
+ "learning_rate": 9.051094890510949e-07,
902
+ "loss": 0.4404,
903
+ "step": 125
904
+ },
905
+ {
906
+ "epoch": 0.27661909989023054,
907
+ "grad_norm": 9.728231053042531,
908
+ "learning_rate": 9.124087591240875e-07,
909
+ "loss": 0.4891,
910
+ "step": 126
911
+ },
912
+ {
913
+ "epoch": 0.278814489571899,
914
+ "grad_norm": 19.653134550591925,
915
+ "learning_rate": 9.197080291970803e-07,
916
+ "loss": 0.4911,
917
+ "step": 127
918
+ },
919
+ {
920
+ "epoch": 0.2810098792535675,
921
+ "grad_norm": 15.233058767794994,
922
+ "learning_rate": 9.270072992700729e-07,
923
+ "loss": 0.4482,
924
+ "step": 128
925
+ },
926
+ {
927
+ "epoch": 0.283205268935236,
928
+ "grad_norm": 14.85049447902684,
929
+ "learning_rate": 9.343065693430656e-07,
930
+ "loss": 0.4381,
931
+ "step": 129
932
+ },
933
+ {
934
+ "epoch": 0.2854006586169045,
935
+ "grad_norm": 18.23588962213726,
936
+ "learning_rate": 9.416058394160583e-07,
937
+ "loss": 0.5267,
938
+ "step": 130
939
+ },
940
+ {
941
+ "epoch": 0.287596048298573,
942
+ "grad_norm": 13.687385484203132,
943
+ "learning_rate": 9.48905109489051e-07,
944
+ "loss": 0.533,
945
+ "step": 131
946
+ },
947
+ {
948
+ "epoch": 0.2897914379802415,
949
+ "grad_norm": 16.31348614065883,
950
+ "learning_rate": 9.562043795620438e-07,
951
+ "loss": 0.4592,
952
+ "step": 132
953
+ },
954
+ {
955
+ "epoch": 0.29198682766191,
956
+ "grad_norm": 10.619144950897025,
957
+ "learning_rate": 9.635036496350364e-07,
958
+ "loss": 0.5337,
959
+ "step": 133
960
+ },
961
+ {
962
+ "epoch": 0.29418221734357847,
963
+ "grad_norm": 10.964180780818202,
964
+ "learning_rate": 9.708029197080291e-07,
965
+ "loss": 0.531,
966
+ "step": 134
967
+ },
968
+ {
969
+ "epoch": 0.29637760702524696,
970
+ "grad_norm": 9.68554990094299,
971
+ "learning_rate": 9.78102189781022e-07,
972
+ "loss": 0.4466,
973
+ "step": 135
974
+ },
975
+ {
976
+ "epoch": 0.29857299670691545,
977
+ "grad_norm": 11.193131252386175,
978
+ "learning_rate": 9.854014598540146e-07,
979
+ "loss": 0.4518,
980
+ "step": 136
981
+ },
982
+ {
983
+ "epoch": 0.300768386388584,
984
+ "grad_norm": 7.652184708715949,
985
+ "learning_rate": 9.927007299270073e-07,
986
+ "loss": 0.4514,
987
+ "step": 137
988
+ },
989
+ {
990
+ "epoch": 0.3029637760702525,
991
+ "grad_norm": 9.282129557780532,
992
+ "learning_rate": 1e-06,
993
+ "loss": 0.4952,
994
+ "step": 138
995
+ },
996
+ {
997
+ "epoch": 0.305159165751921,
998
+ "grad_norm": 9.514439267127576,
999
+ "learning_rate": 9.999983717412808e-07,
1000
+ "loss": 0.5277,
1001
+ "step": 139
1002
+ },
1003
+ {
1004
+ "epoch": 0.30735455543358947,
1005
+ "grad_norm": 8.04642891691016,
1006
+ "learning_rate": 9.999934869757278e-07,
1007
+ "loss": 0.5323,
1008
+ "step": 140
1009
+ },
1010
+ {
1011
+ "epoch": 0.30954994511525796,
1012
+ "grad_norm": 14.24768033484951,
1013
+ "learning_rate": 9.999853457351558e-07,
1014
+ "loss": 0.4996,
1015
+ "step": 141
1016
+ },
1017
+ {
1018
+ "epoch": 0.31174533479692645,
1019
+ "grad_norm": 11.061860342712375,
1020
+ "learning_rate": 9.999739480725893e-07,
1021
+ "loss": 0.4849,
1022
+ "step": 142
1023
+ },
1024
+ {
1025
+ "epoch": 0.31394072447859495,
1026
+ "grad_norm": 15.443833167332631,
1027
+ "learning_rate": 9.999592940622613e-07,
1028
+ "loss": 0.4957,
1029
+ "step": 143
1030
+ },
1031
+ {
1032
+ "epoch": 0.31613611416026344,
1033
+ "grad_norm": 9.488802082671176,
1034
+ "learning_rate": 9.999413837996137e-07,
1035
+ "loss": 0.495,
1036
+ "step": 144
1037
+ },
1038
+ {
1039
+ "epoch": 0.31833150384193193,
1040
+ "grad_norm": 7.092185690265562,
1041
+ "learning_rate": 9.999202174012972e-07,
1042
+ "loss": 0.4189,
1043
+ "step": 145
1044
+ },
1045
+ {
1046
+ "epoch": 0.3205268935236004,
1047
+ "grad_norm": 8.422031666334862,
1048
+ "learning_rate": 9.99895795005169e-07,
1049
+ "loss": 0.4548,
1050
+ "step": 146
1051
+ },
1052
+ {
1053
+ "epoch": 0.3227222832052689,
1054
+ "grad_norm": 9.442638766406834,
1055
+ "learning_rate": 9.99868116770293e-07,
1056
+ "loss": 0.4385,
1057
+ "step": 147
1058
+ },
1059
+ {
1060
+ "epoch": 0.32491767288693746,
1061
+ "grad_norm": 7.687755924564992,
1062
+ "learning_rate": 9.998371828769384e-07,
1063
+ "loss": 0.4726,
1064
+ "step": 148
1065
+ },
1066
+ {
1067
+ "epoch": 0.32711306256860595,
1068
+ "grad_norm": 11.01823455774795,
1069
+ "learning_rate": 9.99802993526579e-07,
1070
+ "loss": 0.5062,
1071
+ "step": 149
1072
+ },
1073
+ {
1074
+ "epoch": 0.32930845225027444,
1075
+ "grad_norm": 8.828666504433839,
1076
+ "learning_rate": 9.997655489418912e-07,
1077
+ "loss": 0.5211,
1078
+ "step": 150
1079
+ },
1080
+ {
1081
+ "epoch": 0.32930845225027444,
1082
+ "eval_accuracy": 0.774,
1083
+ "eval_loss": 0.4343988001346588,
1084
+ "eval_runtime": 52.0075,
1085
+ "eval_samples_per_second": 9.614,
1086
+ "eval_steps_per_second": 1.211,
1087
+ "step": 150
1088
+ },
1089
+ {
1090
+ "epoch": 0.33150384193194293,
1091
+ "grad_norm": 9.826049631904308,
1092
+ "learning_rate": 9.997248493667527e-07,
1093
+ "loss": 0.5144,
1094
+ "step": 151
1095
+ },
1096
+ {
1097
+ "epoch": 0.3336992316136114,
1098
+ "grad_norm": 9.822644682623359,
1099
+ "learning_rate": 9.996808950662413e-07,
1100
+ "loss": 0.46,
1101
+ "step": 152
1102
+ },
1103
+ {
1104
+ "epoch": 0.3358946212952799,
1105
+ "grad_norm": 7.448251385258531,
1106
+ "learning_rate": 9.99633686326633e-07,
1107
+ "loss": 0.4837,
1108
+ "step": 153
1109
+ },
1110
+ {
1111
+ "epoch": 0.3380900109769484,
1112
+ "grad_norm": 8.66825069576008,
1113
+ "learning_rate": 9.995832234554e-07,
1114
+ "loss": 0.4442,
1115
+ "step": 154
1116
+ },
1117
+ {
1118
+ "epoch": 0.3402854006586169,
1119
+ "grad_norm": 9.445663098800049,
1120
+ "learning_rate": 9.995295067812083e-07,
1121
+ "loss": 0.4984,
1122
+ "step": 155
1123
+ },
1124
+ {
1125
+ "epoch": 0.3424807903402854,
1126
+ "grad_norm": 7.886446457789178,
1127
+ "learning_rate": 9.99472536653917e-07,
1128
+ "loss": 0.4888,
1129
+ "step": 156
1130
+ },
1131
+ {
1132
+ "epoch": 0.3446761800219539,
1133
+ "grad_norm": 6.5270562387929285,
1134
+ "learning_rate": 9.994123134445746e-07,
1135
+ "loss": 0.4608,
1136
+ "step": 157
1137
+ },
1138
+ {
1139
+ "epoch": 0.3468715697036224,
1140
+ "grad_norm": 8.44851437549508,
1141
+ "learning_rate": 9.993488375454165e-07,
1142
+ "loss": 0.4319,
1143
+ "step": 158
1144
+ },
1145
+ {
1146
+ "epoch": 0.34906695938529086,
1147
+ "grad_norm": 8.945522697664286,
1148
+ "learning_rate": 9.992821093698636e-07,
1149
+ "loss": 0.52,
1150
+ "step": 159
1151
+ },
1152
+ {
1153
+ "epoch": 0.3512623490669594,
1154
+ "grad_norm": 6.9004623022662965,
1155
+ "learning_rate": 9.992121293525188e-07,
1156
+ "loss": 0.5153,
1157
+ "step": 160
1158
+ },
1159
+ {
1160
+ "epoch": 0.3534577387486279,
1161
+ "grad_norm": 9.258099865422958,
1162
+ "learning_rate": 9.991388979491646e-07,
1163
+ "loss": 0.4248,
1164
+ "step": 161
1165
+ },
1166
+ {
1167
+ "epoch": 0.3556531284302964,
1168
+ "grad_norm": 9.52300325877657,
1169
+ "learning_rate": 9.990624156367596e-07,
1170
+ "loss": 0.4744,
1171
+ "step": 162
1172
+ },
1173
+ {
1174
+ "epoch": 0.3578485181119649,
1175
+ "grad_norm": 8.87937529404168,
1176
+ "learning_rate": 9.989826829134356e-07,
1177
+ "loss": 0.4804,
1178
+ "step": 163
1179
+ },
1180
+ {
1181
+ "epoch": 0.3600439077936334,
1182
+ "grad_norm": 8.218898687145828,
1183
+ "learning_rate": 9.988997002984949e-07,
1184
+ "loss": 0.5154,
1185
+ "step": 164
1186
+ },
1187
+ {
1188
+ "epoch": 0.36223929747530187,
1189
+ "grad_norm": 9.404715926766505,
1190
+ "learning_rate": 9.988134683324058e-07,
1191
+ "loss": 0.4595,
1192
+ "step": 165
1193
+ },
1194
+ {
1195
+ "epoch": 0.36443468715697036,
1196
+ "grad_norm": 8.404212842327881,
1197
+ "learning_rate": 9.987239875768006e-07,
1198
+ "loss": 0.4122,
1199
+ "step": 166
1200
+ },
1201
+ {
1202
+ "epoch": 0.36663007683863885,
1203
+ "grad_norm": 12.927034279467245,
1204
+ "learning_rate": 9.9863125861447e-07,
1205
+ "loss": 0.458,
1206
+ "step": 167
1207
+ },
1208
+ {
1209
+ "epoch": 0.36882546652030734,
1210
+ "grad_norm": 9.395688075375638,
1211
+ "learning_rate": 9.985352820493614e-07,
1212
+ "loss": 0.4442,
1213
+ "step": 168
1214
+ },
1215
+ {
1216
+ "epoch": 0.37102085620197583,
1217
+ "grad_norm": 12.027605468945074,
1218
+ "learning_rate": 9.984360585065733e-07,
1219
+ "loss": 0.4342,
1220
+ "step": 169
1221
+ },
1222
+ {
1223
+ "epoch": 0.3732162458836443,
1224
+ "grad_norm": 11.101898867057535,
1225
+ "learning_rate": 9.983335886323524e-07,
1226
+ "loss": 0.4457,
1227
+ "step": 170
1228
+ },
1229
+ {
1230
+ "epoch": 0.3754116355653128,
1231
+ "grad_norm": 9.788751810099042,
1232
+ "learning_rate": 9.98227873094088e-07,
1233
+ "loss": 0.4773,
1234
+ "step": 171
1235
+ },
1236
+ {
1237
+ "epoch": 0.37760702524698136,
1238
+ "grad_norm": 8.772604827049461,
1239
+ "learning_rate": 9.981189125803095e-07,
1240
+ "loss": 0.4763,
1241
+ "step": 172
1242
+ },
1243
+ {
1244
+ "epoch": 0.37980241492864986,
1245
+ "grad_norm": 9.88650795305363,
1246
+ "learning_rate": 9.980067078006804e-07,
1247
+ "loss": 0.5062,
1248
+ "step": 173
1249
+ },
1250
+ {
1251
+ "epoch": 0.38199780461031835,
1252
+ "grad_norm": 8.73207633341288,
1253
+ "learning_rate": 9.978912594859946e-07,
1254
+ "loss": 0.5079,
1255
+ "step": 174
1256
+ },
1257
+ {
1258
+ "epoch": 0.38419319429198684,
1259
+ "grad_norm": 7.037929236057823,
1260
+ "learning_rate": 9.977725683881707e-07,
1261
+ "loss": 0.4444,
1262
+ "step": 175
1263
+ },
1264
+ {
1265
+ "epoch": 0.38638858397365533,
1266
+ "grad_norm": 14.22447093808163,
1267
+ "learning_rate": 9.97650635280248e-07,
1268
+ "loss": 0.4683,
1269
+ "step": 176
1270
+ },
1271
+ {
1272
+ "epoch": 0.3885839736553238,
1273
+ "grad_norm": 6.850018676344095,
1274
+ "learning_rate": 9.97525460956381e-07,
1275
+ "loss": 0.4544,
1276
+ "step": 177
1277
+ },
1278
+ {
1279
+ "epoch": 0.3907793633369923,
1280
+ "grad_norm": 9.295362171330144,
1281
+ "learning_rate": 9.973970462318349e-07,
1282
+ "loss": 0.4168,
1283
+ "step": 178
1284
+ },
1285
+ {
1286
+ "epoch": 0.3929747530186608,
1287
+ "grad_norm": 9.498462196078586,
1288
+ "learning_rate": 9.972653919429788e-07,
1289
+ "loss": 0.4762,
1290
+ "step": 179
1291
+ },
1292
+ {
1293
+ "epoch": 0.3951701427003293,
1294
+ "grad_norm": 14.51893007863248,
1295
+ "learning_rate": 9.971304989472817e-07,
1296
+ "loss": 0.429,
1297
+ "step": 180
1298
+ },
1299
+ {
1300
+ "epoch": 0.3973655323819978,
1301
+ "grad_norm": 10.861523669617597,
1302
+ "learning_rate": 9.969923681233066e-07,
1303
+ "loss": 0.5009,
1304
+ "step": 181
1305
+ },
1306
+ {
1307
+ "epoch": 0.3995609220636663,
1308
+ "grad_norm": 8.71310472180412,
1309
+ "learning_rate": 9.968510003707042e-07,
1310
+ "loss": 0.4562,
1311
+ "step": 182
1312
+ },
1313
+ {
1314
+ "epoch": 0.40175631174533477,
1315
+ "grad_norm": 7.504849700550564,
1316
+ "learning_rate": 9.967063966102079e-07,
1317
+ "loss": 0.4103,
1318
+ "step": 183
1319
+ },
1320
+ {
1321
+ "epoch": 0.4039517014270033,
1322
+ "grad_norm": 11.160809122945897,
1323
+ "learning_rate": 9.965585577836264e-07,
1324
+ "loss": 0.4868,
1325
+ "step": 184
1326
+ },
1327
+ {
1328
+ "epoch": 0.4061470911086718,
1329
+ "grad_norm": 9.173442546613508,
1330
+ "learning_rate": 9.9640748485384e-07,
1331
+ "loss": 0.4178,
1332
+ "step": 185
1333
+ },
1334
+ {
1335
+ "epoch": 0.4083424807903403,
1336
+ "grad_norm": 9.946380701067211,
1337
+ "learning_rate": 9.962531788047913e-07,
1338
+ "loss": 0.4075,
1339
+ "step": 186
1340
+ },
1341
+ {
1342
+ "epoch": 0.4105378704720088,
1343
+ "grad_norm": 6.7742539591322775,
1344
+ "learning_rate": 9.960956406414813e-07,
1345
+ "loss": 0.4228,
1346
+ "step": 187
1347
+ },
1348
+ {
1349
+ "epoch": 0.4127332601536773,
1350
+ "grad_norm": 6.667418828739606,
1351
+ "learning_rate": 9.959348713899613e-07,
1352
+ "loss": 0.4313,
1353
+ "step": 188
1354
+ },
1355
+ {
1356
+ "epoch": 0.4149286498353458,
1357
+ "grad_norm": 8.5318733439587,
1358
+ "learning_rate": 9.957708720973273e-07,
1359
+ "loss": 0.432,
1360
+ "step": 189
1361
+ },
1362
+ {
1363
+ "epoch": 0.41712403951701427,
1364
+ "grad_norm": 8.281669822513821,
1365
+ "learning_rate": 9.956036438317123e-07,
1366
+ "loss": 0.4454,
1367
+ "step": 190
1368
+ },
1369
+ {
1370
+ "epoch": 0.41931942919868276,
1371
+ "grad_norm": 11.094591963482126,
1372
+ "learning_rate": 9.954331876822798e-07,
1373
+ "loss": 0.4472,
1374
+ "step": 191
1375
+ },
1376
+ {
1377
+ "epoch": 0.42151481888035125,
1378
+ "grad_norm": 10.576004038338146,
1379
+ "learning_rate": 9.952595047592167e-07,
1380
+ "loss": 0.5284,
1381
+ "step": 192
1382
+ },
1383
+ {
1384
+ "epoch": 0.42371020856201974,
1385
+ "grad_norm": 7.579127144334551,
1386
+ "learning_rate": 9.950825961937257e-07,
1387
+ "loss": 0.4835,
1388
+ "step": 193
1389
+ },
1390
+ {
1391
+ "epoch": 0.42590559824368823,
1392
+ "grad_norm": 9.83385395859545,
1393
+ "learning_rate": 9.949024631380189e-07,
1394
+ "loss": 0.4513,
1395
+ "step": 194
1396
+ },
1397
+ {
1398
+ "epoch": 0.4281009879253567,
1399
+ "grad_norm": 9.260566516176205,
1400
+ "learning_rate": 9.94719106765309e-07,
1401
+ "loss": 0.4215,
1402
+ "step": 195
1403
+ },
1404
+ {
1405
+ "epoch": 0.43029637760702527,
1406
+ "grad_norm": 9.7492885786622,
1407
+ "learning_rate": 9.945325282698022e-07,
1408
+ "loss": 0.4338,
1409
+ "step": 196
1410
+ },
1411
+ {
1412
+ "epoch": 0.43249176728869376,
1413
+ "grad_norm": 8.278244359603962,
1414
+ "learning_rate": 9.94342728866691e-07,
1415
+ "loss": 0.4249,
1416
+ "step": 197
1417
+ },
1418
+ {
1419
+ "epoch": 0.43468715697036225,
1420
+ "grad_norm": 9.668030105849532,
1421
+ "learning_rate": 9.941497097921456e-07,
1422
+ "loss": 0.4328,
1423
+ "step": 198
1424
+ },
1425
+ {
1426
+ "epoch": 0.43688254665203075,
1427
+ "grad_norm": 9.271666205840873,
1428
+ "learning_rate": 9.939534723033057e-07,
1429
+ "loss": 0.4424,
1430
+ "step": 199
1431
+ },
1432
+ {
1433
+ "epoch": 0.43907793633369924,
1434
+ "grad_norm": 9.508865253140504,
1435
+ "learning_rate": 9.937540176782731e-07,
1436
+ "loss": 0.4076,
1437
+ "step": 200
1438
+ },
1439
+ {
1440
+ "epoch": 0.43907793633369924,
1441
+ "eval_accuracy": 0.778,
1442
+ "eval_loss": 0.41148170828819275,
1443
+ "eval_runtime": 52.0039,
1444
+ "eval_samples_per_second": 9.615,
1445
+ "eval_steps_per_second": 1.211,
1446
+ "step": 200
1447
+ },
1448
+ {
1449
+ "epoch": 0.44127332601536773,
1450
+ "grad_norm": 15.641606615350042,
1451
+ "learning_rate": 9.935513472161026e-07,
1452
+ "loss": 0.5175,
1453
+ "step": 201
1454
+ },
1455
+ {
1456
+ "epoch": 0.4434687156970362,
1457
+ "grad_norm": 6.583946691921041,
1458
+ "learning_rate": 9.93345462236794e-07,
1459
+ "loss": 0.4612,
1460
+ "step": 202
1461
+ },
1462
+ {
1463
+ "epoch": 0.4456641053787047,
1464
+ "grad_norm": 7.317576490413668,
1465
+ "learning_rate": 9.931363640812837e-07,
1466
+ "loss": 0.4319,
1467
+ "step": 203
1468
+ },
1469
+ {
1470
+ "epoch": 0.4478594950603732,
1471
+ "grad_norm": 8.58238734593965,
1472
+ "learning_rate": 9.929240541114347e-07,
1473
+ "loss": 0.4362,
1474
+ "step": 204
1475
+ },
1476
+ {
1477
+ "epoch": 0.4500548847420417,
1478
+ "grad_norm": 7.645878462772526,
1479
+ "learning_rate": 9.927085337100298e-07,
1480
+ "loss": 0.4449,
1481
+ "step": 205
1482
+ },
1483
+ {
1484
+ "epoch": 0.4522502744237102,
1485
+ "grad_norm": 8.128944249572761,
1486
+ "learning_rate": 9.924898042807604e-07,
1487
+ "loss": 0.4359,
1488
+ "step": 206
1489
+ },
1490
+ {
1491
+ "epoch": 0.4544456641053787,
1492
+ "grad_norm": 6.323254265724658,
1493
+ "learning_rate": 9.922678672482192e-07,
1494
+ "loss": 0.4349,
1495
+ "step": 207
1496
+ },
1497
+ {
1498
+ "epoch": 0.4566410537870472,
1499
+ "grad_norm": 8.343466628765478,
1500
+ "learning_rate": 9.920427240578898e-07,
1501
+ "loss": 0.4601,
1502
+ "step": 208
1503
+ },
1504
+ {
1505
+ "epoch": 0.4588364434687157,
1506
+ "grad_norm": 9.94252397747147,
1507
+ "learning_rate": 9.918143761761376e-07,
1508
+ "loss": 0.4275,
1509
+ "step": 209
1510
+ },
1511
+ {
1512
+ "epoch": 0.4610318331503842,
1513
+ "grad_norm": 6.996773271668393,
1514
+ "learning_rate": 9.915828250902003e-07,
1515
+ "loss": 0.4092,
1516
+ "step": 210
1517
+ },
1518
+ {
1519
+ "epoch": 0.4632272228320527,
1520
+ "grad_norm": 7.702375964154107,
1521
+ "learning_rate": 9.913480723081782e-07,
1522
+ "loss": 0.4039,
1523
+ "step": 211
1524
+ },
1525
+ {
1526
+ "epoch": 0.4654226125137212,
1527
+ "grad_norm": 12.275736080317706,
1528
+ "learning_rate": 9.911101193590243e-07,
1529
+ "loss": 0.483,
1530
+ "step": 212
1531
+ },
1532
+ {
1533
+ "epoch": 0.4676180021953897,
1534
+ "grad_norm": 13.554222149681413,
1535
+ "learning_rate": 9.908689677925347e-07,
1536
+ "loss": 0.408,
1537
+ "step": 213
1538
+ },
1539
+ {
1540
+ "epoch": 0.4698133918770582,
1541
+ "grad_norm": 10.859860574414233,
1542
+ "learning_rate": 9.906246191793378e-07,
1543
+ "loss": 0.4032,
1544
+ "step": 214
1545
+ },
1546
+ {
1547
+ "epoch": 0.47200878155872666,
1548
+ "grad_norm": 13.78109413813406,
1549
+ "learning_rate": 9.903770751108845e-07,
1550
+ "loss": 0.414,
1551
+ "step": 215
1552
+ },
1553
+ {
1554
+ "epoch": 0.47420417124039516,
1555
+ "grad_norm": 7.982093750602254,
1556
+ "learning_rate": 9.901263371994381e-07,
1557
+ "loss": 0.4097,
1558
+ "step": 216
1559
+ },
1560
+ {
1561
+ "epoch": 0.47639956092206365,
1562
+ "grad_norm": 7.226081741797403,
1563
+ "learning_rate": 9.898724070780636e-07,
1564
+ "loss": 0.424,
1565
+ "step": 217
1566
+ },
1567
+ {
1568
+ "epoch": 0.47859495060373214,
1569
+ "grad_norm": 6.5505528789866005,
1570
+ "learning_rate": 9.896152864006163e-07,
1571
+ "loss": 0.4117,
1572
+ "step": 218
1573
+ },
1574
+ {
1575
+ "epoch": 0.4807903402854007,
1576
+ "grad_norm": 7.238329656366918,
1577
+ "learning_rate": 9.893549768417324e-07,
1578
+ "loss": 0.426,
1579
+ "step": 219
1580
+ },
1581
+ {
1582
+ "epoch": 0.4829857299670692,
1583
+ "grad_norm": 9.018371675867607,
1584
+ "learning_rate": 9.89091480096817e-07,
1585
+ "loss": 0.4301,
1586
+ "step": 220
1587
+ },
1588
+ {
1589
+ "epoch": 0.48518111964873767,
1590
+ "grad_norm": 6.873575031102029,
1591
+ "learning_rate": 9.888247978820336e-07,
1592
+ "loss": 0.4412,
1593
+ "step": 221
1594
+ },
1595
+ {
1596
+ "epoch": 0.48737650933040616,
1597
+ "grad_norm": 7.71741695116787,
1598
+ "learning_rate": 9.88554931934293e-07,
1599
+ "loss": 0.401,
1600
+ "step": 222
1601
+ },
1602
+ {
1603
+ "epoch": 0.48957189901207465,
1604
+ "grad_norm": 6.450074931709192,
1605
+ "learning_rate": 9.882818840112412e-07,
1606
+ "loss": 0.4453,
1607
+ "step": 223
1608
+ },
1609
+ {
1610
+ "epoch": 0.49176728869374314,
1611
+ "grad_norm": 10.18694339380834,
1612
+ "learning_rate": 9.88005655891249e-07,
1613
+ "loss": 0.4499,
1614
+ "step": 224
1615
+ },
1616
+ {
1617
+ "epoch": 0.49396267837541163,
1618
+ "grad_norm": 8.526454527609461,
1619
+ "learning_rate": 9.877262493734e-07,
1620
+ "loss": 0.3529,
1621
+ "step": 225
1622
+ },
1623
+ {
1624
+ "epoch": 0.4961580680570801,
1625
+ "grad_norm": 7.8436191187666795,
1626
+ "learning_rate": 9.874436662774781e-07,
1627
+ "loss": 0.3862,
1628
+ "step": 226
1629
+ },
1630
+ {
1631
+ "epoch": 0.4983534577387486,
1632
+ "grad_norm": 12.16130920239228,
1633
+ "learning_rate": 9.871579084439573e-07,
1634
+ "loss": 0.4494,
1635
+ "step": 227
1636
+ },
1637
+ {
1638
+ "epoch": 0.5005488474204172,
1639
+ "grad_norm": 13.22347671983063,
1640
+ "learning_rate": 9.868689777339882e-07,
1641
+ "loss": 0.3777,
1642
+ "step": 228
1643
+ },
1644
+ {
1645
+ "epoch": 0.5027442371020856,
1646
+ "grad_norm": 10.714878837498341,
1647
+ "learning_rate": 9.865768760293865e-07,
1648
+ "loss": 0.4392,
1649
+ "step": 229
1650
+ },
1651
+ {
1652
+ "epoch": 0.5049396267837541,
1653
+ "grad_norm": 10.989931098186185,
1654
+ "learning_rate": 9.862816052326207e-07,
1655
+ "loss": 0.4342,
1656
+ "step": 230
1657
+ },
1658
+ {
1659
+ "epoch": 0.5071350164654226,
1660
+ "grad_norm": 9.420235479586577,
1661
+ "learning_rate": 9.859831672668001e-07,
1662
+ "loss": 0.449,
1663
+ "step": 231
1664
+ },
1665
+ {
1666
+ "epoch": 0.5093304061470911,
1667
+ "grad_norm": 7.8352789241534895,
1668
+ "learning_rate": 9.856815640756614e-07,
1669
+ "loss": 0.4066,
1670
+ "step": 232
1671
+ },
1672
+ {
1673
+ "epoch": 0.5115257958287596,
1674
+ "grad_norm": 6.528093022715006,
1675
+ "learning_rate": 9.85376797623557e-07,
1676
+ "loss": 0.4076,
1677
+ "step": 233
1678
+ },
1679
+ {
1680
+ "epoch": 0.5137211855104281,
1681
+ "grad_norm": 9.599510824703753,
1682
+ "learning_rate": 9.850688698954408e-07,
1683
+ "loss": 0.4656,
1684
+ "step": 234
1685
+ },
1686
+ {
1687
+ "epoch": 0.5159165751920965,
1688
+ "grad_norm": 7.327102993121418,
1689
+ "learning_rate": 9.847577828968574e-07,
1690
+ "loss": 0.3928,
1691
+ "step": 235
1692
+ },
1693
+ {
1694
+ "epoch": 0.5181119648737651,
1695
+ "grad_norm": 8.47025842420312,
1696
+ "learning_rate": 9.84443538653927e-07,
1697
+ "loss": 0.4459,
1698
+ "step": 236
1699
+ },
1700
+ {
1701
+ "epoch": 0.5203073545554336,
1702
+ "grad_norm": 8.808300848288402,
1703
+ "learning_rate": 9.841261392133334e-07,
1704
+ "loss": 0.4699,
1705
+ "step": 237
1706
+ },
1707
+ {
1708
+ "epoch": 0.5225027442371021,
1709
+ "grad_norm": 6.4472715749678535,
1710
+ "learning_rate": 9.838055866423101e-07,
1711
+ "loss": 0.3927,
1712
+ "step": 238
1713
+ },
1714
+ {
1715
+ "epoch": 0.5246981339187706,
1716
+ "grad_norm": 8.71789150223208,
1717
+ "learning_rate": 9.834818830286274e-07,
1718
+ "loss": 0.4112,
1719
+ "step": 239
1720
+ },
1721
+ {
1722
+ "epoch": 0.5268935236004391,
1723
+ "grad_norm": 6.520504611614625,
1724
+ "learning_rate": 9.83155030480578e-07,
1725
+ "loss": 0.4221,
1726
+ "step": 240
1727
+ },
1728
+ {
1729
+ "epoch": 0.5290889132821076,
1730
+ "grad_norm": 7.822762966760966,
1731
+ "learning_rate": 9.82825031126964e-07,
1732
+ "loss": 0.4085,
1733
+ "step": 241
1734
+ },
1735
+ {
1736
+ "epoch": 0.531284302963776,
1737
+ "grad_norm": 8.42042192814293,
1738
+ "learning_rate": 9.82491887117083e-07,
1739
+ "loss": 0.3344,
1740
+ "step": 242
1741
+ },
1742
+ {
1743
+ "epoch": 0.5334796926454446,
1744
+ "grad_norm": 8.795849170631488,
1745
+ "learning_rate": 9.821556006207131e-07,
1746
+ "loss": 0.4706,
1747
+ "step": 243
1748
+ },
1749
+ {
1750
+ "epoch": 0.535675082327113,
1751
+ "grad_norm": 11.237650950766433,
1752
+ "learning_rate": 9.818161738281003e-07,
1753
+ "loss": 0.3922,
1754
+ "step": 244
1755
+ },
1756
+ {
1757
+ "epoch": 0.5378704720087816,
1758
+ "grad_norm": 8.79761175477303,
1759
+ "learning_rate": 9.81473608949943e-07,
1760
+ "loss": 0.3798,
1761
+ "step": 245
1762
+ },
1763
+ {
1764
+ "epoch": 0.54006586169045,
1765
+ "grad_norm": 9.224818726295643,
1766
+ "learning_rate": 9.811279082173783e-07,
1767
+ "loss": 0.4045,
1768
+ "step": 246
1769
+ },
1770
+ {
1771
+ "epoch": 0.5422612513721186,
1772
+ "grad_norm": 11.451420498389481,
1773
+ "learning_rate": 9.80779073881967e-07,
1774
+ "loss": 0.4767,
1775
+ "step": 247
1776
+ },
1777
+ {
1778
+ "epoch": 0.544456641053787,
1779
+ "grad_norm": 14.204545837436886,
1780
+ "learning_rate": 9.804271082156792e-07,
1781
+ "loss": 0.5448,
1782
+ "step": 248
1783
+ },
1784
+ {
1785
+ "epoch": 0.5466520307354555,
1786
+ "grad_norm": 8.162056177470248,
1787
+ "learning_rate": 9.800720135108798e-07,
1788
+ "loss": 0.4662,
1789
+ "step": 249
1790
+ },
1791
+ {
1792
+ "epoch": 0.5488474204171241,
1793
+ "grad_norm": 5.928770792444553,
1794
+ "learning_rate": 9.79713792080313e-07,
1795
+ "loss": 0.3764,
1796
+ "step": 250
1797
+ },
1798
+ {
1799
+ "epoch": 0.5488474204171241,
1800
+ "eval_accuracy": 0.764,
1801
+ "eval_loss": 0.41615644097328186,
1802
+ "eval_runtime": 51.876,
1803
+ "eval_samples_per_second": 9.638,
1804
+ "eval_steps_per_second": 1.214,
1805
+ "step": 250
1806
+ },
1807
+ {
1808
+ "epoch": 0.5510428100987925,
1809
+ "grad_norm": 6.184202969509846,
1810
+ "learning_rate": 9.793524462570874e-07,
1811
+ "loss": 0.3895,
1812
+ "step": 251
1813
+ },
1814
+ {
1815
+ "epoch": 0.5532381997804611,
1816
+ "grad_norm": 7.749574593570711,
1817
+ "learning_rate": 9.78987978394661e-07,
1818
+ "loss": 0.4515,
1819
+ "step": 252
1820
+ },
1821
+ {
1822
+ "epoch": 0.5554335894621295,
1823
+ "grad_norm": 8.854833970278174,
1824
+ "learning_rate": 9.786203908668255e-07,
1825
+ "loss": 0.4269,
1826
+ "step": 253
1827
+ },
1828
+ {
1829
+ "epoch": 0.557628979143798,
1830
+ "grad_norm": 6.18988995877324,
1831
+ "learning_rate": 9.78249686067691e-07,
1832
+ "loss": 0.4204,
1833
+ "step": 254
1834
+ },
1835
+ {
1836
+ "epoch": 0.5598243688254665,
1837
+ "grad_norm": 6.0975770712127,
1838
+ "learning_rate": 9.778758664116717e-07,
1839
+ "loss": 0.4312,
1840
+ "step": 255
1841
+ },
1842
+ {
1843
+ "epoch": 0.562019758507135,
1844
+ "grad_norm": 7.860127371723332,
1845
+ "learning_rate": 9.774989343334675e-07,
1846
+ "loss": 0.4056,
1847
+ "step": 256
1848
+ },
1849
+ {
1850
+ "epoch": 0.5642151481888035,
1851
+ "grad_norm": 6.287357081202248,
1852
+ "learning_rate": 9.771188922880501e-07,
1853
+ "loss": 0.431,
1854
+ "step": 257
1855
+ },
1856
+ {
1857
+ "epoch": 0.566410537870472,
1858
+ "grad_norm": 7.482025843040615,
1859
+ "learning_rate": 9.76735742750647e-07,
1860
+ "loss": 0.4237,
1861
+ "step": 258
1862
+ },
1863
+ {
1864
+ "epoch": 0.5686059275521405,
1865
+ "grad_norm": 6.626343732308425,
1866
+ "learning_rate": 9.763494882167238e-07,
1867
+ "loss": 0.4258,
1868
+ "step": 259
1869
+ },
1870
+ {
1871
+ "epoch": 0.570801317233809,
1872
+ "grad_norm": 6.379142997039637,
1873
+ "learning_rate": 9.759601312019705e-07,
1874
+ "loss": 0.4496,
1875
+ "step": 260
1876
+ },
1877
+ {
1878
+ "epoch": 0.5729967069154775,
1879
+ "grad_norm": 7.786859033495975,
1880
+ "learning_rate": 9.755676742422824e-07,
1881
+ "loss": 0.3697,
1882
+ "step": 261
1883
+ },
1884
+ {
1885
+ "epoch": 0.575192096597146,
1886
+ "grad_norm": 7.474508144431387,
1887
+ "learning_rate": 9.751721198937457e-07,
1888
+ "loss": 0.3805,
1889
+ "step": 262
1890
+ },
1891
+ {
1892
+ "epoch": 0.5773874862788145,
1893
+ "grad_norm": 5.797750979716916,
1894
+ "learning_rate": 9.747734707326194e-07,
1895
+ "loss": 0.3832,
1896
+ "step": 263
1897
+ },
1898
+ {
1899
+ "epoch": 0.579582875960483,
1900
+ "grad_norm": 10.801633788158538,
1901
+ "learning_rate": 9.743717293553197e-07,
1902
+ "loss": 0.3489,
1903
+ "step": 264
1904
+ },
1905
+ {
1906
+ "epoch": 0.5817782656421515,
1907
+ "grad_norm": 8.81184130305926,
1908
+ "learning_rate": 9.73966898378402e-07,
1909
+ "loss": 0.3933,
1910
+ "step": 265
1911
+ },
1912
+ {
1913
+ "epoch": 0.58397365532382,
1914
+ "grad_norm": 10.99886571046786,
1915
+ "learning_rate": 9.735589804385445e-07,
1916
+ "loss": 0.4582,
1917
+ "step": 266
1918
+ },
1919
+ {
1920
+ "epoch": 0.5861690450054885,
1921
+ "grad_norm": 8.989645189656823,
1922
+ "learning_rate": 9.731479781925308e-07,
1923
+ "loss": 0.39,
1924
+ "step": 267
1925
+ },
1926
+ {
1927
+ "epoch": 0.5883644346871569,
1928
+ "grad_norm": 9.04325471249987,
1929
+ "learning_rate": 9.727338943172335e-07,
1930
+ "loss": 0.402,
1931
+ "step": 268
1932
+ },
1933
+ {
1934
+ "epoch": 0.5905598243688255,
1935
+ "grad_norm": 8.32075801154753,
1936
+ "learning_rate": 9.723167315095947e-07,
1937
+ "loss": 0.3896,
1938
+ "step": 269
1939
+ },
1940
+ {
1941
+ "epoch": 0.5927552140504939,
1942
+ "grad_norm": 11.735779106191535,
1943
+ "learning_rate": 9.718964924866108e-07,
1944
+ "loss": 0.4977,
1945
+ "step": 270
1946
+ },
1947
+ {
1948
+ "epoch": 0.5949506037321625,
1949
+ "grad_norm": 6.9875015531767515,
1950
+ "learning_rate": 9.71473179985313e-07,
1951
+ "loss": 0.4142,
1952
+ "step": 271
1953
+ },
1954
+ {
1955
+ "epoch": 0.5971459934138309,
1956
+ "grad_norm": 7.256558656488309,
1957
+ "learning_rate": 9.710467967627502e-07,
1958
+ "loss": 0.4729,
1959
+ "step": 272
1960
+ },
1961
+ {
1962
+ "epoch": 0.5993413830954994,
1963
+ "grad_norm": 6.63170478635617,
1964
+ "learning_rate": 9.706173455959713e-07,
1965
+ "loss": 0.3926,
1966
+ "step": 273
1967
+ },
1968
+ {
1969
+ "epoch": 0.601536772777168,
1970
+ "grad_norm": 7.736941306994274,
1971
+ "learning_rate": 9.701848292820069e-07,
1972
+ "loss": 0.4269,
1973
+ "step": 274
1974
+ },
1975
+ {
1976
+ "epoch": 0.6037321624588364,
1977
+ "grad_norm": 7.141227825990968,
1978
+ "learning_rate": 9.697492506378507e-07,
1979
+ "loss": 0.4746,
1980
+ "step": 275
1981
+ },
1982
+ {
1983
+ "epoch": 0.605927552140505,
1984
+ "grad_norm": 6.175085064537569,
1985
+ "learning_rate": 9.693106125004416e-07,
1986
+ "loss": 0.3916,
1987
+ "step": 276
1988
+ },
1989
+ {
1990
+ "epoch": 0.6081229418221734,
1991
+ "grad_norm": 8.496732166784003,
1992
+ "learning_rate": 9.688689177266452e-07,
1993
+ "loss": 0.4195,
1994
+ "step": 277
1995
+ },
1996
+ {
1997
+ "epoch": 0.610318331503842,
1998
+ "grad_norm": 5.940495748803936,
1999
+ "learning_rate": 9.684241691932347e-07,
2000
+ "loss": 0.4218,
2001
+ "step": 278
2002
+ },
2003
+ {
2004
+ "epoch": 0.6125137211855104,
2005
+ "grad_norm": 6.794809207027829,
2006
+ "learning_rate": 9.679763697968732e-07,
2007
+ "loss": 0.4064,
2008
+ "step": 279
2009
+ },
2010
+ {
2011
+ "epoch": 0.6147091108671789,
2012
+ "grad_norm": 7.232988324397075,
2013
+ "learning_rate": 9.675255224540934e-07,
2014
+ "loss": 0.3699,
2015
+ "step": 280
2016
+ },
2017
+ {
2018
+ "epoch": 0.6169045005488474,
2019
+ "grad_norm": 7.07939070544042,
2020
+ "learning_rate": 9.6707163010128e-07,
2021
+ "loss": 0.4168,
2022
+ "step": 281
2023
+ },
2024
+ {
2025
+ "epoch": 0.6190998902305159,
2026
+ "grad_norm": 8.718610382883304,
2027
+ "learning_rate": 9.666146956946496e-07,
2028
+ "loss": 0.4097,
2029
+ "step": 282
2030
+ },
2031
+ {
2032
+ "epoch": 0.6212952799121844,
2033
+ "grad_norm": 9.093471494946654,
2034
+ "learning_rate": 9.661547222102321e-07,
2035
+ "loss": 0.4252,
2036
+ "step": 283
2037
+ },
2038
+ {
2039
+ "epoch": 0.6234906695938529,
2040
+ "grad_norm": 13.940230416519052,
2041
+ "learning_rate": 9.656917126438508e-07,
2042
+ "loss": 0.4567,
2043
+ "step": 284
2044
+ },
2045
+ {
2046
+ "epoch": 0.6256860592755215,
2047
+ "grad_norm": 8.779803873571954,
2048
+ "learning_rate": 9.65225670011103e-07,
2049
+ "loss": 0.383,
2050
+ "step": 285
2051
+ },
2052
+ {
2053
+ "epoch": 0.6278814489571899,
2054
+ "grad_norm": 7.5708107372450035,
2055
+ "learning_rate": 9.647565973473407e-07,
2056
+ "loss": 0.4124,
2057
+ "step": 286
2058
+ },
2059
+ {
2060
+ "epoch": 0.6300768386388584,
2061
+ "grad_norm": 13.174216612702878,
2062
+ "learning_rate": 9.642844977076507e-07,
2063
+ "loss": 0.4502,
2064
+ "step": 287
2065
+ },
2066
+ {
2067
+ "epoch": 0.6322722283205269,
2068
+ "grad_norm": 8.49295896068912,
2069
+ "learning_rate": 9.63809374166834e-07,
2070
+ "loss": 0.4369,
2071
+ "step": 288
2072
+ },
2073
+ {
2074
+ "epoch": 0.6344676180021954,
2075
+ "grad_norm": 9.345508533381187,
2076
+ "learning_rate": 9.633312298193871e-07,
2077
+ "loss": 0.4625,
2078
+ "step": 289
2079
+ },
2080
+ {
2081
+ "epoch": 0.6366630076838639,
2082
+ "grad_norm": 9.696466925170553,
2083
+ "learning_rate": 9.62850067779481e-07,
2084
+ "loss": 0.4442,
2085
+ "step": 290
2086
+ },
2087
+ {
2088
+ "epoch": 0.6388583973655324,
2089
+ "grad_norm": 8.896625991218926,
2090
+ "learning_rate": 9.623658911809404e-07,
2091
+ "loss": 0.3991,
2092
+ "step": 291
2093
+ },
2094
+ {
2095
+ "epoch": 0.6410537870472008,
2096
+ "grad_norm": 9.525265923938584,
2097
+ "learning_rate": 9.618787031772245e-07,
2098
+ "loss": 0.3849,
2099
+ "step": 292
2100
+ },
2101
+ {
2102
+ "epoch": 0.6432491767288694,
2103
+ "grad_norm": 5.576065012637179,
2104
+ "learning_rate": 9.61388506941406e-07,
2105
+ "loss": 0.4233,
2106
+ "step": 293
2107
+ },
2108
+ {
2109
+ "epoch": 0.6454445664105378,
2110
+ "grad_norm": 7.207790760174152,
2111
+ "learning_rate": 9.6089530566615e-07,
2112
+ "loss": 0.3935,
2113
+ "step": 294
2114
+ },
2115
+ {
2116
+ "epoch": 0.6476399560922064,
2117
+ "grad_norm": 6.129075752774988,
2118
+ "learning_rate": 9.603991025636933e-07,
2119
+ "loss": 0.3904,
2120
+ "step": 295
2121
+ },
2122
+ {
2123
+ "epoch": 0.6498353457738749,
2124
+ "grad_norm": 7.223439076532464,
2125
+ "learning_rate": 9.598999008658241e-07,
2126
+ "loss": 0.3842,
2127
+ "step": 296
2128
+ },
2129
+ {
2130
+ "epoch": 0.6520307354555434,
2131
+ "grad_norm": 8.143383508497742,
2132
+ "learning_rate": 9.59397703823861e-07,
2133
+ "loss": 0.4269,
2134
+ "step": 297
2135
+ },
2136
+ {
2137
+ "epoch": 0.6542261251372119,
2138
+ "grad_norm": 9.02225167943478,
2139
+ "learning_rate": 9.588925147086303e-07,
2140
+ "loss": 0.3878,
2141
+ "step": 298
2142
+ },
2143
+ {
2144
+ "epoch": 0.6564215148188803,
2145
+ "grad_norm": 7.737039238378963,
2146
+ "learning_rate": 9.583843368104464e-07,
2147
+ "loss": 0.3508,
2148
+ "step": 299
2149
+ },
2150
+ {
2151
+ "epoch": 0.6586169045005489,
2152
+ "grad_norm": 6.743123024533224,
2153
+ "learning_rate": 9.578731734390898e-07,
2154
+ "loss": 0.3808,
2155
+ "step": 300
2156
+ },
2157
+ {
2158
+ "epoch": 0.6586169045005489,
2159
+ "eval_accuracy": 0.77,
2160
+ "eval_loss": 0.41097283363342285,
2161
+ "eval_runtime": 52.0828,
2162
+ "eval_samples_per_second": 9.6,
2163
+ "eval_steps_per_second": 1.21,
2164
+ "step": 300
2165
+ },
2166
+ {
2167
+ "epoch": 0.6608122941822173,
2168
+ "grad_norm": 7.137538658423445,
2169
+ "learning_rate": 9.573590279237854e-07,
2170
+ "loss": 0.3938,
2171
+ "step": 301
2172
+ },
2173
+ {
2174
+ "epoch": 0.6630076838638859,
2175
+ "grad_norm": 6.92894058529831,
2176
+ "learning_rate": 9.568419036131807e-07,
2177
+ "loss": 0.3729,
2178
+ "step": 302
2179
+ },
2180
+ {
2181
+ "epoch": 0.6652030735455543,
2182
+ "grad_norm": 8.370178365332173,
2183
+ "learning_rate": 9.563218038753245e-07,
2184
+ "loss": 0.4004,
2185
+ "step": 303
2186
+ },
2187
+ {
2188
+ "epoch": 0.6673984632272228,
2189
+ "grad_norm": 14.386262804761378,
2190
+ "learning_rate": 9.557987320976446e-07,
2191
+ "loss": 0.4211,
2192
+ "step": 304
2193
+ },
2194
+ {
2195
+ "epoch": 0.6695938529088913,
2196
+ "grad_norm": 7.7875110721670975,
2197
+ "learning_rate": 9.552726916869254e-07,
2198
+ "loss": 0.388,
2199
+ "step": 305
2200
+ },
2201
+ {
2202
+ "epoch": 0.6717892425905598,
2203
+ "grad_norm": 8.49964853234254,
2204
+ "learning_rate": 9.547436860692869e-07,
2205
+ "loss": 0.4369,
2206
+ "step": 306
2207
+ },
2208
+ {
2209
+ "epoch": 0.6739846322722283,
2210
+ "grad_norm": 6.5964314472495404,
2211
+ "learning_rate": 9.542117186901608e-07,
2212
+ "loss": 0.3423,
2213
+ "step": 307
2214
+ },
2215
+ {
2216
+ "epoch": 0.6761800219538968,
2217
+ "grad_norm": 7.6505759115431085,
2218
+ "learning_rate": 9.536767930142692e-07,
2219
+ "loss": 0.3602,
2220
+ "step": 308
2221
+ },
2222
+ {
2223
+ "epoch": 0.6783754116355654,
2224
+ "grad_norm": 7.556175808157708,
2225
+ "learning_rate": 9.53138912525602e-07,
2226
+ "loss": 0.4385,
2227
+ "step": 309
2228
+ },
2229
+ {
2230
+ "epoch": 0.6805708013172338,
2231
+ "grad_norm": 9.165395898058962,
2232
+ "learning_rate": 9.525980807273933e-07,
2233
+ "loss": 0.4516,
2234
+ "step": 310
2235
+ },
2236
+ {
2237
+ "epoch": 0.6827661909989023,
2238
+ "grad_norm": 7.637449593962663,
2239
+ "learning_rate": 9.520543011420994e-07,
2240
+ "loss": 0.4289,
2241
+ "step": 311
2242
+ },
2243
+ {
2244
+ "epoch": 0.6849615806805708,
2245
+ "grad_norm": 9.18857916606884,
2246
+ "learning_rate": 9.515075773113758e-07,
2247
+ "loss": 0.3785,
2248
+ "step": 312
2249
+ },
2250
+ {
2251
+ "epoch": 0.6871569703622393,
2252
+ "grad_norm": 7.821219676613859,
2253
+ "learning_rate": 9.509579127960541e-07,
2254
+ "loss": 0.4158,
2255
+ "step": 313
2256
+ },
2257
+ {
2258
+ "epoch": 0.6893523600439078,
2259
+ "grad_norm": 6.571343391685928,
2260
+ "learning_rate": 9.504053111761183e-07,
2261
+ "loss": 0.3609,
2262
+ "step": 314
2263
+ },
2264
+ {
2265
+ "epoch": 0.6915477497255763,
2266
+ "grad_norm": 6.225595749149676,
2267
+ "learning_rate": 9.498497760506819e-07,
2268
+ "loss": 0.4359,
2269
+ "step": 315
2270
+ },
2271
+ {
2272
+ "epoch": 0.6937431394072447,
2273
+ "grad_norm": 6.950774628316192,
2274
+ "learning_rate": 9.492913110379647e-07,
2275
+ "loss": 0.4417,
2276
+ "step": 316
2277
+ },
2278
+ {
2279
+ "epoch": 0.6959385290889133,
2280
+ "grad_norm": 5.612528566630172,
2281
+ "learning_rate": 9.487299197752687e-07,
2282
+ "loss": 0.4052,
2283
+ "step": 317
2284
+ },
2285
+ {
2286
+ "epoch": 0.6981339187705817,
2287
+ "grad_norm": 8.404168783130382,
2288
+ "learning_rate": 9.481656059189549e-07,
2289
+ "loss": 0.4162,
2290
+ "step": 318
2291
+ },
2292
+ {
2293
+ "epoch": 0.7003293084522503,
2294
+ "grad_norm": 8.031865429035927,
2295
+ "learning_rate": 9.475983731444191e-07,
2296
+ "loss": 0.4125,
2297
+ "step": 319
2298
+ },
2299
+ {
2300
+ "epoch": 0.7025246981339188,
2301
+ "grad_norm": 10.321047597755841,
2302
+ "learning_rate": 9.47028225146068e-07,
2303
+ "loss": 0.4505,
2304
+ "step": 320
2305
+ },
2306
+ {
2307
+ "epoch": 0.7047200878155873,
2308
+ "grad_norm": 7.740999393168428,
2309
+ "learning_rate": 9.464551656372955e-07,
2310
+ "loss": 0.3765,
2311
+ "step": 321
2312
+ },
2313
+ {
2314
+ "epoch": 0.7069154774972558,
2315
+ "grad_norm": 8.215346652475102,
2316
+ "learning_rate": 9.458791983504581e-07,
2317
+ "loss": 0.401,
2318
+ "step": 322
2319
+ },
2320
+ {
2321
+ "epoch": 0.7091108671789242,
2322
+ "grad_norm": 7.4542945597526336,
2323
+ "learning_rate": 9.453003270368509e-07,
2324
+ "loss": 0.4796,
2325
+ "step": 323
2326
+ },
2327
+ {
2328
+ "epoch": 0.7113062568605928,
2329
+ "grad_norm": 7.054708809148231,
2330
+ "learning_rate": 9.44718555466683e-07,
2331
+ "loss": 0.4497,
2332
+ "step": 324
2333
+ },
2334
+ {
2335
+ "epoch": 0.7135016465422612,
2336
+ "grad_norm": 7.460280928904919,
2337
+ "learning_rate": 9.44133887429053e-07,
2338
+ "loss": 0.3926,
2339
+ "step": 325
2340
+ },
2341
+ {
2342
+ "epoch": 0.7156970362239298,
2343
+ "grad_norm": 8.265245053890029,
2344
+ "learning_rate": 9.435463267319239e-07,
2345
+ "loss": 0.3805,
2346
+ "step": 326
2347
+ },
2348
+ {
2349
+ "epoch": 0.7178924259055982,
2350
+ "grad_norm": 7.800641885443644,
2351
+ "learning_rate": 9.429558772020992e-07,
2352
+ "loss": 0.4424,
2353
+ "step": 327
2354
+ },
2355
+ {
2356
+ "epoch": 0.7200878155872668,
2357
+ "grad_norm": 8.978144769546304,
2358
+ "learning_rate": 9.423625426851973e-07,
2359
+ "loss": 0.4159,
2360
+ "step": 328
2361
+ },
2362
+ {
2363
+ "epoch": 0.7222832052689352,
2364
+ "grad_norm": 6.867455347578303,
2365
+ "learning_rate": 9.417663270456267e-07,
2366
+ "loss": 0.402,
2367
+ "step": 329
2368
+ },
2369
+ {
2370
+ "epoch": 0.7244785949506037,
2371
+ "grad_norm": 7.15816324078702,
2372
+ "learning_rate": 9.411672341665604e-07,
2373
+ "loss": 0.3814,
2374
+ "step": 330
2375
+ },
2376
+ {
2377
+ "epoch": 0.7266739846322722,
2378
+ "grad_norm": 7.444204700173956,
2379
+ "learning_rate": 9.405652679499115e-07,
2380
+ "loss": 0.3782,
2381
+ "step": 331
2382
+ },
2383
+ {
2384
+ "epoch": 0.7288693743139407,
2385
+ "grad_norm": 6.789459839990034,
2386
+ "learning_rate": 9.399604323163068e-07,
2387
+ "loss": 0.3484,
2388
+ "step": 332
2389
+ },
2390
+ {
2391
+ "epoch": 0.7310647639956093,
2392
+ "grad_norm": 14.478524238054472,
2393
+ "learning_rate": 9.393527312050617e-07,
2394
+ "loss": 0.464,
2395
+ "step": 333
2396
+ },
2397
+ {
2398
+ "epoch": 0.7332601536772777,
2399
+ "grad_norm": 20.668512412883626,
2400
+ "learning_rate": 9.387421685741552e-07,
2401
+ "loss": 0.4503,
2402
+ "step": 334
2403
+ },
2404
+ {
2405
+ "epoch": 0.7354555433589463,
2406
+ "grad_norm": 12.197800422910193,
2407
+ "learning_rate": 9.381287484002027e-07,
2408
+ "loss": 0.4281,
2409
+ "step": 335
2410
+ },
2411
+ {
2412
+ "epoch": 0.7376509330406147,
2413
+ "grad_norm": 6.445277090460248,
2414
+ "learning_rate": 9.375124746784311e-07,
2415
+ "loss": 0.3648,
2416
+ "step": 336
2417
+ },
2418
+ {
2419
+ "epoch": 0.7398463227222832,
2420
+ "grad_norm": 11.96859661106773,
2421
+ "learning_rate": 9.368933514226529e-07,
2422
+ "loss": 0.4608,
2423
+ "step": 337
2424
+ },
2425
+ {
2426
+ "epoch": 0.7420417124039517,
2427
+ "grad_norm": 5.939377322140176,
2428
+ "learning_rate": 9.362713826652392e-07,
2429
+ "loss": 0.4006,
2430
+ "step": 338
2431
+ },
2432
+ {
2433
+ "epoch": 0.7442371020856202,
2434
+ "grad_norm": 8.561222995237147,
2435
+ "learning_rate": 9.356465724570943e-07,
2436
+ "loss": 0.4163,
2437
+ "step": 339
2438
+ },
2439
+ {
2440
+ "epoch": 0.7464324917672887,
2441
+ "grad_norm": 6.033330356983505,
2442
+ "learning_rate": 9.350189248676292e-07,
2443
+ "loss": 0.4144,
2444
+ "step": 340
2445
+ },
2446
+ {
2447
+ "epoch": 0.7486278814489572,
2448
+ "grad_norm": 6.205787168601971,
2449
+ "learning_rate": 9.34388443984734e-07,
2450
+ "loss": 0.4258,
2451
+ "step": 341
2452
+ },
2453
+ {
2454
+ "epoch": 0.7508232711306256,
2455
+ "grad_norm": 7.631254905556204,
2456
+ "learning_rate": 9.33755133914753e-07,
2457
+ "loss": 0.4065,
2458
+ "step": 342
2459
+ },
2460
+ {
2461
+ "epoch": 0.7530186608122942,
2462
+ "grad_norm": 6.32445637747235,
2463
+ "learning_rate": 9.331189987824568e-07,
2464
+ "loss": 0.4389,
2465
+ "step": 343
2466
+ },
2467
+ {
2468
+ "epoch": 0.7552140504939627,
2469
+ "grad_norm": 5.65488048442344,
2470
+ "learning_rate": 9.324800427310155e-07,
2471
+ "loss": 0.4363,
2472
+ "step": 344
2473
+ },
2474
+ {
2475
+ "epoch": 0.7574094401756312,
2476
+ "grad_norm": 6.166240836463136,
2477
+ "learning_rate": 9.318382699219722e-07,
2478
+ "loss": 0.4091,
2479
+ "step": 345
2480
+ },
2481
+ {
2482
+ "epoch": 0.7596048298572997,
2483
+ "grad_norm": 7.489797283268898,
2484
+ "learning_rate": 9.311936845352157e-07,
2485
+ "loss": 0.3923,
2486
+ "step": 346
2487
+ },
2488
+ {
2489
+ "epoch": 0.7618002195389681,
2490
+ "grad_norm": 7.098266537437995,
2491
+ "learning_rate": 9.305462907689532e-07,
2492
+ "loss": 0.4363,
2493
+ "step": 347
2494
+ },
2495
+ {
2496
+ "epoch": 0.7639956092206367,
2497
+ "grad_norm": 7.518882790839629,
2498
+ "learning_rate": 9.298960928396826e-07,
2499
+ "loss": 0.3989,
2500
+ "step": 348
2501
+ },
2502
+ {
2503
+ "epoch": 0.7661909989023051,
2504
+ "grad_norm": 7.535014992332507,
2505
+ "learning_rate": 9.292430949821659e-07,
2506
+ "loss": 0.3939,
2507
+ "step": 349
2508
+ },
2509
+ {
2510
+ "epoch": 0.7683863885839737,
2511
+ "grad_norm": 7.793847428801205,
2512
+ "learning_rate": 9.285873014494008e-07,
2513
+ "loss": 0.4238,
2514
+ "step": 350
2515
+ },
2516
+ {
2517
+ "epoch": 0.7683863885839737,
2518
+ "eval_accuracy": 0.78,
2519
+ "eval_loss": 0.3999713063240051,
2520
+ "eval_runtime": 52.2328,
2521
+ "eval_samples_per_second": 9.573,
2522
+ "eval_steps_per_second": 1.206,
2523
+ "step": 350
2524
+ },
2525
+ {
2526
+ "epoch": 0.7705817782656421,
2527
+ "grad_norm": 6.9078346633967405,
2528
+ "learning_rate": 9.279287165125936e-07,
2529
+ "loss": 0.3417,
2530
+ "step": 351
2531
+ },
2532
+ {
2533
+ "epoch": 0.7727771679473107,
2534
+ "grad_norm": 8.452074250696336,
2535
+ "learning_rate": 9.272673444611308e-07,
2536
+ "loss": 0.426,
2537
+ "step": 352
2538
+ },
2539
+ {
2540
+ "epoch": 0.7749725576289791,
2541
+ "grad_norm": 6.7224178173856215,
2542
+ "learning_rate": 9.266031896025516e-07,
2543
+ "loss": 0.421,
2544
+ "step": 353
2545
+ },
2546
+ {
2547
+ "epoch": 0.7771679473106476,
2548
+ "grad_norm": 8.735363673640215,
2549
+ "learning_rate": 9.259362562625199e-07,
2550
+ "loss": 0.3596,
2551
+ "step": 354
2552
+ },
2553
+ {
2554
+ "epoch": 0.7793633369923162,
2555
+ "grad_norm": 7.949892246537311,
2556
+ "learning_rate": 9.252665487847957e-07,
2557
+ "loss": 0.3922,
2558
+ "step": 355
2559
+ },
2560
+ {
2561
+ "epoch": 0.7815587266739846,
2562
+ "grad_norm": 12.194367861210461,
2563
+ "learning_rate": 9.245940715312074e-07,
2564
+ "loss": 0.4538,
2565
+ "step": 356
2566
+ },
2567
+ {
2568
+ "epoch": 0.7837541163556532,
2569
+ "grad_norm": 10.45987571002838,
2570
+ "learning_rate": 9.239188288816226e-07,
2571
+ "loss": 0.4928,
2572
+ "step": 357
2573
+ },
2574
+ {
2575
+ "epoch": 0.7859495060373216,
2576
+ "grad_norm": 6.764374108633695,
2577
+ "learning_rate": 9.232408252339201e-07,
2578
+ "loss": 0.4226,
2579
+ "step": 358
2580
+ },
2581
+ {
2582
+ "epoch": 0.7881448957189902,
2583
+ "grad_norm": 6.394843611556007,
2584
+ "learning_rate": 9.225600650039615e-07,
2585
+ "loss": 0.4096,
2586
+ "step": 359
2587
+ },
2588
+ {
2589
+ "epoch": 0.7903402854006586,
2590
+ "grad_norm": 6.971937381598142,
2591
+ "learning_rate": 9.218765526255619e-07,
2592
+ "loss": 0.4162,
2593
+ "step": 360
2594
+ },
2595
+ {
2596
+ "epoch": 0.7925356750823271,
2597
+ "grad_norm": 5.9443940673499185,
2598
+ "learning_rate": 9.211902925504613e-07,
2599
+ "loss": 0.4161,
2600
+ "step": 361
2601
+ },
2602
+ {
2603
+ "epoch": 0.7947310647639956,
2604
+ "grad_norm": 5.483471181116708,
2605
+ "learning_rate": 9.205012892482952e-07,
2606
+ "loss": 0.3627,
2607
+ "step": 362
2608
+ },
2609
+ {
2610
+ "epoch": 0.7969264544456641,
2611
+ "grad_norm": 6.649198037257637,
2612
+ "learning_rate": 9.198095472065667e-07,
2613
+ "loss": 0.445,
2614
+ "step": 363
2615
+ },
2616
+ {
2617
+ "epoch": 0.7991218441273326,
2618
+ "grad_norm": 7.566125395640005,
2619
+ "learning_rate": 9.191150709306155e-07,
2620
+ "loss": 0.4352,
2621
+ "step": 364
2622
+ },
2623
+ {
2624
+ "epoch": 0.8013172338090011,
2625
+ "grad_norm": 5.877051153754724,
2626
+ "learning_rate": 9.184178649435896e-07,
2627
+ "loss": 0.3855,
2628
+ "step": 365
2629
+ },
2630
+ {
2631
+ "epoch": 0.8035126234906695,
2632
+ "grad_norm": 6.82408167677956,
2633
+ "learning_rate": 9.177179337864163e-07,
2634
+ "loss": 0.44,
2635
+ "step": 366
2636
+ },
2637
+ {
2638
+ "epoch": 0.8057080131723381,
2639
+ "grad_norm": 5.76805550171372,
2640
+ "learning_rate": 9.170152820177714e-07,
2641
+ "loss": 0.3722,
2642
+ "step": 367
2643
+ },
2644
+ {
2645
+ "epoch": 0.8079034028540066,
2646
+ "grad_norm": 5.37322049457271,
2647
+ "learning_rate": 9.163099142140505e-07,
2648
+ "loss": 0.39,
2649
+ "step": 368
2650
+ },
2651
+ {
2652
+ "epoch": 0.8100987925356751,
2653
+ "grad_norm": 6.652734325876614,
2654
+ "learning_rate": 9.156018349693386e-07,
2655
+ "loss": 0.33,
2656
+ "step": 369
2657
+ },
2658
+ {
2659
+ "epoch": 0.8122941822173436,
2660
+ "grad_norm": 8.078887258485842,
2661
+ "learning_rate": 9.148910488953807e-07,
2662
+ "loss": 0.4495,
2663
+ "step": 370
2664
+ },
2665
+ {
2666
+ "epoch": 0.814489571899012,
2667
+ "grad_norm": 6.057501262886995,
2668
+ "learning_rate": 9.141775606215512e-07,
2669
+ "loss": 0.3793,
2670
+ "step": 371
2671
+ },
2672
+ {
2673
+ "epoch": 0.8166849615806806,
2674
+ "grad_norm": 8.349159344050063,
2675
+ "learning_rate": 9.134613747948238e-07,
2676
+ "loss": 0.4165,
2677
+ "step": 372
2678
+ },
2679
+ {
2680
+ "epoch": 0.818880351262349,
2681
+ "grad_norm": 6.093265741529057,
2682
+ "learning_rate": 9.127424960797423e-07,
2683
+ "loss": 0.3683,
2684
+ "step": 373
2685
+ },
2686
+ {
2687
+ "epoch": 0.8210757409440176,
2688
+ "grad_norm": 7.626403918604381,
2689
+ "learning_rate": 9.120209291583885e-07,
2690
+ "loss": 0.4478,
2691
+ "step": 374
2692
+ },
2693
+ {
2694
+ "epoch": 0.823271130625686,
2695
+ "grad_norm": 6.286457043534019,
2696
+ "learning_rate": 9.11296678730353e-07,
2697
+ "loss": 0.4432,
2698
+ "step": 375
2699
+ },
2700
+ {
2701
+ "epoch": 0.8254665203073546,
2702
+ "grad_norm": 6.1102728691763195,
2703
+ "learning_rate": 9.10569749512704e-07,
2704
+ "loss": 0.4408,
2705
+ "step": 376
2706
+ },
2707
+ {
2708
+ "epoch": 0.827661909989023,
2709
+ "grad_norm": 5.8435296710540765,
2710
+ "learning_rate": 9.098401462399572e-07,
2711
+ "loss": 0.3889,
2712
+ "step": 377
2713
+ },
2714
+ {
2715
+ "epoch": 0.8298572996706916,
2716
+ "grad_norm": 6.8221898822791776,
2717
+ "learning_rate": 9.091078736640438e-07,
2718
+ "loss": 0.3924,
2719
+ "step": 378
2720
+ },
2721
+ {
2722
+ "epoch": 0.8320526893523601,
2723
+ "grad_norm": 5.449250972840727,
2724
+ "learning_rate": 9.083729365542807e-07,
2725
+ "loss": 0.4544,
2726
+ "step": 379
2727
+ },
2728
+ {
2729
+ "epoch": 0.8342480790340285,
2730
+ "grad_norm": 6.531068732007626,
2731
+ "learning_rate": 9.076353396973391e-07,
2732
+ "loss": 0.3824,
2733
+ "step": 380
2734
+ },
2735
+ {
2736
+ "epoch": 0.8364434687156971,
2737
+ "grad_norm": 6.253901506150054,
2738
+ "learning_rate": 9.068950878972128e-07,
2739
+ "loss": 0.393,
2740
+ "step": 381
2741
+ },
2742
+ {
2743
+ "epoch": 0.8386388583973655,
2744
+ "grad_norm": 6.104772949983095,
2745
+ "learning_rate": 9.06152185975188e-07,
2746
+ "loss": 0.3857,
2747
+ "step": 382
2748
+ },
2749
+ {
2750
+ "epoch": 0.8408342480790341,
2751
+ "grad_norm": 5.459509353786465,
2752
+ "learning_rate": 9.054066387698103e-07,
2753
+ "loss": 0.3748,
2754
+ "step": 383
2755
+ },
2756
+ {
2757
+ "epoch": 0.8430296377607025,
2758
+ "grad_norm": 6.221629121824156,
2759
+ "learning_rate": 9.04658451136855e-07,
2760
+ "loss": 0.3869,
2761
+ "step": 384
2762
+ },
2763
+ {
2764
+ "epoch": 0.845225027442371,
2765
+ "grad_norm": 11.117787770421263,
2766
+ "learning_rate": 9.039076279492938e-07,
2767
+ "loss": 0.428,
2768
+ "step": 385
2769
+ },
2770
+ {
2771
+ "epoch": 0.8474204171240395,
2772
+ "grad_norm": 7.6402881827025695,
2773
+ "learning_rate": 9.03154174097265e-07,
2774
+ "loss": 0.3685,
2775
+ "step": 386
2776
+ },
2777
+ {
2778
+ "epoch": 0.849615806805708,
2779
+ "grad_norm": 8.329862332560323,
2780
+ "learning_rate": 9.023980944880395e-07,
2781
+ "loss": 0.3888,
2782
+ "step": 387
2783
+ },
2784
+ {
2785
+ "epoch": 0.8518111964873765,
2786
+ "grad_norm": 9.379807197876737,
2787
+ "learning_rate": 9.016393940459901e-07,
2788
+ "loss": 0.3228,
2789
+ "step": 388
2790
+ },
2791
+ {
2792
+ "epoch": 0.854006586169045,
2793
+ "grad_norm": 9.497050528776587,
2794
+ "learning_rate": 9.008780777125592e-07,
2795
+ "loss": 0.3643,
2796
+ "step": 389
2797
+ },
2798
+ {
2799
+ "epoch": 0.8562019758507134,
2800
+ "grad_norm": 11.317540635350868,
2801
+ "learning_rate": 9.001141504462267e-07,
2802
+ "loss": 0.3849,
2803
+ "step": 390
2804
+ },
2805
+ {
2806
+ "epoch": 0.858397365532382,
2807
+ "grad_norm": 9.649076333363197,
2808
+ "learning_rate": 8.993476172224776e-07,
2809
+ "loss": 0.4216,
2810
+ "step": 391
2811
+ },
2812
+ {
2813
+ "epoch": 0.8605927552140505,
2814
+ "grad_norm": 5.870968468017432,
2815
+ "learning_rate": 8.985784830337694e-07,
2816
+ "loss": 0.3512,
2817
+ "step": 392
2818
+ },
2819
+ {
2820
+ "epoch": 0.862788144895719,
2821
+ "grad_norm": 6.63746206403619,
2822
+ "learning_rate": 8.978067528895001e-07,
2823
+ "loss": 0.4034,
2824
+ "step": 393
2825
+ },
2826
+ {
2827
+ "epoch": 0.8649835345773875,
2828
+ "grad_norm": 7.9614797293095805,
2829
+ "learning_rate": 8.970324318159747e-07,
2830
+ "loss": 0.4374,
2831
+ "step": 394
2832
+ },
2833
+ {
2834
+ "epoch": 0.867178924259056,
2835
+ "grad_norm": 6.6259673033312305,
2836
+ "learning_rate": 8.962555248563737e-07,
2837
+ "loss": 0.4034,
2838
+ "step": 395
2839
+ },
2840
+ {
2841
+ "epoch": 0.8693743139407245,
2842
+ "grad_norm": 5.874087728977652,
2843
+ "learning_rate": 8.95476037070719e-07,
2844
+ "loss": 0.4381,
2845
+ "step": 396
2846
+ },
2847
+ {
2848
+ "epoch": 0.8715697036223929,
2849
+ "grad_norm": 5.374289997565401,
2850
+ "learning_rate": 8.94693973535842e-07,
2851
+ "loss": 0.4015,
2852
+ "step": 397
2853
+ },
2854
+ {
2855
+ "epoch": 0.8737650933040615,
2856
+ "grad_norm": 5.135571917512859,
2857
+ "learning_rate": 8.939093393453494e-07,
2858
+ "loss": 0.3764,
2859
+ "step": 398
2860
+ },
2861
+ {
2862
+ "epoch": 0.8759604829857299,
2863
+ "grad_norm": 6.320197335683808,
2864
+ "learning_rate": 8.931221396095914e-07,
2865
+ "loss": 0.3573,
2866
+ "step": 399
2867
+ },
2868
+ {
2869
+ "epoch": 0.8781558726673985,
2870
+ "grad_norm": 5.934124946312254,
2871
+ "learning_rate": 8.92332379455627e-07,
2872
+ "loss": 0.3985,
2873
+ "step": 400
2874
+ },
2875
+ {
2876
+ "epoch": 0.8781558726673985,
2877
+ "eval_accuracy": 0.78,
2878
+ "eval_loss": 0.3908381462097168,
2879
+ "eval_runtime": 52.2922,
2880
+ "eval_samples_per_second": 9.562,
2881
+ "eval_steps_per_second": 1.205,
2882
+ "step": 400
2883
+ },
2884
+ {
2885
+ "epoch": 0.8803512623490669,
2886
+ "grad_norm": 5.227662807462205,
2887
+ "learning_rate": 8.91540064027192e-07,
2888
+ "loss": 0.3802,
2889
+ "step": 401
2890
+ },
2891
+ {
2892
+ "epoch": 0.8825466520307355,
2893
+ "grad_norm": 5.4312238244768105,
2894
+ "learning_rate": 8.907451984846642e-07,
2895
+ "loss": 0.357,
2896
+ "step": 402
2897
+ },
2898
+ {
2899
+ "epoch": 0.884742041712404,
2900
+ "grad_norm": 6.5227729333700255,
2901
+ "learning_rate": 8.899477880050305e-07,
2902
+ "loss": 0.4599,
2903
+ "step": 403
2904
+ },
2905
+ {
2906
+ "epoch": 0.8869374313940724,
2907
+ "grad_norm": 6.320159868176597,
2908
+ "learning_rate": 8.891478377818533e-07,
2909
+ "loss": 0.3456,
2910
+ "step": 404
2911
+ },
2912
+ {
2913
+ "epoch": 0.889132821075741,
2914
+ "grad_norm": 6.595230869297455,
2915
+ "learning_rate": 8.883453530252363e-07,
2916
+ "loss": 0.3385,
2917
+ "step": 405
2918
+ },
2919
+ {
2920
+ "epoch": 0.8913282107574094,
2921
+ "grad_norm": 8.372111940632312,
2922
+ "learning_rate": 8.875403389617909e-07,
2923
+ "loss": 0.4228,
2924
+ "step": 406
2925
+ },
2926
+ {
2927
+ "epoch": 0.893523600439078,
2928
+ "grad_norm": 7.273966750571457,
2929
+ "learning_rate": 8.867328008346012e-07,
2930
+ "loss": 0.378,
2931
+ "step": 407
2932
+ },
2933
+ {
2934
+ "epoch": 0.8957189901207464,
2935
+ "grad_norm": 9.685872652254469,
2936
+ "learning_rate": 8.859227439031917e-07,
2937
+ "loss": 0.4191,
2938
+ "step": 408
2939
+ },
2940
+ {
2941
+ "epoch": 0.897914379802415,
2942
+ "grad_norm": 9.466266479949322,
2943
+ "learning_rate": 8.851101734434916e-07,
2944
+ "loss": 0.3949,
2945
+ "step": 409
2946
+ },
2947
+ {
2948
+ "epoch": 0.9001097694840834,
2949
+ "grad_norm": 8.975473353994817,
2950
+ "learning_rate": 8.842950947478001e-07,
2951
+ "loss": 0.4137,
2952
+ "step": 410
2953
+ },
2954
+ {
2955
+ "epoch": 0.9023051591657519,
2956
+ "grad_norm": 10.051513089446058,
2957
+ "learning_rate": 8.834775131247534e-07,
2958
+ "loss": 0.4055,
2959
+ "step": 411
2960
+ },
2961
+ {
2962
+ "epoch": 0.9045005488474204,
2963
+ "grad_norm": 7.417826600936023,
2964
+ "learning_rate": 8.826574338992893e-07,
2965
+ "loss": 0.4014,
2966
+ "step": 412
2967
+ },
2968
+ {
2969
+ "epoch": 0.9066959385290889,
2970
+ "grad_norm": 7.7383905505396475,
2971
+ "learning_rate": 8.818348624126122e-07,
2972
+ "loss": 0.3484,
2973
+ "step": 413
2974
+ },
2975
+ {
2976
+ "epoch": 0.9088913282107574,
2977
+ "grad_norm": 7.082975830005012,
2978
+ "learning_rate": 8.810098040221588e-07,
2979
+ "loss": 0.3709,
2980
+ "step": 414
2981
+ },
2982
+ {
2983
+ "epoch": 0.9110867178924259,
2984
+ "grad_norm": 6.727376213025829,
2985
+ "learning_rate": 8.801822641015635e-07,
2986
+ "loss": 0.3843,
2987
+ "step": 415
2988
+ },
2989
+ {
2990
+ "epoch": 0.9132821075740944,
2991
+ "grad_norm": 7.275555271868482,
2992
+ "learning_rate": 8.793522480406223e-07,
2993
+ "loss": 0.356,
2994
+ "step": 416
2995
+ },
2996
+ {
2997
+ "epoch": 0.9154774972557629,
2998
+ "grad_norm": 5.944537715719952,
2999
+ "learning_rate": 8.785197612452591e-07,
3000
+ "loss": 0.4293,
3001
+ "step": 417
3002
+ },
3003
+ {
3004
+ "epoch": 0.9176728869374314,
3005
+ "grad_norm": 6.472465668641822,
3006
+ "learning_rate": 8.776848091374892e-07,
3007
+ "loss": 0.353,
3008
+ "step": 418
3009
+ },
3010
+ {
3011
+ "epoch": 0.9198682766190999,
3012
+ "grad_norm": 7.887473362612184,
3013
+ "learning_rate": 8.768473971553847e-07,
3014
+ "loss": 0.417,
3015
+ "step": 419
3016
+ },
3017
+ {
3018
+ "epoch": 0.9220636663007684,
3019
+ "grad_norm": 5.72620013047636,
3020
+ "learning_rate": 8.760075307530392e-07,
3021
+ "loss": 0.3725,
3022
+ "step": 420
3023
+ },
3024
+ {
3025
+ "epoch": 0.9242590559824369,
3026
+ "grad_norm": 6.518816397700794,
3027
+ "learning_rate": 8.75165215400532e-07,
3028
+ "loss": 0.3819,
3029
+ "step": 421
3030
+ },
3031
+ {
3032
+ "epoch": 0.9264544456641054,
3033
+ "grad_norm": 7.2515922225795695,
3034
+ "learning_rate": 8.743204565838922e-07,
3035
+ "loss": 0.4082,
3036
+ "step": 422
3037
+ },
3038
+ {
3039
+ "epoch": 0.9286498353457738,
3040
+ "grad_norm": 6.156643476702157,
3041
+ "learning_rate": 8.734732598050636e-07,
3042
+ "loss": 0.4111,
3043
+ "step": 423
3044
+ },
3045
+ {
3046
+ "epoch": 0.9308452250274424,
3047
+ "grad_norm": 6.1131283071130635,
3048
+ "learning_rate": 8.726236305818681e-07,
3049
+ "loss": 0.3849,
3050
+ "step": 424
3051
+ },
3052
+ {
3053
+ "epoch": 0.9330406147091108,
3054
+ "grad_norm": 8.044334742394684,
3055
+ "learning_rate": 8.717715744479706e-07,
3056
+ "loss": 0.4006,
3057
+ "step": 425
3058
+ },
3059
+ {
3060
+ "epoch": 0.9352360043907794,
3061
+ "grad_norm": 6.327277854929185,
3062
+ "learning_rate": 8.709170969528425e-07,
3063
+ "loss": 0.4525,
3064
+ "step": 426
3065
+ },
3066
+ {
3067
+ "epoch": 0.9374313940724479,
3068
+ "grad_norm": 6.919003187239627,
3069
+ "learning_rate": 8.700602036617253e-07,
3070
+ "loss": 0.3976,
3071
+ "step": 427
3072
+ },
3073
+ {
3074
+ "epoch": 0.9396267837541163,
3075
+ "grad_norm": 6.750362117722639,
3076
+ "learning_rate": 8.692009001555951e-07,
3077
+ "loss": 0.4214,
3078
+ "step": 428
3079
+ },
3080
+ {
3081
+ "epoch": 0.9418221734357849,
3082
+ "grad_norm": 5.309527010131869,
3083
+ "learning_rate": 8.683391920311256e-07,
3084
+ "loss": 0.3837,
3085
+ "step": 429
3086
+ },
3087
+ {
3088
+ "epoch": 0.9440175631174533,
3089
+ "grad_norm": 5.756043416124202,
3090
+ "learning_rate": 8.674750849006518e-07,
3091
+ "loss": 0.3748,
3092
+ "step": 430
3093
+ },
3094
+ {
3095
+ "epoch": 0.9462129527991219,
3096
+ "grad_norm": 5.947118773957151,
3097
+ "learning_rate": 8.666085843921337e-07,
3098
+ "loss": 0.3656,
3099
+ "step": 431
3100
+ },
3101
+ {
3102
+ "epoch": 0.9484083424807903,
3103
+ "grad_norm": 6.803175272777888,
3104
+ "learning_rate": 8.65739696149119e-07,
3105
+ "loss": 0.3871,
3106
+ "step": 432
3107
+ },
3108
+ {
3109
+ "epoch": 0.9506037321624589,
3110
+ "grad_norm": 11.872613980622894,
3111
+ "learning_rate": 8.648684258307075e-07,
3112
+ "loss": 0.4402,
3113
+ "step": 433
3114
+ },
3115
+ {
3116
+ "epoch": 0.9527991218441273,
3117
+ "grad_norm": 11.845458798682339,
3118
+ "learning_rate": 8.639947791115131e-07,
3119
+ "loss": 0.398,
3120
+ "step": 434
3121
+ },
3122
+ {
3123
+ "epoch": 0.9549945115257958,
3124
+ "grad_norm": 7.885958732125603,
3125
+ "learning_rate": 8.631187616816271e-07,
3126
+ "loss": 0.3649,
3127
+ "step": 435
3128
+ },
3129
+ {
3130
+ "epoch": 0.9571899012074643,
3131
+ "grad_norm": 7.087463703257775,
3132
+ "learning_rate": 8.622403792465819e-07,
3133
+ "loss": 0.3938,
3134
+ "step": 436
3135
+ },
3136
+ {
3137
+ "epoch": 0.9593852908891328,
3138
+ "grad_norm": 6.677319825795207,
3139
+ "learning_rate": 8.613596375273127e-07,
3140
+ "loss": 0.379,
3141
+ "step": 437
3142
+ },
3143
+ {
3144
+ "epoch": 0.9615806805708014,
3145
+ "grad_norm": 5.657521052821894,
3146
+ "learning_rate": 8.604765422601213e-07,
3147
+ "loss": 0.3482,
3148
+ "step": 438
3149
+ },
3150
+ {
3151
+ "epoch": 0.9637760702524698,
3152
+ "grad_norm": 5.630660484188195,
3153
+ "learning_rate": 8.595910991966375e-07,
3154
+ "loss": 0.4039,
3155
+ "step": 439
3156
+ },
3157
+ {
3158
+ "epoch": 0.9659714599341384,
3159
+ "grad_norm": 6.274352234189354,
3160
+ "learning_rate": 8.587033141037833e-07,
3161
+ "loss": 0.3926,
3162
+ "step": 440
3163
+ },
3164
+ {
3165
+ "epoch": 0.9681668496158068,
3166
+ "grad_norm": 6.263591446228187,
3167
+ "learning_rate": 8.578131927637339e-07,
3168
+ "loss": 0.3528,
3169
+ "step": 441
3170
+ },
3171
+ {
3172
+ "epoch": 0.9703622392974753,
3173
+ "grad_norm": 7.115149701513277,
3174
+ "learning_rate": 8.569207409738804e-07,
3175
+ "loss": 0.3812,
3176
+ "step": 442
3177
+ },
3178
+ {
3179
+ "epoch": 0.9725576289791438,
3180
+ "grad_norm": 6.770204900870094,
3181
+ "learning_rate": 8.560259645467927e-07,
3182
+ "loss": 0.3994,
3183
+ "step": 443
3184
+ },
3185
+ {
3186
+ "epoch": 0.9747530186608123,
3187
+ "grad_norm": 6.36111561811909,
3188
+ "learning_rate": 8.551288693101808e-07,
3189
+ "loss": 0.3578,
3190
+ "step": 444
3191
+ },
3192
+ {
3193
+ "epoch": 0.9769484083424808,
3194
+ "grad_norm": 5.460698588059828,
3195
+ "learning_rate": 8.542294611068573e-07,
3196
+ "loss": 0.357,
3197
+ "step": 445
3198
+ },
3199
+ {
3200
+ "epoch": 0.9791437980241493,
3201
+ "grad_norm": 7.059061442819567,
3202
+ "learning_rate": 8.533277457946988e-07,
3203
+ "loss": 0.3943,
3204
+ "step": 446
3205
+ },
3206
+ {
3207
+ "epoch": 0.9813391877058177,
3208
+ "grad_norm": 5.455920326056064,
3209
+ "learning_rate": 8.524237292466092e-07,
3210
+ "loss": 0.3498,
3211
+ "step": 447
3212
+ },
3213
+ {
3214
+ "epoch": 0.9835345773874863,
3215
+ "grad_norm": 8.145756413409373,
3216
+ "learning_rate": 8.515174173504795e-07,
3217
+ "loss": 0.4005,
3218
+ "step": 448
3219
+ },
3220
+ {
3221
+ "epoch": 0.9857299670691547,
3222
+ "grad_norm": 6.554023882646871,
3223
+ "learning_rate": 8.506088160091506e-07,
3224
+ "loss": 0.4014,
3225
+ "step": 449
3226
+ },
3227
+ {
3228
+ "epoch": 0.9879253567508233,
3229
+ "grad_norm": 7.592409507167033,
3230
+ "learning_rate": 8.49697931140375e-07,
3231
+ "loss": 0.3976,
3232
+ "step": 450
3233
+ },
3234
+ {
3235
+ "epoch": 0.9879253567508233,
3236
+ "eval_accuracy": 0.776,
3237
+ "eval_loss": 0.38674676418304443,
3238
+ "eval_runtime": 52.2498,
3239
+ "eval_samples_per_second": 9.569,
3240
+ "eval_steps_per_second": 1.206,
3241
+ "step": 450
3242
+ },
3243
+ {
3244
+ "epoch": 0.9901207464324918,
3245
+ "grad_norm": 6.445935731601792,
3246
+ "learning_rate": 8.487847686767771e-07,
3247
+ "loss": 0.3376,
3248
+ "step": 451
3249
+ },
3250
+ {
3251
+ "epoch": 0.9923161361141603,
3252
+ "grad_norm": 7.661577003586348,
3253
+ "learning_rate": 8.478693345658165e-07,
3254
+ "loss": 0.3673,
3255
+ "step": 452
3256
+ },
3257
+ {
3258
+ "epoch": 0.9945115257958288,
3259
+ "grad_norm": 6.07812754087199,
3260
+ "learning_rate": 8.469516347697472e-07,
3261
+ "loss": 0.3901,
3262
+ "step": 453
3263
+ },
3264
+ {
3265
+ "epoch": 0.9967069154774972,
3266
+ "grad_norm": 6.985188252913582,
3267
+ "learning_rate": 8.460316752655798e-07,
3268
+ "loss": 0.3532,
3269
+ "step": 454
3270
+ },
3271
+ {
3272
+ "epoch": 0.9989023051591658,
3273
+ "grad_norm": 6.960412464204759,
3274
+ "learning_rate": 8.451094620450431e-07,
3275
+ "loss": 0.3584,
3276
+ "step": 455
3277
+ },
3278
+ {
3279
+ "epoch": 1.0,
3280
+ "grad_norm": 6.960412464204759,
3281
+ "learning_rate": 8.441850011145435e-07,
3282
+ "loss": 0.3955,
3283
+ "step": 456
3284
+ },
3285
+ {
3286
+ "epoch": 1.0021953896816684,
3287
+ "grad_norm": 10.21658360278486,
3288
+ "learning_rate": 8.432582984951276e-07,
3289
+ "loss": 0.3445,
3290
+ "step": 457
3291
+ },
3292
+ {
3293
+ "epoch": 1.004390779363337,
3294
+ "grad_norm": 5.546577685721332,
3295
+ "learning_rate": 8.423293602224417e-07,
3296
+ "loss": 0.4039,
3297
+ "step": 458
3298
+ },
3299
+ {
3300
+ "epoch": 1.0065861690450055,
3301
+ "grad_norm": 5.367108573957618,
3302
+ "learning_rate": 8.413981923466932e-07,
3303
+ "loss": 0.4015,
3304
+ "step": 459
3305
+ },
3306
+ {
3307
+ "epoch": 1.008781558726674,
3308
+ "grad_norm": 7.677188675606839,
3309
+ "learning_rate": 8.404648009326111e-07,
3310
+ "loss": 0.4446,
3311
+ "step": 460
3312
+ },
3313
+ {
3314
+ "epoch": 1.0109769484083424,
3315
+ "grad_norm": 5.110493371628498,
3316
+ "learning_rate": 8.395291920594061e-07,
3317
+ "loss": 0.3263,
3318
+ "step": 461
3319
+ },
3320
+ {
3321
+ "epoch": 1.013172338090011,
3322
+ "grad_norm": 5.952324335313491,
3323
+ "learning_rate": 8.385913718207313e-07,
3324
+ "loss": 0.3865,
3325
+ "step": 462
3326
+ },
3327
+ {
3328
+ "epoch": 1.0153677277716795,
3329
+ "grad_norm": 6.4399342076113495,
3330
+ "learning_rate": 8.376513463246429e-07,
3331
+ "loss": 0.3821,
3332
+ "step": 463
3333
+ },
3334
+ {
3335
+ "epoch": 1.017563117453348,
3336
+ "grad_norm": 5.698634912941117,
3337
+ "learning_rate": 8.367091216935596e-07,
3338
+ "loss": 0.4065,
3339
+ "step": 464
3340
+ },
3341
+ {
3342
+ "epoch": 1.0197585071350164,
3343
+ "grad_norm": 6.689636508179602,
3344
+ "learning_rate": 8.357647040642231e-07,
3345
+ "loss": 0.3466,
3346
+ "step": 465
3347
+ },
3348
+ {
3349
+ "epoch": 1.021953896816685,
3350
+ "grad_norm": 5.443917132674302,
3351
+ "learning_rate": 8.348180995876587e-07,
3352
+ "loss": 0.3785,
3353
+ "step": 466
3354
+ },
3355
+ {
3356
+ "epoch": 1.0241492864983535,
3357
+ "grad_norm": 5.443111782189951,
3358
+ "learning_rate": 8.338693144291342e-07,
3359
+ "loss": 0.3985,
3360
+ "step": 467
3361
+ },
3362
+ {
3363
+ "epoch": 1.026344676180022,
3364
+ "grad_norm": 5.887332270970953,
3365
+ "learning_rate": 8.329183547681205e-07,
3366
+ "loss": 0.3742,
3367
+ "step": 468
3368
+ },
3369
+ {
3370
+ "epoch": 1.0285400658616906,
3371
+ "grad_norm": 6.187509225148951,
3372
+ "learning_rate": 8.319652267982508e-07,
3373
+ "loss": 0.3716,
3374
+ "step": 469
3375
+ },
3376
+ {
3377
+ "epoch": 1.030735455543359,
3378
+ "grad_norm": 6.121985990165869,
3379
+ "learning_rate": 8.310099367272812e-07,
3380
+ "loss": 0.429,
3381
+ "step": 470
3382
+ },
3383
+ {
3384
+ "epoch": 1.0329308452250274,
3385
+ "grad_norm": 11.523110274065871,
3386
+ "learning_rate": 8.30052490777049e-07,
3387
+ "loss": 0.4404,
3388
+ "step": 471
3389
+ },
3390
+ {
3391
+ "epoch": 1.0351262349066959,
3392
+ "grad_norm": 5.907917517927115,
3393
+ "learning_rate": 8.29092895183433e-07,
3394
+ "loss": 0.3681,
3395
+ "step": 472
3396
+ },
3397
+ {
3398
+ "epoch": 1.0373216245883645,
3399
+ "grad_norm": 5.901926465788518,
3400
+ "learning_rate": 8.281311561963129e-07,
3401
+ "loss": 0.3975,
3402
+ "step": 473
3403
+ },
3404
+ {
3405
+ "epoch": 1.039517014270033,
3406
+ "grad_norm": 5.906068428010956,
3407
+ "learning_rate": 8.271672800795284e-07,
3408
+ "loss": 0.3665,
3409
+ "step": 474
3410
+ },
3411
+ {
3412
+ "epoch": 1.0417124039517014,
3413
+ "grad_norm": 5.294577975700778,
3414
+ "learning_rate": 8.26201273110838e-07,
3415
+ "loss": 0.4487,
3416
+ "step": 475
3417
+ },
3418
+ {
3419
+ "epoch": 1.0439077936333698,
3420
+ "grad_norm": 5.978237084104336,
3421
+ "learning_rate": 8.252331415818788e-07,
3422
+ "loss": 0.333,
3423
+ "step": 476
3424
+ },
3425
+ {
3426
+ "epoch": 1.0461031833150385,
3427
+ "grad_norm": 5.498099104199104,
3428
+ "learning_rate": 8.242628917981253e-07,
3429
+ "loss": 0.3685,
3430
+ "step": 477
3431
+ },
3432
+ {
3433
+ "epoch": 1.048298572996707,
3434
+ "grad_norm": 6.789539587285106,
3435
+ "learning_rate": 8.232905300788484e-07,
3436
+ "loss": 0.3657,
3437
+ "step": 478
3438
+ },
3439
+ {
3440
+ "epoch": 1.0504939626783754,
3441
+ "grad_norm": 7.957958255256437,
3442
+ "learning_rate": 8.223160627570736e-07,
3443
+ "loss": 0.3326,
3444
+ "step": 479
3445
+ },
3446
+ {
3447
+ "epoch": 1.0526893523600438,
3448
+ "grad_norm": 5.600567494048053,
3449
+ "learning_rate": 8.213394961795406e-07,
3450
+ "loss": 0.3681,
3451
+ "step": 480
3452
+ },
3453
+ {
3454
+ "epoch": 1.0548847420417125,
3455
+ "grad_norm": 7.539054348927877,
3456
+ "learning_rate": 8.203608367066615e-07,
3457
+ "loss": 0.4324,
3458
+ "step": 481
3459
+ },
3460
+ {
3461
+ "epoch": 1.057080131723381,
3462
+ "grad_norm": 8.903288366040119,
3463
+ "learning_rate": 8.193800907124798e-07,
3464
+ "loss": 0.4113,
3465
+ "step": 482
3466
+ },
3467
+ {
3468
+ "epoch": 1.0592755214050493,
3469
+ "grad_norm": 8.111853302663592,
3470
+ "learning_rate": 8.183972645846282e-07,
3471
+ "loss": 0.4387,
3472
+ "step": 483
3473
+ },
3474
+ {
3475
+ "epoch": 1.061470911086718,
3476
+ "grad_norm": 9.450203257580068,
3477
+ "learning_rate": 8.174123647242877e-07,
3478
+ "loss": 0.3703,
3479
+ "step": 484
3480
+ },
3481
+ {
3482
+ "epoch": 1.0636663007683864,
3483
+ "grad_norm": 7.7233909149018025,
3484
+ "learning_rate": 8.164253975461453e-07,
3485
+ "loss": 0.3967,
3486
+ "step": 485
3487
+ },
3488
+ {
3489
+ "epoch": 1.0658616904500549,
3490
+ "grad_norm": 6.1052811281532176,
3491
+ "learning_rate": 8.154363694783526e-07,
3492
+ "loss": 0.3528,
3493
+ "step": 486
3494
+ },
3495
+ {
3496
+ "epoch": 1.0680570801317233,
3497
+ "grad_norm": 6.985081806859129,
3498
+ "learning_rate": 8.14445286962484e-07,
3499
+ "loss": 0.3402,
3500
+ "step": 487
3501
+ },
3502
+ {
3503
+ "epoch": 1.070252469813392,
3504
+ "grad_norm": 6.737047071565736,
3505
+ "learning_rate": 8.134521564534947e-07,
3506
+ "loss": 0.3484,
3507
+ "step": 488
3508
+ },
3509
+ {
3510
+ "epoch": 1.0724478594950604,
3511
+ "grad_norm": 8.024277201928818,
3512
+ "learning_rate": 8.124569844196779e-07,
3513
+ "loss": 0.4149,
3514
+ "step": 489
3515
+ },
3516
+ {
3517
+ "epoch": 1.0746432491767288,
3518
+ "grad_norm": 6.684001008171824,
3519
+ "learning_rate": 8.11459777342624e-07,
3520
+ "loss": 0.3545,
3521
+ "step": 490
3522
+ },
3523
+ {
3524
+ "epoch": 1.0768386388583973,
3525
+ "grad_norm": 8.174576979591736,
3526
+ "learning_rate": 8.104605417171776e-07,
3527
+ "loss": 0.4058,
3528
+ "step": 491
3529
+ },
3530
+ {
3531
+ "epoch": 1.079034028540066,
3532
+ "grad_norm": 6.567306113947398,
3533
+ "learning_rate": 8.094592840513949e-07,
3534
+ "loss": 0.3559,
3535
+ "step": 492
3536
+ },
3537
+ {
3538
+ "epoch": 1.0812294182217344,
3539
+ "grad_norm": 5.861714140854109,
3540
+ "learning_rate": 8.084560108665023e-07,
3541
+ "loss": 0.3705,
3542
+ "step": 493
3543
+ },
3544
+ {
3545
+ "epoch": 1.0834248079034028,
3546
+ "grad_norm": 5.948783706558882,
3547
+ "learning_rate": 8.074507286968528e-07,
3548
+ "loss": 0.3855,
3549
+ "step": 494
3550
+ },
3551
+ {
3552
+ "epoch": 1.0856201975850714,
3553
+ "grad_norm": 9.048770250869833,
3554
+ "learning_rate": 8.064434440898844e-07,
3555
+ "loss": 0.3838,
3556
+ "step": 495
3557
+ },
3558
+ {
3559
+ "epoch": 1.0878155872667399,
3560
+ "grad_norm": 6.4254457902019375,
3561
+ "learning_rate": 8.054341636060766e-07,
3562
+ "loss": 0.3616,
3563
+ "step": 496
3564
+ },
3565
+ {
3566
+ "epoch": 1.0900109769484083,
3567
+ "grad_norm": 8.363722938696041,
3568
+ "learning_rate": 8.044228938189088e-07,
3569
+ "loss": 0.4117,
3570
+ "step": 497
3571
+ },
3572
+ {
3573
+ "epoch": 1.0922063666300768,
3574
+ "grad_norm": 8.80748939543165,
3575
+ "learning_rate": 8.034096413148161e-07,
3576
+ "loss": 0.3498,
3577
+ "step": 498
3578
+ },
3579
+ {
3580
+ "epoch": 1.0944017563117454,
3581
+ "grad_norm": 6.142541366225709,
3582
+ "learning_rate": 8.023944126931475e-07,
3583
+ "loss": 0.3959,
3584
+ "step": 499
3585
+ },
3586
+ {
3587
+ "epoch": 1.0965971459934138,
3588
+ "grad_norm": 6.217108697055721,
3589
+ "learning_rate": 8.013772145661224e-07,
3590
+ "loss": 0.4058,
3591
+ "step": 500
3592
+ },
3593
+ {
3594
+ "epoch": 1.0965971459934138,
3595
+ "eval_accuracy": 0.79,
3596
+ "eval_loss": 0.3937423825263977,
3597
+ "eval_runtime": 52.5967,
3598
+ "eval_samples_per_second": 9.506,
3599
+ "eval_steps_per_second": 1.198,
3600
+ "step": 500
3601
+ }
3602
+ ],
3603
+ "logging_steps": 1,
3604
+ "max_steps": 1368,
3605
+ "num_input_tokens_seen": 0,
3606
+ "num_train_epochs": 3,
3607
+ "save_steps": 100,
3608
+ "stateful_callbacks": {
3609
+ "TrainerControl": {
3610
+ "args": {
3611
+ "should_epoch_stop": false,
3612
+ "should_evaluate": false,
3613
+ "should_log": false,
3614
+ "should_save": true,
3615
+ "should_training_stop": false
3616
+ },
3617
+ "attributes": {}
3618
+ }
3619
+ },
3620
+ "total_flos": 0.0,
3621
+ "train_batch_size": 1,
3622
+ "trial_name": null,
3623
+ "trial_params": null
3624
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5c0681051523291f916cefea578dc89dc4918860d2302a4a70b765ee3cb7422
3
+ size 7928
value_head.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:121ff2e2cf4764abfc20a0b438bd2743e9260920915942433531206e55ed9eae
3
+ size 5610
video_preprocessor_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": null,
3
+ "data_format": "channels_first",
4
+ "default_to_square": true,
5
+ "device": null,
6
+ "do_center_crop": null,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_pad": null,
10
+ "do_rescale": true,
11
+ "do_resize": true,
12
+ "do_sample_frames": false,
13
+ "fps": null,
14
+ "image_mean": [
15
+ 0.48145466,
16
+ 0.4578275,
17
+ 0.40821073
18
+ ],
19
+ "image_std": [
20
+ 0.26862954,
21
+ 0.26130258,
22
+ 0.27577711
23
+ ],
24
+ "input_data_format": null,
25
+ "max_frames": 768,
26
+ "max_pixels": 12845056,
27
+ "merge_size": 2,
28
+ "min_frames": 4,
29
+ "min_pixels": 3136,
30
+ "num_frames": null,
31
+ "patch_size": 14,
32
+ "processor_class": "Qwen2_5_VLProcessor",
33
+ "resample": 3,
34
+ "rescale_factor": 0.00392156862745098,
35
+ "size": {
36
+ "longest_edge": 12845056,
37
+ "shortest_edge": 3136
38
+ },
39
+ "size_divisor": null,
40
+ "temporal_patch_size": 2,
41
+ "video_metadata": null,
42
+ "video_processor_type": "Qwen2VLVideoProcessor"
43
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff