Robin Ding committed on
Commit
e9755c0
·
1 Parent(s): d8b12cb

add initial model files

Browse files
config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "deepseek-ai/deepseek-coder-6.7b-base",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 32013,
9
+ "eos_token_id": 32014,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 4096,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 11008,
14
+ "max_position_embeddings": 16384,
15
+ "model_type": "llama",
16
+ "num_attention_heads": 32,
17
+ "num_hidden_layers": 32,
18
+ "num_key_value_heads": 32,
19
+ "pretraining_tp": 1,
20
+ "rms_norm_eps": 1e-06,
21
+ "rope_scaling": {
22
+ "factor": 4.0,
23
+ "type": "linear"
24
+ },
25
+ "rope_theta": 100000,
26
+ "tie_word_embeddings": false,
27
+ "torch_dtype": "float16",
28
+ "transformers_version": "4.36.2",
29
+ "use_cache": true,
30
+ "vocab_size": 32256
31
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 32013,
4
+ "eos_token_id": 32014,
5
+ "transformers_version": "4.36.2"
6
+ }
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f771e094d62502a9071a536d9d65c12467d3e1bf0ed97d1864980e67e01d3812
3
+ size 4941082400
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d11412b26ce3650e29dd114120e5cbf0fc3db54a10c79a7c899249da73bef0ad
3
+ size 4947390768
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:757cac5e8097609c7c6343b21d78c0b93b28b4da88c882143139b902e3fcae54
3
+ size 3592585888
model.safetensors.index.json ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 13481025536
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00003-of-00003.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
13
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
14
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
15
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
16
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
17
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
18
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
19
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
20
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
21
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
22
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
23
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
24
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
25
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
26
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
27
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
28
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
29
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
30
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
31
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
32
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
33
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
34
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
35
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
36
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
37
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
38
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
39
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
40
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
41
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
42
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
43
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
44
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
45
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
46
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
47
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
48
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
49
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
50
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
51
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
52
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
53
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
54
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
55
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
56
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
57
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
58
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
59
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
60
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
61
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
62
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
63
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
64
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
65
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
66
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
67
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
68
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
69
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
70
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
71
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
72
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
73
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
74
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
75
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
76
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
77
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
78
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
79
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
80
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
81
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
82
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
83
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
84
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
85
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
86
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
87
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
88
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
89
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
90
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
91
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
92
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
93
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
94
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
95
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
96
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
97
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
98
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
99
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
100
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
101
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
102
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
103
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
104
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
105
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
106
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
107
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
108
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
109
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
110
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
111
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
112
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
113
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
114
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
115
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
116
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
117
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
118
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
119
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
120
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
121
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
122
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
123
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
124
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
125
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
126
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
127
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
128
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
129
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
130
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
131
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
132
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
133
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
134
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
135
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
136
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
137
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
138
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
139
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
140
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
141
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
142
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
143
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
144
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
145
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
146
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
147
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
148
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
149
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
150
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
151
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
152
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors",
153
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
154
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
155
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
156
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
157
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
158
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
159
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
160
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
161
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
162
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
163
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
164
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
165
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
166
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
167
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
168
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
169
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
170
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
171
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
172
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
173
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
174
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
175
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
176
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
177
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
178
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
179
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
180
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
181
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
182
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
183
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
184
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
185
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
186
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
187
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
188
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
189
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
190
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
191
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
192
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
193
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
194
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
195
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
196
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
197
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
198
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
199
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
200
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
201
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
202
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
203
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
204
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
205
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
206
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
207
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
208
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
209
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
210
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
211
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
212
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
213
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
214
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
215
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
216
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
217
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
218
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
219
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
220
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
221
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
222
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
223
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
224
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
225
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
226
+ "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
227
+ "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
228
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
229
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
230
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
231
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
232
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
233
+ "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
234
+ "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
235
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
236
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
237
+ "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
238
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
239
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
240
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
241
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
242
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
243
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
244
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
245
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
246
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
247
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
248
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
249
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
250
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
251
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
252
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
253
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
254
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
255
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
256
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
257
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
258
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
259
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
260
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
261
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
262
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
263
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
264
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
265
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
266
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
267
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
268
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
269
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
270
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
271
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
272
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
273
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
274
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
275
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
276
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
277
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
278
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
279
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
280
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
281
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
282
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
283
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
284
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
285
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
286
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
287
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
288
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
289
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
290
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
291
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
292
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
293
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
294
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
295
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
296
+ "model.norm.weight": "model-00003-of-00003.safetensors"
297
+ }
298
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|begin▁of▁sentence|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|end▁of▁sentence|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|end▁of▁sentence|>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "32000": {
6
+ "content": "õ",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": false
12
+ },
13
+ "32001": {
14
+ "content": "÷",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": false
20
+ },
21
+ "32002": {
22
+ "content": "Á",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": false
28
+ },
29
+ "32003": {
30
+ "content": "ý",
31
+ "lstrip": false,
32
+ "normalized": true,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": false
36
+ },
37
+ "32004": {
38
+ "content": "À",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": false
44
+ },
45
+ "32005": {
46
+ "content": "ÿ",
47
+ "lstrip": false,
48
+ "normalized": true,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": false
52
+ },
53
+ "32006": {
54
+ "content": "ø",
55
+ "lstrip": false,
56
+ "normalized": true,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": false
60
+ },
61
+ "32007": {
62
+ "content": "ú",
63
+ "lstrip": false,
64
+ "normalized": true,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": false
68
+ },
69
+ "32008": {
70
+ "content": "þ",
71
+ "lstrip": false,
72
+ "normalized": true,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": false
76
+ },
77
+ "32009": {
78
+ "content": "ü",
79
+ "lstrip": false,
80
+ "normalized": true,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": false
84
+ },
85
+ "32010": {
86
+ "content": "ù",
87
+ "lstrip": false,
88
+ "normalized": true,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": false
92
+ },
93
+ "32011": {
94
+ "content": "ö",
95
+ "lstrip": false,
96
+ "normalized": true,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": false
100
+ },
101
+ "32012": {
102
+ "content": "û",
103
+ "lstrip": false,
104
+ "normalized": true,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": false
108
+ },
109
+ "32013": {
110
+ "content": "<|begin▁of▁sentence|>",
111
+ "lstrip": false,
112
+ "normalized": true,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "32014": {
118
+ "content": "<|end▁of▁sentence|>",
119
+ "lstrip": false,
120
+ "normalized": true,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": true
124
+ },
125
+ "32015": {
126
+ "content": "<|fim▁hole|>",
127
+ "lstrip": false,
128
+ "normalized": true,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "32016": {
134
+ "content": "<|fim▁begin|>",
135
+ "lstrip": false,
136
+ "normalized": true,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "32017": {
142
+ "content": "<|fim▁end|>",
143
+ "lstrip": false,
144
+ "normalized": true,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "32018": {
150
+ "content": "<pad>",
151
+ "lstrip": false,
152
+ "normalized": true,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "32019": {
158
+ "content": "<|User|>",
159
+ "lstrip": false,
160
+ "normalized": true,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "32020": {
166
+ "content": "<|Assistant|>",
167
+ "lstrip": false,
168
+ "normalized": true,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "32021": {
174
+ "content": "<|EOT|>",
175
+ "lstrip": false,
176
+ "normalized": true,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "bos_token": "<|begin▁of▁sentence|>",
183
+ "clean_up_tokenization_spaces": false,
184
+ "eos_token": "<|end▁of▁sentence|>",
185
+ "legacy": true,
186
+ "model_max_length": 16384,
187
+ "pad_token": "<|end▁of▁sentence|>",
188
+ "sp_model_kwargs": {},
189
+ "tokenizer_class": "LlamaTokenizer",
190
+ "unk_token": null,
191
+ "use_default_system_prompt": false
192
+ }
trainer_state.json ADDED
@@ -0,0 +1,3386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.9979409746053536,
5
+ "eval_steps": 100,
6
+ "global_step": 1092,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.01,
13
+ "learning_rate": 0.0,
14
+ "loss": 0.7252,
15
+ "step": 2
16
+ },
17
+ {
18
+ "epoch": 0.01,
19
+ "learning_rate": 0.0,
20
+ "loss": 0.7311,
21
+ "step": 4
22
+ },
23
+ {
24
+ "epoch": 0.02,
25
+ "learning_rate": 0.0,
26
+ "loss": 0.7542,
27
+ "step": 6
28
+ },
29
+ {
30
+ "epoch": 0.02,
31
+ "learning_rate": 0.0,
32
+ "loss": 0.7189,
33
+ "step": 8
34
+ },
35
+ {
36
+ "epoch": 0.03,
37
+ "learning_rate": 0.0,
38
+ "loss": 0.7314,
39
+ "step": 10
40
+ },
41
+ {
42
+ "epoch": 0.03,
43
+ "learning_rate": 0.0,
44
+ "loss": 0.7629,
45
+ "step": 12
46
+ },
47
+ {
48
+ "epoch": 0.04,
49
+ "learning_rate": 0.0,
50
+ "loss": 0.7358,
51
+ "step": 14
52
+ },
53
+ {
54
+ "epoch": 0.04,
55
+ "learning_rate": 1.5384615384615387e-06,
56
+ "loss": 0.7348,
57
+ "step": 16
58
+ },
59
+ {
60
+ "epoch": 0.05,
61
+ "learning_rate": 2.307692307692308e-06,
62
+ "loss": 0.7438,
63
+ "step": 18
64
+ },
65
+ {
66
+ "epoch": 0.05,
67
+ "learning_rate": 3.846153846153847e-06,
68
+ "loss": 0.7021,
69
+ "step": 20
70
+ },
71
+ {
72
+ "epoch": 0.06,
73
+ "learning_rate": 5.3846153846153855e-06,
74
+ "loss": 0.684,
75
+ "step": 22
76
+ },
77
+ {
78
+ "epoch": 0.07,
79
+ "learning_rate": 6.923076923076923e-06,
80
+ "loss": 0.6652,
81
+ "step": 24
82
+ },
83
+ {
84
+ "epoch": 0.07,
85
+ "learning_rate": 8.461538461538462e-06,
86
+ "loss": 0.6302,
87
+ "step": 26
88
+ },
89
+ {
90
+ "epoch": 0.08,
91
+ "learning_rate": 1e-05,
92
+ "loss": 0.6212,
93
+ "step": 28
94
+ },
95
+ {
96
+ "epoch": 0.08,
97
+ "learning_rate": 1.153846153846154e-05,
98
+ "loss": 0.5972,
99
+ "step": 30
100
+ },
101
+ {
102
+ "epoch": 0.09,
103
+ "learning_rate": 1.3076923076923078e-05,
104
+ "loss": 0.6068,
105
+ "step": 32
106
+ },
107
+ {
108
+ "epoch": 0.09,
109
+ "learning_rate": 1.4615384615384617e-05,
110
+ "loss": 0.5911,
111
+ "step": 34
112
+ },
113
+ {
114
+ "epoch": 0.1,
115
+ "learning_rate": 1.6153846153846154e-05,
116
+ "loss": 0.6242,
117
+ "step": 36
118
+ },
119
+ {
120
+ "epoch": 0.1,
121
+ "learning_rate": 1.7692307692307694e-05,
122
+ "loss": 0.5971,
123
+ "step": 38
124
+ },
125
+ {
126
+ "epoch": 0.11,
127
+ "learning_rate": 1.923076923076923e-05,
128
+ "loss": 0.6028,
129
+ "step": 40
130
+ },
131
+ {
132
+ "epoch": 0.12,
133
+ "learning_rate": 2.0769230769230772e-05,
134
+ "loss": 0.5916,
135
+ "step": 42
136
+ },
137
+ {
138
+ "epoch": 0.12,
139
+ "learning_rate": 2.230769230769231e-05,
140
+ "loss": 0.5942,
141
+ "step": 44
142
+ },
143
+ {
144
+ "epoch": 0.13,
145
+ "learning_rate": 2.384615384615385e-05,
146
+ "loss": 0.5751,
147
+ "step": 46
148
+ },
149
+ {
150
+ "epoch": 0.13,
151
+ "learning_rate": 2.5384615384615383e-05,
152
+ "loss": 0.5544,
153
+ "step": 48
154
+ },
155
+ {
156
+ "epoch": 0.14,
157
+ "learning_rate": 2.6923076923076923e-05,
158
+ "loss": 0.581,
159
+ "step": 50
160
+ },
161
+ {
162
+ "epoch": 0.14,
163
+ "learning_rate": 2.846153846153846e-05,
164
+ "loss": 0.5676,
165
+ "step": 52
166
+ },
167
+ {
168
+ "epoch": 0.15,
169
+ "learning_rate": 3e-05,
170
+ "loss": 0.5969,
171
+ "step": 54
172
+ },
173
+ {
174
+ "epoch": 0.15,
175
+ "learning_rate": 3.153846153846154e-05,
176
+ "loss": 0.5501,
177
+ "step": 56
178
+ },
179
+ {
180
+ "epoch": 0.16,
181
+ "learning_rate": 3.307692307692308e-05,
182
+ "loss": 0.6018,
183
+ "step": 58
184
+ },
185
+ {
186
+ "epoch": 0.16,
187
+ "learning_rate": 3.461538461538462e-05,
188
+ "loss": 0.5452,
189
+ "step": 60
190
+ },
191
+ {
192
+ "epoch": 0.17,
193
+ "learning_rate": 3.615384615384615e-05,
194
+ "loss": 0.5271,
195
+ "step": 62
196
+ },
197
+ {
198
+ "epoch": 0.18,
199
+ "learning_rate": 3.769230769230769e-05,
200
+ "loss": 0.5893,
201
+ "step": 64
202
+ },
203
+ {
204
+ "epoch": 0.18,
205
+ "learning_rate": 3.923076923076923e-05,
206
+ "loss": 0.59,
207
+ "step": 66
208
+ },
209
+ {
210
+ "epoch": 0.19,
211
+ "learning_rate": 4.0769230769230773e-05,
212
+ "loss": 0.5464,
213
+ "step": 68
214
+ },
215
+ {
216
+ "epoch": 0.19,
217
+ "learning_rate": 4.230769230769231e-05,
218
+ "loss": 0.5732,
219
+ "step": 70
220
+ },
221
+ {
222
+ "epoch": 0.2,
223
+ "learning_rate": 4.384615384615385e-05,
224
+ "loss": 0.556,
225
+ "step": 72
226
+ },
227
+ {
228
+ "epoch": 0.2,
229
+ "learning_rate": 4.538461538461539e-05,
230
+ "loss": 0.5391,
231
+ "step": 74
232
+ },
233
+ {
234
+ "epoch": 0.21,
235
+ "learning_rate": 4.692307692307693e-05,
236
+ "loss": 0.5504,
237
+ "step": 76
238
+ },
239
+ {
240
+ "epoch": 0.21,
241
+ "learning_rate": 4.846153846153846e-05,
242
+ "loss": 0.5403,
243
+ "step": 78
244
+ },
245
+ {
246
+ "epoch": 0.22,
247
+ "learning_rate": 5e-05,
248
+ "loss": 0.5598,
249
+ "step": 80
250
+ },
251
+ {
252
+ "epoch": 0.23,
253
+ "learning_rate": 4.999953212751255e-05,
254
+ "loss": 0.5541,
255
+ "step": 82
256
+ },
257
+ {
258
+ "epoch": 0.23,
259
+ "learning_rate": 4.999812852756259e-05,
260
+ "loss": 0.5621,
261
+ "step": 84
262
+ },
263
+ {
264
+ "epoch": 0.24,
265
+ "learning_rate": 4.999578925268656e-05,
266
+ "loss": 0.5448,
267
+ "step": 86
268
+ },
269
+ {
270
+ "epoch": 0.24,
271
+ "learning_rate": 4.999251439044307e-05,
272
+ "loss": 0.5271,
273
+ "step": 88
274
+ },
275
+ {
276
+ "epoch": 0.25,
277
+ "learning_rate": 4.998830406340954e-05,
278
+ "loss": 0.5689,
279
+ "step": 90
280
+ },
281
+ {
282
+ "epoch": 0.25,
283
+ "learning_rate": 4.998315842917767e-05,
284
+ "loss": 0.5448,
285
+ "step": 92
286
+ },
287
+ {
288
+ "epoch": 0.26,
289
+ "learning_rate": 4.997707768034752e-05,
290
+ "loss": 0.5657,
291
+ "step": 94
292
+ },
293
+ {
294
+ "epoch": 0.26,
295
+ "learning_rate": 4.997006204452029e-05,
296
+ "loss": 0.5593,
297
+ "step": 96
298
+ },
299
+ {
300
+ "epoch": 0.27,
301
+ "learning_rate": 4.996211178428982e-05,
302
+ "loss": 0.586,
303
+ "step": 98
304
+ },
305
+ {
306
+ "epoch": 0.27,
307
+ "learning_rate": 4.9953227197232755e-05,
308
+ "loss": 0.5443,
309
+ "step": 100
310
+ },
311
+ {
312
+ "epoch": 0.27,
313
+ "eval_loss": 0.5578761696815491,
314
+ "eval_runtime": 405.967,
315
+ "eval_samples_per_second": 24.174,
316
+ "eval_steps_per_second": 0.19,
317
+ "step": 100
318
+ },
319
+ {
320
+ "epoch": 0.28,
321
+ "learning_rate": 4.9943408615897404e-05,
322
+ "loss": 0.5368,
323
+ "step": 102
324
+ },
325
+ {
326
+ "epoch": 0.29,
327
+ "learning_rate": 4.993265640779129e-05,
328
+ "loss": 0.5516,
329
+ "step": 104
330
+ },
331
+ {
332
+ "epoch": 0.29,
333
+ "learning_rate": 4.99209709753674e-05,
334
+ "loss": 0.5334,
335
+ "step": 106
336
+ },
337
+ {
338
+ "epoch": 0.3,
339
+ "learning_rate": 4.990835275600913e-05,
340
+ "loss": 0.5467,
341
+ "step": 108
342
+ },
343
+ {
344
+ "epoch": 0.3,
345
+ "learning_rate": 4.989480222201387e-05,
346
+ "loss": 0.5644,
347
+ "step": 110
348
+ },
349
+ {
350
+ "epoch": 0.31,
351
+ "learning_rate": 4.988031988057541e-05,
352
+ "loss": 0.5657,
353
+ "step": 112
354
+ },
355
+ {
356
+ "epoch": 0.31,
357
+ "learning_rate": 4.9864906273764864e-05,
358
+ "loss": 0.5275,
359
+ "step": 114
360
+ },
361
+ {
362
+ "epoch": 0.32,
363
+ "learning_rate": 4.984856197851045e-05,
364
+ "loss": 0.5513,
365
+ "step": 116
366
+ },
367
+ {
368
+ "epoch": 0.32,
369
+ "learning_rate": 4.983128760657584e-05,
370
+ "loss": 0.5472,
371
+ "step": 118
372
+ },
373
+ {
374
+ "epoch": 0.33,
375
+ "learning_rate": 4.981308380453732e-05,
376
+ "loss": 0.5543,
377
+ "step": 120
378
+ },
379
+ {
380
+ "epoch": 0.33,
381
+ "learning_rate": 4.979395125375952e-05,
382
+ "loss": 0.5625,
383
+ "step": 122
384
+ },
385
+ {
386
+ "epoch": 0.34,
387
+ "learning_rate": 4.977389067036998e-05,
388
+ "loss": 0.5687,
389
+ "step": 124
390
+ },
391
+ {
392
+ "epoch": 0.35,
393
+ "learning_rate": 4.9752902805232306e-05,
394
+ "loss": 0.5779,
395
+ "step": 126
396
+ },
397
+ {
398
+ "epoch": 0.35,
399
+ "learning_rate": 4.973098844391807e-05,
400
+ "loss": 0.5339,
401
+ "step": 128
402
+ },
403
+ {
404
+ "epoch": 0.36,
405
+ "learning_rate": 4.9708148406677415e-05,
406
+ "loss": 0.5633,
407
+ "step": 130
408
+ },
409
+ {
410
+ "epoch": 0.36,
411
+ "learning_rate": 4.968438354840834e-05,
412
+ "loss": 0.5479,
413
+ "step": 132
414
+ },
415
+ {
416
+ "epoch": 0.37,
417
+ "learning_rate": 4.965969475862471e-05,
418
+ "loss": 0.5428,
419
+ "step": 134
420
+ },
421
+ {
422
+ "epoch": 0.37,
423
+ "learning_rate": 4.963408296142297e-05,
424
+ "loss": 0.5403,
425
+ "step": 136
426
+ },
427
+ {
428
+ "epoch": 0.38,
429
+ "learning_rate": 4.960754911544753e-05,
430
+ "loss": 0.5488,
431
+ "step": 138
432
+ },
433
+ {
434
+ "epoch": 0.38,
435
+ "learning_rate": 4.9580094213854935e-05,
436
+ "loss": 0.5434,
437
+ "step": 140
438
+ },
439
+ {
440
+ "epoch": 0.39,
441
+ "learning_rate": 4.9551719284276604e-05,
442
+ "loss": 0.5652,
443
+ "step": 142
444
+ },
445
+ {
446
+ "epoch": 0.4,
447
+ "learning_rate": 4.9522425388780466e-05,
448
+ "loss": 0.5371,
449
+ "step": 144
450
+ },
451
+ {
452
+ "epoch": 0.4,
453
+ "learning_rate": 4.949221362383114e-05,
454
+ "loss": 0.5247,
455
+ "step": 146
456
+ },
457
+ {
458
+ "epoch": 0.41,
459
+ "learning_rate": 4.946108512024891e-05,
460
+ "loss": 0.5389,
461
+ "step": 148
462
+ },
463
+ {
464
+ "epoch": 0.41,
465
+ "learning_rate": 4.942904104316741e-05,
466
+ "loss": 0.5153,
467
+ "step": 150
468
+ },
469
+ {
470
+ "epoch": 0.42,
471
+ "learning_rate": 4.939608259199001e-05,
472
+ "loss": 0.5274,
473
+ "step": 152
474
+ },
475
+ {
476
+ "epoch": 0.42,
477
+ "learning_rate": 4.93622110003449e-05,
478
+ "loss": 0.5311,
479
+ "step": 154
480
+ },
481
+ {
482
+ "epoch": 0.43,
483
+ "learning_rate": 4.932742753603896e-05,
484
+ "loss": 0.5356,
485
+ "step": 156
486
+ },
487
+ {
488
+ "epoch": 0.43,
489
+ "learning_rate": 4.929173350101025e-05,
490
+ "loss": 0.5508,
491
+ "step": 158
492
+ },
493
+ {
494
+ "epoch": 0.44,
495
+ "learning_rate": 4.925513023127934e-05,
496
+ "loss": 0.5431,
497
+ "step": 160
498
+ },
499
+ {
500
+ "epoch": 0.44,
501
+ "learning_rate": 4.921761909689927e-05,
502
+ "loss": 0.5685,
503
+ "step": 162
504
+ },
505
+ {
506
+ "epoch": 0.45,
507
+ "learning_rate": 4.917920150190424e-05,
508
+ "loss": 0.5334,
509
+ "step": 164
510
+ },
511
+ {
512
+ "epoch": 0.46,
513
+ "learning_rate": 4.913987888425712e-05,
514
+ "loss": 0.5454,
515
+ "step": 166
516
+ },
517
+ {
518
+ "epoch": 0.46,
519
+ "learning_rate": 4.909965271579557e-05,
520
+ "loss": 0.5622,
521
+ "step": 168
522
+ },
523
+ {
524
+ "epoch": 0.47,
525
+ "learning_rate": 4.9058524502177005e-05,
526
+ "loss": 0.5206,
527
+ "step": 170
528
+ },
529
+ {
530
+ "epoch": 0.47,
531
+ "learning_rate": 4.9016495782822185e-05,
532
+ "loss": 0.5381,
533
+ "step": 172
534
+ },
535
+ {
536
+ "epoch": 0.48,
537
+ "learning_rate": 4.897356813085763e-05,
538
+ "loss": 0.5245,
539
+ "step": 174
540
+ },
541
+ {
542
+ "epoch": 0.48,
543
+ "learning_rate": 4.892974315305674e-05,
544
+ "loss": 0.5363,
545
+ "step": 176
546
+ },
547
+ {
548
+ "epoch": 0.49,
549
+ "learning_rate": 4.8885022489779594e-05,
550
+ "loss": 0.5575,
551
+ "step": 178
552
+ },
553
+ {
554
+ "epoch": 0.49,
555
+ "learning_rate": 4.8839407814911646e-05,
556
+ "loss": 0.5195,
557
+ "step": 180
558
+ },
559
+ {
560
+ "epoch": 0.5,
561
+ "learning_rate": 4.879290083580101e-05,
562
+ "loss": 0.5377,
563
+ "step": 182
564
+ },
565
+ {
566
+ "epoch": 0.51,
567
+ "learning_rate": 4.874550329319457e-05,
568
+ "loss": 0.5259,
569
+ "step": 184
570
+ },
571
+ {
572
+ "epoch": 0.51,
573
+ "learning_rate": 4.869721696117281e-05,
574
+ "loss": 0.5287,
575
+ "step": 186
576
+ },
577
+ {
578
+ "epoch": 0.52,
579
+ "learning_rate": 4.864804364708343e-05,
580
+ "loss": 0.5392,
581
+ "step": 188
582
+ },
583
+ {
584
+ "epoch": 0.52,
585
+ "learning_rate": 4.859798519147369e-05,
586
+ "loss": 0.539,
587
+ "step": 190
588
+ },
589
+ {
590
+ "epoch": 0.53,
591
+ "learning_rate": 4.8547043468021535e-05,
592
+ "loss": 0.5475,
593
+ "step": 192
594
+ },
595
+ {
596
+ "epoch": 0.53,
597
+ "learning_rate": 4.849522038346543e-05,
598
+ "loss": 0.5285,
599
+ "step": 194
600
+ },
601
+ {
602
+ "epoch": 0.54,
603
+ "learning_rate": 4.8442517877533014e-05,
604
+ "loss": 0.5643,
605
+ "step": 196
606
+ },
607
+ {
608
+ "epoch": 0.54,
609
+ "learning_rate": 4.838893792286847e-05,
610
+ "loss": 0.54,
611
+ "step": 198
612
+ },
613
+ {
614
+ "epoch": 0.55,
615
+ "learning_rate": 4.8334482524958766e-05,
616
+ "loss": 0.5173,
617
+ "step": 200
618
+ },
619
+ {
620
+ "epoch": 0.55,
621
+ "eval_loss": 0.5429729223251343,
622
+ "eval_runtime": 372.2276,
623
+ "eval_samples_per_second": 26.366,
624
+ "eval_steps_per_second": 0.207,
625
+ "step": 200
626
+ },
627
+ {
628
+ "epoch": 0.55,
629
+ "learning_rate": 4.827915372205847e-05,
630
+ "loss": 0.5277,
631
+ "step": 202
632
+ },
633
+ {
634
+ "epoch": 0.56,
635
+ "learning_rate": 4.8222953585113576e-05,
636
+ "loss": 0.5166,
637
+ "step": 204
638
+ },
639
+ {
640
+ "epoch": 0.57,
641
+ "learning_rate": 4.8165884217683885e-05,
642
+ "loss": 0.5486,
643
+ "step": 206
644
+ },
645
+ {
646
+ "epoch": 0.57,
647
+ "learning_rate": 4.810794775586438e-05,
648
+ "loss": 0.5249,
649
+ "step": 208
650
+ },
651
+ {
652
+ "epoch": 0.58,
653
+ "learning_rate": 4.804914636820517e-05,
654
+ "loss": 0.5439,
655
+ "step": 210
656
+ },
657
+ {
658
+ "epoch": 0.58,
659
+ "learning_rate": 4.798948225563037e-05,
660
+ "loss": 0.5612,
661
+ "step": 212
662
+ },
663
+ {
664
+ "epoch": 0.59,
665
+ "learning_rate": 4.792895765135572e-05,
666
+ "loss": 0.5426,
667
+ "step": 214
668
+ },
669
+ {
670
+ "epoch": 0.59,
671
+ "learning_rate": 4.7867574820805003e-05,
672
+ "loss": 0.5313,
673
+ "step": 216
674
+ },
675
+ {
676
+ "epoch": 0.6,
677
+ "learning_rate": 4.780533606152522e-05,
678
+ "loss": 0.5669,
679
+ "step": 218
680
+ },
681
+ {
682
+ "epoch": 0.6,
683
+ "learning_rate": 4.7742243703100626e-05,
684
+ "loss": 0.5544,
685
+ "step": 220
686
+ },
687
+ {
688
+ "epoch": 0.61,
689
+ "learning_rate": 4.767830010706551e-05,
690
+ "loss": 0.5239,
691
+ "step": 222
692
+ },
693
+ {
694
+ "epoch": 0.61,
695
+ "learning_rate": 4.761350766681582e-05,
696
+ "loss": 0.5711,
697
+ "step": 224
698
+ },
699
+ {
700
+ "epoch": 0.62,
701
+ "learning_rate": 4.754786880751957e-05,
702
+ "loss": 0.5363,
703
+ "step": 226
704
+ },
705
+ {
706
+ "epoch": 0.63,
707
+ "learning_rate": 4.7481385986026075e-05,
708
+ "loss": 0.548,
709
+ "step": 228
710
+ },
711
+ {
712
+ "epoch": 0.63,
713
+ "learning_rate": 4.7414061690773967e-05,
714
+ "loss": 0.5308,
715
+ "step": 230
716
+ },
717
+ {
718
+ "epoch": 0.64,
719
+ "learning_rate": 4.73458984416981e-05,
720
+ "loss": 0.5467,
721
+ "step": 232
722
+ },
723
+ {
724
+ "epoch": 0.64,
725
+ "learning_rate": 4.7276898790135185e-05,
726
+ "loss": 0.5149,
727
+ "step": 234
728
+ },
729
+ {
730
+ "epoch": 0.65,
731
+ "learning_rate": 4.72070653187283e-05,
732
+ "loss": 0.5403,
733
+ "step": 236
734
+ },
735
+ {
736
+ "epoch": 0.65,
737
+ "learning_rate": 4.713640064133025e-05,
738
+ "loss": 0.5584,
739
+ "step": 238
740
+ },
741
+ {
742
+ "epoch": 0.66,
743
+ "learning_rate": 4.7064907402905705e-05,
744
+ "loss": 0.5513,
745
+ "step": 240
746
+ },
747
+ {
748
+ "epoch": 0.66,
749
+ "learning_rate": 4.699258827943221e-05,
750
+ "loss": 0.5197,
751
+ "step": 242
752
+ },
753
+ {
754
+ "epoch": 0.67,
755
+ "learning_rate": 4.6919445977800014e-05,
756
+ "loss": 0.5389,
757
+ "step": 244
758
+ },
759
+ {
760
+ "epoch": 0.68,
761
+ "learning_rate": 4.6845483235710774e-05,
762
+ "loss": 0.5373,
763
+ "step": 246
764
+ },
765
+ {
766
+ "epoch": 0.68,
767
+ "learning_rate": 4.6770702821575055e-05,
768
+ "loss": 0.5302,
769
+ "step": 248
770
+ },
771
+ {
772
+ "epoch": 0.69,
773
+ "learning_rate": 4.669510753440873e-05,
774
+ "loss": 0.5004,
775
+ "step": 250
776
+ },
777
+ {
778
+ "epoch": 0.69,
779
+ "learning_rate": 4.6618700203728196e-05,
780
+ "loss": 0.513,
781
+ "step": 252
782
+ },
783
+ {
784
+ "epoch": 0.7,
785
+ "learning_rate": 4.654148368944449e-05,
786
+ "loss": 0.5382,
787
+ "step": 254
788
+ },
789
+ {
790
+ "epoch": 0.7,
791
+ "learning_rate": 4.646346088175621e-05,
792
+ "loss": 0.5376,
793
+ "step": 256
794
+ },
795
+ {
796
+ "epoch": 0.71,
797
+ "learning_rate": 4.638463470104139e-05,
798
+ "loss": 0.5564,
799
+ "step": 258
800
+ },
801
+ {
802
+ "epoch": 0.71,
803
+ "learning_rate": 4.630500809774809e-05,
804
+ "loss": 0.5468,
805
+ "step": 260
806
+ },
807
+ {
808
+ "epoch": 0.72,
809
+ "learning_rate": 4.6224584052284106e-05,
810
+ "loss": 0.5559,
811
+ "step": 262
812
+ },
813
+ {
814
+ "epoch": 0.72,
815
+ "learning_rate": 4.614336557490526e-05,
816
+ "loss": 0.5112,
817
+ "step": 264
818
+ },
819
+ {
820
+ "epoch": 0.73,
821
+ "learning_rate": 4.606135570560286e-05,
822
+ "loss": 0.5295,
823
+ "step": 266
824
+ },
825
+ {
826
+ "epoch": 0.74,
827
+ "learning_rate": 4.5978557513989814e-05,
828
+ "loss": 0.5489,
829
+ "step": 268
830
+ },
831
+ {
832
+ "epoch": 0.74,
833
+ "learning_rate": 4.5894974099185806e-05,
834
+ "loss": 0.5569,
835
+ "step": 270
836
+ },
837
+ {
838
+ "epoch": 0.75,
839
+ "learning_rate": 4.581060858970124e-05,
840
+ "loss": 0.5462,
841
+ "step": 272
842
+ },
843
+ {
844
+ "epoch": 0.75,
845
+ "learning_rate": 4.572546414332018e-05,
846
+ "loss": 0.543,
847
+ "step": 274
848
+ },
849
+ {
850
+ "epoch": 0.76,
851
+ "learning_rate": 4.5639543946982144e-05,
852
+ "loss": 0.5119,
853
+ "step": 276
854
+ },
855
+ {
856
+ "epoch": 0.76,
857
+ "learning_rate": 4.55528512166628e-05,
858
+ "loss": 0.5203,
859
+ "step": 278
860
+ },
861
+ {
862
+ "epoch": 0.77,
863
+ "learning_rate": 4.546538919725364e-05,
864
+ "loss": 0.5358,
865
+ "step": 280
866
+ },
867
+ {
868
+ "epoch": 0.77,
869
+ "learning_rate": 4.5377161162440445e-05,
870
+ "loss": 0.5373,
871
+ "step": 282
872
+ },
873
+ {
874
+ "epoch": 0.78,
875
+ "learning_rate": 4.528817041458084e-05,
876
+ "loss": 0.5436,
877
+ "step": 284
878
+ },
879
+ {
880
+ "epoch": 0.79,
881
+ "learning_rate": 4.519842028458062e-05,
882
+ "loss": 0.5235,
883
+ "step": 286
884
+ },
885
+ {
886
+ "epoch": 0.79,
887
+ "learning_rate": 4.510791413176912e-05,
888
+ "loss": 0.5087,
889
+ "step": 288
890
+ },
891
+ {
892
+ "epoch": 0.8,
893
+ "learning_rate": 4.501665534377345e-05,
894
+ "loss": 0.5202,
895
+ "step": 290
896
+ },
897
+ {
898
+ "epoch": 0.8,
899
+ "learning_rate": 4.492464733639168e-05,
900
+ "loss": 0.5374,
901
+ "step": 292
902
+ },
903
+ {
904
+ "epoch": 0.81,
905
+ "learning_rate": 4.483189355346506e-05,
906
+ "loss": 0.5154,
907
+ "step": 294
908
+ },
909
+ {
910
+ "epoch": 0.81,
911
+ "learning_rate": 4.473839746674902e-05,
912
+ "loss": 0.5269,
913
+ "step": 296
914
+ },
915
+ {
916
+ "epoch": 0.82,
917
+ "learning_rate": 4.46441625757833e-05,
918
+ "loss": 0.561,
919
+ "step": 298
920
+ },
921
+ {
922
+ "epoch": 0.82,
923
+ "learning_rate": 4.454919240776093e-05,
924
+ "loss": 0.5133,
925
+ "step": 300
926
+ },
927
+ {
928
+ "epoch": 0.82,
929
+ "eval_loss": 0.5352138876914978,
930
+ "eval_runtime": 372.3236,
931
+ "eval_samples_per_second": 26.359,
932
+ "eval_steps_per_second": 0.207,
933
+ "step": 300
934
+ },
935
+ {
936
+ "epoch": 0.83,
937
+ "learning_rate": 4.4453490517396215e-05,
938
+ "loss": 0.5321,
939
+ "step": 302
940
+ },
941
+ {
942
+ "epoch": 0.83,
943
+ "learning_rate": 4.435706048679166e-05,
944
+ "loss": 0.5605,
945
+ "step": 304
946
+ },
947
+ {
948
+ "epoch": 0.84,
949
+ "learning_rate": 4.4259905925303935e-05,
950
+ "loss": 0.5195,
951
+ "step": 306
952
+ },
953
+ {
954
+ "epoch": 0.85,
955
+ "learning_rate": 4.416203046940875e-05,
956
+ "loss": 0.5167,
957
+ "step": 308
958
+ },
959
+ {
960
+ "epoch": 0.85,
961
+ "learning_rate": 4.4063437782564745e-05,
962
+ "loss": 0.4983,
963
+ "step": 310
964
+ },
965
+ {
966
+ "epoch": 0.86,
967
+ "learning_rate": 4.396413155507637e-05,
968
+ "loss": 0.5192,
969
+ "step": 312
970
+ },
971
+ {
972
+ "epoch": 0.86,
973
+ "learning_rate": 4.386411550395576e-05,
974
+ "loss": 0.5082,
975
+ "step": 314
976
+ },
977
+ {
978
+ "epoch": 0.87,
979
+ "learning_rate": 4.37633933727836e-05,
980
+ "loss": 0.5406,
981
+ "step": 316
982
+ },
983
+ {
984
+ "epoch": 0.87,
985
+ "learning_rate": 4.366196893156902e-05,
986
+ "loss": 0.5123,
987
+ "step": 318
988
+ },
989
+ {
990
+ "epoch": 0.88,
991
+ "learning_rate": 4.355984597660846e-05,
992
+ "loss": 0.5366,
993
+ "step": 320
994
+ },
995
+ {
996
+ "epoch": 0.88,
997
+ "learning_rate": 4.3457028330343606e-05,
998
+ "loss": 0.5418,
999
+ "step": 322
1000
+ },
1001
+ {
1002
+ "epoch": 0.89,
1003
+ "learning_rate": 4.335351984121829e-05,
1004
+ "loss": 0.5353,
1005
+ "step": 324
1006
+ },
1007
+ {
1008
+ "epoch": 0.89,
1009
+ "learning_rate": 4.324932438353446e-05,
1010
+ "loss": 0.5181,
1011
+ "step": 326
1012
+ },
1013
+ {
1014
+ "epoch": 0.9,
1015
+ "learning_rate": 4.314444585730713e-05,
1016
+ "loss": 0.5291,
1017
+ "step": 328
1018
+ },
1019
+ {
1020
+ "epoch": 0.91,
1021
+ "learning_rate": 4.3038888188118475e-05,
1022
+ "loss": 0.5377,
1023
+ "step": 330
1024
+ },
1025
+ {
1026
+ "epoch": 0.91,
1027
+ "learning_rate": 4.293265532697084e-05,
1028
+ "loss": 0.5304,
1029
+ "step": 332
1030
+ },
1031
+ {
1032
+ "epoch": 0.92,
1033
+ "learning_rate": 4.282575125013884e-05,
1034
+ "loss": 0.5121,
1035
+ "step": 334
1036
+ },
1037
+ {
1038
+ "epoch": 0.92,
1039
+ "learning_rate": 4.271817995902062e-05,
1040
+ "loss": 0.5247,
1041
+ "step": 336
1042
+ },
1043
+ {
1044
+ "epoch": 0.93,
1045
+ "learning_rate": 4.260994547998795e-05,
1046
+ "loss": 0.5106,
1047
+ "step": 338
1048
+ },
1049
+ {
1050
+ "epoch": 0.93,
1051
+ "learning_rate": 4.2501051864235636e-05,
1052
+ "loss": 0.4972,
1053
+ "step": 340
1054
+ },
1055
+ {
1056
+ "epoch": 0.94,
1057
+ "learning_rate": 4.2391503187629836e-05,
1058
+ "loss": 0.5521,
1059
+ "step": 342
1060
+ },
1061
+ {
1062
+ "epoch": 0.94,
1063
+ "learning_rate": 4.228130355055548e-05,
1064
+ "loss": 0.5126,
1065
+ "step": 344
1066
+ },
1067
+ {
1068
+ "epoch": 0.95,
1069
+ "learning_rate": 4.217045707776285e-05,
1070
+ "loss": 0.5321,
1071
+ "step": 346
1072
+ },
1073
+ {
1074
+ "epoch": 0.96,
1075
+ "learning_rate": 4.2058967918213125e-05,
1076
+ "loss": 0.5347,
1077
+ "step": 348
1078
+ },
1079
+ {
1080
+ "epoch": 0.96,
1081
+ "learning_rate": 4.194684024492315e-05,
1082
+ "loss": 0.5411,
1083
+ "step": 350
1084
+ },
1085
+ {
1086
+ "epoch": 0.97,
1087
+ "learning_rate": 4.1834078254809194e-05,
1088
+ "loss": 0.522,
1089
+ "step": 352
1090
+ },
1091
+ {
1092
+ "epoch": 0.97,
1093
+ "learning_rate": 4.172068616852988e-05,
1094
+ "loss": 0.5115,
1095
+ "step": 354
1096
+ },
1097
+ {
1098
+ "epoch": 0.98,
1099
+ "learning_rate": 4.16066682303282e-05,
1100
+ "loss": 0.5396,
1101
+ "step": 356
1102
+ },
1103
+ {
1104
+ "epoch": 0.98,
1105
+ "learning_rate": 4.149202870787269e-05,
1106
+ "loss": 0.5215,
1107
+ "step": 358
1108
+ },
1109
+ {
1110
+ "epoch": 0.99,
1111
+ "learning_rate": 4.13767718920976e-05,
1112
+ "loss": 0.508,
1113
+ "step": 360
1114
+ },
1115
+ {
1116
+ "epoch": 0.99,
1117
+ "learning_rate": 4.1260902097042385e-05,
1118
+ "loss": 0.531,
1119
+ "step": 362
1120
+ },
1121
+ {
1122
+ "epoch": 1.0,
1123
+ "learning_rate": 4.114442365969019e-05,
1124
+ "loss": 0.5266,
1125
+ "step": 364
1126
+ },
1127
+ {
1128
+ "epoch": 1.0,
1129
+ "learning_rate": 4.10273409398055e-05,
1130
+ "loss": 0.5073,
1131
+ "step": 366
1132
+ },
1133
+ {
1134
+ "epoch": 1.01,
1135
+ "learning_rate": 4.090965831977101e-05,
1136
+ "loss": 0.5128,
1137
+ "step": 368
1138
+ },
1139
+ {
1140
+ "epoch": 1.02,
1141
+ "learning_rate": 4.079138020442351e-05,
1142
+ "loss": 0.5457,
1143
+ "step": 370
1144
+ },
1145
+ {
1146
+ "epoch": 1.02,
1147
+ "learning_rate": 4.0672511020889104e-05,
1148
+ "loss": 0.526,
1149
+ "step": 372
1150
+ },
1151
+ {
1152
+ "epoch": 1.03,
1153
+ "learning_rate": 4.055305521841743e-05,
1154
+ "loss": 0.5231,
1155
+ "step": 374
1156
+ },
1157
+ {
1158
+ "epoch": 1.03,
1159
+ "learning_rate": 4.043301726821515e-05,
1160
+ "loss": 0.5354,
1161
+ "step": 376
1162
+ },
1163
+ {
1164
+ "epoch": 1.04,
1165
+ "learning_rate": 4.0312401663278616e-05,
1166
+ "loss": 0.5275,
1167
+ "step": 378
1168
+ },
1169
+ {
1170
+ "epoch": 1.04,
1171
+ "learning_rate": 4.019121291822569e-05,
1172
+ "loss": 0.5066,
1173
+ "step": 380
1174
+ },
1175
+ {
1176
+ "epoch": 1.05,
1177
+ "learning_rate": 4.006945556912673e-05,
1178
+ "loss": 0.5434,
1179
+ "step": 382
1180
+ },
1181
+ {
1182
+ "epoch": 1.05,
1183
+ "learning_rate": 3.9947134173334846e-05,
1184
+ "loss": 0.4798,
1185
+ "step": 384
1186
+ },
1187
+ {
1188
+ "epoch": 1.06,
1189
+ "learning_rate": 3.9824253309315286e-05,
1190
+ "loss": 0.5124,
1191
+ "step": 386
1192
+ },
1193
+ {
1194
+ "epoch": 1.07,
1195
+ "learning_rate": 3.97008175764741e-05,
1196
+ "loss": 0.523,
1197
+ "step": 388
1198
+ },
1199
+ {
1200
+ "epoch": 1.07,
1201
+ "learning_rate": 3.9576831594985956e-05,
1202
+ "loss": 0.4993,
1203
+ "step": 390
1204
+ },
1205
+ {
1206
+ "epoch": 1.08,
1207
+ "learning_rate": 3.945230000562121e-05,
1208
+ "loss": 0.4933,
1209
+ "step": 392
1210
+ },
1211
+ {
1212
+ "epoch": 1.08,
1213
+ "learning_rate": 3.932722746957223e-05,
1214
+ "loss": 0.4813,
1215
+ "step": 394
1216
+ },
1217
+ {
1218
+ "epoch": 1.09,
1219
+ "learning_rate": 3.920161866827889e-05,
1220
+ "loss": 0.5043,
1221
+ "step": 396
1222
+ },
1223
+ {
1224
+ "epoch": 1.09,
1225
+ "learning_rate": 3.9075478303253396e-05,
1226
+ "loss": 0.4889,
1227
+ "step": 398
1228
+ },
1229
+ {
1230
+ "epoch": 1.1,
1231
+ "learning_rate": 3.8948811095904234e-05,
1232
+ "loss": 0.5248,
1233
+ "step": 400
1234
+ },
1235
+ {
1236
+ "epoch": 1.1,
1237
+ "eval_loss": 0.5309346914291382,
1238
+ "eval_runtime": 372.3756,
1239
+ "eval_samples_per_second": 26.355,
1240
+ "eval_steps_per_second": 0.207,
1241
+ "step": 400
1242
+ },
1243
+ {
1244
+ "epoch": 1.1,
1245
+ "learning_rate": 3.882162178735952e-05,
1246
+ "loss": 0.4794,
1247
+ "step": 402
1248
+ },
1249
+ {
1250
+ "epoch": 1.11,
1251
+ "learning_rate": 3.869391513828951e-05,
1252
+ "loss": 0.5006,
1253
+ "step": 404
1254
+ },
1255
+ {
1256
+ "epoch": 1.11,
1257
+ "learning_rate": 3.856569592872841e-05,
1258
+ "loss": 0.509,
1259
+ "step": 406
1260
+ },
1261
+ {
1262
+ "epoch": 1.12,
1263
+ "learning_rate": 3.843696895789546e-05,
1264
+ "loss": 0.4963,
1265
+ "step": 408
1266
+ },
1267
+ {
1268
+ "epoch": 1.13,
1269
+ "learning_rate": 3.83077390440153e-05,
1270
+ "loss": 0.4662,
1271
+ "step": 410
1272
+ },
1273
+ {
1274
+ "epoch": 1.13,
1275
+ "learning_rate": 3.8178011024137636e-05,
1276
+ "loss": 0.46,
1277
+ "step": 412
1278
+ },
1279
+ {
1280
+ "epoch": 1.14,
1281
+ "learning_rate": 3.804778975395618e-05,
1282
+ "loss": 0.4948,
1283
+ "step": 414
1284
+ },
1285
+ {
1286
+ "epoch": 1.14,
1287
+ "learning_rate": 3.791708010762689e-05,
1288
+ "loss": 0.464,
1289
+ "step": 416
1290
+ },
1291
+ {
1292
+ "epoch": 1.15,
1293
+ "learning_rate": 3.778588697758556e-05,
1294
+ "loss": 0.5009,
1295
+ "step": 418
1296
+ },
1297
+ {
1298
+ "epoch": 1.15,
1299
+ "learning_rate": 3.7654215274364675e-05,
1300
+ "loss": 0.4557,
1301
+ "step": 420
1302
+ },
1303
+ {
1304
+ "epoch": 1.16,
1305
+ "learning_rate": 3.752206992640962e-05,
1306
+ "loss": 0.4959,
1307
+ "step": 422
1308
+ },
1309
+ {
1310
+ "epoch": 1.16,
1311
+ "learning_rate": 3.73894558798942e-05,
1312
+ "loss": 0.4421,
1313
+ "step": 424
1314
+ },
1315
+ {
1316
+ "epoch": 1.17,
1317
+ "learning_rate": 3.7256378098535544e-05,
1318
+ "loss": 0.4353,
1319
+ "step": 426
1320
+ },
1321
+ {
1322
+ "epoch": 1.18,
1323
+ "learning_rate": 3.712284156340824e-05,
1324
+ "loss": 0.4763,
1325
+ "step": 428
1326
+ },
1327
+ {
1328
+ "epoch": 1.18,
1329
+ "learning_rate": 3.698885127275795e-05,
1330
+ "loss": 0.4794,
1331
+ "step": 430
1332
+ },
1333
+ {
1334
+ "epoch": 1.19,
1335
+ "learning_rate": 3.685441224181434e-05,
1336
+ "loss": 0.4441,
1337
+ "step": 432
1338
+ },
1339
+ {
1340
+ "epoch": 1.19,
1341
+ "learning_rate": 3.671952950260331e-05,
1342
+ "loss": 0.4641,
1343
+ "step": 434
1344
+ },
1345
+ {
1346
+ "epoch": 1.2,
1347
+ "learning_rate": 3.658420810375866e-05,
1348
+ "loss": 0.4387,
1349
+ "step": 436
1350
+ },
1351
+ {
1352
+ "epoch": 1.2,
1353
+ "learning_rate": 3.644845311033316e-05,
1354
+ "loss": 0.441,
1355
+ "step": 438
1356
+ },
1357
+ {
1358
+ "epoch": 1.21,
1359
+ "learning_rate": 3.631226960360894e-05,
1360
+ "loss": 0.429,
1361
+ "step": 440
1362
+ },
1363
+ {
1364
+ "epoch": 1.21,
1365
+ "learning_rate": 3.6175662680907265e-05,
1366
+ "loss": 0.4344,
1367
+ "step": 442
1368
+ },
1369
+ {
1370
+ "epoch": 1.22,
1371
+ "learning_rate": 3.60386374553978e-05,
1372
+ "loss": 0.4528,
1373
+ "step": 444
1374
+ },
1375
+ {
1376
+ "epoch": 1.22,
1377
+ "learning_rate": 3.5901199055907195e-05,
1378
+ "loss": 0.4319,
1379
+ "step": 446
1380
+ },
1381
+ {
1382
+ "epoch": 1.23,
1383
+ "learning_rate": 3.576335262672711e-05,
1384
+ "loss": 0.45,
1385
+ "step": 448
1386
+ },
1387
+ {
1388
+ "epoch": 1.24,
1389
+ "learning_rate": 3.5625103327421684e-05,
1390
+ "loss": 0.4411,
1391
+ "step": 450
1392
+ },
1393
+ {
1394
+ "epoch": 1.24,
1395
+ "learning_rate": 3.54864563326344e-05,
1396
+ "loss": 0.4117,
1397
+ "step": 452
1398
+ },
1399
+ {
1400
+ "epoch": 1.25,
1401
+ "learning_rate": 3.534741683189441e-05,
1402
+ "loss": 0.4585,
1403
+ "step": 454
1404
+ },
1405
+ {
1406
+ "epoch": 1.25,
1407
+ "learning_rate": 3.5207990029422284e-05,
1408
+ "loss": 0.4313,
1409
+ "step": 456
1410
+ },
1411
+ {
1412
+ "epoch": 1.26,
1413
+ "learning_rate": 3.50681811439352e-05,
1414
+ "loss": 0.4476,
1415
+ "step": 458
1416
+ },
1417
+ {
1418
+ "epoch": 1.26,
1419
+ "learning_rate": 3.492799540845165e-05,
1420
+ "loss": 0.4558,
1421
+ "step": 460
1422
+ },
1423
+ {
1424
+ "epoch": 1.27,
1425
+ "learning_rate": 3.478743807009552e-05,
1426
+ "loss": 0.4661,
1427
+ "step": 462
1428
+ },
1429
+ {
1430
+ "epoch": 1.27,
1431
+ "learning_rate": 3.4646514389899755e-05,
1432
+ "loss": 0.4317,
1433
+ "step": 464
1434
+ },
1435
+ {
1436
+ "epoch": 1.28,
1437
+ "learning_rate": 3.450522964260936e-05,
1438
+ "loss": 0.4353,
1439
+ "step": 466
1440
+ },
1441
+ {
1442
+ "epoch": 1.28,
1443
+ "learning_rate": 3.436358911648403e-05,
1444
+ "loss": 0.4386,
1445
+ "step": 468
1446
+ },
1447
+ {
1448
+ "epoch": 1.29,
1449
+ "learning_rate": 3.4221598113100195e-05,
1450
+ "loss": 0.4262,
1451
+ "step": 470
1452
+ },
1453
+ {
1454
+ "epoch": 1.3,
1455
+ "learning_rate": 3.407926194715257e-05,
1456
+ "loss": 0.4343,
1457
+ "step": 472
1458
+ },
1459
+ {
1460
+ "epoch": 1.3,
1461
+ "learning_rate": 3.393658594625523e-05,
1462
+ "loss": 0.4504,
1463
+ "step": 474
1464
+ },
1465
+ {
1466
+ "epoch": 1.31,
1467
+ "learning_rate": 3.379357545074221e-05,
1468
+ "loss": 0.4502,
1469
+ "step": 476
1470
+ },
1471
+ {
1472
+ "epoch": 1.31,
1473
+ "learning_rate": 3.365023581346762e-05,
1474
+ "loss": 0.4238,
1475
+ "step": 478
1476
+ },
1477
+ {
1478
+ "epoch": 1.32,
1479
+ "learning_rate": 3.350657239960526e-05,
1480
+ "loss": 0.4433,
1481
+ "step": 480
1482
+ },
1483
+ {
1484
+ "epoch": 1.32,
1485
+ "learning_rate": 3.3362590586447846e-05,
1486
+ "loss": 0.4298,
1487
+ "step": 482
1488
+ },
1489
+ {
1490
+ "epoch": 1.33,
1491
+ "learning_rate": 3.3218295763205694e-05,
1492
+ "loss": 0.4424,
1493
+ "step": 484
1494
+ },
1495
+ {
1496
+ "epoch": 1.33,
1497
+ "learning_rate": 3.307369333080504e-05,
1498
+ "loss": 0.4417,
1499
+ "step": 486
1500
+ },
1501
+ {
1502
+ "epoch": 1.34,
1503
+ "learning_rate": 3.292878870168585e-05,
1504
+ "loss": 0.4561,
1505
+ "step": 488
1506
+ },
1507
+ {
1508
+ "epoch": 1.35,
1509
+ "learning_rate": 3.278358729959929e-05,
1510
+ "loss": 0.4683,
1511
+ "step": 490
1512
+ },
1513
+ {
1514
+ "epoch": 1.35,
1515
+ "learning_rate": 3.263809455940463e-05,
1516
+ "loss": 0.4342,
1517
+ "step": 492
1518
+ },
1519
+ {
1520
+ "epoch": 1.36,
1521
+ "learning_rate": 3.24923159268659e-05,
1522
+ "loss": 0.4437,
1523
+ "step": 494
1524
+ },
1525
+ {
1526
+ "epoch": 1.36,
1527
+ "learning_rate": 3.234625685844803e-05,
1528
+ "loss": 0.4343,
1529
+ "step": 496
1530
+ },
1531
+ {
1532
+ "epoch": 1.37,
1533
+ "learning_rate": 3.219992282111256e-05,
1534
+ "loss": 0.4324,
1535
+ "step": 498
1536
+ },
1537
+ {
1538
+ "epoch": 1.37,
1539
+ "learning_rate": 3.2053319292113114e-05,
1540
+ "loss": 0.4312,
1541
+ "step": 500
1542
+ },
1543
+ {
1544
+ "epoch": 1.37,
1545
+ "eval_loss": 0.5447221398353577,
1546
+ "eval_runtime": 372.5869,
1547
+ "eval_samples_per_second": 26.34,
1548
+ "eval_steps_per_second": 0.207,
1549
+ "step": 500
1550
+ },
1551
+ {
1552
+ "epoch": 1.38,
1553
+ "learning_rate": 3.190645175879032e-05,
1554
+ "loss": 0.437,
1555
+ "step": 502
1556
+ },
1557
+ {
1558
+ "epoch": 1.38,
1559
+ "learning_rate": 3.1759325718366414e-05,
1560
+ "loss": 0.4252,
1561
+ "step": 504
1562
+ },
1563
+ {
1564
+ "epoch": 1.39,
1565
+ "learning_rate": 3.1611946677739515e-05,
1566
+ "loss": 0.4599,
1567
+ "step": 506
1568
+ },
1569
+ {
1570
+ "epoch": 1.39,
1571
+ "learning_rate": 3.146432015327751e-05,
1572
+ "loss": 0.4204,
1573
+ "step": 508
1574
+ },
1575
+ {
1576
+ "epoch": 1.4,
1577
+ "learning_rate": 3.131645167061152e-05,
1578
+ "loss": 0.4276,
1579
+ "step": 510
1580
+ },
1581
+ {
1582
+ "epoch": 1.41,
1583
+ "learning_rate": 3.1168346764429126e-05,
1584
+ "loss": 0.4286,
1585
+ "step": 512
1586
+ },
1587
+ {
1588
+ "epoch": 1.41,
1589
+ "learning_rate": 3.10200109782672e-05,
1590
+ "loss": 0.4038,
1591
+ "step": 514
1592
+ },
1593
+ {
1594
+ "epoch": 1.42,
1595
+ "learning_rate": 3.087144986430442e-05,
1596
+ "loss": 0.4278,
1597
+ "step": 516
1598
+ },
1599
+ {
1600
+ "epoch": 1.42,
1601
+ "learning_rate": 3.0722668983153394e-05,
1602
+ "loss": 0.4254,
1603
+ "step": 518
1604
+ },
1605
+ {
1606
+ "epoch": 1.43,
1607
+ "learning_rate": 3.0573673903652615e-05,
1608
+ "loss": 0.4241,
1609
+ "step": 520
1610
+ },
1611
+ {
1612
+ "epoch": 1.43,
1613
+ "learning_rate": 3.042447020265795e-05,
1614
+ "loss": 0.4487,
1615
+ "step": 522
1616
+ },
1617
+ {
1618
+ "epoch": 1.44,
1619
+ "learning_rate": 3.027506346483395e-05,
1620
+ "loss": 0.4359,
1621
+ "step": 524
1622
+ },
1623
+ {
1624
+ "epoch": 1.44,
1625
+ "learning_rate": 3.0125459282444755e-05,
1626
+ "loss": 0.4468,
1627
+ "step": 526
1628
+ },
1629
+ {
1630
+ "epoch": 1.45,
1631
+ "learning_rate": 2.997566325514487e-05,
1632
+ "loss": 0.4352,
1633
+ "step": 528
1634
+ },
1635
+ {
1636
+ "epoch": 1.46,
1637
+ "learning_rate": 2.982568098976947e-05,
1638
+ "loss": 0.4306,
1639
+ "step": 530
1640
+ },
1641
+ {
1642
+ "epoch": 1.46,
1643
+ "learning_rate": 2.967551810012461e-05,
1644
+ "loss": 0.4406,
1645
+ "step": 532
1646
+ },
1647
+ {
1648
+ "epoch": 1.47,
1649
+ "learning_rate": 2.9525180206777058e-05,
1650
+ "loss": 0.4278,
1651
+ "step": 534
1652
+ },
1653
+ {
1654
+ "epoch": 1.47,
1655
+ "learning_rate": 2.9374672936843937e-05,
1656
+ "loss": 0.4227,
1657
+ "step": 536
1658
+ },
1659
+ {
1660
+ "epoch": 1.48,
1661
+ "learning_rate": 2.9224001923782134e-05,
1662
+ "loss": 0.4191,
1663
+ "step": 538
1664
+ },
1665
+ {
1666
+ "epoch": 1.48,
1667
+ "learning_rate": 2.907317280717736e-05,
1668
+ "loss": 0.4219,
1669
+ "step": 540
1670
+ },
1671
+ {
1672
+ "epoch": 1.49,
1673
+ "learning_rate": 2.8922191232533137e-05,
1674
+ "loss": 0.4483,
1675
+ "step": 542
1676
+ },
1677
+ {
1678
+ "epoch": 1.49,
1679
+ "learning_rate": 2.8771062851059456e-05,
1680
+ "loss": 0.4192,
1681
+ "step": 544
1682
+ },
1683
+ {
1684
+ "epoch": 1.5,
1685
+ "learning_rate": 2.861979331946126e-05,
1686
+ "loss": 0.4188,
1687
+ "step": 546
1688
+ },
1689
+ {
1690
+ "epoch": 1.5,
1691
+ "learning_rate": 2.8468388299726712e-05,
1692
+ "loss": 0.4305,
1693
+ "step": 548
1694
+ },
1695
+ {
1696
+ "epoch": 1.51,
1697
+ "learning_rate": 2.8316853458915256e-05,
1698
+ "loss": 0.4191,
1699
+ "step": 550
1700
+ },
1701
+ {
1702
+ "epoch": 1.52,
1703
+ "learning_rate": 2.816519446894555e-05,
1704
+ "loss": 0.435,
1705
+ "step": 552
1706
+ },
1707
+ {
1708
+ "epoch": 1.52,
1709
+ "learning_rate": 2.8013417006383076e-05,
1710
+ "loss": 0.4341,
1711
+ "step": 554
1712
+ },
1713
+ {
1714
+ "epoch": 1.53,
1715
+ "learning_rate": 2.7861526752227767e-05,
1716
+ "loss": 0.4346,
1717
+ "step": 556
1718
+ },
1719
+ {
1720
+ "epoch": 1.53,
1721
+ "learning_rate": 2.7709529391701305e-05,
1722
+ "loss": 0.4205,
1723
+ "step": 558
1724
+ },
1725
+ {
1726
+ "epoch": 1.54,
1727
+ "learning_rate": 2.7557430614034336e-05,
1728
+ "loss": 0.4578,
1729
+ "step": 560
1730
+ },
1731
+ {
1732
+ "epoch": 1.54,
1733
+ "learning_rate": 2.740523611225354e-05,
1734
+ "loss": 0.4287,
1735
+ "step": 562
1736
+ },
1737
+ {
1738
+ "epoch": 1.55,
1739
+ "learning_rate": 2.7252951582968523e-05,
1740
+ "loss": 0.4184,
1741
+ "step": 564
1742
+ },
1743
+ {
1744
+ "epoch": 1.55,
1745
+ "learning_rate": 2.7100582726158608e-05,
1746
+ "loss": 0.4243,
1747
+ "step": 566
1748
+ },
1749
+ {
1750
+ "epoch": 1.56,
1751
+ "learning_rate": 2.694813524495947e-05,
1752
+ "loss": 0.4133,
1753
+ "step": 568
1754
+ },
1755
+ {
1756
+ "epoch": 1.56,
1757
+ "learning_rate": 2.6795614845449714e-05,
1758
+ "loss": 0.4318,
1759
+ "step": 570
1760
+ },
1761
+ {
1762
+ "epoch": 1.57,
1763
+ "learning_rate": 2.6643027236437212e-05,
1764
+ "loss": 0.419,
1765
+ "step": 572
1766
+ },
1767
+ {
1768
+ "epoch": 1.58,
1769
+ "learning_rate": 2.6490378129245498e-05,
1770
+ "loss": 0.4369,
1771
+ "step": 574
1772
+ },
1773
+ {
1774
+ "epoch": 1.58,
1775
+ "learning_rate": 2.6337673237499988e-05,
1776
+ "loss": 0.4497,
1777
+ "step": 576
1778
+ },
1779
+ {
1780
+ "epoch": 1.59,
1781
+ "learning_rate": 2.618491827691407e-05,
1782
+ "loss": 0.4435,
1783
+ "step": 578
1784
+ },
1785
+ {
1786
+ "epoch": 1.59,
1787
+ "learning_rate": 2.6032118965075225e-05,
1788
+ "loss": 0.4288,
1789
+ "step": 580
1790
+ },
1791
+ {
1792
+ "epoch": 1.6,
1793
+ "learning_rate": 2.5879281021230972e-05,
1794
+ "loss": 0.4417,
1795
+ "step": 582
1796
+ },
1797
+ {
1798
+ "epoch": 1.6,
1799
+ "learning_rate": 2.572641016607484e-05,
1800
+ "loss": 0.4521,
1801
+ "step": 584
1802
+ },
1803
+ {
1804
+ "epoch": 1.61,
1805
+ "learning_rate": 2.5573512121532207e-05,
1806
+ "loss": 0.4259,
1807
+ "step": 586
1808
+ },
1809
+ {
1810
+ "epoch": 1.61,
1811
+ "learning_rate": 2.542059261054613e-05,
1812
+ "loss": 0.457,
1813
+ "step": 588
1814
+ },
1815
+ {
1816
+ "epoch": 1.62,
1817
+ "learning_rate": 2.5267657356863188e-05,
1818
+ "loss": 0.4348,
1819
+ "step": 590
1820
+ },
1821
+ {
1822
+ "epoch": 1.63,
1823
+ "learning_rate": 2.511471208481918e-05,
1824
+ "loss": 0.4357,
1825
+ "step": 592
1826
+ },
1827
+ {
1828
+ "epoch": 1.63,
1829
+ "learning_rate": 2.49617625191249e-05,
1830
+ "loss": 0.4301,
1831
+ "step": 594
1832
+ },
1833
+ {
1834
+ "epoch": 1.64,
1835
+ "learning_rate": 2.4808814384651834e-05,
1836
+ "loss": 0.429,
1837
+ "step": 596
1838
+ },
1839
+ {
1840
+ "epoch": 1.64,
1841
+ "learning_rate": 2.4655873406217928e-05,
1842
+ "loss": 0.4215,
1843
+ "step": 598
1844
+ },
1845
+ {
1846
+ "epoch": 1.65,
1847
+ "learning_rate": 2.4502945308373246e-05,
1848
+ "loss": 0.4257,
1849
+ "step": 600
1850
+ },
1851
+ {
1852
+ "epoch": 1.65,
1853
+ "eval_loss": 0.5429728627204895,
1854
+ "eval_runtime": 372.1195,
1855
+ "eval_samples_per_second": 26.373,
1856
+ "eval_steps_per_second": 0.207,
1857
+ "step": 600
1858
+ },
1859
+ {
1860
+ "epoch": 1.65,
1861
+ "learning_rate": 2.435003581518577e-05,
1862
+ "loss": 0.4524,
1863
+ "step": 602
1864
+ },
1865
+ {
1866
+ "epoch": 1.66,
1867
+ "learning_rate": 2.4197150650027086e-05,
1868
+ "loss": 0.4464,
1869
+ "step": 604
1870
+ },
1871
+ {
1872
+ "epoch": 1.66,
1873
+ "learning_rate": 2.4044295535358195e-05,
1874
+ "loss": 0.4308,
1875
+ "step": 606
1876
+ },
1877
+ {
1878
+ "epoch": 1.67,
1879
+ "learning_rate": 2.389147619251531e-05,
1880
+ "loss": 0.4212,
1881
+ "step": 608
1882
+ },
1883
+ {
1884
+ "epoch": 1.67,
1885
+ "learning_rate": 2.3738698341495724e-05,
1886
+ "loss": 0.4358,
1887
+ "step": 610
1888
+ },
1889
+ {
1890
+ "epoch": 1.68,
1891
+ "learning_rate": 2.358596770074369e-05,
1892
+ "loss": 0.4226,
1893
+ "step": 612
1894
+ },
1895
+ {
1896
+ "epoch": 1.69,
1897
+ "learning_rate": 2.3433289986936398e-05,
1898
+ "loss": 0.4027,
1899
+ "step": 614
1900
+ },
1901
+ {
1902
+ "epoch": 1.69,
1903
+ "learning_rate": 2.3280670914769972e-05,
1904
+ "loss": 0.4154,
1905
+ "step": 616
1906
+ },
1907
+ {
1908
+ "epoch": 1.7,
1909
+ "learning_rate": 2.3128116196745605e-05,
1910
+ "loss": 0.438,
1911
+ "step": 618
1912
+ },
1913
+ {
1914
+ "epoch": 1.7,
1915
+ "learning_rate": 2.297563154295575e-05,
1916
+ "loss": 0.4265,
1917
+ "step": 620
1918
+ },
1919
+ {
1920
+ "epoch": 1.71,
1921
+ "learning_rate": 2.2823222660870337e-05,
1922
+ "loss": 0.4539,
1923
+ "step": 622
1924
+ },
1925
+ {
1926
+ "epoch": 1.71,
1927
+ "learning_rate": 2.267089525512318e-05,
1928
+ "loss": 0.428,
1929
+ "step": 624
1930
+ },
1931
+ {
1932
+ "epoch": 1.72,
1933
+ "learning_rate": 2.2518655027298464e-05,
1934
+ "loss": 0.4473,
1935
+ "step": 626
1936
+ },
1937
+ {
1938
+ "epoch": 1.72,
1939
+ "learning_rate": 2.2366507675717314e-05,
1940
+ "loss": 0.4209,
1941
+ "step": 628
1942
+ },
1943
+ {
1944
+ "epoch": 1.73,
1945
+ "learning_rate": 2.221445889522452e-05,
1946
+ "loss": 0.4155,
1947
+ "step": 630
1948
+ },
1949
+ {
1950
+ "epoch": 1.74,
1951
+ "learning_rate": 2.2062514376975373e-05,
1952
+ "loss": 0.4486,
1953
+ "step": 632
1954
+ },
1955
+ {
1956
+ "epoch": 1.74,
1957
+ "learning_rate": 2.191067980822266e-05,
1958
+ "loss": 0.4482,
1959
+ "step": 634
1960
+ },
1961
+ {
1962
+ "epoch": 1.75,
1963
+ "learning_rate": 2.1758960872103733e-05,
1964
+ "loss": 0.4348,
1965
+ "step": 636
1966
+ },
1967
+ {
1968
+ "epoch": 1.75,
1969
+ "learning_rate": 2.160736324742792e-05,
1970
+ "loss": 0.4422,
1971
+ "step": 638
1972
+ },
1973
+ {
1974
+ "epoch": 1.76,
1975
+ "learning_rate": 2.1455892608463824e-05,
1976
+ "loss": 0.4081,
1977
+ "step": 640
1978
+ },
1979
+ {
1980
+ "epoch": 1.76,
1981
+ "learning_rate": 2.1304554624727006e-05,
1982
+ "loss": 0.4191,
1983
+ "step": 642
1984
+ },
1985
+ {
1986
+ "epoch": 1.77,
1987
+ "learning_rate": 2.1153354960767785e-05,
1988
+ "loss": 0.4343,
1989
+ "step": 644
1990
+ },
1991
+ {
1992
+ "epoch": 1.77,
1993
+ "learning_rate": 2.1002299275959185e-05,
1994
+ "loss": 0.4211,
1995
+ "step": 646
1996
+ },
1997
+ {
1998
+ "epoch": 1.78,
1999
+ "learning_rate": 2.085139322428514e-05,
2000
+ "loss": 0.4481,
2001
+ "step": 648
2002
+ },
2003
+ {
2004
+ "epoch": 1.78,
2005
+ "learning_rate": 2.0700642454128815e-05,
2006
+ "loss": 0.4278,
2007
+ "step": 650
2008
+ },
2009
+ {
2010
+ "epoch": 1.79,
2011
+ "learning_rate": 2.055005260806125e-05,
2012
+ "loss": 0.4126,
2013
+ "step": 652
2014
+ },
2015
+ {
2016
+ "epoch": 1.8,
2017
+ "learning_rate": 2.0399629322630102e-05,
2018
+ "loss": 0.4152,
2019
+ "step": 654
2020
+ },
2021
+ {
2022
+ "epoch": 1.8,
2023
+ "learning_rate": 2.024937822814871e-05,
2024
+ "loss": 0.4351,
2025
+ "step": 656
2026
+ },
2027
+ {
2028
+ "epoch": 1.81,
2029
+ "learning_rate": 2.009930494848535e-05,
2030
+ "loss": 0.4102,
2031
+ "step": 658
2032
+ },
2033
+ {
2034
+ "epoch": 1.81,
2035
+ "learning_rate": 1.994941510085271e-05,
2036
+ "loss": 0.4274,
2037
+ "step": 660
2038
+ },
2039
+ {
2040
+ "epoch": 1.82,
2041
+ "learning_rate": 1.9799714295597657e-05,
2042
+ "loss": 0.4515,
2043
+ "step": 662
2044
+ },
2045
+ {
2046
+ "epoch": 1.82,
2047
+ "learning_rate": 1.9650208135991227e-05,
2048
+ "loss": 0.4101,
2049
+ "step": 664
2050
+ },
2051
+ {
2052
+ "epoch": 1.83,
2053
+ "learning_rate": 1.9500902218018946e-05,
2054
+ "loss": 0.4325,
2055
+ "step": 666
2056
+ },
2057
+ {
2058
+ "epoch": 1.83,
2059
+ "learning_rate": 1.935180213017131e-05,
2060
+ "loss": 0.4496,
2061
+ "step": 668
2062
+ },
2063
+ {
2064
+ "epoch": 1.84,
2065
+ "learning_rate": 1.9202913453234622e-05,
2066
+ "loss": 0.4326,
2067
+ "step": 670
2068
+ },
2069
+ {
2070
+ "epoch": 1.84,
2071
+ "learning_rate": 1.9054241760082142e-05,
2072
+ "loss": 0.4157,
2073
+ "step": 672
2074
+ },
2075
+ {
2076
+ "epoch": 1.85,
2077
+ "learning_rate": 1.8905792615465455e-05,
2078
+ "loss": 0.4022,
2079
+ "step": 674
2080
+ },
2081
+ {
2082
+ "epoch": 1.86,
2083
+ "learning_rate": 1.8757571575806213e-05,
2084
+ "loss": 0.4185,
2085
+ "step": 676
2086
+ },
2087
+ {
2088
+ "epoch": 1.86,
2089
+ "learning_rate": 1.8609584188988136e-05,
2090
+ "loss": 0.4238,
2091
+ "step": 678
2092
+ },
2093
+ {
2094
+ "epoch": 1.87,
2095
+ "learning_rate": 1.8461835994149362e-05,
2096
+ "loss": 0.4265,
2097
+ "step": 680
2098
+ },
2099
+ {
2100
+ "epoch": 1.87,
2101
+ "learning_rate": 1.8314332521475132e-05,
2102
+ "loss": 0.4182,
2103
+ "step": 682
2104
+ },
2105
+ {
2106
+ "epoch": 1.88,
2107
+ "learning_rate": 1.816707929199077e-05,
2108
+ "loss": 0.4323,
2109
+ "step": 684
2110
+ },
2111
+ {
2112
+ "epoch": 1.88,
2113
+ "learning_rate": 1.8020081817355066e-05,
2114
+ "loss": 0.4375,
2115
+ "step": 686
2116
+ },
2117
+ {
2118
+ "epoch": 1.89,
2119
+ "learning_rate": 1.7873345599653946e-05,
2120
+ "loss": 0.4281,
2121
+ "step": 688
2122
+ },
2123
+ {
2124
+ "epoch": 1.89,
2125
+ "learning_rate": 1.772687613119455e-05,
2126
+ "loss": 0.4274,
2127
+ "step": 690
2128
+ },
2129
+ {
2130
+ "epoch": 1.9,
2131
+ "learning_rate": 1.7580678894299618e-05,
2132
+ "loss": 0.4267,
2133
+ "step": 692
2134
+ },
2135
+ {
2136
+ "epoch": 1.91,
2137
+ "learning_rate": 1.7434759361102372e-05,
2138
+ "loss": 0.4402,
2139
+ "step": 694
2140
+ },
2141
+ {
2142
+ "epoch": 1.91,
2143
+ "learning_rate": 1.7289122993341596e-05,
2144
+ "loss": 0.4347,
2145
+ "step": 696
2146
+ },
2147
+ {
2148
+ "epoch": 1.92,
2149
+ "learning_rate": 1.714377524215725e-05,
2150
+ "loss": 0.4095,
2151
+ "step": 698
2152
+ },
2153
+ {
2154
+ "epoch": 1.92,
2155
+ "learning_rate": 1.6998721547886465e-05,
2156
+ "loss": 0.4267,
2157
+ "step": 700
2158
+ },
2159
+ {
2160
+ "epoch": 1.92,
2161
+ "eval_loss": 0.5412114262580872,
2162
+ "eval_runtime": 372.1815,
2163
+ "eval_samples_per_second": 26.369,
2164
+ "eval_steps_per_second": 0.207,
2165
+ "step": 700
2166
+ },
2167
+ {
2168
+ "epoch": 1.93,
2169
+ "learning_rate": 1.6853967339859842e-05,
2170
+ "loss": 0.4078,
2171
+ "step": 702
2172
+ },
2173
+ {
2174
+ "epoch": 1.93,
2175
+ "learning_rate": 1.6709518036198308e-05,
2176
+ "loss": 0.4204,
2177
+ "step": 704
2178
+ },
2179
+ {
2180
+ "epoch": 1.94,
2181
+ "learning_rate": 1.656537904361026e-05,
2182
+ "loss": 0.4402,
2183
+ "step": 706
2184
+ },
2185
+ {
2186
+ "epoch": 1.94,
2187
+ "learning_rate": 1.6421555757189205e-05,
2188
+ "loss": 0.4218,
2189
+ "step": 708
2190
+ },
2191
+ {
2192
+ "epoch": 1.95,
2193
+ "learning_rate": 1.627805356021187e-05,
2194
+ "loss": 0.428,
2195
+ "step": 710
2196
+ },
2197
+ {
2198
+ "epoch": 1.95,
2199
+ "learning_rate": 1.613487782393661e-05,
2200
+ "loss": 0.4337,
2201
+ "step": 712
2202
+ },
2203
+ {
2204
+ "epoch": 1.96,
2205
+ "learning_rate": 1.5992033907402482e-05,
2206
+ "loss": 0.4443,
2207
+ "step": 714
2208
+ },
2209
+ {
2210
+ "epoch": 1.97,
2211
+ "learning_rate": 1.5849527157228565e-05,
2212
+ "loss": 0.4197,
2213
+ "step": 716
2214
+ },
2215
+ {
2216
+ "epoch": 1.97,
2217
+ "learning_rate": 1.5707362907413868e-05,
2218
+ "loss": 0.4259,
2219
+ "step": 718
2220
+ },
2221
+ {
2222
+ "epoch": 1.98,
2223
+ "learning_rate": 1.5565546479137676e-05,
2224
+ "loss": 0.432,
2225
+ "step": 720
2226
+ },
2227
+ {
2228
+ "epoch": 1.98,
2229
+ "learning_rate": 1.5424083180560418e-05,
2230
+ "loss": 0.4272,
2231
+ "step": 722
2232
+ },
2233
+ {
2234
+ "epoch": 1.99,
2235
+ "learning_rate": 1.528297830662491e-05,
2236
+ "loss": 0.4076,
2237
+ "step": 724
2238
+ },
2239
+ {
2240
+ "epoch": 1.99,
2241
+ "learning_rate": 1.5142237138858221e-05,
2242
+ "loss": 0.4387,
2243
+ "step": 726
2244
+ },
2245
+ {
2246
+ "epoch": 2.0,
2247
+ "learning_rate": 1.5001864945173972e-05,
2248
+ "loss": 0.4271,
2249
+ "step": 728
2250
+ },
2251
+ {
2252
+ "epoch": 2.0,
2253
+ "learning_rate": 1.4861866979675154e-05,
2254
+ "loss": 0.4142,
2255
+ "step": 730
2256
+ },
2257
+ {
2258
+ "epoch": 2.01,
2259
+ "learning_rate": 1.4722248482457484e-05,
2260
+ "loss": 0.4221,
2261
+ "step": 732
2262
+ },
2263
+ {
2264
+ "epoch": 2.02,
2265
+ "learning_rate": 1.4583014679413242e-05,
2266
+ "loss": 0.4456,
2267
+ "step": 734
2268
+ },
2269
+ {
2270
+ "epoch": 2.02,
2271
+ "learning_rate": 1.4444170782035699e-05,
2272
+ "loss": 0.4366,
2273
+ "step": 736
2274
+ },
2275
+ {
2276
+ "epoch": 2.03,
2277
+ "learning_rate": 1.4305721987224008e-05,
2278
+ "loss": 0.4324,
2279
+ "step": 738
2280
+ },
2281
+ {
2282
+ "epoch": 2.03,
2283
+ "learning_rate": 1.4167673477088739e-05,
2284
+ "loss": 0.4263,
2285
+ "step": 740
2286
+ },
2287
+ {
2288
+ "epoch": 2.04,
2289
+ "learning_rate": 1.4030030418757892e-05,
2290
+ "loss": 0.44,
2291
+ "step": 742
2292
+ },
2293
+ {
2294
+ "epoch": 2.04,
2295
+ "learning_rate": 1.3892797964183449e-05,
2296
+ "loss": 0.4129,
2297
+ "step": 744
2298
+ },
2299
+ {
2300
+ "epoch": 2.05,
2301
+ "learning_rate": 1.3755981249948625e-05,
2302
+ "loss": 0.4423,
2303
+ "step": 746
2304
+ },
2305
+ {
2306
+ "epoch": 2.05,
2307
+ "learning_rate": 1.3619585397075505e-05,
2308
+ "loss": 0.3938,
2309
+ "step": 748
2310
+ },
2311
+ {
2312
+ "epoch": 2.06,
2313
+ "learning_rate": 1.3483615510833463e-05,
2314
+ "loss": 0.4129,
2315
+ "step": 750
2316
+ },
2317
+ {
2318
+ "epoch": 2.06,
2319
+ "learning_rate": 1.3348076680548021e-05,
2320
+ "loss": 0.4318,
2321
+ "step": 752
2322
+ },
2323
+ {
2324
+ "epoch": 2.07,
2325
+ "learning_rate": 1.3212973979410338e-05,
2326
+ "loss": 0.3967,
2327
+ "step": 754
2328
+ },
2329
+ {
2330
+ "epoch": 2.08,
2331
+ "learning_rate": 1.3078312464287353e-05,
2332
+ "loss": 0.4087,
2333
+ "step": 756
2334
+ },
2335
+ {
2336
+ "epoch": 2.08,
2337
+ "learning_rate": 1.2944097175532522e-05,
2338
+ "loss": 0.3822,
2339
+ "step": 758
2340
+ },
2341
+ {
2342
+ "epoch": 2.09,
2343
+ "learning_rate": 1.2810333136797134e-05,
2344
+ "loss": 0.3914,
2345
+ "step": 760
2346
+ },
2347
+ {
2348
+ "epoch": 2.09,
2349
+ "learning_rate": 1.267702535484225e-05,
2350
+ "loss": 0.3947,
2351
+ "step": 762
2352
+ },
2353
+ {
2354
+ "epoch": 2.1,
2355
+ "learning_rate": 1.2544178819351376e-05,
2356
+ "loss": 0.4057,
2357
+ "step": 764
2358
+ },
2359
+ {
2360
+ "epoch": 2.1,
2361
+ "learning_rate": 1.241179850274361e-05,
2362
+ "loss": 0.3814,
2363
+ "step": 766
2364
+ },
2365
+ {
2366
+ "epoch": 2.11,
2367
+ "learning_rate": 1.2279889359987604e-05,
2368
+ "loss": 0.3955,
2369
+ "step": 768
2370
+ },
2371
+ {
2372
+ "epoch": 2.11,
2373
+ "learning_rate": 1.2148456328416068e-05,
2374
+ "loss": 0.3999,
2375
+ "step": 770
2376
+ },
2377
+ {
2378
+ "epoch": 2.12,
2379
+ "learning_rate": 1.2017504327540935e-05,
2380
+ "loss": 0.3896,
2381
+ "step": 772
2382
+ },
2383
+ {
2384
+ "epoch": 2.12,
2385
+ "learning_rate": 1.1887038258869295e-05,
2386
+ "loss": 0.3685,
2387
+ "step": 774
2388
+ },
2389
+ {
2390
+ "epoch": 2.13,
2391
+ "learning_rate": 1.175706300571986e-05,
2392
+ "loss": 0.358,
2393
+ "step": 776
2394
+ },
2395
+ {
2396
+ "epoch": 2.14,
2397
+ "learning_rate": 1.162758343304023e-05,
2398
+ "loss": 0.379,
2399
+ "step": 778
2400
+ },
2401
+ {
2402
+ "epoch": 2.14,
2403
+ "learning_rate": 1.1498604387224798e-05,
2404
+ "loss": 0.3678,
2405
+ "step": 780
2406
+ },
2407
+ {
2408
+ "epoch": 2.15,
2409
+ "learning_rate": 1.1370130695933318e-05,
2410
+ "loss": 0.3808,
2411
+ "step": 782
2412
+ },
2413
+ {
2414
+ "epoch": 2.15,
2415
+ "learning_rate": 1.1242167167910216e-05,
2416
+ "loss": 0.3507,
2417
+ "step": 784
2418
+ },
2419
+ {
2420
+ "epoch": 2.16,
2421
+ "learning_rate": 1.1114718592804637e-05,
2422
+ "loss": 0.3664,
2423
+ "step": 786
2424
+ },
2425
+ {
2426
+ "epoch": 2.16,
2427
+ "learning_rate": 1.0987789740991143e-05,
2428
+ "loss": 0.3474,
2429
+ "step": 788
2430
+ },
2431
+ {
2432
+ "epoch": 2.17,
2433
+ "learning_rate": 1.0861385363391117e-05,
2434
+ "loss": 0.3316,
2435
+ "step": 790
2436
+ },
2437
+ {
2438
+ "epoch": 2.17,
2439
+ "learning_rate": 1.0735510191295025e-05,
2440
+ "loss": 0.3592,
2441
+ "step": 792
2442
+ },
2443
+ {
2444
+ "epoch": 2.18,
2445
+ "learning_rate": 1.0610168936185245e-05,
2446
+ "loss": 0.3532,
2447
+ "step": 794
2448
+ },
2449
+ {
2450
+ "epoch": 2.19,
2451
+ "learning_rate": 1.0485366289559765e-05,
2452
+ "loss": 0.3359,
2453
+ "step": 796
2454
+ },
2455
+ {
2456
+ "epoch": 2.19,
2457
+ "learning_rate": 1.0361106922756574e-05,
2458
+ "loss": 0.3443,
2459
+ "step": 798
2460
+ },
2461
+ {
2462
+ "epoch": 2.2,
2463
+ "learning_rate": 1.0237395486778775e-05,
2464
+ "loss": 0.3239,
2465
+ "step": 800
2466
+ },
2467
+ {
2468
+ "epoch": 2.2,
2469
+ "eval_loss": 0.5829917788505554,
2470
+ "eval_runtime": 372.4526,
2471
+ "eval_samples_per_second": 26.35,
2472
+ "eval_steps_per_second": 0.207,
2473
+ "step": 800
2474
+ },
2475
+ {
2476
+ "epoch": 2.2,
2477
+ "learning_rate": 1.011423661212057e-05,
2478
+ "loss": 0.3277,
2479
+ "step": 802
2480
+ },
2481
+ {
2482
+ "epoch": 2.21,
2483
+ "learning_rate": 9.991634908593864e-06,
2484
+ "loss": 0.3181,
2485
+ "step": 804
2486
+ },
2487
+ {
2488
+ "epoch": 2.21,
2489
+ "learning_rate": 9.869594965155784e-06,
2490
+ "loss": 0.3135,
2491
+ "step": 806
2492
+ },
2493
+ {
2494
+ "epoch": 2.22,
2495
+ "learning_rate": 9.748121349736892e-06,
2496
+ "loss": 0.3394,
2497
+ "step": 808
2498
+ },
2499
+ {
2500
+ "epoch": 2.22,
2501
+ "learning_rate": 9.627218609070189e-06,
2502
+ "loss": 0.3148,
2503
+ "step": 810
2504
+ },
2505
+ {
2506
+ "epoch": 2.23,
2507
+ "learning_rate": 9.506891268520943e-06,
2508
+ "loss": 0.3326,
2509
+ "step": 812
2510
+ },
2511
+ {
2512
+ "epoch": 2.23,
2513
+ "learning_rate": 9.387143831917336e-06,
2514
+ "loss": 0.3272,
2515
+ "step": 814
2516
+ },
2517
+ {
2518
+ "epoch": 2.24,
2519
+ "learning_rate": 9.26798078138186e-06,
2520
+ "loss": 0.306,
2521
+ "step": 816
2522
+ },
2523
+ {
2524
+ "epoch": 2.25,
2525
+ "learning_rate": 9.149406577163528e-06,
2526
+ "loss": 0.3267,
2527
+ "step": 818
2528
+ },
2529
+ {
2530
+ "epoch": 2.25,
2531
+ "learning_rate": 9.031425657470981e-06,
2532
+ "loss": 0.3185,
2533
+ "step": 820
2534
+ },
2535
+ {
2536
+ "epoch": 2.26,
2537
+ "learning_rate": 8.914042438306319e-06,
2538
+ "loss": 0.321,
2539
+ "step": 822
2540
+ },
2541
+ {
2542
+ "epoch": 2.26,
2543
+ "learning_rate": 8.797261313299845e-06,
2544
+ "loss": 0.3303,
2545
+ "step": 824
2546
+ },
2547
+ {
2548
+ "epoch": 2.27,
2549
+ "learning_rate": 8.681086653545606e-06,
2550
+ "loss": 0.3526,
2551
+ "step": 826
2552
+ },
2553
+ {
2554
+ "epoch": 2.27,
2555
+ "learning_rate": 8.565522807437743e-06,
2556
+ "loss": 0.3265,
2557
+ "step": 828
2558
+ },
2559
+ {
2560
+ "epoch": 2.28,
2561
+ "learning_rate": 8.450574100507807e-06,
2562
+ "loss": 0.3084,
2563
+ "step": 830
2564
+ },
2565
+ {
2566
+ "epoch": 2.28,
2567
+ "learning_rate": 8.336244835262778e-06,
2568
+ "loss": 0.3264,
2569
+ "step": 832
2570
+ },
2571
+ {
2572
+ "epoch": 2.29,
2573
+ "learning_rate": 8.222539291024078e-06,
2574
+ "loss": 0.3155,
2575
+ "step": 834
2576
+ },
2577
+ {
2578
+ "epoch": 2.3,
2579
+ "learning_rate": 8.109461723767384e-06,
2580
+ "loss": 0.3138,
2581
+ "step": 836
2582
+ },
2583
+ {
2584
+ "epoch": 2.3,
2585
+ "learning_rate": 7.9970163659633e-06,
2586
+ "loss": 0.3248,
2587
+ "step": 838
2588
+ },
2589
+ {
2590
+ "epoch": 2.31,
2591
+ "learning_rate": 7.885207426418959e-06,
2592
+ "loss": 0.3359,
2593
+ "step": 840
2594
+ },
2595
+ {
2596
+ "epoch": 2.31,
2597
+ "learning_rate": 7.7740390901205e-06,
2598
+ "loss": 0.3113,
2599
+ "step": 842
2600
+ },
2601
+ {
2602
+ "epoch": 2.32,
2603
+ "learning_rate": 7.663515518076416e-06,
2604
+ "loss": 0.3231,
2605
+ "step": 844
2606
+ },
2607
+ {
2608
+ "epoch": 2.32,
2609
+ "learning_rate": 7.5536408471617744e-06,
2610
+ "loss": 0.3195,
2611
+ "step": 846
2612
+ },
2613
+ {
2614
+ "epoch": 2.33,
2615
+ "learning_rate": 7.444419189963442e-06,
2616
+ "loss": 0.324,
2617
+ "step": 848
2618
+ },
2619
+ {
2620
+ "epoch": 2.33,
2621
+ "learning_rate": 7.335854634626074e-06,
2622
+ "loss": 0.323,
2623
+ "step": 850
2624
+ },
2625
+ {
2626
+ "epoch": 2.34,
2627
+ "learning_rate": 7.227951244699166e-06,
2628
+ "loss": 0.3383,
2629
+ "step": 852
2630
+ },
2631
+ {
2632
+ "epoch": 2.34,
2633
+ "learning_rate": 7.120713058984918e-06,
2634
+ "loss": 0.3475,
2635
+ "step": 854
2636
+ },
2637
+ {
2638
+ "epoch": 2.35,
2639
+ "learning_rate": 7.014144091387054e-06,
2640
+ "loss": 0.3262,
2641
+ "step": 856
2642
+ },
2643
+ {
2644
+ "epoch": 2.36,
2645
+ "learning_rate": 6.9082483307606245e-06,
2646
+ "loss": 0.3239,
2647
+ "step": 858
2648
+ },
2649
+ {
2650
+ "epoch": 2.36,
2651
+ "learning_rate": 6.803029740762648e-06,
2652
+ "loss": 0.3207,
2653
+ "step": 860
2654
+ },
2655
+ {
2656
+ "epoch": 2.37,
2657
+ "learning_rate": 6.698492259703807e-06,
2658
+ "loss": 0.3224,
2659
+ "step": 862
2660
+ },
2661
+ {
2662
+ "epoch": 2.37,
2663
+ "learning_rate": 6.5946398004010115e-06,
2664
+ "loss": 0.3119,
2665
+ "step": 864
2666
+ },
2667
+ {
2668
+ "epoch": 2.38,
2669
+ "learning_rate": 6.491476250030934e-06,
2670
+ "loss": 0.3178,
2671
+ "step": 866
2672
+ },
2673
+ {
2674
+ "epoch": 2.38,
2675
+ "learning_rate": 6.389005469984519e-06,
2676
+ "loss": 0.3105,
2677
+ "step": 868
2678
+ },
2679
+ {
2680
+ "epoch": 2.39,
2681
+ "learning_rate": 6.28723129572247e-06,
2682
+ "loss": 0.3417,
2683
+ "step": 870
2684
+ },
2685
+ {
2686
+ "epoch": 2.39,
2687
+ "learning_rate": 6.1861575366316895e-06,
2688
+ "loss": 0.3191,
2689
+ "step": 872
2690
+ },
2691
+ {
2692
+ "epoch": 2.4,
2693
+ "learning_rate": 6.08578797588264e-06,
2694
+ "loss": 0.3116,
2695
+ "step": 874
2696
+ },
2697
+ {
2698
+ "epoch": 2.4,
2699
+ "learning_rate": 5.986126370287826e-06,
2700
+ "loss": 0.3223,
2701
+ "step": 876
2702
+ },
2703
+ {
2704
+ "epoch": 2.41,
2705
+ "learning_rate": 5.887176450161097e-06,
2706
+ "loss": 0.2942,
2707
+ "step": 878
2708
+ },
2709
+ {
2710
+ "epoch": 2.42,
2711
+ "learning_rate": 5.788941919178078e-06,
2712
+ "loss": 0.3197,
2713
+ "step": 880
2714
+ },
2715
+ {
2716
+ "epoch": 2.42,
2717
+ "learning_rate": 5.691426454237531e-06,
2718
+ "loss": 0.313,
2719
+ "step": 882
2720
+ },
2721
+ {
2722
+ "epoch": 2.43,
2723
+ "learning_rate": 5.594633705323687e-06,
2724
+ "loss": 0.3172,
2725
+ "step": 884
2726
+ },
2727
+ {
2728
+ "epoch": 2.43,
2729
+ "learning_rate": 5.4985672953697e-06,
2730
+ "loss": 0.3444,
2731
+ "step": 886
2732
+ },
2733
+ {
2734
+ "epoch": 2.44,
2735
+ "learning_rate": 5.403230820121971e-06,
2736
+ "loss": 0.3197,
2737
+ "step": 888
2738
+ },
2739
+ {
2740
+ "epoch": 2.44,
2741
+ "learning_rate": 5.308627848005618e-06,
2742
+ "loss": 0.3405,
2743
+ "step": 890
2744
+ },
2745
+ {
2746
+ "epoch": 2.45,
2747
+ "learning_rate": 5.214761919990857e-06,
2748
+ "loss": 0.3252,
2749
+ "step": 892
2750
+ },
2751
+ {
2752
+ "epoch": 2.45,
2753
+ "learning_rate": 5.121636549460523e-06,
2754
+ "loss": 0.3173,
2755
+ "step": 894
2756
+ },
2757
+ {
2758
+ "epoch": 2.46,
2759
+ "learning_rate": 5.0292552220784995e-06,
2760
+ "loss": 0.3329,
2761
+ "step": 896
2762
+ },
2763
+ {
2764
+ "epoch": 2.47,
2765
+ "learning_rate": 4.9376213956593116e-06,
2766
+ "loss": 0.3148,
2767
+ "step": 898
2768
+ },
2769
+ {
2770
+ "epoch": 2.47,
2771
+ "learning_rate": 4.846738500038667e-06,
2772
+ "loss": 0.3171,
2773
+ "step": 900
2774
+ },
2775
+ {
2776
+ "epoch": 2.47,
2777
+ "eval_loss": 0.594931423664093,
2778
+ "eval_runtime": 372.4124,
2779
+ "eval_samples_per_second": 26.353,
2780
+ "eval_steps_per_second": 0.207,
2781
+ "step": 900
2782
+ },
2783
+ {
2784
+ "epoch": 2.48,
2785
+ "learning_rate": 4.756609936945069e-06,
2786
+ "loss": 0.3116,
2787
+ "step": 902
2788
+ },
2789
+ {
2790
+ "epoch": 2.48,
2791
+ "learning_rate": 4.667239079872532e-06,
2792
+ "loss": 0.3202,
2793
+ "step": 904
2794
+ },
2795
+ {
2796
+ "epoch": 2.49,
2797
+ "learning_rate": 4.578629273954263e-06,
2798
+ "loss": 0.3431,
2799
+ "step": 906
2800
+ },
2801
+ {
2802
+ "epoch": 2.49,
2803
+ "learning_rate": 4.490783835837479e-06,
2804
+ "loss": 0.3089,
2805
+ "step": 908
2806
+ },
2807
+ {
2808
+ "epoch": 2.5,
2809
+ "learning_rate": 4.40370605355929e-06,
2810
+ "loss": 0.323,
2811
+ "step": 910
2812
+ },
2813
+ {
2814
+ "epoch": 2.5,
2815
+ "learning_rate": 4.317399186423574e-06,
2816
+ "loss": 0.3153,
2817
+ "step": 912
2818
+ },
2819
+ {
2820
+ "epoch": 2.51,
2821
+ "learning_rate": 4.231866464879014e-06,
2822
+ "loss": 0.318,
2823
+ "step": 914
2824
+ },
2825
+ {
2826
+ "epoch": 2.51,
2827
+ "learning_rate": 4.147111090398193e-06,
2828
+ "loss": 0.3284,
2829
+ "step": 916
2830
+ },
2831
+ {
2832
+ "epoch": 2.52,
2833
+ "learning_rate": 4.063136235357745e-06,
2834
+ "loss": 0.3277,
2835
+ "step": 918
2836
+ },
2837
+ {
2838
+ "epoch": 2.53,
2839
+ "learning_rate": 3.979945042919603e-06,
2840
+ "loss": 0.3316,
2841
+ "step": 920
2842
+ },
2843
+ {
2844
+ "epoch": 2.53,
2845
+ "learning_rate": 3.897540626913393e-06,
2846
+ "loss": 0.3272,
2847
+ "step": 922
2848
+ },
2849
+ {
2850
+ "epoch": 2.54,
2851
+ "learning_rate": 3.815926071719828e-06,
2852
+ "loss": 0.3392,
2853
+ "step": 924
2854
+ },
2855
+ {
2856
+ "epoch": 2.54,
2857
+ "learning_rate": 3.735104432155309e-06,
2858
+ "loss": 0.3284,
2859
+ "step": 926
2860
+ },
2861
+ {
2862
+ "epoch": 2.55,
2863
+ "learning_rate": 3.655078733357567e-06,
2864
+ "loss": 0.3141,
2865
+ "step": 928
2866
+ },
2867
+ {
2868
+ "epoch": 2.55,
2869
+ "learning_rate": 3.5758519706724086e-06,
2870
+ "loss": 0.3233,
2871
+ "step": 930
2872
+ },
2873
+ {
2874
+ "epoch": 2.56,
2875
+ "learning_rate": 3.497427109541651e-06,
2876
+ "loss": 0.3226,
2877
+ "step": 932
2878
+ },
2879
+ {
2880
+ "epoch": 2.56,
2881
+ "learning_rate": 3.4198070853920768e-06,
2882
+ "loss": 0.3259,
2883
+ "step": 934
2884
+ },
2885
+ {
2886
+ "epoch": 2.57,
2887
+ "learning_rate": 3.3429948035255733e-06,
2888
+ "loss": 0.3292,
2889
+ "step": 936
2890
+ },
2891
+ {
2892
+ "epoch": 2.58,
2893
+ "learning_rate": 3.266993139010438e-06,
2894
+ "loss": 0.3285,
2895
+ "step": 938
2896
+ },
2897
+ {
2898
+ "epoch": 2.58,
2899
+ "learning_rate": 3.191804936573681e-06,
2900
+ "loss": 0.3301,
2901
+ "step": 940
2902
+ },
2903
+ {
2904
+ "epoch": 2.59,
2905
+ "learning_rate": 3.1174330104946055e-06,
2906
+ "loss": 0.3446,
2907
+ "step": 942
2908
+ },
2909
+ {
2910
+ "epoch": 2.59,
2911
+ "learning_rate": 3.0438801444994587e-06,
2912
+ "loss": 0.3361,
2913
+ "step": 944
2914
+ },
2915
+ {
2916
+ "epoch": 2.6,
2917
+ "learning_rate": 2.9711490916572354e-06,
2918
+ "loss": 0.3387,
2919
+ "step": 946
2920
+ },
2921
+ {
2922
+ "epoch": 2.6,
2923
+ "learning_rate": 2.8992425742766145e-06,
2924
+ "loss": 0.3555,
2925
+ "step": 948
2926
+ },
2927
+ {
2928
+ "epoch": 2.61,
2929
+ "learning_rate": 2.828163283804097e-06,
2930
+ "loss": 0.328,
2931
+ "step": 950
2932
+ },
2933
+ {
2934
+ "epoch": 2.61,
2935
+ "learning_rate": 2.7579138807232283e-06,
2936
+ "loss": 0.3501,
2937
+ "step": 952
2938
+ },
2939
+ {
2940
+ "epoch": 2.62,
2941
+ "learning_rate": 2.6884969944550533e-06,
2942
+ "loss": 0.343,
2943
+ "step": 954
2944
+ },
2945
+ {
2946
+ "epoch": 2.62,
2947
+ "learning_rate": 2.6199152232596753e-06,
2948
+ "loss": 0.3288,
2949
+ "step": 956
2950
+ },
2951
+ {
2952
+ "epoch": 2.63,
2953
+ "learning_rate": 2.552171134138992e-06,
2954
+ "loss": 0.3377,
2955
+ "step": 958
2956
+ },
2957
+ {
2958
+ "epoch": 2.64,
2959
+ "learning_rate": 2.4852672627406564e-06,
2960
+ "loss": 0.3251,
2961
+ "step": 960
2962
+ },
2963
+ {
2964
+ "epoch": 2.64,
2965
+ "learning_rate": 2.41920611326312e-06,
2966
+ "loss": 0.3353,
2967
+ "step": 962
2968
+ },
2969
+ {
2970
+ "epoch": 2.65,
2971
+ "learning_rate": 2.3539901583619185e-06,
2972
+ "loss": 0.3315,
2973
+ "step": 964
2974
+ },
2975
+ {
2976
+ "epoch": 2.65,
2977
+ "learning_rate": 2.2896218390571546e-06,
2978
+ "loss": 0.3475,
2979
+ "step": 966
2980
+ },
2981
+ {
2982
+ "epoch": 2.66,
2983
+ "learning_rate": 2.2261035646420764e-06,
2984
+ "loss": 0.3494,
2985
+ "step": 968
2986
+ },
2987
+ {
2988
+ "epoch": 2.66,
2989
+ "learning_rate": 2.1634377125929166e-06,
2990
+ "loss": 0.3437,
2991
+ "step": 970
2992
+ },
2993
+ {
2994
+ "epoch": 2.67,
2995
+ "learning_rate": 2.10162662847993e-06,
2996
+ "loss": 0.3336,
2997
+ "step": 972
2998
+ },
2999
+ {
3000
+ "epoch": 2.67,
3001
+ "learning_rate": 2.040672625879575e-06,
3002
+ "loss": 0.3407,
3003
+ "step": 974
3004
+ },
3005
+ {
3006
+ "epoch": 2.68,
3007
+ "learning_rate": 1.980577986287907e-06,
3008
+ "loss": 0.336,
3009
+ "step": 976
3010
+ },
3011
+ {
3012
+ "epoch": 2.68,
3013
+ "learning_rate": 1.921344959035218e-06,
3014
+ "loss": 0.3192,
3015
+ "step": 978
3016
+ },
3017
+ {
3018
+ "epoch": 2.69,
3019
+ "learning_rate": 1.8629757612017961e-06,
3020
+ "loss": 0.3173,
3021
+ "step": 980
3022
+ },
3023
+ {
3024
+ "epoch": 2.7,
3025
+ "learning_rate": 1.8054725775349973e-06,
3026
+ "loss": 0.3434,
3027
+ "step": 982
3028
+ },
3029
+ {
3030
+ "epoch": 2.7,
3031
+ "learning_rate": 1.748837560367425e-06,
3032
+ "loss": 0.3383,
3033
+ "step": 984
3034
+ },
3035
+ {
3036
+ "epoch": 2.71,
3037
+ "learning_rate": 1.693072829536385e-06,
3038
+ "loss": 0.3597,
3039
+ "step": 986
3040
+ },
3041
+ {
3042
+ "epoch": 2.71,
3043
+ "learning_rate": 1.6381804723045485e-06,
3044
+ "loss": 0.342,
3045
+ "step": 988
3046
+ },
3047
+ {
3048
+ "epoch": 2.72,
3049
+ "learning_rate": 1.5841625432818057e-06,
3050
+ "loss": 0.3521,
3051
+ "step": 990
3052
+ },
3053
+ {
3054
+ "epoch": 2.72,
3055
+ "learning_rate": 1.5310210643483813e-06,
3056
+ "loss": 0.3277,
3057
+ "step": 992
3058
+ },
3059
+ {
3060
+ "epoch": 2.73,
3061
+ "learning_rate": 1.4787580245791631e-06,
3062
+ "loss": 0.3322,
3063
+ "step": 994
3064
+ },
3065
+ {
3066
+ "epoch": 2.73,
3067
+ "learning_rate": 1.427375380169213e-06,
3068
+ "loss": 0.3552,
3069
+ "step": 996
3070
+ },
3071
+ {
3072
+ "epoch": 2.74,
3073
+ "learning_rate": 1.3768750543605851e-06,
3074
+ "loss": 0.3639,
3075
+ "step": 998
3076
+ },
3077
+ {
3078
+ "epoch": 2.75,
3079
+ "learning_rate": 1.3272589373703236e-06,
3080
+ "loss": 0.3437,
3081
+ "step": 1000
3082
+ },
3083
+ {
3084
+ "epoch": 2.75,
3085
+ "eval_loss": 0.5839625000953674,
3086
+ "eval_runtime": 372.3615,
3087
+ "eval_samples_per_second": 26.356,
3088
+ "eval_steps_per_second": 0.207,
3089
+ "step": 1000
3090
+ },
3091
+ {
3092
+ "epoch": 2.75,
3093
+ "learning_rate": 1.2785288863197186e-06,
3094
+ "loss": 0.3565,
3095
+ "step": 1002
3096
+ },
3097
+ {
3098
+ "epoch": 2.76,
3099
+ "learning_rate": 1.2306867251647813e-06,
3100
+ "loss": 0.3187,
3101
+ "step": 1004
3102
+ },
3103
+ {
3104
+ "epoch": 2.76,
3105
+ "learning_rate": 1.1837342446279903e-06,
3106
+ "loss": 0.3312,
3107
+ "step": 1006
3108
+ },
3109
+ {
3110
+ "epoch": 2.77,
3111
+ "learning_rate": 1.1376732021312508e-06,
3112
+ "loss": 0.3494,
3113
+ "step": 1008
3114
+ },
3115
+ {
3116
+ "epoch": 2.77,
3117
+ "learning_rate": 1.0925053217301278e-06,
3118
+ "loss": 0.3399,
3119
+ "step": 1010
3120
+ },
3121
+ {
3122
+ "epoch": 2.78,
3123
+ "learning_rate": 1.048232294049309e-06,
3124
+ "loss": 0.3484,
3125
+ "step": 1012
3126
+ },
3127
+ {
3128
+ "epoch": 2.78,
3129
+ "learning_rate": 1.004855776219313e-06,
3130
+ "loss": 0.3534,
3131
+ "step": 1014
3132
+ },
3133
+ {
3134
+ "epoch": 2.79,
3135
+ "learning_rate": 9.623773918144897e-07,
3136
+ "loss": 0.3353,
3137
+ "step": 1016
3138
+ },
3139
+ {
3140
+ "epoch": 2.79,
3141
+ "learning_rate": 9.2079873079223e-07,
3142
+ "loss": 0.3362,
3143
+ "step": 1018
3144
+ },
3145
+ {
3146
+ "epoch": 2.8,
3147
+ "learning_rate": 8.801213494334464e-07,
3148
+ "loss": 0.3509,
3149
+ "step": 1020
3150
+ },
3151
+ {
3152
+ "epoch": 2.81,
3153
+ "learning_rate": 8.403467702843643e-07,
3154
+ "loss": 0.3282,
3155
+ "step": 1022
3156
+ },
3157
+ {
3158
+ "epoch": 2.81,
3159
+ "learning_rate": 8.014764820994807e-07,
3160
+ "loss": 0.3454,
3161
+ "step": 1024
3162
+ },
3163
+ {
3164
+ "epoch": 2.82,
3165
+ "learning_rate": 7.635119397858603e-07,
3166
+ "loss": 0.357,
3167
+ "step": 1026
3168
+ },
3169
+ {
3170
+ "epoch": 2.82,
3171
+ "learning_rate": 7.264545643486997e-07,
3172
+ "loss": 0.339,
3173
+ "step": 1028
3174
+ },
3175
+ {
3176
+ "epoch": 2.83,
3177
+ "learning_rate": 6.903057428381127e-07,
3178
+ "loss": 0.3577,
3179
+ "step": 1030
3180
+ },
3181
+ {
3182
+ "epoch": 2.83,
3183
+ "learning_rate": 6.550668282972155e-07,
3184
+ "loss": 0.3627,
3185
+ "step": 1032
3186
+ },
3187
+ {
3188
+ "epoch": 2.84,
3189
+ "learning_rate": 6.207391397115042e-07,
3190
+ "loss": 0.3554,
3191
+ "step": 1034
3192
+ },
3193
+ {
3194
+ "epoch": 2.84,
3195
+ "learning_rate": 5.8732396195946e-07,
3196
+ "loss": 0.3415,
3197
+ "step": 1036
3198
+ },
3199
+ {
3200
+ "epoch": 2.85,
3201
+ "learning_rate": 5.548225457644662e-07,
3202
+ "loss": 0.3328,
3203
+ "step": 1038
3204
+ },
3205
+ {
3206
+ "epoch": 2.86,
3207
+ "learning_rate": 5.232361076480035e-07,
3208
+ "loss": 0.3398,
3209
+ "step": 1040
3210
+ },
3211
+ {
3212
+ "epoch": 2.86,
3213
+ "learning_rate": 4.925658298840979e-07,
3214
+ "loss": 0.351,
3215
+ "step": 1042
3216
+ },
3217
+ {
3218
+ "epoch": 2.87,
3219
+ "learning_rate": 4.628128604550808e-07,
3220
+ "loss": 0.3402,
3221
+ "step": 1044
3222
+ },
3223
+ {
3224
+ "epoch": 2.87,
3225
+ "learning_rate": 4.3397831300862057e-07,
3226
+ "loss": 0.3467,
3227
+ "step": 1046
3228
+ },
3229
+ {
3230
+ "epoch": 2.88,
3231
+ "learning_rate": 4.060632668160286e-07,
3232
+ "loss": 0.3579,
3233
+ "step": 1048
3234
+ },
3235
+ {
3236
+ "epoch": 2.88,
3237
+ "learning_rate": 3.790687667318743e-07,
3238
+ "loss": 0.3643,
3239
+ "step": 1050
3240
+ },
3241
+ {
3242
+ "epoch": 2.89,
3243
+ "learning_rate": 3.5299582315487525e-07,
3244
+ "loss": 0.3507,
3245
+ "step": 1052
3246
+ },
3247
+ {
3248
+ "epoch": 2.89,
3249
+ "learning_rate": 3.278454119900576e-07,
3250
+ "loss": 0.349,
3251
+ "step": 1054
3252
+ },
3253
+ {
3254
+ "epoch": 2.9,
3255
+ "learning_rate": 3.0361847461226335e-07,
3256
+ "loss": 0.3555,
3257
+ "step": 1056
3258
+ },
3259
+ {
3260
+ "epoch": 2.9,
3261
+ "learning_rate": 2.8031591783088953e-07,
3262
+ "loss": 0.3669,
3263
+ "step": 1058
3264
+ },
3265
+ {
3266
+ "epoch": 2.91,
3267
+ "learning_rate": 2.579386138559514e-07,
3268
+ "loss": 0.3605,
3269
+ "step": 1060
3270
+ },
3271
+ {
3272
+ "epoch": 2.92,
3273
+ "learning_rate": 2.3648740026543658e-07,
3274
+ "loss": 0.3432,
3275
+ "step": 1062
3276
+ },
3277
+ {
3278
+ "epoch": 2.92,
3279
+ "learning_rate": 2.1596307997396037e-07,
3280
+ "loss": 0.3622,
3281
+ "step": 1064
3282
+ },
3283
+ {
3284
+ "epoch": 2.93,
3285
+ "learning_rate": 1.963664212027011e-07,
3286
+ "loss": 0.3309,
3287
+ "step": 1066
3288
+ },
3289
+ {
3290
+ "epoch": 2.93,
3291
+ "learning_rate": 1.7769815745066475e-07,
3292
+ "loss": 0.3553,
3293
+ "step": 1068
3294
+ },
3295
+ {
3296
+ "epoch": 2.94,
3297
+ "learning_rate": 1.5995898746720695e-07,
3298
+ "loss": 0.3621,
3299
+ "step": 1070
3300
+ },
3301
+ {
3302
+ "epoch": 2.94,
3303
+ "learning_rate": 1.4314957522589546e-07,
3304
+ "loss": 0.3567,
3305
+ "step": 1072
3306
+ },
3307
+ {
3308
+ "epoch": 2.95,
3309
+ "learning_rate": 1.2727054989965236e-07,
3310
+ "loss": 0.37,
3311
+ "step": 1074
3312
+ },
3313
+ {
3314
+ "epoch": 2.95,
3315
+ "learning_rate": 1.1232250583720072e-07,
3316
+ "loss": 0.3657,
3317
+ "step": 1076
3318
+ },
3319
+ {
3320
+ "epoch": 2.96,
3321
+ "learning_rate": 9.830600254082944e-08,
3322
+ "loss": 0.3695,
3323
+ "step": 1078
3324
+ },
3325
+ {
3326
+ "epoch": 2.96,
3327
+ "learning_rate": 8.522156464543518e-08,
3328
+ "loss": 0.353,
3329
+ "step": 1080
3330
+ },
3331
+ {
3332
+ "epoch": 2.97,
3333
+ "learning_rate": 7.306968189890184e-08,
3334
+ "loss": 0.3591,
3335
+ "step": 1082
3336
+ },
3337
+ {
3338
+ "epoch": 2.98,
3339
+ "learning_rate": 6.185080914375974e-08,
3340
+ "loss": 0.3629,
3341
+ "step": 1084
3342
+ },
3343
+ {
3344
+ "epoch": 2.98,
3345
+ "learning_rate": 5.156536630015474e-08,
3346
+ "loss": 0.3568,
3347
+ "step": 1086
3348
+ },
3349
+ {
3350
+ "epoch": 2.99,
3351
+ "learning_rate": 4.221373835014975e-08,
3352
+ "loss": 0.359,
3353
+ "step": 1088
3354
+ },
3355
+ {
3356
+ "epoch": 2.99,
3357
+ "learning_rate": 3.379627532329732e-08,
3358
+ "loss": 0.3634,
3359
+ "step": 1090
3360
+ },
3361
+ {
3362
+ "epoch": 3.0,
3363
+ "learning_rate": 2.631329228355017e-08,
3364
+ "loss": 0.359,
3365
+ "step": 1092
3366
+ },
3367
+ {
3368
+ "epoch": 3.0,
3369
+ "step": 1092,
3370
+ "total_flos": 1201247503384576.0,
3371
+ "train_loss": 0.44882795285610927,
3372
+ "train_runtime": 126278.8523,
3373
+ "train_samples_per_second": 4.43,
3374
+ "train_steps_per_second": 0.009
3375
+ }
3376
+ ],
3377
+ "logging_steps": 2,
3378
+ "max_steps": 1092,
3379
+ "num_input_tokens_seen": 0,
3380
+ "num_train_epochs": 3,
3381
+ "save_steps": 1000,
3382
+ "total_flos": 1201247503384576.0,
3383
+ "train_batch_size": 16,
3384
+ "trial_name": null,
3385
+ "trial_params": null
3386
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52b9fe041bdb039614b51285d7db97506f84fd1f47e6ef342117ffe60bee9c01
3
+ size 6520