n0w0f commited on
Commit
432a13c
·
verified ·
1 Parent(s): 1e728e9

Upload checkpoint

Browse files
config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "BertForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": null,
8
+ "classifier_dropout": null,
9
+ "dtype": "float32",
10
+ "eos_token_id": null,
11
+ "gradient_checkpointing": false,
12
+ "hidden_act": "gelu",
13
+ "hidden_dropout_prob": 0.1,
14
+ "hidden_size": 512,
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": 3072,
17
+ "is_decoder": false,
18
+ "layer_norm_eps": 1e-12,
19
+ "max_position_embeddings": 1024,
20
+ "model_type": "bert",
21
+ "num_attention_heads": 8,
22
+ "num_hidden_layers": 4,
23
+ "pad_token_id": 0,
24
+ "position_embedding_type": "absolute",
25
+ "tie_word_embeddings": true,
26
+ "transformers_version": "5.1.0",
27
+ "type_vocab_size": 2,
28
+ "use_cache": false,
29
+ "vocab_size": 30522
30
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98cf1c6faa88811f59aa13013ca31bf1edebffb25af2e9dac06b2bcca3d7c147
3
+ size 133031496
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9568e4ed0e11cf9638bc4e6b2dbecafbc7bdb8ba1944256687e54d34d4e1ee9
3
+ size 266109515
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:adc224a15468b738f4e9ff1b3aafc868a0413ecd0a33f645e5656a7ef437e49b
3
+ size 14645
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78f20cc4cad0ee75aa7f2126f8cc833dff4647791fd2f3757830b765541674c0
3
+ size 1465
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "[BOS]",
4
+ "cls_token": "[CLS]",
5
+ "eos_token": "[EOS]",
6
+ "mask_token": "[MASK]",
7
+ "model_max_length": 1000000000000000019884624838656,
8
+ "pad_token": "[PAD]",
9
+ "sep_token": "[SEP]",
10
+ "tokenizer_class": "TokenizersBackend",
11
+ "unk_token": "[UNK]"
12
+ }
trainer_state.json ADDED
@@ -0,0 +1,1234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 4000,
3
+ "best_metric": 0.19092191755771637,
4
+ "best_model_checkpoint": "/home/flytekit/n0w0f/data/mattext_ckpt/results/2026-02-05/18-01-14/pretrain/checkpoints/robocrys_rep_test-pretrain/checkpoint-4000",
5
+ "epoch": 8.602150537634408,
6
+ "eval_steps": 50,
7
+ "global_step": 4000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.10752688172043011,
14
+ "grad_norm": 1.1888866424560547,
15
+ "learning_rate": 0.00019957849462365592,
16
+ "loss": 5.97920654296875,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.10752688172043011,
21
+ "eval_loss": 4.124914646148682,
22
+ "eval_runtime": 60.5178,
23
+ "eval_samples_per_second": 314.023,
24
+ "eval_steps_per_second": 39.261,
25
+ "step": 50
26
+ },
27
+ {
28
+ "epoch": 0.21505376344086022,
29
+ "grad_norm": 0.9824994802474976,
30
+ "learning_rate": 0.00019914838709677422,
31
+ "loss": 3.916483154296875,
32
+ "step": 100
33
+ },
34
+ {
35
+ "epoch": 0.21505376344086022,
36
+ "eval_loss": 3.675534248352051,
37
+ "eval_runtime": 61.1234,
38
+ "eval_samples_per_second": 310.912,
39
+ "eval_steps_per_second": 38.872,
40
+ "step": 100
41
+ },
42
+ {
43
+ "epoch": 0.3225806451612903,
44
+ "grad_norm": 0.867065966129303,
45
+ "learning_rate": 0.00019871827956989248,
46
+ "loss": 3.620672302246094,
47
+ "step": 150
48
+ },
49
+ {
50
+ "epoch": 0.3225806451612903,
51
+ "eval_loss": 3.4746599197387695,
52
+ "eval_runtime": 61.4793,
53
+ "eval_samples_per_second": 309.112,
54
+ "eval_steps_per_second": 38.647,
55
+ "step": 150
56
+ },
57
+ {
58
+ "epoch": 0.43010752688172044,
59
+ "grad_norm": 1.192267894744873,
60
+ "learning_rate": 0.00019828817204301075,
61
+ "loss": 3.471976013183594,
62
+ "step": 200
63
+ },
64
+ {
65
+ "epoch": 0.43010752688172044,
66
+ "eval_loss": 3.353644371032715,
67
+ "eval_runtime": 60.5187,
68
+ "eval_samples_per_second": 314.019,
69
+ "eval_steps_per_second": 39.261,
70
+ "step": 200
71
+ },
72
+ {
73
+ "epoch": 0.5376344086021505,
74
+ "grad_norm": 1.0798981189727783,
75
+ "learning_rate": 0.00019785806451612904,
76
+ "loss": 3.360224609375,
77
+ "step": 250
78
+ },
79
+ {
80
+ "epoch": 0.5376344086021505,
81
+ "eval_loss": 3.247636079788208,
82
+ "eval_runtime": 61.527,
83
+ "eval_samples_per_second": 308.873,
84
+ "eval_steps_per_second": 38.617,
85
+ "step": 250
86
+ },
87
+ {
88
+ "epoch": 0.6451612903225806,
89
+ "grad_norm": 1.3051457405090332,
90
+ "learning_rate": 0.00019742795698924733,
91
+ "loss": 3.262052307128906,
92
+ "step": 300
93
+ },
94
+ {
95
+ "epoch": 0.6451612903225806,
96
+ "eval_loss": 3.1502654552459717,
97
+ "eval_runtime": 60.999,
98
+ "eval_samples_per_second": 311.546,
99
+ "eval_steps_per_second": 38.951,
100
+ "step": 300
101
+ },
102
+ {
103
+ "epoch": 0.7526881720430108,
104
+ "grad_norm": 1.1396135091781616,
105
+ "learning_rate": 0.0001969978494623656,
106
+ "loss": 3.225200500488281,
107
+ "step": 350
108
+ },
109
+ {
110
+ "epoch": 0.7526881720430108,
111
+ "eval_loss": 3.094292163848877,
112
+ "eval_runtime": 61.381,
113
+ "eval_samples_per_second": 309.607,
114
+ "eval_steps_per_second": 38.709,
115
+ "step": 350
116
+ },
117
+ {
118
+ "epoch": 0.8602150537634409,
119
+ "grad_norm": 1.0816289186477661,
120
+ "learning_rate": 0.0001965677419354839,
121
+ "loss": 3.1344537353515625,
122
+ "step": 400
123
+ },
124
+ {
125
+ "epoch": 0.8602150537634409,
126
+ "eval_loss": 3.0037944316864014,
127
+ "eval_runtime": 61.1417,
128
+ "eval_samples_per_second": 310.819,
129
+ "eval_steps_per_second": 38.861,
130
+ "step": 400
131
+ },
132
+ {
133
+ "epoch": 0.967741935483871,
134
+ "grad_norm": 1.220457673072815,
135
+ "learning_rate": 0.00019613763440860216,
136
+ "loss": 3.024658203125,
137
+ "step": 450
138
+ },
139
+ {
140
+ "epoch": 0.967741935483871,
141
+ "eval_loss": 2.9253640174865723,
142
+ "eval_runtime": 61.6823,
143
+ "eval_samples_per_second": 308.095,
144
+ "eval_steps_per_second": 38.52,
145
+ "step": 450
146
+ },
147
+ {
148
+ "epoch": 1.075268817204301,
149
+ "grad_norm": 1.18031644821167,
150
+ "learning_rate": 0.00019570752688172045,
151
+ "loss": 2.9539215087890627,
152
+ "step": 500
153
+ },
154
+ {
155
+ "epoch": 1.075268817204301,
156
+ "eval_loss": 2.827315092086792,
157
+ "eval_runtime": 64.027,
158
+ "eval_samples_per_second": 296.812,
159
+ "eval_steps_per_second": 37.109,
160
+ "step": 500
161
+ },
162
+ {
163
+ "epoch": 1.1827956989247312,
164
+ "grad_norm": 1.4481481313705444,
165
+ "learning_rate": 0.00019527741935483872,
166
+ "loss": 2.8536431884765623,
167
+ "step": 550
168
+ },
169
+ {
170
+ "epoch": 1.1827956989247312,
171
+ "eval_loss": 2.6743366718292236,
172
+ "eval_runtime": 60.9092,
173
+ "eval_samples_per_second": 312.005,
174
+ "eval_steps_per_second": 39.009,
175
+ "step": 550
176
+ },
177
+ {
178
+ "epoch": 1.2903225806451613,
179
+ "grad_norm": 1.5985803604125977,
180
+ "learning_rate": 0.00019484731182795698,
181
+ "loss": 2.7353704833984374,
182
+ "step": 600
183
+ },
184
+ {
185
+ "epoch": 1.2903225806451613,
186
+ "eval_loss": 2.4861812591552734,
187
+ "eval_runtime": 61.6826,
188
+ "eval_samples_per_second": 308.093,
189
+ "eval_steps_per_second": 38.52,
190
+ "step": 600
191
+ },
192
+ {
193
+ "epoch": 1.3978494623655915,
194
+ "grad_norm": 2.046145439147949,
195
+ "learning_rate": 0.00019441720430107528,
196
+ "loss": 2.464430084228516,
197
+ "step": 650
198
+ },
199
+ {
200
+ "epoch": 1.3978494623655915,
201
+ "eval_loss": 2.0265886783599854,
202
+ "eval_runtime": 61.2709,
203
+ "eval_samples_per_second": 310.164,
204
+ "eval_steps_per_second": 38.779,
205
+ "step": 650
206
+ },
207
+ {
208
+ "epoch": 1.5053763440860215,
209
+ "grad_norm": 1.8674232959747314,
210
+ "learning_rate": 0.00019398709677419354,
211
+ "loss": 1.9112973022460937,
212
+ "step": 700
213
+ },
214
+ {
215
+ "epoch": 1.5053763440860215,
216
+ "eval_loss": 1.3678908348083496,
217
+ "eval_runtime": 62.2031,
218
+ "eval_samples_per_second": 305.515,
219
+ "eval_steps_per_second": 38.197,
220
+ "step": 700
221
+ },
222
+ {
223
+ "epoch": 1.6129032258064515,
224
+ "grad_norm": 1.708408236503601,
225
+ "learning_rate": 0.00019355698924731184,
226
+ "loss": 1.4241523742675781,
227
+ "step": 750
228
+ },
229
+ {
230
+ "epoch": 1.6129032258064515,
231
+ "eval_loss": 1.0675994157791138,
232
+ "eval_runtime": 62.2,
233
+ "eval_samples_per_second": 305.53,
234
+ "eval_steps_per_second": 38.199,
235
+ "step": 750
236
+ },
237
+ {
238
+ "epoch": 1.7204301075268817,
239
+ "grad_norm": 1.6592656373977661,
240
+ "learning_rate": 0.00019312688172043013,
241
+ "loss": 1.2252975463867188,
242
+ "step": 800
243
+ },
244
+ {
245
+ "epoch": 1.7204301075268817,
246
+ "eval_loss": 0.9175282716751099,
247
+ "eval_runtime": 61.3094,
248
+ "eval_samples_per_second": 309.969,
249
+ "eval_steps_per_second": 38.754,
250
+ "step": 800
251
+ },
252
+ {
253
+ "epoch": 1.827956989247312,
254
+ "grad_norm": 1.2984247207641602,
255
+ "learning_rate": 0.0001926967741935484,
256
+ "loss": 1.0399230194091797,
257
+ "step": 850
258
+ },
259
+ {
260
+ "epoch": 1.827956989247312,
261
+ "eval_loss": 0.8346064686775208,
262
+ "eval_runtime": 61.1605,
263
+ "eval_samples_per_second": 310.724,
264
+ "eval_steps_per_second": 38.849,
265
+ "step": 850
266
+ },
267
+ {
268
+ "epoch": 1.935483870967742,
269
+ "grad_norm": 1.1744712591171265,
270
+ "learning_rate": 0.0001922666666666667,
271
+ "loss": 0.9568134307861328,
272
+ "step": 900
273
+ },
274
+ {
275
+ "epoch": 1.935483870967742,
276
+ "eval_loss": 0.7724924087524414,
277
+ "eval_runtime": 62.2824,
278
+ "eval_samples_per_second": 305.126,
279
+ "eval_steps_per_second": 38.149,
280
+ "step": 900
281
+ },
282
+ {
283
+ "epoch": 2.043010752688172,
284
+ "grad_norm": 1.2494049072265625,
285
+ "learning_rate": 0.00019183655913978495,
286
+ "loss": 0.8979853820800782,
287
+ "step": 950
288
+ },
289
+ {
290
+ "epoch": 2.043010752688172,
291
+ "eval_loss": 0.7325491905212402,
292
+ "eval_runtime": 62.8935,
293
+ "eval_samples_per_second": 302.161,
294
+ "eval_steps_per_second": 37.778,
295
+ "step": 950
296
+ },
297
+ {
298
+ "epoch": 2.150537634408602,
299
+ "grad_norm": 1.0687495470046997,
300
+ "learning_rate": 0.00019140645161290322,
301
+ "loss": 0.8724540710449219,
302
+ "step": 1000
303
+ },
304
+ {
305
+ "epoch": 2.150537634408602,
306
+ "eval_loss": 0.6943864822387695,
307
+ "eval_runtime": 64.2005,
308
+ "eval_samples_per_second": 296.01,
309
+ "eval_steps_per_second": 37.009,
310
+ "step": 1000
311
+ },
312
+ {
313
+ "epoch": 2.258064516129032,
314
+ "grad_norm": 0.9108296036720276,
315
+ "learning_rate": 0.0001909763440860215,
316
+ "loss": 0.8106794738769532,
317
+ "step": 1050
318
+ },
319
+ {
320
+ "epoch": 2.258064516129032,
321
+ "eval_loss": 0.666123628616333,
322
+ "eval_runtime": 60.9142,
323
+ "eval_samples_per_second": 311.98,
324
+ "eval_steps_per_second": 39.006,
325
+ "step": 1050
326
+ },
327
+ {
328
+ "epoch": 2.3655913978494625,
329
+ "grad_norm": 0.8529163002967834,
330
+ "learning_rate": 0.00019054623655913978,
331
+ "loss": 0.7816014862060547,
332
+ "step": 1100
333
+ },
334
+ {
335
+ "epoch": 2.3655913978494625,
336
+ "eval_loss": 0.6435992121696472,
337
+ "eval_runtime": 61.9346,
338
+ "eval_samples_per_second": 306.84,
339
+ "eval_steps_per_second": 38.363,
340
+ "step": 1100
341
+ },
342
+ {
343
+ "epoch": 2.4731182795698925,
344
+ "grad_norm": 0.9023746848106384,
345
+ "learning_rate": 0.00019011612903225807,
346
+ "loss": 0.7448858642578124,
347
+ "step": 1150
348
+ },
349
+ {
350
+ "epoch": 2.4731182795698925,
351
+ "eval_loss": 0.6147477626800537,
352
+ "eval_runtime": 60.7037,
353
+ "eval_samples_per_second": 313.062,
354
+ "eval_steps_per_second": 39.141,
355
+ "step": 1150
356
+ },
357
+ {
358
+ "epoch": 2.5806451612903225,
359
+ "grad_norm": 0.7893891930580139,
360
+ "learning_rate": 0.00018968602150537636,
361
+ "loss": 0.7744358062744141,
362
+ "step": 1200
363
+ },
364
+ {
365
+ "epoch": 2.5806451612903225,
366
+ "eval_loss": 0.6008749604225159,
367
+ "eval_runtime": 62.0421,
368
+ "eval_samples_per_second": 306.308,
369
+ "eval_steps_per_second": 38.297,
370
+ "step": 1200
371
+ },
372
+ {
373
+ "epoch": 2.688172043010753,
374
+ "grad_norm": 0.8543435335159302,
375
+ "learning_rate": 0.00018925591397849463,
376
+ "loss": 0.698813705444336,
377
+ "step": 1250
378
+ },
379
+ {
380
+ "epoch": 2.688172043010753,
381
+ "eval_loss": 0.5843669176101685,
382
+ "eval_runtime": 61.7236,
383
+ "eval_samples_per_second": 307.889,
384
+ "eval_steps_per_second": 38.494,
385
+ "step": 1250
386
+ },
387
+ {
388
+ "epoch": 2.795698924731183,
389
+ "grad_norm": 0.862782895565033,
390
+ "learning_rate": 0.00018882580645161292,
391
+ "loss": 0.7231275939941406,
392
+ "step": 1300
393
+ },
394
+ {
395
+ "epoch": 2.795698924731183,
396
+ "eval_loss": 0.560819149017334,
397
+ "eval_runtime": 61.272,
398
+ "eval_samples_per_second": 310.158,
399
+ "eval_steps_per_second": 38.778,
400
+ "step": 1300
401
+ },
402
+ {
403
+ "epoch": 2.903225806451613,
404
+ "grad_norm": 0.8126527667045593,
405
+ "learning_rate": 0.0001883956989247312,
406
+ "loss": 0.6607036590576172,
407
+ "step": 1350
408
+ },
409
+ {
410
+ "epoch": 2.903225806451613,
411
+ "eval_loss": 0.5523199439048767,
412
+ "eval_runtime": 61.41,
413
+ "eval_samples_per_second": 309.461,
414
+ "eval_steps_per_second": 38.691,
415
+ "step": 1350
416
+ },
417
+ {
418
+ "epoch": 3.010752688172043,
419
+ "grad_norm": 0.8788714408874512,
420
+ "learning_rate": 0.00018796559139784945,
421
+ "loss": 0.658017349243164,
422
+ "step": 1400
423
+ },
424
+ {
425
+ "epoch": 3.010752688172043,
426
+ "eval_loss": 0.5504087805747986,
427
+ "eval_runtime": 61.2893,
428
+ "eval_samples_per_second": 310.07,
429
+ "eval_steps_per_second": 38.767,
430
+ "step": 1400
431
+ },
432
+ {
433
+ "epoch": 3.118279569892473,
434
+ "grad_norm": 0.8354722857475281,
435
+ "learning_rate": 0.00018753548387096775,
436
+ "loss": 0.6500599670410157,
437
+ "step": 1450
438
+ },
439
+ {
440
+ "epoch": 3.118279569892473,
441
+ "eval_loss": 0.5395110845565796,
442
+ "eval_runtime": 60.5063,
443
+ "eval_samples_per_second": 314.083,
444
+ "eval_steps_per_second": 39.269,
445
+ "step": 1450
446
+ },
447
+ {
448
+ "epoch": 3.225806451612903,
449
+ "grad_norm": 0.8122305870056152,
450
+ "learning_rate": 0.000187105376344086,
451
+ "loss": 0.6230792999267578,
452
+ "step": 1500
453
+ },
454
+ {
455
+ "epoch": 3.225806451612903,
456
+ "eval_loss": 0.5187473297119141,
457
+ "eval_runtime": 60.7322,
458
+ "eval_samples_per_second": 312.915,
459
+ "eval_steps_per_second": 39.123,
460
+ "step": 1500
461
+ },
462
+ {
463
+ "epoch": 3.3333333333333335,
464
+ "grad_norm": 0.673494815826416,
465
+ "learning_rate": 0.0001866752688172043,
466
+ "loss": 0.6118016052246094,
467
+ "step": 1550
468
+ },
469
+ {
470
+ "epoch": 3.3333333333333335,
471
+ "eval_loss": 0.5081239938735962,
472
+ "eval_runtime": 60.5862,
473
+ "eval_samples_per_second": 313.669,
474
+ "eval_steps_per_second": 39.217,
475
+ "step": 1550
476
+ },
477
+ {
478
+ "epoch": 3.4408602150537635,
479
+ "grad_norm": 0.8055212497711182,
480
+ "learning_rate": 0.0001862451612903226,
481
+ "loss": 0.6122843170166016,
482
+ "step": 1600
483
+ },
484
+ {
485
+ "epoch": 3.4408602150537635,
486
+ "eval_loss": 0.49499744176864624,
487
+ "eval_runtime": 60.6568,
488
+ "eval_samples_per_second": 313.304,
489
+ "eval_steps_per_second": 39.171,
490
+ "step": 1600
491
+ },
492
+ {
493
+ "epoch": 3.5483870967741935,
494
+ "grad_norm": 0.7935542464256287,
495
+ "learning_rate": 0.00018581505376344087,
496
+ "loss": 0.5825344467163086,
497
+ "step": 1650
498
+ },
499
+ {
500
+ "epoch": 3.5483870967741935,
501
+ "eval_loss": 0.48452192544937134,
502
+ "eval_runtime": 60.5763,
503
+ "eval_samples_per_second": 313.72,
504
+ "eval_steps_per_second": 39.223,
505
+ "step": 1650
506
+ },
507
+ {
508
+ "epoch": 3.6559139784946235,
509
+ "grad_norm": 0.6395400166511536,
510
+ "learning_rate": 0.00018538494623655916,
511
+ "loss": 0.5727723693847656,
512
+ "step": 1700
513
+ },
514
+ {
515
+ "epoch": 3.6559139784946235,
516
+ "eval_loss": 0.4738766551017761,
517
+ "eval_runtime": 60.5051,
518
+ "eval_samples_per_second": 314.089,
519
+ "eval_steps_per_second": 39.269,
520
+ "step": 1700
521
+ },
522
+ {
523
+ "epoch": 3.763440860215054,
524
+ "grad_norm": 0.6544663906097412,
525
+ "learning_rate": 0.00018495483870967742,
526
+ "loss": 0.5858316421508789,
527
+ "step": 1750
528
+ },
529
+ {
530
+ "epoch": 3.763440860215054,
531
+ "eval_loss": 0.4562221169471741,
532
+ "eval_runtime": 60.4697,
533
+ "eval_samples_per_second": 314.273,
534
+ "eval_steps_per_second": 39.292,
535
+ "step": 1750
536
+ },
537
+ {
538
+ "epoch": 3.870967741935484,
539
+ "grad_norm": 0.773256778717041,
540
+ "learning_rate": 0.00018452473118279572,
541
+ "loss": 0.5555976867675781,
542
+ "step": 1800
543
+ },
544
+ {
545
+ "epoch": 3.870967741935484,
546
+ "eval_loss": 0.4462752342224121,
547
+ "eval_runtime": 61.139,
548
+ "eval_samples_per_second": 310.833,
549
+ "eval_steps_per_second": 38.862,
550
+ "step": 1800
551
+ },
552
+ {
553
+ "epoch": 3.978494623655914,
554
+ "grad_norm": 0.6679997444152832,
555
+ "learning_rate": 0.00018409462365591398,
556
+ "loss": 0.5079600143432618,
557
+ "step": 1850
558
+ },
559
+ {
560
+ "epoch": 3.978494623655914,
561
+ "eval_loss": 0.43978169560432434,
562
+ "eval_runtime": 60.5103,
563
+ "eval_samples_per_second": 314.062,
564
+ "eval_steps_per_second": 39.266,
565
+ "step": 1850
566
+ },
567
+ {
568
+ "epoch": 4.086021505376344,
569
+ "grad_norm": 0.7930998206138611,
570
+ "learning_rate": 0.00018366451612903225,
571
+ "loss": 0.5580390548706055,
572
+ "step": 1900
573
+ },
574
+ {
575
+ "epoch": 4.086021505376344,
576
+ "eval_loss": 0.4352206587791443,
577
+ "eval_runtime": 60.8357,
578
+ "eval_samples_per_second": 312.382,
579
+ "eval_steps_per_second": 39.056,
580
+ "step": 1900
581
+ },
582
+ {
583
+ "epoch": 4.193548387096774,
584
+ "grad_norm": 0.6607942581176758,
585
+ "learning_rate": 0.00018323440860215054,
586
+ "loss": 0.49173324584960937,
587
+ "step": 1950
588
+ },
589
+ {
590
+ "epoch": 4.193548387096774,
591
+ "eval_loss": 0.4238659143447876,
592
+ "eval_runtime": 60.9872,
593
+ "eval_samples_per_second": 311.606,
594
+ "eval_steps_per_second": 38.959,
595
+ "step": 1950
596
+ },
597
+ {
598
+ "epoch": 4.301075268817204,
599
+ "grad_norm": 0.6287643909454346,
600
+ "learning_rate": 0.00018280430107526884,
601
+ "loss": 0.4687882232666016,
602
+ "step": 2000
603
+ },
604
+ {
605
+ "epoch": 4.301075268817204,
606
+ "eval_loss": 0.4168907403945923,
607
+ "eval_runtime": 61.005,
608
+ "eval_samples_per_second": 311.515,
609
+ "eval_steps_per_second": 38.948,
610
+ "step": 2000
611
+ },
612
+ {
613
+ "epoch": 4.408602150537634,
614
+ "grad_norm": 0.6433095932006836,
615
+ "learning_rate": 0.0001823741935483871,
616
+ "loss": 0.4763982009887695,
617
+ "step": 2050
618
+ },
619
+ {
620
+ "epoch": 4.408602150537634,
621
+ "eval_loss": 0.4120262861251831,
622
+ "eval_runtime": 61.5507,
623
+ "eval_samples_per_second": 308.753,
624
+ "eval_steps_per_second": 38.602,
625
+ "step": 2050
626
+ },
627
+ {
628
+ "epoch": 4.516129032258064,
629
+ "grad_norm": 0.76325523853302,
630
+ "learning_rate": 0.0001819440860215054,
631
+ "loss": 0.5169943237304687,
632
+ "step": 2100
633
+ },
634
+ {
635
+ "epoch": 4.516129032258064,
636
+ "eval_loss": 0.40777090191841125,
637
+ "eval_runtime": 61.9659,
638
+ "eval_samples_per_second": 306.685,
639
+ "eval_steps_per_second": 38.344,
640
+ "step": 2100
641
+ },
642
+ {
643
+ "epoch": 4.623655913978495,
644
+ "grad_norm": 0.7534022331237793,
645
+ "learning_rate": 0.00018151397849462366,
646
+ "loss": 0.4840876770019531,
647
+ "step": 2150
648
+ },
649
+ {
650
+ "epoch": 4.623655913978495,
651
+ "eval_loss": 0.396854966878891,
652
+ "eval_runtime": 61.4429,
653
+ "eval_samples_per_second": 309.295,
654
+ "eval_steps_per_second": 38.67,
655
+ "step": 2150
656
+ },
657
+ {
658
+ "epoch": 4.731182795698925,
659
+ "grad_norm": 0.688862144947052,
660
+ "learning_rate": 0.00018108387096774195,
661
+ "loss": 0.46516273498535154,
662
+ "step": 2200
663
+ },
664
+ {
665
+ "epoch": 4.731182795698925,
666
+ "eval_loss": 0.38546594977378845,
667
+ "eval_runtime": 60.8637,
668
+ "eval_samples_per_second": 312.239,
669
+ "eval_steps_per_second": 39.038,
670
+ "step": 2200
671
+ },
672
+ {
673
+ "epoch": 4.838709677419355,
674
+ "grad_norm": 0.5328208208084106,
675
+ "learning_rate": 0.00018065376344086022,
676
+ "loss": 0.5028326034545898,
677
+ "step": 2250
678
+ },
679
+ {
680
+ "epoch": 4.838709677419355,
681
+ "eval_loss": 0.37445569038391113,
682
+ "eval_runtime": 61.5819,
683
+ "eval_samples_per_second": 308.597,
684
+ "eval_steps_per_second": 38.583,
685
+ "step": 2250
686
+ },
687
+ {
688
+ "epoch": 4.946236559139785,
689
+ "grad_norm": 0.5857045650482178,
690
+ "learning_rate": 0.00018022365591397848,
691
+ "loss": 0.43645286560058594,
692
+ "step": 2300
693
+ },
694
+ {
695
+ "epoch": 4.946236559139785,
696
+ "eval_loss": 0.3690737187862396,
697
+ "eval_runtime": 61.4895,
698
+ "eval_samples_per_second": 309.061,
699
+ "eval_steps_per_second": 38.641,
700
+ "step": 2300
701
+ },
702
+ {
703
+ "epoch": 5.053763440860215,
704
+ "grad_norm": 0.6344749331474304,
705
+ "learning_rate": 0.00017979354838709678,
706
+ "loss": 0.42147178649902345,
707
+ "step": 2350
708
+ },
709
+ {
710
+ "epoch": 5.053763440860215,
711
+ "eval_loss": 0.3570445775985718,
712
+ "eval_runtime": 62.1748,
713
+ "eval_samples_per_second": 305.654,
714
+ "eval_steps_per_second": 38.215,
715
+ "step": 2350
716
+ },
717
+ {
718
+ "epoch": 5.161290322580645,
719
+ "grad_norm": 0.6610215306282043,
720
+ "learning_rate": 0.00017936344086021507,
721
+ "loss": 0.4157654571533203,
722
+ "step": 2400
723
+ },
724
+ {
725
+ "epoch": 5.161290322580645,
726
+ "eval_loss": 0.3497065603733063,
727
+ "eval_runtime": 61.6389,
728
+ "eval_samples_per_second": 308.312,
729
+ "eval_steps_per_second": 38.547,
730
+ "step": 2400
731
+ },
732
+ {
733
+ "epoch": 5.268817204301075,
734
+ "grad_norm": 0.5334368348121643,
735
+ "learning_rate": 0.00017893333333333336,
736
+ "loss": 0.4012648391723633,
737
+ "step": 2450
738
+ },
739
+ {
740
+ "epoch": 5.268817204301075,
741
+ "eval_loss": 0.33196908235549927,
742
+ "eval_runtime": 64.4623,
743
+ "eval_samples_per_second": 294.808,
744
+ "eval_steps_per_second": 36.859,
745
+ "step": 2450
746
+ },
747
+ {
748
+ "epoch": 5.376344086021505,
749
+ "grad_norm": 0.7559072971343994,
750
+ "learning_rate": 0.00017850322580645163,
751
+ "loss": 0.4343834686279297,
752
+ "step": 2500
753
+ },
754
+ {
755
+ "epoch": 5.376344086021505,
756
+ "eval_loss": 0.31756916642189026,
757
+ "eval_runtime": 64.0899,
758
+ "eval_samples_per_second": 296.521,
759
+ "eval_steps_per_second": 37.073,
760
+ "step": 2500
761
+ },
762
+ {
763
+ "epoch": 5.483870967741936,
764
+ "grad_norm": 0.6970711946487427,
765
+ "learning_rate": 0.0001780731182795699,
766
+ "loss": 0.3609016799926758,
767
+ "step": 2550
768
+ },
769
+ {
770
+ "epoch": 5.483870967741936,
771
+ "eval_loss": 0.3129482567310333,
772
+ "eval_runtime": 64.2007,
773
+ "eval_samples_per_second": 296.009,
774
+ "eval_steps_per_second": 37.009,
775
+ "step": 2550
776
+ },
777
+ {
778
+ "epoch": 5.591397849462366,
779
+ "grad_norm": 0.7393150329589844,
780
+ "learning_rate": 0.0001776430107526882,
781
+ "loss": 0.36085220336914064,
782
+ "step": 2600
783
+ },
784
+ {
785
+ "epoch": 5.591397849462366,
786
+ "eval_loss": 0.29907363653182983,
787
+ "eval_runtime": 64.2974,
788
+ "eval_samples_per_second": 295.564,
789
+ "eval_steps_per_second": 36.953,
790
+ "step": 2600
791
+ },
792
+ {
793
+ "epoch": 5.698924731182796,
794
+ "grad_norm": 0.6760246157646179,
795
+ "learning_rate": 0.00017721290322580645,
796
+ "loss": 0.3354073715209961,
797
+ "step": 2650
798
+ },
799
+ {
800
+ "epoch": 5.698924731182796,
801
+ "eval_loss": 0.28903692960739136,
802
+ "eval_runtime": 64.2379,
803
+ "eval_samples_per_second": 295.838,
804
+ "eval_steps_per_second": 36.988,
805
+ "step": 2650
806
+ },
807
+ {
808
+ "epoch": 5.806451612903226,
809
+ "grad_norm": 0.6342934370040894,
810
+ "learning_rate": 0.00017678279569892472,
811
+ "loss": 0.33487789154052733,
812
+ "step": 2700
813
+ },
814
+ {
815
+ "epoch": 5.806451612903226,
816
+ "eval_loss": 0.2763662040233612,
817
+ "eval_runtime": 63.0262,
818
+ "eval_samples_per_second": 301.525,
819
+ "eval_steps_per_second": 37.699,
820
+ "step": 2700
821
+ },
822
+ {
823
+ "epoch": 5.913978494623656,
824
+ "grad_norm": 0.6288059949874878,
825
+ "learning_rate": 0.00017635268817204301,
826
+ "loss": 0.3166103744506836,
827
+ "step": 2750
828
+ },
829
+ {
830
+ "epoch": 5.913978494623656,
831
+ "eval_loss": 0.27043381333351135,
832
+ "eval_runtime": 63.0792,
833
+ "eval_samples_per_second": 301.272,
834
+ "eval_steps_per_second": 37.667,
835
+ "step": 2750
836
+ },
837
+ {
838
+ "epoch": 6.021505376344086,
839
+ "grad_norm": 0.8228830695152283,
840
+ "learning_rate": 0.0001759225806451613,
841
+ "loss": 0.3166475486755371,
842
+ "step": 2800
843
+ },
844
+ {
845
+ "epoch": 6.021505376344086,
846
+ "eval_loss": 0.26023828983306885,
847
+ "eval_runtime": 64.4666,
848
+ "eval_samples_per_second": 294.788,
849
+ "eval_steps_per_second": 36.856,
850
+ "step": 2800
851
+ },
852
+ {
853
+ "epoch": 6.129032258064516,
854
+ "grad_norm": 0.6261463165283203,
855
+ "learning_rate": 0.0001754924731182796,
856
+ "loss": 0.30168416976928714,
857
+ "step": 2850
858
+ },
859
+ {
860
+ "epoch": 6.129032258064516,
861
+ "eval_loss": 0.2530518174171448,
862
+ "eval_runtime": 63.8775,
863
+ "eval_samples_per_second": 297.507,
864
+ "eval_steps_per_second": 37.196,
865
+ "step": 2850
866
+ },
867
+ {
868
+ "epoch": 6.236559139784946,
869
+ "grad_norm": 0.7265720367431641,
870
+ "learning_rate": 0.00017506236559139787,
871
+ "loss": 0.29341196060180663,
872
+ "step": 2900
873
+ },
874
+ {
875
+ "epoch": 6.236559139784946,
876
+ "eval_loss": 0.24442243576049805,
877
+ "eval_runtime": 63.2991,
878
+ "eval_samples_per_second": 300.226,
879
+ "eval_steps_per_second": 37.536,
880
+ "step": 2900
881
+ },
882
+ {
883
+ "epoch": 6.344086021505376,
884
+ "grad_norm": 0.5499133467674255,
885
+ "learning_rate": 0.00017463225806451613,
886
+ "loss": 0.2850730323791504,
887
+ "step": 2950
888
+ },
889
+ {
890
+ "epoch": 6.344086021505376,
891
+ "eval_loss": 0.237361341714859,
892
+ "eval_runtime": 64.5725,
893
+ "eval_samples_per_second": 294.305,
894
+ "eval_steps_per_second": 36.796,
895
+ "step": 2950
896
+ },
897
+ {
898
+ "epoch": 6.451612903225806,
899
+ "grad_norm": 0.7466527223587036,
900
+ "learning_rate": 0.00017420215053763442,
901
+ "loss": 0.2737441635131836,
902
+ "step": 3000
903
+ },
904
+ {
905
+ "epoch": 6.451612903225806,
906
+ "eval_loss": 0.22867611050605774,
907
+ "eval_runtime": 64.8912,
908
+ "eval_samples_per_second": 292.86,
909
+ "eval_steps_per_second": 36.615,
910
+ "step": 3000
911
+ },
912
+ {
913
+ "epoch": 6.559139784946236,
914
+ "grad_norm": 0.605771005153656,
915
+ "learning_rate": 0.0001737720430107527,
916
+ "loss": 0.26982501983642576,
917
+ "step": 3050
918
+ },
919
+ {
920
+ "epoch": 6.559139784946236,
921
+ "eval_loss": 0.22686000168323517,
922
+ "eval_runtime": 64.8566,
923
+ "eval_samples_per_second": 293.016,
924
+ "eval_steps_per_second": 36.635,
925
+ "step": 3050
926
+ },
927
+ {
928
+ "epoch": 6.666666666666667,
929
+ "grad_norm": 0.6927595138549805,
930
+ "learning_rate": 0.00017334193548387096,
931
+ "loss": 0.2592777633666992,
932
+ "step": 3100
933
+ },
934
+ {
935
+ "epoch": 6.666666666666667,
936
+ "eval_loss": 0.22359216213226318,
937
+ "eval_runtime": 64.9559,
938
+ "eval_samples_per_second": 292.568,
939
+ "eval_steps_per_second": 36.579,
940
+ "step": 3100
941
+ },
942
+ {
943
+ "epoch": 6.774193548387097,
944
+ "grad_norm": 0.6070519685745239,
945
+ "learning_rate": 0.00017291182795698925,
946
+ "loss": 0.2539858436584473,
947
+ "step": 3150
948
+ },
949
+ {
950
+ "epoch": 6.774193548387097,
951
+ "eval_loss": 0.22382962703704834,
952
+ "eval_runtime": 64.9172,
953
+ "eval_samples_per_second": 292.742,
954
+ "eval_steps_per_second": 36.6,
955
+ "step": 3150
956
+ },
957
+ {
958
+ "epoch": 6.881720430107527,
959
+ "grad_norm": 0.7206361889839172,
960
+ "learning_rate": 0.00017248172043010754,
961
+ "loss": 0.2550803184509277,
962
+ "step": 3200
963
+ },
964
+ {
965
+ "epoch": 6.881720430107527,
966
+ "eval_loss": 0.22055239975452423,
967
+ "eval_runtime": 65.5818,
968
+ "eval_samples_per_second": 289.775,
969
+ "eval_steps_per_second": 36.23,
970
+ "step": 3200
971
+ },
972
+ {
973
+ "epoch": 6.989247311827957,
974
+ "grad_norm": 0.6855896711349487,
975
+ "learning_rate": 0.00017205161290322584,
976
+ "loss": 0.2432615852355957,
977
+ "step": 3250
978
+ },
979
+ {
980
+ "epoch": 6.989247311827957,
981
+ "eval_loss": 0.21467819809913635,
982
+ "eval_runtime": 66.2905,
983
+ "eval_samples_per_second": 286.677,
984
+ "eval_steps_per_second": 35.842,
985
+ "step": 3250
986
+ },
987
+ {
988
+ "epoch": 7.096774193548387,
989
+ "grad_norm": 0.5612008571624756,
990
+ "learning_rate": 0.0001716215053763441,
991
+ "loss": 0.24562849044799806,
992
+ "step": 3300
993
+ },
994
+ {
995
+ "epoch": 7.096774193548387,
996
+ "eval_loss": 0.21375121176242828,
997
+ "eval_runtime": 66.0151,
998
+ "eval_samples_per_second": 287.874,
999
+ "eval_steps_per_second": 35.992,
1000
+ "step": 3300
1001
+ },
1002
+ {
1003
+ "epoch": 7.204301075268817,
1004
+ "grad_norm": 0.7433006763458252,
1005
+ "learning_rate": 0.00017119139784946237,
1006
+ "loss": 0.2393852424621582,
1007
+ "step": 3350
1008
+ },
1009
+ {
1010
+ "epoch": 7.204301075268817,
1011
+ "eval_loss": 0.20871323347091675,
1012
+ "eval_runtime": 61.9563,
1013
+ "eval_samples_per_second": 306.732,
1014
+ "eval_steps_per_second": 38.35,
1015
+ "step": 3350
1016
+ },
1017
+ {
1018
+ "epoch": 7.311827956989247,
1019
+ "grad_norm": 0.6491153836250305,
1020
+ "learning_rate": 0.00017076129032258066,
1021
+ "loss": 0.24959787368774414,
1022
+ "step": 3400
1023
+ },
1024
+ {
1025
+ "epoch": 7.311827956989247,
1026
+ "eval_loss": 0.21120016276836395,
1027
+ "eval_runtime": 60.6864,
1028
+ "eval_samples_per_second": 313.151,
1029
+ "eval_steps_per_second": 39.152,
1030
+ "step": 3400
1031
+ },
1032
+ {
1033
+ "epoch": 7.419354838709677,
1034
+ "grad_norm": 0.5620025992393494,
1035
+ "learning_rate": 0.00017033118279569893,
1036
+ "loss": 0.2320168685913086,
1037
+ "step": 3450
1038
+ },
1039
+ {
1040
+ "epoch": 7.419354838709677,
1041
+ "eval_loss": 0.20816229283809662,
1042
+ "eval_runtime": 61.036,
1043
+ "eval_samples_per_second": 311.357,
1044
+ "eval_steps_per_second": 38.928,
1045
+ "step": 3450
1046
+ },
1047
+ {
1048
+ "epoch": 7.526881720430108,
1049
+ "grad_norm": 0.6183444261550903,
1050
+ "learning_rate": 0.00016990107526881722,
1051
+ "loss": 0.2322225570678711,
1052
+ "step": 3500
1053
+ },
1054
+ {
1055
+ "epoch": 7.526881720430108,
1056
+ "eval_loss": 0.20497609674930573,
1057
+ "eval_runtime": 60.5328,
1058
+ "eval_samples_per_second": 313.946,
1059
+ "eval_steps_per_second": 39.251,
1060
+ "step": 3500
1061
+ },
1062
+ {
1063
+ "epoch": 7.634408602150538,
1064
+ "grad_norm": 0.5328448414802551,
1065
+ "learning_rate": 0.00016947096774193548,
1066
+ "loss": 0.23304037094116212,
1067
+ "step": 3550
1068
+ },
1069
+ {
1070
+ "epoch": 7.634408602150538,
1071
+ "eval_loss": 0.20321960747241974,
1072
+ "eval_runtime": 62.1711,
1073
+ "eval_samples_per_second": 305.672,
1074
+ "eval_steps_per_second": 38.217,
1075
+ "step": 3550
1076
+ },
1077
+ {
1078
+ "epoch": 7.741935483870968,
1079
+ "grad_norm": 0.5241938829421997,
1080
+ "learning_rate": 0.00016904086021505378,
1081
+ "loss": 0.22476686477661134,
1082
+ "step": 3600
1083
+ },
1084
+ {
1085
+ "epoch": 7.741935483870968,
1086
+ "eval_loss": 0.2034502625465393,
1087
+ "eval_runtime": 64.8022,
1088
+ "eval_samples_per_second": 293.262,
1089
+ "eval_steps_per_second": 36.665,
1090
+ "step": 3600
1091
+ },
1092
+ {
1093
+ "epoch": 7.849462365591398,
1094
+ "grad_norm": 0.5440294742584229,
1095
+ "learning_rate": 0.00016861075268817207,
1096
+ "loss": 0.227796630859375,
1097
+ "step": 3650
1098
+ },
1099
+ {
1100
+ "epoch": 7.849462365591398,
1101
+ "eval_loss": 0.20562465488910675,
1102
+ "eval_runtime": 65.1543,
1103
+ "eval_samples_per_second": 291.677,
1104
+ "eval_steps_per_second": 36.467,
1105
+ "step": 3650
1106
+ },
1107
+ {
1108
+ "epoch": 7.956989247311828,
1109
+ "grad_norm": 0.5037738680839539,
1110
+ "learning_rate": 0.00016818064516129034,
1111
+ "loss": 0.23125221252441405,
1112
+ "step": 3700
1113
+ },
1114
+ {
1115
+ "epoch": 7.956989247311828,
1116
+ "eval_loss": 0.20223356783390045,
1117
+ "eval_runtime": 65.5561,
1118
+ "eval_samples_per_second": 289.889,
1119
+ "eval_steps_per_second": 36.244,
1120
+ "step": 3700
1121
+ },
1122
+ {
1123
+ "epoch": 8.064516129032258,
1124
+ "grad_norm": 0.843550980091095,
1125
+ "learning_rate": 0.0001677505376344086,
1126
+ "loss": 0.2236369514465332,
1127
+ "step": 3750
1128
+ },
1129
+ {
1130
+ "epoch": 8.064516129032258,
1131
+ "eval_loss": 0.19716867804527283,
1132
+ "eval_runtime": 66.4534,
1133
+ "eval_samples_per_second": 285.975,
1134
+ "eval_steps_per_second": 35.754,
1135
+ "step": 3750
1136
+ },
1137
+ {
1138
+ "epoch": 8.172043010752688,
1139
+ "grad_norm": 0.5562386512756348,
1140
+ "learning_rate": 0.0001673204301075269,
1141
+ "loss": 0.22720510482788087,
1142
+ "step": 3800
1143
+ },
1144
+ {
1145
+ "epoch": 8.172043010752688,
1146
+ "eval_loss": 0.1974799931049347,
1147
+ "eval_runtime": 66.0022,
1148
+ "eval_samples_per_second": 287.93,
1149
+ "eval_steps_per_second": 35.999,
1150
+ "step": 3800
1151
+ },
1152
+ {
1153
+ "epoch": 8.279569892473118,
1154
+ "grad_norm": 0.5003981590270996,
1155
+ "learning_rate": 0.00016689032258064516,
1156
+ "loss": 0.22547555923461915,
1157
+ "step": 3850
1158
+ },
1159
+ {
1160
+ "epoch": 8.279569892473118,
1161
+ "eval_loss": 0.19821035861968994,
1162
+ "eval_runtime": 60.464,
1163
+ "eval_samples_per_second": 314.303,
1164
+ "eval_steps_per_second": 39.296,
1165
+ "step": 3850
1166
+ },
1167
+ {
1168
+ "epoch": 8.387096774193548,
1169
+ "grad_norm": 0.4629065692424774,
1170
+ "learning_rate": 0.00016646021505376345,
1171
+ "loss": 0.22113780975341796,
1172
+ "step": 3900
1173
+ },
1174
+ {
1175
+ "epoch": 8.387096774193548,
1176
+ "eval_loss": 0.1924905627965927,
1177
+ "eval_runtime": 60.595,
1178
+ "eval_samples_per_second": 313.623,
1179
+ "eval_steps_per_second": 39.211,
1180
+ "step": 3900
1181
+ },
1182
+ {
1183
+ "epoch": 8.494623655913978,
1184
+ "grad_norm": 0.5043092966079712,
1185
+ "learning_rate": 0.00016603010752688172,
1186
+ "loss": 0.21599315643310546,
1187
+ "step": 3950
1188
+ },
1189
+ {
1190
+ "epoch": 8.494623655913978,
1191
+ "eval_loss": 0.19553141295909882,
1192
+ "eval_runtime": 60.5,
1193
+ "eval_samples_per_second": 314.116,
1194
+ "eval_steps_per_second": 39.273,
1195
+ "step": 3950
1196
+ },
1197
+ {
1198
+ "epoch": 8.602150537634408,
1199
+ "grad_norm": 0.6413733959197998,
1200
+ "learning_rate": 0.0001656,
1201
+ "loss": 0.2173159408569336,
1202
+ "step": 4000
1203
+ },
1204
+ {
1205
+ "epoch": 8.602150537634408,
1206
+ "eval_loss": 0.19092191755771637,
1207
+ "eval_runtime": 60.5854,
1208
+ "eval_samples_per_second": 313.673,
1209
+ "eval_steps_per_second": 39.217,
1210
+ "step": 4000
1211
+ }
1212
+ ],
1213
+ "logging_steps": 50,
1214
+ "max_steps": 23250,
1215
+ "num_input_tokens_seen": 0,
1216
+ "num_train_epochs": 50,
1217
+ "save_steps": 1000,
1218
+ "stateful_callbacks": {
1219
+ "TrainerControl": {
1220
+ "args": {
1221
+ "should_epoch_stop": false,
1222
+ "should_evaluate": false,
1223
+ "should_log": false,
1224
+ "should_save": true,
1225
+ "should_training_stop": false
1226
+ },
1227
+ "attributes": {}
1228
+ }
1229
+ },
1230
+ "total_flos": 2.685471179194368e+16,
1231
+ "train_batch_size": 64,
1232
+ "trial_name": null,
1233
+ "trial_params": null
1234
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01e07a6c5703ad19cc9b65702ca484a0bbe3465c5cd2906f1c5a9b738dfbd9aa
3
+ size 14993