Blancy committed on
Commit
38759ce
·
verified ·
1 Parent(s): a3d368a

Model save

Browse files
Files changed (4) hide show
  1. README.md +1 -3
  2. all_results.json +5 -5
  3. train_results.json +5 -5
  4. trainer_state.json +270 -270
README.md CHANGED
@@ -1,10 +1,8 @@
1
  ---
2
- datasets: Blancy/verifiable-coding-problems-SFT
3
  library_name: transformers
4
  model_name: Qwen3-0.6B-Open-R1-Distill
5
  tags:
6
  - generated_from_trainer
7
- - open-r1
8
  - trl
9
  - sft
10
  licence: license
@@ -12,7 +10,7 @@ licence: license
12
 
13
  # Model Card for Qwen3-0.6B-Open-R1-Distill
14
 
15
- This model is a fine-tuned version of [None](https://huggingface.co/None) on the [Blancy/verifiable-coding-problems-SFT](https://huggingface.co/datasets/Blancy/verifiable-coding-problems-SFT) dataset.
16
  It has been trained using [TRL](https://github.com/huggingface/trl).
17
 
18
  ## Quick start
 
1
  ---
 
2
  library_name: transformers
3
  model_name: Qwen3-0.6B-Open-R1-Distill
4
  tags:
5
  - generated_from_trainer
 
6
  - trl
7
  - sft
8
  licence: license
 
10
 
11
  # Model Card for Qwen3-0.6B-Open-R1-Distill
12
 
13
+ This model is a fine-tuned version of [None](https://huggingface.co/None).
14
  It has been trained using [TRL](https://github.com/huggingface/trl).
15
 
16
  ## Quick start
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "total_flos": 7.227148038517555e+17,
3
- "train_loss": 0.08479578431808588,
4
- "train_runtime": 842.3759,
5
  "train_samples": 1086,
6
- "train_samples_per_second": 99.077,
7
- "train_steps_per_second": 0.783
8
  }
 
1
  {
2
+ "total_flos": 3.880913653947433e+18,
3
+ "train_loss": 0.06725140679063218,
4
+ "train_runtime": 3002.848,
5
  "train_samples": 1086,
6
+ "train_samples_per_second": 27.794,
7
+ "train_steps_per_second": 0.22
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "total_flos": 7.227148038517555e+17,
3
- "train_loss": 0.08479578431808588,
4
- "train_runtime": 842.3759,
5
  "train_samples": 1086,
6
- "train_samples_per_second": 99.077,
7
- "train_steps_per_second": 0.783
8
  }
 
1
  {
2
+ "total_flos": 3.880913653947433e+18,
3
+ "train_loss": 0.06725140679063218,
4
+ "train_runtime": 3002.848,
5
  "train_samples": 1086,
6
+ "train_samples_per_second": 27.794,
7
+ "train_steps_per_second": 0.22
8
  }
trainer_state.json CHANGED
@@ -11,936 +11,936 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.15151515151515152,
14
- "grad_norm": 1.984375,
15
  "learning_rate": 3.6363636363636366e-06,
16
- "loss": 0.1441,
17
  "step": 5
18
  },
19
  {
20
  "epoch": 0.30303030303030304,
21
- "grad_norm": 1.40625,
22
  "learning_rate": 8.181818181818181e-06,
23
- "loss": 0.1383,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 0.45454545454545453,
28
- "grad_norm": 0.6484375,
29
  "learning_rate": 1.2727272727272728e-05,
30
- "loss": 0.1213,
31
  "step": 15
32
  },
33
  {
34
  "epoch": 0.6060606060606061,
35
- "grad_norm": 0.423828125,
36
  "learning_rate": 1.7272727272727274e-05,
37
- "loss": 0.1133,
38
  "step": 20
39
  },
40
  {
41
  "epoch": 0.7575757575757576,
42
- "grad_norm": 0.392578125,
43
  "learning_rate": 2.1818181818181818e-05,
44
- "loss": 0.1085,
45
  "step": 25
46
  },
47
  {
48
  "epoch": 0.9090909090909091,
49
- "grad_norm": 0.28125,
50
  "learning_rate": 2.6363636363636365e-05,
51
- "loss": 0.1055,
52
  "step": 30
53
  },
54
  {
55
  "epoch": 1.0606060606060606,
56
- "grad_norm": 0.2353515625,
57
  "learning_rate": 2.9999830539872836e-05,
58
- "loss": 0.0987,
59
  "step": 35
60
  },
61
  {
62
  "epoch": 1.2121212121212122,
63
- "grad_norm": 0.2412109375,
64
  "learning_rate": 2.9993899882114902e-05,
65
- "loss": 0.1005,
66
  "step": 40
67
  },
68
  {
69
  "epoch": 1.3636363636363638,
70
- "grad_norm": 0.21484375,
71
  "learning_rate": 2.997950047184977e-05,
72
- "loss": 0.0954,
73
  "step": 45
74
  },
75
  {
76
  "epoch": 1.5151515151515151,
77
- "grad_norm": 0.2333984375,
78
  "learning_rate": 2.9956641346126986e-05,
79
- "loss": 0.0955,
80
  "step": 50
81
  },
82
  {
83
  "epoch": 1.6666666666666665,
84
- "grad_norm": 0.224609375,
85
  "learning_rate": 2.9925336851301575e-05,
86
- "loss": 0.0946,
87
  "step": 55
88
  },
89
  {
90
  "epoch": 1.8181818181818183,
91
- "grad_norm": 0.224609375,
92
  "learning_rate": 2.9885606634030267e-05,
93
- "loss": 0.0941,
94
  "step": 60
95
  },
96
  {
97
  "epoch": 1.9696969696969697,
98
- "grad_norm": 0.2265625,
99
  "learning_rate": 2.98374756289413e-05,
100
- "loss": 0.0926,
101
  "step": 65
102
  },
103
  {
104
  "epoch": 2.121212121212121,
105
- "grad_norm": 0.20703125,
106
  "learning_rate": 2.9780974042985506e-05,
107
- "loss": 0.0913,
108
  "step": 70
109
  },
110
  {
111
  "epoch": 2.2727272727272725,
112
- "grad_norm": 0.2119140625,
113
  "learning_rate": 2.971613733647841e-05,
114
- "loss": 0.0898,
115
  "step": 75
116
  },
117
  {
118
  "epoch": 2.4242424242424243,
119
- "grad_norm": 0.2236328125,
120
  "learning_rate": 2.9643006200845458e-05,
121
- "loss": 0.0914,
122
  "step": 80
123
  },
124
  {
125
  "epoch": 2.5757575757575757,
126
- "grad_norm": 0.2177734375,
127
  "learning_rate": 2.9561626533084068e-05,
128
- "loss": 0.0912,
129
  "step": 85
130
  },
131
  {
132
  "epoch": 2.7272727272727275,
133
- "grad_norm": 0.2138671875,
134
  "learning_rate": 2.9472049406958788e-05,
135
- "loss": 0.0906,
136
  "step": 90
137
  },
138
  {
139
  "epoch": 2.878787878787879,
140
- "grad_norm": 0.2197265625,
141
  "learning_rate": 2.937433104094746e-05,
142
- "loss": 0.09,
143
  "step": 95
144
  },
145
  {
146
  "epoch": 3.0303030303030303,
147
- "grad_norm": 0.2197265625,
148
  "learning_rate": 2.9268532762958568e-05,
149
- "loss": 0.0873,
150
  "step": 100
151
  },
152
  {
153
  "epoch": 3.1818181818181817,
154
- "grad_norm": 0.2216796875,
155
  "learning_rate": 2.915472097184196e-05,
156
- "loss": 0.0893,
157
  "step": 105
158
  },
159
  {
160
  "epoch": 3.3333333333333335,
161
- "grad_norm": 0.212890625,
162
  "learning_rate": 2.903296709571698e-05,
163
- "loss": 0.0877,
164
  "step": 110
165
  },
166
  {
167
  "epoch": 3.484848484848485,
168
- "grad_norm": 0.20703125,
169
  "learning_rate": 2.8903347547144327e-05,
170
- "loss": 0.0881,
171
  "step": 115
172
  },
173
  {
174
  "epoch": 3.6363636363636362,
175
- "grad_norm": 0.21875,
176
  "learning_rate": 2.876594367516961e-05,
177
- "loss": 0.0885,
178
  "step": 120
179
  },
180
  {
181
  "epoch": 3.787878787878788,
182
- "grad_norm": 0.21875,
183
  "learning_rate": 2.8620841714268804e-05,
184
- "loss": 0.0886,
185
  "step": 125
186
  },
187
  {
188
  "epoch": 3.9393939393939394,
189
- "grad_norm": 0.2158203125,
190
  "learning_rate": 2.846813273022764e-05,
191
- "loss": 0.0867,
192
  "step": 130
193
  },
194
  {
195
  "epoch": 4.090909090909091,
196
- "grad_norm": 0.2041015625,
197
  "learning_rate": 2.83079125629888e-05,
198
- "loss": 0.0877,
199
  "step": 135
200
  },
201
  {
202
  "epoch": 4.242424242424242,
203
- "grad_norm": 0.2294921875,
204
  "learning_rate": 2.8140281766502957e-05,
205
- "loss": 0.0877,
206
  "step": 140
207
  },
208
  {
209
  "epoch": 4.393939393939394,
210
- "grad_norm": 0.2216796875,
211
  "learning_rate": 2.7965345545621217e-05,
212
- "loss": 0.0875,
213
  "step": 145
214
  },
215
  {
216
  "epoch": 4.545454545454545,
217
- "grad_norm": 0.228515625,
218
  "learning_rate": 2.7783213690068737e-05,
219
- "loss": 0.0861,
220
  "step": 150
221
  },
222
  {
223
  "epoch": 4.696969696969697,
224
- "grad_norm": 0.2158203125,
225
  "learning_rate": 2.7594000505540807e-05,
226
- "loss": 0.0896,
227
  "step": 155
228
  },
229
  {
230
  "epoch": 4.848484848484849,
231
- "grad_norm": 0.224609375,
232
  "learning_rate": 2.7397824741964805e-05,
233
- "loss": 0.0834,
234
  "step": 160
235
  },
236
  {
237
  "epoch": 5.0,
238
- "grad_norm": 0.2890625,
239
  "learning_rate": 2.7194809518972856e-05,
240
- "loss": 0.0865,
241
  "step": 165
242
  },
243
  {
244
  "epoch": 5.151515151515151,
245
- "grad_norm": 0.2177734375,
246
  "learning_rate": 2.6985082248632174e-05,
247
- "loss": 0.0828,
248
  "step": 170
249
  },
250
  {
251
  "epoch": 5.303030303030303,
252
- "grad_norm": 0.2197265625,
253
  "learning_rate": 2.676877455548141e-05,
254
- "loss": 0.0852,
255
  "step": 175
256
  },
257
  {
258
  "epoch": 5.454545454545454,
259
- "grad_norm": 0.2109375,
260
  "learning_rate": 2.6546022193923274e-05,
261
- "loss": 0.086,
262
  "step": 180
263
  },
264
  {
265
  "epoch": 5.606060606060606,
266
- "grad_norm": 0.22265625,
267
  "learning_rate": 2.631696496302526e-05,
268
- "loss": 0.0869,
269
  "step": 185
270
  },
271
  {
272
  "epoch": 5.757575757575758,
273
- "grad_norm": 0.2158203125,
274
  "learning_rate": 2.6081746618781953e-05,
275
- "loss": 0.0862,
276
  "step": 190
277
  },
278
  {
279
  "epoch": 5.909090909090909,
280
- "grad_norm": 0.23046875,
281
  "learning_rate": 2.584051478389399e-05,
282
- "loss": 0.0847,
283
  "step": 195
284
  },
285
  {
286
  "epoch": 6.0606060606060606,
287
- "grad_norm": 0.212890625,
288
  "learning_rate": 2.559342085512022e-05,
289
- "loss": 0.0853,
290
  "step": 200
291
  },
292
  {
293
  "epoch": 6.212121212121212,
294
- "grad_norm": 0.2265625,
295
  "learning_rate": 2.5340619908261352e-05,
296
- "loss": 0.0877,
297
  "step": 205
298
  },
299
  {
300
  "epoch": 6.363636363636363,
301
- "grad_norm": 0.2197265625,
302
  "learning_rate": 2.508227060083457e-05,
303
- "loss": 0.0807,
304
  "step": 210
305
  },
306
  {
307
  "epoch": 6.515151515151516,
308
- "grad_norm": 0.2236328125,
309
  "learning_rate": 2.4818535072500327e-05,
310
- "loss": 0.0793,
311
  "step": 215
312
  },
313
  {
314
  "epoch": 6.666666666666667,
315
- "grad_norm": 0.224609375,
316
  "learning_rate": 2.4549578843303708e-05,
317
- "loss": 0.0821,
318
  "step": 220
319
  },
320
  {
321
  "epoch": 6.818181818181818,
322
- "grad_norm": 0.2412109375,
323
  "learning_rate": 2.427557070979427e-05,
324
- "loss": 0.0832,
325
  "step": 225
326
  },
327
  {
328
  "epoch": 6.96969696969697,
329
- "grad_norm": 0.220703125,
330
  "learning_rate": 2.399668263908961e-05,
331
- "loss": 0.0844,
332
  "step": 230
333
  },
334
  {
335
  "epoch": 7.121212121212121,
336
- "grad_norm": 0.2236328125,
337
  "learning_rate": 2.3713089660948985e-05,
338
- "loss": 0.0846,
339
  "step": 235
340
  },
341
  {
342
  "epoch": 7.2727272727272725,
343
- "grad_norm": 0.208984375,
344
  "learning_rate": 2.342496975792494e-05,
345
- "loss": 0.0824,
346
  "step": 240
347
  },
348
  {
349
  "epoch": 7.424242424242424,
350
- "grad_norm": 0.1953125,
351
  "learning_rate": 2.313250375366167e-05,
352
- "loss": 0.0796,
353
  "step": 245
354
  },
355
  {
356
  "epoch": 7.575757575757576,
357
- "grad_norm": 0.21484375,
358
  "learning_rate": 2.283587519941036e-05,
359
- "loss": 0.0846,
360
  "step": 250
361
  },
362
  {
363
  "epoch": 7.7272727272727275,
364
- "grad_norm": 0.2275390625,
365
  "learning_rate": 2.253527025883271e-05,
366
- "loss": 0.0814,
367
  "step": 255
368
  },
369
  {
370
  "epoch": 7.878787878787879,
371
- "grad_norm": 0.228515625,
372
  "learning_rate": 2.2230877591164858e-05,
373
- "loss": 0.0846,
374
  "step": 260
375
  },
376
  {
377
  "epoch": 8.030303030303031,
378
- "grad_norm": 0.21875,
379
  "learning_rate": 2.192288823281509e-05,
380
- "loss": 0.0809,
381
  "step": 265
382
  },
383
  {
384
  "epoch": 8.181818181818182,
385
- "grad_norm": 0.2373046875,
386
  "learning_rate": 2.1611495477469712e-05,
387
- "loss": 0.0819,
388
  "step": 270
389
  },
390
  {
391
  "epoch": 8.333333333333334,
392
- "grad_norm": 0.2197265625,
393
  "learning_rate": 2.1296894754782155e-05,
394
- "loss": 0.0851,
395
  "step": 275
396
  },
397
  {
398
  "epoch": 8.484848484848484,
399
- "grad_norm": 0.22265625,
400
  "learning_rate": 2.0979283507721653e-05,
401
- "loss": 0.0821,
402
  "step": 280
403
  },
404
  {
405
  "epoch": 8.636363636363637,
406
- "grad_norm": 0.2119140625,
407
  "learning_rate": 2.0658861068658254e-05,
408
- "loss": 0.0788,
409
  "step": 285
410
  },
411
  {
412
  "epoch": 8.787878787878787,
413
- "grad_norm": 0.2216796875,
414
  "learning_rate": 2.0335828534262148e-05,
415
- "loss": 0.0819,
416
  "step": 290
417
  },
418
  {
419
  "epoch": 8.93939393939394,
420
- "grad_norm": 0.2353515625,
421
  "learning_rate": 2.001038863929568e-05,
422
- "loss": 0.0839,
423
  "step": 295
424
  },
425
  {
426
  "epoch": 9.090909090909092,
427
- "grad_norm": 0.2412109375,
428
  "learning_rate": 1.9682745629377267e-05,
429
- "loss": 0.0826,
430
  "step": 300
431
  },
432
  {
433
  "epoch": 9.242424242424242,
434
- "grad_norm": 0.220703125,
435
  "learning_rate": 1.9353105132797175e-05,
436
- "loss": 0.0796,
437
  "step": 305
438
  },
439
  {
440
  "epoch": 9.393939393939394,
441
- "grad_norm": 0.216796875,
442
  "learning_rate": 1.902167403146548e-05,
443
- "loss": 0.0803,
444
  "step": 310
445
  },
446
  {
447
  "epoch": 9.545454545454545,
448
- "grad_norm": 0.2314453125,
449
  "learning_rate": 1.8688660331073253e-05,
450
- "loss": 0.0812,
451
  "step": 315
452
  },
453
  {
454
  "epoch": 9.696969696969697,
455
- "grad_norm": 0.2158203125,
456
  "learning_rate": 1.8354273030548512e-05,
457
- "loss": 0.0792,
458
  "step": 320
459
  },
460
  {
461
  "epoch": 9.848484848484848,
462
- "grad_norm": 0.22265625,
463
  "learning_rate": 1.801872199088878e-05,
464
- "loss": 0.0783,
465
  "step": 325
466
  },
467
  {
468
  "epoch": 10.0,
469
- "grad_norm": 0.271484375,
470
  "learning_rate": 1.7682217803452616e-05,
471
- "loss": 0.0825,
472
  "step": 330
473
  },
474
  {
475
  "epoch": 10.151515151515152,
476
- "grad_norm": 0.216796875,
477
  "learning_rate": 1.7344971657792768e-05,
478
- "loss": 0.0846,
479
  "step": 335
480
  },
481
  {
482
  "epoch": 10.303030303030303,
483
- "grad_norm": 0.21484375,
484
  "learning_rate": 1.7007195209113934e-05,
485
- "loss": 0.0809,
486
  "step": 340
487
  },
488
  {
489
  "epoch": 10.454545454545455,
490
- "grad_norm": 0.2236328125,
491
  "learning_rate": 1.666910044543822e-05,
492
- "loss": 0.0823,
493
  "step": 345
494
  },
495
  {
496
  "epoch": 10.606060606060606,
497
- "grad_norm": 0.21484375,
498
  "learning_rate": 1.6330899554561785e-05,
499
- "loss": 0.0814,
500
  "step": 350
501
  },
502
  {
503
  "epoch": 10.757575757575758,
504
- "grad_norm": 0.2197265625,
505
  "learning_rate": 1.5992804790886075e-05,
506
- "loss": 0.0795,
507
  "step": 355
508
  },
509
  {
510
  "epoch": 10.909090909090908,
511
- "grad_norm": 0.2177734375,
512
  "learning_rate": 1.5655028342207235e-05,
513
- "loss": 0.0818,
514
  "step": 360
515
  },
516
  {
517
  "epoch": 11.06060606060606,
518
- "grad_norm": 0.228515625,
519
  "learning_rate": 1.5317782196547387e-05,
520
- "loss": 0.0817,
521
  "step": 365
522
  },
523
  {
524
  "epoch": 11.212121212121213,
525
- "grad_norm": 0.2373046875,
526
  "learning_rate": 1.4981278009111222e-05,
527
- "loss": 0.0819,
528
  "step": 370
529
  },
530
  {
531
  "epoch": 11.363636363636363,
532
- "grad_norm": 0.2294921875,
533
  "learning_rate": 1.4645726969451489e-05,
534
- "loss": 0.0778,
535
  "step": 375
536
  },
537
  {
538
  "epoch": 11.515151515151516,
539
- "grad_norm": 0.2197265625,
540
  "learning_rate": 1.4311339668926748e-05,
541
- "loss": 0.0796,
542
  "step": 380
543
  },
544
  {
545
  "epoch": 11.666666666666666,
546
- "grad_norm": 0.23828125,
547
  "learning_rate": 1.397832596853452e-05,
548
- "loss": 0.0823,
549
  "step": 385
550
  },
551
  {
552
  "epoch": 11.818181818181818,
553
- "grad_norm": 0.228515625,
554
  "learning_rate": 1.3646894867202821e-05,
555
- "loss": 0.0794,
556
  "step": 390
557
  },
558
  {
559
  "epoch": 11.969696969696969,
560
- "grad_norm": 0.2314453125,
561
  "learning_rate": 1.3317254370622732e-05,
562
- "loss": 0.0834,
563
  "step": 395
564
  },
565
  {
566
  "epoch": 12.121212121212121,
567
- "grad_norm": 0.22265625,
568
  "learning_rate": 1.298961136070432e-05,
569
- "loss": 0.0811,
570
  "step": 400
571
  },
572
  {
573
  "epoch": 12.272727272727273,
574
- "grad_norm": 0.2275390625,
575
  "learning_rate": 1.266417146573785e-05,
576
- "loss": 0.078,
577
  "step": 405
578
  },
579
  {
580
  "epoch": 12.424242424242424,
581
- "grad_norm": 0.2216796875,
582
  "learning_rate": 1.2341138931341752e-05,
583
- "loss": 0.0814,
584
  "step": 410
585
  },
586
  {
587
  "epoch": 12.575757575757576,
588
- "grad_norm": 0.2373046875,
589
  "learning_rate": 1.2020716492278353e-05,
590
- "loss": 0.0811,
591
  "step": 415
592
  },
593
  {
594
  "epoch": 12.727272727272727,
595
- "grad_norm": 0.212890625,
596
  "learning_rate": 1.1703105245217848e-05,
597
- "loss": 0.0787,
598
  "step": 420
599
  },
600
  {
601
  "epoch": 12.878787878787879,
602
- "grad_norm": 0.224609375,
603
  "learning_rate": 1.1388504522530296e-05,
604
- "loss": 0.0792,
605
  "step": 425
606
  },
607
  {
608
  "epoch": 13.030303030303031,
609
- "grad_norm": 0.2119140625,
610
  "learning_rate": 1.1077111767184916e-05,
611
- "loss": 0.0816,
612
  "step": 430
613
  },
614
  {
615
  "epoch": 13.181818181818182,
616
- "grad_norm": 0.234375,
617
  "learning_rate": 1.0769122408835148e-05,
618
- "loss": 0.0786,
619
  "step": 435
620
  },
621
  {
622
  "epoch": 13.333333333333334,
623
- "grad_norm": 0.216796875,
624
  "learning_rate": 1.0464729741167291e-05,
625
- "loss": 0.0825,
626
  "step": 440
627
  },
628
  {
629
  "epoch": 13.484848484848484,
630
- "grad_norm": 0.2119140625,
631
  "learning_rate": 1.016412480058964e-05,
632
- "loss": 0.0807,
633
  "step": 445
634
  },
635
  {
636
  "epoch": 13.636363636363637,
637
- "grad_norm": 0.2255859375,
638
  "learning_rate": 9.86749624633833e-06,
639
- "loss": 0.0832,
640
  "step": 450
641
  },
642
  {
643
  "epoch": 13.787878787878787,
644
- "grad_norm": 0.2294921875,
645
  "learning_rate": 9.575030242075062e-06,
646
- "loss": 0.0778,
647
  "step": 455
648
  },
649
  {
650
  "epoch": 13.93939393939394,
651
- "grad_norm": 0.4296875,
652
  "learning_rate": 9.286910339051015e-06,
653
- "loss": 0.0846,
654
  "step": 460
655
  },
656
  {
657
  "epoch": 14.090909090909092,
658
- "grad_norm": 0.216796875,
659
  "learning_rate": 9.003317360910392e-06,
660
- "loss": 0.0806,
661
  "step": 465
662
  },
663
  {
664
  "epoch": 14.242424242424242,
665
- "grad_norm": 0.220703125,
666
  "learning_rate": 8.724429290205732e-06,
667
- "loss": 0.0787,
668
  "step": 470
669
  },
670
  {
671
  "epoch": 14.393939393939394,
672
- "grad_norm": 0.2197265625,
673
  "learning_rate": 8.450421156696298e-06,
674
- "loss": 0.0817,
675
  "step": 475
676
  },
677
  {
678
  "epoch": 14.545454545454545,
679
- "grad_norm": 0.2158203125,
680
  "learning_rate": 8.181464927499674e-06,
681
- "loss": 0.0776,
682
  "step": 480
683
  },
684
  {
685
  "epoch": 14.696969696969697,
686
- "grad_norm": 0.228515625,
687
  "learning_rate": 7.917729399165435e-06,
688
- "loss": 0.0801,
689
  "step": 485
690
  },
691
  {
692
  "epoch": 14.848484848484848,
693
- "grad_norm": 0.2314453125,
694
  "learning_rate": 7.659380091738652e-06,
695
- "loss": 0.0781,
696
  "step": 490
697
  },
698
  {
699
  "epoch": 15.0,
700
- "grad_norm": 0.31640625,
701
  "learning_rate": 7.406579144879779e-06,
702
- "loss": 0.0786,
703
  "step": 495
704
  },
705
  {
706
  "epoch": 15.151515151515152,
707
- "grad_norm": 0.2275390625,
708
  "learning_rate": 7.159485216106013e-06,
709
- "loss": 0.0807,
710
  "step": 500
711
  },
712
  {
713
  "epoch": 15.303030303030303,
714
- "grad_norm": 0.232421875,
715
  "learning_rate": 6.918253381218046e-06,
716
- "loss": 0.0767,
717
  "step": 505
718
  },
719
  {
720
  "epoch": 15.454545454545455,
721
- "grad_norm": 0.234375,
722
  "learning_rate": 6.683035036974742e-06,
723
- "loss": 0.0787,
724
  "step": 510
725
  },
726
  {
727
  "epoch": 15.606060606060606,
728
- "grad_norm": 0.2373046875,
729
  "learning_rate": 6.45397780607673e-06,
730
- "loss": 0.0763,
731
  "step": 515
732
  },
733
  {
734
  "epoch": 15.757575757575758,
735
- "grad_norm": 0.2216796875,
736
  "learning_rate": 6.23122544451859e-06,
737
- "loss": 0.081,
738
  "step": 520
739
  },
740
  {
741
  "epoch": 15.909090909090908,
742
- "grad_norm": 0.234375,
743
  "learning_rate": 6.014917751367825e-06,
744
- "loss": 0.0794,
745
  "step": 525
746
  },
747
  {
748
  "epoch": 16.060606060606062,
749
- "grad_norm": 0.2236328125,
750
  "learning_rate": 5.80519048102715e-06,
751
- "loss": 0.0787,
752
  "step": 530
753
  },
754
  {
755
  "epoch": 16.21212121212121,
756
- "grad_norm": 0.2314453125,
757
  "learning_rate": 5.602175258035204e-06,
758
- "loss": 0.077,
759
  "step": 535
760
  },
761
  {
762
  "epoch": 16.363636363636363,
763
- "grad_norm": 0.2412109375,
764
  "learning_rate": 5.4059994944591914e-06,
765
- "loss": 0.0798,
766
  "step": 540
767
  },
768
  {
769
  "epoch": 16.515151515151516,
770
- "grad_norm": 0.21875,
771
  "learning_rate": 5.2167863099312636e-06,
772
- "loss": 0.0794,
773
  "step": 545
774
  },
775
  {
776
  "epoch": 16.666666666666668,
777
- "grad_norm": 0.2373046875,
778
  "learning_rate": 5.034654454378783e-06,
779
- "loss": 0.0793,
780
  "step": 550
781
  },
782
  {
783
  "epoch": 16.818181818181817,
784
- "grad_norm": 0.2353515625,
785
  "learning_rate": 4.859718233497048e-06,
786
- "loss": 0.0801,
787
  "step": 555
788
  },
789
  {
790
  "epoch": 16.96969696969697,
791
- "grad_norm": 0.2177734375,
792
  "learning_rate": 4.692087437011203e-06,
793
- "loss": 0.0791,
794
  "step": 560
795
  },
796
  {
797
  "epoch": 17.12121212121212,
798
- "grad_norm": 0.2255859375,
799
  "learning_rate": 4.5318672697723665e-06,
800
- "loss": 0.081,
801
  "step": 565
802
  },
803
  {
804
  "epoch": 17.272727272727273,
805
- "grad_norm": 0.2265625,
806
  "learning_rate": 4.3791582857311975e-06,
807
- "loss": 0.0792,
808
  "step": 570
809
  },
810
  {
811
  "epoch": 17.424242424242426,
812
- "grad_norm": 0.2119140625,
813
  "learning_rate": 4.2340563248303915e-06,
814
- "loss": 0.0805,
815
  "step": 575
816
  },
817
  {
818
  "epoch": 17.575757575757574,
819
- "grad_norm": 0.2197265625,
820
  "learning_rate": 4.096652452855675e-06,
821
- "loss": 0.0797,
822
  "step": 580
823
  },
824
  {
825
  "epoch": 17.727272727272727,
826
- "grad_norm": 0.212890625,
827
  "learning_rate": 3.967032904283021e-06,
828
- "loss": 0.0809,
829
  "step": 585
830
  },
831
  {
832
  "epoch": 17.87878787878788,
833
- "grad_norm": 0.236328125,
834
  "learning_rate": 3.8452790281580445e-06,
835
- "loss": 0.0803,
836
  "step": 590
837
  },
838
  {
839
  "epoch": 18.03030303030303,
840
- "grad_norm": 0.23828125,
841
  "learning_rate": 3.731467237041433e-06,
842
- "loss": 0.0804,
843
  "step": 595
844
  },
845
  {
846
  "epoch": 18.181818181818183,
847
- "grad_norm": 0.2236328125,
848
  "learning_rate": 3.6256689590525444e-06,
849
- "loss": 0.0824,
850
  "step": 600
851
  },
852
  {
853
  "epoch": 18.333333333333332,
854
- "grad_norm": 0.224609375,
855
  "learning_rate": 3.5279505930412164e-06,
856
- "loss": 0.0809,
857
  "step": 605
858
  },
859
  {
860
  "epoch": 18.484848484848484,
861
- "grad_norm": 0.22265625,
862
  "learning_rate": 3.4383734669159366e-06,
863
- "loss": 0.0811,
864
  "step": 610
865
  },
866
  {
867
  "epoch": 18.636363636363637,
868
- "grad_norm": 0.2265625,
869
  "learning_rate": 3.356993799154545e-06,
870
- "loss": 0.0786,
871
  "step": 615
872
  },
873
  {
874
  "epoch": 18.78787878787879,
875
- "grad_norm": 0.2158203125,
876
  "learning_rate": 3.2838626635215874e-06,
877
- "loss": 0.0811,
878
  "step": 620
879
  },
880
  {
881
  "epoch": 18.939393939393938,
882
- "grad_norm": 0.2216796875,
883
  "learning_rate": 3.2190259570144957e-06,
884
- "loss": 0.0834,
885
  "step": 625
886
  },
887
  {
888
  "epoch": 19.09090909090909,
889
- "grad_norm": 0.2255859375,
890
  "learning_rate": 3.162524371058697e-06,
891
- "loss": 0.0804,
892
  "step": 630
893
  },
894
  {
895
  "epoch": 19.242424242424242,
896
- "grad_norm": 0.220703125,
897
  "learning_rate": 3.1143933659697377e-06,
898
- "loss": 0.0777,
899
  "step": 635
900
  },
901
  {
902
  "epoch": 19.393939393939394,
903
- "grad_norm": 0.2333984375,
904
  "learning_rate": 3.0746631486984266e-06,
905
- "loss": 0.081,
906
  "step": 640
907
  },
908
  {
909
  "epoch": 19.545454545454547,
910
- "grad_norm": 0.2294921875,
911
  "learning_rate": 3.043358653873013e-06,
912
- "loss": 0.0776,
913
  "step": 645
914
  },
915
  {
916
  "epoch": 19.696969696969695,
917
- "grad_norm": 0.22265625,
918
  "learning_rate": 3.020499528150232e-06,
919
- "loss": 0.077,
920
  "step": 650
921
  },
922
  {
923
  "epoch": 19.848484848484848,
924
- "grad_norm": 0.220703125,
925
  "learning_rate": 3.006100117885101e-06,
926
- "loss": 0.0814,
927
  "step": 655
928
  },
929
  {
930
  "epoch": 20.0,
931
- "grad_norm": 0.298828125,
932
  "learning_rate": 3.000169460127164e-06,
933
- "loss": 0.0805,
934
  "step": 660
935
  },
936
  {
937
  "epoch": 20.0,
938
  "step": 660,
939
- "total_flos": 7.227148038517555e+17,
940
- "train_loss": 0.08479578431808588,
941
- "train_runtime": 842.3759,
942
- "train_samples_per_second": 99.077,
943
- "train_steps_per_second": 0.783
944
  }
945
  ],
946
  "logging_steps": 5,
@@ -960,7 +960,7 @@
960
  "attributes": {}
961
  }
962
  },
963
- "total_flos": 7.227148038517555e+17,
964
  "train_batch_size": 128,
965
  "trial_name": null,
966
  "trial_params": null
 
11
  "log_history": [
12
  {
13
  "epoch": 0.15151515151515152,
14
+ "grad_norm": 0.796875,
15
  "learning_rate": 3.6363636363636366e-06,
16
+ "loss": 0.1258,
17
  "step": 5
18
  },
19
  {
20
  "epoch": 0.30303030303030304,
21
+ "grad_norm": 0.69140625,
22
  "learning_rate": 8.181818181818181e-06,
23
+ "loss": 0.1212,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 0.45454545454545453,
28
+ "grad_norm": 0.421875,
29
  "learning_rate": 1.2727272727272728e-05,
30
+ "loss": 0.1055,
31
  "step": 15
32
  },
33
  {
34
  "epoch": 0.6060606060606061,
35
+ "grad_norm": 0.2236328125,
36
  "learning_rate": 1.7272727272727274e-05,
37
+ "loss": 0.0963,
38
  "step": 20
39
  },
40
  {
41
  "epoch": 0.7575757575757576,
42
+ "grad_norm": 0.2392578125,
43
  "learning_rate": 2.1818181818181818e-05,
44
+ "loss": 0.0931,
45
  "step": 25
46
  },
47
  {
48
  "epoch": 0.9090909090909091,
49
+ "grad_norm": 0.1796875,
50
  "learning_rate": 2.6363636363636365e-05,
51
+ "loss": 0.0903,
52
  "step": 30
53
  },
54
  {
55
  "epoch": 1.0606060606060606,
56
+ "grad_norm": 0.1552734375,
57
  "learning_rate": 2.9999830539872836e-05,
58
+ "loss": 0.0843,
59
  "step": 35
60
  },
61
  {
62
  "epoch": 1.2121212121212122,
63
+ "grad_norm": 0.150390625,
64
  "learning_rate": 2.9993899882114902e-05,
65
+ "loss": 0.0853,
66
  "step": 40
67
  },
68
  {
69
  "epoch": 1.3636363636363638,
70
+ "grad_norm": 0.138671875,
71
  "learning_rate": 2.997950047184977e-05,
72
+ "loss": 0.0804,
73
  "step": 45
74
  },
75
  {
76
  "epoch": 1.5151515151515151,
77
+ "grad_norm": 0.146484375,
78
  "learning_rate": 2.9956641346126986e-05,
79
+ "loss": 0.0809,
80
  "step": 50
81
  },
82
  {
83
  "epoch": 1.6666666666666665,
84
+ "grad_norm": 0.1396484375,
85
  "learning_rate": 2.9925336851301575e-05,
86
+ "loss": 0.0795,
87
  "step": 55
88
  },
89
  {
90
  "epoch": 1.8181818181818183,
91
+ "grad_norm": 0.1376953125,
92
  "learning_rate": 2.9885606634030267e-05,
93
+ "loss": 0.0789,
94
  "step": 60
95
  },
96
  {
97
  "epoch": 1.9696969696969697,
98
+ "grad_norm": 0.134765625,
99
  "learning_rate": 2.98374756289413e-05,
100
+ "loss": 0.0778,
101
  "step": 65
102
  },
103
  {
104
  "epoch": 2.121212121212121,
105
+ "grad_norm": 0.1494140625,
106
  "learning_rate": 2.9780974042985506e-05,
107
+ "loss": 0.0761,
108
  "step": 70
109
  },
110
  {
111
  "epoch": 2.2727272727272725,
112
+ "grad_norm": 0.2177734375,
113
  "learning_rate": 2.971613733647841e-05,
114
+ "loss": 0.0751,
115
  "step": 75
116
  },
117
  {
118
  "epoch": 2.4242424242424243,
119
+ "grad_norm": 0.1435546875,
120
  "learning_rate": 2.9643006200845458e-05,
121
+ "loss": 0.0756,
122
  "step": 80
123
  },
124
  {
125
  "epoch": 2.5757575757575757,
126
+ "grad_norm": 0.1376953125,
127
  "learning_rate": 2.9561626533084068e-05,
128
+ "loss": 0.0765,
129
  "step": 85
130
  },
131
  {
132
  "epoch": 2.7272727272727275,
133
+ "grad_norm": 0.1318359375,
134
  "learning_rate": 2.9472049406958788e-05,
135
+ "loss": 0.0746,
136
  "step": 90
137
  },
138
  {
139
  "epoch": 2.878787878787879,
140
+ "grad_norm": 0.1455078125,
141
  "learning_rate": 2.937433104094746e-05,
142
+ "loss": 0.0757,
143
  "step": 95
144
  },
145
  {
146
  "epoch": 3.0303030303030303,
147
+ "grad_norm": 0.1328125,
148
  "learning_rate": 2.9268532762958568e-05,
149
+ "loss": 0.0725,
150
  "step": 100
151
  },
152
  {
153
  "epoch": 3.1818181818181817,
154
+ "grad_norm": 0.1376953125,
155
  "learning_rate": 2.915472097184196e-05,
156
+ "loss": 0.0742,
157
  "step": 105
158
  },
159
  {
160
  "epoch": 3.3333333333333335,
161
+ "grad_norm": 0.1318359375,
162
  "learning_rate": 2.903296709571698e-05,
163
+ "loss": 0.0707,
164
  "step": 110
165
  },
166
  {
167
  "epoch": 3.484848484848485,
168
+ "grad_norm": 0.1337890625,
169
  "learning_rate": 2.8903347547144327e-05,
170
+ "loss": 0.0734,
171
  "step": 115
172
  },
173
  {
174
  "epoch": 3.6363636363636362,
175
+ "grad_norm": 0.142578125,
176
  "learning_rate": 2.876594367516961e-05,
177
+ "loss": 0.0724,
178
  "step": 120
179
  },
180
  {
181
  "epoch": 3.787878787878788,
182
+ "grad_norm": 0.1318359375,
183
  "learning_rate": 2.8620841714268804e-05,
184
+ "loss": 0.0725,
185
  "step": 125
186
  },
187
  {
188
  "epoch": 3.9393939393939394,
189
+ "grad_norm": 0.1484375,
190
  "learning_rate": 2.846813273022764e-05,
191
+ "loss": 0.0714,
192
  "step": 130
193
  },
194
  {
195
  "epoch": 4.090909090909091,
196
+ "grad_norm": 0.134765625,
197
  "learning_rate": 2.83079125629888e-05,
198
+ "loss": 0.0727,
199
  "step": 135
200
  },
201
  {
202
  "epoch": 4.242424242424242,
203
+ "grad_norm": 0.146484375,
204
  "learning_rate": 2.8140281766502957e-05,
205
+ "loss": 0.0716,
206
  "step": 140
207
  },
208
  {
209
  "epoch": 4.393939393939394,
210
+ "grad_norm": 0.140625,
211
  "learning_rate": 2.7965345545621217e-05,
212
+ "loss": 0.072,
213
  "step": 145
214
  },
215
  {
216
  "epoch": 4.545454545454545,
217
+ "grad_norm": 0.1455078125,
218
  "learning_rate": 2.7783213690068737e-05,
219
+ "loss": 0.0701,
220
  "step": 150
221
  },
222
  {
223
  "epoch": 4.696969696969697,
224
+ "grad_norm": 0.1435546875,
225
  "learning_rate": 2.7594000505540807e-05,
226
+ "loss": 0.0741,
227
  "step": 155
228
  },
229
  {
230
  "epoch": 4.848484848484849,
231
+ "grad_norm": 0.142578125,
232
  "learning_rate": 2.7397824741964805e-05,
233
+ "loss": 0.0665,
234
  "step": 160
235
  },
236
  {
237
  "epoch": 5.0,
238
+ "grad_norm": 0.189453125,
239
  "learning_rate": 2.7194809518972856e-05,
240
+ "loss": 0.0705,
241
  "step": 165
242
  },
243
  {
244
  "epoch": 5.151515151515151,
245
+ "grad_norm": 0.1494140625,
246
  "learning_rate": 2.6985082248632174e-05,
247
+ "loss": 0.0679,
248
  "step": 170
249
  },
250
  {
251
  "epoch": 5.303030303030303,
252
+ "grad_norm": 0.14453125,
253
  "learning_rate": 2.676877455548141e-05,
254
+ "loss": 0.0693,
255
  "step": 175
256
  },
257
  {
258
  "epoch": 5.454545454545454,
259
+ "grad_norm": 0.1376953125,
260
  "learning_rate": 2.6546022193923274e-05,
261
+ "loss": 0.0696,
262
  "step": 180
263
  },
264
  {
265
  "epoch": 5.606060606060606,
266
+ "grad_norm": 0.1435546875,
267
  "learning_rate": 2.631696496302526e-05,
268
+ "loss": 0.0709,
269
  "step": 185
270
  },
271
  {
272
  "epoch": 5.757575757575758,
273
+ "grad_norm": 0.142578125,
274
  "learning_rate": 2.6081746618781953e-05,
275
+ "loss": 0.0694,
276
  "step": 190
277
  },
278
  {
279
  "epoch": 5.909090909090909,
280
+ "grad_norm": 0.1357421875,
281
  "learning_rate": 2.584051478389399e-05,
282
+ "loss": 0.0682,
283
  "step": 195
284
  },
285
  {
286
  "epoch": 6.0606060606060606,
287
+ "grad_norm": 0.154296875,
288
  "learning_rate": 2.559342085512022e-05,
289
+ "loss": 0.0686,
290
  "step": 200
291
  },
292
  {
293
  "epoch": 6.212121212121212,
294
+ "grad_norm": 0.154296875,
295
  "learning_rate": 2.5340619908261352e-05,
296
+ "loss": 0.0703,
297
  "step": 205
298
  },
299
  {
300
  "epoch": 6.363636363636363,
301
+ "grad_norm": 0.140625,
302
  "learning_rate": 2.508227060083457e-05,
303
+ "loss": 0.0647,
304
  "step": 210
305
  },
306
  {
307
  "epoch": 6.515151515151516,
308
+ "grad_norm": 0.1376953125,
309
  "learning_rate": 2.4818535072500327e-05,
310
+ "loss": 0.064,
311
  "step": 215
312
  },
313
  {
314
  "epoch": 6.666666666666667,
315
+ "grad_norm": 0.142578125,
316
  "learning_rate": 2.4549578843303708e-05,
317
+ "loss": 0.0676,
318
  "step": 220
319
  },
320
  {
321
  "epoch": 6.818181818181818,
322
+ "grad_norm": 0.14453125,
323
  "learning_rate": 2.427557070979427e-05,
324
+ "loss": 0.0669,
325
  "step": 225
326
  },
327
  {
328
  "epoch": 6.96969696969697,
329
+ "grad_norm": 0.1376953125,
330
  "learning_rate": 2.399668263908961e-05,
331
+ "loss": 0.0679,
332
  "step": 230
333
  },
334
  {
335
  "epoch": 7.121212121212121,
336
+ "grad_norm": 0.1357421875,
337
  "learning_rate": 2.3713089660948985e-05,
338
+ "loss": 0.0666,
339
  "step": 235
340
  },
341
  {
342
  "epoch": 7.2727272727272725,
343
+ "grad_norm": 0.1455078125,
344
  "learning_rate": 2.342496975792494e-05,
345
+ "loss": 0.066,
346
  "step": 240
347
  },
348
  {
349
  "epoch": 7.424242424242424,
350
+ "grad_norm": 0.123046875,
351
  "learning_rate": 2.313250375366167e-05,
352
+ "loss": 0.0637,
353
  "step": 245
354
  },
355
  {
356
  "epoch": 7.575757575757576,
357
+ "grad_norm": 0.1298828125,
358
  "learning_rate": 2.283587519941036e-05,
359
+ "loss": 0.0683,
360
  "step": 250
361
  },
362
  {
363
  "epoch": 7.7272727272727275,
364
+ "grad_norm": 0.1435546875,
365
  "learning_rate": 2.253527025883271e-05,
366
+ "loss": 0.0642,
367
  "step": 255
368
  },
369
  {
370
  "epoch": 7.878787878787879,
371
+ "grad_norm": 0.1533203125,
372
  "learning_rate": 2.2230877591164858e-05,
373
+ "loss": 0.0682,
374
  "step": 260
375
  },
376
  {
377
  "epoch": 8.030303030303031,
378
+ "grad_norm": 0.1328125,
379
  "learning_rate": 2.192288823281509e-05,
380
+ "loss": 0.0628,
381
  "step": 265
382
  },
383
  {
384
  "epoch": 8.181818181818182,
385
+ "grad_norm": 0.158203125,
386
  "learning_rate": 2.1611495477469712e-05,
387
+ "loss": 0.0635,
388
  "step": 270
389
  },
390
  {
391
  "epoch": 8.333333333333334,
392
+ "grad_norm": 0.15234375,
393
  "learning_rate": 2.1296894754782155e-05,
394
+ "loss": 0.0679,
395
  "step": 275
396
  },
397
  {
398
  "epoch": 8.484848484848484,
399
+ "grad_norm": 0.140625,
400
  "learning_rate": 2.0979283507721653e-05,
401
+ "loss": 0.0631,
402
  "step": 280
403
  },
404
  {
405
  "epoch": 8.636363636363637,
406
+ "grad_norm": 0.12890625,
407
  "learning_rate": 2.0658861068658254e-05,
408
+ "loss": 0.0634,
409
  "step": 285
410
  },
411
  {
412
  "epoch": 8.787878787878787,
413
+ "grad_norm": 0.14453125,
414
  "learning_rate": 2.0335828534262148e-05,
415
+ "loss": 0.0652,
416
  "step": 290
417
  },
418
  {
419
  "epoch": 8.93939393939394,
420
+ "grad_norm": 0.1484375,
421
  "learning_rate": 2.001038863929568e-05,
422
+ "loss": 0.067,
423
  "step": 295
424
  },
425
  {
426
  "epoch": 9.090909090909092,
427
+ "grad_norm": 0.1572265625,
428
  "learning_rate": 1.9682745629377267e-05,
429
+ "loss": 0.0647,
430
  "step": 300
431
  },
432
  {
433
  "epoch": 9.242424242424242,
434
+ "grad_norm": 0.1669921875,
435
  "learning_rate": 1.9353105132797175e-05,
436
+ "loss": 0.0628,
437
  "step": 305
438
  },
439
  {
440
  "epoch": 9.393939393939394,
441
+ "grad_norm": 0.140625,
442
  "learning_rate": 1.902167403146548e-05,
443
+ "loss": 0.0625,
444
  "step": 310
445
  },
446
  {
447
  "epoch": 9.545454545454545,
448
+ "grad_norm": 0.1728515625,
449
  "learning_rate": 1.8688660331073253e-05,
450
+ "loss": 0.0634,
451
  "step": 315
452
  },
453
  {
454
  "epoch": 9.696969696969697,
455
+ "grad_norm": 0.1357421875,
456
  "learning_rate": 1.8354273030548512e-05,
457
+ "loss": 0.0618,
458
  "step": 320
459
  },
460
  {
461
  "epoch": 9.848484848484848,
462
+ "grad_norm": 0.14453125,
463
  "learning_rate": 1.801872199088878e-05,
464
+ "loss": 0.0618,
465
  "step": 325
466
  },
467
  {
468
  "epoch": 10.0,
469
+ "grad_norm": 0.1630859375,
470
  "learning_rate": 1.7682217803452616e-05,
471
+ "loss": 0.0633,
472
  "step": 330
473
  },
474
  {
475
  "epoch": 10.151515151515152,
476
+ "grad_norm": 0.1376953125,
477
  "learning_rate": 1.7344971657792768e-05,
478
+ "loss": 0.0651,
479
  "step": 335
480
  },
481
  {
482
  "epoch": 10.303030303030303,
483
+ "grad_norm": 0.15625,
484
  "learning_rate": 1.7007195209113934e-05,
485
+ "loss": 0.0623,
486
  "step": 340
487
  },
488
  {
489
  "epoch": 10.454545454545455,
490
+ "grad_norm": 0.1533203125,
491
  "learning_rate": 1.666910044543822e-05,
492
+ "loss": 0.0647,
493
  "step": 345
494
  },
495
  {
496
  "epoch": 10.606060606060606,
497
+ "grad_norm": 0.12890625,
498
  "learning_rate": 1.6330899554561785e-05,
499
+ "loss": 0.0635,
500
  "step": 350
501
  },
502
  {
503
  "epoch": 10.757575757575758,
504
+ "grad_norm": 0.1552734375,
505
  "learning_rate": 1.5992804790886075e-05,
506
+ "loss": 0.0622,
507
  "step": 355
508
  },
509
  {
510
  "epoch": 10.909090909090908,
511
+ "grad_norm": 0.1396484375,
512
  "learning_rate": 1.5655028342207235e-05,
513
+ "loss": 0.0646,
514
  "step": 360
515
  },
516
  {
517
  "epoch": 11.06060606060606,
518
+ "grad_norm": 0.1533203125,
519
  "learning_rate": 1.5317782196547387e-05,
520
+ "loss": 0.0638,
521
  "step": 365
522
  },
523
  {
524
  "epoch": 11.212121212121213,
525
+ "grad_norm": 0.146484375,
526
  "learning_rate": 1.4981278009111222e-05,
527
+ "loss": 0.0633,
528
  "step": 370
529
  },
530
  {
531
  "epoch": 11.363636363636363,
532
+ "grad_norm": 0.1376953125,
533
  "learning_rate": 1.4645726969451489e-05,
534
+ "loss": 0.0602,
535
  "step": 375
536
  },
537
  {
538
  "epoch": 11.515151515151516,
539
+ "grad_norm": 0.1533203125,
540
  "learning_rate": 1.4311339668926748e-05,
541
+ "loss": 0.061,
542
  "step": 380
543
  },
544
  {
545
  "epoch": 11.666666666666666,
546
+ "grad_norm": 0.1513671875,
547
  "learning_rate": 1.397832596853452e-05,
548
+ "loss": 0.0636,
549
  "step": 385
550
  },
551
  {
552
  "epoch": 11.818181818181818,
553
+ "grad_norm": 0.1357421875,
554
  "learning_rate": 1.3646894867202821e-05,
555
+ "loss": 0.0605,
556
  "step": 390
557
  },
558
  {
559
  "epoch": 11.969696969696969,
560
+ "grad_norm": 0.1435546875,
561
  "learning_rate": 1.3317254370622732e-05,
562
+ "loss": 0.0642,
563
  "step": 395
564
  },
565
  {
566
  "epoch": 12.121212121212121,
567
+ "grad_norm": 0.1591796875,
568
  "learning_rate": 1.298961136070432e-05,
569
+ "loss": 0.0633,
570
  "step": 400
571
  },
572
  {
573
  "epoch": 12.272727272727273,
574
+ "grad_norm": 0.1396484375,
575
  "learning_rate": 1.266417146573785e-05,
576
+ "loss": 0.0605,
577
  "step": 405
578
  },
579
  {
580
  "epoch": 12.424242424242424,
581
+ "grad_norm": 0.146484375,
582
  "learning_rate": 1.2341138931341752e-05,
583
+ "loss": 0.0627,
584
  "step": 410
585
  },
586
  {
587
  "epoch": 12.575757575757576,
588
+ "grad_norm": 0.16015625,
589
  "learning_rate": 1.2020716492278353e-05,
590
+ "loss": 0.0628,
591
  "step": 415
592
  },
593
  {
594
  "epoch": 12.727272727272727,
595
+ "grad_norm": 0.1513671875,
596
  "learning_rate": 1.1703105245217848e-05,
597
+ "loss": 0.0598,
598
  "step": 420
599
  },
600
  {
601
  "epoch": 12.878787878787879,
602
+ "grad_norm": 0.1416015625,
603
  "learning_rate": 1.1388504522530296e-05,
604
+ "loss": 0.0611,
605
  "step": 425
606
  },
607
  {
608
  "epoch": 13.030303030303031,
609
+ "grad_norm": 0.1435546875,
610
  "learning_rate": 1.1077111767184916e-05,
611
+ "loss": 0.0638,
612
  "step": 430
613
  },
614
  {
615
  "epoch": 13.181818181818182,
616
+ "grad_norm": 0.1376953125,
617
  "learning_rate": 1.0769122408835148e-05,
618
+ "loss": 0.0585,
619
  "step": 435
620
  },
621
  {
622
  "epoch": 13.333333333333334,
623
+ "grad_norm": 0.1396484375,
624
  "learning_rate": 1.0464729741167291e-05,
625
+ "loss": 0.0635,
626
  "step": 440
627
  },
628
  {
629
  "epoch": 13.484848484848484,
630
+ "grad_norm": 0.14453125,
631
  "learning_rate": 1.016412480058964e-05,
632
+ "loss": 0.0621,
633
  "step": 445
634
  },
635
  {
636
  "epoch": 13.636363636363637,
637
+ "grad_norm": 0.1552734375,
638
  "learning_rate": 9.86749624633833e-06,
639
+ "loss": 0.0635,
640
  "step": 450
641
  },
642
  {
643
  "epoch": 13.787878787878787,
644
+ "grad_norm": 0.1474609375,
645
  "learning_rate": 9.575030242075062e-06,
646
+ "loss": 0.0597,
647
  "step": 455
648
  },
649
  {
650
  "epoch": 13.93939393939394,
651
+ "grad_norm": 0.1484375,
652
  "learning_rate": 9.286910339051015e-06,
653
+ "loss": 0.0659,
654
  "step": 460
655
  },
656
  {
657
  "epoch": 14.090909090909092,
658
+ "grad_norm": 0.140625,
659
  "learning_rate": 9.003317360910392e-06,
660
+ "loss": 0.0618,
661
  "step": 465
662
  },
663
  {
664
  "epoch": 14.242424242424242,
665
+ "grad_norm": 0.14453125,
666
  "learning_rate": 8.724429290205732e-06,
667
+ "loss": 0.0612,
668
  "step": 470
669
  },
670
  {
671
  "epoch": 14.393939393939394,
672
+ "grad_norm": 0.142578125,
673
  "learning_rate": 8.450421156696298e-06,
674
+ "loss": 0.0615,
675
  "step": 475
676
  },
677
  {
678
  "epoch": 14.545454545454545,
679
+ "grad_norm": 0.1357421875,
680
  "learning_rate": 8.181464927499674e-06,
681
+ "loss": 0.0591,
682
  "step": 480
683
  },
684
  {
685
  "epoch": 14.696969696969697,
686
+ "grad_norm": 0.15234375,
687
  "learning_rate": 7.917729399165435e-06,
688
+ "loss": 0.0606,
689
  "step": 485
690
  },
691
  {
692
  "epoch": 14.848484848484848,
693
+ "grad_norm": 0.1416015625,
694
  "learning_rate": 7.659380091738652e-06,
695
+ "loss": 0.0592,
696
  "step": 490
697
  },
698
  {
699
  "epoch": 15.0,
700
+ "grad_norm": 0.1865234375,
701
  "learning_rate": 7.406579144879779e-06,
702
+ "loss": 0.0601,
703
  "step": 495
704
  },
705
  {
706
  "epoch": 15.151515151515152,
707
+ "grad_norm": 0.1455078125,
708
  "learning_rate": 7.159485216106013e-06,
709
+ "loss": 0.0616,
710
  "step": 500
711
  },
712
  {
713
  "epoch": 15.303030303030303,
714
+ "grad_norm": 0.146484375,
715
  "learning_rate": 6.918253381218046e-06,
716
+ "loss": 0.0583,
717
  "step": 505
718
  },
719
  {
720
  "epoch": 15.454545454545455,
721
+ "grad_norm": 0.15234375,
722
  "learning_rate": 6.683035036974742e-06,
723
+ "loss": 0.0613,
724
  "step": 510
725
  },
726
  {
727
  "epoch": 15.606060606060606,
728
+ "grad_norm": 0.1484375,
729
  "learning_rate": 6.45397780607673e-06,
730
+ "loss": 0.0572,
731
  "step": 515
732
  },
733
  {
734
  "epoch": 15.757575757575758,
735
+ "grad_norm": 0.1884765625,
736
  "learning_rate": 6.23122544451859e-06,
737
+ "loss": 0.0616,
738
  "step": 520
739
  },
740
  {
741
  "epoch": 15.909090909090908,
742
+ "grad_norm": 0.15234375,
743
  "learning_rate": 6.014917751367825e-06,
744
+ "loss": 0.0601,
745
  "step": 525
746
  },
747
  {
748
  "epoch": 16.060606060606062,
749
+ "grad_norm": 0.154296875,
750
  "learning_rate": 5.80519048102715e-06,
751
+ "loss": 0.0597,
752
  "step": 530
753
  },
754
  {
755
  "epoch": 16.21212121212121,
756
+ "grad_norm": 0.140625,
757
  "learning_rate": 5.602175258035204e-06,
758
+ "loss": 0.0581,
759
  "step": 535
760
  },
761
  {
762
  "epoch": 16.363636363636363,
763
+ "grad_norm": 0.1611328125,
764
  "learning_rate": 5.4059994944591914e-06,
765
+ "loss": 0.0617,
766
  "step": 540
767
  },
768
  {
769
  "epoch": 16.515151515151516,
770
+ "grad_norm": 0.1416015625,
771
  "learning_rate": 5.2167863099312636e-06,
772
+ "loss": 0.0587,
773
  "step": 545
774
  },
775
  {
776
  "epoch": 16.666666666666668,
777
+ "grad_norm": 0.14453125,
778
  "learning_rate": 5.034654454378783e-06,
779
+ "loss": 0.0599,
780
  "step": 550
781
  },
782
  {
783
  "epoch": 16.818181818181817,
784
+ "grad_norm": 0.1533203125,
785
  "learning_rate": 4.859718233497048e-06,
786
+ "loss": 0.0624,
787
  "step": 555
788
  },
789
  {
790
  "epoch": 16.96969696969697,
791
+ "grad_norm": 0.1533203125,
792
  "learning_rate": 4.692087437011203e-06,
793
+ "loss": 0.0589,
794
  "step": 560
795
  },
796
  {
797
  "epoch": 17.12121212121212,
798
+ "grad_norm": 0.1533203125,
799
  "learning_rate": 4.5318672697723665e-06,
800
+ "loss": 0.0624,
801
  "step": 565
802
  },
803
  {
804
  "epoch": 17.272727272727273,
805
+ "grad_norm": 0.1494140625,
806
  "learning_rate": 4.3791582857311975e-06,
807
+ "loss": 0.0603,
808
  "step": 570
809
  },
810
  {
811
  "epoch": 17.424242424242426,
812
+ "grad_norm": 0.142578125,
813
  "learning_rate": 4.2340563248303915e-06,
814
+ "loss": 0.0621,
815
  "step": 575
816
  },
817
  {
818
  "epoch": 17.575757575757574,
819
+ "grad_norm": 0.1494140625,
820
  "learning_rate": 4.096652452855675e-06,
821
+ "loss": 0.0608,
822
  "step": 580
823
  },
824
  {
825
  "epoch": 17.727272727272727,
826
+ "grad_norm": 0.1435546875,
827
  "learning_rate": 3.967032904283021e-06,
828
+ "loss": 0.06,
829
  "step": 585
830
  },
831
  {
832
  "epoch": 17.87878787878788,
833
+ "grad_norm": 0.1484375,
834
  "learning_rate": 3.8452790281580445e-06,
835
+ "loss": 0.0605,
836
  "step": 590
837
  },
838
  {
839
  "epoch": 18.03030303030303,
840
+ "grad_norm": 0.154296875,
841
  "learning_rate": 3.731467237041433e-06,
842
+ "loss": 0.0601,
843
  "step": 595
844
  },
845
  {
846
  "epoch": 18.181818181818183,
847
+ "grad_norm": 0.1513671875,
848
  "learning_rate": 3.6256689590525444e-06,
849
+ "loss": 0.0628,
850
  "step": 600
851
  },
852
  {
853
  "epoch": 18.333333333333332,
854
+ "grad_norm": 0.1396484375,
855
  "learning_rate": 3.5279505930412164e-06,
856
+ "loss": 0.062,
857
  "step": 605
858
  },
859
  {
860
  "epoch": 18.484848484848484,
861
+ "grad_norm": 0.1591796875,
862
  "learning_rate": 3.4383734669159366e-06,
863
+ "loss": 0.0618,
864
  "step": 610
865
  },
866
  {
867
  "epoch": 18.636363636363637,
868
+ "grad_norm": 0.1591796875,
869
  "learning_rate": 3.356993799154545e-06,
870
+ "loss": 0.059,
871
  "step": 615
872
  },
873
  {
874
  "epoch": 18.78787878787879,
875
+ "grad_norm": 0.146484375,
876
  "learning_rate": 3.2838626635215874e-06,
877
+ "loss": 0.0595,
878
  "step": 620
879
  },
880
  {
881
  "epoch": 18.939393939393938,
882
+ "grad_norm": 0.1611328125,
883
  "learning_rate": 3.2190259570144957e-06,
884
+ "loss": 0.0629,
885
  "step": 625
886
  },
887
  {
888
  "epoch": 19.09090909090909,
889
+ "grad_norm": 0.150390625,
890
  "learning_rate": 3.162524371058697e-06,
891
+ "loss": 0.0612,
892
  "step": 630
893
  },
894
  {
895
  "epoch": 19.242424242424242,
896
+ "grad_norm": 0.1337890625,
897
  "learning_rate": 3.1143933659697377e-06,
898
+ "loss": 0.0583,
899
  "step": 635
900
  },
901
  {
902
  "epoch": 19.393939393939394,
903
+ "grad_norm": 0.1474609375,
904
  "learning_rate": 3.0746631486984266e-06,
905
+ "loss": 0.0626,
906
  "step": 640
907
  },
908
  {
909
  "epoch": 19.545454545454547,
910
+ "grad_norm": 0.15234375,
911
  "learning_rate": 3.043358653873013e-06,
912
+ "loss": 0.0589,
913
  "step": 645
914
  },
915
  {
916
  "epoch": 19.696969696969695,
917
+ "grad_norm": 0.1572265625,
918
  "learning_rate": 3.020499528150232e-06,
919
+ "loss": 0.0586,
920
  "step": 650
921
  },
922
  {
923
  "epoch": 19.848484848484848,
924
+ "grad_norm": 0.15625,
925
  "learning_rate": 3.006100117885101e-06,
926
+ "loss": 0.0591,
927
  "step": 655
928
  },
929
  {
930
  "epoch": 20.0,
931
+ "grad_norm": 0.2119140625,
932
  "learning_rate": 3.000169460127164e-06,
933
+ "loss": 0.0613,
934
  "step": 660
935
  },
936
  {
937
  "epoch": 20.0,
938
  "step": 660,
939
+ "total_flos": 3.880913653947433e+18,
940
+ "train_loss": 0.06725140679063218,
941
+ "train_runtime": 3002.848,
942
+ "train_samples_per_second": 27.794,
943
+ "train_steps_per_second": 0.22
944
  }
945
  ],
946
  "logging_steps": 5,
 
960
  "attributes": {}
961
  }
962
  },
963
+ "total_flos": 3.880913653947433e+18,
964
  "train_batch_size": 128,
965
  "trial_name": null,
966
  "trial_params": null