PurplelinkPL commited on
Commit
7cda0ea
·
verified ·
1 Parent(s): e485e3f

Upload 10 files

Browse files
Files changed (6) hide show
  1. model.safetensors +1 -1
  2. optimizer-001.pt +3 -0
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +2465 -515
  6. training_args.bin +1 -1
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:50ab72ee4006e90d132d254bd261b095cdf9f599f538918a37986c6071d1773e
3
  size 1583544840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab5986497028108a88ad993e68c18562e52cb6c8e03114fe01200e5da0854e3b
3
  size 1583544840
optimizer-001.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d64b452c7cc1666de71a32e00b462c42364fad37b649fea63dfceeb3684a87ec
3
+ size 3167201739
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd7d1ab5fa201d20d98db247d9f5d0ebc8dcb20aa2e1128cb5af2b40e8ae23a1
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:adbea6f60dc78ded4b6f92de10291bbc5facf95a1b4fb8015f15dc7bc7f39302
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:683f9d438efb114c8ac5a1515e4472a3e865f2fc3aea8a4b95df341b9ab5537f
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fc94cd9c1543eabca01e851af1d33e6ad7156a1958832e168dcad4d8856974b
3
  size 1465
trainer_state.json CHANGED
@@ -2,1189 +2,3139 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.015,
6
  "eval_steps": 1000,
7
- "global_step": 15000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 1e-06,
14
- "grad_norm": 1.1795536279678345,
15
  "learning_rate": 0.0,
16
- "loss": 1.4139,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.0001,
21
- "grad_norm": 1.1734141111373901,
22
- "learning_rate": 9.900000000000001e-08,
23
- "loss": 1.387,
24
  "step": 100
25
  },
26
  {
27
  "epoch": 0.0002,
28
- "grad_norm": 1.1503151655197144,
29
- "learning_rate": 1.9900000000000002e-07,
30
- "loss": 1.3882,
31
  "step": 200
32
  },
33
  {
34
  "epoch": 0.0003,
35
- "grad_norm": 1.1478229761123657,
36
- "learning_rate": 2.99e-07,
37
- "loss": 1.386,
38
  "step": 300
39
  },
40
  {
41
  "epoch": 0.0004,
42
- "grad_norm": 1.1559761762619019,
43
- "learning_rate": 3.99e-07,
44
- "loss": 1.3823,
45
  "step": 400
46
  },
47
  {
48
  "epoch": 0.0005,
49
- "grad_norm": 1.1433175802230835,
50
- "learning_rate": 4.99e-07,
51
- "loss": 1.381,
52
  "step": 500
53
  },
54
  {
55
  "epoch": 0.0006,
56
- "grad_norm": 1.1483551263809204,
57
- "learning_rate": 5.990000000000001e-07,
58
- "loss": 1.3807,
59
  "step": 600
60
  },
61
  {
62
  "epoch": 0.0007,
63
- "grad_norm": 1.161496877670288,
64
- "learning_rate": 6.990000000000001e-07,
65
- "loss": 1.3833,
66
  "step": 700
67
  },
68
  {
69
  "epoch": 0.0008,
70
- "grad_norm": 1.139211654663086,
71
- "learning_rate": 7.990000000000001e-07,
72
- "loss": 1.3835,
73
  "step": 800
74
  },
75
  {
76
  "epoch": 0.0009,
77
- "grad_norm": 1.133931040763855,
78
- "learning_rate": 8.99e-07,
79
- "loss": 1.3719,
80
  "step": 900
81
  },
82
  {
83
  "epoch": 0.001,
84
- "grad_norm": 1.1143814325332642,
85
- "learning_rate": 9.99e-07,
86
- "loss": 1.3761,
87
  "step": 1000
88
  },
89
  {
90
  "epoch": 0.001,
91
- "eval_loss": 1.4045685529708862,
92
- "eval_runtime": 27.4497,
93
- "eval_samples_per_second": 182.152,
94
- "eval_steps_per_second": 2.878,
95
  "step": 1000
96
  },
97
  {
98
  "epoch": 0.0011,
99
- "grad_norm": 1.153507947921753,
100
- "learning_rate": 1.099e-06,
101
- "loss": 1.3669,
102
  "step": 1100
103
  },
104
  {
105
  "epoch": 0.0012,
106
- "grad_norm": 1.1281546354293823,
107
- "learning_rate": 1.199e-06,
108
- "loss": 1.375,
109
  "step": 1200
110
  },
111
  {
112
  "epoch": 0.0013,
113
- "grad_norm": 1.1093217134475708,
114
- "learning_rate": 1.299e-06,
115
- "loss": 1.3726,
116
  "step": 1300
117
  },
118
  {
119
  "epoch": 0.0014,
120
- "grad_norm": 1.1526917219161987,
121
- "learning_rate": 1.399e-06,
122
- "loss": 1.3696,
123
  "step": 1400
124
  },
125
  {
126
  "epoch": 0.0015,
127
- "grad_norm": 1.1092661619186401,
128
- "learning_rate": 1.4990000000000002e-06,
129
- "loss": 1.3699,
130
  "step": 1500
131
  },
132
  {
133
  "epoch": 0.0016,
134
- "grad_norm": 1.5104150772094727,
135
- "learning_rate": 1.599e-06,
136
- "loss": 1.3734,
137
  "step": 1600
138
  },
139
  {
140
  "epoch": 0.0017,
141
- "grad_norm": 1.1301764249801636,
142
- "learning_rate": 1.6990000000000002e-06,
143
- "loss": 1.3719,
144
  "step": 1700
145
  },
146
  {
147
  "epoch": 0.0018,
148
- "grad_norm": 1.120370626449585,
149
- "learning_rate": 1.7990000000000003e-06,
150
- "loss": 1.3695,
151
  "step": 1800
152
  },
153
  {
154
  "epoch": 0.0019,
155
- "grad_norm": 1.145676612854004,
156
- "learning_rate": 1.8990000000000004e-06,
157
- "loss": 1.3675,
158
  "step": 1900
159
  },
160
  {
161
  "epoch": 0.002,
162
- "grad_norm": 1.1365715265274048,
163
- "learning_rate": 1.9990000000000003e-06,
164
- "loss": 1.3616,
165
  "step": 2000
166
  },
167
  {
168
  "epoch": 0.002,
169
- "eval_loss": 1.3989644050598145,
170
- "eval_runtime": 24.4886,
171
- "eval_samples_per_second": 204.177,
172
- "eval_steps_per_second": 3.226,
173
  "step": 2000
174
  },
175
  {
176
  "epoch": 0.0021,
177
- "grad_norm": 1.118861198425293,
178
- "learning_rate": 2.099e-06,
179
- "loss": 1.3635,
180
  "step": 2100
181
  },
182
  {
183
  "epoch": 0.0022,
184
- "grad_norm": 1.1307072639465332,
185
- "learning_rate": 2.1990000000000005e-06,
186
- "loss": 1.375,
187
  "step": 2200
188
  },
189
  {
190
  "epoch": 0.0023,
191
- "grad_norm": 1.088172197341919,
192
- "learning_rate": 2.299e-06,
193
- "loss": 1.3627,
194
  "step": 2300
195
  },
196
  {
197
  "epoch": 0.0024,
198
- "grad_norm": 1.1681883335113525,
199
- "learning_rate": 2.3990000000000002e-06,
200
- "loss": 1.3607,
201
  "step": 2400
202
  },
203
  {
204
  "epoch": 0.0025,
205
- "grad_norm": 1.1483210325241089,
206
- "learning_rate": 2.499e-06,
207
- "loss": 1.3687,
208
  "step": 2500
209
  },
210
  {
211
  "epoch": 0.0026,
212
- "grad_norm": 1.1572397947311401,
213
- "learning_rate": 2.5990000000000004e-06,
214
- "loss": 1.3695,
215
  "step": 2600
216
  },
217
  {
218
  "epoch": 0.0027,
219
- "grad_norm": 1.124837875366211,
220
- "learning_rate": 2.699e-06,
221
- "loss": 1.3532,
222
  "step": 2700
223
  },
224
  {
225
  "epoch": 0.0028,
226
- "grad_norm": 1.0974047183990479,
227
- "learning_rate": 2.7990000000000002e-06,
228
- "loss": 1.3577,
229
  "step": 2800
230
  },
231
  {
232
  "epoch": 0.0029,
233
- "grad_norm": 1.1722006797790527,
234
- "learning_rate": 2.899e-06,
235
- "loss": 1.3673,
236
  "step": 2900
237
  },
238
  {
239
  "epoch": 0.003,
240
- "grad_norm": 1.106062650680542,
241
- "learning_rate": 2.9990000000000004e-06,
242
- "loss": 1.36,
243
  "step": 3000
244
  },
245
  {
246
  "epoch": 0.003,
247
- "eval_loss": 1.3754355907440186,
248
- "eval_runtime": 24.5927,
249
- "eval_samples_per_second": 203.312,
250
- "eval_steps_per_second": 3.212,
251
  "step": 3000
252
  },
253
  {
254
  "epoch": 0.0031,
255
- "grad_norm": 1.1039618253707886,
256
- "learning_rate": 3.0990000000000003e-06,
257
- "loss": 1.3567,
258
  "step": 3100
259
  },
260
  {
261
  "epoch": 0.0032,
262
- "grad_norm": 1.1439259052276611,
263
- "learning_rate": 3.1990000000000006e-06,
264
- "loss": 1.3543,
265
  "step": 3200
266
  },
267
  {
268
  "epoch": 0.0033,
269
- "grad_norm": 1.1732087135314941,
270
- "learning_rate": 3.2990000000000005e-06,
271
- "loss": 1.3464,
272
  "step": 3300
273
  },
274
  {
275
  "epoch": 0.0034,
276
- "grad_norm": 1.0517069101333618,
277
- "learning_rate": 3.399e-06,
278
- "loss": 1.3398,
279
  "step": 3400
280
  },
281
  {
282
  "epoch": 0.0035,
283
- "grad_norm": 1.0987197160720825,
284
- "learning_rate": 3.4990000000000003e-06,
285
- "loss": 1.356,
286
  "step": 3500
287
  },
288
  {
289
  "epoch": 0.0036,
290
- "grad_norm": 1.1524548530578613,
291
- "learning_rate": 3.599e-06,
292
- "loss": 1.3481,
293
  "step": 3600
294
  },
295
  {
296
  "epoch": 0.0037,
297
- "grad_norm": 1.10309636592865,
298
- "learning_rate": 3.6990000000000005e-06,
299
- "loss": 1.3515,
300
  "step": 3700
301
  },
302
  {
303
  "epoch": 0.0038,
304
- "grad_norm": 1.1285984516143799,
305
- "learning_rate": 3.7990000000000004e-06,
306
- "loss": 1.3541,
307
  "step": 3800
308
  },
309
  {
310
  "epoch": 0.0039,
311
- "grad_norm": 1.1621686220169067,
312
- "learning_rate": 3.899e-06,
313
- "loss": 1.3532,
314
  "step": 3900
315
  },
316
  {
317
  "epoch": 0.004,
318
- "grad_norm": 1.078803300857544,
319
- "learning_rate": 3.999e-06,
320
- "loss": 1.3468,
321
  "step": 4000
322
  },
323
  {
324
  "epoch": 0.004,
325
- "eval_loss": 1.3711252212524414,
326
- "eval_runtime": 24.5467,
327
- "eval_samples_per_second": 203.693,
328
- "eval_steps_per_second": 3.218,
329
  "step": 4000
330
  },
331
  {
332
  "epoch": 0.0041,
333
- "grad_norm": 1.1375211477279663,
334
- "learning_rate": 4.099e-06,
335
- "loss": 1.341,
336
  "step": 4100
337
  },
338
  {
339
  "epoch": 0.0042,
340
- "grad_norm": 1.0922551155090332,
341
- "learning_rate": 4.199e-06,
342
- "loss": 1.3427,
343
  "step": 4200
344
  },
345
  {
346
  "epoch": 0.0043,
347
- "grad_norm": 1.124060034751892,
348
- "learning_rate": 4.299000000000001e-06,
349
- "loss": 1.3409,
350
  "step": 4300
351
  },
352
  {
353
  "epoch": 0.0044,
354
- "grad_norm": 1.125467300415039,
355
- "learning_rate": 4.3990000000000006e-06,
356
- "loss": 1.3467,
357
  "step": 4400
358
  },
359
  {
360
  "epoch": 0.0045,
361
- "grad_norm": 1.1384063959121704,
362
- "learning_rate": 4.4990000000000005e-06,
363
- "loss": 1.3426,
364
  "step": 4500
365
  },
366
  {
367
  "epoch": 0.0046,
368
- "grad_norm": 1.1456679105758667,
369
- "learning_rate": 4.599e-06,
370
- "loss": 1.3445,
371
  "step": 4600
372
  },
373
  {
374
  "epoch": 0.0047,
375
- "grad_norm": 1.1553903818130493,
376
- "learning_rate": 4.699e-06,
377
- "loss": 1.3372,
378
  "step": 4700
379
  },
380
  {
381
  "epoch": 0.0048,
382
- "grad_norm": 1.1315921545028687,
383
- "learning_rate": 4.799e-06,
384
- "loss": 1.3398,
385
  "step": 4800
386
  },
387
  {
388
  "epoch": 0.0049,
389
- "grad_norm": 1.08122980594635,
390
- "learning_rate": 4.899e-06,
391
- "loss": 1.3364,
392
  "step": 4900
393
  },
394
  {
395
  "epoch": 0.005,
396
- "grad_norm": 1.09906804561615,
397
- "learning_rate": 4.999000000000001e-06,
398
- "loss": 1.3366,
399
  "step": 5000
400
  },
401
  {
402
  "epoch": 0.005,
403
- "eval_loss": 1.3536914587020874,
404
- "eval_runtime": 24.5551,
405
- "eval_samples_per_second": 203.624,
406
- "eval_steps_per_second": 3.217,
407
  "step": 5000
408
  },
409
  {
410
  "epoch": 0.0051,
411
- "grad_norm": 1.1291029453277588,
412
- "learning_rate": 5.099000000000001e-06,
413
- "loss": 1.3396,
414
  "step": 5100
415
  },
416
  {
417
  "epoch": 0.0052,
418
- "grad_norm": 1.1673402786254883,
419
- "learning_rate": 5.1990000000000005e-06,
420
- "loss": 1.3358,
421
  "step": 5200
422
  },
423
  {
424
  "epoch": 0.0053,
425
- "grad_norm": 1.1300634145736694,
426
- "learning_rate": 5.2990000000000004e-06,
427
- "loss": 1.3384,
428
  "step": 5300
429
  },
430
  {
431
  "epoch": 0.0054,
432
- "grad_norm": 1.1179150342941284,
433
- "learning_rate": 5.399000000000001e-06,
434
- "loss": 1.3332,
435
  "step": 5400
436
  },
437
  {
438
  "epoch": 0.0055,
439
- "grad_norm": 1.091856837272644,
440
- "learning_rate": 5.499000000000001e-06,
441
- "loss": 1.3348,
442
  "step": 5500
443
  },
444
  {
445
  "epoch": 0.0056,
446
- "grad_norm": 1.0551645755767822,
447
- "learning_rate": 5.599e-06,
448
- "loss": 1.336,
449
  "step": 5600
450
  },
451
  {
452
  "epoch": 0.0057,
453
- "grad_norm": 1.1457860469818115,
454
- "learning_rate": 5.699e-06,
455
- "loss": 1.333,
456
  "step": 5700
457
  },
458
  {
459
  "epoch": 0.0058,
460
- "grad_norm": 1.1662046909332275,
461
- "learning_rate": 5.799e-06,
462
- "loss": 1.3299,
463
  "step": 5800
464
  },
465
  {
466
  "epoch": 0.0059,
467
- "grad_norm": 1.1879452466964722,
468
- "learning_rate": 5.899000000000001e-06,
469
- "loss": 1.3354,
470
  "step": 5900
471
  },
472
  {
473
  "epoch": 0.006,
474
- "grad_norm": 1.1441973447799683,
475
- "learning_rate": 5.9990000000000005e-06,
476
- "loss": 1.3329,
477
  "step": 6000
478
  },
479
  {
480
  "epoch": 0.006,
481
- "eval_loss": 1.3535875082015991,
482
- "eval_runtime": 24.3908,
483
- "eval_samples_per_second": 204.995,
484
- "eval_steps_per_second": 3.239,
485
  "step": 6000
486
  },
487
  {
488
  "epoch": 0.0061,
489
- "grad_norm": 1.121394395828247,
490
- "learning_rate": 6.099e-06,
491
- "loss": 1.3295,
492
  "step": 6100
493
  },
494
  {
495
  "epoch": 0.0062,
496
- "grad_norm": 1.1496130228042603,
497
- "learning_rate": 6.199e-06,
498
- "loss": 1.3303,
499
  "step": 6200
500
  },
501
  {
502
  "epoch": 0.0063,
503
- "grad_norm": 1.2465569972991943,
504
- "learning_rate": 6.299000000000001e-06,
505
- "loss": 1.3268,
506
  "step": 6300
507
  },
508
  {
509
  "epoch": 0.0064,
510
- "grad_norm": 1.1363328695297241,
511
- "learning_rate": 6.399000000000001e-06,
512
- "loss": 1.3248,
513
  "step": 6400
514
  },
515
  {
516
  "epoch": 0.0065,
517
- "grad_norm": 1.1142207384109497,
518
- "learning_rate": 6.499000000000001e-06,
519
- "loss": 1.3212,
520
  "step": 6500
521
  },
522
  {
523
  "epoch": 0.0066,
524
- "grad_norm": 1.1020450592041016,
525
- "learning_rate": 6.599000000000001e-06,
526
- "loss": 1.3305,
527
  "step": 6600
528
  },
529
  {
530
  "epoch": 0.0067,
531
- "grad_norm": 1.0636595487594604,
532
- "learning_rate": 6.699000000000001e-06,
533
- "loss": 1.3343,
534
  "step": 6700
535
  },
536
  {
537
  "epoch": 0.0068,
538
- "grad_norm": 1.0846408605575562,
539
- "learning_rate": 6.7990000000000005e-06,
540
- "loss": 1.3306,
541
  "step": 6800
542
  },
543
  {
544
  "epoch": 0.0069,
545
- "grad_norm": 1.2017494440078735,
546
- "learning_rate": 6.899e-06,
547
- "loss": 1.3191,
548
  "step": 6900
549
  },
550
  {
551
  "epoch": 0.007,
552
- "grad_norm": 1.159947156906128,
553
- "learning_rate": 6.999e-06,
554
- "loss": 1.334,
555
  "step": 7000
556
  },
557
  {
558
  "epoch": 0.007,
559
- "eval_loss": 1.3385692834854126,
560
- "eval_runtime": 24.4488,
561
- "eval_samples_per_second": 204.509,
562
- "eval_steps_per_second": 3.231,
563
  "step": 7000
564
  },
565
  {
566
  "epoch": 0.0071,
567
- "grad_norm": 1.1962409019470215,
568
- "learning_rate": 7.099e-06,
569
- "loss": 1.323,
570
  "step": 7100
571
  },
572
  {
573
  "epoch": 0.0072,
574
- "grad_norm": 1.1551247835159302,
575
- "learning_rate": 7.199e-06,
576
- "loss": 1.3119,
577
  "step": 7200
578
  },
579
  {
580
  "epoch": 0.0073,
581
- "grad_norm": 1.1543225049972534,
582
- "learning_rate": 7.299000000000001e-06,
583
- "loss": 1.3261,
584
  "step": 7300
585
  },
586
  {
587
  "epoch": 0.0074,
588
- "grad_norm": 1.133355975151062,
589
- "learning_rate": 7.399000000000001e-06,
590
- "loss": 1.3241,
591
  "step": 7400
592
  },
593
  {
594
  "epoch": 0.0075,
595
- "grad_norm": 1.1490956544876099,
596
- "learning_rate": 7.4990000000000005e-06,
597
- "loss": 1.3293,
598
  "step": 7500
599
  },
600
  {
601
  "epoch": 0.0076,
602
- "grad_norm": 1.0732618570327759,
603
- "learning_rate": 7.5990000000000004e-06,
604
- "loss": 1.3216,
605
  "step": 7600
606
  },
607
  {
608
  "epoch": 0.0077,
609
- "grad_norm": 1.170203685760498,
610
- "learning_rate": 7.699e-06,
611
- "loss": 1.3193,
612
  "step": 7700
613
  },
614
  {
615
  "epoch": 0.0078,
616
- "grad_norm": 1.0613148212432861,
617
- "learning_rate": 7.799000000000001e-06,
618
- "loss": 1.329,
619
  "step": 7800
620
  },
621
  {
622
  "epoch": 0.0079,
623
- "grad_norm": 1.2019593715667725,
624
- "learning_rate": 7.899000000000002e-06,
625
- "loss": 1.315,
626
  "step": 7900
627
  },
628
  {
629
  "epoch": 0.008,
630
- "grad_norm": 1.1080353260040283,
631
- "learning_rate": 7.999e-06,
632
- "loss": 1.3181,
633
  "step": 8000
634
  },
635
  {
636
  "epoch": 0.008,
637
- "eval_loss": 1.3239587545394897,
638
- "eval_runtime": 24.4556,
639
- "eval_samples_per_second": 204.452,
640
- "eval_steps_per_second": 3.23,
641
  "step": 8000
642
  },
643
  {
644
  "epoch": 0.0081,
645
- "grad_norm": 1.1273937225341797,
646
- "learning_rate": 8.099e-06,
647
- "loss": 1.3252,
648
  "step": 8100
649
  },
650
  {
651
  "epoch": 0.0082,
652
- "grad_norm": 1.0942583084106445,
653
- "learning_rate": 8.199e-06,
654
- "loss": 1.3164,
655
  "step": 8200
656
  },
657
  {
658
  "epoch": 0.0083,
659
- "grad_norm": 1.1845577955245972,
660
- "learning_rate": 8.299e-06,
661
- "loss": 1.32,
662
  "step": 8300
663
  },
664
  {
665
  "epoch": 0.0084,
666
- "grad_norm": 1.2376071214675903,
667
- "learning_rate": 8.399e-06,
668
- "loss": 1.314,
669
  "step": 8400
670
  },
671
  {
672
  "epoch": 0.0085,
673
- "grad_norm": 1.5554766654968262,
674
- "learning_rate": 8.499000000000001e-06,
675
- "loss": 1.4128,
676
  "step": 8500
677
  },
678
  {
679
  "epoch": 0.0086,
680
- "grad_norm": 1.736693024635315,
681
- "learning_rate": 8.599e-06,
682
- "loss": 1.5028,
683
  "step": 8600
684
  },
685
  {
686
  "epoch": 0.0087,
687
- "grad_norm": 1.8339451551437378,
688
- "learning_rate": 8.699000000000001e-06,
689
- "loss": 1.5346,
690
  "step": 8700
691
  },
692
  {
693
  "epoch": 0.0088,
694
- "grad_norm": 1.827017068862915,
695
- "learning_rate": 8.799000000000002e-06,
696
- "loss": 1.5309,
697
  "step": 8800
698
  },
699
  {
700
  "epoch": 0.0089,
701
- "grad_norm": 1.7209491729736328,
702
- "learning_rate": 8.899e-06,
703
- "loss": 1.5202,
704
  "step": 8900
705
  },
706
  {
707
  "epoch": 0.009,
708
- "grad_norm": 1.7649836540222168,
709
- "learning_rate": 8.999000000000001e-06,
710
- "loss": 1.5311,
711
  "step": 9000
712
  },
713
  {
714
  "epoch": 0.009,
715
- "eval_loss": 1.3453279733657837,
716
- "eval_runtime": 24.4639,
717
- "eval_samples_per_second": 204.383,
718
- "eval_steps_per_second": 3.229,
719
  "step": 9000
720
  },
721
  {
722
  "epoch": 0.0091,
723
- "grad_norm": 1.758984923362732,
724
- "learning_rate": 9.099e-06,
725
- "loss": 1.5277,
726
  "step": 9100
727
  },
728
  {
729
  "epoch": 0.0092,
730
- "grad_norm": 1.5517253875732422,
731
- "learning_rate": 9.199000000000001e-06,
732
- "loss": 1.5331,
733
  "step": 9200
734
  },
735
  {
736
  "epoch": 0.0093,
737
- "grad_norm": 1.7491697072982788,
738
- "learning_rate": 9.299e-06,
739
- "loss": 1.5376,
740
  "step": 9300
741
  },
742
  {
743
  "epoch": 0.0094,
744
- "grad_norm": 1.7253761291503906,
745
- "learning_rate": 9.399000000000001e-06,
746
- "loss": 1.5319,
747
  "step": 9400
748
  },
749
  {
750
  "epoch": 0.0095,
751
- "grad_norm": 1.7779654264450073,
752
- "learning_rate": 9.499e-06,
753
- "loss": 1.5455,
754
  "step": 9500
755
  },
756
  {
757
  "epoch": 0.0096,
758
- "grad_norm": 1.8502960205078125,
759
- "learning_rate": 9.599e-06,
760
- "loss": 1.5256,
761
  "step": 9600
762
  },
763
  {
764
  "epoch": 0.0097,
765
- "grad_norm": 1.595805287361145,
766
- "learning_rate": 9.699e-06,
767
- "loss": 1.5338,
768
  "step": 9700
769
  },
770
  {
771
  "epoch": 0.0098,
772
- "grad_norm": 1.7826145887374878,
773
- "learning_rate": 9.799e-06,
774
- "loss": 1.5297,
775
  "step": 9800
776
  },
777
  {
778
  "epoch": 0.0099,
779
- "grad_norm": 1.8574384450912476,
780
- "learning_rate": 9.899000000000001e-06,
781
- "loss": 1.537,
782
  "step": 9900
783
  },
784
  {
785
  "epoch": 0.01,
786
- "grad_norm": 1.6225100755691528,
787
- "learning_rate": 9.999e-06,
788
- "loss": 1.5373,
789
  "step": 10000
790
  },
791
  {
792
  "epoch": 0.01,
793
- "eval_loss": 1.3474788665771484,
794
- "eval_runtime": 24.6009,
795
- "eval_samples_per_second": 203.244,
796
- "eval_steps_per_second": 3.211,
797
  "step": 10000
798
  },
799
  {
800
  "epoch": 0.0101,
801
- "grad_norm": 1.7013579607009888,
802
- "learning_rate": 9.999999753259893e-06,
803
- "loss": 1.5213,
804
  "step": 10100
805
  },
806
  {
807
  "epoch": 0.0102,
808
- "grad_norm": 1.8451807498931885,
809
- "learning_rate": 9.999999003045122e-06,
810
- "loss": 1.5252,
811
  "step": 10200
812
  },
813
  {
814
  "epoch": 0.0103,
815
- "grad_norm": 1.6487650871276855,
816
- "learning_rate": 9.999997749330588e-06,
817
- "loss": 1.5313,
818
  "step": 10300
819
  },
820
  {
821
  "epoch": 0.0104,
822
- "grad_norm": 1.7240970134735107,
823
- "learning_rate": 9.999995992116415e-06,
824
- "loss": 1.5375,
825
  "step": 10400
826
  },
827
  {
828
  "epoch": 0.0105,
829
- "grad_norm": 1.5860111713409424,
830
- "learning_rate": 9.999993731402786e-06,
831
- "loss": 1.535,
832
  "step": 10500
833
  },
834
  {
835
  "epoch": 0.0106,
836
- "grad_norm": 1.6990783214569092,
837
- "learning_rate": 9.999990967189924e-06,
838
- "loss": 1.5415,
839
  "step": 10600
840
  },
841
  {
842
  "epoch": 0.0107,
843
- "grad_norm": 1.7421098947525024,
844
- "learning_rate": 9.999987699478109e-06,
845
- "loss": 1.5266,
846
  "step": 10700
847
  },
848
  {
849
  "epoch": 0.0108,
850
- "grad_norm": 1.6578110456466675,
851
- "learning_rate": 9.999983928267668e-06,
852
- "loss": 1.5256,
853
  "step": 10800
854
  },
855
  {
856
  "epoch": 0.0109,
857
- "grad_norm": 1.8193341493606567,
858
- "learning_rate": 9.999979653558982e-06,
859
- "loss": 1.54,
860
  "step": 10900
861
  },
862
  {
863
  "epoch": 0.011,
864
- "grad_norm": 1.7376822233200073,
865
- "learning_rate": 9.999974875352482e-06,
866
- "loss": 1.5345,
867
  "step": 11000
868
  },
869
  {
870
  "epoch": 0.011,
871
- "eval_loss": 1.3439626693725586,
872
- "eval_runtime": 24.6158,
873
- "eval_samples_per_second": 203.122,
874
- "eval_steps_per_second": 3.209,
875
  "step": 11000
876
  },
877
  {
878
  "epoch": 0.0111,
879
- "grad_norm": 1.7770408391952515,
880
- "learning_rate": 9.999969593648651e-06,
881
- "loss": 1.5257,
882
  "step": 11100
883
  },
884
  {
885
  "epoch": 0.0112,
886
- "grad_norm": 1.703754186630249,
887
- "learning_rate": 9.999963808448016e-06,
888
- "loss": 1.523,
889
  "step": 11200
890
  },
891
  {
892
  "epoch": 0.0113,
893
- "grad_norm": 1.7194414138793945,
894
- "learning_rate": 9.999957519751165e-06,
895
- "loss": 1.5404,
896
  "step": 11300
897
  },
898
  {
899
  "epoch": 0.0114,
900
- "grad_norm": 1.694810390472412,
901
- "learning_rate": 9.999950727558727e-06,
902
- "loss": 1.534,
903
  "step": 11400
904
  },
905
  {
906
  "epoch": 0.0115,
907
- "grad_norm": 1.644400715827942,
908
- "learning_rate": 9.999943431871388e-06,
909
- "loss": 1.531,
910
  "step": 11500
911
  },
912
  {
913
  "epoch": 0.0116,
914
- "grad_norm": 1.792406678199768,
915
- "learning_rate": 9.99993563268988e-06,
916
- "loss": 1.5298,
917
  "step": 11600
918
  },
919
  {
920
  "epoch": 0.0117,
921
- "grad_norm": 1.9580830335617065,
922
- "learning_rate": 9.999927330014993e-06,
923
- "loss": 1.5268,
924
  "step": 11700
925
  },
926
  {
927
  "epoch": 0.0118,
928
- "grad_norm": 1.6442023515701294,
929
- "learning_rate": 9.99991852384756e-06,
930
- "loss": 1.5257,
931
  "step": 11800
932
  },
933
  {
934
  "epoch": 0.0119,
935
- "grad_norm": 1.680830478668213,
936
- "learning_rate": 9.99990921418847e-06,
937
- "loss": 1.5191,
938
  "step": 11900
939
  },
940
  {
941
  "epoch": 0.012,
942
- "grad_norm": 1.6746671199798584,
943
- "learning_rate": 9.999899401038656e-06,
944
- "loss": 1.5372,
945
  "step": 12000
946
  },
947
  {
948
  "epoch": 0.012,
949
- "eval_loss": 1.3511897325515747,
950
- "eval_runtime": 24.555,
951
- "eval_samples_per_second": 203.625,
952
- "eval_steps_per_second": 3.217,
953
  "step": 12000
954
  },
955
  {
956
  "epoch": 0.0121,
957
- "grad_norm": 1.7775862216949463,
958
- "learning_rate": 9.99988908439911e-06,
959
- "loss": 1.5182,
960
  "step": 12100
961
  },
962
  {
963
  "epoch": 0.0122,
964
- "grad_norm": 1.5296705961227417,
965
- "learning_rate": 9.999878264270871e-06,
966
- "loss": 1.5303,
967
  "step": 12200
968
  },
969
  {
970
  "epoch": 0.0123,
971
- "grad_norm": 1.7957079410552979,
972
- "learning_rate": 9.999866940655027e-06,
973
- "loss": 1.5328,
974
  "step": 12300
975
  },
976
  {
977
  "epoch": 0.0124,
978
- "grad_norm": 1.8484801054000854,
979
- "learning_rate": 9.99985511355272e-06,
980
- "loss": 1.5162,
981
  "step": 12400
982
  },
983
  {
984
  "epoch": 0.0125,
985
- "grad_norm": 1.7253010272979736,
986
- "learning_rate": 9.999842782965139e-06,
987
- "loss": 1.5178,
988
  "step": 12500
989
  },
990
  {
991
  "epoch": 0.0126,
992
- "grad_norm": 1.7495081424713135,
993
- "learning_rate": 9.999829948893528e-06,
994
- "loss": 1.5233,
995
  "step": 12600
996
  },
997
  {
998
  "epoch": 0.0127,
999
- "grad_norm": 1.6750719547271729,
1000
- "learning_rate": 9.999816611339175e-06,
1001
- "loss": 1.5203,
1002
  "step": 12700
1003
  },
1004
  {
1005
  "epoch": 0.0128,
1006
- "grad_norm": 1.7870038747787476,
1007
- "learning_rate": 9.999802770303427e-06,
1008
- "loss": 1.5106,
1009
  "step": 12800
1010
  },
1011
  {
1012
  "epoch": 0.0129,
1013
- "grad_norm": 1.6229153871536255,
1014
- "learning_rate": 9.999788425787678e-06,
1015
- "loss": 1.5399,
1016
  "step": 12900
1017
  },
1018
  {
1019
  "epoch": 0.013,
1020
- "grad_norm": 1.7483490705490112,
1021
- "learning_rate": 9.99977357779337e-06,
1022
- "loss": 1.519,
1023
  "step": 13000
1024
  },
1025
  {
1026
  "epoch": 0.013,
1027
- "eval_loss": 1.3341424465179443,
1028
- "eval_runtime": 24.5433,
1029
- "eval_samples_per_second": 203.722,
1030
- "eval_steps_per_second": 3.219,
1031
  "step": 13000
1032
  },
1033
  {
1034
  "epoch": 0.0131,
1035
- "grad_norm": 1.7631748914718628,
1036
- "learning_rate": 9.999758226322e-06,
1037
- "loss": 1.5232,
1038
  "step": 13100
1039
  },
1040
  {
1041
  "epoch": 0.0132,
1042
- "grad_norm": 1.6134735345840454,
1043
- "learning_rate": 9.999742371375114e-06,
1044
- "loss": 1.5352,
1045
  "step": 13200
1046
  },
1047
  {
1048
  "epoch": 0.0133,
1049
- "grad_norm": 1.8494335412979126,
1050
- "learning_rate": 9.999726012954308e-06,
1051
- "loss": 1.5254,
1052
  "step": 13300
1053
  },
1054
  {
1055
  "epoch": 0.0134,
1056
- "grad_norm": 1.9245802164077759,
1057
- "learning_rate": 9.999709151061228e-06,
1058
- "loss": 1.5358,
1059
  "step": 13400
1060
  },
1061
  {
1062
  "epoch": 0.0135,
1063
- "grad_norm": 1.755018711090088,
1064
- "learning_rate": 9.999691785697574e-06,
1065
- "loss": 1.5204,
1066
  "step": 13500
1067
  },
1068
  {
1069
  "epoch": 0.0136,
1070
- "grad_norm": 1.8922946453094482,
1071
- "learning_rate": 9.999673916865094e-06,
1072
- "loss": 1.5267,
1073
  "step": 13600
1074
  },
1075
  {
1076
  "epoch": 0.0137,
1077
- "grad_norm": 1.9781936407089233,
1078
- "learning_rate": 9.999655544565587e-06,
1079
- "loss": 1.5213,
1080
  "step": 13700
1081
  },
1082
  {
1083
  "epoch": 0.0138,
1084
- "grad_norm": 1.8312381505966187,
1085
- "learning_rate": 9.999636668800905e-06,
1086
- "loss": 1.517,
1087
  "step": 13800
1088
  },
1089
  {
1090
  "epoch": 0.0139,
1091
- "grad_norm": 1.6503413915634155,
1092
- "learning_rate": 9.999617289572946e-06,
1093
- "loss": 1.5169,
1094
  "step": 13900
1095
  },
1096
  {
1097
  "epoch": 0.014,
1098
- "grad_norm": 1.8612747192382812,
1099
- "learning_rate": 9.999597406883664e-06,
1100
- "loss": 1.5277,
1101
  "step": 14000
1102
  },
1103
  {
1104
  "epoch": 0.014,
1105
- "eval_loss": 1.3367455005645752,
1106
- "eval_runtime": 24.5718,
1107
- "eval_samples_per_second": 203.485,
1108
- "eval_steps_per_second": 3.215,
1109
  "step": 14000
1110
  },
1111
  {
1112
  "epoch": 0.0141,
1113
- "grad_norm": 1.8900790214538574,
1114
- "learning_rate": 9.999577020735059e-06,
1115
- "loss": 1.5276,
1116
  "step": 14100
1117
  },
1118
  {
1119
  "epoch": 0.0142,
1120
- "grad_norm": 1.720528244972229,
1121
- "learning_rate": 9.999556131129184e-06,
1122
- "loss": 1.5209,
1123
  "step": 14200
1124
  },
1125
  {
1126
  "epoch": 0.0143,
1127
- "grad_norm": 1.713659405708313,
1128
- "learning_rate": 9.999534738068145e-06,
1129
- "loss": 1.5194,
1130
  "step": 14300
1131
  },
1132
  {
1133
  "epoch": 0.0144,
1134
- "grad_norm": 1.662377119064331,
1135
- "learning_rate": 9.999512841554093e-06,
1136
- "loss": 1.5179,
1137
  "step": 14400
1138
  },
1139
  {
1140
  "epoch": 0.0145,
1141
- "grad_norm": 1.6507668495178223,
1142
- "learning_rate": 9.999490441589235e-06,
1143
- "loss": 1.5181,
1144
  "step": 14500
1145
  },
1146
  {
1147
  "epoch": 0.0146,
1148
- "grad_norm": 1.7075133323669434,
1149
- "learning_rate": 9.999467538175827e-06,
1150
- "loss": 1.5203,
1151
  "step": 14600
1152
  },
1153
  {
1154
  "epoch": 0.0147,
1155
- "grad_norm": 1.686068058013916,
1156
- "learning_rate": 9.999444131316173e-06,
1157
- "loss": 1.5156,
1158
  "step": 14700
1159
  },
1160
  {
1161
  "epoch": 0.0148,
1162
- "grad_norm": 1.6891603469848633,
1163
- "learning_rate": 9.999420221012635e-06,
1164
- "loss": 1.5195,
1165
  "step": 14800
1166
  },
1167
  {
1168
  "epoch": 0.0149,
1169
- "grad_norm": 1.784029245376587,
1170
- "learning_rate": 9.999395807267616e-06,
1171
- "loss": 1.509,
1172
  "step": 14900
1173
  },
1174
  {
1175
  "epoch": 0.015,
1176
- "grad_norm": 1.6361267566680908,
1177
- "learning_rate": 9.999370890083575e-06,
1178
- "loss": 1.5248,
1179
  "step": 15000
1180
  },
1181
  {
1182
  "epoch": 0.015,
1183
- "eval_loss": 1.3349226713180542,
1184
- "eval_runtime": 24.5747,
1185
- "eval_samples_per_second": 203.461,
1186
- "eval_steps_per_second": 3.215,
1187
  "step": 15000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1188
  }
1189
  ],
1190
  "logging_steps": 100,
@@ -1204,7 +3154,7 @@
1204
  "attributes": {}
1205
  }
1206
  },
1207
- "total_flos": 2.03079253229568e+18,
1208
  "train_batch_size": 64,
1209
  "trial_name": null,
1210
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.04,
6
  "eval_steps": 1000,
7
+ "global_step": 40000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 1e-06,
14
+ "grad_norm": 1.7933695316314697,
15
  "learning_rate": 0.0,
16
+ "loss": 1.4289,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.0001,
21
+ "grad_norm": 1.5815099477767944,
22
+ "learning_rate": 4.9500000000000006e-08,
23
+ "loss": 1.4071,
24
  "step": 100
25
  },
26
  {
27
  "epoch": 0.0002,
28
+ "grad_norm": 1.9177594184875488,
29
+ "learning_rate": 9.950000000000001e-08,
30
+ "loss": 1.3975,
31
  "step": 200
32
  },
33
  {
34
  "epoch": 0.0003,
35
+ "grad_norm": 1.749267816543579,
36
+ "learning_rate": 1.495e-07,
37
+ "loss": 1.39,
38
  "step": 300
39
  },
40
  {
41
  "epoch": 0.0004,
42
+ "grad_norm": 1.783465027809143,
43
+ "learning_rate": 1.995e-07,
44
+ "loss": 1.4035,
45
  "step": 400
46
  },
47
  {
48
  "epoch": 0.0005,
49
+ "grad_norm": 1.727728009223938,
50
+ "learning_rate": 2.495e-07,
51
+ "loss": 1.3944,
52
  "step": 500
53
  },
54
  {
55
  "epoch": 0.0006,
56
+ "grad_norm": 1.7734591960906982,
57
+ "learning_rate": 2.9950000000000005e-07,
58
+ "loss": 1.3968,
59
  "step": 600
60
  },
61
  {
62
  "epoch": 0.0007,
63
+ "grad_norm": 1.7365233898162842,
64
+ "learning_rate": 3.4950000000000005e-07,
65
+ "loss": 1.4015,
66
  "step": 700
67
  },
68
  {
69
  "epoch": 0.0008,
70
+ "grad_norm": 1.898462176322937,
71
+ "learning_rate": 3.9950000000000005e-07,
72
+ "loss": 1.404,
73
  "step": 800
74
  },
75
  {
76
  "epoch": 0.0009,
77
+ "grad_norm": 1.5940463542938232,
78
+ "learning_rate": 4.495e-07,
79
+ "loss": 1.4013,
80
  "step": 900
81
  },
82
  {
83
  "epoch": 0.001,
84
+ "grad_norm": 1.948064923286438,
85
+ "learning_rate": 4.995e-07,
86
+ "loss": 1.4016,
87
  "step": 1000
88
  },
89
  {
90
  "epoch": 0.001,
91
+ "eval_loss": 1.4141713380813599,
92
+ "eval_runtime": 30.1065,
93
+ "eval_samples_per_second": 166.077,
94
+ "eval_steps_per_second": 2.624,
95
  "step": 1000
96
  },
97
  {
98
  "epoch": 0.0011,
99
+ "grad_norm": 1.7923760414123535,
100
+ "learning_rate": 5.495e-07,
101
+ "loss": 1.3934,
102
  "step": 1100
103
  },
104
  {
105
  "epoch": 0.0012,
106
+ "grad_norm": 1.8533916473388672,
107
+ "learning_rate": 5.995e-07,
108
+ "loss": 1.4183,
109
  "step": 1200
110
  },
111
  {
112
  "epoch": 0.0013,
113
+ "grad_norm": 1.7842605113983154,
114
+ "learning_rate": 6.495e-07,
115
+ "loss": 1.4022,
116
  "step": 1300
117
  },
118
  {
119
  "epoch": 0.0014,
120
+ "grad_norm": 1.6869210004806519,
121
+ "learning_rate": 6.995e-07,
122
+ "loss": 1.4055,
123
  "step": 1400
124
  },
125
  {
126
  "epoch": 0.0015,
127
+ "grad_norm": 1.7422460317611694,
128
+ "learning_rate": 7.495000000000001e-07,
129
+ "loss": 1.4112,
130
  "step": 1500
131
  },
132
  {
133
  "epoch": 0.0016,
134
+ "grad_norm": 1.8024979829788208,
135
+ "learning_rate": 7.995e-07,
136
+ "loss": 1.4002,
137
  "step": 1600
138
  },
139
  {
140
  "epoch": 0.0017,
141
+ "grad_norm": 1.8899842500686646,
142
+ "learning_rate": 8.495000000000001e-07,
143
+ "loss": 1.3927,
144
  "step": 1700
145
  },
146
  {
147
  "epoch": 0.0018,
148
+ "grad_norm": 1.9851900339126587,
149
+ "learning_rate": 8.995000000000001e-07,
150
+ "loss": 1.4143,
151
  "step": 1800
152
  },
153
  {
154
  "epoch": 0.0019,
155
+ "grad_norm": 1.8107774257659912,
156
+ "learning_rate": 9.495000000000002e-07,
157
+ "loss": 1.4056,
158
  "step": 1900
159
  },
160
  {
161
  "epoch": 0.002,
162
+ "grad_norm": 1.723138689994812,
163
+ "learning_rate": 9.995000000000001e-07,
164
+ "loss": 1.3832,
165
  "step": 2000
166
  },
167
  {
168
  "epoch": 0.002,
169
+ "eval_loss": 1.4142277240753174,
170
+ "eval_runtime": 24.9444,
171
+ "eval_samples_per_second": 200.446,
172
+ "eval_steps_per_second": 3.167,
173
  "step": 2000
174
  },
175
  {
176
  "epoch": 0.0021,
177
+ "grad_norm": 1.8459888696670532,
178
+ "learning_rate": 1.0495e-06,
179
+ "loss": 1.4003,
180
  "step": 2100
181
  },
182
  {
183
  "epoch": 0.0022,
184
+ "grad_norm": 1.8294060230255127,
185
+ "learning_rate": 1.0995000000000002e-06,
186
+ "loss": 1.4083,
187
  "step": 2200
188
  },
189
  {
190
  "epoch": 0.0023,
191
+ "grad_norm": 1.8045350313186646,
192
+ "learning_rate": 1.1495e-06,
193
+ "loss": 1.408,
194
  "step": 2300
195
  },
196
  {
197
  "epoch": 0.0024,
198
+ "grad_norm": 1.7030452489852905,
199
+ "learning_rate": 1.1995000000000001e-06,
200
+ "loss": 1.3929,
201
  "step": 2400
202
  },
203
  {
204
  "epoch": 0.0025,
205
+ "grad_norm": 1.6902519464492798,
206
+ "learning_rate": 1.2495e-06,
207
+ "loss": 1.3857,
208
  "step": 2500
209
  },
210
  {
211
  "epoch": 0.0026,
212
+ "grad_norm": 1.7536392211914062,
213
+ "learning_rate": 1.2995000000000002e-06,
214
+ "loss": 1.4034,
215
  "step": 2600
216
  },
217
  {
218
  "epoch": 0.0027,
219
+ "grad_norm": 1.9011949300765991,
220
+ "learning_rate": 1.3495e-06,
221
+ "loss": 1.411,
222
  "step": 2700
223
  },
224
  {
225
  "epoch": 0.0028,
226
+ "grad_norm": 1.8409913778305054,
227
+ "learning_rate": 1.3995000000000001e-06,
228
+ "loss": 1.4002,
229
  "step": 2800
230
  },
231
  {
232
  "epoch": 0.0029,
233
+ "grad_norm": 1.832072377204895,
234
+ "learning_rate": 1.4495e-06,
235
+ "loss": 1.4111,
236
  "step": 2900
237
  },
238
  {
239
  "epoch": 0.003,
240
+ "grad_norm": 1.7680649757385254,
241
+ "learning_rate": 1.4995000000000002e-06,
242
+ "loss": 1.3953,
243
  "step": 3000
244
  },
245
  {
246
  "epoch": 0.003,
247
+ "eval_loss": 1.392706274986267,
248
+ "eval_runtime": 23.9892,
249
+ "eval_samples_per_second": 208.427,
250
+ "eval_steps_per_second": 3.293,
251
  "step": 3000
252
  },
253
  {
254
  "epoch": 0.0031,
255
+ "grad_norm": 1.8069273233413696,
256
+ "learning_rate": 1.5495000000000002e-06,
257
+ "loss": 1.4041,
258
  "step": 3100
259
  },
260
  {
261
  "epoch": 0.0032,
262
+ "grad_norm": 1.9078863859176636,
263
+ "learning_rate": 1.5995000000000003e-06,
264
+ "loss": 1.3922,
265
  "step": 3200
266
  },
267
  {
268
  "epoch": 0.0033,
269
+ "grad_norm": 1.8206089735031128,
270
+ "learning_rate": 1.6495000000000003e-06,
271
+ "loss": 1.3909,
272
  "step": 3300
273
  },
274
  {
275
  "epoch": 0.0034,
276
+ "grad_norm": 1.850622296333313,
277
+ "learning_rate": 1.6995e-06,
278
+ "loss": 1.3866,
279
  "step": 3400
280
  },
281
  {
282
  "epoch": 0.0035,
283
+ "grad_norm": 1.823333978652954,
284
+ "learning_rate": 1.7495000000000001e-06,
285
+ "loss": 1.3991,
286
  "step": 3500
287
  },
288
  {
289
  "epoch": 0.0036,
290
+ "grad_norm": 1.7618579864501953,
291
+ "learning_rate": 1.7995e-06,
292
+ "loss": 1.3932,
293
  "step": 3600
294
  },
295
  {
296
  "epoch": 0.0037,
297
+ "grad_norm": 1.778365135192871,
298
+ "learning_rate": 1.8495000000000002e-06,
299
+ "loss": 1.4059,
300
  "step": 3700
301
  },
302
  {
303
  "epoch": 0.0038,
304
+ "grad_norm": 1.757568359375,
305
+ "learning_rate": 1.8995000000000002e-06,
306
+ "loss": 1.4031,
307
  "step": 3800
308
  },
309
  {
310
  "epoch": 0.0039,
311
+ "grad_norm": 1.9897911548614502,
312
+ "learning_rate": 1.9495e-06,
313
+ "loss": 1.3958,
314
  "step": 3900
315
  },
316
  {
317
  "epoch": 0.004,
318
+ "grad_norm": 1.789900302886963,
319
+ "learning_rate": 1.9995e-06,
320
+ "loss": 1.3836,
321
  "step": 4000
322
  },
323
  {
324
  "epoch": 0.004,
325
+ "eval_loss": 1.3976104259490967,
326
+ "eval_runtime": 24.0065,
327
+ "eval_samples_per_second": 208.277,
328
+ "eval_steps_per_second": 3.291,
329
  "step": 4000
330
  },
331
  {
332
  "epoch": 0.0041,
333
+ "grad_norm": 1.9247645139694214,
334
+ "learning_rate": 2.0495e-06,
335
+ "loss": 1.4072,
336
  "step": 4100
337
  },
338
  {
339
  "epoch": 0.0042,
340
+ "grad_norm": 1.8118959665298462,
341
+ "learning_rate": 2.0995e-06,
342
+ "loss": 1.3973,
343
  "step": 4200
344
  },
345
  {
346
  "epoch": 0.0043,
347
+ "grad_norm": 1.8579908609390259,
348
+ "learning_rate": 2.1495000000000003e-06,
349
+ "loss": 1.3952,
350
  "step": 4300
351
  },
352
  {
353
  "epoch": 0.0044,
354
+ "grad_norm": 1.7695815563201904,
355
+ "learning_rate": 2.1995000000000003e-06,
356
+ "loss": 1.4004,
357
  "step": 4400
358
  },
359
  {
360
  "epoch": 0.0045,
361
+ "grad_norm": 1.7015595436096191,
362
+ "learning_rate": 2.2495000000000002e-06,
363
+ "loss": 1.3951,
364
  "step": 4500
365
  },
366
  {
367
  "epoch": 0.0046,
368
+ "grad_norm": 1.710094690322876,
369
+ "learning_rate": 2.2995e-06,
370
+ "loss": 1.3956,
371
  "step": 4600
372
  },
373
  {
374
  "epoch": 0.0047,
375
+ "grad_norm": 1.748152732849121,
376
+ "learning_rate": 2.3495e-06,
377
+ "loss": 1.3922,
378
  "step": 4700
379
  },
380
  {
381
  "epoch": 0.0048,
382
+ "grad_norm": 1.7241308689117432,
383
+ "learning_rate": 2.3995e-06,
384
+ "loss": 1.398,
385
  "step": 4800
386
  },
387
  {
388
  "epoch": 0.0049,
389
+ "grad_norm": 1.6674290895462036,
390
+ "learning_rate": 2.4495e-06,
391
+ "loss": 1.3975,
392
  "step": 4900
393
  },
394
  {
395
  "epoch": 0.005,
396
+ "grad_norm": 1.8750745058059692,
397
+ "learning_rate": 2.4995000000000004e-06,
398
+ "loss": 1.394,
399
  "step": 5000
400
  },
401
  {
402
  "epoch": 0.005,
403
+ "eval_loss": 1.393296480178833,
404
+ "eval_runtime": 24.0133,
405
+ "eval_samples_per_second": 208.218,
406
+ "eval_steps_per_second": 3.29,
407
  "step": 5000
408
  },
409
  {
410
  "epoch": 0.0051,
411
+ "grad_norm": 1.6334375143051147,
412
+ "learning_rate": 2.5495000000000003e-06,
413
+ "loss": 1.3927,
414
  "step": 5100
415
  },
416
  {
417
  "epoch": 0.0052,
418
+ "grad_norm": 1.7477636337280273,
419
+ "learning_rate": 2.5995000000000003e-06,
420
+ "loss": 1.398,
421
  "step": 5200
422
  },
423
  {
424
  "epoch": 0.0053,
425
+ "grad_norm": 1.6373661756515503,
426
+ "learning_rate": 2.6495000000000002e-06,
427
+ "loss": 1.4044,
428
  "step": 5300
429
  },
430
  {
431
  "epoch": 0.0054,
432
+ "grad_norm": 1.9190561771392822,
433
+ "learning_rate": 2.6995000000000006e-06,
434
+ "loss": 1.4047,
435
  "step": 5400
436
  },
437
  {
438
  "epoch": 0.0055,
439
+ "grad_norm": 1.708715796470642,
440
+ "learning_rate": 2.7495000000000005e-06,
441
+ "loss": 1.4035,
442
  "step": 5500
443
  },
444
  {
445
  "epoch": 0.0056,
446
+ "grad_norm": 1.772215723991394,
447
+ "learning_rate": 2.7995e-06,
448
+ "loss": 1.3896,
449
  "step": 5600
450
  },
451
  {
452
  "epoch": 0.0057,
453
+ "grad_norm": 1.8555290699005127,
454
+ "learning_rate": 2.8495e-06,
455
+ "loss": 1.4112,
456
  "step": 5700
457
  },
458
  {
459
  "epoch": 0.0058,
460
+ "grad_norm": 1.8145737648010254,
461
+ "learning_rate": 2.8995e-06,
462
+ "loss": 1.4022,
463
  "step": 5800
464
  },
465
  {
466
  "epoch": 0.0059,
467
+ "grad_norm": 1.9936249256134033,
468
+ "learning_rate": 2.9495000000000003e-06,
469
+ "loss": 1.3897,
470
  "step": 5900
471
  },
472
  {
473
  "epoch": 0.006,
474
+ "grad_norm": 1.8583012819290161,
475
+ "learning_rate": 2.9995000000000003e-06,
476
+ "loss": 1.3981,
477
  "step": 6000
478
  },
479
  {
480
  "epoch": 0.006,
481
+ "eval_loss": 1.4079989194869995,
482
+ "eval_runtime": 24.0862,
483
+ "eval_samples_per_second": 207.588,
484
+ "eval_steps_per_second": 3.28,
485
  "step": 6000
486
  },
487
  {
488
  "epoch": 0.0061,
489
+ "grad_norm": 1.9386086463928223,
490
+ "learning_rate": 3.0495e-06,
491
+ "loss": 1.3999,
492
  "step": 6100
493
  },
494
  {
495
  "epoch": 0.0062,
496
+ "grad_norm": 1.8759804964065552,
497
+ "learning_rate": 3.0995e-06,
498
+ "loss": 1.3985,
499
  "step": 6200
500
  },
501
  {
502
  "epoch": 0.0063,
503
+ "grad_norm": 1.764291763305664,
504
+ "learning_rate": 3.1495000000000005e-06,
505
+ "loss": 1.3913,
506
  "step": 6300
507
  },
508
  {
509
  "epoch": 0.0064,
510
+ "grad_norm": 1.7575565576553345,
511
+ "learning_rate": 3.1995000000000005e-06,
512
+ "loss": 1.386,
513
  "step": 6400
514
  },
515
  {
516
  "epoch": 0.0065,
517
+ "grad_norm": 1.718859314918518,
518
+ "learning_rate": 3.2495000000000004e-06,
519
+ "loss": 1.3982,
520
  "step": 6500
521
  },
522
  {
523
  "epoch": 0.0066,
524
+ "grad_norm": 1.9030193090438843,
525
+ "learning_rate": 3.2995000000000003e-06,
526
+ "loss": 1.4042,
527
  "step": 6600
528
  },
529
  {
530
  "epoch": 0.0067,
531
+ "grad_norm": 2.0007975101470947,
532
+ "learning_rate": 3.3495000000000007e-06,
533
+ "loss": 1.3914,
534
  "step": 6700
535
  },
536
  {
537
  "epoch": 0.0068,
538
+ "grad_norm": 1.9459850788116455,
539
+ "learning_rate": 3.3995000000000002e-06,
540
+ "loss": 1.3971,
541
  "step": 6800
542
  },
543
  {
544
  "epoch": 0.0069,
545
+ "grad_norm": 1.8755849599838257,
546
+ "learning_rate": 3.4495e-06,
547
+ "loss": 1.3866,
548
  "step": 6900
549
  },
550
  {
551
  "epoch": 0.007,
552
+ "grad_norm": 1.9434016942977905,
553
+ "learning_rate": 3.4995e-06,
554
+ "loss": 1.4039,
555
  "step": 7000
556
  },
557
  {
558
  "epoch": 0.007,
559
+ "eval_loss": 1.4000722169876099,
560
+ "eval_runtime": 24.0943,
561
+ "eval_samples_per_second": 207.518,
562
+ "eval_steps_per_second": 3.279,
563
  "step": 7000
564
  },
565
  {
566
  "epoch": 0.0071,
567
+ "grad_norm": 1.8206807374954224,
568
+ "learning_rate": 3.5495e-06,
569
+ "loss": 1.3962,
570
  "step": 7100
571
  },
572
  {
573
  "epoch": 0.0072,
574
+ "grad_norm": 1.8879231214523315,
575
+ "learning_rate": 3.5995e-06,
576
+ "loss": 1.3973,
577
  "step": 7200
578
  },
579
  {
580
  "epoch": 0.0073,
581
+ "grad_norm": 1.8702274560928345,
582
+ "learning_rate": 3.6495000000000004e-06,
583
+ "loss": 1.401,
584
  "step": 7300
585
  },
586
  {
587
  "epoch": 0.0074,
588
+ "grad_norm": 1.8962584733963013,
589
+ "learning_rate": 3.6995000000000003e-06,
590
+ "loss": 1.4069,
591
  "step": 7400
592
  },
593
  {
594
  "epoch": 0.0075,
595
+ "grad_norm": 1.6658656597137451,
596
+ "learning_rate": 3.7495000000000003e-06,
597
+ "loss": 1.386,
598
  "step": 7500
599
  },
600
  {
601
  "epoch": 0.0076,
602
+ "grad_norm": 1.5985746383666992,
603
+ "learning_rate": 3.7995000000000002e-06,
604
+ "loss": 1.387,
605
  "step": 7600
606
  },
607
  {
608
  "epoch": 0.0077,
609
+ "grad_norm": 1.8246830701828003,
610
+ "learning_rate": 3.8495e-06,
611
+ "loss": 1.3994,
612
  "step": 7700
613
  },
614
  {
615
  "epoch": 0.0078,
616
+ "grad_norm": 1.775139570236206,
617
+ "learning_rate": 3.8995000000000005e-06,
618
+ "loss": 1.3901,
619
  "step": 7800
620
  },
621
  {
622
  "epoch": 0.0079,
623
+ "grad_norm": 1.9915419816970825,
624
+ "learning_rate": 3.949500000000001e-06,
625
+ "loss": 1.3979,
626
  "step": 7900
627
  },
628
  {
629
  "epoch": 0.008,
630
+ "grad_norm": 1.8422917127609253,
631
+ "learning_rate": 3.9995e-06,
632
+ "loss": 1.4017,
633
  "step": 8000
634
  },
635
  {
636
  "epoch": 0.008,
637
+ "eval_loss": 1.3942360877990723,
638
+ "eval_runtime": 24.0764,
639
+ "eval_samples_per_second": 207.672,
640
+ "eval_steps_per_second": 3.281,
641
  "step": 8000
642
  },
643
  {
644
  "epoch": 0.0081,
645
+ "grad_norm": 1.727967381477356,
646
+ "learning_rate": 4.0495e-06,
647
+ "loss": 1.3918,
648
  "step": 8100
649
  },
650
  {
651
  "epoch": 0.0082,
652
+ "grad_norm": 1.7638025283813477,
653
+ "learning_rate": 4.0995e-06,
654
+ "loss": 1.3989,
655
  "step": 8200
656
  },
657
  {
658
  "epoch": 0.0083,
659
+ "grad_norm": 1.8059898614883423,
660
+ "learning_rate": 4.1495e-06,
661
+ "loss": 1.3976,
662
  "step": 8300
663
  },
664
  {
665
  "epoch": 0.0084,
666
+ "grad_norm": 1.9645200967788696,
667
+ "learning_rate": 4.1995e-06,
668
+ "loss": 1.3845,
669
  "step": 8400
670
  },
671
  {
672
  "epoch": 0.0085,
673
+ "grad_norm": 1.98493230342865,
674
+ "learning_rate": 4.2495000000000006e-06,
675
+ "loss": 1.4068,
676
  "step": 8500
677
  },
678
  {
679
  "epoch": 0.0086,
680
+ "grad_norm": 1.8089314699172974,
681
+ "learning_rate": 4.2995e-06,
682
+ "loss": 1.4057,
683
  "step": 8600
684
  },
685
  {
686
  "epoch": 0.0087,
687
+ "grad_norm": 1.7342643737792969,
688
+ "learning_rate": 4.3495000000000005e-06,
689
+ "loss": 1.4077,
690
  "step": 8700
691
  },
692
  {
693
  "epoch": 0.0088,
694
+ "grad_norm": 1.7645128965377808,
695
+ "learning_rate": 4.399500000000001e-06,
696
+ "loss": 1.3974,
697
  "step": 8800
698
  },
699
  {
700
  "epoch": 0.0089,
701
+ "grad_norm": 1.7658684253692627,
702
+ "learning_rate": 4.4495e-06,
703
+ "loss": 1.3819,
704
  "step": 8900
705
  },
706
  {
707
  "epoch": 0.009,
708
+ "grad_norm": 1.8355954885482788,
709
+ "learning_rate": 4.499500000000001e-06,
710
+ "loss": 1.4016,
711
  "step": 9000
712
  },
713
  {
714
  "epoch": 0.009,
715
+ "eval_loss": 1.3868581056594849,
716
+ "eval_runtime": 24.1145,
717
+ "eval_samples_per_second": 207.344,
718
+ "eval_steps_per_second": 3.276,
719
  "step": 9000
720
  },
721
  {
722
  "epoch": 0.0091,
723
+ "grad_norm": 1.7402777671813965,
724
+ "learning_rate": 4.5495e-06,
725
+ "loss": 1.3971,
726
  "step": 9100
727
  },
728
  {
729
  "epoch": 0.0092,
730
+ "grad_norm": 2.0781781673431396,
731
+ "learning_rate": 4.599500000000001e-06,
732
+ "loss": 1.4013,
733
  "step": 9200
734
  },
735
  {
736
  "epoch": 0.0093,
737
+ "grad_norm": 1.8011542558670044,
738
+ "learning_rate": 4.6495e-06,
739
+ "loss": 1.406,
740
  "step": 9300
741
  },
742
  {
743
  "epoch": 0.0094,
744
+ "grad_norm": 1.9290459156036377,
745
+ "learning_rate": 4.6995000000000005e-06,
746
+ "loss": 1.4083,
747
  "step": 9400
748
  },
749
  {
750
  "epoch": 0.0095,
751
+ "grad_norm": 1.7394887208938599,
752
+ "learning_rate": 4.7495e-06,
753
+ "loss": 1.3898,
754
  "step": 9500
755
  },
756
  {
757
  "epoch": 0.0096,
758
+ "grad_norm": 1.845847249031067,
759
+ "learning_rate": 4.7995e-06,
760
+ "loss": 1.416,
761
  "step": 9600
762
  },
763
  {
764
  "epoch": 0.0097,
765
+ "grad_norm": 1.9406960010528564,
766
+ "learning_rate": 4.8495e-06,
767
+ "loss": 1.3893,
768
  "step": 9700
769
  },
770
  {
771
  "epoch": 0.0098,
772
+ "grad_norm": 1.7145395278930664,
773
+ "learning_rate": 4.8995e-06,
774
+ "loss": 1.394,
775
  "step": 9800
776
  },
777
  {
778
  "epoch": 0.0099,
779
+ "grad_norm": 1.748119831085205,
780
+ "learning_rate": 4.949500000000001e-06,
781
+ "loss": 1.4016,
782
  "step": 9900
783
  },
784
  {
785
  "epoch": 0.01,
786
+ "grad_norm": 1.7311230897903442,
787
+ "learning_rate": 4.9995e-06,
788
+ "loss": 1.3965,
789
  "step": 10000
790
  },
791
  {
792
  "epoch": 0.01,
793
+ "eval_loss": 1.408516764640808,
794
+ "eval_runtime": 24.1385,
795
+ "eval_samples_per_second": 207.138,
796
+ "eval_steps_per_second": 3.273,
797
  "step": 10000
798
  },
799
  {
800
  "epoch": 0.0101,
801
+ "grad_norm": 1.8347134590148926,
802
+ "learning_rate": 4.999999876629946e-06,
803
+ "loss": 1.4036,
804
  "step": 10100
805
  },
806
  {
807
  "epoch": 0.0102,
808
+ "grad_norm": 1.7432448863983154,
809
+ "learning_rate": 4.999999501522561e-06,
810
+ "loss": 1.3914,
811
  "step": 10200
812
  },
813
  {
814
  "epoch": 0.0103,
815
+ "grad_norm": 1.796655297279358,
816
+ "learning_rate": 4.999998874665294e-06,
817
+ "loss": 1.397,
818
  "step": 10300
819
  },
820
  {
821
  "epoch": 0.0104,
822
+ "grad_norm": 1.7895489931106567,
823
+ "learning_rate": 4.999997996058208e-06,
824
+ "loss": 1.3876,
825
  "step": 10400
826
  },
827
  {
828
  "epoch": 0.0105,
829
+ "grad_norm": 1.865824580192566,
830
+ "learning_rate": 4.999996865701393e-06,
831
+ "loss": 1.4007,
832
  "step": 10500
833
  },
834
  {
835
  "epoch": 0.0106,
836
+ "grad_norm": 1.9039396047592163,
837
+ "learning_rate": 4.999995483594962e-06,
838
+ "loss": 1.3926,
839
  "step": 10600
840
  },
841
  {
842
  "epoch": 0.0107,
843
+ "grad_norm": 1.7815436124801636,
844
+ "learning_rate": 4.9999938497390545e-06,
845
+ "loss": 1.4058,
846
  "step": 10700
847
  },
848
  {
849
  "epoch": 0.0108,
850
+ "grad_norm": 1.8762564659118652,
851
+ "learning_rate": 4.999991964133834e-06,
852
+ "loss": 1.4,
853
  "step": 10800
854
  },
855
  {
856
  "epoch": 0.0109,
857
+ "grad_norm": 1.8858652114868164,
858
+ "learning_rate": 4.999989826779491e-06,
859
+ "loss": 1.3878,
860
  "step": 10900
861
  },
862
  {
863
  "epoch": 0.011,
864
+ "grad_norm": 1.6531654596328735,
865
+ "learning_rate": 4.999987437676241e-06,
866
+ "loss": 1.3956,
867
  "step": 11000
868
  },
869
  {
870
  "epoch": 0.011,
871
+ "eval_loss": 1.3868032693862915,
872
+ "eval_runtime": 24.1662,
873
+ "eval_samples_per_second": 206.901,
874
+ "eval_steps_per_second": 3.269,
875
  "step": 11000
876
  },
877
  {
878
  "epoch": 0.0111,
879
+ "grad_norm": 1.8017290830612183,
880
+ "learning_rate": 4.999984796824326e-06,
881
+ "loss": 1.3955,
882
  "step": 11100
883
  },
884
  {
885
  "epoch": 0.0112,
886
+ "grad_norm": 1.6868501901626587,
887
+ "learning_rate": 4.999981904224008e-06,
888
+ "loss": 1.3991,
889
  "step": 11200
890
  },
891
  {
892
  "epoch": 0.0113,
893
+ "grad_norm": 1.729891061782837,
894
+ "learning_rate": 4.999978759875582e-06,
895
+ "loss": 1.3921,
896
  "step": 11300
897
  },
898
  {
899
  "epoch": 0.0114,
900
+ "grad_norm": 1.7264313697814941,
901
+ "learning_rate": 4.9999753637793636e-06,
902
+ "loss": 1.4006,
903
  "step": 11400
904
  },
905
  {
906
  "epoch": 0.0115,
907
+ "grad_norm": 1.9104857444763184,
908
+ "learning_rate": 4.999971715935694e-06,
909
+ "loss": 1.3952,
910
  "step": 11500
911
  },
912
  {
913
  "epoch": 0.0116,
914
+ "grad_norm": 1.970773458480835,
915
+ "learning_rate": 4.99996781634494e-06,
916
+ "loss": 1.4001,
917
  "step": 11600
918
  },
919
  {
920
  "epoch": 0.0117,
921
+ "grad_norm": 1.7760286331176758,
922
+ "learning_rate": 4.9999636650074965e-06,
923
+ "loss": 1.3916,
924
  "step": 11700
925
  },
926
  {
927
  "epoch": 0.0118,
928
+ "grad_norm": 1.8842190504074097,
929
+ "learning_rate": 4.99995926192378e-06,
930
+ "loss": 1.3913,
931
  "step": 11800
932
  },
933
  {
934
  "epoch": 0.0119,
935
+ "grad_norm": 1.7593873739242554,
936
+ "learning_rate": 4.999954607094235e-06,
937
+ "loss": 1.3962,
938
  "step": 11900
939
  },
940
  {
941
  "epoch": 0.012,
942
+ "grad_norm": 1.6571648120880127,
943
+ "learning_rate": 4.999949700519328e-06,
944
+ "loss": 1.3931,
945
  "step": 12000
946
  },
947
  {
948
  "epoch": 0.012,
949
+ "eval_loss": 1.3871841430664062,
950
+ "eval_runtime": 24.1968,
951
+ "eval_samples_per_second": 206.639,
952
+ "eval_steps_per_second": 3.265,
953
  "step": 12000
954
  },
955
  {
956
  "epoch": 0.0121,
957
+ "grad_norm": 1.843850016593933,
958
+ "learning_rate": 4.999944542199555e-06,
959
+ "loss": 1.3916,
960
  "step": 12100
961
  },
962
  {
963
  "epoch": 0.0122,
964
+ "grad_norm": 1.7486441135406494,
965
+ "learning_rate": 4.999939132135436e-06,
966
+ "loss": 1.3958,
967
  "step": 12200
968
  },
969
  {
970
  "epoch": 0.0123,
971
+ "grad_norm": 1.875173568725586,
972
+ "learning_rate": 4.999933470327514e-06,
973
+ "loss": 1.3944,
974
  "step": 12300
975
  },
976
  {
977
  "epoch": 0.0124,
978
+ "grad_norm": 1.9309207201004028,
979
+ "learning_rate": 4.99992755677636e-06,
980
+ "loss": 1.4007,
981
  "step": 12400
982
  },
983
  {
984
  "epoch": 0.0125,
985
+ "grad_norm": 1.6403874158859253,
986
+ "learning_rate": 4.9999213914825695e-06,
987
+ "loss": 1.3969,
988
  "step": 12500
989
  },
990
  {
991
  "epoch": 0.0126,
992
+ "grad_norm": 1.9816149473190308,
993
+ "learning_rate": 4.999914974446764e-06,
994
+ "loss": 1.3758,
995
  "step": 12600
996
  },
997
  {
998
  "epoch": 0.0127,
999
+ "grad_norm": 1.7469569444656372,
1000
+ "learning_rate": 4.999908305669587e-06,
1001
+ "loss": 1.3926,
1002
  "step": 12700
1003
  },
1004
  {
1005
  "epoch": 0.0128,
1006
+ "grad_norm": 1.818030595779419,
1007
+ "learning_rate": 4.999901385151713e-06,
1008
+ "loss": 1.3845,
1009
  "step": 12800
1010
  },
1011
  {
1012
  "epoch": 0.0129,
1013
+ "grad_norm": 1.8535512685775757,
1014
+ "learning_rate": 4.999894212893839e-06,
1015
+ "loss": 1.4021,
1016
  "step": 12900
1017
  },
1018
  {
1019
  "epoch": 0.013,
1020
+ "grad_norm": 1.8381346464157104,
1021
+ "learning_rate": 4.999886788896685e-06,
1022
+ "loss": 1.3958,
1023
  "step": 13000
1024
  },
1025
  {
1026
  "epoch": 0.013,
1027
+ "eval_loss": 1.3848538398742676,
1028
+ "eval_runtime": 24.2052,
1029
+ "eval_samples_per_second": 206.568,
1030
+ "eval_steps_per_second": 3.264,
1031
  "step": 13000
1032
  },
1033
  {
1034
  "epoch": 0.0131,
1035
+ "grad_norm": 1.9233248233795166,
1036
+ "learning_rate": 4.999879113161e-06,
1037
+ "loss": 1.3985,
1038
  "step": 13100
1039
  },
1040
  {
1041
  "epoch": 0.0132,
1042
+ "grad_norm": 1.8300249576568604,
1043
+ "learning_rate": 4.999871185687557e-06,
1044
+ "loss": 1.3855,
1045
  "step": 13200
1046
  },
1047
  {
1048
  "epoch": 0.0133,
1049
+ "grad_norm": 1.789095163345337,
1050
+ "learning_rate": 4.999863006477154e-06,
1051
+ "loss": 1.3905,
1052
  "step": 13300
1053
  },
1054
  {
1055
  "epoch": 0.0134,
1056
+ "grad_norm": 1.7962862253189087,
1057
+ "learning_rate": 4.999854575530614e-06,
1058
+ "loss": 1.4135,
1059
  "step": 13400
1060
  },
1061
  {
1062
  "epoch": 0.0135,
1063
+ "grad_norm": 1.8320449590682983,
1064
+ "learning_rate": 4.999845892848787e-06,
1065
+ "loss": 1.4024,
1066
  "step": 13500
1067
  },
1068
  {
1069
  "epoch": 0.0136,
1070
+ "grad_norm": 1.848564863204956,
1071
+ "learning_rate": 4.999836958432547e-06,
1072
+ "loss": 1.4067,
1073
  "step": 13600
1074
  },
1075
  {
1076
  "epoch": 0.0137,
1077
+ "grad_norm": 1.8709757328033447,
1078
+ "learning_rate": 4.999827772282793e-06,
1079
+ "loss": 1.3817,
1080
  "step": 13700
1081
  },
1082
  {
1083
  "epoch": 0.0138,
1084
+ "grad_norm": 1.8266668319702148,
1085
+ "learning_rate": 4.999818334400452e-06,
1086
+ "loss": 1.3838,
1087
  "step": 13800
1088
  },
1089
  {
1090
  "epoch": 0.0139,
1091
+ "grad_norm": 1.8056071996688843,
1092
+ "learning_rate": 4.999808644786473e-06,
1093
+ "loss": 1.3817,
1094
  "step": 13900
1095
  },
1096
  {
1097
  "epoch": 0.014,
1098
+ "grad_norm": 1.703490138053894,
1099
+ "learning_rate": 4.999798703441832e-06,
1100
+ "loss": 1.3996,
1101
  "step": 14000
1102
  },
1103
  {
1104
  "epoch": 0.014,
1105
+ "eval_loss": 1.3908612728118896,
1106
+ "eval_runtime": 24.2804,
1107
+ "eval_samples_per_second": 205.927,
1108
+ "eval_steps_per_second": 3.254,
1109
  "step": 14000
1110
  },
1111
  {
1112
  "epoch": 0.0141,
1113
+ "grad_norm": 1.6894255876541138,
1114
+ "learning_rate": 4.999788510367529e-06,
1115
+ "loss": 1.4134,
1116
  "step": 14100
1117
  },
1118
  {
1119
  "epoch": 0.0142,
1120
+ "grad_norm": 1.7173762321472168,
1121
+ "learning_rate": 4.999778065564592e-06,
1122
+ "loss": 1.4046,
1123
  "step": 14200
1124
  },
1125
  {
1126
  "epoch": 0.0143,
1127
+ "grad_norm": 1.7411465644836426,
1128
+ "learning_rate": 4.999767369034072e-06,
1129
+ "loss": 1.4058,
1130
  "step": 14300
1131
  },
1132
  {
1133
  "epoch": 0.0144,
1134
+ "grad_norm": 1.9577834606170654,
1135
+ "learning_rate": 4.999756420777047e-06,
1136
+ "loss": 1.4061,
1137
  "step": 14400
1138
  },
1139
  {
1140
  "epoch": 0.0145,
1141
+ "grad_norm": 1.8880082368850708,
1142
+ "learning_rate": 4.999745220794618e-06,
1143
+ "loss": 1.4008,
1144
  "step": 14500
1145
  },
1146
  {
1147
  "epoch": 0.0146,
1148
+ "grad_norm": 1.7882862091064453,
1149
+ "learning_rate": 4.999733769087913e-06,
1150
+ "loss": 1.3991,
1151
  "step": 14600
1152
  },
1153
  {
1154
  "epoch": 0.0147,
1155
+ "grad_norm": 1.8364261388778687,
1156
+ "learning_rate": 4.999722065658087e-06,
1157
+ "loss": 1.3999,
1158
  "step": 14700
1159
  },
1160
  {
1161
  "epoch": 0.0148,
1162
+ "grad_norm": 1.8236931562423706,
1163
+ "learning_rate": 4.9997101105063175e-06,
1164
+ "loss": 1.4065,
1165
  "step": 14800
1166
  },
1167
  {
1168
  "epoch": 0.0149,
1169
+ "grad_norm": 1.8798736333847046,
1170
+ "learning_rate": 4.999697903633808e-06,
1171
+ "loss": 1.4183,
1172
  "step": 14900
1173
  },
1174
  {
1175
  "epoch": 0.015,
1176
+ "grad_norm": 1.9224145412445068,
1177
+ "learning_rate": 4.999685445041788e-06,
1178
+ "loss": 1.4229,
1179
  "step": 15000
1180
  },
1181
  {
1182
  "epoch": 0.015,
1183
+ "eval_loss": 1.3955188989639282,
1184
+ "eval_runtime": 24.2626,
1185
+ "eval_samples_per_second": 206.079,
1186
+ "eval_steps_per_second": 3.256,
1187
  "step": 15000
1188
+ },
1189
+ {
1190
+ "epoch": 0.0151,
1191
+ "grad_norm": 1.8474870920181274,
1192
+ "learning_rate": 4.999672734731511e-06,
1193
+ "loss": 1.4072,
1194
+ "step": 15100
1195
+ },
1196
+ {
1197
+ "epoch": 0.0152,
1198
+ "grad_norm": 1.7506942749023438,
1199
+ "learning_rate": 4.9996597727042596e-06,
1200
+ "loss": 1.4015,
1201
+ "step": 15200
1202
+ },
1203
+ {
1204
+ "epoch": 0.0153,
1205
+ "grad_norm": 1.9011894464492798,
1206
+ "learning_rate": 4.999646558961337e-06,
1207
+ "loss": 1.4073,
1208
+ "step": 15300
1209
+ },
1210
+ {
1211
+ "epoch": 0.0154,
1212
+ "grad_norm": 1.809842586517334,
1213
+ "learning_rate": 4.999633093504074e-06,
1214
+ "loss": 1.4026,
1215
+ "step": 15400
1216
+ },
1217
+ {
1218
+ "epoch": 0.0155,
1219
+ "grad_norm": 1.9759999513626099,
1220
+ "learning_rate": 4.999619376333827e-06,
1221
+ "loss": 1.4043,
1222
+ "step": 15500
1223
+ },
1224
+ {
1225
+ "epoch": 0.0156,
1226
+ "grad_norm": 1.892358422279358,
1227
+ "learning_rate": 4.999605407451977e-06,
1228
+ "loss": 1.4081,
1229
+ "step": 15600
1230
+ },
1231
+ {
1232
+ "epoch": 0.0157,
1233
+ "grad_norm": 1.8095463514328003,
1234
+ "learning_rate": 4.999591186859931e-06,
1235
+ "loss": 1.3961,
1236
+ "step": 15700
1237
+ },
1238
+ {
1239
+ "epoch": 0.0158,
1240
+ "grad_norm": 1.7918623685836792,
1241
+ "learning_rate": 4.999576714559121e-06,
1242
+ "loss": 1.4043,
1243
+ "step": 15800
1244
+ },
1245
+ {
1246
+ "epoch": 0.0159,
1247
+ "grad_norm": 1.7738561630249023,
1248
+ "learning_rate": 4.999561990551004e-06,
1249
+ "loss": 1.3977,
1250
+ "step": 15900
1251
+ },
1252
+ {
1253
+ "epoch": 0.016,
1254
+ "grad_norm": 1.9365230798721313,
1255
+ "learning_rate": 4.999547014837064e-06,
1256
+ "loss": 1.3994,
1257
+ "step": 16000
1258
+ },
1259
+ {
1260
+ "epoch": 0.016,
1261
+ "eval_loss": 1.3911411762237549,
1262
+ "eval_runtime": 24.3275,
1263
+ "eval_samples_per_second": 205.529,
1264
+ "eval_steps_per_second": 3.247,
1265
+ "step": 16000
1266
+ },
1267
+ {
1268
+ "epoch": 0.0161,
1269
+ "grad_norm": 1.869052767753601,
1270
+ "learning_rate": 4.9995317874188065e-06,
1271
+ "loss": 1.4078,
1272
+ "step": 16100
1273
+ },
1274
+ {
1275
+ "epoch": 0.0162,
1276
+ "grad_norm": 1.9397051334381104,
1277
+ "learning_rate": 4.999516308297767e-06,
1278
+ "loss": 1.4029,
1279
+ "step": 16200
1280
+ },
1281
+ {
1282
+ "epoch": 0.0163,
1283
+ "grad_norm": 1.6483432054519653,
1284
+ "learning_rate": 4.999500577475504e-06,
1285
+ "loss": 1.3943,
1286
+ "step": 16300
1287
+ },
1288
+ {
1289
+ "epoch": 0.0164,
1290
+ "grad_norm": 2.034252166748047,
1291
+ "learning_rate": 4.999484594953601e-06,
1292
+ "loss": 1.4213,
1293
+ "step": 16400
1294
+ },
1295
+ {
1296
+ "epoch": 0.0165,
1297
+ "grad_norm": 1.737054705619812,
1298
+ "learning_rate": 4.9994683607336675e-06,
1299
+ "loss": 1.397,
1300
+ "step": 16500
1301
+ },
1302
+ {
1303
+ "epoch": 0.0166,
1304
+ "grad_norm": 1.789985179901123,
1305
+ "learning_rate": 4.999451874817338e-06,
1306
+ "loss": 1.3983,
1307
+ "step": 16600
1308
+ },
1309
+ {
1310
+ "epoch": 0.0167,
1311
+ "grad_norm": 1.7947198152542114,
1312
+ "learning_rate": 4.999435137206274e-06,
1313
+ "loss": 1.4063,
1314
+ "step": 16700
1315
+ },
1316
+ {
1317
+ "epoch": 0.0168,
1318
+ "grad_norm": 1.9025126695632935,
1319
+ "learning_rate": 4.999418147902159e-06,
1320
+ "loss": 1.4006,
1321
+ "step": 16800
1322
+ },
1323
+ {
1324
+ "epoch": 0.0169,
1325
+ "grad_norm": 1.7516543865203857,
1326
+ "learning_rate": 4.999400906906707e-06,
1327
+ "loss": 1.4054,
1328
+ "step": 16900
1329
+ },
1330
+ {
1331
+ "epoch": 0.017,
1332
+ "grad_norm": 1.6800352334976196,
1333
+ "learning_rate": 4.99938341422165e-06,
1334
+ "loss": 1.3978,
1335
+ "step": 17000
1336
+ },
1337
+ {
1338
+ "epoch": 0.017,
1339
+ "eval_loss": 1.385777473449707,
1340
+ "eval_runtime": 24.2933,
1341
+ "eval_samples_per_second": 205.818,
1342
+ "eval_steps_per_second": 3.252,
1343
+ "step": 17000
1344
+ },
1345
+ {
1346
+ "epoch": 0.0171,
1347
+ "grad_norm": 1.778942584991455,
1348
+ "learning_rate": 4.999365669848752e-06,
1349
+ "loss": 1.3938,
1350
+ "step": 17100
1351
+ },
1352
+ {
1353
+ "epoch": 0.0172,
1354
+ "grad_norm": 1.8309190273284912,
1355
+ "learning_rate": 4.999347673789801e-06,
1356
+ "loss": 1.4017,
1357
+ "step": 17200
1358
+ },
1359
+ {
1360
+ "epoch": 0.0173,
1361
+ "grad_norm": 1.8281348943710327,
1362
+ "learning_rate": 4.999329426046606e-06,
1363
+ "loss": 1.3959,
1364
+ "step": 17300
1365
+ },
1366
+ {
1367
+ "epoch": 0.0174,
1368
+ "grad_norm": 1.8240395784378052,
1369
+ "learning_rate": 4.999310926621006e-06,
1370
+ "loss": 1.41,
1371
+ "step": 17400
1372
+ },
1373
+ {
1374
+ "epoch": 0.0175,
1375
+ "grad_norm": 2.054227590560913,
1376
+ "learning_rate": 4.9992921755148646e-06,
1377
+ "loss": 1.3994,
1378
+ "step": 17500
1379
+ },
1380
+ {
1381
+ "epoch": 0.0176,
1382
+ "grad_norm": 1.9895967245101929,
1383
+ "learning_rate": 4.99927317273007e-06,
1384
+ "loss": 1.3991,
1385
+ "step": 17600
1386
+ },
1387
+ {
1388
+ "epoch": 0.0177,
1389
+ "grad_norm": 1.7590620517730713,
1390
+ "learning_rate": 4.9992539182685345e-06,
1391
+ "loss": 1.4181,
1392
+ "step": 17700
1393
+ },
1394
+ {
1395
+ "epoch": 0.0178,
1396
+ "grad_norm": 1.8000540733337402,
1397
+ "learning_rate": 4.9992344121321975e-06,
1398
+ "loss": 1.397,
1399
+ "step": 17800
1400
+ },
1401
+ {
1402
+ "epoch": 0.0179,
1403
+ "grad_norm": 1.689989447593689,
1404
+ "learning_rate": 4.999214654323025e-06,
1405
+ "loss": 1.4146,
1406
+ "step": 17900
1407
+ },
1408
+ {
1409
+ "epoch": 0.018,
1410
+ "grad_norm": 1.9092588424682617,
1411
+ "learning_rate": 4.999194644843004e-06,
1412
+ "loss": 1.4044,
1413
+ "step": 18000
1414
+ },
1415
+ {
1416
+ "epoch": 0.018,
1417
+ "eval_loss": 1.3936209678649902,
1418
+ "eval_runtime": 25.2802,
1419
+ "eval_samples_per_second": 197.783,
1420
+ "eval_steps_per_second": 3.125,
1421
+ "step": 18000
1422
+ },
1423
+ {
1424
+ "epoch": 0.0181,
1425
+ "grad_norm": 1.7824491262435913,
1426
+ "learning_rate": 4.999174383694151e-06,
1427
+ "loss": 1.401,
1428
+ "step": 18100
1429
+ },
1430
+ {
1431
+ "epoch": 0.0182,
1432
+ "grad_norm": 1.9961062669754028,
1433
+ "learning_rate": 4.999153870878506e-06,
1434
+ "loss": 1.3982,
1435
+ "step": 18200
1436
+ },
1437
+ {
1438
+ "epoch": 0.0183,
1439
+ "grad_norm": 1.8174325227737427,
1440
+ "learning_rate": 4.999133106398135e-06,
1441
+ "loss": 1.4062,
1442
+ "step": 18300
1443
+ },
1444
+ {
1445
+ "epoch": 0.0184,
1446
+ "grad_norm": 1.807077169418335,
1447
+ "learning_rate": 4.999112090255129e-06,
1448
+ "loss": 1.4022,
1449
+ "step": 18400
1450
+ },
1451
+ {
1452
+ "epoch": 0.0185,
1453
+ "grad_norm": 1.8445507287979126,
1454
+ "learning_rate": 4.9990908224516025e-06,
1455
+ "loss": 1.3893,
1456
+ "step": 18500
1457
+ },
1458
+ {
1459
+ "epoch": 0.0186,
1460
+ "grad_norm": 1.9006763696670532,
1461
+ "learning_rate": 4.999069302989699e-06,
1462
+ "loss": 1.3918,
1463
+ "step": 18600
1464
+ },
1465
+ {
1466
+ "epoch": 0.0187,
1467
+ "grad_norm": 1.981102705001831,
1468
+ "learning_rate": 4.999047531871585e-06,
1469
+ "loss": 1.4038,
1470
+ "step": 18700
1471
+ },
1472
+ {
1473
+ "epoch": 0.0188,
1474
+ "grad_norm": 1.8729362487792969,
1475
+ "learning_rate": 4.9990255090994535e-06,
1476
+ "loss": 1.4021,
1477
+ "step": 18800
1478
+ },
1479
+ {
1480
+ "epoch": 0.0189,
1481
+ "grad_norm": 1.9796510934829712,
1482
+ "learning_rate": 4.999003234675521e-06,
1483
+ "loss": 1.3972,
1484
+ "step": 18900
1485
+ },
1486
+ {
1487
+ "epoch": 0.019,
1488
+ "grad_norm": 1.8760383129119873,
1489
+ "learning_rate": 4.998980708602031e-06,
1490
+ "loss": 1.3998,
1491
+ "step": 19000
1492
+ },
1493
+ {
1494
+ "epoch": 0.019,
1495
+ "eval_loss": 1.3851877450942993,
1496
+ "eval_runtime": 24.3386,
1497
+ "eval_samples_per_second": 205.435,
1498
+ "eval_steps_per_second": 3.246,
1499
+ "step": 19000
1500
+ },
1501
+ {
1502
+ "epoch": 0.0191,
1503
+ "grad_norm": 1.78969144821167,
1504
+ "learning_rate": 4.998957930881253e-06,
1505
+ "loss": 1.3957,
1506
+ "step": 19100
1507
+ },
1508
+ {
1509
+ "epoch": 0.0192,
1510
+ "grad_norm": 1.8471328020095825,
1511
+ "learning_rate": 4.998934901515479e-06,
1512
+ "loss": 1.4144,
1513
+ "step": 19200
1514
+ },
1515
+ {
1516
+ "epoch": 0.0193,
1517
+ "grad_norm": 1.8064422607421875,
1518
+ "learning_rate": 4.998911620507029e-06,
1519
+ "loss": 1.3871,
1520
+ "step": 19300
1521
+ },
1522
+ {
1523
+ "epoch": 0.0194,
1524
+ "grad_norm": 1.900911569595337,
1525
+ "learning_rate": 4.998888087858246e-06,
1526
+ "loss": 1.3946,
1527
+ "step": 19400
1528
+ },
1529
+ {
1530
+ "epoch": 0.0195,
1531
+ "grad_norm": 1.7808403968811035,
1532
+ "learning_rate": 4.998864303571502e-06,
1533
+ "loss": 1.394,
1534
+ "step": 19500
1535
+ },
1536
+ {
1537
+ "epoch": 0.0196,
1538
+ "grad_norm": 1.7433531284332275,
1539
+ "learning_rate": 4.998840267649191e-06,
1540
+ "loss": 1.3916,
1541
+ "step": 19600
1542
+ },
1543
+ {
1544
+ "epoch": 0.0197,
1545
+ "grad_norm": 1.9526238441467285,
1546
+ "learning_rate": 4.998815980093734e-06,
1547
+ "loss": 1.3901,
1548
+ "step": 19700
1549
+ },
1550
+ {
1551
+ "epoch": 0.0198,
1552
+ "grad_norm": 1.6053208112716675,
1553
+ "learning_rate": 4.998791440907575e-06,
1554
+ "loss": 1.4086,
1555
+ "step": 19800
1556
+ },
1557
+ {
1558
+ "epoch": 0.0199,
1559
+ "grad_norm": 1.9025638103485107,
1560
+ "learning_rate": 4.9987666500931874e-06,
1561
+ "loss": 1.4041,
1562
+ "step": 19900
1563
+ },
1564
+ {
1565
+ "epoch": 0.02,
1566
+ "grad_norm": 1.7806073427200317,
1567
+ "learning_rate": 4.998741607653066e-06,
1568
+ "loss": 1.3889,
1569
+ "step": 20000
1570
+ },
1571
+ {
1572
+ "epoch": 0.02,
1573
+ "eval_loss": 1.3910343647003174,
1574
+ "eval_runtime": 24.3678,
1575
+ "eval_samples_per_second": 205.189,
1576
+ "eval_steps_per_second": 3.242,
1577
+ "step": 20000
1578
+ },
1579
+ {
1580
+ "epoch": 0.0201,
1581
+ "grad_norm": 1.8703974485397339,
1582
+ "learning_rate": 4.9987163135897334e-06,
1583
+ "loss": 1.3995,
1584
+ "step": 20100
1585
+ },
1586
+ {
1587
+ "epoch": 0.0202,
1588
+ "grad_norm": 1.8790905475616455,
1589
+ "learning_rate": 4.998690767905736e-06,
1590
+ "loss": 1.4063,
1591
+ "step": 20200
1592
+ },
1593
+ {
1594
+ "epoch": 0.0203,
1595
+ "grad_norm": 1.9134712219238281,
1596
+ "learning_rate": 4.998664970603646e-06,
1597
+ "loss": 1.4038,
1598
+ "step": 20300
1599
+ },
1600
+ {
1601
+ "epoch": 0.0204,
1602
+ "grad_norm": 1.8452473878860474,
1603
+ "learning_rate": 4.998638921686063e-06,
1604
+ "loss": 1.4035,
1605
+ "step": 20400
1606
+ },
1607
+ {
1608
+ "epoch": 0.0205,
1609
+ "grad_norm": 1.8248049020767212,
1610
+ "learning_rate": 4.998612621155608e-06,
1611
+ "loss": 1.4003,
1612
+ "step": 20500
1613
+ },
1614
+ {
1615
+ "epoch": 0.0206,
1616
+ "grad_norm": 1.8242034912109375,
1617
+ "learning_rate": 4.9985860690149316e-06,
1618
+ "loss": 1.3927,
1619
+ "step": 20600
1620
+ },
1621
+ {
1622
+ "epoch": 0.0207,
1623
+ "grad_norm": 1.9893760681152344,
1624
+ "learning_rate": 4.998559265266705e-06,
1625
+ "loss": 1.4023,
1626
+ "step": 20700
1627
+ },
1628
+ {
1629
+ "epoch": 0.0208,
1630
+ "grad_norm": 1.9086755514144897,
1631
+ "learning_rate": 4.99853220991363e-06,
1632
+ "loss": 1.4064,
1633
+ "step": 20800
1634
+ },
1635
+ {
1636
+ "epoch": 0.0209,
1637
+ "grad_norm": 1.9896847009658813,
1638
+ "learning_rate": 4.998504902958429e-06,
1639
+ "loss": 1.3995,
1640
+ "step": 20900
1641
+ },
1642
+ {
1643
+ "epoch": 0.021,
1644
+ "grad_norm": 1.9010740518569946,
1645
+ "learning_rate": 4.998477344403852e-06,
1646
+ "loss": 1.4031,
1647
+ "step": 21000
1648
+ },
1649
+ {
1650
+ "epoch": 0.021,
1651
+ "eval_loss": 1.3773479461669922,
1652
+ "eval_runtime": 24.4357,
1653
+ "eval_samples_per_second": 204.619,
1654
+ "eval_steps_per_second": 3.233,
1655
+ "step": 21000
1656
+ },
1657
+ {
1658
+ "epoch": 0.0211,
1659
+ "grad_norm": 1.8723328113555908,
1660
+ "learning_rate": 4.9984495342526765e-06,
1661
+ "loss": 1.4064,
1662
+ "step": 21100
1663
+ },
1664
+ {
1665
+ "epoch": 0.0212,
1666
+ "grad_norm": 1.9470596313476562,
1667
+ "learning_rate": 4.998421472507701e-06,
1668
+ "loss": 1.3928,
1669
+ "step": 21200
1670
+ },
1671
+ {
1672
+ "epoch": 0.0213,
1673
+ "grad_norm": 1.7989906072616577,
1674
+ "learning_rate": 4.998393159171751e-06,
1675
+ "loss": 1.3981,
1676
+ "step": 21300
1677
+ },
1678
+ {
1679
+ "epoch": 0.0214,
1680
+ "grad_norm": 1.8689730167388916,
1681
+ "learning_rate": 4.998364594247678e-06,
1682
+ "loss": 1.3938,
1683
+ "step": 21400
1684
+ },
1685
+ {
1686
+ "epoch": 0.0215,
1687
+ "grad_norm": 1.9032299518585205,
1688
+ "learning_rate": 4.998335777738359e-06,
1689
+ "loss": 1.3947,
1690
+ "step": 21500
1691
+ },
1692
+ {
1693
+ "epoch": 0.0216,
1694
+ "grad_norm": 1.911832332611084,
1695
+ "learning_rate": 4.998306709646695e-06,
1696
+ "loss": 1.3973,
1697
+ "step": 21600
1698
+ },
1699
+ {
1700
+ "epoch": 0.0217,
1701
+ "grad_norm": 1.9614136219024658,
1702
+ "learning_rate": 4.998277389975615e-06,
1703
+ "loss": 1.3953,
1704
+ "step": 21700
1705
+ },
1706
+ {
1707
+ "epoch": 0.0218,
1708
+ "grad_norm": 1.9598075151443481,
1709
+ "learning_rate": 4.998247818728069e-06,
1710
+ "loss": 1.3869,
1711
+ "step": 21800
1712
+ },
1713
+ {
1714
+ "epoch": 0.0219,
1715
+ "grad_norm": 1.9342235326766968,
1716
+ "learning_rate": 4.998217995907037e-06,
1717
+ "loss": 1.3979,
1718
+ "step": 21900
1719
+ },
1720
+ {
1721
+ "epoch": 0.022,
1722
+ "grad_norm": 1.740506887435913,
1723
+ "learning_rate": 4.998187921515521e-06,
1724
+ "loss": 1.3896,
1725
+ "step": 22000
1726
+ },
1727
+ {
1728
+ "epoch": 0.022,
1729
+ "eval_loss": 1.4018659591674805,
1730
+ "eval_runtime": 24.3975,
1731
+ "eval_samples_per_second": 204.939,
1732
+ "eval_steps_per_second": 3.238,
1733
+ "step": 22000
1734
+ },
1735
+ {
1736
+ "epoch": 0.0221,
1737
+ "grad_norm": 1.7357391119003296,
1738
+ "learning_rate": 4.998157595556548e-06,
1739
+ "loss": 1.3981,
1740
+ "step": 22100
1741
+ },
1742
+ {
1743
+ "epoch": 0.0222,
1744
+ "grad_norm": 1.7457457780838013,
1745
+ "learning_rate": 4.998127018033176e-06,
1746
+ "loss": 1.4057,
1747
+ "step": 22200
1748
+ },
1749
+ {
1750
+ "epoch": 0.0223,
1751
+ "grad_norm": 1.8668625354766846,
1752
+ "learning_rate": 4.998096188948479e-06,
1753
+ "loss": 1.4009,
1754
+ "step": 22300
1755
+ },
1756
+ {
1757
+ "epoch": 0.0224,
1758
+ "grad_norm": 1.9003961086273193,
1759
+ "learning_rate": 4.998065108305567e-06,
1760
+ "loss": 1.3957,
1761
+ "step": 22400
1762
+ },
1763
+ {
1764
+ "epoch": 0.0225,
1765
+ "grad_norm": 1.7045166492462158,
1766
+ "learning_rate": 4.998033776107565e-06,
1767
+ "loss": 1.3831,
1768
+ "step": 22500
1769
+ },
1770
+ {
1771
+ "epoch": 0.0226,
1772
+ "grad_norm": 1.7204500436782837,
1773
+ "learning_rate": 4.99800219235763e-06,
1774
+ "loss": 1.393,
1775
+ "step": 22600
1776
+ },
1777
+ {
1778
+ "epoch": 0.0227,
1779
+ "grad_norm": 1.8276758193969727,
1780
+ "learning_rate": 4.9979703570589435e-06,
1781
+ "loss": 1.3911,
1782
+ "step": 22700
1783
+ },
1784
+ {
1785
+ "epoch": 0.0228,
1786
+ "grad_norm": 1.851925253868103,
1787
+ "learning_rate": 4.9979382702147095e-06,
1788
+ "loss": 1.3919,
1789
+ "step": 22800
1790
+ },
1791
+ {
1792
+ "epoch": 0.0229,
1793
+ "grad_norm": 1.9704827070236206,
1794
+ "learning_rate": 4.997905931828161e-06,
1795
+ "loss": 1.3913,
1796
+ "step": 22900
1797
+ },
1798
+ {
1799
+ "epoch": 0.023,
1800
+ "grad_norm": 1.9040639400482178,
1801
+ "learning_rate": 4.997873341902552e-06,
1802
+ "loss": 1.3898,
1803
+ "step": 23000
1804
+ },
1805
+ {
1806
+ "epoch": 0.023,
1807
+ "eval_loss": 1.3905709981918335,
1808
+ "eval_runtime": 24.4036,
1809
+ "eval_samples_per_second": 204.888,
1810
+ "eval_steps_per_second": 3.237,
1811
+ "step": 23000
1812
+ },
1813
+ {
1814
+ "epoch": 0.0231,
1815
+ "grad_norm": 1.64853835105896,
1816
+ "learning_rate": 4.9978405004411676e-06,
1817
+ "loss": 1.3999,
1818
+ "step": 23100
1819
+ },
1820
+ {
1821
+ "epoch": 0.0232,
1822
+ "grad_norm": 1.680251955986023,
1823
+ "learning_rate": 4.997807407447312e-06,
1824
+ "loss": 1.3959,
1825
+ "step": 23200
1826
+ },
1827
+ {
1828
+ "epoch": 0.0233,
1829
+ "grad_norm": 1.79887056350708,
1830
+ "learning_rate": 4.99777406292432e-06,
1831
+ "loss": 1.3984,
1832
+ "step": 23300
1833
+ },
1834
+ {
1835
+ "epoch": 0.0234,
1836
+ "grad_norm": 1.9963394403457642,
1837
+ "learning_rate": 4.997740466875547e-06,
1838
+ "loss": 1.3785,
1839
+ "step": 23400
1840
+ },
1841
+ {
1842
+ "epoch": 0.0235,
1843
+ "grad_norm": 1.934024691581726,
1844
+ "learning_rate": 4.997706619304378e-06,
1845
+ "loss": 1.3886,
1846
+ "step": 23500
1847
+ },
1848
+ {
1849
+ "epoch": 0.0236,
1850
+ "grad_norm": 1.8343491554260254,
1851
+ "learning_rate": 4.9976725202142204e-06,
1852
+ "loss": 1.3928,
1853
+ "step": 23600
1854
+ },
1855
+ {
1856
+ "epoch": 0.0237,
1857
+ "grad_norm": 1.7215723991394043,
1858
+ "learning_rate": 4.9976381696085086e-06,
1859
+ "loss": 1.3933,
1860
+ "step": 23700
1861
+ },
1862
+ {
1863
+ "epoch": 0.0238,
1864
+ "grad_norm": 1.8704638481140137,
1865
+ "learning_rate": 4.997603567490702e-06,
1866
+ "loss": 1.39,
1867
+ "step": 23800
1868
+ },
1869
+ {
1870
+ "epoch": 0.0239,
1871
+ "grad_norm": 2.0038602352142334,
1872
+ "learning_rate": 4.997568713864283e-06,
1873
+ "loss": 1.3948,
1874
+ "step": 23900
1875
+ },
1876
+ {
1877
+ "epoch": 0.024,
1878
+ "grad_norm": 1.8034144639968872,
1879
+ "learning_rate": 4.997533608732764e-06,
1880
+ "loss": 1.4015,
1881
+ "step": 24000
1882
+ },
1883
+ {
1884
+ "epoch": 0.024,
1885
+ "eval_loss": 1.404091477394104,
1886
+ "eval_runtime": 24.4038,
1887
+ "eval_samples_per_second": 204.886,
1888
+ "eval_steps_per_second": 3.237,
1889
+ "step": 24000
1890
+ },
1891
+ {
1892
+ "epoch": 0.0241,
1893
+ "grad_norm": 1.8736565113067627,
1894
+ "learning_rate": 4.997498252099678e-06,
1895
+ "loss": 1.3984,
1896
+ "step": 24100
1897
+ },
1898
+ {
1899
+ "epoch": 0.0242,
1900
+ "grad_norm": 1.9209901094436646,
1901
+ "learning_rate": 4.997462643968588e-06,
1902
+ "loss": 1.3909,
1903
+ "step": 24200
1904
+ },
1905
+ {
1906
+ "epoch": 0.0243,
1907
+ "grad_norm": 1.8445130586624146,
1908
+ "learning_rate": 4.997426784343077e-06,
1909
+ "loss": 1.403,
1910
+ "step": 24300
1911
+ },
1912
+ {
1913
+ "epoch": 0.0244,
1914
+ "grad_norm": 1.8695659637451172,
1915
+ "learning_rate": 4.997390673226758e-06,
1916
+ "loss": 1.3849,
1917
+ "step": 24400
1918
+ },
1919
+ {
1920
+ "epoch": 0.0245,
1921
+ "grad_norm": 1.918107032775879,
1922
+ "learning_rate": 4.997354310623265e-06,
1923
+ "loss": 1.3781,
1924
+ "step": 24500
1925
+ },
1926
+ {
1927
+ "epoch": 0.0246,
1928
+ "grad_norm": 1.6449661254882812,
1929
+ "learning_rate": 4.997317696536262e-06,
1930
+ "loss": 1.4057,
1931
+ "step": 24600
1932
+ },
1933
+ {
1934
+ "epoch": 0.0247,
1935
+ "grad_norm": 1.8361092805862427,
1936
+ "learning_rate": 4.997280830969436e-06,
1937
+ "loss": 1.3951,
1938
+ "step": 24700
1939
+ },
1940
+ {
1941
+ "epoch": 0.0248,
1942
+ "grad_norm": 2.0154218673706055,
1943
+ "learning_rate": 4.997243713926497e-06,
1944
+ "loss": 1.3975,
1945
+ "step": 24800
1946
+ },
1947
+ {
1948
+ "epoch": 0.0249,
1949
+ "grad_norm": 1.8617228269577026,
1950
+ "learning_rate": 4.997206345411185e-06,
1951
+ "loss": 1.3828,
1952
+ "step": 24900
1953
+ },
1954
+ {
1955
+ "epoch": 0.025,
1956
+ "grad_norm": 1.8779343366622925,
1957
+ "learning_rate": 4.997168725427263e-06,
1958
+ "loss": 1.3982,
1959
+ "step": 25000
1960
+ },
1961
+ {
1962
+ "epoch": 0.025,
1963
+ "eval_loss": 1.3792742490768433,
1964
+ "eval_runtime": 24.4701,
1965
+ "eval_samples_per_second": 204.331,
1966
+ "eval_steps_per_second": 3.228,
1967
+ "step": 25000
1968
+ },
1969
+ {
1970
+ "epoch": 0.0251,
1971
+ "grad_norm": 1.9437707662582397,
1972
+ "learning_rate": 4.997130853978519e-06,
1973
+ "loss": 1.3881,
1974
+ "step": 25100
1975
+ },
1976
+ {
1977
+ "epoch": 0.0252,
1978
+ "grad_norm": 1.8795158863067627,
1979
+ "learning_rate": 4.9970927310687655e-06,
1980
+ "loss": 1.3975,
1981
+ "step": 25200
1982
+ },
1983
+ {
1984
+ "epoch": 0.0253,
1985
+ "grad_norm": 1.732364535331726,
1986
+ "learning_rate": 4.997054356701842e-06,
1987
+ "loss": 1.3951,
1988
+ "step": 25300
1989
+ },
1990
+ {
1991
+ "epoch": 0.0254,
1992
+ "grad_norm": 1.7434965372085571,
1993
+ "learning_rate": 4.997015730881614e-06,
1994
+ "loss": 1.3864,
1995
+ "step": 25400
1996
+ },
1997
+ {
1998
+ "epoch": 0.0255,
1999
+ "grad_norm": 1.9448522329330444,
2000
+ "learning_rate": 4.99697685361197e-06,
2001
+ "loss": 1.3962,
2002
+ "step": 25500
2003
+ },
2004
+ {
2005
+ "epoch": 0.0256,
2006
+ "grad_norm": 1.7149606943130493,
2007
+ "learning_rate": 4.9969377248968245e-06,
2008
+ "loss": 1.395,
2009
+ "step": 25600
2010
+ },
2011
+ {
2012
+ "epoch": 0.0257,
2013
+ "grad_norm": 1.8136606216430664,
2014
+ "learning_rate": 4.996898344740119e-06,
2015
+ "loss": 1.3984,
2016
+ "step": 25700
2017
+ },
2018
+ {
2019
+ "epoch": 0.0258,
2020
+ "grad_norm": 1.8528047800064087,
2021
+ "learning_rate": 4.9968587131458184e-06,
2022
+ "loss": 1.4007,
2023
+ "step": 25800
2024
+ },
2025
+ {
2026
+ "epoch": 0.0259,
2027
+ "grad_norm": 1.781982660293579,
2028
+ "learning_rate": 4.996818830117914e-06,
2029
+ "loss": 1.3933,
2030
+ "step": 25900
2031
+ },
2032
+ {
2033
+ "epoch": 0.026,
2034
+ "grad_norm": 1.777457356452942,
2035
+ "learning_rate": 4.996778695660421e-06,
2036
+ "loss": 1.3912,
2037
+ "step": 26000
2038
+ },
2039
+ {
2040
+ "epoch": 0.026,
2041
+ "eval_loss": 1.3937982320785522,
2042
+ "eval_runtime": 24.4424,
2043
+ "eval_samples_per_second": 204.562,
2044
+ "eval_steps_per_second": 3.232,
2045
+ "step": 26000
2046
+ },
2047
+ {
2048
+ "epoch": 0.0261,
2049
+ "grad_norm": 1.9622981548309326,
2050
+ "learning_rate": 4.996738309777382e-06,
2051
+ "loss": 1.4005,
2052
+ "step": 26100
2053
+ },
2054
+ {
2055
+ "epoch": 0.0262,
2056
+ "grad_norm": 1.8607321977615356,
2057
+ "learning_rate": 4.996697672472864e-06,
2058
+ "loss": 1.4014,
2059
+ "step": 26200
2060
+ },
2061
+ {
2062
+ "epoch": 0.0263,
2063
+ "grad_norm": 2.0541224479675293,
2064
+ "learning_rate": 4.996656783750959e-06,
2065
+ "loss": 1.3966,
2066
+ "step": 26300
2067
+ },
2068
+ {
2069
+ "epoch": 0.0264,
2070
+ "grad_norm": 1.9274111986160278,
2071
+ "learning_rate": 4.996615643615783e-06,
2072
+ "loss": 1.3961,
2073
+ "step": 26400
2074
+ },
2075
+ {
2076
+ "epoch": 0.0265,
2077
+ "grad_norm": 1.861632227897644,
2078
+ "learning_rate": 4.99657425207148e-06,
2079
+ "loss": 1.393,
2080
+ "step": 26500
2081
+ },
2082
+ {
2083
+ "epoch": 0.0266,
2084
+ "grad_norm": 1.870938777923584,
2085
+ "learning_rate": 4.996532609122219e-06,
2086
+ "loss": 1.3987,
2087
+ "step": 26600
2088
+ },
2089
+ {
2090
+ "epoch": 0.0267,
2091
+ "grad_norm": 1.949152946472168,
2092
+ "learning_rate": 4.996490714772192e-06,
2093
+ "loss": 1.3972,
2094
+ "step": 26700
2095
+ },
2096
+ {
2097
+ "epoch": 0.0268,
2098
+ "grad_norm": 1.7645965814590454,
2099
+ "learning_rate": 4.996448569025618e-06,
2100
+ "loss": 1.3917,
2101
+ "step": 26800
2102
+ },
2103
+ {
2104
+ "epoch": 0.0269,
2105
+ "grad_norm": 1.955647587776184,
2106
+ "learning_rate": 4.9964061718867425e-06,
2107
+ "loss": 1.406,
2108
+ "step": 26900
2109
+ },
2110
+ {
2111
+ "epoch": 0.027,
2112
+ "grad_norm": 1.886497974395752,
2113
+ "learning_rate": 4.996363523359833e-06,
2114
+ "loss": 1.396,
2115
+ "step": 27000
2116
+ },
2117
+ {
2118
+ "epoch": 0.027,
2119
+ "eval_loss": 1.3594520092010498,
2120
+ "eval_runtime": 24.4577,
2121
+ "eval_samples_per_second": 204.435,
2122
+ "eval_steps_per_second": 3.23,
2123
+ "step": 27000
2124
+ },
2125
+ {
2126
+ "epoch": 0.0271,
2127
+ "grad_norm": 1.9091280698776245,
2128
+ "learning_rate": 4.996320623449186e-06,
2129
+ "loss": 1.4001,
2130
+ "step": 27100
2131
+ },
2132
+ {
2133
+ "epoch": 0.0272,
2134
+ "grad_norm": 1.9556379318237305,
2135
+ "learning_rate": 4.996277472159119e-06,
2136
+ "loss": 1.4045,
2137
+ "step": 27200
2138
+ },
2139
+ {
2140
+ "epoch": 0.0273,
2141
+ "grad_norm": 2.0282106399536133,
2142
+ "learning_rate": 4.99623406949398e-06,
2143
+ "loss": 1.3927,
2144
+ "step": 27300
2145
+ },
2146
+ {
2147
+ "epoch": 0.0274,
2148
+ "grad_norm": 1.8539470434188843,
2149
+ "learning_rate": 4.9961904154581374e-06,
2150
+ "loss": 1.3998,
2151
+ "step": 27400
2152
+ },
2153
+ {
2154
+ "epoch": 0.0275,
2155
+ "grad_norm": 1.7907086610794067,
2156
+ "learning_rate": 4.9961465100559896e-06,
2157
+ "loss": 1.3871,
2158
+ "step": 27500
2159
+ },
2160
+ {
2161
+ "epoch": 0.0276,
2162
+ "grad_norm": 1.8179974555969238,
2163
+ "learning_rate": 4.996102353291956e-06,
2164
+ "loss": 1.3898,
2165
+ "step": 27600
2166
+ },
2167
+ {
2168
+ "epoch": 0.0277,
2169
+ "grad_norm": 1.8779699802398682,
2170
+ "learning_rate": 4.996057945170483e-06,
2171
+ "loss": 1.3911,
2172
+ "step": 27700
2173
+ },
2174
+ {
2175
+ "epoch": 0.0278,
2176
+ "grad_norm": 1.8160297870635986,
2177
+ "learning_rate": 4.996013285696044e-06,
2178
+ "loss": 1.3985,
2179
+ "step": 27800
2180
+ },
2181
+ {
2182
+ "epoch": 0.0279,
2183
+ "grad_norm": 1.80897057056427,
2184
+ "learning_rate": 4.995968374873136e-06,
2185
+ "loss": 1.3956,
2186
+ "step": 27900
2187
+ },
2188
+ {
2189
+ "epoch": 0.028,
2190
+ "grad_norm": 1.6959145069122314,
2191
+ "learning_rate": 4.99592321270628e-06,
2192
+ "loss": 1.3905,
2193
+ "step": 28000
2194
+ },
2195
+ {
2196
+ "epoch": 0.028,
2197
+ "eval_loss": 1.3936872482299805,
2198
+ "eval_runtime": 24.4635,
2199
+ "eval_samples_per_second": 204.386,
2200
+ "eval_steps_per_second": 3.229,
2201
+ "step": 28000
2202
+ },
2203
+ {
2204
+ "epoch": 0.0281,
2205
+ "grad_norm": 1.7558468580245972,
2206
+ "learning_rate": 4.9958777992000255e-06,
2207
+ "loss": 1.3777,
2208
+ "step": 28100
2209
+ },
2210
+ {
2211
+ "epoch": 0.0282,
2212
+ "grad_norm": 1.8235905170440674,
2213
+ "learning_rate": 4.995832134358943e-06,
2214
+ "loss": 1.3818,
2215
+ "step": 28200
2216
+ },
2217
+ {
2218
+ "epoch": 0.0283,
2219
+ "grad_norm": 1.9419879913330078,
2220
+ "learning_rate": 4.995786218187635e-06,
2221
+ "loss": 1.4081,
2222
+ "step": 28300
2223
+ },
2224
+ {
2225
+ "epoch": 0.0284,
2226
+ "grad_norm": 1.8814060688018799,
2227
+ "learning_rate": 4.995740050690722e-06,
2228
+ "loss": 1.4014,
2229
+ "step": 28400
2230
+ },
2231
+ {
2232
+ "epoch": 0.0285,
2233
+ "grad_norm": 1.9103902578353882,
2234
+ "learning_rate": 4.995693631872855e-06,
2235
+ "loss": 1.3932,
2236
+ "step": 28500
2237
+ },
2238
+ {
2239
+ "epoch": 0.0286,
2240
+ "grad_norm": 1.813238263130188,
2241
+ "learning_rate": 4.995646961738707e-06,
2242
+ "loss": 1.3976,
2243
+ "step": 28600
2244
+ },
2245
+ {
2246
+ "epoch": 0.0287,
2247
+ "grad_norm": 1.9837982654571533,
2248
+ "learning_rate": 4.995600040292978e-06,
2249
+ "loss": 1.3953,
2250
+ "step": 28700
2251
+ },
2252
+ {
2253
+ "epoch": 0.0288,
2254
+ "grad_norm": 1.8860703706741333,
2255
+ "learning_rate": 4.995552867540394e-06,
2256
+ "loss": 1.4003,
2257
+ "step": 28800
2258
+ },
2259
+ {
2260
+ "epoch": 0.0289,
2261
+ "grad_norm": 2.1086032390594482,
2262
+ "learning_rate": 4.995505443485704e-06,
2263
+ "loss": 1.3941,
2264
+ "step": 28900
2265
+ },
2266
+ {
2267
+ "epoch": 0.029,
2268
+ "grad_norm": 1.6460460424423218,
2269
+ "learning_rate": 4.995457768133685e-06,
2270
+ "loss": 1.4004,
2271
+ "step": 29000
2272
+ },
2273
+ {
2274
+ "epoch": 0.029,
2275
+ "eval_loss": 1.3687199354171753,
2276
+ "eval_runtime": 24.5034,
2277
+ "eval_samples_per_second": 204.053,
2278
+ "eval_steps_per_second": 3.224,
2279
+ "step": 29000
2280
+ },
2281
+ {
2282
+ "epoch": 0.0291,
2283
+ "grad_norm": 1.6839542388916016,
2284
+ "learning_rate": 4.995409841489135e-06,
2285
+ "loss": 1.3888,
2286
+ "step": 29100
2287
+ },
2288
+ {
2289
+ "epoch": 0.0292,
2290
+ "grad_norm": 1.784096121788025,
2291
+ "learning_rate": 4.995361663556884e-06,
2292
+ "loss": 1.3948,
2293
+ "step": 29200
2294
+ },
2295
+ {
2296
+ "epoch": 0.0293,
2297
+ "grad_norm": 1.7756794691085815,
2298
+ "learning_rate": 4.995313234341781e-06,
2299
+ "loss": 1.3915,
2300
+ "step": 29300
2301
+ },
2302
+ {
2303
+ "epoch": 0.0294,
2304
+ "grad_norm": 1.887779951095581,
2305
+ "learning_rate": 4.995264553848704e-06,
2306
+ "loss": 1.3907,
2307
+ "step": 29400
2308
+ },
2309
+ {
2310
+ "epoch": 0.0295,
2311
+ "grad_norm": 1.8146642446517944,
2312
+ "learning_rate": 4.9952156220825545e-06,
2313
+ "loss": 1.3936,
2314
+ "step": 29500
2315
+ },
2316
+ {
2317
+ "epoch": 0.0296,
2318
+ "grad_norm": 1.8816800117492676,
2319
+ "learning_rate": 4.9951664390482605e-06,
2320
+ "loss": 1.4024,
2321
+ "step": 29600
2322
+ },
2323
+ {
2324
+ "epoch": 0.0297,
2325
+ "grad_norm": 1.7906370162963867,
2326
+ "learning_rate": 4.995117004750774e-06,
2327
+ "loss": 1.3899,
2328
+ "step": 29700
2329
+ },
2330
+ {
2331
+ "epoch": 0.0298,
2332
+ "grad_norm": 1.812466025352478,
2333
+ "learning_rate": 4.995067319195073e-06,
2334
+ "loss": 1.3991,
2335
+ "step": 29800
2336
+ },
2337
+ {
2338
+ "epoch": 0.0299,
2339
+ "grad_norm": 1.7394683361053467,
2340
+ "learning_rate": 4.995017382386162e-06,
2341
+ "loss": 1.4003,
2342
+ "step": 29900
2343
+ },
2344
+ {
2345
+ "epoch": 0.03,
2346
+ "grad_norm": 1.876086950302124,
2347
+ "learning_rate": 4.994967194329069e-06,
2348
+ "loss": 1.3957,
2349
+ "step": 30000
2350
+ },
2351
+ {
2352
+ "epoch": 0.03,
2353
+ "eval_loss": 1.3768850564956665,
2354
+ "eval_runtime": 24.5341,
2355
+ "eval_samples_per_second": 203.798,
2356
+ "eval_steps_per_second": 3.22,
2357
+ "step": 30000
2358
+ },
2359
+ {
2360
+ "epoch": 0.0301,
2361
+ "grad_norm": 1.9874565601348877,
2362
+ "learning_rate": 4.994916755028847e-06,
2363
+ "loss": 1.3796,
2364
+ "step": 30100
2365
+ },
2366
+ {
2367
+ "epoch": 0.0302,
2368
+ "grad_norm": 1.8175944089889526,
2369
+ "learning_rate": 4.994866064490577e-06,
2370
+ "loss": 1.3918,
2371
+ "step": 30200
2372
+ },
2373
+ {
2374
+ "epoch": 0.0303,
2375
+ "grad_norm": 1.8819736242294312,
2376
+ "learning_rate": 4.994815122719361e-06,
2377
+ "loss": 1.3847,
2378
+ "step": 30300
2379
+ },
2380
+ {
2381
+ "epoch": 0.0304,
2382
+ "grad_norm": 1.7215774059295654,
2383
+ "learning_rate": 4.994763929720331e-06,
2384
+ "loss": 1.3977,
2385
+ "step": 30400
2386
+ },
2387
+ {
2388
+ "epoch": 0.0305,
2389
+ "grad_norm": 1.7516419887542725,
2390
+ "learning_rate": 4.9947124854986425e-06,
2391
+ "loss": 1.3972,
2392
+ "step": 30500
2393
+ },
2394
+ {
2395
+ "epoch": 0.0306,
2396
+ "grad_norm": 1.7906025648117065,
2397
+ "learning_rate": 4.9946607900594745e-06,
2398
+ "loss": 1.3848,
2399
+ "step": 30600
2400
+ },
2401
+ {
2402
+ "epoch": 0.0307,
2403
+ "grad_norm": 1.7155588865280151,
2404
+ "learning_rate": 4.994608843408033e-06,
2405
+ "loss": 1.3892,
2406
+ "step": 30700
2407
+ },
2408
+ {
2409
+ "epoch": 0.0308,
2410
+ "grad_norm": 1.843208909034729,
2411
+ "learning_rate": 4.994556645549549e-06,
2412
+ "loss": 1.404,
2413
+ "step": 30800
2414
+ },
2415
+ {
2416
+ "epoch": 0.0309,
2417
+ "grad_norm": 1.7222075462341309,
2418
+ "learning_rate": 4.994504196489279e-06,
2419
+ "loss": 1.3807,
2420
+ "step": 30900
2421
+ },
2422
+ {
2423
+ "epoch": 0.031,
2424
+ "grad_norm": 1.783629298210144,
2425
+ "learning_rate": 4.994451496232505e-06,
2426
+ "loss": 1.3677,
2427
+ "step": 31000
2428
+ },
2429
+ {
2430
+ "epoch": 0.031,
2431
+ "eval_loss": 1.3631515502929688,
2432
+ "eval_runtime": 25.4272,
2433
+ "eval_samples_per_second": 196.64,
2434
+ "eval_steps_per_second": 3.107,
2435
+ "step": 31000
2436
+ },
2437
+ {
2438
+ "epoch": 0.0311,
2439
+ "grad_norm": 2.0314207077026367,
2440
+ "learning_rate": 4.9943985447845336e-06,
2441
+ "loss": 1.3947,
2442
+ "step": 31100
2443
+ },
2444
+ {
2445
+ "epoch": 0.0312,
2446
+ "grad_norm": 1.8006716966629028,
2447
+ "learning_rate": 4.9943453421506975e-06,
2448
+ "loss": 1.3883,
2449
+ "step": 31200
2450
+ },
2451
+ {
2452
+ "epoch": 0.0313,
2453
+ "grad_norm": 1.8027557134628296,
2454
+ "learning_rate": 4.9942918883363525e-06,
2455
+ "loss": 1.3893,
2456
+ "step": 31300
2457
+ },
2458
+ {
2459
+ "epoch": 0.0314,
2460
+ "grad_norm": 1.8935391902923584,
2461
+ "learning_rate": 4.994238183346883e-06,
2462
+ "loss": 1.3904,
2463
+ "step": 31400
2464
+ },
2465
+ {
2466
+ "epoch": 0.0315,
2467
+ "grad_norm": 1.7756006717681885,
2468
+ "learning_rate": 4.9941842271876975e-06,
2469
+ "loss": 1.3877,
2470
+ "step": 31500
2471
+ },
2472
+ {
2473
+ "epoch": 0.0316,
2474
+ "grad_norm": 1.7972420454025269,
2475
+ "learning_rate": 4.994130019864228e-06,
2476
+ "loss": 1.4124,
2477
+ "step": 31600
2478
+ },
2479
+ {
2480
+ "epoch": 0.0317,
2481
+ "grad_norm": 1.966475248336792,
2482
+ "learning_rate": 4.994075561381934e-06,
2483
+ "loss": 1.4365,
2484
+ "step": 31700
2485
+ },
2486
+ {
2487
+ "epoch": 0.0318,
2488
+ "grad_norm": 1.8581668138504028,
2489
+ "learning_rate": 4.994020851746298e-06,
2490
+ "loss": 1.4377,
2491
+ "step": 31800
2492
+ },
2493
+ {
2494
+ "epoch": 0.0319,
2495
+ "grad_norm": 2.0146875381469727,
2496
+ "learning_rate": 4.993965890962832e-06,
2497
+ "loss": 1.4372,
2498
+ "step": 31900
2499
+ },
2500
+ {
2501
+ "epoch": 0.032,
2502
+ "grad_norm": 2.064978837966919,
2503
+ "learning_rate": 4.993910679037069e-06,
2504
+ "loss": 1.4547,
2505
+ "step": 32000
2506
+ },
2507
+ {
2508
+ "epoch": 0.032,
2509
+ "eval_loss": 1.3730239868164062,
2510
+ "eval_runtime": 24.4948,
2511
+ "eval_samples_per_second": 204.125,
2512
+ "eval_steps_per_second": 3.225,
2513
+ "step": 32000
2514
+ },
2515
+ {
2516
+ "epoch": 0.0321,
2517
+ "grad_norm": 2.0712766647338867,
2518
+ "learning_rate": 4.993855215974568e-06,
2519
+ "loss": 1.4353,
2520
+ "step": 32100
2521
+ },
2522
+ {
2523
+ "epoch": 0.0322,
2524
+ "grad_norm": 1.8807464838027954,
2525
+ "learning_rate": 4.9937995017809145e-06,
2526
+ "loss": 1.4403,
2527
+ "step": 32200
2528
+ },
2529
+ {
2530
+ "epoch": 0.0323,
2531
+ "grad_norm": 2.0456089973449707,
2532
+ "learning_rate": 4.993743536461721e-06,
2533
+ "loss": 1.4426,
2534
+ "step": 32300
2535
+ },
2536
+ {
2537
+ "epoch": 0.0324,
2538
+ "grad_norm": 1.8087421655654907,
2539
+ "learning_rate": 4.993687320022621e-06,
2540
+ "loss": 1.4273,
2541
+ "step": 32400
2542
+ },
2543
+ {
2544
+ "epoch": 0.0325,
2545
+ "grad_norm": 1.8107932806015015,
2546
+ "learning_rate": 4.993630852469275e-06,
2547
+ "loss": 1.434,
2548
+ "step": 32500
2549
+ },
2550
+ {
2551
+ "epoch": 0.0326,
2552
+ "grad_norm": 1.9879577159881592,
2553
+ "learning_rate": 4.9935741338073715e-06,
2554
+ "loss": 1.4412,
2555
+ "step": 32600
2556
+ },
2557
+ {
2558
+ "epoch": 0.0327,
2559
+ "grad_norm": 1.936676025390625,
2560
+ "learning_rate": 4.993517164042621e-06,
2561
+ "loss": 1.4545,
2562
+ "step": 32700
2563
+ },
2564
+ {
2565
+ "epoch": 0.0328,
2566
+ "grad_norm": 2.0299065113067627,
2567
+ "learning_rate": 4.99345994318076e-06,
2568
+ "loss": 1.4361,
2569
+ "step": 32800
2570
+ },
2571
+ {
2572
+ "epoch": 0.0329,
2573
+ "grad_norm": 1.900130033493042,
2574
+ "learning_rate": 4.993402471227551e-06,
2575
+ "loss": 1.4438,
2576
+ "step": 32900
2577
+ },
2578
+ {
2579
+ "epoch": 0.033,
2580
+ "grad_norm": 1.991466760635376,
2581
+ "learning_rate": 4.993344748188782e-06,
2582
+ "loss": 1.4412,
2583
+ "step": 33000
2584
+ },
2585
+ {
2586
+ "epoch": 0.033,
2587
+ "eval_loss": 1.3792095184326172,
2588
+ "eval_runtime": 24.4986,
2589
+ "eval_samples_per_second": 204.093,
2590
+ "eval_steps_per_second": 3.225,
2591
+ "step": 33000
2592
+ },
2593
+ {
2594
+ "epoch": 0.0331,
2595
+ "grad_norm": 2.0281729698181152,
2596
+ "learning_rate": 4.993286774070264e-06,
2597
+ "loss": 1.4363,
2598
+ "step": 33100
2599
+ },
2600
+ {
2601
+ "epoch": 0.0332,
2602
+ "grad_norm": 1.8294743299484253,
2603
+ "learning_rate": 4.993228548877837e-06,
2604
+ "loss": 1.4507,
2605
+ "step": 33200
2606
+ },
2607
+ {
2608
+ "epoch": 0.0333,
2609
+ "grad_norm": 2.0411036014556885,
2610
+ "learning_rate": 4.993170072617362e-06,
2611
+ "loss": 1.4462,
2612
+ "step": 33300
2613
+ },
2614
+ {
2615
+ "epoch": 0.0334,
2616
+ "grad_norm": 1.9206632375717163,
2617
+ "learning_rate": 4.99311134529473e-06,
2618
+ "loss": 1.4385,
2619
+ "step": 33400
2620
+ },
2621
+ {
2622
+ "epoch": 0.0335,
2623
+ "grad_norm": 1.8697141408920288,
2624
+ "learning_rate": 4.993052366915853e-06,
2625
+ "loss": 1.4387,
2626
+ "step": 33500
2627
+ },
2628
+ {
2629
+ "epoch": 0.0336,
2630
+ "grad_norm": 1.9599472284317017,
2631
+ "learning_rate": 4.9929931374866715e-06,
2632
+ "loss": 1.4374,
2633
+ "step": 33600
2634
+ },
2635
+ {
2636
+ "epoch": 0.0337,
2637
+ "grad_norm": 1.9199376106262207,
2638
+ "learning_rate": 4.992933657013149e-06,
2639
+ "loss": 1.417,
2640
+ "step": 33700
2641
+ },
2642
+ {
2643
+ "epoch": 0.0338,
2644
+ "grad_norm": 1.742351770401001,
2645
+ "learning_rate": 4.992873925501276e-06,
2646
+ "loss": 1.4296,
2647
+ "step": 33800
2648
+ },
2649
+ {
2650
+ "epoch": 0.0339,
2651
+ "grad_norm": 2.089339017868042,
2652
+ "learning_rate": 4.992813942957067e-06,
2653
+ "loss": 1.4346,
2654
+ "step": 33900
2655
+ },
2656
+ {
2657
+ "epoch": 0.034,
2658
+ "grad_norm": 1.9663175344467163,
2659
+ "learning_rate": 4.992753709386562e-06,
2660
+ "loss": 1.4296,
2661
+ "step": 34000
2662
+ },
2663
+ {
2664
+ "epoch": 0.034,
2665
+ "eval_loss": 1.3703665733337402,
2666
+ "eval_runtime": 24.5247,
2667
+ "eval_samples_per_second": 203.876,
2668
+ "eval_steps_per_second": 3.221,
2669
+ "step": 34000
2670
+ },
2671
+ {
2672
+ "epoch": 0.0341,
2673
+ "grad_norm": 1.81997549533844,
2674
+ "learning_rate": 4.992693224795826e-06,
2675
+ "loss": 1.4413,
2676
+ "step": 34100
2677
+ },
2678
+ {
2679
+ "epoch": 0.0342,
2680
+ "grad_norm": 1.925018310546875,
2681
+ "learning_rate": 4.992632489190951e-06,
2682
+ "loss": 1.4287,
2683
+ "step": 34200
2684
+ },
2685
+ {
2686
+ "epoch": 0.0343,
2687
+ "grad_norm": 1.9179668426513672,
2688
+ "learning_rate": 4.992571502578052e-06,
2689
+ "loss": 1.4394,
2690
+ "step": 34300
2691
+ },
2692
+ {
2693
+ "epoch": 0.0344,
2694
+ "grad_norm": 1.8218015432357788,
2695
+ "learning_rate": 4.992510264963271e-06,
2696
+ "loss": 1.4283,
2697
+ "step": 34400
2698
+ },
2699
+ {
2700
+ "epoch": 0.0345,
2701
+ "grad_norm": 2.0702309608459473,
2702
+ "learning_rate": 4.992448776352775e-06,
2703
+ "loss": 1.4336,
2704
+ "step": 34500
2705
+ },
2706
+ {
2707
+ "epoch": 0.0346,
2708
+ "grad_norm": 1.958945393562317,
2709
+ "learning_rate": 4.992387036752755e-06,
2710
+ "loss": 1.4357,
2711
+ "step": 34600
2712
+ },
2713
+ {
2714
+ "epoch": 0.0347,
2715
+ "grad_norm": 1.986783742904663,
2716
+ "learning_rate": 4.992325046169429e-06,
2717
+ "loss": 1.4514,
2718
+ "step": 34700
2719
+ },
2720
+ {
2721
+ "epoch": 0.0348,
2722
+ "grad_norm": 1.9416749477386475,
2723
+ "learning_rate": 4.9922628046090385e-06,
2724
+ "loss": 1.4339,
2725
+ "step": 34800
2726
+ },
2727
+ {
2728
+ "epoch": 0.0349,
2729
+ "grad_norm": 2.0083744525909424,
2730
+ "learning_rate": 4.992200312077852e-06,
2731
+ "loss": 1.4341,
2732
+ "step": 34900
2733
+ },
2734
+ {
2735
+ "epoch": 0.035,
2736
+ "grad_norm": 1.904085636138916,
2737
+ "learning_rate": 4.992137568582162e-06,
2738
+ "loss": 1.4381,
2739
+ "step": 35000
2740
+ },
2741
+ {
2742
+ "epoch": 0.035,
2743
+ "eval_loss": 1.3807413578033447,
2744
+ "eval_runtime": 24.5144,
2745
+ "eval_samples_per_second": 203.962,
2746
+ "eval_steps_per_second": 3.223,
2747
+ "step": 35000
2748
+ },
2749
+ {
2750
+ "epoch": 0.0351,
2751
+ "grad_norm": 1.923017978668213,
2752
+ "learning_rate": 4.9920745741282886e-06,
2753
+ "loss": 1.4372,
2754
+ "step": 35100
2755
+ },
2756
+ {
2757
+ "epoch": 0.0352,
2758
+ "grad_norm": 2.0158944129943848,
2759
+ "learning_rate": 4.992011328722572e-06,
2760
+ "loss": 1.4409,
2761
+ "step": 35200
2762
+ },
2763
+ {
2764
+ "epoch": 0.0353,
2765
+ "grad_norm": 1.9902615547180176,
2766
+ "learning_rate": 4.991947832371384e-06,
2767
+ "loss": 1.4395,
2768
+ "step": 35300
2769
+ },
2770
+ {
2771
+ "epoch": 0.0354,
2772
+ "grad_norm": 1.8978103399276733,
2773
+ "learning_rate": 4.9918840850811155e-06,
2774
+ "loss": 1.4217,
2775
+ "step": 35400
2776
+ },
2777
+ {
2778
+ "epoch": 0.0355,
2779
+ "grad_norm": 2.14467453956604,
2780
+ "learning_rate": 4.99182008685819e-06,
2781
+ "loss": 1.4306,
2782
+ "step": 35500
2783
+ },
2784
+ {
2785
+ "epoch": 0.0356,
2786
+ "grad_norm": 2.108886241912842,
2787
+ "learning_rate": 4.991755837709049e-06,
2788
+ "loss": 1.429,
2789
+ "step": 35600
2790
+ },
2791
+ {
2792
+ "epoch": 0.0357,
2793
+ "grad_norm": 1.7032333612442017,
2794
+ "learning_rate": 4.991691337640163e-06,
2795
+ "loss": 1.4332,
2796
+ "step": 35700
2797
+ },
2798
+ {
2799
+ "epoch": 0.0358,
2800
+ "grad_norm": 1.8070473670959473,
2801
+ "learning_rate": 4.991626586658028e-06,
2802
+ "loss": 1.432,
2803
+ "step": 35800
2804
+ },
2805
+ {
2806
+ "epoch": 0.0359,
2807
+ "grad_norm": 1.8170287609100342,
2808
+ "learning_rate": 4.991561584769164e-06,
2809
+ "loss": 1.4158,
2810
+ "step": 35900
2811
+ },
2812
+ {
2813
+ "epoch": 0.036,
2814
+ "grad_norm": 2.1401708126068115,
2815
+ "learning_rate": 4.991496331980116e-06,
2816
+ "loss": 1.4398,
2817
+ "step": 36000
2818
+ },
2819
+ {
2820
+ "epoch": 0.036,
2821
+ "eval_loss": 1.3770599365234375,
2822
+ "eval_runtime": 24.5619,
2823
+ "eval_samples_per_second": 203.567,
2824
+ "eval_steps_per_second": 3.216,
2825
+ "step": 36000
2826
+ },
2827
+ {
2828
+ "epoch": 0.0361,
2829
+ "grad_norm": 1.8093080520629883,
2830
+ "learning_rate": 4.991430828297456e-06,
2831
+ "loss": 1.4422,
2832
+ "step": 36100
2833
+ },
2834
+ {
2835
+ "epoch": 0.0362,
2836
+ "grad_norm": 2.0092973709106445,
2837
+ "learning_rate": 4.99136507372778e-06,
2838
+ "loss": 1.4297,
2839
+ "step": 36200
2840
+ },
2841
+ {
2842
+ "epoch": 0.0363,
2843
+ "grad_norm": 1.928898572921753,
2844
+ "learning_rate": 4.991299068277709e-06,
2845
+ "loss": 1.43,
2846
+ "step": 36300
2847
+ },
2848
+ {
2849
+ "epoch": 0.0364,
2850
+ "grad_norm": 1.8470439910888672,
2851
+ "learning_rate": 4.99123281195389e-06,
2852
+ "loss": 1.4143,
2853
+ "step": 36400
2854
+ },
2855
+ {
2856
+ "epoch": 0.0365,
2857
+ "grad_norm": 1.7525688409805298,
2858
+ "learning_rate": 4.991166304762994e-06,
2859
+ "loss": 1.4316,
2860
+ "step": 36500
2861
+ },
2862
+ {
2863
+ "epoch": 0.0366,
2864
+ "grad_norm": 1.9879366159439087,
2865
+ "learning_rate": 4.9910995467117205e-06,
2866
+ "loss": 1.4501,
2867
+ "step": 36600
2868
+ },
2869
+ {
2870
+ "epoch": 0.0367,
2871
+ "grad_norm": 2.008660316467285,
2872
+ "learning_rate": 4.99103253780679e-06,
2873
+ "loss": 1.4331,
2874
+ "step": 36700
2875
+ },
2876
+ {
2877
+ "epoch": 0.0368,
2878
+ "grad_norm": 1.977774977684021,
2879
+ "learning_rate": 4.990965278054952e-06,
2880
+ "loss": 1.4271,
2881
+ "step": 36800
2882
+ },
2883
+ {
2884
+ "epoch": 0.0369,
2885
+ "grad_norm": 1.8421337604522705,
2886
+ "learning_rate": 4.990897767462978e-06,
2887
+ "loss": 1.4363,
2888
+ "step": 36900
2889
+ },
2890
+ {
2891
+ "epoch": 0.037,
2892
+ "grad_norm": 1.959547519683838,
2893
+ "learning_rate": 4.990830006037667e-06,
2894
+ "loss": 1.4322,
2895
+ "step": 37000
2896
+ },
2897
+ {
2898
+ "epoch": 0.037,
2899
+ "eval_loss": 1.3740063905715942,
2900
+ "eval_runtime": 24.5809,
2901
+ "eval_samples_per_second": 203.41,
2902
+ "eval_steps_per_second": 3.214,
2903
+ "step": 37000
2904
+ },
2905
+ {
2906
+ "epoch": 0.0371,
2907
+ "grad_norm": 1.9228745698928833,
2908
+ "learning_rate": 4.9907619937858435e-06,
2909
+ "loss": 1.4269,
2910
+ "step": 37100
2911
+ },
2912
+ {
2913
+ "epoch": 0.0372,
2914
+ "grad_norm": 1.996996521949768,
2915
+ "learning_rate": 4.990693730714354e-06,
2916
+ "loss": 1.4289,
2917
+ "step": 37200
2918
+ },
2919
+ {
2920
+ "epoch": 0.0373,
2921
+ "grad_norm": 1.9373116493225098,
2922
+ "learning_rate": 4.9906252168300755e-06,
2923
+ "loss": 1.4184,
2924
+ "step": 37300
2925
+ },
2926
+ {
2927
+ "epoch": 0.0374,
2928
+ "grad_norm": 1.7933897972106934,
2929
+ "learning_rate": 4.9905564521399046e-06,
2930
+ "loss": 1.4376,
2931
+ "step": 37400
2932
+ },
2933
+ {
2934
+ "epoch": 0.0375,
2935
+ "grad_norm": 2.018866539001465,
2936
+ "learning_rate": 4.9904874366507674e-06,
2937
+ "loss": 1.4241,
2938
+ "step": 37500
2939
+ },
2940
+ {
2941
+ "epoch": 0.0376,
2942
+ "grad_norm": 2.0614266395568848,
2943
+ "learning_rate": 4.990418170369613e-06,
2944
+ "loss": 1.4441,
2945
+ "step": 37600
2946
+ },
2947
+ {
2948
+ "epoch": 0.0377,
2949
+ "grad_norm": 1.9219199419021606,
2950
+ "learning_rate": 4.990348653303417e-06,
2951
+ "loss": 1.4285,
2952
+ "step": 37700
2953
+ },
2954
+ {
2955
+ "epoch": 0.0378,
2956
+ "grad_norm": 2.0715043544769287,
2957
+ "learning_rate": 4.990278885459181e-06,
2958
+ "loss": 1.4361,
2959
+ "step": 37800
2960
+ },
2961
+ {
2962
+ "epoch": 0.0379,
2963
+ "grad_norm": 1.9379918575286865,
2964
+ "learning_rate": 4.9902088668439284e-06,
2965
+ "loss": 1.4371,
2966
+ "step": 37900
2967
+ },
2968
+ {
2969
+ "epoch": 0.038,
2970
+ "grad_norm": 1.9890401363372803,
2971
+ "learning_rate": 4.990138597464711e-06,
2972
+ "loss": 1.4273,
2973
+ "step": 38000
2974
+ },
2975
+ {
2976
+ "epoch": 0.038,
2977
+ "eval_loss": 1.387099266052246,
2978
+ "eval_runtime": 24.5479,
2979
+ "eval_samples_per_second": 203.683,
2980
+ "eval_steps_per_second": 3.218,
2981
+ "step": 38000
2982
+ },
2983
+ {
2984
+ "epoch": 0.0381,
2985
+ "grad_norm": 2.0055789947509766,
2986
+ "learning_rate": 4.990068077328606e-06,
2987
+ "loss": 1.422,
2988
+ "step": 38100
2989
+ },
2990
+ {
2991
+ "epoch": 0.0382,
2992
+ "grad_norm": 2.1018118858337402,
2993
+ "learning_rate": 4.989997306442712e-06,
2994
+ "loss": 1.4257,
2995
+ "step": 38200
2996
+ },
2997
+ {
2998
+ "epoch": 0.0383,
2999
+ "grad_norm": 1.901002049446106,
3000
+ "learning_rate": 4.989926284814158e-06,
3001
+ "loss": 1.4173,
3002
+ "step": 38300
3003
+ },
3004
+ {
3005
+ "epoch": 0.0384,
3006
+ "grad_norm": 1.8262403011322021,
3007
+ "learning_rate": 4.989855012450096e-06,
3008
+ "loss": 1.4332,
3009
+ "step": 38400
3010
+ },
3011
+ {
3012
+ "epoch": 0.0385,
3013
+ "grad_norm": 2.0205042362213135,
3014
+ "learning_rate": 4.989783489357703e-06,
3015
+ "loss": 1.4454,
3016
+ "step": 38500
3017
+ },
3018
+ {
3019
+ "epoch": 0.0386,
3020
+ "grad_norm": 2.0293779373168945,
3021
+ "learning_rate": 4.989711715544179e-06,
3022
+ "loss": 1.4315,
3023
+ "step": 38600
3024
+ },
3025
+ {
3026
+ "epoch": 0.0387,
3027
+ "grad_norm": 1.9387352466583252,
3028
+ "learning_rate": 4.989639691016754e-06,
3029
+ "loss": 1.4266,
3030
+ "step": 38700
3031
+ },
3032
+ {
3033
+ "epoch": 0.0388,
3034
+ "grad_norm": 1.9366799592971802,
3035
+ "learning_rate": 4.98956741578268e-06,
3036
+ "loss": 1.425,
3037
+ "step": 38800
3038
+ },
3039
+ {
3040
+ "epoch": 0.0389,
3041
+ "grad_norm": 1.973802089691162,
3042
+ "learning_rate": 4.989494889849236e-06,
3043
+ "loss": 1.4382,
3044
+ "step": 38900
3045
+ },
3046
+ {
3047
+ "epoch": 0.039,
3048
+ "grad_norm": 1.9763784408569336,
3049
+ "learning_rate": 4.989422113223724e-06,
3050
+ "loss": 1.4255,
3051
+ "step": 39000
3052
+ },
3053
+ {
3054
+ "epoch": 0.039,
3055
+ "eval_loss": 1.3631179332733154,
3056
+ "eval_runtime": 24.5541,
3057
+ "eval_samples_per_second": 203.632,
3058
+ "eval_steps_per_second": 3.217,
3059
+ "step": 39000
3060
+ },
3061
+ {
3062
+ "epoch": 0.0391,
3063
+ "grad_norm": 1.9820029735565186,
3064
+ "learning_rate": 4.989349085913474e-06,
3065
+ "loss": 1.4326,
3066
+ "step": 39100
3067
+ },
3068
+ {
3069
+ "epoch": 0.0392,
3070
+ "grad_norm": 1.9469517469406128,
3071
+ "learning_rate": 4.989275807925838e-06,
3072
+ "loss": 1.4299,
3073
+ "step": 39200
3074
+ },
3075
+ {
3076
+ "epoch": 0.0393,
3077
+ "grad_norm": 1.8591476678848267,
3078
+ "learning_rate": 4.989202279268198e-06,
3079
+ "loss": 1.4096,
3080
+ "step": 39300
3081
+ },
3082
+ {
3083
+ "epoch": 0.0394,
3084
+ "grad_norm": 1.9623719453811646,
3085
+ "learning_rate": 4.989128499947956e-06,
3086
+ "loss": 1.4368,
3087
+ "step": 39400
3088
+ },
3089
+ {
3090
+ "epoch": 0.0395,
3091
+ "grad_norm": 1.8734135627746582,
3092
+ "learning_rate": 4.989054469972541e-06,
3093
+ "loss": 1.4442,
3094
+ "step": 39500
3095
+ },
3096
+ {
3097
+ "epoch": 0.0396,
3098
+ "grad_norm": 1.8288686275482178,
3099
+ "learning_rate": 4.98898018934941e-06,
3100
+ "loss": 1.4371,
3101
+ "step": 39600
3102
+ },
3103
+ {
3104
+ "epoch": 0.0397,
3105
+ "grad_norm": 2.0335354804992676,
3106
+ "learning_rate": 4.988905658086041e-06,
3107
+ "loss": 1.4147,
3108
+ "step": 39700
3109
+ },
3110
+ {
3111
+ "epoch": 0.0398,
3112
+ "grad_norm": 1.8424173593521118,
3113
+ "learning_rate": 4.988830876189941e-06,
3114
+ "loss": 1.4347,
3115
+ "step": 39800
3116
+ },
3117
+ {
3118
+ "epoch": 0.0399,
3119
+ "grad_norm": 1.9924767017364502,
3120
+ "learning_rate": 4.9887558436686415e-06,
3121
+ "loss": 1.4396,
3122
+ "step": 39900
3123
+ },
3124
+ {
3125
+ "epoch": 0.04,
3126
+ "grad_norm": 2.015306234359741,
3127
+ "learning_rate": 4.988680560529695e-06,
3128
+ "loss": 1.4322,
3129
+ "step": 40000
3130
+ },
3131
+ {
3132
+ "epoch": 0.04,
3133
+ "eval_loss": 1.3669300079345703,
3134
+ "eval_runtime": 24.5711,
3135
+ "eval_samples_per_second": 203.491,
3136
+ "eval_steps_per_second": 3.215,
3137
+ "step": 40000
3138
  }
3139
  ],
3140
  "logging_steps": 100,
 
3154
  "attributes": {}
3155
  }
3156
  },
3157
+ "total_flos": 5.41544675278848e+18,
3158
  "train_batch_size": 64,
3159
  "trial_name": null,
3160
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5045bb023fc9f9ce18adc5b4d8a1c05111e1d7d6f92b7d0e1888eae4ede00e23
3
  size 5777
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1ed4792935f60351e97f9f7d12d441a439016de4467a0443320af5be32108cf
3
  size 5777