PurplelinkPL commited on
Commit
9750131
·
verified ·
1 Parent(s): 91d9c5b

Upload 9 files

Browse files
Files changed (5) hide show
  1. config.json +2 -0
  2. model.safetensors +1 -1
  3. rng_state.pth +1 -1
  4. trainer_state.json +686 -686
  5. training_args.bin +1 -1
config.json CHANGED
@@ -4,6 +4,7 @@
4
  ],
5
  "attention_bias": false,
6
  "attention_dropout": 0.0,
 
7
  "attn_implementation": "sdpa",
8
  "bos_token_id": 50281,
9
  "classifier_activation": "gelu",
@@ -20,6 +21,7 @@
20
  "global_rope_theta": 160000.0,
21
  "gradient_checkpointing": false,
22
  "hidden_activation": "gelu",
 
23
  "hidden_size": 1024,
24
  "initializer_cutoff_factor": 2.0,
25
  "initializer_range": 0.02,
 
4
  ],
5
  "attention_bias": false,
6
  "attention_dropout": 0.0,
7
+ "attention_probs_dropout_prob": 0.1,
8
  "attn_implementation": "sdpa",
9
  "bos_token_id": 50281,
10
  "classifier_activation": "gelu",
 
21
  "global_rope_theta": 160000.0,
22
  "gradient_checkpointing": false,
23
  "hidden_activation": "gelu",
24
+ "hidden_dropout_prob": 0.1,
25
  "hidden_size": 1024,
26
  "initializer_cutoff_factor": 2.0,
27
  "initializer_range": 0.02,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c513c87136b7061f89a0058cf57e10feabc8eaa6dc84ac77ff0f5a223c2f19c
3
  size 1583544840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d99e23c9c5e198f1a7197faedddc865f198f1ac2bcdc84e3402a78043d8ae5c8
3
  size 1583544840
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:449e44f9adf4d083aec6625b9110f6a9a09baba982e3a32de94ff0c135c00f4d
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73ce21f5865b864b77c3be4b62e9a259611aacea0d4451a245cb98c83253561d
3
  size 14645
trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.08784,
6
  "eval_steps": 1000,
7
  "global_step": 25000,
8
  "is_hyper_param_search": false,
@@ -11,1959 +11,1959 @@
11
  "log_history": [
12
  {
13
  "epoch": 4e-05,
14
- "grad_norm": 0.911555290222168,
15
  "learning_rate": 0.0,
16
- "loss": 0.7505,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.004,
21
- "grad_norm": 1.2557882070541382,
22
  "learning_rate": 9.9e-07,
23
- "loss": 0.831,
24
  "step": 100
25
  },
26
  {
27
  "epoch": 0.008,
28
- "grad_norm": 0.9086900353431702,
29
  "learning_rate": 1.9900000000000004e-06,
30
- "loss": 0.8295,
31
  "step": 200
32
  },
33
  {
34
  "epoch": 0.012,
35
- "grad_norm": 0.9221948385238647,
36
  "learning_rate": 2.4999758220143106e-06,
37
- "loss": 0.8411,
38
  "step": 300
39
  },
40
  {
41
  "epoch": 0.016,
42
- "grad_norm": 0.8809811472892761,
43
  "learning_rate": 2.4997764426529066e-06,
44
- "loss": 0.8288,
45
  "step": 400
46
  },
47
  {
48
  "epoch": 0.02,
49
- "grad_norm": 1.3145067691802979,
50
  "learning_rate": 2.499375702067717e-06,
51
- "loss": 0.8312,
52
  "step": 500
53
  },
54
  {
55
  "epoch": 0.024,
56
- "grad_norm": 0.9034631252288818,
57
  "learning_rate": 2.4987736648251815e-06,
58
- "loss": 0.8385,
59
  "step": 600
60
  },
61
  {
62
  "epoch": 0.028,
63
- "grad_norm": 0.8681179881095886,
64
  "learning_rate": 2.497970427924213e-06,
65
- "loss": 0.8175,
66
  "step": 700
67
  },
68
  {
69
  "epoch": 0.032,
70
- "grad_norm": 0.9303165674209595,
71
  "learning_rate": 2.496966120780569e-06,
72
- "loss": 0.8281,
73
  "step": 800
74
  },
75
  {
76
  "epoch": 0.036,
77
- "grad_norm": 0.9573058485984802,
78
  "learning_rate": 2.4957609052060012e-06,
79
- "loss": 0.8326,
80
  "step": 900
81
  },
82
  {
83
  "epoch": 0.04,
84
- "grad_norm": 0.9730055928230286,
85
  "learning_rate": 2.4943549753821847e-06,
86
- "loss": 0.8391,
87
  "step": 1000
88
  },
89
  {
90
  "epoch": 0.04,
91
- "eval_loss": 1.5264503955841064,
92
- "eval_runtime": 104.8997,
93
- "eval_samples_per_second": 130.553,
94
- "eval_steps_per_second": 2.04,
95
  "step": 1000
96
  },
97
  {
98
  "epoch": 0.044,
99
- "grad_norm": 0.8237825632095337,
100
  "learning_rate": 2.4927485578294313e-06,
101
- "loss": 0.8176,
102
  "step": 1100
103
  },
104
  {
105
  "epoch": 0.048,
106
- "grad_norm": 0.9133234620094299,
107
  "learning_rate": 2.4909419113701947e-06,
108
- "loss": 0.8303,
109
  "step": 1200
110
  },
111
  {
112
  "epoch": 0.052,
113
- "grad_norm": 0.9377557635307312,
114
  "learning_rate": 2.4889353270873663e-06,
115
- "loss": 0.8159,
116
  "step": 1300
117
  },
118
  {
119
  "epoch": 0.056,
120
- "grad_norm": 0.9034435749053955,
121
  "learning_rate": 2.4867291282773805e-06,
122
- "loss": 0.8145,
123
  "step": 1400
124
  },
125
  {
126
  "epoch": 0.06,
127
- "grad_norm": 1.0601003170013428,
128
  "learning_rate": 2.4843236703981235e-06,
129
- "loss": 0.8317,
130
  "step": 1500
131
  },
132
  {
133
  "epoch": 0.064,
134
- "grad_norm": 0.9157763719558716,
135
  "learning_rate": 2.481719341011662e-06,
136
- "loss": 0.8355,
137
  "step": 1600
138
  },
139
  {
140
  "epoch": 0.068,
141
- "grad_norm": 0.9011576175689697,
142
  "learning_rate": 2.4789165597218035e-06,
143
- "loss": 0.8319,
144
  "step": 1700
145
  },
146
  {
147
  "epoch": 0.072,
148
- "grad_norm": 0.8954268097877502,
149
  "learning_rate": 2.475915778106486e-06,
150
- "loss": 0.8156,
151
  "step": 1800
152
  },
153
  {
154
  "epoch": 0.076,
155
- "grad_norm": 0.8911709189414978,
156
  "learning_rate": 2.4727174796450266e-06,
157
- "loss": 0.8365,
158
  "step": 1900
159
  },
160
  {
161
  "epoch": 0.08,
162
- "grad_norm": 0.9407449960708618,
163
  "learning_rate": 2.4693221796402166e-06,
164
- "loss": 0.8288,
165
  "step": 2000
166
  },
167
  {
168
  "epoch": 0.08,
169
- "eval_loss": 1.5217734575271606,
170
- "eval_runtime": 98.2235,
171
- "eval_samples_per_second": 139.427,
172
- "eval_steps_per_second": 2.179,
173
  "step": 2000
174
  },
175
  {
176
  "epoch": 0.084,
177
- "grad_norm": 0.8769101500511169,
178
  "learning_rate": 2.4657304251353047e-06,
179
- "loss": 0.8131,
180
  "step": 2100
181
  },
182
  {
183
  "epoch": 0.088,
184
- "grad_norm": 0.8608514070510864,
185
  "learning_rate": 2.4619427948258547e-06,
186
- "loss": 0.8088,
187
  "step": 2200
188
  },
189
  {
190
  "epoch": 0.092,
191
- "grad_norm": 0.9365686178207397,
192
  "learning_rate": 2.4579598989665065e-06,
193
- "loss": 0.8286,
194
  "step": 2300
195
  },
196
  {
197
  "epoch": 0.096,
198
- "grad_norm": 0.928945779800415,
199
  "learning_rate": 2.453782379272657e-06,
200
- "loss": 0.8109,
201
  "step": 2400
202
  },
203
  {
204
  "epoch": 0.1,
205
- "grad_norm": 0.9162323474884033,
206
  "learning_rate": 2.449410908817064e-06,
207
- "loss": 0.806,
208
  "step": 2500
209
  },
210
  {
211
  "epoch": 0.104,
212
- "grad_norm": 0.9436105489730835,
213
  "learning_rate": 2.444846191921406e-06,
214
- "loss": 0.7969,
215
  "step": 2600
216
  },
217
  {
218
  "epoch": 0.108,
219
- "grad_norm": 0.9459385871887207,
220
  "learning_rate": 2.4400889640427992e-06,
221
- "loss": 0.8315,
222
  "step": 2700
223
  },
224
  {
225
  "epoch": 0.112,
226
- "grad_norm": 0.9575082063674927,
227
  "learning_rate": 2.435139991655308e-06,
228
- "loss": 0.8324,
229
  "step": 2800
230
  },
231
  {
232
  "epoch": 0.116,
233
- "grad_norm": 0.927148163318634,
234
  "learning_rate": 2.4300000721264466e-06,
235
- "loss": 0.8267,
236
  "step": 2900
237
  },
238
  {
239
  "epoch": 0.12,
240
- "grad_norm": 0.9774505496025085,
241
  "learning_rate": 2.4246700335887123e-06,
242
- "loss": 0.8262,
243
  "step": 3000
244
  },
245
  {
246
  "epoch": 0.12,
247
- "eval_loss": 1.5202959775924683,
248
- "eval_runtime": 98.5199,
249
- "eval_samples_per_second": 139.007,
250
- "eval_steps_per_second": 2.172,
251
  "step": 3000
252
  },
253
  {
254
  "epoch": 0.124,
255
- "grad_norm": 0.9433075785636902,
256
  "learning_rate": 2.4191507348061575e-06,
257
- "loss": 0.803,
258
  "step": 3100
259
  },
260
  {
261
  "epoch": 0.128,
262
- "grad_norm": 0.9418466091156006,
263
  "learning_rate": 2.4134430650360284e-06,
264
- "loss": 0.8088,
265
  "step": 3200
266
  },
267
  {
268
  "epoch": 0.132,
269
- "grad_norm": 0.9223436713218689,
270
  "learning_rate": 2.407547943885489e-06,
271
- "loss": 0.8116,
272
  "step": 3300
273
  },
274
  {
275
  "epoch": 0.136,
276
- "grad_norm": 0.9359924793243408,
277
  "learning_rate": 2.4014663211634552e-06,
278
- "loss": 0.8232,
279
  "step": 3400
280
  },
281
  {
282
  "epoch": 0.14,
283
- "grad_norm": 0.9347231388092041,
284
  "learning_rate": 2.395199176727567e-06,
285
- "loss": 0.8131,
286
  "step": 3500
287
  },
288
  {
289
  "epoch": 0.144,
290
- "grad_norm": 0.9255951046943665,
291
  "learning_rate": 2.388747520326311e-06,
292
- "loss": 0.8064,
293
  "step": 3600
294
  },
295
  {
296
  "epoch": 0.148,
297
- "grad_norm": 0.8580342531204224,
298
  "learning_rate": 2.3821123914363374e-06,
299
- "loss": 0.8247,
300
  "step": 3700
301
  },
302
  {
303
  "epoch": 0.152,
304
- "grad_norm": 0.8920683860778809,
305
  "learning_rate": 2.3752948590949766e-06,
306
- "loss": 0.8058,
307
  "step": 3800
308
  },
309
  {
310
  "epoch": 0.156,
311
- "grad_norm": 0.8848472237586975,
312
  "learning_rate": 2.368296021728002e-06,
313
- "loss": 0.8209,
314
  "step": 3900
315
  },
316
  {
317
  "epoch": 0.16,
318
- "grad_norm": 0.9708815217018127,
319
  "learning_rate": 2.3611170069726532e-06,
320
- "loss": 0.8216,
321
  "step": 4000
322
  },
323
  {
324
  "epoch": 0.16,
325
- "eval_loss": 1.5283503532409668,
326
- "eval_runtime": 98.9755,
327
- "eval_samples_per_second": 138.368,
328
- "eval_steps_per_second": 2.162,
329
  "step": 4000
330
  },
331
  {
332
  "epoch": 0.164,
333
- "grad_norm": 0.8715313673019409,
334
  "learning_rate": 2.3537589714959523e-06,
335
- "loss": 0.8185,
336
  "step": 4100
337
  },
338
  {
339
  "epoch": 0.168,
340
- "grad_norm": 0.9748795032501221,
341
  "learning_rate": 2.346223100808346e-06,
342
- "loss": 0.8172,
343
  "step": 4200
344
  },
345
  {
346
  "epoch": 0.172,
347
- "grad_norm": 0.900182843208313,
348
  "learning_rate": 2.3385106090726974e-06,
349
- "loss": 0.8101,
350
  "step": 4300
351
  },
352
  {
353
  "epoch": 0.176,
354
- "grad_norm": 0.8882376551628113,
355
  "learning_rate": 2.330622738908663e-06,
356
- "loss": 0.8004,
357
  "step": 4400
358
  },
359
  {
360
  "epoch": 0.18,
361
- "grad_norm": 0.9087768793106079,
362
  "learning_rate": 2.322560761192485e-06,
363
- "loss": 0.8028,
364
  "step": 4500
365
  },
366
  {
367
  "epoch": 0.184,
368
- "grad_norm": 0.9928045868873596,
369
  "learning_rate": 2.3143259748522308e-06,
370
- "loss": 0.8257,
371
  "step": 4600
372
  },
373
  {
374
  "epoch": 0.188,
375
- "grad_norm": 0.9519675970077515,
376
  "learning_rate": 2.3059197066585126e-06,
377
- "loss": 0.817,
378
  "step": 4700
379
  },
380
  {
381
  "epoch": 0.192,
382
- "grad_norm": 0.970738410949707,
383
  "learning_rate": 2.297343311010719e-06,
384
- "loss": 0.8109,
385
  "step": 4800
386
  },
387
  {
388
  "epoch": 0.196,
389
- "grad_norm": 0.9740980267524719,
390
  "learning_rate": 2.2885981697188002e-06,
391
- "loss": 0.8168,
392
  "step": 4900
393
  },
394
  {
395
  "epoch": 0.2,
396
- "grad_norm": 0.9454805850982666,
397
  "learning_rate": 2.2796856917806313e-06,
398
- "loss": 0.8305,
399
  "step": 5000
400
  },
401
  {
402
  "epoch": 0.2,
403
- "eval_loss": 1.5317082405090332,
404
- "eval_runtime": 98.9715,
405
- "eval_samples_per_second": 138.373,
406
  "eval_steps_per_second": 2.162,
407
  "step": 5000
408
  },
409
  {
410
  "epoch": 0.204,
411
- "grad_norm": 0.9181498289108276,
412
  "learning_rate": 2.270607313155e-06,
413
- "loss": 0.807,
414
  "step": 5100
415
  },
416
  {
417
  "epoch": 0.208,
418
- "grad_norm": 0.8452897071838379,
419
  "learning_rate": 2.2613644965302456e-06,
420
- "loss": 0.802,
421
  "step": 5200
422
  },
423
  {
424
  "epoch": 0.212,
425
- "grad_norm": 0.8827036619186401,
426
  "learning_rate": 2.251958731088596e-06,
427
- "loss": 0.8001,
428
  "step": 5300
429
  },
430
  {
431
  "epoch": 0.216,
432
- "grad_norm": 0.8728039264678955,
433
  "learning_rate": 2.242391532266232e-06,
434
- "loss": 0.8211,
435
  "step": 5400
436
  },
437
  {
438
  "epoch": 0.22,
439
- "grad_norm": 0.9410618543624878,
440
  "learning_rate": 2.2326644415091264e-06,
441
- "loss": 0.7996,
442
  "step": 5500
443
  },
444
  {
445
  "epoch": 0.224,
446
- "grad_norm": 0.9829330444335938,
447
  "learning_rate": 2.2227790260246856e-06,
448
- "loss": 0.7971,
449
  "step": 5600
450
  },
451
  {
452
  "epoch": 0.228,
453
- "grad_norm": 0.9688398241996765,
454
  "learning_rate": 2.2127368785292484e-06,
455
- "loss": 0.7854,
456
  "step": 5700
457
  },
458
  {
459
  "epoch": 0.232,
460
- "grad_norm": 0.864470362663269,
461
  "learning_rate": 2.2025396169914697e-06,
462
- "loss": 0.8192,
463
  "step": 5800
464
  },
465
  {
466
  "epoch": 0.236,
467
- "grad_norm": 0.9038395881652832,
468
  "learning_rate": 2.1921888843716356e-06,
469
- "loss": 0.8005,
470
  "step": 5900
471
  },
472
  {
473
  "epoch": 0.24,
474
- "grad_norm": 0.8807651996612549,
475
  "learning_rate": 2.181686348356955e-06,
476
- "loss": 0.806,
477
  "step": 6000
478
  },
479
  {
480
  "epoch": 0.24,
481
- "eval_loss": 1.524116039276123,
482
- "eval_runtime": 99.2477,
483
- "eval_samples_per_second": 137.988,
484
- "eval_steps_per_second": 2.156,
485
  "step": 6000
486
  },
487
  {
488
  "epoch": 0.244,
489
- "grad_norm": 1.0644515752792358,
490
  "learning_rate": 2.1710337010928655e-06,
491
- "loss": 0.8232,
492
  "step": 6100
493
  },
494
  {
495
  "epoch": 0.248,
496
- "grad_norm": 0.9187564253807068,
497
  "learning_rate": 2.1602326589103967e-06,
498
- "loss": 0.8036,
499
  "step": 6200
500
  },
501
  {
502
  "epoch": 0.252,
503
- "grad_norm": 0.9233301877975464,
504
  "learning_rate": 2.1492849620496414e-06,
505
- "loss": 0.8118,
506
  "step": 6300
507
  },
508
  {
509
  "epoch": 0.256,
510
- "grad_norm": 0.9559895396232605,
511
  "learning_rate": 2.13819237437937e-06,
512
- "loss": 0.7959,
513
  "step": 6400
514
  },
515
  {
516
  "epoch": 0.26,
517
- "grad_norm": 0.8455320000648499,
518
  "learning_rate": 2.126956683112842e-06,
519
- "loss": 0.8254,
520
  "step": 6500
521
  },
522
  {
523
  "epoch": 0.264,
524
- "grad_norm": 0.942471444606781,
525
  "learning_rate": 2.1155796985198495e-06,
526
- "loss": 0.808,
527
  "step": 6600
528
  },
529
  {
530
  "epoch": 0.268,
531
- "grad_norm": 0.8535305261611938,
532
  "learning_rate": 2.1040632536350573e-06,
533
- "loss": 0.8182,
534
  "step": 6700
535
  },
536
  {
537
  "epoch": 0.272,
538
- "grad_norm": 0.8879380226135254,
539
  "learning_rate": 2.092409203962663e-06,
540
- "loss": 0.8177,
541
  "step": 6800
542
  },
543
  {
544
  "epoch": 0.276,
545
- "grad_norm": 0.8684147000312805,
546
  "learning_rate": 2.080619427177443e-06,
547
- "loss": 0.7982,
548
  "step": 6900
549
  },
550
  {
551
  "epoch": 0.28,
552
- "grad_norm": 0.9437069892883301,
553
  "learning_rate": 2.0686958228222298e-06,
554
- "loss": 0.7984,
555
  "step": 7000
556
  },
557
  {
558
  "epoch": 0.28,
559
- "eval_loss": 1.530232548713684,
560
- "eval_runtime": 99.3518,
561
- "eval_samples_per_second": 137.844,
562
- "eval_steps_per_second": 2.154,
563
  "step": 7000
564
  },
565
  {
566
  "epoch": 0.284,
567
- "grad_norm": 0.9226755499839783,
568
  "learning_rate": 2.056640312001856e-06,
569
- "loss": 0.8072,
570
  "step": 7100
571
  },
572
  {
573
  "epoch": 0.288,
574
- "grad_norm": 0.9192745685577393,
575
  "learning_rate": 2.0444548370736335e-06,
576
- "loss": 0.8081,
577
  "step": 7200
578
  },
579
  {
580
  "epoch": 0.292,
581
- "grad_norm": 1.026985764503479,
582
  "learning_rate": 2.032141361334406e-06,
583
- "loss": 0.8074,
584
  "step": 7300
585
  },
586
  {
587
  "epoch": 0.296,
588
- "grad_norm": 0.8428290486335754,
589
  "learning_rate": 2.019701868704224e-06,
590
- "loss": 0.8081,
591
  "step": 7400
592
  },
593
  {
594
  "epoch": 0.3,
595
- "grad_norm": 0.9866459369659424,
596
  "learning_rate": 2.007138363406702e-06,
597
- "loss": 0.8241,
598
  "step": 7500
599
  },
600
  {
601
  "epoch": 0.304,
602
- "grad_norm": 0.9240759015083313,
603
  "learning_rate": 1.9944528696461016e-06,
604
- "loss": 0.8089,
605
  "step": 7600
606
  },
607
  {
608
  "epoch": 0.308,
609
- "grad_norm": 0.8980386853218079,
610
  "learning_rate": 1.9816474312811984e-06,
611
- "loss": 0.7995,
612
  "step": 7700
613
  },
614
  {
615
  "epoch": 0.312,
616
- "grad_norm": 0.9766695499420166,
617
  "learning_rate": 1.9687241114959753e-06,
618
- "loss": 0.7969,
619
  "step": 7800
620
  },
621
  {
622
  "epoch": 0.316,
623
- "grad_norm": 0.8739997148513794,
624
  "learning_rate": 1.955684992467211e-06,
625
- "loss": 0.8053,
626
  "step": 7900
627
  },
628
  {
629
  "epoch": 0.32,
630
- "grad_norm": 0.9071422219276428,
631
  "learning_rate": 1.942532175029003e-06,
632
- "loss": 0.7896,
633
  "step": 8000
634
  },
635
  {
636
  "epoch": 0.32,
637
- "eval_loss": 1.5243619680404663,
638
- "eval_runtime": 99.5243,
639
- "eval_samples_per_second": 137.605,
640
- "eval_steps_per_second": 2.15,
641
  "step": 8000
642
  },
643
  {
644
  "epoch": 0.324,
645
- "grad_norm": 0.9778127670288086,
646
  "learning_rate": 1.929267778334285e-06,
647
- "loss": 0.7878,
648
  "step": 8100
649
  },
650
  {
651
  "epoch": 0.328,
652
- "grad_norm": 0.9122934937477112,
653
  "learning_rate": 1.915893939513396e-06,
654
- "loss": 0.7967,
655
  "step": 8200
656
  },
657
  {
658
  "epoch": 0.332,
659
- "grad_norm": 0.90513676404953,
660
  "learning_rate": 1.9024128133297467e-06,
661
- "loss": 0.8048,
662
  "step": 8300
663
  },
664
  {
665
  "epoch": 0.336,
666
- "grad_norm": 0.9107154607772827,
667
  "learning_rate": 1.8888265718326532e-06,
668
- "loss": 0.7944,
669
  "step": 8400
670
  },
671
  {
672
  "epoch": 0.34,
673
- "grad_norm": 0.8964477777481079,
674
  "learning_rate": 1.8751374040073774e-06,
675
- "loss": 0.7958,
676
  "step": 8500
677
  },
678
  {
679
  "epoch": 0.344,
680
- "grad_norm": 0.9018213152885437,
681
  "learning_rate": 1.8613475154224456e-06,
682
- "loss": 0.8065,
683
  "step": 8600
684
  },
685
  {
686
  "epoch": 0.348,
687
- "grad_norm": 0.9653429985046387,
688
  "learning_rate": 1.8474591278742894e-06,
689
- "loss": 0.8194,
690
  "step": 8700
691
  },
692
  {
693
  "epoch": 0.352,
694
- "grad_norm": 0.9324017763137817,
695
  "learning_rate": 1.8334744790292766e-06,
696
- "loss": 0.796,
697
  "step": 8800
698
  },
699
  {
700
  "epoch": 0.356,
701
- "grad_norm": 1.0298709869384766,
702
  "learning_rate": 1.8193958220631833e-06,
703
- "loss": 0.8268,
704
  "step": 8900
705
  },
706
  {
707
  "epoch": 0.36,
708
- "grad_norm": 0.8846196532249451,
709
  "learning_rate": 1.805225425298166e-06,
710
- "loss": 0.825,
711
  "step": 9000
712
  },
713
  {
714
  "epoch": 0.36,
715
- "eval_loss": 1.5243308544158936,
716
- "eval_runtime": 100.5198,
717
- "eval_samples_per_second": 136.242,
718
- "eval_steps_per_second": 2.129,
719
  "step": 9000
720
  },
721
  {
722
  "epoch": 0.364,
723
- "grad_norm": 0.8830705881118774,
724
  "learning_rate": 1.790965571837296e-06,
725
- "loss": 0.8233,
726
  "step": 9100
727
  },
728
  {
729
  "epoch": 0.368,
730
- "grad_norm": 0.9197975993156433,
731
  "learning_rate": 1.7766185591967092e-06,
732
- "loss": 0.8299,
733
  "step": 9200
734
  },
735
  {
736
  "epoch": 0.372,
737
- "grad_norm": 1.0428673028945923,
738
  "learning_rate": 1.762186698935437e-06,
739
- "loss": 0.8182,
740
  "step": 9300
741
  },
742
  {
743
  "epoch": 0.376,
744
- "grad_norm": 0.9466006755828857,
745
  "learning_rate": 1.7476723162829723e-06,
746
- "loss": 0.8255,
747
  "step": 9400
748
  },
749
  {
750
  "epoch": 0.38,
751
- "grad_norm": 0.9237021803855896,
752
  "learning_rate": 1.7330777497646328e-06,
753
- "loss": 0.7672,
754
  "step": 9500
755
  },
756
  {
757
  "epoch": 0.384,
758
- "grad_norm": 0.917202353477478,
759
  "learning_rate": 1.7184053508247853e-06,
760
- "loss": 0.8427,
761
  "step": 9600
762
  },
763
  {
764
  "epoch": 0.388,
765
- "grad_norm": 0.9462612271308899,
766
  "learning_rate": 1.703657483447983e-06,
767
- "loss": 0.8409,
768
  "step": 9700
769
  },
770
  {
771
  "epoch": 0.392,
772
- "grad_norm": 0.8924245834350586,
773
  "learning_rate": 1.6888365237780886e-06,
774
- "loss": 0.8335,
775
  "step": 9800
776
  },
777
  {
778
  "epoch": 0.396,
779
- "grad_norm": 0.9719087481498718,
780
  "learning_rate": 1.6739448597354327e-06,
781
- "loss": 0.826,
782
  "step": 9900
783
  },
784
  {
785
  "epoch": 0.4,
786
- "grad_norm": 0.8893173336982727,
787
  "learning_rate": 1.6589848906320794e-06,
788
- "loss": 0.8326,
789
  "step": 10000
790
  },
791
  {
792
  "epoch": 0.4,
793
- "eval_loss": 1.5264792442321777,
794
- "eval_runtime": 101.5699,
795
- "eval_samples_per_second": 134.833,
796
- "eval_steps_per_second": 2.107,
797
  "step": 10000
798
  },
799
  {
800
  "epoch": 0.404,
801
- "grad_norm": 0.8719335198402405,
802
  "learning_rate": 1.6439590267852528e-06,
803
- "loss": 0.8198,
804
  "step": 10100
805
  },
806
  {
807
  "epoch": 0.408,
808
- "grad_norm": 0.8997857570648193,
809
  "learning_rate": 1.6288696891289938e-06,
810
- "loss": 0.8103,
811
  "step": 10200
812
  },
813
  {
814
  "epoch": 0.412,
815
- "grad_norm": 0.9756138920783997,
816
  "learning_rate": 1.6137193088241021e-06,
817
- "loss": 0.8245,
818
  "step": 10300
819
  },
820
  {
821
  "epoch": 0.416,
822
- "grad_norm": 1.009027123451233,
823
  "learning_rate": 1.598510326866435e-06,
824
- "loss": 0.8226,
825
  "step": 10400
826
  },
827
  {
828
  "epoch": 0.42,
829
- "grad_norm": 0.9941139221191406,
830
  "learning_rate": 1.583245193693619e-06,
831
- "loss": 0.8154,
832
  "step": 10500
833
  },
834
  {
835
  "epoch": 0.424,
836
- "grad_norm": 0.9156614542007446,
837
  "learning_rate": 1.5679263687902402e-06,
838
- "loss": 0.8194,
839
  "step": 10600
840
  },
841
  {
842
  "epoch": 0.428,
843
- "grad_norm": 0.9270005226135254,
844
  "learning_rate": 1.552556320291578e-06,
845
- "loss": 0.8144,
846
  "step": 10700
847
  },
848
  {
849
  "epoch": 0.432,
850
- "grad_norm": 0.9664807915687561,
851
  "learning_rate": 1.5371375245859446e-06,
852
- "loss": 0.823,
853
  "step": 10800
854
  },
855
  {
856
  "epoch": 0.436,
857
- "grad_norm": 0.9909628629684448,
858
  "learning_rate": 1.5216724659156944e-06,
859
- "loss": 0.8319,
860
  "step": 10900
861
  },
862
  {
863
  "epoch": 0.44,
864
- "grad_norm": 1.0144808292388916,
865
  "learning_rate": 1.506163635976969e-06,
866
- "loss": 0.8272,
867
  "step": 11000
868
  },
869
  {
870
  "epoch": 0.44,
871
- "eval_loss": 1.5209919214248657,
872
- "eval_runtime": 101.3638,
873
- "eval_samples_per_second": 135.107,
874
- "eval_steps_per_second": 2.111,
875
  "step": 11000
876
  },
877
  {
878
  "epoch": 0.444,
879
- "grad_norm": 0.9689117074012756,
880
  "learning_rate": 1.49061353351824e-06,
881
- "loss": 0.8408,
882
  "step": 11100
883
  },
884
  {
885
  "epoch": 0.448,
886
- "grad_norm": 1.0267921686172485,
887
  "learning_rate": 1.4750246639377161e-06,
888
- "loss": 0.8362,
889
  "step": 11200
890
  },
891
  {
892
  "epoch": 0.452,
893
- "grad_norm": 0.920600175857544,
894
  "learning_rate": 1.4593995388796797e-06,
895
- "loss": 0.8343,
896
  "step": 11300
897
  },
898
  {
899
  "epoch": 0.456,
900
- "grad_norm": 1.025995135307312,
901
  "learning_rate": 1.4437406758298156e-06,
902
- "loss": 0.8255,
903
  "step": 11400
904
  },
905
  {
906
  "epoch": 0.46,
907
- "grad_norm": 0.889402449131012,
908
  "learning_rate": 1.428050597709599e-06,
909
- "loss": 0.839,
910
  "step": 11500
911
  },
912
  {
913
  "epoch": 0.464,
914
- "grad_norm": 0.8957056999206543,
915
  "learning_rate": 1.412331832469809e-06,
916
- "loss": 0.8304,
917
  "step": 11600
918
  },
919
  {
920
  "epoch": 0.468,
921
- "grad_norm": 0.9389684796333313,
922
  "learning_rate": 1.39658691268323e-06,
923
- "loss": 0.8523,
924
  "step": 11700
925
  },
926
  {
927
  "epoch": 0.472,
928
- "grad_norm": 0.9115435481071472,
929
  "learning_rate": 1.3808183751366089e-06,
930
- "loss": 0.8421,
931
  "step": 11800
932
  },
933
  {
934
  "epoch": 0.476,
935
- "grad_norm": 0.9521908164024353,
936
  "learning_rate": 1.3650287604219342e-06,
937
- "loss": 0.8704,
938
  "step": 11900
939
  },
940
  {
941
  "epoch": 0.48,
942
- "grad_norm": 0.9166862964630127,
943
  "learning_rate": 1.3492206125271016e-06,
944
- "loss": 0.8527,
945
  "step": 12000
946
  },
947
  {
948
  "epoch": 0.48,
949
- "eval_loss": 1.5229912996292114,
950
- "eval_runtime": 101.6086,
951
- "eval_samples_per_second": 134.782,
952
- "eval_steps_per_second": 2.106,
953
  "step": 12000
954
  },
955
  {
956
  "epoch": 0.484,
957
- "grad_norm": 0.9557492733001709,
958
  "learning_rate": 1.333396478426031e-06,
959
- "loss": 0.8499,
960
  "step": 12100
961
  },
962
  {
963
  "epoch": 0.488,
964
- "grad_norm": 0.9957550764083862,
965
  "learning_rate": 1.317558907668306e-06,
966
- "loss": 0.8534,
967
  "step": 12200
968
  },
969
  {
970
  "epoch": 0.492,
971
- "grad_norm": 1.1370068788528442,
972
  "learning_rate": 1.3017104519683932e-06,
973
- "loss": 0.8336,
974
  "step": 12300
975
  },
976
  {
977
  "epoch": 0.496,
978
- "grad_norm": 0.9006808400154114,
979
  "learning_rate": 1.285853664794518e-06,
980
- "loss": 0.8196,
981
  "step": 12400
982
  },
983
  {
984
  "epoch": 0.5,
985
- "grad_norm": 0.9441719651222229,
986
  "learning_rate": 1.269991100957254e-06,
987
- "loss": 0.844,
988
  "step": 12500
989
  },
990
  {
991
  "epoch": 0.504,
992
- "grad_norm": 0.8616164922714233,
993
  "learning_rate": 1.2541253161978986e-06,
994
- "loss": 0.8319,
995
  "step": 12600
996
  },
997
  {
998
  "epoch": 0.508,
999
- "grad_norm": 0.9243165850639343,
1000
  "learning_rate": 1.238258866776697e-06,
1001
- "loss": 0.8307,
1002
  "step": 12700
1003
  },
1004
  {
1005
  "epoch": 0.512,
1006
- "grad_norm": 0.9617546796798706,
1007
  "learning_rate": 1.222394309060982e-06,
1008
- "loss": 0.8562,
1009
  "step": 12800
1010
  },
1011
  {
1012
  "epoch": 0.516,
1013
- "grad_norm": 0.8897221684455872,
1014
  "learning_rate": 1.2065341991133013e-06,
1015
- "loss": 0.8344,
1016
  "step": 12900
1017
  },
1018
  {
1019
  "epoch": 0.52,
1020
- "grad_norm": 0.8364721536636353,
1021
  "learning_rate": 1.1906810922795864e-06,
1022
- "loss": 0.8389,
1023
  "step": 13000
1024
  },
1025
  {
1026
  "epoch": 0.52,
1027
- "eval_loss": 1.5288289785385132,
1028
- "eval_runtime": 101.5647,
1029
- "eval_samples_per_second": 134.84,
1030
- "eval_steps_per_second": 2.107,
1031
  "step": 13000
1032
  },
1033
  {
1034
  "epoch": 0.524,
1035
- "grad_norm": 1.0084967613220215,
1036
  "learning_rate": 1.1748375427774422e-06,
1037
- "loss": 0.8498,
1038
  "step": 13100
1039
  },
1040
  {
1041
  "epoch": 0.528,
1042
- "grad_norm": 0.9439749717712402,
1043
  "learning_rate": 1.1590061032846182e-06,
1044
- "loss": 0.8509,
1045
  "step": 13200
1046
  },
1047
  {
1048
  "epoch": 0.532,
1049
- "grad_norm": 0.8930461406707764,
1050
  "learning_rate": 1.1431893245277262e-06,
1051
- "loss": 0.8384,
1052
  "step": 13300
1053
  },
1054
  {
1055
  "epoch": 0.536,
1056
- "grad_norm": 1.0605283975601196,
1057
  "learning_rate": 1.1273897548712726e-06,
1058
- "loss": 0.8557,
1059
  "step": 13400
1060
  },
1061
  {
1062
  "epoch": 0.54,
1063
- "grad_norm": 0.8892098069190979,
1064
  "learning_rate": 1.11160993990707e-06,
1065
- "loss": 0.8378,
1066
  "step": 13500
1067
  },
1068
  {
1069
  "epoch": 0.544,
1070
- "grad_norm": 0.9008782505989075,
1071
  "learning_rate": 1.0958524220440999e-06,
1072
- "loss": 0.8437,
1073
  "step": 13600
1074
  },
1075
  {
1076
  "epoch": 0.548,
1077
- "grad_norm": 0.8771668672561646,
1078
  "learning_rate": 1.0801197400988838e-06,
1079
- "loss": 0.8512,
1080
  "step": 13700
1081
  },
1082
  {
1083
  "epoch": 0.552,
1084
- "grad_norm": 0.9245998859405518,
1085
  "learning_rate": 1.0644144288864352e-06,
1086
- "loss": 0.8671,
1087
  "step": 13800
1088
  },
1089
  {
1090
  "epoch": 0.556,
1091
- "grad_norm": 0.9122968912124634,
1092
  "learning_rate": 1.048739018811855e-06,
1093
- "loss": 0.8328,
1094
  "step": 13900
1095
  },
1096
  {
1097
  "epoch": 0.56,
1098
- "grad_norm": 0.9968782067298889,
1099
  "learning_rate": 1.0330960354626384e-06,
1100
- "loss": 0.851,
1101
  "step": 14000
1102
  },
1103
  {
1104
  "epoch": 0.56,
1105
- "eval_loss": 1.5260618925094604,
1106
- "eval_runtime": 101.9042,
1107
- "eval_samples_per_second": 134.391,
1108
- "eval_steps_per_second": 2.1,
1109
  "step": 14000
1110
  },
1111
  {
1112
  "epoch": 0.564,
1113
- "grad_norm": 1.0338596105575562,
1114
  "learning_rate": 1.0174879992017586e-06,
1115
- "loss": 0.8374,
1116
  "step": 14100
1117
  },
1118
  {
1119
  "epoch": 0.568,
1120
- "grad_norm": 0.9291728734970093,
1121
  "learning_rate": 1.0019174247615919e-06,
1122
- "loss": 0.8356,
1123
  "step": 14200
1124
  },
1125
  {
1126
  "epoch": 0.572,
1127
- "grad_norm": 0.8955647945404053,
1128
  "learning_rate": 9.863868208387473e-07,
1129
- "loss": 0.839,
1130
  "step": 14300
1131
  },
1132
  {
1133
  "epoch": 0.576,
1134
- "grad_norm": 0.9726178050041199,
1135
  "learning_rate": 9.708986896898727e-07,
1136
- "loss": 0.8396,
1137
  "step": 14400
1138
  },
1139
  {
1140
  "epoch": 0.58,
1141
- "grad_norm": 0.9720205068588257,
1142
  "learning_rate": 9.554555267284956e-07,
1143
- "loss": 0.8334,
1144
  "step": 14500
1145
  },
1146
  {
1147
  "epoch": 0.584,
1148
- "grad_norm": 0.9503899216651917,
1149
  "learning_rate": 9.400598201229705e-07,
1150
- "loss": 0.8165,
1151
  "step": 14600
1152
  },
1153
  {
1154
  "epoch": 0.588,
1155
- "grad_norm": 0.8789735436439514,
1156
  "learning_rate": 9.247140503955863e-07,
1157
- "loss": 0.8262,
1158
  "step": 14700
1159
  },
1160
  {
1161
  "epoch": 0.592,
1162
- "grad_norm": 1.4387589693069458,
1163
  "learning_rate": 9.09420690022913e-07,
1164
- "loss": 0.8378,
1165
  "step": 14800
1166
  },
1167
  {
1168
  "epoch": 0.596,
1169
- "grad_norm": 1.1762765645980835,
1170
  "learning_rate": 8.941822030374405e-07,
1171
- "loss": 0.8428,
1172
  "step": 14900
1173
  },
1174
  {
1175
  "epoch": 0.6,
1176
- "grad_norm": 0.880807638168335,
1177
  "learning_rate": 8.790010446305814e-07,
1178
- "loss": 0.8254,
1179
  "step": 15000
1180
  },
1181
  {
1182
  "epoch": 0.6,
1183
- "eval_loss": 1.5283499956130981,
1184
- "eval_runtime": 103.2419,
1185
- "eval_samples_per_second": 132.65,
1186
- "eval_steps_per_second": 2.073,
1187
  "step": 15000
1188
  },
1189
  {
1190
  "epoch": 0.604,
1191
- "grad_norm": 0.9635188579559326,
1192
  "learning_rate": 8.63879660757092e-07,
1193
- "loss": 0.798,
1194
  "step": 15100
1195
  },
1196
  {
1197
  "epoch": 0.608,
1198
- "grad_norm": 0.9472705721855164,
1199
  "learning_rate": 8.488204877409884e-07,
1200
- "loss": 0.8033,
1201
  "step": 15200
1202
  },
1203
  {
1204
  "epoch": 0.612,
1205
- "grad_norm": 0.8378113508224487,
1206
  "learning_rate": 8.338259518830106e-07,
1207
- "loss": 0.8012,
1208
  "step": 15300
1209
  },
1210
  {
1211
  "epoch": 0.616,
1212
- "grad_norm": 0.9451029300689697,
1213
  "learning_rate": 8.18898469069703e-07,
1214
- "loss": 0.8047,
1215
  "step": 15400
1216
  },
1217
  {
1218
  "epoch": 0.62,
1219
- "grad_norm": 0.9609344005584717,
1220
  "learning_rate": 8.040404443841701e-07,
1221
- "loss": 0.7927,
1222
  "step": 15500
1223
  },
1224
  {
1225
  "epoch": 0.624,
1226
- "grad_norm": 0.8947242498397827,
1227
  "learning_rate": 7.892542717185766e-07,
1228
- "loss": 0.7885,
1229
  "step": 15600
1230
  },
1231
  {
1232
  "epoch": 0.628,
1233
- "grad_norm": 0.9105751514434814,
1234
  "learning_rate": 7.745423333884483e-07,
1235
- "loss": 0.801,
1236
  "step": 15700
1237
  },
1238
  {
1239
  "epoch": 0.632,
1240
- "grad_norm": 0.899936854839325,
1241
  "learning_rate": 7.599069997488386e-07,
1242
- "loss": 0.8005,
1243
  "step": 15800
1244
  },
1245
  {
1246
  "epoch": 0.636,
1247
- "grad_norm": 1.0273375511169434,
1248
  "learning_rate": 7.453506288124224e-07,
1249
- "loss": 0.8015,
1250
  "step": 15900
1251
  },
1252
  {
1253
  "epoch": 0.64,
1254
- "grad_norm": 0.8960332274436951,
1255
  "learning_rate": 7.308755658695775e-07,
1256
- "loss": 0.8074,
1257
  "step": 16000
1258
  },
1259
  {
1260
  "epoch": 0.64,
1261
- "eval_loss": 1.5343570709228516,
1262
- "eval_runtime": 102.3372,
1263
- "eval_samples_per_second": 133.822,
1264
- "eval_steps_per_second": 2.091,
1265
  "step": 16000
1266
  },
1267
  {
1268
  "epoch": 0.644,
1269
- "grad_norm": 0.8942509293556213,
1270
  "learning_rate": 7.164841431105172e-07,
1271
- "loss": 0.796,
1272
  "step": 16100
1273
  },
1274
  {
1275
  "epoch": 0.648,
1276
- "grad_norm": 0.9353269934654236,
1277
  "learning_rate": 7.021786792495325e-07,
1278
- "loss": 0.8196,
1279
  "step": 16200
1280
  },
1281
  {
1282
  "epoch": 0.652,
1283
- "grad_norm": 0.985683262348175,
1284
  "learning_rate": 6.879614791514075e-07,
1285
- "loss": 0.808,
1286
  "step": 16300
1287
  },
1288
  {
1289
  "epoch": 0.656,
1290
- "grad_norm": 0.8981220722198486,
1291
  "learning_rate": 6.738348334600634e-07,
1292
- "loss": 0.8015,
1293
  "step": 16400
1294
  },
1295
  {
1296
  "epoch": 0.66,
1297
- "grad_norm": 0.9412031173706055,
1298
  "learning_rate": 6.598010182294938e-07,
1299
- "loss": 0.8009,
1300
  "step": 16500
1301
  },
1302
  {
1303
  "epoch": 0.664,
1304
- "grad_norm": 0.8926331996917725,
1305
  "learning_rate": 6.458622945570538e-07,
1306
- "loss": 0.783,
1307
  "step": 16600
1308
  },
1309
  {
1310
  "epoch": 0.668,
1311
- "grad_norm": 0.8715830445289612,
1312
  "learning_rate": 6.320209082191569e-07,
1313
- "loss": 0.8127,
1314
  "step": 16700
1315
  },
1316
  {
1317
  "epoch": 0.672,
1318
- "grad_norm": 0.8215272426605225,
1319
  "learning_rate": 6.182790893094402e-07,
1320
- "loss": 0.7958,
1321
  "step": 16800
1322
  },
1323
  {
1324
  "epoch": 0.676,
1325
- "grad_norm": 0.9258244037628174,
1326
  "learning_rate": 6.046390518794556e-07,
1327
- "loss": 0.7931,
1328
  "step": 16900
1329
  },
1330
  {
1331
  "epoch": 0.68,
1332
- "grad_norm": 0.8930866122245789,
1333
  "learning_rate": 5.911029935819468e-07,
1334
- "loss": 0.7811,
1335
  "step": 17000
1336
  },
1337
  {
1338
  "epoch": 0.68,
1339
- "eval_loss": 1.5324440002441406,
1340
- "eval_runtime": 102.3251,
1341
- "eval_samples_per_second": 133.838,
1342
- "eval_steps_per_second": 2.091,
1343
  "step": 17000
1344
  },
1345
  {
1346
  "epoch": 0.684,
1347
- "grad_norm": 0.9415869116783142,
1348
  "learning_rate": 5.776730953167705e-07,
1349
- "loss": 0.8003,
1350
  "step": 17100
1351
  },
1352
  {
1353
  "epoch": 0.688,
1354
- "grad_norm": 0.892819344997406,
1355
  "learning_rate": 5.643515208795141e-07,
1356
- "loss": 0.7943,
1357
  "step": 17200
1358
  },
1359
  {
1360
  "epoch": 0.692,
1361
- "grad_norm": 0.9383297562599182,
1362
  "learning_rate": 5.511404166128647e-07,
1363
- "loss": 0.7998,
1364
  "step": 17300
1365
  },
1366
  {
1367
  "epoch": 0.696,
1368
- "grad_norm": 0.8630228638648987,
1369
  "learning_rate": 5.380419110608033e-07,
1370
- "loss": 0.7949,
1371
  "step": 17400
1372
  },
1373
  {
1374
  "epoch": 0.7,
1375
- "grad_norm": 0.9032106995582581,
1376
  "learning_rate": 5.250581146256524e-07,
1377
- "loss": 0.7928,
1378
  "step": 17500
1379
  },
1380
  {
1381
- "epoch": 0.704,
1382
- "grad_norm": 0.9039574265480042,
1383
  "learning_rate": 5.121911192280557e-07,
1384
- "loss": 0.8012,
1385
  "step": 17600
1386
  },
1387
  {
1388
- "epoch": 0.708,
1389
- "grad_norm": 0.9616802334785461,
1390
  "learning_rate": 4.994429979699302e-07,
1391
- "loss": 0.7964,
1392
  "step": 17700
1393
  },
1394
  {
1395
- "epoch": 0.712,
1396
- "grad_norm": 0.9427072405815125,
1397
  "learning_rate": 4.868158048004537e-07,
1398
- "loss": 0.805,
1399
  "step": 17800
1400
  },
1401
  {
1402
- "epoch": 0.716,
1403
- "grad_norm": 0.9399961829185486,
1404
  "learning_rate": 4.743115741851383e-07,
1405
- "loss": 0.7913,
1406
  "step": 17900
1407
  },
1408
  {
1409
- "epoch": 0.72,
1410
- "grad_norm": 0.8884769678115845,
1411
  "learning_rate": 4.6193232077804006e-07,
1412
- "loss": 0.7985,
1413
  "step": 18000
1414
  },
1415
  {
1416
- "epoch": 0.72,
1417
- "eval_loss": 1.5309633016586304,
1418
- "eval_runtime": 102.8962,
1419
- "eval_samples_per_second": 133.095,
1420
- "eval_steps_per_second": 2.08,
1421
  "step": 18000
1422
  },
1423
  {
1424
- "epoch": 0.724,
1425
- "grad_norm": 0.9725548028945923,
1426
  "learning_rate": 4.4968003909716243e-07,
1427
- "loss": 0.8162,
1428
  "step": 18100
1429
  },
1430
  {
1431
- "epoch": 0.728,
1432
- "grad_norm": 1.0075186491012573,
1433
  "learning_rate": 4.3755670320310443e-07,
1434
- "loss": 0.8054,
1435
  "step": 18200
1436
  },
1437
  {
1438
- "epoch": 0.732,
1439
- "grad_norm": 0.8749048113822937,
1440
  "learning_rate": 4.2556426638100555e-07,
1441
- "loss": 0.8056,
1442
  "step": 18300
1443
  },
1444
  {
1445
- "epoch": 0.736,
1446
- "grad_norm": 0.9941290616989136,
1447
  "learning_rate": 4.1370466082583353e-07,
1448
- "loss": 0.8052,
1449
  "step": 18400
1450
  },
1451
  {
1452
- "epoch": 0.74,
1453
- "grad_norm": 0.8676705956459045,
1454
  "learning_rate": 4.0197979733107755e-07,
1455
- "loss": 0.7861,
1456
  "step": 18500
1457
  },
1458
  {
1459
- "epoch": 0.744,
1460
- "grad_norm": 0.9036993980407715,
1461
  "learning_rate": 3.903915649808812e-07,
1462
- "loss": 0.8081,
1463
  "step": 18600
1464
  },
1465
  {
1466
- "epoch": 0.748,
1467
- "grad_norm": 0.9067134261131287,
1468
  "learning_rate": 3.789418308456812e-07,
1469
- "loss": 0.7956,
1470
  "step": 18700
1471
  },
1472
  {
1473
- "epoch": 0.752,
1474
- "grad_norm": 0.8146563768386841,
1475
  "learning_rate": 3.676324396813856e-07,
1476
- "loss": 0.8031,
1477
  "step": 18800
1478
  },
1479
  {
1480
- "epoch": 0.756,
1481
- "grad_norm": 0.9973321557044983,
1482
  "learning_rate": 3.5646521363215447e-07,
1483
- "loss": 0.794,
1484
  "step": 18900
1485
  },
1486
  {
1487
- "epoch": 0.76,
1488
- "grad_norm": 0.9761902689933777,
1489
  "learning_rate": 3.4544195193681615e-07,
1490
- "loss": 0.7816,
1491
  "step": 19000
1492
  },
1493
  {
1494
- "epoch": 0.76,
1495
- "eval_loss": 1.5294893980026245,
1496
- "eval_runtime": 102.4113,
1497
- "eval_samples_per_second": 133.726,
1498
- "eval_steps_per_second": 2.09,
1499
  "step": 19000
1500
  },
1501
  {
1502
- "epoch": 0.764,
1503
- "grad_norm": 0.8643273115158081,
1504
  "learning_rate": 3.3456443063898157e-07,
1505
- "loss": 0.7917,
1506
  "step": 19100
1507
  },
1508
  {
1509
- "epoch": 0.768,
1510
- "grad_norm": 0.9306071400642395,
1511
  "learning_rate": 3.238344023008888e-07,
1512
- "loss": 0.8012,
1513
  "step": 19200
1514
  },
1515
  {
1516
- "epoch": 0.772,
1517
- "grad_norm": 0.9324482083320618,
1518
  "learning_rate": 3.132535957210366e-07,
1519
- "loss": 0.7929,
1520
  "step": 19300
1521
  },
1522
  {
1523
- "epoch": 0.776,
1524
- "grad_norm": 0.8625467419624329,
1525
  "learning_rate": 3.0282371565564324e-07,
1526
- "loss": 0.7815,
1527
  "step": 19400
1528
  },
1529
  {
1530
- "epoch": 0.78,
1531
- "grad_norm": 0.8669098019599915,
1532
  "learning_rate": 2.925464425439789e-07,
1533
- "loss": 0.8214,
1534
  "step": 19500
1535
  },
1536
  {
1537
- "epoch": 0.784,
1538
- "grad_norm": 0.8781657814979553,
1539
  "learning_rate": 2.824234322376185e-07,
1540
- "loss": 0.7941,
1541
  "step": 19600
1542
  },
1543
  {
1544
- "epoch": 0.788,
1545
- "grad_norm": 0.8899013996124268,
1546
  "learning_rate": 2.724563157336542e-07,
1547
- "loss": 0.7966,
1548
  "step": 19700
1549
  },
1550
  {
1551
- "epoch": 0.792,
1552
- "grad_norm": 0.9773925542831421,
1553
  "learning_rate": 2.626466989119131e-07,
1554
- "loss": 0.8009,
1555
  "step": 19800
1556
  },
1557
  {
1558
- "epoch": 0.796,
1559
- "grad_norm": 0.912438690662384,
1560
  "learning_rate": 2.5299616227621946e-07,
1561
- "loss": 0.7902,
1562
  "step": 19900
1563
  },
1564
  {
1565
- "epoch": 0.8,
1566
- "grad_norm": 0.9557161927223206,
1567
  "learning_rate": 2.435062606997499e-07,
1568
- "loss": 0.7889,
1569
  "step": 20000
1570
  },
1571
  {
1572
- "epoch": 0.8,
1573
- "eval_loss": 1.5292094945907593,
1574
- "eval_runtime": 102.5763,
1575
- "eval_samples_per_second": 133.51,
1576
- "eval_steps_per_second": 2.086,
1577
  "step": 20000
1578
  },
1579
  {
1580
- "epoch": 0.804,
1581
- "grad_norm": 0.8561129570007324,
1582
  "learning_rate": 2.3417852317451418e-07,
1583
- "loss": 0.8033,
1584
  "step": 20100
1585
  },
1586
  {
1587
- "epoch": 0.808,
1588
- "grad_norm": 0.9422599673271179,
1589
  "learning_rate": 2.250144525650086e-07,
1590
- "loss": 0.7985,
1591
  "step": 20200
1592
  },
1593
  {
1594
- "epoch": 0.812,
1595
- "grad_norm": 0.8980026245117188,
1596
  "learning_rate": 2.160155253660759e-07,
1597
- "loss": 0.7951,
1598
  "step": 20300
1599
  },
1600
  {
1601
- "epoch": 0.816,
1602
- "grad_norm": 0.8675551414489746,
1603
  "learning_rate": 2.071831914650173e-07,
1604
- "loss": 0.7994,
1605
  "step": 20400
1606
  },
1607
  {
1608
- "epoch": 0.82,
1609
- "grad_norm": 0.8988806009292603,
1610
  "learning_rate": 1.9851887390798922e-07,
1611
- "loss": 0.7875,
1612
  "step": 20500
1613
  },
1614
  {
1615
- "epoch": 0.824,
1616
- "grad_norm": 0.9102202653884888,
1617
  "learning_rate": 1.9002396867072587e-07,
1618
- "loss": 0.7993,
1619
  "step": 20600
1620
  },
1621
  {
1622
- "epoch": 0.828,
1623
- "grad_norm": 0.9096868634223938,
1624
  "learning_rate": 1.816998444336214e-07,
1625
- "loss": 0.7704,
1626
  "step": 20700
1627
  },
1628
  {
1629
- "epoch": 0.832,
1630
- "grad_norm": 0.9461880922317505,
1631
  "learning_rate": 1.7354784236121206e-07,
1632
- "loss": 0.7853,
1633
  "step": 20800
1634
  },
1635
  {
1636
- "epoch": 0.836,
1637
- "grad_norm": 0.9219881296157837,
1638
  "learning_rate": 1.6556927588609078e-07,
1639
- "loss": 0.7857,
1640
  "step": 20900
1641
  },
1642
  {
1643
- "epoch": 0.84,
1644
- "grad_norm": 0.8964762687683105,
1645
  "learning_rate": 1.577654304972899e-07,
1646
- "loss": 0.7872,
1647
  "step": 21000
1648
  },
1649
  {
1650
- "epoch": 0.84,
1651
- "eval_loss": 1.524131178855896,
1652
- "eval_runtime": 102.4749,
1653
- "eval_samples_per_second": 133.642,
1654
- "eval_steps_per_second": 2.088,
1655
  "step": 21000
1656
  },
1657
  {
1658
- "epoch": 0.844,
1659
- "grad_norm": 0.9355736970901489,
1660
  "learning_rate": 1.501375635331652e-07,
1661
- "loss": 0.7957,
1662
  "step": 21100
1663
  },
1664
  {
1665
- "epoch": 0.848,
1666
- "grad_norm": 0.8686819076538086,
1667
  "learning_rate": 1.4268690397881675e-07,
1668
- "loss": 0.793,
1669
  "step": 21200
1670
  },
1671
  {
1672
- "epoch": 0.852,
1673
- "grad_norm": 0.874756395816803,
1674
  "learning_rate": 1.3541465226807813e-07,
1675
- "loss": 0.7878,
1676
  "step": 21300
1677
  },
1678
  {
1679
- "epoch": 0.856,
1680
- "grad_norm": 0.9285154342651367,
1681
  "learning_rate": 1.283219800901045e-07,
1682
- "loss": 0.7547,
1683
  "step": 21400
1684
  },
1685
  {
1686
- "epoch": 0.86,
1687
- "grad_norm": 0.9496791958808899,
1688
  "learning_rate": 1.2141003020059273e-07,
1689
- "loss": 0.7885,
1690
  "step": 21500
1691
  },
1692
  {
1693
- "epoch": 0.864,
1694
- "grad_norm": 0.879410445690155,
1695
  "learning_rate": 1.1467991623766287e-07,
1696
- "loss": 0.8123,
1697
  "step": 21600
1698
  },
1699
  {
1700
- "epoch": 0.868,
1701
- "grad_norm": 0.942361056804657,
1702
  "learning_rate": 1.081327225424321e-07,
1703
- "loss": 0.817,
1704
  "step": 21700
1705
  },
1706
  {
1707
- "epoch": 0.872,
1708
- "grad_norm": 0.9548047184944153,
1709
  "learning_rate": 1.0176950398430752e-07,
1710
- "loss": 0.7925,
1711
  "step": 21800
1712
  },
1713
  {
1714
- "epoch": 0.876,
1715
- "grad_norm": 0.8643764853477478,
1716
  "learning_rate": 9.559128579102767e-08,
1717
- "loss": 0.7985,
1718
  "step": 21900
1719
  },
1720
  {
1721
- "epoch": 0.88,
1722
- "grad_norm": 0.9450801014900208,
1723
  "learning_rate": 8.959906338348007e-08,
1724
- "loss": 0.7975,
1725
  "step": 22000
1726
  },
1727
  {
1728
- "epoch": 0.88,
1729
- "eval_loss": 1.5321519374847412,
1730
- "eval_runtime": 103.5374,
1731
- "eval_samples_per_second": 132.271,
1732
- "eval_steps_per_second": 2.067,
1733
  "step": 22000
1734
  },
1735
  {
1736
- "epoch": 0.884,
1737
- "grad_norm": 0.9130359292030334,
1738
  "learning_rate": 8.37938022153223e-08,
1739
- "loss": 0.8005,
1740
  "step": 22100
1741
  },
1742
  {
1743
- "epoch": 0.888,
1744
- "grad_norm": 0.8732690215110779,
1745
  "learning_rate": 7.817643761742891e-08,
1746
- "loss": 0.7857,
1747
  "step": 22200
1748
  },
1749
  {
1750
- "epoch": 0.892,
1751
- "grad_norm": 0.9094323515892029,
1752
  "learning_rate": 7.274787464719338e-08,
1753
- "loss": 0.8096,
1754
  "step": 22300
1755
  },
1756
  {
1757
- "epoch": 0.896,
1758
- "grad_norm": 0.8987523913383484,
1759
  "learning_rate": 6.75089879427078e-08,
1760
- "loss": 0.8072,
1761
  "step": 22400
1762
  },
1763
  {
1764
- "epoch": 0.9,
1765
- "grad_norm": 0.9105306267738342,
1766
  "learning_rate": 6.246062158184241e-08,
1767
- "loss": 0.7968,
1768
  "step": 22500
1769
  },
1770
  {
1771
- "epoch": 0.904,
1772
- "grad_norm": 0.8889061808586121,
1773
  "learning_rate": 5.7603588946250064e-08,
1774
- "loss": 0.7971,
1775
  "step": 22600
1776
  },
1777
  {
1778
- "epoch": 0.908,
1779
- "grad_norm": 0.9296440482139587,
1780
  "learning_rate": 5.293867259031568e-08,
1781
- "loss": 0.7896,
1782
  "step": 22700
1783
  },
1784
  {
1785
- "epoch": 0.912,
1786
- "grad_norm": 1.0374181270599365,
1787
  "learning_rate": 4.8466624115073164e-08,
1788
- "loss": 0.808,
1789
  "step": 22800
1790
  },
1791
  {
1792
- "epoch": 1.00384,
1793
- "grad_norm": 0.8791893124580383,
1794
  "learning_rate": 4.4188164047108403e-08,
1795
- "loss": 0.7835,
1796
  "step": 22900
1797
  },
1798
  {
1799
- "epoch": 1.00784,
1800
- "grad_norm": 0.8789498209953308,
1801
  "learning_rate": 4.010398172247104e-08,
1802
- "loss": 0.7987,
1803
  "step": 23000
1804
  },
1805
  {
1806
- "epoch": 1.00784,
1807
- "eval_loss": 1.5310029983520508,
1808
- "eval_runtime": 101.8479,
1809
- "eval_samples_per_second": 134.465,
1810
- "eval_steps_per_second": 2.101,
1811
  "step": 23000
1812
  },
1813
  {
1814
- "epoch": 1.01184,
1815
- "grad_norm": 0.9262071847915649,
1816
  "learning_rate": 3.6214735175608004e-08,
1817
- "loss": 0.7966,
1818
  "step": 23100
1819
  },
1820
  {
1821
- "epoch": 1.01584,
1822
- "grad_norm": 0.8986383676528931,
1823
  "learning_rate": 3.252105103334499e-08,
1824
- "loss": 0.7954,
1825
  "step": 23200
1826
  },
1827
  {
1828
- "epoch": 1.01984,
1829
- "grad_norm": 0.9548205733299255,
1830
  "learning_rate": 2.9023524413923365e-08,
1831
- "loss": 0.7934,
1832
  "step": 23300
1833
  },
1834
  {
1835
- "epoch": 1.02384,
1836
- "grad_norm": 0.9211428165435791,
1837
  "learning_rate": 2.5722718831117656e-08,
1838
- "loss": 0.8068,
1839
  "step": 23400
1840
  },
1841
  {
1842
- "epoch": 1.02784,
1843
- "grad_norm": 0.9240931272506714,
1844
  "learning_rate": 2.26191661034425e-08,
1845
- "loss": 0.787,
1846
  "step": 23500
1847
  },
1848
  {
1849
- "epoch": 1.03184,
1850
- "grad_norm": 0.9866804480552673,
1851
  "learning_rate": 1.9713366268468148e-08,
1852
- "loss": 0.7929,
1853
  "step": 23600
1854
  },
1855
  {
1856
- "epoch": 1.03584,
1857
- "grad_norm": 0.9947385787963867,
1858
  "learning_rate": 1.700578750225432e-08,
1859
- "loss": 0.7973,
1860
  "step": 23700
1861
  },
1862
  {
1863
- "epoch": 1.03984,
1864
- "grad_norm": 0.8872534036636353,
1865
  "learning_rate": 1.4496866043919865e-08,
1866
- "loss": 0.7995,
1867
  "step": 23800
1868
  },
1869
  {
1870
- "epoch": 1.04384,
1871
- "grad_norm": 0.8726480007171631,
1872
  "learning_rate": 1.2187006125356087e-08,
1873
- "loss": 0.7929,
1874
  "step": 23900
1875
  },
1876
  {
1877
- "epoch": 1.04784,
1878
- "grad_norm": 0.881963849067688,
1879
  "learning_rate": 1.0076579906098255e-08,
1880
- "loss": 0.8044,
1881
  "step": 24000
1882
  },
1883
  {
1884
- "epoch": 1.04784,
1885
- "eval_loss": 1.5276943445205688,
1886
- "eval_runtime": 99.4561,
1887
- "eval_samples_per_second": 137.699,
1888
- "eval_steps_per_second": 2.152,
1889
  "step": 24000
1890
  },
1891
  {
1892
- "epoch": 1.0518399999999999,
1893
- "grad_norm": 0.8809722065925598,
1894
  "learning_rate": 8.16592741336386e-09,
1895
- "loss": 0.7832,
1896
  "step": 24100
1897
  },
1898
  {
1899
- "epoch": 1.05584,
1900
- "grad_norm": 0.8471363186836243,
1901
  "learning_rate": 6.455356487267833e-09,
1902
- "loss": 0.7815,
1903
  "step": 24200
1904
  },
1905
  {
1906
- "epoch": 1.05984,
1907
- "grad_norm": 0.9595879912376404,
1908
  "learning_rate": 4.9451427312251224e-09,
1909
- "loss": 0.7943,
1910
  "step": 24300
1911
  },
1912
  {
1913
- "epoch": 1.06384,
1914
- "grad_norm": 0.8937146663665771,
1915
  "learning_rate": 3.635529467544696e-09,
1916
- "loss": 0.8066,
1917
  "step": 24400
1918
  },
1919
  {
1920
- "epoch": 1.06784,
1921
- "grad_norm": 0.9749945998191833,
1922
  "learning_rate": 2.526727698227288e-09,
1923
- "loss": 0.802,
1924
  "step": 24500
1925
  },
1926
  {
1927
- "epoch": 1.07184,
1928
- "grad_norm": 0.919170081615448,
1929
  "learning_rate": 1.6189160709680074e-09,
1930
- "loss": 0.79,
1931
  "step": 24600
1932
  },
1933
  {
1934
- "epoch": 1.07584,
1935
- "grad_norm": 0.9579231142997742,
1936
  "learning_rate": 9.122408503739466e-10,
1937
- "loss": 0.8092,
1938
  "step": 24700
1939
  },
1940
  {
1941
- "epoch": 1.07984,
1942
- "grad_norm": 0.8257275223731995,
1943
  "learning_rate": 4.0681589439789395e-10,
1944
- "loss": 0.8028,
1945
  "step": 24800
1946
  },
1947
  {
1948
- "epoch": 1.08384,
1949
- "grad_norm": 0.8641030788421631,
1950
  "learning_rate": 1.0272263599411803e-10,
1951
- "loss": 0.7852,
1952
  "step": 24900
1953
  },
1954
  {
1955
- "epoch": 1.08784,
1956
- "grad_norm": 0.929093062877655,
1957
  "learning_rate": 1.006999733599301e-14,
1958
- "loss": 0.7867,
1959
  "step": 25000
1960
  },
1961
  {
1962
- "epoch": 1.08784,
1963
- "eval_loss": 1.5288399457931519,
1964
- "eval_runtime": 99.9402,
1965
- "eval_samples_per_second": 137.032,
1966
- "eval_steps_per_second": 2.141,
1967
  "step": 25000
1968
  }
1969
  ],
@@ -1984,7 +1984,7 @@
1984
  "attributes": {}
1985
  }
1986
  },
1987
- "total_flos": 3.3846277778817024e+18,
1988
  "train_batch_size": 64,
1989
  "trial_name": null,
1990
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.29912,
6
  "eval_steps": 1000,
7
  "global_step": 25000,
8
  "is_hyper_param_search": false,
 
11
  "log_history": [
12
  {
13
  "epoch": 4e-05,
14
+ "grad_norm": 0.8918513655662537,
15
  "learning_rate": 0.0,
16
+ "loss": 0.7598,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.004,
21
+ "grad_norm": 0.884207010269165,
22
  "learning_rate": 9.9e-07,
23
+ "loss": 0.7961,
24
  "step": 100
25
  },
26
  {
27
  "epoch": 0.008,
28
+ "grad_norm": 1.007287621498108,
29
  "learning_rate": 1.9900000000000004e-06,
30
+ "loss": 0.7856,
31
  "step": 200
32
  },
33
  {
34
  "epoch": 0.012,
35
+ "grad_norm": 0.8807647228240967,
36
  "learning_rate": 2.4999758220143106e-06,
37
+ "loss": 0.7875,
38
  "step": 300
39
  },
40
  {
41
  "epoch": 0.016,
42
+ "grad_norm": 0.9170143008232117,
43
  "learning_rate": 2.4997764426529066e-06,
44
+ "loss": 0.7709,
45
  "step": 400
46
  },
47
  {
48
  "epoch": 0.02,
49
+ "grad_norm": 0.9494316577911377,
50
  "learning_rate": 2.499375702067717e-06,
51
+ "loss": 0.8038,
52
  "step": 500
53
  },
54
  {
55
  "epoch": 0.024,
56
+ "grad_norm": 0.8737604022026062,
57
  "learning_rate": 2.4987736648251815e-06,
58
+ "loss": 0.7807,
59
  "step": 600
60
  },
61
  {
62
  "epoch": 0.028,
63
+ "grad_norm": 0.9298632740974426,
64
  "learning_rate": 2.497970427924213e-06,
65
+ "loss": 0.7916,
66
  "step": 700
67
  },
68
  {
69
  "epoch": 0.032,
70
+ "grad_norm": 0.8767964243888855,
71
  "learning_rate": 2.496966120780569e-06,
72
+ "loss": 0.8052,
73
  "step": 800
74
  },
75
  {
76
  "epoch": 0.036,
77
+ "grad_norm": 0.9334876537322998,
78
  "learning_rate": 2.4957609052060012e-06,
79
+ "loss": 0.7916,
80
  "step": 900
81
  },
82
  {
83
  "epoch": 0.04,
84
+ "grad_norm": 0.8898613452911377,
85
  "learning_rate": 2.4943549753821847e-06,
86
+ "loss": 0.7895,
87
  "step": 1000
88
  },
89
  {
90
  "epoch": 0.04,
91
+ "eval_loss": 1.534223198890686,
92
+ "eval_runtime": 104.6575,
93
+ "eval_samples_per_second": 130.855,
94
+ "eval_steps_per_second": 2.045,
95
  "step": 1000
96
  },
97
  {
98
  "epoch": 0.044,
99
+ "grad_norm": 0.9141249656677246,
100
  "learning_rate": 2.4927485578294313e-06,
101
+ "loss": 0.7806,
102
  "step": 1100
103
  },
104
  {
105
  "epoch": 0.048,
106
+ "grad_norm": 0.891128659248352,
107
  "learning_rate": 2.4909419113701947e-06,
108
+ "loss": 0.794,
109
  "step": 1200
110
  },
111
  {
112
  "epoch": 0.052,
113
+ "grad_norm": 0.8805925846099854,
114
  "learning_rate": 2.4889353270873663e-06,
115
+ "loss": 0.7984,
116
  "step": 1300
117
  },
118
  {
119
  "epoch": 0.056,
120
+ "grad_norm": 0.9282805919647217,
121
  "learning_rate": 2.4867291282773805e-06,
122
+ "loss": 0.8041,
123
  "step": 1400
124
  },
125
  {
126
  "epoch": 0.06,
127
+ "grad_norm": 0.9669321179389954,
128
  "learning_rate": 2.4843236703981235e-06,
129
+ "loss": 0.7924,
130
  "step": 1500
131
  },
132
  {
133
  "epoch": 0.064,
134
+ "grad_norm": 0.891028106212616,
135
  "learning_rate": 2.481719341011662e-06,
136
+ "loss": 0.789,
137
  "step": 1600
138
  },
139
  {
140
  "epoch": 0.068,
141
+ "grad_norm": 0.8550590872764587,
142
  "learning_rate": 2.4789165597218035e-06,
143
+ "loss": 0.784,
144
  "step": 1700
145
  },
146
  {
147
  "epoch": 0.072,
148
+ "grad_norm": 0.8960260152816772,
149
  "learning_rate": 2.475915778106486e-06,
150
+ "loss": 0.7901,
151
  "step": 1800
152
  },
153
  {
154
  "epoch": 0.076,
155
+ "grad_norm": 0.940194845199585,
156
  "learning_rate": 2.4727174796450266e-06,
157
+ "loss": 0.7909,
158
  "step": 1900
159
  },
160
  {
161
  "epoch": 0.08,
162
+ "grad_norm": 0.9140194058418274,
163
  "learning_rate": 2.4693221796402166e-06,
164
+ "loss": 0.793,
165
  "step": 2000
166
  },
167
  {
168
  "epoch": 0.08,
169
+ "eval_loss": 1.5293817520141602,
170
+ "eval_runtime": 97.8828,
171
+ "eval_samples_per_second": 139.912,
172
+ "eval_steps_per_second": 2.186,
173
  "step": 2000
174
  },
175
  {
176
  "epoch": 0.084,
177
+ "grad_norm": 0.8168792128562927,
178
  "learning_rate": 2.4657304251353047e-06,
179
+ "loss": 0.8011,
180
  "step": 2100
181
  },
182
  {
183
  "epoch": 0.088,
184
+ "grad_norm": 0.9001737833023071,
185
  "learning_rate": 2.4619427948258547e-06,
186
+ "loss": 0.7997,
187
  "step": 2200
188
  },
189
  {
190
  "epoch": 0.092,
191
+ "grad_norm": 0.9699570536613464,
192
  "learning_rate": 2.4579598989665065e-06,
193
+ "loss": 0.797,
194
  "step": 2300
195
  },
196
  {
197
  "epoch": 0.096,
198
+ "grad_norm": 0.9276746511459351,
199
  "learning_rate": 2.453782379272657e-06,
200
+ "loss": 0.78,
201
  "step": 2400
202
  },
203
  {
204
  "epoch": 0.1,
205
+ "grad_norm": 0.8759055733680725,
206
  "learning_rate": 2.449410908817064e-06,
207
+ "loss": 0.7799,
208
  "step": 2500
209
  },
210
  {
211
  "epoch": 0.104,
212
+ "grad_norm": 0.860933244228363,
213
  "learning_rate": 2.444846191921406e-06,
214
+ "loss": 0.7884,
215
  "step": 2600
216
  },
217
  {
218
  "epoch": 0.108,
219
+ "grad_norm": 0.9049354195594788,
220
  "learning_rate": 2.4400889640427992e-06,
221
+ "loss": 0.7802,
222
  "step": 2700
223
  },
224
  {
225
  "epoch": 0.112,
226
+ "grad_norm": 0.9561330080032349,
227
  "learning_rate": 2.435139991655308e-06,
228
+ "loss": 0.7677,
229
  "step": 2800
230
  },
231
  {
232
  "epoch": 0.116,
233
+ "grad_norm": 0.9253368377685547,
234
  "learning_rate": 2.4300000721264466e-06,
235
+ "loss": 0.7845,
236
  "step": 2900
237
  },
238
  {
239
  "epoch": 0.12,
240
+ "grad_norm": 0.9350365400314331,
241
  "learning_rate": 2.4246700335887123e-06,
242
+ "loss": 0.783,
243
  "step": 3000
244
  },
245
  {
246
  "epoch": 0.12,
247
+ "eval_loss": 1.5278704166412354,
248
+ "eval_runtime": 98.3389,
249
+ "eval_samples_per_second": 139.263,
250
+ "eval_steps_per_second": 2.176,
251
  "step": 3000
252
  },
253
  {
254
  "epoch": 0.124,
255
+ "grad_norm": 0.8862438797950745,
256
  "learning_rate": 2.4191507348061575e-06,
257
+ "loss": 0.7816,
258
  "step": 3100
259
  },
260
  {
261
  "epoch": 0.128,
262
+ "grad_norm": 0.8396490812301636,
263
  "learning_rate": 2.4134430650360284e-06,
264
+ "loss": 0.783,
265
  "step": 3200
266
  },
267
  {
268
  "epoch": 0.132,
269
+ "grad_norm": 0.9647367596626282,
270
  "learning_rate": 2.407547943885489e-06,
271
+ "loss": 0.7829,
272
  "step": 3300
273
  },
274
  {
275
  "epoch": 0.136,
276
+ "grad_norm": 0.9121189713478088,
277
  "learning_rate": 2.4014663211634552e-06,
278
+ "loss": 0.8046,
279
  "step": 3400
280
  },
281
  {
282
  "epoch": 0.14,
283
+ "grad_norm": 0.9058841466903687,
284
  "learning_rate": 2.395199176727567e-06,
285
+ "loss": 0.7856,
286
  "step": 3500
287
  },
288
  {
289
  "epoch": 0.144,
290
+ "grad_norm": 0.9816420078277588,
291
  "learning_rate": 2.388747520326311e-06,
292
+ "loss": 0.7993,
293
  "step": 3600
294
  },
295
  {
296
  "epoch": 0.148,
297
+ "grad_norm": 0.9734699726104736,
298
  "learning_rate": 2.3821123914363374e-06,
299
+ "loss": 0.8116,
300
  "step": 3700
301
  },
302
  {
303
  "epoch": 0.152,
304
+ "grad_norm": 1.0234724283218384,
305
  "learning_rate": 2.3752948590949766e-06,
306
+ "loss": 0.8049,
307
  "step": 3800
308
  },
309
  {
310
  "epoch": 0.156,
311
+ "grad_norm": 0.9250158667564392,
312
  "learning_rate": 2.368296021728002e-06,
313
+ "loss": 0.8132,
314
  "step": 3900
315
  },
316
  {
317
  "epoch": 0.16,
318
+ "grad_norm": 0.8814631104469299,
319
  "learning_rate": 2.3611170069726532e-06,
320
+ "loss": 0.8051,
321
  "step": 4000
322
  },
323
  {
324
  "epoch": 0.16,
325
+ "eval_loss": 1.533823847770691,
326
+ "eval_runtime": 98.6455,
327
+ "eval_samples_per_second": 138.83,
328
+ "eval_steps_per_second": 2.169,
329
  "step": 4000
330
  },
331
  {
332
  "epoch": 0.164,
333
+ "grad_norm": 0.8858230710029602,
334
  "learning_rate": 2.3537589714959523e-06,
335
+ "loss": 0.8123,
336
  "step": 4100
337
  },
338
  {
339
  "epoch": 0.168,
340
+ "grad_norm": 0.9251588582992554,
341
  "learning_rate": 2.346223100808346e-06,
342
+ "loss": 0.7421,
343
  "step": 4200
344
  },
345
  {
346
  "epoch": 0.172,
347
+ "grad_norm": 0.9473935961723328,
348
  "learning_rate": 2.3385106090726974e-06,
349
+ "loss": 0.8236,
350
  "step": 4300
351
  },
352
  {
353
  "epoch": 0.176,
354
+ "grad_norm": 0.9739401936531067,
355
  "learning_rate": 2.330622738908663e-06,
356
+ "loss": 0.817,
357
  "step": 4400
358
  },
359
  {
360
  "epoch": 0.18,
361
+ "grad_norm": 0.8801769614219666,
362
  "learning_rate": 2.322560761192485e-06,
363
+ "loss": 0.8118,
364
  "step": 4500
365
  },
366
  {
367
  "epoch": 0.184,
368
+ "grad_norm": 0.9835686087608337,
369
  "learning_rate": 2.3143259748522308e-06,
370
+ "loss": 0.8223,
371
  "step": 4600
372
  },
373
  {
374
  "epoch": 0.188,
375
+ "grad_norm": 0.9962617754936218,
376
  "learning_rate": 2.3059197066585126e-06,
377
+ "loss": 0.8198,
378
  "step": 4700
379
  },
380
  {
381
  "epoch": 0.192,
382
+ "grad_norm": 0.9080422520637512,
383
  "learning_rate": 2.297343311010719e-06,
384
+ "loss": 0.8065,
385
  "step": 4800
386
  },
387
  {
388
  "epoch": 0.196,
389
+ "grad_norm": 0.922578752040863,
390
  "learning_rate": 2.2885981697188002e-06,
391
+ "loss": 0.7944,
392
  "step": 4900
393
  },
394
  {
395
  "epoch": 0.2,
396
+ "grad_norm": 1.0266526937484741,
397
  "learning_rate": 2.2796856917806313e-06,
398
+ "loss": 0.8053,
399
  "step": 5000
400
  },
401
  {
402
  "epoch": 0.2,
403
+ "eval_loss": 1.537758231163025,
404
+ "eval_runtime": 98.9909,
405
+ "eval_samples_per_second": 138.346,
406
  "eval_steps_per_second": 2.162,
407
  "step": 5000
408
  },
409
  {
410
  "epoch": 0.204,
411
+ "grad_norm": 0.9883342981338501,
412
  "learning_rate": 2.270607313155e-06,
413
+ "loss": 0.7974,
414
  "step": 5100
415
  },
416
  {
417
  "epoch": 0.208,
418
+ "grad_norm": 0.9677822589874268,
419
  "learning_rate": 2.2613644965302456e-06,
420
+ "loss": 0.8085,
421
  "step": 5200
422
  },
423
  {
424
  "epoch": 0.212,
425
+ "grad_norm": 0.9194427132606506,
426
  "learning_rate": 2.251958731088596e-06,
427
+ "loss": 0.812,
428
  "step": 5300
429
  },
430
  {
431
  "epoch": 0.216,
432
+ "grad_norm": 0.895246684551239,
433
  "learning_rate": 2.242391532266232e-06,
434
+ "loss": 0.7911,
435
  "step": 5400
436
  },
437
  {
438
  "epoch": 0.22,
439
+ "grad_norm": 0.8815901279449463,
440
  "learning_rate": 2.2326644415091264e-06,
441
+ "loss": 0.8114,
442
  "step": 5500
443
  },
444
  {
445
  "epoch": 0.224,
446
+ "grad_norm": 0.921257495880127,
447
  "learning_rate": 2.2227790260246856e-06,
448
+ "loss": 0.8107,
449
  "step": 5600
450
  },
451
  {
452
  "epoch": 0.228,
453
+ "grad_norm": 0.9411821365356445,
454
  "learning_rate": 2.2127368785292484e-06,
455
+ "loss": 0.8051,
456
  "step": 5700
457
  },
458
  {
459
  "epoch": 0.232,
460
+ "grad_norm": 0.9523917436599731,
461
  "learning_rate": 2.2025396169914697e-06,
462
+ "loss": 0.8324,
463
  "step": 5800
464
  },
465
  {
466
  "epoch": 0.236,
467
+ "grad_norm": 0.9265521764755249,
468
  "learning_rate": 2.1921888843716356e-06,
469
+ "loss": 0.8152,
470
  "step": 5900
471
  },
472
  {
473
  "epoch": 0.24,
474
+ "grad_norm": 0.8343620300292969,
475
  "learning_rate": 2.181686348356955e-06,
476
+ "loss": 0.8146,
477
  "step": 6000
478
  },
479
  {
480
  "epoch": 0.24,
481
+ "eval_loss": 1.5309844017028809,
482
+ "eval_runtime": 99.1981,
483
+ "eval_samples_per_second": 138.057,
484
+ "eval_steps_per_second": 2.157,
485
  "step": 6000
486
  },
487
  {
488
  "epoch": 0.244,
489
+ "grad_norm": 0.9226271510124207,
490
  "learning_rate": 2.1710337010928655e-06,
491
+ "loss": 0.8101,
492
  "step": 6100
493
  },
494
  {
495
  "epoch": 0.248,
496
+ "grad_norm": 0.9523386359214783,
497
  "learning_rate": 2.1602326589103967e-06,
498
+ "loss": 0.8236,
499
  "step": 6200
500
  },
501
  {
502
  "epoch": 0.252,
503
+ "grad_norm": 0.967309296131134,
504
  "learning_rate": 2.1492849620496414e-06,
505
+ "loss": 0.8101,
506
  "step": 6300
507
  },
508
  {
509
  "epoch": 0.256,
510
+ "grad_norm": 0.9258893132209778,
511
  "learning_rate": 2.13819237437937e-06,
512
+ "loss": 0.8379,
513
  "step": 6400
514
  },
515
  {
516
  "epoch": 0.26,
517
+ "grad_norm": 0.9850086569786072,
518
  "learning_rate": 2.126956683112842e-06,
519
+ "loss": 0.8156,
520
  "step": 6500
521
  },
522
  {
523
  "epoch": 0.264,
524
+ "grad_norm": 0.9868414998054504,
525
  "learning_rate": 2.1155796985198495e-06,
526
+ "loss": 0.8565,
527
  "step": 6600
528
  },
529
  {
530
  "epoch": 0.268,
531
+ "grad_norm": 0.9697744250297546,
532
  "learning_rate": 2.1040632536350573e-06,
533
+ "loss": 0.8287,
534
  "step": 6700
535
  },
536
  {
537
  "epoch": 0.272,
538
+ "grad_norm": 0.8494826555252075,
539
  "learning_rate": 2.092409203962663e-06,
540
+ "loss": 0.8363,
541
  "step": 6800
542
  },
543
  {
544
  "epoch": 0.276,
545
+ "grad_norm": 0.9729273915290833,
546
  "learning_rate": 2.080619427177443e-06,
547
+ "loss": 0.842,
548
  "step": 6900
549
  },
550
  {
551
  "epoch": 0.28,
552
+ "grad_norm": 0.8614193201065063,
553
  "learning_rate": 2.0686958228222298e-06,
554
+ "loss": 0.8083,
555
  "step": 7000
556
  },
557
  {
558
  "epoch": 0.28,
559
+ "eval_loss": 1.5380674600601196,
560
+ "eval_runtime": 99.1938,
561
+ "eval_samples_per_second": 138.063,
562
+ "eval_steps_per_second": 2.157,
563
  "step": 7000
564
  },
565
  {
566
  "epoch": 0.284,
567
+ "grad_norm": 0.886326253414154,
568
  "learning_rate": 2.056640312001856e-06,
569
+ "loss": 0.8144,
570
  "step": 7100
571
  },
572
  {
573
  "epoch": 0.288,
574
+ "grad_norm": 0.9302154183387756,
575
  "learning_rate": 2.0444548370736335e-06,
576
+ "loss": 0.8195,
577
  "step": 7200
578
  },
579
  {
580
  "epoch": 0.292,
581
+ "grad_norm": 0.9282687902450562,
582
  "learning_rate": 2.032141361334406e-06,
583
+ "loss": 0.8105,
584
  "step": 7300
585
  },
586
  {
587
  "epoch": 0.296,
588
+ "grad_norm": 0.9001493453979492,
589
  "learning_rate": 2.019701868704224e-06,
590
+ "loss": 0.8129,
591
  "step": 7400
592
  },
593
  {
594
  "epoch": 0.3,
595
+ "grad_norm": 1.1394405364990234,
596
  "learning_rate": 2.007138363406702e-06,
597
+ "loss": 0.8323,
598
  "step": 7500
599
  },
600
  {
601
  "epoch": 0.304,
602
+ "grad_norm": 0.8882161378860474,
603
  "learning_rate": 1.9944528696461016e-06,
604
+ "loss": 0.8335,
605
  "step": 7600
606
  },
607
  {
608
  "epoch": 0.308,
609
+ "grad_norm": 0.9058473110198975,
610
  "learning_rate": 1.9816474312811984e-06,
611
+ "loss": 0.8275,
612
  "step": 7700
613
  },
614
  {
615
  "epoch": 0.312,
616
+ "grad_norm": 0.8733384609222412,
617
  "learning_rate": 1.9687241114959753e-06,
618
+ "loss": 0.8153,
619
  "step": 7800
620
  },
621
  {
622
  "epoch": 0.316,
623
+ "grad_norm": 0.926038920879364,
624
  "learning_rate": 1.955684992467211e-06,
625
+ "loss": 0.8372,
626
  "step": 7900
627
  },
628
  {
629
  "epoch": 0.32,
630
+ "grad_norm": 0.8697761297225952,
631
  "learning_rate": 1.942532175029003e-06,
632
+ "loss": 0.8299,
633
  "step": 8000
634
  },
635
  {
636
  "epoch": 0.32,
637
+ "eval_loss": 1.533625602722168,
638
+ "eval_runtime": 99.4163,
639
+ "eval_samples_per_second": 137.754,
640
+ "eval_steps_per_second": 2.153,
641
  "step": 8000
642
  },
643
  {
644
  "epoch": 0.324,
645
+ "grad_norm": 0.8937378525733948,
646
  "learning_rate": 1.929267778334285e-06,
647
+ "loss": 0.833,
648
  "step": 8100
649
  },
650
  {
651
  "epoch": 0.328,
652
+ "grad_norm": 0.8287638425827026,
653
  "learning_rate": 1.915893939513396e-06,
654
+ "loss": 0.8177,
655
  "step": 8200
656
  },
657
  {
658
  "epoch": 0.332,
659
+ "grad_norm": 0.8690105080604553,
660
  "learning_rate": 1.9024128133297467e-06,
661
+ "loss": 0.8248,
662
  "step": 8300
663
  },
664
  {
665
  "epoch": 0.336,
666
+ "grad_norm": 0.8646178841590881,
667
  "learning_rate": 1.8888265718326532e-06,
668
+ "loss": 0.823,
669
  "step": 8400
670
  },
671
  {
672
  "epoch": 0.34,
673
+ "grad_norm": 0.9007936716079712,
674
  "learning_rate": 1.8751374040073774e-06,
675
+ "loss": 0.8452,
676
  "step": 8500
677
  },
678
  {
679
  "epoch": 0.344,
680
+ "grad_norm": 0.9151228666305542,
681
  "learning_rate": 1.8613475154224456e-06,
682
+ "loss": 0.8223,
683
  "step": 8600
684
  },
685
  {
686
  "epoch": 0.348,
687
+ "grad_norm": 0.9274579882621765,
688
  "learning_rate": 1.8474591278742894e-06,
689
+ "loss": 0.8319,
690
  "step": 8700
691
  },
692
  {
693
  "epoch": 0.352,
694
+ "grad_norm": 0.9464965462684631,
695
  "learning_rate": 1.8334744790292766e-06,
696
+ "loss": 0.8232,
697
  "step": 8800
698
  },
699
  {
700
  "epoch": 0.356,
701
+ "grad_norm": 0.9213349223136902,
702
  "learning_rate": 1.8193958220631833e-06,
703
+ "loss": 0.8256,
704
  "step": 8900
705
  },
706
  {
707
  "epoch": 0.36,
708
+ "grad_norm": 0.8790585994720459,
709
  "learning_rate": 1.805225425298166e-06,
710
+ "loss": 0.8155,
711
  "step": 9000
712
  },
713
  {
714
  "epoch": 0.36,
715
+ "eval_loss": 1.5332763195037842,
716
+ "eval_runtime": 99.7769,
717
+ "eval_samples_per_second": 137.256,
718
+ "eval_steps_per_second": 2.145,
719
  "step": 9000
720
  },
721
  {
722
  "epoch": 0.364,
723
+ "grad_norm": 0.8666992783546448,
724
  "learning_rate": 1.790965571837296e-06,
725
+ "loss": 0.8276,
726
  "step": 9100
727
  },
728
  {
729
  "epoch": 0.368,
730
+ "grad_norm": 0.9162755012512207,
731
  "learning_rate": 1.7766185591967092e-06,
732
+ "loss": 0.814,
733
  "step": 9200
734
  },
735
  {
736
  "epoch": 0.372,
737
+ "grad_norm": 0.849520206451416,
738
  "learning_rate": 1.762186698935437e-06,
739
+ "loss": 0.8035,
740
  "step": 9300
741
  },
742
  {
743
  "epoch": 0.376,
744
+ "grad_norm": 0.8899989724159241,
745
  "learning_rate": 1.7476723162829723e-06,
746
+ "loss": 0.8045,
747
  "step": 9400
748
  },
749
  {
750
  "epoch": 0.38,
751
+ "grad_norm": 0.9177680015563965,
752
  "learning_rate": 1.7330777497646328e-06,
753
+ "loss": 0.8203,
754
  "step": 9500
755
  },
756
  {
757
  "epoch": 0.384,
758
+ "grad_norm": 0.8827778100967407,
759
  "learning_rate": 1.7184053508247853e-06,
760
+ "loss": 0.8308,
761
  "step": 9600
762
  },
763
  {
764
  "epoch": 0.388,
765
+ "grad_norm": 0.9426102042198181,
766
  "learning_rate": 1.703657483447983e-06,
767
+ "loss": 0.8027,
768
  "step": 9700
769
  },
770
  {
771
  "epoch": 0.392,
772
+ "grad_norm": 1.2273017168045044,
773
  "learning_rate": 1.6888365237780886e-06,
774
+ "loss": 0.8023,
775
  "step": 9800
776
  },
777
  {
778
  "epoch": 0.396,
779
+ "grad_norm": 0.8210641145706177,
780
  "learning_rate": 1.6739448597354327e-06,
781
+ "loss": 0.7839,
782
  "step": 9900
783
  },
784
  {
785
  "epoch": 0.4,
786
+ "grad_norm": 0.96668940782547,
787
  "learning_rate": 1.6589848906320794e-06,
788
+ "loss": 0.7787,
789
  "step": 10000
790
  },
791
  {
792
  "epoch": 0.4,
793
+ "eval_loss": 1.536078929901123,
794
+ "eval_runtime": 100.89,
795
+ "eval_samples_per_second": 135.742,
796
+ "eval_steps_per_second": 2.121,
797
  "step": 10000
798
  },
799
  {
800
  "epoch": 0.404,
801
+ "grad_norm": 0.8324722051620483,
802
  "learning_rate": 1.6439590267852528e-06,
803
+ "loss": 0.7875,
804
  "step": 10100
805
  },
806
  {
807
  "epoch": 0.408,
808
+ "grad_norm": 0.8992891907691956,
809
  "learning_rate": 1.6288696891289938e-06,
810
+ "loss": 0.7833,
811
  "step": 10200
812
  },
813
  {
814
  "epoch": 0.412,
815
+ "grad_norm": 0.935984194278717,
816
  "learning_rate": 1.6137193088241021e-06,
817
+ "loss": 0.7715,
818
  "step": 10300
819
  },
820
  {
821
  "epoch": 0.416,
822
+ "grad_norm": 0.9280077815055847,
823
  "learning_rate": 1.598510326866435e-06,
824
+ "loss": 0.7841,
825
  "step": 10400
826
  },
827
  {
828
  "epoch": 0.42,
829
+ "grad_norm": 0.8835504055023193,
830
  "learning_rate": 1.583245193693619e-06,
831
+ "loss": 0.7851,
832
  "step": 10500
833
  },
834
  {
835
  "epoch": 0.424,
836
+ "grad_norm": 0.9455707669258118,
837
  "learning_rate": 1.5679263687902402e-06,
838
+ "loss": 0.789,
839
  "step": 10600
840
  },
841
  {
842
  "epoch": 0.428,
843
+ "grad_norm": 0.9059398770332336,
844
  "learning_rate": 1.552556320291578e-06,
845
+ "loss": 0.7905,
846
  "step": 10700
847
  },
848
  {
849
  "epoch": 0.432,
850
+ "grad_norm": 0.992311418056488,
851
  "learning_rate": 1.5371375245859446e-06,
852
+ "loss": 0.7824,
853
  "step": 10800
854
  },
855
  {
856
  "epoch": 0.436,
857
+ "grad_norm": 0.9719735383987427,
858
  "learning_rate": 1.5216724659156944e-06,
859
+ "loss": 0.7952,
860
  "step": 10900
861
  },
862
  {
863
  "epoch": 0.44,
864
+ "grad_norm": 0.879607617855072,
865
  "learning_rate": 1.506163635976969e-06,
866
+ "loss": 0.7901,
867
  "step": 11000
868
  },
869
  {
870
  "epoch": 0.44,
871
+ "eval_loss": 1.5309207439422607,
872
+ "eval_runtime": 101.145,
873
+ "eval_samples_per_second": 135.4,
874
+ "eval_steps_per_second": 2.116,
875
  "step": 11000
876
  },
877
  {
878
  "epoch": 0.444,
879
+ "grad_norm": 0.8977655172348022,
880
  "learning_rate": 1.49061353351824e-06,
881
+ "loss": 0.7903,
882
  "step": 11100
883
  },
884
  {
885
  "epoch": 0.448,
886
+ "grad_norm": 0.9356399178504944,
887
  "learning_rate": 1.4750246639377161e-06,
888
+ "loss": 0.7906,
889
  "step": 11200
890
  },
891
  {
892
  "epoch": 0.452,
893
+ "grad_norm": 0.9783302545547485,
894
  "learning_rate": 1.4593995388796797e-06,
895
+ "loss": 0.7658,
896
  "step": 11300
897
  },
898
  {
899
  "epoch": 0.456,
900
+ "grad_norm": 0.9520688652992249,
901
  "learning_rate": 1.4437406758298156e-06,
902
+ "loss": 0.7962,
903
  "step": 11400
904
  },
905
  {
906
  "epoch": 0.46,
907
+ "grad_norm": 0.9219170808792114,
908
  "learning_rate": 1.428050597709599e-06,
909
+ "loss": 0.7857,
910
  "step": 11500
911
  },
912
  {
913
  "epoch": 0.464,
914
+ "grad_norm": 0.973076343536377,
915
  "learning_rate": 1.412331832469809e-06,
916
+ "loss": 0.7711,
917
  "step": 11600
918
  },
919
  {
920
  "epoch": 0.468,
921
+ "grad_norm": 0.918262779712677,
922
  "learning_rate": 1.39658691268323e-06,
923
+ "loss": 0.7708,
924
  "step": 11700
925
  },
926
  {
927
  "epoch": 0.472,
928
+ "grad_norm": 0.8629575371742249,
929
  "learning_rate": 1.3808183751366089e-06,
930
+ "loss": 0.7785,
931
  "step": 11800
932
  },
933
  {
934
  "epoch": 0.476,
935
+ "grad_norm": 0.9668097496032715,
936
  "learning_rate": 1.3650287604219342e-06,
937
+ "loss": 0.7756,
938
  "step": 11900
939
  },
940
  {
941
  "epoch": 0.48,
942
+ "grad_norm": 0.8705388307571411,
943
  "learning_rate": 1.3492206125271016e-06,
944
+ "loss": 0.7856,
945
  "step": 12000
946
  },
947
  {
948
  "epoch": 0.48,
949
+ "eval_loss": 1.5304718017578125,
950
+ "eval_runtime": 101.3541,
951
+ "eval_samples_per_second": 135.12,
952
+ "eval_steps_per_second": 2.111,
953
  "step": 12000
954
  },
955
  {
956
  "epoch": 0.484,
957
+ "grad_norm": 0.9626092314720154,
958
  "learning_rate": 1.333396478426031e-06,
959
+ "loss": 0.7866,
960
  "step": 12100
961
  },
962
  {
963
  "epoch": 0.488,
964
+ "grad_norm": 0.9647358655929565,
965
  "learning_rate": 1.317558907668306e-06,
966
+ "loss": 0.7718,
967
  "step": 12200
968
  },
969
  {
970
  "epoch": 0.492,
971
+ "grad_norm": 0.8542793393135071,
972
  "learning_rate": 1.3017104519683932e-06,
973
+ "loss": 0.7894,
974
  "step": 12300
975
  },
976
  {
977
  "epoch": 0.496,
978
+ "grad_norm": 0.9130584001541138,
979
  "learning_rate": 1.285853664794518e-06,
980
+ "loss": 0.7854,
981
  "step": 12400
982
  },
983
  {
984
  "epoch": 0.5,
985
+ "grad_norm": 0.9400973320007324,
986
  "learning_rate": 1.269991100957254e-06,
987
+ "loss": 0.7803,
988
  "step": 12500
989
  },
990
  {
991
  "epoch": 0.504,
992
+ "grad_norm": 0.9699378609657288,
993
  "learning_rate": 1.2541253161978986e-06,
994
+ "loss": 0.7809,
995
  "step": 12600
996
  },
997
  {
998
  "epoch": 0.508,
999
+ "grad_norm": 0.9242445826530457,
1000
  "learning_rate": 1.238258866776697e-06,
1001
+ "loss": 0.7724,
1002
  "step": 12700
1003
  },
1004
  {
1005
  "epoch": 0.512,
1006
+ "grad_norm": 0.899358868598938,
1007
  "learning_rate": 1.222394309060982e-06,
1008
+ "loss": 0.7883,
1009
  "step": 12800
1010
  },
1011
  {
1012
  "epoch": 0.516,
1013
+ "grad_norm": 0.9915406107902527,
1014
  "learning_rate": 1.2065341991133013e-06,
1015
+ "loss": 0.7908,
1016
  "step": 12900
1017
  },
1018
  {
1019
  "epoch": 0.52,
1020
+ "grad_norm": 0.9488005042076111,
1021
  "learning_rate": 1.1906810922795864e-06,
1022
+ "loss": 0.7955,
1023
  "step": 13000
1024
  },
1025
  {
1026
  "epoch": 0.52,
1027
+ "eval_loss": 1.5362560749053955,
1028
+ "eval_runtime": 101.5086,
1029
+ "eval_samples_per_second": 134.915,
1030
+ "eval_steps_per_second": 2.108,
1031
  "step": 13000
1032
  },
1033
  {
1034
  "epoch": 0.524,
1035
+ "grad_norm": 0.9982778429985046,
1036
  "learning_rate": 1.1748375427774422e-06,
1037
+ "loss": 0.7781,
1038
  "step": 13100
1039
  },
1040
  {
1041
  "epoch": 0.528,
1042
+ "grad_norm": 0.9015283584594727,
1043
  "learning_rate": 1.1590061032846182e-06,
1044
+ "loss": 0.7729,
1045
  "step": 13200
1046
  },
1047
  {
1048
  "epoch": 0.532,
1049
+ "grad_norm": 0.9279696941375732,
1050
  "learning_rate": 1.1431893245277262e-06,
1051
+ "loss": 0.7818,
1052
  "step": 13300
1053
  },
1054
  {
1055
  "epoch": 0.536,
1056
+ "grad_norm": 0.8429758548736572,
1057
  "learning_rate": 1.1273897548712726e-06,
1058
+ "loss": 0.7977,
1059
  "step": 13400
1060
  },
1061
  {
1062
  "epoch": 0.54,
1063
+ "grad_norm": 0.9567962288856506,
1064
  "learning_rate": 1.11160993990707e-06,
1065
+ "loss": 0.7762,
1066
  "step": 13500
1067
  },
1068
  {
1069
  "epoch": 0.544,
1070
+ "grad_norm": 0.8935989737510681,
1071
  "learning_rate": 1.0958524220440999e-06,
1072
+ "loss": 0.7862,
1073
  "step": 13600
1074
  },
1075
  {
1076
  "epoch": 0.548,
1077
+ "grad_norm": 0.9136532545089722,
1078
  "learning_rate": 1.0801197400988838e-06,
1079
+ "loss": 0.7656,
1080
  "step": 13700
1081
  },
1082
  {
1083
  "epoch": 0.552,
1084
+ "grad_norm": 0.9243994355201721,
1085
  "learning_rate": 1.0644144288864352e-06,
1086
+ "loss": 0.7751,
1087
  "step": 13800
1088
  },
1089
  {
1090
  "epoch": 0.556,
1091
+ "grad_norm": 0.9153244495391846,
1092
  "learning_rate": 1.048739018811855e-06,
1093
+ "loss": 0.7776,
1094
  "step": 13900
1095
  },
1096
  {
1097
  "epoch": 0.56,
1098
+ "grad_norm": 0.8624141812324524,
1099
  "learning_rate": 1.0330960354626384e-06,
1100
+ "loss": 0.7831,
1101
  "step": 14000
1102
  },
1103
  {
1104
  "epoch": 0.56,
1105
+ "eval_loss": 1.5321879386901855,
1106
+ "eval_runtime": 101.7151,
1107
+ "eval_samples_per_second": 134.641,
1108
+ "eval_steps_per_second": 2.104,
1109
  "step": 14000
1110
  },
1111
  {
1112
  "epoch": 0.564,
1113
+ "grad_norm": 0.944176971912384,
1114
  "learning_rate": 1.0174879992017586e-06,
1115
+ "loss": 0.7646,
1116
  "step": 14100
1117
  },
1118
  {
1119
  "epoch": 0.568,
1120
+ "grad_norm": 1.0256272554397583,
1121
  "learning_rate": 1.0019174247615919e-06,
1122
+ "loss": 0.7966,
1123
  "step": 14200
1124
  },
1125
  {
1126
  "epoch": 0.572,
1127
+ "grad_norm": 1.0110539197921753,
1128
  "learning_rate": 9.863868208387473e-07,
1129
+ "loss": 0.7878,
1130
  "step": 14300
1131
  },
1132
  {
1133
  "epoch": 0.576,
1134
+ "grad_norm": 0.9285057783126831,
1135
  "learning_rate": 9.708986896898727e-07,
1136
+ "loss": 0.7826,
1137
  "step": 14400
1138
  },
1139
  {
1140
  "epoch": 0.58,
1141
+ "grad_norm": 0.9270790219306946,
1142
  "learning_rate": 9.554555267284956e-07,
1143
+ "loss": 0.7832,
1144
  "step": 14500
1145
  },
1146
  {
1147
  "epoch": 0.584,
1148
+ "grad_norm": 0.901900053024292,
1149
  "learning_rate": 9.400598201229705e-07,
1150
+ "loss": 0.7747,
1151
  "step": 14600
1152
  },
1153
  {
1154
  "epoch": 0.588,
1155
+ "grad_norm": 0.9428499341011047,
1156
  "learning_rate": 9.247140503955863e-07,
1157
+ "loss": 0.7724,
1158
  "step": 14700
1159
  },
1160
  {
1161
  "epoch": 0.592,
1162
+ "grad_norm": 0.8610336780548096,
1163
  "learning_rate": 9.09420690022913e-07,
1164
+ "loss": 0.7818,
1165
  "step": 14800
1166
  },
1167
  {
1168
  "epoch": 0.596,
1169
+ "grad_norm": 0.9091448783874512,
1170
  "learning_rate": 8.941822030374405e-07,
1171
+ "loss": 0.7833,
1172
  "step": 14900
1173
  },
1174
  {
1175
  "epoch": 0.6,
1176
+ "grad_norm": 0.9281105995178223,
1177
  "learning_rate": 8.790010446305814e-07,
1178
+ "loss": 0.7762,
1179
  "step": 15000
1180
  },
1181
  {
1182
  "epoch": 0.6,
1183
+ "eval_loss": 1.5338356494903564,
1184
+ "eval_runtime": 101.6428,
1185
+ "eval_samples_per_second": 134.736,
1186
+ "eval_steps_per_second": 2.105,
1187
  "step": 15000
1188
  },
1189
  {
1190
  "epoch": 0.604,
1191
+ "grad_norm": 0.9168809056282043,
1192
  "learning_rate": 8.63879660757092e-07,
1193
+ "loss": 0.7757,
1194
  "step": 15100
1195
  },
1196
  {
1197
  "epoch": 0.608,
1198
+ "grad_norm": 0.9485521912574768,
1199
  "learning_rate": 8.488204877409884e-07,
1200
+ "loss": 0.7722,
1201
  "step": 15200
1202
  },
1203
  {
1204
  "epoch": 0.612,
1205
+ "grad_norm": 0.9768329858779907,
1206
  "learning_rate": 8.338259518830106e-07,
1207
+ "loss": 0.7677,
1208
  "step": 15300
1209
  },
1210
  {
1211
  "epoch": 0.616,
1212
+ "grad_norm": 0.9104267954826355,
1213
  "learning_rate": 8.18898469069703e-07,
1214
+ "loss": 0.7703,
1215
  "step": 15400
1216
  },
1217
  {
1218
  "epoch": 0.62,
1219
+ "grad_norm": 0.9525657296180725,
1220
  "learning_rate": 8.040404443841701e-07,
1221
+ "loss": 0.7567,
1222
  "step": 15500
1223
  },
1224
  {
1225
  "epoch": 0.624,
1226
+ "grad_norm": 0.8855301737785339,
1227
  "learning_rate": 7.892542717185766e-07,
1228
+ "loss": 0.7668,
1229
  "step": 15600
1230
  },
1231
  {
1232
  "epoch": 0.628,
1233
+ "grad_norm": 0.9275020956993103,
1234
  "learning_rate": 7.745423333884483e-07,
1235
+ "loss": 0.77,
1236
  "step": 15700
1237
  },
1238
  {
1239
  "epoch": 0.632,
1240
+ "grad_norm": 0.9016631841659546,
1241
  "learning_rate": 7.599069997488386e-07,
1242
+ "loss": 0.7769,
1243
  "step": 15800
1244
  },
1245
  {
1246
  "epoch": 0.636,
1247
+ "grad_norm": 0.9669123291969299,
1248
  "learning_rate": 7.453506288124224e-07,
1249
+ "loss": 0.776,
1250
  "step": 15900
1251
  },
1252
  {
1253
  "epoch": 0.64,
1254
+ "grad_norm": 0.9893669486045837,
1255
  "learning_rate": 7.308755658695775e-07,
1256
+ "loss": 0.7784,
1257
  "step": 16000
1258
  },
1259
  {
1260
  "epoch": 0.64,
1261
+ "eval_loss": 1.5408803224563599,
1262
+ "eval_runtime": 101.8477,
1263
+ "eval_samples_per_second": 134.465,
1264
+ "eval_steps_per_second": 2.101,
1265
  "step": 16000
1266
  },
1267
  {
1268
  "epoch": 0.644,
1269
+ "grad_norm": 0.9406617879867554,
1270
  "learning_rate": 7.164841431105172e-07,
1271
+ "loss": 0.7253,
1272
  "step": 16100
1273
  },
1274
  {
1275
  "epoch": 0.648,
1276
+ "grad_norm": 0.9569827914237976,
1277
  "learning_rate": 7.021786792495325e-07,
1278
+ "loss": 0.7643,
1279
  "step": 16200
1280
  },
1281
  {
1282
  "epoch": 0.652,
1283
+ "grad_norm": 0.910629391670227,
1284
  "learning_rate": 6.879614791514075e-07,
1285
+ "loss": 0.7921,
1286
  "step": 16300
1287
  },
1288
  {
1289
  "epoch": 0.656,
1290
+ "grad_norm": 0.9068129658699036,
1291
  "learning_rate": 6.738348334600634e-07,
1292
+ "loss": 0.7982,
1293
  "step": 16400
1294
  },
1295
  {
1296
  "epoch": 0.66,
1297
+ "grad_norm": 0.9654185175895691,
1298
  "learning_rate": 6.598010182294938e-07,
1299
+ "loss": 0.7691,
1300
  "step": 16500
1301
  },
1302
  {
1303
  "epoch": 0.664,
1304
+ "grad_norm": 0.8514201641082764,
1305
  "learning_rate": 6.458622945570538e-07,
1306
+ "loss": 0.7763,
1307
  "step": 16600
1308
  },
1309
  {
1310
  "epoch": 0.668,
1311
+ "grad_norm": 0.9092292189598083,
1312
  "learning_rate": 6.320209082191569e-07,
1313
+ "loss": 0.7707,
1314
  "step": 16700
1315
  },
1316
  {
1317
  "epoch": 0.672,
1318
+ "grad_norm": 0.9370847940444946,
1319
  "learning_rate": 6.182790893094402e-07,
1320
+ "loss": 0.7861,
1321
  "step": 16800
1322
  },
1323
  {
1324
  "epoch": 0.676,
1325
+ "grad_norm": 0.9759059548377991,
1326
  "learning_rate": 6.046390518794556e-07,
1327
+ "loss": 0.7661,
1328
  "step": 16900
1329
  },
1330
  {
1331
  "epoch": 0.68,
1332
+ "grad_norm": 0.9293698072433472,
1333
  "learning_rate": 5.911029935819468e-07,
1334
+ "loss": 0.7833,
1335
  "step": 17000
1336
  },
1337
  {
1338
  "epoch": 0.68,
1339
+ "eval_loss": 1.5393362045288086,
1340
+ "eval_runtime": 102.1612,
1341
+ "eval_samples_per_second": 134.053,
1342
+ "eval_steps_per_second": 2.095,
1343
  "step": 17000
1344
  },
1345
  {
1346
  "epoch": 0.684,
1347
+ "grad_norm": 0.8583377003669739,
1348
  "learning_rate": 5.776730953167705e-07,
1349
+ "loss": 0.783,
1350
  "step": 17100
1351
  },
1352
  {
1353
  "epoch": 0.688,
1354
+ "grad_norm": 0.9435706734657288,
1355
  "learning_rate": 5.643515208795141e-07,
1356
+ "loss": 0.7795,
1357
  "step": 17200
1358
  },
1359
  {
1360
  "epoch": 0.692,
1361
+ "grad_norm": 0.9650281667709351,
1362
  "learning_rate": 5.511404166128647e-07,
1363
+ "loss": 0.7861,
1364
  "step": 17300
1365
  },
1366
  {
1367
  "epoch": 0.696,
1368
+ "grad_norm": 0.8666310906410217,
1369
  "learning_rate": 5.380419110608033e-07,
1370
+ "loss": 0.7635,
1371
  "step": 17400
1372
  },
1373
  {
1374
  "epoch": 0.7,
1375
+ "grad_norm": 0.9947652816772461,
1376
  "learning_rate": 5.250581146256524e-07,
1377
+ "loss": 0.7823,
1378
  "step": 17500
1379
  },
1380
  {
1381
+ "epoch": 1.00312,
1382
+ "grad_norm": 0.9741705060005188,
1383
  "learning_rate": 5.121911192280557e-07,
1384
+ "loss": 0.7853,
1385
  "step": 17600
1386
  },
1387
  {
1388
+ "epoch": 1.00712,
1389
+ "grad_norm": 0.9121885299682617,
1390
  "learning_rate": 4.994429979699302e-07,
1391
+ "loss": 0.7667,
1392
  "step": 17700
1393
  },
1394
  {
1395
+ "epoch": 1.01112,
1396
+ "grad_norm": 0.9591026902198792,
1397
  "learning_rate": 4.868158048004537e-07,
1398
+ "loss": 0.7719,
1399
  "step": 17800
1400
  },
1401
  {
1402
+ "epoch": 1.01512,
1403
+ "grad_norm": 0.8141223192214966,
1404
  "learning_rate": 4.743115741851383e-07,
1405
+ "loss": 0.7512,
1406
  "step": 17900
1407
  },
1408
  {
1409
+ "epoch": 1.01912,
1410
+ "grad_norm": 0.9282773733139038,
1411
  "learning_rate": 4.6193232077804006e-07,
1412
+ "loss": 0.7685,
1413
  "step": 18000
1414
  },
1415
  {
1416
+ "epoch": 1.01912,
1417
+ "eval_loss": 1.5338866710662842,
1418
+ "eval_runtime": 101.9336,
1419
+ "eval_samples_per_second": 134.352,
1420
+ "eval_steps_per_second": 2.099,
1421
  "step": 18000
1422
  },
1423
  {
1424
+ "epoch": 1.02312,
1425
+ "grad_norm": 0.9288948774337769,
1426
  "learning_rate": 4.4968003909716243e-07,
1427
+ "loss": 0.7806,
1428
  "step": 18100
1429
  },
1430
  {
1431
+ "epoch": 1.02712,
1432
+ "grad_norm": 0.8646376729011536,
1433
  "learning_rate": 4.3755670320310443e-07,
1434
+ "loss": 0.7555,
1435
  "step": 18200
1436
  },
1437
  {
1438
+ "epoch": 1.03112,
1439
+ "grad_norm": 0.9440610408782959,
1440
  "learning_rate": 4.2556426638100555e-07,
1441
+ "loss": 0.7835,
1442
  "step": 18300
1443
  },
1444
  {
1445
+ "epoch": 1.03512,
1446
+ "grad_norm": 0.8950068354606628,
1447
  "learning_rate": 4.1370466082583353e-07,
1448
+ "loss": 0.7765,
1449
  "step": 18400
1450
  },
1451
  {
1452
+ "epoch": 1.03912,
1453
+ "grad_norm": 0.9245269298553467,
1454
  "learning_rate": 4.0197979733107755e-07,
1455
+ "loss": 0.7707,
1456
  "step": 18500
1457
  },
1458
  {
1459
+ "epoch": 1.04312,
1460
+ "grad_norm": 0.9309988021850586,
1461
  "learning_rate": 3.903915649808812e-07,
1462
+ "loss": 0.7608,
1463
  "step": 18600
1464
  },
1465
  {
1466
+ "epoch": 1.04712,
1467
+ "grad_norm": 0.9837161898612976,
1468
  "learning_rate": 3.789418308456812e-07,
1469
+ "loss": 0.7731,
1470
  "step": 18700
1471
  },
1472
  {
1473
+ "epoch": 1.05112,
1474
+ "grad_norm": 0.8932573795318604,
1475
  "learning_rate": 3.676324396813856e-07,
1476
+ "loss": 0.7901,
1477
  "step": 18800
1478
  },
1479
  {
1480
+ "epoch": 1.05512,
1481
+ "grad_norm": 0.9325317144393921,
1482
  "learning_rate": 3.5646521363215447e-07,
1483
+ "loss": 0.771,
1484
  "step": 18900
1485
  },
1486
  {
1487
+ "epoch": 1.05912,
1488
+ "grad_norm": 0.8987941741943359,
1489
  "learning_rate": 3.4544195193681615e-07,
1490
+ "loss": 0.779,
1491
  "step": 19000
1492
  },
1493
  {
1494
+ "epoch": 1.05912,
1495
+ "eval_loss": 1.5388661623001099,
1496
+ "eval_runtime": 99.4571,
1497
+ "eval_samples_per_second": 137.698,
1498
+ "eval_steps_per_second": 2.152,
1499
  "step": 19000
1500
  },
1501
  {
1502
+ "epoch": 1.06312,
1503
+ "grad_norm": 0.9514444470405579,
1504
  "learning_rate": 3.3456443063898157e-07,
1505
+ "loss": 0.7715,
1506
  "step": 19100
1507
  },
1508
  {
1509
+ "epoch": 1.06712,
1510
+ "grad_norm": 0.9194713234901428,
1511
  "learning_rate": 3.238344023008888e-07,
1512
+ "loss": 0.7666,
1513
  "step": 19200
1514
  },
1515
  {
1516
+ "epoch": 1.07112,
1517
+ "grad_norm": 0.8961077928543091,
1518
  "learning_rate": 3.132535957210366e-07,
1519
+ "loss": 0.7686,
1520
  "step": 19300
1521
  },
1522
  {
1523
+ "epoch": 1.07512,
1524
+ "grad_norm": 0.9342730045318604,
1525
  "learning_rate": 3.0282371565564324e-07,
1526
+ "loss": 0.7812,
1527
  "step": 19400
1528
  },
1529
  {
1530
+ "epoch": 1.07912,
1531
+ "grad_norm": 0.8818745613098145,
1532
  "learning_rate": 2.925464425439789e-07,
1533
+ "loss": 0.7694,
1534
  "step": 19500
1535
  },
1536
  {
1537
+ "epoch": 1.08312,
1538
+ "grad_norm": 0.8581619262695312,
1539
  "learning_rate": 2.824234322376185e-07,
1540
+ "loss": 0.7791,
1541
  "step": 19600
1542
  },
1543
  {
1544
+ "epoch": 1.08712,
1545
+ "grad_norm": 1.0447196960449219,
1546
  "learning_rate": 2.724563157336542e-07,
1547
+ "loss": 0.7864,
1548
  "step": 19700
1549
  },
1550
  {
1551
+ "epoch": 1.09112,
1552
+ "grad_norm": 0.9621513485908508,
1553
  "learning_rate": 2.626466989119131e-07,
1554
+ "loss": 0.7753,
1555
  "step": 19800
1556
  },
1557
  {
1558
+ "epoch": 1.09512,
1559
+ "grad_norm": 1.0203659534454346,
1560
  "learning_rate": 2.5299616227621946e-07,
1561
+ "loss": 0.7626,
1562
  "step": 19900
1563
  },
1564
  {
1565
+ "epoch": 1.09912,
1566
+ "grad_norm": 1.0235393047332764,
1567
  "learning_rate": 2.435062606997499e-07,
1568
+ "loss": 0.7727,
1569
  "step": 20000
1570
  },
1571
  {
1572
+ "epoch": 1.09912,
1573
+ "eval_loss": 1.5318886041641235,
1574
+ "eval_runtime": 99.8132,
1575
+ "eval_samples_per_second": 137.206,
1576
+ "eval_steps_per_second": 2.144,
1577
  "step": 20000
1578
  },
1579
  {
1580
+ "epoch": 1.10312,
1581
+ "grad_norm": 0.9197788834571838,
1582
  "learning_rate": 2.3417852317451418e-07,
1583
+ "loss": 0.7668,
1584
  "step": 20100
1585
  },
1586
  {
1587
+ "epoch": 1.10712,
1588
+ "grad_norm": 0.9011102914810181,
1589
  "learning_rate": 2.250144525650086e-07,
1590
+ "loss": 0.7605,
1591
  "step": 20200
1592
  },
1593
  {
1594
+ "epoch": 1.11112,
1595
+ "grad_norm": 0.9056942462921143,
1596
  "learning_rate": 2.160155253660759e-07,
1597
+ "loss": 0.7541,
1598
  "step": 20300
1599
  },
1600
  {
1601
+ "epoch": 1.1151200000000001,
1602
+ "grad_norm": 0.9336057305335999,
1603
  "learning_rate": 2.071831914650173e-07,
1604
+ "loss": 0.7635,
1605
  "step": 20400
1606
  },
1607
  {
1608
+ "epoch": 1.1191200000000001,
1609
+ "grad_norm": 0.886619508266449,
1610
  "learning_rate": 1.9851887390798922e-07,
1611
+ "loss": 0.7738,
1612
  "step": 20500
1613
  },
1614
  {
1615
+ "epoch": 1.12312,
1616
+ "grad_norm": 1.0144513845443726,
1617
  "learning_rate": 1.9002396867072587e-07,
1618
+ "loss": 0.7649,
1619
  "step": 20600
1620
  },
1621
  {
1622
+ "epoch": 1.1271200000000001,
1623
+ "grad_norm": 1.0580683946609497,
1624
  "learning_rate": 1.816998444336214e-07,
1625
+ "loss": 0.7703,
1626
  "step": 20700
1627
  },
1628
  {
1629
+ "epoch": 1.13112,
1630
+ "grad_norm": 0.9610932469367981,
1631
  "learning_rate": 1.7354784236121206e-07,
1632
+ "loss": 0.7649,
1633
  "step": 20800
1634
  },
1635
  {
1636
+ "epoch": 1.13512,
1637
+ "grad_norm": 0.9152004718780518,
1638
  "learning_rate": 1.6556927588609078e-07,
1639
+ "loss": 0.7774,
1640
  "step": 20900
1641
  },
1642
  {
1643
+ "epoch": 1.13912,
1644
+ "grad_norm": 0.9117056131362915,
1645
  "learning_rate": 1.577654304972899e-07,
1646
+ "loss": 0.7794,
1647
  "step": 21000
1648
  },
1649
  {
1650
+ "epoch": 1.13912,
1651
+ "eval_loss": 1.5366367101669312,
1652
+ "eval_runtime": 100.0918,
1653
+ "eval_samples_per_second": 136.824,
1654
+ "eval_steps_per_second": 2.138,
1655
  "step": 21000
1656
  },
1657
  {
1658
+ "epoch": 1.14312,
1659
+ "grad_norm": 0.8991335034370422,
1660
  "learning_rate": 1.501375635331652e-07,
1661
+ "loss": 0.769,
1662
  "step": 21100
1663
  },
1664
  {
1665
+ "epoch": 1.14712,
1666
+ "grad_norm": 0.9642401933670044,
1667
  "learning_rate": 1.4268690397881675e-07,
1668
+ "loss": 0.802,
1669
  "step": 21200
1670
  },
1671
  {
1672
+ "epoch": 1.15112,
1673
+ "grad_norm": 0.986643373966217,
1674
  "learning_rate": 1.3541465226807813e-07,
1675
+ "loss": 0.7899,
1676
  "step": 21300
1677
  },
1678
  {
1679
+ "epoch": 1.15512,
1680
+ "grad_norm": 1.0371657609939575,
1681
  "learning_rate": 1.283219800901045e-07,
1682
+ "loss": 0.7921,
1683
  "step": 21400
1684
  },
1685
  {
1686
+ "epoch": 1.15912,
1687
+ "grad_norm": 0.9209062457084656,
1688
  "learning_rate": 1.2141003020059273e-07,
1689
+ "loss": 0.7882,
1690
  "step": 21500
1691
  },
1692
  {
1693
+ "epoch": 1.16312,
1694
+ "grad_norm": 0.9286125898361206,
1695
  "learning_rate": 1.1467991623766287e-07,
1696
+ "loss": 0.7936,
1697
  "step": 21600
1698
  },
1699
  {
1700
+ "epoch": 1.16712,
1701
+ "grad_norm": 0.9320632219314575,
1702
  "learning_rate": 1.081327225424321e-07,
1703
+ "loss": 0.7274,
1704
  "step": 21700
1705
  },
1706
  {
1707
+ "epoch": 1.17112,
1708
+ "grad_norm": 0.9366289377212524,
1709
  "learning_rate": 1.0176950398430752e-07,
1710
+ "loss": 0.8088,
1711
  "step": 21800
1712
  },
1713
  {
1714
+ "epoch": 1.17512,
1715
+ "grad_norm": 0.9981914162635803,
1716
  "learning_rate": 9.559128579102767e-08,
1717
+ "loss": 0.8001,
1718
  "step": 21900
1719
  },
1720
  {
1721
+ "epoch": 1.17912,
1722
+ "grad_norm": 0.9554498791694641,
1723
  "learning_rate": 8.959906338348007e-08,
1724
+ "loss": 0.8014,
1725
  "step": 22000
1726
  },
1727
  {
1728
+ "epoch": 1.17912,
1729
+ "eval_loss": 1.536007046699524,
1730
+ "eval_runtime": 100.6846,
1731
+ "eval_samples_per_second": 136.019,
1732
+ "eval_steps_per_second": 2.125,
1733
  "step": 22000
1734
  },
1735
  {
1736
+ "epoch": 1.18312,
1737
+ "grad_norm": 0.9604577422142029,
1738
  "learning_rate": 8.37938022153223e-08,
1739
+ "loss": 0.8021,
1740
  "step": 22100
1741
  },
1742
  {
1743
+ "epoch": 1.18712,
1744
+ "grad_norm": 0.9202996492385864,
1745
  "learning_rate": 7.817643761742891e-08,
1746
+ "loss": 0.8013,
1747
  "step": 22200
1748
  },
1749
  {
1750
+ "epoch": 1.19112,
1751
+ "grad_norm": 0.8438488841056824,
1752
  "learning_rate": 7.274787464719338e-08,
1753
+ "loss": 0.7858,
1754
  "step": 22300
1755
  },
1756
  {
1757
+ "epoch": 1.19512,
1758
+ "grad_norm": 0.8734295964241028,
1759
  "learning_rate": 6.75089879427078e-08,
1760
+ "loss": 0.7898,
1761
  "step": 22400
1762
  },
1763
  {
1764
+ "epoch": 1.19912,
1765
+ "grad_norm": 0.9023377299308777,
1766
  "learning_rate": 6.246062158184241e-08,
1767
+ "loss": 0.7807,
1768
  "step": 22500
1769
  },
1770
  {
1771
+ "epoch": 1.20312,
1772
+ "grad_norm": 0.8997081518173218,
1773
  "learning_rate": 5.7603588946250064e-08,
1774
+ "loss": 0.8005,
1775
  "step": 22600
1776
  },
1777
  {
1778
+ "epoch": 1.20712,
1779
+ "grad_norm": 0.8866868019104004,
1780
  "learning_rate": 5.293867259031568e-08,
1781
+ "loss": 0.7906,
1782
  "step": 22700
1783
  },
1784
  {
1785
+ "epoch": 1.21112,
1786
+ "grad_norm": 1.0144649744033813,
1787
  "learning_rate": 4.8466624115073164e-08,
1788
+ "loss": 0.7948,
1789
  "step": 22800
1790
  },
1791
  {
1792
+ "epoch": 1.21512,
1793
+ "grad_norm": 0.9594987034797668,
1794
  "learning_rate": 4.4188164047108403e-08,
1795
+ "loss": 0.7796,
1796
  "step": 22900
1797
  },
1798
  {
1799
+ "epoch": 1.21912,
1800
+ "grad_norm": 0.9683875441551208,
1801
  "learning_rate": 4.010398172247104e-08,
1802
+ "loss": 0.7967,
1803
  "step": 23000
1804
  },
1805
  {
1806
+ "epoch": 1.21912,
1807
+ "eval_loss": 1.5381661653518677,
1808
+ "eval_runtime": 100.7572,
1809
+ "eval_samples_per_second": 135.921,
1810
+ "eval_steps_per_second": 2.124,
1811
  "step": 23000
1812
  },
1813
  {
1814
+ "epoch": 1.22312,
1815
+ "grad_norm": 0.982801079750061,
1816
  "learning_rate": 3.6214735175608004e-08,
1817
+ "loss": 0.7876,
1818
  "step": 23100
1819
  },
1820
  {
1821
+ "epoch": 1.22712,
1822
+ "grad_norm": 0.944310188293457,
1823
  "learning_rate": 3.252105103334499e-08,
1824
+ "loss": 0.8015,
1825
  "step": 23200
1826
  },
1827
  {
1828
+ "epoch": 1.23112,
1829
+ "grad_norm": 1.016548991203308,
1830
  "learning_rate": 2.9023524413923365e-08,
1831
+ "loss": 0.8183,
1832
  "step": 23300
1833
  },
1834
  {
1835
+ "epoch": 1.23512,
1836
+ "grad_norm": 0.9078572392463684,
1837
  "learning_rate": 2.5722718831117656e-08,
1838
+ "loss": 0.7934,
1839
  "step": 23400
1840
  },
1841
  {
1842
+ "epoch": 1.23912,
1843
+ "grad_norm": 0.9393835067749023,
1844
  "learning_rate": 2.26191661034425e-08,
1845
+ "loss": 0.8127,
1846
  "step": 23500
1847
  },
1848
  {
1849
+ "epoch": 1.24312,
1850
+ "grad_norm": 0.9347549080848694,
1851
  "learning_rate": 1.9713366268468148e-08,
1852
+ "loss": 0.7918,
1853
  "step": 23600
1854
  },
1855
  {
1856
+ "epoch": 1.24712,
1857
+ "grad_norm": 0.9802634716033936,
1858
  "learning_rate": 1.700578750225432e-08,
1859
+ "loss": 0.8041,
1860
  "step": 23700
1861
  },
1862
  {
1863
+ "epoch": 1.25112,
1864
+ "grad_norm": 0.9110268950462341,
1865
  "learning_rate": 1.4496866043919865e-08,
1866
+ "loss": 0.8109,
1867
  "step": 23800
1868
  },
1869
  {
1870
+ "epoch": 1.25512,
1871
+ "grad_norm": 1.0479713678359985,
1872
  "learning_rate": 1.2187006125356087e-08,
1873
+ "loss": 0.8167,
1874
  "step": 23900
1875
  },
1876
  {
1877
+ "epoch": 1.25912,
1878
+ "grad_norm": 0.9281754493713379,
1879
  "learning_rate": 1.0076579906098255e-08,
1880
+ "loss": 0.8093,
1881
  "step": 24000
1882
  },
1883
  {
1884
+ "epoch": 1.25912,
1885
+ "eval_loss": 1.5352606773376465,
1886
+ "eval_runtime": 100.7374,
1887
+ "eval_samples_per_second": 135.948,
1888
+ "eval_steps_per_second": 2.124,
1889
  "step": 24000
1890
  },
1891
  {
1892
+ "epoch": 1.26312,
1893
+ "grad_norm": 0.9759633541107178,
1894
  "learning_rate": 8.16592741336386e-09,
1895
+ "loss": 0.848,
1896
  "step": 24100
1897
  },
1898
  {
1899
+ "epoch": 1.26712,
1900
+ "grad_norm": 0.8610565066337585,
1901
  "learning_rate": 6.455356487267833e-09,
1902
+ "loss": 0.8128,
1903
  "step": 24200
1904
  },
1905
  {
1906
+ "epoch": 1.27112,
1907
+ "grad_norm": 0.9373227953910828,
1908
  "learning_rate": 4.9451427312251224e-09,
1909
+ "loss": 0.8322,
1910
  "step": 24300
1911
  },
1912
  {
1913
+ "epoch": 1.27512,
1914
+ "grad_norm": 0.9395922422409058,
1915
  "learning_rate": 3.635529467544696e-09,
1916
+ "loss": 0.828,
1917
  "step": 24400
1918
  },
1919
  {
1920
+ "epoch": 1.27912,
1921
+ "grad_norm": 0.9699512720108032,
1922
  "learning_rate": 2.526727698227288e-09,
1923
+ "loss": 0.8161,
1924
  "step": 24500
1925
  },
1926
  {
1927
+ "epoch": 1.28312,
1928
+ "grad_norm": 0.9437727332115173,
1929
  "learning_rate": 1.6189160709680074e-09,
1930
+ "loss": 0.8013,
1931
  "step": 24600
1932
  },
1933
  {
1934
+ "epoch": 1.28712,
1935
+ "grad_norm": 0.9620960354804993,
1936
  "learning_rate": 9.122408503739466e-10,
1937
+ "loss": 0.8119,
1938
  "step": 24700
1939
  },
1940
  {
1941
+ "epoch": 1.29112,
1942
+ "grad_norm": 0.8797083497047424,
1943
  "learning_rate": 4.0681589439789395e-10,
1944
+ "loss": 0.8052,
1945
  "step": 24800
1946
  },
1947
  {
1948
+ "epoch": 1.29512,
1949
+ "grad_norm": 0.9612630605697632,
1950
  "learning_rate": 1.0272263599411803e-10,
1951
+ "loss": 0.8062,
1952
  "step": 24900
1953
  },
1954
  {
1955
+ "epoch": 1.29912,
1956
+ "grad_norm": 1.0144433975219727,
1957
  "learning_rate": 1.006999733599301e-14,
1958
+ "loss": 0.8193,
1959
  "step": 25000
1960
  },
1961
  {
1962
+ "epoch": 1.29912,
1963
+ "eval_loss": 1.534403920173645,
1964
+ "eval_runtime": 101.277,
1965
+ "eval_samples_per_second": 135.223,
1966
+ "eval_steps_per_second": 2.113,
1967
  "step": 25000
1968
  }
1969
  ],
 
1984
  "attributes": {}
1985
  }
1986
  },
1987
+ "total_flos": 3.384632008699478e+18,
1988
  "train_batch_size": 64,
1989
  "trial_name": null,
1990
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8468e002e14d69abcb2f7de8e401f9fa2561c9e9f59ee528d9d623ec438f38ae
3
  size 5841
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f45ebc3b5cfb179371730040033cc60cf9d8216007feceb0c4d7cbbf1cda1e6
3
  size 5841