CocoRoF commited on
Commit
cfc1444
·
verified ·
1 Parent(s): 5922ff2

Training in progress, step 20000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7f780a9a78fdadce0c173bf611a5da60db156d63194a2e6a49f1f18c27d761ce
3
  size 737632172
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0decb17e1576a2e87ecfcfd97d8e2ab8486eb9a2ec6ff00fa3b7efa6f74327ba
3
  size 737632172
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:15d1e92749084ac9dde10d7d65367e2e60f9c34a59a6069753dd8472f0fc8a13
3
  size 1475354682
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:018d14bd69f4e34f78162e646a75e937b89f1d651e49bb2da5fd566a3dc03363
3
  size 1475354682
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1754ce1fea08e0a1abf50b88b05ad2235accf247d46d7ee2f8c08c6670f73f31
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a711ae47907423581a85380ad2222bf6eaf1af9c9ec45797d4f1a9fb127db2c
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ae26017d4550577988f9e10089ab5b71db8da5c695439c0a0fea91d6a1fd0704
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e8c873ca3f378713a8a07acffb82e5be966b4efb3815b7ddf04ac4a39c37a73
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5d1f128b23b661bf875e117cc47a5648d99e77550cfacf4588ce64a1dd7dbde3
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0fcb54b765d5b0c806961a1b8bdc3214f4fc0489fbe2c720c7312b23d2db5cf
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aaf41da8bd40bcccaff03238fa84745187c3a9d568a9b5f691e9996625af1de6
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2a30b2ad9b3632b41b5d2a70ad5aabce34a6f7a76a9e1e270a22f600a05ec22
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a50ddb223b7bd2b99f1b2554cda38ae044aac0f187628b6ded5c4d407979e294
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ee9cd8fd6ff53fdc84fbb7925a1d22d7707021b0e4b45ae16328680d2405512
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c6bf0de30b7a6e43c74608e8f1fa3b7d38bb356d58e402c397bc6ad56aa95795
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b60c5d4b71ffd198beb51d796fd8e27c367782bb1efc7c5f1065d3ed20df402
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd016a3c3e3ba2a5ae38a6d0f24920c1961e6c3882d668aaebde5a2d6e1459fb
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87c6f1afcb23fc820bb3d68d94d047f124b182adf1d874dcd0fa3a260a51bb2b
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4e5c265f62dd45b87e17d9c102ed3afb1ecc9d2d1466b032139f4181be9bfb9
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ebfc4481eb53675078ccf162293df1d6b7500f8ba0b2d00cad430e67f4a70a3
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c4aa89b4c5d338501a2c77924372d3acbefc23cb2b700c704822eb4c4c76c5fb
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97ffafe779a971f149a59a73318cc7969252e85b03c3f756e6cdd7e796033658
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.7766516329100582,
5
  "eval_steps": 1000,
6
- "global_step": 17500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -12393,6 +12393,1780 @@
12393
  "learning_rate": 9.969662066626956e-06,
12394
  "loss": 10.1914,
12395
  "step": 17500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12396
  }
12397
  ],
12398
  "logging_steps": 10,
@@ -12412,7 +14186,7 @@
12412
  "attributes": {}
12413
  }
12414
  },
12415
- "total_flos": 6.107015608795136e+18,
12416
  "train_batch_size": 4,
12417
  "trial_name": null,
12418
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.8876018661829237,
5
  "eval_steps": 1000,
6
+ "global_step": 20000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
12393
  "learning_rate": 9.969662066626956e-06,
12394
  "loss": 10.1914,
12395
  "step": 17500
12396
+ },
12397
+ {
12398
+ "epoch": 0.7770954338431496,
12399
+ "grad_norm": 66.09324645996094,
12400
+ "learning_rate": 9.969644730665029e-06,
12401
+ "loss": 10.7307,
12402
+ "step": 17510
12403
+ },
12404
+ {
12405
+ "epoch": 0.7775392347762411,
12406
+ "grad_norm": 64.93851470947266,
12407
+ "learning_rate": 9.969627394703102e-06,
12408
+ "loss": 10.0904,
12409
+ "step": 17520
12410
+ },
12411
+ {
12412
+ "epoch": 0.7779830357093326,
12413
+ "grad_norm": 72.18850708007812,
12414
+ "learning_rate": 9.969610058741173e-06,
12415
+ "loss": 10.9298,
12416
+ "step": 17530
12417
+ },
12418
+ {
12419
+ "epoch": 0.7784268366424241,
12420
+ "grad_norm": 74.83529663085938,
12421
+ "learning_rate": 9.969592722779246e-06,
12422
+ "loss": 10.3685,
12423
+ "step": 17540
12424
+ },
12425
+ {
12426
+ "epoch": 0.7788706375755156,
12427
+ "grad_norm": 69.4002456665039,
12428
+ "learning_rate": 9.96957538681732e-06,
12429
+ "loss": 10.3551,
12430
+ "step": 17550
12431
+ },
12432
+ {
12433
+ "epoch": 0.7793144385086069,
12434
+ "grad_norm": 76.83059692382812,
12435
+ "learning_rate": 9.969558050855391e-06,
12436
+ "loss": 10.6919,
12437
+ "step": 17560
12438
+ },
12439
+ {
12440
+ "epoch": 0.7797582394416984,
12441
+ "grad_norm": 67.19385528564453,
12442
+ "learning_rate": 9.969540714893464e-06,
12443
+ "loss": 10.5123,
12444
+ "step": 17570
12445
+ },
12446
+ {
12447
+ "epoch": 0.7802020403747899,
12448
+ "grad_norm": 80.98805236816406,
12449
+ "learning_rate": 9.969523378931537e-06,
12450
+ "loss": 10.7374,
12451
+ "step": 17580
12452
+ },
12453
+ {
12454
+ "epoch": 0.7806458413078814,
12455
+ "grad_norm": 67.6994400024414,
12456
+ "learning_rate": 9.96950604296961e-06,
12457
+ "loss": 10.768,
12458
+ "step": 17590
12459
+ },
12460
+ {
12461
+ "epoch": 0.7810896422409728,
12462
+ "grad_norm": 74.86246490478516,
12463
+ "learning_rate": 9.969488707007682e-06,
12464
+ "loss": 10.1819,
12465
+ "step": 17600
12466
+ },
12467
+ {
12468
+ "epoch": 0.7815334431740643,
12469
+ "grad_norm": 73.82647705078125,
12470
+ "learning_rate": 9.969471371045755e-06,
12471
+ "loss": 10.7857,
12472
+ "step": 17610
12473
+ },
12474
+ {
12475
+ "epoch": 0.7819772441071557,
12476
+ "grad_norm": 68.298095703125,
12477
+ "learning_rate": 9.969454035083828e-06,
12478
+ "loss": 10.6649,
12479
+ "step": 17620
12480
+ },
12481
+ {
12482
+ "epoch": 0.7824210450402472,
12483
+ "grad_norm": 68.30016326904297,
12484
+ "learning_rate": 9.969436699121899e-06,
12485
+ "loss": 10.3087,
12486
+ "step": 17630
12487
+ },
12488
+ {
12489
+ "epoch": 0.7828648459733386,
12490
+ "grad_norm": 67.62581634521484,
12491
+ "learning_rate": 9.969419363159972e-06,
12492
+ "loss": 9.9373,
12493
+ "step": 17640
12494
+ },
12495
+ {
12496
+ "epoch": 0.7833086469064301,
12497
+ "grad_norm": 78.02869415283203,
12498
+ "learning_rate": 9.969402027198045e-06,
12499
+ "loss": 10.7946,
12500
+ "step": 17650
12501
+ },
12502
+ {
12503
+ "epoch": 0.7837524478395216,
12504
+ "grad_norm": 64.94837951660156,
12505
+ "learning_rate": 9.969384691236117e-06,
12506
+ "loss": 10.1355,
12507
+ "step": 17660
12508
+ },
12509
+ {
12510
+ "epoch": 0.7841962487726131,
12511
+ "grad_norm": 65.18085479736328,
12512
+ "learning_rate": 9.96936735527419e-06,
12513
+ "loss": 10.4704,
12514
+ "step": 17670
12515
+ },
12516
+ {
12517
+ "epoch": 0.7846400497057046,
12518
+ "grad_norm": 70.01870727539062,
12519
+ "learning_rate": 9.969350019312263e-06,
12520
+ "loss": 10.6109,
12521
+ "step": 17680
12522
+ },
12523
+ {
12524
+ "epoch": 0.7850838506387959,
12525
+ "grad_norm": 65.34024047851562,
12526
+ "learning_rate": 9.969332683350334e-06,
12527
+ "loss": 10.3535,
12528
+ "step": 17690
12529
+ },
12530
+ {
12531
+ "epoch": 0.7855276515718874,
12532
+ "grad_norm": 79.30274200439453,
12533
+ "learning_rate": 9.969315347388407e-06,
12534
+ "loss": 10.3437,
12535
+ "step": 17700
12536
+ },
12537
+ {
12538
+ "epoch": 0.7859714525049789,
12539
+ "grad_norm": 67.2250747680664,
12540
+ "learning_rate": 9.96929801142648e-06,
12541
+ "loss": 10.4723,
12542
+ "step": 17710
12543
+ },
12544
+ {
12545
+ "epoch": 0.7864152534380704,
12546
+ "grad_norm": 68.58338928222656,
12547
+ "learning_rate": 9.969280675464552e-06,
12548
+ "loss": 10.1836,
12549
+ "step": 17720
12550
+ },
12551
+ {
12552
+ "epoch": 0.7868590543711618,
12553
+ "grad_norm": 70.7483901977539,
12554
+ "learning_rate": 9.969263339502625e-06,
12555
+ "loss": 11.1165,
12556
+ "step": 17730
12557
+ },
12558
+ {
12559
+ "epoch": 0.7873028553042533,
12560
+ "grad_norm": 62.17152404785156,
12561
+ "learning_rate": 9.969246003540698e-06,
12562
+ "loss": 10.2156,
12563
+ "step": 17740
12564
+ },
12565
+ {
12566
+ "epoch": 0.7877466562373447,
12567
+ "grad_norm": 69.29998016357422,
12568
+ "learning_rate": 9.96922866757877e-06,
12569
+ "loss": 10.6663,
12570
+ "step": 17750
12571
+ },
12572
+ {
12573
+ "epoch": 0.7881904571704362,
12574
+ "grad_norm": 64.87523651123047,
12575
+ "learning_rate": 9.969211331616842e-06,
12576
+ "loss": 10.9555,
12577
+ "step": 17760
12578
+ },
12579
+ {
12580
+ "epoch": 0.7886342581035276,
12581
+ "grad_norm": 66.5212173461914,
12582
+ "learning_rate": 9.969193995654915e-06,
12583
+ "loss": 10.5622,
12584
+ "step": 17770
12585
+ },
12586
+ {
12587
+ "epoch": 0.7890780590366191,
12588
+ "grad_norm": 68.40711975097656,
12589
+ "learning_rate": 9.969176659692987e-06,
12590
+ "loss": 10.3267,
12591
+ "step": 17780
12592
+ },
12593
+ {
12594
+ "epoch": 0.7895218599697106,
12595
+ "grad_norm": 70.49530029296875,
12596
+ "learning_rate": 9.96915932373106e-06,
12597
+ "loss": 10.6099,
12598
+ "step": 17790
12599
+ },
12600
+ {
12601
+ "epoch": 0.7899656609028021,
12602
+ "grad_norm": 68.68482971191406,
12603
+ "learning_rate": 9.969141987769133e-06,
12604
+ "loss": 10.5234,
12605
+ "step": 17800
12606
+ },
12607
+ {
12608
+ "epoch": 0.7904094618358936,
12609
+ "grad_norm": 73.87464904785156,
12610
+ "learning_rate": 9.969124651807206e-06,
12611
+ "loss": 10.7718,
12612
+ "step": 17810
12613
+ },
12614
+ {
12615
+ "epoch": 0.7908532627689849,
12616
+ "grad_norm": 65.32040405273438,
12617
+ "learning_rate": 9.969107315845277e-06,
12618
+ "loss": 10.347,
12619
+ "step": 17820
12620
+ },
12621
+ {
12622
+ "epoch": 0.7912970637020764,
12623
+ "grad_norm": 66.08610534667969,
12624
+ "learning_rate": 9.96908997988335e-06,
12625
+ "loss": 10.4261,
12626
+ "step": 17830
12627
+ },
12628
+ {
12629
+ "epoch": 0.7917408646351679,
12630
+ "grad_norm": 55.77153396606445,
12631
+ "learning_rate": 9.969072643921424e-06,
12632
+ "loss": 10.013,
12633
+ "step": 17840
12634
+ },
12635
+ {
12636
+ "epoch": 0.7921846655682594,
12637
+ "grad_norm": 72.32164764404297,
12638
+ "learning_rate": 9.969055307959495e-06,
12639
+ "loss": 10.5249,
12640
+ "step": 17850
12641
+ },
12642
+ {
12643
+ "epoch": 0.7926284665013508,
12644
+ "grad_norm": 73.15257263183594,
12645
+ "learning_rate": 9.969037971997568e-06,
12646
+ "loss": 10.86,
12647
+ "step": 17860
12648
+ },
12649
+ {
12650
+ "epoch": 0.7930722674344423,
12651
+ "grad_norm": 79.71673583984375,
12652
+ "learning_rate": 9.969020636035641e-06,
12653
+ "loss": 10.4343,
12654
+ "step": 17870
12655
+ },
12656
+ {
12657
+ "epoch": 0.7935160683675337,
12658
+ "grad_norm": 68.7105941772461,
12659
+ "learning_rate": 9.969003300073712e-06,
12660
+ "loss": 10.5593,
12661
+ "step": 17880
12662
+ },
12663
+ {
12664
+ "epoch": 0.7939598693006252,
12665
+ "grad_norm": 67.40699005126953,
12666
+ "learning_rate": 9.968985964111786e-06,
12667
+ "loss": 9.8434,
12668
+ "step": 17890
12669
+ },
12670
+ {
12671
+ "epoch": 0.7944036702337167,
12672
+ "grad_norm": 67.37322235107422,
12673
+ "learning_rate": 9.968968628149859e-06,
12674
+ "loss": 10.9941,
12675
+ "step": 17900
12676
+ },
12677
+ {
12678
+ "epoch": 0.7948474711668081,
12679
+ "grad_norm": 66.23359680175781,
12680
+ "learning_rate": 9.96895129218793e-06,
12681
+ "loss": 10.731,
12682
+ "step": 17910
12683
+ },
12684
+ {
12685
+ "epoch": 0.7952912720998996,
12686
+ "grad_norm": 65.20913696289062,
12687
+ "learning_rate": 9.968933956226003e-06,
12688
+ "loss": 10.537,
12689
+ "step": 17920
12690
+ },
12691
+ {
12692
+ "epoch": 0.7957350730329911,
12693
+ "grad_norm": 61.01829528808594,
12694
+ "learning_rate": 9.968916620264076e-06,
12695
+ "loss": 9.9038,
12696
+ "step": 17930
12697
+ },
12698
+ {
12699
+ "epoch": 0.7961788739660826,
12700
+ "grad_norm": 65.00994110107422,
12701
+ "learning_rate": 9.968899284302148e-06,
12702
+ "loss": 10.364,
12703
+ "step": 17940
12704
+ },
12705
+ {
12706
+ "epoch": 0.7966226748991739,
12707
+ "grad_norm": 72.74436950683594,
12708
+ "learning_rate": 9.96888194834022e-06,
12709
+ "loss": 10.5061,
12710
+ "step": 17950
12711
+ },
12712
+ {
12713
+ "epoch": 0.7970664758322654,
12714
+ "grad_norm": 75.44004821777344,
12715
+ "learning_rate": 9.968864612378294e-06,
12716
+ "loss": 10.646,
12717
+ "step": 17960
12718
+ },
12719
+ {
12720
+ "epoch": 0.7975102767653569,
12721
+ "grad_norm": 69.74024200439453,
12722
+ "learning_rate": 9.968847276416365e-06,
12723
+ "loss": 10.5158,
12724
+ "step": 17970
12725
+ },
12726
+ {
12727
+ "epoch": 0.7979540776984484,
12728
+ "grad_norm": 73.52366638183594,
12729
+ "learning_rate": 9.968829940454438e-06,
12730
+ "loss": 10.0728,
12731
+ "step": 17980
12732
+ },
12733
+ {
12734
+ "epoch": 0.7983978786315398,
12735
+ "grad_norm": 62.016883850097656,
12736
+ "learning_rate": 9.968812604492511e-06,
12737
+ "loss": 9.9823,
12738
+ "step": 17990
12739
+ },
12740
+ {
12741
+ "epoch": 0.7988416795646313,
12742
+ "grad_norm": 77.03231811523438,
12743
+ "learning_rate": 9.968795268530583e-06,
12744
+ "loss": 10.4617,
12745
+ "step": 18000
12746
+ },
12747
+ {
12748
+ "epoch": 0.7988416795646313,
12749
+ "eval_loss": 0.32583364844322205,
12750
+ "eval_runtime": 672.7495,
12751
+ "eval_samples_per_second": 1805.116,
12752
+ "eval_steps_per_second": 56.41,
12753
+ "step": 18000
12754
+ },
12755
+ {
12756
+ "epoch": 0.7992854804977227,
12757
+ "grad_norm": 62.19236755371094,
12758
+ "learning_rate": 9.968777932568656e-06,
12759
+ "loss": 10.465,
12760
+ "step": 18010
12761
+ },
12762
+ {
12763
+ "epoch": 0.7997292814308142,
12764
+ "grad_norm": 64.64230346679688,
12765
+ "learning_rate": 9.968760596606729e-06,
12766
+ "loss": 11.0006,
12767
+ "step": 18020
12768
+ },
12769
+ {
12770
+ "epoch": 0.8001730823639057,
12771
+ "grad_norm": 71.75637817382812,
12772
+ "learning_rate": 9.968743260644802e-06,
12773
+ "loss": 10.3129,
12774
+ "step": 18030
12775
+ },
12776
+ {
12777
+ "epoch": 0.8006168832969971,
12778
+ "grad_norm": 68.54603576660156,
12779
+ "learning_rate": 9.968725924682873e-06,
12780
+ "loss": 10.7563,
12781
+ "step": 18040
12782
+ },
12783
+ {
12784
+ "epoch": 0.8010606842300886,
12785
+ "grad_norm": 66.99270629882812,
12786
+ "learning_rate": 9.968708588720946e-06,
12787
+ "loss": 10.2859,
12788
+ "step": 18050
12789
+ },
12790
+ {
12791
+ "epoch": 0.8015044851631801,
12792
+ "grad_norm": 72.47330474853516,
12793
+ "learning_rate": 9.96869125275902e-06,
12794
+ "loss": 10.6183,
12795
+ "step": 18060
12796
+ },
12797
+ {
12798
+ "epoch": 0.8019482860962716,
12799
+ "grad_norm": 69.65715789794922,
12800
+ "learning_rate": 9.96867391679709e-06,
12801
+ "loss": 10.0611,
12802
+ "step": 18070
12803
+ },
12804
+ {
12805
+ "epoch": 0.8023920870293629,
12806
+ "grad_norm": 68.04208374023438,
12807
+ "learning_rate": 9.968656580835164e-06,
12808
+ "loss": 10.9295,
12809
+ "step": 18080
12810
+ },
12811
+ {
12812
+ "epoch": 0.8028358879624544,
12813
+ "grad_norm": 69.00373840332031,
12814
+ "learning_rate": 9.968639244873237e-06,
12815
+ "loss": 10.3371,
12816
+ "step": 18090
12817
+ },
12818
+ {
12819
+ "epoch": 0.8032796888955459,
12820
+ "grad_norm": 62.3974723815918,
12821
+ "learning_rate": 9.968621908911308e-06,
12822
+ "loss": 10.3889,
12823
+ "step": 18100
12824
+ },
12825
+ {
12826
+ "epoch": 0.8037234898286374,
12827
+ "grad_norm": 71.53374481201172,
12828
+ "learning_rate": 9.968604572949381e-06,
12829
+ "loss": 11.0522,
12830
+ "step": 18110
12831
+ },
12832
+ {
12833
+ "epoch": 0.8041672907617289,
12834
+ "grad_norm": 61.865089416503906,
12835
+ "learning_rate": 9.968587236987455e-06,
12836
+ "loss": 10.6471,
12837
+ "step": 18120
12838
+ },
12839
+ {
12840
+ "epoch": 0.8046110916948203,
12841
+ "grad_norm": 70.71613311767578,
12842
+ "learning_rate": 9.968569901025526e-06,
12843
+ "loss": 11.1749,
12844
+ "step": 18130
12845
+ },
12846
+ {
12847
+ "epoch": 0.8050548926279117,
12848
+ "grad_norm": 70.22577667236328,
12849
+ "learning_rate": 9.968552565063599e-06,
12850
+ "loss": 10.5659,
12851
+ "step": 18140
12852
+ },
12853
+ {
12854
+ "epoch": 0.8054986935610032,
12855
+ "grad_norm": 62.12384796142578,
12856
+ "learning_rate": 9.968535229101672e-06,
12857
+ "loss": 10.3108,
12858
+ "step": 18150
12859
+ },
12860
+ {
12861
+ "epoch": 0.8059424944940947,
12862
+ "grad_norm": 67.61980438232422,
12863
+ "learning_rate": 9.968517893139743e-06,
12864
+ "loss": 10.4519,
12865
+ "step": 18160
12866
+ },
12867
+ {
12868
+ "epoch": 0.8063862954271861,
12869
+ "grad_norm": 64.64510345458984,
12870
+ "learning_rate": 9.968500557177817e-06,
12871
+ "loss": 10.4827,
12872
+ "step": 18170
12873
+ },
12874
+ {
12875
+ "epoch": 0.8068300963602776,
12876
+ "grad_norm": 73.72003173828125,
12877
+ "learning_rate": 9.96848322121589e-06,
12878
+ "loss": 9.9235,
12879
+ "step": 18180
12880
+ },
12881
+ {
12882
+ "epoch": 0.8072738972933691,
12883
+ "grad_norm": 66.29298400878906,
12884
+ "learning_rate": 9.968465885253961e-06,
12885
+ "loss": 10.5438,
12886
+ "step": 18190
12887
+ },
12888
+ {
12889
+ "epoch": 0.8077176982264606,
12890
+ "grad_norm": 67.15369415283203,
12891
+ "learning_rate": 9.968448549292034e-06,
12892
+ "loss": 10.3155,
12893
+ "step": 18200
12894
+ },
12895
+ {
12896
+ "epoch": 0.8081614991595519,
12897
+ "grad_norm": 59.32017517089844,
12898
+ "learning_rate": 9.968431213330107e-06,
12899
+ "loss": 10.3011,
12900
+ "step": 18210
12901
+ },
12902
+ {
12903
+ "epoch": 0.8086053000926434,
12904
+ "grad_norm": 72.47640228271484,
12905
+ "learning_rate": 9.96841387736818e-06,
12906
+ "loss": 10.4339,
12907
+ "step": 18220
12908
+ },
12909
+ {
12910
+ "epoch": 0.8090491010257349,
12911
+ "grad_norm": 68.0174789428711,
12912
+ "learning_rate": 9.968396541406252e-06,
12913
+ "loss": 10.1946,
12914
+ "step": 18230
12915
+ },
12916
+ {
12917
+ "epoch": 0.8094929019588264,
12918
+ "grad_norm": 62.51362228393555,
12919
+ "learning_rate": 9.968379205444325e-06,
12920
+ "loss": 10.5557,
12921
+ "step": 18240
12922
+ },
12923
+ {
12924
+ "epoch": 0.8099367028919179,
12925
+ "grad_norm": 72.3086929321289,
12926
+ "learning_rate": 9.968361869482398e-06,
12927
+ "loss": 10.1581,
12928
+ "step": 18250
12929
+ },
12930
+ {
12931
+ "epoch": 0.8103805038250093,
12932
+ "grad_norm": 74.7848892211914,
12933
+ "learning_rate": 9.968344533520469e-06,
12934
+ "loss": 10.6374,
12935
+ "step": 18260
12936
+ },
12937
+ {
12938
+ "epoch": 0.8108243047581007,
12939
+ "grad_norm": 60.53010177612305,
12940
+ "learning_rate": 9.968327197558542e-06,
12941
+ "loss": 10.5352,
12942
+ "step": 18270
12943
+ },
12944
+ {
12945
+ "epoch": 0.8112681056911922,
12946
+ "grad_norm": 71.8178482055664,
12947
+ "learning_rate": 9.968309861596615e-06,
12948
+ "loss": 10.5353,
12949
+ "step": 18280
12950
+ },
12951
+ {
12952
+ "epoch": 0.8117119066242837,
12953
+ "grad_norm": 58.963165283203125,
12954
+ "learning_rate": 9.968292525634687e-06,
12955
+ "loss": 10.3527,
12956
+ "step": 18290
12957
+ },
12958
+ {
12959
+ "epoch": 0.8121557075573751,
12960
+ "grad_norm": 64.75,
12961
+ "learning_rate": 9.96827518967276e-06,
12962
+ "loss": 11.0742,
12963
+ "step": 18300
12964
+ },
12965
+ {
12966
+ "epoch": 0.8125995084904666,
12967
+ "grad_norm": 61.654296875,
12968
+ "learning_rate": 9.968257853710833e-06,
12969
+ "loss": 9.8275,
12970
+ "step": 18310
12971
+ },
12972
+ {
12973
+ "epoch": 0.8130433094235581,
12974
+ "grad_norm": 63.89625930786133,
12975
+ "learning_rate": 9.968240517748904e-06,
12976
+ "loss": 10.3494,
12977
+ "step": 18320
12978
+ },
12979
+ {
12980
+ "epoch": 0.8134871103566496,
12981
+ "grad_norm": 69.73605346679688,
12982
+ "learning_rate": 9.968223181786977e-06,
12983
+ "loss": 10.4358,
12984
+ "step": 18330
12985
+ },
12986
+ {
12987
+ "epoch": 0.8139309112897409,
12988
+ "grad_norm": 69.21589660644531,
12989
+ "learning_rate": 9.96820584582505e-06,
12990
+ "loss": 10.3777,
12991
+ "step": 18340
12992
+ },
12993
+ {
12994
+ "epoch": 0.8143747122228324,
12995
+ "grad_norm": 68.85872650146484,
12996
+ "learning_rate": 9.968188509863122e-06,
12997
+ "loss": 10.7136,
12998
+ "step": 18350
12999
+ },
13000
+ {
13001
+ "epoch": 0.8148185131559239,
13002
+ "grad_norm": 63.11106491088867,
13003
+ "learning_rate": 9.968171173901195e-06,
13004
+ "loss": 11.0215,
13005
+ "step": 18360
13006
+ },
13007
+ {
13008
+ "epoch": 0.8152623140890154,
13009
+ "grad_norm": 56.74385070800781,
13010
+ "learning_rate": 9.968153837939268e-06,
13011
+ "loss": 10.032,
13012
+ "step": 18370
13013
+ },
13014
+ {
13015
+ "epoch": 0.8157061150221069,
13016
+ "grad_norm": 65.63390350341797,
13017
+ "learning_rate": 9.968136501977341e-06,
13018
+ "loss": 10.615,
13019
+ "step": 18380
13020
+ },
13021
+ {
13022
+ "epoch": 0.8161499159551983,
13023
+ "grad_norm": 58.63720703125,
13024
+ "learning_rate": 9.968119166015412e-06,
13025
+ "loss": 10.3503,
13026
+ "step": 18390
13027
+ },
13028
+ {
13029
+ "epoch": 0.8165937168882897,
13030
+ "grad_norm": 60.3001708984375,
13031
+ "learning_rate": 9.968101830053485e-06,
13032
+ "loss": 10.2183,
13033
+ "step": 18400
13034
+ },
13035
+ {
13036
+ "epoch": 0.8170375178213812,
13037
+ "grad_norm": 68.03216552734375,
13038
+ "learning_rate": 9.968084494091559e-06,
13039
+ "loss": 10.5718,
13040
+ "step": 18410
13041
+ },
13042
+ {
13043
+ "epoch": 0.8174813187544727,
13044
+ "grad_norm": 71.72623443603516,
13045
+ "learning_rate": 9.96806715812963e-06,
13046
+ "loss": 10.7557,
13047
+ "step": 18420
13048
+ },
13049
+ {
13050
+ "epoch": 0.8179251196875641,
13051
+ "grad_norm": 69.74810791015625,
13052
+ "learning_rate": 9.968049822167703e-06,
13053
+ "loss": 10.1841,
13054
+ "step": 18430
13055
+ },
13056
+ {
13057
+ "epoch": 0.8183689206206556,
13058
+ "grad_norm": 58.47687530517578,
13059
+ "learning_rate": 9.968032486205776e-06,
13060
+ "loss": 10.0264,
13061
+ "step": 18440
13062
+ },
13063
+ {
13064
+ "epoch": 0.8188127215537471,
13065
+ "grad_norm": 67.85263061523438,
13066
+ "learning_rate": 9.968015150243847e-06,
13067
+ "loss": 10.3225,
13068
+ "step": 18450
13069
+ },
13070
+ {
13071
+ "epoch": 0.8192565224868386,
13072
+ "grad_norm": 67.8355712890625,
13073
+ "learning_rate": 9.96799781428192e-06,
13074
+ "loss": 9.9914,
13075
+ "step": 18460
13076
+ },
13077
+ {
13078
+ "epoch": 0.81970032341993,
13079
+ "grad_norm": 74.0328140258789,
13080
+ "learning_rate": 9.967980478319994e-06,
13081
+ "loss": 10.3044,
13082
+ "step": 18470
13083
+ },
13084
+ {
13085
+ "epoch": 0.8201441243530214,
13086
+ "grad_norm": 75.59931945800781,
13087
+ "learning_rate": 9.967963142358065e-06,
13088
+ "loss": 10.6398,
13089
+ "step": 18480
13090
+ },
13091
+ {
13092
+ "epoch": 0.8205879252861129,
13093
+ "grad_norm": 59.03470230102539,
13094
+ "learning_rate": 9.967945806396138e-06,
13095
+ "loss": 10.1309,
13096
+ "step": 18490
13097
+ },
13098
+ {
13099
+ "epoch": 0.8210317262192044,
13100
+ "grad_norm": 63.74763107299805,
13101
+ "learning_rate": 9.967928470434211e-06,
13102
+ "loss": 9.8492,
13103
+ "step": 18500
13104
+ },
13105
+ {
13106
+ "epoch": 0.8214755271522959,
13107
+ "grad_norm": 58.71684265136719,
13108
+ "learning_rate": 9.967911134472284e-06,
13109
+ "loss": 10.0112,
13110
+ "step": 18510
13111
+ },
13112
+ {
13113
+ "epoch": 0.8219193280853873,
13114
+ "grad_norm": 70.03022003173828,
13115
+ "learning_rate": 9.967893798510356e-06,
13116
+ "loss": 10.6968,
13117
+ "step": 18520
13118
+ },
13119
+ {
13120
+ "epoch": 0.8223631290184787,
13121
+ "grad_norm": 61.144004821777344,
13122
+ "learning_rate": 9.967876462548429e-06,
13123
+ "loss": 10.3381,
13124
+ "step": 18530
13125
+ },
13126
+ {
13127
+ "epoch": 0.8228069299515702,
13128
+ "grad_norm": 67.76824188232422,
13129
+ "learning_rate": 9.967859126586502e-06,
13130
+ "loss": 10.2819,
13131
+ "step": 18540
13132
+ },
13133
+ {
13134
+ "epoch": 0.8232507308846617,
13135
+ "grad_norm": 64.85346221923828,
13136
+ "learning_rate": 9.967841790624573e-06,
13137
+ "loss": 10.7358,
13138
+ "step": 18550
13139
+ },
13140
+ {
13141
+ "epoch": 0.8236945318177531,
13142
+ "grad_norm": 64.5184326171875,
13143
+ "learning_rate": 9.967824454662646e-06,
13144
+ "loss": 10.49,
13145
+ "step": 18560
13146
+ },
13147
+ {
13148
+ "epoch": 0.8241383327508446,
13149
+ "grad_norm": 69.41261291503906,
13150
+ "learning_rate": 9.96780711870072e-06,
13151
+ "loss": 10.6639,
13152
+ "step": 18570
13153
+ },
13154
+ {
13155
+ "epoch": 0.8245821336839361,
13156
+ "grad_norm": 67.25212097167969,
13157
+ "learning_rate": 9.96778978273879e-06,
13158
+ "loss": 10.5115,
13159
+ "step": 18580
13160
+ },
13161
+ {
13162
+ "epoch": 0.8250259346170276,
13163
+ "grad_norm": 62.52476501464844,
13164
+ "learning_rate": 9.967772446776864e-06,
13165
+ "loss": 10.0552,
13166
+ "step": 18590
13167
+ },
13168
+ {
13169
+ "epoch": 0.825469735550119,
13170
+ "grad_norm": 62.43718719482422,
13171
+ "learning_rate": 9.967755110814937e-06,
13172
+ "loss": 10.2405,
13173
+ "step": 18600
13174
+ },
13175
+ {
13176
+ "epoch": 0.8259135364832104,
13177
+ "grad_norm": 67.1116714477539,
13178
+ "learning_rate": 9.96773777485301e-06,
13179
+ "loss": 10.4917,
13180
+ "step": 18610
13181
+ },
13182
+ {
13183
+ "epoch": 0.8263573374163019,
13184
+ "grad_norm": 67.36260986328125,
13185
+ "learning_rate": 9.967720438891081e-06,
13186
+ "loss": 9.8809,
13187
+ "step": 18620
13188
+ },
13189
+ {
13190
+ "epoch": 0.8268011383493934,
13191
+ "grad_norm": 69.18153381347656,
13192
+ "learning_rate": 9.967703102929154e-06,
13193
+ "loss": 10.0411,
13194
+ "step": 18630
13195
+ },
13196
+ {
13197
+ "epoch": 0.8272449392824849,
13198
+ "grad_norm": 54.77642059326172,
13199
+ "learning_rate": 9.967685766967227e-06,
13200
+ "loss": 10.2812,
13201
+ "step": 18640
13202
+ },
13203
+ {
13204
+ "epoch": 0.8276887402155763,
13205
+ "grad_norm": 64.23429107666016,
13206
+ "learning_rate": 9.967668431005299e-06,
13207
+ "loss": 10.2792,
13208
+ "step": 18650
13209
+ },
13210
+ {
13211
+ "epoch": 0.8281325411486677,
13212
+ "grad_norm": 77.61302185058594,
13213
+ "learning_rate": 9.967651095043372e-06,
13214
+ "loss": 10.5122,
13215
+ "step": 18660
13216
+ },
13217
+ {
13218
+ "epoch": 0.8285763420817592,
13219
+ "grad_norm": 63.884666442871094,
13220
+ "learning_rate": 9.967633759081445e-06,
13221
+ "loss": 10.5592,
13222
+ "step": 18670
13223
+ },
13224
+ {
13225
+ "epoch": 0.8290201430148507,
13226
+ "grad_norm": 68.2164077758789,
13227
+ "learning_rate": 9.967616423119516e-06,
13228
+ "loss": 10.0031,
13229
+ "step": 18680
13230
+ },
13231
+ {
13232
+ "epoch": 0.8294639439479422,
13233
+ "grad_norm": 64.70232391357422,
13234
+ "learning_rate": 9.96759908715759e-06,
13235
+ "loss": 10.0672,
13236
+ "step": 18690
13237
+ },
13238
+ {
13239
+ "epoch": 0.8299077448810336,
13240
+ "grad_norm": 70.52904510498047,
13241
+ "learning_rate": 9.967581751195663e-06,
13242
+ "loss": 10.8457,
13243
+ "step": 18700
13244
+ },
13245
+ {
13246
+ "epoch": 0.8303515458141251,
13247
+ "grad_norm": 74.24815368652344,
13248
+ "learning_rate": 9.967564415233734e-06,
13249
+ "loss": 10.2618,
13250
+ "step": 18710
13251
+ },
13252
+ {
13253
+ "epoch": 0.8307953467472166,
13254
+ "grad_norm": 70.21379852294922,
13255
+ "learning_rate": 9.967547079271807e-06,
13256
+ "loss": 10.6413,
13257
+ "step": 18720
13258
+ },
13259
+ {
13260
+ "epoch": 0.831239147680308,
13261
+ "grad_norm": 59.27021408081055,
13262
+ "learning_rate": 9.96752974330988e-06,
13263
+ "loss": 10.0213,
13264
+ "step": 18730
13265
+ },
13266
+ {
13267
+ "epoch": 0.8316829486133994,
13268
+ "grad_norm": 68.8056869506836,
13269
+ "learning_rate": 9.967512407347953e-06,
13270
+ "loss": 10.6691,
13271
+ "step": 18740
13272
+ },
13273
+ {
13274
+ "epoch": 0.8321267495464909,
13275
+ "grad_norm": 59.90221405029297,
13276
+ "learning_rate": 9.967495071386025e-06,
13277
+ "loss": 10.2026,
13278
+ "step": 18750
13279
+ },
13280
+ {
13281
+ "epoch": 0.8325705504795824,
13282
+ "grad_norm": 59.39807891845703,
13283
+ "learning_rate": 9.967477735424098e-06,
13284
+ "loss": 10.296,
13285
+ "step": 18760
13286
+ },
13287
+ {
13288
+ "epoch": 0.8330143514126739,
13289
+ "grad_norm": 60.97962951660156,
13290
+ "learning_rate": 9.96746039946217e-06,
13291
+ "loss": 10.4608,
13292
+ "step": 18770
13293
+ },
13294
+ {
13295
+ "epoch": 0.8334581523457653,
13296
+ "grad_norm": 69.33479309082031,
13297
+ "learning_rate": 9.967443063500242e-06,
13298
+ "loss": 10.1023,
13299
+ "step": 18780
13300
+ },
13301
+ {
13302
+ "epoch": 0.8339019532788567,
13303
+ "grad_norm": 62.024993896484375,
13304
+ "learning_rate": 9.967425727538315e-06,
13305
+ "loss": 10.1188,
13306
+ "step": 18790
13307
+ },
13308
+ {
13309
+ "epoch": 0.8343457542119482,
13310
+ "grad_norm": 70.13167572021484,
13311
+ "learning_rate": 9.967408391576388e-06,
13312
+ "loss": 10.5289,
13313
+ "step": 18800
13314
+ },
13315
+ {
13316
+ "epoch": 0.8347895551450397,
13317
+ "grad_norm": 59.78411865234375,
13318
+ "learning_rate": 9.96739105561446e-06,
13319
+ "loss": 10.0232,
13320
+ "step": 18810
13321
+ },
13322
+ {
13323
+ "epoch": 0.8352333560781312,
13324
+ "grad_norm": 65.34579467773438,
13325
+ "learning_rate": 9.967373719652533e-06,
13326
+ "loss": 10.3009,
13327
+ "step": 18820
13328
+ },
13329
+ {
13330
+ "epoch": 0.8356771570112226,
13331
+ "grad_norm": 66.02912902832031,
13332
+ "learning_rate": 9.967356383690606e-06,
13333
+ "loss": 10.4396,
13334
+ "step": 18830
13335
+ },
13336
+ {
13337
+ "epoch": 0.8361209579443141,
13338
+ "grad_norm": 64.6055679321289,
13339
+ "learning_rate": 9.967339047728677e-06,
13340
+ "loss": 11.1638,
13341
+ "step": 18840
13342
+ },
13343
+ {
13344
+ "epoch": 0.8365647588774056,
13345
+ "grad_norm": 68.47040557861328,
13346
+ "learning_rate": 9.96732171176675e-06,
13347
+ "loss": 10.3824,
13348
+ "step": 18850
13349
+ },
13350
+ {
13351
+ "epoch": 0.837008559810497,
13352
+ "grad_norm": 70.76081848144531,
13353
+ "learning_rate": 9.967304375804823e-06,
13354
+ "loss": 10.4553,
13355
+ "step": 18860
13356
+ },
13357
+ {
13358
+ "epoch": 0.8374523607435884,
13359
+ "grad_norm": 69.01679229736328,
13360
+ "learning_rate": 9.967287039842896e-06,
13361
+ "loss": 10.7882,
13362
+ "step": 18870
13363
+ },
13364
+ {
13365
+ "epoch": 0.8378961616766799,
13366
+ "grad_norm": 72.1138687133789,
13367
+ "learning_rate": 9.967269703880968e-06,
13368
+ "loss": 10.7932,
13369
+ "step": 18880
13370
+ },
13371
+ {
13372
+ "epoch": 0.8383399626097714,
13373
+ "grad_norm": 63.26852035522461,
13374
+ "learning_rate": 9.967252367919041e-06,
13375
+ "loss": 9.8872,
13376
+ "step": 18890
13377
+ },
13378
+ {
13379
+ "epoch": 0.8387837635428629,
13380
+ "grad_norm": 74.27698516845703,
13381
+ "learning_rate": 9.967235031957114e-06,
13382
+ "loss": 10.3047,
13383
+ "step": 18900
13384
+ },
13385
+ {
13386
+ "epoch": 0.8392275644759543,
13387
+ "grad_norm": 69.13713073730469,
13388
+ "learning_rate": 9.967217695995185e-06,
13389
+ "loss": 10.4936,
13390
+ "step": 18910
13391
+ },
13392
+ {
13393
+ "epoch": 0.8396713654090457,
13394
+ "grad_norm": 66.47625732421875,
13395
+ "learning_rate": 9.967200360033258e-06,
13396
+ "loss": 10.6965,
13397
+ "step": 18920
13398
+ },
13399
+ {
13400
+ "epoch": 0.8401151663421372,
13401
+ "grad_norm": 62.18655776977539,
13402
+ "learning_rate": 9.967183024071331e-06,
13403
+ "loss": 10.3633,
13404
+ "step": 18930
13405
+ },
13406
+ {
13407
+ "epoch": 0.8405589672752287,
13408
+ "grad_norm": 60.379478454589844,
13409
+ "learning_rate": 9.967165688109403e-06,
13410
+ "loss": 10.4711,
13411
+ "step": 18940
13412
+ },
13413
+ {
13414
+ "epoch": 0.8410027682083202,
13415
+ "grad_norm": 62.18358612060547,
13416
+ "learning_rate": 9.967148352147476e-06,
13417
+ "loss": 10.3252,
13418
+ "step": 18950
13419
+ },
13420
+ {
13421
+ "epoch": 0.8414465691414116,
13422
+ "grad_norm": 69.07564544677734,
13423
+ "learning_rate": 9.967131016185549e-06,
13424
+ "loss": 10.785,
13425
+ "step": 18960
13426
+ },
13427
+ {
13428
+ "epoch": 0.8418903700745031,
13429
+ "grad_norm": 67.07147216796875,
13430
+ "learning_rate": 9.96711368022362e-06,
13431
+ "loss": 10.6486,
13432
+ "step": 18970
13433
+ },
13434
+ {
13435
+ "epoch": 0.8423341710075946,
13436
+ "grad_norm": 68.17425537109375,
13437
+ "learning_rate": 9.967096344261693e-06,
13438
+ "loss": 10.2245,
13439
+ "step": 18980
13440
+ },
13441
+ {
13442
+ "epoch": 0.842777971940686,
13443
+ "grad_norm": 62.00086975097656,
13444
+ "learning_rate": 9.967079008299767e-06,
13445
+ "loss": 10.4259,
13446
+ "step": 18990
13447
+ },
13448
+ {
13449
+ "epoch": 0.8432217728737774,
13450
+ "grad_norm": 68.44880676269531,
13451
+ "learning_rate": 9.96706167233784e-06,
13452
+ "loss": 10.5632,
13453
+ "step": 19000
13454
+ },
13455
+ {
13456
+ "epoch": 0.8432217728737774,
13457
+ "eval_loss": 0.3246602714061737,
13458
+ "eval_runtime": 673.5905,
13459
+ "eval_samples_per_second": 1802.863,
13460
+ "eval_steps_per_second": 56.34,
13461
+ "step": 19000
13462
+ },
13463
+ {
13464
+ "epoch": 0.8436655738068689,
13465
+ "grad_norm": 67.45813751220703,
13466
+ "learning_rate": 9.967044336375911e-06,
13467
+ "loss": 10.3405,
13468
+ "step": 19010
13469
+ },
13470
+ {
13471
+ "epoch": 0.8441093747399604,
13472
+ "grad_norm": 71.77626037597656,
13473
+ "learning_rate": 9.967027000413984e-06,
13474
+ "loss": 10.9004,
13475
+ "step": 19020
13476
+ },
13477
+ {
13478
+ "epoch": 0.8445531756730519,
13479
+ "grad_norm": 63.87392044067383,
13480
+ "learning_rate": 9.967009664452057e-06,
13481
+ "loss": 10.5393,
13482
+ "step": 19030
13483
+ },
13484
+ {
13485
+ "epoch": 0.8449969766061434,
13486
+ "grad_norm": 62.10248947143555,
13487
+ "learning_rate": 9.966992328490129e-06,
13488
+ "loss": 10.1483,
13489
+ "step": 19040
13490
+ },
13491
+ {
13492
+ "epoch": 0.8454407775392347,
13493
+ "grad_norm": 58.07029342651367,
13494
+ "learning_rate": 9.966974992528202e-06,
13495
+ "loss": 10.4056,
13496
+ "step": 19050
13497
+ },
13498
+ {
13499
+ "epoch": 0.8458845784723262,
13500
+ "grad_norm": 69.88272094726562,
13501
+ "learning_rate": 9.966957656566275e-06,
13502
+ "loss": 10.3336,
13503
+ "step": 19060
13504
+ },
13505
+ {
13506
+ "epoch": 0.8463283794054177,
13507
+ "grad_norm": 57.19210433959961,
13508
+ "learning_rate": 9.966940320604346e-06,
13509
+ "loss": 10.5325,
13510
+ "step": 19070
13511
+ },
13512
+ {
13513
+ "epoch": 0.8467721803385092,
13514
+ "grad_norm": 68.29473876953125,
13515
+ "learning_rate": 9.96692298464242e-06,
13516
+ "loss": 10.5171,
13517
+ "step": 19080
13518
+ },
13519
+ {
13520
+ "epoch": 0.8472159812716006,
13521
+ "grad_norm": 61.379425048828125,
13522
+ "learning_rate": 9.966905648680492e-06,
13523
+ "loss": 9.9401,
13524
+ "step": 19090
13525
+ },
13526
+ {
13527
+ "epoch": 0.8476597822046921,
13528
+ "grad_norm": 68.13114929199219,
13529
+ "learning_rate": 9.966888312718564e-06,
13530
+ "loss": 10.4374,
13531
+ "step": 19100
13532
+ },
13533
+ {
13534
+ "epoch": 0.8481035831377836,
13535
+ "grad_norm": 64.97882843017578,
13536
+ "learning_rate": 9.966870976756637e-06,
13537
+ "loss": 10.4241,
13538
+ "step": 19110
13539
+ },
13540
+ {
13541
+ "epoch": 0.848547384070875,
13542
+ "grad_norm": 66.36862182617188,
13543
+ "learning_rate": 9.96685364079471e-06,
13544
+ "loss": 10.3443,
13545
+ "step": 19120
13546
+ },
13547
+ {
13548
+ "epoch": 0.8489911850039664,
13549
+ "grad_norm": 68.75626373291016,
13550
+ "learning_rate": 9.966836304832783e-06,
13551
+ "loss": 10.2445,
13552
+ "step": 19130
13553
+ },
13554
+ {
13555
+ "epoch": 0.8494349859370579,
13556
+ "grad_norm": 69.54931640625,
13557
+ "learning_rate": 9.966818968870854e-06,
13558
+ "loss": 10.1782,
13559
+ "step": 19140
13560
+ },
13561
+ {
13562
+ "epoch": 0.8498787868701494,
13563
+ "grad_norm": 62.799842834472656,
13564
+ "learning_rate": 9.966801632908927e-06,
13565
+ "loss": 9.9597,
13566
+ "step": 19150
13567
+ },
13568
+ {
13569
+ "epoch": 0.8503225878032409,
13570
+ "grad_norm": 72.54212951660156,
13571
+ "learning_rate": 9.966784296947e-06,
13572
+ "loss": 10.6076,
13573
+ "step": 19160
13574
+ },
13575
+ {
13576
+ "epoch": 0.8507663887363324,
13577
+ "grad_norm": 66.57682037353516,
13578
+ "learning_rate": 9.966766960985072e-06,
13579
+ "loss": 10.3762,
13580
+ "step": 19170
13581
+ },
13582
+ {
13583
+ "epoch": 0.8512101896694237,
13584
+ "grad_norm": 59.173683166503906,
13585
+ "learning_rate": 9.966749625023145e-06,
13586
+ "loss": 10.1135,
13587
+ "step": 19180
13588
+ },
13589
+ {
13590
+ "epoch": 0.8516539906025152,
13591
+ "grad_norm": 74.29920959472656,
13592
+ "learning_rate": 9.966732289061218e-06,
13593
+ "loss": 10.5823,
13594
+ "step": 19190
13595
+ },
13596
+ {
13597
+ "epoch": 0.8520977915356067,
13598
+ "grad_norm": 65.05313873291016,
13599
+ "learning_rate": 9.96671495309929e-06,
13600
+ "loss": 10.5004,
13601
+ "step": 19200
13602
+ },
13603
+ {
13604
+ "epoch": 0.8525415924686982,
13605
+ "grad_norm": 69.46266174316406,
13606
+ "learning_rate": 9.966697617137362e-06,
13607
+ "loss": 10.02,
13608
+ "step": 19210
13609
+ },
13610
+ {
13611
+ "epoch": 0.8529853934017896,
13612
+ "grad_norm": 64.3421859741211,
13613
+ "learning_rate": 9.966680281175436e-06,
13614
+ "loss": 10.7279,
13615
+ "step": 19220
13616
+ },
13617
+ {
13618
+ "epoch": 0.8534291943348811,
13619
+ "grad_norm": 69.0867919921875,
13620
+ "learning_rate": 9.966662945213507e-06,
13621
+ "loss": 10.4257,
13622
+ "step": 19230
13623
+ },
13624
+ {
13625
+ "epoch": 0.8538729952679726,
13626
+ "grad_norm": 70.24497985839844,
13627
+ "learning_rate": 9.96664560925158e-06,
13628
+ "loss": 10.169,
13629
+ "step": 19240
13630
+ },
13631
+ {
13632
+ "epoch": 0.854316796201064,
13633
+ "grad_norm": 67.85358428955078,
13634
+ "learning_rate": 9.966628273289653e-06,
13635
+ "loss": 10.3167,
13636
+ "step": 19250
13637
+ },
13638
+ {
13639
+ "epoch": 0.8547605971341554,
13640
+ "grad_norm": 70.9292221069336,
13641
+ "learning_rate": 9.966610937327724e-06,
13642
+ "loss": 11.0968,
13643
+ "step": 19260
13644
+ },
13645
+ {
13646
+ "epoch": 0.8552043980672469,
13647
+ "grad_norm": 71.09864044189453,
13648
+ "learning_rate": 9.966593601365798e-06,
13649
+ "loss": 10.5742,
13650
+ "step": 19270
13651
+ },
13652
+ {
13653
+ "epoch": 0.8556481990003384,
13654
+ "grad_norm": 69.86164093017578,
13655
+ "learning_rate": 9.96657626540387e-06,
13656
+ "loss": 10.5123,
13657
+ "step": 19280
13658
+ },
13659
+ {
13660
+ "epoch": 0.8560919999334299,
13661
+ "grad_norm": 56.01103210449219,
13662
+ "learning_rate": 9.966558929441942e-06,
13663
+ "loss": 10.0181,
13664
+ "step": 19290
13665
+ },
13666
+ {
13667
+ "epoch": 0.8565358008665214,
13668
+ "grad_norm": 70.41612243652344,
13669
+ "learning_rate": 9.966541593480015e-06,
13670
+ "loss": 10.4778,
13671
+ "step": 19300
13672
+ },
13673
+ {
13674
+ "epoch": 0.8569796017996127,
13675
+ "grad_norm": 66.11145782470703,
13676
+ "learning_rate": 9.966524257518088e-06,
13677
+ "loss": 10.7167,
13678
+ "step": 19310
13679
+ },
13680
+ {
13681
+ "epoch": 0.8574234027327042,
13682
+ "grad_norm": 72.80441284179688,
13683
+ "learning_rate": 9.966506921556161e-06,
13684
+ "loss": 10.3861,
13685
+ "step": 19320
13686
+ },
13687
+ {
13688
+ "epoch": 0.8578672036657957,
13689
+ "grad_norm": 62.77549362182617,
13690
+ "learning_rate": 9.966489585594233e-06,
13691
+ "loss": 10.2368,
13692
+ "step": 19330
13693
+ },
13694
+ {
13695
+ "epoch": 0.8583110045988872,
13696
+ "grad_norm": 68.18376922607422,
13697
+ "learning_rate": 9.966472249632306e-06,
13698
+ "loss": 10.3437,
13699
+ "step": 19340
13700
+ },
13701
+ {
13702
+ "epoch": 0.8587548055319786,
13703
+ "grad_norm": 66.24810028076172,
13704
+ "learning_rate": 9.966454913670379e-06,
13705
+ "loss": 10.0901,
13706
+ "step": 19350
13707
+ },
13708
+ {
13709
+ "epoch": 0.8591986064650701,
13710
+ "grad_norm": 68.41353607177734,
13711
+ "learning_rate": 9.96643757770845e-06,
13712
+ "loss": 10.6208,
13713
+ "step": 19360
13714
+ },
13715
+ {
13716
+ "epoch": 0.8596424073981616,
13717
+ "grad_norm": 61.160438537597656,
13718
+ "learning_rate": 9.966420241746523e-06,
13719
+ "loss": 10.2203,
13720
+ "step": 19370
13721
+ },
13722
+ {
13723
+ "epoch": 0.860086208331253,
13724
+ "grad_norm": 66.53337097167969,
13725
+ "learning_rate": 9.966402905784596e-06,
13726
+ "loss": 10.4119,
13727
+ "step": 19380
13728
+ },
13729
+ {
13730
+ "epoch": 0.8605300092643445,
13731
+ "grad_norm": 74.49799346923828,
13732
+ "learning_rate": 9.966385569822668e-06,
13733
+ "loss": 10.1698,
13734
+ "step": 19390
13735
+ },
13736
+ {
13737
+ "epoch": 0.8609738101974359,
13738
+ "grad_norm": 76.49808502197266,
13739
+ "learning_rate": 9.96636823386074e-06,
13740
+ "loss": 10.356,
13741
+ "step": 19400
13742
+ },
13743
+ {
13744
+ "epoch": 0.8614176111305274,
13745
+ "grad_norm": 72.61251068115234,
13746
+ "learning_rate": 9.966350897898814e-06,
13747
+ "loss": 10.0123,
13748
+ "step": 19410
13749
+ },
13750
+ {
13751
+ "epoch": 0.8618614120636189,
13752
+ "grad_norm": 69.5442123413086,
13753
+ "learning_rate": 9.966333561936885e-06,
13754
+ "loss": 10.7127,
13755
+ "step": 19420
13756
+ },
13757
+ {
13758
+ "epoch": 0.8623052129967104,
13759
+ "grad_norm": 75.41436767578125,
13760
+ "learning_rate": 9.966316225974958e-06,
13761
+ "loss": 10.2879,
13762
+ "step": 19430
13763
+ },
13764
+ {
13765
+ "epoch": 0.8627490139298017,
13766
+ "grad_norm": 62.93849563598633,
13767
+ "learning_rate": 9.966298890013031e-06,
13768
+ "loss": 10.1251,
13769
+ "step": 19440
13770
+ },
13771
+ {
13772
+ "epoch": 0.8631928148628932,
13773
+ "grad_norm": 61.55092239379883,
13774
+ "learning_rate": 9.966281554051103e-06,
13775
+ "loss": 10.4005,
13776
+ "step": 19450
13777
+ },
13778
+ {
13779
+ "epoch": 0.8636366157959847,
13780
+ "grad_norm": 64.35807037353516,
13781
+ "learning_rate": 9.966264218089176e-06,
13782
+ "loss": 10.3085,
13783
+ "step": 19460
13784
+ },
13785
+ {
13786
+ "epoch": 0.8640804167290762,
13787
+ "grad_norm": 63.883033752441406,
13788
+ "learning_rate": 9.966246882127249e-06,
13789
+ "loss": 10.7861,
13790
+ "step": 19470
13791
+ },
13792
+ {
13793
+ "epoch": 0.8645242176621676,
13794
+ "grad_norm": 62.51860809326172,
13795
+ "learning_rate": 9.96622954616532e-06,
13796
+ "loss": 10.5587,
13797
+ "step": 19480
13798
+ },
13799
+ {
13800
+ "epoch": 0.8649680185952591,
13801
+ "grad_norm": 67.6877212524414,
13802
+ "learning_rate": 9.966212210203393e-06,
13803
+ "loss": 10.5259,
13804
+ "step": 19490
13805
+ },
13806
+ {
13807
+ "epoch": 0.8654118195283506,
13808
+ "grad_norm": 55.97256851196289,
13809
+ "learning_rate": 9.966194874241466e-06,
13810
+ "loss": 10.0621,
13811
+ "step": 19500
13812
+ },
13813
+ {
13814
+ "epoch": 0.865855620461442,
13815
+ "grad_norm": 66.48442077636719,
13816
+ "learning_rate": 9.966177538279538e-06,
13817
+ "loss": 10.1117,
13818
+ "step": 19510
13819
+ },
13820
+ {
13821
+ "epoch": 0.8662994213945335,
13822
+ "grad_norm": 71.5040512084961,
13823
+ "learning_rate": 9.966160202317611e-06,
13824
+ "loss": 10.0883,
13825
+ "step": 19520
13826
+ },
13827
+ {
13828
+ "epoch": 0.8667432223276249,
13829
+ "grad_norm": 62.448360443115234,
13830
+ "learning_rate": 9.966142866355684e-06,
13831
+ "loss": 9.8626,
13832
+ "step": 19530
13833
+ },
13834
+ {
13835
+ "epoch": 0.8671870232607164,
13836
+ "grad_norm": 70.04524230957031,
13837
+ "learning_rate": 9.966125530393757e-06,
13838
+ "loss": 10.3496,
13839
+ "step": 19540
13840
+ },
13841
+ {
13842
+ "epoch": 0.8676308241938079,
13843
+ "grad_norm": 68.1249008178711,
13844
+ "learning_rate": 9.966108194431828e-06,
13845
+ "loss": 10.2206,
13846
+ "step": 19550
13847
+ },
13848
+ {
13849
+ "epoch": 0.8680746251268994,
13850
+ "grad_norm": 60.323795318603516,
13851
+ "learning_rate": 9.966090858469902e-06,
13852
+ "loss": 10.545,
13853
+ "step": 19560
13854
+ },
13855
+ {
13856
+ "epoch": 0.8685184260599907,
13857
+ "grad_norm": 62.9224853515625,
13858
+ "learning_rate": 9.966073522507975e-06,
13859
+ "loss": 10.4674,
13860
+ "step": 19570
13861
+ },
13862
+ {
13863
+ "epoch": 0.8689622269930822,
13864
+ "grad_norm": 60.80291748046875,
13865
+ "learning_rate": 9.966056186546046e-06,
13866
+ "loss": 10.1987,
13867
+ "step": 19580
13868
+ },
13869
+ {
13870
+ "epoch": 0.8694060279261737,
13871
+ "grad_norm": 53.80615234375,
13872
+ "learning_rate": 9.966038850584119e-06,
13873
+ "loss": 10.4711,
13874
+ "step": 19590
13875
+ },
13876
+ {
13877
+ "epoch": 0.8698498288592652,
13878
+ "grad_norm": 58.71421813964844,
13879
+ "learning_rate": 9.966021514622192e-06,
13880
+ "loss": 10.2597,
13881
+ "step": 19600
13882
+ },
13883
+ {
13884
+ "epoch": 0.8702936297923567,
13885
+ "grad_norm": 60.54587936401367,
13886
+ "learning_rate": 9.966004178660264e-06,
13887
+ "loss": 10.3637,
13888
+ "step": 19610
13889
+ },
13890
+ {
13891
+ "epoch": 0.8707374307254481,
13892
+ "grad_norm": 58.478153228759766,
13893
+ "learning_rate": 9.965986842698337e-06,
13894
+ "loss": 10.1861,
13895
+ "step": 19620
13896
+ },
13897
+ {
13898
+ "epoch": 0.8711812316585396,
13899
+ "grad_norm": 70.70100402832031,
13900
+ "learning_rate": 9.96596950673641e-06,
13901
+ "loss": 9.8945,
13902
+ "step": 19630
13903
+ },
13904
+ {
13905
+ "epoch": 0.871625032591631,
13906
+ "grad_norm": 64.967041015625,
13907
+ "learning_rate": 9.965952170774481e-06,
13908
+ "loss": 10.0128,
13909
+ "step": 19640
13910
+ },
13911
+ {
13912
+ "epoch": 0.8720688335247225,
13913
+ "grad_norm": 67.52765655517578,
13914
+ "learning_rate": 9.965934834812554e-06,
13915
+ "loss": 10.5425,
13916
+ "step": 19650
13917
+ },
13918
+ {
13919
+ "epoch": 0.8725126344578139,
13920
+ "grad_norm": 63.967247009277344,
13921
+ "learning_rate": 9.965917498850627e-06,
13922
+ "loss": 10.6763,
13923
+ "step": 19660
13924
+ },
13925
+ {
13926
+ "epoch": 0.8729564353909054,
13927
+ "grad_norm": 71.41963958740234,
13928
+ "learning_rate": 9.965900162888699e-06,
13929
+ "loss": 10.6471,
13930
+ "step": 19670
13931
+ },
13932
+ {
13933
+ "epoch": 0.8734002363239969,
13934
+ "grad_norm": 60.292701721191406,
13935
+ "learning_rate": 9.965882826926772e-06,
13936
+ "loss": 10.1802,
13937
+ "step": 19680
13938
+ },
13939
+ {
13940
+ "epoch": 0.8738440372570884,
13941
+ "grad_norm": 63.403560638427734,
13942
+ "learning_rate": 9.965865490964845e-06,
13943
+ "loss": 10.2882,
13944
+ "step": 19690
13945
+ },
13946
+ {
13947
+ "epoch": 0.8742878381901797,
13948
+ "grad_norm": 65.6253662109375,
13949
+ "learning_rate": 9.965848155002916e-06,
13950
+ "loss": 10.2649,
13951
+ "step": 19700
13952
+ },
13953
+ {
13954
+ "epoch": 0.8747316391232712,
13955
+ "grad_norm": 61.846683502197266,
13956
+ "learning_rate": 9.96583081904099e-06,
13957
+ "loss": 10.2564,
13958
+ "step": 19710
13959
+ },
13960
+ {
13961
+ "epoch": 0.8751754400563627,
13962
+ "grad_norm": 65.23727416992188,
13963
+ "learning_rate": 9.965813483079062e-06,
13964
+ "loss": 10.5773,
13965
+ "step": 19720
13966
+ },
13967
+ {
13968
+ "epoch": 0.8756192409894542,
13969
+ "grad_norm": 57.89336395263672,
13970
+ "learning_rate": 9.965796147117134e-06,
13971
+ "loss": 10.3047,
13972
+ "step": 19730
13973
+ },
13974
+ {
13975
+ "epoch": 0.8760630419225457,
13976
+ "grad_norm": 63.75661849975586,
13977
+ "learning_rate": 9.965778811155207e-06,
13978
+ "loss": 9.6075,
13979
+ "step": 19740
13980
+ },
13981
+ {
13982
+ "epoch": 0.8765068428556371,
13983
+ "grad_norm": 52.67669677734375,
13984
+ "learning_rate": 9.96576147519328e-06,
13985
+ "loss": 10.1751,
13986
+ "step": 19750
13987
+ },
13988
+ {
13989
+ "epoch": 0.8769506437887286,
13990
+ "grad_norm": 62.90458297729492,
13991
+ "learning_rate": 9.965744139231353e-06,
13992
+ "loss": 9.9069,
13993
+ "step": 19760
13994
+ },
13995
+ {
13996
+ "epoch": 0.87739444472182,
13997
+ "grad_norm": 75.43021392822266,
13998
+ "learning_rate": 9.965726803269424e-06,
13999
+ "loss": 10.7198,
14000
+ "step": 19770
14001
+ },
14002
+ {
14003
+ "epoch": 0.8778382456549115,
14004
+ "grad_norm": 65.16674041748047,
14005
+ "learning_rate": 9.965709467307497e-06,
14006
+ "loss": 9.8234,
14007
+ "step": 19780
14008
+ },
14009
+ {
14010
+ "epoch": 0.8782820465880029,
14011
+ "grad_norm": 55.970890045166016,
14012
+ "learning_rate": 9.96569213134557e-06,
14013
+ "loss": 10.2538,
14014
+ "step": 19790
14015
+ },
14016
+ {
14017
+ "epoch": 0.8787258475210944,
14018
+ "grad_norm": 63.947113037109375,
14019
+ "learning_rate": 9.965674795383642e-06,
14020
+ "loss": 10.4209,
14021
+ "step": 19800
14022
+ },
14023
+ {
14024
+ "epoch": 0.8791696484541859,
14025
+ "grad_norm": 65.56307983398438,
14026
+ "learning_rate": 9.965657459421715e-06,
14027
+ "loss": 9.7725,
14028
+ "step": 19810
14029
+ },
14030
+ {
14031
+ "epoch": 0.8796134493872774,
14032
+ "grad_norm": 66.70881652832031,
14033
+ "learning_rate": 9.965640123459788e-06,
14034
+ "loss": 10.7015,
14035
+ "step": 19820
14036
+ },
14037
+ {
14038
+ "epoch": 0.8800572503203687,
14039
+ "grad_norm": 65.10243225097656,
14040
+ "learning_rate": 9.96562278749786e-06,
14041
+ "loss": 10.1794,
14042
+ "step": 19830
14043
+ },
14044
+ {
14045
+ "epoch": 0.8805010512534602,
14046
+ "grad_norm": 58.50627517700195,
14047
+ "learning_rate": 9.965605451535932e-06,
14048
+ "loss": 10.758,
14049
+ "step": 19840
14050
+ },
14051
+ {
14052
+ "epoch": 0.8809448521865517,
14053
+ "grad_norm": 66.39806365966797,
14054
+ "learning_rate": 9.965588115574006e-06,
14055
+ "loss": 9.9667,
14056
+ "step": 19850
14057
+ },
14058
+ {
14059
+ "epoch": 0.8813886531196432,
14060
+ "grad_norm": 78.95050048828125,
14061
+ "learning_rate": 9.965570779612077e-06,
14062
+ "loss": 10.5293,
14063
+ "step": 19860
14064
+ },
14065
+ {
14066
+ "epoch": 0.8818324540527347,
14067
+ "grad_norm": 63.072444915771484,
14068
+ "learning_rate": 9.96555344365015e-06,
14069
+ "loss": 10.1306,
14070
+ "step": 19870
14071
+ },
14072
+ {
14073
+ "epoch": 0.8822762549858261,
14074
+ "grad_norm": 61.44473648071289,
14075
+ "learning_rate": 9.965536107688223e-06,
14076
+ "loss": 10.3155,
14077
+ "step": 19880
14078
+ },
14079
+ {
14080
+ "epoch": 0.8827200559189176,
14081
+ "grad_norm": 70.35446166992188,
14082
+ "learning_rate": 9.965518771726294e-06,
14083
+ "loss": 10.2451,
14084
+ "step": 19890
14085
+ },
14086
+ {
14087
+ "epoch": 0.883163856852009,
14088
+ "grad_norm": 58.564395904541016,
14089
+ "learning_rate": 9.965501435764368e-06,
14090
+ "loss": 10.1813,
14091
+ "step": 19900
14092
+ },
14093
+ {
14094
+ "epoch": 0.8836076577851005,
14095
+ "grad_norm": 64.06719970703125,
14096
+ "learning_rate": 9.96548409980244e-06,
14097
+ "loss": 10.2219,
14098
+ "step": 19910
14099
+ },
14100
+ {
14101
+ "epoch": 0.8840514587181919,
14102
+ "grad_norm": 57.828590393066406,
14103
+ "learning_rate": 9.965466763840512e-06,
14104
+ "loss": 10.4087,
14105
+ "step": 19920
14106
+ },
14107
+ {
14108
+ "epoch": 0.8844952596512834,
14109
+ "grad_norm": 61.435123443603516,
14110
+ "learning_rate": 9.965449427878585e-06,
14111
+ "loss": 10.3527,
14112
+ "step": 19930
14113
+ },
14114
+ {
14115
+ "epoch": 0.8849390605843749,
14116
+ "grad_norm": 61.76189041137695,
14117
+ "learning_rate": 9.965432091916658e-06,
14118
+ "loss": 10.3732,
14119
+ "step": 19940
14120
+ },
14121
+ {
14122
+ "epoch": 0.8853828615174664,
14123
+ "grad_norm": 62.846946716308594,
14124
+ "learning_rate": 9.96541475595473e-06,
14125
+ "loss": 10.2244,
14126
+ "step": 19950
14127
+ },
14128
+ {
14129
+ "epoch": 0.8858266624505579,
14130
+ "grad_norm": 63.24193572998047,
14131
+ "learning_rate": 9.965397419992803e-06,
14132
+ "loss": 10.1221,
14133
+ "step": 19960
14134
+ },
14135
+ {
14136
+ "epoch": 0.8862704633836492,
14137
+ "grad_norm": 56.531044006347656,
14138
+ "learning_rate": 9.965380084030876e-06,
14139
+ "loss": 9.9695,
14140
+ "step": 19970
14141
+ },
14142
+ {
14143
+ "epoch": 0.8867142643167407,
14144
+ "grad_norm": 59.855491638183594,
14145
+ "learning_rate": 9.965362748068949e-06,
14146
+ "loss": 10.4246,
14147
+ "step": 19980
14148
+ },
14149
+ {
14150
+ "epoch": 0.8871580652498322,
14151
+ "grad_norm": 56.98590850830078,
14152
+ "learning_rate": 9.96534541210702e-06,
14153
+ "loss": 10.5376,
14154
+ "step": 19990
14155
+ },
14156
+ {
14157
+ "epoch": 0.8876018661829237,
14158
+ "grad_norm": 64.37902069091797,
14159
+ "learning_rate": 9.965328076145093e-06,
14160
+ "loss": 9.9193,
14161
+ "step": 20000
14162
+ },
14163
+ {
14164
+ "epoch": 0.8876018661829237,
14165
+ "eval_loss": 0.32305407524108887,
14166
+ "eval_runtime": 673.2893,
14167
+ "eval_samples_per_second": 1803.669,
14168
+ "eval_steps_per_second": 56.365,
14169
+ "step": 20000
14170
  }
14171
  ],
14172
  "logging_steps": 10,
 
14186
  "attributes": {}
14187
  }
14188
  },
14189
+ "total_flos": 6.979446410051584e+18,
14190
  "train_batch_size": 4,
14191
  "trial_name": null,
14192
  "trial_params": null