CocoRoF commited on
Commit
f55ebaa
·
verified ·
1 Parent(s): e7a44f1

Training in progress, step 7500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b16b8aa8e59ed005cab03437e858fc32c15da21710862fb7624c9d60fa09994
3
  size 737632172
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0c398bc843eea803ee6d700d1537de79fdc32bede5d65c5c9f86d67c40a71de
3
  size 737632172
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7e9f520d78e8c2ec7d22e229de31961c7dc7141280c426b76ea47d2304f6e61e
3
  size 1475354682
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09e62b87c8cdb6fbdf1ca09dea3f1e41c6f59a2ccbb87b7b025db5eb51b5fa0d
3
  size 1475354682
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:69ec6e3926fa071bede113523efa3dc6e630c3c7958c54a9ca321cf4d62ed145
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04cb5208648fd09a2e0403d51973f74ffbfd93cbd5da59e1e99c8df03769a86c
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6127ee4f0c13500ec5038fce65af8f7beec63c137c7d4b7c157aa6303cf5879
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7034685b36b93a4dd3a50697b0b1c314b249b2189ec2cb96b757312b1514a579
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:da01d1c5eb2cc3a323f97c1f590d13ccfac2a4c5b1479bd378b4e643304f5a4f
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e851fe1c1de0057f4eecefed6a131fa9021334eb43f6e7e65fdb270a25ac864
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:49a3f04d76c0d3acc7d3dd95a04215f368f35a451ae8cba8a2fdba38cda9ca0a
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:978379030048e432baa510ec4fc9514faa08fe564ab964b3a4d05e8f60306495
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:df7d2c9825dba80cb544920f8cc0c72122f96514e6cd259052a8765b034393e2
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdbc75d90af112615b53d15931e8157a80e37bcd110aac9a3089f5f6f5344171
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a20a42d44ff48cc162224010190e898fe28598ddad8cd1896d330a3bb1d8ec3
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c8a310f6ca2ca89570eb2cc68544656b30224f00b2d6d96eeda6e0cb8be50ab
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:18ac0dc4f09f25179860561fcea7c5c8f997aabdc46a170665f9dc5a72bc27c6
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c5b8110fcf6e044b6860c6305be969cfe03129549b92dc6fc2394448e9265d6
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a16fcb5411ff961b47eff7378d85105fe9837e0492d19ea5ce3b7c4b77aa3b6
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f936acaf5a2d5fe8c38d945450417facbf1577584c216908a396d3cc20bec88
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8b5ca0374fb823ceea8ffdfbdcc41c0cd34efad2563ba3211ec3385b1ecc3188
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c9ee2be288a50938aa76c672a598bafcd789d6a5d6e08c069ef8e7d474b5cd2
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.22190046654573092,
5
  "eval_steps": 1000,
6
- "global_step": 5000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3547,6 +3547,1772 @@
3547
  "eval_samples_per_second": 1798.056,
3548
  "eval_steps_per_second": 56.19,
3549
  "step": 5000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3550
  }
3551
  ],
3552
  "logging_steps": 10,
@@ -3566,7 +5332,7 @@
3566
  "attributes": {}
3567
  }
3568
  },
3569
- "total_flos": 1.744861602512896e+18,
3570
  "train_batch_size": 4,
3571
  "trial_name": null,
3572
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.3328506998185964,
5
  "eval_steps": 1000,
6
+ "global_step": 7500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3547
  "eval_samples_per_second": 1798.056,
3548
  "eval_steps_per_second": 56.19,
3549
  "step": 5000
3550
+ },
3551
+ {
3552
+ "epoch": 0.22234426747882238,
3553
+ "grad_norm": 100.71528625488281,
3554
+ "learning_rate": 9.991314683074347e-06,
3555
+ "loss": 11.8638,
3556
+ "step": 5010
3557
+ },
3558
+ {
3559
+ "epoch": 0.22278806841191384,
3560
+ "grad_norm": 114.36324310302734,
3561
+ "learning_rate": 9.991297347112418e-06,
3562
+ "loss": 12.7659,
3563
+ "step": 5020
3564
+ },
3565
+ {
3566
+ "epoch": 0.2232318693450053,
3567
+ "grad_norm": 123.63908386230469,
3568
+ "learning_rate": 9.991280011150491e-06,
3569
+ "loss": 12.2795,
3570
+ "step": 5030
3571
+ },
3572
+ {
3573
+ "epoch": 0.22367567027809676,
3574
+ "grad_norm": 120.35507202148438,
3575
+ "learning_rate": 9.991262675188565e-06,
3576
+ "loss": 12.277,
3577
+ "step": 5040
3578
+ },
3579
+ {
3580
+ "epoch": 0.22411947121118822,
3581
+ "grad_norm": 109.37151336669922,
3582
+ "learning_rate": 9.991245339226636e-06,
3583
+ "loss": 11.6699,
3584
+ "step": 5050
3585
+ },
3586
+ {
3587
+ "epoch": 0.22456327214427968,
3588
+ "grad_norm": 105.27191925048828,
3589
+ "learning_rate": 9.991228003264709e-06,
3590
+ "loss": 12.0232,
3591
+ "step": 5060
3592
+ },
3593
+ {
3594
+ "epoch": 0.22500707307737114,
3595
+ "grad_norm": 129.59083557128906,
3596
+ "learning_rate": 9.991210667302782e-06,
3597
+ "loss": 11.7139,
3598
+ "step": 5070
3599
+ },
3600
+ {
3601
+ "epoch": 0.2254508740104626,
3602
+ "grad_norm": 102.64170837402344,
3603
+ "learning_rate": 9.991193331340853e-06,
3604
+ "loss": 12.3,
3605
+ "step": 5080
3606
+ },
3607
+ {
3608
+ "epoch": 0.22589467494355406,
3609
+ "grad_norm": 97.96053314208984,
3610
+ "learning_rate": 9.991175995378927e-06,
3611
+ "loss": 12.2807,
3612
+ "step": 5090
3613
+ },
3614
+ {
3615
+ "epoch": 0.22633847587664552,
3616
+ "grad_norm": 111.13671112060547,
3617
+ "learning_rate": 9.991158659417e-06,
3618
+ "loss": 12.2035,
3619
+ "step": 5100
3620
+ },
3621
+ {
3622
+ "epoch": 0.226782276809737,
3623
+ "grad_norm": 127.20590209960938,
3624
+ "learning_rate": 9.991141323455071e-06,
3625
+ "loss": 11.7232,
3626
+ "step": 5110
3627
+ },
3628
+ {
3629
+ "epoch": 0.22722607774282846,
3630
+ "grad_norm": 128.55152893066406,
3631
+ "learning_rate": 9.991123987493144e-06,
3632
+ "loss": 12.2339,
3633
+ "step": 5120
3634
+ },
3635
+ {
3636
+ "epoch": 0.22766987867591992,
3637
+ "grad_norm": 152.3295135498047,
3638
+ "learning_rate": 9.991106651531217e-06,
3639
+ "loss": 12.0261,
3640
+ "step": 5130
3641
+ },
3642
+ {
3643
+ "epoch": 0.22811367960901138,
3644
+ "grad_norm": 125.02354431152344,
3645
+ "learning_rate": 9.99108931556929e-06,
3646
+ "loss": 11.9501,
3647
+ "step": 5140
3648
+ },
3649
+ {
3650
+ "epoch": 0.22855748054210284,
3651
+ "grad_norm": 126.0276107788086,
3652
+ "learning_rate": 9.991071979607362e-06,
3653
+ "loss": 11.9311,
3654
+ "step": 5150
3655
+ },
3656
+ {
3657
+ "epoch": 0.2290012814751943,
3658
+ "grad_norm": 101.98572540283203,
3659
+ "learning_rate": 9.991054643645435e-06,
3660
+ "loss": 11.7741,
3661
+ "step": 5160
3662
+ },
3663
+ {
3664
+ "epoch": 0.22944508240828576,
3665
+ "grad_norm": 93.39753723144531,
3666
+ "learning_rate": 9.991037307683508e-06,
3667
+ "loss": 12.3775,
3668
+ "step": 5170
3669
+ },
3670
+ {
3671
+ "epoch": 0.22988888334137722,
3672
+ "grad_norm": 119.41414642333984,
3673
+ "learning_rate": 9.99101997172158e-06,
3674
+ "loss": 12.3944,
3675
+ "step": 5180
3676
+ },
3677
+ {
3678
+ "epoch": 0.23033268427446868,
3679
+ "grad_norm": 102.53070068359375,
3680
+ "learning_rate": 9.991002635759652e-06,
3681
+ "loss": 11.7407,
3682
+ "step": 5190
3683
+ },
3684
+ {
3685
+ "epoch": 0.23077648520756014,
3686
+ "grad_norm": 114.39293670654297,
3687
+ "learning_rate": 9.990985299797725e-06,
3688
+ "loss": 12.0433,
3689
+ "step": 5200
3690
+ },
3691
+ {
3692
+ "epoch": 0.2312202861406516,
3693
+ "grad_norm": 95.0862808227539,
3694
+ "learning_rate": 9.990967963835797e-06,
3695
+ "loss": 11.8278,
3696
+ "step": 5210
3697
+ },
3698
+ {
3699
+ "epoch": 0.23166408707374309,
3700
+ "grad_norm": 95.9500732421875,
3701
+ "learning_rate": 9.99095062787387e-06,
3702
+ "loss": 12.0699,
3703
+ "step": 5220
3704
+ },
3705
+ {
3706
+ "epoch": 0.23210788800683455,
3707
+ "grad_norm": 108.53816223144531,
3708
+ "learning_rate": 9.990933291911943e-06,
3709
+ "loss": 12.0234,
3710
+ "step": 5230
3711
+ },
3712
+ {
3713
+ "epoch": 0.232551688939926,
3714
+ "grad_norm": 108.30989837646484,
3715
+ "learning_rate": 9.990915955950014e-06,
3716
+ "loss": 11.899,
3717
+ "step": 5240
3718
+ },
3719
+ {
3720
+ "epoch": 0.23299548987301746,
3721
+ "grad_norm": 103.17509460449219,
3722
+ "learning_rate": 9.990898619988087e-06,
3723
+ "loss": 12.0308,
3724
+ "step": 5250
3725
+ },
3726
+ {
3727
+ "epoch": 0.23343929080610892,
3728
+ "grad_norm": 115.72237396240234,
3729
+ "learning_rate": 9.99088128402616e-06,
3730
+ "loss": 11.7059,
3731
+ "step": 5260
3732
+ },
3733
+ {
3734
+ "epoch": 0.23388309173920038,
3735
+ "grad_norm": 120.50923919677734,
3736
+ "learning_rate": 9.990863948064232e-06,
3737
+ "loss": 12.0906,
3738
+ "step": 5270
3739
+ },
3740
+ {
3741
+ "epoch": 0.23432689267229184,
3742
+ "grad_norm": 103.17623138427734,
3743
+ "learning_rate": 9.990846612102305e-06,
3744
+ "loss": 11.5532,
3745
+ "step": 5280
3746
+ },
3747
+ {
3748
+ "epoch": 0.2347706936053833,
3749
+ "grad_norm": 116.489013671875,
3750
+ "learning_rate": 9.990829276140378e-06,
3751
+ "loss": 11.8999,
3752
+ "step": 5290
3753
+ },
3754
+ {
3755
+ "epoch": 0.23521449453847476,
3756
+ "grad_norm": 102.14614868164062,
3757
+ "learning_rate": 9.99081194017845e-06,
3758
+ "loss": 12.2294,
3759
+ "step": 5300
3760
+ },
3761
+ {
3762
+ "epoch": 0.23565829547156622,
3763
+ "grad_norm": 127.21760559082031,
3764
+ "learning_rate": 9.990794604216522e-06,
3765
+ "loss": 12.0739,
3766
+ "step": 5310
3767
+ },
3768
+ {
3769
+ "epoch": 0.23610209640465768,
3770
+ "grad_norm": 129.3851318359375,
3771
+ "learning_rate": 9.990777268254595e-06,
3772
+ "loss": 12.2383,
3773
+ "step": 5320
3774
+ },
3775
+ {
3776
+ "epoch": 0.23654589733774914,
3777
+ "grad_norm": 107.56005096435547,
3778
+ "learning_rate": 9.990759932292667e-06,
3779
+ "loss": 11.7989,
3780
+ "step": 5330
3781
+ },
3782
+ {
3783
+ "epoch": 0.23698969827084063,
3784
+ "grad_norm": 96.8632583618164,
3785
+ "learning_rate": 9.99074259633074e-06,
3786
+ "loss": 12.1831,
3787
+ "step": 5340
3788
+ },
3789
+ {
3790
+ "epoch": 0.23743349920393209,
3791
+ "grad_norm": 106.97894287109375,
3792
+ "learning_rate": 9.990725260368813e-06,
3793
+ "loss": 12.0481,
3794
+ "step": 5350
3795
+ },
3796
+ {
3797
+ "epoch": 0.23787730013702355,
3798
+ "grad_norm": 91.43546295166016,
3799
+ "learning_rate": 9.990707924406886e-06,
3800
+ "loss": 11.3025,
3801
+ "step": 5360
3802
+ },
3803
+ {
3804
+ "epoch": 0.238321101070115,
3805
+ "grad_norm": 104.5177001953125,
3806
+ "learning_rate": 9.990690588444957e-06,
3807
+ "loss": 11.5902,
3808
+ "step": 5370
3809
+ },
3810
+ {
3811
+ "epoch": 0.23876490200320646,
3812
+ "grad_norm": 117.70657348632812,
3813
+ "learning_rate": 9.99067325248303e-06,
3814
+ "loss": 11.7063,
3815
+ "step": 5380
3816
+ },
3817
+ {
3818
+ "epoch": 0.23920870293629792,
3819
+ "grad_norm": 113.05216979980469,
3820
+ "learning_rate": 9.990655916521104e-06,
3821
+ "loss": 12.0053,
3822
+ "step": 5390
3823
+ },
3824
+ {
3825
+ "epoch": 0.23965250386938938,
3826
+ "grad_norm": 115.97754669189453,
3827
+ "learning_rate": 9.990638580559175e-06,
3828
+ "loss": 12.3194,
3829
+ "step": 5400
3830
+ },
3831
+ {
3832
+ "epoch": 0.24009630480248084,
3833
+ "grad_norm": 110.10850524902344,
3834
+ "learning_rate": 9.990621244597248e-06,
3835
+ "loss": 11.2671,
3836
+ "step": 5410
3837
+ },
3838
+ {
3839
+ "epoch": 0.2405401057355723,
3840
+ "grad_norm": 110.06661987304688,
3841
+ "learning_rate": 9.990603908635321e-06,
3842
+ "loss": 12.2819,
3843
+ "step": 5420
3844
+ },
3845
+ {
3846
+ "epoch": 0.24098390666866376,
3847
+ "grad_norm": 126.98220825195312,
3848
+ "learning_rate": 9.990586572673393e-06,
3849
+ "loss": 12.0827,
3850
+ "step": 5430
3851
+ },
3852
+ {
3853
+ "epoch": 0.24142770760175522,
3854
+ "grad_norm": 101.59259033203125,
3855
+ "learning_rate": 9.990569236711466e-06,
3856
+ "loss": 11.5167,
3857
+ "step": 5440
3858
+ },
3859
+ {
3860
+ "epoch": 0.2418715085348467,
3861
+ "grad_norm": 91.08474731445312,
3862
+ "learning_rate": 9.990551900749539e-06,
3863
+ "loss": 11.4829,
3864
+ "step": 5450
3865
+ },
3866
+ {
3867
+ "epoch": 0.24231530946793817,
3868
+ "grad_norm": 100.78240966796875,
3869
+ "learning_rate": 9.99053456478761e-06,
3870
+ "loss": 12.0629,
3871
+ "step": 5460
3872
+ },
3873
+ {
3874
+ "epoch": 0.24275911040102963,
3875
+ "grad_norm": 121.29551696777344,
3876
+ "learning_rate": 9.990517228825683e-06,
3877
+ "loss": 11.9303,
3878
+ "step": 5470
3879
+ },
3880
+ {
3881
+ "epoch": 0.24320291133412109,
3882
+ "grad_norm": 100.81685638427734,
3883
+ "learning_rate": 9.990499892863756e-06,
3884
+ "loss": 11.4778,
3885
+ "step": 5480
3886
+ },
3887
+ {
3888
+ "epoch": 0.24364671226721255,
3889
+ "grad_norm": 134.68348693847656,
3890
+ "learning_rate": 9.990482556901828e-06,
3891
+ "loss": 12.5336,
3892
+ "step": 5490
3893
+ },
3894
+ {
3895
+ "epoch": 0.244090513200304,
3896
+ "grad_norm": 103.34284973144531,
3897
+ "learning_rate": 9.9904652209399e-06,
3898
+ "loss": 11.9692,
3899
+ "step": 5500
3900
+ },
3901
+ {
3902
+ "epoch": 0.24453431413339546,
3903
+ "grad_norm": 99.77964782714844,
3904
+ "learning_rate": 9.990447884977974e-06,
3905
+ "loss": 11.5971,
3906
+ "step": 5510
3907
+ },
3908
+ {
3909
+ "epoch": 0.24497811506648692,
3910
+ "grad_norm": 93.10983276367188,
3911
+ "learning_rate": 9.990430549016045e-06,
3912
+ "loss": 12.3927,
3913
+ "step": 5520
3914
+ },
3915
+ {
3916
+ "epoch": 0.24542191599957838,
3917
+ "grad_norm": 109.86616516113281,
3918
+ "learning_rate": 9.990413213054118e-06,
3919
+ "loss": 11.945,
3920
+ "step": 5530
3921
+ },
3922
+ {
3923
+ "epoch": 0.24586571693266984,
3924
+ "grad_norm": 112.61761474609375,
3925
+ "learning_rate": 9.990395877092191e-06,
3926
+ "loss": 11.7543,
3927
+ "step": 5540
3928
+ },
3929
+ {
3930
+ "epoch": 0.2463095178657613,
3931
+ "grad_norm": 115.5595703125,
3932
+ "learning_rate": 9.990378541130263e-06,
3933
+ "loss": 12.0871,
3934
+ "step": 5550
3935
+ },
3936
+ {
3937
+ "epoch": 0.24675331879885276,
3938
+ "grad_norm": 108.1959228515625,
3939
+ "learning_rate": 9.990361205168336e-06,
3940
+ "loss": 12.3206,
3941
+ "step": 5560
3942
+ },
3943
+ {
3944
+ "epoch": 0.24719711973194425,
3945
+ "grad_norm": 102.17387390136719,
3946
+ "learning_rate": 9.990343869206409e-06,
3947
+ "loss": 11.5629,
3948
+ "step": 5570
3949
+ },
3950
+ {
3951
+ "epoch": 0.2476409206650357,
3952
+ "grad_norm": 107.28736114501953,
3953
+ "learning_rate": 9.990326533244482e-06,
3954
+ "loss": 11.5035,
3955
+ "step": 5580
3956
+ },
3957
+ {
3958
+ "epoch": 0.24808472159812717,
3959
+ "grad_norm": 101.89647674560547,
3960
+ "learning_rate": 9.990309197282553e-06,
3961
+ "loss": 12.1624,
3962
+ "step": 5590
3963
+ },
3964
+ {
3965
+ "epoch": 0.24852852253121863,
3966
+ "grad_norm": 86.52133178710938,
3967
+ "learning_rate": 9.990291861320626e-06,
3968
+ "loss": 12.3599,
3969
+ "step": 5600
3970
+ },
3971
+ {
3972
+ "epoch": 0.24897232346431009,
3973
+ "grad_norm": 118.91706848144531,
3974
+ "learning_rate": 9.9902745253587e-06,
3975
+ "loss": 12.4713,
3976
+ "step": 5610
3977
+ },
3978
+ {
3979
+ "epoch": 0.24941612439740155,
3980
+ "grad_norm": 92.26959991455078,
3981
+ "learning_rate": 9.990257189396771e-06,
3982
+ "loss": 11.8632,
3983
+ "step": 5620
3984
+ },
3985
+ {
3986
+ "epoch": 0.249859925330493,
3987
+ "grad_norm": 96.55217742919922,
3988
+ "learning_rate": 9.990239853434844e-06,
3989
+ "loss": 11.7402,
3990
+ "step": 5630
3991
+ },
3992
+ {
3993
+ "epoch": 0.2503037262635845,
3994
+ "grad_norm": 108.4555892944336,
3995
+ "learning_rate": 9.990222517472917e-06,
3996
+ "loss": 11.7055,
3997
+ "step": 5640
3998
+ },
3999
+ {
4000
+ "epoch": 0.2507475271966759,
4001
+ "grad_norm": 103.9815902709961,
4002
+ "learning_rate": 9.990205181510988e-06,
4003
+ "loss": 12.0786,
4004
+ "step": 5650
4005
+ },
4006
+ {
4007
+ "epoch": 0.2511913281297674,
4008
+ "grad_norm": 112.2682876586914,
4009
+ "learning_rate": 9.990187845549062e-06,
4010
+ "loss": 12.11,
4011
+ "step": 5660
4012
+ },
4013
+ {
4014
+ "epoch": 0.25163512906285884,
4015
+ "grad_norm": 90.49382019042969,
4016
+ "learning_rate": 9.990170509587135e-06,
4017
+ "loss": 12.3861,
4018
+ "step": 5670
4019
+ },
4020
+ {
4021
+ "epoch": 0.25207892999595033,
4022
+ "grad_norm": 115.12799072265625,
4023
+ "learning_rate": 9.990153173625206e-06,
4024
+ "loss": 12.0726,
4025
+ "step": 5680
4026
+ },
4027
+ {
4028
+ "epoch": 0.25252273092904176,
4029
+ "grad_norm": 102.43321990966797,
4030
+ "learning_rate": 9.990135837663279e-06,
4031
+ "loss": 11.9243,
4032
+ "step": 5690
4033
+ },
4034
+ {
4035
+ "epoch": 0.25296653186213325,
4036
+ "grad_norm": 133.68328857421875,
4037
+ "learning_rate": 9.990118501701352e-06,
4038
+ "loss": 12.2475,
4039
+ "step": 5700
4040
+ },
4041
+ {
4042
+ "epoch": 0.2534103327952247,
4043
+ "grad_norm": 103.25040435791016,
4044
+ "learning_rate": 9.990101165739424e-06,
4045
+ "loss": 11.596,
4046
+ "step": 5710
4047
+ },
4048
+ {
4049
+ "epoch": 0.25385413372831617,
4050
+ "grad_norm": 106.11602783203125,
4051
+ "learning_rate": 9.990083829777497e-06,
4052
+ "loss": 12.3109,
4053
+ "step": 5720
4054
+ },
4055
+ {
4056
+ "epoch": 0.2542979346614076,
4057
+ "grad_norm": 99.80020904541016,
4058
+ "learning_rate": 9.99006649381557e-06,
4059
+ "loss": 11.2771,
4060
+ "step": 5730
4061
+ },
4062
+ {
4063
+ "epoch": 0.2547417355944991,
4064
+ "grad_norm": 115.24018859863281,
4065
+ "learning_rate": 9.990049157853641e-06,
4066
+ "loss": 11.7818,
4067
+ "step": 5740
4068
+ },
4069
+ {
4070
+ "epoch": 0.2551855365275906,
4071
+ "grad_norm": 93.93407440185547,
4072
+ "learning_rate": 9.990031821891714e-06,
4073
+ "loss": 12.2078,
4074
+ "step": 5750
4075
+ },
4076
+ {
4077
+ "epoch": 0.255629337460682,
4078
+ "grad_norm": 97.67652130126953,
4079
+ "learning_rate": 9.990014485929787e-06,
4080
+ "loss": 11.739,
4081
+ "step": 5760
4082
+ },
4083
+ {
4084
+ "epoch": 0.2560731383937735,
4085
+ "grad_norm": 117.71260070800781,
4086
+ "learning_rate": 9.989997149967859e-06,
4087
+ "loss": 11.7933,
4088
+ "step": 5770
4089
+ },
4090
+ {
4091
+ "epoch": 0.2565169393268649,
4092
+ "grad_norm": 85.63137817382812,
4093
+ "learning_rate": 9.989979814005932e-06,
4094
+ "loss": 11.5123,
4095
+ "step": 5780
4096
+ },
4097
+ {
4098
+ "epoch": 0.2569607402599564,
4099
+ "grad_norm": 91.07071685791016,
4100
+ "learning_rate": 9.989962478044005e-06,
4101
+ "loss": 11.7954,
4102
+ "step": 5790
4103
+ },
4104
+ {
4105
+ "epoch": 0.25740454119304784,
4106
+ "grad_norm": 94.04301452636719,
4107
+ "learning_rate": 9.989945142082078e-06,
4108
+ "loss": 11.4023,
4109
+ "step": 5800
4110
+ },
4111
+ {
4112
+ "epoch": 0.25784834212613933,
4113
+ "grad_norm": 98.91206359863281,
4114
+ "learning_rate": 9.98992780612015e-06,
4115
+ "loss": 12.2875,
4116
+ "step": 5810
4117
+ },
4118
+ {
4119
+ "epoch": 0.25829214305923076,
4120
+ "grad_norm": 94.439208984375,
4121
+ "learning_rate": 9.989910470158222e-06,
4122
+ "loss": 11.4681,
4123
+ "step": 5820
4124
+ },
4125
+ {
4126
+ "epoch": 0.25873594399232225,
4127
+ "grad_norm": 96.2354507446289,
4128
+ "learning_rate": 9.989893134196295e-06,
4129
+ "loss": 11.2651,
4130
+ "step": 5830
4131
+ },
4132
+ {
4133
+ "epoch": 0.2591797449254137,
4134
+ "grad_norm": 99.5677490234375,
4135
+ "learning_rate": 9.989875798234367e-06,
4136
+ "loss": 11.7862,
4137
+ "step": 5840
4138
+ },
4139
+ {
4140
+ "epoch": 0.25962354585850517,
4141
+ "grad_norm": 117.41152954101562,
4142
+ "learning_rate": 9.98985846227244e-06,
4143
+ "loss": 12.0639,
4144
+ "step": 5850
4145
+ },
4146
+ {
4147
+ "epoch": 0.26006734679159665,
4148
+ "grad_norm": 92.46183013916016,
4149
+ "learning_rate": 9.989841126310513e-06,
4150
+ "loss": 11.5852,
4151
+ "step": 5860
4152
+ },
4153
+ {
4154
+ "epoch": 0.2605111477246881,
4155
+ "grad_norm": 116.44026947021484,
4156
+ "learning_rate": 9.989823790348584e-06,
4157
+ "loss": 11.847,
4158
+ "step": 5870
4159
+ },
4160
+ {
4161
+ "epoch": 0.2609549486577796,
4162
+ "grad_norm": 109.089111328125,
4163
+ "learning_rate": 9.989806454386657e-06,
4164
+ "loss": 11.9711,
4165
+ "step": 5880
4166
+ },
4167
+ {
4168
+ "epoch": 0.261398749590871,
4169
+ "grad_norm": 126.8874740600586,
4170
+ "learning_rate": 9.98978911842473e-06,
4171
+ "loss": 11.7392,
4172
+ "step": 5890
4173
+ },
4174
+ {
4175
+ "epoch": 0.2618425505239625,
4176
+ "grad_norm": 121.22462463378906,
4177
+ "learning_rate": 9.989771782462802e-06,
4178
+ "loss": 11.8794,
4179
+ "step": 5900
4180
+ },
4181
+ {
4182
+ "epoch": 0.2622863514570539,
4183
+ "grad_norm": 87.884521484375,
4184
+ "learning_rate": 9.989754446500875e-06,
4185
+ "loss": 11.6205,
4186
+ "step": 5910
4187
+ },
4188
+ {
4189
+ "epoch": 0.2627301523901454,
4190
+ "grad_norm": 102.1272201538086,
4191
+ "learning_rate": 9.989737110538948e-06,
4192
+ "loss": 11.8747,
4193
+ "step": 5920
4194
+ },
4195
+ {
4196
+ "epoch": 0.26317395332323684,
4197
+ "grad_norm": 100.64566802978516,
4198
+ "learning_rate": 9.989719774577021e-06,
4199
+ "loss": 12.3826,
4200
+ "step": 5930
4201
+ },
4202
+ {
4203
+ "epoch": 0.26361775425632833,
4204
+ "grad_norm": 94.28556823730469,
4205
+ "learning_rate": 9.989702438615092e-06,
4206
+ "loss": 11.7302,
4207
+ "step": 5940
4208
+ },
4209
+ {
4210
+ "epoch": 0.26406155518941976,
4211
+ "grad_norm": 109.41671752929688,
4212
+ "learning_rate": 9.989685102653166e-06,
4213
+ "loss": 11.0984,
4214
+ "step": 5950
4215
+ },
4216
+ {
4217
+ "epoch": 0.26450535612251125,
4218
+ "grad_norm": 92.14616394042969,
4219
+ "learning_rate": 9.989667766691239e-06,
4220
+ "loss": 11.6492,
4221
+ "step": 5960
4222
+ },
4223
+ {
4224
+ "epoch": 0.26494915705560274,
4225
+ "grad_norm": 95.84307861328125,
4226
+ "learning_rate": 9.98965043072931e-06,
4227
+ "loss": 12.0779,
4228
+ "step": 5970
4229
+ },
4230
+ {
4231
+ "epoch": 0.26539295798869417,
4232
+ "grad_norm": 102.63638305664062,
4233
+ "learning_rate": 9.989633094767383e-06,
4234
+ "loss": 12.4218,
4235
+ "step": 5980
4236
+ },
4237
+ {
4238
+ "epoch": 0.26583675892178565,
4239
+ "grad_norm": 110.38811492919922,
4240
+ "learning_rate": 9.989615758805456e-06,
4241
+ "loss": 12.0949,
4242
+ "step": 5990
4243
+ },
4244
+ {
4245
+ "epoch": 0.2662805598548771,
4246
+ "grad_norm": 88.99332427978516,
4247
+ "learning_rate": 9.989598422843528e-06,
4248
+ "loss": 12.2146,
4249
+ "step": 6000
4250
+ },
4251
+ {
4252
+ "epoch": 0.2662805598548771,
4253
+ "eval_loss": 0.3712849020957947,
4254
+ "eval_runtime": 674.6735,
4255
+ "eval_samples_per_second": 1799.968,
4256
+ "eval_steps_per_second": 56.249,
4257
+ "step": 6000
4258
+ },
4259
+ {
4260
+ "epoch": 0.2667243607879686,
4261
+ "grad_norm": 106.79357147216797,
4262
+ "learning_rate": 9.9895810868816e-06,
4263
+ "loss": 11.816,
4264
+ "step": 6010
4265
+ },
4266
+ {
4267
+ "epoch": 0.26716816172106,
4268
+ "grad_norm": 101.64129638671875,
4269
+ "learning_rate": 9.989563750919674e-06,
4270
+ "loss": 11.551,
4271
+ "step": 6020
4272
+ },
4273
+ {
4274
+ "epoch": 0.2676119626541515,
4275
+ "grad_norm": 124.5982437133789,
4276
+ "learning_rate": 9.989546414957745e-06,
4277
+ "loss": 11.2434,
4278
+ "step": 6030
4279
+ },
4280
+ {
4281
+ "epoch": 0.2680557635872429,
4282
+ "grad_norm": 86.55455780029297,
4283
+ "learning_rate": 9.989529078995818e-06,
4284
+ "loss": 11.6358,
4285
+ "step": 6040
4286
+ },
4287
+ {
4288
+ "epoch": 0.2684995645203344,
4289
+ "grad_norm": 104.90837860107422,
4290
+ "learning_rate": 9.989511743033891e-06,
4291
+ "loss": 11.7908,
4292
+ "step": 6050
4293
+ },
4294
+ {
4295
+ "epoch": 0.26894336545342584,
4296
+ "grad_norm": 91.85465240478516,
4297
+ "learning_rate": 9.989494407071964e-06,
4298
+ "loss": 11.675,
4299
+ "step": 6060
4300
+ },
4301
+ {
4302
+ "epoch": 0.26938716638651733,
4303
+ "grad_norm": 110.98849487304688,
4304
+ "learning_rate": 9.989477071110036e-06,
4305
+ "loss": 11.9682,
4306
+ "step": 6070
4307
+ },
4308
+ {
4309
+ "epoch": 0.2698309673196088,
4310
+ "grad_norm": 117.15697479248047,
4311
+ "learning_rate": 9.989459735148109e-06,
4312
+ "loss": 12.6112,
4313
+ "step": 6080
4314
+ },
4315
+ {
4316
+ "epoch": 0.27027476825270025,
4317
+ "grad_norm": 116.58313751220703,
4318
+ "learning_rate": 9.989442399186182e-06,
4319
+ "loss": 11.8428,
4320
+ "step": 6090
4321
+ },
4322
+ {
4323
+ "epoch": 0.27071856918579174,
4324
+ "grad_norm": 141.74298095703125,
4325
+ "learning_rate": 9.989425063224253e-06,
4326
+ "loss": 11.678,
4327
+ "step": 6100
4328
+ },
4329
+ {
4330
+ "epoch": 0.27116237011888317,
4331
+ "grad_norm": 117.08960723876953,
4332
+ "learning_rate": 9.989407727262326e-06,
4333
+ "loss": 11.6677,
4334
+ "step": 6110
4335
+ },
4336
+ {
4337
+ "epoch": 0.27160617105197465,
4338
+ "grad_norm": 102.23706817626953,
4339
+ "learning_rate": 9.9893903913004e-06,
4340
+ "loss": 11.5424,
4341
+ "step": 6120
4342
+ },
4343
+ {
4344
+ "epoch": 0.2720499719850661,
4345
+ "grad_norm": 100.09423065185547,
4346
+ "learning_rate": 9.98937305533847e-06,
4347
+ "loss": 12.0797,
4348
+ "step": 6130
4349
+ },
4350
+ {
4351
+ "epoch": 0.2724937729181576,
4352
+ "grad_norm": 113.59011840820312,
4353
+ "learning_rate": 9.989355719376544e-06,
4354
+ "loss": 12.0007,
4355
+ "step": 6140
4356
+ },
4357
+ {
4358
+ "epoch": 0.272937573851249,
4359
+ "grad_norm": 94.99832153320312,
4360
+ "learning_rate": 9.989338383414617e-06,
4361
+ "loss": 11.4787,
4362
+ "step": 6150
4363
+ },
4364
+ {
4365
+ "epoch": 0.2733813747843405,
4366
+ "grad_norm": 135.75721740722656,
4367
+ "learning_rate": 9.989321047452688e-06,
4368
+ "loss": 11.9264,
4369
+ "step": 6160
4370
+ },
4371
+ {
4372
+ "epoch": 0.2738251757174319,
4373
+ "grad_norm": 112.27375793457031,
4374
+ "learning_rate": 9.989303711490761e-06,
4375
+ "loss": 11.4877,
4376
+ "step": 6170
4377
+ },
4378
+ {
4379
+ "epoch": 0.2742689766505234,
4380
+ "grad_norm": 115.73841094970703,
4381
+ "learning_rate": 9.989286375528834e-06,
4382
+ "loss": 11.6287,
4383
+ "step": 6180
4384
+ },
4385
+ {
4386
+ "epoch": 0.27471277758361484,
4387
+ "grad_norm": 108.56095886230469,
4388
+ "learning_rate": 9.989269039566908e-06,
4389
+ "loss": 12.2162,
4390
+ "step": 6190
4391
+ },
4392
+ {
4393
+ "epoch": 0.27515657851670633,
4394
+ "grad_norm": 106.11347961425781,
4395
+ "learning_rate": 9.989251703604979e-06,
4396
+ "loss": 12.4926,
4397
+ "step": 6200
4398
+ },
4399
+ {
4400
+ "epoch": 0.2756003794497978,
4401
+ "grad_norm": 93.0553207397461,
4402
+ "learning_rate": 9.989234367643052e-06,
4403
+ "loss": 11.5661,
4404
+ "step": 6210
4405
+ },
4406
+ {
4407
+ "epoch": 0.27604418038288925,
4408
+ "grad_norm": 114.06632232666016,
4409
+ "learning_rate": 9.989217031681125e-06,
4410
+ "loss": 11.9102,
4411
+ "step": 6220
4412
+ },
4413
+ {
4414
+ "epoch": 0.27648798131598074,
4415
+ "grad_norm": 105.9919204711914,
4416
+ "learning_rate": 9.989199695719196e-06,
4417
+ "loss": 11.4711,
4418
+ "step": 6230
4419
+ },
4420
+ {
4421
+ "epoch": 0.27693178224907217,
4422
+ "grad_norm": 84.44627380371094,
4423
+ "learning_rate": 9.98918235975727e-06,
4424
+ "loss": 11.6304,
4425
+ "step": 6240
4426
+ },
4427
+ {
4428
+ "epoch": 0.27737558318216365,
4429
+ "grad_norm": 96.19264221191406,
4430
+ "learning_rate": 9.989165023795343e-06,
4431
+ "loss": 11.414,
4432
+ "step": 6250
4433
+ },
4434
+ {
4435
+ "epoch": 0.2778193841152551,
4436
+ "grad_norm": 127.19998931884766,
4437
+ "learning_rate": 9.989147687833414e-06,
4438
+ "loss": 11.9031,
4439
+ "step": 6260
4440
+ },
4441
+ {
4442
+ "epoch": 0.2782631850483466,
4443
+ "grad_norm": 108.23567199707031,
4444
+ "learning_rate": 9.989130351871487e-06,
4445
+ "loss": 11.9478,
4446
+ "step": 6270
4447
+ },
4448
+ {
4449
+ "epoch": 0.278706985981438,
4450
+ "grad_norm": 96.11833190917969,
4451
+ "learning_rate": 9.98911301590956e-06,
4452
+ "loss": 12.053,
4453
+ "step": 6280
4454
+ },
4455
+ {
4456
+ "epoch": 0.2791507869145295,
4457
+ "grad_norm": 89.6744613647461,
4458
+ "learning_rate": 9.989095679947633e-06,
4459
+ "loss": 11.7384,
4460
+ "step": 6290
4461
+ },
4462
+ {
4463
+ "epoch": 0.2795945878476209,
4464
+ "grad_norm": 88.66259002685547,
4465
+ "learning_rate": 9.989078343985705e-06,
4466
+ "loss": 11.6472,
4467
+ "step": 6300
4468
+ },
4469
+ {
4470
+ "epoch": 0.2800383887807124,
4471
+ "grad_norm": 112.74403381347656,
4472
+ "learning_rate": 9.989061008023778e-06,
4473
+ "loss": 11.8507,
4474
+ "step": 6310
4475
+ },
4476
+ {
4477
+ "epoch": 0.2804821897138039,
4478
+ "grad_norm": 100.31635284423828,
4479
+ "learning_rate": 9.98904367206185e-06,
4480
+ "loss": 11.3632,
4481
+ "step": 6320
4482
+ },
4483
+ {
4484
+ "epoch": 0.28092599064689533,
4485
+ "grad_norm": 92.06365966796875,
4486
+ "learning_rate": 9.989026336099922e-06,
4487
+ "loss": 11.9611,
4488
+ "step": 6330
4489
+ },
4490
+ {
4491
+ "epoch": 0.2813697915799868,
4492
+ "grad_norm": 106.90202331542969,
4493
+ "learning_rate": 9.989009000137995e-06,
4494
+ "loss": 11.9601,
4495
+ "step": 6340
4496
+ },
4497
+ {
4498
+ "epoch": 0.28181359251307825,
4499
+ "grad_norm": 93.49620056152344,
4500
+ "learning_rate": 9.988991664176068e-06,
4501
+ "loss": 12.0237,
4502
+ "step": 6350
4503
+ },
4504
+ {
4505
+ "epoch": 0.28225739344616974,
4506
+ "grad_norm": 101.77459716796875,
4507
+ "learning_rate": 9.98897432821414e-06,
4508
+ "loss": 11.6238,
4509
+ "step": 6360
4510
+ },
4511
+ {
4512
+ "epoch": 0.28270119437926117,
4513
+ "grad_norm": 94.77214813232422,
4514
+ "learning_rate": 9.988956992252213e-06,
4515
+ "loss": 11.6693,
4516
+ "step": 6370
4517
+ },
4518
+ {
4519
+ "epoch": 0.28314499531235265,
4520
+ "grad_norm": 110.98194885253906,
4521
+ "learning_rate": 9.988939656290286e-06,
4522
+ "loss": 11.8191,
4523
+ "step": 6380
4524
+ },
4525
+ {
4526
+ "epoch": 0.2835887962454441,
4527
+ "grad_norm": 96.43860626220703,
4528
+ "learning_rate": 9.988922320328357e-06,
4529
+ "loss": 11.2866,
4530
+ "step": 6390
4531
+ },
4532
+ {
4533
+ "epoch": 0.2840325971785356,
4534
+ "grad_norm": 125.56005096435547,
4535
+ "learning_rate": 9.98890498436643e-06,
4536
+ "loss": 12.1676,
4537
+ "step": 6400
4538
+ },
4539
+ {
4540
+ "epoch": 0.284476398111627,
4541
+ "grad_norm": 122.20761108398438,
4542
+ "learning_rate": 9.988887648404503e-06,
4543
+ "loss": 12.474,
4544
+ "step": 6410
4545
+ },
4546
+ {
4547
+ "epoch": 0.2849201990447185,
4548
+ "grad_norm": 101.35332489013672,
4549
+ "learning_rate": 9.988870312442576e-06,
4550
+ "loss": 11.733,
4551
+ "step": 6420
4552
+ },
4553
+ {
4554
+ "epoch": 0.28536399997781,
4555
+ "grad_norm": 97.87244415283203,
4556
+ "learning_rate": 9.988852976480648e-06,
4557
+ "loss": 11.9138,
4558
+ "step": 6430
4559
+ },
4560
+ {
4561
+ "epoch": 0.2858078009109014,
4562
+ "grad_norm": 106.80064392089844,
4563
+ "learning_rate": 9.988835640518721e-06,
4564
+ "loss": 12.4942,
4565
+ "step": 6440
4566
+ },
4567
+ {
4568
+ "epoch": 0.2862516018439929,
4569
+ "grad_norm": 106.28848266601562,
4570
+ "learning_rate": 9.988818304556794e-06,
4571
+ "loss": 11.9158,
4572
+ "step": 6450
4573
+ },
4574
+ {
4575
+ "epoch": 0.28669540277708433,
4576
+ "grad_norm": 100.0877914428711,
4577
+ "learning_rate": 9.988800968594865e-06,
4578
+ "loss": 11.2765,
4579
+ "step": 6460
4580
+ },
4581
+ {
4582
+ "epoch": 0.2871392037101758,
4583
+ "grad_norm": 100.36260223388672,
4584
+ "learning_rate": 9.988783632632938e-06,
4585
+ "loss": 11.7645,
4586
+ "step": 6470
4587
+ },
4588
+ {
4589
+ "epoch": 0.28758300464326725,
4590
+ "grad_norm": 102.455078125,
4591
+ "learning_rate": 9.988766296671012e-06,
4592
+ "loss": 11.5561,
4593
+ "step": 6480
4594
+ },
4595
+ {
4596
+ "epoch": 0.28802680557635874,
4597
+ "grad_norm": 91.07699584960938,
4598
+ "learning_rate": 9.988748960709083e-06,
4599
+ "loss": 11.8894,
4600
+ "step": 6490
4601
+ },
4602
+ {
4603
+ "epoch": 0.28847060650945017,
4604
+ "grad_norm": 96.68830108642578,
4605
+ "learning_rate": 9.988731624747156e-06,
4606
+ "loss": 11.7245,
4607
+ "step": 6500
4608
+ },
4609
+ {
4610
+ "epoch": 0.28891440744254165,
4611
+ "grad_norm": 86.85298919677734,
4612
+ "learning_rate": 9.988714288785229e-06,
4613
+ "loss": 11.685,
4614
+ "step": 6510
4615
+ },
4616
+ {
4617
+ "epoch": 0.2893582083756331,
4618
+ "grad_norm": 100.70494842529297,
4619
+ "learning_rate": 9.9886969528233e-06,
4620
+ "loss": 11.6402,
4621
+ "step": 6520
4622
+ },
4623
+ {
4624
+ "epoch": 0.2898020093087246,
4625
+ "grad_norm": 101.03349304199219,
4626
+ "learning_rate": 9.988679616861374e-06,
4627
+ "loss": 12.2626,
4628
+ "step": 6530
4629
+ },
4630
+ {
4631
+ "epoch": 0.29024581024181606,
4632
+ "grad_norm": 91.4577865600586,
4633
+ "learning_rate": 9.988662280899447e-06,
4634
+ "loss": 11.8911,
4635
+ "step": 6540
4636
+ },
4637
+ {
4638
+ "epoch": 0.2906896111749075,
4639
+ "grad_norm": 97.14340209960938,
4640
+ "learning_rate": 9.98864494493752e-06,
4641
+ "loss": 11.6866,
4642
+ "step": 6550
4643
+ },
4644
+ {
4645
+ "epoch": 0.291133412107999,
4646
+ "grad_norm": 94.9050064086914,
4647
+ "learning_rate": 9.988627608975591e-06,
4648
+ "loss": 11.713,
4649
+ "step": 6560
4650
+ },
4651
+ {
4652
+ "epoch": 0.2915772130410904,
4653
+ "grad_norm": 94.01126861572266,
4654
+ "learning_rate": 9.988610273013664e-06,
4655
+ "loss": 11.6159,
4656
+ "step": 6570
4657
+ },
4658
+ {
4659
+ "epoch": 0.2920210139741819,
4660
+ "grad_norm": 89.02586364746094,
4661
+ "learning_rate": 9.988592937051737e-06,
4662
+ "loss": 11.6739,
4663
+ "step": 6580
4664
+ },
4665
+ {
4666
+ "epoch": 0.29246481490727333,
4667
+ "grad_norm": 126.11679077148438,
4668
+ "learning_rate": 9.988575601089809e-06,
4669
+ "loss": 11.7169,
4670
+ "step": 6590
4671
+ },
4672
+ {
4673
+ "epoch": 0.2929086158403648,
4674
+ "grad_norm": 91.34590148925781,
4675
+ "learning_rate": 9.988558265127882e-06,
4676
+ "loss": 11.5195,
4677
+ "step": 6600
4678
+ },
4679
+ {
4680
+ "epoch": 0.29335241677345625,
4681
+ "grad_norm": 108.34649658203125,
4682
+ "learning_rate": 9.988540929165955e-06,
4683
+ "loss": 11.6583,
4684
+ "step": 6610
4685
+ },
4686
+ {
4687
+ "epoch": 0.29379621770654774,
4688
+ "grad_norm": 111.23458099365234,
4689
+ "learning_rate": 9.988523593204026e-06,
4690
+ "loss": 11.5678,
4691
+ "step": 6620
4692
+ },
4693
+ {
4694
+ "epoch": 0.29424001863963917,
4695
+ "grad_norm": 92.88959503173828,
4696
+ "learning_rate": 9.9885062572421e-06,
4697
+ "loss": 12.0539,
4698
+ "step": 6630
4699
+ },
4700
+ {
4701
+ "epoch": 0.29468381957273065,
4702
+ "grad_norm": 119.5103530883789,
4703
+ "learning_rate": 9.988488921280172e-06,
4704
+ "loss": 11.7499,
4705
+ "step": 6640
4706
+ },
4707
+ {
4708
+ "epoch": 0.2951276205058221,
4709
+ "grad_norm": 107.3758544921875,
4710
+ "learning_rate": 9.988471585318244e-06,
4711
+ "loss": 11.3844,
4712
+ "step": 6650
4713
+ },
4714
+ {
4715
+ "epoch": 0.2955714214389136,
4716
+ "grad_norm": 103.09275817871094,
4717
+ "learning_rate": 9.988454249356317e-06,
4718
+ "loss": 11.3347,
4719
+ "step": 6660
4720
+ },
4721
+ {
4722
+ "epoch": 0.29601522237200506,
4723
+ "grad_norm": 104.75767517089844,
4724
+ "learning_rate": 9.98843691339439e-06,
4725
+ "loss": 12.1328,
4726
+ "step": 6670
4727
+ },
4728
+ {
4729
+ "epoch": 0.2964590233050965,
4730
+ "grad_norm": 105.1933364868164,
4731
+ "learning_rate": 9.988419577432463e-06,
4732
+ "loss": 12.1966,
4733
+ "step": 6680
4734
+ },
4735
+ {
4736
+ "epoch": 0.296902824238188,
4737
+ "grad_norm": 95.23583221435547,
4738
+ "learning_rate": 9.988402241470534e-06,
4739
+ "loss": 11.925,
4740
+ "step": 6690
4741
+ },
4742
+ {
4743
+ "epoch": 0.2973466251712794,
4744
+ "grad_norm": 93.23466491699219,
4745
+ "learning_rate": 9.988384905508607e-06,
4746
+ "loss": 11.9893,
4747
+ "step": 6700
4748
+ },
4749
+ {
4750
+ "epoch": 0.2977904261043709,
4751
+ "grad_norm": 96.76582336425781,
4752
+ "learning_rate": 9.98836756954668e-06,
4753
+ "loss": 11.4291,
4754
+ "step": 6710
4755
+ },
4756
+ {
4757
+ "epoch": 0.29823422703746233,
4758
+ "grad_norm": 95.39541625976562,
4759
+ "learning_rate": 9.988350233584752e-06,
4760
+ "loss": 11.6534,
4761
+ "step": 6720
4762
+ },
4763
+ {
4764
+ "epoch": 0.2986780279705538,
4765
+ "grad_norm": 111.02050018310547,
4766
+ "learning_rate": 9.988332897622825e-06,
4767
+ "loss": 11.215,
4768
+ "step": 6730
4769
+ },
4770
+ {
4771
+ "epoch": 0.29912182890364525,
4772
+ "grad_norm": 93.41612243652344,
4773
+ "learning_rate": 9.988315561660898e-06,
4774
+ "loss": 11.8045,
4775
+ "step": 6740
4776
+ },
4777
+ {
4778
+ "epoch": 0.29956562983673674,
4779
+ "grad_norm": 103.25040435791016,
4780
+ "learning_rate": 9.98829822569897e-06,
4781
+ "loss": 11.7352,
4782
+ "step": 6750
4783
+ },
4784
+ {
4785
+ "epoch": 0.30000943076982817,
4786
+ "grad_norm": 110.95781707763672,
4787
+ "learning_rate": 9.988280889737043e-06,
4788
+ "loss": 11.9544,
4789
+ "step": 6760
4790
+ },
4791
+ {
4792
+ "epoch": 0.30045323170291965,
4793
+ "grad_norm": 90.73185729980469,
4794
+ "learning_rate": 9.988263553775116e-06,
4795
+ "loss": 11.5593,
4796
+ "step": 6770
4797
+ },
4798
+ {
4799
+ "epoch": 0.30089703263601114,
4800
+ "grad_norm": 90.01508331298828,
4801
+ "learning_rate": 9.988246217813187e-06,
4802
+ "loss": 11.1305,
4803
+ "step": 6780
4804
+ },
4805
+ {
4806
+ "epoch": 0.3013408335691026,
4807
+ "grad_norm": 99.30635070800781,
4808
+ "learning_rate": 9.98822888185126e-06,
4809
+ "loss": 11.1892,
4810
+ "step": 6790
4811
+ },
4812
+ {
4813
+ "epoch": 0.30178463450219406,
4814
+ "grad_norm": 112.32921600341797,
4815
+ "learning_rate": 9.988211545889333e-06,
4816
+ "loss": 11.9221,
4817
+ "step": 6800
4818
+ },
4819
+ {
4820
+ "epoch": 0.3022284354352855,
4821
+ "grad_norm": 97.16471099853516,
4822
+ "learning_rate": 9.988194209927405e-06,
4823
+ "loss": 11.8916,
4824
+ "step": 6810
4825
+ },
4826
+ {
4827
+ "epoch": 0.302672236368377,
4828
+ "grad_norm": 93.2406234741211,
4829
+ "learning_rate": 9.988176873965478e-06,
4830
+ "loss": 11.4169,
4831
+ "step": 6820
4832
+ },
4833
+ {
4834
+ "epoch": 0.3031160373014684,
4835
+ "grad_norm": 111.53374481201172,
4836
+ "learning_rate": 9.98815953800355e-06,
4837
+ "loss": 11.1438,
4838
+ "step": 6830
4839
+ },
4840
+ {
4841
+ "epoch": 0.3035598382345599,
4842
+ "grad_norm": 94.99114990234375,
4843
+ "learning_rate": 9.988142202041622e-06,
4844
+ "loss": 12.0838,
4845
+ "step": 6840
4846
+ },
4847
+ {
4848
+ "epoch": 0.30400363916765133,
4849
+ "grad_norm": 117.2374496459961,
4850
+ "learning_rate": 9.988124866079695e-06,
4851
+ "loss": 11.4959,
4852
+ "step": 6850
4853
+ },
4854
+ {
4855
+ "epoch": 0.3044474401007428,
4856
+ "grad_norm": 97.72772216796875,
4857
+ "learning_rate": 9.988107530117768e-06,
4858
+ "loss": 11.8559,
4859
+ "step": 6860
4860
+ },
4861
+ {
4862
+ "epoch": 0.30489124103383425,
4863
+ "grad_norm": 105.31409454345703,
4864
+ "learning_rate": 9.98809019415584e-06,
4865
+ "loss": 11.537,
4866
+ "step": 6870
4867
+ },
4868
+ {
4869
+ "epoch": 0.30533504196692574,
4870
+ "grad_norm": 125.29707336425781,
4871
+ "learning_rate": 9.988072858193913e-06,
4872
+ "loss": 11.4942,
4873
+ "step": 6880
4874
+ },
4875
+ {
4876
+ "epoch": 0.3057788429000172,
4877
+ "grad_norm": 101.15106201171875,
4878
+ "learning_rate": 9.988055522231986e-06,
4879
+ "loss": 11.3884,
4880
+ "step": 6890
4881
+ },
4882
+ {
4883
+ "epoch": 0.30622264383310865,
4884
+ "grad_norm": 97.56317138671875,
4885
+ "learning_rate": 9.988038186270059e-06,
4886
+ "loss": 11.5489,
4887
+ "step": 6900
4888
+ },
4889
+ {
4890
+ "epoch": 0.30666644476620014,
4891
+ "grad_norm": 112.57002258300781,
4892
+ "learning_rate": 9.98802085030813e-06,
4893
+ "loss": 11.3843,
4894
+ "step": 6910
4895
+ },
4896
+ {
4897
+ "epoch": 0.3071102456992916,
4898
+ "grad_norm": 100.37068176269531,
4899
+ "learning_rate": 9.988003514346203e-06,
4900
+ "loss": 11.4439,
4901
+ "step": 6920
4902
+ },
4903
+ {
4904
+ "epoch": 0.30755404663238306,
4905
+ "grad_norm": 119.5300521850586,
4906
+ "learning_rate": 9.987986178384276e-06,
4907
+ "loss": 11.4964,
4908
+ "step": 6930
4909
+ },
4910
+ {
4911
+ "epoch": 0.3079978475654745,
4912
+ "grad_norm": 88.61663055419922,
4913
+ "learning_rate": 9.987968842422348e-06,
4914
+ "loss": 11.6089,
4915
+ "step": 6940
4916
+ },
4917
+ {
4918
+ "epoch": 0.308441648498566,
4919
+ "grad_norm": 101.71082305908203,
4920
+ "learning_rate": 9.98795150646042e-06,
4921
+ "loss": 11.5646,
4922
+ "step": 6950
4923
+ },
4924
+ {
4925
+ "epoch": 0.3088854494316574,
4926
+ "grad_norm": 111.21810913085938,
4927
+ "learning_rate": 9.987934170498494e-06,
4928
+ "loss": 11.2611,
4929
+ "step": 6960
4930
+ },
4931
+ {
4932
+ "epoch": 0.3093292503647489,
4933
+ "grad_norm": 110.25248718261719,
4934
+ "learning_rate": 9.987916834536565e-06,
4935
+ "loss": 11.5723,
4936
+ "step": 6970
4937
+ },
4938
+ {
4939
+ "epoch": 0.30977305129784033,
4940
+ "grad_norm": 104.40763092041016,
4941
+ "learning_rate": 9.987899498574638e-06,
4942
+ "loss": 12.0642,
4943
+ "step": 6980
4944
+ },
4945
+ {
4946
+ "epoch": 0.3102168522309318,
4947
+ "grad_norm": 89.2273941040039,
4948
+ "learning_rate": 9.987882162612711e-06,
4949
+ "loss": 11.4713,
4950
+ "step": 6990
4951
+ },
4952
+ {
4953
+ "epoch": 0.3106606531640233,
4954
+ "grad_norm": 85.18675994873047,
4955
+ "learning_rate": 9.987864826650783e-06,
4956
+ "loss": 11.8243,
4957
+ "step": 7000
4958
+ },
4959
+ {
4960
+ "epoch": 0.3106606531640233,
4961
+ "eval_loss": 0.36315852403640747,
4962
+ "eval_runtime": 674.7882,
4963
+ "eval_samples_per_second": 1799.663,
4964
+ "eval_steps_per_second": 56.24,
4965
+ "step": 7000
4966
+ },
4967
+ {
4968
+ "epoch": 0.31110445409711474,
4969
+ "grad_norm": 123.10943603515625,
4970
+ "learning_rate": 9.987847490688856e-06,
4971
+ "loss": 11.6931,
4972
+ "step": 7010
4973
+ },
4974
+ {
4975
+ "epoch": 0.3115482550302062,
4976
+ "grad_norm": 130.8447723388672,
4977
+ "learning_rate": 9.987830154726929e-06,
4978
+ "loss": 11.5164,
4979
+ "step": 7020
4980
+ },
4981
+ {
4982
+ "epoch": 0.31199205596329765,
4983
+ "grad_norm": 101.97557830810547,
4984
+ "learning_rate": 9.987812818765e-06,
4985
+ "loss": 11.8152,
4986
+ "step": 7030
4987
+ },
4988
+ {
4989
+ "epoch": 0.31243585689638914,
4990
+ "grad_norm": 93.44794464111328,
4991
+ "learning_rate": 9.987795482803073e-06,
4992
+ "loss": 11.6298,
4993
+ "step": 7040
4994
+ },
4995
+ {
4996
+ "epoch": 0.3128796578294806,
4997
+ "grad_norm": 97.57392120361328,
4998
+ "learning_rate": 9.987778146841147e-06,
4999
+ "loss": 11.3152,
5000
+ "step": 7050
5001
+ },
5002
+ {
5003
+ "epoch": 0.31332345876257206,
5004
+ "grad_norm": 102.27783203125,
5005
+ "learning_rate": 9.987760810879218e-06,
5006
+ "loss": 10.9668,
5007
+ "step": 7060
5008
+ },
5009
+ {
5010
+ "epoch": 0.3137672596956635,
5011
+ "grad_norm": 108.09846496582031,
5012
+ "learning_rate": 9.987743474917291e-06,
5013
+ "loss": 11.4038,
5014
+ "step": 7070
5015
+ },
5016
+ {
5017
+ "epoch": 0.314211060628755,
5018
+ "grad_norm": 90.89788055419922,
5019
+ "learning_rate": 9.987726138955364e-06,
5020
+ "loss": 11.3717,
5021
+ "step": 7080
5022
+ },
5023
+ {
5024
+ "epoch": 0.3146548615618464,
5025
+ "grad_norm": 115.7634506225586,
5026
+ "learning_rate": 9.987708802993435e-06,
5027
+ "loss": 11.682,
5028
+ "step": 7090
5029
+ },
5030
+ {
5031
+ "epoch": 0.3150986624949379,
5032
+ "grad_norm": 109.29789733886719,
5033
+ "learning_rate": 9.987691467031509e-06,
5034
+ "loss": 11.7889,
5035
+ "step": 7100
5036
+ },
5037
+ {
5038
+ "epoch": 0.3155424634280294,
5039
+ "grad_norm": 95.27775573730469,
5040
+ "learning_rate": 9.987674131069582e-06,
5041
+ "loss": 11.2461,
5042
+ "step": 7110
5043
+ },
5044
+ {
5045
+ "epoch": 0.3159862643611208,
5046
+ "grad_norm": 110.1352310180664,
5047
+ "learning_rate": 9.987656795107655e-06,
5048
+ "loss": 11.3557,
5049
+ "step": 7120
5050
+ },
5051
+ {
5052
+ "epoch": 0.3164300652942123,
5053
+ "grad_norm": 82.80999755859375,
5054
+ "learning_rate": 9.987639459145726e-06,
5055
+ "loss": 11.4224,
5056
+ "step": 7130
5057
+ },
5058
+ {
5059
+ "epoch": 0.31687386622730374,
5060
+ "grad_norm": 94.51629638671875,
5061
+ "learning_rate": 9.987622123183799e-06,
5062
+ "loss": 11.6793,
5063
+ "step": 7140
5064
+ },
5065
+ {
5066
+ "epoch": 0.3173176671603952,
5067
+ "grad_norm": 106.12570190429688,
5068
+ "learning_rate": 9.987604787221872e-06,
5069
+ "loss": 11.3187,
5070
+ "step": 7150
5071
+ },
5072
+ {
5073
+ "epoch": 0.31776146809348665,
5074
+ "grad_norm": 110.01351928710938,
5075
+ "learning_rate": 9.987587451259944e-06,
5076
+ "loss": 11.3371,
5077
+ "step": 7160
5078
+ },
5079
+ {
5080
+ "epoch": 0.31820526902657814,
5081
+ "grad_norm": 97.4388427734375,
5082
+ "learning_rate": 9.987570115298017e-06,
5083
+ "loss": 11.5326,
5084
+ "step": 7170
5085
+ },
5086
+ {
5087
+ "epoch": 0.3186490699596696,
5088
+ "grad_norm": 80.90787506103516,
5089
+ "learning_rate": 9.98755277933609e-06,
5090
+ "loss": 11.7102,
5091
+ "step": 7180
5092
+ },
5093
+ {
5094
+ "epoch": 0.31909287089276106,
5095
+ "grad_norm": 102.96366882324219,
5096
+ "learning_rate": 9.987535443374161e-06,
5097
+ "loss": 11.9188,
5098
+ "step": 7190
5099
+ },
5100
+ {
5101
+ "epoch": 0.3195366718258525,
5102
+ "grad_norm": 101.80105590820312,
5103
+ "learning_rate": 9.987518107412234e-06,
5104
+ "loss": 12.0429,
5105
+ "step": 7200
5106
+ },
5107
+ {
5108
+ "epoch": 0.319980472758944,
5109
+ "grad_norm": 90.94136047363281,
5110
+ "learning_rate": 9.987500771450307e-06,
5111
+ "loss": 10.825,
5112
+ "step": 7210
5113
+ },
5114
+ {
5115
+ "epoch": 0.3204242736920354,
5116
+ "grad_norm": 109.56967163085938,
5117
+ "learning_rate": 9.987483435488379e-06,
5118
+ "loss": 11.5279,
5119
+ "step": 7220
5120
+ },
5121
+ {
5122
+ "epoch": 0.3208680746251269,
5123
+ "grad_norm": 95.71635437011719,
5124
+ "learning_rate": 9.987466099526452e-06,
5125
+ "loss": 11.6178,
5126
+ "step": 7230
5127
+ },
5128
+ {
5129
+ "epoch": 0.3213118755582184,
5130
+ "grad_norm": 93.75989532470703,
5131
+ "learning_rate": 9.987448763564525e-06,
5132
+ "loss": 11.4102,
5133
+ "step": 7240
5134
+ },
5135
+ {
5136
+ "epoch": 0.3217556764913098,
5137
+ "grad_norm": 100.95425415039062,
5138
+ "learning_rate": 9.987431427602596e-06,
5139
+ "loss": 11.7016,
5140
+ "step": 7250
5141
+ },
5142
+ {
5143
+ "epoch": 0.3221994774244013,
5144
+ "grad_norm": 97.52702331542969,
5145
+ "learning_rate": 9.98741409164067e-06,
5146
+ "loss": 11.8841,
5147
+ "step": 7260
5148
+ },
5149
+ {
5150
+ "epoch": 0.32264327835749274,
5151
+ "grad_norm": 91.30542755126953,
5152
+ "learning_rate": 9.987396755678742e-06,
5153
+ "loss": 11.4776,
5154
+ "step": 7270
5155
+ },
5156
+ {
5157
+ "epoch": 0.3230870792905842,
5158
+ "grad_norm": 106.42786407470703,
5159
+ "learning_rate": 9.987379419716814e-06,
5160
+ "loss": 11.5718,
5161
+ "step": 7280
5162
+ },
5163
+ {
5164
+ "epoch": 0.32353088022367565,
5165
+ "grad_norm": 93.21934509277344,
5166
+ "learning_rate": 9.987362083754887e-06,
5167
+ "loss": 11.1913,
5168
+ "step": 7290
5169
+ },
5170
+ {
5171
+ "epoch": 0.32397468115676714,
5172
+ "grad_norm": 102.5538101196289,
5173
+ "learning_rate": 9.98734474779296e-06,
5174
+ "loss": 11.5974,
5175
+ "step": 7300
5176
+ },
5177
+ {
5178
+ "epoch": 0.3244184820898586,
5179
+ "grad_norm": 96.53839874267578,
5180
+ "learning_rate": 9.987327411831033e-06,
5181
+ "loss": 11.6911,
5182
+ "step": 7310
5183
+ },
5184
+ {
5185
+ "epoch": 0.32486228302295006,
5186
+ "grad_norm": 94.2166519165039,
5187
+ "learning_rate": 9.987310075869104e-06,
5188
+ "loss": 11.4638,
5189
+ "step": 7320
5190
+ },
5191
+ {
5192
+ "epoch": 0.3253060839560415,
5193
+ "grad_norm": 100.33325958251953,
5194
+ "learning_rate": 9.987292739907177e-06,
5195
+ "loss": 11.3276,
5196
+ "step": 7330
5197
+ },
5198
+ {
5199
+ "epoch": 0.325749884889133,
5200
+ "grad_norm": 94.38825988769531,
5201
+ "learning_rate": 9.98727540394525e-06,
5202
+ "loss": 11.408,
5203
+ "step": 7340
5204
+ },
5205
+ {
5206
+ "epoch": 0.32619368582222447,
5207
+ "grad_norm": 106.76789855957031,
5208
+ "learning_rate": 9.987258067983322e-06,
5209
+ "loss": 12.0534,
5210
+ "step": 7350
5211
+ },
5212
+ {
5213
+ "epoch": 0.3266374867553159,
5214
+ "grad_norm": 92.51216888427734,
5215
+ "learning_rate": 9.987240732021395e-06,
5216
+ "loss": 11.0724,
5217
+ "step": 7360
5218
+ },
5219
+ {
5220
+ "epoch": 0.3270812876884074,
5221
+ "grad_norm": 90.04621124267578,
5222
+ "learning_rate": 9.987223396059468e-06,
5223
+ "loss": 11.682,
5224
+ "step": 7370
5225
+ },
5226
+ {
5227
+ "epoch": 0.3275250886214988,
5228
+ "grad_norm": 112.20062255859375,
5229
+ "learning_rate": 9.98720606009754e-06,
5230
+ "loss": 11.4545,
5231
+ "step": 7380
5232
+ },
5233
+ {
5234
+ "epoch": 0.3279688895545903,
5235
+ "grad_norm": 98.15010833740234,
5236
+ "learning_rate": 9.987188724135613e-06,
5237
+ "loss": 11.6855,
5238
+ "step": 7390
5239
+ },
5240
+ {
5241
+ "epoch": 0.32841269048768174,
5242
+ "grad_norm": 79.66431427001953,
5243
+ "learning_rate": 9.987171388173686e-06,
5244
+ "loss": 11.2667,
5245
+ "step": 7400
5246
+ },
5247
+ {
5248
+ "epoch": 0.3288564914207732,
5249
+ "grad_norm": 103.56822204589844,
5250
+ "learning_rate": 9.987154052211757e-06,
5251
+ "loss": 11.5097,
5252
+ "step": 7410
5253
+ },
5254
+ {
5255
+ "epoch": 0.32930029235386465,
5256
+ "grad_norm": 101.66146850585938,
5257
+ "learning_rate": 9.98713671624983e-06,
5258
+ "loss": 11.2951,
5259
+ "step": 7420
5260
+ },
5261
+ {
5262
+ "epoch": 0.32974409328695614,
5263
+ "grad_norm": 95.05819702148438,
5264
+ "learning_rate": 9.987119380287903e-06,
5265
+ "loss": 11.6909,
5266
+ "step": 7430
5267
+ },
5268
+ {
5269
+ "epoch": 0.3301878942200476,
5270
+ "grad_norm": 97.97879028320312,
5271
+ "learning_rate": 9.987102044325975e-06,
5272
+ "loss": 10.9043,
5273
+ "step": 7440
5274
+ },
5275
+ {
5276
+ "epoch": 0.33063169515313906,
5277
+ "grad_norm": 139.6067352294922,
5278
+ "learning_rate": 9.987084708364048e-06,
5279
+ "loss": 11.3615,
5280
+ "step": 7450
5281
+ },
5282
+ {
5283
+ "epoch": 0.33107549608623055,
5284
+ "grad_norm": 106.51782989501953,
5285
+ "learning_rate": 9.98706737240212e-06,
5286
+ "loss": 11.3788,
5287
+ "step": 7460
5288
+ },
5289
+ {
5290
+ "epoch": 0.331519297019322,
5291
+ "grad_norm": 89.64512634277344,
5292
+ "learning_rate": 9.987050036440192e-06,
5293
+ "loss": 11.7731,
5294
+ "step": 7470
5295
+ },
5296
+ {
5297
+ "epoch": 0.33196309795241347,
5298
+ "grad_norm": 90.49308776855469,
5299
+ "learning_rate": 9.987032700478265e-06,
5300
+ "loss": 11.8977,
5301
+ "step": 7480
5302
+ },
5303
+ {
5304
+ "epoch": 0.3324068988855049,
5305
+ "grad_norm": 103.32622528076172,
5306
+ "learning_rate": 9.987015364516338e-06,
5307
+ "loss": 11.5663,
5308
+ "step": 7490
5309
+ },
5310
+ {
5311
+ "epoch": 0.3328506998185964,
5312
+ "grad_norm": 88.08988189697266,
5313
+ "learning_rate": 9.98699802855441e-06,
5314
+ "loss": 11.8236,
5315
+ "step": 7500
5316
  }
5317
  ],
5318
  "logging_steps": 10,
 
5332
  "attributes": {}
5333
  }
5334
  },
5335
+ "total_flos": 2.617292403769344e+18,
5336
  "train_batch_size": 4,
5337
  "trial_name": null,
5338
  "trial_params": null