CocoRoF commited on
Commit
0076792
·
verified ·
1 Parent(s): dfd5c89

Training in progress, step 3000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:93c68d4f57aecd5e349b5c7323df8de547b94fc82347524cba7280d76e0e7875
3
  size 791869518
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:266509c575dfdf72a458e7b2d84b76257d04155a692ce14c2d7de35a15a19280
3
  size 791869518
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ee24b7c1f0faf41adb006e45d6263282d13f5f99cdfbeb8ec28e041f2947ac7d
3
  size 2375752250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83d9948277ff1a200f25a50447020e053cceb593a430ae25d078c5789fa9e9b9
3
  size 2375752250
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:69ec6e3926fa071bede113523efa3dc6e630c3c7958c54a9ca321cf4d62ed145
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb18ac8d6db3307b1c242f7cb069fc8b8dab957434ddfcafcac997cfd6a43abf
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6127ee4f0c13500ec5038fce65af8f7beec63c137c7d4b7c157aa6303cf5879
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bdab708057b5f34a402d9a2b4443f5f93a8e8ee2ddb66d955f0a15ad394ecc5
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:da01d1c5eb2cc3a323f97c1f590d13ccfac2a4c5b1479bd378b4e643304f5a4f
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:599882a30c163a5a2a000c4e74b320ecc4a55aa1b079882fd66aa3d2559d19e7
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:49a3f04d76c0d3acc7d3dd95a04215f368f35a451ae8cba8a2fdba38cda9ca0a
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:567c3b482c209c2778fc017e39a38642c488edda20673ef29f571ef7177ad81e
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:df7d2c9825dba80cb544920f8cc0c72122f96514e6cd259052a8765b034393e2
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f9ffe9a916e778423aaed4ec842923c9ccfdd3d7a4fbad10dc6a3bfc278fb8e
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a20a42d44ff48cc162224010190e898fe28598ddad8cd1896d330a3bb1d8ec3
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7ede8a81aa3c780fb9c3cb57537752a782c4aed1dcecb7aafd6ca5a7ea90252
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:18ac0dc4f09f25179860561fcea7c5c8f997aabdc46a170665f9dc5a72bc27c6
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b1c5c0c0afa907d332467e631e6cee80ba476689aa0caa77689ca273d83b3e4
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a16fcb5411ff961b47eff7378d85105fe9837e0492d19ea5ce3b7c4b77aa3b6
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73025ac422abb13303ee974109cf39f6f848de7f7013e828d04aa4e2ec0e6757
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e7219278083dbc646fe72946ab4301102da8206fb4979ac90c21b38ce89792e7
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a82dd56e5d1fd0f536eca0ab3c8df16cc4dea4fcf65ca478171fc5290a211afc
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5779386373651777,
5
  "eval_steps": 500,
6
- "global_step": 2500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3547,6 +3547,714 @@
3547
  "eval_samples_per_second": 610.776,
3548
  "eval_steps_per_second": 38.174,
3549
  "step": 2500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3550
  }
3551
  ],
3552
  "logging_steps": 5,
@@ -3566,7 +4274,7 @@
3566
  "attributes": {}
3567
  }
3568
  },
3569
- "total_flos": 1.0830894981971968e+19,
3570
  "train_batch_size": 4,
3571
  "trial_name": null,
3572
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.6935263648382133,
5
  "eval_steps": 500,
6
+ "global_step": 3000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3547
  "eval_samples_per_second": 610.776,
3548
  "eval_steps_per_second": 38.174,
3549
  "step": 2500
3550
+ },
3551
+ {
3552
+ "epoch": 0.5790945146399081,
3553
+ "grad_norm": 172.125,
3554
+ "learning_rate": 4.676258992805755e-06,
3555
+ "loss": 69.669,
3556
+ "step": 2505
3557
+ },
3558
+ {
3559
+ "epoch": 0.5802503919146385,
3560
+ "grad_norm": 157.25,
3561
+ "learning_rate": 4.663412127440905e-06,
3562
+ "loss": 70.5345,
3563
+ "step": 2510
3564
+ },
3565
+ {
3566
+ "epoch": 0.5814062691893688,
3567
+ "grad_norm": 152.375,
3568
+ "learning_rate": 4.650565262076054e-06,
3569
+ "loss": 68.8704,
3570
+ "step": 2515
3571
+ },
3572
+ {
3573
+ "epoch": 0.5825621464640992,
3574
+ "grad_norm": 182.375,
3575
+ "learning_rate": 4.637718396711203e-06,
3576
+ "loss": 68.8443,
3577
+ "step": 2520
3578
+ },
3579
+ {
3580
+ "epoch": 0.5837180237388295,
3581
+ "grad_norm": 174.125,
3582
+ "learning_rate": 4.624871531346352e-06,
3583
+ "loss": 69.324,
3584
+ "step": 2525
3585
+ },
3586
+ {
3587
+ "epoch": 0.5848739010135598,
3588
+ "grad_norm": 170.375,
3589
+ "learning_rate": 4.612024665981501e-06,
3590
+ "loss": 69.2429,
3591
+ "step": 2530
3592
+ },
3593
+ {
3594
+ "epoch": 0.5860297782882903,
3595
+ "grad_norm": 162.875,
3596
+ "learning_rate": 4.59917780061665e-06,
3597
+ "loss": 68.8715,
3598
+ "step": 2535
3599
+ },
3600
+ {
3601
+ "epoch": 0.5871856555630206,
3602
+ "grad_norm": 154.875,
3603
+ "learning_rate": 4.586330935251799e-06,
3604
+ "loss": 70.5938,
3605
+ "step": 2540
3606
+ },
3607
+ {
3608
+ "epoch": 0.5883415328377509,
3609
+ "grad_norm": 164.125,
3610
+ "learning_rate": 4.5734840698869486e-06,
3611
+ "loss": 69.1381,
3612
+ "step": 2545
3613
+ },
3614
+ {
3615
+ "epoch": 0.5894974101124814,
3616
+ "grad_norm": 168.5,
3617
+ "learning_rate": 4.560637204522097e-06,
3618
+ "loss": 68.9723,
3619
+ "step": 2550
3620
+ },
3621
+ {
3622
+ "epoch": 0.5906532873872117,
3623
+ "grad_norm": 157.125,
3624
+ "learning_rate": 4.547790339157246e-06,
3625
+ "loss": 68.8563,
3626
+ "step": 2555
3627
+ },
3628
+ {
3629
+ "epoch": 0.591809164661942,
3630
+ "grad_norm": 170.125,
3631
+ "learning_rate": 4.534943473792395e-06,
3632
+ "loss": 69.5333,
3633
+ "step": 2560
3634
+ },
3635
+ {
3636
+ "epoch": 0.5929650419366723,
3637
+ "grad_norm": 168.75,
3638
+ "learning_rate": 4.522096608427544e-06,
3639
+ "loss": 68.3743,
3640
+ "step": 2565
3641
+ },
3642
+ {
3643
+ "epoch": 0.5941209192114028,
3644
+ "grad_norm": 161.5,
3645
+ "learning_rate": 4.509249743062693e-06,
3646
+ "loss": 68.4732,
3647
+ "step": 2570
3648
+ },
3649
+ {
3650
+ "epoch": 0.5952767964861331,
3651
+ "grad_norm": 177.375,
3652
+ "learning_rate": 4.496402877697842e-06,
3653
+ "loss": 69.9702,
3654
+ "step": 2575
3655
+ },
3656
+ {
3657
+ "epoch": 0.5964326737608634,
3658
+ "grad_norm": 160.5,
3659
+ "learning_rate": 4.483556012332991e-06,
3660
+ "loss": 69.5753,
3661
+ "step": 2580
3662
+ },
3663
+ {
3664
+ "epoch": 0.5975885510355938,
3665
+ "grad_norm": 170.625,
3666
+ "learning_rate": 4.47070914696814e-06,
3667
+ "loss": 69.2494,
3668
+ "step": 2585
3669
+ },
3670
+ {
3671
+ "epoch": 0.5987444283103242,
3672
+ "grad_norm": 160.5,
3673
+ "learning_rate": 4.457862281603289e-06,
3674
+ "loss": 69.5917,
3675
+ "step": 2590
3676
+ },
3677
+ {
3678
+ "epoch": 0.5999003055850545,
3679
+ "grad_norm": 157.875,
3680
+ "learning_rate": 4.445015416238438e-06,
3681
+ "loss": 69.0619,
3682
+ "step": 2595
3683
+ },
3684
+ {
3685
+ "epoch": 0.6010561828597849,
3686
+ "grad_norm": 150.875,
3687
+ "learning_rate": 4.432168550873588e-06,
3688
+ "loss": 69.3003,
3689
+ "step": 2600
3690
+ },
3691
+ {
3692
+ "epoch": 0.6022120601345152,
3693
+ "grad_norm": 163.125,
3694
+ "learning_rate": 4.4193216855087364e-06,
3695
+ "loss": 68.8558,
3696
+ "step": 2605
3697
+ },
3698
+ {
3699
+ "epoch": 0.6033679374092455,
3700
+ "grad_norm": 168.375,
3701
+ "learning_rate": 4.406474820143885e-06,
3702
+ "loss": 70.8878,
3703
+ "step": 2610
3704
+ },
3705
+ {
3706
+ "epoch": 0.604523814683976,
3707
+ "grad_norm": 157.25,
3708
+ "learning_rate": 4.393627954779034e-06,
3709
+ "loss": 69.8354,
3710
+ "step": 2615
3711
+ },
3712
+ {
3713
+ "epoch": 0.6056796919587063,
3714
+ "grad_norm": 154.625,
3715
+ "learning_rate": 4.380781089414183e-06,
3716
+ "loss": 69.2094,
3717
+ "step": 2620
3718
+ },
3719
+ {
3720
+ "epoch": 0.6068355692334366,
3721
+ "grad_norm": 178.875,
3722
+ "learning_rate": 4.367934224049332e-06,
3723
+ "loss": 69.4816,
3724
+ "step": 2625
3725
+ },
3726
+ {
3727
+ "epoch": 0.6079914465081669,
3728
+ "grad_norm": 148.5,
3729
+ "learning_rate": 4.355087358684481e-06,
3730
+ "loss": 69.5445,
3731
+ "step": 2630
3732
+ },
3733
+ {
3734
+ "epoch": 0.6091473237828974,
3735
+ "grad_norm": 189.375,
3736
+ "learning_rate": 4.34224049331963e-06,
3737
+ "loss": 69.7445,
3738
+ "step": 2635
3739
+ },
3740
+ {
3741
+ "epoch": 0.6103032010576277,
3742
+ "grad_norm": 163.375,
3743
+ "learning_rate": 4.329393627954779e-06,
3744
+ "loss": 70.1707,
3745
+ "step": 2640
3746
+ },
3747
+ {
3748
+ "epoch": 0.611459078332358,
3749
+ "grad_norm": 159.625,
3750
+ "learning_rate": 4.316546762589928e-06,
3751
+ "loss": 70.575,
3752
+ "step": 2645
3753
+ },
3754
+ {
3755
+ "epoch": 0.6126149556070885,
3756
+ "grad_norm": 163.875,
3757
+ "learning_rate": 4.303699897225078e-06,
3758
+ "loss": 68.2103,
3759
+ "step": 2650
3760
+ },
3761
+ {
3762
+ "epoch": 0.6137708328818188,
3763
+ "grad_norm": 173.25,
3764
+ "learning_rate": 4.290853031860227e-06,
3765
+ "loss": 68.2952,
3766
+ "step": 2655
3767
+ },
3768
+ {
3769
+ "epoch": 0.6149267101565491,
3770
+ "grad_norm": 173.75,
3771
+ "learning_rate": 4.2780061664953754e-06,
3772
+ "loss": 68.7904,
3773
+ "step": 2660
3774
+ },
3775
+ {
3776
+ "epoch": 0.6160825874312795,
3777
+ "grad_norm": 163.875,
3778
+ "learning_rate": 4.265159301130524e-06,
3779
+ "loss": 70.0719,
3780
+ "step": 2665
3781
+ },
3782
+ {
3783
+ "epoch": 0.6172384647060098,
3784
+ "grad_norm": 180.625,
3785
+ "learning_rate": 4.252312435765674e-06,
3786
+ "loss": 68.3693,
3787
+ "step": 2670
3788
+ },
3789
+ {
3790
+ "epoch": 0.6183943419807402,
3791
+ "grad_norm": 154.5,
3792
+ "learning_rate": 4.239465570400823e-06,
3793
+ "loss": 68.8033,
3794
+ "step": 2675
3795
+ },
3796
+ {
3797
+ "epoch": 0.6195502192554706,
3798
+ "grad_norm": 160.25,
3799
+ "learning_rate": 4.226618705035972e-06,
3800
+ "loss": 67.7923,
3801
+ "step": 2680
3802
+ },
3803
+ {
3804
+ "epoch": 0.6207060965302009,
3805
+ "grad_norm": 159.5,
3806
+ "learning_rate": 4.2137718396711205e-06,
3807
+ "loss": 69.2746,
3808
+ "step": 2685
3809
+ },
3810
+ {
3811
+ "epoch": 0.6218619738049312,
3812
+ "grad_norm": 164.0,
3813
+ "learning_rate": 4.200924974306269e-06,
3814
+ "loss": 68.537,
3815
+ "step": 2690
3816
+ },
3817
+ {
3818
+ "epoch": 0.6230178510796616,
3819
+ "grad_norm": 156.25,
3820
+ "learning_rate": 4.188078108941418e-06,
3821
+ "loss": 68.6553,
3822
+ "step": 2695
3823
+ },
3824
+ {
3825
+ "epoch": 0.624173728354392,
3826
+ "grad_norm": 158.625,
3827
+ "learning_rate": 4.175231243576567e-06,
3828
+ "loss": 68.8911,
3829
+ "step": 2700
3830
+ },
3831
+ {
3832
+ "epoch": 0.6253296056291223,
3833
+ "grad_norm": 166.25,
3834
+ "learning_rate": 4.162384378211717e-06,
3835
+ "loss": 69.7975,
3836
+ "step": 2705
3837
+ },
3838
+ {
3839
+ "epoch": 0.6264854829038526,
3840
+ "grad_norm": 160.25,
3841
+ "learning_rate": 4.149537512846866e-06,
3842
+ "loss": 69.9966,
3843
+ "step": 2710
3844
+ },
3845
+ {
3846
+ "epoch": 0.6276413601785831,
3847
+ "grad_norm": 167.0,
3848
+ "learning_rate": 4.1366906474820145e-06,
3849
+ "loss": 70.2918,
3850
+ "step": 2715
3851
+ },
3852
+ {
3853
+ "epoch": 0.6287972374533134,
3854
+ "grad_norm": 156.5,
3855
+ "learning_rate": 4.123843782117164e-06,
3856
+ "loss": 67.3517,
3857
+ "step": 2720
3858
+ },
3859
+ {
3860
+ "epoch": 0.6299531147280437,
3861
+ "grad_norm": 171.25,
3862
+ "learning_rate": 4.110996916752313e-06,
3863
+ "loss": 68.5519,
3864
+ "step": 2725
3865
+ },
3866
+ {
3867
+ "epoch": 0.6311089920027741,
3868
+ "grad_norm": 178.75,
3869
+ "learning_rate": 4.098150051387462e-06,
3870
+ "loss": 69.3377,
3871
+ "step": 2730
3872
+ },
3873
+ {
3874
+ "epoch": 0.6322648692775045,
3875
+ "grad_norm": 167.875,
3876
+ "learning_rate": 4.085303186022611e-06,
3877
+ "loss": 68.7498,
3878
+ "step": 2735
3879
+ },
3880
+ {
3881
+ "epoch": 0.6334207465522348,
3882
+ "grad_norm": 164.875,
3883
+ "learning_rate": 4.07245632065776e-06,
3884
+ "loss": 68.4089,
3885
+ "step": 2740
3886
+ },
3887
+ {
3888
+ "epoch": 0.6345766238269652,
3889
+ "grad_norm": 171.5,
3890
+ "learning_rate": 4.059609455292909e-06,
3891
+ "loss": 68.4604,
3892
+ "step": 2745
3893
+ },
3894
+ {
3895
+ "epoch": 0.6357325011016955,
3896
+ "grad_norm": 169.5,
3897
+ "learning_rate": 4.046762589928058e-06,
3898
+ "loss": 69.7823,
3899
+ "step": 2750
3900
+ },
3901
+ {
3902
+ "epoch": 0.6368883783764259,
3903
+ "grad_norm": 167.125,
3904
+ "learning_rate": 4.033915724563207e-06,
3905
+ "loss": 68.461,
3906
+ "step": 2755
3907
+ },
3908
+ {
3909
+ "epoch": 0.6380442556511562,
3910
+ "grad_norm": 157.75,
3911
+ "learning_rate": 4.021068859198356e-06,
3912
+ "loss": 69.8009,
3913
+ "step": 2760
3914
+ },
3915
+ {
3916
+ "epoch": 0.6392001329258866,
3917
+ "grad_norm": 153.25,
3918
+ "learning_rate": 4.008221993833505e-06,
3919
+ "loss": 69.0233,
3920
+ "step": 2765
3921
+ },
3922
+ {
3923
+ "epoch": 0.6403560102006169,
3924
+ "grad_norm": 153.375,
3925
+ "learning_rate": 3.9953751284686535e-06,
3926
+ "loss": 69.8202,
3927
+ "step": 2770
3928
+ },
3929
+ {
3930
+ "epoch": 0.6415118874753473,
3931
+ "grad_norm": 152.25,
3932
+ "learning_rate": 3.982528263103803e-06,
3933
+ "loss": 70.0482,
3934
+ "step": 2775
3935
+ },
3936
+ {
3937
+ "epoch": 0.6426677647500777,
3938
+ "grad_norm": 151.0,
3939
+ "learning_rate": 3.969681397738952e-06,
3940
+ "loss": 68.4574,
3941
+ "step": 2780
3942
+ },
3943
+ {
3944
+ "epoch": 0.643823642024808,
3945
+ "grad_norm": 153.75,
3946
+ "learning_rate": 3.956834532374101e-06,
3947
+ "loss": 68.9972,
3948
+ "step": 2785
3949
+ },
3950
+ {
3951
+ "epoch": 0.6449795192995383,
3952
+ "grad_norm": 155.0,
3953
+ "learning_rate": 3.9439876670092506e-06,
3954
+ "loss": 69.0547,
3955
+ "step": 2790
3956
+ },
3957
+ {
3958
+ "epoch": 0.6461353965742688,
3959
+ "grad_norm": 165.75,
3960
+ "learning_rate": 3.931140801644399e-06,
3961
+ "loss": 69.5258,
3962
+ "step": 2795
3963
+ },
3964
+ {
3965
+ "epoch": 0.6472912738489991,
3966
+ "grad_norm": 148.625,
3967
+ "learning_rate": 3.918293936279548e-06,
3968
+ "loss": 68.9722,
3969
+ "step": 2800
3970
+ },
3971
+ {
3972
+ "epoch": 0.6484471511237294,
3973
+ "grad_norm": 146.125,
3974
+ "learning_rate": 3.905447070914697e-06,
3975
+ "loss": 69.1489,
3976
+ "step": 2805
3977
+ },
3978
+ {
3979
+ "epoch": 0.6496030283984598,
3980
+ "grad_norm": 167.625,
3981
+ "learning_rate": 3.892600205549846e-06,
3982
+ "loss": 67.9903,
3983
+ "step": 2810
3984
+ },
3985
+ {
3986
+ "epoch": 0.6507589056731902,
3987
+ "grad_norm": 154.0,
3988
+ "learning_rate": 3.879753340184995e-06,
3989
+ "loss": 69.16,
3990
+ "step": 2815
3991
+ },
3992
+ {
3993
+ "epoch": 0.6519147829479205,
3994
+ "grad_norm": 157.25,
3995
+ "learning_rate": 3.866906474820144e-06,
3996
+ "loss": 70.0419,
3997
+ "step": 2820
3998
+ },
3999
+ {
4000
+ "epoch": 0.6530706602226508,
4001
+ "grad_norm": 156.625,
4002
+ "learning_rate": 3.854059609455293e-06,
4003
+ "loss": 69.2703,
4004
+ "step": 2825
4005
+ },
4006
+ {
4007
+ "epoch": 0.6542265374973812,
4008
+ "grad_norm": 153.875,
4009
+ "learning_rate": 3.841212744090442e-06,
4010
+ "loss": 69.3849,
4011
+ "step": 2830
4012
+ },
4013
+ {
4014
+ "epoch": 0.6553824147721116,
4015
+ "grad_norm": 161.125,
4016
+ "learning_rate": 3.828365878725591e-06,
4017
+ "loss": 69.6766,
4018
+ "step": 2835
4019
+ },
4020
+ {
4021
+ "epoch": 0.6565382920468419,
4022
+ "grad_norm": 147.875,
4023
+ "learning_rate": 3.81551901336074e-06,
4024
+ "loss": 67.732,
4025
+ "step": 2840
4026
+ },
4027
+ {
4028
+ "epoch": 0.6576941693215723,
4029
+ "grad_norm": 168.0,
4030
+ "learning_rate": 3.8026721479958896e-06,
4031
+ "loss": 69.0334,
4032
+ "step": 2845
4033
+ },
4034
+ {
4035
+ "epoch": 0.6588500465963026,
4036
+ "grad_norm": 153.75,
4037
+ "learning_rate": 3.7898252826310384e-06,
4038
+ "loss": 68.3081,
4039
+ "step": 2850
4040
+ },
4041
+ {
4042
+ "epoch": 0.660005923871033,
4043
+ "grad_norm": 184.25,
4044
+ "learning_rate": 3.7769784172661873e-06,
4045
+ "loss": 69.0602,
4046
+ "step": 2855
4047
+ },
4048
+ {
4049
+ "epoch": 0.6611618011457634,
4050
+ "grad_norm": 189.75,
4051
+ "learning_rate": 3.7641315519013365e-06,
4052
+ "loss": 67.2991,
4053
+ "step": 2860
4054
+ },
4055
+ {
4056
+ "epoch": 0.6623176784204937,
4057
+ "grad_norm": 169.0,
4058
+ "learning_rate": 3.7512846865364854e-06,
4059
+ "loss": 70.8647,
4060
+ "step": 2865
4061
+ },
4062
+ {
4063
+ "epoch": 0.663473555695224,
4064
+ "grad_norm": 157.75,
4065
+ "learning_rate": 3.7384378211716342e-06,
4066
+ "loss": 68.4063,
4067
+ "step": 2870
4068
+ },
4069
+ {
4070
+ "epoch": 0.6646294329699545,
4071
+ "grad_norm": 176.375,
4072
+ "learning_rate": 3.725590955806783e-06,
4073
+ "loss": 68.3025,
4074
+ "step": 2875
4075
+ },
4076
+ {
4077
+ "epoch": 0.6657853102446848,
4078
+ "grad_norm": 180.0,
4079
+ "learning_rate": 3.7127440904419328e-06,
4080
+ "loss": 68.6778,
4081
+ "step": 2880
4082
+ },
4083
+ {
4084
+ "epoch": 0.6669411875194151,
4085
+ "grad_norm": 168.5,
4086
+ "learning_rate": 3.6998972250770816e-06,
4087
+ "loss": 67.2759,
4088
+ "step": 2885
4089
+ },
4090
+ {
4091
+ "epoch": 0.6680970647941454,
4092
+ "grad_norm": 164.25,
4093
+ "learning_rate": 3.6870503597122305e-06,
4094
+ "loss": 69.538,
4095
+ "step": 2890
4096
+ },
4097
+ {
4098
+ "epoch": 0.6692529420688759,
4099
+ "grad_norm": 157.625,
4100
+ "learning_rate": 3.6742034943473797e-06,
4101
+ "loss": 68.8391,
4102
+ "step": 2895
4103
+ },
4104
+ {
4105
+ "epoch": 0.6704088193436062,
4106
+ "grad_norm": 171.625,
4107
+ "learning_rate": 3.6613566289825286e-06,
4108
+ "loss": 68.3289,
4109
+ "step": 2900
4110
+ },
4111
+ {
4112
+ "epoch": 0.6715646966183365,
4113
+ "grad_norm": 161.375,
4114
+ "learning_rate": 3.6485097636176774e-06,
4115
+ "loss": 67.6343,
4116
+ "step": 2905
4117
+ },
4118
+ {
4119
+ "epoch": 0.6727205738930669,
4120
+ "grad_norm": 159.0,
4121
+ "learning_rate": 3.6356628982528263e-06,
4122
+ "loss": 67.9106,
4123
+ "step": 2910
4124
+ },
4125
+ {
4126
+ "epoch": 0.6738764511677973,
4127
+ "grad_norm": 148.625,
4128
+ "learning_rate": 3.622816032887976e-06,
4129
+ "loss": 68.7488,
4130
+ "step": 2915
4131
+ },
4132
+ {
4133
+ "epoch": 0.6750323284425276,
4134
+ "grad_norm": 172.0,
4135
+ "learning_rate": 3.609969167523125e-06,
4136
+ "loss": 68.8937,
4137
+ "step": 2920
4138
+ },
4139
+ {
4140
+ "epoch": 0.676188205717258,
4141
+ "grad_norm": 169.875,
4142
+ "learning_rate": 3.5971223021582737e-06,
4143
+ "loss": 68.0938,
4144
+ "step": 2925
4145
+ },
4146
+ {
4147
+ "epoch": 0.6773440829919883,
4148
+ "grad_norm": 161.25,
4149
+ "learning_rate": 3.584275436793423e-06,
4150
+ "loss": 68.6723,
4151
+ "step": 2930
4152
+ },
4153
+ {
4154
+ "epoch": 0.6784999602667187,
4155
+ "grad_norm": 159.0,
4156
+ "learning_rate": 3.5714285714285718e-06,
4157
+ "loss": 68.2037,
4158
+ "step": 2935
4159
+ },
4160
+ {
4161
+ "epoch": 0.679655837541449,
4162
+ "grad_norm": 165.875,
4163
+ "learning_rate": 3.5585817060637206e-06,
4164
+ "loss": 68.3861,
4165
+ "step": 2940
4166
+ },
4167
+ {
4168
+ "epoch": 0.6808117148161794,
4169
+ "grad_norm": 160.625,
4170
+ "learning_rate": 3.5457348406988695e-06,
4171
+ "loss": 67.6351,
4172
+ "step": 2945
4173
+ },
4174
+ {
4175
+ "epoch": 0.6819675920909097,
4176
+ "grad_norm": 172.625,
4177
+ "learning_rate": 3.5328879753340187e-06,
4178
+ "loss": 68.8567,
4179
+ "step": 2950
4180
+ },
4181
+ {
4182
+ "epoch": 0.6831234693656401,
4183
+ "grad_norm": 185.125,
4184
+ "learning_rate": 3.520041109969168e-06,
4185
+ "loss": 66.5128,
4186
+ "step": 2955
4187
+ },
4188
+ {
4189
+ "epoch": 0.6842793466403705,
4190
+ "grad_norm": 157.75,
4191
+ "learning_rate": 3.507194244604317e-06,
4192
+ "loss": 68.0714,
4193
+ "step": 2960
4194
+ },
4195
+ {
4196
+ "epoch": 0.6854352239151008,
4197
+ "grad_norm": 151.375,
4198
+ "learning_rate": 3.494347379239466e-06,
4199
+ "loss": 67.9339,
4200
+ "step": 2965
4201
+ },
4202
+ {
4203
+ "epoch": 0.6865911011898311,
4204
+ "grad_norm": 164.25,
4205
+ "learning_rate": 3.481500513874615e-06,
4206
+ "loss": 68.5463,
4207
+ "step": 2970
4208
+ },
4209
+ {
4210
+ "epoch": 0.6877469784645616,
4211
+ "grad_norm": 138.0,
4212
+ "learning_rate": 3.468653648509764e-06,
4213
+ "loss": 69.1578,
4214
+ "step": 2975
4215
+ },
4216
+ {
4217
+ "epoch": 0.6889028557392919,
4218
+ "grad_norm": 200.75,
4219
+ "learning_rate": 3.4558067831449127e-06,
4220
+ "loss": 68.674,
4221
+ "step": 2980
4222
+ },
4223
+ {
4224
+ "epoch": 0.6900587330140222,
4225
+ "grad_norm": 149.75,
4226
+ "learning_rate": 3.442959917780062e-06,
4227
+ "loss": 68.8552,
4228
+ "step": 2985
4229
+ },
4230
+ {
4231
+ "epoch": 0.6912146102887526,
4232
+ "grad_norm": 165.125,
4233
+ "learning_rate": 3.430113052415211e-06,
4234
+ "loss": 68.541,
4235
+ "step": 2990
4236
+ },
4237
+ {
4238
+ "epoch": 0.692370487563483,
4239
+ "grad_norm": 175.625,
4240
+ "learning_rate": 3.4172661870503596e-06,
4241
+ "loss": 68.8684,
4242
+ "step": 2995
4243
+ },
4244
+ {
4245
+ "epoch": 0.6935263648382133,
4246
+ "grad_norm": 165.5,
4247
+ "learning_rate": 3.4044193216855093e-06,
4248
+ "loss": 68.3212,
4249
+ "step": 3000
4250
+ },
4251
+ {
4252
+ "epoch": 0.6935263648382133,
4253
+ "eval_loss": NaN,
4254
+ "eval_runtime": 380.967,
4255
+ "eval_samples_per_second": 611.953,
4256
+ "eval_steps_per_second": 38.247,
4257
+ "step": 3000
4258
  }
4259
  ],
4260
  "logging_steps": 5,
 
4274
  "attributes": {}
4275
  }
4276
  },
4277
+ "total_flos": 1.2997073978366362e+19,
4278
  "train_batch_size": 4,
4279
  "trial_name": null,
4280
  "trial_params": null