Plofski committed on
Commit
f65ea43
·
verified ·
1 Parent(s): f1b9a74

Training in progress, step 4500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:17ab6fbe9c97d82ef7dac860e0afd63f233555e8f23a9fd5286c2c92aa0de809
3
  size 536223056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84f66cc33d9cd5915476a96d4590c19f424c7a30752f1c8fbfea7813b99ddcec
3
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a2fdccc0924c16c14bbca889730272d2d9adcc2fdeb5cc2188b22634e6a65ba6
3
  size 1072594443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff96785f117afab51789d3a95126f7a57e937335d9e00258dde9f7269e32c788
3
  size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b9d0e16227a53d102f718b321b6ebc380604ad5e862513fc6df0711cea1a67f
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b42a9ebffc25267408092ad255514977530cb80117fd8185edcad8326726d7b8
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.8059641345960105,
6
  "eval_steps": 500,
7
- "global_step": 4000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3608,6 +3608,456 @@
3608
  "mean_token_accuracy": 0.7772108554840088,
3609
  "num_tokens": 4430041.0,
3610
  "step": 4000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3611
  }
3612
  ],
3613
  "logging_steps": 10,
@@ -3627,7 +4077,7 @@
3627
  "attributes": {}
3628
  }
3629
  },
3630
- "total_flos": 5359648531077120.0,
3631
  "train_batch_size": 8,
3632
  "trial_name": null,
3633
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.9067096514205117,
6
  "eval_steps": 500,
7
+ "global_step": 4500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3608
  "mean_token_accuracy": 0.7772108554840088,
3609
  "num_tokens": 4430041.0,
3610
  "step": 4000
3611
+ },
3612
+ {
3613
+ "epoch": 0.8079790449325005,
3614
+ "grad_norm": 11.8125,
3615
+ "learning_rate": 1.461481630734099e-05,
3616
+ "loss": 0.803,
3617
+ "mean_token_accuracy": 0.7974342584609986,
3618
+ "num_tokens": 4440795.0,
3619
+ "step": 4010
3620
+ },
3621
+ {
3622
+ "epoch": 0.8099939552689905,
3623
+ "grad_norm": 9.625,
3624
+ "learning_rate": 1.460138357176439e-05,
3625
+ "loss": 0.885,
3626
+ "mean_token_accuracy": 0.7806336939334869,
3627
+ "num_tokens": 4452744.0,
3628
+ "step": 4020
3629
+ },
3630
+ {
3631
+ "epoch": 0.8120088656054806,
3632
+ "grad_norm": 10.5625,
3633
+ "learning_rate": 1.458795083618779e-05,
3634
+ "loss": 0.9102,
3635
+ "mean_token_accuracy": 0.7730875134468078,
3636
+ "num_tokens": 4463539.0,
3637
+ "step": 4030
3638
+ },
3639
+ {
3640
+ "epoch": 0.8140237759419706,
3641
+ "grad_norm": 11.0,
3642
+ "learning_rate": 1.457451810061119e-05,
3643
+ "loss": 0.8561,
3644
+ "mean_token_accuracy": 0.7935479283332825,
3645
+ "num_tokens": 4474515.0,
3646
+ "step": 4040
3647
+ },
3648
+ {
3649
+ "epoch": 0.8160386862784607,
3650
+ "grad_norm": 12.8125,
3651
+ "learning_rate": 1.456108536503459e-05,
3652
+ "loss": 0.9461,
3653
+ "mean_token_accuracy": 0.7769980370998383,
3654
+ "num_tokens": 4485302.0,
3655
+ "step": 4050
3656
+ },
3657
+ {
3658
+ "epoch": 0.8180535966149506,
3659
+ "grad_norm": 11.125,
3660
+ "learning_rate": 1.4547652629457991e-05,
3661
+ "loss": 0.9002,
3662
+ "mean_token_accuracy": 0.7809113264083862,
3663
+ "num_tokens": 4497764.0,
3664
+ "step": 4060
3665
+ },
3666
+ {
3667
+ "epoch": 0.8200685069514406,
3668
+ "grad_norm": 12.4375,
3669
+ "learning_rate": 1.453421989388139e-05,
3670
+ "loss": 0.8366,
3671
+ "mean_token_accuracy": 0.798139876127243,
3672
+ "num_tokens": 4508154.0,
3673
+ "step": 4070
3674
+ },
3675
+ {
3676
+ "epoch": 0.8220834172879307,
3677
+ "grad_norm": 13.4375,
3678
+ "learning_rate": 1.452078715830479e-05,
3679
+ "loss": 0.8682,
3680
+ "mean_token_accuracy": 0.790400379896164,
3681
+ "num_tokens": 4518543.0,
3682
+ "step": 4080
3683
+ },
3684
+ {
3685
+ "epoch": 0.8240983276244207,
3686
+ "grad_norm": 10.5625,
3687
+ "learning_rate": 1.4507354422728191e-05,
3688
+ "loss": 0.9201,
3689
+ "mean_token_accuracy": 0.7755493521690369,
3690
+ "num_tokens": 4529945.0,
3691
+ "step": 4090
3692
+ },
3693
+ {
3694
+ "epoch": 0.8261132379609107,
3695
+ "grad_norm": 12.625,
3696
+ "learning_rate": 1.4493921687151588e-05,
3697
+ "loss": 0.891,
3698
+ "mean_token_accuracy": 0.7825681924819946,
3699
+ "num_tokens": 4542358.0,
3700
+ "step": 4100
3701
+ },
3702
+ {
3703
+ "epoch": 0.8281281482974008,
3704
+ "grad_norm": 11.25,
3705
+ "learning_rate": 1.4480488951574989e-05,
3706
+ "loss": 0.9354,
3707
+ "mean_token_accuracy": 0.7727735102176666,
3708
+ "num_tokens": 4553817.0,
3709
+ "step": 4110
3710
+ },
3711
+ {
3712
+ "epoch": 0.8301430586338908,
3713
+ "grad_norm": 10.75,
3714
+ "learning_rate": 1.446705621599839e-05,
3715
+ "loss": 0.7511,
3716
+ "mean_token_accuracy": 0.8066883027553559,
3717
+ "num_tokens": 4563944.0,
3718
+ "step": 4120
3719
+ },
3720
+ {
3721
+ "epoch": 0.8321579689703809,
3722
+ "grad_norm": 12.75,
3723
+ "learning_rate": 1.445362348042179e-05,
3724
+ "loss": 0.8766,
3725
+ "mean_token_accuracy": 0.7842302858829499,
3726
+ "num_tokens": 4574025.0,
3727
+ "step": 4130
3728
+ },
3729
+ {
3730
+ "epoch": 0.8341728793068709,
3731
+ "grad_norm": 11.1875,
3732
+ "learning_rate": 1.4440190744845189e-05,
3733
+ "loss": 0.8223,
3734
+ "mean_token_accuracy": 0.8018840789794922,
3735
+ "num_tokens": 4584261.0,
3736
+ "step": 4140
3737
+ },
3738
+ {
3739
+ "epoch": 0.8361877896433608,
3740
+ "grad_norm": 13.375,
3741
+ "learning_rate": 1.442675800926859e-05,
3742
+ "loss": 0.7998,
3743
+ "mean_token_accuracy": 0.7976557493209839,
3744
+ "num_tokens": 4594866.0,
3745
+ "step": 4150
3746
+ },
3747
+ {
3748
+ "epoch": 0.8382026999798509,
3749
+ "grad_norm": 12.0625,
3750
+ "learning_rate": 1.441332527369199e-05,
3751
+ "loss": 0.9108,
3752
+ "mean_token_accuracy": 0.7819365322589874,
3753
+ "num_tokens": 4605638.0,
3754
+ "step": 4160
3755
+ },
3756
+ {
3757
+ "epoch": 0.8402176103163409,
3758
+ "grad_norm": 11.125,
3759
+ "learning_rate": 1.4399892538115387e-05,
3760
+ "loss": 0.8475,
3761
+ "mean_token_accuracy": 0.7921322703361511,
3762
+ "num_tokens": 4617406.0,
3763
+ "step": 4170
3764
+ },
3765
+ {
3766
+ "epoch": 0.8422325206528309,
3767
+ "grad_norm": 12.875,
3768
+ "learning_rate": 1.4386459802538787e-05,
3769
+ "loss": 0.9156,
3770
+ "mean_token_accuracy": 0.7791644930839539,
3771
+ "num_tokens": 4628983.0,
3772
+ "step": 4180
3773
+ },
3774
+ {
3775
+ "epoch": 0.844247430989321,
3776
+ "grad_norm": 12.75,
3777
+ "learning_rate": 1.4373027066962188e-05,
3778
+ "loss": 0.8978,
3779
+ "mean_token_accuracy": 0.7813169062137604,
3780
+ "num_tokens": 4640524.0,
3781
+ "step": 4190
3782
+ },
3783
+ {
3784
+ "epoch": 0.846262341325811,
3785
+ "grad_norm": 12.25,
3786
+ "learning_rate": 1.4359594331385587e-05,
3787
+ "loss": 0.8205,
3788
+ "mean_token_accuracy": 0.7985609114170075,
3789
+ "num_tokens": 4650666.0,
3790
+ "step": 4200
3791
+ },
3792
+ {
3793
+ "epoch": 0.8482772516623011,
3794
+ "grad_norm": 10.625,
3795
+ "learning_rate": 1.4346161595808987e-05,
3796
+ "loss": 0.9088,
3797
+ "mean_token_accuracy": 0.7806391000747681,
3798
+ "num_tokens": 4662823.0,
3799
+ "step": 4210
3800
+ },
3801
+ {
3802
+ "epoch": 0.8502921619987911,
3803
+ "grad_norm": 10.8125,
3804
+ "learning_rate": 1.4332728860232388e-05,
3805
+ "loss": 0.7677,
3806
+ "mean_token_accuracy": 0.8117966473102569,
3807
+ "num_tokens": 4672839.0,
3808
+ "step": 4220
3809
+ },
3810
+ {
3811
+ "epoch": 0.8523070723352811,
3812
+ "grad_norm": 12.1875,
3813
+ "learning_rate": 1.4319296124655788e-05,
3814
+ "loss": 0.8908,
3815
+ "mean_token_accuracy": 0.785522049665451,
3816
+ "num_tokens": 4684351.0,
3817
+ "step": 4230
3818
+ },
3819
+ {
3820
+ "epoch": 0.8543219826717711,
3821
+ "grad_norm": 11.6875,
3822
+ "learning_rate": 1.4305863389079187e-05,
3823
+ "loss": 0.8259,
3824
+ "mean_token_accuracy": 0.7945611894130706,
3825
+ "num_tokens": 4695517.0,
3826
+ "step": 4240
3827
+ },
3828
+ {
3829
+ "epoch": 0.8563368930082611,
3830
+ "grad_norm": 11.75,
3831
+ "learning_rate": 1.4292430653502588e-05,
3832
+ "loss": 0.8182,
3833
+ "mean_token_accuracy": 0.7957002699375153,
3834
+ "num_tokens": 4706066.0,
3835
+ "step": 4250
3836
+ },
3837
+ {
3838
+ "epoch": 0.8583518033447511,
3839
+ "grad_norm": 11.3125,
3840
+ "learning_rate": 1.4278997917925988e-05,
3841
+ "loss": 0.8748,
3842
+ "mean_token_accuracy": 0.7866592228412628,
3843
+ "num_tokens": 4717128.0,
3844
+ "step": 4260
3845
+ },
3846
+ {
3847
+ "epoch": 0.8603667136812412,
3848
+ "grad_norm": 12.125,
3849
+ "learning_rate": 1.4265565182349385e-05,
3850
+ "loss": 0.847,
3851
+ "mean_token_accuracy": 0.7929592907428742,
3852
+ "num_tokens": 4727372.0,
3853
+ "step": 4270
3854
+ },
3855
+ {
3856
+ "epoch": 0.8623816240177312,
3857
+ "grad_norm": 13.5,
3858
+ "learning_rate": 1.4252132446772786e-05,
3859
+ "loss": 0.8527,
3860
+ "mean_token_accuracy": 0.7962908685207367,
3861
+ "num_tokens": 4739316.0,
3862
+ "step": 4280
3863
+ },
3864
+ {
3865
+ "epoch": 0.8643965343542213,
3866
+ "grad_norm": 13.5,
3867
+ "learning_rate": 1.4238699711196186e-05,
3868
+ "loss": 0.9276,
3869
+ "mean_token_accuracy": 0.7727067172527313,
3870
+ "num_tokens": 4750907.0,
3871
+ "step": 4290
3872
+ },
3873
+ {
3874
+ "epoch": 0.8664114446907113,
3875
+ "grad_norm": 11.0625,
3876
+ "learning_rate": 1.4225266975619587e-05,
3877
+ "loss": 0.8325,
3878
+ "mean_token_accuracy": 0.7947500467300415,
3879
+ "num_tokens": 4761642.0,
3880
+ "step": 4300
3881
+ },
3882
+ {
3883
+ "epoch": 0.8684263550272013,
3884
+ "grad_norm": 12.6875,
3885
+ "learning_rate": 1.4211834240042986e-05,
3886
+ "loss": 0.9264,
3887
+ "mean_token_accuracy": 0.7780522584915162,
3888
+ "num_tokens": 4772339.0,
3889
+ "step": 4310
3890
+ },
3891
+ {
3892
+ "epoch": 0.8704412653636913,
3893
+ "grad_norm": 10.9375,
3894
+ "learning_rate": 1.4198401504466386e-05,
3895
+ "loss": 0.8859,
3896
+ "mean_token_accuracy": 0.7833628177642822,
3897
+ "num_tokens": 4782957.0,
3898
+ "step": 4320
3899
+ },
3900
+ {
3901
+ "epoch": 0.8724561757001813,
3902
+ "grad_norm": 10.375,
3903
+ "learning_rate": 1.4184968768889787e-05,
3904
+ "loss": 0.9069,
3905
+ "mean_token_accuracy": 0.776193904876709,
3906
+ "num_tokens": 4794205.0,
3907
+ "step": 4330
3908
+ },
3909
+ {
3910
+ "epoch": 0.8744710860366713,
3911
+ "grad_norm": 10.4375,
3912
+ "learning_rate": 1.4171536033313184e-05,
3913
+ "loss": 0.8802,
3914
+ "mean_token_accuracy": 0.7828757107257843,
3915
+ "num_tokens": 4805198.0,
3916
+ "step": 4340
3917
+ },
3918
+ {
3919
+ "epoch": 0.8764859963731614,
3920
+ "grad_norm": 10.8125,
3921
+ "learning_rate": 1.4158103297736585e-05,
3922
+ "loss": 0.8658,
3923
+ "mean_token_accuracy": 0.7872898876667023,
3924
+ "num_tokens": 4815383.0,
3925
+ "step": 4350
3926
+ },
3927
+ {
3928
+ "epoch": 0.8785009067096514,
3929
+ "grad_norm": 11.3125,
3930
+ "learning_rate": 1.4144670562159985e-05,
3931
+ "loss": 0.8317,
3932
+ "mean_token_accuracy": 0.7920398592948914,
3933
+ "num_tokens": 4825473.0,
3934
+ "step": 4360
3935
+ },
3936
+ {
3937
+ "epoch": 0.8805158170461415,
3938
+ "grad_norm": 12.625,
3939
+ "learning_rate": 1.4131237826583384e-05,
3940
+ "loss": 0.8622,
3941
+ "mean_token_accuracy": 0.7888808488845825,
3942
+ "num_tokens": 4836736.0,
3943
+ "step": 4370
3944
+ },
3945
+ {
3946
+ "epoch": 0.8825307273826315,
3947
+ "grad_norm": 10.625,
3948
+ "learning_rate": 1.4117805091006784e-05,
3949
+ "loss": 0.9218,
3950
+ "mean_token_accuracy": 0.7805340230464936,
3951
+ "num_tokens": 4847997.0,
3952
+ "step": 4380
3953
+ },
3954
+ {
3955
+ "epoch": 0.8845456377191215,
3956
+ "grad_norm": 10.8125,
3957
+ "learning_rate": 1.4104372355430185e-05,
3958
+ "loss": 0.8248,
3959
+ "mean_token_accuracy": 0.7985675752162933,
3960
+ "num_tokens": 4858799.0,
3961
+ "step": 4390
3962
+ },
3963
+ {
3964
+ "epoch": 0.8865605480556116,
3965
+ "grad_norm": 12.4375,
3966
+ "learning_rate": 1.4090939619853585e-05,
3967
+ "loss": 0.8753,
3968
+ "mean_token_accuracy": 0.7870100736618042,
3969
+ "num_tokens": 4870150.0,
3970
+ "step": 4400
3971
+ },
3972
+ {
3973
+ "epoch": 0.8885754583921015,
3974
+ "grad_norm": 11.0,
3975
+ "learning_rate": 1.4077506884276984e-05,
3976
+ "loss": 0.858,
3977
+ "mean_token_accuracy": 0.7853298187255859,
3978
+ "num_tokens": 4881976.0,
3979
+ "step": 4410
3980
+ },
3981
+ {
3982
+ "epoch": 0.8905903687285915,
3983
+ "grad_norm": 10.625,
3984
+ "learning_rate": 1.4064074148700385e-05,
3985
+ "loss": 0.827,
3986
+ "mean_token_accuracy": 0.7916330635547638,
3987
+ "num_tokens": 4893088.0,
3988
+ "step": 4420
3989
+ },
3990
+ {
3991
+ "epoch": 0.8926052790650816,
3992
+ "grad_norm": 11.4375,
3993
+ "learning_rate": 1.4050641413123784e-05,
3994
+ "loss": 0.918,
3995
+ "mean_token_accuracy": 0.7799353480339051,
3996
+ "num_tokens": 4904993.0,
3997
+ "step": 4430
3998
+ },
3999
+ {
4000
+ "epoch": 0.8946201894015716,
4001
+ "grad_norm": 12.0625,
4002
+ "learning_rate": 1.4037208677547183e-05,
4003
+ "loss": 0.8315,
4004
+ "mean_token_accuracy": 0.7943594753742218,
4005
+ "num_tokens": 4915216.0,
4006
+ "step": 4440
4007
+ },
4008
+ {
4009
+ "epoch": 0.8966350997380617,
4010
+ "grad_norm": 12.0625,
4011
+ "learning_rate": 1.4023775941970583e-05,
4012
+ "loss": 0.9096,
4013
+ "mean_token_accuracy": 0.7769247710704803,
4014
+ "num_tokens": 4925551.0,
4015
+ "step": 4450
4016
+ },
4017
+ {
4018
+ "epoch": 0.8986500100745517,
4019
+ "grad_norm": 11.6875,
4020
+ "learning_rate": 1.4010343206393984e-05,
4021
+ "loss": 0.8555,
4022
+ "mean_token_accuracy": 0.7878253519535064,
4023
+ "num_tokens": 4936738.0,
4024
+ "step": 4460
4025
+ },
4026
+ {
4027
+ "epoch": 0.9006649204110417,
4028
+ "grad_norm": 11.8125,
4029
+ "learning_rate": 1.3996910470817384e-05,
4030
+ "loss": 0.8728,
4031
+ "mean_token_accuracy": 0.7859396934509277,
4032
+ "num_tokens": 4947725.0,
4033
+ "step": 4470
4034
+ },
4035
+ {
4036
+ "epoch": 0.9026798307475318,
4037
+ "grad_norm": 13.8125,
4038
+ "learning_rate": 1.3983477735240783e-05,
4039
+ "loss": 0.7828,
4040
+ "mean_token_accuracy": 0.7994441330432892,
4041
+ "num_tokens": 4958978.0,
4042
+ "step": 4480
4043
+ },
4044
+ {
4045
+ "epoch": 0.9046947410840218,
4046
+ "grad_norm": 10.0625,
4047
+ "learning_rate": 1.3970044999664183e-05,
4048
+ "loss": 0.856,
4049
+ "mean_token_accuracy": 0.7900491297245026,
4050
+ "num_tokens": 4970121.0,
4051
+ "step": 4490
4052
+ },
4053
+ {
4054
+ "epoch": 0.9067096514205117,
4055
+ "grad_norm": 8.0625,
4056
+ "learning_rate": 1.3956612264087584e-05,
4057
+ "loss": 0.7457,
4058
+ "mean_token_accuracy": 0.8125901579856872,
4059
+ "num_tokens": 4982012.0,
4060
+ "step": 4500
4061
  }
4062
  ],
4063
  "logging_steps": 10,
 
4077
  "attributes": {}
4078
  }
4079
  },
4080
+ "total_flos": 6031407455969280.0,
4081
  "train_batch_size": 8,
4082
  "trial_name": null,
4083
  "trial_params": null