Wilsonwin commited on
Commit
2917a2c
·
verified ·
1 Parent(s): d879029

Training in progress, step 5500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a9bdb004cc12734dde986cb14fdf851cce0f063e2d6a2ac9c9566bb962bc0873
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:850517b9cf5da4903168f8b9dbfcfcb01385d34bc0d5bd1c93041c99d5afbbab
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:68b278926ebe3e854059774715cf944c796b018b9ed04789c02ad5bd2ddb56db
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8eaed0cac576a8a9a03addbea043ecae521ca2a1d3d91c2f8f4543bcfc559783
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5948a5161f7923aa0acf66b01adf35dc2196a8acf5bd2c21227561e5bff45666
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a9c47849ad44860f45019fca12bd8b47e7589be1317a01ad6705b924156a6be
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a29280eedf28bde93a8485de1b90963ca69c84125cea86695b5935449e18f453
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bda4b56b57284b5d776cea834f86539fa062d5e046885e07dcb7516921ccd6ee
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.8447372867038351,
6
  "eval_steps": 500,
7
- "global_step": 5000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3595,6 +3595,364 @@
3595
  "eval_samples_per_second": 275.05,
3596
  "eval_steps_per_second": 5.776,
3597
  "step": 5000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3598
  }
3599
  ],
3600
  "logging_steps": 10,
@@ -3614,7 +3972,7 @@
3614
  "attributes": {}
3615
  }
3616
  },
3617
- "total_flos": 1.6722841042944e+17,
3618
  "train_batch_size": 48,
3619
  "trial_name": null,
3620
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.9292110153742186,
6
  "eval_steps": 500,
7
+ "global_step": 5500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3595
  "eval_samples_per_second": 275.05,
3596
  "eval_steps_per_second": 5.776,
3597
  "step": 5000
3598
+ },
3599
+ {
3600
+ "epoch": 0.8464267612772428,
3601
+ "grad_norm": 0.5504565238952637,
3602
+ "learning_rate": 0.00023592089546305216,
3603
+ "loss": 4.576148986816406,
3604
+ "step": 5010
3605
+ },
3606
+ {
3607
+ "epoch": 0.8481162358506504,
3608
+ "grad_norm": 0.5207253098487854,
3609
+ "learning_rate": 0.00023552782714923343,
3610
+ "loss": 4.615359497070313,
3611
+ "step": 5020
3612
+ },
3613
+ {
3614
+ "epoch": 0.8498057104240582,
3615
+ "grad_norm": 0.5456526875495911,
3616
+ "learning_rate": 0.00023513388668141118,
3617
+ "loss": 4.583608627319336,
3618
+ "step": 5030
3619
+ },
3620
+ {
3621
+ "epoch": 0.8514951849974658,
3622
+ "grad_norm": 0.5371212959289551,
3623
+ "learning_rate": 0.00023473907807671952,
3624
+ "loss": 4.605810546875,
3625
+ "step": 5040
3626
+ },
3627
+ {
3628
+ "epoch": 0.8531846595708734,
3629
+ "grad_norm": 0.5273321270942688,
3630
+ "learning_rate": 0.00023434340536114531,
3631
+ "loss": 4.596974945068359,
3632
+ "step": 5050
3633
+ },
3634
+ {
3635
+ "epoch": 0.8548741341442812,
3636
+ "grad_norm": 0.5454714894294739,
3637
+ "learning_rate": 0.00023394687256948697,
3638
+ "loss": 4.595716094970703,
3639
+ "step": 5060
3640
+ },
3641
+ {
3642
+ "epoch": 0.8565636087176888,
3643
+ "grad_norm": 0.6011702418327332,
3644
+ "learning_rate": 0.00023354948374531344,
3645
+ "loss": 4.590705108642578,
3646
+ "step": 5070
3647
+ },
3648
+ {
3649
+ "epoch": 0.8582530832910965,
3650
+ "grad_norm": 0.5225823521614075,
3651
+ "learning_rate": 0.00023315124294092277,
3652
+ "loss": 4.578453063964844,
3653
+ "step": 5080
3654
+ },
3655
+ {
3656
+ "epoch": 0.8599425578645041,
3657
+ "grad_norm": 0.5181743502616882,
3658
+ "learning_rate": 0.000232752154217301,
3659
+ "loss": 4.5722908020019535,
3660
+ "step": 5090
3661
+ },
3662
+ {
3663
+ "epoch": 0.8616320324379118,
3664
+ "grad_norm": 0.5235112309455872,
3665
+ "learning_rate": 0.00023235222164408076,
3666
+ "loss": 4.600410461425781,
3667
+ "step": 5100
3668
+ },
3669
+ {
3670
+ "epoch": 0.8633215070113195,
3671
+ "grad_norm": 0.5427247881889343,
3672
+ "learning_rate": 0.00023195144929949953,
3673
+ "loss": 4.576435089111328,
3674
+ "step": 5110
3675
+ },
3676
+ {
3677
+ "epoch": 0.8650109815847271,
3678
+ "grad_norm": 0.5017905235290527,
3679
+ "learning_rate": 0.00023154984127035823,
3680
+ "loss": 4.6031841278076175,
3681
+ "step": 5120
3682
+ },
3683
+ {
3684
+ "epoch": 0.8667004561581348,
3685
+ "grad_norm": 0.5279256105422974,
3686
+ "learning_rate": 0.00023114740165197957,
3687
+ "loss": 4.570458221435547,
3688
+ "step": 5130
3689
+ },
3690
+ {
3691
+ "epoch": 0.8683899307315425,
3692
+ "grad_norm": 0.5026883482933044,
3693
+ "learning_rate": 0.00023074413454816619,
3694
+ "loss": 4.587477493286133,
3695
+ "step": 5140
3696
+ },
3697
+ {
3698
+ "epoch": 0.8700794053049502,
3699
+ "grad_norm": 0.5021783709526062,
3700
+ "learning_rate": 0.0002303400440711589,
3701
+ "loss": 4.580776977539062,
3702
+ "step": 5150
3703
+ },
3704
+ {
3705
+ "epoch": 0.8717688798783578,
3706
+ "grad_norm": 0.5208005309104919,
3707
+ "learning_rate": 0.00022993513434159464,
3708
+ "loss": 4.606272125244141,
3709
+ "step": 5160
3710
+ },
3711
+ {
3712
+ "epoch": 0.8734583544517655,
3713
+ "grad_norm": 0.4933724105358124,
3714
+ "learning_rate": 0.0002295294094884646,
3715
+ "loss": 4.598735046386719,
3716
+ "step": 5170
3717
+ },
3718
+ {
3719
+ "epoch": 0.8751478290251732,
3720
+ "grad_norm": 0.4844622015953064,
3721
+ "learning_rate": 0.00022912287364907204,
3722
+ "loss": 4.577612686157226,
3723
+ "step": 5180
3724
+ },
3725
+ {
3726
+ "epoch": 0.8768373035985808,
3727
+ "grad_norm": 0.49681806564331055,
3728
+ "learning_rate": 0.00022871553096899,
3729
+ "loss": 4.6206306457519535,
3730
+ "step": 5190
3731
+ },
3732
+ {
3733
+ "epoch": 0.8785267781719885,
3734
+ "grad_norm": 0.5069138407707214,
3735
+ "learning_rate": 0.00022830738560201911,
3736
+ "loss": 4.576866149902344,
3737
+ "step": 5200
3738
+ },
3739
+ {
3740
+ "epoch": 0.8802162527453962,
3741
+ "grad_norm": 0.49277958273887634,
3742
+ "learning_rate": 0.00022789844171014557,
3743
+ "loss": 4.570761489868164,
3744
+ "step": 5210
3745
+ },
3746
+ {
3747
+ "epoch": 0.8819057273188039,
3748
+ "grad_norm": 0.5152326822280884,
3749
+ "learning_rate": 0.00022748870346349796,
3750
+ "loss": 4.591669082641602,
3751
+ "step": 5220
3752
+ },
3753
+ {
3754
+ "epoch": 0.8835952018922115,
3755
+ "grad_norm": 0.5280734896659851,
3756
+ "learning_rate": 0.00022707817504030538,
3757
+ "loss": 4.600007629394531,
3758
+ "step": 5230
3759
+ },
3760
+ {
3761
+ "epoch": 0.8852846764656191,
3762
+ "grad_norm": 0.5109785795211792,
3763
+ "learning_rate": 0.0002266668606268545,
3764
+ "loss": 4.551007461547852,
3765
+ "step": 5240
3766
+ },
3767
+ {
3768
+ "epoch": 0.8869741510390269,
3769
+ "grad_norm": 0.511035144329071,
3770
+ "learning_rate": 0.00022625476441744715,
3771
+ "loss": 4.596772766113281,
3772
+ "step": 5250
3773
+ },
3774
+ {
3775
+ "epoch": 0.8886636256124345,
3776
+ "grad_norm": 0.5007238984107971,
3777
+ "learning_rate": 0.00022584189061435725,
3778
+ "loss": 4.5646717071533205,
3779
+ "step": 5260
3780
+ },
3781
+ {
3782
+ "epoch": 0.8903531001858422,
3783
+ "grad_norm": 0.517419159412384,
3784
+ "learning_rate": 0.00022542824342778806,
3785
+ "loss": 4.561199188232422,
3786
+ "step": 5270
3787
+ },
3788
+ {
3789
+ "epoch": 0.8920425747592499,
3790
+ "grad_norm": 0.5943387746810913,
3791
+ "learning_rate": 0.0002250138270758293,
3792
+ "loss": 4.576548385620117,
3793
+ "step": 5280
3794
+ },
3795
+ {
3796
+ "epoch": 0.8937320493326576,
3797
+ "grad_norm": 0.5131561160087585,
3798
+ "learning_rate": 0.00022459864578441415,
3799
+ "loss": 4.587300109863281,
3800
+ "step": 5290
3801
+ },
3802
+ {
3803
+ "epoch": 0.8954215239060652,
3804
+ "grad_norm": 0.5333006381988525,
3805
+ "learning_rate": 0.0002241827037872761,
3806
+ "loss": 4.5638988494873045,
3807
+ "step": 5300
3808
+ },
3809
+ {
3810
+ "epoch": 0.8971109984794728,
3811
+ "grad_norm": 0.46661046147346497,
3812
+ "learning_rate": 0.00022376600532590578,
3813
+ "loss": 4.5343585968017575,
3814
+ "step": 5310
3815
+ },
3816
+ {
3817
+ "epoch": 0.8988004730528806,
3818
+ "grad_norm": 0.4886866509914398,
3819
+ "learning_rate": 0.00022334855464950775,
3820
+ "loss": 4.5834095001220705,
3821
+ "step": 5320
3822
+ },
3823
+ {
3824
+ "epoch": 0.9004899476262882,
3825
+ "grad_norm": 0.5262774229049683,
3826
+ "learning_rate": 0.00022293035601495708,
3827
+ "loss": 4.579534912109375,
3828
+ "step": 5330
3829
+ },
3830
+ {
3831
+ "epoch": 0.9021794221996959,
3832
+ "grad_norm": 0.5163218975067139,
3833
+ "learning_rate": 0.00022251141368675607,
3834
+ "loss": 4.577048492431641,
3835
+ "step": 5340
3836
+ },
3837
+ {
3838
+ "epoch": 0.9038688967731036,
3839
+ "grad_norm": 0.5345433950424194,
3840
+ "learning_rate": 0.00022209173193699067,
3841
+ "loss": 4.582790374755859,
3842
+ "step": 5350
3843
+ },
3844
+ {
3845
+ "epoch": 0.9055583713465112,
3846
+ "grad_norm": 0.5151252150535583,
3847
+ "learning_rate": 0.00022167131504528695,
3848
+ "loss": 4.594097900390625,
3849
+ "step": 5360
3850
+ },
3851
+ {
3852
+ "epoch": 0.9072478459199189,
3853
+ "grad_norm": 0.47062498331069946,
3854
+ "learning_rate": 0.00022125016729876743,
3855
+ "loss": 4.574803161621094,
3856
+ "step": 5370
3857
+ },
3858
+ {
3859
+ "epoch": 0.9089373204933265,
3860
+ "grad_norm": 0.49667978286743164,
3861
+ "learning_rate": 0.00022082829299200743,
3862
+ "loss": 4.580567932128906,
3863
+ "step": 5380
3864
+ },
3865
+ {
3866
+ "epoch": 0.9106267950667343,
3867
+ "grad_norm": 0.48394060134887695,
3868
+ "learning_rate": 0.00022040569642699112,
3869
+ "loss": 4.555598449707031,
3870
+ "step": 5390
3871
+ },
3872
+ {
3873
+ "epoch": 0.9123162696401419,
3874
+ "grad_norm": 0.48837390542030334,
3875
+ "learning_rate": 0.00021998238191306798,
3876
+ "loss": 4.534821319580078,
3877
+ "step": 5400
3878
+ },
3879
+ {
3880
+ "epoch": 0.9140057442135496,
3881
+ "grad_norm": 0.5261453986167908,
3882
+ "learning_rate": 0.00021955835376690841,
3883
+ "loss": 4.546956634521484,
3884
+ "step": 5410
3885
+ },
3886
+ {
3887
+ "epoch": 0.9156952187869573,
3888
+ "grad_norm": 0.5199710130691528,
3889
+ "learning_rate": 0.00021913361631246004,
3890
+ "loss": 4.561407852172851,
3891
+ "step": 5420
3892
+ },
3893
+ {
3894
+ "epoch": 0.9173846933603649,
3895
+ "grad_norm": 0.5369474291801453,
3896
+ "learning_rate": 0.0002187081738809036,
3897
+ "loss": 4.550098419189453,
3898
+ "step": 5430
3899
+ },
3900
+ {
3901
+ "epoch": 0.9190741679337726,
3902
+ "grad_norm": 0.5480945110321045,
3903
+ "learning_rate": 0.00021828203081060858,
3904
+ "loss": 4.559786224365235,
3905
+ "step": 5440
3906
+ },
3907
+ {
3908
+ "epoch": 0.9207636425071802,
3909
+ "grad_norm": 0.5149338245391846,
3910
+ "learning_rate": 0.00021785519144708912,
3911
+ "loss": 4.534018325805664,
3912
+ "step": 5450
3913
+ },
3914
+ {
3915
+ "epoch": 0.922453117080588,
3916
+ "grad_norm": 0.5365586280822754,
3917
+ "learning_rate": 0.00021742766014295976,
3918
+ "loss": 4.546533584594727,
3919
+ "step": 5460
3920
+ },
3921
+ {
3922
+ "epoch": 0.9241425916539956,
3923
+ "grad_norm": 0.5260055661201477,
3924
+ "learning_rate": 0.00021699944125789096,
3925
+ "loss": 4.534712600708008,
3926
+ "step": 5470
3927
+ },
3928
+ {
3929
+ "epoch": 0.9258320662274033,
3930
+ "grad_norm": 0.4802268147468567,
3931
+ "learning_rate": 0.00021657053915856455,
3932
+ "loss": 4.560755920410156,
3933
+ "step": 5480
3934
+ },
3935
+ {
3936
+ "epoch": 0.927521540800811,
3937
+ "grad_norm": 0.4982668459415436,
3938
+ "learning_rate": 0.0002161409582186294,
3939
+ "loss": 4.584963989257813,
3940
+ "step": 5490
3941
+ },
3942
+ {
3943
+ "epoch": 0.9292110153742186,
3944
+ "grad_norm": 0.49544209241867065,
3945
+ "learning_rate": 0.0002157107028186567,
3946
+ "loss": 4.547665786743164,
3947
+ "step": 5500
3948
+ },
3949
+ {
3950
+ "epoch": 0.9292110153742186,
3951
+ "eval_loss": 4.52970552444458,
3952
+ "eval_runtime": 3.6346,
3953
+ "eval_samples_per_second": 275.136,
3954
+ "eval_steps_per_second": 5.778,
3955
+ "step": 5500
3956
  }
3957
  ],
3958
  "logging_steps": 10,
 
3972
  "attributes": {}
3973
  }
3974
  },
3975
+ "total_flos": 1.83951251472384e+17,
3976
  "train_batch_size": 48,
3977
  "trial_name": null,
3978
  "trial_params": null