Wilsonwin commited on
Commit
a2ee843
·
verified ·
1 Parent(s): a32965e

Training in progress, step 5500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:29ab3bcbd54c5e63c4e604ac4ad2f368ae42aa766977dc0340b7b8e0814fb858
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8ae4d439763bede675a7bb8407ca626ba1a1ca1d28d508145ff27990bcdfd60
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e5104f05c76008a8cc4ebab2ab5f343ccdca71dafda81e126d612fe143dbfa54
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8df4f8c8c0f93c7a4647906cc1e5f85c72386b1b581eb687df3d305abbdc44a7
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a04575953c998a8fd3197b1b8249c8e72c33f4bb7c27b036788a4d9e537cf3cd
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8af3cc3f1560f815527e73bcdf0bbfb03998a87b5067ff9928ca94f46e638231
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a29280eedf28bde93a8485de1b90963ca69c84125cea86695b5935449e18f453
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bda4b56b57284b5d776cea834f86539fa062d5e046885e07dcb7516921ccd6ee
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.8447372867038351,
6
  "eval_steps": 500,
7
- "global_step": 5000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3595,6 +3595,364 @@
3595
  "eval_samples_per_second": 279.83,
3596
  "eval_steps_per_second": 5.876,
3597
  "step": 5000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3598
  }
3599
  ],
3600
  "logging_steps": 10,
@@ -3614,7 +3972,7 @@
3614
  "attributes": {}
3615
  }
3616
  },
3617
- "total_flos": 1.6722841042944e+17,
3618
  "train_batch_size": 48,
3619
  "trial_name": null,
3620
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.9292110153742186,
6
  "eval_steps": 500,
7
+ "global_step": 5500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3595
  "eval_samples_per_second": 279.83,
3596
  "eval_steps_per_second": 5.876,
3597
  "step": 5000
3598
+ },
3599
+ {
3600
+ "epoch": 0.8464267612772428,
3601
+ "grad_norm": 0.559013307094574,
3602
+ "learning_rate": 0.00023592089546305216,
3603
+ "loss": 4.576456832885742,
3604
+ "step": 5010
3605
+ },
3606
+ {
3607
+ "epoch": 0.8481162358506504,
3608
+ "grad_norm": 0.49753278493881226,
3609
+ "learning_rate": 0.00023552782714923343,
3610
+ "loss": 4.614764404296875,
3611
+ "step": 5020
3612
+ },
3613
+ {
3614
+ "epoch": 0.8498057104240582,
3615
+ "grad_norm": 0.56475430727005,
3616
+ "learning_rate": 0.00023513388668141118,
3617
+ "loss": 4.58197250366211,
3618
+ "step": 5030
3619
+ },
3620
+ {
3621
+ "epoch": 0.8514951849974658,
3622
+ "grad_norm": 0.4953176975250244,
3623
+ "learning_rate": 0.00023473907807671952,
3624
+ "loss": 4.605267333984375,
3625
+ "step": 5040
3626
+ },
3627
+ {
3628
+ "epoch": 0.8531846595708734,
3629
+ "grad_norm": 0.5137485861778259,
3630
+ "learning_rate": 0.00023434340536114531,
3631
+ "loss": 4.596157836914062,
3632
+ "step": 5050
3633
+ },
3634
+ {
3635
+ "epoch": 0.8548741341442812,
3636
+ "grad_norm": 0.5363683104515076,
3637
+ "learning_rate": 0.00023394687256948697,
3638
+ "loss": 4.594855499267578,
3639
+ "step": 5060
3640
+ },
3641
+ {
3642
+ "epoch": 0.8565636087176888,
3643
+ "grad_norm": 0.5634586811065674,
3644
+ "learning_rate": 0.00023354948374531344,
3645
+ "loss": 4.589244842529297,
3646
+ "step": 5070
3647
+ },
3648
+ {
3649
+ "epoch": 0.8582530832910965,
3650
+ "grad_norm": 0.5043785572052002,
3651
+ "learning_rate": 0.00023315124294092277,
3652
+ "loss": 4.576361083984375,
3653
+ "step": 5080
3654
+ },
3655
+ {
3656
+ "epoch": 0.8599425578645041,
3657
+ "grad_norm": 0.5204640626907349,
3658
+ "learning_rate": 0.000232752154217301,
3659
+ "loss": 4.570015716552734,
3660
+ "step": 5090
3661
+ },
3662
+ {
3663
+ "epoch": 0.8616320324379118,
3664
+ "grad_norm": 0.5251067280769348,
3665
+ "learning_rate": 0.00023235222164408076,
3666
+ "loss": 4.598841857910156,
3667
+ "step": 5100
3668
+ },
3669
+ {
3670
+ "epoch": 0.8633215070113195,
3671
+ "grad_norm": 0.5268970131874084,
3672
+ "learning_rate": 0.00023195144929949953,
3673
+ "loss": 4.574850082397461,
3674
+ "step": 5110
3675
+ },
3676
+ {
3677
+ "epoch": 0.8650109815847271,
3678
+ "grad_norm": 0.5099704265594482,
3679
+ "learning_rate": 0.00023154984127035823,
3680
+ "loss": 4.602288436889649,
3681
+ "step": 5120
3682
+ },
3683
+ {
3684
+ "epoch": 0.8667004561581348,
3685
+ "grad_norm": 0.49661147594451904,
3686
+ "learning_rate": 0.00023114740165197957,
3687
+ "loss": 4.56927604675293,
3688
+ "step": 5130
3689
+ },
3690
+ {
3691
+ "epoch": 0.8683899307315425,
3692
+ "grad_norm": 0.5453396439552307,
3693
+ "learning_rate": 0.00023074413454816619,
3694
+ "loss": 4.587471771240234,
3695
+ "step": 5140
3696
+ },
3697
+ {
3698
+ "epoch": 0.8700794053049502,
3699
+ "grad_norm": 0.49157091975212097,
3700
+ "learning_rate": 0.0002303400440711589,
3701
+ "loss": 4.580040740966797,
3702
+ "step": 5150
3703
+ },
3704
+ {
3705
+ "epoch": 0.8717688798783578,
3706
+ "grad_norm": 0.5203030705451965,
3707
+ "learning_rate": 0.00022993513434159464,
3708
+ "loss": 4.604449462890625,
3709
+ "step": 5160
3710
+ },
3711
+ {
3712
+ "epoch": 0.8734583544517655,
3713
+ "grad_norm": 0.49596408009529114,
3714
+ "learning_rate": 0.0002295294094884646,
3715
+ "loss": 4.597255706787109,
3716
+ "step": 5170
3717
+ },
3718
+ {
3719
+ "epoch": 0.8751478290251732,
3720
+ "grad_norm": 0.482197642326355,
3721
+ "learning_rate": 0.00022912287364907204,
3722
+ "loss": 4.575711822509765,
3723
+ "step": 5180
3724
+ },
3725
+ {
3726
+ "epoch": 0.8768373035985808,
3727
+ "grad_norm": 0.49015575647354126,
3728
+ "learning_rate": 0.00022871553096899,
3729
+ "loss": 4.620565032958984,
3730
+ "step": 5190
3731
+ },
3732
+ {
3733
+ "epoch": 0.8785267781719885,
3734
+ "grad_norm": 0.49912846088409424,
3735
+ "learning_rate": 0.00022830738560201911,
3736
+ "loss": 4.575767898559571,
3737
+ "step": 5200
3738
+ },
3739
+ {
3740
+ "epoch": 0.8802162527453962,
3741
+ "grad_norm": 0.5013103485107422,
3742
+ "learning_rate": 0.00022789844171014557,
3743
+ "loss": 4.5688629150390625,
3744
+ "step": 5210
3745
+ },
3746
+ {
3747
+ "epoch": 0.8819057273188039,
3748
+ "grad_norm": 0.5322986245155334,
3749
+ "learning_rate": 0.00022748870346349796,
3750
+ "loss": 4.590381622314453,
3751
+ "step": 5220
3752
+ },
3753
+ {
3754
+ "epoch": 0.8835952018922115,
3755
+ "grad_norm": 0.5467557311058044,
3756
+ "learning_rate": 0.00022707817504030538,
3757
+ "loss": 4.598742294311523,
3758
+ "step": 5230
3759
+ },
3760
+ {
3761
+ "epoch": 0.8852846764656191,
3762
+ "grad_norm": 0.5180667638778687,
3763
+ "learning_rate": 0.0002266668606268545,
3764
+ "loss": 4.550464630126953,
3765
+ "step": 5240
3766
+ },
3767
+ {
3768
+ "epoch": 0.8869741510390269,
3769
+ "grad_norm": 0.5265566110610962,
3770
+ "learning_rate": 0.00022625476441744715,
3771
+ "loss": 4.595706176757813,
3772
+ "step": 5250
3773
+ },
3774
+ {
3775
+ "epoch": 0.8886636256124345,
3776
+ "grad_norm": 0.5108802318572998,
3777
+ "learning_rate": 0.00022584189061435725,
3778
+ "loss": 4.564280700683594,
3779
+ "step": 5260
3780
+ },
3781
+ {
3782
+ "epoch": 0.8903531001858422,
3783
+ "grad_norm": 0.5016060471534729,
3784
+ "learning_rate": 0.00022542824342778806,
3785
+ "loss": 4.561073303222656,
3786
+ "step": 5270
3787
+ },
3788
+ {
3789
+ "epoch": 0.8920425747592499,
3790
+ "grad_norm": 0.531934916973114,
3791
+ "learning_rate": 0.0002250138270758293,
3792
+ "loss": 4.576354598999023,
3793
+ "step": 5280
3794
+ },
3795
+ {
3796
+ "epoch": 0.8937320493326576,
3797
+ "grad_norm": 0.5116508603096008,
3798
+ "learning_rate": 0.00022459864578441415,
3799
+ "loss": 4.586645889282226,
3800
+ "step": 5290
3801
+ },
3802
+ {
3803
+ "epoch": 0.8954215239060652,
3804
+ "grad_norm": 0.5074120163917542,
3805
+ "learning_rate": 0.0002241827037872761,
3806
+ "loss": 4.564454650878906,
3807
+ "step": 5300
3808
+ },
3809
+ {
3810
+ "epoch": 0.8971109984794728,
3811
+ "grad_norm": 0.47892510890960693,
3812
+ "learning_rate": 0.00022376600532590578,
3813
+ "loss": 4.534092712402344,
3814
+ "step": 5310
3815
+ },
3816
+ {
3817
+ "epoch": 0.8988004730528806,
3818
+ "grad_norm": 0.4657728970050812,
3819
+ "learning_rate": 0.00022334855464950775,
3820
+ "loss": 4.5831245422363285,
3821
+ "step": 5320
3822
+ },
3823
+ {
3824
+ "epoch": 0.9004899476262882,
3825
+ "grad_norm": 0.4809263348579407,
3826
+ "learning_rate": 0.00022293035601495708,
3827
+ "loss": 4.578067398071289,
3828
+ "step": 5330
3829
+ },
3830
+ {
3831
+ "epoch": 0.9021794221996959,
3832
+ "grad_norm": 0.5298095941543579,
3833
+ "learning_rate": 0.00022251141368675607,
3834
+ "loss": 4.576302719116211,
3835
+ "step": 5340
3836
+ },
3837
+ {
3838
+ "epoch": 0.9038688967731036,
3839
+ "grad_norm": 0.5315806865692139,
3840
+ "learning_rate": 0.00022209173193699067,
3841
+ "loss": 4.582082748413086,
3842
+ "step": 5350
3843
+ },
3844
+ {
3845
+ "epoch": 0.9055583713465112,
3846
+ "grad_norm": 0.5084795355796814,
3847
+ "learning_rate": 0.00022167131504528695,
3848
+ "loss": 4.5921672821044925,
3849
+ "step": 5360
3850
+ },
3851
+ {
3852
+ "epoch": 0.9072478459199189,
3853
+ "grad_norm": 0.4921436011791229,
3854
+ "learning_rate": 0.00022125016729876743,
3855
+ "loss": 4.573263931274414,
3856
+ "step": 5370
3857
+ },
3858
+ {
3859
+ "epoch": 0.9089373204933265,
3860
+ "grad_norm": 0.4985114336013794,
3861
+ "learning_rate": 0.00022082829299200743,
3862
+ "loss": 4.580036163330078,
3863
+ "step": 5380
3864
+ },
3865
+ {
3866
+ "epoch": 0.9106267950667343,
3867
+ "grad_norm": 0.486751526594162,
3868
+ "learning_rate": 0.00022040569642699112,
3869
+ "loss": 4.554470062255859,
3870
+ "step": 5390
3871
+ },
3872
+ {
3873
+ "epoch": 0.9123162696401419,
3874
+ "grad_norm": 0.49089571833610535,
3875
+ "learning_rate": 0.00021998238191306798,
3876
+ "loss": 4.53393783569336,
3877
+ "step": 5400
3878
+ },
3879
+ {
3880
+ "epoch": 0.9140057442135496,
3881
+ "grad_norm": 0.5415358543395996,
3882
+ "learning_rate": 0.00021955835376690841,
3883
+ "loss": 4.545674514770508,
3884
+ "step": 5410
3885
+ },
3886
+ {
3887
+ "epoch": 0.9156952187869573,
3888
+ "grad_norm": 0.5360416173934937,
3889
+ "learning_rate": 0.00021913361631246004,
3890
+ "loss": 4.559771728515625,
3891
+ "step": 5420
3892
+ },
3893
+ {
3894
+ "epoch": 0.9173846933603649,
3895
+ "grad_norm": 0.5081850290298462,
3896
+ "learning_rate": 0.0002187081738809036,
3897
+ "loss": 4.550452804565429,
3898
+ "step": 5430
3899
+ },
3900
+ {
3901
+ "epoch": 0.9190741679337726,
3902
+ "grad_norm": 0.5538184642791748,
3903
+ "learning_rate": 0.00021828203081060858,
3904
+ "loss": 4.558630752563476,
3905
+ "step": 5440
3906
+ },
3907
+ {
3908
+ "epoch": 0.9207636425071802,
3909
+ "grad_norm": 0.4913816452026367,
3910
+ "learning_rate": 0.00021785519144708912,
3911
+ "loss": 4.532632446289062,
3912
+ "step": 5450
3913
+ },
3914
+ {
3915
+ "epoch": 0.922453117080588,
3916
+ "grad_norm": 0.4864713251590729,
3917
+ "learning_rate": 0.00021742766014295976,
3918
+ "loss": 4.546042251586914,
3919
+ "step": 5460
3920
+ },
3921
+ {
3922
+ "epoch": 0.9241425916539956,
3923
+ "grad_norm": 0.5309963822364807,
3924
+ "learning_rate": 0.00021699944125789096,
3925
+ "loss": 4.532254791259765,
3926
+ "step": 5470
3927
+ },
3928
+ {
3929
+ "epoch": 0.9258320662274033,
3930
+ "grad_norm": 0.47752541303634644,
3931
+ "learning_rate": 0.00021657053915856455,
3932
+ "loss": 4.558838272094727,
3933
+ "step": 5480
3934
+ },
3935
+ {
3936
+ "epoch": 0.927521540800811,
3937
+ "grad_norm": 0.47722741961479187,
3938
+ "learning_rate": 0.0002161409582186294,
3939
+ "loss": 4.583000183105469,
3940
+ "step": 5490
3941
+ },
3942
+ {
3943
+ "epoch": 0.9292110153742186,
3944
+ "grad_norm": 0.5001320838928223,
3945
+ "learning_rate": 0.0002157107028186567,
3946
+ "loss": 4.5466560363769535,
3947
+ "step": 5500
3948
+ },
3949
+ {
3950
+ "epoch": 0.9292110153742186,
3951
+ "eval_loss": 4.510837554931641,
3952
+ "eval_runtime": 3.6228,
3953
+ "eval_samples_per_second": 276.03,
3954
+ "eval_steps_per_second": 5.797,
3955
+ "step": 5500
3956
  }
3957
  ],
3958
  "logging_steps": 10,
 
3972
  "attributes": {}
3973
  }
3974
  },
3975
+ "total_flos": 1.83951251472384e+17,
3976
  "train_batch_size": 48,
3977
  "trial_name": null,
3978
  "trial_params": null