Azrail commited on
Commit
d19e657
·
verified ·
1 Parent(s): 1ebff58

Training in progress, step 22000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a0634cd3b48faa896331e649d644ee85a0e0af72246ab7393a66a3c2518bb02e
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b81985a8a1ebad5f960997d908e43f8d285835abd4645a1ad5e8d86d7a91e976
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:611dbdaa20f4f869458e449fe2e70d417e2df56bd8ff59602f5187369567bda1
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e989ff28ea16b1edbb8530fe11c0f4057d65c6350ad0a17cbf0a4960b2cb6ea
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd42aefaf8cffc05ebd908742fc863dc5486d9c9296568766959af6a5b7610ad
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ec14043d0cb9b7579fdf9075af0f9a31a2393b8ca68497f0e5375a4fe1a3cf9
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6662ae68d38995d5846f13e724946a2acb1395046b7d08977dde3dab733945c0
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ce6ab6f335eafbbff78f85b703b750b35d8b96e0da89ad49a445d3e07ab4df8
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.43932086485803756,
6
  "eval_steps": 500,
7
- "global_step": 20000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3568,11 +3568,367 @@
3568
  "eval_steps_per_second": 18.841,
3569
  "num_input_tokens_seen": 20971520000,
3570
  "step": 20000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3571
  }
3572
  ],
3573
  "logging_steps": 50,
3574
  "max_steps": 200000,
3575
- "num_input_tokens_seen": 20971520000,
3576
  "num_train_epochs": 5,
3577
  "save_steps": 1000,
3578
  "stateful_callbacks": {
@@ -3587,7 +3943,7 @@
3587
  "attributes": {}
3588
  }
3589
  },
3590
- "total_flos": 1.194343431929856e+19,
3591
  "train_batch_size": 64,
3592
  "trial_name": null,
3593
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.48325295134384133,
6
  "eval_steps": 500,
7
+ "global_step": 22000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3568
  "eval_steps_per_second": 18.841,
3569
  "num_input_tokens_seen": 20971520000,
3570
  "step": 20000
3571
+ },
3572
+ {
3573
+ "epoch": 0.4404191670201827,
3574
+ "grad_norm": 0.14489957690238953,
3575
+ "learning_rate": 0.001,
3576
+ "loss": 2.7139,
3577
+ "num_input_tokens_seen": 21023948800,
3578
+ "step": 20050
3579
+ },
3580
+ {
3581
+ "epoch": 0.44151746918232776,
3582
+ "grad_norm": 0.13994646072387695,
3583
+ "learning_rate": 0.001,
3584
+ "loss": 2.7091,
3585
+ "num_input_tokens_seen": 21076377600,
3586
+ "step": 20100
3587
+ },
3588
+ {
3589
+ "epoch": 0.4426157713444729,
3590
+ "grad_norm": 0.17211903631687164,
3591
+ "learning_rate": 0.001,
3592
+ "loss": 2.7176,
3593
+ "num_input_tokens_seen": 21128806400,
3594
+ "step": 20150
3595
+ },
3596
+ {
3597
+ "epoch": 0.44371407350661796,
3598
+ "grad_norm": 0.16364862024784088,
3599
+ "learning_rate": 0.001,
3600
+ "loss": 2.7181,
3601
+ "num_input_tokens_seen": 21181235200,
3602
+ "step": 20200
3603
+ },
3604
+ {
3605
+ "epoch": 0.444812375668763,
3606
+ "grad_norm": 0.14166216552257538,
3607
+ "learning_rate": 0.001,
3608
+ "loss": 2.7127,
3609
+ "num_input_tokens_seen": 21233664000,
3610
+ "step": 20250
3611
+ },
3612
+ {
3613
+ "epoch": 0.44591067783090815,
3614
+ "grad_norm": 0.12995755672454834,
3615
+ "learning_rate": 0.001,
3616
+ "loss": 2.7085,
3617
+ "num_input_tokens_seen": 21286092800,
3618
+ "step": 20300
3619
+ },
3620
+ {
3621
+ "epoch": 0.4470089799930532,
3622
+ "grad_norm": 0.15717202425003052,
3623
+ "learning_rate": 0.001,
3624
+ "loss": 2.7071,
3625
+ "num_input_tokens_seen": 21338521600,
3626
+ "step": 20350
3627
+ },
3628
+ {
3629
+ "epoch": 0.44810728215519835,
3630
+ "grad_norm": 0.13354860246181488,
3631
+ "learning_rate": 0.001,
3632
+ "loss": 2.7094,
3633
+ "num_input_tokens_seen": 21390950400,
3634
+ "step": 20400
3635
+ },
3636
+ {
3637
+ "epoch": 0.4492055843173434,
3638
+ "grad_norm": 0.16004188358783722,
3639
+ "learning_rate": 0.001,
3640
+ "loss": 2.7109,
3641
+ "num_input_tokens_seen": 21443379200,
3642
+ "step": 20450
3643
+ },
3644
+ {
3645
+ "epoch": 0.45030388647948855,
3646
+ "grad_norm": 0.148077592253685,
3647
+ "learning_rate": 0.001,
3648
+ "loss": 2.7058,
3649
+ "num_input_tokens_seen": 21495808000,
3650
+ "step": 20500
3651
+ },
3652
+ {
3653
+ "epoch": 0.45030388647948855,
3654
+ "eval_loss": 2.6089115142822266,
3655
+ "eval_runtime": 65.5589,
3656
+ "eval_samples_per_second": 76.267,
3657
+ "eval_steps_per_second": 19.067,
3658
+ "num_input_tokens_seen": 21495808000,
3659
+ "step": 20500
3660
+ },
3661
+ {
3662
+ "epoch": 0.4514021886416336,
3663
+ "grad_norm": 0.16992634534835815,
3664
+ "learning_rate": 0.001,
3665
+ "loss": 2.7026,
3666
+ "num_input_tokens_seen": 21548236800,
3667
+ "step": 20550
3668
+ },
3669
+ {
3670
+ "epoch": 0.4525004908037787,
3671
+ "grad_norm": 0.14876551926136017,
3672
+ "learning_rate": 0.001,
3673
+ "loss": 2.7105,
3674
+ "num_input_tokens_seen": 21600665600,
3675
+ "step": 20600
3676
+ },
3677
+ {
3678
+ "epoch": 0.4535987929659238,
3679
+ "grad_norm": 0.16025613248348236,
3680
+ "learning_rate": 0.001,
3681
+ "loss": 2.707,
3682
+ "num_input_tokens_seen": 21653094400,
3683
+ "step": 20650
3684
+ },
3685
+ {
3686
+ "epoch": 0.4546970951280689,
3687
+ "grad_norm": 0.14609012007713318,
3688
+ "learning_rate": 0.001,
3689
+ "loss": 2.7086,
3690
+ "num_input_tokens_seen": 21705523200,
3691
+ "step": 20700
3692
+ },
3693
+ {
3694
+ "epoch": 0.455795397290214,
3695
+ "grad_norm": 0.14725832641124725,
3696
+ "learning_rate": 0.001,
3697
+ "loss": 2.7075,
3698
+ "num_input_tokens_seen": 21757952000,
3699
+ "step": 20750
3700
+ },
3701
+ {
3702
+ "epoch": 0.4568936994523591,
3703
+ "grad_norm": 0.1736454963684082,
3704
+ "learning_rate": 0.001,
3705
+ "loss": 2.7033,
3706
+ "num_input_tokens_seen": 21810380800,
3707
+ "step": 20800
3708
+ },
3709
+ {
3710
+ "epoch": 0.45799200161450415,
3711
+ "grad_norm": 0.14904257655143738,
3712
+ "learning_rate": 0.001,
3713
+ "loss": 2.7012,
3714
+ "num_input_tokens_seen": 21862809600,
3715
+ "step": 20850
3716
+ },
3717
+ {
3718
+ "epoch": 0.4590903037766493,
3719
+ "grad_norm": 0.14407765865325928,
3720
+ "learning_rate": 0.001,
3721
+ "loss": 2.7055,
3722
+ "num_input_tokens_seen": 21915238400,
3723
+ "step": 20900
3724
+ },
3725
+ {
3726
+ "epoch": 0.46018860593879435,
3727
+ "grad_norm": 0.13943473994731903,
3728
+ "learning_rate": 0.001,
3729
+ "loss": 2.6999,
3730
+ "num_input_tokens_seen": 21967667200,
3731
+ "step": 20950
3732
+ },
3733
+ {
3734
+ "epoch": 0.4612869081009395,
3735
+ "grad_norm": 0.1592896729707718,
3736
+ "learning_rate": 0.001,
3737
+ "loss": 2.7072,
3738
+ "num_input_tokens_seen": 22020096000,
3739
+ "step": 21000
3740
+ },
3741
+ {
3742
+ "epoch": 0.4612869081009395,
3743
+ "eval_loss": 2.605719566345215,
3744
+ "eval_runtime": 65.6879,
3745
+ "eval_samples_per_second": 76.117,
3746
+ "eval_steps_per_second": 19.029,
3747
+ "num_input_tokens_seen": 22020096000,
3748
+ "step": 21000
3749
+ },
3750
+ {
3751
+ "epoch": 0.46238521026308455,
3752
+ "grad_norm": 0.1428702473640442,
3753
+ "learning_rate": 0.001,
3754
+ "loss": 2.7042,
3755
+ "num_input_tokens_seen": 22072524800,
3756
+ "step": 21050
3757
+ },
3758
+ {
3759
+ "epoch": 0.46348351242522967,
3760
+ "grad_norm": 0.13529072701931,
3761
+ "learning_rate": 0.001,
3762
+ "loss": 2.7093,
3763
+ "num_input_tokens_seen": 22124953600,
3764
+ "step": 21100
3765
+ },
3766
+ {
3767
+ "epoch": 0.46458181458737474,
3768
+ "grad_norm": 0.17529748380184174,
3769
+ "learning_rate": 0.001,
3770
+ "loss": 2.713,
3771
+ "num_input_tokens_seen": 22177382400,
3772
+ "step": 21150
3773
+ },
3774
+ {
3775
+ "epoch": 0.4656801167495198,
3776
+ "grad_norm": 0.1479254513978958,
3777
+ "learning_rate": 0.001,
3778
+ "loss": 2.6984,
3779
+ "num_input_tokens_seen": 22229811200,
3780
+ "step": 21200
3781
+ },
3782
+ {
3783
+ "epoch": 0.46677841891166494,
3784
+ "grad_norm": 0.15110637247562408,
3785
+ "learning_rate": 0.001,
3786
+ "loss": 2.7128,
3787
+ "num_input_tokens_seen": 22282240000,
3788
+ "step": 21250
3789
+ },
3790
+ {
3791
+ "epoch": 0.46787672107381,
3792
+ "grad_norm": 0.13746944069862366,
3793
+ "learning_rate": 0.001,
3794
+ "loss": 2.7036,
3795
+ "num_input_tokens_seen": 22334668800,
3796
+ "step": 21300
3797
+ },
3798
+ {
3799
+ "epoch": 0.46897502323595514,
3800
+ "grad_norm": 0.17940136790275574,
3801
+ "learning_rate": 0.001,
3802
+ "loss": 2.7048,
3803
+ "num_input_tokens_seen": 22387097600,
3804
+ "step": 21350
3805
+ },
3806
+ {
3807
+ "epoch": 0.4700733253981002,
3808
+ "grad_norm": 0.14203256368637085,
3809
+ "learning_rate": 0.001,
3810
+ "loss": 2.6997,
3811
+ "num_input_tokens_seen": 22439526400,
3812
+ "step": 21400
3813
+ },
3814
+ {
3815
+ "epoch": 0.47117162756024533,
3816
+ "grad_norm": 0.14260704815387726,
3817
+ "learning_rate": 0.001,
3818
+ "loss": 2.7092,
3819
+ "num_input_tokens_seen": 22491955200,
3820
+ "step": 21450
3821
+ },
3822
+ {
3823
+ "epoch": 0.4722699297223904,
3824
+ "grad_norm": 0.16455897688865662,
3825
+ "learning_rate": 0.001,
3826
+ "loss": 2.6969,
3827
+ "num_input_tokens_seen": 22544384000,
3828
+ "step": 21500
3829
+ },
3830
+ {
3831
+ "epoch": 0.4722699297223904,
3832
+ "eval_loss": 2.60367751121521,
3833
+ "eval_runtime": 65.4304,
3834
+ "eval_samples_per_second": 76.417,
3835
+ "eval_steps_per_second": 19.104,
3836
+ "num_input_tokens_seen": 22544384000,
3837
+ "step": 21500
3838
+ },
3839
+ {
3840
+ "epoch": 0.4733682318845355,
3841
+ "grad_norm": 0.1529170274734497,
3842
+ "learning_rate": 0.001,
3843
+ "loss": 2.7003,
3844
+ "num_input_tokens_seen": 22596812800,
3845
+ "step": 21550
3846
+ },
3847
+ {
3848
+ "epoch": 0.4744665340466806,
3849
+ "grad_norm": 0.1921636164188385,
3850
+ "learning_rate": 0.001,
3851
+ "loss": 2.7014,
3852
+ "num_input_tokens_seen": 22649241600,
3853
+ "step": 21600
3854
+ },
3855
+ {
3856
+ "epoch": 0.47556483620882567,
3857
+ "grad_norm": 0.16029173135757446,
3858
+ "learning_rate": 0.001,
3859
+ "loss": 2.7028,
3860
+ "num_input_tokens_seen": 22701670400,
3861
+ "step": 21650
3862
+ },
3863
+ {
3864
+ "epoch": 0.4766631383709708,
3865
+ "grad_norm": 0.14740578830242157,
3866
+ "learning_rate": 0.001,
3867
+ "loss": 2.7019,
3868
+ "num_input_tokens_seen": 22754099200,
3869
+ "step": 21700
3870
+ },
3871
+ {
3872
+ "epoch": 0.47776144053311587,
3873
+ "grad_norm": 0.1734548658132553,
3874
+ "learning_rate": 0.001,
3875
+ "loss": 2.6985,
3876
+ "num_input_tokens_seen": 22806528000,
3877
+ "step": 21750
3878
+ },
3879
+ {
3880
+ "epoch": 0.47885974269526094,
3881
+ "grad_norm": 0.15502890944480896,
3882
+ "learning_rate": 0.001,
3883
+ "loss": 2.6973,
3884
+ "num_input_tokens_seen": 22858956800,
3885
+ "step": 21800
3886
+ },
3887
+ {
3888
+ "epoch": 0.47995804485740606,
3889
+ "grad_norm": 0.16783900558948517,
3890
+ "learning_rate": 0.001,
3891
+ "loss": 2.7003,
3892
+ "num_input_tokens_seen": 22911385600,
3893
+ "step": 21850
3894
+ },
3895
+ {
3896
+ "epoch": 0.48105634701955113,
3897
+ "grad_norm": 0.14911381900310516,
3898
+ "learning_rate": 0.001,
3899
+ "loss": 2.6992,
3900
+ "num_input_tokens_seen": 22963814400,
3901
+ "step": 21900
3902
+ },
3903
+ {
3904
+ "epoch": 0.48215464918169626,
3905
+ "grad_norm": 0.15027394890785217,
3906
+ "learning_rate": 0.001,
3907
+ "loss": 2.6957,
3908
+ "num_input_tokens_seen": 23016243200,
3909
+ "step": 21950
3910
+ },
3911
+ {
3912
+ "epoch": 0.48325295134384133,
3913
+ "grad_norm": 0.1261301189661026,
3914
+ "learning_rate": 0.001,
3915
+ "loss": 2.7064,
3916
+ "num_input_tokens_seen": 23068672000,
3917
+ "step": 22000
3918
+ },
3919
+ {
3920
+ "epoch": 0.48325295134384133,
3921
+ "eval_loss": 2.6012015342712402,
3922
+ "eval_runtime": 64.9701,
3923
+ "eval_samples_per_second": 76.958,
3924
+ "eval_steps_per_second": 19.24,
3925
+ "num_input_tokens_seen": 23068672000,
3926
+ "step": 22000
3927
  }
3928
  ],
3929
  "logging_steps": 50,
3930
  "max_steps": 200000,
3931
+ "num_input_tokens_seen": 23068672000,
3932
  "num_train_epochs": 5,
3933
  "save_steps": 1000,
3934
  "stateful_callbacks": {
 
3943
  "attributes": {}
3944
  }
3945
  },
3946
+ "total_flos": 1.3137777751228416e+19,
3947
  "train_batch_size": 64,
3948
  "trial_name": null,
3949
  "trial_params": null