irodkin commited on
Commit
dd4fc56
·
verified ·
1 Parent(s): 765f857

Training checkpoint at step 11000

Browse files
Files changed (1) hide show
  1. trainer_state.json +365 -5
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 9800,
3
- "best_metric": 2.4076178073883057,
4
  "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-9000",
5
- "epoch": 0.2,
6
  "eval_steps": 100,
7
- "global_step": 10000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3608,6 +3608,366 @@
3608
  "eval_samples_per_second": 3.215,
3609
  "eval_steps_per_second": 1.607,
3610
  "step": 10000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3611
  }
3612
  ],
3613
  "logging_steps": 25,
@@ -3627,7 +3987,7 @@
3627
  "attributes": {}
3628
  }
3629
  },
3630
- "total_flos": 3.183202298327204e+19,
3631
  "train_batch_size": 1,
3632
  "trial_name": null,
3633
  "trial_params": null
 
1
  {
2
+ "best_global_step": 10900,
3
+ "best_metric": 2.4055566787719727,
4
  "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-9000",
5
+ "epoch": 0.22,
6
  "eval_steps": 100,
7
+ "global_step": 11000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3608
  "eval_samples_per_second": 3.215,
3609
  "eval_steps_per_second": 1.607,
3610
  "step": 10000
3611
+ },
3612
+ {
3613
+ "epoch": 0.2005,
3614
+ "grad_norm": 0.5959223333719136,
3615
+ "learning_rate": 8.883555555555557e-06,
3616
+ "loss": 2.387,
3617
+ "step": 10025
3618
+ },
3619
+ {
3620
+ "epoch": 0.201,
3621
+ "grad_norm": 0.604008744038432,
3622
+ "learning_rate": 8.878e-06,
3623
+ "loss": 2.4016,
3624
+ "step": 10050
3625
+ },
3626
+ {
3627
+ "epoch": 0.2015,
3628
+ "grad_norm": 0.5721419521050413,
3629
+ "learning_rate": 8.872444444444444e-06,
3630
+ "loss": 2.3884,
3631
+ "step": 10075
3632
+ },
3633
+ {
3634
+ "epoch": 0.202,
3635
+ "grad_norm": 0.5986167284289824,
3636
+ "learning_rate": 8.86688888888889e-06,
3637
+ "loss": 2.3945,
3638
+ "step": 10100
3639
+ },
3640
+ {
3641
+ "epoch": 0.202,
3642
+ "eval_loss": 2.4074654579162598,
3643
+ "eval_runtime": 31.8658,
3644
+ "eval_samples_per_second": 3.201,
3645
+ "eval_steps_per_second": 1.6,
3646
+ "step": 10100
3647
+ },
3648
+ {
3649
+ "epoch": 0.2025,
3650
+ "grad_norm": 0.6046479507995179,
3651
+ "learning_rate": 8.861333333333334e-06,
3652
+ "loss": 2.3858,
3653
+ "step": 10125
3654
+ },
3655
+ {
3656
+ "epoch": 0.203,
3657
+ "grad_norm": 0.5633013817443194,
3658
+ "learning_rate": 8.855777777777778e-06,
3659
+ "loss": 2.3879,
3660
+ "step": 10150
3661
+ },
3662
+ {
3663
+ "epoch": 0.2035,
3664
+ "grad_norm": 0.5953174401982892,
3665
+ "learning_rate": 8.850222222222223e-06,
3666
+ "loss": 2.3967,
3667
+ "step": 10175
3668
+ },
3669
+ {
3670
+ "epoch": 0.204,
3671
+ "grad_norm": 0.6306212647705982,
3672
+ "learning_rate": 8.844666666666667e-06,
3673
+ "loss": 2.3927,
3674
+ "step": 10200
3675
+ },
3676
+ {
3677
+ "epoch": 0.204,
3678
+ "eval_loss": 2.407031297683716,
3679
+ "eval_runtime": 31.7801,
3680
+ "eval_samples_per_second": 3.21,
3681
+ "eval_steps_per_second": 1.605,
3682
+ "step": 10200
3683
+ },
3684
+ {
3685
+ "epoch": 0.2045,
3686
+ "grad_norm": 0.5605617492602121,
3687
+ "learning_rate": 8.839111111111112e-06,
3688
+ "loss": 2.4081,
3689
+ "step": 10225
3690
+ },
3691
+ {
3692
+ "epoch": 0.205,
3693
+ "grad_norm": 0.5739246143474902,
3694
+ "learning_rate": 8.833555555555556e-06,
3695
+ "loss": 2.3841,
3696
+ "step": 10250
3697
+ },
3698
+ {
3699
+ "epoch": 0.2055,
3700
+ "grad_norm": 0.5938549959471341,
3701
+ "learning_rate": 8.828000000000001e-06,
3702
+ "loss": 2.3902,
3703
+ "step": 10275
3704
+ },
3705
+ {
3706
+ "epoch": 0.206,
3707
+ "grad_norm": 0.5902936931354175,
3708
+ "learning_rate": 8.822444444444446e-06,
3709
+ "loss": 2.3905,
3710
+ "step": 10300
3711
+ },
3712
+ {
3713
+ "epoch": 0.206,
3714
+ "eval_loss": 2.4066004753112793,
3715
+ "eval_runtime": 31.7707,
3716
+ "eval_samples_per_second": 3.211,
3717
+ "eval_steps_per_second": 1.605,
3718
+ "step": 10300
3719
+ },
3720
+ {
3721
+ "epoch": 0.2065,
3722
+ "grad_norm": 0.5697435057211838,
3723
+ "learning_rate": 8.81688888888889e-06,
3724
+ "loss": 2.3854,
3725
+ "step": 10325
3726
+ },
3727
+ {
3728
+ "epoch": 0.207,
3729
+ "grad_norm": 0.5879126074250441,
3730
+ "learning_rate": 8.811333333333333e-06,
3731
+ "loss": 2.3917,
3732
+ "step": 10350
3733
+ },
3734
+ {
3735
+ "epoch": 0.2075,
3736
+ "grad_norm": 0.5800642153182343,
3737
+ "learning_rate": 8.805777777777778e-06,
3738
+ "loss": 2.3929,
3739
+ "step": 10375
3740
+ },
3741
+ {
3742
+ "epoch": 0.208,
3743
+ "grad_norm": 0.5794546973922929,
3744
+ "learning_rate": 8.800222222222224e-06,
3745
+ "loss": 2.3912,
3746
+ "step": 10400
3747
+ },
3748
+ {
3749
+ "epoch": 0.208,
3750
+ "eval_loss": 2.4065024852752686,
3751
+ "eval_runtime": 31.7191,
3752
+ "eval_samples_per_second": 3.216,
3753
+ "eval_steps_per_second": 1.608,
3754
+ "step": 10400
3755
+ },
3756
+ {
3757
+ "epoch": 0.2085,
3758
+ "grad_norm": 0.5776454190712899,
3759
+ "learning_rate": 8.794666666666667e-06,
3760
+ "loss": 2.386,
3761
+ "step": 10425
3762
+ },
3763
+ {
3764
+ "epoch": 0.209,
3765
+ "grad_norm": 0.5578455228918948,
3766
+ "learning_rate": 8.78911111111111e-06,
3767
+ "loss": 2.3869,
3768
+ "step": 10450
3769
+ },
3770
+ {
3771
+ "epoch": 0.2095,
3772
+ "grad_norm": 0.5721674793656858,
3773
+ "learning_rate": 8.783555555555556e-06,
3774
+ "loss": 2.3779,
3775
+ "step": 10475
3776
+ },
3777
+ {
3778
+ "epoch": 0.21,
3779
+ "grad_norm": 0.5950633442730316,
3780
+ "learning_rate": 8.778000000000001e-06,
3781
+ "loss": 2.3845,
3782
+ "step": 10500
3783
+ },
3784
+ {
3785
+ "epoch": 0.21,
3786
+ "eval_loss": 2.4065566062927246,
3787
+ "eval_runtime": 31.8091,
3788
+ "eval_samples_per_second": 3.207,
3789
+ "eval_steps_per_second": 1.603,
3790
+ "step": 10500
3791
+ },
3792
+ {
3793
+ "epoch": 0.2105,
3794
+ "grad_norm": 0.605078293663896,
3795
+ "learning_rate": 8.772444444444445e-06,
3796
+ "loss": 2.3913,
3797
+ "step": 10525
3798
+ },
3799
+ {
3800
+ "epoch": 0.211,
3801
+ "grad_norm": 0.567849892850204,
3802
+ "learning_rate": 8.766888888888888e-06,
3803
+ "loss": 2.3966,
3804
+ "step": 10550
3805
+ },
3806
+ {
3807
+ "epoch": 0.2115,
3808
+ "grad_norm": 0.6876645024191659,
3809
+ "learning_rate": 8.761333333333334e-06,
3810
+ "loss": 2.3993,
3811
+ "step": 10575
3812
+ },
3813
+ {
3814
+ "epoch": 0.212,
3815
+ "grad_norm": 0.5841938304908528,
3816
+ "learning_rate": 8.755777777777779e-06,
3817
+ "loss": 2.3916,
3818
+ "step": 10600
3819
+ },
3820
+ {
3821
+ "epoch": 0.212,
3822
+ "eval_loss": 2.4061877727508545,
3823
+ "eval_runtime": 31.8484,
3824
+ "eval_samples_per_second": 3.203,
3825
+ "eval_steps_per_second": 1.601,
3826
+ "step": 10600
3827
+ },
3828
+ {
3829
+ "epoch": 0.2125,
3830
+ "grad_norm": 0.5649004204666818,
3831
+ "learning_rate": 8.750222222222223e-06,
3832
+ "loss": 2.381,
3833
+ "step": 10625
3834
+ },
3835
+ {
3836
+ "epoch": 0.213,
3837
+ "grad_norm": 0.5678489376050115,
3838
+ "learning_rate": 8.744666666666666e-06,
3839
+ "loss": 2.3995,
3840
+ "step": 10650
3841
+ },
3842
+ {
3843
+ "epoch": 0.2135,
3844
+ "grad_norm": 0.5712733595317334,
3845
+ "learning_rate": 8.739111111111111e-06,
3846
+ "loss": 2.3954,
3847
+ "step": 10675
3848
+ },
3849
+ {
3850
+ "epoch": 0.214,
3851
+ "grad_norm": 0.573353636066434,
3852
+ "learning_rate": 8.733555555555557e-06,
3853
+ "loss": 2.379,
3854
+ "step": 10700
3855
+ },
3856
+ {
3857
+ "epoch": 0.214,
3858
+ "eval_loss": 2.4055771827697754,
3859
+ "eval_runtime": 31.8192,
3860
+ "eval_samples_per_second": 3.206,
3861
+ "eval_steps_per_second": 1.603,
3862
+ "step": 10700
3863
+ },
3864
+ {
3865
+ "epoch": 0.2145,
3866
+ "grad_norm": 0.6133309651928519,
3867
+ "learning_rate": 8.728e-06,
3868
+ "loss": 2.3946,
3869
+ "step": 10725
3870
+ },
3871
+ {
3872
+ "epoch": 0.215,
3873
+ "grad_norm": 0.6033931866035528,
3874
+ "learning_rate": 8.722444444444445e-06,
3875
+ "loss": 2.3935,
3876
+ "step": 10750
3877
+ },
3878
+ {
3879
+ "epoch": 0.2155,
3880
+ "grad_norm": 0.6008672136487845,
3881
+ "learning_rate": 8.716888888888889e-06,
3882
+ "loss": 2.3872,
3883
+ "step": 10775
3884
+ },
3885
+ {
3886
+ "epoch": 0.216,
3887
+ "grad_norm": 0.5728704483928734,
3888
+ "learning_rate": 8.711333333333334e-06,
3889
+ "loss": 2.3917,
3890
+ "step": 10800
3891
+ },
3892
+ {
3893
+ "epoch": 0.216,
3894
+ "eval_loss": 2.4059016704559326,
3895
+ "eval_runtime": 31.7995,
3896
+ "eval_samples_per_second": 3.208,
3897
+ "eval_steps_per_second": 1.604,
3898
+ "step": 10800
3899
+ },
3900
+ {
3901
+ "epoch": 0.2165,
3902
+ "grad_norm": 0.5888944153423502,
3903
+ "learning_rate": 8.705777777777778e-06,
3904
+ "loss": 2.3946,
3905
+ "step": 10825
3906
+ },
3907
+ {
3908
+ "epoch": 0.217,
3909
+ "grad_norm": 0.5947880979306366,
3910
+ "learning_rate": 8.700222222222223e-06,
3911
+ "loss": 2.3736,
3912
+ "step": 10850
3913
+ },
3914
+ {
3915
+ "epoch": 0.2175,
3916
+ "grad_norm": 0.6163696606959644,
3917
+ "learning_rate": 8.694666666666668e-06,
3918
+ "loss": 2.3838,
3919
+ "step": 10875
3920
+ },
3921
+ {
3922
+ "epoch": 0.218,
3923
+ "grad_norm": 0.6004092938812543,
3924
+ "learning_rate": 8.689111111111112e-06,
3925
+ "loss": 2.3942,
3926
+ "step": 10900
3927
+ },
3928
+ {
3929
+ "epoch": 0.218,
3930
+ "eval_loss": 2.4055566787719727,
3931
+ "eval_runtime": 31.7386,
3932
+ "eval_samples_per_second": 3.214,
3933
+ "eval_steps_per_second": 1.607,
3934
+ "step": 10900
3935
+ },
3936
+ {
3937
+ "epoch": 0.2185,
3938
+ "grad_norm": 0.5786273641598245,
3939
+ "learning_rate": 8.683555555555555e-06,
3940
+ "loss": 2.3938,
3941
+ "step": 10925
3942
+ },
3943
+ {
3944
+ "epoch": 0.219,
3945
+ "grad_norm": 0.5764162885826465,
3946
+ "learning_rate": 8.678e-06,
3947
+ "loss": 2.3939,
3948
+ "step": 10950
3949
+ },
3950
+ {
3951
+ "epoch": 0.2195,
3952
+ "grad_norm": 0.5923291223123188,
3953
+ "learning_rate": 8.672444444444446e-06,
3954
+ "loss": 2.3847,
3955
+ "step": 10975
3956
+ },
3957
+ {
3958
+ "epoch": 0.22,
3959
+ "grad_norm": 0.6102815146568634,
3960
+ "learning_rate": 8.66688888888889e-06,
3961
+ "loss": 2.3901,
3962
+ "step": 11000
3963
+ },
3964
+ {
3965
+ "epoch": 0.22,
3966
+ "eval_loss": 2.405616044998169,
3967
+ "eval_runtime": 31.7048,
3968
+ "eval_samples_per_second": 3.217,
3969
+ "eval_steps_per_second": 1.609,
3970
+ "step": 11000
3971
  }
3972
  ],
3973
  "logging_steps": 25,
 
3987
  "attributes": {}
3988
  }
3989
  },
3990
+ "total_flos": 3.5015225281599242e+19,
3991
  "train_batch_size": 1,
3992
  "trial_name": null,
3993
  "trial_params": null