irodkin committed on
Commit
fed1305
·
verified ·
1 Parent(s): a9f463c

Training checkpoint at step 14000

Browse files
Files changed (1) hide show
  1. trainer_state.json +365 -5
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 13000,
3
- "best_metric": 2.4009385108947754,
4
  "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-13000",
5
- "epoch": 0.26,
6
  "eval_steps": 100,
7
- "global_step": 13000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4688,6 +4688,366 @@
4688
  "eval_samples_per_second": 3.207,
4689
  "eval_steps_per_second": 1.603,
4690
  "step": 13000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4691
  }
4692
  ],
4693
  "logging_steps": 25,
@@ -4707,7 +5067,7 @@
4707
  "attributes": {}
4708
  }
4709
  },
4710
- "total_flos": 4.138162987825365e+19,
4711
  "train_batch_size": 1,
4712
  "trial_name": null,
4713
  "trial_params": null
 
1
  {
2
+ "best_global_step": 13900,
3
+ "best_metric": 2.3990118503570557,
4
  "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-13000",
5
+ "epoch": 0.28,
6
  "eval_steps": 100,
7
+ "global_step": 14000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4688
  "eval_samples_per_second": 3.207,
4689
  "eval_steps_per_second": 1.603,
4690
  "step": 13000
4691
+ },
4692
+ {
4693
+ "epoch": 0.2605,
4694
+ "grad_norm": 0.5758550911461594,
4695
+ "learning_rate": 8.21688888888889e-06,
4696
+ "loss": 2.39,
4697
+ "step": 13025
4698
+ },
4699
+ {
4700
+ "epoch": 0.261,
4701
+ "grad_norm": 0.5506335078390368,
4702
+ "learning_rate": 8.211333333333334e-06,
4703
+ "loss": 2.3879,
4704
+ "step": 13050
4705
+ },
4706
+ {
4707
+ "epoch": 0.2615,
4708
+ "grad_norm": 0.578047700560021,
4709
+ "learning_rate": 8.205777777777777e-06,
4710
+ "loss": 2.3772,
4711
+ "step": 13075
4712
+ },
4713
+ {
4714
+ "epoch": 0.262,
4715
+ "grad_norm": 0.5517825098879646,
4716
+ "learning_rate": 8.200222222222223e-06,
4717
+ "loss": 2.3751,
4718
+ "step": 13100
4719
+ },
4720
+ {
4721
+ "epoch": 0.262,
4722
+ "eval_loss": 2.4008378982543945,
4723
+ "eval_runtime": 31.8219,
4724
+ "eval_samples_per_second": 3.205,
4725
+ "eval_steps_per_second": 1.603,
4726
+ "step": 13100
4727
+ },
4728
+ {
4729
+ "epoch": 0.2625,
4730
+ "grad_norm": 0.6060142395322289,
4731
+ "learning_rate": 8.194666666666668e-06,
4732
+ "loss": 2.3859,
4733
+ "step": 13125
4734
+ },
4735
+ {
4736
+ "epoch": 0.263,
4737
+ "grad_norm": 0.6151379264003006,
4738
+ "learning_rate": 8.189111111111111e-06,
4739
+ "loss": 2.3906,
4740
+ "step": 13150
4741
+ },
4742
+ {
4743
+ "epoch": 0.2635,
4744
+ "grad_norm": 0.5889091981712471,
4745
+ "learning_rate": 8.183555555555555e-06,
4746
+ "loss": 2.3813,
4747
+ "step": 13175
4748
+ },
4749
+ {
4750
+ "epoch": 0.264,
4751
+ "grad_norm": 0.7021686085407579,
4752
+ "learning_rate": 8.178e-06,
4753
+ "loss": 2.3844,
4754
+ "step": 13200
4755
+ },
4756
+ {
4757
+ "epoch": 0.264,
4758
+ "eval_loss": 2.400826930999756,
4759
+ "eval_runtime": 31.7255,
4760
+ "eval_samples_per_second": 3.215,
4761
+ "eval_steps_per_second": 1.608,
4762
+ "step": 13200
4763
+ },
4764
+ {
4765
+ "epoch": 0.2645,
4766
+ "grad_norm": 0.5738899506070113,
4767
+ "learning_rate": 8.172444444444446e-06,
4768
+ "loss": 2.3974,
4769
+ "step": 13225
4770
+ },
4771
+ {
4772
+ "epoch": 0.265,
4773
+ "grad_norm": 0.618543215020873,
4774
+ "learning_rate": 8.166888888888889e-06,
4775
+ "loss": 2.3846,
4776
+ "step": 13250
4777
+ },
4778
+ {
4779
+ "epoch": 0.2655,
4780
+ "grad_norm": 0.5529480549821216,
4781
+ "learning_rate": 8.161333333333334e-06,
4782
+ "loss": 2.3816,
4783
+ "step": 13275
4784
+ },
4785
+ {
4786
+ "epoch": 0.266,
4787
+ "grad_norm": 0.569904631452621,
4788
+ "learning_rate": 8.155777777777778e-06,
4789
+ "loss": 2.3809,
4790
+ "step": 13300
4791
+ },
4792
+ {
4793
+ "epoch": 0.266,
4794
+ "eval_loss": 2.4002933502197266,
4795
+ "eval_runtime": 31.6983,
4796
+ "eval_samples_per_second": 3.218,
4797
+ "eval_steps_per_second": 1.609,
4798
+ "step": 13300
4799
+ },
4800
+ {
4801
+ "epoch": 0.2665,
4802
+ "grad_norm": 0.5743878084278218,
4803
+ "learning_rate": 8.150222222222223e-06,
4804
+ "loss": 2.3941,
4805
+ "step": 13325
4806
+ },
4807
+ {
4808
+ "epoch": 0.267,
4809
+ "grad_norm": 0.5594243149898632,
4810
+ "learning_rate": 8.144666666666667e-06,
4811
+ "loss": 2.3878,
4812
+ "step": 13350
4813
+ },
4814
+ {
4815
+ "epoch": 0.2675,
4816
+ "grad_norm": 0.5810666087448406,
4817
+ "learning_rate": 8.139111111111112e-06,
4818
+ "loss": 2.381,
4819
+ "step": 13375
4820
+ },
4821
+ {
4822
+ "epoch": 0.268,
4823
+ "grad_norm": 0.5595852108101106,
4824
+ "learning_rate": 8.133555555555557e-06,
4825
+ "loss": 2.3792,
4826
+ "step": 13400
4827
+ },
4828
+ {
4829
+ "epoch": 0.268,
4830
+ "eval_loss": 2.400261878967285,
4831
+ "eval_runtime": 31.6975,
4832
+ "eval_samples_per_second": 3.218,
4833
+ "eval_steps_per_second": 1.609,
4834
+ "step": 13400
4835
+ },
4836
+ {
4837
+ "epoch": 0.2685,
4838
+ "grad_norm": 0.5789530002361615,
4839
+ "learning_rate": 8.128e-06,
4840
+ "loss": 2.3759,
4841
+ "step": 13425
4842
+ },
4843
+ {
4844
+ "epoch": 0.269,
4845
+ "grad_norm": 0.5662301407639397,
4846
+ "learning_rate": 8.122444444444444e-06,
4847
+ "loss": 2.3791,
4848
+ "step": 13450
4849
+ },
4850
+ {
4851
+ "epoch": 0.2695,
4852
+ "grad_norm": 0.6131145841315326,
4853
+ "learning_rate": 8.11688888888889e-06,
4854
+ "loss": 2.3833,
4855
+ "step": 13475
4856
+ },
4857
+ {
4858
+ "epoch": 0.27,
4859
+ "grad_norm": 0.5607318024001929,
4860
+ "learning_rate": 8.111333333333335e-06,
4861
+ "loss": 2.3724,
4862
+ "step": 13500
4863
+ },
4864
+ {
4865
+ "epoch": 0.27,
4866
+ "eval_loss": 2.4000020027160645,
4867
+ "eval_runtime": 31.71,
4868
+ "eval_samples_per_second": 3.217,
4869
+ "eval_steps_per_second": 1.608,
4870
+ "step": 13500
4871
+ },
4872
+ {
4873
+ "epoch": 0.2705,
4874
+ "grad_norm": 0.5692755244185855,
4875
+ "learning_rate": 8.105777777777778e-06,
4876
+ "loss": 2.3788,
4877
+ "step": 13525
4878
+ },
4879
+ {
4880
+ "epoch": 0.271,
4881
+ "grad_norm": 0.5647342769538716,
4882
+ "learning_rate": 8.100222222222222e-06,
4883
+ "loss": 2.3799,
4884
+ "step": 13550
4885
+ },
4886
+ {
4887
+ "epoch": 0.2715,
4888
+ "grad_norm": 0.5976773519089553,
4889
+ "learning_rate": 8.094666666666667e-06,
4890
+ "loss": 2.3828,
4891
+ "step": 13575
4892
+ },
4893
+ {
4894
+ "epoch": 0.272,
4895
+ "grad_norm": 0.5642506953063758,
4896
+ "learning_rate": 8.089111111111112e-06,
4897
+ "loss": 2.3835,
4898
+ "step": 13600
4899
+ },
4900
+ {
4901
+ "epoch": 0.272,
4902
+ "eval_loss": 2.400066614151001,
4903
+ "eval_runtime": 31.8128,
4904
+ "eval_samples_per_second": 3.206,
4905
+ "eval_steps_per_second": 1.603,
4906
+ "step": 13600
4907
+ },
4908
+ {
4909
+ "epoch": 0.2725,
4910
+ "grad_norm": 0.5616659241704035,
4911
+ "learning_rate": 8.083555555555556e-06,
4912
+ "loss": 2.3801,
4913
+ "step": 13625
4914
+ },
4915
+ {
4916
+ "epoch": 0.273,
4917
+ "grad_norm": 0.5878315825498157,
4918
+ "learning_rate": 8.078e-06,
4919
+ "loss": 2.3781,
4920
+ "step": 13650
4921
+ },
4922
+ {
4923
+ "epoch": 0.2735,
4924
+ "grad_norm": 0.5716337786191225,
4925
+ "learning_rate": 8.072444444444445e-06,
4926
+ "loss": 2.3932,
4927
+ "step": 13675
4928
+ },
4929
+ {
4930
+ "epoch": 0.274,
4931
+ "grad_norm": 0.5636757577555458,
4932
+ "learning_rate": 8.06688888888889e-06,
4933
+ "loss": 2.4041,
4934
+ "step": 13700
4935
+ },
4936
+ {
4937
+ "epoch": 0.274,
4938
+ "eval_loss": 2.3997650146484375,
4939
+ "eval_runtime": 31.4871,
4940
+ "eval_samples_per_second": 3.239,
4941
+ "eval_steps_per_second": 1.62,
4942
+ "step": 13700
4943
+ },
4944
+ {
4945
+ "epoch": 0.2745,
4946
+ "grad_norm": 0.5564992808480433,
4947
+ "learning_rate": 8.061333333333334e-06,
4948
+ "loss": 2.3971,
4949
+ "step": 13725
4950
+ },
4951
+ {
4952
+ "epoch": 0.275,
4953
+ "grad_norm": 0.5736246457745038,
4954
+ "learning_rate": 8.055777777777777e-06,
4955
+ "loss": 2.3847,
4956
+ "step": 13750
4957
+ },
4958
+ {
4959
+ "epoch": 0.2755,
4960
+ "grad_norm": 0.5423430973262378,
4961
+ "learning_rate": 8.050222222222222e-06,
4962
+ "loss": 2.3786,
4963
+ "step": 13775
4964
+ },
4965
+ {
4966
+ "epoch": 0.276,
4967
+ "grad_norm": 0.5672815850751382,
4968
+ "learning_rate": 8.044666666666668e-06,
4969
+ "loss": 2.3945,
4970
+ "step": 13800
4971
+ },
4972
+ {
4973
+ "epoch": 0.276,
4974
+ "eval_loss": 2.399338483810425,
4975
+ "eval_runtime": 31.3741,
4976
+ "eval_samples_per_second": 3.251,
4977
+ "eval_steps_per_second": 1.626,
4978
+ "step": 13800
4979
+ },
4980
+ {
4981
+ "epoch": 0.2765,
4982
+ "grad_norm": 0.5919813611615313,
4983
+ "learning_rate": 8.039111111111111e-06,
4984
+ "loss": 2.3738,
4985
+ "step": 13825
4986
+ },
4987
+ {
4988
+ "epoch": 0.277,
4989
+ "grad_norm": 0.5679311638374708,
4990
+ "learning_rate": 8.033555555555556e-06,
4991
+ "loss": 2.3771,
4992
+ "step": 13850
4993
+ },
4994
+ {
4995
+ "epoch": 0.2775,
4996
+ "grad_norm": 0.5533203763453908,
4997
+ "learning_rate": 8.028e-06,
4998
+ "loss": 2.3831,
4999
+ "step": 13875
5000
+ },
5001
+ {
5002
+ "epoch": 0.278,
5003
+ "grad_norm": 0.5674818164725537,
5004
+ "learning_rate": 8.022444444444445e-06,
5005
+ "loss": 2.3811,
5006
+ "step": 13900
5007
+ },
5008
+ {
5009
+ "epoch": 0.278,
5010
+ "eval_loss": 2.3990118503570557,
5011
+ "eval_runtime": 31.47,
5012
+ "eval_samples_per_second": 3.241,
5013
+ "eval_steps_per_second": 1.621,
5014
+ "step": 13900
5015
+ },
5016
+ {
5017
+ "epoch": 0.2785,
5018
+ "grad_norm": 0.5664699981127816,
5019
+ "learning_rate": 8.016888888888889e-06,
5020
+ "loss": 2.3848,
5021
+ "step": 13925
5022
+ },
5023
+ {
5024
+ "epoch": 0.279,
5025
+ "grad_norm": 0.6085875103795902,
5026
+ "learning_rate": 8.011333333333334e-06,
5027
+ "loss": 2.3822,
5028
+ "step": 13950
5029
+ },
5030
+ {
5031
+ "epoch": 0.2795,
5032
+ "grad_norm": 0.561160479481643,
5033
+ "learning_rate": 8.00577777777778e-06,
5034
+ "loss": 2.3722,
5035
+ "step": 13975
5036
+ },
5037
+ {
5038
+ "epoch": 0.28,
5039
+ "grad_norm": 0.566395855978902,
5040
+ "learning_rate": 8.000222222222223e-06,
5041
+ "loss": 2.3922,
5042
+ "step": 14000
5043
+ },
5044
+ {
5045
+ "epoch": 0.28,
5046
+ "eval_loss": 2.3991119861602783,
5047
+ "eval_runtime": 31.6591,
5048
+ "eval_samples_per_second": 3.222,
5049
+ "eval_steps_per_second": 1.611,
5050
+ "step": 14000
5051
  }
5052
  ],
5053
  "logging_steps": 25,
 
5067
  "attributes": {}
5068
  }
5069
  },
5070
+ "total_flos": 4.456483217658085e+19,
5071
  "train_batch_size": 1,
5072
  "trial_name": null,
5073
  "trial_params": null