irodkin commited on
Commit
cb684df
·
verified ·
1 Parent(s): cc0c8c5

Training checkpoint at step 28000

Browse files
Files changed (1) hide show
  1. trainer_state.json +366 -6
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 26800,
3
- "best_metric": 2.381396532058716,
4
- "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-25000",
5
- "epoch": 0.54,
6
  "eval_steps": 100,
7
- "global_step": 27000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -9728,6 +9728,366 @@
9728
  "eval_samples_per_second": 3.212,
9729
  "eval_steps_per_second": 1.606,
9730
  "step": 27000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9731
  }
9732
  ],
9733
  "logging_steps": 25,
@@ -9747,7 +10107,7 @@
9747
  "attributes": {}
9748
  }
9749
  },
9750
- "total_flos": 8.59464620548345e+19,
9751
  "train_batch_size": 1,
9752
  "trial_name": null,
9753
  "trial_params": null
 
1
  {
2
+ "best_global_step": 28000,
3
+ "best_metric": 2.380680799484253,
4
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-28000",
5
+ "epoch": 0.56,
6
  "eval_steps": 100,
7
+ "global_step": 28000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
9728
  "eval_samples_per_second": 3.212,
9729
  "eval_steps_per_second": 1.606,
9730
  "step": 27000
9731
+ },
9732
+ {
9733
+ "epoch": 0.5405,
9734
+ "grad_norm": 0.5639894142193489,
9735
+ "learning_rate": 5.105777777777778e-06,
9736
+ "loss": 2.3604,
9737
+ "step": 27025
9738
+ },
9739
+ {
9740
+ "epoch": 0.541,
9741
+ "grad_norm": 0.5650474829629732,
9742
+ "learning_rate": 5.100222222222223e-06,
9743
+ "loss": 2.3615,
9744
+ "step": 27050
9745
+ },
9746
+ {
9747
+ "epoch": 0.5415,
9748
+ "grad_norm": 0.5549449402784257,
9749
+ "learning_rate": 5.094666666666666e-06,
9750
+ "loss": 2.3679,
9751
+ "step": 27075
9752
+ },
9753
+ {
9754
+ "epoch": 0.542,
9755
+ "grad_norm": 0.5615002192664388,
9756
+ "learning_rate": 5.0891111111111115e-06,
9757
+ "loss": 2.3634,
9758
+ "step": 27100
9759
+ },
9760
+ {
9761
+ "epoch": 0.542,
9762
+ "eval_loss": 2.381121873855591,
9763
+ "eval_runtime": 31.7586,
9764
+ "eval_samples_per_second": 3.212,
9765
+ "eval_steps_per_second": 1.606,
9766
+ "step": 27100
9767
+ },
9768
+ {
9769
+ "epoch": 0.5425,
9770
+ "grad_norm": 0.5403095468370492,
9771
+ "learning_rate": 5.083555555555556e-06,
9772
+ "loss": 2.3665,
9773
+ "step": 27125
9774
+ },
9775
+ {
9776
+ "epoch": 0.543,
9777
+ "grad_norm": 0.5421716749680758,
9778
+ "learning_rate": 5.078e-06,
9779
+ "loss": 2.369,
9780
+ "step": 27150
9781
+ },
9782
+ {
9783
+ "epoch": 0.5435,
9784
+ "grad_norm": 0.5590064616229682,
9785
+ "learning_rate": 5.072444444444446e-06,
9786
+ "loss": 2.3594,
9787
+ "step": 27175
9788
+ },
9789
+ {
9790
+ "epoch": 0.544,
9791
+ "grad_norm": 0.5444799207706167,
9792
+ "learning_rate": 5.066888888888889e-06,
9793
+ "loss": 2.3582,
9794
+ "step": 27200
9795
+ },
9796
+ {
9797
+ "epoch": 0.544,
9798
+ "eval_loss": 2.3811404705047607,
9799
+ "eval_runtime": 31.8368,
9800
+ "eval_samples_per_second": 3.204,
9801
+ "eval_steps_per_second": 1.602,
9802
+ "step": 27200
9803
+ },
9804
+ {
9805
+ "epoch": 0.5445,
9806
+ "grad_norm": 0.5694522608963828,
9807
+ "learning_rate": 5.0613333333333336e-06,
9808
+ "loss": 2.3651,
9809
+ "step": 27225
9810
+ },
9811
+ {
9812
+ "epoch": 0.545,
9813
+ "grad_norm": 0.5357232316900923,
9814
+ "learning_rate": 5.055777777777778e-06,
9815
+ "loss": 2.3595,
9816
+ "step": 27250
9817
+ },
9818
+ {
9819
+ "epoch": 0.5455,
9820
+ "grad_norm": 0.5449200504756736,
9821
+ "learning_rate": 5.050222222222223e-06,
9822
+ "loss": 2.3563,
9823
+ "step": 27275
9824
+ },
9825
+ {
9826
+ "epoch": 0.546,
9827
+ "grad_norm": 0.5669179572699722,
9828
+ "learning_rate": 5.044666666666667e-06,
9829
+ "loss": 2.3705,
9830
+ "step": 27300
9831
+ },
9832
+ {
9833
+ "epoch": 0.546,
9834
+ "eval_loss": 2.3810057640075684,
9835
+ "eval_runtime": 31.7869,
9836
+ "eval_samples_per_second": 3.209,
9837
+ "eval_steps_per_second": 1.604,
9838
+ "step": 27300
9839
+ },
9840
+ {
9841
+ "epoch": 0.5465,
9842
+ "grad_norm": 0.5536644347581473,
9843
+ "learning_rate": 5.039111111111111e-06,
9844
+ "loss": 2.3658,
9845
+ "step": 27325
9846
+ },
9847
+ {
9848
+ "epoch": 0.547,
9849
+ "grad_norm": 0.5774297317851765,
9850
+ "learning_rate": 5.0335555555555556e-06,
9851
+ "loss": 2.3553,
9852
+ "step": 27350
9853
+ },
9854
+ {
9855
+ "epoch": 0.5475,
9856
+ "grad_norm": 0.567395549600367,
9857
+ "learning_rate": 5.028000000000001e-06,
9858
+ "loss": 2.3694,
9859
+ "step": 27375
9860
+ },
9861
+ {
9862
+ "epoch": 0.548,
9863
+ "grad_norm": 0.5501789999743681,
9864
+ "learning_rate": 5.022444444444445e-06,
9865
+ "loss": 2.3643,
9866
+ "step": 27400
9867
+ },
9868
+ {
9869
+ "epoch": 0.548,
9870
+ "eval_loss": 2.3811025619506836,
9871
+ "eval_runtime": 31.9197,
9872
+ "eval_samples_per_second": 3.196,
9873
+ "eval_steps_per_second": 1.598,
9874
+ "step": 27400
9875
+ },
9876
+ {
9877
+ "epoch": 0.5485,
9878
+ "grad_norm": 0.5719215133111718,
9879
+ "learning_rate": 5.016888888888889e-06,
9880
+ "loss": 2.365,
9881
+ "step": 27425
9882
+ },
9883
+ {
9884
+ "epoch": 0.549,
9885
+ "grad_norm": 0.5899241097551456,
9886
+ "learning_rate": 5.011333333333333e-06,
9887
+ "loss": 2.3774,
9888
+ "step": 27450
9889
+ },
9890
+ {
9891
+ "epoch": 0.5495,
9892
+ "grad_norm": 0.5731413292155066,
9893
+ "learning_rate": 5.0057777777777784e-06,
9894
+ "loss": 2.3706,
9895
+ "step": 27475
9896
+ },
9897
+ {
9898
+ "epoch": 0.55,
9899
+ "grad_norm": 0.5425656065958468,
9900
+ "learning_rate": 5.000222222222223e-06,
9901
+ "loss": 2.3566,
9902
+ "step": 27500
9903
+ },
9904
+ {
9905
+ "epoch": 0.55,
9906
+ "eval_loss": 2.380763292312622,
9907
+ "eval_runtime": 31.8162,
9908
+ "eval_samples_per_second": 3.206,
9909
+ "eval_steps_per_second": 1.603,
9910
+ "step": 27500
9911
+ },
9912
+ {
9913
+ "epoch": 0.5505,
9914
+ "grad_norm": 0.5601626399029922,
9915
+ "learning_rate": 4.994666666666667e-06,
9916
+ "loss": 2.3762,
9917
+ "step": 27525
9918
+ },
9919
+ {
9920
+ "epoch": 0.551,
9921
+ "grad_norm": 0.5715204135637444,
9922
+ "learning_rate": 4.989111111111112e-06,
9923
+ "loss": 2.363,
9924
+ "step": 27550
9925
+ },
9926
+ {
9927
+ "epoch": 0.5515,
9928
+ "grad_norm": 0.547533853702179,
9929
+ "learning_rate": 4.983555555555556e-06,
9930
+ "loss": 2.3659,
9931
+ "step": 27575
9932
+ },
9933
+ {
9934
+ "epoch": 0.552,
9935
+ "grad_norm": 0.5817399132816639,
9936
+ "learning_rate": 4.9780000000000005e-06,
9937
+ "loss": 2.3693,
9938
+ "step": 27600
9939
+ },
9940
+ {
9941
+ "epoch": 0.552,
9942
+ "eval_loss": 2.3807787895202637,
9943
+ "eval_runtime": 31.8396,
9944
+ "eval_samples_per_second": 3.204,
9945
+ "eval_steps_per_second": 1.602,
9946
+ "step": 27600
9947
+ },
9948
+ {
9949
+ "epoch": 0.5525,
9950
+ "grad_norm": 0.544660595894246,
9951
+ "learning_rate": 4.972444444444445e-06,
9952
+ "loss": 2.3661,
9953
+ "step": 27625
9954
+ },
9955
+ {
9956
+ "epoch": 0.553,
9957
+ "grad_norm": 0.5813863819688693,
9958
+ "learning_rate": 4.966888888888889e-06,
9959
+ "loss": 2.365,
9960
+ "step": 27650
9961
+ },
9962
+ {
9963
+ "epoch": 0.5535,
9964
+ "grad_norm": 0.555794514365692,
9965
+ "learning_rate": 4.961333333333334e-06,
9966
+ "loss": 2.3724,
9967
+ "step": 27675
9968
+ },
9969
+ {
9970
+ "epoch": 0.554,
9971
+ "grad_norm": 0.5549771654031,
9972
+ "learning_rate": 4.955777777777778e-06,
9973
+ "loss": 2.3712,
9974
+ "step": 27700
9975
+ },
9976
+ {
9977
+ "epoch": 0.554,
9978
+ "eval_loss": 2.380859613418579,
9979
+ "eval_runtime": 32.035,
9980
+ "eval_samples_per_second": 3.184,
9981
+ "eval_steps_per_second": 1.592,
9982
+ "step": 27700
9983
+ },
9984
+ {
9985
+ "epoch": 0.5545,
9986
+ "grad_norm": 0.5660580874490311,
9987
+ "learning_rate": 4.9502222222222225e-06,
9988
+ "loss": 2.3626,
9989
+ "step": 27725
9990
+ },
9991
+ {
9992
+ "epoch": 0.555,
9993
+ "grad_norm": 0.5408935222204184,
9994
+ "learning_rate": 4.944666666666667e-06,
9995
+ "loss": 2.3546,
9996
+ "step": 27750
9997
+ },
9998
+ {
9999
+ "epoch": 0.5555,
10000
+ "grad_norm": 0.5574539497290301,
10001
+ "learning_rate": 4.939111111111112e-06,
10002
+ "loss": 2.3503,
10003
+ "step": 27775
10004
+ },
10005
+ {
10006
+ "epoch": 0.556,
10007
+ "grad_norm": 0.5733587459238179,
10008
+ "learning_rate": 4.933555555555556e-06,
10009
+ "loss": 2.3787,
10010
+ "step": 27800
10011
+ },
10012
+ {
10013
+ "epoch": 0.556,
10014
+ "eval_loss": 2.380819082260132,
10015
+ "eval_runtime": 31.8731,
10016
+ "eval_samples_per_second": 3.2,
10017
+ "eval_steps_per_second": 1.6,
10018
+ "step": 27800
10019
+ },
10020
+ {
10021
+ "epoch": 0.5565,
10022
+ "grad_norm": 0.5469010479471977,
10023
+ "learning_rate": 4.928000000000001e-06,
10024
+ "loss": 2.3728,
10025
+ "step": 27825
10026
+ },
10027
+ {
10028
+ "epoch": 0.557,
10029
+ "grad_norm": 0.5575923461377743,
10030
+ "learning_rate": 4.9224444444444445e-06,
10031
+ "loss": 2.3587,
10032
+ "step": 27850
10033
+ },
10034
+ {
10035
+ "epoch": 0.5575,
10036
+ "grad_norm": 0.5484615569385746,
10037
+ "learning_rate": 4.91688888888889e-06,
10038
+ "loss": 2.3554,
10039
+ "step": 27875
10040
+ },
10041
+ {
10042
+ "epoch": 0.558,
10043
+ "grad_norm": 0.5700580906470195,
10044
+ "learning_rate": 4.911333333333333e-06,
10045
+ "loss": 2.3591,
10046
+ "step": 27900
10047
+ },
10048
+ {
10049
+ "epoch": 0.558,
10050
+ "eval_loss": 2.380748748779297,
10051
+ "eval_runtime": 31.8799,
10052
+ "eval_samples_per_second": 3.2,
10053
+ "eval_steps_per_second": 1.6,
10054
+ "step": 27900
10055
+ },
10056
+ {
10057
+ "epoch": 0.5585,
10058
+ "grad_norm": 0.5644741625244013,
10059
+ "learning_rate": 4.9057777777777785e-06,
10060
+ "loss": 2.3573,
10061
+ "step": 27925
10062
+ },
10063
+ {
10064
+ "epoch": 0.559,
10065
+ "grad_norm": 0.5518750142742082,
10066
+ "learning_rate": 4.900222222222223e-06,
10067
+ "loss": 2.3722,
10068
+ "step": 27950
10069
+ },
10070
+ {
10071
+ "epoch": 0.5595,
10072
+ "grad_norm": 0.5570570164343176,
10073
+ "learning_rate": 4.894666666666667e-06,
10074
+ "loss": 2.3644,
10075
+ "step": 27975
10076
+ },
10077
+ {
10078
+ "epoch": 0.56,
10079
+ "grad_norm": 0.5454507656456767,
10080
+ "learning_rate": 4.889111111111112e-06,
10081
+ "loss": 2.3545,
10082
+ "step": 28000
10083
+ },
10084
+ {
10085
+ "epoch": 0.56,
10086
+ "eval_loss": 2.380680799484253,
10087
+ "eval_runtime": 31.8506,
10088
+ "eval_samples_per_second": 3.202,
10089
+ "eval_steps_per_second": 1.601,
10090
+ "step": 28000
10091
  }
10092
  ],
10093
  "logging_steps": 25,
 
10107
  "attributes": {}
10108
  }
10109
  },
10110
+ "total_flos": 8.91296643531617e+19,
10111
  "train_batch_size": 1,
10112
  "trial_name": null,
10113
  "trial_params": null