irodkin commited on
Commit
81cbc68
·
verified ·
1 Parent(s): dc00825

Training checkpoint at step 17000

Browse files
Files changed (1) hide show
  1. trainer_state.json +366 -6
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 15900,
3
- "best_metric": 2.3957200050354004,
4
- "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-15000",
5
- "epoch": 0.32,
6
  "eval_steps": 100,
7
- "global_step": 16000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5768,6 +5768,366 @@
5768
  "eval_samples_per_second": 3.19,
5769
  "eval_steps_per_second": 1.595,
5770
  "step": 16000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5771
  }
5772
  ],
5773
  "logging_steps": 25,
@@ -5787,7 +6147,7 @@
5787
  "attributes": {}
5788
  }
5789
  },
5790
- "total_flos": 5.093123677323526e+19,
5791
  "train_batch_size": 1,
5792
  "trial_name": null,
5793
  "trial_params": null
 
1
  {
2
+ "best_global_step": 17000,
3
+ "best_metric": 2.394216775894165,
4
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-17000",
5
+ "epoch": 0.34,
6
  "eval_steps": 100,
7
+ "global_step": 17000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5768
  "eval_samples_per_second": 3.19,
5769
  "eval_steps_per_second": 1.595,
5770
  "step": 16000
5771
+ },
5772
+ {
5773
+ "epoch": 0.3205,
5774
+ "grad_norm": 0.5490540509150809,
5775
+ "learning_rate": 7.550222222222223e-06,
5776
+ "loss": 2.3908,
5777
+ "step": 16025
5778
+ },
5779
+ {
5780
+ "epoch": 0.321,
5781
+ "grad_norm": 0.5604566538327537,
5782
+ "learning_rate": 7.5446666666666665e-06,
5783
+ "loss": 2.3816,
5784
+ "step": 16050
5785
+ },
5786
+ {
5787
+ "epoch": 0.3215,
5788
+ "grad_norm": 0.5482351645184266,
5789
+ "learning_rate": 7.539111111111112e-06,
5790
+ "loss": 2.3783,
5791
+ "step": 16075
5792
+ },
5793
+ {
5794
+ "epoch": 0.322,
5795
+ "grad_norm": 0.5738611670880387,
5796
+ "learning_rate": 7.533555555555556e-06,
5797
+ "loss": 2.3807,
5798
+ "step": 16100
5799
+ },
5800
+ {
5801
+ "epoch": 0.322,
5802
+ "eval_loss": 2.3955187797546387,
5803
+ "eval_runtime": 31.7782,
5804
+ "eval_samples_per_second": 3.21,
5805
+ "eval_steps_per_second": 1.605,
5806
+ "step": 16100
5807
+ },
5808
+ {
5809
+ "epoch": 0.3225,
5810
+ "grad_norm": 0.6007459037823811,
5811
+ "learning_rate": 7.528000000000001e-06,
5812
+ "loss": 2.3908,
5813
+ "step": 16125
5814
+ },
5815
+ {
5816
+ "epoch": 0.323,
5817
+ "grad_norm": 0.5719140015142068,
5818
+ "learning_rate": 7.522444444444446e-06,
5819
+ "loss": 2.379,
5820
+ "step": 16150
5821
+ },
5822
+ {
5823
+ "epoch": 0.3235,
5824
+ "grad_norm": 0.5722843141001409,
5825
+ "learning_rate": 7.516888888888889e-06,
5826
+ "loss": 2.3831,
5827
+ "step": 16175
5828
+ },
5829
+ {
5830
+ "epoch": 0.324,
5831
+ "grad_norm": 0.5500359198684006,
5832
+ "learning_rate": 7.511333333333334e-06,
5833
+ "loss": 2.3899,
5834
+ "step": 16200
5835
+ },
5836
+ {
5837
+ "epoch": 0.324,
5838
+ "eval_loss": 2.3954145908355713,
5839
+ "eval_runtime": 31.9265,
5840
+ "eval_samples_per_second": 3.195,
5841
+ "eval_steps_per_second": 1.597,
5842
+ "step": 16200
5843
+ },
5844
+ {
5845
+ "epoch": 0.3245,
5846
+ "grad_norm": 0.5988197648020003,
5847
+ "learning_rate": 7.505777777777778e-06,
5848
+ "loss": 2.3768,
5849
+ "step": 16225
5850
+ },
5851
+ {
5852
+ "epoch": 0.325,
5853
+ "grad_norm": 0.566314534087209,
5854
+ "learning_rate": 7.5002222222222235e-06,
5855
+ "loss": 2.3731,
5856
+ "step": 16250
5857
+ },
5858
+ {
5859
+ "epoch": 0.3255,
5860
+ "grad_norm": 0.5462158611596983,
5861
+ "learning_rate": 7.494666666666667e-06,
5862
+ "loss": 2.3821,
5863
+ "step": 16275
5864
+ },
5865
+ {
5866
+ "epoch": 0.326,
5867
+ "grad_norm": 0.5546038414202229,
5868
+ "learning_rate": 7.4891111111111114e-06,
5869
+ "loss": 2.3725,
5870
+ "step": 16300
5871
+ },
5872
+ {
5873
+ "epoch": 0.326,
5874
+ "eval_loss": 2.395524501800537,
5875
+ "eval_runtime": 31.8126,
5876
+ "eval_samples_per_second": 3.206,
5877
+ "eval_steps_per_second": 1.603,
5878
+ "step": 16300
5879
+ },
5880
+ {
5881
+ "epoch": 0.3265,
5882
+ "grad_norm": 0.5596467845027929,
5883
+ "learning_rate": 7.483555555555556e-06,
5884
+ "loss": 2.3843,
5885
+ "step": 16325
5886
+ },
5887
+ {
5888
+ "epoch": 0.327,
5889
+ "grad_norm": 0.5815120805791782,
5890
+ "learning_rate": 7.478000000000001e-06,
5891
+ "loss": 2.3815,
5892
+ "step": 16350
5893
+ },
5894
+ {
5895
+ "epoch": 0.3275,
5896
+ "grad_norm": 0.5597449596999192,
5897
+ "learning_rate": 7.4724444444444455e-06,
5898
+ "loss": 2.3732,
5899
+ "step": 16375
5900
+ },
5901
+ {
5902
+ "epoch": 0.328,
5903
+ "grad_norm": 0.5818958282150155,
5904
+ "learning_rate": 7.466888888888889e-06,
5905
+ "loss": 2.3793,
5906
+ "step": 16400
5907
+ },
5908
+ {
5909
+ "epoch": 0.328,
5910
+ "eval_loss": 2.3949294090270996,
5911
+ "eval_runtime": 31.7738,
5912
+ "eval_samples_per_second": 3.21,
5913
+ "eval_steps_per_second": 1.605,
5914
+ "step": 16400
5915
+ },
5916
+ {
5917
+ "epoch": 0.3285,
5918
+ "grad_norm": 0.5662000485734395,
5919
+ "learning_rate": 7.4613333333333334e-06,
5920
+ "loss": 2.3812,
5921
+ "step": 16425
5922
+ },
5923
+ {
5924
+ "epoch": 0.329,
5925
+ "grad_norm": 0.5563577533028059,
5926
+ "learning_rate": 7.455777777777779e-06,
5927
+ "loss": 2.3761,
5928
+ "step": 16450
5929
+ },
5930
+ {
5931
+ "epoch": 0.3295,
5932
+ "grad_norm": 0.5687992956190129,
5933
+ "learning_rate": 7.450222222222223e-06,
5934
+ "loss": 2.381,
5935
+ "step": 16475
5936
+ },
5937
+ {
5938
+ "epoch": 0.33,
5939
+ "grad_norm": 0.5487444076942639,
5940
+ "learning_rate": 7.4446666666666675e-06,
5941
+ "loss": 2.3883,
5942
+ "step": 16500
5943
+ },
5944
+ {
5945
+ "epoch": 0.33,
5946
+ "eval_loss": 2.395174026489258,
5947
+ "eval_runtime": 31.7762,
5948
+ "eval_samples_per_second": 3.21,
5949
+ "eval_steps_per_second": 1.605,
5950
+ "step": 16500
5951
+ },
5952
+ {
5953
+ "epoch": 0.3305,
5954
+ "grad_norm": 0.5469101598299175,
5955
+ "learning_rate": 7.439111111111111e-06,
5956
+ "loss": 2.3766,
5957
+ "step": 16525
5958
+ },
5959
+ {
5960
+ "epoch": 0.331,
5961
+ "grad_norm": 0.5567200858341991,
5962
+ "learning_rate": 7.433555555555556e-06,
5963
+ "loss": 2.3939,
5964
+ "step": 16550
5965
+ },
5966
+ {
5967
+ "epoch": 0.3315,
5968
+ "grad_norm": 0.600536691861987,
5969
+ "learning_rate": 7.428000000000001e-06,
5970
+ "loss": 2.3822,
5971
+ "step": 16575
5972
+ },
5973
+ {
5974
+ "epoch": 0.332,
5975
+ "grad_norm": 0.5505048207350117,
5976
+ "learning_rate": 7.422444444444445e-06,
5977
+ "loss": 2.378,
5978
+ "step": 16600
5979
+ },
5980
+ {
5981
+ "epoch": 0.332,
5982
+ "eval_loss": 2.39481520652771,
5983
+ "eval_runtime": 31.8394,
5984
+ "eval_samples_per_second": 3.204,
5985
+ "eval_steps_per_second": 1.602,
5986
+ "step": 16600
5987
+ },
5988
+ {
5989
+ "epoch": 0.3325,
5990
+ "grad_norm": 0.5492676702406505,
5991
+ "learning_rate": 7.416888888888889e-06,
5992
+ "loss": 2.3769,
5993
+ "step": 16625
5994
+ },
5995
+ {
5996
+ "epoch": 0.333,
5997
+ "grad_norm": 0.5492443037384863,
5998
+ "learning_rate": 7.411333333333334e-06,
5999
+ "loss": 2.3701,
6000
+ "step": 16650
6001
+ },
6002
+ {
6003
+ "epoch": 0.3335,
6004
+ "grad_norm": 0.5857568383624908,
6005
+ "learning_rate": 7.405777777777778e-06,
6006
+ "loss": 2.381,
6007
+ "step": 16675
6008
+ },
6009
+ {
6010
+ "epoch": 0.334,
6011
+ "grad_norm": 0.5647204860919086,
6012
+ "learning_rate": 7.400222222222223e-06,
6013
+ "loss": 2.3819,
6014
+ "step": 16700
6015
+ },
6016
+ {
6017
+ "epoch": 0.334,
6018
+ "eval_loss": 2.394426107406616,
6019
+ "eval_runtime": 31.892,
6020
+ "eval_samples_per_second": 3.198,
6021
+ "eval_steps_per_second": 1.599,
6022
+ "step": 16700
6023
+ },
6024
+ {
6025
+ "epoch": 0.3345,
6026
+ "grad_norm": 0.5730702201176824,
6027
+ "learning_rate": 7.394666666666668e-06,
6028
+ "loss": 2.3857,
6029
+ "step": 16725
6030
+ },
6031
+ {
6032
+ "epoch": 0.335,
6033
+ "grad_norm": 0.5521969424083262,
6034
+ "learning_rate": 7.3891111111111115e-06,
6035
+ "loss": 2.363,
6036
+ "step": 16750
6037
+ },
6038
+ {
6039
+ "epoch": 0.3355,
6040
+ "grad_norm": 0.6057695700506919,
6041
+ "learning_rate": 7.383555555555556e-06,
6042
+ "loss": 2.3848,
6043
+ "step": 16775
6044
+ },
6045
+ {
6046
+ "epoch": 0.336,
6047
+ "grad_norm": 0.5749986280132275,
6048
+ "learning_rate": 7.378e-06,
6049
+ "loss": 2.389,
6050
+ "step": 16800
6051
+ },
6052
+ {
6053
+ "epoch": 0.336,
6054
+ "eval_loss": 2.3945508003234863,
6055
+ "eval_runtime": 31.7463,
6056
+ "eval_samples_per_second": 3.213,
6057
+ "eval_steps_per_second": 1.606,
6058
+ "step": 16800
6059
+ },
6060
+ {
6061
+ "epoch": 0.3365,
6062
+ "grad_norm": 0.5947076066210849,
6063
+ "learning_rate": 7.372444444444446e-06,
6064
+ "loss": 2.3865,
6065
+ "step": 16825
6066
+ },
6067
+ {
6068
+ "epoch": 0.337,
6069
+ "grad_norm": 0.564221658006085,
6070
+ "learning_rate": 7.366888888888889e-06,
6071
+ "loss": 2.3696,
6072
+ "step": 16850
6073
+ },
6074
+ {
6075
+ "epoch": 0.3375,
6076
+ "grad_norm": 0.5702041520098122,
6077
+ "learning_rate": 7.3613333333333336e-06,
6078
+ "loss": 2.3872,
6079
+ "step": 16875
6080
+ },
6081
+ {
6082
+ "epoch": 0.338,
6083
+ "grad_norm": 0.5538661614565709,
6084
+ "learning_rate": 7.355777777777778e-06,
6085
+ "loss": 2.3828,
6086
+ "step": 16900
6087
+ },
6088
+ {
6089
+ "epoch": 0.338,
6090
+ "eval_loss": 2.3942644596099854,
6091
+ "eval_runtime": 31.8144,
6092
+ "eval_samples_per_second": 3.206,
6093
+ "eval_steps_per_second": 1.603,
6094
+ "step": 16900
6095
+ },
6096
+ {
6097
+ "epoch": 0.3385,
6098
+ "grad_norm": 0.5614412730199092,
6099
+ "learning_rate": 7.350222222222223e-06,
6100
+ "loss": 2.3898,
6101
+ "step": 16925
6102
+ },
6103
+ {
6104
+ "epoch": 0.339,
6105
+ "grad_norm": 0.5656638849693418,
6106
+ "learning_rate": 7.344666666666668e-06,
6107
+ "loss": 2.3639,
6108
+ "step": 16950
6109
+ },
6110
+ {
6111
+ "epoch": 0.3395,
6112
+ "grad_norm": 0.5587793192894792,
6113
+ "learning_rate": 7.339111111111111e-06,
6114
+ "loss": 2.3761,
6115
+ "step": 16975
6116
+ },
6117
+ {
6118
+ "epoch": 0.34,
6119
+ "grad_norm": 0.5537041511919,
6120
+ "learning_rate": 7.3335555555555556e-06,
6121
+ "loss": 2.3785,
6122
+ "step": 17000
6123
+ },
6124
+ {
6125
+ "epoch": 0.34,
6126
+ "eval_loss": 2.394216775894165,
6127
+ "eval_runtime": 31.7287,
6128
+ "eval_samples_per_second": 3.215,
6129
+ "eval_steps_per_second": 1.607,
6130
+ "step": 17000
6131
  }
6132
  ],
6133
  "logging_steps": 25,
 
6147
  "attributes": {}
6148
  }
6149
  },
6150
+ "total_flos": 5.4114439071562465e+19,
6151
  "train_batch_size": 1,
6152
  "trial_name": null,
6153
  "trial_params": null