irodkin commited on
Commit
e1e3a52
·
verified ·
1 Parent(s): 99276fc

Training checkpoint at step 25000

Browse files
Files changed (1) hide show
  1. trainer_state.json +366 -6
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 24000,
3
- "best_metric": 2.3842599391937256,
4
- "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-24000",
5
- "epoch": 0.48,
6
  "eval_steps": 100,
7
- "global_step": 24000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -8648,6 +8648,366 @@
8648
  "eval_samples_per_second": 3.216,
8649
  "eval_steps_per_second": 1.608,
8650
  "step": 24000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8651
  }
8652
  ],
8653
  "logging_steps": 25,
@@ -8667,7 +9027,7 @@
8667
  "attributes": {}
8668
  }
8669
  },
8670
- "total_flos": 7.63968551598529e+19,
8671
  "train_batch_size": 1,
8672
  "trial_name": null,
8673
  "trial_params": null
 
1
  {
2
+ "best_global_step": 25000,
3
+ "best_metric": 2.3832170963287354,
4
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-25000",
5
+ "epoch": 0.5,
6
  "eval_steps": 100,
7
+ "global_step": 25000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
8648
  "eval_samples_per_second": 3.216,
8649
  "eval_steps_per_second": 1.608,
8650
  "step": 24000
8651
+ },
8652
+ {
8653
+ "epoch": 0.4805,
8654
+ "grad_norm": 0.5640470742370431,
8655
+ "learning_rate": 5.772444444444445e-06,
8656
+ "loss": 2.3622,
8657
+ "step": 24025
8658
+ },
8659
+ {
8660
+ "epoch": 0.481,
8661
+ "grad_norm": 0.5463055265939479,
8662
+ "learning_rate": 5.76688888888889e-06,
8663
+ "loss": 2.3609,
8664
+ "step": 24050
8665
+ },
8666
+ {
8667
+ "epoch": 0.4815,
8668
+ "grad_norm": 0.566766243472923,
8669
+ "learning_rate": 5.7613333333333345e-06,
8670
+ "loss": 2.3824,
8671
+ "step": 24075
8672
+ },
8673
+ {
8674
+ "epoch": 0.482,
8675
+ "grad_norm": 0.5584478304684121,
8676
+ "learning_rate": 5.755777777777778e-06,
8677
+ "loss": 2.3744,
8678
+ "step": 24100
8679
+ },
8680
+ {
8681
+ "epoch": 0.482,
8682
+ "eval_loss": 2.384092330932617,
8683
+ "eval_runtime": 31.7835,
8684
+ "eval_samples_per_second": 3.209,
8685
+ "eval_steps_per_second": 1.605,
8686
+ "step": 24100
8687
+ },
8688
+ {
8689
+ "epoch": 0.4825,
8690
+ "grad_norm": 0.5731740442874064,
8691
+ "learning_rate": 5.7502222222222224e-06,
8692
+ "loss": 2.3733,
8693
+ "step": 24125
8694
+ },
8695
+ {
8696
+ "epoch": 0.483,
8697
+ "grad_norm": 0.5552901331066319,
8698
+ "learning_rate": 5.744666666666668e-06,
8699
+ "loss": 2.3755,
8700
+ "step": 24150
8701
+ },
8702
+ {
8703
+ "epoch": 0.4835,
8704
+ "grad_norm": 0.5535450397337369,
8705
+ "learning_rate": 5.739111111111112e-06,
8706
+ "loss": 2.3777,
8707
+ "step": 24175
8708
+ },
8709
+ {
8710
+ "epoch": 0.484,
8711
+ "grad_norm": 0.5622658531288893,
8712
+ "learning_rate": 5.733555555555556e-06,
8713
+ "loss": 2.3671,
8714
+ "step": 24200
8715
+ },
8716
+ {
8717
+ "epoch": 0.484,
8718
+ "eval_loss": 2.3840036392211914,
8719
+ "eval_runtime": 31.7615,
8720
+ "eval_samples_per_second": 3.211,
8721
+ "eval_steps_per_second": 1.606,
8722
+ "step": 24200
8723
+ },
8724
+ {
8725
+ "epoch": 0.4845,
8726
+ "grad_norm": 0.5526779804173192,
8727
+ "learning_rate": 5.728e-06,
8728
+ "loss": 2.374,
8729
+ "step": 24225
8730
+ },
8731
+ {
8732
+ "epoch": 0.485,
8733
+ "grad_norm": 0.5383978006357063,
8734
+ "learning_rate": 5.722444444444445e-06,
8735
+ "loss": 2.3664,
8736
+ "step": 24250
8737
+ },
8738
+ {
8739
+ "epoch": 0.4855,
8740
+ "grad_norm": 0.5542389650019858,
8741
+ "learning_rate": 5.71688888888889e-06,
8742
+ "loss": 2.3692,
8743
+ "step": 24275
8744
+ },
8745
+ {
8746
+ "epoch": 0.486,
8747
+ "grad_norm": 0.5542459781042757,
8748
+ "learning_rate": 5.711333333333334e-06,
8749
+ "loss": 2.379,
8750
+ "step": 24300
8751
+ },
8752
+ {
8753
+ "epoch": 0.486,
8754
+ "eval_loss": 2.3838605880737305,
8755
+ "eval_runtime": 31.8313,
8756
+ "eval_samples_per_second": 3.204,
8757
+ "eval_steps_per_second": 1.602,
8758
+ "step": 24300
8759
+ },
8760
+ {
8761
+ "epoch": 0.4865,
8762
+ "grad_norm": 0.5371257785961498,
8763
+ "learning_rate": 5.705777777777778e-06,
8764
+ "loss": 2.3759,
8765
+ "step": 24325
8766
+ },
8767
+ {
8768
+ "epoch": 0.487,
8769
+ "grad_norm": 0.5334074315105899,
8770
+ "learning_rate": 5.700222222222223e-06,
8771
+ "loss": 2.3842,
8772
+ "step": 24350
8773
+ },
8774
+ {
8775
+ "epoch": 0.4875,
8776
+ "grad_norm": 0.5712028005119992,
8777
+ "learning_rate": 5.694666666666667e-06,
8778
+ "loss": 2.373,
8779
+ "step": 24375
8780
+ },
8781
+ {
8782
+ "epoch": 0.488,
8783
+ "grad_norm": 0.5527635817323101,
8784
+ "learning_rate": 5.689111111111112e-06,
8785
+ "loss": 2.3632,
8786
+ "step": 24400
8787
+ },
8788
+ {
8789
+ "epoch": 0.488,
8790
+ "eval_loss": 2.383908987045288,
8791
+ "eval_runtime": 31.8006,
8792
+ "eval_samples_per_second": 3.207,
8793
+ "eval_steps_per_second": 1.604,
8794
+ "step": 24400
8795
+ },
8796
+ {
8797
+ "epoch": 0.4885,
8798
+ "grad_norm": 0.5497988709199122,
8799
+ "learning_rate": 5.683555555555555e-06,
8800
+ "loss": 2.3674,
8801
+ "step": 24425
8802
+ },
8803
+ {
8804
+ "epoch": 0.489,
8805
+ "grad_norm": 0.5478963614360626,
8806
+ "learning_rate": 5.6780000000000005e-06,
8807
+ "loss": 2.3795,
8808
+ "step": 24450
8809
+ },
8810
+ {
8811
+ "epoch": 0.4895,
8812
+ "grad_norm": 0.5418443665589167,
8813
+ "learning_rate": 5.672444444444445e-06,
8814
+ "loss": 2.3769,
8815
+ "step": 24475
8816
+ },
8817
+ {
8818
+ "epoch": 0.49,
8819
+ "grad_norm": 0.5637739038034214,
8820
+ "learning_rate": 5.666888888888889e-06,
8821
+ "loss": 2.3754,
8822
+ "step": 24500
8823
+ },
8824
+ {
8825
+ "epoch": 0.49,
8826
+ "eval_loss": 2.3835647106170654,
8827
+ "eval_runtime": 31.695,
8828
+ "eval_samples_per_second": 3.218,
8829
+ "eval_steps_per_second": 1.609,
8830
+ "step": 24500
8831
+ },
8832
+ {
8833
+ "epoch": 0.4905,
8834
+ "grad_norm": 0.5352738455560374,
8835
+ "learning_rate": 5.661333333333335e-06,
8836
+ "loss": 2.3665,
8837
+ "step": 24525
8838
+ },
8839
+ {
8840
+ "epoch": 0.491,
8841
+ "grad_norm": 0.5593898219847685,
8842
+ "learning_rate": 5.655777777777778e-06,
8843
+ "loss": 2.3621,
8844
+ "step": 24550
8845
+ },
8846
+ {
8847
+ "epoch": 0.4915,
8848
+ "grad_norm": 0.5340153226573613,
8849
+ "learning_rate": 5.6502222222222225e-06,
8850
+ "loss": 2.3704,
8851
+ "step": 24575
8852
+ },
8853
+ {
8854
+ "epoch": 0.492,
8855
+ "grad_norm": 0.5434269177198789,
8856
+ "learning_rate": 5.644666666666667e-06,
8857
+ "loss": 2.3707,
8858
+ "step": 24600
8859
+ },
8860
+ {
8861
+ "epoch": 0.492,
8862
+ "eval_loss": 2.38376522064209,
8863
+ "eval_runtime": 31.8117,
8864
+ "eval_samples_per_second": 3.206,
8865
+ "eval_steps_per_second": 1.603,
8866
+ "step": 24600
8867
+ },
8868
+ {
8869
+ "epoch": 0.4925,
8870
+ "grad_norm": 0.5555073289213541,
8871
+ "learning_rate": 5.639111111111112e-06,
8872
+ "loss": 2.3702,
8873
+ "step": 24625
8874
+ },
8875
+ {
8876
+ "epoch": 0.493,
8877
+ "grad_norm": 0.5608796205061338,
8878
+ "learning_rate": 5.633555555555557e-06,
8879
+ "loss": 2.373,
8880
+ "step": 24650
8881
+ },
8882
+ {
8883
+ "epoch": 0.4935,
8884
+ "grad_norm": 0.5639681025688454,
8885
+ "learning_rate": 5.628e-06,
8886
+ "loss": 2.3641,
8887
+ "step": 24675
8888
+ },
8889
+ {
8890
+ "epoch": 0.494,
8891
+ "grad_norm": 0.5610119210421548,
8892
+ "learning_rate": 5.6224444444444446e-06,
8893
+ "loss": 2.372,
8894
+ "step": 24700
8895
+ },
8896
+ {
8897
+ "epoch": 0.494,
8898
+ "eval_loss": 2.383573293685913,
8899
+ "eval_runtime": 31.6948,
8900
+ "eval_samples_per_second": 3.218,
8901
+ "eval_steps_per_second": 1.609,
8902
+ "step": 24700
8903
+ },
8904
+ {
8905
+ "epoch": 0.4945,
8906
+ "grad_norm": 0.5442392815853518,
8907
+ "learning_rate": 5.61688888888889e-06,
8908
+ "loss": 2.3651,
8909
+ "step": 24725
8910
+ },
8911
+ {
8912
+ "epoch": 0.495,
8913
+ "grad_norm": 0.5562532962787945,
8914
+ "learning_rate": 5.611333333333334e-06,
8915
+ "loss": 2.3705,
8916
+ "step": 24750
8917
+ },
8918
+ {
8919
+ "epoch": 0.4955,
8920
+ "grad_norm": 0.5488206873990799,
8921
+ "learning_rate": 5.605777777777778e-06,
8922
+ "loss": 2.3623,
8923
+ "step": 24775
8924
+ },
8925
+ {
8926
+ "epoch": 0.496,
8927
+ "grad_norm": 0.5653453728755813,
8928
+ "learning_rate": 5.600222222222222e-06,
8929
+ "loss": 2.3746,
8930
+ "step": 24800
8931
+ },
8932
+ {
8933
+ "epoch": 0.496,
8934
+ "eval_loss": 2.383600950241089,
8935
+ "eval_runtime": 31.8215,
8936
+ "eval_samples_per_second": 3.205,
8937
+ "eval_steps_per_second": 1.603,
8938
+ "step": 24800
8939
+ },
8940
+ {
8941
+ "epoch": 0.4965,
8942
+ "grad_norm": 0.5714575887868236,
8943
+ "learning_rate": 5.5946666666666674e-06,
8944
+ "loss": 2.3698,
8945
+ "step": 24825
8946
+ },
8947
+ {
8948
+ "epoch": 0.497,
8949
+ "grad_norm": 0.5479503311373944,
8950
+ "learning_rate": 5.589111111111112e-06,
8951
+ "loss": 2.3753,
8952
+ "step": 24850
8953
+ },
8954
+ {
8955
+ "epoch": 0.4975,
8956
+ "grad_norm": 0.5465196721627547,
8957
+ "learning_rate": 5.583555555555556e-06,
8958
+ "loss": 2.3627,
8959
+ "step": 24875
8960
+ },
8961
+ {
8962
+ "epoch": 0.498,
8963
+ "grad_norm": 0.5545182382115218,
8964
+ "learning_rate": 5.578e-06,
8965
+ "loss": 2.3623,
8966
+ "step": 24900
8967
+ },
8968
+ {
8969
+ "epoch": 0.498,
8970
+ "eval_loss": 2.383317470550537,
8971
+ "eval_runtime": 31.8409,
8972
+ "eval_samples_per_second": 3.203,
8973
+ "eval_steps_per_second": 1.602,
8974
+ "step": 24900
8975
+ },
8976
+ {
8977
+ "epoch": 0.4985,
8978
+ "grad_norm": 0.5624766646317664,
8979
+ "learning_rate": 5.572444444444445e-06,
8980
+ "loss": 2.3659,
8981
+ "step": 24925
8982
+ },
8983
+ {
8984
+ "epoch": 0.499,
8985
+ "grad_norm": 0.5642199082921324,
8986
+ "learning_rate": 5.5668888888888894e-06,
8987
+ "loss": 2.3684,
8988
+ "step": 24950
8989
+ },
8990
+ {
8991
+ "epoch": 0.4995,
8992
+ "grad_norm": 0.5917431910025611,
8993
+ "learning_rate": 5.561333333333334e-06,
8994
+ "loss": 2.3723,
8995
+ "step": 24975
8996
+ },
8997
+ {
8998
+ "epoch": 0.5,
8999
+ "grad_norm": 0.5530201275821488,
9000
+ "learning_rate": 5.555777777777777e-06,
9001
+ "loss": 2.3685,
9002
+ "step": 25000
9003
+ },
9004
+ {
9005
+ "epoch": 0.5,
9006
+ "eval_loss": 2.3832170963287354,
9007
+ "eval_runtime": 31.7959,
9008
+ "eval_samples_per_second": 3.208,
9009
+ "eval_steps_per_second": 1.604,
9010
+ "step": 25000
9011
  }
9012
  ],
9013
  "logging_steps": 25,
 
9027
  "attributes": {}
9028
  }
9029
  },
9030
+ "total_flos": 7.95800574581801e+19,
9031
  "train_batch_size": 1,
9032
  "trial_name": null,
9033
  "trial_params": null