irodkin commited on
Commit
6707e05
·
verified ·
1 Parent(s): 1a3be17

Training checkpoint at step 20000

Browse files
Files changed (1) hide show
  1. trainer_state.json +365 -5
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 19000,
3
- "best_metric": 2.390749454498291,
4
  "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-19000",
5
- "epoch": 0.38,
6
  "eval_steps": 100,
7
- "global_step": 19000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -6848,6 +6848,366 @@
6848
  "eval_samples_per_second": 3.207,
6849
  "eval_steps_per_second": 1.603,
6850
  "step": 19000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6851
  }
6852
  ],
6853
  "logging_steps": 25,
@@ -6867,7 +7227,7 @@
6867
  "attributes": {}
6868
  }
6869
  },
6870
- "total_flos": 6.048084366821687e+19,
6871
  "train_batch_size": 1,
6872
  "trial_name": null,
6873
  "trial_params": null
 
1
  {
2
+ "best_global_step": 19900,
3
+ "best_metric": 2.388927698135376,
4
  "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-19000",
5
+ "epoch": 0.4,
6
  "eval_steps": 100,
7
+ "global_step": 20000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
6848
  "eval_samples_per_second": 3.207,
6849
  "eval_steps_per_second": 1.603,
6850
  "step": 19000
6851
+ },
6852
+ {
6853
+ "epoch": 0.3805,
6854
+ "grad_norm": 0.5526396554433541,
6855
+ "learning_rate": 6.8835555555555565e-06,
6856
+ "loss": 2.3779,
6857
+ "step": 19025
6858
+ },
6859
+ {
6860
+ "epoch": 0.381,
6861
+ "grad_norm": 0.574490460414078,
6862
+ "learning_rate": 6.878e-06,
6863
+ "loss": 2.3727,
6864
+ "step": 19050
6865
+ },
6866
+ {
6867
+ "epoch": 0.3815,
6868
+ "grad_norm": 0.5611671894801677,
6869
+ "learning_rate": 6.872444444444445e-06,
6870
+ "loss": 2.379,
6871
+ "step": 19075
6872
+ },
6873
+ {
6874
+ "epoch": 0.382,
6875
+ "grad_norm": 0.5434475778092571,
6876
+ "learning_rate": 6.86688888888889e-06,
6877
+ "loss": 2.3788,
6878
+ "step": 19100
6879
+ },
6880
+ {
6881
+ "epoch": 0.382,
6882
+ "eval_loss": 2.390854597091675,
6883
+ "eval_runtime": 31.4727,
6884
+ "eval_samples_per_second": 3.241,
6885
+ "eval_steps_per_second": 1.62,
6886
+ "step": 19100
6887
+ },
6888
+ {
6889
+ "epoch": 0.3825,
6890
+ "grad_norm": 0.5438441040943751,
6891
+ "learning_rate": 6.861333333333334e-06,
6892
+ "loss": 2.3849,
6893
+ "step": 19125
6894
+ },
6895
+ {
6896
+ "epoch": 0.383,
6897
+ "grad_norm": 0.5617582167520553,
6898
+ "learning_rate": 6.855777777777778e-06,
6899
+ "loss": 2.3778,
6900
+ "step": 19150
6901
+ },
6902
+ {
6903
+ "epoch": 0.3835,
6904
+ "grad_norm": 0.5734148354957039,
6905
+ "learning_rate": 6.850222222222223e-06,
6906
+ "loss": 2.3749,
6907
+ "step": 19175
6908
+ },
6909
+ {
6910
+ "epoch": 0.384,
6911
+ "grad_norm": 0.5567016447555824,
6912
+ "learning_rate": 6.844666666666667e-06,
6913
+ "loss": 2.3786,
6914
+ "step": 19200
6915
+ },
6916
+ {
6917
+ "epoch": 0.384,
6918
+ "eval_loss": 2.390947103500366,
6919
+ "eval_runtime": 31.472,
6920
+ "eval_samples_per_second": 3.241,
6921
+ "eval_steps_per_second": 1.62,
6922
+ "step": 19200
6923
+ },
6924
+ {
6925
+ "epoch": 0.3845,
6926
+ "grad_norm": 0.5630941651558155,
6927
+ "learning_rate": 6.839111111111112e-06,
6928
+ "loss": 2.371,
6929
+ "step": 19225
6930
+ },
6931
+ {
6932
+ "epoch": 0.385,
6933
+ "grad_norm": 0.5472891744821744,
6934
+ "learning_rate": 6.833555555555557e-06,
6935
+ "loss": 2.371,
6936
+ "step": 19250
6937
+ },
6938
+ {
6939
+ "epoch": 0.3855,
6940
+ "grad_norm": 0.563854124925733,
6941
+ "learning_rate": 6.8280000000000005e-06,
6942
+ "loss": 2.3802,
6943
+ "step": 19275
6944
+ },
6945
+ {
6946
+ "epoch": 0.386,
6947
+ "grad_norm": 0.5535188682099162,
6948
+ "learning_rate": 6.822444444444445e-06,
6949
+ "loss": 2.3668,
6950
+ "step": 19300
6951
+ },
6952
+ {
6953
+ "epoch": 0.386,
6954
+ "eval_loss": 2.3904383182525635,
6955
+ "eval_runtime": 31.5109,
6956
+ "eval_samples_per_second": 3.237,
6957
+ "eval_steps_per_second": 1.618,
6958
+ "step": 19300
6959
+ },
6960
+ {
6961
+ "epoch": 0.3865,
6962
+ "grad_norm": 0.5847689751509554,
6963
+ "learning_rate": 6.816888888888889e-06,
6964
+ "loss": 2.3723,
6965
+ "step": 19325
6966
+ },
6967
+ {
6968
+ "epoch": 0.387,
6969
+ "grad_norm": 0.5477508463021717,
6970
+ "learning_rate": 6.811333333333335e-06,
6971
+ "loss": 2.3748,
6972
+ "step": 19350
6973
+ },
6974
+ {
6975
+ "epoch": 0.3875,
6976
+ "grad_norm": 0.5530662776524751,
6977
+ "learning_rate": 6.805777777777778e-06,
6978
+ "loss": 2.372,
6979
+ "step": 19375
6980
+ },
6981
+ {
6982
+ "epoch": 0.388,
6983
+ "grad_norm": 0.5627088332087185,
6984
+ "learning_rate": 6.8002222222222225e-06,
6985
+ "loss": 2.3649,
6986
+ "step": 19400
6987
+ },
6988
+ {
6989
+ "epoch": 0.388,
6990
+ "eval_loss": 2.3902432918548584,
6991
+ "eval_runtime": 31.5016,
6992
+ "eval_samples_per_second": 3.238,
6993
+ "eval_steps_per_second": 1.619,
6994
+ "step": 19400
6995
+ },
6996
+ {
6997
+ "epoch": 0.3885,
6998
+ "grad_norm": 0.5917805991329846,
6999
+ "learning_rate": 6.794666666666667e-06,
7000
+ "loss": 2.389,
7001
+ "step": 19425
7002
+ },
7003
+ {
7004
+ "epoch": 0.389,
7005
+ "grad_norm": 0.5637153841856668,
7006
+ "learning_rate": 6.789111111111112e-06,
7007
+ "loss": 2.381,
7008
+ "step": 19450
7009
+ },
7010
+ {
7011
+ "epoch": 0.3895,
7012
+ "grad_norm": 0.5638546592221216,
7013
+ "learning_rate": 6.783555555555557e-06,
7014
+ "loss": 2.3674,
7015
+ "step": 19475
7016
+ },
7017
+ {
7018
+ "epoch": 0.39,
7019
+ "grad_norm": 0.5442599823902955,
7020
+ "learning_rate": 6.778e-06,
7021
+ "loss": 2.3684,
7022
+ "step": 19500
7023
+ },
7024
+ {
7025
+ "epoch": 0.39,
7026
+ "eval_loss": 2.3898606300354004,
7027
+ "eval_runtime": 31.4637,
7028
+ "eval_samples_per_second": 3.242,
7029
+ "eval_steps_per_second": 1.621,
7030
+ "step": 19500
7031
+ },
7032
+ {
7033
+ "epoch": 0.3905,
7034
+ "grad_norm": 0.582280869057288,
7035
+ "learning_rate": 6.7724444444444446e-06,
7036
+ "loss": 2.3691,
7037
+ "step": 19525
7038
+ },
7039
+ {
7040
+ "epoch": 0.391,
7041
+ "grad_norm": 0.5427829071455205,
7042
+ "learning_rate": 6.76688888888889e-06,
7043
+ "loss": 2.372,
7044
+ "step": 19550
7045
+ },
7046
+ {
7047
+ "epoch": 0.3915,
7048
+ "grad_norm": 0.5690660297920415,
7049
+ "learning_rate": 6.761333333333334e-06,
7050
+ "loss": 2.3696,
7051
+ "step": 19575
7052
+ },
7053
+ {
7054
+ "epoch": 0.392,
7055
+ "grad_norm": 0.5887280660795969,
7056
+ "learning_rate": 6.755777777777779e-06,
7057
+ "loss": 2.3647,
7058
+ "step": 19600
7059
+ },
7060
+ {
7061
+ "epoch": 0.392,
7062
+ "eval_loss": 2.389928102493286,
7063
+ "eval_runtime": 31.425,
7064
+ "eval_samples_per_second": 3.246,
7065
+ "eval_steps_per_second": 1.623,
7066
+ "step": 19600
7067
+ },
7068
+ {
7069
+ "epoch": 0.3925,
7070
+ "grad_norm": 0.5706193677763675,
7071
+ "learning_rate": 6.750222222222222e-06,
7072
+ "loss": 2.3693,
7073
+ "step": 19625
7074
+ },
7075
+ {
7076
+ "epoch": 0.393,
7077
+ "grad_norm": 0.5446782496969111,
7078
+ "learning_rate": 6.7446666666666674e-06,
7079
+ "loss": 2.3808,
7080
+ "step": 19650
7081
+ },
7082
+ {
7083
+ "epoch": 0.3935,
7084
+ "grad_norm": 0.5571942248079983,
7085
+ "learning_rate": 6.739111111111112e-06,
7086
+ "loss": 2.3825,
7087
+ "step": 19675
7088
+ },
7089
+ {
7090
+ "epoch": 0.394,
7091
+ "grad_norm": 0.5452923856402259,
7092
+ "learning_rate": 6.733555555555556e-06,
7093
+ "loss": 2.3689,
7094
+ "step": 19700
7095
+ },
7096
+ {
7097
+ "epoch": 0.394,
7098
+ "eval_loss": 2.3896048069000244,
7099
+ "eval_runtime": 31.5836,
7100
+ "eval_samples_per_second": 3.23,
7101
+ "eval_steps_per_second": 1.615,
7102
+ "step": 19700
7103
+ },
7104
+ {
7105
+ "epoch": 0.3945,
7106
+ "grad_norm": 0.5828792681612529,
7107
+ "learning_rate": 6.728e-06,
7108
+ "loss": 2.3733,
7109
+ "step": 19725
7110
+ },
7111
+ {
7112
+ "epoch": 0.395,
7113
+ "grad_norm": 0.5615201455315739,
7114
+ "learning_rate": 6.722444444444445e-06,
7115
+ "loss": 2.3689,
7116
+ "step": 19750
7117
+ },
7118
+ {
7119
+ "epoch": 0.3955,
7120
+ "grad_norm": 0.5585669738111114,
7121
+ "learning_rate": 6.7168888888888894e-06,
7122
+ "loss": 2.3873,
7123
+ "step": 19775
7124
+ },
7125
+ {
7126
+ "epoch": 0.396,
7127
+ "grad_norm": 0.5412795214285975,
7128
+ "learning_rate": 6.711333333333334e-06,
7129
+ "loss": 2.3786,
7130
+ "step": 19800
7131
+ },
7132
+ {
7133
+ "epoch": 0.396,
7134
+ "eval_loss": 2.3894851207733154,
7135
+ "eval_runtime": 31.4877,
7136
+ "eval_samples_per_second": 3.239,
7137
+ "eval_steps_per_second": 1.62,
7138
+ "step": 19800
7139
+ },
7140
+ {
7141
+ "epoch": 0.3965,
7142
+ "grad_norm": 0.5778930227780084,
7143
+ "learning_rate": 6.705777777777779e-06,
7144
+ "loss": 2.3766,
7145
+ "step": 19825
7146
+ },
7147
+ {
7148
+ "epoch": 0.397,
7149
+ "grad_norm": 0.5682987690385847,
7150
+ "learning_rate": 6.700222222222223e-06,
7151
+ "loss": 2.3783,
7152
+ "step": 19850
7153
+ },
7154
+ {
7155
+ "epoch": 0.3975,
7156
+ "grad_norm": 0.5763865594632764,
7157
+ "learning_rate": 6.694666666666667e-06,
7158
+ "loss": 2.3738,
7159
+ "step": 19875
7160
+ },
7161
+ {
7162
+ "epoch": 0.398,
7163
+ "grad_norm": 0.5514756259491804,
7164
+ "learning_rate": 6.6891111111111115e-06,
7165
+ "loss": 2.3764,
7166
+ "step": 19900
7167
+ },
7168
+ {
7169
+ "epoch": 0.398,
7170
+ "eval_loss": 2.388927698135376,
7171
+ "eval_runtime": 31.7775,
7172
+ "eval_samples_per_second": 3.21,
7173
+ "eval_steps_per_second": 1.605,
7174
+ "step": 19900
7175
+ },
7176
+ {
7177
+ "epoch": 0.3985,
7178
+ "grad_norm": 0.5577240438533453,
7179
+ "learning_rate": 6.683555555555557e-06,
7180
+ "loss": 2.374,
7181
+ "step": 19925
7182
+ },
7183
+ {
7184
+ "epoch": 0.399,
7185
+ "grad_norm": 0.553314104963858,
7186
+ "learning_rate": 6.678e-06,
7187
+ "loss": 2.3726,
7188
+ "step": 19950
7189
+ },
7190
+ {
7191
+ "epoch": 0.3995,
7192
+ "grad_norm": 0.5615070159418603,
7193
+ "learning_rate": 6.672444444444445e-06,
7194
+ "loss": 2.3683,
7195
+ "step": 19975
7196
+ },
7197
+ {
7198
+ "epoch": 0.4,
7199
+ "grad_norm": 0.5595654854755111,
7200
+ "learning_rate": 6.666888888888889e-06,
7201
+ "loss": 2.3632,
7202
+ "step": 20000
7203
+ },
7204
+ {
7205
+ "epoch": 0.4,
7206
+ "eval_loss": 2.389249801635742,
7207
+ "eval_runtime": 31.7934,
7208
+ "eval_samples_per_second": 3.208,
7209
+ "eval_steps_per_second": 1.604,
7210
+ "step": 20000
7211
  }
7212
  ],
7213
  "logging_steps": 25,
 
7227
  "attributes": {}
7228
  }
7229
  },
7230
+ "total_flos": 6.366404596654408e+19,
7231
  "train_batch_size": 1,
7232
  "trial_name": null,
7233
  "trial_params": null