irodkin committed on
Commit
f3877b3
·
verified ·
1 Parent(s): 06065d5

Training checkpoint at step 23000

Browse files
Files changed (1) hide show
  1. trainer_state.json +365 -5
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 22000,
3
- "best_metric": 2.3865110874176025,
4
  "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-22000",
5
- "epoch": 0.44,
6
  "eval_steps": 100,
7
- "global_step": 22000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -7928,6 +7928,366 @@
7928
  "eval_samples_per_second": 3.206,
7929
  "eval_steps_per_second": 1.603,
7930
  "step": 22000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7931
  }
7932
  ],
7933
  "logging_steps": 25,
@@ -7947,7 +8307,7 @@
7947
  "attributes": {}
7948
  }
7949
  },
7950
- "total_flos": 7.0030450563198484e+19,
7951
  "train_batch_size": 1,
7952
  "trial_name": null,
7953
  "trial_params": null
 
1
  {
2
+ "best_global_step": 22700,
3
+ "best_metric": 2.3853445053100586,
4
  "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-22000",
5
+ "epoch": 0.46,
6
  "eval_steps": 100,
7
+ "global_step": 23000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
7928
  "eval_samples_per_second": 3.206,
7929
  "eval_steps_per_second": 1.603,
7930
  "step": 22000
7931
+ },
7932
+ {
7933
+ "epoch": 0.4405,
7934
+ "grad_norm": 0.5533397760322247,
7935
+ "learning_rate": 6.216888888888889e-06,
7936
+ "loss": 2.371,
7937
+ "step": 22025
7938
+ },
7939
+ {
7940
+ "epoch": 0.441,
7941
+ "grad_norm": 0.5551275205002794,
7942
+ "learning_rate": 6.2113333333333336e-06,
7943
+ "loss": 2.3684,
7944
+ "step": 22050
7945
+ },
7946
+ {
7947
+ "epoch": 0.4415,
7948
+ "grad_norm": 0.5520948023453888,
7949
+ "learning_rate": 6.205777777777778e-06,
7950
+ "loss": 2.3602,
7951
+ "step": 22075
7952
+ },
7953
+ {
7954
+ "epoch": 0.442,
7955
+ "grad_norm": 0.5679529169964138,
7956
+ "learning_rate": 6.200222222222223e-06,
7957
+ "loss": 2.3867,
7958
+ "step": 22100
7959
+ },
7960
+ {
7961
+ "epoch": 0.442,
7962
+ "eval_loss": 2.3863022327423096,
7963
+ "eval_runtime": 32.0036,
7964
+ "eval_samples_per_second": 3.187,
7965
+ "eval_steps_per_second": 1.594,
7966
+ "step": 22100
7967
+ },
7968
+ {
7969
+ "epoch": 0.4425,
7970
+ "grad_norm": 0.5619895216629556,
7971
+ "learning_rate": 6.194666666666668e-06,
7972
+ "loss": 2.3701,
7973
+ "step": 22125
7974
+ },
7975
+ {
7976
+ "epoch": 0.443,
7977
+ "grad_norm": 0.5515875809771505,
7978
+ "learning_rate": 6.189111111111111e-06,
7979
+ "loss": 2.3734,
7980
+ "step": 22150
7981
+ },
7982
+ {
7983
+ "epoch": 0.4435,
7984
+ "grad_norm": 0.5686425996531567,
7985
+ "learning_rate": 6.1835555555555556e-06,
7986
+ "loss": 2.3698,
7987
+ "step": 22175
7988
+ },
7989
+ {
7990
+ "epoch": 0.444,
7991
+ "grad_norm": 0.5580871882801617,
7992
+ "learning_rate": 6.178000000000001e-06,
7993
+ "loss": 2.3676,
7994
+ "step": 22200
7995
+ },
7996
+ {
7997
+ "epoch": 0.444,
7998
+ "eval_loss": 2.3865246772766113,
7999
+ "eval_runtime": 31.7174,
8000
+ "eval_samples_per_second": 3.216,
8001
+ "eval_steps_per_second": 1.608,
8002
+ "step": 22200
8003
+ },
8004
+ {
8005
+ "epoch": 0.4445,
8006
+ "grad_norm": 0.5784261034385078,
8007
+ "learning_rate": 6.172444444444445e-06,
8008
+ "loss": 2.3723,
8009
+ "step": 22225
8010
+ },
8011
+ {
8012
+ "epoch": 0.445,
8013
+ "grad_norm": 0.5570688655308026,
8014
+ "learning_rate": 6.166888888888889e-06,
8015
+ "loss": 2.3709,
8016
+ "step": 22250
8017
+ },
8018
+ {
8019
+ "epoch": 0.4455,
8020
+ "grad_norm": 0.5716930839552549,
8021
+ "learning_rate": 6.161333333333334e-06,
8022
+ "loss": 2.3734,
8023
+ "step": 22275
8024
+ },
8025
+ {
8026
+ "epoch": 0.446,
8027
+ "grad_norm": 0.5550340902020618,
8028
+ "learning_rate": 6.1557777777777784e-06,
8029
+ "loss": 2.3648,
8030
+ "step": 22300
8031
+ },
8032
+ {
8033
+ "epoch": 0.446,
8034
+ "eval_loss": 2.38633131980896,
8035
+ "eval_runtime": 31.7943,
8036
+ "eval_samples_per_second": 3.208,
8037
+ "eval_steps_per_second": 1.604,
8038
+ "step": 22300
8039
+ },
8040
+ {
8041
+ "epoch": 0.4465,
8042
+ "grad_norm": 0.5719936248106342,
8043
+ "learning_rate": 6.150222222222223e-06,
8044
+ "loss": 2.3751,
8045
+ "step": 22325
8046
+ },
8047
+ {
8048
+ "epoch": 0.447,
8049
+ "grad_norm": 0.5616671760742846,
8050
+ "learning_rate": 6.144666666666668e-06,
8051
+ "loss": 2.3748,
8052
+ "step": 22350
8053
+ },
8054
+ {
8055
+ "epoch": 0.4475,
8056
+ "grad_norm": 0.5785985644213604,
8057
+ "learning_rate": 6.139111111111112e-06,
8058
+ "loss": 2.3837,
8059
+ "step": 22375
8060
+ },
8061
+ {
8062
+ "epoch": 0.448,
8063
+ "grad_norm": 0.5645620599147937,
8064
+ "learning_rate": 6.133555555555556e-06,
8065
+ "loss": 2.3745,
8066
+ "step": 22400
8067
+ },
8068
+ {
8069
+ "epoch": 0.448,
8070
+ "eval_loss": 2.3862569332122803,
8071
+ "eval_runtime": 31.9593,
8072
+ "eval_samples_per_second": 3.192,
8073
+ "eval_steps_per_second": 1.596,
8074
+ "step": 22400
8075
+ },
8076
+ {
8077
+ "epoch": 0.4485,
8078
+ "grad_norm": 0.5469950240628229,
8079
+ "learning_rate": 6.1280000000000005e-06,
8080
+ "loss": 2.3642,
8081
+ "step": 22425
8082
+ },
8083
+ {
8084
+ "epoch": 0.449,
8085
+ "grad_norm": 0.5324393599981698,
8086
+ "learning_rate": 6.122444444444446e-06,
8087
+ "loss": 2.379,
8088
+ "step": 22450
8089
+ },
8090
+ {
8091
+ "epoch": 0.4495,
8092
+ "grad_norm": 0.5519962387254249,
8093
+ "learning_rate": 6.116888888888889e-06,
8094
+ "loss": 2.3635,
8095
+ "step": 22475
8096
+ },
8097
+ {
8098
+ "epoch": 0.45,
8099
+ "grad_norm": 0.5588336399127953,
8100
+ "learning_rate": 6.111333333333334e-06,
8101
+ "loss": 2.3718,
8102
+ "step": 22500
8103
+ },
8104
+ {
8105
+ "epoch": 0.45,
8106
+ "eval_loss": 2.385950803756714,
8107
+ "eval_runtime": 31.7208,
8108
+ "eval_samples_per_second": 3.216,
8109
+ "eval_steps_per_second": 1.608,
8110
+ "step": 22500
8111
+ },
8112
+ {
8113
+ "epoch": 0.4505,
8114
+ "grad_norm": 0.5923640418917652,
8115
+ "learning_rate": 6.105777777777778e-06,
8116
+ "loss": 2.3719,
8117
+ "step": 22525
8118
+ },
8119
+ {
8120
+ "epoch": 0.451,
8121
+ "grad_norm": 0.5653562982992056,
8122
+ "learning_rate": 6.100222222222223e-06,
8123
+ "loss": 2.3808,
8124
+ "step": 22550
8125
+ },
8126
+ {
8127
+ "epoch": 0.4515,
8128
+ "grad_norm": 0.5636846873459127,
8129
+ "learning_rate": 6.094666666666668e-06,
8130
+ "loss": 2.3641,
8131
+ "step": 22575
8132
+ },
8133
+ {
8134
+ "epoch": 0.452,
8135
+ "grad_norm": 0.5850003926588586,
8136
+ "learning_rate": 6.089111111111111e-06,
8137
+ "loss": 2.3572,
8138
+ "step": 22600
8139
+ },
8140
+ {
8141
+ "epoch": 0.452,
8142
+ "eval_loss": 2.386296033859253,
8143
+ "eval_runtime": 31.8709,
8144
+ "eval_samples_per_second": 3.2,
8145
+ "eval_steps_per_second": 1.6,
8146
+ "step": 22600
8147
+ },
8148
+ {
8149
+ "epoch": 0.4525,
8150
+ "grad_norm": 0.5334735362781007,
8151
+ "learning_rate": 6.083555555555556e-06,
8152
+ "loss": 2.3732,
8153
+ "step": 22625
8154
+ },
8155
+ {
8156
+ "epoch": 0.453,
8157
+ "grad_norm": 0.5809776122118506,
8158
+ "learning_rate": 6.078000000000001e-06,
8159
+ "loss": 2.3842,
8160
+ "step": 22650
8161
+ },
8162
+ {
8163
+ "epoch": 0.4535,
8164
+ "grad_norm": 0.5438625993671827,
8165
+ "learning_rate": 6.072444444444445e-06,
8166
+ "loss": 2.3802,
8167
+ "step": 22675
8168
+ },
8169
+ {
8170
+ "epoch": 0.454,
8171
+ "grad_norm": 0.5581266930595516,
8172
+ "learning_rate": 6.06688888888889e-06,
8173
+ "loss": 2.3757,
8174
+ "step": 22700
8175
+ },
8176
+ {
8177
+ "epoch": 0.454,
8178
+ "eval_loss": 2.3853445053100586,
8179
+ "eval_runtime": 31.9465,
8180
+ "eval_samples_per_second": 3.193,
8181
+ "eval_steps_per_second": 1.596,
8182
+ "step": 22700
8183
+ },
8184
+ {
8185
+ "epoch": 0.4545,
8186
+ "grad_norm": 0.5665471911134969,
8187
+ "learning_rate": 6.061333333333333e-06,
8188
+ "loss": 2.3632,
8189
+ "step": 22725
8190
+ },
8191
+ {
8192
+ "epoch": 0.455,
8193
+ "grad_norm": 0.5602817372745607,
8194
+ "learning_rate": 6.0557777777777785e-06,
8195
+ "loss": 2.3759,
8196
+ "step": 22750
8197
+ },
8198
+ {
8199
+ "epoch": 0.4555,
8200
+ "grad_norm": 0.5546395592927382,
8201
+ "learning_rate": 6.050222222222223e-06,
8202
+ "loss": 2.3654,
8203
+ "step": 22775
8204
+ },
8205
+ {
8206
+ "epoch": 0.456,
8207
+ "grad_norm": 0.5466059675730089,
8208
+ "learning_rate": 6.044666666666667e-06,
8209
+ "loss": 2.3747,
8210
+ "step": 22800
8211
+ },
8212
+ {
8213
+ "epoch": 0.456,
8214
+ "eval_loss": 2.3854382038116455,
8215
+ "eval_runtime": 31.8135,
8216
+ "eval_samples_per_second": 3.206,
8217
+ "eval_steps_per_second": 1.603,
8218
+ "step": 22800
8219
+ },
8220
+ {
8221
+ "epoch": 0.4565,
8222
+ "grad_norm": 0.556576922176953,
8223
+ "learning_rate": 6.039111111111111e-06,
8224
+ "loss": 2.3752,
8225
+ "step": 22825
8226
+ },
8227
+ {
8228
+ "epoch": 0.457,
8229
+ "grad_norm": 0.5587160453347744,
8230
+ "learning_rate": 6.033555555555556e-06,
8231
+ "loss": 2.3753,
8232
+ "step": 22850
8233
+ },
8234
+ {
8235
+ "epoch": 0.4575,
8236
+ "grad_norm": 0.5581750567947692,
8237
+ "learning_rate": 6.0280000000000006e-06,
8238
+ "loss": 2.3744,
8239
+ "step": 22875
8240
+ },
8241
+ {
8242
+ "epoch": 0.458,
8243
+ "grad_norm": 0.5665211201226871,
8244
+ "learning_rate": 6.022444444444445e-06,
8245
+ "loss": 2.3707,
8246
+ "step": 22900
8247
+ },
8248
+ {
8249
+ "epoch": 0.458,
8250
+ "eval_loss": 2.3854050636291504,
8251
+ "eval_runtime": 31.8453,
8252
+ "eval_samples_per_second": 3.203,
8253
+ "eval_steps_per_second": 1.601,
8254
+ "step": 22900
8255
+ },
8256
+ {
8257
+ "epoch": 0.4585,
8258
+ "grad_norm": 0.559138638343371,
8259
+ "learning_rate": 6.01688888888889e-06,
8260
+ "loss": 2.3771,
8261
+ "step": 22925
8262
+ },
8263
+ {
8264
+ "epoch": 0.459,
8265
+ "grad_norm": 0.5765629867304476,
8266
+ "learning_rate": 6.011333333333334e-06,
8267
+ "loss": 2.3751,
8268
+ "step": 22950
8269
+ },
8270
+ {
8271
+ "epoch": 0.4595,
8272
+ "grad_norm": 0.5697804508664757,
8273
+ "learning_rate": 6.005777777777778e-06,
8274
+ "loss": 2.3837,
8275
+ "step": 22975
8276
+ },
8277
+ {
8278
+ "epoch": 0.46,
8279
+ "grad_norm": 0.5813773268685459,
8280
+ "learning_rate": 6.000222222222223e-06,
8281
+ "loss": 2.37,
8282
+ "step": 23000
8283
+ },
8284
+ {
8285
+ "epoch": 0.46,
8286
+ "eval_loss": 2.385390520095825,
8287
+ "eval_runtime": 31.767,
8288
+ "eval_samples_per_second": 3.211,
8289
+ "eval_steps_per_second": 1.605,
8290
+ "step": 23000
8291
  }
8292
  ],
8293
  "logging_steps": 25,
 
8307
  "attributes": {}
8308
  }
8309
  },
8310
+ "total_flos": 7.321365286152569e+19,
8311
  "train_batch_size": 1,
8312
  "trial_name": null,
8313
  "trial_params": null