irodkin commited on
Commit
1028f30
·
verified ·
1 Parent(s): 40f0eb0

Training checkpoint at step 26000

Browse files
Files changed (1) hide show
  1. trainer_state.json +365 -5
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 25000,
3
- "best_metric": 2.3832170963287354,
4
  "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-25000",
5
- "epoch": 0.5,
6
  "eval_steps": 100,
7
- "global_step": 25000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -9008,6 +9008,366 @@
9008
  "eval_samples_per_second": 3.208,
9009
  "eval_steps_per_second": 1.604,
9010
  "step": 25000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9011
  }
9012
  ],
9013
  "logging_steps": 25,
@@ -9027,7 +9387,7 @@
9027
  "attributes": {}
9028
  }
9029
  },
9030
- "total_flos": 7.95800574581801e+19,
9031
  "train_batch_size": 1,
9032
  "trial_name": null,
9033
  "trial_params": null
 
1
  {
2
+ "best_global_step": 25900,
3
+ "best_metric": 2.3824901580810547,
4
  "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-25000",
5
+ "epoch": 0.52,
6
  "eval_steps": 100,
7
+ "global_step": 26000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
9008
  "eval_samples_per_second": 3.208,
9009
  "eval_steps_per_second": 1.604,
9010
  "step": 25000
9011
+ },
9012
+ {
9013
+ "epoch": 0.5005,
9014
+ "grad_norm": 0.5509816083841773,
9015
+ "learning_rate": 5.550222222222223e-06,
9016
+ "loss": 2.3559,
9017
+ "step": 25025
9018
+ },
9019
+ {
9020
+ "epoch": 0.501,
9021
+ "grad_norm": 0.5547472529206742,
9022
+ "learning_rate": 5.544666666666667e-06,
9023
+ "loss": 2.3648,
9024
+ "step": 25050
9025
+ },
9026
+ {
9027
+ "epoch": 0.5015,
9028
+ "grad_norm": 0.546260980184131,
9029
+ "learning_rate": 5.5391111111111115e-06,
9030
+ "loss": 2.3701,
9031
+ "step": 25075
9032
+ },
9033
+ {
9034
+ "epoch": 0.502,
9035
+ "grad_norm": 0.5481216862316385,
9036
+ "learning_rate": 5.533555555555557e-06,
9037
+ "loss": 2.3798,
9038
+ "step": 25100
9039
+ },
9040
+ {
9041
+ "epoch": 0.502,
9042
+ "eval_loss": 2.38305926322937,
9043
+ "eval_runtime": 32.0473,
9044
+ "eval_samples_per_second": 3.183,
9045
+ "eval_steps_per_second": 1.591,
9046
+ "step": 25100
9047
+ },
9048
+ {
9049
+ "epoch": 0.5025,
9050
+ "grad_norm": 0.5670640165543723,
9051
+ "learning_rate": 5.528e-06,
9052
+ "loss": 2.3622,
9053
+ "step": 25125
9054
+ },
9055
+ {
9056
+ "epoch": 0.503,
9057
+ "grad_norm": 0.5463137917421312,
9058
+ "learning_rate": 5.522444444444445e-06,
9059
+ "loss": 2.3719,
9060
+ "step": 25150
9061
+ },
9062
+ {
9063
+ "epoch": 0.5035,
9064
+ "grad_norm": 0.5400999701410277,
9065
+ "learning_rate": 5.516888888888889e-06,
9066
+ "loss": 2.3616,
9067
+ "step": 25175
9068
+ },
9069
+ {
9070
+ "epoch": 0.504,
9071
+ "grad_norm": 0.5802126499364532,
9072
+ "learning_rate": 5.511333333333334e-06,
9073
+ "loss": 2.3721,
9074
+ "step": 25200
9075
+ },
9076
+ {
9077
+ "epoch": 0.504,
9078
+ "eval_loss": 2.3829147815704346,
9079
+ "eval_runtime": 31.7438,
9080
+ "eval_samples_per_second": 3.213,
9081
+ "eval_steps_per_second": 1.607,
9082
+ "step": 25200
9083
+ },
9084
+ {
9085
+ "epoch": 0.5045,
9086
+ "grad_norm": 0.5435607747773122,
9087
+ "learning_rate": 5.505777777777779e-06,
9088
+ "loss": 2.3603,
9089
+ "step": 25225
9090
+ },
9091
+ {
9092
+ "epoch": 0.505,
9093
+ "grad_norm": 0.5453890322127348,
9094
+ "learning_rate": 5.500222222222222e-06,
9095
+ "loss": 2.3636,
9096
+ "step": 25250
9097
+ },
9098
+ {
9099
+ "epoch": 0.5055,
9100
+ "grad_norm": 0.5477131217196112,
9101
+ "learning_rate": 5.494666666666667e-06,
9102
+ "loss": 2.3697,
9103
+ "step": 25275
9104
+ },
9105
+ {
9106
+ "epoch": 0.506,
9107
+ "grad_norm": 0.5621665226631756,
9108
+ "learning_rate": 5.489111111111112e-06,
9109
+ "loss": 2.3687,
9110
+ "step": 25300
9111
+ },
9112
+ {
9113
+ "epoch": 0.506,
9114
+ "eval_loss": 2.3831355571746826,
9115
+ "eval_runtime": 31.7979,
9116
+ "eval_samples_per_second": 3.208,
9117
+ "eval_steps_per_second": 1.604,
9118
+ "step": 25300
9119
+ },
9120
+ {
9121
+ "epoch": 0.5065,
9122
+ "grad_norm": 0.5622191727496813,
9123
+ "learning_rate": 5.483555555555556e-06,
9124
+ "loss": 2.368,
9125
+ "step": 25325
9126
+ },
9127
+ {
9128
+ "epoch": 0.507,
9129
+ "grad_norm": 0.5375310388584507,
9130
+ "learning_rate": 5.478e-06,
9131
+ "loss": 2.3617,
9132
+ "step": 25350
9133
+ },
9134
+ {
9135
+ "epoch": 0.5075,
9136
+ "grad_norm": 0.5421092937376346,
9137
+ "learning_rate": 5.472444444444444e-06,
9138
+ "loss": 2.3759,
9139
+ "step": 25375
9140
+ },
9141
+ {
9142
+ "epoch": 0.508,
9143
+ "grad_norm": 0.5726686989658507,
9144
+ "learning_rate": 5.4668888888888896e-06,
9145
+ "loss": 2.37,
9146
+ "step": 25400
9147
+ },
9148
+ {
9149
+ "epoch": 0.508,
9150
+ "eval_loss": 2.383046865463257,
9151
+ "eval_runtime": 31.8165,
9152
+ "eval_samples_per_second": 3.206,
9153
+ "eval_steps_per_second": 1.603,
9154
+ "step": 25400
9155
+ },
9156
+ {
9157
+ "epoch": 0.5085,
9158
+ "grad_norm": 0.536904504012326,
9159
+ "learning_rate": 5.461333333333334e-06,
9160
+ "loss": 2.3683,
9161
+ "step": 25425
9162
+ },
9163
+ {
9164
+ "epoch": 0.509,
9165
+ "grad_norm": 0.5792290465322086,
9166
+ "learning_rate": 5.455777777777778e-06,
9167
+ "loss": 2.3641,
9168
+ "step": 25450
9169
+ },
9170
+ {
9171
+ "epoch": 0.5095,
9172
+ "grad_norm": 0.5667490944788528,
9173
+ "learning_rate": 5.450222222222222e-06,
9174
+ "loss": 2.3673,
9175
+ "step": 25475
9176
+ },
9177
+ {
9178
+ "epoch": 0.51,
9179
+ "grad_norm": 0.5581091402617585,
9180
+ "learning_rate": 5.444666666666667e-06,
9181
+ "loss": 2.374,
9182
+ "step": 25500
9183
+ },
9184
+ {
9185
+ "epoch": 0.51,
9186
+ "eval_loss": 2.3831074237823486,
9187
+ "eval_runtime": 31.8462,
9188
+ "eval_samples_per_second": 3.203,
9189
+ "eval_steps_per_second": 1.601,
9190
+ "step": 25500
9191
+ },
9192
+ {
9193
+ "epoch": 0.5105,
9194
+ "grad_norm": 0.5629059983127724,
9195
+ "learning_rate": 5.4391111111111116e-06,
9196
+ "loss": 2.376,
9197
+ "step": 25525
9198
+ },
9199
+ {
9200
+ "epoch": 0.511,
9201
+ "grad_norm": 0.5600711744363054,
9202
+ "learning_rate": 5.433555555555556e-06,
9203
+ "loss": 2.3702,
9204
+ "step": 25550
9205
+ },
9206
+ {
9207
+ "epoch": 0.5115,
9208
+ "grad_norm": 0.5500784026204207,
9209
+ "learning_rate": 5.4279999999999995e-06,
9210
+ "loss": 2.3704,
9211
+ "step": 25575
9212
+ },
9213
+ {
9214
+ "epoch": 0.512,
9215
+ "grad_norm": 0.553377338742942,
9216
+ "learning_rate": 5.422444444444445e-06,
9217
+ "loss": 2.3644,
9218
+ "step": 25600
9219
+ },
9220
+ {
9221
+ "epoch": 0.512,
9222
+ "eval_loss": 2.3826544284820557,
9223
+ "eval_runtime": 31.7739,
9224
+ "eval_samples_per_second": 3.21,
9225
+ "eval_steps_per_second": 1.605,
9226
+ "step": 25600
9227
+ },
9228
+ {
9229
+ "epoch": 0.5125,
9230
+ "grad_norm": 0.5861763037221558,
9231
+ "learning_rate": 5.416888888888889e-06,
9232
+ "loss": 2.3658,
9233
+ "step": 25625
9234
+ },
9235
+ {
9236
+ "epoch": 0.513,
9237
+ "grad_norm": 0.5538084648071333,
9238
+ "learning_rate": 5.411333333333334e-06,
9239
+ "loss": 2.3693,
9240
+ "step": 25650
9241
+ },
9242
+ {
9243
+ "epoch": 0.5135,
9244
+ "grad_norm": 0.5699472071254841,
9245
+ "learning_rate": 5.405777777777779e-06,
9246
+ "loss": 2.3707,
9247
+ "step": 25675
9248
+ },
9249
+ {
9250
+ "epoch": 0.514,
9251
+ "grad_norm": 0.5440880568370218,
9252
+ "learning_rate": 5.400222222222222e-06,
9253
+ "loss": 2.3664,
9254
+ "step": 25700
9255
+ },
9256
+ {
9257
+ "epoch": 0.514,
9258
+ "eval_loss": 2.382906675338745,
9259
+ "eval_runtime": 31.7874,
9260
+ "eval_samples_per_second": 3.209,
9261
+ "eval_steps_per_second": 1.604,
9262
+ "step": 25700
9263
+ },
9264
+ {
9265
+ "epoch": 0.5145,
9266
+ "grad_norm": 0.551256815387497,
9267
+ "learning_rate": 5.394666666666667e-06,
9268
+ "loss": 2.3608,
9269
+ "step": 25725
9270
+ },
9271
+ {
9272
+ "epoch": 0.515,
9273
+ "grad_norm": 0.552653919875225,
9274
+ "learning_rate": 5.389111111111112e-06,
9275
+ "loss": 2.3648,
9276
+ "step": 25750
9277
+ },
9278
+ {
9279
+ "epoch": 0.5155,
9280
+ "grad_norm": 0.5489775829628063,
9281
+ "learning_rate": 5.3835555555555565e-06,
9282
+ "loss": 2.368,
9283
+ "step": 25775
9284
+ },
9285
+ {
9286
+ "epoch": 0.516,
9287
+ "grad_norm": 0.545224524462321,
9288
+ "learning_rate": 5.378e-06,
9289
+ "loss": 2.37,
9290
+ "step": 25800
9291
+ },
9292
+ {
9293
+ "epoch": 0.516,
9294
+ "eval_loss": 2.382946491241455,
9295
+ "eval_runtime": 31.8142,
9296
+ "eval_samples_per_second": 3.206,
9297
+ "eval_steps_per_second": 1.603,
9298
+ "step": 25800
9299
+ },
9300
+ {
9301
+ "epoch": 0.5165,
9302
+ "grad_norm": 0.6177434912819645,
9303
+ "learning_rate": 5.372444444444444e-06,
9304
+ "loss": 2.3576,
9305
+ "step": 25825
9306
+ },
9307
+ {
9308
+ "epoch": 0.517,
9309
+ "grad_norm": 0.5731672053410489,
9310
+ "learning_rate": 5.36688888888889e-06,
9311
+ "loss": 2.3641,
9312
+ "step": 25850
9313
+ },
9314
+ {
9315
+ "epoch": 0.5175,
9316
+ "grad_norm": 0.547417736306074,
9317
+ "learning_rate": 5.361333333333334e-06,
9318
+ "loss": 2.3669,
9319
+ "step": 25875
9320
+ },
9321
+ {
9322
+ "epoch": 0.518,
9323
+ "grad_norm": 0.5666721324439973,
9324
+ "learning_rate": 5.3557777777777785e-06,
9325
+ "loss": 2.3633,
9326
+ "step": 25900
9327
+ },
9328
+ {
9329
+ "epoch": 0.518,
9330
+ "eval_loss": 2.3824901580810547,
9331
+ "eval_runtime": 31.8236,
9332
+ "eval_samples_per_second": 3.205,
9333
+ "eval_steps_per_second": 1.603,
9334
+ "step": 25900
9335
+ },
9336
+ {
9337
+ "epoch": 0.5185,
9338
+ "grad_norm": 0.5493694553264233,
9339
+ "learning_rate": 5.350222222222222e-06,
9340
+ "loss": 2.3676,
9341
+ "step": 25925
9342
+ },
9343
+ {
9344
+ "epoch": 0.519,
9345
+ "grad_norm": 0.5581911332398992,
9346
+ "learning_rate": 5.344666666666667e-06,
9347
+ "loss": 2.3665,
9348
+ "step": 25950
9349
+ },
9350
+ {
9351
+ "epoch": 0.5195,
9352
+ "grad_norm": 0.5523156791576098,
9353
+ "learning_rate": 5.339111111111112e-06,
9354
+ "loss": 2.3634,
9355
+ "step": 25975
9356
+ },
9357
+ {
9358
+ "epoch": 0.52,
9359
+ "grad_norm": 0.5394984851015033,
9360
+ "learning_rate": 5.333555555555556e-06,
9361
+ "loss": 2.3693,
9362
+ "step": 26000
9363
+ },
9364
+ {
9365
+ "epoch": 0.52,
9366
+ "eval_loss": 2.3825063705444336,
9367
+ "eval_runtime": 31.7579,
9368
+ "eval_samples_per_second": 3.212,
9369
+ "eval_steps_per_second": 1.606,
9370
+ "step": 26000
9371
  }
9372
  ],
9373
  "logging_steps": 25,
 
9387
  "attributes": {}
9388
  }
9389
  },
9390
+ "total_flos": 8.27632597565073e+19,
9391
  "train_batch_size": 1,
9392
  "trial_name": null,
9393
  "trial_params": null