irodkin commited on
Commit
59283bd
·
verified ·
1 Parent(s): a411d34

Training checkpoint at step 18000

Browse files
Files changed (1) hide show
  1. trainer_state.json +366 -6
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 17000,
3
- "best_metric": 2.394216775894165,
4
- "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-17000",
5
- "epoch": 0.34,
6
  "eval_steps": 100,
7
- "global_step": 17000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -6128,6 +6128,366 @@
6128
  "eval_samples_per_second": 3.215,
6129
  "eval_steps_per_second": 1.607,
6130
  "step": 17000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6131
  }
6132
  ],
6133
  "logging_steps": 25,
@@ -6147,7 +6507,7 @@
6147
  "attributes": {}
6148
  }
6149
  },
6150
- "total_flos": 5.4114439071562465e+19,
6151
  "train_batch_size": 1,
6152
  "trial_name": null,
6153
  "trial_params": null
 
1
  {
2
+ "best_global_step": 18000,
3
+ "best_metric": 2.3920133113861084,
4
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-18000",
5
+ "epoch": 0.36,
6
  "eval_steps": 100,
7
+ "global_step": 18000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
6128
  "eval_samples_per_second": 3.215,
6129
  "eval_steps_per_second": 1.607,
6130
  "step": 17000
6131
+ },
6132
+ {
6133
+ "epoch": 0.3405,
6134
+ "grad_norm": 0.5625979440161315,
6135
+ "learning_rate": 7.328000000000001e-06,
6136
+ "loss": 2.3706,
6137
+ "step": 17025
6138
+ },
6139
+ {
6140
+ "epoch": 0.341,
6141
+ "grad_norm": 0.5578934058534382,
6142
+ "learning_rate": 7.322444444444445e-06,
6143
+ "loss": 2.3717,
6144
+ "step": 17050
6145
+ },
6146
+ {
6147
+ "epoch": 0.3415,
6148
+ "grad_norm": 0.5600783145650656,
6149
+ "learning_rate": 7.31688888888889e-06,
6150
+ "loss": 2.3549,
6151
+ "step": 17075
6152
+ },
6153
+ {
6154
+ "epoch": 0.342,
6155
+ "grad_norm": 0.5443562716925451,
6156
+ "learning_rate": 7.311333333333334e-06,
6157
+ "loss": 2.3818,
6158
+ "step": 17100
6159
+ },
6160
+ {
6161
+ "epoch": 0.342,
6162
+ "eval_loss": 2.3939199447631836,
6163
+ "eval_runtime": 31.7183,
6164
+ "eval_samples_per_second": 3.216,
6165
+ "eval_steps_per_second": 1.608,
6166
+ "step": 17100
6167
+ },
6168
+ {
6169
+ "epoch": 0.3425,
6170
+ "grad_norm": 0.6040551095214175,
6171
+ "learning_rate": 7.3057777777777784e-06,
6172
+ "loss": 2.3856,
6173
+ "step": 17125
6174
+ },
6175
+ {
6176
+ "epoch": 0.343,
6177
+ "grad_norm": 0.5800600768624563,
6178
+ "learning_rate": 7.300222222222223e-06,
6179
+ "loss": 2.3812,
6180
+ "step": 17150
6181
+ },
6182
+ {
6183
+ "epoch": 0.3435,
6184
+ "grad_norm": 0.606456873691792,
6185
+ "learning_rate": 7.294666666666668e-06,
6186
+ "loss": 2.3823,
6187
+ "step": 17175
6188
+ },
6189
+ {
6190
+ "epoch": 0.344,
6191
+ "grad_norm": 0.5820033666001653,
6192
+ "learning_rate": 7.289111111111112e-06,
6193
+ "loss": 2.3772,
6194
+ "step": 17200
6195
+ },
6196
+ {
6197
+ "epoch": 0.344,
6198
+ "eval_loss": 2.39414644241333,
6199
+ "eval_runtime": 31.4591,
6200
+ "eval_samples_per_second": 3.242,
6201
+ "eval_steps_per_second": 1.621,
6202
+ "step": 17200
6203
+ },
6204
+ {
6205
+ "epoch": 0.3445,
6206
+ "grad_norm": 0.592691728166079,
6207
+ "learning_rate": 7.283555555555556e-06,
6208
+ "loss": 2.3757,
6209
+ "step": 17225
6210
+ },
6211
+ {
6212
+ "epoch": 0.345,
6213
+ "grad_norm": 0.5475066044517582,
6214
+ "learning_rate": 7.2780000000000005e-06,
6215
+ "loss": 2.393,
6216
+ "step": 17250
6217
+ },
6218
+ {
6219
+ "epoch": 0.3455,
6220
+ "grad_norm": 0.5412153350606916,
6221
+ "learning_rate": 7.272444444444446e-06,
6222
+ "loss": 2.3775,
6223
+ "step": 17275
6224
+ },
6225
+ {
6226
+ "epoch": 0.346,
6227
+ "grad_norm": 0.5703055910606494,
6228
+ "learning_rate": 7.26688888888889e-06,
6229
+ "loss": 2.3919,
6230
+ "step": 17300
6231
+ },
6232
+ {
6233
+ "epoch": 0.346,
6234
+ "eval_loss": 2.393954277038574,
6235
+ "eval_runtime": 31.4832,
6236
+ "eval_samples_per_second": 3.24,
6237
+ "eval_steps_per_second": 1.62,
6238
+ "step": 17300
6239
+ },
6240
+ {
6241
+ "epoch": 0.3465,
6242
+ "grad_norm": 0.5720004911842855,
6243
+ "learning_rate": 7.261333333333334e-06,
6244
+ "loss": 2.3744,
6245
+ "step": 17325
6246
+ },
6247
+ {
6248
+ "epoch": 0.347,
6249
+ "grad_norm": 0.5651936652229611,
6250
+ "learning_rate": 7.255777777777778e-06,
6251
+ "loss": 2.3766,
6252
+ "step": 17350
6253
+ },
6254
+ {
6255
+ "epoch": 0.3475,
6256
+ "grad_norm": 0.552954097582646,
6257
+ "learning_rate": 7.250222222222223e-06,
6258
+ "loss": 2.38,
6259
+ "step": 17375
6260
+ },
6261
+ {
6262
+ "epoch": 0.348,
6263
+ "grad_norm": 0.5753937605402671,
6264
+ "learning_rate": 7.244666666666668e-06,
6265
+ "loss": 2.3825,
6266
+ "step": 17400
6267
+ },
6268
+ {
6269
+ "epoch": 0.348,
6270
+ "eval_loss": 2.3936057090759277,
6271
+ "eval_runtime": 31.5155,
6272
+ "eval_samples_per_second": 3.237,
6273
+ "eval_steps_per_second": 1.618,
6274
+ "step": 17400
6275
+ },
6276
+ {
6277
+ "epoch": 0.3485,
6278
+ "grad_norm": 0.5982429265702776,
6279
+ "learning_rate": 7.239111111111111e-06,
6280
+ "loss": 2.3748,
6281
+ "step": 17425
6282
+ },
6283
+ {
6284
+ "epoch": 0.349,
6285
+ "grad_norm": 0.5707105076014326,
6286
+ "learning_rate": 7.233555555555556e-06,
6287
+ "loss": 2.3871,
6288
+ "step": 17450
6289
+ },
6290
+ {
6291
+ "epoch": 0.3495,
6292
+ "grad_norm": 0.5749982454192974,
6293
+ "learning_rate": 7.228000000000001e-06,
6294
+ "loss": 2.3722,
6295
+ "step": 17475
6296
+ },
6297
+ {
6298
+ "epoch": 0.35,
6299
+ "grad_norm": 0.5667678087541999,
6300
+ "learning_rate": 7.222444444444445e-06,
6301
+ "loss": 2.3897,
6302
+ "step": 17500
6303
+ },
6304
+ {
6305
+ "epoch": 0.35,
6306
+ "eval_loss": 2.3934316635131836,
6307
+ "eval_runtime": 31.5133,
6308
+ "eval_samples_per_second": 3.237,
6309
+ "eval_steps_per_second": 1.618,
6310
+ "step": 17500
6311
+ },
6312
+ {
6313
+ "epoch": 0.3505,
6314
+ "grad_norm": 0.551269238238286,
6315
+ "learning_rate": 7.21688888888889e-06,
6316
+ "loss": 2.3759,
6317
+ "step": 17525
6318
+ },
6319
+ {
6320
+ "epoch": 0.351,
6321
+ "grad_norm": 0.5683477126287287,
6322
+ "learning_rate": 7.211333333333333e-06,
6323
+ "loss": 2.3751,
6324
+ "step": 17550
6325
+ },
6326
+ {
6327
+ "epoch": 0.3515,
6328
+ "grad_norm": 0.5534527601932518,
6329
+ "learning_rate": 7.2057777777777785e-06,
6330
+ "loss": 2.3749,
6331
+ "step": 17575
6332
+ },
6333
+ {
6334
+ "epoch": 0.352,
6335
+ "grad_norm": 0.5444580304379504,
6336
+ "learning_rate": 7.200222222222223e-06,
6337
+ "loss": 2.3839,
6338
+ "step": 17600
6339
+ },
6340
+ {
6341
+ "epoch": 0.352,
6342
+ "eval_loss": 2.3928964138031006,
6343
+ "eval_runtime": 31.79,
6344
+ "eval_samples_per_second": 3.209,
6345
+ "eval_steps_per_second": 1.604,
6346
+ "step": 17600
6347
+ },
6348
+ {
6349
+ "epoch": 0.3525,
6350
+ "grad_norm": 0.5683011717419817,
6351
+ "learning_rate": 7.194666666666667e-06,
6352
+ "loss": 2.3697,
6353
+ "step": 17625
6354
+ },
6355
+ {
6356
+ "epoch": 0.353,
6357
+ "grad_norm": 0.5597200154635523,
6358
+ "learning_rate": 7.189111111111111e-06,
6359
+ "loss": 2.3758,
6360
+ "step": 17650
6361
+ },
6362
+ {
6363
+ "epoch": 0.3535,
6364
+ "grad_norm": 0.5389975543023572,
6365
+ "learning_rate": 7.183555555555556e-06,
6366
+ "loss": 2.3748,
6367
+ "step": 17675
6368
+ },
6369
+ {
6370
+ "epoch": 0.354,
6371
+ "grad_norm": 0.5766556300730846,
6372
+ "learning_rate": 7.1780000000000006e-06,
6373
+ "loss": 2.3863,
6374
+ "step": 17700
6375
+ },
6376
+ {
6377
+ "epoch": 0.354,
6378
+ "eval_loss": 2.3929381370544434,
6379
+ "eval_runtime": 31.4662,
6380
+ "eval_samples_per_second": 3.242,
6381
+ "eval_steps_per_second": 1.621,
6382
+ "step": 17700
6383
+ },
6384
+ {
6385
+ "epoch": 0.3545,
6386
+ "grad_norm": 0.5422601731930108,
6387
+ "learning_rate": 7.172444444444445e-06,
6388
+ "loss": 2.3795,
6389
+ "step": 17725
6390
+ },
6391
+ {
6392
+ "epoch": 0.355,
6393
+ "grad_norm": 0.587749563771833,
6394
+ "learning_rate": 7.16688888888889e-06,
6395
+ "loss": 2.3741,
6396
+ "step": 17750
6397
+ },
6398
+ {
6399
+ "epoch": 0.3555,
6400
+ "grad_norm": 0.5448174780243932,
6401
+ "learning_rate": 7.161333333333334e-06,
6402
+ "loss": 2.374,
6403
+ "step": 17775
6404
+ },
6405
+ {
6406
+ "epoch": 0.356,
6407
+ "grad_norm": 0.5487711297157323,
6408
+ "learning_rate": 7.155777777777778e-06,
6409
+ "loss": 2.3872,
6410
+ "step": 17800
6411
+ },
6412
+ {
6413
+ "epoch": 0.356,
6414
+ "eval_loss": 2.3928709030151367,
6415
+ "eval_runtime": 31.7364,
6416
+ "eval_samples_per_second": 3.214,
6417
+ "eval_steps_per_second": 1.607,
6418
+ "step": 17800
6419
+ },
6420
+ {
6421
+ "epoch": 0.3565,
6422
+ "grad_norm": 0.5749112760792647,
6423
+ "learning_rate": 7.150222222222223e-06,
6424
+ "loss": 2.375,
6425
+ "step": 17825
6426
+ },
6427
+ {
6428
+ "epoch": 0.357,
6429
+ "grad_norm": 0.5657127084376901,
6430
+ "learning_rate": 7.144666666666668e-06,
6431
+ "loss": 2.3635,
6432
+ "step": 17850
6433
+ },
6434
+ {
6435
+ "epoch": 0.3575,
6436
+ "grad_norm": 0.5552559911086609,
6437
+ "learning_rate": 7.139111111111112e-06,
6438
+ "loss": 2.3791,
6439
+ "step": 17875
6440
+ },
6441
+ {
6442
+ "epoch": 0.358,
6443
+ "grad_norm": 0.5587079571658956,
6444
+ "learning_rate": 7.133555555555556e-06,
6445
+ "loss": 2.3792,
6446
+ "step": 17900
6447
+ },
6448
+ {
6449
+ "epoch": 0.358,
6450
+ "eval_loss": 2.39250111579895,
6451
+ "eval_runtime": 31.8377,
6452
+ "eval_samples_per_second": 3.204,
6453
+ "eval_steps_per_second": 1.602,
6454
+ "step": 17900
6455
+ },
6456
+ {
6457
+ "epoch": 0.3585,
6458
+ "grad_norm": 0.5476769108414363,
6459
+ "learning_rate": 7.128e-06,
6460
+ "loss": 2.3796,
6461
+ "step": 17925
6462
+ },
6463
+ {
6464
+ "epoch": 0.359,
6465
+ "grad_norm": 0.5519286017800472,
6466
+ "learning_rate": 7.1224444444444454e-06,
6467
+ "loss": 2.3689,
6468
+ "step": 17950
6469
+ },
6470
+ {
6471
+ "epoch": 0.3595,
6472
+ "grad_norm": 0.5690523665272621,
6473
+ "learning_rate": 7.11688888888889e-06,
6474
+ "loss": 2.3758,
6475
+ "step": 17975
6476
+ },
6477
+ {
6478
+ "epoch": 0.36,
6479
+ "grad_norm": 0.575484852893059,
6480
+ "learning_rate": 7.111333333333333e-06,
6481
+ "loss": 2.3723,
6482
+ "step": 18000
6483
+ },
6484
+ {
6485
+ "epoch": 0.36,
6486
+ "eval_loss": 2.3920133113861084,
6487
+ "eval_runtime": 31.9286,
6488
+ "eval_samples_per_second": 3.195,
6489
+ "eval_steps_per_second": 1.597,
6490
+ "step": 18000
6491
  }
6492
  ],
6493
  "logging_steps": 25,
 
6507
  "attributes": {}
6508
  }
6509
  },
6510
+ "total_flos": 5.729764136988967e+19,
6511
  "train_batch_size": 1,
6512
  "trial_name": null,
6513
  "trial_params": null