irodkin committed on
Commit
8ac23f9
·
verified ·
1 Parent(s): 55a3e35

Training checkpoint at step 21000

Browse files
Files changed (1) hide show
  1. trainer_state.json +365 -5
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 19900,
3
- "best_metric": 2.388927698135376,
4
  "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-19000",
5
- "epoch": 0.4,
6
  "eval_steps": 100,
7
- "global_step": 20000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -7208,6 +7208,366 @@
7208
  "eval_samples_per_second": 3.208,
7209
  "eval_steps_per_second": 1.604,
7210
  "step": 20000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7211
  }
7212
  ],
7213
  "logging_steps": 25,
@@ -7227,7 +7587,7 @@
7227
  "attributes": {}
7228
  }
7229
  },
7230
- "total_flos": 6.366404596654408e+19,
7231
  "train_batch_size": 1,
7232
  "trial_name": null,
7233
  "trial_params": null
 
1
  {
2
+ "best_global_step": 20900,
3
+ "best_metric": 2.388044595718384,
4
  "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-19000",
5
+ "epoch": 0.42,
6
  "eval_steps": 100,
7
+ "global_step": 21000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
7208
  "eval_samples_per_second": 3.208,
7209
  "eval_steps_per_second": 1.604,
7210
  "step": 20000
7211
+ },
7212
+ {
7213
+ "epoch": 0.4005,
7214
+ "grad_norm": 0.5697829378233469,
7215
+ "learning_rate": 6.661333333333334e-06,
7216
+ "loss": 2.3675,
7217
+ "step": 20025
7218
+ },
7219
+ {
7220
+ "epoch": 0.401,
7221
+ "grad_norm": 0.5582897347067457,
7222
+ "learning_rate": 6.655777777777779e-06,
7223
+ "loss": 2.3672,
7224
+ "step": 20050
7225
+ },
7226
+ {
7227
+ "epoch": 0.4015,
7228
+ "grad_norm": 0.5926925535950422,
7229
+ "learning_rate": 6.650222222222222e-06,
7230
+ "loss": 2.3733,
7231
+ "step": 20075
7232
+ },
7233
+ {
7234
+ "epoch": 0.402,
7235
+ "grad_norm": 0.544270592824537,
7236
+ "learning_rate": 6.644666666666667e-06,
7237
+ "loss": 2.3803,
7238
+ "step": 20100
7239
+ },
7240
+ {
7241
+ "epoch": 0.402,
7242
+ "eval_loss": 2.389204502105713,
7243
+ "eval_runtime": 31.8367,
7244
+ "eval_samples_per_second": 3.204,
7245
+ "eval_steps_per_second": 1.602,
7246
+ "step": 20100
7247
+ },
7248
+ {
7249
+ "epoch": 0.4025,
7250
+ "grad_norm": 0.5530370407597024,
7251
+ "learning_rate": 6.639111111111112e-06,
7252
+ "loss": 2.3633,
7253
+ "step": 20125
7254
+ },
7255
+ {
7256
+ "epoch": 0.403,
7257
+ "grad_norm": 0.5731039592674091,
7258
+ "learning_rate": 6.633555555555556e-06,
7259
+ "loss": 2.3642,
7260
+ "step": 20150
7261
+ },
7262
+ {
7263
+ "epoch": 0.4035,
7264
+ "grad_norm": 0.5599029138977244,
7265
+ "learning_rate": 6.628e-06,
7266
+ "loss": 2.378,
7267
+ "step": 20175
7268
+ },
7269
+ {
7270
+ "epoch": 0.404,
7271
+ "grad_norm": 0.5833746985921849,
7272
+ "learning_rate": 6.622444444444444e-06,
7273
+ "loss": 2.3797,
7274
+ "step": 20200
7275
+ },
7276
+ {
7277
+ "epoch": 0.404,
7278
+ "eval_loss": 2.388874053955078,
7279
+ "eval_runtime": 31.8821,
7280
+ "eval_samples_per_second": 3.199,
7281
+ "eval_steps_per_second": 1.6,
7282
+ "step": 20200
7283
+ },
7284
+ {
7285
+ "epoch": 0.4045,
7286
+ "grad_norm": 0.5758811776953918,
7287
+ "learning_rate": 6.6168888888888896e-06,
7288
+ "loss": 2.3759,
7289
+ "step": 20225
7290
+ },
7291
+ {
7292
+ "epoch": 0.405,
7293
+ "grad_norm": 0.559073322750905,
7294
+ "learning_rate": 6.611333333333334e-06,
7295
+ "loss": 2.3743,
7296
+ "step": 20250
7297
+ },
7298
+ {
7299
+ "epoch": 0.4055,
7300
+ "grad_norm": 0.5638862668814341,
7301
+ "learning_rate": 6.605777777777778e-06,
7302
+ "loss": 2.3726,
7303
+ "step": 20275
7304
+ },
7305
+ {
7306
+ "epoch": 0.406,
7307
+ "grad_norm": 0.5611977328077278,
7308
+ "learning_rate": 6.600222222222222e-06,
7309
+ "loss": 2.3704,
7310
+ "step": 20300
7311
+ },
7312
+ {
7313
+ "epoch": 0.406,
7314
+ "eval_loss": 2.3888099193573,
7315
+ "eval_runtime": 31.7076,
7316
+ "eval_samples_per_second": 3.217,
7317
+ "eval_steps_per_second": 1.608,
7318
+ "step": 20300
7319
+ },
7320
+ {
7321
+ "epoch": 0.4065,
7322
+ "grad_norm": 0.5664333139784736,
7323
+ "learning_rate": 6.594666666666667e-06,
7324
+ "loss": 2.3644,
7325
+ "step": 20325
7326
+ },
7327
+ {
7328
+ "epoch": 0.407,
7329
+ "grad_norm": 0.5549238936705829,
7330
+ "learning_rate": 6.5891111111111116e-06,
7331
+ "loss": 2.3594,
7332
+ "step": 20350
7333
+ },
7334
+ {
7335
+ "epoch": 0.4075,
7336
+ "grad_norm": 0.56940110218198,
7337
+ "learning_rate": 6.583555555555556e-06,
7338
+ "loss": 2.3743,
7339
+ "step": 20375
7340
+ },
7341
+ {
7342
+ "epoch": 0.408,
7343
+ "grad_norm": 0.5757908141952881,
7344
+ "learning_rate": 6.578000000000001e-06,
7345
+ "loss": 2.3774,
7346
+ "step": 20400
7347
+ },
7348
+ {
7349
+ "epoch": 0.408,
7350
+ "eval_loss": 2.3890221118927,
7351
+ "eval_runtime": 31.8193,
7352
+ "eval_samples_per_second": 3.206,
7353
+ "eval_steps_per_second": 1.603,
7354
+ "step": 20400
7355
+ },
7356
+ {
7357
+ "epoch": 0.4085,
7358
+ "grad_norm": 0.6023338293027314,
7359
+ "learning_rate": 6.572444444444445e-06,
7360
+ "loss": 2.3774,
7361
+ "step": 20425
7362
+ },
7363
+ {
7364
+ "epoch": 0.409,
7365
+ "grad_norm": 0.5398042018053211,
7366
+ "learning_rate": 6.566888888888889e-06,
7367
+ "loss": 2.3785,
7368
+ "step": 20450
7369
+ },
7370
+ {
7371
+ "epoch": 0.4095,
7372
+ "grad_norm": 0.5961544515028506,
7373
+ "learning_rate": 6.561333333333334e-06,
7374
+ "loss": 2.3867,
7375
+ "step": 20475
7376
+ },
7377
+ {
7378
+ "epoch": 0.41,
7379
+ "grad_norm": 0.5517605161130648,
7380
+ "learning_rate": 6.555777777777779e-06,
7381
+ "loss": 2.3713,
7382
+ "step": 20500
7383
+ },
7384
+ {
7385
+ "epoch": 0.41,
7386
+ "eval_loss": 2.38859224319458,
7387
+ "eval_runtime": 31.8577,
7388
+ "eval_samples_per_second": 3.202,
7389
+ "eval_steps_per_second": 1.601,
7390
+ "step": 20500
7391
+ },
7392
+ {
7393
+ "epoch": 0.4105,
7394
+ "grad_norm": 0.5753260144360031,
7395
+ "learning_rate": 6.550222222222222e-06,
7396
+ "loss": 2.3653,
7397
+ "step": 20525
7398
+ },
7399
+ {
7400
+ "epoch": 0.411,
7401
+ "grad_norm": 0.6404542212883029,
7402
+ "learning_rate": 6.544666666666667e-06,
7403
+ "loss": 2.3869,
7404
+ "step": 20550
7405
+ },
7406
+ {
7407
+ "epoch": 0.4115,
7408
+ "grad_norm": 0.5777253920326619,
7409
+ "learning_rate": 6.539111111111112e-06,
7410
+ "loss": 2.3813,
7411
+ "step": 20575
7412
+ },
7413
+ {
7414
+ "epoch": 0.412,
7415
+ "grad_norm": 0.5698546516216307,
7416
+ "learning_rate": 6.5335555555555565e-06,
7417
+ "loss": 2.3775,
7418
+ "step": 20600
7419
+ },
7420
+ {
7421
+ "epoch": 0.412,
7422
+ "eval_loss": 2.388434648513794,
7423
+ "eval_runtime": 31.8295,
7424
+ "eval_samples_per_second": 3.205,
7425
+ "eval_steps_per_second": 1.602,
7426
+ "step": 20600
7427
+ },
7428
+ {
7429
+ "epoch": 0.4125,
7430
+ "grad_norm": 0.5842535685269022,
7431
+ "learning_rate": 6.528000000000001e-06,
7432
+ "loss": 2.3896,
7433
+ "step": 20625
7434
+ },
7435
+ {
7436
+ "epoch": 0.413,
7437
+ "grad_norm": 0.5595088265556925,
7438
+ "learning_rate": 6.522444444444444e-06,
7439
+ "loss": 2.3878,
7440
+ "step": 20650
7441
+ },
7442
+ {
7443
+ "epoch": 0.4135,
7444
+ "grad_norm": 0.5751254243123975,
7445
+ "learning_rate": 6.51688888888889e-06,
7446
+ "loss": 2.367,
7447
+ "step": 20675
7448
+ },
7449
+ {
7450
+ "epoch": 0.414,
7451
+ "grad_norm": 0.5394876201865446,
7452
+ "learning_rate": 6.511333333333334e-06,
7453
+ "loss": 2.3776,
7454
+ "step": 20700
7455
+ },
7456
+ {
7457
+ "epoch": 0.414,
7458
+ "eval_loss": 2.3883957862854004,
7459
+ "eval_runtime": 31.8095,
7460
+ "eval_samples_per_second": 3.207,
7461
+ "eval_steps_per_second": 1.603,
7462
+ "step": 20700
7463
+ },
7464
+ {
7465
+ "epoch": 0.4145,
7466
+ "grad_norm": 0.5601399673585632,
7467
+ "learning_rate": 6.5057777777777785e-06,
7468
+ "loss": 2.3679,
7469
+ "step": 20725
7470
+ },
7471
+ {
7472
+ "epoch": 0.415,
7473
+ "grad_norm": 0.5715098373270459,
7474
+ "learning_rate": 6.500222222222222e-06,
7475
+ "loss": 2.3811,
7476
+ "step": 20750
7477
+ },
7478
+ {
7479
+ "epoch": 0.4155,
7480
+ "grad_norm": 0.5517830411358287,
7481
+ "learning_rate": 6.494666666666667e-06,
7482
+ "loss": 2.3723,
7483
+ "step": 20775
7484
+ },
7485
+ {
7486
+ "epoch": 0.416,
7487
+ "grad_norm": 0.5736440167807991,
7488
+ "learning_rate": 6.489111111111112e-06,
7489
+ "loss": 2.3804,
7490
+ "step": 20800
7491
+ },
7492
+ {
7493
+ "epoch": 0.416,
7494
+ "eval_loss": 2.388143539428711,
7495
+ "eval_runtime": 31.9362,
7496
+ "eval_samples_per_second": 3.194,
7497
+ "eval_steps_per_second": 1.597,
7498
+ "step": 20800
7499
+ },
7500
+ {
7501
+ "epoch": 0.4165,
7502
+ "grad_norm": 0.5772877970336647,
7503
+ "learning_rate": 6.483555555555556e-06,
7504
+ "loss": 2.3721,
7505
+ "step": 20825
7506
+ },
7507
+ {
7508
+ "epoch": 0.417,
7509
+ "grad_norm": 0.5746556720939705,
7510
+ "learning_rate": 6.478000000000001e-06,
7511
+ "loss": 2.3662,
7512
+ "step": 20850
7513
+ },
7514
+ {
7515
+ "epoch": 0.4175,
7516
+ "grad_norm": 0.5605696940354651,
7517
+ "learning_rate": 6.472444444444445e-06,
7518
+ "loss": 2.3783,
7519
+ "step": 20875
7520
+ },
7521
+ {
7522
+ "epoch": 0.418,
7523
+ "grad_norm": 0.5474840165552274,
7524
+ "learning_rate": 6.466888888888889e-06,
7525
+ "loss": 2.3799,
7526
+ "step": 20900
7527
+ },
7528
+ {
7529
+ "epoch": 0.418,
7530
+ "eval_loss": 2.388044595718384,
7531
+ "eval_runtime": 31.8313,
7532
+ "eval_samples_per_second": 3.204,
7533
+ "eval_steps_per_second": 1.602,
7534
+ "step": 20900
7535
+ },
7536
+ {
7537
+ "epoch": 0.4185,
7538
+ "grad_norm": 0.5663680125421368,
7539
+ "learning_rate": 6.461333333333334e-06,
7540
+ "loss": 2.3843,
7541
+ "step": 20925
7542
+ },
7543
+ {
7544
+ "epoch": 0.419,
7545
+ "grad_norm": 0.5531423851896319,
7546
+ "learning_rate": 6.455777777777779e-06,
7547
+ "loss": 2.3661,
7548
+ "step": 20950
7549
+ },
7550
+ {
7551
+ "epoch": 0.4195,
7552
+ "grad_norm": 0.5644562314935403,
7553
+ "learning_rate": 6.450222222222223e-06,
7554
+ "loss": 2.3762,
7555
+ "step": 20975
7556
+ },
7557
+ {
7558
+ "epoch": 0.42,
7559
+ "grad_norm": 0.5653831391780122,
7560
+ "learning_rate": 6.444666666666667e-06,
7561
+ "loss": 2.3588,
7562
+ "step": 21000
7563
+ },
7564
+ {
7565
+ "epoch": 0.42,
7566
+ "eval_loss": 2.388213872909546,
7567
+ "eval_runtime": 31.7864,
7568
+ "eval_samples_per_second": 3.209,
7569
+ "eval_steps_per_second": 1.604,
7570
+ "step": 21000
7571
  }
7572
  ],
7573
  "logging_steps": 25,
 
7587
  "attributes": {}
7588
  }
7589
  },
7590
+ "total_flos": 6.684724826487128e+19,
7591
  "train_batch_size": 1,
7592
  "trial_name": null,
7593
  "trial_params": null