irodkin commited on
Commit
3d951fb
·
verified ·
1 Parent(s): ad48a90

Training checkpoint at step 24000

Browse files
Files changed (1) hide show
  1. trainer_state.json +366 -6
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 22700,
3
- "best_metric": 2.3853445053100586,
4
- "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-22000",
5
- "epoch": 0.46,
6
  "eval_steps": 100,
7
- "global_step": 23000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -8288,6 +8288,366 @@
8288
  "eval_samples_per_second": 3.211,
8289
  "eval_steps_per_second": 1.605,
8290
  "step": 23000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8291
  }
8292
  ],
8293
  "logging_steps": 25,
@@ -8307,7 +8667,7 @@
8307
  "attributes": {}
8308
  }
8309
  },
8310
- "total_flos": 7.321365286152569e+19,
8311
  "train_batch_size": 1,
8312
  "trial_name": null,
8313
  "trial_params": null
 
1
  {
2
+ "best_global_step": 24000,
3
+ "best_metric": 2.3842599391937256,
4
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-24000",
5
+ "epoch": 0.48,
6
  "eval_steps": 100,
7
+ "global_step": 24000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
8288
  "eval_samples_per_second": 3.211,
8289
  "eval_steps_per_second": 1.605,
8290
  "step": 23000
8291
+ },
8292
+ {
8293
+ "epoch": 0.4605,
8294
+ "grad_norm": 0.5644614073323889,
8295
+ "learning_rate": 5.994666666666668e-06,
8296
+ "loss": 2.3627,
8297
+ "step": 23025
8298
+ },
8299
+ {
8300
+ "epoch": 0.461,
8301
+ "grad_norm": 0.561196100799294,
8302
+ "learning_rate": 5.989111111111111e-06,
8303
+ "loss": 2.373,
8304
+ "step": 23050
8305
+ },
8306
+ {
8307
+ "epoch": 0.4615,
8308
+ "grad_norm": 0.5988172465498709,
8309
+ "learning_rate": 5.983555555555556e-06,
8310
+ "loss": 2.3625,
8311
+ "step": 23075
8312
+ },
8313
+ {
8314
+ "epoch": 0.462,
8315
+ "grad_norm": 0.5561927981892911,
8316
+ "learning_rate": 5.978e-06,
8317
+ "loss": 2.366,
8318
+ "step": 23100
8319
+ },
8320
+ {
8321
+ "epoch": 0.462,
8322
+ "eval_loss": 2.3851592540740967,
8323
+ "eval_runtime": 31.9972,
8324
+ "eval_samples_per_second": 3.188,
8325
+ "eval_steps_per_second": 1.594,
8326
+ "step": 23100
8327
+ },
8328
+ {
8329
+ "epoch": 0.4625,
8330
+ "grad_norm": 0.5473375939412587,
8331
+ "learning_rate": 5.9724444444444454e-06,
8332
+ "loss": 2.3577,
8333
+ "step": 23125
8334
+ },
8335
+ {
8336
+ "epoch": 0.463,
8337
+ "grad_norm": 0.5422432723666715,
8338
+ "learning_rate": 5.96688888888889e-06,
8339
+ "loss": 2.3724,
8340
+ "step": 23150
8341
+ },
8342
+ {
8343
+ "epoch": 0.4635,
8344
+ "grad_norm": 0.5459369802725026,
8345
+ "learning_rate": 5.961333333333333e-06,
8346
+ "loss": 2.3693,
8347
+ "step": 23175
8348
+ },
8349
+ {
8350
+ "epoch": 0.464,
8351
+ "grad_norm": 0.5602391995824985,
8352
+ "learning_rate": 5.955777777777778e-06,
8353
+ "loss": 2.3662,
8354
+ "step": 23200
8355
+ },
8356
+ {
8357
+ "epoch": 0.464,
8358
+ "eval_loss": 2.384812593460083,
8359
+ "eval_runtime": 31.7736,
8360
+ "eval_samples_per_second": 3.21,
8361
+ "eval_steps_per_second": 1.605,
8362
+ "step": 23200
8363
+ },
8364
+ {
8365
+ "epoch": 0.4645,
8366
+ "grad_norm": 0.5382771454200044,
8367
+ "learning_rate": 5.950222222222223e-06,
8368
+ "loss": 2.373,
8369
+ "step": 23225
8370
+ },
8371
+ {
8372
+ "epoch": 0.465,
8373
+ "grad_norm": 0.5616408548500356,
8374
+ "learning_rate": 5.9446666666666675e-06,
8375
+ "loss": 2.3744,
8376
+ "step": 23250
8377
+ },
8378
+ {
8379
+ "epoch": 0.4655,
8380
+ "grad_norm": 0.5626270768454595,
8381
+ "learning_rate": 5.939111111111111e-06,
8382
+ "loss": 2.3745,
8383
+ "step": 23275
8384
+ },
8385
+ {
8386
+ "epoch": 0.466,
8387
+ "grad_norm": 0.5771198592247021,
8388
+ "learning_rate": 5.933555555555555e-06,
8389
+ "loss": 2.3712,
8390
+ "step": 23300
8391
+ },
8392
+ {
8393
+ "epoch": 0.466,
8394
+ "eval_loss": 2.385037660598755,
8395
+ "eval_runtime": 31.6688,
8396
+ "eval_samples_per_second": 3.221,
8397
+ "eval_steps_per_second": 1.61,
8398
+ "step": 23300
8399
+ },
8400
+ {
8401
+ "epoch": 0.4665,
8402
+ "grad_norm": 0.553677767303205,
8403
+ "learning_rate": 5.928000000000001e-06,
8404
+ "loss": 2.3688,
8405
+ "step": 23325
8406
+ },
8407
+ {
8408
+ "epoch": 0.467,
8409
+ "grad_norm": 0.5761122434148291,
8410
+ "learning_rate": 5.922444444444445e-06,
8411
+ "loss": 2.3697,
8412
+ "step": 23350
8413
+ },
8414
+ {
8415
+ "epoch": 0.4675,
8416
+ "grad_norm": 0.5776134096430138,
8417
+ "learning_rate": 5.9168888888888895e-06,
8418
+ "loss": 2.3696,
8419
+ "step": 23375
8420
+ },
8421
+ {
8422
+ "epoch": 0.468,
8423
+ "grad_norm": 0.5410943763458229,
8424
+ "learning_rate": 5.911333333333333e-06,
8425
+ "loss": 2.3748,
8426
+ "step": 23400
8427
+ },
8428
+ {
8429
+ "epoch": 0.468,
8430
+ "eval_loss": 2.3850579261779785,
8431
+ "eval_runtime": 31.7506,
8432
+ "eval_samples_per_second": 3.213,
8433
+ "eval_steps_per_second": 1.606,
8434
+ "step": 23400
8435
+ },
8436
+ {
8437
+ "epoch": 0.4685,
8438
+ "grad_norm": 0.5496846088073756,
8439
+ "learning_rate": 5.905777777777778e-06,
8440
+ "loss": 2.3631,
8441
+ "step": 23425
8442
+ },
8443
+ {
8444
+ "epoch": 0.469,
8445
+ "grad_norm": 0.5489837887647091,
8446
+ "learning_rate": 5.900222222222223e-06,
8447
+ "loss": 2.3752,
8448
+ "step": 23450
8449
+ },
8450
+ {
8451
+ "epoch": 0.4695,
8452
+ "grad_norm": 0.5595321821458019,
8453
+ "learning_rate": 5.894666666666667e-06,
8454
+ "loss": 2.3681,
8455
+ "step": 23475
8456
+ },
8457
+ {
8458
+ "epoch": 0.47,
8459
+ "grad_norm": 0.5441176871533538,
8460
+ "learning_rate": 5.889111111111112e-06,
8461
+ "loss": 2.3689,
8462
+ "step": 23500
8463
+ },
8464
+ {
8465
+ "epoch": 0.47,
8466
+ "eval_loss": 2.3847615718841553,
8467
+ "eval_runtime": 31.7515,
8468
+ "eval_samples_per_second": 3.212,
8469
+ "eval_steps_per_second": 1.606,
8470
+ "step": 23500
8471
+ },
8472
+ {
8473
+ "epoch": 0.4705,
8474
+ "grad_norm": 0.5591005943894303,
8475
+ "learning_rate": 5.883555555555556e-06,
8476
+ "loss": 2.3687,
8477
+ "step": 23525
8478
+ },
8479
+ {
8480
+ "epoch": 0.471,
8481
+ "grad_norm": 0.5569068986313633,
8482
+ "learning_rate": 5.878e-06,
8483
+ "loss": 2.3579,
8484
+ "step": 23550
8485
+ },
8486
+ {
8487
+ "epoch": 0.4715,
8488
+ "grad_norm": 0.5544550604142251,
8489
+ "learning_rate": 5.872444444444445e-06,
8490
+ "loss": 2.3654,
8491
+ "step": 23575
8492
+ },
8493
+ {
8494
+ "epoch": 0.472,
8495
+ "grad_norm": 0.5682698532685105,
8496
+ "learning_rate": 5.86688888888889e-06,
8497
+ "loss": 2.3686,
8498
+ "step": 23600
8499
+ },
8500
+ {
8501
+ "epoch": 0.472,
8502
+ "eval_loss": 2.384906053543091,
8503
+ "eval_runtime": 31.7623,
8504
+ "eval_samples_per_second": 3.211,
8505
+ "eval_steps_per_second": 1.606,
8506
+ "step": 23600
8507
+ },
8508
+ {
8509
+ "epoch": 0.4725,
8510
+ "grad_norm": 0.5754081011772445,
8511
+ "learning_rate": 5.8613333333333335e-06,
8512
+ "loss": 2.3629,
8513
+ "step": 23625
8514
+ },
8515
+ {
8516
+ "epoch": 0.473,
8517
+ "grad_norm": 0.605492062724259,
8518
+ "learning_rate": 5.855777777777778e-06,
8519
+ "loss": 2.3702,
8520
+ "step": 23650
8521
+ },
8522
+ {
8523
+ "epoch": 0.4735,
8524
+ "grad_norm": 0.5407520724247802,
8525
+ "learning_rate": 5.850222222222222e-06,
8526
+ "loss": 2.3652,
8527
+ "step": 23675
8528
+ },
8529
+ {
8530
+ "epoch": 0.474,
8531
+ "grad_norm": 0.5531865604429913,
8532
+ "learning_rate": 5.8446666666666676e-06,
8533
+ "loss": 2.3724,
8534
+ "step": 23700
8535
+ },
8536
+ {
8537
+ "epoch": 0.474,
8538
+ "eval_loss": 2.3844547271728516,
8539
+ "eval_runtime": 31.833,
8540
+ "eval_samples_per_second": 3.204,
8541
+ "eval_steps_per_second": 1.602,
8542
+ "step": 23700
8543
+ },
8544
+ {
8545
+ "epoch": 0.4745,
8546
+ "grad_norm": 0.573840223481603,
8547
+ "learning_rate": 5.839111111111112e-06,
8548
+ "loss": 2.365,
8549
+ "step": 23725
8550
+ },
8551
+ {
8552
+ "epoch": 0.475,
8553
+ "grad_norm": 0.545580569851831,
8554
+ "learning_rate": 5.8335555555555555e-06,
8555
+ "loss": 2.3813,
8556
+ "step": 23750
8557
+ },
8558
+ {
8559
+ "epoch": 0.4755,
8560
+ "grad_norm": 0.551471960312376,
8561
+ "learning_rate": 5.828e-06,
8562
+ "loss": 2.3617,
8563
+ "step": 23775
8564
+ },
8565
+ {
8566
+ "epoch": 0.476,
8567
+ "grad_norm": 0.5953130526303944,
8568
+ "learning_rate": 5.822444444444445e-06,
8569
+ "loss": 2.3781,
8570
+ "step": 23800
8571
+ },
8572
+ {
8573
+ "epoch": 0.476,
8574
+ "eval_loss": 2.38433575630188,
8575
+ "eval_runtime": 31.8506,
8576
+ "eval_samples_per_second": 3.202,
8577
+ "eval_steps_per_second": 1.601,
8578
+ "step": 23800
8579
+ },
8580
+ {
8581
+ "epoch": 0.4765,
8582
+ "grad_norm": 0.5604797565202618,
8583
+ "learning_rate": 5.81688888888889e-06,
8584
+ "loss": 2.3716,
8585
+ "step": 23825
8586
+ },
8587
+ {
8588
+ "epoch": 0.477,
8589
+ "grad_norm": 0.554661200228578,
8590
+ "learning_rate": 5.811333333333333e-06,
8591
+ "loss": 2.3724,
8592
+ "step": 23850
8593
+ },
8594
+ {
8595
+ "epoch": 0.4775,
8596
+ "grad_norm": 0.5534736868914567,
8597
+ "learning_rate": 5.8057777777777775e-06,
8598
+ "loss": 2.3754,
8599
+ "step": 23875
8600
+ },
8601
+ {
8602
+ "epoch": 0.478,
8603
+ "grad_norm": 0.541434243018937,
8604
+ "learning_rate": 5.800222222222223e-06,
8605
+ "loss": 2.3612,
8606
+ "step": 23900
8607
+ },
8608
+ {
8609
+ "epoch": 0.478,
8610
+ "eval_loss": 2.3843014240264893,
8611
+ "eval_runtime": 31.7803,
8612
+ "eval_samples_per_second": 3.21,
8613
+ "eval_steps_per_second": 1.605,
8614
+ "step": 23900
8615
+ },
8616
+ {
8617
+ "epoch": 0.4785,
8618
+ "grad_norm": 0.5557683143124796,
8619
+ "learning_rate": 5.794666666666667e-06,
8620
+ "loss": 2.3639,
8621
+ "step": 23925
8622
+ },
8623
+ {
8624
+ "epoch": 0.479,
8625
+ "grad_norm": 0.5799527873689908,
8626
+ "learning_rate": 5.789111111111112e-06,
8627
+ "loss": 2.373,
8628
+ "step": 23950
8629
+ },
8630
+ {
8631
+ "epoch": 0.4795,
8632
+ "grad_norm": 0.590904770982699,
8633
+ "learning_rate": 5.783555555555556e-06,
8634
+ "loss": 2.3778,
8635
+ "step": 23975
8636
+ },
8637
+ {
8638
+ "epoch": 0.48,
8639
+ "grad_norm": 0.5561040991296016,
8640
+ "learning_rate": 5.778e-06,
8641
+ "loss": 2.3552,
8642
+ "step": 24000
8643
+ },
8644
+ {
8645
+ "epoch": 0.48,
8646
+ "eval_loss": 2.3842599391937256,
8647
+ "eval_runtime": 31.7209,
8648
+ "eval_samples_per_second": 3.216,
8649
+ "eval_steps_per_second": 1.608,
8650
+ "step": 24000
8651
  }
8652
  ],
8653
  "logging_steps": 25,
 
8667
  "attributes": {}
8668
  }
8669
  },
8670
+ "total_flos": 7.63968551598529e+19,
8671
  "train_batch_size": 1,
8672
  "trial_name": null,
8673
  "trial_params": null