irodkin commited on
Commit
e12c15e
·
verified ·
1 Parent(s): da0f1ec

Training checkpoint at step 27000

Browse files
Files changed (1) hide show
  1. trainer_state.json +365 -5
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 25900,
3
- "best_metric": 2.3824901580810547,
4
  "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-25000",
5
- "epoch": 0.52,
6
  "eval_steps": 100,
7
- "global_step": 26000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -9368,6 +9368,366 @@
9368
  "eval_samples_per_second": 3.212,
9369
  "eval_steps_per_second": 1.606,
9370
  "step": 26000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9371
  }
9372
  ],
9373
  "logging_steps": 25,
@@ -9387,7 +9747,7 @@
9387
  "attributes": {}
9388
  }
9389
  },
9390
- "total_flos": 8.27632597565073e+19,
9391
  "train_batch_size": 1,
9392
  "trial_name": null,
9393
  "trial_params": null
 
1
  {
2
+ "best_global_step": 26800,
3
+ "best_metric": 2.381396532058716,
4
  "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-25000",
5
+ "epoch": 0.54,
6
  "eval_steps": 100,
7
+ "global_step": 27000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
9368
  "eval_samples_per_second": 3.212,
9369
  "eval_steps_per_second": 1.606,
9370
  "step": 26000
9371
+ },
9372
+ {
9373
+ "epoch": 0.5205,
9374
+ "grad_norm": 0.5425846904290578,
9375
+ "learning_rate": 5.328000000000001e-06,
9376
+ "loss": 2.3675,
9377
+ "step": 26025
9378
+ },
9379
+ {
9380
+ "epoch": 0.521,
9381
+ "grad_norm": 0.5621800567569987,
9382
+ "learning_rate": 5.322444444444445e-06,
9383
+ "loss": 2.3759,
9384
+ "step": 26050
9385
+ },
9386
+ {
9387
+ "epoch": 0.5215,
9388
+ "grad_norm": 0.5544103291449336,
9389
+ "learning_rate": 5.316888888888889e-06,
9390
+ "loss": 2.3576,
9391
+ "step": 26075
9392
+ },
9393
+ {
9394
+ "epoch": 0.522,
9395
+ "grad_norm": 0.550125457461572,
9396
+ "learning_rate": 5.311333333333334e-06,
9397
+ "loss": 2.3567,
9398
+ "step": 26100
9399
+ },
9400
+ {
9401
+ "epoch": 0.522,
9402
+ "eval_loss": 2.382749319076538,
9403
+ "eval_runtime": 31.8184,
9404
+ "eval_samples_per_second": 3.206,
9405
+ "eval_steps_per_second": 1.603,
9406
+ "step": 26100
9407
+ },
9408
+ {
9409
+ "epoch": 0.5225,
9410
+ "grad_norm": 0.5441956885780074,
9411
+ "learning_rate": 5.305777777777779e-06,
9412
+ "loss": 2.3562,
9413
+ "step": 26125
9414
+ },
9415
+ {
9416
+ "epoch": 0.523,
9417
+ "grad_norm": 0.5677266247403775,
9418
+ "learning_rate": 5.3002222222222225e-06,
9419
+ "loss": 2.3666,
9420
+ "step": 26150
9421
+ },
9422
+ {
9423
+ "epoch": 0.5235,
9424
+ "grad_norm": 0.5396975563673215,
9425
+ "learning_rate": 5.294666666666667e-06,
9426
+ "loss": 2.351,
9427
+ "step": 26175
9428
+ },
9429
+ {
9430
+ "epoch": 0.524,
9431
+ "grad_norm": 0.5374437057610971,
9432
+ "learning_rate": 5.289111111111111e-06,
9433
+ "loss": 2.3625,
9434
+ "step": 26200
9435
+ },
9436
+ {
9437
+ "epoch": 0.524,
9438
+ "eval_loss": 2.3822991847991943,
9439
+ "eval_runtime": 31.8822,
9440
+ "eval_samples_per_second": 3.199,
9441
+ "eval_steps_per_second": 1.6,
9442
+ "step": 26200
9443
+ },
9444
+ {
9445
+ "epoch": 0.5245,
9446
+ "grad_norm": 0.5627076715491244,
9447
+ "learning_rate": 5.2835555555555566e-06,
9448
+ "loss": 2.3699,
9449
+ "step": 26225
9450
+ },
9451
+ {
9452
+ "epoch": 0.525,
9453
+ "grad_norm": 0.5430691314388109,
9454
+ "learning_rate": 5.278000000000001e-06,
9455
+ "loss": 2.3648,
9456
+ "step": 26250
9457
+ },
9458
+ {
9459
+ "epoch": 0.5255,
9460
+ "grad_norm": 0.5319128139639624,
9461
+ "learning_rate": 5.2724444444444445e-06,
9462
+ "loss": 2.3722,
9463
+ "step": 26275
9464
+ },
9465
+ {
9466
+ "epoch": 0.526,
9467
+ "grad_norm": 0.5560009569047116,
9468
+ "learning_rate": 5.266888888888889e-06,
9469
+ "loss": 2.3763,
9470
+ "step": 26300
9471
+ },
9472
+ {
9473
+ "epoch": 0.526,
9474
+ "eval_loss": 2.3822247982025146,
9475
+ "eval_runtime": 31.7558,
9476
+ "eval_samples_per_second": 3.212,
9477
+ "eval_steps_per_second": 1.606,
9478
+ "step": 26300
9479
+ },
9480
+ {
9481
+ "epoch": 0.5265,
9482
+ "grad_norm": 0.5586923319248112,
9483
+ "learning_rate": 5.261333333333334e-06,
9484
+ "loss": 2.366,
9485
+ "step": 26325
9486
+ },
9487
+ {
9488
+ "epoch": 0.527,
9489
+ "grad_norm": 0.5621950392943218,
9490
+ "learning_rate": 5.255777777777779e-06,
9491
+ "loss": 2.3713,
9492
+ "step": 26350
9493
+ },
9494
+ {
9495
+ "epoch": 0.5275,
9496
+ "grad_norm": 0.5630783729958978,
9497
+ "learning_rate": 5.250222222222222e-06,
9498
+ "loss": 2.3508,
9499
+ "step": 26375
9500
+ },
9501
+ {
9502
+ "epoch": 0.528,
9503
+ "grad_norm": 0.5543463911581646,
9504
+ "learning_rate": 5.2446666666666665e-06,
9505
+ "loss": 2.3588,
9506
+ "step": 26400
9507
+ },
9508
+ {
9509
+ "epoch": 0.528,
9510
+ "eval_loss": 2.3820412158966064,
9511
+ "eval_runtime": 31.7735,
9512
+ "eval_samples_per_second": 3.21,
9513
+ "eval_steps_per_second": 1.605,
9514
+ "step": 26400
9515
+ },
9516
+ {
9517
+ "epoch": 0.5285,
9518
+ "grad_norm": 0.5521701819516005,
9519
+ "learning_rate": 5.239111111111112e-06,
9520
+ "loss": 2.3798,
9521
+ "step": 26425
9522
+ },
9523
+ {
9524
+ "epoch": 0.529,
9525
+ "grad_norm": 0.5697290541696707,
9526
+ "learning_rate": 5.233555555555556e-06,
9527
+ "loss": 2.3705,
9528
+ "step": 26450
9529
+ },
9530
+ {
9531
+ "epoch": 0.5295,
9532
+ "grad_norm": 0.5456656767494042,
9533
+ "learning_rate": 5.228000000000001e-06,
9534
+ "loss": 2.3603,
9535
+ "step": 26475
9536
+ },
9537
+ {
9538
+ "epoch": 0.53,
9539
+ "grad_norm": 0.544157308823069,
9540
+ "learning_rate": 5.222444444444444e-06,
9541
+ "loss": 2.3598,
9542
+ "step": 26500
9543
+ },
9544
+ {
9545
+ "epoch": 0.53,
9546
+ "eval_loss": 2.3819408416748047,
9547
+ "eval_runtime": 31.804,
9548
+ "eval_samples_per_second": 3.207,
9549
+ "eval_steps_per_second": 1.604,
9550
+ "step": 26500
9551
+ },
9552
+ {
9553
+ "epoch": 0.5305,
9554
+ "grad_norm": 0.5399718074412095,
9555
+ "learning_rate": 5.216888888888889e-06,
9556
+ "loss": 2.3765,
9557
+ "step": 26525
9558
+ },
9559
+ {
9560
+ "epoch": 0.531,
9561
+ "grad_norm": 0.542440216852853,
9562
+ "learning_rate": 5.211333333333334e-06,
9563
+ "loss": 2.3758,
9564
+ "step": 26550
9565
+ },
9566
+ {
9567
+ "epoch": 0.5315,
9568
+ "grad_norm": 0.5648571300651365,
9569
+ "learning_rate": 5.205777777777778e-06,
9570
+ "loss": 2.3685,
9571
+ "step": 26575
9572
+ },
9573
+ {
9574
+ "epoch": 0.532,
9575
+ "grad_norm": 0.573442767423831,
9576
+ "learning_rate": 5.2002222222222235e-06,
9577
+ "loss": 2.3556,
9578
+ "step": 26600
9579
+ },
9580
+ {
9581
+ "epoch": 0.532,
9582
+ "eval_loss": 2.382056951522827,
9583
+ "eval_runtime": 31.8038,
9584
+ "eval_samples_per_second": 3.207,
9585
+ "eval_steps_per_second": 1.604,
9586
+ "step": 26600
9587
+ },
9588
+ {
9589
+ "epoch": 0.5325,
9590
+ "grad_norm": 0.6056414806190663,
9591
+ "learning_rate": 5.194666666666667e-06,
9592
+ "loss": 2.3595,
9593
+ "step": 26625
9594
+ },
9595
+ {
9596
+ "epoch": 0.533,
9597
+ "grad_norm": 0.5481757619700885,
9598
+ "learning_rate": 5.189111111111111e-06,
9599
+ "loss": 2.3727,
9600
+ "step": 26650
9601
+ },
9602
+ {
9603
+ "epoch": 0.5335,
9604
+ "grad_norm": 0.5610562792027696,
9605
+ "learning_rate": 5.183555555555556e-06,
9606
+ "loss": 2.3673,
9607
+ "step": 26675
9608
+ },
9609
+ {
9610
+ "epoch": 0.534,
9611
+ "grad_norm": 0.5702347426339772,
9612
+ "learning_rate": 5.178000000000001e-06,
9613
+ "loss": 2.3622,
9614
+ "step": 26700
9615
+ },
9616
+ {
9617
+ "epoch": 0.534,
9618
+ "eval_loss": 2.381828546524048,
9619
+ "eval_runtime": 31.992,
9620
+ "eval_samples_per_second": 3.188,
9621
+ "eval_steps_per_second": 1.594,
9622
+ "step": 26700
9623
+ },
9624
+ {
9625
+ "epoch": 0.5345,
9626
+ "grad_norm": 0.5565593579595437,
9627
+ "learning_rate": 5.172444444444445e-06,
9628
+ "loss": 2.3651,
9629
+ "step": 26725
9630
+ },
9631
+ {
9632
+ "epoch": 0.535,
9633
+ "grad_norm": 0.5398272748687973,
9634
+ "learning_rate": 5.166888888888889e-06,
9635
+ "loss": 2.3703,
9636
+ "step": 26750
9637
+ },
9638
+ {
9639
+ "epoch": 0.5355,
9640
+ "grad_norm": 0.5611538131409728,
9641
+ "learning_rate": 5.1613333333333334e-06,
9642
+ "loss": 2.3778,
9643
+ "step": 26775
9644
+ },
9645
+ {
9646
+ "epoch": 0.536,
9647
+ "grad_norm": 0.5436520053621182,
9648
+ "learning_rate": 5.155777777777779e-06,
9649
+ "loss": 2.3561,
9650
+ "step": 26800
9651
+ },
9652
+ {
9653
+ "epoch": 0.536,
9654
+ "eval_loss": 2.381396532058716,
9655
+ "eval_runtime": 31.772,
9656
+ "eval_samples_per_second": 3.21,
9657
+ "eval_steps_per_second": 1.605,
9658
+ "step": 26800
9659
+ },
9660
+ {
9661
+ "epoch": 0.5365,
9662
+ "grad_norm": 0.5574841239488896,
9663
+ "learning_rate": 5.150222222222223e-06,
9664
+ "loss": 2.3607,
9665
+ "step": 26825
9666
+ },
9667
+ {
9668
+ "epoch": 0.537,
9669
+ "grad_norm": 0.5459267231396281,
9670
+ "learning_rate": 5.144666666666667e-06,
9671
+ "loss": 2.3652,
9672
+ "step": 26850
9673
+ },
9674
+ {
9675
+ "epoch": 0.5375,
9676
+ "grad_norm": 0.5764624554311072,
9677
+ "learning_rate": 5.139111111111111e-06,
9678
+ "loss": 2.3748,
9679
+ "step": 26875
9680
+ },
9681
+ {
9682
+ "epoch": 0.538,
9683
+ "grad_norm": 0.5452582655691465,
9684
+ "learning_rate": 5.133555555555556e-06,
9685
+ "loss": 2.3751,
9686
+ "step": 26900
9687
+ },
9688
+ {
9689
+ "epoch": 0.538,
9690
+ "eval_loss": 2.3815813064575195,
9691
+ "eval_runtime": 31.833,
9692
+ "eval_samples_per_second": 3.204,
9693
+ "eval_steps_per_second": 1.602,
9694
+ "step": 26900
9695
+ },
9696
+ {
9697
+ "epoch": 0.5385,
9698
+ "grad_norm": 0.5591974032204698,
9699
+ "learning_rate": 5.128000000000001e-06,
9700
+ "loss": 2.3595,
9701
+ "step": 26925
9702
+ },
9703
+ {
9704
+ "epoch": 0.539,
9705
+ "grad_norm": 0.5910956937930101,
9706
+ "learning_rate": 5.122444444444444e-06,
9707
+ "loss": 2.3712,
9708
+ "step": 26950
9709
+ },
9710
+ {
9711
+ "epoch": 0.5395,
9712
+ "grad_norm": 0.5532516136915937,
9713
+ "learning_rate": 5.116888888888889e-06,
9714
+ "loss": 2.3673,
9715
+ "step": 26975
9716
+ },
9717
+ {
9718
+ "epoch": 0.54,
9719
+ "grad_norm": 0.5654498740726267,
9720
+ "learning_rate": 5.111333333333334e-06,
9721
+ "loss": 2.3667,
9722
+ "step": 27000
9723
+ },
9724
+ {
9725
+ "epoch": 0.54,
9726
+ "eval_loss": 2.3814122676849365,
9727
+ "eval_runtime": 31.7588,
9728
+ "eval_samples_per_second": 3.212,
9729
+ "eval_steps_per_second": 1.606,
9730
+ "step": 27000
9731
  }
9732
  ],
9733
  "logging_steps": 25,
 
9747
  "attributes": {}
9748
  }
9749
  },
9750
+ "total_flos": 8.59464620548345e+19,
9751
  "train_batch_size": 1,
9752
  "trial_name": null,
9753
  "trial_params": null