irodkin committed on
Commit
aa46eb8
·
verified ·
1 Parent(s): 5d6a324

Training checkpoint at step 16000

Browse files
Files changed (1) hide show
  1. trainer_state.json +365 -5
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 15000,
3
- "best_metric": 2.397136688232422,
4
  "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-15000",
5
- "epoch": 0.3,
6
  "eval_steps": 100,
7
- "global_step": 15000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5408,6 +5408,366 @@
5408
  "eval_samples_per_second": 3.216,
5409
  "eval_steps_per_second": 1.608,
5410
  "step": 15000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5411
  }
5412
  ],
5413
  "logging_steps": 25,
@@ -5427,7 +5787,7 @@
5427
  "attributes": {}
5428
  }
5429
  },
5430
- "total_flos": 4.774803447490806e+19,
5431
  "train_batch_size": 1,
5432
  "trial_name": null,
5433
  "trial_params": null
 
1
  {
2
+ "best_global_step": 15900,
3
+ "best_metric": 2.3957200050354004,
4
  "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-15000",
5
+ "epoch": 0.32,
6
  "eval_steps": 100,
7
+ "global_step": 16000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5408
  "eval_samples_per_second": 3.216,
5409
  "eval_steps_per_second": 1.608,
5410
  "step": 15000
5411
+ },
5412
+ {
5413
+ "epoch": 0.3005,
5414
+ "grad_norm": 0.5657448746286691,
5415
+ "learning_rate": 7.772444444444444e-06,
5416
+ "loss": 2.3897,
5417
+ "step": 15025
5418
+ },
5419
+ {
5420
+ "epoch": 0.301,
5421
+ "grad_norm": 0.5523525627604269,
5422
+ "learning_rate": 7.76688888888889e-06,
5423
+ "loss": 2.3795,
5424
+ "step": 15050
5425
+ },
5426
+ {
5427
+ "epoch": 0.3015,
5428
+ "grad_norm": 0.5950789860717867,
5429
+ "learning_rate": 7.761333333333335e-06,
5430
+ "loss": 2.3914,
5431
+ "step": 15075
5432
+ },
5433
+ {
5434
+ "epoch": 0.302,
5435
+ "grad_norm": 0.5999400034143391,
5436
+ "learning_rate": 7.755777777777778e-06,
5437
+ "loss": 2.3769,
5438
+ "step": 15100
5439
+ },
5440
+ {
5441
+ "epoch": 0.302,
5442
+ "eval_loss": 2.396873950958252,
5443
+ "eval_runtime": 31.7696,
5444
+ "eval_samples_per_second": 3.211,
5445
+ "eval_steps_per_second": 1.605,
5446
+ "step": 15100
5447
+ },
5448
+ {
5449
+ "epoch": 0.3025,
5450
+ "grad_norm": 0.558834977842146,
5451
+ "learning_rate": 7.750222222222222e-06,
5452
+ "loss": 2.3854,
5453
+ "step": 15125
5454
+ },
5455
+ {
5456
+ "epoch": 0.303,
5457
+ "grad_norm": 0.5582295283472423,
5458
+ "learning_rate": 7.744666666666667e-06,
5459
+ "loss": 2.3821,
5460
+ "step": 15150
5461
+ },
5462
+ {
5463
+ "epoch": 0.3035,
5464
+ "grad_norm": 0.5632905015995245,
5465
+ "learning_rate": 7.739111111111112e-06,
5466
+ "loss": 2.3798,
5467
+ "step": 15175
5468
+ },
5469
+ {
5470
+ "epoch": 0.304,
5471
+ "grad_norm": 0.5514118333084079,
5472
+ "learning_rate": 7.733555555555556e-06,
5473
+ "loss": 2.3788,
5474
+ "step": 15200
5475
+ },
5476
+ {
5477
+ "epoch": 0.304,
5478
+ "eval_loss": 2.3965888023376465,
5479
+ "eval_runtime": 31.7152,
5480
+ "eval_samples_per_second": 3.216,
5481
+ "eval_steps_per_second": 1.608,
5482
+ "step": 15200
5483
+ },
5484
+ {
5485
+ "epoch": 0.3045,
5486
+ "grad_norm": 0.5649018768322466,
5487
+ "learning_rate": 7.728000000000001e-06,
5488
+ "loss": 2.3912,
5489
+ "step": 15225
5490
+ },
5491
+ {
5492
+ "epoch": 0.305,
5493
+ "grad_norm": 0.581561230195339,
5494
+ "learning_rate": 7.722444444444445e-06,
5495
+ "loss": 2.3766,
5496
+ "step": 15250
5497
+ },
5498
+ {
5499
+ "epoch": 0.3055,
5500
+ "grad_norm": 0.5604985750115082,
5501
+ "learning_rate": 7.71688888888889e-06,
5502
+ "loss": 2.3852,
5503
+ "step": 15275
5504
+ },
5505
+ {
5506
+ "epoch": 0.306,
5507
+ "grad_norm": 0.5602736035393524,
5508
+ "learning_rate": 7.711333333333334e-06,
5509
+ "loss": 2.3867,
5510
+ "step": 15300
5511
+ },
5512
+ {
5513
+ "epoch": 0.306,
5514
+ "eval_loss": 2.3968026638031006,
5515
+ "eval_runtime": 31.8105,
5516
+ "eval_samples_per_second": 3.206,
5517
+ "eval_steps_per_second": 1.603,
5518
+ "step": 15300
5519
+ },
5520
+ {
5521
+ "epoch": 0.3065,
5522
+ "grad_norm": 0.5404472339052024,
5523
+ "learning_rate": 7.705777777777779e-06,
5524
+ "loss": 2.3835,
5525
+ "step": 15325
5526
+ },
5527
+ {
5528
+ "epoch": 0.307,
5529
+ "grad_norm": 0.5732167481475767,
5530
+ "learning_rate": 7.700222222222224e-06,
5531
+ "loss": 2.386,
5532
+ "step": 15350
5533
+ },
5534
+ {
5535
+ "epoch": 0.3075,
5536
+ "grad_norm": 0.5668975128857069,
5537
+ "learning_rate": 7.694666666666668e-06,
5538
+ "loss": 2.3838,
5539
+ "step": 15375
5540
+ },
5541
+ {
5542
+ "epoch": 0.308,
5543
+ "grad_norm": 0.5478312505357384,
5544
+ "learning_rate": 7.689111111111111e-06,
5545
+ "loss": 2.4068,
5546
+ "step": 15400
5547
+ },
5548
+ {
5549
+ "epoch": 0.308,
5550
+ "eval_loss": 2.39662766456604,
5551
+ "eval_runtime": 31.4625,
5552
+ "eval_samples_per_second": 3.242,
5553
+ "eval_steps_per_second": 1.621,
5554
+ "step": 15400
5555
+ },
5556
+ {
5557
+ "epoch": 0.3085,
5558
+ "grad_norm": 0.5853236703412803,
5559
+ "learning_rate": 7.683555555555556e-06,
5560
+ "loss": 2.3781,
5561
+ "step": 15425
5562
+ },
5563
+ {
5564
+ "epoch": 0.309,
5565
+ "grad_norm": 0.566498029803985,
5566
+ "learning_rate": 7.678000000000002e-06,
5567
+ "loss": 2.3825,
5568
+ "step": 15450
5569
+ },
5570
+ {
5571
+ "epoch": 0.3095,
5572
+ "grad_norm": 0.5876295223419085,
5573
+ "learning_rate": 7.672444444444445e-06,
5574
+ "loss": 2.3821,
5575
+ "step": 15475
5576
+ },
5577
+ {
5578
+ "epoch": 0.31,
5579
+ "grad_norm": 0.5308633915785282,
5580
+ "learning_rate": 7.666888888888889e-06,
5581
+ "loss": 2.3762,
5582
+ "step": 15500
5583
+ },
5584
+ {
5585
+ "epoch": 0.31,
5586
+ "eval_loss": 2.39650559425354,
5587
+ "eval_runtime": 31.6255,
5588
+ "eval_samples_per_second": 3.225,
5589
+ "eval_steps_per_second": 1.613,
5590
+ "step": 15500
5591
+ },
5592
+ {
5593
+ "epoch": 0.3105,
5594
+ "grad_norm": 1.090575647217174,
5595
+ "learning_rate": 7.661333333333334e-06,
5596
+ "loss": 2.3854,
5597
+ "step": 15525
5598
+ },
5599
+ {
5600
+ "epoch": 0.311,
5601
+ "grad_norm": 0.5608565584872227,
5602
+ "learning_rate": 7.65577777777778e-06,
5603
+ "loss": 2.3909,
5604
+ "step": 15550
5605
+ },
5606
+ {
5607
+ "epoch": 0.3115,
5608
+ "grad_norm": 0.5664910219445479,
5609
+ "learning_rate": 7.650222222222223e-06,
5610
+ "loss": 2.3876,
5611
+ "step": 15575
5612
+ },
5613
+ {
5614
+ "epoch": 0.312,
5615
+ "grad_norm": 0.5743138998726522,
5616
+ "learning_rate": 7.644666666666666e-06,
5617
+ "loss": 2.3891,
5618
+ "step": 15600
5619
+ },
5620
+ {
5621
+ "epoch": 0.312,
5622
+ "eval_loss": 2.395846128463745,
5623
+ "eval_runtime": 31.422,
5624
+ "eval_samples_per_second": 3.246,
5625
+ "eval_steps_per_second": 1.623,
5626
+ "step": 15600
5627
+ },
5628
+ {
5629
+ "epoch": 0.3125,
5630
+ "grad_norm": 0.5838966503811626,
5631
+ "learning_rate": 7.639111111111112e-06,
5632
+ "loss": 2.3744,
5633
+ "step": 15625
5634
+ },
5635
+ {
5636
+ "epoch": 0.313,
5637
+ "grad_norm": 0.5861982665217826,
5638
+ "learning_rate": 7.633555555555557e-06,
5639
+ "loss": 2.386,
5640
+ "step": 15650
5641
+ },
5642
+ {
5643
+ "epoch": 0.3135,
5644
+ "grad_norm": 0.5623110973377239,
5645
+ "learning_rate": 7.628000000000001e-06,
5646
+ "loss": 2.3729,
5647
+ "step": 15675
5648
+ },
5649
+ {
5650
+ "epoch": 0.314,
5651
+ "grad_norm": 0.5546807091447383,
5652
+ "learning_rate": 7.622444444444445e-06,
5653
+ "loss": 2.3758,
5654
+ "step": 15700
5655
+ },
5656
+ {
5657
+ "epoch": 0.314,
5658
+ "eval_loss": 2.396050453186035,
5659
+ "eval_runtime": 31.4839,
5660
+ "eval_samples_per_second": 3.24,
5661
+ "eval_steps_per_second": 1.62,
5662
+ "step": 15700
5663
+ },
5664
+ {
5665
+ "epoch": 0.3145,
5666
+ "grad_norm": 0.566357543453858,
5667
+ "learning_rate": 7.616888888888889e-06,
5668
+ "loss": 2.3814,
5669
+ "step": 15725
5670
+ },
5671
+ {
5672
+ "epoch": 0.315,
5673
+ "grad_norm": 0.5863021742964364,
5674
+ "learning_rate": 7.611333333333334e-06,
5675
+ "loss": 2.3912,
5676
+ "step": 15750
5677
+ },
5678
+ {
5679
+ "epoch": 0.3155,
5680
+ "grad_norm": 0.5448091994015362,
5681
+ "learning_rate": 7.605777777777779e-06,
5682
+ "loss": 2.3949,
5683
+ "step": 15775
5684
+ },
5685
+ {
5686
+ "epoch": 0.316,
5687
+ "grad_norm": 0.5571622234957405,
5688
+ "learning_rate": 7.600222222222223e-06,
5689
+ "loss": 2.3893,
5690
+ "step": 15800
5691
+ },
5692
+ {
5693
+ "epoch": 0.316,
5694
+ "eval_loss": 2.3957884311676025,
5695
+ "eval_runtime": 31.4676,
5696
+ "eval_samples_per_second": 3.241,
5697
+ "eval_steps_per_second": 1.621,
5698
+ "step": 15800
5699
+ },
5700
+ {
5701
+ "epoch": 0.3165,
5702
+ "grad_norm": 0.6175149611764096,
5703
+ "learning_rate": 7.594666666666667e-06,
5704
+ "loss": 2.3858,
5705
+ "step": 15825
5706
+ },
5707
+ {
5708
+ "epoch": 0.317,
5709
+ "grad_norm": 0.5811416818392343,
5710
+ "learning_rate": 7.589111111111111e-06,
5711
+ "loss": 2.3893,
5712
+ "step": 15850
5713
+ },
5714
+ {
5715
+ "epoch": 0.3175,
5716
+ "grad_norm": 0.5685262674194088,
5717
+ "learning_rate": 7.5835555555555566e-06,
5718
+ "loss": 2.3895,
5719
+ "step": 15875
5720
+ },
5721
+ {
5722
+ "epoch": 0.318,
5723
+ "grad_norm": 0.5726231388910242,
5724
+ "learning_rate": 7.578000000000001e-06,
5725
+ "loss": 2.3924,
5726
+ "step": 15900
5727
+ },
5728
+ {
5729
+ "epoch": 0.318,
5730
+ "eval_loss": 2.3957200050354004,
5731
+ "eval_runtime": 31.6833,
5732
+ "eval_samples_per_second": 3.219,
5733
+ "eval_steps_per_second": 1.61,
5734
+ "step": 15900
5735
+ },
5736
+ {
5737
+ "epoch": 0.3185,
5738
+ "grad_norm": 0.5881014617899262,
5739
+ "learning_rate": 7.572444444444445e-06,
5740
+ "loss": 2.3719,
5741
+ "step": 15925
5742
+ },
5743
+ {
5744
+ "epoch": 0.319,
5745
+ "grad_norm": 0.5635459036409981,
5746
+ "learning_rate": 7.566888888888889e-06,
5747
+ "loss": 2.378,
5748
+ "step": 15950
5749
+ },
5750
+ {
5751
+ "epoch": 0.3195,
5752
+ "grad_norm": 0.5604907919572244,
5753
+ "learning_rate": 7.561333333333334e-06,
5754
+ "loss": 2.3744,
5755
+ "step": 15975
5756
+ },
5757
+ {
5758
+ "epoch": 0.32,
5759
+ "grad_norm": 0.5743956921241223,
5760
+ "learning_rate": 7.555777777777779e-06,
5761
+ "loss": 2.3872,
5762
+ "step": 16000
5763
+ },
5764
+ {
5765
+ "epoch": 0.32,
5766
+ "eval_loss": 2.3958442211151123,
5767
+ "eval_runtime": 31.9703,
5768
+ "eval_samples_per_second": 3.19,
5769
+ "eval_steps_per_second": 1.595,
5770
+ "step": 16000
5771
  }
5772
  ],
5773
  "logging_steps": 25,
 
5787
  "attributes": {}
5788
  }
5789
  },
5790
+ "total_flos": 5.093123677323526e+19,
5791
  "train_batch_size": 1,
5792
  "trial_name": null,
5793
  "trial_params": null