irodkin committed on
Commit
787b8f8
·
verified ·
1 Parent(s): e61b665

Training checkpoint at step 19000

Browse files
Files changed (1) hide show
  1. trainer_state.json +366 -6
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 18000,
3
- "best_metric": 2.3920133113861084,
4
- "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-18000",
5
- "epoch": 0.36,
6
  "eval_steps": 100,
7
- "global_step": 18000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -6488,6 +6488,366 @@
6488
  "eval_samples_per_second": 3.195,
6489
  "eval_steps_per_second": 1.597,
6490
  "step": 18000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6491
  }
6492
  ],
6493
  "logging_steps": 25,
@@ -6507,7 +6867,7 @@
6507
  "attributes": {}
6508
  }
6509
  },
6510
- "total_flos": 5.729764136988967e+19,
6511
  "train_batch_size": 1,
6512
  "trial_name": null,
6513
  "trial_params": null
 
1
  {
2
+ "best_global_step": 19000,
3
+ "best_metric": 2.390749454498291,
4
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-19000",
5
+ "epoch": 0.38,
6
  "eval_steps": 100,
7
+ "global_step": 19000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
6488
  "eval_samples_per_second": 3.195,
6489
  "eval_steps_per_second": 1.597,
6490
  "step": 18000
6491
+ },
6492
+ {
6493
+ "epoch": 0.3605,
6494
+ "grad_norm": 0.5779120077378331,
6495
+ "learning_rate": 7.105777777777778e-06,
6496
+ "loss": 2.3798,
6497
+ "step": 18025
6498
+ },
6499
+ {
6500
+ "epoch": 0.361,
6501
+ "grad_norm": 0.575309417070187,
6502
+ "learning_rate": 7.100222222222223e-06,
6503
+ "loss": 2.3875,
6504
+ "step": 18050
6505
+ },
6506
+ {
6507
+ "epoch": 0.3615,
6508
+ "grad_norm": 0.6000430306182747,
6509
+ "learning_rate": 7.0946666666666675e-06,
6510
+ "loss": 2.3727,
6511
+ "step": 18075
6512
+ },
6513
+ {
6514
+ "epoch": 0.362,
6515
+ "grad_norm": 0.5701734522791184,
6516
+ "learning_rate": 7.089111111111112e-06,
6517
+ "loss": 2.3793,
6518
+ "step": 18100
6519
+ },
6520
+ {
6521
+ "epoch": 0.362,
6522
+ "eval_loss": 2.392152786254883,
6523
+ "eval_runtime": 31.8363,
6524
+ "eval_samples_per_second": 3.204,
6525
+ "eval_steps_per_second": 1.602,
6526
+ "step": 18100
6527
+ },
6528
+ {
6529
+ "epoch": 0.3625,
6530
+ "grad_norm": 0.5731611332750656,
6531
+ "learning_rate": 7.083555555555555e-06,
6532
+ "loss": 2.3715,
6533
+ "step": 18125
6534
+ },
6535
+ {
6536
+ "epoch": 0.363,
6537
+ "grad_norm": 0.6114229583074544,
6538
+ "learning_rate": 7.078000000000001e-06,
6539
+ "loss": 2.383,
6540
+ "step": 18150
6541
+ },
6542
+ {
6543
+ "epoch": 0.3635,
6544
+ "grad_norm": 0.541007634609165,
6545
+ "learning_rate": 7.072444444444445e-06,
6546
+ "loss": 2.3686,
6547
+ "step": 18175
6548
+ },
6549
+ {
6550
+ "epoch": 0.364,
6551
+ "grad_norm": 0.5725748950012406,
6552
+ "learning_rate": 7.0668888888888895e-06,
6553
+ "loss": 2.3873,
6554
+ "step": 18200
6555
+ },
6556
+ {
6557
+ "epoch": 0.364,
6558
+ "eval_loss": 2.392261505126953,
6559
+ "eval_runtime": 31.7706,
6560
+ "eval_samples_per_second": 3.211,
6561
+ "eval_steps_per_second": 1.605,
6562
+ "step": 18200
6563
+ },
6564
+ {
6565
+ "epoch": 0.3645,
6566
+ "grad_norm": 0.5593670656564304,
6567
+ "learning_rate": 7.061333333333333e-06,
6568
+ "loss": 2.3804,
6569
+ "step": 18225
6570
+ },
6571
+ {
6572
+ "epoch": 0.365,
6573
+ "grad_norm": 0.6009795583649221,
6574
+ "learning_rate": 7.055777777777778e-06,
6575
+ "loss": 2.3795,
6576
+ "step": 18250
6577
+ },
6578
+ {
6579
+ "epoch": 0.3655,
6580
+ "grad_norm": 0.5664495345544722,
6581
+ "learning_rate": 7.050222222222223e-06,
6582
+ "loss": 2.3631,
6583
+ "step": 18275
6584
+ },
6585
+ {
6586
+ "epoch": 0.366,
6587
+ "grad_norm": 0.6104006309418994,
6588
+ "learning_rate": 7.044666666666667e-06,
6589
+ "loss": 2.3748,
6590
+ "step": 18300
6591
+ },
6592
+ {
6593
+ "epoch": 0.366,
6594
+ "eval_loss": 2.392148971557617,
6595
+ "eval_runtime": 31.734,
6596
+ "eval_samples_per_second": 3.214,
6597
+ "eval_steps_per_second": 1.607,
6598
+ "step": 18300
6599
+ },
6600
+ {
6601
+ "epoch": 0.3665,
6602
+ "grad_norm": 0.5506059883330837,
6603
+ "learning_rate": 7.039111111111112e-06,
6604
+ "loss": 2.3714,
6605
+ "step": 18325
6606
+ },
6607
+ {
6608
+ "epoch": 0.367,
6609
+ "grad_norm": 0.5621509156408089,
6610
+ "learning_rate": 7.033555555555556e-06,
6611
+ "loss": 2.368,
6612
+ "step": 18350
6613
+ },
6614
+ {
6615
+ "epoch": 0.3675,
6616
+ "grad_norm": 0.5587181787810226,
6617
+ "learning_rate": 7.028e-06,
6618
+ "loss": 2.3791,
6619
+ "step": 18375
6620
+ },
6621
+ {
6622
+ "epoch": 0.368,
6623
+ "grad_norm": 0.5677798724220077,
6624
+ "learning_rate": 7.022444444444445e-06,
6625
+ "loss": 2.384,
6626
+ "step": 18400
6627
+ },
6628
+ {
6629
+ "epoch": 0.368,
6630
+ "eval_loss": 2.391704559326172,
6631
+ "eval_runtime": 31.7798,
6632
+ "eval_samples_per_second": 3.21,
6633
+ "eval_steps_per_second": 1.605,
6634
+ "step": 18400
6635
+ },
6636
+ {
6637
+ "epoch": 0.3685,
6638
+ "grad_norm": 0.5905061339542746,
6639
+ "learning_rate": 7.01688888888889e-06,
6640
+ "loss": 2.3881,
6641
+ "step": 18425
6642
+ },
6643
+ {
6644
+ "epoch": 0.369,
6645
+ "grad_norm": 0.554978244766298,
6646
+ "learning_rate": 7.011333333333334e-06,
6647
+ "loss": 2.3683,
6648
+ "step": 18450
6649
+ },
6650
+ {
6651
+ "epoch": 0.3695,
6652
+ "grad_norm": 0.5517801842410981,
6653
+ "learning_rate": 7.005777777777778e-06,
6654
+ "loss": 2.3835,
6655
+ "step": 18475
6656
+ },
6657
+ {
6658
+ "epoch": 0.37,
6659
+ "grad_norm": 0.5501181046318251,
6660
+ "learning_rate": 7.000222222222222e-06,
6661
+ "loss": 2.374,
6662
+ "step": 18500
6663
+ },
6664
+ {
6665
+ "epoch": 0.37,
6666
+ "eval_loss": 2.3915836811065674,
6667
+ "eval_runtime": 31.7662,
6668
+ "eval_samples_per_second": 3.211,
6669
+ "eval_steps_per_second": 1.605,
6670
+ "step": 18500
6671
+ },
6672
+ {
6673
+ "epoch": 0.3705,
6674
+ "grad_norm": 0.576826996404141,
6675
+ "learning_rate": 6.9946666666666676e-06,
6676
+ "loss": 2.3819,
6677
+ "step": 18525
6678
+ },
6679
+ {
6680
+ "epoch": 0.371,
6681
+ "grad_norm": 0.5739797151959755,
6682
+ "learning_rate": 6.989111111111112e-06,
6683
+ "loss": 2.3794,
6684
+ "step": 18550
6685
+ },
6686
+ {
6687
+ "epoch": 0.3715,
6688
+ "grad_norm": 0.5511012262440002,
6689
+ "learning_rate": 6.9835555555555555e-06,
6690
+ "loss": 2.3894,
6691
+ "step": 18575
6692
+ },
6693
+ {
6694
+ "epoch": 0.372,
6695
+ "grad_norm": 0.5958849979817049,
6696
+ "learning_rate": 6.978e-06,
6697
+ "loss": 2.3674,
6698
+ "step": 18600
6699
+ },
6700
+ {
6701
+ "epoch": 0.372,
6702
+ "eval_loss": 2.391352415084839,
6703
+ "eval_runtime": 31.7756,
6704
+ "eval_samples_per_second": 3.21,
6705
+ "eval_steps_per_second": 1.605,
6706
+ "step": 18600
6707
+ },
6708
+ {
6709
+ "epoch": 0.3725,
6710
+ "grad_norm": 0.5595892595435197,
6711
+ "learning_rate": 6.972444444444445e-06,
6712
+ "loss": 2.3835,
6713
+ "step": 18625
6714
+ },
6715
+ {
6716
+ "epoch": 0.373,
6717
+ "grad_norm": 0.5946746403488841,
6718
+ "learning_rate": 6.96688888888889e-06,
6719
+ "loss": 2.3716,
6720
+ "step": 18650
6721
+ },
6722
+ {
6723
+ "epoch": 0.3735,
6724
+ "grad_norm": 0.5613740876716816,
6725
+ "learning_rate": 6.961333333333334e-06,
6726
+ "loss": 2.3843,
6727
+ "step": 18675
6728
+ },
6729
+ {
6730
+ "epoch": 0.374,
6731
+ "grad_norm": 0.58419422677193,
6732
+ "learning_rate": 6.9557777777777776e-06,
6733
+ "loss": 2.3883,
6734
+ "step": 18700
6735
+ },
6736
+ {
6737
+ "epoch": 0.374,
6738
+ "eval_loss": 2.391383409500122,
6739
+ "eval_runtime": 31.7182,
6740
+ "eval_samples_per_second": 3.216,
6741
+ "eval_steps_per_second": 1.608,
6742
+ "step": 18700
6743
+ },
6744
+ {
6745
+ "epoch": 0.3745,
6746
+ "grad_norm": 0.5508427755524951,
6747
+ "learning_rate": 6.950222222222223e-06,
6748
+ "loss": 2.3749,
6749
+ "step": 18725
6750
+ },
6751
+ {
6752
+ "epoch": 0.375,
6753
+ "grad_norm": 0.5686856026931271,
6754
+ "learning_rate": 6.944666666666667e-06,
6755
+ "loss": 2.38,
6756
+ "step": 18750
6757
+ },
6758
+ {
6759
+ "epoch": 0.3755,
6760
+ "grad_norm": 0.5531747783480245,
6761
+ "learning_rate": 6.939111111111112e-06,
6762
+ "loss": 2.3718,
6763
+ "step": 18775
6764
+ },
6765
+ {
6766
+ "epoch": 0.376,
6767
+ "grad_norm": 0.5800045444885175,
6768
+ "learning_rate": 6.933555555555556e-06,
6769
+ "loss": 2.3703,
6770
+ "step": 18800
6771
+ },
6772
+ {
6773
+ "epoch": 0.376,
6774
+ "eval_loss": 2.391113042831421,
6775
+ "eval_runtime": 31.7446,
6776
+ "eval_samples_per_second": 3.213,
6777
+ "eval_steps_per_second": 1.607,
6778
+ "step": 18800
6779
+ },
6780
+ {
6781
+ "epoch": 0.3765,
6782
+ "grad_norm": 0.5451395919825731,
6783
+ "learning_rate": 6.928e-06,
6784
+ "loss": 2.3746,
6785
+ "step": 18825
6786
+ },
6787
+ {
6788
+ "epoch": 0.377,
6789
+ "grad_norm": 0.5619738492106079,
6790
+ "learning_rate": 6.922444444444445e-06,
6791
+ "loss": 2.3815,
6792
+ "step": 18850
6793
+ },
6794
+ {
6795
+ "epoch": 0.3775,
6796
+ "grad_norm": 0.5811440137998495,
6797
+ "learning_rate": 6.91688888888889e-06,
6798
+ "loss": 2.3655,
6799
+ "step": 18875
6800
+ },
6801
+ {
6802
+ "epoch": 0.378,
6803
+ "grad_norm": 0.5528301840539304,
6804
+ "learning_rate": 6.9113333333333345e-06,
6805
+ "loss": 2.3721,
6806
+ "step": 18900
6807
+ },
6808
+ {
6809
+ "epoch": 0.378,
6810
+ "eval_loss": 2.3908257484436035,
6811
+ "eval_runtime": 31.6268,
6812
+ "eval_samples_per_second": 3.225,
6813
+ "eval_steps_per_second": 1.613,
6814
+ "step": 18900
6815
+ },
6816
+ {
6817
+ "epoch": 0.3785,
6818
+ "grad_norm": 0.5791069800351532,
6819
+ "learning_rate": 6.905777777777778e-06,
6820
+ "loss": 2.3798,
6821
+ "step": 18925
6822
+ },
6823
+ {
6824
+ "epoch": 0.379,
6825
+ "grad_norm": 0.5692008495737035,
6826
+ "learning_rate": 6.9002222222222224e-06,
6827
+ "loss": 2.3723,
6828
+ "step": 18950
6829
+ },
6830
+ {
6831
+ "epoch": 0.3795,
6832
+ "grad_norm": 0.5614405054433378,
6833
+ "learning_rate": 6.894666666666668e-06,
6834
+ "loss": 2.3739,
6835
+ "step": 18975
6836
+ },
6837
+ {
6838
+ "epoch": 0.38,
6839
+ "grad_norm": 0.5641420025760586,
6840
+ "learning_rate": 6.889111111111112e-06,
6841
+ "loss": 2.3728,
6842
+ "step": 19000
6843
+ },
6844
+ {
6845
+ "epoch": 0.38,
6846
+ "eval_loss": 2.390749454498291,
6847
+ "eval_runtime": 31.8098,
6848
+ "eval_samples_per_second": 3.207,
6849
+ "eval_steps_per_second": 1.603,
6850
+ "step": 19000
6851
  }
6852
  ],
6853
  "logging_steps": 25,
 
6867
  "attributes": {}
6868
  }
6869
  },
6870
+ "total_flos": 6.048084366821687e+19,
6871
  "train_batch_size": 1,
6872
  "trial_name": null,
6873
  "trial_params": null