irodkin commited on
Commit
c766656
·
verified ·
1 Parent(s): 4816e4d

Training checkpoint at step 22000

Browse files
Files changed (1) hide show
  1. trainer_state.json +366 -6
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 20900,
3
- "best_metric": 2.388044595718384,
4
- "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-19000",
5
- "epoch": 0.42,
6
  "eval_steps": 100,
7
- "global_step": 21000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -7568,6 +7568,366 @@
7568
  "eval_samples_per_second": 3.209,
7569
  "eval_steps_per_second": 1.604,
7570
  "step": 21000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7571
  }
7572
  ],
7573
  "logging_steps": 25,
@@ -7587,7 +7947,7 @@
7587
  "attributes": {}
7588
  }
7589
  },
7590
- "total_flos": 6.684724826487128e+19,
7591
  "train_batch_size": 1,
7592
  "trial_name": null,
7593
  "trial_params": null
 
1
  {
2
+ "best_global_step": 22000,
3
+ "best_metric": 2.3865110874176025,
4
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_21/checkpoint-22000",
5
+ "epoch": 0.44,
6
  "eval_steps": 100,
7
+ "global_step": 22000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
7568
  "eval_samples_per_second": 3.209,
7569
  "eval_steps_per_second": 1.604,
7570
  "step": 21000
7571
+ },
7572
+ {
7573
+ "epoch": 0.4205,
7574
+ "grad_norm": 0.5447308357523696,
7575
+ "learning_rate": 6.439111111111111e-06,
7576
+ "loss": 2.3803,
7577
+ "step": 21025
7578
+ },
7579
+ {
7580
+ "epoch": 0.421,
7581
+ "grad_norm": 0.5426314550064573,
7582
+ "learning_rate": 6.4335555555555566e-06,
7583
+ "loss": 2.3798,
7584
+ "step": 21050
7585
+ },
7586
+ {
7587
+ "epoch": 0.4215,
7588
+ "grad_norm": 0.5623213994558643,
7589
+ "learning_rate": 6.428000000000001e-06,
7590
+ "loss": 2.3855,
7591
+ "step": 21075
7592
+ },
7593
+ {
7594
+ "epoch": 0.422,
7595
+ "grad_norm": 0.551782200199429,
7596
+ "learning_rate": 6.4224444444444445e-06,
7597
+ "loss": 2.3744,
7598
+ "step": 21100
7599
+ },
7600
+ {
7601
+ "epoch": 0.422,
7602
+ "eval_loss": 2.3879234790802,
7603
+ "eval_runtime": 31.7247,
7604
+ "eval_samples_per_second": 3.215,
7605
+ "eval_steps_per_second": 1.608,
7606
+ "step": 21100
7607
+ },
7608
+ {
7609
+ "epoch": 0.4225,
7610
+ "grad_norm": 0.527718965025146,
7611
+ "learning_rate": 6.416888888888889e-06,
7612
+ "loss": 2.3629,
7613
+ "step": 21125
7614
+ },
7615
+ {
7616
+ "epoch": 0.423,
7617
+ "grad_norm": 0.5608708238117702,
7618
+ "learning_rate": 6.411333333333334e-06,
7619
+ "loss": 2.3775,
7620
+ "step": 21150
7621
+ },
7622
+ {
7623
+ "epoch": 0.4235,
7624
+ "grad_norm": 0.5448339479028284,
7625
+ "learning_rate": 6.405777777777779e-06,
7626
+ "loss": 2.379,
7627
+ "step": 21175
7628
+ },
7629
+ {
7630
+ "epoch": 0.424,
7631
+ "grad_norm": 0.5418336159854089,
7632
+ "learning_rate": 6.400222222222223e-06,
7633
+ "loss": 2.3771,
7634
+ "step": 21200
7635
+ },
7636
+ {
7637
+ "epoch": 0.424,
7638
+ "eval_loss": 2.3878672122955322,
7639
+ "eval_runtime": 31.8891,
7640
+ "eval_samples_per_second": 3.199,
7641
+ "eval_steps_per_second": 1.599,
7642
+ "step": 21200
7643
+ },
7644
+ {
7645
+ "epoch": 0.4245,
7646
+ "grad_norm": 0.5765916975285049,
7647
+ "learning_rate": 6.3946666666666665e-06,
7648
+ "loss": 2.3838,
7649
+ "step": 21225
7650
+ },
7651
+ {
7652
+ "epoch": 0.425,
7653
+ "grad_norm": 0.5482787584221817,
7654
+ "learning_rate": 6.389111111111112e-06,
7655
+ "loss": 2.3751,
7656
+ "step": 21250
7657
+ },
7658
+ {
7659
+ "epoch": 0.4255,
7660
+ "grad_norm": 0.5592623692636863,
7661
+ "learning_rate": 6.383555555555556e-06,
7662
+ "loss": 2.3714,
7663
+ "step": 21275
7664
+ },
7665
+ {
7666
+ "epoch": 0.426,
7667
+ "grad_norm": 0.5502456266750644,
7668
+ "learning_rate": 6.378000000000001e-06,
7669
+ "loss": 2.3687,
7670
+ "step": 21300
7671
+ },
7672
+ {
7673
+ "epoch": 0.426,
7674
+ "eval_loss": 2.387702226638794,
7675
+ "eval_runtime": 31.8474,
7676
+ "eval_samples_per_second": 3.203,
7677
+ "eval_steps_per_second": 1.601,
7678
+ "step": 21300
7679
+ },
7680
+ {
7681
+ "epoch": 0.4265,
7682
+ "grad_norm": 0.5508844144432443,
7683
+ "learning_rate": 6.372444444444444e-06,
7684
+ "loss": 2.3705,
7685
+ "step": 21325
7686
+ },
7687
+ {
7688
+ "epoch": 0.427,
7689
+ "grad_norm": 0.5551955771008479,
7690
+ "learning_rate": 6.366888888888889e-06,
7691
+ "loss": 2.3616,
7692
+ "step": 21350
7693
+ },
7694
+ {
7695
+ "epoch": 0.4275,
7696
+ "grad_norm": 0.5482174863813819,
7697
+ "learning_rate": 6.361333333333334e-06,
7698
+ "loss": 2.3679,
7699
+ "step": 21375
7700
+ },
7701
+ {
7702
+ "epoch": 0.428,
7703
+ "grad_norm": 0.540793837360148,
7704
+ "learning_rate": 6.355777777777778e-06,
7705
+ "loss": 2.3724,
7706
+ "step": 21400
7707
+ },
7708
+ {
7709
+ "epoch": 0.428,
7710
+ "eval_loss": 2.3876450061798096,
7711
+ "eval_runtime": 32.2051,
7712
+ "eval_samples_per_second": 3.167,
7713
+ "eval_steps_per_second": 1.584,
7714
+ "step": 21400
7715
+ },
7716
+ {
7717
+ "epoch": 0.4285,
7718
+ "grad_norm": 0.5478812262209652,
7719
+ "learning_rate": 6.3502222222222235e-06,
7720
+ "loss": 2.3639,
7721
+ "step": 21425
7722
+ },
7723
+ {
7724
+ "epoch": 0.429,
7725
+ "grad_norm": 0.5598419449976438,
7726
+ "learning_rate": 6.344666666666667e-06,
7727
+ "loss": 2.3686,
7728
+ "step": 21450
7729
+ },
7730
+ {
7731
+ "epoch": 0.4295,
7732
+ "grad_norm": 0.5650989625187698,
7733
+ "learning_rate": 6.339111111111111e-06,
7734
+ "loss": 2.3755,
7735
+ "step": 21475
7736
+ },
7737
+ {
7738
+ "epoch": 0.43,
7739
+ "grad_norm": 0.5521104434834965,
7740
+ "learning_rate": 6.333555555555556e-06,
7741
+ "loss": 2.3819,
7742
+ "step": 21500
7743
+ },
7744
+ {
7745
+ "epoch": 0.43,
7746
+ "eval_loss": 2.386732578277588,
7747
+ "eval_runtime": 32.423,
7748
+ "eval_samples_per_second": 3.146,
7749
+ "eval_steps_per_second": 1.573,
7750
+ "step": 21500
7751
+ },
7752
+ {
7753
+ "epoch": 0.4305,
7754
+ "grad_norm": 0.5718504697288973,
7755
+ "learning_rate": 6.328000000000001e-06,
7756
+ "loss": 2.3768,
7757
+ "step": 21525
7758
+ },
7759
+ {
7760
+ "epoch": 0.431,
7761
+ "grad_norm": 0.5647383482527034,
7762
+ "learning_rate": 6.3224444444444455e-06,
7763
+ "loss": 2.3634,
7764
+ "step": 21550
7765
+ },
7766
+ {
7767
+ "epoch": 0.4315,
7768
+ "grad_norm": 0.5740444089490578,
7769
+ "learning_rate": 6.316888888888889e-06,
7770
+ "loss": 2.3683,
7771
+ "step": 21575
7772
+ },
7773
+ {
7774
+ "epoch": 0.432,
7775
+ "grad_norm": 0.5468815860778439,
7776
+ "learning_rate": 6.3113333333333334e-06,
7777
+ "loss": 2.3775,
7778
+ "step": 21600
7779
+ },
7780
+ {
7781
+ "epoch": 0.432,
7782
+ "eval_loss": 2.386624813079834,
7783
+ "eval_runtime": 32.2361,
7784
+ "eval_samples_per_second": 3.164,
7785
+ "eval_steps_per_second": 1.582,
7786
+ "step": 21600
7787
+ },
7788
+ {
7789
+ "epoch": 0.4325,
7790
+ "grad_norm": 0.5491782166979611,
7791
+ "learning_rate": 6.305777777777779e-06,
7792
+ "loss": 2.3678,
7793
+ "step": 21625
7794
+ },
7795
+ {
7796
+ "epoch": 0.433,
7797
+ "grad_norm": 0.5493956319744467,
7798
+ "learning_rate": 6.300222222222223e-06,
7799
+ "loss": 2.3632,
7800
+ "step": 21650
7801
+ },
7802
+ {
7803
+ "epoch": 0.4335,
7804
+ "grad_norm": 0.5517199994093782,
7805
+ "learning_rate": 6.294666666666667e-06,
7806
+ "loss": 2.3719,
7807
+ "step": 21675
7808
+ },
7809
+ {
7810
+ "epoch": 0.434,
7811
+ "grad_norm": 0.5480082798934808,
7812
+ "learning_rate": 6.289111111111111e-06,
7813
+ "loss": 2.3705,
7814
+ "step": 21700
7815
+ },
7816
+ {
7817
+ "epoch": 0.434,
7818
+ "eval_loss": 2.386605978012085,
7819
+ "eval_runtime": 31.811,
7820
+ "eval_samples_per_second": 3.206,
7821
+ "eval_steps_per_second": 1.603,
7822
+ "step": 21700
7823
+ },
7824
+ {
7825
+ "epoch": 0.4345,
7826
+ "grad_norm": 0.5988374708555845,
7827
+ "learning_rate": 6.283555555555556e-06,
7828
+ "loss": 2.3736,
7829
+ "step": 21725
7830
+ },
7831
+ {
7832
+ "epoch": 0.435,
7833
+ "grad_norm": 0.5394989364015422,
7834
+ "learning_rate": 6.278000000000001e-06,
7835
+ "loss": 2.38,
7836
+ "step": 21750
7837
+ },
7838
+ {
7839
+ "epoch": 0.4355,
7840
+ "grad_norm": 0.5660475248416822,
7841
+ "learning_rate": 6.272444444444445e-06,
7842
+ "loss": 2.3712,
7843
+ "step": 21775
7844
+ },
7845
+ {
7846
+ "epoch": 0.436,
7847
+ "grad_norm": 0.5824076374736812,
7848
+ "learning_rate": 6.266888888888889e-06,
7849
+ "loss": 2.3781,
7850
+ "step": 21800
7851
+ },
7852
+ {
7853
+ "epoch": 0.436,
7854
+ "eval_loss": 2.3868014812469482,
7855
+ "eval_runtime": 32.0011,
7856
+ "eval_samples_per_second": 3.187,
7857
+ "eval_steps_per_second": 1.594,
7858
+ "step": 21800
7859
+ },
7860
+ {
7861
+ "epoch": 0.4365,
7862
+ "grad_norm": 0.5604649354431509,
7863
+ "learning_rate": 6.261333333333334e-06,
7864
+ "loss": 2.3673,
7865
+ "step": 21825
7866
+ },
7867
+ {
7868
+ "epoch": 0.437,
7869
+ "grad_norm": 0.5581917280058185,
7870
+ "learning_rate": 6.255777777777778e-06,
7871
+ "loss": 2.3575,
7872
+ "step": 21850
7873
+ },
7874
+ {
7875
+ "epoch": 0.4375,
7876
+ "grad_norm": 0.5682187519985219,
7877
+ "learning_rate": 6.250222222222223e-06,
7878
+ "loss": 2.3752,
7879
+ "step": 21875
7880
+ },
7881
+ {
7882
+ "epoch": 0.438,
7883
+ "grad_norm": 0.5343819916754123,
7884
+ "learning_rate": 6.244666666666666e-06,
7885
+ "loss": 2.3688,
7886
+ "step": 21900
7887
+ },
7888
+ {
7889
+ "epoch": 0.438,
7890
+ "eval_loss": 2.3865694999694824,
7891
+ "eval_runtime": 31.8681,
7892
+ "eval_samples_per_second": 3.201,
7893
+ "eval_steps_per_second": 1.6,
7894
+ "step": 21900
7895
+ },
7896
+ {
7897
+ "epoch": 0.4385,
7898
+ "grad_norm": 0.6084740129821103,
7899
+ "learning_rate": 6.2391111111111115e-06,
7900
+ "loss": 2.3611,
7901
+ "step": 21925
7902
+ },
7903
+ {
7904
+ "epoch": 0.439,
7905
+ "grad_norm": 0.5550908983577711,
7906
+ "learning_rate": 6.233555555555556e-06,
7907
+ "loss": 2.364,
7908
+ "step": 21950
7909
+ },
7910
+ {
7911
+ "epoch": 0.4395,
7912
+ "grad_norm": 0.5605896822575689,
7913
+ "learning_rate": 6.228e-06,
7914
+ "loss": 2.3875,
7915
+ "step": 21975
7916
+ },
7917
+ {
7918
+ "epoch": 0.44,
7919
+ "grad_norm": 0.5679795530728957,
7920
+ "learning_rate": 6.222444444444446e-06,
7921
+ "loss": 2.3637,
7922
+ "step": 22000
7923
+ },
7924
+ {
7925
+ "epoch": 0.44,
7926
+ "eval_loss": 2.3865110874176025,
7927
+ "eval_runtime": 31.8116,
7928
+ "eval_samples_per_second": 3.206,
7929
+ "eval_steps_per_second": 1.603,
7930
+ "step": 22000
7931
  }
7932
  ],
7933
  "logging_steps": 25,
 
7947
  "attributes": {}
7948
  }
7949
  },
7950
+ "total_flos": 7.0030450563198484e+19,
7951
  "train_batch_size": 1,
7952
  "trial_name": null,
7953
  "trial_params": null