mtzig commited on
Commit
b729e31
·
verified ·
1 Parent(s): 3427c7f

Training in progress, step 6100, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:407cea8cd4c1444b6fd3dbbc1796efb64886678cd52d2935445d4ee150b19cd9
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4558b733d32e90c4e5c89bcba7e81f8b773afc6aa52a225d4a1952b193271193
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f09e4f286d588fdd8dee70e7788283d8f82c437d873e13a263f824d89ba1dc09
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:481608ef601eeee9cd85ec29231d62de3814d11712fe3bb63383faaa39db9e5b
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7ce1f760bbd4c96a2756283dc0ed0049eaa28a856cc915b2efea1a4cad775044
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:596f270fa924dc50f57e12f2747dd1d30dfc07fc2ee00e143030c1b9a7de0239
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4119461e04c64bd9cb35fc4677eb47b0256885eb2bf830e5e575de68f0787410
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9be32303f0039603765d77ac706bef56128491b375b7cab5a7ca9e2dd0c20e1
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1789239bff9adb9c6876b4d099f2ed19463d2be8a749c02ae1a04bf9c4fab87a
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1dc871b1d1595e1e47cbc3a3462b01da1390680ed602cc4977fcc0ae598b0ab
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4821bd33219546f03dfe0ef15028c7679b8d9837b37430def9e4de554b5dc22a
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7d82623ea7825bea9aa6e58232cb5ab536747b4e2584fee539f8ebb85840589
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e7914c18071ba453e15120e4e8596755dd9d2166fc0ded479a8498bd53bfc83d
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8827dca82cdf8c9dc0048ecc8da1ac0c4a5995aa9c070303bd1e4628bd21c2b1
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dfaec33f43af8375c51ba9ca0f8679ccb2f8f39889358a6c520af5ba2029ceed
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6644c5e24b376442f37af7277f310848ba0091903a3e17bb78348c667f27d6a
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6c51cd242b6ad96b1a7bd50ac0129e12f629372d44073ce6176ca7a37443f9b6
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6aa8e8f4afb4ad3590db680bccacca81a9fea479e638f91fd5eb34e67e733103
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:67dc8d7c29a337d2af8cab636481f46a6a24034554d74820938adde6717b070b
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2039e0dd851cf50efc5c92eae55ef9d90644f479d007e1a04912e5dfe8b441a
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:480547ac130fa2a4d7ed2c72cff8ffd28b33c257079ad7f33a9553e30ee18b86
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:704157ddb23baa7ea252d705881891eb9017ede4c98afdcc2fe424b1da003854
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:93d2249e6619e5c532aedb71a6fa0b27cb8510666f06ef4286647cbebdeb62f8
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f75f460626823b08c0b5d748bd6e356df4fad31b4d6f1bee0ea68d6dd231541
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4055e4142f36e5b7ad8acd183073cd010060ffca6c79c7221bfc55a921e1e477
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ea03a23b5e2bdcb4bd9a8db175e30d4861f4d46b3e4ebdc845dc49850878e7a
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.8852821836960532,
5
  "eval_steps": 20,
6
- "global_step": 6000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -45619,6 +45619,766 @@
45619
  "eval_samples_per_second": 5.864,
45620
  "eval_steps_per_second": 0.201,
45621
  "step": 6000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45622
  }
45623
  ],
45624
  "logging_steps": 1,
@@ -45638,7 +46398,7 @@
45638
  "attributes": {}
45639
  }
45640
  },
45641
- "total_flos": 1.8481947946526966e+18,
45642
  "train_batch_size": 8,
45643
  "trial_name": null,
45644
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.900036886757654,
5
  "eval_steps": 20,
6
+ "global_step": 6100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
45619
  "eval_samples_per_second": 5.864,
45620
  "eval_steps_per_second": 0.201,
45621
  "step": 6000
45622
+ },
45623
+ {
45624
+ "epoch": 0.8854297307266691,
45625
+ "grad_norm": 1.9520748853683472,
45626
+ "learning_rate": 7.882895975685912e-07,
45627
+ "loss": 0.039,
45628
+ "step": 6001
45629
+ },
45630
+ {
45631
+ "epoch": 0.8855772777572851,
45632
+ "grad_norm": 1.3660613298416138,
45633
+ "learning_rate": 7.862862685205296e-07,
45634
+ "loss": 0.0214,
45635
+ "step": 6002
45636
+ },
45637
+ {
45638
+ "epoch": 0.8857248247879012,
45639
+ "grad_norm": 3.9320688247680664,
45640
+ "learning_rate": 7.842853841252463e-07,
45641
+ "loss": 0.0278,
45642
+ "step": 6003
45643
+ },
45644
+ {
45645
+ "epoch": 0.8858723718185172,
45646
+ "grad_norm": 1.4639867544174194,
45647
+ "learning_rate": 7.822869449136328e-07,
45648
+ "loss": 0.0421,
45649
+ "step": 6004
45650
+ },
45651
+ {
45652
+ "epoch": 0.8860199188491331,
45653
+ "grad_norm": 3.7693629264831543,
45654
+ "learning_rate": 7.802909514159285e-07,
45655
+ "loss": 0.091,
45656
+ "step": 6005
45657
+ },
45658
+ {
45659
+ "epoch": 0.8861674658797491,
45660
+ "grad_norm": 1.5250922441482544,
45661
+ "learning_rate": 7.782974041617253e-07,
45662
+ "loss": 0.0195,
45663
+ "step": 6006
45664
+ },
45665
+ {
45666
+ "epoch": 0.8863150129103652,
45667
+ "grad_norm": 3.0657765865325928,
45668
+ "learning_rate": 7.763063036799701e-07,
45669
+ "loss": 0.0338,
45670
+ "step": 6007
45671
+ },
45672
+ {
45673
+ "epoch": 0.8864625599409812,
45674
+ "grad_norm": 1.9614242315292358,
45675
+ "learning_rate": 7.743176504989513e-07,
45676
+ "loss": 0.0447,
45677
+ "step": 6008
45678
+ },
45679
+ {
45680
+ "epoch": 0.8866101069715971,
45681
+ "grad_norm": 3.2453866004943848,
45682
+ "learning_rate": 7.723314451463193e-07,
45683
+ "loss": 0.137,
45684
+ "step": 6009
45685
+ },
45686
+ {
45687
+ "epoch": 0.8867576540022132,
45688
+ "grad_norm": 2.51401424407959,
45689
+ "learning_rate": 7.703476881490634e-07,
45690
+ "loss": 0.098,
45691
+ "step": 6010
45692
+ },
45693
+ {
45694
+ "epoch": 0.8869052010328292,
45695
+ "grad_norm": 3.9559733867645264,
45696
+ "learning_rate": 7.683663800335328e-07,
45697
+ "loss": 0.0941,
45698
+ "step": 6011
45699
+ },
45700
+ {
45701
+ "epoch": 0.8870527480634453,
45702
+ "grad_norm": 1.9767736196517944,
45703
+ "learning_rate": 7.663875213254246e-07,
45704
+ "loss": 0.0472,
45705
+ "step": 6012
45706
+ },
45707
+ {
45708
+ "epoch": 0.8872002950940613,
45709
+ "grad_norm": 1.6465672254562378,
45710
+ "learning_rate": 7.644111125497822e-07,
45711
+ "loss": 0.0145,
45712
+ "step": 6013
45713
+ },
45714
+ {
45715
+ "epoch": 0.8873478421246772,
45716
+ "grad_norm": 1.3866339921951294,
45717
+ "learning_rate": 7.624371542310005e-07,
45718
+ "loss": 0.0386,
45719
+ "step": 6014
45720
+ },
45721
+ {
45722
+ "epoch": 0.8874953891552932,
45723
+ "grad_norm": 2.0481443405151367,
45724
+ "learning_rate": 7.604656468928262e-07,
45725
+ "loss": 0.0383,
45726
+ "step": 6015
45727
+ },
45728
+ {
45729
+ "epoch": 0.8876429361859093,
45730
+ "grad_norm": 3.9279582500457764,
45731
+ "learning_rate": 7.584965910583564e-07,
45732
+ "loss": 0.0489,
45733
+ "step": 6016
45734
+ },
45735
+ {
45736
+ "epoch": 0.8877904832165253,
45737
+ "grad_norm": 1.7083287239074707,
45738
+ "learning_rate": 7.565299872500331e-07,
45739
+ "loss": 0.0244,
45740
+ "step": 6017
45741
+ },
45742
+ {
45743
+ "epoch": 0.8879380302471412,
45744
+ "grad_norm": 1.2823542356491089,
45745
+ "learning_rate": 7.545658359896547e-07,
45746
+ "loss": 0.0164,
45747
+ "step": 6018
45748
+ },
45749
+ {
45750
+ "epoch": 0.8880855772777573,
45751
+ "grad_norm": 0.9202921390533447,
45752
+ "learning_rate": 7.526041377983596e-07,
45753
+ "loss": 0.0263,
45754
+ "step": 6019
45755
+ },
45756
+ {
45757
+ "epoch": 0.8882331243083733,
45758
+ "grad_norm": 1.0891423225402832,
45759
+ "learning_rate": 7.506448931966436e-07,
45760
+ "loss": 0.0291,
45761
+ "step": 6020
45762
+ },
45763
+ {
45764
+ "epoch": 0.8882331243083733,
45765
+ "eval_accuracy": 0.9782923299565847,
45766
+ "eval_f1": 0.9629629629629629,
45767
+ "eval_loss": 0.05567174404859543,
45768
+ "eval_precision": 0.9798994974874372,
45769
+ "eval_recall": 0.9466019417475728,
45770
+ "eval_runtime": 48.4257,
45771
+ "eval_samples_per_second": 6.009,
45772
+ "eval_steps_per_second": 0.207,
45773
+ "step": 6020
45774
+ },
45775
+ {
45776
+ "epoch": 0.8883806713389893,
45777
+ "grad_norm": 1.6009353399276733,
45778
+ "learning_rate": 7.486881027043491e-07,
45779
+ "loss": 0.0527,
45780
+ "step": 6021
45781
+ },
45782
+ {
45783
+ "epoch": 0.8885282183696053,
45784
+ "grad_norm": 6.035427570343018,
45785
+ "learning_rate": 7.467337668406638e-07,
45786
+ "loss": 0.0817,
45787
+ "step": 6022
45788
+ },
45789
+ {
45790
+ "epoch": 0.8886757654002213,
45791
+ "grad_norm": 1.1803969144821167,
45792
+ "learning_rate": 7.447818861241308e-07,
45793
+ "loss": 0.0216,
45794
+ "step": 6023
45795
+ },
45796
+ {
45797
+ "epoch": 0.8888233124308373,
45798
+ "grad_norm": 2.7183499336242676,
45799
+ "learning_rate": 7.428324610726345e-07,
45800
+ "loss": 0.0815,
45801
+ "step": 6024
45802
+ },
45803
+ {
45804
+ "epoch": 0.8889708594614534,
45805
+ "grad_norm": 1.5589689016342163,
45806
+ "learning_rate": 7.408854922034126e-07,
45807
+ "loss": 0.0311,
45808
+ "step": 6025
45809
+ },
45810
+ {
45811
+ "epoch": 0.8891184064920693,
45812
+ "grad_norm": 3.315922498703003,
45813
+ "learning_rate": 7.389409800330516e-07,
45814
+ "loss": 0.0731,
45815
+ "step": 6026
45816
+ },
45817
+ {
45818
+ "epoch": 0.8892659535226853,
45819
+ "grad_norm": 1.9708974361419678,
45820
+ "learning_rate": 7.369989250774812e-07,
45821
+ "loss": 0.0703,
45822
+ "step": 6027
45823
+ },
45824
+ {
45825
+ "epoch": 0.8894135005533014,
45826
+ "grad_norm": 1.7425222396850586,
45827
+ "learning_rate": 7.350593278519824e-07,
45828
+ "loss": 0.0496,
45829
+ "step": 6028
45830
+ },
45831
+ {
45832
+ "epoch": 0.8895610475839174,
45833
+ "grad_norm": 1.3247371912002563,
45834
+ "learning_rate": 7.331221888711859e-07,
45835
+ "loss": 0.0324,
45836
+ "step": 6029
45837
+ },
45838
+ {
45839
+ "epoch": 0.8897085946145333,
45840
+ "grad_norm": 1.3036816120147705,
45841
+ "learning_rate": 7.311875086490683e-07,
45842
+ "loss": 0.033,
45843
+ "step": 6030
45844
+ },
45845
+ {
45846
+ "epoch": 0.8898561416451494,
45847
+ "grad_norm": 2.608323097229004,
45848
+ "learning_rate": 7.292552876989511e-07,
45849
+ "loss": 0.0983,
45850
+ "step": 6031
45851
+ },
45852
+ {
45853
+ "epoch": 0.8900036886757654,
45854
+ "grad_norm": 2.148296356201172,
45855
+ "learning_rate": 7.273255265335088e-07,
45856
+ "loss": 0.0705,
45857
+ "step": 6032
45858
+ },
45859
+ {
45860
+ "epoch": 0.8901512357063814,
45861
+ "grad_norm": 2.8998477458953857,
45862
+ "learning_rate": 7.253982256647574e-07,
45863
+ "loss": 0.0512,
45864
+ "step": 6033
45865
+ },
45866
+ {
45867
+ "epoch": 0.8902987827369974,
45868
+ "grad_norm": 3.0811049938201904,
45869
+ "learning_rate": 7.234733856040654e-07,
45870
+ "loss": 0.1346,
45871
+ "step": 6034
45872
+ },
45873
+ {
45874
+ "epoch": 0.8904463297676134,
45875
+ "grad_norm": 2.189905881881714,
45876
+ "learning_rate": 7.215510068621467e-07,
45877
+ "loss": 0.0503,
45878
+ "step": 6035
45879
+ },
45880
+ {
45881
+ "epoch": 0.8905938767982294,
45882
+ "grad_norm": 3.884209394454956,
45883
+ "learning_rate": 7.196310899490577e-07,
45884
+ "loss": 0.0547,
45885
+ "step": 6036
45886
+ },
45887
+ {
45888
+ "epoch": 0.8907414238288455,
45889
+ "grad_norm": 1.5360444784164429,
45890
+ "learning_rate": 7.177136353742098e-07,
45891
+ "loss": 0.0523,
45892
+ "step": 6037
45893
+ },
45894
+ {
45895
+ "epoch": 0.8908889708594615,
45896
+ "grad_norm": 1.0694087743759155,
45897
+ "learning_rate": 7.157986436463537e-07,
45898
+ "loss": 0.022,
45899
+ "step": 6038
45900
+ },
45901
+ {
45902
+ "epoch": 0.8910365178900774,
45903
+ "grad_norm": 1.7816051244735718,
45904
+ "learning_rate": 7.138861152735898e-07,
45905
+ "loss": 0.0618,
45906
+ "step": 6039
45907
+ },
45908
+ {
45909
+ "epoch": 0.8911840649206935,
45910
+ "grad_norm": 2.7336528301239014,
45911
+ "learning_rate": 7.119760507633678e-07,
45912
+ "loss": 0.0872,
45913
+ "step": 6040
45914
+ },
45915
+ {
45916
+ "epoch": 0.8911840649206935,
45917
+ "eval_accuracy": 0.9782923299565847,
45918
+ "eval_f1": 0.9629629629629629,
45919
+ "eval_loss": 0.05596858263015747,
45920
+ "eval_precision": 0.9798994974874372,
45921
+ "eval_recall": 0.9466019417475728,
45922
+ "eval_runtime": 48.9314,
45923
+ "eval_samples_per_second": 5.947,
45924
+ "eval_steps_per_second": 0.204,
45925
+ "step": 6040
45926
+ },
45927
+ {
45928
+ "epoch": 0.8913316119513095,
45929
+ "grad_norm": 1.510573148727417,
45930
+ "learning_rate": 7.100684506224775e-07,
45931
+ "loss": 0.0494,
45932
+ "step": 6041
45933
+ },
45934
+ {
45935
+ "epoch": 0.8914791589819255,
45936
+ "grad_norm": 4.796574115753174,
45937
+ "learning_rate": 7.081633153570577e-07,
45938
+ "loss": 0.0196,
45939
+ "step": 6042
45940
+ },
45941
+ {
45942
+ "epoch": 0.8916267060125415,
45943
+ "grad_norm": 1.6497973203659058,
45944
+ "learning_rate": 7.06260645472594e-07,
45945
+ "loss": 0.0334,
45946
+ "step": 6043
45947
+ },
45948
+ {
45949
+ "epoch": 0.8917742530431575,
45950
+ "grad_norm": 1.3166993856430054,
45951
+ "learning_rate": 7.043604414739203e-07,
45952
+ "loss": 0.0214,
45953
+ "step": 6044
45954
+ },
45955
+ {
45956
+ "epoch": 0.8919218000737735,
45957
+ "grad_norm": 3.4348347187042236,
45958
+ "learning_rate": 7.024627038652077e-07,
45959
+ "loss": 0.0721,
45960
+ "step": 6045
45961
+ },
45962
+ {
45963
+ "epoch": 0.8920693471043896,
45964
+ "grad_norm": 2.7059199810028076,
45965
+ "learning_rate": 7.005674331499834e-07,
45966
+ "loss": 0.0711,
45967
+ "step": 6046
45968
+ },
45969
+ {
45970
+ "epoch": 0.8922168941350055,
45971
+ "grad_norm": 2.4796156883239746,
45972
+ "learning_rate": 6.98674629831112e-07,
45973
+ "loss": 0.0405,
45974
+ "step": 6047
45975
+ },
45976
+ {
45977
+ "epoch": 0.8923644411656215,
45978
+ "grad_norm": 5.228332042694092,
45979
+ "learning_rate": 6.96784294410806e-07,
45980
+ "loss": 0.1015,
45981
+ "step": 6048
45982
+ },
45983
+ {
45984
+ "epoch": 0.8925119881962376,
45985
+ "grad_norm": 2.4152002334594727,
45986
+ "learning_rate": 6.948964273906278e-07,
45987
+ "loss": 0.0533,
45988
+ "step": 6049
45989
+ },
45990
+ {
45991
+ "epoch": 0.8926595352268536,
45992
+ "grad_norm": 3.5302391052246094,
45993
+ "learning_rate": 6.930110292714765e-07,
45994
+ "loss": 0.0874,
45995
+ "step": 6050
45996
+ },
45997
+ {
45998
+ "epoch": 0.8928070822574695,
45999
+ "grad_norm": 1.5268058776855469,
46000
+ "learning_rate": 6.911281005536031e-07,
46001
+ "loss": 0.037,
46002
+ "step": 6051
46003
+ },
46004
+ {
46005
+ "epoch": 0.8929546292880856,
46006
+ "grad_norm": 2.8932971954345703,
46007
+ "learning_rate": 6.892476417365989e-07,
46008
+ "loss": 0.0532,
46009
+ "step": 6052
46010
+ },
46011
+ {
46012
+ "epoch": 0.8931021763187016,
46013
+ "grad_norm": 2.512969732284546,
46014
+ "learning_rate": 6.873696533193996e-07,
46015
+ "loss": 0.0757,
46016
+ "step": 6053
46017
+ },
46018
+ {
46019
+ "epoch": 0.8932497233493176,
46020
+ "grad_norm": 3.8057785034179688,
46021
+ "learning_rate": 6.85494135800292e-07,
46022
+ "loss": 0.036,
46023
+ "step": 6054
46024
+ },
46025
+ {
46026
+ "epoch": 0.8933972703799336,
46027
+ "grad_norm": 3.723708391189575,
46028
+ "learning_rate": 6.836210896769014e-07,
46029
+ "loss": 0.0765,
46030
+ "step": 6055
46031
+ },
46032
+ {
46033
+ "epoch": 0.8935448174105496,
46034
+ "grad_norm": 2.7616844177246094,
46035
+ "learning_rate": 6.817505154461956e-07,
46036
+ "loss": 0.0504,
46037
+ "step": 6056
46038
+ },
46039
+ {
46040
+ "epoch": 0.8936923644411656,
46041
+ "grad_norm": 2.264693260192871,
46042
+ "learning_rate": 6.798824136044913e-07,
46043
+ "loss": 0.0518,
46044
+ "step": 6057
46045
+ },
46046
+ {
46047
+ "epoch": 0.8938399114717817,
46048
+ "grad_norm": 4.492088317871094,
46049
+ "learning_rate": 6.780167846474506e-07,
46050
+ "loss": 0.1116,
46051
+ "step": 6058
46052
+ },
46053
+ {
46054
+ "epoch": 0.8939874585023977,
46055
+ "grad_norm": 5.52939510345459,
46056
+ "learning_rate": 6.761536290700721e-07,
46057
+ "loss": 0.0688,
46058
+ "step": 6059
46059
+ },
46060
+ {
46061
+ "epoch": 0.8941350055330136,
46062
+ "grad_norm": 2.8091180324554443,
46063
+ "learning_rate": 6.742929473667048e-07,
46064
+ "loss": 0.0819,
46065
+ "step": 6060
46066
+ },
46067
+ {
46068
+ "epoch": 0.8941350055330136,
46069
+ "eval_accuracy": 0.9782923299565847,
46070
+ "eval_f1": 0.9629629629629629,
46071
+ "eval_loss": 0.0556659959256649,
46072
+ "eval_precision": 0.9798994974874372,
46073
+ "eval_recall": 0.9466019417475728,
46074
+ "eval_runtime": 48.6299,
46075
+ "eval_samples_per_second": 5.984,
46076
+ "eval_steps_per_second": 0.206,
46077
+ "step": 6060
46078
+ },
46079
+ {
46080
+ "epoch": 0.8942825525636297,
46081
+ "grad_norm": 2.870699167251587,
46082
+ "learning_rate": 6.724347400310371e-07,
46083
+ "loss": 0.0833,
46084
+ "step": 6061
46085
+ },
46086
+ {
46087
+ "epoch": 0.8944300995942457,
46088
+ "grad_norm": 2.0958855152130127,
46089
+ "learning_rate": 6.705790075561047e-07,
46090
+ "loss": 0.0436,
46091
+ "step": 6062
46092
+ },
46093
+ {
46094
+ "epoch": 0.8945776466248617,
46095
+ "grad_norm": 3.7520968914031982,
46096
+ "learning_rate": 6.687257504342848e-07,
46097
+ "loss": 0.0666,
46098
+ "step": 6063
46099
+ },
46100
+ {
46101
+ "epoch": 0.8947251936554776,
46102
+ "grad_norm": 1.8063980340957642,
46103
+ "learning_rate": 6.668749691572951e-07,
46104
+ "loss": 0.0345,
46105
+ "step": 6064
46106
+ },
46107
+ {
46108
+ "epoch": 0.8948727406860937,
46109
+ "grad_norm": 4.453982830047607,
46110
+ "learning_rate": 6.650266642162029e-07,
46111
+ "loss": 0.1076,
46112
+ "step": 6065
46113
+ },
46114
+ {
46115
+ "epoch": 0.8950202877167097,
46116
+ "grad_norm": 2.0510618686676025,
46117
+ "learning_rate": 6.631808361014113e-07,
46118
+ "loss": 0.0227,
46119
+ "step": 6066
46120
+ },
46121
+ {
46122
+ "epoch": 0.8951678347473258,
46123
+ "grad_norm": 2.578852415084839,
46124
+ "learning_rate": 6.613374853026689e-07,
46125
+ "loss": 0.04,
46126
+ "step": 6067
46127
+ },
46128
+ {
46129
+ "epoch": 0.8953153817779417,
46130
+ "grad_norm": 2.0627481937408447,
46131
+ "learning_rate": 6.594966123090718e-07,
46132
+ "loss": 0.0538,
46133
+ "step": 6068
46134
+ },
46135
+ {
46136
+ "epoch": 0.8954629288085577,
46137
+ "grad_norm": 5.8762617111206055,
46138
+ "learning_rate": 6.576582176090518e-07,
46139
+ "loss": 0.0398,
46140
+ "step": 6069
46141
+ },
46142
+ {
46143
+ "epoch": 0.8956104758391737,
46144
+ "grad_norm": 2.373622179031372,
46145
+ "learning_rate": 6.558223016903842e-07,
46146
+ "loss": 0.0969,
46147
+ "step": 6070
46148
+ },
46149
+ {
46150
+ "epoch": 0.8957580228697898,
46151
+ "grad_norm": 1.838910698890686,
46152
+ "learning_rate": 6.539888650401916e-07,
46153
+ "loss": 0.0363,
46154
+ "step": 6071
46155
+ },
46156
+ {
46157
+ "epoch": 0.8959055699004057,
46158
+ "grad_norm": 10.581929206848145,
46159
+ "learning_rate": 6.521579081449325e-07,
46160
+ "loss": 0.0918,
46161
+ "step": 6072
46162
+ },
46163
+ {
46164
+ "epoch": 0.8960531169310217,
46165
+ "grad_norm": 3.3708910942077637,
46166
+ "learning_rate": 6.503294314904108e-07,
46167
+ "loss": 0.059,
46168
+ "step": 6073
46169
+ },
46170
+ {
46171
+ "epoch": 0.8962006639616378,
46172
+ "grad_norm": 2.5104010105133057,
46173
+ "learning_rate": 6.485034355617748e-07,
46174
+ "loss": 0.0224,
46175
+ "step": 6074
46176
+ },
46177
+ {
46178
+ "epoch": 0.8963482109922538,
46179
+ "grad_norm": 1.8458657264709473,
46180
+ "learning_rate": 6.466799208435081e-07,
46181
+ "loss": 0.0375,
46182
+ "step": 6075
46183
+ },
46184
+ {
46185
+ "epoch": 0.8964957580228697,
46186
+ "grad_norm": 4.009340763092041,
46187
+ "learning_rate": 6.448588878194406e-07,
46188
+ "loss": 0.0681,
46189
+ "step": 6076
46190
+ },
46191
+ {
46192
+ "epoch": 0.8966433050534858,
46193
+ "grad_norm": 1.2859127521514893,
46194
+ "learning_rate": 6.430403369727445e-07,
46195
+ "loss": 0.0283,
46196
+ "step": 6077
46197
+ },
46198
+ {
46199
+ "epoch": 0.8967908520841018,
46200
+ "grad_norm": 1.2199759483337402,
46201
+ "learning_rate": 6.412242687859294e-07,
46202
+ "loss": 0.0296,
46203
+ "step": 6078
46204
+ },
46205
+ {
46206
+ "epoch": 0.8969383991147178,
46207
+ "grad_norm": 2.5927979946136475,
46208
+ "learning_rate": 6.394106837408487e-07,
46209
+ "loss": 0.0468,
46210
+ "step": 6079
46211
+ },
46212
+ {
46213
+ "epoch": 0.8970859461453339,
46214
+ "grad_norm": 7.891746997833252,
46215
+ "learning_rate": 6.375995823186987e-07,
46216
+ "loss": 0.0912,
46217
+ "step": 6080
46218
+ },
46219
+ {
46220
+ "epoch": 0.8970859461453339,
46221
+ "eval_accuracy": 0.9782923299565847,
46222
+ "eval_f1": 0.9629629629629629,
46223
+ "eval_loss": 0.05581069737672806,
46224
+ "eval_precision": 0.9798994974874372,
46225
+ "eval_recall": 0.9466019417475728,
46226
+ "eval_runtime": 48.7869,
46227
+ "eval_samples_per_second": 5.965,
46228
+ "eval_steps_per_second": 0.205,
46229
+ "step": 6080
46230
+ },
46231
+ {
46232
+ "epoch": 0.8972334931759498,
46233
+ "grad_norm": 2.4385836124420166,
46234
+ "learning_rate": 6.357909650000094e-07,
46235
+ "loss": 0.0744,
46236
+ "step": 6081
46237
+ },
46238
+ {
46239
+ "epoch": 0.8973810402065658,
46240
+ "grad_norm": 1.913123607635498,
46241
+ "learning_rate": 6.339848322646625e-07,
46242
+ "loss": 0.0558,
46243
+ "step": 6082
46244
+ },
46245
+ {
46246
+ "epoch": 0.8975285872371819,
46247
+ "grad_norm": 2.230882167816162,
46248
+ "learning_rate": 6.321811845918735e-07,
46249
+ "loss": 0.0325,
46250
+ "step": 6083
46251
+ },
46252
+ {
46253
+ "epoch": 0.8976761342677979,
46254
+ "grad_norm": 1.8755987882614136,
46255
+ "learning_rate": 6.303800224601986e-07,
46256
+ "loss": 0.0615,
46257
+ "step": 6084
46258
+ },
46259
+ {
46260
+ "epoch": 0.8978236812984138,
46261
+ "grad_norm": 3.3818376064300537,
46262
+ "learning_rate": 6.285813463475366e-07,
46263
+ "loss": 0.0615,
46264
+ "step": 6085
46265
+ },
46266
+ {
46267
+ "epoch": 0.8979712283290299,
46268
+ "grad_norm": 1.8080580234527588,
46269
+ "learning_rate": 6.267851567311256e-07,
46270
+ "loss": 0.0506,
46271
+ "step": 6086
46272
+ },
46273
+ {
46274
+ "epoch": 0.8981187753596459,
46275
+ "grad_norm": 3.3100953102111816,
46276
+ "learning_rate": 6.249914540875445e-07,
46277
+ "loss": 0.0438,
46278
+ "step": 6087
46279
+ },
46280
+ {
46281
+ "epoch": 0.8982663223902619,
46282
+ "grad_norm": 2.4367289543151855,
46283
+ "learning_rate": 6.232002388927127e-07,
46284
+ "loss": 0.0651,
46285
+ "step": 6088
46286
+ },
46287
+ {
46288
+ "epoch": 0.8984138694208779,
46289
+ "grad_norm": 3.0320327281951904,
46290
+ "learning_rate": 6.214115116218877e-07,
46291
+ "loss": 0.0466,
46292
+ "step": 6089
46293
+ },
46294
+ {
46295
+ "epoch": 0.8985614164514939,
46296
+ "grad_norm": 1.3278833627700806,
46297
+ "learning_rate": 6.19625272749671e-07,
46298
+ "loss": 0.0228,
46299
+ "step": 6090
46300
+ },
46301
+ {
46302
+ "epoch": 0.8987089634821099,
46303
+ "grad_norm": 2.3165955543518066,
46304
+ "learning_rate": 6.178415227499979e-07,
46305
+ "loss": 0.0507,
46306
+ "step": 6091
46307
+ },
46308
+ {
46309
+ "epoch": 0.898856510512726,
46310
+ "grad_norm": 1.7083970308303833,
46311
+ "learning_rate": 6.160602620961487e-07,
46312
+ "loss": 0.043,
46313
+ "step": 6092
46314
+ },
46315
+ {
46316
+ "epoch": 0.8990040575433419,
46317
+ "grad_norm": 2.6195430755615234,
46318
+ "learning_rate": 6.142814912607409e-07,
46319
+ "loss": 0.0706,
46320
+ "step": 6093
46321
+ },
46322
+ {
46323
+ "epoch": 0.8991516045739579,
46324
+ "grad_norm": 2.1232919692993164,
46325
+ "learning_rate": 6.125052107157304e-07,
46326
+ "loss": 0.0561,
46327
+ "step": 6094
46328
+ },
46329
+ {
46330
+ "epoch": 0.899299151604574,
46331
+ "grad_norm": 3.1567232608795166,
46332
+ "learning_rate": 6.107314209324123e-07,
46333
+ "loss": 0.0495,
46334
+ "step": 6095
46335
+ },
46336
+ {
46337
+ "epoch": 0.89944669863519,
46338
+ "grad_norm": 1.2741518020629883,
46339
+ "learning_rate": 6.089601223814268e-07,
46340
+ "loss": 0.0388,
46341
+ "step": 6096
46342
+ },
46343
+ {
46344
+ "epoch": 0.8995942456658059,
46345
+ "grad_norm": 2.4637041091918945,
46346
+ "learning_rate": 6.071913155327447e-07,
46347
+ "loss": 0.0538,
46348
+ "step": 6097
46349
+ },
46350
+ {
46351
+ "epoch": 0.899741792696422,
46352
+ "grad_norm": 2.406578302383423,
46353
+ "learning_rate": 6.054250008556783e-07,
46354
+ "loss": 0.0796,
46355
+ "step": 6098
46356
+ },
46357
+ {
46358
+ "epoch": 0.899889339727038,
46359
+ "grad_norm": 3.432889223098755,
46360
+ "learning_rate": 6.036611788188829e-07,
46361
+ "loss": 0.0655,
46362
+ "step": 6099
46363
+ },
46364
+ {
46365
+ "epoch": 0.900036886757654,
46366
+ "grad_norm": 3.666919708251953,
46367
+ "learning_rate": 6.018998498903472e-07,
46368
+ "loss": 0.0307,
46369
+ "step": 6100
46370
+ },
46371
+ {
46372
+ "epoch": 0.900036886757654,
46373
+ "eval_accuracy": 0.9782923299565847,
46374
+ "eval_f1": 0.9629629629629629,
46375
+ "eval_loss": 0.05625491961836815,
46376
+ "eval_precision": 0.9798994974874372,
46377
+ "eval_recall": 0.9466019417475728,
46378
+ "eval_runtime": 49.337,
46379
+ "eval_samples_per_second": 5.898,
46380
+ "eval_steps_per_second": 0.203,
46381
+ "step": 6100
46382
  }
46383
  ],
46384
  "logging_steps": 1,
 
46398
  "attributes": {}
46399
  }
46400
  },
46401
+ "total_flos": 1.8789295751031685e+18,
46402
  "train_batch_size": 8,
46403
  "trial_name": null,
46404
  "trial_params": null