shareit commited on
Commit
d365532
·
verified ·
1 Parent(s): b7733e8

Training in progress, step 14100, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3aafb57b910d4cccd85ea24209a70be5b99344b88a9876d8830c026cbfc09984
3
  size 340808816
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e9d5196271e33d944777c3a49fccbfa302a70e1321dd5cbd1d0f98d610c0e19
3
  size 340808816
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:059f6f8429397406dface628a4284dcb8938551965f3c045168f79f5fba8d08b
3
  size 173247691
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5f25152ec3704298acc5baeda37a016ce684c5ed788f7275c1932e40edc1f8d
3
  size 173247691
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:97f621c40c37c796d75663f728f7490ddcd9db068eaa82d92691bb9a37ff256f
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b928d2d8033ac6bd87c58a39b741faace8bd1c6b0d070b7fad23c19520ff9f1a
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7e55996a4913dc987fc027e04b41b6008152cac112575a6ab3bd719b08b3ee43
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5e3d260b6ccf56bb7f54043de5aacef72dc68b9e723cd5fda1af160f114d6bb
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 15.717948717948717,
6
  "eval_steps": 500,
7
- "global_step": 13800,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -96608,6 +96608,2106 @@
96608
  "learning_rate": 5.893886653718317e-06,
96609
  "loss": 0.7302,
96610
  "step": 13800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96611
  }
96612
  ],
96613
  "logging_steps": 1,
@@ -96627,7 +98727,7 @@
96627
  "attributes": {}
96628
  }
96629
  },
96630
- "total_flos": 7.715994433344258e+19,
96631
  "train_batch_size": 8,
96632
  "trial_name": null,
96633
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 16.05925925925926,
6
  "eval_steps": 500,
7
+ "global_step": 14100,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
96608
  "learning_rate": 5.893886653718317e-06,
96609
  "loss": 0.7302,
96610
  "step": 13800
96611
+ },
96612
+ {
96613
+ "epoch": 15.71908831908832,
96614
+ "grad_norm": 0.22730977833271027,
96615
+ "learning_rate": 5.890882155790686e-06,
96616
+ "loss": 0.8484,
96617
+ "step": 13801
96618
+ },
96619
+ {
96620
+ "epoch": 15.72022792022792,
96621
+ "grad_norm": 0.18116992712020874,
96622
+ "learning_rate": 5.887878321575266e-06,
96623
+ "loss": 0.7743,
96624
+ "step": 13802
96625
+ },
96626
+ {
96627
+ "epoch": 15.72136752136752,
96628
+ "grad_norm": 0.2321796417236328,
96629
+ "learning_rate": 5.8848751511764e-06,
96630
+ "loss": 0.4451,
96631
+ "step": 13803
96632
+ },
96633
+ {
96634
+ "epoch": 15.722507122507123,
96635
+ "grad_norm": 0.18436527252197266,
96636
+ "learning_rate": 5.88187264469838e-06,
96637
+ "loss": 0.9361,
96638
+ "step": 13804
96639
+ },
96640
+ {
96641
+ "epoch": 15.723646723646723,
96642
+ "grad_norm": 0.2078021615743637,
96643
+ "learning_rate": 5.8788708022454984e-06,
96644
+ "loss": 0.6999,
96645
+ "step": 13805
96646
+ },
96647
+ {
96648
+ "epoch": 15.724786324786324,
96649
+ "grad_norm": 0.18605077266693115,
96650
+ "learning_rate": 5.875869623922014e-06,
96651
+ "loss": 0.8222,
96652
+ "step": 13806
96653
+ },
96654
+ {
96655
+ "epoch": 15.725925925925926,
96656
+ "grad_norm": 0.1825038194656372,
96657
+ "learning_rate": 5.87286910983218e-06,
96658
+ "loss": 0.8306,
96659
+ "step": 13807
96660
+ },
96661
+ {
96662
+ "epoch": 15.727065527065527,
96663
+ "grad_norm": 0.24502849578857422,
96664
+ "learning_rate": 5.8698692600801905e-06,
96665
+ "loss": 0.7141,
96666
+ "step": 13808
96667
+ },
96668
+ {
96669
+ "epoch": 15.728205128205127,
96670
+ "grad_norm": 0.26166555285453796,
96671
+ "learning_rate": 5.866870074770253e-06,
96672
+ "loss": 0.5176,
96673
+ "step": 13809
96674
+ },
96675
+ {
96676
+ "epoch": 15.72934472934473,
96677
+ "grad_norm": 0.17108017206192017,
96678
+ "learning_rate": 5.863871554006534e-06,
96679
+ "loss": 0.6432,
96680
+ "step": 13810
96681
+ },
96682
+ {
96683
+ "epoch": 15.73048433048433,
96684
+ "grad_norm": 0.1834830939769745,
96685
+ "learning_rate": 5.860873697893179e-06,
96686
+ "loss": 0.6208,
96687
+ "step": 13811
96688
+ },
96689
+ {
96690
+ "epoch": 15.73162393162393,
96691
+ "grad_norm": 0.19852334260940552,
96692
+ "learning_rate": 5.857876506534313e-06,
96693
+ "loss": 0.5493,
96694
+ "step": 13812
96695
+ },
96696
+ {
96697
+ "epoch": 15.732763532763533,
96698
+ "grad_norm": 0.1938435286283493,
96699
+ "learning_rate": 5.854879980034039e-06,
96700
+ "loss": 0.7416,
96701
+ "step": 13813
96702
+ },
96703
+ {
96704
+ "epoch": 15.733903133903134,
96705
+ "grad_norm": 0.19227145612239838,
96706
+ "learning_rate": 5.851884118496432e-06,
96707
+ "loss": 0.6168,
96708
+ "step": 13814
96709
+ },
96710
+ {
96711
+ "epoch": 15.735042735042736,
96712
+ "grad_norm": 0.26434120535850525,
96713
+ "learning_rate": 5.848888922025553e-06,
96714
+ "loss": 0.4461,
96715
+ "step": 13815
96716
+ },
96717
+ {
96718
+ "epoch": 15.736182336182337,
96719
+ "grad_norm": 0.20051394402980804,
96720
+ "learning_rate": 5.845894390725421e-06,
96721
+ "loss": 0.5806,
96722
+ "step": 13816
96723
+ },
96724
+ {
96725
+ "epoch": 15.737321937321937,
96726
+ "grad_norm": 0.23545600473880768,
96727
+ "learning_rate": 5.842900524700051e-06,
96728
+ "loss": 0.6093,
96729
+ "step": 13817
96730
+ },
96731
+ {
96732
+ "epoch": 15.73846153846154,
96733
+ "grad_norm": 0.2062317430973053,
96734
+ "learning_rate": 5.839907324053425e-06,
96735
+ "loss": 0.7049,
96736
+ "step": 13818
96737
+ },
96738
+ {
96739
+ "epoch": 15.73960113960114,
96740
+ "grad_norm": 0.2594757676124573,
96741
+ "learning_rate": 5.836914788889519e-06,
96742
+ "loss": 0.6828,
96743
+ "step": 13819
96744
+ },
96745
+ {
96746
+ "epoch": 15.74074074074074,
96747
+ "grad_norm": 0.2455168217420578,
96748
+ "learning_rate": 5.8339229193122544e-06,
96749
+ "loss": 0.8516,
96750
+ "step": 13820
96751
+ },
96752
+ {
96753
+ "epoch": 15.741880341880343,
96754
+ "grad_norm": 0.2117108553647995,
96755
+ "learning_rate": 5.830931715425553e-06,
96756
+ "loss": 0.675,
96757
+ "step": 13821
96758
+ },
96759
+ {
96760
+ "epoch": 15.743019943019943,
96761
+ "grad_norm": 0.22628933191299438,
96762
+ "learning_rate": 5.827941177333307e-06,
96763
+ "loss": 0.6739,
96764
+ "step": 13822
96765
+ },
96766
+ {
96767
+ "epoch": 15.744159544159544,
96768
+ "grad_norm": 0.17668086290359497,
96769
+ "learning_rate": 5.824951305139387e-06,
96770
+ "loss": 0.7275,
96771
+ "step": 13823
96772
+ },
96773
+ {
96774
+ "epoch": 15.745299145299146,
96775
+ "grad_norm": 0.1669437438249588,
96776
+ "learning_rate": 5.821962098947642e-06,
96777
+ "loss": 0.957,
96778
+ "step": 13824
96779
+ },
96780
+ {
96781
+ "epoch": 15.746438746438747,
96782
+ "grad_norm": 0.22993691265583038,
96783
+ "learning_rate": 5.81897355886189e-06,
96784
+ "loss": 0.6483,
96785
+ "step": 13825
96786
+ },
96787
+ {
96788
+ "epoch": 15.747578347578347,
96789
+ "grad_norm": 0.23172104358673096,
96790
+ "learning_rate": 5.815985684985945e-06,
96791
+ "loss": 0.7056,
96792
+ "step": 13826
96793
+ },
96794
+ {
96795
+ "epoch": 15.74871794871795,
96796
+ "grad_norm": 0.1977315992116928,
96797
+ "learning_rate": 5.812998477423562e-06,
96798
+ "loss": 0.5988,
96799
+ "step": 13827
96800
+ },
96801
+ {
96802
+ "epoch": 15.74985754985755,
96803
+ "grad_norm": 0.18591248989105225,
96804
+ "learning_rate": 5.810011936278509e-06,
96805
+ "loss": 0.7036,
96806
+ "step": 13828
96807
+ },
96808
+ {
96809
+ "epoch": 15.75099715099715,
96810
+ "grad_norm": 0.20047298073768616,
96811
+ "learning_rate": 5.807026061654513e-06,
96812
+ "loss": 0.7494,
96813
+ "step": 13829
96814
+ },
96815
+ {
96816
+ "epoch": 15.752136752136753,
96817
+ "grad_norm": 0.22419284284114838,
96818
+ "learning_rate": 5.804040853655293e-06,
96819
+ "loss": 0.6049,
96820
+ "step": 13830
96821
+ },
96822
+ {
96823
+ "epoch": 15.753276353276354,
96824
+ "grad_norm": 0.19881680607795715,
96825
+ "learning_rate": 5.801056312384512e-06,
96826
+ "loss": 0.6976,
96827
+ "step": 13831
96828
+ },
96829
+ {
96830
+ "epoch": 15.754415954415954,
96831
+ "grad_norm": 0.22455157339572906,
96832
+ "learning_rate": 5.798072437945845e-06,
96833
+ "loss": 0.3943,
96834
+ "step": 13832
96835
+ },
96836
+ {
96837
+ "epoch": 15.755555555555556,
96838
+ "grad_norm": 0.17858223617076874,
96839
+ "learning_rate": 5.795089230442927e-06,
96840
+ "loss": 0.7836,
96841
+ "step": 13833
96842
+ },
96843
+ {
96844
+ "epoch": 15.756695156695157,
96845
+ "grad_norm": 0.1665424108505249,
96846
+ "learning_rate": 5.792106689979373e-06,
96847
+ "loss": 0.7137,
96848
+ "step": 13834
96849
+ },
96850
+ {
96851
+ "epoch": 15.757834757834758,
96852
+ "grad_norm": 0.2321743220090866,
96853
+ "learning_rate": 5.789124816658778e-06,
96854
+ "loss": 0.2724,
96855
+ "step": 13835
96856
+ },
96857
+ {
96858
+ "epoch": 15.75897435897436,
96859
+ "grad_norm": 0.15501980483531952,
96860
+ "learning_rate": 5.786143610584707e-06,
96861
+ "loss": 0.784,
96862
+ "step": 13836
96863
+ },
96864
+ {
96865
+ "epoch": 15.76011396011396,
96866
+ "grad_norm": 0.24102312326431274,
96867
+ "learning_rate": 5.783163071860715e-06,
96868
+ "loss": 0.7432,
96869
+ "step": 13837
96870
+ },
96871
+ {
96872
+ "epoch": 15.761253561253561,
96873
+ "grad_norm": 0.21585829555988312,
96874
+ "learning_rate": 5.780183200590306e-06,
96875
+ "loss": 0.6081,
96876
+ "step": 13838
96877
+ },
96878
+ {
96879
+ "epoch": 15.762393162393163,
96880
+ "grad_norm": 0.23042422533035278,
96881
+ "learning_rate": 5.77720399687699e-06,
96882
+ "loss": 0.5227,
96883
+ "step": 13839
96884
+ },
96885
+ {
96886
+ "epoch": 15.763532763532764,
96887
+ "grad_norm": 0.22137843072414398,
96888
+ "learning_rate": 5.774225460824243e-06,
96889
+ "loss": 0.7376,
96890
+ "step": 13840
96891
+ },
96892
+ {
96893
+ "epoch": 15.764672364672364,
96894
+ "grad_norm": 0.1850956231355667,
96895
+ "learning_rate": 5.771247592535523e-06,
96896
+ "loss": 0.6656,
96897
+ "step": 13841
96898
+ },
96899
+ {
96900
+ "epoch": 15.765811965811967,
96901
+ "grad_norm": 0.18516525626182556,
96902
+ "learning_rate": 5.7682703921142474e-06,
96903
+ "loss": 0.6501,
96904
+ "step": 13842
96905
+ },
96906
+ {
96907
+ "epoch": 15.766951566951567,
96908
+ "grad_norm": 0.22501075267791748,
96909
+ "learning_rate": 5.7652938596638286e-06,
96910
+ "loss": 0.7376,
96911
+ "step": 13843
96912
+ },
96913
+ {
96914
+ "epoch": 15.768091168091168,
96915
+ "grad_norm": 0.19035011529922485,
96916
+ "learning_rate": 5.762317995287641e-06,
96917
+ "loss": 0.615,
96918
+ "step": 13844
96919
+ },
96920
+ {
96921
+ "epoch": 15.76923076923077,
96922
+ "grad_norm": 0.18901343643665314,
96923
+ "learning_rate": 5.759342799089068e-06,
96924
+ "loss": 0.6742,
96925
+ "step": 13845
96926
+ },
96927
+ {
96928
+ "epoch": 15.77037037037037,
96929
+ "grad_norm": 0.24338462948799133,
96930
+ "learning_rate": 5.756368271171425e-06,
96931
+ "loss": 0.4775,
96932
+ "step": 13846
96933
+ },
96934
+ {
96935
+ "epoch": 15.771509971509971,
96936
+ "grad_norm": 0.21235759556293488,
96937
+ "learning_rate": 5.753394411638033e-06,
96938
+ "loss": 0.6555,
96939
+ "step": 13847
96940
+ },
96941
+ {
96942
+ "epoch": 15.772649572649573,
96943
+ "grad_norm": 0.202910378575325,
96944
+ "learning_rate": 5.7504212205921806e-06,
96945
+ "loss": 0.5172,
96946
+ "step": 13848
96947
+ },
96948
+ {
96949
+ "epoch": 15.773789173789174,
96950
+ "grad_norm": 0.16425685584545135,
96951
+ "learning_rate": 5.747448698137142e-06,
96952
+ "loss": 0.7835,
96953
+ "step": 13849
96954
+ },
96955
+ {
96956
+ "epoch": 15.774928774928775,
96957
+ "grad_norm": 0.21649502217769623,
96958
+ "learning_rate": 5.744476844376148e-06,
96959
+ "loss": 0.6296,
96960
+ "step": 13850
96961
+ },
96962
+ {
96963
+ "epoch": 15.776068376068377,
96964
+ "grad_norm": 0.18373684585094452,
96965
+ "learning_rate": 5.7415056594124274e-06,
96966
+ "loss": 0.7905,
96967
+ "step": 13851
96968
+ },
96969
+ {
96970
+ "epoch": 15.777207977207977,
96971
+ "grad_norm": 0.21412786841392517,
96972
+ "learning_rate": 5.738535143349178e-06,
96973
+ "loss": 0.7043,
96974
+ "step": 13852
96975
+ },
96976
+ {
96977
+ "epoch": 15.778347578347578,
96978
+ "grad_norm": 0.23520193994045258,
96979
+ "learning_rate": 5.735565296289574e-06,
96980
+ "loss": 0.472,
96981
+ "step": 13853
96982
+ },
96983
+ {
96984
+ "epoch": 15.77948717948718,
96985
+ "grad_norm": 0.15721645951271057,
96986
+ "learning_rate": 5.732596118336761e-06,
96987
+ "loss": 0.8033,
96988
+ "step": 13854
96989
+ },
96990
+ {
96991
+ "epoch": 15.78062678062678,
96992
+ "grad_norm": 0.22200199961662292,
96993
+ "learning_rate": 5.729627609593863e-06,
96994
+ "loss": 0.5786,
96995
+ "step": 13855
96996
+ },
96997
+ {
96998
+ "epoch": 15.781766381766381,
96999
+ "grad_norm": 0.19021391868591309,
97000
+ "learning_rate": 5.726659770164006e-06,
97001
+ "loss": 0.6212,
97002
+ "step": 13856
97003
+ },
97004
+ {
97005
+ "epoch": 15.782905982905984,
97006
+ "grad_norm": 0.21510300040245056,
97007
+ "learning_rate": 5.723692600150249e-06,
97008
+ "loss": 0.5331,
97009
+ "step": 13857
97010
+ },
97011
+ {
97012
+ "epoch": 15.784045584045584,
97013
+ "grad_norm": 0.1836748570203781,
97014
+ "learning_rate": 5.72072609965566e-06,
97015
+ "loss": 0.6963,
97016
+ "step": 13858
97017
+ },
97018
+ {
97019
+ "epoch": 15.785185185185185,
97020
+ "grad_norm": 0.17830480635166168,
97021
+ "learning_rate": 5.717760268783271e-06,
97022
+ "loss": 0.8189,
97023
+ "step": 13859
97024
+ },
97025
+ {
97026
+ "epoch": 15.786324786324787,
97027
+ "grad_norm": 0.2077512890100479,
97028
+ "learning_rate": 5.714795107636101e-06,
97029
+ "loss": 0.7427,
97030
+ "step": 13860
97031
+ },
97032
+ {
97033
+ "epoch": 15.787464387464388,
97034
+ "grad_norm": 0.19024313986301422,
97035
+ "learning_rate": 5.711830616317123e-06,
97036
+ "loss": 0.5893,
97037
+ "step": 13861
97038
+ },
97039
+ {
97040
+ "epoch": 15.788603988603988,
97041
+ "grad_norm": 0.20871800184249878,
97042
+ "learning_rate": 5.708866794929313e-06,
97043
+ "loss": 0.7435,
97044
+ "step": 13862
97045
+ },
97046
+ {
97047
+ "epoch": 15.78974358974359,
97048
+ "grad_norm": 0.19228936731815338,
97049
+ "learning_rate": 5.705903643575608e-06,
97050
+ "loss": 0.7723,
97051
+ "step": 13863
97052
+ },
97053
+ {
97054
+ "epoch": 15.790883190883191,
97055
+ "grad_norm": 0.18718524277210236,
97056
+ "learning_rate": 5.702941162358935e-06,
97057
+ "loss": 0.7106,
97058
+ "step": 13864
97059
+ },
97060
+ {
97061
+ "epoch": 15.792022792022792,
97062
+ "grad_norm": 0.19854268431663513,
97063
+ "learning_rate": 5.6999793513821785e-06,
97064
+ "loss": 0.6359,
97065
+ "step": 13865
97066
+ },
97067
+ {
97068
+ "epoch": 15.793162393162394,
97069
+ "grad_norm": 0.2403852492570877,
97070
+ "learning_rate": 5.697018210748206e-06,
97071
+ "loss": 0.4741,
97072
+ "step": 13866
97073
+ },
97074
+ {
97075
+ "epoch": 15.794301994301994,
97076
+ "grad_norm": 0.20986339449882507,
97077
+ "learning_rate": 5.694057740559889e-06,
97078
+ "loss": 0.7468,
97079
+ "step": 13867
97080
+ },
97081
+ {
97082
+ "epoch": 15.795441595441595,
97083
+ "grad_norm": 0.2499758005142212,
97084
+ "learning_rate": 5.691097940920029e-06,
97085
+ "loss": 0.4523,
97086
+ "step": 13868
97087
+ },
97088
+ {
97089
+ "epoch": 15.796581196581197,
97090
+ "grad_norm": 0.23681160807609558,
97091
+ "learning_rate": 5.688138811931437e-06,
97092
+ "loss": 0.5846,
97093
+ "step": 13869
97094
+ },
97095
+ {
97096
+ "epoch": 15.797720797720798,
97097
+ "grad_norm": 0.1891166716814041,
97098
+ "learning_rate": 5.685180353696895e-06,
97099
+ "loss": 0.6728,
97100
+ "step": 13870
97101
+ },
97102
+ {
97103
+ "epoch": 15.798860398860398,
97104
+ "grad_norm": 0.20599184930324554,
97105
+ "learning_rate": 5.682222566319159e-06,
97106
+ "loss": 0.8015,
97107
+ "step": 13871
97108
+ },
97109
+ {
97110
+ "epoch": 15.8,
97111
+ "grad_norm": 0.17965242266654968,
97112
+ "learning_rate": 5.679265449900953e-06,
97113
+ "loss": 0.88,
97114
+ "step": 13872
97115
+ },
97116
+ {
97117
+ "epoch": 15.801139601139601,
97118
+ "grad_norm": 0.18087612092494965,
97119
+ "learning_rate": 5.676309004544989e-06,
97120
+ "loss": 0.6112,
97121
+ "step": 13873
97122
+ },
97123
+ {
97124
+ "epoch": 15.802279202279202,
97125
+ "grad_norm": 0.2494555115699768,
97126
+ "learning_rate": 5.673353230353954e-06,
97127
+ "loss": 0.5585,
97128
+ "step": 13874
97129
+ },
97130
+ {
97131
+ "epoch": 15.803418803418804,
97132
+ "grad_norm": 0.21561256051063538,
97133
+ "learning_rate": 5.670398127430515e-06,
97134
+ "loss": 0.5682,
97135
+ "step": 13875
97136
+ },
97137
+ {
97138
+ "epoch": 15.804558404558405,
97139
+ "grad_norm": 0.1912786066532135,
97140
+ "learning_rate": 5.6674436958773e-06,
97141
+ "loss": 0.6802,
97142
+ "step": 13876
97143
+ },
97144
+ {
97145
+ "epoch": 15.805698005698005,
97146
+ "grad_norm": 0.24314413964748383,
97147
+ "learning_rate": 5.6644899357969235e-06,
97148
+ "loss": 0.5237,
97149
+ "step": 13877
97150
+ },
97151
+ {
97152
+ "epoch": 15.806837606837608,
97153
+ "grad_norm": 0.18922634422779083,
97154
+ "learning_rate": 5.661536847291998e-06,
97155
+ "loss": 0.7806,
97156
+ "step": 13878
97157
+ },
97158
+ {
97159
+ "epoch": 15.807977207977208,
97160
+ "grad_norm": 0.2381921410560608,
97161
+ "learning_rate": 5.658584430465072e-06,
97162
+ "loss": 0.5844,
97163
+ "step": 13879
97164
+ },
97165
+ {
97166
+ "epoch": 15.809116809116809,
97167
+ "grad_norm": 0.25195929408073425,
97168
+ "learning_rate": 5.655632685418699e-06,
97169
+ "loss": 0.4855,
97170
+ "step": 13880
97171
+ },
97172
+ {
97173
+ "epoch": 15.810256410256411,
97174
+ "grad_norm": 0.19405119121074677,
97175
+ "learning_rate": 5.652681612255398e-06,
97176
+ "loss": 0.6456,
97177
+ "step": 13881
97178
+ },
97179
+ {
97180
+ "epoch": 15.811396011396011,
97181
+ "grad_norm": 0.19623373448848724,
97182
+ "learning_rate": 5.64973121107768e-06,
97183
+ "loss": 0.5443,
97184
+ "step": 13882
97185
+ },
97186
+ {
97187
+ "epoch": 15.812535612535612,
97188
+ "grad_norm": 0.20332282781600952,
97189
+ "learning_rate": 5.646781481988e-06,
97190
+ "loss": 0.7967,
97191
+ "step": 13883
97192
+ },
97193
+ {
97194
+ "epoch": 15.813675213675214,
97195
+ "grad_norm": 0.26592984795570374,
97196
+ "learning_rate": 5.643832425088821e-06,
97197
+ "loss": 0.5693,
97198
+ "step": 13884
97199
+ },
97200
+ {
97201
+ "epoch": 15.814814814814815,
97202
+ "grad_norm": 0.19840383529663086,
97203
+ "learning_rate": 5.640884040482574e-06,
97204
+ "loss": 0.8236,
97205
+ "step": 13885
97206
+ },
97207
+ {
97208
+ "epoch": 15.815954415954415,
97209
+ "grad_norm": 0.22513839602470398,
97210
+ "learning_rate": 5.6379363282716675e-06,
97211
+ "loss": 0.6565,
97212
+ "step": 13886
97213
+ },
97214
+ {
97215
+ "epoch": 15.817094017094018,
97216
+ "grad_norm": 0.19240371882915497,
97217
+ "learning_rate": 5.63498928855847e-06,
97218
+ "loss": 0.6027,
97219
+ "step": 13887
97220
+ },
97221
+ {
97222
+ "epoch": 15.818233618233618,
97223
+ "grad_norm": 0.2948347330093384,
97224
+ "learning_rate": 5.632042921445352e-06,
97225
+ "loss": 0.7077,
97226
+ "step": 13888
97227
+ },
97228
+ {
97229
+ "epoch": 15.819373219373219,
97230
+ "grad_norm": 0.23926644027233124,
97231
+ "learning_rate": 5.629097227034635e-06,
97232
+ "loss": 0.4717,
97233
+ "step": 13889
97234
+ },
97235
+ {
97236
+ "epoch": 15.820512820512821,
97237
+ "grad_norm": 0.2090224325656891,
97238
+ "learning_rate": 5.626152205428656e-06,
97239
+ "loss": 0.5267,
97240
+ "step": 13890
97241
+ },
97242
+ {
97243
+ "epoch": 15.821652421652422,
97244
+ "grad_norm": 0.18229421973228455,
97245
+ "learning_rate": 5.6232078567296845e-06,
97246
+ "loss": 0.7855,
97247
+ "step": 13891
97248
+ },
97249
+ {
97250
+ "epoch": 15.822792022792022,
97251
+ "grad_norm": 0.18569529056549072,
97252
+ "learning_rate": 5.620264181039989e-06,
97253
+ "loss": 0.6802,
97254
+ "step": 13892
97255
+ },
97256
+ {
97257
+ "epoch": 15.823931623931625,
97258
+ "grad_norm": 0.20608794689178467,
97259
+ "learning_rate": 5.6173211784618125e-06,
97260
+ "loss": 0.6522,
97261
+ "step": 13893
97262
+ },
97263
+ {
97264
+ "epoch": 15.825071225071225,
97265
+ "grad_norm": 0.2130715250968933,
97266
+ "learning_rate": 5.614378849097382e-06,
97267
+ "loss": 0.5559,
97268
+ "step": 13894
97269
+ },
97270
+ {
97271
+ "epoch": 15.826210826210826,
97272
+ "grad_norm": 0.21179170906543732,
97273
+ "learning_rate": 5.611437193048877e-06,
97274
+ "loss": 0.808,
97275
+ "step": 13895
97276
+ },
97277
+ {
97278
+ "epoch": 15.827350427350428,
97279
+ "grad_norm": 0.21439428627490997,
97280
+ "learning_rate": 5.608496210418476e-06,
97281
+ "loss": 0.5765,
97282
+ "step": 13896
97283
+ },
97284
+ {
97285
+ "epoch": 15.828490028490029,
97286
+ "grad_norm": 0.16110475361347198,
97287
+ "learning_rate": 5.6055559013083295e-06,
97288
+ "loss": 0.6759,
97289
+ "step": 13897
97290
+ },
97291
+ {
97292
+ "epoch": 15.829629629629629,
97293
+ "grad_norm": 0.19680850207805634,
97294
+ "learning_rate": 5.602616265820568e-06,
97295
+ "loss": 0.5224,
97296
+ "step": 13898
97297
+ },
97298
+ {
97299
+ "epoch": 15.830769230769231,
97300
+ "grad_norm": 0.20465679466724396,
97301
+ "learning_rate": 5.5996773040572795e-06,
97302
+ "loss": 0.7224,
97303
+ "step": 13899
97304
+ },
97305
+ {
97306
+ "epoch": 15.831908831908832,
97307
+ "grad_norm": 0.21229791641235352,
97308
+ "learning_rate": 5.596739016120545e-06,
97309
+ "loss": 0.5391,
97310
+ "step": 13900
97311
+ },
97312
+ {
97313
+ "epoch": 15.833048433048432,
97314
+ "grad_norm": 0.20955872535705566,
97315
+ "learning_rate": 5.593801402112436e-06,
97316
+ "loss": 0.7048,
97317
+ "step": 13901
97318
+ },
97319
+ {
97320
+ "epoch": 15.834188034188035,
97321
+ "grad_norm": 0.24753613770008087,
97322
+ "learning_rate": 5.590864462134965e-06,
97323
+ "loss": 0.4146,
97324
+ "step": 13902
97325
+ },
97326
+ {
97327
+ "epoch": 15.835327635327635,
97328
+ "grad_norm": 0.17635095119476318,
97329
+ "learning_rate": 5.587928196290143e-06,
97330
+ "loss": 0.759,
97331
+ "step": 13903
97332
+ },
97333
+ {
97334
+ "epoch": 15.836467236467236,
97335
+ "grad_norm": 0.18288402259349823,
97336
+ "learning_rate": 5.584992604679961e-06,
97337
+ "loss": 0.7086,
97338
+ "step": 13904
97339
+ },
97340
+ {
97341
+ "epoch": 15.837606837606838,
97342
+ "grad_norm": 0.18178793787956238,
97343
+ "learning_rate": 5.582057687406386e-06,
97344
+ "loss": 0.6704,
97345
+ "step": 13905
97346
+ },
97347
+ {
97348
+ "epoch": 15.838746438746439,
97349
+ "grad_norm": 0.19446249306201935,
97350
+ "learning_rate": 5.579123444571338e-06,
97351
+ "loss": 0.6034,
97352
+ "step": 13906
97353
+ },
97354
+ {
97355
+ "epoch": 15.83988603988604,
97356
+ "grad_norm": 0.17122094333171844,
97357
+ "learning_rate": 5.576189876276741e-06,
97358
+ "loss": 0.6343,
97359
+ "step": 13907
97360
+ },
97361
+ {
97362
+ "epoch": 15.841025641025642,
97363
+ "grad_norm": 0.2367812544107437,
97364
+ "learning_rate": 5.573256982624483e-06,
97365
+ "loss": 0.4544,
97366
+ "step": 13908
97367
+ },
97368
+ {
97369
+ "epoch": 15.842165242165242,
97370
+ "grad_norm": 0.15342922508716583,
97371
+ "learning_rate": 5.570324763716445e-06,
97372
+ "loss": 0.5693,
97373
+ "step": 13909
97374
+ },
97375
+ {
97376
+ "epoch": 15.843304843304843,
97377
+ "grad_norm": 0.19328589737415314,
97378
+ "learning_rate": 5.5673932196544485e-06,
97379
+ "loss": 0.6786,
97380
+ "step": 13910
97381
+ },
97382
+ {
97383
+ "epoch": 15.844444444444445,
97384
+ "grad_norm": 0.22357133030891418,
97385
+ "learning_rate": 5.564462350540323e-06,
97386
+ "loss": 0.7917,
97387
+ "step": 13911
97388
+ },
97389
+ {
97390
+ "epoch": 15.845584045584046,
97391
+ "grad_norm": 0.22361674904823303,
97392
+ "learning_rate": 5.561532156475879e-06,
97393
+ "loss": 0.5247,
97394
+ "step": 13912
97395
+ },
97396
+ {
97397
+ "epoch": 15.846723646723646,
97398
+ "grad_norm": 0.20205283164978027,
97399
+ "learning_rate": 5.558602637562871e-06,
97400
+ "loss": 0.7483,
97401
+ "step": 13913
97402
+ },
97403
+ {
97404
+ "epoch": 15.847863247863248,
97405
+ "grad_norm": 0.205192431807518,
97406
+ "learning_rate": 5.55567379390306e-06,
97407
+ "loss": 0.6263,
97408
+ "step": 13914
97409
+ },
97410
+ {
97411
+ "epoch": 15.849002849002849,
97412
+ "grad_norm": 0.19363215565681458,
97413
+ "learning_rate": 5.552745625598169e-06,
97414
+ "loss": 0.864,
97415
+ "step": 13915
97416
+ },
97417
+ {
97418
+ "epoch": 15.85014245014245,
97419
+ "grad_norm": 0.24662263691425323,
97420
+ "learning_rate": 5.5498181327499095e-06,
97421
+ "loss": 0.7262,
97422
+ "step": 13916
97423
+ },
97424
+ {
97425
+ "epoch": 15.851282051282052,
97426
+ "grad_norm": 0.20658141374588013,
97427
+ "learning_rate": 5.546891315459948e-06,
97428
+ "loss": 0.5356,
97429
+ "step": 13917
97430
+ },
97431
+ {
97432
+ "epoch": 15.852421652421652,
97433
+ "grad_norm": 0.2509874999523163,
97434
+ "learning_rate": 5.543965173829949e-06,
97435
+ "loss": 0.6354,
97436
+ "step": 13918
97437
+ },
97438
+ {
97439
+ "epoch": 15.853561253561253,
97440
+ "grad_norm": 0.22638174891471863,
97441
+ "learning_rate": 5.54103970796154e-06,
97442
+ "loss": 0.3051,
97443
+ "step": 13919
97444
+ },
97445
+ {
97446
+ "epoch": 15.854700854700855,
97447
+ "grad_norm": 0.1767934411764145,
97448
+ "learning_rate": 5.5381149179563444e-06,
97449
+ "loss": 0.839,
97450
+ "step": 13920
97451
+ },
97452
+ {
97453
+ "epoch": 15.855840455840456,
97454
+ "grad_norm": 0.16481101512908936,
97455
+ "learning_rate": 5.5351908039159295e-06,
97456
+ "loss": 0.7852,
97457
+ "step": 13921
97458
+ },
97459
+ {
97460
+ "epoch": 15.856980056980056,
97461
+ "grad_norm": 0.22488468885421753,
97462
+ "learning_rate": 5.53226736594186e-06,
97463
+ "loss": 0.5728,
97464
+ "step": 13922
97465
+ },
97466
+ {
97467
+ "epoch": 15.858119658119659,
97468
+ "grad_norm": 0.2066558301448822,
97469
+ "learning_rate": 5.529344604135689e-06,
97470
+ "loss": 0.764,
97471
+ "step": 13923
97472
+ },
97473
+ {
97474
+ "epoch": 15.85925925925926,
97475
+ "grad_norm": 0.18836940824985504,
97476
+ "learning_rate": 5.526422518598928e-06,
97477
+ "loss": 0.6421,
97478
+ "step": 13924
97479
+ },
97480
+ {
97481
+ "epoch": 15.86039886039886,
97482
+ "grad_norm": 0.1548687368631363,
97483
+ "learning_rate": 5.523501109433063e-06,
97484
+ "loss": 0.5461,
97485
+ "step": 13925
97486
+ },
97487
+ {
97488
+ "epoch": 15.861538461538462,
97489
+ "grad_norm": 0.18227815628051758,
97490
+ "learning_rate": 5.520580376739562e-06,
97491
+ "loss": 0.5217,
97492
+ "step": 13926
97493
+ },
97494
+ {
97495
+ "epoch": 15.862678062678063,
97496
+ "grad_norm": 0.18795578181743622,
97497
+ "learning_rate": 5.5176603206198746e-06,
97498
+ "loss": 0.7182,
97499
+ "step": 13927
97500
+ },
97501
+ {
97502
+ "epoch": 15.863817663817663,
97503
+ "grad_norm": 0.1918874830007553,
97504
+ "learning_rate": 5.514740941175428e-06,
97505
+ "loss": 0.7007,
97506
+ "step": 13928
97507
+ },
97508
+ {
97509
+ "epoch": 15.864957264957265,
97510
+ "grad_norm": 0.18768425285816193,
97511
+ "learning_rate": 5.5118222385076056e-06,
97512
+ "loss": 0.7876,
97513
+ "step": 13929
97514
+ },
97515
+ {
97516
+ "epoch": 15.866096866096866,
97517
+ "grad_norm": 0.19266986846923828,
97518
+ "learning_rate": 5.508904212717789e-06,
97519
+ "loss": 0.5271,
97520
+ "step": 13930
97521
+ },
97522
+ {
97523
+ "epoch": 15.867236467236467,
97524
+ "grad_norm": 0.2516765594482422,
97525
+ "learning_rate": 5.5059868639073305e-06,
97526
+ "loss": 0.5491,
97527
+ "step": 13931
97528
+ },
97529
+ {
97530
+ "epoch": 15.868376068376069,
97531
+ "grad_norm": 0.260698139667511,
97532
+ "learning_rate": 5.5030701921775645e-06,
97533
+ "loss": 0.7285,
97534
+ "step": 13932
97535
+ },
97536
+ {
97537
+ "epoch": 15.86951566951567,
97538
+ "grad_norm": 0.17928585410118103,
97539
+ "learning_rate": 5.5001541976297724e-06,
97540
+ "loss": 0.605,
97541
+ "step": 13933
97542
+ },
97543
+ {
97544
+ "epoch": 15.87065527065527,
97545
+ "grad_norm": 0.20798265933990479,
97546
+ "learning_rate": 5.497238880365258e-06,
97547
+ "loss": 0.6186,
97548
+ "step": 13934
97549
+ },
97550
+ {
97551
+ "epoch": 15.871794871794872,
97552
+ "grad_norm": 0.19151423871517181,
97553
+ "learning_rate": 5.494324240485277e-06,
97554
+ "loss": 0.6788,
97555
+ "step": 13935
97556
+ },
97557
+ {
97558
+ "epoch": 15.872934472934473,
97559
+ "grad_norm": 0.216191828250885,
97560
+ "learning_rate": 5.4914102780910474e-06,
97561
+ "loss": 0.7635,
97562
+ "step": 13936
97563
+ },
97564
+ {
97565
+ "epoch": 15.874074074074073,
97566
+ "grad_norm": 0.2170773595571518,
97567
+ "learning_rate": 5.4884969932837895e-06,
97568
+ "loss": 0.7002,
97569
+ "step": 13937
97570
+ },
97571
+ {
97572
+ "epoch": 15.875213675213676,
97573
+ "grad_norm": 0.201252281665802,
97574
+ "learning_rate": 5.485584386164688e-06,
97575
+ "loss": 0.7075,
97576
+ "step": 13938
97577
+ },
97578
+ {
97579
+ "epoch": 15.876353276353276,
97580
+ "grad_norm": 0.2165941447019577,
97581
+ "learning_rate": 5.482672456834911e-06,
97582
+ "loss": 0.6854,
97583
+ "step": 13939
97584
+ },
97585
+ {
97586
+ "epoch": 15.877492877492877,
97587
+ "grad_norm": 0.22835934162139893,
97588
+ "learning_rate": 5.479761205395587e-06,
97589
+ "loss": 0.4414,
97590
+ "step": 13940
97591
+ },
97592
+ {
97593
+ "epoch": 15.878632478632479,
97594
+ "grad_norm": 0.23190470039844513,
97595
+ "learning_rate": 5.476850631947836e-06,
97596
+ "loss": 0.5443,
97597
+ "step": 13941
97598
+ },
97599
+ {
97600
+ "epoch": 15.87977207977208,
97601
+ "grad_norm": 0.22439834475517273,
97602
+ "learning_rate": 5.47394073659275e-06,
97603
+ "loss": 0.3856,
97604
+ "step": 13942
97605
+ },
97606
+ {
97607
+ "epoch": 15.88091168091168,
97608
+ "grad_norm": 0.2105487734079361,
97609
+ "learning_rate": 5.471031519431408e-06,
97610
+ "loss": 0.7456,
97611
+ "step": 13943
97612
+ },
97613
+ {
97614
+ "epoch": 15.882051282051282,
97615
+ "grad_norm": 0.18805000185966492,
97616
+ "learning_rate": 5.468122980564833e-06,
97617
+ "loss": 0.7202,
97618
+ "step": 13944
97619
+ },
97620
+ {
97621
+ "epoch": 15.883190883190883,
97622
+ "grad_norm": 0.2164195030927658,
97623
+ "learning_rate": 5.465215120094067e-06,
97624
+ "loss": 0.5933,
97625
+ "step": 13945
97626
+ },
97627
+ {
97628
+ "epoch": 15.884330484330484,
97629
+ "grad_norm": 0.1648697853088379,
97630
+ "learning_rate": 5.462307938120103e-06,
97631
+ "loss": 0.7291,
97632
+ "step": 13946
97633
+ },
97634
+ {
97635
+ "epoch": 15.885470085470086,
97636
+ "grad_norm": 0.2595181465148926,
97637
+ "learning_rate": 5.459401434743911e-06,
97638
+ "loss": 0.4453,
97639
+ "step": 13947
97640
+ },
97641
+ {
97642
+ "epoch": 15.886609686609686,
97643
+ "grad_norm": 0.18670688569545746,
97644
+ "learning_rate": 5.456495610066442e-06,
97645
+ "loss": 0.6382,
97646
+ "step": 13948
97647
+ },
97648
+ {
97649
+ "epoch": 15.887749287749287,
97650
+ "grad_norm": 0.20756377279758453,
97651
+ "learning_rate": 5.4535904641886265e-06,
97652
+ "loss": 0.808,
97653
+ "step": 13949
97654
+ },
97655
+ {
97656
+ "epoch": 15.88888888888889,
97657
+ "grad_norm": 0.2046612799167633,
97658
+ "learning_rate": 5.450685997211375e-06,
97659
+ "loss": 0.6016,
97660
+ "step": 13950
97661
+ },
97662
+ {
97663
+ "epoch": 15.89002849002849,
97664
+ "grad_norm": 0.24006542563438416,
97665
+ "learning_rate": 5.44778220923555e-06,
97666
+ "loss": 0.5609,
97667
+ "step": 13951
97668
+ },
97669
+ {
97670
+ "epoch": 15.89116809116809,
97671
+ "grad_norm": 0.1908271312713623,
97672
+ "learning_rate": 5.444879100362019e-06,
97673
+ "loss": 0.9716,
97674
+ "step": 13952
97675
+ },
97676
+ {
97677
+ "epoch": 15.892307692307693,
97678
+ "grad_norm": 0.19450271129608154,
97679
+ "learning_rate": 5.441976670691615e-06,
97680
+ "loss": 0.6022,
97681
+ "step": 13953
97682
+ },
97683
+ {
97684
+ "epoch": 15.893447293447293,
97685
+ "grad_norm": 0.23146270215511322,
97686
+ "learning_rate": 5.439074920325149e-06,
97687
+ "loss": 0.6238,
97688
+ "step": 13954
97689
+ },
97690
+ {
97691
+ "epoch": 15.894586894586894,
97692
+ "grad_norm": 0.2440861463546753,
97693
+ "learning_rate": 5.436173849363393e-06,
97694
+ "loss": 0.5213,
97695
+ "step": 13955
97696
+ },
97697
+ {
97698
+ "epoch": 15.895726495726496,
97699
+ "grad_norm": 0.1997850388288498,
97700
+ "learning_rate": 5.433273457907126e-06,
97701
+ "loss": 0.7222,
97702
+ "step": 13956
97703
+ },
97704
+ {
97705
+ "epoch": 15.896866096866097,
97706
+ "grad_norm": 0.21447142958641052,
97707
+ "learning_rate": 5.430373746057088e-06,
97708
+ "loss": 0.664,
97709
+ "step": 13957
97710
+ },
97711
+ {
97712
+ "epoch": 15.898005698005697,
97713
+ "grad_norm": 0.22905848920345306,
97714
+ "learning_rate": 5.42747471391398e-06,
97715
+ "loss": 0.8078,
97716
+ "step": 13958
97717
+ },
97718
+ {
97719
+ "epoch": 15.8991452991453,
97720
+ "grad_norm": 0.22671881318092346,
97721
+ "learning_rate": 5.424576361578499e-06,
97722
+ "loss": 0.6527,
97723
+ "step": 13959
97724
+ },
97725
+ {
97726
+ "epoch": 15.9002849002849,
97727
+ "grad_norm": 0.18757790327072144,
97728
+ "learning_rate": 5.421678689151313e-06,
97729
+ "loss": 0.6938,
97730
+ "step": 13960
97731
+ },
97732
+ {
97733
+ "epoch": 15.9014245014245,
97734
+ "grad_norm": 0.21530726552009583,
97735
+ "learning_rate": 5.418781696733074e-06,
97736
+ "loss": 0.6772,
97737
+ "step": 13961
97738
+ },
97739
+ {
97740
+ "epoch": 15.902564102564103,
97741
+ "grad_norm": 0.21243935823440552,
97742
+ "learning_rate": 5.415885384424388e-06,
97743
+ "loss": 0.5114,
97744
+ "step": 13962
97745
+ },
97746
+ {
97747
+ "epoch": 15.903703703703703,
97748
+ "grad_norm": 0.2314883917570114,
97749
+ "learning_rate": 5.412989752325862e-06,
97750
+ "loss": 0.55,
97751
+ "step": 13963
97752
+ },
97753
+ {
97754
+ "epoch": 15.904843304843304,
97755
+ "grad_norm": 0.16625399887561798,
97756
+ "learning_rate": 5.410094800538062e-06,
97757
+ "loss": 0.7069,
97758
+ "step": 13964
97759
+ },
97760
+ {
97761
+ "epoch": 15.905982905982906,
97762
+ "grad_norm": 0.24676908552646637,
97763
+ "learning_rate": 5.407200529161552e-06,
97764
+ "loss": 0.5306,
97765
+ "step": 13965
97766
+ },
97767
+ {
97768
+ "epoch": 15.907122507122507,
97769
+ "grad_norm": 0.1859494149684906,
97770
+ "learning_rate": 5.404306938296832e-06,
97771
+ "loss": 0.6454,
97772
+ "step": 13966
97773
+ },
97774
+ {
97775
+ "epoch": 15.908262108262107,
97776
+ "grad_norm": 0.18464453518390656,
97777
+ "learning_rate": 5.4014140280444296e-06,
97778
+ "loss": 0.9086,
97779
+ "step": 13967
97780
+ },
97781
+ {
97782
+ "epoch": 15.90940170940171,
97783
+ "grad_norm": 0.2452298402786255,
97784
+ "learning_rate": 5.398521798504813e-06,
97785
+ "loss": 0.6533,
97786
+ "step": 13968
97787
+ },
97788
+ {
97789
+ "epoch": 15.91054131054131,
97790
+ "grad_norm": 0.19522514939308167,
97791
+ "learning_rate": 5.3956302497784466e-06,
97792
+ "loss": 0.6949,
97793
+ "step": 13969
97794
+ },
97795
+ {
97796
+ "epoch": 15.91168091168091,
97797
+ "grad_norm": 0.23665203154087067,
97798
+ "learning_rate": 5.392739381965744e-06,
97799
+ "loss": 0.5759,
97800
+ "step": 13970
97801
+ },
97802
+ {
97803
+ "epoch": 15.912820512820513,
97804
+ "grad_norm": 0.1601344347000122,
97805
+ "learning_rate": 5.389849195167127e-06,
97806
+ "loss": 0.6828,
97807
+ "step": 13971
97808
+ },
97809
+ {
97810
+ "epoch": 15.913960113960114,
97811
+ "grad_norm": 0.23100726306438446,
97812
+ "learning_rate": 5.386959689482973e-06,
97813
+ "loss": 0.4421,
97814
+ "step": 13972
97815
+ },
97816
+ {
97817
+ "epoch": 15.915099715099714,
97818
+ "grad_norm": 0.19434283673763275,
97819
+ "learning_rate": 5.384070865013652e-06,
97820
+ "loss": 0.7509,
97821
+ "step": 13973
97822
+ },
97823
+ {
97824
+ "epoch": 15.916239316239317,
97825
+ "grad_norm": 0.25062304735183716,
97826
+ "learning_rate": 5.3811827218594874e-06,
97827
+ "loss": 0.3491,
97828
+ "step": 13974
97829
+ },
97830
+ {
97831
+ "epoch": 15.917378917378917,
97832
+ "grad_norm": 0.18781554698944092,
97833
+ "learning_rate": 5.3782952601208e-06,
97834
+ "loss": 0.7871,
97835
+ "step": 13975
97836
+ },
97837
+ {
97838
+ "epoch": 15.918518518518518,
97839
+ "grad_norm": 0.20198141038417816,
97840
+ "learning_rate": 5.3754084798978754e-06,
97841
+ "loss": 0.5347,
97842
+ "step": 13976
97843
+ },
97844
+ {
97845
+ "epoch": 15.91965811965812,
97846
+ "grad_norm": 0.2023342251777649,
97847
+ "learning_rate": 5.372522381290984e-06,
97848
+ "loss": 0.777,
97849
+ "step": 13977
97850
+ },
97851
+ {
97852
+ "epoch": 15.92079772079772,
97853
+ "grad_norm": 0.17312119901180267,
97854
+ "learning_rate": 5.3696369644003654e-06,
97855
+ "loss": 0.5332,
97856
+ "step": 13978
97857
+ },
97858
+ {
97859
+ "epoch": 15.921937321937321,
97860
+ "grad_norm": 0.2182493656873703,
97861
+ "learning_rate": 5.366752229326241e-06,
97862
+ "loss": 0.6611,
97863
+ "step": 13979
97864
+ },
97865
+ {
97866
+ "epoch": 15.923076923076923,
97867
+ "grad_norm": 0.21295364201068878,
97868
+ "learning_rate": 5.363868176168807e-06,
97869
+ "loss": 0.6564,
97870
+ "step": 13980
97871
+ },
97872
+ {
97873
+ "epoch": 15.924216524216524,
97874
+ "grad_norm": 0.24037398397922516,
97875
+ "learning_rate": 5.360984805028227e-06,
97876
+ "loss": 0.5228,
97877
+ "step": 13981
97878
+ },
97879
+ {
97880
+ "epoch": 15.925356125356124,
97881
+ "grad_norm": 0.19931496679782867,
97882
+ "learning_rate": 5.3581021160046486e-06,
97883
+ "loss": 0.5939,
97884
+ "step": 13982
97885
+ },
97886
+ {
97887
+ "epoch": 15.926495726495727,
97888
+ "grad_norm": 0.19876810908317566,
97889
+ "learning_rate": 5.355220109198203e-06,
97890
+ "loss": 0.5636,
97891
+ "step": 13983
97892
+ },
97893
+ {
97894
+ "epoch": 15.927635327635327,
97895
+ "grad_norm": 0.19912245869636536,
97896
+ "learning_rate": 5.352338784708991e-06,
97897
+ "loss": 0.404,
97898
+ "step": 13984
97899
+ },
97900
+ {
97901
+ "epoch": 15.928774928774928,
97902
+ "grad_norm": 0.2745014429092407,
97903
+ "learning_rate": 5.349458142637076e-06,
97904
+ "loss": 0.4361,
97905
+ "step": 13985
97906
+ },
97907
+ {
97908
+ "epoch": 15.92991452991453,
97909
+ "grad_norm": 0.19565151631832123,
97910
+ "learning_rate": 5.34657818308252e-06,
97911
+ "loss": 0.7867,
97912
+ "step": 13986
97913
+ },
97914
+ {
97915
+ "epoch": 15.93105413105413,
97916
+ "grad_norm": 0.1679268777370453,
97917
+ "learning_rate": 5.343698906145353e-06,
97918
+ "loss": 0.6697,
97919
+ "step": 13987
97920
+ },
97921
+ {
97922
+ "epoch": 15.932193732193731,
97923
+ "grad_norm": 0.17869971692562103,
97924
+ "learning_rate": 5.340820311925576e-06,
97925
+ "loss": 0.7855,
97926
+ "step": 13988
97927
+ },
97928
+ {
97929
+ "epoch": 15.933333333333334,
97930
+ "grad_norm": 0.19192154705524445,
97931
+ "learning_rate": 5.337942400523174e-06,
97932
+ "loss": 0.7875,
97933
+ "step": 13989
97934
+ },
97935
+ {
97936
+ "epoch": 15.934472934472934,
97937
+ "grad_norm": 0.27600088715553284,
97938
+ "learning_rate": 5.335065172038101e-06,
97939
+ "loss": 0.4982,
97940
+ "step": 13990
97941
+ },
97942
+ {
97943
+ "epoch": 15.935612535612536,
97944
+ "grad_norm": 0.21197514235973358,
97945
+ "learning_rate": 5.3321886265703035e-06,
97946
+ "loss": 0.6903,
97947
+ "step": 13991
97948
+ },
97949
+ {
97950
+ "epoch": 15.936752136752137,
97951
+ "grad_norm": 0.20699726045131683,
97952
+ "learning_rate": 5.329312764219671e-06,
97953
+ "loss": 0.642,
97954
+ "step": 13992
97955
+ },
97956
+ {
97957
+ "epoch": 15.937891737891738,
97958
+ "grad_norm": 0.1790648102760315,
97959
+ "learning_rate": 5.326437585086102e-06,
97960
+ "loss": 0.8265,
97961
+ "step": 13993
97962
+ },
97963
+ {
97964
+ "epoch": 15.93903133903134,
97965
+ "grad_norm": 0.2005932480096817,
97966
+ "learning_rate": 5.323563089269459e-06,
97967
+ "loss": 0.6435,
97968
+ "step": 13994
97969
+ },
97970
+ {
97971
+ "epoch": 15.94017094017094,
97972
+ "grad_norm": 0.22714604437351227,
97973
+ "learning_rate": 5.320689276869586e-06,
97974
+ "loss": 0.3956,
97975
+ "step": 13995
97976
+ },
97977
+ {
97978
+ "epoch": 15.941310541310541,
97979
+ "grad_norm": 0.23463019728660583,
97980
+ "learning_rate": 5.317816147986287e-06,
97981
+ "loss": 0.7648,
97982
+ "step": 13996
97983
+ },
97984
+ {
97985
+ "epoch": 15.942450142450143,
97986
+ "grad_norm": 0.20150230824947357,
97987
+ "learning_rate": 5.314943702719361e-06,
97988
+ "loss": 0.7364,
97989
+ "step": 13997
97990
+ },
97991
+ {
97992
+ "epoch": 15.943589743589744,
97993
+ "grad_norm": 0.19925189018249512,
97994
+ "learning_rate": 5.312071941168572e-06,
97995
+ "loss": 0.6879,
97996
+ "step": 13998
97997
+ },
97998
+ {
97999
+ "epoch": 15.944729344729344,
98000
+ "grad_norm": 0.22107404470443726,
98001
+ "learning_rate": 5.309200863433667e-06,
98002
+ "loss": 0.5548,
98003
+ "step": 13999
98004
+ },
98005
+ {
98006
+ "epoch": 15.945868945868947,
98007
+ "grad_norm": 0.15994355082511902,
98008
+ "learning_rate": 5.3063304696143655e-06,
98009
+ "loss": 0.5801,
98010
+ "step": 14000
98011
+ },
98012
+ {
98013
+ "epoch": 15.947008547008547,
98014
+ "grad_norm": 0.17757205665111542,
98015
+ "learning_rate": 5.303460759810366e-06,
98016
+ "loss": 0.7727,
98017
+ "step": 14001
98018
+ },
98019
+ {
98020
+ "epoch": 15.948148148148148,
98021
+ "grad_norm": 0.2089131623506546,
98022
+ "learning_rate": 5.300591734121338e-06,
98023
+ "loss": 0.5178,
98024
+ "step": 14002
98025
+ },
98026
+ {
98027
+ "epoch": 15.94928774928775,
98028
+ "grad_norm": 0.1872301995754242,
98029
+ "learning_rate": 5.297723392646942e-06,
98030
+ "loss": 0.7741,
98031
+ "step": 14003
98032
+ },
98033
+ {
98034
+ "epoch": 15.95042735042735,
98035
+ "grad_norm": 0.18554730713367462,
98036
+ "learning_rate": 5.294855735486784e-06,
98037
+ "loss": 0.7561,
98038
+ "step": 14004
98039
+ },
98040
+ {
98041
+ "epoch": 15.951566951566951,
98042
+ "grad_norm": 0.16955998539924622,
98043
+ "learning_rate": 5.291988762740477e-06,
98044
+ "loss": 0.7384,
98045
+ "step": 14005
98046
+ },
98047
+ {
98048
+ "epoch": 15.952706552706553,
98049
+ "grad_norm": 0.18044869601726532,
98050
+ "learning_rate": 5.289122474507599e-06,
98051
+ "loss": 0.7132,
98052
+ "step": 14006
98053
+ },
98054
+ {
98055
+ "epoch": 15.953846153846154,
98056
+ "grad_norm": 0.26764407753944397,
98057
+ "learning_rate": 5.286256870887707e-06,
98058
+ "loss": 0.4791,
98059
+ "step": 14007
98060
+ },
98061
+ {
98062
+ "epoch": 15.954985754985755,
98063
+ "grad_norm": 0.19423027336597443,
98064
+ "learning_rate": 5.283391951980324e-06,
98065
+ "loss": 0.5809,
98066
+ "step": 14008
98067
+ },
98068
+ {
98069
+ "epoch": 15.956125356125357,
98070
+ "grad_norm": 0.2263379991054535,
98071
+ "learning_rate": 5.280527717884956e-06,
98072
+ "loss": 0.5008,
98073
+ "step": 14009
98074
+ },
98075
+ {
98076
+ "epoch": 15.957264957264957,
98077
+ "grad_norm": 0.25239741802215576,
98078
+ "learning_rate": 5.277664168701088e-06,
98079
+ "loss": 0.6367,
98080
+ "step": 14010
98081
+ },
98082
+ {
98083
+ "epoch": 15.958404558404558,
98084
+ "grad_norm": 0.16616038978099823,
98085
+ "learning_rate": 5.274801304528182e-06,
98086
+ "loss": 0.7751,
98087
+ "step": 14011
98088
+ },
98089
+ {
98090
+ "epoch": 15.95954415954416,
98091
+ "grad_norm": 0.23653560876846313,
98092
+ "learning_rate": 5.271939125465672e-06,
98093
+ "loss": 0.6919,
98094
+ "step": 14012
98095
+ },
98096
+ {
98097
+ "epoch": 15.96068376068376,
98098
+ "grad_norm": 0.23039647936820984,
98099
+ "learning_rate": 5.269077631612967e-06,
98100
+ "loss": 0.6409,
98101
+ "step": 14013
98102
+ },
98103
+ {
98104
+ "epoch": 15.961823361823361,
98105
+ "grad_norm": 0.1734488606452942,
98106
+ "learning_rate": 5.2662168230694645e-06,
98107
+ "loss": 0.8209,
98108
+ "step": 14014
98109
+ },
98110
+ {
98111
+ "epoch": 15.962962962962964,
98112
+ "grad_norm": 0.22865897417068481,
98113
+ "learning_rate": 5.263356699934513e-06,
98114
+ "loss": 0.3679,
98115
+ "step": 14015
98116
+ },
98117
+ {
98118
+ "epoch": 15.964102564102564,
98119
+ "grad_norm": 0.20063550770282745,
98120
+ "learning_rate": 5.260497262307456e-06,
98121
+ "loss": 0.7736,
98122
+ "step": 14016
98123
+ },
98124
+ {
98125
+ "epoch": 15.965242165242165,
98126
+ "grad_norm": 0.17283949255943298,
98127
+ "learning_rate": 5.2576385102876155e-06,
98128
+ "loss": 0.7293,
98129
+ "step": 14017
98130
+ },
98131
+ {
98132
+ "epoch": 15.966381766381767,
98133
+ "grad_norm": 0.30130085349082947,
98134
+ "learning_rate": 5.254780443974289e-06,
98135
+ "loss": 0.53,
98136
+ "step": 14018
98137
+ },
98138
+ {
98139
+ "epoch": 15.967521367521368,
98140
+ "grad_norm": 0.22147002816200256,
98141
+ "learning_rate": 5.2519230634667295e-06,
98142
+ "loss": 0.4588,
98143
+ "step": 14019
98144
+ },
98145
+ {
98146
+ "epoch": 15.968660968660968,
98147
+ "grad_norm": 0.21945025026798248,
98148
+ "learning_rate": 5.249066368864189e-06,
98149
+ "loss": 0.4553,
98150
+ "step": 14020
98151
+ },
98152
+ {
98153
+ "epoch": 15.96980056980057,
98154
+ "grad_norm": 0.22448524832725525,
98155
+ "learning_rate": 5.246210360265888e-06,
98156
+ "loss": 0.4898,
98157
+ "step": 14021
98158
+ },
98159
+ {
98160
+ "epoch": 15.970940170940171,
98161
+ "grad_norm": 0.21025843918323517,
98162
+ "learning_rate": 5.243355037771028e-06,
98163
+ "loss": 0.554,
98164
+ "step": 14022
98165
+ },
98166
+ {
98167
+ "epoch": 15.972079772079772,
98168
+ "grad_norm": 0.315996915102005,
98169
+ "learning_rate": 5.240500401478774e-06,
98170
+ "loss": 0.5518,
98171
+ "step": 14023
98172
+ },
98173
+ {
98174
+ "epoch": 15.973219373219374,
98175
+ "grad_norm": 0.22453878819942474,
98176
+ "learning_rate": 5.237646451488282e-06,
98177
+ "loss": 0.6244,
98178
+ "step": 14024
98179
+ },
98180
+ {
98181
+ "epoch": 15.974358974358974,
98182
+ "grad_norm": 0.22095011174678802,
98183
+ "learning_rate": 5.234793187898682e-06,
98184
+ "loss": 0.7162,
98185
+ "step": 14025
98186
+ },
98187
+ {
98188
+ "epoch": 15.975498575498575,
98189
+ "grad_norm": 0.20376168191432953,
98190
+ "learning_rate": 5.231940610809063e-06,
98191
+ "loss": 0.563,
98192
+ "step": 14026
98193
+ },
98194
+ {
98195
+ "epoch": 15.976638176638177,
98196
+ "grad_norm": 0.21002911031246185,
98197
+ "learning_rate": 5.229088720318507e-06,
98198
+ "loss": 0.5615,
98199
+ "step": 14027
98200
+ },
98201
+ {
98202
+ "epoch": 15.977777777777778,
98203
+ "grad_norm": 0.22033274173736572,
98204
+ "learning_rate": 5.226237516526072e-06,
98205
+ "loss": 0.6323,
98206
+ "step": 14028
98207
+ },
98208
+ {
98209
+ "epoch": 15.978917378917378,
98210
+ "grad_norm": 0.24790626764297485,
98211
+ "learning_rate": 5.223386999530791e-06,
98212
+ "loss": 0.518,
98213
+ "step": 14029
98214
+ },
98215
+ {
98216
+ "epoch": 15.98005698005698,
98217
+ "grad_norm": 0.17082878947257996,
98218
+ "learning_rate": 5.2205371694316606e-06,
98219
+ "loss": 0.7464,
98220
+ "step": 14030
98221
+ },
98222
+ {
98223
+ "epoch": 15.981196581196581,
98224
+ "grad_norm": 0.2568652331829071,
98225
+ "learning_rate": 5.217688026327666e-06,
98226
+ "loss": 0.6016,
98227
+ "step": 14031
98228
+ },
98229
+ {
98230
+ "epoch": 15.982336182336182,
98231
+ "grad_norm": 0.17630797624588013,
98232
+ "learning_rate": 5.21483957031777e-06,
98233
+ "loss": 0.5679,
98234
+ "step": 14032
98235
+ },
98236
+ {
98237
+ "epoch": 15.983475783475784,
98238
+ "grad_norm": 0.2258668839931488,
98239
+ "learning_rate": 5.2119918015009036e-06,
98240
+ "loss": 0.5935,
98241
+ "step": 14033
98242
+ },
98243
+ {
98244
+ "epoch": 15.984615384615385,
98245
+ "grad_norm": 0.18470749258995056,
98246
+ "learning_rate": 5.209144719975981e-06,
98247
+ "loss": 0.8318,
98248
+ "step": 14034
98249
+ },
98250
+ {
98251
+ "epoch": 15.985754985754985,
98252
+ "grad_norm": 0.19650182127952576,
98253
+ "learning_rate": 5.206298325841885e-06,
98254
+ "loss": 0.7677,
98255
+ "step": 14035
98256
+ },
98257
+ {
98258
+ "epoch": 15.986894586894588,
98259
+ "grad_norm": 0.18926526606082916,
98260
+ "learning_rate": 5.203452619197488e-06,
98261
+ "loss": 0.7555,
98262
+ "step": 14036
98263
+ },
98264
+ {
98265
+ "epoch": 15.988034188034188,
98266
+ "grad_norm": 0.24393440783023834,
98267
+ "learning_rate": 5.200607600141619e-06,
98268
+ "loss": 0.5547,
98269
+ "step": 14037
98270
+ },
98271
+ {
98272
+ "epoch": 15.989173789173789,
98273
+ "grad_norm": 0.2429366260766983,
98274
+ "learning_rate": 5.197763268773093e-06,
98275
+ "loss": 0.4874,
98276
+ "step": 14038
98277
+ },
98278
+ {
98279
+ "epoch": 15.990313390313391,
98280
+ "grad_norm": 0.19000403583049774,
98281
+ "learning_rate": 5.194919625190706e-06,
98282
+ "loss": 0.6184,
98283
+ "step": 14039
98284
+ },
98285
+ {
98286
+ "epoch": 15.991452991452991,
98287
+ "grad_norm": 0.21088244020938873,
98288
+ "learning_rate": 5.192076669493231e-06,
98289
+ "loss": 0.4815,
98290
+ "step": 14040
98291
+ },
98292
+ {
98293
+ "epoch": 15.992592592592592,
98294
+ "grad_norm": 0.17279838025569916,
98295
+ "learning_rate": 5.1892344017794e-06,
98296
+ "loss": 0.5564,
98297
+ "step": 14041
98298
+ },
98299
+ {
98300
+ "epoch": 15.993732193732194,
98301
+ "grad_norm": 0.24155011773109436,
98302
+ "learning_rate": 5.186392822147934e-06,
98303
+ "loss": 0.6469,
98304
+ "step": 14042
98305
+ },
98306
+ {
98307
+ "epoch": 15.994871794871795,
98308
+ "grad_norm": 0.23470929265022278,
98309
+ "learning_rate": 5.1835519306975305e-06,
98310
+ "loss": 0.5868,
98311
+ "step": 14043
98312
+ },
98313
+ {
98314
+ "epoch": 15.996011396011395,
98315
+ "grad_norm": 0.17550434172153473,
98316
+ "learning_rate": 5.180711727526877e-06,
98317
+ "loss": 0.6681,
98318
+ "step": 14044
98319
+ },
98320
+ {
98321
+ "epoch": 15.997150997150998,
98322
+ "grad_norm": 0.188474640250206,
98323
+ "learning_rate": 5.1778722127346e-06,
98324
+ "loss": 0.7394,
98325
+ "step": 14045
98326
+ },
98327
+ {
98328
+ "epoch": 15.998290598290598,
98329
+ "grad_norm": 0.27023807168006897,
98330
+ "learning_rate": 5.1750333864193315e-06,
98331
+ "loss": 0.4689,
98332
+ "step": 14046
98333
+ },
98334
+ {
98335
+ "epoch": 15.999430199430199,
98336
+ "grad_norm": 0.2202572524547577,
98337
+ "learning_rate": 5.1721952486796736e-06,
98338
+ "loss": 0.7047,
98339
+ "step": 14047
98340
+ },
98341
+ {
98342
+ "epoch": 16.0,
98343
+ "grad_norm": 0.3407000005245209,
98344
+ "learning_rate": 5.169357799614208e-06,
98345
+ "loss": 1.1228,
98346
+ "step": 14048
98347
+ },
98348
+ {
98349
+ "epoch": 16.001139601139602,
98350
+ "grad_norm": 0.17774610221385956,
98351
+ "learning_rate": 5.166521039321473e-06,
98352
+ "loss": 0.6193,
98353
+ "step": 14049
98354
+ },
98355
+ {
98356
+ "epoch": 16.0022792022792,
98357
+ "grad_norm": 0.1734650731086731,
98358
+ "learning_rate": 5.163684967900007e-06,
98359
+ "loss": 0.8355,
98360
+ "step": 14050
98361
+ },
98362
+ {
98363
+ "epoch": 16.003418803418803,
98364
+ "grad_norm": 0.18855872750282288,
98365
+ "learning_rate": 5.16084958544831e-06,
98366
+ "loss": 0.595,
98367
+ "step": 14051
98368
+ },
98369
+ {
98370
+ "epoch": 16.004558404558406,
98371
+ "grad_norm": 0.17589649558067322,
98372
+ "learning_rate": 5.1580148920648715e-06,
98373
+ "loss": 0.932,
98374
+ "step": 14052
98375
+ },
98376
+ {
98377
+ "epoch": 16.005698005698004,
98378
+ "grad_norm": 0.1905054897069931,
98379
+ "learning_rate": 5.155180887848135e-06,
98380
+ "loss": 0.6708,
98381
+ "step": 14053
98382
+ },
98383
+ {
98384
+ "epoch": 16.006837606837607,
98385
+ "grad_norm": 0.203451007604599,
98386
+ "learning_rate": 5.152347572896535e-06,
98387
+ "loss": 0.6731,
98388
+ "step": 14054
98389
+ },
98390
+ {
98391
+ "epoch": 16.00797720797721,
98392
+ "grad_norm": 0.1736338585615158,
98393
+ "learning_rate": 5.149514947308495e-06,
98394
+ "loss": 0.7534,
98395
+ "step": 14055
98396
+ },
98397
+ {
98398
+ "epoch": 16.009116809116808,
98399
+ "grad_norm": 0.18473488092422485,
98400
+ "learning_rate": 5.146683011182388e-06,
98401
+ "loss": 0.5515,
98402
+ "step": 14056
98403
+ },
98404
+ {
98405
+ "epoch": 16.01025641025641,
98406
+ "grad_norm": 0.19602778553962708,
98407
+ "learning_rate": 5.143851764616572e-06,
98408
+ "loss": 0.4613,
98409
+ "step": 14057
98410
+ },
98411
+ {
98412
+ "epoch": 16.011396011396013,
98413
+ "grad_norm": 0.1879139393568039,
98414
+ "learning_rate": 5.14102120770939e-06,
98415
+ "loss": 0.5354,
98416
+ "step": 14058
98417
+ },
98418
+ {
98419
+ "epoch": 16.01253561253561,
98420
+ "grad_norm": 0.21041598916053772,
98421
+ "learning_rate": 5.138191340559162e-06,
98422
+ "loss": 0.4618,
98423
+ "step": 14059
98424
+ },
98425
+ {
98426
+ "epoch": 16.013675213675214,
98427
+ "grad_norm": 0.19878196716308594,
98428
+ "learning_rate": 5.1353621632641625e-06,
98429
+ "loss": 0.581,
98430
+ "step": 14060
98431
+ },
98432
+ {
98433
+ "epoch": 16.014814814814816,
98434
+ "grad_norm": 0.21880239248275757,
98435
+ "learning_rate": 5.13253367592266e-06,
98436
+ "loss": 0.6558,
98437
+ "step": 14061
98438
+ },
98439
+ {
98440
+ "epoch": 16.015954415954415,
98441
+ "grad_norm": 0.21958042681217194,
98442
+ "learning_rate": 5.129705878632901e-06,
98443
+ "loss": 0.6628,
98444
+ "step": 14062
98445
+ },
98446
+ {
98447
+ "epoch": 16.017094017094017,
98448
+ "grad_norm": 0.1946631819009781,
98449
+ "learning_rate": 5.126878771493107e-06,
98450
+ "loss": 0.7045,
98451
+ "step": 14063
98452
+ },
98453
+ {
98454
+ "epoch": 16.01823361823362,
98455
+ "grad_norm": 0.19736795127391815,
98456
+ "learning_rate": 5.124052354601458e-06,
98457
+ "loss": 0.9372,
98458
+ "step": 14064
98459
+ },
98460
+ {
98461
+ "epoch": 16.019373219373218,
98462
+ "grad_norm": 0.17857308685779572,
98463
+ "learning_rate": 5.1212266280561225e-06,
98464
+ "loss": 0.5775,
98465
+ "step": 14065
98466
+ },
98467
+ {
98468
+ "epoch": 16.02051282051282,
98469
+ "grad_norm": 0.1766318827867508,
98470
+ "learning_rate": 5.118401591955269e-06,
98471
+ "loss": 0.77,
98472
+ "step": 14066
98473
+ },
98474
+ {
98475
+ "epoch": 16.021652421652423,
98476
+ "grad_norm": 0.16231514513492584,
98477
+ "learning_rate": 5.115577246396991e-06,
98478
+ "loss": 0.6573,
98479
+ "step": 14067
98480
+ },
98481
+ {
98482
+ "epoch": 16.02279202279202,
98483
+ "grad_norm": 0.21469400823116302,
98484
+ "learning_rate": 5.112753591479402e-06,
98485
+ "loss": 0.5812,
98486
+ "step": 14068
98487
+ },
98488
+ {
98489
+ "epoch": 16.023931623931624,
98490
+ "grad_norm": 0.1972481906414032,
98491
+ "learning_rate": 5.109930627300569e-06,
98492
+ "loss": 0.7897,
98493
+ "step": 14069
98494
+ },
98495
+ {
98496
+ "epoch": 16.025071225071226,
98497
+ "grad_norm": 0.20298215746879578,
98498
+ "learning_rate": 5.107108353958551e-06,
98499
+ "loss": 0.4743,
98500
+ "step": 14070
98501
+ },
98502
+ {
98503
+ "epoch": 16.026210826210825,
98504
+ "grad_norm": 0.20357541739940643,
98505
+ "learning_rate": 5.104286771551356e-06,
98506
+ "loss": 0.5591,
98507
+ "step": 14071
98508
+ },
98509
+ {
98510
+ "epoch": 16.027350427350427,
98511
+ "grad_norm": 0.24694040417671204,
98512
+ "learning_rate": 5.101465880176998e-06,
98513
+ "loss": 0.6901,
98514
+ "step": 14072
98515
+ },
98516
+ {
98517
+ "epoch": 16.02849002849003,
98518
+ "grad_norm": 0.19539234042167664,
98519
+ "learning_rate": 5.098645679933451e-06,
98520
+ "loss": 0.7029,
98521
+ "step": 14073
98522
+ },
98523
+ {
98524
+ "epoch": 16.02962962962963,
98525
+ "grad_norm": 0.19871358573436737,
98526
+ "learning_rate": 5.095826170918674e-06,
98527
+ "loss": 0.7805,
98528
+ "step": 14074
98529
+ },
98530
+ {
98531
+ "epoch": 16.03076923076923,
98532
+ "grad_norm": 0.17123480141162872,
98533
+ "learning_rate": 5.093007353230584e-06,
98534
+ "loss": 0.5841,
98535
+ "step": 14075
98536
+ },
98537
+ {
98538
+ "epoch": 16.031908831908833,
98539
+ "grad_norm": 0.18302345275878906,
98540
+ "learning_rate": 5.090189226967085e-06,
98541
+ "loss": 0.5956,
98542
+ "step": 14076
98543
+ },
98544
+ {
98545
+ "epoch": 16.03304843304843,
98546
+ "grad_norm": 0.1640847623348236,
98547
+ "learning_rate": 5.087371792226084e-06,
98548
+ "loss": 0.8549,
98549
+ "step": 14077
98550
+ },
98551
+ {
98552
+ "epoch": 16.034188034188034,
98553
+ "grad_norm": 0.22754384577274323,
98554
+ "learning_rate": 5.08455504910541e-06,
98555
+ "loss": 0.6512,
98556
+ "step": 14078
98557
+ },
98558
+ {
98559
+ "epoch": 16.035327635327636,
98560
+ "grad_norm": 0.2154824435710907,
98561
+ "learning_rate": 5.081738997702909e-06,
98562
+ "loss": 0.6456,
98563
+ "step": 14079
98564
+ },
98565
+ {
98566
+ "epoch": 16.036467236467235,
98567
+ "grad_norm": 0.19186750054359436,
98568
+ "learning_rate": 5.078923638116387e-06,
98569
+ "loss": 0.7609,
98570
+ "step": 14080
98571
+ },
98572
+ {
98573
+ "epoch": 16.037606837606837,
98574
+ "grad_norm": 0.21435517072677612,
98575
+ "learning_rate": 5.07610897044363e-06,
98576
+ "loss": 0.6266,
98577
+ "step": 14081
98578
+ },
98579
+ {
98580
+ "epoch": 16.03874643874644,
98581
+ "grad_norm": 0.25843796133995056,
98582
+ "learning_rate": 5.073294994782407e-06,
98583
+ "loss": 0.3241,
98584
+ "step": 14082
98585
+ },
98586
+ {
98587
+ "epoch": 16.03988603988604,
98588
+ "grad_norm": 0.2305968701839447,
98589
+ "learning_rate": 5.070481711230441e-06,
98590
+ "loss": 0.646,
98591
+ "step": 14083
98592
+ },
98593
+ {
98594
+ "epoch": 16.04102564102564,
98595
+ "grad_norm": 0.1847716122865677,
98596
+ "learning_rate": 5.0676691198854485e-06,
98597
+ "loss": 0.7921,
98598
+ "step": 14084
98599
+ },
98600
+ {
98601
+ "epoch": 16.042165242165243,
98602
+ "grad_norm": 0.19992592930793762,
98603
+ "learning_rate": 5.0648572208451235e-06,
98604
+ "loss": 0.4748,
98605
+ "step": 14085
98606
+ },
98607
+ {
98608
+ "epoch": 16.043304843304842,
98609
+ "grad_norm": 0.22528088092803955,
98610
+ "learning_rate": 5.062046014207136e-06,
98611
+ "loss": 0.4691,
98612
+ "step": 14086
98613
+ },
98614
+ {
98615
+ "epoch": 16.044444444444444,
98616
+ "grad_norm": 0.23095327615737915,
98617
+ "learning_rate": 5.059235500069106e-06,
98618
+ "loss": 0.6832,
98619
+ "step": 14087
98620
+ },
98621
+ {
98622
+ "epoch": 16.045584045584047,
98623
+ "grad_norm": 0.2332068383693695,
98624
+ "learning_rate": 5.056425678528673e-06,
98625
+ "loss": 0.3386,
98626
+ "step": 14088
98627
+ },
98628
+ {
98629
+ "epoch": 16.046723646723645,
98630
+ "grad_norm": 0.16111916303634644,
98631
+ "learning_rate": 5.053616549683427e-06,
98632
+ "loss": 0.7517,
98633
+ "step": 14089
98634
+ },
98635
+ {
98636
+ "epoch": 16.047863247863248,
98637
+ "grad_norm": 0.2254699319601059,
98638
+ "learning_rate": 5.050808113630925e-06,
98639
+ "loss": 0.6548,
98640
+ "step": 14090
98641
+ },
98642
+ {
98643
+ "epoch": 16.04900284900285,
98644
+ "grad_norm": 0.21958674490451813,
98645
+ "learning_rate": 5.048000370468717e-06,
98646
+ "loss": 0.5918,
98647
+ "step": 14091
98648
+ },
98649
+ {
98650
+ "epoch": 16.05014245014245,
98651
+ "grad_norm": 0.1744617074728012,
98652
+ "learning_rate": 5.045193320294323e-06,
98653
+ "loss": 0.7799,
98654
+ "step": 14092
98655
+ },
98656
+ {
98657
+ "epoch": 16.05128205128205,
98658
+ "grad_norm": 0.22298863530158997,
98659
+ "learning_rate": 5.0423869632052475e-06,
98660
+ "loss": 0.4798,
98661
+ "step": 14093
98662
+ },
98663
+ {
98664
+ "epoch": 16.052421652421653,
98665
+ "grad_norm": 0.22837935388088226,
98666
+ "learning_rate": 5.0395812992989535e-06,
98667
+ "loss": 0.5823,
98668
+ "step": 14094
98669
+ },
98670
+ {
98671
+ "epoch": 16.053561253561252,
98672
+ "grad_norm": 0.21403367817401886,
98673
+ "learning_rate": 5.0367763286728875e-06,
98674
+ "loss": 0.6041,
98675
+ "step": 14095
98676
+ },
98677
+ {
98678
+ "epoch": 16.054700854700855,
98679
+ "grad_norm": 0.21587662398815155,
98680
+ "learning_rate": 5.033972051424482e-06,
98681
+ "loss": 0.3409,
98682
+ "step": 14096
98683
+ },
98684
+ {
98685
+ "epoch": 16.055840455840457,
98686
+ "grad_norm": 0.18038514256477356,
98687
+ "learning_rate": 5.03116846765114e-06,
98688
+ "loss": 0.7024,
98689
+ "step": 14097
98690
+ },
98691
+ {
98692
+ "epoch": 16.056980056980056,
98693
+ "grad_norm": 0.2202543467283249,
98694
+ "learning_rate": 5.028365577450217e-06,
98695
+ "loss": 0.507,
98696
+ "step": 14098
98697
+ },
98698
+ {
98699
+ "epoch": 16.058119658119658,
98700
+ "grad_norm": 0.22714479267597198,
98701
+ "learning_rate": 5.025563380919088e-06,
98702
+ "loss": 0.2714,
98703
+ "step": 14099
98704
+ },
98705
+ {
98706
+ "epoch": 16.05925925925926,
98707
+ "grad_norm": 0.18912501633167267,
98708
+ "learning_rate": 5.02276187815508e-06,
98709
+ "loss": 0.7436,
98710
+ "step": 14100
98711
  }
98712
  ],
98713
  "logging_steps": 1,
 
98727
  "attributes": {}
98728
  }
98729
  },
98730
+ "total_flos": 7.883484268832244e+19,
98731
  "train_batch_size": 8,
98732
  "trial_name": null,
98733
  "trial_params": null