ErrorAI commited on
Commit
13974a1
·
verified ·
1 Parent(s): 1affd1a

Training in progress, step 855, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d56806e729b1bc09c41529ad3fe6db99652883082d9030a3c4e6464cd0e2d271
3
  size 80792096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7c51bb5b178a5a10f48262d602f1dd1f4c878efc0c8a71a5d20d462d4a57ff0
3
  size 80792096
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:635bf289efd5388be0eab256cf78422a54f54489201556ff30a9b967913901c3
3
  size 41460084
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21c1662d9d0c088f2c11816e081cfaf2a1cb3633e4eb4346dd8344524118189b
3
  size 41460084
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7697c1a796ff5bc9124b3a3e352ec168a526045980cfb17913ff562d542d311f
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04ffb892767ac33d9d888c16470956f7387ee6fe3b220b4a2dee598697bb8026
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fdc912e78a9c43914f40c2bd1be4d0e78da1022f2fd58c13611a30d1991d4b7d
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27537edb793eaf638a01a0f3e3d9d913d146711fb62c3555e6abdde4209a80fa
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.7508771929824561,
5
  "eval_steps": 214,
6
- "global_step": 642,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -4533,6 +4533,1497 @@
4533
  "eval_samples_per_second": 31.569,
4534
  "eval_steps_per_second": 15.828,
4535
  "step": 642
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4536
  }
4537
  ],
4538
  "logging_steps": 1,
@@ -4547,12 +6038,12 @@
4547
  "should_evaluate": false,
4548
  "should_log": false,
4549
  "should_save": true,
4550
- "should_training_stop": false
4551
  },
4552
  "attributes": {}
4553
  }
4554
  },
4555
- "total_flos": 1.241628182839296e+17,
4556
  "train_batch_size": 2,
4557
  "trial_name": null,
4558
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 214,
6
+ "global_step": 855,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
4533
  "eval_samples_per_second": 31.569,
4534
  "eval_steps_per_second": 15.828,
4535
  "step": 642
4536
+ },
4537
+ {
4538
+ "epoch": 0.752046783625731,
4539
+ "grad_norm": 0.4816937744617462,
4540
+ "learning_rate": 2.948676587993834e-05,
4541
+ "loss": 1.2186,
4542
+ "step": 643
4543
+ },
4544
+ {
4545
+ "epoch": 0.7532163742690059,
4546
+ "grad_norm": 0.6862644553184509,
4547
+ "learning_rate": 2.922362931409851e-05,
4548
+ "loss": 1.6134,
4549
+ "step": 644
4550
+ },
4551
+ {
4552
+ "epoch": 0.7543859649122807,
4553
+ "grad_norm": 0.453056663274765,
4554
+ "learning_rate": 2.8961471052904852e-05,
4555
+ "loss": 1.8659,
4556
+ "step": 645
4557
+ },
4558
+ {
4559
+ "epoch": 0.7555555555555555,
4560
+ "grad_norm": 1.0425024032592773,
4561
+ "learning_rate": 2.8700294720033483e-05,
4562
+ "loss": 1.5892,
4563
+ "step": 646
4564
+ },
4565
+ {
4566
+ "epoch": 0.7567251461988304,
4567
+ "grad_norm": 0.7134360671043396,
4568
+ "learning_rate": 2.84401039255879e-05,
4569
+ "loss": 1.535,
4570
+ "step": 647
4571
+ },
4572
+ {
4573
+ "epoch": 0.7578947368421053,
4574
+ "grad_norm": 0.3440370261669159,
4575
+ "learning_rate": 2.8180902266048948e-05,
4576
+ "loss": 1.6847,
4577
+ "step": 648
4578
+ },
4579
+ {
4580
+ "epoch": 0.7590643274853801,
4581
+ "grad_norm": 0.458816260099411,
4582
+ "learning_rate": 2.7922693324225168e-05,
4583
+ "loss": 1.7274,
4584
+ "step": 649
4585
+ },
4586
+ {
4587
+ "epoch": 0.7602339181286549,
4588
+ "grad_norm": 0.7467341423034668,
4589
+ "learning_rate": 2.766548066920338e-05,
4590
+ "loss": 1.1546,
4591
+ "step": 650
4592
+ },
4593
+ {
4594
+ "epoch": 0.7614035087719299,
4595
+ "grad_norm": 0.8349908590316772,
4596
+ "learning_rate": 2.7409267856299147e-05,
4597
+ "loss": 1.5008,
4598
+ "step": 651
4599
+ },
4600
+ {
4601
+ "epoch": 0.7625730994152047,
4602
+ "grad_norm": 0.9590814113616943,
4603
+ "learning_rate": 2.715405842700782e-05,
4604
+ "loss": 0.9238,
4605
+ "step": 652
4606
+ },
4607
+ {
4608
+ "epoch": 0.7637426900584795,
4609
+ "grad_norm": 0.43833237886428833,
4610
+ "learning_rate": 2.6899855908955464e-05,
4611
+ "loss": 1.0527,
4612
+ "step": 653
4613
+ },
4614
+ {
4615
+ "epoch": 0.7649122807017544,
4616
+ "grad_norm": 0.34467437863349915,
4617
+ "learning_rate": 2.6646663815850092e-05,
4618
+ "loss": 1.5622,
4619
+ "step": 654
4620
+ },
4621
+ {
4622
+ "epoch": 0.7660818713450293,
4623
+ "grad_norm": 0.42286065220832825,
4624
+ "learning_rate": 2.6394485647433277e-05,
4625
+ "loss": 1.6389,
4626
+ "step": 655
4627
+ },
4628
+ {
4629
+ "epoch": 0.7672514619883041,
4630
+ "grad_norm": 0.34687599539756775,
4631
+ "learning_rate": 2.614332488943152e-05,
4632
+ "loss": 1.4794,
4633
+ "step": 656
4634
+ },
4635
+ {
4636
+ "epoch": 0.7684210526315789,
4637
+ "grad_norm": 0.46390998363494873,
4638
+ "learning_rate": 2.5893185013508194e-05,
4639
+ "loss": 1.5318,
4640
+ "step": 657
4641
+ },
4642
+ {
4643
+ "epoch": 0.7695906432748538,
4644
+ "grad_norm": 0.43160146474838257,
4645
+ "learning_rate": 2.564406947721566e-05,
4646
+ "loss": 1.0743,
4647
+ "step": 658
4648
+ },
4649
+ {
4650
+ "epoch": 0.7707602339181286,
4651
+ "grad_norm": 0.41312840580940247,
4652
+ "learning_rate": 2.539598172394727e-05,
4653
+ "loss": 2.2316,
4654
+ "step": 659
4655
+ },
4656
+ {
4657
+ "epoch": 0.7719298245614035,
4658
+ "grad_norm": 0.30184900760650635,
4659
+ "learning_rate": 2.514892518288988e-05,
4660
+ "loss": 1.9576,
4661
+ "step": 660
4662
+ },
4663
+ {
4664
+ "epoch": 0.7730994152046784,
4665
+ "grad_norm": 0.30861276388168335,
4666
+ "learning_rate": 2.490290326897653e-05,
4667
+ "loss": 1.7012,
4668
+ "step": 661
4669
+ },
4670
+ {
4671
+ "epoch": 0.7742690058479532,
4672
+ "grad_norm": 0.7322350740432739,
4673
+ "learning_rate": 2.4657919382839033e-05,
4674
+ "loss": 1.1043,
4675
+ "step": 662
4676
+ },
4677
+ {
4678
+ "epoch": 0.775438596491228,
4679
+ "grad_norm": 0.5217798948287964,
4680
+ "learning_rate": 2.4413976910761116e-05,
4681
+ "loss": 1.1938,
4682
+ "step": 663
4683
+ },
4684
+ {
4685
+ "epoch": 0.776608187134503,
4686
+ "grad_norm": 0.4567161798477173,
4687
+ "learning_rate": 2.4171079224631686e-05,
4688
+ "loss": 1.4945,
4689
+ "step": 664
4690
+ },
4691
+ {
4692
+ "epoch": 0.7777777777777778,
4693
+ "grad_norm": 0.4545726776123047,
4694
+ "learning_rate": 2.3929229681898003e-05,
4695
+ "loss": 1.7862,
4696
+ "step": 665
4697
+ },
4698
+ {
4699
+ "epoch": 0.7789473684210526,
4700
+ "grad_norm": 0.5831303596496582,
4701
+ "learning_rate": 2.3688431625519415e-05,
4702
+ "loss": 1.3472,
4703
+ "step": 666
4704
+ },
4705
+ {
4706
+ "epoch": 0.7801169590643274,
4707
+ "grad_norm": 0.25880950689315796,
4708
+ "learning_rate": 2.3448688383921182e-05,
4709
+ "loss": 1.5453,
4710
+ "step": 667
4711
+ },
4712
+ {
4713
+ "epoch": 0.7812865497076024,
4714
+ "grad_norm": 0.5524426698684692,
4715
+ "learning_rate": 2.3210003270948365e-05,
4716
+ "loss": 1.894,
4717
+ "step": 668
4718
+ },
4719
+ {
4720
+ "epoch": 0.7824561403508772,
4721
+ "grad_norm": 0.375410258769989,
4722
+ "learning_rate": 2.2972379585820048e-05,
4723
+ "loss": 2.255,
4724
+ "step": 669
4725
+ },
4726
+ {
4727
+ "epoch": 0.783625730994152,
4728
+ "grad_norm": 0.8052981495857239,
4729
+ "learning_rate": 2.2735820613083834e-05,
4730
+ "loss": 2.1831,
4731
+ "step": 670
4732
+ },
4733
+ {
4734
+ "epoch": 0.7847953216374269,
4735
+ "grad_norm": 0.4980434775352478,
4736
+ "learning_rate": 2.250032962257029e-05,
4737
+ "loss": 0.8607,
4738
+ "step": 671
4739
+ },
4740
+ {
4741
+ "epoch": 0.7859649122807018,
4742
+ "grad_norm": 0.33631038665771484,
4743
+ "learning_rate": 2.2265909869347825e-05,
4744
+ "loss": 2.0506,
4745
+ "step": 672
4746
+ },
4747
+ {
4748
+ "epoch": 0.7871345029239766,
4749
+ "grad_norm": 0.47964972257614136,
4750
+ "learning_rate": 2.2032564593677774e-05,
4751
+ "loss": 1.4278,
4752
+ "step": 673
4753
+ },
4754
+ {
4755
+ "epoch": 0.7883040935672515,
4756
+ "grad_norm": 0.532234251499176,
4757
+ "learning_rate": 2.1800297020969463e-05,
4758
+ "loss": 1.7963,
4759
+ "step": 674
4760
+ },
4761
+ {
4762
+ "epoch": 0.7894736842105263,
4763
+ "grad_norm": 0.6282833218574524,
4764
+ "learning_rate": 2.1569110361735677e-05,
4765
+ "loss": 1.0567,
4766
+ "step": 675
4767
+ },
4768
+ {
4769
+ "epoch": 0.7906432748538011,
4770
+ "grad_norm": 0.43357419967651367,
4771
+ "learning_rate": 2.1339007811548395e-05,
4772
+ "loss": 1.755,
4773
+ "step": 676
4774
+ },
4775
+ {
4776
+ "epoch": 0.791812865497076,
4777
+ "grad_norm": 0.33183780312538147,
4778
+ "learning_rate": 2.110999255099444e-05,
4779
+ "loss": 1.5154,
4780
+ "step": 677
4781
+ },
4782
+ {
4783
+ "epoch": 0.7929824561403509,
4784
+ "grad_norm": 0.6004055142402649,
4785
+ "learning_rate": 2.0882067745631605e-05,
4786
+ "loss": 1.5257,
4787
+ "step": 678
4788
+ },
4789
+ {
4790
+ "epoch": 0.7941520467836257,
4791
+ "grad_norm": 0.4232582151889801,
4792
+ "learning_rate": 2.0655236545944966e-05,
4793
+ "loss": 2.1641,
4794
+ "step": 679
4795
+ },
4796
+ {
4797
+ "epoch": 0.7953216374269005,
4798
+ "grad_norm": 0.4737587571144104,
4799
+ "learning_rate": 2.0429502087303164e-05,
4800
+ "loss": 0.7159,
4801
+ "step": 680
4802
+ },
4803
+ {
4804
+ "epoch": 0.7964912280701755,
4805
+ "grad_norm": 0.36999645829200745,
4806
+ "learning_rate": 2.0204867489915258e-05,
4807
+ "loss": 1.9891,
4808
+ "step": 681
4809
+ },
4810
+ {
4811
+ "epoch": 0.7976608187134503,
4812
+ "grad_norm": 0.4537879526615143,
4813
+ "learning_rate": 1.998133585878743e-05,
4814
+ "loss": 1.393,
4815
+ "step": 682
4816
+ },
4817
+ {
4818
+ "epoch": 0.7988304093567251,
4819
+ "grad_norm": 0.33755865693092346,
4820
+ "learning_rate": 1.9758910283680132e-05,
4821
+ "loss": 2.0622,
4822
+ "step": 683
4823
+ },
4824
+ {
4825
+ "epoch": 0.8,
4826
+ "grad_norm": 0.7674013376235962,
4827
+ "learning_rate": 1.9537593839065483e-05,
4828
+ "loss": 1.2588,
4829
+ "step": 684
4830
+ },
4831
+ {
4832
+ "epoch": 0.8011695906432749,
4833
+ "grad_norm": 0.38102641701698303,
4834
+ "learning_rate": 1.9317389584084568e-05,
4835
+ "loss": 1.3908,
4836
+ "step": 685
4837
+ },
4838
+ {
4839
+ "epoch": 0.8023391812865497,
4840
+ "grad_norm": 0.7196425199508667,
4841
+ "learning_rate": 1.9098300562505266e-05,
4842
+ "loss": 1.1806,
4843
+ "step": 686
4844
+ },
4845
+ {
4846
+ "epoch": 0.8035087719298246,
4847
+ "grad_norm": 1.120670199394226,
4848
+ "learning_rate": 1.888032980268025e-05,
4849
+ "loss": 1.5959,
4850
+ "step": 687
4851
+ },
4852
+ {
4853
+ "epoch": 0.8046783625730994,
4854
+ "grad_norm": 0.35074350237846375,
4855
+ "learning_rate": 1.8663480317504988e-05,
4856
+ "loss": 1.3714,
4857
+ "step": 688
4858
+ },
4859
+ {
4860
+ "epoch": 0.8058479532163743,
4861
+ "grad_norm": 0.5168773531913757,
4862
+ "learning_rate": 1.844775510437613e-05,
4863
+ "loss": 1.1505,
4864
+ "step": 689
4865
+ },
4866
+ {
4867
+ "epoch": 0.8070175438596491,
4868
+ "grad_norm": 0.3984200060367584,
4869
+ "learning_rate": 1.823315714515018e-05,
4870
+ "loss": 1.3853,
4871
+ "step": 690
4872
+ },
4873
+ {
4874
+ "epoch": 0.808187134502924,
4875
+ "grad_norm": 0.3879501223564148,
4876
+ "learning_rate": 1.8019689406102126e-05,
4877
+ "loss": 1.9904,
4878
+ "step": 691
4879
+ },
4880
+ {
4881
+ "epoch": 0.8093567251461988,
4882
+ "grad_norm": 0.5114380121231079,
4883
+ "learning_rate": 1.780735483788458e-05,
4884
+ "loss": 1.6035,
4885
+ "step": 692
4886
+ },
4887
+ {
4888
+ "epoch": 0.8105263157894737,
4889
+ "grad_norm": 0.5603318810462952,
4890
+ "learning_rate": 1.7596156375486862e-05,
4891
+ "loss": 1.2323,
4892
+ "step": 693
4893
+ },
4894
+ {
4895
+ "epoch": 0.8116959064327486,
4896
+ "grad_norm": 0.44819122552871704,
4897
+ "learning_rate": 1.7386096938194585e-05,
4898
+ "loss": 1.8956,
4899
+ "step": 694
4900
+ },
4901
+ {
4902
+ "epoch": 0.8128654970760234,
4903
+ "grad_norm": 0.625581681728363,
4904
+ "learning_rate": 1.717717942954914e-05,
4905
+ "loss": 1.8117,
4906
+ "step": 695
4907
+ },
4908
+ {
4909
+ "epoch": 0.8140350877192982,
4910
+ "grad_norm": 0.5764881372451782,
4911
+ "learning_rate": 1.6969406737307625e-05,
4912
+ "loss": 1.9899,
4913
+ "step": 696
4914
+ },
4915
+ {
4916
+ "epoch": 0.8152046783625732,
4917
+ "grad_norm": 1.0194729566574097,
4918
+ "learning_rate": 1.6762781733403033e-05,
4919
+ "loss": 1.4643,
4920
+ "step": 697
4921
+ },
4922
+ {
4923
+ "epoch": 0.816374269005848,
4924
+ "grad_norm": 0.5557692050933838,
4925
+ "learning_rate": 1.6557307273904354e-05,
4926
+ "loss": 1.3692,
4927
+ "step": 698
4928
+ },
4929
+ {
4930
+ "epoch": 0.8175438596491228,
4931
+ "grad_norm": 0.5595893263816833,
4932
+ "learning_rate": 1.6352986198977325e-05,
4933
+ "loss": 1.707,
4934
+ "step": 699
4935
+ },
4936
+ {
4937
+ "epoch": 0.8187134502923976,
4938
+ "grad_norm": 0.4992705285549164,
4939
+ "learning_rate": 1.614982133284495e-05,
4940
+ "loss": 1.7299,
4941
+ "step": 700
4942
+ },
4943
+ {
4944
+ "epoch": 0.8198830409356725,
4945
+ "grad_norm": 0.6664674282073975,
4946
+ "learning_rate": 1.5947815483748574e-05,
4947
+ "loss": 1.5864,
4948
+ "step": 701
4949
+ },
4950
+ {
4951
+ "epoch": 0.8210526315789474,
4952
+ "grad_norm": 0.5072648525238037,
4953
+ "learning_rate": 1.574697144390914e-05,
4954
+ "loss": 1.9092,
4955
+ "step": 702
4956
+ },
4957
+ {
4958
+ "epoch": 0.8222222222222222,
4959
+ "grad_norm": 0.5502724051475525,
4960
+ "learning_rate": 1.5547291989488444e-05,
4961
+ "loss": 1.8396,
4962
+ "step": 703
4963
+ },
4964
+ {
4965
+ "epoch": 0.8233918128654971,
4966
+ "grad_norm": 0.4326912760734558,
4967
+ "learning_rate": 1.534877988055081e-05,
4968
+ "loss": 0.9664,
4969
+ "step": 704
4970
+ },
4971
+ {
4972
+ "epoch": 0.8245614035087719,
4973
+ "grad_norm": 0.3104591965675354,
4974
+ "learning_rate": 1.515143786102503e-05,
4975
+ "loss": 1.6584,
4976
+ "step": 705
4977
+ },
4978
+ {
4979
+ "epoch": 0.8257309941520468,
4980
+ "grad_norm": 0.6304602026939392,
4981
+ "learning_rate": 1.49552686586663e-05,
4982
+ "loss": 1.1046,
4983
+ "step": 706
4984
+ },
4985
+ {
4986
+ "epoch": 0.8269005847953217,
4987
+ "grad_norm": 0.34990617632865906,
4988
+ "learning_rate": 1.4760274985018618e-05,
4989
+ "loss": 1.7938,
4990
+ "step": 707
4991
+ },
4992
+ {
4993
+ "epoch": 0.8280701754385965,
4994
+ "grad_norm": 0.5276802778244019,
4995
+ "learning_rate": 1.4566459535377252e-05,
4996
+ "loss": 1.4686,
4997
+ "step": 708
4998
+ },
4999
+ {
5000
+ "epoch": 0.8292397660818713,
5001
+ "grad_norm": 0.6775768399238586,
5002
+ "learning_rate": 1.4373824988751471e-05,
5003
+ "loss": 1.1786,
5004
+ "step": 709
5005
+ },
5006
+ {
5007
+ "epoch": 0.8304093567251462,
5008
+ "grad_norm": 0.6719319820404053,
5009
+ "learning_rate": 1.4182374007827603e-05,
5010
+ "loss": 1.5589,
5011
+ "step": 710
5012
+ },
5013
+ {
5014
+ "epoch": 0.8315789473684211,
5015
+ "grad_norm": 0.531869649887085,
5016
+ "learning_rate": 1.3992109238932105e-05,
5017
+ "loss": 2.1176,
5018
+ "step": 711
5019
+ },
5020
+ {
5021
+ "epoch": 0.8327485380116959,
5022
+ "grad_norm": 0.41226381063461304,
5023
+ "learning_rate": 1.3803033311995072e-05,
5024
+ "loss": 1.7806,
5025
+ "step": 712
5026
+ },
5027
+ {
5028
+ "epoch": 0.8339181286549707,
5029
+ "grad_norm": 0.6923168301582336,
5030
+ "learning_rate": 1.3615148840513881e-05,
5031
+ "loss": 0.6939,
5032
+ "step": 713
5033
+ },
5034
+ {
5035
+ "epoch": 0.8350877192982457,
5036
+ "grad_norm": 0.32801055908203125,
5037
+ "learning_rate": 1.3428458421517032e-05,
5038
+ "loss": 2.2524,
5039
+ "step": 714
5040
+ },
5041
+ {
5042
+ "epoch": 0.8362573099415205,
5043
+ "grad_norm": 0.5189927816390991,
5044
+ "learning_rate": 1.324296463552821e-05,
5045
+ "loss": 1.3771,
5046
+ "step": 715
5047
+ },
5048
+ {
5049
+ "epoch": 0.8374269005847953,
5050
+ "grad_norm": 0.42794156074523926,
5051
+ "learning_rate": 1.3058670046530775e-05,
5052
+ "loss": 2.0186,
5053
+ "step": 716
5054
+ },
5055
+ {
5056
+ "epoch": 0.8385964912280702,
5057
+ "grad_norm": 0.55548495054245,
5058
+ "learning_rate": 1.2875577201932132e-05,
5059
+ "loss": 1.6141,
5060
+ "step": 717
5061
+ },
5062
+ {
5063
+ "epoch": 0.839766081871345,
5064
+ "grad_norm": 0.4545220732688904,
5065
+ "learning_rate": 1.2693688632528622e-05,
5066
+ "loss": 1.3453,
5067
+ "step": 718
5068
+ },
5069
+ {
5070
+ "epoch": 0.8409356725146199,
5071
+ "grad_norm": 0.6801018118858337,
5072
+ "learning_rate": 1.2513006852470555e-05,
5073
+ "loss": 1.2189,
5074
+ "step": 719
5075
+ },
5076
+ {
5077
+ "epoch": 0.8421052631578947,
5078
+ "grad_norm": 1.013476848602295,
5079
+ "learning_rate": 1.2333534359227384e-05,
5080
+ "loss": 1.9237,
5081
+ "step": 720
5082
+ },
5083
+ {
5084
+ "epoch": 0.8432748538011696,
5085
+ "grad_norm": 0.5349302291870117,
5086
+ "learning_rate": 1.215527363355322e-05,
5087
+ "loss": 0.9729,
5088
+ "step": 721
5089
+ },
5090
+ {
5091
+ "epoch": 0.8444444444444444,
5092
+ "grad_norm": 0.5281980633735657,
5093
+ "learning_rate": 1.1978227139452625e-05,
5094
+ "loss": 1.7217,
5095
+ "step": 722
5096
+ },
5097
+ {
5098
+ "epoch": 0.8456140350877193,
5099
+ "grad_norm": 0.5399391055107117,
5100
+ "learning_rate": 1.1802397324146374e-05,
5101
+ "loss": 1.7664,
5102
+ "step": 723
5103
+ },
5104
+ {
5105
+ "epoch": 0.8467836257309942,
5106
+ "grad_norm": 0.8852502703666687,
5107
+ "learning_rate": 1.1627786618037762e-05,
5108
+ "loss": 0.5362,
5109
+ "step": 724
5110
+ },
5111
+ {
5112
+ "epoch": 0.847953216374269,
5113
+ "grad_norm": 0.45778024196624756,
5114
+ "learning_rate": 1.1454397434679021e-05,
5115
+ "loss": 2.0303,
5116
+ "step": 725
5117
+ },
5118
+ {
5119
+ "epoch": 0.8491228070175438,
5120
+ "grad_norm": 0.44471511244773865,
5121
+ "learning_rate": 1.128223217073786e-05,
5122
+ "loss": 1.9459,
5123
+ "step": 726
5124
+ },
5125
+ {
5126
+ "epoch": 0.8502923976608188,
5127
+ "grad_norm": 0.5390698909759521,
5128
+ "learning_rate": 1.1111293205964412e-05,
5129
+ "loss": 1.0607,
5130
+ "step": 727
5131
+ },
5132
+ {
5133
+ "epoch": 0.8514619883040936,
5134
+ "grad_norm": 0.44605326652526855,
5135
+ "learning_rate": 1.0941582903158343e-05,
5136
+ "loss": 1.8983,
5137
+ "step": 728
5138
+ },
5139
+ {
5140
+ "epoch": 0.8526315789473684,
5141
+ "grad_norm": 0.42205414175987244,
5142
+ "learning_rate": 1.0773103608136126e-05,
5143
+ "loss": 1.8101,
5144
+ "step": 729
5145
+ },
5146
+ {
5147
+ "epoch": 0.8538011695906432,
5148
+ "grad_norm": 0.3277212977409363,
5149
+ "learning_rate": 1.0605857649698669e-05,
5150
+ "loss": 0.8485,
5151
+ "step": 730
5152
+ },
5153
+ {
5154
+ "epoch": 0.8549707602339182,
5155
+ "grad_norm": 0.6796973347663879,
5156
+ "learning_rate": 1.0439847339599174e-05,
5157
+ "loss": 1.2562,
5158
+ "step": 731
5159
+ },
5160
+ {
5161
+ "epoch": 0.856140350877193,
5162
+ "grad_norm": 0.4901828169822693,
5163
+ "learning_rate": 1.0275074972511034e-05,
5164
+ "loss": 1.4787,
5165
+ "step": 732
5166
+ },
5167
+ {
5168
+ "epoch": 0.8573099415204678,
5169
+ "grad_norm": 0.38275349140167236,
5170
+ "learning_rate": 1.0111542825996245e-05,
5171
+ "loss": 2.1962,
5172
+ "step": 733
5173
+ },
5174
+ {
5175
+ "epoch": 0.8584795321637427,
5176
+ "grad_norm": 0.5255086421966553,
5177
+ "learning_rate": 9.949253160473915e-06,
5178
+ "loss": 1.5463,
5179
+ "step": 734
5180
+ },
5181
+ {
5182
+ "epoch": 0.8596491228070176,
5183
+ "grad_norm": 0.49351197481155396,
5184
+ "learning_rate": 9.788208219188932e-06,
5185
+ "loss": 1.4428,
5186
+ "step": 735
5187
+ },
5188
+ {
5189
+ "epoch": 0.8608187134502924,
5190
+ "grad_norm": 0.6366466879844666,
5191
+ "learning_rate": 9.628410228181084e-06,
5192
+ "loss": 1.8165,
5193
+ "step": 736
5194
+ },
5195
+ {
5196
+ "epoch": 0.8619883040935673,
5197
+ "grad_norm": 0.6942663788795471,
5198
+ "learning_rate": 9.469861396254153e-06,
5199
+ "loss": 1.1972,
5200
+ "step": 737
5201
+ },
5202
+ {
5203
+ "epoch": 0.8631578947368421,
5204
+ "grad_norm": 0.3593437075614929,
5205
+ "learning_rate": 9.31256391494546e-06,
5206
+ "loss": 1.6453,
5207
+ "step": 738
5208
+ },
5209
+ {
5210
+ "epoch": 0.8643274853801169,
5211
+ "grad_norm": 0.5943644642829895,
5212
+ "learning_rate": 9.156519958495602e-06,
5213
+ "loss": 1.2414,
5214
+ "step": 739
5215
+ },
5216
+ {
5217
+ "epoch": 0.8654970760233918,
5218
+ "grad_norm": 0.41532373428344727,
5219
+ "learning_rate": 9.001731683818337e-06,
5220
+ "loss": 1.4155,
5221
+ "step": 740
5222
+ },
5223
+ {
5224
+ "epoch": 0.8666666666666667,
5225
+ "grad_norm": 0.5489165782928467,
5226
+ "learning_rate": 8.848201230470776e-06,
5227
+ "loss": 1.839,
5228
+ "step": 741
5229
+ },
5230
+ {
5231
+ "epoch": 0.8678362573099415,
5232
+ "grad_norm": 0.6106531023979187,
5233
+ "learning_rate": 8.695930720623857e-06,
5234
+ "loss": 1.9636,
5235
+ "step": 742
5236
+ },
5237
+ {
5238
+ "epoch": 0.8690058479532163,
5239
+ "grad_norm": 0.39489027857780457,
5240
+ "learning_rate": 8.54492225903295e-06,
5241
+ "loss": 0.6472,
5242
+ "step": 743
5243
+ },
5244
+ {
5245
+ "epoch": 0.8701754385964913,
5246
+ "grad_norm": 0.4068044424057007,
5247
+ "learning_rate": 8.395177933008802e-06,
5248
+ "loss": 1.7972,
5249
+ "step": 744
5250
+ },
5251
+ {
5252
+ "epoch": 0.8713450292397661,
5253
+ "grad_norm": 0.5814440846443176,
5254
+ "learning_rate": 8.246699812388714e-06,
5255
+ "loss": 1.1727,
5256
+ "step": 745
5257
+ },
5258
+ {
5259
+ "epoch": 0.8725146198830409,
5260
+ "grad_norm": 0.5304514765739441,
5261
+ "learning_rate": 8.099489949507843e-06,
5262
+ "loss": 1.2336,
5263
+ "step": 746
5264
+ },
5265
+ {
5266
+ "epoch": 0.8736842105263158,
5267
+ "grad_norm": 0.45569664239883423,
5268
+ "learning_rate": 7.953550379170893e-06,
5269
+ "loss": 1.5075,
5270
+ "step": 747
5271
+ },
5272
+ {
5273
+ "epoch": 0.8748538011695907,
5274
+ "grad_norm": 0.43300554156303406,
5275
+ "learning_rate": 7.80888311862401e-06,
5276
+ "loss": 1.5435,
5277
+ "step": 748
5278
+ },
5279
+ {
5280
+ "epoch": 0.8760233918128655,
5281
+ "grad_norm": 0.5079240798950195,
5282
+ "learning_rate": 7.665490167526856e-06,
5283
+ "loss": 2.0697,
5284
+ "step": 749
5285
+ },
5286
+ {
5287
+ "epoch": 0.8771929824561403,
5288
+ "grad_norm": 0.5567116737365723,
5289
+ "learning_rate": 7.523373507924947e-06,
5290
+ "loss": 1.7936,
5291
+ "step": 750
5292
+ },
5293
+ {
5294
+ "epoch": 0.8783625730994152,
5295
+ "grad_norm": 0.5715877413749695,
5296
+ "learning_rate": 7.382535104222366e-06,
5297
+ "loss": 1.4835,
5298
+ "step": 751
5299
+ },
5300
+ {
5301
+ "epoch": 0.87953216374269,
5302
+ "grad_norm": 0.4187256693840027,
5303
+ "learning_rate": 7.242976903154442e-06,
5304
+ "loss": 1.5308,
5305
+ "step": 752
5306
+ },
5307
+ {
5308
+ "epoch": 0.8807017543859649,
5309
+ "grad_norm": 0.5101755857467651,
5310
+ "learning_rate": 7.104700833761013e-06,
5311
+ "loss": 1.5338,
5312
+ "step": 753
5313
+ },
5314
+ {
5315
+ "epoch": 0.8818713450292398,
5316
+ "grad_norm": 0.41772782802581787,
5317
+ "learning_rate": 6.967708807359663e-06,
5318
+ "loss": 1.8949,
5319
+ "step": 754
5320
+ },
5321
+ {
5322
+ "epoch": 0.8830409356725146,
5323
+ "grad_norm": 0.4892766773700714,
5324
+ "learning_rate": 6.8320027175192706e-06,
5325
+ "loss": 1.1041,
5326
+ "step": 755
5327
+ },
5328
+ {
5329
+ "epoch": 0.8842105263157894,
5330
+ "grad_norm": 0.5039598345756531,
5331
+ "learning_rate": 6.697584440033988e-06,
5332
+ "loss": 0.749,
5333
+ "step": 756
5334
+ },
5335
+ {
5336
+ "epoch": 0.8853801169590644,
5337
+ "grad_norm": 0.5067999958992004,
5338
+ "learning_rate": 6.564455832897099e-06,
5339
+ "loss": 1.0401,
5340
+ "step": 757
5341
+ },
5342
+ {
5343
+ "epoch": 0.8865497076023392,
5344
+ "grad_norm": 0.7063804864883423,
5345
+ "learning_rate": 6.432618736275553e-06,
5346
+ "loss": 1.4961,
5347
+ "step": 758
5348
+ },
5349
+ {
5350
+ "epoch": 0.887719298245614,
5351
+ "grad_norm": 0.303244024515152,
5352
+ "learning_rate": 6.302074972484362e-06,
5353
+ "loss": 0.8921,
5354
+ "step": 759
5355
+ },
5356
+ {
5357
+ "epoch": 0.8888888888888888,
5358
+ "grad_norm": 0.49389979243278503,
5359
+ "learning_rate": 6.1728263459614796e-06,
5360
+ "loss": 1.9556,
5361
+ "step": 760
5362
+ },
5363
+ {
5364
+ "epoch": 0.8900584795321638,
5365
+ "grad_norm": 0.34622922539711,
5366
+ "learning_rate": 6.044874643242904e-06,
5367
+ "loss": 0.6971,
5368
+ "step": 761
5369
+ },
5370
+ {
5371
+ "epoch": 0.8912280701754386,
5372
+ "grad_norm": 0.3411808907985687,
5373
+ "learning_rate": 5.9182216329378705e-06,
5374
+ "loss": 1.7343,
5375
+ "step": 762
5376
+ },
5377
+ {
5378
+ "epoch": 0.8923976608187134,
5379
+ "grad_norm": 0.3007182776927948,
5380
+ "learning_rate": 5.7928690657045535e-06,
5381
+ "loss": 1.4504,
5382
+ "step": 763
5383
+ },
5384
+ {
5385
+ "epoch": 0.8935672514619883,
5386
+ "grad_norm": 0.36203962564468384,
5387
+ "learning_rate": 5.668818674225685e-06,
5388
+ "loss": 1.2621,
5389
+ "step": 764
5390
+ },
5391
+ {
5392
+ "epoch": 0.8947368421052632,
5393
+ "grad_norm": 0.4814302623271942,
5394
+ "learning_rate": 5.546072173184791e-06,
5395
+ "loss": 1.1898,
5396
+ "step": 765
5397
+ },
5398
+ {
5399
+ "epoch": 0.895906432748538,
5400
+ "grad_norm": 0.6297724843025208,
5401
+ "learning_rate": 5.424631259242352e-06,
5402
+ "loss": 1.6179,
5403
+ "step": 766
5404
+ },
5405
+ {
5406
+ "epoch": 0.8970760233918129,
5407
+ "grad_norm": 0.47519174218177795,
5408
+ "learning_rate": 5.3044976110124155e-06,
5409
+ "loss": 2.0664,
5410
+ "step": 767
5411
+ },
5412
+ {
5413
+ "epoch": 0.8982456140350877,
5414
+ "grad_norm": 0.3763788938522339,
5415
+ "learning_rate": 5.185672889039394e-06,
5416
+ "loss": 1.6694,
5417
+ "step": 768
5418
+ },
5419
+ {
5420
+ "epoch": 0.8994152046783626,
5421
+ "grad_norm": 0.5559266805648804,
5422
+ "learning_rate": 5.068158735775097e-06,
5423
+ "loss": 1.5592,
5424
+ "step": 769
5425
+ },
5426
+ {
5427
+ "epoch": 0.9005847953216374,
5428
+ "grad_norm": 0.3948187232017517,
5429
+ "learning_rate": 4.951956775555999e-06,
5430
+ "loss": 1.8212,
5431
+ "step": 770
5432
+ },
5433
+ {
5434
+ "epoch": 0.9017543859649123,
5435
+ "grad_norm": 0.40267279744148254,
5436
+ "learning_rate": 4.837068614580875e-06,
5437
+ "loss": 0.8669,
5438
+ "step": 771
5439
+ },
5440
+ {
5441
+ "epoch": 0.9029239766081871,
5442
+ "grad_norm": 0.4965238869190216,
5443
+ "learning_rate": 4.723495840888493e-06,
5444
+ "loss": 1.5074,
5445
+ "step": 772
5446
+ },
5447
+ {
5448
+ "epoch": 0.904093567251462,
5449
+ "grad_norm": 0.6691774129867554,
5450
+ "learning_rate": 4.611240024335706e-06,
5451
+ "loss": 1.5161,
5452
+ "step": 773
5453
+ },
5454
+ {
5455
+ "epoch": 0.9052631578947369,
5456
+ "grad_norm": 0.6021882891654968,
5457
+ "learning_rate": 4.5003027165758215e-06,
5458
+ "loss": 1.5402,
5459
+ "step": 774
5460
+ },
5461
+ {
5462
+ "epoch": 0.9064327485380117,
5463
+ "grad_norm": 0.5868009924888611,
5464
+ "learning_rate": 4.390685451037025e-06,
5465
+ "loss": 1.6296,
5466
+ "step": 775
5467
+ },
5468
+ {
5469
+ "epoch": 0.9076023391812865,
5470
+ "grad_norm": 0.601019024848938,
5471
+ "learning_rate": 4.282389742901283e-06,
5472
+ "loss": 1.1937,
5473
+ "step": 776
5474
+ },
5475
+ {
5476
+ "epoch": 0.9087719298245615,
5477
+ "grad_norm": 0.5701804757118225,
5478
+ "learning_rate": 4.175417089083378e-06,
5479
+ "loss": 1.4124,
5480
+ "step": 777
5481
+ },
5482
+ {
5483
+ "epoch": 0.9099415204678363,
5484
+ "grad_norm": 0.9340035319328308,
5485
+ "learning_rate": 4.069768968210186e-06,
5486
+ "loss": 1.3639,
5487
+ "step": 778
5488
+ },
5489
+ {
5490
+ "epoch": 0.9111111111111111,
5491
+ "grad_norm": 0.3220817446708679,
5492
+ "learning_rate": 3.9654468406002396e-06,
5493
+ "loss": 1.7019,
5494
+ "step": 779
5495
+ },
5496
+ {
5497
+ "epoch": 0.9122807017543859,
5498
+ "grad_norm": 0.5050005316734314,
5499
+ "learning_rate": 3.862452148243622e-06,
5500
+ "loss": 1.6152,
5501
+ "step": 780
5502
+ },
5503
+ {
5504
+ "epoch": 0.9134502923976608,
5505
+ "grad_norm": 0.3310493528842926,
5506
+ "learning_rate": 3.7607863147819166e-06,
5507
+ "loss": 1.5067,
5508
+ "step": 781
5509
+ },
5510
+ {
5511
+ "epoch": 0.9146198830409357,
5512
+ "grad_norm": 0.6780093908309937,
5513
+ "learning_rate": 3.6604507454886083e-06,
5514
+ "loss": 1.029,
5515
+ "step": 782
5516
+ },
5517
+ {
5518
+ "epoch": 0.9157894736842105,
5519
+ "grad_norm": 0.5026019811630249,
5520
+ "learning_rate": 3.561446827249659e-06,
5521
+ "loss": 1.3767,
5522
+ "step": 783
5523
+ },
5524
+ {
5525
+ "epoch": 0.9169590643274854,
5526
+ "grad_norm": 0.6126399040222168,
5527
+ "learning_rate": 3.4637759285442882e-06,
5528
+ "loss": 1.2892,
5529
+ "step": 784
5530
+ },
5531
+ {
5532
+ "epoch": 0.9181286549707602,
5533
+ "grad_norm": 0.5534948110580444,
5534
+ "learning_rate": 3.367439399426087e-06,
5535
+ "loss": 1.2748,
5536
+ "step": 785
5537
+ },
5538
+ {
5539
+ "epoch": 0.9192982456140351,
5540
+ "grad_norm": 0.44695568084716797,
5541
+ "learning_rate": 3.2724385715043883e-06,
5542
+ "loss": 1.696,
5543
+ "step": 786
5544
+ },
5545
+ {
5546
+ "epoch": 0.92046783625731,
5547
+ "grad_norm": 0.4182886779308319,
5548
+ "learning_rate": 3.178774757925762e-06,
5549
+ "loss": 2.054,
5550
+ "step": 787
5551
+ },
5552
+ {
5553
+ "epoch": 0.9216374269005848,
5554
+ "grad_norm": 0.49121958017349243,
5555
+ "learning_rate": 3.0864492533560165e-06,
5556
+ "loss": 1.884,
5557
+ "step": 788
5558
+ },
5559
+ {
5560
+ "epoch": 0.9228070175438596,
5561
+ "grad_norm": 0.5378497242927551,
5562
+ "learning_rate": 2.9954633339621564e-06,
5563
+ "loss": 1.5816,
5564
+ "step": 789
5565
+ },
5566
+ {
5567
+ "epoch": 0.9239766081871345,
5568
+ "grad_norm": 0.3458714187145233,
5569
+ "learning_rate": 2.905818257394799e-06,
5570
+ "loss": 1.9412,
5571
+ "step": 790
5572
+ },
5573
+ {
5574
+ "epoch": 0.9251461988304094,
5575
+ "grad_norm": 0.6665580868721008,
5576
+ "learning_rate": 2.817515262770842e-06,
5577
+ "loss": 0.8671,
5578
+ "step": 791
5579
+ },
5580
+ {
5581
+ "epoch": 0.9263157894736842,
5582
+ "grad_norm": 0.3785383999347687,
5583
+ "learning_rate": 2.7305555706562457e-06,
5584
+ "loss": 1.8415,
5585
+ "step": 792
5586
+ },
5587
+ {
5588
+ "epoch": 0.927485380116959,
5589
+ "grad_norm": 0.41982072591781616,
5590
+ "learning_rate": 2.6449403830492104e-06,
5591
+ "loss": 1.8585,
5592
+ "step": 793
5593
+ },
5594
+ {
5595
+ "epoch": 0.928654970760234,
5596
+ "grad_norm": 0.33484476804733276,
5597
+ "learning_rate": 2.5606708833635917e-06,
5598
+ "loss": 1.5949,
5599
+ "step": 794
5600
+ },
5601
+ {
5602
+ "epoch": 0.9298245614035088,
5603
+ "grad_norm": 0.6077429056167603,
5604
+ "learning_rate": 2.4777482364124695e-06,
5605
+ "loss": 1.1995,
5606
+ "step": 795
5607
+ },
5608
+ {
5609
+ "epoch": 0.9309941520467836,
5610
+ "grad_norm": 0.6394246816635132,
5611
+ "learning_rate": 2.39617358839207e-06,
5612
+ "loss": 1.4034,
5613
+ "step": 796
5614
+ },
5615
+ {
5616
+ "epoch": 0.9321637426900585,
5617
+ "grad_norm": 0.7234786152839661,
5618
+ "learning_rate": 2.315948066866003e-06,
5619
+ "loss": 1.4246,
5620
+ "step": 797
5621
+ },
5622
+ {
5623
+ "epoch": 0.9333333333333333,
5624
+ "grad_norm": 0.6573183536529541,
5625
+ "learning_rate": 2.2370727807495497e-06,
5626
+ "loss": 1.1776,
5627
+ "step": 798
5628
+ },
5629
+ {
5630
+ "epoch": 0.9345029239766082,
5631
+ "grad_norm": 0.5006548166275024,
5632
+ "learning_rate": 2.1595488202944103e-06,
5633
+ "loss": 1.8393,
5634
+ "step": 799
5635
+ },
5636
+ {
5637
+ "epoch": 0.935672514619883,
5638
+ "grad_norm": 0.6787004470825195,
5639
+ "learning_rate": 2.0833772570736375e-06,
5640
+ "loss": 1.5865,
5641
+ "step": 800
5642
+ },
5643
+ {
5644
+ "epoch": 0.9368421052631579,
5645
+ "grad_norm": 0.3747584819793701,
5646
+ "learning_rate": 2.0085591439667927e-06,
5647
+ "loss": 2.2697,
5648
+ "step": 801
5649
+ },
5650
+ {
5651
+ "epoch": 0.9380116959064327,
5652
+ "grad_norm": 0.4233306646347046,
5653
+ "learning_rate": 1.935095515145391e-06,
5654
+ "loss": 1.6865,
5655
+ "step": 802
5656
+ },
5657
+ {
5658
+ "epoch": 0.9391812865497076,
5659
+ "grad_norm": 0.45490002632141113,
5660
+ "learning_rate": 1.8629873860586566e-06,
5661
+ "loss": 1.7815,
5662
+ "step": 803
5663
+ },
5664
+ {
5665
+ "epoch": 0.9403508771929825,
5666
+ "grad_norm": 0.5193967223167419,
5667
+ "learning_rate": 1.7922357534194356e-06,
5668
+ "loss": 1.5525,
5669
+ "step": 804
5670
+ },
5671
+ {
5672
+ "epoch": 0.9415204678362573,
5673
+ "grad_norm": 0.4140605330467224,
5674
+ "learning_rate": 1.7228415951904165e-06,
5675
+ "loss": 1.791,
5676
+ "step": 805
5677
+ },
5678
+ {
5679
+ "epoch": 0.9426900584795321,
5680
+ "grad_norm": 0.7900307774543762,
5681
+ "learning_rate": 1.6548058705706526e-06,
5682
+ "loss": 1.2546,
5683
+ "step": 806
5684
+ },
5685
+ {
5686
+ "epoch": 0.9438596491228071,
5687
+ "grad_norm": 0.3959881663322449,
5688
+ "learning_rate": 1.5881295199822953e-06,
5689
+ "loss": 1.6344,
5690
+ "step": 807
5691
+ },
5692
+ {
5693
+ "epoch": 0.9450292397660819,
5694
+ "grad_norm": 0.6034178733825684,
5695
+ "learning_rate": 1.5228134650575265e-06,
5696
+ "loss": 0.9063,
5697
+ "step": 808
5698
+ },
5699
+ {
5700
+ "epoch": 0.9461988304093567,
5701
+ "grad_norm": 0.4347105920314789,
5702
+ "learning_rate": 1.458858608625957e-06,
5703
+ "loss": 1.7198,
5704
+ "step": 809
5705
+ },
5706
+ {
5707
+ "epoch": 0.9473684210526315,
5708
+ "grad_norm": 0.5537627935409546,
5709
+ "learning_rate": 1.396265834701982e-06,
5710
+ "loss": 1.4791,
5711
+ "step": 810
5712
+ },
5713
+ {
5714
+ "epoch": 0.9485380116959065,
5715
+ "grad_norm": 0.5163156986236572,
5716
+ "learning_rate": 1.335036008472701e-06,
5717
+ "loss": 1.4561,
5718
+ "step": 811
5719
+ },
5720
+ {
5721
+ "epoch": 0.9497076023391813,
5722
+ "grad_norm": 0.30538955330848694,
5723
+ "learning_rate": 1.2751699762858838e-06,
5724
+ "loss": 0.5194,
5725
+ "step": 812
5726
+ },
5727
+ {
5728
+ "epoch": 0.9508771929824561,
5729
+ "grad_norm": 0.8774105906486511,
5730
+ "learning_rate": 1.2166685656382903e-06,
5731
+ "loss": 0.8315,
5732
+ "step": 813
5733
+ },
5734
+ {
5735
+ "epoch": 0.952046783625731,
5736
+ "grad_norm": 0.5743803381919861,
5737
+ "learning_rate": 1.1595325851642137e-06,
5738
+ "loss": 2.1359,
5739
+ "step": 814
5740
+ },
5741
+ {
5742
+ "epoch": 0.9532163742690059,
5743
+ "grad_norm": 0.4251870810985565,
5744
+ "learning_rate": 1.103762824624377e-06,
5745
+ "loss": 2.0652,
5746
+ "step": 815
5747
+ },
5748
+ {
5749
+ "epoch": 0.9543859649122807,
5750
+ "grad_norm": 0.5636005401611328,
5751
+ "learning_rate": 1.0493600548948878e-06,
5752
+ "loss": 1.919,
5753
+ "step": 816
5754
+ },
5755
+ {
5756
+ "epoch": 0.9555555555555556,
5757
+ "grad_norm": 0.39051973819732666,
5758
+ "learning_rate": 9.963250279567239e-07,
5759
+ "loss": 0.4746,
5760
+ "step": 817
5761
+ },
5762
+ {
5763
+ "epoch": 0.9567251461988304,
5764
+ "grad_norm": 0.485362708568573,
5765
+ "learning_rate": 9.446584768852407e-07,
5766
+ "loss": 1.9436,
5767
+ "step": 818
5768
+ },
5769
+ {
5770
+ "epoch": 0.9578947368421052,
5771
+ "grad_norm": 0.6727136373519897,
5772
+ "learning_rate": 8.943611158400478e-07,
5773
+ "loss": 0.9032,
5774
+ "step": 819
5775
+ },
5776
+ {
5777
+ "epoch": 0.9590643274853801,
5778
+ "grad_norm": 0.8409953117370605,
5779
+ "learning_rate": 8.454336400552154e-07,
5780
+ "loss": 1.0934,
5781
+ "step": 820
5782
+ },
5783
+ {
5784
+ "epoch": 0.960233918128655,
5785
+ "grad_norm": 0.45199981331825256,
5786
+ "learning_rate": 7.978767258295494e-07,
5787
+ "loss": 1.5624,
5788
+ "step": 821
5789
+ },
5790
+ {
5791
+ "epoch": 0.9614035087719298,
5792
+ "grad_norm": 0.3910199701786041,
5793
+ "learning_rate": 7.516910305173431e-07,
5794
+ "loss": 1.7667,
5795
+ "step": 822
5796
+ },
5797
+ {
5798
+ "epoch": 0.9625730994152046,
5799
+ "grad_norm": 0.40123024582862854,
5800
+ "learning_rate": 7.068771925192286e-07,
5801
+ "loss": 1.7485,
5802
+ "step": 823
5803
+ },
5804
+ {
5805
+ "epoch": 0.9637426900584796,
5806
+ "grad_norm": 0.40403178334236145,
5807
+ "learning_rate": 6.634358312733957e-07,
5808
+ "loss": 2.3095,
5809
+ "step": 824
5810
+ },
5811
+ {
5812
+ "epoch": 0.9649122807017544,
5813
+ "grad_norm": 0.5259490013122559,
5814
+ "learning_rate": 6.21367547246976e-07,
5815
+ "loss": 1.6458,
5816
+ "step": 825
5817
+ },
5818
+ {
5819
+ "epoch": 0.9660818713450292,
5820
+ "grad_norm": 0.5003288388252258,
5821
+ "learning_rate": 5.806729219278051e-07,
5822
+ "loss": 1.5372,
5823
+ "step": 826
5824
+ },
5825
+ {
5826
+ "epoch": 0.9672514619883041,
5827
+ "grad_norm": 0.4602636694908142,
5828
+ "learning_rate": 5.413525178163292e-07,
5829
+ "loss": 1.7856,
5830
+ "step": 827
5831
+ },
5832
+ {
5833
+ "epoch": 0.968421052631579,
5834
+ "grad_norm": 0.5534040331840515,
5835
+ "learning_rate": 5.034068784178891e-07,
5836
+ "loss": 1.3398,
5837
+ "step": 828
5838
+ },
5839
+ {
5840
+ "epoch": 0.9695906432748538,
5841
+ "grad_norm": 0.6525434255599976,
5842
+ "learning_rate": 4.668365282351372e-07,
5843
+ "loss": 1.4224,
5844
+ "step": 829
5845
+ },
5846
+ {
5847
+ "epoch": 0.9707602339181286,
5848
+ "grad_norm": 0.4619835615158081,
5849
+ "learning_rate": 4.316419727608434e-07,
5850
+ "loss": 1.6593,
5851
+ "step": 830
5852
+ },
5853
+ {
5854
+ "epoch": 0.9719298245614035,
5855
+ "grad_norm": 0.4035587012767792,
5856
+ "learning_rate": 3.978236984708894e-07,
5857
+ "loss": 0.6571,
5858
+ "step": 831
5859
+ },
5860
+ {
5861
+ "epoch": 0.9730994152046784,
5862
+ "grad_norm": 0.4727626442909241,
5863
+ "learning_rate": 3.653821728175522e-07,
5864
+ "loss": 1.9151,
5865
+ "step": 832
5866
+ },
5867
+ {
5868
+ "epoch": 0.9742690058479532,
5869
+ "grad_norm": 0.47662097215652466,
5870
+ "learning_rate": 3.343178442230088e-07,
5871
+ "loss": 1.4586,
5872
+ "step": 833
5873
+ },
5874
+ {
5875
+ "epoch": 0.9754385964912281,
5876
+ "grad_norm": 0.539738118648529,
5877
+ "learning_rate": 3.0463114207317513e-07,
5878
+ "loss": 1.9575,
5879
+ "step": 834
5880
+ },
5881
+ {
5882
+ "epoch": 0.9766081871345029,
5883
+ "grad_norm": 0.45117852091789246,
5884
+ "learning_rate": 2.7632247671177667e-07,
5885
+ "loss": 1.776,
5886
+ "step": 835
5887
+ },
5888
+ {
5889
+ "epoch": 0.9777777777777777,
5890
+ "grad_norm": 0.41608095169067383,
5891
+ "learning_rate": 2.493922394346315e-07,
5892
+ "loss": 0.5535,
5893
+ "step": 836
5894
+ },
5895
+ {
5896
+ "epoch": 0.9789473684210527,
5897
+ "grad_norm": 0.4384686350822449,
5898
+ "learning_rate": 2.2384080248429863e-07,
5899
+ "loss": 1.425,
5900
+ "step": 837
5901
+ },
5902
+ {
5903
+ "epoch": 0.9801169590643275,
5904
+ "grad_norm": 0.6699833273887634,
5905
+ "learning_rate": 1.9966851904487106e-07,
5906
+ "loss": 1.6039,
5907
+ "step": 838
5908
+ },
5909
+ {
5910
+ "epoch": 0.9812865497076023,
5911
+ "grad_norm": 4.3168182373046875,
5912
+ "learning_rate": 1.768757232371576e-07,
5913
+ "loss": 1.8438,
5914
+ "step": 839
5915
+ },
5916
+ {
5917
+ "epoch": 0.9824561403508771,
5918
+ "grad_norm": 0.44814804196357727,
5919
+ "learning_rate": 1.554627301140199e-07,
5920
+ "loss": 1.1396,
5921
+ "step": 840
5922
+ },
5923
+ {
5924
+ "epoch": 0.9836257309941521,
5925
+ "grad_norm": 0.34025856852531433,
5926
+ "learning_rate": 1.354298356560091e-07,
5927
+ "loss": 1.9324,
5928
+ "step": 841
5929
+ },
5930
+ {
5931
+ "epoch": 0.9847953216374269,
5932
+ "grad_norm": 0.49703219532966614,
5933
+ "learning_rate": 1.1677731676733584e-07,
5934
+ "loss": 1.2839,
5935
+ "step": 842
5936
+ },
5937
+ {
5938
+ "epoch": 0.9859649122807017,
5939
+ "grad_norm": 0.4712502956390381,
5940
+ "learning_rate": 9.950543127198453e-08,
5941
+ "loss": 1.8165,
5942
+ "step": 843
5943
+ },
5944
+ {
5945
+ "epoch": 0.9871345029239766,
5946
+ "grad_norm": 0.5401002764701843,
5947
+ "learning_rate": 8.361441791016055e-08,
5948
+ "loss": 1.5518,
5949
+ "step": 844
5950
+ },
5951
+ {
5952
+ "epoch": 0.9883040935672515,
5953
+ "grad_norm": 0.561511218547821,
5954
+ "learning_rate": 6.910449633501514e-08,
5955
+ "loss": 1.5021,
5956
+ "step": 845
5957
+ },
5958
+ {
5959
+ "epoch": 0.9894736842105263,
5960
+ "grad_norm": 0.41412779688835144,
5961
+ "learning_rate": 5.5975867109570036e-08,
5962
+ "loss": 2.2531,
5963
+ "step": 846
5964
+ },
5965
+ {
5966
+ "epoch": 0.9906432748538012,
5967
+ "grad_norm": 0.3571871817111969,
5968
+ "learning_rate": 4.422871170398635e-08,
5969
+ "loss": 1.3078,
5970
+ "step": 847
5971
+ },
5972
+ {
5973
+ "epoch": 0.991812865497076,
5974
+ "grad_norm": 0.6715613007545471,
5975
+ "learning_rate": 3.386319249303327e-08,
5976
+ "loss": 1.2575,
5977
+ "step": 848
5978
+ },
5979
+ {
5980
+ "epoch": 0.9929824561403509,
5981
+ "grad_norm": 0.36819925904273987,
5982
+ "learning_rate": 2.48794527538454e-08,
5983
+ "loss": 1.4645,
5984
+ "step": 849
5985
+ },
5986
+ {
5987
+ "epoch": 0.9941520467836257,
5988
+ "grad_norm": 0.3651365339756012,
5989
+ "learning_rate": 1.727761666394656e-08,
5990
+ "loss": 1.7481,
5991
+ "step": 850
5992
+ },
5993
+ {
5994
+ "epoch": 0.9953216374269006,
5995
+ "grad_norm": 0.34068191051483154,
5996
+ "learning_rate": 1.105778929951784e-08,
5997
+ "loss": 1.8654,
5998
+ "step": 851
5999
+ },
6000
+ {
6001
+ "epoch": 0.9964912280701754,
6002
+ "grad_norm": 0.4798254370689392,
6003
+ "learning_rate": 6.220056633987614e-09,
6004
+ "loss": 1.67,
6005
+ "step": 852
6006
+ },
6007
+ {
6008
+ "epoch": 0.9976608187134502,
6009
+ "grad_norm": 0.2631950378417969,
6010
+ "learning_rate": 2.764485536776995e-09,
6011
+ "loss": 0.6319,
6012
+ "step": 853
6013
+ },
6014
+ {
6015
+ "epoch": 0.9988304093567252,
6016
+ "grad_norm": 0.6361366510391235,
6017
+ "learning_rate": 6.911237724560593e-10,
6018
+ "loss": 0.7551,
6019
+ "step": 854
6020
+ },
6021
+ {
6022
+ "epoch": 1.0,
6023
+ "grad_norm": 0.6016250252723694,
6024
+ "learning_rate": 0.0,
6025
+ "loss": 1.5066,
6026
+ "step": 855
6027
  }
6028
  ],
6029
  "logging_steps": 1,
 
6038
  "should_evaluate": false,
6039
  "should_log": false,
6040
  "should_save": true,
6041
+ "should_training_stop": true
6042
  },
6043
  "attributes": {}
6044
  }
6045
  },
6046
+ "total_flos": 1.649405017624412e+17,
6047
  "train_batch_size": 2,
6048
  "trial_name": null,
6049
  "trial_params": null