CocoRoF committed
Commit f7c3d5b · verified · 1 Parent(s): 0051e8e

Training in progress, step 5000, checkpoint

last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0598cf4c0d949fb5d2ece8c2b9a49b542391a4aa397571645e38483ab748399c
+oid sha256:5b16b8aa8e59ed005cab03437e858fc32c15da21710862fb7624c9d60fa09994
 size 737632172
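
The checkpoint files in this commit are Git LFS pointer stubs: each diff rewrites only the three-line pointer (spec version, sha256 oid, byte size), while the weights themselves live in LFS storage. Note that the size stays constant (737632172 bytes for model.safetensors); only the content hash changes between checkpoints. As a rough illustration only (not code from this repository; the paths and helper names are assumptions), a pointer can be parsed and a downloaded blob checked against it like so:

import hashlib
from pathlib import Path

def parse_lfs_pointer(pointer_path: str) -> dict:
    """Read the three-line LFS pointer (version / oid / size) into a dict."""
    fields = {}
    for line in Path(pointer_path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return {
        "oid": fields["oid"].removeprefix("sha256:"),
        "size": int(fields["size"]),
    }

def verify_blob(pointer_path: str, blob_path: str) -> bool:
    """Hash the blob in chunks and compare against the pointer's oid and size."""
    expected = parse_lfs_pointer(pointer_path)
    digest = hashlib.sha256()
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return (digest.hexdigest() == expected["oid"]
            and Path(blob_path).stat().st_size == expected["size"])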
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d14bc3239240bf9024e747c23023fedf985bad39fcca09135776d56fd1cc9e50
+oid sha256:7e9f520d78e8c2ec7d22e229de31961c7dc7141280c426b76ea47d2304f6e61e
 size 1475354682
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:74386f26f36ed67f56395205881e5db2d0c28ffcbeed50dd95b28771d2dac588
+oid sha256:69ec6e3926fa071bede113523efa3dc6e630c3c7958c54a9ca321cf4d62ed145
 size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:41c88f9de084200454883a13c3717941ea3fd433e2f8735507fc30611f9c5501
+oid sha256:f6127ee4f0c13500ec5038fce65af8f7beec63c137c7d4b7c157aa6303cf5879
 size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:965b00d4cb4710ebab57c8787b9925bb3f77b8eeba94a186ec4bc1c2f326ef3f
+oid sha256:da01d1c5eb2cc3a323f97c1f590d13ccfac2a4c5b1479bd378b4e643304f5a4f
 size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d5dc374b8b9a4c45c950f9d136feab85a767081fa59f0c7d68ed3a62060c4949
+oid sha256:49a3f04d76c0d3acc7d3dd95a04215f368f35a451ae8cba8a2fdba38cda9ca0a
 size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5c7c212fb779217f1edac0baf44f67b608eefc1e0e4e3f5a9dd7eb557032c1bc
+oid sha256:df7d2c9825dba80cb544920f8cc0c72122f96514e6cd259052a8765b034393e2
 size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:86e1effd626ce1e95dd68a0c8089fe19218f2b24dfe9e45ed2cab1c0ebc10ba1
+oid sha256:6a20a42d44ff48cc162224010190e898fe28598ddad8cd1896d330a3bb1d8ec3
 size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:799cc83f60dfc1c4243cfd6403592112414a2eec494e6832f10221c96ff62c20
+oid sha256:18ac0dc4f09f25179860561fcea7c5c8f997aabdc46a170665f9dc5a72bc27c6
 size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:586777c398770c3255d3a1f48c7fef44ea9d89117c627c9ea490e16bfd9a49ba
+oid sha256:6a16fcb5411ff961b47eff7378d85105fe9837e0492d19ea5ce3b7c4b77aa3b6
 size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:458ea0fd6c5fd9a917dfe3f8b6bf19bb7e7cc081c80d7da750a7340a0cfca1f2
+oid sha256:8b5ca0374fb823ceea8ffdfbdcc41c0cd34efad2563ba3211ec3385b1ecc3188
 size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.11095023327286546,
+  "epoch": 0.22190046654573092,
   "eval_steps": 1000,
-  "global_step": 2500,
+  "global_step": 5000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -1773,6 +1773,1780 @@
       "learning_rate": 9.995666009518137e-06,
       "loss": 13.7492,
       "step": 2500
+    },
+    {
+      "epoch": 0.11139403420595692,
+      "grad_norm": 142.9310760498047,
+      "learning_rate": 9.99564867355621e-06,
+      "loss": 13.2862,
+      "step": 2510
+    },
+    {
+      "epoch": 0.11183783513904838,
+      "grad_norm": 136.95596313476562,
+      "learning_rate": 9.995631337594283e-06,
+      "loss": 13.2403,
+      "step": 2520
+    },
+    {
+      "epoch": 0.11228163607213984,
+      "grad_norm": 157.4971160888672,
+      "learning_rate": 9.995614001632356e-06,
+      "loss": 12.9846,
+      "step": 2530
+    },
+    {
+      "epoch": 0.1127254370052313,
+      "grad_norm": 157.4886474609375,
+      "learning_rate": 9.995596665670427e-06,
+      "loss": 13.6239,
+      "step": 2540
+    },
+    {
+      "epoch": 0.11316923793832276,
+      "grad_norm": 179.8428192138672,
+      "learning_rate": 9.9955793297085e-06,
+      "loss": 13.562,
+      "step": 2550
+    },
+    {
+      "epoch": 0.11361303887141423,
+      "grad_norm": 115.2100601196289,
+      "learning_rate": 9.995561993746573e-06,
+      "loss": 13.7602,
+      "step": 2560
+    },
+    {
+      "epoch": 0.11405683980450569,
+      "grad_norm": 152.49363708496094,
+      "learning_rate": 9.995544657784645e-06,
+      "loss": 12.7727,
+      "step": 2570
+    },
+    {
+      "epoch": 0.11450064073759715,
+      "grad_norm": 111.91357421875,
+      "learning_rate": 9.995527321822718e-06,
+      "loss": 13.2489,
+      "step": 2580
+    },
+    {
+      "epoch": 0.11494444167068861,
+      "grad_norm": 149.418701171875,
+      "learning_rate": 9.99550998586079e-06,
+      "loss": 13.5211,
+      "step": 2590
+    },
+    {
+      "epoch": 0.11538824260378007,
+      "grad_norm": 158.1101531982422,
+      "learning_rate": 9.995492649898862e-06,
+      "loss": 13.5069,
+      "step": 2600
+    },
+    {
+      "epoch": 0.11583204353687154,
+      "grad_norm": 152.66778564453125,
+      "learning_rate": 9.995475313936935e-06,
+      "loss": 13.44,
+      "step": 2610
+    },
+    {
+      "epoch": 0.116275844469963,
+      "grad_norm": 133.16883850097656,
+      "learning_rate": 9.995457977975008e-06,
+      "loss": 13.4715,
+      "step": 2620
+    },
+    {
+      "epoch": 0.11671964540305446,
+      "grad_norm": 145.34190368652344,
+      "learning_rate": 9.99544064201308e-06,
+      "loss": 13.2012,
+      "step": 2630
+    },
+    {
+      "epoch": 0.11716344633614592,
+      "grad_norm": 141.71560668945312,
+      "learning_rate": 9.995423306051153e-06,
+      "loss": 13.389,
+      "step": 2640
+    },
+    {
+      "epoch": 0.11760724726923738,
+      "grad_norm": 158.38853454589844,
+      "learning_rate": 9.995405970089226e-06,
+      "loss": 13.6967,
+      "step": 2650
+    },
+    {
+      "epoch": 0.11805104820232884,
+      "grad_norm": 153.3102264404297,
+      "learning_rate": 9.995388634127297e-06,
+      "loss": 12.9532,
+      "step": 2660
+    },
+    {
+      "epoch": 0.11849484913542031,
+      "grad_norm": 168.13534545898438,
+      "learning_rate": 9.99537129816537e-06,
+      "loss": 12.9198,
+      "step": 2670
+    },
+    {
+      "epoch": 0.11893865006851177,
+      "grad_norm": 152.37652587890625,
+      "learning_rate": 9.995353962203443e-06,
+      "loss": 12.762,
+      "step": 2680
+    },
+    {
+      "epoch": 0.11938245100160323,
+      "grad_norm": 175.31565856933594,
+      "learning_rate": 9.995336626241515e-06,
+      "loss": 13.3768,
+      "step": 2690
+    },
+    {
+      "epoch": 0.11982625193469469,
+      "grad_norm": 142.70001220703125,
+      "learning_rate": 9.995319290279588e-06,
+      "loss": 14.0593,
+      "step": 2700
+    },
+    {
+      "epoch": 0.12027005286778615,
+      "grad_norm": 138.56088256835938,
+      "learning_rate": 9.995301954317661e-06,
+      "loss": 12.9747,
+      "step": 2710
+    },
+    {
+      "epoch": 0.12071385380087761,
+      "grad_norm": 159.65451049804688,
+      "learning_rate": 9.995284618355732e-06,
+      "loss": 13.163,
+      "step": 2720
+    },
+    {
+      "epoch": 0.12115765473396908,
+      "grad_norm": 119.24352264404297,
+      "learning_rate": 9.995267282393805e-06,
+      "loss": 12.5898,
+      "step": 2730
+    },
+    {
+      "epoch": 0.12160145566706054,
+      "grad_norm": 107.53023529052734,
+      "learning_rate": 9.995249946431879e-06,
+      "loss": 13.4115,
+      "step": 2740
+    },
+    {
+      "epoch": 0.122045256600152,
+      "grad_norm": 141.00604248046875,
+      "learning_rate": 9.995232610469952e-06,
+      "loss": 13.4099,
+      "step": 2750
+    },
+    {
+      "epoch": 0.12248905753324346,
+      "grad_norm": 121.9310302734375,
+      "learning_rate": 9.995215274508023e-06,
+      "loss": 13.0975,
+      "step": 2760
+    },
+    {
+      "epoch": 0.12293285846633492,
+      "grad_norm": 170.2215576171875,
+      "learning_rate": 9.995197938546096e-06,
+      "loss": 13.1425,
+      "step": 2770
+    },
+    {
+      "epoch": 0.12337665939942638,
+      "grad_norm": 134.97998046875,
+      "learning_rate": 9.99518060258417e-06,
+      "loss": 13.0698,
+      "step": 2780
+    },
+    {
+      "epoch": 0.12382046033251785,
+      "grad_norm": 141.1931915283203,
+      "learning_rate": 9.99516326662224e-06,
+      "loss": 12.9536,
+      "step": 2790
+    },
+    {
+      "epoch": 0.12426426126560931,
+      "grad_norm": 123.69408416748047,
+      "learning_rate": 9.995145930660314e-06,
+      "loss": 13.1385,
+      "step": 2800
+    },
+    {
+      "epoch": 0.12470806219870077,
+      "grad_norm": 132.7433319091797,
+      "learning_rate": 9.995128594698387e-06,
+      "loss": 12.7706,
+      "step": 2810
+    },
+    {
+      "epoch": 0.12515186313179225,
+      "grad_norm": 146.10305786132812,
+      "learning_rate": 9.995111258736458e-06,
+      "loss": 13.1492,
+      "step": 2820
+    },
+    {
+      "epoch": 0.1255956640648837,
+      "grad_norm": 217.5339813232422,
+      "learning_rate": 9.995093922774531e-06,
+      "loss": 13.2251,
+      "step": 2830
+    },
+    {
+      "epoch": 0.12603946499797516,
+      "grad_norm": 178.1737518310547,
+      "learning_rate": 9.995076586812604e-06,
+      "loss": 12.8123,
+      "step": 2840
+    },
+    {
+      "epoch": 0.12648326593106662,
+      "grad_norm": 123.01746368408203,
+      "learning_rate": 9.995059250850676e-06,
+      "loss": 12.8128,
+      "step": 2850
+    },
+    {
+      "epoch": 0.12692706686415808,
+      "grad_norm": 173.1932830810547,
+      "learning_rate": 9.995041914888749e-06,
+      "loss": 13.3028,
+      "step": 2860
+    },
+    {
+      "epoch": 0.12737086779724954,
+      "grad_norm": 129.58663940429688,
+      "learning_rate": 9.995024578926822e-06,
+      "loss": 12.685,
+      "step": 2870
+    },
+    {
+      "epoch": 0.127814668730341,
+      "grad_norm": 163.02256774902344,
+      "learning_rate": 9.995007242964893e-06,
+      "loss": 12.8868,
+      "step": 2880
+    },
+    {
+      "epoch": 0.12825846966343246,
+      "grad_norm": 129.8294677734375,
+      "learning_rate": 9.994989907002966e-06,
+      "loss": 12.6222,
+      "step": 2890
+    },
+    {
+      "epoch": 0.12870227059652392,
+      "grad_norm": 133.46365356445312,
+      "learning_rate": 9.99497257104104e-06,
+      "loss": 12.5605,
+      "step": 2900
+    },
+    {
+      "epoch": 0.12914607152961538,
+      "grad_norm": 141.15603637695312,
+      "learning_rate": 9.99495523507911e-06,
+      "loss": 12.972,
+      "step": 2910
+    },
+    {
+      "epoch": 0.12958987246270684,
+      "grad_norm": 128.41879272460938,
+      "learning_rate": 9.994937899117184e-06,
+      "loss": 13.1774,
+      "step": 2920
+    },
+    {
+      "epoch": 0.13003367339579833,
+      "grad_norm": 136.0733184814453,
+      "learning_rate": 9.994920563155257e-06,
+      "loss": 12.7431,
+      "step": 2930
+    },
+    {
+      "epoch": 0.1304774743288898,
+      "grad_norm": 135.34146118164062,
+      "learning_rate": 9.994903227193328e-06,
+      "loss": 13.4061,
+      "step": 2940
+    },
+    {
+      "epoch": 0.13092127526198125,
+      "grad_norm": 143.00442504882812,
+      "learning_rate": 9.994885891231401e-06,
+      "loss": 12.9928,
+      "step": 2950
+    },
+    {
+      "epoch": 0.1313650761950727,
+      "grad_norm": 150.8383026123047,
+      "learning_rate": 9.994868555269474e-06,
+      "loss": 12.8895,
+      "step": 2960
+    },
+    {
+      "epoch": 0.13180887712816416,
+      "grad_norm": 144.29466247558594,
+      "learning_rate": 9.994851219307547e-06,
+      "loss": 13.2952,
+      "step": 2970
+    },
+    {
+      "epoch": 0.13225267806125562,
+      "grad_norm": 130.09901428222656,
+      "learning_rate": 9.994833883345619e-06,
+      "loss": 13.1275,
+      "step": 2980
+    },
+    {
+      "epoch": 0.13269647899434708,
+      "grad_norm": 161.23716735839844,
+      "learning_rate": 9.994816547383692e-06,
+      "loss": 12.7923,
+      "step": 2990
+    },
+    {
+      "epoch": 0.13314027992743854,
+      "grad_norm": 126.25406646728516,
+      "learning_rate": 9.994799211421765e-06,
+      "loss": 12.9331,
+      "step": 3000
+    },
+    {
+      "epoch": 0.13314027992743854,
+      "eval_loss": 0.40445423126220703,
+      "eval_runtime": 673.2876,
+      "eval_samples_per_second": 1803.673,
+      "eval_steps_per_second": 56.365,
+      "step": 3000
+    },
+    {
+      "epoch": 0.13358408086053,
+      "grad_norm": 121.59950256347656,
+      "learning_rate": 9.994781875459836e-06,
+      "loss": 12.1823,
+      "step": 3010
+    },
+    {
+      "epoch": 0.13402788179362146,
+      "grad_norm": 112.05023193359375,
+      "learning_rate": 9.99476453949791e-06,
+      "loss": 12.886,
+      "step": 3020
+    },
+    {
+      "epoch": 0.13447168272671292,
+      "grad_norm": 129.76803588867188,
+      "learning_rate": 9.994747203535983e-06,
+      "loss": 12.7748,
+      "step": 3030
+    },
+    {
+      "epoch": 0.1349154836598044,
+      "grad_norm": 124.96532440185547,
+      "learning_rate": 9.994729867574054e-06,
+      "loss": 13.1191,
+      "step": 3040
+    },
+    {
+      "epoch": 0.13535928459289587,
+      "grad_norm": 141.1378631591797,
+      "learning_rate": 9.994712531612127e-06,
+      "loss": 13.4907,
+      "step": 3050
+    },
+    {
+      "epoch": 0.13580308552598733,
+      "grad_norm": 160.61422729492188,
+      "learning_rate": 9.9946951956502e-06,
+      "loss": 13.6199,
+      "step": 3060
+    },
+    {
+      "epoch": 0.1362468864590788,
+      "grad_norm": 136.35914611816406,
+      "learning_rate": 9.994677859688271e-06,
+      "loss": 13.2168,
+      "step": 3070
+    },
+    {
+      "epoch": 0.13669068739217025,
+      "grad_norm": 127.0990982055664,
+      "learning_rate": 9.994660523726345e-06,
+      "loss": 12.9859,
+      "step": 3080
+    },
+    {
+      "epoch": 0.1371344883252617,
+      "grad_norm": 138.41912841796875,
+      "learning_rate": 9.994643187764418e-06,
+      "loss": 12.4187,
+      "step": 3090
+    },
+    {
+      "epoch": 0.13757828925835316,
+      "grad_norm": 140.53573608398438,
+      "learning_rate": 9.994625851802489e-06,
+      "loss": 13.0548,
+      "step": 3100
+    },
+    {
+      "epoch": 0.13802209019144462,
+      "grad_norm": 140.9633331298828,
+      "learning_rate": 9.994608515840562e-06,
+      "loss": 12.8993,
+      "step": 3110
+    },
+    {
+      "epoch": 0.13846589112453608,
+      "grad_norm": 161.15667724609375,
+      "learning_rate": 9.994591179878635e-06,
+      "loss": 12.7675,
+      "step": 3120
+    },
+    {
+      "epoch": 0.13890969205762754,
+      "grad_norm": 131.9075469970703,
+      "learning_rate": 9.994573843916707e-06,
+      "loss": 13.4033,
+      "step": 3130
+    },
+    {
+      "epoch": 0.139353492990719,
+      "grad_norm": 144.76168823242188,
+      "learning_rate": 9.99455650795478e-06,
+      "loss": 12.9296,
+      "step": 3140
+    },
+    {
+      "epoch": 0.13979729392381046,
+      "grad_norm": 128.33094787597656,
+      "learning_rate": 9.994539171992853e-06,
+      "loss": 13.0926,
+      "step": 3150
+    },
+    {
+      "epoch": 0.14024109485690195,
+      "grad_norm": 120.40109252929688,
+      "learning_rate": 9.994521836030924e-06,
+      "loss": 12.6591,
+      "step": 3160
+    },
+    {
+      "epoch": 0.1406848957899934,
+      "grad_norm": 177.44715881347656,
+      "learning_rate": 9.994504500068997e-06,
+      "loss": 13.0056,
+      "step": 3170
+    },
+    {
+      "epoch": 0.14112869672308487,
+      "grad_norm": 133.28228759765625,
+      "learning_rate": 9.99448716410707e-06,
+      "loss": 12.9662,
+      "step": 3180
+    },
+    {
+      "epoch": 0.14157249765617633,
+      "grad_norm": 139.16880798339844,
+      "learning_rate": 9.994469828145143e-06,
+      "loss": 12.55,
+      "step": 3190
+    },
+    {
+      "epoch": 0.1420162985892678,
+      "grad_norm": 125.42192077636719,
+      "learning_rate": 9.994452492183215e-06,
+      "loss": 13.1687,
+      "step": 3200
+    },
+    {
+      "epoch": 0.14246009952235925,
+      "grad_norm": 167.23255920410156,
+      "learning_rate": 9.994435156221288e-06,
+      "loss": 13.1131,
+      "step": 3210
+    },
+    {
+      "epoch": 0.1429039004554507,
+      "grad_norm": 160.59434509277344,
+      "learning_rate": 9.994417820259361e-06,
+      "loss": 12.6661,
+      "step": 3220
+    },
+    {
+      "epoch": 0.14334770138854216,
+      "grad_norm": 157.50003051757812,
+      "learning_rate": 9.994400484297432e-06,
+      "loss": 13.0758,
+      "step": 3230
+    },
+    {
+      "epoch": 0.14379150232163362,
+      "grad_norm": 152.7613067626953,
+      "learning_rate": 9.994383148335505e-06,
+      "loss": 12.6521,
+      "step": 3240
+    },
+    {
+      "epoch": 0.14423530325472508,
+      "grad_norm": 120.58446502685547,
+      "learning_rate": 9.994365812373578e-06,
+      "loss": 12.9558,
+      "step": 3250
+    },
+    {
+      "epoch": 0.14467910418781654,
+      "grad_norm": 130.59677124023438,
+      "learning_rate": 9.99434847641165e-06,
+      "loss": 12.919,
+      "step": 3260
+    },
+    {
+      "epoch": 0.14512290512090803,
+      "grad_norm": 118.21358489990234,
+      "learning_rate": 9.994331140449723e-06,
+      "loss": 12.8244,
+      "step": 3270
+    },
+    {
+      "epoch": 0.1455667060539995,
+      "grad_norm": 139.61123657226562,
+      "learning_rate": 9.994313804487796e-06,
+      "loss": 12.6272,
+      "step": 3280
+    },
+    {
+      "epoch": 0.14601050698709095,
+      "grad_norm": 118.49354553222656,
+      "learning_rate": 9.994296468525867e-06,
+      "loss": 12.9009,
+      "step": 3290
+    },
+    {
+      "epoch": 0.1464543079201824,
+      "grad_norm": 189.260498046875,
+      "learning_rate": 9.99427913256394e-06,
+      "loss": 12.7051,
+      "step": 3300
+    },
+    {
+      "epoch": 0.14689810885327387,
+      "grad_norm": 146.55145263671875,
+      "learning_rate": 9.994261796602014e-06,
+      "loss": 12.7917,
+      "step": 3310
+    },
+    {
+      "epoch": 0.14734190978636533,
+      "grad_norm": 140.71144104003906,
+      "learning_rate": 9.994244460640085e-06,
+      "loss": 12.2915,
+      "step": 3320
+    },
+    {
+      "epoch": 0.1477857107194568,
+      "grad_norm": 153.54945373535156,
+      "learning_rate": 9.994227124678158e-06,
+      "loss": 12.4179,
+      "step": 3330
+    },
+    {
+      "epoch": 0.14822951165254825,
+      "grad_norm": 113.84798431396484,
+      "learning_rate": 9.994209788716231e-06,
+      "loss": 12.8944,
+      "step": 3340
+    },
+    {
+      "epoch": 0.1486733125856397,
+      "grad_norm": 140.952392578125,
+      "learning_rate": 9.994192452754304e-06,
+      "loss": 13.0441,
+      "step": 3350
+    },
+    {
+      "epoch": 0.14911711351873116,
+      "grad_norm": 118.15951538085938,
+      "learning_rate": 9.994175116792376e-06,
+      "loss": 12.8967,
+      "step": 3360
+    },
+    {
+      "epoch": 0.14956091445182262,
+      "grad_norm": 153.52896118164062,
+      "learning_rate": 9.994157780830449e-06,
+      "loss": 12.8702,
+      "step": 3370
+    },
+    {
+      "epoch": 0.15000471538491408,
+      "grad_norm": 127.21448516845703,
+      "learning_rate": 9.994140444868522e-06,
+      "loss": 12.4004,
+      "step": 3380
+    },
+    {
+      "epoch": 0.15044851631800557,
+      "grad_norm": 131.0957489013672,
+      "learning_rate": 9.994123108906593e-06,
+      "loss": 13.3359,
+      "step": 3390
+    },
+    {
+      "epoch": 0.15089231725109703,
+      "grad_norm": 128.41253662109375,
+      "learning_rate": 9.994105772944666e-06,
+      "loss": 12.3271,
+      "step": 3400
+    },
+    {
+      "epoch": 0.1513361181841885,
+      "grad_norm": 134.6806640625,
+      "learning_rate": 9.99408843698274e-06,
+      "loss": 12.5313,
+      "step": 3410
+    },
+    {
+      "epoch": 0.15177991911727995,
+      "grad_norm": 119.5546646118164,
+      "learning_rate": 9.99407110102081e-06,
+      "loss": 12.3448,
+      "step": 3420
+    },
+    {
+      "epoch": 0.1522237200503714,
+      "grad_norm": 138.1034393310547,
+      "learning_rate": 9.994053765058884e-06,
+      "loss": 13.0286,
+      "step": 3430
+    },
+    {
+      "epoch": 0.15266752098346287,
+      "grad_norm": 110.58820343017578,
+      "learning_rate": 9.994036429096957e-06,
+      "loss": 12.455,
+      "step": 3440
+    },
+    {
+      "epoch": 0.15311132191655433,
+      "grad_norm": 168.3693084716797,
+      "learning_rate": 9.994019093135028e-06,
+      "loss": 12.7619,
+      "step": 3450
+    },
+    {
+      "epoch": 0.1535551228496458,
+      "grad_norm": 140.2900390625,
+      "learning_rate": 9.994001757173101e-06,
+      "loss": 12.8144,
+      "step": 3460
+    },
+    {
+      "epoch": 0.15399892378273725,
+      "grad_norm": 108.32037353515625,
+      "learning_rate": 9.993984421211174e-06,
+      "loss": 12.6378,
+      "step": 3470
+    },
+    {
+      "epoch": 0.1544427247158287,
+      "grad_norm": 137.5443115234375,
+      "learning_rate": 9.993967085249247e-06,
+      "loss": 12.2405,
+      "step": 3480
+    },
+    {
+      "epoch": 0.15488652564892016,
+      "grad_norm": 157.03700256347656,
+      "learning_rate": 9.993949749287319e-06,
+      "loss": 12.654,
+      "step": 3490
+    },
+    {
+      "epoch": 0.15533032658201165,
+      "grad_norm": 145.71405029296875,
+      "learning_rate": 9.993932413325392e-06,
+      "loss": 12.7073,
+      "step": 3500
+    },
+    {
+      "epoch": 0.1557741275151031,
+      "grad_norm": 130.26255798339844,
+      "learning_rate": 9.993915077363465e-06,
+      "loss": 12.0506,
+      "step": 3510
+    },
+    {
+      "epoch": 0.15621792844819457,
+      "grad_norm": 131.19674682617188,
+      "learning_rate": 9.993897741401536e-06,
+      "loss": 12.4799,
+      "step": 3520
+    },
+    {
+      "epoch": 0.15666172938128603,
+      "grad_norm": 116.03231811523438,
+      "learning_rate": 9.99388040543961e-06,
+      "loss": 12.8077,
+      "step": 3530
+    },
+    {
+      "epoch": 0.1571055303143775,
+      "grad_norm": 117.18372344970703,
+      "learning_rate": 9.993863069477682e-06,
+      "loss": 12.754,
+      "step": 3540
+    },
+    {
+      "epoch": 0.15754933124746895,
+      "grad_norm": 141.51046752929688,
+      "learning_rate": 9.993845733515754e-06,
+      "loss": 12.9482,
+      "step": 3550
+    },
+    {
+      "epoch": 0.1579931321805604,
+      "grad_norm": 145.37522888183594,
+      "learning_rate": 9.993828397553827e-06,
+      "loss": 12.9102,
+      "step": 3560
+    },
+    {
+      "epoch": 0.15843693311365187,
+      "grad_norm": 121.03300476074219,
+      "learning_rate": 9.9938110615919e-06,
+      "loss": 12.5428,
+      "step": 3570
+    },
+    {
+      "epoch": 0.15888073404674333,
+      "grad_norm": 97.84991455078125,
+      "learning_rate": 9.993793725629971e-06,
+      "loss": 12.2027,
+      "step": 3580
+    },
+    {
+      "epoch": 0.1593245349798348,
+      "grad_norm": 139.6411590576172,
+      "learning_rate": 9.993776389668044e-06,
+      "loss": 12.6947,
+      "step": 3590
+    },
+    {
+      "epoch": 0.15976833591292625,
+      "grad_norm": 128.70204162597656,
+      "learning_rate": 9.993759053706118e-06,
+      "loss": 12.7905,
+      "step": 3600
+    },
+    {
+      "epoch": 0.1602121368460177,
+      "grad_norm": 118.03040313720703,
+      "learning_rate": 9.99374171774419e-06,
+      "loss": 12.8123,
+      "step": 3610
+    },
+    {
+      "epoch": 0.1606559377791092,
+      "grad_norm": 111.89470672607422,
+      "learning_rate": 9.993724381782262e-06,
+      "loss": 12.5624,
+      "step": 3620
+    },
+    {
+      "epoch": 0.16109973871220065,
+      "grad_norm": 129.9048614501953,
+      "learning_rate": 9.993707045820335e-06,
+      "loss": 12.6917,
+      "step": 3630
+    },
+    {
+      "epoch": 0.1615435396452921,
+      "grad_norm": 127.17546844482422,
+      "learning_rate": 9.993689709858408e-06,
+      "loss": 13.1546,
+      "step": 3640
+    },
+    {
+      "epoch": 0.16198734057838357,
+      "grad_norm": 117.04710388183594,
+      "learning_rate": 9.99367237389648e-06,
+      "loss": 12.452,
+      "step": 3650
+    },
+    {
+      "epoch": 0.16243114151147503,
+      "grad_norm": 133.599853515625,
+      "learning_rate": 9.993655037934553e-06,
+      "loss": 12.5182,
+      "step": 3660
+    },
+    {
+      "epoch": 0.1628749424445665,
+      "grad_norm": 158.9300994873047,
+      "learning_rate": 9.993637701972626e-06,
+      "loss": 12.0698,
+      "step": 3670
+    },
+    {
+      "epoch": 0.16331874337765795,
+      "grad_norm": 125.10253143310547,
+      "learning_rate": 9.993620366010697e-06,
+      "loss": 12.5825,
+      "step": 3680
+    },
+    {
+      "epoch": 0.1637625443107494,
+      "grad_norm": 129.07908630371094,
+      "learning_rate": 9.99360303004877e-06,
+      "loss": 12.5439,
+      "step": 3690
+    },
+    {
+      "epoch": 0.16420634524384087,
+      "grad_norm": 119.1418685913086,
+      "learning_rate": 9.993585694086843e-06,
+      "loss": 12.564,
+      "step": 3700
+    },
+    {
+      "epoch": 0.16465014617693233,
+      "grad_norm": 128.37750244140625,
+      "learning_rate": 9.993568358124915e-06,
+      "loss": 12.9106,
+      "step": 3710
+    },
+    {
+      "epoch": 0.1650939471100238,
+      "grad_norm": 142.9105987548828,
+      "learning_rate": 9.993551022162988e-06,
+      "loss": 12.9379,
+      "step": 3720
+    },
+    {
+      "epoch": 0.16553774804311527,
+      "grad_norm": 118.88358306884766,
+      "learning_rate": 9.99353368620106e-06,
+      "loss": 12.3977,
+      "step": 3730
+    },
+    {
+      "epoch": 0.16598154897620673,
+      "grad_norm": 118.91588592529297,
+      "learning_rate": 9.993516350239134e-06,
+      "loss": 13.2059,
+      "step": 3740
+    },
+    {
+      "epoch": 0.1664253499092982,
+      "grad_norm": 139.88050842285156,
+      "learning_rate": 9.993499014277205e-06,
+      "loss": 13.0386,
+      "step": 3750
+    },
+    {
+      "epoch": 0.16686915084238965,
+      "grad_norm": 162.86024475097656,
+      "learning_rate": 9.993481678315278e-06,
+      "loss": 12.238,
+      "step": 3760
+    },
+    {
+      "epoch": 0.1673129517754811,
+      "grad_norm": 127.96647644042969,
+      "learning_rate": 9.993464342353351e-06,
+      "loss": 13.2272,
+      "step": 3770
+    },
+    {
+      "epoch": 0.16775675270857257,
+      "grad_norm": 112.12999725341797,
+      "learning_rate": 9.993447006391423e-06,
+      "loss": 12.5457,
+      "step": 3780
+    },
+    {
+      "epoch": 0.16820055364166403,
+      "grad_norm": 155.6081085205078,
+      "learning_rate": 9.993429670429496e-06,
+      "loss": 12.3183,
+      "step": 3790
+    },
+    {
+      "epoch": 0.1686443545747555,
+      "grad_norm": 138.3584442138672,
+      "learning_rate": 9.993412334467569e-06,
+      "loss": 12.4889,
+      "step": 3800
+    },
+    {
+      "epoch": 0.16908815550784695,
+      "grad_norm": 115.07450866699219,
+      "learning_rate": 9.99339499850564e-06,
+      "loss": 12.9653,
+      "step": 3810
+    },
+    {
+      "epoch": 0.1695319564409384,
+      "grad_norm": 126.59088897705078,
+      "learning_rate": 9.993377662543713e-06,
+      "loss": 12.9003,
+      "step": 3820
+    },
+    {
+      "epoch": 0.16997575737402987,
+      "grad_norm": 108.5206069946289,
+      "learning_rate": 9.993360326581786e-06,
+      "loss": 13.1251,
+      "step": 3830
+    },
+    {
+      "epoch": 0.17041955830712133,
+      "grad_norm": 124.42080688476562,
+      "learning_rate": 9.993342990619858e-06,
+      "loss": 12.9417,
+      "step": 3840
+    },
+    {
+      "epoch": 0.17086335924021281,
+      "grad_norm": 106.24859619140625,
+      "learning_rate": 9.993325654657931e-06,
+      "loss": 12.3298,
+      "step": 3850
+    },
+    {
+      "epoch": 0.17130716017330427,
+      "grad_norm": 112.69037628173828,
+      "learning_rate": 9.993308318696004e-06,
+      "loss": 12.3989,
+      "step": 3860
+    },
+    {
+      "epoch": 0.17175096110639573,
+      "grad_norm": 154.77882385253906,
+      "learning_rate": 9.993290982734077e-06,
+      "loss": 13.245,
+      "step": 3870
+    },
+    {
+      "epoch": 0.1721947620394872,
+      "grad_norm": 123.77738952636719,
+      "learning_rate": 9.993273646772148e-06,
+      "loss": 12.6657,
+      "step": 3880
+    },
+    {
+      "epoch": 0.17263856297257865,
+      "grad_norm": 126.2992172241211,
+      "learning_rate": 9.993256310810222e-06,
+      "loss": 12.3853,
+      "step": 3890
+    },
+    {
+      "epoch": 0.1730823639056701,
+      "grad_norm": 112.3527603149414,
+      "learning_rate": 9.993238974848295e-06,
+      "loss": 13.0207,
+      "step": 3900
+    },
+    {
+      "epoch": 0.17352616483876157,
+      "grad_norm": 106.91303253173828,
+      "learning_rate": 9.993221638886366e-06,
+      "loss": 11.9467,
+      "step": 3910
+    },
+    {
+      "epoch": 0.17396996577185303,
+      "grad_norm": 122.75757598876953,
+      "learning_rate": 9.993204302924439e-06,
+      "loss": 12.1821,
+      "step": 3920
+    },
+    {
+      "epoch": 0.1744137667049445,
+      "grad_norm": 116.6715316772461,
+      "learning_rate": 9.993186966962512e-06,
+      "loss": 12.1554,
+      "step": 3930
+    },
+    {
+      "epoch": 0.17485756763803595,
+      "grad_norm": 100.55429077148438,
+      "learning_rate": 9.993169631000584e-06,
+      "loss": 12.6786,
+      "step": 3940
+    },
+    {
+      "epoch": 0.1753013685711274,
+      "grad_norm": 116.2847671508789,
+      "learning_rate": 9.993152295038657e-06,
+      "loss": 11.9186,
+      "step": 3950
+    },
+    {
+      "epoch": 0.1757451695042189,
+      "grad_norm": 107.30577087402344,
+      "learning_rate": 9.99313495907673e-06,
+      "loss": 12.8207,
+      "step": 3960
+    },
+    {
+      "epoch": 0.17618897043731035,
+      "grad_norm": 103.84146118164062,
+      "learning_rate": 9.993117623114803e-06,
+      "loss": 12.4032,
+      "step": 3970
+    },
+    {
+      "epoch": 0.17663277137040181,
+      "grad_norm": 108.5525131225586,
+      "learning_rate": 9.993100287152874e-06,
+      "loss": 12.3449,
+      "step": 3980
+    },
+    {
+      "epoch": 0.17707657230349327,
+      "grad_norm": 132.21353149414062,
+      "learning_rate": 9.993082951190947e-06,
+      "loss": 12.8847,
+      "step": 3990
+    },
+    {
+      "epoch": 0.17752037323658473,
+      "grad_norm": 114.80290985107422,
+      "learning_rate": 9.99306561522902e-06,
+      "loss": 12.5004,
+      "step": 4000
+    },
+    {
+      "epoch": 0.17752037323658473,
+      "eval_loss": 0.38956519961357117,
+      "eval_runtime": 673.1566,
+      "eval_samples_per_second": 1804.024,
+      "eval_steps_per_second": 56.376,
+      "step": 4000
+    },
+    {
+      "epoch": 0.1779641741696762,
+      "grad_norm": 123.59729766845703,
+      "learning_rate": 9.993048279267092e-06,
+      "loss": 11.8187,
+      "step": 4010
+    },
+    {
+      "epoch": 0.17840797510276765,
+      "grad_norm": 121.13542175292969,
+      "learning_rate": 9.993030943305165e-06,
+      "loss": 12.569,
+      "step": 4020
+    },
+    {
+      "epoch": 0.1788517760358591,
+      "grad_norm": 112.8544692993164,
+      "learning_rate": 9.993013607343238e-06,
+      "loss": 12.4908,
+      "step": 4030
+    },
+    {
+      "epoch": 0.17929557696895057,
+      "grad_norm": 108.49791717529297,
+      "learning_rate": 9.99299627138131e-06,
+      "loss": 12.1717,
+      "step": 4040
+    },
+    {
+      "epoch": 0.17973937790204203,
+      "grad_norm": 112.78793334960938,
+      "learning_rate": 9.992978935419382e-06,
+      "loss": 12.4246,
+      "step": 4050
+    },
+    {
+      "epoch": 0.1801831788351335,
+      "grad_norm": 111.55317687988281,
+      "learning_rate": 9.992961599457455e-06,
+      "loss": 12.5279,
+      "step": 4060
+    },
+    {
+      "epoch": 0.18062697976822495,
+      "grad_norm": 129.81776428222656,
+      "learning_rate": 9.992944263495527e-06,
+      "loss": 12.0691,
+      "step": 4070
+    },
+    {
+      "epoch": 0.18107078070131644,
+      "grad_norm": 159.69786071777344,
+      "learning_rate": 9.9929269275336e-06,
+      "loss": 12.4083,
+      "step": 4080
+    },
+    {
+      "epoch": 0.1815145816344079,
+      "grad_norm": 119.66799926757812,
+      "learning_rate": 9.992909591571673e-06,
+      "loss": 12.1316,
+      "step": 4090
+    },
+    {
+      "epoch": 0.18195838256749935,
+      "grad_norm": 143.12181091308594,
+      "learning_rate": 9.992892255609746e-06,
+      "loss": 12.5121,
+      "step": 4100
+    },
+    {
+      "epoch": 0.18240218350059081,
+      "grad_norm": 153.3807373046875,
+      "learning_rate": 9.992874919647817e-06,
+      "loss": 12.0202,
+      "step": 4110
+    },
+    {
+      "epoch": 0.18284598443368227,
+      "grad_norm": 127.23503875732422,
+      "learning_rate": 9.99285758368589e-06,
+      "loss": 12.3906,
+      "step": 4120
+    },
+    {
+      "epoch": 0.18328978536677373,
+      "grad_norm": 130.62158203125,
+      "learning_rate": 9.992840247723964e-06,
+      "loss": 12.0698,
+      "step": 4130
+    },
+    {
+      "epoch": 0.1837335862998652,
+      "grad_norm": 126.1018295288086,
+      "learning_rate": 9.992822911762035e-06,
+      "loss": 12.4342,
+      "step": 4140
+    },
+    {
+      "epoch": 0.18417738723295665,
+      "grad_norm": 131.96340942382812,
+      "learning_rate": 9.992805575800108e-06,
+      "loss": 12.6071,
+      "step": 4150
+    },
+    {
+      "epoch": 0.1846211881660481,
+      "grad_norm": 130.8789520263672,
+      "learning_rate": 9.992788239838181e-06,
+      "loss": 12.323,
+      "step": 4160
+    },
+    {
+      "epoch": 0.18506498909913957,
+      "grad_norm": 125.24624633789062,
+      "learning_rate": 9.992770903876252e-06,
+      "loss": 11.9601,
+      "step": 4170
+    },
+    {
+      "epoch": 0.18550879003223103,
+      "grad_norm": 107.27322387695312,
+      "learning_rate": 9.992753567914326e-06,
+      "loss": 12.4659,
+      "step": 4180
+    },
+    {
+      "epoch": 0.18595259096532252,
+      "grad_norm": 118.50186920166016,
+      "learning_rate": 9.992736231952399e-06,
+      "loss": 12.4699,
+      "step": 4190
+    },
+    {
+      "epoch": 0.18639639189841398,
+      "grad_norm": 132.64605712890625,
+      "learning_rate": 9.99271889599047e-06,
+      "loss": 12.551,
+      "step": 4200
+    },
+    {
+      "epoch": 0.18684019283150544,
+      "grad_norm": 129.10557556152344,
+      "learning_rate": 9.992701560028543e-06,
+      "loss": 12.5445,
+      "step": 4210
+    },
+    {
+      "epoch": 0.1872839937645969,
+      "grad_norm": 118.41376495361328,
+      "learning_rate": 9.992684224066616e-06,
+      "loss": 12.4178,
+      "step": 4220
+    },
+    {
+      "epoch": 0.18772779469768835,
+      "grad_norm": 115.65239715576172,
+      "learning_rate": 9.992666888104688e-06,
+      "loss": 12.4483,
+      "step": 4230
+    },
+    {
+      "epoch": 0.18817159563077981,
+      "grad_norm": 125.76007080078125,
+      "learning_rate": 9.99264955214276e-06,
+      "loss": 13.088,
+      "step": 4240
+    },
+    {
+      "epoch": 0.18861539656387127,
+      "grad_norm": 88.38092041015625,
+      "learning_rate": 9.992632216180834e-06,
+      "loss": 12.9315,
+      "step": 4250
+    },
+    {
+      "epoch": 0.18905919749696273,
+      "grad_norm": 104.6368179321289,
+      "learning_rate": 9.992614880218907e-06,
+      "loss": 12.2731,
+      "step": 4260
+    },
+    {
+      "epoch": 0.1895029984300542,
+      "grad_norm": 127.25080871582031,
+      "learning_rate": 9.992597544256978e-06,
+      "loss": 12.2454,
+      "step": 4270
+    },
+    {
+      "epoch": 0.18994679936314565,
+      "grad_norm": 101.77681732177734,
+      "learning_rate": 9.992580208295051e-06,
+      "loss": 12.4424,
+      "step": 4280
+    },
+    {
+      "epoch": 0.1903906002962371,
+      "grad_norm": 136.33221435546875,
+      "learning_rate": 9.992562872333124e-06,
+      "loss": 11.9918,
+      "step": 4290
+    },
+    {
+      "epoch": 0.1908344012293286,
+      "grad_norm": 112.94047546386719,
+      "learning_rate": 9.992545536371196e-06,
+      "loss": 12.4194,
+      "step": 4300
+    },
+    {
+      "epoch": 0.19127820216242006,
+      "grad_norm": 132.7883758544922,
+      "learning_rate": 9.992528200409269e-06,
+      "loss": 12.3539,
+      "step": 4310
+    },
+    {
+      "epoch": 0.19172200309551152,
+      "grad_norm": 115.22777557373047,
+      "learning_rate": 9.992510864447342e-06,
+      "loss": 12.176,
+      "step": 4320
+    },
+    {
+      "epoch": 0.19216580402860298,
+      "grad_norm": 133.0673828125,
+      "learning_rate": 9.992493528485413e-06,
+      "loss": 12.1488,
+      "step": 4330
+    },
+    {
+      "epoch": 0.19260960496169444,
+      "grad_norm": 154.4739227294922,
+      "learning_rate": 9.992476192523486e-06,
+      "loss": 12.4385,
+      "step": 4340
+    },
+    {
+      "epoch": 0.1930534058947859,
+      "grad_norm": 169.39413452148438,
+      "learning_rate": 9.99245885656156e-06,
+      "loss": 12.2436,
+      "step": 4350
+    },
+    {
+      "epoch": 0.19349720682787735,
+      "grad_norm": 127.55028533935547,
+      "learning_rate": 9.99244152059963e-06,
+      "loss": 12.1869,
+      "step": 4360
+    },
+    {
+      "epoch": 0.19394100776096881,
+      "grad_norm": 124.5598373413086,
+      "learning_rate": 9.992424184637704e-06,
+      "loss": 12.6008,
+      "step": 4370
+    },
+    {
+      "epoch": 0.19438480869406027,
+      "grad_norm": 140.38067626953125,
+      "learning_rate": 9.992406848675777e-06,
+      "loss": 12.8844,
+      "step": 4380
+    },
+    {
+      "epoch": 0.19482860962715173,
+      "grad_norm": 111.26557922363281,
+      "learning_rate": 9.992389512713848e-06,
+      "loss": 12.7417,
+      "step": 4390
+    },
+    {
+      "epoch": 0.1952724105602432,
+      "grad_norm": 109.17289733886719,
+      "learning_rate": 9.992372176751921e-06,
+      "loss": 12.5357,
+      "step": 4400
+    },
+    {
+      "epoch": 0.19571621149333465,
+      "grad_norm": 93.85588073730469,
+      "learning_rate": 9.992354840789995e-06,
+      "loss": 13.057,
+      "step": 4410
+    },
+    {
+      "epoch": 0.19616001242642614,
+      "grad_norm": 132.5755157470703,
+      "learning_rate": 9.992337504828066e-06,
+      "loss": 11.8174,
+      "step": 4420
+    },
+    {
+      "epoch": 0.1966038133595176,
+      "grad_norm": 137.10018920898438,
+      "learning_rate": 9.992320168866139e-06,
+      "loss": 12.8481,
+      "step": 4430
+    },
+    {
+      "epoch": 0.19704761429260906,
+      "grad_norm": 114.44795989990234,
+      "learning_rate": 9.992302832904212e-06,
+      "loss": 12.6655,
+      "step": 4440
+    },
+    {
+      "epoch": 0.19749141522570052,
+      "grad_norm": 101.33007049560547,
+      "learning_rate": 9.992285496942283e-06,
+      "loss": 11.2619,
+      "step": 4450
+    },
+    {
+      "epoch": 0.19793521615879198,
+      "grad_norm": 121.8523941040039,
+      "learning_rate": 9.992268160980357e-06,
+      "loss": 11.8743,
+      "step": 4460
+    },
+    {
+      "epoch": 0.19837901709188344,
+      "grad_norm": 133.41293334960938,
+      "learning_rate": 9.99225082501843e-06,
+      "loss": 12.4099,
+      "step": 4470
+    },
+    {
+      "epoch": 0.1988228180249749,
+      "grad_norm": 121.90374755859375,
+      "learning_rate": 9.992233489056503e-06,
+      "loss": 12.8135,
+      "step": 4480
+    },
+    {
+      "epoch": 0.19926661895806635,
+      "grad_norm": 108.74071502685547,
+      "learning_rate": 9.992216153094574e-06,
+      "loss": 11.9765,
+      "step": 4490
+    },
+    {
+      "epoch": 0.19971041989115781,
+      "grad_norm": 112.89215850830078,
+      "learning_rate": 9.992198817132647e-06,
+      "loss": 12.9006,
+      "step": 4500
+    },
+    {
+      "epoch": 0.20015422082424927,
+      "grad_norm": 132.44119262695312,
+      "learning_rate": 9.99218148117072e-06,
+      "loss": 12.6348,
+      "step": 4510
+    },
+    {
+      "epoch": 0.20059802175734073,
+      "grad_norm": 132.33868408203125,
+      "learning_rate": 9.992164145208792e-06,
+      "loss": 12.3398,
+      "step": 4520
+    },
+    {
+      "epoch": 0.20104182269043222,
+      "grad_norm": 125.20413970947266,
+      "learning_rate": 9.992146809246865e-06,
+      "loss": 12.8135,
+      "step": 4530
+    },
+    {
+      "epoch": 0.20148562362352368,
+      "grad_norm": 104.17672729492188,
+      "learning_rate": 9.992129473284938e-06,
+      "loss": 12.1691,
+      "step": 4540
+    },
+    {
+      "epoch": 0.20192942455661514,
+      "grad_norm": 145.2282257080078,
+      "learning_rate": 9.992112137323009e-06,
+      "loss": 12.2538,
+      "step": 4550
+    },
+    {
+      "epoch": 0.2023732254897066,
+      "grad_norm": 130.9412078857422,
+      "learning_rate": 9.992094801361082e-06,
+      "loss": 12.1611,
+      "step": 4560
+    },
+    {
+      "epoch": 0.20281702642279806,
+      "grad_norm": 104.62641143798828,
+      "learning_rate": 9.992077465399155e-06,
+      "loss": 12.3785,
+      "step": 4570
+    },
+    {
+      "epoch": 0.20326082735588952,
+      "grad_norm": 106.15484619140625,
+      "learning_rate": 9.992060129437227e-06,
+      "loss": 12.139,
+      "step": 4580
+    },
+    {
+      "epoch": 0.20370462828898098,
+      "grad_norm": 135.23989868164062,
+      "learning_rate": 9.9920427934753e-06,
+      "loss": 11.6559,
+      "step": 4590
+    },
+    {
+      "epoch": 0.20414842922207244,
+      "grad_norm": 129.04571533203125,
+      "learning_rate": 9.992025457513373e-06,
+      "loss": 12.2296,
+      "step": 4600
+    },
+    {
+      "epoch": 0.2045922301551639,
+      "grad_norm": 100.73985290527344,
+      "learning_rate": 9.992008121551444e-06,
+      "loss": 12.429,
+      "step": 4610
+    },
+    {
+      "epoch": 0.20503603108825535,
+      "grad_norm": 120.50385284423828,
+      "learning_rate": 9.991990785589517e-06,
+      "loss": 12.6906,
+      "step": 4620
+    },
+    {
+      "epoch": 0.20547983202134681,
+      "grad_norm": 119.28755187988281,
+      "learning_rate": 9.99197344962759e-06,
+      "loss": 12.0725,
+      "step": 4630
+    },
+    {
+      "epoch": 0.20592363295443827,
+      "grad_norm": 122.34222412109375,
+      "learning_rate": 9.991956113665662e-06,
+      "loss": 12.5552,
+      "step": 4640
+    },
+    {
+      "epoch": 0.20636743388752976,
+      "grad_norm": 123.09967041015625,
+      "learning_rate": 9.991938777703735e-06,
+      "loss": 11.6526,
+      "step": 4650
+    },
+    {
+      "epoch": 0.20681123482062122,
+      "grad_norm": 97.26423645019531,
+      "learning_rate": 9.991921441741808e-06,
+      "loss": 12.1968,
+      "step": 4660
+    },
+    {
+      "epoch": 0.20725503575371268,
+      "grad_norm": 114.7996826171875,
+      "learning_rate": 9.99190410577988e-06,
+      "loss": 12.6007,
+      "step": 4670
+    },
+    {
+      "epoch": 0.20769883668680414,
+      "grad_norm": 140.515869140625,
+      "learning_rate": 9.991886769817952e-06,
+      "loss": 12.2736,
+      "step": 4680
+    },
+    {
+      "epoch": 0.2081426376198956,
+      "grad_norm": 125.37648010253906,
+      "learning_rate": 9.991869433856025e-06,
+      "loss": 12.1014,
+      "step": 4690
+    },
+    {
+      "epoch": 0.20858643855298706,
+      "grad_norm": 128.2080078125,
+      "learning_rate": 9.991852097894099e-06,
+      "loss": 11.7794,
+      "step": 4700
+    },
+    {
+      "epoch": 0.20903023948607852,
+      "grad_norm": 115.89141082763672,
+      "learning_rate": 9.99183476193217e-06,
+      "loss": 12.2364,
+      "step": 4710
+    },
+    {
+      "epoch": 0.20947404041916998,
+      "grad_norm": 118.23058319091797,
+      "learning_rate": 9.991817425970243e-06,
+      "loss": 11.7807,
+      "step": 4720
+    },
+    {
+      "epoch": 0.20991784135226144,
+      "grad_norm": 120.32998657226562,
+      "learning_rate": 9.991800090008316e-06,
+      "loss": 12.3347,
+      "step": 4730
+    },
+    {
+      "epoch": 0.2103616422853529,
+      "grad_norm": 131.60520935058594,
+      "learning_rate": 9.991782754046387e-06,
+      "loss": 12.5857,
+      "step": 4740
+    },
+    {
+      "epoch": 0.21080544321844435,
+      "grad_norm": 123.4472885131836,
+      "learning_rate": 9.99176541808446e-06,
+      "loss": 12.5721,
+      "step": 4750
+    },
+    {
+      "epoch": 0.21124924415153584,
+      "grad_norm": 115.33013153076172,
+      "learning_rate": 9.991748082122534e-06,
+      "loss": 12.2645,
+      "step": 4760
+    },
+    {
+      "epoch": 0.2116930450846273,
+      "grad_norm": 128.06332397460938,
+      "learning_rate": 9.991730746160605e-06,
+      "loss": 11.9115,
+      "step": 4770
+    },
+    {
+      "epoch": 0.21213684601771876,
+      "grad_norm": 133.6379852294922,
+      "learning_rate": 9.991713410198678e-06,
+      "loss": 12.2299,
+      "step": 4780
+    },
+    {
+      "epoch": 0.21258064695081022,
+      "grad_norm": 99.40995025634766,
+      "learning_rate": 9.991696074236751e-06,
+      "loss": 12.232,
+      "step": 4790
+    },
+    {
+      "epoch": 0.21302444788390168,
+      "grad_norm": 128.07489013671875,
+      "learning_rate": 9.991678738274823e-06,
+      "loss": 12.5043,
+      "step": 4800
+    },
+    {
+      "epoch": 0.21346824881699314,
+      "grad_norm": 106.18391418457031,
+      "learning_rate": 9.991661402312896e-06,
+      "loss": 12.515,
+      "step": 4810
+    },
+    {
+      "epoch": 0.2139120497500846,
+      "grad_norm": 103.52220153808594,
+      "learning_rate": 9.991644066350969e-06,
+      "loss": 12.5614,
+      "step": 4820
+    },
+    {
+      "epoch": 0.21435585068317606,
+      "grad_norm": 144.36888122558594,
+      "learning_rate": 9.99162673038904e-06,
+      "loss": 12.0411,
+      "step": 4830
+    },
+    {
+      "epoch": 0.21479965161626752,
+      "grad_norm": 119.41954803466797,
+      "learning_rate": 9.991609394427113e-06,
+      "loss": 12.0378,
+      "step": 4840
+    },
+    {
+      "epoch": 0.21524345254935898,
+      "grad_norm": 97.38760375976562,
+      "learning_rate": 9.991592058465186e-06,
+      "loss": 12.0504,
+      "step": 4850
+    },
+    {
+      "epoch": 0.21568725348245044,
+      "grad_norm": 123.15618896484375,
+      "learning_rate": 9.991574722503258e-06,
+      "loss": 12.4191,
+      "step": 4860
+    },
+    {
+      "epoch": 0.2161310544155419,
+      "grad_norm": 121.77255249023438,
+      "learning_rate": 9.99155738654133e-06,
+      "loss": 11.9067,
+      "step": 4870
+    },
+    {
+      "epoch": 0.21657485534863338,
+      "grad_norm": 128.0386962890625,
+      "learning_rate": 9.991540050579404e-06,
+      "loss": 12.9843,
+      "step": 4880
+    },
+    {
+      "epoch": 0.21701865628172484,
+      "grad_norm": 102.69422149658203,
+      "learning_rate": 9.991522714617475e-06,
+      "loss": 12.2082,
+      "step": 4890
+    },
+    {
+      "epoch": 0.2174624572148163,
+      "grad_norm": 125.6636734008789,
+      "learning_rate": 9.991505378655548e-06,
+      "loss": 12.5348,
+      "step": 4900
+    },
+    {
+      "epoch": 0.21790625814790776,
+      "grad_norm": 135.658203125,
+      "learning_rate": 9.991488042693621e-06,
+      "loss": 11.9152,
+      "step": 4910
+    },
+    {
+      "epoch": 0.21835005908099922,
+      "grad_norm": 99.29556274414062,
+      "learning_rate": 9.991470706731694e-06,
+      "loss": 12.5844,
+      "step": 4920
+    },
+    {
+      "epoch": 0.21879386001409068,
+      "grad_norm": 95.45423126220703,
+      "learning_rate": 9.991453370769766e-06,
+      "loss": 12.926,
+      "step": 4930
+    },
+    {
+      "epoch": 0.21923766094718214,
+      "grad_norm": 118.17921447753906,
+      "learning_rate": 9.991436034807839e-06,
+      "loss": 12.2757,
+      "step": 4940
+    },
+    {
+      "epoch": 0.2196814618802736,
+      "grad_norm": 120.35387420654297,
+      "learning_rate": 9.991418698845912e-06,
+      "loss": 12.4541,
+      "step": 4950
+    },
+    {
+      "epoch": 0.22012526281336506,
+      "grad_norm": 109.4905776977539,
+      "learning_rate": 9.991401362883983e-06,
+      "loss": 11.7813,
+      "step": 4960
+    },
+    {
+      "epoch": 0.22056906374645652,
+      "grad_norm": 102.00797271728516,
+      "learning_rate": 9.991384026922056e-06,
+      "loss": 11.9456,
+      "step": 4970
+    },
+    {
+      "epoch": 0.22101286467954798,
+      "grad_norm": 104.43452453613281,
+      "learning_rate": 9.99136669096013e-06,
+      "loss": 11.8744,
+      "step": 4980
+    },
+    {
+      "epoch": 0.22145666561263946,
+      "grad_norm": 91.9101791381836,
+      "learning_rate": 9.991349354998201e-06,
+      "loss": 12.1663,
+      "step": 4990
+    },
+    {
+      "epoch": 0.22190046654573092,
+      "grad_norm": 139.61109924316406,
+      "learning_rate": 9.991332019036274e-06,
+      "loss": 12.5729,
+      "step": 5000
+    },
+    {
+      "epoch": 0.22190046654573092,
+      "eval_loss": 0.37864938378334045,
+      "eval_runtime": 675.391,
+      "eval_samples_per_second": 1798.056,
+      "eval_steps_per_second": 56.19,
+      "step": 5000
     }
   ],
   "logging_steps": 10,
@@ -1792,7 +3566,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 8.72430801256448e+17,
+  "total_flos": 1.744861602512896e+18,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null