ErrorAI committed
Commit f3a8fc4 · verified · 1 Parent(s): a529840

Training in progress, step 301, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:65dc7c94f10d1fd8453ca692791a5b0828673812ffffb21a0d1de89e54ec7d6b
+oid sha256:c59e4672ef7cc36f9f5487ce55f2ccfa0a52617841525b4bb1ac622d5b4aa80f
 size 80013120
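
These are Git LFS pointer files, so each diff touches only metadata: the sha256 oid changes with every new checkpoint while the payload itself lives in LFS storage. A minimal sketch of checking a pulled file against its pointer, assuming the checkpoint directory is checked out locally (the path and expected digest below are taken from the diff above):

import hashlib

# Expected digest, copied from the new LFS pointer above.
EXPECTED_OID = "c59e4672ef7cc36f9f5487ce55f2ccfa0a52617841525b4bb1ac622d5b4aa80f"

def sha256_of(path, chunk_size=1 << 20):
    # Stream in chunks so an 80 MB (or larger) checkpoint never has to fit in memory.
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

assert sha256_of("last-checkpoint/adapter_model.safetensors") == EXPECTED_OID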
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8315c06588c0eadc24b6c01f05d7044bc9877ba97394bcc91e35c44e4df5291d
-size 41119636
+oid sha256:00570a8c2ef93f07116ec0c50f9481945aab0c0c66ec1e76db067833c5ce46f6
+size 41120084
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3d4268bb260bb0810b614917f265d2383e6575486808e3cbab30ba6a3e94dab5
+oid sha256:5d35b3a9dd0a1f8f9486755fe2563ec30d796d72743863e4be5bb2ab47e364b5
 size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9ad89cd045fe380c9e3c8629b5b884bcbf8e3c6af3cb83ad82730d6d0ed22b56
+oid sha256:207ebf8b22c8abb89ae63b0d6979b5f27d625d140e3311a16b44653eec5ce343
 size 1064
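
Together, optimizer.pt, scheduler.pt and rng_state.pth carry the state needed to resume the run deterministically from step 301. A minimal sketch, not a drop-in script:

# `trainer` is assumed to be a transformers.Trainer constructed with the same
# model, dataset and TrainingArguments as the run that produced this checkpoint.
# This call restores optimizer, LR scheduler and RNG state from the files in
# the checkpoint directory before continuing the training loop.
trainer.train(resume_from_checkpoint="last-checkpoint")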
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.7587354409317804,
+  "epoch": 1.0016638935108153,
   "eval_steps": 500,
-  "global_step": 228,
+  "global_step": 301,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -1603,6 +1603,525 @@
       "learning_rate": 1.427141579677374e-05,
       "loss": 0.5648,
       "step": 228
+    },
+    {
+      "epoch": 0.762063227953411,
+      "grad_norm": 0.5607278943061829,
+      "learning_rate": 1.3902195302273779e-05,
+      "loss": 0.3263,
+      "step": 229
+    },
+    {
+      "epoch": 0.7653910149750416,
+      "grad_norm": 0.5801076889038086,
+      "learning_rate": 1.3537041048046695e-05,
+      "loss": 0.4707,
+      "step": 230
+    },
+    {
+      "epoch": 0.7687188019966722,
+      "grad_norm": 0.7465184926986694,
+      "learning_rate": 1.3175994166924394e-05,
+      "loss": 0.4633,
+      "step": 231
+    },
+    {
+      "epoch": 0.7720465890183028,
+      "grad_norm": 0.893689751625061,
+      "learning_rate": 1.2819095329063469e-05,
+      "loss": 0.4621,
+      "step": 232
+    },
+    {
+      "epoch": 0.7753743760399334,
+      "grad_norm": 0.6686207056045532,
+      "learning_rate": 1.246638473736378e-05,
+      "loss": 0.5619,
+      "step": 233
+    },
+    {
+      "epoch": 0.778702163061564,
+      "grad_norm": 0.8450759053230286,
+      "learning_rate": 1.2117902122939861e-05,
+      "loss": 0.578,
+      "step": 234
+    },
+    {
+      "epoch": 0.7820299500831946,
+      "grad_norm": 0.7605635523796082,
+      "learning_rate": 1.1773686740645384e-05,
+      "loss": 0.6534,
+      "step": 235
+    },
+    {
+      "epoch": 0.7853577371048253,
+      "grad_norm": 0.9870882630348206,
+      "learning_rate": 1.1433777364651271e-05,
+      "loss": 0.8853,
+      "step": 236
+    },
+    {
+      "epoch": 0.7886855241264559,
+      "grad_norm": 0.5999244451522827,
+      "learning_rate": 1.1098212284078036e-05,
+      "loss": 0.3816,
+      "step": 237
+    },
+    {
+      "epoch": 0.7920133111480865,
+      "grad_norm": 0.8197599053382874,
+      "learning_rate": 1.076702929868264e-05,
+      "loss": 0.8135,
+      "step": 238
+    },
+    {
+      "epoch": 0.7953410981697171,
+      "grad_norm": 0.7237629294395447,
+      "learning_rate": 1.0440265714600572e-05,
+      "loss": 0.6637,
+      "step": 239
+    },
+    {
+      "epoch": 0.7986688851913477,
+      "grad_norm": 0.8603866696357727,
+      "learning_rate": 1.0117958340143507e-05,
+      "loss": 0.8529,
+      "step": 240
+    },
+    {
+      "epoch": 0.8019966722129783,
+      "grad_norm": 0.7199721336364746,
+      "learning_rate": 9.800143481652979e-06,
+      "loss": 0.8565,
+      "step": 241
+    },
+    {
+      "epoch": 0.8053244592346089,
+      "grad_norm": 0.6758659482002258,
+      "learning_rate": 9.48685693941067e-06,
+      "loss": 0.794,
+      "step": 242
+    },
+    {
+      "epoch": 0.8086522462562395,
+      "grad_norm": 0.7372802495956421,
+      "learning_rate": 9.17813400360572e-06,
+      "loss": 0.8753,
+      "step": 243
+    },
+    {
+      "epoch": 0.8119800332778702,
+      "grad_norm": 0.6816331148147583,
+      "learning_rate": 8.874009450359427e-06,
+      "loss": 0.7685,
+      "step": 244
+    },
+    {
+      "epoch": 0.8153078202995009,
+      "grad_norm": 0.6845401525497437,
+      "learning_rate": 8.574517537807897e-06,
+      "loss": 0.7908,
+      "step": 245
+    },
+    {
+      "epoch": 0.8186356073211315,
+      "grad_norm": 0.7325373291969299,
+      "learning_rate": 8.279692002243027e-06,
+      "loss": 0.9694,
+      "step": 246
+    },
+    {
+      "epoch": 0.8219633943427621,
+      "grad_norm": 0.7457829713821411,
+      "learning_rate": 7.989566054312287e-06,
+      "loss": 0.6347,
+      "step": 247
+    },
+    {
+      "epoch": 0.8252911813643927,
+      "grad_norm": 0.7450778484344482,
+      "learning_rate": 7.704172375277691e-06,
+      "loss": 0.8447,
+      "step": 248
+    },
+    {
+      "epoch": 0.8286189683860233,
+      "grad_norm": 0.7165175676345825,
+      "learning_rate": 7.423543113334436e-06,
+      "loss": 0.786,
+      "step": 249
+    },
+    {
+      "epoch": 0.831946755407654,
+      "grad_norm": 1.0344496965408325,
+      "learning_rate": 7.14770987998954e-06,
+      "loss": 0.8206,
+      "step": 250
+    },
+    {
+      "epoch": 0.8352745424292846,
+      "grad_norm": 0.6772722005844116,
+      "learning_rate": 6.876703746500984e-06,
+      "loss": 0.4412,
+      "step": 251
+    },
+    {
+      "epoch": 0.8386023294509152,
+      "grad_norm": 0.6309685111045837,
+      "learning_rate": 6.610555240377652e-06,
+      "loss": 0.4023,
+      "step": 252
+    },
+    {
+      "epoch": 0.8419301164725458,
+      "grad_norm": 0.8043044805526733,
+      "learning_rate": 6.349294341940593e-06,
+      "loss": 0.4782,
+      "step": 253
+    },
+    {
+      "epoch": 0.8452579034941764,
+      "grad_norm": 0.6318528056144714,
+      "learning_rate": 6.092950480945897e-06,
+      "loss": 0.4712,
+      "step": 254
+    },
+    {
+      "epoch": 0.848585690515807,
+      "grad_norm": 0.5549326539039612,
+      "learning_rate": 5.841552533269534e-06,
+      "loss": 0.3927,
+      "step": 255
+    },
+    {
+      "epoch": 0.8519134775374376,
+      "grad_norm": 0.5588014721870422,
+      "learning_rate": 5.595128817654638e-06,
+      "loss": 0.3666,
+      "step": 256
+    },
+    {
+      "epoch": 0.8552412645590682,
+      "grad_norm": 0.6045800447463989,
+      "learning_rate": 5.353707092521582e-06,
+      "loss": 0.5723,
+      "step": 257
+    },
+    {
+      "epoch": 0.8585690515806988,
+      "grad_norm": 0.6416806578636169,
+      "learning_rate": 5.117314552841052e-06,
+      "loss": 0.4304,
+      "step": 258
+    },
+    {
+      "epoch": 0.8618968386023295,
+      "grad_norm": 0.5945389866828918,
+      "learning_rate": 4.885977827070748e-06,
+      "loss": 0.343,
+      "step": 259
+    },
+    {
+      "epoch": 0.8652246256239601,
+      "grad_norm": 0.6209045052528381,
+      "learning_rate": 4.659722974155767e-06,
+      "loss": 0.3872,
+      "step": 260
+    },
+    {
+      "epoch": 0.8685524126455907,
+      "grad_norm": 0.627796471118927,
+      "learning_rate": 4.43857548059321e-06,
+      "loss": 0.3544,
+      "step": 261
+    },
+    {
+      "epoch": 0.8718801996672213,
+      "grad_norm": 0.5913266539573669,
+      "learning_rate": 4.2225602575612755e-06,
+      "loss": 0.4773,
+      "step": 262
+    },
+    {
+      "epoch": 0.8752079866888519,
+      "grad_norm": 0.6416683793067932,
+      "learning_rate": 4.011701638113063e-06,
+      "loss": 0.412,
+      "step": 263
+    },
+    {
+      "epoch": 0.8785357737104825,
+      "grad_norm": 0.8090274930000305,
+      "learning_rate": 3.8060233744356633e-06,
+      "loss": 0.5336,
+      "step": 264
+    },
+    {
+      "epoch": 0.8818635607321131,
+      "grad_norm": 0.5056114792823792,
+      "learning_rate": 3.605548635174533e-06,
+      "loss": 0.2786,
+      "step": 265
+    },
+    {
+      "epoch": 0.8851913477537438,
+      "grad_norm": 0.566896915435791,
+      "learning_rate": 3.410300002823691e-06,
+      "loss": 0.3718,
+      "step": 266
+    },
+    {
+      "epoch": 0.8885191347753744,
+      "grad_norm": 0.5743393898010254,
+      "learning_rate": 3.220299471181898e-06,
+      "loss": 0.4044,
+      "step": 267
+    },
+    {
+      "epoch": 0.891846921797005,
+      "grad_norm": 0.634122908115387,
+      "learning_rate": 3.035568442875136e-06,
+      "loss": 0.6042,
+      "step": 268
+    },
+    {
+      "epoch": 0.8951747088186356,
+      "grad_norm": 0.39157751202583313,
+      "learning_rate": 2.85612772694579e-06,
+      "loss": 0.153,
+      "step": 269
+    },
+    {
+      "epoch": 0.8985024958402662,
+      "grad_norm": 0.5736538171768188,
+      "learning_rate": 2.6819975365085237e-06,
+      "loss": 0.3097,
+      "step": 270
+    },
+    {
+      "epoch": 0.9018302828618968,
+      "grad_norm": 0.5841122269630432,
+      "learning_rate": 2.5131974864734066e-06,
+      "loss": 0.4648,
+      "step": 271
+    },
+    {
+      "epoch": 0.9051580698835274,
+      "grad_norm": 0.5904918313026428,
+      "learning_rate": 2.349746591336405e-06,
+      "loss": 0.5159,
+      "step": 272
+    },
+    {
+      "epoch": 0.908485856905158,
+      "grad_norm": 0.5497831702232361,
+      "learning_rate": 2.191663263037458e-06,
+      "loss": 0.3491,
+      "step": 273
+    },
+    {
+      "epoch": 0.9118136439267887,
+      "grad_norm": 0.6325455904006958,
+      "learning_rate": 2.0389653088865036e-06,
+      "loss": 0.4999,
+      "step": 274
+    },
+    {
+      "epoch": 0.9151414309484193,
+      "grad_norm": 0.5889260172843933,
+      "learning_rate": 1.8916699295575324e-06,
+      "loss": 0.4644,
+      "step": 275
+    },
+    {
+      "epoch": 0.9184692179700499,
+      "grad_norm": 0.5664623379707336,
+      "learning_rate": 1.7497937171510547e-06,
+      "loss": 0.3335,
+      "step": 276
+    },
+    {
+      "epoch": 0.9217970049916805,
+      "grad_norm": 0.49772658944129944,
+      "learning_rate": 1.6133526533250565e-06,
+      "loss": 0.3306,
+      "step": 277
+    },
+    {
+      "epoch": 0.9251247920133111,
+      "grad_norm": 0.4992808699607849,
+      "learning_rate": 1.4823621074947503e-06,
+      "loss": 0.354,
+      "step": 278
+    },
+    {
+      "epoch": 0.9284525790349417,
+      "grad_norm": 0.4994637966156006,
+      "learning_rate": 1.3568368351012717e-06,
+      "loss": 0.3448,
+      "step": 279
+    },
+    {
+      "epoch": 0.9317803660565723,
+      "grad_norm": 0.48505714535713196,
+      "learning_rate": 1.236790975949592e-06,
+      "loss": 0.3501,
+      "step": 280
+    },
+    {
+      "epoch": 0.9351081530782029,
+      "grad_norm": 1.016489028930664,
+      "learning_rate": 1.1222380526156928e-06,
+      "loss": 0.4423,
+      "step": 281
+    },
+    {
+      "epoch": 0.9384359400998337,
+      "grad_norm": 0.7452896237373352,
+      "learning_rate": 1.0131909689233442e-06,
+      "loss": 0.6174,
+      "step": 282
+    },
+    {
+      "epoch": 0.9417637271214643,
+      "grad_norm": 1.0553852319717407,
+      "learning_rate": 9.096620084905472e-07,
+      "loss": 0.7014,
+      "step": 283
+    },
+    {
+      "epoch": 0.9450915141430949,
+      "grad_norm": 0.7587735652923584,
+      "learning_rate": 8.11662833345822e-07,
+      "loss": 0.6485,
+      "step": 284
+    },
+    {
+      "epoch": 0.9484193011647255,
+      "grad_norm": 0.7494546175003052,
+      "learning_rate": 7.192044826145771e-07,
+      "loss": 0.7379,
+      "step": 285
+    },
+    {
+      "epoch": 0.9517470881863561,
+      "grad_norm": 0.8453697562217712,
+      "learning_rate": 6.322973712755697e-07,
+      "loss": 0.8027,
+      "step": 286
+    },
+    {
+      "epoch": 0.9550748752079867,
+      "grad_norm": 0.6284394860267639,
+      "learning_rate": 5.509512889877333e-07,
+      "loss": 0.5964,
+      "step": 287
+    },
+    {
+      "epoch": 0.9584026622296173,
+      "grad_norm": 0.8004696369171143,
+      "learning_rate": 4.7517539898741524e-07,
+      "loss": 0.6745,
+      "step": 288
+    },
+    {
+      "epoch": 0.961730449251248,
+      "grad_norm": 0.7253815531730652,
+      "learning_rate": 4.049782370561583e-07,
+      "loss": 0.7672,
+      "step": 289
+    },
+    {
+      "epoch": 0.9650582362728786,
+      "grad_norm": 0.7120758891105652,
+      "learning_rate": 3.4036771055923066e-07,
+      "loss": 0.6682,
+      "step": 290
+    },
+    {
+      "epoch": 0.9683860232945092,
+      "grad_norm": 0.7899070978164673,
+      "learning_rate": 2.813510975548772e-07,
+      "loss": 0.7882,
+      "step": 291
+    },
+    {
+      "epoch": 0.9717138103161398,
+      "grad_norm": 0.8241245150566101,
+      "learning_rate": 2.2793504597447002e-07,
+      "loss": 0.8099,
+      "step": 292
+    },
+    {
+      "epoch": 0.9750415973377704,
+      "grad_norm": 0.7875584363937378,
+      "learning_rate": 1.8012557287367392e-07,
+      "loss": 1.0536,
+      "step": 293
+    },
+    {
+      "epoch": 0.978369384359401,
+      "grad_norm": 0.8683494329452515,
+      "learning_rate": 1.379280637546443e-07,
+      "loss": 0.8868,
+      "step": 294
+    },
+    {
+      "epoch": 0.9816971713810316,
+      "grad_norm": 0.7692094445228577,
+      "learning_rate": 1.0134727195937333e-07,
+      "loss": 0.7747,
+      "step": 295
+    },
+    {
+      "epoch": 0.9850249584026622,
+      "grad_norm": 0.7598888278007507,
+      "learning_rate": 7.038731813426291e-08,
+      "loss": 0.7562,
+      "step": 296
+    },
+    {
+      "epoch": 0.9883527454242929,
+      "grad_norm": 0.759955108165741,
+      "learning_rate": 4.5051689765929214e-08,
+      "loss": 0.775,
+      "step": 297
+    },
+    {
+      "epoch": 0.9916805324459235,
+      "grad_norm": 0.764261782169342,
+      "learning_rate": 2.534324078837802e-08,
+      "loss": 0.6648,
+      "step": 298
+    },
+    {
+      "epoch": 0.9950083194675541,
+      "grad_norm": 0.8436802625656128,
+      "learning_rate": 1.1264191261528557e-08,
+      "loss": 0.848,
+      "step": 299
+    },
+    {
+      "epoch": 0.9983361064891847,
+      "grad_norm": 0.9775305390357971,
+      "learning_rate": 2.8161271211024633e-09,
+      "loss": 0.7154,
+      "step": 300
+    },
+    {
+      "epoch": 0.9983361064891847,
+      "eval_loss": 0.49492859840393066,
+      "eval_runtime": 26.7842,
+      "eval_samples_per_second": 9.446,
+      "eval_steps_per_second": 2.389,
+      "step": 300
+    },
+    {
+      "epoch": 1.0016638935108153,
+      "grad_norm": 0.9796708226203918,
+      "learning_rate": 0.0,
+      "loss": 0.4979,
+      "step": 301
     }
   ],
   "logging_steps": 1,
@@ -1617,12 +2136,12 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop": false
+        "should_training_stop": true
       },
       "attributes": {}
     }
   },
-  "total_flos": 2.9872743929767526e+17,
+  "total_flos": 3.93702389745451e+17,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null