ErrorAI commited on
Commit
4facfdb
·
verified ·
1 Parent(s): d14afd3

Training in progress, step 319, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7663245967dcd438ca1bc197179b74400981a7f2e6c2dda91b6452cb974fbe7f
3
  size 17640136
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2793cb941cf4e5be9ee6dd19ed4ee7939c42ea436e22067013e1e6437da83a63
3
  size 17640136
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2d6950c73423891613a4dc03f77cc7c421f22c42e33cba6379929245061fe539
3
- size 9568884
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d88db198bae735a091dad5b931adda41f286b050e42862eeb28107bb22842db
3
+ size 9569204
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:361fb425516f83f38c261aa6c9128c83ed47651429def5cd3ac42d2b7b0342fc
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0cab4db58c0b7d45257acb195675ff85e37828305a1b72e7f13f6f2c7b58a19e
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6cf0ce84b55ef92ba1a34321398985dec341ef213dc7495da15b9effc346b4bc
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4222df64063a0da99a3d9b170b238ff21de6b814178c3cb5d1a90ab8274aa2a3
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.7541241162608012,
5
  "eval_steps": 80,
6
- "global_step": 240,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1719,6 +1719,559 @@
1719
  "eval_samples_per_second": 50.651,
1720
  "eval_steps_per_second": 25.326,
1721
  "step": 240
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1722
  }
1723
  ],
1724
  "logging_steps": 1,
@@ -1733,12 +2286,12 @@
1733
  "should_evaluate": false,
1734
  "should_log": false,
1735
  "should_save": true,
1736
- "should_training_stop": false
1737
  },
1738
  "attributes": {}
1739
  }
1740
  },
1741
- "total_flos": 2136915931299840.0,
1742
  "train_batch_size": 2,
1743
  "trial_name": null,
1744
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.002356637863315,
5
  "eval_steps": 80,
6
+ "global_step": 319,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1719
  "eval_samples_per_second": 50.651,
1720
  "eval_steps_per_second": 25.326,
1721
  "step": 240
1722
+ },
1723
+ {
1724
+ "epoch": 0.7572663000785546,
1725
+ "grad_norm": 3.192919969558716,
1726
+ "learning_rate": 2.98305571716907e-05,
1727
+ "loss": 1.7715,
1728
+ "step": 241
1729
+ },
1730
+ {
1731
+ "epoch": 0.7604084838963079,
1732
+ "grad_norm": 2.2202773094177246,
1733
+ "learning_rate": 2.9109822211964043e-05,
1734
+ "loss": 1.1682,
1735
+ "step": 242
1736
+ },
1737
+ {
1738
+ "epoch": 0.7635506677140613,
1739
+ "grad_norm": 4.489591598510742,
1740
+ "learning_rate": 2.8396414908880098e-05,
1741
+ "loss": 1.5771,
1742
+ "step": 243
1743
+ },
1744
+ {
1745
+ "epoch": 0.7666928515318147,
1746
+ "grad_norm": 2.944850444793701,
1747
+ "learning_rate": 2.769040900472488e-05,
1748
+ "loss": 1.5206,
1749
+ "step": 244
1750
+ },
1751
+ {
1752
+ "epoch": 0.769835035349568,
1753
+ "grad_norm": 3.2115588188171387,
1754
+ "learning_rate": 2.699187747672899e-05,
1755
+ "loss": 1.8927,
1756
+ "step": 245
1757
+ },
1758
+ {
1759
+ "epoch": 0.7729772191673213,
1760
+ "grad_norm": 3.45051908493042,
1761
+ "learning_rate": 2.630089252952427e-05,
1762
+ "loss": 1.5243,
1763
+ "step": 246
1764
+ },
1765
+ {
1766
+ "epoch": 0.7761194029850746,
1767
+ "grad_norm": 3.194263458251953,
1768
+ "learning_rate": 2.5617525587680402e-05,
1769
+ "loss": 1.8308,
1770
+ "step": 247
1771
+ },
1772
+ {
1773
+ "epoch": 0.779261586802828,
1774
+ "grad_norm": 3.0666756629943848,
1775
+ "learning_rate": 2.4941847288321797e-05,
1776
+ "loss": 1.4071,
1777
+ "step": 248
1778
+ },
1779
+ {
1780
+ "epoch": 0.7824037706205813,
1781
+ "grad_norm": 2.672899007797241,
1782
+ "learning_rate": 2.427392747382623e-05,
1783
+ "loss": 1.3175,
1784
+ "step": 249
1785
+ },
1786
+ {
1787
+ "epoch": 0.7855459544383346,
1788
+ "grad_norm": 3.026980400085449,
1789
+ "learning_rate": 2.3613835184605525e-05,
1790
+ "loss": 1.5728,
1791
+ "step": 250
1792
+ },
1793
+ {
1794
+ "epoch": 0.788688138256088,
1795
+ "grad_norm": 3.842451333999634,
1796
+ "learning_rate": 2.2961638651968975e-05,
1797
+ "loss": 1.3117,
1798
+ "step": 251
1799
+ },
1800
+ {
1801
+ "epoch": 0.7918303220738413,
1802
+ "grad_norm": 3.67557430267334,
1803
+ "learning_rate": 2.231740529107057e-05,
1804
+ "loss": 1.4359,
1805
+ "step": 252
1806
+ },
1807
+ {
1808
+ "epoch": 0.7949725058915946,
1809
+ "grad_norm": 2.9803576469421387,
1810
+ "learning_rate": 2.1681201693940668e-05,
1811
+ "loss": 1.8331,
1812
+ "step": 253
1813
+ },
1814
+ {
1815
+ "epoch": 0.798114689709348,
1816
+ "grad_norm": 2.4087929725646973,
1817
+ "learning_rate": 2.1053093622602404e-05,
1818
+ "loss": 1.2012,
1819
+ "step": 254
1820
+ },
1821
+ {
1822
+ "epoch": 0.8012568735271013,
1823
+ "grad_norm": 2.690004587173462,
1824
+ "learning_rate": 2.043314600227425e-05,
1825
+ "loss": 1.3592,
1826
+ "step": 255
1827
+ },
1828
+ {
1829
+ "epoch": 0.8043990573448547,
1830
+ "grad_norm": 3.7722082138061523,
1831
+ "learning_rate": 1.982142291465896e-05,
1832
+ "loss": 1.6612,
1833
+ "step": 256
1834
+ },
1835
+ {
1836
+ "epoch": 0.8075412411626081,
1837
+ "grad_norm": 3.7470076084136963,
1838
+ "learning_rate": 1.921798759131953e-05,
1839
+ "loss": 1.9673,
1840
+ "step": 257
1841
+ },
1842
+ {
1843
+ "epoch": 0.8106834249803614,
1844
+ "grad_norm": 3.7614829540252686,
1845
+ "learning_rate": 1.8622902407143394e-05,
1846
+ "loss": 1.8566,
1847
+ "step": 258
1848
+ },
1849
+ {
1850
+ "epoch": 0.8138256087981147,
1851
+ "grad_norm": 3.7828967571258545,
1852
+ "learning_rate": 1.8036228873894746e-05,
1853
+ "loss": 2.0554,
1854
+ "step": 259
1855
+ },
1856
+ {
1857
+ "epoch": 0.816967792615868,
1858
+ "grad_norm": 4.305722713470459,
1859
+ "learning_rate": 1.7458027633856478e-05,
1860
+ "loss": 1.5327,
1861
+ "step": 260
1862
+ },
1863
+ {
1864
+ "epoch": 0.8201099764336214,
1865
+ "grad_norm": 3.276519775390625,
1866
+ "learning_rate": 1.6888358453561648e-05,
1867
+ "loss": 1.1896,
1868
+ "step": 261
1869
+ },
1870
+ {
1871
+ "epoch": 0.8232521602513747,
1872
+ "grad_norm": 3.5952844619750977,
1873
+ "learning_rate": 1.6327280217615792e-05,
1874
+ "loss": 1.9557,
1875
+ "step": 262
1876
+ },
1877
+ {
1878
+ "epoch": 0.826394344069128,
1879
+ "grad_norm": 3.9459924697875977,
1880
+ "learning_rate": 1.577485092261012e-05,
1881
+ "loss": 1.3272,
1882
+ "step": 263
1883
+ },
1884
+ {
1885
+ "epoch": 0.8295365278868814,
1886
+ "grad_norm": 4.268272399902344,
1887
+ "learning_rate": 1.5231127671126677e-05,
1888
+ "loss": 1.5323,
1889
+ "step": 264
1890
+ },
1891
+ {
1892
+ "epoch": 0.8326787117046347,
1893
+ "grad_norm": 3.08357834815979,
1894
+ "learning_rate": 1.4696166665835853e-05,
1895
+ "loss": 1.5671,
1896
+ "step": 265
1897
+ },
1898
+ {
1899
+ "epoch": 0.835820895522388,
1900
+ "grad_norm": 2.9765706062316895,
1901
+ "learning_rate": 1.4170023203686878e-05,
1902
+ "loss": 1.2923,
1903
+ "step": 266
1904
+ },
1905
+ {
1906
+ "epoch": 0.8389630793401414,
1907
+ "grad_norm": 4.222568511962891,
1908
+ "learning_rate": 1.3652751670192077e-05,
1909
+ "loss": 1.5935,
1910
+ "step": 267
1911
+ },
1912
+ {
1913
+ "epoch": 0.8421052631578947,
1914
+ "grad_norm": 3.0130860805511475,
1915
+ "learning_rate": 1.3144405533805138e-05,
1916
+ "loss": 1.2738,
1917
+ "step": 268
1918
+ },
1919
+ {
1920
+ "epoch": 0.845247446975648,
1921
+ "grad_norm": 2.5390868186950684,
1922
+ "learning_rate": 1.2645037340394284e-05,
1923
+ "loss": 1.2452,
1924
+ "step": 269
1925
+ },
1926
+ {
1927
+ "epoch": 0.8483896307934015,
1928
+ "grad_norm": 3.9467477798461914,
1929
+ "learning_rate": 1.215469870781093e-05,
1930
+ "loss": 1.6997,
1931
+ "step": 270
1932
+ },
1933
+ {
1934
+ "epoch": 0.8515318146111548,
1935
+ "grad_norm": 3.4239230155944824,
1936
+ "learning_rate": 1.167344032055394e-05,
1937
+ "loss": 1.4533,
1938
+ "step": 271
1939
+ },
1940
+ {
1941
+ "epoch": 0.8546739984289081,
1942
+ "grad_norm": 3.6655142307281494,
1943
+ "learning_rate": 1.120131192453069e-05,
1944
+ "loss": 1.7422,
1945
+ "step": 272
1946
+ },
1947
+ {
1948
+ "epoch": 0.8578161822466615,
1949
+ "grad_norm": 3.3172128200531006,
1950
+ "learning_rate": 1.0738362321914997e-05,
1951
+ "loss": 1.4802,
1952
+ "step": 273
1953
+ },
1954
+ {
1955
+ "epoch": 0.8609583660644148,
1956
+ "grad_norm": 2.381716728210449,
1957
+ "learning_rate": 1.02846393661026e-05,
1958
+ "loss": 0.9949,
1959
+ "step": 274
1960
+ },
1961
+ {
1962
+ "epoch": 0.8641005498821681,
1963
+ "grad_norm": 2.715409278869629,
1964
+ "learning_rate": 9.840189956764677e-06,
1965
+ "loss": 1.3225,
1966
+ "step": 275
1967
+ },
1968
+ {
1969
+ "epoch": 0.8672427336999214,
1970
+ "grad_norm": 3.293710470199585,
1971
+ "learning_rate": 9.405060035000135e-06,
1972
+ "loss": 1.7854,
1973
+ "step": 276
1974
+ },
1975
+ {
1976
+ "epoch": 0.8703849175176748,
1977
+ "grad_norm": 3.462662935256958,
1978
+ "learning_rate": 8.979294578586738e-06,
1979
+ "loss": 1.8729,
1980
+ "step": 277
1981
+ },
1982
+ {
1983
+ "epoch": 0.8735271013354281,
1984
+ "grad_norm": 2.9739253520965576,
1985
+ "learning_rate": 8.562937597331899e-06,
1986
+ "loss": 1.7112,
1987
+ "step": 278
1988
+ },
1989
+ {
1990
+ "epoch": 0.8766692851531814,
1991
+ "grad_norm": 2.937610387802124,
1992
+ "learning_rate": 8.156032128523694e-06,
1993
+ "loss": 1.3997,
1994
+ "step": 279
1995
+ },
1996
+ {
1997
+ "epoch": 0.8798114689709348,
1998
+ "grad_norm": 3.099679946899414,
1999
+ "learning_rate": 7.758620232482084e-06,
2000
+ "loss": 1.2978,
2001
+ "step": 280
2002
+ },
2003
+ {
2004
+ "epoch": 0.8829536527886881,
2005
+ "grad_norm": 3.333786964416504,
2006
+ "learning_rate": 7.370742988211365e-06,
2007
+ "loss": 1.3361,
2008
+ "step": 281
2009
+ },
2010
+ {
2011
+ "epoch": 0.8860958366064414,
2012
+ "grad_norm": 3.3246827125549316,
2013
+ "learning_rate": 6.992440489154051e-06,
2014
+ "loss": 1.7086,
2015
+ "step": 282
2016
+ },
2017
+ {
2018
+ "epoch": 0.8892380204241949,
2019
+ "grad_norm": 3.216891050338745,
2020
+ "learning_rate": 6.623751839046455e-06,
2021
+ "loss": 1.4404,
2022
+ "step": 283
2023
+ },
2024
+ {
2025
+ "epoch": 0.8923802042419482,
2026
+ "grad_norm": 3.9805006980895996,
2027
+ "learning_rate": 6.264715147876743e-06,
2028
+ "loss": 1.5001,
2029
+ "step": 284
2030
+ },
2031
+ {
2032
+ "epoch": 0.8955223880597015,
2033
+ "grad_norm": 3.0989222526550293,
2034
+ "learning_rate": 5.915367527945615e-06,
2035
+ "loss": 1.6261,
2036
+ "step": 285
2037
+ },
2038
+ {
2039
+ "epoch": 0.8986645718774549,
2040
+ "grad_norm": 3.5665836334228516,
2041
+ "learning_rate": 5.575745090030138e-06,
2042
+ "loss": 1.4717,
2043
+ "step": 286
2044
+ },
2045
+ {
2046
+ "epoch": 0.9018067556952082,
2047
+ "grad_norm": 2.7769277095794678,
2048
+ "learning_rate": 5.245882939651181e-06,
2049
+ "loss": 1.6286,
2050
+ "step": 287
2051
+ },
2052
+ {
2053
+ "epoch": 0.9049489395129615,
2054
+ "grad_norm": 3.361825466156006,
2055
+ "learning_rate": 4.92581517344457e-06,
2056
+ "loss": 1.5557,
2057
+ "step": 288
2058
+ },
2059
+ {
2060
+ "epoch": 0.9080911233307148,
2061
+ "grad_norm": 2.8311612606048584,
2062
+ "learning_rate": 4.61557487563673e-06,
2063
+ "loss": 1.9971,
2064
+ "step": 289
2065
+ },
2066
+ {
2067
+ "epoch": 0.9112333071484682,
2068
+ "grad_norm": 3.1088674068450928,
2069
+ "learning_rate": 4.315194114624888e-06,
2070
+ "loss": 1.4003,
2071
+ "step": 290
2072
+ },
2073
+ {
2074
+ "epoch": 0.9143754909662215,
2075
+ "grad_norm": 3.9171743392944336,
2076
+ "learning_rate": 4.0247039396622e-06,
2077
+ "loss": 1.7814,
2078
+ "step": 291
2079
+ },
2080
+ {
2081
+ "epoch": 0.9175176747839748,
2082
+ "grad_norm": 2.549408197402954,
2083
+ "learning_rate": 3.7441343776484117e-06,
2084
+ "loss": 1.0887,
2085
+ "step": 292
2086
+ },
2087
+ {
2088
+ "epoch": 0.9206598586017282,
2089
+ "grad_norm": 3.2176060676574707,
2090
+ "learning_rate": 3.473514430026026e-06,
2091
+ "loss": 1.068,
2092
+ "step": 293
2093
+ },
2094
+ {
2095
+ "epoch": 0.9238020424194815,
2096
+ "grad_norm": 3.184469223022461,
2097
+ "learning_rate": 3.212872069782513e-06,
2098
+ "loss": 1.8995,
2099
+ "step": 294
2100
+ },
2101
+ {
2102
+ "epoch": 0.9269442262372348,
2103
+ "grad_norm": 2.329665422439575,
2104
+ "learning_rate": 2.9622342385589254e-06,
2105
+ "loss": 1.1741,
2106
+ "step": 295
2107
+ },
2108
+ {
2109
+ "epoch": 0.9300864100549883,
2110
+ "grad_norm": 3.1904358863830566,
2111
+ "learning_rate": 2.7216268438649773e-06,
2112
+ "loss": 1.1471,
2113
+ "step": 296
2114
+ },
2115
+ {
2116
+ "epoch": 0.9332285938727416,
2117
+ "grad_norm": 3.151217460632324,
2118
+ "learning_rate": 2.4910747564010685e-06,
2119
+ "loss": 1.6419,
2120
+ "step": 297
2121
+ },
2122
+ {
2123
+ "epoch": 0.9363707776904949,
2124
+ "grad_norm": 3.322706937789917,
2125
+ "learning_rate": 2.2706018074875045e-06,
2126
+ "loss": 1.4371,
2127
+ "step": 298
2128
+ },
2129
+ {
2130
+ "epoch": 0.9395129615082483,
2131
+ "grad_norm": 2.8656439781188965,
2132
+ "learning_rate": 2.060230786601225e-06,
2133
+ "loss": 1.7465,
2134
+ "step": 299
2135
+ },
2136
+ {
2137
+ "epoch": 0.9426551453260016,
2138
+ "grad_norm": 3.795491933822632,
2139
+ "learning_rate": 1.8599834390199855e-06,
2140
+ "loss": 1.5261,
2141
+ "step": 300
2142
+ },
2143
+ {
2144
+ "epoch": 0.9457973291437549,
2145
+ "grad_norm": 4.55732536315918,
2146
+ "learning_rate": 1.6698804635747579e-06,
2147
+ "loss": 1.5042,
2148
+ "step": 301
2149
+ },
2150
+ {
2151
+ "epoch": 0.9489395129615082,
2152
+ "grad_norm": 3.409609794616699,
2153
+ "learning_rate": 1.4899415105101067e-06,
2154
+ "loss": 1.274,
2155
+ "step": 302
2156
+ },
2157
+ {
2158
+ "epoch": 0.9520816967792616,
2159
+ "grad_norm": 3.027373790740967,
2160
+ "learning_rate": 1.3201851794530373e-06,
2161
+ "loss": 1.2252,
2162
+ "step": 303
2163
+ },
2164
+ {
2165
+ "epoch": 0.9552238805970149,
2166
+ "grad_norm": 2.8063504695892334,
2167
+ "learning_rate": 1.160629017490389e-06,
2168
+ "loss": 1.4064,
2169
+ "step": 304
2170
+ },
2171
+ {
2172
+ "epoch": 0.9583660644147682,
2173
+ "grad_norm": 2.025984287261963,
2174
+ "learning_rate": 1.0112895173551185e-06,
2175
+ "loss": 0.9282,
2176
+ "step": 305
2177
+ },
2178
+ {
2179
+ "epoch": 0.9615082482325216,
2180
+ "grad_norm": 2.6216843128204346,
2181
+ "learning_rate": 8.721821157214316e-07,
2182
+ "loss": 1.1329,
2183
+ "step": 306
2184
+ },
2185
+ {
2186
+ "epoch": 0.9646504320502749,
2187
+ "grad_norm": 2.7919013500213623,
2188
+ "learning_rate": 7.433211916092142e-07,
2189
+ "loss": 1.5915,
2190
+ "step": 307
2191
+ },
2192
+ {
2193
+ "epoch": 0.9677926158680282,
2194
+ "grad_norm": 2.705536127090454,
2195
+ "learning_rate": 6.247200648976991e-07,
2196
+ "loss": 1.2777,
2197
+ "step": 308
2198
+ },
2199
+ {
2200
+ "epoch": 0.9709347996857817,
2201
+ "grad_norm": 2.7058138847351074,
2202
+ "learning_rate": 5.163909949486234e-07,
2203
+ "loss": 1.0402,
2204
+ "step": 309
2205
+ },
2206
+ {
2207
+ "epoch": 0.974076983503535,
2208
+ "grad_norm": 2.9219777584075928,
2209
+ "learning_rate": 4.1834517933907467e-07,
2210
+ "loss": 1.5433,
2211
+ "step": 310
2212
+ },
2213
+ {
2214
+ "epoch": 0.9772191673212883,
2215
+ "grad_norm": 3.0937201976776123,
2216
+ "learning_rate": 3.3059275270396207e-07,
2217
+ "loss": 1.3925,
2218
+ "step": 311
2219
+ },
2220
+ {
2221
+ "epoch": 0.9803613511390417,
2222
+ "grad_norm": 3.8803536891937256,
2223
+ "learning_rate": 2.5314278568850935e-07,
2224
+ "loss": 1.5227,
2225
+ "step": 312
2226
+ },
2227
+ {
2228
+ "epoch": 0.983503534956795,
2229
+ "grad_norm": 2.84466290473938,
2230
+ "learning_rate": 1.8600328401061629e-07,
2231
+ "loss": 1.407,
2232
+ "step": 313
2233
+ },
2234
+ {
2235
+ "epoch": 0.9866457187745483,
2236
+ "grad_norm": 3.9510879516601562,
2237
+ "learning_rate": 1.2918118763335373e-07,
2238
+ "loss": 2.1707,
2239
+ "step": 314
2240
+ },
2241
+ {
2242
+ "epoch": 0.9897879025923016,
2243
+ "grad_norm": 3.5713019371032715,
2244
+ "learning_rate": 8.268237004757096e-08,
2245
+ "loss": 1.8633,
2246
+ "step": 315
2247
+ },
2248
+ {
2249
+ "epoch": 0.992930086410055,
2250
+ "grad_norm": 3.355004072189331,
2251
+ "learning_rate": 4.651163766484779e-08,
2252
+ "loss": 1.8249,
2253
+ "step": 316
2254
+ },
2255
+ {
2256
+ "epoch": 0.9960722702278083,
2257
+ "grad_norm": 3.7547905445098877,
2258
+ "learning_rate": 2.0672729320581065e-08,
2259
+ "loss": 1.2968,
2260
+ "step": 317
2261
+ },
2262
+ {
2263
+ "epoch": 0.9992144540455616,
2264
+ "grad_norm": 2.9850101470947266,
2265
+ "learning_rate": 5.1683158875937e-09,
2266
+ "loss": 1.5139,
2267
+ "step": 318
2268
+ },
2269
+ {
2270
+ "epoch": 1.002356637863315,
2271
+ "grad_norm": 23.679458618164062,
2272
+ "learning_rate": 0.0,
2273
+ "loss": 3.406,
2274
+ "step": 319
2275
  }
2276
  ],
2277
  "logging_steps": 1,
 
2286
  "should_evaluate": false,
2287
  "should_log": false,
2288
  "should_save": true,
2289
+ "should_training_stop": true
2290
  },
2291
  "attributes": {}
2292
  }
2293
  },
2294
+ "total_flos": 2839204448305152.0,
2295
  "train_batch_size": 2,
2296
  "trial_name": null,
2297
  "trial_params": null