NBAmine commited on
Commit
0951e3f
·
verified ·
1 Parent(s): ab65e12

Training in progress, epoch 5, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:549ff55c9b7df5d39243d500e1f9ca5c2d8f81fd61f01bad6e770df99cd4642c
3
  size 228140600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c14b7d8a1648c56d9f25f88d48454e081b6bc178d61bd9f0aadd582257678003
3
  size 228140600
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fa110d49454799444aa7af7da9fa9973fcb4b4d870df4a2d26b538cde1855661
3
  size 117931203
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae997529af0fecd00cf9ea60649b8488f2fcad93e7d57149ed2055f7e443e81c
3
  size 117931203
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c1327c263322022fd0e726fb88ce334f0d0abdf09e2e5d74868c8bdf3e82ffcd
3
- size 14709
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a806988fecdee5121c06d7240dec6e61421fb0008f39bed17de1e2ca05215f14
3
+ size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:00a9d243dd1642fdda05d571b569759bcdedafccf20291ff236e4fc0f24bd4ce
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfca50dfc66d4be0e8bab60e1bfd495197005d876487c7e37b847562cfa51471
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2a8c29a421e8a95f2d6d46e5ac0aa25be56966681afe38a5d47f15222c56ec6b
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c8cc8d8f0165185e683fffb0aab5024d4cdc129dcf7f2bae80e3717e00f0c4e
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": 438,
3
  "best_metric": 1.2615772485733032,
4
  "best_model_checkpoint": "./adapter-phase2/checkpoint-438",
5
- "epoch": 4.0,
6
  "eval_steps": 500,
7
- "global_step": 1752,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1806,6 +1806,458 @@
1806
  "eval_samples_per_second": 3.172,
1807
  "eval_steps_per_second": 0.795,
1808
  "step": 1752
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1809
  }
1810
  ],
1811
  "logging_steps": 10,
@@ -1820,12 +2272,12 @@
1820
  "should_evaluate": false,
1821
  "should_log": false,
1822
  "should_save": true,
1823
- "should_training_stop": false
1824
  },
1825
  "attributes": {}
1826
  }
1827
  },
1828
- "total_flos": 1.1760117320245248e+17,
1829
  "train_batch_size": 1,
1830
  "trial_name": null,
1831
  "trial_params": null
 
2
  "best_global_step": 438,
3
  "best_metric": 1.2615772485733032,
4
  "best_model_checkpoint": "./adapter-phase2/checkpoint-438",
5
+ "epoch": 5.0,
6
  "eval_steps": 500,
7
+ "global_step": 2190,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1806
  "eval_samples_per_second": 3.172,
1807
  "eval_steps_per_second": 0.795,
1808
  "step": 1752
1809
+ },
1810
+ {
1811
+ "entropy": 0.5481636302643701,
1812
+ "epoch": 4.018285714285715,
1813
+ "grad_norm": 1.847604513168335,
1814
+ "learning_rate": 1.9771689497716896e-06,
1815
+ "loss": 0.5007,
1816
+ "mean_token_accuracy": 0.8671465257280752,
1817
+ "num_tokens": 434791.0,
1818
+ "step": 1760
1819
+ },
1820
+ {
1821
+ "entropy": 0.5742504514753819,
1822
+ "epoch": 4.041142857142857,
1823
+ "grad_norm": 2.2244632244110107,
1824
+ "learning_rate": 1.931506849315069e-06,
1825
+ "loss": 0.5271,
1826
+ "mean_token_accuracy": 0.8591625761240721,
1827
+ "num_tokens": 447092.0,
1828
+ "step": 1770
1829
+ },
1830
+ {
1831
+ "entropy": 0.6953230138868094,
1832
+ "epoch": 4.064,
1833
+ "grad_norm": 3.42199444770813,
1834
+ "learning_rate": 1.8858447488584477e-06,
1835
+ "loss": 0.6469,
1836
+ "mean_token_accuracy": 0.8285580322146415,
1837
+ "num_tokens": 456254.0,
1838
+ "step": 1780
1839
+ },
1840
+ {
1841
+ "entropy": 0.7736836820840836,
1842
+ "epoch": 4.086857142857143,
1843
+ "grad_norm": 3.351454257965088,
1844
+ "learning_rate": 1.8401826484018268e-06,
1845
+ "loss": 0.6909,
1846
+ "mean_token_accuracy": 0.8115375626832246,
1847
+ "num_tokens": 463064.0,
1848
+ "step": 1790
1849
+ },
1850
+ {
1851
+ "entropy": 0.771463468298316,
1852
+ "epoch": 4.109714285714285,
1853
+ "grad_norm": 4.134479522705078,
1854
+ "learning_rate": 1.7945205479452058e-06,
1855
+ "loss": 0.6807,
1856
+ "mean_token_accuracy": 0.8126782298088073,
1857
+ "num_tokens": 468431.0,
1858
+ "step": 1800
1859
+ },
1860
+ {
1861
+ "entropy": 0.5649335160851479,
1862
+ "epoch": 4.132571428571429,
1863
+ "grad_norm": 2.1762540340423584,
1864
+ "learning_rate": 1.7488584474885847e-06,
1865
+ "loss": 0.5221,
1866
+ "mean_token_accuracy": 0.8567749988287687,
1867
+ "num_tokens": 482534.0,
1868
+ "step": 1810
1869
+ },
1870
+ {
1871
+ "entropy": 0.5837410872802138,
1872
+ "epoch": 4.155428571428572,
1873
+ "grad_norm": 2.349236011505127,
1874
+ "learning_rate": 1.7031963470319637e-06,
1875
+ "loss": 0.5371,
1876
+ "mean_token_accuracy": 0.8581233065575361,
1877
+ "num_tokens": 494845.0,
1878
+ "step": 1820
1879
+ },
1880
+ {
1881
+ "entropy": 0.6922593496739864,
1882
+ "epoch": 4.178285714285714,
1883
+ "grad_norm": 2.9896738529205322,
1884
+ "learning_rate": 1.6575342465753428e-06,
1885
+ "loss": 0.6648,
1886
+ "mean_token_accuracy": 0.8241277992725372,
1887
+ "num_tokens": 504153.0,
1888
+ "step": 1830
1889
+ },
1890
+ {
1891
+ "entropy": 0.7702124075964093,
1892
+ "epoch": 4.201142857142857,
1893
+ "grad_norm": 3.322385549545288,
1894
+ "learning_rate": 1.6118721461187218e-06,
1895
+ "loss": 0.6712,
1896
+ "mean_token_accuracy": 0.8188040722161531,
1897
+ "num_tokens": 511101.0,
1898
+ "step": 1840
1899
+ },
1900
+ {
1901
+ "entropy": 0.8012370727956295,
1902
+ "epoch": 4.224,
1903
+ "grad_norm": 4.359086036682129,
1904
+ "learning_rate": 1.5662100456621007e-06,
1905
+ "loss": 0.6748,
1906
+ "mean_token_accuracy": 0.8118188168853522,
1907
+ "num_tokens": 516460.0,
1908
+ "step": 1850
1909
+ },
1910
+ {
1911
+ "entropy": 0.5523576781153678,
1912
+ "epoch": 4.246857142857142,
1913
+ "grad_norm": 2.107539176940918,
1914
+ "learning_rate": 1.5205479452054797e-06,
1915
+ "loss": 0.5081,
1916
+ "mean_token_accuracy": 0.8673421230167151,
1917
+ "num_tokens": 530848.0,
1918
+ "step": 1860
1919
+ },
1920
+ {
1921
+ "entropy": 0.5729602897539735,
1922
+ "epoch": 4.269714285714286,
1923
+ "grad_norm": 2.5580902099609375,
1924
+ "learning_rate": 1.4748858447488584e-06,
1925
+ "loss": 0.5319,
1926
+ "mean_token_accuracy": 0.8585442833602428,
1927
+ "num_tokens": 543148.0,
1928
+ "step": 1870
1929
+ },
1930
+ {
1931
+ "entropy": 0.6956869766116143,
1932
+ "epoch": 4.292571428571429,
1933
+ "grad_norm": 3.1137397289276123,
1934
+ "learning_rate": 1.4292237442922373e-06,
1935
+ "loss": 0.6509,
1936
+ "mean_token_accuracy": 0.8259295519441366,
1937
+ "num_tokens": 552615.0,
1938
+ "step": 1880
1939
+ },
1940
+ {
1941
+ "entropy": 0.7814504994079471,
1942
+ "epoch": 4.315428571428572,
1943
+ "grad_norm": 3.9837899208068848,
1944
+ "learning_rate": 1.3835616438356165e-06,
1945
+ "loss": 0.6732,
1946
+ "mean_token_accuracy": 0.8206901982426643,
1947
+ "num_tokens": 559644.0,
1948
+ "step": 1890
1949
+ },
1950
+ {
1951
+ "entropy": 0.8006520505994559,
1952
+ "epoch": 4.338285714285714,
1953
+ "grad_norm": 4.293622016906738,
1954
+ "learning_rate": 1.3378995433789954e-06,
1955
+ "loss": 0.687,
1956
+ "mean_token_accuracy": 0.8162542518228293,
1957
+ "num_tokens": 565049.0,
1958
+ "step": 1900
1959
+ },
1960
+ {
1961
+ "entropy": 0.5543251828290522,
1962
+ "epoch": 4.361142857142857,
1963
+ "grad_norm": 2.0004706382751465,
1964
+ "learning_rate": 1.2922374429223744e-06,
1965
+ "loss": 0.5105,
1966
+ "mean_token_accuracy": 0.8651686757802963,
1967
+ "num_tokens": 579511.0,
1968
+ "step": 1910
1969
+ },
1970
+ {
1971
+ "entropy": 0.5778749627992511,
1972
+ "epoch": 4.384,
1973
+ "grad_norm": 2.4198007583618164,
1974
+ "learning_rate": 1.2465753424657535e-06,
1975
+ "loss": 0.5216,
1976
+ "mean_token_accuracy": 0.8559423860162496,
1977
+ "num_tokens": 591873.0,
1978
+ "step": 1920
1979
+ },
1980
+ {
1981
+ "entropy": 0.6606394873932004,
1982
+ "epoch": 4.406857142857143,
1983
+ "grad_norm": 3.485213279724121,
1984
+ "learning_rate": 1.2009132420091325e-06,
1985
+ "loss": 0.6086,
1986
+ "mean_token_accuracy": 0.8372324761003256,
1987
+ "num_tokens": 601337.0,
1988
+ "step": 1930
1989
+ },
1990
+ {
1991
+ "entropy": 0.7576876068487763,
1992
+ "epoch": 4.429714285714286,
1993
+ "grad_norm": 3.3860034942626953,
1994
+ "learning_rate": 1.1552511415525116e-06,
1995
+ "loss": 0.7,
1996
+ "mean_token_accuracy": 0.8150843985378742,
1997
+ "num_tokens": 608326.0,
1998
+ "step": 1940
1999
+ },
2000
+ {
2001
+ "entropy": 0.7779752794653177,
2002
+ "epoch": 4.452571428571429,
2003
+ "grad_norm": 4.422528266906738,
2004
+ "learning_rate": 1.1095890410958906e-06,
2005
+ "loss": 0.6755,
2006
+ "mean_token_accuracy": 0.8102645222097635,
2007
+ "num_tokens": 613860.0,
2008
+ "step": 1950
2009
+ },
2010
+ {
2011
+ "entropy": 0.5644787142053247,
2012
+ "epoch": 4.475428571428571,
2013
+ "grad_norm": 2.2278542518615723,
2014
+ "learning_rate": 1.0639269406392695e-06,
2015
+ "loss": 0.5188,
2016
+ "mean_token_accuracy": 0.8587661664932966,
2017
+ "num_tokens": 627853.0,
2018
+ "step": 1960
2019
+ },
2020
+ {
2021
+ "entropy": 0.5740617036819458,
2022
+ "epoch": 4.498285714285714,
2023
+ "grad_norm": 2.4340105056762695,
2024
+ "learning_rate": 1.0182648401826485e-06,
2025
+ "loss": 0.5167,
2026
+ "mean_token_accuracy": 0.8595852922648192,
2027
+ "num_tokens": 640020.0,
2028
+ "step": 1970
2029
+ },
2030
+ {
2031
+ "entropy": 0.671654068864882,
2032
+ "epoch": 4.521142857142857,
2033
+ "grad_norm": 3.127539873123169,
2034
+ "learning_rate": 9.726027397260274e-07,
2035
+ "loss": 0.6331,
2036
+ "mean_token_accuracy": 0.8256836850196123,
2037
+ "num_tokens": 649058.0,
2038
+ "step": 1980
2039
+ },
2040
+ {
2041
+ "entropy": 0.7583655359223485,
2042
+ "epoch": 4.5440000000000005,
2043
+ "grad_norm": 3.5964298248291016,
2044
+ "learning_rate": 9.269406392694065e-07,
2045
+ "loss": 0.679,
2046
+ "mean_token_accuracy": 0.8139733098447323,
2047
+ "num_tokens": 655831.0,
2048
+ "step": 1990
2049
+ },
2050
+ {
2051
+ "entropy": 0.7816360153257846,
2052
+ "epoch": 4.566857142857143,
2053
+ "grad_norm": 4.389492511749268,
2054
+ "learning_rate": 8.812785388127855e-07,
2055
+ "loss": 0.6784,
2056
+ "mean_token_accuracy": 0.8120625615119934,
2057
+ "num_tokens": 661256.0,
2058
+ "step": 2000
2059
+ },
2060
+ {
2061
+ "entropy": 0.5732687024399639,
2062
+ "epoch": 4.589714285714286,
2063
+ "grad_norm": 2.0767221450805664,
2064
+ "learning_rate": 8.356164383561644e-07,
2065
+ "loss": 0.5335,
2066
+ "mean_token_accuracy": 0.8624513667076826,
2067
+ "num_tokens": 675612.0,
2068
+ "step": 2010
2069
+ },
2070
+ {
2071
+ "entropy": 0.5804870082065463,
2072
+ "epoch": 4.612571428571428,
2073
+ "grad_norm": 2.554534673690796,
2074
+ "learning_rate": 7.899543378995435e-07,
2075
+ "loss": 0.5238,
2076
+ "mean_token_accuracy": 0.8590863507241011,
2077
+ "num_tokens": 687684.0,
2078
+ "step": 2020
2079
+ },
2080
+ {
2081
+ "entropy": 0.6967678766697645,
2082
+ "epoch": 4.635428571428571,
2083
+ "grad_norm": 3.255140542984009,
2084
+ "learning_rate": 7.442922374429224e-07,
2085
+ "loss": 0.6487,
2086
+ "mean_token_accuracy": 0.8244634248316288,
2087
+ "num_tokens": 696675.0,
2088
+ "step": 2030
2089
+ },
2090
+ {
2091
+ "entropy": 0.7526340587064624,
2092
+ "epoch": 4.658285714285714,
2093
+ "grad_norm": 3.69323992729187,
2094
+ "learning_rate": 6.986301369863015e-07,
2095
+ "loss": 0.6719,
2096
+ "mean_token_accuracy": 0.8216490592807532,
2097
+ "num_tokens": 703456.0,
2098
+ "step": 2040
2099
+ },
2100
+ {
2101
+ "entropy": 0.7950498787686229,
2102
+ "epoch": 4.6811428571428575,
2103
+ "grad_norm": 4.715794563293457,
2104
+ "learning_rate": 6.529680365296804e-07,
2105
+ "loss": 0.6808,
2106
+ "mean_token_accuracy": 0.8184644509106874,
2107
+ "num_tokens": 708782.0,
2108
+ "step": 2050
2109
+ },
2110
+ {
2111
+ "entropy": 0.5505135927349329,
2112
+ "epoch": 4.704,
2113
+ "grad_norm": 2.3146073818206787,
2114
+ "learning_rate": 6.073059360730594e-07,
2115
+ "loss": 0.507,
2116
+ "mean_token_accuracy": 0.8652824487537145,
2117
+ "num_tokens": 723247.0,
2118
+ "step": 2060
2119
+ },
2120
+ {
2121
+ "entropy": 0.5807493371888995,
2122
+ "epoch": 4.726857142857143,
2123
+ "grad_norm": 2.615732192993164,
2124
+ "learning_rate": 5.616438356164384e-07,
2125
+ "loss": 0.5342,
2126
+ "mean_token_accuracy": 0.854337964951992,
2127
+ "num_tokens": 735283.0,
2128
+ "step": 2070
2129
+ },
2130
+ {
2131
+ "entropy": 0.7081154704093933,
2132
+ "epoch": 4.749714285714286,
2133
+ "grad_norm": 3.0795960426330566,
2134
+ "learning_rate": 5.159817351598174e-07,
2135
+ "loss": 0.6499,
2136
+ "mean_token_accuracy": 0.8241405732929706,
2137
+ "num_tokens": 744298.0,
2138
+ "step": 2080
2139
+ },
2140
+ {
2141
+ "entropy": 0.783203998953104,
2142
+ "epoch": 4.772571428571428,
2143
+ "grad_norm": 3.7807230949401855,
2144
+ "learning_rate": 4.7031963470319636e-07,
2145
+ "loss": 0.6948,
2146
+ "mean_token_accuracy": 0.8167315106838942,
2147
+ "num_tokens": 751212.0,
2148
+ "step": 2090
2149
+ },
2150
+ {
2151
+ "entropy": 0.7742167858406901,
2152
+ "epoch": 4.795428571428571,
2153
+ "grad_norm": 4.185308933258057,
2154
+ "learning_rate": 4.2465753424657536e-07,
2155
+ "loss": 0.6705,
2156
+ "mean_token_accuracy": 0.8145616598427295,
2157
+ "num_tokens": 756648.0,
2158
+ "step": 2100
2159
+ },
2160
+ {
2161
+ "entropy": 0.5554421614855528,
2162
+ "epoch": 4.8182857142857145,
2163
+ "grad_norm": 2.0456132888793945,
2164
+ "learning_rate": 3.7899543378995436e-07,
2165
+ "loss": 0.4982,
2166
+ "mean_token_accuracy": 0.8656269229948521,
2167
+ "num_tokens": 771047.0,
2168
+ "step": 2110
2169
+ },
2170
+ {
2171
+ "entropy": 0.5584687992930413,
2172
+ "epoch": 4.841142857142858,
2173
+ "grad_norm": 2.591322422027588,
2174
+ "learning_rate": 3.3333333333333335e-07,
2175
+ "loss": 0.5038,
2176
+ "mean_token_accuracy": 0.8639244794845581,
2177
+ "num_tokens": 783484.0,
2178
+ "step": 2120
2179
+ },
2180
+ {
2181
+ "entropy": 0.6443877406418324,
2182
+ "epoch": 4.864,
2183
+ "grad_norm": 3.1148664951324463,
2184
+ "learning_rate": 2.8767123287671235e-07,
2185
+ "loss": 0.5898,
2186
+ "mean_token_accuracy": 0.8393101956695318,
2187
+ "num_tokens": 793159.0,
2188
+ "step": 2130
2189
+ },
2190
+ {
2191
+ "entropy": 0.7694938328117132,
2192
+ "epoch": 4.886857142857143,
2193
+ "grad_norm": 3.860647201538086,
2194
+ "learning_rate": 2.4200913242009135e-07,
2195
+ "loss": 0.6661,
2196
+ "mean_token_accuracy": 0.8193999473005533,
2197
+ "num_tokens": 800322.0,
2198
+ "step": 2140
2199
+ },
2200
+ {
2201
+ "entropy": 0.7594648722559214,
2202
+ "epoch": 4.909714285714285,
2203
+ "grad_norm": 4.108844757080078,
2204
+ "learning_rate": 1.9634703196347034e-07,
2205
+ "loss": 0.6513,
2206
+ "mean_token_accuracy": 0.820786502957344,
2207
+ "num_tokens": 805782.0,
2208
+ "step": 2150
2209
+ },
2210
+ {
2211
+ "entropy": 0.5718974178656936,
2212
+ "epoch": 4.932571428571428,
2213
+ "grad_norm": 2.0961592197418213,
2214
+ "learning_rate": 1.5068493150684934e-07,
2215
+ "loss": 0.5321,
2216
+ "mean_token_accuracy": 0.8587038304656744,
2217
+ "num_tokens": 819399.0,
2218
+ "step": 2160
2219
+ },
2220
+ {
2221
+ "entropy": 0.6284356378018856,
2222
+ "epoch": 4.9554285714285715,
2223
+ "grad_norm": 2.863541841506958,
2224
+ "learning_rate": 1.0502283105022832e-07,
2225
+ "loss": 0.5916,
2226
+ "mean_token_accuracy": 0.8409151379019022,
2227
+ "num_tokens": 830244.0,
2228
+ "step": 2170
2229
+ },
2230
+ {
2231
+ "entropy": 0.7672561943531037,
2232
+ "epoch": 4.978285714285715,
2233
+ "grad_norm": 3.52964186668396,
2234
+ "learning_rate": 5.936073059360731e-08,
2235
+ "loss": 0.6968,
2236
+ "mean_token_accuracy": 0.8189954232424498,
2237
+ "num_tokens": 837492.0,
2238
+ "step": 2180
2239
+ },
2240
+ {
2241
+ "entropy": 0.7868584784630098,
2242
+ "epoch": 5.0,
2243
+ "grad_norm": 6.119595050811768,
2244
+ "learning_rate": 1.3698630136986303e-08,
2245
+ "loss": 0.6771,
2246
+ "mean_token_accuracy": 0.8141911743502868,
2247
+ "num_tokens": 842388.0,
2248
+ "step": 2190
2249
+ },
2250
+ {
2251
+ "epoch": 5.0,
2252
+ "eval_accuracy": 0.009840881272949817,
2253
+ "eval_entropy": 0.8240771901193272,
2254
+ "eval_loss": 1.5346653461456299,
2255
+ "eval_mean_token_accuracy": 0.7217774188656605,
2256
+ "eval_num_tokens": 842388.0,
2257
+ "eval_runtime": 325.5092,
2258
+ "eval_samples_per_second": 3.177,
2259
+ "eval_steps_per_second": 0.796,
2260
+ "step": 2190
2261
  }
2262
  ],
2263
  "logging_steps": 10,
 
2272
  "should_evaluate": false,
2273
  "should_log": false,
2274
  "should_save": true,
2275
+ "should_training_stop": true
2276
  },
2277
  "attributes": {}
2278
  }
2279
  },
2280
+ "total_flos": 1.470014665030656e+17,
2281
  "train_batch_size": 1,
2282
  "trial_name": null,
2283
  "trial_params": null