Plofski commited on
Commit
49b0640
·
verified ·
1 Parent(s): 2ffdcc2

Training in progress, step 2500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:07298ce60c7d8cb3c6e42c406800f94fe196f562e78c3b7b1b241dde9eb2a84a
3
  size 536223056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6323430495422b2f5e9b7844076fda108d2adbed4d0037f47d6e99938d8fca29
3
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:27ee21889421d66393ba1ec9978f870de383bd198e67267eebcf4f41f26ae0d1
3
  size 1072594443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23e6291f28b6db5850e454b2320b2900c167fefe5276101f07b3b0cce8757420
3
  size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ff72c43fe4ca3fcf70d83ca8f9e37fa2293fdb34dea33c1c2460b564dd80a06f
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37b2e328f1145450725e3266f16e300be997471c44b08eae4fb08a4a11d9367a
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.40298206729800523,
6
  "eval_steps": 500,
7
- "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1808,6 +1808,456 @@
1808
  "mean_token_accuracy": 0.7935750424861908,
1809
  "num_tokens": 2208982.0,
1810
  "step": 2000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1811
  }
1812
  ],
1813
  "logging_steps": 10,
@@ -1827,7 +2277,7 @@
1827
  "attributes": {}
1828
  }
1829
  },
1830
- "total_flos": 2679210270320640.0,
1831
  "train_batch_size": 8,
1832
  "trial_name": null,
1833
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.5037275841225065,
6
  "eval_steps": 500,
7
+ "global_step": 2500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1808
  "mean_token_accuracy": 0.7935750424861908,
1809
  "num_tokens": 2208982.0,
1810
  "step": 2000
1811
+ },
1812
+ {
1813
+ "epoch": 0.40499697763449527,
1814
+ "grad_norm": 17.875,
1815
+ "learning_rate": 1.7301363422661027e-05,
1816
+ "loss": 0.9803,
1817
+ "mean_token_accuracy": 0.7665176451206207,
1818
+ "num_tokens": 2219728.0,
1819
+ "step": 2010
1820
+ },
1821
+ {
1822
+ "epoch": 0.4070118879709853,
1823
+ "grad_norm": 12.0,
1824
+ "learning_rate": 1.7287930687084426e-05,
1825
+ "loss": 0.9941,
1826
+ "mean_token_accuracy": 0.7633516311645507,
1827
+ "num_tokens": 2230087.0,
1828
+ "step": 2020
1829
+ },
1830
+ {
1831
+ "epoch": 0.4090267983074753,
1832
+ "grad_norm": 11.0625,
1833
+ "learning_rate": 1.7274497951507828e-05,
1834
+ "loss": 0.8946,
1835
+ "mean_token_accuracy": 0.786914736032486,
1836
+ "num_tokens": 2240723.0,
1837
+ "step": 2030
1838
+ },
1839
+ {
1840
+ "epoch": 0.41104170864396533,
1841
+ "grad_norm": 11.0,
1842
+ "learning_rate": 1.7261065215931227e-05,
1843
+ "loss": 0.8631,
1844
+ "mean_token_accuracy": 0.7922929883003235,
1845
+ "num_tokens": 2250999.0,
1846
+ "step": 2040
1847
+ },
1848
+ {
1849
+ "epoch": 0.41305661898045537,
1850
+ "grad_norm": 12.0625,
1851
+ "learning_rate": 1.7247632480354626e-05,
1852
+ "loss": 0.8731,
1853
+ "mean_token_accuracy": 0.7846612274646759,
1854
+ "num_tokens": 2261138.0,
1855
+ "step": 2050
1856
+ },
1857
+ {
1858
+ "epoch": 0.4150715293169454,
1859
+ "grad_norm": 10.125,
1860
+ "learning_rate": 1.7234199744778028e-05,
1861
+ "loss": 0.885,
1862
+ "mean_token_accuracy": 0.7879790186882019,
1863
+ "num_tokens": 2271083.0,
1864
+ "step": 2060
1865
+ },
1866
+ {
1867
+ "epoch": 0.41708643965343545,
1868
+ "grad_norm": 11.375,
1869
+ "learning_rate": 1.7220767009201423e-05,
1870
+ "loss": 0.9247,
1871
+ "mean_token_accuracy": 0.7802027463912964,
1872
+ "num_tokens": 2282490.0,
1873
+ "step": 2070
1874
+ },
1875
+ {
1876
+ "epoch": 0.41910134998992543,
1877
+ "grad_norm": 10.625,
1878
+ "learning_rate": 1.7207334273624825e-05,
1879
+ "loss": 0.953,
1880
+ "mean_token_accuracy": 0.7730051100254058,
1881
+ "num_tokens": 2293586.0,
1882
+ "step": 2080
1883
+ },
1884
+ {
1885
+ "epoch": 0.42111626032641547,
1886
+ "grad_norm": 11.625,
1887
+ "learning_rate": 1.7193901538048224e-05,
1888
+ "loss": 1.0067,
1889
+ "mean_token_accuracy": 0.7572938621044158,
1890
+ "num_tokens": 2304802.0,
1891
+ "step": 2090
1892
+ },
1893
+ {
1894
+ "epoch": 0.4231311706629055,
1895
+ "grad_norm": 12.3125,
1896
+ "learning_rate": 1.7180468802471623e-05,
1897
+ "loss": 0.9289,
1898
+ "mean_token_accuracy": 0.7806627154350281,
1899
+ "num_tokens": 2315105.0,
1900
+ "step": 2100
1901
+ },
1902
+ {
1903
+ "epoch": 0.42514608099939555,
1904
+ "grad_norm": 12.9375,
1905
+ "learning_rate": 1.7167036066895025e-05,
1906
+ "loss": 0.8419,
1907
+ "mean_token_accuracy": 0.7955414175987243,
1908
+ "num_tokens": 2326101.0,
1909
+ "step": 2110
1910
+ },
1911
+ {
1912
+ "epoch": 0.42716099133588553,
1913
+ "grad_norm": 11.1875,
1914
+ "learning_rate": 1.7153603331318424e-05,
1915
+ "loss": 0.9516,
1916
+ "mean_token_accuracy": 0.7715274512767791,
1917
+ "num_tokens": 2337205.0,
1918
+ "step": 2120
1919
+ },
1920
+ {
1921
+ "epoch": 0.42917590167237557,
1922
+ "grad_norm": 11.8125,
1923
+ "learning_rate": 1.7140170595741826e-05,
1924
+ "loss": 0.8855,
1925
+ "mean_token_accuracy": 0.7864199817180634,
1926
+ "num_tokens": 2349394.0,
1927
+ "step": 2130
1928
+ },
1929
+ {
1930
+ "epoch": 0.4311908120088656,
1931
+ "grad_norm": 11.1875,
1932
+ "learning_rate": 1.7126737860165222e-05,
1933
+ "loss": 0.9328,
1934
+ "mean_token_accuracy": 0.7836083233356476,
1935
+ "num_tokens": 2360556.0,
1936
+ "step": 2140
1937
+ },
1938
+ {
1939
+ "epoch": 0.43320572234535565,
1940
+ "grad_norm": 11.4375,
1941
+ "learning_rate": 1.7113305124588624e-05,
1942
+ "loss": 0.9885,
1943
+ "mean_token_accuracy": 0.7649979829788208,
1944
+ "num_tokens": 2371734.0,
1945
+ "step": 2150
1946
+ },
1947
+ {
1948
+ "epoch": 0.43522063268184563,
1949
+ "grad_norm": 12.8125,
1950
+ "learning_rate": 1.7099872389012023e-05,
1951
+ "loss": 0.9407,
1952
+ "mean_token_accuracy": 0.7679962277412414,
1953
+ "num_tokens": 2382510.0,
1954
+ "step": 2160
1955
+ },
1956
+ {
1957
+ "epoch": 0.43723554301833567,
1958
+ "grad_norm": 12.625,
1959
+ "learning_rate": 1.708643965343542e-05,
1960
+ "loss": 0.9365,
1961
+ "mean_token_accuracy": 0.7730210840702056,
1962
+ "num_tokens": 2395814.0,
1963
+ "step": 2170
1964
+ },
1965
+ {
1966
+ "epoch": 0.4392504533548257,
1967
+ "grad_norm": 14.4375,
1968
+ "learning_rate": 1.7073006917858824e-05,
1969
+ "loss": 0.9407,
1970
+ "mean_token_accuracy": 0.773440134525299,
1971
+ "num_tokens": 2407124.0,
1972
+ "step": 2180
1973
+ },
1974
+ {
1975
+ "epoch": 0.44126536369131575,
1976
+ "grad_norm": 11.375,
1977
+ "learning_rate": 1.7059574182282223e-05,
1978
+ "loss": 0.9601,
1979
+ "mean_token_accuracy": 0.7624564170837402,
1980
+ "num_tokens": 2418793.0,
1981
+ "step": 2190
1982
+ },
1983
+ {
1984
+ "epoch": 0.4432802740278058,
1985
+ "grad_norm": 10.4375,
1986
+ "learning_rate": 1.7046141446705625e-05,
1987
+ "loss": 0.8615,
1988
+ "mean_token_accuracy": 0.7828535497188568,
1989
+ "num_tokens": 2430231.0,
1990
+ "step": 2200
1991
+ },
1992
+ {
1993
+ "epoch": 0.44529518436429577,
1994
+ "grad_norm": 13.0625,
1995
+ "learning_rate": 1.7032708711129024e-05,
1996
+ "loss": 0.994,
1997
+ "mean_token_accuracy": 0.7661273539066314,
1998
+ "num_tokens": 2441407.0,
1999
+ "step": 2210
2000
+ },
2001
+ {
2002
+ "epoch": 0.4473100947007858,
2003
+ "grad_norm": 14.4375,
2004
+ "learning_rate": 1.7019275975552423e-05,
2005
+ "loss": 0.975,
2006
+ "mean_token_accuracy": 0.7622927308082581,
2007
+ "num_tokens": 2453109.0,
2008
+ "step": 2220
2009
+ },
2010
+ {
2011
+ "epoch": 0.44932500503727585,
2012
+ "grad_norm": 8.875,
2013
+ "learning_rate": 1.7005843239975825e-05,
2014
+ "loss": 0.9016,
2015
+ "mean_token_accuracy": 0.7804525554180145,
2016
+ "num_tokens": 2463823.0,
2017
+ "step": 2230
2018
+ },
2019
+ {
2020
+ "epoch": 0.4513399153737659,
2021
+ "grad_norm": 13.8125,
2022
+ "learning_rate": 1.699241050439922e-05,
2023
+ "loss": 0.8886,
2024
+ "mean_token_accuracy": 0.7791055798530578,
2025
+ "num_tokens": 2474062.0,
2026
+ "step": 2240
2027
+ },
2028
+ {
2029
+ "epoch": 0.45335482571025587,
2030
+ "grad_norm": 12.5625,
2031
+ "learning_rate": 1.6978977768822623e-05,
2032
+ "loss": 0.8376,
2033
+ "mean_token_accuracy": 0.791073453426361,
2034
+ "num_tokens": 2485012.0,
2035
+ "step": 2250
2036
+ },
2037
+ {
2038
+ "epoch": 0.4553697360467459,
2039
+ "grad_norm": 12.4375,
2040
+ "learning_rate": 1.696554503324602e-05,
2041
+ "loss": 0.9526,
2042
+ "mean_token_accuracy": 0.7676692366600036,
2043
+ "num_tokens": 2497094.0,
2044
+ "step": 2260
2045
+ },
2046
+ {
2047
+ "epoch": 0.45738464638323595,
2048
+ "grad_norm": 10.75,
2049
+ "learning_rate": 1.695211229766942e-05,
2050
+ "loss": 1.0348,
2051
+ "mean_token_accuracy": 0.7595704078674317,
2052
+ "num_tokens": 2509251.0,
2053
+ "step": 2270
2054
+ },
2055
+ {
2056
+ "epoch": 0.459399556719726,
2057
+ "grad_norm": 11.375,
2058
+ "learning_rate": 1.6938679562092822e-05,
2059
+ "loss": 0.8975,
2060
+ "mean_token_accuracy": 0.7786314010620117,
2061
+ "num_tokens": 2519167.0,
2062
+ "step": 2280
2063
+ },
2064
+ {
2065
+ "epoch": 0.46141446705621597,
2066
+ "grad_norm": 10.5,
2067
+ "learning_rate": 1.692524682651622e-05,
2068
+ "loss": 0.931,
2069
+ "mean_token_accuracy": 0.780303293466568,
2070
+ "num_tokens": 2530095.0,
2071
+ "step": 2290
2072
+ },
2073
+ {
2074
+ "epoch": 0.463429377392706,
2075
+ "grad_norm": 10.0,
2076
+ "learning_rate": 1.6911814090939623e-05,
2077
+ "loss": 0.9055,
2078
+ "mean_token_accuracy": 0.7792095363140106,
2079
+ "num_tokens": 2542753.0,
2080
+ "step": 2300
2081
+ },
2082
+ {
2083
+ "epoch": 0.46544428772919605,
2084
+ "grad_norm": 12.625,
2085
+ "learning_rate": 1.689838135536302e-05,
2086
+ "loss": 0.8634,
2087
+ "mean_token_accuracy": 0.7921158850193024,
2088
+ "num_tokens": 2553761.0,
2089
+ "step": 2310
2090
+ },
2091
+ {
2092
+ "epoch": 0.4674591980656861,
2093
+ "grad_norm": 10.75,
2094
+ "learning_rate": 1.688494861978642e-05,
2095
+ "loss": 0.8504,
2096
+ "mean_token_accuracy": 0.7890695691108703,
2097
+ "num_tokens": 2564639.0,
2098
+ "step": 2320
2099
+ },
2100
+ {
2101
+ "epoch": 0.4694741084021761,
2102
+ "grad_norm": 13.4375,
2103
+ "learning_rate": 1.687151588420982e-05,
2104
+ "loss": 0.9293,
2105
+ "mean_token_accuracy": 0.7696837067604065,
2106
+ "num_tokens": 2576449.0,
2107
+ "step": 2330
2108
+ },
2109
+ {
2110
+ "epoch": 0.4714890187386661,
2111
+ "grad_norm": 12.375,
2112
+ "learning_rate": 1.685808314863322e-05,
2113
+ "loss": 0.867,
2114
+ "mean_token_accuracy": 0.7936202645301819,
2115
+ "num_tokens": 2588236.0,
2116
+ "step": 2340
2117
+ },
2118
+ {
2119
+ "epoch": 0.47350392907515615,
2120
+ "grad_norm": 12.875,
2121
+ "learning_rate": 1.684465041305662e-05,
2122
+ "loss": 0.8259,
2123
+ "mean_token_accuracy": 0.7961400330066681,
2124
+ "num_tokens": 2599720.0,
2125
+ "step": 2350
2126
+ },
2127
+ {
2128
+ "epoch": 0.4755188394116462,
2129
+ "grad_norm": 12.875,
2130
+ "learning_rate": 1.683121767748002e-05,
2131
+ "loss": 0.9056,
2132
+ "mean_token_accuracy": 0.780539608001709,
2133
+ "num_tokens": 2609396.0,
2134
+ "step": 2360
2135
+ },
2136
+ {
2137
+ "epoch": 0.4775337497481362,
2138
+ "grad_norm": 11.625,
2139
+ "learning_rate": 1.681778494190342e-05,
2140
+ "loss": 0.9019,
2141
+ "mean_token_accuracy": 0.7818064391613007,
2142
+ "num_tokens": 2621392.0,
2143
+ "step": 2370
2144
+ },
2145
+ {
2146
+ "epoch": 0.4795486600846262,
2147
+ "grad_norm": 10.75,
2148
+ "learning_rate": 1.680435220632682e-05,
2149
+ "loss": 0.7993,
2150
+ "mean_token_accuracy": 0.8025827884674073,
2151
+ "num_tokens": 2633038.0,
2152
+ "step": 2380
2153
+ },
2154
+ {
2155
+ "epoch": 0.48156357042111625,
2156
+ "grad_norm": 11.4375,
2157
+ "learning_rate": 1.679091947075022e-05,
2158
+ "loss": 0.9316,
2159
+ "mean_token_accuracy": 0.7733452200889588,
2160
+ "num_tokens": 2644078.0,
2161
+ "step": 2390
2162
+ },
2163
+ {
2164
+ "epoch": 0.4835784807576063,
2165
+ "grad_norm": 9.5625,
2166
+ "learning_rate": 1.6777486735173622e-05,
2167
+ "loss": 0.8044,
2168
+ "mean_token_accuracy": 0.8011213660240173,
2169
+ "num_tokens": 2655374.0,
2170
+ "step": 2400
2171
+ },
2172
+ {
2173
+ "epoch": 0.4855933910940963,
2174
+ "grad_norm": 11.8125,
2175
+ "learning_rate": 1.6764053999597017e-05,
2176
+ "loss": 0.8751,
2177
+ "mean_token_accuracy": 0.7866755127906799,
2178
+ "num_tokens": 2665838.0,
2179
+ "step": 2410
2180
+ },
2181
+ {
2182
+ "epoch": 0.48760830143058637,
2183
+ "grad_norm": 10.125,
2184
+ "learning_rate": 1.675062126402042e-05,
2185
+ "loss": 0.827,
2186
+ "mean_token_accuracy": 0.7927514970302582,
2187
+ "num_tokens": 2675934.0,
2188
+ "step": 2420
2189
+ },
2190
+ {
2191
+ "epoch": 0.48962321176707635,
2192
+ "grad_norm": 12.8125,
2193
+ "learning_rate": 1.673718852844382e-05,
2194
+ "loss": 0.9346,
2195
+ "mean_token_accuracy": 0.7792349219322204,
2196
+ "num_tokens": 2687584.0,
2197
+ "step": 2430
2198
+ },
2199
+ {
2200
+ "epoch": 0.4916381221035664,
2201
+ "grad_norm": 11.875,
2202
+ "learning_rate": 1.6723755792867217e-05,
2203
+ "loss": 0.8867,
2204
+ "mean_token_accuracy": 0.7851879954338074,
2205
+ "num_tokens": 2697927.0,
2206
+ "step": 2440
2207
+ },
2208
+ {
2209
+ "epoch": 0.4936530324400564,
2210
+ "grad_norm": 11.75,
2211
+ "learning_rate": 1.671032305729062e-05,
2212
+ "loss": 0.8585,
2213
+ "mean_token_accuracy": 0.7973346531391143,
2214
+ "num_tokens": 2708092.0,
2215
+ "step": 2450
2216
+ },
2217
+ {
2218
+ "epoch": 0.49566794277654647,
2219
+ "grad_norm": 14.25,
2220
+ "learning_rate": 1.669689032171402e-05,
2221
+ "loss": 0.9075,
2222
+ "mean_token_accuracy": 0.7788807570934295,
2223
+ "num_tokens": 2719293.0,
2224
+ "step": 2460
2225
+ },
2226
+ {
2227
+ "epoch": 0.49768285311303645,
2228
+ "grad_norm": 9.875,
2229
+ "learning_rate": 1.668345758613742e-05,
2230
+ "loss": 0.8931,
2231
+ "mean_token_accuracy": 0.7861545145511627,
2232
+ "num_tokens": 2730531.0,
2233
+ "step": 2470
2234
+ },
2235
+ {
2236
+ "epoch": 0.4996977634495265,
2237
+ "grad_norm": 10.4375,
2238
+ "learning_rate": 1.6670024850560816e-05,
2239
+ "loss": 0.9438,
2240
+ "mean_token_accuracy": 0.7664293229579926,
2241
+ "num_tokens": 2741732.0,
2242
+ "step": 2480
2243
+ },
2244
+ {
2245
+ "epoch": 0.5017126737860165,
2246
+ "grad_norm": 11.125,
2247
+ "learning_rate": 1.6656592114984218e-05,
2248
+ "loss": 0.8719,
2249
+ "mean_token_accuracy": 0.7909434497356415,
2250
+ "num_tokens": 2753005.0,
2251
+ "step": 2490
2252
+ },
2253
+ {
2254
+ "epoch": 0.5037275841225065,
2255
+ "grad_norm": 11.25,
2256
+ "learning_rate": 1.6643159379407617e-05,
2257
+ "loss": 0.8739,
2258
+ "mean_token_accuracy": 0.7839685261249543,
2259
+ "num_tokens": 2765568.0,
2260
+ "step": 2500
2261
  }
2262
  ],
2263
  "logging_steps": 10,
 
2277
  "attributes": {}
2278
  }
2279
  },
2280
+ "total_flos": 3352572806252544.0,
2281
  "train_batch_size": 8,
2282
  "trial_name": null,
2283
  "trial_params": null