Whispful commited on
Commit
09a2bcc
·
verified ·
1 Parent(s): 04e5a42

Training in progress, step 300, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:55e66dc7851df4e6d5b66790d487a73539b81f014ab105f600fed18eb3819b45
3
  size 97307544
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3dd2c7fef6854dded71270b43f0e219e0e385e468e3866c850a76fffaf329d1
3
  size 97307544
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:61f5cfdf0aec3c48ce5b00fb5d8b7dd207d8287d71d7391c65c176cc8d7f31d0
3
  size 194840426
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb7650112bd10ab3e207424fb68c73e772429116566efa506ced1f9aa984ef52
3
  size 194840426
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2b0e6721fa2828608f2d5da29d605b6b413409f454fe733652314f9a7d7393eb
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc14897b4c8d12b2fd679afea6e33ff956fb43524baeeaef8422be9b77027753
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3249b9e1557f804ed2ad1ca4c42f835dae98f959cd1e774b1c1514a23b4e40e4
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a965b4f99c848f0b9008cc4d4102df26407e8276df19747f112091a2e2ab78f1
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 1.1015679836273193,
3
- "best_model_checkpoint": "miner_id_24/checkpoint-250",
4
- "epoch": 0.1450873425802333,
5
  "eval_steps": 25,
6
- "global_step": 250,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1845,6 +1845,372 @@
1845
  "eval_samples_per_second": 8.366,
1846
  "eval_steps_per_second": 2.844,
1847
  "step": 250
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1848
  }
1849
  ],
1850
  "logging_steps": 1,
@@ -1873,7 +2239,7 @@
1873
  "attributes": {}
1874
  }
1875
  },
1876
- "total_flos": 2.6201652461568e+17,
1877
  "train_batch_size": 3,
1878
  "trial_name": null,
1879
  "trial_params": null
 
1
  {
2
+ "best_metric": 1.07566499710083,
3
+ "best_model_checkpoint": "miner_id_24/checkpoint-300",
4
+ "epoch": 0.17410481109627995,
5
  "eval_steps": 25,
6
+ "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1845
  "eval_samples_per_second": 8.366,
1846
  "eval_steps_per_second": 2.844,
1847
  "step": 250
1848
+ },
1849
+ {
1850
+ "epoch": 0.14566769195055423,
1851
+ "grad_norm": 0.3264401853084564,
1852
+ "learning_rate": 1.7341257188154625e-05,
1853
+ "loss": 1.2756,
1854
+ "step": 251
1855
+ },
1856
+ {
1857
+ "epoch": 0.14624804132087516,
1858
+ "grad_norm": 0.3468761146068573,
1859
+ "learning_rate": 1.720347135357937e-05,
1860
+ "loss": 1.2133,
1861
+ "step": 252
1862
+ },
1863
+ {
1864
+ "epoch": 0.1468283906911961,
1865
+ "grad_norm": 0.41729360818862915,
1866
+ "learning_rate": 1.7065987398863007e-05,
1867
+ "loss": 1.3962,
1868
+ "step": 253
1869
+ },
1870
+ {
1871
+ "epoch": 0.14740874006151702,
1872
+ "grad_norm": 0.40159016847610474,
1873
+ "learning_rate": 1.6928813976854267e-05,
1874
+ "loss": 1.2083,
1875
+ "step": 254
1876
+ },
1877
+ {
1878
+ "epoch": 0.14798908943183797,
1879
+ "grad_norm": 0.43321022391319275,
1880
+ "learning_rate": 1.6791959720857855e-05,
1881
+ "loss": 1.2977,
1882
+ "step": 255
1883
+ },
1884
+ {
1885
+ "epoch": 0.1485694388021589,
1886
+ "grad_norm": 0.4180416464805603,
1887
+ "learning_rate": 1.665543324409106e-05,
1888
+ "loss": 1.1649,
1889
+ "step": 256
1890
+ },
1891
+ {
1892
+ "epoch": 0.14914978817247984,
1893
+ "grad_norm": 0.48152637481689453,
1894
+ "learning_rate": 1.651924313914168e-05,
1895
+ "loss": 1.2545,
1896
+ "step": 257
1897
+ },
1898
+ {
1899
+ "epoch": 0.14973013754280076,
1900
+ "grad_norm": 0.4317726194858551,
1901
+ "learning_rate": 1.6383397977427237e-05,
1902
+ "loss": 1.1986,
1903
+ "step": 258
1904
+ },
1905
+ {
1906
+ "epoch": 0.1503104869131217,
1907
+ "grad_norm": 0.47157958149909973,
1908
+ "learning_rate": 1.6247906308655506e-05,
1909
+ "loss": 1.3727,
1910
+ "step": 259
1911
+ },
1912
+ {
1913
+ "epoch": 0.15089083628344263,
1914
+ "grad_norm": 0.4798511862754822,
1915
+ "learning_rate": 1.611277666028641e-05,
1916
+ "loss": 1.2121,
1917
+ "step": 260
1918
+ },
1919
+ {
1920
+ "epoch": 0.15147118565376358,
1921
+ "grad_norm": 0.5214879512786865,
1922
+ "learning_rate": 1.5978017536995366e-05,
1923
+ "loss": 1.2163,
1924
+ "step": 261
1925
+ },
1926
+ {
1927
+ "epoch": 0.1520515350240845,
1928
+ "grad_norm": 0.5045842528343201,
1929
+ "learning_rate": 1.5843637420137964e-05,
1930
+ "loss": 1.2251,
1931
+ "step": 262
1932
+ },
1933
+ {
1934
+ "epoch": 0.15263188439440542,
1935
+ "grad_norm": 0.5405669212341309,
1936
+ "learning_rate": 1.5709644767216233e-05,
1937
+ "loss": 1.3358,
1938
+ "step": 263
1939
+ },
1940
+ {
1941
+ "epoch": 0.15321223376472637,
1942
+ "grad_norm": 0.51566481590271,
1943
+ "learning_rate": 1.557604801134631e-05,
1944
+ "loss": 1.2341,
1945
+ "step": 264
1946
+ },
1947
+ {
1948
+ "epoch": 0.1537925831350473,
1949
+ "grad_norm": 0.5638424158096313,
1950
+ "learning_rate": 1.54428555607277e-05,
1951
+ "loss": 1.1778,
1952
+ "step": 265
1953
+ },
1954
+ {
1955
+ "epoch": 0.15437293250536824,
1956
+ "grad_norm": 0.549906313419342,
1957
+ "learning_rate": 1.5310075798114107e-05,
1958
+ "loss": 1.213,
1959
+ "step": 266
1960
+ },
1961
+ {
1962
+ "epoch": 0.15495328187568916,
1963
+ "grad_norm": 0.5369941592216492,
1964
+ "learning_rate": 1.5177717080285796e-05,
1965
+ "loss": 1.1816,
1966
+ "step": 267
1967
+ },
1968
+ {
1969
+ "epoch": 0.1555336312460101,
1970
+ "grad_norm": 0.5978056788444519,
1971
+ "learning_rate": 1.5045787737523685e-05,
1972
+ "loss": 1.2219,
1973
+ "step": 268
1974
+ },
1975
+ {
1976
+ "epoch": 0.15611398061633103,
1977
+ "grad_norm": 0.5928854942321777,
1978
+ "learning_rate": 1.491429607308506e-05,
1979
+ "loss": 1.226,
1980
+ "step": 269
1981
+ },
1982
+ {
1983
+ "epoch": 0.15669432998665198,
1984
+ "grad_norm": 0.6016438603401184,
1985
+ "learning_rate": 1.478325036268096e-05,
1986
+ "loss": 1.2603,
1987
+ "step": 270
1988
+ },
1989
+ {
1990
+ "epoch": 0.1572746793569729,
1991
+ "grad_norm": 0.6282125115394592,
1992
+ "learning_rate": 1.4652658853955349e-05,
1993
+ "loss": 1.2888,
1994
+ "step": 271
1995
+ },
1996
+ {
1997
+ "epoch": 0.15785502872729384,
1998
+ "grad_norm": 0.6291938424110413,
1999
+ "learning_rate": 1.4522529765966048e-05,
2000
+ "loss": 1.1523,
2001
+ "step": 272
2002
+ },
2003
+ {
2004
+ "epoch": 0.15843537809761477,
2005
+ "grad_norm": 0.6878090500831604,
2006
+ "learning_rate": 1.4392871288667415e-05,
2007
+ "loss": 1.2104,
2008
+ "step": 273
2009
+ },
2010
+ {
2011
+ "epoch": 0.15901572746793569,
2012
+ "grad_norm": 0.7197153568267822,
2013
+ "learning_rate": 1.4263691582394912e-05,
2014
+ "loss": 1.2435,
2015
+ "step": 274
2016
+ },
2017
+ {
2018
+ "epoch": 0.15959607683825663,
2019
+ "grad_norm": 0.7492052912712097,
2020
+ "learning_rate": 1.4134998777351533e-05,
2021
+ "loss": 1.1018,
2022
+ "step": 275
2023
+ },
2024
+ {
2025
+ "epoch": 0.15959607683825663,
2026
+ "eval_loss": 1.087742567062378,
2027
+ "eval_runtime": 5.9755,
2028
+ "eval_samples_per_second": 8.368,
2029
+ "eval_steps_per_second": 2.845,
2030
+ "step": 275
2031
+ },
2032
+ {
2033
+ "epoch": 0.16017642620857755,
2034
+ "grad_norm": 0.7975369095802307,
2035
+ "learning_rate": 1.4006800973096066e-05,
2036
+ "loss": 1.124,
2037
+ "step": 276
2038
+ },
2039
+ {
2040
+ "epoch": 0.1607567755788985,
2041
+ "grad_norm": 0.8354909420013428,
2042
+ "learning_rate": 1.3879106238033392e-05,
2043
+ "loss": 1.2082,
2044
+ "step": 277
2045
+ },
2046
+ {
2047
+ "epoch": 0.16133712494921942,
2048
+ "grad_norm": 0.7988197803497314,
2049
+ "learning_rate": 1.3751922608906614e-05,
2050
+ "loss": 1.0521,
2051
+ "step": 278
2052
+ },
2053
+ {
2054
+ "epoch": 0.16191747431954037,
2055
+ "grad_norm": 1.0019845962524414,
2056
+ "learning_rate": 1.3625258090291312e-05,
2057
+ "loss": 1.2305,
2058
+ "step": 279
2059
+ },
2060
+ {
2061
+ "epoch": 0.1624978236898613,
2062
+ "grad_norm": 0.8843457102775574,
2063
+ "learning_rate": 1.3499120654091716e-05,
2064
+ "loss": 1.1107,
2065
+ "step": 280
2066
+ },
2067
+ {
2068
+ "epoch": 0.16307817306018224,
2069
+ "grad_norm": 1.0474600791931152,
2070
+ "learning_rate": 1.3373518239038985e-05,
2071
+ "loss": 0.8914,
2072
+ "step": 281
2073
+ },
2074
+ {
2075
+ "epoch": 0.16365852243050316,
2076
+ "grad_norm": 1.0813342332839966,
2077
+ "learning_rate": 1.3248458750191566e-05,
2078
+ "loss": 1.265,
2079
+ "step": 282
2080
+ },
2081
+ {
2082
+ "epoch": 0.1642388718008241,
2083
+ "grad_norm": 1.2061476707458496,
2084
+ "learning_rate": 1.3123950058437696e-05,
2085
+ "loss": 1.1564,
2086
+ "step": 283
2087
+ },
2088
+ {
2089
+ "epoch": 0.16481922117114503,
2090
+ "grad_norm": 1.3668428659439087,
2091
+ "learning_rate": 1.3000000000000006e-05,
2092
+ "loss": 0.9441,
2093
+ "step": 284
2094
+ },
2095
+ {
2096
+ "epoch": 0.16539957054146595,
2097
+ "grad_norm": 1.4264423847198486,
2098
+ "learning_rate": 1.2876616375942311e-05,
2099
+ "loss": 1.0423,
2100
+ "step": 285
2101
+ },
2102
+ {
2103
+ "epoch": 0.1659799199117869,
2104
+ "grad_norm": 1.2884799242019653,
2105
+ "learning_rate": 1.2753806951678694e-05,
2106
+ "loss": 1.0806,
2107
+ "step": 286
2108
+ },
2109
+ {
2110
+ "epoch": 0.16656026928210782,
2111
+ "grad_norm": 1.1869494915008545,
2112
+ "learning_rate": 1.2631579456484704e-05,
2113
+ "loss": 1.0565,
2114
+ "step": 287
2115
+ },
2116
+ {
2117
+ "epoch": 0.16714061865242877,
2118
+ "grad_norm": 1.4182007312774658,
2119
+ "learning_rate": 1.2509941583010959e-05,
2120
+ "loss": 0.9715,
2121
+ "step": 288
2122
+ },
2123
+ {
2124
+ "epoch": 0.1677209680227497,
2125
+ "grad_norm": 1.5443942546844482,
2126
+ "learning_rate": 1.2388900986798953e-05,
2127
+ "loss": 1.056,
2128
+ "step": 289
2129
+ },
2130
+ {
2131
+ "epoch": 0.16830131739307064,
2132
+ "grad_norm": 1.4688053131103516,
2133
+ "learning_rate": 1.226846528579925e-05,
2134
+ "loss": 1.064,
2135
+ "step": 290
2136
+ },
2137
+ {
2138
+ "epoch": 0.16888166676339156,
2139
+ "grad_norm": 1.9034485816955566,
2140
+ "learning_rate": 1.2148642059892022e-05,
2141
+ "loss": 1.4109,
2142
+ "step": 291
2143
+ },
2144
+ {
2145
+ "epoch": 0.1694620161337125,
2146
+ "grad_norm": 1.4713436365127563,
2147
+ "learning_rate": 1.2029438850410018e-05,
2148
+ "loss": 0.7642,
2149
+ "step": 292
2150
+ },
2151
+ {
2152
+ "epoch": 0.17004236550403343,
2153
+ "grad_norm": 1.6940679550170898,
2154
+ "learning_rate": 1.1910863159663908e-05,
2155
+ "loss": 1.0706,
2156
+ "step": 293
2157
+ },
2158
+ {
2159
+ "epoch": 0.17062271487435435,
2160
+ "grad_norm": 1.5762420892715454,
2161
+ "learning_rate": 1.179292245047013e-05,
2162
+ "loss": 0.8392,
2163
+ "step": 294
2164
+ },
2165
+ {
2166
+ "epoch": 0.1712030642446753,
2167
+ "grad_norm": 2.418851613998413,
2168
+ "learning_rate": 1.1675624145681177e-05,
2169
+ "loss": 0.8652,
2170
+ "step": 295
2171
+ },
2172
+ {
2173
+ "epoch": 0.17178341361499622,
2174
+ "grad_norm": 2.076580762863159,
2175
+ "learning_rate": 1.1558975627718437e-05,
2176
+ "loss": 1.0094,
2177
+ "step": 296
2178
+ },
2179
+ {
2180
+ "epoch": 0.17236376298531716,
2181
+ "grad_norm": 2.1403160095214844,
2182
+ "learning_rate": 1.1442984238107577e-05,
2183
+ "loss": 0.8598,
2184
+ "step": 297
2185
+ },
2186
+ {
2187
+ "epoch": 0.17294411235563809,
2188
+ "grad_norm": 2.948251962661743,
2189
+ "learning_rate": 1.1327657277016475e-05,
2190
+ "loss": 1.1042,
2191
+ "step": 298
2192
+ },
2193
+ {
2194
+ "epoch": 0.17352446172595903,
2195
+ "grad_norm": 4.150631904602051,
2196
+ "learning_rate": 1.1213002002795757e-05,
2197
+ "loss": 1.4053,
2198
+ "step": 299
2199
+ },
2200
+ {
2201
+ "epoch": 0.17410481109627995,
2202
+ "grad_norm": 6.424019813537598,
2203
+ "learning_rate": 1.1099025631522004e-05,
2204
+ "loss": 1.8729,
2205
+ "step": 300
2206
+ },
2207
+ {
2208
+ "epoch": 0.17410481109627995,
2209
+ "eval_loss": 1.07566499710083,
2210
+ "eval_runtime": 5.9749,
2211
+ "eval_samples_per_second": 8.368,
2212
+ "eval_steps_per_second": 2.845,
2213
+ "step": 300
2214
  }
2215
  ],
2216
  "logging_steps": 1,
 
2239
  "attributes": {}
2240
  }
2241
  },
2242
+ "total_flos": 3.14419829538816e+17,
2243
  "train_batch_size": 3,
2244
  "trial_name": null,
2245
  "trial_params": null