rovdetection commited on
Commit
f140112
·
verified ·
1 Parent(s): c236366

Training in progress, step 2500, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fdf2871a23de26395412fbb80cd5cfc6261483030011b659b66248a001490ba5
3
  size 9446744
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea03fbd5faff9829b79932a9492534fbbbe2845de9ce69e896c0b8d109c1a825
3
  size 9446744
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13d0d3ac532ad9924ef2b3bb9206e041a19d9bb2aae0a0f9b0e9fb94268b3e2f
3
  size 4879947
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ca4bfe766f5a9ce1a39e0d776749658d826fc560902b47178ff40c41d18b94a
3
  size 4879947
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:96666620a506272b19319944e27b166707266143df40b9e008c7e67e99eb3d33
3
  size 14917
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd840ceb0cbd2bc41560fadd05ab11cb9d3690eebf99ba42e453854e5f372ed8
3
  size 14917
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ad3ae1599c24410db8dc749055bc50d225b3704ca4ce296c6043ed130093cd3d
3
  size 14917
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e68bcbbf919727508b1f5613e7b10a32a3e07fdef6c3370ef48c8724f2e31e4
3
  size 14917
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4aa03f6e0cd07cf67ce1fbe3101d545f5771ef9148b9debf02b11cf6948da5c
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:861ce13e6ca091acee9a68ebfc5ca38479baf4b537c37b3949f071f77b81e9f0
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa56fa8fa334bce407f019356c2a989207ab5f10b19e9753e7cbc5ea11bcd4ec
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5bce3ff1203929d6808ab229d6e6d4d185a3da8ef87a3b682b0eec04e6bacf2
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 3.436707500537288,
6
  "eval_steps": 500,
7
- "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2008,6 +2008,506 @@
2008
  "mean_token_accuracy": 0.6531724959611893,
2009
  "num_tokens": 11896615.0,
2010
  "step": 2000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2011
  }
2012
  ],
2013
  "logging_steps": 10,
@@ -2027,7 +2527,7 @@
2027
  "attributes": {}
2028
  }
2029
  },
2030
- "total_flos": 9.772738986953933e+16,
2031
  "train_batch_size": 2,
2032
  "trial_name": null,
2033
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 4.295723189340211,
6
  "eval_steps": 500,
7
+ "global_step": 2500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2008
  "mean_token_accuracy": 0.6531724959611893,
2009
  "num_tokens": 11896615.0,
2010
  "step": 2000
2011
+ },
2012
+ {
2013
+ "entropy": 1.8952298507094383,
2014
+ "epoch": 3.453900709219858,
2015
+ "grad_norm": 0.7292787432670593,
2016
+ "learning_rate": 0.00011964,
2017
+ "loss": 1.9302806854248047,
2018
+ "mean_token_accuracy": 0.6462091594934464,
2019
+ "num_tokens": 11954949.0,
2020
+ "step": 2010
2021
+ },
2022
+ {
2023
+ "entropy": 1.8723753660917282,
2024
+ "epoch": 3.4710939179024285,
2025
+ "grad_norm": 0.730530858039856,
2026
+ "learning_rate": 0.00011923999999999999,
2027
+ "loss": 1.9216194152832031,
2028
+ "mean_token_accuracy": 0.6504904717206955,
2029
+ "num_tokens": 12013803.0,
2030
+ "step": 2020
2031
+ },
2032
+ {
2033
+ "entropy": 1.8673277243971824,
2034
+ "epoch": 3.488287126584999,
2035
+ "grad_norm": 0.7530126571655273,
2036
+ "learning_rate": 0.00011884,
2037
+ "loss": 1.968985366821289,
2038
+ "mean_token_accuracy": 0.646847129613161,
2039
+ "num_tokens": 12073284.0,
2040
+ "step": 2030
2041
+ },
2042
+ {
2043
+ "entropy": 1.8757897645235062,
2044
+ "epoch": 3.505480335267569,
2045
+ "grad_norm": 0.7031217813491821,
2046
+ "learning_rate": 0.00011844,
2047
+ "loss": 1.9071741104125977,
2048
+ "mean_token_accuracy": 0.6450003884732723,
2049
+ "num_tokens": 12126451.0,
2050
+ "step": 2040
2051
+ },
2052
+ {
2053
+ "entropy": 1.7986262783408165,
2054
+ "epoch": 3.5226735439501398,
2055
+ "grad_norm": 0.7223983407020569,
2056
+ "learning_rate": 0.00011804,
2057
+ "loss": 1.8450950622558593,
2058
+ "mean_token_accuracy": 0.6576410517096519,
2059
+ "num_tokens": 12183343.0,
2060
+ "step": 2050
2061
+ },
2062
+ {
2063
+ "entropy": 1.8884935915470122,
2064
+ "epoch": 3.53986675263271,
2065
+ "grad_norm": 0.7206518650054932,
2066
+ "learning_rate": 0.00011763999999999999,
2067
+ "loss": 1.9660964965820313,
2068
+ "mean_token_accuracy": 0.6422303304076195,
2069
+ "num_tokens": 12243607.0,
2070
+ "step": 2060
2071
+ },
2072
+ {
2073
+ "entropy": 1.8009026944637299,
2074
+ "epoch": 3.5570599613152805,
2075
+ "grad_norm": 0.7229637503623962,
2076
+ "learning_rate": 0.00011724000000000002,
2077
+ "loss": 1.851433563232422,
2078
+ "mean_token_accuracy": 0.6556052915751934,
2079
+ "num_tokens": 12304867.0,
2080
+ "step": 2070
2081
+ },
2082
+ {
2083
+ "entropy": 1.7949693977832795,
2084
+ "epoch": 3.574253169997851,
2085
+ "grad_norm": 0.6935518383979797,
2086
+ "learning_rate": 0.00011684000000000001,
2087
+ "loss": 1.8848058700561523,
2088
+ "mean_token_accuracy": 0.6580755174160003,
2089
+ "num_tokens": 12367633.0,
2090
+ "step": 2080
2091
+ },
2092
+ {
2093
+ "entropy": 1.8038981169462205,
2094
+ "epoch": 3.591446378680421,
2095
+ "grad_norm": 0.7003904581069946,
2096
+ "learning_rate": 0.00011644000000000002,
2097
+ "loss": 1.8867233276367188,
2098
+ "mean_token_accuracy": 0.655081395432353,
2099
+ "num_tokens": 12423928.0,
2100
+ "step": 2090
2101
+ },
2102
+ {
2103
+ "entropy": 1.850062020123005,
2104
+ "epoch": 3.6086395873629917,
2105
+ "grad_norm": 0.6852926015853882,
2106
+ "learning_rate": 0.00011604000000000002,
2107
+ "loss": 1.9325201034545898,
2108
+ "mean_token_accuracy": 0.6472255479544401,
2109
+ "num_tokens": 12479411.0,
2110
+ "step": 2100
2111
+ },
2112
+ {
2113
+ "entropy": 1.8294448778033257,
2114
+ "epoch": 3.625832796045562,
2115
+ "grad_norm": 0.7044693827629089,
2116
+ "learning_rate": 0.00011564000000000001,
2117
+ "loss": 1.8989273071289063,
2118
+ "mean_token_accuracy": 0.6499249216169118,
2119
+ "num_tokens": 12539175.0,
2120
+ "step": 2110
2121
+ },
2122
+ {
2123
+ "entropy": 1.8719267755746842,
2124
+ "epoch": 3.6430260047281324,
2125
+ "grad_norm": 0.7180586457252502,
2126
+ "learning_rate": 0.00011524000000000001,
2127
+ "loss": 1.925216293334961,
2128
+ "mean_token_accuracy": 0.648950444161892,
2129
+ "num_tokens": 12598337.0,
2130
+ "step": 2120
2131
+ },
2132
+ {
2133
+ "entropy": 1.88923449665308,
2134
+ "epoch": 3.660219213410703,
2135
+ "grad_norm": 0.7464597821235657,
2136
+ "learning_rate": 0.00011484000000000002,
2137
+ "loss": 1.990826416015625,
2138
+ "mean_token_accuracy": 0.6456409864127636,
2139
+ "num_tokens": 12656592.0,
2140
+ "step": 2130
2141
+ },
2142
+ {
2143
+ "entropy": 1.8126205861568452,
2144
+ "epoch": 3.677412422093273,
2145
+ "grad_norm": 0.7253774404525757,
2146
+ "learning_rate": 0.00011444000000000001,
2147
+ "loss": 1.9414216995239257,
2148
+ "mean_token_accuracy": 0.6552157323807478,
2149
+ "num_tokens": 12717791.0,
2150
+ "step": 2140
2151
+ },
2152
+ {
2153
+ "entropy": 1.8930377542972565,
2154
+ "epoch": 3.6946056307758437,
2155
+ "grad_norm": 0.7404170036315918,
2156
+ "learning_rate": 0.00011404000000000001,
2157
+ "loss": 1.9364784240722657,
2158
+ "mean_token_accuracy": 0.6434980578720569,
2159
+ "num_tokens": 12775445.0,
2160
+ "step": 2150
2161
+ },
2162
+ {
2163
+ "entropy": 1.7652419656515121,
2164
+ "epoch": 3.711798839458414,
2165
+ "grad_norm": 0.688732385635376,
2166
+ "learning_rate": 0.00011364000000000002,
2167
+ "loss": 1.7636165618896484,
2168
+ "mean_token_accuracy": 0.6639453627169132,
2169
+ "num_tokens": 12834599.0,
2170
+ "step": 2160
2171
+ },
2172
+ {
2173
+ "entropy": 1.7745767116546631,
2174
+ "epoch": 3.7289920481409844,
2175
+ "grad_norm": 0.7011992335319519,
2176
+ "learning_rate": 0.00011324000000000001,
2177
+ "loss": 1.8347841262817384,
2178
+ "mean_token_accuracy": 0.6586773280054331,
2179
+ "num_tokens": 12889887.0,
2180
+ "step": 2170
2181
+ },
2182
+ {
2183
+ "entropy": 1.7952505484223367,
2184
+ "epoch": 3.746185256823555,
2185
+ "grad_norm": 0.7646785378456116,
2186
+ "learning_rate": 0.00011284000000000001,
2187
+ "loss": 1.883163070678711,
2188
+ "mean_token_accuracy": 0.6589437790215016,
2189
+ "num_tokens": 12950286.0,
2190
+ "step": 2180
2191
+ },
2192
+ {
2193
+ "entropy": 1.8878965258598328,
2194
+ "epoch": 3.763378465506125,
2195
+ "grad_norm": 0.7722623944282532,
2196
+ "learning_rate": 0.00011244000000000001,
2197
+ "loss": 1.9674694061279296,
2198
+ "mean_token_accuracy": 0.6422343414276839,
2199
+ "num_tokens": 13011083.0,
2200
+ "step": 2190
2201
+ },
2202
+ {
2203
+ "entropy": 1.919720321893692,
2204
+ "epoch": 3.780571674188695,
2205
+ "grad_norm": 0.7656893134117126,
2206
+ "learning_rate": 0.00011204000000000002,
2207
+ "loss": 1.9919773101806642,
2208
+ "mean_token_accuracy": 0.6393908958882093,
2209
+ "num_tokens": 13069376.0,
2210
+ "step": 2200
2211
+ },
2212
+ {
2213
+ "entropy": 1.77825688123703,
2214
+ "epoch": 3.797764882871266,
2215
+ "grad_norm": 0.8324808478355408,
2216
+ "learning_rate": 0.00011164000000000001,
2217
+ "loss": 1.8173160552978516,
2218
+ "mean_token_accuracy": 0.659475727379322,
2219
+ "num_tokens": 13124851.0,
2220
+ "step": 2210
2221
+ },
2222
+ {
2223
+ "entropy": 1.8232837438583374,
2224
+ "epoch": 3.8149580915538364,
2225
+ "grad_norm": 0.741481363773346,
2226
+ "learning_rate": 0.00011124000000000001,
2227
+ "loss": 1.860748291015625,
2228
+ "mean_token_accuracy": 0.6524971850216389,
2229
+ "num_tokens": 13182576.0,
2230
+ "step": 2220
2231
+ },
2232
+ {
2233
+ "entropy": 1.8588940657675266,
2234
+ "epoch": 3.8321513002364065,
2235
+ "grad_norm": 0.7748705148696899,
2236
+ "learning_rate": 0.00011084000000000002,
2237
+ "loss": 1.9206954956054687,
2238
+ "mean_token_accuracy": 0.6516353718936443,
2239
+ "num_tokens": 13242703.0,
2240
+ "step": 2230
2241
+ },
2242
+ {
2243
+ "entropy": 1.823398308455944,
2244
+ "epoch": 3.849344508918977,
2245
+ "grad_norm": 0.6341049671173096,
2246
+ "learning_rate": 0.00011044,
2247
+ "loss": 1.8718917846679688,
2248
+ "mean_token_accuracy": 0.6596139155328273,
2249
+ "num_tokens": 13303181.0,
2250
+ "step": 2240
2251
+ },
2252
+ {
2253
+ "entropy": 1.8098929420113563,
2254
+ "epoch": 3.866537717601547,
2255
+ "grad_norm": 0.6672969460487366,
2256
+ "learning_rate": 0.00011004000000000001,
2257
+ "loss": 1.8999752044677733,
2258
+ "mean_token_accuracy": 0.6594760783016682,
2259
+ "num_tokens": 13364371.0,
2260
+ "step": 2250
2261
+ },
2262
+ {
2263
+ "entropy": 1.7795367375016213,
2264
+ "epoch": 3.8837309262841178,
2265
+ "grad_norm": 0.6343891024589539,
2266
+ "learning_rate": 0.00010964000000000001,
2267
+ "loss": 1.827276611328125,
2268
+ "mean_token_accuracy": 0.6668465688824654,
2269
+ "num_tokens": 13425450.0,
2270
+ "step": 2260
2271
+ },
2272
+ {
2273
+ "entropy": 1.8673226684331894,
2274
+ "epoch": 3.9009241349666883,
2275
+ "grad_norm": 0.7357877492904663,
2276
+ "learning_rate": 0.00010924,
2277
+ "loss": 1.9206443786621095,
2278
+ "mean_token_accuracy": 0.647479448094964,
2279
+ "num_tokens": 13485806.0,
2280
+ "step": 2270
2281
+ },
2282
+ {
2283
+ "entropy": 1.806484942883253,
2284
+ "epoch": 3.9181173436492585,
2285
+ "grad_norm": 0.7172144055366516,
2286
+ "learning_rate": 0.00010884000000000001,
2287
+ "loss": 1.8789045333862304,
2288
+ "mean_token_accuracy": 0.6594084780663252,
2289
+ "num_tokens": 13544934.0,
2290
+ "step": 2280
2291
+ },
2292
+ {
2293
+ "entropy": 1.7970930591225625,
2294
+ "epoch": 3.935310552331829,
2295
+ "grad_norm": 0.7578801512718201,
2296
+ "learning_rate": 0.00010844000000000001,
2297
+ "loss": 1.8405040740966796,
2298
+ "mean_token_accuracy": 0.6608923889696598,
2299
+ "num_tokens": 13606653.0,
2300
+ "step": 2290
2301
+ },
2302
+ {
2303
+ "entropy": 1.8469372361898422,
2304
+ "epoch": 3.952503761014399,
2305
+ "grad_norm": 0.7626324892044067,
2306
+ "learning_rate": 0.00010804,
2307
+ "loss": 1.8629837036132812,
2308
+ "mean_token_accuracy": 0.6560039456933737,
2309
+ "num_tokens": 13663938.0,
2310
+ "step": 2300
2311
+ },
2312
+ {
2313
+ "entropy": 1.836122378706932,
2314
+ "epoch": 3.9696969696969697,
2315
+ "grad_norm": 0.7074365615844727,
2316
+ "learning_rate": 0.00010764,
2317
+ "loss": 1.8942070007324219,
2318
+ "mean_token_accuracy": 0.647238065674901,
2319
+ "num_tokens": 13722549.0,
2320
+ "step": 2310
2321
+ },
2322
+ {
2323
+ "entropy": 1.821449062973261,
2324
+ "epoch": 3.9868901783795403,
2325
+ "grad_norm": 0.6956577301025391,
2326
+ "learning_rate": 0.00010724000000000001,
2327
+ "loss": 1.8947336196899414,
2328
+ "mean_token_accuracy": 0.6528103355318308,
2329
+ "num_tokens": 13785922.0,
2330
+ "step": 2320
2331
+ },
2332
+ {
2333
+ "entropy": 1.839719023023333,
2334
+ "epoch": 4.003438641736514,
2335
+ "grad_norm": 0.6865222454071045,
2336
+ "learning_rate": 0.00010684,
2337
+ "loss": 1.8803377151489258,
2338
+ "mean_token_accuracy": 0.6526942384707464,
2339
+ "num_tokens": 13844647.0,
2340
+ "step": 2330
2341
+ },
2342
+ {
2343
+ "entropy": 1.855065654218197,
2344
+ "epoch": 4.020631850419084,
2345
+ "grad_norm": 0.7424384355545044,
2346
+ "learning_rate": 0.00010644,
2347
+ "loss": 1.9461166381835937,
2348
+ "mean_token_accuracy": 0.6463506512343884,
2349
+ "num_tokens": 13904724.0,
2350
+ "step": 2340
2351
+ },
2352
+ {
2353
+ "entropy": 1.7508789122104644,
2354
+ "epoch": 4.037825059101655,
2355
+ "grad_norm": 0.6670609712600708,
2356
+ "learning_rate": 0.00010604000000000001,
2357
+ "loss": 1.781893539428711,
2358
+ "mean_token_accuracy": 0.6653038747608662,
2359
+ "num_tokens": 13963472.0,
2360
+ "step": 2350
2361
+ },
2362
+ {
2363
+ "entropy": 1.8165026590228082,
2364
+ "epoch": 4.0550182677842255,
2365
+ "grad_norm": 0.7823750376701355,
2366
+ "learning_rate": 0.00010564000000000001,
2367
+ "loss": 1.8847312927246094,
2368
+ "mean_token_accuracy": 0.6607359856367111,
2369
+ "num_tokens": 14019708.0,
2370
+ "step": 2360
2371
+ },
2372
+ {
2373
+ "entropy": 1.794335062801838,
2374
+ "epoch": 4.072211476466796,
2375
+ "grad_norm": 0.8262340426445007,
2376
+ "learning_rate": 0.00010524,
2377
+ "loss": 1.8576740264892577,
2378
+ "mean_token_accuracy": 0.6582343481481076,
2379
+ "num_tokens": 14076178.0,
2380
+ "step": 2370
2381
+ },
2382
+ {
2383
+ "entropy": 1.8828865155577659,
2384
+ "epoch": 4.089404685149366,
2385
+ "grad_norm": 0.784656822681427,
2386
+ "learning_rate": 0.00010484,
2387
+ "loss": 1.9146394729614258,
2388
+ "mean_token_accuracy": 0.6491621173918247,
2389
+ "num_tokens": 14133662.0,
2390
+ "step": 2380
2391
+ },
2392
+ {
2393
+ "entropy": 1.918326808512211,
2394
+ "epoch": 4.106597893831936,
2395
+ "grad_norm": 0.7571077346801758,
2396
+ "learning_rate": 0.00010444000000000001,
2397
+ "loss": 2.024713897705078,
2398
+ "mean_token_accuracy": 0.643079025298357,
2399
+ "num_tokens": 14196967.0,
2400
+ "step": 2390
2401
+ },
2402
+ {
2403
+ "entropy": 1.7909317679703236,
2404
+ "epoch": 4.123791102514507,
2405
+ "grad_norm": 0.7276471257209778,
2406
+ "learning_rate": 0.00010404,
2407
+ "loss": 1.845133399963379,
2408
+ "mean_token_accuracy": 0.6548417568206787,
2409
+ "num_tokens": 14256866.0,
2410
+ "step": 2400
2411
+ },
2412
+ {
2413
+ "entropy": 1.7750686906278133,
2414
+ "epoch": 4.140984311197077,
2415
+ "grad_norm": 0.668246328830719,
2416
+ "learning_rate": 0.00010364,
2417
+ "loss": 1.7945009231567384,
2418
+ "mean_token_accuracy": 0.6641525950282812,
2419
+ "num_tokens": 14318324.0,
2420
+ "step": 2410
2421
+ },
2422
+ {
2423
+ "entropy": 1.823828212916851,
2424
+ "epoch": 4.158177519879647,
2425
+ "grad_norm": 0.7596518993377686,
2426
+ "learning_rate": 0.00010324000000000001,
2427
+ "loss": 1.898871612548828,
2428
+ "mean_token_accuracy": 0.6519910141825676,
2429
+ "num_tokens": 14380775.0,
2430
+ "step": 2420
2431
+ },
2432
+ {
2433
+ "entropy": 1.7938876405358315,
2434
+ "epoch": 4.175370728562218,
2435
+ "grad_norm": 0.6834619641304016,
2436
+ "learning_rate": 0.00010284,
2437
+ "loss": 1.8518138885498048,
2438
+ "mean_token_accuracy": 0.6622516691684723,
2439
+ "num_tokens": 14440862.0,
2440
+ "step": 2430
2441
+ },
2442
+ {
2443
+ "entropy": 1.8744625180959702,
2444
+ "epoch": 4.192563937244788,
2445
+ "grad_norm": 0.8088146448135376,
2446
+ "learning_rate": 0.00010244,
2447
+ "loss": 1.9542848587036132,
2448
+ "mean_token_accuracy": 0.6499028638005256,
2449
+ "num_tokens": 14500841.0,
2450
+ "step": 2440
2451
+ },
2452
+ {
2453
+ "entropy": 1.8284114554524422,
2454
+ "epoch": 4.209757145927359,
2455
+ "grad_norm": 0.82193523645401,
2456
+ "learning_rate": 0.00010204,
2457
+ "loss": 1.9107404708862306,
2458
+ "mean_token_accuracy": 0.6551219135522842,
2459
+ "num_tokens": 14564257.0,
2460
+ "step": 2450
2461
+ },
2462
+ {
2463
+ "entropy": 1.8538024842739105,
2464
+ "epoch": 4.226950354609929,
2465
+ "grad_norm": 0.7263757586479187,
2466
+ "learning_rate": 0.00010164,
2467
+ "loss": 1.8713863372802735,
2468
+ "mean_token_accuracy": 0.6510257624089718,
2469
+ "num_tokens": 14623019.0,
2470
+ "step": 2460
2471
+ },
2472
+ {
2473
+ "entropy": 1.756752038002014,
2474
+ "epoch": 4.244143563292499,
2475
+ "grad_norm": 0.7334346175193787,
2476
+ "learning_rate": 0.00010124,
2477
+ "loss": 1.7855047225952148,
2478
+ "mean_token_accuracy": 0.6687729060649872,
2479
+ "num_tokens": 14682191.0,
2480
+ "step": 2470
2481
+ },
2482
+ {
2483
+ "entropy": 1.7032470375299453,
2484
+ "epoch": 4.26133677197507,
2485
+ "grad_norm": 0.7168938517570496,
2486
+ "learning_rate": 0.00010084,
2487
+ "loss": 1.7648530960083009,
2488
+ "mean_token_accuracy": 0.6696467150002718,
2489
+ "num_tokens": 14739840.0,
2490
+ "step": 2480
2491
+ },
2492
+ {
2493
+ "entropy": 1.7426577515900135,
2494
+ "epoch": 4.27852998065764,
2495
+ "grad_norm": 0.7091065645217896,
2496
+ "learning_rate": 0.00010044000000000001,
2497
+ "loss": 1.8180946350097655,
2498
+ "mean_token_accuracy": 0.6640235505998134,
2499
+ "num_tokens": 14798444.0,
2500
+ "step": 2490
2501
+ },
2502
+ {
2503
+ "entropy": 1.8743537411093711,
2504
+ "epoch": 4.295723189340211,
2505
+ "grad_norm": 0.6376718878746033,
2506
+ "learning_rate": 0.00010004,
2507
+ "loss": 1.9534942626953125,
2508
+ "mean_token_accuracy": 0.6467559643089771,
2509
+ "num_tokens": 14861262.0,
2510
+ "step": 2500
2511
  }
2512
  ],
2513
  "logging_steps": 10,
 
2527
  "attributes": {}
2528
  }
2529
  },
2530
+ "total_flos": 1.2204939073814528e+17,
2531
  "train_batch_size": 2,
2532
  "trial_name": null,
2533
  "trial_params": null