Azrail commited on
Commit
43dcfec
·
verified ·
1 Parent(s): 2529d88

Training in progress, step 11000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:593df4add94d8349a8e2c27dd6a4c8e410dc62c59535de38e2c844bae1bf9105
3
  size 150625560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd378ab1a42d536af5db20740b7c6ba4c863b9ff3eeb07dfe7d4b811a689ab5f
3
  size 150625560
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7ca220deb73713912b17a381232ea629f59c26aebf972823900e92efe4bee200
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d5f198053dbdfaa8c376a0fdaef1cec44750b494d68cc275559bc743db6f9c6
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5148f4a0429b56039088b4393cfcab680c3af25b037593fe69f3727d64615009
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe27678952c245c0bb175fc5ebd37cf8ebfcd743a407e4957181be4dbbc6146b
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d15ebff9b6275f35ed91d179fc6aa0df6144af185e5ca68cd213907d032111d8
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c68adf80be0bee4802e3498e2b20587f8f5db858b7307e722582d5bdeff1cda7
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.413778535540233,
6
  "eval_steps": 500,
7
- "global_step": 10000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2188,11 +2188,229 @@
2188
  "eval_steps_per_second": 20.488,
2189
  "num_input_tokens_seen": 4830743425,
2190
  "step": 10000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2191
  }
2192
  ],
2193
  "logging_steps": 50,
2194
  "max_steps": 16568,
2195
- "num_input_tokens_seen": 4830743425,
2196
  "num_train_epochs": 4,
2197
  "save_steps": 1000,
2198
  "stateful_callbacks": {
@@ -2207,7 +2425,7 @@
2207
  "attributes": {}
2208
  }
2209
  },
2210
- "total_flos": 1.292271014243328e+18,
2211
  "train_batch_size": 16,
2212
  "trial_name": null,
2213
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.6551895831132972,
6
  "eval_steps": 500,
7
+ "global_step": 11000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2188
  "eval_steps_per_second": 20.488,
2189
  "num_input_tokens_seen": 4830743425,
2190
  "step": 10000
2191
+ },
2192
+ {
2193
+ "epoch": 2.425849087918886,
2194
+ "grad_norm": 0.263671875,
2195
+ "learning_rate": 2.4588803380111665e-05,
2196
+ "loss": 2.102,
2197
+ "mean_token_accuracy": 0.5541372266598046,
2198
+ "num_input_tokens_seen": 4855099809,
2199
+ "num_tokens": 2046084283.0,
2200
+ "step": 10050
2201
+ },
2202
+ {
2203
+ "epoch": 2.437919640297539,
2204
+ "grad_norm": 0.26171875,
2205
+ "learning_rate": 2.4400181077410594e-05,
2206
+ "loss": 2.0991,
2207
+ "mean_token_accuracy": 0.5542299181595445,
2208
+ "num_input_tokens_seen": 4879214129,
2209
+ "num_tokens": 2056326330.0,
2210
+ "step": 10100
2211
+ },
2212
+ {
2213
+ "epoch": 2.4499901926761924,
2214
+ "grad_norm": 0.25390625,
2215
+ "learning_rate": 2.4211558774709522e-05,
2216
+ "loss": 2.0834,
2217
+ "mean_token_accuracy": 0.5564426334574819,
2218
+ "num_input_tokens_seen": 4903399553,
2219
+ "num_tokens": 2066490013.0,
2220
+ "step": 10150
2221
+ },
2222
+ {
2223
+ "epoch": 2.4620607450548455,
2224
+ "grad_norm": 0.263671875,
2225
+ "learning_rate": 2.402293647200845e-05,
2226
+ "loss": 2.098,
2227
+ "mean_token_accuracy": 0.5545364746823906,
2228
+ "num_input_tokens_seen": 4927492609,
2229
+ "num_tokens": 2076526539.0,
2230
+ "step": 10200
2231
+ },
2232
+ {
2233
+ "epoch": 2.474131297433499,
2234
+ "grad_norm": 0.23828125,
2235
+ "learning_rate": 2.383431416930738e-05,
2236
+ "loss": 2.0885,
2237
+ "mean_token_accuracy": 0.555601441822946,
2238
+ "num_input_tokens_seen": 4951732929,
2239
+ "num_tokens": 2086768431.0,
2240
+ "step": 10250
2241
+ },
2242
+ {
2243
+ "epoch": 2.486201849812152,
2244
+ "grad_norm": 0.255859375,
2245
+ "learning_rate": 2.3645691866606308e-05,
2246
+ "loss": 2.0909,
2247
+ "mean_token_accuracy": 0.5558399046584964,
2248
+ "num_input_tokens_seen": 4975948097,
2249
+ "num_tokens": 2096961030.0,
2250
+ "step": 10300
2251
+ },
2252
+ {
2253
+ "epoch": 2.498272402190805,
2254
+ "grad_norm": 0.326171875,
2255
+ "learning_rate": 2.3457069563905237e-05,
2256
+ "loss": 2.0906,
2257
+ "mean_token_accuracy": 0.5556136939302087,
2258
+ "num_input_tokens_seen": 5000143905,
2259
+ "num_tokens": 2107303887.0,
2260
+ "step": 10350
2261
+ },
2262
+ {
2263
+ "epoch": 2.5103429545694587,
2264
+ "grad_norm": 0.267578125,
2265
+ "learning_rate": 2.3268447261204166e-05,
2266
+ "loss": 2.0976,
2267
+ "mean_token_accuracy": 0.5541230865567922,
2268
+ "num_input_tokens_seen": 5024212113,
2269
+ "num_tokens": 2117576166.0,
2270
+ "step": 10400
2271
+ },
2272
+ {
2273
+ "epoch": 2.5224135069481117,
2274
+ "grad_norm": 0.29296875,
2275
+ "learning_rate": 2.3079824958503094e-05,
2276
+ "loss": 2.0935,
2277
+ "mean_token_accuracy": 0.5555445018038153,
2278
+ "num_input_tokens_seen": 5048313681,
2279
+ "num_tokens": 2127734721.0,
2280
+ "step": 10450
2281
+ },
2282
+ {
2283
+ "epoch": 2.534484059326765,
2284
+ "grad_norm": 0.2421875,
2285
+ "learning_rate": 2.2891202655802023e-05,
2286
+ "loss": 2.0982,
2287
+ "num_input_tokens_seen": 5072508817,
2288
+ "step": 10500
2289
+ },
2290
+ {
2291
+ "epoch": 2.534484059326765,
2292
+ "eval_loss": 1.9683516025543213,
2293
+ "eval_mean_token_accuracy": 0.5784807712440619,
2294
+ "eval_num_tokens": 2137987548.0,
2295
+ "eval_runtime": 130.4075,
2296
+ "eval_samples_per_second": 82.143,
2297
+ "eval_steps_per_second": 20.536,
2298
+ "num_input_tokens_seen": 5072508817,
2299
+ "step": 10500
2300
+ },
2301
+ {
2302
+ "epoch": 2.5465546117054183,
2303
+ "grad_norm": 0.267578125,
2304
+ "learning_rate": 2.270258035310095e-05,
2305
+ "loss": 2.0924,
2306
+ "mean_token_accuracy": 0.5551841219887137,
2307
+ "num_input_tokens_seen": 5096586577,
2308
+ "num_tokens": 2148155987.0,
2309
+ "step": 10550
2310
+ },
2311
+ {
2312
+ "epoch": 2.5586251640840714,
2313
+ "grad_norm": 0.2734375,
2314
+ "learning_rate": 2.251395805039988e-05,
2315
+ "loss": 2.0982,
2316
+ "mean_token_accuracy": 0.5541262343525887,
2317
+ "num_input_tokens_seen": 5120875729,
2318
+ "num_tokens": 2158352820.0,
2319
+ "step": 10600
2320
+ },
2321
+ {
2322
+ "epoch": 2.5706957164627244,
2323
+ "grad_norm": 0.251953125,
2324
+ "learning_rate": 2.232533574769881e-05,
2325
+ "loss": 2.0908,
2326
+ "mean_token_accuracy": 0.5560182608664036,
2327
+ "num_input_tokens_seen": 5145050353,
2328
+ "num_tokens": 2168407807.0,
2329
+ "step": 10650
2330
+ },
2331
+ {
2332
+ "epoch": 2.582766268841378,
2333
+ "grad_norm": 0.2734375,
2334
+ "learning_rate": 2.2136713444997737e-05,
2335
+ "loss": 2.0958,
2336
+ "mean_token_accuracy": 0.5551287305355072,
2337
+ "num_input_tokens_seen": 5169266849,
2338
+ "num_tokens": 2178592858.0,
2339
+ "step": 10700
2340
+ },
2341
+ {
2342
+ "epoch": 2.594836821220031,
2343
+ "grad_norm": 0.2451171875,
2344
+ "learning_rate": 2.1948091142296666e-05,
2345
+ "loss": 2.0904,
2346
+ "mean_token_accuracy": 0.5559819753468037,
2347
+ "num_input_tokens_seen": 5193472705,
2348
+ "num_tokens": 2188792925.0,
2349
+ "step": 10750
2350
+ },
2351
+ {
2352
+ "epoch": 2.606907373598684,
2353
+ "grad_norm": 0.2578125,
2354
+ "learning_rate": 2.1759468839595595e-05,
2355
+ "loss": 2.1003,
2356
+ "mean_token_accuracy": 0.5538398388028145,
2357
+ "num_input_tokens_seen": 5217541665,
2358
+ "num_tokens": 2199063266.0,
2359
+ "step": 10800
2360
+ },
2361
+ {
2362
+ "epoch": 2.6189779259773376,
2363
+ "grad_norm": 0.2578125,
2364
+ "learning_rate": 2.1570846536894523e-05,
2365
+ "loss": 2.0996,
2366
+ "mean_token_accuracy": 0.5539121518284083,
2367
+ "num_input_tokens_seen": 5241669153,
2368
+ "num_tokens": 2209236507.0,
2369
+ "step": 10850
2370
+ },
2371
+ {
2372
+ "epoch": 2.6310484783559906,
2373
+ "grad_norm": 0.2412109375,
2374
+ "learning_rate": 2.1382224234193452e-05,
2375
+ "loss": 2.0898,
2376
+ "mean_token_accuracy": 0.5560731103271246,
2377
+ "num_input_tokens_seen": 5265851553,
2378
+ "num_tokens": 2219375503.0,
2379
+ "step": 10900
2380
+ },
2381
+ {
2382
+ "epoch": 2.643119030734644,
2383
+ "grad_norm": 0.255859375,
2384
+ "learning_rate": 2.119360193149238e-05,
2385
+ "loss": 2.0887,
2386
+ "mean_token_accuracy": 0.5559511515125632,
2387
+ "num_input_tokens_seen": 5290120305,
2388
+ "num_tokens": 2229631896.0,
2389
+ "step": 10950
2390
+ },
2391
+ {
2392
+ "epoch": 2.6551895831132972,
2393
+ "grad_norm": 0.267578125,
2394
+ "learning_rate": 2.100497962879131e-05,
2395
+ "loss": 2.0941,
2396
+ "num_input_tokens_seen": 5314253297,
2397
+ "step": 11000
2398
+ },
2399
+ {
2400
+ "epoch": 2.6551895831132972,
2401
+ "eval_loss": 1.9683243036270142,
2402
+ "eval_mean_token_accuracy": 0.5784822298106727,
2403
+ "eval_num_tokens": 2239778564.0,
2404
+ "eval_runtime": 131.1903,
2405
+ "eval_samples_per_second": 81.652,
2406
+ "eval_steps_per_second": 20.413,
2407
+ "num_input_tokens_seen": 5314253297,
2408
+ "step": 11000
2409
  }
2410
  ],
2411
  "logging_steps": 50,
2412
  "max_steps": 16568,
2413
+ "num_input_tokens_seen": 5314253297,
2414
  "num_train_epochs": 4,
2415
  "save_steps": 1000,
2416
  "stateful_callbacks": {
 
2425
  "attributes": {}
2426
  }
2427
  },
2428
+ "total_flos": 1.4216146240596787e+18,
2429
  "train_batch_size": 16,
2430
  "trial_name": null,
2431
  "trial_params": null