shulijia commited on
Commit
ea5744e
·
verified ·
1 Parent(s): a83bb35

Training in progress, step 3500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9954efb766a4618113b96a524d55f94d09d87a3792b59e5ed9bd7a12a9c3cf8e
3
  size 2384234968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29b32b64478cbdeda9875145bb265427fdfdcfc88b4dfa3e973d6646373e0a0f
3
  size 2384234968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:17b68b3ca976f7fd977408a7a53989ae17957719ab53b8d074cea81d998e9c4d
3
  size 4768663315
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cc9bb74074cb17bdb9685f131305e9e7ae2f53f82516e862de8e7a638f79006
3
  size 4768663315
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:de69a2834426ff9ef8199d077e00892579278af31d8969d77f98235b5cfc010a
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2534e434cd5abbb8f7668d3eab0549db0ef95d6a797a3efa86b712e8e32266a7
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ee2e9e1516affc6da40a027df47b003edcf1d9f7876c83392a1a532d8732f81
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1ccf15ab626b17b6464b860472b5e0620f2d570991113393ae691c84ea2b523
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.4509903504316912,
6
  "eval_steps": 100,
7
- "global_step": 2500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2258,6 +2258,906 @@
2258
  "mean_token_accuracy": 0.7767490215599537,
2259
  "num_tokens": 20478464.0,
2260
  "step": 2500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2261
  }
2262
  ],
2263
  "logging_steps": 10,
@@ -2277,7 +3177,7 @@
2277
  "attributes": {}
2278
  }
2279
  },
2280
- "total_flos": 5.41205816452055e+16,
2281
  "train_batch_size": 2,
2282
  "trial_name": null,
2283
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.0313429587172602,
6
  "eval_steps": 100,
7
+ "global_step": 3500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2258
  "mean_token_accuracy": 0.7767490215599537,
2259
  "num_tokens": 20478464.0,
2260
  "step": 2500
2261
+ },
2262
+ {
2263
+ "epoch": 1.4567946020459988,
2264
+ "grad_norm": 1.3819377422332764,
2265
+ "learning_rate": 5.717970765262253e-06,
2266
+ "loss": 0.0996,
2267
+ "mean_token_accuracy": 0.7780210342258215,
2268
+ "num_tokens": 20560384.0,
2269
+ "step": 2510
2270
+ },
2271
+ {
2272
+ "epoch": 1.4625988536603063,
2273
+ "grad_norm": 1.2591315507888794,
2274
+ "learning_rate": 5.696474634565779e-06,
2275
+ "loss": 0.0934,
2276
+ "mean_token_accuracy": 0.7870963752269745,
2277
+ "num_tokens": 20642304.0,
2278
+ "step": 2520
2279
+ },
2280
+ {
2281
+ "epoch": 1.4684031052746136,
2282
+ "grad_norm": 1.2373124361038208,
2283
+ "learning_rate": 5.674978503869304e-06,
2284
+ "loss": 0.09,
2285
+ "mean_token_accuracy": 0.7776785731315613,
2286
+ "num_tokens": 20724224.0,
2287
+ "step": 2530
2288
+ },
2289
+ {
2290
+ "epoch": 1.4742073568889211,
2291
+ "grad_norm": 1.255920648574829,
2292
+ "learning_rate": 5.653482373172829e-06,
2293
+ "loss": 0.1054,
2294
+ "mean_token_accuracy": 0.7527152694761753,
2295
+ "num_tokens": 20806144.0,
2296
+ "step": 2540
2297
+ },
2298
+ {
2299
+ "epoch": 1.4800116085032287,
2300
+ "grad_norm": 1.8799701929092407,
2301
+ "learning_rate": 5.631986242476355e-06,
2302
+ "loss": 0.0987,
2303
+ "mean_token_accuracy": 0.7734344460070133,
2304
+ "num_tokens": 20888064.0,
2305
+ "step": 2550
2306
+ },
2307
+ {
2308
+ "epoch": 1.485815860117536,
2309
+ "grad_norm": 1.6360814571380615,
2310
+ "learning_rate": 5.61049011177988e-06,
2311
+ "loss": 0.0818,
2312
+ "mean_token_accuracy": 0.7760029345750808,
2313
+ "num_tokens": 20969984.0,
2314
+ "step": 2560
2315
+ },
2316
+ {
2317
+ "epoch": 1.4916201117318435,
2318
+ "grad_norm": 1.651768445968628,
2319
+ "learning_rate": 5.588993981083406e-06,
2320
+ "loss": 0.0922,
2321
+ "mean_token_accuracy": 0.787769079208374,
2322
+ "num_tokens": 21051904.0,
2323
+ "step": 2570
2324
+ },
2325
+ {
2326
+ "epoch": 1.497424363346151,
2327
+ "grad_norm": 1.5460911989212036,
2328
+ "learning_rate": 5.567497850386931e-06,
2329
+ "loss": 0.1021,
2330
+ "mean_token_accuracy": 0.7730185888707638,
2331
+ "num_tokens": 21133824.0,
2332
+ "step": 2580
2333
+ },
2334
+ {
2335
+ "epoch": 1.5032286149604586,
2336
+ "grad_norm": 1.3855364322662354,
2337
+ "learning_rate": 5.546001719690456e-06,
2338
+ "loss": 0.1081,
2339
+ "mean_token_accuracy": 0.7509540095925331,
2340
+ "num_tokens": 21215744.0,
2341
+ "step": 2590
2342
+ },
2343
+ {
2344
+ "epoch": 1.509032866574766,
2345
+ "grad_norm": 1.7831671237945557,
2346
+ "learning_rate": 5.524505588993981e-06,
2347
+ "loss": 0.0976,
2348
+ "mean_token_accuracy": 0.7773605681955814,
2349
+ "num_tokens": 21297664.0,
2350
+ "step": 2600
2351
+ },
2352
+ {
2353
+ "epoch": 1.5148371181890736,
2354
+ "grad_norm": 1.4737831354141235,
2355
+ "learning_rate": 5.503009458297507e-06,
2356
+ "loss": 0.091,
2357
+ "mean_token_accuracy": 0.7802592940628529,
2358
+ "num_tokens": 21379584.0,
2359
+ "step": 2610
2360
+ },
2361
+ {
2362
+ "epoch": 1.520641369803381,
2363
+ "grad_norm": 1.8076776266098022,
2364
+ "learning_rate": 5.481513327601032e-06,
2365
+ "loss": 0.0919,
2366
+ "mean_token_accuracy": 0.7831947166472674,
2367
+ "num_tokens": 21461504.0,
2368
+ "step": 2620
2369
+ },
2370
+ {
2371
+ "epoch": 1.5264456214176885,
2372
+ "grad_norm": 1.5028278827667236,
2373
+ "learning_rate": 5.460017196904558e-06,
2374
+ "loss": 0.1015,
2375
+ "mean_token_accuracy": 0.7705234806984663,
2376
+ "num_tokens": 21543424.0,
2377
+ "step": 2630
2378
+ },
2379
+ {
2380
+ "epoch": 1.5322498730319958,
2381
+ "grad_norm": 1.859098196029663,
2382
+ "learning_rate": 5.4385210662080835e-06,
2383
+ "loss": 0.1083,
2384
+ "mean_token_accuracy": 0.7652274943888188,
2385
+ "num_tokens": 21625344.0,
2386
+ "step": 2640
2387
+ },
2388
+ {
2389
+ "epoch": 1.5380541246463033,
2390
+ "grad_norm": 2.0585951805114746,
2391
+ "learning_rate": 5.417024935511609e-06,
2392
+ "loss": 0.0912,
2393
+ "mean_token_accuracy": 0.7723703488707543,
2394
+ "num_tokens": 21707264.0,
2395
+ "step": 2650
2396
+ },
2397
+ {
2398
+ "epoch": 1.5438583762606108,
2399
+ "grad_norm": 1.7115544080734253,
2400
+ "learning_rate": 5.395528804815133e-06,
2401
+ "loss": 0.1011,
2402
+ "mean_token_accuracy": 0.7763698607683182,
2403
+ "num_tokens": 21789184.0,
2404
+ "step": 2660
2405
+ },
2406
+ {
2407
+ "epoch": 1.5496626278749184,
2408
+ "grad_norm": 1.1767747402191162,
2409
+ "learning_rate": 5.374032674118659e-06,
2410
+ "loss": 0.0948,
2411
+ "mean_token_accuracy": 0.7808953076601028,
2412
+ "num_tokens": 21871104.0,
2413
+ "step": 2670
2414
+ },
2415
+ {
2416
+ "epoch": 1.555466879489226,
2417
+ "grad_norm": 1.825332522392273,
2418
+ "learning_rate": 5.352536543422184e-06,
2419
+ "loss": 0.0841,
2420
+ "mean_token_accuracy": 0.7940068498253823,
2421
+ "num_tokens": 21953024.0,
2422
+ "step": 2680
2423
+ },
2424
+ {
2425
+ "epoch": 1.5612711311035334,
2426
+ "grad_norm": 1.7559678554534912,
2427
+ "learning_rate": 5.33104041272571e-06,
2428
+ "loss": 0.1111,
2429
+ "mean_token_accuracy": 0.7540117435157299,
2430
+ "num_tokens": 22034944.0,
2431
+ "step": 2690
2432
+ },
2433
+ {
2434
+ "epoch": 1.567075382717841,
2435
+ "grad_norm": 1.1683419942855835,
2436
+ "learning_rate": 5.309544282029235e-06,
2437
+ "loss": 0.0968,
2438
+ "mean_token_accuracy": 0.782081701233983,
2439
+ "num_tokens": 22116864.0,
2440
+ "step": 2700
2441
+ },
2442
+ {
2443
+ "epoch": 1.5728796343321483,
2444
+ "grad_norm": 1.3004035949707031,
2445
+ "learning_rate": 5.288048151332761e-06,
2446
+ "loss": 0.0986,
2447
+ "mean_token_accuracy": 0.7779843434691429,
2448
+ "num_tokens": 22198784.0,
2449
+ "step": 2710
2450
+ },
2451
+ {
2452
+ "epoch": 1.5786838859464558,
2453
+ "grad_norm": 1.7196249961853027,
2454
+ "learning_rate": 5.266552020636287e-06,
2455
+ "loss": 0.0918,
2456
+ "mean_token_accuracy": 0.7894324842840433,
2457
+ "num_tokens": 22280704.0,
2458
+ "step": 2720
2459
+ },
2460
+ {
2461
+ "epoch": 1.5844881375607631,
2462
+ "grad_norm": 1.1833781003952026,
2463
+ "learning_rate": 5.2450558899398105e-06,
2464
+ "loss": 0.1001,
2465
+ "mean_token_accuracy": 0.7578400194644928,
2466
+ "num_tokens": 22362624.0,
2467
+ "step": 2730
2468
+ },
2469
+ {
2470
+ "epoch": 1.5902923891750707,
2471
+ "grad_norm": 1.5964421033859253,
2472
+ "learning_rate": 5.223559759243336e-06,
2473
+ "loss": 0.0889,
2474
+ "mean_token_accuracy": 0.7843933455646038,
2475
+ "num_tokens": 22444544.0,
2476
+ "step": 2740
2477
+ },
2478
+ {
2479
+ "epoch": 1.5960966407893782,
2480
+ "grad_norm": 1.2686461210250854,
2481
+ "learning_rate": 5.202063628546862e-06,
2482
+ "loss": 0.1015,
2483
+ "mean_token_accuracy": 0.7669398248195648,
2484
+ "num_tokens": 22526464.0,
2485
+ "step": 2750
2486
+ },
2487
+ {
2488
+ "epoch": 1.6019008924036857,
2489
+ "grad_norm": 1.5164870023727417,
2490
+ "learning_rate": 5.180567497850387e-06,
2491
+ "loss": 0.0903,
2492
+ "mean_token_accuracy": 0.7708292577415705,
2493
+ "num_tokens": 22608384.0,
2494
+ "step": 2760
2495
+ },
2496
+ {
2497
+ "epoch": 1.6077051440179932,
2498
+ "grad_norm": 1.2695127725601196,
2499
+ "learning_rate": 5.159071367153913e-06,
2500
+ "loss": 0.0926,
2501
+ "mean_token_accuracy": 0.7750244583934546,
2502
+ "num_tokens": 22690304.0,
2503
+ "step": 2770
2504
+ },
2505
+ {
2506
+ "epoch": 1.6135093956323008,
2507
+ "grad_norm": 1.2489911317825317,
2508
+ "learning_rate": 5.1375752364574386e-06,
2509
+ "loss": 0.0918,
2510
+ "mean_token_accuracy": 0.7785225044935942,
2511
+ "num_tokens": 22772224.0,
2512
+ "step": 2780
2513
+ },
2514
+ {
2515
+ "epoch": 1.619313647246608,
2516
+ "grad_norm": 1.5641402006149292,
2517
+ "learning_rate": 5.116079105760964e-06,
2518
+ "loss": 0.1098,
2519
+ "mean_token_accuracy": 0.76334393247962,
2520
+ "num_tokens": 22854144.0,
2521
+ "step": 2790
2522
+ },
2523
+ {
2524
+ "epoch": 1.6251178988609156,
2525
+ "grad_norm": 1.3082184791564941,
2526
+ "learning_rate": 5.094582975064489e-06,
2527
+ "loss": 0.0995,
2528
+ "mean_token_accuracy": 0.7862891420722008,
2529
+ "num_tokens": 22936064.0,
2530
+ "step": 2800
2531
+ },
2532
+ {
2533
+ "epoch": 1.6309221504752232,
2534
+ "grad_norm": 1.783996820449829,
2535
+ "learning_rate": 5.073086844368014e-06,
2536
+ "loss": 0.0896,
2537
+ "mean_token_accuracy": 0.7687989220023155,
2538
+ "num_tokens": 23017984.0,
2539
+ "step": 2810
2540
+ },
2541
+ {
2542
+ "epoch": 1.6367264020895305,
2543
+ "grad_norm": 1.7711577415466309,
2544
+ "learning_rate": 5.051590713671539e-06,
2545
+ "loss": 0.1018,
2546
+ "mean_token_accuracy": 0.7766756389290095,
2547
+ "num_tokens": 23099904.0,
2548
+ "step": 2820
2549
+ },
2550
+ {
2551
+ "epoch": 1.642530653703838,
2552
+ "grad_norm": 1.7994273900985718,
2553
+ "learning_rate": 5.030094582975065e-06,
2554
+ "loss": 0.0824,
2555
+ "mean_token_accuracy": 0.7806262217462063,
2556
+ "num_tokens": 23181824.0,
2557
+ "step": 2830
2558
+ },
2559
+ {
2560
+ "epoch": 1.6483349053181455,
2561
+ "grad_norm": 1.2382689714431763,
2562
+ "learning_rate": 5.0085984522785905e-06,
2563
+ "loss": 0.0908,
2564
+ "mean_token_accuracy": 0.7679672215133906,
2565
+ "num_tokens": 23263744.0,
2566
+ "step": 2840
2567
+ },
2568
+ {
2569
+ "epoch": 1.654139156932453,
2570
+ "grad_norm": 1.6400649547576904,
2571
+ "learning_rate": 4.987102321582116e-06,
2572
+ "loss": 0.0932,
2573
+ "mean_token_accuracy": 0.7889799430966378,
2574
+ "num_tokens": 23345664.0,
2575
+ "step": 2850
2576
+ },
2577
+ {
2578
+ "epoch": 1.6599434085467606,
2579
+ "grad_norm": 1.4534789323806763,
2580
+ "learning_rate": 4.965606190885641e-06,
2581
+ "loss": 0.1011,
2582
+ "mean_token_accuracy": 0.7663038164377213,
2583
+ "num_tokens": 23427584.0,
2584
+ "step": 2860
2585
+ },
2586
+ {
2587
+ "epoch": 1.6657476601610681,
2588
+ "grad_norm": 1.167546272277832,
2589
+ "learning_rate": 4.9441100601891665e-06,
2590
+ "loss": 0.0926,
2591
+ "mean_token_accuracy": 0.7886497065424919,
2592
+ "num_tokens": 23509504.0,
2593
+ "step": 2870
2594
+ },
2595
+ {
2596
+ "epoch": 1.6715519117753754,
2597
+ "grad_norm": 1.3506172895431519,
2598
+ "learning_rate": 4.922613929492692e-06,
2599
+ "loss": 0.0967,
2600
+ "mean_token_accuracy": 0.7687377691268921,
2601
+ "num_tokens": 23591424.0,
2602
+ "step": 2880
2603
+ },
2604
+ {
2605
+ "epoch": 1.677356163389683,
2606
+ "grad_norm": 1.5580819845199585,
2607
+ "learning_rate": 4.901117798796217e-06,
2608
+ "loss": 0.1032,
2609
+ "mean_token_accuracy": 0.7531555775552988,
2610
+ "num_tokens": 23673344.0,
2611
+ "step": 2890
2612
+ },
2613
+ {
2614
+ "epoch": 1.6831604150039903,
2615
+ "grad_norm": 1.21241295337677,
2616
+ "learning_rate": 4.8796216680997424e-06,
2617
+ "loss": 0.0814,
2618
+ "mean_token_accuracy": 0.785212817415595,
2619
+ "num_tokens": 23755264.0,
2620
+ "step": 2900
2621
+ },
2622
+ {
2623
+ "epoch": 1.6889646666182978,
2624
+ "grad_norm": 1.4660733938217163,
2625
+ "learning_rate": 4.858125537403268e-06,
2626
+ "loss": 0.0898,
2627
+ "mean_token_accuracy": 0.7939579278230667,
2628
+ "num_tokens": 23837184.0,
2629
+ "step": 2910
2630
+ },
2631
+ {
2632
+ "epoch": 1.6947689182326053,
2633
+ "grad_norm": 1.4002984762191772,
2634
+ "learning_rate": 4.836629406706793e-06,
2635
+ "loss": 0.0982,
2636
+ "mean_token_accuracy": 0.7646648734807968,
2637
+ "num_tokens": 23919104.0,
2638
+ "step": 2920
2639
+ },
2640
+ {
2641
+ "epoch": 1.7005731698469129,
2642
+ "grad_norm": 1.2702486515045166,
2643
+ "learning_rate": 4.815133276010318e-06,
2644
+ "loss": 0.0932,
2645
+ "mean_token_accuracy": 0.7652764186263085,
2646
+ "num_tokens": 24001024.0,
2647
+ "step": 2930
2648
+ },
2649
+ {
2650
+ "epoch": 1.7063774214612204,
2651
+ "grad_norm": 1.3194500207901,
2652
+ "learning_rate": 4.793637145313844e-06,
2653
+ "loss": 0.0849,
2654
+ "mean_token_accuracy": 0.794006847590208,
2655
+ "num_tokens": 24082944.0,
2656
+ "step": 2940
2657
+ },
2658
+ {
2659
+ "epoch": 1.712181673075528,
2660
+ "grad_norm": 1.3983765840530396,
2661
+ "learning_rate": 4.772141014617369e-06,
2662
+ "loss": 0.0923,
2663
+ "mean_token_accuracy": 0.788772015273571,
2664
+ "num_tokens": 24164864.0,
2665
+ "step": 2950
2666
+ },
2667
+ {
2668
+ "epoch": 1.7179859246898355,
2669
+ "grad_norm": 1.535221815109253,
2670
+ "learning_rate": 4.750644883920894e-06,
2671
+ "loss": 0.089,
2672
+ "mean_token_accuracy": 0.7784368902444839,
2673
+ "num_tokens": 24246784.0,
2674
+ "step": 2960
2675
+ },
2676
+ {
2677
+ "epoch": 1.7237901763041428,
2678
+ "grad_norm": 1.2413194179534912,
2679
+ "learning_rate": 4.72914875322442e-06,
2680
+ "loss": 0.0914,
2681
+ "mean_token_accuracy": 0.7653987266123294,
2682
+ "num_tokens": 24328704.0,
2683
+ "step": 2970
2684
+ },
2685
+ {
2686
+ "epoch": 1.7295944279184503,
2687
+ "grad_norm": 1.0277676582336426,
2688
+ "learning_rate": 4.707652622527945e-06,
2689
+ "loss": 0.0808,
2690
+ "mean_token_accuracy": 0.7767734847962856,
2691
+ "num_tokens": 24410624.0,
2692
+ "step": 2980
2693
+ },
2694
+ {
2695
+ "epoch": 1.7353986795327576,
2696
+ "grad_norm": 1.6544444561004639,
2697
+ "learning_rate": 4.68615649183147e-06,
2698
+ "loss": 0.0971,
2699
+ "mean_token_accuracy": 0.7798067525029182,
2700
+ "num_tokens": 24492544.0,
2701
+ "step": 2990
2702
+ },
2703
+ {
2704
+ "epoch": 1.7412029311470651,
2705
+ "grad_norm": 1.4925342798233032,
2706
+ "learning_rate": 4.664660361134996e-06,
2707
+ "loss": 0.0874,
2708
+ "mean_token_accuracy": 0.7851394325494766,
2709
+ "num_tokens": 24574464.0,
2710
+ "step": 3000
2711
+ },
2712
+ {
2713
+ "epoch": 1.7470071827613727,
2714
+ "grad_norm": 1.1997790336608887,
2715
+ "learning_rate": 4.6431642304385216e-06,
2716
+ "loss": 0.0885,
2717
+ "mean_token_accuracy": 0.7740826837718486,
2718
+ "num_tokens": 24656384.0,
2719
+ "step": 3010
2720
+ },
2721
+ {
2722
+ "epoch": 1.7528114343756802,
2723
+ "grad_norm": 1.6726058721542358,
2724
+ "learning_rate": 4.621668099742046e-06,
2725
+ "loss": 0.102,
2726
+ "mean_token_accuracy": 0.7693126246333122,
2727
+ "num_tokens": 24738304.0,
2728
+ "step": 3020
2729
+ },
2730
+ {
2731
+ "epoch": 1.7586156859899877,
2732
+ "grad_norm": 1.4176825284957886,
2733
+ "learning_rate": 4.600171969045572e-06,
2734
+ "loss": 0.09,
2735
+ "mean_token_accuracy": 0.7685176137834787,
2736
+ "num_tokens": 24820224.0,
2737
+ "step": 3030
2738
+ },
2739
+ {
2740
+ "epoch": 1.7644199376042953,
2741
+ "grad_norm": 0.9416372179985046,
2742
+ "learning_rate": 4.5786758383490975e-06,
2743
+ "loss": 0.0876,
2744
+ "mean_token_accuracy": 0.791475048661232,
2745
+ "num_tokens": 24902144.0,
2746
+ "step": 3040
2747
+ },
2748
+ {
2749
+ "epoch": 1.7702241892186026,
2750
+ "grad_norm": 1.2992349863052368,
2751
+ "learning_rate": 4.557179707652623e-06,
2752
+ "loss": 0.0766,
2753
+ "mean_token_accuracy": 0.7893101766705513,
2754
+ "num_tokens": 24984064.0,
2755
+ "step": 3050
2756
+ },
2757
+ {
2758
+ "epoch": 1.7760284408329101,
2759
+ "grad_norm": 1.262165904045105,
2760
+ "learning_rate": 4.535683576956148e-06,
2761
+ "loss": 0.0809,
2762
+ "mean_token_accuracy": 0.7902886483818292,
2763
+ "num_tokens": 25065984.0,
2764
+ "step": 3060
2765
+ },
2766
+ {
2767
+ "epoch": 1.7818326924472174,
2768
+ "grad_norm": 1.4758257865905762,
2769
+ "learning_rate": 4.5141874462596735e-06,
2770
+ "loss": 0.1039,
2771
+ "mean_token_accuracy": 0.7650684904307127,
2772
+ "num_tokens": 25147904.0,
2773
+ "step": 3070
2774
+ },
2775
+ {
2776
+ "epoch": 1.787636944061525,
2777
+ "grad_norm": 2.127101182937622,
2778
+ "learning_rate": 4.492691315563199e-06,
2779
+ "loss": 0.0975,
2780
+ "mean_token_accuracy": 0.7761130124330521,
2781
+ "num_tokens": 25229824.0,
2782
+ "step": 3080
2783
+ },
2784
+ {
2785
+ "epoch": 1.7934411956758325,
2786
+ "grad_norm": 1.079978346824646,
2787
+ "learning_rate": 4.471195184866725e-06,
2788
+ "loss": 0.0787,
2789
+ "mean_token_accuracy": 0.7821550846099854,
2790
+ "num_tokens": 25311744.0,
2791
+ "step": 3090
2792
+ },
2793
+ {
2794
+ "epoch": 1.79924544729014,
2795
+ "grad_norm": 1.8025544881820679,
2796
+ "learning_rate": 4.4496990541702495e-06,
2797
+ "loss": 0.0905,
2798
+ "mean_token_accuracy": 0.7709515657275915,
2799
+ "num_tokens": 25393664.0,
2800
+ "step": 3100
2801
+ },
2802
+ {
2803
+ "epoch": 1.8050496989044476,
2804
+ "grad_norm": 1.2720634937286377,
2805
+ "learning_rate": 4.428202923473775e-06,
2806
+ "loss": 0.0822,
2807
+ "mean_token_accuracy": 0.7785836607217789,
2808
+ "num_tokens": 25475584.0,
2809
+ "step": 3110
2810
+ },
2811
+ {
2812
+ "epoch": 1.810853950518755,
2813
+ "grad_norm": 1.397544503211975,
2814
+ "learning_rate": 4.406706792777301e-06,
2815
+ "loss": 0.0922,
2816
+ "mean_token_accuracy": 0.7664016611874104,
2817
+ "num_tokens": 25557504.0,
2818
+ "step": 3120
2819
+ },
2820
+ {
2821
+ "epoch": 1.8166582021330626,
2822
+ "grad_norm": 1.4213645458221436,
2823
+ "learning_rate": 4.385210662080826e-06,
2824
+ "loss": 0.1,
2825
+ "mean_token_accuracy": 0.7702666360884904,
2826
+ "num_tokens": 25639424.0,
2827
+ "step": 3130
2828
+ },
2829
+ {
2830
+ "epoch": 1.82246245374737,
2831
+ "grad_norm": 1.2967301607131958,
2832
+ "learning_rate": 4.363714531384351e-06,
2833
+ "loss": 0.1063,
2834
+ "mean_token_accuracy": 0.7763576325029135,
2835
+ "num_tokens": 25721344.0,
2836
+ "step": 3140
2837
+ },
2838
+ {
2839
+ "epoch": 1.8282667053616775,
2840
+ "grad_norm": 1.6965436935424805,
2841
+ "learning_rate": 4.342218400687877e-06,
2842
+ "loss": 0.0872,
2843
+ "mean_token_accuracy": 0.7696550872176886,
2844
+ "num_tokens": 25803264.0,
2845
+ "step": 3150
2846
+ },
2847
+ {
2848
+ "epoch": 1.8340709569759848,
2849
+ "grad_norm": 1.4721688032150269,
2850
+ "learning_rate": 4.320722269991402e-06,
2851
+ "loss": 0.0964,
2852
+ "mean_token_accuracy": 0.7688478477299213,
2853
+ "num_tokens": 25885184.0,
2854
+ "step": 3160
2855
+ },
2856
+ {
2857
+ "epoch": 1.8398752085902923,
2858
+ "grad_norm": 1.4195119142532349,
2859
+ "learning_rate": 4.299226139294928e-06,
2860
+ "loss": 0.0991,
2861
+ "mean_token_accuracy": 0.7586594883352518,
2862
+ "num_tokens": 25967104.0,
2863
+ "step": 3170
2864
+ },
2865
+ {
2866
+ "epoch": 1.8456794602045998,
2867
+ "grad_norm": 1.3307346105575562,
2868
+ "learning_rate": 4.277730008598453e-06,
2869
+ "loss": 0.0854,
2870
+ "mean_token_accuracy": 0.7778987269848585,
2871
+ "num_tokens": 26049024.0,
2872
+ "step": 3180
2873
+ },
2874
+ {
2875
+ "epoch": 1.8514837118189074,
2876
+ "grad_norm": 1.5801392793655396,
2877
+ "learning_rate": 4.256233877901978e-06,
2878
+ "loss": 0.094,
2879
+ "mean_token_accuracy": 0.7833414874970913,
2880
+ "num_tokens": 26130944.0,
2881
+ "step": 3190
2882
+ },
2883
+ {
2884
+ "epoch": 1.857287963433215,
2885
+ "grad_norm": 1.1459014415740967,
2886
+ "learning_rate": 4.234737747205504e-06,
2887
+ "loss": 0.098,
2888
+ "mean_token_accuracy": 0.7790851287543774,
2889
+ "num_tokens": 26212864.0,
2890
+ "step": 3200
2891
+ },
2892
+ {
2893
+ "epoch": 1.8630922150475224,
2894
+ "grad_norm": 1.6354542970657349,
2895
+ "learning_rate": 4.213241616509029e-06,
2896
+ "loss": 0.0866,
2897
+ "mean_token_accuracy": 0.7839407980442047,
2898
+ "num_tokens": 26294784.0,
2899
+ "step": 3210
2900
+ },
2901
+ {
2902
+ "epoch": 1.8688964666618297,
2903
+ "grad_norm": 1.173956274986267,
2904
+ "learning_rate": 4.191745485812554e-06,
2905
+ "loss": 0.0911,
2906
+ "mean_token_accuracy": 0.7827788643538952,
2907
+ "num_tokens": 26376704.0,
2908
+ "step": 3220
2909
+ },
2910
+ {
2911
+ "epoch": 1.8747007182761373,
2912
+ "grad_norm": 1.160238265991211,
2913
+ "learning_rate": 4.17024935511608e-06,
2914
+ "loss": 0.1016,
2915
+ "mean_token_accuracy": 0.7670009769499302,
2916
+ "num_tokens": 26458624.0,
2917
+ "step": 3230
2918
+ },
2919
+ {
2920
+ "epoch": 1.8805049698904448,
2921
+ "grad_norm": 1.2570644617080688,
2922
+ "learning_rate": 4.148753224419605e-06,
2923
+ "loss": 0.0752,
2924
+ "mean_token_accuracy": 0.7890044003725052,
2925
+ "num_tokens": 26540544.0,
2926
+ "step": 3240
2927
+ },
2928
+ {
2929
+ "epoch": 1.886309221504752,
2930
+ "grad_norm": 1.4081250429153442,
2931
+ "learning_rate": 4.12725709372313e-06,
2932
+ "loss": 0.0872,
2933
+ "mean_token_accuracy": 0.7651051852852107,
2934
+ "num_tokens": 26622464.0,
2935
+ "step": 3250
2936
+ },
2937
+ {
2938
+ "epoch": 1.8921134731190596,
2939
+ "grad_norm": 1.629294753074646,
2940
+ "learning_rate": 4.105760963026656e-06,
2941
+ "loss": 0.0879,
2942
+ "mean_token_accuracy": 0.7650929551571608,
2943
+ "num_tokens": 26704384.0,
2944
+ "step": 3260
2945
+ },
2946
+ {
2947
+ "epoch": 1.8979177247333672,
2948
+ "grad_norm": 1.3318721055984497,
2949
+ "learning_rate": 4.0842648323301806e-06,
2950
+ "loss": 0.0849,
2951
+ "mean_token_accuracy": 0.7725538168102503,
2952
+ "num_tokens": 26786304.0,
2953
+ "step": 3270
2954
+ },
2955
+ {
2956
+ "epoch": 1.9037219763476747,
2957
+ "grad_norm": 1.236822485923767,
2958
+ "learning_rate": 4.062768701633706e-06,
2959
+ "loss": 0.0897,
2960
+ "mean_token_accuracy": 0.7780821904540062,
2961
+ "num_tokens": 26868224.0,
2962
+ "step": 3280
2963
+ },
2964
+ {
2965
+ "epoch": 1.9095262279619822,
2966
+ "grad_norm": 1.0354118347167969,
2967
+ "learning_rate": 4.041272570937232e-06,
2968
+ "loss": 0.0907,
2969
+ "mean_token_accuracy": 0.7778742648661137,
2970
+ "num_tokens": 26950144.0,
2971
+ "step": 3290
2972
+ },
2973
+ {
2974
+ "epoch": 1.9153304795762898,
2975
+ "grad_norm": 1.2480920553207397,
2976
+ "learning_rate": 4.0197764402407565e-06,
2977
+ "loss": 0.0867,
2978
+ "mean_token_accuracy": 0.7747186873108148,
2979
+ "num_tokens": 27032064.0,
2980
+ "step": 3300
2981
+ },
2982
+ {
2983
+ "epoch": 1.921134731190597,
2984
+ "grad_norm": 1.3865294456481934,
2985
+ "learning_rate": 3.998280309544282e-06,
2986
+ "loss": 0.0956,
2987
+ "mean_token_accuracy": 0.784209881350398,
2988
+ "num_tokens": 27113984.0,
2989
+ "step": 3310
2990
+ },
2991
+ {
2992
+ "epoch": 1.9269389828049046,
2993
+ "grad_norm": 1.4300016164779663,
2994
+ "learning_rate": 3.976784178847808e-06,
2995
+ "loss": 0.1034,
2996
+ "mean_token_accuracy": 0.7619985315948725,
2997
+ "num_tokens": 27195904.0,
2998
+ "step": 3320
2999
+ },
3000
+ {
3001
+ "epoch": 1.932743234419212,
3002
+ "grad_norm": 1.2628036737442017,
3003
+ "learning_rate": 3.9552880481513325e-06,
3004
+ "loss": 0.1013,
3005
+ "mean_token_accuracy": 0.7635885566473007,
3006
+ "num_tokens": 27277824.0,
3007
+ "step": 3330
3008
+ },
3009
+ {
3010
+ "epoch": 1.9385474860335195,
3011
+ "grad_norm": 1.4809879064559937,
3012
+ "learning_rate": 3.933791917454858e-06,
3013
+ "loss": 0.0914,
3014
+ "mean_token_accuracy": 0.7683708433061838,
3015
+ "num_tokens": 27359744.0,
3016
+ "step": 3340
3017
+ },
3018
+ {
3019
+ "epoch": 1.944351737647827,
3020
+ "grad_norm": 1.3218101263046265,
3021
+ "learning_rate": 3.912295786758384e-06,
3022
+ "loss": 0.1039,
3023
+ "mean_token_accuracy": 0.7580479431897402,
3024
+ "num_tokens": 27441664.0,
3025
+ "step": 3350
3026
+ },
3027
+ {
3028
+ "epoch": 1.9501559892621345,
3029
+ "grad_norm": 2.1898369789123535,
3030
+ "learning_rate": 3.890799656061909e-06,
3031
+ "loss": 0.0868,
3032
+ "mean_token_accuracy": 0.7617172211408615,
3033
+ "num_tokens": 27523584.0,
3034
+ "step": 3360
3035
+ },
3036
+ {
3037
+ "epoch": 1.955960240876442,
3038
+ "grad_norm": 1.6249711513519287,
3039
+ "learning_rate": 3.869303525365434e-06,
3040
+ "loss": 0.0848,
3041
+ "mean_token_accuracy": 0.791927594691515,
3042
+ "num_tokens": 27605504.0,
3043
+ "step": 3370
3044
+ },
3045
+ {
3046
+ "epoch": 1.9617644924907496,
3047
+ "grad_norm": 1.3207077980041504,
3048
+ "learning_rate": 3.84780739466896e-06,
3049
+ "loss": 0.0811,
3050
+ "mean_token_accuracy": 0.7898116454482078,
3051
+ "num_tokens": 27687424.0,
3052
+ "step": 3380
3053
+ },
3054
+ {
3055
+ "epoch": 1.967568744105057,
3056
+ "grad_norm": 1.4129955768585205,
3057
+ "learning_rate": 3.826311263972485e-06,
3058
+ "loss": 0.0852,
3059
+ "mean_token_accuracy": 0.7753913916647435,
3060
+ "num_tokens": 27769344.0,
3061
+ "step": 3390
3062
+ },
3063
+ {
3064
+ "epoch": 1.9733729957193644,
3065
+ "grad_norm": 2.2473080158233643,
3066
+ "learning_rate": 3.804815133276011e-06,
3067
+ "loss": 0.089,
3068
+ "mean_token_accuracy": 0.7784858100116253,
3069
+ "num_tokens": 27851264.0,
3070
+ "step": 3400
3071
+ },
3072
+ {
3073
+ "epoch": 1.979177247333672,
3074
+ "grad_norm": 1.0944265127182007,
3075
+ "learning_rate": 3.7833190025795357e-06,
3076
+ "loss": 0.0988,
3077
+ "mean_token_accuracy": 0.7655577287077904,
3078
+ "num_tokens": 27933184.0,
3079
+ "step": 3410
3080
+ },
3081
+ {
3082
+ "epoch": 1.9849814989479793,
3083
+ "grad_norm": 1.1985645294189453,
3084
+ "learning_rate": 3.7618228718830613e-06,
3085
+ "loss": 0.0829,
3086
+ "mean_token_accuracy": 0.7765900172293186,
3087
+ "num_tokens": 28015104.0,
3088
+ "step": 3420
3089
+ },
3090
+ {
3091
+ "epoch": 1.9907857505622868,
3092
+ "grad_norm": 1.6854956150054932,
3093
+ "learning_rate": 3.740326741186587e-06,
3094
+ "loss": 0.0837,
3095
+ "mean_token_accuracy": 0.7793052829802036,
3096
+ "num_tokens": 28097024.0,
3097
+ "step": 3430
3098
+ },
3099
+ {
3100
+ "epoch": 1.9965900021765943,
3101
+ "grad_norm": 1.2909150123596191,
3102
+ "learning_rate": 3.7188306104901125e-06,
3103
+ "loss": 0.093,
3104
+ "mean_token_accuracy": 0.7680161453783512,
3105
+ "num_tokens": 28178944.0,
3106
+ "step": 3440
3107
+ },
3108
+ {
3109
+ "epoch": 2.002321700645723,
3110
+ "grad_norm": 1.2459158897399902,
3111
+ "learning_rate": 3.6973344797936372e-06,
3112
+ "loss": 0.0914,
3113
+ "mean_token_accuracy": 0.7728331138061572,
3114
+ "num_tokens": 28259328.0,
3115
+ "step": 3450
3116
+ },
3117
+ {
3118
+ "epoch": 2.0081259522600305,
3119
+ "grad_norm": 1.108630895614624,
3120
+ "learning_rate": 3.675838349097163e-06,
3121
+ "loss": 0.0655,
3122
+ "mean_token_accuracy": 0.7846501931548119,
3123
+ "num_tokens": 28341248.0,
3124
+ "step": 3460
3125
+ },
3126
+ {
3127
+ "epoch": 2.013930203874338,
3128
+ "grad_norm": 1.8924497365951538,
3129
+ "learning_rate": 3.6543422184006884e-06,
3130
+ "loss": 0.0778,
3131
+ "mean_token_accuracy": 0.7803449083119631,
3132
+ "num_tokens": 28423168.0,
3133
+ "step": 3470
3134
+ },
3135
+ {
3136
+ "epoch": 2.0197344554886456,
3137
+ "grad_norm": 1.6642768383026123,
3138
+ "learning_rate": 3.6328460877042136e-06,
3139
+ "loss": 0.0753,
3140
+ "mean_token_accuracy": 0.7758439350873232,
3141
+ "num_tokens": 28505088.0,
3142
+ "step": 3480
3143
+ },
3144
+ {
3145
+ "epoch": 2.0255387071029527,
3146
+ "grad_norm": 2.043719530105591,
3147
+ "learning_rate": 3.611349957007739e-06,
3148
+ "loss": 0.0772,
3149
+ "mean_token_accuracy": 0.7852984338998794,
3150
+ "num_tokens": 28587008.0,
3151
+ "step": 3490
3152
+ },
3153
+ {
3154
+ "epoch": 2.0313429587172602,
3155
+ "grad_norm": 1.5983003377914429,
3156
+ "learning_rate": 3.5898538263112644e-06,
3157
+ "loss": 0.074,
3158
+ "mean_token_accuracy": 0.7707681007683277,
3159
+ "num_tokens": 28668928.0,
3160
+ "step": 3500
3161
  }
3162
  ],
3163
  "logging_steps": 10,
 
3177
  "attributes": {}
3178
  }
3179
  },
3180
+ "total_flos": 7.5766378694443e+16,
3181
  "train_batch_size": 2,
3182
  "trial_name": null,
3183
  "trial_params": null