Plofski commited on
Commit
6ef2d11
·
verified ·
1 Parent(s): 49b0640

Training in progress, step 3000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6323430495422b2f5e9b7844076fda108d2adbed4d0037f47d6e99938d8fca29
3
  size 536223056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6c549f8867ec3aa46fdb16d23e60b8f0ae222fa21f2d19da894e88c1f3b09c3
3
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:23e6291f28b6db5850e454b2320b2900c167fefe5276101f07b3b0cce8757420
3
  size 1072594443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53021cc365fb35689cf7935de3e1b4f7d09b54591f9f81b2f7c83736f1ee6045
3
  size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:37b2e328f1145450725e3266f16e300be997471c44b08eae4fb08a4a11d9367a
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:852ff1feb145f352899b6aa5117c88c8890d68604ca6bf2baf1e72eb1508c72e
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.5037275841225065,
6
  "eval_steps": 500,
7
- "global_step": 2500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2258,6 +2258,456 @@
2258
  "mean_token_accuracy": 0.7839685261249543,
2259
  "num_tokens": 2765568.0,
2260
  "step": 2500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2261
  }
2262
  ],
2263
  "logging_steps": 10,
@@ -2277,7 +2727,7 @@
2277
  "attributes": {}
2278
  }
2279
  },
2280
- "total_flos": 3352572806252544.0,
2281
  "train_batch_size": 8,
2282
  "trial_name": null,
2283
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.6044731009470079,
6
  "eval_steps": 500,
7
+ "global_step": 3000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2258
  "mean_token_accuracy": 0.7839685261249543,
2259
  "num_tokens": 2765568.0,
2260
  "step": 2500
2261
+ },
2262
+ {
2263
+ "epoch": 0.5057424944589965,
2264
+ "grad_norm": 10.5,
2265
+ "learning_rate": 1.6629726643831016e-05,
2266
+ "loss": 0.9455,
2267
+ "mean_token_accuracy": 0.7747004866600037,
2268
+ "num_tokens": 2777172.0,
2269
+ "step": 2510
2270
+ },
2271
+ {
2272
+ "epoch": 0.5077574047954866,
2273
+ "grad_norm": 10.6875,
2274
+ "learning_rate": 1.6616293908254418e-05,
2275
+ "loss": 0.9188,
2276
+ "mean_token_accuracy": 0.7760649502277375,
2277
+ "num_tokens": 2789318.0,
2278
+ "step": 2520
2279
+ },
2280
+ {
2281
+ "epoch": 0.5097723151319766,
2282
+ "grad_norm": 12.625,
2283
+ "learning_rate": 1.6602861172677817e-05,
2284
+ "loss": 0.9055,
2285
+ "mean_token_accuracy": 0.7822152853012085,
2286
+ "num_tokens": 2800307.0,
2287
+ "step": 2530
2288
+ },
2289
+ {
2290
+ "epoch": 0.5117872254684667,
2291
+ "grad_norm": 12.9375,
2292
+ "learning_rate": 1.6589428437101216e-05,
2293
+ "loss": 0.9045,
2294
+ "mean_token_accuracy": 0.7808859765529632,
2295
+ "num_tokens": 2811228.0,
2296
+ "step": 2540
2297
+ },
2298
+ {
2299
+ "epoch": 0.5138021358049567,
2300
+ "grad_norm": 11.25,
2301
+ "learning_rate": 1.6575995701524618e-05,
2302
+ "loss": 0.8518,
2303
+ "mean_token_accuracy": 0.7890688478946686,
2304
+ "num_tokens": 2821512.0,
2305
+ "step": 2550
2306
+ },
2307
+ {
2308
+ "epoch": 0.5158170461414467,
2309
+ "grad_norm": 13.5,
2310
+ "learning_rate": 1.6562562965948017e-05,
2311
+ "loss": 0.9224,
2312
+ "mean_token_accuracy": 0.7703654527664184,
2313
+ "num_tokens": 2832524.0,
2314
+ "step": 2560
2315
+ },
2316
+ {
2317
+ "epoch": 0.5178319564779368,
2318
+ "grad_norm": 11.1875,
2319
+ "learning_rate": 1.6549130230371416e-05,
2320
+ "loss": 0.9554,
2321
+ "mean_token_accuracy": 0.7688623070716858,
2322
+ "num_tokens": 2844366.0,
2323
+ "step": 2570
2324
+ },
2325
+ {
2326
+ "epoch": 0.5198468668144267,
2327
+ "grad_norm": 16.25,
2328
+ "learning_rate": 1.6535697494794815e-05,
2329
+ "loss": 0.8999,
2330
+ "mean_token_accuracy": 0.7806227684020997,
2331
+ "num_tokens": 2855210.0,
2332
+ "step": 2580
2333
+ },
2334
+ {
2335
+ "epoch": 0.5218617771509168,
2336
+ "grad_norm": 12.25,
2337
+ "learning_rate": 1.6522264759218217e-05,
2338
+ "loss": 0.951,
2339
+ "mean_token_accuracy": 0.7751711547374726,
2340
+ "num_tokens": 2865914.0,
2341
+ "step": 2590
2342
+ },
2343
+ {
2344
+ "epoch": 0.5238766874874068,
2345
+ "grad_norm": 13.375,
2346
+ "learning_rate": 1.6508832023641616e-05,
2347
+ "loss": 0.951,
2348
+ "mean_token_accuracy": 0.7700467705726624,
2349
+ "num_tokens": 2875344.0,
2350
+ "step": 2600
2351
+ },
2352
+ {
2353
+ "epoch": 0.5258915978238968,
2354
+ "grad_norm": 11.25,
2355
+ "learning_rate": 1.6495399288065014e-05,
2356
+ "loss": 0.9551,
2357
+ "mean_token_accuracy": 0.7730932533740997,
2358
+ "num_tokens": 2886246.0,
2359
+ "step": 2610
2360
+ },
2361
+ {
2362
+ "epoch": 0.5279065081603869,
2363
+ "grad_norm": 11.25,
2364
+ "learning_rate": 1.6481966552488417e-05,
2365
+ "loss": 0.8656,
2366
+ "mean_token_accuracy": 0.7870718777179718,
2367
+ "num_tokens": 2896640.0,
2368
+ "step": 2620
2369
+ },
2370
+ {
2371
+ "epoch": 0.5299214184968769,
2372
+ "grad_norm": 13.5,
2373
+ "learning_rate": 1.6468533816911816e-05,
2374
+ "loss": 0.8316,
2375
+ "mean_token_accuracy": 0.790769350528717,
2376
+ "num_tokens": 2907645.0,
2377
+ "step": 2630
2378
+ },
2379
+ {
2380
+ "epoch": 0.531936328833367,
2381
+ "grad_norm": 10.5,
2382
+ "learning_rate": 1.6455101081335218e-05,
2383
+ "loss": 0.8114,
2384
+ "mean_token_accuracy": 0.7977836310863495,
2385
+ "num_tokens": 2918502.0,
2386
+ "step": 2640
2387
+ },
2388
+ {
2389
+ "epoch": 0.533951239169857,
2390
+ "grad_norm": 13.0,
2391
+ "learning_rate": 1.6441668345758613e-05,
2392
+ "loss": 0.7938,
2393
+ "mean_token_accuracy": 0.8016635835170746,
2394
+ "num_tokens": 2929723.0,
2395
+ "step": 2650
2396
+ },
2397
+ {
2398
+ "epoch": 0.535966149506347,
2399
+ "grad_norm": 8.8125,
2400
+ "learning_rate": 1.6428235610182015e-05,
2401
+ "loss": 0.8881,
2402
+ "mean_token_accuracy": 0.7800805151462555,
2403
+ "num_tokens": 2941026.0,
2404
+ "step": 2660
2405
+ },
2406
+ {
2407
+ "epoch": 0.537981059842837,
2408
+ "grad_norm": 10.75,
2409
+ "learning_rate": 1.6414802874605414e-05,
2410
+ "loss": 0.98,
2411
+ "mean_token_accuracy": 0.7662722408771515,
2412
+ "num_tokens": 2952300.0,
2413
+ "step": 2670
2414
+ },
2415
+ {
2416
+ "epoch": 0.539995970179327,
2417
+ "grad_norm": 11.1875,
2418
+ "learning_rate": 1.6401370139028813e-05,
2419
+ "loss": 0.9393,
2420
+ "mean_token_accuracy": 0.7773295342922211,
2421
+ "num_tokens": 2963688.0,
2422
+ "step": 2680
2423
+ },
2424
+ {
2425
+ "epoch": 0.542010880515817,
2426
+ "grad_norm": 10.5,
2427
+ "learning_rate": 1.6387937403452215e-05,
2428
+ "loss": 0.8579,
2429
+ "mean_token_accuracy": 0.7857769846916198,
2430
+ "num_tokens": 2975798.0,
2431
+ "step": 2690
2432
+ },
2433
+ {
2434
+ "epoch": 0.5440257908523071,
2435
+ "grad_norm": 11.5625,
2436
+ "learning_rate": 1.6374504667875614e-05,
2437
+ "loss": 0.8915,
2438
+ "mean_token_accuracy": 0.7862484276294708,
2439
+ "num_tokens": 2986958.0,
2440
+ "step": 2700
2441
+ },
2442
+ {
2443
+ "epoch": 0.5460407011887971,
2444
+ "grad_norm": 14.0625,
2445
+ "learning_rate": 1.6361071932299013e-05,
2446
+ "loss": 0.944,
2447
+ "mean_token_accuracy": 0.774361002445221,
2448
+ "num_tokens": 2998027.0,
2449
+ "step": 2710
2450
+ },
2451
+ {
2452
+ "epoch": 0.5480556115252871,
2453
+ "grad_norm": 9.8125,
2454
+ "learning_rate": 1.6347639196722412e-05,
2455
+ "loss": 0.8616,
2456
+ "mean_token_accuracy": 0.7841140806674958,
2457
+ "num_tokens": 3008990.0,
2458
+ "step": 2720
2459
+ },
2460
+ {
2461
+ "epoch": 0.5500705218617772,
2462
+ "grad_norm": 11.625,
2463
+ "learning_rate": 1.6334206461145814e-05,
2464
+ "loss": 0.995,
2465
+ "mean_token_accuracy": 0.764478224515915,
2466
+ "num_tokens": 3020180.0,
2467
+ "step": 2730
2468
+ },
2469
+ {
2470
+ "epoch": 0.5520854321982672,
2471
+ "grad_norm": 16.375,
2472
+ "learning_rate": 1.6320773725569213e-05,
2473
+ "loss": 0.9769,
2474
+ "mean_token_accuracy": 0.762674230337143,
2475
+ "num_tokens": 3031468.0,
2476
+ "step": 2740
2477
+ },
2478
+ {
2479
+ "epoch": 0.5541003425347572,
2480
+ "grad_norm": 11.0625,
2481
+ "learning_rate": 1.6307340989992612e-05,
2482
+ "loss": 0.9019,
2483
+ "mean_token_accuracy": 0.7809956490993499,
2484
+ "num_tokens": 3043604.0,
2485
+ "step": 2750
2486
+ },
2487
+ {
2488
+ "epoch": 0.5561152528712472,
2489
+ "grad_norm": 10.3125,
2490
+ "learning_rate": 1.6293908254416014e-05,
2491
+ "loss": 0.8849,
2492
+ "mean_token_accuracy": 0.7833445549011231,
2493
+ "num_tokens": 3054038.0,
2494
+ "step": 2760
2495
+ },
2496
+ {
2497
+ "epoch": 0.5581301632077372,
2498
+ "grad_norm": 13.3125,
2499
+ "learning_rate": 1.6280475518839413e-05,
2500
+ "loss": 0.9858,
2501
+ "mean_token_accuracy": 0.7633112788200378,
2502
+ "num_tokens": 3064515.0,
2503
+ "step": 2770
2504
+ },
2505
+ {
2506
+ "epoch": 0.5601450735442273,
2507
+ "grad_norm": 12.3125,
2508
+ "learning_rate": 1.626704278326281e-05,
2509
+ "loss": 0.8388,
2510
+ "mean_token_accuracy": 0.7929341971874238,
2511
+ "num_tokens": 3075661.0,
2512
+ "step": 2780
2513
+ },
2514
+ {
2515
+ "epoch": 0.5621599838807173,
2516
+ "grad_norm": 11.6875,
2517
+ "learning_rate": 1.6253610047686214e-05,
2518
+ "loss": 0.9444,
2519
+ "mean_token_accuracy": 0.7728216648101807,
2520
+ "num_tokens": 3087402.0,
2521
+ "step": 2790
2522
+ },
2523
+ {
2524
+ "epoch": 0.5641748942172073,
2525
+ "grad_norm": 15.3125,
2526
+ "learning_rate": 1.6240177312109613e-05,
2527
+ "loss": 0.9326,
2528
+ "mean_token_accuracy": 0.7675645828247071,
2529
+ "num_tokens": 3097807.0,
2530
+ "step": 2800
2531
+ },
2532
+ {
2533
+ "epoch": 0.5661898045536974,
2534
+ "grad_norm": 11.6875,
2535
+ "learning_rate": 1.6226744576533015e-05,
2536
+ "loss": 0.8888,
2537
+ "mean_token_accuracy": 0.7739431619644165,
2538
+ "num_tokens": 3109256.0,
2539
+ "step": 2810
2540
+ },
2541
+ {
2542
+ "epoch": 0.5682047148901874,
2543
+ "grad_norm": 9.9375,
2544
+ "learning_rate": 1.621331184095641e-05,
2545
+ "loss": 0.8275,
2546
+ "mean_token_accuracy": 0.796435010433197,
2547
+ "num_tokens": 3119274.0,
2548
+ "step": 2820
2549
+ },
2550
+ {
2551
+ "epoch": 0.5702196252266775,
2552
+ "grad_norm": 11.8125,
2553
+ "learning_rate": 1.6199879105379813e-05,
2554
+ "loss": 0.9389,
2555
+ "mean_token_accuracy": 0.7698701798915863,
2556
+ "num_tokens": 3129740.0,
2557
+ "step": 2830
2558
+ },
2559
+ {
2560
+ "epoch": 0.5722345355631674,
2561
+ "grad_norm": 10.5625,
2562
+ "learning_rate": 1.618644636980321e-05,
2563
+ "loss": 0.9461,
2564
+ "mean_token_accuracy": 0.7643969297409058,
2565
+ "num_tokens": 3141041.0,
2566
+ "step": 2840
2567
+ },
2568
+ {
2569
+ "epoch": 0.5742494458996574,
2570
+ "grad_norm": 13.8125,
2571
+ "learning_rate": 1.617301363422661e-05,
2572
+ "loss": 1.0038,
2573
+ "mean_token_accuracy": 0.7637837052345275,
2574
+ "num_tokens": 3151060.0,
2575
+ "step": 2850
2576
+ },
2577
+ {
2578
+ "epoch": 0.5762643562361475,
2579
+ "grad_norm": 10.75,
2580
+ "learning_rate": 1.6159580898650012e-05,
2581
+ "loss": 0.862,
2582
+ "mean_token_accuracy": 0.785337769985199,
2583
+ "num_tokens": 3162459.0,
2584
+ "step": 2860
2585
+ },
2586
+ {
2587
+ "epoch": 0.5782792665726375,
2588
+ "grad_norm": 10.5625,
2589
+ "learning_rate": 1.614614816307341e-05,
2590
+ "loss": 0.9322,
2591
+ "mean_token_accuracy": 0.7696834802627563,
2592
+ "num_tokens": 3172476.0,
2593
+ "step": 2870
2594
+ },
2595
+ {
2596
+ "epoch": 0.5802941769091275,
2597
+ "grad_norm": 13.3125,
2598
+ "learning_rate": 1.613271542749681e-05,
2599
+ "loss": 0.9818,
2600
+ "mean_token_accuracy": 0.7669282913208008,
2601
+ "num_tokens": 3182571.0,
2602
+ "step": 2880
2603
+ },
2604
+ {
2605
+ "epoch": 0.5823090872456176,
2606
+ "grad_norm": 10.0,
2607
+ "learning_rate": 1.611928269192021e-05,
2608
+ "loss": 0.8449,
2609
+ "mean_token_accuracy": 0.7930525064468383,
2610
+ "num_tokens": 3194456.0,
2611
+ "step": 2890
2612
+ },
2613
+ {
2614
+ "epoch": 0.5843239975821076,
2615
+ "grad_norm": 10.4375,
2616
+ "learning_rate": 1.610584995634361e-05,
2617
+ "loss": 0.862,
2618
+ "mean_token_accuracy": 0.7849324703216553,
2619
+ "num_tokens": 3204377.0,
2620
+ "step": 2900
2621
+ },
2622
+ {
2623
+ "epoch": 0.5863389079185977,
2624
+ "grad_norm": 11.875,
2625
+ "learning_rate": 1.609241722076701e-05,
2626
+ "loss": 0.8853,
2627
+ "mean_token_accuracy": 0.7843615412712097,
2628
+ "num_tokens": 3216082.0,
2629
+ "step": 2910
2630
+ },
2631
+ {
2632
+ "epoch": 0.5883538182550877,
2633
+ "grad_norm": 9.375,
2634
+ "learning_rate": 1.607898448519041e-05,
2635
+ "loss": 0.7821,
2636
+ "mean_token_accuracy": 0.7997645199298858,
2637
+ "num_tokens": 3226582.0,
2638
+ "step": 2920
2639
+ },
2640
+ {
2641
+ "epoch": 0.5903687285915776,
2642
+ "grad_norm": 9.1875,
2643
+ "learning_rate": 1.606555174961381e-05,
2644
+ "loss": 0.897,
2645
+ "mean_token_accuracy": 0.7760909557342529,
2646
+ "num_tokens": 3238387.0,
2647
+ "step": 2930
2648
+ },
2649
+ {
2650
+ "epoch": 0.5923836389280677,
2651
+ "grad_norm": 12.0,
2652
+ "learning_rate": 1.605211901403721e-05,
2653
+ "loss": 0.9027,
2654
+ "mean_token_accuracy": 0.78245330452919,
2655
+ "num_tokens": 3248670.0,
2656
+ "step": 2940
2657
+ },
2658
+ {
2659
+ "epoch": 0.5943985492645577,
2660
+ "grad_norm": 10.0625,
2661
+ "learning_rate": 1.603868627846061e-05,
2662
+ "loss": 0.7964,
2663
+ "mean_token_accuracy": 0.8025750041007995,
2664
+ "num_tokens": 3259282.0,
2665
+ "step": 2950
2666
+ },
2667
+ {
2668
+ "epoch": 0.5964134596010477,
2669
+ "grad_norm": 10.6875,
2670
+ "learning_rate": 1.602525354288401e-05,
2671
+ "loss": 0.8159,
2672
+ "mean_token_accuracy": 0.7951205134391784,
2673
+ "num_tokens": 3270771.0,
2674
+ "step": 2960
2675
+ },
2676
+ {
2677
+ "epoch": 0.5984283699375378,
2678
+ "grad_norm": 12.75,
2679
+ "learning_rate": 1.601182080730741e-05,
2680
+ "loss": 1.0354,
2681
+ "mean_token_accuracy": 0.7645578503608703,
2682
+ "num_tokens": 3282321.0,
2683
+ "step": 2970
2684
+ },
2685
+ {
2686
+ "epoch": 0.6004432802740278,
2687
+ "grad_norm": 14.6875,
2688
+ "learning_rate": 1.599838807173081e-05,
2689
+ "loss": 0.9655,
2690
+ "mean_token_accuracy": 0.7646917760372162,
2691
+ "num_tokens": 3294284.0,
2692
+ "step": 2980
2693
+ },
2694
+ {
2695
+ "epoch": 0.6024581906105179,
2696
+ "grad_norm": 11.375,
2697
+ "learning_rate": 1.5984955336154207e-05,
2698
+ "loss": 0.9532,
2699
+ "mean_token_accuracy": 0.7740876019001007,
2700
+ "num_tokens": 3305259.0,
2701
+ "step": 2990
2702
+ },
2703
+ {
2704
+ "epoch": 0.6044731009470079,
2705
+ "grad_norm": 11.5625,
2706
+ "learning_rate": 1.597152260057761e-05,
2707
+ "loss": 0.8269,
2708
+ "mean_token_accuracy": 0.7911386549472809,
2709
+ "num_tokens": 3316348.0,
2710
+ "step": 3000
2711
  }
2712
  ],
2713
  "logging_steps": 10,
 
2727
  "attributes": {}
2728
  }
2729
  },
2730
+ "total_flos": 4014132187054080.0,
2731
  "train_batch_size": 8,
2732
  "trial_name": null,
2733
  "trial_params": null